diff --git a/.claude/settings.json b/.claude/settings.json new file mode 100644 index 00000000..5cfa5854 --- /dev/null +++ b/.claude/settings.json @@ -0,0 +1,84 @@ +{ + "hooks": { + "PostToolUse": [ + { + "matcher": "Task", + "hooks": [ + { + "type": "command", + "command": "entire hooks claude-code post-task" + } + ] + }, + { + "matcher": "TodoWrite", + "hooks": [ + { + "type": "command", + "command": "entire hooks claude-code post-todo" + } + ] + } + ], + "PreToolUse": [ + { + "matcher": "Task", + "hooks": [ + { + "type": "command", + "command": "entire hooks claude-code pre-task" + } + ] + } + ], + "SessionEnd": [ + { + "matcher": "", + "hooks": [ + { + "type": "command", + "command": "entire hooks claude-code session-end" + } + ] + } + ], + "SessionStart": [ + { + "matcher": "", + "hooks": [ + { + "type": "command", + "command": "entire hooks claude-code session-start" + } + ] + } + ], + "Stop": [ + { + "matcher": "", + "hooks": [ + { + "type": "command", + "command": "entire hooks claude-code stop" + } + ] + } + ], + "UserPromptSubmit": [ + { + "matcher": "", + "hooks": [ + { + "type": "command", + "command": "entire hooks claude-code user-prompt-submit" + } + ] + } + ] + }, + "permissions": { + "deny": [ + "Read(./.entire/metadata/**)" + ] + } +} diff --git a/.entire/.gitignore b/.entire/.gitignore new file mode 100644 index 00000000..2cffdefa --- /dev/null +++ b/.entire/.gitignore @@ -0,0 +1,4 @@ +tmp/ +settings.local.json +metadata/ +logs/ diff --git a/.entire/settings.json b/.entire/settings.json new file mode 100644 index 00000000..7cce5590 --- /dev/null +++ b/.entire/settings.json @@ -0,0 +1,4 @@ +{ + "enabled": true, + "telemetry": true +} diff --git a/.github/dependabot.yml b/.github/dependabot.yml deleted file mode 100644 index 87600f2c..00000000 --- a/.github/dependabot.yml +++ /dev/null @@ -1,15 +0,0 @@ -# To get started with Dependabot version updates, you'll need to specify which -# package ecosystems to update and where the package manifests are 
located. -# Please see the documentation for all configuration options: -# https://docs.github.com/code-security/dependabot/dependabot-version-updates/configuration-options-for-the-dependabot.yml-file - -version: 2 -updates: - - package-ecosystem: "gomod" - directory: "/" - schedule: - interval: "weekly" - - package-ecosystem: "npm" - directory: "/web/frontend" - schedule: - interval: "weekly" diff --git a/.gitignore b/.gitignore index e03d8071..67dbb510 100644 --- a/.gitignore +++ b/.gitignore @@ -13,7 +13,7 @@ /var/checkpoints* migrateTimestamps.pl -test_ccms_write_api.sh +test_ccms_* /web/frontend/public/build /web/frontend/node_modules diff --git a/.goreleaser.yaml b/.goreleaser.yaml index 3edcb7d6..f861e3c2 100644 --- a/.goreleaser.yaml +++ b/.goreleaser.yaml @@ -1,3 +1,4 @@ +version: 2 before: hooks: - go mod tidy @@ -34,6 +35,19 @@ builds: main: ./tools/archive-manager tags: - static_build + - env: + - CGO_ENABLED=0 + goos: + - linux + goarch: + - amd64 + goamd64: + - v3 + id: "archive-migration" + binary: archive-migration + main: ./tools/archive-migration + tags: + - static_build - env: - CGO_ENABLED=0 goos: @@ -48,7 +62,7 @@ builds: tags: - static_build archives: - - format: tar.gz + - formats: tar.gz # this name template makes the OS and Arch compatible with the results of uname. name_template: >- {{ .ProjectName }}_ @@ -59,7 +73,7 @@ archives: checksum: name_template: "checksums.txt" snapshot: - name_template: "{{ incpatch .Version }}-next" + version_template: "{{ incpatch .Version }}-next" changelog: sort: asc filters: @@ -87,7 +101,7 @@ changelog: release: draft: false footer: | - Supports job archive version 2 and database version 8. + Supports job archive version 3 and database version 10. Please check out the [Release Notes](https://github.com/ClusterCockpit/cc-backend/blob/master/ReleaseNotes.md) for further details on breaking changes. 
# vim: set ts=2 sw=2 tw=0 fo=cnqoj diff --git a/AGENTS.md b/AGENTS.md new file mode 100644 index 00000000..847bc094 --- /dev/null +++ b/AGENTS.md @@ -0,0 +1,26 @@ +# ClusterCockpit Backend - Agent Guidelines + +## Build/Test Commands + +- Build: `make` or `go build ./cmd/cc-backend` +- Run all tests: `make test` (runs: `go clean -testcache && go build ./... && go vet ./... && go test ./...`) +- Run single test: `go test -run TestName ./path/to/package` +- Run single test file: `go test ./path/to/package -run TestName` +- Frontend build: `cd web/frontend && npm install && npm run build` +- Generate GraphQL: `make graphql` (uses gqlgen) +- Generate Swagger: `make swagger` (uses swaggo/swag) + +## Code Style + +- **Formatting**: Use `gofumpt` for all Go files (strict requirement) +- **Copyright header**: All files must include copyright header (see existing files) +- **Package docs**: Document packages with comprehensive package-level comments explaining purpose, usage, configuration +- **Imports**: Standard library first, then external packages, then internal packages (grouped with blank lines) +- **Naming**: Use camelCase for private, PascalCase for exported; descriptive names (e.g., `JobRepository`, `handleError`) +- **Error handling**: Return errors, don't panic; use custom error types where appropriate; log with cclog package +- **Logging**: Use `cclog` package (e.g., `cclog.Errorf()`, `cclog.Warnf()`, `cclog.Debugf()`) +- **Testing**: Use standard `testing` package; use `testify/assert` for assertions; name tests `TestFunctionName` +- **Comments**: Document all exported functions/types with godoc-style comments +- **Structs**: Document fields with inline comments, especially for complex configurations +- **HTTP handlers**: Return proper status codes; use `handleError()` helper for consistent error responses +- **JSON**: Use struct tags for JSON marshaling; `DisallowUnknownFields()` for strict decoding diff --git a/CLAUDE.md b/CLAUDE.md new file mode 100644 index 
00000000..a8d56571 --- /dev/null +++ b/CLAUDE.md @@ -0,0 +1,306 @@ +# CLAUDE.md + +This file provides guidance to Claude Code (claude.ai/code) when working with +code in this repository. + +## Project Overview + +ClusterCockpit is a job-specific performance monitoring framework for HPC +clusters. This is a Golang backend that provides REST and GraphQL APIs, serves a +Svelte-based frontend, and manages job archives and metric data from various +time-series databases. + +## Build and Development Commands + +### Building + +```bash +# Build everything (frontend + backend) +make + +# Build only the frontend +make frontend + +# Build only the backend (requires frontend to be built first) +go build -ldflags="-s -X main.date=$(date +%Y-%m-%d:T%H:%M:%S) -X main.version=1.5.0 -X main.commit=$(git rev-parse --short HEAD)" ./cmd/cc-backend +``` + +### Testing + +```bash +# Run all tests +make test + +# Run tests with verbose output +go test -v ./... + +# Run tests for a specific package +go test ./internal/repository +``` + +### Code Generation + +```bash +# Regenerate GraphQL schema and resolvers (after modifying api/schema.graphqls) +make graphql + +# Regenerate Swagger/OpenAPI docs (after modifying API comments) +make swagger +``` + +### Frontend Development + +```bash +cd web/frontend + +# Install dependencies +npm install + +# Build for production +npm run build + +# Development mode with watch +npm run dev +``` + +### Running + +```bash +# Initialize database and create admin user +./cc-backend -init-db -add-user demo:admin:demo + +# Start server in development mode (enables GraphQL Playground and Swagger UI) +./cc-backend -server -dev -loglevel info + +# Start demo with sample data +./startDemo.sh +``` + +## Architecture + +### Backend Structure + +The backend follows a layered architecture with clear separation of concerns: + +- **cmd/cc-backend**: Entry point, orchestrates initialization of all subsystems +- **internal/repository**: Data access layer using 
repository pattern + - Abstracts database operations (SQLite3 only) + - Implements LRU caching for performance + - Provides repositories for Job, User, Node, and Tag entities + - Transaction support for batch operations +- **internal/api**: REST API endpoints (Swagger/OpenAPI documented) +- **internal/graph**: GraphQL API (uses gqlgen) + - Schema in `api/schema.graphqls` + - Generated code in `internal/graph/generated/` + - Resolvers in `internal/graph/schema.resolvers.go` +- **internal/auth**: Authentication layer + - Supports local accounts, LDAP, OIDC, and JWT tokens + - Implements rate limiting for login attempts +- **pkg/metricstore**: Metric store with data loading API + - In-memory metric storage with checkpointing + - Query API for loading job metric data +- **internal/archiver**: Job archiving to file-based archive +- **internal/api/nats.go**: NATS-based API for job and node operations + - Subscribes to NATS subjects for job events (start/stop) + - Handles node state updates via NATS + - Uses InfluxDB line protocol message format +- **pkg/archive**: Job archive backend implementations + - File system backend (default) + - S3 backend + - SQLite backend (experimental) + - **parquet** sub-package: Parquet format support (schema, reader, writer, conversion) +- **internal/metricstoreclient**: Client for cc-metric-store queries + +### Frontend Structure + +- **web/frontend**: Svelte 5 application + - Uses Rollup for building + - Components organized by feature (analysis, job, user, etc.) + - GraphQL client using @urql/svelte + - Bootstrap 5 + SvelteStrap for UI + - uPlot for time-series visualization +- **web/templates**: Server-side Go templates + +### Key Concepts + +**Job Archive**: Completed jobs are stored in a file-based archive following the +[ClusterCockpit job-archive +specification](https://github.com/ClusterCockpit/cc-specifications/tree/master/job-archive). +Each job has a `meta.json` file with metadata and metric data files. 
+ +**Metric Data Repositories**: Time-series metric data is stored separately from +job metadata. The system supports multiple backends (cc-metric-store is +recommended). Configuration is per-cluster in `config.json`. + +**Authentication Flow**: + +1. Multiple authenticators can be configured (local, LDAP, OIDC, JWT) +2. Each authenticator's `CanLogin` method is called to determine if it should handle the request +3. The first authenticator that returns true performs the actual `Login` +4. JWT tokens are used for API authentication + +**Database Migrations**: SQL migrations in `internal/repository/migrations/sqlite3/` are +applied automatically on startup. Version tracking in `version` table. + +**Scopes**: Metrics can be collected at different scopes: + +- Node scope (always available) +- Core scope (for jobs with ≤8 nodes) +- Accelerator scope (for GPU/accelerator metrics) + +## Configuration + +- **config.json**: Main configuration (clusters, metric repositories, archive settings) + - `main.apiSubjects`: NATS subject configuration (optional) + - `subjectJobEvent`: Subject for job start/stop events (e.g., "cc.job.event") + - `subjectNodeState`: Subject for node state updates (e.g., "cc.node.state") + - `nats`: NATS client connection configuration (optional) + - `address`: NATS server address (e.g., "nats://localhost:4222") + - `username`: Authentication username (optional) + - `password`: Authentication password (optional) + - `creds-file-path`: Path to NATS credentials file (optional) +- **.env**: Environment variables (secrets like JWT keys) + - Copy from `configs/env-template.txt` + - NEVER commit this file +- **cluster.json**: Cluster topology and metric definitions (loaded from archive or config) + +## Database + +- Default: SQLite 3 (`./var/job.db`) +- Connection managed by `internal/repository` +- Schema version in `internal/repository/migration.go` + +## Code Generation + +**GraphQL** (gqlgen): + +- Schema: `api/schema.graphqls` +- Config: `gqlgen.yml` +- 
Generated code: `internal/graph/generated/` +- Custom resolvers: `internal/graph/schema.resolvers.go` +- Run `make graphql` after schema changes + +**Swagger/OpenAPI**: + +- Annotations in `internal/api/*.go` +- Generated docs: `internal/api/docs.go`, `api/swagger.yaml` +- Run `make swagger` after API changes + +## Testing Conventions + +- Test files use `_test.go` suffix +- Test data in `testdata/` subdirectories +- Repository tests use in-memory SQLite +- API tests use httptest + +## Common Workflows + +### Adding a new GraphQL field + +1. Edit schema in `api/schema.graphqls` +2. Run `make graphql` +3. Implement resolver in `internal/graph/schema.resolvers.go` + +### Adding a new REST endpoint + +1. Add handler in `internal/api/*.go` +2. Add route in `internal/api/rest.go` +3. Add Swagger annotations +4. Run `make swagger` + +### Adding a new metric data backend + +1. Implement metric loading functions in `pkg/metricstore/query.go` +2. Add cluster configuration to metric store initialization +3. Update config.json schema documentation + +### Modifying database schema + +1. Create new migration in `internal/repository/migrations/sqlite3/` +2. Increment `repository.Version` +3. Test with fresh database and existing database + +## NATS API + +The backend supports a NATS-based API as an alternative to the REST API for job and node operations. + +### Setup + +1. Configure NATS client connection in `config.json`: + ```json + { + "nats": { + "address": "nats://localhost:4222", + "username": "user", + "password": "pass" + } + } + ``` + +2. 
Configure API subjects in `config.json` under `main`: + ```json + { + "main": { + "apiSubjects": { + "subjectJobEvent": "cc.job.event", + "subjectNodeState": "cc.node.state" + } + } + } + ``` + +### Message Format + +Messages use **InfluxDB line protocol** format with the following structure: + +#### Job Events + +**Start Job:** +``` +job,function=start_job event="{\"jobId\":123,\"user\":\"alice\",\"cluster\":\"test\", ...}" 1234567890000000000 +``` + +**Stop Job:** +``` +job,function=stop_job event="{\"jobId\":123,\"cluster\":\"test\",\"startTime\":1234567890,\"stopTime\":1234571490,\"jobState\":\"completed\"}" 1234571490000000000 +``` + +**Tags:** +- `function`: Either `start_job` or `stop_job` + +**Fields:** +- `event`: JSON payload containing job data (see REST API documentation for schema) + +#### Node State Updates + +```json +{ + "cluster": "testcluster", + "nodes": [ + { + "hostname": "node001", + "states": ["allocated"], + "cpusAllocated": 8, + "memoryAllocated": 16384, + "gpusAllocated": 0, + "jobsRunning": 1 + } + ] +} +``` + +### Implementation Notes + +- NATS API mirrors REST API functionality but uses messaging +- Job start/stop events are processed asynchronously +- Duplicate job detection is handled (same as REST API) +- All validation rules from REST API apply +- Messages are logged; no responses are sent back to publishers +- If NATS client is unavailable, API subscriptions are skipped (logged as warning) + +## Dependencies + +- Go 1.24.0+ (check go.mod for exact version) +- Node.js (for frontend builds) +- SQLite 3 (only supported database) +- Optional: NATS server for NATS API integration diff --git a/Makefile b/Makefile index 0e19095a..5829beae 100644 --- a/Makefile +++ b/Makefile @@ -1,6 +1,6 @@ TARGET = ./cc-backend FRONTEND = ./web/frontend -VERSION = 1.4.4 +VERSION = 1.5.0 GIT_HASH := $(shell git rev-parse --short HEAD || echo 'development') CURRENT_TIME = $(shell date +"%Y-%m-%d:T%H:%M:%S") LD_FLAGS = '-s -X main.date=${CURRENT_TIME} -X 
main.version=${VERSION} -X main.commit=${GIT_HASH}' @@ -46,16 +46,16 @@ $(TARGET): $(SVELTE_TARGETS) frontend: $(info ===> BUILD frontend) - cd web/frontend && npm install && npm run build + cd web/frontend && npm ci && npm run build swagger: $(info ===> GENERATE swagger) - @go run github.com/swaggo/swag/cmd/swag init --parseDependency -d ./internal/api -g rest.go -o ./api + @go tool github.com/swaggo/swag/cmd/swag init --parseDependency -d ./internal/api -g rest.go -o ./api @mv ./api/docs.go ./internal/api/docs.go graphql: $(info ===> GENERATE graphql) - @go run github.com/99designs/gqlgen + @go tool github.com/99designs/gqlgen clean: $(info ===> CLEAN) @@ -84,4 +84,4 @@ $(VAR): $(SVELTE_TARGETS): $(SVELTE_SRC) $(info ===> BUILD frontend) - cd web/frontend && npm install && npm run build + cd web/frontend && npm ci && npm run build diff --git a/README.md b/README.md index 0799bd92..3306f838 100644 --- a/README.md +++ b/README.md @@ -22,19 +22,23 @@ switching from PHP Symfony to a Golang based solution are explained ## Overview This is a Golang web backend for the ClusterCockpit job-specific performance -monitoring framework. It provides a REST API for integrating ClusterCockpit with -an HPC cluster batch system and external analysis scripts. Data exchange between -the web front-end and the back-end is based on a GraphQL API. The web frontend -is also served by the backend using [Svelte](https://svelte.dev/) components. -Layout and styling are based on [Bootstrap 5](https://getbootstrap.com/) using +monitoring framework. It provides a REST API and an optional NATS-based messaging +API for integrating ClusterCockpit with an HPC cluster batch system and external +analysis scripts. Data exchange between the web front-end and the back-end is +based on a GraphQL API. The web frontend is also served by the backend using +[Svelte](https://svelte.dev/) components. 
Layout and styling are based on +[Bootstrap 5](https://getbootstrap.com/) using [Bootstrap Icons](https://icons.getbootstrap.com/). -The backend uses [SQLite 3](https://sqlite.org/) as a relational SQL database by -default. Optionally it can use a MySQL/MariaDB database server. While there are -metric data backends for the InfluxDB and Prometheus time series databases, the -only tested and supported setup is to use cc-metric-store as the metric data -backend. Documentation on how to integrate ClusterCockpit with other time series -databases will be added in the future. +The backend uses [SQLite 3](https://sqlite.org/) as the relational SQL database. +While there are metric data backends for the InfluxDB and Prometheus time series +databases, the only tested and supported setup is to use cc-metric-store as the +metric data backend. Documentation on how to integrate ClusterCockpit with other +time series databases will be added in the future. + +For real-time integration with HPC systems, the backend can subscribe to +[NATS](https://nats.io/) subjects to receive job start/stop events and node +state updates, providing an alternative to REST API polling. Completed batch jobs are stored in a file-based job archive according to [this specification](https://github.com/ClusterCockpit/cc-specifications/tree/master/job-archive). @@ -131,27 +135,59 @@ ln -s ./var/job-archive ## Project file structure +- [`.github/`](https://github.com/ClusterCockpit/cc-backend/tree/master/.github) + GitHub Actions workflows and dependabot configuration for CI/CD. - [`api/`](https://github.com/ClusterCockpit/cc-backend/tree/master/api) contains the API schema files for the REST and GraphQL APIs. The REST API is documented in the OpenAPI 3.0 format in - [./api/openapi.yaml](./api/openapi.yaml). + [./api/swagger.yaml](./api/swagger.yaml). The GraphQL schema is in + [./api/schema.graphqls](./api/schema.graphqls). 
- [`cmd/cc-backend`](https://github.com/ClusterCockpit/cc-backend/tree/master/cmd/cc-backend) - contains `main.go` for the main application. + contains the main application entry point and CLI implementation. - [`configs/`](https://github.com/ClusterCockpit/cc-backend/tree/master/configs) contains documentation about configuration and command line options and required - environment variables. A sample configuration file is provided. -- [`docs/`](https://github.com/ClusterCockpit/cc-backend/tree/master/docs) - contains more in-depth documentation. + environment variables. Sample configuration files are provided. - [`init/`](https://github.com/ClusterCockpit/cc-backend/tree/master/init) contains an example of setting up systemd for production use. - [`internal/`](https://github.com/ClusterCockpit/cc-backend/tree/master/internal) contains library source code that is not intended for use by others. + - [`api`](https://github.com/ClusterCockpit/cc-backend/tree/master/internal/api) + REST API handlers and NATS integration + - [`archiver`](https://github.com/ClusterCockpit/cc-backend/tree/master/internal/archiver) + Job archiving functionality + - [`auth`](https://github.com/ClusterCockpit/cc-backend/tree/master/internal/auth) + Authentication (local, LDAP, OIDC) and JWT token handling + - [`config`](https://github.com/ClusterCockpit/cc-backend/tree/master/internal/config) + Configuration management and validation + - [`graph`](https://github.com/ClusterCockpit/cc-backend/tree/master/internal/graph) + GraphQL schema and resolvers + - [`importer`](https://github.com/ClusterCockpit/cc-backend/tree/master/internal/importer) + Job data import and database initialization + - [`metricdispatch`](https://github.com/ClusterCockpit/cc-backend/tree/master/internal/metricdispatch) + Dispatches metric data loading to appropriate backends + - [`repository`](https://github.com/ClusterCockpit/cc-backend/tree/master/internal/repository) + Database repository layer for jobs and metadata + 
 - [`routerConfig`](https://github.com/ClusterCockpit/cc-backend/tree/master/internal/routerConfig) + HTTP router configuration and middleware + - [`tagger`](https://github.com/ClusterCockpit/cc-backend/tree/master/internal/tagger) + Job classification and application detection + - [`taskmanager`](https://github.com/ClusterCockpit/cc-backend/tree/master/internal/taskmanager) + Background task management and scheduled jobs + - [`metricstoreclient`](https://github.com/ClusterCockpit/cc-backend/tree/master/internal/metricstoreclient) + Client for cc-metric-store queries - [`pkg/`](https://github.com/ClusterCockpit/cc-backend/tree/master/pkg) contains Go packages that can be used by other projects. + - [`archive`](https://github.com/ClusterCockpit/cc-backend/tree/master/pkg/archive) + Job archive backend implementations (filesystem, S3, SQLite) + - [`metricstore`](https://github.com/ClusterCockpit/cc-backend/tree/master/pkg/metricstore) + In-memory metric data store with checkpointing and metric loading - [`tools/`](https://github.com/ClusterCockpit/cc-backend/tree/master/tools) Additional command line helper tools. - [`archive-manager`](https://github.com/ClusterCockpit/cc-backend/tree/master/tools/archive-manager) - Commands for getting infos about and existing job archive. + Commands for getting information about an existing job archive, importing jobs + between archive backends, and converting archives between JSON and Parquet formats. + - [`archive-migration`](https://github.com/ClusterCockpit/cc-backend/tree/master/tools/archive-migration) + Tool for migrating job archives between formats. - [`convert-pem-pubkey`](https://github.com/ClusterCockpit/cc-backend/tree/master/tools/convert-pem-pubkey) Tool to convert external pubkey for use in `cc-backend`. 
- [`gen-keypair`](https://github.com/ClusterCockpit/cc-backend/tree/master/tools/gen-keypair) @@ -163,7 +199,7 @@ ln -s ./var/job-archive - [`frontend`](https://github.com/ClusterCockpit/cc-backend/tree/master/web/frontend) Svelte components and static assets for the frontend UI - [`templates`](https://github.com/ClusterCockpit/cc-backend/tree/master/web/templates) - Server-side Go templates + Server-side Go templates, including monitoring views - [`gqlgen.yml`](https://github.com/ClusterCockpit/cc-backend/blob/master/gqlgen.yml) Configures the behaviour and generation of [gqlgen](https://github.com/99designs/gqlgen). diff --git a/ReleaseNotes.md b/ReleaseNotes.md index 860f62a4..5447167e 100644 --- a/ReleaseNotes.md +++ b/ReleaseNotes.md @@ -1,47 +1,279 @@ -# `cc-backend` version 1.4.4 +# `cc-backend` version 1.5.0 -Supports job archive version 2 and database version 8. +Supports job archive version 3 and database version 10. -This is a bug fix release of `cc-backend`, the API backend and frontend +This is a feature release of `cc-backend`, the API backend and frontend implementation of ClusterCockpit. For release specific notes visit the [ClusterCockpit Documentation](https://clusterockpit.org/docs/release/). ## Breaking changes -The option `apiAllowedIPs` is now a required configuration attribute in -`config.json`. This option restricts access to the admin API. +### Configuration changes -To retain the previous behavior that the API is per default accessible from -everywhere set: +- **JSON attribute naming**: All JSON configuration attributes now use `kebab-case` + style consistently (e.g., `api-allowed-ips` instead of `apiAllowedIPs`). + Update your `config.json` accordingly. +- **Removed `disable-archive` option**: This obsolete configuration option has been removed. +- **Removed `clusters` config section**: The separate clusters configuration section + has been removed. Cluster information is now derived from the job archive. 
+- **`apiAllowedIPs` is now optional**: If not specified, defaults to not + restricted. + +### Architecture changes + +- **Web framework replaced**: Migrated from `gorilla/mux` to `chi` as the HTTP + router. This should be transparent to users but affects how middleware and + routes are composed. A proper 404 handler is now in place. +- **MetricStore moved**: The `metricstore` package has been moved from `internal/` + to `pkg/` as it is now part of the public API. +- **MySQL/MariaDB support removed**: Only SQLite is now supported as the database backend. +- **Archive to Cleanup renaming**: Archive-related functions have been refactored + and renamed to "Cleanup" for clarity. +- **`minRunningFor` filter removed**: This undocumented filter has been removed + from the API and frontend. + +### Dependency changes + +- **cc-lib v2.5.1**: Switched to cc-lib version 2 with updated APIs (currently at v2.5.1) +- **cclib NATS client**: Now using the cclib NATS client implementation +- Removed obsolete `util.Float` usage from cclib + +## Major new features + +### NATS API Integration + +- **Real-time job events**: Subscribe to job start/stop events via NATS +- **Node state updates**: Receive real-time node state changes via NATS +- **Configurable subjects**: NATS API subjects are now configurable via `api-subjects` +- **Deadlock fixes**: Improved NATS client stability and graceful shutdown + +### Public Dashboard + +- **Public-facing interface**: New public dashboard route for external users +- **DoubleMetricPlot component**: New visualization component for comparing metrics +- **Improved layout**: Reviewed and optimized dashboard layouts for better readability + +### Enhanced Node Management + +- **Node state tracking**: New node table in database with timestamp tracking +- **Node state filtering**: Filter jobs by node state in systems view +- **Node list enhancements**: Improved paging, filtering, and continuous scroll support +- **Nodestate retention and archiving**: Node 
state data is now subject to configurable + retention policies and can be archived to Parquet format for long-term storage +- **Faulty node metric tracking**: Faulty node state metric lists are persisted to the database + +### Health Monitoring + +- **Health status dashboard**: New dedicated "Health" tab in the status details view + showing per-node metric health across the cluster +- **CCMS health check**: Support for querying health status of external + cc-metric-store (CCMS) instances via the API +- **GraphQL health endpoints**: New GraphQL queries and resolvers for health data +- **Cluster/subcluster filter**: Filter health status view by cluster or subcluster + +### Log Viewer + +- **Web-based log viewer**: New log viewer page in the admin interface for inspecting + backend log output directly from the browser without shell access +- **Accessible from header**: Quick access link from the navigation header + +### MetricStore Improvements + +- **Memory tracking worker**: New worker for CCMS memory usage tracking +- **Dynamic retention**: Support for job specific dynamic retention times +- **Improved compression**: Transparent compression for job archive imports +- **Parallel processing**: Parallelized Iter function in all archive backends + +### Job Tagging System + +- **Job tagger option**: Enable automatic job tagging via configuration flag +- **Application detection**: Automatic detection of applications (MATLAB, GROMACS, etc.) 
+- **Job classification**: Automatic detection of pathological jobs +- **omit-tagged**: Option to exclude tagged jobs from retention/cleanup operations (`none`, `all`, or `user`) +- **Admin UI trigger**: Taggers can be run on-demand from the admin web interface + without restarting the backend + +### Archive Backends + +- **Parquet archive format**: New Parquet file format for job archiving, providing + columnar storage with efficient compression for analytical workloads +- **S3 backend**: Full support for S3-compatible object storage +- **SQLite backend**: Full support for SQLite backend using blobs +- **Performance improvements**: Fixed performance bugs in archive backends +- **Better error handling**: Improved error messages and fallback handling +- **Zstd compression**: Parquet writers use zstd compression for better + compression ratios compared to the previous snappy default +- **Optimized sort order**: Job and nodestate Parquet files are sorted by + cluster, subcluster, and start time for efficient range queries + +### Unified Archive Retention and Format Conversion + +- **Uniform retention policy**: Job archive retention now supports both JSON and + Parquet as target formats under a single, consistent policy configuration +- **Archive manager tool**: The `tools/archive-manager` utility now supports + format conversion between JSON and Parquet job archives +- **Parquet reader**: Full Parquet archive reader implementation for reading back + archived job data + +## New features and improvements + +### Frontend + +- **Loading indicators**: Added loading indicators to status detail and job lists +- **Job info layout**: Reviewed and improved job info row layout +- **Metric selection**: Enhanced metric selection with drag-and-drop fixes +- **Filter presets**: Move list filter preset to URL for easy sharing +- **Job comparison**: Improved job comparison views and plots +- **Subcluster reactivity**: Job list now reacts to subcluster filter changes +- **Short jobs 
quick selection**: New "Short jobs" quick-filter button in job lists + replaces the removed undocumented `minRunningFor` filter +- **Row plot cursor sync**: Cursor position is now synchronized across all metric + plots in a job list row for easier cross-metric comparison +- **Disabled metrics handling**: Improved handling and display of disabled metrics + across job view, node view, and list rows +- **"Not configured" info cards**: Informational cards shown when optional features + are not yet configured +- **Frontend dependencies**: Bumped frontend dependencies to latest versions +- **Svelte 5 compatibility**: Fixed Svelte state warnings and compatibility issues + +### Backend + +- **Progress bars**: Import function now shows progress during long operations +- **Better logging**: Improved logging with appropriate log levels throughout +- **Graceful shutdown**: Fixed shutdown timeout bugs and hanging issues +- **Configuration defaults**: Sensible defaults for most configuration options +- **Documentation**: Extensive documentation improvements across packages +- **Server flag in systemd unit**: Example systemd unit now includes the `-server` flag + +### Security + +- **LDAP security hardening**: Improved input validation, connection handling, and + error reporting in the LDAP authenticator +- **OIDC security hardening**: Stricter token validation and improved error handling + in the OIDC authenticator +- **Auth schema extensions**: Additional schema fields for improved auth configuration + +### API improvements + +- **Role-based metric visibility**: Metrics can now have role-based access control +- **Job exclusivity filter**: New filter for exclusive vs. 
shared jobs +- **Improved error messages**: Better error messages and documentation in REST API +- **GraphQL enhancements**: Improved GraphQL queries and resolvers +- **Stop job lookup order**: Reversed lookup order in stop job requests for + more reliable job matching (cluster+jobId first, then jobId alone) + +### Performance + +- **Database indices**: Optimized SQLite indices for better query performance +- **Job cache**: Introduced caching table for faster job inserts +- **Parallel imports**: Archive imports now run in parallel where possible +- **External tool integration**: Optimized use of external tools (fd) for better performance +- **Node repository queries**: Reviewed and optimized node repository SQL queries +- **Buffer pool**: Resized and pooled internal buffers for better memory reuse + +### Developer experience + +- **AI agent guidelines**: Added documentation for AI coding agents (AGENTS.md, CLAUDE.md) +- **Example API payloads**: Added example JSON API payloads for testing +- **Unit tests**: Added more unit tests for NATS API, node repository, and other components +- **Test improvements**: Better test coverage; test DB is now copied before unit tests + to avoid state pollution between test runs +- **Parquet writer tests**: Comprehensive tests for Parquet archive writing and conversion + +## Bug fixes + +- Fixed nodelist paging issues +- Fixed metric select drag and drop functionality +- Fixed render race conditions in nodeList +- Fixed tag count grouping including type +- Fixed wrong metricstore schema (missing comma) +- Fixed configuration issues causing shutdown hangs +- Fixed deadlock when NATS is not configured +- Fixed archive backend performance bugs +- Fixed continuous scroll buildup on refresh +- Improved footprint calculation logic +- Fixed polar plot data query decoupling +- Fixed missing resolution parameter handling +- Fixed node table initialization fallback +- Fixed reactivity key placement in nodeList +- Fixed nodeList resolver data 
handling and increased nodestate filter cutoff +- Fixed job always being transferred to main job table before archiving +- Fixed AppTagger error handling and logging +- Fixed log endpoint formatting and correctness +- Fixed automatic refresh in metric status tab +- Fixed NULL value handling in `health_state` and `health_metrics` columns +- Fixed bugs related to `job_cache` IDs being used in the main job table +- Fixed SyncJobs bug causing start job hooks to be called with wrong (cache) IDs +- Fixed 404 handler route for sub-routers + +## Configuration changes + +### New configuration options ```json - "apiAllowedIPs": [ - "*" - ] +{ + "main": { + "enable-job-taggers": true, + "resampling": { + "minimum-points": 600, + "trigger": 180, + "resolutions": [240, 60] + }, + "api-subjects": { + "subject-job-event": "cc.job.event", + "subject-node-state": "cc.node.state" + } + }, + "nats": { + "address": "nats://0.0.0.0:4222", + "username": "root", + "password": "root" + }, + "cron": { + "commit-job-worker": "1m", + "duration-worker": "5m", + "footprint-worker": "10m" + }, + "metric-store": { + "cleanup": { + "mode": "archive", + "interval": "48h", + "directory": "./var/archive" + } + }, + "archive": { + "retention": { + "policy": "delete", + "age": "6months", + "target-format": "parquet" + } + }, + "nodestate": { + "retention": { + "policy": "archive", + "age": "30d", + "archive-path": "./var/nodestate-archive" + } + } +} ``` -## Breaking changes for minor release 1.4.x +## Migration notes -- You need to perform a database migration. Depending on your database size the - migration might require several hours! -- You need to adapt the `cluster.json` configuration files in the job-archive, - add new required attributes to the metric list and after that edit - `./job-archive/version.txt` to version 2. Only metrics that have the footprint - attribute set can be filtered and show up in the footprint UI and polar plot. -- Continuous scrolling is default now in all job lists. 
You can change this back - to paging globally, also every user can configure to use paging or continuous - scrolling individually. -- Tags have a scope now. Existing tags will get global scope in the database - migration. - -## New features - -- Enable to delete tags from the web interface +- Review and update your `config.json` to use kebab-case attribute names +- If using NATS, configure the new `nats` and `api-subjects` sections +- If using S3 archive backend, configure the new `archive` section options +- Test the new public dashboard at `/public` route +- Review cron worker configuration if you need different frequencies +- If using the archive retention feature, configure the `target-format` option + to choose between `json` (default) and `parquet` output formats +- Consider enabling nodestate retention if you track node states over time ## Known issues +- The new dynamic memory management is not bulletproof yet across restarts. We + will fix that in a subsequent patch release - Currently energy footprint metrics of type energy are ignored for calculating total energy. -- Resampling for running jobs only works with cc-metric-store - With energy footprint metrics of type power the unit is ignored and it is assumed the metric has the unit Watt. diff --git a/api/schema.graphqls b/api/schema.graphqls index 8f5e1c7c..e4e2b8ed 100644 --- a/api/schema.graphqls +++ b/api/schema.graphqls @@ -19,6 +19,7 @@ type Node { schedulerState: SchedulerState! healthState: MonitoringState! metaData: Any + healthData: Any } type NodeStates { @@ -164,6 +165,13 @@ type JobMetricWithName { metric: JobMetric! } +type ClusterMetricWithName { + name: String! + unit: Unit + timestep: Int! + data: [NullableFloat!]! +} + type JobMetric { unit: Unit timestep: Int! @@ -267,6 +275,11 @@ type NodeMetrics { metrics: [JobMetricWithName!]! } +type ClusterMetrics { + nodeCount: Int! + metrics: [ClusterMetricWithName!]! +} + type NodesResultList { items: [NodeMetrics!]! 
offset: Int @@ -316,6 +329,7 @@ type Query { ## Node Queries New node(id: ID!): Node nodes(filter: [NodeFilter!], order: OrderByInput): NodeStateResultList! + nodesWithMeta(filter: [NodeFilter!], order: OrderByInput): NodeStateResultList! nodeStates(filter: [NodeFilter!]): [NodeStates!]! nodeStatesTimed(filter: [NodeFilter!], type: String!): [NodeStatesTimed!]! @@ -385,6 +399,13 @@ type Query { page: PageRequest resolution: Int ): NodesResultList! + + clusterMetrics( + cluster: String! + metrics: [String!] + from: Time! + to: Time! + ): ClusterMetrics! } type Mutation { @@ -410,7 +431,7 @@ type TimeRangeOutput { input NodeFilter { hostname: StringInput cluster: StringInput - subcluster: StringInput + subCluster: StringInput schedulerState: SchedulerState healthState: MonitoringState timeStart: Int @@ -425,6 +446,7 @@ input JobFilter { project: StringInput jobName: StringInput cluster: StringInput + subCluster: StringInput partition: StringInput duration: IntRange energy: FloatRange @@ -439,6 +461,7 @@ input JobFilter { state: [JobState!] metricStats: [MetricStatItem!] shared: String + schedule: String node: StringInput } diff --git a/api/swagger.json b/api/swagger.json index 0327a91d..c9c36de1 100644 --- a/api/swagger.json +++ b/api/swagger.json @@ -18,11 +18,6 @@ "paths": { "/api/clusters/": { "get": { - "security": [ - { - "ApiKeyAuth": [] - } - ], "description": "Get a list of all cluster configs. Specific cluster can be requested using query parameter.", "produces": [ "application/json" @@ -43,7 +38,7 @@ "200": { "description": "Array of clusters", "schema": { - "$ref": "#/definitions/api.GetClustersApiResponse" + "$ref": "#/definitions/api.GetClustersAPIResponse" } }, "400": { @@ -70,16 +65,16 @@ "$ref": "#/definitions/api.ErrorResponse" } } - } - } - }, - "/api/jobs/": { - "get": { + }, "security": [ { "ApiKeyAuth": [] } - ], + ] + } + }, + "/api/jobs/": { + "get": { "description": "Get a list of all jobs. 
Filters can be applied using query parameters.\nNumber of results can be limited by page. Results are sorted by descending startTime.", "produces": [ "application/json" @@ -138,7 +133,7 @@ "200": { "description": "Job array and page info", "schema": { - "$ref": "#/definitions/api.GetJobsApiResponse" + "$ref": "#/definitions/api.GetJobsAPIResponse" } }, "400": { @@ -165,16 +160,16 @@ "$ref": "#/definitions/api.ErrorResponse" } } - } - } - }, - "/api/jobs/delete_job/": { - "delete": { + }, "security": [ { "ApiKeyAuth": [] } - ], + ] + } + }, + "/api/jobs/delete_job/": { + "delete": { "description": "Job to delete is specified by request body. All fields are required in this case.", "consumes": [ "application/json" @@ -193,7 +188,7 @@ "in": "body", "required": true, "schema": { - "$ref": "#/definitions/api.DeleteJobApiRequest" + "$ref": "#/definitions/api.DeleteJobAPIRequest" } } ], @@ -201,7 +196,7 @@ "200": { "description": "Success message", "schema": { - "$ref": "#/definitions/api.DefaultApiResponse" + "$ref": "#/definitions/api.DefaultAPIResponse" } }, "400": { @@ -240,16 +235,16 @@ "$ref": "#/definitions/api.ErrorResponse" } } - } - } - }, - "/api/jobs/delete_job/{id}": { - "delete": { + }, "security": [ { "ApiKeyAuth": [] } - ], + ] + } + }, + "/api/jobs/delete_job/{id}": { + "delete": { "description": "Job to remove is specified by database ID. This will not remove the job from the job archive.", "produces": [ "application/json" @@ -271,7 +266,7 @@ "200": { "description": "Success message", "schema": { - "$ref": "#/definitions/api.DefaultApiResponse" + "$ref": "#/definitions/api.DefaultAPIResponse" } }, "400": { @@ -310,16 +305,16 @@ "$ref": "#/definitions/api.ErrorResponse" } } - } - } - }, - "/api/jobs/delete_job_before/{ts}": { - "delete": { + }, "security": [ { "ApiKeyAuth": [] } - ], + ] + } + }, + "/api/jobs/delete_job_before/{ts}": { + "delete": { "description": "Remove all jobs with start time before timestamp. 
The jobs will not be removed from the job archive.", "produces": [ "application/json" @@ -335,13 +330,19 @@ "name": "ts", "in": "path", "required": true + }, + { + "type": "boolean", + "description": "Omit jobs with tags from deletion", + "name": "omit-tagged", + "in": "query" } ], "responses": { "200": { "description": "Success message", "schema": { - "$ref": "#/definitions/api.DefaultApiResponse" + "$ref": "#/definitions/api.DefaultAPIResponse" } }, "400": { @@ -380,16 +381,79 @@ "$ref": "#/definitions/api.ErrorResponse" } } - } + }, + "security": [ + { + "ApiKeyAuth": [] + } + ] } }, - "/api/jobs/edit_meta/{id}": { - "post": { + "/api/jobs/edit_meta/": { + "patch": { "security": [ { "ApiKeyAuth": [] } ], + "description": "Edit key value pairs in metadata json of job specified by jobID, StartTime and Cluster\nIf a key already exists its content will be overwritten", + "consumes": [ + "application/json" + ], + "produces": [ + "application/json" + ], + "tags": [ + "Job add and modify" + ], + "summary": "Edit meta-data json by request", + "parameters": [ + { + "description": "Specifies job and payload to add or update", + "name": "request", + "in": "body", + "required": true, + "schema": { + "$ref": "#/definitions/api.JobMetaRequest" + } + } + ], + "responses": { + "200": { + "description": "Updated job resource", + "schema": { + "$ref": "#/definitions/schema.Job" + } + }, + "400": { + "description": "Bad Request", + "schema": { + "$ref": "#/definitions/api.ErrorResponse" + } + }, + "401": { + "description": "Unauthorized", + "schema": { + "$ref": "#/definitions/api.ErrorResponse" + } + }, + "404": { + "description": "Job does not exist", + "schema": { + "$ref": "#/definitions/api.ErrorResponse" + } + }, + "500": { + "description": "Internal Server Error", + "schema": { + "$ref": "#/definitions/api.ErrorResponse" + } + } + } + } + }, + "/api/jobs/edit_meta/{id}": { + "patch": { "description": "Edit key value pairs in job metadata json\nIf a key already exists its 
content will be overwritten", "consumes": [ "application/json" @@ -450,16 +514,16 @@ "$ref": "#/definitions/api.ErrorResponse" } } - } - } - }, - "/api/jobs/start_job/": { - "post": { + }, "security": [ { "ApiKeyAuth": [] } - ], + ] + } + }, + "/api/jobs/start_job/": { + "post": { "description": "Job specified in request body will be saved to database as \"running\" with new DB ID.\nJob specifications follow the 'Job' scheme, API will fail to execute if requirements are not met.", "consumes": [ "application/json" @@ -486,7 +550,7 @@ "201": { "description": "Job added successfully", "schema": { - "$ref": "#/definitions/api.DefaultApiResponse" + "$ref": "#/definitions/api.DefaultAPIResponse" } }, "400": { @@ -519,16 +583,16 @@ "$ref": "#/definitions/api.ErrorResponse" } } - } - } - }, - "/api/jobs/stop_job/": { - "post": { + }, "security": [ { "ApiKeyAuth": [] } - ], + ] + } + }, + "/api/jobs/stop_job/": { + "post": { "description": "Job to stop is specified by request body. All fields are required in this case.\nReturns full job resource information according to 'Job' scheme.", "produces": [ "application/json" @@ -544,7 +608,7 @@ "in": "body", "required": true, "schema": { - "$ref": "#/definitions/api.StopJobApiRequest" + "$ref": "#/definitions/api.StopJobAPIRequest" } } ], @@ -591,16 +655,16 @@ "$ref": "#/definitions/api.ErrorResponse" } } - } - } - }, - "/api/jobs/tag_job/{id}": { - "post": { + }, "security": [ { "ApiKeyAuth": [] } - ], + ] + } + }, + "/api/jobs/tag_job/{id}": { + "post": { "description": "Adds tag(s) to a job specified by DB ID. 
Name and Type of Tag(s) can be chosen freely.\nTag Scope for frontend visibility will default to \"global\" if none entered, other options: \"admin\" or specific username.\nIf tagged job is already finished: Tag will be written directly to respective archive files.", "consumes": [ "application/json" @@ -628,7 +692,7 @@ "schema": { "type": "array", "items": { - "$ref": "#/definitions/api.ApiTag" + "$ref": "#/definitions/api.APITag" } } } @@ -664,16 +728,16 @@ "$ref": "#/definitions/api.ErrorResponse" } } - } - } - }, - "/api/jobs/{id}": { - "get": { + }, "security": [ { "ApiKeyAuth": [] } - ], + ] + } + }, + "/api/jobs/{id}": { + "get": { "description": "Job to get is specified by database ID\nReturns full job resource information according to 'Job' scheme and all metrics according to 'JobData'.", "produces": [ "application/json" @@ -701,7 +765,7 @@ "200": { "description": "Job resource", "schema": { - "$ref": "#/definitions/api.GetJobApiResponse" + "$ref": "#/definitions/api.GetJobAPIResponse" } }, "400": { @@ -740,14 +804,14 @@ "$ref": "#/definitions/api.ErrorResponse" } } - } - }, - "post": { + }, "security": [ { "ApiKeyAuth": [] } - ], + ] + }, + "post": { "description": "Job to get is specified by database ID\nReturns full job resource information according to 'Job' scheme and all metrics according to 'JobData'.", "consumes": [ "application/json" @@ -784,7 +848,7 @@ "200": { "description": "Job resource", "schema": { - "$ref": "#/definitions/api.GetJobApiResponse" + "$ref": "#/definitions/api.GetJobAPIResponse" } }, "400": { @@ -823,16 +887,16 @@ "$ref": "#/definitions/api.ErrorResponse" } } - } - } - }, - "/api/nodestats/": { - "post": { + }, "security": [ { "ApiKeyAuth": [] } - ], + ] + } + }, + "/api/nodestats/": { + "post": { "description": "Returns a JSON-encoded list of users.\nRequired query-parameter defines if all users or only users with additional special roles are returned.", "produces": [ "application/json" @@ -856,7 +920,7 @@ "200": { 
"description": "Success message", "schema": { - "$ref": "#/definitions/api.DefaultApiResponse" + "$ref": "#/definitions/api.DefaultAPIResponse" } }, "400": { @@ -883,16 +947,86 @@ "$ref": "#/definitions/api.ErrorResponse" } } - } - } - }, - "/api/users/": { - "get": { + }, "security": [ { "ApiKeyAuth": [] } + ] + } + }, + "/api/user/{id}": { + "post": { + "description": "Allows admins to add/remove roles and projects for a user", + "produces": [ + "text/plain" ], + "tags": [ + "User" + ], + "summary": "Update user roles and projects", + "parameters": [ + { + "type": "string", + "description": "Username", + "name": "id", + "in": "path", + "required": true + }, + { + "type": "string", + "description": "Role to add", + "name": "add-role", + "in": "formData" + }, + { + "type": "string", + "description": "Role to remove", + "name": "remove-role", + "in": "formData" + }, + { + "type": "string", + "description": "Project to add", + "name": "add-project", + "in": "formData" + }, + { + "type": "string", + "description": "Project to remove", + "name": "remove-project", + "in": "formData" + } + ], + "responses": { + "200": { + "description": "Success message", + "schema": { + "type": "string" + } + }, + "403": { + "description": "Forbidden", + "schema": { + "$ref": "#/definitions/api.ErrorResponse" + } + }, + "422": { + "description": "Unprocessable Entity", + "schema": { + "$ref": "#/definitions/api.ErrorResponse" + } + } + }, + "security": [ + { + "ApiKeyAuth": [] + } + ] + } + }, + "/api/users/": { + "get": { "description": "Returns a JSON-encoded list of users.\nRequired query-parameter defines if all users or only users with additional special roles are returned.", "produces": [ "application/json" @@ -916,7 +1050,7 @@ "schema": { "type": "array", "items": { - "$ref": "#/definitions/api.ApiReturnedUser" + "$ref": "#/definitions/api.APIReturnedUser" } } }, @@ -944,16 +1078,361 @@ "type": "string" } } - } - } - }, - "/jobs/tag_job/{id}": { - "delete": { + }, "security": [ { 
"ApiKeyAuth": [] } + ] + }, + "post": { + "description": "Creates a new user with specified credentials and role", + "produces": [ + "text/plain" ], + "tags": [ + "User" + ], + "summary": "Create a new user", + "parameters": [ + { + "type": "string", + "description": "Username", + "name": "username", + "in": "formData", + "required": true + }, + { + "type": "string", + "description": "Password (not required for API users)", + "name": "password", + "in": "formData" + }, + { + "type": "string", + "description": "User role", + "name": "role", + "in": "formData", + "required": true + }, + { + "type": "string", + "description": "Full name", + "name": "name", + "in": "formData" + }, + { + "type": "string", + "description": "Email address", + "name": "email", + "in": "formData" + }, + { + "type": "string", + "description": "Project (required for managers)", + "name": "project", + "in": "formData" + } + ], + "responses": { + "200": { + "description": "Success message", + "schema": { + "type": "string" + } + }, + "400": { + "description": "Bad Request", + "schema": { + "$ref": "#/definitions/api.ErrorResponse" + } + }, + "403": { + "description": "Forbidden", + "schema": { + "$ref": "#/definitions/api.ErrorResponse" + } + }, + "422": { + "description": "Unprocessable Entity", + "schema": { + "$ref": "#/definitions/api.ErrorResponse" + } + } + }, + "security": [ + { + "ApiKeyAuth": [] + } + ] + }, + "delete": { + "description": "Deletes a user from the system", + "produces": [ + "text/plain" + ], + "tags": [ + "User" + ], + "summary": "Delete a user", + "parameters": [ + { + "type": "string", + "description": "Username to delete", + "name": "username", + "in": "formData", + "required": true + } + ], + "responses": { + "200": { + "description": "Success", + "schema": { + "type": "string" + } + }, + "403": { + "description": "Forbidden", + "schema": { + "$ref": "#/definitions/api.ErrorResponse" + } + }, + "422": { + "description": "Unprocessable Entity", + "schema": { + 
"$ref": "#/definitions/api.ErrorResponse" + } + } + }, + "security": [ + { + "ApiKeyAuth": [] + } + ] + } + }, + "/configuration/": { + "post": { + "description": "Updates a user's configuration key-value pair.", + "consumes": [ + "multipart/form-data" + ], + "produces": [ + "text/plain" + ], + "tags": [ + "Frontend" + ], + "summary": "Update user configuration", + "parameters": [ + { + "type": "string", + "description": "Configuration key", + "name": "key", + "in": "formData", + "required": true + }, + { + "type": "string", + "description": "Configuration value", + "name": "value", + "in": "formData", + "required": true + } + ], + "responses": { + "200": { + "description": "success", + "schema": { + "type": "string" + } + }, + "500": { + "description": "Internal Server Error", + "schema": { + "$ref": "#/definitions/api.ErrorResponse" + } + } + }, + "security": [ + { + "ApiKeyAuth": [] + } + ] + } + }, + "/debug/": { + "post": { + "description": "This endpoint allows the users to print the content of", + "produces": [ + "application/json" + ], + "tags": [ + "debug" + ], + "summary": "Debug endpoint", + "parameters": [ + { + "type": "string", + "description": "Selector", + "name": "selector", + "in": "query" + } + ], + "responses": { + "200": { + "description": "Debug dump", + "schema": { + "type": "string" + } + }, + "400": { + "description": "Bad Request", + "schema": { + "$ref": "#/definitions/api.ErrorResponse" + } + }, + "401": { + "description": "Unauthorized", + "schema": { + "$ref": "#/definitions/api.ErrorResponse" + } + }, + "403": { + "description": "Forbidden", + "schema": { + "$ref": "#/definitions/api.ErrorResponse" + } + }, + "500": { + "description": "Internal Server Error", + "schema": { + "$ref": "#/definitions/api.ErrorResponse" + } + } + }, + "security": [ + { + "ApiKeyAuth": [] + } + ] + } + }, + "/free/": { + "post": { + "description": "This endpoint allows the users to free the Buffers from the", + "produces": [ + "application/json" + ], + 
"tags": [ + "free" + ], + "parameters": [ + { + "type": "string", + "description": "up to timestamp", + "name": "to", + "in": "query" + } + ], + "responses": { + "200": { + "description": "ok", + "schema": { + "type": "string" + } + }, + "400": { + "description": "Bad Request", + "schema": { + "$ref": "#/definitions/api.ErrorResponse" + } + }, + "401": { + "description": "Unauthorized", + "schema": { + "$ref": "#/definitions/api.ErrorResponse" + } + }, + "403": { + "description": "Forbidden", + "schema": { + "$ref": "#/definitions/api.ErrorResponse" + } + }, + "500": { + "description": "Internal Server Error", + "schema": { + "$ref": "#/definitions/api.ErrorResponse" + } + } + }, + "security": [ + { + "ApiKeyAuth": [] + } + ] + } + }, + "/healthcheck/": { + "get": { + "description": "This endpoint allows the users to check if a node is healthy", + "produces": [ + "application/json" + ], + "tags": [ + "healthcheck" + ], + "summary": "HealthCheck endpoint", + "parameters": [ + { + "type": "string", + "description": "Selector", + "name": "selector", + "in": "query" + } + ], + "responses": { + "200": { + "description": "Debug dump", + "schema": { + "type": "string" + } + }, + "400": { + "description": "Bad Request", + "schema": { + "$ref": "#/definitions/api.ErrorResponse" + } + }, + "401": { + "description": "Unauthorized", + "schema": { + "$ref": "#/definitions/api.ErrorResponse" + } + }, + "403": { + "description": "Forbidden", + "schema": { + "$ref": "#/definitions/api.ErrorResponse" + } + }, + "500": { + "description": "Internal Server Error", + "schema": { + "$ref": "#/definitions/api.ErrorResponse" + } + } + }, + "security": [ + { + "ApiKeyAuth": [] + } + ] + } + }, + "/jobs/tag_job/{id}": { + "delete": { "description": "Removes tag(s) from a job specified by DB ID. Name and Type of Tag(s) must match.\nTag Scope is required for matching, options: \"global\", \"admin\". 
Private tags can not be deleted via API.\nIf tagged job is already finished: Tag will be removed from respective archive files.", "consumes": [ "application/json" @@ -981,7 +1460,7 @@ "schema": { "type": "array", "items": { - "$ref": "#/definitions/api.ApiTag" + "$ref": "#/definitions/api.APITag" } } } @@ -1017,16 +1496,276 @@ "$ref": "#/definitions/api.ErrorResponse" } } - } - } - }, - "/tags/": { - "delete": { + }, "security": [ { "ApiKeyAuth": [] } + ] + } + }, + "/jwt/": { + "get": { + "description": "Generates a JWT token for a user. Admins can generate tokens for any user, regular users only for themselves.", + "consumes": [ + "multipart/form-data" ], + "produces": [ + "text/plain" + ], + "tags": [ + "Frontend" + ], + "summary": "Generate JWT token", + "parameters": [ + { + "type": "string", + "description": "Username to generate JWT for", + "name": "username", + "in": "formData", + "required": true + } + ], + "responses": { + "200": { + "description": "JWT token", + "schema": { + "type": "string" + } + }, + "403": { + "description": "Forbidden", + "schema": { + "$ref": "#/definitions/api.ErrorResponse" + } + }, + "404": { + "description": "User Not Found", + "schema": { + "$ref": "#/definitions/api.ErrorResponse" + } + }, + "500": { + "description": "Internal Server Error", + "schema": { + "$ref": "#/definitions/api.ErrorResponse" + } + } + }, + "security": [ + { + "ApiKeyAuth": [] + } + ] + } + }, + "/machine_state/{cluster}/{host}": { + "get": { + "description": "Retrieves stored machine state data for a specific cluster node. 
Validates cluster and host names to prevent path traversal.", + "produces": [ + "application/json" + ], + "tags": [ + "Machine State" + ], + "summary": "Retrieve machine state", + "parameters": [ + { + "type": "string", + "description": "Cluster name", + "name": "cluster", + "in": "path", + "required": true + }, + { + "type": "string", + "description": "Host name", + "name": "host", + "in": "path", + "required": true + } + ], + "responses": { + "200": { + "description": "Machine state JSON data", + "schema": { + "type": "object" + } + }, + "400": { + "description": "Bad Request", + "schema": { + "$ref": "#/definitions/api.ErrorResponse" + } + }, + "404": { + "description": "Machine state not enabled or file not found", + "schema": { + "$ref": "#/definitions/api.ErrorResponse" + } + } + }, + "security": [ + { + "ApiKeyAuth": [] + } + ] + }, + "put": { + "description": "Stores machine state data for a specific cluster node. Validates cluster and host names to prevent path traversal.", + "consumes": [ + "application/json" + ], + "produces": [ + "text/plain" + ], + "tags": [ + "Machine State" + ], + "summary": "Store machine state", + "parameters": [ + { + "type": "string", + "description": "Cluster name", + "name": "cluster", + "in": "path", + "required": true + }, + { + "type": "string", + "description": "Host name", + "name": "host", + "in": "path", + "required": true + } + ], + "responses": { + "201": { + "description": "Created" + }, + "400": { + "description": "Bad Request", + "schema": { + "$ref": "#/definitions/api.ErrorResponse" + } + }, + "404": { + "description": "Machine state not enabled", + "schema": { + "$ref": "#/definitions/api.ErrorResponse" + } + }, + "500": { + "description": "Internal Server Error", + "schema": { + "$ref": "#/definitions/api.ErrorResponse" + } + } + }, + "security": [ + { + "ApiKeyAuth": [] + } + ] + } + }, + "/notice/": { + "post": { + "description": "Updates the notice.txt file content. Only admins are allowed. 
Content is limited to 10000 characters.", + "consumes": [ + "multipart/form-data" + ], + "produces": [ + "text/plain" + ], + "tags": [ + "Config" + ], + "summary": "Update system notice", + "parameters": [ + { + "type": "string", + "description": "New notice content (max 10000 characters)", + "name": "new-content", + "in": "formData", + "required": true + } + ], + "responses": { + "200": { + "description": "Update Notice Content Success", + "schema": { + "type": "string" + } + }, + "400": { + "description": "Bad Request", + "schema": { + "$ref": "#/definitions/api.ErrorResponse" + } + }, + "403": { + "description": "Forbidden", + "schema": { + "$ref": "#/definitions/api.ErrorResponse" + } + }, + "500": { + "description": "Internal Server Error", + "schema": { + "$ref": "#/definitions/api.ErrorResponse" + } + } + }, + "security": [ + { + "ApiKeyAuth": [] + } + ] + } + }, + "/roles/": { + "get": { + "description": "Returns a list of valid user roles. Only admins are allowed.", + "produces": [ + "application/json" + ], + "tags": [ + "Config" + ], + "summary": "Get available roles", + "responses": { + "200": { + "description": "List of role names", + "schema": { + "type": "array", + "items": { + "type": "string" + } + } + }, + "403": { + "description": "Forbidden", + "schema": { + "$ref": "#/definitions/api.ErrorResponse" + } + }, + "500": { + "description": "Internal Server Error", + "schema": { + "$ref": "#/definitions/api.ErrorResponse" + } + } + }, + "security": [ + { + "ApiKeyAuth": [] + } + ] + } + }, + "/tags/": { + "delete": { "description": "Removes tags by type and name. Name and Type of Tag(s) must match.\nTag Scope is required for matching, options: \"global\", \"admin\". 
Private tags can not be deleted via API.\nTag wills be removed from respective archive files.", "consumes": [ "application/json" @@ -1047,7 +1786,7 @@ "schema": { "type": "array", "items": { - "$ref": "#/definitions/api.ApiTag" + "$ref": "#/definitions/api.APITag" } } } @@ -1083,12 +1822,72 @@ "$ref": "#/definitions/api.ErrorResponse" } } - } + }, + "security": [ + { + "ApiKeyAuth": [] + } + ] + } + }, + "/write/": { + "post": { + "consumes": [ + "text/plain" + ], + "produces": [ + "application/json" + ], + "parameters": [ + { + "type": "string", + "description": "If the lines in the body do not have a cluster tag, use this value instead.", + "name": "cluster", + "in": "query" + } + ], + "responses": { + "200": { + "description": "ok", + "schema": { + "type": "string" + } + }, + "400": { + "description": "Bad Request", + "schema": { + "$ref": "#/definitions/api.ErrorResponse" + } + }, + "401": { + "description": "Unauthorized", + "schema": { + "$ref": "#/definitions/api.ErrorResponse" + } + }, + "403": { + "description": "Forbidden", + "schema": { + "$ref": "#/definitions/api.ErrorResponse" + } + }, + "500": { + "description": "Internal Server Error", + "schema": { + "$ref": "#/definitions/api.ErrorResponse" + } + } + }, + "security": [ + { + "ApiKeyAuth": [] + } + ] } } }, "definitions": { - "api.ApiReturnedUser": { + "api.APIReturnedUser": { "type": "object", "properties": { "email": { @@ -1114,7 +1913,7 @@ } } }, - "api.ApiTag": { + "api.APITag": { "type": "object", "properties": { "name": { @@ -1134,7 +1933,7 @@ } } }, - "api.DefaultApiResponse": { + "api.DefaultAPIResponse": { "type": "object", "properties": { "msg": { @@ -1142,7 +1941,7 @@ } } }, - "api.DeleteJobApiRequest": { + "api.DeleteJobAPIRequest": { "type": "object", "required": [ "jobId" @@ -1191,7 +1990,7 @@ } } }, - "api.GetClustersApiResponse": { + "api.GetClustersAPIResponse": { "type": "object", "properties": { "clusters": { @@ -1203,7 +2002,7 @@ } } }, - "api.GetJobApiResponse": { + 
"api.GetJobAPIResponse": { "type": "object", "properties": { "data": { @@ -1217,7 +2016,7 @@ } } }, - "api.GetJobsApiResponse": { + "api.GetJobsAPIResponse": { "type": "object", "properties": { "items": { @@ -1251,39 +2050,7 @@ } } }, - "api.Node": { - "type": "object", - "properties": { - "cpusAllocated": { - "type": "integer" - }, - "cpusTotal": { - "type": "integer" - }, - "gpusAllocated": { - "type": "integer" - }, - "gpusTotal": { - "type": "integer" - }, - "hostname": { - "type": "string" - }, - "memoryAllocated": { - "type": "integer" - }, - "memoryTotal": { - "type": "integer" - }, - "states": { - "type": "array", - "items": { - "type": "string" - } - } - } - }, - "api.StopJobApiRequest": { + "api.StopJobAPIRequest": { "type": "object", "required": [ "jobState", @@ -1326,7 +2093,7 @@ "nodes": { "type": "array", "items": { - "$ref": "#/definitions/api.Node" + "$ref": "#/definitions/schema.NodePayload" } } } @@ -1335,12 +2102,15 @@ "type": "object", "properties": { "id": { + "description": "Unique identifier for the accelerator (e.g., \"0\", \"1\", \"GPU-0\")", "type": "string" }, "model": { + "description": "Specific model name (e.g., \"A100\", \"MI100\")", "type": "string" }, "type": { + "description": "Type of accelerator (e.g., \"Nvidia GPU\", \"AMD GPU\")", "type": "string" } } @@ -1349,15 +2119,18 @@ "type": "object", "properties": { "metricConfig": { + "description": "Cluster-wide metric configurations", "type": "array", "items": { "$ref": "#/definitions/schema.MetricConfig" } }, "name": { + "description": "Unique cluster name (e.g., \"fritz\", \"alex\")", "type": "string" }, "subClusters": { + "description": "Homogeneous partitions within the cluster", "type": "array", "items": { "$ref": "#/definitions/schema.SubCluster" @@ -1366,6 +2139,7 @@ } }, "schema.Job": { + "description": "Information of a HPC job.", "type": "object", "properties": { "arrayJobId": { @@ -1394,6 +2168,13 @@ "format": "float64" } }, + "exclusive": { + "description": "for 
backwards compatibility", + "type": "integer", + "maximum": 2, + "minimum": 0, + "example": 1 + }, "footprint": { "type": "object", "additionalProperties": { @@ -1416,7 +2197,7 @@ "deadline", "failed", "node_fail", - "out_of_memory", + "out-of-memory", "pending", "preempted", "running", @@ -1528,9 +2309,11 @@ "type": "object", "properties": { "id": { + "description": "Internal database ID", "type": "integer" }, "jobId": { + "description": "The job's external job ID", "type": "integer" } } @@ -1539,9 +2322,11 @@ "type": "object", "properties": { "count": { + "description": "Total count of available items", "type": "integer" }, "items": { + "description": "List of job links", "type": "array", "items": { "$ref": "#/definitions/schema.JobLink" @@ -1553,19 +2338,31 @@ "type": "object", "properties": { "series": { + "description": "Individual time series data", "type": "array", "items": { "$ref": "#/definitions/schema.Series" } }, "statisticsSeries": { - "$ref": "#/definitions/schema.StatsSeries" + "description": "Aggregated statistics over time", + "allOf": [ + { + "$ref": "#/definitions/schema.StatsSeries" + } + ] }, "timestep": { + "description": "Sampling interval in seconds", "type": "integer" }, "unit": { - "$ref": "#/definitions/schema.Unit" + "description": "Unit of measurement", + "allOf": [ + { + "$ref": "#/definitions/schema.Unit" + } + ] } } }, @@ -1631,46 +2428,71 @@ "type": "object", "properties": { "aggregation": { + "description": "Aggregation function (avg, sum, min, max)", "type": "string" }, "alert": { + "description": "Alert threshold (requires attention)", "type": "number" }, "caution": { + "description": "Caution threshold (concerning but not critical)", "type": "number" }, "energy": { + "description": "Energy measurement method", "type": "string" }, "footprint": { + "description": "Footprint category", "type": "string" }, "lowerIsBetter": { + "description": "Whether lower values are better", "type": "boolean" }, "name": { + "description": "Metric 
name (e.g., \"cpu_load\", \"mem_used\")", "type": "string" }, "normal": { + "description": "Normal/typical value (good performance)", "type": "number" }, "peak": { + "description": "Peak/maximum expected value (best performance)", "type": "number" }, + "restrict": { + "description": "Restrict visibility to non user roles", + "type": "boolean" + }, "scope": { - "$ref": "#/definitions/schema.MetricScope" + "description": "Metric scope (node, socket, core, etc.)", + "allOf": [ + { + "$ref": "#/definitions/schema.MetricScope" + } + ] }, "subClusters": { + "description": "Subcluster-specific overrides", "type": "array", "items": { "$ref": "#/definitions/schema.SubClusterConfig" } }, "timestep": { + "description": "Measurement interval in seconds", "type": "integer" }, "unit": { - "$ref": "#/definitions/schema.Unit" + "description": "Unit of measurement", + "allOf": [ + { + "$ref": "#/definitions/schema.Unit" + } + ] } } }, @@ -1699,12 +2521,15 @@ "type": "object", "properties": { "avg": { + "description": "Average/mean value", "type": "number" }, "max": { + "description": "Maximum value", "type": "number" }, "min": { + "description": "Minimum value", "type": "number" } } @@ -1713,30 +2538,72 @@ "type": "object", "properties": { "unit": { - "$ref": "#/definitions/schema.Unit" + "description": "Unit of measurement (e.g., FLOP/s, GB/s)", + "allOf": [ + { + "$ref": "#/definitions/schema.Unit" + } + ] }, "value": { + "description": "Numeric value of the measurement", "type": "number" } } }, + "schema.NodePayload": { + "type": "object", + "properties": { + "cpusAllocated": { + "description": "Number of allocated CPUs", + "type": "integer" + }, + "gpusAllocated": { + "description": "Number of allocated GPUs", + "type": "integer" + }, + "hostname": { + "description": "Node hostname", + "type": "string" + }, + "jobsRunning": { + "description": "Number of running jobs", + "type": "integer" + }, + "memoryAllocated": { + "description": "Allocated memory in MB", + "type": "integer" 
+ }, + "states": { + "description": "State strings (flexible format)", + "type": "array", + "items": { + "type": "string" + } + } + } + }, "schema.Resource": { "description": "A resource used by a job", "type": "object", "properties": { "accelerators": { + "description": "Allocated accelerator IDs (e.g., GPU IDs)", "type": "array", "items": { "type": "string" } }, "configuration": { + "description": "Optional configuration identifier", "type": "string" }, "hostname": { + "description": "Node hostname", "type": "string" }, "hwthreads": { + "description": "Allocated hardware thread IDs", "type": "array", "items": { "type": "integer" @@ -1748,19 +2615,27 @@ "type": "object", "properties": { "data": { + "description": "Time series measurements", "type": "array", "items": { "type": "number" } }, "hostname": { + "description": "Source hostname", "type": "string" }, "id": { + "description": "Optional ID (e.g., core ID, GPU ID)", "type": "string" }, "statistics": { - "$ref": "#/definitions/schema.MetricStatistics" + "description": "Statistical summary (min/avg/max)", + "allOf": [ + { + "$ref": "#/definitions/schema.MetricStatistics" + } + ] } } }, @@ -1768,30 +2643,35 @@ "type": "object", "properties": { "max": { + "description": "Maximum values over time", "type": "array", "items": { "type": "number" } }, "mean": { + "description": "Mean values over time", "type": "array", "items": { "type": "number" } }, "median": { + "description": "Median values over time", "type": "array", "items": { "type": "number" } }, "min": { + "description": "Minimum values over time", "type": "array", "items": { "type": "number" } }, "percentiles": { + "description": "Percentile values over time (e.g., 10th, 50th, 90th)", "type": "object", "additionalProperties": { "type": "array", @@ -1807,52 +2687,81 @@ "type": "object", "properties": { "coresPerSocket": { + "description": "Number of cores per CPU socket", "type": "integer" }, "energyFootprint": { + "description": "Energy-related footprint 
metrics", "type": "array", "items": { "type": "string" } }, "flopRateScalar": { - "$ref": "#/definitions/schema.MetricValue" + "description": "Theoretical scalar FLOP rate per node", + "allOf": [ + { + "$ref": "#/definitions/schema.MetricValue" + } + ] }, "flopRateSimd": { - "$ref": "#/definitions/schema.MetricValue" + "description": "Theoretical SIMD FLOP rate per node", + "allOf": [ + { + "$ref": "#/definitions/schema.MetricValue" + } + ] }, "footprint": { + "description": "Default footprint metrics for jobs", "type": "array", "items": { "type": "string" } }, "memoryBandwidth": { - "$ref": "#/definitions/schema.MetricValue" + "description": "Theoretical memory bandwidth per node", + "allOf": [ + { + "$ref": "#/definitions/schema.MetricValue" + } + ] }, "metricConfig": { + "description": "Subcluster-specific metric configurations", "type": "array", "items": { "$ref": "#/definitions/schema.MetricConfig" } }, "name": { + "description": "Name of the subcluster (e.g., \"main\", \"gpu\", \"bigmem\")", "type": "string" }, "nodes": { + "description": "Node list in condensed format (e.g., \"node[001-100]\")", "type": "string" }, "processorType": { + "description": "CPU model (e.g., \"Intel Xeon Gold 6148\")", "type": "string" }, "socketsPerNode": { + "description": "Number of CPU sockets per node", "type": "integer" }, "threadsPerCore": { + "description": "Number of hardware threads per core (SMT level)", "type": "integer" }, "topology": { - "$ref": "#/definitions/schema.Topology" + "description": "Hardware topology of nodes in this subcluster", + "allOf": [ + { + "$ref": "#/definitions/schema.Topology" + } + ] } } }, @@ -1860,34 +2769,52 @@ "type": "object", "properties": { "alert": { + "description": "Alert threshold (requires attention)", "type": "number" }, "caution": { + "description": "Caution threshold (concerning but not critical)", "type": "number" }, "energy": { + "description": "Energy measurement configuration", "type": "string" }, "footprint": { + 
"description": "Footprint category for this metric", "type": "string" }, "lowerIsBetter": { + "description": "Whether lower values indicate better performance", "type": "boolean" }, "name": { + "description": "Metric name (e.g., \"cpu_load\", \"mem_used\")", "type": "string" }, "normal": { + "description": "Normal/typical value (good performance)", "type": "number" }, "peak": { + "description": "Peak/maximum expected value (best performance)", "type": "number" }, "remove": { + "description": "Whether to exclude this metric for this subcluster", + "type": "boolean" + }, + "restrict": { + "description": "Restrict visibility to non user roles", "type": "boolean" }, "unit": { - "$ref": "#/definitions/schema.Unit" + "description": "Unit of measurement", + "allOf": [ + { + "$ref": "#/definitions/schema.Unit" + } + ] } } }, @@ -1916,12 +2843,14 @@ "type": "object", "properties": { "accelerators": { + "description": "Attached accelerators (GPUs, etc.)", "type": "array", "items": { "$ref": "#/definitions/schema.Accelerator" } }, "core": { + "description": "Hardware threads grouped by core", "type": "array", "items": { "type": "array", @@ -1931,6 +2860,7 @@ } }, "die": { + "description": "Hardware threads grouped by die (optional)", "type": "array", "items": { "type": "array", @@ -1940,6 +2870,7 @@ } }, "memoryDomain": { + "description": "Hardware threads grouped by NUMA domain", "type": "array", "items": { "type": "array", @@ -1949,12 +2880,14 @@ } }, "node": { + "description": "All hardware thread IDs on this node", "type": "array", "items": { "type": "integer" } }, "socket": { + "description": "Hardware threads grouped by socket", "type": "array", "items": { "type": "array", @@ -1969,9 +2902,11 @@ "type": "object", "properties": { "base": { + "description": "Base unit (e.g., \"B/s\", \"F/s\", \"W\")", "type": "string" }, "prefix": { + "description": "SI prefix (e.g., \"G\", \"M\", \"K\", \"T\")", "type": "string" } } diff --git a/api/swagger.yaml b/api/swagger.yaml index 
119e9529..def939dd 100644 --- a/api/swagger.yaml +++ b/api/swagger.yaml @@ -1,5 +1,5 @@ definitions: - api.ApiReturnedUser: + api.APIReturnedUser: properties: email: type: string @@ -16,7 +16,7 @@ definitions: username: type: string type: object - api.ApiTag: + api.APITag: properties: name: description: Tag Name @@ -31,12 +31,12 @@ definitions: example: Debug type: string type: object - api.DefaultApiResponse: + api.DefaultAPIResponse: properties: msg: type: string type: object - api.DeleteJobApiRequest: + api.DeleteJobAPIRequest: properties: cluster: description: Cluster of job @@ -71,7 +71,7 @@ definitions: description: Statustext of Errorcode type: string type: object - api.GetClustersApiResponse: + api.GetClustersAPIResponse: properties: clusters: description: Array of clusters @@ -79,7 +79,7 @@ definitions: $ref: '#/definitions/schema.Cluster' type: array type: object - api.GetJobApiResponse: + api.GetJobAPIResponse: properties: data: items: @@ -88,7 +88,7 @@ definitions: meta: $ref: '#/definitions/schema.Job' type: object - api.GetJobsApiResponse: + api.GetJobsAPIResponse: properties: items: description: Number of jobs returned @@ -102,6 +102,27 @@ definitions: description: Page id returned type: integer type: object + api.JobMetaRequest: + properties: + cluster: + description: Cluster of job + example: fritz + type: string + jobId: + description: Cluster Job ID of job + example: 123000 + type: integer + payload: + allOf: + - $ref: '#/definitions/api.EditMetaRequest' + description: Content to Add to Job Meta_Data + startTime: + description: Start Time of job as epoch + example: 1649723812 + type: integer + required: + - jobId + type: object api.JobMetricWithName: properties: metric: @@ -111,28 +132,7 @@ definitions: scope: $ref: '#/definitions/schema.MetricScope' type: object - api.Node: - properties: - cpusAllocated: - type: integer - cpusTotal: - type: integer - gpusAllocated: - type: integer - gpusTotal: - type: integer - hostname: - type: string - 
memoryAllocated: - type: integer - memoryTotal: - type: integer - states: - items: - type: string - type: array - type: object - api.StopJobApiRequest: + api.StopJobAPIRequest: properties: cluster: example: fritz @@ -161,32 +161,39 @@ definitions: type: string nodes: items: - $ref: '#/definitions/api.Node' + $ref: '#/definitions/schema.NodePayload' type: array type: object schema.Accelerator: properties: id: + description: Unique identifier for the accelerator (e.g., "0", "1", "GPU-0") type: string model: + description: Specific model name (e.g., "A100", "MI100") type: string type: + description: Type of accelerator (e.g., "Nvidia GPU", "AMD GPU") type: string type: object schema.Cluster: properties: metricConfig: + description: Cluster-wide metric configurations items: $ref: '#/definitions/schema.MetricConfig' type: array name: + description: Unique cluster name (e.g., "fritz", "alex") type: string subClusters: + description: Homogeneous partitions within the cluster items: $ref: '#/definitions/schema.SubCluster' type: array type: object schema.Job: + description: Information of a HPC job. 
properties: arrayJobId: example: 123000 @@ -207,6 +214,12 @@ definitions: format: float64 type: number type: object + exclusive: + description: for backwards compatibility + example: 1 + maximum: 2 + minimum: 0 + type: integer footprint: additionalProperties: format: float64 @@ -227,7 +240,7 @@ definitions: - deadline - failed - node_fail - - out_of_memory + - out-of-memory - pending - preempted - running @@ -307,15 +320,19 @@ definitions: schema.JobLink: properties: id: + description: Internal database ID type: integer jobId: + description: The job's external job ID type: integer type: object schema.JobLinkResultList: properties: count: + description: Total count of available items type: integer items: + description: List of job links items: $ref: '#/definitions/schema.JobLink' type: array @@ -323,15 +340,21 @@ definitions: schema.JobMetric: properties: series: + description: Individual time series data items: $ref: '#/definitions/schema.Series' type: array statisticsSeries: - $ref: '#/definitions/schema.StatsSeries' + allOf: + - $ref: '#/definitions/schema.StatsSeries' + description: Aggregated statistics over time timestep: + description: Sampling interval in seconds type: integer unit: - $ref: '#/definitions/schema.Unit' + allOf: + - $ref: '#/definitions/schema.Unit' + description: Unit of measurement type: object schema.JobState: enum: @@ -385,33 +408,51 @@ definitions: schema.MetricConfig: properties: aggregation: + description: Aggregation function (avg, sum, min, max) type: string alert: + description: Alert threshold (requires attention) type: number caution: + description: Caution threshold (concerning but not critical) type: number energy: + description: Energy measurement method type: string footprint: + description: Footprint category type: string lowerIsBetter: + description: Whether lower values are better type: boolean name: + description: Metric name (e.g., "cpu_load", "mem_used") type: string normal: + description: Normal/typical value (good 
performance) type: number peak: + description: Peak/maximum expected value (best performance) type: number + restrict: + description: Restrict visibility to non user roles + type: boolean scope: - $ref: '#/definitions/schema.MetricScope' + allOf: + - $ref: '#/definitions/schema.MetricScope' + description: Metric scope (node, socket, core, etc.) subClusters: + description: Subcluster-specific overrides items: $ref: '#/definitions/schema.SubClusterConfig' type: array timestep: + description: Measurement interval in seconds type: integer unit: - $ref: '#/definitions/schema.Unit' + allOf: + - $ref: '#/definitions/schema.Unit' + description: Unit of measurement type: object schema.MetricScope: enum: @@ -434,31 +475,64 @@ definitions: schema.MetricStatistics: properties: avg: + description: Average/mean value type: number max: + description: Maximum value type: number min: + description: Minimum value type: number type: object schema.MetricValue: properties: unit: - $ref: '#/definitions/schema.Unit' + allOf: + - $ref: '#/definitions/schema.Unit' + description: Unit of measurement (e.g., FLOP/s, GB/s) value: + description: Numeric value of the measurement type: number type: object + schema.NodePayload: + properties: + cpusAllocated: + description: Number of allocated CPUs + type: integer + gpusAllocated: + description: Number of allocated GPUs + type: integer + hostname: + description: Node hostname + type: string + jobsRunning: + description: Number of running jobs + type: integer + memoryAllocated: + description: Allocated memory in MB + type: integer + states: + description: State strings (flexible format) + items: + type: string + type: array + type: object schema.Resource: description: A resource used by a job properties: accelerators: + description: Allocated accelerator IDs (e.g., GPU IDs) items: type: string type: array configuration: + description: Optional configuration identifier type: string hostname: + description: Node hostname type: string hwthreads: + 
description: Allocated hardware thread IDs items: type: integer type: array @@ -466,31 +540,40 @@ definitions: schema.Series: properties: data: + description: Time series measurements items: type: number type: array hostname: + description: Source hostname type: string id: + description: Optional ID (e.g., core ID, GPU ID) type: string statistics: - $ref: '#/definitions/schema.MetricStatistics' + allOf: + - $ref: '#/definitions/schema.MetricStatistics' + description: Statistical summary (min/avg/max) type: object schema.StatsSeries: properties: max: + description: Maximum values over time items: type: number type: array mean: + description: Mean values over time items: type: number type: array median: + description: Median values over time items: type: number type: array min: + description: Minimum values over time items: type: number type: array @@ -500,65 +583,97 @@ definitions: format: float64 type: number type: array + description: Percentile values over time (e.g., 10th, 50th, 90th) type: object type: object schema.SubCluster: properties: coresPerSocket: + description: Number of cores per CPU socket type: integer energyFootprint: + description: Energy-related footprint metrics items: type: string type: array flopRateScalar: - $ref: '#/definitions/schema.MetricValue' + allOf: + - $ref: '#/definitions/schema.MetricValue' + description: Theoretical scalar FLOP rate per node flopRateSimd: - $ref: '#/definitions/schema.MetricValue' + allOf: + - $ref: '#/definitions/schema.MetricValue' + description: Theoretical SIMD FLOP rate per node footprint: + description: Default footprint metrics for jobs items: type: string type: array memoryBandwidth: - $ref: '#/definitions/schema.MetricValue' + allOf: + - $ref: '#/definitions/schema.MetricValue' + description: Theoretical memory bandwidth per node metricConfig: + description: Subcluster-specific metric configurations items: $ref: '#/definitions/schema.MetricConfig' type: array name: + description: Name of the subcluster 
(e.g., "main", "gpu", "bigmem") type: string nodes: + description: Node list in condensed format (e.g., "node[001-100]") type: string processorType: + description: CPU model (e.g., "Intel Xeon Gold 6148") type: string socketsPerNode: + description: Number of CPU sockets per node type: integer threadsPerCore: + description: Number of hardware threads per core (SMT level) type: integer topology: - $ref: '#/definitions/schema.Topology' + allOf: + - $ref: '#/definitions/schema.Topology' + description: Hardware topology of nodes in this subcluster type: object schema.SubClusterConfig: properties: alert: + description: Alert threshold (requires attention) type: number caution: + description: Caution threshold (concerning but not critical) type: number energy: + description: Energy measurement configuration type: string footprint: + description: Footprint category for this metric type: string lowerIsBetter: + description: Whether lower values indicate better performance type: boolean name: + description: Metric name (e.g., "cpu_load", "mem_used") type: string normal: + description: Normal/typical value (good performance) type: number peak: + description: Peak/maximum expected value (best performance) type: number remove: + description: Whether to exclude this metric for this subcluster + type: boolean + restrict: + description: Restrict visibility to non user roles type: boolean unit: - $ref: '#/definitions/schema.Unit' + allOf: + - $ref: '#/definitions/schema.Unit' + description: Unit of measurement type: object schema.Tag: description: Defines a tag using name and type. @@ -578,32 +693,38 @@ definitions: schema.Topology: properties: accelerators: + description: Attached accelerators (GPUs, etc.) 
items: $ref: '#/definitions/schema.Accelerator' type: array core: + description: Hardware threads grouped by core items: items: type: integer type: array type: array die: + description: Hardware threads grouped by die (optional) items: items: type: integer type: array type: array memoryDomain: + description: Hardware threads grouped by NUMA domain items: items: type: integer type: array type: array node: + description: All hardware thread IDs on this node items: type: integer type: array socket: + description: Hardware threads grouped by socket items: items: type: integer @@ -613,8 +734,10 @@ definitions: schema.Unit: properties: base: + description: Base unit (e.g., "B/s", "F/s", "W") type: string prefix: + description: SI prefix (e.g., "G", "M", "K", "T") type: string type: object host: localhost:8080 @@ -645,7 +768,7 @@ paths: "200": description: Array of clusters schema: - $ref: '#/definitions/api.GetClustersApiResponse' + $ref: '#/definitions/api.GetClustersAPIResponse' "400": description: Bad Request schema: @@ -710,7 +833,7 @@ paths: "200": description: Job array and page info schema: - $ref: '#/definitions/api.GetJobsApiResponse' + $ref: '#/definitions/api.GetJobsAPIResponse' "400": description: Bad Request schema: @@ -753,7 +876,7 @@ paths: "200": description: Job resource schema: - $ref: '#/definitions/api.GetJobApiResponse' + $ref: '#/definitions/api.GetJobAPIResponse' "400": description: Bad Request schema: @@ -810,7 +933,7 @@ paths: "200": description: Job resource schema: - $ref: '#/definitions/api.GetJobApiResponse' + $ref: '#/definitions/api.GetJobAPIResponse' "400": description: Bad Request schema: @@ -853,14 +976,14 @@ paths: name: request required: true schema: - $ref: '#/definitions/api.DeleteJobApiRequest' + $ref: '#/definitions/api.DeleteJobAPIRequest' produces: - application/json responses: "200": description: Success message schema: - $ref: '#/definitions/api.DefaultApiResponse' + $ref: '#/definitions/api.DefaultAPIResponse' "400": 
description: Bad Request schema: @@ -907,7 +1030,7 @@ paths: "200": description: Success message schema: - $ref: '#/definitions/api.DefaultApiResponse' + $ref: '#/definitions/api.DefaultAPIResponse' "400": description: Bad Request schema: @@ -948,13 +1071,17 @@ paths: name: ts required: true type: integer + - description: Omit jobs with tags from deletion + in: query + name: omit-tagged + type: boolean produces: - application/json responses: "200": description: Success message schema: - $ref: '#/definitions/api.DefaultApiResponse' + $ref: '#/definitions/api.DefaultAPIResponse' "400": description: Bad Request schema: @@ -985,8 +1112,50 @@ paths: summary: Remove a job from the sql database tags: - Job remove + /api/jobs/edit_meta/: + patch: + consumes: + - application/json + description: |- + Edit key value pairs in metadata json of job specified by jobID, StartTime and Cluster + If a key already exists its content will be overwritten + parameters: + - description: Specifies job and payload to add or update + in: body + name: request + required: true + schema: + $ref: '#/definitions/api.JobMetaRequest' + produces: + - application/json + responses: + "200": + description: Updated job resource + schema: + $ref: '#/definitions/schema.Job' + "400": + description: Bad Request + schema: + $ref: '#/definitions/api.ErrorResponse' + "401": + description: Unauthorized + schema: + $ref: '#/definitions/api.ErrorResponse' + "404": + description: Job does not exist + schema: + $ref: '#/definitions/api.ErrorResponse' + "500": + description: Internal Server Error + schema: + $ref: '#/definitions/api.ErrorResponse' + security: + - ApiKeyAuth: [] + summary: Edit meta-data json by request + tags: + - Job add and modify /api/jobs/edit_meta/{id}: - post: + patch: consumes: - application/json description: |- @@ -1052,7 +1221,7 @@ paths: "201": description: Job added successfully schema: - $ref: '#/definitions/api.DefaultApiResponse' + $ref: '#/definitions/api.DefaultAPIResponse' "400": 
description: Bad Request schema: @@ -1090,7 +1259,7 @@ paths: name: request required: true schema: - $ref: '#/definitions/api.StopJobApiRequest' + $ref: '#/definitions/api.StopJobAPIRequest' produces: - application/json responses: @@ -1147,7 +1316,7 @@ paths: required: true schema: items: - $ref: '#/definitions/api.ApiTag' + $ref: '#/definitions/api.APITag' type: array produces: - application/json @@ -1195,7 +1364,7 @@ paths: "200": description: Success message schema: - $ref: '#/definitions/api.DefaultApiResponse' + $ref: '#/definitions/api.DefaultAPIResponse' "400": description: Bad Request schema: @@ -1217,7 +1386,80 @@ paths: summary: Deliver updated Slurm node states tags: - Nodestates + /api/user/{id}: + post: + description: Allows admins to add/remove roles and projects for a user + parameters: + - description: Username + in: path + name: id + required: true + type: string + - description: Role to add + in: formData + name: add-role + type: string + - description: Role to remove + in: formData + name: remove-role + type: string + - description: Project to add + in: formData + name: add-project + type: string + - description: Project to remove + in: formData + name: remove-project + type: string + produces: + - text/plain + responses: + "200": + description: Success message + schema: + type: string + "403": + description: Forbidden + schema: + $ref: '#/definitions/api.ErrorResponse' + "422": + description: Unprocessable Entity + schema: + $ref: '#/definitions/api.ErrorResponse' + security: + - ApiKeyAuth: [] + summary: Update user roles and projects + tags: + - User /api/users/: + delete: + description: Deletes a user from the system + parameters: + - description: Username to delete + in: formData + name: username + required: true + type: string + produces: + - text/plain + responses: + "200": + description: Success + schema: + type: string + "403": + description: Forbidden + schema: + $ref: '#/definitions/api.ErrorResponse' + "422": + description: 
Unprocessable Entity + schema: + $ref: '#/definitions/api.ErrorResponse' + security: + - ApiKeyAuth: [] + summary: Delete a user + tags: + - User get: description: |- Returns a JSON-encoded list of users. @@ -1236,7 +1478,7 @@ paths: description: List of users returned successfully schema: items: - $ref: '#/definitions/api.ApiReturnedUser' + $ref: '#/definitions/api.APIReturnedUser' type: array "400": description: Bad Request @@ -1259,6 +1501,198 @@ paths: summary: Returns a list of users tags: - User + post: + description: Creates a new user with specified credentials and role + parameters: + - description: Username + in: formData + name: username + required: true + type: string + - description: Password (not required for API users) + in: formData + name: password + type: string + - description: User role + in: formData + name: role + required: true + type: string + - description: Full name + in: formData + name: name + type: string + - description: Email address + in: formData + name: email + type: string + - description: Project (required for managers) + in: formData + name: project + type: string + produces: + - text/plain + responses: + "200": + description: Success message + schema: + type: string + "400": + description: Bad Request + schema: + $ref: '#/definitions/api.ErrorResponse' + "403": + description: Forbidden + schema: + $ref: '#/definitions/api.ErrorResponse' + "422": + description: Unprocessable Entity + schema: + $ref: '#/definitions/api.ErrorResponse' + security: + - ApiKeyAuth: [] + summary: Create a new user + tags: + - User + /configuration/: + post: + consumes: + - multipart/form-data + description: Updates a user's configuration key-value pair. 
+ parameters: + - description: Configuration key + in: formData + name: key + required: true + type: string + - description: Configuration value + in: formData + name: value + required: true + type: string + produces: + - text/plain + responses: + "200": + description: success + schema: + type: string + "500": + description: Internal Server Error + schema: + $ref: '#/definitions/api.ErrorResponse' + security: + - ApiKeyAuth: [] + summary: Update user configuration + tags: + - Frontend + /debug/: + post: + description: This endpoint allows the users to print the content of + parameters: + - description: Selector + in: query + name: selector + type: string + produces: + - application/json + responses: + "200": + description: Debug dump + schema: + type: string + "400": + description: Bad Request + schema: + $ref: '#/definitions/api.ErrorResponse' + "401": + description: Unauthorized + schema: + $ref: '#/definitions/api.ErrorResponse' + "403": + description: Forbidden + schema: + $ref: '#/definitions/api.ErrorResponse' + "500": + description: Internal Server Error + schema: + $ref: '#/definitions/api.ErrorResponse' + security: + - ApiKeyAuth: [] + summary: Debug endpoint + tags: + - debug + /free/: + post: + description: This endpoint allows the users to free the Buffers from the + parameters: + - description: up to timestamp + in: query + name: to + type: string + produces: + - application/json + responses: + "200": + description: ok + schema: + type: string + "400": + description: Bad Request + schema: + $ref: '#/definitions/api.ErrorResponse' + "401": + description: Unauthorized + schema: + $ref: '#/definitions/api.ErrorResponse' + "403": + description: Forbidden + schema: + $ref: '#/definitions/api.ErrorResponse' + "500": + description: Internal Server Error + schema: + $ref: '#/definitions/api.ErrorResponse' + security: + - ApiKeyAuth: [] + tags: + - free + /healthcheck/: + get: + description: This endpoint allows the users to check if a node is healthy + 
parameters: + - description: Selector + in: query + name: selector + type: string + produces: + - application/json + responses: + "200": + description: Debug dump + schema: + type: string + "400": + description: Bad Request + schema: + $ref: '#/definitions/api.ErrorResponse' + "401": + description: Unauthorized + schema: + $ref: '#/definitions/api.ErrorResponse' + "403": + description: Forbidden + schema: + $ref: '#/definitions/api.ErrorResponse' + "500": + description: Internal Server Error + schema: + $ref: '#/definitions/api.ErrorResponse' + security: + - ApiKeyAuth: [] + summary: HealthCheck endpoint + tags: + - healthcheck /jobs/tag_job/{id}: delete: consumes: @@ -1279,7 +1713,7 @@ paths: required: true schema: items: - $ref: '#/definitions/api.ApiTag' + $ref: '#/definitions/api.APITag' type: array produces: - application/json @@ -1309,6 +1743,176 @@ paths: summary: Removes one or more tags from a job tags: - Job add and modify + /jwt/: + get: + consumes: + - multipart/form-data + description: Generates a JWT token for a user. Admins can generate tokens for + any user, regular users only for themselves. + parameters: + - description: Username to generate JWT for + in: formData + name: username + required: true + type: string + produces: + - text/plain + responses: + "200": + description: JWT token + schema: + type: string + "403": + description: Forbidden + schema: + $ref: '#/definitions/api.ErrorResponse' + "404": + description: User Not Found + schema: + $ref: '#/definitions/api.ErrorResponse' + "500": + description: Internal Server Error + schema: + $ref: '#/definitions/api.ErrorResponse' + security: + - ApiKeyAuth: [] + summary: Generate JWT token + tags: + - Frontend + /machine_state/{cluster}/{host}: + get: + description: Retrieves stored machine state data for a specific cluster node. + Validates cluster and host names to prevent path traversal. 
+ parameters: + - description: Cluster name + in: path + name: cluster + required: true + type: string + - description: Host name + in: path + name: host + required: true + type: string + produces: + - application/json + responses: + "200": + description: Machine state JSON data + schema: + type: object + "400": + description: Bad Request + schema: + $ref: '#/definitions/api.ErrorResponse' + "404": + description: Machine state not enabled or file not found + schema: + $ref: '#/definitions/api.ErrorResponse' + security: + - ApiKeyAuth: [] + summary: Retrieve machine state + tags: + - Machine State + put: + consumes: + - application/json + description: Stores machine state data for a specific cluster node. Validates + cluster and host names to prevent path traversal. + parameters: + - description: Cluster name + in: path + name: cluster + required: true + type: string + - description: Host name + in: path + name: host + required: true + type: string + produces: + - text/plain + responses: + "201": + description: Created + "400": + description: Bad Request + schema: + $ref: '#/definitions/api.ErrorResponse' + "404": + description: Machine state not enabled + schema: + $ref: '#/definitions/api.ErrorResponse' + "500": + description: Internal Server Error + schema: + $ref: '#/definitions/api.ErrorResponse' + security: + - ApiKeyAuth: [] + summary: Store machine state + tags: + - Machine State + /notice/: + post: + consumes: + - multipart/form-data + description: Updates the notice.txt file content. Only admins are allowed. Content + is limited to 10000 characters. 
+ parameters: + - description: New notice content (max 10000 characters) + in: formData + name: new-content + required: true + type: string + produces: + - text/plain + responses: + "200": + description: Update Notice Content Success + schema: + type: string + "400": + description: Bad Request + schema: + $ref: '#/definitions/api.ErrorResponse' + "403": + description: Forbidden + schema: + $ref: '#/definitions/api.ErrorResponse' + "500": + description: Internal Server Error + schema: + $ref: '#/definitions/api.ErrorResponse' + security: + - ApiKeyAuth: [] + summary: Update system notice + tags: + - Config + /roles/: + get: + description: Returns a list of valid user roles. Only admins are allowed. + produces: + - application/json + responses: + "200": + description: List of role names + schema: + items: + type: string + type: array + "403": + description: Forbidden + schema: + $ref: '#/definitions/api.ErrorResponse' + "500": + description: Internal Server Error + schema: + $ref: '#/definitions/api.ErrorResponse' + security: + - ApiKeyAuth: [] + summary: Get available roles + tags: + - Config /tags/: delete: consumes: @@ -1324,7 +1928,7 @@ paths: required: true schema: items: - $ref: '#/definitions/api.ApiTag' + $ref: '#/definitions/api.APITag' type: array produces: - text/plain @@ -1354,6 +1958,41 @@ paths: summary: Removes all tags and job-relations for type:name tuple tags: - Tag remove + /write/: + post: + consumes: + - text/plain + parameters: + - description: If the lines in the body do not have a cluster tag, use this + value instead. 
+ in: query + name: cluster + type: string + produces: + - application/json + responses: + "200": + description: ok + schema: + type: string + "400": + description: Bad Request + schema: + $ref: '#/definitions/api.ErrorResponse' + "401": + description: Unauthorized + schema: + $ref: '#/definitions/api.ErrorResponse' + "403": + description: Forbidden + schema: + $ref: '#/definitions/api.ErrorResponse' + "500": + description: Internal Server Error + schema: + $ref: '#/definitions/api.ErrorResponse' + security: + - ApiKeyAuth: [] securityDefinitions: ApiKeyAuth: in: header diff --git a/cmd/cc-backend/cli.go b/cmd/cc-backend/cli.go index af32b643..9ee56cb2 100644 --- a/cmd/cc-backend/cli.go +++ b/cmd/cc-backend/cli.go @@ -33,6 +33,6 @@ func cliInit() { flag.StringVar(&flagDelUser, "del-user", "", "Remove a existing user. Argument format: ") flag.StringVar(&flagGenJWT, "jwt", "", "Generate and print a JWT for the user specified by its `username`") flag.StringVar(&flagImportJob, "import-job", "", "Import a job. 
Argument format: `:,...`") - flag.StringVar(&flagLogLevel, "loglevel", "warn", "Sets the logging level: `[debug, info (default), warn, err, crit]`") + flag.StringVar(&flagLogLevel, "loglevel", "warn", "Sets the logging level: `[debug, info , warn (default), err, crit]`") flag.Parse() } diff --git a/cmd/cc-backend/init.go b/cmd/cc-backend/init.go index ee60b12c..09ad2084 100644 --- a/cmd/cc-backend/init.go +++ b/cmd/cc-backend/init.go @@ -12,11 +12,10 @@ import ( "encoding/json" "os" - "github.com/ClusterCockpit/cc-backend/internal/config" "github.com/ClusterCockpit/cc-backend/internal/repository" "github.com/ClusterCockpit/cc-backend/pkg/archive" - cclog "github.com/ClusterCockpit/cc-lib/ccLogger" - "github.com/ClusterCockpit/cc-lib/util" + cclog "github.com/ClusterCockpit/cc-lib/v2/ccLogger" + "github.com/ClusterCockpit/cc-lib/v2/util" ) const envString = ` @@ -35,20 +34,20 @@ const configString = ` "addr": "127.0.0.1:8080", "short-running-jobs-duration": 300, "resampling": { - "minimumPoints": 600, - "trigger": 180, + "minimum-points": 600, + "trigger": 300, "resolutions": [ 240, 60 ] }, - "apiAllowedIPs": [ + "api-allowed-ips": [ "*" ], "emission-constant": 317 }, "cron": { - "commit-job-worker": "2m", + "commit-job-worker": "1m", "duration-worker": "5m", "footprint-worker": "10m" }, @@ -60,31 +59,7 @@ const configString = ` "jwts": { "max-age": "2000h" } - }, - "clusters": [ - { - "name": "name", - "metricDataRepository": { - "kind": "cc-metric-store", - "url": "http://localhost:8082", - "token": "" - }, - "filterRanges": { - "numNodes": { - "from": 1, - "to": 64 - }, - "duration": { - "from": 0, - "to": 86400 - }, - "startTime": { - "from": "2023-01-01T00:00:00Z", - "to": null - } - } - } - ] + } } ` @@ -105,15 +80,15 @@ func initEnv() { cclog.Abortf("Could not create default ./var folder with permissions '0o777'. 
Application initialization failed, exited.\nError: %s\n", err.Error()) } - err := repository.MigrateDB("sqlite3", "./var/job.db") + err := repository.MigrateDB("./var/job.db") if err != nil { - cclog.Abortf("Could not initialize default sqlite3 database as './var/job.db'. Application initialization failed, exited.\nError: %s\n", err.Error()) + cclog.Abortf("Could not initialize default SQLite database as './var/job.db'. Application initialization failed, exited.\nError: %s\n", err.Error()) } if err := os.Mkdir("var/job-archive", 0o777); err != nil { cclog.Abortf("Could not create default ./var/job-archive folder with permissions '0o777'. Application initialization failed, exited.\nError: %s\n", err.Error()) } archiveCfg := "{\"kind\": \"file\",\"path\": \"./var/job-archive\"}" - if err := archive.Init(json.RawMessage(archiveCfg), config.Keys.DisableArchive); err != nil { + if err := archive.Init(json.RawMessage(archiveCfg)); err != nil { cclog.Abortf("Could not initialize job-archive, exited.\nError: %s\n", err.Error()) } } diff --git a/cmd/cc-backend/main.go b/cmd/cc-backend/main.go index d89109e3..57c8d65b 100644 --- a/cmd/cc-backend/main.go +++ b/cmd/cc-backend/main.go @@ -24,22 +24,22 @@ import ( "github.com/ClusterCockpit/cc-backend/internal/auth" "github.com/ClusterCockpit/cc-backend/internal/config" "github.com/ClusterCockpit/cc-backend/internal/importer" - "github.com/ClusterCockpit/cc-backend/internal/memorystore" - "github.com/ClusterCockpit/cc-backend/internal/metricdata" + "github.com/ClusterCockpit/cc-backend/internal/metricdispatch" "github.com/ClusterCockpit/cc-backend/internal/repository" "github.com/ClusterCockpit/cc-backend/internal/tagger" "github.com/ClusterCockpit/cc-backend/internal/taskmanager" "github.com/ClusterCockpit/cc-backend/pkg/archive" + "github.com/ClusterCockpit/cc-backend/pkg/metricstore" "github.com/ClusterCockpit/cc-backend/web" - ccconf "github.com/ClusterCockpit/cc-lib/ccConfig" - cclog 
"github.com/ClusterCockpit/cc-lib/ccLogger" - "github.com/ClusterCockpit/cc-lib/runtimeEnv" - "github.com/ClusterCockpit/cc-lib/schema" - "github.com/ClusterCockpit/cc-lib/util" + ccconf "github.com/ClusterCockpit/cc-lib/v2/ccConfig" + cclog "github.com/ClusterCockpit/cc-lib/v2/ccLogger" + "github.com/ClusterCockpit/cc-lib/v2/nats" + "github.com/ClusterCockpit/cc-lib/v2/runtime" + "github.com/ClusterCockpit/cc-lib/v2/schema" + "github.com/ClusterCockpit/cc-lib/v2/util" "github.com/google/gops/agent" "github.com/joho/godotenv" - _ "github.com/go-sql-driver/mysql" _ "github.com/mattn/go-sqlite3" ) @@ -103,46 +103,41 @@ func initConfiguration() error { return fmt.Errorf("main configuration must be present") } - clustercfg := ccconf.GetPackageConfig("clusters") - if clustercfg == nil { - return fmt.Errorf("cluster configuration must be present") - } - - config.Init(cfg, clustercfg) + config.Init(cfg) return nil } func initDatabase() error { - repository.Connect(config.Keys.DBDriver, config.Keys.DB) + repository.Connect(config.Keys.DB) return nil } func handleDatabaseCommands() error { if flagMigrateDB { - err := repository.MigrateDB(config.Keys.DBDriver, config.Keys.DB) + err := repository.MigrateDB(config.Keys.DB) if err != nil { return fmt.Errorf("migrating database to version %d: %w", repository.Version, err) } - cclog.Exitf("MigrateDB Success: Migrated '%s' database at location '%s' to version %d.\n", - config.Keys.DBDriver, config.Keys.DB, repository.Version) + cclog.Exitf("MigrateDB Success: Migrated SQLite database at '%s' to version %d.\n", + config.Keys.DB, repository.Version) } if flagRevertDB { - err := repository.RevertDB(config.Keys.DBDriver, config.Keys.DB) + err := repository.RevertDB(config.Keys.DB) if err != nil { return fmt.Errorf("reverting database to version %d: %w", repository.Version-1, err) } - cclog.Exitf("RevertDB Success: Reverted '%s' database at location '%s' to version %d.\n", - config.Keys.DBDriver, config.Keys.DB, repository.Version-1) + 
cclog.Exitf("RevertDB Success: Reverted SQLite database at '%s' to version %d.\n", + config.Keys.DB, repository.Version-1) } if flagForceDB { - err := repository.ForceDB(config.Keys.DBDriver, config.Keys.DB) + err := repository.ForceDB(config.Keys.DB) if err != nil { return fmt.Errorf("forcing database to version %d: %w", repository.Version, err) } - cclog.Exitf("ForceDB Success: Forced '%s' database at location '%s' to version %d.\n", - config.Keys.DBDriver, config.Keys.DB, repository.Version) + cclog.Exitf("ForceDB Success: Forced SQLite database at '%s' to version %d.\n", + config.Keys.DB, repository.Version) } return nil @@ -253,7 +248,7 @@ func generateJWT(authHandle *auth.Authentication, username string) error { return fmt.Errorf("getting user '%s': %w", username, err) } - if !user.HasRole(schema.RoleApi) { + if !user.HasRole(schema.RoleAPI) { cclog.Warnf("JWT: User '%s' does not have the role 'api'. REST API endpoints will return error!\n", user.Username) } @@ -262,25 +257,28 @@ func generateJWT(authHandle *auth.Authentication, username string) error { return fmt.Errorf("generating JWT for user '%s': %w", user.Username, err) } - cclog.Infof("JWT: Successfully generated JWT for user '%s': %s", user.Username, jwt) + cclog.Printf("JWT: Successfully generated JWT for user '%s': %s\n", user.Username, jwt) return nil } func initSubsystems() error { + // Initialize nats client + natsConfig := ccconf.GetPackageConfig("nats") + if err := nats.Init(natsConfig); err != nil { + cclog.Warnf("initializing (optional) nats client: %s", err.Error()) + } + nats.Connect() + // Initialize job archive archiveCfg := ccconf.GetPackageConfig("archive") if archiveCfg == nil { + cclog.Debug("Archive configuration not found, using default archive configuration") archiveCfg = json.RawMessage(defaultArchiveConfig) } - if err := archive.Init(archiveCfg, config.Keys.DisableArchive); err != nil { + if err := archive.Init(archiveCfg); err != nil { return fmt.Errorf("initializing archive: 
%w", err) } - // Initialize metricdata - if err := metricdata.Init(); err != nil { - return fmt.Errorf("initializing metricdata repository: %w", err) - } - // Handle database re-initialization if flagReinitDB { if err := importer.InitDB(); err != nil { @@ -304,6 +302,8 @@ func initSubsystems() error { // Apply tags if requested if flagApplyTags { + tagger.Init() + if err := tagger.RunTaggers(); err != nil { return fmt.Errorf("running job taggers: %w", err) } @@ -315,13 +315,38 @@ func initSubsystems() error { func runServer(ctx context.Context) error { var wg sync.WaitGroup - // Start metric store if enabled - if memorystore.InternalCCMSFlag { - mscfg := ccconf.GetPackageConfig("metric-store") - if mscfg == nil { - return fmt.Errorf("metric store configuration must be present") + // Initialize metric store if configuration is provided + haveMetricstore := false + mscfg := ccconf.GetPackageConfig("metric-store") + if mscfg != nil { + metrics := metricstore.BuildMetricList() + metricstore.Init(mscfg, metrics, &wg) + + // Inject repository as NodeProvider to break import cycle + ms := metricstore.GetMemoryStore() + jobRepo := repository.GetJobRepository() + ms.SetNodeProvider(jobRepo) + metricstore.MetricStoreHandle = &metricstore.InternalMetricStore{} + haveMetricstore = true + } else { + metricstore.MetricStoreHandle = nil + cclog.Debug("missing internal metricstore configuration") + } + + // Initialize external metric stores if configuration is provided + mscfg = ccconf.GetPackageConfig("metric-store-external") + if mscfg != nil { + err := metricdispatch.Init(mscfg) + + if err != nil { + cclog.Debugf("error while initializing external metricdispatch: %v", err) + } else { + haveMetricstore = true } - memorystore.Init(mscfg, &wg) + } + + if !haveMetricstore { + return fmt.Errorf("missing metricstore configuration") } // Start archiver and task manager @@ -344,13 +369,11 @@ func runServer(ctx context.Context) error { errChan := make(chan error, 1) // Start HTTP server 
- wg.Add(1) - go func() { - defer wg.Done() + wg.Go(func() { if err := srv.Start(ctx); err != nil { errChan <- err } - }() + }) // Handle shutdown signals wg.Add(1) @@ -364,7 +387,7 @@ func runServer(ctx context.Context) error { case <-ctx.Done(): } - runtimeEnv.SystemdNotifiy(false, "Shutting down ...") + runtime.SystemdNotify(false, "Shutting down ...") srv.Shutdown(ctx) util.FsWatcherShutdown() taskmanager.Shutdown() @@ -372,26 +395,42 @@ func runServer(ctx context.Context) error { // Set GC percent if not configured if os.Getenv(envGOGC) == "" { - debug.SetGCPercent(25) + // trigger GC when heap grows 15% above the previous live set + debug.SetGCPercent(15) } - runtimeEnv.SystemdNotifiy(true, "running") + runtime.SystemdNotify(true, "running") - // Wait for completion or error + waitDone := make(chan struct{}) go func() { wg.Wait() + close(waitDone) + }() + + go func() { + <-waitDone close(errChan) }() - // Check for server startup errors + // Wait for either: + // 1. An error from server startup + // 2. Completion of all goroutines (normal shutdown or crash) select { case err := <-errChan: + // errChan will be closed when waitDone is closed, which happens + // when all goroutines complete (either from normal shutdown or error) if err != nil { return err } case <-time.After(100 * time.Millisecond): - // Server started successfully, wait for completion - if err := <-errChan; err != nil { - return err + // Give the server 100ms to start and report any immediate startup errors + // After that, just wait for normal shutdown completion + select { + case err := <-errChan: + if err != nil { + return err + } + case <-waitDone: + // Normal shutdown completed } } diff --git a/cmd/cc-backend/server.go b/cmd/cc-backend/server.go index 975d38a1..8687db63 100644 --- a/cmd/cc-backend/server.go +++ b/cmd/cc-backend/server.go @@ -2,6 +2,7 @@ // All rights reserved. This file is part of cc-backend. 
// Use of this source code is governed by a MIT-style // license that can be found in the LICENSE file. + // Package main provides the entry point for the ClusterCockpit backend server. // This file contains HTTP server setup, routing configuration, and // authentication middleware integration. @@ -13,7 +14,6 @@ import ( "encoding/json" "errors" "fmt" - "io" "net" "net/http" "os" @@ -29,13 +29,15 @@ import ( "github.com/ClusterCockpit/cc-backend/internal/config" "github.com/ClusterCockpit/cc-backend/internal/graph" "github.com/ClusterCockpit/cc-backend/internal/graph/generated" - "github.com/ClusterCockpit/cc-backend/internal/memorystore" "github.com/ClusterCockpit/cc-backend/internal/routerConfig" + "github.com/ClusterCockpit/cc-backend/pkg/metricstore" "github.com/ClusterCockpit/cc-backend/web" - cclog "github.com/ClusterCockpit/cc-lib/ccLogger" - "github.com/ClusterCockpit/cc-lib/runtimeEnv" - "github.com/gorilla/handlers" - "github.com/gorilla/mux" + cclog "github.com/ClusterCockpit/cc-lib/v2/ccLogger" + "github.com/ClusterCockpit/cc-lib/v2/nats" + "github.com/ClusterCockpit/cc-lib/v2/runtime" + "github.com/go-chi/chi/v5" + "github.com/go-chi/chi/v5/middleware" + "github.com/go-chi/cors" httpSwagger "github.com/swaggo/http-swagger" ) @@ -48,9 +50,10 @@ const ( // Server encapsulates the HTTP server state and dependencies type Server struct { - router *mux.Router - server *http.Server - apiHandle *api.RestAPI + router chi.Router + server *http.Server + restAPIHandle *api.RestAPI + natsAPIHandle *api.NatsAPI } func onFailureResponse(rw http.ResponseWriter, r *http.Request, err error) { @@ -67,7 +70,7 @@ func NewServer(version, commit, buildDate string) (*Server, error) { buildInfo = web.Build{Version: version, Hash: commit, Buildtime: buildDate} s := &Server{ - router: mux.NewRouter(), + router: chi.NewRouter(), } if err := s.init(); err != nil { @@ -103,7 +106,28 @@ func (s *Server) init() error { authHandle := auth.GetAuthInstance() - s.apiHandle = api.New() + 
// Middleware must be defined before routes in chi + s.router.Use(func(next http.Handler) http.Handler { + return http.HandlerFunc(func(rw http.ResponseWriter, r *http.Request) { + start := time.Now() + ww := middleware.NewWrapResponseWriter(rw, r.ProtoMajor) + next.ServeHTTP(ww, r) + cclog.Debugf("%s %s (%d, %.02fkb, %dms)", + r.Method, r.URL.RequestURI(), + ww.Status(), float32(ww.BytesWritten())/1024, + time.Since(start).Milliseconds()) + }) + }) + s.router.Use(middleware.Compress(5)) + s.router.Use(middleware.Recoverer) + s.router.Use(cors.Handler(cors.Options{ + AllowCredentials: true, + AllowedHeaders: []string{"X-Requested-With", "Content-Type", "Authorization", "Origin"}, + AllowedMethods: []string{"GET", "POST", "HEAD", "OPTIONS"}, + AllowedOrigins: []string{"*"}, + })) + + s.restAPIHandle = api.New() info := map[string]any{} info["hasOpenIDConnect"] = false @@ -114,11 +138,11 @@ func (s *Server) init() error { info["hasOpenIDConnect"] = true } - s.router.HandleFunc("/login", func(rw http.ResponseWriter, r *http.Request) { + s.router.Get("/login", func(rw http.ResponseWriter, r *http.Request) { rw.Header().Add("Content-Type", "text/html; charset=utf-8") cclog.Debugf("##%v##", info) web.RenderTemplate(rw, "login.tmpl", &web.Page{Title: "Login", Build: buildInfo, Infos: info}) - }).Methods(http.MethodGet) + }) s.router.HandleFunc("/imprint", func(rw http.ResponseWriter, r *http.Request) { rw.Header().Add("Content-Type", "text/html; charset=utf-8") web.RenderTemplate(rw, "imprint.tmpl", &web.Page{Title: "Imprint", Build: buildInfo}) @@ -128,13 +152,6 @@ func (s *Server) init() error { web.RenderTemplate(rw, "privacy.tmpl", &web.Page{Title: "Privacy", Build: buildInfo}) }) - secured := s.router.PathPrefix("/").Subrouter() - securedapi := s.router.PathPrefix("/api").Subrouter() - userapi := s.router.PathPrefix("/userapi").Subrouter() - configapi := s.router.PathPrefix("/config").Subrouter() - frontendapi := s.router.PathPrefix("/frontend").Subrouter() - 
metricstoreapi := s.router.PathPrefix("/metricstore").Subrouter() - if !config.Keys.DisableAuthentication { // Create login failure handler (used by both /login and /jwt-login) loginFailureHandler := func(rw http.ResponseWriter, r *http.Request, err error) { @@ -149,10 +166,10 @@ func (s *Server) init() error { }) } - s.router.Handle("/login", authHandle.Login(loginFailureHandler)).Methods(http.MethodPost) - s.router.Handle("/jwt-login", authHandle.Login(loginFailureHandler)) + s.router.Post("/login", authHandle.Login(loginFailureHandler).ServeHTTP) + s.router.HandleFunc("/jwt-login", authHandle.Login(loginFailureHandler).ServeHTTP) - s.router.Handle("/logout", authHandle.Logout( + s.router.Post("/logout", authHandle.Logout( http.HandlerFunc(func(rw http.ResponseWriter, r *http.Request) { rw.Header().Add("Content-Type", "text/html; charset=utf-8") rw.WriteHeader(http.StatusOK) @@ -163,111 +180,158 @@ func (s *Server) init() error { Build: buildInfo, Infos: info, }) - }))).Methods(http.MethodPost) - - secured.Use(func(next http.Handler) http.Handler { - return authHandle.Auth( - // On success; - next, - - // On failure: - func(rw http.ResponseWriter, r *http.Request, err error) { - rw.WriteHeader(http.StatusUnauthorized) - web.RenderTemplate(rw, "login.tmpl", &web.Page{ - Title: "Authentication failed - ClusterCockpit", - MsgType: "alert-danger", - Message: err.Error(), - Build: buildInfo, - Infos: info, - Redirect: r.RequestURI, - }) - }) - }) - - securedapi.Use(func(next http.Handler) http.Handler { - return authHandle.AuthAPI( - // On success; - next, - // On failure: JSON Response - onFailureResponse) - }) - - userapi.Use(func(next http.Handler) http.Handler { - return authHandle.AuthUserAPI( - // On success; - next, - // On failure: JSON Response - onFailureResponse) - }) - - metricstoreapi.Use(func(next http.Handler) http.Handler { - return authHandle.AuthMetricStoreAPI( - // On success; - next, - // On failure: JSON Response - onFailureResponse) - }) - - 
configapi.Use(func(next http.Handler) http.Handler { - return authHandle.AuthConfigAPI( - // On success; - next, - // On failure: JSON Response - onFailureResponse) - }) - - frontendapi.Use(func(next http.Handler) http.Handler { - return authHandle.AuthFrontendAPI( - // On success; - next, - // On failure: JSON Response - onFailureResponse) - }) + })).ServeHTTP) } if flagDev { s.router.Handle("/playground", playground.Handler("GraphQL playground", "/query")) - s.router.PathPrefix("/swagger/").Handler(httpSwagger.Handler( - httpSwagger.URL("http://" + config.Keys.Addr + "/swagger/doc.json"))).Methods(http.MethodGet) + s.router.Get("/swagger/*", httpSwagger.Handler( + httpSwagger.URL("http://"+config.Keys.Addr+"/swagger/doc.json"))) } - secured.Handle("/query", graphQLServer) - // Send a searchId and then reply with a redirect to a user, or directly send query to job table for jobid and project. - secured.HandleFunc("/search", func(rw http.ResponseWriter, r *http.Request) { - routerConfig.HandleSearchBar(rw, r, buildInfo) + // Secured routes (require authentication) + s.router.Group(func(secured chi.Router) { + if !config.Keys.DisableAuthentication { + secured.Use(func(next http.Handler) http.Handler { + return authHandle.Auth( + next, + func(rw http.ResponseWriter, r *http.Request, err error) { + rw.WriteHeader(http.StatusUnauthorized) + web.RenderTemplate(rw, "login.tmpl", &web.Page{ + Title: "Authentication failed - ClusterCockpit", + MsgType: "alert-danger", + Message: err.Error(), + Build: buildInfo, + Infos: info, + Redirect: r.RequestURI, + }) + }) + }) + } + + secured.Handle("/query", graphQLServer) + + secured.HandleFunc("/search", func(rw http.ResponseWriter, r *http.Request) { + routerConfig.HandleSearchBar(rw, r, buildInfo) + }) + + routerConfig.SetupRoutes(secured, buildInfo) }) - // Mount all /monitoring/... and /api/... routes. 
- routerConfig.SetupRoutes(secured, buildInfo) - s.apiHandle.MountAPIRoutes(securedapi) - s.apiHandle.MountUserAPIRoutes(userapi) - s.apiHandle.MountConfigAPIRoutes(configapi) - s.apiHandle.MountFrontendAPIRoutes(frontendapi) + // API routes (JWT token auth) + s.router.Route("/api", func(apiRouter chi.Router) { + // Main API routes with API auth + apiRouter.Group(func(securedapi chi.Router) { + if !config.Keys.DisableAuthentication { + securedapi.Use(func(next http.Handler) http.Handler { + return authHandle.AuthAPI(next, onFailureResponse) + }) + } + s.restAPIHandle.MountAPIRoutes(securedapi) + }) - if memorystore.InternalCCMSFlag { - s.apiHandle.MountMetricStoreAPIRoutes(metricstoreapi) + // Metric store API routes with separate auth + apiRouter.Group(func(metricstoreapi chi.Router) { + if !config.Keys.DisableAuthentication { + metricstoreapi.Use(func(next http.Handler) http.Handler { + return authHandle.AuthMetricStoreAPI(next, onFailureResponse) + }) + } + s.restAPIHandle.MountMetricStoreAPIRoutes(metricstoreapi) + }) + }) + + // User API routes + s.router.Route("/userapi", func(userapi chi.Router) { + if !config.Keys.DisableAuthentication { + userapi.Use(func(next http.Handler) http.Handler { + return authHandle.AuthUserAPI(next, onFailureResponse) + }) + } + s.restAPIHandle.MountUserAPIRoutes(userapi) + }) + + // Config API routes (uses Group with full paths to avoid shadowing + // the /config page route that is registered in the secured group) + s.router.Group(func(configapi chi.Router) { + if !config.Keys.DisableAuthentication { + configapi.Use(func(next http.Handler) http.Handler { + return authHandle.AuthConfigAPI(next, onFailureResponse) + }) + } + s.restAPIHandle.MountConfigAPIRoutes(configapi) + }) + + // Frontend API routes + s.router.Route("/frontend", func(frontendapi chi.Router) { + if !config.Keys.DisableAuthentication { + frontendapi.Use(func(next http.Handler) http.Handler { + return authHandle.AuthFrontendAPI(next, onFailureResponse) + }) + } + 
s.restAPIHandle.MountFrontendAPIRoutes(frontendapi) + }) + + if config.Keys.APISubjects != nil { + s.natsAPIHandle = api.NewNatsAPI() + if err := s.natsAPIHandle.StartSubscriptions(); err != nil { + return fmt.Errorf("starting NATS subscriptions: %w", err) + } } + // 404 handler for pages and API routes + notFoundHandler := func(rw http.ResponseWriter, r *http.Request) { + if strings.HasPrefix(r.URL.Path, "/api/") || strings.HasPrefix(r.URL.Path, "/userapi/") || + strings.HasPrefix(r.URL.Path, "/frontend/") || strings.HasPrefix(r.URL.Path, "/config/") { + rw.Header().Set("Content-Type", "application/json") + rw.WriteHeader(http.StatusNotFound) + json.NewEncoder(rw).Encode(map[string]string{ + "status": "Resource not found", + "error": "the requested endpoint does not exist", + }) + return + } + rw.Header().Set("Content-Type", "text/html; charset=utf-8") + rw.WriteHeader(http.StatusNotFound) + web.RenderTemplate(rw, "404.tmpl", &web.Page{ + Title: "Page Not Found", + Build: buildInfo, + }) + } + + // Set NotFound on the router so chi uses it for all unmatched routes, + // including those under subrouters like /api, /userapi, /frontend, etc. 
+ s.router.NotFound(notFoundHandler) + if config.Keys.EmbedStaticFiles { if i, err := os.Stat("./var/img"); err == nil { if i.IsDir() { cclog.Info("Use local directory for static images") - s.router.PathPrefix("/img/").Handler(http.StripPrefix("/img/", http.FileServer(http.Dir("./var/img")))) + s.router.Handle("/img/*", http.StripPrefix("/img/", http.FileServer(http.Dir("./var/img")))) } } - s.router.PathPrefix("/").Handler(http.StripPrefix("/", web.ServeFiles())) + fileServer := http.StripPrefix("/", web.ServeFiles()) + s.router.Handle("/*", http.HandlerFunc(func(rw http.ResponseWriter, r *http.Request) { + if web.StaticFileExists(r.URL.Path) { + fileServer.ServeHTTP(rw, r) + return + } + notFoundHandler(rw, r) + })) } else { - s.router.PathPrefix("/").Handler(http.FileServer(http.Dir(config.Keys.StaticFiles))) + staticDir := http.Dir(config.Keys.StaticFiles) + fileServer := http.FileServer(staticDir) + s.router.Handle("/*", http.HandlerFunc(func(rw http.ResponseWriter, r *http.Request) { + f, err := staticDir.Open(r.URL.Path) + if err == nil { + f.Close() + fileServer.ServeHTTP(rw, r) + return + } + notFoundHandler(rw, r) + })) } - s.router.Use(handlers.CompressHandler) - s.router.Use(handlers.RecoveryHandler(handlers.PrintRecoveryStack(true))) - s.router.Use(handlers.CORS( - handlers.AllowCredentials(), - handlers.AllowedHeaders([]string{"X-Requested-With", "Content-Type", "Authorization", "Origin"}), - handlers.AllowedMethods([]string{"GET", "POST", "HEAD", "OPTIONS"}), - handlers.AllowedOrigins([]string{"*"}))) - return nil } @@ -278,20 +342,6 @@ const ( ) func (s *Server) Start(ctx context.Context) error { - handler := handlers.CustomLoggingHandler(io.Discard, s.router, func(_ io.Writer, params handlers.LogFormatterParams) { - if strings.HasPrefix(params.Request.RequestURI, "/api/") { - cclog.Debugf("%s %s (%d, %.02fkb, %dms)", - params.Request.Method, params.URL.RequestURI(), - params.StatusCode, float32(params.Size)/1024, - 
time.Since(params.TimeStamp).Milliseconds()) - } else { - cclog.Debugf("%s %s (%d, %.02fkb, %dms)", - params.Request.Method, params.URL.RequestURI(), - params.StatusCode, float32(params.Size)/1024, - time.Since(params.TimeStamp).Milliseconds()) - } - }) - // Use configurable timeouts with defaults readTimeout := time.Duration(defaultReadTimeout) * time.Second writeTimeout := time.Duration(defaultWriteTimeout) * time.Second @@ -299,7 +349,7 @@ func (s *Server) Start(ctx context.Context) error { s.server = &http.Server{ ReadTimeout: readTimeout, WriteTimeout: writeTimeout, - Handler: handler, + Handler: s.router, Addr: config.Keys.Addr, } @@ -338,7 +388,7 @@ func (s *Server) Start(ctx context.Context) error { // Because this program will want to bind to a privileged port (like 80), the listener must // be established first, then the user can be changed, and after that, // the actual http server can be started. - if err := runtimeEnv.DropPrivileges(config.Keys.Group, config.Keys.User); err != nil { + if err := runtime.DropPrivileges(config.Keys.Group, config.Keys.User); err != nil { return fmt.Errorf("dropping privileges: %w", err) } @@ -363,14 +413,21 @@ func (s *Server) Shutdown(ctx context.Context) { shutdownCtx, cancel := context.WithTimeout(ctx, 30*time.Second) defer cancel() + nc := nats.GetClient() + if nc != nil { + nc.Close() + } + // First shut down the server gracefully (waiting for all ongoing requests) if err := s.server.Shutdown(shutdownCtx); err != nil { cclog.Errorf("Server shutdown error: %v", err) } // Archive all the metric store data - if memorystore.InternalCCMSFlag { - memorystore.Shutdown() + ms := metricstore.GetMemoryStore() + + if ms != nil { + metricstore.Shutdown() } // Shutdown archiver with 10 second timeout for fast shutdown diff --git a/configs/config-demo.json b/configs/config-demo.json index 70ca2a02..8c72e37f 100644 --- a/configs/config-demo.json +++ b/configs/config-demo.json @@ -1,91 +1,26 @@ { "main": { - "addr": "127.0.0.1:8080", 
- "short-running-jobs-duration": 300, - "resampling": { - "minimumPoints": 600, - "trigger": 180, - "resolutions": [ - 240, - 60 - ] - }, - "apiAllowedIPs": [ - "*" - ], - "emission-constant": 317 + "addr": "127.0.0.1:8080" }, "cron": { - "commit-job-worker": "2m", - "duration-worker": "5m", - "footprint-worker": "10m" - }, - "archive": { - "kind": "file", - "path": "./var/job-archive" + "commit-job-worker": "1m", + "duration-worker": "3m", + "footprint-worker": "5m" }, "auth": { "jwts": { "max-age": "2000h" } }, - "clusters": [ + "metric-store-external": [ { - "name": "fritz", - "metricDataRepository": { - "kind": "cc-metric-store-internal", - "url": "http://localhost:8082", - "token": "eyJ0eXAiOiJKV1QiLCJhbGciOiJFZERTQSJ9.eyJ1c2VyIjoiYWRtaW4iLCJyb2xlcyI6WyJST0xFX0FETUlOIiwiUk9MRV9BTkFMWVNUIiwiUk9MRV9VU0VSIl19.d-3_3FZTsadPjDEdsWrrQ7nS0edMAR4zjl-eK7rJU3HziNBfI9PDHDIpJVHTNN5E5SlLGLFXctWyKAkwhXL-Dw" - }, - "filterRanges": { - "numNodes": { - "from": 1, - "to": 64 - }, - "duration": { - "from": 0, - "to": 86400 - }, - "startTime": { - "from": "2022-01-01T00:00:00Z", - "to": null - } - } - }, - { - "name": "alex", - "metricDataRepository": { - "kind": "cc-metric-store-internal", - "url": "http://localhost:8082", - "token": "eyJ0eXAiOiJKV1QiLCJhbGciOiJFZERTQSJ9.eyJ1c2VyIjoiYWRtaW4iLCJyb2xlcyI6WyJST0xFX0FETUlOIiwiUk9MRV9BTkFMWVNUIiwiUk9MRV9VU0VSIl19.d-3_3FZTsadPjDEdsWrrQ7nS0edMAR4zjl-eK7rJU3HziNBfI9PDHDIpJVHTNN5E5SlLGLFXctWyKAkwhXL-Dw" - }, - "filterRanges": { - "numNodes": { - "from": 1, - "to": 64 - }, - "duration": { - "from": 0, - "to": 86400 - }, - "startTime": { - "from": "2022-01-01T00:00:00Z", - "to": null - } - } + "scope": "fritz", + "url": "http://0.0.0.0:8082", + "token": "eyJhbGciOiJFZERTQSIsInR5cCI6IkpXVCJ9.eyJleHAiOjE3NzU3Nzg4NDQsImlhdCI6MTc2ODU3ODg0NCwicm9sZXMiOlsiYWRtaW4iLCJhcGkiXSwic3ViIjoiZGVtbyJ9._SDEW9WaUVXSBFmWqGhyIZXLoqoDU8F1hkfh4cXKIqF4yw7w50IUpfUBtwUFUOnoviFKoi563f6RAMC7XxeLDA" } ], "metric-store": { - "checkpoints": { - "file-format": "avro", - 
"interval": "1h", - "directory": "./var/checkpoints", - "restore": "48h" - }, - "archive": { - "interval": "1h", - "directory": "./var/archive" - }, - "retention-in-memory": "48h" + "retention-in-memory": "24h", + "memory-cap": 100 } -} \ No newline at end of file +} diff --git a/configs/config-mariadb.json b/configs/config-mariadb.json deleted file mode 100644 index 38bb8a93..00000000 --- a/configs/config-mariadb.json +++ /dev/null @@ -1,64 +0,0 @@ -{ - "addr": "127.0.0.1:8080", - "short-running-jobs-duration": 300, - "archive": { - "kind": "file", - "path": "./var/job-archive" - }, - "jwts": { - "max-age": "2000h" - }, - "db-driver": "mysql", - "db": "clustercockpit:demo@tcp(127.0.0.1:3306)/clustercockpit", - "enable-resampling": { - "trigger": 30, - "resolutions": [600, 300, 120, 60] - }, - "emission-constant": 317, - "clusters": [ - { - "name": "fritz", - "metricDataRepository": { - "kind": "cc-metric-store", - "url": "http://localhost:8082", - "token": "" - }, - "filterRanges": { - "numNodes": { - "from": 1, - "to": 64 - }, - "duration": { - "from": 0, - "to": 86400 - }, - "startTime": { - "from": "2022-01-01T00:00:00Z", - "to": null - } - } - }, - { - "name": "alex", - "metricDataRepository": { - "kind": "cc-metric-store", - "url": "http://localhost:8082", - "token": "" - }, - "filterRanges": { - "numNodes": { - "from": 1, - "to": 64 - }, - "duration": { - "from": 0, - "to": 86400 - }, - "startTime": { - "from": "2022-01-01T00:00:00Z", - "to": null - } - } - } - ] -} diff --git a/configs/config.json b/configs/config.json index 5bffc969..6b654e4f 100644 --- a/configs/config.json +++ b/configs/config.json @@ -5,46 +5,93 @@ "https-key-file": "/etc/letsencrypt/live/url/privkey.pem", "user": "clustercockpit", "group": "clustercockpit", - "validate": false, - "apiAllowedIPs": ["*"], + "api-allowed-ips": ["*"], "short-running-jobs-duration": 300, + "enable-job-taggers": true, + "nodestate-retention": { + "policy": "move", + "target-kind": "file", + "target-path": 
"./var/nodestate-archive" + }, "resampling": { - "trigger": 30, - "resolutions": [600, 300, 120, 60] + "minimum-points": 600, + "trigger": 180, + "resolutions": [240, 60] + }, + "api-subjects": { + "subject-job-event": "cc.job.event", + "subject-node-state": "cc.node.state" + } + }, + "nats": { + "address": "nats://0.0.0.0:4222", + "username": "root", + "password": "root" + }, + "auth": { + "jwts": { + "max-age": "2000h" } }, "cron": { - "commit-job-worker": "2m", + "commit-job-worker": "1m", "duration-worker": "5m", "footprint-worker": "10m" }, "archive": { - "kind": "file", - "path": "./var/job-archive" - }, - "clusters": [ - { - "name": "test", - "metricDataRepository": { - "kind": "cc-metric-store", - "url": "http://localhost:8082", - "token": "eyJhbGciOiJF-E-pQBQ" - }, - "filterRanges": { - "numNodes": { - "from": 1, - "to": 64 - }, - "duration": { - "from": 0, - "to": 86400 - }, - "startTime": { - "from": "2022-01-01T00:00:00Z", - "to": null - } - } + "kind": "s3", + "endpoint": "http://x.x.x.x", + "bucket": "jobarchive", + "access-key": "xx", + "secret-key": "xx", + "retention": { + "policy": "move", + "age": 365, + "location": "./var/archive" } - ] + }, + "metric-store-external": [ + { + "scope": "*", + "url": "http://x.x.x.x:8082", + "token": "MySecret" + }, + { + "scope": "fritz", + "url": "http://x.x.x.x:8084", + "token": "MySecret" + }, + { + "scope": "fritz-spr1tb", + "url": "http://x.x.x.x:8083", + "token": "MySecret" + }, + { + "scope": "alex", + "url": "http://x.x.x.x:8084", + "token": "MySecret" + } + ], + "metric-store": { + "checkpoints": { + "directory": "./var/checkpoints" + }, + "memory-cap": 100, + "retention-in-memory": "24h", + "cleanup": { + "mode": "archive", + "directory": "./var/archive" + }, + "nats-subscriptions": [ + { + "subscribe-to": "hpc-nats", + "cluster-tag": "fritz" + }, + { + "subscribe-to": "hpc-nats", + "cluster-tag": "alex" + } + ] + }, + "ui-file": "ui-config.json" } - diff --git a/configs/startJobPayload.json 
b/configs/startJobPayload.json new file mode 100644 index 00000000..9517876f --- /dev/null +++ b/configs/startJobPayload.json @@ -0,0 +1,22 @@ +{ + "cluster": "fritz", + "jobId": 123000, + "jobState": "running", + "numAcc": 0, + "numHwthreads": 72, + "numNodes": 1, + "partition": "main", + "requestedMemory": 128000, + "resources": [{ "hostname": "f0726" }], + "startTime": 1649723812, + "subCluster": "main", + "submitTime": 1649723812, + "user": "k106eb10", + "project": "k106eb", + "walltime": 86400, + "metaData": { + "slurmInfo": "JobId=398759\nJobName=myJob\nUserId=dummyUser\nGroupId=dummyGroup\nAccount=dummyAccount\nQOS=normal Requeue=False Restarts=0 BatchFlag=True\nTimeLimit=1439'\nSubmitTime=2023-02-09T14:10:18\nPartition=singlenode\nNodeList=xx\nNumNodes=xx NumCPUs=72 NumTasks=72 CPUs/Task=1\nNTasksPerNode:Socket:Core=0:None:None\nTRES_req=cpu=72,mem=250000M,node=1,billing=72\nTRES_alloc=cpu=72,node=1,billing=72\nCommand=myCmd\nWorkDir=myDir\nStdErr=\nStdOut=\n", + "jobScript": "#!/bin/bash -l\n#SBATCH --job-name=dummy_job\n#SBATCH --time=23:59:00\n#SBATCH --partition=singlenode\n#SBATCH --ntasks=72\n#SBATCH --hint=multithread\n#SBATCH --chdir=/home/atuin/k106eb/dummy/\n#SBATCH --export=NONE\nunset SLURM_EXPORT_ENV\n\n#This is a dummy job script\n./mybinary\n", + "jobName": "ams_pipeline" + } +} diff --git a/configs/stopJobPayload.json b/configs/stopJobPayload.json new file mode 100644 index 00000000..baf76f95 --- /dev/null +++ b/configs/stopJobPayload.json @@ -0,0 +1,7 @@ +{ + "cluster": "fritz", + "jobId": 123000, + "jobState": "completed", + "startTime": 1649723812, + "stopTime": 1649763839 +} diff --git a/configs/tagger/README.md b/configs/tagger/README.md new file mode 100644 index 00000000..759cbe97 --- /dev/null +++ b/configs/tagger/README.md @@ -0,0 +1,419 @@ +# Job Tagging Configuration + +ClusterCockpit provides automatic job tagging functionality to classify and +categorize jobs based on configurable rules. 
The tagging system consists of two +main components: + +1. **Application Detection** - Identifies which application a job is running +2. **Job Classification** - Analyzes job performance characteristics and applies classification tags + +## Directory Structure + +``` +configs/tagger/ +├── apps/ # Application detection patterns +│ ├── vasp.txt +│ ├── gromacs.txt +│ └── ... +└── jobclasses/ # Job classification rules + ├── parameters.json + ├── lowUtilization.json + ├── highload.json + └── ... +``` + +## Activating Tagger Rules + +### Step 1: Copy Configuration Files + +To activate tagging, review, adapt, and copy the configuration files from +`configs/tagger/` to `var/tagger/`: + +```bash +# From the cc-backend root directory +mkdir -p var/tagger +cp -r configs/tagger/apps var/tagger/ +cp -r configs/tagger/jobclasses var/tagger/ +``` + +### Step 2: Enable Tagging in Configuration + +Add or set the following configuration key in the `main` section of your `config.json`: + +```json +{ + "enable-job-taggers": true +} +``` + +**Important**: Automatic tagging is disabled by default. You must explicitly +enable it by setting `enable-job-taggers: true` in the main configuration file. + +### Step 3: Restart cc-backend + +The tagger system automatically loads configuration from `./var/tagger/` at +startup. 
After copying the files and enabling the feature, restart cc-backend: + +```bash +./cc-backend -server +``` + +### Step 4: Verify Configuration Loaded + +Check the logs for messages indicating successful configuration loading: + +``` +[INFO] Setup file watch for ./var/tagger/apps +[INFO] Setup file watch for ./var/tagger/jobclasses +``` + +## How Tagging Works + +### Automatic Tagging + +When `enable-job-taggers` is set to `true` in the configuration, tags are +automatically applied when: + +- **Job Start**: Application detection runs immediately when a job starts +- **Job Stop**: Job classification runs when a job completes + +The system analyzes job metadata and metrics to determine appropriate tags. + +**Note**: Automatic tagging only works for jobs that start or stop after the +feature is enabled. Existing jobs are not automatically retagged. + +### Manual Tagging (Retroactive) + +To apply tags to existing jobs in the database, use the `-apply-tags` command +line option: + +```bash +./cc-backend -apply-tags +``` + +This processes all jobs in the database and applies current tagging rules. This +is useful when: + +- You have existing jobs that were created before tagging was enabled +- You've added new tagging rules and want to apply them to historical data +- You've modified existing rules and want to re-evaluate all jobs + +### Hot Reload + +The tagger system watches the configuration directories for changes. You can +modify or add rules without restarting `cc-backend`: + +- Changes to `var/tagger/apps/*` are detected automatically +- Changes to `var/tagger/jobclasses/*` are detected automatically + +## Application Detection + +Application detection identifies which software a job is running by matching +patterns in the job script. + +### Configuration Format + +Application patterns are stored in text files under `var/tagger/apps/`. Each +file contains one or more regular expression patterns (one per line) that match +against the job script. 
+ +**Example: `apps/vasp.txt`** + +``` +vasp +VASP +``` + +### How It Works + +1. When a job starts, the system retrieves the job script from metadata +2. Each line in the app files is treated as a regex pattern +3. Patterns are matched case-insensitively against the lowercased job script +4. If a match is found, a tag of type `app` with the filename (without extension) is applied +5. Only the first matching application is tagged + +### Adding New Applications + +1. Create a new file in `var/tagger/apps/` (e.g., `tensorflow.txt`) +2. Add regex patterns, one per line: + + ``` + tensorflow + tf\.keras + import tensorflow + ``` + +3. The file is automatically detected and loaded + +**Note**: The tag name will be the filename without the `.txt` extension (e.g., `tensorflow`). + +## Job Classification + +Job classification analyzes completed jobs based on their metrics and properties +to identify performance issues or characteristics. + +### Configuration Format + +Job classification rules are defined in JSON files under +`var/tagger/jobclasses/`. 
Each rule file defines: + +- **Metrics required**: Which job metrics to analyze +- **Requirements**: Pre-conditions that must be met +- **Variables**: Computed values used in the rule +- **Rule expression**: Boolean expression that determines if the rule matches +- **Hint template**: Message displayed when the rule matches + +### Parameters File + +`jobclasses/parameters.json` defines shared threshold values used across multiple rules: + +```json +{ + "lowcpuload_threshold_factor": 0.9, + "highmemoryusage_threshold_factor": 0.9, + "job_min_duration_seconds": 600.0, + "sampling_interval_seconds": 30.0 +} +``` + +### Rule File Structure + +**Example: `jobclasses/lowUtilization.json`** + +```json +{ + "name": "Low resource utilization", + "tag": "lowutilization", + "parameters": ["job_min_duration_seconds"], + "metrics": ["flops_any", "mem_bw"], + "requirements": [ + "job.shared == \"none\"", + "job.duration > job_min_duration_seconds" + ], + "variables": [ + { + "name": "mem_bw_perc", + "expr": "1.0 - (mem_bw.avg / mem_bw.limits.peak)" + } + ], + "rule": "flops_any.avg < flops_any.limits.alert", + "hint": "Average flop rate {{.flops_any.avg}} falls below threshold {{.flops_any.limits.alert}}" +} +``` + +#### Field Descriptions + +| Field | Description | +| -------------- | ----------------------------------------------------------------------------- | +| `name` | Human-readable description of the rule | +| `tag` | Tag identifier applied when the rule matches | +| `parameters` | List of parameter names from `parameters.json` to include in rule environment | +| `metrics` | List of metrics required for evaluation (must be present in job data) | +| `requirements` | Boolean expressions that must all be true for the rule to be evaluated | +| `variables` | Named expressions computed before evaluating the main rule | +| `rule` | Boolean expression that determines if the job matches this classification | +| `hint` | Go template string for generating a user-visible message | + 
+### Expression Environment + +Expressions in `requirements`, `variables`, and `rule` have access to: + +**Job Properties:** + +- `job.shared` - Shared node allocation type +- `job.duration` - Job runtime in seconds +- `job.numCores` - Number of CPU cores +- `job.numNodes` - Number of nodes +- `job.jobState` - Job completion state +- `job.numAcc` - Number of accelerators +- `job.smt` - SMT setting + +**Metric Statistics (for each metric in `metrics`):** + +- `.min` - Minimum value +- `.max` - Maximum value +- `.avg` - Average value +- `.limits.peak` - Peak limit from cluster config +- `.limits.normal` - Normal threshold +- `.limits.caution` - Caution threshold +- `.limits.alert` - Alert threshold + +**Parameters:** + +- All parameters listed in the `parameters` field + +**Variables:** + +- All variables defined in the `variables` array + +### Expression Language + +Rules use the [expr](https://github.com/expr-lang/expr) language for expressions. Supported operations: + +- **Arithmetic**: `+`, `-`, `*`, `/`, `%`, `^` +- **Comparison**: `==`, `!=`, `<`, `<=`, `>`, `>=` +- **Logical**: `&&`, `||`, `!` +- **Functions**: Standard math functions (see expr documentation) + +### Hint Templates + +Hints use Go's `text/template` syntax. Variables from the evaluation environment are accessible: + +``` +{{.flops_any.avg}} # Access metric average +{{.job.duration}} # Access job property +{{.my_variable}} # Access computed variable +``` + +### Adding New Classification Rules + +1. Create a new JSON file in `var/tagger/jobclasses/` (e.g., `memoryLeak.json`) +2. 
Define the rule structure: + + ```json + { + "name": "Memory Leak Detection", + "tag": "memory_leak", + "parameters": ["memory_leak_slope_threshold"], + "metrics": ["mem_used"], + "requirements": ["job.duration > 3600"], + "variables": [ + { + "name": "mem_growth", + "expr": "(mem_used.max - mem_used.min) / job.duration" + } + ], + "rule": "mem_growth > memory_leak_slope_threshold", + "hint": "Memory usage grew by {{.mem_growth}} per second" + } + ``` + +3. Add any new parameters to `parameters.json` +4. The file is automatically detected and loaded + +## Configuration Paths + +The tagger system reads from these paths (relative to cc-backend working directory): + +- **Application patterns**: `./var/tagger/apps/` +- **Job classification rules**: `./var/tagger/jobclasses/` + +These paths are defined as constants in the source code and cannot be changed without recompiling. + +## Troubleshooting + +### Tags Not Applied + +1. **Check tagging is enabled**: Verify `enable-job-taggers: true` is set in `config.json` + +2. **Check configuration exists**: + + ```bash + ls -la var/tagger/apps + ls -la var/tagger/jobclasses + ``` + +3. **Check logs for errors**: + + ```bash + ./cc-backend -server -loglevel debug + ``` + +4. **Verify file permissions**: Ensure cc-backend can read the configuration files + +5. **For existing jobs**: Use `./cc-backend -apply-tags` to retroactively tag jobs + +### Rules Not Matching + +1. **Enable debug logging**: Set `loglevel: debug` to see detailed rule evaluation +2. **Check requirements**: Ensure all requirements in the rule are satisfied +3. **Verify metrics exist**: Classification rules require job metrics to be available +4. **Check metric names**: Ensure metric names match those in your cluster configuration + +### File Watch Not Working + +If changes to configuration files aren't detected: + +1. Restart cc-backend to reload all configuration +2. Check filesystem supports file watching (network filesystems may not) +3. 
Check logs for file watch setup messages + +## Best Practices + +1. **Start Simple**: Begin with basic rules and refine based on results +2. **Use Requirements**: Filter out irrelevant jobs early with requirements +3. **Test Incrementally**: Add one rule at a time and verify behavior +4. **Document Rules**: Use descriptive names and clear hint messages +5. **Share Parameters**: Define common thresholds in `parameters.json` for consistency +6. **Version Control**: Keep your `var/tagger/` configuration in version control +7. **Backup Before Changes**: Test new rules on a copy before deploying to production + +## Examples + +### Simple Application Detection + +**File: `var/tagger/apps/python.txt`** + +``` +python +python3 +\.py +``` + +This detects jobs running Python scripts. + +### Complex Classification Rule + +**File: `var/tagger/jobclasses/cpuImbalance.json`** + +```json +{ + "name": "CPU Load Imbalance", + "tag": "cpu_imbalance", + "parameters": ["core_load_imbalance_threshold_factor"], + "metrics": ["cpu_load"], + "requirements": ["job.numCores > 1", "job.duration > 600"], + "variables": [ + { + "name": "load_variance", + "expr": "(cpu_load.max - cpu_load.min) / cpu_load.avg" + } + ], + "rule": "load_variance > core_load_imbalance_threshold_factor", + "hint": "CPU load varies by {{printf \"%.1f%%\" (load_variance * 100)}} across cores" +} +``` + +This detects jobs where CPU load is unevenly distributed across cores. 
+ +## Reference + +### Configuration Options + +**Main Configuration (`config.json`)**: + +- `enable-job-taggers` (boolean, default: `false`) - Enables automatic job tagging system + - Must be set to `true` to activate automatic tagging on job start/stop events + - Does not affect the `-apply-tags` command line option + +**Command Line Options**: + +- `-apply-tags` - Apply all tagging rules to existing jobs in the database + - Works independently of `enable-job-taggers` configuration + - Useful for retroactively tagging jobs or re-evaluating with updated rules + +### Default Configuration Location + +The example configurations are provided in: + +- `configs/tagger/apps/` - Example application patterns (16 applications) +- `configs/tagger/jobclasses/` - Example classification rules (3 rules) + +Copy these to `var/tagger/` and customize for your environment. + +### Tag Types + +- `app` - Application tags (e.g., "vasp", "gromacs") +- `jobClass` - Classification tags (e.g., "lowutilization", "highload") + +Tags can be queried and filtered in the ClusterCockpit UI and API. 
diff --git a/internal/tagger/apps/alf.txt b/configs/tagger/apps/alf.txt similarity index 100% rename from internal/tagger/apps/alf.txt rename to configs/tagger/apps/alf.txt diff --git a/internal/tagger/apps/caracal.txt b/configs/tagger/apps/caracal.txt similarity index 87% rename from internal/tagger/apps/caracal.txt rename to configs/tagger/apps/caracal.txt index ed615121..5c5311f7 100644 --- a/internal/tagger/apps/caracal.txt +++ b/configs/tagger/apps/caracal.txt @@ -2,6 +2,5 @@ calc_rate qmdffgen dynamic evbopt -explore black_box poly_qmdff diff --git a/internal/tagger/apps/chroma.txt b/configs/tagger/apps/chroma.txt similarity index 100% rename from internal/tagger/apps/chroma.txt rename to configs/tagger/apps/chroma.txt diff --git a/internal/tagger/apps/cp2k.txt b/configs/tagger/apps/cp2k.txt similarity index 100% rename from internal/tagger/apps/cp2k.txt rename to configs/tagger/apps/cp2k.txt diff --git a/internal/tagger/apps/cpmd.txt b/configs/tagger/apps/cpmd.txt similarity index 100% rename from internal/tagger/apps/cpmd.txt rename to configs/tagger/apps/cpmd.txt diff --git a/internal/tagger/apps/flame.txt b/configs/tagger/apps/flame.txt similarity index 100% rename from internal/tagger/apps/flame.txt rename to configs/tagger/apps/flame.txt diff --git a/internal/tagger/apps/gromacs.txt b/configs/tagger/apps/gromacs.txt similarity index 100% rename from internal/tagger/apps/gromacs.txt rename to configs/tagger/apps/gromacs.txt diff --git a/internal/tagger/apps/julia.txt b/configs/tagger/apps/julia.txt similarity index 100% rename from internal/tagger/apps/julia.txt rename to configs/tagger/apps/julia.txt diff --git a/configs/tagger/apps/lammps.txt b/configs/tagger/apps/lammps.txt new file mode 100644 index 00000000..38d3aa5d --- /dev/null +++ b/configs/tagger/apps/lammps.txt @@ -0,0 +1 @@ +\blmp\s+ diff --git a/internal/tagger/apps/matlab.txt b/configs/tagger/apps/matlab.txt similarity index 100% rename from internal/tagger/apps/matlab.txt rename to 
configs/tagger/apps/matlab.txt diff --git a/internal/tagger/apps/openfoam.txt b/configs/tagger/apps/openfoam.txt similarity index 100% rename from internal/tagger/apps/openfoam.txt rename to configs/tagger/apps/openfoam.txt diff --git a/internal/tagger/apps/orca.txt b/configs/tagger/apps/orca.txt similarity index 100% rename from internal/tagger/apps/orca.txt rename to configs/tagger/apps/orca.txt diff --git a/internal/tagger/apps/python.txt b/configs/tagger/apps/python.txt similarity index 100% rename from internal/tagger/apps/python.txt rename to configs/tagger/apps/python.txt diff --git a/internal/tagger/apps/starccm.txt b/configs/tagger/apps/starccm.txt similarity index 100% rename from internal/tagger/apps/starccm.txt rename to configs/tagger/apps/starccm.txt diff --git a/internal/tagger/apps/turbomole.txt b/configs/tagger/apps/turbomole.txt similarity index 100% rename from internal/tagger/apps/turbomole.txt rename to configs/tagger/apps/turbomole.txt diff --git a/configs/tagger/apps/vasp.txt b/configs/tagger/apps/vasp.txt new file mode 100644 index 00000000..79014e1d --- /dev/null +++ b/configs/tagger/apps/vasp.txt @@ -0,0 +1,3 @@ +vasp_gam +vasp_ncl +vasp_std diff --git a/configs/tagger/jobclasses/highMemoryUsage.json b/configs/tagger/jobclasses/highMemoryUsage.json new file mode 100644 index 00000000..878cd669 --- /dev/null +++ b/configs/tagger/jobclasses/highMemoryUsage.json @@ -0,0 +1,21 @@ +{ + "name": "High memory usage", + "tag": "highmemory", + "parameters": [ + "highmemoryusage_threshold_factor", + "job_min_duration_seconds" + ], + "metrics": ["mem_used"], + "requirements": [ + "job.shared == \"none\"", + "job.duration > job_min_duration_seconds" + ], + "variables": [ + { + "name": "memory_usage_pct", + "expr": "mem_used.max / mem_used.limits.peak * 100.0" + } + ], + "rule": "mem_used.max > mem_used.limits.alert", + "hint": "This job used high memory: peak memory usage {{.mem_used.max}} GB ({{.memory_usage_pct}}% of {{.mem_used.limits.peak}} GB node 
capacity), exceeding the {{.highmemoryusage_threshold_factor}} utilization threshold. Risk of out-of-memory conditions." +} diff --git a/internal/tagger/jobclasses/highload.json b/configs/tagger/jobclasses/highload.json similarity index 58% rename from internal/tagger/jobclasses/highload.json rename to configs/tagger/jobclasses/highload.json index 9667011b..a442a3ac 100644 --- a/internal/tagger/jobclasses/highload.json +++ b/configs/tagger/jobclasses/highload.json @@ -3,8 +3,7 @@ "tag": "excessiveload", "parameters": [ "excessivecpuload_threshold_factor", - "job_min_duration_seconds", - "sampling_interval_seconds" + "job_min_duration_seconds" ], "metrics": ["cpu_load"], "requirements": [ @@ -15,12 +14,8 @@ { "name": "load_threshold", "expr": "cpu_load.limits.peak * excessivecpuload_threshold_factor" - }, - { - "name": "load_perc", - "expr": "1.0 - (cpu_load.avg / cpu_load.limits.peak)" } ], "rule": "cpu_load.avg > load_threshold", - "hint": "This job was detected as excessiveload because the average cpu load {{.cpu_load.avg}} falls above the threshold {{.load_threshold}}." + "hint": "This job was detected as having excessive CPU load: average cpu load {{.cpu_load.avg}} exceeds the oversubscription threshold {{.load_threshold}} ({{.excessivecpuload_threshold_factor}} \u00d7 {{.cpu_load.limits.peak}} peak cores), indicating CPU contention." 
} diff --git a/configs/tagger/jobclasses/lowUtilization.json b/configs/tagger/jobclasses/lowUtilization.json new file mode 100644 index 00000000..1d365150 --- /dev/null +++ b/configs/tagger/jobclasses/lowUtilization.json @@ -0,0 +1,22 @@ +{ + "name": "Low resource utilization", + "tag": "lowutilization", + "parameters": ["job_min_duration_seconds"], + "metrics": ["flops_any", "mem_bw"], + "requirements": [ + "job.shared == \"none\"", + "job.duration > job_min_duration_seconds" + ], + "variables": [ + { + "name": "mem_bw_pct", + "expr": "mem_bw.avg / mem_bw.limits.peak * 100.0" + }, + { + "name": "flops_any_pct", + "expr": "flops_any.avg / flops_any.limits.peak * 100.0" + } + ], + "rule": "flops_any.avg < flops_any.limits.alert && mem_bw.avg < mem_bw.limits.alert", + "hint": "This job shows low resource utilization: FLOP rate {{.flops_any.avg}} GF/s ({{.flops_any_pct}}% of peak) and memory bandwidth {{.mem_bw.avg}} GB/s ({{.mem_bw_pct}}% of peak) are both below their alert thresholds." +} diff --git a/configs/tagger/jobclasses/lowload.json b/configs/tagger/jobclasses/lowload.json new file mode 100644 index 00000000..767d8f45 --- /dev/null +++ b/configs/tagger/jobclasses/lowload.json @@ -0,0 +1,18 @@ +{ + "name": "Low CPU load", + "tag": "lowload", + "parameters": ["lowcpuload_threshold_factor", "job_min_duration_seconds"], + "metrics": ["cpu_load"], + "requirements": [ + "job.shared == \"none\"", + "job.duration > job_min_duration_seconds" + ], + "variables": [ + { + "name": "load_threshold", + "expr": "cpu_load.limits.peak * lowcpuload_threshold_factor" + } + ], + "rule": "cpu_load.avg < load_threshold", + "hint": "This job was detected as low CPU load: average cpu load {{.cpu_load.avg}} is below the threshold {{.load_threshold}} ({{.lowcpuload_threshold_factor}})." 
+} diff --git a/configs/tagger/jobclasses/memoryBound.json b/configs/tagger/jobclasses/memoryBound.json new file mode 100644 index 00000000..01368c08 --- /dev/null +++ b/configs/tagger/jobclasses/memoryBound.json @@ -0,0 +1,22 @@ +{ + "name": "Memory bandwidth bound", + "tag": "memorybound", + "parameters": ["membound_bw_threshold_factor", "job_min_duration_seconds"], + "metrics": ["mem_bw"], + "requirements": [ + "job.shared == \"none\"", + "job.duration > job_min_duration_seconds" + ], + "variables": [ + { + "name": "mem_bw_threshold", + "expr": "mem_bw.limits.peak * membound_bw_threshold_factor" + }, + { + "name": "mem_bw_pct", + "expr": "mem_bw.avg / mem_bw.limits.peak * 100.0" + } + ], + "rule": "mem_bw.avg > mem_bw_threshold", + "hint": "This job is memory bandwidth bound: memory bandwidth {{.mem_bw.avg}} GB/s ({{.mem_bw_pct}}% of peak) is within {{.membound_bw_threshold_factor}} of peak bandwidth. Consider improving data reuse or compute intensity." +} diff --git a/internal/tagger/jobclasses/parameters.json b/configs/tagger/jobclasses/parameters.json similarity index 77% rename from internal/tagger/jobclasses/parameters.json rename to configs/tagger/jobclasses/parameters.json index 39e94c1c..c3fb5cdc 100644 --- a/internal/tagger/jobclasses/parameters.json +++ b/configs/tagger/jobclasses/parameters.json @@ -1,11 +1,12 @@ { - "lowcpuload_threshold_factor": 0.9, - "excessivecpuload_threshold_factor": 1.1, + "lowcpuload_threshold_factor": 0.85, + "excessivecpuload_threshold_factor": 1.2, "highmemoryusage_threshold_factor": 0.9, "node_load_imbalance_threshold_factor": 0.1, "core_load_imbalance_threshold_factor": 0.1, "high_memory_load_threshold_factor": 0.9, "lowgpuload_threshold_factor": 0.7, + "membound_bw_threshold_factor": 0.8, "memory_leak_slope_threshold": 0.1, "job_min_duration_seconds": 600.0, "sampling_interval_seconds": 30.0, diff --git a/configs/uiConfig.json b/configs/uiConfig.json index 2a3ed307..f2b89232 100644 --- a/configs/uiConfig.json +++ 
b/configs/uiConfig.json @@ -1,38 +1,38 @@ { - "jobList": { - "usePaging": false, - "showFootprint":false + "job-list": { + "use-paging": false, + "show-footprint":false }, - "jobView": { - "showPolarPlot": true, - "showFootprint": true, - "showRoofline": true, - "showStatTable": true + "job-view": { + "show-polar-plot": true, + "show-footprint": true, + "show-roofline": true, + "show-stat-table": true }, - "metricConfig": { - "jobListMetrics": ["mem_bw", "flops_dp"], - "jobViewPlotMetrics": ["mem_bw", "flops_dp"], - "jobViewTableMetrics": ["mem_bw", "flops_dp"], + "metric-config": { + "job-list-metrics": ["mem_bw", "flops_dp"], + "job-view-plot-metrics": ["mem_bw", "flops_dp"], + "job-view-table-metrics": ["mem_bw", "flops_dp"], "clusters": [ { "name": "test", - "subClusters": [ + "sub-clusters": [ { "name": "one", - "jobListMetrics": ["mem_used", "flops_sp"] + "job-list-metrics": ["mem_used", "flops_sp"] } ] } ] }, - "nodeList": { - "usePaging": true + "node-list": { + "use-paging": true }, - "plotConfiguration": { - "plotsPerRow": 3, - "colorBackground": true, - "lineWidth": 3, - "colorScheme": [ + "plot-configuration": { + "plots-per-row": 3, + "color-background": true, + "line-width": 3, + "color-scheme": [ "#00bfff", "#0000ff", "#ff00ff", diff --git a/go.mod b/go.mod index 3b3583bd..b03e7e0b 100644 --- a/go.mod +++ b/go.mod @@ -1,123 +1,123 @@ module github.com/ClusterCockpit/cc-backend -go 1.24.0 +go 1.25.0 -toolchain go1.24.1 +tool ( + github.com/99designs/gqlgen + github.com/swaggo/swag/cmd/swag +) require ( - github.com/99designs/gqlgen v0.17.84 - github.com/ClusterCockpit/cc-lib v1.0.0 + github.com/99designs/gqlgen v0.17.87 + github.com/ClusterCockpit/cc-lib/v2 v2.8.0 + github.com/ClusterCockpit/cc-line-protocol/v2 v2.4.0 github.com/Masterminds/squirrel v1.5.4 - github.com/aws/aws-sdk-go-v2 v1.41.0 - github.com/aws/aws-sdk-go-v2/config v1.31.20 - github.com/aws/aws-sdk-go-v2/credentials v1.18.24 - github.com/aws/aws-sdk-go-v2/service/s3 v1.90.2 - 
github.com/coreos/go-oidc/v3 v3.16.0 - github.com/expr-lang/expr v1.17.6 - github.com/go-co-op/gocron/v2 v2.18.2 + github.com/aws/aws-sdk-go-v2 v1.41.2 + github.com/aws/aws-sdk-go-v2/config v1.32.10 + github.com/aws/aws-sdk-go-v2/credentials v1.19.10 + github.com/aws/aws-sdk-go-v2/service/s3 v1.96.2 + github.com/coreos/go-oidc/v3 v3.17.0 + github.com/expr-lang/expr v1.17.8 + github.com/go-chi/chi/v5 v5.2.5 + github.com/go-chi/cors v1.2.2 + github.com/go-co-op/gocron/v2 v2.19.1 github.com/go-ldap/ldap/v3 v3.4.12 - github.com/go-sql-driver/mysql v1.9.3 - github.com/golang-jwt/jwt/v5 v5.3.0 + github.com/golang-jwt/jwt/v5 v5.3.1 github.com/golang-migrate/migrate/v4 v4.19.1 - github.com/google/gops v0.3.28 - github.com/gorilla/handlers v1.5.2 - github.com/gorilla/mux v1.8.1 + github.com/google/gops v0.3.29 github.com/gorilla/sessions v1.4.0 - github.com/influxdata/line-protocol/v2 v2.2.1 github.com/jmoiron/sqlx v1.4.0 github.com/joho/godotenv v1.5.1 - github.com/linkedin/goavro/v2 v2.14.1 - github.com/mattn/go-sqlite3 v1.14.32 - github.com/nats-io/nats.go v1.47.0 - github.com/prometheus/client_golang v1.23.2 - github.com/prometheus/common v0.67.4 + github.com/mattn/go-sqlite3 v1.14.34 + github.com/parquet-go/parquet-go v0.28.0 github.com/qustavo/sqlhooks/v2 v2.1.0 github.com/santhosh-tekuri/jsonschema/v5 v5.3.1 github.com/stretchr/testify v1.11.1 github.com/swaggo/http-swagger v1.3.4 github.com/swaggo/swag v1.16.6 - github.com/vektah/gqlparser/v2 v2.5.31 - golang.org/x/crypto v0.45.0 - golang.org/x/oauth2 v0.32.0 + github.com/vektah/gqlparser/v2 v2.5.32 + golang.org/x/crypto v0.48.0 + golang.org/x/oauth2 v0.35.0 golang.org/x/time v0.14.0 ) require ( - filippo.io/edwards25519 v1.1.0 // indirect - github.com/Azure/go-ntlmssp v0.0.0-20221128193559-754e69321358 // indirect + github.com/Azure/go-ntlmssp v0.1.0 // indirect github.com/KyleBanks/depth v1.2.1 // indirect github.com/agnivade/levenshtein v1.2.1 // indirect - github.com/aws/aws-sdk-go-v2/aws/protocol/eventstream 
v1.7.3 // indirect - github.com/aws/aws-sdk-go-v2/feature/ec2/imds v1.18.13 // indirect - github.com/aws/aws-sdk-go-v2/internal/configsources v1.4.13 // indirect - github.com/aws/aws-sdk-go-v2/internal/endpoints/v2 v2.7.13 // indirect + github.com/andybalholm/brotli v1.2.0 // indirect + github.com/apapsch/go-jsonmerge/v2 v2.0.0 // indirect + github.com/aws/aws-sdk-go-v2/aws/protocol/eventstream v1.7.5 // indirect + github.com/aws/aws-sdk-go-v2/feature/ec2/imds v1.18.18 // indirect + github.com/aws/aws-sdk-go-v2/internal/configsources v1.4.18 // indirect + github.com/aws/aws-sdk-go-v2/internal/endpoints/v2 v2.7.18 // indirect github.com/aws/aws-sdk-go-v2/internal/ini v1.8.4 // indirect - github.com/aws/aws-sdk-go-v2/internal/v4a v1.4.13 // indirect - github.com/aws/aws-sdk-go-v2/service/internal/accept-encoding v1.13.3 // indirect - github.com/aws/aws-sdk-go-v2/service/internal/checksum v1.9.4 // indirect - github.com/aws/aws-sdk-go-v2/service/internal/presigned-url v1.13.13 // indirect - github.com/aws/aws-sdk-go-v2/service/internal/s3shared v1.19.13 // indirect - github.com/aws/aws-sdk-go-v2/service/sso v1.30.3 // indirect - github.com/aws/aws-sdk-go-v2/service/ssooidc v1.35.7 // indirect - github.com/aws/aws-sdk-go-v2/service/sts v1.40.2 // indirect - github.com/aws/smithy-go v1.24.0 // indirect - github.com/beorn7/perks v1.0.1 // indirect - github.com/cespare/xxhash/v2 v2.3.0 // indirect + github.com/aws/aws-sdk-go-v2/internal/v4a v1.4.18 // indirect + github.com/aws/aws-sdk-go-v2/service/internal/accept-encoding v1.13.5 // indirect + github.com/aws/aws-sdk-go-v2/service/internal/checksum v1.9.10 // indirect + github.com/aws/aws-sdk-go-v2/service/internal/presigned-url v1.13.18 // indirect + github.com/aws/aws-sdk-go-v2/service/internal/s3shared v1.19.18 // indirect + github.com/aws/aws-sdk-go-v2/service/signin v1.0.6 // indirect + github.com/aws/aws-sdk-go-v2/service/sso v1.30.11 // indirect + github.com/aws/aws-sdk-go-v2/service/ssooidc v1.35.15 // indirect + 
github.com/aws/aws-sdk-go-v2/service/sts v1.41.7 // indirect + github.com/aws/smithy-go v1.24.2 // indirect github.com/cpuguy83/go-md2man/v2 v2.0.7 // indirect github.com/davecgh/go-spew v1.1.2-0.20180830191138-d8f796af33cc // indirect - github.com/felixge/httpsnoop v1.0.4 // indirect github.com/fsnotify/fsnotify v1.9.0 // indirect github.com/go-asn1-ber/asn1-ber v1.5.8-0.20250403174932-29230038a667 // indirect github.com/go-jose/go-jose/v4 v4.1.3 // indirect - github.com/go-openapi/jsonpointer v0.22.3 // indirect - github.com/go-openapi/jsonreference v0.21.3 // indirect - github.com/go-openapi/spec v0.22.1 // indirect - github.com/go-openapi/swag/conv v0.25.4 // indirect - github.com/go-openapi/swag/jsonname v0.25.4 // indirect - github.com/go-openapi/swag/jsonutils v0.25.4 // indirect - github.com/go-openapi/swag/loading v0.25.4 // indirect - github.com/go-openapi/swag/stringutils v0.25.4 // indirect - github.com/go-openapi/swag/typeutils v0.25.4 // indirect - github.com/go-openapi/swag/yamlutils v0.25.4 // indirect - github.com/go-viper/mapstructure/v2 v2.4.0 // indirect - github.com/goccy/go-yaml v1.19.0 // indirect - github.com/golang/snappy v0.0.4 // indirect + github.com/go-openapi/jsonpointer v0.22.5 // indirect + github.com/go-openapi/jsonreference v0.21.5 // indirect + github.com/go-openapi/spec v0.22.4 // indirect + github.com/go-openapi/swag/conv v0.25.5 // indirect + github.com/go-openapi/swag/jsonname v0.25.5 // indirect + github.com/go-openapi/swag/jsonutils v0.25.5 // indirect + github.com/go-openapi/swag/loading v0.25.5 // indirect + github.com/go-openapi/swag/stringutils v0.25.5 // indirect + github.com/go-openapi/swag/typeutils v0.25.5 // indirect + github.com/go-openapi/swag/yamlutils v0.25.5 // indirect + github.com/go-viper/mapstructure/v2 v2.5.0 // indirect + github.com/goccy/go-yaml v1.19.2 // indirect github.com/google/uuid v1.6.0 // indirect github.com/gorilla/securecookie v1.1.2 // indirect github.com/gorilla/websocket v1.5.3 // indirect 
github.com/hashicorp/golang-lru/v2 v2.0.7 // indirect + github.com/influxdata/influxdb-client-go/v2 v2.14.0 // indirect + github.com/influxdata/line-protocol v0.0.0-20210922203350-b1ad95c89adf // indirect github.com/jonboulle/clockwork v0.5.0 // indirect - github.com/jpillora/backoff v1.0.0 // indirect - github.com/json-iterator/go v1.1.12 // indirect - github.com/klauspost/compress v1.18.1 // indirect + github.com/klauspost/compress v1.18.4 // indirect + github.com/kr/pretty v0.3.1 // indirect github.com/lann/builder v0.0.0-20180802200727-47ae307949d0 // indirect github.com/lann/ps v0.0.0-20150810152359-62de8c46ede0 // indirect - github.com/modern-go/concurrent v0.0.0-20180306012644-bacd9c7ef1dd // indirect - github.com/modern-go/reflect2 v1.0.2 // indirect - github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822 // indirect - github.com/mwitkow/go-conntrack v0.0.0-20190716064945-2f068394615f // indirect - github.com/nats-io/nkeys v0.4.11 // indirect + github.com/nats-io/nats.go v1.49.0 // indirect + github.com/nats-io/nkeys v0.4.15 // indirect github.com/nats-io/nuid v1.0.1 // indirect + github.com/oapi-codegen/runtime v1.2.0 // indirect + github.com/parquet-go/bitpack v1.0.0 // indirect + github.com/parquet-go/jsonlite v1.4.0 // indirect + github.com/pierrec/lz4/v4 v4.1.25 // indirect github.com/pmezard/go-difflib v1.0.1-0.20181226105442-5d4384ee4fb2 // indirect - github.com/prometheus/client_model v0.6.2 // indirect - github.com/prometheus/procfs v0.16.1 // indirect github.com/robfig/cron/v3 v3.0.1 // indirect + github.com/rogpeppe/go-internal v1.10.0 // indirect github.com/russross/blackfriday/v2 v2.1.0 // indirect - github.com/sosodev/duration v1.3.1 // indirect + github.com/sosodev/duration v1.4.0 // indirect + github.com/stmcginnis/gofish v0.21.4 // indirect github.com/stretchr/objx v0.5.2 // indirect github.com/swaggo/files v1.0.1 // indirect + github.com/twpayne/go-geom v1.6.1 // indirect github.com/urfave/cli/v2 v2.27.7 // indirect - 
github.com/urfave/cli/v3 v3.6.1 // indirect + github.com/urfave/cli/v3 v3.6.2 // indirect github.com/xrash/smetrics v0.0.0-20250705151800-55b8f293f342 // indirect go.yaml.in/yaml/v2 v2.4.3 // indirect go.yaml.in/yaml/v3 v3.0.4 // indirect - golang.org/x/mod v0.30.0 // indirect - golang.org/x/net v0.47.0 // indirect - golang.org/x/sync v0.18.0 // indirect - golang.org/x/sys v0.38.0 // indirect - golang.org/x/text v0.31.0 // indirect - golang.org/x/tools v0.39.0 // indirect - google.golang.org/protobuf v1.36.10 // indirect + golang.org/x/mod v0.33.0 // indirect + golang.org/x/net v0.51.0 // indirect + golang.org/x/sync v0.19.0 // indirect + golang.org/x/sys v0.41.0 // indirect + golang.org/x/text v0.34.0 // indirect + golang.org/x/tools v0.42.0 // indirect + google.golang.org/protobuf v1.36.11 // indirect gopkg.in/yaml.v3 v3.0.1 // indirect sigs.k8s.io/yaml v1.6.0 // indirect ) diff --git a/go.sum b/go.sum index e8630b7c..812e4b83 100644 --- a/go.sum +++ b/go.sum @@ -1,81 +1,89 @@ filippo.io/edwards25519 v1.1.0 h1:FNf4tywRC1HmFuKW5xopWpigGjJKiJSV0Cqo0cJWDaA= filippo.io/edwards25519 v1.1.0/go.mod h1:BxyFTGdWcka3PhytdK4V28tE5sGfRvvvRV7EaN4VDT4= -github.com/99designs/gqlgen v0.17.84 h1:iVMdiStgUVx/BFkMb0J5GAXlqfqtQ7bqMCYK6v52kQ0= -github.com/99designs/gqlgen v0.17.84/go.mod h1:qjoUqzTeiejdo+bwUg8unqSpeYG42XrcrQboGIezmFA= -github.com/Azure/go-ansiterm v0.0.0-20230124172434-306776ec8161 h1:L/gRVlceqvL25UVaW/CKtUDjefjrs0SPonmDGUVOYP0= -github.com/Azure/go-ansiterm v0.0.0-20230124172434-306776ec8161/go.mod h1:xomTg63KZ2rFqZQzSB4Vz2SUXa1BpHTVz9L5PTmPC4E= -github.com/Azure/go-ntlmssp v0.0.0-20221128193559-754e69321358 h1:mFRzDkZVAjdal+s7s0MwaRv9igoPqLRdzOLzw/8Xvq8= -github.com/Azure/go-ntlmssp v0.0.0-20221128193559-754e69321358/go.mod h1:chxPXzSsl7ZWRAuOIE23GDNzjWuZquvFlgA8xmpunjU= -github.com/ClusterCockpit/cc-lib v1.0.0 h1:/8DFRomt4BpVWKWrsEZ/ru4K8x76QTVnEgdwHc5eSps= -github.com/ClusterCockpit/cc-lib v1.0.0/go.mod h1:UGdOvXEnjFqlnPSxtvtFwO6BtXYW6NnXFoud9FtN93k= 
+github.com/99designs/gqlgen v0.17.87 h1:pSnCIMhBQezAE8bc1GNmfdLXFmnWtWl1GRDFEE/nHP8= +github.com/99designs/gqlgen v0.17.87/go.mod h1:fK05f1RqSNfQpd4CfW5qk/810Tqi4/56Wf6Nem0khAg= +github.com/Azure/go-ntlmssp v0.1.0 h1:DjFo6YtWzNqNvQdrwEyr/e4nhU3vRiwenz5QX7sFz+A= +github.com/Azure/go-ntlmssp v0.1.0/go.mod h1:NYqdhxd/8aAct/s4qSYZEerdPuH1liG2/X9DiVTbhpk= +github.com/ClusterCockpit/cc-lib/v2 v2.8.0 h1:ROduRzRuusi+6kLB991AAu3Pp2AHOasQJFJc7JU/n/E= +github.com/ClusterCockpit/cc-lib/v2 v2.8.0/go.mod h1:FwD8vnTIbBM3ngeLNKmCvp9FoSjQZm7xnuaVxEKR23o= +github.com/ClusterCockpit/cc-line-protocol/v2 v2.4.0 h1:hIzxgTBWcmCIHtoDKDkSCsKCOCOwUC34sFsbD2wcW0Q= +github.com/ClusterCockpit/cc-line-protocol/v2 v2.4.0/go.mod h1:y42qUu+YFmu5fdNuUAS4VbbIKxVjxCvbVqFdpdh8ahY= +github.com/DATA-DOG/go-sqlmock v1.5.2 h1:OcvFkGmslmlZibjAjaHm3L//6LiuBgolP7OputlJIzU= +github.com/DATA-DOG/go-sqlmock v1.5.2/go.mod h1:88MAG/4G7SMwSE3CeA0ZKzrT5CiOU3OJ+JlNzwDqpNU= github.com/KyleBanks/depth v1.2.1 h1:5h8fQADFrWtarTdtDudMmGsC7GPbOAu6RVB3ffsVFHc= github.com/KyleBanks/depth v1.2.1/go.mod h1:jzSb9d0L43HxTQfT+oSA1EEp2q+ne2uh6XgeJcm8brE= github.com/Masterminds/squirrel v1.5.4 h1:uUcX/aBc8O7Fg9kaISIUsHXdKuqehiXAMQTYX8afzqM= github.com/Masterminds/squirrel v1.5.4/go.mod h1:NNaOrjSoIDfDA40n7sr2tPNZRfjzjA400rg+riTZj10= -github.com/Microsoft/go-winio v0.6.2 h1:F2VQgta7ecxGYO8k3ZZz3RS8fVIXVxONVUPlNERoyfY= -github.com/Microsoft/go-winio v0.6.2/go.mod h1:yd8OoFMLzJbo9gZq8j5qaps8bJ9aShtEA8Ipt1oGCvU= github.com/NVIDIA/go-nvml v0.13.0-1 h1:OLX8Jq3dONuPOQPC7rndB6+iDmDakw0XTYgzMxObkEw= github.com/NVIDIA/go-nvml v0.13.0-1/go.mod h1:+KNA7c7gIBH7SKSJ1ntlwkfN80zdx8ovl4hrK3LmPt4= github.com/PuerkitoBio/goquery v1.11.0 h1:jZ7pwMQXIITcUXNH83LLk+txlaEy6NVOfTuP43xxfqw= github.com/PuerkitoBio/goquery v1.11.0/go.mod h1:wQHgxUOU3JGuj3oD/QFfxUdlzW6xPHfqyHre6VMY4DQ= +github.com/RaveNoX/go-jsoncommentstrip v1.0.0/go.mod h1:78ihd09MekBnJnxpICcwzCMzGrKSKYe4AqU6PDYYpjk= github.com/agnivade/levenshtein v1.2.1 
h1:EHBY3UOn1gwdy/VbFwgo4cxecRznFk7fKWN1KOX7eoM= github.com/agnivade/levenshtein v1.2.1/go.mod h1:QVVI16kDrtSuwcpd0p1+xMC6Z/VfhtCyDIjcwga4/DU= +github.com/alecthomas/assert/v2 v2.10.0 h1:jjRCHsj6hBJhkmhznrCzoNpbA3zqy0fYiUcYZP/GkPY= +github.com/alecthomas/assert/v2 v2.10.0/go.mod h1:Bze95FyfUr7x34QZrjL+XP+0qgp/zg8yS+TtBj1WA3k= +github.com/alecthomas/repr v0.4.0 h1:GhI2A8MACjfegCPVq9f1FLvIBS+DrQ2KQBFZP1iFzXc= +github.com/alecthomas/repr v0.4.0/go.mod h1:Fr0507jx4eOXV7AlPV6AVZLYrLIuIeSOWtW57eE/O/4= github.com/alexbrainman/sspi v0.0.0-20250919150558-7d374ff0d59e h1:4dAU9FXIyQktpoUAgOJK3OTFc/xug0PCXYCqU0FgDKI= github.com/alexbrainman/sspi v0.0.0-20250919150558-7d374ff0d59e/go.mod h1:cEWa1LVoE5KvSD9ONXsZrj0z6KqySlCCNKHlLzbqAt4= github.com/andreyvit/diff v0.0.0-20170406064948-c7f18ee00883 h1:bvNMNQO63//z+xNgfBlViaCIJKLlCJ6/fmUseuG0wVQ= github.com/andreyvit/diff v0.0.0-20170406064948-c7f18ee00883/go.mod h1:rCTlJbsFo29Kk6CurOXKm700vrz8f0KW0JNfpkRJY/8= +github.com/andybalholm/brotli v1.2.0 h1:ukwgCxwYrmACq68yiUqwIWnGY0cTPox/M94sVwToPjQ= +github.com/andybalholm/brotli v1.2.0/go.mod h1:rzTDkvFWvIrjDXZHkuS16NPggd91W3kUSvPlQ1pLaKY= github.com/andybalholm/cascadia v1.3.3 h1:AG2YHrzJIm4BZ19iwJ/DAua6Btl3IwJX+VI4kktS1LM= github.com/andybalholm/cascadia v1.3.3/go.mod h1:xNd9bqTn98Ln4DwST8/nG+H0yuB8Hmgu1YHNnWw0GeA= +github.com/antithesishq/antithesis-sdk-go v0.5.0-default-no-op h1:Ucf+QxEKMbPogRO5guBNe5cgd9uZgfoJLOYs8WWhtjM= +github.com/antithesishq/antithesis-sdk-go v0.5.0-default-no-op/go.mod h1:IUpT2DPAKh6i/YhSbt6Gl3v2yvUZjmKncl7U91fup7E= github.com/apapsch/go-jsonmerge/v2 v2.0.0 h1:axGnT1gRIfimI7gJifB699GoE/oq+F2MU7Dml6nw9rQ= github.com/apapsch/go-jsonmerge/v2 v2.0.0/go.mod h1:lvDnEdqiQrp0O42VQGgmlKpxL1AP2+08jFMw88y4klk= github.com/arbovm/levenshtein v0.0.0-20160628152529-48b4e1c0c4d0 h1:jfIu9sQUG6Ig+0+Ap1h4unLjW6YQJpKZVmUzxsD4E/Q= github.com/arbovm/levenshtein v0.0.0-20160628152529-48b4e1c0c4d0/go.mod h1:t2tdKJDJF9BV14lnkjHmOQgcvEKgtqs5a1N3LNdJhGE= -github.com/aws/aws-sdk-go-v2 
v1.41.0 h1:tNvqh1s+v0vFYdA1xq0aOJH+Y5cRyZ5upu6roPgPKd4= -github.com/aws/aws-sdk-go-v2 v1.41.0/go.mod h1:MayyLB8y+buD9hZqkCW3kX1AKq07Y5pXxtgB+rRFhz0= -github.com/aws/aws-sdk-go-v2/aws/protocol/eventstream v1.7.3 h1:DHctwEM8P8iTXFxC/QK0MRjwEpWQeM9yzidCRjldUz0= -github.com/aws/aws-sdk-go-v2/aws/protocol/eventstream v1.7.3/go.mod h1:xdCzcZEtnSTKVDOmUZs4l/j3pSV6rpo1WXl5ugNsL8Y= -github.com/aws/aws-sdk-go-v2/config v1.31.20 h1:/jWF4Wu90EhKCgjTdy1DGxcbcbNrjfBHvksEL79tfQc= -github.com/aws/aws-sdk-go-v2/config v1.31.20/go.mod h1:95Hh1Tc5VYKL9NJ7tAkDcqeKt+MCXQB1hQZaRdJIZE0= -github.com/aws/aws-sdk-go-v2/credentials v1.18.24 h1:iJ2FmPT35EaIB0+kMa6TnQ+PwG5A1prEdAw+PsMzfHg= -github.com/aws/aws-sdk-go-v2/credentials v1.18.24/go.mod h1:U91+DrfjAiXPDEGYhh/x29o4p0qHX5HDqG7y5VViv64= -github.com/aws/aws-sdk-go-v2/feature/ec2/imds v1.18.13 h1:T1brd5dR3/fzNFAQch/iBKeX07/ffu/cLu+q+RuzEWk= -github.com/aws/aws-sdk-go-v2/feature/ec2/imds v1.18.13/go.mod h1:Peg/GBAQ6JDt+RoBf4meB1wylmAipb7Kg2ZFakZTlwk= -github.com/aws/aws-sdk-go-v2/internal/configsources v1.4.13 h1:a+8/MLcWlIxo1lF9xaGt3J/u3yOZx+CdSveSNwjhD40= -github.com/aws/aws-sdk-go-v2/internal/configsources v1.4.13/go.mod h1:oGnKwIYZ4XttyU2JWxFrwvhF6YKiK/9/wmE3v3Iu9K8= -github.com/aws/aws-sdk-go-v2/internal/endpoints/v2 v2.7.13 h1:HBSI2kDkMdWz4ZM7FjwE7e/pWDEZ+nR95x8Ztet1ooY= -github.com/aws/aws-sdk-go-v2/internal/endpoints/v2 v2.7.13/go.mod h1:YE94ZoDArI7awZqJzBAZ3PDD2zSfuP7w6P2knOzIn8M= +github.com/aws/aws-sdk-go-v2 v1.41.2 h1:LuT2rzqNQsauaGkPK/7813XxcZ3o3yePY0Iy891T2ls= +github.com/aws/aws-sdk-go-v2 v1.41.2/go.mod h1:IvvlAZQXvTXznUPfRVfryiG1fbzE2NGK6m9u39YQ+S4= +github.com/aws/aws-sdk-go-v2/aws/protocol/eventstream v1.7.5 h1:zWFmPmgw4sveAYi1mRqG+E/g0461cJ5M4bJ8/nc6d3Q= +github.com/aws/aws-sdk-go-v2/aws/protocol/eventstream v1.7.5/go.mod h1:nVUlMLVV8ycXSb7mSkcNu9e3v/1TJq2RTlrPwhYWr5c= +github.com/aws/aws-sdk-go-v2/config v1.32.10 h1:9DMthfO6XWZYLfzZglAgW5Fyou2nRI5CuV44sTedKBI= +github.com/aws/aws-sdk-go-v2/config v1.32.10/go.mod 
h1:2rUIOnA2JaiqYmSKYmRJlcMWy6qTj1vuRFscppSBMcw= +github.com/aws/aws-sdk-go-v2/credentials v1.19.10 h1:EEhmEUFCE1Yhl7vDhNOI5OCL/iKMdkkYFTRpZXNw7m8= +github.com/aws/aws-sdk-go-v2/credentials v1.19.10/go.mod h1:RnnlFCAlxQCkN2Q379B67USkBMu1PipEEiibzYN5UTE= +github.com/aws/aws-sdk-go-v2/feature/ec2/imds v1.18.18 h1:Ii4s+Sq3yDfaMLpjrJsqD6SmG/Wq/P5L/hw2qa78UAY= +github.com/aws/aws-sdk-go-v2/feature/ec2/imds v1.18.18/go.mod h1:6x81qnY++ovptLE6nWQeWrpXxbnlIex+4H4eYYGcqfc= +github.com/aws/aws-sdk-go-v2/internal/configsources v1.4.18 h1:F43zk1vemYIqPAwhjTjYIz0irU2EY7sOb/F5eJ3HuyM= +github.com/aws/aws-sdk-go-v2/internal/configsources v1.4.18/go.mod h1:w1jdlZXrGKaJcNoL+Nnrj+k5wlpGXqnNrKoP22HvAug= +github.com/aws/aws-sdk-go-v2/internal/endpoints/v2 v2.7.18 h1:xCeWVjj0ki0l3nruoyP2slHsGArMxeiiaoPN5QZH6YQ= +github.com/aws/aws-sdk-go-v2/internal/endpoints/v2 v2.7.18/go.mod h1:r/eLGuGCBw6l36ZRWiw6PaZwPXb6YOj+i/7MizNl5/k= github.com/aws/aws-sdk-go-v2/internal/ini v1.8.4 h1:WKuaxf++XKWlHWu9ECbMlha8WOEGm0OUEZqm4K/Gcfk= github.com/aws/aws-sdk-go-v2/internal/ini v1.8.4/go.mod h1:ZWy7j6v1vWGmPReu0iSGvRiise4YI5SkR3OHKTZ6Wuc= -github.com/aws/aws-sdk-go-v2/internal/v4a v1.4.13 h1:eg/WYAa12vqTphzIdWMzqYRVKKnCboVPRlvaybNCqPA= -github.com/aws/aws-sdk-go-v2/internal/v4a v1.4.13/go.mod h1:/FDdxWhz1486obGrKKC1HONd7krpk38LBt+dutLcN9k= -github.com/aws/aws-sdk-go-v2/service/internal/accept-encoding v1.13.3 h1:x2Ibm/Af8Fi+BH+Hsn9TXGdT+hKbDd5XOTZxTMxDk7o= -github.com/aws/aws-sdk-go-v2/service/internal/accept-encoding v1.13.3/go.mod h1:IW1jwyrQgMdhisceG8fQLmQIydcT/jWY21rFhzgaKwo= -github.com/aws/aws-sdk-go-v2/service/internal/checksum v1.9.4 h1:NvMjwvv8hpGUILarKw7Z4Q0w1H9anXKsesMxtw++MA4= -github.com/aws/aws-sdk-go-v2/service/internal/checksum v1.9.4/go.mod h1:455WPHSwaGj2waRSpQp7TsnpOnBfw8iDfPfbwl7KPJE= -github.com/aws/aws-sdk-go-v2/service/internal/presigned-url v1.13.13 h1:kDqdFvMY4AtKoACfzIGD8A0+hbT41KTKF//gq7jITfM= -github.com/aws/aws-sdk-go-v2/service/internal/presigned-url v1.13.13/go.mod 
h1:lmKuogqSU3HzQCwZ9ZtcqOc5XGMqtDK7OIc2+DxiUEg= -github.com/aws/aws-sdk-go-v2/service/internal/s3shared v1.19.13 h1:zhBJXdhWIFZ1acfDYIhu4+LCzdUS2Vbcum7D01dXlHQ= -github.com/aws/aws-sdk-go-v2/service/internal/s3shared v1.19.13/go.mod h1:JaaOeCE368qn2Hzi3sEzY6FgAZVCIYcC2nwbro2QCh8= -github.com/aws/aws-sdk-go-v2/service/s3 v1.90.2 h1:DhdbtDl4FdNlj31+xiRXANxEE+eC7n8JQz+/ilwQ8Uc= -github.com/aws/aws-sdk-go-v2/service/s3 v1.90.2/go.mod h1:+wArOOrcHUevqdto9k1tKOF5++YTe9JEcPSc9Tx2ZSw= -github.com/aws/aws-sdk-go-v2/service/sso v1.30.3 h1:NjShtS1t8r5LUfFVtFeI8xLAHQNTa7UI0VawXlrBMFQ= -github.com/aws/aws-sdk-go-v2/service/sso v1.30.3/go.mod h1:fKvyjJcz63iL/ftA6RaM8sRCtN4r4zl4tjL3qw5ec7k= -github.com/aws/aws-sdk-go-v2/service/ssooidc v1.35.7 h1:gTsnx0xXNQ6SBbymoDvcoRHL+q4l/dAFsQuKfDWSaGc= -github.com/aws/aws-sdk-go-v2/service/ssooidc v1.35.7/go.mod h1:klO+ejMvYsB4QATfEOIXk8WAEwN4N0aBfJpvC+5SZBo= -github.com/aws/aws-sdk-go-v2/service/sts v1.40.2 h1:HK5ON3KmQV2HcAunnx4sKLB9aPf3gKGwVAf7xnx0QT0= -github.com/aws/aws-sdk-go-v2/service/sts v1.40.2/go.mod h1:E19xDjpzPZC7LS2knI9E6BaRFDK43Eul7vd6rSq2HWk= -github.com/aws/smithy-go v1.24.0 h1:LpilSUItNPFr1eY85RYgTIg5eIEPtvFbskaFcmmIUnk= -github.com/aws/smithy-go v1.24.0/go.mod h1:LEj2LM3rBRQJxPZTB4KuzZkaZYnZPnvgIhb4pu07mx0= +github.com/aws/aws-sdk-go-v2/internal/v4a v1.4.18 h1:eZioDaZGJ0tMM4gzmkNIO2aAoQd+je7Ug7TkvAzlmkU= +github.com/aws/aws-sdk-go-v2/internal/v4a v1.4.18/go.mod h1:CCXwUKAJdoWr6/NcxZ+zsiPr6oH/Q5aTooRGYieAyj4= +github.com/aws/aws-sdk-go-v2/service/internal/accept-encoding v1.13.5 h1:CeY9LUdur+Dxoeldqoun6y4WtJ3RQtzk0JMP2gfUay0= +github.com/aws/aws-sdk-go-v2/service/internal/accept-encoding v1.13.5/go.mod h1:AZLZf2fMaahW5s/wMRciu1sYbdsikT/UHwbUjOdEVTc= +github.com/aws/aws-sdk-go-v2/service/internal/checksum v1.9.10 h1:fJvQ5mIBVfKtiyx0AHY6HeWcRX5LGANLpq8SVR+Uazs= +github.com/aws/aws-sdk-go-v2/service/internal/checksum v1.9.10/go.mod h1:Kzm5e6OmNH8VMkgK9t+ry5jEih4Y8whqs+1hrkxim1I= 
+github.com/aws/aws-sdk-go-v2/service/internal/presigned-url v1.13.18 h1:LTRCYFlnnKFlKsyIQxKhJuDuA3ZkrDQMRYm6rXiHlLY= +github.com/aws/aws-sdk-go-v2/service/internal/presigned-url v1.13.18/go.mod h1:XhwkgGG6bHSd00nO/mexWTcTjgd6PjuvWQMqSn2UaEk= +github.com/aws/aws-sdk-go-v2/service/internal/s3shared v1.19.18 h1:/A/xDuZAVD2BpsS2fftFRo/NoEKQJ8YTnJDEHBy2Gtg= +github.com/aws/aws-sdk-go-v2/service/internal/s3shared v1.19.18/go.mod h1:hWe9b4f+djUQGmyiGEeOnZv69dtMSgpDRIvNMvuvzvY= +github.com/aws/aws-sdk-go-v2/service/s3 v1.96.2 h1:M1A9AjcFwlxTLuf0Faj88L8Iqw0n/AJHjpZTQzMMsSc= +github.com/aws/aws-sdk-go-v2/service/s3 v1.96.2/go.mod h1:KsdTV6Q9WKUZm2mNJnUFmIoXfZux91M3sr/a4REX8e0= +github.com/aws/aws-sdk-go-v2/service/signin v1.0.6 h1:MzORe+J94I+hYu2a6XmV5yC9huoTv8NRcCrUNedDypQ= +github.com/aws/aws-sdk-go-v2/service/signin v1.0.6/go.mod h1:hXzcHLARD7GeWnifd8j9RWqtfIgxj4/cAtIVIK7hg8g= +github.com/aws/aws-sdk-go-v2/service/sso v1.30.11 h1:7oGD8KPfBOJGXiCoRKrrrQkbvCp8N++u36hrLMPey6o= +github.com/aws/aws-sdk-go-v2/service/sso v1.30.11/go.mod h1:0DO9B5EUJQlIDif+XJRWCljZRKsAFKh3gpFz7UnDtOo= +github.com/aws/aws-sdk-go-v2/service/ssooidc v1.35.15 h1:edCcNp9eGIUDUCrzoCu1jWAXLGFIizeqkdkKgRlJwWc= +github.com/aws/aws-sdk-go-v2/service/ssooidc v1.35.15/go.mod h1:lyRQKED9xWfgkYC/wmmYfv7iVIM68Z5OQ88ZdcV1QbU= +github.com/aws/aws-sdk-go-v2/service/sts v1.41.7 h1:NITQpgo9A5NrDZ57uOWj+abvXSb83BbyggcUBVksN7c= +github.com/aws/aws-sdk-go-v2/service/sts v1.41.7/go.mod h1:sks5UWBhEuWYDPdwlnRFn1w7xWdH29Jcpe+/PJQefEs= +github.com/aws/smithy-go v1.24.2 h1:FzA3bu/nt/vDvmnkg+R8Xl46gmzEDam6mZ1hzmwXFng= +github.com/aws/smithy-go v1.24.2/go.mod h1:YE2RhdIuDbA5E5bTdciG9KrW3+TiEONeUWCqxX9i1Fc= github.com/beorn7/perks v1.0.1 h1:VlbKKnNfV8bJzeqoa4cOKqO6bYr3WgKZxO8Z16+hsOM= github.com/beorn7/perks v1.0.1/go.mod h1:G2ZrVWU2WbWT9wwq4/hrbKbnv/1ERSJQ0ibhJ6rlkpw= +github.com/bmatcuk/doublestar v1.1.1/go.mod h1:UD6OnuiIn0yFxxA2le/rnRU1G4RaI4UvFv1sNto9p6w= github.com/cespare/xxhash/v2 v2.3.0 
h1:UL815xU9SqsFlibzuggzjXhog7bL6oX9BbNZnL2UFvs= github.com/cespare/xxhash/v2 v2.3.0/go.mod h1:VGX0DQ3Q6kWi7AoAeZDth3/j3BFtOZR5XLFGgcrjCOs= -github.com/containerd/errdefs v1.0.0 h1:tg5yIfIlQIrxYtu9ajqY42W3lpS19XqdxRQeEwYG8PI= -github.com/containerd/errdefs v1.0.0/go.mod h1:+YBYIdtsnF4Iw6nWZhJcqGSg/dwvV7tyJ/kCkyJ2k+M= -github.com/containerd/errdefs/pkg v0.3.0 h1:9IKJ06FvyNlexW690DXuQNx2KA2cUJXx151Xdx3ZPPE= -github.com/containerd/errdefs/pkg v0.3.0/go.mod h1:NJw6s9HwNuRhnjJhM7pylWwMyAkmCQvQ4GpJHEqRLVk= -github.com/coreos/go-oidc/v3 v3.16.0 h1:qRQUCFstKpXwmEjDQTIbyY/5jF00+asXzSkmkoa/mow= -github.com/coreos/go-oidc/v3 v3.16.0/go.mod h1:wqPbKFrVnE90vty060SB40FCJ8fTHTxSwyXJqZH+sI8= +github.com/coreos/go-oidc/v3 v3.17.0 h1:hWBGaQfbi0iVviX4ibC7bk8OKT5qNr4klBaCHVNvehc= +github.com/coreos/go-oidc/v3 v3.17.0/go.mod h1:wqPbKFrVnE90vty060SB40FCJ8fTHTxSwyXJqZH+sI8= github.com/cpuguy83/go-md2man/v2 v2.0.7 h1:zbFlGlXEAKlwXpmvle3d8Oe3YnkKIK4xSRTd3sHPnBo= github.com/cpuguy83/go-md2man/v2 v2.0.7/go.mod h1:oOW0eioCTA6cOiMLiUPZOpcVxMig6NIQQ7OS05n1F4g= github.com/creack/pty v1.1.9/go.mod h1:oKZEueFk5CKHvIhNR5MUki03XCEU+Q6VDXinZuGJ33E= @@ -85,95 +93,72 @@ github.com/davecgh/go-spew v1.1.2-0.20180830191138-d8f796af33cc h1:U9qPSI2PIWSS1 github.com/davecgh/go-spew v1.1.2-0.20180830191138-d8f796af33cc/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= github.com/dgryski/trifles v0.0.0-20230903005119-f50d829f2e54 h1:SG7nF6SRlWhcT7cNTs5R6Hk4V2lcmLz2NsG2VnInyNo= github.com/dgryski/trifles v0.0.0-20230903005119-f50d829f2e54/go.mod h1:if7Fbed8SFyPtHLHbg49SI7NAdJiC5WIA09pe59rfAA= -github.com/dhui/dktest v0.4.6 h1:+DPKyScKSEp3VLtbMDHcUq6V5Lm5zfZZVb0Sk7Ahom4= -github.com/dhui/dktest v0.4.6/go.mod h1:JHTSYDtKkvFNFHJKqCzVzqXecyv+tKt8EzceOmQOgbU= -github.com/distribution/reference v0.6.0 h1:0IXCQ5g4/QMHHkarYzh5l+u8T3t73zM5QvfrDyIgxBk= -github.com/distribution/reference v0.6.0/go.mod h1:BbU0aIcezP1/5jX/8MP0YiH4SdvB5Y4f/wlDRiLyi3E= -github.com/docker/docker v28.3.3+incompatible 
h1:Dypm25kh4rmk49v1eiVbsAtpAsYURjYkaKubwuBdxEI= -github.com/docker/docker v28.3.3+incompatible/go.mod h1:eEKB0N0r5NX/I1kEveEz05bcu8tLC/8azJZsviup8Sk= -github.com/docker/go-connections v0.5.0 h1:USnMq7hx7gwdVZq1L49hLXaFtUdTADjXGp+uj1Br63c= -github.com/docker/go-connections v0.5.0/go.mod h1:ov60Kzw0kKElRwhNs9UlUHAE/F9Fe6GLaXnqyDdmEXc= -github.com/docker/go-units v0.5.0 h1:69rxXcBk27SvSaaxTtLh/8llcHD8vYHT7WSdRZ/jvr4= -github.com/docker/go-units v0.5.0/go.mod h1:fgPhTUdO+D/Jk86RDLlptpiXQzgHJF7gydDDbaIK4Dk= -github.com/expr-lang/expr v1.17.6 h1:1h6i8ONk9cexhDmowO/A64VPxHScu7qfSl2k8OlINec= -github.com/expr-lang/expr v1.17.6/go.mod h1:8/vRC7+7HBzESEqt5kKpYXxrxkr31SaO8r40VO/1IT4= -github.com/felixge/httpsnoop v1.0.4 h1:NFTV2Zj1bL4mc9sqWACXbQFVBBg2W3GPvqp8/ESS2Wg= -github.com/felixge/httpsnoop v1.0.4/go.mod h1:m8KPJKqk1gH5J9DgRY2ASl2lWCfGKXixSwevea8zH2U= -github.com/frankban/quicktest v1.11.0/go.mod h1:K+q6oSqb0W0Ininfk863uOk1lMy69l/P6txr3mVT54s= -github.com/frankban/quicktest v1.11.2/go.mod h1:K+q6oSqb0W0Ininfk863uOk1lMy69l/P6txr3mVT54s= +github.com/expr-lang/expr v1.17.8 h1:W1loDTT+0PQf5YteHSTpju2qfUfNoBt4yw9+wOEU9VM= +github.com/expr-lang/expr v1.17.8/go.mod h1:8/vRC7+7HBzESEqt5kKpYXxrxkr31SaO8r40VO/1IT4= github.com/frankban/quicktest v1.13.0 h1:yNZif1OkDfNoDfb9zZa9aXIpejNR4F23Wely0c+Qdqk= github.com/frankban/quicktest v1.13.0/go.mod h1:qLE0fzW0VuyUAJgPU19zByoIr0HtCHN/r/VLSOOIySU= github.com/fsnotify/fsnotify v1.9.0 h1:2Ml+OJNzbYCTzsxtv8vKSFD9PbJjmhYF14k/jKC7S9k= github.com/fsnotify/fsnotify v1.9.0/go.mod h1:8jBTzvmWwFyi3Pb8djgCCO5IBqzKJ/Jwo8TRcHyHii0= github.com/go-asn1-ber/asn1-ber v1.5.8-0.20250403174932-29230038a667 h1:BP4M0CvQ4S3TGls2FvczZtj5Re/2ZzkV9VwqPHH/3Bo= github.com/go-asn1-ber/asn1-ber v1.5.8-0.20250403174932-29230038a667/go.mod h1:hEBeB/ic+5LoWskz+yKT7vGhhPYkProFKoKdwZRWMe0= -github.com/go-co-op/gocron/v2 v2.18.2 h1:+5VU41FUXPWSPKLXZQ/77SGzUiPCcakU0v7ENc2H20Q= -github.com/go-co-op/gocron/v2 v2.18.2/go.mod h1:Zii6he+Zfgy5W9B+JKk/KwejFOW0kZTFvHtwIpR4aBI= 
+github.com/go-chi/chi/v5 v5.2.5 h1:Eg4myHZBjyvJmAFjFvWgrqDTXFyOzjj7YIm3L3mu6Ug= +github.com/go-chi/chi/v5 v5.2.5/go.mod h1:X7Gx4mteadT3eDOMTsXzmI4/rwUpOwBHLpAfupzFJP0= +github.com/go-chi/cors v1.2.2 h1:Jmey33TE+b+rB7fT8MUy1u0I4L+NARQlK6LhzKPSyQE= +github.com/go-chi/cors v1.2.2/go.mod h1:sSbTewc+6wYHBBCW7ytsFSn836hqM7JxpglAy2Vzc58= +github.com/go-co-op/gocron/v2 v2.19.1 h1:B4iLeA0NB/2iO3EKQ7NfKn5KsQgZfjb2fkvoZJU3yBI= +github.com/go-co-op/gocron/v2 v2.19.1/go.mod h1:5lEiCKk1oVJV39Zg7/YG10OnaVrDAV5GGR6O0663k6U= github.com/go-jose/go-jose/v4 v4.1.3 h1:CVLmWDhDVRa6Mi/IgCgaopNosCaHz7zrMeF9MlZRkrs= github.com/go-jose/go-jose/v4 v4.1.3/go.mod h1:x4oUasVrzR7071A4TnHLGSPpNOm2a21K9Kf04k1rs08= github.com/go-ldap/ldap/v3 v3.4.12 h1:1b81mv7MagXZ7+1r7cLTWmyuTqVqdwbtJSjC0DAp9s4= github.com/go-ldap/ldap/v3 v3.4.12/go.mod h1:+SPAGcTtOfmGsCb3h1RFiq4xpp4N636G75OEace8lNo= -github.com/go-logr/logr v1.4.3 h1:CjnDlHq8ikf6E492q6eKboGOC0T8CDaOvkHCIg8idEI= -github.com/go-logr/logr v1.4.3/go.mod h1:9T104GzyrTigFIr8wt5mBrctHMim0Nb2HLGrmQ40KvY= -github.com/go-logr/stdr v1.2.2 h1:hSWxHoqTgW2S2qGc0LTAI563KZ5YKYRhT3MFKZMbjag= -github.com/go-logr/stdr v1.2.2/go.mod h1:mMo/vtBO5dYbehREoey6XUKy/eSumjCCveDpRre4VKE= -github.com/go-openapi/jsonpointer v0.22.3 h1:dKMwfV4fmt6Ah90zloTbUKWMD+0he+12XYAsPotrkn8= -github.com/go-openapi/jsonpointer v0.22.3/go.mod h1:0lBbqeRsQ5lIanv3LHZBrmRGHLHcQoOXQnf88fHlGWo= -github.com/go-openapi/jsonreference v0.21.3 h1:96Dn+MRPa0nYAR8DR1E03SblB5FJvh7W6krPI0Z7qMc= -github.com/go-openapi/jsonreference v0.21.3/go.mod h1:RqkUP0MrLf37HqxZxrIAtTWW4ZJIK1VzduhXYBEeGc4= -github.com/go-openapi/spec v0.22.1 h1:beZMa5AVQzRspNjvhe5aG1/XyBSMeX1eEOs7dMoXh/k= -github.com/go-openapi/spec v0.22.1/go.mod h1:c7aeIQT175dVowfp7FeCvXXnjN/MrpaONStibD2WtDA= +github.com/go-openapi/jsonpointer v0.22.5 h1:8on/0Yp4uTb9f4XvTrM2+1CPrV05QPZXu+rvu2o9jcA= +github.com/go-openapi/jsonpointer v0.22.5/go.mod h1:gyUR3sCvGSWchA2sUBJGluYMbe1zazrYWIkWPjjMUY0= +github.com/go-openapi/jsonreference v0.21.5 
h1:6uCGVXU/aNF13AQNggxfysJ+5ZcU4nEAe+pJyVWRdiE= +github.com/go-openapi/jsonreference v0.21.5/go.mod h1:u25Bw85sX4E2jzFodh1FOKMTZLcfifd1Q+iKKOUxExw= +github.com/go-openapi/spec v0.22.4 h1:4pxGjipMKu0FzFiu/DPwN3CTBRlVM2yLf/YTWorYfDQ= +github.com/go-openapi/spec v0.22.4/go.mod h1:WQ6Ai0VPWMZgMT4XySjlRIE6GP1bGQOtEThn3gcWLtQ= github.com/go-openapi/swag v0.19.15 h1:D2NRCBzS9/pEY3gP9Nl8aDqGUcPFrwG2p+CNFrLyrCM= -github.com/go-openapi/swag/conv v0.25.4 h1:/Dd7p0LZXczgUcC/Ikm1+YqVzkEeCc9LnOWjfkpkfe4= -github.com/go-openapi/swag/conv v0.25.4/go.mod h1:3LXfie/lwoAv0NHoEuY1hjoFAYkvlqI/Bn5EQDD3PPU= -github.com/go-openapi/swag/jsonname v0.25.4 h1:bZH0+MsS03MbnwBXYhuTttMOqk+5KcQ9869Vye1bNHI= -github.com/go-openapi/swag/jsonname v0.25.4/go.mod h1:GPVEk9CWVhNvWhZgrnvRA6utbAltopbKwDu8mXNUMag= -github.com/go-openapi/swag/jsonutils v0.25.4 h1:VSchfbGhD4UTf4vCdR2F4TLBdLwHyUDTd1/q4i+jGZA= -github.com/go-openapi/swag/jsonutils v0.25.4/go.mod h1:7OYGXpvVFPn4PpaSdPHJBtF0iGnbEaTk8AvBkoWnaAY= -github.com/go-openapi/swag/jsonutils/fixtures_test v0.25.4 h1:IACsSvBhiNJwlDix7wq39SS2Fh7lUOCJRmx/4SN4sVo= -github.com/go-openapi/swag/jsonutils/fixtures_test v0.25.4/go.mod h1:Mt0Ost9l3cUzVv4OEZG+WSeoHwjWLnarzMePNDAOBiM= -github.com/go-openapi/swag/loading v0.25.4 h1:jN4MvLj0X6yhCDduRsxDDw1aHe+ZWoLjW+9ZQWIKn2s= -github.com/go-openapi/swag/loading v0.25.4/go.mod h1:rpUM1ZiyEP9+mNLIQUdMiD7dCETXvkkC30z53i+ftTE= -github.com/go-openapi/swag/stringutils v0.25.4 h1:O6dU1Rd8bej4HPA3/CLPciNBBDwZj9HiEpdVsb8B5A8= -github.com/go-openapi/swag/stringutils v0.25.4/go.mod h1:GTsRvhJW5xM5gkgiFe0fV3PUlFm0dr8vki6/VSRaZK0= -github.com/go-openapi/swag/typeutils v0.25.4 h1:1/fbZOUN472NTc39zpa+YGHn3jzHWhv42wAJSN91wRw= -github.com/go-openapi/swag/typeutils v0.25.4/go.mod h1:Ou7g//Wx8tTLS9vG0UmzfCsjZjKhpjxayRKTHXf2pTE= -github.com/go-openapi/swag/yamlutils v0.25.4 h1:6jdaeSItEUb7ioS9lFoCZ65Cne1/RZtPBZ9A56h92Sw= -github.com/go-openapi/swag/yamlutils v0.25.4/go.mod h1:MNzq1ulQu+yd8Kl7wPOut/YHAAU/H6hL91fF+E2RFwc= 
-github.com/go-openapi/testify/enable/yaml/v2 v2.0.2 h1:0+Y41Pz1NkbTHz8NngxTuAXxEodtNSI1WG1c/m5Akw4= -github.com/go-openapi/testify/enable/yaml/v2 v2.0.2/go.mod h1:kme83333GCtJQHXQ8UKX3IBZu6z8T5Dvy5+CW3NLUUg= -github.com/go-openapi/testify/v2 v2.0.2 h1:X999g3jeLcoY8qctY/c/Z8iBHTbwLz7R2WXd6Ub6wls= -github.com/go-openapi/testify/v2 v2.0.2/go.mod h1:HCPmvFFnheKK2BuwSA0TbbdxJ3I16pjwMkYkP4Ywn54= +github.com/go-openapi/swag/conv v0.25.5 h1:wAXBYEXJjoKwE5+vc9YHhpQOFj2JYBMF2DUi+tGu97g= +github.com/go-openapi/swag/conv v0.25.5/go.mod h1:CuJ1eWvh1c4ORKx7unQnFGyvBbNlRKbnRyAvDvzWA4k= +github.com/go-openapi/swag/jsonname v0.25.5 h1:8p150i44rv/Drip4vWI3kGi9+4W9TdI3US3uUYSFhSo= +github.com/go-openapi/swag/jsonname v0.25.5/go.mod h1:jNqqikyiAK56uS7n8sLkdaNY/uq6+D2m2LANat09pKU= +github.com/go-openapi/swag/jsonutils v0.25.5 h1:XUZF8awQr75MXeC+/iaw5usY/iM7nXPDwdG3Jbl9vYo= +github.com/go-openapi/swag/jsonutils v0.25.5/go.mod h1:48FXUaz8YsDAA9s5AnaUvAmry1UcLcNVWUjY42XkrN4= +github.com/go-openapi/swag/jsonutils/fixtures_test v0.25.5 h1:SX6sE4FrGb4sEnnxbFL/25yZBb5Hcg1inLeErd86Y1U= +github.com/go-openapi/swag/jsonutils/fixtures_test v0.25.5/go.mod h1:/2KvOTrKWjVA5Xli3DZWdMCZDzz3uV/T7bXwrKWPquo= +github.com/go-openapi/swag/loading v0.25.5 h1:odQ/umlIZ1ZVRteI6ckSrvP6e2w9UTF5qgNdemJHjuU= +github.com/go-openapi/swag/loading v0.25.5/go.mod h1:I8A8RaaQ4DApxhPSWLNYWh9NvmX2YKMoB9nwvv6oW6g= +github.com/go-openapi/swag/stringutils v0.25.5 h1:NVkoDOA8YBgtAR/zvCx5rhJKtZF3IzXcDdwOsYzrB6M= +github.com/go-openapi/swag/stringutils v0.25.5/go.mod h1:PKK8EZdu4QJq8iezt17HM8RXnLAzY7gW0O1KKarrZII= +github.com/go-openapi/swag/typeutils v0.25.5 h1:EFJ+PCga2HfHGdo8s8VJXEVbeXRCYwzzr9u4rJk7L7E= +github.com/go-openapi/swag/typeutils v0.25.5/go.mod h1:itmFmScAYE1bSD8C4rS0W+0InZUBrB2xSPbWt6DLGuc= +github.com/go-openapi/swag/yamlutils v0.25.5 h1:kASCIS+oIeoc55j28T4o8KwlV2S4ZLPT6G0iq2SSbVQ= +github.com/go-openapi/swag/yamlutils v0.25.5/go.mod h1:Gek1/SjjfbYvM+Iq4QGwa/2lEXde9n2j4a3wI3pNuOQ= 
+github.com/go-openapi/testify/enable/yaml/v2 v2.4.0 h1:7SgOMTvJkM8yWrQlU8Jm18VeDPuAvB/xWrdxFJkoFag= +github.com/go-openapi/testify/enable/yaml/v2 v2.4.0/go.mod h1:14iV8jyyQlinc9StD7w1xVPW3CO3q1Gj04Jy//Kw4VM= +github.com/go-openapi/testify/v2 v2.4.0 h1:8nsPrHVCWkQ4p8h1EsRVymA2XABB4OT40gcvAu+voFM= +github.com/go-openapi/testify/v2 v2.4.0/go.mod h1:HCPmvFFnheKK2BuwSA0TbbdxJ3I16pjwMkYkP4Ywn54= github.com/go-sql-driver/mysql v1.4.1/go.mod h1:zAC/RDZ24gD3HViQzih4MyKcchzm+sOG5ZlKdlhCg5w= +github.com/go-sql-driver/mysql v1.8.1 h1:LedoTUt/eveggdHS9qUFC1EFSa8bU2+1pZjSRpvNJ1Y= github.com/go-sql-driver/mysql v1.8.1/go.mod h1:wEBSXgmK//2ZFJyE+qWnIsVGmvmEKlqwuVSjsCm7DZg= -github.com/go-sql-driver/mysql v1.9.3 h1:U/N249h2WzJ3Ukj8SowVFjdtZKfu9vlLZxjPXV1aweo= -github.com/go-sql-driver/mysql v1.9.3/go.mod h1:qn46aNg1333BRMNU69Lq93t8du/dwxI64Gl8i5p1WMU= -github.com/go-viper/mapstructure/v2 v2.4.0 h1:EBsztssimR/CONLSZZ04E8qAkxNYq4Qp9LvH92wZUgs= -github.com/go-viper/mapstructure/v2 v2.4.0/go.mod h1:oJDH3BJKyqBA2TXFhDsKDGDTlndYOZ6rGS0BRZIxGhM= -github.com/goccy/go-yaml v1.19.0 h1:EmkZ9RIsX+Uq4DYFowegAuJo8+xdX3T/2dwNPXbxEYE= -github.com/goccy/go-yaml v1.19.0/go.mod h1:XBurs7gK8ATbW4ZPGKgcbrY1Br56PdM69F7LkFRi1kA= -github.com/gogo/protobuf v1.3.2 h1:Ov1cvc58UF3b5XjBnZv7+opcTcQFZebYjWzi34vdm4Q= -github.com/gogo/protobuf v1.3.2/go.mod h1:P1XiOD3dCwIKUDQYPy72D8LYyHL2YPYrpS2s69NZV8Q= -github.com/golang-jwt/jwt/v5 v5.3.0 h1:pv4AsKCKKZuqlgs5sUmn4x8UlGa0kEVt/puTpKx9vvo= -github.com/golang-jwt/jwt/v5 v5.3.0/go.mod h1:fxCRLWMO43lRc8nhHWY6LGqRcf+1gQWArsqaEUEa5bE= +github.com/go-viper/mapstructure/v2 v2.5.0 h1:vM5IJoUAy3d7zRSVtIwQgBj7BiWtMPfmPEgAXnvj1Ro= +github.com/go-viper/mapstructure/v2 v2.5.0/go.mod h1:oJDH3BJKyqBA2TXFhDsKDGDTlndYOZ6rGS0BRZIxGhM= +github.com/goccy/go-yaml v1.19.2 h1:PmFC1S6h8ljIz6gMRBopkjP1TVT7xuwrButHID66PoM= +github.com/goccy/go-yaml v1.19.2/go.mod h1:XBurs7gK8ATbW4ZPGKgcbrY1Br56PdM69F7LkFRi1kA= +github.com/golang-jwt/jwt/v5 v5.3.1 
h1:kYf81DTWFe7t+1VvL7eS+jKFVWaUnK9cB1qbwn63YCY= +github.com/golang-jwt/jwt/v5 v5.3.1/go.mod h1:fxCRLWMO43lRc8nhHWY6LGqRcf+1gQWArsqaEUEa5bE= github.com/golang-migrate/migrate/v4 v4.19.1 h1:OCyb44lFuQfYXYLx1SCxPZQGU7mcaZ7gH9yH4jSFbBA= github.com/golang-migrate/migrate/v4 v4.19.1/go.mod h1:CTcgfjxhaUtsLipnLoQRWCrjYXycRz/g5+RWDuYgPrE= -github.com/golang/snappy v0.0.1/go.mod h1:/XxbfmMg8lxefKM7IXC3fBNl/7bRcc72aCRzEWrmP2Q= -github.com/golang/snappy v0.0.4 h1:yAGX7huGHXlcLOEtBnF4w7FQwA26wojNCwOYAEhLjQM= -github.com/golang/snappy v0.0.4/go.mod h1:/XxbfmMg8lxefKM7IXC3fBNl/7bRcc72aCRzEWrmP2Q= -github.com/google/go-cmp v0.5.2/go.mod h1:v8dTdLbMG2kIc/vJvl+f65V22dbkXbowE6jgT/gNBxE= -github.com/google/go-cmp v0.5.5/go.mod h1:v8dTdLbMG2kIc/vJvl+f65V22dbkXbowE6jgT/gNBxE= github.com/google/go-cmp v0.7.0 h1:wk8382ETsv4JYUZwIsn6YpYiWiBsYLSJiTsyBybVuN8= github.com/google/go-cmp v0.7.0/go.mod h1:pXiqmnSA92OHEEa9HXL2W4E7lf9JzCmGVUdgjX3N/iU= -github.com/google/gofuzz v1.0.0/go.mod h1:dBl0BpW6vV/+mYPU4Po3pmUjxk6FQPldtuIdl/M65Eg= +github.com/google/go-tpm v0.9.7 h1:u89J4tUUeDTlH8xxC3CTW7OHZjbjKoHdQ9W7gCUhtxA= +github.com/google/go-tpm v0.9.7/go.mod h1:h9jEsEECg7gtLis0upRBQU+GhYVH6jMjrFxI8u6bVUY= github.com/google/gofuzz v1.2.0 h1:xRy4A+RhZaiKjJ1bPfwQ8sedCA+YS2YcCHW6ec7JMi0= github.com/google/gofuzz v1.2.0/go.mod h1:dBl0BpW6vV/+mYPU4Po3pmUjxk6FQPldtuIdl/M65Eg= -github.com/google/gops v0.3.28 h1:2Xr57tqKAmQYRAfG12E+yLcoa2Y42UJo2lOrUFL9ark= -github.com/google/gops v0.3.28/go.mod h1:6f6+Nl8LcHrzJwi8+p0ii+vmBFSlB4f8cOOkTJ7sk4c= +github.com/google/gops v0.3.29 h1:n98J2qSOK1NJvRjdLDcjgDryjpIBGhbaqph1mXKL0rY= +github.com/google/gops v0.3.29/go.mod h1:8N3jZftuPazvUwtYY/ncG4iPrjp15ysNKLfq+QQPiwc= github.com/google/uuid v1.6.0 h1:NIvaJDMOsjHA8n1jAhLSgzrAzy1Hgr+hNrb57e+94F0= github.com/google/uuid v1.6.0/go.mod h1:TIyPZe4MgqvfeYDBFedMoGGpEw/LqOeaOT+nhxU+yHo= -github.com/gorilla/handlers v1.5.2 h1:cLTUSsNkgcwhgRqvCNmdbRWG0A3N4F+M2nWKdScwyEE= -github.com/gorilla/handlers v1.5.2/go.mod 
h1:dX+xVpaxdSw+q0Qek8SSsl3dfMk3jNddUkMzo0GtH0w= github.com/gorilla/mux v1.8.1 h1:TuBL49tXwgrFYWhqrNgrUNEY92u81SPhu7sTdzQEiWY= github.com/gorilla/mux v1.8.1/go.mod h1:AKf9I4AEqPTmMytcMc0KkNouC66V3BtZ4qD5fmWSiMQ= github.com/gorilla/securecookie v1.1.2 h1:YCIWL56dvtr73r6715mJs5ZvhtnY73hBvEF8kXD8ePA= @@ -186,17 +171,14 @@ github.com/hashicorp/go-uuid v1.0.3 h1:2gKiV6YVmrJ1i2CKKa9obLvRieoRGviZFL26PcT/C github.com/hashicorp/go-uuid v1.0.3/go.mod h1:6SBZvOh/SIDV7/2o3Jml5SYk/TvGqwFJ/bN7x4byOro= github.com/hashicorp/golang-lru/v2 v2.0.7 h1:a+bsQ5rvGLjzHuww6tVxozPZFVghXaHOwFs4luLUK2k= github.com/hashicorp/golang-lru/v2 v2.0.7/go.mod h1:QeFd9opnmA6QUJc5vARoKUSoFhyfM2/ZepoAG6RGpeM= +github.com/hexops/gotextdiff v1.0.3 h1:gitA9+qJrrTCsiCl7+kh75nPqQt1cx4ZkudSTLoUqJM= +github.com/hexops/gotextdiff v1.0.3/go.mod h1:pSWU5MAI3yDq+fZBTazCSJysOMbxWL1BSow5/V2vxeg= github.com/influxdata/influxdb-client-go/v2 v2.14.0 h1:AjbBfJuq+QoaXNcrova8smSjwJdUHnwvfjMF71M1iI4= github.com/influxdata/influxdb-client-go/v2 v2.14.0/go.mod h1:Ahpm3QXKMJslpXl3IftVLVezreAUtBOTZssDrjZEFHI= github.com/influxdata/line-protocol v0.0.0-20210922203350-b1ad95c89adf h1:7JTmneyiNEwVBOHSjoMxiWAqB992atOeepeFYegn5RU= github.com/influxdata/line-protocol v0.0.0-20210922203350-b1ad95c89adf/go.mod h1:xaLFMmpvUxqXtVkUJfg9QmT88cDaCJ3ZKgdZ78oO8Qo= -github.com/influxdata/line-protocol-corpus v0.0.0-20210519164801-ca6fa5da0184/go.mod h1:03nmhxzZ7Xk2pdG+lmMd7mHDfeVOYFyhOgwO61qWU98= github.com/influxdata/line-protocol-corpus v0.0.0-20210922080147-aa28ccfb8937 h1:MHJNQ+p99hFATQm6ORoLmpUCF7ovjwEFshs/NHzAbig= github.com/influxdata/line-protocol-corpus v0.0.0-20210922080147-aa28ccfb8937/go.mod h1:BKR9c0uHSmRgM/se9JhFHtTT7JTO67X23MtKMHtZcpo= -github.com/influxdata/line-protocol/v2 v2.0.0-20210312151457-c52fdecb625a/go.mod h1:6+9Xt5Sq1rWx+glMgxhcg2c0DUaehK+5TDcPZ76GypY= -github.com/influxdata/line-protocol/v2 v2.1.0/go.mod h1:QKw43hdUBg3GTk2iC3iyCxksNj7PX9aUSeYOYE/ceHY= -github.com/influxdata/line-protocol/v2 v2.2.1 
h1:EAPkqJ9Km4uAxtMRgUubJyqAr6zgWM0dznKMLRauQRE= -github.com/influxdata/line-protocol/v2 v2.2.1/go.mod h1:DmB3Cnh+3oxmG6LOBIxce4oaL4CPj3OmMPgvauXh+tM= github.com/jcmturner/aescts/v2 v2.0.0 h1:9YKLH6ey7H4eDBXW8khjYslgyqG2xZikXP0EQFKrle8= github.com/jcmturner/aescts/v2 v2.0.0/go.mod h1:AiaICIRyfYg35RUkr8yESTqvSy7csK90qZ5xfvvsoNs= github.com/jcmturner/dnsutils/v2 v2.0.0 h1:lltnkeZGL0wILNvrNiVCR6Ro5PGU/SeBvVO/8c/iPbo= @@ -215,17 +197,11 @@ github.com/joho/godotenv v1.5.1 h1:7eLL/+HRGLY0ldzfGMeQkb7vMd0as4CfYvUVzLqw0N0= github.com/joho/godotenv v1.5.1/go.mod h1:f4LDr5Voq0i2e/R5DDNOoa2zzDfwtkZa6DnEwAbqwq4= github.com/jonboulle/clockwork v0.5.0 h1:Hyh9A8u51kptdkR+cqRpT1EebBwTn1oK9YfGYbdFz6I= github.com/jonboulle/clockwork v0.5.0/go.mod h1:3mZlmanh0g2NDKO5TWZVJAfofYk64M7XN3SzBPjZF60= -github.com/jpillora/backoff v1.0.0 h1:uvFg412JmmHBHw7iwprIxkPMI+sGQ4kzOWsMeHnm2EA= -github.com/jpillora/backoff v1.0.0/go.mod h1:J/6gKK9jxlEcS3zixgDgUAsiuZ7yrSoa/FX5e0EB2j4= -github.com/json-iterator/go v1.1.12 h1:PV8peI4a0ysnczrg+LtxykD8LfKY9ML6u2jnxaEnrnM= -github.com/json-iterator/go v1.1.12/go.mod h1:e30LSqwooZae/UwlEbR2852Gd8hjQvJoHmT4TnhNGBo= -github.com/klauspost/compress v1.18.1 h1:bcSGx7UbpBqMChDtsF28Lw6v/G94LPrrbMbdC3JH2co= -github.com/klauspost/compress v1.18.1/go.mod h1:ZQFFVG+MdnR0P+l6wpXgIL4NTtwiKIdBnrBd8Nrxr+0= -github.com/kr/pretty v0.2.1/go.mod h1:ipq/a2n7PKx3OHsz4KJII5eveXtPO4qwEXGdVfWzfnI= +github.com/juju/gnuflag v0.0.0-20171113085948-2ce1bb71843d/go.mod h1:2PavIy+JPciBPrBUjwbNvtwB6RQlve+hkpll6QSNmOE= +github.com/klauspost/compress v1.18.4 h1:RPhnKRAQ4Fh8zU2FY/6ZFDwTVTxgJ/EMydqSTzE9a2c= +github.com/klauspost/compress v1.18.4/go.mod h1:R0h/fSBs8DE4ENlcrlib3PsXS61voFxhIs2DeRhCvJ4= github.com/kr/pretty v0.3.1 h1:flRD4NNwYAUpkphVc1HcthR4KEIFJ65n8Mw5qdRn3LE= github.com/kr/pretty v0.3.1/go.mod h1:hoEshYVHaxMs3cyo3Yncou5ZscifuDolrwPKZanG3xk= -github.com/kr/pty v1.1.1/go.mod h1:pFQYn66WHrOpPYNljwOMqo10TkYh1fy3cYio2l3bCsQ= -github.com/kr/text v0.1.0/go.mod 
h1:4Jbv+DJW3UT/LiOwJeYQe1efqtUx/iVham/4vfdArNI= github.com/kr/text v0.2.0 h1:5Nx0Ya0ZqY2ygV366QzturHI13Jq95ApcVaJBhpS+AY= github.com/kr/text v0.2.0/go.mod h1:eLer722TekiGuMkidMxC/pM04lWEeraHUUmBw8l2grE= github.com/lann/builder v0.0.0-20180802200727-47ae307949d0 h1:SOEGU9fKiNWd/HOJuq6+3iTQz8KNCLtVX6idSoTLdUw= @@ -235,43 +211,36 @@ github.com/lann/ps v0.0.0-20150810152359-62de8c46ede0/go.mod h1:vmVJ0l/dxyfGW6Fm github.com/lib/pq v1.2.0/go.mod h1:5WUZQaWbwv1U+lTReE5YruASi9Al49XbQIvNi/34Woo= github.com/lib/pq v1.10.9 h1:YXG7RB+JIjhP29X+OtkiDnYaXQwpS4JEWq7dtCCRUEw= github.com/lib/pq v1.10.9/go.mod h1:AlVN5x4E4T544tWzH6hKfbfQvm3HdbOxrmggDNAPY9o= -github.com/linkedin/goavro/v2 v2.14.1 h1:/8VjDpd38PRsy02JS0jflAu7JZPfJcGTwqWgMkFS2iI= -github.com/linkedin/goavro/v2 v2.14.1/go.mod h1:KXx+erlq+RPlGSPmLF7xGo6SAbh8sCQ53x064+ioxhk= github.com/mattn/go-sqlite3 v1.10.0/go.mod h1:FPy6KqzDD04eiIsT53CuJW3U88zkxoIYsOqkbpncsNc= github.com/mattn/go-sqlite3 v1.14.22/go.mod h1:Uh1q+B4BYcTPb+yiD3kU8Ct7aC0hY9fxUwlHK0RXw+Y= -github.com/mattn/go-sqlite3 v1.14.32 h1:JD12Ag3oLy1zQA+BNn74xRgaBbdhbNIDYvQUEuuErjs= -github.com/mattn/go-sqlite3 v1.14.32/go.mod h1:Uh1q+B4BYcTPb+yiD3kU8Ct7aC0hY9fxUwlHK0RXw+Y= -github.com/moby/docker-image-spec v1.3.1 h1:jMKff3w6PgbfSa69GfNg+zN/XLhfXJGnEx3Nl2EsFP0= -github.com/moby/docker-image-spec v1.3.1/go.mod h1:eKmb5VW8vQEh/BAr2yvVNvuiJuY6UIocYsFu/DxxRpo= -github.com/moby/term v0.5.0 h1:xt8Q1nalod/v7BqbG21f8mQPqH+xAaC9C3N3wfWbVP0= -github.com/moby/term v0.5.0/go.mod h1:8FzsFHVUBGZdbDsJw/ot+X+d5HLUbvklYLJ9uGfcI3Y= -github.com/modern-go/concurrent v0.0.0-20180228061459-e0a39a4cb421/go.mod h1:6dJC0mAP4ikYIbvyc7fijjWJddQyLn8Ig3JB5CqoB9Q= -github.com/modern-go/concurrent v0.0.0-20180306012644-bacd9c7ef1dd h1:TRLaZ9cD/w8PVh93nsPXa1VrQ6jlwL5oN8l14QlcNfg= -github.com/modern-go/concurrent v0.0.0-20180306012644-bacd9c7ef1dd/go.mod h1:6dJC0mAP4ikYIbvyc7fijjWJddQyLn8Ig3JB5CqoB9Q= -github.com/modern-go/reflect2 v1.0.2 h1:xBagoLtFs94CBntxluKeaWgTMpvLxC4ur3nMaC9Gz0M= 
-github.com/modern-go/reflect2 v1.0.2/go.mod h1:yWuevngMOJpCy52FWWMvUC8ws7m/LJsjYzDa0/r8luk= -github.com/morikuni/aec v1.0.0 h1:nP9CBfwrvYnBRgY6qfDQkygYDmYwOilePFkwzv4dU8A= -github.com/morikuni/aec v1.0.0/go.mod h1:BbKIizmSmc5MMPqRYbxO4ZU0S0+P200+tUnFx7PXmsc= +github.com/mattn/go-sqlite3 v1.14.34 h1:3NtcvcUnFBPsuRcno8pUtupspG/GM+9nZ88zgJcp6Zk= +github.com/mattn/go-sqlite3 v1.14.34/go.mod h1:Uh1q+B4BYcTPb+yiD3kU8Ct7aC0hY9fxUwlHK0RXw+Y= +github.com/minio/highwayhash v1.0.4-0.20251030100505-070ab1a87a76 h1:KGuD/pM2JpL9FAYvBrnBBeENKZNh6eNtjqytV6TYjnk= +github.com/minio/highwayhash v1.0.4-0.20251030100505-070ab1a87a76/go.mod h1:GGYsuwP/fPD6Y9hMiXuapVvlIUEhFhMTh0rxU3ik1LQ= github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822 h1:C3w9PqII01/Oq1c1nUAm88MOHcQC9l5mIlSMApZMrHA= github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822/go.mod h1:+n7T8mK8HuQTcFwEeznm/DIxMOiR9yIdICNftLE1DvQ= -github.com/mwitkow/go-conntrack v0.0.0-20190716064945-2f068394615f h1:KUppIJq7/+SVif2QVs3tOP0zanoHgBEVAwHxUSIzRqU= -github.com/mwitkow/go-conntrack v0.0.0-20190716064945-2f068394615f/go.mod h1:qRWi+5nqEBWmkhHvq77mSJWrCKwh8bxhgT7d/eI7P4U= -github.com/nats-io/nats.go v1.47.0 h1:YQdADw6J/UfGUd2Oy6tn4Hq6YHxCaJrVKayxxFqYrgM= -github.com/nats-io/nats.go v1.47.0/go.mod h1:iRWIPokVIFbVijxuMQq4y9ttaBTMe0SFdlZfMDd+33g= -github.com/nats-io/nkeys v0.4.11 h1:q44qGV008kYd9W1b1nEBkNzvnWxtRSQ7A8BoqRrcfa0= -github.com/nats-io/nkeys v0.4.11/go.mod h1:szDimtgmfOi9n25JpfIdGw12tZFYXqhGxjhVxsatHVE= +github.com/nats-io/jwt/v2 v2.8.0 h1:K7uzyz50+yGZDO5o772eRE7atlcSEENpL7P+b74JV1g= +github.com/nats-io/jwt/v2 v2.8.0/go.mod h1:me11pOkwObtcBNR8AiMrUbtVOUGkqYjMQZ6jnSdVUIA= +github.com/nats-io/nats-server/v2 v2.12.3 h1:KRv+1n7lddMVgkJPQer+pt36TcO0ENxjilBmeWdjcHs= +github.com/nats-io/nats-server/v2 v2.12.3/go.mod h1:MQXjG9WjyXKz9koWzUc3jYUMKD8x3CLmTNy91IQQz3Y= +github.com/nats-io/nats.go v1.49.0 h1:yh/WvY59gXqYpgl33ZI+XoVPKyut/IcEaqtsiuTJpoE= +github.com/nats-io/nats.go v1.49.0/go.mod 
h1:fDCn3mN5cY8HooHwE2ukiLb4p4G4ImmzvXyJt+tGwdw= +github.com/nats-io/nkeys v0.4.15 h1:JACV5jRVO9V856KOapQ7x+EY8Jo3qw1vJt/9Jpwzkk4= +github.com/nats-io/nkeys v0.4.15/go.mod h1:CpMchTXC9fxA5zrMo4KpySxNjiDVvr8ANOSZdiNfUrs= github.com/nats-io/nuid v1.0.1 h1:5iA8DT8V7q8WK2EScv2padNa/rTESc1KdnPw4TC2paw= github.com/nats-io/nuid v1.0.1/go.mod h1:19wcPz3Ph3q0Jbyiqsd0kePYG7A95tJPxeL+1OSON2c= -github.com/niemeyer/pretty v0.0.0-20200227124842-a10e7caefd8e/go.mod h1:zD1mROLANZcx1PVRCS0qkT7pwLkGfwJo4zjcN/Tysno= -github.com/oapi-codegen/runtime v1.1.1 h1:EXLHh0DXIJnWhdRPN2w4MXAzFyE4CskzhNLUmtpMYro= -github.com/oapi-codegen/runtime v1.1.1/go.mod h1:SK9X900oXmPWilYR5/WKPzt3Kqxn/uS/+lbpREv+eCg= -github.com/opencontainers/go-digest v1.0.0 h1:apOUWs51W5PlhuyGyz9FCeeBIOUDA/6nW8Oi/yOhh5U= -github.com/opencontainers/go-digest v1.0.0/go.mod h1:0JzlMkj0TRzQZfJkVvzbP0HBR3IKzErnv2BNG4W4MAM= -github.com/opencontainers/image-spec v1.1.0 h1:8SG7/vwALn54lVB/0yZ/MMwhFrPYtpEHQb2IpWsCzug= -github.com/opencontainers/image-spec v1.1.0/go.mod h1:W4s4sFTMaBeK1BQLXbG4AdM2szdn85PY75RI83NrTrM= +github.com/oapi-codegen/runtime v1.2.0 h1:RvKc1CVS1QeKSNzO97FBQbSMZyQ8s6rZd+LpmzwHMP4= +github.com/oapi-codegen/runtime v1.2.0/go.mod h1:Y7ZhmmlE8ikZOmuHRRndiIm7nf3xcVv+YMweKgG1DT0= github.com/opentracing/opentracing-go v1.1.0/go.mod h1:UkNAQd3GIcIGf0SeVgPpRdFStlNbqXla1AfSYxPUl2o= -github.com/pkg/errors v0.9.1 h1:FEBLx1zS214owpjy7qsBeixbURkuhQAwrK5UwLGTwt4= -github.com/pkg/errors v0.9.1/go.mod h1:bwawxfHBFNV+L2hUp1rHADufV3IMtnDRdf1r5NINEl0= +github.com/parquet-go/bitpack v1.0.0 h1:AUqzlKzPPXf2bCdjfj4sTeacrUwsT7NlcYDMUQxPcQA= +github.com/parquet-go/bitpack v1.0.0/go.mod h1:XnVk9TH+O40eOOmvpAVZ7K2ocQFrQwysLMnc6M/8lgs= +github.com/parquet-go/jsonlite v1.4.0 h1:RTG7prqfO0HD5egejU8MUDBN8oToMj55cgSV1I0zNW4= +github.com/parquet-go/jsonlite v1.4.0/go.mod h1:nDjpkpL4EOtqs6NQugUsi0Rleq9sW/OtC1NnZEnxzF0= +github.com/parquet-go/parquet-go v0.28.0 h1:ECyksyv8T2pOrlLsN7aWJIoQakyk/HtxQ2lchgS4els= +github.com/parquet-go/parquet-go 
v0.28.0/go.mod h1:navtkAYr2LGoJVp141oXPlO/sxLvaOe3la2JEoD8+rg= +github.com/pierrec/lz4/v4 v4.1.25 h1:kocOqRffaIbU5djlIBr7Wh+cx82C0vtFb0fOurZHqD0= +github.com/pierrec/lz4/v4 v4.1.25/go.mod h1:EoQMVJgeeEOMsCqCzqFm2O0cJvljX2nGZjcRIPL34O4= +github.com/pkg/diff v0.0.0-20210226163009-20ebb0f2a09e/go.mod h1:pJLUxLENpZxwdsKMEsNbx1VGcRFpLqf3715MtcvvzbA= github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4= github.com/pmezard/go-difflib v1.0.1-0.20181226105442-5d4384ee4fb2 h1:Jamvg5psRIccs7FGNTlIRMkT8wgtp5eCXdBlqhYGL6U= github.com/pmezard/go-difflib v1.0.1-0.20181226105442-5d4384ee4fb2/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4= @@ -279,33 +248,34 @@ github.com/prometheus/client_golang v1.23.2 h1:Je96obch5RDVy3FDMndoUsjAhG5Edi49h github.com/prometheus/client_golang v1.23.2/go.mod h1:Tb1a6LWHB3/SPIzCoaDXI4I8UHKeFTEQ1YCr+0Gyqmg= github.com/prometheus/client_model v0.6.2 h1:oBsgwpGs7iVziMvrGhE53c/GrLUsZdHnqNwqPLxwZyk= github.com/prometheus/client_model v0.6.2/go.mod h1:y3m2F6Gdpfy6Ut/GBsUqTWZqCUvMVzSfMLjcu6wAwpE= -github.com/prometheus/common v0.67.4 h1:yR3NqWO1/UyO1w2PhUvXlGQs/PtFmoveVO0KZ4+Lvsc= -github.com/prometheus/common v0.67.4/go.mod h1:gP0fq6YjjNCLssJCQp0yk4M8W6ikLURwkdd/YKtTbyI= -github.com/prometheus/procfs v0.16.1 h1:hZ15bTNuirocR6u0JZ6BAHHmwS1p8B4P6MRqxtzMyRg= -github.com/prometheus/procfs v0.16.1/go.mod h1:teAbpZRB1iIAJYREa1LsoWUXykVXA1KlTmWl8x/U+Is= +github.com/prometheus/common v0.67.5 h1:pIgK94WWlQt1WLwAC5j2ynLaBRDiinoAb86HZHTUGI4= +github.com/prometheus/common v0.67.5/go.mod h1:SjE/0MzDEEAyrdr5Gqc6G+sXI67maCxzaT3A2+HqjUw= +github.com/prometheus/procfs v0.20.0 h1:AA7aCvjxwAquZAlonN7888f2u4IN8WVeFgBi4k82M4Q= +github.com/prometheus/procfs v0.20.0/go.mod h1:o9EMBZGRyvDrSPH1RqdxhojkuXstoe4UlK79eF5TGGo= github.com/qustavo/sqlhooks/v2 v2.1.0 h1:54yBemHnGHp/7xgT+pxwmIlMSDNYKx5JW5dfRAiCZi0= github.com/qustavo/sqlhooks/v2 v2.1.0/go.mod h1:aMREyKo7fOKTwiLuWPsaHRXEmtqG4yREztO0idF83AU= github.com/robfig/cron/v3 v3.0.1 
h1:WdRxkvbJztn8LMz/QEvLN5sBU+xKpSqwwUO1Pjr4qDs= github.com/robfig/cron/v3 v3.0.1/go.mod h1:eQICP3HwyT7UooqI/z+Ov+PtYAWygg1TEWWzGIFLtro= -github.com/rogpeppe/go-internal v1.14.1 h1:UQB4HGPB6osV0SQTLymcB4TgvyWu6ZyliaW0tI/otEQ= -github.com/rogpeppe/go-internal v1.14.1/go.mod h1:MaRKkUm5W0goXpeCfT7UZI6fk/L7L7so1lCWt35ZSgc= +github.com/rogpeppe/go-internal v1.9.0/go.mod h1:WtVeX8xhTBvf0smdhujwtBcq4Qrzq/fJaraNFVN+nFs= +github.com/rogpeppe/go-internal v1.10.0 h1:TMyTOH3F/DB16zRVcYyreMH6GnZZrwQVAoYjRBZyWFQ= +github.com/rogpeppe/go-internal v1.10.0/go.mod h1:UQnix2H7Ngw/k4C5ijL5+65zddjncjaFoBhdsK/akog= github.com/russross/blackfriday/v2 v2.1.0 h1:JIOH55/0cWyOuilr9/qlrm0BSXldqnqwMsf35Ld67mk= github.com/russross/blackfriday/v2 v2.1.0/go.mod h1:+Rmxgy9KzJVeS9/2gXHxylqXiyQDYRxCVz55jmeOWTM= github.com/santhosh-tekuri/jsonschema/v5 v5.3.1 h1:lZUw3E0/J3roVtGQ+SCrUrg3ON6NgVqpn3+iol9aGu4= github.com/santhosh-tekuri/jsonschema/v5 v5.3.1/go.mod h1:uToXkOrWAZ6/Oc07xWQrPOhJotwFIyu2bBVN41fcDUY= github.com/sergi/go-diff v1.3.1 h1:xkr+Oxo4BOQKmkn/B9eMK0g5Kg/983T9DqqPHwYqD+8= github.com/sergi/go-diff v1.3.1/go.mod h1:aMJSSKb2lpPvRNec0+w3fl7LP9IOFzdc9Pa4NFbPK1I= -github.com/sosodev/duration v1.3.1 h1:qtHBDMQ6lvMQsL15g4aopM4HEfOaYuhWBw3NPTtlqq4= -github.com/sosodev/duration v1.3.1/go.mod h1:RQIBBX0+fMLc/D9+Jb/fwvVmo0eZvDDEERAikUR6SDg= +github.com/sosodev/duration v1.4.0 h1:35ed0KiVFriGHHzZZJaZLgmTEEICIyt8Sx0RQfj9IjE= +github.com/sosodev/duration v1.4.0/go.mod h1:RQIBBX0+fMLc/D9+Jb/fwvVmo0eZvDDEERAikUR6SDg= +github.com/spkg/bom v0.0.0-20160624110644-59b7046e48ad/go.mod h1:qLr4V1qq6nMqFKkMo8ZTx3f+BZEkzsRUY10Xsm2mwU0= +github.com/stmcginnis/gofish v0.21.4 h1:daexK8sh31CgeSMkPUNs21HWHHA9ecCPJPyLCTxukCg= +github.com/stmcginnis/gofish v0.21.4/go.mod h1:PzF5i8ecRG9A2ol8XT64npKUunyraJ+7t0kYMpQAtqU= github.com/stretchr/objx v0.1.0/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME= -github.com/stretchr/objx v0.4.0/go.mod h1:YvHI0jy2hoMjB+UWwv71VJQ9isScKT/TqJzVSSt89Yw= github.com/stretchr/objx 
v0.5.2 h1:xuMeJ0Sdp5ZMRXx/aWO6RZxdr3beISkG5/G/aIRr3pY= github.com/stretchr/objx v0.5.2/go.mod h1:FRsXN1f5AsAjCGJKqEizvkpNtU+EGNCLh3NxZ/8L+MA= github.com/stretchr/testify v1.2.2/go.mod h1:a8OnRcib4nhh0OaRAV+Yts87kKdq0PP7pXfy6kDkUVs= github.com/stretchr/testify v1.3.0/go.mod h1:M5WIy9Dh21IEIfnGCwXGc5bZfKNJtfHm1UVUgZn+9EI= github.com/stretchr/testify v1.4.0/go.mod h1:j7eGeouHqKxXV5pUuKE4zz7dFj8WfuZ+81PSLYec5m4= -github.com/stretchr/testify v1.7.1/go.mod h1:6Fq8oRcR53rry900zMqJjRRixrwX3KX962/h/Wwjteg= -github.com/stretchr/testify v1.7.5/go.mod h1:yNjHg4UonilssWZ8iaSj1OCr/vHnekPRkoO+kdMU+MU= github.com/stretchr/testify v1.11.1 h1:7s2iGBzp5EwR7/aIZr8ao5+dra3wiQyKjjFuvgVKu7U= github.com/stretchr/testify v1.11.1/go.mod h1:wZwfW3scLgRK+23gO65QZefKpKQRnfz6sD981Nm4B6U= github.com/swaggo/files v1.0.1 h1:J1bVJ4XHZNq0I46UU90611i9/YzdrF7x92oX1ig5IdE= @@ -314,25 +284,19 @@ github.com/swaggo/http-swagger v1.3.4 h1:q7t/XLx0n15H1Q9/tk3Y9L4n210XzJF5WtnDX64 github.com/swaggo/http-swagger v1.3.4/go.mod h1:9dAh0unqMBAlbp1uE2Uc2mQTxNMU/ha4UbucIg1MFkQ= github.com/swaggo/swag v1.16.6 h1:qBNcx53ZaX+M5dxVyTrgQ0PJ/ACK+NzhwcbieTt+9yI= github.com/swaggo/swag v1.16.6/go.mod h1:ngP2etMK5a0P3QBizic5MEwpRmluJZPHjXcMoj4Xesg= +github.com/twpayne/go-geom v1.6.1 h1:iLE+Opv0Ihm/ABIcvQFGIiFBXd76oBIar9drAwHFhR4= +github.com/twpayne/go-geom v1.6.1/go.mod h1:Kr+Nly6BswFsKM5sd31YaoWS5PeDDH2NftJTK7Gd028= github.com/urfave/cli/v2 v2.27.7 h1:bH59vdhbjLv3LAvIu6gd0usJHgoTTPhCFib8qqOwXYU= github.com/urfave/cli/v2 v2.27.7/go.mod h1:CyNAG/xg+iAOg0N4MPGZqVmv2rCoP267496AOXUZjA4= -github.com/urfave/cli/v3 v3.6.1 h1:j8Qq8NyUawj/7rTYdBGrxcH7A/j7/G8Q5LhWEW4G3Mo= -github.com/urfave/cli/v3 v3.6.1/go.mod h1:ysVLtOEmg2tOy6PknnYVhDoouyC/6N42TMeoMzskhso= -github.com/vektah/gqlparser/v2 v2.5.31 h1:YhWGA1mfTjID7qJhd1+Vxhpk5HTgydrGU9IgkWBTJ7k= -github.com/vektah/gqlparser/v2 v2.5.31/go.mod h1:c1I28gSOVNzlfc4WuDlqU7voQnsqI6OG2amkBAFmgts= +github.com/urfave/cli/v3 v3.6.2 h1:lQuqiPrZ1cIz8hz+HcrG0TNZFxU70dPZ3Yl+pSrH9A8= 
+github.com/urfave/cli/v3 v3.6.2/go.mod h1:ysVLtOEmg2tOy6PknnYVhDoouyC/6N42TMeoMzskhso= +github.com/vektah/gqlparser/v2 v2.5.32 h1:k9QPJd4sEDTL+qB4ncPLflqTJ3MmjB9SrVzJrawpFSc= +github.com/vektah/gqlparser/v2 v2.5.32/go.mod h1:c1I28gSOVNzlfc4WuDlqU7voQnsqI6OG2amkBAFmgts= github.com/xrash/smetrics v0.0.0-20250705151800-55b8f293f342 h1:FnBeRrxr7OU4VvAzt5X7s6266i6cSVkkFPS0TuXWbIg= github.com/xrash/smetrics v0.0.0-20250705151800-55b8f293f342/go.mod h1:Ohn+xnUBiLI6FVj/9LpzZWtj1/D6lUovWYBkxHVV3aM= +github.com/xyproto/randomstring v1.0.5 h1:YtlWPoRdgMu3NZtP45drfy1GKoojuR7hmRcnhZqKjWU= +github.com/xyproto/randomstring v1.0.5/go.mod h1:rgmS5DeNXLivK7YprL0pY+lTuhNQW3iGxZ18UQApw/E= github.com/yuin/goldmark v1.4.13/go.mod h1:6yULJ656Px+3vBD8DxQVa3kxgyrAnzto9xy5taEt/CY= -go.opentelemetry.io/auto/sdk v1.1.0 h1:cH53jehLUN6UFLY71z+NDOiNJqDdPRaXzTel0sJySYA= -go.opentelemetry.io/auto/sdk v1.1.0/go.mod h1:3wSPjt5PWp2RhlCcmmOial7AvC4DQqZb7a7wCow3W8A= -go.opentelemetry.io/contrib/instrumentation/net/http/otelhttp v0.61.0 h1:F7Jx+6hwnZ41NSFTO5q4LYDtJRXBf2PD0rNBkeB/lus= -go.opentelemetry.io/contrib/instrumentation/net/http/otelhttp v0.61.0/go.mod h1:UHB22Z8QsdRDrnAtX4PntOl36ajSxcdUMt1sF7Y6E7Q= -go.opentelemetry.io/otel v1.37.0 h1:9zhNfelUvx0KBfu/gb+ZgeAfAgtWrfHJZcAqFC228wQ= -go.opentelemetry.io/otel v1.37.0/go.mod h1:ehE/umFRLnuLa/vSccNq9oS1ErUlkkK71gMcN34UG8I= -go.opentelemetry.io/otel/metric v1.37.0 h1:mvwbQS5m0tbmqML4NqK+e3aDiO02vsf/WgbsdpcPoZE= -go.opentelemetry.io/otel/metric v1.37.0/go.mod h1:04wGrZurHYKOc+RKeye86GwKiTb9FKm1WHtO+4EVr2E= -go.opentelemetry.io/otel/trace v1.37.0 h1:HLdcFNbRQBE2imdSEgm/kwqmQj1Or1l/7bW6mxVK7z4= -go.opentelemetry.io/otel/trace v1.37.0/go.mod h1:TlgrlQ+PtQO5XFerSPUYG0JSgGyryXewPGyayAWSBS0= go.uber.org/goleak v1.3.0 h1:2K3zAYmnTNqV73imy9J1T3WC+gmCePx2hEGkimedGto= go.uber.org/goleak v1.3.0/go.mod h1:CoHD4mav9JJNrW/WLlf7HGZPjdw8EucARQHekz1X6bE= go.yaml.in/yaml/v2 v2.4.3 h1:6gvOSjQoTB3vt1l+CU+tSyi/HOjfOjRLJ4YwYZGwRO0= @@ -341,33 +305,31 @@ go.yaml.in/yaml/v3 
v3.0.4 h1:tfq32ie2Jv2UxXFdLJdh3jXuOzWiL1fo0bu/FbuKpbc= go.yaml.in/yaml/v3 v3.0.4/go.mod h1:DhzuOOF2ATzADvBadXxruRBLzYTpT36CKvDb3+aBEFg= golang.org/x/crypto v0.0.0-20190308221718-c2843e01d9a2/go.mod h1:djNgcEr1/C05ACkg1iLfiJU5Ep61QUkGW8qpdssI0+w= golang.org/x/crypto v0.0.0-20210921155107-089bfa567519/go.mod h1:GvvjBRRGRdwPK5ydBHafDWAxML/pGHZbMvKqRZ5+Abc= -golang.org/x/crypto v0.45.0 h1:jMBrvKuj23MTlT0bQEOBcAE0mjg8mK9RXFhRH6nyF3Q= -golang.org/x/crypto v0.45.0/go.mod h1:XTGrrkGJve7CYK7J8PEww4aY7gM3qMCElcJQ8n8JdX4= -golang.org/x/exp v0.0.0-20250620022241-b7579e27df2b h1:M2rDM6z3Fhozi9O7NWsxAkg/yqS/lQJ6PmkyIV3YP+o= -golang.org/x/exp v0.0.0-20250620022241-b7579e27df2b/go.mod h1:3//PLf8L/X+8b4vuAfHzxeRUl04Adcb341+IGKfnqS8= +golang.org/x/crypto v0.48.0 h1:/VRzVqiRSggnhY7gNRxPauEQ5Drw9haKdM0jqfcCFts= +golang.org/x/crypto v0.48.0/go.mod h1:r0kV5h3qnFPlQnBSrULhlsRfryS2pmewsg+XfMgkVos= golang.org/x/mod v0.6.0-dev.0.20220419223038-86c51ed26bb4/go.mod h1:jJ57K6gSWd91VN4djpZkiMVwK6gcyfeH4XE8wZrZaV4= -golang.org/x/mod v0.30.0 h1:fDEXFVZ/fmCKProc/yAXXUijritrDzahmwwefnjoPFk= -golang.org/x/mod v0.30.0/go.mod h1:lAsf5O2EvJeSFMiBxXDki7sCgAxEUcZHXoXMKT4GJKc= +golang.org/x/mod v0.33.0 h1:tHFzIWbBifEmbwtGz65eaWyGiGZatSrT9prnU8DbVL8= +golang.org/x/mod v0.33.0/go.mod h1:swjeQEj+6r7fODbD2cqrnje9PnziFuw4bmLbBZFrQ5w= golang.org/x/net v0.0.0-20190620200207-3b0461eec859/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s= golang.org/x/net v0.0.0-20210226172049-e18ecbb05110/go.mod h1:m0MpNAwzfU5UDzcl9v0D8zg8gWTRqZa9RBIspLL5mdg= golang.org/x/net v0.0.0-20220722155237-a158d28d115b/go.mod h1:XRhObCWvk6IyKnWLug+ECip1KBveYUHfp+8e9klMJ9c= golang.org/x/net v0.7.0/go.mod h1:2Tu9+aMcznHK/AK1HMvgo6xiTLG5rD5rZLDS+rp2Bjs= -golang.org/x/net v0.47.0 h1:Mx+4dIFzqraBXUugkia1OOvlD6LemFo1ALMHjrXDOhY= -golang.org/x/net v0.47.0/go.mod h1:/jNxtkgq5yWUGYkaZGqo27cfGZ1c5Nen03aYrrKpVRU= -golang.org/x/oauth2 v0.32.0 h1:jsCblLleRMDrxMN29H3z/k1KliIvpLgCkE6R8FXXNgY= -golang.org/x/oauth2 v0.32.0/go.mod 
h1:lzm5WQJQwKZ3nwavOZ3IS5Aulzxi68dUSgRHujetwEA= +golang.org/x/net v0.51.0 h1:94R/GTO7mt3/4wIKpcR5gkGmRLOuE/2hNGeWq/GBIFo= +golang.org/x/net v0.51.0/go.mod h1:aamm+2QF5ogm02fjy5Bb7CQ0WMt1/WVM7FtyaTLlA9Y= +golang.org/x/oauth2 v0.35.0 h1:Mv2mzuHuZuY2+bkyWXIHMfhNdJAdwW3FuWeCPYN5GVQ= +golang.org/x/oauth2 v0.35.0/go.mod h1:lzm5WQJQwKZ3nwavOZ3IS5Aulzxi68dUSgRHujetwEA= golang.org/x/sync v0.0.0-20190423024810-112230192c58/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= golang.org/x/sync v0.0.0-20220722155255-886fb9371eb4/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= -golang.org/x/sync v0.18.0 h1:kr88TuHDroi+UVf+0hZnirlk8o8T+4MrK6mr60WkH/I= -golang.org/x/sync v0.18.0/go.mod h1:9KTHXmSnoGruLpwFjVSX0lNNA75CykiMECbovNTZqGI= +golang.org/x/sync v0.19.0 h1:vV+1eWNmZ5geRlYjzm2adRgW2/mcpevXNg50YZtPCE4= +golang.org/x/sync v0.19.0/go.mod h1:9KTHXmSnoGruLpwFjVSX0lNNA75CykiMECbovNTZqGI= golang.org/x/sys v0.0.0-20190215142949-d0b11bdaac8a/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY= golang.org/x/sys v0.0.0-20201119102817-f84b799fce68/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= golang.org/x/sys v0.0.0-20210615035016-665e8c7367d1/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= golang.org/x/sys v0.0.0-20220520151302-bc2c85ada10a/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= golang.org/x/sys v0.0.0-20220722155257-8c9f86f7a55f/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= golang.org/x/sys v0.5.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= -golang.org/x/sys v0.38.0 h1:3yZWxaJjBmCWXqhN1qh02AkOnCQ1poK6oF+a7xWL6Gc= -golang.org/x/sys v0.38.0/go.mod h1:OgkHotnGiDImocRcuBABYBEXf8A9a87e/uXjp9XT3ks= +golang.org/x/sys v0.41.0 h1:Ivj+2Cp/ylzLiEU89QhWblYnOE9zerudt9Ftecq2C6k= +golang.org/x/sys v0.41.0/go.mod h1:OgkHotnGiDImocRcuBABYBEXf8A9a87e/uXjp9XT3ks= golang.org/x/term v0.0.0-20201126162022-7de9c90e9dd1/go.mod h1:bj7SfCRtBDWHUb9snDiAeCFNEtKQo2Wmx5Cou7ajbmo= golang.org/x/term v0.0.0-20210927222741-03fcf44c2211/go.mod 
h1:jbD1KX2456YbFQfuXm/mYQcufACuNUgVhRMnK/tPxf8= golang.org/x/term v0.5.0/go.mod h1:jMB1sMXY+tzblOD4FWmEbocvup2/aLOaQEp7JmGp78k= @@ -375,26 +337,22 @@ golang.org/x/text v0.3.0/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ= golang.org/x/text v0.3.3/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ= golang.org/x/text v0.3.7/go.mod h1:u+2+/6zg+i71rQMx5EYifcz6MCKuco9NR6JIITiCfzQ= golang.org/x/text v0.7.0/go.mod h1:mrYo+phRRbMaCq/xk9113O4dZlRixOauAjOtrjsXDZ8= -golang.org/x/text v0.31.0 h1:aC8ghyu4JhP8VojJ2lEHBnochRno1sgL6nEi9WGFGMM= -golang.org/x/text v0.31.0/go.mod h1:tKRAlv61yKIjGGHX/4tP1LTbc13YSec1pxVEWXzfoeM= +golang.org/x/text v0.34.0 h1:oL/Qq0Kdaqxa1KbNeMKwQq0reLCCaFtqu2eNuSeNHbk= +golang.org/x/text v0.34.0/go.mod h1:homfLqTYRFyVYemLBFl5GgL/DWEiH5wcsQ5gSh1yziA= golang.org/x/time v0.14.0 h1:MRx4UaLrDotUKUdCIqzPC48t1Y9hANFKIRpNx+Te8PI= golang.org/x/time v0.14.0/go.mod h1:eL/Oa2bBBK0TkX57Fyni+NgnyQQN4LitPmob2Hjnqw4= golang.org/x/tools v0.0.0-20180917221912-90fa682c2a6e/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ= golang.org/x/tools v0.0.0-20191119224855-298f0cb1881e/go.mod h1:b+2E5dAYhXwXZwtnZ6UAqBI28+e2cm9otk0dWdXHAEo= golang.org/x/tools v0.1.12/go.mod h1:hNGJHUnrk76NpqgfD5Aqm5Crs+Hm0VOH/i9J2+nxYbc= -golang.org/x/tools v0.39.0 h1:ik4ho21kwuQln40uelmciQPp9SipgNDdrafrYA4TmQQ= -golang.org/x/tools v0.39.0/go.mod h1:JnefbkDPyD8UU2kI5fuf8ZX4/yUeh9W877ZeBONxUqQ= +golang.org/x/tools v0.42.0 h1:uNgphsn75Tdz5Ji2q36v/nsFSfR/9BRFvqhGBaJGd5k= +golang.org/x/tools v0.42.0/go.mod h1:Ma6lCIwGZvHK6XtgbswSoWroEkhugApmsXyrUmBhfr0= golang.org/x/xerrors v0.0.0-20190717185122-a985d3407aa7/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= -golang.org/x/xerrors v0.0.0-20191204190536-9bdfabe68543/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= -google.golang.org/protobuf v1.36.10 h1:AYd7cD/uASjIL6Q9LiTjz8JLcrh/88q5UObnmY3aOOE= -google.golang.org/protobuf v1.36.10/go.mod h1:HTf+CrKn2C3g5S8VImy6tdcUvCska2kB7j23XfzDpco= +google.golang.org/protobuf v1.36.11 
h1:fV6ZwhNocDyBLK0dj+fg8ektcVegBBuEolpbTQyBNVE= +google.golang.org/protobuf v1.36.11/go.mod h1:HTf+CrKn2C3g5S8VImy6tdcUvCska2kB7j23XfzDpco= gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0= -gopkg.in/check.v1 v1.0.0-20200227125254-8fa46927fb4f/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0= gopkg.in/check.v1 v1.0.0-20201130134442-10cb98267c6c h1:Hei/4ADfdWqJk1ZMxUNpqntNwaWcugrBjAiHlqqRiVk= gopkg.in/check.v1 v1.0.0-20201130134442-10cb98267c6c/go.mod h1:JHkPIbrfpd72SG/EVd6muEfDQjcINNoR0C8j2r3qZ4Q= gopkg.in/yaml.v2 v2.2.2/go.mod h1:hI93XBmqTisBFMUTm0b8Fm+jr3Dg1NNxqwp+5A1VGuI= -gopkg.in/yaml.v3 v3.0.0-20200313102051-9f266ea9e77c/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM= -gopkg.in/yaml.v3 v3.0.0-20200615113413-eeeca48fe776/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM= gopkg.in/yaml.v3 v3.0.1 h1:fxVm/GzAzEWqLHuvctI91KS9hhNmmWOoWu0XTYJS7CA= gopkg.in/yaml.v3 v3.0.1/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM= sigs.k8s.io/yaml v1.6.0 h1:G8fkbMSAFqgEFgh4b1wmtzDnioxFCUgTZhlbj5P9QYs= diff --git a/gqlgen.yml b/gqlgen.yml index 5f5272b4..40410b48 100644 --- a/gqlgen.yml +++ b/gqlgen.yml @@ -52,51 +52,51 @@ models: - github.com/99designs/gqlgen/graphql.Int64 - github.com/99designs/gqlgen/graphql.Int32 Job: - model: "github.com/ClusterCockpit/cc-lib/schema.Job" + model: "github.com/ClusterCockpit/cc-lib/v2/schema.Job" fields: tags: resolver: true metaData: resolver: true Cluster: - model: "github.com/ClusterCockpit/cc-lib/schema.Cluster" + model: "github.com/ClusterCockpit/cc-lib/v2/schema.Cluster" fields: partitions: resolver: true # Node: - # model: "github.com/ClusterCockpit/cc-lib/schema.Node" + # model: "github.com/ClusterCockpit/cc-lib/v2/schema.Node" # fields: # metaData: # resolver: true - NullableFloat: { model: "github.com/ClusterCockpit/cc-lib/schema.Float" } - MetricScope: { model: "github.com/ClusterCockpit/cc-lib/schema.MetricScope" } - MetricValue: { model: 
"github.com/ClusterCockpit/cc-lib/schema.MetricValue" } + NullableFloat: { model: "github.com/ClusterCockpit/cc-lib/v2/schema.Float" } + MetricScope: { model: "github.com/ClusterCockpit/cc-lib/v2/schema.MetricScope" } + MetricValue: { model: "github.com/ClusterCockpit/cc-lib/v2/schema.MetricValue" } JobStatistics: - { model: "github.com/ClusterCockpit/cc-lib/schema.JobStatistics" } + { model: "github.com/ClusterCockpit/cc-lib/v2/schema.JobStatistics" } GlobalMetricListItem: - { model: "github.com/ClusterCockpit/cc-lib/schema.GlobalMetricListItem" } + { model: "github.com/ClusterCockpit/cc-lib/v2/schema.GlobalMetricListItem" } ClusterSupport: - { model: "github.com/ClusterCockpit/cc-lib/schema.ClusterSupport" } - Tag: { model: "github.com/ClusterCockpit/cc-lib/schema.Tag" } - Resource: { model: "github.com/ClusterCockpit/cc-lib/schema.Resource" } - JobState: { model: "github.com/ClusterCockpit/cc-lib/schema.JobState" } - Node: { model: "github.com/ClusterCockpit/cc-lib/schema.Node" } + { model: "github.com/ClusterCockpit/cc-lib/v2/schema.ClusterSupport" } + Tag: { model: "github.com/ClusterCockpit/cc-lib/v2/schema.Tag" } + Resource: { model: "github.com/ClusterCockpit/cc-lib/v2/schema.Resource" } + JobState: { model: "github.com/ClusterCockpit/cc-lib/v2/schema.JobState" } + Node: { model: "github.com/ClusterCockpit/cc-lib/v2/schema.Node" } SchedulerState: - { model: "github.com/ClusterCockpit/cc-lib/schema.SchedulerState" } + { model: "github.com/ClusterCockpit/cc-lib/v2/schema.SchedulerState" } HealthState: - { model: "github.com/ClusterCockpit/cc-lib/schema.MonitoringState" } - JobMetric: { model: "github.com/ClusterCockpit/cc-lib/schema.JobMetric" } - Series: { model: "github.com/ClusterCockpit/cc-lib/schema.Series" } + { model: "github.com/ClusterCockpit/cc-lib/v2/schema.MonitoringState" } + JobMetric: { model: "github.com/ClusterCockpit/cc-lib/v2/schema.JobMetric" } + Series: { model: "github.com/ClusterCockpit/cc-lib/v2/schema.Series" } MetricStatistics: - { 
model: "github.com/ClusterCockpit/cc-lib/schema.MetricStatistics" } + { model: "github.com/ClusterCockpit/cc-lib/v2/schema.MetricStatistics" } MetricConfig: - { model: "github.com/ClusterCockpit/cc-lib/schema.MetricConfig" } + { model: "github.com/ClusterCockpit/cc-lib/v2/schema.MetricConfig" } SubClusterConfig: - { model: "github.com/ClusterCockpit/cc-lib/schema.SubClusterConfig" } - Accelerator: { model: "github.com/ClusterCockpit/cc-lib/schema.Accelerator" } - Topology: { model: "github.com/ClusterCockpit/cc-lib/schema.Topology" } + { model: "github.com/ClusterCockpit/cc-lib/v2/schema.SubClusterConfig" } + Accelerator: { model: "github.com/ClusterCockpit/cc-lib/v2/schema.Accelerator" } + Topology: { model: "github.com/ClusterCockpit/cc-lib/v2/schema.Topology" } FilterRanges: - { model: "github.com/ClusterCockpit/cc-lib/schema.FilterRanges" } - SubCluster: { model: "github.com/ClusterCockpit/cc-lib/schema.SubCluster" } - StatsSeries: { model: "github.com/ClusterCockpit/cc-lib/schema.StatsSeries" } - Unit: { model: "github.com/ClusterCockpit/cc-lib/schema.Unit" } + { model: "github.com/ClusterCockpit/cc-lib/v2/schema.FilterRanges" } + SubCluster: { model: "github.com/ClusterCockpit/cc-lib/v2/schema.SubCluster" } + StatsSeries: { model: "github.com/ClusterCockpit/cc-lib/v2/schema.StatsSeries" } + Unit: { model: "github.com/ClusterCockpit/cc-lib/v2/schema.Unit" } diff --git a/init/clustercockpit.service b/init/clustercockpit.service index 0a9448de..3c977e34 100644 --- a/init/clustercockpit.service +++ b/init/clustercockpit.service @@ -3,7 +3,7 @@ Description=ClusterCockpit Web Server Documentation=https://github.com/ClusterCockpit/cc-backend Wants=network-online.target After=network-online.target -After=mariadb.service mysql.service +# Database is file-based SQLite - no service dependency required [Service] WorkingDirectory=/opt/monitoring/cc-backend @@ -12,7 +12,7 @@ NotifyAccess=all Restart=on-failure RestartSec=30 TimeoutStopSec=100 
-ExecStart=/opt/monitoring/cc-backend/cc-backend --config ./config.json +ExecStart=/opt/monitoring/cc-backend/cc-backend --config ./config.json --server [Install] WantedBy=multi-user.target diff --git a/internal/api/api_test.go b/internal/api/api_test.go index 70b0f0aa..a8aef889 100644 --- a/internal/api/api_test.go +++ b/internal/api/api_test.go @@ -23,47 +23,45 @@ import ( "github.com/ClusterCockpit/cc-backend/internal/auth" "github.com/ClusterCockpit/cc-backend/internal/config" "github.com/ClusterCockpit/cc-backend/internal/graph" - "github.com/ClusterCockpit/cc-backend/internal/metricDataDispatcher" - "github.com/ClusterCockpit/cc-backend/internal/metricdata" + "github.com/ClusterCockpit/cc-backend/internal/metricdispatch" "github.com/ClusterCockpit/cc-backend/internal/repository" "github.com/ClusterCockpit/cc-backend/pkg/archive" - ccconf "github.com/ClusterCockpit/cc-lib/ccConfig" - cclog "github.com/ClusterCockpit/cc-lib/ccLogger" - "github.com/ClusterCockpit/cc-lib/schema" - "github.com/gorilla/mux" + "github.com/ClusterCockpit/cc-backend/pkg/metricstore" + ccconf "github.com/ClusterCockpit/cc-lib/v2/ccConfig" + cclog "github.com/ClusterCockpit/cc-lib/v2/ccLogger" + "github.com/ClusterCockpit/cc-lib/v2/schema" + "github.com/go-chi/chi/v5" _ "github.com/mattn/go-sqlite3" ) func setup(t *testing.T) *api.RestAPI { + repository.ResetConnection() + const testconfig = `{ - "main": { - "addr": "0.0.0.0:8080", - "validate": false, - "apiAllowedIPs": [ - "*" - ] - }, + "main": { + "addr": "0.0.0.0:8080", + "validate": false, + "api-allowed-ips": [ + "*" + ] + }, + "metric-store": { + "checkpoints": { + "interval": "12h" + }, + "retention-in-memory": "48h", + "memory-cap": 100 + }, "archive": { - "kind": "file", - "path": "./var/job-archive" + "kind": "file", + "path": "./var/job-archive" }, "auth": { - "jwts": { - "max-age": "2m" + "jwts": { + "max-age": "2m" + } } - }, - "clusters": [ - { - "name": "testcluster", - "metricDataRepository": {"kind": "test", "url": 
"bla:8081"}, - "filterRanges": { - "numNodes": { "from": 1, "to": 64 }, - "duration": { "from": 0, "to": 86400 }, - "startTime": { "from": "2022-01-01T00:00:00Z", "to": null } - } - } - ] }` const testclusterJSON = `{ "name": "testcluster", @@ -141,7 +139,7 @@ func setup(t *testing.T) *api.RestAPI { } dbfilepath := filepath.Join(tmpdir, "test.db") - err := repository.MigrateDB("sqlite3", dbfilepath) + err := repository.MigrateDB(dbfilepath) if err != nil { t.Fatal(err) } @@ -152,28 +150,23 @@ func setup(t *testing.T) *api.RestAPI { } ccconf.Init(cfgFilePath) + metricstore.MetricStoreHandle = &metricstore.InternalMetricStore{} // Load and check main configuration if cfg := ccconf.GetPackageConfig("main"); cfg != nil { - if clustercfg := ccconf.GetPackageConfig("clusters"); clustercfg != nil { - config.Init(cfg, clustercfg) - } else { - cclog.Abort("Cluster configuration must be present") - } + config.Init(cfg) } else { cclog.Abort("Main configuration must be present") } archiveCfg := fmt.Sprintf("{\"kind\": \"file\",\"path\": \"%s\"}", jobarchive) - repository.Connect("sqlite3", dbfilepath) + repository.Connect(dbfilepath) - if err := archive.Init(json.RawMessage(archiveCfg), config.Keys.DisableArchive); err != nil { + if err := archive.Init(json.RawMessage(archiveCfg)); err != nil { t.Fatal(err) } - if err := metricdata.Init(); err != nil { - t.Fatal(err) - } + // metricstore initialization removed - it's initialized via callback in tests archiver.Start(repository.GetJobRepository(), context.Background()) @@ -190,11 +183,9 @@ func setup(t *testing.T) *api.RestAPI { } func cleanup() { - // Gracefully shutdown archiver with timeout if err := archiver.Shutdown(5 * time.Second); err != nil { cclog.Warnf("Archiver shutdown timeout in tests: %v", err) } - // TODO: Clear all caches, reset all modules, etc... 
} /* @@ -221,16 +212,14 @@ func TestRestApi(t *testing.T) { }, } - metricdata.TestLoadDataCallback = func(job *schema.Job, metrics []string, scopes []schema.MetricScope, ctx context.Context, resolution int) (schema.JobData, error) { + metricstore.TestLoadDataCallback = func(job *schema.Job, metrics []string, scopes []schema.MetricScope, ctx context.Context, resolution int) (schema.JobData, error) { return testData, nil } - r := mux.NewRouter() - r.PathPrefix("/api").Subrouter() - r.StrictSlash(true) + r := chi.NewRouter() restapi.MountAPIRoutes(r) - var TestJobId int64 = 123 + var TestJobID int64 = 123 TestClusterName := "testcluster" var TestStartTime int64 = 123456789 @@ -280,7 +269,7 @@ func TestRestApi(t *testing.T) { } // resolver := graph.GetResolverInstance() restapi.JobRepository.SyncJobs() - job, err := restapi.JobRepository.Find(&TestJobId, &TestClusterName, &TestStartTime) + job, err := restapi.JobRepository.Find(&TestJobID, &TestClusterName, &TestStartTime) if err != nil { t.Fatal(err) } @@ -338,7 +327,7 @@ func TestRestApi(t *testing.T) { } // Archiving happens asynchronously, will be completed in cleanup - job, err := restapi.JobRepository.Find(&TestJobId, &TestClusterName, &TestStartTime) + job, err := restapi.JobRepository.Find(&TestJobID, &TestClusterName, &TestStartTime) if err != nil { t.Fatal(err) } @@ -366,7 +355,7 @@ func TestRestApi(t *testing.T) { } t.Run("CheckArchive", func(t *testing.T) { - data, err := metricDataDispatcher.LoadData(stoppedJob, []string{"load_one"}, []schema.MetricScope{schema.MetricScopeNode}, context.Background(), 60) + data, err := metricdispatch.LoadData(stoppedJob, []string{"load_one"}, []schema.MetricScope{schema.MetricScopeNode}, context.Background(), 60) if err != nil { t.Fatal(err) } @@ -464,4 +453,198 @@ func TestRestApi(t *testing.T) { if !ok { t.Fatal("subtest failed") } + + t.Run("GetUsedNodesNoRunning", func(t *testing.T) { + contextUserValue := &schema.User{ + Username: "testuser", + Projects: 
make([]string, 0), + Roles: []string{"api"}, + AuthType: 0, + AuthSource: 2, + } + + req := httptest.NewRequest(http.MethodGet, "/jobs/used_nodes?ts=123456790", nil) + recorder := httptest.NewRecorder() + + ctx := context.WithValue(req.Context(), contextUserKey, contextUserValue) + + r.ServeHTTP(recorder, req.WithContext(ctx)) + response := recorder.Result() + if response.StatusCode != http.StatusOK { + t.Fatal(response.Status, recorder.Body.String()) + } + + var result api.GetUsedNodesAPIResponse + if err := json.NewDecoder(response.Body).Decode(&result); err != nil { + t.Fatal(err) + } + + if result.UsedNodes == nil { + t.Fatal("expected usedNodes to be non-nil") + } + + if len(result.UsedNodes) != 0 { + t.Fatalf("expected no used nodes for stopped jobs, got: %v", result.UsedNodes) + } + }) +} + +// TestStopJobWithReusedJobId verifies that stopping a recently started job works +// even when an older job with the same jobId exists in the job table (e.g. with +// state "failed"). This is a regression test for the bug where Find() on the job +// table would match the old job instead of the new one still in job_cache. 
+func TestStopJobWithReusedJobId(t *testing.T) { + restapi := setup(t) + t.Cleanup(cleanup) + + testData := schema.JobData{ + "load_one": map[schema.MetricScope]*schema.JobMetric{ + schema.MetricScopeNode: { + Unit: schema.Unit{Base: "load"}, + Timestep: 60, + Series: []schema.Series{ + { + Hostname: "host123", + Statistics: schema.MetricStatistics{Min: 0.1, Avg: 0.2, Max: 0.3}, + Data: []schema.Float{0.1, 0.1, 0.1, 0.2, 0.2, 0.2, 0.3, 0.3, 0.3}, + }, + }, + }, + }, + } + + metricstore.TestLoadDataCallback = func(job *schema.Job, metrics []string, scopes []schema.MetricScope, ctx context.Context, resolution int) (schema.JobData, error) { + return testData, nil + } + + r := chi.NewRouter() + restapi.MountAPIRoutes(r) + + const contextUserKey repository.ContextKey = "user" + contextUserValue := &schema.User{ + Username: "testuser", + Projects: make([]string, 0), + Roles: []string{"user"}, + AuthType: 0, + AuthSource: 2, + } + + // Step 1: Start the first job (jobId=999) + const startJobBody1 string = `{ + "jobId": 999, + "user": "testuser", + "project": "testproj", + "cluster": "testcluster", + "partition": "default", + "walltime": 3600, + "numNodes": 1, + "numHwthreads": 8, + "numAcc": 0, + "shared": "none", + "monitoringStatus": 1, + "smt": 1, + "resources": [{"hostname": "host123", "hwthreads": [0, 1, 2, 3, 4, 5, 6, 7]}], + "startTime": 200000000 + }` + + if ok := t.Run("StartFirstJob", func(t *testing.T) { + req := httptest.NewRequest(http.MethodPost, "/jobs/start_job/", bytes.NewBuffer([]byte(startJobBody1))) + recorder := httptest.NewRecorder() + ctx := context.WithValue(req.Context(), contextUserKey, contextUserValue) + r.ServeHTTP(recorder, req.WithContext(ctx)) + if recorder.Result().StatusCode != http.StatusCreated { + t.Fatal(recorder.Result().Status, recorder.Body.String()) + } + }); !ok { + return + } + + // Step 2: Sync to move job from cache to job table, then stop it as "failed" + time.Sleep(1 * time.Second) + restapi.JobRepository.SyncJobs() + + 
const stopJobBody1 string = `{ + "jobId": 999, + "startTime": 200000000, + "cluster": "testcluster", + "jobState": "failed", + "stopTime": 200001000 + }` + + if ok := t.Run("StopFirstJobAsFailed", func(t *testing.T) { + req := httptest.NewRequest(http.MethodPost, "/jobs/stop_job/", bytes.NewBuffer([]byte(stopJobBody1))) + recorder := httptest.NewRecorder() + ctx := context.WithValue(req.Context(), contextUserKey, contextUserValue) + r.ServeHTTP(recorder, req.WithContext(ctx)) + if recorder.Result().StatusCode != http.StatusOK { + t.Fatal(recorder.Result().Status, recorder.Body.String()) + } + + jobid, cluster := int64(999), "testcluster" + job, err := restapi.JobRepository.Find(&jobid, &cluster, nil) + if err != nil { + t.Fatal(err) + } + if job.State != schema.JobStateFailed { + t.Fatalf("expected first job to be failed, got: %s", job.State) + } + }); !ok { + return + } + + // Wait for archiving to complete + time.Sleep(1 * time.Second) + + // Step 3: Start a NEW job with the same jobId=999 but different startTime. + // This job will sit in job_cache (not yet synced). 
+ const startJobBody2 string = `{ + "jobId": 999, + "user": "testuser", + "project": "testproj", + "cluster": "testcluster", + "partition": "default", + "walltime": 3600, + "numNodes": 1, + "numHwthreads": 8, + "numAcc": 0, + "shared": "none", + "monitoringStatus": 1, + "smt": 1, + "resources": [{"hostname": "host123", "hwthreads": [0, 1, 2, 3, 4, 5, 6, 7]}], + "startTime": 300000000 + }` + + if ok := t.Run("StartSecondJob", func(t *testing.T) { + req := httptest.NewRequest(http.MethodPost, "/jobs/start_job/", bytes.NewBuffer([]byte(startJobBody2))) + recorder := httptest.NewRecorder() + ctx := context.WithValue(req.Context(), contextUserKey, contextUserValue) + r.ServeHTTP(recorder, req.WithContext(ctx)) + if recorder.Result().StatusCode != http.StatusCreated { + t.Fatal(recorder.Result().Status, recorder.Body.String()) + } + }); !ok { + return + } + + // Step 4: Stop the second job WITHOUT syncing first. + // Before the fix, this would fail because Find() on the job table would + // match the old failed job (jobId=999) and reject with "already stopped". 
+ const stopJobBody2 string = `{ + "jobId": 999, + "startTime": 300000000, + "cluster": "testcluster", + "jobState": "completed", + "stopTime": 300001000 + }` + + t.Run("StopSecondJobBeforeSync", func(t *testing.T) { + req := httptest.NewRequest(http.MethodPost, "/jobs/stop_job/", bytes.NewBuffer([]byte(stopJobBody2))) + recorder := httptest.NewRecorder() + ctx := context.WithValue(req.Context(), contextUserKey, contextUserValue) + r.ServeHTTP(recorder, req.WithContext(ctx)) + if recorder.Result().StatusCode != http.StatusOK { + t.Fatalf("expected stop to succeed for cached job, got: %s %s", + recorder.Result().Status, recorder.Body.String()) + } + }) } diff --git a/internal/api/cluster.go b/internal/api/cluster.go index 28d7c109..5e6e3a27 100644 --- a/internal/api/cluster.go +++ b/internal/api/cluster.go @@ -13,7 +13,7 @@ import ( "github.com/ClusterCockpit/cc-backend/internal/repository" "github.com/ClusterCockpit/cc-backend/pkg/archive" - "github.com/ClusterCockpit/cc-lib/schema" + "github.com/ClusterCockpit/cc-lib/v2/schema" ) // GetClustersAPIResponse model @@ -27,7 +27,7 @@ type GetClustersAPIResponse struct { // @description Get a list of all cluster configs. Specific cluster can be requested using query parameter. 
// @produce json // @param cluster query string false "Job Cluster" -// @success 200 {object} api.GetClustersApiResponse "Array of clusters" +// @success 200 {object} api.GetClustersAPIResponse "Array of clusters" // @failure 400 {object} api.ErrorResponse "Bad Request" // @failure 401 {object} api.ErrorResponse "Unauthorized" // @failure 403 {object} api.ErrorResponse "Forbidden" @@ -36,9 +36,9 @@ type GetClustersAPIResponse struct { // @router /api/clusters/ [get] func (api *RestAPI) getClusters(rw http.ResponseWriter, r *http.Request) { if user := repository.GetUserFromContext(r.Context()); user != nil && - !user.HasRole(schema.RoleApi) { + !user.HasRole(schema.RoleAPI) { - handleError(fmt.Errorf("missing role: %v", schema.GetRoleString(schema.RoleApi)), http.StatusForbidden, rw) + handleError(fmt.Errorf("missing role: %v", schema.GetRoleString(schema.RoleAPI)), http.StatusForbidden, rw) return } diff --git a/internal/api/docs.go b/internal/api/docs.go index d0b5c6fb..de3cf506 100644 --- a/internal/api/docs.go +++ b/internal/api/docs.go @@ -25,11 +25,6 @@ const docTemplate = `{ "paths": { "/api/clusters/": { "get": { - "security": [ - { - "ApiKeyAuth": [] - } - ], "description": "Get a list of all cluster configs. Specific cluster can be requested using query parameter.", "produces": [ "application/json" @@ -50,7 +45,7 @@ const docTemplate = `{ "200": { "description": "Array of clusters", "schema": { - "$ref": "#/definitions/api.GetClustersApiResponse" + "$ref": "#/definitions/api.GetClustersAPIResponse" } }, "400": { @@ -77,16 +72,16 @@ const docTemplate = `{ "$ref": "#/definitions/api.ErrorResponse" } } - } - } - }, - "/api/jobs/": { - "get": { + }, "security": [ { "ApiKeyAuth": [] } - ], + ] + } + }, + "/api/jobs/": { + "get": { "description": "Get a list of all jobs. Filters can be applied using query parameters.\nNumber of results can be limited by page. 
Results are sorted by descending startTime.", "produces": [ "application/json" @@ -145,7 +140,7 @@ const docTemplate = `{ "200": { "description": "Job array and page info", "schema": { - "$ref": "#/definitions/api.GetJobsApiResponse" + "$ref": "#/definitions/api.GetJobsAPIResponse" } }, "400": { @@ -172,16 +167,16 @@ const docTemplate = `{ "$ref": "#/definitions/api.ErrorResponse" } } - } - } - }, - "/api/jobs/delete_job/": { - "delete": { + }, "security": [ { "ApiKeyAuth": [] } - ], + ] + } + }, + "/api/jobs/delete_job/": { + "delete": { "description": "Job to delete is specified by request body. All fields are required in this case.", "consumes": [ "application/json" @@ -200,7 +195,7 @@ const docTemplate = `{ "in": "body", "required": true, "schema": { - "$ref": "#/definitions/api.DeleteJobApiRequest" + "$ref": "#/definitions/api.DeleteJobAPIRequest" } } ], @@ -208,7 +203,7 @@ const docTemplate = `{ "200": { "description": "Success message", "schema": { - "$ref": "#/definitions/api.DefaultApiResponse" + "$ref": "#/definitions/api.DefaultAPIResponse" } }, "400": { @@ -247,16 +242,16 @@ const docTemplate = `{ "$ref": "#/definitions/api.ErrorResponse" } } - } - } - }, - "/api/jobs/delete_job/{id}": { - "delete": { + }, "security": [ { "ApiKeyAuth": [] } - ], + ] + } + }, + "/api/jobs/delete_job/{id}": { + "delete": { "description": "Job to remove is specified by database ID. 
This will not remove the job from the job archive.", "produces": [ "application/json" @@ -278,7 +273,7 @@ const docTemplate = `{ "200": { "description": "Success message", "schema": { - "$ref": "#/definitions/api.DefaultApiResponse" + "$ref": "#/definitions/api.DefaultAPIResponse" } }, "400": { @@ -317,16 +312,16 @@ const docTemplate = `{ "$ref": "#/definitions/api.ErrorResponse" } } - } - } - }, - "/api/jobs/delete_job_before/{ts}": { - "delete": { + }, "security": [ { "ApiKeyAuth": [] } - ], + ] + } + }, + "/api/jobs/delete_job_before/{ts}": { + "delete": { "description": "Remove all jobs with start time before timestamp. The jobs will not be removed from the job archive.", "produces": [ "application/json" @@ -342,13 +337,19 @@ const docTemplate = `{ "name": "ts", "in": "path", "required": true + }, + { + "type": "boolean", + "description": "Omit jobs with tags from deletion", + "name": "omit-tagged", + "in": "query" } ], "responses": { "200": { "description": "Success message", "schema": { - "$ref": "#/definitions/api.DefaultApiResponse" + "$ref": "#/definitions/api.DefaultAPIResponse" } }, "400": { @@ -387,16 +388,79 @@ const docTemplate = `{ "$ref": "#/definitions/api.ErrorResponse" } } - } + }, + "security": [ + { + "ApiKeyAuth": [] + } + ] } }, - "/api/jobs/edit_meta/{id}": { - "post": { + "/api/jobs/edit_meta/": { + "patch": { "security": [ { "ApiKeyAuth": [] } ], + "description": "Edit key value pairs in metadata json of job specified by jobID, StartTime and Cluster\nIf a key already exists its content will be overwritten", + "consumes": [ + "application/json" + ], + "produces": [ + "application/json" + ], + "tags": [ + "Job add and modify" + ], + "summary": "Edit meta-data json by request", + "parameters": [ + { + "description": "Specifies job and payload to add or update", + "name": "request", + "in": "body", + "required": true, + "schema": { + "$ref": "#/definitions/api.JobMetaRequest" + } + } + ], + "responses": { + "200": { + "description": "Updated 
job resource", + "schema": { + "$ref": "#/definitions/schema.Job" + } + }, + "400": { + "description": "Bad Request", + "schema": { + "$ref": "#/definitions/api.ErrorResponse" + } + }, + "401": { + "description": "Unauthorized", + "schema": { + "$ref": "#/definitions/api.ErrorResponse" + } + }, + "404": { + "description": "Job does not exist", + "schema": { + "$ref": "#/definitions/api.ErrorResponse" + } + }, + "500": { + "description": "Internal Server Error", + "schema": { + "$ref": "#/definitions/api.ErrorResponse" + } + } + } + } + }, + "/api/jobs/edit_meta/{id}": { + "patch": { "description": "Edit key value pairs in job metadata json\nIf a key already exists its content will be overwritten", "consumes": [ "application/json" @@ -457,16 +521,16 @@ const docTemplate = `{ "$ref": "#/definitions/api.ErrorResponse" } } - } - } - }, - "/api/jobs/start_job/": { - "post": { + }, "security": [ { "ApiKeyAuth": [] } - ], + ] + } + }, + "/api/jobs/start_job/": { + "post": { "description": "Job specified in request body will be saved to database as \"running\" with new DB ID.\nJob specifications follow the 'Job' scheme, API will fail to execute if requirements are not met.", "consumes": [ "application/json" @@ -493,7 +557,7 @@ const docTemplate = `{ "201": { "description": "Job added successfully", "schema": { - "$ref": "#/definitions/api.DefaultApiResponse" + "$ref": "#/definitions/api.DefaultAPIResponse" } }, "400": { @@ -526,16 +590,16 @@ const docTemplate = `{ "$ref": "#/definitions/api.ErrorResponse" } } - } - } - }, - "/api/jobs/stop_job/": { - "post": { + }, "security": [ { "ApiKeyAuth": [] } - ], + ] + } + }, + "/api/jobs/stop_job/": { + "post": { "description": "Job to stop is specified by request body. 
All fields are required in this case.\nReturns full job resource information according to 'Job' scheme.", "produces": [ "application/json" @@ -551,7 +615,7 @@ const docTemplate = `{ "in": "body", "required": true, "schema": { - "$ref": "#/definitions/api.StopJobApiRequest" + "$ref": "#/definitions/api.StopJobAPIRequest" } } ], @@ -598,16 +662,16 @@ const docTemplate = `{ "$ref": "#/definitions/api.ErrorResponse" } } - } - } - }, - "/api/jobs/tag_job/{id}": { - "post": { + }, "security": [ { "ApiKeyAuth": [] } - ], + ] + } + }, + "/api/jobs/tag_job/{id}": { + "post": { "description": "Adds tag(s) to a job specified by DB ID. Name and Type of Tag(s) can be chosen freely.\nTag Scope for frontend visibility will default to \"global\" if none entered, other options: \"admin\" or specific username.\nIf tagged job is already finished: Tag will be written directly to respective archive files.", "consumes": [ "application/json" @@ -635,7 +699,7 @@ const docTemplate = `{ "schema": { "type": "array", "items": { - "$ref": "#/definitions/api.ApiTag" + "$ref": "#/definitions/api.APITag" } } } @@ -671,16 +735,16 @@ const docTemplate = `{ "$ref": "#/definitions/api.ErrorResponse" } } - } - } - }, - "/api/jobs/{id}": { - "get": { + }, "security": [ { "ApiKeyAuth": [] } - ], + ] + } + }, + "/api/jobs/{id}": { + "get": { "description": "Job to get is specified by database ID\nReturns full job resource information according to 'Job' scheme and all metrics according to 'JobData'.", "produces": [ "application/json" @@ -708,7 +772,7 @@ const docTemplate = `{ "200": { "description": "Job resource", "schema": { - "$ref": "#/definitions/api.GetJobApiResponse" + "$ref": "#/definitions/api.GetJobAPIResponse" } }, "400": { @@ -747,14 +811,14 @@ const docTemplate = `{ "$ref": "#/definitions/api.ErrorResponse" } } - } - }, - "post": { + }, "security": [ { "ApiKeyAuth": [] } - ], + ] + }, + "post": { "description": "Job to get is specified by database ID\nReturns full job resource information 
according to 'Job' scheme and all metrics according to 'JobData'.", "consumes": [ "application/json" @@ -791,7 +855,7 @@ const docTemplate = `{ "200": { "description": "Job resource", "schema": { - "$ref": "#/definitions/api.GetJobApiResponse" + "$ref": "#/definitions/api.GetJobAPIResponse" } }, "400": { @@ -830,16 +894,16 @@ const docTemplate = `{ "$ref": "#/definitions/api.ErrorResponse" } } - } - } - }, - "/api/nodestats/": { - "post": { + }, "security": [ { "ApiKeyAuth": [] } - ], + ] + } + }, + "/api/nodestats/": { + "post": { "description": "Returns a JSON-encoded list of users.\nRequired query-parameter defines if all users or only users with additional special roles are returned.", "produces": [ "application/json" @@ -863,7 +927,7 @@ const docTemplate = `{ "200": { "description": "Success message", "schema": { - "$ref": "#/definitions/api.DefaultApiResponse" + "$ref": "#/definitions/api.DefaultAPIResponse" } }, "400": { @@ -890,16 +954,86 @@ const docTemplate = `{ "$ref": "#/definitions/api.ErrorResponse" } } - } - } - }, - "/api/users/": { - "get": { + }, "security": [ { "ApiKeyAuth": [] } + ] + } + }, + "/api/user/{id}": { + "post": { + "description": "Allows admins to add/remove roles and projects for a user", + "produces": [ + "text/plain" ], + "tags": [ + "User" + ], + "summary": "Update user roles and projects", + "parameters": [ + { + "type": "string", + "description": "Username", + "name": "id", + "in": "path", + "required": true + }, + { + "type": "string", + "description": "Role to add", + "name": "add-role", + "in": "formData" + }, + { + "type": "string", + "description": "Role to remove", + "name": "remove-role", + "in": "formData" + }, + { + "type": "string", + "description": "Project to add", + "name": "add-project", + "in": "formData" + }, + { + "type": "string", + "description": "Project to remove", + "name": "remove-project", + "in": "formData" + } + ], + "responses": { + "200": { + "description": "Success message", + "schema": { + "type": 
"string" + } + }, + "403": { + "description": "Forbidden", + "schema": { + "$ref": "#/definitions/api.ErrorResponse" + } + }, + "422": { + "description": "Unprocessable Entity", + "schema": { + "$ref": "#/definitions/api.ErrorResponse" + } + } + }, + "security": [ + { + "ApiKeyAuth": [] + } + ] + } + }, + "/api/users/": { + "get": { "description": "Returns a JSON-encoded list of users.\nRequired query-parameter defines if all users or only users with additional special roles are returned.", "produces": [ "application/json" @@ -923,7 +1057,7 @@ const docTemplate = `{ "schema": { "type": "array", "items": { - "$ref": "#/definitions/api.ApiReturnedUser" + "$ref": "#/definitions/api.APIReturnedUser" } } }, @@ -951,16 +1085,361 @@ const docTemplate = `{ "type": "string" } } - } - } - }, - "/jobs/tag_job/{id}": { - "delete": { + }, "security": [ { "ApiKeyAuth": [] } + ] + }, + "post": { + "description": "Creates a new user with specified credentials and role", + "produces": [ + "text/plain" ], + "tags": [ + "User" + ], + "summary": "Create a new user", + "parameters": [ + { + "type": "string", + "description": "Username", + "name": "username", + "in": "formData", + "required": true + }, + { + "type": "string", + "description": "Password (not required for API users)", + "name": "password", + "in": "formData" + }, + { + "type": "string", + "description": "User role", + "name": "role", + "in": "formData", + "required": true + }, + { + "type": "string", + "description": "Full name", + "name": "name", + "in": "formData" + }, + { + "type": "string", + "description": "Email address", + "name": "email", + "in": "formData" + }, + { + "type": "string", + "description": "Project (required for managers)", + "name": "project", + "in": "formData" + } + ], + "responses": { + "200": { + "description": "Success message", + "schema": { + "type": "string" + } + }, + "400": { + "description": "Bad Request", + "schema": { + "$ref": "#/definitions/api.ErrorResponse" + } + }, + "403": { + 
"description": "Forbidden", + "schema": { + "$ref": "#/definitions/api.ErrorResponse" + } + }, + "422": { + "description": "Unprocessable Entity", + "schema": { + "$ref": "#/definitions/api.ErrorResponse" + } + } + }, + "security": [ + { + "ApiKeyAuth": [] + } + ] + }, + "delete": { + "description": "Deletes a user from the system", + "produces": [ + "text/plain" + ], + "tags": [ + "User" + ], + "summary": "Delete a user", + "parameters": [ + { + "type": "string", + "description": "Username to delete", + "name": "username", + "in": "formData", + "required": true + } + ], + "responses": { + "200": { + "description": "Success", + "schema": { + "type": "string" + } + }, + "403": { + "description": "Forbidden", + "schema": { + "$ref": "#/definitions/api.ErrorResponse" + } + }, + "422": { + "description": "Unprocessable Entity", + "schema": { + "$ref": "#/definitions/api.ErrorResponse" + } + } + }, + "security": [ + { + "ApiKeyAuth": [] + } + ] + } + }, + "/configuration/": { + "post": { + "description": "Updates a user's configuration key-value pair.", + "consumes": [ + "multipart/form-data" + ], + "produces": [ + "text/plain" + ], + "tags": [ + "Frontend" + ], + "summary": "Update user configuration", + "parameters": [ + { + "type": "string", + "description": "Configuration key", + "name": "key", + "in": "formData", + "required": true + }, + { + "type": "string", + "description": "Configuration value", + "name": "value", + "in": "formData", + "required": true + } + ], + "responses": { + "200": { + "description": "success", + "schema": { + "type": "string" + } + }, + "500": { + "description": "Internal Server Error", + "schema": { + "$ref": "#/definitions/api.ErrorResponse" + } + } + }, + "security": [ + { + "ApiKeyAuth": [] + } + ] + } + }, + "/debug/": { + "post": { + "description": "This endpoint allows the users to print the content of", + "produces": [ + "application/json" + ], + "tags": [ + "debug" + ], + "summary": "Debug endpoint", + "parameters": [ + { + 
"type": "string", + "description": "Selector", + "name": "selector", + "in": "query" + } + ], + "responses": { + "200": { + "description": "Debug dump", + "schema": { + "type": "string" + } + }, + "400": { + "description": "Bad Request", + "schema": { + "$ref": "#/definitions/api.ErrorResponse" + } + }, + "401": { + "description": "Unauthorized", + "schema": { + "$ref": "#/definitions/api.ErrorResponse" + } + }, + "403": { + "description": "Forbidden", + "schema": { + "$ref": "#/definitions/api.ErrorResponse" + } + }, + "500": { + "description": "Internal Server Error", + "schema": { + "$ref": "#/definitions/api.ErrorResponse" + } + } + }, + "security": [ + { + "ApiKeyAuth": [] + } + ] + } + }, + "/free/": { + "post": { + "description": "This endpoint allows the users to free the Buffers from the", + "produces": [ + "application/json" + ], + "tags": [ + "free" + ], + "parameters": [ + { + "type": "string", + "description": "up to timestamp", + "name": "to", + "in": "query" + } + ], + "responses": { + "200": { + "description": "ok", + "schema": { + "type": "string" + } + }, + "400": { + "description": "Bad Request", + "schema": { + "$ref": "#/definitions/api.ErrorResponse" + } + }, + "401": { + "description": "Unauthorized", + "schema": { + "$ref": "#/definitions/api.ErrorResponse" + } + }, + "403": { + "description": "Forbidden", + "schema": { + "$ref": "#/definitions/api.ErrorResponse" + } + }, + "500": { + "description": "Internal Server Error", + "schema": { + "$ref": "#/definitions/api.ErrorResponse" + } + } + }, + "security": [ + { + "ApiKeyAuth": [] + } + ] + } + }, + "/healthcheck/": { + "get": { + "description": "This endpoint allows the users to check if a node is healthy", + "produces": [ + "application/json" + ], + "tags": [ + "healthcheck" + ], + "summary": "HealthCheck endpoint", + "parameters": [ + { + "type": "string", + "description": "Selector", + "name": "selector", + "in": "query" + } + ], + "responses": { + "200": { + "description": "Debug 
dump", + "schema": { + "type": "string" + } + }, + "400": { + "description": "Bad Request", + "schema": { + "$ref": "#/definitions/api.ErrorResponse" + } + }, + "401": { + "description": "Unauthorized", + "schema": { + "$ref": "#/definitions/api.ErrorResponse" + } + }, + "403": { + "description": "Forbidden", + "schema": { + "$ref": "#/definitions/api.ErrorResponse" + } + }, + "500": { + "description": "Internal Server Error", + "schema": { + "$ref": "#/definitions/api.ErrorResponse" + } + } + }, + "security": [ + { + "ApiKeyAuth": [] + } + ] + } + }, + "/jobs/tag_job/{id}": { + "delete": { "description": "Removes tag(s) from a job specified by DB ID. Name and Type of Tag(s) must match.\nTag Scope is required for matching, options: \"global\", \"admin\". Private tags can not be deleted via API.\nIf tagged job is already finished: Tag will be removed from respective archive files.", "consumes": [ "application/json" @@ -988,7 +1467,7 @@ const docTemplate = `{ "schema": { "type": "array", "items": { - "$ref": "#/definitions/api.ApiTag" + "$ref": "#/definitions/api.APITag" } } } @@ -1024,16 +1503,276 @@ const docTemplate = `{ "$ref": "#/definitions/api.ErrorResponse" } } - } - } - }, - "/tags/": { - "delete": { + }, "security": [ { "ApiKeyAuth": [] } + ] + } + }, + "/jwt/": { + "get": { + "description": "Generates a JWT token for a user. 
Admins can generate tokens for any user, regular users only for themselves.", + "consumes": [ + "multipart/form-data" ], + "produces": [ + "text/plain" + ], + "tags": [ + "Frontend" + ], + "summary": "Generate JWT token", + "parameters": [ + { + "type": "string", + "description": "Username to generate JWT for", + "name": "username", + "in": "formData", + "required": true + } + ], + "responses": { + "200": { + "description": "JWT token", + "schema": { + "type": "string" + } + }, + "403": { + "description": "Forbidden", + "schema": { + "$ref": "#/definitions/api.ErrorResponse" + } + }, + "404": { + "description": "User Not Found", + "schema": { + "$ref": "#/definitions/api.ErrorResponse" + } + }, + "500": { + "description": "Internal Server Error", + "schema": { + "$ref": "#/definitions/api.ErrorResponse" + } + } + }, + "security": [ + { + "ApiKeyAuth": [] + } + ] + } + }, + "/machine_state/{cluster}/{host}": { + "get": { + "description": "Retrieves stored machine state data for a specific cluster node. Validates cluster and host names to prevent path traversal.", + "produces": [ + "application/json" + ], + "tags": [ + "Machine State" + ], + "summary": "Retrieve machine state", + "parameters": [ + { + "type": "string", + "description": "Cluster name", + "name": "cluster", + "in": "path", + "required": true + }, + { + "type": "string", + "description": "Host name", + "name": "host", + "in": "path", + "required": true + } + ], + "responses": { + "200": { + "description": "Machine state JSON data", + "schema": { + "type": "object" + } + }, + "400": { + "description": "Bad Request", + "schema": { + "$ref": "#/definitions/api.ErrorResponse" + } + }, + "404": { + "description": "Machine state not enabled or file not found", + "schema": { + "$ref": "#/definitions/api.ErrorResponse" + } + } + }, + "security": [ + { + "ApiKeyAuth": [] + } + ] + }, + "put": { + "description": "Stores machine state data for a specific cluster node. 
Validates cluster and host names to prevent path traversal.", + "consumes": [ + "application/json" + ], + "produces": [ + "text/plain" + ], + "tags": [ + "Machine State" + ], + "summary": "Store machine state", + "parameters": [ + { + "type": "string", + "description": "Cluster name", + "name": "cluster", + "in": "path", + "required": true + }, + { + "type": "string", + "description": "Host name", + "name": "host", + "in": "path", + "required": true + } + ], + "responses": { + "201": { + "description": "Created" + }, + "400": { + "description": "Bad Request", + "schema": { + "$ref": "#/definitions/api.ErrorResponse" + } + }, + "404": { + "description": "Machine state not enabled", + "schema": { + "$ref": "#/definitions/api.ErrorResponse" + } + }, + "500": { + "description": "Internal Server Error", + "schema": { + "$ref": "#/definitions/api.ErrorResponse" + } + } + }, + "security": [ + { + "ApiKeyAuth": [] + } + ] + } + }, + "/notice/": { + "post": { + "description": "Updates the notice.txt file content. Only admins are allowed. 
Content is limited to 10000 characters.", + "consumes": [ + "multipart/form-data" + ], + "produces": [ + "text/plain" + ], + "tags": [ + "Config" + ], + "summary": "Update system notice", + "parameters": [ + { + "type": "string", + "description": "New notice content (max 10000 characters)", + "name": "new-content", + "in": "formData", + "required": true + } + ], + "responses": { + "200": { + "description": "Update Notice Content Success", + "schema": { + "type": "string" + } + }, + "400": { + "description": "Bad Request", + "schema": { + "$ref": "#/definitions/api.ErrorResponse" + } + }, + "403": { + "description": "Forbidden", + "schema": { + "$ref": "#/definitions/api.ErrorResponse" + } + }, + "500": { + "description": "Internal Server Error", + "schema": { + "$ref": "#/definitions/api.ErrorResponse" + } + } + }, + "security": [ + { + "ApiKeyAuth": [] + } + ] + } + }, + "/roles/": { + "get": { + "description": "Returns a list of valid user roles. Only admins are allowed.", + "produces": [ + "application/json" + ], + "tags": [ + "Config" + ], + "summary": "Get available roles", + "responses": { + "200": { + "description": "List of role names", + "schema": { + "type": "array", + "items": { + "type": "string" + } + } + }, + "403": { + "description": "Forbidden", + "schema": { + "$ref": "#/definitions/api.ErrorResponse" + } + }, + "500": { + "description": "Internal Server Error", + "schema": { + "$ref": "#/definitions/api.ErrorResponse" + } + } + }, + "security": [ + { + "ApiKeyAuth": [] + } + ] + } + }, + "/tags/": { + "delete": { "description": "Removes tags by type and name. Name and Type of Tag(s) must match.\nTag Scope is required for matching, options: \"global\", \"admin\". 
Private tags can not be deleted via API.\nTag wills be removed from respective archive files.", "consumes": [ "application/json" @@ -1054,7 +1793,7 @@ const docTemplate = `{ "schema": { "type": "array", "items": { - "$ref": "#/definitions/api.ApiTag" + "$ref": "#/definitions/api.APITag" } } } @@ -1090,12 +1829,72 @@ const docTemplate = `{ "$ref": "#/definitions/api.ErrorResponse" } } - } + }, + "security": [ + { + "ApiKeyAuth": [] + } + ] + } + }, + "/write/": { + "post": { + "consumes": [ + "text/plain" + ], + "produces": [ + "application/json" + ], + "parameters": [ + { + "type": "string", + "description": "If the lines in the body do not have a cluster tag, use this value instead.", + "name": "cluster", + "in": "query" + } + ], + "responses": { + "200": { + "description": "ok", + "schema": { + "type": "string" + } + }, + "400": { + "description": "Bad Request", + "schema": { + "$ref": "#/definitions/api.ErrorResponse" + } + }, + "401": { + "description": "Unauthorized", + "schema": { + "$ref": "#/definitions/api.ErrorResponse" + } + }, + "403": { + "description": "Forbidden", + "schema": { + "$ref": "#/definitions/api.ErrorResponse" + } + }, + "500": { + "description": "Internal Server Error", + "schema": { + "$ref": "#/definitions/api.ErrorResponse" + } + } + }, + "security": [ + { + "ApiKeyAuth": [] + } + ] } } }, "definitions": { - "api.ApiReturnedUser": { + "api.APIReturnedUser": { "type": "object", "properties": { "email": { @@ -1121,7 +1920,7 @@ const docTemplate = `{ } } }, - "api.ApiTag": { + "api.APITag": { "type": "object", "properties": { "name": { @@ -1141,7 +1940,7 @@ const docTemplate = `{ } } }, - "api.DefaultApiResponse": { + "api.DefaultAPIResponse": { "type": "object", "properties": { "msg": { @@ -1149,7 +1948,7 @@ const docTemplate = `{ } } }, - "api.DeleteJobApiRequest": { + "api.DeleteJobAPIRequest": { "type": "object", "required": [ "jobId" @@ -1198,7 +1997,7 @@ const docTemplate = `{ } } }, - "api.GetClustersApiResponse": { + 
"api.GetClustersAPIResponse": { "type": "object", "properties": { "clusters": { @@ -1210,7 +2009,7 @@ const docTemplate = `{ } } }, - "api.GetJobApiResponse": { + "api.GetJobAPIResponse": { "type": "object", "properties": { "data": { @@ -1224,7 +2023,7 @@ const docTemplate = `{ } } }, - "api.GetJobsApiResponse": { + "api.GetJobsAPIResponse": { "type": "object", "properties": { "items": { @@ -1258,39 +2057,7 @@ const docTemplate = `{ } } }, - "api.Node": { - "type": "object", - "properties": { - "cpusAllocated": { - "type": "integer" - }, - "cpusTotal": { - "type": "integer" - }, - "gpusAllocated": { - "type": "integer" - }, - "gpusTotal": { - "type": "integer" - }, - "hostname": { - "type": "string" - }, - "memoryAllocated": { - "type": "integer" - }, - "memoryTotal": { - "type": "integer" - }, - "states": { - "type": "array", - "items": { - "type": "string" - } - } - } - }, - "api.StopJobApiRequest": { + "api.StopJobAPIRequest": { "type": "object", "required": [ "jobState", @@ -1333,7 +2100,7 @@ const docTemplate = `{ "nodes": { "type": "array", "items": { - "$ref": "#/definitions/api.Node" + "$ref": "#/definitions/schema.NodePayload" } } } @@ -1342,12 +2109,15 @@ const docTemplate = `{ "type": "object", "properties": { "id": { + "description": "Unique identifier for the accelerator (e.g., \"0\", \"1\", \"GPU-0\")", "type": "string" }, "model": { + "description": "Specific model name (e.g., \"A100\", \"MI100\")", "type": "string" }, "type": { + "description": "Type of accelerator (e.g., \"Nvidia GPU\", \"AMD GPU\")", "type": "string" } } @@ -1356,15 +2126,18 @@ const docTemplate = `{ "type": "object", "properties": { "metricConfig": { + "description": "Cluster-wide metric configurations", "type": "array", "items": { "$ref": "#/definitions/schema.MetricConfig" } }, "name": { + "description": "Unique cluster name (e.g., \"fritz\", \"alex\")", "type": "string" }, "subClusters": { + "description": "Homogeneous partitions within the cluster", "type": "array", "items": 
{ "$ref": "#/definitions/schema.SubCluster" @@ -1373,6 +2146,7 @@ const docTemplate = `{ } }, "schema.Job": { + "description": "Information of a HPC job.", "type": "object", "properties": { "arrayJobId": { @@ -1401,6 +2175,13 @@ const docTemplate = `{ "format": "float64" } }, + "exclusive": { + "description": "for backwards compatibility", + "type": "integer", + "maximum": 2, + "minimum": 0, + "example": 1 + }, "footprint": { "type": "object", "additionalProperties": { @@ -1423,7 +2204,7 @@ const docTemplate = `{ "deadline", "failed", "node_fail", - "out_of_memory", + "out-of-memory", "pending", "preempted", "running", @@ -1535,9 +2316,11 @@ const docTemplate = `{ "type": "object", "properties": { "id": { + "description": "Internal database ID", "type": "integer" }, "jobId": { + "description": "The job's external job ID", "type": "integer" } } @@ -1546,9 +2329,11 @@ const docTemplate = `{ "type": "object", "properties": { "count": { + "description": "Total count of available items", "type": "integer" }, "items": { + "description": "List of job links", "type": "array", "items": { "$ref": "#/definitions/schema.JobLink" @@ -1560,19 +2345,31 @@ const docTemplate = `{ "type": "object", "properties": { "series": { + "description": "Individual time series data", "type": "array", "items": { "$ref": "#/definitions/schema.Series" } }, "statisticsSeries": { - "$ref": "#/definitions/schema.StatsSeries" + "description": "Aggregated statistics over time", + "allOf": [ + { + "$ref": "#/definitions/schema.StatsSeries" + } + ] }, "timestep": { + "description": "Sampling interval in seconds", "type": "integer" }, "unit": { - "$ref": "#/definitions/schema.Unit" + "description": "Unit of measurement", + "allOf": [ + { + "$ref": "#/definitions/schema.Unit" + } + ] } } }, @@ -1638,46 +2435,71 @@ const docTemplate = `{ "type": "object", "properties": { "aggregation": { + "description": "Aggregation function (avg, sum, min, max)", "type": "string" }, "alert": { + "description": "Alert 
threshold (requires attention)", "type": "number" }, "caution": { + "description": "Caution threshold (concerning but not critical)", "type": "number" }, "energy": { + "description": "Energy measurement method", "type": "string" }, "footprint": { + "description": "Footprint category", "type": "string" }, "lowerIsBetter": { + "description": "Whether lower values are better", "type": "boolean" }, "name": { + "description": "Metric name (e.g., \"cpu_load\", \"mem_used\")", "type": "string" }, "normal": { + "description": "Normal/typical value (good performance)", "type": "number" }, "peak": { + "description": "Peak/maximum expected value (best performance)", "type": "number" }, + "restrict": { + "description": "Restrict visibility to non user roles", + "type": "boolean" + }, "scope": { - "$ref": "#/definitions/schema.MetricScope" + "description": "Metric scope (node, socket, core, etc.)", + "allOf": [ + { + "$ref": "#/definitions/schema.MetricScope" + } + ] }, "subClusters": { + "description": "Subcluster-specific overrides", "type": "array", "items": { "$ref": "#/definitions/schema.SubClusterConfig" } }, "timestep": { + "description": "Measurement interval in seconds", "type": "integer" }, "unit": { - "$ref": "#/definitions/schema.Unit" + "description": "Unit of measurement", + "allOf": [ + { + "$ref": "#/definitions/schema.Unit" + } + ] } } }, @@ -1706,12 +2528,15 @@ const docTemplate = `{ "type": "object", "properties": { "avg": { + "description": "Average/mean value", "type": "number" }, "max": { + "description": "Maximum value", "type": "number" }, "min": { + "description": "Minimum value", "type": "number" } } @@ -1720,30 +2545,72 @@ const docTemplate = `{ "type": "object", "properties": { "unit": { - "$ref": "#/definitions/schema.Unit" + "description": "Unit of measurement (e.g., FLOP/s, GB/s)", + "allOf": [ + { + "$ref": "#/definitions/schema.Unit" + } + ] }, "value": { + "description": "Numeric value of the measurement", "type": "number" } } }, + 
"schema.NodePayload": { + "type": "object", + "properties": { + "cpusAllocated": { + "description": "Number of allocated CPUs", + "type": "integer" + }, + "gpusAllocated": { + "description": "Number of allocated GPUs", + "type": "integer" + }, + "hostname": { + "description": "Node hostname", + "type": "string" + }, + "jobsRunning": { + "description": "Number of running jobs", + "type": "integer" + }, + "memoryAllocated": { + "description": "Allocated memory in MB", + "type": "integer" + }, + "states": { + "description": "State strings (flexible format)", + "type": "array", + "items": { + "type": "string" + } + } + } + }, "schema.Resource": { "description": "A resource used by a job", "type": "object", "properties": { "accelerators": { + "description": "Allocated accelerator IDs (e.g., GPU IDs)", "type": "array", "items": { "type": "string" } }, "configuration": { + "description": "Optional configuration identifier", "type": "string" }, "hostname": { + "description": "Node hostname", "type": "string" }, "hwthreads": { + "description": "Allocated hardware thread IDs", "type": "array", "items": { "type": "integer" @@ -1755,19 +2622,27 @@ const docTemplate = `{ "type": "object", "properties": { "data": { + "description": "Time series measurements", "type": "array", "items": { "type": "number" } }, "hostname": { + "description": "Source hostname", "type": "string" }, "id": { + "description": "Optional ID (e.g., core ID, GPU ID)", "type": "string" }, "statistics": { - "$ref": "#/definitions/schema.MetricStatistics" + "description": "Statistical summary (min/avg/max)", + "allOf": [ + { + "$ref": "#/definitions/schema.MetricStatistics" + } + ] } } }, @@ -1775,30 +2650,35 @@ const docTemplate = `{ "type": "object", "properties": { "max": { + "description": "Maximum values over time", "type": "array", "items": { "type": "number" } }, "mean": { + "description": "Mean values over time", "type": "array", "items": { "type": "number" } }, "median": { + "description": "Median 
values over time", "type": "array", "items": { "type": "number" } }, "min": { + "description": "Minimum values over time", "type": "array", "items": { "type": "number" } }, "percentiles": { + "description": "Percentile values over time (e.g., 10th, 50th, 90th)", "type": "object", "additionalProperties": { "type": "array", @@ -1814,52 +2694,81 @@ const docTemplate = `{ "type": "object", "properties": { "coresPerSocket": { + "description": "Number of cores per CPU socket", "type": "integer" }, "energyFootprint": { + "description": "Energy-related footprint metrics", "type": "array", "items": { "type": "string" } }, "flopRateScalar": { - "$ref": "#/definitions/schema.MetricValue" + "description": "Theoretical scalar FLOP rate per node", + "allOf": [ + { + "$ref": "#/definitions/schema.MetricValue" + } + ] }, "flopRateSimd": { - "$ref": "#/definitions/schema.MetricValue" + "description": "Theoretical SIMD FLOP rate per node", + "allOf": [ + { + "$ref": "#/definitions/schema.MetricValue" + } + ] }, "footprint": { + "description": "Default footprint metrics for jobs", "type": "array", "items": { "type": "string" } }, "memoryBandwidth": { - "$ref": "#/definitions/schema.MetricValue" + "description": "Theoretical memory bandwidth per node", + "allOf": [ + { + "$ref": "#/definitions/schema.MetricValue" + } + ] }, "metricConfig": { + "description": "Subcluster-specific metric configurations", "type": "array", "items": { "$ref": "#/definitions/schema.MetricConfig" } }, "name": { + "description": "Name of the subcluster (e.g., \"main\", \"gpu\", \"bigmem\")", "type": "string" }, "nodes": { + "description": "Node list in condensed format (e.g., \"node[001-100]\")", "type": "string" }, "processorType": { + "description": "CPU model (e.g., \"Intel Xeon Gold 6148\")", "type": "string" }, "socketsPerNode": { + "description": "Number of CPU sockets per node", "type": "integer" }, "threadsPerCore": { + "description": "Number of hardware threads per core (SMT level)", "type": 
"integer" }, "topology": { - "$ref": "#/definitions/schema.Topology" + "description": "Hardware topology of nodes in this subcluster", + "allOf": [ + { + "$ref": "#/definitions/schema.Topology" + } + ] } } }, @@ -1867,34 +2776,52 @@ const docTemplate = `{ "type": "object", "properties": { "alert": { + "description": "Alert threshold (requires attention)", "type": "number" }, "caution": { + "description": "Caution threshold (concerning but not critical)", "type": "number" }, "energy": { + "description": "Energy measurement configuration", "type": "string" }, "footprint": { + "description": "Footprint category for this metric", "type": "string" }, "lowerIsBetter": { + "description": "Whether lower values indicate better performance", "type": "boolean" }, "name": { + "description": "Metric name (e.g., \"cpu_load\", \"mem_used\")", "type": "string" }, "normal": { + "description": "Normal/typical value (good performance)", "type": "number" }, "peak": { + "description": "Peak/maximum expected value (best performance)", "type": "number" }, "remove": { + "description": "Whether to exclude this metric for this subcluster", + "type": "boolean" + }, + "restrict": { + "description": "Restrict visibility to non user roles", "type": "boolean" }, "unit": { - "$ref": "#/definitions/schema.Unit" + "description": "Unit of measurement", + "allOf": [ + { + "$ref": "#/definitions/schema.Unit" + } + ] } } }, @@ -1923,12 +2850,14 @@ const docTemplate = `{ "type": "object", "properties": { "accelerators": { + "description": "Attached accelerators (GPUs, etc.)", "type": "array", "items": { "$ref": "#/definitions/schema.Accelerator" } }, "core": { + "description": "Hardware threads grouped by core", "type": "array", "items": { "type": "array", @@ -1938,6 +2867,7 @@ const docTemplate = `{ } }, "die": { + "description": "Hardware threads grouped by die (optional)", "type": "array", "items": { "type": "array", @@ -1947,6 +2877,7 @@ const docTemplate = `{ } }, "memoryDomain": { + "description": 
"Hardware threads grouped by NUMA domain", "type": "array", "items": { "type": "array", @@ -1956,12 +2887,14 @@ const docTemplate = `{ } }, "node": { + "description": "All hardware thread IDs on this node", "type": "array", "items": { "type": "integer" } }, "socket": { + "description": "Hardware threads grouped by socket", "type": "array", "items": { "type": "array", @@ -1976,9 +2909,11 @@ const docTemplate = `{ "type": "object", "properties": { "base": { + "description": "Base unit (e.g., \"B/s\", \"F/s\", \"W\")", "type": "string" }, "prefix": { + "description": "SI prefix (e.g., \"G\", \"M\", \"K\", \"T\")", "type": "string" } } diff --git a/internal/api/job.go b/internal/api/job.go index 7701374a..76ec3e2a 100644 --- a/internal/api/job.go +++ b/internal/api/job.go @@ -22,12 +22,12 @@ import ( "github.com/ClusterCockpit/cc-backend/internal/graph" "github.com/ClusterCockpit/cc-backend/internal/graph/model" "github.com/ClusterCockpit/cc-backend/internal/importer" - "github.com/ClusterCockpit/cc-backend/internal/metricDataDispatcher" + "github.com/ClusterCockpit/cc-backend/internal/metricdispatch" "github.com/ClusterCockpit/cc-backend/internal/repository" "github.com/ClusterCockpit/cc-backend/pkg/archive" - cclog "github.com/ClusterCockpit/cc-lib/ccLogger" - "github.com/ClusterCockpit/cc-lib/schema" - "github.com/gorilla/mux" + cclog "github.com/ClusterCockpit/cc-lib/v2/ccLogger" + "github.com/ClusterCockpit/cc-lib/v2/schema" + "github.com/go-chi/chi/v5" ) const ( @@ -72,6 +72,14 @@ type EditMetaRequest struct { Value string `json:"value" example:"bash script"` } +// JobMetaRequest model +type JobMetaRequest struct { + JobId *int64 `json:"jobId" validate:"required" example:"123000"` // Cluster Job ID of job + Cluster *string `json:"cluster" example:"fritz"` // Cluster of job + StartTime *int64 `json:"startTime" example:"1649723812"` // Start Time of job as epoch + Payload EditMetaRequest `json:"payload"` // Content to Add to Job Meta_Data +} + type TagJobAPIRequest 
[]*APITag type GetJobAPIRequest []string @@ -104,7 +112,7 @@ type JobMetricWithName struct { // @param items-per-page query int false "Items per page (Default: 25)" // @param page query int false "Page Number (Default: 1)" // @param with-metadata query bool false "Include metadata (e.g. jobScript) in response" -// @success 200 {object} api.GetJobsApiResponse "Job array and page info" +// @success 200 {object} api.GetJobsAPIResponse "Job array and page info" // @failure 400 {object} api.ErrorResponse "Bad Request" // @failure 401 {object} api.ErrorResponse "Unauthorized" // @failure 403 {object} api.ErrorResponse "Forbidden" @@ -232,7 +240,7 @@ func (api *RestAPI) getJobs(rw http.ResponseWriter, r *http.Request) { // @produce json // @param id path int true "Database ID of Job" // @param all-metrics query bool false "Include all available metrics" -// @success 200 {object} api.GetJobApiResponse "Job resource" +// @success 200 {object} api.GetJobAPIResponse "Job resource" // @failure 400 {object} api.ErrorResponse "Bad Request" // @failure 401 {object} api.ErrorResponse "Unauthorized" // @failure 403 {object} api.ErrorResponse "Forbidden" @@ -243,17 +251,17 @@ func (api *RestAPI) getJobs(rw http.ResponseWriter, r *http.Request) { // @router /api/jobs/{id} [get] func (api *RestAPI) getCompleteJobByID(rw http.ResponseWriter, r *http.Request) { // Fetch job from db - id, ok := mux.Vars(r)["id"] + id := chi.URLParam(r, "id") var job *schema.Job var err error - if ok { + if id != "" { id, e := strconv.ParseInt(id, 10, 64) if e != nil { handleError(fmt.Errorf("integer expected in path for id: %w", e), http.StatusBadRequest, rw) return } - job, err = api.JobRepository.FindById(r.Context(), id) // Get Job from Repo by ID + job, err = api.JobRepository.FindByID(r.Context(), id) // Get Job from Repo by ID } else { handleError(fmt.Errorf("the parameter 'id' is required"), http.StatusBadRequest, rw) return @@ -293,7 +301,7 @@ func (api *RestAPI) getCompleteJobByID(rw 
http.ResponseWriter, r *http.Request) } if r.URL.Query().Get("all-metrics") == "true" { - data, err = metricDataDispatcher.LoadData(job, nil, scopes, r.Context(), resolution) + data, err = metricdispatch.LoadData(job, nil, scopes, r.Context(), resolution) if err != nil { cclog.Warnf("REST: error while loading all-metrics job data for JobID %d on %s", job.JobID, job.Cluster) return @@ -324,8 +332,8 @@ func (api *RestAPI) getCompleteJobByID(rw http.ResponseWriter, r *http.Request) // @accept json // @produce json // @param id path int true "Database ID of Job" -// @param request body api.GetJobApiRequest true "Array of metric names" -// @success 200 {object} api.GetJobApiResponse "Job resource" +// @param request body api.GetJobAPIRequest true "Array of metric names" +// @success 200 {object} api.GetJobAPIResponse "Job resource" // @failure 400 {object} api.ErrorResponse "Bad Request" // @failure 401 {object} api.ErrorResponse "Unauthorized" // @failure 403 {object} api.ErrorResponse "Forbidden" @@ -336,17 +344,17 @@ func (api *RestAPI) getCompleteJobByID(rw http.ResponseWriter, r *http.Request) // @router /api/jobs/{id} [post] func (api *RestAPI) getJobByID(rw http.ResponseWriter, r *http.Request) { // Fetch job from db - id, ok := mux.Vars(r)["id"] + id := chi.URLParam(r, "id") var job *schema.Job var err error - if ok { + if id != "" { id, e := strconv.ParseInt(id, 10, 64) if e != nil { handleError(fmt.Errorf("integer expected in path for id: %w", e), http.StatusBadRequest, rw) return } - job, err = api.JobRepository.FindById(r.Context(), id) + job, err = api.JobRepository.FindByID(r.Context(), id) } else { handleError(errors.New("the parameter 'id' is required"), http.StatusBadRequest, rw) return @@ -389,7 +397,7 @@ func (api *RestAPI) getJobByID(rw http.ResponseWriter, r *http.Request) { resolution = max(resolution, mc.Timestep) } - data, err := metricDataDispatcher.LoadData(job, metrics, scopes, r.Context(), resolution) + data, err := 
metricdispatch.LoadData(job, metrics, scopes, r.Context(), resolution) if err != nil { cclog.Warnf("REST: error while loading job data for JobID %d on %s", job.JobID, job.Cluster) return @@ -423,29 +431,29 @@ func (api *RestAPI) getJobByID(rw http.ResponseWriter, r *http.Request) { } // editMeta godoc -// @summary Edit meta-data json +// @summary Edit meta-data json of job identified by database id // @tags Job add and modify -// @description Edit key value pairs in job metadata json +// @description Edit key value pairs in job metadata json of job specified by database id // @description If a key already exists its content will be overwritten // @accept json // @produce json // @param id path int true "Job Database ID" -// @param request body api.EditMetaRequest true "Kay value pair to add" +// @param request body api.EditMetaRequest true "Metadata Key value pair to add or update" // @success 200 {object} schema.Job "Updated job resource" // @failure 400 {object} api.ErrorResponse "Bad Request" // @failure 401 {object} api.ErrorResponse "Unauthorized" // @failure 404 {object} api.ErrorResponse "Job does not exist" // @failure 500 {object} api.ErrorResponse "Internal Server Error" // @security ApiKeyAuth -// @router /api/jobs/edit_meta/{id} [post] +// @router /api/jobs/edit_meta/{id} [patch] func (api *RestAPI) editMeta(rw http.ResponseWriter, r *http.Request) { - id, err := strconv.ParseInt(mux.Vars(r)["id"], 10, 64) + id, err := strconv.ParseInt(chi.URLParam(r, "id"), 10, 64) if err != nil { handleError(fmt.Errorf("parsing job ID failed: %w", err), http.StatusBadRequest, rw) return } - job, err := api.JobRepository.FindById(r.Context(), id) + job, err := api.JobRepository.FindByID(r.Context(), id) if err != nil { handleError(fmt.Errorf("finding job failed: %w", err), http.StatusNotFound, rw) return @@ -469,6 +477,54 @@ func (api *RestAPI) editMeta(rw http.ResponseWriter, r *http.Request) { } } +// editMetaByRequest godoc +// @summary Edit meta-data json of job 
identified by request +// @tags Job add and modify +// @description Edit key value pairs in metadata json of job specified by jobID, StartTime and Cluster +// @description If a key already exists its content will be overwritten +// @accept json +// @produce json +// @param request body api.JobMetaRequest true "Specifies job and payload to add or update" +// @success 200 {object} schema.Job "Updated job resource" +// @failure 400 {object} api.ErrorResponse "Bad Request" +// @failure 401 {object} api.ErrorResponse "Unauthorized" +// @failure 404 {object} api.ErrorResponse "Job does not exist" +// @failure 500 {object} api.ErrorResponse "Internal Server Error" +// @security ApiKeyAuth +// @router /api/jobs/edit_meta/ [patch] +func (api *RestAPI) editMetaByRequest(rw http.ResponseWriter, r *http.Request) { + // Parse request body + req := JobMetaRequest{} + if err := decode(r.Body, &req); err != nil { + handleError(fmt.Errorf("parsing request body failed: %w", err), http.StatusBadRequest, rw) + return + } + + // Fetch job (that will have its meta_data edited) from db + var job *schema.Job + var err error + if req.JobId == nil { + handleError(errors.New("the field 'jobId' is required"), http.StatusBadRequest, rw) + return + } + + // log.Printf("loading db job for editMetaByRequest... 
: JobMetaRequest=%v", req) + job, err = api.JobRepository.Find(req.JobId, req.Cluster, req.StartTime) + if err != nil { + handleError(fmt.Errorf("finding job failed: %w", err), http.StatusUnprocessableEntity, rw) + return + } + + if err := api.JobRepository.UpdateMetadata(job, req.Payload.Key, req.Payload.Value); err != nil { + http.Error(rw, err.Error(), http.StatusInternalServerError) + return + } + + rw.Header().Add("Content-Type", "application/json") + rw.WriteHeader(http.StatusOK) + json.NewEncoder(rw).Encode(job) +} + // tagJob godoc // @summary Adds one or more tags to a job // @tags Job add and modify @@ -478,7 +534,7 @@ func (api *RestAPI) editMeta(rw http.ResponseWriter, r *http.Request) { // @accept json // @produce json // @param id path int true "Job Database ID" -// @param request body api.TagJobApiRequest true "Array of tag-objects to add" +// @param request body api.TagJobAPIRequest true "Array of tag-objects to add" // @success 200 {object} schema.Job "Updated job resource" // @failure 400 {object} api.ErrorResponse "Bad Request" // @failure 401 {object} api.ErrorResponse "Unauthorized" @@ -487,13 +543,13 @@ func (api *RestAPI) editMeta(rw http.ResponseWriter, r *http.Request) { // @security ApiKeyAuth // @router /api/jobs/tag_job/{id} [post] func (api *RestAPI) tagJob(rw http.ResponseWriter, r *http.Request) { - id, err := strconv.ParseInt(mux.Vars(r)["id"], 10, 64) + id, err := strconv.ParseInt(chi.URLParam(r, "id"), 10, 64) if err != nil { handleError(fmt.Errorf("parsing job ID failed: %w", err), http.StatusBadRequest, rw) return } - job, err := api.JobRepository.FindById(r.Context(), id) + job, err := api.JobRepository.FindByID(r.Context(), id) if err != nil { handleError(fmt.Errorf("finding job failed: %w", err), http.StatusNotFound, rw) return @@ -542,7 +598,7 @@ func (api *RestAPI) tagJob(rw http.ResponseWriter, r *http.Request) { // @accept json // @produce json // @param id path int true "Job Database ID" -// @param request body 
api.TagJobApiRequest true "Array of tag-objects to remove" +// @param request body api.TagJobAPIRequest true "Array of tag-objects to remove" // @success 200 {object} schema.Job "Updated job resource" // @failure 400 {object} api.ErrorResponse "Bad Request" // @failure 401 {object} api.ErrorResponse "Unauthorized" @@ -551,13 +607,13 @@ func (api *RestAPI) tagJob(rw http.ResponseWriter, r *http.Request) { // @security ApiKeyAuth // @router /jobs/tag_job/{id} [delete] func (api *RestAPI) removeTagJob(rw http.ResponseWriter, r *http.Request) { - id, err := strconv.ParseInt(mux.Vars(r)["id"], 10, 64) + id, err := strconv.ParseInt(chi.URLParam(r, "id"), 10, 64) if err != nil { handleError(fmt.Errorf("parsing job ID failed: %w", err), http.StatusBadRequest, rw) return } - job, err := api.JobRepository.FindById(r.Context(), id) + job, err := api.JobRepository.FindByID(r.Context(), id) if err != nil { handleError(fmt.Errorf("finding job failed: %w", err), http.StatusNotFound, rw) return @@ -606,7 +662,7 @@ func (api *RestAPI) removeTagJob(rw http.ResponseWriter, r *http.Request) { // @description Tag wills be removed from respective archive files. 
// @accept json // @produce plain -// @param request body api.TagJobApiRequest true "Array of tag-objects to remove" +// @param request body api.TagJobAPIRequest true "Array of tag-objects to remove" // @success 200 {string} string "Success Response" // @failure 400 {object} api.ErrorResponse "Bad Request" // @failure 401 {object} api.ErrorResponse "Unauthorized" @@ -650,7 +706,7 @@ func (api *RestAPI) removeTags(rw http.ResponseWriter, r *http.Request) { // @accept json // @produce json // @param request body schema.Job true "Job to add" -// @success 201 {object} api.DefaultApiResponse "Job added successfully" +// @success 201 {object} api.DefaultAPIResponse "Job added successfully" // @failure 400 {object} api.ErrorResponse "Bad Request" // @failure 401 {object} api.ErrorResponse "Unauthorized" // @failure 403 {object} api.ErrorResponse "Forbidden" @@ -691,13 +747,21 @@ func (api *RestAPI) startJob(rw http.ResponseWriter, r *http.Request) { for _, job := range jobs { // Check if jobs are within the same day (prevent duplicates) if (req.StartTime - job.StartTime) < secondsPerDay { - handleError(fmt.Errorf("a job with that jobId, cluster and startTime already exists: dbid: %d, jobid: %d", job.ID, job.JobID), http.StatusUnprocessableEntity, rw) + handleError(fmt.Errorf("a job with that jobId, cluster and startTime already exists: dbid: %d, jobid: %d", *job.ID, job.JobID), http.StatusUnprocessableEntity, rw) return } } } - id, err := api.JobRepository.Start(&req) + // When tags are present, insert directly into the job table so that the + // returned ID can be used with AddTagOrCreate (which queries the job table). + // Jobs without tags use the cache path as before. 
+ var id int64 + if len(req.Tags) > 0 { + id, err = api.JobRepository.StartDirect(&req) + } else { + id, err = api.JobRepository.Start(&req) + } if err != nil { handleError(fmt.Errorf("insert into database failed: %w", err), http.StatusInternalServerError, rw) return @@ -728,7 +792,7 @@ func (api *RestAPI) startJob(rw http.ResponseWriter, r *http.Request) { // @description Job to stop is specified by request body. All fields are required in this case. // @description Returns full job resource information according to 'Job' scheme. // @produce json -// @param request body api.StopJobApiRequest true "All fields required" +// @param request body api.StopJobAPIRequest true "All fields required" // @success 200 {object} schema.Job "Success message" // @failure 400 {object} api.ErrorResponse "Bad Request" // @failure 401 {object} api.ErrorResponse "Unauthorized" @@ -754,20 +818,20 @@ func (api *RestAPI) stopJobByRequest(rw http.ResponseWriter, r *http.Request) { return } - // cclog.Printf("loading db job for stopJobByRequest... 
: stopJobApiRequest=%v", req) - job, err = api.JobRepository.Find(req.JobID, req.Cluster, req.StartTime) + isCached := false + job, err = api.JobRepository.FindCached(req.JobID, req.Cluster, req.StartTime) if err != nil { - // Try cached jobs if not found in main repository - cachedJob, cachedErr := api.JobRepository.FindCached(req.JobID, req.Cluster, req.StartTime) - if cachedErr != nil { - // Combine both errors for better debugging - handleError(fmt.Errorf("finding job failed: %w (cached lookup also failed: %v)", err, cachedErr), http.StatusNotFound, rw) + // Not in cache, try main job table + job, err = api.JobRepository.Find(req.JobID, req.Cluster, req.StartTime) + if err != nil { + handleError(fmt.Errorf("finding job failed: %w", err), http.StatusNotFound, rw) return } - job = cachedJob + } else { + isCached = true } - api.checkAndHandleStopJob(rw, job, req) + api.checkAndHandleStopJob(rw, job, req, isCached) } // deleteJobByID godoc @@ -776,7 +840,7 @@ func (api *RestAPI) stopJobByRequest(rw http.ResponseWriter, r *http.Request) { // @description Job to remove is specified by database ID. This will not remove the job from the job archive. 
// @produce json // @param id path int true "Database ID of Job" -// @success 200 {object} api.DefaultApiResponse "Success message" +// @success 200 {object} api.DefaultAPIResponse "Success message" // @failure 400 {object} api.ErrorResponse "Bad Request" // @failure 401 {object} api.ErrorResponse "Unauthorized" // @failure 403 {object} api.ErrorResponse "Forbidden" @@ -787,16 +851,16 @@ func (api *RestAPI) stopJobByRequest(rw http.ResponseWriter, r *http.Request) { // @router /api/jobs/delete_job/{id} [delete] func (api *RestAPI) deleteJobByID(rw http.ResponseWriter, r *http.Request) { // Fetch job (that will be stopped) from db - id, ok := mux.Vars(r)["id"] + id := chi.URLParam(r, "id") var err error - if ok { + if id != "" { id, e := strconv.ParseInt(id, 10, 64) if e != nil { handleError(fmt.Errorf("integer expected in path for id: %w", e), http.StatusBadRequest, rw) return } - err = api.JobRepository.DeleteJobById(id) + err = api.JobRepository.DeleteJobByID(id) } else { handleError(errors.New("the parameter 'id' is required"), http.StatusBadRequest, rw) return @@ -820,8 +884,8 @@ func (api *RestAPI) deleteJobByID(rw http.ResponseWriter, r *http.Request) { // @description Job to delete is specified by request body. All fields are required in this case. 
// @accept json // @produce json -// @param request body api.DeleteJobApiRequest true "All fields required" -// @success 200 {object} api.DefaultApiResponse "Success message" +// @param request body api.DeleteJobAPIRequest true "All fields required" +// @success 200 {object} api.DefaultAPIResponse "Success message" // @failure 400 {object} api.ErrorResponse "Bad Request" // @failure 401 {object} api.ErrorResponse "Unauthorized" // @failure 403 {object} api.ErrorResponse "Forbidden" @@ -852,7 +916,7 @@ func (api *RestAPI) deleteJobByRequest(rw http.ResponseWriter, r *http.Request) return } - err = api.JobRepository.DeleteJobById(*job.ID) + err = api.JobRepository.DeleteJobByID(*job.ID) if err != nil { handleError(fmt.Errorf("deleting job failed: %w", err), http.StatusUnprocessableEntity, rw) return @@ -861,7 +925,7 @@ func (api *RestAPI) deleteJobByRequest(rw http.ResponseWriter, r *http.Request) rw.Header().Add("Content-Type", "application/json") rw.WriteHeader(http.StatusOK) if err := json.NewEncoder(rw).Encode(DefaultAPIResponse{ - Message: fmt.Sprintf("Successfully deleted job %d", job.ID), + Message: fmt.Sprintf("Successfully deleted job %d", *job.ID), }); err != nil { cclog.Errorf("Failed to encode response: %v", err) } @@ -873,7 +937,7 @@ func (api *RestAPI) deleteJobByRequest(rw http.ResponseWriter, r *http.Request) // @description Remove all jobs with start time before timestamp. The jobs will not be removed from the job archive. 
// @produce json // @param ts path int true "Unix epoch timestamp" -// @success 200 {object} api.DefaultApiResponse "Success message" +// @success 200 {object} api.DefaultAPIResponse "Success message" // @failure 400 {object} api.ErrorResponse "Bad Request" // @failure 401 {object} api.ErrorResponse "Unauthorized" // @failure 403 {object} api.ErrorResponse "Forbidden" @@ -886,9 +950,9 @@ func (api *RestAPI) deleteJobByRequest(rw http.ResponseWriter, r *http.Request) func (api *RestAPI) deleteJobBefore(rw http.ResponseWriter, r *http.Request) { var cnt int // Fetch job (that will be stopped) from db - id, ok := mux.Vars(r)["ts"] + id := chi.URLParam(r, "ts") var err error - if ok { + if id != "" { ts, e := strconv.ParseInt(id, 10, 64) if e != nil { handleError(fmt.Errorf("integer expected in path for ts: %w", e), http.StatusBadRequest, rw) @@ -896,11 +960,13 @@ func (api *RestAPI) deleteJobBefore(rw http.ResponseWriter, r *http.Request) { } // Check for omit-tagged query parameter - omitTagged := false + omitTagged := "none" if omitTaggedStr := r.URL.Query().Get("omit-tagged"); omitTaggedStr != "" { - omitTagged, e = strconv.ParseBool(omitTaggedStr) - if e != nil { - handleError(fmt.Errorf("boolean expected for omit-tagged parameter: %w", e), http.StatusBadRequest, rw) + switch omitTaggedStr { + case "none", "all", "user": + omitTagged = omitTaggedStr + default: + handleError(fmt.Errorf("omit-tagged must be one of: none, all, user"), http.StatusBadRequest, rw) return } } @@ -924,20 +990,20 @@ func (api *RestAPI) deleteJobBefore(rw http.ResponseWriter, r *http.Request) { } } -func (api *RestAPI) checkAndHandleStopJob(rw http.ResponseWriter, job *schema.Job, req StopJobAPIRequest) { +func (api *RestAPI) checkAndHandleStopJob(rw http.ResponseWriter, job *schema.Job, req StopJobAPIRequest, isCached bool) { // Sanity checks if job.State != schema.JobStateRunning { - handleError(fmt.Errorf("jobId %d (id %d) on %s : job has already been stopped (state is: %s)", job.JobID, 
job.ID, job.Cluster, job.State), http.StatusUnprocessableEntity, rw) + handleError(fmt.Errorf("jobId %d (id %d) on %s : job has already been stopped (state is: %s)", job.JobID, *job.ID, job.Cluster, job.State), http.StatusUnprocessableEntity, rw) return } if job.StartTime > req.StopTime { - handleError(fmt.Errorf("jobId %d (id %d) on %s : stopTime %d must be larger/equal than startTime %d", job.JobID, job.ID, job.Cluster, req.StopTime, job.StartTime), http.StatusBadRequest, rw) + handleError(fmt.Errorf("jobId %d (id %d) on %s : stopTime %d must be larger/equal than startTime %d", job.JobID, *job.ID, job.Cluster, req.StopTime, job.StartTime), http.StatusBadRequest, rw) return } if req.State != "" && !req.State.Valid() { - handleError(fmt.Errorf("jobId %d (id %d) on %s : invalid requested job state: %#v", job.JobID, job.ID, job.Cluster, req.State), http.StatusBadRequest, rw) + handleError(fmt.Errorf("jobId %d (id %d) on %s : invalid requested job state: %#v", job.JobID, *job.ID, job.Cluster, req.State), http.StatusBadRequest, rw) return } else if req.State == "" { req.State = schema.JobStateCompleted @@ -949,14 +1015,24 @@ func (api *RestAPI) checkAndHandleStopJob(rw http.ResponseWriter, job *schema.Jo api.JobRepository.Mutex.Lock() defer api.JobRepository.Mutex.Unlock() - if err := api.JobRepository.Stop(*job.ID, job.Duration, job.State, job.MonitoringStatus); err != nil { - if err := api.JobRepository.StopCached(*job.ID, job.Duration, job.State, job.MonitoringStatus); err != nil { - handleError(fmt.Errorf("jobId %d (id %d) on %s : marking job as '%s' (duration: %d) in DB failed: %w", job.JobID, job.ID, job.Cluster, job.State, job.Duration, err), http.StatusInternalServerError, rw) + // If the job is still in job_cache, transfer it to the job table first + // so that job.ID always points to the job table for downstream code + if isCached { + newID, err := api.JobRepository.TransferCachedJobToMain(*job.ID) + if err != nil { + handleError(fmt.Errorf("jobId %d (id %d) 
on %s : transferring cached job failed: %w", job.JobID, *job.ID, job.Cluster, err), http.StatusInternalServerError, rw) return } + cclog.Infof("transferred cached job to main table: old id %d -> new id %d (jobId=%d)", *job.ID, newID, job.JobID) + job.ID = &newID } - cclog.Infof("archiving job... (dbid: %d): cluster=%s, jobId=%d, user=%s, startTime=%d, duration=%d, state=%s", job.ID, job.Cluster, job.JobID, job.User, job.StartTime, job.Duration, job.State) + if err := api.JobRepository.Stop(*job.ID, job.Duration, job.State, job.MonitoringStatus); err != nil { + handleError(fmt.Errorf("jobId %d (id %d) on %s : marking job as '%s' (duration: %d) in DB failed: %w", job.JobID, *job.ID, job.Cluster, job.State, job.Duration, err), http.StatusInternalServerError, rw) + return + } + + cclog.Infof("archiving job... (dbid: %d): cluster=%s, jobId=%d, user=%s, startTime=%d, duration=%d, state=%s", *job.ID, job.Cluster, job.JobID, job.User, job.StartTime, job.Duration, job.State) // Send a response (with status OK). This means that errors that happen from here on forward // can *NOT* be communicated to the client. 
If reading from a MetricDataRepository or @@ -977,7 +1053,7 @@ func (api *RestAPI) checkAndHandleStopJob(rw http.ResponseWriter, job *schema.Jo } func (api *RestAPI) getJobMetrics(rw http.ResponseWriter, r *http.Request) { - id := mux.Vars(r)["id"] + id := chi.URLParam(r, "id") metrics := r.URL.Query()["metric"] var scopes []schema.MetricScope for _, scope := range r.URL.Query()["scope"] { @@ -1022,3 +1098,57 @@ func (api *RestAPI) getJobMetrics(rw http.ResponseWriter, r *http.Request) { cclog.Errorf("Failed to encode response: %v", err) } } + +// GetUsedNodesAPIResponse model +type GetUsedNodesAPIResponse struct { + UsedNodes map[string][]string `json:"usedNodes"` // Map of cluster names to lists of used node hostnames +} + +// getUsedNodes godoc +// @summary Lists used nodes by cluster +// @tags Job query +// @description Get a map of cluster names to lists of unique hostnames that are currently in use by running jobs that started before the specified timestamp. +// @produce json +// @param ts query int true "Unix timestamp to filter jobs (jobs with start_time < ts)" +// @success 200 {object} api.GetUsedNodesAPIResponse "Map of cluster names to hostname lists" +// @failure 400 {object} api.ErrorResponse "Bad Request" +// @failure 401 {object} api.ErrorResponse "Unauthorized" +// @failure 403 {object} api.ErrorResponse "Forbidden" +// @failure 500 {object} api.ErrorResponse "Internal Server Error" +// @security ApiKeyAuth +// @router /api/jobs/used_nodes [get] +func (api *RestAPI) getUsedNodes(rw http.ResponseWriter, r *http.Request) { + if user := repository.GetUserFromContext(r.Context()); user != nil && + !user.HasRole(schema.RoleAPI) { + handleError(fmt.Errorf("missing role: %v", schema.GetRoleString(schema.RoleAPI)), http.StatusForbidden, rw) + return + } + + tsStr := r.URL.Query().Get("ts") + if tsStr == "" { + handleError(fmt.Errorf("missing required query parameter: ts"), http.StatusBadRequest, rw) + return + } + + ts, err := strconv.ParseInt(tsStr, 10, 
64) + if err != nil { + handleError(fmt.Errorf("invalid timestamp format: %w", err), http.StatusBadRequest, rw) + return + } + + usedNodes, err := api.JobRepository.GetUsedNodes(ts) + if err != nil { + handleError(fmt.Errorf("failed to get used nodes: %w", err), http.StatusInternalServerError, rw) + return + } + + rw.Header().Add("Content-Type", "application/json") + payload := GetUsedNodesAPIResponse{ + UsedNodes: usedNodes, + } + + if err := json.NewEncoder(rw).Encode(payload); err != nil { + handleError(err, http.StatusInternalServerError, rw) + return + } +} diff --git a/internal/api/log.go b/internal/api/log.go new file mode 100644 index 00000000..90add9bb --- /dev/null +++ b/internal/api/log.go @@ -0,0 +1,165 @@ +// Copyright (C) NHR@FAU, University Erlangen-Nuremberg. +// All rights reserved. This file is part of cc-backend. +// Use of this source code is governed by a MIT-style +// license that can be found in the LICENSE file. +package api + +import ( + "bufio" + "encoding/json" + "fmt" + "net/http" + "os/exec" + "regexp" + "strconv" + "strings" + + "github.com/ClusterCockpit/cc-backend/internal/config" + "github.com/ClusterCockpit/cc-backend/internal/repository" + cclog "github.com/ClusterCockpit/cc-lib/v2/ccLogger" + "github.com/ClusterCockpit/cc-lib/v2/schema" +) + +type LogEntry struct { + Timestamp string `json:"timestamp"` + Priority int `json:"priority"` + Message string `json:"message"` + Unit string `json:"unit"` +} + +var safePattern = regexp.MustCompile(`^[a-zA-Z0-9 :\-\.]+$`) + +func (api *RestAPI) getJournalLog(rw http.ResponseWriter, r *http.Request) { + user := repository.GetUserFromContext(r.Context()) + if !user.HasRole(schema.RoleAdmin) { + handleError(fmt.Errorf("only admins are allowed to view logs"), http.StatusForbidden, rw) + return + } + + since := r.URL.Query().Get("since") + if since == "" { + since = "1 hour ago" + } + if !safePattern.MatchString(since) { + handleError(fmt.Errorf("invalid 'since' parameter"), 
http.StatusBadRequest, rw) + return + } + + lines := 200 + if l := r.URL.Query().Get("lines"); l != "" { + n, err := strconv.Atoi(l) + if err != nil || n < 1 { + handleError(fmt.Errorf("invalid 'lines' parameter"), http.StatusBadRequest, rw) + return + } + if n > 1000 { + n = 1000 + } + lines = n + } + + unit := config.Keys.SystemdUnit + if unit == "" { + unit = "clustercockpit.service" + } + + args := []string{ + "--output=json", + "--no-pager", + "-n", fmt.Sprintf("%d", lines), + "--since", since, + "-u", unit, + } + + if level := r.URL.Query().Get("level"); level != "" { + n, err := strconv.Atoi(level) + if err != nil || n < 0 || n > 7 { + handleError(fmt.Errorf("invalid 'level' parameter (must be 0-7)"), http.StatusBadRequest, rw) + return + } + args = append(args, "--priority", fmt.Sprintf("%d", n)) + } + + if search := r.URL.Query().Get("search"); search != "" { + if !safePattern.MatchString(search) { + handleError(fmt.Errorf("invalid 'search' parameter"), http.StatusBadRequest, rw) + return + } + args = append(args, "--grep", search) + } + + cclog.Debugf("calling journalctl with %s", strings.Join(args, " ")) + cmd := exec.CommandContext(r.Context(), "journalctl", args...) 
+ stdout, err := cmd.StdoutPipe() + if err != nil { + handleError(fmt.Errorf("failed to create pipe: %w", err), http.StatusInternalServerError, rw) + return + } + + if err := cmd.Start(); err != nil { + handleError(fmt.Errorf("failed to start journalctl: %w", err), http.StatusInternalServerError, rw) + return + } + + entries := make([]LogEntry, 0, lines) + scanner := bufio.NewScanner(stdout) + for scanner.Scan() { + var raw map[string]any + if err := json.Unmarshal(scanner.Bytes(), &raw); err != nil { + cclog.Debugf("error unmarshal log output: %v", err) + continue + } + + priority := 6 // default info + if p, ok := raw["PRIORITY"]; ok { + switch v := p.(type) { + case string: + if n, err := strconv.Atoi(v); err == nil { + priority = n + } + case float64: + priority = int(v) + } + } + + msg := "" + if m, ok := raw["MESSAGE"]; ok { + if s, ok := m.(string); ok { + msg = s + } + } + + ts := "" + if t, ok := raw["__REALTIME_TIMESTAMP"]; ok { + if s, ok := t.(string); ok { + ts = s + } + } + + unitName := "" + if u, ok := raw["_SYSTEMD_UNIT"]; ok { + if s, ok := u.(string); ok { + unitName = s + } + } + + entries = append(entries, LogEntry{ + Timestamp: ts, + Priority: priority, + Message: msg, + Unit: unitName, + }) + } + + if err := cmd.Wait(); err != nil { + // journalctl returns exit code 1 when --grep matches nothing + if len(entries) == 0 { + cclog.Debugf("journalctl exited with: %v", err) + } + } + + rw.Header().Set("Content-Type", "application/json") + if err := json.NewEncoder(rw).Encode(entries); err != nil { + cclog.Errorf("Failed to encode log entries: %v", err) + } +} diff --git a/internal/api/memorystore.go b/internal/api/metricstore.go similarity index 69% rename from internal/api/memorystore.go rename to internal/api/metricstore.go index 1b883792..325b26ba 100644 --- a/internal/api/memorystore.go +++ b/internal/api/metricstore.go @@ -10,15 +10,14 @@ import ( "encoding/json" "errors" "fmt" - "io" "net/http" "strconv" "strings" - 
"github.com/ClusterCockpit/cc-backend/internal/memorystore" - cclog "github.com/ClusterCockpit/cc-lib/ccLogger" + "github.com/ClusterCockpit/cc-backend/pkg/metricstore" + cclog "github.com/ClusterCockpit/cc-lib/v2/ccLogger" - "github.com/influxdata/line-protocol/v2/lineprotocol" + "github.com/ClusterCockpit/cc-line-protocol/v2/lineprotocol" ) // handleFree godoc @@ -58,7 +57,7 @@ func freeMetrics(rw http.ResponseWriter, r *http.Request) { return } - ms := memorystore.GetMemoryStore() + ms := metricstore.GetMemoryStore() n := 0 for _, sel := range selectors { bn, err := ms.Free(sel, to) @@ -90,16 +89,17 @@ func freeMetrics(rw http.ResponseWriter, r *http.Request) { // @security ApiKeyAuth // @router /write/ [post] func writeMetrics(rw http.ResponseWriter, r *http.Request) { - bytes, err := io.ReadAll(r.Body) rw.Header().Add("Content-Type", "application/json") - if err != nil { - handleError(err, http.StatusInternalServerError, rw) - return - } - ms := memorystore.GetMemoryStore() - dec := lineprotocol.NewDecoderWithBytes(bytes) - if err := memorystore.DecodeLine(dec, ms, r.URL.Query().Get("cluster")); err != nil { + // Extract the "cluster" query parameter without allocating a url.Values map. + cluster := queryParam(r.URL.RawQuery, "cluster") + + // Stream directly from the request body instead of copying it into a + // temporary buffer via io.ReadAll. The line-protocol decoder supports + // io.Reader natively, so this avoids the largest heap allocation. + ms := metricstore.GetMemoryStore() + dec := lineprotocol.NewDecoder(r.Body) + if err := metricstore.DecodeLine(dec, ms, cluster); err != nil { cclog.Errorf("/api/write error: %s", err.Error()) handleError(err, http.StatusBadRequest, rw) return @@ -107,6 +107,20 @@ func writeMetrics(rw http.ResponseWriter, r *http.Request) { rw.WriteHeader(http.StatusOK) } +// queryParam extracts a single query-parameter value from a raw query string +// without allocating a url.Values map. Returns "" if the key is not present. 
+func queryParam(raw, key string) string { + for raw != "" { + var kv string + kv, raw, _ = strings.Cut(raw, "&") + k, v, _ := strings.Cut(kv, "=") + if k == key { + return v + } + } + return "" +} + // handleDebug godoc // @summary Debug endpoint // @tags debug @@ -129,42 +143,9 @@ func debugMetrics(rw http.ResponseWriter, r *http.Request) { selector = strings.Split(raw, ":") } - ms := memorystore.GetMemoryStore() + ms := metricstore.GetMemoryStore() if err := ms.DebugDump(bufio.NewWriter(rw), selector); err != nil { handleError(err, http.StatusBadRequest, rw) return } } - -// handleHealthCheck godoc -// @summary HealthCheck endpoint -// @tags healthcheck -// @description This endpoint allows the users to check if a node is healthy -// @produce json -// @param selector query string false "Selector" -// @success 200 {string} string "Debug dump" -// @failure 400 {object} api.ErrorResponse "Bad Request" -// @failure 401 {object} api.ErrorResponse "Unauthorized" -// @failure 403 {object} api.ErrorResponse "Forbidden" -// @failure 500 {object} api.ErrorResponse "Internal Server Error" -// @security ApiKeyAuth -// @router /healthcheck/ [get] -func metricsHealth(rw http.ResponseWriter, r *http.Request) { - rawCluster := r.URL.Query().Get("cluster") - rawNode := r.URL.Query().Get("node") - - if rawCluster == "" || rawNode == "" { - handleError(errors.New("'cluster' and 'node' are required query parameter"), http.StatusBadRequest, rw) - return - } - - rw.Header().Add("Content-Type", "application/json") - - selector := []string{rawCluster, rawNode} - - ms := memorystore.GetMemoryStore() - if err := ms.HealthCheck(bufio.NewWriter(rw), selector); err != nil { - handleError(err, http.StatusBadRequest, rw) - return - } -} diff --git a/internal/api/nats.go b/internal/api/nats.go new file mode 100644 index 00000000..efa4ab6f --- /dev/null +++ b/internal/api/nats.go @@ -0,0 +1,400 @@ +// Copyright (C) NHR@FAU, University Erlangen-Nuremberg. +// All rights reserved. 
This file is part of cc-backend. +// Use of this source code is governed by a MIT-style +// license that can be found in the LICENSE file. + +package api + +import ( + "database/sql" + "encoding/json" + "strings" + "sync" + "time" + + "github.com/ClusterCockpit/cc-backend/internal/archiver" + "github.com/ClusterCockpit/cc-backend/internal/config" + "github.com/ClusterCockpit/cc-backend/internal/importer" + "github.com/ClusterCockpit/cc-backend/internal/repository" + cclog "github.com/ClusterCockpit/cc-lib/v2/ccLogger" + lp "github.com/ClusterCockpit/cc-lib/v2/ccMessage" + "github.com/ClusterCockpit/cc-lib/v2/nats" + "github.com/ClusterCockpit/cc-lib/v2/receivers" + "github.com/ClusterCockpit/cc-lib/v2/schema" + influx "github.com/ClusterCockpit/cc-line-protocol/v2/lineprotocol" +) + +// NatsAPI provides NATS subscription-based handlers for Job and Node operations. +// It mirrors the functionality of the REST API but uses NATS messaging with +// InfluxDB line protocol as the message format. +// +// # Message Format +// +// All NATS messages use InfluxDB line protocol format (https://docs.influxdata.com/influxdb/v2.0/reference/syntax/line-protocol/) +// with the following structure: +// +// measurement,tag1=value1,tag2=value2 field1=value1,field2=value2 timestamp +// +// # Job Events +// +// Job start/stop events use the "job" measurement with a "function" tag to distinguish operations: +// +// job,function=start_job event="{...JSON payload...}" +// job,function=stop_job event="{...JSON payload...}" +// +// The JSON payload in the "event" field follows the schema.Job or StopJobAPIRequest structure. 
+// +// Example job start message: +// +// job,function=start_job event="{\"jobId\":1001,\"user\":\"testuser\",\"cluster\":\"testcluster\",...}" 1234567890000000000 +// +// # Node State Events +// +// Node state updates use the "nodestate" measurement with cluster information: +// +// nodestate event="{...JSON payload...}" +// +// The JSON payload follows the UpdateNodeStatesRequest structure. +// +// Example node state message: +// +// nodestate event="{\"cluster\":\"testcluster\",\"nodes\":[{\"hostname\":\"node01\",\"states\":[\"idle\"]}]}" 1234567890000000000 +type NatsAPI struct { + JobRepository *repository.JobRepository + // RepositoryMutex protects job creation operations from race conditions + // when checking for duplicate jobs during startJob calls. + RepositoryMutex sync.Mutex +} + +// NewNatsAPI creates a new NatsAPI instance with default dependencies. +func NewNatsAPI() *NatsAPI { + return &NatsAPI{ + JobRepository: repository.GetJobRepository(), + } +} + +// StartSubscriptions registers all NATS subscriptions for Job and Node APIs. +// Returns an error if the NATS client is not available or subscription fails. +func (api *NatsAPI) StartSubscriptions() error { + client := nats.GetClient() + if client == nil { + cclog.Warn("NATS client not available, skipping API subscriptions") + return nil + } + + if config.Keys.APISubjects != nil { + + s := config.Keys.APISubjects + + if err := client.Subscribe(s.SubjectJobEvent, api.handleJobEvent); err != nil { + return err + } + + if err := client.Subscribe(s.SubjectNodeState, api.handleNodeState); err != nil { + return err + } + + cclog.Info("NATS API subscriptions started") + } + return nil +} + +// processJobEvent routes job event messages to the appropriate handler based on the "function" tag. +// Validates that required tags and fields are present before processing. 
+func (api *NatsAPI) processJobEvent(msg lp.CCMessage) { + function, ok := msg.GetTag("function") + if !ok { + cclog.Errorf("Job event is missing required tag 'function': measurement=%s", msg.Name()) + return + } + + switch function { + case "start_job": + v, ok := msg.GetEventValue() + if !ok { + cclog.Errorf("Job start event is missing event field with JSON payload") + return + } + api.handleStartJob(v) + + case "stop_job": + v, ok := msg.GetEventValue() + if !ok { + cclog.Errorf("Job stop event is missing event field with JSON payload") + return + } + api.handleStopJob(v) + + default: + cclog.Warnf("Unknown job event function '%s', expected 'start_job' or 'stop_job'", function) + } +} + +// handleJobEvent processes job-related messages received via NATS using InfluxDB line protocol. +// The message must be in line protocol format with measurement="job" and include: +// - tag "function" with value "start_job" or "stop_job" +// - field "event" containing JSON payload (schema.Job or StopJobAPIRequest) +// +// Example: job,function=start_job event="{\"jobId\":1001,...}" 1234567890000000000 +func (api *NatsAPI) handleJobEvent(subject string, data []byte) { + if len(data) == 0 { + cclog.Warnf("NATS %s: received empty message", subject) + return + } + + d := influx.NewDecoderWithBytes(data) + + for d.Next() { + m, err := receivers.DecodeInfluxMessage(d) + if err != nil { + cclog.Errorf("NATS %s: failed to decode InfluxDB line protocol message: %v", subject, err) + return + } + + if !m.IsEvent() { + cclog.Debugf("NATS %s: received non-event message, skipping", subject) + continue + } + + if m.Name() == "job" { + api.processJobEvent(m) + } else { + cclog.Debugf("NATS %s: unexpected measurement name '%s', expected 'job'", subject, m.Name()) + } + } +} + +// handleStartJob processes job start messages received via NATS. +// The payload parameter contains JSON following the schema.Job structure. +// Jobs are validated, checked for duplicates, and inserted into the database. 
+func (api *NatsAPI) handleStartJob(payload string) { + if payload == "" { + cclog.Error("NATS start job: payload is empty") + return + } + req := schema.Job{ + Shared: "none", + MonitoringStatus: schema.MonitoringStatusRunningOrArchiving, + } + + dec := json.NewDecoder(strings.NewReader(payload)) + dec.DisallowUnknownFields() + if err := dec.Decode(&req); err != nil { + cclog.Errorf("NATS start job: parsing request failed: %v", err) + return + } + + cclog.Debugf("NATS start job: %s", req.GoString()) + req.State = schema.JobStateRunning + + if err := importer.SanityChecks(&req); err != nil { + cclog.Errorf("NATS start job: sanity check failed: %v", err) + return + } + + var unlockOnce sync.Once + api.RepositoryMutex.Lock() + defer unlockOnce.Do(api.RepositoryMutex.Unlock) + + jobs, err := api.JobRepository.FindAll(&req.JobID, &req.Cluster, nil) + if err != nil && err != sql.ErrNoRows { + cclog.Errorf("NATS start job: checking for duplicate failed: %v", err) + return + } + if err == nil { + for _, job := range jobs { + if (req.StartTime - job.StartTime) < secondsPerDay { + cclog.Errorf("NATS start job: job with jobId %d, cluster %s already exists (dbid: %d)", + req.JobID, req.Cluster, job.ID) + return + } + } + } + + // When tags are present, insert directly into the job table so that the + // returned ID can be used with AddTagOrCreate (which queries the job table). 
+ var id int64 + if len(req.Tags) > 0 { + id, err = api.JobRepository.StartDirect(&req) + } else { + id, err = api.JobRepository.Start(&req) + } + if err != nil { + cclog.Errorf("NATS start job: insert into database failed: %v", err) + return + } + unlockOnce.Do(api.RepositoryMutex.Unlock) + + for _, tag := range req.Tags { + if _, err := api.JobRepository.AddTagOrCreate(nil, id, tag.Type, tag.Name, tag.Scope); err != nil { + cclog.Errorf("NATS start job: adding tag to new job %d failed: %v", id, err) + return + } + } + + cclog.Infof("NATS: new job (id: %d): cluster=%s, jobId=%d, user=%s, startTime=%d", + id, req.Cluster, req.JobID, req.User, req.StartTime) +} + +// handleStopJob processes job stop messages received via NATS. +// The payload parameter contains JSON following the StopJobAPIRequest structure. +// The job is marked as stopped in the database and archiving is triggered if monitoring is enabled. +func (api *NatsAPI) handleStopJob(payload string) { + if payload == "" { + cclog.Error("NATS stop job: payload is empty") + return + } + var req StopJobAPIRequest + + dec := json.NewDecoder(strings.NewReader(payload)) + dec.DisallowUnknownFields() + if err := dec.Decode(&req); err != nil { + cclog.Errorf("NATS job stop: parsing request failed: %v", err) + return + } + + if req.JobID == nil { + cclog.Errorf("NATS job stop: the field 'jobId' is required") + return + } + + isCached := false + job, err := api.JobRepository.FindCached(req.JobID, req.Cluster, req.StartTime) + if err != nil { + // Not in cache, try main job table + job, err = api.JobRepository.Find(req.JobID, req.Cluster, req.StartTime) + if err != nil { + cclog.Errorf("NATS job stop: finding job failed: %v", err) + return + } + } else { + isCached = true + } + + if job.State != schema.JobStateRunning { + cclog.Errorf("NATS job stop: jobId %d (id %d) on %s: job has already been stopped (state is: %s)", + job.JobID, job.ID, job.Cluster, job.State) + return + } + + if job.StartTime > req.StopTime { + 
cclog.Errorf("NATS job stop: jobId %d (id %d) on %s: stopTime %d must be >= startTime %d", + job.JobID, job.ID, job.Cluster, req.StopTime, job.StartTime) + return + } + + if req.State != "" && !req.State.Valid() { + cclog.Errorf("NATS job stop: jobId %d (id %d) on %s: invalid job state: %#v", + job.JobID, job.ID, job.Cluster, req.State) + return + } else if req.State == "" { + req.State = schema.JobStateCompleted + } + + job.Duration = int32(req.StopTime - job.StartTime) + job.State = req.State + api.JobRepository.Mutex.Lock() + defer api.JobRepository.Mutex.Unlock() + + // If the job is still in job_cache, transfer it to the job table first + if isCached { + newID, err := api.JobRepository.TransferCachedJobToMain(*job.ID) + if err != nil { + cclog.Errorf("NATS job stop: jobId %d (id %d) on %s: transferring cached job failed: %v", + job.JobID, *job.ID, job.Cluster, err) + return + } + cclog.Infof("NATS: transferred cached job to main table: old id %d -> new id %d (jobId=%d)", *job.ID, newID, job.JobID) + job.ID = &newID + } + + if err := api.JobRepository.Stop(*job.ID, job.Duration, job.State, job.MonitoringStatus); err != nil { + cclog.Errorf("NATS job stop: jobId %d (id %d) on %s: marking job as '%s' failed: %v", + job.JobID, *job.ID, job.Cluster, job.State, err) + return + } + + cclog.Infof("NATS: archiving job (dbid: %d): cluster=%s, jobId=%d, user=%s, startTime=%d, duration=%d, state=%s", + *job.ID, job.Cluster, job.JobID, job.User, job.StartTime, job.Duration, job.State) + + if job.MonitoringStatus == schema.MonitoringStatusDisabled { + return + } + + archiver.TriggerArchiving(job) +} + +// processNodestateEvent extracts and processes node state data from the InfluxDB message. +// Updates node states in the repository for all nodes in the payload. 
+func (api *NatsAPI) processNodestateEvent(msg lp.CCMessage) { + v, ok := msg.GetEventValue() + if !ok { + cclog.Errorf("Nodestate event is missing event field with JSON payload") + return + } + + var req UpdateNodeStatesRequest + + dec := json.NewDecoder(strings.NewReader(v)) + dec.DisallowUnknownFields() + if err := dec.Decode(&req); err != nil { + cclog.Errorf("NATS nodestate: parsing request failed: %v", err) + return + } + + repo := repository.GetNodeRepository() + requestReceived := time.Now().Unix() + + for _, node := range req.Nodes { + state := determineState(node.States) + nodeState := schema.NodeStateDB{ + TimeStamp: requestReceived, + NodeState: state, + CpusAllocated: node.CpusAllocated, + MemoryAllocated: node.MemoryAllocated, + GpusAllocated: node.GpusAllocated, + HealthState: schema.MonitoringStateFull, + JobsRunning: node.JobsRunning, + } + + if err := repo.UpdateNodeState(node.Hostname, req.Cluster, &nodeState); err != nil { + cclog.Errorf("NATS nodestate: updating node state for %s on %s failed: %v", + node.Hostname, req.Cluster, err) + } + } + + cclog.Debugf("NATS nodestate: updated %d node states for cluster %s", len(req.Nodes), req.Cluster) +} + +// handleNodeState processes node state update messages received via NATS using InfluxDB line protocol. 
+// The message must be in line protocol format with measurement="nodestate" and include: +// - field "event" containing JSON payload (UpdateNodeStatesRequest) +// +// Example: nodestate event="{\"cluster\":\"testcluster\",\"nodes\":[...]}" 1234567890000000000 +func (api *NatsAPI) handleNodeState(subject string, data []byte) { + if len(data) == 0 { + cclog.Warnf("NATS %s: received empty message", subject) + return + } + + d := influx.NewDecoderWithBytes(data) + + for d.Next() { + m, err := receivers.DecodeInfluxMessage(d) + if err != nil { + cclog.Errorf("NATS %s: failed to decode InfluxDB line protocol message: %v", subject, err) + return + } + + if !m.IsEvent() { + cclog.Warnf("NATS %s: received non-event message, skipping", subject) + continue + } + + if m.Name() == "nodestate" { + api.processNodestateEvent(m) + } else { + cclog.Warnf("NATS %s: unexpected measurement name '%s', expected 'nodestate'", subject, m.Name()) + } + } +} diff --git a/internal/api/nats_test.go b/internal/api/nats_test.go new file mode 100644 index 00000000..b1d2a624 --- /dev/null +++ b/internal/api/nats_test.go @@ -0,0 +1,947 @@ +// Copyright (C) NHR@FAU, University Erlangen-Nuremberg. +// All rights reserved. This file is part of cc-backend. +// Use of this source code is governed by a MIT-style +// license that can be found in the LICENSE file. 
+package api + +import ( + "context" + "database/sql" + "encoding/json" + "fmt" + "os" + "path/filepath" + "testing" + "time" + + "github.com/ClusterCockpit/cc-backend/internal/archiver" + "github.com/ClusterCockpit/cc-backend/internal/auth" + "github.com/ClusterCockpit/cc-backend/internal/config" + "github.com/ClusterCockpit/cc-backend/internal/graph" + "github.com/ClusterCockpit/cc-backend/internal/repository" + "github.com/ClusterCockpit/cc-backend/pkg/archive" + "github.com/ClusterCockpit/cc-backend/pkg/metricstore" + ccconf "github.com/ClusterCockpit/cc-lib/v2/ccConfig" + cclog "github.com/ClusterCockpit/cc-lib/v2/ccLogger" + lp "github.com/ClusterCockpit/cc-lib/v2/ccMessage" + "github.com/ClusterCockpit/cc-lib/v2/schema" + + _ "github.com/mattn/go-sqlite3" +) + +func setupNatsTest(t *testing.T) *NatsAPI { + repository.ResetConnection() + + const testconfig = `{ + "main": { + "addr": "0.0.0.0:8080", + "validate": false, + "api-allowed-ips": [ + "*" + ] + }, + "archive": { + "kind": "file", + "path": "./var/job-archive" + }, + "auth": { + "jwts": { + "max-age": "2m" + } + } +}` + const testclusterJSON = `{ + "name": "testcluster", + "subClusters": [ + { + "name": "sc1", + "nodes": "host123,host124,host125", + "processorType": "Intel Core i7-4770", + "socketsPerNode": 1, + "coresPerSocket": 4, + "threadsPerCore": 2, + "flopRateScalar": { + "unit": { + "prefix": "G", + "base": "F/s" + }, + "value": 14 + }, + "flopRateSimd": { + "unit": { + "prefix": "G", + "base": "F/s" + }, + "value": 112 + }, + "memoryBandwidth": { + "unit": { + "prefix": "G", + "base": "B/s" + }, + "value": 24 + }, + "numberOfNodes": 70, + "topology": { + "node": [0, 1, 2, 3, 4, 5, 6, 7], + "socket": [[0, 1, 2, 3, 4, 5, 6, 7]], + "memoryDomain": [[0, 1, 2, 3, 4, 5, 6, 7]], + "die": [[0, 1, 2, 3, 4, 5, 6, 7]], + "core": [[0], [1], [2], [3], [4], [5], [6], [7]] + } + } + ], + "metricConfig": [ + { + "name": "load_one", + "unit": { "base": ""}, + "scope": "node", + "timestep": 60, + 
"aggregation": "avg", + "peak": 8, + "normal": 0, + "caution": 0, + "alert": 0 + } + ] + }` + + cclog.Init("info", true) + tmpdir := t.TempDir() + jobarchive := filepath.Join(tmpdir, "job-archive") + if err := os.Mkdir(jobarchive, 0o777); err != nil { + t.Fatal(err) + } + + if err := os.WriteFile(filepath.Join(jobarchive, "version.txt"), fmt.Appendf(nil, "%d", 3), 0o666); err != nil { + t.Fatal(err) + } + + if err := os.Mkdir(filepath.Join(jobarchive, "testcluster"), 0o777); err != nil { + t.Fatal(err) + } + + if err := os.WriteFile(filepath.Join(jobarchive, "testcluster", "cluster.json"), []byte(testclusterJSON), 0o666); err != nil { + t.Fatal(err) + } + + dbfilepath := filepath.Join(tmpdir, "test.db") + err := repository.MigrateDB(dbfilepath) + if err != nil { + t.Fatal(err) + } + + cfgFilePath := filepath.Join(tmpdir, "config.json") + if err := os.WriteFile(cfgFilePath, []byte(testconfig), 0o666); err != nil { + t.Fatal(err) + } + + ccconf.Init(cfgFilePath) + + // Load and check main configuration + if cfg := ccconf.GetPackageConfig("main"); cfg != nil { + config.Init(cfg) + } else { + cclog.Abort("Main configuration must be present") + } + archiveCfg := fmt.Sprintf("{\"kind\": \"file\",\"path\": \"%s\"}", jobarchive) + + repository.Connect(dbfilepath) + + if err := archive.Init(json.RawMessage(archiveCfg)); err != nil { + t.Fatal(err) + } + + // metricstore initialization removed - it's initialized via callback in tests + + archiver.Start(repository.GetJobRepository(), context.Background()) + + if cfg := ccconf.GetPackageConfig("auth"); cfg != nil { + auth.Init(&cfg) + } else { + cclog.Warn("Authentication disabled due to missing configuration") + auth.Init(nil) + } + + graph.Init() + + return NewNatsAPI() +} + +func cleanupNatsTest() { + if err := archiver.Shutdown(5 * time.Second); err != nil { + cclog.Warnf("Archiver shutdown timeout in tests: %v", err) + } +} + +func TestNatsHandleStartJob(t *testing.T) { + natsAPI := setupNatsTest(t) + 
t.Cleanup(cleanupNatsTest) + + tests := []struct { + name string + payload string + expectError bool + validateJob func(t *testing.T, job *schema.Job) + shouldFindJob bool + }{ + { + name: "valid job start", + payload: `{ + "jobId": 1001, + "user": "testuser1", + "project": "testproj1", + "cluster": "testcluster", + "partition": "main", + "walltime": 7200, + "numNodes": 1, + "numHwthreads": 8, + "numAcc": 0, + "shared": "none", + "monitoringStatus": 1, + "smt": 1, + "resources": [ + { + "hostname": "host123", + "hwthreads": [0, 1, 2, 3, 4, 5, 6, 7] + } + ], + "startTime": 1234567890 + }`, + expectError: false, + shouldFindJob: true, + validateJob: func(t *testing.T, job *schema.Job) { + if job.JobID != 1001 { + t.Errorf("expected JobID 1001, got %d", job.JobID) + } + if job.User != "testuser1" { + t.Errorf("expected user testuser1, got %s", job.User) + } + if job.State != schema.JobStateRunning { + t.Errorf("expected state running, got %s", job.State) + } + }, + }, + { + name: "invalid JSON", + payload: `{ + "jobId": "not a number", + "user": "testuser2" + }`, + expectError: true, + shouldFindJob: false, + }, + { + name: "missing required fields", + payload: `{ + "jobId": 1002 + }`, + expectError: true, + shouldFindJob: false, + }, + { + name: "job with unknown fields (should fail due to DisallowUnknownFields)", + payload: `{ + "jobId": 1003, + "user": "testuser3", + "project": "testproj3", + "cluster": "testcluster", + "partition": "main", + "walltime": 3600, + "numNodes": 1, + "numHwthreads": 8, + "unknownField": "should cause error", + "startTime": 1234567900 + }`, + expectError: true, + shouldFindJob: false, + }, + { + name: "job with tags", + payload: `{ + "jobId": 1004, + "user": "testuser4", + "project": "testproj4", + "cluster": "testcluster", + "partition": "main", + "walltime": 3600, + "numNodes": 1, + "numHwthreads": 8, + "numAcc": 0, + "shared": "none", + "monitoringStatus": 1, + "smt": 1, + "resources": [ + { + "hostname": "host123", + "hwthreads": [0, 
1, 2, 3] + } + ], + "tags": [ + { + "type": "test", + "name": "testtag", + "scope": "testuser4" + } + ], + "startTime": 1234567910 + }`, + expectError: false, + shouldFindJob: true, + validateJob: func(t *testing.T, job *schema.Job) { + if job.JobID != 1004 { + t.Errorf("expected JobID 1004, got %d", job.JobID) + } + }, + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + natsAPI.handleStartJob(tt.payload) + natsAPI.JobRepository.SyncJobs() + + // Allow some time for async operations + time.Sleep(100 * time.Millisecond) + + if tt.shouldFindJob { + // Extract jobId from payload + var payloadMap map[string]any + json.Unmarshal([]byte(tt.payload), &payloadMap) + jobID := int64(payloadMap["jobId"].(float64)) + cluster := payloadMap["cluster"].(string) + startTime := int64(payloadMap["startTime"].(float64)) + + job, err := natsAPI.JobRepository.Find(&jobID, &cluster, &startTime) + if err != nil { + if !tt.expectError { + t.Fatalf("expected to find job, but got error: %v", err) + } + return + } + + if tt.validateJob != nil { + tt.validateJob(t, job) + } + } + }) + } +} + +func TestNatsHandleStopJob(t *testing.T) { + natsAPI := setupNatsTest(t) + t.Cleanup(cleanupNatsTest) + + // First, create a running job + startPayload := `{ + "jobId": 2001, + "user": "testuser", + "project": "testproj", + "cluster": "testcluster", + "partition": "main", + "walltime": 3600, + "numNodes": 1, + "numHwthreads": 8, + "numAcc": 0, + "shared": "none", + "monitoringStatus": 1, + "smt": 1, + "resources": [ + { + "hostname": "host123", + "hwthreads": [0, 1, 2, 3, 4, 5, 6, 7] + } + ], + "startTime": 1234567890 + }` + + natsAPI.handleStartJob(startPayload) + natsAPI.JobRepository.SyncJobs() + time.Sleep(100 * time.Millisecond) + + tests := []struct { + name string + payload string + expectError bool + validateJob func(t *testing.T, job *schema.Job) + setupJobFunc func() // Optional: create specific test job + }{ + { + name: "valid job stop - completed", + payload: `{ + 
"jobId": 2001, + "cluster": "testcluster", + "startTime": 1234567890, + "jobState": "completed", + "stopTime": 1234571490 + }`, + expectError: false, + validateJob: func(t *testing.T, job *schema.Job) { + if job.State != schema.JobStateCompleted { + t.Errorf("expected state completed, got %s", job.State) + } + expectedDuration := int32(1234571490 - 1234567890) + if job.Duration != expectedDuration { + t.Errorf("expected duration %d, got %d", expectedDuration, job.Duration) + } + }, + }, + { + name: "valid job stop - failed", + setupJobFunc: func() { + startPayloadFailed := `{ + "jobId": 2002, + "user": "testuser", + "project": "testproj", + "cluster": "testcluster", + "partition": "main", + "walltime": 3600, + "numNodes": 1, + "numHwthreads": 8, + "numAcc": 0, + "shared": "none", + "monitoringStatus": 1, + "smt": 1, + "resources": [ + { + "hostname": "host123", + "hwthreads": [0, 1, 2, 3] + } + ], + "startTime": 1234567900 + }` + natsAPI.handleStartJob(startPayloadFailed) + natsAPI.JobRepository.SyncJobs() + time.Sleep(100 * time.Millisecond) + }, + payload: `{ + "jobId": 2002, + "cluster": "testcluster", + "startTime": 1234567900, + "jobState": "failed", + "stopTime": 1234569900 + }`, + expectError: false, + validateJob: func(t *testing.T, job *schema.Job) { + if job.State != schema.JobStateFailed { + t.Errorf("expected state failed, got %s", job.State) + } + }, + }, + { + name: "invalid JSON", + payload: `{ + "jobId": "not a number" + }`, + expectError: true, + }, + { + name: "missing jobId", + payload: `{ + "cluster": "testcluster", + "jobState": "completed", + "stopTime": 1234571490 + }`, + expectError: true, + }, + { + name: "invalid job state", + setupJobFunc: func() { + startPayloadInvalid := `{ + "jobId": 2003, + "user": "testuser", + "project": "testproj", + "cluster": "testcluster", + "partition": "main", + "walltime": 3600, + "numNodes": 1, + "numHwthreads": 8, + "numAcc": 0, + "shared": "none", + "monitoringStatus": 1, + "smt": 1, + "resources": [ + { + 
"hostname": "host123", + "hwthreads": [0, 1] + } + ], + "startTime": 1234567910 + }` + natsAPI.handleStartJob(startPayloadInvalid) + natsAPI.JobRepository.SyncJobs() + time.Sleep(100 * time.Millisecond) + }, + payload: `{ + "jobId": 2003, + "cluster": "testcluster", + "startTime": 1234567910, + "jobState": "invalid_state", + "stopTime": 1234571510 + }`, + expectError: true, + }, + { + name: "stopTime before startTime", + setupJobFunc: func() { + startPayloadTime := `{ + "jobId": 2004, + "user": "testuser", + "project": "testproj", + "cluster": "testcluster", + "partition": "main", + "walltime": 3600, + "numNodes": 1, + "numHwthreads": 8, + "numAcc": 0, + "shared": "none", + "monitoringStatus": 1, + "smt": 1, + "resources": [ + { + "hostname": "host123", + "hwthreads": [0] + } + ], + "startTime": 1234567920 + }` + natsAPI.handleStartJob(startPayloadTime) + natsAPI.JobRepository.SyncJobs() + time.Sleep(100 * time.Millisecond) + }, + payload: `{ + "jobId": 2004, + "cluster": "testcluster", + "startTime": 1234567920, + "jobState": "completed", + "stopTime": 1234567900 + }`, + expectError: true, + }, + { + name: "job not found", + payload: `{ + "jobId": 99999, + "cluster": "testcluster", + "startTime": 1234567890, + "jobState": "completed", + "stopTime": 1234571490 + }`, + expectError: true, + }, + } + + testData := schema.JobData{ + "load_one": map[schema.MetricScope]*schema.JobMetric{ + schema.MetricScopeNode: { + Unit: schema.Unit{Base: "load"}, + Timestep: 60, + Series: []schema.Series{ + { + Hostname: "host123", + Statistics: schema.MetricStatistics{Min: 0.1, Avg: 0.2, Max: 0.3}, + Data: []schema.Float{0.1, 0.1, 0.1, 0.2, 0.2, 0.2, 0.3, 0.3, 0.3}, + }, + }, + }, + }, + } + + metricstore.TestLoadDataCallback = func(job *schema.Job, metrics []string, scopes []schema.MetricScope, ctx context.Context, resolution int) (schema.JobData, error) { + return testData, nil + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + if tt.setupJobFunc != nil { + 
tt.setupJobFunc() + } + + natsAPI.handleStopJob(tt.payload) + + // Allow some time for async operations + time.Sleep(100 * time.Millisecond) + + if !tt.expectError && tt.validateJob != nil { + // Extract job details from payload + var payloadMap map[string]any + json.Unmarshal([]byte(tt.payload), &payloadMap) + jobID := int64(payloadMap["jobId"].(float64)) + cluster := payloadMap["cluster"].(string) + + var startTime *int64 + if st, ok := payloadMap["startTime"]; ok { + t := int64(st.(float64)) + startTime = &t + } + + job, err := natsAPI.JobRepository.Find(&jobID, &cluster, startTime) + if err != nil { + t.Fatalf("expected to find job, but got error: %v", err) + } + + tt.validateJob(t, job) + } + }) + } +} + +func TestNatsHandleNodeState(t *testing.T) { + natsAPI := setupNatsTest(t) + t.Cleanup(cleanupNatsTest) + + tests := []struct { + name string + data []byte + expectError bool + validateFn func(t *testing.T) + }{ + { + name: "valid node state update", + data: []byte(`nodestate event="{\"cluster\":\"testcluster\",\"nodes\":[{\"hostname\":\"host123\",\"states\":[\"allocated\"],\"cpusAllocated\":8,\"memoryAllocated\":16384,\"gpusAllocated\":0,\"jobsRunning\":1}]}" 1234567890000000000`), + expectError: false, + validateFn: func(t *testing.T) { + // In a full test, we would verify the node state was updated in the database + // For now, just ensure no error occurred + }, + }, + { + name: "multiple nodes", + data: []byte(`nodestate event="{\"cluster\":\"testcluster\",\"nodes\":[{\"hostname\":\"host123\",\"states\":[\"idle\"],\"cpusAllocated\":0,\"memoryAllocated\":0,\"gpusAllocated\":0,\"jobsRunning\":0},{\"hostname\":\"host124\",\"states\":[\"allocated\"],\"cpusAllocated\":4,\"memoryAllocated\":8192,\"gpusAllocated\":1,\"jobsRunning\":1}]}" 1234567890000000000`), + expectError: false, + }, + { + name: "invalid JSON in event field", + data: []byte(`nodestate event="{\"cluster\":\"testcluster\",\"nodes\":\"not an array\"}" 1234567890000000000`), + expectError: true, 
+ }, + { + name: "empty nodes array", + data: []byte(`nodestate event="{\"cluster\":\"testcluster\",\"nodes\":[]}" 1234567890000000000`), + expectError: false, // Empty array should not cause error + }, + { + name: "invalid line protocol format", + data: []byte(`invalid line protocol format`), + expectError: true, + }, + { + name: "empty data", + data: []byte(``), + expectError: false, // Should be handled gracefully with warning + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + natsAPI.handleNodeState("test.subject", tt.data) + + // Allow some time for async operations + time.Sleep(50 * time.Millisecond) + + if tt.validateFn != nil { + tt.validateFn(t) + } + }) + } +} + +func TestNatsProcessJobEvent(t *testing.T) { + natsAPI := setupNatsTest(t) + t.Cleanup(cleanupNatsTest) + + msgStartJob, err := lp.NewMessage( + "job", + map[string]string{"function": "start_job"}, + nil, + map[string]any{ + "event": `{ + "jobId": 3001, + "user": "testuser", + "project": "testproj", + "cluster": "testcluster", + "partition": "main", + "walltime": 3600, + "numNodes": 1, + "numHwthreads": 8, + "numAcc": 0, + "shared": "none", + "monitoringStatus": 1, + "smt": 1, + "resources": [ + { + "hostname": "host123", + "hwthreads": [0, 1, 2, 3] + } + ], + "startTime": 1234567890 + }`, + }, + time.Now(), + ) + if err != nil { + t.Fatalf("failed to create test message: %v", err) + } + + msgMissingTag, err := lp.NewMessage( + "job", + map[string]string{}, + nil, + map[string]any{ + "event": `{}`, + }, + time.Now(), + ) + if err != nil { + t.Fatalf("failed to create test message: %v", err) + } + + msgUnknownFunc, err := lp.NewMessage( + "job", + map[string]string{"function": "unknown_function"}, + nil, + map[string]any{ + "event": `{}`, + }, + time.Now(), + ) + if err != nil { + t.Fatalf("failed to create test message: %v", err) + } + + tests := []struct { + name string + message lp.CCMessage + expectError bool + }{ + { + name: "start_job function", + message: 
msgStartJob, + expectError: false, + }, + { + name: "missing function tag", + message: msgMissingTag, + expectError: true, + }, + { + name: "unknown function", + message: msgUnknownFunc, + expectError: false, + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + natsAPI.processJobEvent(tt.message) + time.Sleep(50 * time.Millisecond) + }) + } +} + +func TestNatsHandleJobEvent(t *testing.T) { + natsAPI := setupNatsTest(t) + t.Cleanup(cleanupNatsTest) + + tests := []struct { + name string + data []byte + expectError bool + }{ + { + name: "valid influx line protocol", + data: []byte(`job,function=start_job event="{\"jobId\":4001,\"user\":\"testuser\",\"project\":\"testproj\",\"cluster\":\"testcluster\",\"partition\":\"main\",\"walltime\":3600,\"numNodes\":1,\"numHwthreads\":8,\"numAcc\":0,\"shared\":\"none\",\"monitoringStatus\":1,\"smt\":1,\"resources\":[{\"hostname\":\"host123\",\"hwthreads\":[0,1,2,3]}],\"startTime\":1234567890}" 1234567890000000000`), + expectError: false, + }, + { + name: "invalid influx line protocol", + data: []byte(`invalid line protocol format`), + expectError: true, + }, + { + name: "empty data", + data: []byte(``), + expectError: false, // Decoder should handle empty input gracefully + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + // HandleJobEvent doesn't return errors, it logs them + // We're just ensuring it doesn't panic + natsAPI.handleJobEvent("test.subject", tt.data) + time.Sleep(50 * time.Millisecond) + }) + } +} + +func TestNatsHandleJobEventEdgeCases(t *testing.T) { + natsAPI := setupNatsTest(t) + t.Cleanup(cleanupNatsTest) + + tests := []struct { + name string + data []byte + expectError bool + description string + }{ + { + name: "non-event message (metric data)", + data: []byte(`job,function=start_job value=123.45 1234567890000000000`), + expectError: false, + description: "Should skip non-event messages gracefully", + }, + { + name: "wrong measurement name", + data: 
[]byte(`wrongmeasurement,function=start_job event="{}" 1234567890000000000`), + expectError: false, + description: "Should warn about unexpected measurement but not fail", + }, + { + name: "missing event field", + data: []byte(`job,function=start_job other_field="value" 1234567890000000000`), + expectError: true, + description: "Should error when event field is missing", + }, + { + name: "multiple measurements in one message", + data: []byte("job,function=start_job event=\"{}\" 1234567890000000000\njob,function=stop_job event=\"{}\" 1234567890000000000"), + expectError: false, + description: "Should process multiple lines", + }, + { + name: "escaped quotes in JSON payload", + data: []byte(`job,function=start_job event="{\"jobId\":6001,\"user\":\"test\\\"user\",\"cluster\":\"test\"}" 1234567890000000000`), + expectError: true, + description: "Should handle escaped quotes (though JSON parsing may fail)", + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + natsAPI.handleJobEvent("test.subject", tt.data) + time.Sleep(50 * time.Millisecond) + }) + } +} + +func TestNatsHandleNodeStateEdgeCases(t *testing.T) { + natsAPI := setupNatsTest(t) + t.Cleanup(cleanupNatsTest) + + tests := []struct { + name string + data []byte + expectError bool + description string + }{ + { + name: "missing cluster field in JSON", + data: []byte(`nodestate event="{\"nodes\":[]}" 1234567890000000000`), + expectError: true, + description: "Should fail when cluster is missing", + }, + { + name: "malformed JSON with unescaped quotes", + data: []byte(`nodestate event="{\"cluster\":\"test"cluster\",\"nodes\":[]}" 1234567890000000000`), + expectError: true, + description: "Should fail on malformed JSON", + }, + { + name: "unicode characters in hostname", + data: []byte(`nodestate event="{\"cluster\":\"testcluster\",\"nodes\":[{\"hostname\":\"host-ñ123\",\"states\":[\"idle\"],\"cpusAllocated\":0,\"memoryAllocated\":0,\"gpusAllocated\":0,\"jobsRunning\":0}]}" 
1234567890000000000`), + expectError: false, + description: "Should handle unicode characters", + }, + { + name: "very large node count", + data: []byte(`nodestate event="{\"cluster\":\"testcluster\",\"nodes\":[{\"hostname\":\"node1\",\"states\":[\"idle\"],\"cpusAllocated\":0,\"memoryAllocated\":0,\"gpusAllocated\":0,\"jobsRunning\":0},{\"hostname\":\"node2\",\"states\":[\"idle\"],\"cpusAllocated\":0,\"memoryAllocated\":0,\"gpusAllocated\":0,\"jobsRunning\":0},{\"hostname\":\"node3\",\"states\":[\"idle\"],\"cpusAllocated\":0,\"memoryAllocated\":0,\"gpusAllocated\":0,\"jobsRunning\":0}]}" 1234567890000000000`), + expectError: false, + description: "Should handle multiple nodes efficiently", + }, + { + name: "timestamp in past", + data: []byte(`nodestate event="{\"cluster\":\"testcluster\",\"nodes\":[]}" 1000000000000000000`), + expectError: false, + description: "Should accept any valid timestamp", + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + natsAPI.handleNodeState("test.subject", tt.data) + time.Sleep(50 * time.Millisecond) + }) + } +} + +func TestNatsHandleStartJobDuplicatePrevention(t *testing.T) { + natsAPI := setupNatsTest(t) + t.Cleanup(cleanupNatsTest) + + // Start a job + payload := `{ + "jobId": 5001, + "user": "testuser", + "project": "testproj", + "cluster": "testcluster", + "partition": "main", + "walltime": 3600, + "numNodes": 1, + "numHwthreads": 8, + "numAcc": 0, + "shared": "none", + "monitoringStatus": 1, + "smt": 1, + "resources": [ + { + "hostname": "host123", + "hwthreads": [0, 1, 2, 3] + } + ], + "startTime": 1234567890 + }` + + natsAPI.handleStartJob(payload) + natsAPI.JobRepository.SyncJobs() + time.Sleep(100 * time.Millisecond) + + // Try to start the same job again (within 24 hours) + duplicatePayload := `{ + "jobId": 5001, + "user": "testuser", + "project": "testproj", + "cluster": "testcluster", + "partition": "main", + "walltime": 3600, + "numNodes": 1, + "numHwthreads": 8, + "numAcc": 0, + "shared": 
"none", + "monitoringStatus": 1, + "smt": 1, + "resources": [ + { + "hostname": "host123", + "hwthreads": [0, 1, 2, 3] + } + ], + "startTime": 1234567900 + }` + + natsAPI.handleStartJob(duplicatePayload) + natsAPI.JobRepository.SyncJobs() + time.Sleep(100 * time.Millisecond) + + // Verify only one job exists + jobID := int64(5001) + cluster := "testcluster" + jobs, err := natsAPI.JobRepository.FindAll(&jobID, &cluster, nil) + if err != nil && err != sql.ErrNoRows { + t.Fatalf("unexpected error: %v", err) + } + + if len(jobs) != 1 { + t.Errorf("expected 1 job, got %d", len(jobs)) + } +} diff --git a/internal/api/node.go b/internal/api/node.go index 8953e5b9..5032ed7b 100644 --- a/internal/api/node.go +++ b/internal/api/node.go @@ -7,12 +7,17 @@ package api import ( "fmt" + "maps" "net/http" "strings" "time" + "github.com/ClusterCockpit/cc-backend/internal/metricdispatch" "github.com/ClusterCockpit/cc-backend/internal/repository" - "github.com/ClusterCockpit/cc-lib/schema" + "github.com/ClusterCockpit/cc-backend/pkg/archive" + "github.com/ClusterCockpit/cc-backend/pkg/metricstore" + cclog "github.com/ClusterCockpit/cc-lib/v2/ccLogger" + "github.com/ClusterCockpit/cc-lib/v2/schema" ) type UpdateNodeStatesRequest struct { @@ -20,6 +25,15 @@ type UpdateNodeStatesRequest struct { Cluster string `json:"cluster" example:"fritz"` } +// metricListToNames converts a map of metric configurations to a list of metric names +func metricListToNames(metricList map[string]*schema.Metric) []string { + names := make([]string, 0, len(metricList)) + for name := range metricList { + names = append(names, name) + } + return names +} + // this routine assumes that only one of them exists per node func determineState(states []string) schema.SchedulerState { for _, state := range states { @@ -47,7 +61,7 @@ func determineState(states []string) schema.SchedulerState { // @description Required query-parameter defines if all users or only users with additional special roles are returned. 
// @produce json // @param request body UpdateNodeStatesRequest true "Request body containing nodes and their states" -// @success 200 {object} api.DefaultApiResponse "Success message" +// @success 200 {object} api.DefaultAPIResponse "Success message" // @failure 400 {object} api.ErrorResponse "Bad Request" // @failure 401 {object} api.ErrorResponse "Unauthorized" // @failure 403 {object} api.ErrorResponse "Forbidden" @@ -62,19 +76,70 @@ func (api *RestAPI) updateNodeStates(rw http.ResponseWriter, r *http.Request) { http.StatusBadRequest, rw) return } + requestReceived := time.Now().Unix() repo := repository.GetNodeRepository() + m := make(map[string][]string) + metricNames := make(map[string][]string) + healthResults := make(map[string]metricstore.HealthCheckResult) + + startMs := time.Now() + + // Step 1: Build nodeList and metricList per subcluster + for _, node := range req.Nodes { + if sc, err := archive.GetSubClusterByNode(req.Cluster, node.Hostname); err == nil { + m[sc] = append(m[sc], node.Hostname) + } + } + + for sc := range m { + if sc != "" { + metricList := archive.GetMetricConfigSubCluster(req.Cluster, sc) + metricNames[sc] = metricListToNames(metricList) + } + } + + // Step 2: Determine which metric store to query and perform health check + healthRepo, err := metricdispatch.GetHealthCheckRepo(req.Cluster) + if err != nil { + cclog.Warnf("updateNodeStates: no metric store for cluster %s, skipping health check: %v", req.Cluster, err) + } else { + for sc, nl := range m { + if sc != "" { + if results, err := healthRepo.HealthCheck(req.Cluster, nl, metricNames[sc]); err == nil { + maps.Copy(healthResults, results) + } + } + } + } + + cclog.Debugf("Timer updateNodeStates, MemStore HealthCheck: %s", time.Since(startMs)) + startDB := time.Now() + for _, node := range req.Nodes { state := determineState(node.States) + healthState := schema.MonitoringStateFailed + var healthMetrics string + if result, ok := healthResults[node.Hostname]; ok { + healthState = 
result.State + healthMetrics = result.HealthMetrics + } nodeState := schema.NodeStateDB{ - TimeStamp: time.Now().Unix(), NodeState: state, + TimeStamp: requestReceived, + NodeState: state, CpusAllocated: node.CpusAllocated, MemoryAllocated: node.MemoryAllocated, GpusAllocated: node.GpusAllocated, - HealthState: schema.MonitoringStateFull, + HealthState: healthState, + HealthMetrics: healthMetrics, JobsRunning: node.JobsRunning, } - repo.UpdateNodeState(node.Hostname, req.Cluster, &nodeState) + if err := repo.UpdateNodeState(node.Hostname, req.Cluster, &nodeState); err != nil { + cclog.Errorf("updateNodeStates: updating node state for %s on %s failed: %v", + node.Hostname, req.Cluster, err) + } } + + cclog.Debugf("Timer updateNodeStates, SQLite Inserts: %s", time.Since(startDB)) } diff --git a/internal/api/rest.go b/internal/api/rest.go index 8232b64e..613867a8 100644 --- a/internal/api/rest.go +++ b/internal/api/rest.go @@ -22,10 +22,11 @@ import ( "github.com/ClusterCockpit/cc-backend/internal/auth" "github.com/ClusterCockpit/cc-backend/internal/config" "github.com/ClusterCockpit/cc-backend/internal/repository" - cclog "github.com/ClusterCockpit/cc-lib/ccLogger" - "github.com/ClusterCockpit/cc-lib/schema" - "github.com/ClusterCockpit/cc-lib/util" - "github.com/gorilla/mux" + "github.com/ClusterCockpit/cc-backend/internal/tagger" + cclog "github.com/ClusterCockpit/cc-lib/v2/ccLogger" + "github.com/ClusterCockpit/cc-lib/v2/schema" + "github.com/ClusterCockpit/cc-lib/v2/util" + "github.com/go-chi/chi/v5" ) // @title ClusterCockpit REST API @@ -48,6 +49,7 @@ import ( const ( noticeFilePath = "./var/notice.txt" noticeFilePerms = 0o644 + maxNoticeLength = 10000 // Maximum allowed notice content length in characters ) type RestAPI struct { @@ -61,6 +63,7 @@ type RestAPI struct { RepositoryMutex sync.Mutex } +// New creates and initializes a new RestAPI instance with configured dependencies. 
func New() *RestAPI { return &RestAPI{ JobRepository: repository.GetJobRepository(), @@ -69,79 +72,100 @@ func New() *RestAPI { } } -func (api *RestAPI) MountAPIRoutes(r *mux.Router) { - r.StrictSlash(true) +// MountAPIRoutes registers REST API endpoints for job and cluster management. +// These routes use JWT token authentication via the X-Auth-Token header. +func (api *RestAPI) MountAPIRoutes(r chi.Router) { // REST API Uses TokenAuth // User List - r.HandleFunc("/users/", api.getUsers).Methods(http.MethodGet) + r.Get("/users/", api.getUsers) // Cluster List - r.HandleFunc("/clusters/", api.getClusters).Methods(http.MethodGet) + r.Get("/clusters/", api.getClusters) // Slurm node state - r.HandleFunc("/nodestate/", api.updateNodeStates).Methods(http.MethodPost, http.MethodPut) + r.Post("/nodestate/", api.updateNodeStates) + r.Put("/nodestate/", api.updateNodeStates) // Job Handler - r.HandleFunc("/jobs/start_job/", api.startJob).Methods(http.MethodPost, http.MethodPut) - r.HandleFunc("/jobs/stop_job/", api.stopJobByRequest).Methods(http.MethodPost, http.MethodPut) - r.HandleFunc("/jobs/", api.getJobs).Methods(http.MethodGet) - r.HandleFunc("/jobs/{id}", api.getJobByID).Methods(http.MethodPost) - r.HandleFunc("/jobs/{id}", api.getCompleteJobByID).Methods(http.MethodGet) - r.HandleFunc("/jobs/tag_job/{id}", api.tagJob).Methods(http.MethodPost, http.MethodPatch) - r.HandleFunc("/jobs/tag_job/{id}", api.removeTagJob).Methods(http.MethodDelete) - r.HandleFunc("/jobs/edit_meta/{id}", api.editMeta).Methods(http.MethodPost, http.MethodPatch) - r.HandleFunc("/jobs/metrics/{id}", api.getJobMetrics).Methods(http.MethodGet) - r.HandleFunc("/jobs/delete_job/", api.deleteJobByRequest).Methods(http.MethodDelete) - r.HandleFunc("/jobs/delete_job/{id}", api.deleteJobByID).Methods(http.MethodDelete) - r.HandleFunc("/jobs/delete_job_before/{ts}", api.deleteJobBefore).Methods(http.MethodDelete) + if config.Keys.APISubjects == nil { + cclog.Info("Enabling REST start/stop job API") + 
r.Post("/jobs/start_job/", api.startJob) + r.Put("/jobs/start_job/", api.startJob) + r.Post("/jobs/stop_job/", api.stopJobByRequest) + r.Put("/jobs/stop_job/", api.stopJobByRequest) + } + r.Get("/jobs/", api.getJobs) + r.Get("/jobs/used_nodes", api.getUsedNodes) + r.Post("/jobs/tag_job/{id}", api.tagJob) + r.Patch("/jobs/tag_job/{id}", api.tagJob) + r.Delete("/jobs/tag_job/{id}", api.removeTagJob) + r.Patch("/jobs/edit_meta/{id}", api.editMeta) + r.Patch("/jobs/edit_meta/", api.editMetaByRequest) + r.Get("/jobs/metrics/{id}", api.getJobMetrics) + r.Delete("/jobs/delete_job/", api.deleteJobByRequest) + r.Delete("/jobs/delete_job/{id}", api.deleteJobByID) + r.Delete("/jobs/delete_job_before/{ts}", api.deleteJobBefore) + r.Post("/jobs/{id}", api.getJobByID) + r.Get("/jobs/{id}", api.getCompleteJobByID) - r.HandleFunc("/tags/", api.removeTags).Methods(http.MethodDelete) + r.Delete("/tags/", api.removeTags) if api.MachineStateDir != "" { - r.HandleFunc("/machine_state/{cluster}/{host}", api.getMachineState).Methods(http.MethodGet) - r.HandleFunc("/machine_state/{cluster}/{host}", api.putMachineState).Methods(http.MethodPut, http.MethodPost) + r.Get("/machine_state/{cluster}/{host}", api.getMachineState) + r.Put("/machine_state/{cluster}/{host}", api.putMachineState) + r.Post("/machine_state/{cluster}/{host}", api.putMachineState) } } -func (api *RestAPI) MountUserAPIRoutes(r *mux.Router) { - r.StrictSlash(true) +// MountUserAPIRoutes registers user-accessible REST API endpoints. +// These are limited endpoints for regular users with JWT token authentication. 
+func (api *RestAPI) MountUserAPIRoutes(r chi.Router) { // REST API Uses TokenAuth - r.HandleFunc("/jobs/", api.getJobs).Methods(http.MethodGet) - r.HandleFunc("/jobs/{id}", api.getJobByID).Methods(http.MethodPost) - r.HandleFunc("/jobs/{id}", api.getCompleteJobByID).Methods(http.MethodGet) - r.HandleFunc("/jobs/metrics/{id}", api.getJobMetrics).Methods(http.MethodGet) + r.Get("/jobs/", api.getJobs) + r.Post("/jobs/{id}", api.getJobByID) + r.Get("/jobs/{id}", api.getCompleteJobByID) + r.Get("/jobs/metrics/{id}", api.getJobMetrics) } -func (api *RestAPI) MountMetricStoreAPIRoutes(r *mux.Router) { +// MountMetricStoreAPIRoutes registers metric storage API endpoints. +// These endpoints handle metric data ingestion and health checks with JWT token authentication. +func (api *RestAPI) MountMetricStoreAPIRoutes(r chi.Router) { // REST API Uses TokenAuth - // Note: StrictSlash handles trailing slash variations automatically - r.HandleFunc("/api/free", freeMetrics).Methods(http.MethodPost) - r.HandleFunc("/api/write", writeMetrics).Methods(http.MethodPost) - r.HandleFunc("/api/debug", debugMetrics).Methods(http.MethodGet) - r.HandleFunc("/api/healthcheck", metricsHealth).Methods(http.MethodGet) + r.Post("/free", freeMetrics) + r.Post("/write", writeMetrics) + r.Get("/debug", debugMetrics) + r.Post("/healthcheck", api.updateNodeStates) // Same endpoints but with trailing slash - r.HandleFunc("/api/free/", freeMetrics).Methods(http.MethodPost) - r.HandleFunc("/api/write/", writeMetrics).Methods(http.MethodPost) - r.HandleFunc("/api/debug/", debugMetrics).Methods(http.MethodGet) - r.HandleFunc("/api/healthcheck/", metricsHealth).Methods(http.MethodGet) + r.Post("/free/", freeMetrics) + r.Post("/write/", writeMetrics) + r.Get("/debug/", debugMetrics) + r.Post("/healthcheck/", api.updateNodeStates) } -func (api *RestAPI) MountConfigAPIRoutes(r *mux.Router) { - r.StrictSlash(true) +// MountConfigAPIRoutes registers configuration and user management endpoints. 
+// These routes use session-based authentication and require admin privileges. +// Routes use full paths (including /config prefix) to avoid conflicting with +// the /config page route when registered via Group instead of Route. +func (api *RestAPI) MountConfigAPIRoutes(r chi.Router) { // Settings Frontend Uses SessionAuth if api.Authentication != nil { - r.HandleFunc("/roles/", api.getRoles).Methods(http.MethodGet) - r.HandleFunc("/users/", api.createUser).Methods(http.MethodPost, http.MethodPut) - r.HandleFunc("/users/", api.getUsers).Methods(http.MethodGet) - r.HandleFunc("/users/", api.deleteUser).Methods(http.MethodDelete) - r.HandleFunc("/user/{id}", api.updateUser).Methods(http.MethodPost) - r.HandleFunc("/notice/", api.editNotice).Methods(http.MethodPost) + r.Get("/config/roles/", api.getRoles) + r.Post("/config/users/", api.createUser) + r.Put("/config/users/", api.createUser) + r.Get("/config/users/", api.getUsers) + r.Delete("/config/users/", api.deleteUser) + r.Post("/config/user/{id}", api.updateUser) + r.Post("/config/notice/", api.editNotice) + r.Get("/config/taggers/", api.getTaggers) + r.Post("/config/taggers/run/", api.runTagger) } } -func (api *RestAPI) MountFrontendAPIRoutes(r *mux.Router) { - r.StrictSlash(true) +// MountFrontendAPIRoutes registers frontend-specific API endpoints. +// These routes support JWT generation and user configuration updates with session authentication. +func (api *RestAPI) MountFrontendAPIRoutes(r chi.Router) { + r.Get("/logs/", api.getJournalLog) // Settings Frontend Uses SessionAuth if api.Authentication != nil { - r.HandleFunc("/jwt/", api.getJWT).Methods(http.MethodGet) - r.HandleFunc("/configuration/", api.updateConfiguration).Methods(http.MethodPost) + r.Get("/jwt/", api.getJWT) + r.Post("/configuration/", api.updateConfiguration) } } @@ -157,6 +181,8 @@ type DefaultAPIResponse struct { Message string `json:"msg"` } +// handleError writes a standardized JSON error response with the given status code. 
+// It logs the error at WARN level and ensures proper Content-Type headers are set. func handleError(err error, statusCode int, rw http.ResponseWriter) { cclog.Warnf("REST ERROR : %s", err.Error()) rw.Header().Add("Content-Type", "application/json") @@ -169,15 +195,38 @@ func handleError(err error, statusCode int, rw http.ResponseWriter) { } } +// decode reads JSON from r into val with strict validation that rejects unknown fields. func decode(r io.Reader, val any) error { dec := json.NewDecoder(r) dec.DisallowUnknownFields() return dec.Decode(val) } -func (api *RestAPI) editNotice(rw http.ResponseWriter, r *http.Request) { - // SecuredCheck() only worked with TokenAuth: Removed +// validatePathComponent checks if a path component contains potentially malicious patterns +// that could be used for path traversal attacks. Returns an error if validation fails. +func validatePathComponent(component, componentName string) error { + if strings.Contains(component, "..") || + strings.Contains(component, "/") || + strings.Contains(component, "\\") { + return fmt.Errorf("invalid %s", componentName) + } + return nil +} +// editNotice godoc +// @summary Update system notice +// @tags Config +// @description Updates the notice.txt file content. Only admins are allowed. Content is limited to 10000 characters. 
+// @accept mpfd +// @produce plain +// @param new-content formData string true "New notice content (max 10000 characters)" +// @success 200 {string} string "Update Notice Content Success" +// @failure 400 {object} ErrorResponse "Bad Request" +// @failure 403 {object} ErrorResponse "Forbidden" +// @failure 500 {object} ErrorResponse "Internal Server Error" +// @security ApiKeyAuth +// @router /notice/ [post] +func (api *RestAPI) editNotice(rw http.ResponseWriter, r *http.Request) { if user := repository.GetUserFromContext(r.Context()); !user.HasRole(schema.RoleAdmin) { handleError(fmt.Errorf("only admins are allowed to update the notice.txt file"), http.StatusForbidden, rw) return @@ -186,9 +235,8 @@ func (api *RestAPI) editNotice(rw http.ResponseWriter, r *http.Request) { // Get Value newContent := r.FormValue("new-content") - // Validate content length to prevent DoS - if len(newContent) > 10000 { - handleError(fmt.Errorf("notice content exceeds maximum length of 10000 characters"), http.StatusBadRequest, rw) + if len(newContent) > maxNoticeLength { + handleError(fmt.Errorf("notice content exceeds maximum length of %d characters", maxNoticeLength), http.StatusBadRequest, rw) return } @@ -200,7 +248,9 @@ func (api *RestAPI) editNotice(rw http.ResponseWriter, r *http.Request) { handleError(fmt.Errorf("creating notice file failed: %w", err), http.StatusInternalServerError, rw) return } - ntxt.Close() + if err := ntxt.Close(); err != nil { + cclog.Warnf("Failed to close notice file: %v", err) + } } if err := os.WriteFile(noticeFilePath, []byte(newContent), noticeFilePerms); err != nil { @@ -210,13 +260,66 @@ func (api *RestAPI) editNotice(rw http.ResponseWriter, r *http.Request) { rw.Header().Set("Content-Type", "text/plain") rw.WriteHeader(http.StatusOK) + var msg []byte if newContent != "" { - rw.Write([]byte("Update Notice Content Success")) + msg = []byte("Update Notice Content Success") } else { - rw.Write([]byte("Empty Notice Content Success")) + msg = 
[]byte("Empty Notice Content Success") + } + if _, err := rw.Write(msg); err != nil { + cclog.Errorf("Failed to write response: %v", err) } } +func (api *RestAPI) getTaggers(rw http.ResponseWriter, r *http.Request) { + if user := repository.GetUserFromContext(r.Context()); !user.HasRole(schema.RoleAdmin) { + handleError(fmt.Errorf("only admins are allowed to list taggers"), http.StatusForbidden, rw) + return + } + + rw.Header().Set("Content-Type", "application/json") + if err := json.NewEncoder(rw).Encode(tagger.ListTaggers()); err != nil { + cclog.Errorf("Failed to encode tagger list: %v", err) + } +} + +func (api *RestAPI) runTagger(rw http.ResponseWriter, r *http.Request) { + if user := repository.GetUserFromContext(r.Context()); !user.HasRole(schema.RoleAdmin) { + handleError(fmt.Errorf("only admins are allowed to run taggers"), http.StatusForbidden, rw) + return + } + + name := r.FormValue("name") + if name == "" { + handleError(fmt.Errorf("missing required parameter: name"), http.StatusBadRequest, rw) + return + } + + if err := tagger.RunTaggerByName(name); err != nil { + handleError(err, http.StatusConflict, rw) + return + } + + rw.Header().Set("Content-Type", "text/plain") + rw.WriteHeader(http.StatusOK) + if _, err := rw.Write(fmt.Appendf(nil, "Tagger %s started", name)); err != nil { + cclog.Errorf("Failed to write response: %v", err) + } +} + +// getJWT godoc +// @summary Generate JWT token +// @tags Frontend +// @description Generates a JWT token for a user. Admins can generate tokens for any user, regular users only for themselves. 
+// @accept mpfd +// @produce plain +// @param username formData string true "Username to generate JWT for" +// @success 200 {string} string "JWT token" +// @failure 403 {object} ErrorResponse "Forbidden" +// @failure 404 {object} ErrorResponse "User Not Found" +// @failure 500 {object} ErrorResponse "Internal Server Error" +// @security ApiKeyAuth +// @router /jwt/ [get] func (api *RestAPI) getJWT(rw http.ResponseWriter, r *http.Request) { rw.Header().Set("Content-Type", "text/plain") username := r.FormValue("username") @@ -241,12 +344,22 @@ func (api *RestAPI) getJWT(rw http.ResponseWriter, r *http.Request) { } rw.WriteHeader(http.StatusOK) - rw.Write([]byte(jwt)) + if _, err := rw.Write([]byte(jwt)); err != nil { + cclog.Errorf("Failed to write JWT response: %v", err) + } } +// getRoles godoc +// @summary Get available roles +// @tags Config +// @description Returns a list of valid user roles. Only admins are allowed. +// @produce json +// @success 200 {array} string "List of role names" +// @failure 403 {object} ErrorResponse "Forbidden" +// @failure 500 {object} ErrorResponse "Internal Server Error" +// @security ApiKeyAuth +// @router /roles/ [get] func (api *RestAPI) getRoles(rw http.ResponseWriter, r *http.Request) { - // SecuredCheck() only worked with TokenAuth: Removed - user := repository.GetUserFromContext(r.Context()) if !user.HasRole(schema.RoleAdmin) { handleError(fmt.Errorf("only admins are allowed to fetch a list of roles"), http.StatusForbidden, rw) @@ -265,6 +378,18 @@ func (api *RestAPI) getRoles(rw http.ResponseWriter, r *http.Request) { } } +// updateConfiguration godoc +// @summary Update user configuration +// @tags Frontend +// @description Updates a user's configuration key-value pair. 
+// @accept mpfd +// @produce plain +// @param key formData string true "Configuration key" +// @param value formData string true "Configuration value" +// @success 200 {string} string "success" +// @failure 500 {object} ErrorResponse "Internal Server Error" +// @security ApiKeyAuth +// @router /configuration/ [post] func (api *RestAPI) updateConfiguration(rw http.ResponseWriter, r *http.Request) { rw.Header().Set("Content-Type", "text/plain") key, value := r.FormValue("key"), r.FormValue("value") @@ -275,26 +400,40 @@ func (api *RestAPI) updateConfiguration(rw http.ResponseWriter, r *http.Request) } rw.WriteHeader(http.StatusOK) - rw.Write([]byte("success")) + if _, err := rw.Write([]byte("success")); err != nil { + cclog.Errorf("Failed to write response: %v", err) + } } +// putMachineState godoc +// @summary Store machine state +// @tags Machine State +// @description Stores machine state data for a specific cluster node. Validates cluster and host names to prevent path traversal. +// @accept json +// @produce plain +// @param cluster path string true "Cluster name" +// @param host path string true "Host name" +// @success 201 "Created" +// @failure 400 {object} ErrorResponse "Bad Request" +// @failure 404 {object} ErrorResponse "Machine state not enabled" +// @failure 500 {object} ErrorResponse "Internal Server Error" +// @security ApiKeyAuth +// @router /machine_state/{cluster}/{host} [put] func (api *RestAPI) putMachineState(rw http.ResponseWriter, r *http.Request) { if api.MachineStateDir == "" { handleError(fmt.Errorf("machine state not enabled"), http.StatusNotFound, rw) return } - vars := mux.Vars(r) - cluster := vars["cluster"] - host := vars["host"] + cluster := chi.URLParam(r, "cluster") + host := chi.URLParam(r, "host") - // Validate cluster and host to prevent path traversal attacks - if strings.Contains(cluster, "..") || strings.Contains(cluster, "/") || strings.Contains(cluster, "\\") { - handleError(fmt.Errorf("invalid cluster name"), 
http.StatusBadRequest, rw) + if err := validatePathComponent(cluster, "cluster name"); err != nil { + handleError(err, http.StatusBadRequest, rw) return } - if strings.Contains(host, "..") || strings.Contains(host, "/") || strings.Contains(host, "\\") { - handleError(fmt.Errorf("invalid host name"), http.StatusBadRequest, rw) + if err := validatePathComponent(host, "host name"); err != nil { + handleError(err, http.StatusBadRequest, rw) return } @@ -320,23 +459,33 @@ func (api *RestAPI) putMachineState(rw http.ResponseWriter, r *http.Request) { rw.WriteHeader(http.StatusCreated) } +// getMachineState godoc +// @summary Retrieve machine state +// @tags Machine State +// @description Retrieves stored machine state data for a specific cluster node. Validates cluster and host names to prevent path traversal. +// @produce json +// @param cluster path string true "Cluster name" +// @param host path string true "Host name" +// @success 200 {object} object "Machine state JSON data" +// @failure 400 {object} ErrorResponse "Bad Request" +// @failure 404 {object} ErrorResponse "Machine state not enabled or file not found" +// @security ApiKeyAuth +// @router /machine_state/{cluster}/{host} [get] func (api *RestAPI) getMachineState(rw http.ResponseWriter, r *http.Request) { if api.MachineStateDir == "" { handleError(fmt.Errorf("machine state not enabled"), http.StatusNotFound, rw) return } - vars := mux.Vars(r) - cluster := vars["cluster"] - host := vars["host"] + cluster := chi.URLParam(r, "cluster") + host := chi.URLParam(r, "host") - // Validate cluster and host to prevent path traversal attacks - if strings.Contains(cluster, "..") || strings.Contains(cluster, "/") || strings.Contains(cluster, "\\") { - handleError(fmt.Errorf("invalid cluster name"), http.StatusBadRequest, rw) + if err := validatePathComponent(cluster, "cluster name"); err != nil { + handleError(err, http.StatusBadRequest, rw) return } - if strings.Contains(host, "..") || strings.Contains(host, "/") || 
strings.Contains(host, "\\") { - handleError(fmt.Errorf("invalid host name"), http.StatusBadRequest, rw) + if err := validatePathComponent(host, "host name"); err != nil { + handleError(err, http.StatusBadRequest, rw) return } diff --git a/internal/api/user.go b/internal/api/user.go index f9ddee33..e2f78165 100644 --- a/internal/api/user.go +++ b/internal/api/user.go @@ -11,9 +11,9 @@ import ( "net/http" "github.com/ClusterCockpit/cc-backend/internal/repository" - cclog "github.com/ClusterCockpit/cc-lib/ccLogger" - "github.com/ClusterCockpit/cc-lib/schema" - "github.com/gorilla/mux" + cclog "github.com/ClusterCockpit/cc-lib/v2/ccLogger" + "github.com/ClusterCockpit/cc-lib/v2/schema" + "github.com/go-chi/chi/v5" ) type APIReturnedUser struct { @@ -31,7 +31,7 @@ type APIReturnedUser struct { // @description Required query-parameter defines if all users or only users with additional special roles are returned. // @produce json // @param not-just-user query bool true "If returned list should contain all users or only users with additional special roles" -// @success 200 {array} api.ApiReturnedUser "List of users returned successfully" +// @success 200 {array} api.APIReturnedUser "List of users returned successfully" // @failure 400 {string} string "Bad Request" // @failure 401 {string} string "Unauthorized" // @failure 403 {string} string "Forbidden" @@ -91,7 +91,7 @@ func (api *RestAPI) updateUser(rw http.ResponseWriter, r *http.Request) { // Handle role updates if newrole != "" { - if err := repository.GetUserRepository().AddRole(r.Context(), mux.Vars(r)["id"], newrole); err != nil { + if err := repository.GetUserRepository().AddRole(r.Context(), chi.URLParam(r, "id"), newrole); err != nil { handleError(fmt.Errorf("adding role failed: %w", err), http.StatusUnprocessableEntity, rw) return } @@ -99,7 +99,7 @@ func (api *RestAPI) updateUser(rw http.ResponseWriter, r *http.Request) { cclog.Errorf("Failed to encode response: %v", err) } } else if delrole != "" { - if err 
:= repository.GetUserRepository().RemoveRole(r.Context(), mux.Vars(r)["id"], delrole); err != nil { + if err := repository.GetUserRepository().RemoveRole(r.Context(), chi.URLParam(r, "id"), delrole); err != nil { handleError(fmt.Errorf("removing role failed: %w", err), http.StatusUnprocessableEntity, rw) return } @@ -107,7 +107,7 @@ func (api *RestAPI) updateUser(rw http.ResponseWriter, r *http.Request) { cclog.Errorf("Failed to encode response: %v", err) } } else if newproj != "" { - if err := repository.GetUserRepository().AddProject(r.Context(), mux.Vars(r)["id"], newproj); err != nil { + if err := repository.GetUserRepository().AddProject(r.Context(), chi.URLParam(r, "id"), newproj); err != nil { handleError(fmt.Errorf("adding project failed: %w", err), http.StatusUnprocessableEntity, rw) return } @@ -115,7 +115,7 @@ func (api *RestAPI) updateUser(rw http.ResponseWriter, r *http.Request) { cclog.Errorf("Failed to encode response: %v", err) } } else if delproj != "" { - if err := repository.GetUserRepository().RemoveProject(r.Context(), mux.Vars(r)["id"], delproj); err != nil { + if err := repository.GetUserRepository().RemoveProject(r.Context(), chi.URLParam(r, "id"), delproj); err != nil { handleError(fmt.Errorf("removing project failed: %w", err), http.StatusUnprocessableEntity, rw) return } @@ -164,7 +164,7 @@ func (api *RestAPI) createUser(rw http.ResponseWriter, r *http.Request) { return } - if len(password) == 0 && role != schema.GetRoleString(schema.RoleApi) { + if len(password) == 0 && role != schema.GetRoleString(schema.RoleAPI) { handleError(fmt.Errorf("only API users are allowed to have a blank password (login will be impossible)"), http.StatusBadRequest, rw) return } diff --git a/internal/archiver/README.md b/internal/archiver/README.md index 0fae04ea..53d00948 100644 --- a/internal/archiver/README.md +++ b/internal/archiver/README.md @@ -106,7 +106,7 @@ Data is archived at the highest available resolution (typically 60s intervals). 
```go // In archiver.go ArchiveJob() function -jobData, err := metricDataDispatcher.LoadData(job, allMetrics, scopes, ctx, 300) +jobData, err := metricdispatch.LoadData(job, allMetrics, scopes, ctx, 300) // 0 = highest resolution // 300 = 5-minute resolution ``` @@ -170,7 +170,6 @@ All exported functions are safe for concurrent use: - `Start()` - Safe to call once - `TriggerArchiving()` - Safe from multiple goroutines - `Shutdown()` - Safe to call once -- `WaitForArchiving()` - Deprecated, but safe Internal state is protected by: - Channel synchronization (`archiveChannel`) @@ -185,6 +184,6 @@ Internal state is protected by: ## Dependencies - `internal/repository`: Database operations for job metadata -- `internal/metricDataDispatcher`: Loading metric data from various backends +- `internal/metricdispatch`: Loading metric data from various backends - `pkg/archive`: Archive backend abstraction (filesystem, S3, SQLite) - `cc-lib/schema`: Job and metric data structures diff --git a/internal/archiver/archiveWorker.go b/internal/archiver/archiveWorker.go index 0434844d..0639757d 100644 --- a/internal/archiver/archiveWorker.go +++ b/internal/archiver/archiveWorker.go @@ -54,8 +54,8 @@ import ( "time" "github.com/ClusterCockpit/cc-backend/internal/repository" - cclog "github.com/ClusterCockpit/cc-lib/ccLogger" - "github.com/ClusterCockpit/cc-lib/schema" + cclog "github.com/ClusterCockpit/cc-lib/v2/ccLogger" + "github.com/ClusterCockpit/cc-lib/v2/schema" sq "github.com/Masterminds/squirrel" ) @@ -126,7 +126,7 @@ func archivingWorker() { // not using meta data, called to load JobMeta into Cache? 
// will fail if job meta not in repository if _, err := jobRepo.FetchMetadata(job); err != nil { - cclog.Errorf("archiving job (dbid: %d) failed at check metadata step: %s", job.ID, err.Error()) + cclog.Errorf("archiving job (dbid: %d) failed at check metadata step: %s", *job.ID, err.Error()) jobRepo.UpdateMonitoringStatus(*job.ID, schema.MonitoringStatusArchivingFailed) archivePending.Done() continue @@ -136,7 +136,7 @@ func archivingWorker() { // Use shutdown context to allow cancellation jobMeta, err := ArchiveJob(job, shutdownCtx) if err != nil { - cclog.Errorf("archiving job (dbid: %d) failed at archiving job step: %s", job.ID, err.Error()) + cclog.Errorf("archiving job (dbid: %d) failed at archiving job step: %s", *job.ID, err.Error()) jobRepo.UpdateMonitoringStatus(*job.ID, schema.MonitoringStatusArchivingFailed) archivePending.Done() continue @@ -145,24 +145,24 @@ func archivingWorker() { stmt := sq.Update("job").Where("job.id = ?", job.ID) if stmt, err = jobRepo.UpdateFootprint(stmt, jobMeta); err != nil { - cclog.Errorf("archiving job (dbid: %d) failed at update Footprint step: %s", job.ID, err.Error()) + cclog.Errorf("archiving job (dbid: %d) failed at update Footprint step: %s", *job.ID, err.Error()) archivePending.Done() continue } if stmt, err = jobRepo.UpdateEnergy(stmt, jobMeta); err != nil { - cclog.Errorf("archiving job (dbid: %d) failed at update Energy step: %s", job.ID, err.Error()) + cclog.Errorf("archiving job (dbid: %d) failed at update Energy step: %s", *job.ID, err.Error()) archivePending.Done() continue } // Update the jobs database entry one last time: stmt = jobRepo.MarkArchived(stmt, schema.MonitoringStatusArchivingSuccessful) if err := jobRepo.Execute(stmt); err != nil { - cclog.Errorf("archiving job (dbid: %d) failed at db execute: %s", job.ID, err.Error()) + cclog.Errorf("archiving job (dbid: %d) failed at db execute: %s", *job.ID, err.Error()) archivePending.Done() continue } cclog.Debugf("archiving job %d took %s", job.JobID, 
time.Since(start)) - cclog.Infof("archiving job (dbid: %d) successful", job.ID) + cclog.Infof("archiving job (dbid: %d) successful", *job.ID) repository.CallJobStopHooks(job) archivePending.Done() diff --git a/internal/archiver/archiver.go b/internal/archiver/archiver.go index b88199aa..454a2358 100644 --- a/internal/archiver/archiver.go +++ b/internal/archiver/archiver.go @@ -9,11 +9,10 @@ import ( "context" "math" - "github.com/ClusterCockpit/cc-backend/internal/config" - "github.com/ClusterCockpit/cc-backend/internal/metricDataDispatcher" + "github.com/ClusterCockpit/cc-backend/internal/metricdispatch" "github.com/ClusterCockpit/cc-backend/pkg/archive" - cclog "github.com/ClusterCockpit/cc-lib/ccLogger" - "github.com/ClusterCockpit/cc-lib/schema" + cclog "github.com/ClusterCockpit/cc-lib/v2/ccLogger" + "github.com/ClusterCockpit/cc-lib/v2/schema" ) // ArchiveJob archives a completed job's metric data to the configured archive backend. @@ -60,7 +59,7 @@ func ArchiveJob(job *schema.Job, ctx context.Context) (*schema.Job, error) { scopes = append(scopes, schema.MetricScopeAccelerator) } - jobData, err := metricDataDispatcher.LoadData(job, allMetrics, scopes, ctx, 0) // 0 Resulotion-Value retrieves highest res (60s) + jobData, err := metricdispatch.LoadData(job, allMetrics, scopes, ctx, 0) // 0 Resulotion-Value retrieves highest res (60s) if err != nil { cclog.Error("Error wile loading job data for archiving") return nil, err @@ -94,12 +93,5 @@ func ArchiveJob(job *schema.Job, ctx context.Context) (*schema.Job, error) { } } - // If the file based archive is disabled, - // only return the JobMeta structure as the - // statistics in there are needed. 
- if config.Keys.DisableArchive { - return job, nil - } - return job, archive.GetHandle().ImportJob(job, &jobData) } diff --git a/internal/auth/auth.go b/internal/auth/auth.go index 5d947353..69f4f078 100644 --- a/internal/auth/auth.go +++ b/internal/auth/auth.go @@ -25,9 +25,9 @@ import ( "github.com/ClusterCockpit/cc-backend/internal/config" "github.com/ClusterCockpit/cc-backend/internal/repository" - cclog "github.com/ClusterCockpit/cc-lib/ccLogger" - "github.com/ClusterCockpit/cc-lib/schema" - "github.com/ClusterCockpit/cc-lib/util" + cclog "github.com/ClusterCockpit/cc-lib/v2/ccLogger" + "github.com/ClusterCockpit/cc-lib/v2/schema" + "github.com/ClusterCockpit/cc-lib/v2/util" "github.com/gorilla/sessions" ) @@ -40,7 +40,7 @@ type Authenticator interface { // authenticator should attempt the login. This method should not perform // expensive operations or actual authentication. CanLogin(user *schema.User, username string, rw http.ResponseWriter, r *http.Request) (*schema.User, bool) - + // Login performs the actually authentication for the user. // It returns the authenticated user or an error if authentication fails. // The user parameter may be nil if the user doesn't exist in the database yet. 
@@ -65,13 +65,13 @@ var ipUserLimiters sync.Map func getIPUserLimiter(ip, username string) *rate.Limiter { key := ip + ":" + username now := time.Now() - + if entry, ok := ipUserLimiters.Load(key); ok { rle := entry.(*rateLimiterEntry) rle.lastUsed = now return rle.limiter } - + // More aggressive rate limiting: 5 attempts per 15 minutes newLimiter := rate.NewLimiter(rate.Every(15*time.Minute/5), 5) ipUserLimiters.Store(key, &rateLimiterEntry{ @@ -176,7 +176,7 @@ func (auth *Authentication) AuthViaSession( func Init(authCfg *json.RawMessage) { initOnce.Do(func() { authInstance = &Authentication{} - + // Start background cleanup of rate limiters startRateLimiterCleanup() @@ -263,7 +263,7 @@ func GetAuthInstance() *Authentication { } // handleUserSync syncs or updates a user in the database based on configuration. -// This is used for both JWT and OIDC authentication when syncUserOnLogin or updateUserOnLogin is enabled. +// This is used for LDAP, JWT and OIDC authentications when syncUserOnLogin or updateUserOnLogin is enabled. 
func handleUserSync(user *schema.User, syncUserOnLogin, updateUserOnLogin bool) { r := repository.GetUserRepository() dbUser, err := r.GetUser(user.Username) @@ -272,7 +272,7 @@ func handleUserSync(user *schema.User, syncUserOnLogin, updateUserOnLogin bool) cclog.Errorf("Error while loading user '%s': %v", user.Username, err) return } - + if err == sql.ErrNoRows && syncUserOnLogin { // Add new user if err := r.AddUser(user); err != nil { cclog.Errorf("Error while adding user '%s' to DB: %v", user.Username, err) @@ -294,6 +294,11 @@ func handleOIDCUser(OIDCUser *schema.User) { handleUserSync(OIDCUser, Keys.OpenIDConfig.SyncUserOnLogin, Keys.OpenIDConfig.UpdateUserOnLogin) } +// handleLdapUser syncs LDAP user with database +func handleLdapUser(ldapUser *schema.User) { + handleUserSync(ldapUser, Keys.LdapConfig.SyncUserOnLogin, Keys.LdapConfig.UpdateUserOnLogin) +} + func (auth *Authentication) SaveSession(rw http.ResponseWriter, r *http.Request, user *schema.User) error { session, err := auth.sessionStore.New(r, "session") if err != nil { @@ -305,8 +310,13 @@ func (auth *Authentication) SaveSession(rw http.ResponseWriter, r *http.Request, if auth.SessionMaxAge != 0 { session.Options.MaxAge = int(auth.SessionMaxAge.Seconds()) } - if config.Keys.HTTPSCertFile == "" && config.Keys.HTTPSKeyFile == "" { - cclog.Warn("HTTPS not configured - session cookies will not have Secure flag set (insecure for production)") + if r.TLS == nil && r.Header.Get("X-Forwarded-Proto") != "https" { + // If neither TLS or an encrypted reverse proxy are used, do not mark cookies as secure. + cclog.Warn("Authenticating with unencrypted request. Session cookies will not have Secure flag set (insecure for production)") + if r.Header.Get("X-Forwarded-Proto") == "" { + // This warning will not be printed if e.g. 
X-Forwarded-Proto == http + cclog.Warn("If you are using a reverse proxy, make sure X-Forwarded-Proto is set") + } session.Options.Secure = false } session.Options.SameSite = http.SameSiteStrictMode @@ -438,13 +448,13 @@ func (auth *Authentication) AuthAPI( if user != nil { switch { case len(user.Roles) == 1: - if user.HasRole(schema.RoleApi) { + if user.HasRole(schema.RoleAPI) { ctx := context.WithValue(r.Context(), repository.ContextUserKey, user) onsuccess.ServeHTTP(rw, r.WithContext(ctx)) return } case len(user.Roles) >= 2: - if user.HasAllRoles([]schema.Role{schema.RoleAdmin, schema.RoleApi}) { + if user.HasAllRoles([]schema.Role{schema.RoleAdmin, schema.RoleAPI}) { ctx := context.WithValue(r.Context(), repository.ContextUserKey, user) onsuccess.ServeHTTP(rw, r.WithContext(ctx)) return @@ -474,13 +484,13 @@ func (auth *Authentication) AuthUserAPI( if user != nil { switch { case len(user.Roles) == 1: - if user.HasRole(schema.RoleApi) { + if user.HasRole(schema.RoleAPI) { ctx := context.WithValue(r.Context(), repository.ContextUserKey, user) onsuccess.ServeHTTP(rw, r.WithContext(ctx)) return } case len(user.Roles) >= 2: - if user.HasRole(schema.RoleApi) && user.HasAnyRole([]schema.Role{schema.RoleUser, schema.RoleManager, schema.RoleSupport, schema.RoleAdmin}) { + if user.HasRole(schema.RoleAPI) && user.HasAnyRole([]schema.Role{schema.RoleUser, schema.RoleManager, schema.RoleSupport, schema.RoleAdmin}) { ctx := context.WithValue(r.Context(), repository.ContextUserKey, user) onsuccess.ServeHTTP(rw, r.WithContext(ctx)) return @@ -510,13 +520,13 @@ func (auth *Authentication) AuthMetricStoreAPI( if user != nil { switch { case len(user.Roles) == 1: - if user.HasRole(schema.RoleApi) { + if user.HasRole(schema.RoleAPI) { ctx := context.WithValue(r.Context(), repository.ContextUserKey, user) onsuccess.ServeHTTP(rw, r.WithContext(ctx)) return } case len(user.Roles) >= 2: - if user.HasRole(schema.RoleApi) && user.HasAnyRole([]schema.Role{schema.RoleUser, 
schema.RoleManager, schema.RoleAdmin}) { + if user.HasRole(schema.RoleAPI) && user.HasAnyRole([]schema.Role{schema.RoleUser, schema.RoleManager, schema.RoleAdmin}) { ctx := context.WithValue(r.Context(), repository.ContextUserKey, user) onsuccess.ServeHTTP(rw, r.WithContext(ctx)) return @@ -616,9 +626,9 @@ func securedCheck(user *schema.User, r *http.Request) error { } // If SplitHostPort fails, IPAddress is already just a host (no port) - // If nothing declared in config: deny all request to this api endpoint + // If nothing declared in config: Continue if len(config.Keys.APIAllowedIPs) == 0 { - return fmt.Errorf("missing configuration key ApiAllowedIPs") + return nil } // If wildcard declared in config: Continue if config.Keys.APIAllowedIPs[0] == "*" { diff --git a/internal/auth/auth_test.go b/internal/auth/auth_test.go index 15f153e6..f8c6635c 100644 --- a/internal/auth/auth_test.go +++ b/internal/auth/auth_test.go @@ -15,25 +15,25 @@ import ( func TestGetIPUserLimiter(t *testing.T) { ip := "192.168.1.1" username := "testuser" - + // Get limiter for the first time limiter1 := getIPUserLimiter(ip, username) if limiter1 == nil { t.Fatal("Expected limiter to be created") } - + // Get the same limiter again limiter2 := getIPUserLimiter(ip, username) if limiter1 != limiter2 { t.Error("Expected to get the same limiter instance") } - + // Get a different limiter for different user limiter3 := getIPUserLimiter(ip, "otheruser") if limiter1 == limiter3 { t.Error("Expected different limiter for different user") } - + // Get a different limiter for different IP limiter4 := getIPUserLimiter("192.168.1.2", username) if limiter1 == limiter4 { @@ -45,16 +45,16 @@ func TestGetIPUserLimiter(t *testing.T) { func TestRateLimiterBehavior(t *testing.T) { ip := "10.0.0.1" username := "ratelimituser" - + limiter := getIPUserLimiter(ip, username) - + // Should allow first 5 attempts - for i := 0; i < 5; i++ { + for i := range 5 { if !limiter.Allow() { t.Errorf("Request %d should be 
allowed within rate limit", i+1) } } - + // 6th attempt should be blocked if limiter.Allow() { t.Error("Request 6 should be blocked by rate limiter") @@ -65,19 +65,19 @@ func TestRateLimiterBehavior(t *testing.T) { func TestCleanupOldRateLimiters(t *testing.T) { // Clear all existing limiters first to avoid interference from other tests cleanupOldRateLimiters(time.Now().Add(24 * time.Hour)) - + // Create some new rate limiters limiter1 := getIPUserLimiter("1.1.1.1", "user1") limiter2 := getIPUserLimiter("2.2.2.2", "user2") - + if limiter1 == nil || limiter2 == nil { t.Fatal("Failed to create test limiters") } - + // Cleanup limiters older than 1 second from now (should keep both) time.Sleep(10 * time.Millisecond) // Small delay to ensure timestamp difference cleanupOldRateLimiters(time.Now().Add(-1 * time.Second)) - + // Verify they still exist (should get same instance) if getIPUserLimiter("1.1.1.1", "user1") != limiter1 { t.Error("Limiter 1 was incorrectly cleaned up") @@ -85,10 +85,10 @@ func TestCleanupOldRateLimiters(t *testing.T) { if getIPUserLimiter("2.2.2.2", "user2") != limiter2 { t.Error("Limiter 2 was incorrectly cleaned up") } - + // Cleanup limiters older than 1 hour from now (should remove both) cleanupOldRateLimiters(time.Now().Add(2 * time.Hour)) - + // Getting them again should create new instances newLimiter1 := getIPUserLimiter("1.1.1.1", "user1") if newLimiter1 == limiter1 { @@ -107,14 +107,14 @@ func TestIPv4Extraction(t *testing.T) { {"IPv4 without port", "192.168.1.1", "192.168.1.1"}, {"Localhost with port", "127.0.0.1:3000", "127.0.0.1"}, } - + for _, tt := range tests { t.Run(tt.name, func(t *testing.T) { result := tt.input if host, _, err := net.SplitHostPort(result); err == nil { result = host } - + if result != tt.expected { t.Errorf("Expected %s, got %s", tt.expected, result) } @@ -122,7 +122,7 @@ func TestIPv4Extraction(t *testing.T) { } } -// TestIPv6Extraction tests extracting IPv6 addresses +// TestIPv6Extraction tests extracting 
IPv6 addresses func TestIPv6Extraction(t *testing.T) { tests := []struct { name string @@ -134,14 +134,14 @@ func TestIPv6Extraction(t *testing.T) { {"IPv6 without port", "2001:db8::1", "2001:db8::1"}, {"IPv6 localhost", "::1", "::1"}, } - + for _, tt := range tests { t.Run(tt.name, func(t *testing.T) { result := tt.input if host, _, err := net.SplitHostPort(result); err == nil { result = host } - + if result != tt.expected { t.Errorf("Expected %s, got %s", tt.expected, result) } @@ -160,14 +160,14 @@ func TestIPExtractionEdgeCases(t *testing.T) { {"Empty string", "", ""}, {"Just port", ":8080", ""}, } - + for _, tt := range tests { t.Run(tt.name, func(t *testing.T) { result := tt.input if host, _, err := net.SplitHostPort(result); err == nil { result = host } - + if result != tt.expected { t.Errorf("Expected %s, got %s", tt.expected, result) } diff --git a/internal/auth/jwt.go b/internal/auth/jwt.go index 4f1f3f54..abdce313 100644 --- a/internal/auth/jwt.go +++ b/internal/auth/jwt.go @@ -14,8 +14,8 @@ import ( "strings" "time" - cclog "github.com/ClusterCockpit/cc-lib/ccLogger" - "github.com/ClusterCockpit/cc-lib/schema" + cclog "github.com/ClusterCockpit/cc-lib/v2/ccLogger" + "github.com/ClusterCockpit/cc-lib/v2/schema" "github.com/golang-jwt/jwt/v5" ) @@ -25,20 +25,20 @@ type JWTAuthConfig struct { MaxAge string `json:"max-age"` // Specifies which cookie should be checked for a JWT token (if no authorization header is present) - CookieName string `json:"cookieName"` + CookieName string `json:"cookie-name"` // Deny login for users not in database (but defined in JWT). // Ignore user roles defined in JWTs ('roles' claim), get them from db. 
- ValidateUser bool `json:"validateUser"` + ValidateUser bool `json:"validate-user"` // Specifies which issuer should be accepted when validating external JWTs ('iss' claim) - TrustedIssuer string `json:"trustedIssuer"` + TrustedIssuer string `json:"trusted-issuer"` // Should an non-existent user be added to the DB based on the information in the token - SyncUserOnLogin bool `json:"syncUserOnLogin"` + SyncUserOnLogin bool `json:"sync-user-on-login"` // Should an existent user be updated in the DB based on the information in the token - UpdateUserOnLogin bool `json:"updateUserOnLogin"` + UpdateUserOnLogin bool `json:"update-user-on-login"` } type JWTAuthenticator struct { @@ -101,20 +101,20 @@ func (ja *JWTAuthenticator) AuthViaJWT( // Token is valid, extract payload claims := token.Claims.(jwt.MapClaims) - + // Use shared helper to get user from JWT claims var user *schema.User user, err = getUserFromJWT(claims, Keys.JwtConfig.ValidateUser, schema.AuthToken, -1) if err != nil { return nil, err } - + // If not validating user, we only get roles from JWT (no projects for this auth method) if !Keys.JwtConfig.ValidateUser { user.Roles = extractRolesFromClaims(claims, false) user.Projects = nil // Standard JWT auth doesn't include projects } - + return user, nil } diff --git a/internal/auth/jwtCookieSession.go b/internal/auth/jwtCookieSession.go index 44c64a0c..4c4bbeb6 100644 --- a/internal/auth/jwtCookieSession.go +++ b/internal/auth/jwtCookieSession.go @@ -12,8 +12,8 @@ import ( "net/http" "os" - cclog "github.com/ClusterCockpit/cc-lib/ccLogger" - "github.com/ClusterCockpit/cc-lib/schema" + cclog "github.com/ClusterCockpit/cc-lib/v2/ccLogger" + "github.com/ClusterCockpit/cc-lib/v2/schema" "github.com/golang-jwt/jwt/v5" ) @@ -146,13 +146,13 @@ func (ja *JWTCookieSessionAuthenticator) Login( } claims := token.Claims.(jwt.MapClaims) - + // Use shared helper to get user from JWT claims user, err = getUserFromJWT(claims, jc.ValidateUser, schema.AuthSession, 
schema.AuthViaToken) if err != nil { return nil, err } - + // Sync or update user if configured if !jc.ValidateUser && (jc.SyncUserOnLogin || jc.UpdateUserOnLogin) { handleTokenUser(user) diff --git a/internal/auth/jwtHelpers.go b/internal/auth/jwtHelpers.go index 792722a8..8321b9c4 100644 --- a/internal/auth/jwtHelpers.go +++ b/internal/auth/jwtHelpers.go @@ -9,10 +9,11 @@ import ( "database/sql" "errors" "fmt" + "strings" "github.com/ClusterCockpit/cc-backend/internal/repository" - cclog "github.com/ClusterCockpit/cc-lib/ccLogger" - "github.com/ClusterCockpit/cc-lib/schema" + cclog "github.com/ClusterCockpit/cc-lib/v2/ccLogger" + "github.com/ClusterCockpit/cc-lib/v2/schema" "github.com/golang-jwt/jwt/v5" ) @@ -28,7 +29,7 @@ func extractStringFromClaims(claims jwt.MapClaims, key string) string { // If validateRoles is true, only valid roles are returned func extractRolesFromClaims(claims jwt.MapClaims, validateRoles bool) []string { var roles []string - + if rawroles, ok := claims["roles"].([]any); ok { for _, rr := range rawroles { if r, ok := rr.(string); ok { @@ -42,14 +43,14 @@ func extractRolesFromClaims(claims jwt.MapClaims, validateRoles bool) []string { } } } - + return roles } // extractProjectsFromClaims extracts projects from JWT claims func extractProjectsFromClaims(claims jwt.MapClaims) []string { projects := make([]string, 0) - + if rawprojs, ok := claims["projects"].([]any); ok { for _, pp := range rawprojs { if p, ok := pp.(string); ok { @@ -61,7 +62,7 @@ func extractProjectsFromClaims(claims jwt.MapClaims) []string { projects = append(projects, projSlice...) 
} } - + return projects } @@ -72,22 +73,23 @@ func extractNameFromClaims(claims jwt.MapClaims) string { if name, ok := claims["name"].(string); ok { return name } - + // Try nested structure: {name: {values: [...]}} if wrap, ok := claims["name"].(map[string]any); ok { if vals, ok := wrap["values"].([]any); ok { if len(vals) == 0 { return "" } - - name := fmt.Sprintf("%v", vals[0]) + + var name strings.Builder + name.WriteString(fmt.Sprintf("%v", vals[0])) for i := 1; i < len(vals); i++ { - name += fmt.Sprintf(" %v", vals[i]) + name.WriteString(fmt.Sprintf(" %v", vals[i])) } - return name + return name.String() } } - + return "" } @@ -100,7 +102,7 @@ func getUserFromJWT(claims jwt.MapClaims, validateUser bool, authType schema.Aut if sub == "" { return nil, errors.New("missing 'sub' claim in JWT") } - + if validateUser { // Validate user against database ur := repository.GetUserRepository() @@ -109,22 +111,22 @@ func getUserFromJWT(claims jwt.MapClaims, validateUser bool, authType schema.Aut cclog.Errorf("Error while loading user '%v': %v", sub, err) return nil, fmt.Errorf("database error: %w", err) } - + // Deny any logins for unknown usernames if user == nil || err == sql.ErrNoRows { cclog.Warn("Could not find user from JWT in internal database.") return nil, errors.New("unknown user") } - + // Return database user (with database roles) return user, nil } - + // Create user from JWT claims name := extractNameFromClaims(claims) roles := extractRolesFromClaims(claims, true) // Validate roles projects := extractProjectsFromClaims(claims) - + return &schema.User{ Username: sub, Name: name, diff --git a/internal/auth/jwtHelpers_test.go b/internal/auth/jwtHelpers_test.go index 5cee1df5..4627f7e5 100644 --- a/internal/auth/jwtHelpers_test.go +++ b/internal/auth/jwtHelpers_test.go @@ -8,7 +8,7 @@ package auth import ( "testing" - "github.com/ClusterCockpit/cc-lib/schema" + "github.com/ClusterCockpit/cc-lib/v2/schema" "github.com/golang-jwt/jwt/v5" ) @@ -19,7 +19,7 @@ func 
TestExtractStringFromClaims(t *testing.T) { "email": "test@example.com", "age": 25, // not a string } - + tests := []struct { name string key string @@ -30,7 +30,7 @@ func TestExtractStringFromClaims(t *testing.T) { {"Non-existent key", "missing", ""}, {"Non-string value", "age", ""}, } - + for _, tt := range tests { t.Run(tt.name, func(t *testing.T) { result := extractStringFromClaims(claims, tt.key) @@ -88,16 +88,16 @@ func TestExtractRolesFromClaims(t *testing.T) { expected: []string{}, }, } - + for _, tt := range tests { t.Run(tt.name, func(t *testing.T) { result := extractRolesFromClaims(tt.claims, tt.validateRoles) - + if len(result) != len(tt.expected) { t.Errorf("Expected %d roles, got %d", len(tt.expected), len(result)) return } - + for i, role := range result { if i >= len(tt.expected) || role != tt.expected[i] { t.Errorf("Expected role %s at position %d, got %s", tt.expected[i], i, role) @@ -141,16 +141,16 @@ func TestExtractProjectsFromClaims(t *testing.T) { expected: []string{"project1", "project2"}, // Should skip non-strings }, } - + for _, tt := range tests { t.Run(tt.name, func(t *testing.T) { result := extractProjectsFromClaims(tt.claims) - + if len(result) != len(tt.expected) { t.Errorf("Expected %d projects, got %d", len(tt.expected), len(result)) return } - + for i, project := range result { if i >= len(tt.expected) || project != tt.expected[i] { t.Errorf("Expected project %s at position %d, got %s", tt.expected[i], i, project) @@ -216,7 +216,7 @@ func TestExtractNameFromClaims(t *testing.T) { expected: "123 Smith", // Should convert to string }, } - + for _, tt := range tests { t.Run(tt.name, func(t *testing.T) { result := extractNameFromClaims(tt.claims) @@ -235,29 +235,28 @@ func TestGetUserFromJWT_NoValidation(t *testing.T) { "roles": []any{"user", "admin"}, "projects": []any{"project1", "project2"}, } - + user, err := getUserFromJWT(claims, false, schema.AuthToken, -1) - if err != nil { t.Fatalf("Unexpected error: %v", err) } - + if 
user.Username != "testuser" { t.Errorf("Expected username 'testuser', got '%s'", user.Username) } - + if user.Name != "Test User" { t.Errorf("Expected name 'Test User', got '%s'", user.Name) } - + if len(user.Roles) != 2 { t.Errorf("Expected 2 roles, got %d", len(user.Roles)) } - + if len(user.Projects) != 2 { t.Errorf("Expected 2 projects, got %d", len(user.Projects)) } - + if user.AuthType != schema.AuthToken { t.Errorf("Expected AuthType %v, got %v", schema.AuthToken, user.AuthType) } @@ -268,13 +267,13 @@ func TestGetUserFromJWT_MissingSub(t *testing.T) { claims := jwt.MapClaims{ "name": "Test User", } - + _, err := getUserFromJWT(claims, false, schema.AuthToken, -1) - + if err == nil { t.Error("Expected error for missing sub claim") } - + if err.Error() != "missing 'sub' claim in JWT" { t.Errorf("Expected specific error message, got: %v", err) } diff --git a/internal/auth/jwtSession.go b/internal/auth/jwtSession.go index 15e58347..de7e985b 100644 --- a/internal/auth/jwtSession.go +++ b/internal/auth/jwtSession.go @@ -13,8 +13,8 @@ import ( "os" "strings" - cclog "github.com/ClusterCockpit/cc-lib/ccLogger" - "github.com/ClusterCockpit/cc-lib/schema" + cclog "github.com/ClusterCockpit/cc-lib/v2/ccLogger" + "github.com/ClusterCockpit/cc-lib/v2/schema" "github.com/golang-jwt/jwt/v5" ) @@ -75,13 +75,13 @@ func (ja *JWTSessionAuthenticator) Login( } claims := token.Claims.(jwt.MapClaims) - + // Use shared helper to get user from JWT claims user, err = getUserFromJWT(claims, Keys.JwtConfig.ValidateUser, schema.AuthSession, schema.AuthViaToken) if err != nil { return nil, err } - + // Sync or update user if configured if !Keys.JwtConfig.ValidateUser && (Keys.JwtConfig.SyncUserOnLogin || Keys.JwtConfig.UpdateUserOnLogin) { handleTokenUser(user) diff --git a/internal/auth/ldap.go b/internal/auth/ldap.go index e96e732b..a174bb9d 100644 --- a/internal/auth/ldap.go +++ b/internal/auth/ldap.go @@ -6,35 +6,39 @@ package auth import ( - "errors" "fmt" + "net" "net/http" "os" 
"strings" + "time" "github.com/ClusterCockpit/cc-backend/internal/repository" - cclog "github.com/ClusterCockpit/cc-lib/ccLogger" - "github.com/ClusterCockpit/cc-lib/schema" + cclog "github.com/ClusterCockpit/cc-lib/v2/ccLogger" + "github.com/ClusterCockpit/cc-lib/v2/schema" "github.com/go-ldap/ldap/v3" ) type LdapConfig struct { URL string `json:"url"` - UserBase string `json:"user_base"` - SearchDN string `json:"search_dn"` - UserBind string `json:"user_bind"` - UserFilter string `json:"user_filter"` - UserAttr string `json:"username_attr"` - SyncInterval string `json:"sync_interval"` // Parsed using time.ParseDuration. - SyncDelOldUsers bool `json:"sync_del_old_users"` + UserBase string `json:"user-base"` + SearchDN string `json:"search-dn"` + UserBind string `json:"user-bind"` + UserFilter string `json:"user-filter"` + UserAttr string `json:"username-attr"` + UIDAttr string `json:"uid-attr"` + SyncInterval string `json:"sync-interval"` // Parsed using time.ParseDuration. + SyncDelOldUsers bool `json:"sync-del-old-users"` - // Should an non-existent user be added to the DB if user exists in ldap directory - SyncUserOnLogin bool `json:"syncUserOnLogin"` + // Should a non-existent user be added to the DB if user exists in ldap directory + SyncUserOnLogin bool `json:"sync-user-on-login"` + UpdateUserOnLogin bool `json:"update-user-on-login"` } type LdapAuthenticator struct { syncPassword string UserAttr string + UIDAttr string } var _ Authenticator = (*LdapAuthenticator)(nil) @@ -51,6 +55,12 @@ func (la *LdapAuthenticator) Init() error { la.UserAttr = "gecos" } + if Keys.LdapConfig.UIDAttr != "" { + la.UIDAttr = Keys.LdapConfig.UIDAttr + } else { + la.UIDAttr = "uid" + } + return nil } @@ -66,55 +76,44 @@ func (la *LdapAuthenticator) CanLogin( if user.AuthSource == schema.AuthViaLDAP { return user, true } - } else { - if lc.SyncUserOnLogin { - l, err := la.getLdapConnection(true) - if err != nil { - cclog.Error("LDAP connection error") - return nil, false - } - 
defer l.Close() - - // Search for the given username - searchRequest := ldap.NewSearchRequest( - lc.UserBase, - ldap.ScopeWholeSubtree, ldap.NeverDerefAliases, 0, 0, false, - fmt.Sprintf("(&%s(uid=%s))", lc.UserFilter, username), - []string{"dn", "uid", la.UserAttr}, nil) - - sr, err := l.Search(searchRequest) - if err != nil { - cclog.Warn(err) - return nil, false - } - - if len(sr.Entries) != 1 { - cclog.Warn("LDAP: User does not exist or too many entries returned") - return nil, false - } - - entry := sr.Entries[0] - name := entry.GetAttributeValue(la.UserAttr) - var roles []string - roles = append(roles, schema.GetRoleString(schema.RoleUser)) - projects := make([]string, 0) - - user = &schema.User{ - Username: username, - Name: name, - Roles: roles, - Projects: projects, - AuthType: schema.AuthSession, - AuthSource: schema.AuthViaLDAP, - } - - if err := repository.GetUserRepository().AddUser(user); err != nil { - cclog.Errorf("User '%s' LDAP: Insert into DB failed", username) - return nil, false - } - - return user, true + } else if lc.SyncUserOnLogin { + l, err := la.getLdapConnection(true) + if err != nil { + cclog.Error("LDAP connection error") + return nil, false } + defer l.Close() + + // Search for the given username + searchRequest := ldap.NewSearchRequest( + lc.UserBase, + ldap.ScopeWholeSubtree, ldap.NeverDerefAliases, 0, 0, false, + fmt.Sprintf("(&%s(%s=%s))", lc.UserFilter, la.UIDAttr, ldap.EscapeFilter(username)), + []string{"dn", la.UIDAttr, la.UserAttr}, nil) + + sr, err := l.Search(searchRequest) + if err != nil { + cclog.Warn(err) + return nil, false + } + + if len(sr.Entries) != 1 { + cclog.Warn("LDAP: User does not exist or too many entries returned") + return nil, false + } + + entry := sr.Entries[0] + user = &schema.User{ + Username: username, + Name: entry.GetAttributeValue(la.UserAttr), + Roles: []string{schema.GetRoleString(schema.RoleUser)}, + Projects: make([]string, 0), + AuthType: schema.AuthSession, + AuthSource: schema.AuthViaLDAP, 
+ } + + handleLdapUser(user) + return user, true } return nil, false @@ -132,7 +131,7 @@ func (la *LdapAuthenticator) Login( } defer l.Close() - userDn := strings.ReplaceAll(Keys.LdapConfig.UserBind, "{username}", user.Username) + userDn := strings.ReplaceAll(Keys.LdapConfig.UserBind, "{username}", ldap.EscapeDN(user.Username)) if err := l.Bind(userDn, r.FormValue("password")); err != nil { cclog.Errorf("AUTH/LDAP > Authentication for user %s failed: %v", user.Username, err) @@ -170,7 +169,7 @@ func (la *LdapAuthenticator) Sync() error { lc.UserBase, ldap.ScopeWholeSubtree, ldap.NeverDerefAliases, 0, 0, false, lc.UserFilter, - []string{"dn", "uid", la.UserAttr}, nil)) + []string{"dn", la.UIDAttr, la.UserAttr}, nil)) if err != nil { cclog.Warn("LDAP search error") return err @@ -178,9 +177,9 @@ func (la *LdapAuthenticator) Sync() error { newnames := map[string]string{} for _, entry := range ldapResults.Entries { - username := entry.GetAttributeValue("uid") + username := entry.GetAttributeValue(la.UIDAttr) if username == "" { - return errors.New("no attribute 'uid'") + return fmt.Errorf("no attribute '%s'", la.UIDAttr) } _, ok := users[username] @@ -194,20 +193,19 @@ func (la *LdapAuthenticator) Sync() error { for username, where := range users { if where == InDB && lc.SyncDelOldUsers { - ur.DelUser(username) + if err := ur.DelUser(username); err != nil { + cclog.Errorf("User '%s' LDAP: Delete from DB failed: %v", username, err) + return err + } cclog.Debugf("sync: remove %v (does not show up in LDAP anymore)", username) } else if where == InLdap { name := newnames[username] - var roles []string - roles = append(roles, schema.GetRoleString(schema.RoleUser)) - projects := make([]string, 0) - user := &schema.User{ Username: username, Name: name, - Roles: roles, - Projects: projects, + Roles: []string{schema.GetRoleString(schema.RoleUser)}, + Projects: make([]string, 0), AuthSource: schema.AuthViaLDAP, } @@ -224,11 +222,13 @@ func (la *LdapAuthenticator) Sync() error { 
func (la *LdapAuthenticator) getLdapConnection(admin bool) (*ldap.Conn, error) { lc := Keys.LdapConfig - conn, err := ldap.DialURL(lc.URL) + conn, err := ldap.DialURL(lc.URL, + ldap.DialWithDialer(&net.Dialer{Timeout: 10 * time.Second})) if err != nil { cclog.Warn("LDAP URL dial failed") return nil, err } + conn.SetTimeout(30 * time.Second) if admin { if err := conn.Bind(lc.SearchDN, la.syncPassword); err != nil { diff --git a/internal/auth/local.go b/internal/auth/local.go index 1c9b0372..b1a7362c 100644 --- a/internal/auth/local.go +++ b/internal/auth/local.go @@ -9,8 +9,8 @@ import ( "fmt" "net/http" - cclog "github.com/ClusterCockpit/cc-lib/ccLogger" - "github.com/ClusterCockpit/cc-lib/schema" + cclog "github.com/ClusterCockpit/cc-lib/v2/ccLogger" + "github.com/ClusterCockpit/cc-lib/v2/schema" "golang.org/x/crypto/bcrypt" ) diff --git a/internal/auth/oidc.go b/internal/auth/oidc.go index 9e361302..ec6c77a7 100644 --- a/internal/auth/oidc.go +++ b/internal/auth/oidc.go @@ -9,23 +9,24 @@ import ( "context" "crypto/rand" "encoding/base64" + "fmt" "io" "net/http" "os" "time" "github.com/ClusterCockpit/cc-backend/internal/repository" - cclog "github.com/ClusterCockpit/cc-lib/ccLogger" - "github.com/ClusterCockpit/cc-lib/schema" + cclog "github.com/ClusterCockpit/cc-lib/v2/ccLogger" + "github.com/ClusterCockpit/cc-lib/v2/schema" "github.com/coreos/go-oidc/v3/oidc" - "github.com/gorilla/mux" + "github.com/go-chi/chi/v5" "golang.org/x/oauth2" ) type OpenIDConfig struct { Provider string `json:"provider"` - SyncUserOnLogin bool `json:"syncUserOnLogin"` - UpdateUserOnLogin bool `json:"updateUserOnLogin"` + SyncUserOnLogin bool `json:"sync-user-on-login"` + UpdateUserOnLogin bool `json:"update-user-on-login"` } type OIDC struct { @@ -50,6 +51,7 @@ func setCallbackCookie(w http.ResponseWriter, r *http.Request, name, value strin MaxAge: int(time.Hour.Seconds()), Secure: r.TLS != nil, HttpOnly: true, + SameSite: http.SameSiteLaxMode, } http.SetCookie(w, c) } @@ -59,7 +61,7 
@@ func NewOIDC(a *Authentication) *OIDC { // Use context with timeout for provider initialization ctx, cancel := context.WithTimeout(context.Background(), 30*time.Second) defer cancel() - + provider, err := oidc.NewProvider(ctx, Keys.OpenIDConfig.Provider) if err != nil { cclog.Fatal(err) @@ -77,8 +79,7 @@ func NewOIDC(a *Authentication) *OIDC { ClientID: clientID, ClientSecret: clientSecret, Endpoint: provider.Endpoint(), - RedirectURL: "oidc-callback", - Scopes: []string{oidc.ScopeOpenID, "profile", "email"}, + Scopes: []string{oidc.ScopeOpenID, "profile"}, } oa := &OIDC{provider: provider, client: client, clientID: clientID, authentication: a} @@ -86,7 +87,7 @@ func NewOIDC(a *Authentication) *OIDC { return oa } -func (oa *OIDC) RegisterEndpoints(r *mux.Router) { +func (oa *OIDC) RegisterEndpoints(r chi.Router) { r.HandleFunc("/oidc-login", oa.OAuth2Login) r.HandleFunc("/oidc-callback", oa.OAuth2Callback) } @@ -119,57 +120,96 @@ func (oa *OIDC) OAuth2Callback(rw http.ResponseWriter, r *http.Request) { // Exchange authorization code for token with timeout ctx, cancel := context.WithTimeout(context.Background(), 30*time.Second) defer cancel() - + token, err := oa.client.Exchange(ctx, code, oauth2.VerifierOption(codeVerifier)) if err != nil { - http.Error(rw, "Failed to exchange token: "+err.Error(), http.StatusInternalServerError) + cclog.Errorf("token exchange failed: %s", err.Error()) + http.Error(rw, "Authentication failed during token exchange", http.StatusInternalServerError) return } // Get user info from OIDC provider with same timeout userInfo, err := oa.provider.UserInfo(ctx, oauth2.StaticTokenSource(token)) if err != nil { - http.Error(rw, "Failed to get userinfo: "+err.Error(), http.StatusInternalServerError) + cclog.Errorf("failed to get userinfo: %s", err.Error()) + http.Error(rw, "Failed to retrieve user information", http.StatusInternalServerError) return } - // // Extract the ID Token from OAuth2 token. 
- // rawIDToken, ok := token.Extra("id_token").(string) - // if !ok { - // http.Error(rw, "Cannot access idToken", http.StatusInternalServerError) - // } - // - // verifier := oa.provider.Verifier(&oidc.Config{ClientID: oa.clientID}) - // // Parse and verify ID Token payload. - // idToken, err := verifier.Verify(context.Background(), rawIDToken) - // if err != nil { - // http.Error(rw, "Failed to extract idToken: "+err.Error(), http.StatusInternalServerError) - // } + // Verify ID token and nonce to prevent replay attacks + rawIDToken, ok := token.Extra("id_token").(string) + if !ok { + http.Error(rw, "ID token not found in response", http.StatusInternalServerError) + return + } + + nonceCookie, err := r.Cookie("nonce") + if err != nil { + http.Error(rw, "nonce cookie not found", http.StatusBadRequest) + return + } + + verifier := oa.provider.Verifier(&oidc.Config{ClientID: oa.clientID}) + idToken, err := verifier.Verify(ctx, rawIDToken) + if err != nil { + cclog.Errorf("ID token verification failed: %s", err.Error()) + http.Error(rw, "ID token verification failed", http.StatusInternalServerError) + return + } + + if idToken.Nonce != nonceCookie.Value { + http.Error(rw, "Nonce mismatch", http.StatusBadRequest) + return + } projects := make([]string, 0) - // Extract custom claims + // Extract custom claims from userinfo var claims struct { Username string `json:"preferred_username"` Name string `json:"name"` - Profile struct { + // Keycloak realm-level roles + RealmAccess struct { + Roles []string `json:"roles"` + } `json:"realm_access"` + // Keycloak client-level roles + ResourceAccess struct { Client struct { Roles []string `json:"roles"` } `json:"clustercockpit"` } `json:"resource_access"` } if err := userInfo.Claims(&claims); err != nil { - http.Error(rw, "Failed to extract Claims: "+err.Error(), http.StatusInternalServerError) + cclog.Errorf("failed to extract claims: %s", err.Error()) + http.Error(rw, "Failed to extract user claims", 
http.StatusInternalServerError) + return + } + + if claims.Username == "" { + http.Error(rw, "Username claim missing from OIDC provider", http.StatusBadRequest) + return + } + + // Merge roles from both client-level and realm-level access + oidcRoles := append(claims.ResourceAccess.Client.Roles, claims.RealmAccess.Roles...) + + roleSet := make(map[string]bool) + for _, r := range oidcRoles { + switch r { + case "user": + roleSet[schema.GetRoleString(schema.RoleUser)] = true + case "admin": + roleSet[schema.GetRoleString(schema.RoleAdmin)] = true + case "manager": + roleSet[schema.GetRoleString(schema.RoleManager)] = true + case "support": + roleSet[schema.GetRoleString(schema.RoleSupport)] = true + } } var roles []string - for _, r := range claims.Profile.Client.Roles { - switch r { - case "user": - roles = append(roles, schema.GetRoleString(schema.RoleUser)) - case "admin": - roles = append(roles, schema.GetRoleString(schema.RoleAdmin)) - } + for role := range roleSet { + roles = append(roles, role) } if len(roles) == 0 { @@ -188,8 +228,12 @@ func (oa *OIDC) OAuth2Callback(rw http.ResponseWriter, r *http.Request) { handleOIDCUser(user) } - oa.authentication.SaveSession(rw, r, user) - cclog.Infof("login successfull: user: %#v (roles: %v, projects: %v)", user.Username, user.Roles, user.Projects) + if err := oa.authentication.SaveSession(rw, r, user); err != nil { + cclog.Errorf("session save failed for user %q: %s", user.Username, err.Error()) + http.Error(rw, "Failed to create session", http.StatusInternalServerError) + return + } + cclog.Infof("login successful: user: %#v (roles: %v, projects: %v)", user.Username, user.Roles, user.Projects) userCtx := context.WithValue(r.Context(), repository.ContextUserKey, user) http.RedirectHandler("/", http.StatusTemporaryRedirect).ServeHTTP(rw, r.WithContext(userCtx)) } @@ -206,7 +250,24 @@ func (oa *OIDC) OAuth2Login(rw http.ResponseWriter, r *http.Request) { codeVerifier := oauth2.GenerateVerifier() setCallbackCookie(rw, r, 
"verifier", codeVerifier) + // Generate nonce for ID token replay protection + nonce, err := randString(16) + if err != nil { + http.Error(rw, "Internal error", http.StatusInternalServerError) + return + } + setCallbackCookie(rw, r, "nonce", nonce) + + // Build redirect URL from the incoming request + scheme := "https" + if r.TLS == nil && r.Header.Get("X-Forwarded-Proto") != "https" { + scheme = "http" + } + oa.client.RedirectURL = fmt.Sprintf("%s://%s/oidc-callback", scheme, r.Host) + // Redirect user to consent page to ask for permission - url := oa.client.AuthCodeURL(state, oauth2.AccessTypeOffline, oauth2.S256ChallengeOption(codeVerifier)) + url := oa.client.AuthCodeURL(state, oauth2.AccessTypeOffline, + oauth2.S256ChallengeOption(codeVerifier), + oidc.Nonce(nonce)) http.Redirect(rw, r, url, http.StatusFound) } diff --git a/internal/auth/schema.go b/internal/auth/schema.go index 060ff2df..b6ee0702 100644 --- a/internal/auth/schema.go +++ b/internal/auth/schema.go @@ -15,37 +15,44 @@ var configSchema = ` "description": "Configure how long a token is valid. As string parsable by time.ParseDuration()", "type": "string" }, - "cookieName": { + "cookie-name": { "description": "Cookie that should be checked for a JWT token.", "type": "string" }, - "validateUser": { + "validate-user": { "description": "Deny login for users not in database (but defined in JWT). 
Overwrite roles in JWT with database roles.", "type": "boolean" }, - "trustedIssuer": { + "trusted-issuer": { "description": "Issuer that should be accepted when validating external JWTs ", "type": "string" }, - "syncUserOnLogin": { + "sync-user-on-login": { "description": "Add non-existent user to DB at login attempt with values provided in JWT.", "type": "boolean" + }, + "update-user-on-login": { + "description": "Should an existent user attributes in the DB be updated at login attempt with values provided in JWT.", + "type": "boolean" } }, "required": ["max-age"] }, "oidc": { - "provider": { - "description": "", - "type": "string" - }, - "syncUserOnLogin": { - "description": "", - "type": "boolean" - }, - "updateUserOnLogin": { - "description": "", - "type": "boolean" + "type": "object", + "properties": { + "provider": { + "description": "OpenID Connect provider URL.", + "type": "string" + }, + "sync-user-on-login": { + "description": "Add non-existent user to DB at login attempt with values provided.", + "type": "boolean" + }, + "update-user-on-login": { + "description": "Should an existent user attributes in the DB be updated at login attempt with values provided.", + "type": "boolean" + } }, "required": ["provider"] }, @@ -57,40 +64,48 @@ var configSchema = ` "description": "URL of LDAP directory server.", "type": "string" }, - "user_base": { + "user-base": { "description": "Base DN of user tree root.", "type": "string" }, - "search_dn": { + "search-dn": { "description": "DN for authenticating LDAP admin account with general read rights.", "type": "string" }, - "user_bind": { + "user-bind": { "description": "Expression used to authenticate users via LDAP bind. Must contain uid={username}.", "type": "string" }, - "user_filter": { + "user-filter": { "description": "Filter to extract users for syncing.", "type": "string" }, - "username_attr": { + "username-attr": { "description": "Attribute with full username. 
Default: gecos", "type": "string" }, - "sync_interval": { + "sync-interval": { "description": "Interval used for syncing local user table with LDAP directory. Parsed using time.ParseDuration.", "type": "string" }, - "sync_del_old_users": { + "sync-del-old-users": { "description": "Delete obsolete users in database.", "type": "boolean" }, - "syncUserOnLogin": { + "uid-attr": { + "description": "LDAP attribute used as login username. Default: uid", + "type": "string" + }, + "sync-user-on-login": { "description": "Add non-existent user to DB at login attempt if user exists in Ldap directory", "type": "boolean" + }, + "update-user-on-login": { + "description": "Should an existent user attributes in the DB be updated at login attempt with values from LDAP.", + "type": "boolean" } }, - "required": ["url", "user_base", "search_dn", "user_bind", "user_filter"] + "required": ["url", "user-base", "search-dn", "user-bind", "user-filter"] }, "required": ["jwts"] }` diff --git a/internal/config/config.go b/internal/config/config.go index 69a44440..f635b7e4 100644 --- a/internal/config/config.go +++ b/internal/config/config.go @@ -11,8 +11,8 @@ import ( "encoding/json" "time" - cclog "github.com/ClusterCockpit/cc-lib/ccLogger" - "github.com/ClusterCockpit/cc-lib/resampler" + cclog "github.com/ClusterCockpit/cc-lib/v2/ccLogger" + "github.com/ClusterCockpit/cc-lib/v2/resampler" ) type ProgramConfig struct { @@ -20,7 +20,9 @@ type ProgramConfig struct { Addr string `json:"addr"` // Addresses from which secured admin API endpoints can be reached, can be wildcard "*" - APIAllowedIPs []string `json:"apiAllowedIPs"` + APIAllowedIPs []string `json:"api-allowed-ips"` + + APISubjects *NATSConfig `json:"api-subjects"` // Drop root permissions once .env was read and the port was taken. 
User string `json:"user"` @@ -35,16 +37,9 @@ type ProgramConfig struct { EmbedStaticFiles bool `json:"embed-static-files"` StaticFiles string `json:"static-files"` - // 'sqlite3' or 'mysql' (mysql will work for mariadb as well) - DBDriver string `json:"db-driver"` - - // For sqlite3 a filename, for mysql a DSN in this format: https://github.com/go-sql-driver/mysql#dsn-data-source-name (Without query parameters!). + // Path to SQLite database file DB string `json:"db"` - // Keep all metric data in the metric data repositories, - // do not write to the job-archive. - DisableArchive bool `json:"disable-archive"` - EnableJobTaggers bool `json:"enable-job-taggers"` // Validate json input against schema @@ -76,17 +71,42 @@ type ProgramConfig struct { // If exists, will enable dynamic zoom in frontend metric plots using the configured values EnableResampling *ResampleConfig `json:"resampling"` + + // Systemd unit name for log viewer (default: "clustercockpit") + SystemdUnit string `json:"systemd-unit"` + + // Node state retention configuration + NodeStateRetention *NodeStateRetention `json:"nodestate-retention"` +} + +type NodeStateRetention struct { + Policy string `json:"policy"` // "delete" or "move" + Age int `json:"age"` // hours, default 24 + TargetKind string `json:"target-kind"` // "file" or "s3" + TargetPath string `json:"target-path"` + TargetEndpoint string `json:"target-endpoint"` + TargetBucket string `json:"target-bucket"` + TargetAccessKey string `json:"target-access-key"` + TargetSecretKey string `json:"target-secret-key"` + TargetRegion string `json:"target-region"` + TargetUsePathStyle bool `json:"target-use-path-style"` + MaxFileSizeMB int `json:"max-file-size-mb"` } type ResampleConfig struct { // Minimum number of points to trigger resampling of data - MinimumPoints int `json:"minimumPoints"` + MinimumPoints int `json:"minimum-points"` // Array of resampling target resolutions, in seconds; Example: [600,300,60] Resolutions []int `json:"resolutions"` 
// Trigger next zoom level at less than this many visible datapoints Trigger int `json:"trigger"` } +type NATSConfig struct { + SubjectJobEvent string `json:"subject-job-event"` + SubjectNodeState string `json:"subject-node-state"` +} + type IntRange struct { From int `json:"from"` To int `json:"to"` @@ -100,32 +120,20 @@ type TimeRange struct { type FilterRanges struct { Duration *IntRange `json:"duration"` - NumNodes *IntRange `json:"numNodes"` - StartTime *TimeRange `json:"startTime"` + NumNodes *IntRange `json:"num-nodes"` + StartTime *TimeRange `json:"start-time"` } -type ClusterConfig struct { - Name string `json:"name"` - FilterRanges *FilterRanges `json:"filterRanges"` - MetricDataRepository json.RawMessage `json:"metricDataRepository"` -} - -var Clusters []*ClusterConfig - var Keys ProgramConfig = ProgramConfig{ Addr: "localhost:8080", - DisableAuthentication: false, EmbedStaticFiles: true, - DBDriver: "sqlite3", DB: "./var/job.db", - DisableArchive: false, - Validate: false, SessionMaxAge: "168h", StopJobsExceedingWalltime: 0, ShortRunningJobsDuration: 5 * 60, } -func Init(mainConfig json.RawMessage, clusterConfig json.RawMessage) { +func Init(mainConfig json.RawMessage) { Validate(configSchema, mainConfig) dec := json.NewDecoder(bytes.NewReader(mainConfig)) dec.DisallowUnknownFields() @@ -133,17 +141,6 @@ func Init(mainConfig json.RawMessage, clusterConfig json.RawMessage) { cclog.Abortf("Config Init: Could not decode config file '%s'.\nError: %s\n", mainConfig, err.Error()) } - Validate(clustersSchema, clusterConfig) - dec = json.NewDecoder(bytes.NewReader(clusterConfig)) - dec.DisallowUnknownFields() - if err := dec.Decode(&Clusters); err != nil { - cclog.Abortf("Config Init: Could not decode config file '%s'.\nError: %s\n", mainConfig, err.Error()) - } - - if len(Clusters) < 1 { - cclog.Abort("Config Init: At least one cluster required in config. 
Exited with error.") - } - if Keys.EnableResampling != nil && Keys.EnableResampling.MinimumPoints > 0 { resampler.SetMinimumRequiredPoints(Keys.EnableResampling.MinimumPoints) } diff --git a/internal/config/config_test.go b/internal/config/config_test.go index 35e1c65e..e4a700ff 100644 --- a/internal/config/config_test.go +++ b/internal/config/config_test.go @@ -8,19 +8,15 @@ package config import ( "testing" - ccconf "github.com/ClusterCockpit/cc-lib/ccConfig" - cclog "github.com/ClusterCockpit/cc-lib/ccLogger" + ccconf "github.com/ClusterCockpit/cc-lib/v2/ccConfig" + cclog "github.com/ClusterCockpit/cc-lib/v2/ccLogger" ) func TestInit(t *testing.T) { fp := "../../configs/config.json" ccconf.Init(fp) if cfg := ccconf.GetPackageConfig("main"); cfg != nil { - if clustercfg := ccconf.GetPackageConfig("clusters"); clustercfg != nil { - Init(cfg, clustercfg) - } else { - cclog.Abort("Cluster configuration must be present") - } + Init(cfg) } else { cclog.Abort("Main configuration must be present") } @@ -34,11 +30,7 @@ func TestInitMinimal(t *testing.T) { fp := "../../configs/config-demo.json" ccconf.Init(fp) if cfg := ccconf.GetPackageConfig("main"); cfg != nil { - if clustercfg := ccconf.GetPackageConfig("clusters"); clustercfg != nil { - Init(cfg, clustercfg) - } else { - cclog.Abort("Cluster configuration must be present") - } + Init(cfg) } else { cclog.Abort("Main configuration must be present") } diff --git a/internal/config/default_metrics.go b/internal/config/default_metrics.go index d710aa06..d31b1f15 100644 --- a/internal/config/default_metrics.go +++ b/internal/config/default_metrics.go @@ -15,7 +15,7 @@ import ( type DefaultMetricsCluster struct { Name string `json:"name"` - DefaultMetrics string `json:"default_metrics"` + DefaultMetrics string `json:"default-metrics"` } type DefaultMetricsConfig struct { diff --git a/internal/config/schema.go b/internal/config/schema.go index ed1f42d8..5e02732d 100644 --- a/internal/config/schema.go +++ 
b/internal/config/schema.go @@ -6,14 +6,14 @@ package config var configSchema = ` - { +{ "type": "object", "properties": { "addr": { "description": "Address where the http (or https) server will listen on (for example: 'localhost:80').", "type": "string" }, - "apiAllowedIPs": { + "api-allowed-ips": { "description": "Addresses from which secured API endpoints can be reached", "type": "array", "items": { @@ -41,13 +41,9 @@ var configSchema = ` "type": "string" }, "db": { - "description": "For sqlite3 a filename, for mysql a DSN in this format: https://github.com/go-sql-driver/mysql#dsn-data-source-name (Without query parameters!).", + "description": "Path to SQLite database file (e.g., './var/job.db')", "type": "string" }, - "disable-archive": { - "description": "Keep all metric data in the metric data repositories, do not write to the job-archive.", - "type": "boolean" - }, "enable-job-taggers": { "description": "Turn on automatic application and jobclass taggers", "type": "boolean" @@ -81,28 +77,22 @@ var configSchema = ` "type": "integer" }, "emission-constant": { - "description": ".", + "description": "Energy mix CO2 emission constant [g/kWh]. 
If set, displays estimated CO2 emission for jobs.", "type": "integer" }, - "cron-frequency": { - "description": "Frequency of cron job workers.", - "type": "object", - "properties": { - "duration-worker": { - "description": "Duration Update Worker [Defaults to '5m']", - "type": "string" - }, - "footprint-worker": { - "description": "Metric-Footprint Update Worker [Defaults to '10m']", - "type": "string" - } - } + "machine-state-dir": { + "description": "Where to store MachineState files.", + "type": "string" }, - "enable-resampling": { + "systemd-unit": { + "description": "Systemd unit name for log viewer (default: 'clustercockpit').", + "type": "string" + }, + "resampling": { "description": "Enable dynamic zoom in frontend metric plots.", "type": "object", "properties": { - "minimumPoints": { + "minimum-points": { "description": "Minimum points to trigger resampling of time-series data.", "type": "integer" }, @@ -119,87 +109,74 @@ var configSchema = ` } }, "required": ["trigger", "resolutions"] - } - }, - "required": ["apiAllowedIPs"] - }` - -var clustersSchema = ` - { - "type": "array", - "items": { + }, + "api-subjects": { + "description": "NATS subjects configuration for subscribing to job and node events.", "type": "object", "properties": { - "name": { - "description": "The name of the cluster.", + "subject-job-event": { + "description": "NATS subject for job events (start_job, stop_job)", "type": "string" }, - "metricDataRepository": { - "description": "Type of the metric data repository for this cluster", - "type": "object", - "properties": { - "kind": { - "type": "string", - "enum": ["influxdb", "prometheus", "cc-metric-store", "cc-metric-store-internal", "test"] - }, - "url": { - "type": "string" - }, - "token": { - "type": "string" - } - }, - "required": ["kind"] - }, - "filterRanges": { - "description": "This option controls the slider ranges for the UI controls of numNodes, duration, and startTime.", - "type": "object", - "properties": { - "numNodes": { 
- "description": "UI slider range for number of nodes", - "type": "object", - "properties": { - "from": { - "type": "integer" - }, - "to": { - "type": "integer" - } - }, - "required": ["from", "to"] - }, - "duration": { - "description": "UI slider range for duration", - "type": "object", - "properties": { - "from": { - "type": "integer" - }, - "to": { - "type": "integer" - } - }, - "required": ["from", "to"] - }, - "startTime": { - "description": "UI slider range for start time", - "type": "object", - "properties": { - "from": { - "type": "string", - "format": "date-time" - }, - "to": { - "type": "null" - } - }, - "required": ["from", "to"] - } - }, - "required": ["numNodes", "duration", "startTime"] + "subject-node-state": { + "description": "NATS subject for node state updates", + "type": "string" } }, - "required": ["name", "metricDataRepository", "filterRanges"], - "minItems": 1 + "required": ["subject-job-event", "subject-node-state"] + }, + "nodestate-retention": { + "description": "Node state retention configuration for cleaning up old node_state rows.", + "type": "object", + "properties": { + "policy": { + "description": "Retention policy: 'delete' to remove old rows, 'move' to archive to Parquet then delete.", + "type": "string", + "enum": ["delete", "move"] + }, + "age": { + "description": "Retention age in hours (default: 24).", + "type": "integer" + }, + "target-kind": { + "description": "Target kind for parquet archiving: 'file' or 's3'.", + "type": "string", + "enum": ["file", "s3"] + }, + "target-path": { + "description": "Filesystem path for parquet file target.", + "type": "string" + }, + "target-endpoint": { + "description": "S3 endpoint URL.", + "type": "string" + }, + "target-bucket": { + "description": "S3 bucket name.", + "type": "string" + }, + "target-access-key": { + "description": "S3 access key.", + "type": "string" + }, + "target-secret-key": { + "description": "S3 secret key.", + "type": "string" + }, + "target-region": { + 
"description": "S3 region.", + "type": "string" + }, + "target-use-path-style": { + "description": "Use path-style S3 addressing.", + "type": "boolean" + }, + "max-file-size-mb": { + "description": "Maximum parquet file size in MB (default: 128).", + "type": "integer" + } + }, + "required": ["policy"] } - }` + } +}` diff --git a/internal/config/validate.go b/internal/config/validate.go index 6ac67f5e..af8591ca 100644 --- a/internal/config/validate.go +++ b/internal/config/validate.go @@ -8,7 +8,7 @@ package config import ( "encoding/json" - cclog "github.com/ClusterCockpit/cc-lib/ccLogger" + cclog "github.com/ClusterCockpit/cc-lib/v2/ccLogger" "github.com/santhosh-tekuri/jsonschema/v5" ) diff --git a/internal/graph/generated/generated.go b/internal/graph/generated/generated.go index b4ade3b2..8d773222 100644 --- a/internal/graph/generated/generated.go +++ b/internal/graph/generated/generated.go @@ -8,7 +8,6 @@ import ( "errors" "fmt" "strconv" - "sync" "sync/atomic" "time" @@ -16,7 +15,7 @@ import ( "github.com/99designs/gqlgen/graphql/introspection" "github.com/ClusterCockpit/cc-backend/internal/config" "github.com/ClusterCockpit/cc-backend/internal/graph/model" - "github.com/ClusterCockpit/cc-lib/schema" + "github.com/ClusterCockpit/cc-lib/v2/schema" gqlparser "github.com/vektah/gqlparser/v2" "github.com/vektah/gqlparser/v2/ast" ) @@ -25,20 +24,10 @@ import ( // NewExecutableSchema creates an ExecutableSchema from the ResolverRoot interface. 
func NewExecutableSchema(cfg Config) graphql.ExecutableSchema { - return &executableSchema{ - schema: cfg.Schema, - resolvers: cfg.Resolvers, - directives: cfg.Directives, - complexity: cfg.Complexity, - } + return &executableSchema{SchemaData: cfg.Schema, Resolvers: cfg.Resolvers, Directives: cfg.Directives, ComplexityRoot: cfg.Complexity} } -type Config struct { - Schema *ast.Schema - Resolvers ResolverRoot - Directives DirectiveRoot - Complexity ComplexityRoot -} +type Config = graphql.Config[ResolverRoot, DirectiveRoot, ComplexityRoot] type ResolverRoot interface { Cluster() ClusterResolver @@ -50,7 +39,8 @@ type ResolverRoot interface { SubCluster() SubClusterResolver } -type DirectiveRoot struct{} +type DirectiveRoot struct { +} type ComplexityRoot struct { Accelerator struct { @@ -65,6 +55,18 @@ type ComplexityRoot struct { SubClusters func(childComplexity int) int } + ClusterMetricWithName struct { + Data func(childComplexity int) int + Name func(childComplexity int) int + Timestep func(childComplexity int) int + Unit func(childComplexity int) int + } + + ClusterMetrics struct { + Metrics func(childComplexity int) int + NodeCount func(childComplexity int) int + } + ClusterSupport struct { Cluster func(childComplexity int) int SubClusters func(childComplexity int) int @@ -274,6 +276,7 @@ type ComplexityRoot struct { Cluster func(childComplexity int) int CpusAllocated func(childComplexity int) int GpusAllocated func(childComplexity int) int + HealthData func(childComplexity int) int HealthState func(childComplexity int) int Hostname func(childComplexity int) int ID func(childComplexity int) int @@ -318,6 +321,7 @@ type ComplexityRoot struct { Query struct { AllocatedNodes func(childComplexity int, cluster string) int + ClusterMetrics func(childComplexity int, cluster string, metrics []string, from time.Time, to time.Time) int Clusters func(childComplexity int) int GlobalMetrics func(childComplexity int) int Job func(childComplexity int, id string) int @@ 
-333,6 +337,7 @@ type ComplexityRoot struct { NodeStates func(childComplexity int, filter []*model.NodeFilter) int NodeStatesTimed func(childComplexity int, filter []*model.NodeFilter, typeArg string) int Nodes func(childComplexity int, filter []*model.NodeFilter, order *model.OrderByInput) int + NodesWithMeta func(childComplexity int, filter []*model.NodeFilter, order *model.OrderByInput) int RooflineHeatmap func(childComplexity int, filter []*model.JobFilter, rows int, cols int, minX float64, minY float64, maxX float64, maxY float64) int ScopedJobStats func(childComplexity int, id string, metrics []string, scopes []schema.MetricScope) int Tags func(childComplexity int) int @@ -355,7 +360,7 @@ type ComplexityRoot struct { Series struct { Data func(childComplexity int) int Hostname func(childComplexity int) int - Id func(childComplexity int) int + ID func(childComplexity int) int Statistics func(childComplexity int) int } @@ -462,6 +467,7 @@ type NodeResolver interface { SchedulerState(ctx context.Context, obj *schema.Node) (schema.SchedulerState, error) HealthState(ctx context.Context, obj *schema.Node) (string, error) MetaData(ctx context.Context, obj *schema.Node) (any, error) + HealthData(ctx context.Context, obj *schema.Node) (any, error) } type QueryResolver interface { Clusters(ctx context.Context) ([]*schema.Cluster, error) @@ -471,6 +477,7 @@ type QueryResolver interface { AllocatedNodes(ctx context.Context, cluster string) ([]*model.Count, error) Node(ctx context.Context, id string) (*schema.Node, error) Nodes(ctx context.Context, filter []*model.NodeFilter, order *model.OrderByInput) (*model.NodeStateResultList, error) + NodesWithMeta(ctx context.Context, filter []*model.NodeFilter, order *model.OrderByInput) (*model.NodeStateResultList, error) NodeStates(ctx context.Context, filter []*model.NodeFilter) ([]*model.NodeStates, error) NodeStatesTimed(ctx context.Context, filter []*model.NodeFilter, typeArg string) ([]*model.NodeStatesTimed, error) Job(ctx 
context.Context, id string) (*schema.Job, error) @@ -484,806 +491,840 @@ type QueryResolver interface { RooflineHeatmap(ctx context.Context, filter []*model.JobFilter, rows int, cols int, minX float64, minY float64, maxX float64, maxY float64) ([][]float64, error) NodeMetrics(ctx context.Context, cluster string, nodes []string, scopes []schema.MetricScope, metrics []string, from time.Time, to time.Time) ([]*model.NodeMetrics, error) NodeMetricsList(ctx context.Context, cluster string, subCluster string, stateFilter string, nodeFilter string, scopes []schema.MetricScope, metrics []string, from time.Time, to time.Time, page *model.PageRequest, resolution *int) (*model.NodesResultList, error) + ClusterMetrics(ctx context.Context, cluster string, metrics []string, from time.Time, to time.Time) (*model.ClusterMetrics, error) } type SubClusterResolver interface { NumberOfNodes(ctx context.Context, obj *schema.SubCluster) (int, error) } -type executableSchema struct { - schema *ast.Schema - resolvers ResolverRoot - directives DirectiveRoot - complexity ComplexityRoot -} +type executableSchema graphql.ExecutableSchemaState[ResolverRoot, DirectiveRoot, ComplexityRoot] func (e *executableSchema) Schema() *ast.Schema { - if e.schema != nil { - return e.schema + if e.SchemaData != nil { + return e.SchemaData } return parsedSchema } func (e *executableSchema) Complexity(ctx context.Context, typeName, field string, childComplexity int, rawArgs map[string]any) (int, bool) { - ec := executionContext{nil, e, 0, 0, nil} + ec := newExecutionContext(nil, e, nil) _ = ec switch typeName + "." 
+ field { case "Accelerator.id": - if e.complexity.Accelerator.ID == nil { + if e.ComplexityRoot.Accelerator.ID == nil { break } - return e.complexity.Accelerator.ID(childComplexity), true + return e.ComplexityRoot.Accelerator.ID(childComplexity), true case "Accelerator.model": - if e.complexity.Accelerator.Model == nil { + if e.ComplexityRoot.Accelerator.Model == nil { break } - return e.complexity.Accelerator.Model(childComplexity), true + return e.ComplexityRoot.Accelerator.Model(childComplexity), true case "Accelerator.type": - if e.complexity.Accelerator.Type == nil { + if e.ComplexityRoot.Accelerator.Type == nil { break } - return e.complexity.Accelerator.Type(childComplexity), true + return e.ComplexityRoot.Accelerator.Type(childComplexity), true case "Cluster.name": - if e.complexity.Cluster.Name == nil { + if e.ComplexityRoot.Cluster.Name == nil { break } - return e.complexity.Cluster.Name(childComplexity), true + return e.ComplexityRoot.Cluster.Name(childComplexity), true case "Cluster.partitions": - if e.complexity.Cluster.Partitions == nil { + if e.ComplexityRoot.Cluster.Partitions == nil { break } - return e.complexity.Cluster.Partitions(childComplexity), true + return e.ComplexityRoot.Cluster.Partitions(childComplexity), true case "Cluster.subClusters": - if e.complexity.Cluster.SubClusters == nil { + if e.ComplexityRoot.Cluster.SubClusters == nil { break } - return e.complexity.Cluster.SubClusters(childComplexity), true + return e.ComplexityRoot.Cluster.SubClusters(childComplexity), true + + case "ClusterMetricWithName.data": + if e.ComplexityRoot.ClusterMetricWithName.Data == nil { + break + } + + return e.ComplexityRoot.ClusterMetricWithName.Data(childComplexity), true + case "ClusterMetricWithName.name": + if e.ComplexityRoot.ClusterMetricWithName.Name == nil { + break + } + + return e.ComplexityRoot.ClusterMetricWithName.Name(childComplexity), true + case "ClusterMetricWithName.timestep": + if e.ComplexityRoot.ClusterMetricWithName.Timestep == 
nil { + break + } + + return e.ComplexityRoot.ClusterMetricWithName.Timestep(childComplexity), true + case "ClusterMetricWithName.unit": + if e.ComplexityRoot.ClusterMetricWithName.Unit == nil { + break + } + + return e.ComplexityRoot.ClusterMetricWithName.Unit(childComplexity), true + + case "ClusterMetrics.metrics": + if e.ComplexityRoot.ClusterMetrics.Metrics == nil { + break + } + + return e.ComplexityRoot.ClusterMetrics.Metrics(childComplexity), true + case "ClusterMetrics.nodeCount": + if e.ComplexityRoot.ClusterMetrics.NodeCount == nil { + break + } + + return e.ComplexityRoot.ClusterMetrics.NodeCount(childComplexity), true case "ClusterSupport.cluster": - if e.complexity.ClusterSupport.Cluster == nil { + if e.ComplexityRoot.ClusterSupport.Cluster == nil { break } - return e.complexity.ClusterSupport.Cluster(childComplexity), true + return e.ComplexityRoot.ClusterSupport.Cluster(childComplexity), true case "ClusterSupport.subClusters": - if e.complexity.ClusterSupport.SubClusters == nil { + if e.ComplexityRoot.ClusterSupport.SubClusters == nil { break } - return e.complexity.ClusterSupport.SubClusters(childComplexity), true + return e.ComplexityRoot.ClusterSupport.SubClusters(childComplexity), true case "Count.count": - if e.complexity.Count.Count == nil { + if e.ComplexityRoot.Count.Count == nil { break } - return e.complexity.Count.Count(childComplexity), true + return e.ComplexityRoot.Count.Count(childComplexity), true case "Count.name": - if e.complexity.Count.Name == nil { + if e.ComplexityRoot.Count.Name == nil { break } - return e.complexity.Count.Name(childComplexity), true + return e.ComplexityRoot.Count.Name(childComplexity), true case "EnergyFootprintValue.hardware": - if e.complexity.EnergyFootprintValue.Hardware == nil { + if e.ComplexityRoot.EnergyFootprintValue.Hardware == nil { break } - return e.complexity.EnergyFootprintValue.Hardware(childComplexity), true + return e.ComplexityRoot.EnergyFootprintValue.Hardware(childComplexity), true case 
"EnergyFootprintValue.metric": - if e.complexity.EnergyFootprintValue.Metric == nil { + if e.ComplexityRoot.EnergyFootprintValue.Metric == nil { break } - return e.complexity.EnergyFootprintValue.Metric(childComplexity), true + return e.ComplexityRoot.EnergyFootprintValue.Metric(childComplexity), true case "EnergyFootprintValue.value": - if e.complexity.EnergyFootprintValue.Value == nil { + if e.ComplexityRoot.EnergyFootprintValue.Value == nil { break } - return e.complexity.EnergyFootprintValue.Value(childComplexity), true + return e.ComplexityRoot.EnergyFootprintValue.Value(childComplexity), true case "FootprintValue.name": - if e.complexity.FootprintValue.Name == nil { + if e.ComplexityRoot.FootprintValue.Name == nil { break } - return e.complexity.FootprintValue.Name(childComplexity), true + return e.ComplexityRoot.FootprintValue.Name(childComplexity), true case "FootprintValue.stat": - if e.complexity.FootprintValue.Stat == nil { + if e.ComplexityRoot.FootprintValue.Stat == nil { break } - return e.complexity.FootprintValue.Stat(childComplexity), true + return e.ComplexityRoot.FootprintValue.Stat(childComplexity), true case "FootprintValue.value": - if e.complexity.FootprintValue.Value == nil { + if e.ComplexityRoot.FootprintValue.Value == nil { break } - return e.complexity.FootprintValue.Value(childComplexity), true + return e.ComplexityRoot.FootprintValue.Value(childComplexity), true case "Footprints.metrics": - if e.complexity.Footprints.Metrics == nil { + if e.ComplexityRoot.Footprints.Metrics == nil { break } - return e.complexity.Footprints.Metrics(childComplexity), true + return e.ComplexityRoot.Footprints.Metrics(childComplexity), true case "Footprints.timeWeights": - if e.complexity.Footprints.TimeWeights == nil { + if e.ComplexityRoot.Footprints.TimeWeights == nil { break } - return e.complexity.Footprints.TimeWeights(childComplexity), true + return e.ComplexityRoot.Footprints.TimeWeights(childComplexity), true case 
"GlobalMetricListItem.availability": - if e.complexity.GlobalMetricListItem.Availability == nil { + if e.ComplexityRoot.GlobalMetricListItem.Availability == nil { break } - return e.complexity.GlobalMetricListItem.Availability(childComplexity), true + return e.ComplexityRoot.GlobalMetricListItem.Availability(childComplexity), true case "GlobalMetricListItem.footprint": - if e.complexity.GlobalMetricListItem.Footprint == nil { + if e.ComplexityRoot.GlobalMetricListItem.Footprint == nil { break } - return e.complexity.GlobalMetricListItem.Footprint(childComplexity), true + return e.ComplexityRoot.GlobalMetricListItem.Footprint(childComplexity), true case "GlobalMetricListItem.name": - if e.complexity.GlobalMetricListItem.Name == nil { + if e.ComplexityRoot.GlobalMetricListItem.Name == nil { break } - return e.complexity.GlobalMetricListItem.Name(childComplexity), true + return e.ComplexityRoot.GlobalMetricListItem.Name(childComplexity), true case "GlobalMetricListItem.scope": - if e.complexity.GlobalMetricListItem.Scope == nil { + if e.ComplexityRoot.GlobalMetricListItem.Scope == nil { break } - return e.complexity.GlobalMetricListItem.Scope(childComplexity), true + return e.ComplexityRoot.GlobalMetricListItem.Scope(childComplexity), true case "GlobalMetricListItem.unit": - if e.complexity.GlobalMetricListItem.Unit == nil { + if e.ComplexityRoot.GlobalMetricListItem.Unit == nil { break } - return e.complexity.GlobalMetricListItem.Unit(childComplexity), true + return e.ComplexityRoot.GlobalMetricListItem.Unit(childComplexity), true case "HistoPoint.count": - if e.complexity.HistoPoint.Count == nil { + if e.ComplexityRoot.HistoPoint.Count == nil { break } - return e.complexity.HistoPoint.Count(childComplexity), true + return e.ComplexityRoot.HistoPoint.Count(childComplexity), true case "HistoPoint.value": - if e.complexity.HistoPoint.Value == nil { + if e.ComplexityRoot.HistoPoint.Value == nil { break } - return e.complexity.HistoPoint.Value(childComplexity), true + 
return e.ComplexityRoot.HistoPoint.Value(childComplexity), true case "IntRangeOutput.from": - if e.complexity.IntRangeOutput.From == nil { + if e.ComplexityRoot.IntRangeOutput.From == nil { break } - return e.complexity.IntRangeOutput.From(childComplexity), true + return e.ComplexityRoot.IntRangeOutput.From(childComplexity), true case "IntRangeOutput.to": - if e.complexity.IntRangeOutput.To == nil { + if e.ComplexityRoot.IntRangeOutput.To == nil { break } - return e.complexity.IntRangeOutput.To(childComplexity), true + return e.ComplexityRoot.IntRangeOutput.To(childComplexity), true case "Job.arrayJobId": - if e.complexity.Job.ArrayJobID == nil { + if e.ComplexityRoot.Job.ArrayJobID == nil { break } - return e.complexity.Job.ArrayJobID(childComplexity), true + return e.ComplexityRoot.Job.ArrayJobID(childComplexity), true case "Job.cluster": - if e.complexity.Job.Cluster == nil { + if e.ComplexityRoot.Job.Cluster == nil { break } - return e.complexity.Job.Cluster(childComplexity), true + return e.ComplexityRoot.Job.Cluster(childComplexity), true case "Job.concurrentJobs": - if e.complexity.Job.ConcurrentJobs == nil { + if e.ComplexityRoot.Job.ConcurrentJobs == nil { break } - return e.complexity.Job.ConcurrentJobs(childComplexity), true + return e.ComplexityRoot.Job.ConcurrentJobs(childComplexity), true case "Job.duration": - if e.complexity.Job.Duration == nil { + if e.ComplexityRoot.Job.Duration == nil { break } - return e.complexity.Job.Duration(childComplexity), true + return e.ComplexityRoot.Job.Duration(childComplexity), true case "Job.energy": - if e.complexity.Job.Energy == nil { + if e.ComplexityRoot.Job.Energy == nil { break } - return e.complexity.Job.Energy(childComplexity), true + return e.ComplexityRoot.Job.Energy(childComplexity), true case "Job.energyFootprint": - if e.complexity.Job.EnergyFootprint == nil { + if e.ComplexityRoot.Job.EnergyFootprint == nil { break } - return e.complexity.Job.EnergyFootprint(childComplexity), true + return 
e.ComplexityRoot.Job.EnergyFootprint(childComplexity), true case "Job.footprint": - if e.complexity.Job.Footprint == nil { + if e.ComplexityRoot.Job.Footprint == nil { break } - return e.complexity.Job.Footprint(childComplexity), true + return e.ComplexityRoot.Job.Footprint(childComplexity), true case "Job.id": - if e.complexity.Job.ID == nil { + if e.ComplexityRoot.Job.ID == nil { break } - return e.complexity.Job.ID(childComplexity), true + return e.ComplexityRoot.Job.ID(childComplexity), true case "Job.jobId": - if e.complexity.Job.JobID == nil { + if e.ComplexityRoot.Job.JobID == nil { break } - return e.complexity.Job.JobID(childComplexity), true + return e.ComplexityRoot.Job.JobID(childComplexity), true case "Job.metaData": - if e.complexity.Job.MetaData == nil { + if e.ComplexityRoot.Job.MetaData == nil { break } - return e.complexity.Job.MetaData(childComplexity), true + return e.ComplexityRoot.Job.MetaData(childComplexity), true case "Job.monitoringStatus": - if e.complexity.Job.MonitoringStatus == nil { + if e.ComplexityRoot.Job.MonitoringStatus == nil { break } - return e.complexity.Job.MonitoringStatus(childComplexity), true + return e.ComplexityRoot.Job.MonitoringStatus(childComplexity), true case "Job.numAcc": - if e.complexity.Job.NumAcc == nil { + if e.ComplexityRoot.Job.NumAcc == nil { break } - return e.complexity.Job.NumAcc(childComplexity), true + return e.ComplexityRoot.Job.NumAcc(childComplexity), true case "Job.numHWThreads": - if e.complexity.Job.NumHWThreads == nil { + if e.ComplexityRoot.Job.NumHWThreads == nil { break } - return e.complexity.Job.NumHWThreads(childComplexity), true + return e.ComplexityRoot.Job.NumHWThreads(childComplexity), true case "Job.numNodes": - if e.complexity.Job.NumNodes == nil { + if e.ComplexityRoot.Job.NumNodes == nil { break } - return e.complexity.Job.NumNodes(childComplexity), true + return e.ComplexityRoot.Job.NumNodes(childComplexity), true case "Job.partition": - if e.complexity.Job.Partition == nil { + 
if e.ComplexityRoot.Job.Partition == nil { break } - return e.complexity.Job.Partition(childComplexity), true + return e.ComplexityRoot.Job.Partition(childComplexity), true case "Job.project": - if e.complexity.Job.Project == nil { + if e.ComplexityRoot.Job.Project == nil { break } - return e.complexity.Job.Project(childComplexity), true + return e.ComplexityRoot.Job.Project(childComplexity), true case "Job.resources": - if e.complexity.Job.Resources == nil { + if e.ComplexityRoot.Job.Resources == nil { break } - return e.complexity.Job.Resources(childComplexity), true + return e.ComplexityRoot.Job.Resources(childComplexity), true case "Job.SMT": - if e.complexity.Job.SMT == nil { + if e.ComplexityRoot.Job.SMT == nil { break } - return e.complexity.Job.SMT(childComplexity), true + return e.ComplexityRoot.Job.SMT(childComplexity), true case "Job.shared": - if e.complexity.Job.Shared == nil { + if e.ComplexityRoot.Job.Shared == nil { break } - return e.complexity.Job.Shared(childComplexity), true + return e.ComplexityRoot.Job.Shared(childComplexity), true case "Job.startTime": - if e.complexity.Job.StartTime == nil { + if e.ComplexityRoot.Job.StartTime == nil { break } - return e.complexity.Job.StartTime(childComplexity), true + return e.ComplexityRoot.Job.StartTime(childComplexity), true case "Job.state": - if e.complexity.Job.State == nil { + if e.ComplexityRoot.Job.State == nil { break } - return e.complexity.Job.State(childComplexity), true + return e.ComplexityRoot.Job.State(childComplexity), true case "Job.subCluster": - if e.complexity.Job.SubCluster == nil { + if e.ComplexityRoot.Job.SubCluster == nil { break } - return e.complexity.Job.SubCluster(childComplexity), true + return e.ComplexityRoot.Job.SubCluster(childComplexity), true case "Job.tags": - if e.complexity.Job.Tags == nil { + if e.ComplexityRoot.Job.Tags == nil { break } - return e.complexity.Job.Tags(childComplexity), true + return e.ComplexityRoot.Job.Tags(childComplexity), true case "Job.user": 
- if e.complexity.Job.User == nil { + if e.ComplexityRoot.Job.User == nil { break } - return e.complexity.Job.User(childComplexity), true + return e.ComplexityRoot.Job.User(childComplexity), true case "Job.userData": - if e.complexity.Job.UserData == nil { + if e.ComplexityRoot.Job.UserData == nil { break } - return e.complexity.Job.UserData(childComplexity), true + return e.ComplexityRoot.Job.UserData(childComplexity), true case "Job.walltime": - if e.complexity.Job.Walltime == nil { + if e.ComplexityRoot.Job.Walltime == nil { break } - return e.complexity.Job.Walltime(childComplexity), true + return e.ComplexityRoot.Job.Walltime(childComplexity), true case "JobLink.id": - if e.complexity.JobLink.ID == nil { + if e.ComplexityRoot.JobLink.ID == nil { break } - return e.complexity.JobLink.ID(childComplexity), true + return e.ComplexityRoot.JobLink.ID(childComplexity), true case "JobLink.jobId": - if e.complexity.JobLink.JobID == nil { + if e.ComplexityRoot.JobLink.JobID == nil { break } - return e.complexity.JobLink.JobID(childComplexity), true + return e.ComplexityRoot.JobLink.JobID(childComplexity), true case "JobLinkResultList.count": - if e.complexity.JobLinkResultList.Count == nil { + if e.ComplexityRoot.JobLinkResultList.Count == nil { break } - return e.complexity.JobLinkResultList.Count(childComplexity), true + return e.ComplexityRoot.JobLinkResultList.Count(childComplexity), true case "JobLinkResultList.items": - if e.complexity.JobLinkResultList.Items == nil { + if e.ComplexityRoot.JobLinkResultList.Items == nil { break } - return e.complexity.JobLinkResultList.Items(childComplexity), true + return e.ComplexityRoot.JobLinkResultList.Items(childComplexity), true case "JobLinkResultList.listQuery": - if e.complexity.JobLinkResultList.ListQuery == nil { + if e.ComplexityRoot.JobLinkResultList.ListQuery == nil { break } - return e.complexity.JobLinkResultList.ListQuery(childComplexity), true + return 
e.ComplexityRoot.JobLinkResultList.ListQuery(childComplexity), true case "JobMetric.series": - if e.complexity.JobMetric.Series == nil { + if e.ComplexityRoot.JobMetric.Series == nil { break } - return e.complexity.JobMetric.Series(childComplexity), true + return e.ComplexityRoot.JobMetric.Series(childComplexity), true case "JobMetric.statisticsSeries": - if e.complexity.JobMetric.StatisticsSeries == nil { + if e.ComplexityRoot.JobMetric.StatisticsSeries == nil { break } - return e.complexity.JobMetric.StatisticsSeries(childComplexity), true + return e.ComplexityRoot.JobMetric.StatisticsSeries(childComplexity), true case "JobMetric.timestep": - if e.complexity.JobMetric.Timestep == nil { + if e.ComplexityRoot.JobMetric.Timestep == nil { break } - return e.complexity.JobMetric.Timestep(childComplexity), true + return e.ComplexityRoot.JobMetric.Timestep(childComplexity), true case "JobMetric.unit": - if e.complexity.JobMetric.Unit == nil { + if e.ComplexityRoot.JobMetric.Unit == nil { break } - return e.complexity.JobMetric.Unit(childComplexity), true + return e.ComplexityRoot.JobMetric.Unit(childComplexity), true case "JobMetricWithName.metric": - if e.complexity.JobMetricWithName.Metric == nil { + if e.ComplexityRoot.JobMetricWithName.Metric == nil { break } - return e.complexity.JobMetricWithName.Metric(childComplexity), true + return e.ComplexityRoot.JobMetricWithName.Metric(childComplexity), true case "JobMetricWithName.name": - if e.complexity.JobMetricWithName.Name == nil { + if e.ComplexityRoot.JobMetricWithName.Name == nil { break } - return e.complexity.JobMetricWithName.Name(childComplexity), true + return e.ComplexityRoot.JobMetricWithName.Name(childComplexity), true case "JobMetricWithName.scope": - if e.complexity.JobMetricWithName.Scope == nil { + if e.ComplexityRoot.JobMetricWithName.Scope == nil { break } - return e.complexity.JobMetricWithName.Scope(childComplexity), true + return e.ComplexityRoot.JobMetricWithName.Scope(childComplexity), true case 
"JobResultList.count": - if e.complexity.JobResultList.Count == nil { + if e.ComplexityRoot.JobResultList.Count == nil { break } - return e.complexity.JobResultList.Count(childComplexity), true + return e.ComplexityRoot.JobResultList.Count(childComplexity), true case "JobResultList.hasNextPage": - if e.complexity.JobResultList.HasNextPage == nil { + if e.ComplexityRoot.JobResultList.HasNextPage == nil { break } - return e.complexity.JobResultList.HasNextPage(childComplexity), true + return e.ComplexityRoot.JobResultList.HasNextPage(childComplexity), true case "JobResultList.items": - if e.complexity.JobResultList.Items == nil { + if e.ComplexityRoot.JobResultList.Items == nil { break } - return e.complexity.JobResultList.Items(childComplexity), true + return e.ComplexityRoot.JobResultList.Items(childComplexity), true case "JobResultList.limit": - if e.complexity.JobResultList.Limit == nil { + if e.ComplexityRoot.JobResultList.Limit == nil { break } - return e.complexity.JobResultList.Limit(childComplexity), true + return e.ComplexityRoot.JobResultList.Limit(childComplexity), true case "JobResultList.offset": - if e.complexity.JobResultList.Offset == nil { + if e.ComplexityRoot.JobResultList.Offset == nil { break } - return e.complexity.JobResultList.Offset(childComplexity), true + return e.ComplexityRoot.JobResultList.Offset(childComplexity), true case "JobStats.cluster": - if e.complexity.JobStats.Cluster == nil { + if e.ComplexityRoot.JobStats.Cluster == nil { break } - return e.complexity.JobStats.Cluster(childComplexity), true + return e.ComplexityRoot.JobStats.Cluster(childComplexity), true case "JobStats.duration": - if e.complexity.JobStats.Duration == nil { + if e.ComplexityRoot.JobStats.Duration == nil { break } - return e.complexity.JobStats.Duration(childComplexity), true + return e.ComplexityRoot.JobStats.Duration(childComplexity), true case "JobStats.id": - if e.complexity.JobStats.ID == nil { + if e.ComplexityRoot.JobStats.ID == nil { break } - return 
e.complexity.JobStats.ID(childComplexity), true + return e.ComplexityRoot.JobStats.ID(childComplexity), true case "JobStats.jobId": - if e.complexity.JobStats.JobID == nil { + if e.ComplexityRoot.JobStats.JobID == nil { break } - return e.complexity.JobStats.JobID(childComplexity), true + return e.ComplexityRoot.JobStats.JobID(childComplexity), true case "JobStats.numAccelerators": - if e.complexity.JobStats.NumAccelerators == nil { + if e.ComplexityRoot.JobStats.NumAccelerators == nil { break } - return e.complexity.JobStats.NumAccelerators(childComplexity), true + return e.ComplexityRoot.JobStats.NumAccelerators(childComplexity), true case "JobStats.numHWThreads": - if e.complexity.JobStats.NumHWThreads == nil { + if e.ComplexityRoot.JobStats.NumHWThreads == nil { break } - return e.complexity.JobStats.NumHWThreads(childComplexity), true + return e.ComplexityRoot.JobStats.NumHWThreads(childComplexity), true case "JobStats.numNodes": - if e.complexity.JobStats.NumNodes == nil { + if e.ComplexityRoot.JobStats.NumNodes == nil { break } - return e.complexity.JobStats.NumNodes(childComplexity), true + return e.ComplexityRoot.JobStats.NumNodes(childComplexity), true case "JobStats.startTime": - if e.complexity.JobStats.StartTime == nil { + if e.ComplexityRoot.JobStats.StartTime == nil { break } - return e.complexity.JobStats.StartTime(childComplexity), true + return e.ComplexityRoot.JobStats.StartTime(childComplexity), true case "JobStats.stats": - if e.complexity.JobStats.Stats == nil { + if e.ComplexityRoot.JobStats.Stats == nil { break } - return e.complexity.JobStats.Stats(childComplexity), true + return e.ComplexityRoot.JobStats.Stats(childComplexity), true case "JobStats.subCluster": - if e.complexity.JobStats.SubCluster == nil { + if e.ComplexityRoot.JobStats.SubCluster == nil { break } - return e.complexity.JobStats.SubCluster(childComplexity), true + return e.ComplexityRoot.JobStats.SubCluster(childComplexity), true case "JobsStatistics.histDuration": - if 
e.complexity.JobsStatistics.HistDuration == nil { + if e.ComplexityRoot.JobsStatistics.HistDuration == nil { break } - return e.complexity.JobsStatistics.HistDuration(childComplexity), true + return e.ComplexityRoot.JobsStatistics.HistDuration(childComplexity), true case "JobsStatistics.histMetrics": - if e.complexity.JobsStatistics.HistMetrics == nil { + if e.ComplexityRoot.JobsStatistics.HistMetrics == nil { break } - return e.complexity.JobsStatistics.HistMetrics(childComplexity), true + return e.ComplexityRoot.JobsStatistics.HistMetrics(childComplexity), true case "JobsStatistics.histNumAccs": - if e.complexity.JobsStatistics.HistNumAccs == nil { + if e.ComplexityRoot.JobsStatistics.HistNumAccs == nil { break } - return e.complexity.JobsStatistics.HistNumAccs(childComplexity), true + return e.ComplexityRoot.JobsStatistics.HistNumAccs(childComplexity), true case "JobsStatistics.histNumCores": - if e.complexity.JobsStatistics.HistNumCores == nil { + if e.ComplexityRoot.JobsStatistics.HistNumCores == nil { break } - return e.complexity.JobsStatistics.HistNumCores(childComplexity), true + return e.ComplexityRoot.JobsStatistics.HistNumCores(childComplexity), true case "JobsStatistics.histNumNodes": - if e.complexity.JobsStatistics.HistNumNodes == nil { + if e.ComplexityRoot.JobsStatistics.HistNumNodes == nil { break } - return e.complexity.JobsStatistics.HistNumNodes(childComplexity), true + return e.ComplexityRoot.JobsStatistics.HistNumNodes(childComplexity), true case "JobsStatistics.id": - if e.complexity.JobsStatistics.ID == nil { + if e.ComplexityRoot.JobsStatistics.ID == nil { break } - return e.complexity.JobsStatistics.ID(childComplexity), true + return e.ComplexityRoot.JobsStatistics.ID(childComplexity), true case "JobsStatistics.name": - if e.complexity.JobsStatistics.Name == nil { + if e.ComplexityRoot.JobsStatistics.Name == nil { break } - return e.complexity.JobsStatistics.Name(childComplexity), true + return 
e.ComplexityRoot.JobsStatistics.Name(childComplexity), true case "JobsStatistics.runningJobs": - if e.complexity.JobsStatistics.RunningJobs == nil { + if e.ComplexityRoot.JobsStatistics.RunningJobs == nil { break } - return e.complexity.JobsStatistics.RunningJobs(childComplexity), true + return e.ComplexityRoot.JobsStatistics.RunningJobs(childComplexity), true case "JobsStatistics.shortJobs": - if e.complexity.JobsStatistics.ShortJobs == nil { + if e.ComplexityRoot.JobsStatistics.ShortJobs == nil { break } - return e.complexity.JobsStatistics.ShortJobs(childComplexity), true + return e.ComplexityRoot.JobsStatistics.ShortJobs(childComplexity), true case "JobsStatistics.totalAccHours": - if e.complexity.JobsStatistics.TotalAccHours == nil { + if e.ComplexityRoot.JobsStatistics.TotalAccHours == nil { break } - return e.complexity.JobsStatistics.TotalAccHours(childComplexity), true + return e.ComplexityRoot.JobsStatistics.TotalAccHours(childComplexity), true case "JobsStatistics.totalAccs": - if e.complexity.JobsStatistics.TotalAccs == nil { + if e.ComplexityRoot.JobsStatistics.TotalAccs == nil { break } - return e.complexity.JobsStatistics.TotalAccs(childComplexity), true + return e.ComplexityRoot.JobsStatistics.TotalAccs(childComplexity), true case "JobsStatistics.totalCoreHours": - if e.complexity.JobsStatistics.TotalCoreHours == nil { + if e.ComplexityRoot.JobsStatistics.TotalCoreHours == nil { break } - return e.complexity.JobsStatistics.TotalCoreHours(childComplexity), true + return e.ComplexityRoot.JobsStatistics.TotalCoreHours(childComplexity), true case "JobsStatistics.totalCores": - if e.complexity.JobsStatistics.TotalCores == nil { + if e.ComplexityRoot.JobsStatistics.TotalCores == nil { break } - return e.complexity.JobsStatistics.TotalCores(childComplexity), true + return e.ComplexityRoot.JobsStatistics.TotalCores(childComplexity), true case "JobsStatistics.totalJobs": - if e.complexity.JobsStatistics.TotalJobs == nil { + if 
e.ComplexityRoot.JobsStatistics.TotalJobs == nil { break } - return e.complexity.JobsStatistics.TotalJobs(childComplexity), true + return e.ComplexityRoot.JobsStatistics.TotalJobs(childComplexity), true case "JobsStatistics.totalNodeHours": - if e.complexity.JobsStatistics.TotalNodeHours == nil { + if e.ComplexityRoot.JobsStatistics.TotalNodeHours == nil { break } - return e.complexity.JobsStatistics.TotalNodeHours(childComplexity), true + return e.ComplexityRoot.JobsStatistics.TotalNodeHours(childComplexity), true case "JobsStatistics.totalNodes": - if e.complexity.JobsStatistics.TotalNodes == nil { + if e.ComplexityRoot.JobsStatistics.TotalNodes == nil { break } - return e.complexity.JobsStatistics.TotalNodes(childComplexity), true + return e.ComplexityRoot.JobsStatistics.TotalNodes(childComplexity), true case "JobsStatistics.totalUsers": - if e.complexity.JobsStatistics.TotalUsers == nil { + if e.ComplexityRoot.JobsStatistics.TotalUsers == nil { break } - return e.complexity.JobsStatistics.TotalUsers(childComplexity), true + return e.ComplexityRoot.JobsStatistics.TotalUsers(childComplexity), true case "JobsStatistics.totalWalltime": - if e.complexity.JobsStatistics.TotalWalltime == nil { + if e.ComplexityRoot.JobsStatistics.TotalWalltime == nil { break } - return e.complexity.JobsStatistics.TotalWalltime(childComplexity), true + return e.ComplexityRoot.JobsStatistics.TotalWalltime(childComplexity), true case "MetricConfig.aggregation": - if e.complexity.MetricConfig.Aggregation == nil { + if e.ComplexityRoot.MetricConfig.Aggregation == nil { break } - return e.complexity.MetricConfig.Aggregation(childComplexity), true + return e.ComplexityRoot.MetricConfig.Aggregation(childComplexity), true case "MetricConfig.alert": - if e.complexity.MetricConfig.Alert == nil { + if e.ComplexityRoot.MetricConfig.Alert == nil { break } - return e.complexity.MetricConfig.Alert(childComplexity), true + return e.ComplexityRoot.MetricConfig.Alert(childComplexity), true case 
"MetricConfig.caution": - if e.complexity.MetricConfig.Caution == nil { + if e.ComplexityRoot.MetricConfig.Caution == nil { break } - return e.complexity.MetricConfig.Caution(childComplexity), true + return e.ComplexityRoot.MetricConfig.Caution(childComplexity), true case "MetricConfig.lowerIsBetter": - if e.complexity.MetricConfig.LowerIsBetter == nil { + if e.ComplexityRoot.MetricConfig.LowerIsBetter == nil { break } - return e.complexity.MetricConfig.LowerIsBetter(childComplexity), true + return e.ComplexityRoot.MetricConfig.LowerIsBetter(childComplexity), true case "MetricConfig.name": - if e.complexity.MetricConfig.Name == nil { + if e.ComplexityRoot.MetricConfig.Name == nil { break } - return e.complexity.MetricConfig.Name(childComplexity), true + return e.ComplexityRoot.MetricConfig.Name(childComplexity), true case "MetricConfig.normal": - if e.complexity.MetricConfig.Normal == nil { + if e.ComplexityRoot.MetricConfig.Normal == nil { break } - return e.complexity.MetricConfig.Normal(childComplexity), true + return e.ComplexityRoot.MetricConfig.Normal(childComplexity), true case "MetricConfig.peak": - if e.complexity.MetricConfig.Peak == nil { + if e.ComplexityRoot.MetricConfig.Peak == nil { break } - return e.complexity.MetricConfig.Peak(childComplexity), true + return e.ComplexityRoot.MetricConfig.Peak(childComplexity), true case "MetricConfig.scope": - if e.complexity.MetricConfig.Scope == nil { + if e.ComplexityRoot.MetricConfig.Scope == nil { break } - return e.complexity.MetricConfig.Scope(childComplexity), true + return e.ComplexityRoot.MetricConfig.Scope(childComplexity), true case "MetricConfig.subClusters": - if e.complexity.MetricConfig.SubClusters == nil { + if e.ComplexityRoot.MetricConfig.SubClusters == nil { break } - return e.complexity.MetricConfig.SubClusters(childComplexity), true + return e.ComplexityRoot.MetricConfig.SubClusters(childComplexity), true case "MetricConfig.timestep": - if e.complexity.MetricConfig.Timestep == nil { + if 
e.ComplexityRoot.MetricConfig.Timestep == nil { break } - return e.complexity.MetricConfig.Timestep(childComplexity), true + return e.ComplexityRoot.MetricConfig.Timestep(childComplexity), true case "MetricConfig.unit": - if e.complexity.MetricConfig.Unit == nil { + if e.ComplexityRoot.MetricConfig.Unit == nil { break } - return e.complexity.MetricConfig.Unit(childComplexity), true + return e.ComplexityRoot.MetricConfig.Unit(childComplexity), true case "MetricFootprints.data": - if e.complexity.MetricFootprints.Data == nil { + if e.ComplexityRoot.MetricFootprints.Data == nil { break } - return e.complexity.MetricFootprints.Data(childComplexity), true + return e.ComplexityRoot.MetricFootprints.Data(childComplexity), true case "MetricFootprints.metric": - if e.complexity.MetricFootprints.Metric == nil { + if e.ComplexityRoot.MetricFootprints.Metric == nil { break } - return e.complexity.MetricFootprints.Metric(childComplexity), true + return e.ComplexityRoot.MetricFootprints.Metric(childComplexity), true case "MetricHistoPoint.bin": - if e.complexity.MetricHistoPoint.Bin == nil { + if e.ComplexityRoot.MetricHistoPoint.Bin == nil { break } - return e.complexity.MetricHistoPoint.Bin(childComplexity), true + return e.ComplexityRoot.MetricHistoPoint.Bin(childComplexity), true case "MetricHistoPoint.count": - if e.complexity.MetricHistoPoint.Count == nil { + if e.ComplexityRoot.MetricHistoPoint.Count == nil { break } - return e.complexity.MetricHistoPoint.Count(childComplexity), true + return e.ComplexityRoot.MetricHistoPoint.Count(childComplexity), true case "MetricHistoPoint.max": - if e.complexity.MetricHistoPoint.Max == nil { + if e.ComplexityRoot.MetricHistoPoint.Max == nil { break } - return e.complexity.MetricHistoPoint.Max(childComplexity), true + return e.ComplexityRoot.MetricHistoPoint.Max(childComplexity), true case "MetricHistoPoint.min": - if e.complexity.MetricHistoPoint.Min == nil { + if e.ComplexityRoot.MetricHistoPoint.Min == nil { break } - return 
e.complexity.MetricHistoPoint.Min(childComplexity), true + return e.ComplexityRoot.MetricHistoPoint.Min(childComplexity), true case "MetricHistoPoints.data": - if e.complexity.MetricHistoPoints.Data == nil { + if e.ComplexityRoot.MetricHistoPoints.Data == nil { break } - return e.complexity.MetricHistoPoints.Data(childComplexity), true + return e.ComplexityRoot.MetricHistoPoints.Data(childComplexity), true case "MetricHistoPoints.metric": - if e.complexity.MetricHistoPoints.Metric == nil { + if e.ComplexityRoot.MetricHistoPoints.Metric == nil { break } - return e.complexity.MetricHistoPoints.Metric(childComplexity), true + return e.ComplexityRoot.MetricHistoPoints.Metric(childComplexity), true case "MetricHistoPoints.stat": - if e.complexity.MetricHistoPoints.Stat == nil { + if e.ComplexityRoot.MetricHistoPoints.Stat == nil { break } - return e.complexity.MetricHistoPoints.Stat(childComplexity), true + return e.ComplexityRoot.MetricHistoPoints.Stat(childComplexity), true case "MetricHistoPoints.unit": - if e.complexity.MetricHistoPoints.Unit == nil { + if e.ComplexityRoot.MetricHistoPoints.Unit == nil { break } - return e.complexity.MetricHistoPoints.Unit(childComplexity), true + return e.ComplexityRoot.MetricHistoPoints.Unit(childComplexity), true case "MetricStatistics.avg": - if e.complexity.MetricStatistics.Avg == nil { + if e.ComplexityRoot.MetricStatistics.Avg == nil { break } - return e.complexity.MetricStatistics.Avg(childComplexity), true + return e.ComplexityRoot.MetricStatistics.Avg(childComplexity), true case "MetricStatistics.max": - if e.complexity.MetricStatistics.Max == nil { + if e.ComplexityRoot.MetricStatistics.Max == nil { break } - return e.complexity.MetricStatistics.Max(childComplexity), true + return e.ComplexityRoot.MetricStatistics.Max(childComplexity), true case "MetricStatistics.min": - if e.complexity.MetricStatistics.Min == nil { + if e.ComplexityRoot.MetricStatistics.Min == nil { break } - return 
e.complexity.MetricStatistics.Min(childComplexity), true + return e.ComplexityRoot.MetricStatistics.Min(childComplexity), true case "MetricValue.name": - if e.complexity.MetricValue.Name == nil { + if e.ComplexityRoot.MetricValue.Name == nil { break } - return e.complexity.MetricValue.Name(childComplexity), true + return e.ComplexityRoot.MetricValue.Name(childComplexity), true case "MetricValue.unit": - if e.complexity.MetricValue.Unit == nil { + if e.ComplexityRoot.MetricValue.Unit == nil { break } - return e.complexity.MetricValue.Unit(childComplexity), true + return e.ComplexityRoot.MetricValue.Unit(childComplexity), true case "MetricValue.value": - if e.complexity.MetricValue.Value == nil { + if e.ComplexityRoot.MetricValue.Value == nil { break } - return e.complexity.MetricValue.Value(childComplexity), true + return e.ComplexityRoot.MetricValue.Value(childComplexity), true case "Mutation.addTagsToJob": - if e.complexity.Mutation.AddTagsToJob == nil { + if e.ComplexityRoot.Mutation.AddTagsToJob == nil { break } @@ -1292,9 +1333,9 @@ func (e *executableSchema) Complexity(ctx context.Context, typeName, field strin return 0, false } - return e.complexity.Mutation.AddTagsToJob(childComplexity, args["job"].(string), args["tagIds"].([]string)), true + return e.ComplexityRoot.Mutation.AddTagsToJob(childComplexity, args["job"].(string), args["tagIds"].([]string)), true case "Mutation.createTag": - if e.complexity.Mutation.CreateTag == nil { + if e.ComplexityRoot.Mutation.CreateTag == nil { break } @@ -1303,9 +1344,9 @@ func (e *executableSchema) Complexity(ctx context.Context, typeName, field strin return 0, false } - return e.complexity.Mutation.CreateTag(childComplexity, args["type"].(string), args["name"].(string), args["scope"].(string)), true + return e.ComplexityRoot.Mutation.CreateTag(childComplexity, args["type"].(string), args["name"].(string), args["scope"].(string)), true case "Mutation.deleteTag": - if e.complexity.Mutation.DeleteTag == nil { + if 
e.ComplexityRoot.Mutation.DeleteTag == nil { break } @@ -1314,9 +1355,9 @@ func (e *executableSchema) Complexity(ctx context.Context, typeName, field strin return 0, false } - return e.complexity.Mutation.DeleteTag(childComplexity, args["id"].(string)), true + return e.ComplexityRoot.Mutation.DeleteTag(childComplexity, args["id"].(string)), true case "Mutation.removeTagFromList": - if e.complexity.Mutation.RemoveTagFromList == nil { + if e.ComplexityRoot.Mutation.RemoveTagFromList == nil { break } @@ -1325,9 +1366,9 @@ func (e *executableSchema) Complexity(ctx context.Context, typeName, field strin return 0, false } - return e.complexity.Mutation.RemoveTagFromList(childComplexity, args["tagIds"].([]string)), true + return e.ComplexityRoot.Mutation.RemoveTagFromList(childComplexity, args["tagIds"].([]string)), true case "Mutation.removeTagsFromJob": - if e.complexity.Mutation.RemoveTagsFromJob == nil { + if e.ComplexityRoot.Mutation.RemoveTagsFromJob == nil { break } @@ -1336,9 +1377,9 @@ func (e *executableSchema) Complexity(ctx context.Context, typeName, field strin return 0, false } - return e.complexity.Mutation.RemoveTagsFromJob(childComplexity, args["job"].(string), args["tagIds"].([]string)), true + return e.ComplexityRoot.Mutation.RemoveTagsFromJob(childComplexity, args["job"].(string), args["tagIds"].([]string)), true case "Mutation.updateConfiguration": - if e.complexity.Mutation.UpdateConfiguration == nil { + if e.ComplexityRoot.Mutation.UpdateConfiguration == nil { break } @@ -1347,221 +1388,222 @@ func (e *executableSchema) Complexity(ctx context.Context, typeName, field strin return 0, false } - return e.complexity.Mutation.UpdateConfiguration(childComplexity, args["name"].(string), args["value"].(string)), true + return e.ComplexityRoot.Mutation.UpdateConfiguration(childComplexity, args["name"].(string), args["value"].(string)), true case "NamedStats.data": - if e.complexity.NamedStats.Data == nil { + if e.ComplexityRoot.NamedStats.Data == nil { break 
} - return e.complexity.NamedStats.Data(childComplexity), true + return e.ComplexityRoot.NamedStats.Data(childComplexity), true case "NamedStats.name": - if e.complexity.NamedStats.Name == nil { + if e.ComplexityRoot.NamedStats.Name == nil { break } - return e.complexity.NamedStats.Name(childComplexity), true + return e.ComplexityRoot.NamedStats.Name(childComplexity), true case "NamedStatsWithScope.name": - if e.complexity.NamedStatsWithScope.Name == nil { + if e.ComplexityRoot.NamedStatsWithScope.Name == nil { break } - return e.complexity.NamedStatsWithScope.Name(childComplexity), true + return e.ComplexityRoot.NamedStatsWithScope.Name(childComplexity), true case "NamedStatsWithScope.scope": - if e.complexity.NamedStatsWithScope.Scope == nil { + if e.ComplexityRoot.NamedStatsWithScope.Scope == nil { break } - return e.complexity.NamedStatsWithScope.Scope(childComplexity), true + return e.ComplexityRoot.NamedStatsWithScope.Scope(childComplexity), true case "NamedStatsWithScope.stats": - if e.complexity.NamedStatsWithScope.Stats == nil { + if e.ComplexityRoot.NamedStatsWithScope.Stats == nil { break } - return e.complexity.NamedStatsWithScope.Stats(childComplexity), true + return e.ComplexityRoot.NamedStatsWithScope.Stats(childComplexity), true case "Node.cluster": - if e.complexity.Node.Cluster == nil { + if e.ComplexityRoot.Node.Cluster == nil { break } - return e.complexity.Node.Cluster(childComplexity), true + return e.ComplexityRoot.Node.Cluster(childComplexity), true case "Node.cpusAllocated": - if e.complexity.Node.CpusAllocated == nil { + if e.ComplexityRoot.Node.CpusAllocated == nil { break } - return e.complexity.Node.CpusAllocated(childComplexity), true + return e.ComplexityRoot.Node.CpusAllocated(childComplexity), true case "Node.gpusAllocated": - if e.complexity.Node.GpusAllocated == nil { + if e.ComplexityRoot.Node.GpusAllocated == nil { break } - return e.complexity.Node.GpusAllocated(childComplexity), true + return 
e.ComplexityRoot.Node.GpusAllocated(childComplexity), true + case "Node.healthData": + if e.ComplexityRoot.Node.HealthData == nil { + break + } + + return e.ComplexityRoot.Node.HealthData(childComplexity), true case "Node.healthState": - if e.complexity.Node.HealthState == nil { + if e.ComplexityRoot.Node.HealthState == nil { break } - return e.complexity.Node.HealthState(childComplexity), true + return e.ComplexityRoot.Node.HealthState(childComplexity), true case "Node.hostname": - if e.complexity.Node.Hostname == nil { + if e.ComplexityRoot.Node.Hostname == nil { break } - return e.complexity.Node.Hostname(childComplexity), true + return e.ComplexityRoot.Node.Hostname(childComplexity), true case "Node.id": - if e.complexity.Node.ID == nil { + if e.ComplexityRoot.Node.ID == nil { break } - return e.complexity.Node.ID(childComplexity), true + return e.ComplexityRoot.Node.ID(childComplexity), true case "Node.jobsRunning": - if e.complexity.Node.JobsRunning == nil { + if e.ComplexityRoot.Node.JobsRunning == nil { break } - return e.complexity.Node.JobsRunning(childComplexity), true + return e.ComplexityRoot.Node.JobsRunning(childComplexity), true case "Node.memoryAllocated": - if e.complexity.Node.MemoryAllocated == nil { + if e.ComplexityRoot.Node.MemoryAllocated == nil { break } - return e.complexity.Node.MemoryAllocated(childComplexity), true + return e.ComplexityRoot.Node.MemoryAllocated(childComplexity), true case "Node.metaData": - if e.complexity.Node.MetaData == nil { + if e.ComplexityRoot.Node.MetaData == nil { break } - return e.complexity.Node.MetaData(childComplexity), true + return e.ComplexityRoot.Node.MetaData(childComplexity), true case "Node.schedulerState": - if e.complexity.Node.SchedulerState == nil { + if e.ComplexityRoot.Node.SchedulerState == nil { break } - return e.complexity.Node.SchedulerState(childComplexity), true + return e.ComplexityRoot.Node.SchedulerState(childComplexity), true case "Node.subCluster": - if e.complexity.Node.SubCluster 
== nil { + if e.ComplexityRoot.Node.SubCluster == nil { break } - return e.complexity.Node.SubCluster(childComplexity), true + return e.ComplexityRoot.Node.SubCluster(childComplexity), true case "NodeMetrics.host": - if e.complexity.NodeMetrics.Host == nil { + if e.ComplexityRoot.NodeMetrics.Host == nil { break } - return e.complexity.NodeMetrics.Host(childComplexity), true - + return e.ComplexityRoot.NodeMetrics.Host(childComplexity), true case "NodeMetrics.metrics": - if e.complexity.NodeMetrics.Metrics == nil { + if e.ComplexityRoot.NodeMetrics.Metrics == nil { break } - return e.complexity.NodeMetrics.Metrics(childComplexity), true - + return e.ComplexityRoot.NodeMetrics.Metrics(childComplexity), true case "NodeMetrics.state": - if e.complexity.NodeMetrics.State == nil { + if e.ComplexityRoot.NodeMetrics.State == nil { break } - return e.complexity.NodeMetrics.State(childComplexity), true - + return e.ComplexityRoot.NodeMetrics.State(childComplexity), true case "NodeMetrics.subCluster": - if e.complexity.NodeMetrics.SubCluster == nil { + if e.ComplexityRoot.NodeMetrics.SubCluster == nil { break } - return e.complexity.NodeMetrics.SubCluster(childComplexity), true + return e.ComplexityRoot.NodeMetrics.SubCluster(childComplexity), true case "NodeStateResultList.count": - if e.complexity.NodeStateResultList.Count == nil { + if e.ComplexityRoot.NodeStateResultList.Count == nil { break } - return e.complexity.NodeStateResultList.Count(childComplexity), true + return e.ComplexityRoot.NodeStateResultList.Count(childComplexity), true case "NodeStateResultList.items": - if e.complexity.NodeStateResultList.Items == nil { + if e.ComplexityRoot.NodeStateResultList.Items == nil { break } - return e.complexity.NodeStateResultList.Items(childComplexity), true + return e.ComplexityRoot.NodeStateResultList.Items(childComplexity), true case "NodeStates.count": - if e.complexity.NodeStates.Count == nil { + if e.ComplexityRoot.NodeStates.Count == nil { break } - return 
e.complexity.NodeStates.Count(childComplexity), true + return e.ComplexityRoot.NodeStates.Count(childComplexity), true case "NodeStates.state": - if e.complexity.NodeStates.State == nil { + if e.ComplexityRoot.NodeStates.State == nil { break } - return e.complexity.NodeStates.State(childComplexity), true + return e.ComplexityRoot.NodeStates.State(childComplexity), true case "NodeStatesTimed.counts": - if e.complexity.NodeStatesTimed.Counts == nil { + if e.ComplexityRoot.NodeStatesTimed.Counts == nil { break } - return e.complexity.NodeStatesTimed.Counts(childComplexity), true - + return e.ComplexityRoot.NodeStatesTimed.Counts(childComplexity), true case "NodeStatesTimed.state": - if e.complexity.NodeStatesTimed.State == nil { + if e.ComplexityRoot.NodeStatesTimed.State == nil { break } - return e.complexity.NodeStatesTimed.State(childComplexity), true - + return e.ComplexityRoot.NodeStatesTimed.State(childComplexity), true case "NodeStatesTimed.times": - if e.complexity.NodeStatesTimed.Times == nil { + if e.ComplexityRoot.NodeStatesTimed.Times == nil { break } - return e.complexity.NodeStatesTimed.Times(childComplexity), true + return e.ComplexityRoot.NodeStatesTimed.Times(childComplexity), true case "NodesResultList.count": - if e.complexity.NodesResultList.Count == nil { + if e.ComplexityRoot.NodesResultList.Count == nil { break } - return e.complexity.NodesResultList.Count(childComplexity), true + return e.ComplexityRoot.NodesResultList.Count(childComplexity), true case "NodesResultList.hasNextPage": - if e.complexity.NodesResultList.HasNextPage == nil { + if e.ComplexityRoot.NodesResultList.HasNextPage == nil { break } - return e.complexity.NodesResultList.HasNextPage(childComplexity), true + return e.ComplexityRoot.NodesResultList.HasNextPage(childComplexity), true case "NodesResultList.items": - if e.complexity.NodesResultList.Items == nil { + if e.ComplexityRoot.NodesResultList.Items == nil { break } - return 
e.complexity.NodesResultList.Items(childComplexity), true + return e.ComplexityRoot.NodesResultList.Items(childComplexity), true case "NodesResultList.limit": - if e.complexity.NodesResultList.Limit == nil { + if e.ComplexityRoot.NodesResultList.Limit == nil { break } - return e.complexity.NodesResultList.Limit(childComplexity), true + return e.ComplexityRoot.NodesResultList.Limit(childComplexity), true case "NodesResultList.offset": - if e.complexity.NodesResultList.Offset == nil { + if e.ComplexityRoot.NodesResultList.Offset == nil { break } - return e.complexity.NodesResultList.Offset(childComplexity), true + return e.ComplexityRoot.NodesResultList.Offset(childComplexity), true case "NodesResultList.totalNodes": - if e.complexity.NodesResultList.TotalNodes == nil { + if e.ComplexityRoot.NodesResultList.TotalNodes == nil { break } - return e.complexity.NodesResultList.TotalNodes(childComplexity), true + return e.ComplexityRoot.NodesResultList.TotalNodes(childComplexity), true case "Query.allocatedNodes": - if e.complexity.Query.AllocatedNodes == nil { + if e.ComplexityRoot.Query.AllocatedNodes == nil { break } @@ -1570,21 +1612,33 @@ func (e *executableSchema) Complexity(ctx context.Context, typeName, field strin return 0, false } - return e.complexity.Query.AllocatedNodes(childComplexity, args["cluster"].(string)), true + return e.ComplexityRoot.Query.AllocatedNodes(childComplexity, args["cluster"].(string)), true + case "Query.clusterMetrics": + if e.ComplexityRoot.Query.ClusterMetrics == nil { + break + } + + args, err := ec.field_Query_clusterMetrics_args(ctx, rawArgs) + if err != nil { + return 0, false + } + + return e.ComplexityRoot.Query.ClusterMetrics(childComplexity, args["cluster"].(string), args["metrics"].([]string), args["from"].(time.Time), args["to"].(time.Time)), true case "Query.clusters": - if e.complexity.Query.Clusters == nil { + if e.ComplexityRoot.Query.Clusters == nil { break } - return e.complexity.Query.Clusters(childComplexity), true + 
return e.ComplexityRoot.Query.Clusters(childComplexity), true case "Query.globalMetrics": - if e.complexity.Query.GlobalMetrics == nil { + if e.ComplexityRoot.Query.GlobalMetrics == nil { break } - return e.complexity.Query.GlobalMetrics(childComplexity), true + return e.ComplexityRoot.Query.GlobalMetrics(childComplexity), true + case "Query.job": - if e.complexity.Query.Job == nil { + if e.ComplexityRoot.Query.Job == nil { break } @@ -1593,9 +1647,9 @@ func (e *executableSchema) Complexity(ctx context.Context, typeName, field strin return 0, false } - return e.complexity.Query.Job(childComplexity, args["id"].(string)), true + return e.ComplexityRoot.Query.Job(childComplexity, args["id"].(string)), true case "Query.jobMetrics": - if e.complexity.Query.JobMetrics == nil { + if e.ComplexityRoot.Query.JobMetrics == nil { break } @@ -1604,9 +1658,9 @@ func (e *executableSchema) Complexity(ctx context.Context, typeName, field strin return 0, false } - return e.complexity.Query.JobMetrics(childComplexity, args["id"].(string), args["metrics"].([]string), args["scopes"].([]schema.MetricScope), args["resolution"].(*int)), true + return e.ComplexityRoot.Query.JobMetrics(childComplexity, args["id"].(string), args["metrics"].([]string), args["scopes"].([]schema.MetricScope), args["resolution"].(*int)), true case "Query.jobStats": - if e.complexity.Query.JobStats == nil { + if e.ComplexityRoot.Query.JobStats == nil { break } @@ -1615,9 +1669,9 @@ func (e *executableSchema) Complexity(ctx context.Context, typeName, field strin return 0, false } - return e.complexity.Query.JobStats(childComplexity, args["id"].(string), args["metrics"].([]string)), true + return e.ComplexityRoot.Query.JobStats(childComplexity, args["id"].(string), args["metrics"].([]string)), true case "Query.jobs": - if e.complexity.Query.Jobs == nil { + if e.ComplexityRoot.Query.Jobs == nil { break } @@ -1626,9 +1680,9 @@ func (e *executableSchema) Complexity(ctx context.Context, typeName, field strin return 0, 
false } - return e.complexity.Query.Jobs(childComplexity, args["filter"].([]*model.JobFilter), args["page"].(*model.PageRequest), args["order"].(*model.OrderByInput)), true + return e.ComplexityRoot.Query.Jobs(childComplexity, args["filter"].([]*model.JobFilter), args["page"].(*model.PageRequest), args["order"].(*model.OrderByInput)), true case "Query.jobsFootprints": - if e.complexity.Query.JobsFootprints == nil { + if e.ComplexityRoot.Query.JobsFootprints == nil { break } @@ -1637,9 +1691,9 @@ func (e *executableSchema) Complexity(ctx context.Context, typeName, field strin return 0, false } - return e.complexity.Query.JobsFootprints(childComplexity, args["filter"].([]*model.JobFilter), args["metrics"].([]string)), true + return e.ComplexityRoot.Query.JobsFootprints(childComplexity, args["filter"].([]*model.JobFilter), args["metrics"].([]string)), true case "Query.jobsMetricStats": - if e.complexity.Query.JobsMetricStats == nil { + if e.ComplexityRoot.Query.JobsMetricStats == nil { break } @@ -1648,9 +1702,9 @@ func (e *executableSchema) Complexity(ctx context.Context, typeName, field strin return 0, false } - return e.complexity.Query.JobsMetricStats(childComplexity, args["filter"].([]*model.JobFilter), args["metrics"].([]string)), true + return e.ComplexityRoot.Query.JobsMetricStats(childComplexity, args["filter"].([]*model.JobFilter), args["metrics"].([]string)), true case "Query.jobsStatistics": - if e.complexity.Query.JobsStatistics == nil { + if e.ComplexityRoot.Query.JobsStatistics == nil { break } @@ -1659,9 +1713,9 @@ func (e *executableSchema) Complexity(ctx context.Context, typeName, field strin return 0, false } - return e.complexity.Query.JobsStatistics(childComplexity, args["filter"].([]*model.JobFilter), args["metrics"].([]string), args["page"].(*model.PageRequest), args["sortBy"].(*model.SortByAggregate), args["groupBy"].(*model.Aggregate), args["numDurationBins"].(*string), args["numMetricBins"].(*int)), true + return 
e.ComplexityRoot.Query.JobsStatistics(childComplexity, args["filter"].([]*model.JobFilter), args["metrics"].([]string), args["page"].(*model.PageRequest), args["sortBy"].(*model.SortByAggregate), args["groupBy"].(*model.Aggregate), args["numDurationBins"].(*string), args["numMetricBins"].(*int)), true case "Query.node": - if e.complexity.Query.Node == nil { + if e.ComplexityRoot.Query.Node == nil { break } @@ -1670,9 +1724,9 @@ func (e *executableSchema) Complexity(ctx context.Context, typeName, field strin return 0, false } - return e.complexity.Query.Node(childComplexity, args["id"].(string)), true + return e.ComplexityRoot.Query.Node(childComplexity, args["id"].(string)), true case "Query.nodeMetrics": - if e.complexity.Query.NodeMetrics == nil { + if e.ComplexityRoot.Query.NodeMetrics == nil { break } @@ -1681,9 +1735,9 @@ func (e *executableSchema) Complexity(ctx context.Context, typeName, field strin return 0, false } - return e.complexity.Query.NodeMetrics(childComplexity, args["cluster"].(string), args["nodes"].([]string), args["scopes"].([]schema.MetricScope), args["metrics"].([]string), args["from"].(time.Time), args["to"].(time.Time)), true + return e.ComplexityRoot.Query.NodeMetrics(childComplexity, args["cluster"].(string), args["nodes"].([]string), args["scopes"].([]schema.MetricScope), args["metrics"].([]string), args["from"].(time.Time), args["to"].(time.Time)), true case "Query.nodeMetricsList": - if e.complexity.Query.NodeMetricsList == nil { + if e.ComplexityRoot.Query.NodeMetricsList == nil { break } @@ -1692,10 +1746,9 @@ func (e *executableSchema) Complexity(ctx context.Context, typeName, field strin return 0, false } - return e.complexity.Query.NodeMetricsList(childComplexity, args["cluster"].(string), args["subCluster"].(string), args["stateFilter"].(string), args["nodeFilter"].(string), args["scopes"].([]schema.MetricScope), args["metrics"].([]string), args["from"].(time.Time), args["to"].(time.Time), args["page"].(*model.PageRequest), 
args["resolution"].(*int)), true - + return e.ComplexityRoot.Query.NodeMetricsList(childComplexity, args["cluster"].(string), args["subCluster"].(string), args["stateFilter"].(string), args["nodeFilter"].(string), args["scopes"].([]schema.MetricScope), args["metrics"].([]string), args["from"].(time.Time), args["to"].(time.Time), args["page"].(*model.PageRequest), args["resolution"].(*int)), true case "Query.nodeStates": - if e.complexity.Query.NodeStates == nil { + if e.ComplexityRoot.Query.NodeStates == nil { break } @@ -1704,9 +1757,9 @@ func (e *executableSchema) Complexity(ctx context.Context, typeName, field strin return 0, false } - return e.complexity.Query.NodeStates(childComplexity, args["filter"].([]*model.NodeFilter)), true + return e.ComplexityRoot.Query.NodeStates(childComplexity, args["filter"].([]*model.NodeFilter)), true case "Query.nodeStatesTimed": - if e.complexity.Query.NodeStatesTimed == nil { + if e.ComplexityRoot.Query.NodeStatesTimed == nil { break } @@ -1715,10 +1768,9 @@ func (e *executableSchema) Complexity(ctx context.Context, typeName, field strin return 0, false } - return e.complexity.Query.NodeStatesTimed(childComplexity, args["filter"].([]*model.NodeFilter), args["type"].(string)), true - + return e.ComplexityRoot.Query.NodeStatesTimed(childComplexity, args["filter"].([]*model.NodeFilter), args["type"].(string)), true case "Query.nodes": - if e.complexity.Query.Nodes == nil { + if e.ComplexityRoot.Query.Nodes == nil { break } @@ -1727,9 +1779,20 @@ func (e *executableSchema) Complexity(ctx context.Context, typeName, field strin return 0, false } - return e.complexity.Query.Nodes(childComplexity, args["filter"].([]*model.NodeFilter), args["order"].(*model.OrderByInput)), true + return e.ComplexityRoot.Query.Nodes(childComplexity, args["filter"].([]*model.NodeFilter), args["order"].(*model.OrderByInput)), true + case "Query.nodesWithMeta": + if e.ComplexityRoot.Query.NodesWithMeta == nil { + break + } + + args, err := 
ec.field_Query_nodesWithMeta_args(ctx, rawArgs) + if err != nil { + return 0, false + } + + return e.ComplexityRoot.Query.NodesWithMeta(childComplexity, args["filter"].([]*model.NodeFilter), args["order"].(*model.OrderByInput)), true case "Query.rooflineHeatmap": - if e.complexity.Query.RooflineHeatmap == nil { + if e.ComplexityRoot.Query.RooflineHeatmap == nil { break } @@ -1738,9 +1801,9 @@ func (e *executableSchema) Complexity(ctx context.Context, typeName, field strin return 0, false } - return e.complexity.Query.RooflineHeatmap(childComplexity, args["filter"].([]*model.JobFilter), args["rows"].(int), args["cols"].(int), args["minX"].(float64), args["minY"].(float64), args["maxX"].(float64), args["maxY"].(float64)), true + return e.ComplexityRoot.Query.RooflineHeatmap(childComplexity, args["filter"].([]*model.JobFilter), args["rows"].(int), args["cols"].(int), args["minX"].(float64), args["minY"].(float64), args["maxX"].(float64), args["maxY"].(float64)), true case "Query.scopedJobStats": - if e.complexity.Query.ScopedJobStats == nil { + if e.ComplexityRoot.Query.ScopedJobStats == nil { break } @@ -1749,15 +1812,15 @@ func (e *executableSchema) Complexity(ctx context.Context, typeName, field strin return 0, false } - return e.complexity.Query.ScopedJobStats(childComplexity, args["id"].(string), args["metrics"].([]string), args["scopes"].([]schema.MetricScope)), true + return e.ComplexityRoot.Query.ScopedJobStats(childComplexity, args["id"].(string), args["metrics"].([]string), args["scopes"].([]schema.MetricScope)), true case "Query.tags": - if e.complexity.Query.Tags == nil { + if e.ComplexityRoot.Query.Tags == nil { break } - return e.complexity.Query.Tags(childComplexity), true + return e.ComplexityRoot.Query.Tags(childComplexity), true case "Query.user": - if e.complexity.Query.User == nil { + if e.ComplexityRoot.Query.User == nil { break } @@ -1766,349 +1829,349 @@ func (e *executableSchema) Complexity(ctx context.Context, typeName, field strin return 0, 
false } - return e.complexity.Query.User(childComplexity, args["username"].(string)), true + return e.ComplexityRoot.Query.User(childComplexity, args["username"].(string)), true case "Resource.accelerators": - if e.complexity.Resource.Accelerators == nil { + if e.ComplexityRoot.Resource.Accelerators == nil { break } - return e.complexity.Resource.Accelerators(childComplexity), true + return e.ComplexityRoot.Resource.Accelerators(childComplexity), true case "Resource.configuration": - if e.complexity.Resource.Configuration == nil { + if e.ComplexityRoot.Resource.Configuration == nil { break } - return e.complexity.Resource.Configuration(childComplexity), true + return e.ComplexityRoot.Resource.Configuration(childComplexity), true case "Resource.hwthreads": - if e.complexity.Resource.HWThreads == nil { + if e.ComplexityRoot.Resource.HWThreads == nil { break } - return e.complexity.Resource.HWThreads(childComplexity), true + return e.ComplexityRoot.Resource.HWThreads(childComplexity), true case "Resource.hostname": - if e.complexity.Resource.Hostname == nil { + if e.ComplexityRoot.Resource.Hostname == nil { break } - return e.complexity.Resource.Hostname(childComplexity), true + return e.ComplexityRoot.Resource.Hostname(childComplexity), true case "ScopedStats.data": - if e.complexity.ScopedStats.Data == nil { + if e.ComplexityRoot.ScopedStats.Data == nil { break } - return e.complexity.ScopedStats.Data(childComplexity), true + return e.ComplexityRoot.ScopedStats.Data(childComplexity), true case "ScopedStats.hostname": - if e.complexity.ScopedStats.Hostname == nil { + if e.ComplexityRoot.ScopedStats.Hostname == nil { break } - return e.complexity.ScopedStats.Hostname(childComplexity), true + return e.ComplexityRoot.ScopedStats.Hostname(childComplexity), true case "ScopedStats.id": - if e.complexity.ScopedStats.ID == nil { + if e.ComplexityRoot.ScopedStats.ID == nil { break } - return e.complexity.ScopedStats.ID(childComplexity), true + return 
e.ComplexityRoot.ScopedStats.ID(childComplexity), true case "Series.data": - if e.complexity.Series.Data == nil { + if e.ComplexityRoot.Series.Data == nil { break } - return e.complexity.Series.Data(childComplexity), true + return e.ComplexityRoot.Series.Data(childComplexity), true case "Series.hostname": - if e.complexity.Series.Hostname == nil { + if e.ComplexityRoot.Series.Hostname == nil { break } - return e.complexity.Series.Hostname(childComplexity), true + return e.ComplexityRoot.Series.Hostname(childComplexity), true case "Series.id": - if e.complexity.Series.Id == nil { + if e.ComplexityRoot.Series.ID == nil { break } - return e.complexity.Series.Id(childComplexity), true + return e.ComplexityRoot.Series.ID(childComplexity), true case "Series.statistics": - if e.complexity.Series.Statistics == nil { + if e.ComplexityRoot.Series.Statistics == nil { break } - return e.complexity.Series.Statistics(childComplexity), true + return e.ComplexityRoot.Series.Statistics(childComplexity), true case "StatsSeries.max": - if e.complexity.StatsSeries.Max == nil { + if e.ComplexityRoot.StatsSeries.Max == nil { break } - return e.complexity.StatsSeries.Max(childComplexity), true + return e.ComplexityRoot.StatsSeries.Max(childComplexity), true case "StatsSeries.mean": - if e.complexity.StatsSeries.Mean == nil { + if e.ComplexityRoot.StatsSeries.Mean == nil { break } - return e.complexity.StatsSeries.Mean(childComplexity), true + return e.ComplexityRoot.StatsSeries.Mean(childComplexity), true case "StatsSeries.median": - if e.complexity.StatsSeries.Median == nil { + if e.ComplexityRoot.StatsSeries.Median == nil { break } - return e.complexity.StatsSeries.Median(childComplexity), true + return e.ComplexityRoot.StatsSeries.Median(childComplexity), true case "StatsSeries.min": - if e.complexity.StatsSeries.Min == nil { + if e.ComplexityRoot.StatsSeries.Min == nil { break } - return e.complexity.StatsSeries.Min(childComplexity), true + return 
e.ComplexityRoot.StatsSeries.Min(childComplexity), true case "SubCluster.coresPerSocket": - if e.complexity.SubCluster.CoresPerSocket == nil { + if e.ComplexityRoot.SubCluster.CoresPerSocket == nil { break } - return e.complexity.SubCluster.CoresPerSocket(childComplexity), true + return e.ComplexityRoot.SubCluster.CoresPerSocket(childComplexity), true case "SubCluster.flopRateScalar": - if e.complexity.SubCluster.FlopRateScalar == nil { + if e.ComplexityRoot.SubCluster.FlopRateScalar == nil { break } - return e.complexity.SubCluster.FlopRateScalar(childComplexity), true + return e.ComplexityRoot.SubCluster.FlopRateScalar(childComplexity), true case "SubCluster.flopRateSimd": - if e.complexity.SubCluster.FlopRateSimd == nil { + if e.ComplexityRoot.SubCluster.FlopRateSimd == nil { break } - return e.complexity.SubCluster.FlopRateSimd(childComplexity), true + return e.ComplexityRoot.SubCluster.FlopRateSimd(childComplexity), true case "SubCluster.footprint": - if e.complexity.SubCluster.Footprint == nil { + if e.ComplexityRoot.SubCluster.Footprint == nil { break } - return e.complexity.SubCluster.Footprint(childComplexity), true + return e.ComplexityRoot.SubCluster.Footprint(childComplexity), true case "SubCluster.memoryBandwidth": - if e.complexity.SubCluster.MemoryBandwidth == nil { + if e.ComplexityRoot.SubCluster.MemoryBandwidth == nil { break } - return e.complexity.SubCluster.MemoryBandwidth(childComplexity), true + return e.ComplexityRoot.SubCluster.MemoryBandwidth(childComplexity), true case "SubCluster.metricConfig": - if e.complexity.SubCluster.MetricConfig == nil { + if e.ComplexityRoot.SubCluster.MetricConfig == nil { break } - return e.complexity.SubCluster.MetricConfig(childComplexity), true + return e.ComplexityRoot.SubCluster.MetricConfig(childComplexity), true case "SubCluster.name": - if e.complexity.SubCluster.Name == nil { + if e.ComplexityRoot.SubCluster.Name == nil { break } - return e.complexity.SubCluster.Name(childComplexity), true + return 
e.ComplexityRoot.SubCluster.Name(childComplexity), true case "SubCluster.nodes": - if e.complexity.SubCluster.Nodes == nil { + if e.ComplexityRoot.SubCluster.Nodes == nil { break } - return e.complexity.SubCluster.Nodes(childComplexity), true + return e.ComplexityRoot.SubCluster.Nodes(childComplexity), true case "SubCluster.numberOfNodes": - if e.complexity.SubCluster.NumberOfNodes == nil { + if e.ComplexityRoot.SubCluster.NumberOfNodes == nil { break } - return e.complexity.SubCluster.NumberOfNodes(childComplexity), true + return e.ComplexityRoot.SubCluster.NumberOfNodes(childComplexity), true case "SubCluster.processorType": - if e.complexity.SubCluster.ProcessorType == nil { + if e.ComplexityRoot.SubCluster.ProcessorType == nil { break } - return e.complexity.SubCluster.ProcessorType(childComplexity), true + return e.ComplexityRoot.SubCluster.ProcessorType(childComplexity), true case "SubCluster.socketsPerNode": - if e.complexity.SubCluster.SocketsPerNode == nil { + if e.ComplexityRoot.SubCluster.SocketsPerNode == nil { break } - return e.complexity.SubCluster.SocketsPerNode(childComplexity), true + return e.ComplexityRoot.SubCluster.SocketsPerNode(childComplexity), true case "SubCluster.threadsPerCore": - if e.complexity.SubCluster.ThreadsPerCore == nil { + if e.ComplexityRoot.SubCluster.ThreadsPerCore == nil { break } - return e.complexity.SubCluster.ThreadsPerCore(childComplexity), true + return e.ComplexityRoot.SubCluster.ThreadsPerCore(childComplexity), true case "SubCluster.topology": - if e.complexity.SubCluster.Topology == nil { + if e.ComplexityRoot.SubCluster.Topology == nil { break } - return e.complexity.SubCluster.Topology(childComplexity), true + return e.ComplexityRoot.SubCluster.Topology(childComplexity), true case "SubClusterConfig.alert": - if e.complexity.SubClusterConfig.Alert == nil { + if e.ComplexityRoot.SubClusterConfig.Alert == nil { break } - return e.complexity.SubClusterConfig.Alert(childComplexity), true + return 
e.ComplexityRoot.SubClusterConfig.Alert(childComplexity), true case "SubClusterConfig.caution": - if e.complexity.SubClusterConfig.Caution == nil { + if e.ComplexityRoot.SubClusterConfig.Caution == nil { break } - return e.complexity.SubClusterConfig.Caution(childComplexity), true + return e.ComplexityRoot.SubClusterConfig.Caution(childComplexity), true case "SubClusterConfig.name": - if e.complexity.SubClusterConfig.Name == nil { + if e.ComplexityRoot.SubClusterConfig.Name == nil { break } - return e.complexity.SubClusterConfig.Name(childComplexity), true + return e.ComplexityRoot.SubClusterConfig.Name(childComplexity), true case "SubClusterConfig.normal": - if e.complexity.SubClusterConfig.Normal == nil { + if e.ComplexityRoot.SubClusterConfig.Normal == nil { break } - return e.complexity.SubClusterConfig.Normal(childComplexity), true + return e.ComplexityRoot.SubClusterConfig.Normal(childComplexity), true case "SubClusterConfig.peak": - if e.complexity.SubClusterConfig.Peak == nil { + if e.ComplexityRoot.SubClusterConfig.Peak == nil { break } - return e.complexity.SubClusterConfig.Peak(childComplexity), true + return e.ComplexityRoot.SubClusterConfig.Peak(childComplexity), true case "SubClusterConfig.remove": - if e.complexity.SubClusterConfig.Remove == nil { + if e.ComplexityRoot.SubClusterConfig.Remove == nil { break } - return e.complexity.SubClusterConfig.Remove(childComplexity), true + return e.ComplexityRoot.SubClusterConfig.Remove(childComplexity), true case "Tag.id": - if e.complexity.Tag.ID == nil { + if e.ComplexityRoot.Tag.ID == nil { break } - return e.complexity.Tag.ID(childComplexity), true + return e.ComplexityRoot.Tag.ID(childComplexity), true case "Tag.name": - if e.complexity.Tag.Name == nil { + if e.ComplexityRoot.Tag.Name == nil { break } - return e.complexity.Tag.Name(childComplexity), true + return e.ComplexityRoot.Tag.Name(childComplexity), true case "Tag.scope": - if e.complexity.Tag.Scope == nil { + if e.ComplexityRoot.Tag.Scope == nil { 
break } - return e.complexity.Tag.Scope(childComplexity), true + return e.ComplexityRoot.Tag.Scope(childComplexity), true case "Tag.type": - if e.complexity.Tag.Type == nil { + if e.ComplexityRoot.Tag.Type == nil { break } - return e.complexity.Tag.Type(childComplexity), true + return e.ComplexityRoot.Tag.Type(childComplexity), true case "TimeRangeOutput.from": - if e.complexity.TimeRangeOutput.From == nil { + if e.ComplexityRoot.TimeRangeOutput.From == nil { break } - return e.complexity.TimeRangeOutput.From(childComplexity), true + return e.ComplexityRoot.TimeRangeOutput.From(childComplexity), true case "TimeRangeOutput.range": - if e.complexity.TimeRangeOutput.Range == nil { + if e.ComplexityRoot.TimeRangeOutput.Range == nil { break } - return e.complexity.TimeRangeOutput.Range(childComplexity), true + return e.ComplexityRoot.TimeRangeOutput.Range(childComplexity), true case "TimeRangeOutput.to": - if e.complexity.TimeRangeOutput.To == nil { + if e.ComplexityRoot.TimeRangeOutput.To == nil { break } - return e.complexity.TimeRangeOutput.To(childComplexity), true + return e.ComplexityRoot.TimeRangeOutput.To(childComplexity), true case "TimeWeights.accHours": - if e.complexity.TimeWeights.AccHours == nil { + if e.ComplexityRoot.TimeWeights.AccHours == nil { break } - return e.complexity.TimeWeights.AccHours(childComplexity), true + return e.ComplexityRoot.TimeWeights.AccHours(childComplexity), true case "TimeWeights.coreHours": - if e.complexity.TimeWeights.CoreHours == nil { + if e.ComplexityRoot.TimeWeights.CoreHours == nil { break } - return e.complexity.TimeWeights.CoreHours(childComplexity), true + return e.ComplexityRoot.TimeWeights.CoreHours(childComplexity), true case "TimeWeights.nodeHours": - if e.complexity.TimeWeights.NodeHours == nil { + if e.ComplexityRoot.TimeWeights.NodeHours == nil { break } - return e.complexity.TimeWeights.NodeHours(childComplexity), true + return e.ComplexityRoot.TimeWeights.NodeHours(childComplexity), true case 
"Topology.accelerators": - if e.complexity.Topology.Accelerators == nil { + if e.ComplexityRoot.Topology.Accelerators == nil { break } - return e.complexity.Topology.Accelerators(childComplexity), true + return e.ComplexityRoot.Topology.Accelerators(childComplexity), true case "Topology.core": - if e.complexity.Topology.Core == nil { + if e.ComplexityRoot.Topology.Core == nil { break } - return e.complexity.Topology.Core(childComplexity), true + return e.ComplexityRoot.Topology.Core(childComplexity), true case "Topology.die": - if e.complexity.Topology.Die == nil { + if e.ComplexityRoot.Topology.Die == nil { break } - return e.complexity.Topology.Die(childComplexity), true + return e.ComplexityRoot.Topology.Die(childComplexity), true case "Topology.memoryDomain": - if e.complexity.Topology.MemoryDomain == nil { + if e.ComplexityRoot.Topology.MemoryDomain == nil { break } - return e.complexity.Topology.MemoryDomain(childComplexity), true + return e.ComplexityRoot.Topology.MemoryDomain(childComplexity), true case "Topology.node": - if e.complexity.Topology.Node == nil { + if e.ComplexityRoot.Topology.Node == nil { break } - return e.complexity.Topology.Node(childComplexity), true + return e.ComplexityRoot.Topology.Node(childComplexity), true case "Topology.socket": - if e.complexity.Topology.Socket == nil { + if e.ComplexityRoot.Topology.Socket == nil { break } - return e.complexity.Topology.Socket(childComplexity), true + return e.ComplexityRoot.Topology.Socket(childComplexity), true case "Unit.base": - if e.complexity.Unit.Base == nil { + if e.ComplexityRoot.Unit.Base == nil { break } - return e.complexity.Unit.Base(childComplexity), true + return e.ComplexityRoot.Unit.Base(childComplexity), true case "Unit.prefix": - if e.complexity.Unit.Prefix == nil { + if e.ComplexityRoot.Unit.Prefix == nil { break } - return e.complexity.Unit.Prefix(childComplexity), true + return e.ComplexityRoot.Unit.Prefix(childComplexity), true case "User.email": - if 
e.complexity.User.Email == nil { + if e.ComplexityRoot.User.Email == nil { break } - return e.complexity.User.Email(childComplexity), true + return e.ComplexityRoot.User.Email(childComplexity), true case "User.name": - if e.complexity.User.Name == nil { + if e.ComplexityRoot.User.Name == nil { break } - return e.complexity.User.Name(childComplexity), true + return e.ComplexityRoot.User.Name(childComplexity), true case "User.username": - if e.complexity.User.Username == nil { + if e.ComplexityRoot.User.Username == nil { break } - return e.complexity.User.Username(childComplexity), true + return e.ComplexityRoot.User.Username(childComplexity), true } return 0, false @@ -2116,7 +2179,7 @@ func (e *executableSchema) Complexity(ctx context.Context, typeName, field strin func (e *executableSchema) Exec(ctx context.Context) graphql.ResponseHandler { opCtx := graphql.GetOperationContext(ctx) - ec := executionContext{opCtx, e, 0, 0, make(chan graphql.DeferredResult)} + ec := newExecutionContext(opCtx, e, make(chan graphql.DeferredResult)) inputUnmarshalMap := graphql.BuildUnmarshalerMap( ec.unmarshalInputFloatRange, ec.unmarshalInputIntRange, @@ -2140,9 +2203,9 @@ func (e *executableSchema) Exec(ctx context.Context) graphql.ResponseHandler { ctx = graphql.WithUnmarshalerMap(ctx, inputUnmarshalMap) data = ec._Query(ctx, opCtx.Operation.SelectionSet) } else { - if atomic.LoadInt32(&ec.pendingDeferred) > 0 { - result := <-ec.deferredResults - atomic.AddInt32(&ec.pendingDeferred, -1) + if atomic.LoadInt32(&ec.PendingDeferred) > 0 { + result := <-ec.DeferredResults + atomic.AddInt32(&ec.PendingDeferred, -1) data = result.Result response.Path = result.Path response.Label = result.Label @@ -2154,8 +2217,8 @@ func (e *executableSchema) Exec(ctx context.Context) graphql.ResponseHandler { var buf bytes.Buffer data.MarshalGQL(&buf) response.Data = buf.Bytes() - if atomic.LoadInt32(&ec.deferred) > 0 { - hasNext := atomic.LoadInt32(&ec.pendingDeferred) > 0 + if 
atomic.LoadInt32(&ec.Deferred) > 0 { + hasNext := atomic.LoadInt32(&ec.PendingDeferred) > 0 response.HasNext = &hasNext } @@ -2183,44 +2246,22 @@ func (e *executableSchema) Exec(ctx context.Context) graphql.ResponseHandler { } type executionContext struct { - *graphql.OperationContext - *executableSchema - deferred int32 - pendingDeferred int32 - deferredResults chan graphql.DeferredResult + *graphql.ExecutionContextState[ResolverRoot, DirectiveRoot, ComplexityRoot] } -func (ec *executionContext) processDeferredGroup(dg graphql.DeferredGroup) { - atomic.AddInt32(&ec.pendingDeferred, 1) - go func() { - ctx := graphql.WithFreshResponseContext(dg.Context) - dg.FieldSet.Dispatch(ctx) - ds := graphql.DeferredResult{ - Path: dg.Path, - Label: dg.Label, - Result: dg.FieldSet, - Errors: graphql.GetErrors(ctx), - } - // null fields should bubble up - if dg.FieldSet.Invalids > 0 { - ds.Result = graphql.Null - } - ec.deferredResults <- ds - }() -} - -func (ec *executionContext) introspectSchema() (*introspection.Schema, error) { - if ec.DisableIntrospection { - return nil, errors.New("introspection disabled") +func newExecutionContext( + opCtx *graphql.OperationContext, + execSchema *executableSchema, + deferredResults chan graphql.DeferredResult, +) executionContext { + return executionContext{ + ExecutionContextState: graphql.NewExecutionContextState[ResolverRoot, DirectiveRoot, ComplexityRoot]( + opCtx, + (*graphql.ExecutableSchemaState[ResolverRoot, DirectiveRoot, ComplexityRoot])(execSchema), + parsedSchema, + deferredResults, + ), } - return introspection.WrapSchema(ec.Schema()), nil -} - -func (ec *executionContext) introspectType(name string) (*introspection.Type, error) { - if ec.DisableIntrospection { - return nil, errors.New("introspection disabled") - } - return introspection.WrapTypeFromDef(ec.Schema(), ec.Schema().Types[name]), nil } var sources = []*ast.Source{ @@ -2245,6 +2286,7 @@ type Node { schedulerState: SchedulerState! healthState: MonitoringState! 
metaData: Any + healthData: Any } type NodeStates { @@ -2390,6 +2432,13 @@ type JobMetricWithName { metric: JobMetric! } +type ClusterMetricWithName { + name: String! + unit: Unit + timestep: Int! + data: [NullableFloat!]! +} + type JobMetric { unit: Unit timestep: Int! @@ -2493,6 +2542,11 @@ type NodeMetrics { metrics: [JobMetricWithName!]! } +type ClusterMetrics { + nodeCount: Int! + metrics: [ClusterMetricWithName!]! +} + type NodesResultList { items: [NodeMetrics!]! offset: Int @@ -2542,6 +2596,7 @@ type Query { ## Node Queries New node(id: ID!): Node nodes(filter: [NodeFilter!], order: OrderByInput): NodeStateResultList! + nodesWithMeta(filter: [NodeFilter!], order: OrderByInput): NodeStateResultList! nodeStates(filter: [NodeFilter!]): [NodeStates!]! nodeStatesTimed(filter: [NodeFilter!], type: String!): [NodeStatesTimed!]! @@ -2611,6 +2666,13 @@ type Query { page: PageRequest resolution: Int ): NodesResultList! + + clusterMetrics( + cluster: String! + metrics: [String!] + from: Time! + to: Time! + ): ClusterMetrics! } type Mutation { @@ -2636,7 +2698,7 @@ type TimeRangeOutput { input NodeFilter { hostname: StringInput cluster: StringInput - subcluster: StringInput + subCluster: StringInput schedulerState: SchedulerState healthState: MonitoringState timeStart: Int @@ -2651,6 +2713,7 @@ input JobFilter { project: StringInput jobName: StringInput cluster: StringInput + subCluster: StringInput partition: StringInput duration: IntRange energy: FloatRange @@ -2665,6 +2728,7 @@ input JobFilter { state: [JobState!] metricStats: [MetricStatItem!] 
shared: String + schedule: String node: StringInput } @@ -2887,6 +2951,32 @@ func (ec *executionContext) field_Query_allocatedNodes_args(ctx context.Context, return args, nil } +func (ec *executionContext) field_Query_clusterMetrics_args(ctx context.Context, rawArgs map[string]any) (map[string]any, error) { + var err error + args := map[string]any{} + arg0, err := graphql.ProcessArgField(ctx, rawArgs, "cluster", ec.unmarshalNString2string) + if err != nil { + return nil, err + } + args["cluster"] = arg0 + arg1, err := graphql.ProcessArgField(ctx, rawArgs, "metrics", ec.unmarshalOString2ᚕstringᚄ) + if err != nil { + return nil, err + } + args["metrics"] = arg1 + arg2, err := graphql.ProcessArgField(ctx, rawArgs, "from", ec.unmarshalNTime2timeᚐTime) + if err != nil { + return nil, err + } + args["from"] = arg2 + arg3, err := graphql.ProcessArgField(ctx, rawArgs, "to", ec.unmarshalNTime2timeᚐTime) + if err != nil { + return nil, err + } + args["to"] = arg3 + return args, nil +} + func (ec *executionContext) field_Query_jobMetrics_args(ctx context.Context, rawArgs map[string]any) (map[string]any, error) { var err error args := map[string]any{} @@ -2900,7 +2990,7 @@ func (ec *executionContext) field_Query_jobMetrics_args(ctx context.Context, raw return nil, err } args["metrics"] = arg1 - arg2, err := graphql.ProcessArgField(ctx, rawArgs, "scopes", ec.unmarshalOMetricScope2ᚕgithubᚗcomᚋClusterCockpitᚋccᚑlibᚋschemaᚐMetricScopeᚄ) + arg2, err := graphql.ProcessArgField(ctx, rawArgs, "scopes", ec.unmarshalOMetricScope2ᚕgithubᚗcomᚋClusterCockpitᚋccᚑlibᚋv2ᚋschemaᚐMetricScopeᚄ) if err != nil { return nil, err } @@ -3057,7 +3147,7 @@ func (ec *executionContext) field_Query_nodeMetricsList_args(ctx context.Context return nil, err } args["nodeFilter"] = arg3 - arg4, err := graphql.ProcessArgField(ctx, rawArgs, "scopes", ec.unmarshalOMetricScope2ᚕgithubᚗcomᚋClusterCockpitᚋccᚑlibᚋschemaᚐMetricScopeᚄ) + arg4, err := graphql.ProcessArgField(ctx, rawArgs, "scopes", 
ec.unmarshalOMetricScope2ᚕgithubᚗcomᚋClusterCockpitᚋccᚑlibᚋv2ᚋschemaᚐMetricScopeᚄ) if err != nil { return nil, err } @@ -3103,7 +3193,7 @@ func (ec *executionContext) field_Query_nodeMetrics_args(ctx context.Context, ra return nil, err } args["nodes"] = arg1 - arg2, err := graphql.ProcessArgField(ctx, rawArgs, "scopes", ec.unmarshalOMetricScope2ᚕgithubᚗcomᚋClusterCockpitᚋccᚑlibᚋschemaᚐMetricScopeᚄ) + arg2, err := graphql.ProcessArgField(ctx, rawArgs, "scopes", ec.unmarshalOMetricScope2ᚕgithubᚗcomᚋClusterCockpitᚋccᚑlibᚋv2ᚋschemaᚐMetricScopeᚄ) if err != nil { return nil, err } @@ -3164,6 +3254,22 @@ func (ec *executionContext) field_Query_node_args(ctx context.Context, rawArgs m return args, nil } +func (ec *executionContext) field_Query_nodesWithMeta_args(ctx context.Context, rawArgs map[string]any) (map[string]any, error) { + var err error + args := map[string]any{} + arg0, err := graphql.ProcessArgField(ctx, rawArgs, "filter", ec.unmarshalONodeFilter2ᚕᚖgithubᚗcomᚋClusterCockpitᚋccᚑbackendᚋinternalᚋgraphᚋmodelᚐNodeFilterᚄ) + if err != nil { + return nil, err + } + args["filter"] = arg0 + arg1, err := graphql.ProcessArgField(ctx, rawArgs, "order", ec.unmarshalOOrderByInput2ᚖgithubᚗcomᚋClusterCockpitᚋccᚑbackendᚋinternalᚋgraphᚋmodelᚐOrderByInput) + if err != nil { + return nil, err + } + args["order"] = arg1 + return args, nil +} + func (ec *executionContext) field_Query_nodes_args(ctx context.Context, rawArgs map[string]any) (map[string]any, error) { var err error args := map[string]any{} @@ -3234,7 +3340,7 @@ func (ec *executionContext) field_Query_scopedJobStats_args(ctx context.Context, return nil, err } args["metrics"] = arg1 - arg2, err := graphql.ProcessArgField(ctx, rawArgs, "scopes", ec.unmarshalOMetricScope2ᚕgithubᚗcomᚋClusterCockpitᚋccᚑlibᚋschemaᚐMetricScopeᚄ) + arg2, err := graphql.ProcessArgField(ctx, rawArgs, "scopes", ec.unmarshalOMetricScope2ᚕgithubᚗcomᚋClusterCockpitᚋccᚑlibᚋv2ᚋschemaᚐMetricScopeᚄ) if err != nil { return nil, err } @@ -3428,7 +3534,7 
@@ func (ec *executionContext) _Cluster_partitions(ctx context.Context, field graph field, ec.fieldContext_Cluster_partitions, func(ctx context.Context) (any, error) { - return ec.resolvers.Cluster().Partitions(ctx, obj) + return ec.Resolvers.Cluster().Partitions(ctx, obj) }, nil, ec.marshalNString2ᚕstringᚄ, @@ -3460,7 +3566,7 @@ func (ec *executionContext) _Cluster_subClusters(ctx context.Context, field grap return obj.SubClusters, nil }, nil, - ec.marshalNSubCluster2ᚕᚖgithubᚗcomᚋClusterCockpitᚋccᚑlibᚋschemaᚐSubClusterᚄ, + ec.marshalNSubCluster2ᚕᚖgithubᚗcomᚋClusterCockpitᚋccᚑlibᚋv2ᚋschemaᚐSubClusterᚄ, true, true, ) @@ -3507,6 +3613,196 @@ func (ec *executionContext) fieldContext_Cluster_subClusters(_ context.Context, return fc, nil } +func (ec *executionContext) _ClusterMetricWithName_name(ctx context.Context, field graphql.CollectedField, obj *model.ClusterMetricWithName) (ret graphql.Marshaler) { + return graphql.ResolveField( + ctx, + ec.OperationContext, + field, + ec.fieldContext_ClusterMetricWithName_name, + func(ctx context.Context) (any, error) { + return obj.Name, nil + }, + nil, + ec.marshalNString2string, + true, + true, + ) +} + +func (ec *executionContext) fieldContext_ClusterMetricWithName_name(_ context.Context, field graphql.CollectedField) (fc *graphql.FieldContext, err error) { + fc = &graphql.FieldContext{ + Object: "ClusterMetricWithName", + Field: field, + IsMethod: false, + IsResolver: false, + Child: func(ctx context.Context, field graphql.CollectedField) (*graphql.FieldContext, error) { + return nil, errors.New("field of type String does not have child fields") + }, + } + return fc, nil +} + +func (ec *executionContext) _ClusterMetricWithName_unit(ctx context.Context, field graphql.CollectedField, obj *model.ClusterMetricWithName) (ret graphql.Marshaler) { + return graphql.ResolveField( + ctx, + ec.OperationContext, + field, + ec.fieldContext_ClusterMetricWithName_unit, + func(ctx context.Context) (any, error) { + return obj.Unit, nil + }, 
+ nil, + ec.marshalOUnit2ᚖgithubᚗcomᚋClusterCockpitᚋccᚑlibᚋv2ᚋschemaᚐUnit, + true, + false, + ) +} + +func (ec *executionContext) fieldContext_ClusterMetricWithName_unit(_ context.Context, field graphql.CollectedField) (fc *graphql.FieldContext, err error) { + fc = &graphql.FieldContext{ + Object: "ClusterMetricWithName", + Field: field, + IsMethod: false, + IsResolver: false, + Child: func(ctx context.Context, field graphql.CollectedField) (*graphql.FieldContext, error) { + switch field.Name { + case "base": + return ec.fieldContext_Unit_base(ctx, field) + case "prefix": + return ec.fieldContext_Unit_prefix(ctx, field) + } + return nil, fmt.Errorf("no field named %q was found under type Unit", field.Name) + }, + } + return fc, nil +} + +func (ec *executionContext) _ClusterMetricWithName_timestep(ctx context.Context, field graphql.CollectedField, obj *model.ClusterMetricWithName) (ret graphql.Marshaler) { + return graphql.ResolveField( + ctx, + ec.OperationContext, + field, + ec.fieldContext_ClusterMetricWithName_timestep, + func(ctx context.Context) (any, error) { + return obj.Timestep, nil + }, + nil, + ec.marshalNInt2int, + true, + true, + ) +} + +func (ec *executionContext) fieldContext_ClusterMetricWithName_timestep(_ context.Context, field graphql.CollectedField) (fc *graphql.FieldContext, err error) { + fc = &graphql.FieldContext{ + Object: "ClusterMetricWithName", + Field: field, + IsMethod: false, + IsResolver: false, + Child: func(ctx context.Context, field graphql.CollectedField) (*graphql.FieldContext, error) { + return nil, errors.New("field of type Int does not have child fields") + }, + } + return fc, nil +} + +func (ec *executionContext) _ClusterMetricWithName_data(ctx context.Context, field graphql.CollectedField, obj *model.ClusterMetricWithName) (ret graphql.Marshaler) { + return graphql.ResolveField( + ctx, + ec.OperationContext, + field, + ec.fieldContext_ClusterMetricWithName_data, + func(ctx context.Context) (any, error) { + return obj.Data, 
nil + }, + nil, + ec.marshalNNullableFloat2ᚕgithubᚗcomᚋClusterCockpitᚋccᚑlibᚋv2ᚋschemaᚐFloatᚄ, + true, + true, + ) +} + +func (ec *executionContext) fieldContext_ClusterMetricWithName_data(_ context.Context, field graphql.CollectedField) (fc *graphql.FieldContext, err error) { + fc = &graphql.FieldContext{ + Object: "ClusterMetricWithName", + Field: field, + IsMethod: false, + IsResolver: false, + Child: func(ctx context.Context, field graphql.CollectedField) (*graphql.FieldContext, error) { + return nil, errors.New("field of type NullableFloat does not have child fields") + }, + } + return fc, nil +} + +func (ec *executionContext) _ClusterMetrics_nodeCount(ctx context.Context, field graphql.CollectedField, obj *model.ClusterMetrics) (ret graphql.Marshaler) { + return graphql.ResolveField( + ctx, + ec.OperationContext, + field, + ec.fieldContext_ClusterMetrics_nodeCount, + func(ctx context.Context) (any, error) { + return obj.NodeCount, nil + }, + nil, + ec.marshalNInt2int, + true, + true, + ) +} + +func (ec *executionContext) fieldContext_ClusterMetrics_nodeCount(_ context.Context, field graphql.CollectedField) (fc *graphql.FieldContext, err error) { + fc = &graphql.FieldContext{ + Object: "ClusterMetrics", + Field: field, + IsMethod: false, + IsResolver: false, + Child: func(ctx context.Context, field graphql.CollectedField) (*graphql.FieldContext, error) { + return nil, errors.New("field of type Int does not have child fields") + }, + } + return fc, nil +} + +func (ec *executionContext) _ClusterMetrics_metrics(ctx context.Context, field graphql.CollectedField, obj *model.ClusterMetrics) (ret graphql.Marshaler) { + return graphql.ResolveField( + ctx, + ec.OperationContext, + field, + ec.fieldContext_ClusterMetrics_metrics, + func(ctx context.Context) (any, error) { + return obj.Metrics, nil + }, + nil, + ec.marshalNClusterMetricWithName2ᚕᚖgithubᚗcomᚋClusterCockpitᚋccᚑbackendᚋinternalᚋgraphᚋmodelᚐClusterMetricWithNameᚄ, + true, + true, + ) +} + +func (ec 
*executionContext) fieldContext_ClusterMetrics_metrics(_ context.Context, field graphql.CollectedField) (fc *graphql.FieldContext, err error) { + fc = &graphql.FieldContext{ + Object: "ClusterMetrics", + Field: field, + IsMethod: false, + IsResolver: false, + Child: func(ctx context.Context, field graphql.CollectedField) (*graphql.FieldContext, error) { + switch field.Name { + case "name": + return ec.fieldContext_ClusterMetricWithName_name(ctx, field) + case "unit": + return ec.fieldContext_ClusterMetricWithName_unit(ctx, field) + case "timestep": + return ec.fieldContext_ClusterMetricWithName_timestep(ctx, field) + case "data": + return ec.fieldContext_ClusterMetricWithName_data(ctx, field) + } + return nil, fmt.Errorf("no field named %q was found under type ClusterMetricWithName", field.Name) + }, + } + return fc, nil +} + func (ec *executionContext) _ClusterSupport_cluster(ctx context.Context, field graphql.CollectedField, obj *schema.ClusterSupport) (ret graphql.Marshaler) { return graphql.ResolveField( ctx, @@ -3908,7 +4204,7 @@ func (ec *executionContext) _GlobalMetricListItem_unit(ctx context.Context, fiel return obj.Unit, nil }, nil, - ec.marshalNUnit2githubᚗcomᚋClusterCockpitᚋccᚑlibᚋschemaᚐUnit, + ec.marshalNUnit2githubᚗcomᚋClusterCockpitᚋccᚑlibᚋv2ᚋschemaᚐUnit, true, true, ) @@ -3943,7 +4239,7 @@ func (ec *executionContext) _GlobalMetricListItem_scope(ctx context.Context, fie return obj.Scope, nil }, nil, - ec.marshalNMetricScope2githubᚗcomᚋClusterCockpitᚋccᚑlibᚋschemaᚐMetricScope, + ec.marshalNMetricScope2githubᚗcomᚋClusterCockpitᚋccᚑlibᚋv2ᚋschemaᚐMetricScope, true, true, ) @@ -4001,7 +4297,7 @@ func (ec *executionContext) _GlobalMetricListItem_availability(ctx context.Conte return obj.Availability, nil }, nil, - ec.marshalNClusterSupport2ᚕgithubᚗcomᚋClusterCockpitᚋccᚑlibᚋschemaᚐClusterSupportᚄ, + ec.marshalNClusterSupport2ᚕgithubᚗcomᚋClusterCockpitᚋccᚑlibᚋv2ᚋschemaᚐClusterSupportᚄ, true, true, ) @@ -4323,7 +4619,7 @@ func (ec *executionContext) 
_Job_startTime(ctx context.Context, field graphql.Co field, ec.fieldContext_Job_startTime, func(ctx context.Context) (any, error) { - return ec.resolvers.Job().StartTime(ctx, obj) + return ec.Resolvers.Job().StartTime(ctx, obj) }, nil, ec.marshalNTime2ᚖtimeᚐTime, @@ -4674,7 +4970,7 @@ func (ec *executionContext) _Job_state(ctx context.Context, field graphql.Collec return obj.State, nil }, nil, - ec.marshalNJobState2githubᚗcomᚋClusterCockpitᚋccᚑlibᚋschemaᚐJobState, + ec.marshalNJobState2githubᚗcomᚋClusterCockpitᚋccᚑlibᚋv2ᚋschemaᚐJobState, true, true, ) @@ -4700,10 +4996,10 @@ func (ec *executionContext) _Job_tags(ctx context.Context, field graphql.Collect field, ec.fieldContext_Job_tags, func(ctx context.Context) (any, error) { - return ec.resolvers.Job().Tags(ctx, obj) + return ec.Resolvers.Job().Tags(ctx, obj) }, nil, - ec.marshalNTag2ᚕᚖgithubᚗcomᚋClusterCockpitᚋccᚑlibᚋschemaᚐTagᚄ, + ec.marshalNTag2ᚕᚖgithubᚗcomᚋClusterCockpitᚋccᚑlibᚋv2ᚋschemaᚐTagᚄ, true, true, ) @@ -4742,7 +5038,7 @@ func (ec *executionContext) _Job_resources(ctx context.Context, field graphql.Co return obj.Resources, nil }, nil, - ec.marshalNResource2ᚕᚖgithubᚗcomᚋClusterCockpitᚋccᚑlibᚋschemaᚐResourceᚄ, + ec.marshalNResource2ᚕᚖgithubᚗcomᚋClusterCockpitᚋccᚑlibᚋv2ᚋschemaᚐResourceᚄ, true, true, ) @@ -4778,7 +5074,7 @@ func (ec *executionContext) _Job_concurrentJobs(ctx context.Context, field graph field, ec.fieldContext_Job_concurrentJobs, func(ctx context.Context) (any, error) { - return ec.resolvers.Job().ConcurrentJobs(ctx, obj) + return ec.Resolvers.Job().ConcurrentJobs(ctx, obj) }, nil, ec.marshalOJobLinkResultList2ᚖgithubᚗcomᚋClusterCockpitᚋccᚑbackendᚋinternalᚋgraphᚋmodelᚐJobLinkResultList, @@ -4815,7 +5111,7 @@ func (ec *executionContext) _Job_footprint(ctx context.Context, field graphql.Co field, ec.fieldContext_Job_footprint, func(ctx context.Context) (any, error) { - return ec.resolvers.Job().Footprint(ctx, obj) + return ec.Resolvers.Job().Footprint(ctx, obj) }, nil, 
ec.marshalOFootprintValue2ᚕᚖgithubᚗcomᚋClusterCockpitᚋccᚑbackendᚋinternalᚋgraphᚋmodelᚐFootprintValue, @@ -4852,7 +5148,7 @@ func (ec *executionContext) _Job_energyFootprint(ctx context.Context, field grap field, ec.fieldContext_Job_energyFootprint, func(ctx context.Context) (any, error) { - return ec.resolvers.Job().EnergyFootprint(ctx, obj) + return ec.Resolvers.Job().EnergyFootprint(ctx, obj) }, nil, ec.marshalOEnergyFootprintValue2ᚕᚖgithubᚗcomᚋClusterCockpitᚋccᚑbackendᚋinternalᚋgraphᚋmodelᚐEnergyFootprintValue, @@ -4889,7 +5185,7 @@ func (ec *executionContext) _Job_metaData(ctx context.Context, field graphql.Col field, ec.fieldContext_Job_metaData, func(ctx context.Context) (any, error) { - return ec.resolvers.Job().MetaData(ctx, obj) + return ec.Resolvers.Job().MetaData(ctx, obj) }, nil, ec.marshalOAny2interface, @@ -4918,7 +5214,7 @@ func (ec *executionContext) _Job_userData(ctx context.Context, field graphql.Col field, ec.fieldContext_Job_userData, func(ctx context.Context) (any, error) { - return ec.resolvers.Job().UserData(ctx, obj) + return ec.Resolvers.Job().UserData(ctx, obj) }, nil, ec.marshalOUser2ᚖgithubᚗcomᚋClusterCockpitᚋccᚑbackendᚋinternalᚋgraphᚋmodelᚐUser, @@ -5109,7 +5405,7 @@ func (ec *executionContext) _JobMetric_unit(ctx context.Context, field graphql.C return obj.Unit, nil }, nil, - ec.marshalOUnit2githubᚗcomᚋClusterCockpitᚋccᚑlibᚋschemaᚐUnit, + ec.marshalOUnit2githubᚗcomᚋClusterCockpitᚋccᚑlibᚋv2ᚋschemaᚐUnit, true, false, ) @@ -5173,7 +5469,7 @@ func (ec *executionContext) _JobMetric_series(ctx context.Context, field graphql return obj.Series, nil }, nil, - ec.marshalOSeries2ᚕgithubᚗcomᚋClusterCockpitᚋccᚑlibᚋschemaᚐSeriesᚄ, + ec.marshalOSeries2ᚕgithubᚗcomᚋClusterCockpitᚋccᚑlibᚋv2ᚋschemaᚐSeriesᚄ, true, false, ) @@ -5212,7 +5508,7 @@ func (ec *executionContext) _JobMetric_statisticsSeries(ctx context.Context, fie return obj.StatisticsSeries, nil }, nil, - ec.marshalOStatsSeries2ᚖgithubᚗcomᚋClusterCockpitᚋccᚑlibᚋschemaᚐStatsSeries, + 
ec.marshalOStatsSeries2ᚖgithubᚗcomᚋClusterCockpitᚋccᚑlibᚋv2ᚋschemaᚐStatsSeries, true, false, ) @@ -5280,7 +5576,7 @@ func (ec *executionContext) _JobMetricWithName_scope(ctx context.Context, field return obj.Scope, nil }, nil, - ec.marshalNMetricScope2githubᚗcomᚋClusterCockpitᚋccᚑlibᚋschemaᚐMetricScope, + ec.marshalNMetricScope2githubᚗcomᚋClusterCockpitᚋccᚑlibᚋv2ᚋschemaᚐMetricScope, true, true, ) @@ -5309,7 +5605,7 @@ func (ec *executionContext) _JobMetricWithName_metric(ctx context.Context, field return obj.Metric, nil }, nil, - ec.marshalNJobMetric2ᚖgithubᚗcomᚋClusterCockpitᚋccᚑlibᚋschemaᚐJobMetric, + ec.marshalNJobMetric2ᚖgithubᚗcomᚋClusterCockpitᚋccᚑlibᚋv2ᚋschemaᚐJobMetric, true, true, ) @@ -5348,7 +5644,7 @@ func (ec *executionContext) _JobResultList_items(ctx context.Context, field grap return obj.Items, nil }, nil, - ec.marshalNJob2ᚕᚖgithubᚗcomᚋClusterCockpitᚋccᚑlibᚋschemaᚐJobᚄ, + ec.marshalNJob2ᚕᚖgithubᚗcomᚋClusterCockpitᚋccᚑlibᚋv2ᚋschemaᚐJobᚄ, true, true, ) @@ -6428,7 +6724,7 @@ func (ec *executionContext) _MetricConfig_unit(ctx context.Context, field graphq return obj.Unit, nil }, nil, - ec.marshalNUnit2githubᚗcomᚋClusterCockpitᚋccᚑlibᚋschemaᚐUnit, + ec.marshalNUnit2githubᚗcomᚋClusterCockpitᚋccᚑlibᚋv2ᚋschemaᚐUnit, true, true, ) @@ -6463,7 +6759,7 @@ func (ec *executionContext) _MetricConfig_scope(ctx context.Context, field graph return obj.Scope, nil }, nil, - ec.marshalNMetricScope2githubᚗcomᚋClusterCockpitᚋccᚑlibᚋschemaᚐMetricScope, + ec.marshalNMetricScope2githubᚗcomᚋClusterCockpitᚋccᚑlibᚋv2ᚋschemaᚐMetricScope, true, true, ) @@ -6695,7 +6991,7 @@ func (ec *executionContext) _MetricConfig_subClusters(ctx context.Context, field return obj.SubClusters, nil }, nil, - ec.marshalNSubClusterConfig2ᚕᚖgithubᚗcomᚋClusterCockpitᚋccᚑlibᚋschemaᚐSubClusterConfigᚄ, + ec.marshalNSubClusterConfig2ᚕᚖgithubᚗcomᚋClusterCockpitᚋccᚑlibᚋv2ᚋschemaᚐSubClusterConfigᚄ, true, true, ) @@ -6767,7 +7063,7 @@ func (ec *executionContext) _MetricFootprints_data(ctx context.Context, 
field gr return obj.Data, nil }, nil, - ec.marshalNNullableFloat2ᚕgithubᚗcomᚋClusterCockpitᚋccᚑlibᚋschemaᚐFloatᚄ, + ec.marshalNNullableFloat2ᚕgithubᚗcomᚋClusterCockpitᚋccᚑlibᚋv2ᚋschemaᚐFloatᚄ, true, true, ) @@ -7122,7 +7418,7 @@ func (ec *executionContext) _MetricValue_name(ctx context.Context, field graphql field, ec.fieldContext_MetricValue_name, func(ctx context.Context) (any, error) { - return ec.resolvers.MetricValue().Name(ctx, obj) + return ec.Resolvers.MetricValue().Name(ctx, obj) }, nil, ec.marshalOString2ᚖstring, @@ -7154,7 +7450,7 @@ func (ec *executionContext) _MetricValue_unit(ctx context.Context, field graphql return obj.Unit, nil }, nil, - ec.marshalNUnit2githubᚗcomᚋClusterCockpitᚋccᚑlibᚋschemaᚐUnit, + ec.marshalNUnit2githubᚗcomᚋClusterCockpitᚋccᚑlibᚋv2ᚋschemaᚐUnit, true, true, ) @@ -7216,10 +7512,10 @@ func (ec *executionContext) _Mutation_createTag(ctx context.Context, field graph ec.fieldContext_Mutation_createTag, func(ctx context.Context) (any, error) { fc := graphql.GetFieldContext(ctx) - return ec.resolvers.Mutation().CreateTag(ctx, fc.Args["type"].(string), fc.Args["name"].(string), fc.Args["scope"].(string)) + return ec.Resolvers.Mutation().CreateTag(ctx, fc.Args["type"].(string), fc.Args["name"].(string), fc.Args["scope"].(string)) }, nil, - ec.marshalNTag2ᚖgithubᚗcomᚋClusterCockpitᚋccᚑlibᚋschemaᚐTag, + ec.marshalNTag2ᚖgithubᚗcomᚋClusterCockpitᚋccᚑlibᚋv2ᚋschemaᚐTag, true, true, ) @@ -7267,7 +7563,7 @@ func (ec *executionContext) _Mutation_deleteTag(ctx context.Context, field graph ec.fieldContext_Mutation_deleteTag, func(ctx context.Context) (any, error) { fc := graphql.GetFieldContext(ctx) - return ec.resolvers.Mutation().DeleteTag(ctx, fc.Args["id"].(string)) + return ec.Resolvers.Mutation().DeleteTag(ctx, fc.Args["id"].(string)) }, nil, ec.marshalNID2string, @@ -7308,10 +7604,10 @@ func (ec *executionContext) _Mutation_addTagsToJob(ctx context.Context, field gr ec.fieldContext_Mutation_addTagsToJob, func(ctx context.Context) (any, error) 
{ fc := graphql.GetFieldContext(ctx) - return ec.resolvers.Mutation().AddTagsToJob(ctx, fc.Args["job"].(string), fc.Args["tagIds"].([]string)) + return ec.Resolvers.Mutation().AddTagsToJob(ctx, fc.Args["job"].(string), fc.Args["tagIds"].([]string)) }, nil, - ec.marshalNTag2ᚕᚖgithubᚗcomᚋClusterCockpitᚋccᚑlibᚋschemaᚐTagᚄ, + ec.marshalNTag2ᚕᚖgithubᚗcomᚋClusterCockpitᚋccᚑlibᚋv2ᚋschemaᚐTagᚄ, true, true, ) @@ -7359,10 +7655,10 @@ func (ec *executionContext) _Mutation_removeTagsFromJob(ctx context.Context, fie ec.fieldContext_Mutation_removeTagsFromJob, func(ctx context.Context) (any, error) { fc := graphql.GetFieldContext(ctx) - return ec.resolvers.Mutation().RemoveTagsFromJob(ctx, fc.Args["job"].(string), fc.Args["tagIds"].([]string)) + return ec.Resolvers.Mutation().RemoveTagsFromJob(ctx, fc.Args["job"].(string), fc.Args["tagIds"].([]string)) }, nil, - ec.marshalNTag2ᚕᚖgithubᚗcomᚋClusterCockpitᚋccᚑlibᚋschemaᚐTagᚄ, + ec.marshalNTag2ᚕᚖgithubᚗcomᚋClusterCockpitᚋccᚑlibᚋv2ᚋschemaᚐTagᚄ, true, true, ) @@ -7410,7 +7706,7 @@ func (ec *executionContext) _Mutation_removeTagFromList(ctx context.Context, fie ec.fieldContext_Mutation_removeTagFromList, func(ctx context.Context) (any, error) { fc := graphql.GetFieldContext(ctx) - return ec.resolvers.Mutation().RemoveTagFromList(ctx, fc.Args["tagIds"].([]string)) + return ec.Resolvers.Mutation().RemoveTagFromList(ctx, fc.Args["tagIds"].([]string)) }, nil, ec.marshalNInt2ᚕintᚄ, @@ -7451,7 +7747,7 @@ func (ec *executionContext) _Mutation_updateConfiguration(ctx context.Context, f ec.fieldContext_Mutation_updateConfiguration, func(ctx context.Context) (any, error) { fc := graphql.GetFieldContext(ctx) - return ec.resolvers.Mutation().UpdateConfiguration(ctx, fc.Args["name"].(string), fc.Args["value"].(string)) + return ec.Resolvers.Mutation().UpdateConfiguration(ctx, fc.Args["name"].(string), fc.Args["value"].(string)) }, nil, ec.marshalOString2ᚖstring, @@ -7523,7 +7819,7 @@ func (ec *executionContext) _NamedStats_data(ctx 
context.Context, field graphql. return obj.Data, nil }, nil, - ec.marshalNMetricStatistics2ᚖgithubᚗcomᚋClusterCockpitᚋccᚑlibᚋschemaᚐMetricStatistics, + ec.marshalNMetricStatistics2ᚖgithubᚗcomᚋClusterCockpitᚋccᚑlibᚋv2ᚋschemaᚐMetricStatistics, true, true, ) @@ -7589,7 +7885,7 @@ func (ec *executionContext) _NamedStatsWithScope_scope(ctx context.Context, fiel return obj.Scope, nil }, nil, - ec.marshalNMetricScope2githubᚗcomᚋClusterCockpitᚋccᚑlibᚋschemaᚐMetricScope, + ec.marshalNMetricScope2githubᚗcomᚋClusterCockpitᚋccᚑlibᚋv2ᚋschemaᚐMetricScope, true, true, ) @@ -7652,7 +7948,7 @@ func (ec *executionContext) _Node_id(ctx context.Context, field graphql.Collecte field, ec.fieldContext_Node_id, func(ctx context.Context) (any, error) { - return ec.resolvers.Node().ID(ctx, obj) + return ec.Resolvers.Node().ID(ctx, obj) }, nil, ec.marshalNID2string, @@ -7884,10 +8180,10 @@ func (ec *executionContext) _Node_schedulerState(ctx context.Context, field grap field, ec.fieldContext_Node_schedulerState, func(ctx context.Context) (any, error) { - return ec.resolvers.Node().SchedulerState(ctx, obj) + return ec.Resolvers.Node().SchedulerState(ctx, obj) }, nil, - ec.marshalNSchedulerState2githubᚗcomᚋClusterCockpitᚋccᚑlibᚋschemaᚐSchedulerState, + ec.marshalNSchedulerState2githubᚗcomᚋClusterCockpitᚋccᚑlibᚋv2ᚋschemaᚐSchedulerState, true, true, ) @@ -7913,7 +8209,7 @@ func (ec *executionContext) _Node_healthState(ctx context.Context, field graphql field, ec.fieldContext_Node_healthState, func(ctx context.Context) (any, error) { - return ec.resolvers.Node().HealthState(ctx, obj) + return ec.Resolvers.Node().HealthState(ctx, obj) }, nil, ec.marshalNMonitoringState2string, @@ -7942,7 +8238,7 @@ func (ec *executionContext) _Node_metaData(ctx context.Context, field graphql.Co field, ec.fieldContext_Node_metaData, func(ctx context.Context) (any, error) { - return ec.resolvers.Node().MetaData(ctx, obj) + return ec.Resolvers.Node().MetaData(ctx, obj) }, nil, ec.marshalOAny2interface, @@ -7964,6 
+8260,35 @@ func (ec *executionContext) fieldContext_Node_metaData(_ context.Context, field return fc, nil } +func (ec *executionContext) _Node_healthData(ctx context.Context, field graphql.CollectedField, obj *schema.Node) (ret graphql.Marshaler) { + return graphql.ResolveField( + ctx, + ec.OperationContext, + field, + ec.fieldContext_Node_healthData, + func(ctx context.Context) (any, error) { + return ec.Resolvers.Node().HealthData(ctx, obj) + }, + nil, + ec.marshalOAny2interface, + true, + false, + ) +} + +func (ec *executionContext) fieldContext_Node_healthData(_ context.Context, field graphql.CollectedField) (fc *graphql.FieldContext, err error) { + fc = &graphql.FieldContext{ + Object: "Node", + Field: field, + IsMethod: true, + IsResolver: true, + Child: func(ctx context.Context, field graphql.CollectedField) (*graphql.FieldContext, error) { + return nil, errors.New("field of type Any does not have child fields") + }, + } + return fc, nil +} + func (ec *executionContext) _NodeMetrics_host(ctx context.Context, field graphql.CollectedField, obj *model.NodeMetrics) (ret graphql.Marshaler) { return graphql.ResolveField( ctx, @@ -7994,34 +8319,19 @@ func (ec *executionContext) fieldContext_NodeMetrics_host(_ context.Context, fie } func (ec *executionContext) _NodeMetrics_state(ctx context.Context, field graphql.CollectedField, obj *model.NodeMetrics) (ret graphql.Marshaler) { - fc, err := ec.fieldContext_NodeMetrics_state(ctx, field) - if err != nil { - return graphql.Null - } - ctx = graphql.WithFieldContext(ctx, fc) - defer func() { - if r := recover(); r != nil { - ec.Error(ctx, ec.Recover(ctx, r)) - ret = graphql.Null - } - }() - resTmp, err := ec.ResolverMiddleware(ctx, func(rctx context.Context) (any, error) { - ctx = rctx // use context from middleware stack in children - return obj.State, nil - }) - if err != nil { - ec.Error(ctx, err) - return graphql.Null - } - if resTmp == nil { - if !graphql.HasFieldError(ctx, fc) { - ec.Errorf(ctx, "must not be 
null") - } - return graphql.Null - } - res := resTmp.(string) - fc.Result = res - return ec.marshalNString2string(ctx, field.Selections, res) + return graphql.ResolveField( + ctx, + ec.OperationContext, + field, + ec.fieldContext_NodeMetrics_state, + func(ctx context.Context) (any, error) { + return obj.State, nil + }, + nil, + ec.marshalNString2string, + true, + true, + ) } func (ec *executionContext) fieldContext_NodeMetrics_state(_ context.Context, field graphql.CollectedField) (fc *graphql.FieldContext, err error) { @@ -8113,7 +8423,7 @@ func (ec *executionContext) _NodeStateResultList_items(ctx context.Context, fiel return obj.Items, nil }, nil, - ec.marshalNNode2ᚕᚖgithubᚗcomᚋClusterCockpitᚋccᚑlibᚋschemaᚐNodeᚄ, + ec.marshalNNode2ᚕᚖgithubᚗcomᚋClusterCockpitᚋccᚑlibᚋv2ᚋschemaᚐNodeᚄ, true, true, ) @@ -8149,6 +8459,8 @@ func (ec *executionContext) fieldContext_NodeStateResultList_items(_ context.Con return ec.fieldContext_Node_healthState(ctx, field) case "metaData": return ec.fieldContext_Node_metaData(ctx, field) + case "healthData": + return ec.fieldContext_Node_healthData(ctx, field) } return nil, fmt.Errorf("no field named %q was found under type Node", field.Name) }, @@ -8273,34 +8585,19 @@ func (ec *executionContext) fieldContext_NodeStatesTimed_state(_ context.Context } func (ec *executionContext) _NodeStatesTimed_counts(ctx context.Context, field graphql.CollectedField, obj *model.NodeStatesTimed) (ret graphql.Marshaler) { - fc, err := ec.fieldContext_NodeStatesTimed_counts(ctx, field) - if err != nil { - return graphql.Null - } - ctx = graphql.WithFieldContext(ctx, fc) - defer func() { - if r := recover(); r != nil { - ec.Error(ctx, ec.Recover(ctx, r)) - ret = graphql.Null - } - }() - resTmp, err := ec.ResolverMiddleware(ctx, func(rctx context.Context) (any, error) { - ctx = rctx // use context from middleware stack in children - return obj.Counts, nil - }) - if err != nil { - ec.Error(ctx, err) - return graphql.Null - } - if resTmp == nil { - if 
!graphql.HasFieldError(ctx, fc) { - ec.Errorf(ctx, "must not be null") - } - return graphql.Null - } - res := resTmp.([]int) - fc.Result = res - return ec.marshalNInt2ᚕintᚄ(ctx, field.Selections, res) + return graphql.ResolveField( + ctx, + ec.OperationContext, + field, + ec.fieldContext_NodeStatesTimed_counts, + func(ctx context.Context) (any, error) { + return obj.Counts, nil + }, + nil, + ec.marshalNInt2ᚕintᚄ, + true, + true, + ) } func (ec *executionContext) fieldContext_NodeStatesTimed_counts(_ context.Context, field graphql.CollectedField) (fc *graphql.FieldContext, err error) { @@ -8317,34 +8614,19 @@ func (ec *executionContext) fieldContext_NodeStatesTimed_counts(_ context.Contex } func (ec *executionContext) _NodeStatesTimed_times(ctx context.Context, field graphql.CollectedField, obj *model.NodeStatesTimed) (ret graphql.Marshaler) { - fc, err := ec.fieldContext_NodeStatesTimed_times(ctx, field) - if err != nil { - return graphql.Null - } - ctx = graphql.WithFieldContext(ctx, fc) - defer func() { - if r := recover(); r != nil { - ec.Error(ctx, ec.Recover(ctx, r)) - ret = graphql.Null - } - }() - resTmp, err := ec.ResolverMiddleware(ctx, func(rctx context.Context) (any, error) { - ctx = rctx // use context from middleware stack in children - return obj.Times, nil - }) - if err != nil { - ec.Error(ctx, err) - return graphql.Null - } - if resTmp == nil { - if !graphql.HasFieldError(ctx, fc) { - ec.Errorf(ctx, "must not be null") - } - return graphql.Null - } - res := resTmp.([]int) - fc.Result = res - return ec.marshalNInt2ᚕintᚄ(ctx, field.Selections, res) + return graphql.ResolveField( + ctx, + ec.OperationContext, + field, + ec.fieldContext_NodeStatesTimed_times, + func(ctx context.Context) (any, error) { + return obj.Times, nil + }, + nil, + ec.marshalNInt2ᚕintᚄ, + true, + true, + ) } func (ec *executionContext) fieldContext_NodeStatesTimed_times(_ context.Context, field graphql.CollectedField) (fc *graphql.FieldContext, err error) { @@ -8551,10 +8833,10 
@@ func (ec *executionContext) _Query_clusters(ctx context.Context, field graphql.C field, ec.fieldContext_Query_clusters, func(ctx context.Context) (any, error) { - return ec.resolvers.Query().Clusters(ctx) + return ec.Resolvers.Query().Clusters(ctx) }, nil, - ec.marshalNCluster2ᚕᚖgithubᚗcomᚋClusterCockpitᚋccᚑlibᚋschemaᚐClusterᚄ, + ec.marshalNCluster2ᚕᚖgithubᚗcomᚋClusterCockpitᚋccᚑlibᚋv2ᚋschemaᚐClusterᚄ, true, true, ) @@ -8588,10 +8870,10 @@ func (ec *executionContext) _Query_tags(ctx context.Context, field graphql.Colle field, ec.fieldContext_Query_tags, func(ctx context.Context) (any, error) { - return ec.resolvers.Query().Tags(ctx) + return ec.Resolvers.Query().Tags(ctx) }, nil, - ec.marshalNTag2ᚕᚖgithubᚗcomᚋClusterCockpitᚋccᚑlibᚋschemaᚐTagᚄ, + ec.marshalNTag2ᚕᚖgithubᚗcomᚋClusterCockpitᚋccᚑlibᚋv2ᚋschemaᚐTagᚄ, true, true, ) @@ -8627,10 +8909,10 @@ func (ec *executionContext) _Query_globalMetrics(ctx context.Context, field grap field, ec.fieldContext_Query_globalMetrics, func(ctx context.Context) (any, error) { - return ec.resolvers.Query().GlobalMetrics(ctx) + return ec.Resolvers.Query().GlobalMetrics(ctx) }, nil, - ec.marshalNGlobalMetricListItem2ᚕᚖgithubᚗcomᚋClusterCockpitᚋccᚑlibᚋschemaᚐGlobalMetricListItemᚄ, + ec.marshalNGlobalMetricListItem2ᚕᚖgithubᚗcomᚋClusterCockpitᚋccᚑlibᚋv2ᚋschemaᚐGlobalMetricListItemᚄ, true, true, ) @@ -8669,7 +8951,7 @@ func (ec *executionContext) _Query_user(ctx context.Context, field graphql.Colle ec.fieldContext_Query_user, func(ctx context.Context) (any, error) { fc := graphql.GetFieldContext(ctx) - return ec.resolvers.Query().User(ctx, fc.Args["username"].(string)) + return ec.Resolvers.Query().User(ctx, fc.Args["username"].(string)) }, nil, ec.marshalOUser2ᚖgithubᚗcomᚋClusterCockpitᚋccᚑbackendᚋinternalᚋgraphᚋmodelᚐUser, @@ -8718,7 +9000,7 @@ func (ec *executionContext) _Query_allocatedNodes(ctx context.Context, field gra ec.fieldContext_Query_allocatedNodes, func(ctx context.Context) (any, error) { fc := 
graphql.GetFieldContext(ctx) - return ec.resolvers.Query().AllocatedNodes(ctx, fc.Args["cluster"].(string)) + return ec.Resolvers.Query().AllocatedNodes(ctx, fc.Args["cluster"].(string)) }, nil, ec.marshalNCount2ᚕᚖgithubᚗcomᚋClusterCockpitᚋccᚑbackendᚋinternalᚋgraphᚋmodelᚐCountᚄ, @@ -8765,10 +9047,10 @@ func (ec *executionContext) _Query_node(ctx context.Context, field graphql.Colle ec.fieldContext_Query_node, func(ctx context.Context) (any, error) { fc := graphql.GetFieldContext(ctx) - return ec.resolvers.Query().Node(ctx, fc.Args["id"].(string)) + return ec.Resolvers.Query().Node(ctx, fc.Args["id"].(string)) }, nil, - ec.marshalONode2ᚖgithubᚗcomᚋClusterCockpitᚋccᚑlibᚋschemaᚐNode, + ec.marshalONode2ᚖgithubᚗcomᚋClusterCockpitᚋccᚑlibᚋv2ᚋschemaᚐNode, true, false, ) @@ -8804,6 +9086,8 @@ func (ec *executionContext) fieldContext_Query_node(ctx context.Context, field g return ec.fieldContext_Node_healthState(ctx, field) case "metaData": return ec.fieldContext_Node_metaData(ctx, field) + case "healthData": + return ec.fieldContext_Node_healthData(ctx, field) } return nil, fmt.Errorf("no field named %q was found under type Node", field.Name) }, @@ -8830,7 +9114,7 @@ func (ec *executionContext) _Query_nodes(ctx context.Context, field graphql.Coll ec.fieldContext_Query_nodes, func(ctx context.Context) (any, error) { fc := graphql.GetFieldContext(ctx) - return ec.resolvers.Query().Nodes(ctx, fc.Args["filter"].([]*model.NodeFilter), fc.Args["order"].(*model.OrderByInput)) + return ec.Resolvers.Query().Nodes(ctx, fc.Args["filter"].([]*model.NodeFilter), fc.Args["order"].(*model.OrderByInput)) }, nil, ec.marshalNNodeStateResultList2ᚖgithubᚗcomᚋClusterCockpitᚋccᚑbackendᚋinternalᚋgraphᚋmodelᚐNodeStateResultList, @@ -8869,6 +9153,53 @@ func (ec *executionContext) fieldContext_Query_nodes(ctx context.Context, field return fc, nil } +func (ec *executionContext) _Query_nodesWithMeta(ctx context.Context, field graphql.CollectedField) (ret graphql.Marshaler) { + return 
graphql.ResolveField( + ctx, + ec.OperationContext, + field, + ec.fieldContext_Query_nodesWithMeta, + func(ctx context.Context) (any, error) { + fc := graphql.GetFieldContext(ctx) + return ec.Resolvers.Query().NodesWithMeta(ctx, fc.Args["filter"].([]*model.NodeFilter), fc.Args["order"].(*model.OrderByInput)) + }, + nil, + ec.marshalNNodeStateResultList2ᚖgithubᚗcomᚋClusterCockpitᚋccᚑbackendᚋinternalᚋgraphᚋmodelᚐNodeStateResultList, + true, + true, + ) +} + +func (ec *executionContext) fieldContext_Query_nodesWithMeta(ctx context.Context, field graphql.CollectedField) (fc *graphql.FieldContext, err error) { + fc = &graphql.FieldContext{ + Object: "Query", + Field: field, + IsMethod: true, + IsResolver: true, + Child: func(ctx context.Context, field graphql.CollectedField) (*graphql.FieldContext, error) { + switch field.Name { + case "items": + return ec.fieldContext_NodeStateResultList_items(ctx, field) + case "count": + return ec.fieldContext_NodeStateResultList_count(ctx, field) + } + return nil, fmt.Errorf("no field named %q was found under type NodeStateResultList", field.Name) + }, + } + defer func() { + if r := recover(); r != nil { + err = ec.Recover(ctx, r) + ec.Error(ctx, err) + } + }() + ctx = graphql.WithFieldContext(ctx, fc) + if fc.Args, err = ec.field_Query_nodesWithMeta_args(ctx, field.ArgumentMap(ec.Variables)); err != nil { + ec.Error(ctx, err) + return fc, err + } + return fc, nil +} + func (ec *executionContext) _Query_nodeStates(ctx context.Context, field graphql.CollectedField) (ret graphql.Marshaler) { return graphql.ResolveField( ctx, @@ -8877,7 +9208,7 @@ func (ec *executionContext) _Query_nodeStates(ctx context.Context, field graphql ec.fieldContext_Query_nodeStates, func(ctx context.Context) (any, error) { fc := graphql.GetFieldContext(ctx) - return ec.resolvers.Query().NodeStates(ctx, fc.Args["filter"].([]*model.NodeFilter)) + return ec.Resolvers.Query().NodeStates(ctx, fc.Args["filter"].([]*model.NodeFilter)) }, nil, 
ec.marshalNNodeStates2ᚕᚖgithubᚗcomᚋClusterCockpitᚋccᚑbackendᚋinternalᚋgraphᚋmodelᚐNodeStatesᚄ, @@ -8917,34 +9248,20 @@ func (ec *executionContext) fieldContext_Query_nodeStates(ctx context.Context, f } func (ec *executionContext) _Query_nodeStatesTimed(ctx context.Context, field graphql.CollectedField) (ret graphql.Marshaler) { - fc, err := ec.fieldContext_Query_nodeStatesTimed(ctx, field) - if err != nil { - return graphql.Null - } - ctx = graphql.WithFieldContext(ctx, fc) - defer func() { - if r := recover(); r != nil { - ec.Error(ctx, ec.Recover(ctx, r)) - ret = graphql.Null - } - }() - resTmp, err := ec.ResolverMiddleware(ctx, func(rctx context.Context) (any, error) { - ctx = rctx // use context from middleware stack in children - return ec.resolvers.Query().NodeStatesTimed(rctx, fc.Args["filter"].([]*model.NodeFilter), fc.Args["type"].(string)) - }) - if err != nil { - ec.Error(ctx, err) - return graphql.Null - } - if resTmp == nil { - if !graphql.HasFieldError(ctx, fc) { - ec.Errorf(ctx, "must not be null") - } - return graphql.Null - } - res := resTmp.([]*model.NodeStatesTimed) - fc.Result = res - return ec.marshalNNodeStatesTimed2ᚕᚖgithubᚗcomᚋClusterCockpitᚋccᚑbackendᚋinternalᚋgraphᚋmodelᚐNodeStatesTimedᚄ(ctx, field.Selections, res) + return graphql.ResolveField( + ctx, + ec.OperationContext, + field, + ec.fieldContext_Query_nodeStatesTimed, + func(ctx context.Context) (any, error) { + fc := graphql.GetFieldContext(ctx) + return ec.Resolvers.Query().NodeStatesTimed(ctx, fc.Args["filter"].([]*model.NodeFilter), fc.Args["type"].(string)) + }, + nil, + ec.marshalNNodeStatesTimed2ᚕᚖgithubᚗcomᚋClusterCockpitᚋccᚑbackendᚋinternalᚋgraphᚋmodelᚐNodeStatesTimedᚄ, + true, + true, + ) } func (ec *executionContext) fieldContext_Query_nodeStatesTimed(ctx context.Context, field graphql.CollectedField) (fc *graphql.FieldContext, err error) { @@ -8987,10 +9304,10 @@ func (ec *executionContext) _Query_job(ctx context.Context, field graphql.Collec ec.fieldContext_Query_job, 
func(ctx context.Context) (any, error) { fc := graphql.GetFieldContext(ctx) - return ec.resolvers.Query().Job(ctx, fc.Args["id"].(string)) + return ec.Resolvers.Query().Job(ctx, fc.Args["id"].(string)) }, nil, - ec.marshalOJob2ᚖgithubᚗcomᚋClusterCockpitᚋccᚑlibᚋschemaᚐJob, + ec.marshalOJob2ᚖgithubᚗcomᚋClusterCockpitᚋccᚑlibᚋv2ᚋschemaᚐJob, true, false, ) @@ -9082,7 +9399,7 @@ func (ec *executionContext) _Query_jobMetrics(ctx context.Context, field graphql ec.fieldContext_Query_jobMetrics, func(ctx context.Context) (any, error) { fc := graphql.GetFieldContext(ctx) - return ec.resolvers.Query().JobMetrics(ctx, fc.Args["id"].(string), fc.Args["metrics"].([]string), fc.Args["scopes"].([]schema.MetricScope), fc.Args["resolution"].(*int)) + return ec.Resolvers.Query().JobMetrics(ctx, fc.Args["id"].(string), fc.Args["metrics"].([]string), fc.Args["scopes"].([]schema.MetricScope), fc.Args["resolution"].(*int)) }, nil, ec.marshalNJobMetricWithName2ᚕᚖgithubᚗcomᚋClusterCockpitᚋccᚑbackendᚋinternalᚋgraphᚋmodelᚐJobMetricWithNameᚄ, @@ -9131,7 +9448,7 @@ func (ec *executionContext) _Query_jobStats(ctx context.Context, field graphql.C ec.fieldContext_Query_jobStats, func(ctx context.Context) (any, error) { fc := graphql.GetFieldContext(ctx) - return ec.resolvers.Query().JobStats(ctx, fc.Args["id"].(string), fc.Args["metrics"].([]string)) + return ec.Resolvers.Query().JobStats(ctx, fc.Args["id"].(string), fc.Args["metrics"].([]string)) }, nil, ec.marshalNNamedStats2ᚕᚖgithubᚗcomᚋClusterCockpitᚋccᚑbackendᚋinternalᚋgraphᚋmodelᚐNamedStatsᚄ, @@ -9178,7 +9495,7 @@ func (ec *executionContext) _Query_scopedJobStats(ctx context.Context, field gra ec.fieldContext_Query_scopedJobStats, func(ctx context.Context) (any, error) { fc := graphql.GetFieldContext(ctx) - return ec.resolvers.Query().ScopedJobStats(ctx, fc.Args["id"].(string), fc.Args["metrics"].([]string), fc.Args["scopes"].([]schema.MetricScope)) + return ec.Resolvers.Query().ScopedJobStats(ctx, fc.Args["id"].(string), 
fc.Args["metrics"].([]string), fc.Args["scopes"].([]schema.MetricScope)) }, nil, ec.marshalNNamedStatsWithScope2ᚕᚖgithubᚗcomᚋClusterCockpitᚋccᚑbackendᚋinternalᚋgraphᚋmodelᚐNamedStatsWithScopeᚄ, @@ -9227,7 +9544,7 @@ func (ec *executionContext) _Query_jobs(ctx context.Context, field graphql.Colle ec.fieldContext_Query_jobs, func(ctx context.Context) (any, error) { fc := graphql.GetFieldContext(ctx) - return ec.resolvers.Query().Jobs(ctx, fc.Args["filter"].([]*model.JobFilter), fc.Args["page"].(*model.PageRequest), fc.Args["order"].(*model.OrderByInput)) + return ec.Resolvers.Query().Jobs(ctx, fc.Args["filter"].([]*model.JobFilter), fc.Args["page"].(*model.PageRequest), fc.Args["order"].(*model.OrderByInput)) }, nil, ec.marshalNJobResultList2ᚖgithubᚗcomᚋClusterCockpitᚋccᚑbackendᚋinternalᚋgraphᚋmodelᚐJobResultList, @@ -9280,7 +9597,7 @@ func (ec *executionContext) _Query_jobsStatistics(ctx context.Context, field gra ec.fieldContext_Query_jobsStatistics, func(ctx context.Context) (any, error) { fc := graphql.GetFieldContext(ctx) - return ec.resolvers.Query().JobsStatistics(ctx, fc.Args["filter"].([]*model.JobFilter), fc.Args["metrics"].([]string), fc.Args["page"].(*model.PageRequest), fc.Args["sortBy"].(*model.SortByAggregate), fc.Args["groupBy"].(*model.Aggregate), fc.Args["numDurationBins"].(*string), fc.Args["numMetricBins"].(*int)) + return ec.Resolvers.Query().JobsStatistics(ctx, fc.Args["filter"].([]*model.JobFilter), fc.Args["metrics"].([]string), fc.Args["page"].(*model.PageRequest), fc.Args["sortBy"].(*model.SortByAggregate), fc.Args["groupBy"].(*model.Aggregate), fc.Args["numDurationBins"].(*string), fc.Args["numMetricBins"].(*int)) }, nil, ec.marshalNJobsStatistics2ᚕᚖgithubᚗcomᚋClusterCockpitᚋccᚑbackendᚋinternalᚋgraphᚋmodelᚐJobsStatisticsᚄ, @@ -9359,7 +9676,7 @@ func (ec *executionContext) _Query_jobsMetricStats(ctx context.Context, field gr ec.fieldContext_Query_jobsMetricStats, func(ctx context.Context) (any, error) { fc := graphql.GetFieldContext(ctx) - 
return ec.resolvers.Query().JobsMetricStats(ctx, fc.Args["filter"].([]*model.JobFilter), fc.Args["metrics"].([]string)) + return ec.Resolvers.Query().JobsMetricStats(ctx, fc.Args["filter"].([]*model.JobFilter), fc.Args["metrics"].([]string)) }, nil, ec.marshalNJobStats2ᚕᚖgithubᚗcomᚋClusterCockpitᚋccᚑbackendᚋinternalᚋgraphᚋmodelᚐJobStatsᚄ, @@ -9422,7 +9739,7 @@ func (ec *executionContext) _Query_jobsFootprints(ctx context.Context, field gra ec.fieldContext_Query_jobsFootprints, func(ctx context.Context) (any, error) { fc := graphql.GetFieldContext(ctx) - return ec.resolvers.Query().JobsFootprints(ctx, fc.Args["filter"].([]*model.JobFilter), fc.Args["metrics"].([]string)) + return ec.Resolvers.Query().JobsFootprints(ctx, fc.Args["filter"].([]*model.JobFilter), fc.Args["metrics"].([]string)) }, nil, ec.marshalOFootprints2ᚖgithubᚗcomᚋClusterCockpitᚋccᚑbackendᚋinternalᚋgraphᚋmodelᚐFootprints, @@ -9469,7 +9786,7 @@ func (ec *executionContext) _Query_rooflineHeatmap(ctx context.Context, field gr ec.fieldContext_Query_rooflineHeatmap, func(ctx context.Context) (any, error) { fc := graphql.GetFieldContext(ctx) - return ec.resolvers.Query().RooflineHeatmap(ctx, fc.Args["filter"].([]*model.JobFilter), fc.Args["rows"].(int), fc.Args["cols"].(int), fc.Args["minX"].(float64), fc.Args["minY"].(float64), fc.Args["maxX"].(float64), fc.Args["maxY"].(float64)) + return ec.Resolvers.Query().RooflineHeatmap(ctx, fc.Args["filter"].([]*model.JobFilter), fc.Args["rows"].(int), fc.Args["cols"].(int), fc.Args["minX"].(float64), fc.Args["minY"].(float64), fc.Args["maxX"].(float64), fc.Args["maxY"].(float64)) }, nil, ec.marshalNFloat2ᚕᚕfloat64ᚄ, @@ -9510,7 +9827,7 @@ func (ec *executionContext) _Query_nodeMetrics(ctx context.Context, field graphq ec.fieldContext_Query_nodeMetrics, func(ctx context.Context) (any, error) { fc := graphql.GetFieldContext(ctx) - return ec.resolvers.Query().NodeMetrics(ctx, fc.Args["cluster"].(string), fc.Args["nodes"].([]string), 
fc.Args["scopes"].([]schema.MetricScope), fc.Args["metrics"].([]string), fc.Args["from"].(time.Time), fc.Args["to"].(time.Time)) + return ec.Resolvers.Query().NodeMetrics(ctx, fc.Args["cluster"].(string), fc.Args["nodes"].([]string), fc.Args["scopes"].([]schema.MetricScope), fc.Args["metrics"].([]string), fc.Args["from"].(time.Time), fc.Args["to"].(time.Time)) }, nil, ec.marshalNNodeMetrics2ᚕᚖgithubᚗcomᚋClusterCockpitᚋccᚑbackendᚋinternalᚋgraphᚋmodelᚐNodeMetricsᚄ, @@ -9554,34 +9871,20 @@ func (ec *executionContext) fieldContext_Query_nodeMetrics(ctx context.Context, } func (ec *executionContext) _Query_nodeMetricsList(ctx context.Context, field graphql.CollectedField) (ret graphql.Marshaler) { - fc, err := ec.fieldContext_Query_nodeMetricsList(ctx, field) - if err != nil { - return graphql.Null - } - ctx = graphql.WithFieldContext(ctx, fc) - defer func() { - if r := recover(); r != nil { - ec.Error(ctx, ec.Recover(ctx, r)) - ret = graphql.Null - } - }() - resTmp, err := ec.ResolverMiddleware(ctx, func(rctx context.Context) (any, error) { - ctx = rctx // use context from middleware stack in children - return ec.resolvers.Query().NodeMetricsList(rctx, fc.Args["cluster"].(string), fc.Args["subCluster"].(string), fc.Args["stateFilter"].(string), fc.Args["nodeFilter"].(string), fc.Args["scopes"].([]schema.MetricScope), fc.Args["metrics"].([]string), fc.Args["from"].(time.Time), fc.Args["to"].(time.Time), fc.Args["page"].(*model.PageRequest), fc.Args["resolution"].(*int)) - }) - if err != nil { - ec.Error(ctx, err) - return graphql.Null - } - if resTmp == nil { - if !graphql.HasFieldError(ctx, fc) { - ec.Errorf(ctx, "must not be null") - } - return graphql.Null - } - res := resTmp.(*model.NodesResultList) - fc.Result = res - return ec.marshalNNodesResultList2ᚖgithubᚗcomᚋClusterCockpitᚋccᚑbackendᚋinternalᚋgraphᚋmodelᚐNodesResultList(ctx, field.Selections, res) + return graphql.ResolveField( + ctx, + ec.OperationContext, + field, + ec.fieldContext_Query_nodeMetricsList, + 
func(ctx context.Context) (any, error) { + fc := graphql.GetFieldContext(ctx) + return ec.Resolvers.Query().NodeMetricsList(ctx, fc.Args["cluster"].(string), fc.Args["subCluster"].(string), fc.Args["stateFilter"].(string), fc.Args["nodeFilter"].(string), fc.Args["scopes"].([]schema.MetricScope), fc.Args["metrics"].([]string), fc.Args["from"].(time.Time), fc.Args["to"].(time.Time), fc.Args["page"].(*model.PageRequest), fc.Args["resolution"].(*int)) + }, + nil, + ec.marshalNNodesResultList2ᚖgithubᚗcomᚋClusterCockpitᚋccᚑbackendᚋinternalᚋgraphᚋmodelᚐNodesResultList, + true, + true, + ) } func (ec *executionContext) fieldContext_Query_nodeMetricsList(ctx context.Context, field graphql.CollectedField) (fc *graphql.FieldContext, err error) { @@ -9622,6 +9925,53 @@ func (ec *executionContext) fieldContext_Query_nodeMetricsList(ctx context.Conte return fc, nil } +func (ec *executionContext) _Query_clusterMetrics(ctx context.Context, field graphql.CollectedField) (ret graphql.Marshaler) { + return graphql.ResolveField( + ctx, + ec.OperationContext, + field, + ec.fieldContext_Query_clusterMetrics, + func(ctx context.Context) (any, error) { + fc := graphql.GetFieldContext(ctx) + return ec.Resolvers.Query().ClusterMetrics(ctx, fc.Args["cluster"].(string), fc.Args["metrics"].([]string), fc.Args["from"].(time.Time), fc.Args["to"].(time.Time)) + }, + nil, + ec.marshalNClusterMetrics2ᚖgithubᚗcomᚋClusterCockpitᚋccᚑbackendᚋinternalᚋgraphᚋmodelᚐClusterMetrics, + true, + true, + ) +} + +func (ec *executionContext) fieldContext_Query_clusterMetrics(ctx context.Context, field graphql.CollectedField) (fc *graphql.FieldContext, err error) { + fc = &graphql.FieldContext{ + Object: "Query", + Field: field, + IsMethod: true, + IsResolver: true, + Child: func(ctx context.Context, field graphql.CollectedField) (*graphql.FieldContext, error) { + switch field.Name { + case "nodeCount": + return ec.fieldContext_ClusterMetrics_nodeCount(ctx, field) + case "metrics": + return 
ec.fieldContext_ClusterMetrics_metrics(ctx, field) + } + return nil, fmt.Errorf("no field named %q was found under type ClusterMetrics", field.Name) + }, + } + defer func() { + if r := recover(); r != nil { + err = ec.Recover(ctx, r) + ec.Error(ctx, err) + } + }() + ctx = graphql.WithFieldContext(ctx, fc) + if fc.Args, err = ec.field_Query_clusterMetrics_args(ctx, field.ArgumentMap(ec.Variables)); err != nil { + ec.Error(ctx, err) + return fc, err + } + return fc, nil +} + func (ec *executionContext) _Query___type(ctx context.Context, field graphql.CollectedField) (ret graphql.Marshaler) { return graphql.ResolveField( ctx, @@ -9630,7 +9980,7 @@ func (ec *executionContext) _Query___type(ctx context.Context, field graphql.Col ec.fieldContext_Query___type, func(ctx context.Context) (any, error) { fc := graphql.GetFieldContext(ctx) - return ec.introspectType(fc.Args["name"].(string)) + return ec.IntrospectType(fc.Args["name"].(string)) }, nil, ec.marshalO__Type2ᚖgithubᚗcomᚋ99designsᚋgqlgenᚋgraphqlᚋintrospectionᚐType, @@ -9694,7 +10044,7 @@ func (ec *executionContext) _Query___schema(ctx context.Context, field graphql.C field, ec.fieldContext_Query___schema, func(ctx context.Context) (any, error) { - return ec.introspectSchema() + return ec.IntrospectSchema() }, nil, ec.marshalO__Schema2ᚖgithubᚗcomᚋ99designsᚋgqlgenᚋgraphqlᚋintrospectionᚐSchema, @@ -9914,7 +10264,7 @@ func (ec *executionContext) _ScopedStats_data(ctx context.Context, field graphql return obj.Data, nil }, nil, - ec.marshalNMetricStatistics2ᚖgithubᚗcomᚋClusterCockpitᚋccᚑlibᚋschemaᚐMetricStatistics, + ec.marshalNMetricStatistics2ᚖgithubᚗcomᚋClusterCockpitᚋccᚑlibᚋv2ᚋschemaᚐMetricStatistics, true, true, ) @@ -9977,7 +10327,7 @@ func (ec *executionContext) _Series_id(ctx context.Context, field graphql.Collec field, ec.fieldContext_Series_id, func(ctx context.Context) (any, error) { - return obj.Id, nil + return obj.ID, nil }, nil, ec.marshalOString2ᚖstring, @@ -10009,7 +10359,7 @@ func (ec *executionContext) 
_Series_statistics(ctx context.Context, field graphq return obj.Statistics, nil }, nil, - ec.marshalOMetricStatistics2githubᚗcomᚋClusterCockpitᚋccᚑlibᚋschemaᚐMetricStatistics, + ec.marshalOMetricStatistics2githubᚗcomᚋClusterCockpitᚋccᚑlibᚋv2ᚋschemaᚐMetricStatistics, true, false, ) @@ -10046,7 +10396,7 @@ func (ec *executionContext) _Series_data(ctx context.Context, field graphql.Coll return obj.Data, nil }, nil, - ec.marshalNNullableFloat2ᚕgithubᚗcomᚋClusterCockpitᚋccᚑlibᚋschemaᚐFloatᚄ, + ec.marshalNNullableFloat2ᚕgithubᚗcomᚋClusterCockpitᚋccᚑlibᚋv2ᚋschemaᚐFloatᚄ, true, true, ) @@ -10075,7 +10425,7 @@ func (ec *executionContext) _StatsSeries_mean(ctx context.Context, field graphql return obj.Mean, nil }, nil, - ec.marshalNNullableFloat2ᚕgithubᚗcomᚋClusterCockpitᚋccᚑlibᚋschemaᚐFloatᚄ, + ec.marshalNNullableFloat2ᚕgithubᚗcomᚋClusterCockpitᚋccᚑlibᚋv2ᚋschemaᚐFloatᚄ, true, true, ) @@ -10104,7 +10454,7 @@ func (ec *executionContext) _StatsSeries_median(ctx context.Context, field graph return obj.Median, nil }, nil, - ec.marshalNNullableFloat2ᚕgithubᚗcomᚋClusterCockpitᚋccᚑlibᚋschemaᚐFloatᚄ, + ec.marshalNNullableFloat2ᚕgithubᚗcomᚋClusterCockpitᚋccᚑlibᚋv2ᚋschemaᚐFloatᚄ, true, true, ) @@ -10133,7 +10483,7 @@ func (ec *executionContext) _StatsSeries_min(ctx context.Context, field graphql. return obj.Min, nil }, nil, - ec.marshalNNullableFloat2ᚕgithubᚗcomᚋClusterCockpitᚋccᚑlibᚋschemaᚐFloatᚄ, + ec.marshalNNullableFloat2ᚕgithubᚗcomᚋClusterCockpitᚋccᚑlibᚋv2ᚋschemaᚐFloatᚄ, true, true, ) @@ -10162,7 +10512,7 @@ func (ec *executionContext) _StatsSeries_max(ctx context.Context, field graphql. 
return obj.Max, nil }, nil, - ec.marshalNNullableFloat2ᚕgithubᚗcomᚋClusterCockpitᚋccᚑlibᚋschemaᚐFloatᚄ, + ec.marshalNNullableFloat2ᚕgithubᚗcomᚋClusterCockpitᚋccᚑlibᚋv2ᚋschemaᚐFloatᚄ, true, true, ) @@ -10246,7 +10596,7 @@ func (ec *executionContext) _SubCluster_numberOfNodes(ctx context.Context, field field, ec.fieldContext_SubCluster_numberOfNodes, func(ctx context.Context) (any, error) { - return ec.resolvers.SubCluster().NumberOfNodes(ctx, obj) + return ec.Resolvers.SubCluster().NumberOfNodes(ctx, obj) }, nil, ec.marshalNInt2int, @@ -10394,7 +10744,7 @@ func (ec *executionContext) _SubCluster_flopRateScalar(ctx context.Context, fiel return obj.FlopRateScalar, nil }, nil, - ec.marshalNMetricValue2githubᚗcomᚋClusterCockpitᚋccᚑlibᚋschemaᚐMetricValue, + ec.marshalNMetricValue2githubᚗcomᚋClusterCockpitᚋccᚑlibᚋv2ᚋschemaᚐMetricValue, true, true, ) @@ -10431,7 +10781,7 @@ func (ec *executionContext) _SubCluster_flopRateSimd(ctx context.Context, field return obj.FlopRateSimd, nil }, nil, - ec.marshalNMetricValue2githubᚗcomᚋClusterCockpitᚋccᚑlibᚋschemaᚐMetricValue, + ec.marshalNMetricValue2githubᚗcomᚋClusterCockpitᚋccᚑlibᚋv2ᚋschemaᚐMetricValue, true, true, ) @@ -10468,7 +10818,7 @@ func (ec *executionContext) _SubCluster_memoryBandwidth(ctx context.Context, fie return obj.MemoryBandwidth, nil }, nil, - ec.marshalNMetricValue2githubᚗcomᚋClusterCockpitᚋccᚑlibᚋschemaᚐMetricValue, + ec.marshalNMetricValue2githubᚗcomᚋClusterCockpitᚋccᚑlibᚋv2ᚋschemaᚐMetricValue, true, true, ) @@ -10505,7 +10855,7 @@ func (ec *executionContext) _SubCluster_topology(ctx context.Context, field grap return obj.Topology, nil }, nil, - ec.marshalNTopology2githubᚗcomᚋClusterCockpitᚋccᚑlibᚋschemaᚐTopology, + ec.marshalNTopology2githubᚗcomᚋClusterCockpitᚋccᚑlibᚋv2ᚋschemaᚐTopology, true, true, ) @@ -10548,7 +10898,7 @@ func (ec *executionContext) _SubCluster_metricConfig(ctx context.Context, field return obj.MetricConfig, nil }, nil, - 
ec.marshalNMetricConfig2ᚕgithubᚗcomᚋClusterCockpitᚋccᚑlibᚋschemaᚐMetricConfigᚄ, + ec.marshalNMetricConfig2ᚕᚖgithubᚗcomᚋClusterCockpitᚋccᚑlibᚋv2ᚋschemaᚐMetricConfigᚄ, true, true, ) @@ -11007,7 +11357,7 @@ func (ec *executionContext) _TimeWeights_nodeHours(ctx context.Context, field gr return obj.NodeHours, nil }, nil, - ec.marshalNNullableFloat2ᚕgithubᚗcomᚋClusterCockpitᚋccᚑlibᚋschemaᚐFloatᚄ, + ec.marshalNNullableFloat2ᚕgithubᚗcomᚋClusterCockpitᚋccᚑlibᚋv2ᚋschemaᚐFloatᚄ, true, true, ) @@ -11036,7 +11386,7 @@ func (ec *executionContext) _TimeWeights_accHours(ctx context.Context, field gra return obj.AccHours, nil }, nil, - ec.marshalNNullableFloat2ᚕgithubᚗcomᚋClusterCockpitᚋccᚑlibᚋschemaᚐFloatᚄ, + ec.marshalNNullableFloat2ᚕgithubᚗcomᚋClusterCockpitᚋccᚑlibᚋv2ᚋschemaᚐFloatᚄ, true, true, ) @@ -11065,7 +11415,7 @@ func (ec *executionContext) _TimeWeights_coreHours(ctx context.Context, field gr return obj.CoreHours, nil }, nil, - ec.marshalNNullableFloat2ᚕgithubᚗcomᚋClusterCockpitᚋccᚑlibᚋschemaᚐFloatᚄ, + ec.marshalNNullableFloat2ᚕgithubᚗcomᚋClusterCockpitᚋccᚑlibᚋv2ᚋschemaᚐFloatᚄ, true, true, ) @@ -11239,7 +11589,7 @@ func (ec *executionContext) _Topology_accelerators(ctx context.Context, field gr return obj.Accelerators, nil }, nil, - ec.marshalOAccelerator2ᚕᚖgithubᚗcomᚋClusterCockpitᚋccᚑlibᚋschemaᚐAcceleratorᚄ, + ec.marshalOAccelerator2ᚕᚖgithubᚗcomᚋClusterCockpitᚋccᚑlibᚋv2ᚋschemaᚐAcceleratorᚄ, true, false, ) @@ -12887,7 +13237,6 @@ func (ec *executionContext) unmarshalInputFloatRange(ctx context.Context, obj an it.To = data } } - return it, nil } @@ -12921,7 +13270,6 @@ func (ec *executionContext) unmarshalInputIntRange(ctx context.Context, obj any) it.To = data } } - return it, nil } @@ -12932,7 +13280,7 @@ func (ec *executionContext) unmarshalInputJobFilter(ctx context.Context, obj any asMap[k] = v } - fieldsInOrder := [...]string{"tags", "dbId", "jobId", "arrayJobId", "user", "project", "jobName", "cluster", "partition", "duration", "energy", "minRunningFor", 
"numNodes", "numAccelerators", "numHWThreads", "startTime", "state", "metricStats", "shared", "node"} + fieldsInOrder := [...]string{"tags", "dbId", "jobId", "arrayJobId", "user", "project", "jobName", "cluster", "subCluster", "partition", "duration", "energy", "minRunningFor", "numNodes", "numAccelerators", "numHWThreads", "startTime", "state", "metricStats", "shared", "schedule", "node"} for _, k := range fieldsInOrder { v, ok := asMap[k] if !ok { @@ -12995,6 +13343,13 @@ func (ec *executionContext) unmarshalInputJobFilter(ctx context.Context, obj any return it, err } it.Cluster = data + case "subCluster": + ctx := graphql.WithPathContext(ctx, graphql.NewPathWithField("subCluster")) + data, err := ec.unmarshalOStringInput2ᚖgithubᚗcomᚋClusterCockpitᚋccᚑbackendᚋinternalᚋgraphᚋmodelᚐStringInput(ctx, v) + if err != nil { + return it, err + } + it.SubCluster = data case "partition": ctx := graphql.WithPathContext(ctx, graphql.NewPathWithField("partition")) data, err := ec.unmarshalOStringInput2ᚖgithubᚗcomᚋClusterCockpitᚋccᚑbackendᚋinternalᚋgraphᚋmodelᚐStringInput(ctx, v) @@ -13053,7 +13408,7 @@ func (ec *executionContext) unmarshalInputJobFilter(ctx context.Context, obj any it.StartTime = data case "state": ctx := graphql.WithPathContext(ctx, graphql.NewPathWithField("state")) - data, err := ec.unmarshalOJobState2ᚕgithubᚗcomᚋClusterCockpitᚋccᚑlibᚋschemaᚐJobStateᚄ(ctx, v) + data, err := ec.unmarshalOJobState2ᚕgithubᚗcomᚋClusterCockpitᚋccᚑlibᚋv2ᚋschemaᚐJobStateᚄ(ctx, v) if err != nil { return it, err } @@ -13072,6 +13427,13 @@ func (ec *executionContext) unmarshalInputJobFilter(ctx context.Context, obj any return it, err } it.Shared = data + case "schedule": + ctx := graphql.WithPathContext(ctx, graphql.NewPathWithField("schedule")) + data, err := ec.unmarshalOString2ᚖstring(ctx, v) + if err != nil { + return it, err + } + it.Schedule = data case "node": ctx := graphql.WithPathContext(ctx, graphql.NewPathWithField("node")) data, err := 
ec.unmarshalOStringInput2ᚖgithubᚗcomᚋClusterCockpitᚋccᚑbackendᚋinternalᚋgraphᚋmodelᚐStringInput(ctx, v) @@ -13081,7 +13443,6 @@ func (ec *executionContext) unmarshalInputJobFilter(ctx context.Context, obj any it.Node = data } } - return it, nil } @@ -13115,7 +13476,6 @@ func (ec *executionContext) unmarshalInputMetricStatItem(ctx context.Context, ob it.Range = data } } - return it, nil } @@ -13126,7 +13486,7 @@ func (ec *executionContext) unmarshalInputNodeFilter(ctx context.Context, obj an asMap[k] = v } - fieldsInOrder := [...]string{"hostname", "cluster", "subcluster", "schedulerState", "healthState", "timeStart"} + fieldsInOrder := [...]string{"hostname", "cluster", "subCluster", "schedulerState", "healthState", "timeStart"} for _, k := range fieldsInOrder { v, ok := asMap[k] if !ok { @@ -13147,16 +13507,16 @@ func (ec *executionContext) unmarshalInputNodeFilter(ctx context.Context, obj an return it, err } it.Cluster = data - case "subcluster": - ctx := graphql.WithPathContext(ctx, graphql.NewPathWithField("subcluster")) + case "subCluster": + ctx := graphql.WithPathContext(ctx, graphql.NewPathWithField("subCluster")) data, err := ec.unmarshalOStringInput2ᚖgithubᚗcomᚋClusterCockpitᚋccᚑbackendᚋinternalᚋgraphᚋmodelᚐStringInput(ctx, v) if err != nil { return it, err } - it.Subcluster = data + it.SubCluster = data case "schedulerState": ctx := graphql.WithPathContext(ctx, graphql.NewPathWithField("schedulerState")) - data, err := ec.unmarshalOSchedulerState2ᚖgithubᚗcomᚋClusterCockpitᚋccᚑlibᚋschemaᚐSchedulerState(ctx, v) + data, err := ec.unmarshalOSchedulerState2ᚖgithubᚗcomᚋClusterCockpitᚋccᚑlibᚋv2ᚋschemaᚐSchedulerState(ctx, v) if err != nil { return it, err } @@ -13177,7 +13537,6 @@ func (ec *executionContext) unmarshalInputNodeFilter(ctx context.Context, obj an it.TimeStart = data } } - return it, nil } @@ -13222,7 +13581,6 @@ func (ec *executionContext) unmarshalInputOrderByInput(ctx context.Context, obj it.Order = data } } - return it, nil } @@ -13256,7 
+13614,6 @@ func (ec *executionContext) unmarshalInputPageRequest(ctx context.Context, obj a it.Page = data } } - return it, nil } @@ -13318,7 +13675,6 @@ func (ec *executionContext) unmarshalInputStringInput(ctx context.Context, obj a it.In = data } } - return it, nil } @@ -13359,7 +13715,6 @@ func (ec *executionContext) unmarshalInputTimeRange(ctx context.Context, obj any it.To = data } } - return it, nil } @@ -13406,10 +13761,10 @@ func (ec *executionContext) _Accelerator(ctx context.Context, sel ast.SelectionS return graphql.Null } - atomic.AddInt32(&ec.deferred, int32(len(deferred))) + atomic.AddInt32(&ec.Deferred, int32(len(deferred))) for label, dfs := range deferred { - ec.processDeferredGroup(graphql.DeferredGroup{ + ec.ProcessDeferredGroup(graphql.DeferredGroup{ Label: label, Path: graphql.GetPath(ctx), FieldSet: dfs, @@ -13486,10 +13841,105 @@ func (ec *executionContext) _Cluster(ctx context.Context, sel ast.SelectionSet, return graphql.Null } - atomic.AddInt32(&ec.deferred, int32(len(deferred))) + atomic.AddInt32(&ec.Deferred, int32(len(deferred))) for label, dfs := range deferred { - ec.processDeferredGroup(graphql.DeferredGroup{ + ec.ProcessDeferredGroup(graphql.DeferredGroup{ + Label: label, + Path: graphql.GetPath(ctx), + FieldSet: dfs, + Context: ctx, + }) + } + + return out +} + +var clusterMetricWithNameImplementors = []string{"ClusterMetricWithName"} + +func (ec *executionContext) _ClusterMetricWithName(ctx context.Context, sel ast.SelectionSet, obj *model.ClusterMetricWithName) graphql.Marshaler { + fields := graphql.CollectFields(ec.OperationContext, sel, clusterMetricWithNameImplementors) + + out := graphql.NewFieldSet(fields) + deferred := make(map[string]*graphql.FieldSet) + for i, field := range fields { + switch field.Name { + case "__typename": + out.Values[i] = graphql.MarshalString("ClusterMetricWithName") + case "name": + out.Values[i] = ec._ClusterMetricWithName_name(ctx, field, obj) + if out.Values[i] == graphql.Null { + 
out.Invalids++ + } + case "unit": + out.Values[i] = ec._ClusterMetricWithName_unit(ctx, field, obj) + case "timestep": + out.Values[i] = ec._ClusterMetricWithName_timestep(ctx, field, obj) + if out.Values[i] == graphql.Null { + out.Invalids++ + } + case "data": + out.Values[i] = ec._ClusterMetricWithName_data(ctx, field, obj) + if out.Values[i] == graphql.Null { + out.Invalids++ + } + default: + panic("unknown field " + strconv.Quote(field.Name)) + } + } + out.Dispatch(ctx) + if out.Invalids > 0 { + return graphql.Null + } + + atomic.AddInt32(&ec.Deferred, int32(len(deferred))) + + for label, dfs := range deferred { + ec.ProcessDeferredGroup(graphql.DeferredGroup{ + Label: label, + Path: graphql.GetPath(ctx), + FieldSet: dfs, + Context: ctx, + }) + } + + return out +} + +var clusterMetricsImplementors = []string{"ClusterMetrics"} + +func (ec *executionContext) _ClusterMetrics(ctx context.Context, sel ast.SelectionSet, obj *model.ClusterMetrics) graphql.Marshaler { + fields := graphql.CollectFields(ec.OperationContext, sel, clusterMetricsImplementors) + + out := graphql.NewFieldSet(fields) + deferred := make(map[string]*graphql.FieldSet) + for i, field := range fields { + switch field.Name { + case "__typename": + out.Values[i] = graphql.MarshalString("ClusterMetrics") + case "nodeCount": + out.Values[i] = ec._ClusterMetrics_nodeCount(ctx, field, obj) + if out.Values[i] == graphql.Null { + out.Invalids++ + } + case "metrics": + out.Values[i] = ec._ClusterMetrics_metrics(ctx, field, obj) + if out.Values[i] == graphql.Null { + out.Invalids++ + } + default: + panic("unknown field " + strconv.Quote(field.Name)) + } + } + out.Dispatch(ctx) + if out.Invalids > 0 { + return graphql.Null + } + + atomic.AddInt32(&ec.Deferred, int32(len(deferred))) + + for label, dfs := range deferred { + ec.ProcessDeferredGroup(graphql.DeferredGroup{ Label: label, Path: graphql.GetPath(ctx), FieldSet: dfs, @@ -13530,10 +13980,10 @@ func (ec *executionContext) _ClusterSupport(ctx 
context.Context, sel ast.Selecti return graphql.Null } - atomic.AddInt32(&ec.deferred, int32(len(deferred))) + atomic.AddInt32(&ec.Deferred, int32(len(deferred))) for label, dfs := range deferred { - ec.processDeferredGroup(graphql.DeferredGroup{ + ec.ProcessDeferredGroup(graphql.DeferredGroup{ Label: label, Path: graphql.GetPath(ctx), FieldSet: dfs, @@ -13574,10 +14024,10 @@ func (ec *executionContext) _Count(ctx context.Context, sel ast.SelectionSet, ob return graphql.Null } - atomic.AddInt32(&ec.deferred, int32(len(deferred))) + atomic.AddInt32(&ec.Deferred, int32(len(deferred))) for label, dfs := range deferred { - ec.processDeferredGroup(graphql.DeferredGroup{ + ec.ProcessDeferredGroup(graphql.DeferredGroup{ Label: label, Path: graphql.GetPath(ctx), FieldSet: dfs, @@ -13623,10 +14073,10 @@ func (ec *executionContext) _EnergyFootprintValue(ctx context.Context, sel ast.S return graphql.Null } - atomic.AddInt32(&ec.deferred, int32(len(deferred))) + atomic.AddInt32(&ec.Deferred, int32(len(deferred))) for label, dfs := range deferred { - ec.processDeferredGroup(graphql.DeferredGroup{ + ec.ProcessDeferredGroup(graphql.DeferredGroup{ Label: label, Path: graphql.GetPath(ctx), FieldSet: dfs, @@ -13672,10 +14122,10 @@ func (ec *executionContext) _FootprintValue(ctx context.Context, sel ast.Selecti return graphql.Null } - atomic.AddInt32(&ec.deferred, int32(len(deferred))) + atomic.AddInt32(&ec.Deferred, int32(len(deferred))) for label, dfs := range deferred { - ec.processDeferredGroup(graphql.DeferredGroup{ + ec.ProcessDeferredGroup(graphql.DeferredGroup{ Label: label, Path: graphql.GetPath(ctx), FieldSet: dfs, @@ -13716,10 +14166,10 @@ func (ec *executionContext) _Footprints(ctx context.Context, sel ast.SelectionSe return graphql.Null } - atomic.AddInt32(&ec.deferred, int32(len(deferred))) + atomic.AddInt32(&ec.Deferred, int32(len(deferred))) for label, dfs := range deferred { - ec.processDeferredGroup(graphql.DeferredGroup{ + 
ec.ProcessDeferredGroup(graphql.DeferredGroup{ Label: label, Path: graphql.GetPath(ctx), FieldSet: dfs, @@ -13772,10 +14222,10 @@ func (ec *executionContext) _GlobalMetricListItem(ctx context.Context, sel ast.S return graphql.Null } - atomic.AddInt32(&ec.deferred, int32(len(deferred))) + atomic.AddInt32(&ec.Deferred, int32(len(deferred))) for label, dfs := range deferred { - ec.processDeferredGroup(graphql.DeferredGroup{ + ec.ProcessDeferredGroup(graphql.DeferredGroup{ Label: label, Path: graphql.GetPath(ctx), FieldSet: dfs, @@ -13816,10 +14266,10 @@ func (ec *executionContext) _HistoPoint(ctx context.Context, sel ast.SelectionSe return graphql.Null } - atomic.AddInt32(&ec.deferred, int32(len(deferred))) + atomic.AddInt32(&ec.Deferred, int32(len(deferred))) for label, dfs := range deferred { - ec.processDeferredGroup(graphql.DeferredGroup{ + ec.ProcessDeferredGroup(graphql.DeferredGroup{ Label: label, Path: graphql.GetPath(ctx), FieldSet: dfs, @@ -13860,10 +14310,10 @@ func (ec *executionContext) _IntRangeOutput(ctx context.Context, sel ast.Selecti return graphql.Null } - atomic.AddInt32(&ec.deferred, int32(len(deferred))) + atomic.AddInt32(&ec.Deferred, int32(len(deferred))) for label, dfs := range deferred { - ec.processDeferredGroup(graphql.DeferredGroup{ + ec.ProcessDeferredGroup(graphql.DeferredGroup{ Label: label, Path: graphql.GetPath(ctx), FieldSet: dfs, @@ -14226,10 +14676,10 @@ func (ec *executionContext) _Job(ctx context.Context, sel ast.SelectionSet, obj return graphql.Null } - atomic.AddInt32(&ec.deferred, int32(len(deferred))) + atomic.AddInt32(&ec.Deferred, int32(len(deferred))) for label, dfs := range deferred { - ec.processDeferredGroup(graphql.DeferredGroup{ + ec.ProcessDeferredGroup(graphql.DeferredGroup{ Label: label, Path: graphql.GetPath(ctx), FieldSet: dfs, @@ -14270,10 +14720,10 @@ func (ec *executionContext) _JobLink(ctx context.Context, sel ast.SelectionSet, return graphql.Null } - atomic.AddInt32(&ec.deferred, int32(len(deferred))) + 
atomic.AddInt32(&ec.Deferred, int32(len(deferred))) for label, dfs := range deferred { - ec.processDeferredGroup(graphql.DeferredGroup{ + ec.ProcessDeferredGroup(graphql.DeferredGroup{ Label: label, Path: graphql.GetPath(ctx), FieldSet: dfs, @@ -14313,10 +14763,10 @@ func (ec *executionContext) _JobLinkResultList(ctx context.Context, sel ast.Sele return graphql.Null } - atomic.AddInt32(&ec.deferred, int32(len(deferred))) + atomic.AddInt32(&ec.Deferred, int32(len(deferred))) for label, dfs := range deferred { - ec.processDeferredGroup(graphql.DeferredGroup{ + ec.ProcessDeferredGroup(graphql.DeferredGroup{ Label: label, Path: graphql.GetPath(ctx), FieldSet: dfs, @@ -14358,10 +14808,10 @@ func (ec *executionContext) _JobMetric(ctx context.Context, sel ast.SelectionSet return graphql.Null } - atomic.AddInt32(&ec.deferred, int32(len(deferred))) + atomic.AddInt32(&ec.Deferred, int32(len(deferred))) for label, dfs := range deferred { - ec.processDeferredGroup(graphql.DeferredGroup{ + ec.ProcessDeferredGroup(graphql.DeferredGroup{ Label: label, Path: graphql.GetPath(ctx), FieldSet: dfs, @@ -14407,10 +14857,10 @@ func (ec *executionContext) _JobMetricWithName(ctx context.Context, sel ast.Sele return graphql.Null } - atomic.AddInt32(&ec.deferred, int32(len(deferred))) + atomic.AddInt32(&ec.Deferred, int32(len(deferred))) for label, dfs := range deferred { - ec.processDeferredGroup(graphql.DeferredGroup{ + ec.ProcessDeferredGroup(graphql.DeferredGroup{ Label: label, Path: graphql.GetPath(ctx), FieldSet: dfs, @@ -14454,10 +14904,10 @@ func (ec *executionContext) _JobResultList(ctx context.Context, sel ast.Selectio return graphql.Null } - atomic.AddInt32(&ec.deferred, int32(len(deferred))) + atomic.AddInt32(&ec.Deferred, int32(len(deferred))) for label, dfs := range deferred { - ec.processDeferredGroup(graphql.DeferredGroup{ + ec.ProcessDeferredGroup(graphql.DeferredGroup{ Label: label, Path: graphql.GetPath(ctx), FieldSet: dfs, @@ -14532,10 +14982,10 @@ func (ec 
*executionContext) _JobStats(ctx context.Context, sel ast.SelectionSet, return graphql.Null } - atomic.AddInt32(&ec.deferred, int32(len(deferred))) + atomic.AddInt32(&ec.Deferred, int32(len(deferred))) for label, dfs := range deferred { - ec.processDeferredGroup(graphql.DeferredGroup{ + ec.ProcessDeferredGroup(graphql.DeferredGroup{ Label: label, Path: graphql.GetPath(ctx), FieldSet: dfs, @@ -14656,10 +15106,10 @@ func (ec *executionContext) _JobsStatistics(ctx context.Context, sel ast.Selecti return graphql.Null } - atomic.AddInt32(&ec.deferred, int32(len(deferred))) + atomic.AddInt32(&ec.Deferred, int32(len(deferred))) for label, dfs := range deferred { - ec.processDeferredGroup(graphql.DeferredGroup{ + ec.ProcessDeferredGroup(graphql.DeferredGroup{ Label: label, Path: graphql.GetPath(ctx), FieldSet: dfs, @@ -14739,10 +15189,10 @@ func (ec *executionContext) _MetricConfig(ctx context.Context, sel ast.Selection return graphql.Null } - atomic.AddInt32(&ec.deferred, int32(len(deferred))) + atomic.AddInt32(&ec.Deferred, int32(len(deferred))) for label, dfs := range deferred { - ec.processDeferredGroup(graphql.DeferredGroup{ + ec.ProcessDeferredGroup(graphql.DeferredGroup{ Label: label, Path: graphql.GetPath(ctx), FieldSet: dfs, @@ -14783,10 +15233,10 @@ func (ec *executionContext) _MetricFootprints(ctx context.Context, sel ast.Selec return graphql.Null } - atomic.AddInt32(&ec.deferred, int32(len(deferred))) + atomic.AddInt32(&ec.Deferred, int32(len(deferred))) for label, dfs := range deferred { - ec.processDeferredGroup(graphql.DeferredGroup{ + ec.ProcessDeferredGroup(graphql.DeferredGroup{ Label: label, Path: graphql.GetPath(ctx), FieldSet: dfs, @@ -14828,10 +15278,10 @@ func (ec *executionContext) _MetricHistoPoint(ctx context.Context, sel ast.Selec return graphql.Null } - atomic.AddInt32(&ec.deferred, int32(len(deferred))) + atomic.AddInt32(&ec.Deferred, int32(len(deferred))) for label, dfs := range deferred { - ec.processDeferredGroup(graphql.DeferredGroup{ + 
ec.ProcessDeferredGroup(graphql.DeferredGroup{ Label: label, Path: graphql.GetPath(ctx), FieldSet: dfs, @@ -14876,10 +15326,10 @@ func (ec *executionContext) _MetricHistoPoints(ctx context.Context, sel ast.Sele return graphql.Null } - atomic.AddInt32(&ec.deferred, int32(len(deferred))) + atomic.AddInt32(&ec.Deferred, int32(len(deferred))) for label, dfs := range deferred { - ec.processDeferredGroup(graphql.DeferredGroup{ + ec.ProcessDeferredGroup(graphql.DeferredGroup{ Label: label, Path: graphql.GetPath(ctx), FieldSet: dfs, @@ -14925,10 +15375,10 @@ func (ec *executionContext) _MetricStatistics(ctx context.Context, sel ast.Selec return graphql.Null } - atomic.AddInt32(&ec.deferred, int32(len(deferred))) + atomic.AddInt32(&ec.Deferred, int32(len(deferred))) for label, dfs := range deferred { - ec.processDeferredGroup(graphql.DeferredGroup{ + ec.ProcessDeferredGroup(graphql.DeferredGroup{ Label: label, Path: graphql.GetPath(ctx), FieldSet: dfs, @@ -15002,10 +15452,10 @@ func (ec *executionContext) _MetricValue(ctx context.Context, sel ast.SelectionS return graphql.Null } - atomic.AddInt32(&ec.deferred, int32(len(deferred))) + atomic.AddInt32(&ec.Deferred, int32(len(deferred))) for label, dfs := range deferred { - ec.processDeferredGroup(graphql.DeferredGroup{ + ec.ProcessDeferredGroup(graphql.DeferredGroup{ Label: label, Path: graphql.GetPath(ctx), FieldSet: dfs, @@ -15083,10 +15533,10 @@ func (ec *executionContext) _Mutation(ctx context.Context, sel ast.SelectionSet) return graphql.Null } - atomic.AddInt32(&ec.deferred, int32(len(deferred))) + atomic.AddInt32(&ec.Deferred, int32(len(deferred))) for label, dfs := range deferred { - ec.processDeferredGroup(graphql.DeferredGroup{ + ec.ProcessDeferredGroup(graphql.DeferredGroup{ Label: label, Path: graphql.GetPath(ctx), FieldSet: dfs, @@ -15127,10 +15577,10 @@ func (ec *executionContext) _NamedStats(ctx context.Context, sel ast.SelectionSe return graphql.Null } - atomic.AddInt32(&ec.deferred, int32(len(deferred))) + 
atomic.AddInt32(&ec.Deferred, int32(len(deferred))) for label, dfs := range deferred { - ec.processDeferredGroup(graphql.DeferredGroup{ + ec.ProcessDeferredGroup(graphql.DeferredGroup{ Label: label, Path: graphql.GetPath(ctx), FieldSet: dfs, @@ -15176,10 +15626,10 @@ func (ec *executionContext) _NamedStatsWithScope(ctx context.Context, sel ast.Se return graphql.Null } - atomic.AddInt32(&ec.deferred, int32(len(deferred))) + atomic.AddInt32(&ec.Deferred, int32(len(deferred))) for label, dfs := range deferred { - ec.processDeferredGroup(graphql.DeferredGroup{ + ec.ProcessDeferredGroup(graphql.DeferredGroup{ Label: label, Path: graphql.GetPath(ctx), FieldSet: dfs, @@ -15367,6 +15817,39 @@ func (ec *executionContext) _Node(ctx context.Context, sel ast.SelectionSet, obj continue } + out.Concurrently(i, func(ctx context.Context) graphql.Marshaler { return innerFunc(ctx, out) }) + case "healthData": + field := field + + innerFunc := func(ctx context.Context, _ *graphql.FieldSet) (res graphql.Marshaler) { + defer func() { + if r := recover(); r != nil { + ec.Error(ctx, ec.Recover(ctx, r)) + } + }() + res = ec._Node_healthData(ctx, field, obj) + return res + } + + if field.Deferrable != nil { + dfs, ok := deferred[field.Deferrable.Label] + di := 0 + if ok { + dfs.AddField(field) + di = len(dfs.Values) - 1 + } else { + dfs = graphql.NewFieldSet([]graphql.CollectedField{field}) + deferred[field.Deferrable.Label] = dfs + } + dfs.Concurrently(di, func(ctx context.Context) graphql.Marshaler { + return innerFunc(ctx, dfs) + }) + + // don't run the out.Concurrently() call below + out.Values[i] = graphql.Null + continue + } + out.Concurrently(i, func(ctx context.Context) graphql.Marshaler { return innerFunc(ctx, out) }) default: panic("unknown field " + strconv.Quote(field.Name)) @@ -15377,10 +15860,10 @@ func (ec *executionContext) _Node(ctx context.Context, sel ast.SelectionSet, obj return graphql.Null } - atomic.AddInt32(&ec.deferred, int32(len(deferred))) + 
atomic.AddInt32(&ec.Deferred, int32(len(deferred))) for label, dfs := range deferred { - ec.processDeferredGroup(graphql.DeferredGroup{ + ec.ProcessDeferredGroup(graphql.DeferredGroup{ Label: label, Path: graphql.GetPath(ctx), FieldSet: dfs, @@ -15431,10 +15914,10 @@ func (ec *executionContext) _NodeMetrics(ctx context.Context, sel ast.SelectionS return graphql.Null } - atomic.AddInt32(&ec.deferred, int32(len(deferred))) + atomic.AddInt32(&ec.Deferred, int32(len(deferred))) for label, dfs := range deferred { - ec.processDeferredGroup(graphql.DeferredGroup{ + ec.ProcessDeferredGroup(graphql.DeferredGroup{ Label: label, Path: graphql.GetPath(ctx), FieldSet: dfs, @@ -15472,10 +15955,10 @@ func (ec *executionContext) _NodeStateResultList(ctx context.Context, sel ast.Se return graphql.Null } - atomic.AddInt32(&ec.deferred, int32(len(deferred))) + atomic.AddInt32(&ec.Deferred, int32(len(deferred))) for label, dfs := range deferred { - ec.processDeferredGroup(graphql.DeferredGroup{ + ec.ProcessDeferredGroup(graphql.DeferredGroup{ Label: label, Path: graphql.GetPath(ctx), FieldSet: dfs, @@ -15516,10 +15999,10 @@ func (ec *executionContext) _NodeStates(ctx context.Context, sel ast.SelectionSe return graphql.Null } - atomic.AddInt32(&ec.deferred, int32(len(deferred))) + atomic.AddInt32(&ec.Deferred, int32(len(deferred))) for label, dfs := range deferred { - ec.processDeferredGroup(graphql.DeferredGroup{ + ec.ProcessDeferredGroup(graphql.DeferredGroup{ Label: label, Path: graphql.GetPath(ctx), FieldSet: dfs, @@ -15565,10 +16048,10 @@ func (ec *executionContext) _NodeStatesTimed(ctx context.Context, sel ast.Select return graphql.Null } - atomic.AddInt32(&ec.deferred, int32(len(deferred))) + atomic.AddInt32(&ec.Deferred, int32(len(deferred))) for label, dfs := range deferred { - ec.processDeferredGroup(graphql.DeferredGroup{ + ec.ProcessDeferredGroup(graphql.DeferredGroup{ Label: label, Path: graphql.GetPath(ctx), FieldSet: dfs, @@ -15614,10 +16097,10 @@ func (ec 
*executionContext) _NodesResultList(ctx context.Context, sel ast.Select return graphql.Null } - atomic.AddInt32(&ec.deferred, int32(len(deferred))) + atomic.AddInt32(&ec.Deferred, int32(len(deferred))) for label, dfs := range deferred { - ec.processDeferredGroup(graphql.DeferredGroup{ + ec.ProcessDeferredGroup(graphql.DeferredGroup{ Label: label, Path: graphql.GetPath(ctx), FieldSet: dfs, @@ -15794,6 +16277,28 @@ func (ec *executionContext) _Query(ctx context.Context, sel ast.SelectionSet) gr func(ctx context.Context) graphql.Marshaler { return innerFunc(ctx, out) }) } + out.Concurrently(i, func(ctx context.Context) graphql.Marshaler { return rrm(innerCtx) }) + case "nodesWithMeta": + field := field + + innerFunc := func(ctx context.Context, fs *graphql.FieldSet) (res graphql.Marshaler) { + defer func() { + if r := recover(); r != nil { + ec.Error(ctx, ec.Recover(ctx, r)) + } + }() + res = ec._Query_nodesWithMeta(ctx, field) + if res == graphql.Null { + atomic.AddUint32(&fs.Invalids, 1) + } + return res + } + + rrm := func(ctx context.Context) graphql.Marshaler { + return ec.OperationContext.RootResolverMiddleware(ctx, + func(ctx context.Context) graphql.Marshaler { return innerFunc(ctx, out) }) + } + out.Concurrently(i, func(ctx context.Context) graphql.Marshaler { return rrm(innerCtx) }) case "nodeStates": field := field @@ -16074,6 +16579,28 @@ func (ec *executionContext) _Query(ctx context.Context, sel ast.SelectionSet) gr func(ctx context.Context) graphql.Marshaler { return innerFunc(ctx, out) }) } + out.Concurrently(i, func(ctx context.Context) graphql.Marshaler { return rrm(innerCtx) }) + case "clusterMetrics": + field := field + + innerFunc := func(ctx context.Context, fs *graphql.FieldSet) (res graphql.Marshaler) { + defer func() { + if r := recover(); r != nil { + ec.Error(ctx, ec.Recover(ctx, r)) + } + }() + res = ec._Query_clusterMetrics(ctx, field) + if res == graphql.Null { + atomic.AddUint32(&fs.Invalids, 1) + } + return res + } + + rrm := func(ctx 
context.Context) graphql.Marshaler { + return ec.OperationContext.RootResolverMiddleware(ctx, + func(ctx context.Context) graphql.Marshaler { return innerFunc(ctx, out) }) + } + out.Concurrently(i, func(ctx context.Context) graphql.Marshaler { return rrm(innerCtx) }) case "__type": out.Values[i] = ec.OperationContext.RootResolverMiddleware(innerCtx, func(ctx context.Context) (res graphql.Marshaler) { @@ -16092,10 +16619,10 @@ func (ec *executionContext) _Query(ctx context.Context, sel ast.SelectionSet) gr return graphql.Null } - atomic.AddInt32(&ec.deferred, int32(len(deferred))) + atomic.AddInt32(&ec.Deferred, int32(len(deferred))) for label, dfs := range deferred { - ec.processDeferredGroup(graphql.DeferredGroup{ + ec.ProcessDeferredGroup(graphql.DeferredGroup{ Label: label, Path: graphql.GetPath(ctx), FieldSet: dfs, @@ -16137,10 +16664,10 @@ func (ec *executionContext) _Resource(ctx context.Context, sel ast.SelectionSet, return graphql.Null } - atomic.AddInt32(&ec.deferred, int32(len(deferred))) + atomic.AddInt32(&ec.Deferred, int32(len(deferred))) for label, dfs := range deferred { - ec.processDeferredGroup(graphql.DeferredGroup{ + ec.ProcessDeferredGroup(graphql.DeferredGroup{ Label: label, Path: graphql.GetPath(ctx), FieldSet: dfs, @@ -16183,10 +16710,10 @@ func (ec *executionContext) _ScopedStats(ctx context.Context, sel ast.SelectionS return graphql.Null } - atomic.AddInt32(&ec.deferred, int32(len(deferred))) + atomic.AddInt32(&ec.Deferred, int32(len(deferred))) for label, dfs := range deferred { - ec.processDeferredGroup(graphql.DeferredGroup{ + ec.ProcessDeferredGroup(graphql.DeferredGroup{ Label: label, Path: graphql.GetPath(ctx), FieldSet: dfs, @@ -16231,10 +16758,10 @@ func (ec *executionContext) _Series(ctx context.Context, sel ast.SelectionSet, o return graphql.Null } - atomic.AddInt32(&ec.deferred, int32(len(deferred))) + atomic.AddInt32(&ec.Deferred, int32(len(deferred))) for label, dfs := range deferred { - 
ec.processDeferredGroup(graphql.DeferredGroup{ + ec.ProcessDeferredGroup(graphql.DeferredGroup{ Label: label, Path: graphql.GetPath(ctx), FieldSet: dfs, @@ -16285,10 +16812,10 @@ func (ec *executionContext) _StatsSeries(ctx context.Context, sel ast.SelectionS return graphql.Null } - atomic.AddInt32(&ec.deferred, int32(len(deferred))) + atomic.AddInt32(&ec.Deferred, int32(len(deferred))) for label, dfs := range deferred { - ec.processDeferredGroup(graphql.DeferredGroup{ + ec.ProcessDeferredGroup(graphql.DeferredGroup{ Label: label, Path: graphql.GetPath(ctx), FieldSet: dfs, @@ -16415,10 +16942,10 @@ func (ec *executionContext) _SubCluster(ctx context.Context, sel ast.SelectionSe return graphql.Null } - atomic.AddInt32(&ec.deferred, int32(len(deferred))) + atomic.AddInt32(&ec.Deferred, int32(len(deferred))) for label, dfs := range deferred { - ec.processDeferredGroup(graphql.DeferredGroup{ + ec.ProcessDeferredGroup(graphql.DeferredGroup{ Label: label, Path: graphql.GetPath(ctx), FieldSet: dfs, @@ -16464,10 +16991,10 @@ func (ec *executionContext) _SubClusterConfig(ctx context.Context, sel ast.Selec return graphql.Null } - atomic.AddInt32(&ec.deferred, int32(len(deferred))) + atomic.AddInt32(&ec.Deferred, int32(len(deferred))) for label, dfs := range deferred { - ec.processDeferredGroup(graphql.DeferredGroup{ + ec.ProcessDeferredGroup(graphql.DeferredGroup{ Label: label, Path: graphql.GetPath(ctx), FieldSet: dfs, @@ -16518,10 +17045,10 @@ func (ec *executionContext) _Tag(ctx context.Context, sel ast.SelectionSet, obj return graphql.Null } - atomic.AddInt32(&ec.deferred, int32(len(deferred))) + atomic.AddInt32(&ec.Deferred, int32(len(deferred))) for label, dfs := range deferred { - ec.processDeferredGroup(graphql.DeferredGroup{ + ec.ProcessDeferredGroup(graphql.DeferredGroup{ Label: label, Path: graphql.GetPath(ctx), FieldSet: dfs, @@ -16564,10 +17091,10 @@ func (ec *executionContext) _TimeRangeOutput(ctx context.Context, sel ast.Select return graphql.Null } - 
atomic.AddInt32(&ec.deferred, int32(len(deferred))) + atomic.AddInt32(&ec.Deferred, int32(len(deferred))) for label, dfs := range deferred { - ec.processDeferredGroup(graphql.DeferredGroup{ + ec.ProcessDeferredGroup(graphql.DeferredGroup{ Label: label, Path: graphql.GetPath(ctx), FieldSet: dfs, @@ -16613,10 +17140,10 @@ func (ec *executionContext) _TimeWeights(ctx context.Context, sel ast.SelectionS return graphql.Null } - atomic.AddInt32(&ec.deferred, int32(len(deferred))) + atomic.AddInt32(&ec.Deferred, int32(len(deferred))) for label, dfs := range deferred { - ec.processDeferredGroup(graphql.DeferredGroup{ + ec.ProcessDeferredGroup(graphql.DeferredGroup{ Label: label, Path: graphql.GetPath(ctx), FieldSet: dfs, @@ -16659,10 +17186,10 @@ func (ec *executionContext) _Topology(ctx context.Context, sel ast.SelectionSet, return graphql.Null } - atomic.AddInt32(&ec.deferred, int32(len(deferred))) + atomic.AddInt32(&ec.Deferred, int32(len(deferred))) for label, dfs := range deferred { - ec.processDeferredGroup(graphql.DeferredGroup{ + ec.ProcessDeferredGroup(graphql.DeferredGroup{ Label: label, Path: graphql.GetPath(ctx), FieldSet: dfs, @@ -16700,10 +17227,10 @@ func (ec *executionContext) _Unit(ctx context.Context, sel ast.SelectionSet, obj return graphql.Null } - atomic.AddInt32(&ec.deferred, int32(len(deferred))) + atomic.AddInt32(&ec.Deferred, int32(len(deferred))) for label, dfs := range deferred { - ec.processDeferredGroup(graphql.DeferredGroup{ + ec.ProcessDeferredGroup(graphql.DeferredGroup{ Label: label, Path: graphql.GetPath(ctx), FieldSet: dfs, @@ -16749,10 +17276,10 @@ func (ec *executionContext) _User(ctx context.Context, sel ast.SelectionSet, obj return graphql.Null } - atomic.AddInt32(&ec.deferred, int32(len(deferred))) + atomic.AddInt32(&ec.Deferred, int32(len(deferred))) for label, dfs := range deferred { - ec.processDeferredGroup(graphql.DeferredGroup{ + ec.ProcessDeferredGroup(graphql.DeferredGroup{ Label: label, Path: graphql.GetPath(ctx), FieldSet: 
dfs, @@ -16805,10 +17332,10 @@ func (ec *executionContext) ___Directive(ctx context.Context, sel ast.SelectionS return graphql.Null } - atomic.AddInt32(&ec.deferred, int32(len(deferred))) + atomic.AddInt32(&ec.Deferred, int32(len(deferred))) for label, dfs := range deferred { - ec.processDeferredGroup(graphql.DeferredGroup{ + ec.ProcessDeferredGroup(graphql.DeferredGroup{ Label: label, Path: graphql.GetPath(ctx), FieldSet: dfs, @@ -16853,10 +17380,10 @@ func (ec *executionContext) ___EnumValue(ctx context.Context, sel ast.SelectionS return graphql.Null } - atomic.AddInt32(&ec.deferred, int32(len(deferred))) + atomic.AddInt32(&ec.Deferred, int32(len(deferred))) for label, dfs := range deferred { - ec.processDeferredGroup(graphql.DeferredGroup{ + ec.ProcessDeferredGroup(graphql.DeferredGroup{ Label: label, Path: graphql.GetPath(ctx), FieldSet: dfs, @@ -16911,10 +17438,10 @@ func (ec *executionContext) ___Field(ctx context.Context, sel ast.SelectionSet, return graphql.Null } - atomic.AddInt32(&ec.deferred, int32(len(deferred))) + atomic.AddInt32(&ec.Deferred, int32(len(deferred))) for label, dfs := range deferred { - ec.processDeferredGroup(graphql.DeferredGroup{ + ec.ProcessDeferredGroup(graphql.DeferredGroup{ Label: label, Path: graphql.GetPath(ctx), FieldSet: dfs, @@ -16966,10 +17493,10 @@ func (ec *executionContext) ___InputValue(ctx context.Context, sel ast.Selection return graphql.Null } - atomic.AddInt32(&ec.deferred, int32(len(deferred))) + atomic.AddInt32(&ec.Deferred, int32(len(deferred))) for label, dfs := range deferred { - ec.processDeferredGroup(graphql.DeferredGroup{ + ec.ProcessDeferredGroup(graphql.DeferredGroup{ Label: label, Path: graphql.GetPath(ctx), FieldSet: dfs, @@ -17021,10 +17548,10 @@ func (ec *executionContext) ___Schema(ctx context.Context, sel ast.SelectionSet, return graphql.Null } - atomic.AddInt32(&ec.deferred, int32(len(deferred))) + atomic.AddInt32(&ec.Deferred, int32(len(deferred))) for label, dfs := range deferred { - 
ec.processDeferredGroup(graphql.DeferredGroup{ + ec.ProcessDeferredGroup(graphql.DeferredGroup{ Label: label, Path: graphql.GetPath(ctx), FieldSet: dfs, @@ -17080,10 +17607,10 @@ func (ec *executionContext) ___Type(ctx context.Context, sel ast.SelectionSet, o return graphql.Null } - atomic.AddInt32(&ec.deferred, int32(len(deferred))) + atomic.AddInt32(&ec.Deferred, int32(len(deferred))) for label, dfs := range deferred { - ec.processDeferredGroup(graphql.DeferredGroup{ + ec.ProcessDeferredGroup(graphql.DeferredGroup{ Label: label, Path: graphql.GetPath(ctx), FieldSet: dfs, @@ -17098,10 +17625,10 @@ func (ec *executionContext) ___Type(ctx context.Context, sel ast.SelectionSet, o // region ***************************** type.gotpl ***************************** -func (ec *executionContext) marshalNAccelerator2ᚖgithubᚗcomᚋClusterCockpitᚋccᚑlibᚋschemaᚐAccelerator(ctx context.Context, sel ast.SelectionSet, v *schema.Accelerator) graphql.Marshaler { +func (ec *executionContext) marshalNAccelerator2ᚖgithubᚗcomᚋClusterCockpitᚋccᚑlibᚋv2ᚋschemaᚐAccelerator(ctx context.Context, sel ast.SelectionSet, v *schema.Accelerator) graphql.Marshaler { if v == nil { if !graphql.HasFieldError(ctx, graphql.GetFieldContext(ctx)) { - ec.Errorf(ctx, "the requested element is null which the schema does not allow") + graphql.AddErrorf(ctx, "the requested element is null which the schema does not allow") } return graphql.Null } @@ -17118,46 +17645,18 @@ func (ec *executionContext) marshalNBoolean2bool(ctx context.Context, sel ast.Se res := graphql.MarshalBoolean(v) if res == graphql.Null { if !graphql.HasFieldError(ctx, graphql.GetFieldContext(ctx)) { - ec.Errorf(ctx, "the requested element is null which the schema does not allow") + graphql.AddErrorf(ctx, "the requested element is null which the schema does not allow") } } return res } -func (ec *executionContext) marshalNCluster2ᚕᚖgithubᚗcomᚋClusterCockpitᚋccᚑlibᚋschemaᚐClusterᚄ(ctx context.Context, sel ast.SelectionSet, v []*schema.Cluster) 
graphql.Marshaler { - ret := make(graphql.Array, len(v)) - var wg sync.WaitGroup - isLen1 := len(v) == 1 - if !isLen1 { - wg.Add(len(v)) - } - for i := range v { - i := i - fc := &graphql.FieldContext{ - Index: &i, - Result: &v[i], - } - ctx := graphql.WithFieldContext(ctx, fc) - f := func(i int) { - defer func() { - if r := recover(); r != nil { - ec.Error(ctx, ec.Recover(ctx, r)) - ret = nil - } - }() - if !isLen1 { - defer wg.Done() - } - ret[i] = ec.marshalNCluster2ᚖgithubᚗcomᚋClusterCockpitᚋccᚑlibᚋschemaᚐCluster(ctx, sel, v[i]) - } - if isLen1 { - f(i) - } else { - go f(i) - } - - } - wg.Wait() +func (ec *executionContext) marshalNCluster2ᚕᚖgithubᚗcomᚋClusterCockpitᚋccᚑlibᚋv2ᚋschemaᚐClusterᚄ(ctx context.Context, sel ast.SelectionSet, v []*schema.Cluster) graphql.Marshaler { + ret := graphql.MarshalSliceConcurrently(ctx, len(v), 0, false, func(ctx context.Context, i int) graphql.Marshaler { + fc := graphql.GetFieldContext(ctx) + fc.Result = &v[i] + return ec.marshalNCluster2ᚖgithubᚗcomᚋClusterCockpitᚋccᚑlibᚋv2ᚋschemaᚐCluster(ctx, sel, v[i]) + }) for _, e := range ret { if e == graphql.Null { @@ -17168,54 +17667,66 @@ func (ec *executionContext) marshalNCluster2ᚕᚖgithubᚗcomᚋClusterCockpit return ret } -func (ec *executionContext) marshalNCluster2ᚖgithubᚗcomᚋClusterCockpitᚋccᚑlibᚋschemaᚐCluster(ctx context.Context, sel ast.SelectionSet, v *schema.Cluster) graphql.Marshaler { +func (ec *executionContext) marshalNCluster2ᚖgithubᚗcomᚋClusterCockpitᚋccᚑlibᚋv2ᚋschemaᚐCluster(ctx context.Context, sel ast.SelectionSet, v *schema.Cluster) graphql.Marshaler { if v == nil { if !graphql.HasFieldError(ctx, graphql.GetFieldContext(ctx)) { - ec.Errorf(ctx, "the requested element is null which the schema does not allow") + graphql.AddErrorf(ctx, "the requested element is null which the schema does not allow") } return graphql.Null } return ec._Cluster(ctx, sel, v) } -func (ec *executionContext) marshalNClusterSupport2githubᚗcomᚋClusterCockpitᚋccᚑlibᚋschemaᚐClusterSupport(ctx 
context.Context, sel ast.SelectionSet, v schema.ClusterSupport) graphql.Marshaler { +func (ec *executionContext) marshalNClusterMetricWithName2ᚕᚖgithubᚗcomᚋClusterCockpitᚋccᚑbackendᚋinternalᚋgraphᚋmodelᚐClusterMetricWithNameᚄ(ctx context.Context, sel ast.SelectionSet, v []*model.ClusterMetricWithName) graphql.Marshaler { + ret := graphql.MarshalSliceConcurrently(ctx, len(v), 0, false, func(ctx context.Context, i int) graphql.Marshaler { + fc := graphql.GetFieldContext(ctx) + fc.Result = &v[i] + return ec.marshalNClusterMetricWithName2ᚖgithubᚗcomᚋClusterCockpitᚋccᚑbackendᚋinternalᚋgraphᚋmodelᚐClusterMetricWithName(ctx, sel, v[i]) + }) + + for _, e := range ret { + if e == graphql.Null { + return graphql.Null + } + } + + return ret +} + +func (ec *executionContext) marshalNClusterMetricWithName2ᚖgithubᚗcomᚋClusterCockpitᚋccᚑbackendᚋinternalᚋgraphᚋmodelᚐClusterMetricWithName(ctx context.Context, sel ast.SelectionSet, v *model.ClusterMetricWithName) graphql.Marshaler { + if v == nil { + if !graphql.HasFieldError(ctx, graphql.GetFieldContext(ctx)) { + graphql.AddErrorf(ctx, "the requested element is null which the schema does not allow") + } + return graphql.Null + } + return ec._ClusterMetricWithName(ctx, sel, v) +} + +func (ec *executionContext) marshalNClusterMetrics2githubᚗcomᚋClusterCockpitᚋccᚑbackendᚋinternalᚋgraphᚋmodelᚐClusterMetrics(ctx context.Context, sel ast.SelectionSet, v model.ClusterMetrics) graphql.Marshaler { + return ec._ClusterMetrics(ctx, sel, &v) +} + +func (ec *executionContext) marshalNClusterMetrics2ᚖgithubᚗcomᚋClusterCockpitᚋccᚑbackendᚋinternalᚋgraphᚋmodelᚐClusterMetrics(ctx context.Context, sel ast.SelectionSet, v *model.ClusterMetrics) graphql.Marshaler { + if v == nil { + if !graphql.HasFieldError(ctx, graphql.GetFieldContext(ctx)) { + graphql.AddErrorf(ctx, "the requested element is null which the schema does not allow") + } + return graphql.Null + } + return ec._ClusterMetrics(ctx, sel, v) +} + +func (ec *executionContext) 
marshalNClusterSupport2githubᚗcomᚋClusterCockpitᚋccᚑlibᚋv2ᚋschemaᚐClusterSupport(ctx context.Context, sel ast.SelectionSet, v schema.ClusterSupport) graphql.Marshaler { return ec._ClusterSupport(ctx, sel, &v) } -func (ec *executionContext) marshalNClusterSupport2ᚕgithubᚗcomᚋClusterCockpitᚋccᚑlibᚋschemaᚐClusterSupportᚄ(ctx context.Context, sel ast.SelectionSet, v []schema.ClusterSupport) graphql.Marshaler { - ret := make(graphql.Array, len(v)) - var wg sync.WaitGroup - isLen1 := len(v) == 1 - if !isLen1 { - wg.Add(len(v)) - } - for i := range v { - i := i - fc := &graphql.FieldContext{ - Index: &i, - Result: &v[i], - } - ctx := graphql.WithFieldContext(ctx, fc) - f := func(i int) { - defer func() { - if r := recover(); r != nil { - ec.Error(ctx, ec.Recover(ctx, r)) - ret = nil - } - }() - if !isLen1 { - defer wg.Done() - } - ret[i] = ec.marshalNClusterSupport2githubᚗcomᚋClusterCockpitᚋccᚑlibᚋschemaᚐClusterSupport(ctx, sel, v[i]) - } - if isLen1 { - f(i) - } else { - go f(i) - } - - } - wg.Wait() +func (ec *executionContext) marshalNClusterSupport2ᚕgithubᚗcomᚋClusterCockpitᚋccᚑlibᚋv2ᚋschemaᚐClusterSupportᚄ(ctx context.Context, sel ast.SelectionSet, v []schema.ClusterSupport) graphql.Marshaler { + ret := graphql.MarshalSliceConcurrently(ctx, len(v), 0, false, func(ctx context.Context, i int) graphql.Marshaler { + fc := graphql.GetFieldContext(ctx) + fc.Result = &v[i] + return ec.marshalNClusterSupport2githubᚗcomᚋClusterCockpitᚋccᚑlibᚋv2ᚋschemaᚐClusterSupport(ctx, sel, v[i]) + }) for _, e := range ret { if e == graphql.Null { @@ -17227,39 +17738,11 @@ func (ec *executionContext) marshalNClusterSupport2ᚕgithubᚗcomᚋClusterCock } func (ec *executionContext) marshalNCount2ᚕᚖgithubᚗcomᚋClusterCockpitᚋccᚑbackendᚋinternalᚋgraphᚋmodelᚐCountᚄ(ctx context.Context, sel ast.SelectionSet, v []*model.Count) graphql.Marshaler { - ret := make(graphql.Array, len(v)) - var wg sync.WaitGroup - isLen1 := len(v) == 1 - if !isLen1 { - wg.Add(len(v)) - } - for i := range v { - i := i - fc := 
&graphql.FieldContext{ - Index: &i, - Result: &v[i], - } - ctx := graphql.WithFieldContext(ctx, fc) - f := func(i int) { - defer func() { - if r := recover(); r != nil { - ec.Error(ctx, ec.Recover(ctx, r)) - ret = nil - } - }() - if !isLen1 { - defer wg.Done() - } - ret[i] = ec.marshalNCount2ᚖgithubᚗcomᚋClusterCockpitᚋccᚑbackendᚋinternalᚋgraphᚋmodelᚐCount(ctx, sel, v[i]) - } - if isLen1 { - f(i) - } else { - go f(i) - } - - } - wg.Wait() + ret := graphql.MarshalSliceConcurrently(ctx, len(v), 0, false, func(ctx context.Context, i int) graphql.Marshaler { + fc := graphql.GetFieldContext(ctx) + fc.Result = &v[i] + return ec.marshalNCount2ᚖgithubᚗcomᚋClusterCockpitᚋccᚑbackendᚋinternalᚋgraphᚋmodelᚐCount(ctx, sel, v[i]) + }) for _, e := range ret { if e == graphql.Null { @@ -17273,7 +17756,7 @@ func (ec *executionContext) marshalNCount2ᚕᚖgithubᚗcomᚋClusterCockpitᚋ func (ec *executionContext) marshalNCount2ᚖgithubᚗcomᚋClusterCockpitᚋccᚑbackendᚋinternalᚋgraphᚋmodelᚐCount(ctx context.Context, sel ast.SelectionSet, v *model.Count) graphql.Marshaler { if v == nil { if !graphql.HasFieldError(ctx, graphql.GetFieldContext(ctx)) { - ec.Errorf(ctx, "the requested element is null which the schema does not allow") + graphql.AddErrorf(ctx, "the requested element is null which the schema does not allow") } return graphql.Null } @@ -17290,7 +17773,7 @@ func (ec *executionContext) marshalNFloat2float64(ctx context.Context, sel ast.S res := graphql.MarshalFloatContext(v) if res == graphql.Null { if !graphql.HasFieldError(ctx, graphql.GetFieldContext(ctx)) { - ec.Errorf(ctx, "the requested element is null which the schema does not allow") + graphql.AddErrorf(ctx, "the requested element is null which the schema does not allow") } } return graphql.WrapContextMarshaler(ctx, res) @@ -17361,40 +17844,12 @@ func (ec *executionContext) unmarshalNFloatRange2ᚖgithubᚗcomᚋClusterCockpi return &res, graphql.ErrorOnPath(ctx, err) } -func (ec *executionContext) 
marshalNGlobalMetricListItem2ᚕᚖgithubᚗcomᚋClusterCockpitᚋccᚑlibᚋschemaᚐGlobalMetricListItemᚄ(ctx context.Context, sel ast.SelectionSet, v []*schema.GlobalMetricListItem) graphql.Marshaler { - ret := make(graphql.Array, len(v)) - var wg sync.WaitGroup - isLen1 := len(v) == 1 - if !isLen1 { - wg.Add(len(v)) - } - for i := range v { - i := i - fc := &graphql.FieldContext{ - Index: &i, - Result: &v[i], - } - ctx := graphql.WithFieldContext(ctx, fc) - f := func(i int) { - defer func() { - if r := recover(); r != nil { - ec.Error(ctx, ec.Recover(ctx, r)) - ret = nil - } - }() - if !isLen1 { - defer wg.Done() - } - ret[i] = ec.marshalNGlobalMetricListItem2ᚖgithubᚗcomᚋClusterCockpitᚋccᚑlibᚋschemaᚐGlobalMetricListItem(ctx, sel, v[i]) - } - if isLen1 { - f(i) - } else { - go f(i) - } - - } - wg.Wait() +func (ec *executionContext) marshalNGlobalMetricListItem2ᚕᚖgithubᚗcomᚋClusterCockpitᚋccᚑlibᚋv2ᚋschemaᚐGlobalMetricListItemᚄ(ctx context.Context, sel ast.SelectionSet, v []*schema.GlobalMetricListItem) graphql.Marshaler { + ret := graphql.MarshalSliceConcurrently(ctx, len(v), 0, false, func(ctx context.Context, i int) graphql.Marshaler { + fc := graphql.GetFieldContext(ctx) + fc.Result = &v[i] + return ec.marshalNGlobalMetricListItem2ᚖgithubᚗcomᚋClusterCockpitᚋccᚑlibᚋv2ᚋschemaᚐGlobalMetricListItem(ctx, sel, v[i]) + }) for _, e := range ret { if e == graphql.Null { @@ -17405,10 +17860,10 @@ func (ec *executionContext) marshalNGlobalMetricListItem2ᚕᚖgithubᚗcomᚋCl return ret } -func (ec *executionContext) marshalNGlobalMetricListItem2ᚖgithubᚗcomᚋClusterCockpitᚋccᚑlibᚋschemaᚐGlobalMetricListItem(ctx context.Context, sel ast.SelectionSet, v *schema.GlobalMetricListItem) graphql.Marshaler { +func (ec *executionContext) marshalNGlobalMetricListItem2ᚖgithubᚗcomᚋClusterCockpitᚋccᚑlibᚋv2ᚋschemaᚐGlobalMetricListItem(ctx context.Context, sel ast.SelectionSet, v *schema.GlobalMetricListItem) graphql.Marshaler { if v == nil { if !graphql.HasFieldError(ctx, graphql.GetFieldContext(ctx)) { - 
ec.Errorf(ctx, "the requested element is null which the schema does not allow") + graphql.AddErrorf(ctx, "the requested element is null which the schema does not allow") } return graphql.Null } @@ -17416,39 +17871,11 @@ func (ec *executionContext) marshalNGlobalMetricListItem2ᚖgithubᚗcomᚋClust } func (ec *executionContext) marshalNHistoPoint2ᚕᚖgithubᚗcomᚋClusterCockpitᚋccᚑbackendᚋinternalᚋgraphᚋmodelᚐHistoPointᚄ(ctx context.Context, sel ast.SelectionSet, v []*model.HistoPoint) graphql.Marshaler { - ret := make(graphql.Array, len(v)) - var wg sync.WaitGroup - isLen1 := len(v) == 1 - if !isLen1 { - wg.Add(len(v)) - } - for i := range v { - i := i - fc := &graphql.FieldContext{ - Index: &i, - Result: &v[i], - } - ctx := graphql.WithFieldContext(ctx, fc) - f := func(i int) { - defer func() { - if r := recover(); r != nil { - ec.Error(ctx, ec.Recover(ctx, r)) - ret = nil - } - }() - if !isLen1 { - defer wg.Done() - } - ret[i] = ec.marshalNHistoPoint2ᚖgithubᚗcomᚋClusterCockpitᚋccᚑbackendᚋinternalᚋgraphᚋmodelᚐHistoPoint(ctx, sel, v[i]) - } - if isLen1 { - f(i) - } else { - go f(i) - } - - } - wg.Wait() + ret := graphql.MarshalSliceConcurrently(ctx, len(v), 0, false, func(ctx context.Context, i int) graphql.Marshaler { + fc := graphql.GetFieldContext(ctx) + fc.Result = &v[i] + return ec.marshalNHistoPoint2ᚖgithubᚗcomᚋClusterCockpitᚋccᚑbackendᚋinternalᚋgraphᚋmodelᚐHistoPoint(ctx, sel, v[i]) + }) for _, e := range ret { if e == graphql.Null { @@ -17462,7 +17889,7 @@ func (ec *executionContext) marshalNHistoPoint2ᚕᚖgithubᚗcomᚋClusterCockp func (ec *executionContext) marshalNHistoPoint2ᚖgithubᚗcomᚋClusterCockpitᚋccᚑbackendᚋinternalᚋgraphᚋmodelᚐHistoPoint(ctx context.Context, sel ast.SelectionSet, v *model.HistoPoint) graphql.Marshaler { if v == nil { if !graphql.HasFieldError(ctx, graphql.GetFieldContext(ctx)) { - ec.Errorf(ctx, "the requested element is null which the schema does not allow") + graphql.AddErrorf(ctx, "the requested element is null which the schema does not 
allow") } return graphql.Null } @@ -17479,7 +17906,7 @@ func (ec *executionContext) marshalNID2int64(ctx context.Context, sel ast.Select res := graphql.MarshalInt64(v) if res == graphql.Null { if !graphql.HasFieldError(ctx, graphql.GetFieldContext(ctx)) { - ec.Errorf(ctx, "the requested element is null which the schema does not allow") + graphql.AddErrorf(ctx, "the requested element is null which the schema does not allow") } } return res @@ -17495,7 +17922,7 @@ func (ec *executionContext) marshalNID2string(ctx context.Context, sel ast.Selec res := graphql.MarshalID(v) if res == graphql.Null { if !graphql.HasFieldError(ctx, graphql.GetFieldContext(ctx)) { - ec.Errorf(ctx, "the requested element is null which the schema does not allow") + graphql.AddErrorf(ctx, "the requested element is null which the schema does not allow") } } return res @@ -17539,7 +17966,7 @@ func (ec *executionContext) unmarshalNID2ᚖint64(ctx context.Context, v any) (* func (ec *executionContext) marshalNID2ᚖint64(ctx context.Context, sel ast.SelectionSet, v *int64) graphql.Marshaler { if v == nil { if !graphql.HasFieldError(ctx, graphql.GetFieldContext(ctx)) { - ec.Errorf(ctx, "the requested element is null which the schema does not allow") + graphql.AddErrorf(ctx, "the requested element is null which the schema does not allow") } return graphql.Null } @@ -17547,7 +17974,7 @@ func (ec *executionContext) marshalNID2ᚖint64(ctx context.Context, sel ast.Sel res := graphql.MarshalInt64(*v) if res == graphql.Null { if !graphql.HasFieldError(ctx, graphql.GetFieldContext(ctx)) { - ec.Errorf(ctx, "the requested element is null which the schema does not allow") + graphql.AddErrorf(ctx, "the requested element is null which the schema does not allow") } } return res @@ -17563,7 +17990,7 @@ func (ec *executionContext) marshalNInt2int(ctx context.Context, sel ast.Selecti res := graphql.MarshalInt(v) if res == graphql.Null { if !graphql.HasFieldError(ctx, graphql.GetFieldContext(ctx)) { - ec.Errorf(ctx, "the 
requested element is null which the schema does not allow") + graphql.AddErrorf(ctx, "the requested element is null which the schema does not allow") } } return res @@ -17579,7 +18006,7 @@ func (ec *executionContext) marshalNInt2int32(ctx context.Context, sel ast.Selec res := graphql.MarshalInt32(v) if res == graphql.Null { if !graphql.HasFieldError(ctx, graphql.GetFieldContext(ctx)) { - ec.Errorf(ctx, "the requested element is null which the schema does not allow") + graphql.AddErrorf(ctx, "the requested element is null which the schema does not allow") } } return res @@ -17595,7 +18022,7 @@ func (ec *executionContext) marshalNInt2int64(ctx context.Context, sel ast.Selec res := graphql.MarshalInt64(v) if res == graphql.Null { if !graphql.HasFieldError(ctx, graphql.GetFieldContext(ctx)) { - ec.Errorf(ctx, "the requested element is null which the schema does not allow") + graphql.AddErrorf(ctx, "the requested element is null which the schema does not allow") } } return res @@ -17669,7 +18096,7 @@ func (ec *executionContext) unmarshalNInt2ᚖint(ctx context.Context, v any) (*i func (ec *executionContext) marshalNInt2ᚖint(ctx context.Context, sel ast.SelectionSet, v *int) graphql.Marshaler { if v == nil { if !graphql.HasFieldError(ctx, graphql.GetFieldContext(ctx)) { - ec.Errorf(ctx, "the requested element is null which the schema does not allow") + graphql.AddErrorf(ctx, "the requested element is null which the schema does not allow") } return graphql.Null } @@ -17677,46 +18104,18 @@ func (ec *executionContext) marshalNInt2ᚖint(ctx context.Context, sel ast.Sele res := graphql.MarshalInt(*v) if res == graphql.Null { if !graphql.HasFieldError(ctx, graphql.GetFieldContext(ctx)) { - ec.Errorf(ctx, "the requested element is null which the schema does not allow") + graphql.AddErrorf(ctx, "the requested element is null which the schema does not allow") } } return res } -func (ec *executionContext) marshalNJob2ᚕᚖgithubᚗcomᚋClusterCockpitᚋccᚑlibᚋschemaᚐJobᚄ(ctx context.Context, 
sel ast.SelectionSet, v []*schema.Job) graphql.Marshaler { - ret := make(graphql.Array, len(v)) - var wg sync.WaitGroup - isLen1 := len(v) == 1 - if !isLen1 { - wg.Add(len(v)) - } - for i := range v { - i := i - fc := &graphql.FieldContext{ - Index: &i, - Result: &v[i], - } - ctx := graphql.WithFieldContext(ctx, fc) - f := func(i int) { - defer func() { - if r := recover(); r != nil { - ec.Error(ctx, ec.Recover(ctx, r)) - ret = nil - } - }() - if !isLen1 { - defer wg.Done() - } - ret[i] = ec.marshalNJob2ᚖgithubᚗcomᚋClusterCockpitᚋccᚑlibᚋschemaᚐJob(ctx, sel, v[i]) - } - if isLen1 { - f(i) - } else { - go f(i) - } - - } - wg.Wait() +func (ec *executionContext) marshalNJob2ᚕᚖgithubᚗcomᚋClusterCockpitᚋccᚑlibᚋv2ᚋschemaᚐJobᚄ(ctx context.Context, sel ast.SelectionSet, v []*schema.Job) graphql.Marshaler { + ret := graphql.MarshalSliceConcurrently(ctx, len(v), 0, false, func(ctx context.Context, i int) graphql.Marshaler { + fc := graphql.GetFieldContext(ctx) + fc.Result = &v[i] + return ec.marshalNJob2ᚖgithubᚗcomᚋClusterCockpitᚋccᚑlibᚋv2ᚋschemaᚐJob(ctx, sel, v[i]) + }) for _, e := range ret { if e == graphql.Null { @@ -17727,10 +18126,10 @@ func (ec *executionContext) marshalNJob2ᚕᚖgithubᚗcomᚋClusterCockpitᚋcc return ret } -func (ec *executionContext) marshalNJob2ᚖgithubᚗcomᚋClusterCockpitᚋccᚑlibᚋschemaᚐJob(ctx context.Context, sel ast.SelectionSet, v *schema.Job) graphql.Marshaler { +func (ec *executionContext) marshalNJob2ᚖgithubᚗcomᚋClusterCockpitᚋccᚑlibᚋv2ᚋschemaᚐJob(ctx context.Context, sel ast.SelectionSet, v *schema.Job) graphql.Marshaler { if v == nil { if !graphql.HasFieldError(ctx, graphql.GetFieldContext(ctx)) { - ec.Errorf(ctx, "the requested element is null which the schema does not allow") + graphql.AddErrorf(ctx, "the requested element is null which the schema does not allow") } return graphql.Null } @@ -17758,39 +18157,11 @@ func (ec *executionContext) unmarshalNJobFilter2ᚖgithubᚗcomᚋClusterCockpit } func (ec *executionContext) 
marshalNJobLink2ᚕᚖgithubᚗcomᚋClusterCockpitᚋccᚑbackendᚋinternalᚋgraphᚋmodelᚐJobLinkᚄ(ctx context.Context, sel ast.SelectionSet, v []*model.JobLink) graphql.Marshaler { - ret := make(graphql.Array, len(v)) - var wg sync.WaitGroup - isLen1 := len(v) == 1 - if !isLen1 { - wg.Add(len(v)) - } - for i := range v { - i := i - fc := &graphql.FieldContext{ - Index: &i, - Result: &v[i], - } - ctx := graphql.WithFieldContext(ctx, fc) - f := func(i int) { - defer func() { - if r := recover(); r != nil { - ec.Error(ctx, ec.Recover(ctx, r)) - ret = nil - } - }() - if !isLen1 { - defer wg.Done() - } - ret[i] = ec.marshalNJobLink2ᚖgithubᚗcomᚋClusterCockpitᚋccᚑbackendᚋinternalᚋgraphᚋmodelᚐJobLink(ctx, sel, v[i]) - } - if isLen1 { - f(i) - } else { - go f(i) - } - - } - wg.Wait() + ret := graphql.MarshalSliceConcurrently(ctx, len(v), 0, false, func(ctx context.Context, i int) graphql.Marshaler { + fc := graphql.GetFieldContext(ctx) + fc.Result = &v[i] + return ec.marshalNJobLink2ᚖgithubᚗcomᚋClusterCockpitᚋccᚑbackendᚋinternalᚋgraphᚋmodelᚐJobLink(ctx, sel, v[i]) + }) for _, e := range ret { if e == graphql.Null { @@ -17804,17 +18175,17 @@ func (ec *executionContext) marshalNJobLink2ᚕᚖgithubᚗcomᚋClusterCockpit func (ec *executionContext) marshalNJobLink2ᚖgithubᚗcomᚋClusterCockpitᚋccᚑbackendᚋinternalᚋgraphᚋmodelᚐJobLink(ctx context.Context, sel ast.SelectionSet, v *model.JobLink) graphql.Marshaler { if v == nil { if !graphql.HasFieldError(ctx, graphql.GetFieldContext(ctx)) { - ec.Errorf(ctx, "the requested element is null which the schema does not allow") + graphql.AddErrorf(ctx, "the requested element is null which the schema does not allow") } return graphql.Null } return ec._JobLink(ctx, sel, v) } -func (ec *executionContext) marshalNJobMetric2ᚖgithubᚗcomᚋClusterCockpitᚋccᚑlibᚋschemaᚐJobMetric(ctx context.Context, sel ast.SelectionSet, v *schema.JobMetric) graphql.Marshaler { +func (ec *executionContext) marshalNJobMetric2ᚖgithubᚗcomᚋClusterCockpitᚋccᚑlibᚋv2ᚋschemaᚐJobMetric(ctx 
context.Context, sel ast.SelectionSet, v *schema.JobMetric) graphql.Marshaler { if v == nil { if !graphql.HasFieldError(ctx, graphql.GetFieldContext(ctx)) { - ec.Errorf(ctx, "the requested element is null which the schema does not allow") + graphql.AddErrorf(ctx, "the requested element is null which the schema does not allow") } return graphql.Null } @@ -17822,39 +18193,11 @@ func (ec *executionContext) marshalNJobMetric2ᚖgithubᚗcomᚋClusterCockpit } func (ec *executionContext) marshalNJobMetricWithName2ᚕᚖgithubᚗcomᚋClusterCockpitᚋccᚑbackendᚋinternalᚋgraphᚋmodelᚐJobMetricWithNameᚄ(ctx context.Context, sel ast.SelectionSet, v []*model.JobMetricWithName) graphql.Marshaler { - ret := make(graphql.Array, len(v)) - var wg sync.WaitGroup - isLen1 := len(v) == 1 - if !isLen1 { - wg.Add(len(v)) - } - for i := range v { - i := i - fc := &graphql.FieldContext{ - Index: &i, - Result: &v[i], - } - ctx := graphql.WithFieldContext(ctx, fc) - f := func(i int) { - defer func() { - if r := recover(); r != nil { - ec.Error(ctx, ec.Recover(ctx, r)) - ret = nil - } - }() - if !isLen1 { - defer wg.Done() - } - ret[i] = ec.marshalNJobMetricWithName2ᚖgithubᚗcomᚋClusterCockpitᚋccᚑbackendᚋinternalᚋgraphᚋmodelᚐJobMetricWithName(ctx, sel, v[i]) - } - if isLen1 { - f(i) - } else { - go f(i) - } - - } - wg.Wait() + ret := graphql.MarshalSliceConcurrently(ctx, len(v), 0, false, func(ctx context.Context, i int) graphql.Marshaler { + fc := graphql.GetFieldContext(ctx) + fc.Result = &v[i] + return ec.marshalNJobMetricWithName2ᚖgithubᚗcomᚋClusterCockpitᚋccᚑbackendᚋinternalᚋgraphᚋmodelᚐJobMetricWithName(ctx, sel, v[i]) + }) for _, e := range ret { if e == graphql.Null { @@ -17868,7 +18211,7 @@ func (ec *executionContext) marshalNJobMetricWithName2ᚕᚖgithubᚗcomᚋClust func (ec *executionContext) marshalNJobMetricWithName2ᚖgithubᚗcomᚋClusterCockpitᚋccᚑbackendᚋinternalᚋgraphᚋmodelᚐJobMetricWithName(ctx context.Context, sel ast.SelectionSet, v *model.JobMetricWithName) graphql.Marshaler { if v == nil { if 
!graphql.HasFieldError(ctx, graphql.GetFieldContext(ctx)) { - ec.Errorf(ctx, "the requested element is null which the schema does not allow") + graphql.AddErrorf(ctx, "the requested element is null which the schema does not allow") } return graphql.Null } @@ -17882,57 +18225,29 @@ func (ec *executionContext) marshalNJobResultList2githubᚗcomᚋClusterCockpit func (ec *executionContext) marshalNJobResultList2ᚖgithubᚗcomᚋClusterCockpitᚋccᚑbackendᚋinternalᚋgraphᚋmodelᚐJobResultList(ctx context.Context, sel ast.SelectionSet, v *model.JobResultList) graphql.Marshaler { if v == nil { if !graphql.HasFieldError(ctx, graphql.GetFieldContext(ctx)) { - ec.Errorf(ctx, "the requested element is null which the schema does not allow") + graphql.AddErrorf(ctx, "the requested element is null which the schema does not allow") } return graphql.Null } return ec._JobResultList(ctx, sel, v) } -func (ec *executionContext) unmarshalNJobState2githubᚗcomᚋClusterCockpitᚋccᚑlibᚋschemaᚐJobState(ctx context.Context, v any) (schema.JobState, error) { +func (ec *executionContext) unmarshalNJobState2githubᚗcomᚋClusterCockpitᚋccᚑlibᚋv2ᚋschemaᚐJobState(ctx context.Context, v any) (schema.JobState, error) { var res schema.JobState err := res.UnmarshalGQL(v) return res, graphql.ErrorOnPath(ctx, err) } -func (ec *executionContext) marshalNJobState2githubᚗcomᚋClusterCockpitᚋccᚑlibᚋschemaᚐJobState(ctx context.Context, sel ast.SelectionSet, v schema.JobState) graphql.Marshaler { +func (ec *executionContext) marshalNJobState2githubᚗcomᚋClusterCockpitᚋccᚑlibᚋv2ᚋschemaᚐJobState(ctx context.Context, sel ast.SelectionSet, v schema.JobState) graphql.Marshaler { return v } func (ec *executionContext) marshalNJobStats2ᚕᚖgithubᚗcomᚋClusterCockpitᚋccᚑbackendᚋinternalᚋgraphᚋmodelᚐJobStatsᚄ(ctx context.Context, sel ast.SelectionSet, v []*model.JobStats) graphql.Marshaler { - ret := make(graphql.Array, len(v)) - var wg sync.WaitGroup - isLen1 := len(v) == 1 - if !isLen1 { - wg.Add(len(v)) - } - for i := range v { - i := 
i - fc := &graphql.FieldContext{ - Index: &i, - Result: &v[i], - } - ctx := graphql.WithFieldContext(ctx, fc) - f := func(i int) { - defer func() { - if r := recover(); r != nil { - ec.Error(ctx, ec.Recover(ctx, r)) - ret = nil - } - }() - if !isLen1 { - defer wg.Done() - } - ret[i] = ec.marshalNJobStats2ᚖgithubᚗcomᚋClusterCockpitᚋccᚑbackendᚋinternalᚋgraphᚋmodelᚐJobStats(ctx, sel, v[i]) - } - if isLen1 { - f(i) - } else { - go f(i) - } - - } - wg.Wait() + ret := graphql.MarshalSliceConcurrently(ctx, len(v), 0, false, func(ctx context.Context, i int) graphql.Marshaler { + fc := graphql.GetFieldContext(ctx) + fc.Result = &v[i] + return ec.marshalNJobStats2ᚖgithubᚗcomᚋClusterCockpitᚋccᚑbackendᚋinternalᚋgraphᚋmodelᚐJobStats(ctx, sel, v[i]) + }) for _, e := range ret { if e == graphql.Null { @@ -17946,7 +18261,7 @@ func (ec *executionContext) marshalNJobStats2ᚕᚖgithubᚗcomᚋClusterCockpit func (ec *executionContext) marshalNJobStats2ᚖgithubᚗcomᚋClusterCockpitᚋccᚑbackendᚋinternalᚋgraphᚋmodelᚐJobStats(ctx context.Context, sel ast.SelectionSet, v *model.JobStats) graphql.Marshaler { if v == nil { if !graphql.HasFieldError(ctx, graphql.GetFieldContext(ctx)) { - ec.Errorf(ctx, "the requested element is null which the schema does not allow") + graphql.AddErrorf(ctx, "the requested element is null which the schema does not allow") } return graphql.Null } @@ -17954,39 +18269,11 @@ func (ec *executionContext) marshalNJobStats2ᚖgithubᚗcomᚋClusterCockpitᚋ } func (ec *executionContext) marshalNJobsStatistics2ᚕᚖgithubᚗcomᚋClusterCockpitᚋccᚑbackendᚋinternalᚋgraphᚋmodelᚐJobsStatisticsᚄ(ctx context.Context, sel ast.SelectionSet, v []*model.JobsStatistics) graphql.Marshaler { - ret := make(graphql.Array, len(v)) - var wg sync.WaitGroup - isLen1 := len(v) == 1 - if !isLen1 { - wg.Add(len(v)) - } - for i := range v { - i := i - fc := &graphql.FieldContext{ - Index: &i, - Result: &v[i], - } - ctx := graphql.WithFieldContext(ctx, fc) - f := func(i int) { - defer func() { - if r := recover(); 
r != nil { - ec.Error(ctx, ec.Recover(ctx, r)) - ret = nil - } - }() - if !isLen1 { - defer wg.Done() - } - ret[i] = ec.marshalNJobsStatistics2ᚖgithubᚗcomᚋClusterCockpitᚋccᚑbackendᚋinternalᚋgraphᚋmodelᚐJobsStatistics(ctx, sel, v[i]) - } - if isLen1 { - f(i) - } else { - go f(i) - } - - } - wg.Wait() + ret := graphql.MarshalSliceConcurrently(ctx, len(v), 0, false, func(ctx context.Context, i int) graphql.Marshaler { + fc := graphql.GetFieldContext(ctx) + fc.Result = &v[i] + return ec.marshalNJobsStatistics2ᚖgithubᚗcomᚋClusterCockpitᚋccᚑbackendᚋinternalᚋgraphᚋmodelᚐJobsStatistics(ctx, sel, v[i]) + }) for _, e := range ret { if e == graphql.Null { @@ -18000,51 +18287,19 @@ func (ec *executionContext) marshalNJobsStatistics2ᚕᚖgithubᚗcomᚋClusterC func (ec *executionContext) marshalNJobsStatistics2ᚖgithubᚗcomᚋClusterCockpitᚋccᚑbackendᚋinternalᚋgraphᚋmodelᚐJobsStatistics(ctx context.Context, sel ast.SelectionSet, v *model.JobsStatistics) graphql.Marshaler { if v == nil { if !graphql.HasFieldError(ctx, graphql.GetFieldContext(ctx)) { - ec.Errorf(ctx, "the requested element is null which the schema does not allow") + graphql.AddErrorf(ctx, "the requested element is null which the schema does not allow") } return graphql.Null } return ec._JobsStatistics(ctx, sel, v) } -func (ec *executionContext) marshalNMetricConfig2githubᚗcomᚋClusterCockpitᚋccᚑlibᚋschemaᚐMetricConfig(ctx context.Context, sel ast.SelectionSet, v schema.MetricConfig) graphql.Marshaler { - return ec._MetricConfig(ctx, sel, &v) -} - -func (ec *executionContext) marshalNMetricConfig2ᚕgithubᚗcomᚋClusterCockpitᚋccᚑlibᚋschemaᚐMetricConfigᚄ(ctx context.Context, sel ast.SelectionSet, v []schema.MetricConfig) graphql.Marshaler { - ret := make(graphql.Array, len(v)) - var wg sync.WaitGroup - isLen1 := len(v) == 1 - if !isLen1 { - wg.Add(len(v)) - } - for i := range v { - i := i - fc := &graphql.FieldContext{ - Index: &i, - Result: &v[i], - } - ctx := graphql.WithFieldContext(ctx, fc) - f := func(i int) { - defer 
func() { - if r := recover(); r != nil { - ec.Error(ctx, ec.Recover(ctx, r)) - ret = nil - } - }() - if !isLen1 { - defer wg.Done() - } - ret[i] = ec.marshalNMetricConfig2githubᚗcomᚋClusterCockpitᚋccᚑlibᚋschemaᚐMetricConfig(ctx, sel, v[i]) - } - if isLen1 { - f(i) - } else { - go f(i) - } - - } - wg.Wait() +func (ec *executionContext) marshalNMetricConfig2ᚕᚖgithubᚗcomᚋClusterCockpitᚋccᚑlibᚋv2ᚋschemaᚐMetricConfigᚄ(ctx context.Context, sel ast.SelectionSet, v []*schema.MetricConfig) graphql.Marshaler { + ret := graphql.MarshalSliceConcurrently(ctx, len(v), 0, false, func(ctx context.Context, i int) graphql.Marshaler { + fc := graphql.GetFieldContext(ctx) + fc.Result = &v[i] + return ec.marshalNMetricConfig2ᚖgithubᚗcomᚋClusterCockpitᚋccᚑlibᚋv2ᚋschemaᚐMetricConfig(ctx, sel, v[i]) + }) for _, e := range ret { if e == graphql.Null { @@ -18055,40 +18310,22 @@ func (ec *executionContext) marshalNMetricConfig2ᚕgithubᚗcomᚋClusterCockpi return ret } -func (ec *executionContext) marshalNMetricFootprints2ᚕᚖgithubᚗcomᚋClusterCockpitᚋccᚑbackendᚋinternalᚋgraphᚋmodelᚐMetricFootprintsᚄ(ctx context.Context, sel ast.SelectionSet, v []*model.MetricFootprints) graphql.Marshaler { - ret := make(graphql.Array, len(v)) - var wg sync.WaitGroup - isLen1 := len(v) == 1 - if !isLen1 { - wg.Add(len(v)) +func (ec *executionContext) marshalNMetricConfig2ᚖgithubᚗcomᚋClusterCockpitᚋccᚑlibᚋv2ᚋschemaᚐMetricConfig(ctx context.Context, sel ast.SelectionSet, v *schema.MetricConfig) graphql.Marshaler { + if v == nil { + if !graphql.HasFieldError(ctx, graphql.GetFieldContext(ctx)) { + graphql.AddErrorf(ctx, "the requested element is null which the schema does not allow") + } + return graphql.Null } - for i := range v { - i := i - fc := &graphql.FieldContext{ - Index: &i, - Result: &v[i], - } - ctx := graphql.WithFieldContext(ctx, fc) - f := func(i int) { - defer func() { - if r := recover(); r != nil { - ec.Error(ctx, ec.Recover(ctx, r)) - ret = nil - } - }() - if !isLen1 { - defer wg.Done() - } - ret[i] 
= ec.marshalNMetricFootprints2ᚖgithubᚗcomᚋClusterCockpitᚋccᚑbackendᚋinternalᚋgraphᚋmodelᚐMetricFootprints(ctx, sel, v[i]) - } - if isLen1 { - f(i) - } else { - go f(i) - } + return ec._MetricConfig(ctx, sel, v) +} - } - wg.Wait() +func (ec *executionContext) marshalNMetricFootprints2ᚕᚖgithubᚗcomᚋClusterCockpitᚋccᚑbackendᚋinternalᚋgraphᚋmodelᚐMetricFootprintsᚄ(ctx context.Context, sel ast.SelectionSet, v []*model.MetricFootprints) graphql.Marshaler { + ret := graphql.MarshalSliceConcurrently(ctx, len(v), 0, false, func(ctx context.Context, i int) graphql.Marshaler { + fc := graphql.GetFieldContext(ctx) + fc.Result = &v[i] + return ec.marshalNMetricFootprints2ᚖgithubᚗcomᚋClusterCockpitᚋccᚑbackendᚋinternalᚋgraphᚋmodelᚐMetricFootprints(ctx, sel, v[i]) + }) for _, e := range ret { if e == graphql.Null { @@ -18102,7 +18339,7 @@ func (ec *executionContext) marshalNMetricFootprints2ᚕᚖgithubᚗcomᚋCluste func (ec *executionContext) marshalNMetricFootprints2ᚖgithubᚗcomᚋClusterCockpitᚋccᚑbackendᚋinternalᚋgraphᚋmodelᚐMetricFootprints(ctx context.Context, sel ast.SelectionSet, v *model.MetricFootprints) graphql.Marshaler { if v == nil { if !graphql.HasFieldError(ctx, graphql.GetFieldContext(ctx)) { - ec.Errorf(ctx, "the requested element is null which the schema does not allow") + graphql.AddErrorf(ctx, "the requested element is null which the schema does not allow") } return graphql.Null } @@ -18112,7 +18349,7 @@ func (ec *executionContext) marshalNMetricFootprints2ᚖgithubᚗcomᚋClusterCo func (ec *executionContext) marshalNMetricHistoPoint2ᚖgithubᚗcomᚋClusterCockpitᚋccᚑbackendᚋinternalᚋgraphᚋmodelᚐMetricHistoPoint(ctx context.Context, sel ast.SelectionSet, v *model.MetricHistoPoint) graphql.Marshaler { if v == nil { if !graphql.HasFieldError(ctx, graphql.GetFieldContext(ctx)) { - ec.Errorf(ctx, "the requested element is null which the schema does not allow") + graphql.AddErrorf(ctx, "the requested element is null which the schema does not allow") } return graphql.Null } @@ 
-18120,39 +18357,11 @@ func (ec *executionContext) marshalNMetricHistoPoint2ᚖgithubᚗcomᚋClusterCo } func (ec *executionContext) marshalNMetricHistoPoints2ᚕᚖgithubᚗcomᚋClusterCockpitᚋccᚑbackendᚋinternalᚋgraphᚋmodelᚐMetricHistoPointsᚄ(ctx context.Context, sel ast.SelectionSet, v []*model.MetricHistoPoints) graphql.Marshaler { - ret := make(graphql.Array, len(v)) - var wg sync.WaitGroup - isLen1 := len(v) == 1 - if !isLen1 { - wg.Add(len(v)) - } - for i := range v { - i := i - fc := &graphql.FieldContext{ - Index: &i, - Result: &v[i], - } - ctx := graphql.WithFieldContext(ctx, fc) - f := func(i int) { - defer func() { - if r := recover(); r != nil { - ec.Error(ctx, ec.Recover(ctx, r)) - ret = nil - } - }() - if !isLen1 { - defer wg.Done() - } - ret[i] = ec.marshalNMetricHistoPoints2ᚖgithubᚗcomᚋClusterCockpitᚋccᚑbackendᚋinternalᚋgraphᚋmodelᚐMetricHistoPoints(ctx, sel, v[i]) - } - if isLen1 { - f(i) - } else { - go f(i) - } - - } - wg.Wait() + ret := graphql.MarshalSliceConcurrently(ctx, len(v), 0, false, func(ctx context.Context, i int) graphql.Marshaler { + fc := graphql.GetFieldContext(ctx) + fc.Result = &v[i] + return ec.marshalNMetricHistoPoints2ᚖgithubᚗcomᚋClusterCockpitᚋccᚑbackendᚋinternalᚋgraphᚋmodelᚐMetricHistoPoints(ctx, sel, v[i]) + }) for _, e := range ret { if e == graphql.Null { @@ -18166,20 +18375,20 @@ func (ec *executionContext) marshalNMetricHistoPoints2ᚕᚖgithubᚗcomᚋClust func (ec *executionContext) marshalNMetricHistoPoints2ᚖgithubᚗcomᚋClusterCockpitᚋccᚑbackendᚋinternalᚋgraphᚋmodelᚐMetricHistoPoints(ctx context.Context, sel ast.SelectionSet, v *model.MetricHistoPoints) graphql.Marshaler { if v == nil { if !graphql.HasFieldError(ctx, graphql.GetFieldContext(ctx)) { - ec.Errorf(ctx, "the requested element is null which the schema does not allow") + graphql.AddErrorf(ctx, "the requested element is null which the schema does not allow") } return graphql.Null } return ec._MetricHistoPoints(ctx, sel, v) } -func (ec *executionContext) 
unmarshalNMetricScope2githubᚗcomᚋClusterCockpitᚋccᚑlibᚋschemaᚐMetricScope(ctx context.Context, v any) (schema.MetricScope, error) { +func (ec *executionContext) unmarshalNMetricScope2githubᚗcomᚋClusterCockpitᚋccᚑlibᚋv2ᚋschemaᚐMetricScope(ctx context.Context, v any) (schema.MetricScope, error) { var res schema.MetricScope err := res.UnmarshalGQL(v) return res, graphql.ErrorOnPath(ctx, err) } -func (ec *executionContext) marshalNMetricScope2githubᚗcomᚋClusterCockpitᚋccᚑlibᚋschemaᚐMetricScope(ctx context.Context, sel ast.SelectionSet, v schema.MetricScope) graphql.Marshaler { +func (ec *executionContext) marshalNMetricScope2githubᚗcomᚋClusterCockpitᚋccᚑlibᚋv2ᚋschemaᚐMetricScope(ctx context.Context, sel ast.SelectionSet, v schema.MetricScope) graphql.Marshaler { return v } @@ -18188,17 +18397,17 @@ func (ec *executionContext) unmarshalNMetricStatItem2ᚖgithubᚗcomᚋClusterCo return &res, graphql.ErrorOnPath(ctx, err) } -func (ec *executionContext) marshalNMetricStatistics2ᚖgithubᚗcomᚋClusterCockpitᚋccᚑlibᚋschemaᚐMetricStatistics(ctx context.Context, sel ast.SelectionSet, v *schema.MetricStatistics) graphql.Marshaler { +func (ec *executionContext) marshalNMetricStatistics2ᚖgithubᚗcomᚋClusterCockpitᚋccᚑlibᚋv2ᚋschemaᚐMetricStatistics(ctx context.Context, sel ast.SelectionSet, v *schema.MetricStatistics) graphql.Marshaler { if v == nil { if !graphql.HasFieldError(ctx, graphql.GetFieldContext(ctx)) { - ec.Errorf(ctx, "the requested element is null which the schema does not allow") + graphql.AddErrorf(ctx, "the requested element is null which the schema does not allow") } return graphql.Null } return ec._MetricStatistics(ctx, sel, v) } -func (ec *executionContext) marshalNMetricValue2githubᚗcomᚋClusterCockpitᚋccᚑlibᚋschemaᚐMetricValue(ctx context.Context, sel ast.SelectionSet, v schema.MetricValue) graphql.Marshaler { +func (ec *executionContext) marshalNMetricValue2githubᚗcomᚋClusterCockpitᚋccᚑlibᚋv2ᚋschemaᚐMetricValue(ctx context.Context, sel ast.SelectionSet, v 
schema.MetricValue) graphql.Marshaler { return ec._MetricValue(ctx, sel, &v) } @@ -18212,46 +18421,18 @@ func (ec *executionContext) marshalNMonitoringState2string(ctx context.Context, res := graphql.MarshalString(v) if res == graphql.Null { if !graphql.HasFieldError(ctx, graphql.GetFieldContext(ctx)) { - ec.Errorf(ctx, "the requested element is null which the schema does not allow") + graphql.AddErrorf(ctx, "the requested element is null which the schema does not allow") } } return res } func (ec *executionContext) marshalNNamedStats2ᚕᚖgithubᚗcomᚋClusterCockpitᚋccᚑbackendᚋinternalᚋgraphᚋmodelᚐNamedStatsᚄ(ctx context.Context, sel ast.SelectionSet, v []*model.NamedStats) graphql.Marshaler { - ret := make(graphql.Array, len(v)) - var wg sync.WaitGroup - isLen1 := len(v) == 1 - if !isLen1 { - wg.Add(len(v)) - } - for i := range v { - i := i - fc := &graphql.FieldContext{ - Index: &i, - Result: &v[i], - } - ctx := graphql.WithFieldContext(ctx, fc) - f := func(i int) { - defer func() { - if r := recover(); r != nil { - ec.Error(ctx, ec.Recover(ctx, r)) - ret = nil - } - }() - if !isLen1 { - defer wg.Done() - } - ret[i] = ec.marshalNNamedStats2ᚖgithubᚗcomᚋClusterCockpitᚋccᚑbackendᚋinternalᚋgraphᚋmodelᚐNamedStats(ctx, sel, v[i]) - } - if isLen1 { - f(i) - } else { - go f(i) - } - - } - wg.Wait() + ret := graphql.MarshalSliceConcurrently(ctx, len(v), 0, false, func(ctx context.Context, i int) graphql.Marshaler { + fc := graphql.GetFieldContext(ctx) + fc.Result = &v[i] + return ec.marshalNNamedStats2ᚖgithubᚗcomᚋClusterCockpitᚋccᚑbackendᚋinternalᚋgraphᚋmodelᚐNamedStats(ctx, sel, v[i]) + }) for _, e := range ret { if e == graphql.Null { @@ -18265,7 +18446,7 @@ func (ec *executionContext) marshalNNamedStats2ᚕᚖgithubᚗcomᚋClusterCockp func (ec *executionContext) marshalNNamedStats2ᚖgithubᚗcomᚋClusterCockpitᚋccᚑbackendᚋinternalᚋgraphᚋmodelᚐNamedStats(ctx context.Context, sel ast.SelectionSet, v *model.NamedStats) graphql.Marshaler { if v == nil { if !graphql.HasFieldError(ctx, 
graphql.GetFieldContext(ctx)) { - ec.Errorf(ctx, "the requested element is null which the schema does not allow") + graphql.AddErrorf(ctx, "the requested element is null which the schema does not allow") } return graphql.Null } @@ -18273,39 +18454,11 @@ func (ec *executionContext) marshalNNamedStats2ᚖgithubᚗcomᚋClusterCockpit } func (ec *executionContext) marshalNNamedStatsWithScope2ᚕᚖgithubᚗcomᚋClusterCockpitᚋccᚑbackendᚋinternalᚋgraphᚋmodelᚐNamedStatsWithScopeᚄ(ctx context.Context, sel ast.SelectionSet, v []*model.NamedStatsWithScope) graphql.Marshaler { - ret := make(graphql.Array, len(v)) - var wg sync.WaitGroup - isLen1 := len(v) == 1 - if !isLen1 { - wg.Add(len(v)) - } - for i := range v { - i := i - fc := &graphql.FieldContext{ - Index: &i, - Result: &v[i], - } - ctx := graphql.WithFieldContext(ctx, fc) - f := func(i int) { - defer func() { - if r := recover(); r != nil { - ec.Error(ctx, ec.Recover(ctx, r)) - ret = nil - } - }() - if !isLen1 { - defer wg.Done() - } - ret[i] = ec.marshalNNamedStatsWithScope2ᚖgithubᚗcomᚋClusterCockpitᚋccᚑbackendᚋinternalᚋgraphᚋmodelᚐNamedStatsWithScope(ctx, sel, v[i]) - } - if isLen1 { - f(i) - } else { - go f(i) - } - - } - wg.Wait() + ret := graphql.MarshalSliceConcurrently(ctx, len(v), 0, false, func(ctx context.Context, i int) graphql.Marshaler { + fc := graphql.GetFieldContext(ctx) + fc.Result = &v[i] + return ec.marshalNNamedStatsWithScope2ᚖgithubᚗcomᚋClusterCockpitᚋccᚑbackendᚋinternalᚋgraphᚋmodelᚐNamedStatsWithScope(ctx, sel, v[i]) + }) for _, e := range ret { if e == graphql.Null { @@ -18319,47 +18472,19 @@ func (ec *executionContext) marshalNNamedStatsWithScope2ᚕᚖgithubᚗcomᚋClu func (ec *executionContext) marshalNNamedStatsWithScope2ᚖgithubᚗcomᚋClusterCockpitᚋccᚑbackendᚋinternalᚋgraphᚋmodelᚐNamedStatsWithScope(ctx context.Context, sel ast.SelectionSet, v *model.NamedStatsWithScope) graphql.Marshaler { if v == nil { if !graphql.HasFieldError(ctx, graphql.GetFieldContext(ctx)) { - ec.Errorf(ctx, "the requested element is 
null which the schema does not allow") + graphql.AddErrorf(ctx, "the requested element is null which the schema does not allow") } return graphql.Null } return ec._NamedStatsWithScope(ctx, sel, v) } -func (ec *executionContext) marshalNNode2ᚕᚖgithubᚗcomᚋClusterCockpitᚋccᚑlibᚋschemaᚐNodeᚄ(ctx context.Context, sel ast.SelectionSet, v []*schema.Node) graphql.Marshaler { - ret := make(graphql.Array, len(v)) - var wg sync.WaitGroup - isLen1 := len(v) == 1 - if !isLen1 { - wg.Add(len(v)) - } - for i := range v { - i := i - fc := &graphql.FieldContext{ - Index: &i, - Result: &v[i], - } - ctx := graphql.WithFieldContext(ctx, fc) - f := func(i int) { - defer func() { - if r := recover(); r != nil { - ec.Error(ctx, ec.Recover(ctx, r)) - ret = nil - } - }() - if !isLen1 { - defer wg.Done() - } - ret[i] = ec.marshalNNode2ᚖgithubᚗcomᚋClusterCockpitᚋccᚑlibᚋschemaᚐNode(ctx, sel, v[i]) - } - if isLen1 { - f(i) - } else { - go f(i) - } - - } - wg.Wait() +func (ec *executionContext) marshalNNode2ᚕᚖgithubᚗcomᚋClusterCockpitᚋccᚑlibᚋv2ᚋschemaᚐNodeᚄ(ctx context.Context, sel ast.SelectionSet, v []*schema.Node) graphql.Marshaler { + ret := graphql.MarshalSliceConcurrently(ctx, len(v), 0, false, func(ctx context.Context, i int) graphql.Marshaler { + fc := graphql.GetFieldContext(ctx) + fc.Result = &v[i] + return ec.marshalNNode2ᚖgithubᚗcomᚋClusterCockpitᚋccᚑlibᚋv2ᚋschemaᚐNode(ctx, sel, v[i]) + }) for _, e := range ret { if e == graphql.Null { @@ -18370,10 +18495,10 @@ func (ec *executionContext) marshalNNode2ᚕᚖgithubᚗcomᚋClusterCockpitᚋc return ret } -func (ec *executionContext) marshalNNode2ᚖgithubᚗcomᚋClusterCockpitᚋccᚑlibᚋschemaᚐNode(ctx context.Context, sel ast.SelectionSet, v *schema.Node) graphql.Marshaler { +func (ec *executionContext) marshalNNode2ᚖgithubᚗcomᚋClusterCockpitᚋccᚑlibᚋv2ᚋschemaᚐNode(ctx context.Context, sel ast.SelectionSet, v *schema.Node) graphql.Marshaler { if v == nil { if !graphql.HasFieldError(ctx, graphql.GetFieldContext(ctx)) { - ec.Errorf(ctx, "the requested 
element is null which the schema does not allow") + graphql.AddErrorf(ctx, "the requested element is null which the schema does not allow") } return graphql.Null } @@ -18386,39 +18511,11 @@ func (ec *executionContext) unmarshalNNodeFilter2ᚖgithubᚗcomᚋClusterCockpi } func (ec *executionContext) marshalNNodeMetrics2ᚕᚖgithubᚗcomᚋClusterCockpitᚋccᚑbackendᚋinternalᚋgraphᚋmodelᚐNodeMetricsᚄ(ctx context.Context, sel ast.SelectionSet, v []*model.NodeMetrics) graphql.Marshaler { - ret := make(graphql.Array, len(v)) - var wg sync.WaitGroup - isLen1 := len(v) == 1 - if !isLen1 { - wg.Add(len(v)) - } - for i := range v { - i := i - fc := &graphql.FieldContext{ - Index: &i, - Result: &v[i], - } - ctx := graphql.WithFieldContext(ctx, fc) - f := func(i int) { - defer func() { - if r := recover(); r != nil { - ec.Error(ctx, ec.Recover(ctx, r)) - ret = nil - } - }() - if !isLen1 { - defer wg.Done() - } - ret[i] = ec.marshalNNodeMetrics2ᚖgithubᚗcomᚋClusterCockpitᚋccᚑbackendᚋinternalᚋgraphᚋmodelᚐNodeMetrics(ctx, sel, v[i]) - } - if isLen1 { - f(i) - } else { - go f(i) - } - - } - wg.Wait() + ret := graphql.MarshalSliceConcurrently(ctx, len(v), 0, false, func(ctx context.Context, i int) graphql.Marshaler { + fc := graphql.GetFieldContext(ctx) + fc.Result = &v[i] + return ec.marshalNNodeMetrics2ᚖgithubᚗcomᚋClusterCockpitᚋccᚑbackendᚋinternalᚋgraphᚋmodelᚐNodeMetrics(ctx, sel, v[i]) + }) for _, e := range ret { if e == graphql.Null { @@ -18432,7 +18529,7 @@ func (ec *executionContext) marshalNNodeMetrics2ᚕᚖgithubᚗcomᚋClusterCock func (ec *executionContext) marshalNNodeMetrics2ᚖgithubᚗcomᚋClusterCockpitᚋccᚑbackendᚋinternalᚋgraphᚋmodelᚐNodeMetrics(ctx context.Context, sel ast.SelectionSet, v *model.NodeMetrics) graphql.Marshaler { if v == nil { if !graphql.HasFieldError(ctx, graphql.GetFieldContext(ctx)) { - ec.Errorf(ctx, "the requested element is null which the schema does not allow") + graphql.AddErrorf(ctx, "the requested element is null which the schema does not allow") } return 
graphql.Null } @@ -18446,7 +18543,7 @@ func (ec *executionContext) marshalNNodeStateResultList2githubᚗcomᚋClusterCo func (ec *executionContext) marshalNNodeStateResultList2ᚖgithubᚗcomᚋClusterCockpitᚋccᚑbackendᚋinternalᚋgraphᚋmodelᚐNodeStateResultList(ctx context.Context, sel ast.SelectionSet, v *model.NodeStateResultList) graphql.Marshaler { if v == nil { if !graphql.HasFieldError(ctx, graphql.GetFieldContext(ctx)) { - ec.Errorf(ctx, "the requested element is null which the schema does not allow") + graphql.AddErrorf(ctx, "the requested element is null which the schema does not allow") } return graphql.Null } @@ -18454,39 +18551,11 @@ func (ec *executionContext) marshalNNodeStateResultList2ᚖgithubᚗcomᚋCluste } func (ec *executionContext) marshalNNodeStates2ᚕᚖgithubᚗcomᚋClusterCockpitᚋccᚑbackendᚋinternalᚋgraphᚋmodelᚐNodeStatesᚄ(ctx context.Context, sel ast.SelectionSet, v []*model.NodeStates) graphql.Marshaler { - ret := make(graphql.Array, len(v)) - var wg sync.WaitGroup - isLen1 := len(v) == 1 - if !isLen1 { - wg.Add(len(v)) - } - for i := range v { - i := i - fc := &graphql.FieldContext{ - Index: &i, - Result: &v[i], - } - ctx := graphql.WithFieldContext(ctx, fc) - f := func(i int) { - defer func() { - if r := recover(); r != nil { - ec.Error(ctx, ec.Recover(ctx, r)) - ret = nil - } - }() - if !isLen1 { - defer wg.Done() - } - ret[i] = ec.marshalNNodeStates2ᚖgithubᚗcomᚋClusterCockpitᚋccᚑbackendᚋinternalᚋgraphᚋmodelᚐNodeStates(ctx, sel, v[i]) - } - if isLen1 { - f(i) - } else { - go f(i) - } - - } - wg.Wait() + ret := graphql.MarshalSliceConcurrently(ctx, len(v), 0, false, func(ctx context.Context, i int) graphql.Marshaler { + fc := graphql.GetFieldContext(ctx) + fc.Result = &v[i] + return ec.marshalNNodeStates2ᚖgithubᚗcomᚋClusterCockpitᚋccᚑbackendᚋinternalᚋgraphᚋmodelᚐNodeStates(ctx, sel, v[i]) + }) for _, e := range ret { if e == graphql.Null { @@ -18500,7 +18569,7 @@ func (ec *executionContext) marshalNNodeStates2ᚕᚖgithubᚗcomᚋClusterCockp func (ec 
*executionContext) marshalNNodeStates2ᚖgithubᚗcomᚋClusterCockpitᚋccᚑbackendᚋinternalᚋgraphᚋmodelᚐNodeStates(ctx context.Context, sel ast.SelectionSet, v *model.NodeStates) graphql.Marshaler { if v == nil { if !graphql.HasFieldError(ctx, graphql.GetFieldContext(ctx)) { - ec.Errorf(ctx, "the requested element is null which the schema does not allow") + graphql.AddErrorf(ctx, "the requested element is null which the schema does not allow") } return graphql.Null } @@ -18508,39 +18577,11 @@ func (ec *executionContext) marshalNNodeStates2ᚖgithubᚗcomᚋClusterCockpit } func (ec *executionContext) marshalNNodeStatesTimed2ᚕᚖgithubᚗcomᚋClusterCockpitᚋccᚑbackendᚋinternalᚋgraphᚋmodelᚐNodeStatesTimedᚄ(ctx context.Context, sel ast.SelectionSet, v []*model.NodeStatesTimed) graphql.Marshaler { - ret := make(graphql.Array, len(v)) - var wg sync.WaitGroup - isLen1 := len(v) == 1 - if !isLen1 { - wg.Add(len(v)) - } - for i := range v { - i := i - fc := &graphql.FieldContext{ - Index: &i, - Result: &v[i], - } - ctx := graphql.WithFieldContext(ctx, fc) - f := func(i int) { - defer func() { - if r := recover(); r != nil { - ec.Error(ctx, ec.Recover(ctx, r)) - ret = nil - } - }() - if !isLen1 { - defer wg.Done() - } - ret[i] = ec.marshalNNodeStatesTimed2ᚖgithubᚗcomᚋClusterCockpitᚋccᚑbackendᚋinternalᚋgraphᚋmodelᚐNodeStatesTimed(ctx, sel, v[i]) - } - if isLen1 { - f(i) - } else { - go f(i) - } - - } - wg.Wait() + ret := graphql.MarshalSliceConcurrently(ctx, len(v), 0, false, func(ctx context.Context, i int) graphql.Marshaler { + fc := graphql.GetFieldContext(ctx) + fc.Result = &v[i] + return ec.marshalNNodeStatesTimed2ᚖgithubᚗcomᚋClusterCockpitᚋccᚑbackendᚋinternalᚋgraphᚋmodelᚐNodeStatesTimed(ctx, sel, v[i]) + }) for _, e := range ret { if e == graphql.Null { @@ -18554,7 +18595,7 @@ func (ec *executionContext) marshalNNodeStatesTimed2ᚕᚖgithubᚗcomᚋCluster func (ec *executionContext) marshalNNodeStatesTimed2ᚖgithubᚗcomᚋClusterCockpitᚋccᚑbackendᚋinternalᚋgraphᚋmodelᚐNodeStatesTimed(ctx 
context.Context, sel ast.SelectionSet, v *model.NodeStatesTimed) graphql.Marshaler { if v == nil { if !graphql.HasFieldError(ctx, graphql.GetFieldContext(ctx)) { - ec.Errorf(ctx, "the requested element is null which the schema does not allow") + graphql.AddErrorf(ctx, "the requested element is null which the schema does not allow") } return graphql.Null } @@ -18568,31 +18609,31 @@ func (ec *executionContext) marshalNNodesResultList2githubᚗcomᚋClusterCockpi func (ec *executionContext) marshalNNodesResultList2ᚖgithubᚗcomᚋClusterCockpitᚋccᚑbackendᚋinternalᚋgraphᚋmodelᚐNodesResultList(ctx context.Context, sel ast.SelectionSet, v *model.NodesResultList) graphql.Marshaler { if v == nil { if !graphql.HasFieldError(ctx, graphql.GetFieldContext(ctx)) { - ec.Errorf(ctx, "the requested element is null which the schema does not allow") + graphql.AddErrorf(ctx, "the requested element is null which the schema does not allow") } return graphql.Null } return ec._NodesResultList(ctx, sel, v) } -func (ec *executionContext) unmarshalNNullableFloat2githubᚗcomᚋClusterCockpitᚋccᚑlibᚋschemaᚐFloat(ctx context.Context, v any) (schema.Float, error) { +func (ec *executionContext) unmarshalNNullableFloat2githubᚗcomᚋClusterCockpitᚋccᚑlibᚋv2ᚋschemaᚐFloat(ctx context.Context, v any) (schema.Float, error) { var res schema.Float err := res.UnmarshalGQL(v) return res, graphql.ErrorOnPath(ctx, err) } -func (ec *executionContext) marshalNNullableFloat2githubᚗcomᚋClusterCockpitᚋccᚑlibᚋschemaᚐFloat(ctx context.Context, sel ast.SelectionSet, v schema.Float) graphql.Marshaler { +func (ec *executionContext) marshalNNullableFloat2githubᚗcomᚋClusterCockpitᚋccᚑlibᚋv2ᚋschemaᚐFloat(ctx context.Context, sel ast.SelectionSet, v schema.Float) graphql.Marshaler { return v } -func (ec *executionContext) unmarshalNNullableFloat2ᚕgithubᚗcomᚋClusterCockpitᚋccᚑlibᚋschemaᚐFloatᚄ(ctx context.Context, v any) ([]schema.Float, error) { +func (ec *executionContext) 
unmarshalNNullableFloat2ᚕgithubᚗcomᚋClusterCockpitᚋccᚑlibᚋv2ᚋschemaᚐFloatᚄ(ctx context.Context, v any) ([]schema.Float, error) { var vSlice []any vSlice = graphql.CoerceList(v) var err error res := make([]schema.Float, len(vSlice)) for i := range vSlice { ctx := graphql.WithPathContext(ctx, graphql.NewPathWithIndex(i)) - res[i], err = ec.unmarshalNNullableFloat2githubᚗcomᚋClusterCockpitᚋccᚑlibᚋschemaᚐFloat(ctx, vSlice[i]) + res[i], err = ec.unmarshalNNullableFloat2githubᚗcomᚋClusterCockpitᚋccᚑlibᚋv2ᚋschemaᚐFloat(ctx, vSlice[i]) if err != nil { return nil, err } @@ -18600,10 +18641,10 @@ func (ec *executionContext) unmarshalNNullableFloat2ᚕgithubᚗcomᚋClusterCoc return res, nil } -func (ec *executionContext) marshalNNullableFloat2ᚕgithubᚗcomᚋClusterCockpitᚋccᚑlibᚋschemaᚐFloatᚄ(ctx context.Context, sel ast.SelectionSet, v []schema.Float) graphql.Marshaler { +func (ec *executionContext) marshalNNullableFloat2ᚕgithubᚗcomᚋClusterCockpitᚋccᚑlibᚋv2ᚋschemaᚐFloatᚄ(ctx context.Context, sel ast.SelectionSet, v []schema.Float) graphql.Marshaler { ret := make(graphql.Array, len(v)) for i := range v { - ret[i] = ec.marshalNNullableFloat2githubᚗcomᚋClusterCockpitᚋccᚑlibᚋschemaᚐFloat(ctx, sel, v[i]) + ret[i] = ec.marshalNNullableFloat2githubᚗcomᚋClusterCockpitᚋccᚑlibᚋv2ᚋschemaᚐFloat(ctx, sel, v[i]) } for _, e := range ret { @@ -18615,40 +18656,12 @@ func (ec *executionContext) marshalNNullableFloat2ᚕgithubᚗcomᚋClusterCockp return ret } -func (ec *executionContext) marshalNResource2ᚕᚖgithubᚗcomᚋClusterCockpitᚋccᚑlibᚋschemaᚐResourceᚄ(ctx context.Context, sel ast.SelectionSet, v []*schema.Resource) graphql.Marshaler { - ret := make(graphql.Array, len(v)) - var wg sync.WaitGroup - isLen1 := len(v) == 1 - if !isLen1 { - wg.Add(len(v)) - } - for i := range v { - i := i - fc := &graphql.FieldContext{ - Index: &i, - Result: &v[i], - } - ctx := graphql.WithFieldContext(ctx, fc) - f := func(i int) { - defer func() { - if r := recover(); r != nil { - ec.Error(ctx, ec.Recover(ctx, r)) - ret = 
nil - } - }() - if !isLen1 { - defer wg.Done() - } - ret[i] = ec.marshalNResource2ᚖgithubᚗcomᚋClusterCockpitᚋccᚑlibᚋschemaᚐResource(ctx, sel, v[i]) - } - if isLen1 { - f(i) - } else { - go f(i) - } - - } - wg.Wait() +func (ec *executionContext) marshalNResource2ᚕᚖgithubᚗcomᚋClusterCockpitᚋccᚑlibᚋv2ᚋschemaᚐResourceᚄ(ctx context.Context, sel ast.SelectionSet, v []*schema.Resource) graphql.Marshaler { + ret := graphql.MarshalSliceConcurrently(ctx, len(v), 0, false, func(ctx context.Context, i int) graphql.Marshaler { + fc := graphql.GetFieldContext(ctx) + fc.Result = &v[i] + return ec.marshalNResource2ᚖgithubᚗcomᚋClusterCockpitᚋccᚑlibᚋv2ᚋschemaᚐResource(ctx, sel, v[i]) + }) for _, e := range ret { if e == graphql.Null { @@ -18659,67 +18672,39 @@ func (ec *executionContext) marshalNResource2ᚕᚖgithubᚗcomᚋClusterCockpit return ret } -func (ec *executionContext) marshalNResource2ᚖgithubᚗcomᚋClusterCockpitᚋccᚑlibᚋschemaᚐResource(ctx context.Context, sel ast.SelectionSet, v *schema.Resource) graphql.Marshaler { +func (ec *executionContext) marshalNResource2ᚖgithubᚗcomᚋClusterCockpitᚋccᚑlibᚋv2ᚋschemaᚐResource(ctx context.Context, sel ast.SelectionSet, v *schema.Resource) graphql.Marshaler { if v == nil { if !graphql.HasFieldError(ctx, graphql.GetFieldContext(ctx)) { - ec.Errorf(ctx, "the requested element is null which the schema does not allow") + graphql.AddErrorf(ctx, "the requested element is null which the schema does not allow") } return graphql.Null } return ec._Resource(ctx, sel, v) } -func (ec *executionContext) unmarshalNSchedulerState2githubᚗcomᚋClusterCockpitᚋccᚑlibᚋschemaᚐSchedulerState(ctx context.Context, v any) (schema.SchedulerState, error) { +func (ec *executionContext) unmarshalNSchedulerState2githubᚗcomᚋClusterCockpitᚋccᚑlibᚋv2ᚋschemaᚐSchedulerState(ctx context.Context, v any) (schema.SchedulerState, error) { tmp, err := graphql.UnmarshalString(v) res := schema.SchedulerState(tmp) return res, graphql.ErrorOnPath(ctx, err) } -func (ec *executionContext) 
marshalNSchedulerState2githubᚗcomᚋClusterCockpitᚋccᚑlibᚋschemaᚐSchedulerState(ctx context.Context, sel ast.SelectionSet, v schema.SchedulerState) graphql.Marshaler { +func (ec *executionContext) marshalNSchedulerState2githubᚗcomᚋClusterCockpitᚋccᚑlibᚋv2ᚋschemaᚐSchedulerState(ctx context.Context, sel ast.SelectionSet, v schema.SchedulerState) graphql.Marshaler { _ = sel res := graphql.MarshalString(string(v)) if res == graphql.Null { if !graphql.HasFieldError(ctx, graphql.GetFieldContext(ctx)) { - ec.Errorf(ctx, "the requested element is null which the schema does not allow") + graphql.AddErrorf(ctx, "the requested element is null which the schema does not allow") } } return res } func (ec *executionContext) marshalNScopedStats2ᚕᚖgithubᚗcomᚋClusterCockpitᚋccᚑbackendᚋinternalᚋgraphᚋmodelᚐScopedStatsᚄ(ctx context.Context, sel ast.SelectionSet, v []*model.ScopedStats) graphql.Marshaler { - ret := make(graphql.Array, len(v)) - var wg sync.WaitGroup - isLen1 := len(v) == 1 - if !isLen1 { - wg.Add(len(v)) - } - for i := range v { - i := i - fc := &graphql.FieldContext{ - Index: &i, - Result: &v[i], - } - ctx := graphql.WithFieldContext(ctx, fc) - f := func(i int) { - defer func() { - if r := recover(); r != nil { - ec.Error(ctx, ec.Recover(ctx, r)) - ret = nil - } - }() - if !isLen1 { - defer wg.Done() - } - ret[i] = ec.marshalNScopedStats2ᚖgithubᚗcomᚋClusterCockpitᚋccᚑbackendᚋinternalᚋgraphᚋmodelᚐScopedStats(ctx, sel, v[i]) - } - if isLen1 { - f(i) - } else { - go f(i) - } - - } - wg.Wait() + ret := graphql.MarshalSliceConcurrently(ctx, len(v), 0, false, func(ctx context.Context, i int) graphql.Marshaler { + fc := graphql.GetFieldContext(ctx) + fc.Result = &v[i] + return ec.marshalNScopedStats2ᚖgithubᚗcomᚋClusterCockpitᚋccᚑbackendᚋinternalᚋgraphᚋmodelᚐScopedStats(ctx, sel, v[i]) + }) for _, e := range ret { if e == graphql.Null { @@ -18733,14 +18718,14 @@ func (ec *executionContext) marshalNScopedStats2ᚕᚖgithubᚗcomᚋClusterCock func (ec *executionContext) 
marshalNScopedStats2ᚖgithubᚗcomᚋClusterCockpitᚋccᚑbackendᚋinternalᚋgraphᚋmodelᚐScopedStats(ctx context.Context, sel ast.SelectionSet, v *model.ScopedStats) graphql.Marshaler { if v == nil { if !graphql.HasFieldError(ctx, graphql.GetFieldContext(ctx)) { - ec.Errorf(ctx, "the requested element is null which the schema does not allow") + graphql.AddErrorf(ctx, "the requested element is null which the schema does not allow") } return graphql.Null } return ec._ScopedStats(ctx, sel, v) } -func (ec *executionContext) marshalNSeries2githubᚗcomᚋClusterCockpitᚋccᚑlibᚋschemaᚐSeries(ctx context.Context, sel ast.SelectionSet, v schema.Series) graphql.Marshaler { +func (ec *executionContext) marshalNSeries2githubᚗcomᚋClusterCockpitᚋccᚑlibᚋv2ᚋschemaᚐSeries(ctx context.Context, sel ast.SelectionSet, v schema.Series) graphql.Marshaler { return ec._Series(ctx, sel, &v) } @@ -18764,7 +18749,7 @@ func (ec *executionContext) marshalNString2string(ctx context.Context, sel ast.S res := graphql.MarshalString(v) if res == graphql.Null { if !graphql.HasFieldError(ctx, graphql.GetFieldContext(ctx)) { - ec.Errorf(ctx, "the requested element is null which the schema does not allow") + graphql.AddErrorf(ctx, "the requested element is null which the schema does not allow") } } return res @@ -18800,40 +18785,12 @@ func (ec *executionContext) marshalNString2ᚕstringᚄ(ctx context.Context, sel return ret } -func (ec *executionContext) marshalNSubCluster2ᚕᚖgithubᚗcomᚋClusterCockpitᚋccᚑlibᚋschemaᚐSubClusterᚄ(ctx context.Context, sel ast.SelectionSet, v []*schema.SubCluster) graphql.Marshaler { - ret := make(graphql.Array, len(v)) - var wg sync.WaitGroup - isLen1 := len(v) == 1 - if !isLen1 { - wg.Add(len(v)) - } - for i := range v { - i := i - fc := &graphql.FieldContext{ - Index: &i, - Result: &v[i], - } - ctx := graphql.WithFieldContext(ctx, fc) - f := func(i int) { - defer func() { - if r := recover(); r != nil { - ec.Error(ctx, ec.Recover(ctx, r)) - ret = nil - } - }() - if !isLen1 { - defer 
wg.Done() - } - ret[i] = ec.marshalNSubCluster2ᚖgithubᚗcomᚋClusterCockpitᚋccᚑlibᚋschemaᚐSubCluster(ctx, sel, v[i]) - } - if isLen1 { - f(i) - } else { - go f(i) - } - - } - wg.Wait() +func (ec *executionContext) marshalNSubCluster2ᚕᚖgithubᚗcomᚋClusterCockpitᚋccᚑlibᚋv2ᚋschemaᚐSubClusterᚄ(ctx context.Context, sel ast.SelectionSet, v []*schema.SubCluster) graphql.Marshaler { + ret := graphql.MarshalSliceConcurrently(ctx, len(v), 0, false, func(ctx context.Context, i int) graphql.Marshaler { + fc := graphql.GetFieldContext(ctx) + fc.Result = &v[i] + return ec.marshalNSubCluster2ᚖgithubᚗcomᚋClusterCockpitᚋccᚑlibᚋv2ᚋschemaᚐSubCluster(ctx, sel, v[i]) + }) for _, e := range ret { if e == graphql.Null { @@ -18844,50 +18801,22 @@ func (ec *executionContext) marshalNSubCluster2ᚕᚖgithubᚗcomᚋClusterCockp return ret } -func (ec *executionContext) marshalNSubCluster2ᚖgithubᚗcomᚋClusterCockpitᚋccᚑlibᚋschemaᚐSubCluster(ctx context.Context, sel ast.SelectionSet, v *schema.SubCluster) graphql.Marshaler { +func (ec *executionContext) marshalNSubCluster2ᚖgithubᚗcomᚋClusterCockpitᚋccᚑlibᚋv2ᚋschemaᚐSubCluster(ctx context.Context, sel ast.SelectionSet, v *schema.SubCluster) graphql.Marshaler { if v == nil { if !graphql.HasFieldError(ctx, graphql.GetFieldContext(ctx)) { - ec.Errorf(ctx, "the requested element is null which the schema does not allow") + graphql.AddErrorf(ctx, "the requested element is null which the schema does not allow") } return graphql.Null } return ec._SubCluster(ctx, sel, v) } -func (ec *executionContext) marshalNSubClusterConfig2ᚕᚖgithubᚗcomᚋClusterCockpitᚋccᚑlibᚋschemaᚐSubClusterConfigᚄ(ctx context.Context, sel ast.SelectionSet, v []*schema.SubClusterConfig) graphql.Marshaler { - ret := make(graphql.Array, len(v)) - var wg sync.WaitGroup - isLen1 := len(v) == 1 - if !isLen1 { - wg.Add(len(v)) - } - for i := range v { - i := i - fc := &graphql.FieldContext{ - Index: &i, - Result: &v[i], - } - ctx := graphql.WithFieldContext(ctx, fc) - f := func(i int) { - defer 
func() { - if r := recover(); r != nil { - ec.Error(ctx, ec.Recover(ctx, r)) - ret = nil - } - }() - if !isLen1 { - defer wg.Done() - } - ret[i] = ec.marshalNSubClusterConfig2ᚖgithubᚗcomᚋClusterCockpitᚋccᚑlibᚋschemaᚐSubClusterConfig(ctx, sel, v[i]) - } - if isLen1 { - f(i) - } else { - go f(i) - } - - } - wg.Wait() +func (ec *executionContext) marshalNSubClusterConfig2ᚕᚖgithubᚗcomᚋClusterCockpitᚋccᚑlibᚋv2ᚋschemaᚐSubClusterConfigᚄ(ctx context.Context, sel ast.SelectionSet, v []*schema.SubClusterConfig) graphql.Marshaler { + ret := graphql.MarshalSliceConcurrently(ctx, len(v), 0, false, func(ctx context.Context, i int) graphql.Marshaler { + fc := graphql.GetFieldContext(ctx) + fc.Result = &v[i] + return ec.marshalNSubClusterConfig2ᚖgithubᚗcomᚋClusterCockpitᚋccᚑlibᚋv2ᚋschemaᚐSubClusterConfig(ctx, sel, v[i]) + }) for _, e := range ret { if e == graphql.Null { @@ -18898,54 +18827,26 @@ func (ec *executionContext) marshalNSubClusterConfig2ᚕᚖgithubᚗcomᚋCluste return ret } -func (ec *executionContext) marshalNSubClusterConfig2ᚖgithubᚗcomᚋClusterCockpitᚋccᚑlibᚋschemaᚐSubClusterConfig(ctx context.Context, sel ast.SelectionSet, v *schema.SubClusterConfig) graphql.Marshaler { +func (ec *executionContext) marshalNSubClusterConfig2ᚖgithubᚗcomᚋClusterCockpitᚋccᚑlibᚋv2ᚋschemaᚐSubClusterConfig(ctx context.Context, sel ast.SelectionSet, v *schema.SubClusterConfig) graphql.Marshaler { if v == nil { if !graphql.HasFieldError(ctx, graphql.GetFieldContext(ctx)) { - ec.Errorf(ctx, "the requested element is null which the schema does not allow") + graphql.AddErrorf(ctx, "the requested element is null which the schema does not allow") } return graphql.Null } return ec._SubClusterConfig(ctx, sel, v) } -func (ec *executionContext) marshalNTag2githubᚗcomᚋClusterCockpitᚋccᚑlibᚋschemaᚐTag(ctx context.Context, sel ast.SelectionSet, v schema.Tag) graphql.Marshaler { +func (ec *executionContext) marshalNTag2githubᚗcomᚋClusterCockpitᚋccᚑlibᚋv2ᚋschemaᚐTag(ctx context.Context, sel ast.SelectionSet, v 
schema.Tag) graphql.Marshaler { return ec._Tag(ctx, sel, &v) } -func (ec *executionContext) marshalNTag2ᚕᚖgithubᚗcomᚋClusterCockpitᚋccᚑlibᚋschemaᚐTagᚄ(ctx context.Context, sel ast.SelectionSet, v []*schema.Tag) graphql.Marshaler { - ret := make(graphql.Array, len(v)) - var wg sync.WaitGroup - isLen1 := len(v) == 1 - if !isLen1 { - wg.Add(len(v)) - } - for i := range v { - i := i - fc := &graphql.FieldContext{ - Index: &i, - Result: &v[i], - } - ctx := graphql.WithFieldContext(ctx, fc) - f := func(i int) { - defer func() { - if r := recover(); r != nil { - ec.Error(ctx, ec.Recover(ctx, r)) - ret = nil - } - }() - if !isLen1 { - defer wg.Done() - } - ret[i] = ec.marshalNTag2ᚖgithubᚗcomᚋClusterCockpitᚋccᚑlibᚋschemaᚐTag(ctx, sel, v[i]) - } - if isLen1 { - f(i) - } else { - go f(i) - } - - } - wg.Wait() +func (ec *executionContext) marshalNTag2ᚕᚖgithubᚗcomᚋClusterCockpitᚋccᚑlibᚋv2ᚋschemaᚐTagᚄ(ctx context.Context, sel ast.SelectionSet, v []*schema.Tag) graphql.Marshaler { + ret := graphql.MarshalSliceConcurrently(ctx, len(v), 0, false, func(ctx context.Context, i int) graphql.Marshaler { + fc := graphql.GetFieldContext(ctx) + fc.Result = &v[i] + return ec.marshalNTag2ᚖgithubᚗcomᚋClusterCockpitᚋccᚑlibᚋv2ᚋschemaᚐTag(ctx, sel, v[i]) + }) for _, e := range ret { if e == graphql.Null { @@ -18956,10 +18857,10 @@ func (ec *executionContext) marshalNTag2ᚕᚖgithubᚗcomᚋClusterCockpitᚋcc return ret } -func (ec *executionContext) marshalNTag2ᚖgithubᚗcomᚋClusterCockpitᚋccᚑlibᚋschemaᚐTag(ctx context.Context, sel ast.SelectionSet, v *schema.Tag) graphql.Marshaler { +func (ec *executionContext) marshalNTag2ᚖgithubᚗcomᚋClusterCockpitᚋccᚑlibᚋv2ᚋschemaᚐTag(ctx context.Context, sel ast.SelectionSet, v *schema.Tag) graphql.Marshaler { if v == nil { if !graphql.HasFieldError(ctx, graphql.GetFieldContext(ctx)) { - ec.Errorf(ctx, "the requested element is null which the schema does not allow") + graphql.AddErrorf(ctx, "the requested element is null which the schema does not allow") } return 
graphql.Null } @@ -18976,7 +18877,7 @@ func (ec *executionContext) marshalNTime2timeᚐTime(ctx context.Context, sel as res := graphql.MarshalTime(v) if res == graphql.Null { if !graphql.HasFieldError(ctx, graphql.GetFieldContext(ctx)) { - ec.Errorf(ctx, "the requested element is null which the schema does not allow") + graphql.AddErrorf(ctx, "the requested element is null which the schema does not allow") } } return res @@ -18990,7 +18891,7 @@ func (ec *executionContext) unmarshalNTime2ᚖtimeᚐTime(ctx context.Context, v func (ec *executionContext) marshalNTime2ᚖtimeᚐTime(ctx context.Context, sel ast.SelectionSet, v *time.Time) graphql.Marshaler { if v == nil { if !graphql.HasFieldError(ctx, graphql.GetFieldContext(ctx)) { - ec.Errorf(ctx, "the requested element is null which the schema does not allow") + graphql.AddErrorf(ctx, "the requested element is null which the schema does not allow") } return graphql.Null } @@ -18998,7 +18899,7 @@ func (ec *executionContext) marshalNTime2ᚖtimeᚐTime(ctx context.Context, sel res := graphql.MarshalTime(*v) if res == graphql.Null { if !graphql.HasFieldError(ctx, graphql.GetFieldContext(ctx)) { - ec.Errorf(ctx, "the requested element is null which the schema does not allow") + graphql.AddErrorf(ctx, "the requested element is null which the schema does not allow") } } return res @@ -19007,18 +18908,18 @@ func (ec *executionContext) marshalNTime2ᚖtimeᚐTime(ctx context.Context, sel func (ec *executionContext) marshalNTimeWeights2ᚖgithubᚗcomᚋClusterCockpitᚋccᚑbackendᚋinternalᚋgraphᚋmodelᚐTimeWeights(ctx context.Context, sel ast.SelectionSet, v *model.TimeWeights) graphql.Marshaler { if v == nil { if !graphql.HasFieldError(ctx, graphql.GetFieldContext(ctx)) { - ec.Errorf(ctx, "the requested element is null which the schema does not allow") + graphql.AddErrorf(ctx, "the requested element is null which the schema does not allow") } return graphql.Null } return ec._TimeWeights(ctx, sel, v) } -func (ec *executionContext) 
marshalNTopology2githubᚗcomᚋClusterCockpitᚋccᚑlibᚋschemaᚐTopology(ctx context.Context, sel ast.SelectionSet, v schema.Topology) graphql.Marshaler { +func (ec *executionContext) marshalNTopology2githubᚗcomᚋClusterCockpitᚋccᚑlibᚋv2ᚋschemaᚐTopology(ctx context.Context, sel ast.SelectionSet, v schema.Topology) graphql.Marshaler { return ec._Topology(ctx, sel, &v) } -func (ec *executionContext) marshalNUnit2githubᚗcomᚋClusterCockpitᚋccᚑlibᚋschemaᚐUnit(ctx context.Context, sel ast.SelectionSet, v schema.Unit) graphql.Marshaler { +func (ec *executionContext) marshalNUnit2githubᚗcomᚋClusterCockpitᚋccᚑlibᚋv2ᚋschemaᚐUnit(ctx context.Context, sel ast.SelectionSet, v schema.Unit) graphql.Marshaler { return ec._Unit(ctx, sel, &v) } @@ -19027,39 +18928,11 @@ func (ec *executionContext) marshalN__Directive2githubᚗcomᚋ99designsᚋgqlge } func (ec *executionContext) marshalN__Directive2ᚕgithubᚗcomᚋ99designsᚋgqlgenᚋgraphqlᚋintrospectionᚐDirectiveᚄ(ctx context.Context, sel ast.SelectionSet, v []introspection.Directive) graphql.Marshaler { - ret := make(graphql.Array, len(v)) - var wg sync.WaitGroup - isLen1 := len(v) == 1 - if !isLen1 { - wg.Add(len(v)) - } - for i := range v { - i := i - fc := &graphql.FieldContext{ - Index: &i, - Result: &v[i], - } - ctx := graphql.WithFieldContext(ctx, fc) - f := func(i int) { - defer func() { - if r := recover(); r != nil { - ec.Error(ctx, ec.Recover(ctx, r)) - ret = nil - } - }() - if !isLen1 { - defer wg.Done() - } - ret[i] = ec.marshalN__Directive2githubᚗcomᚋ99designsᚋgqlgenᚋgraphqlᚋintrospectionᚐDirective(ctx, sel, v[i]) - } - if isLen1 { - f(i) - } else { - go f(i) - } - - } - wg.Wait() + ret := graphql.MarshalSliceConcurrently(ctx, len(v), 0, false, func(ctx context.Context, i int) graphql.Marshaler { + fc := graphql.GetFieldContext(ctx) + fc.Result = &v[i] + return ec.marshalN__Directive2githubᚗcomᚋ99designsᚋgqlgenᚋgraphqlᚋintrospectionᚐDirective(ctx, sel, v[i]) + }) for _, e := range ret { if e == graphql.Null { @@ -19080,7 +18953,7 @@ func 
(ec *executionContext) marshalN__DirectiveLocation2string(ctx context.Conte res := graphql.MarshalString(v) if res == graphql.Null { if !graphql.HasFieldError(ctx, graphql.GetFieldContext(ctx)) { - ec.Errorf(ctx, "the requested element is null which the schema does not allow") + graphql.AddErrorf(ctx, "the requested element is null which the schema does not allow") } } return res @@ -19102,39 +18975,11 @@ func (ec *executionContext) unmarshalN__DirectiveLocation2ᚕstringᚄ(ctx conte } func (ec *executionContext) marshalN__DirectiveLocation2ᚕstringᚄ(ctx context.Context, sel ast.SelectionSet, v []string) graphql.Marshaler { - ret := make(graphql.Array, len(v)) - var wg sync.WaitGroup - isLen1 := len(v) == 1 - if !isLen1 { - wg.Add(len(v)) - } - for i := range v { - i := i - fc := &graphql.FieldContext{ - Index: &i, - Result: &v[i], - } - ctx := graphql.WithFieldContext(ctx, fc) - f := func(i int) { - defer func() { - if r := recover(); r != nil { - ec.Error(ctx, ec.Recover(ctx, r)) - ret = nil - } - }() - if !isLen1 { - defer wg.Done() - } - ret[i] = ec.marshalN__DirectiveLocation2string(ctx, sel, v[i]) - } - if isLen1 { - f(i) - } else { - go f(i) - } - - } - wg.Wait() + ret := graphql.MarshalSliceConcurrently(ctx, len(v), 0, false, func(ctx context.Context, i int) graphql.Marshaler { + fc := graphql.GetFieldContext(ctx) + fc.Result = &v[i] + return ec.marshalN__DirectiveLocation2string(ctx, sel, v[i]) + }) for _, e := range ret { if e == graphql.Null { @@ -19158,39 +19003,11 @@ func (ec *executionContext) marshalN__InputValue2githubᚗcomᚋ99designsᚋgqlg } func (ec *executionContext) marshalN__InputValue2ᚕgithubᚗcomᚋ99designsᚋgqlgenᚋgraphqlᚋintrospectionᚐInputValueᚄ(ctx context.Context, sel ast.SelectionSet, v []introspection.InputValue) graphql.Marshaler { - ret := make(graphql.Array, len(v)) - var wg sync.WaitGroup - isLen1 := len(v) == 1 - if !isLen1 { - wg.Add(len(v)) - } - for i := range v { - i := i - fc := &graphql.FieldContext{ - Index: &i, - Result: &v[i], - } 
- ctx := graphql.WithFieldContext(ctx, fc) - f := func(i int) { - defer func() { - if r := recover(); r != nil { - ec.Error(ctx, ec.Recover(ctx, r)) - ret = nil - } - }() - if !isLen1 { - defer wg.Done() - } - ret[i] = ec.marshalN__InputValue2githubᚗcomᚋ99designsᚋgqlgenᚋgraphqlᚋintrospectionᚐInputValue(ctx, sel, v[i]) - } - if isLen1 { - f(i) - } else { - go f(i) - } - - } - wg.Wait() + ret := graphql.MarshalSliceConcurrently(ctx, len(v), 0, false, func(ctx context.Context, i int) graphql.Marshaler { + fc := graphql.GetFieldContext(ctx) + fc.Result = &v[i] + return ec.marshalN__InputValue2githubᚗcomᚋ99designsᚋgqlgenᚋgraphqlᚋintrospectionᚐInputValue(ctx, sel, v[i]) + }) for _, e := range ret { if e == graphql.Null { @@ -19206,39 +19023,11 @@ func (ec *executionContext) marshalN__Type2githubᚗcomᚋ99designsᚋgqlgenᚋg } func (ec *executionContext) marshalN__Type2ᚕgithubᚗcomᚋ99designsᚋgqlgenᚋgraphqlᚋintrospectionᚐTypeᚄ(ctx context.Context, sel ast.SelectionSet, v []introspection.Type) graphql.Marshaler { - ret := make(graphql.Array, len(v)) - var wg sync.WaitGroup - isLen1 := len(v) == 1 - if !isLen1 { - wg.Add(len(v)) - } - for i := range v { - i := i - fc := &graphql.FieldContext{ - Index: &i, - Result: &v[i], - } - ctx := graphql.WithFieldContext(ctx, fc) - f := func(i int) { - defer func() { - if r := recover(); r != nil { - ec.Error(ctx, ec.Recover(ctx, r)) - ret = nil - } - }() - if !isLen1 { - defer wg.Done() - } - ret[i] = ec.marshalN__Type2githubᚗcomᚋ99designsᚋgqlgenᚋgraphqlᚋintrospectionᚐType(ctx, sel, v[i]) - } - if isLen1 { - f(i) - } else { - go f(i) - } - - } - wg.Wait() + ret := graphql.MarshalSliceConcurrently(ctx, len(v), 0, false, func(ctx context.Context, i int) graphql.Marshaler { + fc := graphql.GetFieldContext(ctx) + fc.Result = &v[i] + return ec.marshalN__Type2githubᚗcomᚋ99designsᚋgqlgenᚋgraphqlᚋintrospectionᚐType(ctx, sel, v[i]) + }) for _, e := range ret { if e == graphql.Null { @@ -19252,7 +19041,7 @@ func (ec *executionContext) 
marshalN__Type2ᚕgithubᚗcomᚋ99designsᚋgqlgen func (ec *executionContext) marshalN__Type2ᚖgithubᚗcomᚋ99designsᚋgqlgenᚋgraphqlᚋintrospectionᚐType(ctx context.Context, sel ast.SelectionSet, v *introspection.Type) graphql.Marshaler { if v == nil { if !graphql.HasFieldError(ctx, graphql.GetFieldContext(ctx)) { - ec.Errorf(ctx, "the requested element is null which the schema does not allow") + graphql.AddErrorf(ctx, "the requested element is null which the schema does not allow") } return graphql.Null } @@ -19269,49 +19058,21 @@ func (ec *executionContext) marshalN__TypeKind2string(ctx context.Context, sel a res := graphql.MarshalString(v) if res == graphql.Null { if !graphql.HasFieldError(ctx, graphql.GetFieldContext(ctx)) { - ec.Errorf(ctx, "the requested element is null which the schema does not allow") + graphql.AddErrorf(ctx, "the requested element is null which the schema does not allow") } } return res } -func (ec *executionContext) marshalOAccelerator2ᚕᚖgithubᚗcomᚋClusterCockpitᚋccᚑlibᚋschemaᚐAcceleratorᚄ(ctx context.Context, sel ast.SelectionSet, v []*schema.Accelerator) graphql.Marshaler { +func (ec *executionContext) marshalOAccelerator2ᚕᚖgithubᚗcomᚋClusterCockpitᚋccᚑlibᚋv2ᚋschemaᚐAcceleratorᚄ(ctx context.Context, sel ast.SelectionSet, v []*schema.Accelerator) graphql.Marshaler { if v == nil { return graphql.Null } - ret := make(graphql.Array, len(v)) - var wg sync.WaitGroup - isLen1 := len(v) == 1 - if !isLen1 { - wg.Add(len(v)) - } - for i := range v { - i := i - fc := &graphql.FieldContext{ - Index: &i, - Result: &v[i], - } - ctx := graphql.WithFieldContext(ctx, fc) - f := func(i int) { - defer func() { - if r := recover(); r != nil { - ec.Error(ctx, ec.Recover(ctx, r)) - ret = nil - } - }() - if !isLen1 { - defer wg.Done() - } - ret[i] = ec.marshalNAccelerator2ᚖgithubᚗcomᚋClusterCockpitᚋccᚑlibᚋschemaᚐAccelerator(ctx, sel, v[i]) - } - if isLen1 { - f(i) - } else { - go f(i) - } - - } - wg.Wait() + ret := graphql.MarshalSliceConcurrently(ctx, len(v), 0, 
false, func(ctx context.Context, i int) graphql.Marshaler { + fc := graphql.GetFieldContext(ctx) + fc.Result = &v[i] + return ec.marshalNAccelerator2ᚖgithubᚗcomᚋClusterCockpitᚋccᚑlibᚋv2ᚋschemaᚐAccelerator(ctx, sel, v[i]) + }) for _, e := range ret { if e == graphql.Null { @@ -19326,7 +19087,7 @@ func (ec *executionContext) unmarshalOAggregate2ᚖgithubᚗcomᚋClusterCockpit if v == nil { return nil, nil } - res := new(model.Aggregate) + var res = new(model.Aggregate) err := res.UnmarshalGQL(v) return res, graphql.ErrorOnPath(ctx, err) } @@ -19390,39 +19151,11 @@ func (ec *executionContext) marshalOEnergyFootprintValue2ᚕᚖgithubᚗcomᚋCl if v == nil { return graphql.Null } - ret := make(graphql.Array, len(v)) - var wg sync.WaitGroup - isLen1 := len(v) == 1 - if !isLen1 { - wg.Add(len(v)) - } - for i := range v { - i := i - fc := &graphql.FieldContext{ - Index: &i, - Result: &v[i], - } - ctx := graphql.WithFieldContext(ctx, fc) - f := func(i int) { - defer func() { - if r := recover(); r != nil { - ec.Error(ctx, ec.Recover(ctx, r)) - ret = nil - } - }() - if !isLen1 { - defer wg.Done() - } - ret[i] = ec.marshalOEnergyFootprintValue2ᚖgithubᚗcomᚋClusterCockpitᚋccᚑbackendᚋinternalᚋgraphᚋmodelᚐEnergyFootprintValue(ctx, sel, v[i]) - } - if isLen1 { - f(i) - } else { - go f(i) - } - - } - wg.Wait() + ret := graphql.MarshalSliceConcurrently(ctx, len(v), 0, false, func(ctx context.Context, i int) graphql.Marshaler { + fc := graphql.GetFieldContext(ctx) + fc.Result = &v[i] + return ec.marshalOEnergyFootprintValue2ᚖgithubᚗcomᚋClusterCockpitᚋccᚑbackendᚋinternalᚋgraphᚋmodelᚐEnergyFootprintValue(ctx, sel, v[i]) + }) return ret } @@ -19457,39 +19190,11 @@ func (ec *executionContext) marshalOFootprintValue2ᚕᚖgithubᚗcomᚋClusterC if v == nil { return graphql.Null } - ret := make(graphql.Array, len(v)) - var wg sync.WaitGroup - isLen1 := len(v) == 1 - if !isLen1 { - wg.Add(len(v)) - } - for i := range v { - i := i - fc := &graphql.FieldContext{ - Index: &i, - Result: &v[i], - } - ctx := 
graphql.WithFieldContext(ctx, fc) - f := func(i int) { - defer func() { - if r := recover(); r != nil { - ec.Error(ctx, ec.Recover(ctx, r)) - ret = nil - } - }() - if !isLen1 { - defer wg.Done() - } - ret[i] = ec.marshalOFootprintValue2ᚖgithubᚗcomᚋClusterCockpitᚋccᚑbackendᚋinternalᚋgraphᚋmodelᚐFootprintValue(ctx, sel, v[i]) - } - if isLen1 { - f(i) - } else { - go f(i) - } - - } - wg.Wait() + ret := graphql.MarshalSliceConcurrently(ctx, len(v), 0, false, func(ctx context.Context, i int) graphql.Marshaler { + fc := graphql.GetFieldContext(ctx) + fc.Result = &v[i] + return ec.marshalOFootprintValue2ᚖgithubᚗcomᚋClusterCockpitᚋccᚑbackendᚋinternalᚋgraphᚋmodelᚐFootprintValue(ctx, sel, v[i]) + }) return ret } @@ -19690,7 +19395,7 @@ func (ec *executionContext) unmarshalOIntRange2ᚖgithubᚗcomᚋClusterCockpit return &res, graphql.ErrorOnPath(ctx, err) } -func (ec *executionContext) marshalOJob2ᚖgithubᚗcomᚋClusterCockpitᚋccᚑlibᚋschemaᚐJob(ctx context.Context, sel ast.SelectionSet, v *schema.Job) graphql.Marshaler { +func (ec *executionContext) marshalOJob2ᚖgithubᚗcomᚋClusterCockpitᚋccᚑlibᚋv2ᚋschemaᚐJob(ctx context.Context, sel ast.SelectionSet, v *schema.Job) graphql.Marshaler { if v == nil { return graphql.Null } @@ -19722,7 +19427,7 @@ func (ec *executionContext) marshalOJobLinkResultList2ᚖgithubᚗcomᚋClusterC return ec._JobLinkResultList(ctx, sel, v) } -func (ec *executionContext) unmarshalOJobState2ᚕgithubᚗcomᚋClusterCockpitᚋccᚑlibᚋschemaᚐJobStateᚄ(ctx context.Context, v any) ([]schema.JobState, error) { +func (ec *executionContext) unmarshalOJobState2ᚕgithubᚗcomᚋClusterCockpitᚋccᚑlibᚋv2ᚋschemaᚐJobStateᚄ(ctx context.Context, v any) ([]schema.JobState, error) { if v == nil { return nil, nil } @@ -19732,7 +19437,7 @@ func (ec *executionContext) unmarshalOJobState2ᚕgithubᚗcomᚋClusterCockpit res := make([]schema.JobState, len(vSlice)) for i := range vSlice { ctx := graphql.WithPathContext(ctx, graphql.NewPathWithIndex(i)) - res[i], err = 
ec.unmarshalNJobState2githubᚗcomᚋClusterCockpitᚋccᚑlibᚋschemaᚐJobState(ctx, vSlice[i]) + res[i], err = ec.unmarshalNJobState2githubᚗcomᚋClusterCockpitᚋccᚑlibᚋv2ᚋschemaᚐJobState(ctx, vSlice[i]) if err != nil { return nil, err } @@ -19740,13 +19445,13 @@ func (ec *executionContext) unmarshalOJobState2ᚕgithubᚗcomᚋClusterCockpit return res, nil } -func (ec *executionContext) marshalOJobState2ᚕgithubᚗcomᚋClusterCockpitᚋccᚑlibᚋschemaᚐJobStateᚄ(ctx context.Context, sel ast.SelectionSet, v []schema.JobState) graphql.Marshaler { +func (ec *executionContext) marshalOJobState2ᚕgithubᚗcomᚋClusterCockpitᚋccᚑlibᚋv2ᚋschemaᚐJobStateᚄ(ctx context.Context, sel ast.SelectionSet, v []schema.JobState) graphql.Marshaler { if v == nil { return graphql.Null } ret := make(graphql.Array, len(v)) for i := range v { - ret[i] = ec.marshalNJobState2githubᚗcomᚋClusterCockpitᚋccᚑlibᚋschemaᚐJobState(ctx, sel, v[i]) + ret[i] = ec.marshalNJobState2githubᚗcomᚋClusterCockpitᚋccᚑlibᚋv2ᚋschemaᚐJobState(ctx, sel, v[i]) } for _, e := range ret { @@ -19762,39 +19467,11 @@ func (ec *executionContext) marshalOMetricHistoPoint2ᚕᚖgithubᚗcomᚋCluste if v == nil { return graphql.Null } - ret := make(graphql.Array, len(v)) - var wg sync.WaitGroup - isLen1 := len(v) == 1 - if !isLen1 { - wg.Add(len(v)) - } - for i := range v { - i := i - fc := &graphql.FieldContext{ - Index: &i, - Result: &v[i], - } - ctx := graphql.WithFieldContext(ctx, fc) - f := func(i int) { - defer func() { - if r := recover(); r != nil { - ec.Error(ctx, ec.Recover(ctx, r)) - ret = nil - } - }() - if !isLen1 { - defer wg.Done() - } - ret[i] = ec.marshalNMetricHistoPoint2ᚖgithubᚗcomᚋClusterCockpitᚋccᚑbackendᚋinternalᚋgraphᚋmodelᚐMetricHistoPoint(ctx, sel, v[i]) - } - if isLen1 { - f(i) - } else { - go f(i) - } - - } - wg.Wait() + ret := graphql.MarshalSliceConcurrently(ctx, len(v), 0, false, func(ctx context.Context, i int) graphql.Marshaler { + fc := graphql.GetFieldContext(ctx) + fc.Result = &v[i] + return 
ec.marshalNMetricHistoPoint2ᚖgithubᚗcomᚋClusterCockpitᚋccᚑbackendᚋinternalᚋgraphᚋmodelᚐMetricHistoPoint(ctx, sel, v[i]) + }) for _, e := range ret { if e == graphql.Null { @@ -19805,7 +19482,7 @@ func (ec *executionContext) marshalOMetricHistoPoint2ᚕᚖgithubᚗcomᚋCluste return ret } -func (ec *executionContext) unmarshalOMetricScope2ᚕgithubᚗcomᚋClusterCockpitᚋccᚑlibᚋschemaᚐMetricScopeᚄ(ctx context.Context, v any) ([]schema.MetricScope, error) { +func (ec *executionContext) unmarshalOMetricScope2ᚕgithubᚗcomᚋClusterCockpitᚋccᚑlibᚋv2ᚋschemaᚐMetricScopeᚄ(ctx context.Context, v any) ([]schema.MetricScope, error) { if v == nil { return nil, nil } @@ -19815,7 +19492,7 @@ func (ec *executionContext) unmarshalOMetricScope2ᚕgithubᚗcomᚋClusterCockp res := make([]schema.MetricScope, len(vSlice)) for i := range vSlice { ctx := graphql.WithPathContext(ctx, graphql.NewPathWithIndex(i)) - res[i], err = ec.unmarshalNMetricScope2githubᚗcomᚋClusterCockpitᚋccᚑlibᚋschemaᚐMetricScope(ctx, vSlice[i]) + res[i], err = ec.unmarshalNMetricScope2githubᚗcomᚋClusterCockpitᚋccᚑlibᚋv2ᚋschemaᚐMetricScope(ctx, vSlice[i]) if err != nil { return nil, err } @@ -19823,13 +19500,13 @@ func (ec *executionContext) unmarshalOMetricScope2ᚕgithubᚗcomᚋClusterCockp return res, nil } -func (ec *executionContext) marshalOMetricScope2ᚕgithubᚗcomᚋClusterCockpitᚋccᚑlibᚋschemaᚐMetricScopeᚄ(ctx context.Context, sel ast.SelectionSet, v []schema.MetricScope) graphql.Marshaler { +func (ec *executionContext) marshalOMetricScope2ᚕgithubᚗcomᚋClusterCockpitᚋccᚑlibᚋv2ᚋschemaᚐMetricScopeᚄ(ctx context.Context, sel ast.SelectionSet, v []schema.MetricScope) graphql.Marshaler { if v == nil { return graphql.Null } ret := make(graphql.Array, len(v)) for i := range v { - ret[i] = ec.marshalNMetricScope2githubᚗcomᚋClusterCockpitᚋccᚑlibᚋschemaᚐMetricScope(ctx, sel, v[i]) + ret[i] = ec.marshalNMetricScope2githubᚗcomᚋClusterCockpitᚋccᚑlibᚋv2ᚋschemaᚐMetricScope(ctx, sel, v[i]) } for _, e := range ret { @@ -19859,7 +19536,7 @@ func (ec 
*executionContext) unmarshalOMetricStatItem2ᚕᚖgithubᚗcomᚋCluste return res, nil } -func (ec *executionContext) marshalOMetricStatistics2githubᚗcomᚋClusterCockpitᚋccᚑlibᚋschemaᚐMetricStatistics(ctx context.Context, sel ast.SelectionSet, v schema.MetricStatistics) graphql.Marshaler { +func (ec *executionContext) marshalOMetricStatistics2githubᚗcomᚋClusterCockpitᚋccᚑlibᚋv2ᚋschemaᚐMetricStatistics(ctx context.Context, sel ast.SelectionSet, v schema.MetricStatistics) graphql.Marshaler { return ec._MetricStatistics(ctx, sel, &v) } @@ -19881,7 +19558,7 @@ func (ec *executionContext) marshalOMonitoringState2ᚖstring(ctx context.Contex return res } -func (ec *executionContext) marshalONode2ᚖgithubᚗcomᚋClusterCockpitᚋccᚑlibᚋschemaᚐNode(ctx context.Context, sel ast.SelectionSet, v *schema.Node) graphql.Marshaler { +func (ec *executionContext) marshalONode2ᚖgithubᚗcomᚋClusterCockpitᚋccᚑlibᚋv2ᚋschemaᚐNode(ctx context.Context, sel ast.SelectionSet, v *schema.Node) graphql.Marshaler { if v == nil { return graphql.Null } @@ -19922,7 +19599,7 @@ func (ec *executionContext) unmarshalOPageRequest2ᚖgithubᚗcomᚋClusterCockp return &res, graphql.ErrorOnPath(ctx, err) } -func (ec *executionContext) unmarshalOSchedulerState2ᚖgithubᚗcomᚋClusterCockpitᚋccᚑlibᚋschemaᚐSchedulerState(ctx context.Context, v any) (*schema.SchedulerState, error) { +func (ec *executionContext) unmarshalOSchedulerState2ᚖgithubᚗcomᚋClusterCockpitᚋccᚑlibᚋv2ᚋschemaᚐSchedulerState(ctx context.Context, v any) (*schema.SchedulerState, error) { if v == nil { return nil, nil } @@ -19931,7 +19608,7 @@ func (ec *executionContext) unmarshalOSchedulerState2ᚖgithubᚗcomᚋClusterCo return &res, graphql.ErrorOnPath(ctx, err) } -func (ec *executionContext) marshalOSchedulerState2ᚖgithubᚗcomᚋClusterCockpitᚋccᚑlibᚋschemaᚐSchedulerState(ctx context.Context, sel ast.SelectionSet, v *schema.SchedulerState) graphql.Marshaler { +func (ec *executionContext) marshalOSchedulerState2ᚖgithubᚗcomᚋClusterCockpitᚋccᚑlibᚋv2ᚋschemaᚐSchedulerState(ctx 
context.Context, sel ast.SelectionSet, v *schema.SchedulerState) graphql.Marshaler { if v == nil { return graphql.Null } @@ -19941,43 +19618,15 @@ func (ec *executionContext) marshalOSchedulerState2ᚖgithubᚗcomᚋClusterCock return res } -func (ec *executionContext) marshalOSeries2ᚕgithubᚗcomᚋClusterCockpitᚋccᚑlibᚋschemaᚐSeriesᚄ(ctx context.Context, sel ast.SelectionSet, v []schema.Series) graphql.Marshaler { +func (ec *executionContext) marshalOSeries2ᚕgithubᚗcomᚋClusterCockpitᚋccᚑlibᚋv2ᚋschemaᚐSeriesᚄ(ctx context.Context, sel ast.SelectionSet, v []schema.Series) graphql.Marshaler { if v == nil { return graphql.Null } - ret := make(graphql.Array, len(v)) - var wg sync.WaitGroup - isLen1 := len(v) == 1 - if !isLen1 { - wg.Add(len(v)) - } - for i := range v { - i := i - fc := &graphql.FieldContext{ - Index: &i, - Result: &v[i], - } - ctx := graphql.WithFieldContext(ctx, fc) - f := func(i int) { - defer func() { - if r := recover(); r != nil { - ec.Error(ctx, ec.Recover(ctx, r)) - ret = nil - } - }() - if !isLen1 { - defer wg.Done() - } - ret[i] = ec.marshalNSeries2githubᚗcomᚋClusterCockpitᚋccᚑlibᚋschemaᚐSeries(ctx, sel, v[i]) - } - if isLen1 { - f(i) - } else { - go f(i) - } - - } - wg.Wait() + ret := graphql.MarshalSliceConcurrently(ctx, len(v), 0, false, func(ctx context.Context, i int) graphql.Marshaler { + fc := graphql.GetFieldContext(ctx) + fc.Result = &v[i] + return ec.marshalNSeries2githubᚗcomᚋClusterCockpitᚋccᚑlibᚋv2ᚋschemaᚐSeries(ctx, sel, v[i]) + }) for _, e := range ret { if e == graphql.Null { @@ -19992,7 +19641,7 @@ func (ec *executionContext) unmarshalOSortByAggregate2ᚖgithubᚗcomᚋClusterC if v == nil { return nil, nil } - res := new(model.SortByAggregate) + var res = new(model.SortByAggregate) err := res.UnmarshalGQL(v) return res, graphql.ErrorOnPath(ctx, err) } @@ -20004,7 +19653,7 @@ func (ec *executionContext) marshalOSortByAggregate2ᚖgithubᚗcomᚋClusterCoc return v } -func (ec *executionContext) 
marshalOStatsSeries2ᚖgithubᚗcomᚋClusterCockpitᚋccᚑlibᚋschemaᚐStatsSeries(ctx context.Context, sel ast.SelectionSet, v *schema.StatsSeries) graphql.Marshaler { +func (ec *executionContext) marshalOStatsSeries2ᚖgithubᚗcomᚋClusterCockpitᚋccᚑlibᚋv2ᚋschemaᚐStatsSeries(ctx context.Context, sel ast.SelectionSet, v *schema.StatsSeries) graphql.Marshaler { if v == nil { return graphql.Null } @@ -20111,10 +19760,17 @@ func (ec *executionContext) unmarshalOTimeRange2ᚖgithubᚗcomᚋClusterCockpit return &res, graphql.ErrorOnPath(ctx, err) } -func (ec *executionContext) marshalOUnit2githubᚗcomᚋClusterCockpitᚋccᚑlibᚋschemaᚐUnit(ctx context.Context, sel ast.SelectionSet, v schema.Unit) graphql.Marshaler { +func (ec *executionContext) marshalOUnit2githubᚗcomᚋClusterCockpitᚋccᚑlibᚋv2ᚋschemaᚐUnit(ctx context.Context, sel ast.SelectionSet, v schema.Unit) graphql.Marshaler { return ec._Unit(ctx, sel, &v) } +func (ec *executionContext) marshalOUnit2ᚖgithubᚗcomᚋClusterCockpitᚋccᚑlibᚋv2ᚋschemaᚐUnit(ctx context.Context, sel ast.SelectionSet, v *schema.Unit) graphql.Marshaler { + if v == nil { + return graphql.Null + } + return ec._Unit(ctx, sel, v) +} + func (ec *executionContext) marshalOUser2ᚖgithubᚗcomᚋClusterCockpitᚋccᚑbackendᚋinternalᚋgraphᚋmodelᚐUser(ctx context.Context, sel ast.SelectionSet, v *model.User) graphql.Marshaler { if v == nil { return graphql.Null @@ -20126,39 +19782,11 @@ func (ec *executionContext) marshalO__EnumValue2ᚕgithubᚗcomᚋ99designsᚋgq if v == nil { return graphql.Null } - ret := make(graphql.Array, len(v)) - var wg sync.WaitGroup - isLen1 := len(v) == 1 - if !isLen1 { - wg.Add(len(v)) - } - for i := range v { - i := i - fc := &graphql.FieldContext{ - Index: &i, - Result: &v[i], - } - ctx := graphql.WithFieldContext(ctx, fc) - f := func(i int) { - defer func() { - if r := recover(); r != nil { - ec.Error(ctx, ec.Recover(ctx, r)) - ret = nil - } - }() - if !isLen1 { - defer wg.Done() - } - ret[i] = 
ec.marshalN__EnumValue2githubᚗcomᚋ99designsᚋgqlgenᚋgraphqlᚋintrospectionᚐEnumValue(ctx, sel, v[i]) - } - if isLen1 { - f(i) - } else { - go f(i) - } - - } - wg.Wait() + ret := graphql.MarshalSliceConcurrently(ctx, len(v), 0, false, func(ctx context.Context, i int) graphql.Marshaler { + fc := graphql.GetFieldContext(ctx) + fc.Result = &v[i] + return ec.marshalN__EnumValue2githubᚗcomᚋ99designsᚋgqlgenᚋgraphqlᚋintrospectionᚐEnumValue(ctx, sel, v[i]) + }) for _, e := range ret { if e == graphql.Null { @@ -20173,39 +19801,11 @@ func (ec *executionContext) marshalO__Field2ᚕgithubᚗcomᚋ99designsᚋgqlgen if v == nil { return graphql.Null } - ret := make(graphql.Array, len(v)) - var wg sync.WaitGroup - isLen1 := len(v) == 1 - if !isLen1 { - wg.Add(len(v)) - } - for i := range v { - i := i - fc := &graphql.FieldContext{ - Index: &i, - Result: &v[i], - } - ctx := graphql.WithFieldContext(ctx, fc) - f := func(i int) { - defer func() { - if r := recover(); r != nil { - ec.Error(ctx, ec.Recover(ctx, r)) - ret = nil - } - }() - if !isLen1 { - defer wg.Done() - } - ret[i] = ec.marshalN__Field2githubᚗcomᚋ99designsᚋgqlgenᚋgraphqlᚋintrospectionᚐField(ctx, sel, v[i]) - } - if isLen1 { - f(i) - } else { - go f(i) - } - - } - wg.Wait() + ret := graphql.MarshalSliceConcurrently(ctx, len(v), 0, false, func(ctx context.Context, i int) graphql.Marshaler { + fc := graphql.GetFieldContext(ctx) + fc.Result = &v[i] + return ec.marshalN__Field2githubᚗcomᚋ99designsᚋgqlgenᚋgraphqlᚋintrospectionᚐField(ctx, sel, v[i]) + }) for _, e := range ret { if e == graphql.Null { @@ -20220,39 +19820,11 @@ func (ec *executionContext) marshalO__InputValue2ᚕgithubᚗcomᚋ99designsᚋg if v == nil { return graphql.Null } - ret := make(graphql.Array, len(v)) - var wg sync.WaitGroup - isLen1 := len(v) == 1 - if !isLen1 { - wg.Add(len(v)) - } - for i := range v { - i := i - fc := &graphql.FieldContext{ - Index: &i, - Result: &v[i], - } - ctx := graphql.WithFieldContext(ctx, fc) - f := func(i int) { - defer func() { - if r := 
recover(); r != nil { - ec.Error(ctx, ec.Recover(ctx, r)) - ret = nil - } - }() - if !isLen1 { - defer wg.Done() - } - ret[i] = ec.marshalN__InputValue2githubᚗcomᚋ99designsᚋgqlgenᚋgraphqlᚋintrospectionᚐInputValue(ctx, sel, v[i]) - } - if isLen1 { - f(i) - } else { - go f(i) - } - - } - wg.Wait() + ret := graphql.MarshalSliceConcurrently(ctx, len(v), 0, false, func(ctx context.Context, i int) graphql.Marshaler { + fc := graphql.GetFieldContext(ctx) + fc.Result = &v[i] + return ec.marshalN__InputValue2githubᚗcomᚋ99designsᚋgqlgenᚋgraphqlᚋintrospectionᚐInputValue(ctx, sel, v[i]) + }) for _, e := range ret { if e == graphql.Null { @@ -20274,39 +19846,11 @@ func (ec *executionContext) marshalO__Type2ᚕgithubᚗcomᚋ99designsᚋgqlgen if v == nil { return graphql.Null } - ret := make(graphql.Array, len(v)) - var wg sync.WaitGroup - isLen1 := len(v) == 1 - if !isLen1 { - wg.Add(len(v)) - } - for i := range v { - i := i - fc := &graphql.FieldContext{ - Index: &i, - Result: &v[i], - } - ctx := graphql.WithFieldContext(ctx, fc) - f := func(i int) { - defer func() { - if r := recover(); r != nil { - ec.Error(ctx, ec.Recover(ctx, r)) - ret = nil - } - }() - if !isLen1 { - defer wg.Done() - } - ret[i] = ec.marshalN__Type2githubᚗcomᚋ99designsᚋgqlgenᚋgraphqlᚋintrospectionᚐType(ctx, sel, v[i]) - } - if isLen1 { - f(i) - } else { - go f(i) - } - - } - wg.Wait() + ret := graphql.MarshalSliceConcurrently(ctx, len(v), 0, false, func(ctx context.Context, i int) graphql.Marshaler { + fc := graphql.GetFieldContext(ctx) + fc.Result = &v[i] + return ec.marshalN__Type2githubᚗcomᚋ99designsᚋgqlgenᚋgraphqlᚋintrospectionᚐType(ctx, sel, v[i]) + }) for _, e := range ret { if e == graphql.Null { diff --git a/internal/graph/model/models_gen.go b/internal/graph/model/models_gen.go index 4cb414eb..24b33847 100644 --- a/internal/graph/model/models_gen.go +++ b/internal/graph/model/models_gen.go @@ -10,9 +10,21 @@ import ( "time" "github.com/ClusterCockpit/cc-backend/internal/config" - 
"github.com/ClusterCockpit/cc-lib/schema" + "github.com/ClusterCockpit/cc-lib/v2/schema" ) +type ClusterMetricWithName struct { + Name string `json:"name"` + Unit *schema.Unit `json:"unit,omitempty"` + Timestep int `json:"timestep"` + Data []schema.Float `json:"data"` +} + +type ClusterMetrics struct { + NodeCount int `json:"nodeCount"` + Metrics []*ClusterMetricWithName `json:"metrics"` +} + type Count struct { Name string `json:"name"` Count int `json:"count"` @@ -59,6 +71,7 @@ type JobFilter struct { Project *StringInput `json:"project,omitempty"` JobName *StringInput `json:"jobName,omitempty"` Cluster *StringInput `json:"cluster,omitempty"` + SubCluster *StringInput `json:"subCluster,omitempty"` Partition *StringInput `json:"partition,omitempty"` Duration *config.IntRange `json:"duration,omitempty"` Energy *FloatRange `json:"energy,omitempty"` @@ -70,6 +83,7 @@ type JobFilter struct { State []schema.JobState `json:"state,omitempty"` MetricStats []*MetricStatItem `json:"metricStats,omitempty"` Shared *string `json:"shared,omitempty"` + Schedule *string `json:"schedule,omitempty"` Node *StringInput `json:"node,omitempty"` } @@ -173,7 +187,7 @@ type NamedStatsWithScope struct { type NodeFilter struct { Hostname *StringInput `json:"hostname,omitempty"` Cluster *StringInput `json:"cluster,omitempty"` - Subcluster *StringInput `json:"subcluster,omitempty"` + SubCluster *StringInput `json:"subCluster,omitempty"` SchedulerState *schema.SchedulerState `json:"schedulerState,omitempty"` HealthState *string `json:"healthState,omitempty"` TimeStart *int `json:"timeStart,omitempty"` diff --git a/internal/graph/resolver.go b/internal/graph/resolver.go index 990014c7..d1b04de6 100644 --- a/internal/graph/resolver.go +++ b/internal/graph/resolver.go @@ -4,7 +4,7 @@ import ( "sync" "github.com/ClusterCockpit/cc-backend/internal/repository" - cclog "github.com/ClusterCockpit/cc-lib/ccLogger" + cclog "github.com/ClusterCockpit/cc-lib/v2/ccLogger" "github.com/jmoiron/sqlx" ) diff 
--git a/internal/graph/schema.resolvers.go b/internal/graph/schema.resolvers.go index 75556938..4c398ee3 100644 --- a/internal/graph/schema.resolvers.go +++ b/internal/graph/schema.resolvers.go @@ -1,13 +1,15 @@ package graph -// This file will be automatically regenerated based on the schema, any resolver implementations +// This file will be automatically regenerated based on the schema, any resolver +// implementations // will be copied through when generating and any unknown code will be moved to the end. -// Code generated by github.com/99designs/gqlgen version v0.17.81 +// Code generated by github.com/99designs/gqlgen version v0.17.87 import ( "context" "errors" "fmt" + "math" "regexp" "slices" "strconv" @@ -17,11 +19,12 @@ import ( "github.com/ClusterCockpit/cc-backend/internal/config" "github.com/ClusterCockpit/cc-backend/internal/graph/generated" "github.com/ClusterCockpit/cc-backend/internal/graph/model" - "github.com/ClusterCockpit/cc-backend/internal/metricDataDispatcher" + "github.com/ClusterCockpit/cc-backend/internal/metricdispatch" "github.com/ClusterCockpit/cc-backend/internal/repository" "github.com/ClusterCockpit/cc-backend/pkg/archive" - cclog "github.com/ClusterCockpit/cc-lib/ccLogger" - "github.com/ClusterCockpit/cc-lib/schema" + cclog "github.com/ClusterCockpit/cc-lib/v2/ccLogger" + ccunit "github.com/ClusterCockpit/cc-lib/v2/ccUnits" + "github.com/ClusterCockpit/cc-lib/v2/schema" ) // Partitions is the resolver for the partitions field. 
@@ -86,14 +89,14 @@ func (r *jobResolver) EnergyFootprint(ctx context.Context, obj *schema.Job) ([]* res := []*model.EnergyFootprintValue{} for name, value := range rawEnergyFootprint { // Suboptimal: Nearly hardcoded metric name expectations - matchCpu := regexp.MustCompile(`cpu|Cpu|CPU`) + matchCPU := regexp.MustCompile(`cpu|Cpu|CPU`) matchAcc := regexp.MustCompile(`acc|Acc|ACC`) matchMem := regexp.MustCompile(`mem|Mem|MEM`) matchCore := regexp.MustCompile(`core|Core|CORE`) hwType := "" switch test := name; { // NOtice ';' for var declaration - case matchCpu.MatchString(test): + case matchCPU.MatchString(test): hwType = "CPU" case matchAcc.MatchString(test): hwType = "Accelerator" @@ -173,9 +176,9 @@ func (r *mutationResolver) AddTagsToJob(ctx context.Context, job string, tagIds } tags := []*schema.Tag{} - for _, tagId := range tagIds { + for _, tagID := range tagIds { // Get ID - tid, err := strconv.ParseInt(tagId, 10, 64) + tid, err := strconv.ParseInt(tagID, 10, 64) if err != nil { cclog.Warn("Error while parsing tag id") return nil, err @@ -220,9 +223,9 @@ func (r *mutationResolver) RemoveTagsFromJob(ctx context.Context, job string, ta } tags := []*schema.Tag{} - for _, tagId := range tagIds { + for _, tagID := range tagIds { // Get ID - tid, err := strconv.ParseInt(tagId, 10, 64) + tid, err := strconv.ParseInt(tagID, 10, 64) if err != nil { cclog.Warn("Error while parsing tag id") return nil, err @@ -263,9 +266,9 @@ func (r *mutationResolver) RemoveTagFromList(ctx context.Context, tagIds []strin } tags := []int{} - for _, tagId := range tagIds { + for _, tagID := range tagIds { // Get ID - tid, err := strconv.ParseInt(tagId, 10, 64) + tid, err := strconv.ParseInt(tagID, 10, 64) if err != nil { cclog.Warn("Error while parsing tag id for removal") return nil, err @@ -281,7 +284,7 @@ func (r *mutationResolver) RemoveTagFromList(ctx context.Context, tagIds []strin // Test Access: Admins && Admin Tag OR Everyone && Private Tag if user.HasRole(schema.RoleAdmin) && 
(tscope == "global" || tscope == "admin") || user.Username == tscope { // Remove from DB - if err = r.Repo.RemoveTagById(tid); err != nil { + if err = r.Repo.RemoveTagByID(tid); err != nil { cclog.Warn("Error while removing tag") return nil, err } else { @@ -315,18 +318,39 @@ func (r *nodeResolver) SchedulerState(ctx context.Context, obj *schema.Node) (sc if obj.NodeState != "" { return obj.NodeState, nil } else { - return "", fmt.Errorf("No SchedulerState (NodeState) on Object") + return "", fmt.Errorf("resolver: no SchedulerState (NodeState) on node object") } } // HealthState is the resolver for the healthState field. func (r *nodeResolver) HealthState(ctx context.Context, obj *schema.Node) (string, error) { - panic(fmt.Errorf("not implemented: HealthState - healthState")) + if obj.HealthState != "" { + return string(obj.HealthState), nil + } else { + return "", fmt.Errorf("resolver: no HealthState (NodeState) on node object") + } } // MetaData is the resolver for the metaData field. func (r *nodeResolver) MetaData(ctx context.Context, obj *schema.Node) (any, error) { - panic(fmt.Errorf("not implemented: MetaData - metaData")) + if obj.MetaData != nil { + return obj.MetaData, nil + } else { + cclog.Debug("resolver: no MetaData (NodeState) on node object") + emptyMeta := make(map[string]string, 0) + return emptyMeta, nil + } +} + +// HealthData is the resolver for the healthData field. +func (r *nodeResolver) HealthData(ctx context.Context, obj *schema.Node) (any, error) { + if obj.HealthData != nil { + return obj.HealthData, nil + } else { + cclog.Debug("resolver: no HealthData (NodeState) on node object") + emptyHealth := make(map[string][]string, 0) + return emptyHealth, nil + } } // Clusters is the resolver for the clusters field. @@ -341,6 +365,14 @@ func (r *queryResolver) Tags(ctx context.Context) ([]*schema.Tag, error) { // GlobalMetrics is the resolver for the globalMetrics field. 
func (r *queryResolver) GlobalMetrics(ctx context.Context) ([]*schema.GlobalMetricListItem, error) { + user := repository.GetUserFromContext(ctx) + + if user != nil { + if user.HasRole(schema.RoleUser) || user.HasRole(schema.RoleManager) { + return archive.GlobalUserMetricList, nil + } + } + return archive.GlobalMetricList, nil } @@ -371,12 +403,12 @@ func (r *queryResolver) AllocatedNodes(ctx context.Context, cluster string) ([]* // Node is the resolver for the node field. func (r *queryResolver) Node(ctx context.Context, id string) (*schema.Node, error) { repo := repository.GetNodeRepository() - numericId, err := strconv.ParseInt(id, 10, 64) + numericID, err := strconv.ParseInt(id, 10, 64) if err != nil { cclog.Warn("Error while parsing job id") return nil, err } - return repo.GetNodeById(numericId, false) + return repo.GetNodeByID(numericID, false) } // Nodes is the resolver for the nodes field. @@ -387,6 +419,15 @@ func (r *queryResolver) Nodes(ctx context.Context, filter []*model.NodeFilter, o return &model.NodeStateResultList{Items: nodes, Count: &count}, err } +// NodesWithMeta is the resolver for the nodesWithMeta field. +func (r *queryResolver) NodesWithMeta(ctx context.Context, filter []*model.NodeFilter, order *model.OrderByInput) (*model.NodeStateResultList, error) { + // Why Extra Handler? -> graphql.CollectAllFields(ctx) only returns toplevel fields (i.e.: items, count), and not subfields like item.metaData + repo := repository.GetNodeRepository() + nodes, err := repo.QueryNodesWithMeta(ctx, filter, nil, order) // Ignore Paging, Order Unused + count := len(nodes) + return &model.NodeStateResultList{Items: nodes, Count: &count}, err +} + // NodeStates is the resolver for the nodeStates field. 
func (r *queryResolver) NodeStates(ctx context.Context, filter []*model.NodeFilter) ([]*model.NodeStates, error) { repo := repository.GetNodeRepository() @@ -403,8 +444,7 @@ func (r *queryResolver) NodeStates(ctx context.Context, filter []*model.NodeFilt return nil, herr } - allCounts := make([]*model.NodeStates, 0) - allCounts = append(stateCounts, healthCounts...) + allCounts := append(stateCounts, healthCounts...) return allCounts, nil } @@ -431,18 +471,18 @@ func (r *queryResolver) NodeStatesTimed(ctx context.Context, filter []*model.Nod return healthCounts, nil } - return nil, errors.New("Unknown Node State Query Type") + return nil, errors.New("unknown Node State Query Type") } // Job is the resolver for the job field. func (r *queryResolver) Job(ctx context.Context, id string) (*schema.Job, error) { - numericId, err := strconv.ParseInt(id, 10, 64) + numericID, err := strconv.ParseInt(id, 10, 64) if err != nil { cclog.Warn("Error while parsing job id") return nil, err } - job, err := r.Repo.FindById(ctx, numericId) + job, err := r.Repo.FindByID(ctx, numericID) if err != nil { cclog.Warn("Error while finding job by id") return nil, err @@ -475,7 +515,7 @@ func (r *queryResolver) JobMetrics(ctx context.Context, id string, metrics []str return nil, err } - data, err := metricDataDispatcher.LoadData(job, metrics, scopes, ctx, *resolution) + data, err := metricdispatch.LoadData(job, metrics, scopes, ctx, *resolution) if err != nil { cclog.Warn("Error while loading job data") return nil, err @@ -503,7 +543,7 @@ func (r *queryResolver) JobStats(ctx context.Context, id string, metrics []strin return nil, err } - data, err := metricDataDispatcher.LoadJobStats(job, metrics, ctx) + data, err := metricdispatch.LoadJobStats(job, metrics, ctx) if err != nil { cclog.Warnf("Error while loading jobStats data for job id %s", id) return nil, err @@ -528,7 +568,7 @@ func (r *queryResolver) ScopedJobStats(ctx context.Context, id string, metrics [ return nil, err } - data, err := 
metricDataDispatcher.LoadScopedJobStats(job, metrics, scopes, ctx) + data, err := metricdispatch.LoadScopedJobStats(job, metrics, scopes, ctx) if err != nil { cclog.Warnf("Error while loading scopedJobStats data for job id %s", id) return nil, err @@ -542,7 +582,7 @@ func (r *queryResolver) ScopedJobStats(ctx context.Context, id string, metrics [ for _, stat := range stats { mdlStats = append(mdlStats, &model.ScopedStats{ Hostname: stat.Hostname, - ID: stat.Id, + ID: stat.ID, Data: stat.Data, }) } @@ -581,21 +621,24 @@ func (r *queryResolver) Jobs(ctx context.Context, filter []*model.JobFilter, pag // Note: Even if App-Default 'config.Keys.UiDefaults["job_list_usePaging"]' is set, always return hasNextPage boolean. // Users can decide in frontend to use continuous scroll, even if app-default is paging! + // Skip if page.ItemsPerPage == -1 ("Load All" -> No Next Page required, Status Dashboards) /* Example Page 4 @ 10 IpP : Does item 41 exist? Minimal Page 41 @ 1 IpP : If len(result) is 1, Page 5 @ 10 IpP exists. 
*/ - nextPage := &model.PageRequest{ - ItemsPerPage: 1, - Page: ((page.Page * page.ItemsPerPage) + 1), + hasNextPage := false + if page.ItemsPerPage != -1 { + nextPage := &model.PageRequest{ + ItemsPerPage: 1, + Page: ((page.Page * page.ItemsPerPage) + 1), + } + nextJobs, err := r.Repo.QueryJobs(ctx, filter, nextPage, order) + if err != nil { + cclog.Warn("Error while querying next jobs") + return nil, err + } + hasNextPage = len(nextJobs) == 1 } - nextJobs, err := r.Repo.QueryJobs(ctx, filter, nextPage, order) - if err != nil { - cclog.Warn("Error while querying next jobs") - return nil, err - } - - hasNextPage := len(nextJobs) == 1 return &model.JobResultList{Items: jobs, Count: &count, HasNextPage: &hasNextPage}, nil } @@ -693,7 +736,7 @@ func (r *queryResolver) JobsMetricStats(ctx context.Context, filter []*model.Job res := []*model.JobStats{} for _, job := range jobs { - data, err := metricDataDispatcher.LoadJobStats(job, metrics, ctx) + data, err := metricdispatch.LoadJobStats(job, metrics, ctx) if err != nil { cclog.Warnf("Error while loading comparison jobStats data for job id %d", job.JobID) continue @@ -744,13 +787,19 @@ func (r *queryResolver) NodeMetrics(ctx context.Context, cluster string, nodes [ return nil, errors.New("you need to be administrator or support staff for this query") } + defaultMetrics := make([]string, 0) + for _, mc := range archive.GetCluster(cluster).MetricConfig { + defaultMetrics = append(defaultMetrics, mc.Name) + } if metrics == nil { - for _, mc := range archive.GetCluster(cluster).MetricConfig { - metrics = append(metrics, mc.Name) - } + metrics = defaultMetrics + } else { + metrics = slices.DeleteFunc(metrics, func(metric string) bool { + return !slices.Contains(defaultMetrics, metric) // Remove undefined metrics. 
+ }) } - data, err := metricDataDispatcher.LoadNodeData(cluster, metrics, nodes, scopes, from, to, ctx) + data, err := metricdispatch.LoadNodeData(cluster, metrics, nodes, scopes, from, to, ctx) if err != nil { cclog.Warn("error while loading node data") return nil, err @@ -804,153 +853,39 @@ func (r *queryResolver) NodeMetricsList(ctx context.Context, cluster string, sub return nil, errors.New("you need to be administrator or support staff for this query") } + nodeRepo := repository.GetNodeRepository() + // nodes -> array hostname + nodes, stateMap, countNodes, hasNextPage, nerr := nodeRepo.GetNodesForList(ctx, cluster, subCluster, stateFilter, nodeFilter, page) + if nerr != nil { + return nil, errors.New("could not retrieve node list required for resolving NodeMetricsList") + } + if metrics == nil { for _, mc := range archive.GetCluster(cluster).MetricConfig { metrics = append(metrics, mc.Name) } } - // Build Filters - queryFilters := make([]*model.NodeFilter, 0) - if cluster != "" { - queryFilters = append(queryFilters, &model.NodeFilter{Cluster: &model.StringInput{Eq: &cluster}}) - } - if subCluster != "" { - queryFilters = append(queryFilters, &model.NodeFilter{Subcluster: &model.StringInput{Eq: &subCluster}}) - } - if nodeFilter != "" && stateFilter != "notindb" { - queryFilters = append(queryFilters, &model.NodeFilter{Hostname: &model.StringInput{Contains: &nodeFilter}}) - } - if stateFilter != "all" && stateFilter != "notindb" { - var queryState schema.SchedulerState = schema.SchedulerState(stateFilter) - queryFilters = append(queryFilters, &model.NodeFilter{SchedulerState: &queryState}) - } - // if healthFilter != "all" { - // filters = append(filters, &model.NodeFilter{HealthState: &healthFilter}) - // } - - // Special Case: Disable Paging for missing nodes filter, save IPP for later - var backupItems int - if stateFilter == "notindb" { - backupItems = page.ItemsPerPage - page.ItemsPerPage = -1 - } - - // Query Nodes From DB - nodeRepo := 
repository.GetNodeRepository() - rawNodes, serr := nodeRepo.QueryNodes(ctx, queryFilters, page, nil) // Order not Used - if serr != nil { - cclog.Warn("error while loading node database data (Resolver.NodeMetricsList)") - return nil, serr - } - - // Intermediate Node Result Info - nodes := make([]string, 0) - stateMap := make(map[string]string) - for _, node := range rawNodes { - nodes = append(nodes, node.Hostname) - stateMap[node.Hostname] = string(node.NodeState) - } - - // Setup Vars - var countNodes int - var cerr error - var hasNextPage bool - - // Special Case: Find Nodes not in DB node table but in metricStore only - if stateFilter == "notindb" { - // Reapply Original Paging - page.ItemsPerPage = backupItems - // Get Nodes From Topology - var topoNodes []string - if subCluster != "" { - scNodes := archive.NodeLists[cluster][subCluster] - topoNodes = scNodes.PrintList() - } else { - subClusterNodeLists := archive.NodeLists[cluster] - for _, nodeList := range subClusterNodeLists { - topoNodes = append(topoNodes, nodeList.PrintList()...) 
- } - } - // Compare to all nodes from cluster/subcluster in DB - var missingNodes []string - for _, scanNode := range topoNodes { - if !slices.Contains(nodes, scanNode) { - missingNodes = append(missingNodes, scanNode) - } - } - // Filter nodes by name - if nodeFilter != "" { - filteredNodesByName := []string{} - for _, missingNode := range missingNodes { - if strings.Contains(missingNode, nodeFilter) { - filteredNodesByName = append(filteredNodesByName, missingNode) - } - } - missingNodes = filteredNodesByName - } - // Sort Missing Nodes Alphanumerically - slices.Sort(missingNodes) - // Total Missing - countNodes = len(missingNodes) - // Apply paging - if countNodes > page.ItemsPerPage { - start := (page.Page - 1) * page.ItemsPerPage - end := start + page.ItemsPerPage - if end > countNodes { - end = countNodes - hasNextPage = false - } else { - hasNextPage = true - } - nodes = missingNodes[start:end] - } else { - nodes = missingNodes - } - - } else { - // DB Nodes: Count and Find Next Page - countNodes, cerr = nodeRepo.CountNodes(ctx, queryFilters) - if cerr != nil { - cclog.Warn("error while counting node database data (Resolver.NodeMetricsList)") - return nil, cerr - } - - // Example Page 4 @ 10 IpP : Does item 41 exist? - // Minimal Page 41 @ 1 IpP : If len(result) is 1, Page 5 exists. 
- nextPage := &model.PageRequest{ - ItemsPerPage: 1, - Page: ((page.Page * page.ItemsPerPage) + 1), - } - nextNodes, err := nodeRepo.QueryNodes(ctx, queryFilters, nextPage, nil) // Order not Used - if err != nil { - cclog.Warn("Error while querying next nodes") - return nil, err - } - hasNextPage = len(nextNodes) == 1 - } - - // Load Metric Data For Specified Nodes Only - data, err := metricDataDispatcher.LoadNodeListData(cluster, subCluster, nodes, metrics, scopes, *resolution, from, to, ctx) + // data -> map hostname:jobdata + data, err := metricdispatch.LoadNodeListData(cluster, subCluster, nodes, metrics, scopes, *resolution, from, to, ctx) if err != nil { cclog.Warn("error while loading node data (Resolver.NodeMetricsList") return nil, err } - // Build Result nodeMetricsList := make([]*model.NodeMetrics, 0, len(data)) - for hostname, metrics := range data { + for _, hostname := range nodes { host := &model.NodeMetrics{ Host: hostname, State: stateMap[hostname], - Metrics: make([]*model.JobMetricWithName, 0, len(metrics)*len(scopes)), + Metrics: make([]*model.JobMetricWithName, 0), } host.SubCluster, err = archive.GetSubClusterByNode(cluster, hostname) if err != nil { cclog.Warnf("error in nodeMetrics resolver: %s", err) } - for metric, scopedMetrics := range metrics { + for metric, scopedMetrics := range data[hostname] { for scope, scopedMetric := range scopedMetrics { host.Metrics = append(host.Metrics, &model.JobMetricWithName{ Name: metric, @@ -963,9 +898,9 @@ func (r *queryResolver) NodeMetricsList(ctx context.Context, cluster string, sub nodeMetricsList = append(nodeMetricsList, host) } - // Final Return nodeMetricsListResult := &model.NodesResultList{ - Items: nodeMetricsList, + Items: nodeMetricsList, + // TotalNodes depends on sum of nodes grouped on latest timestamp, see repo/node.go:357 TotalNodes: &countNodes, HasNextPage: &hasNextPage, } @@ -973,6 +908,99 @@ func (r *queryResolver) NodeMetricsList(ctx context.Context, cluster string, sub return 
nodeMetricsListResult, nil } +// ClusterMetrics is the resolver for the clusterMetrics field. +func (r *queryResolver) ClusterMetrics(ctx context.Context, cluster string, metrics []string, from time.Time, to time.Time) (*model.ClusterMetrics, error) { + user := repository.GetUserFromContext(ctx) + if user != nil && !user.HasAnyRole([]schema.Role{schema.RoleAdmin, schema.RoleSupport}) { + return nil, errors.New("you need to be administrator or support staff for this query") + } + + if metrics == nil { + for _, mc := range archive.GetCluster(cluster).MetricConfig { + metrics = append(metrics, mc.Name) + } + } + + // 'nodes' == nil -> Defaults to all nodes of cluster for existing query workflow + scopes := []schema.MetricScope{"node"} + data, err := metricdispatch.LoadNodeData(cluster, metrics, nil, scopes, from, to, ctx) + if err != nil { + cclog.Warn("error while loading node data") + return nil, err + } + + clusterMetricData := make([]*model.ClusterMetricWithName, 0) + clusterMetrics := model.ClusterMetrics{NodeCount: 0, Metrics: clusterMetricData} + + collectorTimestep := make(map[string]int) + collectorUnit := make(map[string]schema.Unit) + collectorData := make(map[string][]schema.Float) + + for _, metrics := range data { + clusterMetrics.NodeCount += 1 + for metric, scopedMetrics := range metrics { + for _, scopedMetric := range scopedMetrics { + // Collect Info Once + _, okTimestep := collectorTimestep[metric] + if !okTimestep { + collectorTimestep[metric] = scopedMetric.Timestep + } + _, okUnit := collectorUnit[metric] + if !okUnit { + collectorUnit[metric] = scopedMetric.Unit + } + // Collect Data + for _, ser := range scopedMetric.Series { + _, okData := collectorData[metric] + // Init With Datasize > 0 + if !okData && len(ser.Data) != 0 { + collectorData[metric] = make([]schema.Float, len(ser.Data)) + } else if !okData { + cclog.Debugf("[SCHEMARESOLVER] clusterMetrics skip init: no data -> %s at %s; size %d", metric, ser.Hostname, len(ser.Data)) + } + // 
Sum if init'd and matching size + if okData && len(ser.Data) == len(collectorData[metric]) { + for i, val := range ser.Data { + if val.IsNaN() { + continue + } else { + collectorData[metric][i] += val + } + } + } else if okData { + cclog.Debugf("[SCHEMARESOLVER] clusterMetrics skip sum: data diff -> %s at %s; want size %d, have size %d", metric, ser.Hostname, len(collectorData[metric]), len(ser.Data)) + } + } + } + } + } + + for metricName, data := range collectorData { + // use ccUnits for backend normalization to "Tera" + p_old := ccunit.NewPrefix(collectorUnit[metricName].Prefix) + p_new := ccunit.NewPrefix("T") + convFunc := ccunit.GetPrefixPrefixFactor(p_old, p_new) + u_new := schema.Unit{Prefix: p_new.Prefix(), Base: collectorUnit[metricName].Base} + + roundedData := make([]schema.Float, 0) + for _, v_old := range data { + v_new := math.Round(convFunc(float64(v_old)).(float64)*100.0) / 100.0 + roundedData = append(roundedData, schema.Float(v_new)) + } + + cm := model.ClusterMetricWithName{ + Name: metricName, + Unit: &u_new, + Timestep: collectorTimestep[metricName], + Data: roundedData, + } + + clusterMetrics.Metrics = append(clusterMetrics.Metrics, &cm) + } + + return &clusterMetrics, nil +} + // NumberOfNodes is the resolver for the numberOfNodes field. func (r *subClusterResolver) NumberOfNodes(ctx context.Context, obj *schema.SubCluster) (int, error) { nodeList, err := archive.ParseNodeList(obj.Nodes) diff --git a/internal/graph/util.go b/internal/graph/util.go index 38c4914f..5458d0ff 100644 --- a/internal/graph/util.go +++ b/internal/graph/util.go @@ -2,18 +2,20 @@ // All rights reserved. This file is part of cc-backend. // Use of this source code is governed by a MIT-style // license that can be found in the LICENSE file. 
+ package graph import ( "context" "fmt" "math" + "slices" "github.com/99designs/gqlgen/graphql" "github.com/ClusterCockpit/cc-backend/internal/graph/model" - "github.com/ClusterCockpit/cc-backend/internal/metricDataDispatcher" - cclog "github.com/ClusterCockpit/cc-lib/ccLogger" - "github.com/ClusterCockpit/cc-lib/schema" + "github.com/ClusterCockpit/cc-backend/internal/metricdispatch" + cclog "github.com/ClusterCockpit/cc-lib/v2/ccLogger" + "github.com/ClusterCockpit/cc-lib/v2/schema" ) const MAX_JOBS_FOR_ANALYSIS = 500 @@ -53,15 +55,15 @@ func (r *queryResolver) rooflineHeatmap( // resolution = max(resolution, mc.Timestep) // } - jobdata, err := metricDataDispatcher.LoadData(job, []string{"flops_any", "mem_bw"}, []schema.MetricScope{schema.MetricScopeNode}, ctx, 0) + jobdata, err := metricdispatch.LoadData(job, []string{"flops_any", "mem_bw"}, []schema.MetricScope{schema.MetricScopeNode}, ctx, 0) if err != nil { - cclog.Errorf("Error while loading roofline metrics for job %d", job.ID) + cclog.Warnf("Error while loading roofline metrics for job %d", *job.ID) return nil, err } flops_, membw_ := jobdata["flops_any"], jobdata["mem_bw"] if flops_ == nil && membw_ == nil { - cclog.Infof("rooflineHeatmap(): 'flops_any' or 'mem_bw' missing for job %d", job.ID) + cclog.Warnf("rooflineHeatmap(): 'flops_any' or 'mem_bw' missing for job %d", *job.ID) continue // return nil, fmt.Errorf("GRAPH/UTIL > 'flops_any' or 'mem_bw' missing for job %d", job.ID) } @@ -126,7 +128,7 @@ func (r *queryResolver) jobsFootprints(ctx context.Context, filter []*model.JobF continue } - if err := metricDataDispatcher.LoadAverages(job, metrics, avgs, ctx); err != nil { + if err := metricdispatch.LoadAverages(job, metrics, avgs, ctx); err != nil { cclog.Error("Error while loading averages for footprint") return nil, err } @@ -185,11 +187,5 @@ func (r *queryResolver) jobsFootprints(ctx context.Context, filter []*model.JobF func requireField(ctx context.Context, name string) bool { fields := 
graphql.CollectAllFields(ctx) - for _, f := range fields { - if f == name { - return true - } - } - - return false + return slices.Contains(fields, name) } diff --git a/internal/importer/handleImport.go b/internal/importer/handleImport.go index 482b328c..68b6db9c 100644 --- a/internal/importer/handleImport.go +++ b/internal/importer/handleImport.go @@ -2,6 +2,7 @@ // All rights reserved. This file is part of cc-backend. // Use of this source code is governed by a MIT-style // license that can be found in the LICENSE file. + package importer import ( @@ -14,8 +15,8 @@ import ( "github.com/ClusterCockpit/cc-backend/internal/config" "github.com/ClusterCockpit/cc-backend/internal/repository" "github.com/ClusterCockpit/cc-backend/pkg/archive" - cclog "github.com/ClusterCockpit/cc-lib/ccLogger" - "github.com/ClusterCockpit/cc-lib/schema" + cclog "github.com/ClusterCockpit/cc-lib/v2/ccLogger" + "github.com/ClusterCockpit/cc-lib/v2/schema" ) // HandleImportFlag imports jobs from file pairs specified in a comma-separated flag string. 
@@ -37,7 +38,7 @@ import ( func HandleImportFlag(flag string) error { r := repository.GetJobRepository() - for _, pair := range strings.Split(flag, ",") { + for pair := range strings.SplitSeq(flag, ",") { files := strings.Split(pair, ":") if len(files) != 2 { return fmt.Errorf("REPOSITORY/INIT > invalid import flag format") @@ -101,7 +102,7 @@ func HandleImportFlag(flag string) error { return err } - id, err := r.InsertJob(&job) + id, err := r.InsertJobDirect(&job) if err != nil { cclog.Warn("Error while job db insert") return err diff --git a/internal/importer/importer_test.go b/internal/importer/importer_test.go index 2aa007da..cb4dca89 100644 --- a/internal/importer/importer_test.go +++ b/internal/importer/importer_test.go @@ -16,8 +16,8 @@ import ( "github.com/ClusterCockpit/cc-backend/internal/importer" "github.com/ClusterCockpit/cc-backend/internal/repository" "github.com/ClusterCockpit/cc-backend/pkg/archive" - ccconf "github.com/ClusterCockpit/cc-lib/ccConfig" - cclog "github.com/ClusterCockpit/cc-lib/ccLogger" + ccconf "github.com/ClusterCockpit/cc-lib/v2/ccConfig" + cclog "github.com/ClusterCockpit/cc-lib/v2/ccLogger" ) // copyFile copies a file from source path to destination path. 
@@ -50,42 +50,14 @@ func setup(t *testing.T) *repository.JobRepository { "main": { "addr": "0.0.0.0:8080", "validate": false, - "apiAllowedIPs": [ + "api-allowed-ips": [ "*" ]}, "archive": { "kind": "file", "path": "./var/job-archive" - }, - "clusters": [ - { - "name": "testcluster", - "metricDataRepository": {"kind": "test", "url": "bla:8081"}, - "filterRanges": { - "numNodes": { "from": 1, "to": 64 }, - "duration": { "from": 0, "to": 86400 }, - "startTime": { "from": "2022-01-01T00:00:00Z", "to": null } - } - }, - { - "name": "fritz", - "metricDataRepository": {"kind": "test", "url": "bla:8081"}, - "filterRanges": { - "numNodes": { "from": 1, "to": 944 }, - "duration": { "from": 0, "to": 86400 }, - "startTime": { "from": "2022-01-01T00:00:00Z", "to": null } - } - }, - { - "name": "taurus", - "metricDataRepository": {"kind": "test", "url": "bla:8081"}, - "filterRanges": { - "numNodes": { "from": 1, "to": 4000 }, - "duration": { "from": 0, "to": 604800 }, - "startTime": { "from": "2010-01-01T00:00:00Z", "to": null } - } - } - ]}` + } + }` cclog.Init("info", true) tmpdir := t.TempDir() @@ -107,7 +79,7 @@ func setup(t *testing.T) *repository.JobRepository { } dbfilepath := filepath.Join(tmpdir, "test.db") - err := repository.MigrateDB("sqlite3", dbfilepath) + err := repository.MigrateDB(dbfilepath) if err != nil { t.Fatal(err) } @@ -121,22 +93,18 @@ func setup(t *testing.T) *repository.JobRepository { // Load and check main configuration if cfg := ccconf.GetPackageConfig("main"); cfg != nil { - if clustercfg := ccconf.GetPackageConfig("clusters"); clustercfg != nil { - config.Init(cfg, clustercfg) - } else { - t.Fatal("Cluster configuration must be present") - } + config.Init(cfg) } else { t.Fatal("Main configuration must be present") } archiveCfg := fmt.Sprintf("{\"kind\": \"file\",\"path\": \"%s\"}", jobarchive) - if err := archive.Init(json.RawMessage(archiveCfg), config.Keys.DisableArchive); err != nil { + if err := archive.Init(json.RawMessage(archiveCfg)); err 
!= nil { t.Fatal(err) } - repository.Connect("sqlite3", dbfilepath) + repository.Connect(dbfilepath) return repository.GetJobRepository() } @@ -197,7 +165,7 @@ func TestHandleImportFlag(t *testing.T) { } result := readResult(t, testname) - job, err := r.FindCached(&result.JobId, &result.Cluster, &result.StartTime) + job, err := r.Find(&result.JobId, &result.Cluster, &result.StartTime) if err != nil { t.Fatal(err) } diff --git a/internal/importer/initDB.go b/internal/importer/initDB.go index a4789576..87d92cd3 100644 --- a/internal/importer/initDB.go +++ b/internal/importer/initDB.go @@ -22,8 +22,8 @@ import ( "github.com/ClusterCockpit/cc-backend/internal/repository" "github.com/ClusterCockpit/cc-backend/pkg/archive" - cclog "github.com/ClusterCockpit/cc-lib/ccLogger" - "github.com/ClusterCockpit/cc-lib/schema" + cclog "github.com/ClusterCockpit/cc-lib/v2/ccLogger" + "github.com/ClusterCockpit/cc-lib/v2/schema" ) const ( @@ -111,18 +111,22 @@ func InitDB() error { continue } - id, err := r.TransactionAddNamed(t, + id, jobErr := r.TransactionAddNamed(t, repository.NamedJobInsert, jobMeta) - if err != nil { - cclog.Errorf("repository initDB(): %v", err) + if jobErr != nil { + cclog.Errorf("repository initDB(): %v", jobErr) errorOccured++ continue } + // Job successfully inserted, increment counter + i += 1 + for _, tag := range jobMeta.Tags { tagstr := tag.Name + ":" + tag.Type tagID, ok := tags[tagstr] if !ok { + var err error tagID, err = r.TransactionAdd(t, addTagQuery, tag.Name, tag.Type) @@ -138,10 +142,6 @@ func InitDB() error { setTagQuery, id, tagID) } - - if err == nil { - i += 1 - } } if errorOccured > 0 { @@ -216,7 +216,7 @@ func enrichJobMetadata(job *schema.Job) error { metricEnergy = math.Round(rawEnergy*100.0) / 100.0 } } else { - cclog.Warnf("Error while collecting energy metric %s for job, DB ID '%v', return '0.0'", fp, job.ID) + cclog.Warnf("Error while collecting energy metric %s for job, DB ID '%v', return '0.0'", fp, *job.ID) } 
job.EnergyFootprint[fp] = metricEnergy @@ -225,7 +225,7 @@ func enrichJobMetadata(job *schema.Job) error { job.Energy = (math.Round(totalEnergy*100.0) / 100.0) if job.RawEnergyFootprint, err = json.Marshal(job.EnergyFootprint); err != nil { - cclog.Warnf("Error while marshaling energy footprint for job INTO BYTES, DB ID '%v'", job.ID) + cclog.Warnf("Error while marshaling energy footprint for job INTO BYTES, DB ID '%v'", *job.ID) return err } diff --git a/internal/importer/normalize.go b/internal/importer/normalize.go index 943ceb26..cc6fb545 100644 --- a/internal/importer/normalize.go +++ b/internal/importer/normalize.go @@ -2,12 +2,13 @@ // All rights reserved. This file is part of cc-backend. // Use of this source code is governed by a MIT-style // license that can be found in the LICENSE file. + package importer import ( "math" - ccunits "github.com/ClusterCockpit/cc-lib/ccUnits" + ccunits "github.com/ClusterCockpit/cc-lib/v2/ccUnits" ) // getNormalizationFactor calculates the scaling factor needed to normalize a value diff --git a/internal/importer/normalize_test.go b/internal/importer/normalize_test.go index 6aa1ed2e..039a3cfc 100644 --- a/internal/importer/normalize_test.go +++ b/internal/importer/normalize_test.go @@ -8,7 +8,7 @@ import ( "fmt" "testing" - ccunits "github.com/ClusterCockpit/cc-lib/ccUnits" + ccunits "github.com/ClusterCockpit/cc-lib/v2/ccUnits" ) // TestNormalizeFactor tests the normalization of large byte values to gigabyte prefix. diff --git a/internal/memorystore/archive.go b/internal/memorystore/archive.go deleted file mode 100644 index 56065aaf..00000000 --- a/internal/memorystore/archive.go +++ /dev/null @@ -1,191 +0,0 @@ -// Copyright (C) NHR@FAU, University Erlangen-Nuremberg. -// All rights reserved. This file is part of cc-backend. -// Use of this source code is governed by a MIT-style -// license that can be found in the LICENSE file. 
- -package memorystore - -import ( - "archive/zip" - "bufio" - "context" - "errors" - "fmt" - "io" - "os" - "path/filepath" - "sync" - "sync/atomic" - "time" - - cclog "github.com/ClusterCockpit/cc-lib/ccLogger" -) - -func Archiving(wg *sync.WaitGroup, ctx context.Context) { - go func() { - defer wg.Done() - d, err := time.ParseDuration(Keys.Archive.Interval) - if err != nil { - cclog.Fatalf("[METRICSTORE]> error parsing archive interval duration: %v\n", err) - } - if d <= 0 { - return - } - - ticks := func() <-chan time.Time { - if d <= 0 { - return nil - } - return time.NewTicker(d).C - }() - for { - select { - case <-ctx.Done(): - return - case <-ticks: - t := time.Now().Add(-d) - cclog.Infof("[METRICSTORE]> start archiving checkpoints (older than %s)...", t.Format(time.RFC3339)) - n, err := ArchiveCheckpoints(Keys.Checkpoints.RootDir, - Keys.Archive.RootDir, t.Unix(), Keys.Archive.DeleteInstead) - - if err != nil { - cclog.Errorf("[METRICSTORE]> archiving failed: %s", err.Error()) - } else { - cclog.Infof("[METRICSTORE]> done: %d files zipped and moved to archive", n) - } - } - } - }() -} - -var ErrNoNewArchiveData error = errors.New("all data already archived") - -// ZIP all checkpoint files older than `from` together and write them to the `archiveDir`, -// deleting them from the `checkpointsDir`. 
-func ArchiveCheckpoints(checkpointsDir, archiveDir string, from int64, deleteInstead bool) (int, error) { - entries1, err := os.ReadDir(checkpointsDir) - if err != nil { - return 0, err - } - - type workItem struct { - cdir, adir string - cluster, host string - } - - var wg sync.WaitGroup - n, errs := int32(0), int32(0) - work := make(chan workItem, Keys.NumWorkers) - - wg.Add(Keys.NumWorkers) - for worker := 0; worker < Keys.NumWorkers; worker++ { - go func() { - defer wg.Done() - for workItem := range work { - m, err := archiveCheckpoints(workItem.cdir, workItem.adir, from, deleteInstead) - if err != nil { - cclog.Errorf("error while archiving %s/%s: %s", workItem.cluster, workItem.host, err.Error()) - atomic.AddInt32(&errs, 1) - } - atomic.AddInt32(&n, int32(m)) - } - }() - } - - for _, de1 := range entries1 { - entries2, e := os.ReadDir(filepath.Join(checkpointsDir, de1.Name())) - if e != nil { - err = e - } - - for _, de2 := range entries2 { - cdir := filepath.Join(checkpointsDir, de1.Name(), de2.Name()) - adir := filepath.Join(archiveDir, de1.Name(), de2.Name()) - work <- workItem{ - adir: adir, cdir: cdir, - cluster: de1.Name(), host: de2.Name(), - } - } - } - - close(work) - wg.Wait() - - if err != nil { - return int(n), err - } - - if errs > 0 { - return int(n), fmt.Errorf("%d errors happened while archiving (%d successes)", errs, n) - } - return int(n), nil -} - -// Helper function for `ArchiveCheckpoints`. 
-func archiveCheckpoints(dir string, archiveDir string, from int64, deleteInstead bool) (int, error) { - entries, err := os.ReadDir(dir) - if err != nil { - return 0, err - } - - extension := Keys.Checkpoints.FileFormat - files, err := findFiles(entries, from, extension, false) - if err != nil { - return 0, err - } - - if deleteInstead { - n := 0 - for _, checkpoint := range files { - filename := filepath.Join(dir, checkpoint) - if err = os.Remove(filename); err != nil { - return n, err - } - n += 1 - } - return n, nil - } - - filename := filepath.Join(archiveDir, fmt.Sprintf("%d.zip", from)) - f, err := os.OpenFile(filename, os.O_CREATE|os.O_WRONLY, CheckpointFilePerms) - if err != nil && os.IsNotExist(err) { - err = os.MkdirAll(archiveDir, CheckpointDirPerms) - if err == nil { - f, err = os.OpenFile(filename, os.O_CREATE|os.O_WRONLY, CheckpointFilePerms) - } - } - if err != nil { - return 0, err - } - defer f.Close() - bw := bufio.NewWriter(f) - defer bw.Flush() - zw := zip.NewWriter(bw) - defer zw.Close() - - n := 0 - for _, checkpoint := range files { - filename := filepath.Join(dir, checkpoint) - r, err := os.Open(filename) - if err != nil { - return n, err - } - defer r.Close() - - w, err := zw.Create(checkpoint) - if err != nil { - return n, err - } - - if _, err = io.Copy(w, r); err != nil { - return n, err - } - - if err = os.Remove(filename); err != nil { - return n, err - } - n += 1 - } - - return n, nil -} diff --git a/internal/memorystore/avroCheckpoint.go b/internal/memorystore/avroCheckpoint.go deleted file mode 100644 index 4d361514..00000000 --- a/internal/memorystore/avroCheckpoint.go +++ /dev/null @@ -1,482 +0,0 @@ -// Copyright (C) NHR@FAU, University Erlangen-Nuremberg. -// All rights reserved. This file is part of cc-backend. -// Use of this source code is governed by a MIT-style -// license that can be found in the LICENSE file. 
- -package memorystore - -import ( - "bufio" - "encoding/json" - "errors" - "fmt" - "os" - "path" - "sort" - "strconv" - "strings" - "sync" - "sync/atomic" - "time" - - cclog "github.com/ClusterCockpit/cc-lib/ccLogger" - "github.com/ClusterCockpit/cc-lib/schema" - "github.com/linkedin/goavro/v2" -) - -var NumAvroWorkers int = 4 -var startUp bool = true -var ErrNoNewData error = errors.New("no data in the pool") - -func (as *AvroStore) ToCheckpoint(dir string, dumpAll bool) (int, error) { - levels := make([]*AvroLevel, 0) - selectors := make([][]string, 0) - as.root.lock.RLock() - // Cluster - for sel1, l1 := range as.root.children { - l1.lock.RLock() - // Node - for sel2, l2 := range l1.children { - l2.lock.RLock() - // Frequency - for sel3, l3 := range l2.children { - levels = append(levels, l3) - selectors = append(selectors, []string{sel1, sel2, sel3}) - } - l2.lock.RUnlock() - } - l1.lock.RUnlock() - } - as.root.lock.RUnlock() - - type workItem struct { - level *AvroLevel - dir string - selector []string - } - - n, errs := int32(0), int32(0) - - var wg sync.WaitGroup - wg.Add(NumAvroWorkers) - work := make(chan workItem, NumAvroWorkers*2) - for range NumAvroWorkers { - go func() { - defer wg.Done() - - for workItem := range work { - from := getTimestamp(workItem.dir) - - if err := workItem.level.toCheckpoint(workItem.dir, from, dumpAll); err != nil { - if err == ErrNoNewArchiveData { - continue - } - - cclog.Errorf("error while checkpointing %#v: %s", workItem.selector, err.Error()) - atomic.AddInt32(&errs, 1) - } else { - atomic.AddInt32(&n, 1) - } - } - }() - } - - for i := range len(levels) { - dir := path.Join(dir, path.Join(selectors[i]...)) - work <- workItem{ - level: levels[i], - dir: dir, - selector: selectors[i], - } - } - - close(work) - wg.Wait() - - if errs > 0 { - return int(n), fmt.Errorf("%d errors happend while creating avro checkpoints (%d successes)", errs, n) - } - - startUp = false - - return int(n), nil -} - -// getTimestamp returns the 
timestamp from the directory name -func getTimestamp(dir string) int64 { - // Extract the resolution and timestamp from the directory name - // The existing avro file will be in epoch timestamp format - // iterate over all the files in the directory and find the maximum timestamp - // and return it - - resolution := path.Base(dir) - dir = path.Dir(dir) - - files, err := os.ReadDir(dir) - if err != nil { - return 0 - } - var maxTS int64 = 0 - - if len(files) == 0 { - return 0 - } - - for _, file := range files { - if file.IsDir() { - continue - } - name := file.Name() - - if len(name) < 5 || !strings.HasSuffix(name, ".avro") || !strings.HasPrefix(name, resolution+"_") { - continue - } - - ts, err := strconv.ParseInt(name[strings.Index(name, "_")+1:len(name)-5], 10, 64) - if err != nil { - fmt.Printf("error while parsing timestamp: %s\n", err.Error()) - continue - } - - if ts > maxTS { - maxTS = ts - } - } - - interval, _ := time.ParseDuration(Keys.Checkpoints.Interval) - updateTime := time.Unix(maxTS, 0).Add(interval).Add(time.Duration(CheckpointBufferMinutes-1) * time.Minute).Unix() - - if startUp { - return 0 - } - - if updateTime < time.Now().Unix() { - return 0 - } - - return maxTS -} - -func (l *AvroLevel) toCheckpoint(dir string, from int64, dumpAll bool) error { - l.lock.Lock() - defer l.lock.Unlock() - - // fmt.Printf("Checkpointing directory: %s\n", dir) - // filepath contains the resolution - intRes, _ := strconv.Atoi(path.Base(dir)) - - // find smallest overall timestamp in l.data map and delete it from l.data - minTS := int64(1<<63 - 1) - for ts, dat := range l.data { - if ts < minTS && len(dat) != 0 { - minTS = ts - } - } - - if from == 0 && minTS != int64(1<<63-1) { - from = minTS - } - - if from == 0 { - return ErrNoNewArchiveData - } - - var schema string - var codec *goavro.Codec - recordList := make([]map[string]any, 0) - - var f *os.File - - filePath := dir + fmt.Sprintf("_%d.avro", from) - - var err error - - fp_, err_ := os.Stat(filePath) - if 
errors.Is(err_, os.ErrNotExist) { - err = os.MkdirAll(path.Dir(dir), 0o755) - if err != nil { - return fmt.Errorf("failed to create directory: %v", err) - } - } else if fp_.Size() != 0 { - f, err = os.Open(filePath) - if err != nil { - return fmt.Errorf("failed to open existing avro file: %v", err) - } - - br := bufio.NewReader(f) - - reader, err := goavro.NewOCFReader(br) - if err != nil { - return fmt.Errorf("failed to create OCF reader: %v", err) - } - codec = reader.Codec() - schema = codec.Schema() - - f.Close() - } - - timeRef := time.Now().Add(time.Duration(-CheckpointBufferMinutes+1) * time.Minute).Unix() - - if dumpAll { - timeRef = time.Now().Unix() - } - - // Empty values - if len(l.data) == 0 { - // we checkpoint avro files every 60 seconds - repeat := 60 / intRes - - for range repeat { - recordList = append(recordList, make(map[string]any)) - } - } - - readFlag := true - - for ts := range l.data { - flag := false - if ts < timeRef { - data := l.data[ts] - - schemaGen, err := generateSchema(data) - if err != nil { - return err - } - - flag, schema, err = compareSchema(schema, schemaGen) - if err != nil { - return fmt.Errorf("failed to compare read and generated schema: %v", err) - } - if flag && readFlag && !errors.Is(err_, os.ErrNotExist) { - - f.Close() - - f, err = os.Open(filePath) - if err != nil { - return fmt.Errorf("failed to open Avro file: %v", err) - } - - br := bufio.NewReader(f) - - ocfReader, err := goavro.NewOCFReader(br) - if err != nil { - return fmt.Errorf("failed to create OCF reader while changing schema: %v", err) - } - - for ocfReader.Scan() { - record, err := ocfReader.Read() - if err != nil { - return fmt.Errorf("failed to read record: %v", err) - } - - recordList = append(recordList, record.(map[string]any)) - } - - f.Close() - - err = os.Remove(filePath) - if err != nil { - return fmt.Errorf("failed to delete file: %v", err) - } - - readFlag = false - } - codec, err = goavro.NewCodec(schema) - if err != nil { - return 
fmt.Errorf("failed to create codec after merged schema: %v", err) - } - - recordList = append(recordList, generateRecord(data)) - delete(l.data, ts) - } - } - - if len(recordList) == 0 { - return ErrNoNewArchiveData - } - - f, err = os.OpenFile(filePath, os.O_CREATE|os.O_APPEND|os.O_RDWR, 0o644) - if err != nil { - return fmt.Errorf("failed to append new avro file: %v", err) - } - - // fmt.Printf("Codec : %#v\n", codec) - - writer, err := goavro.NewOCFWriter(goavro.OCFConfig{ - W: f, - Codec: codec, - CompressionName: goavro.CompressionDeflateLabel, - }) - if err != nil { - return fmt.Errorf("failed to create OCF writer: %v", err) - } - - // Append the new record - if err := writer.Append(recordList); err != nil { - return fmt.Errorf("failed to append record: %v", err) - } - - f.Close() - - return nil -} - -func compareSchema(schemaRead, schemaGen string) (bool, string, error) { - var genSchema, readSchema AvroSchema - - if schemaRead == "" { - return false, schemaGen, nil - } - - // Unmarshal the schema strings into AvroSchema structs - if err := json.Unmarshal([]byte(schemaGen), &genSchema); err != nil { - return false, "", fmt.Errorf("failed to parse generated schema: %v", err) - } - if err := json.Unmarshal([]byte(schemaRead), &readSchema); err != nil { - return false, "", fmt.Errorf("failed to parse read schema: %v", err) - } - - sort.Slice(genSchema.Fields, func(i, j int) bool { - return genSchema.Fields[i].Name < genSchema.Fields[j].Name - }) - - sort.Slice(readSchema.Fields, func(i, j int) bool { - return readSchema.Fields[i].Name < readSchema.Fields[j].Name - }) - - // Check if schemas are identical - schemasEqual := true - if len(genSchema.Fields) <= len(readSchema.Fields) { - - for i := range genSchema.Fields { - if genSchema.Fields[i].Name != readSchema.Fields[i].Name { - schemasEqual = false - break - } - } - - // If schemas are identical, return the read schema - if schemasEqual { - return false, schemaRead, nil - } - } - - // Create a map to hold 
unique fields from both schemas - fieldMap := make(map[string]AvroField) - - // Add fields from the read schema - for _, field := range readSchema.Fields { - fieldMap[field.Name] = field - } - - // Add or update fields from the generated schema - for _, field := range genSchema.Fields { - fieldMap[field.Name] = field - } - - // Create a union schema by collecting fields from the map - var mergedFields []AvroField - for _, field := range fieldMap { - mergedFields = append(mergedFields, field) - } - - // Sort fields by name for consistency - sort.Slice(mergedFields, func(i, j int) bool { - return mergedFields[i].Name < mergedFields[j].Name - }) - - // Create the merged schema - mergedSchema := AvroSchema{ - Type: "record", - Name: genSchema.Name, - Fields: mergedFields, - } - - // Check if schemas are identical - schemasEqual = len(mergedSchema.Fields) == len(readSchema.Fields) - if schemasEqual { - for i := range mergedSchema.Fields { - if mergedSchema.Fields[i].Name != readSchema.Fields[i].Name { - schemasEqual = false - break - } - } - - if schemasEqual { - return false, schemaRead, nil - } - } - - // Marshal the merged schema back to JSON - mergedSchemaJSON, err := json.Marshal(mergedSchema) - if err != nil { - return false, "", fmt.Errorf("failed to marshal merged schema: %v", err) - } - - return true, string(mergedSchemaJSON), nil -} - -func generateSchema(data map[string]schema.Float) (string, error) { - // Define the Avro schema structure - schema := map[string]any{ - "type": "record", - "name": "DataRecord", - "fields": []map[string]any{}, - } - - fieldTracker := make(map[string]struct{}) - - for key := range data { - if _, exists := fieldTracker[key]; !exists { - key = correctKey(key) - - field := map[string]any{ - "name": key, - "type": "double", - "default": -1.0, - } - schema["fields"] = append(schema["fields"].([]map[string]any), field) - fieldTracker[key] = struct{}{} - } - } - - schemaString, err := json.Marshal(schema) - if err != nil { - return "", 
fmt.Errorf("failed to marshal schema: %v", err) - } - - return string(schemaString), nil -} - -func generateRecord(data map[string]schema.Float) map[string]any { - record := make(map[string]any) - - // Iterate through each map in data - for key, value := range data { - key = correctKey(key) - - // Set the value in the record - // avro only accepts basic types - record[key] = value.Double() - } - - return record -} - -func correctKey(key string) string { - // Replace any invalid characters in the key - // For example, replace spaces with underscores - key = strings.ReplaceAll(key, ":", "___") - key = strings.ReplaceAll(key, ".", "__") - - return key -} - -func ReplaceKey(key string) string { - // Replace any invalid characters in the key - // For example, replace spaces with underscores - key = strings.ReplaceAll(key, "___", ":") - key = strings.ReplaceAll(key, "__", ".") - - return key -} diff --git a/internal/memorystore/avroHelper.go b/internal/memorystore/avroHelper.go deleted file mode 100644 index 64e57064..00000000 --- a/internal/memorystore/avroHelper.go +++ /dev/null @@ -1,84 +0,0 @@ -// Copyright (C) NHR@FAU, University Erlangen-Nuremberg. -// All rights reserved. This file is part of cc-backend. -// Use of this source code is governed by a MIT-style -// license that can be found in the LICENSE file. - -package memorystore - -import ( - "context" - "slices" - "strconv" - "sync" - - cclog "github.com/ClusterCockpit/cc-lib/ccLogger" -) - -func DataStaging(wg *sync.WaitGroup, ctx context.Context) { - // AvroPool is a pool of Avro writers. 
- go func() { - if Keys.Checkpoints.FileFormat == "json" { - wg.Done() // Mark this goroutine as done - return // Exit the goroutine - } - - defer wg.Done() - - var avroLevel *AvroLevel - oldSelector := make([]string, 0) - - for { - select { - case <-ctx.Done(): - return - case val := <-LineProtocolMessages: - // Fetch the frequency of the metric from the global configuration - freq, err := GetMetricFrequency(val.MetricName) - if err != nil { - cclog.Errorf("Error fetching metric frequency: %s\n", err) - continue - } - - metricName := "" - - for _, selectorName := range val.Selector { - metricName += selectorName + Delimiter - } - - metricName += val.MetricName - - // Create a new selector for the Avro level - // The selector is a slice of strings that represents the path to the - // Avro level. It is created by appending the cluster, node, and metric - // name to the selector. - var selector []string - selector = append(selector, val.Cluster, val.Node, strconv.FormatInt(freq, 10)) - - if !testEq(oldSelector, selector) { - // Get the Avro level for the metric - avroLevel = avroStore.root.findAvroLevelOrCreate(selector) - - // If the Avro level is nil, create a new one - if avroLevel == nil { - cclog.Errorf("Error creating or finding the level with cluster : %s, node : %s, metric : %s\n", val.Cluster, val.Node, val.MetricName) - } - oldSelector = slices.Clone(selector) - } - - avroLevel.addMetric(metricName, val.Value, val.Timestamp, int(freq)) - } - } - }() -} - -func testEq(a, b []string) bool { - if len(a) != len(b) { - return false - } - for i := range a { - if a[i] != b[i] { - return false - } - } - return true -} diff --git a/internal/memorystore/avroStruct.go b/internal/memorystore/avroStruct.go deleted file mode 100644 index cc8005c7..00000000 --- a/internal/memorystore/avroStruct.go +++ /dev/null @@ -1,168 +0,0 @@ -// Copyright (C) NHR@FAU, University Erlangen-Nuremberg. -// All rights reserved. This file is part of cc-backend. 
-// Use of this source code is governed by a MIT-style -// license that can be found in the LICENSE file. - -package memorystore - -import ( - "sync" - - "github.com/ClusterCockpit/cc-lib/schema" -) - -var ( - LineProtocolMessages = make(chan *AvroStruct) - Delimiter = "ZZZZZ" -) - -// CheckpointBufferMinutes should always be in minutes. -// Its controls the amount of data to hold for given amount of time. -var CheckpointBufferMinutes = 3 - -type AvroStruct struct { - MetricName string - Cluster string - Node string - Selector []string - Value schema.Float - Timestamp int64 -} - -type AvroStore struct { - root AvroLevel -} - -var avroStore AvroStore - -type AvroLevel struct { - children map[string]*AvroLevel - data map[int64]map[string]schema.Float - lock sync.RWMutex -} - -type AvroField struct { - Name string `json:"name"` - Type any `json:"type"` - Default any `json:"default,omitempty"` -} - -type AvroSchema struct { - Type string `json:"type"` - Name string `json:"name"` - Fields []AvroField `json:"fields"` -} - -func (l *AvroLevel) findAvroLevelOrCreate(selector []string) *AvroLevel { - if len(selector) == 0 { - return l - } - - // Allow concurrent reads: - l.lock.RLock() - var child *AvroLevel - var ok bool - if l.children == nil { - // Children map needs to be created... - l.lock.RUnlock() - } else { - child, ok := l.children[selector[0]] - l.lock.RUnlock() - if ok { - return child.findAvroLevelOrCreate(selector[1:]) - } - } - - // The level does not exist, take write lock for unqiue access: - l.lock.Lock() - // While this thread waited for the write lock, another thread - // could have created the child node. 
- if l.children != nil { - child, ok = l.children[selector[0]] - if ok { - l.lock.Unlock() - return child.findAvroLevelOrCreate(selector[1:]) - } - } - - child = &AvroLevel{ - data: make(map[int64]map[string]schema.Float, 0), - children: nil, - } - - if l.children != nil { - l.children[selector[0]] = child - } else { - l.children = map[string]*AvroLevel{selector[0]: child} - } - l.lock.Unlock() - return child.findAvroLevelOrCreate(selector[1:]) -} - -func (l *AvroLevel) addMetric(metricName string, value schema.Float, timestamp int64, Freq int) { - l.lock.Lock() - defer l.lock.Unlock() - - KeyCounter := int(CheckpointBufferMinutes * 60 / Freq) - - // Create keys in advance for the given amount of time - if len(l.data) != KeyCounter { - if len(l.data) == 0 { - for i := range KeyCounter { - l.data[timestamp+int64(i*Freq)] = make(map[string]schema.Float, 0) - } - } else { - // Get the last timestamp - var lastTS int64 - for ts := range l.data { - if ts > lastTS { - lastTS = ts - } - } - // Create keys for the next KeyCounter timestamps - l.data[lastTS+int64(Freq)] = make(map[string]schema.Float, 0) - } - } - - closestTS := int64(0) - minDiff := int64(Freq) + 1 // Start with diff just outside the valid range - found := false - - // Iterate over timestamps and choose the one which is within range. - // Since its epoch time, we check if the difference is less than 60 seconds. - for ts, dat := range l.data { - // Check if timestamp is within range - diff := timestamp - ts - if diff < -int64(Freq) || diff > int64(Freq) { - continue - } - - // Metric already present at this timestamp — skip - if _, ok := dat[metricName]; ok { - continue - } - - // Check if this is the closest timestamp so far - if Abs(diff) < minDiff { - minDiff = Abs(diff) - closestTS = ts - found = true - } - } - - if found { - l.data[closestTS][metricName] = value - } -} - -func GetAvroStore() *AvroStore { - return &avroStore -} - -// Abs returns the absolute value of x. 
-func Abs(x int64) int64 { - if x < 0 { - return -x - } - return x -} diff --git a/internal/memorystore/buffer.go b/internal/memorystore/buffer.go deleted file mode 100644 index cd2fd8fd..00000000 --- a/internal/memorystore/buffer.go +++ /dev/null @@ -1,198 +0,0 @@ -// Copyright (C) NHR@FAU, University Erlangen-Nuremberg. -// All rights reserved. This file is part of cc-backend. -// Use of this source code is governed by a MIT-style -// license that can be found in the LICENSE file. - -package memorystore - -import ( - "errors" - "sync" - - "github.com/ClusterCockpit/cc-lib/schema" -) - -// Default buffer capacity. -// `buffer.data` will only ever grow up to it's capacity and a new link -// in the buffer chain will be created if needed so that no copying -// of data or reallocation needs to happen on writes. -const ( - BufferCap int = 512 -) - -// So that we can reuse allocations -var bufferPool sync.Pool = sync.Pool{ - New: func() any { - return &buffer{ - data: make([]schema.Float, 0, BufferCap), - } - }, -} - -var ( - ErrNoData error = errors.New("[METRICSTORE]> no data for this metric/level") - ErrDataDoesNotAlign error = errors.New("[METRICSTORE]> data from lower granularities does not align") -) - -// Each metric on each level has it's own buffer. -// This is where the actual values go. -// If `cap(data)` is reached, a new buffer is created and -// becomes the new head of a buffer list. -type buffer struct { - prev *buffer - next *buffer - data []schema.Float - frequency int64 - start int64 - archived bool - closed bool -} - -func newBuffer(ts, freq int64) *buffer { - b := bufferPool.Get().(*buffer) - b.frequency = freq - b.start = ts - (freq / 2) - b.prev = nil - b.next = nil - b.archived = false - b.closed = false - b.data = b.data[:0] - return b -} - -// If a new buffer was created, the new head is returnd. -// Otherwise, the existing buffer is returnd. 
-// Normaly, only "newer" data should be written, but if the value would -// end up in the same buffer anyways it is allowed. -func (b *buffer) write(ts int64, value schema.Float) (*buffer, error) { - if ts < b.start { - return nil, errors.New("[METRICSTORE]> cannot write value to buffer from past") - } - - // idx := int((ts - b.start + (b.frequency / 3)) / b.frequency) - idx := int((ts - b.start) / b.frequency) - if idx >= cap(b.data) { - newbuf := newBuffer(ts, b.frequency) - newbuf.prev = b - b.next = newbuf - b.close() - b = newbuf - idx = 0 - } - - // Overwriting value or writing value from past - if idx < len(b.data) { - b.data[idx] = value - return b, nil - } - - // Fill up unwritten slots with NaN - for i := len(b.data); i < idx; i++ { - b.data = append(b.data, schema.NaN) - } - - b.data = append(b.data, value) - return b, nil -} - -func (b *buffer) end() int64 { - return b.firstWrite() + int64(len(b.data))*b.frequency -} - -func (b *buffer) firstWrite() int64 { - return b.start + (b.frequency / 2) -} - -func (b *buffer) close() {} - -// Return all known values from `from` to `to`. Gaps of information are represented as NaN. -// Simple linear interpolation is done between the two neighboring cells if possible. -// If values at the start or end are missing, instead of NaN values, the second and thrid -// return values contain the actual `from`/`to`. -// This function goes back the buffer chain if `from` is older than the currents buffer start. -// The loaded values are added to `data` and `data` is returned, possibly with a shorter length. -// If `data` is not long enough to hold all values, this function will panic! 
-func (b *buffer) read(from, to int64, data []schema.Float) ([]schema.Float, int64, int64, error) { - if from < b.firstWrite() { - if b.prev != nil { - return b.prev.read(from, to, data) - } - from = b.firstWrite() - } - - i := 0 - t := from - for ; t < to; t += b.frequency { - idx := int((t - b.start) / b.frequency) - if idx >= cap(b.data) { - if b.next == nil { - break - } - b = b.next - idx = 0 - } - - if idx >= len(b.data) { - if b.next == nil || to <= b.next.start { - break - } - data[i] += schema.NaN - } else if t < b.start { - data[i] += schema.NaN - // } else if b.data[idx].IsNaN() { - // data[i] += interpolate(idx, b.data) - } else { - data[i] += b.data[idx] - } - i++ - } - - return data[:i], from, t, nil -} - -// Returns true if this buffer needs to be freed. -func (b *buffer) free(t int64) (delme bool, n int) { - if b.prev != nil { - delme, m := b.prev.free(t) - n += m - if delme { - b.prev.next = nil - if cap(b.prev.data) == BufferCap { - bufferPool.Put(b.prev) - } - b.prev = nil - } - } - - end := b.end() - if end < t { - return true, n + 1 - } - - return false, n -} - -// Call `callback` on every buffer that contains data in the range from `from` to `to`. -func (b *buffer) iterFromTo(from, to int64, callback func(b *buffer) error) error { - if b == nil { - return nil - } - - if err := b.prev.iterFromTo(from, to, callback); err != nil { - return err - } - - if from <= b.end() && b.start <= to { - return callback(b) - } - - return nil -} - -func (b *buffer) count() int64 { - res := int64(len(b.data)) - if b.prev != nil { - res += b.prev.count() - } - return res -} diff --git a/internal/memorystore/checkpoint.go b/internal/memorystore/checkpoint.go deleted file mode 100644 index e19cbf76..00000000 --- a/internal/memorystore/checkpoint.go +++ /dev/null @@ -1,783 +0,0 @@ -// Copyright (C) NHR@FAU, University Erlangen-Nuremberg. -// All rights reserved. This file is part of cc-backend. 
-// Use of this source code is governed by a MIT-style -// license that can be found in the LICENSE file. - -package memorystore - -import ( - "bufio" - "context" - "encoding/json" - "errors" - "fmt" - "io/fs" - "os" - "path" - "path/filepath" - "runtime" - "sort" - "strconv" - "strings" - "sync" - "sync/atomic" - "time" - - cclog "github.com/ClusterCockpit/cc-lib/ccLogger" - "github.com/ClusterCockpit/cc-lib/schema" - "github.com/linkedin/goavro/v2" -) - -// File operation constants -const ( - // CheckpointFilePerms defines default permissions for checkpoint files - CheckpointFilePerms = 0o644 - // CheckpointDirPerms defines default permissions for checkpoint directories - CheckpointDirPerms = 0o755 - // GCTriggerInterval determines how often GC is forced during checkpoint loading - // GC is triggered every GCTriggerInterval*NumWorkers loaded hosts - GCTriggerInterval = 100 -) - -// Whenever changed, update MarshalJSON as well! -type CheckpointMetrics struct { - Data []schema.Float `json:"data"` - Frequency int64 `json:"frequency"` - Start int64 `json:"start"` -} - -type CheckpointFile struct { - Metrics map[string]*CheckpointMetrics `json:"metrics"` - Children map[string]*CheckpointFile `json:"children"` - From int64 `json:"from"` - To int64 `json:"to"` -} - -var lastCheckpoint time.Time - -func Checkpointing(wg *sync.WaitGroup, ctx context.Context) { - lastCheckpoint = time.Now() - - if Keys.Checkpoints.FileFormat == "json" { - ms := GetMemoryStore() - - go func() { - defer wg.Done() - d, err := time.ParseDuration(Keys.Checkpoints.Interval) - if err != nil { - cclog.Fatal(err) - } - if d <= 0 { - return - } - - ticks := func() <-chan time.Time { - if d <= 0 { - return nil - } - return time.NewTicker(d).C - }() - for { - select { - case <-ctx.Done(): - return - case <-ticks: - cclog.Infof("[METRICSTORE]> start checkpointing (starting at %s)...", lastCheckpoint.Format(time.RFC3339)) - now := time.Now() - n, err := ms.ToCheckpoint(Keys.Checkpoints.RootDir, - 
lastCheckpoint.Unix(), now.Unix()) - if err != nil { - cclog.Errorf("[METRICSTORE]> checkpointing failed: %s", err.Error()) - } else { - cclog.Infof("[METRICSTORE]> done: %d checkpoint files created", n) - lastCheckpoint = now - } - } - } - }() - } else { - go func() { - defer wg.Done() - d, _ := time.ParseDuration("1m") - - select { - case <-ctx.Done(): - return - case <-time.After(time.Duration(CheckpointBufferMinutes) * time.Minute): - // This is the first tick untill we collect the data for given minutes. - GetAvroStore().ToCheckpoint(Keys.Checkpoints.RootDir, false) - // log.Printf("Checkpointing %d avro files", count) - - } - - ticks := func() <-chan time.Time { - if d <= 0 { - return nil - } - return time.NewTicker(d).C - }() - - for { - select { - case <-ctx.Done(): - return - case <-ticks: - // Regular ticks of 1 minute to write data. - GetAvroStore().ToCheckpoint(Keys.Checkpoints.RootDir, false) - // log.Printf("Checkpointing %d avro files", count) - } - } - }() - } -} - -// As `Float` implements a custom MarshalJSON() function, -// serializing an array of such types has more overhead -// than one would assume (because of extra allocations, interfaces and so on). -func (cm *CheckpointMetrics) MarshalJSON() ([]byte, error) { - buf := make([]byte, 0, 128+len(cm.Data)*8) - buf = append(buf, `{"frequency":`...) - buf = strconv.AppendInt(buf, cm.Frequency, 10) - buf = append(buf, `,"start":`...) - buf = strconv.AppendInt(buf, cm.Start, 10) - buf = append(buf, `,"data":[`...) - for i, x := range cm.Data { - if i != 0 { - buf = append(buf, ',') - } - if x.IsNaN() { - buf = append(buf, `null`...) - } else { - buf = strconv.AppendFloat(buf, float64(x), 'f', 1, 32) - } - } - buf = append(buf, `]}`...) - return buf, nil -} - -// Metrics stored at the lowest 2 levels are not stored away (root and cluster)! -// On a per-host basis a new JSON file is created. I have no idea if this will scale. 
-// The good thing: Only a host at a time is locked, so this function can run -// in parallel to writes/reads. -func (m *MemoryStore) ToCheckpoint(dir string, from, to int64) (int, error) { - levels := make([]*Level, 0) - selectors := make([][]string, 0) - m.root.lock.RLock() - for sel1, l1 := range m.root.children { - l1.lock.RLock() - for sel2, l2 := range l1.children { - levels = append(levels, l2) - selectors = append(selectors, []string{sel1, sel2}) - } - l1.lock.RUnlock() - } - m.root.lock.RUnlock() - - type workItem struct { - level *Level - dir string - selector []string - } - - n, errs := int32(0), int32(0) - - var wg sync.WaitGroup - wg.Add(Keys.NumWorkers) - work := make(chan workItem, Keys.NumWorkers*2) - for worker := 0; worker < Keys.NumWorkers; worker++ { - go func() { - defer wg.Done() - - for workItem := range work { - if err := workItem.level.toCheckpoint(workItem.dir, from, to, m); err != nil { - if err == ErrNoNewArchiveData { - continue - } - - cclog.Errorf("[METRICSTORE]> error while checkpointing %#v: %s", workItem.selector, err.Error()) - atomic.AddInt32(&errs, 1) - } else { - atomic.AddInt32(&n, 1) - } - } - }() - } - - for i := 0; i < len(levels); i++ { - dir := path.Join(dir, path.Join(selectors[i]...)) - work <- workItem{ - level: levels[i], - dir: dir, - selector: selectors[i], - } - } - - close(work) - wg.Wait() - - if errs > 0 { - return int(n), fmt.Errorf("[METRICSTORE]> %d errors happened while creating checkpoints (%d successes)", errs, n) - } - return int(n), nil -} - -func (l *Level) toCheckpointFile(from, to int64, m *MemoryStore) (*CheckpointFile, error) { - l.lock.RLock() - defer l.lock.RUnlock() - - retval := &CheckpointFile{ - From: from, - To: to, - Metrics: make(map[string]*CheckpointMetrics), - Children: make(map[string]*CheckpointFile), - } - - for metric, minfo := range m.Metrics { - b := l.metrics[minfo.offset] - if b == nil { - continue - } - - allArchived := true - b.iterFromTo(from, to, func(b *buffer) error { - if 
!b.archived { - allArchived = false - } - return nil - }) - - if allArchived { - continue - } - - data := make([]schema.Float, (to-from)/b.frequency+1) - data, start, end, err := b.read(from, to, data) - if err != nil { - return nil, err - } - - for i := int((end - start) / b.frequency); i < len(data); i++ { - data[i] = schema.NaN - } - - retval.Metrics[metric] = &CheckpointMetrics{ - Frequency: b.frequency, - Start: start, - Data: data, - } - } - - for name, child := range l.children { - val, err := child.toCheckpointFile(from, to, m) - if err != nil { - return nil, err - } - - if val != nil { - retval.Children[name] = val - } - } - - if len(retval.Children) == 0 && len(retval.Metrics) == 0 { - return nil, nil - } - - return retval, nil -} - -func (l *Level) toCheckpoint(dir string, from, to int64, m *MemoryStore) error { - cf, err := l.toCheckpointFile(from, to, m) - if err != nil { - return err - } - - if cf == nil { - return ErrNoNewArchiveData - } - - filepath := path.Join(dir, fmt.Sprintf("%d.json", from)) - f, err := os.OpenFile(filepath, os.O_CREATE|os.O_WRONLY, CheckpointFilePerms) - if err != nil && os.IsNotExist(err) { - err = os.MkdirAll(dir, CheckpointDirPerms) - if err == nil { - f, err = os.OpenFile(filepath, os.O_CREATE|os.O_WRONLY, CheckpointFilePerms) - } - } - if err != nil { - return err - } - defer f.Close() - - bw := bufio.NewWriter(f) - if err = json.NewEncoder(bw).Encode(cf); err != nil { - return err - } - - return bw.Flush() -} - -func (m *MemoryStore) FromCheckpoint(dir string, from int64, extension string) (int, error) { - var wg sync.WaitGroup - work := make(chan [2]string, Keys.NumWorkers) - n, errs := int32(0), int32(0) - - wg.Add(Keys.NumWorkers) - for worker := 0; worker < Keys.NumWorkers; worker++ { - go func() { - defer wg.Done() - for host := range work { - lvl := m.root.findLevelOrCreate(host[:], len(m.Metrics)) - nn, err := lvl.fromCheckpoint(m, filepath.Join(dir, host[0], host[1]), from, extension) - if err != nil { - 
cclog.Fatalf("[METRICSTORE]> error while loading checkpoints: %s", err.Error()) - atomic.AddInt32(&errs, 1) - } - atomic.AddInt32(&n, int32(nn)) - } - }() - } - - i := 0 - clustersDir, err := os.ReadDir(dir) - for _, clusterDir := range clustersDir { - if !clusterDir.IsDir() { - err = errors.New("[METRICSTORE]> expected only directories at first level of checkpoints/ directory") - goto done - } - - hostsDir, e := os.ReadDir(filepath.Join(dir, clusterDir.Name())) - if e != nil { - err = e - goto done - } - - for _, hostDir := range hostsDir { - if !hostDir.IsDir() { - err = errors.New("[METRICSTORE]> expected only directories at second level of checkpoints/ directory") - goto done - } - - i++ - if i%Keys.NumWorkers == 0 && i > GCTriggerInterval { - // Forcing garbage collection runs here regulary during the loading of checkpoints - // will decrease the total heap size after loading everything back to memory is done. - // While loading data, the heap will grow fast, so the GC target size will double - // almost always. By forcing GCs here, we can keep it growing more slowly so that - // at the end, less memory is wasted. - runtime.GC() - } - - work <- [2]string{clusterDir.Name(), hostDir.Name()} - } - } -done: - close(work) - wg.Wait() - - if err != nil { - return int(n), err - } - - if errs > 0 { - return int(n), fmt.Errorf("[METRICSTORE]> %d errors happened while creating checkpoints (%d successes)", errs, n) - } - return int(n), nil -} - -// Metrics stored at the lowest 2 levels are not loaded (root and cluster)! -// This function can only be called once and before the very first write or read. -// Different host's data is loaded to memory in parallel. 
-func (m *MemoryStore) FromCheckpointFiles(dir string, from int64) (int, error) { - if _, err := os.Stat(dir); os.IsNotExist(err) { - // The directory does not exist, so create it using os.MkdirAll() - err := os.MkdirAll(dir, CheckpointDirPerms) // CheckpointDirPerms sets the permissions for the directory - if err != nil { - cclog.Fatalf("[METRICSTORE]> Error creating directory: %#v\n", err) - } - cclog.Debugf("[METRICSTORE]> %#v Directory created successfully", dir) - } - - // Config read (replace with your actual config read) - fileFormat := Keys.Checkpoints.FileFormat - if fileFormat == "" { - fileFormat = "avro" - } - - // Map to easily get the fallback format - oppositeFormat := map[string]string{ - "json": "avro", - "avro": "json", - } - - // First, attempt to load the specified format - if found, err := checkFilesWithExtension(dir, fileFormat); err != nil { - return 0, fmt.Errorf("[METRICSTORE]> error checking files with extension: %v", err) - } else if found { - cclog.Infof("[METRICSTORE]> Loading %s files because fileformat is %s", fileFormat, fileFormat) - return m.FromCheckpoint(dir, from, fileFormat) - } - - // If not found, attempt the opposite format - altFormat := oppositeFormat[fileFormat] - if found, err := checkFilesWithExtension(dir, altFormat); err != nil { - return 0, fmt.Errorf("[METRICSTORE]> error checking files with extension: %v", err) - } else if found { - cclog.Infof("[METRICSTORE]> Loading %s files but fileformat is %s", altFormat, fileFormat) - return m.FromCheckpoint(dir, from, altFormat) - } - - cclog.Print("[METRICSTORE]> No valid checkpoint files found in the directory") - return 0, nil -} - -func checkFilesWithExtension(dir string, extension string) (bool, error) { - found := false - - err := filepath.Walk(dir, func(path string, info os.FileInfo, err error) error { - if err != nil { - return fmt.Errorf("[METRICSTORE]> error accessing path %s: %v", path, err) - } - if !info.IsDir() && filepath.Ext(info.Name()) == "."+extension { - 
found = true - return nil - } - return nil - }) - if err != nil { - return false, fmt.Errorf("[METRICSTORE]> error walking through directories: %s", err) - } - - return found, nil -} - -func (l *Level) loadAvroFile(m *MemoryStore, f *os.File, from int64) error { - br := bufio.NewReader(f) - - fileName := f.Name()[strings.LastIndex(f.Name(), "/")+1:] - resolution, err := strconv.ParseInt(fileName[0:strings.Index(fileName, "_")], 10, 64) - if err != nil { - return fmt.Errorf("[METRICSTORE]> error while reading avro file (resolution parsing) : %s", err) - } - - fromTimestamp, err := strconv.ParseInt(fileName[strings.Index(fileName, "_")+1:len(fileName)-5], 10, 64) - - // Same logic according to lineprotocol - fromTimestamp -= (resolution / 2) - - if err != nil { - return fmt.Errorf("[METRICSTORE]> error converting timestamp from the avro file : %s", err) - } - - // fmt.Printf("File : %s with resolution : %d\n", fileName, resolution) - - var recordCounter int64 = 0 - - // Create a new OCF reader from the buffered reader - ocfReader, err := goavro.NewOCFReader(br) - if err != nil { - return fmt.Errorf("[METRICSTORE]> error creating OCF reader: %w", err) - } - - metricsData := make(map[string]schema.FloatArray) - - for ocfReader.Scan() { - datum, err := ocfReader.Read() - if err != nil { - return fmt.Errorf("[METRICSTORE]> error while reading avro file : %s", err) - } - - record, ok := datum.(map[string]any) - if !ok { - return fmt.Errorf("[METRICSTORE]> failed to assert datum as map[string]interface{}") - } - - for key, value := range record { - metricsData[key] = append(metricsData[key], schema.ConvertToFloat(value.(float64))) - } - - recordCounter += 1 - } - - to := (fromTimestamp + (recordCounter / (60 / resolution) * 60)) - if to < from { - return nil - } - - for key, floatArray := range metricsData { - metricName := ReplaceKey(key) - - if strings.Contains(metricName, Delimiter) { - subString := strings.Split(metricName, Delimiter) - - lvl := l - - for i := 0; i < 
len(subString)-1; i++ { - - sel := subString[i] - - if lvl.children == nil { - lvl.children = make(map[string]*Level) - } - - child, ok := lvl.children[sel] - if !ok { - child = &Level{ - metrics: make([]*buffer, len(m.Metrics)), - children: nil, - } - lvl.children[sel] = child - } - lvl = child - } - - leafMetricName := subString[len(subString)-1] - err = lvl.createBuffer(m, leafMetricName, floatArray, fromTimestamp, resolution) - if err != nil { - return fmt.Errorf("[METRICSTORE]> error while creating buffers from avroReader : %s", err) - } - } else { - err = l.createBuffer(m, metricName, floatArray, fromTimestamp, resolution) - if err != nil { - return fmt.Errorf("[METRICSTORE]> error while creating buffers from avroReader : %s", err) - } - } - - } - - return nil -} - -func (l *Level) createBuffer(m *MemoryStore, metricName string, floatArray schema.FloatArray, from int64, resolution int64) error { - n := len(floatArray) - b := &buffer{ - frequency: resolution, - start: from, - data: floatArray[0:n:n], - prev: nil, - next: nil, - archived: true, - } - b.close() - - minfo, ok := m.Metrics[metricName] - if !ok { - return nil - // return errors.New("Unkown metric: " + name) - } - - prev := l.metrics[minfo.offset] - if prev == nil { - l.metrics[minfo.offset] = b - } else { - if prev.start > b.start { - return fmt.Errorf("[METRICSTORE]> buffer start time %d is before previous buffer start %d", b.start, prev.start) - } - - b.prev = prev - prev.next = b - - missingCount := ((int(b.start) - int(prev.start)) - len(prev.data)*int(b.frequency)) - if missingCount > 0 { - missingCount /= int(b.frequency) - - for range missingCount { - prev.data = append(prev.data, schema.NaN) - } - - prev.data = prev.data[0:len(prev.data):len(prev.data)] - } - } - l.metrics[minfo.offset] = b - - return nil -} - -func (l *Level) loadJSONFile(m *MemoryStore, f *os.File, from int64) error { - br := bufio.NewReader(f) - cf := &CheckpointFile{} - if err := json.NewDecoder(br).Decode(cf); err != 
nil { - return err - } - - if cf.To != 0 && cf.To < from { - return nil - } - - if err := l.loadFile(cf, m); err != nil { - return err - } - - return nil -} - -func (l *Level) loadFile(cf *CheckpointFile, m *MemoryStore) error { - for name, metric := range cf.Metrics { - n := len(metric.Data) - b := &buffer{ - frequency: metric.Frequency, - start: metric.Start, - data: metric.Data[0:n:n], // Space is wasted here :( - prev: nil, - next: nil, - archived: true, - } - b.close() - - minfo, ok := m.Metrics[name] - if !ok { - continue - // return errors.New("Unkown metric: " + name) - } - - prev := l.metrics[minfo.offset] - if prev == nil { - l.metrics[minfo.offset] = b - } else { - if prev.start > b.start { - return fmt.Errorf("[METRICSTORE]> buffer start time %d is before previous buffer start %d", b.start, prev.start) - } - - b.prev = prev - prev.next = b - } - l.metrics[minfo.offset] = b - } - - if len(cf.Children) > 0 && l.children == nil { - l.children = make(map[string]*Level) - } - - for sel, childCf := range cf.Children { - child, ok := l.children[sel] - if !ok { - child = &Level{ - metrics: make([]*buffer, len(m.Metrics)), - children: nil, - } - l.children[sel] = child - } - - if err := child.loadFile(childCf, m); err != nil { - return err - } - } - - return nil -} - -func (l *Level) fromCheckpoint(m *MemoryStore, dir string, from int64, extension string) (int, error) { - direntries, err := os.ReadDir(dir) - if err != nil { - if os.IsNotExist(err) { - return 0, nil - } - - return 0, err - } - - allFiles := make([]fs.DirEntry, 0) - filesLoaded := 0 - for _, e := range direntries { - if e.IsDir() { - child := &Level{ - metrics: make([]*buffer, len(m.Metrics)), - children: make(map[string]*Level), - } - - files, err := child.fromCheckpoint(m, path.Join(dir, e.Name()), from, extension) - filesLoaded += files - if err != nil { - return filesLoaded, err - } - - l.children[e.Name()] = child - } else if strings.HasSuffix(e.Name(), "."+extension) { - allFiles = 
append(allFiles, e) - } else { - continue - } - } - - files, err := findFiles(allFiles, from, extension, true) - if err != nil { - return filesLoaded, err - } - - loaders := map[string]func(*MemoryStore, *os.File, int64) error{ - "json": l.loadJSONFile, - "avro": l.loadAvroFile, - } - - loader := loaders[extension] - - for _, filename := range files { - // Use a closure to ensure file is closed immediately after use - err := func() error { - f, err := os.Open(path.Join(dir, filename)) - if err != nil { - return err - } - defer f.Close() - - return loader(m, f, from) - }() - if err != nil { - return filesLoaded, err - } - - filesLoaded += 1 - } - - return filesLoaded, nil -} - -// This will probably get very slow over time! -// A solution could be some sort of an index file in which all other files -// and the timespan they contain is listed. -func findFiles(direntries []fs.DirEntry, t int64, extension string, findMoreRecentFiles bool) ([]string, error) { - nums := map[string]int64{} - for _, e := range direntries { - if !strings.HasSuffix(e.Name(), "."+extension) { - continue - } - - ts, err := strconv.ParseInt(e.Name()[strings.Index(e.Name(), "_")+1:len(e.Name())-5], 10, 64) - if err != nil { - return nil, err - } - nums[e.Name()] = ts - } - - sort.Slice(direntries, func(i, j int) bool { - a, b := direntries[i], direntries[j] - return nums[a.Name()] < nums[b.Name()] - }) - - filenames := make([]string, 0) - for i := range direntries { - e := direntries[i] - ts1 := nums[e.Name()] - - if findMoreRecentFiles && t <= ts1 { - filenames = append(filenames, e.Name()) - } - if i == len(direntries)-1 { - continue - } - - enext := direntries[i+1] - ts2 := nums[enext.Name()] - - if findMoreRecentFiles { - if ts1 < t && t < ts2 { - filenames = append(filenames, e.Name()) - } - } else { - if ts2 < t { - filenames = append(filenames, e.Name()) - } - } - } - - return filenames, nil -} diff --git a/internal/memorystore/config.go b/internal/memorystore/config.go deleted file mode 
100644 index 8196ed69..00000000 --- a/internal/memorystore/config.go +++ /dev/null @@ -1,121 +0,0 @@ -// Copyright (C) NHR@FAU, University Erlangen-Nuremberg. -// All rights reserved. This file is part of cc-backend. -// Use of this source code is governed by a MIT-style -// license that can be found in the LICENSE file. - -package memorystore - -import ( - "fmt" -) - -var InternalCCMSFlag bool = false - -type MetricStoreConfig struct { - // Number of concurrent workers for checkpoint and archive operations. - // If not set or 0, defaults to min(runtime.NumCPU()/2+1, 10) - NumWorkers int `json:"num-workers"` - Checkpoints struct { - FileFormat string `json:"file-format"` - Interval string `json:"interval"` - RootDir string `json:"directory"` - Restore string `json:"restore"` - } `json:"checkpoints"` - Debug struct { - DumpToFile string `json:"dump-to-file"` - EnableGops bool `json:"gops"` - } `json:"debug"` - RetentionInMemory string `json:"retention-in-memory"` - Archive struct { - Interval string `json:"interval"` - RootDir string `json:"directory"` - DeleteInstead bool `json:"delete-instead"` - } `json:"archive"` - Nats []*NatsConfig `json:"nats"` -} - -type NatsConfig struct { - // Address of the nats server - Address string `json:"address"` - - // Username/Password, optional - Username string `json:"username"` - Password string `json:"password"` - - // Creds file path - Credsfilepath string `json:"creds-file-path"` - - Subscriptions []struct { - // Channel name - SubscribeTo string `json:"subscribe-to"` - - // Allow lines without a cluster tag, use this as default, optional - ClusterTag string `json:"cluster-tag"` - } `json:"subscriptions"` -} - -var Keys MetricStoreConfig - -// AggregationStrategy for aggregation over multiple values at different cpus/sockets/..., not time! 
-type AggregationStrategy int - -const ( - NoAggregation AggregationStrategy = iota - SumAggregation - AvgAggregation -) - -func AssignAggregationStrategy(str string) (AggregationStrategy, error) { - switch str { - case "": - return NoAggregation, nil - case "sum": - return SumAggregation, nil - case "avg": - return AvgAggregation, nil - default: - return NoAggregation, fmt.Errorf("[METRICSTORE]> unknown aggregation strategy: %s", str) - } -} - -type MetricConfig struct { - // Interval in seconds at which measurements are stored - Frequency int64 - - // Can be 'sum', 'avg' or null. Describes how to aggregate metrics from the same timestep over the hierarchy. - Aggregation AggregationStrategy - - // Private, used internally... - offset int -} - -var Metrics map[string]MetricConfig - -func GetMetricFrequency(metricName string) (int64, error) { - if metric, ok := Metrics[metricName]; ok { - return metric.Frequency, nil - } - return 0, fmt.Errorf("[METRICSTORE]> metric %s not found", metricName) -} - -// AddMetric adds logic to add metrics. Redundant metrics should be updated with max frequency. -// use metric.Name to check if the metric already exists. -// if not, add it to the Metrics map. -func AddMetric(name string, metric MetricConfig) error { - if Metrics == nil { - Metrics = make(map[string]MetricConfig, 0) - } - - if existingMetric, ok := Metrics[name]; ok { - if existingMetric.Frequency != metric.Frequency { - if existingMetric.Frequency < metric.Frequency { - existingMetric.Frequency = metric.Frequency - Metrics[name] = existingMetric - } - } - } else { - Metrics[name] = metric - } - - return nil -} diff --git a/internal/memorystore/configSchema.go b/internal/memorystore/configSchema.go deleted file mode 100644 index 2616edc6..00000000 --- a/internal/memorystore/configSchema.go +++ /dev/null @@ -1,95 +0,0 @@ -// Copyright (C) NHR@FAU, University Erlangen-Nuremberg. -// All rights reserved. This file is part of cc-backend. 
-// Use of this source code is governed by a MIT-style -// license that can be found in the LICENSE file. - -package memorystore - -const configSchema = `{ - "type": "object", - "description": "Configuration specific to built-in metric-store.", - "properties": { - "checkpoints": { - "description": "Configuration for checkpointing the metrics within metric-store", - "type": "object", - "properties": { - "file-format": { - "description": "Specify the type of checkpoint file. There are 2 variants: 'avro' and 'json'. If nothing is specified, 'avro' is default.", - "type": "string" - }, - "interval": { - "description": "Interval at which the metrics should be checkpointed.", - "type": "string" - }, - "directory": { - "description": "Specify the parent directy in which the checkpointed files should be placed.", - "type": "string" - }, - "restore": { - "description": "When cc-backend starts up, look for checkpointed files that are less than X hours old and load metrics from these selected checkpoint files.", - "type": "string" - } - } - }, - "archive": { - "description": "Configuration for archiving the already checkpointed files.", - "type": "object", - "properties": { - "interval": { - "description": "Interval at which the checkpointed files should be archived.", - "type": "string" - }, - "directory": { - "description": "Specify the parent directy in which the archived files should be placed.", - "type": "string" - } - } - }, - "retention-in-memory": { - "description": "Keep the metrics within memory for given time interval. 
Retention for X hours, then the metrics would be freed.", - "type": "string" - }, - "nats": { - "description": "Configuration for accepting published data through NATS.", - "type": "array", - "items": { - "type": "object", - "properties": { - "address": { - "description": "Address of the NATS server.", - "type": "string" - }, - "username": { - "description": "Optional: If configured with username/password method.", - "type": "string" - }, - "password": { - "description": "Optional: If configured with username/password method.", - "type": "string" - }, - "creds-file-path": { - "description": "Optional: If configured with Credential File method. Path to your NATS cred file.", - "type": "string" - }, - "subscriptions": { - "description": "Array of various subscriptions. Allows to subscibe to different subjects and publishers.", - "type": "array", - "items": { - "type": "object", - "properties": { - "subscribe-to": { - "description": "Channel name", - "type": "string" - }, - "cluster-tag": { - "description": "Optional: Allow lines without a cluster tag, use this as default", - "type": "string" - } - } - } - } - } - } - } - } -}` diff --git a/internal/memorystore/healthcheck.go b/internal/memorystore/healthcheck.go deleted file mode 100644 index b1052f3b..00000000 --- a/internal/memorystore/healthcheck.go +++ /dev/null @@ -1,92 +0,0 @@ -// Copyright (C) NHR@FAU, University Erlangen-Nuremberg. -// All rights reserved. This file is part of cc-backend. -// Use of this source code is governed by a MIT-style -// license that can be found in the LICENSE file. - -package memorystore - -import ( - "bufio" - "fmt" - "time" -) - -// MaxMissingDataPoints is a threshold that allows a node to be healthy with certain number of data points missing. -// Suppose a node does not receive last 5 data points, then healthCheck endpoint will still say a -// node is healthy. Anything more than 5 missing points in metrics of the node will deem the node unhealthy. 
-const MaxMissingDataPoints int64 = 5 - -// MaxUnhealthyMetrics is a threshold which allows upto certain number of metrics in a node to be unhealthly. -// Works with MaxMissingDataPoints. Say 5 metrics (including submetrics) do not receive the last -// MaxMissingDataPoints data points, then the node will be deemed healthy. Any more metrics that does -// not receive data for MaxMissingDataPoints data points will deem the node unhealthy. -const MaxUnhealthyMetrics int64 = 5 - -func (b *buffer) healthCheck() int64 { - // Check if the buffer is empty - if b.data == nil { - return 1 - } - - bufferEnd := b.start + b.frequency*int64(len(b.data)) - t := time.Now().Unix() - - // Check if the buffer is too old - if t-bufferEnd > MaxMissingDataPoints*b.frequency { - return 1 - } - - return 0 -} - -func (l *Level) healthCheck(m *MemoryStore, count int64) (int64, error) { - l.lock.RLock() - defer l.lock.RUnlock() - - for _, mc := range m.Metrics { - if b := l.metrics[mc.offset]; b != nil { - count += b.healthCheck() - } - } - - for _, lvl := range l.children { - c, err := lvl.healthCheck(m, 0) - if err != nil { - return 0, err - } - count += c - } - - return count, nil -} - -func (m *MemoryStore) HealthCheck(w *bufio.Writer, selector []string) error { - lvl := m.root.findLevel(selector) - if lvl == nil { - return fmt.Errorf("[METRICSTORE]> not found: %#v", selector) - } - - buf := make([]byte, 0, 25) - // buf = append(buf, "{"...) - - var count int64 = 0 - - unhealthyMetricsCount, err := lvl.healthCheck(m, count) - if err != nil { - return err - } - - if unhealthyMetricsCount < MaxUnhealthyMetrics { - buf = append(buf, "Healthy"...) - } else { - buf = append(buf, "Unhealthy"...) - } - - // buf = append(buf, "}\n"...) 
- - if _, err = w.Write(buf); err != nil { - return err - } - - return w.Flush() -} diff --git a/internal/memorystore/level.go b/internal/memorystore/level.go deleted file mode 100644 index aaa12103..00000000 --- a/internal/memorystore/level.go +++ /dev/null @@ -1,192 +0,0 @@ -// Copyright (C) NHR@FAU, University Erlangen-Nuremberg. -// All rights reserved. This file is part of cc-backend. -// Use of this source code is governed by a MIT-style -// license that can be found in the LICENSE file. - -package memorystore - -import ( - "sync" - "unsafe" - - "github.com/ClusterCockpit/cc-lib/util" -) - -// Could also be called "node" as this forms a node in a tree structure. -// Called Level because "node" might be confusing here. -// Can be both a leaf or a inner node. In this tree structue, inner nodes can -// also hold data (in `metrics`). -type Level struct { - children map[string]*Level - metrics []*buffer - lock sync.RWMutex -} - -// Find the correct level for the given selector, creating it if -// it does not exist. Example selector in the context of the -// ClusterCockpit could be: []string{ "emmy", "host123", "cpu0" }. -// This function would probably benefit a lot from `level.children` beeing a `sync.Map`? -func (l *Level) findLevelOrCreate(selector []string, nMetrics int) *Level { - if len(selector) == 0 { - return l - } - - // Allow concurrent reads: - l.lock.RLock() - var child *Level - var ok bool - if l.children == nil { - // Children map needs to be created... - l.lock.RUnlock() - } else { - child, ok = l.children[selector[0]] - l.lock.RUnlock() - if ok { - return child.findLevelOrCreate(selector[1:], nMetrics) - } - } - - // The level does not exist, take write lock for unqiue access: - l.lock.Lock() - // While this thread waited for the write lock, another thread - // could have created the child node. 
- if l.children != nil { - child, ok = l.children[selector[0]] - if ok { - l.lock.Unlock() - return child.findLevelOrCreate(selector[1:], nMetrics) - } - } - - child = &Level{ - metrics: make([]*buffer, nMetrics), - children: nil, - } - - if l.children != nil { - l.children[selector[0]] = child - } else { - l.children = map[string]*Level{selector[0]: child} - } - l.lock.Unlock() - return child.findLevelOrCreate(selector[1:], nMetrics) -} - -func (l *Level) free(t int64) (int, error) { - l.lock.Lock() - defer l.lock.Unlock() - - n := 0 - for i, b := range l.metrics { - if b != nil { - delme, m := b.free(t) - n += m - if delme { - if cap(b.data) == BufferCap { - bufferPool.Put(b) - } - l.metrics[i] = nil - } - } - } - - for _, l := range l.children { - m, err := l.free(t) - n += m - if err != nil { - return n, err - } - } - - return n, nil -} - -func (l *Level) sizeInBytes() int64 { - l.lock.RLock() - defer l.lock.RUnlock() - size := int64(0) - - for _, b := range l.metrics { - if b != nil { - size += b.count() * int64(unsafe.Sizeof(util.Float(0))) - } - } - - for _, child := range l.children { - size += child.sizeInBytes() - } - - return size -} - -func (l *Level) findLevel(selector []string) *Level { - if len(selector) == 0 { - return l - } - - l.lock.RLock() - defer l.lock.RUnlock() - - lvl := l.children[selector[0]] - if lvl == nil { - return nil - } - - return lvl.findLevel(selector[1:]) -} - -func (l *Level) findBuffers(selector util.Selector, offset int, f func(b *buffer) error) error { - l.lock.RLock() - defer l.lock.RUnlock() - - if len(selector) == 0 { - b := l.metrics[offset] - if b != nil { - return f(b) - } - - for _, lvl := range l.children { - err := lvl.findBuffers(nil, offset, f) - if err != nil { - return err - } - } - return nil - } - - sel := selector[0] - if len(sel.String) != 0 && l.children != nil { - lvl, ok := l.children[sel.String] - if ok { - err := lvl.findBuffers(selector[1:], offset, f) - if err != nil { - return err - } - } - return nil 
- } - - if sel.Group != nil && l.children != nil { - for _, key := range sel.Group { - lvl, ok := l.children[key] - if ok { - err := lvl.findBuffers(selector[1:], offset, f) - if err != nil { - return err - } - } - } - return nil - } - - if sel.Any && l.children != nil { - for _, lvl := range l.children { - if err := lvl.findBuffers(selector[1:], offset, f); err != nil { - return err - } - } - return nil - } - - return nil -} diff --git a/internal/memorystore/lineprotocol.go b/internal/memorystore/lineprotocol.go deleted file mode 100644 index 2bbd7eeb..00000000 --- a/internal/memorystore/lineprotocol.go +++ /dev/null @@ -1,351 +0,0 @@ -// Copyright (C) NHR@FAU, University Erlangen-Nuremberg. -// All rights reserved. This file is part of cc-backend. -// Use of this source code is governed by a MIT-style -// license that can be found in the LICENSE file. - -package memorystore - -import ( - "context" - "fmt" - "sync" - "time" - - cclog "github.com/ClusterCockpit/cc-lib/ccLogger" - "github.com/ClusterCockpit/cc-lib/schema" - "github.com/influxdata/line-protocol/v2/lineprotocol" - "github.com/nats-io/nats.go" -) - -// Each connection is handled in it's own goroutine. This is a blocking function. 
-// func ReceiveRaw(ctx context.Context, -// listener net.Listener, -// handleLine func(*lineprotocol.Decoder, string) error, -// ) error { -// var wg sync.WaitGroup - -// wg.Add(1) -// go func() { -// defer wg.Done() -// <-ctx.Done() -// if err := listener.Close(); err != nil { -// log.Printf("listener.Close(): %s", err.Error()) -// } -// }() - -// for { -// conn, err := listener.Accept() -// if err != nil { -// if errors.Is(err, net.ErrClosed) { -// break -// } - -// log.Printf("listener.Accept(): %s", err.Error()) -// } - -// wg.Add(2) -// go func() { -// defer wg.Done() -// defer conn.Close() - -// dec := lineprotocol.NewDecoder(conn) -// connctx, cancel := context.WithCancel(context.Background()) -// defer cancel() -// go func() { -// defer wg.Done() -// select { -// case <-connctx.Done(): -// conn.Close() -// case <-ctx.Done(): -// conn.Close() -// } -// }() - -// if err := handleLine(dec, "default"); err != nil { -// if errors.Is(err, net.ErrClosed) { -// return -// } - -// log.Printf("%s: %s", conn.RemoteAddr().String(), err.Error()) -// errmsg := make([]byte, 128) -// errmsg = append(errmsg, `error: `...) -// errmsg = append(errmsg, err.Error()...) -// errmsg = append(errmsg, '\n') -// conn.Write(errmsg) -// } -// }() -// } - -// wg.Wait() -// return nil -// } - -// ReceiveNats connects to a nats server and subscribes to "updates". This is a -// blocking function. handleLine will be called for each line recieved via -// nats. Send `true` through the done channel for gracefull termination. -func ReceiveNats(conf *(NatsConfig), - ms *MemoryStore, - workers int, - ctx context.Context, -) error { - var opts []nats.Option - if conf.Username != "" && conf.Password != "" { - opts = append(opts, nats.UserInfo(conf.Username, conf.Password)) - } - - if conf.Credsfilepath != "" { - opts = append(opts, nats.UserCredentials(conf.Credsfilepath)) - } - - nc, err := nats.Connect(conf.Address, opts...) 
- if err != nil { - return err - } - defer nc.Close() - - var wg sync.WaitGroup - var subs []*nats.Subscription - - msgs := make(chan *nats.Msg, workers*2) - - for _, sc := range conf.Subscriptions { - clusterTag := sc.ClusterTag - var sub *nats.Subscription - if workers > 1 { - wg.Add(workers) - - for range workers { - go func() { - for m := range msgs { - dec := lineprotocol.NewDecoderWithBytes(m.Data) - if err := DecodeLine(dec, ms, clusterTag); err != nil { - cclog.Errorf("error: %s", err.Error()) - } - } - - wg.Done() - }() - } - - sub, err = nc.Subscribe(sc.SubscribeTo, func(m *nats.Msg) { - msgs <- m - }) - } else { - sub, err = nc.Subscribe(sc.SubscribeTo, func(m *nats.Msg) { - dec := lineprotocol.NewDecoderWithBytes(m.Data) - if err := DecodeLine(dec, ms, clusterTag); err != nil { - cclog.Errorf("error: %s", err.Error()) - } - }) - } - - if err != nil { - return err - } - cclog.Infof("NATS subscription to '%s' on '%s' established", sc.SubscribeTo, conf.Address) - subs = append(subs, sub) - } - - <-ctx.Done() - for _, sub := range subs { - err = sub.Unsubscribe() - if err != nil { - cclog.Errorf("NATS unsubscribe failed: %s", err.Error()) - } - } - close(msgs) - wg.Wait() - - nc.Close() - cclog.Print("NATS connection closed") - return nil -} - -// Place `prefix` in front of `buf` but if possible, -// do that inplace in `buf`. -func reorder(buf, prefix []byte) []byte { - n := len(prefix) - m := len(buf) - if cap(buf) < m+n { - return append(prefix[:n:n], buf...) - } else { - buf = buf[:n+m] - for i := m - 1; i >= 0; i-- { - buf[i+n] = buf[i] - } - for i := range n { - buf[i] = prefix[i] - } - return buf - } -} - -// Decode lines using dec and make write calls to the MemoryStore. -// If a line is missing its cluster tag, use clusterDefault as default. 
-func DecodeLine(dec *lineprotocol.Decoder, - ms *MemoryStore, - clusterDefault string, -) error { - // Reduce allocations in loop: - t := time.Now() - metric, metricBuf := Metric{}, make([]byte, 0, 16) - selector := make([]string, 0, 4) - typeBuf, subTypeBuf := make([]byte, 0, 16), make([]byte, 0) - - // Optimize for the case where all lines in a "batch" are about the same - // cluster and host. By using `WriteToLevel` (level = host), we do not need - // to take the root- and cluster-level lock as often. - var lvl *Level = nil - prevCluster, prevHost := "", "" - - var ok bool - for dec.Next() { - rawmeasurement, err := dec.Measurement() - if err != nil { - return err - } - - // Needs to be copied because another call to dec.* would - // invalidate the returned slice. - metricBuf = append(metricBuf[:0], rawmeasurement...) - - // The go compiler optimizes map[string(byteslice)] lookups: - metric.MetricConfig, ok = ms.Metrics[string(rawmeasurement)] - if !ok { - continue - } - - typeBuf, subTypeBuf := typeBuf[:0], subTypeBuf[:0] - cluster, host := clusterDefault, "" - for { - key, val, err := dec.NextTag() - if err != nil { - return err - } - if key == nil { - break - } - - // The go compiler optimizes string([]byte{...}) == "...": - switch string(key) { - case "cluster": - if string(val) == prevCluster { - cluster = prevCluster - } else { - cluster = string(val) - lvl = nil - } - case "hostname", "host": - if string(val) == prevHost { - host = prevHost - } else { - host = string(val) - lvl = nil - } - case "type": - if string(val) == "node" { - break - } - - // We cannot be sure that the "type" tag comes before the "type-id" tag: - if len(typeBuf) == 0 { - typeBuf = append(typeBuf, val...) - } else { - typeBuf = reorder(typeBuf, val) - } - case "type-id": - typeBuf = append(typeBuf, val...) - case "subtype": - // We cannot be sure that the "subtype" tag comes before the "stype-id" tag: - if len(subTypeBuf) == 0 { - subTypeBuf = append(subTypeBuf, val...) 
- } else { - subTypeBuf = reorder(subTypeBuf, val) - // subTypeBuf = reorder(typeBuf, val) - } - case "stype-id": - subTypeBuf = append(subTypeBuf, val...) - default: - // Ignore unkown tags (cc-metric-collector might send us a unit for example that we do not need) - // return fmt.Errorf("unkown tag: '%s' (value: '%s')", string(key), string(val)) - } - } - - // If the cluster or host changed, the lvl was set to nil - if lvl == nil { - selector = selector[:2] - selector[0], selector[1] = cluster, host - lvl = ms.GetLevel(selector) - prevCluster, prevHost = cluster, host - } - - // subtypes: - selector = selector[:0] - if len(typeBuf) > 0 { - selector = append(selector, string(typeBuf)) // <- Allocation :( - if len(subTypeBuf) > 0 { - selector = append(selector, string(subTypeBuf)) - } - } - - for { - key, val, err := dec.NextField() - if err != nil { - return err - } - - if key == nil { - break - } - - if string(key) != "value" { - return fmt.Errorf("host %s: unknown field: '%s' (value: %#v)", host, string(key), val) - } - - if val.Kind() == lineprotocol.Float { - metric.Value = schema.Float(val.FloatV()) - } else if val.Kind() == lineprotocol.Int { - metric.Value = schema.Float(val.IntV()) - } else if val.Kind() == lineprotocol.Uint { - metric.Value = schema.Float(val.UintV()) - } else { - return fmt.Errorf("host %s: unsupported value type in message: %s", host, val.Kind().String()) - } - } - - if t, err = dec.Time(lineprotocol.Second, t); err != nil { - t = time.Now() - if t, err = dec.Time(lineprotocol.Millisecond, t); err != nil { - t = time.Now() - if t, err = dec.Time(lineprotocol.Microsecond, t); err != nil { - t = time.Now() - if t, err = dec.Time(lineprotocol.Nanosecond, t); err != nil { - return fmt.Errorf("host %s: timestamp : %#v with error : %#v", host, t, err.Error()) - } - } - } - } - - if err != nil { - return fmt.Errorf("host %s: timestamp : %#v with error : %#v", host, t, err.Error()) - } - - time := t.Unix() - - if Keys.Checkpoints.FileFormat != 
"json" { - LineProtocolMessages <- &AvroStruct{ - MetricName: string(metricBuf), - Cluster: cluster, - Node: host, - Selector: append([]string{}, selector...), - Value: metric.Value, - Timestamp: time, - } - } - - if err := ms.WriteToLevel(lvl, selector, time, []Metric{metric}); err != nil { - return err - } - } - return nil -} diff --git a/internal/memorystore/memorystore.go b/internal/memorystore/memorystore.go deleted file mode 100644 index 3e372f34..00000000 --- a/internal/memorystore/memorystore.go +++ /dev/null @@ -1,437 +0,0 @@ -// Copyright (C) NHR@FAU, University Erlangen-Nuremberg. -// All rights reserved. This file is part of cc-backend. -// Use of this source code is governed by a MIT-style -// license that can be found in the LICENSE file. - -// Package memorystore provides an efficient in-memory time-series metric storage system -// with support for hierarchical data organization, checkpointing, and archiving. -// -// The package organizes metrics in a tree structure (cluster → host → component) and -// provides concurrent read/write access to metric data with configurable aggregation strategies. -// Background goroutines handle periodic checkpointing (JSON or Avro format), archiving old data, -// and enforcing retention policies. 
-// -// Key features: -// - In-memory metric storage with configurable retention -// - Hierarchical data organization (selectors) -// - Concurrent checkpoint/archive workers -// - Support for sum and average aggregation -// - NATS integration for metric ingestion -package memorystore - -import ( - "bytes" - "context" - "encoding/json" - "errors" - "runtime" - "sync" - "time" - - "github.com/ClusterCockpit/cc-backend/internal/config" - "github.com/ClusterCockpit/cc-backend/pkg/archive" - cclog "github.com/ClusterCockpit/cc-lib/ccLogger" - "github.com/ClusterCockpit/cc-lib/resampler" - "github.com/ClusterCockpit/cc-lib/schema" - "github.com/ClusterCockpit/cc-lib/util" -) - -var ( - singleton sync.Once - msInstance *MemoryStore - // shutdownFunc stores the context cancellation function created in Init - // and is called during Shutdown to cancel all background goroutines - shutdownFunc context.CancelFunc -) - - - -type Metric struct { - Name string - Value schema.Float - MetricConfig MetricConfig -} - -type MemoryStore struct { - Metrics map[string]MetricConfig - root Level -} - -func Init(rawConfig json.RawMessage, wg *sync.WaitGroup) { - startupTime := time.Now() - - if rawConfig != nil { - config.Validate(configSchema, rawConfig) - dec := json.NewDecoder(bytes.NewReader(rawConfig)) - // dec.DisallowUnknownFields() - if err := dec.Decode(&Keys); err != nil { - cclog.Abortf("[METRICSTORE]> Metric Store Config Init: Could not decode config file '%s'.\nError: %s\n", rawConfig, err.Error()) - } - } - - // Set NumWorkers from config or use default - if Keys.NumWorkers <= 0 { - maxWorkers := 10 - Keys.NumWorkers = min(runtime.NumCPU()/2+1, maxWorkers) - } - cclog.Debugf("[METRICSTORE]> Using %d workers for checkpoint/archive operations\n", Keys.NumWorkers) - - // Helper function to add metric configuration - addMetricConfig := func(mc schema.MetricConfig) { - agg, err := AssignAggregationStrategy(mc.Aggregation) - if err != nil { - cclog.Warnf("Could not find aggregation 
strategy for metric config '%s': %s", mc.Name, err.Error()) - } - - AddMetric(mc.Name, MetricConfig{ - Frequency: int64(mc.Timestep), - Aggregation: agg, - }) - } - - for _, c := range archive.Clusters { - for _, mc := range c.MetricConfig { - addMetricConfig(*mc) - } - - for _, sc := range c.SubClusters { - for _, mc := range sc.MetricConfig { - addMetricConfig(mc) - } - } - } - - // Pass the config.MetricStoreKeys - InitMetrics(Metrics) - - ms := GetMemoryStore() - - d, err := time.ParseDuration(Keys.Checkpoints.Restore) - if err != nil { - cclog.Fatal(err) - } - - restoreFrom := startupTime.Add(-d) - cclog.Infof("[METRICSTORE]> Loading checkpoints newer than %s\n", restoreFrom.Format(time.RFC3339)) - files, err := ms.FromCheckpointFiles(Keys.Checkpoints.RootDir, restoreFrom.Unix()) - loadedData := ms.SizeInBytes() / 1024 / 1024 // In MB - if err != nil { - cclog.Fatalf("[METRICSTORE]> Loading checkpoints failed: %s\n", err.Error()) - } else { - cclog.Infof("[METRICSTORE]> Checkpoints loaded (%d files, %d MB, that took %fs)\n", files, loadedData, time.Since(startupTime).Seconds()) - } - - // Try to use less memory by forcing a GC run here and then - // lowering the target percentage. The default of 100 means - // that only once the ratio of new allocations execeds the - // previously active heap, a GC is triggered. - // Forcing a GC here will set the "previously active heap" - // to a minumum. - runtime.GC() - - ctx, shutdown := context.WithCancel(context.Background()) - - wg.Add(4) - - Retention(wg, ctx) - Checkpointing(wg, ctx) - Archiving(wg, ctx) - DataStaging(wg, ctx) - - // Note: Signal handling has been removed from this function. - // The caller is responsible for handling shutdown signals and calling - // the shutdown() function when appropriate. 
- // Store the shutdown function for later use by Shutdown() - shutdownFunc = shutdown - - if Keys.Nats != nil { - for _, natsConf := range Keys.Nats { - // TODO: When multiple nats configs share a URL, do a single connect. - wg.Add(1) - nc := natsConf - go func() { - // err := ReceiveNats(conf.Nats, decodeLine, runtime.NumCPU()-1, ctx) - err := ReceiveNats(nc, ms, 1, ctx) - if err != nil { - cclog.Fatal(err) - } - wg.Done() - }() - } - } -} - -// InitMetrics creates a new, initialized instance of a MemoryStore. -// Will panic if values in the metric configurations are invalid. -func InitMetrics(metrics map[string]MetricConfig) { - singleton.Do(func() { - offset := 0 - for key, cfg := range metrics { - if cfg.Frequency == 0 { - panic("[METRICSTORE]> invalid frequency") - } - - metrics[key] = MetricConfig{ - Frequency: cfg.Frequency, - Aggregation: cfg.Aggregation, - offset: offset, - } - offset += 1 - } - - msInstance = &MemoryStore{ - root: Level{ - metrics: make([]*buffer, len(metrics)), - children: make(map[string]*Level), - }, - Metrics: metrics, - } - }) -} - -func GetMemoryStore() *MemoryStore { - if msInstance == nil { - cclog.Fatalf("[METRICSTORE]> MemoryStore not initialized!") - } - - return msInstance -} - -func Shutdown() { - // Cancel the context to signal all background goroutines to stop - if shutdownFunc != nil { - shutdownFunc() - } - - cclog.Infof("[METRICSTORE]> Writing to '%s'...\n", Keys.Checkpoints.RootDir) - var files int - var err error - - ms := GetMemoryStore() - - if Keys.Checkpoints.FileFormat == "json" { - files, err = ms.ToCheckpoint(Keys.Checkpoints.RootDir, lastCheckpoint.Unix(), time.Now().Unix()) - } else { - files, err = GetAvroStore().ToCheckpoint(Keys.Checkpoints.RootDir, true) - close(LineProtocolMessages) - } - - if err != nil { - cclog.Errorf("[METRICSTORE]> Writing checkpoint failed: %s\n", err.Error()) - } - cclog.Infof("[METRICSTORE]> Done! 
(%d files written)\n", files) -} - -func getName(m *MemoryStore, i int) string { - for key, val := range m.Metrics { - if val.offset == i { - return key - } - } - return "" -} - -func Retention(wg *sync.WaitGroup, ctx context.Context) { - ms := GetMemoryStore() - - go func() { - defer wg.Done() - d, err := time.ParseDuration(Keys.RetentionInMemory) - if err != nil { - cclog.Fatal(err) - } - if d <= 0 { - return - } - - ticks := func() <-chan time.Time { - d := d / 2 - if d <= 0 { - return nil - } - return time.NewTicker(d).C - }() - for { - select { - case <-ctx.Done(): - return - case <-ticks: - t := time.Now().Add(-d) - cclog.Infof("[METRICSTORE]> start freeing buffers (older than %s)...\n", t.Format(time.RFC3339)) - freed, err := ms.Free(nil, t.Unix()) - if err != nil { - cclog.Errorf("[METRICSTORE]> freeing up buffers failed: %s\n", err.Error()) - } else { - cclog.Infof("[METRICSTORE]> done: %d buffers freed\n", freed) - } - } - } - }() -} - -// Write all values in `metrics` to the level specified by `selector` for time `ts`. -// Look at `findLevelOrCreate` for how selectors work. 
-func (m *MemoryStore) Write(selector []string, ts int64, metrics []Metric) error { - var ok bool - for i, metric := range metrics { - if metric.MetricConfig.Frequency == 0 { - metric.MetricConfig, ok = m.Metrics[metric.Name] - if !ok { - metric.MetricConfig.Frequency = 0 - } - metrics[i] = metric - } - } - - return m.WriteToLevel(&m.root, selector, ts, metrics) -} - -func (m *MemoryStore) GetLevel(selector []string) *Level { - return m.root.findLevelOrCreate(selector, len(m.Metrics)) -} - -// WriteToLevel assumes that `minfo` in `metrics` is filled in -func (m *MemoryStore) WriteToLevel(l *Level, selector []string, ts int64, metrics []Metric) error { - l = l.findLevelOrCreate(selector, len(m.Metrics)) - l.lock.Lock() - defer l.lock.Unlock() - - for _, metric := range metrics { - if metric.MetricConfig.Frequency == 0 { - continue - } - - b := l.metrics[metric.MetricConfig.offset] - if b == nil { - // First write to this metric and level - b = newBuffer(ts, metric.MetricConfig.Frequency) - l.metrics[metric.MetricConfig.offset] = b - } - - nb, err := b.write(ts, metric.Value) - if err != nil { - return err - } - - // Last write created a new buffer... - if b != nb { - l.metrics[metric.MetricConfig.offset] = nb - } - } - return nil -} - -// Read returns all values for metric `metric` from `from` to `to` for the selected level(s). -// If the level does not hold the metric itself, the data will be aggregated recursively from the children. -// The second and third return value are the actual from/to for the data. Those can be different from -// the range asked for if no data was available. 
-func (m *MemoryStore) Read(selector util.Selector, metric string, from, to, resolution int64) ([]schema.Float, int64, int64, int64, error) { - if from > to { - return nil, 0, 0, 0, errors.New("[METRICSTORE]> invalid time range") - } - - minfo, ok := m.Metrics[metric] - if !ok { - return nil, 0, 0, 0, errors.New("[METRICSTORE]> unkown metric: " + metric) - } - - n, data := 0, make([]schema.Float, (to-from)/minfo.Frequency+1) - - err := m.root.findBuffers(selector, minfo.offset, func(b *buffer) error { - cdata, cfrom, cto, err := b.read(from, to, data) - if err != nil { - return err - } - - if n == 0 { - from, to = cfrom, cto - } else if from != cfrom || to != cto || len(data) != len(cdata) { - missingfront, missingback := int((from-cfrom)/minfo.Frequency), int((to-cto)/minfo.Frequency) - if missingfront != 0 { - return ErrDataDoesNotAlign - } - - newlen := len(cdata) - missingback - if newlen < 1 { - return ErrDataDoesNotAlign - } - cdata = cdata[0:newlen] - if len(cdata) != len(data) { - return ErrDataDoesNotAlign - } - - from, to = cfrom, cto - } - - data = cdata - n += 1 - return nil - }) - - if err != nil { - return nil, 0, 0, 0, err - } else if n == 0 { - return nil, 0, 0, 0, errors.New("[METRICSTORE]> metric or host not found") - } else if n > 1 { - if minfo.Aggregation == AvgAggregation { - normalize := 1. / schema.Float(n) - for i := 0; i < len(data); i++ { - data[i] *= normalize - } - } else if minfo.Aggregation != SumAggregation { - return nil, 0, 0, 0, errors.New("[METRICSTORE]> invalid aggregation") - } - } - - data, resolution, err = resampler.LargestTriangleThreeBucket(data, minfo.Frequency, resolution) - if err != nil { - return nil, 0, 0, 0, err - } - - return data, from, to, resolution, nil -} - -// Free releases all buffers for the selected level and all its children that -// contain only values older than `t`. 
-func (m *MemoryStore) Free(selector []string, t int64) (int, error) { - return m.GetLevel(selector).free(t) -} - -func (m *MemoryStore) FreeAll() error { - for k := range m.root.children { - delete(m.root.children, k) - } - - return nil -} - -func (m *MemoryStore) SizeInBytes() int64 { - return m.root.sizeInBytes() -} - -// ListChildren , given a selector, returns a list of all children of the level -// selected. -func (m *MemoryStore) ListChildren(selector []string) []string { - lvl := &m.root - for lvl != nil && len(selector) != 0 { - lvl.lock.RLock() - next := lvl.children[selector[0]] - lvl.lock.RUnlock() - lvl = next - selector = selector[1:] - } - - if lvl == nil { - return nil - } - - lvl.lock.RLock() - defer lvl.lock.RUnlock() - - children := make([]string, 0, len(lvl.children)) - for child := range lvl.children { - children = append(children, child) - } - - return children -} diff --git a/internal/memorystore/memorystore_test.go b/internal/memorystore/memorystore_test.go deleted file mode 100644 index b8ab090a..00000000 --- a/internal/memorystore/memorystore_test.go +++ /dev/null @@ -1,156 +0,0 @@ -// Copyright (C) NHR@FAU, University Erlangen-Nuremberg. -// All rights reserved. This file is part of cc-backend. -// Use of this source code is governed by a MIT-style -// license that can be found in the LICENSE file. 
- -package memorystore - -import ( - "testing" - - "github.com/ClusterCockpit/cc-lib/schema" -) - -func TestAssignAggregationStrategy(t *testing.T) { - tests := []struct { - name string - input string - expected AggregationStrategy - wantErr bool - }{ - {"empty string", "", NoAggregation, false}, - {"sum", "sum", SumAggregation, false}, - {"avg", "avg", AvgAggregation, false}, - {"invalid", "invalid", NoAggregation, true}, - } - - for _, tt := range tests { - t.Run(tt.name, func(t *testing.T) { - result, err := AssignAggregationStrategy(tt.input) - if (err != nil) != tt.wantErr { - t.Errorf("AssignAggregationStrategy(%q) error = %v, wantErr %v", tt.input, err, tt.wantErr) - return - } - if result != tt.expected { - t.Errorf("AssignAggregationStrategy(%q) = %v, want %v", tt.input, result, tt.expected) - } - }) - } -} - -func TestAddMetric(t *testing.T) { - // Reset Metrics before test - Metrics = make(map[string]MetricConfig) - - err := AddMetric("test_metric", MetricConfig{ - Frequency: 60, - Aggregation: SumAggregation, - }) - if err != nil { - t.Errorf("AddMetric() error = %v", err) - } - - if _, ok := Metrics["test_metric"]; !ok { - t.Error("AddMetric() did not add metric to Metrics map") - } - - // Test updating with higher frequency - err = AddMetric("test_metric", MetricConfig{ - Frequency: 120, - Aggregation: SumAggregation, - }) - if err != nil { - t.Errorf("AddMetric() error = %v", err) - } - - if Metrics["test_metric"].Frequency != 120 { - t.Errorf("AddMetric() frequency = %d, want 120", Metrics["test_metric"].Frequency) - } - - // Test updating with lower frequency (should not update) - err = AddMetric("test_metric", MetricConfig{ - Frequency: 30, - Aggregation: SumAggregation, - }) - if err != nil { - t.Errorf("AddMetric() error = %v", err) - } - - if Metrics["test_metric"].Frequency != 120 { - t.Errorf("AddMetric() frequency = %d, want 120 (should not downgrade)", Metrics["test_metric"].Frequency) - } -} - -func TestGetMetricFrequency(t *testing.T) { - 
// Reset Metrics before test - Metrics = map[string]MetricConfig{ - "test_metric": { - Frequency: 60, - Aggregation: SumAggregation, - }, - } - - freq, err := GetMetricFrequency("test_metric") - if err != nil { - t.Errorf("GetMetricFrequency() error = %v", err) - } - if freq != 60 { - t.Errorf("GetMetricFrequency() = %d, want 60", freq) - } - - _, err = GetMetricFrequency("nonexistent") - if err == nil { - t.Error("GetMetricFrequency() expected error for nonexistent metric") - } -} - -func TestBufferWrite(t *testing.T) { - b := newBuffer(100, 10) - - // Test writing value - nb, err := b.write(100, schema.Float(42.0)) - if err != nil { - t.Errorf("buffer.write() error = %v", err) - } - if nb != b { - t.Error("buffer.write() created new buffer unexpectedly") - } - if len(b.data) != 1 { - t.Errorf("buffer.write() len(data) = %d, want 1", len(b.data)) - } - if b.data[0] != schema.Float(42.0) { - t.Errorf("buffer.write() data[0] = %v, want 42.0", b.data[0]) - } - - // Test writing value from past (should error) - _, err = b.write(50, schema.Float(10.0)) - if err == nil { - t.Error("buffer.write() expected error for past timestamp") - } -} - -func TestBufferRead(t *testing.T) { - b := newBuffer(100, 10) - - // Write some test data - b.write(100, schema.Float(1.0)) - b.write(110, schema.Float(2.0)) - b.write(120, schema.Float(3.0)) - - // Read data - data := make([]schema.Float, 3) - result, from, to, err := b.read(100, 130, data) - if err != nil { - t.Errorf("buffer.read() error = %v", err) - } - // Buffer read should return from as firstWrite (start + freq/2) - if from != 100 { - t.Errorf("buffer.read() from = %d, want 100", from) - } - if to != 130 { - t.Errorf("buffer.read() to = %d, want 130", to) - } - if len(result) != 3 { - t.Errorf("buffer.read() len(result) = %d, want 3", len(result)) - } -} diff --git a/internal/metricDataDispatcher/dataLoader.go b/internal/metricDataDispatcher/dataLoader.go deleted file mode 100644 index 780eb73e..00000000 --- 
a/internal/metricDataDispatcher/dataLoader.go +++ /dev/null @@ -1,381 +0,0 @@ -// Copyright (C) NHR@FAU, University Erlangen-Nuremberg. -// All rights reserved. This file is part of cc-backend. -// Use of this source code is governed by a MIT-style -// license that can be found in the LICENSE file. -package metricDataDispatcher - -import ( - "context" - "fmt" - "math" - "time" - - "github.com/ClusterCockpit/cc-backend/internal/config" - "github.com/ClusterCockpit/cc-backend/internal/metricdata" - "github.com/ClusterCockpit/cc-backend/pkg/archive" - cclog "github.com/ClusterCockpit/cc-lib/ccLogger" - "github.com/ClusterCockpit/cc-lib/lrucache" - "github.com/ClusterCockpit/cc-lib/resampler" - "github.com/ClusterCockpit/cc-lib/schema" -) - -var cache *lrucache.Cache = lrucache.New(128 * 1024 * 1024) - -func cacheKey( - job *schema.Job, - metrics []string, - scopes []schema.MetricScope, - resolution int, -) string { - // Duration and StartTime do not need to be in the cache key as StartTime is less unique than - // job.ID and the TTL of the cache entry makes sure it does not stay there forever. - return fmt.Sprintf("%d(%s):[%v],[%v]-%d", - job.ID, job.State, metrics, scopes, resolution) -} - -// Fetches the metric data for a job. 
-func LoadData(job *schema.Job, - metrics []string, - scopes []schema.MetricScope, - ctx context.Context, - resolution int, -) (schema.JobData, error) { - data := cache.Get(cacheKey(job, metrics, scopes, resolution), func() (_ any, ttl time.Duration, size int) { - var jd schema.JobData - var err error - - if job.State == schema.JobStateRunning || - job.MonitoringStatus == schema.MonitoringStatusRunningOrArchiving || - config.Keys.DisableArchive { - - repo, err := metricdata.GetMetricDataRepo(job.Cluster) - if err != nil { - return fmt.Errorf("METRICDATA/METRICDATA > no metric data repository configured for '%s'", job.Cluster), 0, 0 - } - - if scopes == nil { - scopes = append(scopes, schema.MetricScopeNode) - } - - if metrics == nil { - cluster := archive.GetCluster(job.Cluster) - for _, mc := range cluster.MetricConfig { - metrics = append(metrics, mc.Name) - } - } - - jd, err = repo.LoadData(job, metrics, scopes, ctx, resolution) - if err != nil { - if len(jd) != 0 { - cclog.Warnf("partial error: %s", err.Error()) - // return err, 0, 0 // Reactivating will block archiving on one partial error - } else { - cclog.Error("Error while loading job data from metric repository") - return err, 0, 0 - } - } - size = jd.Size() - } else { - var jd_temp schema.JobData - jd_temp, err = archive.GetHandle().LoadJobData(job) - if err != nil { - cclog.Error("Error while loading job data from archive") - return err, 0, 0 - } - - // Deep copy the cached archive hashmap - jd = metricdata.DeepCopy(jd_temp) - - // Resampling for archived data. - // Pass the resolution from frontend here. 
- for _, v := range jd { - for _, v_ := range v { - timestep := int64(0) - for i := 0; i < len(v_.Series); i += 1 { - v_.Series[i].Data, timestep, err = resampler.LargestTriangleThreeBucket(v_.Series[i].Data, int64(v_.Timestep), int64(resolution)) - if err != nil { - return err, 0, 0 - } - } - v_.Timestep = int(timestep) - } - } - - // Avoid sending unrequested data to the client: - if metrics != nil || scopes != nil { - if metrics == nil { - metrics = make([]string, 0, len(jd)) - for k := range jd { - metrics = append(metrics, k) - } - } - - res := schema.JobData{} - for _, metric := range metrics { - if perscope, ok := jd[metric]; ok { - if len(perscope) > 1 { - subset := make(map[schema.MetricScope]*schema.JobMetric) - for _, scope := range scopes { - if jm, ok := perscope[scope]; ok { - subset[scope] = jm - } - } - - if len(subset) > 0 { - perscope = subset - } - } - - res[metric] = perscope - } - } - jd = res - } - size = jd.Size() - } - - ttl = 5 * time.Hour - if job.State == schema.JobStateRunning { - ttl = 2 * time.Minute - } - - // FIXME: Review: Is this really necessary or correct. - // Note: Lines 147-170 formerly known as prepareJobData(jobData, scopes) - // For /monitoring/job/ and some other places, flops_any and mem_bw need - // to be available at the scope 'node'. If a job has a lot of nodes, - // statisticsSeries should be available so that a min/median/max Graph can be - // used instead of a lot of single lines. - // NOTE: New StatsSeries will always be calculated as 'min/median/max' - // Existing (archived) StatsSeries can be 'min/mean/max'! 
- const maxSeriesSize int = 15 - for _, scopes := range jd { - for _, jm := range scopes { - if jm.StatisticsSeries != nil || len(jm.Series) <= maxSeriesSize { - continue - } - - jm.AddStatisticsSeries() - } - } - - nodeScopeRequested := false - for _, scope := range scopes { - if scope == schema.MetricScopeNode { - nodeScopeRequested = true - } - } - - if nodeScopeRequested { - jd.AddNodeScope("flops_any") - jd.AddNodeScope("mem_bw") - } - - // Round Resulting Stat Values - jd.RoundMetricStats() - - return jd, ttl, size - }) - - if err, ok := data.(error); ok { - cclog.Error("Error in returned dataset") - return nil, err - } - - return data.(schema.JobData), nil -} - -// Used for the jobsFootprint GraphQL-Query. TODO: Rename/Generalize. -func LoadAverages( - job *schema.Job, - metrics []string, - data [][]schema.Float, - ctx context.Context, -) error { - if job.State != schema.JobStateRunning && !config.Keys.DisableArchive { - return archive.LoadAveragesFromArchive(job, metrics, data) // #166 change also here? - } - - repo, err := metricdata.GetMetricDataRepo(job.Cluster) - if err != nil { - return fmt.Errorf("METRICDATA/METRICDATA > no metric data repository configured for '%s'", job.Cluster) - } - - stats, err := repo.LoadStats(job, metrics, ctx) // #166 how to handle stats for acc normalizazion? - if err != nil { - cclog.Errorf("Error while loading statistics for job %v (User %v, Project %v)", job.JobID, job.User, job.Project) - return err - } - - for i, m := range metrics { - nodes, ok := stats[m] - if !ok { - data[i] = append(data[i], schema.NaN) - continue - } - - sum := 0.0 - for _, node := range nodes { - sum += node.Avg - } - data[i] = append(data[i], schema.Float(sum)) - } - - return nil -} - -// Used for statsTable in frontend: Return scoped statistics by metric. 
-func LoadScopedJobStats( - job *schema.Job, - metrics []string, - scopes []schema.MetricScope, - ctx context.Context, -) (schema.ScopedJobStats, error) { - if job.State != schema.JobStateRunning && !config.Keys.DisableArchive { - return archive.LoadScopedStatsFromArchive(job, metrics, scopes) - } - - repo, err := metricdata.GetMetricDataRepo(job.Cluster) - if err != nil { - return nil, fmt.Errorf("job %d: no metric data repository configured for '%s'", job.JobID, job.Cluster) - } - - scopedStats, err := repo.LoadScopedStats(job, metrics, scopes, ctx) - if err != nil { - cclog.Errorf("error while loading scoped statistics for job %d (User %s, Project %s)", job.JobID, job.User, job.Project) - return nil, err - } - - return scopedStats, nil -} - -// Used for polar plots in frontend: Aggregates statistics for all nodes to single values for job per metric. -func LoadJobStats( - job *schema.Job, - metrics []string, - ctx context.Context, -) (map[string]schema.MetricStatistics, error) { - if job.State != schema.JobStateRunning && !config.Keys.DisableArchive { - return archive.LoadStatsFromArchive(job, metrics) - } - - data := make(map[string]schema.MetricStatistics, len(metrics)) - repo, err := metricdata.GetMetricDataRepo(job.Cluster) - if err != nil { - return data, fmt.Errorf("job %d: no metric data repository configured for '%s'", job.JobID, job.Cluster) - } - - stats, err := repo.LoadStats(job, metrics, ctx) - if err != nil { - cclog.Errorf("error while loading statistics for job %d (User %s, Project %s)", job.JobID, job.User, job.Project) - return data, err - } - - for _, m := range metrics { - sum, avg, min, max := 0.0, 0.0, 0.0, 0.0 - nodes, ok := stats[m] - if !ok { - data[m] = schema.MetricStatistics{Min: min, Avg: avg, Max: max} - continue - } - - for _, node := range nodes { - sum += node.Avg - min = math.Min(min, node.Min) - max = math.Max(max, node.Max) - } - - data[m] = schema.MetricStatistics{ - Avg: (math.Round((sum/float64(job.NumNodes))*100) / 100), - 
Min: (math.Round(min*100) / 100), - Max: (math.Round(max*100) / 100), - } - } - - return data, nil -} - -// Used for the classic node/system view. Returns a map of nodes to a map of metrics. -func LoadNodeData( - cluster string, - metrics, nodes []string, - scopes []schema.MetricScope, - from, to time.Time, - ctx context.Context, -) (map[string]map[string][]*schema.JobMetric, error) { - repo, err := metricdata.GetMetricDataRepo(cluster) - if err != nil { - return nil, fmt.Errorf("METRICDATA/METRICDATA > no metric data repository configured for '%s'", cluster) - } - - if metrics == nil { - for _, m := range archive.GetCluster(cluster).MetricConfig { - metrics = append(metrics, m.Name) - } - } - - data, err := repo.LoadNodeData(cluster, metrics, nodes, scopes, from, to, ctx) - if err != nil { - if len(data) != 0 { - cclog.Warnf("partial error: %s", err.Error()) - } else { - cclog.Error("Error while loading node data from metric repository") - return nil, err - } - } - - if data == nil { - return nil, fmt.Errorf("METRICDATA/METRICDATA > the metric data repository for '%s' does not support this query", cluster) - } - - return data, nil -} - -func LoadNodeListData( - cluster, subCluster string, - nodes []string, - metrics []string, - scopes []schema.MetricScope, - resolution int, - from, to time.Time, - ctx context.Context, -) (map[string]schema.JobData, error) { - repo, err := metricdata.GetMetricDataRepo(cluster) - if err != nil { - return nil, fmt.Errorf("METRICDATA/METRICDATA > no metric data repository configured for '%s'", cluster) - } - - if metrics == nil { - for _, m := range archive.GetCluster(cluster).MetricConfig { - metrics = append(metrics, m.Name) - } - } - - data, err := repo.LoadNodeListData(cluster, subCluster, nodes, metrics, scopes, resolution, from, to, ctx) - if err != nil { - if len(data) != 0 { - cclog.Warnf("partial error: %s", err.Error()) - } else { - cclog.Error("Error while loading node data from metric repository") - return nil, err - } - } 
- - // NOTE: New StatsSeries will always be calculated as 'min/median/max' - const maxSeriesSize int = 8 - for _, jd := range data { - for _, scopes := range jd { - for _, jm := range scopes { - if jm.StatisticsSeries != nil || len(jm.Series) < maxSeriesSize { - continue - } - jm.AddStatisticsSeries() - } - } - } - - if data == nil { - return nil, fmt.Errorf("METRICDATA/METRICDATA > the metric data repository for '%s' does not support this query", cluster) - } - - return data, nil -} diff --git a/internal/metricdata/cc-metric-store-internal.go b/internal/metricdata/cc-metric-store-internal.go deleted file mode 100644 index 9f0cd74a..00000000 --- a/internal/metricdata/cc-metric-store-internal.go +++ /dev/null @@ -1,1107 +0,0 @@ -// Copyright (C) NHR@FAU, University Erlangen-Nuremberg. -// All rights reserved. This file is part of cc-backend. -// Use of this source code is governed by a MIT-style -// license that can be found in the LICENSE file. - -package metricdata - -import ( - "context" - "encoding/json" - "fmt" - "strconv" - "strings" - "time" - - "github.com/ClusterCockpit/cc-backend/internal/memorystore" - "github.com/ClusterCockpit/cc-backend/pkg/archive" - cclog "github.com/ClusterCockpit/cc-lib/ccLogger" - "github.com/ClusterCockpit/cc-lib/schema" -) - -// Bloat Code -type CCMetricStoreConfigInternal struct { - Kind string `json:"kind"` - Url string `json:"url"` - Token string `json:"token"` - - // If metrics are known to this MetricDataRepository under a different - // name than in the `metricConfig` section of the 'cluster.json', - // provide this optional mapping of local to remote name for this metric. 
- Renamings map[string]string `json:"metricRenamings"` -} - -// Bloat Code -type CCMetricStoreInternal struct{} - -// Bloat Code -func (ccms *CCMetricStoreInternal) Init(rawConfig json.RawMessage) error { - return nil -} - -func (ccms *CCMetricStoreInternal) LoadData( - job *schema.Job, - metrics []string, - scopes []schema.MetricScope, - ctx context.Context, - resolution int, -) (schema.JobData, error) { - queries, assignedScope, err := ccms.buildQueries(job, metrics, scopes, int64(resolution)) - if err != nil { - cclog.Errorf("Error while building queries for jobId %d, Metrics %v, Scopes %v: %s", job.JobID, metrics, scopes, err.Error()) - return nil, err - } - - req := memorystore.APIQueryRequest{ - Cluster: job.Cluster, - From: job.StartTime, - To: job.StartTime + int64(job.Duration), - Queries: queries, - WithStats: true, - WithData: true, - } - - resBody, err := memorystore.FetchData(req) - if err != nil { - cclog.Errorf("Error while fetching data : %s", err.Error()) - return nil, err - } - - var errors []string - jobData := make(schema.JobData) - for i, row := range resBody.Results { - query := req.Queries[i] - metric := query.Metric - scope := assignedScope[i] - mc := archive.GetMetricConfig(job.Cluster, metric) - if _, ok := jobData[metric]; !ok { - jobData[metric] = make(map[schema.MetricScope]*schema.JobMetric) - } - - res := mc.Timestep - if len(row) > 0 { - res = int(row[0].Resolution) - } - - jobMetric, ok := jobData[metric][scope] - if !ok { - jobMetric = &schema.JobMetric{ - Unit: mc.Unit, - Timestep: res, - Series: make([]schema.Series, 0), - } - jobData[metric][scope] = jobMetric - } - - for ndx, res := range row { - if res.Error != nil { - /* Build list for "partial errors", if any */ - errors = append(errors, fmt.Sprintf("failed to fetch '%s' from host '%s': %s", query.Metric, query.Hostname, *res.Error)) - continue - } - - id := (*string)(nil) - if query.Type != nil { - id = new(string) - *id = query.TypeIds[ndx] - } - - if res.Avg.IsNaN() || 
res.Min.IsNaN() || res.Max.IsNaN() { - // "schema.Float()" because regular float64 can not be JSONed when NaN. - res.Avg = schema.Float(0) - res.Min = schema.Float(0) - res.Max = schema.Float(0) - } - - jobMetric.Series = append(jobMetric.Series, schema.Series{ - Hostname: query.Hostname, - Id: id, - Statistics: schema.MetricStatistics{ - Avg: float64(res.Avg), - Min: float64(res.Min), - Max: float64(res.Max), - }, - Data: res.Data, - }) - } - - // So that one can later check len(jobData): - if len(jobMetric.Series) == 0 { - delete(jobData[metric], scope) - if len(jobData[metric]) == 0 { - delete(jobData, metric) - } - } - } - - if len(errors) != 0 { - /* Returns list for "partial errors" */ - return jobData, fmt.Errorf("METRICDATA/CCMS > Errors: %s", strings.Join(errors, ", ")) - } - return jobData, nil -} - -var ( - hwthreadString = string(schema.MetricScopeHWThread) - coreString = string(schema.MetricScopeCore) - memoryDomainString = string(schema.MetricScopeMemoryDomain) - socketString = string(schema.MetricScopeSocket) - acceleratorString = string(schema.MetricScopeAccelerator) -) - -func (ccms *CCMetricStoreInternal) buildQueries( - job *schema.Job, - metrics []string, - scopes []schema.MetricScope, - resolution int64, -) ([]memorystore.APIQuery, []schema.MetricScope, error) { - queries := make([]memorystore.APIQuery, 0, len(metrics)*len(scopes)*len(job.Resources)) - assignedScope := []schema.MetricScope{} - - subcluster, scerr := archive.GetSubCluster(job.Cluster, job.SubCluster) - if scerr != nil { - return nil, nil, scerr - } - topology := subcluster.Topology - - for _, metric := range metrics { - mc := archive.GetMetricConfig(job.Cluster, metric) - if mc == nil { - // return nil, fmt.Errorf("METRICDATA/CCMS > metric '%s' is not specified for cluster '%s'", metric, job.Cluster) - cclog.Infof("metric '%s' is not specified for cluster '%s'", metric, job.Cluster) - continue - } - - // Skip if metric is removed for subcluster - if len(mc.SubClusters) != 0 { - 
isRemoved := false - for _, scConfig := range mc.SubClusters { - if scConfig.Name == job.SubCluster && scConfig.Remove { - isRemoved = true - break - } - } - if isRemoved { - continue - } - } - - // Avoid duplicates... - handledScopes := make([]schema.MetricScope, 0, 3) - - scopesLoop: - for _, requestedScope := range scopes { - nativeScope := mc.Scope - if nativeScope == schema.MetricScopeAccelerator && job.NumAcc == 0 { - continue - } - - scope := nativeScope.Max(requestedScope) - for _, s := range handledScopes { - if scope == s { - continue scopesLoop - } - } - handledScopes = append(handledScopes, scope) - - for _, host := range job.Resources { - hwthreads := host.HWThreads - if hwthreads == nil { - hwthreads = topology.Node - } - - // Accelerator -> Accelerator (Use "accelerator" scope if requested scope is lower than node) - if nativeScope == schema.MetricScopeAccelerator && scope.LT(schema.MetricScopeNode) { - if scope != schema.MetricScopeAccelerator { - // Skip all other catched cases - continue - } - - queries = append(queries, memorystore.APIQuery{ - Metric: metric, - Hostname: host.Hostname, - Aggregate: false, - Type: &acceleratorString, - TypeIds: host.Accelerators, - Resolution: resolution, - }) - assignedScope = append(assignedScope, schema.MetricScopeAccelerator) - continue - } - - // Accelerator -> Node - if nativeScope == schema.MetricScopeAccelerator && scope == schema.MetricScopeNode { - if len(host.Accelerators) == 0 { - continue - } - - queries = append(queries, memorystore.APIQuery{ - Metric: metric, - Hostname: host.Hostname, - Aggregate: true, - Type: &acceleratorString, - TypeIds: host.Accelerators, - Resolution: resolution, - }) - assignedScope = append(assignedScope, scope) - continue - } - - // HWThread -> HWThead - if nativeScope == schema.MetricScopeHWThread && scope == schema.MetricScopeHWThread { - queries = append(queries, memorystore.APIQuery{ - Metric: metric, - Hostname: host.Hostname, - Aggregate: false, - Type: 
&hwthreadString, - TypeIds: intToStringSlice(hwthreads), - Resolution: resolution, - }) - assignedScope = append(assignedScope, scope) - continue - } - - // HWThread -> Core - if nativeScope == schema.MetricScopeHWThread && scope == schema.MetricScopeCore { - cores, _ := topology.GetCoresFromHWThreads(hwthreads) - for _, core := range cores { - queries = append(queries, memorystore.APIQuery{ - Metric: metric, - Hostname: host.Hostname, - Aggregate: true, - Type: &hwthreadString, - TypeIds: intToStringSlice(topology.Core[core]), - Resolution: resolution, - }) - assignedScope = append(assignedScope, scope) - } - continue - } - - // HWThread -> Socket - if nativeScope == schema.MetricScopeHWThread && scope == schema.MetricScopeSocket { - sockets, _ := topology.GetSocketsFromHWThreads(hwthreads) - for _, socket := range sockets { - queries = append(queries, memorystore.APIQuery{ - Metric: metric, - Hostname: host.Hostname, - Aggregate: true, - Type: &hwthreadString, - TypeIds: intToStringSlice(topology.Socket[socket]), - Resolution: resolution, - }) - assignedScope = append(assignedScope, scope) - } - continue - } - - // HWThread -> Node - if nativeScope == schema.MetricScopeHWThread && scope == schema.MetricScopeNode { - queries = append(queries, memorystore.APIQuery{ - Metric: metric, - Hostname: host.Hostname, - Aggregate: true, - Type: &hwthreadString, - TypeIds: intToStringSlice(hwthreads), - Resolution: resolution, - }) - assignedScope = append(assignedScope, scope) - continue - } - - // Core -> Core - if nativeScope == schema.MetricScopeCore && scope == schema.MetricScopeCore { - cores, _ := topology.GetCoresFromHWThreads(hwthreads) - queries = append(queries, memorystore.APIQuery{ - Metric: metric, - Hostname: host.Hostname, - Aggregate: false, - Type: &coreString, - TypeIds: intToStringSlice(cores), - Resolution: resolution, - }) - assignedScope = append(assignedScope, scope) - continue - } - - // Core -> Socket - if nativeScope == schema.MetricScopeCore && 
scope == schema.MetricScopeSocket { - sockets, _ := topology.GetSocketsFromCores(hwthreads) - for _, socket := range sockets { - queries = append(queries, memorystore.APIQuery{ - Metric: metric, - Hostname: host.Hostname, - Aggregate: true, - Type: &coreString, - TypeIds: intToStringSlice(topology.Socket[socket]), - Resolution: resolution, - }) - assignedScope = append(assignedScope, scope) - } - continue - } - - // Core -> Node - if nativeScope == schema.MetricScopeCore && scope == schema.MetricScopeNode { - cores, _ := topology.GetCoresFromHWThreads(hwthreads) - queries = append(queries, memorystore.APIQuery{ - Metric: metric, - Hostname: host.Hostname, - Aggregate: true, - Type: &coreString, - TypeIds: intToStringSlice(cores), - Resolution: resolution, - }) - assignedScope = append(assignedScope, scope) - continue - } - - // MemoryDomain -> MemoryDomain - if nativeScope == schema.MetricScopeMemoryDomain && scope == schema.MetricScopeMemoryDomain { - sockets, _ := topology.GetMemoryDomainsFromHWThreads(hwthreads) - queries = append(queries, memorystore.APIQuery{ - Metric: metric, - Hostname: host.Hostname, - Aggregate: false, - Type: &memoryDomainString, - TypeIds: intToStringSlice(sockets), - Resolution: resolution, - }) - assignedScope = append(assignedScope, scope) - continue - } - - // MemoryDoman -> Node - if nativeScope == schema.MetricScopeMemoryDomain && scope == schema.MetricScopeNode { - sockets, _ := topology.GetMemoryDomainsFromHWThreads(hwthreads) - queries = append(queries, memorystore.APIQuery{ - Metric: metric, - Hostname: host.Hostname, - Aggregate: true, - Type: &memoryDomainString, - TypeIds: intToStringSlice(sockets), - Resolution: resolution, - }) - assignedScope = append(assignedScope, scope) - continue - } - - // Socket -> Socket - if nativeScope == schema.MetricScopeSocket && scope == schema.MetricScopeSocket { - sockets, _ := topology.GetSocketsFromHWThreads(hwthreads) - queries = append(queries, memorystore.APIQuery{ - Metric: metric, - 
Hostname: host.Hostname, - Aggregate: false, - Type: &socketString, - TypeIds: intToStringSlice(sockets), - Resolution: resolution, - }) - assignedScope = append(assignedScope, scope) - continue - } - - // Socket -> Node - if nativeScope == schema.MetricScopeSocket && scope == schema.MetricScopeNode { - sockets, _ := topology.GetSocketsFromHWThreads(hwthreads) - queries = append(queries, memorystore.APIQuery{ - Metric: metric, - Hostname: host.Hostname, - Aggregate: true, - Type: &socketString, - TypeIds: intToStringSlice(sockets), - Resolution: resolution, - }) - assignedScope = append(assignedScope, scope) - continue - } - - // Node -> Node - if nativeScope == schema.MetricScopeNode && scope == schema.MetricScopeNode { - queries = append(queries, memorystore.APIQuery{ - Metric: metric, - Hostname: host.Hostname, - Resolution: resolution, - }) - assignedScope = append(assignedScope, scope) - continue - } - - return nil, nil, fmt.Errorf("METRICDATA/CCMS > TODO: unhandled case: native-scope=%s, requested-scope=%s", nativeScope, requestedScope) - } - } - } - - return queries, assignedScope, nil -} - -func (ccms *CCMetricStoreInternal) LoadStats( - job *schema.Job, - metrics []string, - ctx context.Context, -) (map[string]map[string]schema.MetricStatistics, error) { - queries, _, err := ccms.buildQueries(job, metrics, []schema.MetricScope{schema.MetricScopeNode}, 0) // #166 Add scope shere for analysis view accelerator normalization? 
- if err != nil { - cclog.Errorf("Error while building queries for jobId %d, Metrics %v: %s", job.JobID, metrics, err.Error()) - return nil, err - } - - req := memorystore.APIQueryRequest{ - Cluster: job.Cluster, - From: job.StartTime, - To: job.StartTime + int64(job.Duration), - Queries: queries, - WithStats: true, - WithData: false, - } - - resBody, err := memorystore.FetchData(req) - if err != nil { - cclog.Errorf("Error while fetching data : %s", err.Error()) - return nil, err - } - - stats := make(map[string]map[string]schema.MetricStatistics, len(metrics)) - for i, res := range resBody.Results { - query := req.Queries[i] - metric := query.Metric - data := res[0] - if data.Error != nil { - cclog.Errorf("fetching %s for node %s failed: %s", metric, query.Hostname, *data.Error) - continue - } - - metricdata, ok := stats[metric] - if !ok { - metricdata = make(map[string]schema.MetricStatistics, job.NumNodes) - stats[metric] = metricdata - } - - if data.Avg.IsNaN() || data.Min.IsNaN() || data.Max.IsNaN() { - cclog.Warnf("fetching %s for node %s failed: one of avg/min/max is NaN", metric, query.Hostname) - continue - } - - metricdata[query.Hostname] = schema.MetricStatistics{ - Avg: float64(data.Avg), - Min: float64(data.Min), - Max: float64(data.Max), - } - } - - return stats, nil -} - -// Used for Job-View Statistics Table -func (ccms *CCMetricStoreInternal) LoadScopedStats( - job *schema.Job, - metrics []string, - scopes []schema.MetricScope, - ctx context.Context, -) (schema.ScopedJobStats, error) { - queries, assignedScope, err := ccms.buildQueries(job, metrics, scopes, 0) - if err != nil { - cclog.Errorf("Error while building queries for jobId %d, Metrics %v, Scopes %v: %s", job.JobID, metrics, scopes, err.Error()) - return nil, err - } - - req := memorystore.APIQueryRequest{ - Cluster: job.Cluster, - From: job.StartTime, - To: job.StartTime + int64(job.Duration), - Queries: queries, - WithStats: true, - WithData: false, - } - - resBody, err := 
memorystore.FetchData(req) - if err != nil { - cclog.Errorf("Error while fetching data : %s", err.Error()) - return nil, err - } - - var errors []string - scopedJobStats := make(schema.ScopedJobStats) - - for i, row := range resBody.Results { - query := req.Queries[i] - metric := query.Metric - scope := assignedScope[i] - - if _, ok := scopedJobStats[metric]; !ok { - scopedJobStats[metric] = make(map[schema.MetricScope][]*schema.ScopedStats) - } - - if _, ok := scopedJobStats[metric][scope]; !ok { - scopedJobStats[metric][scope] = make([]*schema.ScopedStats, 0) - } - - for ndx, res := range row { - if res.Error != nil { - /* Build list for "partial errors", if any */ - errors = append(errors, fmt.Sprintf("failed to fetch '%s' from host '%s': %s", query.Metric, query.Hostname, *res.Error)) - continue - } - - id := (*string)(nil) - if query.Type != nil { - id = new(string) - *id = query.TypeIds[ndx] - } - - if res.Avg.IsNaN() || res.Min.IsNaN() || res.Max.IsNaN() { - // "schema.Float()" because regular float64 can not be JSONed when NaN. 
- res.Avg = schema.Float(0) - res.Min = schema.Float(0) - res.Max = schema.Float(0) - } - - scopedJobStats[metric][scope] = append(scopedJobStats[metric][scope], &schema.ScopedStats{ - Hostname: query.Hostname, - Id: id, - Data: &schema.MetricStatistics{ - Avg: float64(res.Avg), - Min: float64(res.Min), - Max: float64(res.Max), - }, - }) - } - - // So that one can later check len(scopedJobStats[metric][scope]): Remove from map if empty - if len(scopedJobStats[metric][scope]) == 0 { - delete(scopedJobStats[metric], scope) - if len(scopedJobStats[metric]) == 0 { - delete(scopedJobStats, metric) - } - } - } - - if len(errors) != 0 { - /* Returns list for "partial errors" */ - return scopedJobStats, fmt.Errorf("METRICDATA/CCMS > Errors: %s", strings.Join(errors, ", ")) - } - return scopedJobStats, nil -} - -// Used for Systems-View Node-Overview -func (ccms *CCMetricStoreInternal) LoadNodeData( - cluster string, - metrics, nodes []string, - scopes []schema.MetricScope, - from, to time.Time, - ctx context.Context, -) (map[string]map[string][]*schema.JobMetric, error) { - req := memorystore.APIQueryRequest{ - Cluster: cluster, - From: from.Unix(), - To: to.Unix(), - WithStats: true, - WithData: true, - } - - if nodes == nil { - req.ForAllNodes = append(req.ForAllNodes, metrics...) 
- } else { - for _, node := range nodes { - for _, metric := range metrics { - req.Queries = append(req.Queries, memorystore.APIQuery{ - Hostname: node, - Metric: metric, - Resolution: 0, // Default for Node Queries: Will return metric $Timestep Resolution - }) - } - } - } - - resBody, err := memorystore.FetchData(req) - if err != nil { - cclog.Errorf("Error while fetching data : %s", err.Error()) - return nil, err - } - - var errors []string - data := make(map[string]map[string][]*schema.JobMetric) - for i, res := range resBody.Results { - var query memorystore.APIQuery - if resBody.Queries != nil { - query = resBody.Queries[i] - } else { - query = req.Queries[i] - } - - metric := query.Metric - qdata := res[0] - if qdata.Error != nil { - /* Build list for "partial errors", if any */ - errors = append(errors, fmt.Sprintf("fetching %s for node %s failed: %s", metric, query.Hostname, *qdata.Error)) - } - - if qdata.Avg.IsNaN() || qdata.Min.IsNaN() || qdata.Max.IsNaN() { - // return nil, fmt.Errorf("METRICDATA/CCMS > fetching %s for node %s failed: %s", metric, query.Hostname, "avg/min/max is NaN") - qdata.Avg, qdata.Min, qdata.Max = 0., 0., 0. 
- } - - hostdata, ok := data[query.Hostname] - if !ok { - hostdata = make(map[string][]*schema.JobMetric) - data[query.Hostname] = hostdata - } - - mc := archive.GetMetricConfig(cluster, metric) - hostdata[metric] = append(hostdata[metric], &schema.JobMetric{ - Unit: mc.Unit, - Timestep: mc.Timestep, - Series: []schema.Series{ - { - Hostname: query.Hostname, - Data: qdata.Data, - Statistics: schema.MetricStatistics{ - Avg: float64(qdata.Avg), - Min: float64(qdata.Min), - Max: float64(qdata.Max), - }, - }, - }, - }) - } - - if len(errors) != 0 { - /* Returns list of "partial errors" */ - return data, fmt.Errorf("METRICDATA/CCMS > Errors: %s", strings.Join(errors, ", ")) - } - - return data, nil -} - -// Used for Systems-View Node-List -func (ccms *CCMetricStoreInternal) LoadNodeListData( - cluster, subCluster string, - nodes []string, - metrics []string, - scopes []schema.MetricScope, - resolution int, - from, to time.Time, - ctx context.Context, -) (map[string]schema.JobData, error) { - - // Note: Order of node data is not guaranteed after this point - queries, assignedScope, err := ccms.buildNodeQueries(cluster, subCluster, nodes, metrics, scopes, int64(resolution)) - if err != nil { - cclog.Errorf("Error while building node queries for Cluster %s, SubCLuster %s, Metrics %v, Scopes %v: %s", cluster, subCluster, metrics, scopes, err.Error()) - return nil, err - } - - req := memorystore.APIQueryRequest{ - Cluster: cluster, - Queries: queries, - From: from.Unix(), - To: to.Unix(), - WithStats: true, - WithData: true, - } - - resBody, err := memorystore.FetchData(req) - if err != nil { - cclog.Errorf("Error while fetching data : %s", err.Error()) - return nil, err - } - - var errors []string - data := make(map[string]schema.JobData) - for i, row := range resBody.Results { - var query memorystore.APIQuery - if resBody.Queries != nil { - query = resBody.Queries[i] - } else { - query = req.Queries[i] - } - // qdata := res[0] - metric := query.Metric - scope := 
assignedScope[i] - mc := archive.GetMetricConfig(cluster, metric) - - res := mc.Timestep - if len(row) > 0 { - res = int(row[0].Resolution) - } - - // Init Nested Map Data Structures If Not Found - hostData, ok := data[query.Hostname] - if !ok { - hostData = make(schema.JobData) - data[query.Hostname] = hostData - } - - metricData, ok := hostData[metric] - if !ok { - metricData = make(map[schema.MetricScope]*schema.JobMetric) - data[query.Hostname][metric] = metricData - } - - scopeData, ok := metricData[scope] - if !ok { - scopeData = &schema.JobMetric{ - Unit: mc.Unit, - Timestep: res, - Series: make([]schema.Series, 0), - } - data[query.Hostname][metric][scope] = scopeData - } - - for ndx, res := range row { - if res.Error != nil { - /* Build list for "partial errors", if any */ - errors = append(errors, fmt.Sprintf("failed to fetch '%s' from host '%s': %s", query.Metric, query.Hostname, *res.Error)) - continue - } - - id := (*string)(nil) - if query.Type != nil { - id = new(string) - *id = query.TypeIds[ndx] - } - - if res.Avg.IsNaN() || res.Min.IsNaN() || res.Max.IsNaN() { - // "schema.Float()" because regular float64 can not be JSONed when NaN. 
- res.Avg = schema.Float(0) - res.Min = schema.Float(0) - res.Max = schema.Float(0) - } - - scopeData.Series = append(scopeData.Series, schema.Series{ - Hostname: query.Hostname, - Id: id, - Statistics: schema.MetricStatistics{ - Avg: float64(res.Avg), - Min: float64(res.Min), - Max: float64(res.Max), - }, - Data: res.Data, - }) - } - } - - if len(errors) != 0 { - /* Returns list of "partial errors" */ - return data, fmt.Errorf("METRICDATA/CCMS > Errors: %s", strings.Join(errors, ", ")) - } - - return data, nil -} - -func (ccms *CCMetricStoreInternal) buildNodeQueries( - cluster string, - subCluster string, - nodes []string, - metrics []string, - scopes []schema.MetricScope, - resolution int64, -) ([]memorystore.APIQuery, []schema.MetricScope, error) { - queries := make([]memorystore.APIQuery, 0, len(metrics)*len(scopes)*len(nodes)) - assignedScope := []schema.MetricScope{} - - // Get Topol before loop if subCluster given - var subClusterTopol *schema.SubCluster - var scterr error - if subCluster != "" { - subClusterTopol, scterr = archive.GetSubCluster(cluster, subCluster) - if scterr != nil { - cclog.Errorf("could not load cluster %s subCluster %s topology: %s", cluster, subCluster, scterr.Error()) - return nil, nil, scterr - } - } - - for _, metric := range metrics { - metric := metric - mc := archive.GetMetricConfig(cluster, metric) - if mc == nil { - // return nil, fmt.Errorf("METRICDATA/CCMS > metric '%s' is not specified for cluster '%s'", metric, cluster) - cclog.Warnf("metric '%s' is not specified for cluster '%s'", metric, cluster) - continue - } - - // Skip if metric is removed for subcluster - if mc.SubClusters != nil { - isRemoved := false - for _, scConfig := range mc.SubClusters { - if scConfig.Name == subCluster && scConfig.Remove { - isRemoved = true - break - } - } - if isRemoved { - continue - } - } - - // Avoid duplicates... 
- handledScopes := make([]schema.MetricScope, 0, 3) - - scopesLoop: - for _, requestedScope := range scopes { - nativeScope := mc.Scope - - scope := nativeScope.Max(requestedScope) - for _, s := range handledScopes { - if scope == s { - continue scopesLoop - } - } - handledScopes = append(handledScopes, scope) - - for _, hostname := range nodes { - - // If no subCluster given, get it by node - if subCluster == "" { - subClusterName, scnerr := archive.GetSubClusterByNode(cluster, hostname) - if scnerr != nil { - return nil, nil, scnerr - } - subClusterTopol, scterr = archive.GetSubCluster(cluster, subClusterName) - if scterr != nil { - return nil, nil, scterr - } - } - - // Always full node hwthread id list, no partial queries expected -> Use "topology.Node" directly where applicable - // Always full accelerator id list, no partial queries expected -> Use "acceleratorIds" directly where applicable - topology := subClusterTopol.Topology - acceleratorIds := topology.GetAcceleratorIDs() - - // Moved check here if metric matches hardware specs - if nativeScope == schema.MetricScopeAccelerator && len(acceleratorIds) == 0 { - continue scopesLoop - } - - // Accelerator -> Accelerator (Use "accelerator" scope if requested scope is lower than node) - if nativeScope == schema.MetricScopeAccelerator && scope.LT(schema.MetricScopeNode) { - if scope != schema.MetricScopeAccelerator { - // Skip all other catched cases - continue - } - - queries = append(queries, memorystore.APIQuery{ - Metric: metric, - Hostname: hostname, - Aggregate: false, - Type: &acceleratorString, - TypeIds: acceleratorIds, - Resolution: resolution, - }) - assignedScope = append(assignedScope, schema.MetricScopeAccelerator) - continue - } - - // Accelerator -> Node - if nativeScope == schema.MetricScopeAccelerator && scope == schema.MetricScopeNode { - if len(acceleratorIds) == 0 { - continue - } - - queries = append(queries, memorystore.APIQuery{ - Metric: metric, - Hostname: hostname, - Aggregate: true, - 
Type: &acceleratorString, - TypeIds: acceleratorIds, - Resolution: resolution, - }) - assignedScope = append(assignedScope, scope) - continue - } - - // HWThread -> HWThead - if nativeScope == schema.MetricScopeHWThread && scope == schema.MetricScopeHWThread { - queries = append(queries, memorystore.APIQuery{ - Metric: metric, - Hostname: hostname, - Aggregate: false, - Type: &hwthreadString, - TypeIds: intToStringSlice(topology.Node), - Resolution: resolution, - }) - assignedScope = append(assignedScope, scope) - continue - } - - // HWThread -> Core - if nativeScope == schema.MetricScopeHWThread && scope == schema.MetricScopeCore { - cores, _ := topology.GetCoresFromHWThreads(topology.Node) - for _, core := range cores { - queries = append(queries, memorystore.APIQuery{ - Metric: metric, - Hostname: hostname, - Aggregate: true, - Type: &hwthreadString, - TypeIds: intToStringSlice(topology.Core[core]), - Resolution: resolution, - }) - assignedScope = append(assignedScope, scope) - } - continue - } - - // HWThread -> Socket - if nativeScope == schema.MetricScopeHWThread && scope == schema.MetricScopeSocket { - sockets, _ := topology.GetSocketsFromHWThreads(topology.Node) - for _, socket := range sockets { - queries = append(queries, memorystore.APIQuery{ - Metric: metric, - Hostname: hostname, - Aggregate: true, - Type: &hwthreadString, - TypeIds: intToStringSlice(topology.Socket[socket]), - Resolution: resolution, - }) - assignedScope = append(assignedScope, scope) - } - continue - } - - // HWThread -> Node - if nativeScope == schema.MetricScopeHWThread && scope == schema.MetricScopeNode { - queries = append(queries, memorystore.APIQuery{ - Metric: metric, - Hostname: hostname, - Aggregate: true, - Type: &hwthreadString, - TypeIds: intToStringSlice(topology.Node), - Resolution: resolution, - }) - assignedScope = append(assignedScope, scope) - continue - } - - // Core -> Core - if nativeScope == schema.MetricScopeCore && scope == schema.MetricScopeCore { - cores, _ 
:= topology.GetCoresFromHWThreads(topology.Node) - queries = append(queries, memorystore.APIQuery{ - Metric: metric, - Hostname: hostname, - Aggregate: false, - Type: &coreString, - TypeIds: intToStringSlice(cores), - Resolution: resolution, - }) - assignedScope = append(assignedScope, scope) - continue - } - - // Core -> Socket - if nativeScope == schema.MetricScopeCore && scope == schema.MetricScopeSocket { - sockets, _ := topology.GetSocketsFromCores(topology.Node) - for _, socket := range sockets { - queries = append(queries, memorystore.APIQuery{ - Metric: metric, - Hostname: hostname, - Aggregate: true, - Type: &coreString, - TypeIds: intToStringSlice(topology.Socket[socket]), - Resolution: resolution, - }) - assignedScope = append(assignedScope, scope) - } - continue - } - - // Core -> Node - if nativeScope == schema.MetricScopeCore && scope == schema.MetricScopeNode { - cores, _ := topology.GetCoresFromHWThreads(topology.Node) - queries = append(queries, memorystore.APIQuery{ - Metric: metric, - Hostname: hostname, - Aggregate: true, - Type: &coreString, - TypeIds: intToStringSlice(cores), - Resolution: resolution, - }) - assignedScope = append(assignedScope, scope) - continue - } - - // MemoryDomain -> MemoryDomain - if nativeScope == schema.MetricScopeMemoryDomain && scope == schema.MetricScopeMemoryDomain { - sockets, _ := topology.GetMemoryDomainsFromHWThreads(topology.Node) - queries = append(queries, memorystore.APIQuery{ - Metric: metric, - Hostname: hostname, - Aggregate: false, - Type: &memoryDomainString, - TypeIds: intToStringSlice(sockets), - Resolution: resolution, - }) - assignedScope = append(assignedScope, scope) - continue - } - - // MemoryDoman -> Node - if nativeScope == schema.MetricScopeMemoryDomain && scope == schema.MetricScopeNode { - sockets, _ := topology.GetMemoryDomainsFromHWThreads(topology.Node) - queries = append(queries, memorystore.APIQuery{ - Metric: metric, - Hostname: hostname, - Aggregate: true, - Type: 
&memoryDomainString, - TypeIds: intToStringSlice(sockets), - Resolution: resolution, - }) - assignedScope = append(assignedScope, scope) - continue - } - - // Socket -> Socket - if nativeScope == schema.MetricScopeSocket && scope == schema.MetricScopeSocket { - sockets, _ := topology.GetSocketsFromHWThreads(topology.Node) - queries = append(queries, memorystore.APIQuery{ - Metric: metric, - Hostname: hostname, - Aggregate: false, - Type: &socketString, - TypeIds: intToStringSlice(sockets), - Resolution: resolution, - }) - assignedScope = append(assignedScope, scope) - continue - } - - // Socket -> Node - if nativeScope == schema.MetricScopeSocket && scope == schema.MetricScopeNode { - sockets, _ := topology.GetSocketsFromHWThreads(topology.Node) - queries = append(queries, memorystore.APIQuery{ - Metric: metric, - Hostname: hostname, - Aggregate: true, - Type: &socketString, - TypeIds: intToStringSlice(sockets), - Resolution: resolution, - }) - assignedScope = append(assignedScope, scope) - continue - } - - // Node -> Node - if nativeScope == schema.MetricScopeNode && scope == schema.MetricScopeNode { - queries = append(queries, memorystore.APIQuery{ - Metric: metric, - Hostname: hostname, - Resolution: resolution, - }) - assignedScope = append(assignedScope, scope) - continue - } - - return nil, nil, fmt.Errorf("METRICDATA/CCMS > TODO: unhandled case: native-scope=%s, requested-scope=%s", nativeScope, requestedScope) - } - } - } - - return queries, assignedScope, nil -} - -func intToStringSlice(is []int) []string { - ss := make([]string, len(is)) - for i, x := range is { - ss[i] = strconv.Itoa(x) - } - return ss -} diff --git a/internal/metricdata/cc-metric-store.go b/internal/metricdata/cc-metric-store.go deleted file mode 100644 index 6d446d17..00000000 --- a/internal/metricdata/cc-metric-store.go +++ /dev/null @@ -1,1222 +0,0 @@ -// Copyright (C) NHR@FAU, University Erlangen-Nuremberg. -// All rights reserved. 
-// Use of this source code is governed by a MIT-style -// license that can be found in the LICENSE file. -package metricdata - -import ( - "bufio" - "bytes" - "context" - "encoding/json" - "fmt" - "net/http" - "strings" - "time" - - "github.com/ClusterCockpit/cc-backend/pkg/archive" - cclog "github.com/ClusterCockpit/cc-lib/ccLogger" - "github.com/ClusterCockpit/cc-lib/schema" -) - -type CCMetricStoreConfig struct { - Kind string `json:"kind"` - Url string `json:"url"` - Token string `json:"token"` - - // If metrics are known to this MetricDataRepository under a different - // name than in the `metricConfig` section of the 'cluster.json', - // provide this optional mapping of local to remote name for this metric. - Renamings map[string]string `json:"metricRenamings"` -} - -type CCMetricStore struct { - here2there map[string]string - there2here map[string]string - client http.Client - jwt string - url string - queryEndpoint string -} - -type ApiQueryRequest struct { - Cluster string `json:"cluster"` - Queries []ApiQuery `json:"queries"` - ForAllNodes []string `json:"for-all-nodes"` - From int64 `json:"from"` - To int64 `json:"to"` - WithStats bool `json:"with-stats"` - WithData bool `json:"with-data"` -} - -type ApiQuery struct { - Type *string `json:"type,omitempty"` - SubType *string `json:"subtype,omitempty"` - Metric string `json:"metric"` - Hostname string `json:"host"` - Resolution int `json:"resolution"` - TypeIds []string `json:"type-ids,omitempty"` - SubTypeIds []string `json:"subtype-ids,omitempty"` - Aggregate bool `json:"aggreg"` -} - -type ApiQueryResponse struct { - Queries []ApiQuery `json:"queries,omitempty"` - Results [][]ApiMetricData `json:"results"` -} - -type ApiMetricData struct { - Error *string `json:"error"` - Data []schema.Float `json:"data"` - From int64 `json:"from"` - To int64 `json:"to"` - Resolution int `json:"resolution"` - Avg schema.Float `json:"avg"` - Min schema.Float `json:"min"` - Max schema.Float `json:"max"` -} - -func (ccms 
*CCMetricStore) Init(rawConfig json.RawMessage) error { - var config CCMetricStoreConfig - if err := json.Unmarshal(rawConfig, &config); err != nil { - cclog.Warn("Error while unmarshaling raw json config") - return err - } - - ccms.url = config.Url - ccms.queryEndpoint = fmt.Sprintf("%s/api/query", config.Url) - ccms.jwt = config.Token - ccms.client = http.Client{ - Timeout: 10 * time.Second, - } - - if config.Renamings != nil { - ccms.here2there = config.Renamings - ccms.there2here = make(map[string]string, len(config.Renamings)) - for k, v := range ccms.here2there { - ccms.there2here[v] = k - } - } else { - ccms.here2there = make(map[string]string) - ccms.there2here = make(map[string]string) - } - - return nil -} - -func (ccms *CCMetricStore) toRemoteName(metric string) string { - if renamed, ok := ccms.here2there[metric]; ok { - return renamed - } - - return metric -} - -func (ccms *CCMetricStore) toLocalName(metric string) string { - if renamed, ok := ccms.there2here[metric]; ok { - return renamed - } - - return metric -} - -func (ccms *CCMetricStore) doRequest( - ctx context.Context, - body *ApiQueryRequest, -) (*ApiQueryResponse, error) { - buf := &bytes.Buffer{} - if err := json.NewEncoder(buf).Encode(body); err != nil { - cclog.Errorf("Error while encoding request body: %s", err.Error()) - return nil, err - } - - req, err := http.NewRequestWithContext(ctx, http.MethodGet, ccms.queryEndpoint, buf) - if err != nil { - cclog.Errorf("Error while building request body: %s", err.Error()) - return nil, err - } - if ccms.jwt != "" { - req.Header.Add("Authorization", fmt.Sprintf("Bearer %s", ccms.jwt)) - } - - // versioning the cc-metric-store query API. 
- // v2 = data with resampling - // v1 = data without resampling - q := req.URL.Query() - q.Add("version", "v2") - req.URL.RawQuery = q.Encode() - - res, err := ccms.client.Do(req) - if err != nil { - cclog.Errorf("Error while performing request: %s", err.Error()) - return nil, err - } - - if res.StatusCode != http.StatusOK { - return nil, fmt.Errorf("'%s': HTTP Status: %s", ccms.queryEndpoint, res.Status) - } - - var resBody ApiQueryResponse - if err := json.NewDecoder(bufio.NewReader(res.Body)).Decode(&resBody); err != nil { - cclog.Errorf("Error while decoding result body: %s", err.Error()) - return nil, err - } - - return &resBody, nil -} - -func (ccms *CCMetricStore) LoadData( - job *schema.Job, - metrics []string, - scopes []schema.MetricScope, - ctx context.Context, - resolution int, -) (schema.JobData, error) { - queries, assignedScope, err := ccms.buildQueries(job, metrics, scopes, resolution) - if err != nil { - cclog.Errorf("Error while building queries for jobId %d, Metrics %v, Scopes %v: %s", job.JobID, metrics, scopes, err.Error()) - return nil, err - } - - req := ApiQueryRequest{ - Cluster: job.Cluster, - From: job.StartTime, - To: job.StartTime + int64(job.Duration), - Queries: queries, - WithStats: true, - WithData: true, - } - - resBody, err := ccms.doRequest(ctx, &req) - if err != nil { - cclog.Errorf("Error while performing request: %s", err.Error()) - return nil, err - } - - var errors []string - jobData := make(schema.JobData) - for i, row := range resBody.Results { - query := req.Queries[i] - metric := ccms.toLocalName(query.Metric) - scope := assignedScope[i] - mc := archive.GetMetricConfig(job.Cluster, metric) - if _, ok := jobData[metric]; !ok { - jobData[metric] = make(map[schema.MetricScope]*schema.JobMetric) - } - - res := mc.Timestep - if len(row) > 0 { - res = row[0].Resolution - } - - jobMetric, ok := jobData[metric][scope] - if !ok { - jobMetric = &schema.JobMetric{ - Unit: mc.Unit, - Timestep: res, - Series: make([]schema.Series, 
0), - } - jobData[metric][scope] = jobMetric - } - - for ndx, res := range row { - if res.Error != nil { - /* Build list for "partial errors", if any */ - errors = append(errors, fmt.Sprintf("failed to fetch '%s' from host '%s': %s", query.Metric, query.Hostname, *res.Error)) - continue - } - - id := (*string)(nil) - if query.Type != nil { - id = new(string) - *id = query.TypeIds[ndx] - } - - if res.Avg.IsNaN() || res.Min.IsNaN() || res.Max.IsNaN() { - // "schema.Float()" because regular float64 can not be JSONed when NaN. - res.Avg = schema.Float(0) - res.Min = schema.Float(0) - res.Max = schema.Float(0) - } - - jobMetric.Series = append(jobMetric.Series, schema.Series{ - Hostname: query.Hostname, - Id: id, - Statistics: schema.MetricStatistics{ - Avg: float64(res.Avg), - Min: float64(res.Min), - Max: float64(res.Max), - }, - Data: res.Data, - }) - } - - // So that one can later check len(jobData): - if len(jobMetric.Series) == 0 { - delete(jobData[metric], scope) - if len(jobData[metric]) == 0 { - delete(jobData, metric) - } - } - } - - if len(errors) != 0 { - /* Returns list for "partial errors" */ - return jobData, fmt.Errorf("METRICDATA/CCMS > Errors: %s", strings.Join(errors, ", ")) - } - return jobData, nil -} - -func (ccms *CCMetricStore) buildQueries( - job *schema.Job, - metrics []string, - scopes []schema.MetricScope, - resolution int, -) ([]ApiQuery, []schema.MetricScope, error) { - queries := make([]ApiQuery, 0, len(metrics)*len(scopes)*len(job.Resources)) - assignedScope := []schema.MetricScope{} - - subcluster, scerr := archive.GetSubCluster(job.Cluster, job.SubCluster) - if scerr != nil { - return nil, nil, scerr - } - topology := subcluster.Topology - - for _, metric := range metrics { - remoteName := ccms.toRemoteName(metric) - mc := archive.GetMetricConfig(job.Cluster, metric) - if mc == nil { - // return nil, fmt.Errorf("METRICDATA/CCMS > metric '%s' is not specified for cluster '%s'", metric, job.Cluster) - cclog.Infof("metric '%s' is not 
specified for cluster '%s'", metric, job.Cluster) - continue - } - - // Skip if metric is removed for subcluster - if len(mc.SubClusters) != 0 { - isRemoved := false - for _, scConfig := range mc.SubClusters { - if scConfig.Name == job.SubCluster && scConfig.Remove { - isRemoved = true - break - } - } - if isRemoved { - continue - } - } - - // Avoid duplicates... - handledScopes := make([]schema.MetricScope, 0, 3) - - scopesLoop: - for _, requestedScope := range scopes { - nativeScope := mc.Scope - if nativeScope == schema.MetricScopeAccelerator && job.NumAcc == 0 { - continue - } - - scope := nativeScope.Max(requestedScope) - for _, s := range handledScopes { - if scope == s { - continue scopesLoop - } - } - handledScopes = append(handledScopes, scope) - - for _, host := range job.Resources { - hwthreads := host.HWThreads - if hwthreads == nil { - hwthreads = topology.Node - } - - // Accelerator -> Accelerator (Use "accelerator" scope if requested scope is lower than node) - if nativeScope == schema.MetricScopeAccelerator && scope.LT(schema.MetricScopeNode) { - if scope != schema.MetricScopeAccelerator { - // Skip all other catched cases - continue - } - - queries = append(queries, ApiQuery{ - Metric: remoteName, - Hostname: host.Hostname, - Aggregate: false, - Type: &acceleratorString, - TypeIds: host.Accelerators, - Resolution: resolution, - }) - assignedScope = append(assignedScope, schema.MetricScopeAccelerator) - continue - } - - // Accelerator -> Node - if nativeScope == schema.MetricScopeAccelerator && scope == schema.MetricScopeNode { - if len(host.Accelerators) == 0 { - continue - } - - queries = append(queries, ApiQuery{ - Metric: remoteName, - Hostname: host.Hostname, - Aggregate: true, - Type: &acceleratorString, - TypeIds: host.Accelerators, - Resolution: resolution, - }) - assignedScope = append(assignedScope, scope) - continue - } - - // HWThread -> HWThead - if nativeScope == schema.MetricScopeHWThread && scope == schema.MetricScopeHWThread { - 
queries = append(queries, ApiQuery{ - Metric: remoteName, - Hostname: host.Hostname, - Aggregate: false, - Type: &hwthreadString, - TypeIds: intToStringSlice(hwthreads), - Resolution: resolution, - }) - assignedScope = append(assignedScope, scope) - continue - } - - // HWThread -> Core - if nativeScope == schema.MetricScopeHWThread && scope == schema.MetricScopeCore { - cores, _ := topology.GetCoresFromHWThreads(hwthreads) - for _, core := range cores { - queries = append(queries, ApiQuery{ - Metric: remoteName, - Hostname: host.Hostname, - Aggregate: true, - Type: &hwthreadString, - TypeIds: intToStringSlice(topology.Core[core]), - Resolution: resolution, - }) - assignedScope = append(assignedScope, scope) - } - continue - } - - // HWThread -> Socket - if nativeScope == schema.MetricScopeHWThread && scope == schema.MetricScopeSocket { - sockets, _ := topology.GetSocketsFromHWThreads(hwthreads) - for _, socket := range sockets { - queries = append(queries, ApiQuery{ - Metric: remoteName, - Hostname: host.Hostname, - Aggregate: true, - Type: &hwthreadString, - TypeIds: intToStringSlice(topology.Socket[socket]), - Resolution: resolution, - }) - assignedScope = append(assignedScope, scope) - } - continue - } - - // HWThread -> Node - if nativeScope == schema.MetricScopeHWThread && scope == schema.MetricScopeNode { - queries = append(queries, ApiQuery{ - Metric: remoteName, - Hostname: host.Hostname, - Aggregate: true, - Type: &hwthreadString, - TypeIds: intToStringSlice(hwthreads), - Resolution: resolution, - }) - assignedScope = append(assignedScope, scope) - continue - } - - // Core -> Core - if nativeScope == schema.MetricScopeCore && scope == schema.MetricScopeCore { - cores, _ := topology.GetCoresFromHWThreads(hwthreads) - queries = append(queries, ApiQuery{ - Metric: remoteName, - Hostname: host.Hostname, - Aggregate: false, - Type: &coreString, - TypeIds: intToStringSlice(cores), - Resolution: resolution, - }) - assignedScope = append(assignedScope, scope) - 
continue - } - - // Core -> Socket - if nativeScope == schema.MetricScopeCore && scope == schema.MetricScopeSocket { - sockets, _ := topology.GetSocketsFromCores(hwthreads) - for _, socket := range sockets { - queries = append(queries, ApiQuery{ - Metric: remoteName, - Hostname: host.Hostname, - Aggregate: true, - Type: &coreString, - TypeIds: intToStringSlice(topology.Socket[socket]), - Resolution: resolution, - }) - assignedScope = append(assignedScope, scope) - } - continue - } - - // Core -> Node - if nativeScope == schema.MetricScopeCore && scope == schema.MetricScopeNode { - cores, _ := topology.GetCoresFromHWThreads(hwthreads) - queries = append(queries, ApiQuery{ - Metric: remoteName, - Hostname: host.Hostname, - Aggregate: true, - Type: &coreString, - TypeIds: intToStringSlice(cores), - Resolution: resolution, - }) - assignedScope = append(assignedScope, scope) - continue - } - - // MemoryDomain -> MemoryDomain - if nativeScope == schema.MetricScopeMemoryDomain && scope == schema.MetricScopeMemoryDomain { - sockets, _ := topology.GetMemoryDomainsFromHWThreads(hwthreads) - queries = append(queries, ApiQuery{ - Metric: remoteName, - Hostname: host.Hostname, - Aggregate: false, - Type: &memoryDomainString, - TypeIds: intToStringSlice(sockets), - Resolution: resolution, - }) - assignedScope = append(assignedScope, scope) - continue - } - - // MemoryDoman -> Node - if nativeScope == schema.MetricScopeMemoryDomain && scope == schema.MetricScopeNode { - sockets, _ := topology.GetMemoryDomainsFromHWThreads(hwthreads) - queries = append(queries, ApiQuery{ - Metric: remoteName, - Hostname: host.Hostname, - Aggregate: true, - Type: &memoryDomainString, - TypeIds: intToStringSlice(sockets), - Resolution: resolution, - }) - assignedScope = append(assignedScope, scope) - continue - } - - // Socket -> Socket - if nativeScope == schema.MetricScopeSocket && scope == schema.MetricScopeSocket { - sockets, _ := topology.GetSocketsFromHWThreads(hwthreads) - queries = 
append(queries, ApiQuery{ - Metric: remoteName, - Hostname: host.Hostname, - Aggregate: false, - Type: &socketString, - TypeIds: intToStringSlice(sockets), - Resolution: resolution, - }) - assignedScope = append(assignedScope, scope) - continue - } - - // Socket -> Node - if nativeScope == schema.MetricScopeSocket && scope == schema.MetricScopeNode { - sockets, _ := topology.GetSocketsFromHWThreads(hwthreads) - queries = append(queries, ApiQuery{ - Metric: remoteName, - Hostname: host.Hostname, - Aggregate: true, - Type: &socketString, - TypeIds: intToStringSlice(sockets), - Resolution: resolution, - }) - assignedScope = append(assignedScope, scope) - continue - } - - // Node -> Node - if nativeScope == schema.MetricScopeNode && scope == schema.MetricScopeNode { - queries = append(queries, ApiQuery{ - Metric: remoteName, - Hostname: host.Hostname, - Resolution: resolution, - }) - assignedScope = append(assignedScope, scope) - continue - } - - return nil, nil, fmt.Errorf("METRICDATA/CCMS > TODO: unhandled case: native-scope=%s, requested-scope=%s", nativeScope, requestedScope) - } - } - } - - return queries, assignedScope, nil -} - -func (ccms *CCMetricStore) LoadStats( - job *schema.Job, - metrics []string, - ctx context.Context, -) (map[string]map[string]schema.MetricStatistics, error) { - - queries, _, err := ccms.buildQueries(job, metrics, []schema.MetricScope{schema.MetricScopeNode}, 0) // #166 Add scope shere for analysis view accelerator normalization? 
- if err != nil { - cclog.Errorf("Error while building queries for jobId %d, Metrics %v: %s", job.JobID, metrics, err.Error()) - return nil, err - } - - req := ApiQueryRequest{ - Cluster: job.Cluster, - From: job.StartTime, - To: job.StartTime + int64(job.Duration), - Queries: queries, - WithStats: true, - WithData: false, - } - - resBody, err := ccms.doRequest(ctx, &req) - if err != nil { - cclog.Errorf("Error while performing request: %s", err.Error()) - return nil, err - } - - stats := make(map[string]map[string]schema.MetricStatistics, len(metrics)) - for i, res := range resBody.Results { - query := req.Queries[i] - metric := ccms.toLocalName(query.Metric) - data := res[0] - if data.Error != nil { - cclog.Errorf("fetching %s for node %s failed: %s", metric, query.Hostname, *data.Error) - continue - } - - metricdata, ok := stats[metric] - if !ok { - metricdata = make(map[string]schema.MetricStatistics, job.NumNodes) - stats[metric] = metricdata - } - - if data.Avg.IsNaN() || data.Min.IsNaN() || data.Max.IsNaN() { - cclog.Warnf("fetching %s for node %s failed: one of avg/min/max is NaN", metric, query.Hostname) - continue - } - - metricdata[query.Hostname] = schema.MetricStatistics{ - Avg: float64(data.Avg), - Min: float64(data.Min), - Max: float64(data.Max), - } - } - - return stats, nil -} - -// Used for Job-View Statistics Table -func (ccms *CCMetricStore) LoadScopedStats( - job *schema.Job, - metrics []string, - scopes []schema.MetricScope, - ctx context.Context, -) (schema.ScopedJobStats, error) { - queries, assignedScope, err := ccms.buildQueries(job, metrics, scopes, 0) - if err != nil { - cclog.Errorf("Error while building queries for jobId %d, Metrics %v, Scopes %v: %s", job.JobID, metrics, scopes, err.Error()) - return nil, err - } - - req := ApiQueryRequest{ - Cluster: job.Cluster, - From: job.StartTime, - To: job.StartTime + int64(job.Duration), - Queries: queries, - WithStats: true, - WithData: false, - } - - resBody, err := ccms.doRequest(ctx, &req) 
- if err != nil { - cclog.Errorf("Error while performing request: %s", err.Error()) - return nil, err - } - - var errors []string - scopedJobStats := make(schema.ScopedJobStats) - - for i, row := range resBody.Results { - query := req.Queries[i] - metric := ccms.toLocalName(query.Metric) - scope := assignedScope[i] - - if _, ok := scopedJobStats[metric]; !ok { - scopedJobStats[metric] = make(map[schema.MetricScope][]*schema.ScopedStats) - } - - if _, ok := scopedJobStats[metric][scope]; !ok { - scopedJobStats[metric][scope] = make([]*schema.ScopedStats, 0) - } - - for ndx, res := range row { - if res.Error != nil { - /* Build list for "partial errors", if any */ - errors = append(errors, fmt.Sprintf("failed to fetch '%s' from host '%s': %s", query.Metric, query.Hostname, *res.Error)) - continue - } - - id := (*string)(nil) - if query.Type != nil { - id = new(string) - *id = query.TypeIds[ndx] - } - - if res.Avg.IsNaN() || res.Min.IsNaN() || res.Max.IsNaN() { - // "schema.Float()" because regular float64 can not be JSONed when NaN. 
- res.Avg = schema.Float(0) - res.Min = schema.Float(0) - res.Max = schema.Float(0) - } - - scopedJobStats[metric][scope] = append(scopedJobStats[metric][scope], &schema.ScopedStats{ - Hostname: query.Hostname, - Id: id, - Data: &schema.MetricStatistics{ - Avg: float64(res.Avg), - Min: float64(res.Min), - Max: float64(res.Max), - }, - }) - } - - // So that one can later check len(scopedJobStats[metric][scope]): Remove from map if empty - if len(scopedJobStats[metric][scope]) == 0 { - delete(scopedJobStats[metric], scope) - if len(scopedJobStats[metric]) == 0 { - delete(scopedJobStats, metric) - } - } - } - - if len(errors) != 0 { - /* Returns list for "partial errors" */ - return scopedJobStats, fmt.Errorf("METRICDATA/CCMS > Errors: %s", strings.Join(errors, ", ")) - } - return scopedJobStats, nil -} - -// Used for Systems-View Node-Overview -func (ccms *CCMetricStore) LoadNodeData( - cluster string, - metrics, nodes []string, - scopes []schema.MetricScope, - from, to time.Time, - ctx context.Context, -) (map[string]map[string][]*schema.JobMetric, error) { - req := ApiQueryRequest{ - Cluster: cluster, - From: from.Unix(), - To: to.Unix(), - WithStats: true, - WithData: true, - } - - if nodes == nil { - for _, metric := range metrics { - req.ForAllNodes = append(req.ForAllNodes, ccms.toRemoteName(metric)) - } - } else { - for _, node := range nodes { - for _, metric := range metrics { - req.Queries = append(req.Queries, ApiQuery{ - Hostname: node, - Metric: ccms.toRemoteName(metric), - Resolution: 0, // Default for Node Queries: Will return metric $Timestep Resolution - }) - } - } - } - - resBody, err := ccms.doRequest(ctx, &req) - if err != nil { - cclog.Errorf("Error while performing request: %s", err.Error()) - return nil, err - } - - var errors []string - data := make(map[string]map[string][]*schema.JobMetric) - for i, res := range resBody.Results { - var query ApiQuery - if resBody.Queries != nil { - query = resBody.Queries[i] - } else { - query = 
req.Queries[i] - } - - metric := ccms.toLocalName(query.Metric) - qdata := res[0] - if qdata.Error != nil { - /* Build list for "partial errors", if any */ - errors = append(errors, fmt.Sprintf("fetching %s for node %s failed: %s", metric, query.Hostname, *qdata.Error)) - } - - if qdata.Avg.IsNaN() || qdata.Min.IsNaN() || qdata.Max.IsNaN() { - // return nil, fmt.Errorf("METRICDATA/CCMS > fetching %s for node %s failed: %s", metric, query.Hostname, "avg/min/max is NaN") - qdata.Avg, qdata.Min, qdata.Max = 0., 0., 0. - } - - hostdata, ok := data[query.Hostname] - if !ok { - hostdata = make(map[string][]*schema.JobMetric) - data[query.Hostname] = hostdata - } - - mc := archive.GetMetricConfig(cluster, metric) - hostdata[metric] = append(hostdata[metric], &schema.JobMetric{ - Unit: mc.Unit, - Timestep: mc.Timestep, - Series: []schema.Series{ - { - Hostname: query.Hostname, - Data: qdata.Data, - Statistics: schema.MetricStatistics{ - Avg: float64(qdata.Avg), - Min: float64(qdata.Min), - Max: float64(qdata.Max), - }, - }, - }, - }) - } - - if len(errors) != 0 { - /* Returns list of "partial errors" */ - return data, fmt.Errorf("METRICDATA/CCMS > Errors: %s", strings.Join(errors, ", ")) - } - - return data, nil -} - -// Used for Systems-View Node-List -func (ccms *CCMetricStore) LoadNodeListData( - cluster, subCluster string, - nodes []string, - metrics []string, - scopes []schema.MetricScope, - resolution int, - from, to time.Time, - ctx context.Context, -) (map[string]schema.JobData, error) { - - // Note: Order of node data is not guaranteed after this point - queries, assignedScope, err := ccms.buildNodeQueries(cluster, subCluster, nodes, metrics, scopes, resolution) - if err != nil { - cclog.Errorf("Error while building node queries for Cluster %s, SubCLuster %s, Metrics %v, Scopes %v: %s", cluster, subCluster, metrics, scopes, err.Error()) - return nil, err - } - - req := ApiQueryRequest{ - Cluster: cluster, - Queries: queries, - From: from.Unix(), - To: to.Unix(), - 
WithStats: true, - WithData: true, - } - - resBody, err := ccms.doRequest(ctx, &req) - if err != nil { - cclog.Errorf("Error while performing request: %s", err.Error()) - return nil, err - } - - var errors []string - data := make(map[string]schema.JobData) - for i, row := range resBody.Results { - var query ApiQuery - if resBody.Queries != nil { - query = resBody.Queries[i] - } else { - query = req.Queries[i] - } - // qdata := res[0] - metric := ccms.toLocalName(query.Metric) - scope := assignedScope[i] - mc := archive.GetMetricConfig(cluster, metric) - - res := mc.Timestep - if len(row) > 0 { - res = row[0].Resolution - } - - // Init Nested Map Data Structures If Not Found - hostData, ok := data[query.Hostname] - if !ok { - hostData = make(schema.JobData) - data[query.Hostname] = hostData - } - - metricData, ok := hostData[metric] - if !ok { - metricData = make(map[schema.MetricScope]*schema.JobMetric) - data[query.Hostname][metric] = metricData - } - - scopeData, ok := metricData[scope] - if !ok { - scopeData = &schema.JobMetric{ - Unit: mc.Unit, - Timestep: res, - Series: make([]schema.Series, 0), - } - data[query.Hostname][metric][scope] = scopeData - } - - for ndx, res := range row { - if res.Error != nil { - /* Build list for "partial errors", if any */ - errors = append(errors, fmt.Sprintf("failed to fetch '%s' from host '%s': %s", query.Metric, query.Hostname, *res.Error)) - continue - } - - id := (*string)(nil) - if query.Type != nil { - id = new(string) - *id = query.TypeIds[ndx] - } - - if res.Avg.IsNaN() || res.Min.IsNaN() || res.Max.IsNaN() { - // "schema.Float()" because regular float64 can not be JSONed when NaN. 
- res.Avg = schema.Float(0) - res.Min = schema.Float(0) - res.Max = schema.Float(0) - } - - scopeData.Series = append(scopeData.Series, schema.Series{ - Hostname: query.Hostname, - Id: id, - Statistics: schema.MetricStatistics{ - Avg: float64(res.Avg), - Min: float64(res.Min), - Max: float64(res.Max), - }, - Data: res.Data, - }) - } - } - - if len(errors) != 0 { - /* Returns list of "partial errors" */ - return data, fmt.Errorf("METRICDATA/CCMS > Errors: %s", strings.Join(errors, ", ")) - } - - return data, nil -} - -func (ccms *CCMetricStore) buildNodeQueries( - cluster string, - subCluster string, - nodes []string, - metrics []string, - scopes []schema.MetricScope, - resolution int, -) ([]ApiQuery, []schema.MetricScope, error) { - - queries := make([]ApiQuery, 0, len(metrics)*len(scopes)*len(nodes)) - assignedScope := []schema.MetricScope{} - - // Get Topol before loop if subCluster given - var subClusterTopol *schema.SubCluster - var scterr error - if subCluster != "" { - subClusterTopol, scterr = archive.GetSubCluster(cluster, subCluster) - if scterr != nil { - cclog.Errorf("could not load cluster %s subCluster %s topology: %s", cluster, subCluster, scterr.Error()) - return nil, nil, scterr - } - } - - for _, metric := range metrics { - remoteName := ccms.toRemoteName(metric) - mc := archive.GetMetricConfig(cluster, metric) - if mc == nil { - // return nil, fmt.Errorf("METRICDATA/CCMS > metric '%s' is not specified for cluster '%s'", metric, cluster) - cclog.Warnf("metric '%s' is not specified for cluster '%s'", metric, cluster) - continue - } - - // Skip if metric is removed for subcluster - if mc.SubClusters != nil { - isRemoved := false - for _, scConfig := range mc.SubClusters { - if scConfig.Name == subCluster && scConfig.Remove { - isRemoved = true - break - } - } - if isRemoved { - continue - } - } - - // Avoid duplicates... 
- handledScopes := make([]schema.MetricScope, 0, 3) - - scopesLoop: - for _, requestedScope := range scopes { - nativeScope := mc.Scope - - scope := nativeScope.Max(requestedScope) - for _, s := range handledScopes { - if scope == s { - continue scopesLoop - } - } - handledScopes = append(handledScopes, scope) - - for _, hostname := range nodes { - - // If no subCluster given, get it by node - if subCluster == "" { - subClusterName, scnerr := archive.GetSubClusterByNode(cluster, hostname) - if scnerr != nil { - return nil, nil, scnerr - } - subClusterTopol, scterr = archive.GetSubCluster(cluster, subClusterName) - if scterr != nil { - return nil, nil, scterr - } - } - - // Always full node hwthread id list, no partial queries expected -> Use "topology.Node" directly where applicable - // Always full accelerator id list, no partial queries expected -> Use "acceleratorIds" directly where applicable - topology := subClusterTopol.Topology - acceleratorIds := topology.GetAcceleratorIDs() - - // Moved check here if metric matches hardware specs - if nativeScope == schema.MetricScopeAccelerator && len(acceleratorIds) == 0 { - continue scopesLoop - } - - // Accelerator -> Accelerator (Use "accelerator" scope if requested scope is lower than node) - if nativeScope == schema.MetricScopeAccelerator && scope.LT(schema.MetricScopeNode) { - if scope != schema.MetricScopeAccelerator { - // Skip all other catched cases - continue - } - - queries = append(queries, ApiQuery{ - Metric: remoteName, - Hostname: hostname, - Aggregate: false, - Type: &acceleratorString, - TypeIds: acceleratorIds, - Resolution: resolution, - }) - assignedScope = append(assignedScope, schema.MetricScopeAccelerator) - continue - } - - // Accelerator -> Node - if nativeScope == schema.MetricScopeAccelerator && scope == schema.MetricScopeNode { - if len(acceleratorIds) == 0 { - continue - } - - queries = append(queries, ApiQuery{ - Metric: remoteName, - Hostname: hostname, - Aggregate: true, - Type: 
&acceleratorString, - TypeIds: acceleratorIds, - Resolution: resolution, - }) - assignedScope = append(assignedScope, scope) - continue - } - - // HWThread -> HWThead - if nativeScope == schema.MetricScopeHWThread && scope == schema.MetricScopeHWThread { - queries = append(queries, ApiQuery{ - Metric: remoteName, - Hostname: hostname, - Aggregate: false, - Type: &hwthreadString, - TypeIds: intToStringSlice(topology.Node), - Resolution: resolution, - }) - assignedScope = append(assignedScope, scope) - continue - } - - // HWThread -> Core - if nativeScope == schema.MetricScopeHWThread && scope == schema.MetricScopeCore { - cores, _ := topology.GetCoresFromHWThreads(topology.Node) - for _, core := range cores { - queries = append(queries, ApiQuery{ - Metric: remoteName, - Hostname: hostname, - Aggregate: true, - Type: &hwthreadString, - TypeIds: intToStringSlice(topology.Core[core]), - Resolution: resolution, - }) - assignedScope = append(assignedScope, scope) - } - continue - } - - // HWThread -> Socket - if nativeScope == schema.MetricScopeHWThread && scope == schema.MetricScopeSocket { - sockets, _ := topology.GetSocketsFromHWThreads(topology.Node) - for _, socket := range sockets { - queries = append(queries, ApiQuery{ - Metric: remoteName, - Hostname: hostname, - Aggregate: true, - Type: &hwthreadString, - TypeIds: intToStringSlice(topology.Socket[socket]), - Resolution: resolution, - }) - assignedScope = append(assignedScope, scope) - } - continue - } - - // HWThread -> Node - if nativeScope == schema.MetricScopeHWThread && scope == schema.MetricScopeNode { - queries = append(queries, ApiQuery{ - Metric: remoteName, - Hostname: hostname, - Aggregate: true, - Type: &hwthreadString, - TypeIds: intToStringSlice(topology.Node), - Resolution: resolution, - }) - assignedScope = append(assignedScope, scope) - continue - } - - // Core -> Core - if nativeScope == schema.MetricScopeCore && scope == schema.MetricScopeCore { - cores, _ := 
topology.GetCoresFromHWThreads(topology.Node) - queries = append(queries, ApiQuery{ - Metric: remoteName, - Hostname: hostname, - Aggregate: false, - Type: &coreString, - TypeIds: intToStringSlice(cores), - Resolution: resolution, - }) - assignedScope = append(assignedScope, scope) - continue - } - - // Core -> Socket - if nativeScope == schema.MetricScopeCore && scope == schema.MetricScopeSocket { - sockets, _ := topology.GetSocketsFromCores(topology.Node) - for _, socket := range sockets { - queries = append(queries, ApiQuery{ - Metric: remoteName, - Hostname: hostname, - Aggregate: true, - Type: &coreString, - TypeIds: intToStringSlice(topology.Socket[socket]), - Resolution: resolution, - }) - assignedScope = append(assignedScope, scope) - } - continue - } - - // Core -> Node - if nativeScope == schema.MetricScopeCore && scope == schema.MetricScopeNode { - cores, _ := topology.GetCoresFromHWThreads(topology.Node) - queries = append(queries, ApiQuery{ - Metric: remoteName, - Hostname: hostname, - Aggregate: true, - Type: &coreString, - TypeIds: intToStringSlice(cores), - Resolution: resolution, - }) - assignedScope = append(assignedScope, scope) - continue - } - - // MemoryDomain -> MemoryDomain - if nativeScope == schema.MetricScopeMemoryDomain && scope == schema.MetricScopeMemoryDomain { - sockets, _ := topology.GetMemoryDomainsFromHWThreads(topology.Node) - queries = append(queries, ApiQuery{ - Metric: remoteName, - Hostname: hostname, - Aggregate: false, - Type: &memoryDomainString, - TypeIds: intToStringSlice(sockets), - Resolution: resolution, - }) - assignedScope = append(assignedScope, scope) - continue - } - - // MemoryDoman -> Node - if nativeScope == schema.MetricScopeMemoryDomain && scope == schema.MetricScopeNode { - sockets, _ := topology.GetMemoryDomainsFromHWThreads(topology.Node) - queries = append(queries, ApiQuery{ - Metric: remoteName, - Hostname: hostname, - Aggregate: true, - Type: &memoryDomainString, - TypeIds: intToStringSlice(sockets), - 
Resolution: resolution, - }) - assignedScope = append(assignedScope, scope) - continue - } - - // Socket -> Socket - if nativeScope == schema.MetricScopeSocket && scope == schema.MetricScopeSocket { - sockets, _ := topology.GetSocketsFromHWThreads(topology.Node) - queries = append(queries, ApiQuery{ - Metric: remoteName, - Hostname: hostname, - Aggregate: false, - Type: &socketString, - TypeIds: intToStringSlice(sockets), - Resolution: resolution, - }) - assignedScope = append(assignedScope, scope) - continue - } - - // Socket -> Node - if nativeScope == schema.MetricScopeSocket && scope == schema.MetricScopeNode { - sockets, _ := topology.GetSocketsFromHWThreads(topology.Node) - queries = append(queries, ApiQuery{ - Metric: remoteName, - Hostname: hostname, - Aggregate: true, - Type: &socketString, - TypeIds: intToStringSlice(sockets), - Resolution: resolution, - }) - assignedScope = append(assignedScope, scope) - continue - } - - // Node -> Node - if nativeScope == schema.MetricScopeNode && scope == schema.MetricScopeNode { - queries = append(queries, ApiQuery{ - Metric: remoteName, - Hostname: hostname, - Resolution: resolution, - }) - assignedScope = append(assignedScope, scope) - continue - } - - return nil, nil, fmt.Errorf("METRICDATA/CCMS > TODO: unhandled case: native-scope=%s, requested-scope=%s", nativeScope, requestedScope) - } - } - } - - return queries, assignedScope, nil -} diff --git a/internal/metricdata/metricdata.go b/internal/metricdata/metricdata.go deleted file mode 100644 index 0748a8d5..00000000 --- a/internal/metricdata/metricdata.go +++ /dev/null @@ -1,88 +0,0 @@ -// Copyright (C) NHR@FAU, University Erlangen-Nuremberg. -// All rights reserved. This file is part of cc-backend. -// Use of this source code is governed by a MIT-style -// license that can be found in the LICENSE file. 
- -package metricdata - -import ( - "context" - "encoding/json" - "fmt" - "time" - - "github.com/ClusterCockpit/cc-backend/internal/config" - "github.com/ClusterCockpit/cc-backend/internal/memorystore" - cclog "github.com/ClusterCockpit/cc-lib/ccLogger" - "github.com/ClusterCockpit/cc-lib/schema" -) - -type MetricDataRepository interface { - // Initialize this MetricDataRepository. One instance of - // this interface will only ever be responsible for one cluster. - Init(rawConfig json.RawMessage) error - - // Return the JobData for the given job, only with the requested metrics. - LoadData(job *schema.Job, metrics []string, scopes []schema.MetricScope, ctx context.Context, resolution int) (schema.JobData, error) - - // Return a map of metrics to a map of nodes to the metric statistics of the job. node scope only. - LoadStats(job *schema.Job, metrics []string, ctx context.Context) (map[string]map[string]schema.MetricStatistics, error) - - // Return a map of metrics to a map of scopes to the scoped metric statistics of the job. - LoadScopedStats(job *schema.Job, metrics []string, scopes []schema.MetricScope, ctx context.Context) (schema.ScopedJobStats, error) - - // Return a map of hosts to a map of metrics at the requested scopes (currently only node) for that node. - LoadNodeData(cluster string, metrics, nodes []string, scopes []schema.MetricScope, from, to time.Time, ctx context.Context) (map[string]map[string][]*schema.JobMetric, error) - - // Return a map of hosts to a map of metrics to a map of scopes for multiple nodes. 
- LoadNodeListData(cluster, subCluster string, nodes, metrics []string, scopes []schema.MetricScope, resolution int, from, to time.Time, ctx context.Context) (map[string]schema.JobData, error) -} - -var metricDataRepos map[string]MetricDataRepository = map[string]MetricDataRepository{} - -func Init() error { - for _, cluster := range config.Clusters { - if cluster.MetricDataRepository != nil { - var kind struct { - Kind string `json:"kind"` - } - if err := json.Unmarshal(cluster.MetricDataRepository, &kind); err != nil { - cclog.Warn("Error while unmarshaling raw json MetricDataRepository") - return err - } - - var mdr MetricDataRepository - switch kind.Kind { - case "cc-metric-store": - mdr = &CCMetricStore{} - case "cc-metric-store-internal": - mdr = &CCMetricStoreInternal{} - memorystore.InternalCCMSFlag = true - case "prometheus": - mdr = &PrometheusDataRepository{} - case "test": - mdr = &TestMetricDataRepository{} - default: - return fmt.Errorf("METRICDATA/METRICDATA > Unknown MetricDataRepository %v for cluster %v", kind.Kind, cluster.Name) - } - - if err := mdr.Init(cluster.MetricDataRepository); err != nil { - cclog.Errorf("Error initializing MetricDataRepository %v for cluster %v", kind.Kind, cluster.Name) - return err - } - metricDataRepos[cluster.Name] = mdr - } - } - return nil -} - -func GetMetricDataRepo(cluster string) (MetricDataRepository, error) { - var err error - repo, ok := metricDataRepos[cluster] - - if !ok { - err = fmt.Errorf("METRICDATA/METRICDATA > no metric data repository configured for '%s'", cluster) - } - - return repo, err -} diff --git a/internal/metricdata/prometheus.go b/internal/metricdata/prometheus.go deleted file mode 100644 index 66c5bc1e..00000000 --- a/internal/metricdata/prometheus.go +++ /dev/null @@ -1,587 +0,0 @@ -// Copyright (C) 2022 DKRZ -// All rights reserved. This file is part of cc-backend. -// Use of this source code is governed by a MIT-style -// license that can be found in the LICENSE file. 
-package metricdata - -import ( - "bytes" - "context" - "encoding/json" - "errors" - "fmt" - "math" - "net/http" - "os" - "regexp" - "sort" - "strings" - "sync" - "text/template" - "time" - - "github.com/ClusterCockpit/cc-backend/pkg/archive" - cclog "github.com/ClusterCockpit/cc-lib/ccLogger" - "github.com/ClusterCockpit/cc-lib/schema" - promapi "github.com/prometheus/client_golang/api" - promv1 "github.com/prometheus/client_golang/api/prometheus/v1" - promcfg "github.com/prometheus/common/config" - promm "github.com/prometheus/common/model" -) - -type PrometheusDataRepositoryConfig struct { - Url string `json:"url"` - Username string `json:"username,omitempty"` - Suffix string `json:"suffix,omitempty"` - Templates map[string]string `json:"query-templates"` -} - -type PrometheusDataRepository struct { - client promapi.Client - queryClient promv1.API - suffix string - templates map[string]*template.Template -} - -type PromQLArgs struct { - Nodes string -} - -type Trie map[rune]Trie - -var logOnce sync.Once - -func contains(s []schema.MetricScope, str schema.MetricScope) bool { - for _, v := range s { - if v == str { - return true - } - } - return false -} - -func MinMaxMean(data []schema.Float) (float64, float64, float64) { - if len(data) == 0 { - return 0.0, 0.0, 0.0 - } - min := math.MaxFloat64 - max := -math.MaxFloat64 - var sum float64 - var n float64 - for _, val := range data { - if val.IsNaN() { - continue - } - sum += float64(val) - n += 1 - if float64(val) > max { - max = float64(val) - } - if float64(val) < min { - min = float64(val) - } - } - return min, max, sum / n -} - -// Rewritten from -// https://github.com/ermanh/trieregex/blob/master/trieregex/trieregex.py -func nodeRegex(nodes []string) string { - root := Trie{} - // add runes of each compute node to trie - for _, node := range nodes { - _trie := root - for _, c := range node { - if _, ok := _trie[c]; !ok { - _trie[c] = Trie{} - } - _trie = _trie[c] - } - _trie['*'] = Trie{} - } - // recursively 
build regex from rune trie - var trieRegex func(trie Trie, reset bool) string - trieRegex = func(trie Trie, reset bool) string { - if reset == true { - trie = root - } - if len(trie) == 0 { - return "" - } - if len(trie) == 1 { - for key, _trie := range trie { - if key == '*' { - return "" - } - return regexp.QuoteMeta(string(key)) + trieRegex(_trie, false) - } - } else { - sequences := []string{} - for key, _trie := range trie { - if key != '*' { - sequences = append(sequences, regexp.QuoteMeta(string(key))+trieRegex(_trie, false)) - } - } - sort.Slice(sequences, func(i, j int) bool { - return (-len(sequences[i]) < -len(sequences[j])) || (sequences[i] < sequences[j]) - }) - var result string - // single edge from this tree node - if len(sequences) == 1 { - result = sequences[0] - if len(result) > 1 { - result = "(?:" + result + ")" - } - // multiple edges, each length 1 - } else if s := strings.Join(sequences, ""); len(s) == len(sequences) { - // char or numeric range - if len(s)-1 == int(s[len(s)-1])-int(s[0]) { - result = fmt.Sprintf("[%c-%c]", s[0], s[len(s)-1]) - // char or numeric set - } else { - result = "[" + s + "]" - } - // multiple edges of different lengths - } else { - result = "(?:" + strings.Join(sequences, "|") + ")" - } - if _, ok := trie['*']; ok { - result += "?" 
- } - return result - } - return "" - } - return trieRegex(root, true) -} - -func (pdb *PrometheusDataRepository) Init(rawConfig json.RawMessage) error { - var config PrometheusDataRepositoryConfig - // parse config - if err := json.Unmarshal(rawConfig, &config); err != nil { - cclog.Warn("Error while unmarshaling raw json config") - return err - } - // support basic authentication - var rt http.RoundTripper = nil - if prom_pw := os.Getenv("PROMETHEUS_PASSWORD"); prom_pw != "" && config.Username != "" { - prom_pw := promcfg.Secret(prom_pw) - rt = promcfg.NewBasicAuthRoundTripper(promcfg.NewInlineSecret(config.Username), promcfg.NewInlineSecret(string(prom_pw)), promapi.DefaultRoundTripper) - } else { - if config.Username != "" { - return errors.New("METRICDATA/PROMETHEUS > Prometheus username provided, but PROMETHEUS_PASSWORD not set") - } - } - // init client - client, err := promapi.NewClient(promapi.Config{ - Address: config.Url, - RoundTripper: rt, - }) - if err != nil { - cclog.Error("Error while initializing new prometheus client") - return err - } - // init query client - pdb.client = client - pdb.queryClient = promv1.NewAPI(pdb.client) - // site config - pdb.suffix = config.Suffix - // init query templates - pdb.templates = make(map[string]*template.Template) - for metric, templ := range config.Templates { - pdb.templates[metric], err = template.New(metric).Parse(templ) - if err == nil { - cclog.Debugf("Added PromQL template for %s: %s", metric, templ) - } else { - cclog.Warnf("Failed to parse PromQL template %s for metric %s", templ, metric) - } - } - return nil -} - -// TODO: respect scope argument -func (pdb *PrometheusDataRepository) FormatQuery( - metric string, - scope schema.MetricScope, - nodes []string, - cluster string, -) (string, error) { - args := PromQLArgs{} - if len(nodes) > 0 { - args.Nodes = fmt.Sprintf("(%s)%s", nodeRegex(nodes), pdb.suffix) - } else { - args.Nodes = fmt.Sprintf(".*%s", pdb.suffix) - } - - buf := &bytes.Buffer{} - if 
templ, ok := pdb.templates[metric]; ok { - err := templ.Execute(buf, args) - if err != nil { - return "", errors.New(fmt.Sprintf("METRICDATA/PROMETHEUS > Error compiling template %v", templ)) - } else { - query := buf.String() - cclog.Debugf("PromQL: %s", query) - return query, nil - } - } else { - return "", errors.New(fmt.Sprintf("METRICDATA/PROMETHEUS > No PromQL for metric %s configured.", metric)) - } -} - -// Convert PromAPI row to CC schema.Series -func (pdb *PrometheusDataRepository) RowToSeries( - from time.Time, - step int64, - steps int64, - row *promm.SampleStream, -) schema.Series { - ts := from.Unix() - hostname := strings.TrimSuffix(string(row.Metric["exported_instance"]), pdb.suffix) - // init array of expected length with NaN - values := make([]schema.Float, steps+1) - for i := range values { - values[i] = schema.NaN - } - // copy recorded values from prom sample pair - for _, v := range row.Values { - idx := (v.Timestamp.Unix() - ts) / step - values[idx] = schema.Float(v.Value) - } - min, max, mean := MinMaxMean(values) - // output struct - return schema.Series{ - Hostname: hostname, - Data: values, - Statistics: schema.MetricStatistics{ - Avg: mean, - Min: min, - Max: max, - }, - } -} - -func (pdb *PrometheusDataRepository) LoadData( - job *schema.Job, - metrics []string, - scopes []schema.MetricScope, - ctx context.Context, - resolution int, -) (schema.JobData, error) { - // TODO respect requested scope - if len(scopes) == 0 || !contains(scopes, schema.MetricScopeNode) { - scopes = append(scopes, schema.MetricScopeNode) - } - - jobData := make(schema.JobData) - // parse job specs - nodes := make([]string, len(job.Resources)) - for i, resource := range job.Resources { - nodes[i] = resource.Hostname - } - from := time.Unix(job.StartTime, 0) - to := time.Unix(job.StartTime+int64(job.Duration), 0) - - for _, scope := range scopes { - if scope != schema.MetricScopeNode { - logOnce.Do(func() { - cclog.Infof("Scope '%s' requested, but not yet 
supported: Will return 'node' scope only.", scope) - }) - continue - } - - for _, metric := range metrics { - metricConfig := archive.GetMetricConfig(job.Cluster, metric) - if metricConfig == nil { - cclog.Warnf("Error in LoadData: Metric %s for cluster %s not configured", metric, job.Cluster) - return nil, errors.New("Prometheus config error") - } - query, err := pdb.FormatQuery(metric, scope, nodes, job.Cluster) - if err != nil { - cclog.Warn("Error while formatting prometheus query") - return nil, err - } - - // ranged query over all job nodes - r := promv1.Range{ - Start: from, - End: to, - Step: time.Duration(metricConfig.Timestep * 1e9), - } - result, warnings, err := pdb.queryClient.QueryRange(ctx, query, r) - if err != nil { - cclog.Errorf("Prometheus query error in LoadData: %v\nQuery: %s", err, query) - return nil, errors.New("Prometheus query error") - } - if len(warnings) > 0 { - cclog.Warnf("Warnings: %v\n", warnings) - } - - // init data structures - if _, ok := jobData[metric]; !ok { - jobData[metric] = make(map[schema.MetricScope]*schema.JobMetric) - } - jobMetric, ok := jobData[metric][scope] - if !ok { - jobMetric = &schema.JobMetric{ - Unit: metricConfig.Unit, - Timestep: metricConfig.Timestep, - Series: make([]schema.Series, 0), - } - } - step := int64(metricConfig.Timestep) - steps := int64(to.Sub(from).Seconds()) / step - // iter rows of host, metric, values - for _, row := range result.(promm.Matrix) { - jobMetric.Series = append(jobMetric.Series, - pdb.RowToSeries(from, step, steps, row)) - } - // only add metric if at least one host returned data - if !ok && len(jobMetric.Series) > 0 { - jobData[metric][scope] = jobMetric - } - // sort by hostname to get uniform coloring - sort.Slice(jobMetric.Series, func(i, j int) bool { - return (jobMetric.Series[i].Hostname < jobMetric.Series[j].Hostname) - }) - } - } - return jobData, nil -} - -// TODO change implementation to precomputed/cached stats -func (pdb *PrometheusDataRepository) LoadStats( - 
job *schema.Job, - metrics []string, - ctx context.Context, -) (map[string]map[string]schema.MetricStatistics, error) { - // map of metrics of nodes of stats - stats := map[string]map[string]schema.MetricStatistics{} - - data, err := pdb.LoadData(job, metrics, []schema.MetricScope{schema.MetricScopeNode}, ctx, 0 /*resolution here*/) - if err != nil { - cclog.Warn("Error while loading job for stats") - return nil, err - } - for metric, metricData := range data { - stats[metric] = make(map[string]schema.MetricStatistics) - for _, series := range metricData[schema.MetricScopeNode].Series { - stats[metric][series.Hostname] = series.Statistics - } - } - - return stats, nil -} - -func (pdb *PrometheusDataRepository) LoadNodeData( - cluster string, - metrics, nodes []string, - scopes []schema.MetricScope, - from, to time.Time, - ctx context.Context, -) (map[string]map[string][]*schema.JobMetric, error) { - t0 := time.Now() - // Map of hosts of metrics of value slices - data := make(map[string]map[string][]*schema.JobMetric) - // query db for each metric - // TODO: scopes seems to be always empty - if len(scopes) == 0 || !contains(scopes, schema.MetricScopeNode) { - scopes = append(scopes, schema.MetricScopeNode) - } - for _, scope := range scopes { - if scope != schema.MetricScopeNode { - logOnce.Do(func() { - cclog.Infof("Note: Scope '%s' requested, but not yet supported: Will return 'node' scope only.", scope) - }) - continue - } - for _, metric := range metrics { - metricConfig := archive.GetMetricConfig(cluster, metric) - if metricConfig == nil { - cclog.Warnf("Error in LoadNodeData: Metric %s for cluster %s not configured", metric, cluster) - return nil, errors.New("Prometheus config error") - } - query, err := pdb.FormatQuery(metric, scope, nodes, cluster) - if err != nil { - cclog.Warn("Error while formatting prometheus query") - return nil, err - } - - // ranged query over all nodes - r := promv1.Range{ - Start: from, - End: to, - Step: 
time.Duration(metricConfig.Timestep * 1e9), - } - result, warnings, err := pdb.queryClient.QueryRange(ctx, query, r) - if err != nil { - cclog.Errorf("Prometheus query error in LoadNodeData: %v\n", err) - return nil, errors.New("Prometheus query error") - } - if len(warnings) > 0 { - cclog.Warnf("Warnings: %v\n", warnings) - } - - step := int64(metricConfig.Timestep) - steps := int64(to.Sub(from).Seconds()) / step - - // iter rows of host, metric, values - for _, row := range result.(promm.Matrix) { - hostname := strings.TrimSuffix(string(row.Metric["exported_instance"]), pdb.suffix) - hostdata, ok := data[hostname] - if !ok { - hostdata = make(map[string][]*schema.JobMetric) - data[hostname] = hostdata - } - // output per host and metric - hostdata[metric] = append(hostdata[metric], &schema.JobMetric{ - Unit: metricConfig.Unit, - Timestep: metricConfig.Timestep, - Series: []schema.Series{pdb.RowToSeries(from, step, steps, row)}, - }, - ) - } - } - } - t1 := time.Since(t0) - cclog.Debugf("LoadNodeData of %v nodes took %s", len(data), t1) - return data, nil -} - -// Implemented by NHR@FAU; Used in Job-View StatsTable -func (pdb *PrometheusDataRepository) LoadScopedStats( - job *schema.Job, - metrics []string, - scopes []schema.MetricScope, - ctx context.Context, -) (schema.ScopedJobStats, error) { - // Assumption: pdb.loadData() only returns series node-scope - use node scope for statsTable - scopedJobStats := make(schema.ScopedJobStats) - data, err := pdb.LoadData(job, metrics, []schema.MetricScope{schema.MetricScopeNode}, ctx, 0 /*resolution here*/) - if err != nil { - cclog.Warn("Error while loading job for scopedJobStats") - return nil, err - } - - for metric, metricData := range data { - for _, scope := range scopes { - if scope != schema.MetricScopeNode { - logOnce.Do(func() { - cclog.Infof("Note: Scope '%s' requested, but not yet supported: Will return 'node' scope only.", scope) - }) - continue - } - - if _, ok := scopedJobStats[metric]; !ok { - 
scopedJobStats[metric] = make(map[schema.MetricScope][]*schema.ScopedStats) - } - - if _, ok := scopedJobStats[metric][scope]; !ok { - scopedJobStats[metric][scope] = make([]*schema.ScopedStats, 0) - } - - for _, series := range metricData[scope].Series { - scopedJobStats[metric][scope] = append(scopedJobStats[metric][scope], &schema.ScopedStats{ - Hostname: series.Hostname, - Data: &series.Statistics, - }) - } - } - } - - return scopedJobStats, nil -} - -// Implemented by NHR@FAU; Used in NodeList-View -func (pdb *PrometheusDataRepository) LoadNodeListData( - cluster, subCluster string, - nodes []string, - metrics []string, - scopes []schema.MetricScope, - resolution int, - from, to time.Time, - ctx context.Context, -) (map[string]schema.JobData, error) { - // Assumption: pdb.loadData() only returns series node-scope - use node scope for NodeList - - // Fetch Data, based on pdb.LoadNodeData() - t0 := time.Now() - // Map of hosts of jobData - data := make(map[string]schema.JobData) - - // query db for each metric - // TODO: scopes seems to be always empty - if len(scopes) == 0 || !contains(scopes, schema.MetricScopeNode) { - scopes = append(scopes, schema.MetricScopeNode) - } - - for _, scope := range scopes { - if scope != schema.MetricScopeNode { - logOnce.Do(func() { - cclog.Infof("Note: Scope '%s' requested, but not yet supported: Will return 'node' scope only.", scope) - }) - continue - } - - for _, metric := range metrics { - metricConfig := archive.GetMetricConfig(cluster, metric) - if metricConfig == nil { - cclog.Warnf("Error in LoadNodeListData: Metric %s for cluster %s not configured", metric, cluster) - return nil, errors.New("Prometheus config error") - } - query, err := pdb.FormatQuery(metric, scope, nodes, cluster) - if err != nil { - cclog.Warn("Error while formatting prometheus query") - return nil, err - } - - // ranged query over all nodes - r := promv1.Range{ - Start: from, - End: to, - Step: time.Duration(metricConfig.Timestep * 1e9), - } - 
result, warnings, err := pdb.queryClient.QueryRange(ctx, query, r) - if err != nil { - cclog.Errorf("Prometheus query error in LoadNodeData: %v\n", err) - return nil, errors.New("Prometheus query error") - } - if len(warnings) > 0 { - cclog.Warnf("Warnings: %v\n", warnings) - } - - step := int64(metricConfig.Timestep) - steps := int64(to.Sub(from).Seconds()) / step - - // iter rows of host, metric, values - for _, row := range result.(promm.Matrix) { - hostname := strings.TrimSuffix(string(row.Metric["exported_instance"]), pdb.suffix) - - hostdata, ok := data[hostname] - if !ok { - hostdata = make(schema.JobData) - data[hostname] = hostdata - } - - metricdata, ok := hostdata[metric] - if !ok { - metricdata = make(map[schema.MetricScope]*schema.JobMetric) - data[hostname][metric] = metricdata - } - - // output per host, metric and scope - scopeData, ok := metricdata[scope] - if !ok { - scopeData = &schema.JobMetric{ - Unit: metricConfig.Unit, - Timestep: metricConfig.Timestep, - Series: []schema.Series{pdb.RowToSeries(from, step, steps, row)}, - } - data[hostname][metric][scope] = scopeData - } - } - } - } - t1 := time.Since(t0) - cclog.Debugf("LoadNodeListData of %v nodes took %s", len(data), t1) - return data, nil -} diff --git a/internal/metricdata/utils.go b/internal/metricdata/utils.go deleted file mode 100644 index 0b2bb7ec..00000000 --- a/internal/metricdata/utils.go +++ /dev/null @@ -1,118 +0,0 @@ -// Copyright (C) NHR@FAU, University Erlangen-Nuremberg. -// All rights reserved. This file is part of cc-backend. -// Use of this source code is governed by a MIT-style -// license that can be found in the LICENSE file. 
- -package metricdata - -import ( - "context" - "encoding/json" - "time" - - "github.com/ClusterCockpit/cc-lib/schema" -) - -var TestLoadDataCallback func(job *schema.Job, metrics []string, scopes []schema.MetricScope, ctx context.Context, resolution int) (schema.JobData, error) = func(job *schema.Job, metrics []string, scopes []schema.MetricScope, ctx context.Context, resolution int) (schema.JobData, error) { - panic("TODO") -} - -// TestMetricDataRepository is only a mock for unit-testing. -type TestMetricDataRepository struct{} - -func (tmdr *TestMetricDataRepository) Init(_ json.RawMessage) error { - return nil -} - -func (tmdr *TestMetricDataRepository) LoadData( - job *schema.Job, - metrics []string, - scopes []schema.MetricScope, - ctx context.Context, - resolution int, -) (schema.JobData, error) { - return TestLoadDataCallback(job, metrics, scopes, ctx, resolution) -} - -func (tmdr *TestMetricDataRepository) LoadStats( - job *schema.Job, - metrics []string, - ctx context.Context, -) (map[string]map[string]schema.MetricStatistics, error) { - panic("TODO") -} - -func (tmdr *TestMetricDataRepository) LoadScopedStats( - job *schema.Job, - metrics []string, - scopes []schema.MetricScope, - ctx context.Context, -) (schema.ScopedJobStats, error) { - panic("TODO") -} - -func (tmdr *TestMetricDataRepository) LoadNodeData( - cluster string, - metrics, nodes []string, - scopes []schema.MetricScope, - from, to time.Time, - ctx context.Context, -) (map[string]map[string][]*schema.JobMetric, error) { - panic("TODO") -} - -func (tmdr *TestMetricDataRepository) LoadNodeListData( - cluster, subCluster string, - nodes []string, - metrics []string, - scopes []schema.MetricScope, - resolution int, - from, to time.Time, - ctx context.Context, -) (map[string]schema.JobData, error) { - panic("TODO") -} - -func DeepCopy(jdTemp schema.JobData) schema.JobData { - jd := make(schema.JobData, len(jdTemp)) - for k, v := range jdTemp { - jd[k] = 
make(map[schema.MetricScope]*schema.JobMetric, len(jdTemp[k])) - for k_, v_ := range v { - jd[k][k_] = new(schema.JobMetric) - jd[k][k_].Series = make([]schema.Series, len(v_.Series)) - for i := 0; i < len(v_.Series); i += 1 { - jd[k][k_].Series[i].Data = make([]schema.Float, len(v_.Series[i].Data)) - copy(jd[k][k_].Series[i].Data, v_.Series[i].Data) - jd[k][k_].Series[i].Hostname = v_.Series[i].Hostname - jd[k][k_].Series[i].Id = v_.Series[i].Id - jd[k][k_].Series[i].Statistics.Avg = v_.Series[i].Statistics.Avg - jd[k][k_].Series[i].Statistics.Min = v_.Series[i].Statistics.Min - jd[k][k_].Series[i].Statistics.Max = v_.Series[i].Statistics.Max - } - jd[k][k_].Timestep = v_.Timestep - jd[k][k_].Unit.Base = v_.Unit.Base - jd[k][k_].Unit.Prefix = v_.Unit.Prefix - if v_.StatisticsSeries != nil { - // Init Slices - jd[k][k_].StatisticsSeries = new(schema.StatsSeries) - jd[k][k_].StatisticsSeries.Max = make([]schema.Float, len(v_.StatisticsSeries.Max)) - jd[k][k_].StatisticsSeries.Min = make([]schema.Float, len(v_.StatisticsSeries.Min)) - jd[k][k_].StatisticsSeries.Median = make([]schema.Float, len(v_.StatisticsSeries.Median)) - jd[k][k_].StatisticsSeries.Mean = make([]schema.Float, len(v_.StatisticsSeries.Mean)) - // Copy Data - copy(jd[k][k_].StatisticsSeries.Max, v_.StatisticsSeries.Max) - copy(jd[k][k_].StatisticsSeries.Min, v_.StatisticsSeries.Min) - copy(jd[k][k_].StatisticsSeries.Median, v_.StatisticsSeries.Median) - copy(jd[k][k_].StatisticsSeries.Mean, v_.StatisticsSeries.Mean) - // Handle Percentiles - for k__, v__ := range v_.StatisticsSeries.Percentiles { - jd[k][k_].StatisticsSeries.Percentiles[k__] = make([]schema.Float, len(v__)) - copy(jd[k][k_].StatisticsSeries.Percentiles[k__], v__) - } - } else { - jd[k][k_].StatisticsSeries = v_.StatisticsSeries - } - } - } - return jd -} diff --git a/internal/metricdispatch/configSchema.go b/internal/metricdispatch/configSchema.go new file mode 100644 index 00000000..6dec69bf --- /dev/null +++ 
b/internal/metricdispatch/configSchema.go @@ -0,0 +1,29 @@ +// Copyright (C) NHR@FAU, University Erlangen-Nuremberg. +// All rights reserved. This file is part of cc-backend. +// Use of this source code is governed by a MIT-style +// license that can be found in the LICENSE file. + +package metricdispatch + +const configSchema = `{ + "type": "array", + "description": "Array of metric store configurations with scope-based routing.", + "items": { + "type": "object", + "properties": { + "scope": { + "description": "Scope identifier for routing metrics (e.g., cluster name, '*' for default)", + "type": "string" + }, + "url": { + "description": "URL of the metric store endpoint", + "type": "string" + }, + "token": { + "description": "Authentication token for the metric store", + "type": "string" + } + }, + "required": ["scope", "url", "token"] + } +}` diff --git a/internal/metricdispatch/dataLoader.go b/internal/metricdispatch/dataLoader.go new file mode 100644 index 00000000..c420fee4 --- /dev/null +++ b/internal/metricdispatch/dataLoader.go @@ -0,0 +1,533 @@ +// Copyright (C) NHR@FAU, University Erlangen-Nuremberg. +// All rights reserved. This file is part of cc-backend. +// Use of this source code is governed by a MIT-style +// license that can be found in the LICENSE file. + +// Package metricdispatch provides a unified interface for loading and caching job metric data. +// +// This package serves as a central dispatcher that routes metric data requests to the appropriate +// backend based on job state. For running jobs, data is fetched from the metric store (e.g., cc-metric-store). +// For completed jobs, data is retrieved from the file-based job archive. +// +// # Key Features +// +// - Automatic backend selection based on job state (running vs. 
archived) +// - LRU cache for performance optimization (128 MB default cache size) +// - Data resampling using Largest Triangle Three Bucket algorithm for archived data +// - Automatic statistics series generation for jobs with many nodes +// - Support for scoped metrics (node, socket, accelerator, core) +// +// # Cache Behavior +// +// Cached data has different TTL (time-to-live) values depending on job state: +// - Running jobs: 2 minutes (data changes frequently) +// - Completed jobs: 5 hours (data is static) +// +// The cache key is based on job ID, state, requested metrics, scopes, and resolution. +// +// # Usage +// +// The primary entry point is LoadData, which automatically handles both running and archived jobs: +// +// jobData, err := metricdispatch.LoadData(job, metrics, scopes, ctx, resolution) +// if err != nil { +// // Handle error +// } +// +// For statistics only, use LoadJobStats, LoadScopedJobStats, or LoadAverages depending on the required format. +package metricdispatch + +import ( + "context" + "fmt" + "math" + "time" + + "github.com/ClusterCockpit/cc-backend/pkg/archive" + cclog "github.com/ClusterCockpit/cc-lib/v2/ccLogger" + "github.com/ClusterCockpit/cc-lib/v2/lrucache" + "github.com/ClusterCockpit/cc-lib/v2/resampler" + "github.com/ClusterCockpit/cc-lib/v2/schema" +) + +// cache is an LRU cache with 128 MB capacity for storing loaded job metric data. +// The cache reduces load on both the metric store and archive backends. +var cache *lrucache.Cache = lrucache.New(128 * 1024 * 1024) + +// cacheKey generates a unique cache key for a job's metric data based on job ID, state, +// requested metrics, scopes, and resolution. Duration and StartTime are intentionally excluded +// because job.ID is more unique and the cache TTL ensures entries don't persist indefinitely. 
+func cacheKey( + job *schema.Job, + metrics []string, + scopes []schema.MetricScope, + resolution int, +) string { + return fmt.Sprintf("%d(%s):[%v],[%v]-%d", + *job.ID, job.State, metrics, scopes, resolution) +} + +// LoadData retrieves metric data for a job from the appropriate backend (memory store for running jobs, +// archive for completed jobs) and applies caching, resampling, and statistics generation as needed. +// +// For running jobs or when archive is disabled, data is fetched from the metric store. +// For completed archived jobs, data is loaded from the job archive and resampled if needed. +// +// Parameters: +// - job: The job for which to load metric data +// - metrics: List of metric names to load (nil loads all metrics for the cluster) +// - scopes: Metric scopes to include (nil defaults to node scope) +// - ctx: Context for cancellation and timeouts +// - resolution: Target number of data points for resampling (only applies to archived data) +// +// Returns the loaded job data and any error encountered. For partial errors (some metrics failed), +// the function returns the successfully loaded data with a warning logged. 
+func LoadData(job *schema.Job, + metrics []string, + scopes []schema.MetricScope, + ctx context.Context, + resolution int, +) (schema.JobData, error) { + data := cache.Get(cacheKey(job, metrics, scopes, resolution), func() (_ any, ttl time.Duration, size int) { + var jd schema.JobData + var err error + + if job.State == schema.JobStateRunning || + job.MonitoringStatus == schema.MonitoringStatusRunningOrArchiving { + + ms, err := GetMetricDataRepo(job.Cluster, job.SubCluster) + if err != nil { + cclog.Errorf("failed to access metricDataRepo for cluster %s-%s: %s", + job.Cluster, job.SubCluster, err.Error()) + return err, 0, 0 + } + + if scopes == nil { + scopes = append(scopes, schema.MetricScopeNode) + } + + if metrics == nil { + cluster := archive.GetCluster(job.Cluster) + for _, mc := range cluster.MetricConfig { + metrics = append(metrics, mc.Name) + } + } + + jd, err = ms.LoadData(job, metrics, scopes, ctx, resolution) + if err != nil { + if len(jd) != 0 { + cclog.Warnf("partial error loading metrics from store for job %d (user: %s, project: %s, cluster: %s-%s): %s", + job.JobID, job.User, job.Project, job.Cluster, job.SubCluster, err.Error()) + } else { + cclog.Warnf("failed to load job data from metric store for job %d (user: %s, project: %s, cluster: %s-%s): %s", + job.JobID, job.User, job.Project, job.Cluster, job.SubCluster, err.Error()) + return err, 0, 0 + } + } + size = jd.Size() + } else { + var jdTemp schema.JobData + jdTemp, err = archive.GetHandle().LoadJobData(job) + if err != nil { + cclog.Warnf("failed to load job data from archive for job %d (user: %s, project: %s, cluster: %s-%s): %s", + job.JobID, job.User, job.Project, job.Cluster, job.SubCluster, err.Error()) + return err, 0, 0 + } + + jd = deepCopy(jdTemp) + + // Resample archived data using Largest Triangle Three Bucket algorithm to reduce data points + // to the requested resolution, improving transfer performance and client-side rendering. 
+ for _, v := range jd { + for _, v_ := range v { + timestep := int64(0) + for i := 0; i < len(v_.Series); i += 1 { + v_.Series[i].Data, timestep, err = resampler.LargestTriangleThreeBucket(v_.Series[i].Data, int64(v_.Timestep), int64(resolution)) + if err != nil { + return err, 0, 0 + } + } + v_.Timestep = int(timestep) + } + } + + // Filter job data to only include requested metrics and scopes, avoiding unnecessary data transfer. + if metrics != nil || scopes != nil { + if metrics == nil { + metrics = make([]string, 0, len(jd)) + for k := range jd { + metrics = append(metrics, k) + } + } + + res := schema.JobData{} + for _, metric := range metrics { + if perscope, ok := jd[metric]; ok { + if len(perscope) > 1 { + subset := make(map[schema.MetricScope]*schema.JobMetric) + for _, scope := range scopes { + if jm, ok := perscope[scope]; ok { + subset[scope] = jm + } + } + + if len(subset) > 0 { + perscope = subset + } + } + + res[metric] = perscope + } + } + jd = res + } + size = jd.Size() + } + + ttl = 5 * time.Hour + if job.State == schema.JobStateRunning { + ttl = 2 * time.Minute + } + + // Generate statistics series for jobs with many nodes to enable min/median/max graphs + // instead of overwhelming the UI with individual node lines. Note that newly calculated + // statistics use min/median/max, while archived statistics may use min/mean/max. 
+ const maxSeriesSize int = 15 + for _, scopes := range jd { + for _, jm := range scopes { + if jm.StatisticsSeries != nil || len(jm.Series) <= maxSeriesSize { + continue + } + + jm.AddStatisticsSeries() + } + } + + nodeScopeRequested := false + for _, scope := range scopes { + if scope == schema.MetricScopeNode { + nodeScopeRequested = true + } + } + + if nodeScopeRequested { + jd.AddNodeScope("flops_any") + jd.AddNodeScope("mem_bw") + } + + // Round Resulting Stat Values + jd.RoundMetricStats() + + return jd, ttl, size + }) + + if err, ok := data.(error); ok { + cclog.Errorf("error in cached dataset for job %d: %s", job.JobID, err.Error()) + return nil, err + } + + return data.(schema.JobData), nil +} + +// LoadAverages computes average values for the specified metrics across all nodes of a job. +// For running jobs, it loads statistics from the metric store. For completed jobs, it uses +// the pre-calculated averages from the job archive. The results are appended to the data slice. +func LoadAverages( + job *schema.Job, + metrics []string, + data [][]schema.Float, + ctx context.Context, +) error { + if job.State != schema.JobStateRunning { + return archive.LoadAveragesFromArchive(job, metrics, data) // #166 change also here? 
+ } + + ms, err := GetMetricDataRepo(job.Cluster, job.SubCluster) + if err != nil { + cclog.Errorf("failed to access metricDataRepo for cluster %s-%s: %s", + job.Cluster, job.SubCluster, err.Error()) + return err + } + + stats, err := ms.LoadStats(job, metrics, ctx) + if err != nil { + cclog.Warnf("failed to load statistics from metric store for job %d (user: %s, project: %s, cluster: %s-%s): %s", + job.JobID, job.User, job.Project, job.Cluster, job.SubCluster, err.Error()) + return err + } + + for i, m := range metrics { + nodes, ok := stats[m] + if !ok { + data[i] = append(data[i], schema.NaN) + continue + } + + sum := 0.0 + for _, node := range nodes { + sum += node.Avg + } + data[i] = append(data[i], schema.Float(sum)) + } + + return nil +} + +// LoadScopedJobStats retrieves job statistics organized by metric scope (node, socket, core, accelerator). +// For running jobs, statistics are computed from the metric store. For completed jobs, pre-calculated +// statistics are loaded from the job archive. 
+func LoadScopedJobStats( + job *schema.Job, + metrics []string, + scopes []schema.MetricScope, + ctx context.Context, +) (schema.ScopedJobStats, error) { + if job.State != schema.JobStateRunning { + return archive.LoadScopedStatsFromArchive(job, metrics, scopes) + } + + ms, err := GetMetricDataRepo(job.Cluster, job.SubCluster) + if err != nil { + cclog.Errorf("failed to access metricDataRepo for cluster %s-%s: %s", + job.Cluster, job.SubCluster, err.Error()) + return nil, err + } + + scopedStats, err := ms.LoadScopedStats(job, metrics, scopes, ctx) + if err != nil { + cclog.Warnf("failed to load scoped statistics from metric store for job %d (user: %s, project: %s, cluster: %s-%s): %s", + job.JobID, job.User, job.Project, job.Cluster, job.SubCluster, err.Error()) + return nil, err + } + + // Round Resulting Stat Values + scopedStats.RoundScopedMetricStats() + + return scopedStats, nil +} + +// LoadJobStats retrieves aggregated statistics (min/avg/max) for each requested metric across all job nodes. +// For running jobs, statistics are computed from the metric store. For completed jobs, pre-calculated +// statistics are loaded from the job archive. 
+func LoadJobStats( + job *schema.Job, + metrics []string, + ctx context.Context, +) (map[string]schema.MetricStatistics, error) { + if job.State != schema.JobStateRunning { + return archive.LoadStatsFromArchive(job, metrics) + } + + ms, err := GetMetricDataRepo(job.Cluster, job.SubCluster) + if err != nil { + cclog.Errorf("failed to access metricDataRepo for cluster %s-%s: %s", + job.Cluster, job.SubCluster, err.Error()) + return nil, err + } + + data := make(map[string]schema.MetricStatistics, len(metrics)) + + stats, err := ms.LoadStats(job, metrics, ctx) + if err != nil { + cclog.Warnf("failed to load statistics from metric store for job %d (user: %s, project: %s, cluster: %s-%s): %s", + job.JobID, job.User, job.Project, job.Cluster, job.SubCluster, err.Error()) + return data, err + } + + for _, m := range metrics { + sum, avg, min, max := 0.0, 0.0, 0.0, 0.0 + nodes, ok := stats[m] + if !ok { + data[m] = schema.MetricStatistics{Min: min, Avg: avg, Max: max} + continue + } + + for _, node := range nodes { + sum += node.Avg + min = math.Min(min, node.Min) + max = math.Max(max, node.Max) + } + + data[m] = schema.MetricStatistics{ + Avg: (math.Round((sum/float64(job.NumNodes))*100) / 100), + Min: (math.Round(min*100) / 100), + Max: (math.Round(max*100) / 100), + } + } + + return data, nil +} + +// LoadNodeData retrieves metric data for specific nodes in a cluster within a time range. +// This is used for node monitoring views and system status pages. Data is always fetched from +// the metric store (not the archive) since it's for current/recent node status monitoring. +// +// Returns a nested map structure: node -> metric -> scoped data. 
+// FIXME: Add support for subcluster specific cc-metric-stores +func LoadNodeData( + cluster string, + metrics, nodes []string, + scopes []schema.MetricScope, + from, to time.Time, + ctx context.Context, +) (map[string]map[string][]*schema.JobMetric, error) { + if metrics == nil { + for _, m := range archive.GetCluster(cluster).MetricConfig { + metrics = append(metrics, m.Name) + } + } + + ms, err := GetMetricDataRepo(cluster, "") + if err != nil { + cclog.Errorf("failed to access metricDataRepo for cluster %s: %s", + cluster, err.Error()) + return nil, err + } + + data, err := ms.LoadNodeData(cluster, metrics, nodes, scopes, from, to, ctx) + if err != nil { + if len(data) != 0 { + cclog.Warnf("partial error loading node data from metric store for cluster %s: %s", cluster, err.Error()) + } else { + cclog.Warnf("failed to load node data from metric store for cluster %s: %s", cluster, err.Error()) + return nil, err + } + } + + if data == nil { + return nil, fmt.Errorf("metric store for cluster '%s' does not support node data queries", cluster) + } + + return data, nil +} + +// LoadNodeListData retrieves time-series metric data for multiple nodes within a time range, +// with optional resampling and automatic statistics generation for large datasets. +// This is used for comparing multiple nodes or displaying node status over time. +// +// Returns a map of node names to their job-like metric data structures. 
+func LoadNodeListData( + cluster, subCluster string, + nodes []string, + metrics []string, + scopes []schema.MetricScope, + resolution int, + from, to time.Time, + ctx context.Context, +) (map[string]schema.JobData, error) { + if metrics == nil { + for _, m := range archive.GetCluster(cluster).MetricConfig { + metrics = append(metrics, m.Name) + } + } + + ms, err := GetMetricDataRepo(cluster, subCluster) + if err != nil { + cclog.Errorf("failed to access metricDataRepo for cluster %s-%s: %s", + cluster, subCluster, err.Error()) + return nil, err + } + + data, err := ms.LoadNodeListData(cluster, subCluster, nodes, metrics, scopes, resolution, from, to, ctx) + if err != nil { + if len(data) != 0 { + cclog.Warnf("partial error loading node list data from metric store for cluster %s, subcluster %s: %s", + cluster, subCluster, err.Error()) + } else { + cclog.Warnf("failed to load node list data from metric store for cluster %s, subcluster %s: %s", + cluster, subCluster, err.Error()) + return nil, err + } + } + + // Generate statistics series for datasets with many series to improve visualization performance. + // Statistics are calculated as min/median/max. + const maxSeriesSize int = 8 + for _, jd := range data { + for _, scopes := range jd { + for _, jm := range scopes { + if jm.StatisticsSeries != nil || len(jm.Series) < maxSeriesSize { + continue + } + jm.AddStatisticsSeries() + } + } + } + + if data == nil { + return nil, fmt.Errorf("metric store for cluster '%s' does not support node list queries", cluster) + } + + return data, nil +} + +// deepCopy creates a deep copy of JobData to prevent cache corruption when modifying +// archived data (e.g., during resampling). This ensures the cached archive data remains +// immutable while allowing per-request transformations. 
+func deepCopy(source schema.JobData) schema.JobData { + result := make(schema.JobData, len(source)) + + for metricName, scopeMap := range source { + result[metricName] = make(map[schema.MetricScope]*schema.JobMetric, len(scopeMap)) + + for scope, jobMetric := range scopeMap { + result[metricName][scope] = copyJobMetric(jobMetric) + } + } + + return result +} + +func copyJobMetric(src *schema.JobMetric) *schema.JobMetric { + dst := &schema.JobMetric{ + Timestep: src.Timestep, + Unit: src.Unit, + Series: make([]schema.Series, len(src.Series)), + } + + for i := range src.Series { + dst.Series[i] = copySeries(&src.Series[i]) + } + + if src.StatisticsSeries != nil { + dst.StatisticsSeries = copyStatisticsSeries(src.StatisticsSeries) + } + + return dst +} + +func copySeries(src *schema.Series) schema.Series { + dst := schema.Series{ + Hostname: src.Hostname, + ID: src.ID, + Statistics: src.Statistics, + Data: make([]schema.Float, len(src.Data)), + } + + copy(dst.Data, src.Data) + return dst +} + +func copyStatisticsSeries(src *schema.StatsSeries) *schema.StatsSeries { + dst := &schema.StatsSeries{ + Min: make([]schema.Float, len(src.Min)), + Mean: make([]schema.Float, len(src.Mean)), + Median: make([]schema.Float, len(src.Median)), + Max: make([]schema.Float, len(src.Max)), + } + + copy(dst.Min, src.Min) + copy(dst.Mean, src.Mean) + copy(dst.Median, src.Median) + copy(dst.Max, src.Max) + + if len(src.Percentiles) > 0 { + dst.Percentiles = make(map[int][]schema.Float, len(src.Percentiles)) + for percentile, values := range src.Percentiles { + dst.Percentiles[percentile] = make([]schema.Float, len(values)) + copy(dst.Percentiles[percentile], values) + } + } + + return dst +} diff --git a/internal/metricdispatch/dataLoader_test.go b/internal/metricdispatch/dataLoader_test.go new file mode 100644 index 00000000..65a366f9 --- /dev/null +++ b/internal/metricdispatch/dataLoader_test.go @@ -0,0 +1,125 @@ +// Copyright (C) NHR@FAU, University Erlangen-Nuremberg. 
+// All rights reserved. This file is part of cc-backend. +// Use of this source code is governed by a MIT-style +// license that can be found in the LICENSE file. + +package metricdispatch + +import ( + "testing" + + "github.com/ClusterCockpit/cc-lib/v2/schema" +) + +func TestDeepCopy(t *testing.T) { + nodeId := "0" + original := schema.JobData{ + "cpu_load": { + schema.MetricScopeNode: &schema.JobMetric{ + Timestep: 60, + Unit: schema.Unit{Base: "load", Prefix: ""}, + Series: []schema.Series{ + { + Hostname: "node001", + ID: &nodeId, + Data: []schema.Float{1.0, 2.0, 3.0}, + Statistics: schema.MetricStatistics{ + Min: 1.0, + Avg: 2.0, + Max: 3.0, + }, + }, + }, + StatisticsSeries: &schema.StatsSeries{ + Min: []schema.Float{1.0, 1.5, 2.0}, + Mean: []schema.Float{2.0, 2.5, 3.0}, + Median: []schema.Float{2.0, 2.5, 3.0}, + Max: []schema.Float{3.0, 3.5, 4.0}, + Percentiles: map[int][]schema.Float{ + 25: {1.5, 2.0, 2.5}, + 75: {2.5, 3.0, 3.5}, + }, + }, + }, + }, + } + + copied := deepCopy(original) + + original["cpu_load"][schema.MetricScopeNode].Series[0].Data[0] = 999.0 + original["cpu_load"][schema.MetricScopeNode].StatisticsSeries.Min[0] = 888.0 + original["cpu_load"][schema.MetricScopeNode].StatisticsSeries.Percentiles[25][0] = 777.0 + + if copied["cpu_load"][schema.MetricScopeNode].Series[0].Data[0] != 1.0 { + t.Errorf("Series data was not deeply copied: got %v, want 1.0", + copied["cpu_load"][schema.MetricScopeNode].Series[0].Data[0]) + } + + if copied["cpu_load"][schema.MetricScopeNode].StatisticsSeries.Min[0] != 1.0 { + t.Errorf("StatisticsSeries was not deeply copied: got %v, want 1.0", + copied["cpu_load"][schema.MetricScopeNode].StatisticsSeries.Min[0]) + } + + if copied["cpu_load"][schema.MetricScopeNode].StatisticsSeries.Percentiles[25][0] != 1.5 { + t.Errorf("Percentiles was not deeply copied: got %v, want 1.5", + copied["cpu_load"][schema.MetricScopeNode].StatisticsSeries.Percentiles[25][0]) + } + + if copied["cpu_load"][schema.MetricScopeNode].Timestep 
!= 60 { + t.Errorf("Timestep not copied correctly: got %v, want 60", + copied["cpu_load"][schema.MetricScopeNode].Timestep) + } + + if copied["cpu_load"][schema.MetricScopeNode].Series[0].Hostname != "node001" { + t.Errorf("Hostname not copied correctly: got %v, want node001", + copied["cpu_load"][schema.MetricScopeNode].Series[0].Hostname) + } +} + +func TestDeepCopyNilStatisticsSeries(t *testing.T) { + original := schema.JobData{ + "mem_used": { + schema.MetricScopeNode: &schema.JobMetric{ + Timestep: 60, + Series: []schema.Series{ + { + Hostname: "node001", + Data: []schema.Float{1.0, 2.0}, + }, + }, + StatisticsSeries: nil, + }, + }, + } + + copied := deepCopy(original) + + if copied["mem_used"][schema.MetricScopeNode].StatisticsSeries != nil { + t.Errorf("StatisticsSeries should be nil, got %v", + copied["mem_used"][schema.MetricScopeNode].StatisticsSeries) + } +} + +func TestDeepCopyEmptyPercentiles(t *testing.T) { + original := schema.JobData{ + "cpu_load": { + schema.MetricScopeNode: &schema.JobMetric{ + Timestep: 60, + Series: []schema.Series{}, + StatisticsSeries: &schema.StatsSeries{ + Min: []schema.Float{1.0}, + Mean: []schema.Float{2.0}, + Median: []schema.Float{2.0}, + Max: []schema.Float{3.0}, + Percentiles: nil, + }, + }, + }, + } + + copied := deepCopy(original) + + if copied["cpu_load"][schema.MetricScopeNode].StatisticsSeries.Percentiles != nil { + t.Errorf("Percentiles should be nil when source is nil/empty") + } +} diff --git a/internal/metricdispatch/metricdata.go b/internal/metricdispatch/metricdata.go new file mode 100755 index 00000000..3f03234e --- /dev/null +++ b/internal/metricdispatch/metricdata.go @@ -0,0 +1,123 @@ +// Copyright (C) NHR@FAU, University Erlangen-Nuremberg. +// All rights reserved. +// Use of this source code is governed by a MIT-style +// license that can be found in the LICENSE file. 
+package metricdispatch + +import ( + "bytes" + "context" + "encoding/json" + "fmt" + "time" + + "github.com/ClusterCockpit/cc-backend/internal/config" + ccms "github.com/ClusterCockpit/cc-backend/internal/metricstoreclient" + "github.com/ClusterCockpit/cc-backend/pkg/metricstore" + cclog "github.com/ClusterCockpit/cc-lib/v2/ccLogger" + "github.com/ClusterCockpit/cc-lib/v2/schema" +) + +type MetricDataRepository interface { + // Return the JobData for the given job, only with the requested metrics. + LoadData(job *schema.Job, + metrics []string, + scopes []schema.MetricScope, + ctx context.Context, + resolution int) (schema.JobData, error) + + // Return a map of metrics to a map of nodes to the metric statistics of the job. node scope only. + LoadStats(job *schema.Job, + metrics []string, + ctx context.Context) (map[string]map[string]schema.MetricStatistics, error) + + // Return a map of metrics to a map of scopes to the scoped metric statistics of the job. + LoadScopedStats(job *schema.Job, + metrics []string, + scopes []schema.MetricScope, + ctx context.Context) (schema.ScopedJobStats, error) + + // Return a map of hosts to a map of metrics at the requested scopes (currently only node) for that node. + LoadNodeData(cluster string, + metrics, nodes []string, + scopes []schema.MetricScope, + from, to time.Time, + ctx context.Context) (map[string]map[string][]*schema.JobMetric, error) + + // Return a map of hosts to a map of metrics to a map of scopes for multiple nodes. + LoadNodeListData(cluster, subCluster string, + nodes []string, + metrics []string, + scopes []schema.MetricScope, + resolution int, + from, to time.Time, + ctx context.Context) (map[string]schema.JobData, error) + + // HealthCheck evaluates the monitoring state for a set of nodes against expected metrics. 
+ HealthCheck(cluster string, + nodes []string, + metrics []string) (map[string]metricstore.HealthCheckResult, error) +} + +type CCMetricStoreConfig struct { + Scope string `json:"scope"` + URL string `json:"url"` + Token string `json:"token"` +} + +var metricDataRepos map[string]MetricDataRepository = map[string]MetricDataRepository{} + +func Init(rawConfig json.RawMessage) error { + if rawConfig != nil { + var configs []CCMetricStoreConfig + config.Validate(configSchema, rawConfig) + dec := json.NewDecoder(bytes.NewReader(rawConfig)) + dec.DisallowUnknownFields() + if err := dec.Decode(&configs); err != nil { + return fmt.Errorf("[METRICDISPATCH]> External Metric Store Config Init: Could not decode config file '%s' Error: %s", rawConfig, err.Error()) + } + + if len(configs) == 0 { + return fmt.Errorf("[METRICDISPATCH]> No external metric store configurations found in config file") + } + + for _, config := range configs { + metricDataRepos[config.Scope] = ccms.NewCCMetricStore(config.URL, config.Token) + } + } + + return nil +} + +func GetMetricDataRepo(cluster string, subcluster string) (MetricDataRepository, error) { + var repo MetricDataRepository + var ok bool + + key := cluster + "-" + subcluster + repo, ok = metricDataRepos[key] + + if !ok { + repo, ok = metricDataRepos[cluster] + + if !ok { + repo, ok = metricDataRepos["*"] + + if !ok { + if metricstore.MetricStoreHandle == nil { + return nil, fmt.Errorf("[METRICDISPATCH]> no metric data repository configured '%s'", key) + } + + repo = metricstore.MetricStoreHandle + cclog.Debugf("[METRICDISPATCH]> Using internal metric data repository for '%s'", key) + } + } + } + + return repo, nil +} + +// GetHealthCheckRepo returns the MetricDataRepository for performing health checks on a cluster. +// It uses the same fallback logic as GetMetricDataRepo: cluster → wildcard → internal. 
+func GetHealthCheckRepo(cluster string) (MetricDataRepository, error) { + return GetMetricDataRepo(cluster, "") +} diff --git a/internal/metricstoreclient/cc-metric-store-queries.go b/internal/metricstoreclient/cc-metric-store-queries.go new file mode 100644 index 00000000..1119d70c --- /dev/null +++ b/internal/metricstoreclient/cc-metric-store-queries.go @@ -0,0 +1,239 @@ +// Copyright (C) NHR@FAU, University Erlangen-Nuremberg. +// All rights reserved. +// Use of this source code is governed by a MIT-style +// license that can be found in the LICENSE file. + +// Package metricstoreclient - Query Building +// +// This file contains the query construction and scope transformation logic for cc-metric-store queries. +// It handles the complex mapping between requested metric scopes and native hardware topology, +// automatically aggregating or filtering metrics as needed. +// +// # Scope Transformations +// +// The buildScopeQueries function implements the core scope transformation algorithm. +// It handles 25+ different transformation cases, mapping between: +// - Accelerator (GPU) scope +// - HWThread (hardware thread/SMT) scope +// - Core (CPU core) scope +// - Socket (CPU package) scope +// - MemoryDomain (NUMA domain) scope +// - Node (full system) scope +// +// Transformations follow these rules: +// - Same scope: Return data as-is (e.g., Core → Core) +// - Coarser scope: Aggregate data (e.g., Core → Socket with Aggregate=true) +// - Finer scope: Error - cannot increase granularity +// +// # Query Building +// +// buildQueries and buildNodeQueries are the main entry points, handling job-specific +// and node-specific query construction respectively. 
They: +// - Validate metric configurations +// - Handle subcluster-specific metric filtering +// - Detect and skip duplicate scope requests +// - Call buildScopeQueries for each metric/scope/host combination +package metricstoreclient + +import ( + "fmt" + + "github.com/ClusterCockpit/cc-backend/pkg/archive" + "github.com/ClusterCockpit/cc-backend/pkg/metricstore" + cclog "github.com/ClusterCockpit/cc-lib/v2/ccLogger" + "github.com/ClusterCockpit/cc-lib/v2/schema" +) + +// buildQueries constructs API queries for job-specific metric data. +// It iterates through metrics, scopes, and job resources to build the complete query set. +// +// The function handles: +// - Metric configuration validation and subcluster filtering +// - Scope deduplication to avoid redundant queries +// - Hardware thread list resolution (job-allocated vs full node) +// - Delegation to buildScopeQueries for scope transformations +// +// Returns queries and their corresponding assigned scopes (which may differ from requested scopes). 
+func (ccms *CCMetricStore) buildQueries( + job *schema.Job, + metrics []string, + scopes []schema.MetricScope, + resolution int, +) ([]APIQuery, []schema.MetricScope, error) { + // Initialize both slices together + queries := make([]APIQuery, 0, len(metrics)*len(scopes)*len(job.Resources)) + assignedScope := make([]schema.MetricScope, 0, len(metrics)*len(scopes)*len(job.Resources)) + + topology, err := ccms.getTopology(job.Cluster, job.SubCluster) + if err != nil { + cclog.Errorf("could not load cluster %s subCluster %s topology: %s", job.Cluster, job.SubCluster, err.Error()) + return nil, nil, err + } + + for _, metric := range metrics { + remoteName := metric + mc := archive.GetMetricConfig(job.Cluster, metric) + if mc == nil { + cclog.Warnf("metric '%s' is not specified for cluster '%s' - skipping", metric, job.Cluster) + continue + } + + // Skip if metric is removed for subcluster + if len(mc.SubClusters) != 0 && metricstore.IsMetricRemovedForSubCluster(mc, job.SubCluster) { + continue + } + + // Avoid duplicates... 
+ handledScopes := make([]schema.MetricScope, 0, 3) + + scopesLoop: + for _, requestedScope := range scopes { + nativeScope := mc.Scope + if nativeScope == schema.MetricScopeAccelerator && job.NumAcc == 0 { + continue + } + + scope := nativeScope.Max(requestedScope) + for _, s := range handledScopes { + if scope == s { + continue scopesLoop + } + } + handledScopes = append(handledScopes, scope) + + for _, host := range job.Resources { + hwthreads := host.HWThreads + if hwthreads == nil { + hwthreads = topology.Node + } + + scopeResults, ok := metricstore.BuildScopeQueries( + nativeScope, requestedScope, + remoteName, host.Hostname, + topology, hwthreads, host.Accelerators, + ) + + if !ok { + return nil, nil, fmt.Errorf("METRICDATA/EXTERNAL-CCMS > unsupported scope transformation: native-scope=%s, requested-scope=%s", nativeScope, requestedScope) + } + + for _, sr := range scopeResults { + queries = append(queries, APIQuery{ + Metric: sr.Metric, + Hostname: sr.Hostname, + Aggregate: sr.Aggregate, + Type: sr.Type, + TypeIds: sr.TypeIds, + Resolution: resolution, + }) + assignedScope = append(assignedScope, sr.Scope) + } + } + } + } + + return queries, assignedScope, nil +} + +// buildNodeQueries constructs API queries for node-specific metric data (Systems View). +// Similar to buildQueries but uses full node topology instead of job-allocated resources. +// +// The function handles: +// - SubCluster topology resolution (either pre-loaded or per-node lookup) +// - Full node hardware thread lists (not job-specific subsets) +// - All accelerators on each node +// - Metric configuration validation with subcluster filtering +// +// Returns queries and their corresponding assigned scopes. 
+func (ccms *CCMetricStore) buildNodeQueries( + cluster string, + subCluster string, + nodes []string, + metrics []string, + scopes []schema.MetricScope, + resolution int, +) ([]APIQuery, []schema.MetricScope, error) { + // Initialize both slices together + queries := make([]APIQuery, 0, len(metrics)*len(scopes)*len(nodes)) + assignedScope := make([]schema.MetricScope, 0, len(metrics)*len(scopes)*len(nodes)) + + for _, metric := range metrics { + remoteName := metric + mc := archive.GetMetricConfig(cluster, metric) + if mc == nil { + cclog.Warnf("metric '%s' is not specified for cluster '%s'", metric, cluster) + continue + } + + // Skip if metric is removed for subcluster + if mc.SubClusters != nil && metricstore.IsMetricRemovedForSubCluster(mc, subCluster) { + continue + } + + // Avoid duplicates... + handledScopes := make([]schema.MetricScope, 0, 3) + + scopesLoop: + for _, requestedScope := range scopes { + nativeScope := mc.Scope + + scope := nativeScope.Max(requestedScope) + for _, s := range handledScopes { + if scope == s { + continue scopesLoop + } + } + handledScopes = append(handledScopes, scope) + + for _, hostname := range nodes { + var topology *schema.Topology + var err error + + // If no subCluster given, get it by node + if subCluster == "" { + topology, err = ccms.getTopologyByNode(cluster, hostname) + } else { + topology, err = ccms.getTopology(cluster, subCluster) + } + + if err != nil { + return nil, nil, err + } + + // Always full node hwthread id list, no partial queries expected -> Use "topology.Node" directly where applicable + // Always full accelerator id list, no partial queries expected -> Use "acceleratorIds" directly where applicable + acceleratorIds := topology.GetAcceleratorIDs() + + // Moved check here if metric matches hardware specs + if nativeScope == schema.MetricScopeAccelerator && len(acceleratorIds) == 0 { + continue scopesLoop + } + + scopeResults, ok := metricstore.BuildScopeQueries( + nativeScope, requestedScope, + 
remoteName, hostname, + topology, topology.Node, acceleratorIds, + ) + + if !ok { + return nil, nil, fmt.Errorf("METRICDATA/EXTERNAL-CCMS > unsupported scope transformation: native-scope=%s, requested-scope=%s", nativeScope, requestedScope) + } + + for _, sr := range scopeResults { + queries = append(queries, APIQuery{ + Metric: sr.Metric, + Hostname: sr.Hostname, + Aggregate: sr.Aggregate, + Type: sr.Type, + TypeIds: sr.TypeIds, + Resolution: resolution, + }) + assignedScope = append(assignedScope, sr.Scope) + } + } + } + } + + return queries, assignedScope, nil +} + diff --git a/internal/metricstoreclient/cc-metric-store.go b/internal/metricstoreclient/cc-metric-store.go new file mode 100644 index 00000000..55dc7fb5 --- /dev/null +++ b/internal/metricstoreclient/cc-metric-store.go @@ -0,0 +1,796 @@ +// Copyright (C) NHR@FAU, University Erlangen-Nuremberg. +// All rights reserved. +// Use of this source code is governed by a MIT-style +// license that can be found in the LICENSE file. + +// Package metricstoreclient provides a client for querying the cc-metric-store time series database. +// +// The cc-metric-store is a high-performance time series database optimized for HPC metric data. +// This client handles HTTP communication, query construction, scope transformations, and data retrieval +// for job and node metrics across different metric scopes (node, socket, core, hwthread, accelerator). 
+// +// # Architecture +// +// The package is split into two main components: +// - Client Operations (cc-metric-store.go): HTTP client, request handling, data loading methods +// - Query Building (cc-metric-store-queries.go): Query construction and scope transformation logic +// +// # Basic Usage +// +// store := NewCCMetricStore("http://localhost:8080", "jwt-token") +// +// // Load job data +// jobData, err := store.LoadData(job, metrics, scopes, ctx, resolution) +// if err != nil { +// log.Fatal(err) +// } +// +// # Metric Scopes +// +// The client supports hierarchical metric scopes that map to HPC hardware topology: +// - MetricScopeAccelerator: GPU/accelerator level metrics +// - MetricScopeHWThread: Hardware thread (SMT) level metrics +// - MetricScopeCore: CPU core level metrics +// - MetricScopeSocket: CPU socket level metrics +// - MetricScopeMemoryDomain: NUMA domain level metrics +// - MetricScopeNode: Full node level metrics +// +// The client automatically handles scope transformations, aggregating finer-grained metrics +// to coarser scopes when needed (e.g., aggregating core metrics to socket level). +// +// # Error Handling +// +// The client supports partial errors - if some queries fail, it returns both the successful +// data and an error listing the failed queries. This allows processing partial results +// when some nodes or metrics are temporarily unavailable. 
+// +// # API Versioning +// +// The client uses cc-metric-store API v2, which includes support for: +// - Data resampling for bandwidth optimization +// - Multi-scope queries in a single request +// - Aggregation across hardware topology levels +package metricstoreclient + +import ( + "bufio" + "bytes" + "context" + "encoding/json" + "fmt" + "net/http" + "strings" + "time" + + "github.com/ClusterCockpit/cc-backend/pkg/archive" + ms "github.com/ClusterCockpit/cc-backend/pkg/metricstore" + cclog "github.com/ClusterCockpit/cc-lib/v2/ccLogger" + "github.com/ClusterCockpit/cc-lib/v2/schema" +) + +// CCMetricStore is the HTTP client for communicating with cc-metric-store. +// It manages connection details, authentication, and provides methods for querying metrics. +type CCMetricStore struct { + client http.Client // HTTP client with 10-second timeout + jwt string // JWT Bearer token for authentication + url string // Base URL of cc-metric-store instance + queryEndpoint string // Full URL to query API endpoint + topologyCache map[string]*schema.Topology // cluster -> topology cache +} + +// APIQueryRequest represents a request to the cc-metric-store query API. +// It supports both explicit queries and "for-all-nodes" bulk queries. +type APIQueryRequest struct { + Cluster string `json:"cluster"` // Target cluster name + Queries []APIQuery `json:"queries"` // Explicit list of metric queries + ForAllNodes []string `json:"for-all-nodes"` // Metrics to query for all nodes + From int64 `json:"from"` // Start time (Unix timestamp) + To int64 `json:"to"` // End time (Unix timestamp) + WithStats bool `json:"with-stats"` // Include min/avg/max statistics + WithData bool `json:"with-data"` // Include time series data points +} + +// APIQuery specifies a single metric query with optional scope filtering. +// Type and TypeIds define the hardware scope (core, socket, accelerator, etc.). 
+type APIQuery struct { + Type *string `json:"type,omitempty"` // Scope type (e.g., "core", "socket") + SubType *string `json:"subtype,omitempty"` // Sub-scope type (reserved for future use) + Metric string `json:"metric"` // Metric name + Hostname string `json:"host"` // Target hostname + Resolution int `json:"resolution"` // Data resolution in seconds (0 = native) + TypeIds []string `json:"type-ids,omitempty"` // IDs for the scope type (e.g., core IDs) + SubTypeIds []string `json:"subtype-ids,omitempty"` // IDs for sub-scope (reserved) + Aggregate bool `json:"aggreg"` // Aggregate across TypeIds +} + +// APIQueryResponse contains the results from a cc-metric-store query. +// Results align with the Queries slice by index. +type APIQueryResponse struct { + Queries []APIQuery `json:"queries,omitempty"` // Echoed queries (for bulk requests) + Results [][]APIMetricData `json:"results"` // Result data, indexed by query +} + +// APIMetricData represents time series data and statistics for a single metric series. +// Error is set if this particular series failed to load. +type APIMetricData struct { + Error *string `json:"error"` // Error message if query failed + Data []schema.Float `json:"data"` // Time series data points + From int64 `json:"from"` // Actual start time of data + To int64 `json:"to"` // Actual end time of data + Resolution int `json:"resolution"` // Actual resolution of data in seconds + Avg schema.Float `json:"avg"` // Average value across time range + Min schema.Float `json:"min"` // Minimum value in time range + Max schema.Float `json:"max"` // Maximum value in time range +} + +// NewCCMetricStore creates and initializes a new (external) CCMetricStore client. +// The url parameter should include the protocol and port (e.g., "http://localhost:8080"). +// The token parameter is a JWT used for Bearer authentication; pass empty string if auth is disabled. 
+func NewCCMetricStore(url string, token string) *CCMetricStore { + return &CCMetricStore{ + url: url, + queryEndpoint: fmt.Sprintf("%s/api/query", url), + jwt: token, + client: http.Client{ + Timeout: 10 * time.Second, + }, + topologyCache: make(map[string]*schema.Topology), + } +} + +// doRequest executes an HTTP POST request to the cc-metric-store query API. +// It handles JSON encoding/decoding, authentication, and API versioning. +// The request body is automatically closed to prevent resource leaks. +func (ccms *CCMetricStore) doRequest( + ctx context.Context, + body *APIQueryRequest, +) (*APIQueryResponse, error) { + buf := &bytes.Buffer{} + if err := json.NewEncoder(buf).Encode(body); err != nil { + cclog.Errorf("Error while encoding request body: %s", err.Error()) + return nil, err + } + + req, err := http.NewRequestWithContext(ctx, http.MethodGet, ccms.queryEndpoint, buf) + if err != nil { + cclog.Errorf("Error while building request body: %s", err.Error()) + return nil, err + } + if ccms.jwt != "" { + req.Header.Add("Authorization", fmt.Sprintf("Bearer %s", ccms.jwt)) + } + + // versioning the cc-metric-store query API. 
+ // v2 = data with resampling + // v1 = data without resampling + q := req.URL.Query() + q.Add("version", "v2") + req.URL.RawQuery = q.Encode() + + res, err := ccms.client.Do(req) + if err != nil { + cclog.Errorf("Error while performing request: %s", err.Error()) + return nil, err + } + defer res.Body.Close() + + if res.StatusCode != http.StatusOK { + return nil, fmt.Errorf("'%s': HTTP Status: %s", ccms.queryEndpoint, res.Status) + } + + var resBody APIQueryResponse + if err := json.NewDecoder(bufio.NewReader(res.Body)).Decode(&resBody); err != nil { + cclog.Errorf("Error while decoding result body: %s", err.Error()) + return nil, err + } + + return &resBody, nil +} + +// getTopology returns the topology for a given cluster and subcluster, caching it if not already present +func (ccms *CCMetricStore) getTopology(cluster, subCluster string) (*schema.Topology, error) { + cacheKey := fmt.Sprintf("%s:%s", cluster, subCluster) + if topology, ok := ccms.topologyCache[cacheKey]; ok { + return topology, nil + } + + subcluster, err := archive.GetSubCluster(cluster, subCluster) + if err != nil { + return nil, err + } + + ccms.topologyCache[cacheKey] = &subcluster.Topology + return &subcluster.Topology, nil +} + +// getTopologyByNode returns the topology for a given cluster and node, caching it if not already present +func (ccms *CCMetricStore) getTopologyByNode(cluster, node string) (*schema.Topology, error) { + subCluster, err := archive.GetSubClusterByNode(cluster, node) + if err != nil { + return nil, err + } + + return ccms.getTopology(cluster, subCluster) +} + +// LoadData retrieves time series data and statistics for the specified job and metrics. +// It queries data for the job's time range and resources, handling scope transformations automatically. +// +// Parameters: +// - job: Job metadata including cluster, time range, and allocated resources +// - metrics: List of metric names to retrieve +// - scopes: Requested metric scopes (node, socket, core, etc.) 
+// - ctx: Context for cancellation and timeouts +// - resolution: Data resolution in seconds (0 for native resolution) +// +// Returns JobData organized as: metric -> scope -> series list. +// Supports partial errors: returns available data even if some queries fail. +func (ccms *CCMetricStore) LoadData( + job *schema.Job, + metrics []string, + scopes []schema.MetricScope, + ctx context.Context, + resolution int, +) (schema.JobData, error) { + queries, assignedScope, err := ccms.buildQueries(job, metrics, scopes, resolution) + if err != nil { + cclog.Errorf("Error while building queries for jobId %d, Metrics %v, Scopes %v: %s", job.JobID, metrics, scopes, err.Error()) + return nil, err + } + + // Verify assignment is correct - log any inconsistencies for debugging + if len(queries) != len(assignedScope) { + cclog.Errorf("Critical error: queries and assignedScope have different lengths after buildQueries: %d vs %d", + len(queries), len(assignedScope)) + } + + req := APIQueryRequest{ + Cluster: job.Cluster, + From: job.StartTime, + To: job.StartTime + int64(job.Duration), + Queries: queries, + WithStats: true, + WithData: true, + } + + resBody, err := ccms.doRequest(ctx, &req) + if err != nil { + cclog.Errorf("Error while performing request for job %d: %s", job.JobID, err.Error()) + return nil, err + } + + var errors []string + jobData := make(schema.JobData) + + // Add safety check for potential index out of range errors + if len(resBody.Results) != len(req.Queries) || len(assignedScope) != len(req.Queries) { + cclog.Warnf("Mismatch in query results count: queries=%d, results=%d, assignedScope=%d", + len(req.Queries), len(resBody.Results), len(assignedScope)) + if len(resBody.Results) > len(req.Queries) { + resBody.Results = resBody.Results[:len(req.Queries)] + } + if len(assignedScope) > len(req.Queries) { + assignedScope = assignedScope[:len(req.Queries)] + } + } + + for i, row := range resBody.Results { + query := req.Queries[i] + metric := query.Metric + scope 
:= assignedScope[i] + mc := archive.GetMetricConfig(job.Cluster, metric) + + if mc == nil { + cclog.Warnf("Metric config not found for %s on cluster %s", metric, job.Cluster) + continue + } + + if _, ok := jobData[metric]; !ok { + jobData[metric] = make(map[schema.MetricScope]*schema.JobMetric) + } + + res := mc.Timestep + if len(row) > 0 { + res = row[0].Resolution + } + + jobMetric, ok := jobData[metric][scope] + if !ok { + jobMetric = &schema.JobMetric{ + Unit: mc.Unit, + Timestep: res, + Series: make([]schema.Series, 0), + } + jobData[metric][scope] = jobMetric + } + + for ndx, res := range row { + if res.Error != nil { + /* Build list for "partial errors", if any */ + errors = append(errors, fmt.Sprintf("failed to fetch '%s' from host '%s': %s", query.Metric, query.Hostname, *res.Error)) + continue + } + + id := ms.ExtractTypeID(query.Type, query.TypeIds, ndx, query.Metric, query.Hostname) + + ms.SanitizeStats(&res.Avg, &res.Min, &res.Max) + + jobMetric.Series = append(jobMetric.Series, schema.Series{ + Hostname: query.Hostname, + ID: id, + Statistics: schema.MetricStatistics{ + Avg: float64(res.Avg), + Min: float64(res.Min), + Max: float64(res.Max), + }, + Data: res.Data, + }) + } + + // So that one can later check len(jobData): + if len(jobMetric.Series) == 0 { + delete(jobData[metric], scope) + if len(jobData[metric]) == 0 { + delete(jobData, metric) + } + } + } + + if len(errors) != 0 { + /* Returns list for "partial errors" */ + return jobData, fmt.Errorf("METRICDATA/EXTERNAL-CCMS > Errors: %s", strings.Join(errors, ", ")) + } + return jobData, nil +} + +// LoadStats retrieves min/avg/max statistics for job metrics at node scope. +// This is faster than LoadData when only statistical summaries are needed (no time series data). +// +// Returns statistics organized as: metric -> hostname -> statistics. 
+func (ccms *CCMetricStore) LoadStats(
+	job *schema.Job,
+	metrics []string,
+	ctx context.Context,
+) (map[string]map[string]schema.MetricStatistics, error) {
+	queries, _, err := ccms.buildQueries(job, metrics, []schema.MetricScope{schema.MetricScopeNode}, 0) // #166 Add scope here for analysis view accelerator normalization?
+	if err != nil {
+		cclog.Errorf("Error while building queries for jobId %d, Metrics %v: %s", job.JobID, metrics, err.Error())
+		return nil, err
+	}
+
+	req := APIQueryRequest{
+		Cluster:   job.Cluster,
+		From:      job.StartTime,
+		To:        job.StartTime + int64(job.Duration),
+		Queries:   queries,
+		WithStats: true,
+		WithData:  false,
+	}
+
+	resBody, err := ccms.doRequest(ctx, &req)
+	if err != nil {
+		cclog.Errorf("Error while performing request for job %d: %s", job.JobID, err.Error())
+		return nil, err
+	}
+
+	stats := make(map[string]map[string]schema.MetricStatistics, len(metrics))
+	for i, res := range resBody.Results {
+		if i >= len(req.Queries) {
+			cclog.Warnf("LoadStats: result index %d exceeds queries length %d", i, len(req.Queries))
+			break
+		}
+		if len(res) == 0 {
+			// No Data Found For Metric, Logged in FetchData to Warn
+			continue
+		}
+		query := req.Queries[i]
+		metric := query.Metric
+		data := res[0]
+		if data.Error != nil {
+			cclog.Warnf("fetching %s for node %s failed: %s", metric, query.Hostname, *data.Error)
+			continue
+		}
+
+		metricdata, ok := stats[metric]
+		if !ok {
+			metricdata = make(map[string]schema.MetricStatistics, job.NumNodes)
+			stats[metric] = metricdata
+		}
+
+		if hasNaNStats(data.Avg, data.Min, data.Max) {
+			cclog.Warnf("fetching %s for node %s failed: one of avg/min/max is NaN", metric, query.Hostname)
+			continue
+		}
+
+		metricdata[query.Hostname] = schema.MetricStatistics{
+			Avg: float64(data.Avg),
+			Min: float64(data.Min),
+			Max: float64(data.Max),
+		}
+	}
+
+	return stats, nil
+}
+
+// LoadScopedStats retrieves statistics for job metrics across multiple scopes.
+// Used for the Job-View Statistics Table to display per-scope breakdowns. +// +// Returns statistics organized as: metric -> scope -> list of scoped statistics. +// Each scoped statistic includes hostname, hardware ID (if applicable), and min/avg/max values. +func (ccms *CCMetricStore) LoadScopedStats( + job *schema.Job, + metrics []string, + scopes []schema.MetricScope, + ctx context.Context, +) (schema.ScopedJobStats, error) { + queries, assignedScope, err := ccms.buildQueries(job, metrics, scopes, 0) + if err != nil { + cclog.Errorf("Error while building queries for jobId %d, Metrics %v, Scopes %v: %s", job.JobID, metrics, scopes, err.Error()) + return nil, err + } + + req := APIQueryRequest{ + Cluster: job.Cluster, + From: job.StartTime, + To: job.StartTime + int64(job.Duration), + Queries: queries, + WithStats: true, + WithData: false, + } + + resBody, err := ccms.doRequest(ctx, &req) + if err != nil { + cclog.Errorf("Error while performing request for job %d: %s", job.JobID, err.Error()) + return nil, err + } + + var errors []string + scopedJobStats := make(schema.ScopedJobStats) + + for i, row := range resBody.Results { + query := req.Queries[i] + metric := query.Metric + scope := assignedScope[i] + + if _, ok := scopedJobStats[metric]; !ok { + scopedJobStats[metric] = make(map[schema.MetricScope][]*schema.ScopedStats) + } + + if _, ok := scopedJobStats[metric][scope]; !ok { + scopedJobStats[metric][scope] = make([]*schema.ScopedStats, 0) + } + + for ndx, res := range row { + if res.Error != nil { + /* Build list for "partial errors", if any */ + errors = append(errors, fmt.Sprintf("failed to fetch '%s' from host '%s': %s", query.Metric, query.Hostname, *res.Error)) + continue + } + + id := ms.ExtractTypeID(query.Type, query.TypeIds, ndx, query.Metric, query.Hostname) + + ms.SanitizeStats(&res.Avg, &res.Min, &res.Max) + + scopedJobStats[metric][scope] = append(scopedJobStats[metric][scope], &schema.ScopedStats{ + Hostname: query.Hostname, + ID: id, + Data: 
&schema.MetricStatistics{ + Avg: float64(res.Avg), + Min: float64(res.Min), + Max: float64(res.Max), + }, + }) + } + + // So that one can later check len(scopedJobStats[metric][scope]): Remove from map if empty + if len(scopedJobStats[metric][scope]) == 0 { + delete(scopedJobStats[metric], scope) + if len(scopedJobStats[metric]) == 0 { + delete(scopedJobStats, metric) + } + } + } + + if len(errors) != 0 { + /* Returns list for "partial errors" */ + return scopedJobStats, fmt.Errorf("METRICDATA/EXTERNAL-CCMS > Errors: %s", strings.Join(errors, ", ")) + } + return scopedJobStats, nil +} + +// LoadNodeData retrieves current metric data for specified nodes in a cluster. +// Used for the Systems-View Node-Overview to display real-time node status. +// +// If nodes is nil, queries all metrics for all nodes in the cluster (bulk query). +// Returns data organized as: hostname -> metric -> list of JobMetric (with time series and stats). +func (ccms *CCMetricStore) LoadNodeData( + cluster string, + metrics, nodes []string, + scopes []schema.MetricScope, + from, to time.Time, + ctx context.Context, +) (map[string]map[string][]*schema.JobMetric, error) { + req := APIQueryRequest{ + Cluster: cluster, + From: from.Unix(), + To: to.Unix(), + WithStats: true, + WithData: true, + } + + if nodes == nil { + req.ForAllNodes = append(req.ForAllNodes, metrics...) 
+ } else { + for _, node := range nodes { + for _, metric := range metrics { + req.Queries = append(req.Queries, APIQuery{ + Hostname: node, + Metric: metric, + Resolution: 0, // Default for Node Queries: Will return metric $Timestep Resolution + }) + } + } + } + + resBody, err := ccms.doRequest(ctx, &req) + if err != nil { + cclog.Errorf("Error while performing request for cluster %s: %s", cluster, err.Error()) + return nil, err + } + + var errors []string + data := make(map[string]map[string][]*schema.JobMetric) + for i, res := range resBody.Results { + if len(res) == 0 { + // No Data Found For Metric, Logged in FetchData to Warn + continue + } + + var query APIQuery + if resBody.Queries != nil { + query = resBody.Queries[i] + } else { + query = req.Queries[i] + } + + metric := query.Metric + qdata := res[0] + if qdata.Error != nil { + errors = append(errors, fmt.Sprintf("fetching %s for node %s failed: %s", metric, query.Hostname, *qdata.Error)) + continue + } + + mc := archive.GetMetricConfig(cluster, metric) + if mc == nil { + cclog.Warnf("Metric config not found for %s on cluster %s", metric, cluster) + continue + } + + ms.SanitizeStats(&qdata.Avg, &qdata.Min, &qdata.Max) + + hostdata, ok := data[query.Hostname] + if !ok { + hostdata = make(map[string][]*schema.JobMetric) + data[query.Hostname] = hostdata + } + + hostdata[metric] = append(hostdata[metric], &schema.JobMetric{ + Unit: mc.Unit, + Timestep: mc.Timestep, + Series: []schema.Series{ + { + Hostname: query.Hostname, + Data: qdata.Data, + Statistics: schema.MetricStatistics{ + Avg: float64(qdata.Avg), + Min: float64(qdata.Min), + Max: float64(qdata.Max), + }, + }, + }, + }) + } + + if len(errors) != 0 { + /* Returns list of "partial errors" */ + return data, fmt.Errorf("METRICDATA/EXTERNAL-CCMS > Errors: %s", strings.Join(errors, ", ")) + } + + return data, nil +} + +// LoadNodeListData retrieves paginated node metrics for the Systems-View Node-List. 
+// +// Supports filtering by subcluster and node name pattern. The nodeFilter performs +// substring matching on hostnames. +// +// Returns: +// - Node data organized as: hostname -> JobData (metric -> scope -> series) +// - Total node count (before pagination) +// - HasNextPage flag indicating if more pages are available +// - Error (may be partial error with some data returned) +func (ccms *CCMetricStore) LoadNodeListData( + cluster, subCluster string, + nodes []string, + metrics []string, + scopes []schema.MetricScope, + resolution int, + from, to time.Time, + ctx context.Context, +) (map[string]schema.JobData, error) { + queries, assignedScope, err := ccms.buildNodeQueries(cluster, subCluster, nodes, metrics, scopes, resolution) + if err != nil { + cclog.Errorf("Error while building node queries for Cluster %s, SubCluster %s, Metrics %v, Scopes %v: %s", cluster, subCluster, metrics, scopes, err.Error()) + return nil, err + } + + // Verify assignment is correct - log any inconsistencies for debugging + if len(queries) != len(assignedScope) { + cclog.Errorf("Critical error: queries and assignedScope have different lengths after buildNodeQueries: %d vs %d", + len(queries), len(assignedScope)) + } + + req := APIQueryRequest{ + Cluster: cluster, + Queries: queries, + From: from.Unix(), + To: to.Unix(), + WithStats: true, + WithData: true, + } + + resBody, err := ccms.doRequest(ctx, &req) + if err != nil { + cclog.Errorf("Error while performing request for cluster %s: %s", cluster, err.Error()) + return nil, err + } + + var errors []string + data := make(map[string]schema.JobData) + + // Add safety check for index out of range issues + if len(resBody.Results) != len(req.Queries) || len(assignedScope) != len(req.Queries) { + cclog.Warnf("Mismatch in query results count: queries=%d, results=%d, assignedScope=%d", + len(req.Queries), len(resBody.Results), len(assignedScope)) + if len(resBody.Results) > len(req.Queries) { + resBody.Results = 
resBody.Results[:len(req.Queries)] + } + if len(assignedScope) > len(req.Queries) { + assignedScope = assignedScope[:len(req.Queries)] + } + } + + for i, row := range resBody.Results { + var query APIQuery + if resBody.Queries != nil { + if i < len(resBody.Queries) { + query = resBody.Queries[i] + } else { + cclog.Warnf("Index out of range prevented for resBody.Queries: i=%d, len=%d", + i, len(resBody.Queries)) + continue + } + } else { + query = req.Queries[i] + } + + metric := query.Metric + scope := assignedScope[i] + mc := archive.GetMetricConfig(cluster, metric) + if mc == nil { + cclog.Warnf("Metric config not found for %s on cluster %s", metric, cluster) + continue + } + + res := mc.Timestep + if len(row) > 0 { + res = row[0].Resolution + } + + // Init Nested Map Data Structures If Not Found + hostData, ok := data[query.Hostname] + if !ok { + hostData = make(schema.JobData) + data[query.Hostname] = hostData + } + + metricData, ok := hostData[metric] + if !ok { + metricData = make(map[schema.MetricScope]*schema.JobMetric) + data[query.Hostname][metric] = metricData + } + + scopeData, ok := metricData[scope] + if !ok { + scopeData = &schema.JobMetric{ + Unit: mc.Unit, + Timestep: res, + Series: make([]schema.Series, 0), + } + data[query.Hostname][metric][scope] = scopeData + } + + for ndx, res := range row { + if res.Error != nil { + /* Build list for "partial errors", if any */ + errors = append(errors, fmt.Sprintf("failed to fetch '%s' from host '%s': %s", query.Metric, query.Hostname, *res.Error)) + continue + } + + id := ms.ExtractTypeID(query.Type, query.TypeIds, ndx, query.Metric, query.Hostname) + + ms.SanitizeStats(&res.Avg, &res.Min, &res.Max) + + scopeData.Series = append(scopeData.Series, schema.Series{ + Hostname: query.Hostname, + ID: id, + Statistics: schema.MetricStatistics{ + Avg: float64(res.Avg), + Min: float64(res.Min), + Max: float64(res.Max), + }, + Data: res.Data, + }) + } + } + + if len(errors) != 0 { + /* Returns list of "partial 
errors" */ + return data, fmt.Errorf("METRICDATA/EXTERNAL-CCMS > Errors: %s", strings.Join(errors, ", ")) + } + + return data, nil +} + +// HealthCheck queries the external cc-metric-store's health check endpoint. +// It sends a HealthCheckReq as the request body to /api/healthcheck and +// returns the per-node health check results. +func (ccms *CCMetricStore) HealthCheck(cluster string, + nodes []string, metrics []string, +) (map[string]ms.HealthCheckResult, error) { + req := ms.HealthCheckReq{ + Cluster: cluster, + Nodes: nodes, + MetricNames: metrics, + } + + buf := &bytes.Buffer{} + if err := json.NewEncoder(buf).Encode(req); err != nil { + cclog.Errorf("Error while encoding health check request body: %s", err.Error()) + return nil, err + } + + endpoint := fmt.Sprintf("%s/api/healthcheck", ccms.url) + httpReq, err := http.NewRequest(http.MethodGet, endpoint, buf) + if err != nil { + cclog.Errorf("Error while building health check request: %s", err.Error()) + return nil, err + } + if ccms.jwt != "" { + httpReq.Header.Add("Authorization", fmt.Sprintf("Bearer %s", ccms.jwt)) + } + + res, err := ccms.client.Do(httpReq) + if err != nil { + cclog.Errorf("Error while performing health check request: %s", err.Error()) + return nil, err + } + defer res.Body.Close() + + if res.StatusCode != http.StatusOK { + return nil, fmt.Errorf("'%s': HTTP Status: %s", endpoint, res.Status) + } + + var results map[string]ms.HealthCheckResult + if err := json.NewDecoder(bufio.NewReader(res.Body)).Decode(&results); err != nil { + cclog.Errorf("Error while decoding health check response: %s", err.Error()) + return nil, err + } + + return results, nil +} + +// hasNaNStats returns true if any of the statistics contain NaN values. 
+func hasNaNStats(avg, min, max schema.Float) bool { + return avg.IsNaN() || min.IsNaN() || max.IsNaN() +} diff --git a/internal/repository/dbConnection.go b/internal/repository/dbConnection.go index 1c14c956..d67d3227 100644 --- a/internal/repository/dbConnection.go +++ b/internal/repository/dbConnection.go @@ -12,7 +12,7 @@ import ( "sync" "time" - cclog "github.com/ClusterCockpit/cc-lib/ccLogger" + cclog "github.com/ClusterCockpit/cc-lib/v2/ccLogger" "github.com/jmoiron/sqlx" "github.com/mattn/go-sqlite3" "github.com/qustavo/sqlhooks/v2" @@ -51,7 +51,7 @@ func setupSqlite(db *sql.DB) error { return nil } -func Connect(driver string, db string) { +func Connect(db string) { var err error var dbHandle *sqlx.DB @@ -64,39 +64,31 @@ func Connect(driver string, db string) { ConnectionMaxIdleTime: repoConfig.ConnectionMaxIdleTime, } - switch driver { - case "sqlite3": - // TODO: Have separate DB handles for Writes and Reads - // Optimize SQLite connection: https://kerkour.com/sqlite-for-servers - connectionURLParams := make(url.Values) - connectionURLParams.Add("_txlock", "immediate") - connectionURLParams.Add("_journal_mode", "WAL") - connectionURLParams.Add("_busy_timeout", "5000") - connectionURLParams.Add("_synchronous", "NORMAL") - connectionURLParams.Add("_cache_size", "1000000000") - connectionURLParams.Add("_foreign_keys", "true") - opts.URL = fmt.Sprintf("file:%s?%s", opts.URL, connectionURLParams.Encode()) + // TODO: Have separate DB handles for Writes and Reads + // Optimize SQLite connection: https://kerkour.com/sqlite-for-servers + connectionURLParams := make(url.Values) + connectionURLParams.Add("_txlock", "immediate") + connectionURLParams.Add("_journal_mode", "WAL") + connectionURLParams.Add("_busy_timeout", "5000") + connectionURLParams.Add("_synchronous", "NORMAL") + connectionURLParams.Add("_cache_size", "1000000000") + connectionURLParams.Add("_foreign_keys", "true") + opts.URL = fmt.Sprintf("file:%s?%s", opts.URL, connectionURLParams.Encode()) - if 
cclog.Loglevel() == "debug" { - sql.Register("sqlite3WithHooks", sqlhooks.Wrap(&sqlite3.SQLiteDriver{}, &Hooks{})) - dbHandle, err = sqlx.Open("sqlite3WithHooks", opts.URL) - } else { - dbHandle, err = sqlx.Open("sqlite3", opts.URL) - } - - err = setupSqlite(dbHandle.DB) - if err != nil { - cclog.Abortf("Failed sqlite db setup.\nError: %s\n", err.Error()) - } - case "mysql": - opts.URL += "?multiStatements=true" - dbHandle, err = sqlx.Open("mysql", opts.URL) - default: - cclog.Abortf("DB Connection: Unsupported database driver '%s'.\n", driver) + if cclog.Loglevel() == "debug" { + sql.Register("sqlite3WithHooks", sqlhooks.Wrap(&sqlite3.SQLiteDriver{}, &Hooks{})) + dbHandle, err = sqlx.Open("sqlite3WithHooks", opts.URL) + } else { + dbHandle, err = sqlx.Open("sqlite3", opts.URL) } if err != nil { - cclog.Abortf("DB Connection: Could not connect to '%s' database with sqlx.Open().\nError: %s\n", driver, err.Error()) + cclog.Abortf("DB Connection: Could not connect to SQLite database with sqlx.Open().\nError: %s\n", err.Error()) + } + + err = setupSqlite(dbHandle.DB) + if err != nil { + cclog.Abortf("Failed sqlite db setup.\nError: %s\n", err.Error()) } dbHandle.SetMaxOpenConns(opts.MaxOpenConnections) @@ -104,8 +96,8 @@ func Connect(driver string, db string) { dbHandle.SetConnMaxLifetime(opts.ConnectionMaxLifetime) dbHandle.SetConnMaxIdleTime(opts.ConnectionMaxIdleTime) - dbConnInstance = &DBConnection{DB: dbHandle, Driver: driver} - err = checkDBVersion(driver, dbHandle.DB) + dbConnInstance = &DBConnection{DB: dbHandle} + err = checkDBVersion(dbHandle.DB) if err != nil { cclog.Abortf("DB Connection: Failed DB version check.\nError: %s\n", err.Error()) } @@ -119,3 +111,26 @@ func GetConnection() *DBConnection { return dbConnInstance } + +// ResetConnection closes the current database connection and resets the connection state. +// This function is intended for testing purposes only to allow test isolation. 
+func ResetConnection() error { + if dbConnInstance != nil && dbConnInstance.DB != nil { + if err := dbConnInstance.DB.Close(); err != nil { + return fmt.Errorf("failed to close database connection: %w", err) + } + } + + dbConnInstance = nil + dbConnOnce = sync.Once{} + jobRepoInstance = nil + jobRepoOnce = sync.Once{} + nodeRepoInstance = nil + nodeRepoOnce = sync.Once{} + userRepoInstance = nil + userRepoOnce = sync.Once{} + userCfgRepoInstance = nil + userCfgRepoOnce = sync.Once{} + + return nil +} diff --git a/internal/repository/hooks.go b/internal/repository/hooks.go index 54330723..824beb7c 100644 --- a/internal/repository/hooks.go +++ b/internal/repository/hooks.go @@ -2,13 +2,14 @@ // All rights reserved. This file is part of cc-backend. // Use of this source code is governed by a MIT-style // license that can be found in the LICENSE file. + package repository import ( "context" "time" - cclog "github.com/ClusterCockpit/cc-lib/ccLogger" + cclog "github.com/ClusterCockpit/cc-lib/v2/ccLogger" ) // Hooks satisfies the sqlhook.Hooks interface diff --git a/internal/repository/hooks_test.go b/internal/repository/hooks_test.go new file mode 100644 index 00000000..52f954b5 --- /dev/null +++ b/internal/repository/hooks_test.go @@ -0,0 +1,274 @@ +// Copyright (C) NHR@FAU, University Erlangen-Nuremberg. +// All rights reserved. This file is part of cc-backend. +// Use of this source code is governed by a MIT-style +// license that can be found in the LICENSE file. 
+package repository + +import ( + "context" + "testing" + "time" + + "github.com/ClusterCockpit/cc-lib/v2/schema" + _ "github.com/mattn/go-sqlite3" + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" +) + +type MockJobHook struct { + startCalled bool + stopCalled bool + startJobs []*schema.Job + stopJobs []*schema.Job +} + +func (m *MockJobHook) JobStartCallback(job *schema.Job) { + m.startCalled = true + m.startJobs = append(m.startJobs, job) +} + +func (m *MockJobHook) JobStopCallback(job *schema.Job) { + m.stopCalled = true + m.stopJobs = append(m.stopJobs, job) +} + +func TestRegisterJobHook(t *testing.T) { + t.Run("register single hook", func(t *testing.T) { + hooks = nil + mock := &MockJobHook{} + + RegisterJobHook(mock) + + assert.NotNil(t, hooks) + assert.Len(t, hooks, 1) + assert.Equal(t, mock, hooks[0]) + + hooks = nil + }) + + t.Run("register multiple hooks", func(t *testing.T) { + hooks = nil + mock1 := &MockJobHook{} + mock2 := &MockJobHook{} + + RegisterJobHook(mock1) + RegisterJobHook(mock2) + + assert.Len(t, hooks, 2) + assert.Equal(t, mock1, hooks[0]) + assert.Equal(t, mock2, hooks[1]) + + hooks = nil + }) + + t.Run("register nil hook does not add to hooks", func(t *testing.T) { + hooks = nil + RegisterJobHook(nil) + + if hooks != nil { + assert.Len(t, hooks, 0, "Nil hook should not be added") + } + + hooks = nil + }) +} + +func TestCallJobStartHooks(t *testing.T) { + t.Run("call start hooks with single job", func(t *testing.T) { + hooks = nil + mock := &MockJobHook{} + RegisterJobHook(mock) + + job := &schema.Job{ + JobID: 123, + User: "testuser", + Cluster: "testcluster", + } + + CallJobStartHooks([]*schema.Job{job}) + + assert.True(t, mock.startCalled) + assert.False(t, mock.stopCalled) + assert.Len(t, mock.startJobs, 1) + assert.Equal(t, int64(123), mock.startJobs[0].JobID) + + hooks = nil + }) + + t.Run("call start hooks with multiple jobs", func(t *testing.T) { + hooks = nil + mock := &MockJobHook{} + 
RegisterJobHook(mock) + + jobs := []*schema.Job{ + {JobID: 1, User: "user1", Cluster: "cluster1"}, + {JobID: 2, User: "user2", Cluster: "cluster2"}, + {JobID: 3, User: "user3", Cluster: "cluster3"}, + } + + CallJobStartHooks(jobs) + + assert.True(t, mock.startCalled) + assert.Len(t, mock.startJobs, 3) + assert.Equal(t, int64(1), mock.startJobs[0].JobID) + assert.Equal(t, int64(2), mock.startJobs[1].JobID) + assert.Equal(t, int64(3), mock.startJobs[2].JobID) + + hooks = nil + }) + + t.Run("call start hooks with multiple registered hooks", func(t *testing.T) { + hooks = nil + mock1 := &MockJobHook{} + mock2 := &MockJobHook{} + RegisterJobHook(mock1) + RegisterJobHook(mock2) + + job := &schema.Job{ + JobID: 456, User: "testuser", Cluster: "testcluster", + } + + CallJobStartHooks([]*schema.Job{job}) + + assert.True(t, mock1.startCalled) + assert.True(t, mock2.startCalled) + assert.Len(t, mock1.startJobs, 1) + assert.Len(t, mock2.startJobs, 1) + + hooks = nil + }) + + t.Run("call start hooks with nil hooks", func(t *testing.T) { + hooks = nil + + job := &schema.Job{ + JobID: 789, User: "testuser", Cluster: "testcluster", + } + + CallJobStartHooks([]*schema.Job{job}) + + hooks = nil + }) + + t.Run("call start hooks with empty job list", func(t *testing.T) { + hooks = nil + mock := &MockJobHook{} + RegisterJobHook(mock) + + CallJobStartHooks([]*schema.Job{}) + + assert.False(t, mock.startCalled) + assert.Len(t, mock.startJobs, 0) + + hooks = nil + }) +} + +func TestCallJobStopHooks(t *testing.T) { + t.Run("call stop hooks with single job", func(t *testing.T) { + hooks = nil + mock := &MockJobHook{} + RegisterJobHook(mock) + + job := &schema.Job{ + JobID: 123, + User: "testuser", + Cluster: "testcluster", + } + + CallJobStopHooks(job) + + assert.True(t, mock.stopCalled) + assert.False(t, mock.startCalled) + assert.Len(t, mock.stopJobs, 1) + assert.Equal(t, int64(123), mock.stopJobs[0].JobID) + + hooks = nil + }) + + t.Run("call stop hooks with multiple registered hooks", 
func(t *testing.T) { + hooks = nil + mock1 := &MockJobHook{} + mock2 := &MockJobHook{} + RegisterJobHook(mock1) + RegisterJobHook(mock2) + + job := &schema.Job{ + JobID: 456, User: "testuser", Cluster: "testcluster", + } + + CallJobStopHooks(job) + + assert.True(t, mock1.stopCalled) + assert.True(t, mock2.stopCalled) + assert.Len(t, mock1.stopJobs, 1) + assert.Len(t, mock2.stopJobs, 1) + + hooks = nil + }) + + t.Run("call stop hooks with nil hooks", func(t *testing.T) { + hooks = nil + + job := &schema.Job{ + JobID: 789, User: "testuser", Cluster: "testcluster", + } + + CallJobStopHooks(job) + + hooks = nil + }) +} + +func TestSQLHooks(t *testing.T) { + _ = setup(t) + + t.Run("hooks log queries in debug mode", func(t *testing.T) { + h := &Hooks{} + + ctx := context.Background() + query := "SELECT * FROM job WHERE job_id = ?" + args := []any{123} + + ctxWithTime, err := h.Before(ctx, query, args...) + require.NoError(t, err) + assert.NotNil(t, ctxWithTime) + + beginTime := ctxWithTime.Value("begin") + require.NotNil(t, beginTime) + _, ok := beginTime.(time.Time) + assert.True(t, ok, "Begin time should be time.Time") + + time.Sleep(10 * time.Millisecond) + + ctxAfter, err := h.After(ctxWithTime, query, args...) 
+ require.NoError(t, err) + assert.NotNil(t, ctxAfter) + }) +} + +func TestHookIntegration(t *testing.T) { + t.Run("hooks are called during job lifecycle", func(t *testing.T) { + hooks = nil + mock := &MockJobHook{} + RegisterJobHook(mock) + + job := &schema.Job{ + JobID: 999, + User: "integrationuser", + Cluster: "integrationcluster", + } + + CallJobStartHooks([]*schema.Job{job}) + assert.True(t, mock.startCalled) + assert.Equal(t, 1, len(mock.startJobs)) + + CallJobStopHooks(job) + assert.True(t, mock.stopCalled) + assert.Equal(t, 1, len(mock.stopJobs)) + + assert.Equal(t, mock.startJobs[0].JobID, mock.stopJobs[0].JobID) + + hooks = nil + }) +} diff --git a/internal/repository/job.go b/internal/repository/job.go index 2f003e3b..566a13b1 100644 --- a/internal/repository/job.go +++ b/internal/repository/job.go @@ -14,8 +14,6 @@ // Initialize the database connection before using any repository: // // repository.Connect("sqlite3", "./var/job.db") -// // or for MySQL: -// repository.Connect("mysql", "user:password@tcp(localhost:3306)/dbname") // // # Configuration // @@ -68,31 +66,47 @@ import ( "fmt" "maps" "math" + "sort" "strconv" "sync" "time" "github.com/ClusterCockpit/cc-backend/pkg/archive" - cclog "github.com/ClusterCockpit/cc-lib/ccLogger" - "github.com/ClusterCockpit/cc-lib/lrucache" - "github.com/ClusterCockpit/cc-lib/schema" + cclog "github.com/ClusterCockpit/cc-lib/v2/ccLogger" + "github.com/ClusterCockpit/cc-lib/v2/lrucache" + "github.com/ClusterCockpit/cc-lib/v2/schema" sq "github.com/Masterminds/squirrel" "github.com/jmoiron/sqlx" ) var ( - jobRepoOnce sync.Once + // jobRepoOnce ensures singleton initialization of the JobRepository + jobRepoOnce sync.Once + // jobRepoInstance holds the single instance of JobRepository jobRepoInstance *JobRepository ) +// JobRepository provides database access for job-related operations. +// It implements the repository pattern to abstract database interactions +// and provides caching for improved performance. 
+// +// The repository is a singleton initialized via GetJobRepository(). +// All database queries use prepared statements via stmtCache for efficiency. +// Frequently accessed data (metadata, energy footprints) is cached in an LRU cache. type JobRepository struct { - DB *sqlx.DB - stmtCache *sq.StmtCache - cache *lrucache.Cache - driver string - Mutex sync.Mutex + DB *sqlx.DB // Database connection pool + stmtCache *sq.StmtCache // Prepared statement cache for query optimization + cache *lrucache.Cache // LRU cache for metadata and footprint data + driver string // Database driver name (e.g., "sqlite3") + Mutex sync.Mutex // Mutex for thread-safe operations } +// GetJobRepository returns the singleton instance of JobRepository. +// The repository is initialized lazily on first access with database connection, +// prepared statement cache, and LRU cache configured from repoConfig. +// +// This function is thread-safe and ensures only one instance is created. +// It must be called after Connect() has established a database connection. func GetJobRepository() *JobRepository { jobRepoOnce.Do(func() { db := GetConnection() @@ -108,6 +122,8 @@ func GetJobRepository() *JobRepository { return jobRepoInstance } +// jobColumns defines the standard set of columns selected from the job table. +// Used consistently across all job queries to ensure uniform data retrieval. var jobColumns []string = []string{ "job.id", "job.job_id", "job.hpc_user", "job.project", "job.cluster", "job.subcluster", "job.start_time", "job.cluster_partition", "job.array_job_id", "job.num_nodes", @@ -116,6 +132,8 @@ var jobColumns []string = []string{ "job.footprint", "job.energy", } +// jobCacheColumns defines columns from the job_cache table, mirroring jobColumns. +// Used for queries against cached job data for performance optimization. 
var jobCacheColumns []string = []string{ "job_cache.id", "job_cache.job_id", "job_cache.hpc_user", "job_cache.project", "job_cache.cluster", "job_cache.subcluster", "job_cache.start_time", "job_cache.cluster_partition", @@ -125,6 +143,14 @@ var jobCacheColumns []string = []string{ "job_cache.footprint", "job_cache.energy", } +// scanJob converts a database row into a schema.Job struct. +// It handles JSON unmarshaling of resources and footprint fields, +// and calculates accurate duration for running jobs. +// +// Parameters: +// - row: Database row implementing Scan() interface (sql.Row or sql.Rows) +// +// Returns the populated Job struct or an error if scanning or unmarshaling fails. func scanJob(row interface{ Scan(...any) error }) (*schema.Job, error) { job := &schema.Job{} @@ -157,59 +183,53 @@ func scanJob(row interface{ Scan(...any) error }) (*schema.Job, error) { return job, nil } +// Optimize performs database optimization by running VACUUM command. +// This reclaims unused space and defragments the database file. +// Should be run periodically during maintenance windows. func (r *JobRepository) Optimize() error { - var err error - - switch r.driver { - case "sqlite3": - if _, err = r.DB.Exec(`VACUUM`); err != nil { - return err - } - case "mysql": - cclog.Info("Optimize currently not supported for mysql driver") + if _, err := r.DB.Exec(`VACUUM`); err != nil { + cclog.Errorf("Error while executing VACUUM: %v", err) + return fmt.Errorf("failed to optimize database: %w", err) } - return nil } +// Flush removes all data from job-related tables (jobtag, tag, job). +// WARNING: This is a destructive operation that deletes all job data. +// Use with extreme caution, typically only for testing or complete resets. 
func (r *JobRepository) Flush() error { - var err error - - switch r.driver { - case "sqlite3": - if _, err = r.DB.Exec(`DELETE FROM jobtag`); err != nil { - return err - } - if _, err = r.DB.Exec(`DELETE FROM tag`); err != nil { - return err - } - if _, err = r.DB.Exec(`DELETE FROM job`); err != nil { - return err - } - case "mysql": - if _, err = r.DB.Exec(`SET FOREIGN_KEY_CHECKS = 0`); err != nil { - return err - } - if _, err = r.DB.Exec(`TRUNCATE TABLE jobtag`); err != nil { - return err - } - if _, err = r.DB.Exec(`TRUNCATE TABLE tag`); err != nil { - return err - } - if _, err = r.DB.Exec(`TRUNCATE TABLE job`); err != nil { - return err - } - if _, err = r.DB.Exec(`SET FOREIGN_KEY_CHECKS = 1`); err != nil { - return err - } + if _, err := r.DB.Exec(`DELETE FROM jobtag`); err != nil { + cclog.Errorf("Error while deleting from jobtag table: %v", err) + return fmt.Errorf("failed to flush jobtag table: %w", err) + } + if _, err := r.DB.Exec(`DELETE FROM tag`); err != nil { + cclog.Errorf("Error while deleting from tag table: %v", err) + return fmt.Errorf("failed to flush tag table: %w", err) + } + if _, err := r.DB.Exec(`DELETE FROM job`); err != nil { + cclog.Errorf("Error while deleting from job table: %v", err) + return fmt.Errorf("failed to flush job table: %w", err) } - return nil } +// FetchMetadata retrieves and unmarshals the metadata JSON for a job. +// Metadata is cached with a 24-hour TTL to improve performance. +// +// The metadata field stores arbitrary key-value pairs associated with a job, +// such as tags, labels, or custom attributes added by external systems. +// +// Parameters: +// - job: Job struct with valid ID field, metadata will be populated in job.MetaData +// +// Returns the metadata map or an error if the job is nil or database query fails. 
func (r *JobRepository) FetchMetadata(job *schema.Job) (map[string]string, error) { + if job == nil { + return nil, fmt.Errorf("job cannot be nil") + } + start := time.Now() - cachekey := fmt.Sprintf("metadata:%d", job.ID) + cachekey := fmt.Sprintf("metadata:%d", *job.ID) if cached := r.cache.Get(cachekey, nil); cached != nil { job.MetaData = cached.(map[string]string) return job.MetaData, nil @@ -217,8 +237,8 @@ func (r *JobRepository) FetchMetadata(job *schema.Job) (map[string]string, error if err := sq.Select("job.meta_data").From("job").Where("job.id = ?", job.ID). RunWith(r.stmtCache).QueryRow().Scan(&job.RawMetaData); err != nil { - cclog.Warn("Error while scanning for job metadata") - return nil, err + cclog.Warnf("Error while scanning for job metadata (ID=%d): %v", *job.ID, err) + return nil, fmt.Errorf("failed to fetch metadata for job %d: %w", *job.ID, err) } if len(job.RawMetaData) == 0 { @@ -226,8 +246,8 @@ func (r *JobRepository) FetchMetadata(job *schema.Job) (map[string]string, error } if err := json.Unmarshal(job.RawMetaData, &job.MetaData); err != nil { - cclog.Warn("Error while unmarshaling raw metadata json") - return nil, err + cclog.Warnf("Error while unmarshaling raw metadata json (ID=%d): %v", *job.ID, err) + return nil, fmt.Errorf("failed to unmarshal metadata for job %d: %w", *job.ID, err) } r.cache.Put(cachekey, job.MetaData, len(job.RawMetaData), 24*time.Hour) @@ -235,13 +255,27 @@ func (r *JobRepository) FetchMetadata(job *schema.Job) (map[string]string, error return job.MetaData, nil } +// UpdateMetadata adds or updates a single metadata key-value pair for a job. +// The entire metadata map is re-marshaled and stored, and the cache is invalidated. +// Also triggers archive metadata update via archive.UpdateMetadata. 
+// +// Parameters: +// - job: Job struct with valid ID, existing metadata will be fetched if not present +// - key: Metadata key to set +// - val: Metadata value to set +// +// Returns an error if the job is nil, metadata fetch fails, or database update fails. func (r *JobRepository) UpdateMetadata(job *schema.Job, key, val string) (err error) { - cachekey := fmt.Sprintf("metadata:%d", job.ID) + if job == nil { + return fmt.Errorf("job cannot be nil") + } + + cachekey := fmt.Sprintf("metadata:%d", *job.ID) r.cache.Del(cachekey) if job.MetaData == nil { if _, err = r.FetchMetadata(job); err != nil { - cclog.Warnf("Error while fetching metadata for job, DB ID '%v'", job.ID) - return err + cclog.Warnf("Error while fetching metadata for job, DB ID '%v'", *job.ID) + return fmt.Errorf("failed to fetch metadata for job %d: %w", *job.ID, err) } } @@ -255,29 +289,43 @@ func (r *JobRepository) UpdateMetadata(job *schema.Job, key, val string) (err er } if job.RawMetaData, err = json.Marshal(job.MetaData); err != nil { - cclog.Warnf("Error while marshaling metadata for job, DB ID '%v'", job.ID) - return err + cclog.Warnf("Error while marshaling metadata for job, DB ID '%v'", *job.ID) + return fmt.Errorf("failed to marshal metadata for job %d: %w", *job.ID, err) } if _, err = sq.Update("job"). Set("meta_data", job.RawMetaData). Where("job.id = ?", job.ID). RunWith(r.stmtCache).Exec(); err != nil { - cclog.Warnf("Error while updating metadata for job, DB ID '%v'", job.ID) - return err + cclog.Warnf("Error while updating metadata for job, DB ID '%v'", *job.ID) + return fmt.Errorf("failed to update metadata in database for job %d: %w", *job.ID, err) } r.cache.Put(cachekey, job.MetaData, len(job.RawMetaData), 24*time.Hour) return archive.UpdateMetadata(job, job.MetaData) } +// FetchFootprint retrieves and unmarshals the performance footprint JSON for a job. +// Unlike FetchMetadata, footprints are NOT cached as they can be large and change frequently. 
+// +// The footprint contains summary statistics (avg/min/max) for monitored metrics, +// stored as JSON with keys like "cpu_load_avg", "mem_used_max", etc. +// +// Parameters: +// - job: Job struct with valid ID, footprint will be populated in job.Footprint +// +// Returns the footprint map or an error if the job is nil or database query fails. func (r *JobRepository) FetchFootprint(job *schema.Job) (map[string]float64, error) { + if job == nil { + return nil, fmt.Errorf("job cannot be nil") + } + start := time.Now() if err := sq.Select("job.footprint").From("job").Where("job.id = ?", job.ID). RunWith(r.stmtCache).QueryRow().Scan(&job.RawFootprint); err != nil { - cclog.Warn("Error while scanning for job footprint") - return nil, err + cclog.Warnf("Error while scanning for job footprint (ID=%d): %v", *job.ID, err) + return nil, fmt.Errorf("failed to fetch footprint for job %d: %w", *job.ID, err) } if len(job.RawFootprint) == 0 { @@ -285,17 +333,31 @@ func (r *JobRepository) FetchFootprint(job *schema.Job) (map[string]float64, err } if err := json.Unmarshal(job.RawFootprint, &job.Footprint); err != nil { - cclog.Warn("Error while unmarshaling raw footprint json") - return nil, err + cclog.Warnf("Error while unmarshaling raw footprint json (ID=%d): %v", *job.ID, err) + return nil, fmt.Errorf("failed to unmarshal footprint for job %d: %w", *job.ID, err) } cclog.Debugf("Timer FetchFootprint %s", time.Since(start)) return job.Footprint, nil } +// FetchEnergyFootprint retrieves and unmarshals the energy footprint JSON for a job. +// Energy footprints are cached with a 24-hour TTL as they are frequently accessed but rarely change. +// +// The energy footprint contains calculated energy consumption (in kWh) per metric, +// stored as JSON with keys like "power_avg", "acc_power_avg", etc. 
+// +// Parameters: +// - job: Job struct with valid ID, energy footprint will be populated in job.EnergyFootprint +// +// Returns the energy footprint map or an error if the job is nil or database query fails. func (r *JobRepository) FetchEnergyFootprint(job *schema.Job) (map[string]float64, error) { + if job == nil { + return nil, fmt.Errorf("job cannot be nil") + } + start := time.Now() - cachekey := fmt.Sprintf("energyFootprint:%d", job.ID) + cachekey := fmt.Sprintf("energyFootprint:%d", *job.ID) if cached := r.cache.Get(cachekey, nil); cached != nil { job.EnergyFootprint = cached.(map[string]float64) return job.EnergyFootprint, nil @@ -303,8 +365,8 @@ func (r *JobRepository) FetchEnergyFootprint(job *schema.Job) (map[string]float6 if err := sq.Select("job.energy_footprint").From("job").Where("job.id = ?", job.ID). RunWith(r.stmtCache).QueryRow().Scan(&job.RawEnergyFootprint); err != nil { - cclog.Warn("Error while scanning for job energy_footprint") - return nil, err + cclog.Warnf("Error while scanning for job energy_footprint (ID=%d): %v", *job.ID, err) + return nil, fmt.Errorf("failed to fetch energy footprint for job %d: %w", *job.ID, err) } if len(job.RawEnergyFootprint) == 0 { @@ -312,8 +374,8 @@ func (r *JobRepository) FetchEnergyFootprint(job *schema.Job) (map[string]float6 } if err := json.Unmarshal(job.RawEnergyFootprint, &job.EnergyFootprint); err != nil { - cclog.Warn("Error while unmarshaling raw energy footprint json") - return nil, err + cclog.Warnf("Error while unmarshaling raw energy footprint json (ID=%d): %v", *job.ID, err) + return nil, fmt.Errorf("failed to unmarshal energy footprint for job %d: %w", *job.ID, err) } r.cache.Put(cachekey, job.EnergyFootprint, len(job.EnergyFootprint), 24*time.Hour) @@ -321,12 +383,28 @@ func (r *JobRepository) FetchEnergyFootprint(job *schema.Job) (map[string]float6 return job.EnergyFootprint, nil } -func (r *JobRepository) DeleteJobsBefore(startTime int64, omitTagged bool) (int, error) { +// 
DeleteJobsBefore removes jobs older than the specified start time. +// Optionally preserves tagged jobs to protect important data from deletion. +// Cache entries for deleted jobs are automatically invalidated. +// +// This is typically used for data retention policies and cleanup operations. +// WARNING: This is a destructive operation that permanently deletes job records. +// +// Parameters: +// - startTime: Unix timestamp, jobs with start_time < this value will be deleted +// - omitTagged: "none" = delete all jobs, "all" = skip any tagged jobs, +// "user" = skip jobs with user-created tags (not auto-tagger types "app"/"jobClass") +// +// Returns the count of deleted jobs or an error if the operation fails. +func (r *JobRepository) DeleteJobsBefore(startTime int64, omitTagged string) (int, error) { var cnt int q := sq.Select("count(*)").From("job").Where("job.start_time < ?", startTime) - if omitTagged { + switch omitTagged { + case "all": q = q.Where("NOT EXISTS (SELECT 1 FROM jobtag WHERE jobtag.job_id = job.id)") + case "user": + q = q.Where("NOT EXISTS (SELECT 1 FROM jobtag JOIN tag ON tag.id = jobtag.tag_id WHERE jobtag.job_id = job.id AND tag.tag_type NOT IN ('app', 'jobClass'))") } if err := q.RunWith(r.DB).QueryRow().Scan(&cnt); err != nil { @@ -339,8 +417,11 @@ func (r *JobRepository) DeleteJobsBefore(startTime int64, omitTagged bool) (int, var jobIds []int64 selectQuery := sq.Select("id").From("job").Where("job.start_time < ?", startTime) - if omitTagged { + switch omitTagged { + case "all": selectQuery = selectQuery.Where("NOT EXISTS (SELECT 1 FROM jobtag WHERE jobtag.job_id = job.id)") + case "user": + selectQuery = selectQuery.Where("NOT EXISTS (SELECT 1 FROM jobtag JOIN tag ON tag.id = jobtag.tag_id WHERE jobtag.job_id = job.id AND tag.tag_type NOT IN ('app', 'jobClass'))") } rows, err := selectQuery.RunWith(r.DB).Query() @@ -362,8 +443,11 @@ func (r *JobRepository) DeleteJobsBefore(startTime int64, omitTagged bool) (int, qd := 
sq.Delete("job").Where("job.start_time < ?", startTime) - if omitTagged { + switch omitTagged { + case "all": qd = qd.Where("NOT EXISTS (SELECT 1 FROM jobtag WHERE jobtag.job_id = job.id)") + case "user": + qd = qd.Where("NOT EXISTS (SELECT 1 FROM jobtag JOIN tag ON tag.id = jobtag.tag_id WHERE jobtag.job_id = job.id AND tag.tag_type NOT IN ('app', 'jobClass'))") } _, err := qd.RunWith(r.DB).Exec() @@ -376,7 +460,14 @@ func (r *JobRepository) DeleteJobsBefore(startTime int64, omitTagged bool) (int, return cnt, err } -func (r *JobRepository) DeleteJobById(id int64) error { +// DeleteJobByID permanently removes a single job by its database ID. +// Cache entries for the deleted job are automatically invalidated. +// +// Parameters: +// - id: Database ID (primary key) of the job to delete +// +// Returns an error if the deletion fails. +func (r *JobRepository) DeleteJobByID(id int64) error { // Invalidate cache entries before deletion r.cache.Del(fmt.Sprintf("metadata:%d", id)) r.cache.Del(fmt.Sprintf("energyFootprint:%d", id)) @@ -393,7 +484,29 @@ func (r *JobRepository) DeleteJobById(id int64) error { return err } +// FindUserOrProjectOrJobname attempts to interpret a search term as a job ID, +// username, project ID, or job name by querying the database. +// +// Search logic (in priority order): +// 1. If searchterm is numeric, treat as job ID (returned immediately) +// 2. Try exact match in job.hpc_user column (username) +// 3. Try LIKE match in hpc_user.name column (real name) +// 4. Try exact match in job.project column (project ID) +// 5. If no matches, return searchterm as jobname for GraphQL query +// +// This powers the searchbar functionality for flexible job searching. +// Requires authenticated user for database lookups (returns empty if user is nil). 
+// +// Parameters: +// - user: Authenticated user context, required for database access +// - searchterm: Search string to interpret +// +// Returns up to one non-empty value among (jobid, username, project, jobname). func (r *JobRepository) FindUserOrProjectOrJobname(user *schema.User, searchterm string) (jobid string, username string, project string, jobname string) { + if searchterm == "" { + return "", "", "", "" + } + if _, err := strconv.Atoi(searchterm); err == nil { // Return empty on successful conversion: parent method will redirect for integer jobId return searchterm, "", "", "" } else { // Has to have letters and logged-in user for other guesses @@ -424,7 +537,24 @@ var ( ErrForbidden = errors.New("not authorized") ) +// FindColumnValue performs a generic column lookup in a database table with role-based access control. +// Only users with admin, support, or manager roles can execute this query. +// +// Parameters: +// - user: User context for authorization check +// - searchterm: Value to search for (exact match or LIKE pattern) +// - table: Database table name to query +// - selectColumn: Column name to return in results +// - whereColumn: Column name to filter on +// - isLike: If true, use LIKE with wildcards; if false, use exact equality +// +// Returns the first matching value, ErrForbidden if user lacks permission, +// or ErrNotFound if no matches are found. func (r *JobRepository) FindColumnValue(user *schema.User, searchterm string, table string, selectColumn string, whereColumn string, isLike bool) (result string, err error) { + if user == nil { + return "", fmt.Errorf("user cannot be nil") + } + compareStr := " = ?" query := searchterm if isLike { @@ -435,17 +565,11 @@ func (r *JobRepository) FindColumnValue(user *schema.User, searchterm string, ta theQuery := sq.Select(table+"."+selectColumn).Distinct().From(table). 
Where(table+"."+whereColumn+compareStr, query) - // theSql, args, theErr := theQuery.ToSql() - // if theErr != nil { - // cclog.Warn("Error while converting query to sql") - // return "", err - // } - // cclog.Debugf("SQL query (FindColumnValue): `%s`, args: %#v", theSql, args) - err := theQuery.RunWith(r.stmtCache).QueryRow().Scan(&result) if err != nil && err != sql.ErrNoRows { - return "", err + cclog.Warnf("Error while querying FindColumnValue (table=%s, column=%s): %v", table, selectColumn, err) + return "", fmt.Errorf("failed to find column value: %w", err) } else if err == nil { return result, nil } @@ -456,22 +580,40 @@ func (r *JobRepository) FindColumnValue(user *schema.User, searchterm string, ta } } +// FindColumnValues performs a generic column lookup returning multiple matches with role-based access control. +// Similar to FindColumnValue but returns all matching values instead of just the first. +// Only users with admin, support, or manager roles can execute this query. +// +// Parameters: +// - user: User context for authorization check +// - query: Search pattern (always uses LIKE with wildcards) +// - table: Database table name to query +// - selectColumn: Column name to return in results +// - whereColumn: Column name to filter on +// +// Returns a slice of matching values, ErrForbidden if user lacks permission, +// or ErrNotFound if no matches are found. func (r *JobRepository) FindColumnValues(user *schema.User, query string, table string, selectColumn string, whereColumn string) (results []string, err error) { + if user == nil { + return nil, fmt.Errorf("user cannot be nil") + } + emptyResult := make([]string, 0) if user.HasAnyRole([]schema.Role{schema.RoleAdmin, schema.RoleSupport, schema.RoleManager}) { rows, err := sq.Select(table+"."+selectColumn).Distinct().From(table). Where(table+"."+whereColumn+" LIKE ?", fmt.Sprint("%", query, "%")). 
RunWith(r.stmtCache).Query() if err != nil && err != sql.ErrNoRows { - return emptyResult, err + cclog.Errorf("Error while querying FindColumnValues (table=%s, column=%s): %v", table, selectColumn, err) + return emptyResult, fmt.Errorf("failed to find column values: %w", err) } else if err == nil { + defer rows.Close() for rows.Next() { var result string err := rows.Scan(&result) if err != nil { - rows.Close() - cclog.Warnf("Error while scanning rows: %v", err) - return emptyResult, err + cclog.Warnf("Error while scanning rows in FindColumnValues: %v", err) + return emptyResult, fmt.Errorf("failed to scan column value: %w", err) } results = append(results, result) } @@ -485,6 +627,13 @@ func (r *JobRepository) FindColumnValues(user *schema.User, query string, table } } +// Partitions returns a list of distinct cluster partitions for a given cluster. +// Results are cached with a 1-hour TTL to improve performance. +// +// Parameters: +// - cluster: Cluster name to query partitions for +// +// Returns a slice of partition names or an error if the database query fails. func (r *JobRepository) Partitions(cluster string) ([]string, error) { var err error start := time.Now() @@ -513,8 +662,8 @@ func (r *JobRepository) AllocatedNodes(cluster string) (map[string]map[string]in Where("job.cluster = ?", cluster). 
RunWith(r.stmtCache).Query() if err != nil { - cclog.Error("Error while running query") - return nil, err + cclog.Errorf("Error while running AllocatedNodes query for cluster=%s: %v", cluster, err) + return nil, fmt.Errorf("failed to query allocated nodes for cluster %s: %w", cluster, err) } var raw []byte @@ -524,12 +673,12 @@ func (r *JobRepository) AllocatedNodes(cluster string) (map[string]map[string]in var resources []*schema.Resource var subcluster string if err := rows.Scan(&raw, &subcluster); err != nil { - cclog.Warn("Error while scanning rows") - return nil, err + cclog.Warnf("Error while scanning rows in AllocatedNodes: %v", err) + return nil, fmt.Errorf("failed to scan allocated nodes row: %w", err) } if err := json.Unmarshal(raw, &resources); err != nil { - cclog.Warn("Error while unmarshaling raw resources json") - return nil, err + cclog.Warnf("Error while unmarshaling raw resources json in AllocatedNodes: %v", err) + return nil, fmt.Errorf("failed to unmarshal resources in AllocatedNodes: %w", err) } hosts, ok := subclusters[subcluster] @@ -547,7 +696,19 @@ func (r *JobRepository) AllocatedNodes(cluster string) (map[string]map[string]in return subclusters, nil } -// FIXME: Set duration to requested walltime? +// StopJobsExceedingWalltimeBy marks running jobs as failed if they exceed their walltime limit. +// This is typically called periodically to clean up stuck or orphaned jobs. +// +// Jobs are marked with: +// - monitoring_status: MonitoringStatusArchivingFailed +// - duration: 0 +// - job_state: JobStateFailed +// +// Parameters: +// - seconds: Grace period beyond walltime before marking as failed +// +// Returns an error if the database update fails. +// Logs the number of jobs marked as failed if any were affected. 
func (r *JobRepository) StopJobsExceedingWalltimeBy(seconds int) error { start := time.Now() currentTime := time.Now().Unix() @@ -557,17 +718,17 @@ func (r *JobRepository) StopJobsExceedingWalltimeBy(seconds int) error { Set("job_state", schema.JobStateFailed). Where("job.job_state = 'running'"). Where("job.walltime > 0"). - Where("(? - job.start_time) > (job.walltime + ?)", currentTime, seconds). + Where("job.start_time < (? - job.walltime)", currentTime-int64(seconds)). RunWith(r.DB).Exec() if err != nil { - cclog.Warn("Error while stopping jobs exceeding walltime") - return err + cclog.Warnf("Error while stopping jobs exceeding walltime: %v", err) + return fmt.Errorf("failed to stop jobs exceeding walltime: %w", err) } rowsAffected, err := res.RowsAffected() if err != nil { - cclog.Warn("Error while fetching affected rows after stopping due to exceeded walltime") - return err + cclog.Warnf("Error while fetching affected rows after stopping due to exceeded walltime: %v", err) + return fmt.Errorf("failed to get rows affected count: %w", err) } if rowsAffected > 0 { @@ -577,33 +738,46 @@ func (r *JobRepository) StopJobsExceedingWalltimeBy(seconds int) error { return nil } -func (r *JobRepository) FindJobIdsByTag(tagId int64) ([]int64, error) { +// FindJobIdsByTag returns all job database IDs associated with a specific tag. +// +// Parameters: +// - tagID: Database ID of the tag to search for +// +// Returns a slice of job IDs or an error if the query fails. +func (r *JobRepository) FindJobIdsByTag(tagID int64) ([]int64, error) { query := sq.Select("job.id").From("job"). Join("jobtag ON jobtag.job_id = job.id"). 
- Where(sq.Eq{"jobtag.tag_id": tagId}).Distinct() + Where(sq.Eq{"jobtag.tag_id": tagID}).Distinct() rows, err := query.RunWith(r.stmtCache).Query() if err != nil { - cclog.Error("Error while running query") - return nil, err + cclog.Errorf("Error while running FindJobIdsByTag query for tagID=%d: %v", tagID, err) + return nil, fmt.Errorf("failed to find job IDs by tag %d: %w", tagID, err) } + defer rows.Close() + jobIds := make([]int64, 0, 100) for rows.Next() { - var jobId int64 + var jobID int64 - if err := rows.Scan(&jobId); err != nil { - rows.Close() - cclog.Warn("Error while scanning rows") - return nil, err + if err := rows.Scan(&jobID); err != nil { + cclog.Warnf("Error while scanning rows in FindJobIdsByTag: %v", err) + return nil, fmt.Errorf("failed to scan job ID in FindJobIdsByTag: %w", err) } - jobIds = append(jobIds, jobId) + jobIds = append(jobIds, jobID) } return jobIds, nil } -// FIXME: Reconsider filtering short jobs with harcoded threshold +// FindRunningJobs returns all currently running jobs for a specific cluster. +// Filters out short-running jobs based on repoConfig.MinRunningJobDuration threshold. +// +// Parameters: +// - cluster: Cluster name to filter jobs +// +// Returns a slice of running job objects or an error if the query fails. func (r *JobRepository) FindRunningJobs(cluster string) ([]*schema.Job, error) { query := sq.Select(jobColumns...).From("job"). Where("job.cluster = ?", cluster). 
@@ -612,8 +786,8 @@ func (r *JobRepository) FindRunningJobs(cluster string) ([]*schema.Job, error) { rows, err := query.RunWith(r.stmtCache).Query() if err != nil { - cclog.Error("Error while running query") - return nil, err + cclog.Errorf("Error while running FindRunningJobs query for cluster=%s: %v", cluster, err) + return nil, fmt.Errorf("failed to find running jobs for cluster %s: %w", cluster, err) } defer rows.Close() @@ -621,16 +795,22 @@ func (r *JobRepository) FindRunningJobs(cluster string) ([]*schema.Job, error) { for rows.Next() { job, err := scanJob(rows) if err != nil { - cclog.Warn("Error while scanning rows") - return nil, err + cclog.Warnf("Error while scanning rows in FindRunningJobs: %v", err) + return nil, fmt.Errorf("failed to scan job in FindRunningJobs: %w", err) } jobs = append(jobs, job) } - cclog.Infof("Return job count %d", len(jobs)) + cclog.Debugf("JobRepository.FindRunningJobs(): Return job count %d (cluster: %s)", len(jobs), cluster) return jobs, nil } +// UpdateDuration recalculates and updates the duration field for all running jobs. +// Called periodically to keep job durations current without querying individual jobs. +// +// Duration is calculated as: current_time - job.start_time +// +// Returns an error if the database update fails. func (r *JobRepository) UpdateDuration() error { stmnt := sq.Update("job"). Set("duration", sq.Expr("? - job.start_time", time.Now().Unix())). @@ -638,13 +818,25 @@ func (r *JobRepository) UpdateDuration() error { _, err := stmnt.RunWith(r.stmtCache).Exec() if err != nil { - return err + cclog.Errorf("Error while updating duration for running jobs: %v", err) + return fmt.Errorf("failed to update duration for running jobs: %w", err) } return nil } -func (r *JobRepository) FindJobsBetween(startTimeBegin int64, startTimeEnd int64, omitTagged bool) ([]*schema.Job, error) { +// FindJobsBetween returns jobs within a specified time range. +// If startTimeBegin is 0, returns all jobs before startTimeEnd. 
+// Optionally excludes tagged jobs from results. +// +// Parameters: +// - startTimeBegin: Unix timestamp for range start (use 0 for unbounded start) +// - startTimeEnd: Unix timestamp for range end +// - omitTagged: "none" = include all jobs, "all" = exclude any tagged jobs, +// "user" = exclude jobs with user-created tags (not auto-tagger types "app"/"jobClass") +// +// Returns a slice of jobs or an error if the time range is invalid or query fails. +func (r *JobRepository) FindJobsBetween(startTimeBegin int64, startTimeEnd int64, omitTagged string) ([]*schema.Job, error) { var query sq.SelectBuilder if startTimeBegin == startTimeEnd || startTimeBegin > startTimeEnd { @@ -659,14 +851,19 @@ func (r *JobRepository) FindJobsBetween(startTimeBegin int64, startTimeEnd int64 query = sq.Select(jobColumns...).From("job").Where("job.start_time BETWEEN ? AND ?", startTimeBegin, startTimeEnd) } - if omitTagged { + switch omitTagged { + case "all": query = query.Where("NOT EXISTS (SELECT 1 FROM jobtag WHERE jobtag.job_id = job.id)") + case "user": + query = query.Where("NOT EXISTS (SELECT 1 FROM jobtag JOIN tag ON tag.id = jobtag.tag_id WHERE jobtag.job_id = job.id AND tag.tag_type NOT IN ('app', 'jobClass'))") } + query = query.OrderBy("job.cluster ASC", "job.subcluster ASC", "job.project ASC", "job.start_time ASC") + rows, err := query.RunWith(r.stmtCache).Query() if err != nil { - cclog.Error("Error while running query") - return nil, err + cclog.Errorf("Error while running FindJobsBetween query: %v", err) + return nil, fmt.Errorf("failed to find jobs between %d and %d: %w", startTimeBegin, startTimeEnd, err) } defer rows.Close() @@ -674,16 +871,24 @@ func (r *JobRepository) FindJobsBetween(startTimeBegin int64, startTimeEnd int64 for rows.Next() { job, err := scanJob(rows) if err != nil { - cclog.Warn("Error while scanning rows") - return nil, err + cclog.Warnf("Error while scanning rows in FindJobsBetween: %v", err) + return nil, fmt.Errorf("failed to scan job in 
FindJobsBetween: %w", err) } jobs = append(jobs, job) } - cclog.Infof("Return job count %d", len(jobs)) + cclog.Debugf("JobRepository.FindJobsBetween(): Return job count %d (omitTagged: %v)", len(jobs), omitTagged) return jobs, nil } +// UpdateMonitoringStatus updates the monitoring status for a job and invalidates its cache entries. +// Cache invalidation affects both metadata and energy footprint to ensure consistency. +// +// Parameters: +// - job: Database ID of the job to update +// - monitoringStatus: New monitoring status value (see schema.MonitoringStatus constants) +// +// Returns an error if the database update fails. func (r *JobRepository) UpdateMonitoringStatus(job int64, monitoringStatus int32) (err error) { // Invalidate cache entries as monitoring status affects job state r.cache.Del(fmt.Sprintf("metadata:%d", job)) @@ -693,18 +898,37 @@ func (r *JobRepository) UpdateMonitoringStatus(job int64, monitoringStatus int32 Set("monitoring_status", monitoringStatus). Where("job.id = ?", job) - _, err = stmt.RunWith(r.stmtCache).Exec() - return err + if _, err = stmt.RunWith(r.stmtCache).Exec(); err != nil { + cclog.Errorf("Error while updating monitoring status for job %d: %v", job, err) + return fmt.Errorf("failed to update monitoring status for job %d: %w", job, err) + } + return nil } +// Execute runs a Squirrel UpdateBuilder statement against the database. +// This is a generic helper for executing pre-built update queries. +// +// Parameters: +// - stmt: Squirrel UpdateBuilder with prepared update query +// +// Returns an error if the execution fails. func (r *JobRepository) Execute(stmt sq.UpdateBuilder) error { if _, err := stmt.RunWith(r.stmtCache).Exec(); err != nil { - return err + cclog.Errorf("Error while executing statement: %v", err) + return fmt.Errorf("failed to execute update statement: %w", err) } return nil } +// MarkArchived adds monitoring status update to an existing UpdateBuilder statement. 
+// This is a builder helper used when constructing multi-field update queries. +// +// Parameters: +// - stmt: Existing UpdateBuilder to modify +// - monitoringStatus: Monitoring status value to set +// +// Returns the modified UpdateBuilder for method chaining. func (r *JobRepository) MarkArchived( stmt sq.UpdateBuilder, monitoringStatus int32, @@ -712,11 +936,22 @@ func (r *JobRepository) MarkArchived( return stmt.Set("monitoring_status", monitoringStatus) } +// UpdateEnergy calculates and updates the energy consumption for a job. +// This is called for running jobs during intermediate updates or when archiving. +// +// Energy calculation formula: +// - For "power" metrics: Energy (kWh) = (Power_avg * NumNodes * Duration_hours) / 1000 +// - For "energy" metrics: Currently not implemented (would need sum statistics) +// +// The calculation accounts for: +// - Multi-node jobs: Multiplies by NumNodes to get total cluster energy +// - Shared jobs: Node average is already based on partial resources, so NumNodes=1 +// - Unit conversion: Watts * hours / 1000 = kilowatt-hours (kWh) +// - Rounding: Results rounded to 2 decimal places func (r *JobRepository) UpdateEnergy( stmt sq.UpdateBuilder, jobMeta *schema.Job, ) (sq.UpdateBuilder, error) { - /* Note: Only Called for Running Jobs during Intermediate Update or on Archiving */ sc, err := archive.GetSubCluster(jobMeta.Cluster, jobMeta.SubCluster) if err != nil { cclog.Errorf("cannot get subcluster: %s", err.Error()) @@ -724,24 +959,27 @@ func (r *JobRepository) UpdateEnergy( } energyFootprint := make(map[string]float64) - // Total Job Energy Outside Loop + // Accumulate total energy across all energy-related metrics totalEnergy := 0.0 for _, fp := range sc.EnergyFootprint { - // Always Init Metric Energy Inside Loop + // Calculate energy for this specific metric metricEnergy := 0.0 if i, err := archive.MetricIndex(sc.MetricConfig, fp); err == nil { - // Note: For DB data, calculate and save as kWh - if 
sc.MetricConfig[i].Energy == "energy" { // this metric has energy as unit (Joules or Wh) + switch sc.MetricConfig[i].Energy { + case "energy": // Metric already in energy units (Joules or Wh) cclog.Warnf("Update EnergyFootprint for Job %d and Metric %s on cluster %s: Set to 'energy' in cluster.json: Not implemented, will return 0.0", jobMeta.JobID, jobMeta.Cluster, fp) - // FIXME: Needs sum as stats type - } else if sc.MetricConfig[i].Energy == "power" { // this metric has power as unit (Watt) - // Energy: Power (in Watts) * Time (in Seconds) - // Unit: (W * (s / 3600)) / 1000 = kWh - // Round 2 Digits: round(Energy * 100) / 100 - // Here: (All-Node Metric Average * Number of Nodes) * (Job Duration in Seconds / 3600) / 1000 - // Note: Shared Jobs handled correctly since "Node Average" is based on partial resources, while "numNodes" factor is 1 + // FIXME: Needs sum as stats type to accumulate energy values over time + case "power": // Metric in power units (Watts) + // Energy (kWh) = Power (W) × Time (h) / 1000 + // Formula: (avg_power_per_node * num_nodes) * (duration_sec / 3600) / 1000 + // + // Breakdown: + // LoadJobStat(jobMeta, fp, "avg") = average power per node (W) + // jobMeta.NumNodes = number of nodes (1 for shared jobs) + // jobMeta.Duration / 3600.0 = duration in hours + // / 1000.0 = convert Wh to kWh rawEnergy := ((LoadJobStat(jobMeta, fp, "avg") * float64(jobMeta.NumNodes)) * (float64(jobMeta.Duration) / 3600.0)) / 1000.0 - metricEnergy = math.Round(rawEnergy*100.0) / 100.0 + metricEnergy = math.Round(rawEnergy*100.0) / 100.0 // Round to 2 decimal places } } else { cclog.Warnf("Error while collecting energy metric %s for job, DB ID '%v', return '0.0'", fp, jobMeta.ID) @@ -749,8 +987,6 @@ func (r *JobRepository) UpdateEnergy( energyFootprint[fp] = metricEnergy totalEnergy += metricEnergy - - // cclog.Infof("Metric %s Average %f -> %f kWh | Job %d Total -> %f kWh", fp, LoadJobStat(jobMeta, fp, "avg"), energy, jobMeta.JobID, totalEnergy) } var 
rawFootprint []byte @@ -762,11 +998,19 @@ func (r *JobRepository) UpdateEnergy( return stmt.Set("energy_footprint", string(rawFootprint)).Set("energy", (math.Round(totalEnergy*100.0) / 100.0)), nil } +// UpdateFootprint calculates and updates the performance footprint for a job. +// This is called for running jobs during intermediate updates or when archiving. +// +// A footprint is a summary statistic (avg/min/max) for each monitored metric. +// The specific statistic type is defined in the cluster config's Footprint field. +// Results are stored as JSON with keys like "metric_avg", "metric_max", etc. +// +// Example: For a "cpu_load" metric with Footprint="avg", this stores +// the average CPU load across all nodes as "cpu_load_avg": 85.3 func (r *JobRepository) UpdateFootprint( stmt sq.UpdateBuilder, jobMeta *schema.Job, ) (sq.UpdateBuilder, error) { - /* Note: Only Called for Running Jobs during Intermediate Update or on Archiving */ sc, err := archive.GetSubCluster(jobMeta.Cluster, jobMeta.SubCluster) if err != nil { cclog.Errorf("cannot get subcluster: %s", err.Error()) @@ -774,7 +1018,10 @@ func (r *JobRepository) UpdateFootprint( } footprint := make(map[string]float64) + // Build footprint map with metric_stattype as keys for _, fp := range sc.Footprint { + // Determine which statistic to use: avg, min, or max + // First check global metric config, then cluster-specific config var statType string for _, gm := range archive.GlobalMetricList { if gm.Name == fp { @@ -782,15 +1029,18 @@ func (r *JobRepository) UpdateFootprint( } } + // Validate statistic type if statType != "avg" && statType != "min" && statType != "max" { cclog.Warnf("unknown statType for footprint update: %s", statType) return stmt, fmt.Errorf("unknown statType for footprint update: %s", statType) } + // Override with cluster-specific config if available if i, err := archive.MetricIndex(sc.MetricConfig, fp); err != nil { statType = sc.MetricConfig[i].Footprint } + // Store as 
"metric_stattype": value (e.g., "cpu_load_avg": 85.3) name := fmt.Sprintf("%s_%s", fp, statType) footprint[name] = LoadJobStat(jobMeta, fp, statType) } @@ -803,3 +1053,84 @@ func (r *JobRepository) UpdateFootprint( return stmt.Set("footprint", string(rawFootprint)), nil } + +// GetUsedNodes returns a map of cluster names to sorted lists of unique hostnames +// that are currently in use by jobs that started before the given timestamp and +// are still in running state. +// +// The timestamp parameter (ts) is compared against job.start_time to find +// relevant jobs. Returns an error if the database query fails or row iteration +// encounters errors. Individual row parsing errors are logged but don't fail +// the entire operation. +func (r *JobRepository) GetUsedNodes(ts int64) (map[string][]string, error) { + // Note: Query expects index on (job_state, start_time) for optimal performance + q := sq.Select("job.cluster", "job.resources").From("job"). + Where("job.start_time < ?", ts). + Where(sq.Eq{"job.job_state": "running"}) + + rows, err := q.RunWith(r.stmtCache).Query() + if err != nil { + queryString, queryVars, _ := q.ToSql() + return nil, fmt.Errorf("query failed [%s] %v: %w", queryString, queryVars, err) + } + defer rows.Close() + + // Use a map of sets for efficient deduplication + nodeSet := make(map[string]map[string]struct{}) + + var ( + cluster string + rawResources []byte + resources []*schema.Resource + skippedRows int + ) + + for rows.Next() { + if err := rows.Scan(&cluster, &rawResources); err != nil { + cclog.Warnf("Error scanning job row in GetUsedNodes: %v", err) + skippedRows++ + continue + } + + resources = resources[:0] // Clear slice, keep capacity + if err := json.Unmarshal(rawResources, &resources); err != nil { + cclog.Warnf("Error unmarshaling resources for cluster %s: %v", cluster, err) + skippedRows++ + continue + } + + if len(resources) == 0 { + cclog.Debugf("Job in cluster %s has no resources", cluster) + continue + } + + if _, ok := 
nodeSet[cluster]; !ok { + nodeSet[cluster] = make(map[string]struct{}) + } + + for _, res := range resources { + nodeSet[cluster][res.Hostname] = struct{}{} + } + } + + if err := rows.Err(); err != nil { + return nil, fmt.Errorf("error iterating rows: %w", err) + } + + if skippedRows > 0 { + cclog.Warnf("GetUsedNodes: Skipped %d rows due to parsing errors", skippedRows) + } + + // Convert sets to sorted slices + nodeList := make(map[string][]string, len(nodeSet)) + for cluster, nodes := range nodeSet { + list := make([]string, 0, len(nodes)) + for node := range nodes { + list = append(list, node) + } + sort.Strings(list) + nodeList[cluster] = list + } + + return nodeList, nil +} diff --git a/internal/repository/jobCreate.go b/internal/repository/jobCreate.go index 2fcc69e9..07c8ce11 100644 --- a/internal/repository/jobCreate.go +++ b/internal/repository/jobCreate.go @@ -2,14 +2,15 @@ // All rights reserved. This file is part of cc-backend. // Use of this source code is governed by a MIT-style // license that can be found in the LICENSE file. + package repository import ( "encoding/json" "fmt" - cclog "github.com/ClusterCockpit/cc-lib/ccLogger" - "github.com/ClusterCockpit/cc-lib/schema" + cclog "github.com/ClusterCockpit/cc-lib/v2/ccLogger" + "github.com/ClusterCockpit/cc-lib/v2/schema" sq "github.com/Masterminds/squirrel" ) @@ -29,6 +30,27 @@ const NamedJobInsert string = `INSERT INTO job ( :shared, :monitoring_status, :smt, :job_state, :start_time, :duration, :walltime, :footprint, :energy, :energy_footprint, :resources, :meta_data );` +// InsertJobDirect inserts a job directly into the job table (not job_cache). +// Use this when the returned ID will be used for operations on the job table +// (e.g., adding tags), or for imported jobs that are already completed. 
+func (r *JobRepository) InsertJobDirect(job *schema.Job) (int64, error) { + r.Mutex.Lock() + defer r.Mutex.Unlock() + + res, err := r.DB.NamedExec(NamedJobInsert, job) + if err != nil { + cclog.Warn("Error while NamedJobInsert (direct)") + return 0, err + } + id, err := res.LastInsertId() + if err != nil { + cclog.Warn("Error while getting last insert ID (direct)") + return 0, err + } + + return id, nil +} + func (r *JobRepository) InsertJob(job *schema.Job) (int64, error) { r.Mutex.Lock() defer r.Mutex.Unlock() @@ -70,8 +92,9 @@ func (r *JobRepository) SyncJobs() ([]*schema.Job, error) { jobs = append(jobs, job) } + // Use INSERT OR IGNORE to skip jobs already transferred by the stop path _, err = r.DB.Exec( - "INSERT INTO job (job_id, cluster, subcluster, start_time, hpc_user, project, cluster_partition, array_job_id, num_nodes, num_hwthreads, num_acc, shared, monitoring_status, smt, job_state, duration, walltime, footprint, energy, energy_footprint, resources, meta_data) SELECT job_id, cluster, subcluster, start_time, hpc_user, project, cluster_partition, array_job_id, num_nodes, num_hwthreads, num_acc, shared, monitoring_status, smt, job_state, duration, walltime, footprint, energy, energy_footprint, resources, meta_data FROM job_cache") + "INSERT OR IGNORE INTO job (job_id, cluster, subcluster, start_time, hpc_user, project, cluster_partition, array_job_id, num_nodes, num_hwthreads, num_acc, shared, monitoring_status, smt, job_state, duration, walltime, footprint, energy, energy_footprint, resources, meta_data) SELECT job_id, cluster, subcluster, start_time, hpc_user, project, cluster_partition, array_job_id, num_nodes, num_hwthreads, num_acc, shared, monitoring_status, smt, job_state, duration, walltime, footprint, energy, energy_footprint, resources, meta_data FROM job_cache") if err != nil { cclog.Warnf("Error while Job sync: %v", err) return nil, err @@ -83,9 +106,48 @@ func (r *JobRepository) SyncJobs() ([]*schema.Job, error) { return nil, err } + // 
Resolve correct job.id from the job table. The IDs read from job_cache + // are from a different auto-increment sequence and must not be used to + // query the job table. + for _, job := range jobs { + var newID int64 + if err := sq.Select("job.id").From("job"). + Where("job.job_id = ? AND job.cluster = ? AND job.start_time = ?", + job.JobID, job.Cluster, job.StartTime). + RunWith(r.stmtCache).QueryRow().Scan(&newID); err != nil { + cclog.Warnf("SyncJobs: could not resolve job table id for job %d on %s: %v", + job.JobID, job.Cluster, err) + continue + } + job.ID = &newID + } + return jobs, nil } +// TransferCachedJobToMain moves a job from job_cache to the job table. +// Caller must hold r.Mutex. Returns the new job table ID. +func (r *JobRepository) TransferCachedJobToMain(cacheID int64) (int64, error) { + res, err := r.DB.Exec( + "INSERT INTO job (job_id, cluster, subcluster, start_time, hpc_user, project, cluster_partition, array_job_id, num_nodes, num_hwthreads, num_acc, shared, monitoring_status, smt, job_state, duration, walltime, footprint, energy, energy_footprint, resources, meta_data) SELECT job_id, cluster, subcluster, start_time, hpc_user, project, cluster_partition, array_job_id, num_nodes, num_hwthreads, num_acc, shared, monitoring_status, smt, job_state, duration, walltime, footprint, energy, energy_footprint, resources, meta_data FROM job_cache WHERE id = ?", + cacheID) + if err != nil { + return 0, fmt.Errorf("transferring cached job %d to main table failed: %w", cacheID, err) + } + + newID, err := res.LastInsertId() + if err != nil { + return 0, fmt.Errorf("getting new job ID after transfer failed: %w", err) + } + + _, err = r.DB.Exec("DELETE FROM job_cache WHERE id = ?", cacheID) + if err != nil { + return 0, fmt.Errorf("deleting cached job %d after transfer failed: %w", cacheID, err) + } + + return newID, nil +} + // Start inserts a new job in the table, returning the unique job ID. // Statistics are not transfered! 
func (r *JobRepository) Start(job *schema.Job) (id int64, err error) { @@ -107,41 +169,46 @@ func (r *JobRepository) Start(job *schema.Job) (id int64, err error) { return r.InsertJob(job) } +// StartDirect inserts a new job directly into the job table (not job_cache). +// Use this when the returned ID will immediately be used for job table +// operations such as adding tags. +func (r *JobRepository) StartDirect(job *schema.Job) (id int64, err error) { + job.RawFootprint, err = json.Marshal(job.Footprint) + if err != nil { + return -1, fmt.Errorf("REPOSITORY/JOB > encoding footprint field failed: %w", err) + } + + job.RawResources, err = json.Marshal(job.Resources) + if err != nil { + return -1, fmt.Errorf("REPOSITORY/JOB > encoding resources field failed: %w", err) + } + + job.RawMetaData, err = json.Marshal(job.MetaData) + if err != nil { + return -1, fmt.Errorf("REPOSITORY/JOB > encoding metaData field failed: %w", err) + } + + return r.InsertJobDirect(job) +} + // Stop updates the job with the database id jobId using the provided arguments. func (r *JobRepository) Stop( - jobId int64, + jobID int64, duration int32, state schema.JobState, monitoringStatus int32, ) (err error) { // Invalidate cache entries as job state is changing - r.cache.Del(fmt.Sprintf("metadata:%d", jobId)) - r.cache.Del(fmt.Sprintf("energyFootprint:%d", jobId)) + r.cache.Del(fmt.Sprintf("metadata:%d", jobID)) + r.cache.Del(fmt.Sprintf("energyFootprint:%d", jobID)) stmt := sq.Update("job"). Set("job_state", state). Set("duration", duration). Set("monitoring_status", monitoringStatus). 
- Where("job.id = ?", jobId) + Where("job.id = ?", jobID) _, err = stmt.RunWith(r.stmtCache).Exec() return err } -func (r *JobRepository) StopCached( - jobId int64, - duration int32, - state schema.JobState, - monitoringStatus int32, -) (err error) { - // Note: StopCached updates job_cache table, not the main job table - // Cache invalidation happens when job is synced to main table - stmt := sq.Update("job_cache"). - Set("job_state", state). - Set("duration", duration). - Set("monitoring_status", monitoringStatus). - Where("job_cache.id = ?", jobId) - - _, err = stmt.RunWith(r.stmtCache).Exec() - return err -} diff --git a/internal/repository/jobCreate_test.go b/internal/repository/jobCreate_test.go new file mode 100644 index 00000000..3f2ee6fa --- /dev/null +++ b/internal/repository/jobCreate_test.go @@ -0,0 +1,607 @@ +// Copyright (C) NHR@FAU, University Erlangen-Nuremberg. +// All rights reserved. This file is part of cc-backend. +// Use of this source code is governed by a MIT-style +// license that can be found in the LICENSE file. 
+package repository + +import ( + "encoding/json" + "testing" + + "github.com/ClusterCockpit/cc-lib/v2/schema" + _ "github.com/mattn/go-sqlite3" + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" +) + +// createTestJob creates a minimal valid job for testing +func createTestJob(jobID int64, cluster string) *schema.Job { + return &schema.Job{ + JobID: jobID, + User: "testuser", + Project: "testproject", + Cluster: cluster, + SubCluster: "main", + Partition: "batch", + NumNodes: 1, + NumHWThreads: 4, + NumAcc: 0, + Shared: "none", + MonitoringStatus: schema.MonitoringStatusRunningOrArchiving, + SMT: 1, + State: schema.JobStateRunning, + StartTime: 1234567890, + Duration: 0, + Walltime: 3600, + Resources: []*schema.Resource{ + { + Hostname: "node01", + HWThreads: []int{0, 1, 2, 3}, + }, + }, + Footprint: map[string]float64{ + "cpu_load": 50.0, + "mem_used": 8000.0, + "flops_any": 0.5, + "mem_bw": 10.0, + "net_bw": 2.0, + "file_bw": 1.0, + "cpu_used": 2.0, + "cpu_load_core": 12.5, + }, + MetaData: map[string]string{ + "jobName": "test_job", + "queue": "normal", + "qosName": "default", + "accountName": "testaccount", + }, + } +} + +func TestInsertJob(t *testing.T) { + r := setup(t) + + t.Run("successful insertion", func(t *testing.T) { + job := createTestJob(999001, "testcluster") + job.RawResources, _ = json.Marshal(job.Resources) + job.RawFootprint, _ = json.Marshal(job.Footprint) + job.RawMetaData, _ = json.Marshal(job.MetaData) + + id, err := r.InsertJob(job) + require.NoError(t, err, "InsertJob should succeed") + assert.Greater(t, id, int64(0), "Should return valid insert ID") + + // Verify job was inserted into job_cache + var count int + err = r.DB.QueryRow("SELECT COUNT(*) FROM job_cache WHERE job_id = ? AND cluster = ?", + job.JobID, job.Cluster).Scan(&count) + require.NoError(t, err) + assert.Equal(t, 1, count, "Job should be in job_cache table") + + // Clean up + _, err = r.DB.Exec("DELETE FROM job_cache WHERE job_id = ? 
AND cluster = ?", job.JobID, job.Cluster) + require.NoError(t, err) + }) + + t.Run("insertion with all fields", func(t *testing.T) { + job := createTestJob(999002, "testcluster") + job.ArrayJobID = 5000 + job.Energy = 1500.5 + job.RawResources, _ = json.Marshal(job.Resources) + job.RawFootprint, _ = json.Marshal(job.Footprint) + job.RawMetaData, _ = json.Marshal(job.MetaData) + + id, err := r.InsertJob(job) + require.NoError(t, err) + assert.Greater(t, id, int64(0)) + + // Verify all fields were stored correctly + var retrievedJob schema.Job + err = r.DB.QueryRow(`SELECT job_id, hpc_user, project, cluster, array_job_id, energy + FROM job_cache WHERE id = ?`, id).Scan( + &retrievedJob.JobID, &retrievedJob.User, &retrievedJob.Project, + &retrievedJob.Cluster, &retrievedJob.ArrayJobID, &retrievedJob.Energy) + require.NoError(t, err) + assert.Equal(t, job.JobID, retrievedJob.JobID) + assert.Equal(t, job.User, retrievedJob.User) + assert.Equal(t, job.Project, retrievedJob.Project) + assert.Equal(t, job.Cluster, retrievedJob.Cluster) + assert.Equal(t, job.ArrayJobID, retrievedJob.ArrayJobID) + assert.Equal(t, job.Energy, retrievedJob.Energy) + + // Clean up + _, err = r.DB.Exec("DELETE FROM job_cache WHERE id = ?", id) + require.NoError(t, err) + }) +} + +func TestStart(t *testing.T) { + r := setup(t) + + t.Run("successful job start with JSON encoding", func(t *testing.T) { + job := createTestJob(999003, "testcluster") + + id, err := r.Start(job) + require.NoError(t, err, "Start should succeed") + assert.Greater(t, id, int64(0), "Should return valid insert ID") + + // Verify job was inserted and JSON fields were encoded + var rawResources, rawFootprint, rawMetaData []byte + err = r.DB.QueryRow(`SELECT resources, footprint, meta_data FROM job_cache WHERE id = ?`, id).Scan( + &rawResources, &rawFootprint, &rawMetaData) + require.NoError(t, err) + + // Verify resources JSON + var resources []*schema.Resource + err = json.Unmarshal(rawResources, &resources) + 
require.NoError(t, err, "Resources should be valid JSON") + assert.Equal(t, 1, len(resources)) + assert.Equal(t, "node01", resources[0].Hostname) + + // Verify footprint JSON + var footprint map[string]float64 + err = json.Unmarshal(rawFootprint, &footprint) + require.NoError(t, err, "Footprint should be valid JSON") + assert.Equal(t, 50.0, footprint["cpu_load"]) + assert.Equal(t, 8000.0, footprint["mem_used"]) + + // Verify metadata JSON + var metaData map[string]string + err = json.Unmarshal(rawMetaData, &metaData) + require.NoError(t, err, "MetaData should be valid JSON") + assert.Equal(t, "test_job", metaData["jobName"]) + + // Clean up + _, err = r.DB.Exec("DELETE FROM job_cache WHERE id = ?", id) + require.NoError(t, err) + }) + + t.Run("job start with empty footprint", func(t *testing.T) { + job := createTestJob(999004, "testcluster") + job.Footprint = map[string]float64{} + + id, err := r.Start(job) + require.NoError(t, err) + assert.Greater(t, id, int64(0)) + + // Verify empty footprint was encoded as empty JSON object + var rawFootprint []byte + err = r.DB.QueryRow(`SELECT footprint FROM job_cache WHERE id = ?`, id).Scan(&rawFootprint) + require.NoError(t, err) + assert.Equal(t, []byte("{}"), rawFootprint) + + // Clean up + _, err = r.DB.Exec("DELETE FROM job_cache WHERE id = ?", id) + require.NoError(t, err) + }) + + t.Run("job start with nil metadata", func(t *testing.T) { + job := createTestJob(999005, "testcluster") + job.MetaData = nil + + id, err := r.Start(job) + require.NoError(t, err) + assert.Greater(t, id, int64(0)) + + // Clean up + _, err = r.DB.Exec("DELETE FROM job_cache WHERE id = ?", id) + require.NoError(t, err) + }) +} + +func TestStop(t *testing.T) { + r := setup(t) + + t.Run("successful job stop", func(t *testing.T) { + // First insert a job using Start + job := createTestJob(999106, "testcluster") + id, err := r.Start(job) + require.NoError(t, err) + + // Move from job_cache to job table (simulate SyncJobs) - exclude id to let it 
auto-increment + _, err = r.DB.Exec(`INSERT INTO job (job_id, cluster, subcluster, submit_time, start_time, hpc_user, project, + cluster_partition, array_job_id, duration, walltime, job_state, meta_data, resources, num_nodes, + num_hwthreads, num_acc, smt, shared, monitoring_status, energy, energy_footprint, footprint) + SELECT job_id, cluster, subcluster, submit_time, start_time, hpc_user, project, + cluster_partition, array_job_id, duration, walltime, job_state, meta_data, resources, num_nodes, + num_hwthreads, num_acc, smt, shared, monitoring_status, energy, energy_footprint, footprint + FROM job_cache WHERE id = ?`, id) + require.NoError(t, err) + _, err = r.DB.Exec("DELETE FROM job_cache WHERE id = ?", id) + require.NoError(t, err) + + // Get the new job id in the job table + err = r.DB.QueryRow("SELECT id FROM job WHERE job_id = ? AND cluster = ? AND start_time = ?", + job.JobID, job.Cluster, job.StartTime).Scan(&id) + require.NoError(t, err) + + // Stop the job + duration := int32(3600) + state := schema.JobStateCompleted + monitoringStatus := int32(schema.MonitoringStatusArchivingSuccessful) + + err = r.Stop(id, duration, state, monitoringStatus) + require.NoError(t, err, "Stop should succeed") + + // Verify job was updated + var retrievedDuration int32 + var retrievedState string + var retrievedMonStatus int32 + err = r.DB.QueryRow(`SELECT duration, job_state, monitoring_status FROM job WHERE id = ?`, id).Scan( + &retrievedDuration, &retrievedState, &retrievedMonStatus) + require.NoError(t, err) + assert.Equal(t, duration, retrievedDuration) + assert.Equal(t, string(state), retrievedState) + assert.Equal(t, monitoringStatus, retrievedMonStatus) + + // Clean up + _, err = r.DB.Exec("DELETE FROM job WHERE id = ?", id) + require.NoError(t, err) + }) + + t.Run("stop updates job state transitions", func(t *testing.T) { + // Insert a job + job := createTestJob(999107, "testcluster") + id, err := r.Start(job) + require.NoError(t, err) + + // Move to job table + 
_, err = r.DB.Exec(`INSERT INTO job (job_id, cluster, subcluster, submit_time, start_time, hpc_user, project, + cluster_partition, array_job_id, duration, walltime, job_state, meta_data, resources, num_nodes, + num_hwthreads, num_acc, smt, shared, monitoring_status, energy, energy_footprint, footprint) + SELECT job_id, cluster, subcluster, submit_time, start_time, hpc_user, project, + cluster_partition, array_job_id, duration, walltime, job_state, meta_data, resources, num_nodes, + num_hwthreads, num_acc, smt, shared, monitoring_status, energy, energy_footprint, footprint + FROM job_cache WHERE id = ?`, id) + require.NoError(t, err) + _, err = r.DB.Exec("DELETE FROM job_cache WHERE id = ?", id) + require.NoError(t, err) + + // Get the new job id in the job table + err = r.DB.QueryRow("SELECT id FROM job WHERE job_id = ? AND cluster = ? AND start_time = ?", + job.JobID, job.Cluster, job.StartTime).Scan(&id) + require.NoError(t, err) + + // Stop the job with different duration + err = r.Stop(id, 7200, schema.JobStateCompleted, int32(schema.MonitoringStatusArchivingSuccessful)) + require.NoError(t, err) + + // Verify the duration was updated correctly + var duration int32 + err = r.DB.QueryRow(`SELECT duration FROM job WHERE id = ?`, id).Scan(&duration) + require.NoError(t, err) + assert.Equal(t, int32(7200), duration, "Duration should be updated to 7200") + + // Clean up + _, err = r.DB.Exec("DELETE FROM job WHERE id = ?", id) + require.NoError(t, err) + }) + + t.Run("stop with different states", func(t *testing.T) { + testCases := []struct { + name string + jobID int64 + state schema.JobState + monitoringStatus int32 + }{ + {"completed", 999108, schema.JobStateCompleted, int32(schema.MonitoringStatusArchivingSuccessful)}, + {"failed", 999118, schema.JobStateFailed, int32(schema.MonitoringStatusArchivingSuccessful)}, + {"cancelled", 999119, schema.JobStateCancelled, int32(schema.MonitoringStatusArchivingSuccessful)}, + {"timeout", 999120, schema.JobStateTimeout, 
int32(schema.MonitoringStatusArchivingSuccessful)}, + } + + for _, tc := range testCases { + t.Run(tc.name, func(t *testing.T) { + job := createTestJob(tc.jobID, "testcluster") + id, err := r.Start(job) + require.NoError(t, err) + + // Move to job table + _, err = r.DB.Exec(`INSERT INTO job (job_id, cluster, subcluster, submit_time, start_time, hpc_user, project, + cluster_partition, array_job_id, duration, walltime, job_state, meta_data, resources, num_nodes, + num_hwthreads, num_acc, smt, shared, monitoring_status, energy, energy_footprint, footprint) + SELECT job_id, cluster, subcluster, submit_time, start_time, hpc_user, project, + cluster_partition, array_job_id, duration, walltime, job_state, meta_data, resources, num_nodes, + num_hwthreads, num_acc, smt, shared, monitoring_status, energy, energy_footprint, footprint + FROM job_cache WHERE id = ?`, id) + require.NoError(t, err) + _, err = r.DB.Exec("DELETE FROM job_cache WHERE id = ?", id) + require.NoError(t, err) + + // Get the new job id in the job table + err = r.DB.QueryRow("SELECT id FROM job WHERE job_id = ? AND cluster = ? 
AND start_time = ?", + job.JobID, job.Cluster, job.StartTime).Scan(&id) + require.NoError(t, err) + + // Stop with specific state + err = r.Stop(id, 1800, tc.state, tc.monitoringStatus) + require.NoError(t, err) + + // Verify state was set correctly + var retrievedState string + err = r.DB.QueryRow(`SELECT job_state FROM job WHERE id = ?`, id).Scan(&retrievedState) + require.NoError(t, err) + assert.Equal(t, string(tc.state), retrievedState) + + // Clean up + _, err = r.DB.Exec("DELETE FROM job WHERE id = ?", id) + require.NoError(t, err) + }) + } + }) +} + +func TestTransferCachedJobToMain(t *testing.T) { + r := setup(t) + + t.Run("successful transfer from cache to main", func(t *testing.T) { + // Insert a job in job_cache + job := createTestJob(999009, "testcluster") + cacheID, err := r.Start(job) + require.NoError(t, err) + + // Transfer the cached job to the main table + r.Mutex.Lock() + newID, err := r.TransferCachedJobToMain(cacheID) + r.Mutex.Unlock() + require.NoError(t, err, "TransferCachedJobToMain should succeed") + assert.NotEqual(t, cacheID, newID, "New ID should differ from cache ID") + + // Verify job exists in job table + var count int + err = r.DB.QueryRow(`SELECT COUNT(*) FROM job WHERE id = ?`, newID).Scan(&count) + require.NoError(t, err) + assert.Equal(t, 1, count, "Job should exist in main table") + + // Verify job was removed from job_cache + err = r.DB.QueryRow(`SELECT COUNT(*) FROM job_cache WHERE id = ?`, cacheID).Scan(&count) + require.NoError(t, err) + assert.Equal(t, 0, count, "Job should be removed from cache") + + // Clean up + _, err = r.DB.Exec("DELETE FROM job WHERE id = ?", newID) + require.NoError(t, err) + }) + + t.Run("transfer preserves job data", func(t *testing.T) { + // Insert a job in job_cache + job := createTestJob(999010, "testcluster") + cacheID, err := r.Start(job) + require.NoError(t, err) + + // Transfer the cached job + r.Mutex.Lock() + newID, err := r.TransferCachedJobToMain(cacheID) + r.Mutex.Unlock() + 
require.NoError(t, err) + + // Verify the transferred job has the correct data + var jobID int64 + var cluster string + err = r.DB.QueryRow(`SELECT job_id, cluster FROM job WHERE id = ?`, newID).Scan(&jobID, &cluster) + require.NoError(t, err) + assert.Equal(t, job.JobID, jobID) + assert.Equal(t, job.Cluster, cluster) + + // Clean up + _, err = r.DB.Exec("DELETE FROM job WHERE id = ?", newID) + require.NoError(t, err) + }) +} + +func TestSyncJobs(t *testing.T) { + r := setup(t) + + t.Run("sync jobs from cache to main table", func(t *testing.T) { + // Ensure cache is empty first + _, err := r.DB.Exec("DELETE FROM job_cache") + require.NoError(t, err) + + // Insert multiple jobs in job_cache + job1 := createTestJob(999011, "testcluster") + job2 := createTestJob(999012, "testcluster") + job3 := createTestJob(999013, "testcluster") + + _, err = r.Start(job1) + require.NoError(t, err) + _, err = r.Start(job2) + require.NoError(t, err) + _, err = r.Start(job3) + require.NoError(t, err) + + // Verify jobs are in job_cache + var cacheCount int + err = r.DB.QueryRow("SELECT COUNT(*) FROM job_cache WHERE job_id IN (?, ?, ?)", + job1.JobID, job2.JobID, job3.JobID).Scan(&cacheCount) + require.NoError(t, err) + assert.Equal(t, 3, cacheCount, "All jobs should be in job_cache") + + // Sync jobs + jobs, err := r.SyncJobs() + require.NoError(t, err, "SyncJobs should succeed") + assert.Equal(t, 3, len(jobs), "Should return 3 synced jobs") + + // Verify jobs were moved to job table + var jobCount int + err = r.DB.QueryRow("SELECT COUNT(*) FROM job WHERE job_id IN (?, ?, ?)", + job1.JobID, job2.JobID, job3.JobID).Scan(&jobCount) + require.NoError(t, err) + assert.Equal(t, 3, jobCount, "All jobs should be in job table") + + // Verify job_cache was cleared + err = r.DB.QueryRow("SELECT COUNT(*) FROM job_cache WHERE job_id IN (?, ?, ?)", + job1.JobID, job2.JobID, job3.JobID).Scan(&cacheCount) + require.NoError(t, err) + assert.Equal(t, 0, cacheCount, "job_cache should be empty after 
sync") + + // Clean up + _, err = r.DB.Exec("DELETE FROM job WHERE job_id IN (?, ?, ?)", job1.JobID, job2.JobID, job3.JobID) + require.NoError(t, err) + }) + + t.Run("sync preserves job data", func(t *testing.T) { + // Ensure cache is empty first + _, err := r.DB.Exec("DELETE FROM job_cache") + require.NoError(t, err) + + // Insert a job with specific data + job := createTestJob(999014, "testcluster") + job.ArrayJobID = 7777 + job.Energy = 2500.75 + job.Duration = 1800 + + id, err := r.Start(job) + require.NoError(t, err) + + // Update some fields to simulate job progress + result, err := r.DB.Exec(`UPDATE job_cache SET duration = ?, energy = ? WHERE id = ?`, + 3600, 3000.5, id) + require.NoError(t, err) + rowsAffected, _ := result.RowsAffected() + require.Equal(t, int64(1), rowsAffected, "UPDATE should affect exactly 1 row") + + // Verify the update worked + var checkDuration int32 + var checkEnergy float64 + err = r.DB.QueryRow(`SELECT duration, energy FROM job_cache WHERE id = ?`, id).Scan(&checkDuration, &checkEnergy) + require.NoError(t, err) + require.Equal(t, int32(3600), checkDuration, "Duration should be updated to 3600 before sync") + require.Equal(t, 3000.5, checkEnergy, "Energy should be updated to 3000.5 before sync") + + // Sync jobs + jobs, err := r.SyncJobs() + require.NoError(t, err) + require.Equal(t, 1, len(jobs), "Should return exactly 1 synced job") + + // Verify in database + var dbJob schema.Job + err = r.DB.QueryRow(`SELECT job_id, hpc_user, project, cluster, array_job_id, duration, energy + FROM job WHERE job_id = ? AND cluster = ?`, job.JobID, job.Cluster).Scan( + &dbJob.JobID, &dbJob.User, &dbJob.Project, &dbJob.Cluster, + &dbJob.ArrayJobID, &dbJob.Duration, &dbJob.Energy) + require.NoError(t, err) + assert.Equal(t, job.JobID, dbJob.JobID) + assert.Equal(t, int32(3600), dbJob.Duration) + assert.Equal(t, 3000.5, dbJob.Energy) + + // Clean up + _, err = r.DB.Exec("DELETE FROM job WHERE job_id = ? 
AND cluster = ?", job.JobID, job.Cluster) + require.NoError(t, err) + }) + + t.Run("sync returns job table IDs not cache IDs", func(t *testing.T) { + // Ensure cache is empty first + _, err := r.DB.Exec("DELETE FROM job_cache") + require.NoError(t, err) + + // Insert a job into job_cache + job := createTestJob(999015, "testcluster") + cacheID, err := r.Start(job) + require.NoError(t, err) + + // Sync jobs + jobs, err := r.SyncJobs() + require.NoError(t, err) + require.Equal(t, 1, len(jobs)) + + // The returned ID must refer to the job table, not job_cache + var jobTableID int64 + err = r.DB.QueryRow("SELECT id FROM job WHERE job_id = ? AND cluster = ? AND start_time = ?", + jobs[0].JobID, jobs[0].Cluster, jobs[0].StartTime).Scan(&jobTableID) + require.NoError(t, err) + assert.Equal(t, jobTableID, *jobs[0].ID, + "returned ID should match the job table row, not the cache ID (%d)", cacheID) + + // Clean up + _, err = r.DB.Exec("DELETE FROM job WHERE job_id = ? AND cluster = ?", job.JobID, job.Cluster) + require.NoError(t, err) + }) + + t.Run("sync with empty cache returns empty list", func(t *testing.T) { + // Ensure cache is empty + _, err := r.DB.Exec("DELETE FROM job_cache") + require.NoError(t, err) + + // Sync should return empty list + jobs, err := r.SyncJobs() + require.NoError(t, err) + assert.Equal(t, 0, len(jobs), "Should return empty list when cache is empty") + }) +} + +func TestInsertJobDirect(t *testing.T) { + r := setup(t) + + t.Run("inserts into job table not cache", func(t *testing.T) { + job := createTestJob(999020, "testcluster") + job.RawResources, _ = json.Marshal(job.Resources) + job.RawFootprint, _ = json.Marshal(job.Footprint) + job.RawMetaData, _ = json.Marshal(job.MetaData) + + id, err := r.InsertJobDirect(job) + require.NoError(t, err, "InsertJobDirect should succeed") + assert.Greater(t, id, int64(0), "Should return valid insert ID") + + // Verify job is in job table + var count int + err = r.DB.QueryRow("SELECT COUNT(*) FROM job WHERE id = 
?", id).Scan(&count) + require.NoError(t, err) + assert.Equal(t, 1, count, "Job should be in job table") + + // Verify job is NOT in job_cache + err = r.DB.QueryRow("SELECT COUNT(*) FROM job_cache WHERE job_id = ? AND cluster = ?", + job.JobID, job.Cluster).Scan(&count) + require.NoError(t, err) + assert.Equal(t, 0, count, "Job should NOT be in job_cache") + + // Clean up + _, err = r.DB.Exec("DELETE FROM job WHERE id = ?", id) + require.NoError(t, err) + }) + + t.Run("returned ID works for tag operations", func(t *testing.T) { + job := createTestJob(999021, "testcluster") + job.RawResources, _ = json.Marshal(job.Resources) + job.RawFootprint, _ = json.Marshal(job.Footprint) + job.RawMetaData, _ = json.Marshal(job.MetaData) + + id, err := r.InsertJobDirect(job) + require.NoError(t, err) + + // Adding a tag using the returned ID should succeed (FK constraint on jobtag) + err = r.ImportTag(id, "test_type", "test_name", "global") + require.NoError(t, err, "ImportTag should succeed with direct insert ID") + + // Clean up + _, err = r.DB.Exec("DELETE FROM jobtag WHERE job_id = ?", id) + require.NoError(t, err) + _, err = r.DB.Exec("DELETE FROM job WHERE id = ?", id) + require.NoError(t, err) + }) +} + +func TestStartDirect(t *testing.T) { + r := setup(t) + + t.Run("inserts into job table with JSON encoding", func(t *testing.T) { + job := createTestJob(999022, "testcluster") + + id, err := r.StartDirect(job) + require.NoError(t, err, "StartDirect should succeed") + assert.Greater(t, id, int64(0)) + + // Verify job is in job table with encoded JSON + var rawResources []byte + err = r.DB.QueryRow("SELECT resources FROM job WHERE id = ?", id).Scan(&rawResources) + require.NoError(t, err) + + var resources []*schema.Resource + err = json.Unmarshal(rawResources, &resources) + require.NoError(t, err, "Resources should be valid JSON") + assert.Equal(t, "node01", resources[0].Hostname) + + // Clean up + _, err = r.DB.Exec("DELETE FROM job WHERE id = ?", id) + require.NoError(t, 
err) + }) +} diff --git a/internal/repository/jobFind.go b/internal/repository/jobFind.go index 11f66c40..13dd4418 100644 --- a/internal/repository/jobFind.go +++ b/internal/repository/jobFind.go @@ -2,6 +2,7 @@ // All rights reserved. This file is part of cc-backend. // Use of this source code is governed by a MIT-style // license that can be found in the LICENSE file. + package repository import ( @@ -11,8 +12,8 @@ import ( "time" "github.com/ClusterCockpit/cc-backend/internal/graph/model" - cclog "github.com/ClusterCockpit/cc-lib/ccLogger" - "github.com/ClusterCockpit/cc-lib/schema" + cclog "github.com/ClusterCockpit/cc-lib/v2/ccLogger" + "github.com/ClusterCockpit/cc-lib/v2/schema" sq "github.com/Masterminds/squirrel" ) @@ -22,13 +23,17 @@ import ( // It returns a pointer to a schema.Job data structure and an error variable. // To check if no job was found test err == sql.ErrNoRows func (r *JobRepository) Find( - jobId *int64, + jobID *int64, cluster *string, startTime *int64, ) (*schema.Job, error) { + if jobID == nil { + return nil, fmt.Errorf("jobID cannot be nil") + } + start := time.Now() q := sq.Select(jobColumns...).From("job"). - Where("job.job_id = ?", *jobId) + Where("job.job_id = ?", *jobID) if cluster != nil { q = q.Where("job.cluster = ?", *cluster) @@ -37,19 +42,29 @@ func (r *JobRepository) Find( q = q.Where("job.start_time = ?", *startTime) } - q = q.OrderBy("job.id DESC") // always use newest matching job by db id if more than one match + q = q.OrderBy("job.id DESC").Limit(1) // always use newest matching job by db id if more than one match cclog.Debugf("Timer Find %s", time.Since(start)) return scanJob(q.RunWith(r.stmtCache).QueryRow()) } +// FindCached executes a SQL query to find a specific batch job from the job_cache table. +// The job is queried using the batch job id, and optionally filtered by cluster name +// and start time (UNIX epoch time seconds). 
This method uses cached job data which +// may be stale but provides faster access than Find(). +// It returns a pointer to a schema.Job data structure and an error variable. +// To check if no job was found test err == sql.ErrNoRows func (r *JobRepository) FindCached( - jobId *int64, + jobID *int64, cluster *string, startTime *int64, ) (*schema.Job, error) { + if jobID == nil { + return nil, fmt.Errorf("jobID cannot be nil") + } + q := sq.Select(jobCacheColumns...).From("job_cache"). - Where("job_cache.job_id = ?", *jobId) + Where("job_cache.job_id = ?", *jobID) if cluster != nil { q = q.Where("job_cache.cluster = ?", *cluster) @@ -58,24 +73,28 @@ func (r *JobRepository) FindCached( q = q.Where("job_cache.start_time = ?", *startTime) } - q = q.OrderBy("job_cache.id DESC") // always use newest matching job by db id if more than one match + q = q.OrderBy("job_cache.id DESC").Limit(1) // always use newest matching job by db id if more than one match return scanJob(q.RunWith(r.stmtCache).QueryRow()) } -// Find executes a SQL query to find a specific batch job. -// The job is queried using the batch job id, the cluster name, -// and the start time of the job in UNIX epoch time seconds. -// It returns a pointer to a schema.Job data structure and an error variable. -// To check if no job was found test err == sql.ErrNoRows +// FindAll executes a SQL query to find all batch jobs matching the given criteria. +// Jobs are queried using the batch job id, and optionally filtered by cluster name +// and start time (UNIX epoch time seconds). +// It returns a slice of pointers to schema.Job data structures and an error variable. +// An empty slice is returned if no matching jobs are found. func (r *JobRepository) FindAll( - jobId *int64, + jobID *int64, cluster *string, startTime *int64, ) ([]*schema.Job, error) { + if jobID == nil { + return nil, fmt.Errorf("jobID cannot be nil") + } + start := time.Now() q := sq.Select(jobColumns...).From("job"). 
- Where("job.job_id = ?", *jobId) + Where("job.job_id = ?", *jobID) if cluster != nil { q = q.Where("job.cluster = ?", *cluster) @@ -86,8 +105,8 @@ func (r *JobRepository) FindAll( rows, err := q.RunWith(r.stmtCache).Query() if err != nil { - cclog.Error("Error while running query") - return nil, err + cclog.Errorf("Error while running FindAll query for jobID=%d: %v", *jobID, err) + return nil, fmt.Errorf("failed to execute FindAll query: %w", err) } defer rows.Close() @@ -95,8 +114,8 @@ func (r *JobRepository) FindAll( for rows.Next() { job, err := scanJob(rows) if err != nil { - cclog.Warn("Error while scanning rows") - return nil, err + cclog.Warnf("Error while scanning rows in FindAll: %v", err) + return nil, fmt.Errorf("failed to scan job row: %w", err) } jobs = append(jobs, job) } @@ -119,8 +138,8 @@ func (r *JobRepository) GetJobList(limit int, offset int) ([]int64, error) { rows, err := query.RunWith(r.stmtCache).Query() if err != nil { - cclog.Error("Error while running query") - return nil, err + cclog.Errorf("Error while running GetJobList query (limit=%d, offset=%d): %v", limit, offset, err) + return nil, fmt.Errorf("failed to execute GetJobList query: %w", err) } defer rows.Close() @@ -129,23 +148,23 @@ func (r *JobRepository) GetJobList(limit int, offset int) ([]int64, error) { var id int64 err := rows.Scan(&id) if err != nil { - cclog.Warn("Error while scanning rows") - return nil, err + cclog.Warnf("Error while scanning rows in GetJobList: %v", err) + return nil, fmt.Errorf("failed to scan job ID: %w", err) } jl = append(jl, id) } - cclog.Infof("Return job count %d", len(jl)) + cclog.Debugf("JobRepository.GetJobList(): Return job count %d", len(jl)) return jl, nil } -// FindById executes a SQL query to find a specific batch job. +// FindByID executes a SQL query to find a specific batch job. // The job is queried using the database id. // It returns a pointer to a schema.Job data structure and an error variable. 
// To check if no job was found test err == sql.ErrNoRows -func (r *JobRepository) FindById(ctx context.Context, jobId int64) (*schema.Job, error) { +func (r *JobRepository) FindByID(ctx context.Context, jobID int64) (*schema.Job, error) { q := sq.Select(jobColumns...). - From("job").Where("job.id = ?", jobId) + From("job").Where("job.id = ?", jobID) q, qerr := SecurityCheck(ctx, q) if qerr != nil { @@ -155,14 +174,14 @@ func (r *JobRepository) FindById(ctx context.Context, jobId int64) (*schema.Job, return scanJob(q.RunWith(r.stmtCache).QueryRow()) } -// FindByIdWithUser executes a SQL query to find a specific batch job. +// FindByIDWithUser executes a SQL query to find a specific batch job. // The job is queried using the database id. The user is passed directly, // instead as part of the context. // It returns a pointer to a schema.Job data structure and an error variable. // To check if no job was found test err == sql.ErrNoRows -func (r *JobRepository) FindByIdWithUser(user *schema.User, jobId int64) (*schema.Job, error) { +func (r *JobRepository) FindByIDWithUser(user *schema.User, jobID int64) (*schema.Job, error) { q := sq.Select(jobColumns...). - From("job").Where("job.id = ?", jobId) + From("job").Where("job.id = ?", jobID) q, qerr := SecurityCheckWithUser(user, q) if qerr != nil { @@ -172,24 +191,24 @@ func (r *JobRepository) FindByIdWithUser(user *schema.User, jobId int64) (*schem return scanJob(q.RunWith(r.stmtCache).QueryRow()) } -// FindByIdDirect executes a SQL query to find a specific batch job. +// FindByIDDirect executes a SQL query to find a specific batch job. // The job is queried using the database id. // It returns a pointer to a schema.Job data structure and an error variable. // To check if no job was found test err == sql.ErrNoRows -func (r *JobRepository) FindByIdDirect(jobId int64) (*schema.Job, error) { +func (r *JobRepository) FindByIDDirect(jobID int64) (*schema.Job, error) { q := sq.Select(jobColumns...). 
- From("job").Where("job.id = ?", jobId) + From("job").Where("job.id = ?", jobID) return scanJob(q.RunWith(r.stmtCache).QueryRow()) } -// FindByJobId executes a SQL query to find a specific batch job. +// FindByJobID executes a SQL query to find a specific batch job. // The job is queried using the slurm id and the clustername. // It returns a pointer to a schema.Job data structure and an error variable. // To check if no job was found test err == sql.ErrNoRows -func (r *JobRepository) FindByJobId(ctx context.Context, jobId int64, startTime int64, cluster string) (*schema.Job, error) { +func (r *JobRepository) FindByJobID(ctx context.Context, jobID int64, startTime int64, cluster string) (*schema.Job, error) { q := sq.Select(jobColumns...). From("job"). - Where("job.job_id = ?", jobId). + Where("job.job_id = ?", jobID). Where("job.cluster = ?", cluster). Where("job.start_time = ?", startTime) @@ -201,19 +220,22 @@ func (r *JobRepository) FindByJobId(ctx context.Context, jobId int64, startTime return scanJob(q.RunWith(r.stmtCache).QueryRow()) } -// IsJobOwner executes a SQL query to find a specific batch job. -// The job is queried using the slurm id,a username and the cluster. -// It returns a bool. -// If job was found, user is owner: test err != sql.ErrNoRows -func (r *JobRepository) IsJobOwner(jobId int64, startTime int64, user string, cluster string) bool { +// IsJobOwner checks if the specified user owns the batch job identified by jobID, +// startTime, and cluster. Returns true if the user is the owner, false otherwise. +// This method does not return errors; it returns false for both non-existent jobs +// and jobs owned by other users. +func (r *JobRepository) IsJobOwner(jobID int64, startTime int64, user string, cluster string) bool { q := sq.Select("id"). From("job"). - Where("job.job_id = ?", jobId). + Where("job.job_id = ?", jobID). Where("job.hpc_user = ?", user). Where("job.cluster = ?", cluster). 
Where("job.start_time = ?", startTime) _, err := scanJob(q.RunWith(r.stmtCache).QueryRow()) + if err != nil && err != sql.ErrNoRows { + cclog.Warnf("IsJobOwner: unexpected error for jobID=%d, user=%s, cluster=%s: %v", jobID, user, cluster, err) + } return err != sql.ErrNoRows } @@ -231,6 +253,11 @@ func (r *JobRepository) FindConcurrentJobs( } query = query.Where("cluster = ?", job.Cluster) + + if len(job.Resources) == 0 { + return nil, fmt.Errorf("job has no resources defined") + } + var startTime int64 var stopTime int64 @@ -243,25 +270,28 @@ func (r *JobRepository) FindConcurrentJobs( stopTime = startTime + int64(job.Duration) } - // Add 200s overlap for jobs start time at the end - startTimeTail := startTime + 10 - stopTimeTail := stopTime - 200 - startTimeFront := startTime + 200 + // Time buffer constant for finding overlapping jobs + // overlapBufferEnd: 200s buffer at job end to account for scheduling/cleanup overlap + const overlapBufferEnd = 200 - queryRunning := query.Where("job.job_state = ?").Where("(job.start_time BETWEEN ? AND ? OR job.start_time < ?)", - "running", startTimeTail, stopTimeTail, startTime) + stopTimeTail := stopTime - overlapBufferEnd + startTimeFront := startTime + overlapBufferEnd + + queryRunning := query.Where("job.job_state = ?", "running"). + Where("job.start_time <= ?", stopTimeTail) // Get At Least One Exact Hostname Match from JSON Resources Array in Database queryRunning = queryRunning.Where("EXISTS (SELECT 1 FROM json_each(job.resources) WHERE json_extract(value, '$.hostname') = ?)", hostname) - query = query.Where("job.job_state != ?").Where("((job.start_time BETWEEN ? AND ?) OR (job.start_time + job.duration) BETWEEN ? AND ? OR (job.start_time < ?) AND (job.start_time + job.duration) > ?)", - "running", startTimeTail, stopTimeTail, startTimeFront, stopTimeTail, startTime, stopTime) + query = query.Where("job.job_state != ?", "running"). + Where("job.start_time < ?", stopTimeTail). 
+ Where("(job.start_time + job.duration) > ?", startTimeFront) // Get At Least One Exact Hostname Match from JSON Resources Array in Database query = query.Where("EXISTS (SELECT 1 FROM json_each(job.resources) WHERE json_extract(value, '$.hostname') = ?)", hostname) rows, err := query.RunWith(r.stmtCache).Query() if err != nil { - cclog.Errorf("Error while running query: %v", err) - return nil, err + cclog.Errorf("Error while running concurrent jobs query: %v", err) + return nil, fmt.Errorf("failed to execute concurrent jobs query: %w", err) } defer rows.Close() @@ -269,44 +299,44 @@ func (r *JobRepository) FindConcurrentJobs( queryString := fmt.Sprintf("cluster=%s", job.Cluster) for rows.Next() { - var id, jobId, startTime sql.NullInt64 + var id, jobID, startTime sql.NullInt64 - if err = rows.Scan(&id, &jobId, &startTime); err != nil { - cclog.Warn("Error while scanning rows") - return nil, err + if err = rows.Scan(&id, &jobID, &startTime); err != nil { + cclog.Warnf("Error while scanning concurrent job rows: %v", err) + return nil, fmt.Errorf("failed to scan concurrent job row: %w", err) } if id.Valid { - queryString += fmt.Sprintf("&jobId=%d", int(jobId.Int64)) + queryString += fmt.Sprintf("&jobId=%d", int(jobID.Int64)) items = append(items, &model.JobLink{ ID: fmt.Sprint(id.Int64), - JobID: int(jobId.Int64), + JobID: int(jobID.Int64), }) } } rows, err = queryRunning.RunWith(r.stmtCache).Query() if err != nil { - cclog.Errorf("Error while running query: %v", err) - return nil, err + cclog.Errorf("Error while running concurrent running jobs query: %v", err) + return nil, fmt.Errorf("failed to execute concurrent running jobs query: %w", err) } defer rows.Close() for rows.Next() { - var id, jobId, startTime sql.NullInt64 + var id, jobID, startTime sql.NullInt64 - if err := rows.Scan(&id, &jobId, &startTime); err != nil { - cclog.Warn("Error while scanning rows") - return nil, err + if err := rows.Scan(&id, &jobID, &startTime); err != nil { + cclog.Warnf("Error 
while scanning running concurrent job rows: %v", err) + return nil, fmt.Errorf("failed to scan running concurrent job row: %w", err) } if id.Valid { - queryString += fmt.Sprintf("&jobId=%d", int(jobId.Int64)) + queryString += fmt.Sprintf("&jobId=%d", int(jobID.Int64)) items = append(items, &model.JobLink{ ID: fmt.Sprint(id.Int64), - JobID: int(jobId.Int64), + JobID: int(jobID.Int64), }) } } diff --git a/internal/repository/jobHooks.go b/internal/repository/jobHooks.go index 824b5cde..66d29eea 100644 --- a/internal/repository/jobHooks.go +++ b/internal/repository/jobHooks.go @@ -2,16 +2,45 @@ // All rights reserved. This file is part of cc-backend. // Use of this source code is governed by a MIT-style // license that can be found in the LICENSE file. + package repository import ( "sync" - "github.com/ClusterCockpit/cc-lib/schema" + "github.com/ClusterCockpit/cc-lib/v2/schema" ) +// JobHook interface allows external components to hook into job lifecycle events. +// Implementations can perform actions when jobs start or stop, such as tagging, +// logging, notifications, or triggering external workflows. +// +// Example implementation: +// +// type MyJobTagger struct{} +// +// func (t *MyJobTagger) JobStartCallback(job *schema.Job) { +// if job.NumNodes > 100 { +// // Tag large jobs automatically +// } +// } +// +// func (t *MyJobTagger) JobStopCallback(job *schema.Job) { +// if job.State == schema.JobStateFailed { +// // Log or alert on failed jobs +// } +// } +// +// Register hooks during application initialization: +// +// repository.RegisterJobHook(&MyJobTagger{}) type JobHook interface { + // JobStartCallback is invoked when one or more jobs start. + // This is called synchronously, so implementations should be fast. JobStartCallback(job *schema.Job) + + // JobStopCallback is invoked when a job completes. + // This is called synchronously, so implementations should be fast. 
JobStopCallback(job *schema.Job) } @@ -20,7 +49,13 @@ var ( hooks []JobHook ) -func RegisterJobJook(hook JobHook) { +// RegisterJobHook registers a JobHook to receive job lifecycle callbacks. +// Multiple hooks can be registered and will be called in registration order. +// This function is safe to call multiple times and is typically called during +// application initialization. +// +// Nil hooks are silently ignored to simplify conditional registration. +func RegisterJobHook(hook JobHook) { initOnce.Do(func() { hooks = make([]JobHook, 0) }) @@ -30,6 +65,12 @@ func RegisterJobJook(hook JobHook) { } } +// CallJobStartHooks invokes all registered JobHook.JobStartCallback methods +// for each job in the provided slice. This is called internally by the repository +// when jobs are started (e.g., via StartJob or batch job imports). +// +// Hooks are called synchronously in registration order. If a hook panics, +// the panic will propagate to the caller. func CallJobStartHooks(jobs []*schema.Job) { if hooks == nil { return @@ -44,6 +85,12 @@ func CallJobStartHooks(jobs []*schema.Job) { } } +// CallJobStopHooks invokes all registered JobHook.JobStopCallback methods +// for the provided job. This is called internally by the repository when a +// job completes (e.g., via StopJob or job state updates). +// +// Hooks are called synchronously in registration order. If a hook panics, +// the panic will propagate to the caller. func CallJobStopHooks(job *schema.Job) { if hooks == nil { return diff --git a/internal/repository/jobQuery.go b/internal/repository/jobQuery.go index 00dabea3..437801aa 100644 --- a/internal/repository/jobQuery.go +++ b/internal/repository/jobQuery.go @@ -2,6 +2,10 @@ // All rights reserved. This file is part of cc-backend. // Use of this source code is governed by a MIT-style // license that can be found in the LICENSE file. + +// Package repository provides job query functionality with filtering, pagination, +// and security controls. 
This file contains the main query builders and security +// checks for job retrieval operations. package repository import ( @@ -14,11 +18,27 @@ import ( "github.com/ClusterCockpit/cc-backend/internal/config" "github.com/ClusterCockpit/cc-backend/internal/graph/model" - cclog "github.com/ClusterCockpit/cc-lib/ccLogger" - "github.com/ClusterCockpit/cc-lib/schema" + cclog "github.com/ClusterCockpit/cc-lib/v2/ccLogger" + "github.com/ClusterCockpit/cc-lib/v2/schema" sq "github.com/Masterminds/squirrel" ) +const ( + // Default initial capacity for job result slices + defaultJobsCapacity = 50 +) + +// QueryJobs retrieves jobs from the database with optional filtering, pagination, +// and sorting. Security controls are automatically applied based on the user context. +// +// Parameters: +// - ctx: Context containing user authentication information +// - filters: Optional job filters (cluster, state, user, time ranges, etc.) +// - page: Optional pagination parameters (page number and items per page) +// - order: Optional sorting specification (column or footprint field) +// +// Returns a slice of jobs matching the criteria, or an error if the query fails. +// The function enforces role-based access control through SecurityCheck. 
func (r *JobRepository) QueryJobs( ctx context.Context, filters []*model.JobFilter, @@ -33,26 +53,24 @@ func (r *JobRepository) QueryJobs( if order != nil { field := toSnakeCase(order.Field) if order.Type == "col" { - // "col": Fixed column name query switch order.Order { case model.SortDirectionEnumAsc: query = query.OrderBy(fmt.Sprintf("job.%s ASC", field)) case model.SortDirectionEnumDesc: query = query.OrderBy(fmt.Sprintf("job.%s DESC", field)) default: - return nil, errors.New("REPOSITORY/QUERY > invalid sorting order for column") + return nil, errors.New("invalid sorting order for column") } } else { - // "foot": Order by footprint JSON field values - // Verify and Search Only in Valid Jsons - query = query.Where("JSON_VALID(meta_data)") + // Order by footprint JSON field values + query = query.Where("JSON_VALID(footprint)") switch order.Order { case model.SortDirectionEnumAsc: query = query.OrderBy(fmt.Sprintf("JSON_EXTRACT(footprint, \"$.%s\") ASC", field)) case model.SortDirectionEnumDesc: query = query.OrderBy(fmt.Sprintf("JSON_EXTRACT(footprint, \"$.%s\") DESC", field)) default: - return nil, errors.New("REPOSITORY/QUERY > invalid sorting order for footprint") + return nil, errors.New("invalid sorting order for footprint") } } } @@ -69,29 +87,35 @@ func (r *JobRepository) QueryJobs( rows, err := query.RunWith(r.stmtCache).Query() if err != nil { queryString, queryVars, _ := query.ToSql() - cclog.Errorf("Error while running query '%s' %v: %v", queryString, queryVars, err) - return nil, err + return nil, fmt.Errorf("query failed [%s] %v: %w", queryString, queryVars, err) } + defer rows.Close() - jobs := make([]*schema.Job, 0, 50) + jobs := make([]*schema.Job, 0, defaultJobsCapacity) for rows.Next() { job, err := scanJob(rows) if err != nil { - rows.Close() - cclog.Warn("Error while scanning rows (Jobs)") - return nil, err + cclog.Warnf("Error scanning job row: %v", err) + return nil, fmt.Errorf("failed to scan job row: %w", err) } jobs = append(jobs, job) 
} + if err := rows.Err(); err != nil { + return nil, fmt.Errorf("error iterating job rows: %w", err) + } + return jobs, nil } +// CountJobs returns the total number of jobs matching the given filters. +// Security controls are automatically applied based on the user context. +// Uses DISTINCT count to handle tag filters correctly (jobs may appear multiple +// times when joined with the tag table). func (r *JobRepository) CountJobs( ctx context.Context, filters []*model.JobFilter, ) (int, error) { - // DISTICT count for tags filters, does not affect other queries query, qerr := SecurityCheck(ctx, sq.Select("count(DISTINCT job.id)").From("job")) if qerr != nil { return 0, qerr @@ -103,12 +127,22 @@ func (r *JobRepository) CountJobs( var count int if err := query.RunWith(r.DB).Scan(&count); err != nil { - return 0, err + return 0, fmt.Errorf("failed to count jobs: %w", err) } return count, nil } +// SecurityCheckWithUser applies role-based access control filters to a job query +// based on the provided user's roles and permissions. +// +// Access rules by role: +// - API role (exclusive): Full access to all jobs +// - Admin/Support roles: Full access to all jobs +// - Manager role: Access to jobs in managed projects plus own jobs +// - User role: Access only to own jobs +// +// Returns an error if the user is nil or has no recognized roles. 
func SecurityCheckWithUser(user *schema.User, query sq.SelectBuilder) (sq.SelectBuilder, error) { if user == nil { var qnil sq.SelectBuilder @@ -116,84 +150,68 @@ func SecurityCheckWithUser(user *schema.User, query sq.SelectBuilder) (sq.Select } switch { - case len(user.Roles) == 1 && user.HasRole(schema.RoleApi): // API-User : All jobs + case len(user.Roles) == 1 && user.HasRole(schema.RoleAPI): return query, nil - case user.HasAnyRole([]schema.Role{schema.RoleAdmin, schema.RoleSupport}): // Admin & Support : All jobs + case user.HasAnyRole([]schema.Role{schema.RoleAdmin, schema.RoleSupport}): return query, nil - case user.HasRole(schema.RoleManager): // Manager : Add filter for managed projects' jobs only + personal jobs + case user.HasRole(schema.RoleManager): if len(user.Projects) != 0 { return query.Where(sq.Or{sq.Eq{"job.project": user.Projects}, sq.Eq{"job.hpc_user": user.Username}}), nil - } else { - cclog.Debugf("Manager-User '%s' has no defined projects to lookup! Query only personal jobs ...", user.Username) - return query.Where("job.hpc_user = ?", user.Username), nil } - case user.HasRole(schema.RoleUser): // User : Only personal jobs + cclog.Debugf("Manager '%s' has no assigned projects, restricting to personal jobs", user.Username) return query.Where("job.hpc_user = ?", user.Username), nil - default: // No known Role, return error + case user.HasRole(schema.RoleUser): + return query.Where("job.hpc_user = ?", user.Username), nil + default: var qnil sq.SelectBuilder return qnil, fmt.Errorf("user has no or unknown roles") } } +// SecurityCheck extracts the user from the context and applies role-based access +// control filters to the query. This is a convenience wrapper around SecurityCheckWithUser. func SecurityCheck(ctx context.Context, query sq.SelectBuilder) (sq.SelectBuilder, error) { user := GetUserFromContext(ctx) - return SecurityCheckWithUser(user, query) } -// Build a sq.SelectBuilder out of a schema.JobFilter. 
+// BuildWhereClause constructs SQL WHERE conditions from a JobFilter and applies +// them to the query. Supports filtering by job properties (cluster, state, user), +// time ranges, resource usage, tags, and JSON field searches in meta_data, +// footprint, and resources columns. func BuildWhereClause(filter *model.JobFilter, query sq.SelectBuilder) sq.SelectBuilder { - if filter.Tags != nil { - // This is an OR-Logic query: Returns all distinct jobs with at least one of the requested tags; TODO: AND-Logic query? - query = query.Join("jobtag ON jobtag.job_id = job.id").Where(sq.Eq{"jobtag.tag_id": filter.Tags}).Distinct() - } + // Primary Key if filter.DbID != nil { dbIDs := make([]string, len(filter.DbID)) copy(dbIDs, filter.DbID) query = query.Where(sq.Eq{"job.id": dbIDs}) } - if filter.JobID != nil { - query = buildStringCondition("job.job_id", filter.JobID, query) - } - if filter.ArrayJobID != nil { - query = query.Where("job.array_job_id = ?", *filter.ArrayJobID) - } - if filter.User != nil { - query = buildStringCondition("job.hpc_user", filter.User, query) - } - if filter.Project != nil { - query = buildStringCondition("job.project", filter.Project, query) - } - if filter.JobName != nil { - query = buildMetaJsonCondition("jobName", filter.JobName, query) - } + // Explicit indices if filter.Cluster != nil { query = buildStringCondition("job.cluster", filter.Cluster, query) } + if filter.SubCluster != nil { + query = buildStringCondition("job.subcluster", filter.SubCluster, query) + } if filter.Partition != nil { query = buildStringCondition("job.cluster_partition", filter.Partition, query) } - if filter.StartTime != nil { - query = buildTimeCondition("job.start_time", filter.StartTime, query) - } - if filter.Duration != nil { - query = buildIntCondition("job.duration", filter.Duration, query) - } - if filter.MinRunningFor != nil { - now := time.Now().Unix() // There does not seam to be a portable way to get the current unix timestamp accross different DBs. 
- query = query.Where("(job.job_state != 'running' OR (? - job.start_time) > ?)", now, *filter.MinRunningFor) - } - if filter.Shared != nil { - query = query.Where("job.shared = ?", *filter.Shared) - } if filter.State != nil { states := make([]string, len(filter.State)) for i, val := range filter.State { states[i] = string(val) } - query = query.Where(sq.Eq{"job.job_state": states}) } + if filter.Shared != nil { + query = query.Where("job.shared = ?", *filter.Shared) + } + if filter.Project != nil { + query = buildStringCondition("job.project", filter.Project, query) + } + if filter.User != nil { + query = buildStringCondition("job.hpc_user", filter.User, query) + } if filter.NumNodes != nil { query = buildIntCondition("job.num_nodes", filter.NumNodes, query) } @@ -203,33 +221,95 @@ func BuildWhereClause(filter *model.JobFilter, query sq.SelectBuilder) sq.Select if filter.NumHWThreads != nil { query = buildIntCondition("job.num_hwthreads", filter.NumHWThreads, query) } - if filter.Node != nil { - query = buildResourceJsonCondition("hostname", filter.Node, query) + if filter.ArrayJobID != nil { + query = query.Where("job.array_job_id = ?", *filter.ArrayJobID) + } + if filter.StartTime != nil { + query = buildTimeCondition("job.start_time", filter.StartTime, query) + } + if filter.Duration != nil { + query = buildIntCondition("job.duration", filter.Duration, query) } if filter.Energy != nil { query = buildFloatCondition("job.energy", filter.Energy, query) } + // Indices on Tag Table + if filter.Tags != nil { + // This is an OR-Logic query: Returns all distinct jobs with at least one of the requested tags; TODO: AND-Logic query? 
+ query = query.Join("jobtag ON jobtag.job_id = job.id").Where(sq.Eq{"jobtag.tag_id": filter.Tags}).Distinct() + } + // No explicit Indices + if filter.JobID != nil { + query = buildStringCondition("job.job_id", filter.JobID, query) + } + // Queries Within JSONs if filter.MetricStats != nil { for _, ms := range filter.MetricStats { - query = buildFloatJsonCondition(ms.MetricName, ms.Range, query) + query = buildFloatJSONCondition(ms.MetricName, ms.Range, query) } } + if filter.Node != nil { + query = buildResourceJSONCondition("hostname", filter.Node, query) + } + if filter.JobName != nil { + query = buildMetaJSONCondition("jobName", filter.JobName, query) + } + if filter.Schedule != nil { + interactiveJobname := "interactive" + switch *filter.Schedule { + case "interactive": + iFilter := model.StringInput{Eq: &interactiveJobname} + query = buildMetaJSONCondition("jobName", &iFilter, query) + case "batch": + sFilter := model.StringInput{Neq: &interactiveJobname} + query = buildMetaJSONCondition("jobName", &sFilter, query) + } + } + + // Configurable Filter to exclude recently started jobs, see config.go: ShortRunningJobsDuration + if filter.MinRunningFor != nil { + now := time.Now().Unix() + // Only jobs whose start timestamp is more than MinRunningFor seconds in the past + // If a job completed within the configured timeframe, it will still show up after the start_time matches the condition! + query = query.Where(sq.Lt{"job.start_time": (now - int64(*filter.MinRunningFor))}) + } return query } +// buildIntCondition creates clauses for integer range filters, using BETWEEN only if required. func buildIntCondition(field string, cond *config.IntRange, query sq.SelectBuilder) sq.SelectBuilder { - return query.Where(field+" BETWEEN ? AND ?", cond.From, cond.To) + if cond.From != 1 && cond.To != 0 { + return query.Where(field+" BETWEEN ? 
 AND ?", cond.From, cond.To) + } else if cond.From != 1 && cond.To == 0 { + return query.Where(field+" >= ?", cond.From) + } else if cond.From == 1 && cond.To != 0 { + return query.Where(field+" <= ?", cond.To) + } else { + return query + } } +// buildFloatCondition creates clauses for float range filters, using BETWEEN only if required. func buildFloatCondition(field string, cond *model.FloatRange, query sq.SelectBuilder) sq.SelectBuilder { - return query.Where(field+" BETWEEN ? AND ?", cond.From, cond.To) + if cond.From != 1.0 && cond.To != 0.0 { + return query.Where(field+" BETWEEN ? AND ?", cond.From, cond.To) + } else if cond.From != 1.0 && cond.To == 0.0 { + return query.Where(field+" >= ?", cond.From) + } else if cond.From == 1.0 && cond.To != 0.0 { + return query.Where(field+" <= ?", cond.To) + } else { + return query + } } +// buildTimeCondition creates time range filters supporting absolute timestamps, +// relative time ranges (last6h, last24h, last7d, last30d), or open-ended ranges. +// Reminder: BETWEEN Queries are slower and don't use indices as frequently: Only use if both conditions required func buildTimeCondition(field string, cond *config.TimeRange, query sq.SelectBuilder) sq.SelectBuilder { if cond.From != nil && cond.To != nil { return query.Where(field+" BETWEEN ? AND ?", cond.From.Unix(), cond.To.Unix()) } else if cond.From != nil { - return query.Where("? <= "+field, cond.From.Unix()) + return query.Where(field+" >= ?", cond.From.Unix()) } else if cond.To != nil { return query.Where(field+" <= ?", cond.To.Unix()) } else if cond.Range != "" { @@ -248,18 +328,28 @@ func buildTimeCondition(field string, cond *config.TimeRange, query sq.SelectBui cclog.Debugf("No known named timeRange: startTime.range = %s", cond.Range) return query } - return query.Where(field+" BETWEEN ? 
AND ?", then, now) + return query.Where(field+" >= ?", then) } else { return query } } -func buildFloatJsonCondition(condName string, condRange *model.FloatRange, query sq.SelectBuilder) sq.SelectBuilder { - // Verify and Search Only in Valid Jsons +// buildFloatJSONCondition creates a filter on a numeric field within the footprint JSON column, using BETWEEN only if required. +func buildFloatJSONCondition(jsonField string, cond *model.FloatRange, query sq.SelectBuilder) sq.SelectBuilder { query = query.Where("JSON_VALID(footprint)") - return query.Where("JSON_EXTRACT(footprint, \"$."+condName+"\") BETWEEN ? AND ?", condRange.From, condRange.To) + if cond.From != 1.0 && cond.To != 0.0 { + return query.Where("JSON_EXTRACT(footprint, \"$."+jsonField+"\") BETWEEN ? AND ?", cond.From, cond.To) + } else if cond.From != 1.0 && cond.To == 0.0 { + return query.Where("JSON_EXTRACT(footprint, \"$."+jsonField+"\") >= ?", cond.From) + } else if cond.From == 1.0 && cond.To != 0.0 { + return query.Where("JSON_EXTRACT(footprint, \"$."+jsonField+"\") <= ?", cond.To) + } else { + return query + } } +// buildStringCondition creates filters for string fields supporting equality, +// inequality, prefix, suffix, substring, and IN list matching. func buildStringCondition(field string, cond *model.StringInput, query sq.SelectBuilder) sq.SelectBuilder { if cond.Eq != nil { return query.Where(field+" = ?", *cond.Eq) @@ -284,10 +374,9 @@ func buildStringCondition(field string, cond *model.StringInput, query sq.Select return query } -func buildMetaJsonCondition(jsonField string, cond *model.StringInput, query sq.SelectBuilder) sq.SelectBuilder { - // Verify and Search Only in Valid Jsons +// buildMetaJSONCondition creates filters on fields within the meta_data JSON column. 
+func buildMetaJSONCondition(jsonField string, cond *model.StringInput, query sq.SelectBuilder) sq.SelectBuilder { query = query.Where("JSON_VALID(meta_data)") - // add "AND" Sql query Block for field match if cond.Eq != nil { return query.Where("JSON_EXTRACT(meta_data, \"$."+jsonField+"\") = ?", *cond.Eq) } @@ -306,10 +395,10 @@ func buildMetaJsonCondition(jsonField string, cond *model.StringInput, query sq. return query } -func buildResourceJsonCondition(jsonField string, cond *model.StringInput, query sq.SelectBuilder) sq.SelectBuilder { - // Verify and Search Only in Valid Jsons +// buildResourceJSONCondition creates filters on fields within the resources JSON array column. +// Uses json_each to search within array elements. +func buildResourceJSONCondition(jsonField string, cond *model.StringInput, query sq.SelectBuilder) sq.SelectBuilder { query = query.Where("JSON_VALID(resources)") - // add "AND" Sql query Block for field match if cond.Eq != nil { return query.Where("EXISTS (SELECT 1 FROM json_each(job.resources) WHERE json_extract(value, \"$."+jsonField+"\") = ?)", *cond.Eq) } @@ -333,15 +422,16 @@ var ( matchAllCap = regexp.MustCompile("([a-z0-9])([A-Z])") ) +// toSnakeCase converts camelCase strings to snake_case for SQL column names. +// Includes security checks to prevent SQL injection attempts. +// Panics if potentially dangerous characters are detected. 
func toSnakeCase(str string) string { for _, c := range str { - if c == '\'' || c == '\\' { - cclog.Panic("toSnakeCase() attack vector!") + if c == '\'' || c == '\\' || c == '"' || c == ';' || c == '-' || c == ' ' { + cclog.Panicf("toSnakeCase: potentially dangerous character detected in input: %q", str) } } - str = strings.ReplaceAll(str, "'", "") - str = strings.ReplaceAll(str, "\\", "") snake := matchFirstCap.ReplaceAllString(str, "${1}_${2}") snake = matchAllCap.ReplaceAllString(snake, "${1}_${2}") return strings.ToLower(snake) diff --git a/internal/repository/job_test.go b/internal/repository/job_test.go index 9415bf98..992251af 100644 --- a/internal/repository/job_test.go +++ b/internal/repository/job_test.go @@ -10,7 +10,7 @@ import ( "testing" "time" - "github.com/ClusterCockpit/cc-lib/schema" + "github.com/ClusterCockpit/cc-lib/v2/schema" _ "github.com/mattn/go-sqlite3" ) @@ -33,7 +33,7 @@ func TestFind(t *testing.T) { func TestFindById(t *testing.T) { r := setup(t) - job, err := r.FindById(getContext(t), 338) + job, err := r.FindByID(getContext(t), 338) if err != nil { t.Fatal(err) } @@ -78,7 +78,7 @@ func TestFindJobsBetween(t *testing.T) { // 1. Find a job to use (Find all jobs) // We use a large time range to ensure we get something if it exists - jobs, err := r.FindJobsBetween(0, 9999999999, false) + jobs, err := r.FindJobsBetween(0, 9999999999, "none") if err != nil { t.Fatal(err) } @@ -88,21 +88,21 @@ func TestFindJobsBetween(t *testing.T) { targetJob := jobs[0] - // 2. Create a tag - tagName := fmt.Sprintf("testtag_%d", time.Now().UnixNano()) - tagId, err := r.CreateTag("testtype", tagName, "global") + // 2. Create an auto-tagger tag (type "app") + appTagName := fmt.Sprintf("apptag_%d", time.Now().UnixNano()) + appTagID, err := r.CreateTag("app", appTagName, "global") if err != nil { t.Fatal(err) } - // 3. 
Link Tag (Manually to avoid archive dependency side-effects in unit test) - _, err = r.DB.Exec("INSERT INTO jobtag (job_id, tag_id) VALUES (?, ?)", *targetJob.ID, tagId) + // 3. Link auto-tagger tag to job + _, err = r.DB.Exec("INSERT INTO jobtag (job_id, tag_id) VALUES (?, ?)", *targetJob.ID, appTagID) if err != nil { t.Fatal(err) } - // 4. Search with omitTagged = false (Should find the job) - jobsFound, err := r.FindJobsBetween(0, 9999999999, false) + // 4. Search with omitTagged = "none" (Should find the job) + jobsFound, err := r.FindJobsBetween(0, 9999999999, "none") if err != nil { t.Fatal(err) } @@ -115,18 +115,58 @@ func TestFindJobsBetween(t *testing.T) { } } if !found { - t.Errorf("Target job %d should be found when omitTagged=false", *targetJob.ID) + t.Errorf("Target job %d should be found when omitTagged=none", *targetJob.ID) } - // 5. Search with omitTagged = true (Should NOT find the job) - jobsFiltered, err := r.FindJobsBetween(0, 9999999999, true) + // 5. Search with omitTagged = "all" (Should NOT find the job — it has a tag) + jobsFiltered, err := r.FindJobsBetween(0, 9999999999, "all") if err != nil { t.Fatal(err) } for _, j := range jobsFiltered { if *j.ID == *targetJob.ID { - t.Errorf("Target job %d should NOT be found when omitTagged=true", *targetJob.ID) + t.Errorf("Target job %d should NOT be found when omitTagged=all", *targetJob.ID) + } + } + + // 6. Search with omitTagged = "user": auto-tagger tag ("app") should NOT exclude the job + jobsUserFilter, err := r.FindJobsBetween(0, 9999999999, "user") + if err != nil { + t.Fatal(err) + } + + found = false + for _, j := range jobsUserFilter { + if *j.ID == *targetJob.ID { + found = true + break + } + } + if !found { + t.Errorf("Target job %d should be found when omitTagged=user (only has auto-tagger tag)", *targetJob.ID) + } + + // 7. 
Add a user-created tag (type "testtype") to the same job + userTagName := fmt.Sprintf("usertag_%d", time.Now().UnixNano()) + userTagID, err := r.CreateTag("testtype", userTagName, "global") + if err != nil { + t.Fatal(err) + } + _, err = r.DB.Exec("INSERT INTO jobtag (job_id, tag_id) VALUES (?, ?)", *targetJob.ID, userTagID) + if err != nil { + t.Fatal(err) + } + + // 8. Now omitTagged = "user" should exclude the job (has a user-created tag) + jobsUserFilter2, err := r.FindJobsBetween(0, 9999999999, "user") + if err != nil { + t.Fatal(err) + } + + for _, j := range jobsUserFilter2 { + if *j.ID == *targetJob.ID { + t.Errorf("Target job %d should NOT be found when omitTagged=user (has user-created tag)", *targetJob.ID) } } } diff --git a/internal/repository/migration.go b/internal/repository/migration.go index 58ab3e69..0f99889e 100644 --- a/internal/repository/migration.go +++ b/internal/repository/migration.go @@ -10,52 +10,48 @@ import ( "embed" "fmt" - cclog "github.com/ClusterCockpit/cc-lib/ccLogger" + cclog "github.com/ClusterCockpit/cc-lib/v2/ccLogger" "github.com/golang-migrate/migrate/v4" - "github.com/golang-migrate/migrate/v4/database/mysql" "github.com/golang-migrate/migrate/v4/database/sqlite3" "github.com/golang-migrate/migrate/v4/source/iofs" ) +// Version is the current database schema version required by this version of cc-backend. +// When the database schema changes, this version is incremented and a new migration file +// is added to internal/repository/migrations/sqlite3/. +// +// Version history: +// - Version 10: Current version +// +// Migration files are embedded at build time from the migrations directory. const Version uint = 10 //go:embed migrations/* var migrationFiles embed.FS -func checkDBVersion(backend string, db *sql.DB) error { - var m *migrate.Migrate +// checkDBVersion verifies that the database schema version matches the expected version. +// This is called automatically during Connect() to ensure schema compatibility. 
+// +// Returns an error if: +// - Database version is older than expected (needs migration) +// - Database version is newer than expected (needs app upgrade) +// - Database is in a dirty state (failed migration) +// +// A "dirty" database indicates a migration was started but not completed successfully. +// This requires manual intervention to fix the database and force the version. +func checkDBVersion(db *sql.DB) error { + driver, err := sqlite3.WithInstance(db, &sqlite3.Config{}) + if err != nil { + return err + } + d, err := iofs.New(migrationFiles, "migrations/sqlite3") + if err != nil { + return err + } - switch backend { - case "sqlite3": - driver, err := sqlite3.WithInstance(db, &sqlite3.Config{}) - if err != nil { - return err - } - d, err := iofs.New(migrationFiles, "migrations/sqlite3") - if err != nil { - return err - } - - m, err = migrate.NewWithInstance("iofs", d, "sqlite3", driver) - if err != nil { - return err - } - case "mysql": - driver, err := mysql.WithInstance(db, &mysql.Config{}) - if err != nil { - return err - } - d, err := iofs.New(migrationFiles, "migrations/mysql") - if err != nil { - return err - } - - m, err = migrate.NewWithInstance("iofs", d, "mysql", driver) - if err != nil { - return err - } - default: - cclog.Abortf("Migration: Unsupported database backend '%s'.\n", backend) + m, err := migrate.NewWithInstance("iofs", d, "sqlite3", driver) + if err != nil { + return err } v, dirty, err := m.Version() @@ -80,37 +76,41 @@ func checkDBVersion(backend string, db *sql.DB) error { return nil } -func getMigrateInstance(backend string, db string) (m *migrate.Migrate, err error) { - switch backend { - case "sqlite3": - d, err := iofs.New(migrationFiles, "migrations/sqlite3") - if err != nil { - cclog.Fatal(err) - } +// getMigrateInstance creates a new migration instance for the given database file. +// This is used internally by MigrateDB, RevertDB, and ForceDB. 
+func getMigrateInstance(db string) (m *migrate.Migrate, err error) { + d, err := iofs.New(migrationFiles, "migrations/sqlite3") + if err != nil { + return nil, err + } - m, err = migrate.NewWithSourceInstance("iofs", d, fmt.Sprintf("sqlite3://%s?_foreign_keys=on", db)) - if err != nil { - return m, err - } - case "mysql": - d, err := iofs.New(migrationFiles, "migrations/mysql") - if err != nil { - return m, err - } - - m, err = migrate.NewWithSourceInstance("iofs", d, fmt.Sprintf("mysql://%s?multiStatements=true", db)) - if err != nil { - return m, err - } - default: - cclog.Abortf("Migration: Unsupported database backend '%s'.\n", backend) + m, err = migrate.NewWithSourceInstance("iofs", d, fmt.Sprintf("sqlite3://%s?_foreign_keys=on", db)) + if err != nil { + return nil, err } return m, nil } -func MigrateDB(backend string, db string) error { - m, err := getMigrateInstance(backend, db) +// MigrateDB applies all pending database migrations to bring the schema up to date. +// This should be run with the -migrate-db flag before starting the application +// after upgrading to a new version that requires schema changes. +// +// Process: +// 1. Checks current database version +// 2. Applies all migrations from current version to target Version +// 3. 
Updates schema_migrations table to track applied migrations +// +// Important: +// - Always backup your database before running migrations +// - Migrations are irreversible without manual intervention +// - If a migration fails, the database is marked "dirty" and requires manual fix +// +// Usage: +// +// cc-backend -migrate-db +func MigrateDB(db string) error { + m, err := getMigrateInstance(db) if err != nil { return err } @@ -118,7 +118,7 @@ func MigrateDB(backend string, db string) error { v, dirty, err := m.Version() if err != nil { if err == migrate.ErrNilVersion { - cclog.Warn("Legacy database without version or missing database file!") + cclog.Info("Legacy database without version or missing database file!") } else { return err } @@ -144,8 +144,19 @@ func MigrateDB(backend string, db string) error { return nil } -func RevertDB(backend string, db string) error { - m, err := getMigrateInstance(backend, db) +// RevertDB rolls back the database schema to the previous version (Version - 1). +// This is primarily used for testing or emergency rollback scenarios. +// +// Warning: +// - This may cause data loss if newer schema added columns/tables +// - Always backup before reverting +// - Not all migrations are safely reversible +// +// Usage: +// +// cc-backend -revert-db +func RevertDB(db string) error { + m, err := getMigrateInstance(db) if err != nil { return err } @@ -162,8 +173,23 @@ func RevertDB(backend string, db string) error { return nil } -func ForceDB(backend string, db string) error { - m, err := getMigrateInstance(backend, db) +// ForceDB forces the database schema version to the current Version without running migrations. +// This is only used to recover from failed migrations that left the database in a "dirty" state. 
+// +// When to use: +// - After manually fixing a failed migration +// - When you've manually applied schema changes and need to update the version marker +// +// Warning: +// - This does NOT apply any schema changes +// - Only use after manually verifying the schema is correct +// - Improper use can cause schema/version mismatch +// +// Usage: +// +// cc-backend -force-db +func ForceDB(db string) error { + m, err := getMigrateInstance(db) if err != nil { return err } diff --git a/internal/repository/migrations/mysql/01_init-schema.down.sql b/internal/repository/migrations/mysql/01_init-schema.down.sql deleted file mode 100644 index 68da6469..00000000 --- a/internal/repository/migrations/mysql/01_init-schema.down.sql +++ /dev/null @@ -1,5 +0,0 @@ -DROP TABLE IF EXISTS job; -DROP TABLE IF EXISTS tags; -DROP TABLE IF EXISTS jobtag; -DROP TABLE IF EXISTS configuration; -DROP TABLE IF EXISTS user; diff --git a/internal/repository/migrations/mysql/01_init-schema.up.sql b/internal/repository/migrations/mysql/01_init-schema.up.sql deleted file mode 100644 index 3a6930cd..00000000 --- a/internal/repository/migrations/mysql/01_init-schema.up.sql +++ /dev/null @@ -1,66 +0,0 @@ -CREATE TABLE IF NOT EXISTS job ( - id INTEGER AUTO_INCREMENT PRIMARY KEY , - job_id BIGINT NOT NULL, - cluster VARCHAR(255) NOT NULL, - subcluster VARCHAR(255) NOT NULL, - start_time BIGINT NOT NULL, -- Unix timestamp - - user VARCHAR(255) NOT NULL, - project VARCHAR(255) NOT NULL, - `partition` VARCHAR(255) NOT NULL, - array_job_id BIGINT NOT NULL, - duration INT NOT NULL DEFAULT 0, - walltime INT NOT NULL DEFAULT 0, - job_state VARCHAR(255) NOT NULL - CHECK(job_state IN ('running', 'completed', 'failed', 'cancelled', - 'stopped', 'timeout', 'preempted', 'out_of_memory')), - meta_data TEXT, -- JSON - resources TEXT NOT NULL, -- JSON - - num_nodes INT NOT NULL, - num_hwthreads INT NOT NULL, - num_acc INT NOT NULL, - smt TINYINT NOT NULL DEFAULT 1 CHECK(smt IN (0, 1 )), - exclusive TINYINT NOT NULL 
DEFAULT 1 CHECK(exclusive IN (0, 1, 2)), - monitoring_status TINYINT NOT NULL DEFAULT 1 CHECK(monitoring_status IN (0, 1, 2, 3)), - - mem_used_max REAL NOT NULL DEFAULT 0.0, - flops_any_avg REAL NOT NULL DEFAULT 0.0, - mem_bw_avg REAL NOT NULL DEFAULT 0.0, - load_avg REAL NOT NULL DEFAULT 0.0, - net_bw_avg REAL NOT NULL DEFAULT 0.0, - net_data_vol_total REAL NOT NULL DEFAULT 0.0, - file_bw_avg REAL NOT NULL DEFAULT 0.0, - file_data_vol_total REAL NOT NULL DEFAULT 0.0, - UNIQUE (job_id, cluster, start_time) - ); - -CREATE TABLE IF NOT EXISTS tag ( - id INTEGER PRIMARY KEY, - tag_type VARCHAR(255) NOT NULL, - tag_name VARCHAR(255) NOT NULL, - UNIQUE (tag_type, tag_name)); - -CREATE TABLE IF NOT EXISTS jobtag ( - job_id INTEGER, - tag_id INTEGER, - PRIMARY KEY (job_id, tag_id), - FOREIGN KEY (job_id) REFERENCES job (id) ON DELETE CASCADE, - FOREIGN KEY (tag_id) REFERENCES tag (id) ON DELETE CASCADE); - -CREATE TABLE IF NOT EXISTS user ( - username varchar(255) PRIMARY KEY NOT NULL, - password varchar(255) DEFAULT NULL, - ldap tinyint NOT NULL DEFAULT 0, /* col called "ldap" for historic reasons, fills the "AuthSource" */ - name varchar(255) DEFAULT NULL, - roles varchar(255) NOT NULL DEFAULT "[]", - email varchar(255) DEFAULT NULL); - -CREATE TABLE IF NOT EXISTS configuration ( - username varchar(255), - confkey varchar(255), - value varchar(255), - PRIMARY KEY (username, confkey), - FOREIGN KEY (username) REFERENCES user (username) ON DELETE CASCADE ON UPDATE NO ACTION); - - diff --git a/internal/repository/migrations/mysql/02_add-index.down.sql b/internal/repository/migrations/mysql/02_add-index.down.sql deleted file mode 100644 index 1392c45c..00000000 --- a/internal/repository/migrations/mysql/02_add-index.down.sql +++ /dev/null @@ -1,8 +0,0 @@ -DROP INDEX IF EXISTS job_stats; -DROP INDEX IF EXISTS job_by_user; -DROP INDEX IF EXISTS job_by_starttime; -DROP INDEX IF EXISTS job_by_job_id; -DROP INDEX IF EXISTS job_list; -DROP INDEX IF EXISTS job_list_user; -DROP 
INDEX IF EXISTS job_list_users; -DROP INDEX IF EXISTS job_list_users_start; diff --git a/internal/repository/migrations/mysql/02_add-index.up.sql b/internal/repository/migrations/mysql/02_add-index.up.sql deleted file mode 100644 index 2524bd93..00000000 --- a/internal/repository/migrations/mysql/02_add-index.up.sql +++ /dev/null @@ -1,8 +0,0 @@ -CREATE INDEX IF NOT EXISTS job_stats ON job (cluster,subcluster,user); -CREATE INDEX IF NOT EXISTS job_by_user ON job (user); -CREATE INDEX IF NOT EXISTS job_by_starttime ON job (start_time); -CREATE INDEX IF NOT EXISTS job_by_job_id ON job (job_id); -CREATE INDEX IF NOT EXISTS job_list ON job (cluster, job_state); -CREATE INDEX IF NOT EXISTS job_list_user ON job (user, cluster, job_state); -CREATE INDEX IF NOT EXISTS job_list_users ON job (user, job_state); -CREATE INDEX IF NOT EXISTS job_list_users_start ON job (start_time, user, job_state); diff --git a/internal/repository/migrations/mysql/03_add-userprojects.down.sql b/internal/repository/migrations/mysql/03_add-userprojects.down.sql deleted file mode 100644 index bbf1e649..00000000 --- a/internal/repository/migrations/mysql/03_add-userprojects.down.sql +++ /dev/null @@ -1 +0,0 @@ -ALTER TABLE user DROP COLUMN projects; diff --git a/internal/repository/migrations/mysql/03_add-userprojects.up.sql b/internal/repository/migrations/mysql/03_add-userprojects.up.sql deleted file mode 100644 index d0f19c21..00000000 --- a/internal/repository/migrations/mysql/03_add-userprojects.up.sql +++ /dev/null @@ -1 +0,0 @@ -ALTER TABLE user ADD COLUMN projects varchar(255) NOT NULL DEFAULT "[]"; diff --git a/internal/repository/migrations/mysql/04_alter-table-job.down.sql b/internal/repository/migrations/mysql/04_alter-table-job.down.sql deleted file mode 100644 index ebc74549..00000000 --- a/internal/repository/migrations/mysql/04_alter-table-job.down.sql +++ /dev/null @@ -1,5 +0,0 @@ -ALTER TABLE job - MODIFY `partition` VARCHAR(255) NOT NULL, - MODIFY array_job_id BIGINT NOT NULL, - 
MODIFY num_hwthreads INT NOT NULL, - MODIFY num_acc INT NOT NULL; diff --git a/internal/repository/migrations/mysql/04_alter-table-job.up.sql b/internal/repository/migrations/mysql/04_alter-table-job.up.sql deleted file mode 100644 index 9fe76208..00000000 --- a/internal/repository/migrations/mysql/04_alter-table-job.up.sql +++ /dev/null @@ -1,5 +0,0 @@ -ALTER TABLE job - MODIFY `partition` VARCHAR(255), - MODIFY array_job_id BIGINT, - MODIFY num_hwthreads INT, - MODIFY num_acc INT; diff --git a/internal/repository/migrations/mysql/05_extend-tags.down.sql b/internal/repository/migrations/mysql/05_extend-tags.down.sql deleted file mode 100644 index 925c9f8f..00000000 --- a/internal/repository/migrations/mysql/05_extend-tags.down.sql +++ /dev/null @@ -1,2 +0,0 @@ -ALTER TABLE tag DROP COLUMN insert_time; -ALTER TABLE jobtag DROP COLUMN insert_time; diff --git a/internal/repository/migrations/mysql/05_extend-tags.up.sql b/internal/repository/migrations/mysql/05_extend-tags.up.sql deleted file mode 100644 index 4577564a..00000000 --- a/internal/repository/migrations/mysql/05_extend-tags.up.sql +++ /dev/null @@ -1,2 +0,0 @@ -ALTER TABLE tag ADD COLUMN insert_time TIMESTAMP DEFAULT CURRENT_TIMESTAMP; -ALTER TABLE jobtag ADD COLUMN insert_time TIMESTAMP DEFAULT CURRENT_TIMESTAMP; diff --git a/internal/repository/migrations/mysql/06_change-config.down.sql b/internal/repository/migrations/mysql/06_change-config.down.sql deleted file mode 100644 index 0651790c..00000000 --- a/internal/repository/migrations/mysql/06_change-config.down.sql +++ /dev/null @@ -1 +0,0 @@ -ALTER TABLE configuration MODIFY value VARCHAR(255); diff --git a/internal/repository/migrations/mysql/06_change-config.up.sql b/internal/repository/migrations/mysql/06_change-config.up.sql deleted file mode 100644 index e35ff195..00000000 --- a/internal/repository/migrations/mysql/06_change-config.up.sql +++ /dev/null @@ -1 +0,0 @@ -ALTER TABLE configuration MODIFY value TEXT; diff --git 
a/internal/repository/migrations/mysql/07_fix-tag-id.down.sql b/internal/repository/migrations/mysql/07_fix-tag-id.down.sql deleted file mode 100644 index 9f9959ac..00000000 --- a/internal/repository/migrations/mysql/07_fix-tag-id.down.sql +++ /dev/null @@ -1,3 +0,0 @@ -SET FOREIGN_KEY_CHECKS = 0; -ALTER TABLE tag MODIFY id INTEGER; -SET FOREIGN_KEY_CHECKS = 1; diff --git a/internal/repository/migrations/mysql/07_fix-tag-id.up.sql b/internal/repository/migrations/mysql/07_fix-tag-id.up.sql deleted file mode 100644 index 1abc4b35..00000000 --- a/internal/repository/migrations/mysql/07_fix-tag-id.up.sql +++ /dev/null @@ -1,3 +0,0 @@ -SET FOREIGN_KEY_CHECKS = 0; -ALTER TABLE tag MODIFY id INTEGER AUTO_INCREMENT; -SET FOREIGN_KEY_CHECKS = 1; diff --git a/internal/repository/migrations/mysql/08_add-footprint.down.sql b/internal/repository/migrations/mysql/08_add-footprint.down.sql deleted file mode 100644 index 57f2145c..00000000 --- a/internal/repository/migrations/mysql/08_add-footprint.down.sql +++ /dev/null @@ -1,83 +0,0 @@ -ALTER TABLE job DROP energy; -ALTER TABLE job DROP energy_footprint; -ALTER TABLE job ADD COLUMN flops_any_avg; -ALTER TABLE job ADD COLUMN mem_bw_avg; -ALTER TABLE job ADD COLUMN mem_used_max; -ALTER TABLE job ADD COLUMN load_avg; -ALTER TABLE job ADD COLUMN net_bw_avg; -ALTER TABLE job ADD COLUMN net_data_vol_total; -ALTER TABLE job ADD COLUMN file_bw_avg; -ALTER TABLE job ADD COLUMN file_data_vol_total; - -UPDATE job SET flops_any_avg = json_extract(footprint, '$.flops_any_avg'); -UPDATE job SET mem_bw_avg = json_extract(footprint, '$.mem_bw_avg'); -UPDATE job SET mem_used_max = json_extract(footprint, '$.mem_used_max'); -UPDATE job SET load_avg = json_extract(footprint, '$.cpu_load_avg'); -UPDATE job SET net_bw_avg = json_extract(footprint, '$.net_bw_avg'); -UPDATE job SET net_data_vol_total = json_extract(footprint, '$.net_data_vol_total'); -UPDATE job SET file_bw_avg = json_extract(footprint, '$.file_bw_avg'); -UPDATE job SET 
file_data_vol_total = json_extract(footprint, '$.file_data_vol_total'); - -ALTER TABLE job DROP footprint; --- Do not use reserved keywords anymore -RENAME TABLE hpc_user TO `user`; -ALTER TABLE job RENAME COLUMN hpc_user TO `user`; -ALTER TABLE job RENAME COLUMN cluster_partition TO `partition`; - -DROP INDEX IF EXISTS jobs_cluster; -DROP INDEX IF EXISTS jobs_cluster_user; -DROP INDEX IF EXISTS jobs_cluster_project; -DROP INDEX IF EXISTS jobs_cluster_subcluster; -DROP INDEX IF EXISTS jobs_cluster_starttime; -DROP INDEX IF EXISTS jobs_cluster_duration; -DROP INDEX IF EXISTS jobs_cluster_numnodes; - -DROP INDEX IF EXISTS jobs_cluster_partition; -DROP INDEX IF EXISTS jobs_cluster_partition_starttime; -DROP INDEX IF EXISTS jobs_cluster_partition_duration; -DROP INDEX IF EXISTS jobs_cluster_partition_numnodes; - -DROP INDEX IF EXISTS jobs_cluster_partition_jobstate; -DROP INDEX IF EXISTS jobs_cluster_partition_jobstate_user; -DROP INDEX IF EXISTS jobs_cluster_partition_jobstate_project; -DROP INDEX IF EXISTS jobs_cluster_partition_jobstate_starttime; -DROP INDEX IF EXISTS jobs_cluster_partition_jobstate_duration; -DROP INDEX IF EXISTS jobs_cluster_partition_jobstate_numnodes; - -DROP INDEX IF EXISTS jobs_cluster_jobstate; -DROP INDEX IF EXISTS jobs_cluster_jobstate_user; -DROP INDEX IF EXISTS jobs_cluster_jobstate_project; - -DROP INDEX IF EXISTS jobs_cluster_jobstate_starttime; -DROP INDEX IF EXISTS jobs_cluster_jobstate_duration; -DROP INDEX IF EXISTS jobs_cluster_jobstate_numnodes; - -DROP INDEX IF EXISTS jobs_user; -DROP INDEX IF EXISTS jobs_user_starttime; -DROP INDEX IF EXISTS jobs_user_duration; -DROP INDEX IF EXISTS jobs_user_numnodes; - -DROP INDEX IF EXISTS jobs_project; -DROP INDEX IF EXISTS jobs_project_user; -DROP INDEX IF EXISTS jobs_project_starttime; -DROP INDEX IF EXISTS jobs_project_duration; -DROP INDEX IF EXISTS jobs_project_numnodes; - -DROP INDEX IF EXISTS jobs_jobstate; -DROP INDEX IF EXISTS jobs_jobstate_user; -DROP INDEX IF EXISTS 
jobs_jobstate_project; -DROP INDEX IF EXISTS jobs_jobstate_starttime; -DROP INDEX IF EXISTS jobs_jobstate_duration; -DROP INDEX IF EXISTS jobs_jobstate_numnodes; - -DROP INDEX IF EXISTS jobs_arrayjobid_starttime; -DROP INDEX IF EXISTS jobs_cluster_arrayjobid_starttime; - -DROP INDEX IF EXISTS jobs_starttime; -DROP INDEX IF EXISTS jobs_duration; -DROP INDEX IF EXISTS jobs_numnodes; - -DROP INDEX IF EXISTS jobs_duration_starttime; -DROP INDEX IF EXISTS jobs_numnodes_starttime; -DROP INDEX IF EXISTS jobs_numacc_starttime; -DROP INDEX IF EXISTS jobs_energy_starttime; diff --git a/internal/repository/migrations/mysql/08_add-footprint.up.sql b/internal/repository/migrations/mysql/08_add-footprint.up.sql deleted file mode 100644 index 207ccf9e..00000000 --- a/internal/repository/migrations/mysql/08_add-footprint.up.sql +++ /dev/null @@ -1,123 +0,0 @@ -DROP INDEX IF EXISTS job_stats ON job; -DROP INDEX IF EXISTS job_by_user ON job; -DROP INDEX IF EXISTS job_by_starttime ON job; -DROP INDEX IF EXISTS job_by_job_id ON job; -DROP INDEX IF EXISTS job_list ON job; -DROP INDEX IF EXISTS job_list_user ON job; -DROP INDEX IF EXISTS job_list_users ON job; -DROP INDEX IF EXISTS job_list_users_start ON job; - -ALTER TABLE job ADD COLUMN energy REAL NOT NULL DEFAULT 0.0; -ALTER TABLE job ADD COLUMN energy_footprint JSON; - -ALTER TABLE job ADD COLUMN footprint JSON; -ALTER TABLE tag ADD COLUMN tag_scope TEXT NOT NULL DEFAULT 'global'; - --- Do not use reserved keywords anymore -RENAME TABLE `user` TO hpc_user; -ALTER TABLE job RENAME COLUMN `user` TO hpc_user; -ALTER TABLE job RENAME COLUMN `partition` TO cluster_partition; - -ALTER TABLE job MODIFY COLUMN cluster VARCHAR(50); -ALTER TABLE job MODIFY COLUMN hpc_user VARCHAR(50); -ALTER TABLE job MODIFY COLUMN subcluster VARCHAR(50); -ALTER TABLE job MODIFY COLUMN project VARCHAR(50); -ALTER TABLE job MODIFY COLUMN cluster_partition VARCHAR(50); -ALTER TABLE job MODIFY COLUMN job_state VARCHAR(25); - -UPDATE job SET footprint = 
'{"flops_any_avg": 0.0}'; -UPDATE job SET footprint = json_replace(footprint, '$.flops_any_avg', job.flops_any_avg); -UPDATE job SET footprint = json_insert(footprint, '$.mem_bw_avg', job.mem_bw_avg); -UPDATE job SET footprint = json_insert(footprint, '$.mem_used_max', job.mem_used_max); -UPDATE job SET footprint = json_insert(footprint, '$.cpu_load_avg', job.load_avg); -UPDATE job SET footprint = json_insert(footprint, '$.net_bw_avg', job.net_bw_avg) WHERE job.net_bw_avg != 0; -UPDATE job SET footprint = json_insert(footprint, '$.net_data_vol_total', job.net_data_vol_total) WHERE job.net_data_vol_total != 0; -UPDATE job SET footprint = json_insert(footprint, '$.file_bw_avg', job.file_bw_avg) WHERE job.file_bw_avg != 0; -UPDATE job SET footprint = json_insert(footprint, '$.file_data_vol_total', job.file_data_vol_total) WHERE job.file_data_vol_total != 0; - -ALTER TABLE job DROP flops_any_avg; -ALTER TABLE job DROP mem_bw_avg; -ALTER TABLE job DROP mem_used_max; -ALTER TABLE job DROP load_avg; -ALTER TABLE job DROP net_bw_avg; -ALTER TABLE job DROP net_data_vol_total; -ALTER TABLE job DROP file_bw_avg; -ALTER TABLE job DROP file_data_vol_total; - --- Indices for: Single filters, combined filters, sorting, sorting with filters --- Cluster Filter -CREATE INDEX IF NOT EXISTS jobs_cluster ON job (cluster); -CREATE INDEX IF NOT EXISTS jobs_cluster_user ON job (cluster, hpc_user); -CREATE INDEX IF NOT EXISTS jobs_cluster_project ON job (cluster, project); -CREATE INDEX IF NOT EXISTS jobs_cluster_subcluster ON job (cluster, subcluster); --- Cluster Filter Sorting -CREATE INDEX IF NOT EXISTS jobs_cluster_starttime ON job (cluster, start_time); -CREATE INDEX IF NOT EXISTS jobs_cluster_duration ON job (cluster, duration); -CREATE INDEX IF NOT EXISTS jobs_cluster_numnodes ON job (cluster, num_nodes); - --- Cluster+Partition Filter -CREATE INDEX IF NOT EXISTS jobs_cluster_partition ON job (cluster, cluster_partition); --- Cluster+Partition Filter Sorting -CREATE INDEX IF NOT 
EXISTS jobs_cluster_partition_starttime ON job (cluster, cluster_partition, start_time); -CREATE INDEX IF NOT EXISTS jobs_cluster_partition_duration ON job (cluster, cluster_partition, duration); -CREATE INDEX IF NOT EXISTS jobs_cluster_partition_numnodes ON job (cluster, cluster_partition, num_nodes); - --- Cluster+Partition+Jobstate Filter -CREATE INDEX IF NOT EXISTS jobs_cluster_partition_jobstate ON job (cluster, cluster_partition, job_state); -CREATE INDEX IF NOT EXISTS jobs_cluster_partition_jobstate_user ON job (cluster, cluster_partition, job_state, hpc_user); -CREATE INDEX IF NOT EXISTS jobs_cluster_partition_jobstate_project ON job (cluster, cluster_partition, job_state, project); --- Cluster+Partition+Jobstate Filter Sorting -CREATE INDEX IF NOT EXISTS jobs_cluster_partition_jobstate_starttime ON job (cluster, cluster_partition, job_state, start_time); -CREATE INDEX IF NOT EXISTS jobs_cluster_partition_jobstate_duration ON job (cluster, cluster_partition, job_state, duration); -CREATE INDEX IF NOT EXISTS jobs_cluster_partition_jobstate_numnodes ON job (cluster, cluster_partition, job_state, num_nodes); - --- Cluster+JobState Filter -CREATE INDEX IF NOT EXISTS jobs_cluster_jobstate ON job (cluster, job_state); -CREATE INDEX IF NOT EXISTS jobs_cluster_jobstate_user ON job (cluster, job_state, hpc_user); -CREATE INDEX IF NOT EXISTS jobs_cluster_jobstate_project ON job (cluster, job_state, project); --- Cluster+JobState Filter Sorting -CREATE INDEX IF NOT EXISTS jobs_cluster_jobstate_starttime ON job (cluster, job_state, start_time); -CREATE INDEX IF NOT EXISTS jobs_cluster_jobstate_duration ON job (cluster, job_state, duration); -CREATE INDEX IF NOT EXISTS jobs_cluster_jobstate_numnodes ON job (cluster, job_state, num_nodes); - --- User Filter -CREATE INDEX IF NOT EXISTS jobs_user ON job (hpc_user); --- User Filter Sorting -CREATE INDEX IF NOT EXISTS jobs_user_starttime ON job (hpc_user, start_time); -CREATE INDEX IF NOT EXISTS jobs_user_duration ON job 
(hpc_user, duration); -CREATE INDEX IF NOT EXISTS jobs_user_numnodes ON job (hpc_user, num_nodes); - --- Project Filter -CREATE INDEX IF NOT EXISTS jobs_project ON job (project); -CREATE INDEX IF NOT EXISTS jobs_project_user ON job (project, hpc_user); --- Project Filter Sorting -CREATE INDEX IF NOT EXISTS jobs_project_starttime ON job (project, start_time); -CREATE INDEX IF NOT EXISTS jobs_project_duration ON job (project, duration); -CREATE INDEX IF NOT EXISTS jobs_project_numnodes ON job (project, num_nodes); - --- JobState Filter -CREATE INDEX IF NOT EXISTS jobs_jobstate ON job (job_state); -CREATE INDEX IF NOT EXISTS jobs_jobstate_user ON job (job_state, hpc_user); -CREATE INDEX IF NOT EXISTS jobs_jobstate_project ON job (job_state, project); -CREATE INDEX IF NOT EXISTS jobs_jobstate_cluster ON job (job_state, cluster); --- JobState Filter Sorting -CREATE INDEX IF NOT EXISTS jobs_jobstate_starttime ON job (job_state, start_time); -CREATE INDEX IF NOT EXISTS jobs_jobstate_duration ON job (job_state, duration); -CREATE INDEX IF NOT EXISTS jobs_jobstate_numnodes ON job (job_state, num_nodes); - --- ArrayJob Filter -CREATE INDEX IF NOT EXISTS jobs_arrayjobid_starttime ON job (array_job_id, start_time); -CREATE INDEX IF NOT EXISTS jobs_cluster_arrayjobid_starttime ON job (cluster, array_job_id, start_time); - --- Sorting without active filters -CREATE INDEX IF NOT EXISTS jobs_starttime ON job (start_time); -CREATE INDEX IF NOT EXISTS jobs_duration ON job (duration); -CREATE INDEX IF NOT EXISTS jobs_numnodes ON job (num_nodes); - --- Single filters with default starttime sorting -CREATE INDEX IF NOT EXISTS jobs_duration_starttime ON job (duration, start_time); -CREATE INDEX IF NOT EXISTS jobs_numnodes_starttime ON job (num_nodes, start_time); -CREATE INDEX IF NOT EXISTS jobs_numacc_starttime ON job (num_acc, start_time); -CREATE INDEX IF NOT EXISTS jobs_energy_starttime ON job (energy, start_time); - --- Optimize DB index usage diff --git 
a/internal/repository/migrations/sqlite3/09_add-job-cache.up.sql b/internal/repository/migrations/sqlite3/09_add-job-cache.up.sql index 863b50ea..6e1ac009 100644 --- a/internal/repository/migrations/sqlite3/09_add-job-cache.up.sql +++ b/internal/repository/migrations/sqlite3/09_add-job-cache.up.sql @@ -118,104 +118,116 @@ DROP TABLE lookup_exclusive; DROP TABLE job; -- Deletes All Existing 'job' Indices; Recreate after Renaming ALTER TABLE job_new RENAME TO job; --- Recreate Indices from 08_add-footprint, include new submit_time indices +-- Recreate Indices from 08_add-footprint; include new 'shared' column -- Cluster Filter -CREATE INDEX IF NOT EXISTS jobs_cluster ON job (cluster); CREATE INDEX IF NOT EXISTS jobs_cluster_user ON job (cluster, hpc_user); CREATE INDEX IF NOT EXISTS jobs_cluster_project ON job (cluster, project); CREATE INDEX IF NOT EXISTS jobs_cluster_subcluster ON job (cluster, subcluster); -- Cluster Filter Sorting -CREATE INDEX IF NOT EXISTS jobs_cluster_starttime ON job (cluster, start_time); -CREATE INDEX IF NOT EXISTS jobs_cluster_submittime ON job (cluster, submit_time); -CREATE INDEX IF NOT EXISTS jobs_cluster_duration ON job (cluster, duration); CREATE INDEX IF NOT EXISTS jobs_cluster_numnodes ON job (cluster, num_nodes); CREATE INDEX IF NOT EXISTS jobs_cluster_numhwthreads ON job (cluster, num_hwthreads); CREATE INDEX IF NOT EXISTS jobs_cluster_numacc ON job (cluster, num_acc); CREATE INDEX IF NOT EXISTS jobs_cluster_energy ON job (cluster, energy); +-- Cluster Time Filter Sorting +CREATE INDEX IF NOT EXISTS jobs_cluster_duration_starttime ON job (cluster, duration, start_time); +CREATE INDEX IF NOT EXISTS jobs_cluster_starttime_duration ON job (cluster, start_time, duration); + -- Cluster+Partition Filter -CREATE INDEX IF NOT EXISTS jobs_cluster_partition ON job (cluster, cluster_partition); +CREATE INDEX IF NOT EXISTS jobs_cluster_partition_user ON job (cluster, cluster_partition, hpc_user); +CREATE INDEX IF NOT EXISTS 
jobs_cluster_partition_project ON job (cluster, cluster_partition, project); +CREATE INDEX IF NOT EXISTS jobs_cluster_partition_jobstate ON job (cluster, cluster_partition, job_state); +CREATE INDEX IF NOT EXISTS jobs_cluster_partition_shared ON job (cluster, cluster_partition, shared); + -- Cluster+Partition Filter Sorting -CREATE INDEX IF NOT EXISTS jobs_cluster_partition_starttime ON job (cluster, cluster_partition, start_time); -CREATE INDEX IF NOT EXISTS jobs_cluster_partition_submittime ON job (cluster, cluster_partition, submit_time); -CREATE INDEX IF NOT EXISTS jobs_cluster_partition_duration ON job (cluster, cluster_partition, duration); CREATE INDEX IF NOT EXISTS jobs_cluster_partition_numnodes ON job (cluster, cluster_partition, num_nodes); CREATE INDEX IF NOT EXISTS jobs_cluster_partition_numhwthreads ON job (cluster, cluster_partition, num_hwthreads); CREATE INDEX IF NOT EXISTS jobs_cluster_partition_numacc ON job (cluster, cluster_partition, num_acc); CREATE INDEX IF NOT EXISTS jobs_cluster_partition_energy ON job (cluster, cluster_partition, energy); --- Cluster+Partition+Jobstate Filter -CREATE INDEX IF NOT EXISTS jobs_cluster_partition_jobstate ON job (cluster, cluster_partition, job_state); -CREATE INDEX IF NOT EXISTS jobs_cluster_partition_jobstate_user ON job (cluster, cluster_partition, job_state, hpc_user); -CREATE INDEX IF NOT EXISTS jobs_cluster_partition_jobstate_project ON job (cluster, cluster_partition, job_state, project); --- Cluster+Partition+Jobstate Filter Sorting -CREATE INDEX IF NOT EXISTS jobs_cluster_partition_jobstate_starttime ON job (cluster, cluster_partition, job_state, start_time); -CREATE INDEX IF NOT EXISTS jobs_cluster_partition_jobstate_submittime ON job (cluster, cluster_partition, job_state, submit_time); -CREATE INDEX IF NOT EXISTS jobs_cluster_partition_jobstate_duration ON job (cluster, cluster_partition, job_state, duration); -CREATE INDEX IF NOT EXISTS jobs_cluster_partition_jobstate_numnodes ON job (cluster, 
cluster_partition, job_state, num_nodes); -CREATE INDEX IF NOT EXISTS jobs_cluster_partition_jobstate_numhwthreads ON job (cluster, cluster_partition, job_state, num_hwthreads); -CREATE INDEX IF NOT EXISTS jobs_cluster_partition_jobstate_numacc ON job (cluster, cluster_partition, job_state, num_acc); -CREATE INDEX IF NOT EXISTS jobs_cluster_partition_jobstate_energy ON job (cluster, cluster_partition, job_state, energy); +-- Cluster+Partition Time Filter Sorting +CREATE INDEX IF NOT EXISTS jobs_cluster_partition_duration_starttime ON job (cluster, cluster_partition, duration, start_time); +CREATE INDEX IF NOT EXISTS jobs_cluster_partition_starttime_duration ON job (cluster, cluster_partition, start_time, duration); -- Cluster+JobState Filter -CREATE INDEX IF NOT EXISTS jobs_cluster_jobstate ON job (cluster, job_state); CREATE INDEX IF NOT EXISTS jobs_cluster_jobstate_user ON job (cluster, job_state, hpc_user); CREATE INDEX IF NOT EXISTS jobs_cluster_jobstate_project ON job (cluster, job_state, project); -- Cluster+JobState Filter Sorting -CREATE INDEX IF NOT EXISTS jobs_cluster_jobstate_starttime ON job (cluster, job_state, start_time); -CREATE INDEX IF NOT EXISTS jobs_cluster_jobstate_submittime ON job (cluster, job_state, submit_time); -CREATE INDEX IF NOT EXISTS jobs_cluster_jobstate_duration ON job (cluster, job_state, duration); CREATE INDEX IF NOT EXISTS jobs_cluster_jobstate_numnodes ON job (cluster, job_state, num_nodes); CREATE INDEX IF NOT EXISTS jobs_cluster_jobstate_numhwthreads ON job (cluster, job_state, num_hwthreads); CREATE INDEX IF NOT EXISTS jobs_cluster_jobstate_numacc ON job (cluster, job_state, num_acc); CREATE INDEX IF NOT EXISTS jobs_cluster_jobstate_energy ON job (cluster, job_state, energy); +-- Cluster+JobState Time Filter Sorting +CREATE INDEX IF NOT EXISTS jobs_cluster_jobstate_starttime_duration ON job (cluster, job_state, start_time, duration); +CREATE INDEX IF NOT EXISTS jobs_cluster_jobstate_duration_starttime ON job (cluster, 
job_state, duration, start_time); + +-- Cluster+Shared Filter +CREATE INDEX IF NOT EXISTS jobs_cluster_shared_user ON job (cluster, shared, hpc_user); +CREATE INDEX IF NOT EXISTS jobs_cluster_shared_project ON job (cluster, shared, project); +-- Cluster+Shared Filter Sorting +CREATE INDEX IF NOT EXISTS jobs_cluster_shared_numnodes ON job (cluster, shared, num_nodes); +CREATE INDEX IF NOT EXISTS jobs_cluster_shared_numhwthreads ON job (cluster, shared, num_hwthreads); +CREATE INDEX IF NOT EXISTS jobs_cluster_shared_numacc ON job (cluster, shared, num_acc); +CREATE INDEX IF NOT EXISTS jobs_cluster_shared_energy ON job (cluster, shared, energy); + +-- Cluster+Shared Time Filter Sorting +CREATE INDEX IF NOT EXISTS jobs_cluster_shared_starttime_duration ON job (cluster, shared, start_time, duration); +CREATE INDEX IF NOT EXISTS jobs_cluster_shared_duration_starttime ON job (cluster, shared, duration, start_time); + -- User Filter -CREATE INDEX IF NOT EXISTS jobs_user ON job (hpc_user); -- User Filter Sorting -CREATE INDEX IF NOT EXISTS jobs_user_starttime ON job (hpc_user, start_time); -CREATE INDEX IF NOT EXISTS jobs_user_duration ON job (hpc_user, duration); CREATE INDEX IF NOT EXISTS jobs_user_numnodes ON job (hpc_user, num_nodes); CREATE INDEX IF NOT EXISTS jobs_user_numhwthreads ON job (hpc_user, num_hwthreads); CREATE INDEX IF NOT EXISTS jobs_user_numacc ON job (hpc_user, num_acc); CREATE INDEX IF NOT EXISTS jobs_user_energy ON job (hpc_user, energy); +-- Cluster+Shared Time Filter Sorting +CREATE INDEX IF NOT EXISTS jobs_user_starttime_duration ON job (hpc_user, start_time, duration); +CREATE INDEX IF NOT EXISTS jobs_user_duration_starttime ON job (hpc_user, duration, start_time); + -- Project Filter -CREATE INDEX IF NOT EXISTS jobs_project ON job (project); CREATE INDEX IF NOT EXISTS jobs_project_user ON job (project, hpc_user); -- Project Filter Sorting -CREATE INDEX IF NOT EXISTS jobs_project_starttime ON job (project, start_time); -CREATE INDEX IF NOT EXISTS 
jobs_project_duration ON job (project, duration); CREATE INDEX IF NOT EXISTS jobs_project_numnodes ON job (project, num_nodes); CREATE INDEX IF NOT EXISTS jobs_project_numhwthreads ON job (project, num_hwthreads); CREATE INDEX IF NOT EXISTS jobs_project_numacc ON job (project, num_acc); CREATE INDEX IF NOT EXISTS jobs_project_energy ON job (project, energy); +-- Cluster+Shared Time Filter Sorting +CREATE INDEX IF NOT EXISTS jobs_project_starttime_duration ON job (project, start_time, duration); +CREATE INDEX IF NOT EXISTS jobs_project_duration_starttime ON job (project, duration, start_time); + -- JobState Filter -CREATE INDEX IF NOT EXISTS jobs_jobstate ON job (job_state); CREATE INDEX IF NOT EXISTS jobs_jobstate_user ON job (job_state, hpc_user); CREATE INDEX IF NOT EXISTS jobs_jobstate_project ON job (job_state, project); -CREATE INDEX IF NOT EXISTS jobs_jobstate_cluster ON job (job_state, cluster); -- JobState Filter Sorting -CREATE INDEX IF NOT EXISTS jobs_jobstate_starttime ON job (job_state, start_time); -CREATE INDEX IF NOT EXISTS jobs_jobstate_duration ON job (job_state, duration); CREATE INDEX IF NOT EXISTS jobs_jobstate_numnodes ON job (job_state, num_nodes); CREATE INDEX IF NOT EXISTS jobs_jobstate_numhwthreads ON job (job_state, num_hwthreads); CREATE INDEX IF NOT EXISTS jobs_jobstate_numacc ON job (job_state, num_acc); CREATE INDEX IF NOT EXISTS jobs_jobstate_energy ON job (job_state, energy); +-- Cluster+Shared Time Filter Sorting +CREATE INDEX IF NOT EXISTS jobs_jobstate_starttime_duration ON job (job_state, start_time, duration); +CREATE INDEX IF NOT EXISTS jobs_jobstate_duration_starttime ON job (job_state, duration, start_time); + +-- Shared Filter +CREATE INDEX IF NOT EXISTS jobs_shared_user ON job (shared, hpc_user); +CREATE INDEX IF NOT EXISTS jobs_shared_project ON job (shared, project); +-- Shared Filter Sorting +CREATE INDEX IF NOT EXISTS jobs_shared_numnodes ON job (shared, num_nodes); +CREATE INDEX IF NOT EXISTS jobs_shared_numhwthreads 
ON job (shared, num_hwthreads); +CREATE INDEX IF NOT EXISTS jobs_shared_numacc ON job (shared, num_acc); +CREATE INDEX IF NOT EXISTS jobs_shared_energy ON job (shared, energy); + +-- Cluster+Shared Time Filter Sorting +CREATE INDEX IF NOT EXISTS jobs_shared_starttime_duration ON job (shared, start_time, duration); +CREATE INDEX IF NOT EXISTS jobs_shared_duration_starttime ON job (shared, duration, start_time); + -- ArrayJob Filter CREATE INDEX IF NOT EXISTS jobs_arrayjobid_starttime ON job (array_job_id, start_time); CREATE INDEX IF NOT EXISTS jobs_cluster_arrayjobid_starttime ON job (cluster, array_job_id, start_time); --- Sorting without active filters -CREATE INDEX IF NOT EXISTS jobs_starttime ON job (start_time); -CREATE INDEX IF NOT EXISTS jobs_duration ON job (duration); -CREATE INDEX IF NOT EXISTS jobs_numnodes ON job (num_nodes); -CREATE INDEX IF NOT EXISTS jobs_numhwthreads ON job (num_hwthreads); -CREATE INDEX IF NOT EXISTS jobs_numacc ON job (num_acc); -CREATE INDEX IF NOT EXISTS jobs_energy ON job (energy); - -- Single filters with default starttime sorting CREATE INDEX IF NOT EXISTS jobs_duration_starttime ON job (duration, start_time); CREATE INDEX IF NOT EXISTS jobs_numnodes_starttime ON job (num_nodes, start_time); @@ -223,6 +235,22 @@ CREATE INDEX IF NOT EXISTS jobs_numhwthreads_starttime ON job (num_hwthreads, st CREATE INDEX IF NOT EXISTS jobs_numacc_starttime ON job (num_acc, start_time); CREATE INDEX IF NOT EXISTS jobs_energy_starttime ON job (energy, start_time); +-- Single filters with duration sorting +CREATE INDEX IF NOT EXISTS jobs_starttime_duration ON job (start_time, duration); +CREATE INDEX IF NOT EXISTS jobs_numnodes_duration ON job (num_nodes, duration); +CREATE INDEX IF NOT EXISTS jobs_numhwthreads_duration ON job (num_hwthreads, duration); +CREATE INDEX IF NOT EXISTS jobs_numacc_duration ON job (num_acc, duration); +CREATE INDEX IF NOT EXISTS jobs_energy_duration ON job (energy, duration); + +-- Backup Indices For High Variety 
Columns +CREATE INDEX IF NOT EXISTS jobs_starttime ON job (start_time); +CREATE INDEX IF NOT EXISTS jobs_duration ON job (duration); + +-- Notes: +-- Cluster+Partition+Jobstate Filter: Tested -> Full Array Of Combinations non-required +-- Cluster+JobState+Shared Filter: Tested -> No further timing improvement +-- JobState+Shared Filter: Tested -> No further timing improvement + -- Optimize DB index usage PRAGMA optimize; diff --git a/internal/repository/migrations/sqlite3/10_node-table.up.sql b/internal/repository/migrations/sqlite3/10_node-table.up.sql index 247bceab..b788a8a9 100644 --- a/internal/repository/migrations/sqlite3/10_node-table.up.sql +++ b/internal/repository/migrations/sqlite3/10_node-table.up.sql @@ -23,6 +23,7 @@ CREATE TABLE "node_state" ( CHECK (health_state IN ( 'full', 'partial', 'failed' )), + health_metrics TEXT, -- JSON array of strings node_id INTEGER, FOREIGN KEY (node_id) REFERENCES node (id) ); @@ -33,12 +34,11 @@ CREATE INDEX IF NOT EXISTS nodes_cluster_subcluster ON node (cluster, subcluster -- Add NEW Indices For New Node_State Table Fields CREATE INDEX IF NOT EXISTS nodestates_timestamp ON node_state (time_stamp); -CREATE INDEX IF NOT EXISTS nodestates_state ON node_state (node_state); -CREATE INDEX IF NOT EXISTS nodestates_health ON node_state (health_state); CREATE INDEX IF NOT EXISTS nodestates_state_timestamp ON node_state (node_state, time_stamp); CREATE INDEX IF NOT EXISTS nodestates_health_timestamp ON node_state (health_state, time_stamp); CREATE INDEX IF NOT EXISTS nodestates_nodeid_state ON node_state (node_id, node_state); CREATE INDEX IF NOT EXISTS nodestates_nodeid_health ON node_state (node_id, health_state); +CREATE INDEX IF NOT EXISTS nodestates_nodeid_timestamp ON node_state (node_id, time_stamp DESC); -- Add NEW Indices For Increased Amounts of Tags CREATE INDEX IF NOT EXISTS tags_jobid ON jobtag (job_id); diff --git a/internal/repository/node.go b/internal/repository/node.go index 75aebb33..cf8939da 100644 --- 
a/internal/repository/node.go +++ b/internal/repository/node.go @@ -10,14 +10,17 @@ import ( "database/sql" "encoding/json" "fmt" + "slices" + "sort" + "strings" "sync" "time" "github.com/ClusterCockpit/cc-backend/internal/graph/model" "github.com/ClusterCockpit/cc-backend/pkg/archive" - cclog "github.com/ClusterCockpit/cc-lib/ccLogger" - "github.com/ClusterCockpit/cc-lib/lrucache" - "github.com/ClusterCockpit/cc-lib/schema" + cclog "github.com/ClusterCockpit/cc-lib/v2/ccLogger" + "github.com/ClusterCockpit/cc-lib/v2/lrucache" + "github.com/ClusterCockpit/cc-lib/v2/schema" sq "github.com/Masterminds/squirrel" "github.com/jmoiron/sqlx" ) @@ -49,6 +52,38 @@ func GetNodeRepository() *NodeRepository { return nodeRepoInstance } +// latestStateCondition returns a squirrel expression that restricts node_state +// rows to the latest per node_id using a correlated subquery. +// Requires the query to join node and node_state tables. +func latestStateCondition() sq.Sqlizer { + return sq.Expr( + "node_state.id = (SELECT ns2.id FROM node_state ns2 WHERE ns2.node_id = node.id ORDER BY ns2.time_stamp DESC LIMIT 1)", + ) +} + +// applyNodeFilters applies common NodeFilter conditions to a query that joins +// the node and node_state tables with latestStateCondition. 
+func applyNodeFilters(query sq.SelectBuilder, filters []*model.NodeFilter) sq.SelectBuilder { + for _, f := range filters { + if f.Cluster != nil { + query = buildStringCondition("node.cluster", f.Cluster, query) + } + if f.SubCluster != nil { + query = buildStringCondition("node.subcluster", f.SubCluster, query) + } + if f.Hostname != nil { + query = buildStringCondition("node.hostname", f.Hostname, query) + } + if f.SchedulerState != nil { + query = query.Where("node_state.node_state = ?", f.SchedulerState) + } + if f.HealthState != nil { + query = query.Where("node_state.health_state = ?", f.HealthState) + } + } + return query +} + func (r *NodeRepository) FetchMetadata(hostname string, cluster string) (map[string]string, error) { start := time.Now() @@ -79,17 +114,16 @@ func (r *NodeRepository) FetchMetadata(hostname string, cluster string) (map[str func (r *NodeRepository) GetNode(hostname string, cluster string, withMeta bool) (*schema.Node, error) { node := &schema.Node{} - var timestamp int - if err := sq.Select("node.hostname", "node.cluster", "node.subcluster", "node_state.node_state", - "node_state.health_state", "MAX(node_state.time_stamp) as time"). - From("node_state"). - Join("node ON node_state.node_id = node.id"). + if err := sq.Select("node.hostname", "node.cluster", "node.subcluster", + "node_state.node_state", "node_state.health_state"). + From("node"). + Join("node_state ON node_state.node_id = node.id"). + Where(latestStateCondition()). Where("node.hostname = ?", hostname). Where("node.cluster = ?", cluster). - GroupBy("node_state.node_id"). RunWith(r.DB). 
- QueryRow().Scan(&node.Hostname, &node.Cluster, &node.SubCluster, &node.NodeState, &node.HealthState, ×tamp); err != nil { - cclog.Warnf("Error while querying node '%s' at time '%d' from database: %v", hostname, timestamp, err) + QueryRow().Scan(&node.Hostname, &node.Cluster, &node.SubCluster, &node.NodeState, &node.HealthState); err != nil { + cclog.Warnf("Error while querying node '%s' from database: %v", hostname, err) return nil, err } @@ -106,31 +140,28 @@ func (r *NodeRepository) GetNode(hostname string, cluster string, withMeta bool) return node, nil } -func (r *NodeRepository) GetNodeById(id int64, withMeta bool) (*schema.Node, error) { +func (r *NodeRepository) GetNodeByID(id int64, withMeta bool) (*schema.Node, error) { node := &schema.Node{} - var timestamp int - if err := sq.Select("node.hostname", "node.cluster", "node.subcluster", "node_state.node_state", - "node_state.health_state", "MAX(node_state.time_stamp) as time"). - From("node_state"). - Join("node ON node_state.node_id = node.id"). + if err := sq.Select("node.hostname", "node.cluster", "node.subcluster", + "node_state.node_state", "node_state.health_state"). + From("node"). + Join("node_state ON node_state.node_id = node.id"). + Where(latestStateCondition()). Where("node.id = ?", id). - GroupBy("node_state.node_id"). RunWith(r.DB). 
- QueryRow().Scan(&node.Hostname, &node.Cluster, &node.SubCluster, &node.NodeState, &node.HealthState, ×tamp); err != nil { - cclog.Warnf("Error while querying node ID '%d' at time '%d' from database: %v", id, timestamp, err) + QueryRow().Scan(&node.Hostname, &node.Cluster, &node.SubCluster, &node.NodeState, &node.HealthState); err != nil { + cclog.Warnf("Error while querying node ID '%d' from database: %v", id, err) return nil, err } - // NEEDS METADATA BY ID - // if withMeta { - // var err error - // var meta map[string]string - // if meta, err = r.FetchMetadata(hostname, cluster); err != nil { - // cclog.Warnf("Error while fetching metadata for node '%s'", hostname) - // return nil, err - // } - // node.MetaData = meta - // } + if withMeta { + meta, metaErr := r.FetchMetadata(node.Hostname, node.Cluster) + if metaErr != nil { + cclog.Warnf("Error while fetching metadata for node ID '%d': %v", id, metaErr) + return nil, metaErr + } + node.MetaData = meta + } return node, nil } @@ -166,9 +197,10 @@ func (r *NodeRepository) AddNode(node *schema.NodeDB) (int64, error) { } const NamedNodeStateInsert string = ` -INSERT INTO node_state (time_stamp, node_state, health_state, cpus_allocated, - memory_allocated, gpus_allocated, jobs_running, node_id) - VALUES (:time_stamp, :node_state, :health_state, :cpus_allocated, :memory_allocated, :gpus_allocated, :jobs_running, :node_id);` +INSERT INTO node_state (time_stamp, node_state, health_state, health_metrics, + cpus_allocated, memory_allocated, gpus_allocated, jobs_running, node_id) + VALUES (:time_stamp, :node_state, :health_state, :health_metrics, + :cpus_allocated, :memory_allocated, :gpus_allocated, :jobs_running, :node_id);` // TODO: Add real Monitoring Health State @@ -194,8 +226,7 @@ func (r *NodeRepository) UpdateNodeState(hostname string, cluster string, nodeSt return err } - cclog.Infof("Added node '%s' to database", hostname) - return nil + cclog.Debugf("Added node '%s' to database", hostname) } else { 
cclog.Warnf("Error while querying node '%v' from database", id) return err @@ -209,7 +240,7 @@ func (r *NodeRepository) UpdateNodeState(hostname string, cluster string, nodeSt cclog.Errorf("Error while adding node state for '%v' to database", hostname) return err } - cclog.Infof("Updated node state for '%s' in database", hostname) + cclog.Debugf("Updated node state for '%s' in database", hostname) return nil } @@ -222,6 +253,77 @@ func (r *NodeRepository) UpdateNodeState(hostname string, cluster string, nodeSt // return nil // } +// NodeStateWithNode combines a node state row with denormalized node info. +type NodeStateWithNode struct { + ID int64 `db:"id"` + TimeStamp int64 `db:"time_stamp"` + NodeState string `db:"node_state"` + HealthState string `db:"health_state"` + HealthMetrics string `db:"health_metrics"` + CpusAllocated int `db:"cpus_allocated"` + MemoryAllocated int64 `db:"memory_allocated"` + GpusAllocated int `db:"gpus_allocated"` + JobsRunning int `db:"jobs_running"` + Hostname string `db:"hostname"` + Cluster string `db:"cluster"` + SubCluster string `db:"subcluster"` +} + +// FindNodeStatesBefore returns all node_state rows with time_stamp < cutoff, +// joined with node info for denormalized archiving. +func (r *NodeRepository) FindNodeStatesBefore(cutoff int64) ([]NodeStateWithNode, error) { + rows, err := sq.Select( + "node_state.id", "node_state.time_stamp", "node_state.node_state", + "node_state.health_state", "node_state.health_metrics", + "node_state.cpus_allocated", "node_state.memory_allocated", + "node_state.gpus_allocated", "node_state.jobs_running", + "node.hostname", "node.cluster", "node.subcluster", + ). + From("node_state"). + Join("node ON node_state.node_id = node.id"). + Where(sq.Lt{"node_state.time_stamp": cutoff}). + Where("node_state.id NOT IN (SELECT ns2.id FROM node_state ns2 WHERE ns2.time_stamp = (SELECT MAX(ns3.time_stamp) FROM node_state ns3 WHERE ns3.node_id = ns2.node_id))"). 
+ OrderBy("node.cluster ASC", "node.subcluster ASC", "node.hostname ASC", "node_state.time_stamp ASC"). + RunWith(r.DB).Query() + if err != nil { + return nil, err + } + defer rows.Close() + + var result []NodeStateWithNode + for rows.Next() { + var ns NodeStateWithNode + var healthMetrics sql.NullString + if err := rows.Scan(&ns.ID, &ns.TimeStamp, &ns.NodeState, + &ns.HealthState, &healthMetrics, + &ns.CpusAllocated, &ns.MemoryAllocated, + &ns.GpusAllocated, &ns.JobsRunning, + &ns.Hostname, &ns.Cluster, &ns.SubCluster); err != nil { + return nil, err + } + ns.HealthMetrics = healthMetrics.String + result = append(result, ns) + } + return result, nil +} + +// DeleteNodeStatesBefore removes node_state rows with time_stamp < cutoff, +// but always preserves the row with the latest timestamp per node_id. +func (r *NodeRepository) DeleteNodeStatesBefore(cutoff int64) (int64, error) { + res, err := r.DB.Exec( + `DELETE FROM node_state WHERE time_stamp < ? + AND id NOT IN ( + SELECT id FROM node_state ns2 + WHERE ns2.time_stamp = (SELECT MAX(ns3.time_stamp) FROM node_state ns3 WHERE ns3.node_id = ns2.node_id) + )`, + cutoff, + ) + if err != nil { + return 0, err + } + return res.RowsAffected() +} + func (r *NodeRepository) DeleteNode(id int64) error { _, err := r.DB.Exec(`DELETE FROM node WHERE node.id = ?`, id) if err != nil { @@ -241,38 +343,17 @@ func (r *NodeRepository) QueryNodes( order *model.OrderByInput, // Currently unused! ) ([]*schema.Node, error) { query, qerr := AccessCheck(ctx, - sq.Select("hostname", "cluster", "subcluster", "node_state", "health_state", "MAX(time_stamp) as time"). + sq.Select("node.hostname", "node.cluster", "node.subcluster", + "node_state.node_state", "node_state.health_state"). From("node"). - Join("node_state ON node_state.node_id = node.id")) + Join("node_state ON node_state.node_id = node.id"). 
+ Where(latestStateCondition())) if qerr != nil { return nil, qerr } - for _, f := range filters { - if f.Cluster != nil { - query = buildStringCondition("cluster", f.Cluster, query) - } - if f.Subcluster != nil { - query = buildStringCondition("subcluster", f.Subcluster, query) - } - if f.Hostname != nil { - query = buildStringCondition("hostname", f.Hostname, query) - } - if f.SchedulerState != nil { - query = query.Where("node_state = ?", f.SchedulerState) - // Requires Additional time_stamp Filter: Else the last (past!) time_stamp with queried state will be returned - now := time.Now().Unix() - query = query.Where(sq.Gt{"time_stamp": (now - 60)}) - } - if f.HealthState != nil { - query = query.Where("health_state = ?", f.HealthState) - // Requires Additional time_stamp Filter: Else the last (past!) time_stamp with queried state will be returned - now := time.Now().Unix() - query = query.Where(sq.Gt{"time_stamp": (now - 60)}) - } - } - - query = query.GroupBy("node_id").OrderBy("hostname ASC") + query = applyNodeFilters(query, filters) + query = query.OrderBy("node.hostname ASC") if page != nil && page.ItemsPerPage != -1 { limit := uint64(page.ItemsPerPage) @@ -290,11 +371,10 @@ func (r *NodeRepository) QueryNodes( nodes := make([]*schema.Node, 0) for rows.Next() { node := schema.Node{} - var timestamp int if err := rows.Scan(&node.Hostname, &node.Cluster, &node.SubCluster, - &node.NodeState, &node.HealthState, ×tamp); err != nil { + &node.NodeState, &node.HealthState); err != nil { rows.Close() - cclog.Warnf("Error while scanning rows (QueryNodes) at time '%d'", timestamp) + cclog.Warn("Error while scanning rows (QueryNodes)") return nil, err } nodes = append(nodes, &node) @@ -386,73 +466,115 @@ func (r *NodeRepository) QueryNodesWithMeta( return nodes, nil } -// CountNodes returns the total matched nodes based on a node filter. It always operates -// on the last state (largest timestamp). 
-func (r *NodeRepository) CountNodes( +// QueryNodesWithMeta returns a list of nodes based on a node filter. It always operates +// on the last state (largest timestamp). It includes both (!) optional JSON column data +func (r *NodeRepository) QueryNodesWithMeta( ctx context.Context, filters []*model.NodeFilter, -) (int, error) { + page *model.PageRequest, + order *model.OrderByInput, // Currently unused! +) ([]*schema.Node, error) { query, qerr := AccessCheck(ctx, - sq.Select("time_stamp", "count(*) as countRes"). + sq.Select("node.hostname", "node.cluster", "node.subcluster", + "node_state.node_state", "node_state.health_state", + "node.meta_data", "node_state.health_metrics"). From("node"). - Join("node_state ON node_state.node_id = node.id")) + Join("node_state ON node_state.node_id = node.id"). + Where(latestStateCondition())) if qerr != nil { - return 0, qerr + return nil, qerr } - for _, f := range filters { - if f.Cluster != nil { - query = buildStringCondition("cluster", f.Cluster, query) - } - if f.Subcluster != nil { - query = buildStringCondition("subcluster", f.Subcluster, query) - } - if f.Hostname != nil { - query = buildStringCondition("hostname", f.Hostname, query) - } - if f.SchedulerState != nil { - query = query.Where("node_state = ?", f.SchedulerState) - // Requires Additional time_stamp Filter: Else the last (past!) time_stamp with queried state will be returned - now := time.Now().Unix() - query = query.Where(sq.Gt{"time_stamp": (now - 60)}) - } - if f.HealthState != nil { - query = query.Where("health_state = ?", f.HealthState) - // Requires Additional time_stamp Filter: Else the last (past!) 
time_stamp with queried state will be returned - now := time.Now().Unix() - query = query.Where(sq.Gt{"time_stamp": (now - 60)}) - } - } + query = applyNodeFilters(query, filters) + query = query.OrderBy("node.hostname ASC") - query = query.GroupBy("time_stamp").OrderBy("time_stamp DESC").Limit(1) + if page != nil && page.ItemsPerPage != -1 { + limit := uint64(page.ItemsPerPage) + query = query.Offset((uint64(page.Page) - 1) * limit).Limit(limit) + } rows, err := query.RunWith(r.stmtCache).Query() if err != nil { + queryString, queryVars, _ := query.ToSql() + cclog.Errorf("Error while running query '%s' %v: %v", queryString, queryVars, err) + return nil, err + } + + nodes := make([]*schema.Node, 0) + for rows.Next() { + node := schema.Node{} + RawMetaData := make([]byte, 0) + RawMetricHealth := make([]byte, 0) + + if err := rows.Scan(&node.Hostname, &node.Cluster, &node.SubCluster, + &node.NodeState, &node.HealthState, &RawMetaData, &RawMetricHealth); err != nil { + rows.Close() + cclog.Warn("Error while scanning rows (QueryNodes)") + return nil, err + } + + if len(RawMetaData) == 0 { + node.MetaData = nil + } else { + metaData := make(map[string]string) + if err := json.Unmarshal(RawMetaData, &metaData); err != nil { + cclog.Warn("Error while unmarshaling raw metadata json") + return nil, err + } + node.MetaData = metaData + } + + if len(RawMetricHealth) == 0 { + node.HealthData = nil + } else { + healthData := make(map[string][]string) + if err := json.Unmarshal(RawMetricHealth, &healthData); err != nil { + cclog.Warn("Error while unmarshaling raw healthdata json") + return nil, err + } + node.HealthData = healthData + } + + nodes = append(nodes, &node) + } + + return nodes, nil +} + +// CountNodes returns the total matched nodes based on a node filter. It always operates +// on the last state (largest timestamp) per node. 
+func (r *NodeRepository) CountNodes( + ctx context.Context, + filters []*model.NodeFilter, +) (int, error) { + query, qerr := AccessCheck(ctx, + sq.Select("COUNT(*)"). + From("node"). + Join("node_state ON node_state.node_id = node.id"). + Where(latestStateCondition())) + if qerr != nil { + return 0, qerr + } + + query = applyNodeFilters(query, filters) + + var count int + if err := query.RunWith(r.stmtCache).QueryRow().Scan(&count); err != nil { queryString, queryVars, _ := query.ToSql() cclog.Errorf("Error while running query '%s' %v: %v", queryString, queryVars, err) return 0, err } - var totalNodes int - for rows.Next() { - var timestamp int - if err := rows.Scan(×tamp, &totalNodes); err != nil { - rows.Close() - cclog.Warnf("Error while scanning rows (CountNodes) at time '%d'", timestamp) - return 0, err - } - } - - return totalNodes, nil + return count, nil } func (r *NodeRepository) ListNodes(cluster string) ([]*schema.Node, error) { - q := sq.Select("node.hostname", "node.cluster", "node.subcluster", "node_state.node_state", - "node_state.health_state", "MAX(node_state.time_stamp) as time"). + q := sq.Select("node.hostname", "node.cluster", "node.subcluster", + "node_state.node_state", "node_state.health_state"). From("node"). Join("node_state ON node_state.node_id = node.id"). + Where(latestStateCondition()). Where("node.cluster = ?", cluster). - GroupBy("node_state.node_id"). 
OrderBy("node.hostname ASC") rows, err := q.RunWith(r.DB).Query() @@ -464,10 +586,9 @@ func (r *NodeRepository) ListNodes(cluster string) ([]*schema.Node, error) { defer rows.Close() for rows.Next() { node := &schema.Node{} - var timestamp int if err := rows.Scan(&node.Hostname, &node.Cluster, - &node.SubCluster, &node.NodeState, &node.HealthState, ×tamp); err != nil { - cclog.Warnf("Error while scanning node list (ListNodes) at time '%d'", timestamp) + &node.SubCluster, &node.NodeState, &node.HealthState); err != nil { + cclog.Warn("Error while scanning node list (ListNodes)") return nil, err } @@ -478,11 +599,11 @@ func (r *NodeRepository) ListNodes(cluster string) ([]*schema.Node, error) { } func (r *NodeRepository) MapNodes(cluster string) (map[string]string, error) { - q := sq.Select("node.hostname", "node_state.node_state", "MAX(node_state.time_stamp) as time"). + q := sq.Select("node.hostname", "node_state.node_state"). From("node"). Join("node_state ON node_state.node_id = node.id"). + Where(latestStateCondition()). Where("node.cluster = ?", cluster). - GroupBy("node_state.node_id"). OrderBy("node.hostname ASC") rows, err := q.RunWith(r.DB).Query() @@ -495,9 +616,8 @@ func (r *NodeRepository) MapNodes(cluster string) (map[string]string, error) { defer rows.Close() for rows.Next() { var hostname, nodestate string - var timestamp int - if err := rows.Scan(&hostname, &nodestate, ×tamp); err != nil { - cclog.Warnf("Error while scanning node list (MapNodes) at time '%d'", timestamp) + if err := rows.Scan(&hostname, &nodestate); err != nil { + cclog.Warn("Error while scanning node list (MapNodes)") return nil, err } @@ -509,37 +629,15 @@ func (r *NodeRepository) MapNodes(cluster string) (map[string]string, error) { func (r *NodeRepository) CountStates(ctx context.Context, filters []*model.NodeFilter, column string) ([]*model.NodeStates, error) { query, qerr := AccessCheck(ctx, - sq.Select(column, "COUNT(*) as count"). + sq.Select(column). From("node"). 
Join("node_state ON node_state.node_id = node.id"). - Where(latestStateCondition()). - GroupBy(column)) + Where(latestStateCondition())) if qerr != nil { return nil, qerr } - query = query.Join("node_state ON node_state.node_id = node.id") - - for _, f := range filters { - if f.Hostname != nil { - query = buildStringCondition("hostname", f.Hostname, query) - } - if f.Cluster != nil { - query = buildStringCondition("cluster", f.Cluster, query) - } - if f.Subcluster != nil { - query = buildStringCondition("subcluster", f.Subcluster, query) - } - if f.SchedulerState != nil { - query = query.Where("node_state = ?", f.SchedulerState) - } - if f.HealthState != nil { - query = query.Where("health_state = ?", f.HealthState) - } - } - - // Add Group and Order - query = query.GroupBy("hostname").OrderBy("hostname DESC") + query = applyNodeFilters(query, filters) rows, err := query.RunWith(r.stmtCache).Query() if err != nil { @@ -549,6 +647,18 @@ func (r *NodeRepository) CountStates(ctx context.Context, filters []*model.NodeF } defer rows.Close() + stateMap := map[string]int{} + for rows.Next() { + var state string + if err := rows.Scan(&state); err != nil { + rows.Close() + cclog.Warn("Error while scanning rows (CountStates)") + return nil, err + } + + stateMap[state] += 1 + } + nodes := make([]*model.NodeStates, 0) for rows.Next() { var state string @@ -587,8 +697,8 @@ func (r *NodeRepository) CountStatesTimed(ctx context.Context, filters []*model. if f.Cluster != nil { query = buildStringCondition("cluster", f.Cluster, query) } - if f.Subcluster != nil { - query = buildStringCondition("subcluster", f.Subcluster, query) + if f.SubCluster != nil { + query = buildStringCondition("subcluster", f.SubCluster, query) } if f.SchedulerState != nil { query = query.Where("node_state = ?", f.SchedulerState) @@ -640,6 +750,132 @@ func (r *NodeRepository) CountStatesTimed(ctx context.Context, filters []*model. 
return timedStates, nil } +func (r *NodeRepository) GetNodesForList( + ctx context.Context, + cluster string, + subCluster string, + stateFilter string, + nodeFilter string, + page *model.PageRequest, +) ([]string, map[string]string, int, bool, error) { + // Init Return Vars + nodes := make([]string, 0) + stateMap := make(map[string]string) + countNodes := 0 + hasNextPage := false + + // Build Filters + queryFilters := make([]*model.NodeFilter, 0) + if cluster != "" { + queryFilters = append(queryFilters, &model.NodeFilter{Cluster: &model.StringInput{Eq: &cluster}}) + } + if subCluster != "" { + queryFilters = append(queryFilters, &model.NodeFilter{SubCluster: &model.StringInput{Eq: &subCluster}}) + } + if nodeFilter != "" && stateFilter != "notindb" { + queryFilters = append(queryFilters, &model.NodeFilter{Hostname: &model.StringInput{Contains: &nodeFilter}}) + } + if stateFilter != "all" && stateFilter != "notindb" { + queryState := schema.SchedulerState(stateFilter) + queryFilters = append(queryFilters, &model.NodeFilter{SchedulerState: &queryState}) + } + // if healthFilter != "all" { + // filters = append(filters, &model.NodeFilter{HealthState: &healthFilter}) + // } + + // Special Case: Disable Paging for missing nodes filter, save IPP for later + var backupItems int + if stateFilter == "notindb" { + backupItems = page.ItemsPerPage + page.ItemsPerPage = -1 + } + + // Query Nodes From DB + rawNodes, serr := r.QueryNodes(ctx, queryFilters, page, nil) // Order not Used + if serr != nil { + cclog.Warn("error while loading node database data (Resolver.NodeMetricsList)") + return nil, nil, 0, false, serr + } + + // Intermediate Node Result Info + for _, node := range rawNodes { + if node == nil { + continue + } + nodes = append(nodes, node.Hostname) + stateMap[node.Hostname] = string(node.NodeState) + } + + // Special Case: Find Nodes not in DB node table but in metricStore only + if stateFilter == "notindb" { + // Reapply Original Paging + page.ItemsPerPage = 
backupItems + // Get Nodes From Topology + var topoNodes []string + if subCluster != "" { + scNodes := archive.NodeLists[cluster][subCluster] + topoNodes = scNodes.PrintList() + } else { + subClusterNodeLists := archive.NodeLists[cluster] + for _, nodeList := range subClusterNodeLists { + topoNodes = append(topoNodes, nodeList.PrintList()...) + } + } + // Compare to all nodes from cluster/subcluster in DB + var missingNodes []string + for _, scanNode := range topoNodes { + if !slices.Contains(nodes, scanNode) { + missingNodes = append(missingNodes, scanNode) + } + } + // Filter nodes by name + if nodeFilter != "" { + filteredNodesByName := []string{} + for _, missingNode := range missingNodes { + if strings.Contains(missingNode, nodeFilter) { + filteredNodesByName = append(filteredNodesByName, missingNode) + } + } + missingNodes = filteredNodesByName + } + // Sort Missing Nodes Alphanumerically + slices.Sort(missingNodes) + // Total Missing + countNodes = len(missingNodes) + // Apply paging + if countNodes > page.ItemsPerPage { + start := (page.Page - 1) * page.ItemsPerPage + end := start + page.ItemsPerPage + if end > countNodes { + end = countNodes + hasNextPage = false + } else { + hasNextPage = true + } + nodes = missingNodes[start:end] + } else { + nodes = missingNodes + } + + } else { + // DB Nodes: Count and derive hasNextPage from count + var cerr error + countNodes, cerr = r.CountNodes(ctx, queryFilters) + if cerr != nil { + cclog.Warn("error while counting node database data (Resolver.NodeMetricsList)") + return nil, nil, 0, false, cerr + } + hasNextPage = page.Page*page.ItemsPerPage < countNodes + } + + // Fallback for non-init'd node table in DB; Ignores stateFilter + if stateFilter == "all" && countNodes == 0 { + nodes, countNodes, hasNextPage = getNodesFromTopol(cluster, subCluster, nodeFilter, page) + } + + return nodes, stateMap, countNodes, hasNextPage, nil +} + func AccessCheck(ctx context.Context, query sq.SelectBuilder) (sq.SelectBuilder, error) 
{ user := GetUserFromContext(ctx) return AccessCheckWithUser(user, query) @@ -661,3 +897,51 @@ func AccessCheckWithUser(user *schema.User, query sq.SelectBuilder) (sq.SelectBu return qnil, fmt.Errorf("user has no or unknown roles") } } + +func getNodesFromTopol(cluster string, subCluster string, nodeFilter string, page *model.PageRequest) ([]string, int, bool) { + // 0) Init additional vars + hasNextPage := false + totalNodes := 0 + + // 1) Get list of all nodes + var topolNodes []string + if subCluster != "" { + scNodes := archive.NodeLists[cluster][subCluster] + topolNodes = scNodes.PrintList() + } else { + subClusterNodeLists := archive.NodeLists[cluster] + for _, nodeList := range subClusterNodeLists { + topolNodes = append(topolNodes, nodeList.PrintList()...) + } + } + + // 2) Filter nodes + if nodeFilter != "" { + filteredNodes := []string{} + for _, node := range topolNodes { + if strings.Contains(node, nodeFilter) { + filteredNodes = append(filteredNodes, node) + } + } + topolNodes = filteredNodes + } + + // 2.1) Count total nodes && Sort nodes -> Sorting invalidated after ccms return ... 
+ totalNodes = len(topolNodes) + sort.Strings(topolNodes) + + // 3) Apply paging + if len(topolNodes) > page.ItemsPerPage { + start := (page.Page - 1) * page.ItemsPerPage + end := start + page.ItemsPerPage + if end >= len(topolNodes) { + end = len(topolNodes) + hasNextPage = false + } else { + hasNextPage = true + } + topolNodes = topolNodes[start:end] + } + + return topolNodes, totalNodes, hasNextPage +} diff --git a/internal/repository/node_test.go b/internal/repository/node_test.go index b42e09b8..d1e86b9a 100644 --- a/internal/repository/node_test.go +++ b/internal/repository/node_test.go @@ -15,9 +15,9 @@ import ( "github.com/ClusterCockpit/cc-backend/internal/config" "github.com/ClusterCockpit/cc-backend/pkg/archive" - ccconf "github.com/ClusterCockpit/cc-lib/ccConfig" - cclog "github.com/ClusterCockpit/cc-lib/ccLogger" - "github.com/ClusterCockpit/cc-lib/schema" + ccconf "github.com/ClusterCockpit/cc-lib/v2/ccConfig" + cclog "github.com/ClusterCockpit/cc-lib/v2/ccLogger" + "github.com/ClusterCockpit/cc-lib/v2/schema" _ "github.com/mattn/go-sqlite3" ) @@ -26,7 +26,7 @@ func nodeTestSetup(t *testing.T) { "main": { "addr": "0.0.0.0:8080", "validate": false, - "apiAllowedIPs": [ + "api-allowed-ips": [ "*" ] }, @@ -38,18 +38,7 @@ func nodeTestSetup(t *testing.T) { "jwts": { "max-age": "2m" } - }, - "clusters": [ - { - "name": "testcluster", - "metricDataRepository": {"kind": "test", "url": "bla:8081"}, - "filterRanges": { - "numNodes": { "from": 1, "to": 64 }, - "duration": { "from": 0, "to": 86400 }, - "startTime": { "from": "2022-01-01T00:00:00Z", "to": null } - } } - ] }` const testclusterJSON = `{ "name": "testcluster", @@ -130,7 +119,7 @@ func nodeTestSetup(t *testing.T) { } dbfilepath := filepath.Join(tmpdir, "test.db") - err := MigrateDB("sqlite3", dbfilepath) + err := MigrateDB(dbfilepath) if err != nil { t.Fatal(err) } @@ -144,19 +133,22 @@ func nodeTestSetup(t *testing.T) { // Load and check main configuration if cfg := ccconf.GetPackageConfig("main"); 
cfg != nil { - if clustercfg := ccconf.GetPackageConfig("clusters"); clustercfg != nil { - config.Init(cfg, clustercfg) - } else { - cclog.Abort("Cluster configuration must be present") - } + config.Init(cfg) } else { cclog.Abort("Main configuration must be present") } archiveCfg := fmt.Sprintf("{\"kind\": \"file\",\"path\": \"%s\"}", jobarchive) - Connect("sqlite3", dbfilepath) + if err := ResetConnection(); err != nil { + t.Fatal(err) + } + t.Cleanup(func() { + ResetConnection() + }) - if err := archive.Init(json.RawMessage(archiveCfg), config.Keys.DisableArchive); err != nil { + Connect(dbfilepath) + + if err := archive.Init(json.RawMessage(archiveCfg)); err != nil { t.Fatal(err) } } @@ -164,8 +156,12 @@ func nodeTestSetup(t *testing.T) { func TestUpdateNodeState(t *testing.T) { nodeTestSetup(t) + repo := GetNodeRepository() + now := time.Now().Unix() + nodeState := schema.NodeStateDB{ - TimeStamp: time.Now().Unix(), NodeState: "allocated", + TimeStamp: now, + NodeState: "allocated", CpusAllocated: 72, MemoryAllocated: 480, GpusAllocated: 0, @@ -173,18 +169,152 @@ func TestUpdateNodeState(t *testing.T) { JobsRunning: 1, } - repo := GetNodeRepository() err := repo.UpdateNodeState("host124", "testcluster", &nodeState) if err != nil { - return + t.Fatal(err) } node, err := repo.GetNode("host124", "testcluster", false) if err != nil { - return + t.Fatal(err) } if node.NodeState != "allocated" { t.Errorf("wrong node state\ngot: %s \nwant: allocated ", node.NodeState) } + + t.Run("FindBeforeEmpty", func(t *testing.T) { + // Only the current-timestamp row exists, so nothing should be found before now + rows, err := repo.FindNodeStatesBefore(now) + if err != nil { + t.Fatal(err) + } + if len(rows) != 0 { + t.Errorf("expected 0 rows, got %d", len(rows)) + } + }) + + t.Run("DeleteOldRows", func(t *testing.T) { + // Insert 2 more old rows for host124 + for i, ts := range []int64{now - 7200, now - 3600} { + ns := schema.NodeStateDB{ + TimeStamp: ts, + NodeState: 
"allocated", + HealthState: schema.MonitoringStateFull, + CpusAllocated: 72, + MemoryAllocated: 480, + JobsRunning: i, + } + if err := repo.UpdateNodeState("host124", "testcluster", &ns); err != nil { + t.Fatal(err) + } + } + + // Delete rows older than 30 minutes + cutoff := now - 1800 + cnt, err := repo.DeleteNodeStatesBefore(cutoff) + if err != nil { + t.Fatal(err) + } + + // Should delete the 2 old rows + if cnt != 2 { + t.Errorf("expected 2 deleted rows, got %d", cnt) + } + + // Latest row should still exist + node, err := repo.GetNode("host124", "testcluster", false) + if err != nil { + t.Fatal(err) + } + if node.NodeState != "allocated" { + t.Errorf("expected node state 'allocated', got %s", node.NodeState) + } + }) + + t.Run("PreservesLatestPerNode", func(t *testing.T) { + // Insert a single old row for host125 — it's the latest per node so it must survive + ns := schema.NodeStateDB{ + TimeStamp: now - 7200, + NodeState: "idle", + HealthState: schema.MonitoringStateFull, + CpusAllocated: 0, + MemoryAllocated: 0, + JobsRunning: 0, + } + if err := repo.UpdateNodeState("host125", "testcluster", &ns); err != nil { + t.Fatal(err) + } + + // Delete everything older than now — the latest per node should be preserved + _, err := repo.DeleteNodeStatesBefore(now) + if err != nil { + t.Fatal(err) + } + + // The latest row for host125 must still exist + node, err := repo.GetNode("host125", "testcluster", false) + if err != nil { + t.Fatal(err) + } + if node.NodeState != "idle" { + t.Errorf("expected node state 'idle', got %s", node.NodeState) + } + + // Verify exactly 1 row remains for host125 + var countAfter int + if err := repo.DB.QueryRow( + "SELECT COUNT(*) FROM node_state WHERE node_id = (SELECT id FROM node WHERE hostname = 'host125')"). 
+ Scan(&countAfter); err != nil { + t.Fatal(err) + } + if countAfter != 1 { + t.Errorf("expected 1 row remaining for host125, got %d", countAfter) + } + }) + + t.Run("FindBeforeWithJoin", func(t *testing.T) { + // Insert old and current rows for host123 + for _, ts := range []int64{now - 7200, now} { + ns := schema.NodeStateDB{ + TimeStamp: ts, + NodeState: "allocated", + HealthState: schema.MonitoringStateFull, + CpusAllocated: 8, + MemoryAllocated: 1024, + GpusAllocated: 1, + JobsRunning: 1, + } + if err := repo.UpdateNodeState("host123", "testcluster", &ns); err != nil { + t.Fatal(err) + } + } + + // Find rows older than 30 minutes, excluding latest per node + cutoff := now - 1800 + rows, err := repo.FindNodeStatesBefore(cutoff) + if err != nil { + t.Fatal(err) + } + + // Should find the old host123 row + found := false + for _, row := range rows { + if row.Hostname == "host123" && row.TimeStamp == now-7200 { + found = true + if row.Cluster != "testcluster" { + t.Errorf("expected cluster 'testcluster', got %s", row.Cluster) + } + if row.SubCluster != "sc1" { + t.Errorf("expected subcluster 'sc1', got %s", row.SubCluster) + } + if row.CpusAllocated != 8 { + t.Errorf("expected cpus_allocated 8, got %d", row.CpusAllocated) + } + } + } + if !found { + t.Errorf("expected to find old host123 row among %d results", len(rows)) + } + }) } diff --git a/internal/repository/repository_test.go b/internal/repository/repository_test.go index 5603c31c..b9496143 100644 --- a/internal/repository/repository_test.go +++ b/internal/repository/repository_test.go @@ -6,11 +6,13 @@ package repository import ( "context" + "os" + "path/filepath" "testing" "github.com/ClusterCockpit/cc-backend/internal/graph/model" - cclog "github.com/ClusterCockpit/cc-lib/ccLogger" - "github.com/ClusterCockpit/cc-lib/schema" + cclog "github.com/ClusterCockpit/cc-lib/v2/ccLogger" + "github.com/ClusterCockpit/cc-lib/v2/schema" _ "github.com/mattn/go-sqlite3" ) @@ -46,7 +48,7 @@ func BenchmarkSelect1(b 
*testing.B) { } func BenchmarkDB_FindJobById(b *testing.B) { - var jobId int64 = 1677322 + var jobID int64 = 1677322 b.Run("FindJobById", func(b *testing.B) { db := setup(b) @@ -55,7 +57,7 @@ func BenchmarkDB_FindJobById(b *testing.B) { b.RunParallel(func(pb *testing.PB) { for pb.Next() { - _, err := db.FindById(getContext(b), jobId) + _, err := db.FindByID(getContext(b), jobID) noErr(b, err) } }) @@ -63,7 +65,7 @@ func BenchmarkDB_FindJobById(b *testing.B) { } func BenchmarkDB_FindJob(b *testing.B) { - var jobId int64 = 107266 + var jobID int64 = 107266 var startTime int64 = 1657557241 cluster := "fritz" @@ -74,7 +76,7 @@ func BenchmarkDB_FindJob(b *testing.B) { b.RunParallel(func(pb *testing.PB) { for pb.Next() { - _, err := db.Find(&jobId, &cluster, &startTime) + _, err := db.Find(&jobID, &cluster, &startTime) noErr(b, err) } }) @@ -148,10 +150,24 @@ func getContext(tb testing.TB) context.Context { func setup(tb testing.TB) *JobRepository { tb.Helper() cclog.Init("warn", true) - dbfile := "testdata/job.db" - err := MigrateDB("sqlite3", dbfile) + + // Copy test DB to a temp file for test isolation + srcData, err := os.ReadFile("testdata/job.db") noErr(tb, err) - Connect("sqlite3", dbfile) + dbfile := filepath.Join(tb.TempDir(), "job.db") + err = os.WriteFile(dbfile, srcData, 0o644) + noErr(tb, err) + + // Reset singletons so Connect uses the new temp DB + err = ResetConnection() + noErr(tb, err) + tb.Cleanup(func() { + ResetConnection() + }) + + err = MigrateDB(dbfile) + noErr(tb, err) + Connect(dbfile) return GetJobRepository() } diff --git a/internal/repository/stats.go b/internal/repository/stats.go index 4ca8aa00..4a61f62d 100644 --- a/internal/repository/stats.go +++ b/internal/repository/stats.go @@ -2,6 +2,44 @@ // All rights reserved. This file is part of cc-backend. // Use of this source code is governed by a MIT-style // license that can be found in the LICENSE file. 
+ +// This file contains job statistics and histogram generation functionality for the JobRepository. +// +// # Job Statistics +// +// The statistics methods provide aggregated metrics about jobs including total jobs, users, +// walltime, and resource usage (nodes, cores, accelerators). Statistics can be computed: +// - Overall (JobsStats): Single aggregate across all matching jobs +// - Grouped (JobsStatsGrouped): Aggregated by user, project, cluster, or subcluster +// - Counts (JobCountGrouped, AddJobCount): Simple job counts with optional filtering +// +// All statistics methods support filtering via JobFilter and respect security contexts. +// +// # Histograms +// +// Histogram methods generate distribution data for visualization: +// - Duration, nodes, cores, accelerators (AddHistograms) +// - Job metrics like CPU load, memory usage (AddMetricHistograms) +// +// Histograms use intelligent binning: +// - Duration: Variable bin sizes (1m, 10m, 1h, 6h, 12h, 24h) with zero-padding +// - Resources: Natural value-based bins +// - Metrics: Normalized to peak values with configurable bin counts +// +// # Running vs. 
Completed Jobs +// +// Statistics handle running jobs specially: +// - Duration calculated as (now - start_time) for running jobs +// - Metric histograms for running jobs load data from metric backend instead of footprint +// - Job state filtering distinguishes running/completed jobs +// +// # Performance Considerations +// +// - All queries use prepared statements via stmtCache +// - Complex aggregations use SQL for efficiency +// - Histogram pre-initialization ensures consistent bin ranges +// - Metric histogram queries limited to 5000 jobs for running job analysis + package repository import ( @@ -12,14 +50,16 @@ import ( "github.com/ClusterCockpit/cc-backend/internal/config" "github.com/ClusterCockpit/cc-backend/internal/graph/model" - "github.com/ClusterCockpit/cc-backend/internal/metricDataDispatcher" + "github.com/ClusterCockpit/cc-backend/internal/metricdispatch" "github.com/ClusterCockpit/cc-backend/pkg/archive" - cclog "github.com/ClusterCockpit/cc-lib/ccLogger" - "github.com/ClusterCockpit/cc-lib/schema" + cclog "github.com/ClusterCockpit/cc-lib/v2/ccLogger" + "github.com/ClusterCockpit/cc-lib/v2/schema" sq "github.com/Masterminds/squirrel" ) -// GraphQL validation should make sure that no unkown values can be specified. +// groupBy2column maps GraphQL Aggregate enum values to their corresponding database column names. +// Used by JobsStatsGrouped and JobCountGrouped to translate user-facing grouping dimensions +// into SQL GROUP BY clauses. GraphQL validation ensures only valid enum values are accepted. var groupBy2column = map[model.Aggregate]string{ model.AggregateUser: "job.hpc_user", model.AggregateProject: "job.project", @@ -27,6 +67,9 @@ var groupBy2column = map[model.Aggregate]string{ model.AggregateSubcluster: "job.subcluster", } +// sortBy2column maps GraphQL SortByAggregate enum values to their corresponding computed column names. +// Used by JobsStatsGrouped to translate sort preferences into SQL ORDER BY clauses. 
+// Column names match the AS aliases used in buildStatsQuery. var sortBy2column = map[model.SortByAggregate]string{ model.SortByAggregateTotaljobs: "totalJobs", model.SortByAggregateTotalusers: "totalUsers", @@ -39,6 +82,21 @@ var sortBy2column = map[model.SortByAggregate]string{ model.SortByAggregateTotalacchours: "totalAccHours", } +// buildCountQuery constructs a SQL query to count jobs with optional grouping and filtering. +// +// Parameters: +// - filter: Job filters to apply (cluster, user, time range, etc.) +// - kind: Special filter - "running" for running jobs only, "short" for jobs under threshold +// - col: Column name to GROUP BY; empty string for total count without grouping +// +// Returns a SelectBuilder that produces either: +// - Single count: COUNT(job.id) when col is empty +// - Grouped counts: col, COUNT(job.id) when col is specified +// +// The kind parameter enables counting specific job categories: +// - "running": Only jobs with job_state = 'running' +// - "short": Only jobs with duration < ShortRunningJobsDuration config value +// - empty: All jobs matching filters func (r *JobRepository) buildCountQuery( filter []*model.JobFilter, kind string, @@ -47,10 +105,8 @@ func (r *JobRepository) buildCountQuery( var query sq.SelectBuilder if col != "" { - // Scan columns: id, cnt query = sq.Select(col, "COUNT(job.id)").From("job").GroupBy(col) } else { - // Scan columns: cnt query = sq.Select("COUNT(job.id)").From("job") } @@ -68,42 +124,58 @@ func (r *JobRepository) buildCountQuery( return query } +// buildStatsQuery constructs a SQL query to compute comprehensive job statistics with optional grouping. +// +// Parameters: +// - filter: Job filters to apply (cluster, user, time range, etc.) 
+// - col: Column name to GROUP BY; empty string for overall statistics without grouping +// +// Returns a SelectBuilder that produces comprehensive statistics: +// - totalJobs: Count of jobs +// - totalUsers: Count of distinct users (always 0 when grouping by user) +// - totalWalltime: Sum of job durations in hours +// - totalNodes: Sum of nodes used across all jobs +// - totalNodeHours: Sum of (duration × num_nodes) in hours +// - totalCores: Sum of hardware threads used across all jobs +// - totalCoreHours: Sum of (duration × num_hwthreads) in hours +// - totalAccs: Sum of accelerators used across all jobs +// - totalAccHours: Sum of (duration × num_acc) in hours +// +// Special handling: +// - Running jobs: Duration calculated as (now - start_time) instead of stored duration +// - Grouped queries: Also select grouping column and user's display name from hpc_user table +// - All time values converted from seconds to hours (÷ 3600) and rounded func (r *JobRepository) buildStatsQuery( filter []*model.JobFilter, col string, ) sq.SelectBuilder { var query sq.SelectBuilder - castType := r.getCastType() - - // fmt.Sprintf(`CAST(ROUND((CASE WHEN job.job_state = "running" THEN %d - job.start_time ELSE job.duration END) / 3600) as %s) as value`, time.Now().Unix(), castType) if col != "" { - // Scan columns: id, name, totalJobs, totalUsers, totalWalltime, totalNodes, totalNodeHours, totalCores, totalCoreHours, totalAccs, totalAccHours query = sq.Select( col, "name", "COUNT(job.id) as totalJobs", "COUNT(DISTINCT job.hpc_user) AS totalUsers", - fmt.Sprintf(`CAST(ROUND(SUM((CASE WHEN job.job_state = "running" THEN %d - job.start_time ELSE job.duration END)) / 3600) as %s) as totalWalltime`, time.Now().Unix(), castType), - fmt.Sprintf(`CAST(SUM(job.num_nodes) as %s) as totalNodes`, castType), - fmt.Sprintf(`CAST(ROUND(SUM((CASE WHEN job.job_state = "running" THEN %d - job.start_time ELSE job.duration END) * job.num_nodes) / 3600) as %s) as totalNodeHours`, time.Now().Unix(), 
castType), - fmt.Sprintf(`CAST(SUM(job.num_hwthreads) as %s) as totalCores`, castType), - fmt.Sprintf(`CAST(ROUND(SUM((CASE WHEN job.job_state = "running" THEN %d - job.start_time ELSE job.duration END) * job.num_hwthreads) / 3600) as %s) as totalCoreHours`, time.Now().Unix(), castType), - fmt.Sprintf(`CAST(SUM(job.num_acc) as %s) as totalAccs`, castType), - fmt.Sprintf(`CAST(ROUND(SUM((CASE WHEN job.job_state = "running" THEN %d - job.start_time ELSE job.duration END) * job.num_acc) / 3600) as %s) as totalAccHours`, time.Now().Unix(), castType), + fmt.Sprintf(`CAST(ROUND(SUM((CASE WHEN job.job_state = "running" THEN %d - job.start_time ELSE job.duration END)) / 3600) as int) as totalWalltime`, time.Now().Unix()), + `CAST(SUM(job.num_nodes) as int) as totalNodes`, + fmt.Sprintf(`CAST(ROUND(SUM((CASE WHEN job.job_state = "running" THEN %d - job.start_time ELSE job.duration END) * job.num_nodes) / 3600) as int) as totalNodeHours`, time.Now().Unix()), + `CAST(SUM(job.num_hwthreads) as int) as totalCores`, + fmt.Sprintf(`CAST(ROUND(SUM((CASE WHEN job.job_state = "running" THEN %d - job.start_time ELSE job.duration END) * job.num_hwthreads) / 3600) as int) as totalCoreHours`, time.Now().Unix()), + `CAST(SUM(job.num_acc) as int) as totalAccs`, + fmt.Sprintf(`CAST(ROUND(SUM((CASE WHEN job.job_state = "running" THEN %d - job.start_time ELSE job.duration END) * job.num_acc) / 3600) as int) as totalAccHours`, time.Now().Unix()), ).From("job").LeftJoin("hpc_user ON hpc_user.username = job.hpc_user").GroupBy(col) } else { - // Scan columns: totalJobs, totalUsers, totalWalltime, totalNodes, totalNodeHours, totalCores, totalCoreHours, totalAccs, totalAccHours query = sq.Select( "COUNT(job.id) as totalJobs", "COUNT(DISTINCT job.hpc_user) AS totalUsers", - fmt.Sprintf(`CAST(ROUND(SUM((CASE WHEN job.job_state = "running" THEN %d - job.start_time ELSE job.duration END)) / 3600) as %s)`, time.Now().Unix(), castType), - fmt.Sprintf(`CAST(SUM(job.num_nodes) as %s)`, castType), - 
fmt.Sprintf(`CAST(ROUND(SUM((CASE WHEN job.job_state = "running" THEN %d - job.start_time ELSE job.duration END) * job.num_nodes) / 3600) as %s)`, time.Now().Unix(), castType), - fmt.Sprintf(`CAST(SUM(job.num_hwthreads) as %s)`, castType), - fmt.Sprintf(`CAST(ROUND(SUM((CASE WHEN job.job_state = "running" THEN %d - job.start_time ELSE job.duration END) * job.num_hwthreads) / 3600) as %s)`, time.Now().Unix(), castType), - fmt.Sprintf(`CAST(SUM(job.num_acc) as %s)`, castType), - fmt.Sprintf(`CAST(ROUND(SUM((CASE WHEN job.job_state = "running" THEN %d - job.start_time ELSE job.duration END) * job.num_acc) / 3600) as %s)`, time.Now().Unix(), castType), + fmt.Sprintf(`CAST(ROUND(SUM((CASE WHEN job.job_state = "running" THEN %d - job.start_time ELSE job.duration END)) / 3600) as int)`, time.Now().Unix()), + `CAST(SUM(job.num_nodes) as int)`, + fmt.Sprintf(`CAST(ROUND(SUM((CASE WHEN job.job_state = "running" THEN %d - job.start_time ELSE job.duration END) * job.num_nodes) / 3600) as int)`, time.Now().Unix()), + `CAST(SUM(job.num_hwthreads) as int)`, + fmt.Sprintf(`CAST(ROUND(SUM((CASE WHEN job.job_state = "running" THEN %d - job.start_time ELSE job.duration END) * job.num_hwthreads) / 3600) as int)`, time.Now().Unix()), + `CAST(SUM(job.num_acc) as int)`, + fmt.Sprintf(`CAST(ROUND(SUM((CASE WHEN job.job_state = "running" THEN %d - job.start_time ELSE job.duration END) * job.num_acc) / 3600) as int)`, time.Now().Unix()), ).From("job") } @@ -114,21 +186,25 @@ func (r *JobRepository) buildStatsQuery( return query } -func (r *JobRepository) getCastType() string { - var castType string - - switch r.driver { - case "sqlite3": - castType = "int" - case "mysql": - castType = "unsigned" - default: - castType = "" - } - - return castType -} - +// JobsStatsGrouped computes comprehensive job statistics grouped by a dimension (user, project, cluster, or subcluster). 
+// +// This is the primary method for generating aggregated statistics views in the UI, providing +// metrics like total jobs, walltime, and resource usage broken down by the specified grouping. +// +// Parameters: +// - ctx: Context for security checks and cancellation +// - filter: Filters to apply (time range, cluster, job state, etc.) +// - page: Optional pagination (ItemsPerPage: -1 disables pagination) +// - sortBy: Optional sort column (totalJobs, totalWalltime, totalCoreHours, etc.) +// - groupBy: Required grouping dimension (User, Project, Cluster, or SubCluster) +// +// Returns a slice of JobsStatistics, one per group, with: +// - ID: The group identifier (username, project name, cluster name, etc.) +// - Name: Display name (for users, from hpc_user.name; empty for other groups) +// - Statistics: totalJobs, totalUsers, totalWalltime, resource usage metrics +// +// Security: Respects user roles via SecurityCheck - users see only their own data unless admin/support. +// Performance: Results are sorted in SQL and pagination applied before scanning rows. func (r *JobRepository) JobsStatsGrouped( ctx context.Context, filter []*model.JobFilter, @@ -253,6 +329,21 @@ func (r *JobRepository) JobsStatsGrouped( return stats, nil } +// JobsStats computes overall job statistics across all matching jobs without grouping. +// +// This method provides a single aggregate view of job metrics, useful for dashboard +// summaries and overall system utilization reports. +// +// Parameters: +// - ctx: Context for security checks and cancellation +// - filter: Filters to apply (time range, cluster, job state, etc.) +// +// Returns a single-element slice containing aggregate statistics: +// - totalJobs, totalUsers, totalWalltime +// - totalNodeHours, totalCoreHours, totalAccHours +// +// Unlike JobsStatsGrouped, this returns overall totals without breaking down by dimension. +// Security checks are applied via SecurityCheck to respect user access levels. 
func (r *JobRepository) JobsStats( ctx context.Context, filter []*model.JobFilter, @@ -300,6 +391,15 @@ func (r *JobRepository) JobsStats( return stats, nil } +// LoadJobStat retrieves a specific statistic for a metric from a job's statistics. +// Returns 0.0 if the metric is not found or statType is invalid. +// +// Parameters: +// - job: Job struct with populated Statistics field +// - metric: Name of the metric to query (e.g., "cpu_load", "mem_used") +// - statType: Type of statistic: "avg", "min", or "max" +// +// Returns the requested statistic value or 0.0 if not found. func LoadJobStat(job *schema.Job, metric string, statType string) float64 { if stats, ok := job.Statistics[metric]; ok { switch statType { @@ -317,6 +417,17 @@ func LoadJobStat(job *schema.Job, metric string, statType string) float64 { return 0.0 } +// JobCountGrouped counts jobs grouped by a dimension without computing detailed statistics. +// +// This is a lightweight alternative to JobsStatsGrouped when only job counts are needed, +// avoiding the overhead of calculating walltime and resource usage metrics. +// +// Parameters: +// - ctx: Context for security checks +// - filter: Filters to apply +// - groupBy: Grouping dimension (User, Project, Cluster, or SubCluster) +// +// Returns JobsStatistics with only ID and TotalJobs populated for each group. func (r *JobRepository) JobCountGrouped( ctx context.Context, filter []*model.JobFilter, @@ -362,6 +473,20 @@ func (r *JobRepository) JobCountGrouped( return stats, nil } +// AddJobCountGrouped augments existing statistics with additional job counts by category. +// +// This method enriches JobsStatistics returned by JobsStatsGrouped or JobCountGrouped +// with counts of running or short-running (based on ShortRunningJobsDuration) jobs, matched by group ID. 
+// +// Parameters: +// - ctx: Context for security checks +// - filter: Filters to apply +// - groupBy: Grouping dimension (must match the dimension used for stats parameter) +// - stats: Existing statistics to augment (modified in-place by ID matching) +// - kind: "running" to add RunningJobs count, "short" to add ShortJobs count +// +// Returns the same stats slice with RunningJobs or ShortJobs fields populated per group. +// Groups without matching jobs will have 0 for the added field. func (r *JobRepository) AddJobCountGrouped( ctx context.Context, filter []*model.JobFilter, @@ -416,6 +541,18 @@ func (r *JobRepository) AddJobCountGrouped( return stats, nil } +// AddJobCount augments existing overall statistics with additional job counts by category. +// +// Similar to AddJobCountGrouped but for ungrouped statistics. Applies the same count +// to all statistics entries (typically just one). +// +// Parameters: +// - ctx: Context for security checks +// - filter: Filters to apply +// - stats: Existing statistics to augment (modified in-place) +// - kind: "running" to add RunningJobs count, "short" to add ShortJobs count +// +// Returns the same stats slice with RunningJobs or ShortJobs fields set to the total count. func (r *JobRepository) AddJobCount( ctx context.Context, filter []*model.JobFilter, @@ -451,6 +588,26 @@ func (r *JobRepository) AddJobCount( return stats, nil } +// AddHistograms augments statistics with distribution histograms for job properties. +// +// Generates histogram data for visualization of job duration, node count, core count, +// and accelerator count distributions. Duration histogram uses intelligent binning based +// on the requested resolution. 
+// +// Parameters: +// - ctx: Context for security checks +// - filter: Filters to apply to jobs included in histograms +// - stat: Statistics struct to augment (modified in-place) +// - durationBins: Bin size - "1m", "10m", "1h", "6h", "12h", or "24h" (default) +// +// Populates these fields in stat: +// - HistDuration: Job duration distribution (zero-padded bins) +// - HistNumNodes: Node count distribution +// - HistNumCores: Core (hwthread) count distribution +// - HistNumAccs: Accelerator count distribution +// +// Duration bins are pre-initialized with zeros to ensure consistent ranges for visualization. +// Bin size determines both the width and maximum duration displayed (e.g., "1h" = 48 bins × 1h = 48h max). func (r *JobRepository) AddHistograms( ctx context.Context, filter []*model.JobFilter, @@ -461,20 +618,20 @@ func (r *JobRepository) AddHistograms( var targetBinCount int var targetBinSize int - switch { - case *durationBins == "1m": // 1 Minute Bins + Max 60 Bins -> Max 60 Minutes + switch *durationBins { + case "1m": // 1 Minute Bins + Max 60 Bins -> Max 60 Minutes targetBinCount = 60 targetBinSize = 60 - case *durationBins == "10m": // 10 Minute Bins + Max 72 Bins -> Max 12 Hours + case "10m": // 10 Minute Bins + Max 72 Bins -> Max 12 Hours targetBinCount = 72 targetBinSize = 600 - case *durationBins == "1h": // 1 Hour Bins + Max 48 Bins -> Max 48 Hours + case "1h": // 1 Hour Bins + Max 48 Bins -> Max 48 Hours targetBinCount = 48 targetBinSize = 3600 - case *durationBins == "6h": // 6 Hour Bins + Max 12 Bins -> Max 3 Days + case "6h": // 6 Hour Bins + Max 12 Bins -> Max 3 Days targetBinCount = 12 targetBinSize = 21600 - case *durationBins == "12h": // 12 hour Bins + Max 14 Bins -> Max 7 Days + case "12h": // 12 hour Bins + Max 14 Bins -> Max 7 Days targetBinCount = 14 targetBinSize = 43200 default: // 24h @@ -482,10 +639,9 @@ func (r *JobRepository) AddHistograms( targetBinSize = 3600 } - castType := r.getCastType() var err error // Return X-Values 
always as seconds, will be formatted into minutes and hours in frontend - value := fmt.Sprintf(`CAST(ROUND(((CASE WHEN job.job_state = "running" THEN %d - job.start_time ELSE job.duration END) / %d) + 1) as %s) as value`, time.Now().Unix(), targetBinSize, castType) + value := fmt.Sprintf(`CAST(ROUND(((CASE WHEN job.job_state = "running" THEN %d - job.start_time ELSE job.duration END) / %d) + 1) as int) as value`, time.Now().Unix(), targetBinSize) stat.HistDuration, err = r.jobsDurationStatisticsHistogram(ctx, value, filter, targetBinSize, &targetBinCount) if err != nil { cclog.Warn("Error while loading job statistics histogram: job duration") @@ -514,7 +670,30 @@ func (r *JobRepository) AddHistograms( return stat, nil } -// Requires thresholds for metric from config for cluster? Of all clusters and use largest? split to 10 + 1 for artifacts? +// AddMetricHistograms augments statistics with distribution histograms for job metrics. +// +// Generates histogram data for metrics like CPU load, memory usage, etc. Handles running +// and completed jobs differently: running jobs load data from metric backend, completed jobs +// use footprint data from database. +// +// Parameters: +// - ctx: Context for security checks +// - filter: Filters to apply (MUST contain State filter for running jobs) +// - metrics: List of metric names to histogram (e.g., ["cpu_load", "mem_used"]) +// - stat: Statistics struct to augment (modified in-place) +// - targetBinCount: Number of histogram bins (default: 10) +// +// Populates HistMetrics field in stat with MetricHistoPoints for each metric. 
+// +// Binning algorithm: +// - Values normalized to metric's peak value from cluster configuration +// - Bins evenly distributed from 0 to peak +// - Pre-initialized with zeros for consistent visualization +// +// Limitations: +// - Running jobs: Limited to 5000 jobs for performance +// - Requires valid cluster configuration with metric peak values +// - Uses footprint statistic (avg/max/min) configured per metric func (r *JobRepository) AddMetricHistograms( ctx context.Context, filter []*model.JobFilter, @@ -549,7 +728,16 @@ func (r *JobRepository) AddMetricHistograms( return stat, nil } -// `value` must be the column grouped by, but renamed to "value" +// jobsStatisticsHistogram generates a simple histogram by grouping on a column value. +// +// Used for histograms where the column value directly represents the bin (e.g., node count, core count). +// Unlike duration/metric histograms, this doesn't pre-initialize bins with zeros. +// +// Parameters: +// - value: SQL expression that produces the histogram value, aliased as "value" +// - filters: Job filters to apply +// +// Returns histogram points with Value (from column) and Count (number of jobs). func (r *JobRepository) jobsStatisticsHistogram( ctx context.Context, value string, @@ -594,6 +782,26 @@ func (r *JobRepository) jobsStatisticsHistogram( return points, nil } +// jobsDurationStatisticsHistogram generates a duration histogram with pre-initialized bins. +// +// Bins are zero-padded to provide consistent ranges for visualization, unlike simple +// histograms which only return bins with data. The value parameter should compute +// the bin number from job duration. +// +// Parameters: +// - value: SQL expression computing bin number from duration, aliased as "value" +// - filters: Job filters to apply +// - binSizeSeconds: Width of each bin in seconds +// - targetBinCount: Number of bins to pre-initialize +// +// Returns histogram points with Value (bin_number × binSizeSeconds) and Count. 
+// All bins from 1 to targetBinCount are returned, with Count=0 for empty bins. +// +// Algorithm: +// 1. Pre-initialize targetBinCount bins with zero counts +// 2. Query database for actual counts per bin +// 3. Match query results to pre-initialized bins by value +// 4. Bins without matches remain at zero func (r *JobRepository) jobsDurationStatisticsHistogram( ctx context.Context, value string, @@ -609,7 +817,8 @@ func (r *JobRepository) jobsDurationStatisticsHistogram( return nil, qerr } - // Setup Array + // Each bin represents a duration range: bin N = [N*binSizeSeconds, (N+1)*binSizeSeconds) + // Example: binSizeSeconds=3600 (1 hour), bin 1 = 0-1h, bin 2 = 1-2h, etc. points := make([]*model.HistoPoint, 0) for i := 1; i <= *targetBinCount; i++ { point := model.HistoPoint{Value: i * binSizeSeconds, Count: 0} @@ -627,7 +836,8 @@ func (r *JobRepository) jobsDurationStatisticsHistogram( } defer rows.Close() - // Fill Array at matching $Value + // Match query results to pre-initialized bins. + // point.Value from query is the bin number; multiply by binSizeSeconds to match bin.Value. for rows.Next() { point := model.HistoPoint{} if err := rows.Scan(&point.Value, &point.Count); err != nil { @@ -637,9 +847,6 @@ func (r *JobRepository) jobsDurationStatisticsHistogram( for _, e := range points { if e.Value == (point.Value * binSizeSeconds) { - // Note: - // Matching on unmodified integer value (and multiplying point.Value by binSizeSeconds after match) - // causes frontend to loop into highest targetBinCount, due to zoom condition instantly being fullfilled (cause unknown) e.Count = point.Count break } @@ -654,18 +861,43 @@ func (r *JobRepository) jobsDurationStatisticsHistogram( return points, nil } +// jobsMetricStatisticsHistogram generates a metric histogram using footprint data from completed jobs. +// +// Values are normalized to the metric's peak value and distributed into bins. 
The algorithm +// is based on SQL histogram generation techniques, extracting metric values from JSON footprint +// and computing bin assignments in SQL. +// +// Parameters: +// - metric: Metric name (e.g., "cpu_load", "mem_used") +// - filters: Job filters to apply +// - bins: Number of bins to generate +// +// Returns MetricHistoPoints with metric name, unit, footprint stat type, and binned data. +// +// Algorithm: +// 1. Determine peak value from cluster configuration (filtered cluster or max across all) +// 2. Generate SQL that extracts footprint value, normalizes to [0,1], multiplies by bin count +// 3. Pre-initialize bins with min/max ranges based on peak value +// 4. Query database for counts per bin +// 5. Match results to pre-initialized bins +// +// Special handling: Values exactly equal to peak are forced into the last bin by multiplying +// peak by 0.999999999 to avoid creating an extra bin. func (r *JobRepository) jobsMetricStatisticsHistogram( ctx context.Context, metric string, filters []*model.JobFilter, bins *int, ) (*model.MetricHistoPoints, error) { - // Get specific Peak or largest Peak + // Peak value defines the upper bound for binning: values are distributed across + // bins from 0 to peak. First try to get peak from filtered cluster, otherwise + // scan all clusters to find the maximum peak value. 
var metricConfig *schema.MetricConfig var peak float64 var unit string var footprintStat string + // Try to get metric config from filtered cluster for _, f := range filters { if f.Cluster != nil { metricConfig = archive.GetMetricConfig(*f.Cluster.Eq, metric) @@ -676,6 +908,8 @@ func (r *JobRepository) jobsMetricStatisticsHistogram( } } + // If no cluster filter or peak not found, find largest peak across all clusters + // This ensures histogram can accommodate all possible values if peak == 0.0 { for _, c := range archive.Clusters { for _, m := range c.MetricConfig { @@ -694,11 +928,14 @@ func (r *JobRepository) jobsMetricStatisticsHistogram( } } - // cclog.Debugf("Metric %s, Peak %f, Unit %s", metric, peak, unit) - // Make bins, see https://jereze.com/code/sql-histogram/ (Modified here) + // Construct SQL histogram bins using normalized values. + // Algorithm based on: https://jereze.com/code/sql-histogram/ (modified) start := time.Now() - // Find Jobs' Value Bin Number: Divide Value by Peak, Multiply by RequestedBins, then CAST to INT: Gets Bin-Number of Job + // Bin calculation formula: + // bin_number = CAST( (value / peak) * num_bins AS INTEGER ) + 1 + // Special case: value == peak would create bin N+1, so we test for equality + // and multiply peak by 0.999999999 to force it into bin N. 
binQuery := fmt.Sprintf(`CAST( ((case when json_extract(footprint, "$.%s") = %f then %f*0.999999999 else json_extract(footprint, "$.%s") end) / %f) * %v as INTEGER )`, @@ -707,24 +944,19 @@ func (r *JobRepository) jobsMetricStatisticsHistogram( mainQuery := sq.Select( fmt.Sprintf(`%s + 1 as bin`, binQuery), `count(*) as count`, - // For Debug: // fmt.Sprintf(`CAST((%f / %d) as INTEGER ) * %s as min`, peak, *bins, binQuery), - // For Debug: // fmt.Sprintf(`CAST((%f / %d) as INTEGER ) * (%s + 1) as max`, peak, *bins, binQuery), ).From("job").Where( "JSON_VALID(footprint)", ).Where(fmt.Sprintf(`json_extract(footprint, "$.%s") is not null and json_extract(footprint, "$.%s") <= %f`, (metric + "_" + footprintStat), (metric + "_" + footprintStat), peak)) - // Only accessible Jobs... mainQuery, qerr := SecurityCheck(ctx, mainQuery) if qerr != nil { return nil, qerr } - // Filters... for _, f := range filters { mainQuery = BuildWhereClause(f, mainQuery) } - // Finalize query with Grouping and Ordering mainQuery = mainQuery.GroupBy("bin").OrderBy("bin") rows, err := mainQuery.RunWith(r.DB).Query() @@ -734,7 +966,8 @@ func (r *JobRepository) jobsMetricStatisticsHistogram( } defer rows.Close() - // Setup Return Array With Bin-Numbers for Match and Min/Max based on Peak + // Pre-initialize bins with calculated min/max ranges. + // Example: peak=1000, bins=10 -> bin 1=[0,100), bin 2=[100,200), ..., bin 10=[900,1000] points := make([]*model.MetricHistoPoint, 0) binStep := int(peak) / *bins for i := 1; i <= *bins; i++ { @@ -744,26 +977,18 @@ func (r *JobRepository) jobsMetricStatisticsHistogram( points = append(points, &epoint) } - for rows.Next() { // Fill Count if Bin-No. Matches (Not every Bin exists in DB!) + // Match query results to pre-initialized bins. 
+ for rows.Next() { rpoint := model.MetricHistoPoint{} - if err := rows.Scan(&rpoint.Bin, &rpoint.Count); err != nil { // Required for Debug: &rpoint.Min, &rpoint.Max + if err := rows.Scan(&rpoint.Bin, &rpoint.Count); err != nil { cclog.Warnf("Error while scanning rows for %s", metric) - return nil, err // FIXME: Totally bricks cc-backend if returned and if all metrics requested? + return nil, err } for _, e := range points { - if e.Bin != nil && rpoint.Bin != nil { - if *e.Bin == *rpoint.Bin { - e.Count = rpoint.Count - // Only Required For Debug: Check DB returned Min/Max against Backend Init above - // if rpoint.Min != nil { - // cclog.Warnf(">>>> Bin %d Min Set For %s to %d (Init'd with: %d)", *e.Bin, metric, *rpoint.Min, *e.Min) - // } - // if rpoint.Max != nil { - // cclog.Warnf(">>>> Bin %d Max Set For %s to %d (Init'd with: %d)", *e.Bin, metric, *rpoint.Max, *e.Max) - // } - break - } + if e.Bin != nil && rpoint.Bin != nil && *e.Bin == *rpoint.Bin { + e.Count = rpoint.Count + break } } } @@ -778,6 +1003,28 @@ func (r *JobRepository) jobsMetricStatisticsHistogram( return &result, nil } +// runningJobsMetricStatisticsHistogram generates metric histograms for running jobs using live data. +// +// Unlike completed jobs which use footprint data from the database, running jobs require +// fetching current metric averages from the metric backend (via metricdispatch). +// +// Parameters: +// - metrics: List of metric names +// - filters: Job filters (should filter to running jobs only) +// - bins: Number of histogram bins +// +// Returns slice of MetricHistoPoints, one per metric. +// +// Limitations: +// - Maximum 5000 jobs (returns nil if more jobs match) +// - Requires metric backend availability +// - Bins based on metric peak values from cluster configuration +// +// Algorithm: +// 1. Query first 5001 jobs to check count limit +// 2. Load metric averages for all jobs via metricdispatch +// 3. For each metric, create bins based on peak value +// 4. 
Iterate averages and count jobs per bin func (r *JobRepository) runningJobsMetricStatisticsHistogram( ctx context.Context, metrics []string, @@ -785,13 +1032,13 @@ func (r *JobRepository) runningJobsMetricStatisticsHistogram( bins *int, ) []*model.MetricHistoPoints { // Get Jobs - jobs, err := r.QueryJobs(ctx, filters, &model.PageRequest{Page: 1, ItemsPerPage: 500 + 1}, nil) + jobs, err := r.QueryJobs(ctx, filters, &model.PageRequest{Page: 1, ItemsPerPage: 5000 + 1}, nil) if err != nil { cclog.Errorf("Error while querying jobs for footprint: %s", err) return nil } - if len(jobs) > 500 { - cclog.Errorf("too many jobs matched (max: %d)", 500) + if len(jobs) > 5000 { + cclog.Errorf("too many jobs matched (max: %d)", 5000) return nil } @@ -806,7 +1053,7 @@ func (r *JobRepository) runningJobsMetricStatisticsHistogram( continue } - if err := metricDataDispatcher.LoadAverages(job, metrics, avgs, ctx); err != nil { + if err := metricdispatch.LoadAverages(job, metrics, avgs, ctx); err != nil { cclog.Errorf("Error while loading averages for histogram: %s", err) return nil } diff --git a/internal/repository/stats_test.go b/internal/repository/stats_test.go index e10c9685..a6c2da17 100644 --- a/internal/repository/stats_test.go +++ b/internal/repository/stats_test.go @@ -25,11 +25,14 @@ func TestBuildJobStatsQuery(t *testing.T) { func TestJobStats(t *testing.T) { r := setup(t) - filter := &model.JobFilter{} - stats, err := r.JobsStats(getContext(t), []*model.JobFilter{filter}) + var expectedCount int + err := r.DB.QueryRow(`SELECT COUNT(*) FROM job`).Scan(&expectedCount) noErr(t, err) - if stats[0].TotalJobs != 544 { - t.Fatalf("Want 544, Got %d", stats[0].TotalJobs) + stats, err := r.JobsStats(getContext(t), []*model.JobFilter{}) + noErr(t, err) + + if stats[0].TotalJobs != expectedCount { + t.Fatalf("Want %d, Got %d", expectedCount, stats[0].TotalJobs) } } diff --git a/internal/repository/tags.go b/internal/repository/tags.go index a7307302..39ccd90d 100644 --- 
a/internal/repository/tags.go +++ b/internal/repository/tags.go @@ -2,6 +2,35 @@ // All rights reserved. This file is part of cc-backend. // Use of this source code is governed by a MIT-style // license that can be found in the LICENSE file. + +// Package repository provides data access and persistence layer for ClusterCockpit. +// +// This file implements tag management functionality for job categorization and classification. +// Tags support both manual assignment (via REST/GraphQL APIs) and automatic detection +// (via tagger plugins). The implementation includes role-based access control through +// tag scopes and maintains bidirectional consistency between the SQL database and +// the file-based job archive. +// +// Database Schema: +// +// CREATE TABLE tag ( +// id INTEGER PRIMARY KEY AUTOINCREMENT, +// tag_type VARCHAR(255) NOT NULL, +// tag_name VARCHAR(255) NOT NULL, +// tag_scope VARCHAR(255) NOT NULL DEFAULT "global", +// CONSTRAINT tag_unique UNIQUE (tag_type, tag_name, tag_scope) +// ); +// +// CREATE TABLE jobtag ( +// job_id INTEGER, +// tag_id INTEGER, +// PRIMARY KEY (job_id, tag_id), +// FOREIGN KEY (job_id) REFERENCES job(id) ON DELETE CASCADE, +// FOREIGN KEY (tag_id) REFERENCES tag(id) ON DELETE CASCADE +// ); +// +// The jobtag junction table enables many-to-many relationships between jobs and tags. +// CASCADE deletion ensures referential integrity when jobs or tags are removed. package repository import ( @@ -10,15 +39,39 @@ import ( "strings" "github.com/ClusterCockpit/cc-backend/pkg/archive" - cclog "github.com/ClusterCockpit/cc-lib/ccLogger" - "github.com/ClusterCockpit/cc-lib/schema" + cclog "github.com/ClusterCockpit/cc-lib/v2/ccLogger" + "github.com/ClusterCockpit/cc-lib/v2/schema" sq "github.com/Masterminds/squirrel" ) +// Tag Scope Rules: +// +// Tags in ClusterCockpit have three visibility scopes that control who can see and use them: +// +// 1. 
"global" - Visible to all users, can be used by anyone +// Example: System-generated tags like "energy-efficient", "failed", "short" +// +// 2. "private" - Only visible to the creating user +// Example: Personal notes like "needs-review", "interesting-case" +// +// 3. "admin" - Only visible to users with admin or support roles +// Example: Internal notes like "hardware-issue", "billing-problem" +// +// Authorization Rules: +// - Regular users can only create/see "global" and their own "private" tags +// - Admin/Support can create/see all scopes including "admin" tags +// - Users can only add tags to jobs they have permission to view +// - Tag scope is enforced at query time in GetTags() and CountTags() + // AddTag adds the tag with id `tagId` to the job with the database id `jobId`. // Requires user authentication for security checks. +// +// The user must have permission to view the job. Tag visibility is determined by scope: +// - "global" tags: visible to all users +// - "private" tags: only visible to the tag creator +// - "admin" tags: only visible to admin/support users func (r *JobRepository) AddTag(user *schema.User, job int64, tag int64) ([]*schema.Tag, error) { - j, err := r.FindByIdWithUser(user, job) + j, err := r.FindByIDWithUser(user, job) if err != nil { cclog.Warnf("Error finding job %d for user %s: %v", job, user.Username, err) return nil, err @@ -32,7 +85,7 @@ func (r *JobRepository) AddTag(user *schema.User, job int64, tag int64) ([]*sche // AddTagDirect adds a tag without user security checks. // Use only for internal/admin operations. func (r *JobRepository) AddTagDirect(job int64, tag int64) ([]*schema.Tag, error) { - j, err := r.FindByIdDirect(job) + j, err := r.FindByIDDirect(job) if err != nil { cclog.Warnf("Error finding job %d: %v", job, err) return nil, err @@ -43,12 +96,12 @@ func (r *JobRepository) AddTagDirect(job int64, tag int64) ([]*schema.Tag, error }) } -// Removes a tag from a job by tag id. 
-// Used by GraphQL API +// RemoveTag removes the tag with the database id `tag` from the job with the database id `job`. +// Requires user authentication for security checks. Used by GraphQL API. func (r *JobRepository) RemoveTag(user *schema.User, job, tag int64) ([]*schema.Tag, error) { - j, err := r.FindByIdWithUser(user, job) + j, err := r.FindByIDWithUser(user, job) if err != nil { - cclog.Warn("Error while finding job by id") + cclog.Warnf("Error while finding job %d for user %s during tag removal: %v", job, user.Username, err) return nil, err } @@ -68,27 +121,27 @@ func (r *JobRepository) RemoveTag(user *schema.User, job, tag int64) ([]*schema. archiveTags, err := r.getArchiveTags(&job) if err != nil { - cclog.Warn("Error while getting tags for job") + cclog.Warnf("Error while getting archive tags for job %d in RemoveTag: %v", job, err) return nil, err } return tags, archive.UpdateTags(j, archiveTags) } -// Removes a tag from a job by tag info -// Used by REST API +// RemoveJobTagByRequest removes a tag from the job with the database id `job` by tag type, name, and scope. +// Requires user authentication for security checks. Used by REST API. 
func (r *JobRepository) RemoveJobTagByRequest(user *schema.User, job int64, tagType string, tagName string, tagScope string) ([]*schema.Tag, error) { // Get Tag ID to delete - tagID, exists := r.TagId(tagType, tagName, tagScope) + tagID, exists := r.TagID(tagType, tagName, tagScope) if !exists { cclog.Warnf("Tag does not exist (name, type, scope): %s, %s, %s", tagName, tagType, tagScope) return nil, fmt.Errorf("tag does not exist (name, type, scope): %s, %s, %s", tagName, tagType, tagScope) } // Get Job - j, err := r.FindByIdWithUser(user, job) + j, err := r.FindByIDWithUser(user, job) if err != nil { - cclog.Warn("Error while finding job by id") + cclog.Warnf("Error while finding job %d for user %s during tag removal by request: %v", job, user.Username, err) return nil, err } @@ -103,19 +156,30 @@ func (r *JobRepository) RemoveJobTagByRequest(user *schema.User, job int64, tagT tags, err := r.GetTags(user, &job) if err != nil { - cclog.Warn("Error while getting tags for job") + cclog.Warnf("Error while getting tags for job %d in RemoveJobTagByRequest: %v", job, err) return nil, err } archiveTags, err := r.getArchiveTags(&job) if err != nil { - cclog.Warn("Error while getting tags for job") + cclog.Warnf("Error while getting archive tags for job %d in RemoveJobTagByRequest: %v", job, err) return nil, err } return tags, archive.UpdateTags(j, archiveTags) } +// removeTagFromArchiveJobs updates the job archive for all affected jobs after a tag deletion. +// +// This function is called asynchronously (via goroutine) after removing a tag from the database +// to synchronize the file-based job archive with the database state. Errors are logged but not +// returned since this runs in the background. +// +// Parameters: +// - jobIds: Database IDs of all jobs that had the deleted tag +// +// Implementation note: Each job is processed individually to handle partial failures gracefully. +// If one job fails to update, others will still be processed. 
func (r *JobRepository) removeTagFromArchiveJobs(jobIds []int64) { for _, j := range jobIds { tags, err := r.getArchiveTags(&j) @@ -124,7 +188,7 @@ func (r *JobRepository) removeTagFromArchiveJobs(jobIds []int64) { continue } - job, err := r.FindByIdDirect(j) + job, err := r.FindByIDDirect(j) if err != nil { cclog.Warnf("Error while getting job %d", j) continue @@ -138,18 +202,18 @@ func (r *JobRepository) removeTagFromArchiveJobs(jobIds []int64) { // Used by REST API. Does not update tagged jobs in Job archive. func (r *JobRepository) RemoveTagByRequest(tagType string, tagName string, tagScope string) error { // Get Tag ID to delete - tagID, exists := r.TagId(tagType, tagName, tagScope) + tagID, exists := r.TagID(tagType, tagName, tagScope) if !exists { cclog.Warnf("Tag does not exist (name, type, scope): %s, %s, %s", tagName, tagType, tagScope) return fmt.Errorf("tag does not exist (name, type, scope): %s, %s, %s", tagName, tagType, tagScope) } - return r.RemoveTagById(tagID) + return r.RemoveTagByID(tagID) } // Removes a tag from db by tag id // Used by GraphQL API. -func (r *JobRepository) RemoveTagById(tagID int64) error { +func (r *JobRepository) RemoveTagByID(tagID int64) error { jobIds, err := r.FindJobIdsByTag(tagID) if err != nil { return err @@ -179,8 +243,16 @@ func (r *JobRepository) RemoveTagById(tagID int64) error { return nil } -// CreateTag creates a new tag with the specified type and name and returns its database id. -func (r *JobRepository) CreateTag(tagType string, tagName string, tagScope string) (tagId int64, err error) { +// CreateTag creates a new tag with the specified type, name, and scope. +// Returns the database ID of the newly created tag. +// +// Scope defaults to "global" if empty string is provided. 
+// Valid scopes: "global", "private", "admin" +// +// Example: +// +// tagID, err := repo.CreateTag("performance", "high-memory", "global") +func (r *JobRepository) CreateTag(tagType string, tagName string, tagScope string) (tagID int64, err error) { // Default to "Global" scope if none defined if tagScope == "" { tagScope = "global" @@ -198,8 +270,14 @@ func (r *JobRepository) CreateTag(tagType string, tagName string, tagScope strin return res.LastInsertId() } +// CountTags returns all tags visible to the user and the count of jobs for each tag. +// Applies scope-based filtering to respect tag visibility rules. +// +// Returns: +// - tags: slice of tags the user can see +// - counts: map of tag name to job count +// - err: any error encountered func (r *JobRepository) CountTags(user *schema.User) (tags []schema.Tag, counts map[string]int, err error) { - // Fetch all Tags in DB for Display in Frontend Tag-View tags = make([]schema.Tag, 0, 100) xrows, err := r.DB.Queryx("SELECT id, tag_type, tag_name, tag_scope FROM tag") if err != nil { @@ -228,10 +306,10 @@ func (r *JobRepository) CountTags(user *schema.User) (tags []schema.Tag, counts } // Query and Count Jobs with attached Tags - q := sq.Select("t.tag_name, t.id, count(jt.tag_id)"). + q := sq.Select("t.tag_type, t.tag_name, t.id, count(jt.tag_id)"). From("tag t"). LeftJoin("jobtag jt ON t.id = jt.tag_id"). 
- GroupBy("t.tag_name") + GroupBy("t.tag_type, t.tag_name") // Build scope list for filtering var scopeBuilder strings.Builder @@ -265,14 +343,15 @@ func (r *JobRepository) CountTags(user *schema.User) (tags []schema.Tag, counts counts = make(map[string]int) for rows.Next() { + var tagType string var tagName string - var tagId int + var tagID int var count int - if err = rows.Scan(&tagName, &tagId, &count); err != nil { + if err = rows.Scan(&tagType, &tagName, &tagID, &count); err != nil { return nil, nil, err } // Use tagId as second Map-Key component to differentiate tags with identical names - counts[fmt.Sprint(tagName, tagId)] = count + counts[fmt.Sprint(tagType, tagName, tagID)] = count } err = rows.Err() @@ -280,18 +359,44 @@ func (r *JobRepository) CountTags(user *schema.User) (tags []schema.Tag, counts } var ( - ErrTagNotFound = errors.New("the tag does not exist") - ErrJobNotOwned = errors.New("user is not owner of job") - ErrTagNoAccess = errors.New("user not permitted to use that tag") - ErrTagPrivateScope = errors.New("tag is private to another user") - ErrTagAdminScope = errors.New("tag requires admin privileges") + // ErrTagNotFound is returned when a tag ID or tag identifier (type, name, scope) does not exist in the database. + ErrTagNotFound = errors.New("the tag does not exist") + + // ErrJobNotOwned is returned when a user attempts to tag a job they do not have permission to access. + ErrJobNotOwned = errors.New("user is not owner of job") + + // ErrTagNoAccess is returned when a user attempts to use a tag they cannot access due to scope restrictions. + ErrTagNoAccess = errors.New("user not permitted to use that tag") + + // ErrTagPrivateScope is returned when a user attempts to access another user's private tag. + ErrTagPrivateScope = errors.New("tag is private to another user") + + // ErrTagAdminScope is returned when a non-admin user attempts to use an admin-scoped tag. 
+ ErrTagAdminScope = errors.New("tag requires admin privileges") + + // ErrTagsIncompatScopes is returned when attempting to combine admin and non-admin scoped tags in a single operation. ErrTagsIncompatScopes = errors.New("combining admin and non-admin scoped tags not allowed") ) // addJobTag is a helper function that inserts a job-tag association and updates the archive. -// Returns the updated tag list for the job. -func (r *JobRepository) addJobTag(jobId int64, tagId int64, job *schema.Job, getTags func() ([]*schema.Tag, error)) ([]*schema.Tag, error) { - q := sq.Insert("jobtag").Columns("job_id", "tag_id").Values(jobId, tagId) +// +// This function performs three operations atomically: +// 1. Inserts the job-tag association into the jobtag junction table +// 2. Retrieves the updated tag list for the job (using the provided getTags callback) +// 3. Updates the job archive with the new tags to maintain database-archive consistency +// +// Parameters: +// - jobId: Database ID of the job +// - tagId: Database ID of the tag to associate +// - job: Full job object needed for archive update +// - getTags: Callback function to retrieve updated tags (allows different security contexts) +// +// Returns the complete updated tag list for the job or an error. +// +// Note: This function does NOT validate tag scope permissions - callers must perform +// authorization checks before invoking this helper. 
+func (r *JobRepository) addJobTag(jobID int64, tagID int64, job *schema.Job, getTags func() ([]*schema.Tag, error)) ([]*schema.Tag, error) { + q := sq.Insert("jobtag").Columns("job_id", "tag_id").Values(jobID, tagID) if _, err := q.RunWith(r.stmtCache).Exec(); err != nil { s, _, _ := q.ToSql() @@ -301,13 +406,13 @@ func (r *JobRepository) addJobTag(jobId int64, tagId int64, job *schema.Job, get tags, err := getTags() if err != nil { - cclog.Warnf("Error getting tags for job %d: %v", jobId, err) + cclog.Warnf("Error getting tags for job %d: %v", jobID, err) return nil, err } - archiveTags, err := r.getArchiveTags(&jobId) + archiveTags, err := r.getArchiveTags(&jobID) if err != nil { - cclog.Warnf("Error getting archive tags for job %d: %v", jobId, err) + cclog.Warnf("Error getting archive tags for job %d: %v", jobID, err) return nil, err } @@ -316,7 +421,7 @@ func (r *JobRepository) addJobTag(jobId int64, tagId int64, job *schema.Job, get // AddTagOrCreate adds the tag with the specified type and name to the job with the database id `jobId`. // If such a tag does not yet exist, it is created. 
-func (r *JobRepository) AddTagOrCreate(user *schema.User, jobId int64, tagType string, tagName string, tagScope string) (tagId int64, err error) { +func (r *JobRepository) AddTagOrCreate(user *schema.User, jobID int64, tagType string, tagName string, tagScope string) (tagID int64, err error) { // Default to "Global" scope if none defined if tagScope == "" { tagScope = "global" @@ -330,44 +435,45 @@ func (r *JobRepository) AddTagOrCreate(user *schema.User, jobId int64, tagType s return 0, fmt.Errorf("cannot write tag scope with current authorization") } - tagId, exists := r.TagId(tagType, tagName, tagScope) + tagID, exists := r.TagID(tagType, tagName, tagScope) if !exists { - tagId, err = r.CreateTag(tagType, tagName, tagScope) + tagID, err = r.CreateTag(tagType, tagName, tagScope) if err != nil { return 0, err } } - if _, err := r.AddTag(user, jobId, tagId); err != nil { + if _, err := r.AddTag(user, jobID, tagID); err != nil { return 0, err } - return tagId, nil + return tagID, nil } -// used in auto tagger plugins -func (r *JobRepository) AddTagOrCreateDirect(jobId int64, tagType string, tagName string) (tagId int64, err error) { +func (r *JobRepository) AddTagOrCreateDirect(jobID int64, tagType string, tagName string) (tagID int64, err error) { tagScope := "global" - tagId, exists := r.TagId(tagType, tagName, tagScope) + tagID, exists := r.TagID(tagType, tagName, tagScope) if !exists { - tagId, err = r.CreateTag(tagType, tagName, tagScope) + tagID, err = r.CreateTag(tagType, tagName, tagScope) if err != nil { return 0, err } } - if _, err := r.AddTagDirect(jobId, tagId); err != nil { + cclog.Infof("Adding tag %s:%s:%s (direct)", tagType, tagName, tagScope) + + if _, err := r.AddTagDirect(jobID, tagID); err != nil { return 0, err } - return tagId, nil + return tagID, nil } -func (r *JobRepository) HasTag(jobId int64, tagType string, tagName string) bool { +func (r *JobRepository) HasTag(jobID int64, tagType string, tagName string) bool { var id int64 q := 
sq.Select("id").From("tag").Join("jobtag ON jobtag.tag_id = tag.id"). - Where("jobtag.job_id = ?", jobId).Where("tag.tag_type = ?", tagType). + Where("jobtag.job_id = ?", jobID).Where("tag.tag_type = ?", tagType). Where("tag.tag_name = ?", tagName) err := q.RunWith(r.stmtCache).QueryRow().Scan(&id) if err != nil { @@ -377,21 +483,21 @@ func (r *JobRepository) HasTag(jobId int64, tagType string, tagName string) bool } } -// TagId returns the database id of the tag with the specified type and name. -func (r *JobRepository) TagId(tagType string, tagName string, tagScope string) (tagId int64, exists bool) { +// TagID returns the database id of the tag with the specified type and name. +func (r *JobRepository) TagID(tagType string, tagName string, tagScope string) (tagID int64, exists bool) { exists = true if err := sq.Select("id").From("tag"). Where("tag.tag_type = ?", tagType).Where("tag.tag_name = ?", tagName).Where("tag.tag_scope = ?", tagScope). - RunWith(r.stmtCache).QueryRow().Scan(&tagId); err != nil { + RunWith(r.stmtCache).QueryRow().Scan(&tagID); err != nil { exists = false } return } // TagInfo returns the database infos of the tag with the specified id. -func (r *JobRepository) TagInfo(tagId int64) (tagType string, tagName string, tagScope string, exists bool) { +func (r *JobRepository) TagInfo(tagID int64) (tagType string, tagName string, tagScope string, exists bool) { exists = true - if err := sq.Select("tag.tag_type", "tag.tag_name", "tag.tag_scope").From("tag").Where("tag.id = ?", tagId). + if err := sq.Select("tag.tag_type", "tag.tag_name", "tag.tag_scope").From("tag").Where("tag.id = ?", tagID). 
RunWith(r.stmtCache).QueryRow().Scan(&tagType, &tagName, &tagScope); err != nil { exists = false } @@ -417,7 +523,7 @@ func (r *JobRepository) GetTags(user *schema.User, job *int64) ([]*schema.Tag, e for rows.Next() { tag := &schema.Tag{} if err := rows.Scan(&tag.ID, &tag.Type, &tag.Name, &tag.Scope); err != nil { - cclog.Warn("Error while scanning rows") + cclog.Warnf("Error while scanning tag rows in GetTags: %v", err) return nil, err } // Handle Scope Filtering: Tag Scope is Global, Private (== Username) or User is auth'd to view Admin Tags @@ -455,7 +561,7 @@ func (r *JobRepository) GetTagsDirect(job *int64) ([]*schema.Tag, error) { for rows.Next() { tag := &schema.Tag{} if err := rows.Scan(&tag.ID, &tag.Type, &tag.Name, &tag.Scope); err != nil { - cclog.Warn("Error while scanning rows") + cclog.Warnf("Error while scanning tag rows in GetTagsDirect: %v", err) return nil, err } tags = append(tags, tag) @@ -468,7 +574,18 @@ func (r *JobRepository) GetTagsDirect(job *int64) ([]*schema.Tag, error) { return tags, nil } -// GetArchiveTags returns a list of all tags *regardless of scope* for archiving if job is nil or of the tags that the job with that database ID has. +// getArchiveTags returns all tags for a job WITHOUT applying scope-based filtering. +// +// This internal function is used exclusively for job archive synchronization where we need +// to store all tags regardless of the current user's permissions. Unlike GetTags() which +// filters by scope, this returns the complete unfiltered tag list. +// +// Parameters: +// - job: Pointer to job database ID, or nil to return all tags in the system +// +// Returns all tags without scope filtering, used only for archive operations. +// +// WARNING: Do NOT expose this function to user-facing APIs as it bypasses authorization. 
func (r *JobRepository) getArchiveTags(job *int64) ([]*schema.Tag, error) { q := sq.Select("id", "tag_type", "tag_name", "tag_scope").From("tag") if job != nil { @@ -487,7 +604,7 @@ func (r *JobRepository) getArchiveTags(job *int64) ([]*schema.Tag, error) { for rows.Next() { tag := &schema.Tag{} if err := rows.Scan(&tag.ID, &tag.Type, &tag.Name, &tag.Scope); err != nil { - cclog.Warn("Error while scanning rows") + cclog.Warnf("Error while scanning tag rows in getArchiveTags: %v", err) return nil, err } tags = append(tags, tag) @@ -500,18 +617,18 @@ func (r *JobRepository) getArchiveTags(job *int64) ([]*schema.Tag, error) { return tags, nil } -func (r *JobRepository) ImportTag(jobId int64, tagType string, tagName string, tagScope string) (err error) { +func (r *JobRepository) ImportTag(jobID int64, tagType string, tagName string, tagScope string) (err error) { // Import has no scope ctx, only import from metafile to DB (No recursive archive update required), only returns err - tagId, exists := r.TagId(tagType, tagName, tagScope) + tagID, exists := r.TagID(tagType, tagName, tagScope) if !exists { - tagId, err = r.CreateTag(tagType, tagName, tagScope) + tagID, err = r.CreateTag(tagType, tagName, tagScope) if err != nil { return err } } - q := sq.Insert("jobtag").Columns("job_id", "tag_id").Values(jobId, tagId) + q := sq.Insert("jobtag").Columns("job_id", "tag_id").Values(jobID, tagID) if _, err := q.RunWith(r.stmtCache).Exec(); err != nil { s, _, _ := q.ToSql() @@ -522,16 +639,38 @@ func (r *JobRepository) ImportTag(jobId int64, tagType string, tagName string, t return nil } +// checkScopeAuth validates whether a user is authorized to perform an operation on a tag with the given scope. 
+// +// This function implements the tag scope authorization matrix: +// +// Scope | Read Access | Write Access +// -------------|----------------------------------|---------------------------------- +// "global" | All users | Admin, Support, API-only +// "admin" | Admin, Support | Admin, API-only +// <username> | Owner only | Owner only (private tags) +// +// Parameters: +// - user: User attempting the operation (must not be nil) +// - operation: Either "read" or "write" +// - scope: Tag scope value ("global", "admin", or username for private tags) +// +// Returns: +// - pass: true if authorized, false if denied +// - err: error only if operation is invalid or user is nil +// +// Special cases: +// - API-only users (single role: RoleApi) can write to admin and global scopes for automation +// - Private tags use the username as scope, granting exclusive access to that user func (r *JobRepository) checkScopeAuth(user *schema.User, operation string, scope string) (pass bool, err error) { if user != nil { switch { case operation == "write" && scope == "admin": - if user.HasRole(schema.RoleAdmin) || (len(user.Roles) == 1 && user.HasRole(schema.RoleApi)) { + if user.HasRole(schema.RoleAdmin) || (len(user.Roles) == 1 && user.HasRole(schema.RoleAPI)) { return true, nil } return false, nil case operation == "write" && scope == "global": - if user.HasAnyRole([]schema.Role{schema.RoleAdmin, schema.RoleSupport}) || (len(user.Roles) == 1 && user.HasRole(schema.RoleApi)) { + if user.HasAnyRole([]schema.Role{schema.RoleAdmin, schema.RoleSupport}) || (len(user.Roles) == 1 && user.HasRole(schema.RoleAPI)) { return true, nil } return false, nil diff --git a/internal/repository/testdata/job.db b/internal/repository/testdata/job.db index 5c5a6925..729cac96 100644 Binary files a/internal/repository/testdata/job.db and b/internal/repository/testdata/job.db differ diff --git a/internal/repository/transaction.go b/internal/repository/transaction.go index 9074428f..3ae0562d 100644 --- 
a/internal/repository/transaction.go +++ b/internal/repository/transaction.go @@ -62,7 +62,7 @@ func (r *JobRepository) TransactionEnd(t *Transaction) error { func (r *JobRepository) TransactionAddNamed( t *Transaction, query string, - args ...interface{}, + args ...any, ) (int64, error) { if t.tx == nil { return 0, fmt.Errorf("transaction is nil or already completed") @@ -82,7 +82,7 @@ func (r *JobRepository) TransactionAddNamed( } // TransactionAdd executes a query within the transaction. -func (r *JobRepository) TransactionAdd(t *Transaction, query string, args ...interface{}) (int64, error) { +func (r *JobRepository) TransactionAdd(t *Transaction, query string, args ...any) (int64, error) { if t.tx == nil { return 0, fmt.Errorf("transaction is nil or already completed") } diff --git a/internal/repository/transaction_test.go b/internal/repository/transaction_test.go new file mode 100644 index 00000000..777a2a45 --- /dev/null +++ b/internal/repository/transaction_test.go @@ -0,0 +1,311 @@ +// Copyright (C) NHR@FAU, University Erlangen-Nuremberg. +// All rights reserved. This file is part of cc-backend. +// Use of this source code is governed by a MIT-style +// license that can be found in the LICENSE file. 
+package repository + +import ( + "testing" + + _ "github.com/mattn/go-sqlite3" + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" +) + +func TestTransactionInit(t *testing.T) { + r := setup(t) + + t.Run("successful transaction init", func(t *testing.T) { + tx, err := r.TransactionInit() + require.NoError(t, err, "TransactionInit should succeed") + require.NotNil(t, tx, "Transaction should not be nil") + require.NotNil(t, tx.tx, "Transaction.tx should not be nil") + + // Clean up + err = tx.Rollback() + require.NoError(t, err, "Rollback should succeed") + }) +} + +func TestTransactionCommit(t *testing.T) { + r := setup(t) + + t.Run("commit after successful operations", func(t *testing.T) { + tx, err := r.TransactionInit() + require.NoError(t, err) + + // Insert a test tag + _, err = r.TransactionAdd(tx, "INSERT INTO tag (tag_type, tag_name, tag_scope) VALUES (?, ?, ?)", + "test_type", "test_tag_commit", "global") + require.NoError(t, err, "TransactionAdd should succeed") + + // Commit the transaction + err = tx.Commit() + require.NoError(t, err, "Commit should succeed") + + // Verify the tag was inserted + var count int + err = r.DB.QueryRow("SELECT COUNT(*) FROM tag WHERE tag_name = ?", "test_tag_commit").Scan(&count) + require.NoError(t, err) + assert.Equal(t, 1, count, "Tag should be committed to database") + + // Clean up + _, err = r.DB.Exec("DELETE FROM tag WHERE tag_name = ?", "test_tag_commit") + require.NoError(t, err) + }) + + t.Run("commit on already committed transaction", func(t *testing.T) { + tx, err := r.TransactionInit() + require.NoError(t, err) + + err = tx.Commit() + require.NoError(t, err, "First commit should succeed") + + err = tx.Commit() + assert.Error(t, err, "Second commit should fail") + assert.Contains(t, err.Error(), "transaction already committed or rolled back") + }) +} + +func TestTransactionRollback(t *testing.T) { + r := setup(t) + + t.Run("rollback after operations", func(t *testing.T) { + tx, err := 
r.TransactionInit() + require.NoError(t, err) + + // Insert a test tag + _, err = r.TransactionAdd(tx, "INSERT INTO tag (tag_type, tag_name, tag_scope) VALUES (?, ?, ?)", + "test_type", "test_tag_rollback", "global") + require.NoError(t, err, "TransactionAdd should succeed") + + // Rollback the transaction + err = tx.Rollback() + require.NoError(t, err, "Rollback should succeed") + + // Verify the tag was NOT inserted + var count int + err = r.DB.QueryRow("SELECT COUNT(*) FROM tag WHERE tag_name = ?", "test_tag_rollback").Scan(&count) + require.NoError(t, err) + assert.Equal(t, 0, count, "Tag should not be in database after rollback") + }) + + t.Run("rollback on already rolled back transaction", func(t *testing.T) { + tx, err := r.TransactionInit() + require.NoError(t, err) + + err = tx.Rollback() + require.NoError(t, err, "First rollback should succeed") + + err = tx.Rollback() + assert.NoError(t, err, "Second rollback should be safe (no-op)") + }) + + t.Run("rollback on committed transaction", func(t *testing.T) { + tx, err := r.TransactionInit() + require.NoError(t, err) + + err = tx.Commit() + require.NoError(t, err) + + err = tx.Rollback() + assert.NoError(t, err, "Rollback after commit should be safe (no-op)") + }) +} + +func TestTransactionAdd(t *testing.T) { + r := setup(t) + + t.Run("insert with TransactionAdd", func(t *testing.T) { + tx, err := r.TransactionInit() + require.NoError(t, err) + defer tx.Rollback() + + id, err := r.TransactionAdd(tx, "INSERT INTO tag (tag_type, tag_name, tag_scope) VALUES (?, ?, ?)", + "test_type", "test_add", "global") + require.NoError(t, err, "TransactionAdd should succeed") + assert.Greater(t, id, int64(0), "Should return valid insert ID") + }) + + t.Run("error on nil transaction", func(t *testing.T) { + tx := &Transaction{tx: nil} + + _, err := r.TransactionAdd(tx, "INSERT INTO tag (tag_type, tag_name, tag_scope) VALUES (?, ?, ?)", + "test_type", "test_nil", "global") + assert.Error(t, err, "Should error on nil 
transaction") + assert.Contains(t, err.Error(), "transaction is nil or already completed") + }) + + t.Run("error on invalid SQL", func(t *testing.T) { + tx, err := r.TransactionInit() + require.NoError(t, err) + defer tx.Rollback() + + _, err = r.TransactionAdd(tx, "INVALID SQL STATEMENT") + assert.Error(t, err, "Should error on invalid SQL") + }) + + t.Run("error after transaction committed", func(t *testing.T) { + tx, err := r.TransactionInit() + require.NoError(t, err) + + err = tx.Commit() + require.NoError(t, err) + + _, err = r.TransactionAdd(tx, "INSERT INTO tag (tag_type, tag_name, tag_scope) VALUES (?, ?, ?)", + "test_type", "test_after_commit", "global") + assert.Error(t, err, "Should error when transaction is already committed") + }) +} + +func TestTransactionAddNamed(t *testing.T) { + r := setup(t) + + t.Run("insert with TransactionAddNamed", func(t *testing.T) { + tx, err := r.TransactionInit() + require.NoError(t, err) + defer tx.Rollback() + + type TagArgs struct { + Type string `db:"type"` + Name string `db:"name"` + Scope string `db:"scope"` + } + + args := TagArgs{ + Type: "test_type", + Name: "test_named", + Scope: "global", + } + + id, err := r.TransactionAddNamed(tx, + "INSERT INTO tag (tag_type, tag_name, tag_scope) VALUES (:type, :name, :scope)", + args) + require.NoError(t, err, "TransactionAddNamed should succeed") + assert.Greater(t, id, int64(0), "Should return valid insert ID") + }) + + t.Run("error on nil transaction", func(t *testing.T) { + tx := &Transaction{tx: nil} + + _, err := r.TransactionAddNamed(tx, "INSERT INTO tag (tag_type, tag_name, tag_scope) VALUES (:type, :name, :scope)", + map[string]any{"type": "test", "name": "test", "scope": "global"}) + assert.Error(t, err, "Should error on nil transaction") + assert.Contains(t, err.Error(), "transaction is nil or already completed") + }) +} + +func TestTransactionMultipleOperations(t *testing.T) { + r := setup(t) + + t.Run("multiple inserts in single transaction", func(t 
*testing.T) { + tx, err := r.TransactionInit() + require.NoError(t, err) + defer tx.Rollback() + + // Insert multiple tags + for i := range 5 { + _, err = r.TransactionAdd(tx, + "INSERT INTO tag (tag_type, tag_name, tag_scope) VALUES (?, ?, ?)", + "test_type", "test_multi_"+string(rune('a'+i)), "global") + require.NoError(t, err, "Insert %d should succeed", i) + } + + err = tx.Commit() + require.NoError(t, err, "Commit should succeed") + + // Verify all tags were inserted + var count int + err = r.DB.QueryRow("SELECT COUNT(*) FROM tag WHERE tag_name LIKE 'test_multi_%'").Scan(&count) + require.NoError(t, err) + assert.Equal(t, 5, count, "All 5 tags should be committed") + + // Clean up + _, err = r.DB.Exec("DELETE FROM tag WHERE tag_name LIKE 'test_multi_%'") + require.NoError(t, err) + }) + + t.Run("rollback undoes all operations", func(t *testing.T) { + tx, err := r.TransactionInit() + require.NoError(t, err) + + // Insert multiple tags + for i := range 3 { + _, err = r.TransactionAdd(tx, + "INSERT INTO tag (tag_type, tag_name, tag_scope) VALUES (?, ?, ?)", + "test_type", "test_rollback_"+string(rune('a'+i)), "global") + require.NoError(t, err) + } + + err = tx.Rollback() + require.NoError(t, err, "Rollback should succeed") + + // Verify no tags were inserted + var count int + err = r.DB.QueryRow("SELECT COUNT(*) FROM tag WHERE tag_name LIKE 'test_rollback_%'").Scan(&count) + require.NoError(t, err) + assert.Equal(t, 0, count, "No tags should be in database after rollback") + }) +} + +func TestTransactionEnd(t *testing.T) { + r := setup(t) + + t.Run("deprecated TransactionEnd calls Commit", func(t *testing.T) { + tx, err := r.TransactionInit() + require.NoError(t, err) + + _, err = r.TransactionAdd(tx, "INSERT INTO tag (tag_type, tag_name, tag_scope) VALUES (?, ?, ?)", + "test_type", "test_end", "global") + require.NoError(t, err) + + // Use deprecated method + err = r.TransactionEnd(tx) + require.NoError(t, err, "TransactionEnd should succeed") + + // Verify the 
tag was committed + var count int + err = r.DB.QueryRow("SELECT COUNT(*) FROM tag WHERE tag_name = ?", "test_end").Scan(&count) + require.NoError(t, err) + assert.Equal(t, 1, count, "Tag should be committed") + + // Clean up + _, err = r.DB.Exec("DELETE FROM tag WHERE tag_name = ?", "test_end") + require.NoError(t, err) + }) +} + +func TestTransactionDeferPattern(t *testing.T) { + r := setup(t) + + t.Run("defer rollback pattern", func(t *testing.T) { + insertTag := func() error { + tx, err := r.TransactionInit() + if err != nil { + return err + } + defer tx.Rollback() // Safe to call even after commit + + _, err = r.TransactionAdd(tx, "INSERT INTO tag (tag_type, tag_name, tag_scope) VALUES (?, ?, ?)", + "test_type", "test_defer", "global") + if err != nil { + return err + } + + return tx.Commit() + } + + err := insertTag() + require.NoError(t, err, "Function should succeed") + + // Verify the tag was committed + var count int + err = r.DB.QueryRow("SELECT COUNT(*) FROM tag WHERE tag_name = ?", "test_defer").Scan(&count) + require.NoError(t, err) + assert.Equal(t, 1, count, "Tag should be committed despite defer rollback") + + // Clean up + _, err = r.DB.Exec("DELETE FROM tag WHERE tag_name = ?", "test_defer") + require.NoError(t, err) + }) +} diff --git a/internal/repository/user.go b/internal/repository/user.go index 5cab2b0d..38a4980b 100644 --- a/internal/repository/user.go +++ b/internal/repository/user.go @@ -10,18 +10,38 @@ import ( "encoding/json" "errors" "fmt" + "reflect" "strings" "sync" "github.com/ClusterCockpit/cc-backend/internal/config" "github.com/ClusterCockpit/cc-backend/internal/graph/model" - cclog "github.com/ClusterCockpit/cc-lib/ccLogger" - "github.com/ClusterCockpit/cc-lib/schema" + cclog "github.com/ClusterCockpit/cc-lib/v2/ccLogger" + "github.com/ClusterCockpit/cc-lib/v2/schema" sq "github.com/Masterminds/squirrel" "github.com/jmoiron/sqlx" "golang.org/x/crypto/bcrypt" ) +// Authentication and Role System: +// +// ClusterCockpit supports 
multiple authentication sources: +// - Local: Username/password stored in database (password hashed with bcrypt) +// - LDAP: External LDAP/Active Directory authentication +// - JWT: Token-based authentication for API access +// +// Role Hierarchy (from highest to lowest privilege): +// 1. "admin" - Full system access, can manage all users and jobs +// 2. "support" - Can view all jobs but limited management capabilities +// 3. "manager" - Can manage specific projects and their users +// 4. "api" - Programmatic access for job submission/management +// 5. "user" - Default role, can only view own jobs +// +// Project Association: +// - Managers have a list of projects they oversee +// - Regular users' project membership is determined by job data +// - Managers can view/manage all jobs within their projects + var ( userRepoOnce sync.Once userRepoInstance *UserRepository @@ -44,6 +64,9 @@ func GetUserRepository() *UserRepository { return userRepoInstance } +// GetUser retrieves a user by username from the database. +// Returns the complete user record including hashed password, roles, and projects. +// Password field contains bcrypt hash for local auth users, empty for LDAP users. func (r *UserRepository) GetUser(username string) (*schema.User, error) { user := &schema.User{Username: username} var hashedPassword, name, rawRoles, email, rawProjects sql.NullString @@ -93,12 +116,18 @@ func (r *UserRepository) GetLdapUsernames() ([]string, error) { return users, nil } +// AddUser creates a new user in the database. +// Passwords are automatically hashed with bcrypt before storage. +// Auth source determines authentication method (local, LDAP, etc.). 
+// +// Required fields: Username, Roles +// Optional fields: Name, Email, Password, Projects, AuthSource func (r *UserRepository) AddUser(user *schema.User) error { rolesJson, _ := json.Marshal(user.Roles) projectsJson, _ := json.Marshal(user.Projects) cols := []string{"username", "roles", "projects"} - vals := []interface{}{user.Username, string(rolesJson), string(projectsJson)} + vals := []any{user.Username, string(rolesJson), string(projectsJson)} if user.Name != "" { cols = append(cols, "name") @@ -159,8 +188,8 @@ func (r *UserRepository) AddUser(user *schema.User) error { } func (r *UserRepository) UpdateUser(dbUser *schema.User, user *schema.User) error { - // user contains updated info, apply to dbuser - // TODO: Discuss updatable fields + // user contains updated info -> Apply to dbUser + // --- Simple Name Update --- if dbUser.Name != user.Name { if _, err := sq.Update("hpc_user").Set("name", user.Name).Where("hpc_user.username = ?", dbUser.Username).RunWith(r.DB).Exec(); err != nil { cclog.Errorf("error while updating name of user '%s'", user.Username) @@ -168,13 +197,64 @@ func (r *UserRepository) UpdateUser(dbUser *schema.User, user *schema.User) erro } } - // Toggled until greenlit - // if dbUser.HasRole(schema.RoleManager) && !reflect.DeepEqual(dbUser.Projects, user.Projects) { - // projects, _ := json.Marshal(user.Projects) - // if _, err := sq.Update("hpc_user").Set("projects", projects).Where("hpc_user.username = ?", dbUser.Username).RunWith(r.DB).Exec(); err != nil { - // return err - // } - // } + // --- Def Helpers --- + // Helper to update roles + updateRoles := func(roles []string) error { + rolesJSON, _ := json.Marshal(roles) + _, err := sq.Update("hpc_user").Set("roles", rolesJSON).Where("hpc_user.username = ?", dbUser.Username).RunWith(r.DB).Exec() + return err + } + + // Helper to update projects + updateProjects := func(projects []string) error { + projectsJSON, _ := json.Marshal(projects) + _, err := 
sq.Update("hpc_user").Set("projects", projectsJSON).Where("hpc_user.username = ?", dbUser.Username).RunWith(r.DB).Exec() + return err + } + + // Helper to clear projects + clearProjects := func() error { + _, err := sq.Update("hpc_user").Set("projects", "[]").Where("hpc_user.username = ?", dbUser.Username).RunWith(r.DB).Exec() + return err + } + + // --- Manager Role Handling --- + if dbUser.HasRole(schema.RoleManager) && user.HasRole(schema.RoleManager) && !reflect.DeepEqual(dbUser.Projects, user.Projects) { + // Existing Manager: update projects + if err := updateProjects(user.Projects); err != nil { + return err + } + } else if dbUser.HasRole(schema.RoleUser) && user.HasRole(schema.RoleManager) && user.HasNotRoles([]schema.Role{schema.RoleAdmin}) { + // New Manager: update roles and projects + if err := updateRoles(user.Roles); err != nil { + return err + } + if err := updateProjects(user.Projects); err != nil { + return err + } + } else if dbUser.HasRole(schema.RoleManager) && user.HasNotRoles([]schema.Role{schema.RoleAdmin, schema.RoleManager}) { + // Remove Manager: update roles and clear projects + if err := updateRoles(user.Roles); err != nil { + return err + } + if err := clearProjects(); err != nil { + return err + } + } + + // --- Support Role Handling --- + if dbUser.HasRole(schema.RoleUser) && dbUser.HasNotRoles([]schema.Role{schema.RoleSupport}) && + user.HasRole(schema.RoleSupport) && user.HasNotRoles([]schema.Role{schema.RoleAdmin}) { + // New Support: update roles + if err := updateRoles(user.Roles); err != nil { + return err + } + } else if dbUser.HasRole(schema.RoleSupport) && user.HasNotRoles([]schema.Role{schema.RoleAdmin, schema.RoleSupport}) { + // Remove Support: update roles + if err := updateRoles(user.Roles); err != nil { + return err + } + } return nil } @@ -229,6 +309,14 @@ func (r *UserRepository) ListUsers(specialsOnly bool) ([]*schema.User, error) { return users, nil } +// AddRole adds a role to a user's role list. 
+// Role string is automatically lowercased. +// Valid roles: admin, support, manager, api, user +// +// Returns error if: +// - User doesn't exist +// - Role is invalid +// - User already has the role func (r *UserRepository) AddRole( ctx context.Context, username string, @@ -258,6 +346,11 @@ func (r *UserRepository) AddRole( return nil } +// RemoveRole removes a role from a user's role list. +// +// Special rules: +// - Cannot remove "manager" role while user has assigned projects +// - Must remove all projects first before removing manager role func (r *UserRepository) RemoveRole(ctx context.Context, username string, queryrole string) error { oldRole := strings.ToLower(queryrole) user, err := r.GetUser(username) @@ -294,6 +387,12 @@ func (r *UserRepository) RemoveRole(ctx context.Context, username string, queryr return nil } +// AddProject assigns a project to a manager user. +// Only users with the "manager" role can have assigned projects. +// +// Returns error if: +// - User doesn't have manager role +// - User already manages the project func (r *UserRepository) AddProject( ctx context.Context, username string, @@ -345,7 +444,7 @@ func (r *UserRepository) RemoveProject(ctx context.Context, username string, pro } if exists { - var result interface{} + var result any if len(newprojects) == 0 { result = "[]" } else { diff --git a/internal/repository/userConfig.go b/internal/repository/userConfig.go index beeffbf5..75e7119f 100644 --- a/internal/repository/userConfig.go +++ b/internal/repository/userConfig.go @@ -12,9 +12,9 @@ import ( "github.com/ClusterCockpit/cc-backend/internal/config" "github.com/ClusterCockpit/cc-backend/web" - cclog "github.com/ClusterCockpit/cc-lib/ccLogger" - "github.com/ClusterCockpit/cc-lib/lrucache" - "github.com/ClusterCockpit/cc-lib/schema" + cclog "github.com/ClusterCockpit/cc-lib/v2/ccLogger" + "github.com/ClusterCockpit/cc-lib/v2/lrucache" + "github.com/ClusterCockpit/cc-lib/v2/schema" "github.com/jmoiron/sqlx" ) diff --git 
a/internal/repository/userConfig_test.go b/internal/repository/userConfig_test.go index 0d6dc374..17ccbf78 100644 --- a/internal/repository/userConfig_test.go +++ b/internal/repository/userConfig_test.go @@ -10,9 +10,9 @@ import ( "testing" "github.com/ClusterCockpit/cc-backend/internal/config" - ccconf "github.com/ClusterCockpit/cc-lib/ccConfig" - cclog "github.com/ClusterCockpit/cc-lib/ccLogger" - "github.com/ClusterCockpit/cc-lib/schema" + ccconf "github.com/ClusterCockpit/cc-lib/v2/ccConfig" + cclog "github.com/ClusterCockpit/cc-lib/v2/ccLogger" + "github.com/ClusterCockpit/cc-lib/v2/schema" _ "github.com/mattn/go-sqlite3" ) @@ -20,33 +20,40 @@ func setupUserTest(t *testing.T) *UserCfgRepo { const testconfig = `{ "main": { "addr": "0.0.0.0:8080", - "apiAllowedIPs": [ + "api-allowed-ips": [ "*" ] }, "archive": { "kind": "file", "path": "./var/job-archive" - }, - "clusters": [ - { - "name": "testcluster", - "metricDataRepository": {"kind": "test", "url": "bla:8081"}, - "filterRanges": { - "numNodes": { "from": 1, "to": 64 }, - "duration": { "from": 0, "to": 86400 }, - "startTime": { "from": "2022-01-01T00:00:00Z", "to": null } } - }] }` cclog.Init("info", true) - dbfilepath := "testdata/job.db" - err := MigrateDB("sqlite3", dbfilepath) + + // Copy test DB to a temp file for test isolation + srcData, err := os.ReadFile("testdata/job.db") if err != nil { t.Fatal(err) } - Connect("sqlite3", dbfilepath) + dbfilepath := filepath.Join(t.TempDir(), "job.db") + if err := os.WriteFile(dbfilepath, srcData, 0o644); err != nil { + t.Fatal(err) + } + + if err := ResetConnection(); err != nil { + t.Fatal(err) + } + t.Cleanup(func() { + ResetConnection() + }) + + err = MigrateDB(dbfilepath) + if err != nil { + t.Fatal(err) + } + Connect(dbfilepath) tmpdir := t.TempDir() cfgFilePath := filepath.Join(tmpdir, "config.json") @@ -58,11 +65,7 @@ func setupUserTest(t *testing.T) *UserCfgRepo { // Load and check main configuration if cfg := ccconf.GetPackageConfig("main"); cfg != nil { 
- if clustercfg := ccconf.GetPackageConfig("clusters"); clustercfg != nil { - config.Init(cfg, clustercfg) - } else { - t.Fatal("Cluster configuration must be present") - } + config.Init(cfg) } else { t.Fatal("Main configuration must be present") } diff --git a/internal/repository/user_test.go b/internal/repository/user_test.go new file mode 100644 index 00000000..370d261d --- /dev/null +++ b/internal/repository/user_test.go @@ -0,0 +1,596 @@ +// Copyright (C) NHR@FAU, University Erlangen-Nuremberg. +// All rights reserved. This file is part of cc-backend. +// Use of this source code is governed by a MIT-style +// license that can be found in the LICENSE file. +package repository + +import ( + "context" + "testing" + + "github.com/ClusterCockpit/cc-lib/v2/schema" + _ "github.com/mattn/go-sqlite3" + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" + "golang.org/x/crypto/bcrypt" +) + +func TestAddUser(t *testing.T) { + _ = setup(t) + r := GetUserRepository() + + t.Run("add user with all fields", func(t *testing.T) { + user := &schema.User{ + Username: "testuser1", + Name: "Test User One", + Email: "test1@example.com", + Password: "testpassword123", + Roles: []string{"user"}, + Projects: []string{"project1", "project2"}, + AuthSource: schema.AuthViaLocalPassword, + } + + err := r.AddUser(user) + require.NoError(t, err) + + retrievedUser, err := r.GetUser("testuser1") + require.NoError(t, err) + assert.Equal(t, user.Username, retrievedUser.Username) + assert.Equal(t, user.Name, retrievedUser.Name) + assert.Equal(t, user.Email, retrievedUser.Email) + assert.Equal(t, user.Roles, retrievedUser.Roles) + assert.Equal(t, user.Projects, retrievedUser.Projects) + assert.NotEmpty(t, retrievedUser.Password) + err = bcrypt.CompareHashAndPassword([]byte(retrievedUser.Password), []byte("testpassword123")) + assert.NoError(t, err, "Password should be hashed correctly") + + err = r.DelUser("testuser1") + require.NoError(t, err) + }) + + t.Run("add user 
with minimal fields", func(t *testing.T) { + user := &schema.User{ + Username: "testuser2", + Roles: []string{"user"}, + Projects: []string{}, + AuthSource: schema.AuthViaLDAP, + } + + err := r.AddUser(user) + require.NoError(t, err) + + retrievedUser, err := r.GetUser("testuser2") + require.NoError(t, err) + assert.Equal(t, user.Username, retrievedUser.Username) + assert.Equal(t, "", retrievedUser.Name) + assert.Equal(t, "", retrievedUser.Email) + assert.Equal(t, "", retrievedUser.Password) + + err = r.DelUser("testuser2") + require.NoError(t, err) + }) + + t.Run("add duplicate user fails", func(t *testing.T) { + user := &schema.User{ + Username: "testuser3", + Roles: []string{"user"}, + Projects: []string{}, + AuthSource: schema.AuthViaLocalPassword, + } + + err := r.AddUser(user) + require.NoError(t, err) + + err = r.AddUser(user) + assert.Error(t, err, "Adding duplicate user should fail") + + err = r.DelUser("testuser3") + require.NoError(t, err) + }) +} + +func TestGetUser(t *testing.T) { + _ = setup(t) + r := GetUserRepository() + + t.Run("get existing user", func(t *testing.T) { + user := &schema.User{ + Username: "getuser1", + Name: "Get User", + Email: "getuser@example.com", + Roles: []string{"user", "admin"}, + Projects: []string{"proj1"}, + AuthSource: schema.AuthViaLocalPassword, + } + + err := r.AddUser(user) + require.NoError(t, err) + + retrieved, err := r.GetUser("getuser1") + require.NoError(t, err) + assert.Equal(t, user.Username, retrieved.Username) + assert.Equal(t, user.Name, retrieved.Name) + assert.Equal(t, user.Email, retrieved.Email) + assert.ElementsMatch(t, user.Roles, retrieved.Roles) + assert.ElementsMatch(t, user.Projects, retrieved.Projects) + + err = r.DelUser("getuser1") + require.NoError(t, err) + }) + + t.Run("get non-existent user", func(t *testing.T) { + _, err := r.GetUser("nonexistent") + assert.Error(t, err) + }) +} + +func TestUpdateUser(t *testing.T) { + _ = setup(t) + r := GetUserRepository() + + t.Run("update user name", 
func(t *testing.T) { + user := &schema.User{ + Username: "updateuser1", + Name: "Original Name", + Roles: []string{"user"}, + Projects: []string{}, + AuthSource: schema.AuthViaLocalPassword, + } + + err := r.AddUser(user) + require.NoError(t, err) + + dbUser, err := r.GetUser("updateuser1") + require.NoError(t, err) + + updatedUser := &schema.User{ + Username: "updateuser1", + Name: "Updated Name", + } + + err = r.UpdateUser(dbUser, updatedUser) + require.NoError(t, err) + + retrieved, err := r.GetUser("updateuser1") + require.NoError(t, err) + assert.Equal(t, "Updated Name", retrieved.Name) + + err = r.DelUser("updateuser1") + require.NoError(t, err) + }) + + t.Run("update with no changes", func(t *testing.T) { + user := &schema.User{ + Username: "updateuser2", + Name: "Same Name", + Roles: []string{"user"}, + Projects: []string{}, + AuthSource: schema.AuthViaLocalPassword, + } + + err := r.AddUser(user) + require.NoError(t, err) + + dbUser, err := r.GetUser("updateuser2") + require.NoError(t, err) + + err = r.UpdateUser(dbUser, dbUser) + assert.NoError(t, err) + + err = r.DelUser("updateuser2") + require.NoError(t, err) + }) +} + +func TestDelUser(t *testing.T) { + _ = setup(t) + r := GetUserRepository() + + t.Run("delete existing user", func(t *testing.T) { + user := &schema.User{ + Username: "deluser1", + Roles: []string{"user"}, + Projects: []string{}, + AuthSource: schema.AuthViaLocalPassword, + } + + err := r.AddUser(user) + require.NoError(t, err) + + err = r.DelUser("deluser1") + require.NoError(t, err) + + _, err = r.GetUser("deluser1") + assert.Error(t, err, "User should not exist after deletion") + }) + + t.Run("delete non-existent user", func(t *testing.T) { + err := r.DelUser("nonexistent") + assert.NoError(t, err, "Deleting non-existent user should not error") + }) +} + +func TestListUsers(t *testing.T) { + _ = setup(t) + r := GetUserRepository() + + user1 := &schema.User{ + Username: "listuser1", + Roles: []string{"user"}, + Projects: []string{}, + 
AuthSource: schema.AuthViaLocalPassword, + } + user2 := &schema.User{ + Username: "listuser2", + Roles: []string{"admin"}, + Projects: []string{}, + AuthSource: schema.AuthViaLocalPassword, + } + user3 := &schema.User{ + Username: "listuser3", + Roles: []string{"manager"}, + Projects: []string{"proj1"}, + AuthSource: schema.AuthViaLocalPassword, + } + + err := r.AddUser(user1) + require.NoError(t, err) + err = r.AddUser(user2) + require.NoError(t, err) + err = r.AddUser(user3) + require.NoError(t, err) + + t.Run("list all users", func(t *testing.T) { + users, err := r.ListUsers(false) + require.NoError(t, err) + assert.GreaterOrEqual(t, len(users), 3) + + usernames := make([]string, len(users)) + for i, u := range users { + usernames[i] = u.Username + } + assert.Contains(t, usernames, "listuser1") + assert.Contains(t, usernames, "listuser2") + assert.Contains(t, usernames, "listuser3") + }) + + t.Run("list special users only", func(t *testing.T) { + users, err := r.ListUsers(true) + require.NoError(t, err) + + usernames := make([]string, len(users)) + for i, u := range users { + usernames[i] = u.Username + } + assert.Contains(t, usernames, "listuser2") + assert.Contains(t, usernames, "listuser3") + }) + + err = r.DelUser("listuser1") + require.NoError(t, err) + err = r.DelUser("listuser2") + require.NoError(t, err) + err = r.DelUser("listuser3") + require.NoError(t, err) +} + +func TestGetLdapUsernames(t *testing.T) { + _ = setup(t) + r := GetUserRepository() + + ldapUser := &schema.User{ + Username: "ldapuser1", + Roles: []string{"user"}, + Projects: []string{}, + AuthSource: schema.AuthViaLDAP, + } + localUser := &schema.User{ + Username: "localuser1", + Roles: []string{"user"}, + Projects: []string{}, + AuthSource: schema.AuthViaLocalPassword, + } + + err := r.AddUser(ldapUser) + require.NoError(t, err) + err = r.AddUser(localUser) + require.NoError(t, err) + + usernames, err := r.GetLdapUsernames() + require.NoError(t, err) + assert.Contains(t, usernames, 
"ldapuser1") + assert.NotContains(t, usernames, "localuser1") + + err = r.DelUser("ldapuser1") + require.NoError(t, err) + err = r.DelUser("localuser1") + require.NoError(t, err) +} + +func TestAddRole(t *testing.T) { + _ = setup(t) + r := GetUserRepository() + ctx := context.Background() + + t.Run("add valid role", func(t *testing.T) { + user := &schema.User{ + Username: "roleuser1", + Roles: []string{"user"}, + Projects: []string{}, + AuthSource: schema.AuthViaLocalPassword, + } + + err := r.AddUser(user) + require.NoError(t, err) + + err = r.AddRole(ctx, "roleuser1", "admin") + require.NoError(t, err) + + retrieved, err := r.GetUser("roleuser1") + require.NoError(t, err) + assert.Contains(t, retrieved.Roles, "admin") + assert.Contains(t, retrieved.Roles, "user") + + err = r.DelUser("roleuser1") + require.NoError(t, err) + }) + + t.Run("add duplicate role", func(t *testing.T) { + user := &schema.User{ + Username: "roleuser2", + Roles: []string{"user"}, + Projects: []string{}, + AuthSource: schema.AuthViaLocalPassword, + } + + err := r.AddUser(user) + require.NoError(t, err) + + err = r.AddRole(ctx, "roleuser2", "user") + assert.Error(t, err, "Adding duplicate role should fail") + assert.Contains(t, err.Error(), "already has role") + + err = r.DelUser("roleuser2") + require.NoError(t, err) + }) + + t.Run("add invalid role", func(t *testing.T) { + user := &schema.User{ + Username: "roleuser3", + Roles: []string{"user"}, + Projects: []string{}, + AuthSource: schema.AuthViaLocalPassword, + } + + err := r.AddUser(user) + require.NoError(t, err) + + err = r.AddRole(ctx, "roleuser3", "invalidrole") + assert.Error(t, err, "Adding invalid role should fail") + assert.Contains(t, err.Error(), "no valid option") + + err = r.DelUser("roleuser3") + require.NoError(t, err) + }) +} + +func TestRemoveRole(t *testing.T) { + _ = setup(t) + r := GetUserRepository() + ctx := context.Background() + + t.Run("remove existing role", func(t *testing.T) { + user := &schema.User{ + 
Username: "rmroleuser1", + Roles: []string{"user", "admin"}, + Projects: []string{}, + AuthSource: schema.AuthViaLocalPassword, + } + + err := r.AddUser(user) + require.NoError(t, err) + + err = r.RemoveRole(ctx, "rmroleuser1", "admin") + require.NoError(t, err) + + retrieved, err := r.GetUser("rmroleuser1") + require.NoError(t, err) + assert.NotContains(t, retrieved.Roles, "admin") + assert.Contains(t, retrieved.Roles, "user") + + err = r.DelUser("rmroleuser1") + require.NoError(t, err) + }) + + t.Run("remove non-existent role", func(t *testing.T) { + user := &schema.User{ + Username: "rmroleuser2", + Roles: []string{"user"}, + Projects: []string{}, + AuthSource: schema.AuthViaLocalPassword, + } + + err := r.AddUser(user) + require.NoError(t, err) + + err = r.RemoveRole(ctx, "rmroleuser2", "admin") + assert.Error(t, err, "Removing non-existent role should fail") + assert.Contains(t, err.Error(), "already deleted") + + err = r.DelUser("rmroleuser2") + require.NoError(t, err) + }) + + t.Run("remove manager role with projects", func(t *testing.T) { + user := &schema.User{ + Username: "rmroleuser3", + Roles: []string{"manager"}, + Projects: []string{"proj1", "proj2"}, + AuthSource: schema.AuthViaLocalPassword, + } + + err := r.AddUser(user) + require.NoError(t, err) + + err = r.RemoveRole(ctx, "rmroleuser3", "manager") + assert.Error(t, err, "Removing manager role with projects should fail") + assert.Contains(t, err.Error(), "still has assigned project") + + err = r.DelUser("rmroleuser3") + require.NoError(t, err) + }) +} + +func TestAddProject(t *testing.T) { + _ = setup(t) + r := GetUserRepository() + ctx := context.Background() + + t.Run("add project to manager", func(t *testing.T) { + user := &schema.User{ + Username: "projuser1", + Roles: []string{"manager"}, + Projects: []string{}, + AuthSource: schema.AuthViaLocalPassword, + } + + err := r.AddUser(user) + require.NoError(t, err) + + err = r.AddProject(ctx, "projuser1", "newproject") + require.NoError(t, err) + 
+ retrieved, err := r.GetUser("projuser1") + require.NoError(t, err) + assert.Contains(t, retrieved.Projects, "newproject") + + err = r.DelUser("projuser1") + require.NoError(t, err) + }) + + t.Run("add project to non-manager", func(t *testing.T) { + user := &schema.User{ + Username: "projuser2", + Roles: []string{"user"}, + Projects: []string{}, + AuthSource: schema.AuthViaLocalPassword, + } + + err := r.AddUser(user) + require.NoError(t, err) + + err = r.AddProject(ctx, "projuser2", "newproject") + assert.Error(t, err, "Adding project to non-manager should fail") + assert.Contains(t, err.Error(), "not a manager") + + err = r.DelUser("projuser2") + require.NoError(t, err) + }) + + t.Run("add duplicate project", func(t *testing.T) { + user := &schema.User{ + Username: "projuser3", + Roles: []string{"manager"}, + Projects: []string{"existingproject"}, + AuthSource: schema.AuthViaLocalPassword, + } + + err := r.AddUser(user) + require.NoError(t, err) + + err = r.AddProject(ctx, "projuser3", "existingproject") + assert.Error(t, err, "Adding duplicate project should fail") + assert.Contains(t, err.Error(), "already manages") + + err = r.DelUser("projuser3") + require.NoError(t, err) + }) +} + +func TestRemoveProject(t *testing.T) { + _ = setup(t) + r := GetUserRepository() + ctx := context.Background() + + t.Run("remove existing project", func(t *testing.T) { + user := &schema.User{ + Username: "rmprojuser1", + Roles: []string{"manager"}, + Projects: []string{"proj1", "proj2"}, + AuthSource: schema.AuthViaLocalPassword, + } + + err := r.AddUser(user) + require.NoError(t, err) + + err = r.RemoveProject(ctx, "rmprojuser1", "proj1") + require.NoError(t, err) + + retrieved, err := r.GetUser("rmprojuser1") + require.NoError(t, err) + assert.NotContains(t, retrieved.Projects, "proj1") + assert.Contains(t, retrieved.Projects, "proj2") + + err = r.DelUser("rmprojuser1") + require.NoError(t, err) + }) + + t.Run("remove non-existent project", func(t *testing.T) { + user := 
&schema.User{ + Username: "rmprojuser2", + Roles: []string{"manager"}, + Projects: []string{"proj1"}, + AuthSource: schema.AuthViaLocalPassword, + } + + err := r.AddUser(user) + require.NoError(t, err) + + err = r.RemoveProject(ctx, "rmprojuser2", "nonexistent") + assert.Error(t, err, "Removing non-existent project should fail") + + err = r.DelUser("rmprojuser2") + require.NoError(t, err) + }) + + t.Run("remove project from non-manager", func(t *testing.T) { + user := &schema.User{ + Username: "rmprojuser3", + Roles: []string{"user"}, + Projects: []string{}, + AuthSource: schema.AuthViaLocalPassword, + } + + err := r.AddUser(user) + require.NoError(t, err) + + err = r.RemoveProject(ctx, "rmprojuser3", "proj1") + assert.Error(t, err, "Removing project from non-manager should fail") + assert.Contains(t, err.Error(), "not a manager") + + err = r.DelUser("rmprojuser3") + require.NoError(t, err) + }) +} + +func TestGetUserFromContext(t *testing.T) { + t.Run("get user from context", func(t *testing.T) { + user := &schema.User{ + Username: "contextuser", + Roles: []string{"user"}, + } + + ctx := context.WithValue(context.Background(), ContextUserKey, user) + retrieved := GetUserFromContext(ctx) + + require.NotNil(t, retrieved) + assert.Equal(t, user.Username, retrieved.Username) + }) + + t.Run("get user from empty context", func(t *testing.T) { + ctx := context.Background() + retrieved := GetUserFromContext(ctx) + + assert.Nil(t, retrieved) + }) +} diff --git a/internal/routerConfig/routes.go b/internal/routerConfig/routes.go index 9c19de52..e24038e2 100644 --- a/internal/routerConfig/routes.go +++ b/internal/routerConfig/routes.go @@ -17,13 +17,13 @@ import ( "github.com/ClusterCockpit/cc-backend/internal/graph/model" "github.com/ClusterCockpit/cc-backend/internal/repository" "github.com/ClusterCockpit/cc-backend/web" - cclog "github.com/ClusterCockpit/cc-lib/ccLogger" - "github.com/ClusterCockpit/cc-lib/schema" - "github.com/ClusterCockpit/cc-lib/util" - 
"github.com/gorilla/mux" + cclog "github.com/ClusterCockpit/cc-lib/v2/ccLogger" + "github.com/ClusterCockpit/cc-lib/v2/schema" + "github.com/ClusterCockpit/cc-lib/v2/util" + "github.com/go-chi/chi/v5" ) -type InfoType map[string]interface{} +type InfoType map[string]any type Route struct { Route string @@ -47,7 +47,10 @@ var routes []Route = []Route{ {"/monitoring/systems/list/{cluster}/{subcluster}", "monitoring/systems.tmpl", "Cluster Node List - ClusterCockpit", false, setupClusterListRoute}, {"/monitoring/node/{cluster}/{hostname}", "monitoring/node.tmpl", "Node - ClusterCockpit", false, setupNodeRoute}, {"/monitoring/analysis/{cluster}", "monitoring/analysis.tmpl", "Analysis - ClusterCockpit", true, setupAnalysisRoute}, - {"/monitoring/status/{cluster}", "monitoring/status.tmpl", "Status of - ClusterCockpit", false, setupClusterStatusRoute}, + {"/monitoring/status/{cluster}", "monitoring/status.tmpl", " Dashboard - ClusterCockpit", false, setupClusterStatusRoute}, + {"/monitoring/status/detail/{cluster}", "monitoring/status.tmpl", "Status of - ClusterCockpit", false, setupClusterDetailRoute}, + {"/monitoring/dashboard/{cluster}", "monitoring/dashboard.tmpl", " Dashboard - ClusterCockpit", false, setupDashboardRoute}, + {"/monitoring/logs", "monitoring/logs.tmpl", "Logs - ClusterCockpit", false, func(i InfoType, r *http.Request) InfoType { return i }}, } func setupHomeRoute(i InfoType, r *http.Request) InfoType { @@ -94,7 +97,7 @@ func setupConfigRoute(i InfoType, r *http.Request) InfoType { } func setupJobRoute(i InfoType, r *http.Request) InfoType { - i["id"] = mux.Vars(r)["id"] + i["id"] = chi.URLParam(r, "id") if config.Keys.EmissionConstant != 0 { i["emission"] = config.Keys.EmissionConstant } @@ -102,7 +105,7 @@ func setupJobRoute(i InfoType, r *http.Request) InfoType { } func setupUserRoute(i InfoType, r *http.Request) InfoType { - username := mux.Vars(r)["id"] + username := chi.URLParam(r, "id") i["id"] = username i["username"] = username // TODO: If 
forbidden (== err exists), redirect to error page @@ -114,21 +117,33 @@ func setupUserRoute(i InfoType, r *http.Request) InfoType { } func setupClusterStatusRoute(i InfoType, r *http.Request) InfoType { - vars := mux.Vars(r) - i["id"] = vars["cluster"] - i["cluster"] = vars["cluster"] - from, to := r.URL.Query().Get("from"), r.URL.Query().Get("to") - if from != "" || to != "" { - i["from"] = from - i["to"] = to - } + cluster := chi.URLParam(r, "cluster") + i["id"] = cluster + i["cluster"] = cluster + i["displayType"] = "DASHBOARD" + return i +} + +func setupClusterDetailRoute(i InfoType, r *http.Request) InfoType { + cluster := chi.URLParam(r, "cluster") + i["id"] = cluster + i["cluster"] = cluster + i["displayType"] = "DETAILS" + return i +} + +func setupDashboardRoute(i InfoType, r *http.Request) InfoType { + cluster := chi.URLParam(r, "cluster") + i["id"] = cluster + i["cluster"] = cluster + i["displayType"] = "PUBLIC" // Used in Main Template return i } func setupClusterOverviewRoute(i InfoType, r *http.Request) InfoType { - vars := mux.Vars(r) - i["id"] = vars["cluster"] - i["cluster"] = vars["cluster"] + cluster := chi.URLParam(r, "cluster") + i["id"] = cluster + i["cluster"] = cluster i["displayType"] = "OVERVIEW" from, to := r.URL.Query().Get("from"), r.URL.Query().Get("to") @@ -140,11 +155,12 @@ func setupClusterOverviewRoute(i InfoType, r *http.Request) InfoType { } func setupClusterListRoute(i InfoType, r *http.Request) InfoType { - vars := mux.Vars(r) - i["id"] = vars["cluster"] - i["cluster"] = vars["cluster"] - i["sid"] = vars["subcluster"] - i["subCluster"] = vars["subcluster"] + cluster := chi.URLParam(r, "cluster") + subcluster := chi.URLParam(r, "subcluster") + i["id"] = cluster + i["cluster"] = cluster + i["sid"] = subcluster + i["subCluster"] = subcluster i["displayType"] = "LIST" from, to := r.URL.Query().Get("from"), r.URL.Query().Get("to") @@ -156,10 +172,11 @@ func setupClusterListRoute(i InfoType, r *http.Request) InfoType { } func 
setupNodeRoute(i InfoType, r *http.Request) InfoType { - vars := mux.Vars(r) - i["cluster"] = vars["cluster"] - i["hostname"] = vars["hostname"] - i["id"] = fmt.Sprintf("%s (%s)", vars["cluster"], vars["hostname"]) + cluster := chi.URLParam(r, "cluster") + hostname := chi.URLParam(r, "hostname") + i["cluster"] = cluster + i["hostname"] = hostname + i["id"] = fmt.Sprintf("%s (%s)", cluster, hostname) from, to := r.URL.Query().Get("from"), r.URL.Query().Get("to") if from != "" && to != "" { i["from"] = from @@ -169,14 +186,14 @@ func setupNodeRoute(i InfoType, r *http.Request) InfoType { } func setupAnalysisRoute(i InfoType, r *http.Request) InfoType { - i["cluster"] = mux.Vars(r)["cluster"] + i["cluster"] = chi.URLParam(r, "cluster") return i } func setupTaglistRoute(i InfoType, r *http.Request) InfoType { jobRepo := repository.GetJobRepository() tags, counts, err := jobRepo.CountTags(repository.GetUserFromContext(r.Context())) - tagMap := make(map[string][]map[string]interface{}) + tagMap := make(map[string][]map[string]any) if err != nil { cclog.Warnf("GetTags failed: %s", err.Error()) i["tagmap"] = tagMap @@ -187,19 +204,19 @@ func setupTaglistRoute(i InfoType, r *http.Request) InfoType { // Uses tag.ID as second Map-Key component to differentiate tags with identical names if userAuthlevel >= 4 { // Support+ : Show tags for all scopes, regardless of count for _, tag := range tags { - tagItem := map[string]interface{}{ + tagItem := map[string]any{ "id": tag.ID, "name": tag.Name, "scope": tag.Scope, - "count": counts[fmt.Sprint(tag.Name, tag.ID)], + "count": counts[fmt.Sprint(tag.Type, tag.Name, tag.ID)], } tagMap[tag.Type] = append(tagMap[tag.Type], tagItem) } } else if userAuthlevel < 4 && userAuthlevel >= 2 { // User+ : Show global and admin scope only if at least 1 tag used, private scope regardless of count for _, tag := range tags { - tagCount := counts[fmt.Sprint(tag.Name, tag.ID)] + tagCount := counts[fmt.Sprint(tag.Type, tag.Name, tag.ID)] if ((tag.Scope 
== "global" || tag.Scope == "admin") && tagCount >= 1) || (tag.Scope != "global" && tag.Scope != "admin") { - tagItem := map[string]interface{}{ + tagItem := map[string]any{ "id": tag.ID, "name": tag.Name, "scope": tag.Scope, @@ -215,8 +232,8 @@ func setupTaglistRoute(i InfoType, r *http.Request) InfoType { } // FIXME: Lots of redundant code. Needs refactoring -func buildFilterPresets(query url.Values) map[string]interface{} { - filterPresets := map[string]interface{}{} +func buildFilterPresets(query url.Values) map[string]any { + filterPresets := map[string]any{} if query.Get("cluster") != "" { filterPresets["cluster"] = query.Get("cluster") @@ -243,6 +260,12 @@ func buildFilterPresets(query url.Values) map[string]interface{} { if len(query["state"]) != 0 { filterPresets["state"] = query["state"] } + if query.Get("shared") != "" { + filterPresets["shared"] = query.Get("shared") + } + if query.Get("schedule") != "" { + filterPresets["schedule"] = query.Get("schedule") + } if rawtags, ok := query["tag"]; ok { tags := make([]int, len(rawtags)) for i, tid := range rawtags { @@ -257,10 +280,22 @@ func buildFilterPresets(query url.Values) map[string]interface{} { if query.Get("duration") != "" { parts := strings.Split(query.Get("duration"), "-") if len(parts) == 2 { - a, e1 := strconv.Atoi(parts[0]) - b, e2 := strconv.Atoi(parts[1]) - if e1 == nil && e2 == nil { - filterPresets["duration"] = map[string]int{"from": a, "to": b} + if parts[0] == "lessthan" { + lt, lte := strconv.Atoi(parts[1]) + if lte == nil { + filterPresets["duration"] = map[string]int{"lessThan": lt, "from": 0, "to": 0} + } + } else if parts[0] == "morethan" { + mt, mte := strconv.Atoi(parts[1]) + if mte == nil { + filterPresets["duration"] = map[string]int{"moreThan": mt, "from": 0, "to": 0} + } + } else { + a, e1 := strconv.Atoi(parts[0]) + b, e2 := strconv.Atoi(parts[1]) + if e1 == nil && e2 == nil { + filterPresets["duration"] = map[string]int{"from": a, "to": b} + } } } } @@ -270,30 +305,66 @@ 
func buildFilterPresets(query url.Values) map[string]interface{} { if query.Get("numNodes") != "" { parts := strings.Split(query.Get("numNodes"), "-") if len(parts) == 2 { - a, e1 := strconv.Atoi(parts[0]) - b, e2 := strconv.Atoi(parts[1]) - if e1 == nil && e2 == nil { - filterPresets["numNodes"] = map[string]int{"from": a, "to": b} + if parts[0] == "lessthan" { + lt, lte := strconv.Atoi(parts[1]) + if lte == nil { + filterPresets["numNodes"] = map[string]int{"from": 1, "to": lt} + } + } else if parts[0] == "morethan" { + mt, mte := strconv.Atoi(parts[1]) + if mte == nil { + filterPresets["numNodes"] = map[string]int{"from": mt, "to": 0} + } + } else { + a, e1 := strconv.Atoi(parts[0]) + b, e2 := strconv.Atoi(parts[1]) + if e1 == nil && e2 == nil { + filterPresets["numNodes"] = map[string]int{"from": a, "to": b} + } } } } if query.Get("numHWThreads") != "" { parts := strings.Split(query.Get("numHWThreads"), "-") if len(parts) == 2 { - a, e1 := strconv.Atoi(parts[0]) - b, e2 := strconv.Atoi(parts[1]) - if e1 == nil && e2 == nil { - filterPresets["numHWThreads"] = map[string]int{"from": a, "to": b} + if parts[0] == "lessthan" { + lt, lte := strconv.Atoi(parts[1]) + if lte == nil { + filterPresets["numHWThreads"] = map[string]int{"from": 1, "to": lt} + } + } else if parts[0] == "morethan" { + mt, mte := strconv.Atoi(parts[1]) + if mte == nil { + filterPresets["numHWThreads"] = map[string]int{"from": mt, "to": 0} + } + } else { + a, e1 := strconv.Atoi(parts[0]) + b, e2 := strconv.Atoi(parts[1]) + if e1 == nil && e2 == nil { + filterPresets["numHWThreads"] = map[string]int{"from": a, "to": b} + } } } } if query.Get("numAccelerators") != "" { parts := strings.Split(query.Get("numAccelerators"), "-") if len(parts) == 2 { - a, e1 := strconv.Atoi(parts[0]) - b, e2 := strconv.Atoi(parts[1]) - if e1 == nil && e2 == nil { - filterPresets["numAccelerators"] = map[string]int{"from": a, "to": b} + if parts[0] == "lessthan" { + lt, lte := strconv.Atoi(parts[1]) + if lte == nil { + 
filterPresets["numAccelerators"] = map[string]int{"from": 1, "to": lt} + } + } else if parts[0] == "morethan" { + mt, mte := strconv.Atoi(parts[1]) + if mte == nil { + filterPresets["numAccelerators"] = map[string]int{"from": mt, "to": 0} + } + } else { + a, e1 := strconv.Atoi(parts[0]) + b, e2 := strconv.Atoi(parts[1]) + if e1 == nil && e2 == nil { + filterPresets["numAccelerators"] = map[string]int{"from": a, "to": b} + } } } } @@ -334,27 +405,61 @@ func buildFilterPresets(query url.Values) map[string]interface{} { if query.Get("energy") != "" { parts := strings.Split(query.Get("energy"), "-") if len(parts) == 2 { - a, e1 := strconv.Atoi(parts[0]) - b, e2 := strconv.Atoi(parts[1]) - if e1 == nil && e2 == nil { - filterPresets["energy"] = map[string]int{"from": a, "to": b} + if parts[0] == "lessthan" { + lt, lte := strconv.Atoi(parts[1]) + if lte == nil { + filterPresets["energy"] = map[string]int{"from": 1, "to": lt} + } + } else if parts[0] == "morethan" { + mt, mte := strconv.Atoi(parts[1]) + if mte == nil { + filterPresets["energy"] = map[string]int{"from": mt, "to": 0} + } + } else { + a, e1 := strconv.Atoi(parts[0]) + b, e2 := strconv.Atoi(parts[1]) + if e1 == nil && e2 == nil { + filterPresets["energy"] = map[string]int{"from": a, "to": b} + } } } } if len(query["stat"]) != 0 { - statList := make([]map[string]interface{}, 0) + statList := make([]map[string]any, 0) for _, statEntry := range query["stat"] { parts := strings.Split(statEntry, "-") if len(parts) == 3 { // Metric Footprint Stat Field, from - to - a, e1 := strconv.ParseInt(parts[1], 10, 64) - b, e2 := strconv.ParseInt(parts[2], 10, 64) - if e1 == nil && e2 == nil { - statEntry := map[string]interface{}{ - "field": parts[0], - "from": a, - "to": b, + if parts[1] == "lessthan" { + lt, lte := strconv.ParseInt(parts[2], 10, 64) + if lte == nil { + statEntry := map[string]any{ + "field": parts[0], + "from": 1, + "to": lt, + } + statList = append(statList, statEntry) + } + } else if parts[1] == 
"morethan" { + mt, mte := strconv.ParseInt(parts[2], 10, 64) + if mte == nil { + statEntry := map[string]any{ + "field": parts[0], + "from": mt, + "to": 0, + } + statList = append(statList, statEntry) + } + } else { + a, e1 := strconv.ParseInt(parts[1], 10, 64) + b, e2 := strconv.ParseInt(parts[2], 10, 64) + if e1 == nil && e2 == nil { + statEntry := map[string]any{ + "field": parts[0], + "from": a, + "to": b, + } + statList = append(statList, statEntry) } - statList = append(statList, statEntry) } } } @@ -363,10 +468,9 @@ func buildFilterPresets(query url.Values) map[string]interface{} { return filterPresets } -func SetupRoutes(router *mux.Router, buildInfo web.Build) { +func SetupRoutes(router chi.Router, buildInfo web.Build) { userCfgRepo := repository.GetUserCfgRepo() for _, route := range routes { - route := route router.HandleFunc(route.Route, func(rw http.ResponseWriter, r *http.Request) { conf, err := userCfgRepo.GetUIConfig(repository.GetUserFromContext(r.Context())) if err != nil { @@ -375,7 +479,7 @@ func SetupRoutes(router *mux.Router, buildInfo web.Build) { } title := route.Title - infos := route.Setup(map[string]interface{}{}, r) + infos := route.Setup(map[string]any{}, r) if id, ok := infos["id"]; ok { title = strings.Replace(route.Title, "", id.(string), 1) if sid, ok := infos["sid"]; ok { // 2nd ID element @@ -436,7 +540,7 @@ func HandleSearchBar(rw http.ResponseWriter, r *http.Request, buildInfo web.Buil http.Redirect(rw, r, "/monitoring/jobs/?startTime="+fromTime+"-"+untilTime+"&arrayJobId="+url.QueryEscape(strings.Trim(splitSearch[1], " ")), http.StatusFound) // All Users: Redirect to Tablequery case "username": if user.HasAnyRole([]schema.Role{schema.RoleAdmin, schema.RoleSupport, schema.RoleManager}) { - http.Redirect(rw, r, "/monitoring/users/?user="+url.QueryEscape(strings.Trim(splitSearch[1], " ")), http.StatusFound) + http.Redirect(rw, r, "/monitoring/users/?user="+url.QueryEscape(strings.Trim(splitSearch[1], " "))+"&startTime=last30d", 
http.StatusFound) } else { web.RenderTemplate(rw, "message.tmpl", &web.Page{Title: "Error", MsgType: "alert-danger", Message: "Missing Access Rights", User: *user, Roles: availableRoles, Build: buildInfo}) } @@ -444,10 +548,10 @@ func HandleSearchBar(rw http.ResponseWriter, r *http.Request, buildInfo web.Buil usernames, _ := repo.FindColumnValues(user, strings.Trim(splitSearch[1], " "), "user", "username", "name") if len(usernames) != 0 { joinedNames := strings.Join(usernames, "&user=") - http.Redirect(rw, r, "/monitoring/users/?user="+joinedNames, http.StatusFound) + http.Redirect(rw, r, "/monitoring/users/?user="+joinedNames+"&startTime=last30d", http.StatusFound) } else { if user.HasAnyRole([]schema.Role{schema.RoleAdmin, schema.RoleSupport, schema.RoleManager}) { - http.Redirect(rw, r, "/monitoring/users/?user=NoUserNameFound", http.StatusPermanentRedirect) + http.Redirect(rw, r, "/monitoring/users/?user=NoUserNameFound&startTime=last30d", http.StatusPermanentRedirect) } else { web.RenderTemplate(rw, "message.tmpl", &web.Page{Title: "Error", MsgType: "alert-danger", Message: "Missing Access Rights", User: *user, Roles: availableRoles, Build: buildInfo}) } diff --git a/internal/tagger/apps/lammps.txt b/internal/tagger/apps/lammps.txt deleted file mode 100644 index d254f82f..00000000 --- a/internal/tagger/apps/lammps.txt +++ /dev/null @@ -1 +0,0 @@ -lmp diff --git a/internal/tagger/apps/vasp.txt b/internal/tagger/apps/vasp.txt deleted file mode 100644 index 9f9b9d5d..00000000 --- a/internal/tagger/apps/vasp.txt +++ /dev/null @@ -1,2 +0,0 @@ -vasp -VASP diff --git a/internal/tagger/classifyJob.go b/internal/tagger/classifyJob.go index 4e46f370..1bad61f1 100644 --- a/internal/tagger/classifyJob.go +++ b/internal/tagger/classifyJob.go @@ -2,29 +2,39 @@ // All rights reserved. This file is part of cc-backend. // Use of this source code is governed by a MIT-style // license that can be found in the LICENSE file. 
+ package tagger import ( "bytes" - "embed" "encoding/json" "fmt" "maps" + "math" "os" + "path/filepath" "strings" "text/template" "github.com/ClusterCockpit/cc-backend/internal/repository" "github.com/ClusterCockpit/cc-backend/pkg/archive" - cclog "github.com/ClusterCockpit/cc-lib/ccLogger" - "github.com/ClusterCockpit/cc-lib/schema" - "github.com/ClusterCockpit/cc-lib/util" + cclog "github.com/ClusterCockpit/cc-lib/v2/ccLogger" + "github.com/ClusterCockpit/cc-lib/v2/schema" + "github.com/ClusterCockpit/cc-lib/v2/util" "github.com/expr-lang/expr" "github.com/expr-lang/expr/vm" ) -//go:embed jobclasses/* -var jobClassFiles embed.FS +const ( + // defaultJobClassConfigPath is the default path for job classification configuration + defaultJobClassConfigPath = "./var/tagger/jobclasses" + // tagTypeJobClass is the tag type identifier for job classification tags + tagTypeJobClass = "jobClass" + // jobClassConfigDirMatch is the directory name used for matching filesystem events + jobClassConfigDirMatch = "jobclasses" + // parametersFileName is the name of the parameters configuration file + parametersFileName = "parameters.json" +) // Variable defines a named expression that can be computed and reused in rules. // Variables are evaluated before the main rule and their results are added to the environment. @@ -45,21 +55,21 @@ type ruleVariable struct { // and the final rule expression that determines if the job matches the classification. 
type RuleFormat struct { // Name is a human-readable description of the rule - Name string `json:"name"` + Name string `json:"name"` // Tag is the classification tag to apply if the rule matches - Tag string `json:"tag"` + Tag string `json:"tag"` // Parameters are shared values referenced in the rule (e.g., thresholds) - Parameters []string `json:"parameters"` + Parameters []string `json:"parameters"` // Metrics are the job metrics required for this rule (e.g., "cpu_load", "mem_used") - Metrics []string `json:"metrics"` + Metrics []string `json:"metrics"` // Requirements are boolean expressions that must be true for the rule to apply - Requirements []string `json:"requirements"` + Requirements []string `json:"requirements"` // Variables are computed values used in the rule expression - Variables []Variable `json:"variables"` + Variables []Variable `json:"variables"` // Rule is the boolean expression that determines if the job matches - Rule string `json:"rule"` + Rule string `json:"rule"` // Hint is a template string that generates a message when the rule matches - Hint string `json:"hint"` + Hint string `json:"hint"` } type ruleInfo struct { @@ -75,33 +85,56 @@ type ruleInfo struct { // This interface allows for easier testing and decoupling from the concrete repository implementation. 
type JobRepository interface { // HasTag checks if a job already has a specific tag - HasTag(jobId int64, tagType string, tagName string) bool + HasTag(jobID int64, tagType string, tagName string) bool // AddTagOrCreateDirect adds a tag to a job or creates it if it doesn't exist - AddTagOrCreateDirect(jobId int64, tagType string, tagName string) (tagId int64, err error) + AddTagOrCreateDirect(jobID int64, tagType string, tagName string) (tagID int64, err error) // UpdateMetadata updates job metadata with a key-value pair UpdateMetadata(job *schema.Job, key, val string) (err error) } // JobClassTagger classifies jobs based on configurable rules that evaluate job metrics and properties. -// Rules are loaded from embedded JSON files and can be dynamically reloaded from a watched directory. +// Rules are loaded from an external configuration directory and can be dynamically reloaded when files change. // When a job matches a rule, it is tagged with the corresponding classification and an optional hint message. 
type JobClassTagger struct { // rules maps classification tags to their compiled rule information - rules map[string]ruleInfo + rules map[string]ruleInfo // parameters are shared values (e.g., thresholds) used across multiple rules - parameters map[string]any + parameters map[string]any // tagType is the type of tag ("jobClass") - tagType string + tagType string // cfgPath is the path to watch for configuration changes - cfgPath string + cfgPath string // repo provides access to job database operations - repo JobRepository + repo JobRepository // getStatistics retrieves job statistics for analysis - getStatistics func(job *schema.Job) (map[string]schema.JobStatistics, error) + getStatistics func(job *schema.Job) (map[string]schema.JobStatistics, error) // getMetricConfig retrieves metric configuration (limits) for a cluster getMetricConfig func(cluster, subCluster string) map[string]*schema.Metric } +// roundEnv returns a copy of env with all float64 values rounded to 2 decimal places. +// Nested map[string]any and map[string]float64 values are recursed into. +func roundEnv(env map[string]any) map[string]any { + rounded := make(map[string]any, len(env)) + for k, v := range env { + switch val := v.(type) { + case float64: + rounded[k] = math.Round(val*100) / 100 + case map[string]any: + rounded[k] = roundEnv(val) + case map[string]float64: + rm := make(map[string]float64, len(val)) + for mk, mv := range val { + rm[mk] = math.Round(mv*100) / 100 + } + rounded[k] = rm + default: + rounded[k] = v + } + } + return rounded +} + func (t *JobClassTagger) prepareRule(b []byte, fns string) { var rule RuleFormat if err := json.NewDecoder(bytes.NewReader(b)).Decode(&rule); err != nil { @@ -169,7 +202,7 @@ func (t *JobClassTagger) prepareRule(b []byte, fns string) { // EventMatch checks if a filesystem event should trigger configuration reload. // It returns true if the event path contains "jobclasses". 
func (t *JobClassTagger) EventMatch(s string) bool { - return strings.Contains(s, "jobclasses") + return strings.Contains(s, jobClassConfigDirMatch) } // EventCallback is triggered when the configuration directory changes. @@ -181,9 +214,12 @@ func (t *JobClassTagger) EventCallback() { cclog.Fatal(err) } - if util.CheckFileExists(t.cfgPath + "/parameters.json") { + t.rules = make(map[string]ruleInfo) + + parametersFile := filepath.Join(t.cfgPath, parametersFileName) + if util.CheckFileExists(parametersFile) { cclog.Info("Merge parameters") - b, err := os.ReadFile(t.cfgPath + "/parameters.json") + b, err := os.ReadFile(parametersFile) if err != nil { cclog.Warnf("prepareRule() > open file error: %v", err) } @@ -198,13 +234,13 @@ func (t *JobClassTagger) EventCallback() { for _, fn := range files { fns := fn.Name() - if fns != "parameters.json" { + if fns != parametersFileName { cclog.Debugf("Process: %s", fns) - filename := fmt.Sprintf("%s/%s", t.cfgPath, fns) + filename := filepath.Join(t.cfgPath, fns) b, err := os.ReadFile(filename) if err != nil { cclog.Warnf("prepareRule() > open file error: %v", err) - return + continue } t.prepareRule(b, fns) } @@ -213,7 +249,8 @@ func (t *JobClassTagger) EventCallback() { func (t *JobClassTagger) initParameters() error { cclog.Info("Initialize parameters") - b, err := jobClassFiles.ReadFile("jobclasses/parameters.json") + parametersFile := filepath.Join(t.cfgPath, parametersFileName) + b, err := os.ReadFile(parametersFile) if err != nil { cclog.Warnf("prepareRule() > open file error: %v", err) return err @@ -227,13 +264,20 @@ func (t *JobClassTagger) initParameters() error { return nil } -// Register initializes the JobClassTagger by loading parameters and classification rules. -// It loads embedded configuration files and sets up a file watch on ./var/tagger/jobclasses -// if it exists, allowing for dynamic configuration updates without restarting the application. 
-// Returns an error if the embedded configuration files cannot be read or parsed. +// Register initializes the JobClassTagger by loading parameters and classification rules from external folder. +// It sets up a file watch on ./var/tagger/jobclasses if it exists, allowing for +// dynamic configuration updates without restarting the application. +// Returns an error if the configuration path does not exist or cannot be read. func (t *JobClassTagger) Register() error { - t.cfgPath = "./var/tagger/jobclasses" - t.tagType = "jobClass" + if t.cfgPath == "" { + t.cfgPath = defaultJobClassConfigPath + } + t.tagType = tagTypeJobClass + t.rules = make(map[string]ruleInfo) + + if !util.CheckFileExists(t.cfgPath) { + return fmt.Errorf("configuration path does not exist: %s", t.cfgPath) + } err := t.initParameters() if err != nil { @@ -241,31 +285,28 @@ func (t *JobClassTagger) Register() error { return err } - files, err := jobClassFiles.ReadDir("jobclasses") + files, err := os.ReadDir(t.cfgPath) if err != nil { - return fmt.Errorf("error reading app folder: %#v", err) + return fmt.Errorf("error reading jobclasses folder: %#v", err) } - t.rules = make(map[string]ruleInfo) + for _, fn := range files { fns := fn.Name() - if fns != "parameters.json" { - filename := fmt.Sprintf("jobclasses/%s", fns) + if fns != parametersFileName { cclog.Infof("Process: %s", fns) + filename := filepath.Join(t.cfgPath, fns) - b, err := jobClassFiles.ReadFile(filename) + b, err := os.ReadFile(filename) if err != nil { cclog.Warnf("prepareRule() > open file error: %v", err) - return err + continue } t.prepareRule(b, fns) } } - if util.CheckFileExists(t.cfgPath) { - t.EventCallback() - cclog.Infof("Setup file watch for %s", t.cfgPath) - util.AddListener(t.cfgPath, t) - } + cclog.Infof("Setup file watch for %s", t.cfgPath) + util.AddListener(t.cfgPath, t) t.repo = repository.GetJobRepository() t.getStatistics = archive.GetStatistics @@ -286,21 +327,25 @@ func (t *JobClassTagger) Register() error { // 
- Shared parameters defined in parameters.json // - Computed variables from the rule definition // -// Rules are evaluated in arbitrary order. If multiple rules match, only the first -// encountered match is applied (FIXME: this should handle multiple matches). +// Rules are evaluated in arbitrary order. Multiple rules can match and apply +// their tags to the same job. Hint messages from all matching rules are collected +// and stored as a combined message in the job metadata. func (t *JobClassTagger) Match(job *schema.Job) { jobStats, err := t.getStatistics(job) metricsList := t.getMetricConfig(job.Cluster, job.SubCluster) - cclog.Infof("Enter match rule with %d rules for job %d", len(t.rules), job.JobID) + cclog.Debugf("Enter match rule with %d rules for job %d", len(t.rules), job.JobID) if err != nil { - cclog.Errorf("job classification failed for job %d: %#v", job.JobID, err) + cclog.Errorf("job classification failed for job %d: %#v", job.JobID, err) return } + id := *job.ID + var messages []string + for tag, ri := range t.rules { env := make(map[string]any) maps.Copy(env, ri.env) - cclog.Infof("Try to match rule %s for job %d", tag, job.JobID) + cclog.Debugf("Try to match rule %s for job %d", tag, job.JobID) // Initialize environment env["job"] = map[string]any{ @@ -314,11 +359,13 @@ func (t *JobClassTagger) Match(job *schema.Job) { } // add metrics to env + skipRule := false for _, m := range ri.metrics { stats, ok := jobStats[m] if !ok { - cclog.Errorf("job classification failed for job %d: missing metric '%s'", job.JobID, m) - return + cclog.Errorf("job classification: missing metric '%s' for rule %s on job %d", m, tag, job.JobID) + skipRule = true + break } env[m] = map[string]any{ "min": stats.Min, @@ -332,62 +379,71 @@ func (t *JobClassTagger) Match(job *schema.Job) { }, } } + if skipRule { + continue + } // check rule requirements apply + requirementsMet := true for _, r := range ri.requirements { ok, err := expr.Run(r, env) if err != nil { 
cclog.Errorf("error running requirement for rule %s: %#v", tag, err) - return + requirementsMet = false + break } if !ok.(bool) { - cclog.Infof("requirement for rule %s not met", tag) - return + cclog.Debugf("requirement for rule %s not met", tag) + requirementsMet = false + break } } + if !requirementsMet { + continue + } - // validate rule expression + // evaluate rule variables + varError := false for _, v := range ri.variables { value, err := expr.Run(v.expr, env) if err != nil { - cclog.Errorf("error running rule %s: %#v", tag, err) - return + cclog.Errorf("error evaluating variable %s for rule %s: %#v", v.name, tag, err) + varError = true + break } env[v.name] = value } - - // dump.P(env) + if varError { + continue + } match, err := expr.Run(ri.rule, env) if err != nil { cclog.Errorf("error running rule %s: %#v", tag, err) - return + continue } if match.(bool) { - cclog.Info("Rule matches!") - id := *job.ID if !t.repo.HasTag(id, t.tagType, tag) { - _, err := t.repo.AddTagOrCreateDirect(id, t.tagType, tag) - if err != nil { - return + if _, err := t.repo.AddTagOrCreateDirect(id, t.tagType, tag); err != nil { + cclog.Errorf("failed to add tag '%s' to job %d: %v", tag, id, err) + continue } } // process hint template var msg bytes.Buffer - if err := ri.hint.Execute(&msg, env); err != nil { + if err := ri.hint.Execute(&msg, roundEnv(env)); err != nil { cclog.Errorf("Template error: %s", err.Error()) - return + continue } + messages = append(messages, msg.String()) + } + } - // FIXME: Handle case where multiple tags apply - // FIXME: Handle case where multiple tags apply - err = t.repo.UpdateMetadata(job, "message", msg.String()) - if err != nil { - return - } - } else { - cclog.Info("Rule does not match!") + if len(messages) > 0 { + combined := strings.Join(messages, "\n") + if err := t.repo.UpdateMetadata(job, "message", combined); err != nil { + cclog.Errorf("failed to update metadata for job %d: %v", *job.ID, err) } } } diff --git 
a/internal/tagger/classifyJob_test.go b/internal/tagger/classifyJob_test.go index 3795a60a..f82cf807 100644 --- a/internal/tagger/classifyJob_test.go +++ b/internal/tagger/classifyJob_test.go @@ -3,7 +3,7 @@ package tagger import ( "testing" - "github.com/ClusterCockpit/cc-lib/schema" + "github.com/ClusterCockpit/cc-lib/v2/schema" "github.com/stretchr/testify/assert" "github.com/stretchr/testify/mock" ) @@ -13,13 +13,13 @@ type MockJobRepository struct { mock.Mock } -func (m *MockJobRepository) HasTag(jobId int64, tagType string, tagName string) bool { - args := m.Called(jobId, tagType, tagName) +func (m *MockJobRepository) HasTag(jobID int64, tagType string, tagName string) bool { + args := m.Called(jobID, tagType, tagName) return args.Bool(0) } -func (m *MockJobRepository) AddTagOrCreateDirect(jobId int64, tagType string, tagName string) (tagId int64, err error) { - args := m.Called(jobId, tagType, tagName) +func (m *MockJobRepository) AddTagOrCreateDirect(jobID int64, tagType string, tagName string) (tagID int64, err error) { + args := m.Called(jobID, tagType, tagName) return args.Get(0).(int64), args.Error(1) } diff --git a/internal/tagger/detectApp.go b/internal/tagger/detectApp.go index 4e8f858d..97b9d6b0 100644 --- a/internal/tagger/detectApp.go +++ b/internal/tagger/detectApp.go @@ -7,107 +7,158 @@ package tagger import ( "bufio" - "embed" "fmt" - "io/fs" "os" "path/filepath" "regexp" "strings" "github.com/ClusterCockpit/cc-backend/internal/repository" - cclog "github.com/ClusterCockpit/cc-lib/ccLogger" - "github.com/ClusterCockpit/cc-lib/schema" - "github.com/ClusterCockpit/cc-lib/util" + cclog "github.com/ClusterCockpit/cc-lib/v2/ccLogger" + "github.com/ClusterCockpit/cc-lib/v2/schema" + "github.com/ClusterCockpit/cc-lib/v2/util" ) -//go:embed apps/* -var appFiles embed.FS +func metadataKeys(m map[string]string) []string { + keys := make([]string, 0, len(m)) + for k := range m { + keys = append(keys, k) + } + return keys +} + +const ( + // 
defaultConfigPath is the default path for application tagging configuration + defaultConfigPath = "./var/tagger/apps" + // tagTypeApp is the tag type identifier for application tags + tagTypeApp = "app" + // configDirMatch is the directory name used for matching filesystem events + configDirMatch = "apps" +) type appInfo struct { - tag string - strings []string + tag string + patterns []*regexp.Regexp } // AppTagger detects applications by matching patterns in job scripts. -// It loads application patterns from embedded files and can dynamically reload -// configuration from a watched directory. When a job script matches a pattern, +// It loads application patterns from an external configuration directory and can dynamically reload +// configuration when files change. When a job script matches a pattern, // the corresponding application tag is automatically applied. type AppTagger struct { - // apps maps application tags to their matching patterns - apps map[string]appInfo + // apps holds application patterns in deterministic order + apps []appInfo // tagType is the type of tag ("app") tagType string // cfgPath is the path to watch for configuration changes cfgPath string } -func (t *AppTagger) scanApp(f fs.File, fns string) { +func (t *AppTagger) scanApp(f *os.File, fns string) { scanner := bufio.NewScanner(f) - ai := appInfo{tag: strings.TrimSuffix(fns, filepath.Ext(fns)), strings: make([]string, 0)} + tag := strings.TrimSuffix(fns, filepath.Ext(fns)) + ai := appInfo{tag: tag, patterns: make([]*regexp.Regexp, 0)} for scanner.Scan() { - ai.strings = append(ai.strings, scanner.Text()) + line := strings.TrimSpace(scanner.Text()) + if line == "" { + continue + } + // Wrap pattern to skip comment lines: match only if not preceded by # on the same line + wrapped := `(?m)^[^#]*` + line + re, err := regexp.Compile(wrapped) + if err != nil { + cclog.Errorf("invalid regex pattern '%s' (wrapped: '%s') in %s: %v", line, wrapped, fns, err) + continue + } + ai.patterns = 
append(ai.patterns, re) } - delete(t.apps, ai.tag) - t.apps[ai.tag] = ai + + // Remove existing entry for this tag if present + for i, a := range t.apps { + if a.tag == tag { + t.apps = append(t.apps[:i], t.apps[i+1:]...) + break + } + } + + cclog.Infof("AppTagger loaded %d patterns for %s", len(ai.patterns), tag) + t.apps = append(t.apps, ai) } // EventMatch checks if a filesystem event should trigger configuration reload. // It returns true if the event path contains "apps". func (t *AppTagger) EventMatch(s string) bool { - return strings.Contains(s, "apps") + return strings.Contains(s, configDirMatch) } // EventCallback is triggered when the configuration directory changes. // It reloads all application pattern files from the watched directory. -// FIXME: Only process the file that caused the event func (t *AppTagger) EventCallback() { files, err := os.ReadDir(t.cfgPath) if err != nil { cclog.Fatal(err) } + t.apps = make([]appInfo, 0) + for _, fn := range files { + if fn.IsDir() { + continue + } fns := fn.Name() cclog.Debugf("Process: %s", fns) - f, err := os.Open(fmt.Sprintf("%s/%s", t.cfgPath, fns)) + f, err := os.Open(filepath.Join(t.cfgPath, fns)) if err != nil { cclog.Errorf("error opening app file %s: %#v", fns, err) + continue } t.scanApp(f, fns) + if err := f.Close(); err != nil { + cclog.Errorf("error closing app file %s: %#v", fns, err) + } } } -// Register initializes the AppTagger by loading application patterns from embedded files. -// It also sets up a file watch on ./var/tagger/apps if it exists, allowing for +// Register initializes the AppTagger by loading application patterns from external folder. +// It sets up a file watch on ./var/tagger/apps if it exists, allowing for // dynamic configuration updates without restarting the application. -// Returns an error if the embedded application files cannot be read. +// Returns an error if the configuration path does not exist or cannot be read. 
func (t *AppTagger) Register() error { - t.cfgPath = "./var/tagger/apps" - t.tagType = "app" + if t.cfgPath == "" { + t.cfgPath = defaultConfigPath + } + t.tagType = tagTypeApp + t.apps = make([]appInfo, 0) - files, err := appFiles.ReadDir("apps") + if !util.CheckFileExists(t.cfgPath) { + return fmt.Errorf("configuration path does not exist: %s", t.cfgPath) + } + + files, err := os.ReadDir(t.cfgPath) if err != nil { return fmt.Errorf("error reading app folder: %#v", err) } - t.apps = make(map[string]appInfo, 0) + for _, fn := range files { + if fn.IsDir() { + continue + } fns := fn.Name() cclog.Debugf("Process: %s", fns) - f, err := appFiles.Open(fmt.Sprintf("apps/%s", fns)) + f, err := os.Open(filepath.Join(t.cfgPath, fns)) if err != nil { - return fmt.Errorf("error opening app file %s: %#v", fns, err) + cclog.Errorf("error opening app file %s: %#v", fns, err) + continue } - defer f.Close() t.scanApp(f, fns) + if err := f.Close(); err != nil { + cclog.Errorf("error closing app file %s: %#v", fns, err) + } } - if util.CheckFileExists(t.cfgPath) { - t.EventCallback() - cclog.Infof("Setup file watch for %s", t.cfgPath) - util.AddListener(t.cfgPath, t) - } + cclog.Infof("Setup file watch for %s", t.cfgPath) + util.AddListener(t.cfgPath, t) return nil } @@ -116,33 +167,61 @@ func (t *AppTagger) Register() error { // It fetches the job metadata, extracts the job script, and matches it against // all configured application patterns using regular expressions. // If a match is found, the corresponding application tag is added to the job. -// Only the first matching application is tagged. +// Multiple application tags can be applied if patterns for different apps match. 
func (t *AppTagger) Match(job *schema.Job) { r := repository.GetJobRepository() + + if len(t.apps) == 0 { + cclog.Warn("AppTagger: no app patterns loaded, skipping match") + return + } + metadata, err := r.FetchMetadata(job) if err != nil { - cclog.Infof("Cannot fetch metadata for job: %d on %s", job.JobID, job.Cluster) + cclog.Debugf("AppTagger: cannot fetch metadata for job %d on %s: %v", job.JobID, job.Cluster, err) + return + } + + if metadata == nil { + cclog.Debugf("AppTagger: metadata is nil for job %d on %s", job.JobID, job.Cluster) return } jobscript, ok := metadata["jobScript"] - if ok { - id := *job.ID + if !ok { + cclog.Debugf("AppTagger: no 'jobScript' key in metadata for job %d on %s (keys: %v)", + job.JobID, job.Cluster, metadataKeys(metadata)) + return + } - out: - for _, a := range t.apps { - tag := a.tag - for _, s := range a.strings { - matched, _ := regexp.MatchString(s, strings.ToLower(jobscript)) - if matched { - if !r.HasTag(id, t.tagType, tag) { - r.AddTagOrCreateDirect(id, t.tagType, tag) - break out + if len(jobscript) == 0 { + cclog.Debugf("AppTagger: empty jobScript for job %d on %s", job.JobID, job.Cluster) + return + } + + id := *job.ID + jobscriptLower := strings.ToLower(jobscript) + cclog.Debugf("AppTagger: matching job %d (script length: %d) against %d apps", id, len(jobscriptLower), len(t.apps)) + + matched := false + for _, a := range t.apps { + for _, re := range a.patterns { + if re.MatchString(jobscriptLower) { + if r.HasTag(id, t.tagType, a.tag) { + cclog.Debugf("AppTagger: job %d already has tag %s:%s, skipping", id, t.tagType, a.tag) + } else { + cclog.Debugf("AppTagger: pattern '%s' matched for app '%s' on job %d", re.String(), a.tag, id) + if _, err := r.AddTagOrCreateDirect(id, t.tagType, a.tag); err != nil { + cclog.Errorf("AppTagger: failed to add tag '%s' to job %d: %v", a.tag, id, err) } } + matched = true + break // matched this app, move to next app } } - } else { - cclog.Infof("Cannot extract job script for job: %d 
on %s", job.JobID, job.Cluster) + } + + if !matched { + cclog.Debugf("AppTagger: no pattern matched for job %d on %s", id, job.Cluster) } } diff --git a/internal/tagger/detectApp_test.go b/internal/tagger/detectApp_test.go index 295ee97c..7cd05a08 100644 --- a/internal/tagger/detectApp_test.go +++ b/internal/tagger/detectApp_test.go @@ -5,19 +5,21 @@ package tagger import ( + "os" + "path/filepath" "testing" "github.com/ClusterCockpit/cc-backend/internal/repository" - cclog "github.com/ClusterCockpit/cc-lib/ccLogger" + cclog "github.com/ClusterCockpit/cc-lib/v2/ccLogger" ) func setup(tb testing.TB) *repository.JobRepository { tb.Helper() cclog.Init("warn", true) dbfile := "../repository/testdata/job.db" - err := repository.MigrateDB("sqlite3", dbfile) + err := repository.MigrateDB(dbfile) noErr(tb, err) - repository.Connect("sqlite3", dbfile) + repository.Connect(dbfile) return repository.GetJobRepository() } @@ -29,28 +31,88 @@ func noErr(tb testing.TB, err error) { } } -func TestRegister(t *testing.T) { - var tagger AppTagger +func setupAppTaggerTestDir(t *testing.T) string { + t.Helper() - err := tagger.Register() + testDir := t.TempDir() + appsDir := filepath.Join(testDir, "apps") + err := os.MkdirAll(appsDir, 0o755) noErr(t, err) + srcDir := "../../configs/tagger/apps" + files, err := os.ReadDir(srcDir) + noErr(t, err) + + for _, file := range files { + if file.IsDir() { + continue + } + srcPath := filepath.Join(srcDir, file.Name()) + dstPath := filepath.Join(appsDir, file.Name()) + + data, err := os.ReadFile(srcPath) + noErr(t, err) + + err = os.WriteFile(dstPath, data, 0o644) + noErr(t, err) + } + + return appsDir +} + +func TestRegister(t *testing.T) { + appsDir := setupAppTaggerTestDir(t) + + var tagger AppTagger + tagger.cfgPath = appsDir + tagger.tagType = tagTypeApp + tagger.apps = make([]appInfo, 0) + + files, err := os.ReadDir(appsDir) + noErr(t, err) + + for _, fn := range files { + if fn.IsDir() { + continue + } + fns := fn.Name() + f, err := 
os.Open(filepath.Join(appsDir, fns)) + noErr(t, err) + tagger.scanApp(f, fns) + f.Close() + } + if len(tagger.apps) != 16 { t.Errorf("wrong summary for diagnostic \ngot: %d \nwant: 16", len(tagger.apps)) } } func TestMatch(t *testing.T) { + appsDir := setupAppTaggerTestDir(t) r := setup(t) - job, err := r.FindByIdDirect(317) + job, err := r.FindByIDDirect(317) noErr(t, err) var tagger AppTagger + tagger.cfgPath = appsDir + tagger.tagType = tagTypeApp + tagger.apps = make([]appInfo, 0) - err = tagger.Register() + files, err := os.ReadDir(appsDir) noErr(t, err) + for _, fn := range files { + if fn.IsDir() { + continue + } + fns := fn.Name() + f, err := os.Open(filepath.Join(appsDir, fns)) + noErr(t, err) + tagger.scanApp(f, fns) + f.Close() + } + tagger.Match(job) if !r.HasTag(317, "app", "vasp") { diff --git a/internal/tagger/jobclasses/lowUtilization.json b/internal/tagger/jobclasses/lowUtilization.json deleted file mode 100644 index e84b81da..00000000 --- a/internal/tagger/jobclasses/lowUtilization.json +++ /dev/null @@ -1,22 +0,0 @@ -{ - "name": "Low ressource utilization", - "tag": "lowutilization", - "parameters": ["job_min_duration_seconds"], - "metrics": ["flops_any", "mem_bw"], - "requirements": [ - "job.shared == \"none\"", - "job.duration > job_min_duration_seconds" - ], - "variables": [ - { - "name": "mem_bw_perc", - "expr": "1.0 - (mem_bw.avg / mem_bw.limits.peak)" - }, - { - "name": "flops_any_perc", - "expr": "1.0 - (flops_any.avg / flops_any.limits.peak)" - } - ], - "rule": "flops_any.avg < flops_any.limits.alert && mem_bw.avg < mem_bw.limits.alert", - "hint": "This job was detected as low utilization because the average flop rate {{.flops_any.avg}} falls below the threshold {{.flops_any.limits.alert}}." 
-} diff --git a/internal/tagger/jobclasses/lowload.json b/internal/tagger/jobclasses/lowload.json deleted file mode 100644 index f952da59..00000000 --- a/internal/tagger/jobclasses/lowload.json +++ /dev/null @@ -1,26 +0,0 @@ -{ - "name": "Low CPU load", - "tag": "lowload", - "parameters": [ - "lowcpuload_threshold_factor", - "job_min_duration_seconds", - "sampling_interval_seconds" - ], - "metrics": ["cpu_load"], - "requirements": [ - "job.shared == \"none\"", - "job.duration > job_min_duration_seconds" - ], - "variables": [ - { - "name": "load_threshold", - "expr": "job.numCores * lowcpuload_threshold_factor" - }, - { - "name": "load_perc", - "expr": "1.0 - (cpu_load.avg / cpu_load.limits.peak)" - } - ], - "rule": "cpu_load.avg < cpu_load.limits.caution", - "hint": "This job was detected as lowload because the average cpu load {{.cpu_load}} falls below the threshold {{.cpu_load.limits.caution}}." -} diff --git a/internal/tagger/tagger.go b/internal/tagger/tagger.go index 028d9efe..5ee27e08 100644 --- a/internal/tagger/tagger.go +++ b/internal/tagger/tagger.go @@ -10,11 +10,12 @@ package tagger import ( + "fmt" "sync" "github.com/ClusterCockpit/cc-backend/internal/repository" - cclog "github.com/ClusterCockpit/cc-lib/ccLogger" - "github.com/ClusterCockpit/cc-lib/schema" + cclog "github.com/ClusterCockpit/cc-lib/v2/ccLogger" + "github.com/ClusterCockpit/cc-lib/v2/schema" ) // Tagger is the interface that must be implemented by all tagging components. @@ -29,18 +30,38 @@ type Tagger interface { Match(job *schema.Job) } +// TaggerInfo holds metadata about a tagger for JSON serialization. 
+type TaggerInfo struct { + Name string `json:"name"` + Type string `json:"type"` + Running bool `json:"running"` +} + var ( - initOnce sync.Once - jobTagger *JobTagger + initOnce sync.Once + jobTagger *JobTagger + statusMu sync.Mutex + taggerStatus = map[string]bool{} ) +// Known tagger definitions: name -> (type, factory) +type taggerDef struct { + ttype string + factory func() Tagger +} + +var knownTaggers = map[string]taggerDef{ + "AppTagger": {ttype: "start", factory: func() Tagger { return &AppTagger{} }}, + "JobClassTagger": {ttype: "stop", factory: func() Tagger { return &JobClassTagger{} }}, +} + // JobTagger coordinates multiple taggers that run at different job lifecycle events. // It maintains separate lists of taggers that run when jobs start and when they stop. type JobTagger struct { // startTaggers are applied when a job starts (e.g., application detection) startTaggers []Tagger // stopTaggers are applied when a job completes (e.g., job classification) - stopTaggers []Tagger + stopTaggers []Tagger } func newTagger() { @@ -51,10 +72,14 @@ func newTagger() { jobTagger.stopTaggers = append(jobTagger.stopTaggers, &JobClassTagger{}) for _, tagger := range jobTagger.startTaggers { - tagger.Register() + if err := tagger.Register(); err != nil { + cclog.Errorf("failed to register start tagger: %s", err) + } } for _, tagger := range jobTagger.stopTaggers { - tagger.Register() + if err := tagger.Register(); err != nil { + cclog.Errorf("failed to register stop tagger: %s", err) + } } } @@ -64,7 +89,7 @@ func newTagger() { func Init() { initOnce.Do(func() { newTagger() - repository.RegisterJobJook(jobTagger) + repository.RegisterJobHook(jobTagger) }) } @@ -84,6 +109,73 @@ func (jt *JobTagger) JobStopCallback(job *schema.Job) { } } +// ListTaggers returns information about all known taggers with their current running status. 
+func ListTaggers() []TaggerInfo { + statusMu.Lock() + defer statusMu.Unlock() + + result := make([]TaggerInfo, 0, len(knownTaggers)) + for name, def := range knownTaggers { + result = append(result, TaggerInfo{ + Name: name, + Type: def.ttype, + Running: taggerStatus[name], + }) + } + return result +} + +// RunTaggerByName starts a tagger by name asynchronously on all jobs. +// Returns an error if the name is unknown or the tagger is already running. +func RunTaggerByName(name string) error { + def, ok := knownTaggers[name] + if !ok { + return fmt.Errorf("unknown tagger: %s", name) + } + + statusMu.Lock() + if taggerStatus[name] { + statusMu.Unlock() + return fmt.Errorf("tagger %s is already running", name) + } + taggerStatus[name] = true + statusMu.Unlock() + + go func() { + defer func() { + statusMu.Lock() + taggerStatus[name] = false + statusMu.Unlock() + }() + + t := def.factory() + if err := t.Register(); err != nil { + cclog.Errorf("Failed to register tagger %s: %s", name, err) + return + } + + r := repository.GetJobRepository() + jl, err := r.GetJobList(0, 0) + if err != nil { + cclog.Errorf("Error getting job list for tagger %s: %s", name, err) + return + } + + cclog.Infof("Running tagger %s on %d jobs", name, len(jl)) + for _, id := range jl { + job, err := r.FindByIDDirect(id) + if err != nil { + cclog.Errorf("Error getting job %d for tagger %s: %s", id, name, err) + continue + } + t.Match(job) + } + cclog.Infof("Tagger %s completed", name) + }() + + return nil +} + // RunTaggers applies all configured taggers to all existing jobs in the repository. // This is useful for retroactively applying tags to jobs that were created before // the tagger system was initialized or when new tagging rules are added. 
@@ -98,7 +190,7 @@ func RunTaggers() error { } for _, id := range jl { - job, err := r.FindByIdDirect(id) + job, err := r.FindByIDDirect(id) if err != nil { cclog.Errorf("Error while getting job %s", err) return err @@ -107,7 +199,7 @@ func RunTaggers() error { tagger.Match(job) } for _, tagger := range jobTagger.stopTaggers { - cclog.Infof("Run stop tagger for job %d", job.ID) + cclog.Infof("Run stop tagger for job %d", *job.ID) tagger.Match(job) } } diff --git a/internal/tagger/tagger_test.go b/internal/tagger/tagger_test.go index c81fac4a..d24ad7f7 100644 --- a/internal/tagger/tagger_test.go +++ b/internal/tagger/tagger_test.go @@ -8,7 +8,7 @@ import ( "testing" "github.com/ClusterCockpit/cc-backend/internal/repository" - "github.com/ClusterCockpit/cc-lib/schema" + "github.com/ClusterCockpit/cc-lib/v2/schema" ) func TestInit(t *testing.T) { @@ -18,7 +18,7 @@ func TestInit(t *testing.T) { func TestJobStartCallback(t *testing.T) { Init() r := setup(t) - job, err := r.FindByIdDirect(525) + job, err := r.FindByIDDirect(525) noErr(t, err) jobs := make([]*schema.Job, 0, 1) diff --git a/internal/taskmanager/commitJobService.go b/internal/taskmanager/commitJobService.go index 4f21c86b..4a070284 100644 --- a/internal/taskmanager/commitJobService.go +++ b/internal/taskmanager/commitJobService.go @@ -9,7 +9,7 @@ import ( "time" "github.com/ClusterCockpit/cc-backend/internal/repository" - cclog "github.com/ClusterCockpit/cc-lib/ccLogger" + cclog "github.com/ClusterCockpit/cc-lib/v2/ccLogger" "github.com/go-co-op/gocron/v2" ) diff --git a/internal/taskmanager/compressionService.go b/internal/taskmanager/compressionService.go index c2df852d..353fcb65 100644 --- a/internal/taskmanager/compressionService.go +++ b/internal/taskmanager/compressionService.go @@ -9,15 +9,15 @@ import ( "time" "github.com/ClusterCockpit/cc-backend/pkg/archive" - cclog "github.com/ClusterCockpit/cc-lib/ccLogger" - "github.com/ClusterCockpit/cc-lib/schema" + cclog 
"github.com/ClusterCockpit/cc-lib/v2/ccLogger" + "github.com/ClusterCockpit/cc-lib/v2/schema" "github.com/go-co-op/gocron/v2" ) func RegisterCompressionService(compressOlderThan int) { cclog.Info("Register compression service") - s.NewJob(gocron.DailyJob(1, gocron.NewAtTimes(gocron.NewAtTime(0o5, 0, 0))), + s.NewJob(gocron.DailyJob(1, gocron.NewAtTimes(gocron.NewAtTime(5, 0, 0))), gocron.NewTask( func() { var jobs []*schema.Job @@ -28,10 +28,10 @@ func RegisterCompressionService(compressOlderThan int) { lastTime := ar.CompressLast(startTime) if startTime == lastTime { cclog.Info("Compression Service - Complete archive run") - jobs, err = jobRepo.FindJobsBetween(0, startTime, false) + jobs, err = jobRepo.FindJobsBetween(0, startTime, "none") } else { - jobs, err = jobRepo.FindJobsBetween(lastTime, startTime, false) + jobs, err = jobRepo.FindJobsBetween(lastTime, startTime, "none") } if err != nil { diff --git a/internal/taskmanager/ldapSyncService.go b/internal/taskmanager/ldapSyncService.go index e410af9e..55a99bab 100644 --- a/internal/taskmanager/ldapSyncService.go +++ b/internal/taskmanager/ldapSyncService.go @@ -9,7 +9,7 @@ import ( "time" "github.com/ClusterCockpit/cc-backend/internal/auth" - cclog "github.com/ClusterCockpit/cc-lib/ccLogger" + cclog "github.com/ClusterCockpit/cc-lib/v2/ccLogger" "github.com/go-co-op/gocron/v2" ) @@ -23,7 +23,8 @@ func RegisterLdapSyncService(ds string) { auth := auth.GetAuthInstance() - cclog.Info("Register LDAP sync service") + cclog.Infof("register ldap sync service with %s interval", ds) + s.NewJob(gocron.DurationJob(interval), gocron.NewTask( func() { @@ -32,6 +33,5 @@ func RegisterLdapSyncService(ds string) { if err := auth.LdapAuth.Sync(); err != nil { cclog.Errorf("ldap sync failed: %s", err.Error()) } - cclog.Print("ldap sync done") })) } diff --git a/internal/taskmanager/nodestateRetentionService.go b/internal/taskmanager/nodestateRetentionService.go new file mode 100644 index 00000000..b6306849 --- /dev/null +++ 
b/internal/taskmanager/nodestateRetentionService.go @@ -0,0 +1,120 @@ +// Copyright (C) NHR@FAU, University Erlangen-Nuremberg. +// All rights reserved. This file is part of cc-backend. +// Use of this source code is governed by a MIT-style +// license that can be found in the LICENSE file. + +package taskmanager + +import ( + "time" + + "github.com/ClusterCockpit/cc-backend/internal/config" + "github.com/ClusterCockpit/cc-backend/internal/repository" + pqarchive "github.com/ClusterCockpit/cc-backend/pkg/archive/parquet" + cclog "github.com/ClusterCockpit/cc-lib/v2/ccLogger" + "github.com/go-co-op/gocron/v2" +) + +func RegisterNodeStateRetentionDeleteService(ageHours int) { + cclog.Info("Register node state retention delete service") + + s.NewJob(gocron.DailyJob(1, gocron.NewAtTimes(gocron.NewAtTime(2, 0, 0))), + gocron.NewTask( + func() { + cutoff := time.Now().Unix() - int64(ageHours*3600) + nodeRepo := repository.GetNodeRepository() + cnt, err := nodeRepo.DeleteNodeStatesBefore(cutoff) + if err != nil { + cclog.Errorf("NodeState retention: error deleting old rows: %v", err) + } else if cnt > 0 { + cclog.Infof("NodeState retention: deleted %d old rows", cnt) + } + })) +} + +func RegisterNodeStateRetentionMoveService(cfg *config.NodeStateRetention) { + cclog.Info("Register node state retention move service") + + maxFileSizeMB := cfg.MaxFileSizeMB + if maxFileSizeMB <= 0 { + maxFileSizeMB = 128 + } + + ageHours := cfg.Age + if ageHours <= 0 { + ageHours = 24 + } + + var target pqarchive.ParquetTarget + var err error + + switch cfg.TargetKind { + case "s3": + target, err = pqarchive.NewS3Target(pqarchive.S3TargetConfig{ + Endpoint: cfg.TargetEndpoint, + Bucket: cfg.TargetBucket, + AccessKey: cfg.TargetAccessKey, + SecretKey: cfg.TargetSecretKey, + Region: cfg.TargetRegion, + UsePathStyle: cfg.TargetUsePathStyle, + }) + default: + target, err = pqarchive.NewFileTarget(cfg.TargetPath) + } + + if err != nil { + cclog.Errorf("NodeState move retention: failed to create 
target: %v", err) + return + } + + s.NewJob(gocron.DailyJob(1, gocron.NewAtTimes(gocron.NewAtTime(2, 30, 0))), + gocron.NewTask( + func() { + cutoff := time.Now().Unix() - int64(ageHours*3600) + nodeRepo := repository.GetNodeRepository() + + rows, err := nodeRepo.FindNodeStatesBefore(cutoff) + if err != nil { + cclog.Errorf("NodeState move retention: error finding rows: %v", err) + return + } + if len(rows) == 0 { + return + } + + cclog.Infof("NodeState move retention: archiving %d rows", len(rows)) + pw := pqarchive.NewNodeStateParquetWriter(target, maxFileSizeMB) + + for _, ns := range rows { + row := pqarchive.ParquetNodeStateRow{ + TimeStamp: ns.TimeStamp, + NodeState: ns.NodeState, + HealthState: ns.HealthState, + HealthMetrics: ns.HealthMetrics, + CpusAllocated: int32(ns.CpusAllocated), + MemoryAllocated: ns.MemoryAllocated, + GpusAllocated: int32(ns.GpusAllocated), + JobsRunning: int32(ns.JobsRunning), + Hostname: ns.Hostname, + Cluster: ns.Cluster, + SubCluster: ns.SubCluster, + } + if err := pw.AddRow(row); err != nil { + cclog.Errorf("NodeState move retention: add row: %v", err) + continue + } + } + + if err := pw.Close(); err != nil { + cclog.Errorf("NodeState move retention: close writer: %v", err) + return + } + + cnt, err := nodeRepo.DeleteNodeStatesBefore(cutoff) + if err != nil { + cclog.Errorf("NodeState move retention: error deleting rows: %v", err) + } else { + cclog.Infof("NodeState move retention: deleted %d rows from db", cnt) + } + })) +} diff --git a/internal/taskmanager/retentionService.go b/internal/taskmanager/retentionService.go index 0a61bc4a..48e5c042 100644 --- a/internal/taskmanager/retentionService.go +++ b/internal/taskmanager/retentionService.go @@ -6,63 +6,329 @@ package taskmanager import ( + "encoding/json" + "fmt" + "os" + "path/filepath" + "strconv" + "strings" "time" "github.com/ClusterCockpit/cc-backend/pkg/archive" - cclog "github.com/ClusterCockpit/cc-lib/ccLogger" + pqarchive 
"github.com/ClusterCockpit/cc-backend/pkg/archive/parquet" + cclog "github.com/ClusterCockpit/cc-lib/v2/ccLogger" + "github.com/ClusterCockpit/cc-lib/v2/schema" "github.com/go-co-op/gocron/v2" ) -func RegisterRetentionDeleteService(age int, includeDB bool, omitTagged bool) { +// createParquetTarget creates a ParquetTarget (file or S3) from the retention config. +func createParquetTarget(cfg Retention) (pqarchive.ParquetTarget, error) { + switch cfg.TargetKind { + case "s3": + return pqarchive.NewS3Target(pqarchive.S3TargetConfig{ + Endpoint: cfg.TargetEndpoint, + Bucket: cfg.TargetBucket, + AccessKey: cfg.TargetAccessKey, + SecretKey: cfg.TargetSecretKey, + Region: cfg.TargetRegion, + UsePathStyle: cfg.TargetUsePathStyle, + }) + default: + return pqarchive.NewFileTarget(cfg.TargetPath) + } +} + +// createTargetBackend creates a secondary archive backend (file or S3) for JSON copy/move. +func createTargetBackend(cfg Retention) (archive.ArchiveBackend, error) { + var raw json.RawMessage + var err error + + switch cfg.TargetKind { + case "s3": + raw, err = json.Marshal(map[string]any{ + "kind": "s3", + "endpoint": cfg.TargetEndpoint, + "bucket": cfg.TargetBucket, + "access-key": cfg.TargetAccessKey, + "secret-key": cfg.TargetSecretKey, + "region": cfg.TargetRegion, + "use-path-style": cfg.TargetUsePathStyle, + }) + default: + raw, err = json.Marshal(map[string]string{ + "kind": "file", + "path": cfg.TargetPath, + }) + } + if err != nil { + return nil, fmt.Errorf("marshal target config: %w", err) + } + return archive.InitBackend(raw) +} + +// transferJobsJSON copies job data from source archive to target backend in JSON format. 
+func transferJobsJSON(jobs []*schema.Job, src archive.ArchiveBackend, dst archive.ArchiveBackend) error { + // Transfer cluster configs for all clusters referenced by jobs + clustersDone := make(map[string]bool) + for _, job := range jobs { + if clustersDone[job.Cluster] { + continue + } + clusterCfg, err := src.LoadClusterCfg(job.Cluster) + if err != nil { + cclog.Warnf("Retention: load cluster config %q: %v", job.Cluster, err) + } else { + if err := dst.StoreClusterCfg(job.Cluster, clusterCfg); err != nil { + cclog.Warnf("Retention: store cluster config %q: %v", job.Cluster, err) + } + } + clustersDone[job.Cluster] = true + } + + for _, job := range jobs { + meta, err := src.LoadJobMeta(job) + if err != nil { + cclog.Warnf("Retention: load meta for job %d: %v", job.JobID, err) + continue + } + data, err := src.LoadJobData(job) + if err != nil { + cclog.Warnf("Retention: load data for job %d: %v", job.JobID, err) + continue + } + if err := dst.ImportJob(meta, &data); err != nil { + cclog.Warnf("Retention: import job %d: %v", job.JobID, err) + continue + } + } + return nil +} + +// transferJobsParquet converts jobs to Parquet format, organized by cluster. 
+func transferJobsParquet(jobs []*schema.Job, src archive.ArchiveBackend, target pqarchive.ParquetTarget, maxSizeMB int) error { + cw := pqarchive.NewClusterAwareParquetWriter(target, maxSizeMB) + + // Set cluster configs for all clusters referenced by jobs + clustersDone := make(map[string]bool) + for _, job := range jobs { + if clustersDone[job.Cluster] { + continue + } + clusterCfg, err := src.LoadClusterCfg(job.Cluster) + if err != nil { + cclog.Warnf("Retention: load cluster config %q: %v", job.Cluster, err) + } else { + cw.SetClusterConfig(job.Cluster, clusterCfg) + } + clustersDone[job.Cluster] = true + } + + for _, job := range jobs { + meta, err := src.LoadJobMeta(job) + if err != nil { + cclog.Warnf("Retention: load meta for job %d: %v", job.JobID, err) + continue + } + data, err := src.LoadJobData(job) + if err != nil { + cclog.Warnf("Retention: load data for job %d: %v", job.JobID, err) + continue + } + row, err := pqarchive.JobToParquetRow(meta, &data) + if err != nil { + cclog.Warnf("Retention: convert job %d: %v", job.JobID, err) + continue + } + if err := cw.AddJob(*row); err != nil { + cclog.Errorf("Retention: add job %d to writer: %v", job.JobID, err) + continue + } + } + + return cw.Close() +} + +// cleanupAfterTransfer removes jobs from archive and optionally from DB. +func cleanupAfterTransfer(jobs []*schema.Job, startTime int64, includeDB bool, omitTagged string) { + archive.GetHandle().CleanUp(jobs) + + if includeDB { + cnt, err := jobRepo.DeleteJobsBefore(startTime, omitTagged) + if err != nil { + cclog.Errorf("Retention: delete jobs from db: %v", err) + } else { + cclog.Infof("Retention: removed %d jobs from db", cnt) + } + if err = jobRepo.Optimize(); err != nil { + cclog.Errorf("Retention: db optimization error: %v", err) + } + } +} + +// readCopyMarker reads the last-processed timestamp from a copy marker file. 
+func readCopyMarker(cfg Retention) int64 { + var data []byte + var err error + + switch cfg.TargetKind { + case "s3": + // For S3 we store the marker locally alongside the config + data, err = os.ReadFile(copyMarkerPath(cfg)) + default: + data, err = os.ReadFile(filepath.Join(cfg.TargetPath, ".copy-marker")) + } + if err != nil { + return 0 + } + ts, err := strconv.ParseInt(strings.TrimSpace(string(data)), 10, 64) + if err != nil { + return 0 + } + return ts +} + +// writeCopyMarker writes the last-processed timestamp to a copy marker file. +func writeCopyMarker(cfg Retention, ts int64) { + content := []byte(strconv.FormatInt(ts, 10)) + var err error + + switch cfg.TargetKind { + case "s3": + err = os.WriteFile(copyMarkerPath(cfg), content, 0o640) + default: + err = os.WriteFile(filepath.Join(cfg.TargetPath, ".copy-marker"), content, 0o640) + } + if err != nil { + cclog.Warnf("Retention: write copy marker: %v", err) + } +} + +func copyMarkerPath(cfg Retention) string { + // For S3 targets, store the marker in a local temp-style path derived from the bucket name + return filepath.Join(os.TempDir(), fmt.Sprintf("cc-copy-marker-%s", cfg.TargetBucket)) +} + +func RegisterRetentionDeleteService(cfg Retention) { cclog.Info("Register retention delete service") - s.NewJob(gocron.DailyJob(1, gocron.NewAtTimes(gocron.NewAtTime(0o4, 0, 0))), + s.NewJob(gocron.DailyJob(1, gocron.NewAtTimes(gocron.NewAtTime(3, 0, 0))), gocron.NewTask( func() { - startTime := time.Now().Unix() - int64(age*24*3600) - jobs, err := jobRepo.FindJobsBetween(0, startTime, omitTagged) + startTime := time.Now().Unix() - int64(cfg.Age*24*3600) + jobs, err := jobRepo.FindJobsBetween(0, startTime, cfg.OmitTagged) if err != nil { - cclog.Warnf("Error while looking for retention jobs: %s", err.Error()) + cclog.Warnf("Retention delete: error finding jobs: %v", err) + return + } + if len(jobs) == 0 { + return } - archive.GetHandle().CleanUp(jobs) - if includeDB { - cnt, err := 
jobRepo.DeleteJobsBefore(startTime, omitTagged) - if err != nil { - cclog.Errorf("Error while deleting retention jobs from db: %s", err.Error()) - } else { - cclog.Infof("Retention: Removed %d jobs from db", cnt) - } - if err = jobRepo.Optimize(); err != nil { - cclog.Errorf("Error occured in db optimization: %s", err.Error()) - } - } + cclog.Infof("Retention delete: processing %d jobs", len(jobs)) + cleanupAfterTransfer(jobs, startTime, cfg.IncludeDB, cfg.OmitTagged) })) } -func RegisterRetentionMoveService(age int, includeDB bool, location string, omitTagged bool) { - cclog.Info("Register retention move service") +func RegisterRetentionCopyService(cfg Retention) { + cclog.Infof("Register retention copy service (format=%s, target=%s)", cfg.Format, cfg.TargetKind) - s.NewJob(gocron.DailyJob(1, gocron.NewAtTimes(gocron.NewAtTime(0o4, 0, 0))), + maxFileSizeMB := cfg.MaxFileSizeMB + if maxFileSizeMB <= 0 { + maxFileSizeMB = 512 + } + + s.NewJob(gocron.DailyJob(1, gocron.NewAtTimes(gocron.NewAtTime(4, 0, 0))), gocron.NewTask( func() { - startTime := time.Now().Unix() - int64(age*24*3600) - jobs, err := jobRepo.FindJobsBetween(0, startTime, omitTagged) - if err != nil { - cclog.Warnf("Error while looking for retention jobs: %s", err.Error()) - } - archive.GetHandle().Move(jobs, location) + cutoff := time.Now().Unix() - int64(cfg.Age*24*3600) + lastProcessed := readCopyMarker(cfg) - if includeDB { - cnt, err := jobRepo.DeleteJobsBefore(startTime, omitTagged) + jobs, err := jobRepo.FindJobsBetween(lastProcessed, cutoff, cfg.OmitTagged) + if err != nil { + cclog.Warnf("Retention copy: error finding jobs: %v", err) + return + } + if len(jobs) == 0 { + return + } + + cclog.Infof("Retention copy: processing %d jobs", len(jobs)) + ar := archive.GetHandle() + + switch cfg.Format { + case "parquet": + target, err := createParquetTarget(cfg) if err != nil { - cclog.Errorf("Error while deleting retention jobs from db: %v", err) - } else { - cclog.Infof("Retention: Removed %d jobs 
from db", cnt) + cclog.Errorf("Retention copy: create parquet target: %v", err) + return } - if err = jobRepo.Optimize(); err != nil { - cclog.Errorf("Error occured in db optimization: %v", err) + if err := transferJobsParquet(jobs, ar, target, maxFileSizeMB); err != nil { + cclog.Errorf("Retention copy: parquet transfer: %v", err) + return + } + default: // json + dst, err := createTargetBackend(cfg) + if err != nil { + cclog.Errorf("Retention copy: create target backend: %v", err) + return + } + if err := transferJobsJSON(jobs, ar, dst); err != nil { + cclog.Errorf("Retention copy: json transfer: %v", err) + return } } + + writeCopyMarker(cfg, cutoff) + })) +} + +func RegisterRetentionMoveService(cfg Retention) { + cclog.Infof("Register retention move service (format=%s, target=%s)", cfg.Format, cfg.TargetKind) + + maxFileSizeMB := cfg.MaxFileSizeMB + if maxFileSizeMB <= 0 { + maxFileSizeMB = 512 + } + + s.NewJob(gocron.DailyJob(1, gocron.NewAtTimes(gocron.NewAtTime(5, 0, 0))), + gocron.NewTask( + func() { + startTime := time.Now().Unix() - int64(cfg.Age*24*3600) + jobs, err := jobRepo.FindJobsBetween(0, startTime, cfg.OmitTagged) + if err != nil { + cclog.Warnf("Retention move: error finding jobs: %v", err) + return + } + if len(jobs) == 0 { + return + } + + cclog.Infof("Retention move: processing %d jobs", len(jobs)) + ar := archive.GetHandle() + + switch cfg.Format { + case "parquet": + target, err := createParquetTarget(cfg) + if err != nil { + cclog.Errorf("Retention move: create parquet target: %v", err) + return + } + if err := transferJobsParquet(jobs, ar, target, maxFileSizeMB); err != nil { + cclog.Errorf("Retention move: parquet transfer: %v", err) + return + } + default: // json + dst, err := createTargetBackend(cfg) + if err != nil { + cclog.Errorf("Retention move: create target backend: %v", err) + return + } + if err := transferJobsJSON(jobs, ar, dst); err != nil { + cclog.Errorf("Retention move: json transfer: %v", err) + return + } + } + + 
cleanupAfterTransfer(jobs, startTime, cfg.IncludeDB, cfg.OmitTagged) })) } diff --git a/internal/taskmanager/stopJobsExceedTime.go b/internal/taskmanager/stopJobsExceedTime.go index e59b3aee..ce9cfd77 100644 --- a/internal/taskmanager/stopJobsExceedTime.go +++ b/internal/taskmanager/stopJobsExceedTime.go @@ -9,14 +9,14 @@ import ( "runtime" "github.com/ClusterCockpit/cc-backend/internal/config" - cclog "github.com/ClusterCockpit/cc-lib/ccLogger" + cclog "github.com/ClusterCockpit/cc-lib/v2/ccLogger" "github.com/go-co-op/gocron/v2" ) func RegisterStopJobsExceedTime() { cclog.Info("Register undead jobs service") - s.NewJob(gocron.DailyJob(1, gocron.NewAtTimes(gocron.NewAtTime(0o3, 0, 0))), + s.NewJob(gocron.DailyJob(1, gocron.NewAtTimes(gocron.NewAtTime(3, 0, 0))), gocron.NewTask( func() { err := jobRepo.StopJobsExceedingWalltimeBy(config.Keys.StopJobsExceedingWalltime) diff --git a/internal/taskmanager/taskManager.go b/internal/taskmanager/taskManager.go index 57f2d883..d758ee52 100644 --- a/internal/taskmanager/taskManager.go +++ b/internal/taskmanager/taskManager.go @@ -13,17 +13,30 @@ import ( "github.com/ClusterCockpit/cc-backend/internal/auth" "github.com/ClusterCockpit/cc-backend/internal/config" "github.com/ClusterCockpit/cc-backend/internal/repository" - cclog "github.com/ClusterCockpit/cc-lib/ccLogger" + cclog "github.com/ClusterCockpit/cc-lib/v2/ccLogger" "github.com/go-co-op/gocron/v2" ) +const ( + DefaultCompressOlderThan = 7 +) + // Retention defines the configuration for job retention policies. 
type Retention struct { - Policy string `json:"policy"` - Location string `json:"location"` - Age int `json:"age"` - IncludeDB bool `json:"includeDB"` - OmitTagged bool `json:"omitTagged"` + Policy string `json:"policy"` + Format string `json:"format"` + Age int `json:"age"` + IncludeDB bool `json:"include-db"` + OmitTagged string `json:"omit-tagged"` + TargetKind string `json:"target-kind"` + TargetPath string `json:"target-path"` + TargetEndpoint string `json:"target-endpoint"` + TargetBucket string `json:"target-bucket"` + TargetAccessKey string `json:"target-access-key"` + TargetSecretKey string `json:"target-secret-key"` + TargetRegion string `json:"target-region"` + TargetUsePathStyle bool `json:"target-use-path-style"` + MaxFileSizeMB int `json:"max-file-size-mb"` } // CronFrequency defines the execution intervals for various background workers. @@ -60,6 +73,33 @@ func parseDuration(s string) (time.Duration, error) { return interval, nil } +func initArchiveServices(config json.RawMessage) { + var cfg struct { + Retention Retention `json:"retention"` + Compression int `json:"compression"` + } + cfg.Retention.IncludeDB = true + + if err := json.Unmarshal(config, &cfg); err != nil { + cclog.Errorf("error while unmarshaling raw config json: %v", err) + } + + switch cfg.Retention.Policy { + case "delete": + RegisterRetentionDeleteService(cfg.Retention) + case "copy": + RegisterRetentionCopyService(cfg.Retention) + case "move": + RegisterRetentionMoveService(cfg.Retention) + } + + if cfg.Compression > 0 { + RegisterCompressionService(cfg.Compression) + } else { + RegisterCompressionService(DefaultCompressOlderThan) + } +} + // Start initializes the task manager, parses configurations, and registers background tasks. // It starts the gocron scheduler. 
func Start(cronCfg, archiveConfig json.RawMessage) { @@ -80,32 +120,11 @@ func Start(cronCfg, archiveConfig json.RawMessage) { cclog.Errorf("error while decoding cron config: %v", err) } - var cfg struct { - Retention Retention `json:"retention"` - Compression int `json:"compression"` - } - cfg.Retention.IncludeDB = true - - if err := json.Unmarshal(archiveConfig, &cfg); err != nil { - cclog.Warn("Error while unmarshaling raw config json") - } - - switch cfg.Retention.Policy { - case "delete": - RegisterRetentionDeleteService( - cfg.Retention.Age, - cfg.Retention.IncludeDB, - cfg.Retention.OmitTagged) - case "move": - RegisterRetentionMoveService( - cfg.Retention.Age, - cfg.Retention.IncludeDB, - cfg.Retention.Location, - cfg.Retention.OmitTagged) - } - - if cfg.Compression > 0 { - RegisterCompressionService(cfg.Compression) + if archiveConfig != nil { + initArchiveServices(archiveConfig) + } else { + // Always enable compression + RegisterCompressionService(DefaultCompressOlderThan) } lc := auth.Keys.LdapConfig @@ -118,9 +137,30 @@ func Start(cronCfg, archiveConfig json.RawMessage) { RegisterUpdateDurationWorker() RegisterCommitJobService() + if config.Keys.NodeStateRetention != nil && config.Keys.NodeStateRetention.Policy != "" { + initNodeStateRetention() + } + s.Start() } +func initNodeStateRetention() { + cfg := config.Keys.NodeStateRetention + age := cfg.Age + if age <= 0 { + age = 24 + } + + switch cfg.Policy { + case "delete": + RegisterNodeStateRetentionDeleteService(age) + case "move": + RegisterNodeStateRetentionMoveService(cfg) + default: + cclog.Warnf("Unknown nodestate-retention policy: %s", cfg.Policy) + } +} + // Shutdown stops the task manager and its scheduler. 
func Shutdown() { if s != nil { diff --git a/internal/taskmanager/updateDurationService.go b/internal/taskmanager/updateDurationService.go index 9c52da79..f1dde74a 100644 --- a/internal/taskmanager/updateDurationService.go +++ b/internal/taskmanager/updateDurationService.go @@ -8,7 +8,7 @@ package taskmanager import ( "time" - cclog "github.com/ClusterCockpit/cc-lib/ccLogger" + cclog "github.com/ClusterCockpit/cc-lib/v2/ccLogger" "github.com/go-co-op/gocron/v2" ) diff --git a/internal/taskmanager/updateFootprintService.go b/internal/taskmanager/updateFootprintService.go index ae9512cd..34a18bdd 100644 --- a/internal/taskmanager/updateFootprintService.go +++ b/internal/taskmanager/updateFootprintService.go @@ -10,10 +10,10 @@ import ( "math" "time" - "github.com/ClusterCockpit/cc-backend/internal/metricdata" + "github.com/ClusterCockpit/cc-backend/internal/metricdispatch" "github.com/ClusterCockpit/cc-backend/pkg/archive" - cclog "github.com/ClusterCockpit/cc-lib/ccLogger" - "github.com/ClusterCockpit/cc-lib/schema" + cclog "github.com/ClusterCockpit/cc-lib/v2/ccLogger" + "github.com/ClusterCockpit/cc-lib/v2/schema" sq "github.com/Masterminds/squirrel" "github.com/go-co-op/gocron/v2" ) @@ -49,7 +49,7 @@ func RegisterFootprintWorker() { if err != nil { continue } - // NOTE: Additional Subcluster Loop Could Allow For Limited List Of Footprint-Metrics Only. + // NOTE: Additional SubCluster Loop Could Allow For Limited List Of Footprint-Metrics Only. 
// - Chunk-Size Would Then Be 'SubCluster' (Running Jobs, Transactions) as Lists Can Change Within SCs // - Would Require Review of 'updateFootprint' Usage (Logic Could Possibly Be Included Here Completely) allMetrics := make([]string, 0) @@ -58,12 +58,6 @@ func RegisterFootprintWorker() { allMetrics = append(allMetrics, mc.Name) } - repo, err := metricdata.GetMetricDataRepo(cluster.Name) - if err != nil { - cclog.Errorf("no metric data repository configured for '%s'", cluster.Name) - continue - } - pendingStatements := []sq.UpdateBuilder{} for _, job := range jobs { @@ -72,7 +66,14 @@ func RegisterFootprintWorker() { sJob := time.Now() - jobStats, err := repo.LoadStats(job, allMetrics, context.Background()) + ms, err := metricdispatch.GetMetricDataRepo(job.Cluster, job.SubCluster) + if err != nil { + cclog.Errorf("failed to access metricDataRepo for cluster %s-%s: %s", + job.Cluster, job.SubCluster, err.Error()) + continue + } + + jobStats, err := ms.LoadStats(job, allMetrics, context.Background()) if err != nil { cclog.Errorf("error wile loading job data stats for footprint update: %v", err) ce++ @@ -112,7 +113,7 @@ func RegisterFootprintWorker() { stmt := sq.Update("job") stmt, err = jobRepo.UpdateFootprint(stmt, job) if err != nil { - cclog.Errorf("update job (dbid: %d) statement build failed at footprint step: %s", job.ID, err.Error()) + cclog.Errorf("update job (dbid: %d) statement build failed at footprint step: %s", *job.ID, err.Error()) ce++ continue } diff --git a/pkg/archive/ConfigSchema.go b/pkg/archive/ConfigSchema.go index dfa7e1f6..1c2b7fe1 100644 --- a/pkg/archive/ConfigSchema.go +++ b/pkg/archive/ConfigSchema.go @@ -18,7 +18,7 @@ var configSchema = ` "description": "Path to job archive for file backend", "type": "string" }, - "dbPath": { + "db-path": { "description": "Path to SQLite database file for sqlite backend", "type": "string" }, @@ -26,11 +26,11 @@ var configSchema = ` "description": "S3 endpoint URL (for S3-compatible services like 
MinIO)", "type": "string" }, - "accessKey": { + "access-key": { "description": "S3 access key ID", "type": "string" }, - "secretKey": { + "secret-key": { "description": "S3 secret access key", "type": "string" }, @@ -42,7 +42,7 @@ var configSchema = ` "description": "AWS region for S3 bucket", "type": "string" }, - "usePathStyle": { + "use-path-style": { "description": "Use path-style S3 URLs (required for MinIO and some S3-compatible services)", "type": "boolean" }, @@ -57,19 +57,62 @@ var configSchema = ` "policy": { "description": "Retention policy", "type": "string", - "enum": ["none", "delete", "move"] + "enum": ["none", "delete", "copy", "move"] }, - "includeDB": { + "format": { + "description": "Output format for copy/move policies", + "type": "string", + "enum": ["json", "parquet"] + }, + "include-db": { "description": "Also remove jobs from database", "type": "boolean" }, + "omit-tagged": { + "description": "Omit tagged jobs from retention: none = include all, all = omit any tagged job, user = omit jobs with user-created tags (auto-tagger types 'app'/'jobClass' are not considered user tags)", + "type": "string", + "enum": ["none", "all", "user"] + }, "age": { "description": "Act on jobs with startTime older than age (in days)", "type": "integer" }, - "location": { - "description": "The target directory for retention. 
Only applicable for retention move.", + "target-kind": { + "description": "Target storage kind: file or s3", + "type": "string", + "enum": ["file", "s3"] + }, + "target-path": { + "description": "Target directory path for file storage", "type": "string" + }, + "target-endpoint": { + "description": "S3 endpoint URL for target", + "type": "string" + }, + "target-bucket": { + "description": "S3 bucket name for target", + "type": "string" + }, + "target-access-key": { + "description": "S3 access key for target", + "type": "string" + }, + "target-secret-key": { + "description": "S3 secret key for target", + "type": "string" + }, + "target-region": { + "description": "S3 region for target", + "type": "string" + }, + "target-use-path-style": { + "description": "Use path-style S3 URLs for target", + "type": "boolean" + }, + "max-file-size-mb": { + "description": "Maximum parquet file size in MB before splitting", + "type": "integer" } }, "required": ["policy"] diff --git a/pkg/archive/archive.go b/pkg/archive/archive.go index 71933f2b..f993f025 100644 --- a/pkg/archive/archive.go +++ b/pkg/archive/archive.go @@ -31,13 +31,15 @@ // } // } // -// For S3 backend: +// For S3 backend (endpoint, region, and usePathStyle are optional): // // { // "archive": { // "kind": "s3", +// "endpoint": "http://192.168.178.10", // "bucket": "my-job-archive", // "region": "us-east-1", +// "usePathStyle": true, // "accessKey": "...", // "secretKey": "..." // } @@ -85,9 +87,9 @@ import ( "sync" "github.com/ClusterCockpit/cc-backend/internal/config" - cclog "github.com/ClusterCockpit/cc-lib/ccLogger" - "github.com/ClusterCockpit/cc-lib/lrucache" - "github.com/ClusterCockpit/cc-lib/schema" + cclog "github.com/ClusterCockpit/cc-lib/v2/ccLogger" + "github.com/ClusterCockpit/cc-lib/v2/lrucache" + "github.com/ClusterCockpit/cc-lib/v2/schema" ) // Version is the current archive schema version. 
@@ -179,11 +181,10 @@ type JobContainer struct { } var ( - initOnce sync.Once - cache *lrucache.Cache = lrucache.New(128 * 1024 * 1024) - ar ArchiveBackend - useArchive bool - mutex sync.Mutex + initOnce sync.Once + cache *lrucache.Cache = lrucache.New(128 * 1024 * 1024) + ar ArchiveBackend + mutex sync.Mutex ) // Init initializes the archive backend with the provided configuration. @@ -195,12 +196,10 @@ var ( // // The configuration determines which backend is used (file, s3, or sqlite). // Returns an error if initialization fails or version is incompatible. -func Init(rawConfig json.RawMessage, disableArchive bool) error { +func Init(rawConfig json.RawMessage) error { var err error initOnce.Do(func() { - useArchive = !disableArchive - var cfg struct { Kind string `json:"kind"` } @@ -376,7 +375,7 @@ func UpdateMetadata(job *schema.Job, metadata map[string]string) error { mutex.Lock() defer mutex.Unlock() - if job.State == schema.JobStateRunning || !useArchive { + if job.State == schema.JobStateRunning { return nil } @@ -399,7 +398,7 @@ func UpdateTags(job *schema.Job, tags []*schema.Tag) error { mutex.Lock() defer mutex.Unlock() - if job.State == schema.JobStateRunning || !useArchive { + if job.State == schema.JobStateRunning { return nil } diff --git a/pkg/archive/archive_test.go b/pkg/archive/archive_test.go index 34ea831a..277f8ac5 100644 --- a/pkg/archive/archive_test.go +++ b/pkg/archive/archive_test.go @@ -11,8 +11,8 @@ import ( "testing" "github.com/ClusterCockpit/cc-backend/pkg/archive" - "github.com/ClusterCockpit/cc-lib/schema" - "github.com/ClusterCockpit/cc-lib/util" + "github.com/ClusterCockpit/cc-lib/v2/schema" + "github.com/ClusterCockpit/cc-lib/v2/util" ) var jobs []*schema.Job @@ -23,7 +23,7 @@ func setup(t *testing.T) archive.ArchiveBackend { util.CopyDir("./testdata/archive/", jobarchive) archiveCfg := fmt.Sprintf("{\"kind\": \"file\",\"path\": \"%s\"}", jobarchive) - if err := archive.Init(json.RawMessage(archiveCfg), false); err != nil { + if 
err := archive.Init(json.RawMessage(archiveCfg)); err != nil { t.Fatal(err) } diff --git a/pkg/archive/clusterConfig.go b/pkg/archive/clusterConfig.go index 13890c94..3e27e415 100644 --- a/pkg/archive/clusterConfig.go +++ b/pkg/archive/clusterConfig.go @@ -6,62 +6,69 @@ package archive import ( - "errors" "fmt" - cclog "github.com/ClusterCockpit/cc-lib/ccLogger" - "github.com/ClusterCockpit/cc-lib/schema" + cclog "github.com/ClusterCockpit/cc-lib/v2/ccLogger" + "github.com/ClusterCockpit/cc-lib/v2/schema" ) var ( - Clusters []*schema.Cluster - GlobalMetricList []*schema.GlobalMetricListItem - NodeLists map[string]map[string]NodeList + Clusters []*schema.Cluster + GlobalMetricList []*schema.GlobalMetricListItem + GlobalUserMetricList []*schema.GlobalMetricListItem + NodeLists map[string]map[string]NodeList ) func initClusterConfig() error { Clusters = []*schema.Cluster{} + GlobalMetricList = []*schema.GlobalMetricListItem{} + GlobalUserMetricList = []*schema.GlobalMetricListItem{} NodeLists = map[string]map[string]NodeList{} metricLookup := make(map[string]schema.GlobalMetricListItem) + userMetricLookup := make(map[string]schema.GlobalMetricListItem) for _, c := range ar.GetClusters() { cluster, err := ar.LoadClusterCfg(c) if err != nil { cclog.Warnf("Error while loading cluster config for cluster '%v'", c) - return err + return fmt.Errorf("failed to load cluster config for '%s': %w", c, err) } - if len(cluster.Name) == 0 || - len(cluster.MetricConfig) == 0 || - len(cluster.SubClusters) == 0 { - return errors.New("cluster.name, cluster.metricConfig and cluster.SubClusters should not be empty") + if len(cluster.Name) == 0 { + return fmt.Errorf("cluster name is empty in config for '%s'", c) + } + if len(cluster.MetricConfig) == 0 { + return fmt.Errorf("cluster '%s' has no metric configurations", cluster.Name) + } + if len(cluster.SubClusters) == 0 { + return fmt.Errorf("cluster '%s' has no subclusters defined", cluster.Name) } for _, mc := range cluster.MetricConfig { 
if len(mc.Name) == 0 { - return errors.New("cluster.metricConfig.name should not be empty") + return fmt.Errorf("cluster '%s' has a metric config with empty name", cluster.Name) } if mc.Timestep < 1 { - return errors.New("cluster.metricConfig.timestep should not be smaller than one") + return fmt.Errorf("metric '%s' in cluster '%s' has invalid timestep %d (must be >= 1)", mc.Name, cluster.Name, mc.Timestep) } - // For backwards compability... + // For backwards compatibility... if mc.Scope == "" { mc.Scope = schema.MetricScopeNode } if !mc.Scope.Valid() { - return errors.New("cluster.metricConfig.scope must be a valid scope ('node', 'scocket', ...)") + return fmt.Errorf("metric '%s' in cluster '%s' has invalid scope '%s' (must be 'node', 'socket', 'core', etc.)", mc.Name, cluster.Name, mc.Scope) } - ml, ok := metricLookup[mc.Name] - if !ok { + if _, ok := metricLookup[mc.Name]; !ok { metricLookup[mc.Name] = schema.GlobalMetricListItem{ Name: mc.Name, Scope: mc.Scope, Unit: mc.Unit, Footprint: mc.Footprint, } - ml = metricLookup[mc.Name] } + availability := schema.ClusterSupport{Cluster: cluster.Name} + userAvailability := schema.ClusterSupport{Cluster: cluster.Name} scLookup := make(map[string]*schema.SubClusterConfig) for _, scc := range mc.SubClusters { @@ -89,40 +96,55 @@ func initClusterConfig() error { newMetric.Footprint = mc.Footprint } + isRestricted := mc.Restrict if cfg, ok := scLookup[sc.Name]; ok { - if !cfg.Remove { - availability.SubClusters = append(availability.SubClusters, sc.Name) - newMetric.Peak = cfg.Peak - newMetric.Normal = cfg.Normal - newMetric.Caution = cfg.Caution - newMetric.Alert = cfg.Alert - newMetric.Footprint = cfg.Footprint - newMetric.Energy = cfg.Energy - newMetric.LowerIsBetter = cfg.LowerIsBetter - sc.MetricConfig = append(sc.MetricConfig, *newMetric) - - if newMetric.Footprint != "" { - sc.Footprint = append(sc.Footprint, newMetric.Name) - ml.Footprint = newMetric.Footprint - } - if newMetric.Energy != "" { - 
sc.EnergyFootprint = append(sc.EnergyFootprint, newMetric.Name) - } + if cfg.Remove { + continue } - } else { - availability.SubClusters = append(availability.SubClusters, sc.Name) - sc.MetricConfig = append(sc.MetricConfig, *newMetric) + newMetric.Peak = cfg.Peak + newMetric.Normal = cfg.Normal + newMetric.Caution = cfg.Caution + newMetric.Alert = cfg.Alert + newMetric.Footprint = cfg.Footprint + newMetric.Energy = cfg.Energy + newMetric.LowerIsBetter = cfg.LowerIsBetter + isRestricted = cfg.Restrict + } - if newMetric.Footprint != "" { - sc.Footprint = append(sc.Footprint, newMetric.Name) - } - if newMetric.Energy != "" { - sc.EnergyFootprint = append(sc.EnergyFootprint, newMetric.Name) + availability.SubClusters = append(availability.SubClusters, sc.Name) + if !isRestricted { + userAvailability.SubClusters = append(userAvailability.SubClusters, sc.Name) + } + sc.MetricConfig = append(sc.MetricConfig, newMetric) + + if newMetric.Footprint != "" { + sc.Footprint = append(sc.Footprint, newMetric.Name) + item := metricLookup[mc.Name] + item.Footprint = newMetric.Footprint + metricLookup[mc.Name] = item + } + if newMetric.Energy != "" { + sc.EnergyFootprint = append(sc.EnergyFootprint, newMetric.Name) + } + + // Init Topology Lookup Maps Once Per Subcluster + sc.Topology.InitTopologyMaps() + } + + item := metricLookup[mc.Name] + item.Availability = append(item.Availability, availability) + metricLookup[mc.Name] = item + + if len(userAvailability.SubClusters) > 0 { + userItem, ok := userMetricLookup[mc.Name] + if !ok { + userItem = schema.GlobalMetricListItem{ + Name: mc.Name, Scope: mc.Scope, Unit: mc.Unit, Footprint: mc.Footprint, } } + userItem.Availability = append(userItem.Availability, userAvailability) + userMetricLookup[mc.Name] = userItem } - ml.Availability = append(metricLookup[mc.Name].Availability, availability) - metricLookup[mc.Name] = ml } Clusters = append(Clusters, cluster) @@ -141,8 +163,11 @@ func initClusterConfig() error { } } - for _, ml := 
range metricLookup { - GlobalMetricList = append(GlobalMetricList, &ml) + for _, metric := range metricLookup { + GlobalMetricList = append(GlobalMetricList, &metric) + } + for _, metric := range userMetricLookup { + GlobalUserMetricList = append(GlobalUserMetricList, &metric) } return nil @@ -278,7 +303,7 @@ func GetSubClusterByNode(cluster, hostname string) (string, error) { return "", fmt.Errorf("ARCHIVE/CLUSTERCONFIG > no subcluster found for cluster %v and host %v", cluster, hostname) } -func MetricIndex(mc []schema.MetricConfig, name string) (int, error) { +func MetricIndex(mc []*schema.MetricConfig, name string) (int, error) { for i, m := range mc { if m.Name == name { return i, nil diff --git a/pkg/archive/clusterConfig_test.go b/pkg/archive/clusterConfig_test.go index 36130175..510c1747 100644 --- a/pkg/archive/clusterConfig_test.go +++ b/pkg/archive/clusterConfig_test.go @@ -12,7 +12,7 @@ import ( ) func TestClusterConfig(t *testing.T) { - if err := archive.Init(json.RawMessage("{\"kind\": \"file\",\"path\": \"testdata/archive\"}"), false); err != nil { + if err := archive.Init(json.RawMessage("{\"kind\": \"file\",\"path\": \"testdata/archive\"}")); err != nil { t.Fatal(err) } diff --git a/pkg/archive/fsBackend.go b/pkg/archive/fsBackend.go index 1e9d7db3..dfc870b4 100644 --- a/pkg/archive/fsBackend.go +++ b/pkg/archive/fsBackend.go @@ -16,15 +16,17 @@ import ( "os" "path" "path/filepath" + "slices" "strconv" "strings" + "sync" "text/tabwriter" "time" "github.com/ClusterCockpit/cc-backend/internal/config" - cclog "github.com/ClusterCockpit/cc-lib/ccLogger" - "github.com/ClusterCockpit/cc-lib/schema" - "github.com/ClusterCockpit/cc-lib/util" + cclog "github.com/ClusterCockpit/cc-lib/v2/ccLogger" + "github.com/ClusterCockpit/cc-lib/v2/schema" + "github.com/ClusterCockpit/cc-lib/v2/util" "github.com/santhosh-tekuri/jsonschema/v5" ) @@ -187,7 +189,7 @@ func (fsa *FsArchive) Init(rawConfig json.RawMessage) (uint64, error) { if isEmpty { cclog.Infof("fsBackend 
Init() > Bootstrapping new archive at %s", fsa.path) versionStr := fmt.Sprintf("%d\n", Version) - if err := os.WriteFile(filepath.Join(fsa.path, "version.txt"), []byte(versionStr), 0644); err != nil { + if err := os.WriteFile(filepath.Join(fsa.path, "version.txt"), []byte(versionStr), 0o644); err != nil { cclog.Errorf("fsBackend Init() > failed to create version.txt: %v", err) return 0, err } @@ -490,7 +492,44 @@ func (fsa *FsArchive) LoadClusterCfg(name string) (*schema.Cluster, error) { func (fsa *FsArchive) Iter(loadMetricData bool) <-chan JobContainer { ch := make(chan JobContainer) + go func() { + defer close(ch) + + numWorkers := 4 + jobPaths := make(chan string, numWorkers*2) + var wg sync.WaitGroup + + for range numWorkers { + wg.Go(func() { + for jobPath := range jobPaths { + job, err := loadJobMeta(filepath.Join(jobPath, "meta.json")) + if err != nil && !errors.Is(err, &jsonschema.ValidationError{}) { + cclog.Errorf("in %s: %s", jobPath, err.Error()) + continue + } + + if loadMetricData { + isCompressed := true + filename := filepath.Join(jobPath, "data.json.gz") + + if !util.CheckFileExists(filename) { + filename = filepath.Join(jobPath, "data.json") + isCompressed = false + } + + data, err := loadJobData(filename, isCompressed) + if err != nil && !errors.Is(err, &jsonschema.ValidationError{}) { + cclog.Errorf("in %s: %s", jobPath, err.Error()) + } + ch <- JobContainer{Meta: job, Data: &data} + } else { + ch <- JobContainer{Meta: job, Data: nil} + } + } + }) + } + clustersDir, err := os.ReadDir(fsa.path) if err != nil { cclog.Fatalf("Reading clusters failed @ cluster dirs: %s", err.Error()) @@ -507,7 +546,6 @@ func (fsa *FsArchive) Iter(loadMetricData bool) <-chan JobContainer { for _, lvl1Dir := range lvl1Dirs { if !lvl1Dir.IsDir() { - // Could be the cluster.json file continue } @@ -525,35 +563,17 @@ func (fsa *FsArchive) Iter(loadMetricData bool) <-chan JobContainer { for _, startTimeDir := range startTimeDirs { if startTimeDir.IsDir() { - job, err := 
loadJobMeta(filepath.Join(dirpath, startTimeDir.Name(), "meta.json")) - if err != nil && !errors.Is(err, &jsonschema.ValidationError{}) { - cclog.Errorf("in %s: %s", filepath.Join(dirpath, startTimeDir.Name()), err.Error()) - } - - if loadMetricData { - isCompressed := true - filename := filepath.Join(dirpath, startTimeDir.Name(), "data.json.gz") - - if !util.CheckFileExists(filename) { - filename = filepath.Join(dirpath, startTimeDir.Name(), "data.json") - isCompressed = false - } - - data, err := loadJobData(filename, isCompressed) - if err != nil && !errors.Is(err, &jsonschema.ValidationError{}) { - cclog.Errorf("in %s: %s", filepath.Join(dirpath, startTimeDir.Name()), err.Error()) - } - ch <- JobContainer{Meta: job, Data: &data} - } else { - ch <- JobContainer{Meta: job, Data: nil} - } + jobPaths <- filepath.Join(dirpath, startTimeDir.Name()) } } } } } - close(ch) + + close(jobPaths) + wg.Wait() }() + return ch } @@ -603,24 +623,57 @@ func (fsa *FsArchive) ImportJob( return err } - f, err = os.Create(path.Join(dir, "data.json")) - if err != nil { - cclog.Error("Error while creating filepath for data.json") + var dataBuf bytes.Buffer + if err := EncodeJobData(&dataBuf, jobData); err != nil { + cclog.Error("Error while encoding job metricdata") return err } - if err := EncodeJobData(f, jobData); err != nil { - cclog.Error("Error while encoding job metricdata to data.json file") - return err + + if dataBuf.Len() > 2000 { + f, err = os.Create(path.Join(dir, "data.json.gz")) + if err != nil { + cclog.Error("Error while creating filepath for data.json.gz") + return err + } + gzipWriter := gzip.NewWriter(f) + if _, err := gzipWriter.Write(dataBuf.Bytes()); err != nil { + cclog.Error("Error while writing compressed job data") + gzipWriter.Close() + f.Close() + return err + } + if err := gzipWriter.Close(); err != nil { + cclog.Warn("Error while closing gzip writer") + f.Close() + return err + } + if err := f.Close(); err != nil { + cclog.Warn("Error while closing 
data.json.gz file") + return err + } + } else { + f, err = os.Create(path.Join(dir, "data.json")) + if err != nil { + cclog.Error("Error while creating filepath for data.json") + return err + } + if _, err := f.Write(dataBuf.Bytes()); err != nil { + cclog.Error("Error while writing job metricdata to data.json file") + f.Close() + return err + } + if err := f.Close(); err != nil { + cclog.Warn("Error while closing data.json file") + return err + } } - if err := f.Close(); err != nil { - cclog.Warn("Error while closing data.json file") - } - return err + + return nil } func (fsa *FsArchive) StoreClusterCfg(name string, config *schema.Cluster) error { dir := filepath.Join(fsa.path, name) - if err := os.MkdirAll(dir, 0777); err != nil { + if err := os.MkdirAll(dir, 0o777); err != nil { cclog.Errorf("StoreClusterCfg() > mkdir error: %v", err) return err } @@ -638,13 +691,7 @@ func (fsa *FsArchive) StoreClusterCfg(name string, config *schema.Cluster) error } // Update clusters list if new - found := false - for _, c := range fsa.clusters { - if c == name { - found = true - break - } - } + found := slices.Contains(fsa.clusters, name) if !found { fsa.clusters = append(fsa.clusters, name) } diff --git a/pkg/archive/fsBackend_test.go b/pkg/archive/fsBackend_test.go index a43a6c3a..05491f61 100644 --- a/pkg/archive/fsBackend_test.go +++ b/pkg/archive/fsBackend_test.go @@ -10,8 +10,8 @@ import ( "path/filepath" "testing" - "github.com/ClusterCockpit/cc-lib/schema" - "github.com/ClusterCockpit/cc-lib/util" + "github.com/ClusterCockpit/cc-lib/v2/schema" + "github.com/ClusterCockpit/cc-lib/v2/util" ) func TestInitEmptyPath(t *testing.T) { diff --git a/pkg/archive/json.go b/pkg/archive/json.go index 75c39531..dd37075d 100644 --- a/pkg/archive/json.go +++ b/pkg/archive/json.go @@ -10,8 +10,8 @@ import ( "io" "time" - cclog "github.com/ClusterCockpit/cc-lib/ccLogger" - "github.com/ClusterCockpit/cc-lib/schema" + cclog "github.com/ClusterCockpit/cc-lib/v2/ccLogger" + 
"github.com/ClusterCockpit/cc-lib/v2/schema" ) func DecodeJobData(r io.Reader, k string) (schema.JobData, error) { @@ -51,7 +51,7 @@ func DecodeJobStats(r io.Reader, k string) (schema.ScopedJobStats, error) { for _, series := range jobMetric.Series { scopedJobStats[metric][scope] = append(scopedJobStats[metric][scope], &schema.ScopedStats{ Hostname: series.Hostname, - Id: series.Id, + ID: series.ID, Data: &series.Statistics, }) } diff --git a/pkg/archive/nodelist.go b/pkg/archive/nodelist.go index ffb5f563..42d8492a 100644 --- a/pkg/archive/nodelist.go +++ b/pkg/archive/nodelist.go @@ -3,6 +3,70 @@ // Use of this source code is governed by a MIT-style // license that can be found in the LICENSE file. +// Package archive provides nodelist parsing functionality for HPC cluster node specifications. +// +// # Overview +// +// The nodelist package implements parsing and querying of compact node list representations +// commonly used in HPC job schedulers and cluster management systems. It converts compressed +// node specifications (e.g., "node[01-10]") into queryable structures that can efficiently +// test node membership and expand to full node lists. +// +// # Node List Format +// +// Node lists use a compact syntax with the following rules: +// +// 1. Comma-separated terms represent alternative node patterns (OR logic) +// 2. Each term consists of a string prefix followed by optional numeric ranges +// 3. Numeric ranges are specified in square brackets with zero-padded start-end format +// 4. Multiple ranges within brackets are comma-separated +// 5. 
Range digits must be zero-padded and of equal length (e.g., "01-99" not "1-99") +// +// # Examples +// +// "node01" // Single node +// "node01,node02" // Multiple individual nodes +// "node[01-10]" // Range: node01 through node10 (zero-padded) +// "node[01-10,20-30]" // Multiple ranges: node01-10 and node20-30 +// "cn-00[10-20],cn-00[50-60]" // Different prefixes with ranges +// "login,compute[001-100]" // Mixed individual and range terms +// +// # Usage +// +// Parse a node list specification: +// +// nl, err := ParseNodeList("node[01-10],login") +// if err != nil { +// log.Fatal(err) +// } +// +// Check if a node name matches the list: +// +// if nl.Contains("node05") { +// // node05 is in the list +// } +// +// Expand to full list of node names: +// +// nodes := nl.PrintList() // ["node01", "node02", ..., "node10", "login"] +// +// Count total nodes in the list: +// +// count := nl.NodeCount() // 11 (10 from range + 1 individual) +// +// # Integration +// +// This package is used by: +// - clusterConfig.go: Parses SubCluster.Nodes field from cluster configuration +// - schema.resolvers.go: GraphQL resolver for computing numberOfNodes in subclusters +// - Job archive: Validates node assignments against configured cluster topology +// +// # Constraints +// +// - Only zero-padded numeric ranges are supported +// - Range start and end must have identical digit counts +// - No whitespace allowed in node list specifications +// - Ranges must be specified as start-end (not individual numbers) package archive import ( @@ -10,15 +74,39 @@ import ( "strconv" "strings" - cclog "github.com/ClusterCockpit/cc-lib/ccLogger" + cclog "github.com/ClusterCockpit/cc-lib/v2/ccLogger" ) +// NodeList represents a parsed node list specification as a collection of node pattern terms. +// Each term is a sequence of expressions that must match consecutively for a node name to match. +// Terms are evaluated with OR logic - a node matches if ANY term matches completely. 
+// +// Internal structure: +// - Outer slice: OR terms (comma-separated in input) +// - Inner slice: AND expressions (must all match sequentially) +// - Each expression implements: consume (pattern matching), limits (range info), prefix (string part) +// +// Example: "node[01-10],login" becomes: +// - Term 1: [NLExprString("node"), NLExprIntRanges(01-10)] +// - Term 2: [NLExprString("login")] type NodeList [][]interface { consume(input string) (next string, ok bool) limits() []map[string]int prefix() string } +// Contains tests whether the given node name matches any pattern in the NodeList. +// Returns true if the name matches at least one term completely, false otherwise. +// +// Matching logic: +// - Evaluates each term sequentially (OR logic across terms) +// - Within a term, all expressions must match in order (AND logic) +// - A match is complete only if the entire input is consumed (str == "") +// +// Examples: +// - NodeList("node[01-10]").Contains("node05") → true +// - NodeList("node[01-10]").Contains("node11") → false +// - NodeList("node[01-10]").Contains("node5") → false (missing zero-padding) func (nl *NodeList) Contains(name string) bool { var ok bool for _, term := range *nl { @@ -38,14 +126,22 @@ func (nl *NodeList) Contains(name string) bool { return false } +// PrintList expands the NodeList into a full slice of individual node names. +// This performs the inverse operation of ParseNodeList, expanding all ranges +// into their constituent node names with proper zero-padding. +// +// Returns a slice of node names in the order they appear in the NodeList. +// For range terms, nodes are expanded in ascending numeric order. 
+// +// Example: +// - ParseNodeList("node[01-03],login").PrintList() → ["node01", "node02", "node03", "login"] func (nl *NodeList) PrintList() []string { var out []string for _, term := range *nl { - // Get String-Part first prefix := term[0].prefix() - if len(term) == 1 { // If only String-Part in Term: Single Node Name -> Use as provided + if len(term) == 1 { out = append(out, prefix) - } else { // Else: Numeric start-end definition with x digits zeroPadded + } else { limitArr := term[1].limits() for _, inner := range limitArr { for i := inner["start"]; i < inner["end"]+1; i++ { @@ -61,12 +157,22 @@ func (nl *NodeList) PrintList() []string { return out } +// NodeCount returns the total number of individual nodes represented by the NodeList. +// This efficiently counts nodes without expanding the full list, making it suitable +// for large node ranges. +// +// Calculation: +// - Individual node terms contribute 1 +// - Range terms contribute (end - start + 1) for each range +// +// Example: +// - ParseNodeList("node[01-10],login").NodeCount() → 11 (10 from range + 1 individual) func (nl *NodeList) NodeCount() int { out := 0 for _, term := range *nl { - if len(term) == 1 { // If only String-Part in Term: Single Node Name -> add one + if len(term) == 1 { out += 1 - } else { // Else: Numeric start-end definition -> add difference + 1 + } else { limitArr := term[1].limits() for _, inner := range limitArr { out += (inner["end"] - inner["start"]) + 1 @@ -76,6 +182,8 @@ func (nl *NodeList) NodeCount() int { return out } +// NLExprString represents a literal string prefix in a node name pattern. +// It matches by checking if the input starts with this exact string. type NLExprString string func (nle NLExprString) consume(input string) (next string, ok bool) { @@ -96,6 +204,8 @@ func (nle NLExprString) prefix() string { return string(nle) } +// NLExprIntRanges represents multiple alternative integer ranges (comma-separated within brackets). 
+// A node name matches if it matches ANY of the contained ranges (OR logic). type NLExprIntRanges []NLExprIntRange func (nles NLExprIntRanges) consume(input string) (next string, ok bool) { @@ -122,6 +232,11 @@ func (nles NLExprIntRanges) prefix() string { return s } +// NLExprIntRange represents a single zero-padded integer range (e.g., "01-99"). +// Fields: +// - start, end: Numeric range boundaries (inclusive) +// - zeroPadded: Must be true (non-padded ranges not supported) +// - digits: Required digit count for zero-padding type NLExprIntRange struct { start, end int64 zeroPadded bool @@ -176,6 +291,28 @@ func (nles NLExprIntRange) prefix() string { return s } +// ParseNodeList parses a compact node list specification into a queryable NodeList structure. +// +// Input format rules: +// - Comma-separated terms (OR logic): "node01,node02" matches either node +// - Range syntax: "node[01-10]" expands to node01 through node10 +// - Multiple ranges: "node[01-05,10-15]" creates two ranges +// - Zero-padding required: digits in ranges must be zero-padded and equal length +// - Mixed formats: "login,compute[001-100]" combines individual and range terms +// +// Validation: +// - Returns error if brackets are unclosed +// - Returns error if ranges lack '-' separator +// - Returns error if range digits have unequal length +// - Returns error if range numbers fail to parse +// - Returns error on invalid characters +// +// Examples: +// - "node[01-10]" → NodeList with one term (10 nodes) +// - "node01,node02" → NodeList with two terms (2 nodes) +// - "cn[01-05,10-15]" → NodeList with ranges 01-05 and 10-15 (11 nodes total) +// - "a[1-9]" → Error (not zero-padded) +// - "a[01-9]" → Error (unequal digit counts) func ParseNodeList(raw string) (NodeList, error) { isLetter := func(r byte) bool { return ('a' <= r && r <= 'z') || ('A' <= r && r <= 'Z') } isDigit := func(r byte) bool { return '0' <= r && r <= '9' } @@ -232,12 +369,12 @@ func ParseNodeList(raw string) (NodeList, 
error) { nles := NLExprIntRanges{} for _, part := range parts { - minus := strings.Index(part, "-") - if minus == -1 { + before, after, ok := strings.Cut(part, "-") + if !ok { return nil, fmt.Errorf("ARCHIVE/NODELIST > no '-' found inside '[...]'") } - s1, s2 := part[0:minus], part[minus+1:] + s1, s2 := before, after if len(s1) != len(s2) || len(s1) == 0 { return nil, fmt.Errorf("ARCHIVE/NODELIST > %v and %v are not of equal length or of length zero", s1, s2) } diff --git a/pkg/archive/parquet/convert.go b/pkg/archive/parquet/convert.go new file mode 100644 index 00000000..43e611e4 --- /dev/null +++ b/pkg/archive/parquet/convert.go @@ -0,0 +1,200 @@ +// Copyright (C) NHR@FAU, University Erlangen-Nuremberg. +// All rights reserved. This file is part of cc-backend. +// Use of this source code is governed by a MIT-style +// license that can be found in the LICENSE file. + +package parquet + +import ( + "bytes" + "compress/gzip" + "encoding/json" + "fmt" + + "github.com/ClusterCockpit/cc-lib/v2/schema" +) + +// JobToParquetRow converts job metadata and metric data into a flat ParquetJobRow. +// Nested fields are marshaled to JSON; metric data is gzip-compressed JSON. 
+func JobToParquetRow(meta *schema.Job, data *schema.JobData) (*ParquetJobRow, error) { + resourcesJSON, err := json.Marshal(meta.Resources) + if err != nil { + return nil, fmt.Errorf("marshal resources: %w", err) + } + + var statisticsJSON []byte + if meta.Statistics != nil { + statisticsJSON, err = json.Marshal(meta.Statistics) + if err != nil { + return nil, fmt.Errorf("marshal statistics: %w", err) + } + } + + var tagsJSON []byte + if len(meta.Tags) > 0 { + tagsJSON, err = json.Marshal(meta.Tags) + if err != nil { + return nil, fmt.Errorf("marshal tags: %w", err) + } + } + + var metaDataJSON []byte + if meta.MetaData != nil { + metaDataJSON, err = json.Marshal(meta.MetaData) + if err != nil { + return nil, fmt.Errorf("marshal metadata: %w", err) + } + } + + var footprintJSON []byte + if meta.Footprint != nil { + footprintJSON, err = json.Marshal(meta.Footprint) + if err != nil { + return nil, fmt.Errorf("marshal footprint: %w", err) + } + } + + var energyFootJSON []byte + if meta.EnergyFootprint != nil { + energyFootJSON, err = json.Marshal(meta.EnergyFootprint) + if err != nil { + return nil, fmt.Errorf("marshal energy footprint: %w", err) + } + } + + metricDataGz, err := compressJobData(data) + if err != nil { + return nil, fmt.Errorf("compress metric data: %w", err) + } + + return &ParquetJobRow{ + JobID: meta.JobID, + Cluster: meta.Cluster, + SubCluster: meta.SubCluster, + Partition: meta.Partition, + Project: meta.Project, + User: meta.User, + State: string(meta.State), + StartTime: meta.StartTime, + Duration: meta.Duration, + Walltime: meta.Walltime, + NumNodes: meta.NumNodes, + NumHWThreads: meta.NumHWThreads, + NumAcc: meta.NumAcc, + Energy: meta.Energy, + SMT: meta.SMT, + ResourcesJSON: resourcesJSON, + StatisticsJSON: statisticsJSON, + TagsJSON: tagsJSON, + MetaDataJSON: metaDataJSON, + FootprintJSON: footprintJSON, + EnergyFootJSON: energyFootJSON, + MetricDataGz: metricDataGz, + }, nil +} + +// ParquetRowToJob converts a ParquetJobRow back into job 
metadata and metric data. +// This is the reverse of JobToParquetRow. +func ParquetRowToJob(row *ParquetJobRow) (*schema.Job, *schema.JobData, error) { + meta := &schema.Job{ + JobID: row.JobID, + Cluster: row.Cluster, + SubCluster: row.SubCluster, + Partition: row.Partition, + Project: row.Project, + User: row.User, + State: schema.JobState(row.State), + StartTime: row.StartTime, + Duration: row.Duration, + Walltime: row.Walltime, + NumNodes: row.NumNodes, + NumHWThreads: row.NumHWThreads, + NumAcc: row.NumAcc, + Energy: row.Energy, + SMT: row.SMT, + } + + if len(row.ResourcesJSON) > 0 { + if err := json.Unmarshal(row.ResourcesJSON, &meta.Resources); err != nil { + return nil, nil, fmt.Errorf("unmarshal resources: %w", err) + } + } + + if len(row.StatisticsJSON) > 0 { + if err := json.Unmarshal(row.StatisticsJSON, &meta.Statistics); err != nil { + return nil, nil, fmt.Errorf("unmarshal statistics: %w", err) + } + } + + if len(row.TagsJSON) > 0 { + if err := json.Unmarshal(row.TagsJSON, &meta.Tags); err != nil { + return nil, nil, fmt.Errorf("unmarshal tags: %w", err) + } + } + + if len(row.MetaDataJSON) > 0 { + if err := json.Unmarshal(row.MetaDataJSON, &meta.MetaData); err != nil { + return nil, nil, fmt.Errorf("unmarshal metadata: %w", err) + } + } + + if len(row.FootprintJSON) > 0 { + if err := json.Unmarshal(row.FootprintJSON, &meta.Footprint); err != nil { + return nil, nil, fmt.Errorf("unmarshal footprint: %w", err) + } + } + + if len(row.EnergyFootJSON) > 0 { + if err := json.Unmarshal(row.EnergyFootJSON, &meta.EnergyFootprint); err != nil { + return nil, nil, fmt.Errorf("unmarshal energy footprint: %w", err) + } + } + + data, err := decompressJobData(row.MetricDataGz) + if err != nil { + return nil, nil, fmt.Errorf("decompress metric data: %w", err) + } + + return meta, data, nil +} + +func decompressJobData(data []byte) (*schema.JobData, error) { + gz, err := gzip.NewReader(bytes.NewReader(data)) + if err != nil { + return nil, err + } + defer gz.Close() 
+ + var buf bytes.Buffer + if _, err := buf.ReadFrom(gz); err != nil { + return nil, err + } + + var jobData schema.JobData + if err := json.Unmarshal(buf.Bytes(), &jobData); err != nil { + return nil, err + } + + return &jobData, nil +} + +func compressJobData(data *schema.JobData) ([]byte, error) { + jsonBytes, err := json.Marshal(data) + if err != nil { + return nil, err + } + + var buf bytes.Buffer + gz, err := gzip.NewWriterLevel(&buf, gzip.BestCompression) + if err != nil { + return nil, err + } + if _, err := gz.Write(jsonBytes); err != nil { + return nil, err + } + if err := gz.Close(); err != nil { + return nil, err + } + + return buf.Bytes(), nil +} diff --git a/pkg/archive/parquet/convert_test.go b/pkg/archive/parquet/convert_test.go new file mode 100644 index 00000000..3b2848ba --- /dev/null +++ b/pkg/archive/parquet/convert_test.go @@ -0,0 +1,305 @@ +// Copyright (C) NHR@FAU, University Erlangen-Nuremberg. +// All rights reserved. This file is part of cc-backend. +// Use of this source code is governed by a MIT-style +// license that can be found in the LICENSE file. 
+ +package parquet + +import ( + "testing" + + "github.com/ClusterCockpit/cc-lib/v2/schema" +) + +func TestParquetRowToJob(t *testing.T) { + meta := &schema.Job{ + JobID: 42, + Cluster: "testcluster", + SubCluster: "sc0", + Partition: "main", + Project: "testproject", + User: "testuser", + State: schema.JobStateCompleted, + StartTime: 1700000000, + Duration: 3600, + Walltime: 7200, + NumNodes: 2, + NumHWThreads: 16, + NumAcc: 4, + Energy: 123.45, + SMT: 2, + Resources: []*schema.Resource{ + {Hostname: "node001", HWThreads: []int{0, 1, 2, 3}}, + {Hostname: "node002", HWThreads: []int{4, 5, 6, 7}}, + }, + Statistics: map[string]schema.JobStatistics{ + "cpu_load": {Avg: 50.0, Min: 10.0, Max: 90.0}, + }, + Tags: []*schema.Tag{ + {Type: "test", Name: "tag1"}, + }, + MetaData: map[string]string{ + "key1": "value1", + }, + Footprint: map[string]float64{ + "cpu_load": 50.0, + }, + EnergyFootprint: map[string]float64{ + "total": 123.45, + }, + } + + data := &schema.JobData{ + "cpu_load": { + schema.MetricScopeNode: &schema.JobMetric{ + Unit: schema.Unit{Base: ""}, + Timestep: 60, + Series: []schema.Series{ + { + Hostname: "node001", + Data: []schema.Float{1.0, 2.0, 3.0}, + }, + }, + }, + }, + } + + // Convert to parquet row + row, err := JobToParquetRow(meta, data) + if err != nil { + t.Fatalf("JobToParquetRow: %v", err) + } + + // Convert back + gotMeta, gotData, err := ParquetRowToJob(row) + if err != nil { + t.Fatalf("ParquetRowToJob: %v", err) + } + + // Verify scalar fields + if gotMeta.JobID != meta.JobID { + t.Errorf("JobID = %d, want %d", gotMeta.JobID, meta.JobID) + } + if gotMeta.Cluster != meta.Cluster { + t.Errorf("Cluster = %q, want %q", gotMeta.Cluster, meta.Cluster) + } + if gotMeta.SubCluster != meta.SubCluster { + t.Errorf("SubCluster = %q, want %q", gotMeta.SubCluster, meta.SubCluster) + } + if gotMeta.Partition != meta.Partition { + t.Errorf("Partition = %q, want %q", gotMeta.Partition, meta.Partition) + } + if gotMeta.Project != meta.Project { + 
t.Errorf("Project = %q, want %q", gotMeta.Project, meta.Project) + } + if gotMeta.User != meta.User { + t.Errorf("User = %q, want %q", gotMeta.User, meta.User) + } + if gotMeta.State != meta.State { + t.Errorf("State = %q, want %q", gotMeta.State, meta.State) + } + if gotMeta.StartTime != meta.StartTime { + t.Errorf("StartTime = %d, want %d", gotMeta.StartTime, meta.StartTime) + } + if gotMeta.Duration != meta.Duration { + t.Errorf("Duration = %d, want %d", gotMeta.Duration, meta.Duration) + } + if gotMeta.Walltime != meta.Walltime { + t.Errorf("Walltime = %d, want %d", gotMeta.Walltime, meta.Walltime) + } + if gotMeta.NumNodes != meta.NumNodes { + t.Errorf("NumNodes = %d, want %d", gotMeta.NumNodes, meta.NumNodes) + } + if gotMeta.NumHWThreads != meta.NumHWThreads { + t.Errorf("NumHWThreads = %d, want %d", gotMeta.NumHWThreads, meta.NumHWThreads) + } + if gotMeta.NumAcc != meta.NumAcc { + t.Errorf("NumAcc = %d, want %d", gotMeta.NumAcc, meta.NumAcc) + } + if gotMeta.Energy != meta.Energy { + t.Errorf("Energy = %f, want %f", gotMeta.Energy, meta.Energy) + } + if gotMeta.SMT != meta.SMT { + t.Errorf("SMT = %d, want %d", gotMeta.SMT, meta.SMT) + } + + // Verify complex fields + if len(gotMeta.Resources) != 2 { + t.Fatalf("Resources len = %d, want 2", len(gotMeta.Resources)) + } + if gotMeta.Resources[0].Hostname != "node001" { + t.Errorf("Resources[0].Hostname = %q, want %q", gotMeta.Resources[0].Hostname, "node001") + } + if len(gotMeta.Resources[0].HWThreads) != 4 { + t.Errorf("Resources[0].HWThreads len = %d, want 4", len(gotMeta.Resources[0].HWThreads)) + } + + if len(gotMeta.Statistics) != 1 { + t.Fatalf("Statistics len = %d, want 1", len(gotMeta.Statistics)) + } + if stat, ok := gotMeta.Statistics["cpu_load"]; !ok { + t.Error("Statistics missing cpu_load") + } else if stat.Avg != 50.0 { + t.Errorf("Statistics[cpu_load].Avg = %f, want 50.0", stat.Avg) + } + + if len(gotMeta.Tags) != 1 || gotMeta.Tags[0].Name != "tag1" { + t.Errorf("Tags = %v, want [{test 
tag1}]", gotMeta.Tags) + } + + if gotMeta.MetaData["key1"] != "value1" { + t.Errorf("MetaData[key1] = %q, want %q", gotMeta.MetaData["key1"], "value1") + } + + if gotMeta.Footprint["cpu_load"] != 50.0 { + t.Errorf("Footprint[cpu_load] = %f, want 50.0", gotMeta.Footprint["cpu_load"]) + } + + if gotMeta.EnergyFootprint["total"] != 123.45 { + t.Errorf("EnergyFootprint[total] = %f, want 123.45", gotMeta.EnergyFootprint["total"]) + } + + // Verify metric data + if gotData == nil { + t.Fatal("JobData is nil") + } + cpuLoad, ok := (*gotData)["cpu_load"] + if !ok { + t.Fatal("JobData missing cpu_load") + } + nodeMetric, ok := cpuLoad[schema.MetricScopeNode] + if !ok { + t.Fatal("cpu_load missing node scope") + } + if nodeMetric.Timestep != 60 { + t.Errorf("Timestep = %d, want 60", nodeMetric.Timestep) + } + if len(nodeMetric.Series) != 1 { + t.Fatalf("Series len = %d, want 1", len(nodeMetric.Series)) + } + if nodeMetric.Series[0].Hostname != "node001" { + t.Errorf("Series[0].Hostname = %q, want %q", nodeMetric.Series[0].Hostname, "node001") + } + if len(nodeMetric.Series[0].Data) != 3 { + t.Errorf("Series[0].Data len = %d, want 3", len(nodeMetric.Series[0].Data)) + } +} + +func TestParquetRowToJobNilOptionalFields(t *testing.T) { + meta := &schema.Job{ + JobID: 1, + Cluster: "test", + SubCluster: "sc0", + Project: "proj", + User: "user", + State: schema.JobStateCompleted, + StartTime: 1700000000, + Duration: 60, + NumNodes: 1, + Resources: []*schema.Resource{ + {Hostname: "node001"}, + }, + } + + data := &schema.JobData{ + "cpu_load": { + schema.MetricScopeNode: &schema.JobMetric{ + Timestep: 60, + Series: []schema.Series{ + {Hostname: "node001", Data: []schema.Float{1.0}}, + }, + }, + }, + } + + row, err := JobToParquetRow(meta, data) + if err != nil { + t.Fatalf("JobToParquetRow: %v", err) + } + + gotMeta, gotData, err := ParquetRowToJob(row) + if err != nil { + t.Fatalf("ParquetRowToJob: %v", err) + } + + if gotMeta.JobID != 1 { + t.Errorf("JobID = %d, want 1", 
gotMeta.JobID) + } + if gotMeta.Tags != nil { + t.Errorf("Tags should be nil, got %v", gotMeta.Tags) + } + if gotMeta.Statistics != nil { + t.Errorf("Statistics should be nil, got %v", gotMeta.Statistics) + } + if gotMeta.MetaData != nil { + t.Errorf("MetaData should be nil, got %v", gotMeta.MetaData) + } + if gotMeta.Footprint != nil { + t.Errorf("Footprint should be nil, got %v", gotMeta.Footprint) + } + if gotMeta.EnergyFootprint != nil { + t.Errorf("EnergyFootprint should be nil, got %v", gotMeta.EnergyFootprint) + } + if gotData == nil { + t.Fatal("JobData is nil") + } +} + +func TestRoundTripThroughParquetFile(t *testing.T) { + meta, data := makeTestJob(999) + meta.Tags = []*schema.Tag{{Type: "test", Name: "roundtrip"}} + + // Convert to row and write to parquet + row, err := JobToParquetRow(meta, data) + if err != nil { + t.Fatalf("JobToParquetRow: %v", err) + } + + // Write to parquet bytes + parquetBytes, err := writeParquetBytes([]ParquetJobRow{*row}) + if err != nil { + t.Fatalf("writeParquetBytes: %v", err) + } + + // Read back from parquet bytes + rows, err := ReadParquetFile(parquetBytes) + if err != nil { + t.Fatalf("ReadParquetFile: %v", err) + } + if len(rows) != 1 { + t.Fatalf("expected 1 row, got %d", len(rows)) + } + + // Convert back to job + gotMeta, gotData, err := ParquetRowToJob(&rows[0]) + if err != nil { + t.Fatalf("ParquetRowToJob: %v", err) + } + + // Verify key fields survived the round trip + if gotMeta.JobID != 999 { + t.Errorf("JobID = %d, want 999", gotMeta.JobID) + } + if gotMeta.Cluster != "testcluster" { + t.Errorf("Cluster = %q, want %q", gotMeta.Cluster, "testcluster") + } + if gotMeta.User != "testuser" { + t.Errorf("User = %q, want %q", gotMeta.User, "testuser") + } + if gotMeta.State != schema.JobStateCompleted { + t.Errorf("State = %q, want %q", gotMeta.State, schema.JobStateCompleted) + } + if len(gotMeta.Tags) != 1 || gotMeta.Tags[0].Name != "roundtrip" { + t.Errorf("Tags = %v, want [{test roundtrip}]", gotMeta.Tags) + } 
+ if len(gotMeta.Resources) != 2 { + t.Errorf("Resources len = %d, want 2", len(gotMeta.Resources)) + } + + if gotData == nil { + t.Fatal("JobData is nil") + } + if _, ok := (*gotData)["cpu_load"]; !ok { + t.Error("JobData missing cpu_load") + } +} diff --git a/pkg/archive/parquet/nodestate_schema.go b/pkg/archive/parquet/nodestate_schema.go new file mode 100644 index 00000000..c9dfe363 --- /dev/null +++ b/pkg/archive/parquet/nodestate_schema.go @@ -0,0 +1,20 @@ +// Copyright (C) NHR@FAU, University Erlangen-Nuremberg. +// All rights reserved. This file is part of cc-backend. +// Use of this source code is governed by a MIT-style +// license that can be found in the LICENSE file. + +package parquet + +type ParquetNodeStateRow struct { + TimeStamp int64 `parquet:"time_stamp"` + NodeState string `parquet:"node_state"` + HealthState string `parquet:"health_state"` + HealthMetrics string `parquet:"health_metrics,optional"` + CpusAllocated int32 `parquet:"cpus_allocated"` + MemoryAllocated int64 `parquet:"memory_allocated"` + GpusAllocated int32 `parquet:"gpus_allocated"` + JobsRunning int32 `parquet:"jobs_running"` + Hostname string `parquet:"hostname"` + Cluster string `parquet:"cluster"` + SubCluster string `parquet:"subcluster"` +} diff --git a/pkg/archive/parquet/nodestate_writer.go b/pkg/archive/parquet/nodestate_writer.go new file mode 100644 index 00000000..074e02e4 --- /dev/null +++ b/pkg/archive/parquet/nodestate_writer.go @@ -0,0 +1,110 @@ +// Copyright (C) NHR@FAU, University Erlangen-Nuremberg. +// All rights reserved. This file is part of cc-backend. +// Use of this source code is governed by a MIT-style +// license that can be found in the LICENSE file. + +package parquet + +import ( + "bytes" + "fmt" + "time" + + cclog "github.com/ClusterCockpit/cc-lib/v2/ccLogger" + pq "github.com/parquet-go/parquet-go" +) + +// NodeStateParquetWriter batches ParquetNodeStateRows and flushes them to a target +// when the estimated size exceeds maxSizeBytes. 
+type NodeStateParquetWriter struct { + target ParquetTarget + maxSizeBytes int64 + rows []ParquetNodeStateRow + currentSize int64 + fileCounter int + datePrefix string +} + +// NewNodeStateParquetWriter creates a new writer for node state parquet files. +func NewNodeStateParquetWriter(target ParquetTarget, maxSizeMB int) *NodeStateParquetWriter { + return &NodeStateParquetWriter{ + target: target, + maxSizeBytes: int64(maxSizeMB) * 1024 * 1024, + datePrefix: time.Now().Format("2006-01-02"), + } +} + +// AddRow adds a row to the current batch. If the estimated batch size +// exceeds the configured maximum, the batch is flushed first. +func (pw *NodeStateParquetWriter) AddRow(row ParquetNodeStateRow) error { + rowSize := estimateNodeStateRowSize(&row) + + if pw.currentSize+rowSize > pw.maxSizeBytes && len(pw.rows) > 0 { + if err := pw.Flush(); err != nil { + return err + } + } + + pw.rows = append(pw.rows, row) + pw.currentSize += rowSize + return nil +} + +// Flush writes the current batch to a parquet file on the target. +func (pw *NodeStateParquetWriter) Flush() error { + if len(pw.rows) == 0 { + return nil + } + + pw.fileCounter++ + fileName := fmt.Sprintf("cc-nodestate-%s-%03d.parquet", pw.datePrefix, pw.fileCounter) + + data, err := writeNodeStateParquetBytes(pw.rows) + if err != nil { + return fmt.Errorf("write parquet buffer: %w", err) + } + + if err := pw.target.WriteFile(fileName, data); err != nil { + return fmt.Errorf("write parquet file %q: %w", fileName, err) + } + + cclog.Infof("NodeState retention: wrote %s (%d rows, %d bytes)", fileName, len(pw.rows), len(data)) + pw.rows = pw.rows[:0] + pw.currentSize = 0 + return nil +} + +// Close flushes any remaining rows and finalizes the writer. 
+func (pw *NodeStateParquetWriter) Close() error { + return pw.Flush() +} + +func writeNodeStateParquetBytes(rows []ParquetNodeStateRow) ([]byte, error) { + var buf bytes.Buffer + + writer := pq.NewGenericWriter[ParquetNodeStateRow](&buf, + pq.Compression(&pq.Zstd), + pq.SortingWriterConfig(pq.SortingColumns( + pq.Ascending("cluster"), + pq.Ascending("subcluster"), + pq.Ascending("hostname"), + pq.Ascending("time_stamp"), + )), + ) + + if _, err := writer.Write(rows); err != nil { + return nil, err + } + if err := writer.Close(); err != nil { + return nil, err + } + + return buf.Bytes(), nil +} + +func estimateNodeStateRowSize(row *ParquetNodeStateRow) int64 { + size := int64(100) // fixed numeric fields + size += int64(len(row.NodeState) + len(row.HealthState) + len(row.HealthMetrics)) + size += int64(len(row.Hostname) + len(row.Cluster) + len(row.SubCluster)) + return size +} diff --git a/pkg/archive/parquet/reader.go b/pkg/archive/parquet/reader.go new file mode 100644 index 00000000..32486bd5 --- /dev/null +++ b/pkg/archive/parquet/reader.go @@ -0,0 +1,216 @@ +// Copyright (C) NHR@FAU, University Erlangen-Nuremberg. +// All rights reserved. This file is part of cc-backend. +// Use of this source code is governed by a MIT-style +// license that can be found in the LICENSE file. + +package parquet + +import ( + "bytes" + "context" + "encoding/json" + "fmt" + "io" + "os" + "path/filepath" + "strings" + + "github.com/ClusterCockpit/cc-lib/v2/schema" + "github.com/aws/aws-sdk-go-v2/aws" + awsconfig "github.com/aws/aws-sdk-go-v2/config" + "github.com/aws/aws-sdk-go-v2/credentials" + "github.com/aws/aws-sdk-go-v2/service/s3" + pq "github.com/parquet-go/parquet-go" +) + +// ReadParquetFile reads all ParquetJobRow entries from parquet-encoded bytes. 
+func ReadParquetFile(data []byte) ([]ParquetJobRow, error) { + file, err := pq.OpenFile(bytes.NewReader(data), int64(len(data))) + if err != nil { + return nil, fmt.Errorf("open parquet: %w", err) + } + + reader := pq.NewGenericReader[ParquetJobRow](file) + defer reader.Close() + + numRows := file.NumRows() + rows := make([]ParquetJobRow, numRows) + n, err := reader.Read(rows) + if err != nil && err != io.EOF { + return nil, fmt.Errorf("read parquet rows: %w", err) + } + + return rows[:n], nil +} + +// ParquetSource abstracts reading parquet archives from different storage backends. +type ParquetSource interface { + GetClusters() ([]string, error) + ListParquetFiles(cluster string) ([]string, error) + ReadFile(path string) ([]byte, error) + ReadClusterConfig(cluster string) (*schema.Cluster, error) +} + +// FileParquetSource reads parquet archives from a local filesystem directory. +type FileParquetSource struct { + path string +} + +func NewFileParquetSource(path string) *FileParquetSource { + return &FileParquetSource{path: path} +} + +func (fs *FileParquetSource) GetClusters() ([]string, error) { + entries, err := os.ReadDir(fs.path) + if err != nil { + return nil, fmt.Errorf("read directory: %w", err) + } + + var clusters []string + for _, e := range entries { + if e.IsDir() { + clusters = append(clusters, e.Name()) + } + } + return clusters, nil +} + +func (fs *FileParquetSource) ListParquetFiles(cluster string) ([]string, error) { + dir := filepath.Join(fs.path, cluster) + entries, err := os.ReadDir(dir) + if err != nil { + return nil, fmt.Errorf("read cluster directory: %w", err) + } + + var files []string + for _, e := range entries { + if !e.IsDir() && strings.HasSuffix(e.Name(), ".parquet") { + files = append(files, filepath.Join(cluster, e.Name())) + } + } + return files, nil +} + +func (fs *FileParquetSource) ReadFile(path string) ([]byte, error) { + return os.ReadFile(filepath.Join(fs.path, path)) +} + +func (fs *FileParquetSource) 
ReadClusterConfig(cluster string) (*schema.Cluster, error) { + data, err := os.ReadFile(filepath.Join(fs.path, cluster, "cluster.json")) + if err != nil { + return nil, fmt.Errorf("read cluster.json: %w", err) + } + var cfg schema.Cluster + if err := json.Unmarshal(data, &cfg); err != nil { + return nil, fmt.Errorf("unmarshal cluster config: %w", err) + } + return &cfg, nil +} + +// S3ParquetSource reads parquet archives from an S3-compatible object store. +type S3ParquetSource struct { + client *s3.Client + bucket string +} + +func NewS3ParquetSource(cfg S3TargetConfig) (*S3ParquetSource, error) { + if cfg.Bucket == "" { + return nil, fmt.Errorf("S3 source: empty bucket name") + } + + region := cfg.Region + if region == "" { + region = "us-east-1" + } + + awsCfg, err := awsconfig.LoadDefaultConfig(context.Background(), + awsconfig.WithRegion(region), + awsconfig.WithCredentialsProvider( + credentials.NewStaticCredentialsProvider(cfg.AccessKey, cfg.SecretKey, ""), + ), + ) + if err != nil { + return nil, fmt.Errorf("S3 source: load AWS config: %w", err) + } + + opts := func(o *s3.Options) { + if cfg.Endpoint != "" { + o.BaseEndpoint = aws.String(cfg.Endpoint) + } + o.UsePathStyle = cfg.UsePathStyle + } + + client := s3.NewFromConfig(awsCfg, opts) + return &S3ParquetSource{client: client, bucket: cfg.Bucket}, nil +} + +func (ss *S3ParquetSource) GetClusters() ([]string, error) { + ctx := context.Background() + paginator := s3.NewListObjectsV2Paginator(ss.client, &s3.ListObjectsV2Input{ + Bucket: aws.String(ss.bucket), + Delimiter: aws.String("/"), + }) + + var clusters []string + for paginator.HasMorePages() { + page, err := paginator.NextPage(ctx) + if err != nil { + return nil, fmt.Errorf("S3 source: list clusters: %w", err) + } + for _, prefix := range page.CommonPrefixes { + if prefix.Prefix != nil { + name := strings.TrimSuffix(*prefix.Prefix, "/") + clusters = append(clusters, name) + } + } + } + return clusters, nil +} + +func (ss *S3ParquetSource) 
ListParquetFiles(cluster string) ([]string, error) { + ctx := context.Background() + prefix := cluster + "/" + paginator := s3.NewListObjectsV2Paginator(ss.client, &s3.ListObjectsV2Input{ + Bucket: aws.String(ss.bucket), + Prefix: aws.String(prefix), + }) + + var files []string + for paginator.HasMorePages() { + page, err := paginator.NextPage(ctx) + if err != nil { + return nil, fmt.Errorf("S3 source: list parquet files: %w", err) + } + for _, obj := range page.Contents { + if obj.Key != nil && strings.HasSuffix(*obj.Key, ".parquet") { + files = append(files, *obj.Key) + } + } + } + return files, nil +} + +func (ss *S3ParquetSource) ReadFile(path string) ([]byte, error) { + ctx := context.Background() + result, err := ss.client.GetObject(ctx, &s3.GetObjectInput{ + Bucket: aws.String(ss.bucket), + Key: aws.String(path), + }) + if err != nil { + return nil, fmt.Errorf("S3 source: get object %q: %w", path, err) + } + defer result.Body.Close() + return io.ReadAll(result.Body) +} + +func (ss *S3ParquetSource) ReadClusterConfig(cluster string) (*schema.Cluster, error) { + data, err := ss.ReadFile(cluster + "/cluster.json") + if err != nil { + return nil, fmt.Errorf("read cluster.json: %w", err) + } + var cfg schema.Cluster + if err := json.Unmarshal(data, &cfg); err != nil { + return nil, fmt.Errorf("unmarshal cluster config: %w", err) + } + return &cfg, nil +} diff --git a/pkg/archive/parquet/schema.go b/pkg/archive/parquet/schema.go new file mode 100644 index 00000000..74f82599 --- /dev/null +++ b/pkg/archive/parquet/schema.go @@ -0,0 +1,32 @@ +// Copyright (C) NHR@FAU, University Erlangen-Nuremberg. +// All rights reserved. This file is part of cc-backend. +// Use of this source code is governed by a MIT-style +// license that can be found in the LICENSE file. 
+ +package parquet + +type ParquetJobRow struct { + JobID int64 `parquet:"job_id"` + Cluster string `parquet:"cluster"` + SubCluster string `parquet:"sub_cluster"` + Partition string `parquet:"partition,optional"` + Project string `parquet:"project"` + User string `parquet:"user"` + State string `parquet:"job_state"` + StartTime int64 `parquet:"start_time"` + Duration int32 `parquet:"duration"` + Walltime int64 `parquet:"walltime"` + NumNodes int32 `parquet:"num_nodes"` + NumHWThreads int32 `parquet:"num_hwthreads"` + NumAcc int32 `parquet:"num_acc"` + Exclusive int32 `parquet:"exclusive"` + Energy float64 `parquet:"energy"` + SMT int32 `parquet:"smt"` + ResourcesJSON []byte `parquet:"resources_json"` + StatisticsJSON []byte `parquet:"statistics_json,optional"` + TagsJSON []byte `parquet:"tags_json,optional"` + MetaDataJSON []byte `parquet:"meta_data_json,optional"` + FootprintJSON []byte `parquet:"footprint_json,optional"` + EnergyFootJSON []byte `parquet:"energy_footprint_json,optional"` + MetricDataGz []byte `parquet:"metric_data_gz"` +} diff --git a/pkg/archive/parquet/target.go b/pkg/archive/parquet/target.go new file mode 100644 index 00000000..090a230d --- /dev/null +++ b/pkg/archive/parquet/target.go @@ -0,0 +1,104 @@ +// Copyright (C) NHR@FAU, University Erlangen-Nuremberg. +// All rights reserved. This file is part of cc-backend. +// Use of this source code is governed by a MIT-style +// license that can be found in the LICENSE file. + +package parquet + +import ( + "bytes" + "context" + "fmt" + "os" + "path/filepath" + + "github.com/aws/aws-sdk-go-v2/aws" + awsconfig "github.com/aws/aws-sdk-go-v2/config" + "github.com/aws/aws-sdk-go-v2/credentials" + "github.com/aws/aws-sdk-go-v2/service/s3" +) + +// ParquetTarget abstracts the destination for parquet file writes. +type ParquetTarget interface { + WriteFile(name string, data []byte) error +} + +// FileTarget writes parquet files to a local filesystem directory. 
+type FileTarget struct { + path string +} + +func NewFileTarget(path string) (*FileTarget, error) { + if err := os.MkdirAll(path, 0o750); err != nil { + return nil, fmt.Errorf("create target directory: %w", err) + } + return &FileTarget{path: path}, nil +} + +func (ft *FileTarget) WriteFile(name string, data []byte) error { + fullPath := filepath.Join(ft.path, name) + if err := os.MkdirAll(filepath.Dir(fullPath), 0o750); err != nil { + return fmt.Errorf("create parent directory: %w", err) + } + return os.WriteFile(fullPath, data, 0o640) +} + +// S3TargetConfig holds the configuration for an S3 parquet target. +type S3TargetConfig struct { + Endpoint string + Bucket string + AccessKey string + SecretKey string + Region string + UsePathStyle bool +} + +// S3Target writes parquet files to an S3-compatible object store. +type S3Target struct { + client *s3.Client + bucket string +} + +func NewS3Target(cfg S3TargetConfig) (*S3Target, error) { + if cfg.Bucket == "" { + return nil, fmt.Errorf("S3 target: empty bucket name") + } + + region := cfg.Region + if region == "" { + region = "us-east-1" + } + + awsCfg, err := awsconfig.LoadDefaultConfig(context.Background(), + awsconfig.WithRegion(region), + awsconfig.WithCredentialsProvider( + credentials.NewStaticCredentialsProvider(cfg.AccessKey, cfg.SecretKey, ""), + ), + ) + if err != nil { + return nil, fmt.Errorf("S3 target: load AWS config: %w", err) + } + + opts := func(o *s3.Options) { + if cfg.Endpoint != "" { + o.BaseEndpoint = aws.String(cfg.Endpoint) + } + o.UsePathStyle = cfg.UsePathStyle + } + + client := s3.NewFromConfig(awsCfg, opts) + return &S3Target{client: client, bucket: cfg.Bucket}, nil +} + +func (st *S3Target) WriteFile(name string, data []byte) error { + _, err := st.client.PutObject(context.Background(), &s3.PutObjectInput{ + Bucket: aws.String(st.bucket), + Key: aws.String(name), + Body: bytes.NewReader(data), + ContentType: aws.String("application/vnd.apache.parquet"), + }) + if err != nil { + return 
fmt.Errorf("S3 target: put object %q: %w", name, err) + } + return nil +} diff --git a/pkg/archive/parquet/writer.go b/pkg/archive/parquet/writer.go new file mode 100644 index 00000000..bfe4490f --- /dev/null +++ b/pkg/archive/parquet/writer.go @@ -0,0 +1,186 @@ +// Copyright (C) NHR@FAU, University Erlangen-Nuremberg. +// All rights reserved. This file is part of cc-backend. +// Use of this source code is governed by a MIT-style +// license that can be found in the LICENSE file. + +package parquet + +import ( + "bytes" + "encoding/json" + "fmt" + "path" + "time" + + cclog "github.com/ClusterCockpit/cc-lib/v2/ccLogger" + "github.com/ClusterCockpit/cc-lib/v2/schema" + pq "github.com/parquet-go/parquet-go" +) + +// ParquetWriter batches ParquetJobRows and flushes them to a target +// when the estimated size exceeds maxSizeBytes. +type ParquetWriter struct { + target ParquetTarget + maxSizeBytes int64 + rows []ParquetJobRow + currentSize int64 + fileCounter int + datePrefix string +} + +// NewParquetWriter creates a new writer that flushes batches to the given target. +// maxSizeMB sets the approximate maximum size per parquet file in megabytes. +func NewParquetWriter(target ParquetTarget, maxSizeMB int) *ParquetWriter { + return &ParquetWriter{ + target: target, + maxSizeBytes: int64(maxSizeMB) * 1024 * 1024, + datePrefix: time.Now().Format("2006-01-02"), + } +} + +// AddJob adds a row to the current batch. If the estimated batch size +// exceeds the configured maximum, the batch is flushed to the target first. +func (pw *ParquetWriter) AddJob(row ParquetJobRow) error { + rowSize := estimateRowSize(&row) + + if pw.currentSize+rowSize > pw.maxSizeBytes && len(pw.rows) > 0 { + if err := pw.Flush(); err != nil { + return err + } + } + + pw.rows = append(pw.rows, row) + pw.currentSize += rowSize + return nil +} + +// Flush writes the current batch to a parquet file on the target. 
+func (pw *ParquetWriter) Flush() error {
+	if len(pw.rows) == 0 {
+		return nil
+	}
+
+	// File names carry the writer's creation date plus a running counter,
+	// e.g. cc-archive-2025-01-20-001.parquet.
+	pw.fileCounter++
+	fileName := fmt.Sprintf("cc-archive-%s-%03d.parquet", pw.datePrefix, pw.fileCounter)
+
+	data, err := writeParquetBytes(pw.rows)
+	if err != nil {
+		return fmt.Errorf("write parquet buffer: %w", err)
+	}
+
+	if err := pw.target.WriteFile(fileName, data); err != nil {
+		return fmt.Errorf("write parquet file %q: %w", fileName, err)
+	}
+
+	cclog.Infof("Parquet retention: wrote %s (%d jobs, %d bytes)", fileName, len(pw.rows), len(data))
+	// Reset the batch; slicing to zero length keeps the backing array for reuse.
+	pw.rows = pw.rows[:0]
+	pw.currentSize = 0
+	return nil
+}
+
+// Close flushes any remaining rows and finalizes the writer.
+func (pw *ParquetWriter) Close() error {
+	return pw.Flush()
+}
+
+// writeParquetBytes serializes rows into an in-memory parquet file using
+// Zstd compression, sorted by sub_cluster, project and start_time.
+func writeParquetBytes(rows []ParquetJobRow) ([]byte, error) {
+	var buf bytes.Buffer
+
+	writer := pq.NewGenericWriter[ParquetJobRow](&buf,
+		pq.Compression(&pq.Zstd),
+		pq.SortingWriterConfig(pq.SortingColumns(
+			pq.Ascending("sub_cluster"),
+			pq.Ascending("project"),
+			pq.Ascending("start_time"),
+		)),
+	)
+
+	if _, err := writer.Write(rows); err != nil {
+		return nil, err
+	}
+	// Close finalizes the parquet footer; without it the buffer is not a valid file.
+	if err := writer.Close(); err != nil {
+		return nil, err
+	}
+
+	return buf.Bytes(), nil
+}
+
+// estimateRowSize returns a cheap size estimate for a row, used only for
+// batch-size accounting in AddJob; it need not be exact.
+func estimateRowSize(row *ParquetJobRow) int64 {
+	// Base estimate: ~200 bytes for the fixed-width numeric fields + strings estimate
+	size := int64(200)
+	size += int64(len(row.Cluster) + len(row.SubCluster) + len(row.Partition) +
+		len(row.Project) + len(row.User) + len(row.State))
+	size += int64(len(row.ResourcesJSON))
+	size += int64(len(row.StatisticsJSON))
+	size += int64(len(row.TagsJSON))
+	size += int64(len(row.MetaDataJSON))
+	size += int64(len(row.FootprintJSON))
+	size += int64(len(row.EnergyFootJSON))
+	size += int64(len(row.MetricDataGz))
+	return size
+}
+
+// prefixedTarget wraps a ParquetTarget and prepends a path prefix to all file names.
+type prefixedTarget struct {
+	inner  ParquetTarget
+	prefix string
+}
+
+// WriteFile forwards to the inner target with the prefix joined onto name.
+func (pt *prefixedTarget) WriteFile(name string, data []byte) error {
+	return pt.inner.WriteFile(path.Join(pt.prefix, name), data)
+}
+
+// ClusterAwareParquetWriter organizes Parquet output by cluster.
+// Each cluster gets its own subdirectory with a cluster.json config file.
+type ClusterAwareParquetWriter struct {
+	target      ParquetTarget
+	maxSizeMB   int
+	writers     map[string]*ParquetWriter // one lazily-created writer per cluster
+	clusterCfgs map[string]*schema.Cluster
+}
+
+// NewClusterAwareParquetWriter creates a writer that routes jobs to per-cluster ParquetWriters.
+func NewClusterAwareParquetWriter(target ParquetTarget, maxSizeMB int) *ClusterAwareParquetWriter {
+	return &ClusterAwareParquetWriter{
+		target:      target,
+		maxSizeMB:   maxSizeMB,
+		writers:     make(map[string]*ParquetWriter),
+		clusterCfgs: make(map[string]*schema.Cluster),
+	}
+}
+
+// SetClusterConfig stores a cluster configuration to be written as cluster.json on Close.
+func (cw *ClusterAwareParquetWriter) SetClusterConfig(name string, cfg *schema.Cluster) {
+	cw.clusterCfgs[name] = cfg
+}
+
+// AddJob routes the job row to the appropriate per-cluster writer,
+// creating the writer (and its prefixed target) on first use.
+func (cw *ClusterAwareParquetWriter) AddJob(row ParquetJobRow) error {
+	cluster := row.Cluster
+	pw, ok := cw.writers[cluster]
+	if !ok {
+		pw = NewParquetWriter(&prefixedTarget{inner: cw.target, prefix: cluster}, cw.maxSizeMB)
+		cw.writers[cluster] = pw
+	}
+	return pw.AddJob(row)
+}
+
+// Close writes cluster.json files and flushes all per-cluster writers.
+func (cw *ClusterAwareParquetWriter) Close() error {
+	// Cluster configs are written first, then each per-cluster writer is
+	// flushed; the first error aborts the remaining work.
+	for name, cfg := range cw.clusterCfgs {
+		data, err := json.MarshalIndent(cfg, "", " ")
+		if err != nil {
+			return fmt.Errorf("marshal cluster config %q: %w", name, err)
+		}
+		if err := cw.target.WriteFile(path.Join(name, "cluster.json"), data); err != nil {
+			return fmt.Errorf("write cluster.json for %q: %w", name, err)
+		}
+	}
+
+	for cluster, pw := range cw.writers {
+		if err := pw.Close(); err != nil {
+			return fmt.Errorf("close writer for cluster %q: %w", cluster, err)
+		}
+	}
+	return nil
+}
diff --git a/pkg/archive/parquet/writer_test.go b/pkg/archive/parquet/writer_test.go
new file mode 100644
index 00000000..9515edc3
--- /dev/null
+++ b/pkg/archive/parquet/writer_test.go
@@ -0,0 +1,361 @@
+// Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
+// All rights reserved. This file is part of cc-backend.
+// Use of this source code is governed by a MIT-style
+// license that can be found in the LICENSE file.
+
+package parquet
+
+import (
+	"bytes"
+	"compress/gzip"
+	"encoding/json"
+	"io"
+	"os"
+	"path/filepath"
+	"strings"
+	"sync"
+	"testing"
+
+	"github.com/ClusterCockpit/cc-lib/v2/schema"
+	pq "github.com/parquet-go/parquet-go"
+)
+
+// memTarget collects written files in memory for testing.
+type memTarget struct {
+	mu    sync.Mutex // guards files
+	files map[string][]byte
+}
+
+func newMemTarget() *memTarget {
+	return &memTarget{files: make(map[string][]byte)}
+}
+
+// WriteFile stores a defensive copy of data under name.
+func (m *memTarget) WriteFile(name string, data []byte) error {
+	m.mu.Lock()
+	defer m.mu.Unlock()
+	m.files[name] = append([]byte(nil), data...)
+	return nil
+}
+
+// makeTestJob builds a minimal job meta/data pair (two nodes, one
+// node-scoped cpu_load metric) for use across the tests below.
+func makeTestJob(jobID int64) (*schema.Job, *schema.JobData) {
+	meta := &schema.Job{
+		JobID:        jobID,
+		Cluster:      "testcluster",
+		SubCluster:   "sc0",
+		Project:      "testproject",
+		User:         "testuser",
+		State:        schema.JobStateCompleted,
+		StartTime:    1700000000,
+		Duration:     3600,
+		Walltime:     7200,
+		NumNodes:     2,
+		NumHWThreads: 16,
+		SMT:          1,
+		Resources: []*schema.Resource{
+			{Hostname: "node001"},
+			{Hostname: "node002"},
+		},
+	}
+
+	data := schema.JobData{
+		"cpu_load": {
+			schema.MetricScopeNode: &schema.JobMetric{
+				Unit:     schema.Unit{Base: ""},
+				Timestep: 60,
+				Series: []schema.Series{
+					{
+						Hostname: "node001",
+						Data:     []schema.Float{1.0, 2.0, 3.0},
+					},
+				},
+			},
+		},
+	}
+
+	return meta, &data
+}
+
+// TestJobToParquetRowConversion checks scalar columns and the JSON side
+// columns produced by JobToParquetRow.
+func TestJobToParquetRowConversion(t *testing.T) {
+	meta, data := makeTestJob(1001)
+	meta.Tags = []*schema.Tag{{Type: "test", Name: "tag1"}}
+	meta.MetaData = map[string]string{"key": "value"}
+
+	row, err := JobToParquetRow(meta, data)
+	if err != nil {
+		t.Fatalf("JobToParquetRow: %v", err)
+	}
+
+	if row.JobID != 1001 {
+		t.Errorf("JobID = %d, want 1001", row.JobID)
+	}
+	if row.Cluster != "testcluster" {
+		t.Errorf("Cluster = %q, want %q", row.Cluster, "testcluster")
+	}
+	if row.User != "testuser" {
+		t.Errorf("User = %q, want %q", row.User, "testuser")
+	}
+	if row.State != "completed" {
+		t.Errorf("State = %q, want %q", row.State, "completed")
+	}
+	if row.NumNodes != 2 {
+		t.Errorf("NumNodes = %d, want 2", row.NumNodes)
+	}
+
+	// Verify resources JSON round-trips
+	var resources []*schema.Resource
+	if err := json.Unmarshal(row.ResourcesJSON, &resources); err != nil {
+		t.Fatalf("unmarshal resources: %v", err)
+	}
+	if len(resources) != 2 {
+		t.Errorf("resources len = %d, want 2", len(resources))
+	}
+
+	// Verify tags JSON round-trips
+	var tags []*schema.Tag
+	if err := json.Unmarshal(row.TagsJSON, &tags); err != nil {
+		t.Fatalf("unmarshal tags: %v", err)
+	}
+	if len(tags) != 1 || tags[0].Name != "tag1" {
+		t.Errorf("tags = %v, want [{test tag1}]", tags)
+	}
+ + // Verify metric data is gzip-compressed valid JSON + gz, err := gzip.NewReader(bytes.NewReader(row.MetricDataGz)) + if err != nil { + t.Fatalf("gzip reader: %v", err) + } + decompressed, err := io.ReadAll(gz) + if err != nil { + t.Fatalf("gzip read: %v", err) + } + var jobData schema.JobData + if err := json.Unmarshal(decompressed, &jobData); err != nil { + t.Fatalf("unmarshal metric data: %v", err) + } + if _, ok := jobData["cpu_load"]; !ok { + t.Error("metric data missing cpu_load key") + } +} + +func TestParquetWriterSingleBatch(t *testing.T) { + target := newMemTarget() + pw := NewParquetWriter(target, 512) + + for i := range int64(5) { + meta, data := makeTestJob(i) + row, err := JobToParquetRow(meta, data) + if err != nil { + t.Fatalf("convert job %d: %v", i, err) + } + if err := pw.AddJob(*row); err != nil { + t.Fatalf("add job %d: %v", i, err) + } + } + + if err := pw.Close(); err != nil { + t.Fatalf("close: %v", err) + } + + if len(target.files) != 1 { + t.Fatalf("expected 1 file, got %d", len(target.files)) + } + + // Verify the parquet file is readable + for name, data := range target.files { + file := bytes.NewReader(data) + pf, err := pq.OpenFile(file, int64(len(data))) + if err != nil { + t.Fatalf("open parquet %s: %v", name, err) + } + if pf.NumRows() != 5 { + t.Errorf("parquet rows = %d, want 5", pf.NumRows()) + } + } +} + +func TestParquetWriterBatching(t *testing.T) { + target := newMemTarget() + // Use a very small max size to force multiple files + pw := NewParquetWriter(target, 0) // 0 MB means every job triggers a flush + pw.maxSizeBytes = 1 // Force flush after every row + + for i := range int64(3) { + meta, data := makeTestJob(i) + row, err := JobToParquetRow(meta, data) + if err != nil { + t.Fatalf("convert job %d: %v", i, err) + } + if err := pw.AddJob(*row); err != nil { + t.Fatalf("add job %d: %v", i, err) + } + } + + if err := pw.Close(); err != nil { + t.Fatalf("close: %v", err) + } + + // With maxSizeBytes=1, each AddJob should 
flush the previous batch, + // resulting in multiple files + if len(target.files) < 2 { + t.Errorf("expected multiple files due to batching, got %d", len(target.files)) + } + + // Verify all files are valid parquet + for name, data := range target.files { + file := bytes.NewReader(data) + _, err := pq.OpenFile(file, int64(len(data))) + if err != nil { + t.Errorf("invalid parquet file %s: %v", name, err) + } + } +} + +func TestFileTarget(t *testing.T) { + dir := t.TempDir() + ft, err := NewFileTarget(dir) + if err != nil { + t.Fatalf("NewFileTarget: %v", err) + } + + testData := []byte("test parquet data") + if err := ft.WriteFile("test.parquet", testData); err != nil { + t.Fatalf("WriteFile: %v", err) + } + + // Verify file exists and has correct content + // (using the target itself is sufficient; we just check no error) +} + +func TestFileTargetSubdirectories(t *testing.T) { + dir := t.TempDir() + ft, err := NewFileTarget(dir) + if err != nil { + t.Fatalf("NewFileTarget: %v", err) + } + + testData := []byte("test data in subdir") + if err := ft.WriteFile("fritz/cc-archive-2025-01-20-001.parquet", testData); err != nil { + t.Fatalf("WriteFile with subdir: %v", err) + } + + // Verify file was created in subdirectory + content, err := os.ReadFile(filepath.Join(dir, "fritz", "cc-archive-2025-01-20-001.parquet")) + if err != nil { + t.Fatalf("read file in subdir: %v", err) + } + if !bytes.Equal(content, testData) { + t.Error("file content mismatch") + } +} + +func makeTestJobForCluster(jobID int64, cluster string) (*schema.Job, *schema.JobData) { + meta, data := makeTestJob(jobID) + meta.Cluster = cluster + return meta, data +} + +func TestClusterAwareParquetWriter(t *testing.T) { + target := newMemTarget() + cw := NewClusterAwareParquetWriter(target, 512) + + // Set cluster configs + cw.SetClusterConfig("fritz", &schema.Cluster{Name: "fritz"}) + cw.SetClusterConfig("alex", &schema.Cluster{Name: "alex"}) + + // Add jobs from different clusters + for i := range 
int64(3) { + meta, data := makeTestJobForCluster(i, "fritz") + row, err := JobToParquetRow(meta, data) + if err != nil { + t.Fatalf("convert fritz job %d: %v", i, err) + } + if err := cw.AddJob(*row); err != nil { + t.Fatalf("add fritz job %d: %v", i, err) + } + } + + for i := int64(10); i < 12; i++ { + meta, data := makeTestJobForCluster(i, "alex") + row, err := JobToParquetRow(meta, data) + if err != nil { + t.Fatalf("convert alex job %d: %v", i, err) + } + if err := cw.AddJob(*row); err != nil { + t.Fatalf("add alex job %d: %v", i, err) + } + } + + if err := cw.Close(); err != nil { + t.Fatalf("close: %v", err) + } + + target.mu.Lock() + defer target.mu.Unlock() + + // Check cluster.json files were written + if _, ok := target.files["fritz/cluster.json"]; !ok { + t.Error("missing fritz/cluster.json") + } + if _, ok := target.files["alex/cluster.json"]; !ok { + t.Error("missing alex/cluster.json") + } + + // Verify cluster.json content + var clusterCfg schema.Cluster + if err := json.Unmarshal(target.files["fritz/cluster.json"], &clusterCfg); err != nil { + t.Fatalf("unmarshal fritz cluster.json: %v", err) + } + if clusterCfg.Name != "fritz" { + t.Errorf("fritz cluster name = %q, want %q", clusterCfg.Name, "fritz") + } + + // Check parquet files are in cluster subdirectories + fritzParquets := 0 + alexParquets := 0 + for name := range target.files { + if strings.HasPrefix(name, "fritz/") && strings.HasSuffix(name, ".parquet") { + fritzParquets++ + } + if strings.HasPrefix(name, "alex/") && strings.HasSuffix(name, ".parquet") { + alexParquets++ + } + } + if fritzParquets == 0 { + t.Error("no parquet files in fritz/") + } + if alexParquets == 0 { + t.Error("no parquet files in alex/") + } + + // Verify parquet files are readable and have correct row counts + for name, data := range target.files { + if !strings.HasSuffix(name, ".parquet") { + continue + } + file := bytes.NewReader(data) + pf, err := pq.OpenFile(file, int64(len(data))) + if err != nil { + 
t.Errorf("open parquet %s: %v", name, err) + continue + } + if strings.HasPrefix(name, "fritz/") && pf.NumRows() != 3 { + t.Errorf("fritz parquet rows = %d, want 3", pf.NumRows()) + } + if strings.HasPrefix(name, "alex/") && pf.NumRows() != 2 { + t.Errorf("alex parquet rows = %d, want 2", pf.NumRows()) + } + } +} + +func TestClusterAwareParquetWriterEmpty(t *testing.T) { + target := newMemTarget() + cw := NewClusterAwareParquetWriter(target, 512) + + if err := cw.Close(); err != nil { + t.Fatalf("close empty writer: %v", err) + } + + if len(target.files) != 0 { + t.Errorf("expected no files for empty writer, got %d", len(target.files)) + } +} diff --git a/pkg/archive/s3Backend.go b/pkg/archive/s3Backend.go index 5b3d9f02..7b82d309 100644 --- a/pkg/archive/s3Backend.go +++ b/pkg/archive/s3Backend.go @@ -15,15 +15,17 @@ import ( "io" "math" "os" + "slices" "strconv" "strings" + "sync" "text/tabwriter" "time" "github.com/ClusterCockpit/cc-backend/internal/config" - cclog "github.com/ClusterCockpit/cc-lib/ccLogger" - "github.com/ClusterCockpit/cc-lib/schema" - "github.com/ClusterCockpit/cc-lib/util" + cclog "github.com/ClusterCockpit/cc-lib/v2/ccLogger" + "github.com/ClusterCockpit/cc-lib/v2/schema" + "github.com/ClusterCockpit/cc-lib/v2/util" "github.com/aws/aws-sdk-go-v2/aws" awsconfig "github.com/aws/aws-sdk-go-v2/config" "github.com/aws/aws-sdk-go-v2/credentials" @@ -33,12 +35,12 @@ import ( // S3ArchiveConfig holds the configuration for the S3 archive backend. 
type S3ArchiveConfig struct { - Endpoint string `json:"endpoint"` // S3 endpoint URL (optional, for MinIO/localstack) - AccessKey string `json:"accessKey"` // AWS access key ID - SecretKey string `json:"secretKey"` // AWS secret access key - Bucket string `json:"bucket"` // S3 bucket name - Region string `json:"region"` // AWS region - UsePathStyle bool `json:"usePathStyle"` // Use path-style URLs (required for MinIO) + Endpoint string `json:"endpoint"` // S3 endpoint URL (optional, for MinIO/localstack) + AccessKey string `json:"access-key"` // AWS access key ID + SecretKey string `json:"secret-key"` // AWS secret access key + Bucket string `json:"bucket"` // S3 bucket name + Region string `json:"region"` // AWS region + UsePathStyle bool `json:"use-path-style"` // Use path-style URLs (required for MinIO) } // S3Archive implements ArchiveBackend using AWS S3 or S3-compatible object storage. @@ -114,6 +116,7 @@ func (s3a *S3Archive) Init(rawConfig json.RawMessage) (uint64, error) { // Create S3 client with path-style option and custom endpoint if specified s3a.client = s3.NewFromConfig(awsCfg, func(o *s3.Options) { + o.DisableLogOutputChecksumValidationSkipped = true o.UsePathStyle = cfg.UsePathStyle if cfg.Endpoint != "" { o.BaseEndpoint = aws.String(cfg.Endpoint) @@ -467,7 +470,6 @@ func (s3a *S3Archive) StoreJobMeta(job *schema.Job) error { func (s3a *S3Archive) ImportJob(jobMeta *schema.Job, jobData *schema.JobData) error { ctx := context.Background() - // Upload meta.json metaKey := getS3Key(jobMeta, "meta.json") var metaBuf bytes.Buffer if err := EncodeJobMeta(&metaBuf, jobMeta); err != nil { @@ -485,18 +487,37 @@ func (s3a *S3Archive) ImportJob(jobMeta *schema.Job, jobData *schema.JobData) er return err } - // Upload data.json - dataKey := getS3Key(jobMeta, "data.json") var dataBuf bytes.Buffer if err := EncodeJobData(&dataBuf, jobData); err != nil { cclog.Error("S3Archive ImportJob() > encoding data error") return err } + var dataKey string + var dataBytes 
[]byte + + if dataBuf.Len() > 2000 { + dataKey = getS3Key(jobMeta, "data.json.gz") + var compressedBuf bytes.Buffer + gzipWriter := gzip.NewWriter(&compressedBuf) + if _, err := gzipWriter.Write(dataBuf.Bytes()); err != nil { + cclog.Errorf("S3Archive ImportJob() > gzip write error: %v", err) + return err + } + if err := gzipWriter.Close(); err != nil { + cclog.Errorf("S3Archive ImportJob() > gzip close error: %v", err) + return err + } + dataBytes = compressedBuf.Bytes() + } else { + dataKey = getS3Key(jobMeta, "data.json") + dataBytes = dataBuf.Bytes() + } + _, err = s3a.client.PutObject(ctx, &s3.PutObjectInput{ Bucket: aws.String(s3a.bucket), Key: aws.String(dataKey), - Body: bytes.NewReader(dataBuf.Bytes()), + Body: bytes.NewReader(dataBytes), }) if err != nil { cclog.Errorf("S3Archive ImportJob() > PutObject data error: %v", err) @@ -795,29 +816,16 @@ func (s3a *S3Archive) Iter(loadMetricData bool) <-chan JobContainer { ctx := context.Background() defer close(ch) - for _, cluster := range s3a.clusters { - prefix := cluster + "/" + numWorkers := 4 + metaKeys := make(chan string, numWorkers*2) + var wg sync.WaitGroup - paginator := s3.NewListObjectsV2Paginator(s3a.client, &s3.ListObjectsV2Input{ - Bucket: aws.String(s3a.bucket), - Prefix: aws.String(prefix), - }) - - for paginator.HasMorePages() { - page, err := paginator.NextPage(ctx) - if err != nil { - cclog.Fatalf("S3Archive Iter() > list error: %s", err.Error()) - } - - for _, obj := range page.Contents { - if obj.Key == nil || !strings.HasSuffix(*obj.Key, "/meta.json") { - continue - } - - // Load job metadata + for range numWorkers { + wg.Go(func() { + for metaKey := range metaKeys { result, err := s3a.client.GetObject(ctx, &s3.GetObjectInput{ Bucket: aws.String(s3a.bucket), - Key: obj.Key, + Key: aws.String(metaKey), }) if err != nil { cclog.Errorf("S3Archive Iter() > GetObject meta error: %v", err) @@ -849,8 +857,34 @@ func (s3a *S3Archive) Iter(loadMetricData bool) <-chan JobContainer { ch <- 
JobContainer{Meta: job, Data: nil} } } + }) + } + + for _, cluster := range s3a.clusters { + prefix := cluster + "/" + + paginator := s3.NewListObjectsV2Paginator(s3a.client, &s3.ListObjectsV2Input{ + Bucket: aws.String(s3a.bucket), + Prefix: aws.String(prefix), + }) + + for paginator.HasMorePages() { + page, err := paginator.NextPage(ctx) + if err != nil { + cclog.Fatalf("S3Archive Iter() > list error: %s", err.Error()) + } + + for _, obj := range page.Contents { + if obj.Key == nil || !strings.HasSuffix(*obj.Key, "/meta.json") { + continue + } + metaKeys <- *obj.Key + } } } + + close(metaKeys) + wg.Wait() }() return ch @@ -877,13 +911,7 @@ func (s3a *S3Archive) StoreClusterCfg(name string, config *schema.Cluster) error } // Update clusters list if new - found := false - for _, c := range s3a.clusters { - if c == name { - found = true - break - } - } + found := slices.Contains(s3a.clusters, name) if !found { s3a.clusters = append(s3a.clusters, name) } diff --git a/pkg/archive/s3Backend_test.go b/pkg/archive/s3Backend_test.go index 06324cd3..ca56f3ed 100644 --- a/pkg/archive/s3Backend_test.go +++ b/pkg/archive/s3Backend_test.go @@ -13,7 +13,7 @@ import ( "strings" "testing" - "github.com/ClusterCockpit/cc-lib/schema" + "github.com/ClusterCockpit/cc-lib/v2/schema" "github.com/aws/aws-sdk-go-v2/aws" "github.com/aws/aws-sdk-go-v2/service/s3" "github.com/aws/aws-sdk-go-v2/service/s3/types" @@ -41,7 +41,7 @@ func (m *MockS3Client) GetObject(ctx context.Context, params *s3.GetObjectInput, if !exists { return nil, fmt.Errorf("NoSuchKey: object not found") } - + contentLength := int64(len(data)) return &s3.GetObjectOutput{ Body: io.NopCloser(bytes.NewReader(data)), @@ -65,7 +65,7 @@ func (m *MockS3Client) HeadObject(ctx context.Context, params *s3.HeadObjectInpu if !exists { return nil, fmt.Errorf("NotFound") } - + contentLength := int64(len(data)) return &s3.HeadObjectOutput{ ContentLength: &contentLength, @@ -86,12 +86,12 @@ func (m *MockS3Client) CopyObject(ctx 
context.Context, params *s3.CopyObjectInpu return nil, fmt.Errorf("invalid CopySource") } sourceKey := parts[1] - + data, exists := m.objects[sourceKey] if !exists { return nil, fmt.Errorf("source not found") } - + destKey := aws.ToString(params.Key) m.objects[destKey] = data return &s3.CopyObjectOutput{}, nil @@ -100,15 +100,15 @@ func (m *MockS3Client) CopyObject(ctx context.Context, params *s3.CopyObjectInpu func (m *MockS3Client) ListObjectsV2(ctx context.Context, params *s3.ListObjectsV2Input, optFns ...func(*s3.Options)) (*s3.ListObjectsV2Output, error) { prefix := aws.ToString(params.Prefix) delimiter := aws.ToString(params.Delimiter) - + var contents []types.Object commonPrefixes := make(map[string]bool) - + for key, data := range m.objects { if !strings.HasPrefix(key, prefix) { continue } - + if delimiter != "" { // Check if there's a delimiter after the prefix remainder := strings.TrimPrefix(key, prefix) @@ -120,21 +120,21 @@ func (m *MockS3Client) ListObjectsV2(ctx context.Context, params *s3.ListObjects continue } } - + size := int64(len(data)) contents = append(contents, types.Object{ Key: aws.String(key), Size: &size, }) } - + var prefixList []types.CommonPrefix for p := range commonPrefixes { prefixList = append(prefixList, types.CommonPrefix{ Prefix: aws.String(p), }) } - + return &s3.ListObjectsV2Output{ Contents: contents, CommonPrefixes: prefixList, @@ -144,10 +144,10 @@ func (m *MockS3Client) ListObjectsV2(ctx context.Context, params *s3.ListObjects // Test helper to create a mock S3 archive with test data func setupMockS3Archive(t *testing.T) *MockS3Client { mock := NewMockS3Client() - + // Add version.txt mock.objects["version.txt"] = []byte("2\n") - + // Add a test cluster directory mock.objects["emmy/cluster.json"] = []byte(`{ "name": "emmy", @@ -165,7 +165,7 @@ func setupMockS3Archive(t *testing.T) *MockS3Client { } ] }`) - + // Add a test job mock.objects["emmy/1403/244/1608923076/meta.json"] = []byte(`{ "jobId": 1403244, @@ -174,7 +174,7 
@@ func setupMockS3Archive(t *testing.T) *MockS3Client { "numNodes": 1, "resources": [{"hostname": "node001"}] }`) - + mock.objects["emmy/1403/244/1608923076/data.json"] = []byte(`{ "mem_used": { "node": { @@ -184,7 +184,7 @@ func setupMockS3Archive(t *testing.T) *MockS3Client { } } }`) - + return mock } @@ -213,7 +213,7 @@ func TestGetS3Key(t *testing.T) { Cluster: "emmy", StartTime: 1608923076, } - + key := getS3Key(job, "meta.json") expected := "emmy/1403/244/1608923076/meta.json" if key != expected { @@ -227,7 +227,7 @@ func TestGetS3Directory(t *testing.T) { Cluster: "emmy", StartTime: 1608923076, } - + dir := getS3Directory(job) expected := "emmy/1403/244/1608923076/" if dir != expected { @@ -241,19 +241,19 @@ func TestGetS3Directory(t *testing.T) { func TestS3ArchiveConfigParsing(t *testing.T) { rawConfig := json.RawMessage(`{ "endpoint": "http://localhost:9000", - "accessKey": "minioadmin", - "secretKey": "minioadmin", + "access-key": "minioadmin", + "secret-key": "minioadmin", "bucket": "test-bucket", "region": "us-east-1", - "usePathStyle": true + "use-path-style": true }`) - + var cfg S3ArchiveConfig err := json.Unmarshal(rawConfig, &cfg) if err != nil { t.Fatalf("failed to parse config: %v", err) } - + if cfg.Bucket != "test-bucket" { t.Errorf("expected bucket 'test-bucket', got '%s'", cfg.Bucket) } @@ -277,14 +277,14 @@ func TestS3KeyGeneration(t *testing.T) { {1404397, "emmy", 1609300556, "data.json.gz", "emmy/1404/397/1609300556/data.json.gz"}, {42, "fritz", 1234567890, "meta.json", "fritz/0/042/1234567890/meta.json"}, } - + for _, tt := range tests { job := &schema.Job{ JobID: tt.jobID, Cluster: tt.cluster, StartTime: tt.startTime, } - + key := getS3Key(job, tt.file) if key != tt.expected { t.Errorf("for job %d: expected %s, got %s", tt.jobID, tt.expected, key) diff --git a/pkg/archive/sqliteBackend.go b/pkg/archive/sqliteBackend.go index 49aeb79d..3f214136 100644 --- a/pkg/archive/sqliteBackend.go +++ b/pkg/archive/sqliteBackend.go @@ -16,19 +16,20 
@@ import ( "os" "slices" "strconv" + "sync" "text/tabwriter" "time" "github.com/ClusterCockpit/cc-backend/internal/config" - cclog "github.com/ClusterCockpit/cc-lib/ccLogger" - "github.com/ClusterCockpit/cc-lib/schema" - "github.com/ClusterCockpit/cc-lib/util" + cclog "github.com/ClusterCockpit/cc-lib/v2/ccLogger" + "github.com/ClusterCockpit/cc-lib/v2/schema" + "github.com/ClusterCockpit/cc-lib/v2/util" _ "github.com/mattn/go-sqlite3" ) // SqliteArchiveConfig holds the configuration for the SQLite archive backend. type SqliteArchiveConfig struct { - DBPath string `json:"dbPath"` // Path to SQLite database file + DBPath string `json:"db-path"` // Path to SQLite database file } // SqliteArchive implements ArchiveBackend using a SQLite database with BLOB storage. @@ -60,6 +61,7 @@ CREATE TABLE IF NOT EXISTS jobs ( CREATE INDEX IF NOT EXISTS idx_jobs_cluster ON jobs(cluster); CREATE INDEX IF NOT EXISTS idx_jobs_start_time ON jobs(start_time); +CREATE INDEX IF NOT EXISTS idx_jobs_order ON jobs(cluster, start_time); CREATE INDEX IF NOT EXISTS idx_jobs_lookup ON jobs(cluster, job_id, start_time); CREATE TABLE IF NOT EXISTS clusters ( @@ -361,16 +363,37 @@ func (sa *SqliteArchive) ImportJob(jobMeta *schema.Job, jobData *schema.JobData) return err } + var dataBytes []byte + var compressed bool + + if dataBuf.Len() > 2000 { + var compressedBuf bytes.Buffer + gzipWriter := gzip.NewWriter(&compressedBuf) + if _, err := gzipWriter.Write(dataBuf.Bytes()); err != nil { + cclog.Errorf("SqliteArchive ImportJob() > gzip write error: %v", err) + return err + } + if err := gzipWriter.Close(); err != nil { + cclog.Errorf("SqliteArchive ImportJob() > gzip close error: %v", err) + return err + } + dataBytes = compressedBuf.Bytes() + compressed = true + } else { + dataBytes = dataBuf.Bytes() + compressed = false + } + now := time.Now().Unix() _, err := sa.db.Exec(` INSERT INTO jobs (job_id, cluster, start_time, meta_json, data_json, data_compressed, created_at, updated_at) - VALUES (?, 
?, ?, ?, ?, 0, ?, ?) + VALUES (?, ?, ?, ?, ?, ?, ?, ?) ON CONFLICT(job_id, cluster, start_time) DO UPDATE SET meta_json = excluded.meta_json, data_json = excluded.data_json, data_compressed = excluded.data_compressed, updated_at = excluded.updated_at - `, jobMeta.JobID, jobMeta.Cluster, jobMeta.StartTime, metaBuf.Bytes(), dataBuf.Bytes(), now, now) + `, jobMeta.JobID, jobMeta.Cluster, jobMeta.StartTime, metaBuf.Bytes(), dataBytes, compressed, now, now) if err != nil { cclog.Errorf("SqliteArchive ImportJob() > insert error: %v", err) return err @@ -526,62 +549,113 @@ func (sa *SqliteArchive) CompressLast(starttime int64) int64 { return last } +type sqliteJobRow struct { + metaBlob []byte + dataBlob []byte + compressed bool +} + func (sa *SqliteArchive) Iter(loadMetricData bool) <-chan JobContainer { ch := make(chan JobContainer) go func() { defer close(ch) - rows, err := sa.db.Query("SELECT job_id, cluster, start_time, meta_json, data_json, data_compressed FROM jobs ORDER BY cluster, start_time") - if err != nil { - cclog.Fatalf("SqliteArchive Iter() > query error: %s", err.Error()) + const chunkSize = 1000 + offset := 0 + + var query string + if loadMetricData { + query = "SELECT meta_json, data_json, data_compressed FROM jobs ORDER BY cluster, start_time LIMIT ? OFFSET ?" + } else { + query = "SELECT meta_json FROM jobs ORDER BY cluster, start_time LIMIT ? OFFSET ?" 
} - defer rows.Close() - for rows.Next() { - var jobID int64 - var cluster string - var startTime int64 - var metaBlob []byte - var dataBlob []byte - var compressed bool + numWorkers := 4 + jobRows := make(chan sqliteJobRow, numWorkers*2) + var wg sync.WaitGroup - if err := rows.Scan(&jobID, &cluster, &startTime, &metaBlob, &dataBlob, &compressed); err != nil { - cclog.Errorf("SqliteArchive Iter() > scan error: %v", err) - continue - } - - job, err := DecodeJobMeta(bytes.NewReader(metaBlob)) - if err != nil { - cclog.Errorf("SqliteArchive Iter() > decode meta error: %v", err) - continue - } - - if loadMetricData && dataBlob != nil { - var reader io.Reader = bytes.NewReader(dataBlob) - if compressed { - gzipReader, err := gzip.NewReader(reader) + for range numWorkers { + wg.Go(func() { + for row := range jobRows { + job, err := DecodeJobMeta(bytes.NewReader(row.metaBlob)) if err != nil { - cclog.Errorf("SqliteArchive Iter() > gzip error: %v", err) - ch <- JobContainer{Meta: job, Data: nil} + cclog.Errorf("SqliteArchive Iter() > decode meta error: %v", err) continue } - defer gzipReader.Close() - reader = gzipReader + + if loadMetricData && row.dataBlob != nil { + var reader io.Reader = bytes.NewReader(row.dataBlob) + if row.compressed { + gzipReader, err := gzip.NewReader(reader) + if err != nil { + cclog.Errorf("SqliteArchive Iter() > gzip error: %v", err) + ch <- JobContainer{Meta: job, Data: nil} + continue + } + decompressed, err := io.ReadAll(gzipReader) + gzipReader.Close() + if err != nil { + cclog.Errorf("SqliteArchive Iter() > decompress error: %v", err) + ch <- JobContainer{Meta: job, Data: nil} + continue + } + reader = bytes.NewReader(decompressed) + } + + key := fmt.Sprintf("%s:%d:%d", job.Cluster, job.JobID, job.StartTime) + jobData, err := DecodeJobData(reader, key) + if err != nil { + cclog.Errorf("SqliteArchive Iter() > decode data error: %v", err) + ch <- JobContainer{Meta: job, Data: nil} + } else { + ch <- JobContainer{Meta: job, Data: &jobData} 
+ } + } else { + ch <- JobContainer{Meta: job, Data: nil} + } + } + }) + } + + for { + rows, err := sa.db.Query(query, chunkSize, offset) + if err != nil { + cclog.Fatalf("SqliteArchive Iter() > query error: %s", err.Error()) + } + + rowCount := 0 + for rows.Next() { + var row sqliteJobRow + + if loadMetricData { + if err := rows.Scan(&row.metaBlob, &row.dataBlob, &row.compressed); err != nil { + cclog.Errorf("SqliteArchive Iter() > scan error: %v", err) + continue + } + } else { + if err := rows.Scan(&row.metaBlob); err != nil { + cclog.Errorf("SqliteArchive Iter() > scan error: %v", err) + continue + } + row.dataBlob = nil + row.compressed = false } - key := fmt.Sprintf("%s:%d:%d", job.Cluster, job.JobID, job.StartTime) - jobData, err := DecodeJobData(reader, key) - if err != nil { - cclog.Errorf("SqliteArchive Iter() > decode data error: %v", err) - ch <- JobContainer{Meta: job, Data: nil} - } else { - ch <- JobContainer{Meta: job, Data: &jobData} - } - } else { - ch <- JobContainer{Meta: job, Data: nil} + jobRows <- row + rowCount++ } + rows.Close() + + if rowCount < chunkSize { + break + } + + offset += chunkSize } + + close(jobRows) + wg.Wait() }() return ch diff --git a/pkg/archive/sqliteBackend_test.go b/pkg/archive/sqliteBackend_test.go index 285055fc..6a35f6ca 100644 --- a/pkg/archive/sqliteBackend_test.go +++ b/pkg/archive/sqliteBackend_test.go @@ -9,7 +9,7 @@ import ( "os" "testing" - "github.com/ClusterCockpit/cc-lib/schema" + "github.com/ClusterCockpit/cc-lib/v2/schema" ) func TestSqliteInitEmptyPath(t *testing.T) { @@ -22,7 +22,7 @@ func TestSqliteInitEmptyPath(t *testing.T) { func TestSqliteInitInvalidConfig(t *testing.T) { var sa SqliteArchive - _, err := sa.Init(json.RawMessage(`"dbPath":"/tmp/test.db"`)) + _, err := sa.Init(json.RawMessage(`"db-path":"/tmp/test.db"`)) if err == nil { t.Fatal("expected error for invalid config") } @@ -33,7 +33,7 @@ func TestSqliteInit(t *testing.T) { defer os.Remove(tmpfile) var sa SqliteArchive - version, err := 
sa.Init(json.RawMessage(`{"dbPath":"` + tmpfile + `"}`)) + version, err := sa.Init(json.RawMessage(`{"db-path":"` + tmpfile + `"}`)) if err != nil { t.Fatalf("init failed: %v", err) } @@ -51,7 +51,7 @@ func TestSqliteStoreAndLoadJobMeta(t *testing.T) { defer os.Remove(tmpfile) var sa SqliteArchive - _, err := sa.Init(json.RawMessage(`{"dbPath":"` + tmpfile + `"}`)) + _, err := sa.Init(json.RawMessage(`{"db-path":"` + tmpfile + `"}`)) if err != nil { t.Fatalf("init failed: %v", err) } @@ -97,7 +97,7 @@ func TestSqliteImportJob(t *testing.T) { defer os.Remove(tmpfile) var sa SqliteArchive - _, err := sa.Init(json.RawMessage(`{"dbPath":"` + tmpfile + `"}`)) + _, err := sa.Init(json.RawMessage(`{"db-path":"` + tmpfile + `"}`)) if err != nil { t.Fatalf("init failed: %v", err) } @@ -114,7 +114,7 @@ func TestSqliteGetClusters(t *testing.T) { defer os.Remove(tmpfile) var sa SqliteArchive - _, err := sa.Init(json.RawMessage(`{"dbPath":"` + tmpfile + `"}`)) + _, err := sa.Init(json.RawMessage(`{"db-path":"` + tmpfile + `"}`)) if err != nil { t.Fatalf("init failed: %v", err) } @@ -141,7 +141,7 @@ func TestSqliteGetClusters(t *testing.T) { // Reinitialize to refresh cluster list sa.db.Close() - _, err = sa.Init(json.RawMessage(`{"dbPath":"` + tmpfile + `"}`)) + _, err = sa.Init(json.RawMessage(`{"db-path":"` + tmpfile + `"}`)) if err != nil { t.Fatalf("reinit failed: %v", err) } @@ -158,7 +158,7 @@ func TestSqliteCleanUp(t *testing.T) { defer os.Remove(tmpfile) var sa SqliteArchive - _, err := sa.Init(json.RawMessage(`{"dbPath":"` + tmpfile + `"}`)) + _, err := sa.Init(json.RawMessage(`{"db-path":"` + tmpfile + `"}`)) if err != nil { t.Fatalf("init failed: %v", err) } @@ -193,7 +193,7 @@ func TestSqliteClean(t *testing.T) { defer os.Remove(tmpfile) var sa SqliteArchive - _, err := sa.Init(json.RawMessage(`{"dbPath":"` + tmpfile + `"}`)) + _, err := sa.Init(json.RawMessage(`{"db-path":"` + tmpfile + `"}`)) if err != nil { t.Fatalf("init failed: %v", err) } @@ -237,7 +237,7 @@ 
func TestSqliteIter(t *testing.T) { defer os.Remove(tmpfile) var sa SqliteArchive - _, err := sa.Init(json.RawMessage(`{"dbPath":"` + tmpfile + `"}`)) + _, err := sa.Init(json.RawMessage(`{"db-path":"` + tmpfile + `"}`)) if err != nil { t.Fatalf("init failed: %v", err) } @@ -276,7 +276,7 @@ func TestSqliteCompress(t *testing.T) { defer os.Remove(tmpfile) var sa SqliteArchive - _, err := sa.Init(json.RawMessage(`{"dbPath":"` + tmpfile + `"}`)) + _, err := sa.Init(json.RawMessage(`{"db-path":"` + tmpfile + `"}`)) if err != nil { t.Fatalf("init failed: %v", err) } @@ -294,12 +294,12 @@ func TestSqliteCompress(t *testing.T) { // Compress should not panic even with missing data sa.Compress([]*schema.Job{job}) - + t.Log("Compression method verified") } func TestSqliteConfigParsing(t *testing.T) { - rawConfig := json.RawMessage(`{"dbPath": "/tmp/test.db"}`) + rawConfig := json.RawMessage(`{"db-path": "/tmp/test.db"}`) var cfg SqliteArchiveConfig err := json.Unmarshal(rawConfig, &cfg) @@ -311,3 +311,58 @@ func TestSqliteConfigParsing(t *testing.T) { t.Errorf("expected dbPath '/tmp/test.db', got '%s'", cfg.DBPath) } } + +func TestSqliteIterChunking(t *testing.T) { + tmpfile := t.TempDir() + "/test.db" + defer os.Remove(tmpfile) + + var sa SqliteArchive + _, err := sa.Init(json.RawMessage(`{"db-path":"` + tmpfile + `"}`)) + if err != nil { + t.Fatalf("init failed: %v", err) + } + defer sa.db.Close() + + const totalJobs = 2500 + for i := 1; i <= totalJobs; i++ { + job := &schema.Job{ + JobID: int64(i), + Cluster: "test", + StartTime: int64(i * 1000), + NumNodes: 1, + Resources: []*schema.Resource{{Hostname: "node001"}}, + } + if err := sa.StoreJobMeta(job); err != nil { + t.Fatalf("store failed: %v", err) + } + } + + t.Run("IterWithoutData", func(t *testing.T) { + count := 0 + for container := range sa.Iter(false) { + if container.Meta == nil { + t.Error("expected non-nil meta") + } + if container.Data != nil { + t.Error("expected nil data when loadMetricData is false") + } + 
count++ + } + if count != totalJobs { + t.Errorf("expected %d jobs, got %d", totalJobs, count) + } + }) + + t.Run("IterWithData", func(t *testing.T) { + count := 0 + for container := range sa.Iter(true) { + if container.Meta == nil { + t.Error("expected non-nil meta") + } + count++ + } + if count != totalJobs { + t.Errorf("expected %d jobs, got %d", totalJobs, count) + } + }) +} diff --git a/internal/memorystore/api.go b/pkg/metricstore/api.go similarity index 51% rename from internal/memorystore/api.go rename to pkg/metricstore/api.go index 1f7a531f..21f8db0c 100644 --- a/internal/memorystore/api.go +++ b/pkg/metricstore/api.go @@ -3,15 +3,35 @@ // Use of this source code is governed by a MIT-style // license that can be found in the LICENSE file. -package memorystore +// This file contains the API types and data fetching logic for querying metric data +// from the in-memory metric store. It provides structures for building complex queries +// with support for aggregation, scaling, padding, and statistics computation. +package metricstore import ( + "errors" + "fmt" "math" - "github.com/ClusterCockpit/cc-lib/schema" - "github.com/ClusterCockpit/cc-lib/util" + cclog "github.com/ClusterCockpit/cc-lib/v2/ccLogger" + "github.com/ClusterCockpit/cc-lib/v2/schema" + "github.com/ClusterCockpit/cc-lib/v2/util" ) +var ( + // ErrNoHostOrMetric is returned when the metric store does not find the host or the metric + ErrNoHostOrMetric error = errors.New("[METRICSTORE]> metric or host not found") + // ErrInvalidTimeRange is returned when a query has 'from' >= 'to' + ErrInvalidTimeRange = errors.New("[METRICSTORE]> invalid time range: 'from' must be before 'to'") + // ErrEmptyCluster is returned when a query with ForAllNodes has no cluster specified + ErrEmptyCluster = errors.New("[METRICSTORE]> cluster name cannot be empty") +) + +// APIMetricData represents the response data for a single metric query. 
+// +// It contains both the time-series data points and computed statistics (avg, min, max). +// If an error occurred during data retrieval, the Error field will be set and other +// fields may be incomplete. type APIMetricData struct { Error *string `json:"error,omitempty"` Data schema.FloatArray `json:"data,omitempty"` @@ -23,6 +43,13 @@ type APIMetricData struct { Max schema.Float `json:"max"` } +// APIQueryRequest represents a batch query request for metric data. +// +// It supports two modes of operation: +// 1. Explicit queries via the Queries field +// 2. Automatic query generation via ForAllNodes (queries all specified metrics for all nodes in the cluster) +// +// The request can be customized with flags to include/exclude statistics, raw data, and padding. type APIQueryRequest struct { Cluster string `json:"cluster"` Queries []APIQuery `json:"queries"` @@ -34,11 +61,25 @@ type APIQueryRequest struct { WithPadding bool `json:"with-padding"` } +// APIQueryResponse represents the response to an APIQueryRequest. +// +// Results is a 2D array where each outer element corresponds to a query, +// and each inner element corresponds to a selector within that query +// (e.g., multiple CPUs or cores). type APIQueryResponse struct { Queries []APIQuery `json:"queries,omitempty"` Results [][]APIMetricData `json:"results"` } +// APIQuery represents a single metric query with optional hierarchical selectors. +// +// The hierarchical selection works as follows: +// - Hostname: The node to query +// - Type + TypeIds: First level of hierarchy (e.g., "cpu" + ["0", "1", "2"]) +// - SubType + SubTypeIds: Second level of hierarchy (e.g., "core" + ["0", "1"]) +// +// If Aggregate is true, data from multiple type/subtype IDs will be aggregated according +// to the metric's aggregation strategy. Otherwise, separate results are returned for each combination. 
type APIQuery struct { Type *string `json:"type,omitempty"` SubType *string `json:"subtype,omitempty"` @@ -51,6 +92,11 @@ type APIQuery struct { Aggregate bool `json:"aggreg"` } +// AddStats computes and populates the Avg, Min, and Max fields from the Data array. +// +// NaN values in the data are ignored during computation. If all values are NaN, +// the statistics fields will be set to NaN. +// // TODO: Optimize this, just like the stats endpoint! func (data *APIMetricData) AddStats() { n := 0 @@ -76,6 +122,10 @@ func (data *APIMetricData) AddStats() { } } +// ScaleBy multiplies all data points and statistics by the given factor. +// +// This is commonly used for unit conversion (e.g., bytes to gigabytes). +// Scaling by 0 or 1 is a no-op for performance reasons. func (data *APIMetricData) ScaleBy(f schema.Float) { if f == 0 || f == 1 { return @@ -89,6 +139,17 @@ func (data *APIMetricData) ScaleBy(f schema.Float) { } } +// PadDataWithNull pads the beginning of the data array with NaN values if needed. +// +// This ensures that the data aligns with the requested 'from' timestamp, even if +// the metric store doesn't have data for the earliest time points. This is useful +// for maintaining consistent array indexing across multiple queries. +// +// Parameters: +// - ms: MemoryStore instance to lookup metric configuration +// - from: The requested start timestamp +// - to: The requested end timestamp (unused but kept for API consistency) +// - metric: The metric name to lookup frequency information func (data *APIMetricData) PadDataWithNull(ms *MemoryStore, from, to int64, metric string) { minfo, ok := ms.Metrics[metric] if !ok { @@ -108,12 +169,44 @@ func (data *APIMetricData) PadDataWithNull(ms *MemoryStore, from, to int64, metr } } +// FetchData executes a batch metric query request and returns the results. +// +// This is the primary API for retrieving metric data from the memory store. 
It supports: +// - Individual queries via req.Queries +// - Batch queries for all nodes via req.ForAllNodes +// - Hierarchical selector construction (cluster → host → type → subtype) +// - Optional statistics computation (avg, min, max) +// - Optional data scaling +// - Optional data padding with NaN values +// +// The function constructs selectors based on the query parameters and calls MemoryStore.Read() +// for each selector. If a query specifies Aggregate=false with multiple type/subtype IDs, +// separate results are returned for each combination. +// +// Parameters: +// - req: The query request containing queries, time range, and options +// +// Returns: +// - APIQueryResponse containing results for each query, or error if validation fails +// +// Errors: +// - ErrInvalidTimeRange if req.From > req.To +// - ErrEmptyCluster if req.ForAllNodes is used without specifying a cluster +// - Error if MemoryStore is not initialized +// - Individual query errors are stored in APIMetricData.Error field func FetchData(req APIQueryRequest) (*APIQueryResponse, error) { - req.WithData = true - req.WithData = true - req.WithData = true + if req.From > req.To { + return nil, ErrInvalidTimeRange + } + if req.Cluster == "" && req.ForAllNodes != nil { + return nil, ErrEmptyCluster + } + req.WithData = true ms := GetMemoryStore() + if ms == nil { + return nil, fmt.Errorf("[METRICSTORE]> memorystore not initialized") + } response := APIQueryResponse{ Results: make([][]APIMetricData, 0, len(req.Queries)), @@ -181,8 +274,6 @@ func FetchData(req APIQueryRequest) (*APIQueryResponse, error) { } } - // log.Printf("query: %#v\n", query) - // log.Printf("sels: %#v\n", sels) var err error res := make([]APIMetricData, 0, len(sels)) for _, sel := range sels { @@ -190,9 +281,15 @@ func FetchData(req APIQueryRequest) (*APIQueryResponse, error) { data.Data, data.From, data.To, data.Resolution, err = ms.Read(sel, query.Metric, req.From, req.To, query.Resolution) if err != nil { - msg := 
err.Error() - data.Error = &msg - res = append(res, data) + // Skip Error If Just Missing Host or Metric, Continue + // Empty Return For Metric Handled Gracefully By Frontend + if err != ErrNoHostOrMetric { + msg := err.Error() + data.Error = &msg + res = append(res, data) + } else { + cclog.Warnf("failed to fetch '%s' from host '%s' (cluster: %s): %s", query.Metric, query.Hostname, req.Cluster, err.Error()) + } continue } diff --git a/pkg/metricstore/archive.go b/pkg/metricstore/archive.go new file mode 100644 index 00000000..916736d0 --- /dev/null +++ b/pkg/metricstore/archive.go @@ -0,0 +1,277 @@ +// Copyright (C) NHR@FAU, University Erlangen-Nuremberg. +// All rights reserved. This file is part of cc-backend. +// Use of this source code is governed by a MIT-style +// license that can be found in the LICENSE file. + +package metricstore + +import ( + "context" + "errors" + "fmt" + "os" + "path/filepath" + "sync" + "sync/atomic" + "time" + + cclog "github.com/ClusterCockpit/cc-lib/v2/ccLogger" +) + +// Worker for either Archiving or Deleting files + +func CleanUp(wg *sync.WaitGroup, ctx context.Context) { + if Keys.Cleanup.Mode == "archive" { + // Run as Archiver + cleanUpWorker(wg, ctx, + Keys.RetentionInMemory, + "archiving", + Keys.Cleanup.RootDir, + false, + ) + } else { + // Run as Deleter + cleanUpWorker(wg, ctx, + Keys.RetentionInMemory, + "deleting", + "", + true, + ) + } +} + +// cleanUpWorker takes simple values to configure what it does +func cleanUpWorker(wg *sync.WaitGroup, ctx context.Context, interval string, mode string, cleanupDir string, delete bool) { + wg.Go(func() { + + d, err := time.ParseDuration(interval) + if err != nil { + cclog.Fatalf("[METRICSTORE]> error parsing %s interval duration: %v\n", mode, err) + } + if d <= 0 { + return + } + + ticker := time.NewTicker(d) + defer ticker.Stop() + + for { + select { + case <-ctx.Done(): + return + case <-ticker.C: + t := time.Now().Add(-d) + cclog.Infof("[METRICSTORE]> start %s checkpoints 
(older than %s)...", mode, t.Format(time.RFC3339)) + + n, err := CleanupCheckpoints(Keys.Checkpoints.RootDir, cleanupDir, t.Unix(), delete) + + if err != nil { + cclog.Errorf("[METRICSTORE]> %s failed: %s", mode, err.Error()) + } else { + if delete { + cclog.Infof("[METRICSTORE]> done: %d checkpoints deleted", n) + } else { + cclog.Infof("[METRICSTORE]> done: %d checkpoint files archived to parquet", n) + } + } + } + } + }) +} + +var ErrNoNewArchiveData error = errors.New("all data already archived") + +// CleanupCheckpoints deletes or archives all checkpoint files older than `from`. +// When archiving, consolidates all hosts per cluster into a single Parquet file. +func CleanupCheckpoints(checkpointsDir, cleanupDir string, from int64, deleteInstead bool) (int, error) { + if deleteInstead { + return deleteCheckpoints(checkpointsDir, from) + } + + return archiveCheckpoints(checkpointsDir, cleanupDir, from) +} + +// deleteCheckpoints removes checkpoint files older than `from` across all clusters/hosts. 
+func deleteCheckpoints(checkpointsDir string, from int64) (int, error) { + entries1, err := os.ReadDir(checkpointsDir) + if err != nil { + return 0, err + } + + type workItem struct { + dir string + cluster, host string + } + + var wg sync.WaitGroup + n, errs := int32(0), int32(0) + work := make(chan workItem, Keys.NumWorkers) + + wg.Add(Keys.NumWorkers) + for worker := 0; worker < Keys.NumWorkers; worker++ { + go func() { + defer wg.Done() + for item := range work { + entries, err := os.ReadDir(item.dir) + if err != nil { + cclog.Errorf("error reading %s/%s: %s", item.cluster, item.host, err.Error()) + atomic.AddInt32(&errs, 1) + continue + } + + files, err := findFiles(entries, from, false) + if err != nil { + cclog.Errorf("error finding files in %s/%s: %s", item.cluster, item.host, err.Error()) + atomic.AddInt32(&errs, 1) + continue + } + + for _, checkpoint := range files { + if err := os.Remove(filepath.Join(item.dir, checkpoint)); err != nil { + cclog.Errorf("error deleting %s/%s/%s: %s", item.cluster, item.host, checkpoint, err.Error()) + atomic.AddInt32(&errs, 1) + } else { + atomic.AddInt32(&n, 1) + } + } + } + }() + } + + for _, de1 := range entries1 { + entries2, e := os.ReadDir(filepath.Join(checkpointsDir, de1.Name())) + if e != nil { + err = e + continue + } + + for _, de2 := range entries2 { + work <- workItem{ + dir: filepath.Join(checkpointsDir, de1.Name(), de2.Name()), + cluster: de1.Name(), + host: de2.Name(), + } + } + } + + close(work) + wg.Wait() + + if err != nil { + return int(n), err + } + if errs > 0 { + return int(n), fmt.Errorf("%d errors happened while deleting (%d successes)", errs, n) + } + return int(n), nil +} + +// archiveCheckpoints archives checkpoint files to Parquet format. 
+// Produces one Parquet file per cluster: //.parquet +func archiveCheckpoints(checkpointsDir, cleanupDir string, from int64) (int, error) { + clusterEntries, err := os.ReadDir(checkpointsDir) + if err != nil { + return 0, err + } + + totalFiles := 0 + + for _, clusterEntry := range clusterEntries { + if !clusterEntry.IsDir() { + continue + } + + cluster := clusterEntry.Name() + hostEntries, err := os.ReadDir(filepath.Join(checkpointsDir, cluster)) + if err != nil { + return totalFiles, err + } + + // Collect rows from all hosts in this cluster using worker pool + type hostResult struct { + rows []ParquetMetricRow + files []string // checkpoint filenames to delete after successful write + dir string // checkpoint directory for this host + } + + results := make(chan hostResult, len(hostEntries)) + work := make(chan struct { + dir, host string + }, Keys.NumWorkers) + + var wg sync.WaitGroup + errs := int32(0) + + wg.Add(Keys.NumWorkers) + for w := 0; w < Keys.NumWorkers; w++ { + go func() { + defer wg.Done() + for item := range work { + rows, files, err := archiveCheckpointsToParquet(item.dir, cluster, item.host, from) + if err != nil { + cclog.Errorf("[METRICSTORE]> error reading checkpoints for %s/%s: %s", cluster, item.host, err.Error()) + atomic.AddInt32(&errs, 1) + continue + } + if len(rows) > 0 { + results <- hostResult{rows: rows, files: files, dir: item.dir} + } + } + }() + } + + go func() { + for _, hostEntry := range hostEntries { + if !hostEntry.IsDir() { + continue + } + dir := filepath.Join(checkpointsDir, cluster, hostEntry.Name()) + work <- struct { + dir, host string + }{dir: dir, host: hostEntry.Name()} + } + close(work) + wg.Wait() + close(results) + }() + + // Collect all rows and file info + var allRows []ParquetMetricRow + var allResults []hostResult + for r := range results { + allRows = append(allRows, r.rows...) 
+ allResults = append(allResults, r) + } + + if errs > 0 { + return totalFiles, fmt.Errorf("%d errors reading checkpoints for cluster %s", errs, cluster) + } + + if len(allRows) == 0 { + continue + } + + // Write one Parquet file per cluster + parquetFile := filepath.Join(cleanupDir, cluster, fmt.Sprintf("%d.parquet", from)) + if err := writeParquetArchive(parquetFile, allRows); err != nil { + return totalFiles, fmt.Errorf("writing parquet archive for cluster %s: %w", cluster, err) + } + + // Delete archived checkpoint files + for _, result := range allResults { + for _, file := range result.files { + filename := filepath.Join(result.dir, file) + if err := os.Remove(filename); err != nil { + cclog.Warnf("[METRICSTORE]> could not remove archived checkpoint %s: %v", filename, err) + } else { + totalFiles++ + } + } + } + + cclog.Infof("[METRICSTORE]> archived %d rows from %d files for cluster %s to %s", + len(allRows), totalFiles, cluster, parquetFile) + } + + return totalFiles, nil +} diff --git a/pkg/metricstore/buffer.go b/pkg/metricstore/buffer.go new file mode 100644 index 00000000..2d752006 --- /dev/null +++ b/pkg/metricstore/buffer.go @@ -0,0 +1,410 @@ +// Copyright (C) NHR@FAU, University Erlangen-Nuremberg. +// All rights reserved. This file is part of cc-backend. +// Use of this source code is governed by a MIT-style +// license that can be found in the LICENSE file. + +// Package metricstore provides buffer.go: Time-series data buffer implementation. +// +// # Buffer Architecture +// +// Each metric at each hierarchical level (cluster/host/cpu/etc.) uses a linked-list +// chain of fixed-size buffers to store time-series data. 
This design: +// +// - Avoids reallocation/copying when growing (new links added instead) +// - Enables efficient pooling (buffers returned to sync.Pool) +// - Supports traversal back in time (via prev pointers) +// - Maintains temporal ordering (newer data in later buffers) +// +// # Buffer Chain Example +// +// [oldest buffer] <- prev -- [older] <- prev -- [newest buffer (head)] +// start=1000 start=1512 start=2024 +// data=[v0...v511] data=[v0...v511] data=[v0...v42] +// +// When the head buffer reaches capacity (BufferCap = 512), a new buffer becomes +// the new head and the old head is linked via prev. +// +// # Pooling Strategy +// +// sync.Pool reduces GC pressure for the common case (BufferCap-sized allocations). +// Non-standard capacity buffers are not pooled (e.g., from checkpoint deserialization). +// +// # Time Alignment +// +// Timestamps are aligned to measurement frequency intervals: +// +// index = (timestamp - buffer.start) / buffer.frequency +// actualTime = buffer.start + (frequency / 2) + (index * frequency) +// +// Missing data points are represented as NaN values. The read() function performs +// linear interpolation where possible. +package metricstore + +import ( + "errors" + "sync" + "time" + + "github.com/ClusterCockpit/cc-lib/v2/schema" +) + +// BufferCap is the default buffer capacity. +// buffer.data will only ever grow up to its capacity and a new link +// in the buffer chain will be created if needed so that no copying +// of data or reallocation needs to happen on writes. +const BufferCap int = DefaultBufferCapacity + +// BufferPool is the global instance. +// It is initialized immediately when the package loads. +var bufferPool = NewPersistentBufferPool() + +type PersistentBufferPool struct { + pool []*buffer + mu sync.Mutex +} + +// NewPersistentBufferPool creates a dynamic pool for buffers. 
+func NewPersistentBufferPool() *PersistentBufferPool { + return &PersistentBufferPool{ + pool: make([]*buffer, 0), + } +} + +func (p *PersistentBufferPool) Get() *buffer { + p.mu.Lock() + defer p.mu.Unlock() + + n := len(p.pool) + if n == 0 { + // Pool is empty, allocate a new one + return &buffer{ + data: make([]schema.Float, 0, BufferCap), + } + } + + // Reuse existing buffer from the pool + b := p.pool[n-1] + p.pool[n-1] = nil // Avoid memory leak + p.pool = p.pool[:n-1] + return b +} + +// Put returns b to the pool. The caller must set b.lastUsed = time.Now().Unix() +// before calling Put so that Clean() can evict idle entries correctly. +func (p *PersistentBufferPool) Put(b *buffer) { + // Reset the buffer before putting it back + b.data = b.data[:0] + + p.mu.Lock() + defer p.mu.Unlock() + + p.pool = append(p.pool, b) +} + +// GetSize returns the exact number of buffers currently sitting in the pool. +func (p *PersistentBufferPool) GetSize() int { + p.mu.Lock() + defer p.mu.Unlock() + return len(p.pool) +} + +// Clear drains all buffers currently in the pool, allowing the GC to collect them. +func (p *PersistentBufferPool) Clear() { + p.mu.Lock() + defer p.mu.Unlock() + for i := range p.pool { + p.pool[i] = nil + } + p.pool = p.pool[:0] +} + +// Clean removes buffers from the pool that haven't been used in the given duration. +// It uses a simple LRU approach based on the lastUsed timestamp. +func (p *PersistentBufferPool) Clean(threshold int64) { + p.mu.Lock() + defer p.mu.Unlock() + + // Filter in place, retaining only buffers returned to the pool recently enough. + active := p.pool[:0] + for _, b := range p.pool { + if b.lastUsed >= threshold { + active = append(active, b) + } + } + + // Nullify the rest to prevent memory leaks + for i := len(active); i < len(p.pool); i++ { + p.pool[i] = nil + } + + p.pool = active +} + +var ( + // ErrNoData indicates no time-series data exists for the requested metric/level. 
+ ErrNoData error = errors.New("[METRICSTORE]> no data for this metric/level") + + // ErrDataDoesNotAlign indicates that aggregated data from child scopes + // does not align with the parent scope's expected timestamps/intervals. + ErrDataDoesNotAlign error = errors.New("[METRICSTORE]> data from lower granularities does not align") +) + +// buffer stores time-series data for a single metric at a specific hierarchical level. +// +// Buffers form doubly-linked chains ordered by time. When capacity is reached, +// a new buffer becomes the head and the old head is linked via prev/next. +// +// Fields: +// - prev: Link to older buffer in the chain (nil if this is oldest) +// - next: Link to newer buffer in the chain (nil if this is newest/head) +// - data: Time-series values (schema.Float supports NaN for missing data) +// - frequency: Measurement interval in seconds +// - start: Start timestamp (adjusted by -frequency/2 for alignment) +// - archived: True if data has been persisted to disk archive +// - closed: True if buffer is no longer accepting writes +// +// Index calculation: index = (timestamp - start) / frequency +// Actual data timestamp: start + (frequency / 2) + (index * frequency) +type buffer struct { + prev *buffer + next *buffer + data []schema.Float + frequency int64 + start int64 + archived bool + closed bool + lastUsed int64 +} + +func newBuffer(ts, freq int64) *buffer { + b := bufferPool.Get() + b.frequency = freq + b.start = ts - (freq / 2) + b.prev = nil + b.next = nil + b.archived = false + b.closed = false + b.data = b.data[:0] + return b +} + +// write appends a timestamped value to the buffer chain. +// +// Returns the head buffer (which may be newly created if capacity was reached). +// Timestamps older than the buffer's start are rejected. If the calculated index +// exceeds capacity, a new buffer is allocated and linked as the new head. +// +// Missing timestamps are automatically filled with NaN values to maintain alignment. 
+// Overwrites are allowed if the index is already within the existing data slice. +// +// Parameters: +// - ts: Unix timestamp in seconds +// - value: Metric value (can be schema.NaN for missing data) +// +// Returns: +// - *buffer: The new head buffer (same as b if no new buffer created) +// - error: Non-nil if timestamp is before buffer start +func (b *buffer) write(ts int64, value schema.Float) (*buffer, error) { + if ts < b.start { + return nil, errors.New("[METRICSTORE]> cannot write value to buffer from past") + } + + // idx := int((ts - b.start + (b.frequency / 3)) / b.frequency) + idx := int((ts - b.start) / b.frequency) + if idx >= cap(b.data) { + newbuf := newBuffer(ts, b.frequency) + newbuf.prev = b + b.next = newbuf + b = newbuf + idx = 0 + } + + // Overwriting value or writing value from past + if idx < len(b.data) { + b.data[idx] = value + return b, nil + } + + // Fill up unwritten slots with NaN + for i := len(b.data); i < idx; i++ { + b.data = append(b.data, schema.NaN) + } + + b.data = append(b.data, value) + return b, nil +} + +func (b *buffer) end() int64 { + return b.firstWrite() + int64(len(b.data))*b.frequency +} + +func (b *buffer) firstWrite() int64 { + return b.start + (b.frequency / 2) +} + +// read retrieves time-series data from the buffer chain for the specified time range. +// +// Traverses the buffer chain backwards (via prev links) if 'from' precedes the current +// buffer's start. Missing data points are represented as NaN. Values are accumulated +// into the provided 'data' slice (using +=, so caller must zero-initialize if needed). +// +// The function adjusts the actual time range returned if data is unavailable at the +// boundaries (returned via adjusted from/to timestamps). 
+// +// Parameters: +// - from: Start timestamp (Unix seconds) +// - to: End timestamp (Unix seconds, exclusive) +// - data: Pre-allocated slice to accumulate results (must be large enough) +// +// Returns: +// - []schema.Float: Slice of data (may be shorter than input 'data' slice) +// - int64: Actual start timestamp with available data +// - int64: Actual end timestamp (exclusive) +// - error: Non-nil on failure +// +// Panics if 'data' slice is too small to hold all values in [from, to). +func (b *buffer) read(from, to int64, data []schema.Float) ([]schema.Float, int64, int64, error) { + // Walk back to the buffer that covers 'from', adjusting if we hit the oldest. + for from < b.firstWrite() { + if b.prev == nil { + from = b.firstWrite() + break + } + b = b.prev + } + + i := 0 + t := from + for ; t < to; t += b.frequency { + idx := int((t - b.start) / b.frequency) + if idx >= cap(b.data) { + if b.next == nil { + break + } + b = b.next + // Recalculate idx in the new buffer; a gap between buffers may exist. + idx = int((t - b.start) / b.frequency) + } + + if idx >= len(b.data) { + if b.next == nil || to <= b.next.start { + break + } + data[i] += schema.NaN // NaN + anything = NaN; propagates missing data + } else if t < b.start { + data[i] += schema.NaN // gap before this buffer's first write + } else { + data[i] += b.data[idx] + } + i++ + } + + return data[:i], from, t, nil +} + +// free removes buffers older than the specified timestamp from the chain. +// +// Recursively traverses backwards (via prev) and unlinks buffers whose end time +// is before the retention threshold. Freed buffers are returned to the pool if +// they have the standard capacity (BufferCap). 
+// +// Parameters: +// - t: Retention threshold timestamp (Unix seconds) +// +// Returns: +// - delme: True if the current buffer itself should be deleted by caller +// - n: Number of buffers freed in this subtree +func (b *buffer) free(t int64) (delme bool, n int) { + if b.prev != nil { + delme, m := b.prev.free(t) + n += m + if delme { + b.prev.next = nil + if cap(b.prev.data) != BufferCap { + b.prev.data = make([]schema.Float, 0, BufferCap) + } + b.prev.lastUsed = time.Now().Unix() + bufferPool.Put(b.prev) + b.prev = nil + } + } + + end := b.end() + if end < t { + return true, n + 1 + } + + return false, n +} + +// forceFreeOldest recursively finds the end of the linked list (the oldest buffer) +// and removes it. +// Returns: +// +// delme: true if 'b' itself is the oldest and should be removed by the caller +// n: the number of buffers freed (will be 1 or 0) +func (b *buffer) forceFreeOldest() (delme bool, n int) { + // If there is a previous buffer, recurse down to find the oldest + if b.prev != nil { + delPrev, freed := b.prev.forceFreeOldest() + + // If the previous buffer signals it should be deleted: + if delPrev { + b.prev.next = nil + if cap(b.prev.data) != BufferCap { + b.prev.data = make([]schema.Float, 0, BufferCap) + } + b.prev.lastUsed = time.Now().Unix() + bufferPool.Put(b.prev) + b.prev = nil + } + return false, freed + } + + // If b.prev is nil, THIS buffer is the oldest. + // We return true so the parent (or the Level loop) knows to delete reference to 'b'. + return true, 1 +} + +// iterFromTo invokes callback on every buffer in the chain that overlaps [from, to]. +// +// Traverses backwards (via prev) first, then processes current buffer if it overlaps +// the time range. Used for checkpoint/archive operations that need to serialize buffers +// within a specific time window. 
+// +// Parameters: +// - from: Start timestamp (Unix seconds, inclusive) +// - to: End timestamp (Unix seconds, inclusive) +// - callback: Function to invoke on each overlapping buffer +// +// Returns: +// - error: First error returned by callback, or nil if all succeeded +func (b *buffer) iterFromTo(from, to int64, callback func(b *buffer) error) error { + if b == nil { + return nil + } + + // Collect overlapping buffers walking backwards (newest → oldest). + var matching []*buffer + for cur := b; cur != nil; cur = cur.prev { + if from <= cur.end() && cur.start <= to { + matching = append(matching, cur) + } + } + + // Invoke callback in chronological order (oldest → newest). + for i := len(matching) - 1; i >= 0; i-- { + if err := callback(matching[i]); err != nil { + return err + } + } + return nil +} + +func (b *buffer) count() int64 { + var res int64 + for ; b != nil; b = b.prev { + res += int64(len(b.data)) + } + return res +} diff --git a/pkg/metricstore/checkpoint.go b/pkg/metricstore/checkpoint.go new file mode 100644 index 00000000..ba1f7ba0 --- /dev/null +++ b/pkg/metricstore/checkpoint.go @@ -0,0 +1,662 @@ +// Copyright (C) NHR@FAU, University Erlangen-Nuremberg. +// All rights reserved. This file is part of cc-backend. +// Use of this source code is governed by a MIT-style +// license that can be found in the LICENSE file. + +// This file implements checkpoint persistence for the in-memory metric store. +// +// Checkpoints enable graceful restarts by periodically saving in-memory metric +// data to disk in JSON or binary format. 
The checkpoint system: +// +// Key Features: +// - Periodic background checkpointing via the Checkpointing() worker +// - Two format families: JSON (human-readable) and WAL+binary (compact, crash-safe) +// - Parallel checkpoint creation and loading using worker pools +// - Hierarchical file organization: checkpoint_dir/cluster/host/timestamp.{json|bin} +// - WAL file: checkpoint_dir/cluster/host/current.wal (append-only, per-entry) +// - Only saves unarchived data (archived data is already persisted elsewhere) +// - GC optimization during loading to prevent excessive heap growth +// +// Checkpoint Workflow: +// 1. Init() loads checkpoints within retention window at startup +// 2. Checkpointing() worker periodically saves new data +// 3. Shutdown() writes final checkpoint before exit +// +// File Organization: +// +// checkpoints/ +// cluster1/ +// host001/ +// 1234567890.json (JSON format: full subtree snapshot) +// 1234567890.bin (binary format: full subtree snapshot) +// current.wal (WAL format: append-only per-entry log) +// host002/ +// ... +package metricstore + +import ( + "bufio" + "context" + "encoding/json" + "errors" + "fmt" + "io/fs" + "os" + "path" + "path/filepath" + "sort" + "strconv" + "strings" + "sync" + "sync/atomic" + "time" + + cclog "github.com/ClusterCockpit/cc-lib/v2/ccLogger" + "github.com/ClusterCockpit/cc-lib/v2/schema" +) + +const ( + CheckpointFilePerms = 0o644 // File permissions for checkpoint files + CheckpointDirPerms = 0o755 // Directory permissions for checkpoint directories + GCTriggerInterval = DefaultGCTriggerInterval // Interval for triggering GC during checkpoint loading +) + +// CheckpointMetrics represents metric data in a checkpoint file. +// Whenever the structure changes, update MarshalJSON as well! +type CheckpointMetrics struct { + Data []schema.Float `json:"data"` + Frequency int64 `json:"frequency"` + Start int64 `json:"start"` +} + +// CheckpointFile represents the hierarchical structure of a checkpoint file. 
+// It mirrors the Level tree structure from the MemoryStore. +type CheckpointFile struct { + Metrics map[string]*CheckpointMetrics `json:"metrics"` + Children map[string]*CheckpointFile `json:"children"` + From int64 `json:"from"` + To int64 `json:"to"` +} + +// lastCheckpoint tracks the timestamp of the last checkpoint creation. +var ( + lastCheckpoint time.Time + lastCheckpointMu sync.Mutex +) + +// Checkpointing starts a background worker that periodically saves metric data to disk. +// +// Checkpoints are written every 12 hours (hardcoded). +// +// Format behaviour: +// - "json": Periodic checkpointing every checkpointInterval +// - "wal": Periodic binary snapshots + WAL rotation every checkpointInterval +func Checkpointing(wg *sync.WaitGroup, ctx context.Context) { + lastCheckpointMu.Lock() + lastCheckpoint = time.Now() + lastCheckpointMu.Unlock() + + ms := GetMemoryStore() + + wg.Go(func() { + + const checkpointInterval = 12 * time.Hour + d := checkpointInterval + + ticker := time.NewTicker(d) + defer ticker.Stop() + + for { + select { + case <-ctx.Done(): + return + case <-ticker.C: + lastCheckpointMu.Lock() + from := lastCheckpoint + lastCheckpointMu.Unlock() + + now := time.Now() + cclog.Infof("[METRICSTORE]> start checkpointing (starting at %s)...", from.Format(time.RFC3339)) + + if Keys.Checkpoints.FileFormat == "wal" { + n, hostDirs, err := ms.ToCheckpointWAL(Keys.Checkpoints.RootDir, from.Unix(), now.Unix()) + if err != nil { + cclog.Errorf("[METRICSTORE]> binary checkpointing failed: %s", err.Error()) + } else { + cclog.Infof("[METRICSTORE]> done: %d binary snapshot files created", n) + lastCheckpointMu.Lock() + lastCheckpoint = now + lastCheckpointMu.Unlock() + // Rotate WAL files for successfully checkpointed hosts. 
+ RotateWALFiles(hostDirs) + } + } else { + n, err := ms.ToCheckpoint(Keys.Checkpoints.RootDir, from.Unix(), now.Unix()) + if err != nil { + cclog.Errorf("[METRICSTORE]> checkpointing failed: %s", err.Error()) + } else { + cclog.Infof("[METRICSTORE]> done: %d checkpoint files created", n) + lastCheckpointMu.Lock() + lastCheckpoint = now + lastCheckpointMu.Unlock() + } + } + } + } + }) +} + +// MarshalJSON provides optimized JSON encoding for CheckpointMetrics. +// +// Since schema.Float has custom MarshalJSON, serializing []Float has significant overhead. +// This method manually constructs JSON to avoid allocations and interface conversions. +func (cm *CheckpointMetrics) MarshalJSON() ([]byte, error) { + buf := make([]byte, 0, 128+len(cm.Data)*8) + buf = append(buf, `{"frequency":`...) + buf = strconv.AppendInt(buf, cm.Frequency, 10) + buf = append(buf, `,"start":`...) + buf = strconv.AppendInt(buf, cm.Start, 10) + buf = append(buf, `,"data":[`...) + for i, x := range cm.Data { + if i != 0 { + buf = append(buf, ',') + } + if x.IsNaN() { + buf = append(buf, `null`...) + } else { + buf = strconv.AppendFloat(buf, float64(x), 'f', 1, 32) + } + } + buf = append(buf, `]}`...) + return buf, nil +} + +// ToCheckpoint writes metric data to checkpoint files in parallel (JSON format). +// +// Metrics at root and cluster levels are skipped. One file per host is created. +// Uses worker pool (Keys.NumWorkers) for parallel processing. Only locks one host +// at a time, allowing concurrent writes/reads to other hosts. +// +// Returns the number of checkpoint files created and any errors encountered. 
+func (m *MemoryStore) ToCheckpoint(dir string, from, to int64) (int, error) { + // Pre-calculate capacity by counting cluster/host pairs + m.root.lock.RLock() + totalHosts := 0 + for _, l1 := range m.root.children { + l1.lock.RLock() + totalHosts += len(l1.children) + l1.lock.RUnlock() + } + m.root.lock.RUnlock() + + levels := make([]*Level, 0, totalHosts) + selectors := make([][]string, 0, totalHosts) + + m.root.lock.RLock() + for sel1, l1 := range m.root.children { + l1.lock.RLock() + for sel2, l2 := range l1.children { + levels = append(levels, l2) + selectors = append(selectors, []string{sel1, sel2}) + } + l1.lock.RUnlock() + } + m.root.lock.RUnlock() + + type workItem struct { + level *Level + dir string + selector []string + } + + n, errs := int32(0), int32(0) + + var wg sync.WaitGroup + wg.Add(Keys.NumWorkers) + work := make(chan workItem, Keys.NumWorkers*2) + for worker := 0; worker < Keys.NumWorkers; worker++ { + go func() { + defer wg.Done() + + for workItem := range work { + if err := workItem.level.toCheckpoint(workItem.dir, from, to, m); err != nil { + if err == ErrNoNewArchiveData { + continue + } + + cclog.Errorf("[METRICSTORE]> error while checkpointing %#v: %s", workItem.selector, err.Error()) + atomic.AddInt32(&errs, 1) + } else { + atomic.AddInt32(&n, 1) + } + } + }() + } + + for i := 0; i < len(levels); i++ { + dir := path.Join(dir, path.Join(selectors[i]...)) + work <- workItem{ + level: levels[i], + dir: dir, + selector: selectors[i], + } + } + + close(work) + wg.Wait() + + if errs > 0 { + return int(n), fmt.Errorf("[METRICSTORE]> %d errors happened while creating checkpoints (%d successes)", errs, n) + } + return int(n), nil +} + +// toCheckpointFile recursively converts a Level tree to CheckpointFile structure. +// Skips metrics that are already archived. Returns nil if no unarchived data exists. 
+func (l *Level) toCheckpointFile(from, to int64, m *MemoryStore) (*CheckpointFile, error) { + l.lock.RLock() + defer l.lock.RUnlock() + + retval := &CheckpointFile{ + From: from, + To: to, + Metrics: make(map[string]*CheckpointMetrics), + Children: make(map[string]*CheckpointFile), + } + + for metric, minfo := range m.Metrics { + b := l.metrics[minfo.offset] + if b == nil { + continue + } + + allArchived := true + b.iterFromTo(from, to, func(b *buffer) error { + if !b.archived { + allArchived = false + return fmt.Errorf("stop") // Early termination signal + } + return nil + }) + + if allArchived { + continue + } + + data := make([]schema.Float, (to-from)/b.frequency+1) + data, start, end, err := b.read(from, to, data) + if err != nil { + return nil, err + } + + for i := int((end - start) / b.frequency); i < len(data); i++ { + data[i] = schema.NaN + } + + retval.Metrics[metric] = &CheckpointMetrics{ + Frequency: b.frequency, + Start: start, + Data: data, + } + } + + for name, child := range l.children { + val, err := child.toCheckpointFile(from, to, m) + if err != nil { + return nil, err + } + + if val != nil { + retval.Children[name] = val + } + } + + if len(retval.Children) == 0 && len(retval.Metrics) == 0 { + return nil, nil + } + + return retval, nil +} + +// toCheckpoint writes a Level's data to a JSON checkpoint file. +// Creates directory if needed. Returns ErrNoNewArchiveData if nothing to save. 
+func (l *Level) toCheckpoint(dir string, from, to int64, m *MemoryStore) error { + cf, err := l.toCheckpointFile(from, to, m) + if err != nil { + return err + } + + if cf == nil { + return ErrNoNewArchiveData + } + + filepath := path.Join(dir, fmt.Sprintf("%d.json", from)) + f, err := os.OpenFile(filepath, os.O_CREATE|os.O_WRONLY|os.O_TRUNC, CheckpointFilePerms) + if err != nil && os.IsNotExist(err) { + err = os.MkdirAll(dir, CheckpointDirPerms) + if err == nil { + f, err = os.OpenFile(filepath, os.O_CREATE|os.O_WRONLY|os.O_TRUNC, CheckpointFilePerms) + } + } + if err != nil { + return err + } + defer f.Close() + + bw := bufio.NewWriter(f) + if err = json.NewEncoder(bw).Encode(cf); err != nil { + return err + } + + return bw.Flush() +} + +// enqueueCheckpointHosts traverses checkpoint directory and enqueues cluster/host pairs. +// Returns error if directory structure is invalid. +func enqueueCheckpointHosts(dir string, work chan<- [2]string) error { + clustersDir, err := os.ReadDir(dir) + if err != nil { + return err + } + + for _, clusterDir := range clustersDir { + if !clusterDir.IsDir() { + return errors.New("[METRICSTORE]> expected only directories at first level of checkpoints/ directory") + } + + hostsDir, err := os.ReadDir(filepath.Join(dir, clusterDir.Name())) + if err != nil { + return err + } + + for _, hostDir := range hostsDir { + if !hostDir.IsDir() { + return errors.New("[METRICSTORE]> expected only directories at second level of checkpoints/ directory") + } + + work <- [2]string{clusterDir.Name(), hostDir.Name()} + } + } + + return nil +} + +// FromCheckpoint loads checkpoint files from disk into memory in parallel. +// +// Uses worker pool to load cluster/host combinations. Returns number of files +// loaded and any errors. 
+func (m *MemoryStore) FromCheckpoint(dir string, from int64) (int, error) { + var wg sync.WaitGroup + work := make(chan [2]string, Keys.NumWorkers*4) + n, errs := int32(0), int32(0) + + wg.Add(Keys.NumWorkers) + for worker := 0; worker < Keys.NumWorkers; worker++ { + go func() { + defer wg.Done() + for host := range work { + lvl := m.root.findLevelOrCreate(host[:], len(m.Metrics)) + nn, err := lvl.fromCheckpoint(m, filepath.Join(dir, host[0], host[1]), from) + if err != nil { + cclog.Errorf("[METRICSTORE]> error while loading checkpoints for %s/%s: %s", host[0], host[1], err.Error()) + atomic.AddInt32(&errs, 1) + } + atomic.AddInt32(&n, int32(nn)) + } + }() + } + + err := enqueueCheckpointHosts(dir, work) + close(work) + wg.Wait() + + if err != nil { + return int(n), err + } + + if errs > 0 { + return int(n), fmt.Errorf("[METRICSTORE]> %d errors happened while creating checkpoints (%d successes)", errs, n) + } + return int(n), nil +} + +// FromCheckpointFiles is the main entry point for loading checkpoints at startup. +// +// Creates checkpoint directory if it doesn't exist. This function must be called +// before any writes or reads, and can only be called once. 
+func (m *MemoryStore) FromCheckpointFiles(dir string, from int64) (int, error) {
+	if _, err := os.Stat(dir); os.IsNotExist(err) {
+		err := os.MkdirAll(dir, CheckpointDirPerms)
+		if err != nil {
+			cclog.Fatalf("[METRICSTORE]> Error creating directory: %#v\n", err)
+		}
+		cclog.Debugf("[METRICSTORE]> %#v Directory created successfully", dir)
+	}
+
+	return m.FromCheckpoint(dir, from)
+}
+
+// loadJSONFile decodes a single JSON checkpoint file into the Level subtree.
+// Files whose To timestamp ends before 'from' are skipped entirely (a zero To
+// is treated as "unknown" and never skipped).
+func (l *Level) loadJSONFile(m *MemoryStore, f *os.File, from int64) error {
+	br := bufio.NewReader(f)
+	cf := &CheckpointFile{}
+	if err := json.NewDecoder(br).Decode(cf); err != nil {
+		return err
+	}
+
+	// The whole file lies before the requested load window; nothing to do.
+	if cf.To != 0 && cf.To < from {
+		return nil
+	}
+
+	return l.loadFile(cf, m)
+}
+
+// loadFile recursively merges a decoded CheckpointFile subtree into the Level
+// tree. Each metric's data becomes a new (already archived) buffer appended
+// as the head of that metric's buffer chain.
+//
+// Returns an error if a checkpoint buffer starts before the current head
+// buffer, which would corrupt the chronological buffer chain.
+func (l *Level) loadFile(cf *CheckpointFile, m *MemoryStore) error {
+	for name, metric := range cf.Metrics {
+		// Skip metrics that are not part of the current configuration
+		// before allocating anything for them.
+		minfo, ok := m.Metrics[name]
+		if !ok {
+			continue
+		}
+
+		n := len(metric.Data)
+		b := &buffer{
+			frequency: metric.Frequency,
+			start:     metric.Start,
+			data:      metric.Data[0:n:n], // Reuse the decoded slice, capacity clamped to length
+			prev:      nil,
+			next:      nil,
+			archived:  true,
+		}
+
+		if prev := l.metrics[minfo.offset]; prev != nil {
+			if prev.start > b.start {
+				return fmt.Errorf("[METRICSTORE]> buffer start time %d is before previous buffer start %d", b.start, prev.start)
+			}
+
+			b.prev = prev
+			prev.next = b
+		}
+		// The newly loaded buffer always becomes the head of the chain
+		// (a single assignment replaces the previous redundant pair).
+		l.metrics[minfo.offset] = b
+	}
+
+	if len(cf.Children) > 0 && l.children == nil {
+		l.children = make(map[string]*Level)
+	}
+
+	for sel, childCf := range cf.Children {
+		child, ok := l.children[sel]
+		if !ok {
+			child = &Level{
+				metrics:  make([]*buffer, len(m.Metrics)),
+				children: nil,
+			}
+			l.children[sel] = child
+		}
+
+		if err := child.loadFile(childCf, m); err != nil {
+			return err
+		}
+	}
+
+	return nil
+}
+
+// fromCheckpoint loads all checkpoint files (JSON, binary snapshot, WAL) for a
+// single host directory. Snapshot files are loaded first (sorted by timestamp),
+// then current.wal is replayed on top.
+func (l *Level) fromCheckpoint(m *MemoryStore, dir string, from int64) (int, error) {
+	direntries, err := os.ReadDir(dir)
+	if err != nil {
+		// A missing host directory simply means no checkpoints exist yet.
+		if os.IsNotExist(err) {
+			return 0, nil
+		}
+		return 0, err
+	}
+
+	allFiles := make([]fs.DirEntry, 0)
+	var walEntry fs.DirEntry
+	filesLoaded := 0
+
+	for _, e := range direntries {
+		if e.IsDir() {
+			// Legacy: skip subdirectories (only used by old Avro format).
+			// These are ignored; their data is not loaded.
+			cclog.Debugf("[METRICSTORE]> skipping subdirectory %s in checkpoint dir %s", e.Name(), dir)
+			continue
+		}
+
+		name := e.Name()
+		if strings.HasSuffix(name, ".json") || strings.HasSuffix(name, ".bin") {
+			allFiles = append(allFiles, e)
+		} else if name == "current.wal" {
+			walEntry = e
+		}
+		// Silently ignore other files (e.g., .tmp, .bin.tmp from interrupted writes).
+	}
+
+	// Select snapshot files overlapping or newer than 'from' (sorted ascending).
+	files, err := findFiles(allFiles, from, true)
+	if err != nil {
+		return filesLoaded, err
+	}
+
+	// Dispatch table: extension -> loader for that snapshot format.
+	loaders := map[string]func(*MemoryStore, *os.File, int64) error{
+		".json": l.loadJSONFile,
+		".bin":  l.loadBinaryFile,
+	}
+
+	for _, filename := range files {
+		ext := filepath.Ext(filename)
+		loader := loaders[ext]
+		if loader == nil {
+			cclog.Warnf("[METRICSTORE]> unknown extension for checkpoint file %s", filename)
+			continue
+		}
+
+		// Closure scopes the defer f.Close() to each file iteration.
+		err := func() error {
+			f, err := os.Open(path.Join(dir, filename))
+			if err != nil {
+				return err
+			}
+			defer f.Close()
+			return loader(m, f, from)
+		}()
+		if err != nil {
+			return filesLoaded, err
+		}
+		filesLoaded++
+	}
+
+	// Replay WAL after all snapshot files so it fills in data since the last snapshot.
+	if walEntry != nil {
+		err := func() error {
+			f, err := os.Open(path.Join(dir, walEntry.Name()))
+			if err != nil {
+				return err
+			}
+			defer f.Close()
+			return l.loadWALFile(m, f, from)
+		}()
+		if err != nil {
+			// WAL errors are non-fatal: the snapshot already loaded the bulk of data.
+			cclog.Warnf("[METRICSTORE]> WAL replay error for %s: %v (data since last snapshot may be missing)", dir, err)
+		} else {
+			filesLoaded++
+		}
+	}
+
+	return filesLoaded, nil
+}
+
+// parseTimestampFromFilename extracts a Unix timestamp from a checkpoint filename.
+// Supported name formats: "<unix_ts>.json" and "<unix_ts>.bin".
+func parseTimestampFromFilename(name string) (int64, error) {
+	switch {
+	case strings.HasSuffix(name, ".json"):
+		return strconv.ParseInt(name[:len(name)-5], 10, 64)
+	case strings.HasSuffix(name, ".bin"):
+		return strconv.ParseInt(name[:len(name)-4], 10, 64)
+	default:
+		return 0, fmt.Errorf("unknown checkpoint extension for file %q", name)
+	}
+}
+
+// findFiles returns filenames from direntries whose timestamps satisfy the filter.
+// If findMoreRecentFiles is true, returns files with timestamps >= t (plus the
+// last file before t if t falls between two files).
+//
+// NOTE(review): sort.Slice below reorders the caller's direntries slice in
+// place — callers must not rely on the original order afterwards.
+// NOTE(review): when findMoreRecentFiles is true and t is newer than every
+// file, no file is returned (the straddle case only triggers *between* two
+// files); confirm this matches the intended retention semantics.
+func findFiles(direntries []fs.DirEntry, t int64, findMoreRecentFiles bool) ([]string, error) {
+	// Map each valid checkpoint filename to its parsed timestamp.
+	nums := map[string]int64{}
+	for _, e := range direntries {
+		name := e.Name()
+		// Only .json/.bin snapshot files carry timestamps; skip everything else.
+		if !strings.HasSuffix(name, ".json") && !strings.HasSuffix(name, ".bin") {
+			continue
+		}
+
+		ts, err := parseTimestampFromFilename(name)
+		if err != nil {
+			return nil, err
+		}
+		nums[name] = ts
+	}
+
+	// Sort ascending by timestamp (entries without a timestamp sort as 0).
+	sort.Slice(direntries, func(i, j int) bool {
+		a, b := direntries[i], direntries[j]
+		return nums[a.Name()] < nums[b.Name()]
+	})
+
+	if len(nums) == 0 {
+		return nil, nil
+	}
+
+	filenames := make([]string, 0)
+
+	for i, e := range direntries {
+		ts1 := nums[e.Name()]
+
+		if findMoreRecentFiles && t <= ts1 {
+			filenames = append(filenames, e.Name())
+		} else if !findMoreRecentFiles && ts1 <= t && ts1 != 0 {
+			// ts1 != 0 excludes entries that never had a parsed timestamp.
+			filenames = append(filenames, e.Name())
+		}
+
+		if i == len(direntries)-1 {
+			continue
+		}
+
+		enext := direntries[i+1]
+		ts2 := nums[enext.Name()]
+
+		if findMoreRecentFiles {
+			// Straddle case: t lies strictly between this file and the next,
+			// so this file is the last one containing data before t.
+			if ts1 < t && t < ts2 {
+				filenames = append(filenames, e.Name())
+			}
+		}
+	}
+
+	return filenames, nil
+}
diff
--git a/pkg/metricstore/config.go b/pkg/metricstore/config.go new file mode 100644 index 00000000..3b6be529 --- /dev/null +++ b/pkg/metricstore/config.go @@ -0,0 +1,253 @@ +// Copyright (C) NHR@FAU, University Erlangen-Nuremberg. +// All rights reserved. This file is part of cc-backend. +// Use of this source code is governed by a MIT-style +// license that can be found in the LICENSE file. + +// Package metricstore provides config.go: Configuration structures and metric management. +// +// # Configuration Hierarchy +// +// The metricstore package uses nested configuration structures: +// +// MetricStoreConfig (Keys) +// ├─ NumWorkers: Parallel checkpoint/archive workers +// ├─ RetentionInMemory: How long to keep data in RAM (also used as cleanup interval) +// ├─ MemoryCap: Memory limit in bytes (triggers forceFree) +// ├─ Checkpoints: Persistence configuration +// │ ├─ FileFormat: "json" or "wal" (default: "wal") +// │ └─ RootDir: Checkpoint storage path +// ├─ Cleanup: Long-term storage configuration (interval = RetentionInMemory) +// │ ├─ RootDir: Archive storage path (archive mode only) +// │ └─ Mode: "delete" or "archive" +// ├─ Debug: Development/debugging options +// └─ Subscriptions: NATS topic subscriptions for metric ingestion +// +// # Metric Configuration +// +// Each metric (e.g., "cpu_load", "mem_used") has a MetricConfig entry in the global +// Metrics map, defining: +// +// - Frequency: Measurement interval in seconds +// - Aggregation: How to combine values (sum/avg/none) when transforming scopes +// - offset: Internal index into Level.metrics slice (assigned during Init) +// +// # AggregationStrategy +// +// Determines how to combine metric values when aggregating from finer to coarser scopes: +// +// - NoAggregation: Do not combine (incompatible scopes) +// - SumAggregation: Add values (e.g., power consumption: core→socket) +// - AvgAggregation: Average values (e.g., temperature: core→socket) +package metricstore + +import ( + "fmt" + "time" + + 
"github.com/ClusterCockpit/cc-backend/pkg/archive" + cclog "github.com/ClusterCockpit/cc-lib/v2/ccLogger" + "github.com/ClusterCockpit/cc-lib/v2/schema" +) + +const ( + DefaultMaxWorkers = 10 + DefaultBufferCapacity = 512 + DefaultGCTriggerInterval = 100 + DefaultMemoryUsageTrackerInterval = 1 * time.Hour +) + +// Checkpoints configures periodic persistence of in-memory metric data. +// +// Fields: +// - FileFormat: "json" (human-readable, periodic) or "wal" (binary snapshot + WAL, crash-safe); default is "wal" +// - RootDir: Filesystem path for checkpoint files (created if missing) +type Checkpoints struct { + FileFormat string `json:"file-format"` + RootDir string `json:"directory"` +} + +// Debug provides development and profiling options. +// +// Fields: +// - DumpToFile: Path to dump checkpoint data for inspection (empty = disabled) +// - EnableGops: Enable gops agent for live runtime debugging (https://github.com/google/gops) +type Debug struct { + DumpToFile string `json:"dump-to-file"` + EnableGops bool `json:"gops"` +} + +// Cleanup configures long-term storage of old metric data. +// +// Data older than RetentionInMemory is archived to disk or deleted. +// The cleanup interval is always RetentionInMemory. +// +// Fields: +// - RootDir: Filesystem path for archived data (used in "archive" mode) +// - Mode: "delete" (discard old data) or "archive" (write to RootDir) +type Cleanup struct { + RootDir string `json:"directory"` + Mode string `json:"mode"` +} + +// Subscriptions defines NATS topics to subscribe to for metric ingestion. +// +// Each subscription receives metrics via NATS messaging, enabling real-time +// data collection from compute nodes. 
+// +// Fields: +// - SubscribeTo: NATS subject/channel name (e.g., "metrics.compute.*") +// - ClusterTag: Default cluster name for metrics without cluster tag (optional) +type Subscriptions []struct { + // Channel name + SubscribeTo string `json:"subscribe-to"` + + // Allow lines without a cluster tag, use this as default, optional + ClusterTag string `json:"cluster-tag"` +} + +// MetricStoreConfig defines the main configuration for the metricstore. +// +// Loaded from cc-backend's config.json "metricstore" section. Controls memory usage, +// persistence, archiving, and metric ingestion. +// +// Fields: +// - NumWorkers: Parallel workers for checkpoint/archive (0 = auto: min(NumCPU/2+1, 10)) +// - RetentionInMemory: Duration string (e.g., "48h") for in-memory data retention +// - MemoryCap: Max bytes for buffer data (0 = unlimited); triggers forceFree when exceeded +// - Checkpoints: Periodic persistence configuration +// - Debug: Development/profiling options (nil = disabled) +// - Archive: Long-term storage configuration (nil = disabled) +// - Subscriptions: NATS topics for metric ingestion (nil = polling only) +type MetricStoreConfig struct { + // Number of concurrent workers for checkpoint and archive operations. + // If not set or 0, defaults to min(runtime.NumCPU()/2+1, 10) + NumWorkers int `json:"num-workers"` + RetentionInMemory string `json:"retention-in-memory"` + MemoryCap int `json:"memory-cap"` + Checkpoints Checkpoints `json:"checkpoints"` + Debug *Debug `json:"debug"` + Cleanup *Cleanup `json:"cleanup"` + Subscriptions *Subscriptions `json:"nats-subscriptions"` +} + +// Keys is the global metricstore configuration instance. +// +// Initialized with defaults, then overwritten by cc-backend's config.json. +// Accessed by Init(), Checkpointing(), and other lifecycle functions. 
+var Keys MetricStoreConfig = MetricStoreConfig{ + Checkpoints: Checkpoints{ + FileFormat: "wal", + RootDir: "./var/checkpoints", + }, + Cleanup: &Cleanup{ + Mode: "delete", + }, +} + +// AggregationStrategy defines how to combine metric values across hierarchy levels. +// +// Used when transforming data from finer-grained scopes (e.g., core) to coarser scopes +// (e.g., socket). This is SPATIAL aggregation, not TEMPORAL (time-based) aggregation. +// +// Values: +// - NoAggregation: Do not aggregate (incompatible scopes or non-aggregatable metrics) +// - SumAggregation: Add values (e.g., power: sum core power → socket power) +// - AvgAggregation: Average values (e.g., temperature: average core temps → socket temp) +type AggregationStrategy int + +const ( + NoAggregation AggregationStrategy = iota // Do not aggregate + SumAggregation // Sum values (e.g., power, energy) + AvgAggregation // Average values (e.g., temperature, utilization) +) + +// AssignAggregationStrategy parses a string into an AggregationStrategy value. +// +// Used when loading metric configurations from JSON/YAML files. +// +// Parameters: +// - str: "sum", "avg", or "" (empty string for NoAggregation) +// +// Returns: +// - AggregationStrategy: Parsed value +// - error: Non-nil if str is unrecognized +func AssignAggregationStrategy(str string) (AggregationStrategy, error) { + switch str { + case "": + return NoAggregation, nil + case "sum": + return SumAggregation, nil + case "avg": + return AvgAggregation, nil + default: + return NoAggregation, fmt.Errorf("[METRICSTORE]> unknown aggregation strategy: %s", str) + } +} + +// MetricConfig defines configuration for a single metric type. +// +// Stored in the global Metrics map, keyed by metric name (e.g., "cpu_load"). 
+// +// Fields: +// - Frequency: Measurement interval in seconds (e.g., 60 for 1-minute granularity) +// - Aggregation: How to combine values across hierarchy levels (sum/avg/none) +// - offset: Internal index into Level.metrics slice (assigned during Init) +type MetricConfig struct { + // Interval in seconds at which measurements are stored + Frequency int64 + + // Can be 'sum', 'avg' or null. Describes how to aggregate metrics from the same timestep over the hierarchy. + Aggregation AggregationStrategy + + // Private, used internally... + offset int +} + +func BuildMetricList() map[string]MetricConfig { + var metrics map[string]MetricConfig = make(map[string]MetricConfig) + + addMetric := func(name string, metric MetricConfig) error { + if metrics == nil { + metrics = make(map[string]MetricConfig, 0) + } + + if existingMetric, ok := metrics[name]; ok { + if existingMetric.Frequency != metric.Frequency { + if existingMetric.Frequency < metric.Frequency { + existingMetric.Frequency = metric.Frequency + metrics[name] = existingMetric + } + } + } else { + metrics[name] = metric + } + + return nil + } + + // Helper function to add metric configuration + addMetricConfig := func(mc *schema.MetricConfig) { + agg, err := AssignAggregationStrategy(mc.Aggregation) + if err != nil { + cclog.Warnf("Could not find aggregation strategy for metric config '%s': %s", mc.Name, err.Error()) + } + + addMetric(mc.Name, MetricConfig{ + Frequency: int64(mc.Timestep), + Aggregation: agg, + }) + } + for _, c := range archive.Clusters { + for _, mc := range c.MetricConfig { + addMetricConfig(mc) + } + + for _, sc := range c.SubClusters { + for _, mc := range sc.MetricConfig { + addMetricConfig(mc) + } + } + } + + return metrics +} diff --git a/pkg/metricstore/configSchema.go b/pkg/metricstore/configSchema.go new file mode 100644 index 00000000..ed9bccaa --- /dev/null +++ b/pkg/metricstore/configSchema.go @@ -0,0 +1,81 @@ +// Copyright (C) NHR@FAU, University Erlangen-Nuremberg. 
+// All rights reserved. This file is part of cc-backend. +// Use of this source code is governed by a MIT-style +// license that can be found in the LICENSE file. + +package metricstore + +const configSchema = `{ + "type": "object", + "description": "Configuration specific to built-in metric-store.", + "properties": { + "num-workers": { + "description": "Number of concurrent workers for checkpoint and archive operations", + "type": "integer" + }, + "checkpoints": { + "description": "Configuration for checkpointing the metrics buffers", + "type": "object", + "properties": { + "file-format": { + "description": "Specify the format for checkpoint files. Two variants: 'json' (human-readable, periodic) and 'wal' (binary snapshot + Write-Ahead Log, crash-safe). Default is 'wal'.", + "type": "string" + }, + "directory": { + "description": "Path in which the checkpointed files should be placed.", + "type": "string" + } + } + }, + "cleanup": { + "description": "Configuration for the cleanup process. The cleanup interval is always 'retention-in-memory'.", + "type": "object", + "properties": { + "mode": { + "description": "The operation mode (e.g., 'archive' or 'delete').", + "type": "string", + "enum": ["archive", "delete"] + }, + "directory": { + "description": "Target directory for archive operations.", + "type": "string" + } + }, + "if": { + "properties": { + "mode": { "const": "archive" } + } + }, + "then": { + "required": ["directory"] + } + }, + "retention-in-memory": { + "description": "Keep the metrics within memory for given time interval. Retention for X hours, then the metrics would be freed.", + "type": "string" + }, + "memory-cap": { + "description": "Upper memory capacity limit used by metricstore in GB", + "type": "integer" + }, + "nats-subscriptions": { + "description": "Array of various subscriptions. 
Allows to subscribe to different subjects and publishers.", + "type": "array", + "items": { + "type": "object", + "properties": { + "subscribe-to": { + "description": "Subject name", + "type": "string" + }, + "cluster-tag": { + "description": "Optional: Allow lines without a cluster tag, use this as default", + "type": "string" + } + }, + "required": ["subscribe-to"] + } + } + }, + "required": ["retention-in-memory", "memory-cap"] +}` diff --git a/internal/memorystore/debug.go b/pkg/metricstore/debug.go similarity index 99% rename from internal/memorystore/debug.go rename to pkg/metricstore/debug.go index b56cf254..50c91e08 100644 --- a/internal/memorystore/debug.go +++ b/pkg/metricstore/debug.go @@ -3,7 +3,7 @@ // Use of this source code is governed by a MIT-style // license that can be found in the LICENSE file. -package memorystore +package metricstore import ( "bufio" diff --git a/pkg/metricstore/healthcheck.go b/pkg/metricstore/healthcheck.go new file mode 100644 index 00000000..b3470a14 --- /dev/null +++ b/pkg/metricstore/healthcheck.go @@ -0,0 +1,197 @@ +// Copyright (C) NHR@FAU, University Erlangen-Nuremberg. +// All rights reserved. This file is part of cc-backend. +// Use of this source code is governed by a MIT-style +// license that can be found in the LICENSE file. + +package metricstore + +import ( + "encoding/json" + "fmt" + "time" + + cclog "github.com/ClusterCockpit/cc-lib/v2/ccLogger" + "github.com/ClusterCockpit/cc-lib/v2/schema" +) + +// HealthCheckResponse represents the result of a health check operation. +type HealthCheckResponse struct { + Status schema.MonitoringState + Error error +} + +// HealthCheckResult holds the monitoring state and raw JSON health metrics +// for a single node as determined by HealthCheck. +type HealthCheckResult struct { + State schema.MonitoringState + HealthMetrics string // JSON: {"missing":[...],"degraded":[...]} +} + +// MaxMissingDataPoints is the threshold for stale data detection. 
+// A buffer is considered healthy if the gap between its last data point +// and the current time is within MaxMissingDataPoints * frequency. +const MaxMissingDataPoints int64 = 5 + +// bufferExists returns true if the buffer is non-nil and contains data. +func (b *buffer) bufferExists() bool { + if b == nil || b.data == nil || len(b.data) == 0 { + return false + } + + return true +} + +// isBufferHealthy returns true if the buffer has recent data within +// MaxMissingDataPoints * frequency of the current time. +func (b *buffer) isBufferHealthy() bool { + bufferEnd := b.start + b.frequency*int64(len(b.data)) + t := time.Now().Unix() + + return t-bufferEnd <= MaxMissingDataPoints*b.frequency +} + +// collectMetricStatus walks the subtree rooted at l and classifies each +// expected metric into the healthy or degraded map. +// +// Classification rules (evaluated per buffer, pessimistic): +// - A single stale buffer marks the metric as degraded permanently. +// - A healthy buffer only counts if no stale buffer has been seen. +// - Metrics absent from the global config or without any buffer remain +// in neither map and are later reported as missing. +func (l *Level) collectMetricStatus(m *MemoryStore, expectedMetrics []string, healthy, degraded map[string]bool) { + l.lock.RLock() + defer l.lock.RUnlock() + + for _, metricName := range expectedMetrics { + if degraded[metricName] { + continue // already degraded, cannot improve + } + mc := m.Metrics[metricName] + b := l.metrics[mc.offset] + if b.bufferExists() { + if !b.isBufferHealthy() { + degraded[metricName] = true + delete(healthy, metricName) + } else if !degraded[metricName] { + healthy[metricName] = true + } + } + } + + for _, lvl := range l.children { + lvl.collectMetricStatus(m, expectedMetrics, healthy, degraded) + } +} + +// getHealthyMetrics walks the complete subtree rooted at l and classifies +// each expected metric by comparing the collected status against the +// expected list. 
+//
+// Returns (in this order, matching the return statement):
+// - degradedList: metrics with at least one stale buffer in the subtree
+// - missingList: metrics not found in global config or without any buffer
+func (l *Level) getHealthyMetrics(m *MemoryStore, expectedMetrics []string) ([]string, []string) {
+	healthy := make(map[string]bool, len(expectedMetrics))
+	degraded := make(map[string]bool)
+
+	l.collectMetricStatus(m, expectedMetrics, healthy, degraded)
+
+	missingList := make([]string, 0)
+	degradedList := make([]string, 0)
+
+	// A metric absent from both maps was never seen healthy nor stale,
+	// so it is reported as missing.
+	for _, metricName := range expectedMetrics {
+		if healthy[metricName] {
+			continue
+		}
+
+		if degraded[metricName] {
+			degradedList = append(degradedList, metricName)
+		} else {
+			missingList = append(missingList, metricName)
+		}
+	}
+
+	return degradedList, missingList
+}
+
+// GetHealthyMetrics returns degraded and missing metric lists (in that order)
+// for a node.
+//
+// It walks the metric tree starting from the node identified by selector
+// and classifies each expected metric:
+// - Missing: no buffer anywhere in the subtree, or metric not in global config
+// - Degraded: at least one stale buffer exists in the subtree
+//
+// Metrics present in expectedMetrics but absent from both returned lists
+// are considered fully healthy.
+func (m *MemoryStore) GetHealthyMetrics(selector []string, expectedMetrics []string) ([]string, []string, error) {
+	lvl := m.root.findLevel(selector)
+	if lvl == nil {
+		return nil, nil, fmt.Errorf("[METRICSTORE]> GetHealthyMetrics: host not found: %#v", selector)
+	}
+
+	degradedList, missingList := lvl.getHealthyMetrics(m, expectedMetrics)
+	return degradedList, missingList, nil
+}
+
+// HealthCheckReq is the JSON request payload for the health check endpoint:
+// a cluster name plus the nodes and metric names to evaluate.
+type HealthCheckReq struct {
+	Cluster     string   `json:"cluster" example:"fritz"`
+	Nodes       []string `json:"nodes"`
+	MetricNames []string `json:"metric-names"`
+}
+
+// HealthCheck evaluates multiple nodes against a set of expected metrics
+// and returns a monitoring state per node.
+// +// States: +// - MonitoringStateFull: all expected metrics are healthy +// - MonitoringStatePartial: some metrics are missing or degraded +// - MonitoringStateFailed: node not found, or no healthy metrics at all +func (m *MemoryStore) HealthCheck(cluster string, + nodes []string, expectedMetrics []string, +) (map[string]HealthCheckResult, error) { + results := make(map[string]HealthCheckResult, len(nodes)) + + for _, hostname := range nodes { + selector := []string{cluster, hostname} + + degradedList, missingList, err := m.GetHealthyMetrics(selector, expectedMetrics) + if err != nil { + results[hostname] = HealthCheckResult{State: schema.MonitoringStateFailed} + continue + } + + degradedCount := len(degradedList) + missingCount := len(missingList) + + healthyCount := len(expectedMetrics) - degradedCount - missingCount + + if degradedCount > 0 { + cclog.ComponentDebug("metricstore", "HealthCheck: node ", hostname, "degraded metrics:", degradedList) + } + if missingCount > 0 { + cclog.ComponentDebug("metricstore", "HealthCheck: node ", hostname, "missing metrics:", missingList) + } + + var state schema.MonitoringState + switch { + case degradedCount == 0 && missingCount == 0: + state = schema.MonitoringStateFull + case healthyCount == 0: + state = schema.MonitoringStateFailed + default: + state = schema.MonitoringStatePartial + } + + hm, _ := json.Marshal(map[string][]string{ + "missing": missingList, + "degraded": degradedList, + }) + + results[hostname] = HealthCheckResult{ + State: state, + HealthMetrics: string(hm), + } + } + + return results, nil +} diff --git a/pkg/metricstore/level.go b/pkg/metricstore/level.go new file mode 100644 index 00000000..2b24a2ea --- /dev/null +++ b/pkg/metricstore/level.go @@ -0,0 +1,393 @@ +// Copyright (C) NHR@FAU, University Erlangen-Nuremberg. +// All rights reserved. This file is part of cc-backend. +// Use of this source code is governed by a MIT-style +// license that can be found in the LICENSE file. 
+ +// Package metricstore provides level.go: Hierarchical tree structure for metric storage. +// +// # Level Architecture +// +// The Level type forms a tree structure where each node represents a level in the +// ClusterCockpit hierarchy: cluster → host → socket → core → hwthread, with special +// nodes for memory domains and accelerators. +// +// Structure: +// +// Root Level (cluster="emmy") +// ├─ Level (host="node001") +// │ ├─ Level (socket="0") +// │ │ ├─ Level (core="0") [stores cpu0 metrics] +// │ │ └─ Level (core="1") [stores cpu1 metrics] +// │ └─ Level (socket="1") +// │ └─ ... +// └─ Level (host="node002") +// └─ ... +// +// Each Level can: +// - Hold data (metrics slice of buffer pointers) +// - Have child nodes (children map[string]*Level) +// - Both simultaneously (inner nodes can store aggregated metrics) +// +// # Selector Paths +// +// Selectors are hierarchical paths: []string{"cluster", "host", "component"}. +// Example: []string{"emmy", "node001", "cpu0"} navigates to the cpu0 core level. +// +// # Concurrency +// +// RWMutex protects children map and metrics slice. Read-heavy workload (metric reads) +// uses RLock. Writes (new levels, buffer updates) use Lock. Double-checked locking +// prevents races during level creation. +package metricstore + +import ( + "sync" + "time" + "unsafe" + + "github.com/ClusterCockpit/cc-lib/v2/schema" + "github.com/ClusterCockpit/cc-lib/v2/util" +) + +// Level represents a node in the hierarchical metric storage tree. +// +// Can be both a leaf or inner node. Inner nodes hold data in 'metrics' for aggregated +// values (e.g., socket-level metrics derived from core-level data). Named "Level" +// instead of "node" to avoid confusion with cluster nodes (hosts). 
+// +// Fields: +// - children: Map of child level names to Level pointers (e.g., "cpu0" → Level) +// - metrics: Slice of buffer pointers (one per metric, indexed by MetricConfig.offset) +// - lock: RWMutex for concurrent access (read-heavy, write-rare) +type Level struct { + children map[string]*Level + metrics []*buffer + lock sync.RWMutex +} + +// findLevelOrCreate navigates to or creates the level specified by selector. +// +// Recursively descends the tree, creating missing levels as needed. Uses double-checked +// locking: RLock first (fast path), then Lock if creation needed (slow path), then +// re-check after acquiring Lock to handle races. +// +// Example selector: []string{"emmy", "node001", "cpu0"} +// Navigates: root → emmy → node001 → cpu0, creating levels as needed. +// +// Parameters: +// - selector: Hierarchical path (consumed recursively, decreasing depth) +// - nMetrics: Number of metric slots to allocate in new levels +// +// Returns: +// - *Level: The target level (existing or newly created) +// +// Note: sync.Map may improve performance for high-concurrency writes, but current +// approach suffices for read-heavy workload. +func (l *Level) findLevelOrCreate(selector []string, nMetrics int) *Level { + if len(selector) == 0 { + return l + } + + // Allow concurrent reads: + l.lock.RLock() + var child *Level + var ok bool + if l.children == nil { + // Children map needs to be created... + l.lock.RUnlock() + } else { + child, ok = l.children[selector[0]] + l.lock.RUnlock() + if ok { + return child.findLevelOrCreate(selector[1:], nMetrics) + } + } + + // The level does not exist, take write lock for unique access: + l.lock.Lock() + // While this thread waited for the write lock, another thread + // could have created the child node. 
+ if l.children != nil { + child, ok = l.children[selector[0]] + if ok { + l.lock.Unlock() + return child.findLevelOrCreate(selector[1:], nMetrics) + } + } + + child = &Level{ + metrics: make([]*buffer, nMetrics), + children: nil, + } + + if l.children != nil { + l.children[selector[0]] = child + } else { + l.children = map[string]*Level{selector[0]: child} + } + l.lock.Unlock() + return child.findLevelOrCreate(selector[1:], nMetrics) +} + +// collectPaths gathers all selector paths at the specified depth in the tree. +// +// Recursively traverses children, collecting paths when currentDepth+1 == targetDepth. +// Each path is a selector that can be used with findLevel() or findBuffers(). +// +// Explicitly copies slices to avoid shared underlying arrays between siblings, preventing +// unintended mutations. +// +// Parameters: +// - currentDepth: Depth of current level (0 = root) +// - targetDepth: Depth to collect paths from +// - currentPath: Path accumulated so far +// - results: Output slice (appended to) +// +// Example: collectPaths(0, 2, []string{}, &results) collects all 2-level paths +// like []string{"emmy", "node001"}, []string{"emmy", "node002"}, etc. +func (l *Level) collectPaths(currentDepth, targetDepth int, currentPath []string, results *[][]string) { + l.lock.RLock() + defer l.lock.RUnlock() + + for key, child := range l.children { + if child == nil { + continue + } + + // We explicitly make a new slice and copy data to avoid sharing underlying arrays between siblings + newPath := make([]string, len(currentPath)) + copy(newPath, currentPath) + newPath = append(newPath, key) + + // Check depth, and just return if depth reached + if currentDepth+1 == targetDepth { + *results = append(*results, newPath) + } else { + child.collectPaths(currentDepth+1, targetDepth, newPath, results) + } + } +} + +// free removes buffers older than the retention threshold from the entire subtree. 
+// +// Recursively frees buffers in this level's metrics and all child levels. Buffers +// with standard capacity (BufferCap) are returned to the pool. Called by the +// retention worker to enforce retention policies. +// +// Parameters: +// - t: Retention threshold timestamp (Unix seconds) +// +// Returns: +// - int: Total number of buffers freed in this subtree +// - error: Non-nil on failure (propagated from children) +func (l *Level) free(t int64) (int, error) { + l.lock.Lock() + defer l.lock.Unlock() + + n := 0 + for i, b := range l.metrics { + if b != nil { + delme, m := b.free(t) + n += m + if delme { + if cap(b.data) != BufferCap { + b.data = make([]schema.Float, 0, BufferCap) + } + b.lastUsed = time.Now().Unix() + bufferPool.Put(b) + l.metrics[i] = nil + } + } + } + + for _, l := range l.children { + m, err := l.free(t) + n += m + if err != nil { + return n, err + } + } + + return n, nil +} + +// forceFree removes the oldest buffer from each metric chain in the subtree. +// +// Unlike free(), which removes based on time threshold, this unconditionally removes +// the oldest buffer in each chain. Used by MemoryUsageTracker when memory cap is +// exceeded and time-based retention is insufficient. +// +// Recursively processes current level's metrics and all child levels. +// +// Returns: +// - int: Total number of buffers freed in this subtree +// - error: Non-nil on failure (propagated from children) +func (l *Level) forceFree() (int, error) { + l.lock.Lock() + defer l.lock.Unlock() + + n := 0 + + // Iterate over metrics in the current level + for i, b := range l.metrics { + if b != nil { + // Attempt to free the oldest buffer in this chain + delme, freedCount := b.forceFreeOldest() + n += freedCount + + // If delme is true, it means 'b' itself (the head) was the oldest + // and needs to be removed from the slice. 
+ if delme { + b.next = nil + b.prev = nil + if cap(b.data) != BufferCap { + b.data = make([]schema.Float, 0, BufferCap) + } + b.lastUsed = time.Now().Unix() + bufferPool.Put(b) + l.metrics[i] = nil + } + } + } + + // Recursively traverse children + for _, child := range l.children { + m, err := child.forceFree() + n += m + if err != nil { + return n, err + } + } + + return n, nil +} + +// sizeInBytes calculates the total memory usage of all buffers in the subtree. +// +// Recursively sums buffer data sizes (count of Float values × sizeof(Float)) across +// this level's metrics and all child levels. Used by MemoryUsageTracker to enforce +// memory cap limits. +// +// Returns: +// - int64: Total bytes used by buffer data in this subtree +func (l *Level) sizeInBytes() int64 { + l.lock.RLock() + defer l.lock.RUnlock() + size := int64(0) + + for _, b := range l.metrics { + if b != nil { + size += b.count() * int64(unsafe.Sizeof(schema.Float(0))) + } + } + + for _, child := range l.children { + size += child.sizeInBytes() + } + + return size +} + +// findLevel navigates to the level specified by selector, returning nil if not found. +// +// Read-only variant of findLevelOrCreate. Does not create missing levels. +// Recursively descends the tree following the selector path. +// +// Parameters: +// - selector: Hierarchical path (e.g., []string{"emmy", "node001", "cpu0"}) +// +// Returns: +// - *Level: The target level, or nil if any component in the path does not exist +func (l *Level) findLevel(selector []string) *Level { + if len(selector) == 0 { + return l + } + + l.lock.RLock() + defer l.lock.RUnlock() + + lvl := l.children[selector[0]] + if lvl == nil { + return nil + } + + return lvl.findLevel(selector[1:]) +} + +// findBuffers invokes callback on all buffers matching the selector pattern. 
+// +// Supports flexible selector patterns (from cc-lib/util.Selector): +// - Exact match: Selector element with String set (e.g., "node001") +// - Group match: Selector element with Group set (e.g., ["cpu0", "cpu2", "cpu4"]) +// - Wildcard: Selector element with Any=true (matches all children) +// +// Empty selector (len==0) matches current level's buffer at 'offset' and recursively +// all descendant buffers at the same offset (used for aggregation queries). +// +// Parameters: +// - selector: Pattern to match (consumed recursively) +// - offset: Metric index in metrics slice (from MetricConfig.offset) +// - f: Callback invoked on each matching buffer +// +// Returns: +// - error: First error returned by callback, or nil if all succeeded +// +// Example: +// +// // Find all cpu0 buffers across all hosts: +// findBuffers([]Selector{{Any: true}, {String: "cpu0"}}, metricOffset, callback) +func (l *Level) findBuffers(selector util.Selector, offset int, f func(b *buffer) error) error { + l.lock.RLock() + defer l.lock.RUnlock() + + if len(selector) == 0 { + b := l.metrics[offset] + if b != nil { + return f(b) + } + + for _, lvl := range l.children { + err := lvl.findBuffers(nil, offset, f) + if err != nil { + return err + } + } + return nil + } + + sel := selector[0] + if len(sel.String) != 0 && l.children != nil { + lvl, ok := l.children[sel.String] + if ok { + err := lvl.findBuffers(selector[1:], offset, f) + if err != nil { + return err + } + } + return nil + } + + if sel.Group != nil && l.children != nil { + for _, key := range sel.Group { + lvl, ok := l.children[key] + if ok { + err := lvl.findBuffers(selector[1:], offset, f) + if err != nil { + return err + } + } + } + return nil + } + + if sel.Any && l.children != nil { + for _, lvl := range l.children { + if err := lvl.findBuffers(selector[1:], offset, f); err != nil { + return err + } + } + return nil + } + + return nil +} diff --git a/pkg/metricstore/lineprotocol.go b/pkg/metricstore/lineprotocol.go new 
file mode 100644 index 00000000..ecae3df1 --- /dev/null +++ b/pkg/metricstore/lineprotocol.go @@ -0,0 +1,366 @@ +// Copyright (C) NHR@FAU, University Erlangen-Nuremberg. +// All rights reserved. This file is part of cc-backend. +// Use of this source code is governed by a MIT-style +// license that can be found in the LICENSE file. + +// This file implements ingestion of InfluxDB line-protocol metric data received +// over NATS. Each line encodes one metric sample with the following structure: +// +// [,cluster=][,hostname=][,type=][,type-id=][,subtype=][,stype-id=] value= [] +// +// The measurement name identifies the metric (e.g. "cpu_load"). Tags provide +// routing information (cluster, host) and optional sub-device selectors (type, +// subtype). Only one field is expected per line: "value". +// +// After decoding, each sample is: +// 1. Written to the in-memory store via ms.WriteToLevel. +// 2. If the checkpoint format is "wal", also forwarded to the WAL staging +// goroutine via the WALMessages channel for durable write-ahead logging. +package metricstore + +import ( + "bytes" + "context" + "fmt" + "sync" + "time" + + cclog "github.com/ClusterCockpit/cc-lib/v2/ccLogger" + "github.com/ClusterCockpit/cc-lib/v2/nats" + "github.com/ClusterCockpit/cc-lib/v2/schema" + "github.com/ClusterCockpit/cc-line-protocol/v2/lineprotocol" +) + +// ReceiveNats subscribes to all configured NATS subjects and feeds incoming +// line-protocol messages into the MemoryStore. +// +// When workers > 1 a pool of goroutines drains a shared channel so that +// multiple messages can be decoded in parallel. With workers == 1 the NATS +// callback decodes inline (no channel overhead, lower latency). +// +// The function blocks until ctx is cancelled and all worker goroutines have +// finished. It returns nil when the NATS client is not configured; callers +// should treat that as a no-op rather than an error. 
+func ReceiveNats(ms *MemoryStore, + workers int, + ctx context.Context, +) error { + nc := nats.GetClient() + + if nc == nil { + cclog.Warn("NATS client not initialized") + return nil + } + + var wg sync.WaitGroup + msgs := make(chan []byte, workers*2) + + for _, sc := range *Keys.Subscriptions { + clusterTag := sc.ClusterTag + if workers > 1 { + wg.Add(workers) + + for range workers { + go func() { + defer wg.Done() + for m := range msgs { + dec := lineprotocol.NewDecoderWithBytes(m) + if err := DecodeLine(dec, ms, clusterTag); err != nil { + cclog.Errorf("error: %s", err.Error()) + } + } + }() + } + + nc.Subscribe(sc.SubscribeTo, func(subject string, data []byte) { + select { + case msgs <- data: + case <-ctx.Done(): + } + }) + } else { + nc.Subscribe(sc.SubscribeTo, func(subject string, data []byte) { + dec := lineprotocol.NewDecoderWithBytes(data) + if err := DecodeLine(dec, ms, clusterTag); err != nil { + cclog.Errorf("error: %s", err.Error()) + } + }) + } + cclog.Infof("NATS subscription to '%s' established", sc.SubscribeTo) + } + + go func() { + <-ctx.Done() + close(msgs) + }() + + wg.Wait() + + return nil +} + +// reorder prepends prefix to buf in-place when buf has enough spare capacity, +// avoiding an allocation. Falls back to a regular append otherwise. +// +// It is used to assemble the "type" and "subtype" selector +// strings when the type tag arrives before the type-id tag in the line, so the +// two byte slices need to be concatenated in tag-declaration order regardless +// of wire order. +func reorder(buf, prefix []byte) []byte { + n := len(prefix) + m := len(buf) + if cap(buf) < m+n { + return append(prefix[:n:n], buf...) + } else { + buf = buf[:n+m] + for i := m - 1; i >= 0; i-- { + buf[i+n] = buf[i] + } + for i := range n { + buf[i] = prefix[i] + } + return buf + } +} + +// decodeState holds the per-call scratch buffers used by DecodeLine. 
+// Instances are recycled via decodeStatePool to avoid repeated allocations +// during high-throughput ingestion. +type decodeState struct { + // metricBuf holds a copy of the current measurement name (line-protocol + // measurement field). Copied because dec.Measurement() returns a slice + // that is invalidated by the next decoder call. + metricBuf []byte + + // selector is the sub-device path passed to WriteToLevel and WALMessage + // (e.g. ["socket0"] or ["socket0", "memctrl1"]). Reused across lines. + selector []string + + // typeBuf accumulates the concatenated "type"+"type-id" tag value for the + // current line. Reset at the start of each line's tag-decode loop. + typeBuf []byte + + // subTypeBuf accumulates the concatenated "subtype"+"stype-id" tag value. + // Reset at the start of each line's tag-decode loop. + subTypeBuf []byte + + // prevTypeBytes / prevTypeStr cache the last seen typeBuf content and its + // string conversion. Because consecutive lines in a batch typically address + // the same sub-device, the cache hit rate is very high and avoids + // repeated []byte→string allocations. + prevTypeBytes []byte + prevTypeStr string + + // prevSubTypeBytes / prevSubTypeStr are the same cache for the subtype. + prevSubTypeBytes []byte + prevSubTypeStr string +} + +// decodeStatePool recycles decodeState values across DecodeLine calls to +// reduce GC pressure during sustained metric ingestion. +var decodeStatePool = sync.Pool{ + New: func() any { + return &decodeState{ + metricBuf: make([]byte, 0, 16), + selector: make([]string, 0, 4), + typeBuf: make([]byte, 0, 16), + subTypeBuf: make([]byte, 0, 16), + } + }, +} + +// DecodeLine reads all lines from dec (InfluxDB line-protocol) and writes each +// decoded metric sample into ms. +// +// clusterDefault is used as the cluster name for lines that do not carry a +// "cluster" tag. Callers typically supply the ClusterTag value from the NATS +// subscription configuration. 
+// +// Performance notes: +// - A decodeState is obtained from decodeStatePool to reuse scratch buffers. +// - The Level pointer (host-level node in the metric tree) is cached across +// consecutive lines that share the same cluster+host pair to avoid +// repeated lock acquisitions on the root and cluster levels. +// - []byte→string conversions for type/subtype selectors are cached via +// prevType*/prevSubType* fields because batches typically repeat the same +// sub-device identifiers. +// - Timestamp parsing tries Second precision first; if that fails it retries +// Millisecond, Microsecond, and Nanosecond in turn. A missing timestamp +// falls back to time.Now(). +// +// When the checkpoint format is "wal" each successfully decoded sample is also +// sent to WALMessages so the WAL staging goroutine can persist it durably +// before the next binary snapshot. +func DecodeLine(dec *lineprotocol.Decoder, + ms *MemoryStore, + clusterDefault string, +) error { + // Reduce allocations in loop: + t := time.Now() + metric := Metric{} + st := decodeStatePool.Get().(*decodeState) + defer decodeStatePool.Put(st) + + // Optimize for the case where all lines in a "batch" are about the same + // cluster and host. By using `WriteToLevel` (level = host), we do not need + // to take the root- and cluster-level lock as often. + var lvl *Level = nil + prevCluster, prevHost := "", "" + + var ok bool + for dec.Next() { + rawmeasurement, err := dec.Measurement() + if err != nil { + return err + } + + // Needs to be copied because another call to dec.* would + // invalidate the returned slice. + st.metricBuf = append(st.metricBuf[:0], rawmeasurement...) 
+ + // The go compiler optimizes map[string(byteslice)] lookups: + metric.MetricConfig, ok = ms.Metrics[string(rawmeasurement)] + if !ok { + continue + } + + st.typeBuf, st.subTypeBuf = st.typeBuf[:0], st.subTypeBuf[:0] + cluster, host := clusterDefault, "" + for { + key, val, err := dec.NextTag() + if err != nil { + return err + } + if key == nil { + break + } + + // The go compiler optimizes string([]byte{...}) == "...": + switch string(key) { + case "cluster": + if string(val) == prevCluster { + cluster = prevCluster + } else { + cluster = string(val) + lvl = nil + } + case "hostname", "host": + if string(val) == prevHost { + host = prevHost + } else { + host = string(val) + lvl = nil + } + case "type": + if string(val) == "node" { + break + } + + // We cannot be sure that the "type" tag comes before the "type-id" tag: + if len(st.typeBuf) == 0 { + st.typeBuf = append(st.typeBuf, val...) + } else { + st.typeBuf = reorder(st.typeBuf, val) + } + case "type-id": + st.typeBuf = append(st.typeBuf, val...) + case "subtype": + // We cannot be sure that the "subtype" tag comes before the "stype-id" tag: + if len(st.subTypeBuf) == 0 { + st.subTypeBuf = append(st.subTypeBuf, val...) + } else { + st.subTypeBuf = reorder(st.subTypeBuf, val) + } + case "stype-id": + st.subTypeBuf = append(st.subTypeBuf, val...) + default: + } + } + + // If the cluster or host changed, the lvl was set to nil + if lvl == nil { + st.selector = st.selector[:2] + st.selector[0], st.selector[1] = cluster, host + lvl = ms.GetLevel(st.selector) + prevCluster, prevHost = cluster, host + } + + // subtypes: cache []byte→string conversions; messages in a batch typically + // share the same type/subtype so the hit rate is very high. + st.selector = st.selector[:0] + if len(st.typeBuf) > 0 { + if !bytes.Equal(st.typeBuf, st.prevTypeBytes) { + st.prevTypeBytes = append(st.prevTypeBytes[:0], st.typeBuf...) 
+ st.prevTypeStr = string(st.typeBuf) + } + st.selector = append(st.selector, st.prevTypeStr) + if len(st.subTypeBuf) > 0 { + if !bytes.Equal(st.subTypeBuf, st.prevSubTypeBytes) { + st.prevSubTypeBytes = append(st.prevSubTypeBytes[:0], st.subTypeBuf...) + st.prevSubTypeStr = string(st.subTypeBuf) + } + st.selector = append(st.selector, st.prevSubTypeStr) + } + } + + for { + key, val, err := dec.NextField() + if err != nil { + return err + } + + if key == nil { + break + } + + if string(key) != "value" { + return fmt.Errorf("host %s: unknown field: '%s' (value: %#v)", host, string(key), val) + } + + if val.Kind() == lineprotocol.Float { + metric.Value = schema.Float(val.FloatV()) + } else if val.Kind() == lineprotocol.Int { + metric.Value = schema.Float(val.IntV()) + } else if val.Kind() == lineprotocol.Uint { + metric.Value = schema.Float(val.UintV()) + } else { + return fmt.Errorf("host %s: unsupported value type in message: %s", host, val.Kind().String()) + } + } + + if t, err = dec.Time(lineprotocol.Second, t); err != nil { + t = time.Now() + if t, err = dec.Time(lineprotocol.Millisecond, t); err != nil { + t = time.Now() + if t, err = dec.Time(lineprotocol.Microsecond, t); err != nil { + t = time.Now() + if t, err = dec.Time(lineprotocol.Nanosecond, t); err != nil { + return fmt.Errorf("host %s: timestamp : %#v with error : %#v", host, t, err.Error()) + } + } + } + } + + if err != nil { + return fmt.Errorf("host %s: timestamp : %#v with error : %#v", host, t, err.Error()) + } + + time := t.Unix() + + if Keys.Checkpoints.FileFormat == "wal" { + WALMessages <- &WALMessage{ + MetricName: string(st.metricBuf), + Cluster: cluster, + Node: host, + Selector: append([]string{}, st.selector...), + Value: metric.Value, + Timestamp: time, + } + } + + if err := ms.WriteToLevel(lvl, st.selector, time, []Metric{metric}); err != nil { + return err + } + } + return nil +} diff --git a/pkg/metricstore/metricstore.go b/pkg/metricstore/metricstore.go new file mode 100644 index 
00000000..b6fbb51a --- /dev/null +++ b/pkg/metricstore/metricstore.go @@ -0,0 +1,784 @@ +// Copyright (C) NHR@FAU, University Erlangen-Nuremberg. +// All rights reserved. This file is part of cc-backend. +// Use of this source code is governed by a MIT-style +// license that can be found in the LICENSE file. + +// Package metricstore provides an efficient in-memory time-series metric storage system +// with support for hierarchical data organization, checkpointing, and archiving. +// +// The package organizes metrics in a tree structure (cluster → host → component) and +// provides concurrent read/write access to metric data with configurable aggregation strategies. +// Background goroutines handle periodic checkpointing (JSON or Avro format), archiving old data, +// and enforcing retention policies. +// +// Key features: +// - In-memory metric storage with configurable retention +// - Hierarchical data organization (selectors) +// - Concurrent checkpoint/archive workers +// - Support for sum and average aggregation +// - NATS integration for metric ingestion +package metricstore + +import ( + "bytes" + "context" + "encoding/json" + "errors" + "fmt" + "runtime" + "runtime/debug" + "slices" + "sync" + "time" + + "github.com/ClusterCockpit/cc-backend/internal/config" + cclog "github.com/ClusterCockpit/cc-lib/v2/ccLogger" + "github.com/ClusterCockpit/cc-lib/v2/resampler" + "github.com/ClusterCockpit/cc-lib/v2/schema" + "github.com/ClusterCockpit/cc-lib/v2/util" +) + +// GlobalState holds the global state for the metric store with thread-safe access. 
+type GlobalState struct { + mu sync.RWMutex + lastRetentionTime int64 + selectorsExcluded bool +} + +var ( + singleton sync.Once + msInstance *MemoryStore + // shutdownFunc stores the context cancellation function created in Init + // and is called during Shutdown to cancel all background goroutines + shutdownFunc context.CancelFunc + shutdownFuncMu sync.Mutex // Protects shutdownFunc from concurrent access + // Create a global instance + state = &GlobalState{} +) + +// NodeProvider provides information about nodes currently in use by running jobs. +// +// This interface allows metricstore to query job information without directly +// depending on the repository package, breaking the import cycle. +// +// Implementations should return nodes that are actively processing jobs started +// before the given timestamp. These nodes will be excluded from retention-based +// garbage collection to prevent data loss for jobs that are still running or +// recently completed. +type NodeProvider interface { + // GetUsedNodes returns a map of cluster names to sorted lists of unique hostnames + // that are currently in use by jobs that started before the given timestamp. + // + // Parameters: + // - ts: Unix timestamp threshold - returns nodes with jobs started before this time + // + // Returns: + // - Map of cluster names to lists of node hostnames that should be excluded from garbage collection + // - Error if the query fails + GetUsedNodes(ts int64) (map[string][]string, error) +} + +// Metric represents a single metric data point to be written to the store. +type Metric struct { + Name string + Value schema.Float + // MetricConfig contains frequency and aggregation settings for this metric. + // If Frequency is 0, configuration will be looked up from MemoryStore.Metrics during Write(). + MetricConfig MetricConfig +} + +// MemoryStore is the main in-memory time-series metric storage implementation. 
+// +// It organizes metrics in a hierarchical tree structure where each level represents +// a component of the system hierarchy (e.g., cluster → host → CPU). Each level can +// store multiple metrics as time-series buffers. +// +// The store is initialized as a singleton via InitMetrics() and accessed via GetMemoryStore(). +// All public methods are safe for concurrent use. +type MemoryStore struct { + Metrics map[string]MetricConfig + root Level + nodeProvider NodeProvider +} + +// Init initializes the metric store from configuration and starts background workers. +// +// This function must be called exactly once before any other metricstore operations. +// It performs the following initialization steps: +// 1. Validates and decodes the metric store configuration +// 2. Configures worker pool size (defaults to NumCPU/2+1, max 10) +// 3. Loads metric configurations from all registered clusters +// 4. Restores checkpoints within the retention window +// 5. Starts background workers for retention, checkpointing, archiving, and monitoring +// 6. Optionally subscribes to NATS for real-time metric ingestion +// +// Parameters: +// - rawConfig: JSON configuration for the metric store (see MetricStoreConfig) +// - wg: WaitGroup that will be incremented for each background goroutine started +// +// The function will call cclog.Fatal on critical errors during initialization. +// Use Shutdown() to cleanly stop all background workers started by Init(). +// +// Note: Signal handling must be implemented by the caller. Call Shutdown() when +// receiving termination signals to ensure checkpoint data is persisted. 
+func Init(rawConfig json.RawMessage, metrics map[string]MetricConfig, wg *sync.WaitGroup) { + startupTime := time.Now() + + if rawConfig != nil { + config.Validate(configSchema, rawConfig) + dec := json.NewDecoder(bytes.NewReader(rawConfig)) + dec.DisallowUnknownFields() + if err := dec.Decode(&Keys); err != nil { + cclog.Abortf("[METRICSTORE]> Metric Store Config Init: Could not decode config file '%s'.\nError: %s\n", rawConfig, err.Error()) + } + } + + // Set NumWorkers from config or use default + if Keys.NumWorkers <= 0 { + Keys.NumWorkers = min(runtime.NumCPU()/2+1, DefaultMaxWorkers) + } + cclog.Debugf("[METRICSTORE]> Using %d workers for checkpoint/archive operations\n", Keys.NumWorkers) + + // Pass the config.MetricStoreKeys + InitMetrics(metrics) + + ms := GetMemoryStore() + + d, err := time.ParseDuration(Keys.RetentionInMemory) + if err != nil { + cclog.Fatal(err) + } + + restoreFrom := startupTime.Add(-d) + cclog.Infof("[METRICSTORE]> Loading checkpoints newer than %s\n", restoreFrom.Format(time.RFC3339)) + files, err := ms.FromCheckpointFiles(Keys.Checkpoints.RootDir, restoreFrom.Unix()) + loadedData := ms.SizeInBytes() / 1024 / 1024 // In MB + if err != nil { + cclog.Fatalf("[METRICSTORE]> Loading checkpoints failed: %s\n", err.Error()) + } else { + cclog.Infof("[METRICSTORE]> Checkpoints loaded (%d files, %d MB, that took %fs)\n", files, loadedData, time.Since(startupTime).Seconds()) + } + + // Try to use less memory by forcing a GC run here and then + // lowering the target percentage. The default of 100 means + // that only once the ratio of new allocations execeds the + // previously active heap, a GC is triggered. + // Forcing a GC here will set the "previously active heap" + // to a minumum. 
+ // runtime.GC() + + ctx, shutdown := context.WithCancel(context.Background()) + + Retention(wg, ctx) + Checkpointing(wg, ctx) + CleanUp(wg, ctx) + WALStaging(wg, ctx) + MemoryUsageTracker(wg, ctx) + + // Note: Signal handling has been removed from this function. + // The caller is responsible for handling shutdown signals and calling + // the shutdown() function when appropriate. + // Store the shutdown function for later use by Shutdown() + shutdownFuncMu.Lock() + shutdownFunc = shutdown + shutdownFuncMu.Unlock() + + if Keys.Subscriptions != nil { + err = ReceiveNats(ms, 1, ctx) + if err != nil { + cclog.Fatal(err) + } + } +} + +// InitMetrics initializes the singleton MemoryStore instance with the given metric configurations. +// +// This function must be called before GetMemoryStore() and can only be called once due to +// the singleton pattern. It assigns each metric an internal offset for efficient buffer indexing. +// +// Parameters: +// - metrics: Map of metric names to their configurations (frequency and aggregation strategy) +// +// Panics if any metric has Frequency == 0, which indicates an invalid configuration. +// +// After this call, the global msInstance is ready for use via GetMemoryStore(). +func InitMetrics(metrics map[string]MetricConfig) { + singleton.Do(func() { + offset := 0 + for key, cfg := range metrics { + if cfg.Frequency == 0 { + panic("[METRICSTORE]> invalid frequency") + } + + metrics[key] = MetricConfig{ + Frequency: cfg.Frequency, + Aggregation: cfg.Aggregation, + offset: offset, + } + offset += 1 + } + + msInstance = &MemoryStore{ + root: Level{ + metrics: make([]*buffer, len(metrics)), + children: make(map[string]*Level), + }, + Metrics: metrics, + } + }) +} + +// GetMemoryStore returns the singleton MemoryStore instance. +// +// Returns the initialized MemoryStore singleton. Calls cclog.Fatal if InitMetrics() was not called first. +// +// This function is safe for concurrent use after initialization. 
+func GetMemoryStore() *MemoryStore { + if msInstance == nil { + cclog.Warnf("[METRICSTORE]> MemoryStore not initialized!") + } + + return msInstance +} + +func (ms *MemoryStore) GetMetricFrequency(metricName string) (int64, error) { + if metric, ok := ms.Metrics[metricName]; ok { + return metric.Frequency, nil + } + return 0, fmt.Errorf("[METRICSTORE]> metric %s not found", metricName) +} + +// SetNodeProvider sets the NodeProvider implementation for the MemoryStore. +// This must be called during initialization to provide job state information +// for selective buffer retention during Free operations. +// If not set, the Free function will fall back to freeing all buffers. +func (ms *MemoryStore) SetNodeProvider(provider NodeProvider) { + ms.nodeProvider = provider +} + +// Shutdown performs a graceful shutdown of the metric store. +// +// This function cancels all background goroutines started by Init() and writes +// a final checkpoint to disk before returning. It should be called when the +// application receives a termination signal. +// +// The function will: +// 1. Cancel the context to stop all background workers +// 2. Close the WAL messages channel if using WAL format +// 3. Write a final checkpoint to preserve in-memory data +// 4. Log any errors encountered during shutdown +// +// Note: This function blocks until the final checkpoint is written. 
+func Shutdown() { + shutdownFuncMu.Lock() + defer shutdownFuncMu.Unlock() + if shutdownFunc != nil { + shutdownFunc() + } + + if Keys.Checkpoints.FileFormat == "wal" { + close(WALMessages) + } + + cclog.Infof("[METRICSTORE]> Writing to '%s'...\n", Keys.Checkpoints.RootDir) + var files int + var err error + + ms := GetMemoryStore() + + lastCheckpointMu.Lock() + from := lastCheckpoint + lastCheckpointMu.Unlock() + + if Keys.Checkpoints.FileFormat == "wal" { + var hostDirs []string + files, hostDirs, err = ms.ToCheckpointWAL(Keys.Checkpoints.RootDir, from.Unix(), time.Now().Unix()) + if err == nil { + RotateWALFilesAfterShutdown(hostDirs) + } + } else { + files, err = ms.ToCheckpoint(Keys.Checkpoints.RootDir, from.Unix(), time.Now().Unix()) + } + + if err != nil { + cclog.Errorf("[METRICSTORE]> Writing checkpoint failed: %s\n", err.Error()) + } + cclog.Infof("[METRICSTORE]> Done! (%d files written)\n", files) +} + +// Retention starts a background goroutine that periodically frees old metric data. +// +// This worker runs at half the retention interval and calls Free() to remove buffers +// older than the configured retention time. It respects the NodeProvider to preserve +// data for nodes with active jobs. +// +// Parameters: +// - wg: WaitGroup to signal completion when context is cancelled +// - ctx: Context for cancellation signal +// +// The goroutine exits when ctx is cancelled. 
+func Retention(wg *sync.WaitGroup, ctx context.Context) { + ms := GetMemoryStore() + + wg.Go(func() { + d, err := time.ParseDuration(Keys.RetentionInMemory) + if err != nil { + cclog.Fatal(err) + } + if d <= 0 { + return + } + + tickInterval := d / 2 + if tickInterval <= 0 { + return + } + ticker := time.NewTicker(tickInterval) + defer ticker.Stop() + + for { + select { + case <-ctx.Done(): + return + case <-ticker.C: + state.mu.Lock() + + t := time.Now().Add(-d) + + state.lastRetentionTime = t.Unix() + + cclog.Infof("[METRICSTORE]> start freeing buffers (older than %s)...\n", t.Format(time.RFC3339)) + + freed, err := Free(ms, t) + if err != nil { + cclog.Errorf("[METRICSTORE]> freeing up buffers failed: %s\n", err.Error()) + } else { + cclog.Infof("[METRICSTORE]> done: %d buffers freed\n", freed) + } + + state.mu.Unlock() + + // Clean up the buffer pool + bufferPool.Clean(state.lastRetentionTime) + } + } + }) +} + +// MemoryUsageTracker starts a background goroutine that monitors memory usage. +// +// This worker checks actual process memory usage (via runtime.MemStats) periodically +// and force-frees buffers if memory exceeds the configured cap. It uses FreeOSMemory() +// to return memory to the OS after freeing buffers, avoiding aggressive GC that causes +// performance issues. +// +// The tracker logs both actual memory usage (heap allocated) and metric data size for +// visibility into memory overhead from Go runtime structures and allocations. +// +// Parameters: +// - wg: WaitGroup to signal completion when context is cancelled +// - ctx: Context for cancellation signal +// +// The goroutine exits when ctx is cancelled. 
+func MemoryUsageTracker(wg *sync.WaitGroup, ctx context.Context) {
+	ms := GetMemoryStore()
+
+	wg.Go(func() {
+		d := DefaultMemoryUsageTrackerInterval
+
+		if d <= 0 {
+			return
+		}
+
+		ticker := time.NewTicker(d)
+		defer ticker.Stop()
+
+		for {
+			select {
+			case <-ctx.Done():
+				return
+			case <-ticker.C:
+				var mem runtime.MemStats
+				runtime.ReadMemStats(&mem)
+				// mem.Alloc is currently allocated heap bytes; the gap to
+				// metricDataGB is Go runtime / allocation overhead.
+				actualMemoryGB := float64(mem.Alloc) / 1e9
+				metricDataGB := ms.SizeInGB()
+				cclog.Infof("[METRICSTORE]> memory usage: %.2f GB actual (%.2f GB metric data)", actualMemoryGB, metricDataGB)
+
+				freedExcluded := 0
+				freedEmergency := 0
+				var err error
+
+				state.mu.RLock()
+				lastRetention := state.lastRetentionTime
+				selectorsExcluded := state.selectorsExcluded
+				state.mu.RUnlock()
+
+				// Buffers that Retention kept alive for active jobs are
+				// force-freed here up to the last retention threshold.
+				if lastRetention != 0 && selectorsExcluded {
+					freedExcluded, err = ms.Free(nil, lastRetention)
+					if err != nil {
+						cclog.Errorf("[METRICSTORE]> error while force-freeing the excluded buffers: %s", err)
+					}
+
+					if freedExcluded > 0 {
+						debug.FreeOSMemory()
+						cclog.Infof("[METRICSTORE]> done: %d excluded buffers force-freed", freedExcluded)
+					}
+				}
+
+				runtime.ReadMemStats(&mem)
+				actualMemoryGB = float64(mem.Alloc) / 1e9
+
+				bufferPool.Clear()
+				cclog.Infof("[METRICSTORE]> Cleaned up bufferPool\n")
+
+				if actualMemoryGB > float64(Keys.MemoryCap) {
+					cclog.Warnf("[METRICSTORE]> memory usage %.2f GB exceeds cap %d GB, starting emergency buffer freeing", actualMemoryGB, Keys.MemoryCap)
+
+					const maxIterations = 100
+
+					for i := range maxIterations {
+						if actualMemoryGB < float64(Keys.MemoryCap) {
+							break
+						}
+
+						freed, err := ms.ForceFree()
+						if err != nil {
+							cclog.Errorf("[METRICSTORE]> error while force-freeing buffers: %s", err)
+						}
+						if freed == 0 {
+							cclog.Errorf("[METRICSTORE]> no more buffers to free after %d emergency frees, memory usage %.2f GB still exceeds cap %d GB", freedEmergency, actualMemoryGB, Keys.MemoryCap)
+							break
+						}
+						freedEmergency += freed
+
+						// ReadMemStats is expensive, so the loop refreshes the
+						// measurement only every 10th iteration.
+						if i%10 == 0 && freedEmergency > 0 {
+							runtime.ReadMemStats(&mem)
+							actualMemoryGB = float64(mem.Alloc) / 1e9
+						}
+					}
+
+					// if freedEmergency > 0 {
+					// 	debug.FreeOSMemory()
+					// }
+
+					runtime.ReadMemStats(&mem)
+					actualMemoryGB = float64(mem.Alloc) / 1e9
+
+					if actualMemoryGB >= float64(Keys.MemoryCap) {
+						cclog.Errorf("[METRICSTORE]> after %d emergency frees, memory usage %.2f GB still at/above cap %d GB", freedEmergency, actualMemoryGB, Keys.MemoryCap)
+					} else {
+						cclog.Infof("[METRICSTORE]> emergency freeing complete: %d buffers freed, memory now %.2f GB", freedEmergency, actualMemoryGB)
+					}
+				}
+			}
+		}
+	})
+}
+
+// Free removes metric data older than the given time while preserving data for active nodes.
+//
+// This function implements intelligent retention by consulting the NodeProvider (if configured)
+// to determine which nodes are currently in use by running jobs. Data for these nodes is
+// preserved even if older than the retention time.
+//
+// Parameters:
+//   - ms: The MemoryStore instance
+//   - t: Time threshold - buffers with data older than this will be freed
+//
+// Returns:
+//   - Number of buffers freed
+//   - Error if NodeProvider query fails
+//
+// Behavior:
+//   - If no NodeProvider is set: frees all buffers older than t
+//   - If NodeProvider returns empty map: frees all buffers older than t
+//   - Otherwise: preserves buffers for nodes returned by GetUsedNodes(), frees others
+func Free(ms *MemoryStore, t time.Time) (int, error) {
+	// If no NodeProvider is configured, free all buffers older than t
+	if ms.nodeProvider == nil {
+		return ms.Free(nil, t.Unix())
+	}
+
+	excludeSelectors, err := ms.nodeProvider.GetUsedNodes(t.Unix())
+	if err != nil {
+		return 0, err
+	}
+
+	// NOTE(review): state.selectorsExcluded is written below without taking
+	// state.mu; Retention calls Free while holding that lock — confirm every
+	// other call site does the same.
+	switch lenMap := len(excludeSelectors); lenMap {
+
+	// If the length of the map returned by GetUsedNodes() is 0,
+	// then use default Free method with nil selector
+	case 0:
+		state.selectorsExcluded = false
+		return ms.Free(nil, t.Unix())
+
+	// Else formulate selectors, exclude those from the map
+	// and free the rest of the selectors
+	default:
+		state.selectorsExcluded = true
+		selectors := GetSelectors(ms, excludeSelectors)
+		return FreeSelected(ms, selectors, t)
+	}
+}
+
+// FreeSelected frees buffers for specific selectors while preserving others.
+//
+// This function is used when we want to retain some specific nodes beyond the retention time.
+// It iterates through the provided selectors and frees their associated buffers.
+//
+// Parameters:
+//   - ms: The MemoryStore instance
+//   - selectors: List of selector paths to free (e.g., [["cluster1", "node1"], ["cluster2", "node2"]])
+//   - t: Time threshold for freeing buffers
+//
+// Returns the total number of buffers freed. Per-selector errors are only
+// logged; the returned error is currently always nil.
+func FreeSelected(ms *MemoryStore, selectors [][]string, t time.Time) (int, error) {
+	freed := 0
+
+	for _, selector := range selectors {
+
+		// NOTE(review): errors are swallowed after logging, so a partial
+		// failure still reports success to the caller — confirm intended.
+		freedBuffers, err := ms.Free(selector, t.Unix())
+		if err != nil {
+			cclog.Errorf("error while freeing selected buffers: %#v", err)
+		}
+		freed += freedBuffers
+
+	}
+
+	return freed, nil
+}
+
+// GetSelectors returns all selectors at depth 2 (cluster/node level) that are NOT in the exclusion map.
+//
+// This function generates a list of selectors whose buffers should be freed by excluding
+// selectors that correspond to nodes currently in use by running jobs.
+//
+// Parameters:
+//   - ms: The MemoryStore instance
+//   - excludeSelectors: Map of cluster names to node hostnames that should NOT be freed
+//
+// Returns a list of selectors ([]string paths) that can be safely freed.
+//
+// Example:
+//
+//	If the tree has paths ["emmy", "node001"] and ["emmy", "node002"],
+//	and excludeSelectors contains {"emmy": ["node001"]},
+//	then only [["emmy", "node002"]] is returned.
+func GetSelectors(ms *MemoryStore, excludeSelectors map[string][]string) [][]string { + allSelectors := ms.GetPaths(2) + + filteredSelectors := make([][]string, 0, len(allSelectors)) + + for _, path := range allSelectors { + if len(path) < 2 { + continue + } + + key := path[0] // The "Key" (Level 1) + value := path[1] // The "Value" (Level 2) + + exclude := false + + // Check if the key exists in our exclusion map + if excludedValues, exists := excludeSelectors[key]; exists { + // The key exists, now check if the specific value is in the exclusion list + if slices.Contains(excludedValues, value) { + exclude = true + } + } + + if !exclude { + filteredSelectors = append(filteredSelectors, path) + } + } + + return filteredSelectors +} + +// GetPaths returns a list of lists (paths) to the specified depth. +func (ms *MemoryStore) GetPaths(targetDepth int) [][]string { + var results [][]string + + // Start recursion. Initial path is empty. + // We treat Root as depth 0. + ms.root.collectPaths(0, targetDepth, []string{}, &results) + + return results +} + +// Write all values in `metrics` to the level specified by `selector` for time `ts`. +// Look at `findLevelOrCreate` for how selectors work. 
+func (m *MemoryStore) Write(selector []string, ts int64, metrics []Metric) error { + var ok bool + for i, metric := range metrics { + if metric.MetricConfig.Frequency == 0 { + metric.MetricConfig, ok = m.Metrics[metric.Name] + if !ok { + cclog.Debugf("[METRICSTORE]> Unknown metric '%s' in Write() - skipping", metric.Name) + metric.MetricConfig.Frequency = 0 + } + metrics[i] = metric + } + } + + return m.WriteToLevel(&m.root, selector, ts, metrics) +} + +func (m *MemoryStore) GetLevel(selector []string) *Level { + return m.root.findLevelOrCreate(selector, len(m.Metrics)) +} + +// WriteToLevel assumes that `minfo` in `metrics` is filled in +func (m *MemoryStore) WriteToLevel(l *Level, selector []string, ts int64, metrics []Metric) error { + l = l.findLevelOrCreate(selector, len(m.Metrics)) + l.lock.Lock() + defer l.lock.Unlock() + + for _, metric := range metrics { + if metric.MetricConfig.Frequency == 0 { + continue + } + + b := l.metrics[metric.MetricConfig.offset] + if b == nil { + // First write to this metric and level + b = newBuffer(ts, metric.MetricConfig.Frequency) + l.metrics[metric.MetricConfig.offset] = b + } + + nb, err := b.write(ts, metric.Value) + if err != nil { + return err + } + + // Last write created a new buffer... + if b != nb { + l.metrics[metric.MetricConfig.offset] = nb + } + } + return nil +} + +// Read returns all values for metric `metric` from `from` to `to` for the selected level(s). +// If the level does not hold the metric itself, the data will be aggregated recursively from the children. +// The second and third return value are the actual from/to for the data. Those can be different from +// the range asked for if no data was available. 
+func (m *MemoryStore) Read(selector util.Selector, metric string, from, to, resolution int64) ([]schema.Float, int64, int64, int64, error) { + if from > to { + return nil, 0, 0, 0, errors.New("[METRICSTORE]> invalid time range") + } + + minfo, ok := m.Metrics[metric] + if !ok { + return nil, 0, 0, 0, errors.New("[METRICSTORE]> unknown metric: " + metric) + } + + n, data := 0, make([]schema.Float, (to-from)/minfo.Frequency+1) + + err := m.root.findBuffers(selector, minfo.offset, func(b *buffer) error { + cdata, cfrom, cto, err := b.read(from, to, data) + if err != nil { + return err + } + + if n == 0 { + from, to = cfrom, cto + } else if from != cfrom || to != cto || len(data) != len(cdata) { + missingfront, missingback := int((from-cfrom)/minfo.Frequency), int((to-cto)/minfo.Frequency) + if missingfront != 0 { + return ErrDataDoesNotAlign + } + + newlen := len(cdata) - missingback + if newlen < 1 { + return ErrDataDoesNotAlign + } + cdata = cdata[0:newlen] + if len(cdata) != len(data) { + return ErrDataDoesNotAlign + } + + from, to = cfrom, cto + } + + data = cdata + n += 1 + return nil + }) + + if err != nil { + return nil, 0, 0, 0, err + } else if n == 0 { + return nil, 0, 0, 0, ErrNoHostOrMetric + } else if n > 1 { + if minfo.Aggregation == AvgAggregation { + normalize := 1. / schema.Float(n) + for i := 0; i < len(data); i++ { + data[i] *= normalize + } + } else if minfo.Aggregation != SumAggregation { + return nil, 0, 0, 0, errors.New("[METRICSTORE]> invalid aggregation") + } + } + + data, resolution, err = resampler.LargestTriangleThreeBucket(data, minfo.Frequency, resolution) + if err != nil { + return nil, 0, 0, 0, err + } + + return data, from, to, resolution, nil +} + +// Free releases all buffers for the selected level and all its children that +// contain only values older than `t`. 
+func (m *MemoryStore) Free(selector []string, t int64) (int, error) { + return m.GetLevel(selector).free(t) +} + +// ForceFree unconditionally removes the oldest buffer from each metric chain. +func (m *MemoryStore) ForceFree() (int, error) { + return m.GetLevel(nil).forceFree() +} + +func (m *MemoryStore) FreeAll() error { + for k := range m.root.children { + delete(m.root.children, k) + } + + return nil +} + +func (m *MemoryStore) SizeInBytes() int64 { + return m.root.sizeInBytes() +} + +func (m *MemoryStore) SizeInGB() float64 { + return float64(m.root.sizeInBytes()) / 1e9 +} + +// ListChildren , given a selector, returns a list of all children of the level +// selected. +func (m *MemoryStore) ListChildren(selector []string) []string { + lvl := &m.root + for lvl != nil && len(selector) != 0 { + lvl.lock.RLock() + next := lvl.children[selector[0]] + lvl.lock.RUnlock() + lvl = next + selector = selector[1:] + } + + if lvl == nil { + return nil + } + + lvl.lock.RLock() + defer lvl.lock.RUnlock() + + children := make([]string, 0, len(lvl.children)) + for child := range lvl.children { + children = append(children, child) + } + + return children +} diff --git a/pkg/metricstore/metricstore_test.go b/pkg/metricstore/metricstore_test.go new file mode 100644 index 00000000..35f97278 --- /dev/null +++ b/pkg/metricstore/metricstore_test.go @@ -0,0 +1,1019 @@ +// Copyright (C) NHR@FAU, University Erlangen-Nuremberg. +// All rights reserved. This file is part of cc-backend. +// Use of this source code is governed by a MIT-style +// license that can be found in the LICENSE file. + +package metricstore + +import ( + "testing" + "time" + + "github.com/ClusterCockpit/cc-lib/v2/schema" +) + +// ─── Buffer pool ───────────────────────────────────────────────────────────── + +// TestBufferPoolGetReuse verifies that Get() returns pooled buffers before +// allocating new ones, and that an empty pool allocates a fresh BufferCap buffer. 
+func TestBufferPoolGetReuse(t *testing.T) { + pool := NewPersistentBufferPool() + + original := &buffer{data: make([]schema.Float, 0, BufferCap), lastUsed: time.Now().Unix()} + pool.Put(original) + + reused := pool.Get() + if reused != original { + t.Error("Get() should return the previously pooled buffer") + } + if pool.GetSize() != 0 { + t.Errorf("pool size after Get() = %d, want 0", pool.GetSize()) + } + + // Empty pool must allocate a fresh buffer with the standard capacity. + fresh := pool.Get() + if fresh == nil { + t.Fatal("Get() from empty pool returned nil") + } + if cap(fresh.data) != BufferCap { + t.Errorf("fresh buffer cap = %d, want %d", cap(fresh.data), BufferCap) + } +} + +// TestBufferPoolClear verifies that Clear() drains all entries. +func TestBufferPoolClear(t *testing.T) { + pool := NewPersistentBufferPool() + for i := 0; i < 10; i++ { + pool.Put(&buffer{data: make([]schema.Float, 0), lastUsed: time.Now().Unix()}) + } + pool.Clear() + if pool.GetSize() != 0 { + t.Errorf("pool size after Clear() = %d, want 0", pool.GetSize()) + } +} + +// ─── Buffer helpers ─────────────────────────────────────────────────────────── + +// TestBufferEndFirstWrite verifies the end() and firstWrite() calculations. +func TestBufferEndFirstWrite(t *testing.T) { + // start=90, freq=10 → firstWrite = 90+5 = 95 + b := &buffer{data: make([]schema.Float, 4, BufferCap), frequency: 10, start: 90} + if fw := b.firstWrite(); fw != 95 { + t.Errorf("firstWrite() = %d, want 95", fw) + } + // end = firstWrite + len(data)*freq = 95 + 4*10 = 135 + if e := b.end(); e != 135 { + t.Errorf("end() = %d, want 135", e) + } +} + +// ─── Buffer write ───────────────────────────────────────────────────────────── + +// TestBufferWriteNaNFill verifies that skipped timestamps are filled with NaN. 
+func TestBufferWriteNaNFill(t *testing.T) {
+	b := newBuffer(100, 10)
+	b.write(100, schema.Float(1.0))
+	// skip 110 and 120
+	b.write(130, schema.Float(4.0))
+
+	if len(b.data) != 4 {
+		t.Fatalf("len(data) = %d, want 4 (1 value + 2 NaN + 1 value)", len(b.data))
+	}
+	if b.data[0] != schema.Float(1.0) {
+		t.Errorf("data[0] = %v, want 1.0", b.data[0])
+	}
+	if !b.data[1].IsNaN() {
+		t.Errorf("data[1] should be NaN (gap), got %v", b.data[1])
+	}
+	if !b.data[2].IsNaN() {
+		t.Errorf("data[2] should be NaN (gap), got %v", b.data[2])
+	}
+	if b.data[3] != schema.Float(4.0) {
+		t.Errorf("data[3] = %v, want 4.0", b.data[3])
+	}
+}
+
+// TestBufferWriteCapacityOverflow verifies that exceeding capacity creates and
+// links a new buffer rather than panicking or silently dropping data.
+func TestBufferWriteCapacityOverflow(t *testing.T) {
+	// Cap=2 so the third write must overflow into a new buffer.
+	b := &buffer{data: make([]schema.Float, 0, 2), frequency: 10, start: 95}
+
+	nb, _ := b.write(100, schema.Float(1.0))
+	nb, _ = nb.write(110, schema.Float(2.0))
+	nb, err := nb.write(120, schema.Float(3.0))
+	if err != nil {
+		t.Fatalf("write() error = %v", err)
+	}
+	if nb == b {
+		t.Fatal("write() should have returned a new buffer after overflow")
+	}
+	// The chain must be doubly linked in both directions.
+	if nb.prev != b {
+		t.Error("new buffer should link back to old via prev")
+	}
+	if b.next != nb {
+		t.Error("old buffer should link forward to new via next")
+	}
+	if len(b.data) != 2 {
+		t.Errorf("old buffer len = %d, want 2 (full)", len(b.data))
+	}
+	if nb.data[0] != schema.Float(3.0) {
+		t.Errorf("new buffer data[0] = %v, want 3.0", nb.data[0])
+	}
+}
+
+// TestBufferWriteOverwrite verifies that writing to an already-occupied index
+// replaces the value rather than appending.
+func TestBufferWriteOverwrite(t *testing.T) {
+	b := newBuffer(100, 10)
+	b.write(100, schema.Float(1.0))
+	b.write(110, schema.Float(2.0))
+
+	// Overwrite the first slot.
+	b.write(100, schema.Float(99.0))
+	if len(b.data) != 2 {
+		t.Errorf("len(data) after overwrite = %d, want 2 (no append)", len(b.data))
+	}
+	if b.data[0] != schema.Float(99.0) {
+		t.Errorf("data[0] after overwrite = %v, want 99.0", b.data[0])
+	}
+}
+
+// ─── Buffer read ─────────────────────────────────────────────────────────────
+
+// TestBufferReadBeforeFirstWrite verifies that 'from' is clamped to firstWrite
+// when the requested range starts before any data in the chain.
+func TestBufferReadBeforeFirstWrite(t *testing.T) {
+	b := newBuffer(100, 10) // firstWrite = 100
+	b.write(100, schema.Float(1.0))
+	b.write(110, schema.Float(2.0))
+
+	data := make([]schema.Float, 10)
+	result, adjustedFrom, _, err := b.read(50, 120, data)
+	if err != nil {
+		t.Fatalf("read() error = %v", err)
+	}
+	if adjustedFrom != 100 {
+		t.Errorf("adjustedFrom = %d, want 100 (clamped to firstWrite)", adjustedFrom)
+	}
+	if len(result) != 2 {
+		t.Errorf("len(result) = %d, want 2", len(result))
+	}
+}
+
+// TestBufferReadChain verifies that read() traverses a multi-buffer chain and
+// returns contiguous values from both buffers.
+//
+// The switch to b.next in read() triggers on idx >= cap(b.data), so b1 must
+// be full (len == cap) for the loop to advance to b2 without producing NaN.
+func TestBufferReadChain(t *testing.T) {
+	// b1: cap=3, covers t=100..120. b2: covers t=130..150. b2 is head.
+	b1 := &buffer{data: make([]schema.Float, 0, 3), frequency: 10, start: 95}
+	b1.data = append(b1.data, 1.0, 2.0, 3.0) // fills b1: len=cap=3
+
+	b2 := &buffer{data: make([]schema.Float, 0, 3), frequency: 10, start: 125}
+	b2.data = append(b2.data, 4.0, 5.0, 6.0) // t=130,140,150
+	b2.prev = b1
+	b1.next = b2
+
+	data := make([]schema.Float, 6)
+	result, from, to, err := b2.read(100, 160, data)
+	if err != nil {
+		t.Fatalf("read() error = %v", err)
+	}
+	if from != 100 || to != 160 {
+		t.Errorf("read() from/to = %d/%d, want 100/160", from, to)
+	}
+	if len(result) != 6 {
+		t.Fatalf("len(result) = %d, want 6", len(result))
+	}
+	for i, want := range []schema.Float{1, 2, 3, 4, 5, 6} {
+		if result[i] != want {
+			t.Errorf("result[%d] = %v, want %v", i, result[i], want)
+		}
+	}
+}
+
+// TestBufferReadIdxAfterSwitch is a regression test for the index recalculation
+// bug after switching to b.next during a read.
+//
+// When both buffers share the same start time (can happen with checkpoint-loaded
+// chains), the old code hardcoded idx=0 after the switch, causing reads at time t
+// to return the wrong element from the next buffer.
+func TestBufferReadIdxAfterSwitch(t *testing.T) {
+	// b1: cap=2, both buffers start at 0 (firstWrite=5).
+	// b1 carries t=5 and t=15; b2 carries t=5,15,25,35 with the same start.
+	// When reading reaches t=25 the loop overflows b1 (idx=2 >= cap=2) and
+	// switches to b2. The correct index in b2 is (25-0)/10=2 → b2.data[2]=30.0.
+	// The old code set idx=0 → b2.data[0]=10.0 (wrong).
+	b1 := &buffer{data: make([]schema.Float, 0, 2), frequency: 10, start: 0}
+	b1.data = append(b1.data, schema.Float(1.0), schema.Float(2.0)) // t=5, t=15
+
+	b2 := &buffer{data: make([]schema.Float, 0, 10), frequency: 10, start: 0}
+	b2.data = append(b2.data,
+		schema.Float(10.0), schema.Float(20.0),
+		schema.Float(30.0), schema.Float(40.0)) // t=5,15,25,35
+	b2.prev = b1
+	b1.next = b2
+
+	// from=0 triggers the walkback to b1 (from < b2.firstWrite=5).
+	// After clamping, the loop runs t=5,15,25,35.
+	data := make([]schema.Float, 4)
+	result, _, _, err := b2.read(0, 36, data)
+	if err != nil {
+		t.Fatalf("read() error = %v", err)
+	}
+	if len(result) < 3 {
+		t.Fatalf("len(result) = %d, want >= 3", len(result))
+	}
+	if result[0] != schema.Float(1.0) {
+		t.Errorf("result[0] (t=5) = %v, want 1.0 (from b1)", result[0])
+	}
+	if result[1] != schema.Float(2.0) {
+		t.Errorf("result[1] (t=15) = %v, want 2.0 (from b1)", result[1])
+	}
+	// This is the critical assertion: old code returned 10.0 (b2.data[0]).
+	if result[2] != schema.Float(30.0) {
+		t.Errorf("result[2] (t=25) = %v, want 30.0 (idx recalculation fix)", result[2])
+	}
+}
+
+// TestBufferReadNaNValues verifies that NaN slots written to the buffer are
+// returned as NaN during read.
+func TestBufferReadNaNValues(t *testing.T) {
+	b := newBuffer(100, 10)
+	b.write(100, schema.Float(1.0))
+	b.write(110, schema.NaN)
+	b.write(120, schema.Float(3.0))
+
+	data := make([]schema.Float, 3)
+	result, _, _, err := b.read(100, 130, data)
+	if err != nil {
+		t.Fatalf("read() error = %v", err)
+	}
+	if len(result) != 3 {
+		t.Fatalf("len(result) = %d, want 3", len(result))
+	}
+	if result[0] != schema.Float(1.0) {
+		t.Errorf("result[0] = %v, want 1.0", result[0])
+	}
+	if !result[1].IsNaN() {
+		t.Errorf("result[1] should be NaN, got %v", result[1])
+	}
+	if result[2] != schema.Float(3.0) {
+		t.Errorf("result[2] = %v, want 3.0", result[2])
+	}
+}
+
+// TestBufferReadAccumulation verifies the += accumulation pattern used for
+// aggregation: values are added to whatever was already in the data slice.
+func TestBufferReadAccumulation(t *testing.T) {
+	b := newBuffer(100, 10)
+	b.write(100, schema.Float(3.0))
+	b.write(110, schema.Float(5.0))
+
+	// Pre-populate data slice (simulates a second metric being summed in).
+	data := []schema.Float{2.0, 1.0, 0.0}
+	result, _, _, err := b.read(100, 120, data)
+	if err != nil {
+		t.Fatalf("read() error = %v", err)
+	}
+	// 2.0+3.0=5.0, 1.0+5.0=6.0
+	if result[0] != schema.Float(5.0) {
+		t.Errorf("result[0] = %v, want 5.0 (2+3)", result[0])
+	}
+	if result[1] != schema.Float(6.0) {
+		t.Errorf("result[1] = %v, want 6.0 (1+5)", result[1])
+	}
+}
+
+// ─── Buffer free ─────────────────────────────────────────────────────────────
+
+// newTestPool swaps out the package-level bufferPool for a fresh isolated one
+// and returns a cleanup function that restores the original.
+func newTestPool(t *testing.T) *PersistentBufferPool {
+	t.Helper()
+	pool := NewPersistentBufferPool()
+	saved := bufferPool
+	bufferPool = pool
+	t.Cleanup(func() { bufferPool = saved })
+	return pool
+}
+
+// TestBufferFreeRetention verifies that free() removes buffers whose entire
+// time range falls before the retention threshold and returns them to the pool.
+func TestBufferFreeRetention(t *testing.T) {
+	pool := newTestPool(t)
+
+	// b1: firstWrite=5, end=25  b2: firstWrite=25, end=45  b3: firstWrite=45, end=65
+	b1 := &buffer{data: make([]schema.Float, 0, BufferCap), frequency: 10, start: 0}
+	b1.data = append(b1.data, 1.0, 2.0)
+
+	b2 := &buffer{data: make([]schema.Float, 0, BufferCap), frequency: 10, start: 20}
+	b2.data = append(b2.data, 3.0, 4.0)
+	b2.prev = b1
+	b1.next = b2
+
+	b3 := &buffer{data: make([]schema.Float, 0, BufferCap), frequency: 10, start: 40}
+	b3.data = append(b3.data, 5.0, 6.0)
+	b3.prev = b2
+	b2.next = b3
+
+	// Threshold=30: b1.end()=25 < 30 → freed; b2.end()=45 >= 30 → kept.
+	delme, n := b3.free(30)
+	if delme {
+		t.Error("head buffer b3 should not be marked for deletion")
+	}
+	if n != 1 {
+		t.Errorf("freed count = %d, want 1", n)
+	}
+	if b2.prev != nil {
+		t.Error("b1 should have been unlinked from b2.prev")
+	}
+	if b3.prev != b2 {
+		t.Error("b3 should still reference b2")
+	}
+	if pool.GetSize() != 1 {
+		t.Errorf("pool size = %d, want 1 (b1 returned)", pool.GetSize())
+	}
+}
+
+// TestBufferFreeAll verifies that free() removes all buffers and signals the
+// caller to delete the head when the entire chain is older than the threshold.
+func TestBufferFreeAll(t *testing.T) {
+	pool := newTestPool(t)
+
+	b1 := &buffer{data: make([]schema.Float, 0, BufferCap), frequency: 10, start: 0}
+	b1.data = append(b1.data, 1.0, 2.0) // end=25
+
+	b2 := &buffer{data: make([]schema.Float, 0, BufferCap), frequency: 10, start: 20}
+	b2.data = append(b2.data, 3.0, 4.0) // end=45
+	b2.prev = b1
+	b1.next = b2
+
+	// Threshold=100 > both ends → both should be freed.
+	delme, n := b2.free(100)
+	if !delme {
+		t.Error("head buffer b2 should be marked for deletion when all data is stale")
+	}
+	if n != 2 {
+		t.Errorf("freed count = %d, want 2", n)
+	}
+	// b1 was freed inside free(); b2 is returned with delme=true for the caller.
+	if pool.GetSize() != 1 {
+		t.Errorf("pool size = %d, want 1 (b1 returned; b2 returned by caller)", pool.GetSize())
+	}
+}
+
+// ─── forceFreeOldest ─────────────────────────────────────────────────────────
+
+// TestForceFreeOldestPoolReturn verifies that forceFreeOldest() returns the
+// freed buffer to the pool (regression: previously it was just dropped).
+func TestForceFreeOldestPoolReturn(t *testing.T) {
+	pool := newTestPool(t)
+
+	// Three-buffer chain; b1 is the oldest, b3 the head.
+	b1 := &buffer{data: make([]schema.Float, 0, BufferCap), frequency: 10, start: 0}
+	b2 := &buffer{data: make([]schema.Float, 0, BufferCap), frequency: 10, start: 20}
+	b3 := &buffer{data: make([]schema.Float, 0, BufferCap), frequency: 10, start: 40}
+	b1.data = append(b1.data, 1.0)
+	b2.data = append(b2.data, 2.0)
+	b3.data = append(b3.data, 3.0)
+	b2.prev = b1
+	b1.next = b2
+	b3.prev = b2
+	b2.next = b3
+
+	delme, n := b3.forceFreeOldest()
+	if delme {
+		t.Error("head b3 should not be marked for deletion (chain has 3 buffers)")
+	}
+	if n != 1 {
+		t.Errorf("freed count = %d, want 1", n)
+	}
+	if b2.prev != nil {
+		t.Error("b1 should have been unlinked from b2.prev after forceFreeOldest")
+	}
+	if b3.prev != b2 {
+		t.Error("b3 should still link to b2")
+	}
+	if pool.GetSize() != 1 {
+		t.Errorf("pool size = %d, want 1 (b1 returned to pool)", pool.GetSize())
+	}
+}
+
+// TestForceFreeOldestSingleBuffer verifies that forceFreeOldest() returns
+// delme=true when the buffer is the only one in the chain.
+func TestForceFreeOldestSingleBuffer(t *testing.T) {
+	b := newBuffer(100, 10)
+	b.write(100, schema.Float(1.0))
+
+	delme, n := b.forceFreeOldest()
+	if !delme {
+		t.Error("single-buffer chain: expected delme=true (the buffer IS the oldest)")
+	}
+	if n != 1 {
+		t.Errorf("freed count = %d, want 1", n)
+	}
+}
+
+// ─── iterFromTo ──────────────────────────────────────────────────────────────
+
+// TestBufferIterFromToOrder verifies that iterFromTo invokes the callback in
+// chronological order (oldest → newest).
+func TestBufferIterFromToOrder(t *testing.T) {
+	// Each buffer has 2 data points so end() = firstWrite + 2*freq.
+	b1 := &buffer{data: make([]schema.Float, 2, BufferCap), frequency: 10, start: 0}  // end=25
+	b2 := &buffer{data: make([]schema.Float, 2, BufferCap), frequency: 10, start: 20} // end=45
+	b3 := &buffer{data: make([]schema.Float, 2, BufferCap), frequency: 10, start: 40} // end=65
+	b2.prev = b1
+	b1.next = b2
+	b3.prev = b2
+	b2.next = b3
+
+	var order []*buffer
+	err := b3.iterFromTo(0, 100, func(b *buffer) error {
+		order = append(order, b)
+		return nil
+	})
+	if err != nil {
+		t.Fatalf("iterFromTo() error = %v", err)
+	}
+	if len(order) != 3 {
+		t.Fatalf("callback count = %d, want 3", len(order))
+	}
+	if order[0] != b1 || order[1] != b2 || order[2] != b3 {
+		t.Error("iterFromTo() did not call callbacks in chronological (oldest→newest) order")
+	}
+}
+
+// TestBufferIterFromToFiltered verifies that iterFromTo only calls the callback
+// for buffers whose time range overlaps [from, to].
+func TestBufferIterFromToFiltered(t *testing.T) {
+	// b1: end=25  b2: start=20, end=45  b3: start=40, end=65
+	b1 := &buffer{data: make([]schema.Float, 2, BufferCap), frequency: 10, start: 0}
+	b2 := &buffer{data: make([]schema.Float, 2, BufferCap), frequency: 10, start: 20}
+	b3 := &buffer{data: make([]schema.Float, 2, BufferCap), frequency: 10, start: 40}
+	b2.prev = b1
+	b1.next = b2
+	b3.prev = b2
+	b2.next = b3
+
+	// [30,50]: b1.end=25 < 30 → excluded; b2 and b3 overlap → included.
+	var visited []*buffer
+	b3.iterFromTo(30, 50, func(b *buffer) error {
+		visited = append(visited, b)
+		return nil
+	})
+	if len(visited) != 2 {
+		t.Fatalf("visited count = %d, want 2 (b2 and b3)", len(visited))
+	}
+	if visited[0] != b2 || visited[1] != b3 {
+		t.Errorf("visited = %v, want [b2, b3]", visited)
+	}
+}
+
+// TestBufferIterFromToNilBuffer verifies that iterFromTo on a nil buffer is a
+// safe no-op.
+func TestBufferIterFromToNilBuffer(t *testing.T) { + var b *buffer + called := false + err := b.iterFromTo(0, 100, func(_ *buffer) error { + called = true + return nil + }) + if err != nil { + t.Errorf("iterFromTo(nil) error = %v, want nil", err) + } + if called { + t.Error("callback should not be called for a nil buffer") + } +} + +// ─── count ──────────────────────────────────────────────────────────────────── + +// TestBufferCount verifies that count() sums data-point lengths across the +// entire chain, including all prev links. +func TestBufferCount(t *testing.T) { + b1 := &buffer{data: make([]schema.Float, 3, BufferCap), frequency: 10, start: 0} + b2 := &buffer{data: make([]schema.Float, 2, BufferCap), frequency: 10, start: 35} + b3 := &buffer{data: make([]schema.Float, 5, BufferCap), frequency: 10, start: 60} + b2.prev = b1 + b1.next = b2 + b3.prev = b2 + b2.next = b3 + + if got := b3.count(); got != 10 { + t.Errorf("count() = %d, want 10 (3+2+5)", got) + } + + // Single buffer. 
+ lone := &buffer{data: make([]schema.Float, 7, BufferCap)} + if got := lone.count(); got != 7 { + t.Errorf("count() single buffer = %d, want 7", got) + } +} + +// ─── Existing tests below ──────────────────────────────────────────────────── + +func TestAssignAggregationStrategy(t *testing.T) { + tests := []struct { + name string + input string + expected AggregationStrategy + wantErr bool + }{ + {"empty string", "", NoAggregation, false}, + {"sum", "sum", SumAggregation, false}, + {"avg", "avg", AvgAggregation, false}, + {"invalid", "invalid", NoAggregation, true}, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + result, err := AssignAggregationStrategy(tt.input) + if (err != nil) != tt.wantErr { + t.Errorf("AssignAggregationStrategy(%q) error = %v, wantErr %v", tt.input, err, tt.wantErr) + return + } + if result != tt.expected { + t.Errorf("AssignAggregationStrategy(%q) = %v, want %v", tt.input, result, tt.expected) + } + }) + } +} + +func TestBufferWrite(t *testing.T) { + b := newBuffer(100, 10) + + // Test writing value + nb, err := b.write(100, schema.Float(42.0)) + if err != nil { + t.Errorf("buffer.write() error = %v", err) + } + if nb != b { + t.Error("buffer.write() created new buffer unexpectedly") + } + if len(b.data) != 1 { + t.Errorf("buffer.write() len(data) = %d, want 1", len(b.data)) + } + if b.data[0] != schema.Float(42.0) { + t.Errorf("buffer.write() data[0] = %v, want 42.0", b.data[0]) + } + + // Test writing value from past (should error) + _, err = b.write(50, schema.Float(10.0)) + if err == nil { + t.Error("buffer.write() expected error for past timestamp") + } +} + +func TestBufferRead(t *testing.T) { + b := newBuffer(100, 10) + + // Write some test data + b.write(100, schema.Float(1.0)) + b.write(110, schema.Float(2.0)) + b.write(120, schema.Float(3.0)) + + // Read data + data := make([]schema.Float, 3) + result, from, to, err := b.read(100, 130, data) + if err != nil { + t.Errorf("buffer.read() error = %v", err) + 
} + // Buffer read should return from as firstWrite (start + freq/2) + if from != 100 { + t.Errorf("buffer.read() from = %d, want 100", from) + } + if to != 130 { + t.Errorf("buffer.read() to = %d, want 130", to) + } + if len(result) != 3 { + t.Errorf("buffer.read() len(result) = %d, want 3", len(result)) + } +} + +func TestHealthCheck(t *testing.T) { + // Create a test MemoryStore with some metrics + metrics := map[string]MetricConfig{ + "load": {Frequency: 10, Aggregation: AvgAggregation, offset: 0}, + "mem_used": {Frequency: 10, Aggregation: AvgAggregation, offset: 1}, + "cpu_user": {Frequency: 10, Aggregation: AvgAggregation, offset: 2}, + "cpu_system": {Frequency: 10, Aggregation: AvgAggregation, offset: 3}, + } + + ms := &MemoryStore{ + Metrics: metrics, + root: Level{ + metrics: make([]*buffer, len(metrics)), + children: make(map[string]*Level), + }, + } + + // Use recent timestamps (current time minus a small offset) + now := time.Now().Unix() + startTime := now - 100 // Start 100 seconds ago to have enough data points + + // Setup test data for node001 - all metrics healthy (recent data) + node001 := ms.root.findLevelOrCreate([]string{"testcluster", "node001"}, len(metrics)) + for i := 0; i < len(metrics); i++ { + node001.metrics[i] = newBuffer(startTime, 10) + // Write recent data up to now + for ts := startTime; ts <= now; ts += 10 { + node001.metrics[i].write(ts, schema.Float(float64(i+1))) + } + } + + // Setup test data for node002 - some metrics stale (old data beyond MaxMissingDataPoints threshold) + node002 := ms.root.findLevelOrCreate([]string{"testcluster", "node002"}, len(metrics)) + // MaxMissingDataPoints = 5, frequency = 10, so threshold is 50 seconds + staleTime := now - 100 // Data ends 100 seconds ago (well beyond 50 second threshold) + for i := 0; i < len(metrics); i++ { + node002.metrics[i] = newBuffer(staleTime-50, 10) + if i < 2 { + // First two metrics: healthy (recent data) + for ts := startTime; ts <= now; ts += 10 { + 
node002.metrics[i].write(ts, schema.Float(float64(i+1))) + } + } else { + // Last two metrics: stale (data ends 100 seconds ago) + for ts := staleTime - 50; ts <= staleTime; ts += 10 { + node002.metrics[i].write(ts, schema.Float(float64(i+1))) + } + } + } + + // Setup test data for node003 - some metrics missing (no buffer) + node003 := ms.root.findLevelOrCreate([]string{"testcluster", "node003"}, len(metrics)) + // Only create buffers for first two metrics + for i := range 2 { + node003.metrics[i] = newBuffer(startTime, 10) + for ts := startTime; ts <= now; ts += 10 { + node003.metrics[i].write(ts, schema.Float(float64(i+1))) + } + } + // Leave metrics[2] and metrics[3] as nil (missing) + + // Setup test data for node005 - all metrics stale + node005 := ms.root.findLevelOrCreate([]string{"testcluster", "node005"}, len(metrics)) + for i := 0; i < len(metrics); i++ { + node005.metrics[i] = newBuffer(staleTime-50, 10) + // All metrics have stale data (ends 100 seconds ago) + for ts := staleTime - 50; ts <= staleTime; ts += 10 { + node005.metrics[i].write(ts, schema.Float(float64(i+1))) + } + } + + // node004 doesn't exist at all + + tests := []struct { + name string + cluster string + nodes []string + expectedMetrics []string + wantStates map[string]schema.MonitoringState + }{ + { + name: "all metrics healthy", + cluster: "testcluster", + nodes: []string{"node001"}, + expectedMetrics: []string{"load", "mem_used", "cpu_user", "cpu_system"}, + wantStates: map[string]schema.MonitoringState{ + "node001": schema.MonitoringStateFull, + }, + }, + { + name: "some metrics stale", + cluster: "testcluster", + nodes: []string{"node002"}, + expectedMetrics: []string{"load", "mem_used", "cpu_user", "cpu_system"}, + wantStates: map[string]schema.MonitoringState{ + "node002": schema.MonitoringStatePartial, + }, + }, + { + name: "some metrics missing", + cluster: "testcluster", + nodes: []string{"node003"}, + expectedMetrics: []string{"load", "mem_used", "cpu_user", "cpu_system"}, + 
wantStates: map[string]schema.MonitoringState{ + "node003": schema.MonitoringStatePartial, + }, + }, + { + name: "node not found", + cluster: "testcluster", + nodes: []string{"node004"}, + expectedMetrics: []string{"load", "mem_used", "cpu_user", "cpu_system"}, + wantStates: map[string]schema.MonitoringState{ + "node004": schema.MonitoringStateFailed, + }, + }, + { + name: "all metrics stale", + cluster: "testcluster", + nodes: []string{"node005"}, + expectedMetrics: []string{"load", "mem_used", "cpu_user", "cpu_system"}, + wantStates: map[string]schema.MonitoringState{ + "node005": schema.MonitoringStateFailed, + }, + }, + { + name: "multiple nodes mixed states", + cluster: "testcluster", + nodes: []string{"node001", "node002", "node003", "node004", "node005"}, + expectedMetrics: []string{"load", "mem_used"}, + wantStates: map[string]schema.MonitoringState{ + "node001": schema.MonitoringStateFull, + "node002": schema.MonitoringStateFull, // Only checking first 2 metrics which are healthy + "node003": schema.MonitoringStateFull, // Only checking first 2 metrics which exist + "node004": schema.MonitoringStateFailed, // Node doesn't exist + "node005": schema.MonitoringStateFailed, // Both metrics are stale + }, + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + results, err := ms.HealthCheck(tt.cluster, tt.nodes, tt.expectedMetrics) + if err != nil { + t.Errorf("HealthCheck() error = %v", err) + return + } + + // Check that we got results for all nodes + if len(results) != len(tt.nodes) { + t.Errorf("HealthCheck() returned %d results, want %d", len(results), len(tt.nodes)) + } + + // Check each node's state + for _, node := range tt.nodes { + state, ok := results[node] + if !ok { + t.Errorf("HealthCheck() missing result for node %s", node) + continue + } + + // Check status + if wantStatus, ok := tt.wantStates[node]; ok { + if state.State != wantStatus { + t.Errorf("HealthCheck() node %s status = %v, want %v", node, state.State, 
wantStatus) + } + } + } + }) + } +} + +// TestGetHealthyMetrics tests the GetHealthyMetrics function which returns lists of missing and degraded metrics +func TestGetHealthyMetrics(t *testing.T) { + metrics := map[string]MetricConfig{ + "load": {Frequency: 10, Aggregation: AvgAggregation, offset: 0}, + "mem_used": {Frequency: 10, Aggregation: AvgAggregation, offset: 1}, + "cpu_user": {Frequency: 10, Aggregation: AvgAggregation, offset: 2}, + } + + ms := &MemoryStore{ + Metrics: metrics, + root: Level{ + metrics: make([]*buffer, len(metrics)), + children: make(map[string]*Level), + }, + } + + now := time.Now().Unix() + startTime := now - 100 + staleTime := now - 100 + + // Setup node with mixed health states + node := ms.root.findLevelOrCreate([]string{"testcluster", "testnode"}, len(metrics)) + + // Metric 0 (load): healthy - recent data + node.metrics[0] = newBuffer(startTime, 10) + for ts := startTime; ts <= now; ts += 10 { + node.metrics[0].write(ts, schema.Float(1.0)) + } + + // Metric 1 (mem_used): degraded - stale data + node.metrics[1] = newBuffer(staleTime-50, 10) + for ts := staleTime - 50; ts <= staleTime; ts += 10 { + node.metrics[1].write(ts, schema.Float(2.0)) + } + + // Metric 2 (cpu_user): missing - no buffer (nil) + + tests := []struct { + name string + selector []string + expectedMetrics []string + wantDegraded []string + wantMissing []string + wantErr bool + }{ + { + name: "mixed health states", + selector: []string{"testcluster", "testnode"}, + expectedMetrics: []string{"load", "mem_used", "cpu_user"}, + wantDegraded: []string{"mem_used"}, + wantMissing: []string{"cpu_user"}, + wantErr: false, + }, + { + name: "node not found", + selector: []string{"testcluster", "nonexistent"}, + expectedMetrics: []string{"load"}, + wantDegraded: nil, + wantMissing: nil, + wantErr: true, + }, + { + name: "check only healthy metric", + selector: []string{"testcluster", "testnode"}, + expectedMetrics: []string{"load"}, + wantDegraded: []string{}, + wantMissing: 
[]string{}, + wantErr: false, + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + degraded, missing, err := ms.GetHealthyMetrics(tt.selector, tt.expectedMetrics) + + if (err != nil) != tt.wantErr { + t.Errorf("GetHealthyMetrics() error = %v, wantErr %v", err, tt.wantErr) + return + } + + if tt.wantErr { + return + } + + // Check degraded list + if len(degraded) != len(tt.wantDegraded) { + t.Errorf("GetHealthyMetrics() degraded = %v, want %v", degraded, tt.wantDegraded) + } else { + for i, d := range tt.wantDegraded { + if degraded[i] != d { + t.Errorf("GetHealthyMetrics() degraded[%d] = %v, want %v", i, degraded[i], d) + } + } + } + + // Check missing list + if len(missing) != len(tt.wantMissing) { + t.Errorf("GetHealthyMetrics() missing = %v, want %v", missing, tt.wantMissing) + } else { + for i, m := range tt.wantMissing { + if missing[i] != m { + t.Errorf("GetHealthyMetrics() missing[%d] = %v, want %v", i, missing[i], m) + } + } + } + }) + } +} + +// TestBufferHealthChecks tests the buffer-level health check functions +func TestBufferHealthChecks(t *testing.T) { + now := time.Now().Unix() + + tests := []struct { + name string + setupBuffer func() *buffer + wantExists bool + wantHealthy bool + description string + }{ + { + name: "nil buffer", + setupBuffer: func() *buffer { + return nil + }, + wantExists: false, + wantHealthy: false, + description: "nil buffer should not exist and not be healthy", + }, + { + name: "empty buffer", + setupBuffer: func() *buffer { + b := newBuffer(now, 10) + b.data = nil + return b + }, + wantExists: false, + wantHealthy: false, + description: "empty buffer should not exist and not be healthy", + }, + { + name: "healthy buffer with recent data", + setupBuffer: func() *buffer { + b := newBuffer(now-30, 10) + // Write data up to now (within MaxMissingDataPoints * frequency = 50 seconds) + for ts := now - 30; ts <= now; ts += 10 { + b.write(ts, schema.Float(1.0)) + } + return b + }, + wantExists: true, + 
wantHealthy: true, + description: "buffer with recent data should be healthy", + }, + { + name: "stale buffer beyond threshold", + setupBuffer: func() *buffer { + b := newBuffer(now-200, 10) + // Write data that ends 100 seconds ago (beyond MaxMissingDataPoints * frequency = 50 seconds) + for ts := now - 200; ts <= now-100; ts += 10 { + b.write(ts, schema.Float(1.0)) + } + return b + }, + wantExists: true, + wantHealthy: false, + description: "buffer with stale data should exist but not be healthy", + }, + { + name: "buffer at threshold boundary", + setupBuffer: func() *buffer { + b := newBuffer(now-50, 10) + // Write data that ends exactly at threshold (MaxMissingDataPoints * frequency = 50 seconds) + for ts := now - 50; ts <= now-50; ts += 10 { + b.write(ts, schema.Float(1.0)) + } + return b + }, + wantExists: true, + wantHealthy: true, + description: "buffer at threshold boundary should still be healthy", + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + b := tt.setupBuffer() + + exists := b.bufferExists() + if exists != tt.wantExists { + t.Errorf("bufferExists() = %v, want %v: %s", exists, tt.wantExists, tt.description) + } + + if b != nil && b.data != nil && len(b.data) > 0 { + healthy := b.isBufferHealthy() + if healthy != tt.wantHealthy { + t.Errorf("isBufferHealthy() = %v, want %v: %s", healthy, tt.wantHealthy, tt.description) + } + } + }) + } +} + +func TestBufferPoolClean(t *testing.T) { + // Use a fresh pool for testing + pool := NewPersistentBufferPool() + + now := time.Now().Unix() + + // Create some buffers and put them in the pool with different lastUsed times + b1 := &buffer{lastUsed: now - 3600, data: make([]schema.Float, 0)} // 1 hour ago + b2 := &buffer{lastUsed: now - 7200, data: make([]schema.Float, 0)} // 2 hours ago + b3 := &buffer{lastUsed: now - 180000, data: make([]schema.Float, 0)} // 50 hours ago + b4 := &buffer{lastUsed: now - 200000, data: make([]schema.Float, 0)} // 55 hours ago + b5 := &buffer{lastUsed: 
now, data: make([]schema.Float, 0)} + + pool.Put(b1) + pool.Put(b2) + pool.Put(b3) + pool.Put(b4) + pool.Put(b5) + + if pool.GetSize() != 5 { + t.Fatalf("Expected pool size 5, got %d", pool.GetSize()) + } + + // Clean buffers older than 48 hours + timeUpdate := time.Now().Add(-48 * time.Hour).Unix() + pool.Clean(timeUpdate) + + // Expected: b1, b2, b5 should remain. b3, b4 should be cleaned. + if pool.GetSize() != 3 { + t.Fatalf("Expected pool size 3 after clean, got %d", pool.GetSize()) + } + + validBufs := map[int64]bool{ + b1.lastUsed: true, + b2.lastUsed: true, + b5.lastUsed: true, + } + + for i := 0; i < 3; i++ { + b := pool.Get() + if !validBufs[b.lastUsed] { + t.Errorf("Found unexpected buffer with lastUsed %d", b.lastUsed) + } + } + + if pool.GetSize() != 0 { + t.Fatalf("Expected pool to be empty, got %d", pool.GetSize()) + } +} diff --git a/pkg/metricstore/parquetArchive.go b/pkg/metricstore/parquetArchive.go new file mode 100644 index 00000000..420ee4e5 --- /dev/null +++ b/pkg/metricstore/parquetArchive.go @@ -0,0 +1,213 @@ +// Copyright (C) NHR@FAU, University Erlangen-Nuremberg. +// All rights reserved. This file is part of cc-backend. +// Use of this source code is governed by a MIT-style +// license that can be found in the LICENSE file. + +package metricstore + +import ( + "bufio" + "encoding/binary" + "encoding/json" + "fmt" + "os" + "path/filepath" + + cclog "github.com/ClusterCockpit/cc-lib/v2/ccLogger" + pq "github.com/parquet-go/parquet-go" +) + +// ParquetMetricRow is the long-format schema for archived metric data. +// One row per (host, metric, scope, scope_id, timestamp) data point. +// Sorted by (cluster, hostname, metric, timestamp) for optimal compression. 
+type ParquetMetricRow struct { + Cluster string `parquet:"cluster"` + Hostname string `parquet:"hostname"` + Metric string `parquet:"metric"` + Scope string `parquet:"scope"` + ScopeID string `parquet:"scope_id"` + Timestamp int64 `parquet:"timestamp"` + Frequency int64 `parquet:"frequency"` + Value float32 `parquet:"value"` +} + +// flattenCheckpointFile recursively converts a CheckpointFile tree into Parquet rows. +// The scope path is built from the hierarchy: host level is "node", then child names +// map to scope/scope_id (e.g., "socket0" → scope="socket", scope_id="0"). +func flattenCheckpointFile(cf *CheckpointFile, cluster, hostname, scope, scopeID string, rows []ParquetMetricRow) []ParquetMetricRow { + for metricName, cm := range cf.Metrics { + ts := cm.Start + for _, v := range cm.Data { + if !v.IsNaN() { + rows = append(rows, ParquetMetricRow{ + Cluster: cluster, + Hostname: hostname, + Metric: metricName, + Scope: scope, + ScopeID: scopeID, + Timestamp: ts, + Frequency: cm.Frequency, + Value: float32(v), + }) + } + ts += cm.Frequency + } + } + + for childName, childCf := range cf.Children { + childScope, childScopeID := parseScopeFromName(childName) + rows = flattenCheckpointFile(childCf, cluster, hostname, childScope, childScopeID, rows) + } + + return rows +} + +// parseScopeFromName infers scope and scope_id from a child level name. +// Examples: "socket0" → ("socket", "0"), "core12" → ("core", "12"), +// "a0" (accelerator) → ("accelerator", "0"). +// If the name doesn't match known patterns, it's used as-is for scope with empty scope_id. 
+func parseScopeFromName(name string) (string, string) { + prefixes := []struct { + prefix string + scope string + }{ + {"socket", "socket"}, + {"memoryDomain", "memoryDomain"}, + {"core", "core"}, + {"hwthread", "hwthread"}, + {"cpu", "hwthread"}, + {"accelerator", "accelerator"}, + } + + for _, p := range prefixes { + if len(name) > len(p.prefix) && name[:len(p.prefix)] == p.prefix { + id := name[len(p.prefix):] + if len(id) > 0 && id[0] >= '0' && id[0] <= '9' { + return p.scope, id + } + } + } + + return name, "" +} + +// writeParquetArchive writes rows to a Parquet file with Zstd compression. +func writeParquetArchive(filename string, rows []ParquetMetricRow) error { + if err := os.MkdirAll(filepath.Dir(filename), CheckpointDirPerms); err != nil { + return fmt.Errorf("creating archive directory: %w", err) + } + + f, err := os.OpenFile(filename, os.O_CREATE|os.O_WRONLY|os.O_TRUNC, CheckpointFilePerms) + if err != nil { + return fmt.Errorf("creating parquet file: %w", err) + } + defer f.Close() + + bw := bufio.NewWriterSize(f, 1<<20) // 1MB write buffer + + writer := pq.NewGenericWriter[ParquetMetricRow](bw, + pq.Compression(&pq.Zstd), + pq.SortingWriterConfig(pq.SortingColumns( + pq.Ascending("cluster"), + pq.Ascending("hostname"), + pq.Ascending("metric"), + pq.Ascending("timestamp"), + )), + ) + + if _, err := writer.Write(rows); err != nil { + return fmt.Errorf("writing parquet rows: %w", err) + } + + if err := writer.Close(); err != nil { + return fmt.Errorf("closing parquet writer: %w", err) + } + + if err := bw.Flush(); err != nil { + return fmt.Errorf("flushing parquet file: %w", err) + } + + return nil +} + +// loadCheckpointFileFromDisk reads a JSON or binary checkpoint file and returns +// a CheckpointFile. Used by the Parquet archiver to read checkpoint data +// before converting it to Parquet format. 
+func loadCheckpointFileFromDisk(filename string) (*CheckpointFile, error) { + f, err := os.Open(filename) + if err != nil { + return nil, err + } + defer f.Close() + + ext := filepath.Ext(filename) + switch ext { + case ".json": + cf := &CheckpointFile{} + br := bufio.NewReader(f) + if err := json.NewDecoder(br).Decode(cf); err != nil { + return nil, fmt.Errorf("decoding JSON checkpoint %s: %w", filename, err) + } + return cf, nil + + case ".bin": + br := bufio.NewReader(f) + var magic uint32 + if err := binary.Read(br, binary.LittleEndian, &magic); err != nil { + return nil, fmt.Errorf("reading magic from %s: %w", filename, err) + } + if magic != snapFileMagic { + return nil, fmt.Errorf("invalid snapshot magic in %s: 0x%08X", filename, magic) + } + var fileFrom, fileTo int64 + if err := binary.Read(br, binary.LittleEndian, &fileFrom); err != nil { + return nil, fmt.Errorf("reading from-timestamp from %s: %w", filename, err) + } + if err := binary.Read(br, binary.LittleEndian, &fileTo); err != nil { + return nil, fmt.Errorf("reading to-timestamp from %s: %w", filename, err) + } + cf, err := readBinaryLevel(br) + if err != nil { + return nil, fmt.Errorf("reading binary level from %s: %w", filename, err) + } + cf.From = fileFrom + cf.To = fileTo + return cf, nil + + default: + return nil, fmt.Errorf("unsupported checkpoint extension: %s", ext) + } +} + +// archiveCheckpointsToParquet reads checkpoint files for a host directory, +// converts them to Parquet rows. Returns the rows and filenames that were processed. 
+func archiveCheckpointsToParquet(dir, cluster, host string, from int64) ([]ParquetMetricRow, []string, error) { + entries, err := os.ReadDir(dir) + if err != nil { + return nil, nil, err + } + + files, err := findFiles(entries, from, false) + if err != nil { + return nil, nil, err + } + + if len(files) == 0 { + return nil, nil, nil + } + + var rows []ParquetMetricRow + + for _, checkpoint := range files { + filename := filepath.Join(dir, checkpoint) + cf, err := loadCheckpointFileFromDisk(filename) + if err != nil { + cclog.Warnf("[METRICSTORE]> skipping unreadable checkpoint %s: %v", filename, err) + continue + } + + rows = flattenCheckpointFile(cf, cluster, host, "node", "", rows) + } + + return rows, files, nil +} diff --git a/pkg/metricstore/parquetArchive_test.go b/pkg/metricstore/parquetArchive_test.go new file mode 100644 index 00000000..d3d70c02 --- /dev/null +++ b/pkg/metricstore/parquetArchive_test.go @@ -0,0 +1,255 @@ +// Copyright (C) NHR@FAU, University Erlangen-Nuremberg. +// All rights reserved. This file is part of cc-backend. +// Use of this source code is governed by a MIT-style +// license that can be found in the LICENSE file. 
+ +package metricstore + +import ( + "encoding/json" + "os" + "path/filepath" + "testing" + + "github.com/ClusterCockpit/cc-lib/v2/schema" + pq "github.com/parquet-go/parquet-go" +) + +func TestParseScopeFromName(t *testing.T) { + tests := []struct { + name string + wantScope string + wantID string + }{ + {"socket0", "socket", "0"}, + {"socket12", "socket", "12"}, + {"core0", "core", "0"}, + {"core127", "core", "127"}, + {"cpu0", "hwthread", "0"}, + {"hwthread5", "hwthread", "5"}, + {"memoryDomain0", "memoryDomain", "0"}, + {"accelerator0", "accelerator", "0"}, + {"unknown", "unknown", ""}, + {"socketX", "socketX", ""}, // not numeric suffix + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + scope, id := parseScopeFromName(tt.name) + if scope != tt.wantScope || id != tt.wantID { + t.Errorf("parseScopeFromName(%q) = (%q, %q), want (%q, %q)", + tt.name, scope, id, tt.wantScope, tt.wantID) + } + }) + } +} + +func TestFlattenCheckpointFile(t *testing.T) { + cf := &CheckpointFile{ + From: 1000, + To: 1060, + Metrics: map[string]*CheckpointMetrics{ + "cpu_load": { + Frequency: 60, + Start: 1000, + Data: []schema.Float{0.5, 0.7, schema.NaN}, + }, + }, + Children: map[string]*CheckpointFile{ + "socket0": { + Metrics: map[string]*CheckpointMetrics{ + "mem_bw": { + Frequency: 60, + Start: 1000, + Data: []schema.Float{100.0, schema.NaN, 200.0}, + }, + }, + Children: make(map[string]*CheckpointFile), + }, + }, + } + + rows := flattenCheckpointFile(cf, "fritz", "node001", "node", "", nil) + + // cpu_load: 2 non-NaN values at node scope + // mem_bw: 2 non-NaN values at socket0 scope + if len(rows) != 4 { + t.Fatalf("expected 4 rows, got %d", len(rows)) + } + + // Verify a node-scope row + found := false + for _, r := range rows { + if r.Metric == "cpu_load" && r.Timestamp == 1000 { + found = true + if r.Cluster != "fritz" || r.Hostname != "node001" || r.Scope != "node" || r.Value != 0.5 { + t.Errorf("unexpected row: %+v", r) + } + } + } + if !found { + 
t.Error("expected cpu_load row at timestamp 1000") + } + + // Verify a socket-scope row + found = false + for _, r := range rows { + if r.Metric == "mem_bw" && r.Scope == "socket" && r.ScopeID == "0" { + found = true + } + } + if !found { + t.Error("expected mem_bw row with scope=socket, scope_id=0") + } +} + +func TestParquetArchiveRoundtrip(t *testing.T) { + tmpDir := t.TempDir() + + // Create checkpoint files on disk (JSON format) + cpDir := filepath.Join(tmpDir, "checkpoints", "testcluster", "node001") + if err := os.MkdirAll(cpDir, 0o755); err != nil { + t.Fatal(err) + } + + cf := &CheckpointFile{ + From: 1000, + To: 1180, + Metrics: map[string]*CheckpointMetrics{ + "cpu_load": { + Frequency: 60, + Start: 1000, + Data: []schema.Float{0.5, 0.7, 0.9}, + }, + "mem_used": { + Frequency: 60, + Start: 1000, + Data: []schema.Float{45.0, 46.0, 47.0}, + }, + }, + Children: map[string]*CheckpointFile{ + "socket0": { + Metrics: map[string]*CheckpointMetrics{ + "mem_bw": { + Frequency: 60, + Start: 1000, + Data: []schema.Float{100.0, 110.0, 120.0}, + }, + }, + Children: make(map[string]*CheckpointFile), + }, + }, + } + + // Write JSON checkpoint + cpFile := filepath.Join(cpDir, "1000.json") + data, err := json.Marshal(cf) + if err != nil { + t.Fatal(err) + } + if err := os.WriteFile(cpFile, data, 0o644); err != nil { + t.Fatal(err) + } + + // Archive to Parquet + archiveDir := filepath.Join(tmpDir, "archive") + rows, files, err := archiveCheckpointsToParquet(cpDir, "testcluster", "node001", 2000) + if err != nil { + t.Fatal(err) + } + if len(files) != 1 || files[0] != "1000.json" { + t.Fatalf("expected 1 file, got %v", files) + } + + parquetFile := filepath.Join(archiveDir, "testcluster", "1000.parquet") + if err := writeParquetArchive(parquetFile, rows); err != nil { + t.Fatal(err) + } + + // Read back and verify + f, err := os.Open(parquetFile) + if err != nil { + t.Fatal(err) + } + defer f.Close() + + stat, _ := f.Stat() + pf, err := pq.OpenFile(f, stat.Size()) + if 
err != nil { + t.Fatal(err) + } + + reader := pq.NewGenericReader[ParquetMetricRow](pf) + readRows := make([]ParquetMetricRow, 100) + n, err := reader.Read(readRows) + if err != nil && n == 0 { + t.Fatal(err) + } + readRows = readRows[:n] + reader.Close() + + // We expect: cpu_load(3) + mem_used(3) + mem_bw(3) = 9 rows + if n != 9 { + t.Fatalf("expected 9 rows in parquet file, got %d", n) + } + + // Verify cluster and hostname are set correctly + for _, r := range readRows { + if r.Cluster != "testcluster" { + t.Errorf("expected cluster=testcluster, got %s", r.Cluster) + } + if r.Hostname != "node001" { + t.Errorf("expected hostname=node001, got %s", r.Hostname) + } + } + + // Verify parquet file is smaller than JSON (compression working) + if stat.Size() == 0 { + t.Error("parquet file is empty") + } + + t.Logf("Parquet file size: %d bytes for %d rows", stat.Size(), n) +} + +func TestLoadCheckpointFileFromDisk_JSON(t *testing.T) { + tmpDir := t.TempDir() + + cf := &CheckpointFile{ + From: 1000, + To: 1060, + Metrics: map[string]*CheckpointMetrics{ + "test_metric": { + Frequency: 60, + Start: 1000, + Data: []schema.Float{1.0, 2.0, 3.0}, + }, + }, + Children: make(map[string]*CheckpointFile), + } + + filename := filepath.Join(tmpDir, "1000.json") + data, err := json.Marshal(cf) + if err != nil { + t.Fatal(err) + } + if err := os.WriteFile(filename, data, 0o644); err != nil { + t.Fatal(err) + } + + loaded, err := loadCheckpointFileFromDisk(filename) + if err != nil { + t.Fatal(err) + } + + if loaded.From != 1000 || loaded.To != 1060 { + t.Errorf("expected From=1000, To=1060, got From=%d, To=%d", loaded.From, loaded.To) + } + + m, ok := loaded.Metrics["test_metric"] + if !ok { + t.Fatal("expected test_metric in loaded checkpoint") + } + if m.Frequency != 60 || m.Start != 1000 || len(m.Data) != 3 { + t.Errorf("unexpected metric data: %+v", m) + } +} diff --git a/pkg/metricstore/query.go b/pkg/metricstore/query.go new file mode 100644 index 00000000..8a349b5a --- 
/dev/null +++ b/pkg/metricstore/query.go @@ -0,0 +1,865 @@ +// Copyright (C) NHR@FAU, University Erlangen-Nuremberg. +// All rights reserved. This file is part of cc-backend. +// Use of this source code is governed by a MIT-style +// license that can be found in the LICENSE file. + +// This file implements high-level query functions for loading job metric data +// with automatic scope transformation and aggregation. +// +// Key Concepts: +// +// Metric Scopes: Metrics are collected at different granularities (native scope): +// - HWThread: Per hardware thread +// - Core: Per CPU core +// - Socket: Per CPU socket +// - MemoryDomain: Per memory domain (NUMA) +// - Accelerator: Per GPU/accelerator +// - Node: Per compute node +// +// Scope Transformation: The buildQueries functions transform between native scope +// and requested scope by: +// - Aggregating finer-grained data (e.g., HWThread → Core → Socket → Node) +// - Rejecting requests for finer granularity than available +// - Handling special cases (e.g., Accelerator metrics) +// +// Query Building: Constructs APIQuery structures with proper selectors (Type, TypeIds) +// based on cluster topology and job resources. +package metricstore + +import ( + "context" + "fmt" + "strings" + "time" + + "github.com/ClusterCockpit/cc-backend/pkg/archive" + cclog "github.com/ClusterCockpit/cc-lib/v2/ccLogger" + "github.com/ClusterCockpit/cc-lib/v2/schema" +) + +type InternalMetricStore struct{} + +var MetricStoreHandle *InternalMetricStore + +// HealthCheck delegates to the internal MemoryStore's HealthCheck. +func (ccms *InternalMetricStore) HealthCheck(cluster string, + nodes []string, metrics []string, +) (map[string]HealthCheckResult, error) { + return GetMemoryStore().HealthCheck(cluster, nodes, metrics) +} + +// TestLoadDataCallback allows tests to override LoadData behavior for testing purposes. +// When set to a non-nil function, LoadData will call this function instead of the default implementation. 
+var TestLoadDataCallback func(job *schema.Job, metrics []string, scopes []schema.MetricScope, ctx context.Context, resolution int) (schema.JobData, error) + +// LoadData loads metric data for a specific job with automatic scope transformation. +// +// This is the primary function for retrieving job metric data. It handles: +// - Building queries with scope transformation via buildQueries +// - Fetching data from the metric store +// - Organizing results by metric and scope +// - Converting NaN statistics to 0 for JSON compatibility +// - Partial error handling (returns data for successful queries even if some fail) +// +// Parameters: +// - job: Job metadata including cluster, resources, and time range +// - metrics: List of metric names to load +// - scopes: Requested metric scopes (will be transformed to match native scopes) +// - ctx: Context for cancellation (currently unused but reserved for future use) +// - resolution: Data resolution in seconds (0 for native resolution) +// +// Returns: +// - JobData: Map of metric → scope → JobMetric with time-series data and statistics +// - Error: Returns error if query building or fetching fails, or partial error listing failed hosts +// +// Example: +// +// jobData, err := LoadData(job, []string{"cpu_load", "mem_used"}, []schema.MetricScope{schema.MetricScopeNode}, ctx, 60) +func (ccms *InternalMetricStore) LoadData( + job *schema.Job, + metrics []string, + scopes []schema.MetricScope, + ctx context.Context, + resolution int, +) (schema.JobData, error) { + if TestLoadDataCallback != nil { + return TestLoadDataCallback(job, metrics, scopes, ctx, resolution) + } + + queries, assignedScope, err := buildQueries(job, metrics, scopes, int64(resolution)) + if err != nil { + cclog.Errorf("Error while building queries for jobId %d, Metrics %v, Scopes %v: %s", job.JobID, metrics, scopes, err.Error()) + return nil, err + } + + // Verify assignment is correct - log any inconsistencies for debugging + if len(queries) != 
len(assignedScope) { + cclog.Errorf("Critical error: queries and assignedScope have different lengths after buildQueries: %d vs %d", + len(queries), len(assignedScope)) + } + + req := APIQueryRequest{ + Cluster: job.Cluster, + From: job.StartTime, + To: job.StartTime + int64(job.Duration), + Queries: queries, + WithStats: true, + WithData: true, + } + + resBody, err := FetchData(req) + if err != nil { + cclog.Errorf("Error while fetching data : %s", err.Error()) + return nil, err + } + + var errors []string + jobData := make(schema.JobData) + + // Add safety check for potential index out of range errors + if len(resBody.Results) != len(req.Queries) || len(assignedScope) != len(req.Queries) { + cclog.Warnf("Mismatch in query results count: queries=%d, results=%d, assignedScope=%d", + len(req.Queries), len(resBody.Results), len(assignedScope)) + if len(resBody.Results) > len(req.Queries) { + resBody.Results = resBody.Results[:len(req.Queries)] + } + if len(assignedScope) > len(req.Queries) { + assignedScope = assignedScope[:len(req.Queries)] + } + } + + for i, row := range resBody.Results { + query := req.Queries[i] + metric := query.Metric + scope := assignedScope[i] + mc := archive.GetMetricConfig(job.Cluster, metric) + + if mc == nil { + cclog.Warnf("Metric config not found for %s on cluster %s", metric, job.Cluster) + continue + } + + if _, ok := jobData[metric]; !ok { + jobData[metric] = make(map[schema.MetricScope]*schema.JobMetric) + } + + res := mc.Timestep + if len(row) > 0 { + res = int(row[0].Resolution) + } + + jobMetric, ok := jobData[metric][scope] + if !ok { + jobMetric = &schema.JobMetric{ + Unit: mc.Unit, + Timestep: res, + Series: make([]schema.Series, 0), + } + jobData[metric][scope] = jobMetric + } + + for ndx, res := range row { + if res.Error != nil { + /* Build list for "partial errors", if any */ + errors = append(errors, fmt.Sprintf("failed to fetch '%s' from host '%s': %s", query.Metric, query.Hostname, *res.Error)) + continue + } + + id := 
ExtractTypeID(query.Type, query.TypeIds, ndx, query.Metric, query.Hostname) + + SanitizeStats(&res.Avg, &res.Min, &res.Max) + + jobMetric.Series = append(jobMetric.Series, schema.Series{ + Hostname: query.Hostname, + ID: id, + Statistics: schema.MetricStatistics{ + Avg: float64(res.Avg), + Min: float64(res.Min), + Max: float64(res.Max), + }, + Data: res.Data, + }) + } + + // So that one can later check len(jobData): + if len(jobMetric.Series) == 0 { + delete(jobData[metric], scope) + if len(jobData[metric]) == 0 { + delete(jobData, metric) + } + } + } + + if len(errors) != 0 { + /* Returns list for "partial errors" */ + return jobData, fmt.Errorf("METRICDATA/INTERNAL-CCMS > Errors: %s", strings.Join(errors, ", ")) + } + return jobData, nil +} + +// buildQueries constructs APIQuery structures with automatic scope transformation for a job. +// +// This function implements the core scope transformation logic, handling all combinations of +// native metric scopes and requested scopes. It uses the cluster topology to determine which +// hardware IDs to include in each query. 
+// +// Scope Transformation Rules: +// - If native scope >= requested scope: Aggregates data (Aggregate=true in APIQuery) +// - If native scope < requested scope: Returns error (cannot increase granularity) +// - Special handling for Accelerator scope (independent of CPU hierarchy) +// +// The function generates one or more APIQuery per (metric, scope, host) combination: +// - For non-aggregated queries: One query with all relevant IDs +// - For aggregated queries: May generate multiple queries (e.g., one per socket/core) +// +// Parameters: +// - job: Job metadata including cluster, subcluster, and resource allocation +// - metrics: List of metrics to query +// - scopes: Requested scopes for each metric +// - resolution: Data resolution in seconds +// +// Returns: +// - []APIQuery: List of queries to execute +// - []schema.MetricScope: Assigned scope for each query (after transformation) +// - error: Returns error if topology lookup fails or unhandled scope combination encountered +func buildQueries( + job *schema.Job, + metrics []string, + scopes []schema.MetricScope, + resolution int64, +) ([]APIQuery, []schema.MetricScope, error) { + if len(job.Resources) == 0 { + return nil, nil, fmt.Errorf("METRICDATA/INTERNAL-CCMS > no resources allocated for job %d", job.JobID) + } + + queries := make([]APIQuery, 0, len(metrics)*len(scopes)*len(job.Resources)) + assignedScope := make([]schema.MetricScope, 0, len(metrics)*len(scopes)*len(job.Resources)) + + subcluster, scerr := archive.GetSubCluster(job.Cluster, job.SubCluster) + if scerr != nil { + return nil, nil, scerr + } + topology := subcluster.Topology + + for _, metric := range metrics { + mc := archive.GetMetricConfig(job.Cluster, metric) + if mc == nil { + cclog.Warnf("metric '%s' is not specified for cluster '%s'", metric, job.Cluster) + continue + } + + // Skip if metric is removed for subcluster + if len(mc.SubClusters) != 0 && IsMetricRemovedForSubCluster(mc, job.SubCluster) { + continue + } + + // Avoid 
duplicates... + handledScopes := make([]schema.MetricScope, 0, 3) + + scopesLoop: + for _, requestedScope := range scopes { + nativeScope := mc.Scope + if nativeScope == schema.MetricScopeAccelerator && job.NumAcc == 0 { + continue + } + + scope := nativeScope.Max(requestedScope) + for _, s := range handledScopes { + if scope == s { + continue scopesLoop + } + } + handledScopes = append(handledScopes, scope) + + for _, host := range job.Resources { + hwthreads := host.HWThreads + if hwthreads == nil { + hwthreads = topology.Node + } + + scopeResults, ok := BuildScopeQueries( + nativeScope, requestedScope, + metric, host.Hostname, + &topology, hwthreads, host.Accelerators, + ) + + if !ok { + return nil, nil, fmt.Errorf("METRICDATA/INTERNAL-CCMS > unsupported scope transformation: native-scope=%s, requested-scope=%s", nativeScope, requestedScope) + } + + for _, sr := range scopeResults { + queries = append(queries, APIQuery{ + Metric: sr.Metric, + Hostname: sr.Hostname, + Aggregate: sr.Aggregate, + Type: sr.Type, + TypeIds: sr.TypeIds, + Resolution: resolution, + }) + assignedScope = append(assignedScope, sr.Scope) + } + } + } + } + + return queries, assignedScope, nil +} + +// LoadStats loads only metric statistics (avg/min/max) for a job at node scope. +// +// This is an optimized version of LoadData that fetches only statistics without +// time-series data, reducing bandwidth and memory usage. Always queries at node scope. 
+// +// Parameters: +// - job: Job metadata +// - metrics: List of metric names +// - ctx: Context (currently unused) +// +// Returns: +// - Map of metric → hostname → statistics +// - Error on query building or fetching failure +func (ccms *InternalMetricStore) LoadStats( + job *schema.Job, + metrics []string, + ctx context.Context, +) (map[string]map[string]schema.MetricStatistics, error) { + // TODO(#166): Add scope parameter for analysis view accelerator normalization + queries, _, err := buildQueries(job, metrics, []schema.MetricScope{schema.MetricScopeNode}, 0) + if err != nil { + cclog.Errorf("Error while building queries for jobId %d, Metrics %v: %s", job.JobID, metrics, err.Error()) + return nil, err + } + + req := APIQueryRequest{ + Cluster: job.Cluster, + From: job.StartTime, + To: job.StartTime + int64(job.Duration), + Queries: queries, + WithStats: true, + WithData: false, + } + + resBody, err := FetchData(req) + if err != nil { + cclog.Errorf("Error while fetching data : %s", err.Error()) + return nil, err + } + + stats := make(map[string]map[string]schema.MetricStatistics, len(metrics)) + for i, res := range resBody.Results { + if i >= len(req.Queries) { + cclog.Warnf("LoadStats: result index %d exceeds queries length %d", i, len(req.Queries)) + break + } + if len(res) == 0 { + // No Data Found For Metric, Logged in FetchData to Warn + continue + } + query := req.Queries[i] + metric := query.Metric + data := res[0] + if data.Error != nil { + cclog.Warnf("fetching %s for node %s failed: %s", metric, query.Hostname, *data.Error) + continue + } + + metricdata, ok := stats[metric] + if !ok { + metricdata = make(map[string]schema.MetricStatistics, job.NumNodes) + stats[metric] = metricdata + } + + if data.Avg.IsNaN() || data.Min.IsNaN() || data.Max.IsNaN() { + cclog.Warnf("fetching %s for node %s failed: one of avg/min/max is NaN", metric, query.Hostname) + continue + } + + metricdata[query.Hostname] = schema.MetricStatistics{ + Avg: float64(data.Avg), + 
Min: float64(data.Min), + Max: float64(data.Max), + } + } + + return stats, nil +} + +// LoadScopedStats loads metric statistics for a job with scope-aware grouping. +// +// Similar to LoadStats but supports multiple scopes and returns statistics grouped +// by scope with hardware IDs (e.g., per-core, per-socket statistics). +// +// Parameters: +// - job: Job metadata +// - metrics: List of metric names +// - scopes: Requested metric scopes +// - ctx: Context (currently unused) +// +// Returns: +// - ScopedJobStats: Map of metric → scope → []ScopedStats (with hostname and ID) +// - Error or partial error listing failed queries +func (ccms *InternalMetricStore) LoadScopedStats( + job *schema.Job, + metrics []string, + scopes []schema.MetricScope, + ctx context.Context, +) (schema.ScopedJobStats, error) { + queries, assignedScope, err := buildQueries(job, metrics, scopes, 0) + if err != nil { + cclog.Errorf("Error while building queries for jobId %d, Metrics %v, Scopes %v: %s", job.JobID, metrics, scopes, err.Error()) + return nil, err + } + + req := APIQueryRequest{ + Cluster: job.Cluster, + From: job.StartTime, + To: job.StartTime + int64(job.Duration), + Queries: queries, + WithStats: true, + WithData: false, + } + + resBody, err := FetchData(req) + if err != nil { + cclog.Errorf("Error while fetching data : %s", err.Error()) + return nil, err + } + + var errors []string + scopedJobStats := make(schema.ScopedJobStats) + + for i, row := range resBody.Results { + if len(row) == 0 { + // No Data Found For Metric, Logged in FetchData to Warn + continue + } + query := req.Queries[i] + metric := query.Metric + scope := assignedScope[i] + + if _, ok := scopedJobStats[metric]; !ok { + scopedJobStats[metric] = make(map[schema.MetricScope][]*schema.ScopedStats) + } + + if _, ok := scopedJobStats[metric][scope]; !ok { + scopedJobStats[metric][scope] = make([]*schema.ScopedStats, 0) + } + + for ndx, res := range row { + if res.Error != nil { + /* Build list for "partial 
errors", if any */ + errors = append(errors, fmt.Sprintf("failed to fetch '%s' from host '%s': %s", query.Metric, query.Hostname, *res.Error)) + continue + } + + id := ExtractTypeID(query.Type, query.TypeIds, ndx, query.Metric, query.Hostname) + + SanitizeStats(&res.Avg, &res.Min, &res.Max) + + scopedJobStats[metric][scope] = append(scopedJobStats[metric][scope], &schema.ScopedStats{ + Hostname: query.Hostname, + ID: id, + Data: &schema.MetricStatistics{ + Avg: float64(res.Avg), + Min: float64(res.Min), + Max: float64(res.Max), + }, + }) + } + + // So that one can later check len(scopedJobStats[metric][scope]): Remove from map if empty + if len(scopedJobStats[metric][scope]) == 0 { + delete(scopedJobStats[metric], scope) + if len(scopedJobStats[metric]) == 0 { + delete(scopedJobStats, metric) + } + } + } + + if len(errors) != 0 { + /* Returns list for "partial errors" */ + return scopedJobStats, fmt.Errorf("METRICDATA/INTERNAL-CCMS > Errors: %s", strings.Join(errors, ", ")) + } + return scopedJobStats, nil +} + +// LoadNodeData loads metric data for specific nodes in a cluster over a time range. +// +// Unlike LoadData which operates on job resources, this function queries arbitrary nodes +// directly. Useful for system monitoring and node status views. 
+// +// Parameters: +// - cluster: Cluster name +// - metrics: List of metric names +// - nodes: List of node hostnames (nil = all nodes in cluster via ForAllNodes) +// - scopes: Requested metric scopes (currently unused - always node scope) +// - from, to: Time range +// - ctx: Context (currently unused) +// +// Returns: +// - Map of hostname → metric → []JobMetric +// - Error or partial error listing failed queries +func (ccms *InternalMetricStore) LoadNodeData( + cluster string, + metrics, nodes []string, + scopes []schema.MetricScope, + from, to time.Time, + ctx context.Context, +) (map[string]map[string][]*schema.JobMetric, error) { + req := APIQueryRequest{ + Cluster: cluster, + From: from.Unix(), + To: to.Unix(), + WithStats: true, + WithData: true, + } + + if nodes == nil { + req.ForAllNodes = append(req.ForAllNodes, metrics...) + } else { + for _, node := range nodes { + for _, metric := range metrics { + req.Queries = append(req.Queries, APIQuery{ + Hostname: node, + Metric: metric, + Resolution: 0, // Default for Node Queries: Will return metric $Timestep Resolution + }) + } + } + } + + resBody, err := FetchData(req) + if err != nil { + cclog.Errorf("Error while fetching data : %s", err.Error()) + return nil, err + } + + var errors []string + data := make(map[string]map[string][]*schema.JobMetric) + for i, res := range resBody.Results { + if len(res) == 0 { + // No Data Found For Metric, Logged in FetchData to Warn + continue + } + + var query APIQuery + if resBody.Queries != nil { + query = resBody.Queries[i] + } else { + query = req.Queries[i] + } + + metric := query.Metric + qdata := res[0] + if qdata.Error != nil { + errors = append(errors, fmt.Sprintf("fetching %s for node %s failed: %s", metric, query.Hostname, *qdata.Error)) + continue + } + + mc := archive.GetMetricConfig(cluster, metric) + if mc == nil { + cclog.Warnf("Metric config not found for %s on cluster %s", metric, cluster) + continue + } + + SanitizeStats(&qdata.Avg, &qdata.Min, 
&qdata.Max) + + hostdata, ok := data[query.Hostname] + if !ok { + hostdata = make(map[string][]*schema.JobMetric) + data[query.Hostname] = hostdata + } + + hostdata[metric] = append(hostdata[metric], &schema.JobMetric{ + Unit: mc.Unit, + Timestep: mc.Timestep, + Series: []schema.Series{ + { + Hostname: query.Hostname, + Data: qdata.Data, + Statistics: schema.MetricStatistics{ + Avg: float64(qdata.Avg), + Min: float64(qdata.Min), + Max: float64(qdata.Max), + }, + }, + }, + }) + } + + if len(errors) != 0 { + /* Returns list of "partial errors" */ + return data, fmt.Errorf("METRICDATA/INTERNAL-CCMS > Errors: %s", strings.Join(errors, ", ")) + } + + return data, nil +} + +// LoadNodeListData loads metric data for a list of nodes with full scope transformation support. +// +// This is the most flexible node data loading function, supporting arbitrary scopes and +// resolution. Uses buildNodeQueries for proper scope transformation based on topology. +// +// Parameters: +// - cluster: Cluster name +// - subCluster: SubCluster name (empty string to infer from node names) +// - nodes: List of node hostnames +// - metrics: List of metric names +// - scopes: Requested metric scopes +// - resolution: Data resolution in seconds +// - from, to: Time range +// - ctx: Context (currently unused) +// +// Returns: +// - Map of hostname → JobData (metric → scope → JobMetric) +// - Error or partial error listing failed queries +func (ccms *InternalMetricStore) LoadNodeListData( + cluster, subCluster string, + nodes []string, + metrics []string, + scopes []schema.MetricScope, + resolution int, + from, to time.Time, + ctx context.Context, +) (map[string]schema.JobData, error) { + // Note: Order of node data is not guaranteed after this point + queries, assignedScope, err := buildNodeQueries(cluster, subCluster, nodes, metrics, scopes, int64(resolution)) + if err != nil { + cclog.Errorf("Error while building node queries for Cluster %s, SubCLuster %s, Metrics %v, Scopes %v: %s", cluster, 
subCluster, metrics, scopes, err.Error()) + return nil, err + } + + // Verify assignment is correct - log any inconsistencies for debugging + if len(queries) != len(assignedScope) { + cclog.Errorf("Critical error: queries and assignedScope have different lengths after buildNodeQueries: %d vs %d", + len(queries), len(assignedScope)) + } + + req := APIQueryRequest{ + Cluster: cluster, + Queries: queries, + From: from.Unix(), + To: to.Unix(), + WithStats: true, + WithData: true, + } + + resBody, err := FetchData(req) + if err != nil { + cclog.Errorf("Error while fetching data : %s", err.Error()) + return nil, err + } + + var errors []string + data := make(map[string]schema.JobData) + + // Add safety check for index out of range issues + if len(resBody.Results) != len(req.Queries) || len(assignedScope) != len(req.Queries) { + cclog.Warnf("Mismatch in query results count: queries=%d, results=%d, assignedScope=%d", + len(req.Queries), len(resBody.Results), len(assignedScope)) + if len(resBody.Results) > len(req.Queries) { + resBody.Results = resBody.Results[:len(req.Queries)] + } + if len(assignedScope) > len(req.Queries) { + assignedScope = assignedScope[:len(req.Queries)] + } + } + + for i, row := range resBody.Results { + var query APIQuery + if resBody.Queries != nil { + if i < len(resBody.Queries) { + query = resBody.Queries[i] + } else { + cclog.Warnf("Index out of range prevented for resBody.Queries: i=%d, len=%d", + i, len(resBody.Queries)) + continue + } + } else { + query = req.Queries[i] + } + + metric := query.Metric + scope := assignedScope[i] + mc := archive.GetMetricConfig(cluster, metric) + if mc == nil { + cclog.Warnf("Metric config not found for %s on cluster %s", metric, cluster) + continue + } + + res := mc.Timestep + if len(row) > 0 { + res = int(row[0].Resolution) + } + + // Init Nested Map Data Structures If Not Found + hostData, ok := data[query.Hostname] + if !ok { + hostData = make(schema.JobData) + data[query.Hostname] = hostData + } + + 
metricData, ok := hostData[metric] + if !ok { + metricData = make(map[schema.MetricScope]*schema.JobMetric) + data[query.Hostname][metric] = metricData + } + + scopeData, ok := metricData[scope] + if !ok { + scopeData = &schema.JobMetric{ + Unit: mc.Unit, + Timestep: res, + Series: make([]schema.Series, 0), + } + data[query.Hostname][metric][scope] = scopeData + } + + for ndx, res := range row { + if res.Error != nil { + /* Build list for "partial errors", if any */ + errors = append(errors, fmt.Sprintf("failed to fetch '%s' from host '%s': %s", query.Metric, query.Hostname, *res.Error)) + continue + } + + id := ExtractTypeID(query.Type, query.TypeIds, ndx, query.Metric, query.Hostname) + + SanitizeStats(&res.Avg, &res.Min, &res.Max) + + scopeData.Series = append(scopeData.Series, schema.Series{ + Hostname: query.Hostname, + ID: id, + Statistics: schema.MetricStatistics{ + Avg: float64(res.Avg), + Min: float64(res.Min), + Max: float64(res.Max), + }, + Data: res.Data, + }) + } + } + + if len(errors) != 0 { + /* Returns list of "partial errors" */ + return data, fmt.Errorf("METRICDATA/INTERNAL-CCMS > Errors: %s", strings.Join(errors, ", ")) + } + + return data, nil +} + +// buildNodeQueries constructs APIQuery structures for node-based queries with scope transformation. +// +// Similar to buildQueries but operates on node lists rather than job resources. +// Supports dynamic subcluster lookup when subCluster parameter is empty. 
+// +// Parameters: +// - cluster: Cluster name +// - subCluster: SubCluster name (empty = infer from node hostnames) +// - nodes: List of node hostnames +// - metrics: List of metric names +// - scopes: Requested metric scopes +// - resolution: Data resolution in seconds +// +// Returns: +// - []APIQuery: List of queries to execute +// - []schema.MetricScope: Assigned scope for each query +// - error: Returns error if topology lookup fails or unhandled scope combination +func buildNodeQueries( + cluster string, + subCluster string, + nodes []string, + metrics []string, + scopes []schema.MetricScope, + resolution int64, +) ([]APIQuery, []schema.MetricScope, error) { + queries := make([]APIQuery, 0, len(metrics)*len(scopes)*len(nodes)) + assignedScope := make([]schema.MetricScope, 0, len(metrics)*len(scopes)*len(nodes)) + + // Get Topol before loop if subCluster given + var subClusterTopol *schema.SubCluster + var scterr error + if subCluster != "" { + subClusterTopol, scterr = archive.GetSubCluster(cluster, subCluster) + if scterr != nil { + cclog.Errorf("could not load cluster %s subCluster %s topology: %s", cluster, subCluster, scterr.Error()) + return nil, nil, scterr + } + } + + for _, metric := range metrics { + mc := archive.GetMetricConfig(cluster, metric) + if mc == nil { + cclog.Warnf("metric '%s' is not specified for cluster '%s'", metric, cluster) + continue + } + + // Skip if metric is removed for subcluster + if mc.SubClusters != nil && IsMetricRemovedForSubCluster(mc, subCluster) { + continue + } + + // Avoid duplicates... 
+ handledScopes := make([]schema.MetricScope, 0, 3) + + nodeScopesLoop: + for _, requestedScope := range scopes { + nativeScope := mc.Scope + + scope := nativeScope.Max(requestedScope) + for _, s := range handledScopes { + if scope == s { + continue nodeScopesLoop + } + } + handledScopes = append(handledScopes, scope) + + for _, hostname := range nodes { + + // If no subCluster given, get it by node + if subCluster == "" { + subClusterName, scnerr := archive.GetSubClusterByNode(cluster, hostname) + if scnerr != nil { + return nil, nil, scnerr + } + subClusterTopol, scterr = archive.GetSubCluster(cluster, subClusterName) + if scterr != nil { + return nil, nil, scterr + } + } + + // Always full node hwthread id list, no partial queries expected + topology := subClusterTopol.Topology + acceleratorIds := topology.GetAcceleratorIDs() + + // Moved check here if metric matches hardware specs + if nativeScope == schema.MetricScopeAccelerator && len(acceleratorIds) == 0 { + continue + } + + scopeResults, ok := BuildScopeQueries( + nativeScope, requestedScope, + metric, hostname, + &topology, topology.Node, acceleratorIds, + ) + + if !ok { + return nil, nil, fmt.Errorf("METRICDATA/INTERNAL-CCMS > unsupported scope transformation: native-scope=%s, requested-scope=%s", nativeScope, requestedScope) + } + + for _, sr := range scopeResults { + queries = append(queries, APIQuery{ + Metric: sr.Metric, + Hostname: sr.Hostname, + Aggregate: sr.Aggregate, + Type: sr.Type, + TypeIds: sr.TypeIds, + Resolution: resolution, + }) + assignedScope = append(assignedScope, sr.Scope) + } + } + } + } + + return queries, assignedScope, nil +} diff --git a/pkg/metricstore/scopequery.go b/pkg/metricstore/scopequery.go new file mode 100644 index 00000000..a01a9cc6 --- /dev/null +++ b/pkg/metricstore/scopequery.go @@ -0,0 +1,341 @@ +// Copyright (C) NHR@FAU, University Erlangen-Nuremberg. +// All rights reserved. This file is part of cc-backend. 
+// Use of this source code is governed by a MIT-style +// license that can be found in the LICENSE file. + +// This file contains shared scope transformation logic used by both the internal +// metric store (pkg/metricstore) and the external cc-metric-store client +// (internal/metricstoreclient). It extracts the common algorithm for mapping +// between native metric scopes and requested scopes based on cluster topology. +package metricstore + +import ( + "strconv" + + cclog "github.com/ClusterCockpit/cc-lib/v2/ccLogger" + "github.com/ClusterCockpit/cc-lib/v2/schema" +) + +// Pre-converted scope strings avoid repeated string(MetricScope) allocations +// during query construction. Used in ScopeQueryResult.Type field. +var ( + HWThreadString = string(schema.MetricScopeHWThread) + CoreString = string(schema.MetricScopeCore) + MemoryDomainString = string(schema.MetricScopeMemoryDomain) + SocketString = string(schema.MetricScopeSocket) + AcceleratorString = string(schema.MetricScopeAccelerator) +) + +// ScopeQueryResult is a package-independent intermediate type returned by +// BuildScopeQueries. Each consumer converts it to their own APIQuery type +// (adding Resolution and any other package-specific fields). +type ScopeQueryResult struct { + Type *string + Metric string + Hostname string + TypeIds []string + Scope schema.MetricScope + Aggregate bool +} + +// BuildScopeQueries generates scope query results for a given scope transformation. +// It returns a slice of results and a boolean indicating success. +// An empty slice means an expected exception (skip this combination). +// ok=false means an unhandled case (caller should return an error). 
+func BuildScopeQueries( + nativeScope, requestedScope schema.MetricScope, + metric, hostname string, + topology *schema.Topology, + hwthreads []int, + accelerators []string, +) ([]ScopeQueryResult, bool) { + scope := nativeScope.Max(requestedScope) + results := []ScopeQueryResult{} + + hwthreadsStr := IntToStringSlice(hwthreads) + + // Accelerator -> Accelerator (Use "accelerator" scope if requested scope is lower than node) + if nativeScope == schema.MetricScopeAccelerator && scope.LT(schema.MetricScopeNode) { + if scope != schema.MetricScopeAccelerator { + // Expected Exception -> Return Empty Slice + return results, true + } + + results = append(results, ScopeQueryResult{ + Metric: metric, + Hostname: hostname, + Aggregate: false, + Type: &AcceleratorString, + TypeIds: accelerators, + Scope: schema.MetricScopeAccelerator, + }) + return results, true + } + + // Accelerator -> Node + if nativeScope == schema.MetricScopeAccelerator && scope == schema.MetricScopeNode { + if len(accelerators) == 0 { + // Expected Exception -> Return Empty Slice + return results, true + } + + results = append(results, ScopeQueryResult{ + Metric: metric, + Hostname: hostname, + Aggregate: true, + Type: &AcceleratorString, + TypeIds: accelerators, + Scope: scope, + }) + return results, true + } + + // HWThread -> HWThread + if nativeScope == schema.MetricScopeHWThread && scope == schema.MetricScopeHWThread { + results = append(results, ScopeQueryResult{ + Metric: metric, + Hostname: hostname, + Aggregate: false, + Type: &HWThreadString, + TypeIds: hwthreadsStr, + Scope: scope, + }) + return results, true + } + + // HWThread -> Core + if nativeScope == schema.MetricScopeHWThread && scope == schema.MetricScopeCore { + cores, _ := topology.GetCoresFromHWThreads(hwthreads) + for _, core := range cores { + results = append(results, ScopeQueryResult{ + Metric: metric, + Hostname: hostname, + Aggregate: true, + Type: &HWThreadString, + TypeIds: IntToStringSlice(topology.Core[core]), + Scope: 
scope, + }) + } + return results, true + } + + // HWThread -> Socket + if nativeScope == schema.MetricScopeHWThread && scope == schema.MetricScopeSocket { + sockets, _ := topology.GetSocketsFromHWThreads(hwthreads) + for _, socket := range sockets { + results = append(results, ScopeQueryResult{ + Metric: metric, + Hostname: hostname, + Aggregate: true, + Type: &HWThreadString, + TypeIds: IntToStringSlice(topology.Socket[socket]), + Scope: scope, + }) + } + return results, true + } + + // HWThread -> Node + if nativeScope == schema.MetricScopeHWThread && scope == schema.MetricScopeNode { + results = append(results, ScopeQueryResult{ + Metric: metric, + Hostname: hostname, + Aggregate: true, + Type: &HWThreadString, + TypeIds: hwthreadsStr, + Scope: scope, + }) + return results, true + } + + // Core -> Core + if nativeScope == schema.MetricScopeCore && scope == schema.MetricScopeCore { + cores, _ := topology.GetCoresFromHWThreads(hwthreads) + results = append(results, ScopeQueryResult{ + Metric: metric, + Hostname: hostname, + Aggregate: false, + Type: &CoreString, + TypeIds: IntToStringSlice(cores), + Scope: scope, + }) + return results, true + } + + // Core -> Socket + if nativeScope == schema.MetricScopeCore && scope == schema.MetricScopeSocket { + sockets, _ := topology.GetSocketsFromCores(hwthreads) + for _, socket := range sockets { + results = append(results, ScopeQueryResult{ + Metric: metric, + Hostname: hostname, + Aggregate: true, + Type: &CoreString, + TypeIds: IntToStringSlice(topology.Socket[socket]), + Scope: scope, + }) + } + return results, true + } + + // Core -> Node + if nativeScope == schema.MetricScopeCore && scope == schema.MetricScopeNode { + cores, _ := topology.GetCoresFromHWThreads(hwthreads) + results = append(results, ScopeQueryResult{ + Metric: metric, + Hostname: hostname, + Aggregate: true, + Type: &CoreString, + TypeIds: IntToStringSlice(cores), + Scope: scope, + }) + return results, true + } + + // MemoryDomain -> MemoryDomain + if 
nativeScope == schema.MetricScopeMemoryDomain && scope == schema.MetricScopeMemoryDomain { + memDomains, _ := topology.GetMemoryDomainsFromHWThreads(hwthreads) + results = append(results, ScopeQueryResult{ + Metric: metric, + Hostname: hostname, + Aggregate: false, + Type: &MemoryDomainString, + TypeIds: IntToStringSlice(memDomains), + Scope: scope, + }) + return results, true + } + + // MemoryDomain -> Socket + if nativeScope == schema.MetricScopeMemoryDomain && scope == schema.MetricScopeSocket { + memDomains, _ := topology.GetMemoryDomainsFromHWThreads(hwthreads) + socketToDomains, err := topology.GetMemoryDomainsBySocket(memDomains) + if err != nil { + cclog.Errorf("Error mapping memory domains to sockets, return unchanged: %v", err) + // Rare Error Case -> Still Continue -> Return Empty Slice + return results, true + } + + // Create a query for each socket + for _, domains := range socketToDomains { + results = append(results, ScopeQueryResult{ + Metric: metric, + Hostname: hostname, + Aggregate: true, + Type: &MemoryDomainString, + TypeIds: IntToStringSlice(domains), + Scope: scope, + }) + } + return results, true + } + + // MemoryDomain -> Node + if nativeScope == schema.MetricScopeMemoryDomain && scope == schema.MetricScopeNode { + memDomains, _ := topology.GetMemoryDomainsFromHWThreads(hwthreads) + results = append(results, ScopeQueryResult{ + Metric: metric, + Hostname: hostname, + Aggregate: true, + Type: &MemoryDomainString, + TypeIds: IntToStringSlice(memDomains), + Scope: scope, + }) + return results, true + } + + // Socket -> Socket + if nativeScope == schema.MetricScopeSocket && scope == schema.MetricScopeSocket { + sockets, _ := topology.GetSocketsFromHWThreads(hwthreads) + results = append(results, ScopeQueryResult{ + Metric: metric, + Hostname: hostname, + Aggregate: false, + Type: &SocketString, + TypeIds: IntToStringSlice(sockets), + Scope: scope, + }) + return results, true + } + + // Socket -> Node + if nativeScope == schema.MetricScopeSocket 
&& scope == schema.MetricScopeNode { + sockets, _ := topology.GetSocketsFromHWThreads(hwthreads) + results = append(results, ScopeQueryResult{ + Metric: metric, + Hostname: hostname, + Aggregate: true, + Type: &SocketString, + TypeIds: IntToStringSlice(sockets), + Scope: scope, + }) + return results, true + } + + // Node -> Node + if nativeScope == schema.MetricScopeNode && scope == schema.MetricScopeNode { + results = append(results, ScopeQueryResult{ + Metric: metric, + Hostname: hostname, + Scope: scope, + }) + return results, true + } + + // Unhandled Case + return nil, false +} + +// IntToStringSlice converts a slice of integers to a slice of strings. +// Used to convert hardware thread/core/socket IDs from topology (int) to query TypeIds (string). +// Optimized to reuse a byte buffer for string conversion, reducing allocations. +func IntToStringSlice(is []int) []string { + if len(is) == 0 { + return nil + } + + ss := make([]string, len(is)) + buf := make([]byte, 0, 16) // Reusable buffer for integer conversion + for i, x := range is { + buf = strconv.AppendInt(buf[:0], int64(x), 10) + ss[i] = string(buf) + } + return ss +} + +// ExtractTypeID returns the type ID at the given index from a query's TypeIds slice. +// Returns nil if queryType is nil (no type filtering). Logs a warning and returns nil +// if the index is out of range. +func ExtractTypeID(queryType *string, typeIds []string, ndx int, metric, hostname string) *string { + if queryType == nil { + return nil + } + if ndx < len(typeIds) { + id := typeIds[ndx] + return &id + } + cclog.Warnf("TypeIds index out of range: %d with length %d for metric %s on host %s", + ndx, len(typeIds), metric, hostname) + return nil +} + +// IsMetricRemovedForSubCluster checks whether a metric is marked as removed +// for the given subcluster in its per-subcluster configuration. 
+func IsMetricRemovedForSubCluster(mc *schema.MetricConfig, subCluster string) bool { + for _, scConfig := range mc.SubClusters { + if scConfig.Name == subCluster && scConfig.Remove { + return true + } + } + return false +} + +// SanitizeStats replaces NaN values in statistics with 0 to enable JSON marshaling. +// If ANY of avg/min/max is NaN, ALL three are zeroed for consistency. +func SanitizeStats(avg, min, max *schema.Float) { + if avg.IsNaN() || min.IsNaN() || max.IsNaN() { + *avg = schema.Float(0) + *min = schema.Float(0) + *max = schema.Float(0) + } +} diff --git a/pkg/metricstore/scopequery_test.go b/pkg/metricstore/scopequery_test.go new file mode 100644 index 00000000..4cdfca78 --- /dev/null +++ b/pkg/metricstore/scopequery_test.go @@ -0,0 +1,273 @@ +// Copyright (C) NHR@FAU, University Erlangen-Nuremberg. +// All rights reserved. This file is part of cc-backend. +// Use of this source code is governed by a MIT-style +// license that can be found in the LICENSE file. +package metricstore + +import ( + "testing" + + "github.com/ClusterCockpit/cc-lib/v2/schema" +) + +// makeTopology creates a simple 2-socket, 4-core, 8-hwthread topology for testing. 
+// Socket 0: cores 0,1 with hwthreads 0,1,2,3 +// Socket 1: cores 2,3 with hwthreads 4,5,6,7 +// MemoryDomain 0: hwthreads 0,1,2,3 (socket 0) +// MemoryDomain 1: hwthreads 4,5,6,7 (socket 1) +func makeTopology() schema.Topology { + topo := schema.Topology{ + Node: []int{0, 1, 2, 3, 4, 5, 6, 7}, + Socket: [][]int{{0, 1, 2, 3}, {4, 5, 6, 7}}, + MemoryDomain: [][]int{{0, 1, 2, 3}, {4, 5, 6, 7}}, + Core: [][]int{{0, 1}, {2, 3}, {4, 5}, {6, 7}}, + Accelerators: []*schema.Accelerator{ + {ID: "gpu0"}, + {ID: "gpu1"}, + }, + } + return topo +} + +func TestBuildScopeQueries(t *testing.T) { + topo := makeTopology() + topo.InitTopologyMaps() + accIds := topo.GetAcceleratorIDs() + + tests := []struct { + name string + nativeScope schema.MetricScope + requestedScope schema.MetricScope + expectOk bool + expectLen int // expected number of results + expectAgg bool + expectScope schema.MetricScope + }{ + // Same-scope cases + { + name: "HWThread->HWThread", nativeScope: schema.MetricScopeHWThread, + requestedScope: schema.MetricScopeHWThread, expectOk: true, expectLen: 1, + expectAgg: false, expectScope: schema.MetricScopeHWThread, + }, + { + name: "Core->Core", nativeScope: schema.MetricScopeCore, + requestedScope: schema.MetricScopeCore, expectOk: true, expectLen: 1, + expectAgg: false, expectScope: schema.MetricScopeCore, + }, + { + name: "Socket->Socket", nativeScope: schema.MetricScopeSocket, + requestedScope: schema.MetricScopeSocket, expectOk: true, expectLen: 1, + expectAgg: false, expectScope: schema.MetricScopeSocket, + }, + { + name: "MemoryDomain->MemoryDomain", nativeScope: schema.MetricScopeMemoryDomain, + requestedScope: schema.MetricScopeMemoryDomain, expectOk: true, expectLen: 1, + expectAgg: false, expectScope: schema.MetricScopeMemoryDomain, + }, + { + name: "Node->Node", nativeScope: schema.MetricScopeNode, + requestedScope: schema.MetricScopeNode, expectOk: true, expectLen: 1, + expectAgg: false, expectScope: schema.MetricScopeNode, + }, + { + name: 
"Accelerator->Accelerator", nativeScope: schema.MetricScopeAccelerator, + requestedScope: schema.MetricScopeAccelerator, expectOk: true, expectLen: 1, + expectAgg: false, expectScope: schema.MetricScopeAccelerator, + }, + // Aggregation cases + { + name: "HWThread->Core", nativeScope: schema.MetricScopeHWThread, + requestedScope: schema.MetricScopeCore, expectOk: true, expectLen: 4, // 4 cores + expectAgg: true, expectScope: schema.MetricScopeCore, + }, + { + name: "HWThread->Socket", nativeScope: schema.MetricScopeHWThread, + requestedScope: schema.MetricScopeSocket, expectOk: true, expectLen: 2, // 2 sockets + expectAgg: true, expectScope: schema.MetricScopeSocket, + }, + { + name: "HWThread->Node", nativeScope: schema.MetricScopeHWThread, + requestedScope: schema.MetricScopeNode, expectOk: true, expectLen: 1, + expectAgg: true, expectScope: schema.MetricScopeNode, + }, + { + name: "Core->Socket", nativeScope: schema.MetricScopeCore, + requestedScope: schema.MetricScopeSocket, expectOk: true, expectLen: 2, // 2 sockets + expectAgg: true, expectScope: schema.MetricScopeSocket, + }, + { + name: "Core->Node", nativeScope: schema.MetricScopeCore, + requestedScope: schema.MetricScopeNode, expectOk: true, expectLen: 1, + expectAgg: true, expectScope: schema.MetricScopeNode, + }, + { + name: "Socket->Node", nativeScope: schema.MetricScopeSocket, + requestedScope: schema.MetricScopeNode, expectOk: true, expectLen: 1, + expectAgg: true, expectScope: schema.MetricScopeNode, + }, + { + name: "MemoryDomain->Node", nativeScope: schema.MetricScopeMemoryDomain, + requestedScope: schema.MetricScopeNode, expectOk: true, expectLen: 1, + expectAgg: true, expectScope: schema.MetricScopeNode, + }, + { + name: "MemoryDomain->Socket", nativeScope: schema.MetricScopeMemoryDomain, + requestedScope: schema.MetricScopeSocket, expectOk: true, expectLen: 2, // 2 sockets + expectAgg: true, expectScope: schema.MetricScopeSocket, + }, + { + name: "Accelerator->Node", nativeScope: 
schema.MetricScopeAccelerator, + requestedScope: schema.MetricScopeNode, expectOk: true, expectLen: 1, + expectAgg: true, expectScope: schema.MetricScopeNode, + }, + // Expected exception: Accelerator scope requested but non-accelerator scope in between + { + name: "Accelerator->Core (exception)", nativeScope: schema.MetricScopeAccelerator, + requestedScope: schema.MetricScopeCore, expectOk: true, expectLen: 0, + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + results, ok := BuildScopeQueries( + tt.nativeScope, tt.requestedScope, + "test_metric", "node001", + &topo, topo.Node, accIds, + ) + + if ok != tt.expectOk { + t.Fatalf("expected ok=%v, got ok=%v", tt.expectOk, ok) + } + + if len(results) != tt.expectLen { + t.Fatalf("expected %d results, got %d", tt.expectLen, len(results)) + } + + if tt.expectLen > 0 { + for _, r := range results { + if r.Scope != tt.expectScope { + t.Errorf("expected scope %s, got %s", tt.expectScope, r.Scope) + } + if r.Aggregate != tt.expectAgg { + t.Errorf("expected aggregate=%v, got %v", tt.expectAgg, r.Aggregate) + } + if r.Metric != "test_metric" { + t.Errorf("expected metric 'test_metric', got '%s'", r.Metric) + } + if r.Hostname != "node001" { + t.Errorf("expected hostname 'node001', got '%s'", r.Hostname) + } + } + } + }) + } +} + +func TestBuildScopeQueries_UnhandledCase(t *testing.T) { + topo := makeTopology() + topo.InitTopologyMaps() + + // Node native with HWThread requested => scope.Max = Node, but let's try an invalid combination + // Actually all valid combinations are handled. An unhandled case would be something like + // a scope that doesn't exist in the if-chain. Since all real scopes are covered, + // we test with a synthetic unhandled combination by checking the bool return. + // The function should return ok=false for truly unhandled cases. 
+ + // For now, verify all known combinations return ok=true + scopes := []schema.MetricScope{ + schema.MetricScopeHWThread, schema.MetricScopeCore, + schema.MetricScopeSocket, schema.MetricScopeNode, + } + + for _, native := range scopes { + for _, requested := range scopes { + results, ok := BuildScopeQueries( + native, requested, + "m", "h", &topo, topo.Node, nil, + ) + if !ok { + t.Errorf("unexpected unhandled case: native=%s, requested=%s", native, requested) + } + if results == nil { + t.Errorf("results should not be nil for native=%s, requested=%s", native, requested) + } + } + } +} + +func TestIntToStringSlice(t *testing.T) { + tests := []struct { + input []int + expected []string + }{ + {nil, nil}, + {[]int{}, nil}, + {[]int{0}, []string{"0"}}, + {[]int{1, 2, 3}, []string{"1", "2", "3"}}, + {[]int{10, 100, 1000}, []string{"10", "100", "1000"}}, + } + + for _, tt := range tests { + result := IntToStringSlice(tt.input) + if len(result) != len(tt.expected) { + t.Errorf("IntToStringSlice(%v): expected len %d, got %d", tt.input, len(tt.expected), len(result)) + continue + } + for i := range result { + if result[i] != tt.expected[i] { + t.Errorf("IntToStringSlice(%v)[%d]: expected %s, got %s", tt.input, i, tt.expected[i], result[i]) + } + } + } +} + +func TestSanitizeStats(t *testing.T) { + // Test: all valid - should remain unchanged + avg, min, max := schema.Float(1.0), schema.Float(0.5), schema.Float(2.0) + SanitizeStats(&avg, &min, &max) + if avg != 1.0 || min != 0.5 || max != 2.0 { + t.Errorf("SanitizeStats should not change valid values") + } + + // Test: one NaN - all should be zeroed + avg, min, max = schema.Float(1.0), schema.Float(0.5), schema.NaN + SanitizeStats(&avg, &min, &max) + if avg != 0 || min != 0 || max != 0 { + t.Errorf("SanitizeStats should zero all when any is NaN, got avg=%v min=%v max=%v", avg, min, max) + } + + // Test: all NaN + avg, min, max = schema.NaN, schema.NaN, schema.NaN + SanitizeStats(&avg, &min, &max) + if avg != 0 || min != 
0 || max != 0 { + t.Errorf("SanitizeStats should zero all NaN values") + } +} + +func TestNodeToNodeQuery(t *testing.T) { + topo := makeTopology() + topo.InitTopologyMaps() + + results, ok := BuildScopeQueries( + schema.MetricScopeNode, schema.MetricScopeNode, + "cpu_load", "node001", + &topo, topo.Node, nil, + ) + + if !ok { + t.Fatal("expected ok=true for Node->Node") + } + if len(results) != 1 { + t.Fatalf("expected 1 result, got %d", len(results)) + } + r := results[0] + if r.Type != nil { + t.Error("Node->Node should have nil Type") + } + if r.TypeIds != nil { + t.Error("Node->Node should have nil TypeIds") + } + if r.Aggregate { + t.Error("Node->Node should not aggregate") + } +} diff --git a/internal/memorystore/stats.go b/pkg/metricstore/stats.go similarity index 82% rename from internal/memorystore/stats.go rename to pkg/metricstore/stats.go index 91b1f2cc..8f7886a3 100644 --- a/internal/memorystore/stats.go +++ b/pkg/metricstore/stats.go @@ -3,20 +3,21 @@ // Use of this source code is governed by a MIT-style // license that can be found in the LICENSE file. 
-package memorystore +package metricstore import ( "errors" "math" - "github.com/ClusterCockpit/cc-lib/util" + "github.com/ClusterCockpit/cc-lib/v2/schema" + "github.com/ClusterCockpit/cc-lib/v2/util" ) type Stats struct { Samples int - Avg util.Float - Min util.Float - Max util.Float + Avg schema.Float + Min schema.Float + Max schema.Float } func (b *buffer) stats(from, to int64) (Stats, int64, int64, error) { @@ -61,9 +62,9 @@ func (b *buffer) stats(from, to int64) (Stats, int64, int64, error) { return Stats{ Samples: samples, - Avg: util.Float(sum) / util.Float(samples), - Min: util.Float(min), - Max: util.Float(max), + Avg: schema.Float(sum) / schema.Float(samples), + Min: schema.Float(min), + Max: schema.Float(max), }, from, t, nil } @@ -77,11 +78,11 @@ func (m *MemoryStore) Stats(selector util.Selector, metric string, from, to int6 minfo, ok := m.Metrics[metric] if !ok { - return nil, 0, 0, errors.New("unkown metric: " + metric) + return nil, 0, 0, errors.New("unknown metric: " + metric) } n, samples := 0, 0 - avg, min, max := util.Float(0), math.MaxFloat32, -math.MaxFloat32 + avg, min, max := schema.Float(0), math.MaxFloat32, -math.MaxFloat32 err := m.root.findBuffers(selector, minfo.offset, func(b *buffer) error { stats, cfrom, cto, err := b.stats(from, to) if err != nil { @@ -110,7 +111,7 @@ func (m *MemoryStore) Stats(selector util.Selector, metric string, from, to int6 } if minfo.Aggregation == AvgAggregation { - avg /= util.Float(n) + avg /= schema.Float(n) } else if n > 1 && minfo.Aggregation != SumAggregation { return nil, 0, 0, errors.New("invalid aggregation") } @@ -118,7 +119,7 @@ func (m *MemoryStore) Stats(selector util.Selector, metric string, from, to int6 return &Stats{ Samples: samples, Avg: avg, - Min: util.Float(min), - Max: util.Float(max), + Min: schema.Float(min), + Max: schema.Float(max), }, from, to, nil } diff --git a/pkg/metricstore/walCheckpoint.go b/pkg/metricstore/walCheckpoint.go new file mode 100644 index 00000000..07414d98 --- 
/dev/null +++ b/pkg/metricstore/walCheckpoint.go @@ -0,0 +1,797 @@ +// Copyright (C) NHR@FAU, University Erlangen-Nuremberg. +// All rights reserved. This file is part of cc-backend. +// Use of this source code is governed by a MIT-style +// license that can be found in the LICENSE file. + +// Package metricstore provides walCheckpoint.go: WAL-based checkpoint implementation. +// +// This replaces the Avro shadow tree with an append-only Write-Ahead Log (WAL) +// per host, eliminating the extra memory overhead of the AvroStore and providing +// truly continuous (per-write) crash safety. +// +// # Architecture +// +// Metric write (DecodeLine) +// │ +// ├─► WriteToLevel() → main MemoryStore (unchanged) +// │ +// └─► WALMessages channel +// │ +// ▼ +// WALStaging goroutine +// │ +// ▼ +// checkpoints/cluster/host/current.wal (append-only, binary) +// +// Periodic checkpoint (Checkpointing goroutine): +// 1. Write .bin snapshot (column-oriented, from main tree) +// 2. Signal WALStaging to truncate current.wal per host +// +// On restart (FromCheckpoint): +// 1. Load most recent .bin snapshot +// 2. 
Replay current.wal (overwrite-safe: buffer.write handles duplicate timestamps) +// +// # WAL Record Format +// +// [4B magic 0xCC1DA7A1][4B payload_len][payload][4B CRC32] +// +// payload: +// [8B timestamp int64] +// [2B metric_name_len uint16][N metric name bytes] +// [1B selector_count uint8] +// per selector: [1B selector_len uint8][M selector bytes] +// [4B value float32 bits] +// +// # Binary Snapshot Format +// +// [4B magic 0xCC5B0001][8B from int64][8B to int64] +// Level tree (recursive): +// [4B num_metrics uint32] +// per metric: +// [2B name_len uint16][N name bytes] +// [8B frequency int64][8B start int64] +// [4B num_values uint32][num_values × 4B float32] +// [4B num_children uint32] +// per child: [2B name_len uint16][N name bytes] + Level (recursive) +package metricstore + +import ( + "bufio" + "context" + "encoding/binary" + "fmt" + "hash/crc32" + "io" + "math" + "os" + "path" + "strings" + "sync" + "sync/atomic" + + cclog "github.com/ClusterCockpit/cc-lib/v2/ccLogger" + "github.com/ClusterCockpit/cc-lib/v2/schema" +) + +// Magic numbers for binary formats. +const ( + walFileMagic = uint32(0xCC1DA701) // WAL file header magic + walRecordMagic = uint32(0xCC1DA7A1) // WAL record magic + snapFileMagic = uint32(0xCC5B0001) // Binary snapshot magic +) + +// WALMessages is the channel for sending metric writes to the WAL staging goroutine. +// Buffered to allow burst writes without blocking the metric ingestion path. +var WALMessages = make(chan *WALMessage, 4096) + +// walRotateCh is used by the checkpoint goroutine to request WAL file rotation +// (close, delete, reopen) after a binary snapshot has been written. +var walRotateCh = make(chan walRotateReq, 256) + +// WALMessage represents a single metric write to be appended to the WAL. +// Cluster and Node are NOT stored in the WAL record (inferred from file path). 
+type WALMessage struct { + MetricName string + Cluster string + Node string + Selector []string + Value schema.Float + Timestamp int64 +} + +// walRotateReq requests WAL file rotation for a specific host directory. +// The done channel is closed by the WAL goroutine when rotation is complete. +type walRotateReq struct { + hostDir string + done chan struct{} +} + +// walFileState holds an open WAL file handle for one host directory. +type walFileState struct { + f *os.File +} + +// WALStaging starts a background goroutine that receives WALMessage items +// and appends binary WAL records to per-host current.wal files. +// Also handles WAL rotation requests from the checkpoint goroutine. +func WALStaging(wg *sync.WaitGroup, ctx context.Context) { + wg.Go(func() { + if Keys.Checkpoints.FileFormat == "json" { + return + } + + hostFiles := make(map[string]*walFileState) + + defer func() { + for _, ws := range hostFiles { + if ws.f != nil { + ws.f.Close() + } + } + }() + + getOrOpenWAL := func(hostDir string) *os.File { + ws, ok := hostFiles[hostDir] + if ok { + return ws.f + } + + if err := os.MkdirAll(hostDir, CheckpointDirPerms); err != nil { + cclog.Errorf("[METRICSTORE]> WAL: mkdir %s: %v", hostDir, err) + return nil + } + + walPath := path.Join(hostDir, "current.wal") + f, err := os.OpenFile(walPath, os.O_CREATE|os.O_APPEND|os.O_WRONLY, CheckpointFilePerms) + if err != nil { + cclog.Errorf("[METRICSTORE]> WAL: open %s: %v", walPath, err) + return nil + } + + // Write file header magic if file is new (empty). 
+ info, err := f.Stat() + if err == nil && info.Size() == 0 { + var hdr [4]byte + binary.LittleEndian.PutUint32(hdr[:], walFileMagic) + if _, err := f.Write(hdr[:]); err != nil { + cclog.Errorf("[METRICSTORE]> WAL: write header %s: %v", walPath, err) + f.Close() + return nil + } + } + + hostFiles[hostDir] = &walFileState{f: f} + return f + } + + processMsg := func(msg *WALMessage) { + hostDir := path.Join(Keys.Checkpoints.RootDir, msg.Cluster, msg.Node) + f := getOrOpenWAL(hostDir) + if f == nil { + return + } + if err := writeWALRecord(f, msg); err != nil { + cclog.Errorf("[METRICSTORE]> WAL: write record: %v", err) + } + } + + processRotate := func(req walRotateReq) { + ws, ok := hostFiles[req.hostDir] + if ok && ws.f != nil { + ws.f.Close() + walPath := path.Join(req.hostDir, "current.wal") + if err := os.Remove(walPath); err != nil && !os.IsNotExist(err) { + cclog.Errorf("[METRICSTORE]> WAL: remove %s: %v", walPath, err) + } + delete(hostFiles, req.hostDir) + } + close(req.done) + } + + drain := func() { + for { + select { + case msg, ok := <-WALMessages: + if !ok { + return + } + processMsg(msg) + case req := <-walRotateCh: + processRotate(req) + default: + return + } + } + } + + for { + select { + case <-ctx.Done(): + drain() + return + case msg, ok := <-WALMessages: + if !ok { + return + } + processMsg(msg) + case req := <-walRotateCh: + processRotate(req) + } + } + }) +} + +// RotateWALFiles sends rotation requests for the given host directories +// and blocks until all rotations complete. +func RotateWALFiles(hostDirs []string) { + dones := make([]chan struct{}, len(hostDirs)) + for i, dir := range hostDirs { + dones[i] = make(chan struct{}) + walRotateCh <- walRotateReq{hostDir: dir, done: dones[i]} + } + for _, done := range dones { + <-done + } +} + +// RotateWALFilesAfterShutdown removes the current.wal files for the given host +// directories directly; used when the WAL staging goroutine has already exited.
+func RotateWALFilesAfterShutdown(hostDirs []string) { + for _, dir := range hostDirs { + walPath := path.Join(dir, "current.wal") + if err := os.Remove(walPath); err != nil && !os.IsNotExist(err) { + cclog.Errorf("[METRICSTORE]> WAL: remove %s: %v", walPath, err) + } + } +} + +// buildWALPayload encodes a WALMessage into a binary payload (without magic/length/CRC). +func buildWALPayload(msg *WALMessage) []byte { + size := 8 + 2 + len(msg.MetricName) + 1 + 4 + for _, s := range msg.Selector { + size += 1 + len(s) + } + + buf := make([]byte, 0, size) + + // Timestamp (8 bytes, little-endian int64) + var ts [8]byte + binary.LittleEndian.PutUint64(ts[:], uint64(msg.Timestamp)) + buf = append(buf, ts[:]...) + + // Metric name (2-byte length prefix + bytes) + var mLen [2]byte + binary.LittleEndian.PutUint16(mLen[:], uint16(len(msg.MetricName))) + buf = append(buf, mLen[:]...) + buf = append(buf, msg.MetricName...) + + // Selector count (1 byte) + buf = append(buf, byte(len(msg.Selector))) + + // Selectors (1-byte length prefix + bytes each) + for _, sel := range msg.Selector { + buf = append(buf, byte(len(sel))) + buf = append(buf, sel...) + } + + // Value (4 bytes, float32 bit representation) + var val [4]byte + binary.LittleEndian.PutUint32(val[:], math.Float32bits(float32(msg.Value))) + buf = append(buf, val[:]...) + + return buf +} + +// writeWALRecord appends a binary WAL record to the file. +// Format: [4B magic][4B payload_len][payload][4B CRC32] +func writeWALRecord(f *os.File, msg *WALMessage) error { + payload := buildWALPayload(msg) + crc := crc32.ChecksumIEEE(payload) + + record := make([]byte, 0, 4+4+len(payload)+4) + + var magic [4]byte + binary.LittleEndian.PutUint32(magic[:], walRecordMagic) + record = append(record, magic[:]...) + + var pLen [4]byte + binary.LittleEndian.PutUint32(pLen[:], uint32(len(payload))) + record = append(record, pLen[:]...) + + record = append(record, payload...) 
+ + var crcBytes [4]byte + binary.LittleEndian.PutUint32(crcBytes[:], crc) + record = append(record, crcBytes[:]...) + + _, err := f.Write(record) + return err +} + +// readWALRecord reads one WAL record from the reader. +// Returns (nil, nil) on clean EOF. Returns error on data corruption. +// A CRC mismatch indicates a truncated trailing record (expected on crash). +func readWALRecord(r io.Reader) (*WALMessage, error) { + var magic uint32 + if err := binary.Read(r, binary.LittleEndian, &magic); err != nil { + if err == io.EOF { + return nil, nil // Clean EOF + } + return nil, fmt.Errorf("read record magic: %w", err) + } + + if magic != walRecordMagic { + return nil, fmt.Errorf("invalid record magic 0x%08X (expected 0x%08X)", magic, walRecordMagic) + } + + var payloadLen uint32 + if err := binary.Read(r, binary.LittleEndian, &payloadLen); err != nil { + return nil, fmt.Errorf("read payload length: %w", err) + } + + if payloadLen > 1<<20 { // 1 MB sanity limit + return nil, fmt.Errorf("record payload too large: %d bytes", payloadLen) + } + + payload := make([]byte, payloadLen) + if _, err := io.ReadFull(r, payload); err != nil { + return nil, fmt.Errorf("read payload: %w", err) + } + + var storedCRC uint32 + if err := binary.Read(r, binary.LittleEndian, &storedCRC); err != nil { + return nil, fmt.Errorf("read CRC: %w", err) + } + + if crc32.ChecksumIEEE(payload) != storedCRC { + return nil, fmt.Errorf("CRC mismatch (truncated write or corruption)") + } + + return parseWALPayload(payload) +} + +// parseWALPayload decodes a binary payload into a WALMessage. 
+func parseWALPayload(payload []byte) (*WALMessage, error) { + if len(payload) < 8+2+1+4 { + return nil, fmt.Errorf("payload too short: %d bytes", len(payload)) + } + + offset := 0 + + // Timestamp (8 bytes) + ts := int64(binary.LittleEndian.Uint64(payload[offset : offset+8])) + offset += 8 + + // Metric name (2-byte length + bytes) + if offset+2 > len(payload) { + return nil, fmt.Errorf("metric name length overflows payload") + } + mLen := int(binary.LittleEndian.Uint16(payload[offset : offset+2])) + offset += 2 + + if offset+mLen > len(payload) { + return nil, fmt.Errorf("metric name overflows payload") + } + metricName := string(payload[offset : offset+mLen]) + offset += mLen + + // Selector count (1 byte) + if offset >= len(payload) { + return nil, fmt.Errorf("selector count overflows payload") + } + selCount := int(payload[offset]) + offset++ + + selectors := make([]string, selCount) + for i := range selCount { + if offset >= len(payload) { + return nil, fmt.Errorf("selector[%d] length overflows payload", i) + } + sLen := int(payload[offset]) + offset++ + + if offset+sLen > len(payload) { + return nil, fmt.Errorf("selector[%d] data overflows payload", i) + } + selectors[i] = string(payload[offset : offset+sLen]) + offset += sLen + } + + // Value (4 bytes, float32 bits) + if offset+4 > len(payload) { + return nil, fmt.Errorf("value overflows payload") + } + bits := binary.LittleEndian.Uint32(payload[offset : offset+4]) + value := schema.Float(math.Float32frombits(bits)) + + return &WALMessage{ + MetricName: metricName, + Timestamp: ts, + Selector: selectors, + Value: value, + }, nil +} + +// loadWALFile reads a WAL file and replays all valid records into the Level tree. +// l is the host-level node. Corrupt or partial trailing records are silently skipped +// (expected on crash). Records older than 'from' are skipped. +func (l *Level) loadWALFile(m *MemoryStore, f *os.File, from int64) error { + br := bufio.NewReader(f) + + // Verify file header magic. 
+ var fileMagic uint32 + if err := binary.Read(br, binary.LittleEndian, &fileMagic); err != nil { + if err == io.EOF { + return nil // Empty file, no data + } + return fmt.Errorf("[METRICSTORE]> WAL: read file header: %w", err) + } + + if fileMagic != walFileMagic { + return fmt.Errorf("[METRICSTORE]> WAL: invalid file magic 0x%08X (expected 0x%08X)", fileMagic, walFileMagic) + } + + // Cache level lookups to avoid repeated tree traversal. + lvlCache := make(map[string]*Level) + + for { + msg, err := readWALRecord(br) + if err != nil { + // Truncated trailing record is expected after a crash; stop replaying. + cclog.Debugf("[METRICSTORE]> WAL: stopping replay at corrupted/partial record: %v", err) + break + } + if msg == nil { + break // Clean EOF + } + + if msg.Timestamp < from { + continue // Older than retention window + } + + minfo, ok := m.Metrics[msg.MetricName] + if !ok { + continue // Unknown metric (config may have changed) + } + + // Cache key is the null-separated selector path. + cacheKey := joinSelector(msg.Selector) + lvl, ok := lvlCache[cacheKey] + if !ok { + lvl = l.findLevelOrCreate(msg.Selector, len(m.Metrics)) + lvlCache[cacheKey] = lvl + } + + // Write directly to the buffer, same as WriteToLevel but without the + // global level lookup (we already have the right level). + lvl.lock.Lock() + b := lvl.metrics[minfo.offset] + if b == nil { + b = newBuffer(msg.Timestamp, minfo.Frequency) + lvl.metrics[minfo.offset] = b + } + nb, writeErr := b.write(msg.Timestamp, msg.Value) + if writeErr == nil && b != nb { + lvl.metrics[minfo.offset] = nb + } + // Ignore write errors for timestamps before buffer start (can happen when + // replaying WAL entries that predate a loaded snapshot's start time). + lvl.lock.Unlock() + } + + return nil +} + +// joinSelector builds a cache key from a selector slice using null bytes as separators. 
+func joinSelector(sel []string) string { + if len(sel) == 0 { + return "" + } + var result strings.Builder + result.WriteString(sel[0]) + for i := 1; i < len(sel); i++ { + result.WriteString("\x00" + sel[i]) + } + return result.String() +} + +// ToCheckpointWAL writes binary snapshot files for all hosts in parallel. +// Returns the number of files written, the list of host directories that were +// successfully checkpointed (for WAL rotation), and any errors. +func (m *MemoryStore) ToCheckpointWAL(dir string, from, to int64) (int, []string, error) { + // Collect all cluster/host pairs. + m.root.lock.RLock() + totalHosts := 0 + for _, l1 := range m.root.children { + l1.lock.RLock() + totalHosts += len(l1.children) + l1.lock.RUnlock() + } + m.root.lock.RUnlock() + + levels := make([]*Level, 0, totalHosts) + selectors := make([][]string, 0, totalHosts) + + m.root.lock.RLock() + for sel1, l1 := range m.root.children { + l1.lock.RLock() + for sel2, l2 := range l1.children { + levels = append(levels, l2) + selectors = append(selectors, []string{sel1, sel2}) + } + l1.lock.RUnlock() + } + m.root.lock.RUnlock() + + type workItem struct { + level *Level + hostDir string + selector []string + } + + n, errs := int32(0), int32(0) + var successDirs []string + var successMu sync.Mutex + + var wg sync.WaitGroup + wg.Add(Keys.NumWorkers) + work := make(chan workItem, Keys.NumWorkers*2) + + for range Keys.NumWorkers { + go func() { + defer wg.Done() + for wi := range work { + err := wi.level.toCheckpointBinary(wi.hostDir, from, to, m) + if err != nil { + if err == ErrNoNewArchiveData { + continue + } + cclog.Errorf("[METRICSTORE]> binary checkpoint error for %s: %v", wi.hostDir, err) + atomic.AddInt32(&errs, 1) + } else { + atomic.AddInt32(&n, 1) + successMu.Lock() + successDirs = append(successDirs, wi.hostDir) + successMu.Unlock() + } + } + }() + } + + for i := range levels { + hostDir := path.Join(dir, path.Join(selectors[i]...)) + work <- workItem{ + level: levels[i], + 
hostDir: hostDir, + selector: selectors[i], + } + } + close(work) + wg.Wait() + + if errs > 0 { + return int(n), successDirs, fmt.Errorf("[METRICSTORE]> %d errors during binary checkpoint (%d successes)", errs, n) + } + return int(n), successDirs, nil +} + +// toCheckpointBinary writes a binary snapshot file for a single host-level node. +// Uses atomic rename (write to .tmp then rename) to avoid partial reads on crash. +func (l *Level) toCheckpointBinary(dir string, from, to int64, m *MemoryStore) error { + cf, err := l.toCheckpointFile(from, to, m) + if err != nil { + return err + } + if cf == nil { + return ErrNoNewArchiveData + } + + if err := os.MkdirAll(dir, CheckpointDirPerms); err != nil { + return fmt.Errorf("mkdir %s: %w", dir, err) + } + + // Write to a temp file first, then rename (atomic on POSIX). + tmpPath := path.Join(dir, fmt.Sprintf("%d.bin.tmp", from)) + finalPath := path.Join(dir, fmt.Sprintf("%d.bin", from)) + + f, err := os.OpenFile(tmpPath, os.O_CREATE|os.O_WRONLY|os.O_TRUNC, CheckpointFilePerms) + if err != nil { + return fmt.Errorf("open binary snapshot %s: %w", tmpPath, err) + } + + bw := bufio.NewWriter(f) + if err := writeBinarySnapshotFile(bw, cf); err != nil { + f.Close() + os.Remove(tmpPath) + return fmt.Errorf("write binary snapshot: %w", err) + } + if err := bw.Flush(); err != nil { + f.Close() + os.Remove(tmpPath) + return err + } + f.Close() + + return os.Rename(tmpPath, finalPath) +} + +// writeBinarySnapshotFile writes the binary snapshot file header and level tree. +func writeBinarySnapshotFile(w io.Writer, cf *CheckpointFile) error { + if err := binary.Write(w, binary.LittleEndian, snapFileMagic); err != nil { + return err + } + if err := binary.Write(w, binary.LittleEndian, cf.From); err != nil { + return err + } + if err := binary.Write(w, binary.LittleEndian, cf.To); err != nil { + return err + } + return writeBinaryLevel(w, cf) +} + +// writeBinaryLevel recursively writes a CheckpointFile level in binary format. 
+func writeBinaryLevel(w io.Writer, cf *CheckpointFile) error { + if err := binary.Write(w, binary.LittleEndian, uint32(len(cf.Metrics))); err != nil { + return err + } + + for name, metric := range cf.Metrics { + if err := writeString16(w, name); err != nil { + return err + } + if err := binary.Write(w, binary.LittleEndian, metric.Frequency); err != nil { + return err + } + if err := binary.Write(w, binary.LittleEndian, metric.Start); err != nil { + return err + } + if err := binary.Write(w, binary.LittleEndian, uint32(len(metric.Data))); err != nil { + return err + } + for _, v := range metric.Data { + if err := binary.Write(w, binary.LittleEndian, math.Float32bits(float32(v))); err != nil { + return err + } + } + } + + if err := binary.Write(w, binary.LittleEndian, uint32(len(cf.Children))); err != nil { + return err + } + + for name, child := range cf.Children { + if err := writeString16(w, name); err != nil { + return err + } + if err := writeBinaryLevel(w, child); err != nil { + return err + } + } + + return nil +} + +// writeString16 writes a 2-byte length-prefixed string to w. +func writeString16(w io.Writer, s string) error { + if err := binary.Write(w, binary.LittleEndian, uint16(len(s))); err != nil { + return err + } + _, err := io.WriteString(w, s) + return err +} + +// loadBinaryFile reads a binary snapshot file and loads data into the Level tree. +// The retention check (from) is applied to the file's 'to' timestamp. 
+func (l *Level) loadBinaryFile(m *MemoryStore, f *os.File, from int64) error { + br := bufio.NewReader(f) + + var magic uint32 + if err := binary.Read(br, binary.LittleEndian, &magic); err != nil { + return fmt.Errorf("[METRICSTORE]> binary snapshot: read magic: %w", err) + } + if magic != snapFileMagic { + return fmt.Errorf("[METRICSTORE]> binary snapshot: invalid magic 0x%08X (expected 0x%08X)", magic, snapFileMagic) + } + + var fileFrom, fileTo int64 + if err := binary.Read(br, binary.LittleEndian, &fileFrom); err != nil { + return fmt.Errorf("[METRICSTORE]> binary snapshot: read from: %w", err) + } + if err := binary.Read(br, binary.LittleEndian, &fileTo); err != nil { + return fmt.Errorf("[METRICSTORE]> binary snapshot: read to: %w", err) + } + + if fileTo != 0 && fileTo < from { + return nil // File is older than retention window, skip it + } + + cf, err := readBinaryLevel(br) + if err != nil { + return fmt.Errorf("[METRICSTORE]> binary snapshot: read level tree: %w", err) + } + cf.From = fileFrom + cf.To = fileTo + + return l.loadFile(cf, m) +} + +// readBinaryLevel recursively reads a level from the binary snapshot format. 
+func readBinaryLevel(r io.Reader) (*CheckpointFile, error) { + cf := &CheckpointFile{ + Metrics: make(map[string]*CheckpointMetrics), + Children: make(map[string]*CheckpointFile), + } + + var numMetrics uint32 + if err := binary.Read(r, binary.LittleEndian, &numMetrics); err != nil { + return nil, fmt.Errorf("read num_metrics: %w", err) + } + + for range numMetrics { + name, err := readString16(r) + if err != nil { + return nil, fmt.Errorf("read metric name: %w", err) + } + + var freq, start int64 + if err := binary.Read(r, binary.LittleEndian, &freq); err != nil { + return nil, fmt.Errorf("read frequency for %s: %w", name, err) + } + if err := binary.Read(r, binary.LittleEndian, &start); err != nil { + return nil, fmt.Errorf("read start for %s: %w", name, err) + } + + var numValues uint32 + if err := binary.Read(r, binary.LittleEndian, &numValues); err != nil { + return nil, fmt.Errorf("read num_values for %s: %w", name, err) + } + + data := make([]schema.Float, numValues) + for i := range numValues { + var bits uint32 + if err := binary.Read(r, binary.LittleEndian, &bits); err != nil { + return nil, fmt.Errorf("read value[%d] for %s: %w", i, name, err) + } + data[i] = schema.Float(math.Float32frombits(bits)) + } + + cf.Metrics[name] = &CheckpointMetrics{ + Frequency: freq, + Start: start, + Data: data, + } + } + + var numChildren uint32 + if err := binary.Read(r, binary.LittleEndian, &numChildren); err != nil { + return nil, fmt.Errorf("read num_children: %w", err) + } + + for range numChildren { + childName, err := readString16(r) + if err != nil { + return nil, fmt.Errorf("read child name: %w", err) + } + + child, err := readBinaryLevel(r) + if err != nil { + return nil, fmt.Errorf("read child %s: %w", childName, err) + } + cf.Children[childName] = child + } + + return cf, nil +} + +// readString16 reads a 2-byte length-prefixed string from r. 
+func readString16(r io.Reader) (string, error) { + var sLen uint16 + if err := binary.Read(r, binary.LittleEndian, &sLen); err != nil { + return "", err + } + buf := make([]byte, sLen) + if _, err := io.ReadFull(r, buf); err != nil { + return "", err + } + return string(buf), nil +} diff --git a/startDemo.sh b/startDemo.sh index 10ba7f0c..108c95f5 100755 --- a/startDemo.sh +++ b/startDemo.sh @@ -1,10 +1,8 @@ #!/bin/sh -# rm -rf var - if [ -d './var' ]; then echo 'Directory ./var already exists! Skipping initialization.' - ./cc-backend -server -dev + ./cc-backend -server -dev -loglevel info else make ./cc-backend --init @@ -15,5 +13,6 @@ else rm ./job-archive-demo.tar ./cc-backend -dev -init-db -add-user demo:admin,api:demo - ./cc-backend -server -dev + ./cc-backend -server -dev -loglevel info fi + diff --git a/tools.go b/tools.go deleted file mode 100644 index 950056c4..00000000 --- a/tools.go +++ /dev/null @@ -1,9 +0,0 @@ -//go:build tools -// +build tools - -package tools - -import ( - _ "github.com/99designs/gqlgen" - _ "github.com/swaggo/swag/cmd/swag" -) diff --git a/tools/archive-manager/README.md b/tools/archive-manager/README.md new file mode 100644 index 00000000..c006a63e --- /dev/null +++ b/tools/archive-manager/README.md @@ -0,0 +1,148 @@ +# Archive Manager + +## Overview + +The `archive-manager` tool manages ClusterCockpit job archives. It supports inspecting archives, validating jobs, removing jobs by date range, importing jobs between archive backends, and converting archives between JSON and Parquet formats. 
+ +## Features + +- **Archive Info**: Display statistics about an existing job archive +- **Validation**: Validate job archives against the JSON schema +- **Cleanup**: Remove jobs by date range +- **Import**: Copy jobs between archive backends (file, S3, SQLite) with parallel processing +- **Convert**: Convert archives between JSON and Parquet formats (both directions) +- **Progress Reporting**: Real-time progress display with ETA and throughput metrics +- **Graceful Interruption**: CTRL-C stops processing after finishing current jobs + +## Usage + +### Build + +```bash +go build ./tools/archive-manager/ +``` + +### Archive Info + +Display statistics about a job archive: + +```bash +./archive-manager -s ./var/job-archive +``` + +### Validate Archive + +```bash +./archive-manager -s ./var/job-archive --validate --config ./config.json +``` + +### Remove Jobs by Date + +```bash +# Remove jobs started before a date +./archive-manager -s ./var/job-archive --remove-before 2023-Jan-01 --config ./config.json + +# Remove jobs started after a date +./archive-manager -s ./var/job-archive --remove-after 2024-Dec-31 --config ./config.json +``` + +### Import Between Backends + +Import jobs from one archive backend to another (e.g., file to S3, file to SQLite): + +```bash +./archive-manager --import \ + --src-config '{"kind":"file","path":"./var/job-archive"}' \ + --dst-config '{"kind":"s3","endpoint":"https://s3.example.com","bucket":"archive","access-key":"...","secret-key":"..."}' +``` + +### Convert JSON to Parquet + +Convert a JSON job archive to Parquet format: + +```bash +./archive-manager --convert --format parquet \ + --src-config '{"kind":"file","path":"./var/job-archive"}' \ + --dst-config '{"kind":"file","path":"./var/parquet-archive"}' +``` + +The source (`--src-config`) is a standard archive backend config (file, S3, or SQLite). The destination (`--dst-config`) specifies where to write parquet files. 
+ +### Convert Parquet to JSON + +Convert a Parquet archive back to JSON format: + +```bash +./archive-manager --convert --format json \ + --src-config '{"kind":"file","path":"./var/parquet-archive"}' \ + --dst-config '{"kind":"file","path":"./var/json-archive"}' +``` + +The source (`--src-config`) points to a directory or S3 bucket containing parquet files organized by cluster. The destination (`--dst-config`) is a standard archive backend config. + +### S3 Source/Destination Example + +Both conversion directions support S3: + +```bash +# JSON (S3) -> Parquet (local) +./archive-manager --convert --format parquet \ + --src-config '{"kind":"s3","endpoint":"https://s3.example.com","bucket":"json-archive","accessKey":"...","secretKey":"..."}' \ + --dst-config '{"kind":"file","path":"./var/parquet-archive"}' + +# Parquet (local) -> JSON (S3) +./archive-manager --convert --format json \ + --src-config '{"kind":"file","path":"./var/parquet-archive"}' \ + --dst-config '{"kind":"s3","endpoint":"https://s3.example.com","bucket":"json-archive","access-key":"...","secret-key":"..."}' +``` + +## Command-Line Options + +| Flag | Default | Description | +|------|---------|-------------| +| `-s` | `./var/job-archive` | Source job archive path (for info/validate/remove modes) | +| `--config` | `./config.json` | Path to config.json | +| `--loglevel` | `info` | Logging level: debug, info, warn, err, fatal, crit | +| `--logdate` | `false` | Add timestamps to log messages | +| `--validate` | `false` | Validate archive against JSON schema | +| `--remove-before` | | Remove jobs started before date (Format: 2006-Jan-02) | +| `--remove-after` | | Remove jobs started after date (Format: 2006-Jan-02) | +| `--import` | `false` | Import jobs between archive backends | +| `--convert` | `false` | Convert archive between JSON and Parquet formats | +| `--format` | `json` | Output format for conversion: `json` or `parquet` | +| `--max-file-size` | `512` | Max parquet file size in MB (only for 
parquet output) | +| `--src-config` | | Source config JSON (required for import/convert) | +| `--dst-config` | | Destination config JSON (required for import/convert) | + +## Parquet Archive Layout + +When converting to Parquet, the output is organized by cluster: + +``` +parquet-archive/ + clusterA/ + cluster.json + cc-archive-2025-01-20-001.parquet + cc-archive-2025-01-20-002.parquet + clusterB/ + cluster.json + cc-archive-2025-01-20-001.parquet +``` + +Each parquet file contains job metadata and gzip-compressed metric data. The `cluster.json` file preserves the cluster configuration from the source archive. + +## Round-Trip Conversion + +Archives can be converted from JSON to Parquet and back without data loss: + +```bash +# Original JSON archive +./archive-manager --convert --format parquet \ + --src-config '{"kind":"file","path":"./var/job-archive"}' \ + --dst-config '{"kind":"file","path":"./var/parquet-archive"}' + +# Convert back to JSON +./archive-manager --convert --format json \ + --src-config '{"kind":"file","path":"./var/parquet-archive"}' \ + --dst-config '{"kind":"file","path":"./var/json-archive"}' +``` diff --git a/tools/archive-manager/import_test.go b/tools/archive-manager/import_test.go index 02288285..db8940c2 100644 --- a/tools/archive-manager/import_test.go +++ b/tools/archive-manager/import_test.go @@ -12,8 +12,8 @@ import ( "testing" "github.com/ClusterCockpit/cc-backend/pkg/archive" - "github.com/ClusterCockpit/cc-lib/schema" - "github.com/ClusterCockpit/cc-lib/util" + "github.com/ClusterCockpit/cc-lib/v2/schema" + "github.com/ClusterCockpit/cc-lib/v2/util" ) // TestImportFileToSqlite tests importing jobs from file backend to SQLite backend @@ -41,14 +41,14 @@ func TestImportFileToSqlite(t *testing.T) { } // Initialize destination backend (sqlite) - dstConfig := fmt.Sprintf(`{"kind":"sqlite","dbPath":"%s"}`, dstDb) + dstConfig := fmt.Sprintf(`{"kind":"sqlite","db-path":"%s"}`, dstDb) dstBackend, err := 
archive.InitBackend(json.RawMessage(dstConfig)) if err != nil { t.Fatalf("Failed to initialize destination backend: %s", err.Error()) } // Perform import - imported, failed, err := importArchive(srcBackend, dstBackend) + imported, failed, err := importArchive(srcBackend, dstBackend, srcConfig) if err != nil { t.Errorf("Import failed: %s", err.Error()) } @@ -111,13 +111,13 @@ func TestImportFileToFile(t *testing.T) { } // Create destination archive directory - if err := os.MkdirAll(dstArchive, 0755); err != nil { + if err := os.MkdirAll(dstArchive, 0o755); err != nil { t.Fatalf("Failed to create destination directory: %s", err.Error()) } // Write version file versionFile := filepath.Join(dstArchive, "version.txt") - if err := os.WriteFile(versionFile, []byte("3"), 0644); err != nil { + if err := os.WriteFile(versionFile, []byte("3"), 0o644); err != nil { t.Fatalf("Failed to write version file: %s", err.Error()) } @@ -136,7 +136,7 @@ func TestImportFileToFile(t *testing.T) { } // Perform import - imported, failed, err := importArchive(srcBackend, dstBackend) + imported, failed, err := importArchive(srcBackend, dstBackend, srcConfig) if err != nil { t.Errorf("Import failed: %s", err.Error()) } @@ -176,14 +176,14 @@ func TestImportDataIntegrity(t *testing.T) { t.Fatalf("Failed to initialize source backend: %s", err.Error()) } - dstConfig := fmt.Sprintf(`{"kind":"sqlite","dbPath":"%s"}`, dstDb) + dstConfig := fmt.Sprintf(`{"kind":"sqlite","db-path":"%s"}`, dstDb) dstBackend, err := archive.InitBackend(json.RawMessage(dstConfig)) if err != nil { t.Fatalf("Failed to initialize destination backend: %s", err.Error()) } // Perform import - _, _, err = importArchive(srcBackend, dstBackend) + _, _, err = importArchive(srcBackend, dstBackend, srcConfig) if err != nil { t.Errorf("Import failed: %s", err.Error()) } @@ -253,13 +253,13 @@ func TestImportEmptyArchive(t *testing.T) { dstDb := filepath.Join(tmpdir, "dst-archive.db") // Create empty source archive - if err := 
os.MkdirAll(srcArchive, 0755); err != nil { + if err := os.MkdirAll(srcArchive, 0o755); err != nil { t.Fatalf("Failed to create source directory: %s", err.Error()) } // Write version file versionFile := filepath.Join(srcArchive, "version.txt") - if err := os.WriteFile(versionFile, []byte("3"), 0644); err != nil { + if err := os.WriteFile(versionFile, []byte("3"), 0o644); err != nil { t.Fatalf("Failed to write version file: %s", err.Error()) } @@ -270,14 +270,14 @@ func TestImportEmptyArchive(t *testing.T) { t.Fatalf("Failed to initialize source backend: %s", err.Error()) } - dstConfig := fmt.Sprintf(`{"kind":"sqlite","dbPath":"%s"}`, dstDb) + dstConfig := fmt.Sprintf(`{"kind":"sqlite","db-path":"%s"}`, dstDb) dstBackend, err := archive.InitBackend(json.RawMessage(dstConfig)) if err != nil { t.Fatalf("Failed to initialize destination backend: %s", err.Error()) } // Perform import - imported, failed, err := importArchive(srcBackend, dstBackend) + imported, failed, err := importArchive(srcBackend, dstBackend, srcConfig) if err != nil { t.Errorf("Import from empty archive should not fail: %s", err.Error()) } @@ -314,20 +314,20 @@ func TestImportDuplicateJobs(t *testing.T) { t.Fatalf("Failed to initialize source backend: %s", err.Error()) } - dstConfig := fmt.Sprintf(`{"kind":"sqlite","dbPath":"%s"}`, dstDb) + dstConfig := fmt.Sprintf(`{"kind":"sqlite","db-path":"%s"}`, dstDb) dstBackend, err := archive.InitBackend(json.RawMessage(dstConfig)) if err != nil { t.Fatalf("Failed to initialize destination backend: %s", err.Error()) } // First import - imported1, _, err := importArchive(srcBackend, dstBackend) + imported1, _, err := importArchive(srcBackend, dstBackend, srcConfig) if err != nil { t.Fatalf("First import failed: %s", err.Error()) } // Second import (should skip all jobs) - imported2, _, err := importArchive(srcBackend, dstBackend) + imported2, _, err := importArchive(srcBackend, dstBackend, srcConfig) if err != nil { t.Errorf("Second import failed: %s", 
err.Error()) } @@ -366,7 +366,7 @@ func TestImportToEmptyFileDestination(t *testing.T) { util.CopyDir(testDataPath, srcArchive) // Setup empty destination directory - os.MkdirAll(dstArchive, 0755) + os.MkdirAll(dstArchive, 0o755) // NOTE: NOT writing version.txt here! // Initialize source @@ -384,7 +384,7 @@ func TestImportToEmptyFileDestination(t *testing.T) { } // Perform import - imported, _, err := importArchive(srcBackend, dstBackend) + imported, _, err := importArchive(srcBackend, dstBackend, srcConfig) if err != nil { t.Errorf("Import failed: %v", err) } diff --git a/tools/archive-manager/main.go b/tools/archive-manager/main.go index 30aa9088..4a9094c0 100644 --- a/tools/archive-manager/main.go +++ b/tools/archive-manager/main.go @@ -5,18 +5,27 @@ package main import ( + "context" "encoding/json" "flag" "fmt" + "io/fs" "os" + "os/exec" + "os/signal" + "path/filepath" + "strconv" + "strings" "sync" "sync/atomic" + "syscall" "time" "github.com/ClusterCockpit/cc-backend/internal/config" "github.com/ClusterCockpit/cc-backend/pkg/archive" - ccconf "github.com/ClusterCockpit/cc-lib/ccConfig" - cclog "github.com/ClusterCockpit/cc-lib/ccLogger" + pqarchive "github.com/ClusterCockpit/cc-backend/pkg/archive/parquet" + ccconf "github.com/ClusterCockpit/cc-lib/v2/ccConfig" + cclog "github.com/ClusterCockpit/cc-lib/v2/ccLogger" ) func parseDate(in string) int64 { @@ -33,80 +42,288 @@ func parseDate(in string) int64 { return 0 } +// parseArchivePath extracts the path from the source config JSON. +func parseArchivePath(srcConfig string) (string, error) { + var config struct { + Kind string `json:"kind"` + Path string `json:"path"` + } + if err := json.Unmarshal([]byte(srcConfig), &config); err != nil { + return "", fmt.Errorf("failed to parse source config: %w", err) + } + + if config.Path == "" { + return "", fmt.Errorf("no path found in source config") + } + + return config.Path, nil +} + +// countJobsNative counts jobs using native Go filepath.WalkDir. 
+// This is used as a fallback when fd/fdfind is not available. +func countJobsNative(archivePath string) (int, error) { + count := 0 + err := filepath.WalkDir(archivePath, func(path string, d fs.DirEntry, err error) error { + if err != nil { + return nil // Skip directories we can't access + } + if !d.IsDir() && d.Name() == "meta.json" { + count++ + } + return nil + }) + if err != nil { + return 0, fmt.Errorf("failed to walk directory: %w", err) + } + + return count, nil +} + +// countJobsWithFd counts jobs using the external fd command. +func countJobsWithFd(fdPath, archivePath string) (int, error) { + fdCmd := exec.Command(fdPath, "meta.json", archivePath) + wcCmd := exec.Command("wc", "-l") + + pipe, err := fdCmd.StdoutPipe() + if err != nil { + return 0, fmt.Errorf("failed to create pipe: %w", err) + } + wcCmd.Stdin = pipe + + if err := fdCmd.Start(); err != nil { + return 0, fmt.Errorf("failed to start fd command: %w", err) + } + + output, err := wcCmd.Output() + if err != nil { + return 0, fmt.Errorf("failed to run wc command: %w", err) + } + + if err := fdCmd.Wait(); err != nil { + return 0, fmt.Errorf("fd command failed: %w", err) + } + + countStr := strings.TrimSpace(string(output)) + count, err := strconv.Atoi(countStr) + if err != nil { + return 0, fmt.Errorf("failed to parse count from wc output '%s': %w", countStr, err) + } + + return count, nil +} + +// countJobs counts the total number of jobs in the source archive. +// It tries to use external fd/fdfind command for speed, falling back to +// native Go filepath.WalkDir if neither is available. +// The srcConfig parameter should be the JSON configuration string containing the archive path. 
+func countJobs(srcConfig string) (int, error) { + archivePath, err := parseArchivePath(srcConfig) + if err != nil { + return 0, err + } + + // Try fd first (common name) + if fdPath, err := exec.LookPath("fd"); err == nil { + return countJobsWithFd(fdPath, archivePath) + } + + // Try fdfind (Debian/Ubuntu package name) + if fdPath, err := exec.LookPath("fdfind"); err == nil { + return countJobsWithFd(fdPath, archivePath) + } + + // Fall back to native Go implementation + cclog.Debug("fd/fdfind not found, using native Go file walker") + return countJobsNative(archivePath) +} + +// formatDuration formats a duration as a human-readable string. +func formatDuration(d time.Duration) string { + if d < time.Minute { + return fmt.Sprintf("%ds", int(d.Seconds())) + } else if d < time.Hour { + return fmt.Sprintf("%dm%ds", int(d.Minutes()), int(d.Seconds())%60) + } + return fmt.Sprintf("%dh%dm", int(d.Hours()), int(d.Minutes())%60) +} + +// progressMeter displays import progress to the terminal. +type progressMeter struct { + total int + processed int32 + imported int32 + skipped int32 + failed int32 + startTime time.Time + done chan struct{} +} + +func newProgressMeter(total int) *progressMeter { + return &progressMeter{ + total: total, + startTime: time.Now(), + done: make(chan struct{}), + } +} + +func (p *progressMeter) start() { + go func() { + ticker := time.NewTicker(500 * time.Millisecond) + defer ticker.Stop() + + for { + select { + case <-ticker.C: + p.render() + case <-p.done: + p.render() + fmt.Println() + return + } + } + }() +} + +func (p *progressMeter) render() { + processed := atomic.LoadInt32(&p.processed) + imported := atomic.LoadInt32(&p.imported) + skipped := atomic.LoadInt32(&p.skipped) + failed := atomic.LoadInt32(&p.failed) + + elapsed := time.Since(p.startTime) + percent := float64(processed) / float64(p.total) * 100 + if p.total == 0 { + percent = 0 + } + + var eta string + var throughput float64 + if processed > 0 { + throughput = 
float64(processed) / elapsed.Seconds() + remaining := float64(p.total-int(processed)) / throughput + eta = formatDuration(time.Duration(remaining) * time.Second) + } else { + eta = "calculating..." + } + + barWidth := 30 + filled := int(float64(barWidth) * float64(processed) / float64(p.total)) + if p.total == 0 { + filled = 0 + } + + var bar strings.Builder + for i := range barWidth { + if i < filled { + bar.WriteString("█") + } else { + bar.WriteString("░") + } + } + + fmt.Printf("\r[%s] %5.1f%% | %d/%d | %.1f jobs/s | ETA: %s | ✓%d ○%d ✗%d ", + bar.String(), percent, processed, p.total, throughput, eta, imported, skipped, failed) +} + +func (p *progressMeter) stop() { + close(p.done) +} + // importArchive imports all jobs from a source archive backend to a destination archive backend. // It uses parallel processing with a worker pool to improve performance. +// The import can be interrupted by CTRL-C (SIGINT) and will terminate gracefully. // Returns the number of successfully imported jobs, failed jobs, and any error encountered. 
-func importArchive(srcBackend, dstBackend archive.ArchiveBackend) (int, int, error) { +func importArchive(srcBackend, dstBackend archive.ArchiveBackend, srcConfig string) (int, int, error) { cclog.Info("Starting parallel archive import...") + cclog.Info("Press CTRL-C to interrupt (will finish current jobs before exiting)") - // Use atomic counters for thread-safe updates - var imported int32 - var failed int32 - var skipped int32 + ctx, cancel := context.WithCancel(context.Background()) + defer cancel() + + sigChan := make(chan os.Signal, 1) + signal.Notify(sigChan, os.Interrupt, syscall.SIGTERM) + + var interrupted atomic.Bool + + go func() { + <-sigChan + cclog.Warn("Interrupt received, stopping import (finishing current jobs)...") + interrupted.Store(true) + cancel() + // Stop listening for further signals to allow force quit with second CTRL-C + signal.Stop(sigChan) + }() + + cclog.Info("Counting jobs in source archive (this may take a long time) ...") + totalJobs, err := countJobs(srcConfig) + if err != nil { + return 0, 0, fmt.Errorf("failed to count jobs: %w", err) + } + cclog.Infof("Found %d jobs to process", totalJobs) + + progress := newProgressMeter(totalJobs) - // Number of parallel workers numWorkers := 4 cclog.Infof("Using %d parallel workers", numWorkers) - // Create channels for job distribution jobs := make(chan archive.JobContainer, numWorkers*2) - // WaitGroup to track worker completion var wg sync.WaitGroup - // Start worker goroutines + progress.start() + for i := range numWorkers { wg.Add(1) go func(workerID int) { defer wg.Done() for job := range jobs { - // Validate job metadata if job.Meta == nil { cclog.Warn("Skipping job with nil metadata") - atomic.AddInt32(&failed, 1) + atomic.AddInt32(&progress.failed, 1) + atomic.AddInt32(&progress.processed, 1) continue } - // Validate job data if job.Data == nil { cclog.Warnf("Job %d from cluster %s has no metric data, skipping", job.Meta.JobID, job.Meta.Cluster) - atomic.AddInt32(&failed, 1) + 
atomic.AddInt32(&progress.failed, 1) + atomic.AddInt32(&progress.processed, 1) continue } - // Check if job already exists in destination if dstBackend.Exists(job.Meta) { cclog.Debugf("Job %d (cluster: %s, start: %d) already exists in destination, skipping", job.Meta.JobID, job.Meta.Cluster, job.Meta.StartTime) - atomic.AddInt32(&skipped, 1) + atomic.AddInt32(&progress.skipped, 1) + atomic.AddInt32(&progress.processed, 1) continue } - // Import job to destination if err := dstBackend.ImportJob(job.Meta, job.Data); err != nil { cclog.Errorf("Failed to import job %d from cluster %s: %s", job.Meta.JobID, job.Meta.Cluster, err.Error()) - atomic.AddInt32(&failed, 1) + atomic.AddInt32(&progress.failed, 1) + atomic.AddInt32(&progress.processed, 1) continue } - // Successfully imported - newCount := atomic.AddInt32(&imported, 1) - if newCount%100 == 0 { - cclog.Infof("Progress: %d jobs imported, %d skipped, %d failed", - newCount, atomic.LoadInt32(&skipped), atomic.LoadInt32(&failed)) - } + atomic.AddInt32(&progress.imported, 1) + atomic.AddInt32(&progress.processed, 1) } }(i) } - // Feed jobs to workers go func() { - // Import cluster configs first + defer close(jobs) + clusters := srcBackend.GetClusters() for _, clusterName := range clusters { + if ctx.Err() != nil { + return + } + clusterCfg, err := srcBackend.LoadClusterCfg(clusterName) if err != nil { cclog.Errorf("Failed to load cluster config for %s: %v", clusterName, err) @@ -121,20 +338,33 @@ func importArchive(srcBackend, dstBackend archive.ArchiveBackend) (int, int, err } for job := range srcBackend.Iter(true) { - jobs <- job + select { + case <-ctx.Done(): + // Drain remaining items from iterator to avoid resource leak + // but don't process them + return + case jobs <- job: + } } - close(jobs) }() - // Wait for all workers to complete wg.Wait() + progress.stop() - finalImported := int(atomic.LoadInt32(&imported)) - finalFailed := int(atomic.LoadInt32(&failed)) - finalSkipped := int(atomic.LoadInt32(&skipped)) 
+ finalImported := int(atomic.LoadInt32(&progress.imported)) + finalFailed := int(atomic.LoadInt32(&progress.failed)) + finalSkipped := int(atomic.LoadInt32(&progress.skipped)) - cclog.Infof("Import completed: %d jobs imported, %d skipped, %d failed", - finalImported, finalSkipped, finalFailed) + elapsed := time.Since(progress.startTime) + + if interrupted.Load() { + cclog.Warnf("Import interrupted after %s: %d jobs imported, %d skipped, %d failed", + formatDuration(elapsed), finalImported, finalSkipped, finalFailed) + return finalImported, finalFailed, fmt.Errorf("import interrupted by user") + } + + cclog.Infof("Import completed in %s: %d jobs imported, %d skipped, %d failed", + formatDuration(elapsed), finalImported, finalSkipped, finalFailed) if finalFailed > 0 { return finalImported, finalFailed, fmt.Errorf("%d jobs failed to import", finalFailed) @@ -143,20 +373,220 @@ func importArchive(srcBackend, dstBackend archive.ArchiveBackend) (int, int, err return finalImported, finalFailed, nil } +// parseSourceConfig parses the common kind/path/s3 fields from a config JSON string. +type sourceConfig struct { + Kind string `json:"kind"` + Path string `json:"path"` + Endpoint string `json:"endpoint"` + Bucket string `json:"bucket"` + AccessKey string `json:"accessKey"` + SecretKey string `json:"secretKey"` + Region string `json:"region"` + UsePathStyle bool `json:"usePathStyle"` +} + +// createParquetTarget creates a ParquetTarget from a parsed config. +func createParquetTarget(cfg sourceConfig) (pqarchive.ParquetTarget, error) { + switch cfg.Kind { + case "s3": + return pqarchive.NewS3Target(pqarchive.S3TargetConfig{ + Endpoint: cfg.Endpoint, + Bucket: cfg.Bucket, + AccessKey: cfg.AccessKey, + SecretKey: cfg.SecretKey, + Region: cfg.Region, + UsePathStyle: cfg.UsePathStyle, + }) + default: + return pqarchive.NewFileTarget(cfg.Path) + } +} + +// createParquetSource creates a ParquetSource from a parsed config. 
+func createParquetSource(cfg sourceConfig) (pqarchive.ParquetSource, error) { + switch cfg.Kind { + case "s3": + return pqarchive.NewS3ParquetSource(pqarchive.S3TargetConfig{ + Endpoint: cfg.Endpoint, + Bucket: cfg.Bucket, + AccessKey: cfg.AccessKey, + SecretKey: cfg.SecretKey, + Region: cfg.Region, + UsePathStyle: cfg.UsePathStyle, + }) + default: + if cfg.Path == "" { + return nil, fmt.Errorf("file source: path is required") + } + return pqarchive.NewFileParquetSource(cfg.Path), nil + } +} + +// convertJSONToParquet converts a JSON archive backend to parquet format. +func convertJSONToParquet(srcBackend archive.ArchiveBackend, dstCfg sourceConfig, maxSizeMB int) error { + target, err := createParquetTarget(dstCfg) + if err != nil { + return fmt.Errorf("create parquet target: %w", err) + } + + cw := pqarchive.NewClusterAwareParquetWriter(target, maxSizeMB) + + // Transfer cluster configs + for _, clusterName := range srcBackend.GetClusters() { + clusterCfg, err := srcBackend.LoadClusterCfg(clusterName) + if err != nil { + cclog.Warnf("Convert: load cluster config %q: %v", clusterName, err) + continue + } + cw.SetClusterConfig(clusterName, clusterCfg) + } + + converted := 0 + failed := 0 + startTime := time.Now() + + for job := range srcBackend.Iter(true) { + if job.Meta == nil { + cclog.Warn("Skipping job with nil metadata") + failed++ + continue + } + if job.Data == nil { + cclog.Warnf("Job %d has no metric data, skipping", job.Meta.JobID) + failed++ + continue + } + + row, err := pqarchive.JobToParquetRow(job.Meta, job.Data) + if err != nil { + cclog.Warnf("Convert job %d: %v", job.Meta.JobID, err) + failed++ + continue + } + if err := cw.AddJob(*row); err != nil { + cclog.Errorf("Add job %d to writer: %v", job.Meta.JobID, err) + failed++ + continue + } + converted++ + + if converted%1000 == 0 { + cclog.Infof("Converted %d jobs so far...", converted) + } + } + + if err := cw.Close(); err != nil { + return fmt.Errorf("close parquet writer: %w", err) + } + + 
elapsed := time.Since(startTime) + cclog.Infof("JSON->Parquet conversion completed in %s: %d jobs converted, %d failed", + formatDuration(elapsed), converted, failed) + return nil +} + +// convertParquetToJSON converts a parquet archive to a JSON archive backend. +func convertParquetToJSON(srcCfg sourceConfig, dstBackend archive.ArchiveBackend) error { + src, err := createParquetSource(srcCfg) + if err != nil { + return fmt.Errorf("create parquet source: %w", err) + } + + clusters, err := src.GetClusters() + if err != nil { + return fmt.Errorf("list clusters: %w", err) + } + + converted := 0 + failed := 0 + skipped := 0 + startTime := time.Now() + + for _, cluster := range clusters { + // Transfer cluster config + clusterCfg, err := src.ReadClusterConfig(cluster) + if err != nil { + cclog.Warnf("Convert: read cluster config %q: %v", cluster, err) + } else { + if err := dstBackend.StoreClusterCfg(cluster, clusterCfg); err != nil { + cclog.Warnf("Convert: store cluster config %q: %v", cluster, err) + } else { + cclog.Infof("Imported cluster config for %s", cluster) + } + } + + // Read and convert parquet files + files, err := src.ListParquetFiles(cluster) + if err != nil { + cclog.Errorf("Convert: list parquet files for %q: %v", cluster, err) + continue + } + + for _, file := range files { + data, err := src.ReadFile(file) + if err != nil { + cclog.Errorf("Convert: read file %q: %v", file, err) + failed++ + continue + } + + rows, err := pqarchive.ReadParquetFile(data) + if err != nil { + cclog.Errorf("Convert: parse parquet file %q: %v", file, err) + failed++ + continue + } + + cclog.Infof("Processing %s: %d jobs", file, len(rows)) + + for _, row := range rows { + meta, jobData, err := pqarchive.ParquetRowToJob(&row) + if err != nil { + cclog.Warnf("Convert row to job: %v", err) + failed++ + continue + } + + if dstBackend.Exists(meta) { + skipped++ + continue + } + + if err := dstBackend.ImportJob(meta, jobData); err != nil { + cclog.Warnf("Import job %d: %v", 
meta.JobID, err) + failed++ + continue + } + converted++ + } + } + } + + elapsed := time.Since(startTime) + cclog.Infof("Parquet->JSON conversion completed in %s: %d jobs converted, %d skipped, %d failed", + formatDuration(elapsed), converted, skipped, failed) + return nil +} + func main() { var srcPath, flagConfigFile, flagLogLevel, flagRemoveCluster, flagRemoveAfter, flagRemoveBefore string var flagSrcConfig, flagDstConfig string - var flagLogDateTime, flagValidate, flagImport bool + var flagLogDateTime, flagValidate, flagImport, flagConvert bool + var flagFormat string + var flagMaxFileSize int flag.StringVar(&srcPath, "s", "./var/job-archive", "Specify the source job archive path. Default is ./var/job-archive") flag.BoolVar(&flagLogDateTime, "logdate", false, "Set this flag to add date and time to log messages") - flag.StringVar(&flagLogLevel, "loglevel", "warn", "Sets the logging level: `[debug,info,warn (default),err,fatal,crit]`") + flag.StringVar(&flagLogLevel, "loglevel", "info", "Sets the logging level: `[debug,info,warn (default),err,fatal,crit]`") flag.StringVar(&flagConfigFile, "config", "./config.json", "Specify alternative path to `config.json`") flag.StringVar(&flagRemoveCluster, "remove-cluster", "", "Remove cluster from archive and database") flag.StringVar(&flagRemoveBefore, "remove-before", "", "Remove all jobs with start time before date (Format: 2006-Jan-04)") flag.StringVar(&flagRemoveAfter, "remove-after", "", "Remove all jobs with start time after date (Format: 2006-Jan-04)") flag.BoolVar(&flagValidate, "validate", false, "Set this flag to validate a job archive against the json schema") flag.BoolVar(&flagImport, "import", false, "Import jobs from source archive to destination archive") + flag.BoolVar(&flagConvert, "convert", false, "Convert archive between JSON and Parquet formats") + flag.StringVar(&flagFormat, "format", "json", "Output format for conversion: 'json' or 'parquet'") + flag.IntVar(&flagMaxFileSize, "max-file-size", 512, "Max 
parquet file size in MB (only for parquet output)") flag.StringVar(&flagSrcConfig, "src-config", "", "Source archive backend configuration (JSON), e.g. '{\"kind\":\"file\",\"path\":\"./archive\"}'") flag.StringVar(&flagDstConfig, "dst-config", "", "Destination archive backend configuration (JSON), e.g. '{\"kind\":\"sqlite\",\"dbPath\":\"./archive.db\"}'") flag.Parse() @@ -188,7 +618,7 @@ func main() { cclog.Info("Destination backend initialized successfully") // Perform import - imported, failed, err := importArchive(srcBackend, dstBackend) + imported, failed, err := importArchive(srcBackend, dstBackend, flagSrcConfig) if err != nil { cclog.Errorf("Import completed with errors: %s", err.Error()) if failed > 0 { @@ -200,20 +630,59 @@ func main() { os.Exit(0) } + // Handle convert mode + if flagConvert { + if flagSrcConfig == "" || flagDstConfig == "" { + cclog.Fatal("Both --src-config and --dst-config must be specified for convert mode") + } + + var srcCfg, dstCfg sourceConfig + if err := json.Unmarshal([]byte(flagSrcConfig), &srcCfg); err != nil { + cclog.Fatalf("Failed to parse source config: %s", err.Error()) + } + if err := json.Unmarshal([]byte(flagDstConfig), &dstCfg); err != nil { + cclog.Fatalf("Failed to parse destination config: %s", err.Error()) + } + + switch flagFormat { + case "parquet": + // JSON archive -> Parquet: source is an archive backend + cclog.Info("Convert mode: JSON -> Parquet") + srcBackend, err := archive.InitBackend(json.RawMessage(flagSrcConfig)) + if err != nil { + cclog.Fatalf("Failed to initialize source backend: %s", err.Error()) + } + if err := convertJSONToParquet(srcBackend, dstCfg, flagMaxFileSize); err != nil { + cclog.Fatalf("Conversion failed: %s", err.Error()) + } + case "json": + // Parquet -> JSON archive: destination is an archive backend + cclog.Info("Convert mode: Parquet -> JSON") + dstBackend, err := archive.InitBackend(json.RawMessage(flagDstConfig)) + if err != nil { + cclog.Fatalf("Failed to initialize destination 
backend: %s", err.Error()) + } + if err := convertParquetToJSON(srcCfg, dstBackend); err != nil { + cclog.Fatalf("Conversion failed: %s", err.Error()) + } + default: + cclog.Fatalf("Unknown format %q: must be 'json' or 'parquet'", flagFormat) + } + + cclog.Info("Conversion finished successfully") + os.Exit(0) + } + ccconf.Init(flagConfigFile) // Load and check main configuration if cfg := ccconf.GetPackageConfig("main"); cfg != nil { - if clustercfg := ccconf.GetPackageConfig("clusters"); clustercfg != nil { - config.Init(cfg, clustercfg) - } else { - cclog.Abort("Cluster configuration must be present") - } + config.Init(cfg) } else { cclog.Abort("Main configuration must be present") } - if err := archive.Init(json.RawMessage(archiveCfg), false); err != nil { + if err := archive.Init(json.RawMessage(archiveCfg)); err != nil { cclog.Fatal(err) } ar := archive.GetHandle() diff --git a/tools/archive-migration/main.go b/tools/archive-migration/main.go index 9bbed121..1384e065 100644 --- a/tools/archive-migration/main.go +++ b/tools/archive-migration/main.go @@ -12,7 +12,7 @@ import ( "path/filepath" "strings" - cclog "github.com/ClusterCockpit/cc-lib/ccLogger" + cclog "github.com/ClusterCockpit/cc-lib/v2/ccLogger" ) func main() { @@ -70,7 +70,6 @@ func main() { // Run migration migrated, failed, err := migrateArchive(archivePath, dryRun, numWorkers) - if err != nil { cclog.Errorf("Migration completed with errors: %s", err.Error()) if failed > 0 { @@ -104,5 +103,5 @@ func checkVersion(archivePath string) error { func updateVersion(archivePath string) error { versionFile := filepath.Join(archivePath, "version.txt") - return os.WriteFile(versionFile, []byte("3\n"), 0644) + return os.WriteFile(versionFile, []byte("3\n"), 0o644) } diff --git a/tools/archive-migration/transforms.go b/tools/archive-migration/transforms.go index 6558e47a..ef4ba5eb 100644 --- a/tools/archive-migration/transforms.go +++ b/tools/archive-migration/transforms.go @@ -12,7 +12,7 @@ import ( "sync" 
"sync/atomic" - cclog "github.com/ClusterCockpit/cc-lib/ccLogger" + cclog "github.com/ClusterCockpit/cc-lib/v2/ccLogger" ) // transformExclusiveToShared converts the old 'exclusive' field to the new 'shared' field diff --git a/tools/convert-pem-pubkey/Readme.md b/tools/convert-pem-pubkey/Readme.md index 1429acc4..22fd0db2 100644 --- a/tools/convert-pem-pubkey/Readme.md +++ b/tools/convert-pem-pubkey/Readme.md @@ -16,7 +16,7 @@ CROSS_LOGIN_JWT_PUBLIC_KEY="+51iXX8BdLFocrppRxIw52xCOf8xFSH/eNilN5IHVGc=" Instructions -- `cd tools/convert-pem-pubkey-for-cc/` +- `cd tools/convert-pem-pubkey/` - Insert your public ed25519 PEM key into `dummy.pub` - `go run . dummy.pub` - Copy the result into ClusterCockpit's `.env` diff --git a/tools/dataGenerator.sh b/tools/dataGenerator.sh new file mode 100644 index 00000000..338fd190 --- /dev/null +++ b/tools/dataGenerator.sh @@ -0,0 +1,186 @@ +#!/bin/bash + +# ========================================== +# CONFIGURATION & FLAGS +# ========================================== + +# MODE SETTINGS +TRANSPORT_MODE="REST" # Options: "REST" or "NATS" +CONNECTION_SCOPE="INTERNAL" # Options: "INTERNAL" or "EXTERNAL" +API_USER="demo" # User for JWT generation + +# BASE NETWORK CONFIG +SERVICE_ADDRESS="http://localhost:8080" +NATS_SERVER="nats://0.0.0.0:4222" +REST_URL="${SERVICE_ADDRESS}/api/write" + +# NATS CREDENTIALS +NATS_USER="root" +NATS_PASS="root" +NATS_SUBJECT="hpc-nats" + +# EXTERNAL JWT (Used if CONNECTION_SCOPE is EXTERNAL) +JWT_STATIC="eyJhbGciOiJFZERTQSIsInR5cCI6IkpXVCJ9.eyJleHAiOjE3NzU3Nzg4NDQsImlhdCI6MTc2ODU3ODg0NCwicm9sZXMiOlsiYWRtaW4iLCJhcGkiXSwic3ViIjoiZGVtbyJ9._SDEW9WaUVXSBFmWqGhyIZXLoqoDU8F1hkfh4cXKIqF4yw7w50IUpfUBtwUFUOnoviFKoi563f6RAMC7XxeLDA" + +# ========================================== +# DATA DEFINITIONS +# ========================================== +ALEX_HOSTS="a0603 a0903 a0832 a0329 a0702 a0122 a1624 a0731 a0224 a0704 a0631 a0225 a0222 a0427 a0603 a0429 a0833 a0705 a0901 a0601 a0227 a0804 a0322 a0226 a0126 a0129 
a0605 a0801 a0934 a1622 a0902 a0428 a0537 a1623 a1722 a0228 a0701 a0326 a0327 a0123 a0321 a1621 a0323 a0124 a0534 a0931 a0324 a0933 a0424 a0905 a0128 a0532 a0805 a0521 a0535 a0932 a0127 a0325 a0633 a0831 a0803 a0426 a0425 a0229 a1721 a0602 a0632 a0223 a0422 a0423 a0536 a0328 a0703 anvme7 a0125 a0221 a0604 a0802 a0522 a0531 a0533 a0904" +FRITZ_HOSTS="f0201 f0202 f0203 f0204 f0205 f0206 f0207 f0208 f0209 f0210 f0211 f0212 f0213 f0214 f0215 f0217 f0218 f0219 f0220 f0221 f0222 f0223 f0224 f0225 f0226 f0227 f0228 f0229 f0230 f0231 f0232 f0233 f0234 f0235 f0236 f0237 f0238 f0239 f0240 f0241 f0242 f0243 f0244 f0245 f0246 f0247 f0248 f0249 f0250 f0251 f0252 f0253 f0254 f0255 f0256 f0257 f0258 f0259 f0260 f0261 f0262 f0263 f0264 f0378" + +ALEX_METRICS_HWTHREAD="cpu_user flops_any clock core_power ipc" +ALEX_METRICS_SOCKET="mem_bw cpu_power" +ALEX_METRICS_ACC="acc_utilization acc_mem_used acc_power nv_mem_util nv_temp nv_sm_clock" +ALEX_METRICS_NODE="cpu_load mem_used net_bytes_in net_bytes_out" + +FRITZ_METRICS_HWTHREAD="cpu_user flops_any flops_sp flops_dp clock ipc vectorization_ratio" +FRITZ_METRICS_SOCKET="mem_bw cpu_power mem_power" +FRITZ_METRICS_NODE="cpu_load mem_used ib_recv ib_xmit ib_recv_pkts ib_xmit_pkts nfs4_read nfs4_total" + +ACCEL_IDS="00000000:49:00.0 00000000:0E:00.0 00000000:D1:00.0 00000000:90:00.0 00000000:13:00.0 00000000:96:00.0 00000000:CC:00.0 00000000:4F:00.0" + +# ========================================== +# SETUP ENV (URL & TOKEN) +# ========================================== + +if [ "$CONNECTION_SCOPE" == "INTERNAL" ]; then + # 2. Generate JWT dynamically + echo "Setup: INTERNAL mode selected." + echo "Generating JWT for user: $API_USER" + JWT=$(./cc-backend -jwt "$API_USER" | grep -oP "(?<=JWT: Successfully generated JWT for user '${API_USER}': ).*") + + if [ -z "$JWT" ]; then + echo "Error: Failed to generate JWT from cc-backend." + exit 1 + fi +else + # 2. Use Static JWT + echo "Setup: EXTERNAL mode selected." + echo "Using static JWT." 
+ JWT="$JWT_STATIC" +fi + +echo "Target URL: $REST_URL" + +# ========================================== +# FUNCTIONS +# ========================================== + +send_payload() { + local file_path=$1 + local cluster_name=$2 + + if [ "$TRANSPORT_MODE" == "NATS" ]; then + # Piping file content directly to nats stdin + cat "$file_path" | nats pub "$NATS_SUBJECT" -s "$NATS_SERVER" --user "$NATS_USER" --password "$NATS_PASS" + else + # Sending via REST API + curl -s -X 'POST' "${REST_URL}/?cluster=${cluster_name}" \ + -H "Authorization: Bearer $JWT" \ + --data-binary "@$file_path" + fi + + # Clean up immediately + rm "$file_path" +} + +# ========================================== +# MAIN LOOP +# ========================================== + +# Clean up leftovers +rm -f sample_fritz.txt sample_alex.txt + +while [ true ]; do + timestamp="$(date '+%s')" + echo "--- Cycle Start: $timestamp [Mode: $TRANSPORT_MODE | Scope: $CONNECTION_SCOPE] ---" + + # 1. ALEX: HWTHREAD + echo "Generating Alex: hwthread" + { + for metric in $ALEX_METRICS_HWTHREAD; do + for hostname in $ALEX_HOSTS; do + for id in {0..127}; do + echo "$metric,cluster=alex,hostname=$hostname,type=hwthread,type-id=$id value=$((1 + RANDOM % 100)).0 $timestamp" + done + done + done + } > sample_alex.txt + send_payload "sample_alex.txt" "alex" + + # 2. FRITZ: HWTHREAD + echo "Generating Fritz: hwthread" + { + for metric in $FRITZ_METRICS_HWTHREAD; do + for hostname in $FRITZ_HOSTS; do + for id in {0..71}; do + echo "$metric,cluster=fritz,hostname=$hostname,type=hwthread,type-id=$id value=$((1 + RANDOM % 100)).0 $timestamp" + done + done + done + } > sample_fritz.txt + send_payload "sample_fritz.txt" "fritz" + + # 3. 
ALEX: ACCELERATOR + echo "Generating Alex: accelerator" + { + for metric in $ALEX_METRICS_ACC; do + for hostname in $ALEX_HOSTS; do + for id in $ACCEL_IDS; do + echo "$metric,cluster=alex,hostname=$hostname,type=accelerator,type-id=$id value=$((1 + RANDOM % 100)).0 $timestamp" + done + done + done + } > sample_alex.txt + send_payload "sample_alex.txt" "alex" + + # 5. ALEX: SOCKET + echo "Generating Alex: socket" + { + for metric in $ALEX_METRICS_SOCKET; do + for hostname in $ALEX_HOSTS; do + for id in {0..1}; do + echo "$metric,cluster=alex,hostname=$hostname,type=socket,type-id=$id value=$((1 + RANDOM % 100)).0 $timestamp" + done + done + done + } > sample_alex.txt + send_payload "sample_alex.txt" "alex" + + # 6. FRITZ: SOCKET + echo "Generating Fritz: socket" + { + for metric in $FRITZ_METRICS_SOCKET; do + for hostname in $FRITZ_HOSTS; do + for id in {0..1}; do + echo "$metric,cluster=fritz,hostname=$hostname,type=socket,type-id=$id value=$((1 + RANDOM % 100)).0 $timestamp" + done + done + done + } > sample_fritz.txt + send_payload "sample_fritz.txt" "fritz" + + # 7. ALEX: NODE + echo "Generating Alex: node" + { + for metric in $ALEX_METRICS_NODE; do + for hostname in $ALEX_HOSTS; do + echo "$metric,cluster=alex,hostname=$hostname,type=node value=$((1 + RANDOM % 100)).0 $timestamp" + done + done + } > sample_alex.txt + send_payload "sample_alex.txt" "alex" + + # 8. 
FRITZ: NODE + echo "Generating Fritz: node" + { + for metric in $FRITZ_METRICS_NODE; do + for hostname in $FRITZ_HOSTS; do + echo "$metric,cluster=fritz,hostname=$hostname,type=node value=$((1 + RANDOM % 100)).0 $timestamp" + done + done + } > sample_fritz.txt + send_payload "sample_fritz.txt" "fritz" + + sleep 1m +done \ No newline at end of file diff --git a/web/configSchema.go b/web/configSchema.go index 5690b63a..66e57195 100644 --- a/web/configSchema.go +++ b/web/configSchema.go @@ -8,57 +8,57 @@ package web const configSchema = `{ "type": "object", "properties": { - "jobList": { + "job-list": { "description": "Job list defaults. Applies to user- and jobs views.", "type": "object", "properties": { - "usePaging": { + "use-paging": { "description": "If classic paging is used instead of continuous scrolling by default.", "type": "boolean" }, - "showFootprint": { + "show-footprint": { "description": "If footprint bars are shown as first column by default.", "type": "boolean" } } }, - "nodeList": { + "node-list": { "description": "Node list defaults. 
Applies to node list view.", "type": "object", "properties": { - "usePaging": { + "use-paging": { "description": "If classic paging is used instead of continuous scrolling by default.", "type": "boolean" } } }, - "jobView": { + "job-view": { "description": "Job view defaults.", "type": "object", "properties": { - "showPolarPlot": { + "show-polar-plot": { "description": "If the job metric footprints polar plot is shown by default.", "type": "boolean" }, - "showFootprint": { + "show-footprint": { "description": "If the annotated job metric footprint bars are shown by default.", "type": "boolean" }, - "showRoofline": { + "show-roofline": { "description": "If the job roofline plot is shown by default.", "type": "boolean" }, - "showStatTable": { + "show-stat-table": { "description": "If the job metric statistics table is shown by default.", "type": "boolean" } } }, - "metricConfig": { + "metric-config": { "description": "Global initial metric selections for primary views of all clusters.", "type": "object", "properties": { - "jobListMetrics": { + "job-list-metrics": { "description": "Initial metrics shown for new users in job lists (User and jobs view).", "type": "array", "items": { @@ -66,7 +66,7 @@ const configSchema = `{ "minItems": 1 } }, - "jobViewPlotMetrics": { + "job-view-plot-metrics": { "description": "Initial metrics shown for new users as job view metric plots.", "type": "array", "items": { @@ -74,7 +74,7 @@ const configSchema = `{ "minItems": 1 } }, - "jobViewTableMetrics": { + "job-view-table-metrics": { "description": "Initial metrics shown for new users in job view statistics table.", "type": "array", "items": { @@ -91,7 +91,7 @@ const configSchema = `{ "name": { "description": "The name of the cluster." 
}, - "jobListMetrics": { + "job-list-metrics": { "description": "Initial metrics shown for new users in job lists (User and jobs view) for subcluster.", "type": "array", "items": { @@ -99,7 +99,7 @@ const configSchema = `{ "minItems": 1 } }, - "jobViewPlotMetrics": { + "job-view-plot-metrics": { "description": "Initial metrics shown for new users as job view timeplots for subcluster.", "type": "array", "items": { @@ -107,7 +107,7 @@ const configSchema = `{ "minItems": 1 } }, - "jobViewTableMetrics": { + "job-view-table-metrics": { "description": "Initial metrics shown for new users in job view statistics table for subcluster.", "type": "array", "items": { @@ -115,7 +115,7 @@ const configSchema = `{ "minItems": 1 } }, - "subClusters": { + "sub-clusters": { "description": "The array of overrides per subcluster.", "type": "array", "items": { @@ -125,7 +125,7 @@ const configSchema = `{ "description": "The name of the subcluster.", "type": "string" }, - "jobListMetrics": { + "job-list-metrics": { "description": "Initial metrics shown for new users in job lists (User and jobs view) for subcluster.", "type": "array", "items": { @@ -133,7 +133,7 @@ const configSchema = `{ "minItems": 1 } }, - "jobViewPlotMetrics": { + "job-view-plot-metrics": { "description": "Initial metrics shown for new users as job view timeplots for subcluster.", "type": "array", "items": { @@ -141,7 +141,7 @@ const configSchema = `{ "minItems": 1 } }, - "jobViewTableMetrics": { + "job-view-table-metrics": { "description": "Initial metrics shown for new users in job view statistics table for subcluster.", "type": "array", "items": { @@ -155,29 +155,29 @@ const configSchema = `{ } } }, - "required": ["name", "subClusters"], + "required": ["name"], "minItems": 1 } } } }, - "plotConfiguration": { + "plot-configuration": { "description": "Initial settings for plot render options.", "type": "object", "properties": { - "colorBackground": { + "color-background": { "description": "If the metric plot 
backgrounds are initially colored by threshold limits.", "type": "boolean" }, - "plotsPerRow": { + "plots-per-row": { "description": "How many plots are initially rendered in per row. Applies to job, single node, and analysis views.", "type": "integer" }, - "lineWidth": { + "line-width": { "description": "Initial thickness of rendered plotlines. Applies to metric plot, job compare plot and roofline.", "type": "integer" }, - "colorScheme": { + "color-scheme": { "description": "Initial colorScheme to be used for metric plots.", "type": "array", "items": { diff --git a/web/frontend/README.md b/web/frontend/README.md index d61d302e..4dff4405 100644 --- a/web/frontend/README.md +++ b/web/frontend/README.md @@ -1,11 +1,11 @@ # cc-frontend -[![Build](https://github.com/ClusterCockpit/cc-svelte-datatable/actions/workflows/build.yml/badge.svg)](https://github.com/ClusterCockpit/cc-svelte-datatable/actions/workflows/build.yml) +[![Build](https://github.com/ClusterCockpit/cc-backend/actions/workflows/test.yml/badge.svg)](https://github.com/ClusterCockpit/cc-backend/actions/workflows/test.yml) -A frontend for [ClusterCockpit](https://github.com/ClusterCockpit/ClusterCockpit) and [cc-backend](https://github.com/ClusterCockpit/cc-backend). Backend specific configuration can de done using the constants defined in the `intro` section in `./rollup.config.js`. +A frontend for [ClusterCockpit](https://github.com/ClusterCockpit/ClusterCockpit) and [cc-backend](https://github.com/ClusterCockpit/cc-backend). Backend specific configuration can be done using the constants defined in the `intro` section in `./rollup.config.mjs`. 
Builds on: -* [Svelte](https://svelte.dev/) +* [Svelte 5](https://svelte.dev/) * [SvelteStrap](https://sveltestrap.js.org/) * [Bootstrap 5](https://getbootstrap.com/) * [urql](https://github.com/FormidableLabs/urql) diff --git a/web/frontend/package-lock.json b/web/frontend/package-lock.json index 4c7e4bf5..8656abc4 100644 --- a/web/frontend/package-lock.json +++ b/web/frontend/package-lock.json @@ -1,12 +1,12 @@ { "name": "cc-frontend", - "version": "1.0.0", + "version": "1.5.0", "lockfileVersion": 3, "requires": true, "packages": { "": { "name": "cc-frontend", - "version": "1.0.0", + "version": "1.5.0", "license": "MIT", "dependencies": { "@rollup/plugin-replace": "^6.0.3", @@ -14,20 +14,20 @@ "@urql/svelte": "^4.2.3", "chart.js": "^4.5.1", "date-fns": "^4.1.0", - "graphql": "^16.12.0", - "mathjs": "^15.0.0", + "graphql": "^16.13.1", + "mathjs": "^15.1.1", "uplot": "^1.6.32", "wonka": "^6.3.5" }, "devDependencies": { - "@rollup/plugin-commonjs": "^29.0.0", - "@rollup/plugin-node-resolve": "^16.0.1", - "@rollup/plugin-terser": "^0.4.4", + "@rollup/plugin-commonjs": "^29.0.2", + "@rollup/plugin-node-resolve": "^16.0.3", + "@rollup/plugin-terser": "^1.0.0", "@timohausmann/quadtree-js": "^1.2.6", - "rollup": "^4.53.3", + "rollup": "^4.59.0", "rollup-plugin-css-only": "^4.5.5", "rollup-plugin-svelte": "^7.2.3", - "svelte": "^5.44.0" + "svelte": "^5.53.9" } }, "node_modules/@0no-co/graphql.web": { @@ -45,9 +45,9 @@ } }, "node_modules/@babel/runtime": { - "version": "7.28.4", - "resolved": "https://registry.npmjs.org/@babel/runtime/-/runtime-7.28.4.tgz", - "integrity": "sha512-Q/N6JNWvIvPnLDvjlE1OUBLPQHH6l3CltCEsHIujp45zQUSSh8K+gHnaEX45yAT1nyngnINhvWtzN+Nb9D8RAQ==", + "version": "7.28.6", + "resolved": "https://registry.npmjs.org/@babel/runtime/-/runtime-7.28.6.tgz", + "integrity": "sha512-05WQkdpL9COIMz4LjTxGpPNCdlpyimKppYNoJ5Di5EUObifl8t4tuLuUBBZEpoLYOmfvIWrsp9fCl0HoPRVTdA==", "license": "MIT", "engines": { "node": ">=6.9.0" @@ -126,9 +126,9 @@ } }, 
"node_modules/@rollup/plugin-commonjs": { - "version": "29.0.0", - "resolved": "https://registry.npmjs.org/@rollup/plugin-commonjs/-/plugin-commonjs-29.0.0.tgz", - "integrity": "sha512-U2YHaxR2cU/yAiwKJtJRhnyLk7cifnQw0zUpISsocBDoHDJn+HTV74ABqnwr5bEgWUwFZC9oFL6wLe21lHu5eQ==", + "version": "29.0.2", + "resolved": "https://registry.npmjs.org/@rollup/plugin-commonjs/-/plugin-commonjs-29.0.2.tgz", + "integrity": "sha512-S/ggWH1LU7jTyi9DxZOKyxpVd4hF/OZ0JrEbeLjXk/DFXwRny0tjD2c992zOUYQobLrVkRVMDdmHP16HKP7GRg==", "dev": true, "license": "MIT", "dependencies": { @@ -199,18 +199,18 @@ } }, "node_modules/@rollup/plugin-terser": { - "version": "0.4.4", - "resolved": "https://registry.npmjs.org/@rollup/plugin-terser/-/plugin-terser-0.4.4.tgz", - "integrity": "sha512-XHeJC5Bgvs8LfukDwWZp7yeqin6ns8RTl2B9avbejt6tZqsqvVoWI7ZTQrcNsfKEDWBTnTxM8nMDkO2IFFbd0A==", + "version": "1.0.0", + "resolved": "https://registry.npmjs.org/@rollup/plugin-terser/-/plugin-terser-1.0.0.tgz", + "integrity": "sha512-FnCxhTBx6bMOYQrar6C8h3scPt8/JwIzw3+AJ2K++6guogH5fYaIFia+zZuhqv0eo1RN7W1Pz630SyvLbDjhtQ==", "dev": true, "license": "MIT", "dependencies": { - "serialize-javascript": "^6.0.1", + "serialize-javascript": "^7.0.3", "smob": "^1.0.0", "terser": "^5.17.4" }, "engines": { - "node": ">=14.0.0" + "node": ">=20.0.0" }, "peerDependencies": { "rollup": "^2.0.0||^3.0.0||^4.0.0" @@ -244,13 +244,12 @@ } }, "node_modules/@rollup/rollup-android-arm-eabi": { - "version": "4.53.3", - "resolved": "https://registry.npmjs.org/@rollup/rollup-android-arm-eabi/-/rollup-android-arm-eabi-4.53.3.tgz", - "integrity": "sha512-mRSi+4cBjrRLoaal2PnqH82Wqyb+d3HsPUN/W+WslCXsZsyHa9ZeQQX/pQsZaVIWDkPcpV6jJ+3KLbTbgnwv8w==", + "version": "4.59.0", + "resolved": "https://registry.npmjs.org/@rollup/rollup-android-arm-eabi/-/rollup-android-arm-eabi-4.59.0.tgz", + "integrity": "sha512-upnNBkA6ZH2VKGcBj9Fyl9IGNPULcjXRlg0LLeaioQWueH30p6IXtJEbKAgvyv+mJaMxSm1l6xwDXYjpEMiLMg==", "cpu": [ "arm" ], - "dev": true, "license": "MIT", "optional": 
true, "os": [ @@ -258,13 +257,12 @@ ] }, "node_modules/@rollup/rollup-android-arm64": { - "version": "4.53.3", - "resolved": "https://registry.npmjs.org/@rollup/rollup-android-arm64/-/rollup-android-arm64-4.53.3.tgz", - "integrity": "sha512-CbDGaMpdE9sh7sCmTrTUyllhrg65t6SwhjlMJsLr+J8YjFuPmCEjbBSx4Z/e4SmDyH3aB5hGaJUP2ltV/vcs4w==", + "version": "4.59.0", + "resolved": "https://registry.npmjs.org/@rollup/rollup-android-arm64/-/rollup-android-arm64-4.59.0.tgz", + "integrity": "sha512-hZ+Zxj3SySm4A/DylsDKZAeVg0mvi++0PYVceVyX7hemkw7OreKdCvW2oQ3T1FMZvCaQXqOTHb8qmBShoqk69Q==", "cpu": [ "arm64" ], - "dev": true, "license": "MIT", "optional": true, "os": [ @@ -272,13 +270,12 @@ ] }, "node_modules/@rollup/rollup-darwin-arm64": { - "version": "4.53.3", - "resolved": "https://registry.npmjs.org/@rollup/rollup-darwin-arm64/-/rollup-darwin-arm64-4.53.3.tgz", - "integrity": "sha512-Nr7SlQeqIBpOV6BHHGZgYBuSdanCXuw09hon14MGOLGmXAFYjx1wNvquVPmpZnl0tLjg25dEdr4IQ6GgyToCUA==", + "version": "4.59.0", + "resolved": "https://registry.npmjs.org/@rollup/rollup-darwin-arm64/-/rollup-darwin-arm64-4.59.0.tgz", + "integrity": "sha512-W2Psnbh1J8ZJw0xKAd8zdNgF9HRLkdWwwdWqubSVk0pUuQkoHnv7rx4GiF9rT4t5DIZGAsConRE3AxCdJ4m8rg==", "cpu": [ "arm64" ], - "dev": true, "license": "MIT", "optional": true, "os": [ @@ -286,13 +283,12 @@ ] }, "node_modules/@rollup/rollup-darwin-x64": { - "version": "4.53.3", - "resolved": "https://registry.npmjs.org/@rollup/rollup-darwin-x64/-/rollup-darwin-x64-4.53.3.tgz", - "integrity": "sha512-DZ8N4CSNfl965CmPktJ8oBnfYr3F8dTTNBQkRlffnUarJ2ohudQD17sZBa097J8xhQ26AwhHJ5mvUyQW8ddTsQ==", + "version": "4.59.0", + "resolved": "https://registry.npmjs.org/@rollup/rollup-darwin-x64/-/rollup-darwin-x64-4.59.0.tgz", + "integrity": "sha512-ZW2KkwlS4lwTv7ZVsYDiARfFCnSGhzYPdiOU4IM2fDbL+QGlyAbjgSFuqNRbSthybLbIJ915UtZBtmuLrQAT/w==", "cpu": [ "x64" ], - "dev": true, "license": "MIT", "optional": true, "os": [ @@ -300,13 +296,12 @@ ] }, "node_modules/@rollup/rollup-freebsd-arm64": { - 
"version": "4.53.3", - "resolved": "https://registry.npmjs.org/@rollup/rollup-freebsd-arm64/-/rollup-freebsd-arm64-4.53.3.tgz", - "integrity": "sha512-yMTrCrK92aGyi7GuDNtGn2sNW+Gdb4vErx4t3Gv/Tr+1zRb8ax4z8GWVRfr3Jw8zJWvpGHNpss3vVlbF58DZ4w==", + "version": "4.59.0", + "resolved": "https://registry.npmjs.org/@rollup/rollup-freebsd-arm64/-/rollup-freebsd-arm64-4.59.0.tgz", + "integrity": "sha512-EsKaJ5ytAu9jI3lonzn3BgG8iRBjV4LxZexygcQbpiU0wU0ATxhNVEpXKfUa0pS05gTcSDMKpn3Sx+QB9RlTTA==", "cpu": [ "arm64" ], - "dev": true, "license": "MIT", "optional": true, "os": [ @@ -314,13 +309,12 @@ ] }, "node_modules/@rollup/rollup-freebsd-x64": { - "version": "4.53.3", - "resolved": "https://registry.npmjs.org/@rollup/rollup-freebsd-x64/-/rollup-freebsd-x64-4.53.3.tgz", - "integrity": "sha512-lMfF8X7QhdQzseM6XaX0vbno2m3hlyZFhwcndRMw8fbAGUGL3WFMBdK0hbUBIUYcEcMhVLr1SIamDeuLBnXS+Q==", + "version": "4.59.0", + "resolved": "https://registry.npmjs.org/@rollup/rollup-freebsd-x64/-/rollup-freebsd-x64-4.59.0.tgz", + "integrity": "sha512-d3DuZi2KzTMjImrxoHIAODUZYoUUMsuUiY4SRRcJy6NJoZ6iIqWnJu9IScV9jXysyGMVuW+KNzZvBLOcpdl3Vg==", "cpu": [ "x64" ], - "dev": true, "license": "MIT", "optional": true, "os": [ @@ -328,13 +322,15 @@ ] }, "node_modules/@rollup/rollup-linux-arm-gnueabihf": { - "version": "4.53.3", - "resolved": "https://registry.npmjs.org/@rollup/rollup-linux-arm-gnueabihf/-/rollup-linux-arm-gnueabihf-4.53.3.tgz", - "integrity": "sha512-k9oD15soC/Ln6d2Wv/JOFPzZXIAIFLp6B+i14KhxAfnq76ajt0EhYc5YPeX6W1xJkAdItcVT+JhKl1QZh44/qw==", + "version": "4.59.0", + "resolved": "https://registry.npmjs.org/@rollup/rollup-linux-arm-gnueabihf/-/rollup-linux-arm-gnueabihf-4.59.0.tgz", + "integrity": "sha512-t4ONHboXi/3E0rT6OZl1pKbl2Vgxf9vJfWgmUoCEVQVxhW6Cw/c8I6hbbu7DAvgp82RKiH7TpLwxnJeKv2pbsw==", "cpu": [ "arm" ], - "dev": true, + "libc": [ + "glibc" + ], "license": "MIT", "optional": true, "os": [ @@ -342,13 +338,15 @@ ] }, "node_modules/@rollup/rollup-linux-arm-musleabihf": { - "version": "4.53.3", - 
"resolved": "https://registry.npmjs.org/@rollup/rollup-linux-arm-musleabihf/-/rollup-linux-arm-musleabihf-4.53.3.tgz", - "integrity": "sha512-vTNlKq+N6CK/8UktsrFuc+/7NlEYVxgaEgRXVUVK258Z5ymho29skzW1sutgYjqNnquGwVUObAaxae8rZ6YMhg==", + "version": "4.59.0", + "resolved": "https://registry.npmjs.org/@rollup/rollup-linux-arm-musleabihf/-/rollup-linux-arm-musleabihf-4.59.0.tgz", + "integrity": "sha512-CikFT7aYPA2ufMD086cVORBYGHffBo4K8MQ4uPS/ZnY54GKj36i196u8U+aDVT2LX4eSMbyHtyOh7D7Zvk2VvA==", "cpu": [ "arm" ], - "dev": true, + "libc": [ + "musl" + ], "license": "MIT", "optional": true, "os": [ @@ -356,13 +354,15 @@ ] }, "node_modules/@rollup/rollup-linux-arm64-gnu": { - "version": "4.53.3", - "resolved": "https://registry.npmjs.org/@rollup/rollup-linux-arm64-gnu/-/rollup-linux-arm64-gnu-4.53.3.tgz", - "integrity": "sha512-RGrFLWgMhSxRs/EWJMIFM1O5Mzuz3Xy3/mnxJp/5cVhZ2XoCAxJnmNsEyeMJtpK+wu0FJFWz+QF4mjCA7AUQ3w==", + "version": "4.59.0", + "resolved": "https://registry.npmjs.org/@rollup/rollup-linux-arm64-gnu/-/rollup-linux-arm64-gnu-4.59.0.tgz", + "integrity": "sha512-jYgUGk5aLd1nUb1CtQ8E+t5JhLc9x5WdBKew9ZgAXg7DBk0ZHErLHdXM24rfX+bKrFe+Xp5YuJo54I5HFjGDAA==", "cpu": [ "arm64" ], - "dev": true, + "libc": [ + "glibc" + ], "license": "MIT", "optional": true, "os": [ @@ -370,13 +370,15 @@ ] }, "node_modules/@rollup/rollup-linux-arm64-musl": { - "version": "4.53.3", - "resolved": "https://registry.npmjs.org/@rollup/rollup-linux-arm64-musl/-/rollup-linux-arm64-musl-4.53.3.tgz", - "integrity": "sha512-kASyvfBEWYPEwe0Qv4nfu6pNkITLTb32p4yTgzFCocHnJLAHs+9LjUu9ONIhvfT/5lv4YS5muBHyuV84epBo/A==", + "version": "4.59.0", + "resolved": "https://registry.npmjs.org/@rollup/rollup-linux-arm64-musl/-/rollup-linux-arm64-musl-4.59.0.tgz", + "integrity": "sha512-peZRVEdnFWZ5Bh2KeumKG9ty7aCXzzEsHShOZEFiCQlDEepP1dpUl/SrUNXNg13UmZl+gzVDPsiCwnV1uI0RUA==", "cpu": [ "arm64" ], - "dev": true, + "libc": [ + "musl" + ], "license": "MIT", "optional": true, "os": [ @@ -384,13 +386,31 @@ ] }, 
"node_modules/@rollup/rollup-linux-loong64-gnu": { - "version": "4.53.3", - "resolved": "https://registry.npmjs.org/@rollup/rollup-linux-loong64-gnu/-/rollup-linux-loong64-gnu-4.53.3.tgz", - "integrity": "sha512-JiuKcp2teLJwQ7vkJ95EwESWkNRFJD7TQgYmCnrPtlu50b4XvT5MOmurWNrCj3IFdyjBQ5p9vnrX4JM6I8OE7g==", + "version": "4.59.0", + "resolved": "https://registry.npmjs.org/@rollup/rollup-linux-loong64-gnu/-/rollup-linux-loong64-gnu-4.59.0.tgz", + "integrity": "sha512-gbUSW/97f7+r4gHy3Jlup8zDG190AuodsWnNiXErp9mT90iCy9NKKU0Xwx5k8VlRAIV2uU9CsMnEFg/xXaOfXg==", "cpu": [ "loong64" ], - "dev": true, + "libc": [ + "glibc" + ], + "license": "MIT", + "optional": true, + "os": [ + "linux" + ] + }, + "node_modules/@rollup/rollup-linux-loong64-musl": { + "version": "4.59.0", + "resolved": "https://registry.npmjs.org/@rollup/rollup-linux-loong64-musl/-/rollup-linux-loong64-musl-4.59.0.tgz", + "integrity": "sha512-yTRONe79E+o0FWFijasoTjtzG9EBedFXJMl888NBEDCDV9I2wGbFFfJQQe63OijbFCUZqxpHz1GzpbtSFikJ4Q==", + "cpu": [ + "loong64" + ], + "libc": [ + "musl" + ], "license": "MIT", "optional": true, "os": [ @@ -398,13 +418,31 @@ ] }, "node_modules/@rollup/rollup-linux-ppc64-gnu": { - "version": "4.53.3", - "resolved": "https://registry.npmjs.org/@rollup/rollup-linux-ppc64-gnu/-/rollup-linux-ppc64-gnu-4.53.3.tgz", - "integrity": "sha512-EoGSa8nd6d3T7zLuqdojxC20oBfNT8nexBbB/rkxgKj5T5vhpAQKKnD+h3UkoMuTyXkP5jTjK/ccNRmQrPNDuw==", + "version": "4.59.0", + "resolved": "https://registry.npmjs.org/@rollup/rollup-linux-ppc64-gnu/-/rollup-linux-ppc64-gnu-4.59.0.tgz", + "integrity": "sha512-sw1o3tfyk12k3OEpRddF68a1unZ5VCN7zoTNtSn2KndUE+ea3m3ROOKRCZxEpmT9nsGnogpFP9x6mnLTCaoLkA==", "cpu": [ "ppc64" ], - "dev": true, + "libc": [ + "glibc" + ], + "license": "MIT", + "optional": true, + "os": [ + "linux" + ] + }, + "node_modules/@rollup/rollup-linux-ppc64-musl": { + "version": "4.59.0", + "resolved": "https://registry.npmjs.org/@rollup/rollup-linux-ppc64-musl/-/rollup-linux-ppc64-musl-4.59.0.tgz", + "integrity": 
"sha512-+2kLtQ4xT3AiIxkzFVFXfsmlZiG5FXYW7ZyIIvGA7Bdeuh9Z0aN4hVyXS/G1E9bTP/vqszNIN/pUKCk/BTHsKA==", + "cpu": [ + "ppc64" + ], + "libc": [ + "musl" + ], "license": "MIT", "optional": true, "os": [ @@ -412,13 +450,15 @@ ] }, "node_modules/@rollup/rollup-linux-riscv64-gnu": { - "version": "4.53.3", - "resolved": "https://registry.npmjs.org/@rollup/rollup-linux-riscv64-gnu/-/rollup-linux-riscv64-gnu-4.53.3.tgz", - "integrity": "sha512-4s+Wped2IHXHPnAEbIB0YWBv7SDohqxobiiPA1FIWZpX+w9o2i4LezzH/NkFUl8LRci/8udci6cLq+jJQlh+0g==", + "version": "4.59.0", + "resolved": "https://registry.npmjs.org/@rollup/rollup-linux-riscv64-gnu/-/rollup-linux-riscv64-gnu-4.59.0.tgz", + "integrity": "sha512-NDYMpsXYJJaj+I7UdwIuHHNxXZ/b/N2hR15NyH3m2qAtb/hHPA4g4SuuvrdxetTdndfj9b1WOmy73kcPRoERUg==", "cpu": [ "riscv64" ], - "dev": true, + "libc": [ + "glibc" + ], "license": "MIT", "optional": true, "os": [ @@ -426,13 +466,15 @@ ] }, "node_modules/@rollup/rollup-linux-riscv64-musl": { - "version": "4.53.3", - "resolved": "https://registry.npmjs.org/@rollup/rollup-linux-riscv64-musl/-/rollup-linux-riscv64-musl-4.53.3.tgz", - "integrity": "sha512-68k2g7+0vs2u9CxDt5ktXTngsxOQkSEV/xBbwlqYcUrAVh6P9EgMZvFsnHy4SEiUl46Xf0IObWVbMvPrr2gw8A==", + "version": "4.59.0", + "resolved": "https://registry.npmjs.org/@rollup/rollup-linux-riscv64-musl/-/rollup-linux-riscv64-musl-4.59.0.tgz", + "integrity": "sha512-nLckB8WOqHIf1bhymk+oHxvM9D3tyPndZH8i8+35p/1YiVoVswPid2yLzgX7ZJP0KQvnkhM4H6QZ5m0LzbyIAg==", "cpu": [ "riscv64" ], - "dev": true, + "libc": [ + "musl" + ], "license": "MIT", "optional": true, "os": [ @@ -440,13 +482,15 @@ ] }, "node_modules/@rollup/rollup-linux-s390x-gnu": { - "version": "4.53.3", - "resolved": "https://registry.npmjs.org/@rollup/rollup-linux-s390x-gnu/-/rollup-linux-s390x-gnu-4.53.3.tgz", - "integrity": "sha512-VYsFMpULAz87ZW6BVYw3I6sWesGpsP9OPcyKe8ofdg9LHxSbRMd7zrVrr5xi/3kMZtpWL/wC+UIJWJYVX5uTKg==", + "version": "4.59.0", + "resolved": 
"https://registry.npmjs.org/@rollup/rollup-linux-s390x-gnu/-/rollup-linux-s390x-gnu-4.59.0.tgz", + "integrity": "sha512-oF87Ie3uAIvORFBpwnCvUzdeYUqi2wY6jRFWJAy1qus/udHFYIkplYRW+wo+GRUP4sKzYdmE1Y3+rY5Gc4ZO+w==", "cpu": [ "s390x" ], - "dev": true, + "libc": [ + "glibc" + ], "license": "MIT", "optional": true, "os": [ @@ -454,13 +498,15 @@ ] }, "node_modules/@rollup/rollup-linux-x64-gnu": { - "version": "4.53.3", - "resolved": "https://registry.npmjs.org/@rollup/rollup-linux-x64-gnu/-/rollup-linux-x64-gnu-4.53.3.tgz", - "integrity": "sha512-3EhFi1FU6YL8HTUJZ51imGJWEX//ajQPfqWLI3BQq4TlvHy4X0MOr5q3D2Zof/ka0d5FNdPwZXm3Yyib/UEd+w==", + "version": "4.59.0", + "resolved": "https://registry.npmjs.org/@rollup/rollup-linux-x64-gnu/-/rollup-linux-x64-gnu-4.59.0.tgz", + "integrity": "sha512-3AHmtQq/ppNuUspKAlvA8HtLybkDflkMuLK4DPo77DfthRb71V84/c4MlWJXixZz4uruIH4uaa07IqoAkG64fg==", "cpu": [ "x64" ], - "dev": true, + "libc": [ + "glibc" + ], "license": "MIT", "optional": true, "os": [ @@ -468,27 +514,41 @@ ] }, "node_modules/@rollup/rollup-linux-x64-musl": { - "version": "4.53.3", - "resolved": "https://registry.npmjs.org/@rollup/rollup-linux-x64-musl/-/rollup-linux-x64-musl-4.53.3.tgz", - "integrity": "sha512-eoROhjcc6HbZCJr+tvVT8X4fW3/5g/WkGvvmwz/88sDtSJzO7r/blvoBDgISDiCjDRZmHpwud7h+6Q9JxFwq1Q==", + "version": "4.59.0", + "resolved": "https://registry.npmjs.org/@rollup/rollup-linux-x64-musl/-/rollup-linux-x64-musl-4.59.0.tgz", + "integrity": "sha512-2UdiwS/9cTAx7qIUZB/fWtToJwvt0Vbo0zmnYt7ED35KPg13Q0ym1g442THLC7VyI6JfYTP4PiSOWyoMdV2/xg==", "cpu": [ "x64" ], - "dev": true, + "libc": [ + "musl" + ], "license": "MIT", "optional": true, "os": [ "linux" ] }, + "node_modules/@rollup/rollup-openbsd-x64": { + "version": "4.59.0", + "resolved": "https://registry.npmjs.org/@rollup/rollup-openbsd-x64/-/rollup-openbsd-x64-4.59.0.tgz", + "integrity": "sha512-M3bLRAVk6GOwFlPTIxVBSYKUaqfLrn8l0psKinkCFxl4lQvOSz8ZrKDz2gxcBwHFpci0B6rttydI4IpS4IS/jQ==", + "cpu": [ + "x64" + ], + "license": "MIT", + 
"optional": true, + "os": [ + "openbsd" + ] + }, "node_modules/@rollup/rollup-openharmony-arm64": { - "version": "4.53.3", - "resolved": "https://registry.npmjs.org/@rollup/rollup-openharmony-arm64/-/rollup-openharmony-arm64-4.53.3.tgz", - "integrity": "sha512-OueLAWgrNSPGAdUdIjSWXw+u/02BRTcnfw9PN41D2vq/JSEPnJnVuBgw18VkN8wcd4fjUs+jFHVM4t9+kBSNLw==", + "version": "4.59.0", + "resolved": "https://registry.npmjs.org/@rollup/rollup-openharmony-arm64/-/rollup-openharmony-arm64-4.59.0.tgz", + "integrity": "sha512-tt9KBJqaqp5i5HUZzoafHZX8b5Q2Fe7UjYERADll83O4fGqJ49O1FsL6LpdzVFQcpwvnyd0i+K/VSwu/o/nWlA==", "cpu": [ "arm64" ], - "dev": true, "license": "MIT", "optional": true, "os": [ @@ -496,13 +556,12 @@ ] }, "node_modules/@rollup/rollup-win32-arm64-msvc": { - "version": "4.53.3", - "resolved": "https://registry.npmjs.org/@rollup/rollup-win32-arm64-msvc/-/rollup-win32-arm64-msvc-4.53.3.tgz", - "integrity": "sha512-GOFuKpsxR/whszbF/bzydebLiXIHSgsEUp6M0JI8dWvi+fFa1TD6YQa4aSZHtpmh2/uAlj/Dy+nmby3TJ3pkTw==", + "version": "4.59.0", + "resolved": "https://registry.npmjs.org/@rollup/rollup-win32-arm64-msvc/-/rollup-win32-arm64-msvc-4.59.0.tgz", + "integrity": "sha512-V5B6mG7OrGTwnxaNUzZTDTjDS7F75PO1ae6MJYdiMu60sq0CqN5CVeVsbhPxalupvTX8gXVSU9gq+Rx1/hvu6A==", "cpu": [ "arm64" ], - "dev": true, "license": "MIT", "optional": true, "os": [ @@ -510,13 +569,12 @@ ] }, "node_modules/@rollup/rollup-win32-ia32-msvc": { - "version": "4.53.3", - "resolved": "https://registry.npmjs.org/@rollup/rollup-win32-ia32-msvc/-/rollup-win32-ia32-msvc-4.53.3.tgz", - "integrity": "sha512-iah+THLcBJdpfZ1TstDFbKNznlzoxa8fmnFYK4V67HvmuNYkVdAywJSoteUszvBQ9/HqN2+9AZghbajMsFT+oA==", + "version": "4.59.0", + "resolved": "https://registry.npmjs.org/@rollup/rollup-win32-ia32-msvc/-/rollup-win32-ia32-msvc-4.59.0.tgz", + "integrity": "sha512-UKFMHPuM9R0iBegwzKF4y0C4J9u8C6MEJgFuXTBerMk7EJ92GFVFYBfOZaSGLu6COf7FxpQNqhNS4c4icUPqxA==", "cpu": [ "ia32" ], - "dev": true, "license": "MIT", "optional": true, "os": [ @@ -524,13 
+582,12 @@ ] }, "node_modules/@rollup/rollup-win32-x64-gnu": { - "version": "4.53.3", - "resolved": "https://registry.npmjs.org/@rollup/rollup-win32-x64-gnu/-/rollup-win32-x64-gnu-4.53.3.tgz", - "integrity": "sha512-J9QDiOIZlZLdcot5NXEepDkstocktoVjkaKUtqzgzpt2yWjGlbYiKyp05rWwk4nypbYUNoFAztEgixoLaSETkg==", + "version": "4.59.0", + "resolved": "https://registry.npmjs.org/@rollup/rollup-win32-x64-gnu/-/rollup-win32-x64-gnu-4.59.0.tgz", + "integrity": "sha512-laBkYlSS1n2L8fSo1thDNGrCTQMmxjYY5G0WFWjFFYZkKPjsMBsgJfGf4TLxXrF6RyhI60L8TMOjBMvXiTcxeA==", "cpu": [ "x64" ], - "dev": true, "license": "MIT", "optional": true, "os": [ @@ -538,13 +595,12 @@ ] }, "node_modules/@rollup/rollup-win32-x64-msvc": { - "version": "4.53.3", - "resolved": "https://registry.npmjs.org/@rollup/rollup-win32-x64-msvc/-/rollup-win32-x64-msvc-4.53.3.tgz", - "integrity": "sha512-UhTd8u31dXadv0MopwGgNOBpUVROFKWVQgAg5N1ESyCz8AuBcMqm4AuTjrwgQKGDfoFuz02EuMRHQIw/frmYKQ==", + "version": "4.59.0", + "resolved": "https://registry.npmjs.org/@rollup/rollup-win32-x64-msvc/-/rollup-win32-x64-msvc-4.59.0.tgz", + "integrity": "sha512-2HRCml6OztYXyJXAvdDXPKcawukWY2GpR5/nxKp4iBgiO3wcoEGkAaqctIbZcNB6KlUQBIqt8VYkNSj2397EfA==", "cpu": [ "x64" ], - "dev": true, "license": "MIT", "optional": true, "os": [ @@ -552,9 +608,9 @@ ] }, "node_modules/@sveltejs/acorn-typescript": { - "version": "1.0.7", - "resolved": "https://registry.npmjs.org/@sveltejs/acorn-typescript/-/acorn-typescript-1.0.7.tgz", - "integrity": "sha512-znp1A/Y1Jj4l/Zy7PX5DZKBE0ZNY+5QBngiE21NJkfSTyzzC5iKNWOtwFXKtIrn7MXEFBck4jD95iBNkGjK92Q==", + "version": "1.0.9", + "resolved": "https://registry.npmjs.org/@sveltejs/acorn-typescript/-/acorn-typescript-1.0.9.tgz", + "integrity": "sha512-lVJX6qEgs/4DOcRTpo56tmKzVPtoWAaVbL4hfO7t7NVwl9AAXzQR6cihesW1BmNMPl+bK6dreu2sOKBP2Q9CIA==", "license": "MIT", "peerDependencies": { "acorn": "^8.9.0" @@ -592,6 +648,12 @@ "dev": true, "license": "MIT" }, + "node_modules/@types/trusted-types": { + "version": "2.0.7", + 
"resolved": "https://registry.npmjs.org/@types/trusted-types/-/trusted-types-2.0.7.tgz", + "integrity": "sha512-ScaPdn1dQczgbl0QFTeTOmVHFULt394XJgOQNoyVhZ6r2vLnMLJfBPd53SB52T/3G36VI1/g2MZaX0cwDuXsfw==", + "license": "MIT" + }, "node_modules/@urql/core": { "version": "5.2.0", "resolved": "https://registry.npmjs.org/@urql/core/-/core-5.2.0.tgz", @@ -617,9 +679,9 @@ } }, "node_modules/acorn": { - "version": "8.15.0", - "resolved": "https://registry.npmjs.org/acorn/-/acorn-8.15.0.tgz", - "integrity": "sha512-NZyJarBfL7nWwIq+FDL6Zp/yHEhePMNnnJ0y3qfieCrmNvYct8uvtiV41UvlSe6apAfk0fY1FbWx+NwfmpvtTg==", + "version": "8.16.0", + "resolved": "https://registry.npmjs.org/acorn/-/acorn-8.16.0.tgz", + "integrity": "sha512-UVJyE9MttOsBQIDKw1skb9nAwQuR5wuGD3+82K6JgJlm/Y+KI92oNsMNGZCYdDsVtRHSak0pcV5Dno5+4jh9sw==", "license": "MIT", "bin": { "acorn": "bin/acorn" @@ -629,9 +691,9 @@ } }, "node_modules/aria-query": { - "version": "5.3.2", - "resolved": "https://registry.npmjs.org/aria-query/-/aria-query-5.3.2.tgz", - "integrity": "sha512-COROpnaoap1E2F000S62r6A60uHZnmlvomhfyT2DlTcrY1OrBKn2UhH7qn5wTC9zMvD0AY7csdPSNwKP+7WiQw==", + "version": "5.3.1", + "resolved": "https://registry.npmjs.org/aria-query/-/aria-query-5.3.1.tgz", + "integrity": "sha512-Z/ZeOgVl7bcSYZ/u/rh0fOpvEpq//LZmdbkXyc7syVzjPAhfOa9ebsdTSjEBDU4vs5nC98Kfduj1uFo0qyET3g==", "license": "Apache-2.0", "engines": { "node": ">= 0.4" @@ -728,9 +790,9 @@ } }, "node_modules/devalue": { - "version": "5.5.0", - "resolved": "https://registry.npmjs.org/devalue/-/devalue-5.5.0.tgz", - "integrity": "sha512-69sM5yrHfFLJt0AZ9QqZXGCPfJ7fQjvpln3Rq5+PS03LD32Ost1Q9N+eEnaQwGRIriKkMImXD56ocjQmfjbV3w==", + "version": "5.6.3", + "resolved": "https://registry.npmjs.org/devalue/-/devalue-5.6.3.tgz", + "integrity": "sha512-nc7XjUU/2Lb+SvEFVGcWLiKkzfw8+qHI7zn8WYXKkLMgfGSHbgCEaR6bJpev8Cm6Rmrb19Gfd/tZvGqx9is3wg==", "license": "MIT" }, "node_modules/escape-latex": { @@ -746,9 +808,9 @@ "license": "MIT" }, "node_modules/esrap": { - "version": "2.1.3", - 
"resolved": "https://registry.npmjs.org/esrap/-/esrap-2.1.3.tgz", - "integrity": "sha512-T/Dhhv/QH+yYmiaLz9SA3PW+YyenlnRKDNdtlYJrSOBmNsH4nvPux+mTwx7p+wAedlJrGoZtXNI0a0MjQ2QkVg==", + "version": "2.2.3", + "resolved": "https://registry.npmjs.org/esrap/-/esrap-2.2.3.tgz", + "integrity": "sha512-8fOS+GIGCQZl/ZIlhl59htOlms6U8NvX6ZYgYHpRU/b6tVSh3uHkOHZikl3D4cMbYM0JlpBe+p/BkZEi8J9XIQ==", "license": "MIT", "dependencies": { "@jridgewell/sourcemap-codec": "^1.4.15" @@ -795,7 +857,6 @@ "version": "2.3.3", "resolved": "https://registry.npmjs.org/fsevents/-/fsevents-2.3.3.tgz", "integrity": "sha512-5xoDfX+fL7faATnagmWPpbFtwh/R77WmMMqqHGS65C3vvB0YHrgF+B1YmZ3441tMj5n63k0212XNoJwzlhffQw==", - "dev": true, "hasInstallScript": true, "license": "MIT", "optional": true, @@ -817,9 +878,9 @@ } }, "node_modules/graphql": { - "version": "16.12.0", - "resolved": "https://registry.npmjs.org/graphql/-/graphql-16.12.0.tgz", - "integrity": "sha512-DKKrynuQRne0PNpEbzuEdHlYOMksHSUI8Zc9Unei5gTsMNA2/vMpoMz/yKba50pejK56qj98qM0SjYxAKi13gQ==", + "version": "16.13.1", + "resolved": "https://registry.npmjs.org/graphql/-/graphql-16.13.1.tgz", + "integrity": "sha512-gGgrVCoDKlIZ8fIqXBBb0pPKqDgki0Z/FSKNiQzSGj2uEYHr1tq5wmBegGwJx6QB5S5cM0khSBpi/JFHMCvsmQ==", "license": "MIT", "engines": { "node": "^12.22.0 || ^14.16.0 || ^16.0.0 || >=17.0.0" @@ -893,9 +954,9 @@ } }, "node_modules/mathjs": { - "version": "15.1.0", - "resolved": "https://registry.npmjs.org/mathjs/-/mathjs-15.1.0.tgz", - "integrity": "sha512-HfnAcScQm9drGryodlDqeS3WAl4gUTYGDcOtcqL/8s23MZ28Ib1i8XnYK3ZdjNuaW/L4BAp9lIp8vxAMrcuu1w==", + "version": "15.1.1", + "resolved": "https://registry.npmjs.org/mathjs/-/mathjs-15.1.1.tgz", + "integrity": "sha512-rM668DTtpSzMVoh/cKAllyQVEbBApM5g//IMGD8vD7YlrIz9ITRr3SrdhjaDxcBNTdyETWwPebj2unZyHD7ZdA==", "license": "Apache-2.0", "dependencies": { "@babel/runtime": "^7.26.10", @@ -934,16 +995,6 @@ "url": "https://github.com/sponsors/jonschlinkert" } }, - "node_modules/randombytes": { - "version": "2.1.0", - 
"resolved": "https://registry.npmjs.org/randombytes/-/randombytes-2.1.0.tgz", - "integrity": "sha512-vYl3iOX+4CKUWuxGi9Ukhie6fsqXqS9FE2Zaic4tNFD2N2QQaXOMFbuKK4QmDHC0JO6B1Zp41J0LpT0oR68amQ==", - "dev": true, - "license": "MIT", - "dependencies": { - "safe-buffer": "^5.1.0" - } - }, "node_modules/resolve": { "version": "1.22.11", "resolved": "https://registry.npmjs.org/resolve/-/resolve-1.22.11.tgz", @@ -976,9 +1027,9 @@ } }, "node_modules/rollup": { - "version": "4.53.3", - "resolved": "https://registry.npmjs.org/rollup/-/rollup-4.53.3.tgz", - "integrity": "sha512-w8GmOxZfBmKknvdXU1sdM9NHcoQejwF/4mNgj2JuEEdRaHwwF12K7e9eXn1nLZ07ad+du76mkVsyeb2rKGllsA==", + "version": "4.59.0", + "resolved": "https://registry.npmjs.org/rollup/-/rollup-4.59.0.tgz", + "integrity": "sha512-2oMpl67a3zCH9H79LeMcbDhXW/UmWG/y2zuqnF2jQq5uq9TbM9TVyXvA4+t+ne2IIkBdrLpAaRQAvo7YI/Yyeg==", "devOptional": true, "license": "MIT", "dependencies": { @@ -992,28 +1043,31 @@ "npm": ">=8.0.0" }, "optionalDependencies": { - "@rollup/rollup-android-arm-eabi": "4.53.3", - "@rollup/rollup-android-arm64": "4.53.3", - "@rollup/rollup-darwin-arm64": "4.53.3", - "@rollup/rollup-darwin-x64": "4.53.3", - "@rollup/rollup-freebsd-arm64": "4.53.3", - "@rollup/rollup-freebsd-x64": "4.53.3", - "@rollup/rollup-linux-arm-gnueabihf": "4.53.3", - "@rollup/rollup-linux-arm-musleabihf": "4.53.3", - "@rollup/rollup-linux-arm64-gnu": "4.53.3", - "@rollup/rollup-linux-arm64-musl": "4.53.3", - "@rollup/rollup-linux-loong64-gnu": "4.53.3", - "@rollup/rollup-linux-ppc64-gnu": "4.53.3", - "@rollup/rollup-linux-riscv64-gnu": "4.53.3", - "@rollup/rollup-linux-riscv64-musl": "4.53.3", - "@rollup/rollup-linux-s390x-gnu": "4.53.3", - "@rollup/rollup-linux-x64-gnu": "4.53.3", - "@rollup/rollup-linux-x64-musl": "4.53.3", - "@rollup/rollup-openharmony-arm64": "4.53.3", - "@rollup/rollup-win32-arm64-msvc": "4.53.3", - "@rollup/rollup-win32-ia32-msvc": "4.53.3", - "@rollup/rollup-win32-x64-gnu": "4.53.3", - "@rollup/rollup-win32-x64-msvc": 
"4.53.3", + "@rollup/rollup-android-arm-eabi": "4.59.0", + "@rollup/rollup-android-arm64": "4.59.0", + "@rollup/rollup-darwin-arm64": "4.59.0", + "@rollup/rollup-darwin-x64": "4.59.0", + "@rollup/rollup-freebsd-arm64": "4.59.0", + "@rollup/rollup-freebsd-x64": "4.59.0", + "@rollup/rollup-linux-arm-gnueabihf": "4.59.0", + "@rollup/rollup-linux-arm-musleabihf": "4.59.0", + "@rollup/rollup-linux-arm64-gnu": "4.59.0", + "@rollup/rollup-linux-arm64-musl": "4.59.0", + "@rollup/rollup-linux-loong64-gnu": "4.59.0", + "@rollup/rollup-linux-loong64-musl": "4.59.0", + "@rollup/rollup-linux-ppc64-gnu": "4.59.0", + "@rollup/rollup-linux-ppc64-musl": "4.59.0", + "@rollup/rollup-linux-riscv64-gnu": "4.59.0", + "@rollup/rollup-linux-riscv64-musl": "4.59.0", + "@rollup/rollup-linux-s390x-gnu": "4.59.0", + "@rollup/rollup-linux-x64-gnu": "4.59.0", + "@rollup/rollup-linux-x64-musl": "4.59.0", + "@rollup/rollup-openbsd-x64": "4.59.0", + "@rollup/rollup-openharmony-arm64": "4.59.0", + "@rollup/rollup-win32-arm64-msvc": "4.59.0", + "@rollup/rollup-win32-ia32-msvc": "4.59.0", + "@rollup/rollup-win32-x64-gnu": "4.59.0", + "@rollup/rollup-win32-x64-msvc": "4.59.0", "fsevents": "~2.3.2" } }, @@ -1078,27 +1132,6 @@ "url": "https://github.com/sponsors/jonschlinkert" } }, - "node_modules/safe-buffer": { - "version": "5.2.1", - "resolved": "https://registry.npmjs.org/safe-buffer/-/safe-buffer-5.2.1.tgz", - "integrity": "sha512-rp3So07KcdmmKbGvgaNxQSJr7bGVSVk5S9Eq1F+ppbRo70+YeaDxkw5Dd8NPN+GD6bjnYm2VuPuCXmpuYvmCXQ==", - "dev": true, - "funding": [ - { - "type": "github", - "url": "https://github.com/sponsors/feross" - }, - { - "type": "patreon", - "url": "https://www.patreon.com/feross" - }, - { - "type": "consulting", - "url": "https://feross.org/support" - } - ], - "license": "MIT" - }, "node_modules/seedrandom": { "version": "3.0.5", "resolved": "https://registry.npmjs.org/seedrandom/-/seedrandom-3.0.5.tgz", @@ -1106,21 +1139,24 @@ "license": "MIT" }, "node_modules/serialize-javascript": { - 
"version": "6.0.2", - "resolved": "https://registry.npmjs.org/serialize-javascript/-/serialize-javascript-6.0.2.tgz", - "integrity": "sha512-Saa1xPByTTq2gdeFZYLLo+RFE35NHZkAbqZeWNd3BpzppeVisAqpDjcp8dyf6uIvEqJRd46jemmyA4iFIeVk8g==", + "version": "7.0.4", + "resolved": "https://registry.npmjs.org/serialize-javascript/-/serialize-javascript-7.0.4.tgz", + "integrity": "sha512-DuGdB+Po43Q5Jxwpzt1lhyFSYKryqoNjQSA9M92tyw0lyHIOur+XCalOUe0KTJpyqzT8+fQ5A0Jf7vCx/NKmIg==", "dev": true, "license": "BSD-3-Clause", - "dependencies": { - "randombytes": "^2.1.0" + "engines": { + "node": ">=20.0.0" } }, "node_modules/smob": { - "version": "1.5.0", - "resolved": "https://registry.npmjs.org/smob/-/smob-1.5.0.tgz", - "integrity": "sha512-g6T+p7QO8npa+/hNx9ohv1E5pVCmWrVCUzUXJyLdMmftX6ER0oiWY/w9knEonLpnOp6b6FenKnMfR8gqwWdwig==", + "version": "1.6.1", + "resolved": "https://registry.npmjs.org/smob/-/smob-1.6.1.tgz", + "integrity": "sha512-KAkBqZl3c2GvNgNhcoyJae1aKldDW0LO279wF9bk1PnluRTETKBq0WyzRXxEhoQLk56yHaOY4JCBEKDuJIET5g==", "dev": true, - "license": "MIT" + "license": "MIT", + "engines": { + "node": ">=20.0.0" + } }, "node_modules/source-map": { "version": "0.6.1", @@ -1157,22 +1193,23 @@ } }, "node_modules/svelte": { - "version": "5.44.0", - "resolved": "https://registry.npmjs.org/svelte/-/svelte-5.44.0.tgz", - "integrity": "sha512-R7387No2zEGw4CtYtI2rgsui6BqjFARzoZFGLiLN5OPla0Pq4Ra2WwcP/zBomP3MYalhSNvF1fzDMuU0P0zPJw==", + "version": "5.53.9", + "resolved": "https://registry.npmjs.org/svelte/-/svelte-5.53.9.tgz", + "integrity": "sha512-MwDfWsN8qZzeP0jlQsWF4k/4B3csb3IbzCRggF+L/QqY7T8bbKvnChEo1cPZztF51HJQhilDbevWYl2LvXbquA==", "license": "MIT", "dependencies": { "@jridgewell/remapping": "^2.3.4", "@jridgewell/sourcemap-codec": "^1.5.0", "@sveltejs/acorn-typescript": "^1.0.5", "@types/estree": "^1.0.5", + "@types/trusted-types": "^2.0.7", "acorn": "^8.12.1", - "aria-query": "^5.3.1", + "aria-query": "5.3.1", "axobject-query": "^4.1.0", "clsx": "^2.1.1", - "devalue": "^5.5.0", + 
"devalue": "^5.6.3", "esm-env": "^1.2.1", - "esrap": "^2.1.0", + "esrap": "^2.2.2", "is-reference": "^3.0.3", "locate-character": "^3.0.0", "magic-string": "^0.30.11", @@ -1192,9 +1229,9 @@ } }, "node_modules/terser": { - "version": "5.44.1", - "resolved": "https://registry.npmjs.org/terser/-/terser-5.44.1.tgz", - "integrity": "sha512-t/R3R/n0MSwnnazuPpPNVO60LX0SKL45pyl9YlvxIdkH0Of7D5qM2EVe+yASRIlY5pZ73nclYJfNANGWPwFDZw==", + "version": "5.46.0", + "resolved": "https://registry.npmjs.org/terser/-/terser-5.46.0.tgz", + "integrity": "sha512-jTwoImyr/QbOWFFso3YoU3ik0jBBDJ6JTOQiy/J2YxVJdZCc+5u7skhNwiOR3FQIygFqVUPHl7qbbxtjW2K3Qg==", "dev": true, "license": "BSD-2-Clause", "dependencies": { @@ -1217,9 +1254,9 @@ "license": "MIT" }, "node_modules/typed-function": { - "version": "4.2.1", - "resolved": "https://registry.npmjs.org/typed-function/-/typed-function-4.2.1.tgz", - "integrity": "sha512-EGjWssW7Tsk4DGfE+5yluuljS1OGYWiI1J6e8puZz9nTMM51Oug8CD5Zo4gWMsOhq5BI+1bF+rWTm4Vbj3ivRA==", + "version": "4.2.2", + "resolved": "https://registry.npmjs.org/typed-function/-/typed-function-4.2.2.tgz", + "integrity": "sha512-VwaXim9Gp1bngi/q3do8hgttYn2uC3MoT/gfuMWylnj1IeZBUAyPddHZlo1K05BDoj8DYPpMdiHqH1dDYdJf2A==", "license": "MIT", "engines": { "node": ">= 18" diff --git a/web/frontend/package.json b/web/frontend/package.json index 3f7434f7..0c206c66 100644 --- a/web/frontend/package.json +++ b/web/frontend/package.json @@ -1,20 +1,20 @@ { "name": "cc-frontend", - "version": "1.0.0", + "version": "1.5.0", "license": "MIT", "scripts": { "build": "rollup -c", "dev": "rollup -c -w" }, "devDependencies": { - "@rollup/plugin-commonjs": "^29.0.0", - "@rollup/plugin-node-resolve": "^16.0.1", - "@rollup/plugin-terser": "^0.4.4", + "@rollup/plugin-commonjs": "^29.0.2", + "@rollup/plugin-node-resolve": "^16.0.3", + "@rollup/plugin-terser": "^1.0.0", "@timohausmann/quadtree-js": "^1.2.6", - "rollup": "^4.53.3", + "rollup": "^4.59.0", "rollup-plugin-css-only": "^4.5.5", "rollup-plugin-svelte": 
"^7.2.3", - "svelte": "^5.44.0" + "svelte": "^5.53.9" }, "dependencies": { "@rollup/plugin-replace": "^6.0.3", @@ -22,8 +22,8 @@ "@urql/svelte": "^4.2.3", "chart.js": "^4.5.1", "date-fns": "^4.1.0", - "graphql": "^16.12.0", - "mathjs": "^15.0.0", + "graphql": "^16.13.1", + "mathjs": "^15.1.1", "uplot": "^1.6.32", "wonka": "^6.3.5" } diff --git a/web/frontend/rollup.config.mjs b/web/frontend/rollup.config.mjs index c92d8155..8aca6161 100644 --- a/web/frontend/rollup.config.mjs +++ b/web/frontend/rollup.config.mjs @@ -74,5 +74,7 @@ export default [ entrypoint('node', 'src/node.entrypoint.js'), entrypoint('analysis', 'src/analysis.entrypoint.js'), entrypoint('status', 'src/status.entrypoint.js'), - entrypoint('config', 'src/config.entrypoint.js') + entrypoint('dashpublic', 'src/dashpublic.entrypoint.js'), + entrypoint('config', 'src/config.entrypoint.js'), + entrypoint('logs', 'src/logs.entrypoint.js') ]; diff --git a/web/frontend/src/Analysis.root.svelte b/web/frontend/src/Analysis.root.svelte index 6c5d8507..f8ca1f3b 100644 --- a/web/frontend/src/Analysis.root.svelte +++ b/web/frontend/src/Analysis.root.svelte @@ -43,11 +43,14 @@ } = $props(); // By default, look at the jobs of the last 6 hours: + // svelte-ignore state_referenced_locally if (filterPresets?.startTime == null) { + // svelte-ignore state_referenced_locally if (filterPresets == null) filterPresets = {}; let now = new Date(Date.now()); let hourAgo = new Date(now); hourAgo.setHours(hourAgo.getHours() - 6); + // svelte-ignore state_referenced_locally filterPresets.startTime = { from: hourAgo.toISOString(), to: now.toISOString(), diff --git a/web/frontend/src/Config.root.svelte b/web/frontend/src/Config.root.svelte index e8a2045b..0e1daec3 100644 --- a/web/frontend/src/Config.root.svelte +++ b/web/frontend/src/Config.root.svelte @@ -6,7 +6,8 @@ - `isSupport Bool!`: Is currently logged in user support authority - `isApi Bool!`: Is currently logged in user api authority - `username String!`: Empty string if 
auth. is disabled, otherwise the username as string - - `ncontent String!`: The currently displayed message on the homescreen + - `ncontent String!`: The currently displayed message on the homescreen + - `clusterNames [String]`: The available clusternames --> @@ -30,7 +32,7 @@ Admin Options - + {/if} diff --git a/web/frontend/src/DashPublic.root.svelte b/web/frontend/src/DashPublic.root.svelte new file mode 100644 index 00000000..f66a6435 --- /dev/null +++ b/web/frontend/src/DashPublic.root.svelte @@ -0,0 +1,637 @@ + + + + + + + { + from = new Date(Date.now() - 5 * 60 * 1000); + to = new Date(Date.now()); + clusterFrom = new Date(Date.now() - (8 * 60 * 60 * 1000)) + + if (interval) stackedFrom += Math.floor(interval / 1000); + else stackedFrom += 1 // Workaround: TimeSelection not linked, just trigger new data on manual refresh + }} + /> + + + +{#if $statusQuery.fetching || $statesTimed.fetching} + + + + + + +{:else if $statusQuery.error || $statesTimed.error} + + + + + + + {#if $statusQuery.error} + + Error Requesting Status Data: {$statusQuery.error.message} + + {/if} + {#if $statesTimed.error} + + Error Requesting Node Scheduler States: {$statesTimed.error.message} + + {/if} + + +{:else} + +
+ + + + + + +

Cluster {presetCluster.charAt(0).toUpperCase() + presetCluster.slice(1)}

+ + + + +
+
+ +

CPU(s)

{[...clusterInfo?.processorTypes].join(', ')}

+
+
+ + + + + + + + + {clusterInfo?.runningJobs} + +
+ Running Jobs +
+ + + + {clusterInfo?.activeUsers} + +
+ Active Users +
+ + + + {clusterInfo?.allocatedNodes} + +
+ Active Nodes +
+ +
+ + + + {scaleNumber(clusterInfo?.flopRate, clusterInfo?.flopRateUnitPrefix)}{clusterInfo?.flopRateUnitBase} + +
+ Total Flop Rate +
+ + + + {scaleNumber(clusterInfo?.memBwRate, clusterInfo?.memBwRateUnitPrefix)}{clusterInfo?.memBwRateUnitBase} + +
+ Total Memory Bandwidth +
+ + {#if clusterInfo?.totalAccs !== 0} + + + {scaleNumber(clusterInfo?.gpuPwr, clusterInfo?.gpuPwrUnitPrefix)}{clusterInfo?.gpuPwrUnitBase} + +
+ Total GPU Power +
+ + {:else} + + + {scaleNumber(clusterInfo?.cpuPwr, clusterInfo?.cpuPwrUnitPrefix)}{clusterInfo?.cpuPwrUnitBase} + +
+ Total CPU Power +
+ + {/if} +
+ + + Active Cores + + + + {formatNumber(clusterInfo?.allocatedCores)} + {formatNumber(clusterInfo?.idleCores)} + + + + Idle Cores + + + {#if clusterInfo?.totalAccs !== 0} + + + Active GPU + + + + {formatNumber(clusterInfo?.allocatedAccs)} + {formatNumber(clusterInfo?.idleAccs)} + + + + Idle GPU + + + {/if} +
+
+ +
+ + + + +
+ Cluster Utilization ( + + {`${sortedClusterMetrics[0]?.name} (${sortedClusterMetrics[0]?.unit?.prefix}${sortedClusterMetrics[0]?.unit?.base})`} + , + + {`${sortedClusterMetrics[1]?.name} (${sortedClusterMetrics[1]?.unit?.prefix}${sortedClusterMetrics[1]?.unit?.base})`} + + ) +
+
+ {#key $statusQuery?.data?.clusterMetrics} + + {/key} +
+ + + +
+ {#key $statusQuery?.data?.nodeMetrics} + + {/key} +
+ +
+ + + + + {#if refinedStateData.length > 0} + +
+ {#key refinedStateData} + sd.count, + )} + entities={refinedStateData.map( + (sd) => sd.state, + )} + fixColors={refinedStateData.map( + (sd) => colors['nodeStates'][sd.state], + )} + /> + {/key} +
+ + + {#key refinedStateData} + + + + + + + {#each refinedStateData as sd, i} + + + + + + {/each} +
StateCount
{sd.state.charAt(0).toUpperCase() + sd.state.slice(1)}{sd.count}
+ {/key} + + {:else} + + Cannot render state status: No state data returned for Pie Chart + + {/if} +
+ + + +
+ {#key $statesTimed?.data?.nodeStatesTimed} + + {/key} +
+ +
+
+{/if} diff --git a/web/frontend/src/Header.svelte b/web/frontend/src/Header.svelte index 98a796a2..862981fd 100644 --- a/web/frontend/src/Header.svelte +++ b/web/frontend/src/Header.svelte @@ -4,8 +4,8 @@ Properties: - `username String`: Empty string if auth. is disabled, otherwise the username as string - `authlevel Number`: The current users authentication level - - `clusters [String]`: List of cluster names - - `subClusters [String]`: List of subCluster names + - `clusterNames [String]`: List of cluster names + - `subclusterMap map[String][]string`: Map of subclusters by cluster names - `roles [Number]`: Enum containing available roles --> @@ -28,8 +28,8 @@ let { username, authlevel, - clusters, - subClusters, + clusterNames, + subclusterMap, roles } = $props(); @@ -53,7 +53,9 @@ const views = [ { title: "My Jobs", + // svelte-ignore state_referenced_locally requiredRole: roles.user, + // svelte-ignore state_referenced_locally href: `/monitoring/user/${username}`, icon: "bar-chart-line", perCluster: false, @@ -61,7 +63,9 @@ menu: "none", }, { + // svelte-ignore state_referenced_locally title: jobsTitle.get(authlevel), + // svelte-ignore state_referenced_locally requiredRole: roles.user, href: `/monitoring/jobs/`, icon: "card-list", @@ -71,6 +75,7 @@ }, { title: "Tags", + // svelte-ignore state_referenced_locally requiredRole: roles.user, href: "/monitoring/tags/", icon: "tags", @@ -79,7 +84,9 @@ menu: "Jobs", }, { + // svelte-ignore state_referenced_locally title: usersTitle.get(authlevel), + // svelte-ignore state_referenced_locally requiredRole: roles.manager, href: "/monitoring/users/", icon: "people", @@ -88,7 +95,9 @@ menu: "Groups", }, { + // svelte-ignore state_referenced_locally title: projectsTitle.get(authlevel), + // svelte-ignore state_referenced_locally requiredRole: roles.manager, href: "/monitoring/projects/", icon: "journals", @@ -98,6 +107,7 @@ }, { title: "Nodes", + // svelte-ignore state_referenced_locally requiredRole: roles.support, href: 
"/monitoring/systems/", icon: "hdd-rack", @@ -107,6 +117,7 @@ }, { title: "Analysis", + // svelte-ignore state_referenced_locally requiredRole: roles.support, href: "/monitoring/analysis/", icon: "graph-up", @@ -116,10 +127,21 @@ }, { title: "Status", + // svelte-ignore state_referenced_locally requiredRole: roles.admin, href: "/monitoring/status/", icon: "clipboard-data", perCluster: true, + listOptions: true, + menu: "Info", + }, + { + title: "Logs", + // svelte-ignore state_referenced_locally + requiredRole: roles.admin, + href: "/monitoring/logs", + icon: "journal-text", + perCluster: false, listOptions: false, menu: "Info", }, @@ -152,15 +174,15 @@