Merge pull request #21 from ClusterCockpit/hotfix

Update Todos
This commit is contained in:
Jan Eitzinger 2024-06-28 08:47:32 +02:00 committed by GitHub
commit 2bf4ec1744
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
2 changed files with 39 additions and 162 deletions

53
TODO.md
View File

@ -1,15 +1,40 @@
# TODOs
# Possible Tasks and Improvements
- Improve checkpoints/archives
- Store information in each buffer if already archived
- Do not create new checkpoint if all buffers already archived
- Missing Testcases:
- General tests
- Check for corner cases that should fail gracefully
- Write a more realistic `ToArchive`/`FromArchive` tests
- Optimization: Once a buffer is full, calculate min, max and avg
- Calculate averages buffer-wise, average weighted by length of buffer
- Only the head-buffer needs to be fully traversed
- Optimization: If aggregating over hwthreads/cores/sockets cache those results
and reuse some of that for new queres aggregating only over the newer data
- ...
Importance:
- **I** Important
- **N** Nice to have
- **W** Won't do. Probably not necessary.
- Benchmarking
- Benchmark and compare common timeseries DBs with our data and our queries (N)
- Memory management
- To overcome garbage collection overhead: Reimplement in Rust (N)
- Request memory directly batchwise via mmap (started in branch) (W)
- Archive
- S3 backend for archive (I)
- Store information in each buffer if already archived (N)
- Do not create new checkpoint if all buffers already archived (N)
- Checkpoints
- S3 backend for checkpoints (I)
- Combine checkpoints into larger files (I)
- Binary checkpoints (started in branch) (W)
- API
- Redesign query interface (N)
- Introduce JWT authentication for REST and NATS (I)
- Testing
- General tests (I)
- Test data generator for regression tests (I)
- Check for corner cases that should fail gracefully (N)
- Write a more realistic `ToArchive`/`FromArchive` Tests (N)
- Aggregation
- Calculate averages buffer-wise as soon as full, average weighted by length of buffer (N)
- Only the head-buffer needs to be fully traversed (N)
- If aggregating over hwthreads/cores/sockets cache those results and reuse
some of that for new queries aggregating only over the newer data (W)
- Compression
- Enable compression for http API requests (N)
- Enable compression for checkpoints/archive (I)
- Sampling
- Support data re sampling to reduce data points (I)
- Use re sampling algorithms that preserve min/max as far as possible (I)

View File

@ -1,148 +0,0 @@
# OpenAPI spec describing a subset of the HTTP REST API for the cc-metric-store.
openapi: 3.0.3
info:
title: 'cc-metric-store REST API'
description: 'In-memory time series database for hpc metrics to be used with the [ClusterCockpit](https://github.com/ClusterCockpit) toolsuite'
version: 0.1.0
paths:
'/api/write':
post:
operationId: 'writeMetrics'
description: 'Recieves metrics in the influx line-protocol using [this format](https://github.com/ClusterCockpit/cc-specifications/blob/master/metrics/lineprotocol_alternative.md)'
parameters:
- name: cluster
in: query
schema: { type: string }
description: "If the lines in the body do not have a cluster tag, use this value instead."
requestBody:
required: true
content:
'text/plain':
example:
'flops_any,cluster=emmy,hostname=e1001,type=cpu,type-id=0 value=42.0'
responses:
200:
description: 'Everything went fine'
400:
description: 'Bad Request'
'/api/query':
post:
operationId: 'queryMetrics'
description: 'Query metrics'
requestBody:
required: true
content:
'application/json':
schema:
type: object
required: [cluster, from, to]
properties:
cluster:
type: string
from:
type: integer
to:
type: integer
with-stats:
type: boolean
default: true
with-data:
type: boolean
default: true
queries:
type: array
items:
$ref: '#/components/schemas/ApiQuery'
for-all-nodes:
description: 'If not null, add a new query for every known host on that cluster and every metric (at node-scope) specified in this array to the request. This can be used to get a metric for every host in a cluster without knowing the name of every host.'
type: array
items:
type: string
responses:
200:
description: 'Requested data and stats as JSON'
content:
'application/json':
schema:
type: object
properties:
queries:
description: 'Only if for-all-nodes was used, this property exists.'
results:
type: array
description: 'Array where each element is a response to the query at that same index in the request'
items:
description: 'If `aggreg` is true, only ever has one element.'
type: array
items:
type: object
properties:
error:
description: 'If not null or undefined, an error happend processing that query'
type: string
nullable: true
data:
type: array
items:
type: number
nullable: true
avg: { type: number }
min: { type: number }
max: { type: number }
400:
description: 'Bad Request'
'/api/free':
post:
operationId: 'freeBuffers'
description: 'Allow all buffers containing only data older than `to`'
parameters:
- name: to
in: query
description: 'Unix Timestamp'
required: true
schema:
type: integer
requestBody:
required: true
content:
'application/json':
schema:
type: array
items:
type: array
items:
type: string
responses:
200:
description: 'Everything went fine'
400:
description: 'Bad Request'
components:
schemas:
ApiQuery:
description: 'A single query for a specific metric resulting in one series'
type: object
required: [metric, hostname, aggreg]
properties:
metirc:
type: string
hostname:
type: string
type:
description: 'Not required for node-level requests. Usually something like socket, cpu or hwthread.'
type: string
type-ids:
type: array
items:
type: string
aggreg:
type: boolean
description: 'If true, every query result will have exactly one element. Otherwise, the data for every requested type-id/sub-type-id is provided seperately'
securitySchemes:
bearerAuth:
type: http
scheme: bearer
bearerFormat: JWT
security:
- bearerAuth: [] # Applies `bearerAuth` globally