Merge pull request #21 from ClusterCockpit/hotfix

Update Todos
2025-10-04 13:54:31 +02:00 · 2024-06-28 08:47:32 +02:00
parent 362adab938 bc77ac4839
commit 2bf4ec1744
2 changed files with 39 additions and 162 deletions
--- a/TODO.md
+++ b/TODO.md
@@ -1,15 +1,40 @@
-# TODOs
+# Possible Tasks and Improvements
- Improve checkpoints/archives
+Importance:
-  - Store information in each buffer if already archived
+
-  - Do not create new checkpoint if all buffers already archived
+- **I** Important
- Missing Testcases:
+- **N** Nice to have
-  - General tests
+- **W** Won't do. Probably not necessary.
-  - Check for corner cases that should fail gracefully
+
-  - Write a more realistic `ToArchive`/`FromArchive` tests
+- Benchmarking
- Optimization: Once a buffer is full, calculate min, max and avg
+  - Benchmark and compare common timeseries DBs with our data and our queries (N)
-  - Calculate averages buffer-wise, average weighted by length of buffer
+- Memory management
-  - Only the head-buffer needs to be fully traversed
+  - To overcome garbage collection overhead: Reimplement in Rust (N)
- Optimization: If aggregating over hwthreads/cores/sockets cache those results
+  - Request memory directly batchwise via mmap (started in branch) (W)
-  and reuse some of that for new queres aggregating only over the newer data
+- Archive
- ...
+  - S3 backend for archive (I)
  - Store information in each buffer if already archived (N)
  - Do not create new checkpoint if all buffers already archived (N)
 - Checkpoints
  - S3 backend for checkpoints (I)
  - Combine checkpoints into larger files (I)
  - Binary checkpoints (started in branch) (W)
 - API
  - Redesign query interface (N)
  - Introduce JWT authentication for REST and NATS (I)
 - Testing
  - General tests (I)
  - Test data generator for regression tests (I)
  - Check for corner cases that should fail gracefully (N)
  - Write a more realistic `ToArchive`/`FromArchive` Tests (N)
 - Aggregation
  - Calculate averages buffer-wise as soon as full, average weighted by length of buffer (N)
  - Only the head-buffer needs to be fully traversed (N)
  - If aggregating over hwthreads/cores/sockets cache those results and reuse
    some of that for new queries aggregating only over the newer data (W)
 - Compression
  - Enable compression for http API requests (N)
  - Enable compression for checkpoints/archive (I)
 - Sampling
  - Support data re sampling to reduce data points (I)
  - Use re sampling algorithms that preserve min/max as far as possible (I)
--- a/api/openapi.yaml
+++ b/api/openapi.yaml
@@ -1,148 +0,0 @@
 # OpenAPI spec describing a subset of the HTTP REST API for the cc-metric-store.
 openapi: 3.0.3
 info:
  title: 'cc-metric-store REST API'
  description: 'In-memory time series database for hpc metrics to be used with the [ClusterCockpit](https://github.com/ClusterCockpit) toolsuite'
  version: 0.1.0
 paths:
  '/api/write':
    post:
      operationId: 'writeMetrics'
      description: 'Recieves metrics in the influx line-protocol using [this format](https://github.com/ClusterCockpit/cc-specifications/blob/master/metrics/lineprotocol_alternative.md)'
      parameters:
        - name: cluster
          in: query
          schema: { type: string }
          description: "If the lines in the body do not have a cluster tag, use this value instead."
      requestBody:
        required: true
        content:
          'text/plain':
            example:
              'flops_any,cluster=emmy,hostname=e1001,type=cpu,type-id=0 value=42.0'
      responses:
        200:
          description: 'Everything went fine'
        400:
          description: 'Bad Request'
  '/api/query':
    post:
      operationId: 'queryMetrics'
      description: 'Query metrics'
      requestBody:
        required: true
        content:
          'application/json':
            schema:
              type: object
              required: [cluster, from, to]
              properties:
                cluster:
                  type: string
                from:
                  type: integer
                to:
                  type: integer
                with-stats:
                  type: boolean
                  default: true
                with-data:
                  type: boolean
                  default: true
                queries:
                  type: array
                  items:
                    $ref: '#/components/schemas/ApiQuery'
                for-all-nodes:
                  description: 'If not null, add a new query for every known host on that cluster and every metric (at node-scope) specified in this array to the request. This can be used to get a metric for every host in a cluster without knowing the name of every host.'
                  type: array
                  items:
                    type: string
      responses:
        200:
          description: 'Requested data and stats as JSON'
          content:
            'application/json':
              schema:
                type: object
                properties:
                  queries:
                    description: 'Only if for-all-nodes was used, this property exists.'
                  results:
                    type: array
                    description: 'Array where each element is a response to the query at that same index in the request'
                    items:
                      description: 'If `aggreg` is true, only ever has one element.'
                      type: array
                      items:
                        type: object
                        properties:
                          error:
                            description: 'If not null or undefined, an error happend processing that query'
                            type: string
                            nullable: true
                          data:
                            type: array
                            items:
                              type: number
                              nullable: true
                          avg: { type: number }
                          min: { type: number }
                          max: { type: number }
        400:
          description: 'Bad Request'
  '/api/free':
    post:
      operationId: 'freeBuffers'
      description: 'Allow all buffers containing only data older than `to`'
      parameters:
        - name: to
          in: query
          description: 'Unix Timestamp'
          required: true
          schema:
            type: integer
      requestBody:
        required: true
        content:
          'application/json':
            schema:
              type: array
              items:
                type: array
                items:
                  type: string
      responses:
        200:
          description: 'Everything went fine'
        400:
          description: 'Bad Request'
 components:
  schemas:
    ApiQuery:
      description: 'A single query for a specific metric resulting in one series'
      type: object
      required: [metric, hostname, aggreg]
      properties:
        metirc:
          type: string
        hostname:
          type: string
        type:
          description: 'Not required for node-level requests. Usually something like socket, cpu or hwthread.'
          type: string
        type-ids:
          type: array
          items:
            type: string
        aggreg:
          type: boolean
          description: 'If true, every query result will have exactly one element. Otherwise, the data for every requested type-id/sub-type-id is provided seperately'
  securitySchemes:
    bearerAuth:
      type: http
      scheme: bearer
      bearerFormat: JWT
 security:
  - bearerAuth: [] # Applies `bearerAuth` globally