Merge pull request #29 from ClusterCockpit/main

Update devel branch
This commit is contained in:
adityauj 2024-10-26 21:25:36 +02:00 committed by GitHub
commit 2f3f70aa9f
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
12 changed files with 144 additions and 235 deletions

3
.gitignore vendored
View File

@ -17,9 +17,6 @@
# Project specific ignores # Project specific ignores
/var /var
/configs
sample.txt
migrateTimestamps.pl migrateTimestamps.pl
test_ccms_api.sh test_ccms_api.sh

64
.goreleaser.yaml Normal file
View File

@ -0,0 +1,64 @@
before:
hooks:
- go mod tidy
builds:
- env:
- CGO_ENABLED=0
goos:
- linux
goarch:
- amd64
goamd64:
- v3
id: "cc-metric-store"
binary: cc-metric-store
main: ./cmd/cc-metric-store
ldflags:
- -s -w -X main.version={{.Version}}
- -X main.commit={{.Commit}} -X main.date={{.Date}}
- -linkmode external -extldflags -static
tags:
- static_build
archives:
- format: tar.gz
# this name template makes the OS and Arch compatible with the results of uname.
name_template: >-
{{ .ProjectName }}_
{{- title .Os }}_
{{- if eq .Arch "amd64" }}x86_64
{{- else }}{{ .Arch }}{{ end }}
{{- if .Arm }}v{{ .Arm }}{{ end }}
checksum:
name_template: "checksums.txt"
snapshot:
name_template: "{{ incpatch .Version }}-next"
changelog:
sort: asc
filters:
include:
- "^feat:"
- "^fix:"
- "^sec:"
- "^docs:"
groups:
- title: "Dependency updates"
regexp: '^.*?(feat|fix)\(deps\)!?:.+$'
order: 300
- title: "New Features"
regexp: '^.*?feat(\([[:word:]]+\))??!?:.+$'
order: 100
- title: "Security updates"
regexp: '^.*?sec(\([[:word:]]+\))??!?:.+$'
order: 150
- title: "Bug fixes"
regexp: '^.*?fix(\([[:word:]]+\))??!?:.+$'
order: 200
- title: "Documentation updates"
regexp: ^.*?doc(\([[:word:]]+\))??!?:.+$
order: 400
release:
draft: false
footer: |
Please check out the [Release Notes](https://github.com/ClusterCockpit/cc-metric-store/blob/master/ReleaseNotes.md) for further details on breaking changes.
# vim: set ts=2 sw=2 tw=0 fo=cnqoj

View File

@ -1,17 +1,24 @@
TARGET = ./cc-metric-store TARGET = ./cc-metric-store
VERSION = 1.3.0 VAR = ./var/checkpoints/
VERSION = 0.1.0
GIT_HASH := $(shell git rev-parse --short HEAD || echo 'development') GIT_HASH := $(shell git rev-parse --short HEAD || echo 'development')
CURRENT_TIME = $(shell date +"%Y-%m-%d:T%H:%M:%S") CURRENT_TIME = $(shell date +"%Y-%m-%d:T%H:%M:%S")
LD_FLAGS = '-s -X main.date=${CURRENT_TIME} -X main.version=${VERSION} -X main.commit=${GIT_HASH}' LD_FLAGS = '-s -X main.date=${CURRENT_TIME} -X main.version=${VERSION} -X main.commit=${GIT_HASH}'
.PHONY: clean test tags swagger $(TARGET) .PHONY: clean distclean test swagger $(TARGET)
.NOTPARALLEL: .NOTPARALLEL:
$(TARGET): $(TARGET): config.json $(VAR)
$(info ===> BUILD cc-metric-store) $(info ===> BUILD cc-metric-store)
@go build -ldflags=${LD_FLAGS} ./cmd/cc-metric-store @go build -ldflags=${LD_FLAGS} ./cmd/cc-metric-store
config.json:
@cp ./configs/config.json config.json
$(VAR):
@mkdir -p $(VAR)
swagger: swagger:
$(info ===> GENERATE swagger) $(info ===> GENERATE swagger)
@go run github.com/swaggo/swag/cmd/swag init -d ./internal/api,./internal/util -g api.go -o ./api @go run github.com/swaggo/swag/cmd/swag init -d ./internal/api,./internal/util -g api.go -o ./api
@ -22,13 +29,13 @@ clean:
@go clean @go clean
@rm -f $(TARGET) @rm -f $(TARGET)
distclean: clean
@rm -rf ./var
@rm -f config.json
test: test:
$(info ===> TESTING) $(info ===> TESTING)
@go clean -testcache @go clean -testcache
@go build ./... @go build ./...
@go vet ./... @go vet ./...
@go test ./... @go test ./...
tags:
$(info ===> TAGS)
@ctags -R

View File

@ -6,23 +6,40 @@ The cc-metric-store provides a simple in-memory time series database for storing
metrics of cluster nodes at preconfigured intervals. It is meant to be used as metrics of cluster nodes at preconfigured intervals. It is meant to be used as
part of the [ClusterCockpit suite](https://github.com/ClusterCockpit). As all part of the [ClusterCockpit suite](https://github.com/ClusterCockpit). As all
data is kept in-memory (but written to disk as compressed JSON for long term data is kept in-memory (but written to disk as compressed JSON for long term
storage), accessing it is very fast. It also provides aggregations over time storage), accessing it is very fast. It also provides topology aware
_and_ nodes/sockets/cpus. aggregations over time _and_ nodes/sockets/cpus.
There are major limitations: Data only gets written to disk at periodic There are major limitations: Data only gets written to disk at periodic
checkpoints, not as soon as it is received. checkpoints, not as soon as it is received. Also only the fixed configured
duration is stored and available.
Go look at the `TODO.md` file and the [GitHub Go look at the [GitHub
Issues](https://github.com/ClusterCockpit/cc-metric-store/issues) for a progress Issues](https://github.com/ClusterCockpit/cc-metric-store/issues) for a progress
overview. Things work, but are not properly tested. The overview. The [NATS.io](https://nats.io/) based writing endpoint consumes messages in [this
[NATS.io](https://nats.io/) based writing endpoint consumes messages in [this
format of the InfluxDB line format of the InfluxDB line
protocol](https://github.com/ClusterCockpit/cc-specifications/blob/master/metrics/lineprotocol_alternative.md). protocol](https://github.com/ClusterCockpit/cc-specifications/blob/master/metrics/lineprotocol_alternative.md).
## Building
`cc-metric-store` can be built using the provided `Makefile`.
It supports the following targets:
- `make`: Build the application, copy a example configuration file and generate
checkpoint folders if required.
- `make clean`: Clean the golang build cache and application binary
- `make distclean`: In addition to the clean target also remove the `./var`
folder
- `make swagger`: Regenerate the Swagger files from the source comments.
- `make test`: Run test and basic checks.
## REST API Endpoints ## REST API Endpoints
The REST API is documented in [openapi.yaml](./api/openapi.yaml) in the OpenAPI The REST API is documented in [swagger.json](./api/swagger.json). You can
3.0 format. explore and try the REST API using the integrated [SwaggerUI web
interface](http://localhost:8082/swagger).
For more information on the `cc-metric-store` REST API have a look at the
ClusterCockpit documentation [website](https://clustercockpit.org/docs/reference/cc-metric-store/ccms-rest-api/)
## Run tests ## Run tests
@ -41,19 +58,14 @@ go test -bench=. -race -v ./...
## What are these selectors mentioned in the code? ## What are these selectors mentioned in the code?
Tags in InfluxDB are used to build indexes over the stored data. InfluxDB-Tags The cc-metric-store works as a time-series database and uses the InfluxDB line
have no relation to each other, they do not depend on each other and have no protocol as input format. Unlike InfluxDB, the data is indexed by one single
hierarchy. Different tags build up different indexes (I am no expert at all, but strictly hierarchical tree structure. A selector is built out of the tags in the
this is how i think they work). InfluxDB line protocol, and can be used to select a node (not in the sense of a
compute node, can also be a socket, cpu, ...) in that tree. The implementation
This project also works as a time-series database and uses the InfluxDB line calls those nodes `level` to avoid confusion. It is impossible to access data
protocol. Unlike InfluxDB, the data is indexed by one single strictly only by knowing the _socket_ or _cpu_ tag, all higher up levels have to be
hierarchical tree structure. A selector is built out of the tags in the InfluxDB specified as well.
line protocol, and can be used to select a node (not in the sense of a compute
node, can also be a socket, cpu, ...) in that tree. The implementation calls
those nodes `level` to avoid confusion. It is impossible to access data only by
knowing the _socket_ or _cpu_ tag, all higher up levels have to be specified as
well.
This is what the hierarchy currently looks like: This is what the hierarchy currently looks like:
@ -67,6 +79,8 @@ This is what the hierarchy currently looks like:
- cpu3 - cpu3
- cpu4 - cpu4
- ... - ...
- gpu1
- gpu2
- host2 - host2
- ... - ...
- cluster2 - cluster2
@ -80,42 +94,14 @@ Example selectors:
## Config file ## Config file
All durations are specified as string that will be parsed [like You find the configuration options on the ClusterCockpit [website](https://clustercockpit.org/docs/reference/cc-metric-store/ccms-configuration/).
this](https://pkg.go.dev/time#ParseDuration) (Allowed suffixes: `s`, `m`, `h`,
...).
- `metrics`: Map of metric-name to objects with the following properties
- `frequency`: Timestep/Interval/Resolution of this metric
- `aggregation`: Can be `"sum"`, `"avg"` or `null`
- `null` means aggregation across nodes is forbidden for this metric
- `"sum"` means that values from the child levels are summed up for the parent level
- `"avg"` means that values from the child levels are averaged for the parent level
- `scope`: Unused at the moment, should be something like `"node"`, `"socket"` or `"hwthread"`
- `nats`:
- `address`: Url of NATS.io server, example: "nats://localhost:4222"
- `username` and `password`: Optional, if provided use those for the connection
- `subscriptions`:
- `subscribe-to`: Where to expect the measurements to be published
- `cluster-tag`: Default value for the cluster tag
- `http-api`:
- `address`: Address to bind to, for example `0.0.0.0:8080`
- `https-cert-file` and `https-key-file`: Optional, if provided enable HTTPS using those files as certificate/key
- `jwt-public-key`: Base64 encoded string, use this to verify requests to the HTTP API
- `retention-on-memory`: Keep all values in memory for at least that amount of time
- `checkpoints`:
- `interval`: Do checkpoints every X seconds/minutes/hours
- `directory`: Path to a directory
- `restore`: After a restart, load the last X seconds/minutes/hours of data back into memory
- `archive`:
- `interval`: Move and compress all checkpoints not needed anymore every X seconds/minutes/hours
- `directory`: Path to a directory
## Test the complete setup (excluding cc-backend itself) ## Test the complete setup (excluding cc-backend itself)
There are two ways for sending data to the cc-metric-store, both of which are There are two ways for sending data to the cc-metric-store, both of which are
supported by the supported by the
[cc-metric-collector](https://github.com/ClusterCockpit/cc-metric-collector). [cc-metric-collector](https://github.com/ClusterCockpit/cc-metric-collector).
This example uses Nats, the alternative is to use HTTP. This example uses NATS, the alternative is to use HTTP.
```sh ```sh
# Only needed once, downloads the docker image # Only needed once, downloads the docker image

9
ReleaseNotes.md Normal file
View File

@ -0,0 +1,9 @@
# `cc-metric-store` version 0.1.0
This is a minor release of `cc-metric-store`, the metric timeseries cache
implementation of ClusterCockpit.
For release specific notes visit the [ClusterCockpit Documentation](https://clustercockpit.org/docs/release/).
## Breaking changes
None

15
TODO.md
View File

@ -1,15 +0,0 @@
# TODOs
- Improve checkpoints/archives
- Store information in each buffer if already archived
- Do not create new checkpoint if all buffers already archived
- Missing Testcases:
- General tests
- Check for corner cases that should fail gracefully
- Write a more realistic `ToArchive`/`FromArchive` tests
- Optimization: Once a buffer is full, calculate min, max and avg
- Calculate averages buffer-wise, average weighted by length of buffer
- Only the head-buffer needs to be fully traversed
- Optimization: If aggregating over hwthreads/cores/sockets cache those results
and reuse some of that for new queres aggregating only over the newer data
- ...

View File

@ -1,148 +0,0 @@
# OpenAPI spec describing a subset of the HTTP REST API for the cc-metric-store.
openapi: 3.0.3
info:
title: 'cc-metric-store REST API'
description: 'In-memory time series database for hpc metrics to be used with the [ClusterCockpit](https://github.com/ClusterCockpit) toolsuite'
version: 0.1.0
paths:
'/api/write':
post:
operationId: 'writeMetrics'
description: 'Receives metrics in the influx line-protocol using [this format](https://github.com/ClusterCockpit/cc-specifications/blob/master/metrics/lineprotocol_alternative.md)'
parameters:
- name: cluster
in: query
schema: { type: string }
description: "If the lines in the body do not have a cluster tag, use this value instead."
requestBody:
required: true
content:
'text/plain':
example:
'flops_any,cluster=emmy,hostname=e1001,type=cpu,type-id=0 value=42.0'
responses:
200:
description: 'Everything went fine'
400:
description: 'Bad Request'
'/api/query':
post:
operationId: 'queryMetrics'
description: 'Query metrics'
requestBody:
required: true
content:
'application/json':
schema:
type: object
required: [cluster, from, to]
properties:
cluster:
type: string
from:
type: integer
to:
type: integer
with-stats:
type: boolean
default: true
with-data:
type: boolean
default: true
queries:
type: array
items:
$ref: '#/components/schemas/ApiQuery'
for-all-nodes:
description: 'If not null, add a new query for every known host on that cluster and every metric (at node-scope) specified in this array to the request. This can be used to get a metric for every host in a cluster without knowing the name of every host.'
type: array
items:
type: string
responses:
200:
description: 'Requested data and stats as JSON'
content:
'application/json':
schema:
type: object
properties:
queries:
description: 'Only if for-all-nodes was used, this property exists.'
results:
type: array
description: 'Array where each element is a response to the query at that same index in the request'
items:
description: 'If `aggreg` is true, only ever has one element.'
type: array
items:
type: object
properties:
error:
description: 'If not null or undefined, an error happened processing that query'
type: string
nullable: true
data:
type: array
items:
type: number
nullable: true
avg: { type: number }
min: { type: number }
max: { type: number }
400:
description: 'Bad Request'
'/api/free':
post:
operationId: 'freeBuffers'
description: 'Allow all buffers containing only data older than `to`'
parameters:
- name: to
in: query
description: 'Unix Timestamp'
required: true
schema:
type: integer
requestBody:
required: true
content:
'application/json':
schema:
type: array
items:
type: array
items:
type: string
responses:
200:
description: 'Everything went fine'
400:
description: 'Bad Request'
components:
schemas:
ApiQuery:
description: 'A single query for a specific metric resulting in one series'
type: object
required: [metric, hostname, aggreg]
properties:
metric:
type: string
hostname:
type: string
type:
description: 'Not required for node-level requests. Usually something like socket, cpu or hwthread.'
type: string
type-ids:
type: array
items:
type: string
aggreg:
type: boolean
description: 'If true, every query result will have exactly one element. Otherwise, the data for every requested type-id/sub-type-id is provided separately'
securitySchemes:
bearerAuth:
type: http
scheme: bearer
bearerFormat: JWT
security:
- bearerAuth: [] # Applies `bearerAuth` globally

View File

@ -24,7 +24,7 @@
"ApiKeyAuth": [] "ApiKeyAuth": []
} }
], ],
"description": "Write metrics to store", "description": "This endpoint allows the users to print the content of",
"produces": [ "produces": [
"application/json" "application/json"
], ],
@ -81,6 +81,7 @@
"ApiKeyAuth": [] "ApiKeyAuth": []
} }
], ],
"description": "This endpoint allows the users to free the Buffers from the",
"produces": [ "produces": [
"application/json" "application/json"
], ],
@ -136,7 +137,7 @@
"ApiKeyAuth": [] "ApiKeyAuth": []
} }
], ],
"description": "Query metrics.", "description": "This endpoint allows the users to retrieve data from the",
"consumes": [ "consumes": [
"application/json" "application/json"
], ],

View File

@ -106,7 +106,7 @@ info:
paths: paths:
/debug/: /debug/:
post: post:
description: Write metrics to store description: This endpoint allows the users to print the content of
parameters: parameters:
- description: Selector - description: Selector
in: query in: query
@ -142,6 +142,7 @@ paths:
- debug - debug
/free/: /free/:
post: post:
description: This endpoint allows the users to free the Buffers from the
parameters: parameters:
- description: up to timestamp - description: up to timestamp
in: query in: query
@ -178,7 +179,7 @@ paths:
get: get:
consumes: consumes:
- application/json - application/json
description: Query metrics. description: This endpoint allows the users to retrieve data from the
parameters: parameters:
- description: API query payload object - description: API query payload object
in: body in: body

View File

@ -127,7 +127,10 @@ func (data *ApiMetricData) PadDataWithNull(ms *memorystore.MemoryStore, from, to
// handleFree godoc // handleFree godoc
// @summary // @summary
// @tags free // @tags free
// @description // @description This endpoint allows the users to free the Buffers from the
// metric store. This endpoint offers the users to remove them systematically
// and also allows them to prune the data under node, if they do not want to
// remove the whole node.
// @produce json // @produce json
// @param to query string false "up to timestamp" // @param to query string false "up to timestamp"
// @success 200 {string} string "ok" // @success 200 {string} string "ok"
@ -182,9 +185,9 @@ func handleFree(rw http.ResponseWriter, r *http.Request) {
} }
// handleWrite godoc // handleWrite godoc
// @summary Receive metrics in line-protocol // @summary Receive metrics in InfluxDB line-protocol
// @tags write // @tags write
// @description Receives metrics in the influx line-protocol using [this format](https://github.com/ClusterCockpit/cc-specifications/blob/master/metrics/lineprotocol_alternative.md) // @description Write data to the in-memory store in the InfluxDB line-protocol using [this format](https://github.com/ClusterCockpit/cc-specifications/blob/master/metrics/lineprotocol_alternative.md)
// @accept plain // @accept plain
// @produce json // @produce json
@ -245,7 +248,9 @@ type ApiQuery struct {
// handleQuery godoc // handleQuery godoc
// @summary Query metrics // @summary Query metrics
// @tags query // @tags query
// @description Query metrics. // @description This endpoint allows the users to retrieve data from the
// in-memory database. The CCMS will return data in JSON format for the
// specified interval requested by the user
// @accept json // @accept json
// @produce json // @produce json
// @param request body api.ApiQueryRequest true "API query payload object" // @param request body api.ApiQueryRequest true "API query payload object"
@ -383,7 +388,8 @@ func handleQuery(rw http.ResponseWriter, r *http.Request) {
// handleDebug godoc // handleDebug godoc
// @summary Debug endpoint // @summary Debug endpoint
// @tags debug // @tags debug
// @description Write metrics to store // @description This endpoint allows the users to print the content of
// nodes/clusters/metrics to review the state of the data.
// @produce json // @produce json
// @param selector query string false "Selector" // @param selector query string false "Selector"
// @success 200 {string} string "Debug dump" // @success 200 {string} string "Debug dump"

View File

@ -30,7 +30,7 @@ const docTemplate = `{
"ApiKeyAuth": [] "ApiKeyAuth": []
} }
], ],
"description": "Write metrics to store", "description": "This endpoint allows the users to print the content of",
"produces": [ "produces": [
"application/json" "application/json"
], ],
@ -87,6 +87,7 @@ const docTemplate = `{
"ApiKeyAuth": [] "ApiKeyAuth": []
} }
], ],
"description": "This endpoint allows the users to free the Buffers from the",
"produces": [ "produces": [
"application/json" "application/json"
], ],
@ -142,7 +143,7 @@ const docTemplate = `{
"ApiKeyAuth": [] "ApiKeyAuth": []
} }
], ],
"description": "Query metrics.", "description": "This endpoint allows the users to retrieve data from the",
"consumes": [ "consumes": [
"application/json" "application/json"
], ],