mirror of
https://github.com/ClusterCockpit/cc-metric-store.git
synced 2024-12-25 16:39:06 +01:00
commit
2f3f70aa9f
3
.gitignore
vendored
3
.gitignore
vendored
@ -17,9 +17,6 @@
|
||||
|
||||
# Project specific ignores
|
||||
/var
|
||||
/configs
|
||||
|
||||
sample.txt
|
||||
|
||||
migrateTimestamps.pl
|
||||
test_ccms_api.sh
|
||||
|
64
.goreleaser.yaml
Normal file
64
.goreleaser.yaml
Normal file
@ -0,0 +1,64 @@
|
||||
before:
|
||||
hooks:
|
||||
- go mod tidy
|
||||
builds:
|
||||
- env:
|
||||
- CGO_ENABLED=0
|
||||
goos:
|
||||
- linux
|
||||
goarch:
|
||||
- amd64
|
||||
goamd64:
|
||||
- v3
|
||||
id: "cc-metric-store"
|
||||
binary: cc-metric-store
|
||||
main: ./cmd/cc-metric-store
|
||||
ldflags:
|
||||
- -s -w -X main.version={{.Version}}
|
||||
- -X main.commit={{.Commit}} -X main.date={{.Date}}
|
||||
- -linkmode external -extldflags -static
|
||||
tags:
|
||||
- static_build
|
||||
archives:
|
||||
- format: tar.gz
|
||||
# this name template makes the OS and Arch compatible with the results of uname.
|
||||
name_template: >-
|
||||
{{ .ProjectName }}_
|
||||
{{- title .Os }}_
|
||||
{{- if eq .Arch "amd64" }}x86_64
|
||||
{{- else }}{{ .Arch }}{{ end }}
|
||||
{{- if .Arm }}v{{ .Arm }}{{ end }}
|
||||
checksum:
|
||||
name_template: "checksums.txt"
|
||||
snapshot:
|
||||
name_template: "{{ incpatch .Version }}-next"
|
||||
changelog:
|
||||
sort: asc
|
||||
filters:
|
||||
include:
|
||||
- "^feat:"
|
||||
- "^fix:"
|
||||
- "^sec:"
|
||||
- "^docs:"
|
||||
groups:
|
||||
- title: "Dependency updates"
|
||||
regexp: '^.*?(feat|fix)\(deps\)!?:.+$'
|
||||
order: 300
|
||||
- title: "New Features"
|
||||
regexp: '^.*?feat(\([[:word:]]+\))??!?:.+$'
|
||||
order: 100
|
||||
- title: "Security updates"
|
||||
regexp: '^.*?sec(\([[:word:]]+\))??!?:.+$'
|
||||
order: 150
|
||||
- title: "Bug fixes"
|
||||
regexp: '^.*?fix(\([[:word:]]+\))??!?:.+$'
|
||||
order: 200
|
||||
- title: "Documentation updates"
|
||||
regexp: ^.*?doc(\([[:word:]]+\))??!?:.+$
|
||||
order: 400
|
||||
release:
|
||||
draft: false
|
||||
footer: |
|
||||
Please check out the [Release Notes](https://github.com/ClusterCockpit/cc-metric-store/blob/master/ReleaseNotes.md) for further details on breaking changes.
|
||||
|
||||
# vim: set ts=2 sw=2 tw=0 fo=cnqoj
|
21
Makefile
21
Makefile
@ -1,17 +1,24 @@
|
||||
TARGET = ./cc-metric-store
|
||||
VERSION = 1.3.0
|
||||
VAR = ./var/checkpoints/
|
||||
VERSION = 0.1.0
|
||||
GIT_HASH := $(shell git rev-parse --short HEAD || echo 'development')
|
||||
CURRENT_TIME = $(shell date +"%Y-%m-%d:T%H:%M:%S")
|
||||
LD_FLAGS = '-s -X main.date=${CURRENT_TIME} -X main.version=${VERSION} -X main.commit=${GIT_HASH}'
|
||||
|
||||
.PHONY: clean test tags swagger $(TARGET)
|
||||
.PHONY: clean distclean test swagger $(TARGET)
|
||||
|
||||
.NOTPARALLEL:
|
||||
|
||||
$(TARGET):
|
||||
$(TARGET): config.json $(VAR)
|
||||
$(info ===> BUILD cc-metric-store)
|
||||
@go build -ldflags=${LD_FLAGS} ./cmd/cc-metric-store
|
||||
|
||||
config.json:
|
||||
@cp ./configs/config.json config.json
|
||||
|
||||
$(VAR):
|
||||
@mkdir -p $(VAR)
|
||||
|
||||
swagger:
|
||||
$(info ===> GENERATE swagger)
|
||||
@go run github.com/swaggo/swag/cmd/swag init -d ./internal/api,./internal/util -g api.go -o ./api
|
||||
@ -22,13 +29,13 @@ clean:
|
||||
@go clean
|
||||
@rm -f $(TARGET)
|
||||
|
||||
distclean: clean
|
||||
@rm -rf ./var
|
||||
@rm -f config.json
|
||||
|
||||
test:
|
||||
$(info ===> TESTING)
|
||||
@go clean -testcache
|
||||
@go build ./...
|
||||
@go vet ./...
|
||||
@go test ./...
|
||||
|
||||
tags:
|
||||
$(info ===> TAGS)
|
||||
@ctags -R
|
||||
|
88
README.md
88
README.md
@ -6,23 +6,40 @@ The cc-metric-store provides a simple in-memory time series database for storing
|
||||
metrics of cluster nodes at preconfigured intervals. It is meant to be used as
|
||||
part of the [ClusterCockpit suite](https://github.com/ClusterCockpit). As all
|
||||
data is kept in-memory (but written to disk as compressed JSON for long term
|
||||
storage), accessing it is very fast. It also provides aggregations over time
|
||||
_and_ nodes/sockets/cpus.
|
||||
storage), accessing it is very fast. It also provides topology aware
|
||||
aggregations over time _and_ nodes/sockets/cpus.
|
||||
|
||||
There are major limitations: Data only gets written to disk at periodic
|
||||
checkpoints, not as soon as it is received.
|
||||
checkpoints, not as soon as it is received. Also only the fixed configured
|
||||
duration is stored and available.
|
||||
|
||||
Go look at the `TODO.md` file and the [GitHub
|
||||
Go look at the [GitHub
|
||||
Issues](https://github.com/ClusterCockpit/cc-metric-store/issues) for a progress
|
||||
overview. Things work, but are not properly tested. The
|
||||
[NATS.io](https://nats.io/) based writing endpoint consumes messages in [this
|
||||
overview. The [NATS.io](https://nats.io/) based writing endpoint consumes messages in [this
|
||||
format of the InfluxDB line
|
||||
protocol](https://github.com/ClusterCockpit/cc-specifications/blob/master/metrics/lineprotocol_alternative.md).
|
||||
|
||||
## Building
|
||||
|
||||
`cc-metric-store` can be built using the provided `Makefile`.
|
||||
It supports the following targets:
|
||||
|
||||
- `make`: Build the application, copy an example configuration file and generate
|
||||
checkpoint folders if required.
|
||||
- `make clean`: Clean the golang build cache and application binary
|
||||
- `make distclean`: In addition to the clean target also remove the `./var`
|
||||
folder and the copied `config.json` file.
|
||||
- `make swagger`: Regenerate the Swagger files from the source comments.
|
||||
- `make test`: Run tests and basic checks.
|
||||
|
||||
## REST API Endpoints
|
||||
|
||||
The REST API is documented in [openapi.yaml](./api/openapi.yaml) in the OpenAPI
|
||||
3.0 format.
|
||||
The REST API is documented in [swagger.json](./api/swagger.json). You can
|
||||
explore and try the REST API using the integrated [SwaggerUI web
|
||||
interface](http://localhost:8082/swagger).
|
||||
|
||||
For more information on the `cc-metric-store` REST API have a look at the
|
||||
ClusterCockpit documentation [website](https://clustercockpit.org/docs/reference/cc-metric-store/ccms-rest-api/)
|
||||
|
||||
## Run tests
|
||||
|
||||
@ -41,19 +58,14 @@ go test -bench=. -race -v ./...
|
||||
|
||||
## What are these selectors mentioned in the code?
|
||||
|
||||
Tags in InfluxDB are used to build indexes over the stored data. InfluxDB-Tags
|
||||
have no relation to each other, they do not depend on each other and have no
|
||||
hierarchy. Different tags build up different indexes (I am no expert at all, but
|
||||
this is how i think they work).
|
||||
|
||||
This project also works as a time-series database and uses the InfluxDB line
|
||||
protocol. Unlike InfluxDB, the data is indexed by one single strictly
|
||||
hierarchical tree structure. A selector is build out of the tags in the InfluxDB
|
||||
line protocol, and can be used to select a node (not in the sense of a compute
|
||||
node, can also be a socket, cpu, ...) in that tree. The implementation calls
|
||||
those nodes `level` to avoid confusion. It is impossible to access data only by
|
||||
knowing the _socket_ or _cpu_ tag, all higher up levels have to be specified as
|
||||
well.
|
||||
The cc-metric-store works as a time-series database and uses the InfluxDB line
|
||||
protocol as input format. Unlike InfluxDB, the data is indexed by one single
|
||||
strictly hierarchical tree structure. A selector is built out of the tags in the
|
||||
InfluxDB line protocol, and can be used to select a node (not in the sense of a
|
||||
compute node, can also be a socket, cpu, ...) in that tree. The implementation
|
||||
calls those nodes `level` to avoid confusion. It is impossible to access data
|
||||
only by knowing the _socket_ or _cpu_ tag, all higher up levels have to be
|
||||
specified as well.
|
||||
|
||||
This is what the hierarchy currently looks like:
|
||||
|
||||
@ -67,6 +79,8 @@ This is what the hierarchy currently looks like:
|
||||
- cpu3
|
||||
- cpu4
|
||||
- ...
|
||||
- gpu1
|
||||
- gpu2
|
||||
- host2
|
||||
- ...
|
||||
- cluster2
|
||||
@ -80,42 +94,14 @@ Example selectors:
|
||||
|
||||
## Config file
|
||||
|
||||
All durations are specified as string that will be parsed [like
|
||||
this](https://pkg.go.dev/time#ParseDuration) (Allowed suffixes: `s`, `m`, `h`,
|
||||
...).
|
||||
|
||||
- `metrics`: Map of metric-name to objects with the following properties
|
||||
- `frequency`: Timestep/Interval/Resolution of this metric
|
||||
- `aggregation`: Can be `"sum"`, `"avg"` or `null`
|
||||
- `null` means aggregation across nodes is forbidden for this metric
|
||||
- `"sum"` means that values from the child levels are summed up for the parent level
|
||||
- `"avg"` means that values from the child levels are averaged for the parent level
|
||||
- `scope`: Unused at the moment, should be something like `"node"`, `"socket"` or `"hwthread"`
|
||||
- `nats`:
|
||||
- `address`: Url of NATS.io server, example: "nats://localhost:4222"
|
||||
- `username` and `password`: Optional, if provided use those for the connection
|
||||
- `subscriptions`:
|
||||
- `subscribe-to`: Where to expect the measurements to be published
|
||||
- `cluster-tag`: Default value for the cluster tag
|
||||
- `http-api`:
|
||||
- `address`: Address to bind to, for example `0.0.0.0:8080`
|
||||
- `https-cert-file` and `https-key-file`: Optional, if provided enable HTTPS using those files as certificate/key
|
||||
- `jwt-public-key`: Base64 encoded string, use this to verify requests to the HTTP API
|
||||
- `retention-on-memory`: Keep all values in memory for at least that amount of time
|
||||
- `checkpoints`:
|
||||
- `interval`: Do checkpoints every X seconds/minutes/hours
|
||||
- `directory`: Path to a directory
|
||||
- `restore`: After a restart, load the last X seconds/minutes/hours of data back into memory
|
||||
- `archive`:
|
||||
- `interval`: Move and compress all checkpoints not needed anymore every X seconds/minutes/hours
|
||||
- `directory`: Path to a directory
|
||||
You find the configuration options on the ClusterCockpit [website](https://clustercockpit.org/docs/reference/cc-metric-store/ccms-configuration/).
|
||||
|
||||
## Test the complete setup (excluding cc-backend itself)
|
||||
|
||||
There are two ways for sending data to the cc-metric-store, both of which are
|
||||
supported by the
|
||||
[cc-metric-collector](https://github.com/ClusterCockpit/cc-metric-collector).
|
||||
This example uses Nats, the alternative is to use HTTP.
|
||||
This example uses NATS, the alternative is to use HTTP.
|
||||
|
||||
```sh
|
||||
# Only needed once, downloads the docker image
|
||||
|
9
ReleaseNotes.md
Normal file
9
ReleaseNotes.md
Normal file
@ -0,0 +1,9 @@
|
||||
# `cc-metric-store` version 0.1.0
|
||||
|
||||
This is a minor release of `cc-metric-store`, the metric timeseries cache
|
||||
implementation of ClusterCockpit.
|
||||
For release specific notes visit the [ClusterCockpit Documentation](https://clustercockpit.org/docs/release/).
|
||||
|
||||
## Breaking changes
|
||||
|
||||
None
|
15
TODO.md
15
TODO.md
@ -1,15 +0,0 @@
|
||||
# TODOs
|
||||
|
||||
- Improve checkpoints/archives
|
||||
- Store information in each buffer if already archived
|
||||
- Do not create new checkpoint if all buffers already archived
|
||||
- Missing Testcases:
|
||||
- General tests
|
||||
- Check for corner cases that should fail gracefully
|
||||
- Write a more realistic `ToArchive`/`FromArchive` tests
|
||||
- Optimization: Once a buffer is full, calculate min, max and avg
|
||||
- Calculate averages buffer-wise, average weighted by length of buffer
|
||||
- Only the head-buffer needs to be fully traversed
|
||||
- Optimization: If aggregating over hwthreads/cores/sockets cache those results
|
||||
and reuse some of that for new queres aggregating only over the newer data
|
||||
- ...
|
148
api/openapi.yaml
148
api/openapi.yaml
@ -1,148 +0,0 @@
|
||||
# OpenAPI spec describing a subset of the HTTP REST API for the cc-metric-store.
|
||||
|
||||
openapi: 3.0.3
|
||||
info:
|
||||
title: 'cc-metric-store REST API'
|
||||
description: 'In-memory time series database for hpc metrics to be used with the [ClusterCockpit](https://github.com/ClusterCockpit) toolsuite'
|
||||
version: 0.1.0
|
||||
paths:
|
||||
'/api/write':
|
||||
post:
|
||||
operationId: 'writeMetrics'
|
||||
description: 'Recieves metrics in the influx line-protocol using [this format](https://github.com/ClusterCockpit/cc-specifications/blob/master/metrics/lineprotocol_alternative.md)'
|
||||
parameters:
|
||||
- name: cluster
|
||||
in: query
|
||||
schema: { type: string }
|
||||
description: "If the lines in the body do not have a cluster tag, use this value instead."
|
||||
requestBody:
|
||||
required: true
|
||||
content:
|
||||
'text/plain':
|
||||
example:
|
||||
'flops_any,cluster=emmy,hostname=e1001,type=cpu,type-id=0 value=42.0'
|
||||
responses:
|
||||
200:
|
||||
description: 'Everything went fine'
|
||||
400:
|
||||
description: 'Bad Request'
|
||||
'/api/query':
|
||||
post:
|
||||
operationId: 'queryMetrics'
|
||||
description: 'Query metrics'
|
||||
requestBody:
|
||||
required: true
|
||||
content:
|
||||
'application/json':
|
||||
schema:
|
||||
type: object
|
||||
required: [cluster, from, to]
|
||||
properties:
|
||||
cluster:
|
||||
type: string
|
||||
from:
|
||||
type: integer
|
||||
to:
|
||||
type: integer
|
||||
with-stats:
|
||||
type: boolean
|
||||
default: true
|
||||
with-data:
|
||||
type: boolean
|
||||
default: true
|
||||
queries:
|
||||
type: array
|
||||
items:
|
||||
$ref: '#/components/schemas/ApiQuery'
|
||||
for-all-nodes:
|
||||
description: 'If not null, add a new query for every known host on that cluster and every metric (at node-scope) specified in this array to the request. This can be used to get a metric for every host in a cluster without knowing the name of every host.'
|
||||
type: array
|
||||
items:
|
||||
type: string
|
||||
responses:
|
||||
200:
|
||||
description: 'Requested data and stats as JSON'
|
||||
content:
|
||||
'application/json':
|
||||
schema:
|
||||
type: object
|
||||
properties:
|
||||
queries:
|
||||
description: 'Only if for-all-nodes was used, this property exists.'
|
||||
results:
|
||||
type: array
|
||||
description: 'Array where each element is a response to the query at that same index in the request'
|
||||
items:
|
||||
description: 'If `aggreg` is true, only ever has one element.'
|
||||
type: array
|
||||
items:
|
||||
type: object
|
||||
properties:
|
||||
error:
|
||||
description: 'If not null or undefined, an error happend processing that query'
|
||||
type: string
|
||||
nullable: true
|
||||
data:
|
||||
type: array
|
||||
items:
|
||||
type: number
|
||||
nullable: true
|
||||
avg: { type: number }
|
||||
min: { type: number }
|
||||
max: { type: number }
|
||||
400:
|
||||
description: 'Bad Request'
|
||||
'/api/free':
|
||||
post:
|
||||
operationId: 'freeBuffers'
|
||||
description: 'Allow all buffers containing only data older than `to`'
|
||||
parameters:
|
||||
- name: to
|
||||
in: query
|
||||
description: 'Unix Timestamp'
|
||||
required: true
|
||||
schema:
|
||||
type: integer
|
||||
requestBody:
|
||||
required: true
|
||||
content:
|
||||
'application/json':
|
||||
schema:
|
||||
type: array
|
||||
items:
|
||||
type: array
|
||||
items:
|
||||
type: string
|
||||
responses:
|
||||
200:
|
||||
description: 'Everything went fine'
|
||||
400:
|
||||
description: 'Bad Request'
|
||||
components:
|
||||
schemas:
|
||||
ApiQuery:
|
||||
description: 'A single query for a specific metric resulting in one series'
|
||||
type: object
|
||||
required: [metric, hostname, aggreg]
|
||||
properties:
|
||||
metirc:
|
||||
type: string
|
||||
hostname:
|
||||
type: string
|
||||
type:
|
||||
description: 'Not required for node-level requests. Usually something like socket, cpu or hwthread.'
|
||||
type: string
|
||||
type-ids:
|
||||
type: array
|
||||
items:
|
||||
type: string
|
||||
aggreg:
|
||||
type: boolean
|
||||
description: 'If true, every query result will have exactly one element. Otherwise, the data for every requested type-id/sub-type-id is provided seperately'
|
||||
securitySchemes:
|
||||
bearerAuth:
|
||||
type: http
|
||||
scheme: bearer
|
||||
bearerFormat: JWT
|
||||
security:
|
||||
- bearerAuth: [] # Applies `bearerAuth` globally
|
@ -24,7 +24,7 @@
|
||||
"ApiKeyAuth": []
|
||||
}
|
||||
],
|
||||
"description": "Write metrics to store",
|
||||
"description": "This endpoint allows the users to print the content of",
|
||||
"produces": [
|
||||
"application/json"
|
||||
],
|
||||
@ -81,6 +81,7 @@
|
||||
"ApiKeyAuth": []
|
||||
}
|
||||
],
|
||||
"description": "This endpoint allows the users to free the Buffers from the",
|
||||
"produces": [
|
||||
"application/json"
|
||||
],
|
||||
@ -136,7 +137,7 @@
|
||||
"ApiKeyAuth": []
|
||||
}
|
||||
],
|
||||
"description": "Query metrics.",
|
||||
"description": "This endpoint allows the users to retrieve data from the",
|
||||
"consumes": [
|
||||
"application/json"
|
||||
],
|
||||
|
@ -106,7 +106,7 @@ info:
|
||||
paths:
|
||||
/debug/:
|
||||
post:
|
||||
description: Write metrics to store
|
||||
description: This endpoint allows the users to print the content of
|
||||
parameters:
|
||||
- description: Selector
|
||||
in: query
|
||||
@ -142,6 +142,7 @@ paths:
|
||||
- debug
|
||||
/free/:
|
||||
post:
|
||||
description: This endpoint allows the users to free the Buffers from the
|
||||
parameters:
|
||||
- description: up to timestamp
|
||||
in: query
|
||||
@ -178,7 +179,7 @@ paths:
|
||||
get:
|
||||
consumes:
|
||||
- application/json
|
||||
description: Query metrics.
|
||||
description: This endpoint allows the users to retrieve data from the
|
||||
parameters:
|
||||
- description: API query payload object
|
||||
in: body
|
||||
|
@ -127,7 +127,10 @@ func (data *ApiMetricData) PadDataWithNull(ms *memorystore.MemoryStore, from, to
|
||||
// handleFree godoc
|
||||
// @summary
|
||||
// @tags free
|
||||
// @description
|
||||
// @description This endpoint allows the users to free the Buffers from the
|
||||
// @description metric store. This endpoint offers the users to remove them systematically
|
||||
// @description and also allows them to prune the data under node, if they do not want to
|
||||
// @description remove the whole node.
|
||||
// @produce json
|
||||
// @param to query string false "up to timestamp"
|
||||
// @success 200 {string} string "ok"
|
||||
@ -182,9 +185,9 @@ func handleFree(rw http.ResponseWriter, r *http.Request) {
|
||||
}
|
||||
|
||||
// handleWrite godoc
|
||||
// @summary Receive metrics in line-protocol
|
||||
// @summary Receive metrics in InfluxDB line-protocol
|
||||
// @tags write
|
||||
// @description Receives metrics in the influx line-protocol using [this format](https://github.com/ClusterCockpit/cc-specifications/blob/master/metrics/lineprotocol_alternative.md)
|
||||
// @description Write data to the in-memory store in the InfluxDB line-protocol using [this format](https://github.com/ClusterCockpit/cc-specifications/blob/master/metrics/lineprotocol_alternative.md)
|
||||
|
||||
// @accept plain
|
||||
// @produce json
|
||||
@ -245,7 +248,9 @@ type ApiQuery struct {
|
||||
// handleQuery godoc
|
||||
// @summary Query metrics
|
||||
// @tags query
|
||||
// @description Query metrics.
|
||||
// @description This endpoint allows the users to retrieve data from the
|
||||
// @description in-memory database. The CCMS will return data in JSON format for the
|
||||
// @description specified interval requested by the user
|
||||
// @accept json
|
||||
// @produce json
|
||||
// @param request body api.ApiQueryRequest true "API query payload object"
|
||||
@ -383,7 +388,8 @@ func handleQuery(rw http.ResponseWriter, r *http.Request) {
|
||||
// handleDebug godoc
|
||||
// @summary Debug endpoint
|
||||
// @tags debug
|
||||
// @description Write metrics to store
|
||||
// @description This endpoint allows the users to print the content of
|
||||
// @description nodes/clusters/metrics to review the state of the data.
|
||||
// @produce json
|
||||
// @param selector query string false "Selector"
|
||||
// @success 200 {string} string "Debug dump"
|
||||
|
@ -30,7 +30,7 @@ const docTemplate = `{
|
||||
"ApiKeyAuth": []
|
||||
}
|
||||
],
|
||||
"description": "Write metrics to store",
|
||||
"description": "This endpoint allows the users to print the content of",
|
||||
"produces": [
|
||||
"application/json"
|
||||
],
|
||||
@ -87,6 +87,7 @@ const docTemplate = `{
|
||||
"ApiKeyAuth": []
|
||||
}
|
||||
],
|
||||
"description": "This endpoint allows the users to free the Buffers from the",
|
||||
"produces": [
|
||||
"application/json"
|
||||
],
|
||||
@ -142,7 +143,7 @@ const docTemplate = `{
|
||||
"ApiKeyAuth": []
|
||||
}
|
||||
],
|
||||
"description": "Query metrics.",
|
||||
"description": "This endpoint allows the users to retrieve data from the",
|
||||
"consumes": [
|
||||
"application/json"
|
||||
],
|
||||
|
Loading…
Reference in New Issue
Block a user