8 Commits

Author SHA1 Message Date
Thomas Roehl
3f3a1501ff Return unit in API requests per default 2023-08-22 16:16:19 +02:00
Thomas Roehl
997dd8f2ee Update test case to check the metric directly 2023-08-22 16:15:01 +02:00
Thomas Roehl
2d77dae30c Add basic test case for reading in checkpoint files 2023-08-22 15:17:27 +02:00
Thomas Roehl
5e5586f319 Add unit to buffers 2023-08-21 15:47:16 +02:00
Thomas Roehl
e71e1b123b Minor fix to Makefile 2023-04-14 18:25:20 +02:00
Thomas Roehl
7c891e1593 Add request flag WithUnit and return the metric unit if requested 2023-04-14 18:25:07 +02:00
Thomas Roehl
051cba4666 Add unit for archive reader/writer 2023-04-14 18:24:40 +02:00
Thomas Roehl
89acbe8db2 Accept unit tags for incoming metrics and store it in the buffer. 2023-04-14 18:24:22 +02:00
62 changed files with 4411 additions and 5960 deletions

.github/dependabot.yml

@@ -1,11 +0,0 @@
# To get started with Dependabot version updates, you'll need to specify which
# package ecosystems to update and where the package manifests are located.
# Please see the documentation for all configuration options:
# https://docs.github.com/code-security/dependabot/dependabot-version-updates/configuration-options-for-the-dependabot.yml-file
version: 2
updates:
- package-ecosystem: "gomod"
directory: "/"
schedule:
interval: "weekly"

.github/workflows/Release.yml

@@ -0,0 +1,314 @@
# See: https://docs.github.com/en/actions/using-workflows/workflow-syntax-for-github-actions
# Workflow name
name: Release
# Run on tag push
on:
push:
tags:
- '**'
jobs:
#
# Build on AlmaLinux 8.5 using golang-1.18.2
#
AlmaLinux-RPM-build:
runs-on: ubuntu-latest
# See: https://hub.docker.com/_/almalinux
container: almalinux:8.5
# The job outputs link to the outputs of the 'rpmrename' step
# Only job outputs can be used in child jobs
outputs:
rpm : ${{steps.rpmrename.outputs.RPM}}
srpm : ${{steps.rpmrename.outputs.SRPM}}
steps:
# Use dnf to install development packages
- name: Install development packages
run: |
dnf --assumeyes group install "Development Tools" "RPM Development Tools"
dnf --assumeyes install wget openssl-devel diffutils delve which
# Checkout git repository and submodules
# fetch-depth must be 0 to use git describe
# See: https://github.com/marketplace/actions/checkout
- name: Checkout
uses: actions/checkout@v2
with:
submodules: recursive
fetch-depth: 0
# Use dnf to install build dependencies
- name: Install build dependencies
run: |
wget -q http://mirror.centos.org/centos/8-stream/AppStream/x86_64/os/Packages/golang-1.18.2-1.module_el8.7.0+1173+5d37c0fd.x86_64.rpm \
http://mirror.centos.org/centos/8-stream/AppStream/x86_64/os/Packages/golang-bin-1.18.2-1.module_el8.7.0+1173+5d37c0fd.x86_64.rpm \
http://mirror.centos.org/centos/8-stream/AppStream/x86_64/os/Packages/golang-src-1.18.2-1.module_el8.7.0+1173+5d37c0fd.noarch.rpm \
http://mirror.centos.org/centos/8-stream/AppStream/x86_64/os/Packages/go-toolset-1.18.2-1.module_el8.7.0+1173+5d37c0fd.x86_64.rpm
rpm -i go*.rpm
- name: RPM build MetricStore
id: rpmbuild
run: make RPM
# AlmaLinux 8.5 is a derivative of Red Hat Enterprise Linux 8 (UBI8),
# so the created RPM and SRPM both contain the substring 'el8' in their file names.
# This step replaces the substring 'el8' with 'alma85'. It uses the move operation
# because it is unclear whether the default AlmaLinux 8.5 container contains the
# 'rename' command. This way we also get the new names for output.
- name: Rename RPMs (s/el8/alma85/)
id: rpmrename
run: |
OLD_RPM="${{steps.rpmbuild.outputs.RPM}}"
OLD_SRPM="${{steps.rpmbuild.outputs.SRPM}}"
NEW_RPM="${OLD_RPM/el8/alma85}"
NEW_SRPM=${OLD_SRPM/el8/alma85}
mv "${OLD_RPM}" "${NEW_RPM}"
mv "${OLD_SRPM}" "${NEW_SRPM}"
echo "::set-output name=SRPM::${NEW_SRPM}"
echo "::set-output name=RPM::${NEW_RPM}"
# See: https://github.com/actions/upload-artifact
- name: Save RPM as artifact
uses: actions/upload-artifact@v2
with:
name: cc-metric-store RPM for AlmaLinux 8.5
path: ${{ steps.rpmrename.outputs.RPM }}
- name: Save SRPM as artifact
uses: actions/upload-artifact@v2
with:
name: cc-metric-store SRPM for AlmaLinux 8.5
path: ${{ steps.rpmrename.outputs.SRPM }}
#
# Build on UBI 8 using golang-1.18.2
#
UBI-8-RPM-build:
runs-on: ubuntu-latest
# See: https://catalog.redhat.com/software/containers/ubi8/ubi/5c359854d70cc534b3a3784e?container-tabs=gti
container: registry.access.redhat.com/ubi8/ubi:8.5-226.1645809065
# The job outputs link to the outputs of the 'rpmbuild' step
outputs:
rpm : ${{steps.rpmbuild.outputs.RPM}}
srpm : ${{steps.rpmbuild.outputs.SRPM}}
steps:
# Use dnf to install development packages
- name: Install development packages
run: dnf --assumeyes --disableplugin=subscription-manager install rpm-build go-srpm-macros rpm-build-libs rpm-libs gcc make python38 git wget openssl-devel diffutils delve which
# Checkout git repository and submodules
# fetch-depth must be 0 to use git describe
# See: https://github.com/marketplace/actions/checkout
- name: Checkout
uses: actions/checkout@v2
with:
submodules: recursive
fetch-depth: 0
# Use dnf to install build dependencies
- name: Install build dependencies
run: |
wget -q http://mirror.centos.org/centos/8-stream/AppStream/x86_64/os/Packages/golang-1.18.2-1.module_el8.7.0+1173+5d37c0fd.x86_64.rpm \
http://mirror.centos.org/centos/8-stream/AppStream/x86_64/os/Packages/golang-bin-1.18.2-1.module_el8.7.0+1173+5d37c0fd.x86_64.rpm \
http://mirror.centos.org/centos/8-stream/AppStream/x86_64/os/Packages/golang-src-1.18.2-1.module_el8.7.0+1173+5d37c0fd.noarch.rpm \
http://mirror.centos.org/centos/8-stream/AppStream/x86_64/os/Packages/go-toolset-1.18.2-1.module_el8.7.0+1173+5d37c0fd.x86_64.rpm
rpm -i go*.rpm
- name: RPM build MetricStore
id: rpmbuild
run: make RPM
# See: https://github.com/actions/upload-artifact
- name: Save RPM as artifact
uses: actions/upload-artifact@v2
with:
name: cc-metric-store RPM for UBI 8
path: ${{ steps.rpmbuild.outputs.RPM }}
- name: Save SRPM as artifact
uses: actions/upload-artifact@v2
with:
name: cc-metric-store SRPM for UBI 8
path: ${{ steps.rpmbuild.outputs.SRPM }}
#
# Build on Ubuntu 20.04 using official go 1.19.1 package
#
Ubuntu-focal-build:
runs-on: ubuntu-latest
container: ubuntu:20.04
# The job outputs link to the outputs of the 'debrename' step
# Only job outputs can be used in child jobs
outputs:
deb : ${{steps.debrename.outputs.DEB}}
steps:
# Use apt to install development packages
- name: Install development packages
run: |
apt update && apt --assume-yes upgrade
apt --assume-yes install build-essential sed git wget bash
# Checkout git repository and submodules
# fetch-depth must be 0 to use git describe
# See: https://github.com/marketplace/actions/checkout
- name: Checkout
uses: actions/checkout@v2
with:
submodules: recursive
fetch-depth: 0
# Use official golang package
- name: Install Golang
run: |
wget -q https://go.dev/dl/go1.19.1.linux-amd64.tar.gz
tar -C /usr/local -xzf go1.19.1.linux-amd64.tar.gz
export PATH=/usr/local/go/bin:/usr/local/go/pkg/tool/linux_amd64:$PATH
go version
- name: DEB build MetricStore
id: dpkg-build
run: |
export PATH=/usr/local/go/bin:/usr/local/go/pkg/tool/linux_amd64:$PATH
make DEB
- name: Rename DEB (add '_ubuntu20.04')
id: debrename
run: |
OLD_DEB_NAME=$(echo "${{steps.dpkg-build.outputs.DEB}}" | rev | cut -d '.' -f 2- | rev)
NEW_DEB_FILE="${OLD_DEB_NAME}_ubuntu20.04.deb"
mv "${{steps.dpkg-build.outputs.DEB}}" "${NEW_DEB_FILE}"
echo "::set-output name=DEB::${NEW_DEB_FILE}"
# See: https://github.com/actions/upload-artifact
- name: Save DEB as artifact
uses: actions/upload-artifact@v2
with:
name: cc-metric-store DEB for Ubuntu 20.04
path: ${{ steps.debrename.outputs.DEB }}
#
# Build on Ubuntu 22.04 using official go 1.19.1 package
#
Ubuntu-jammy-build:
runs-on: ubuntu-latest
container: ubuntu:22.04
# The job outputs link to the outputs of the 'debrename' step
# Only job outputs can be used in child jobs
outputs:
deb : ${{steps.debrename.outputs.DEB}}
steps:
# Use apt to install development packages
- name: Install development packages
run: |
apt update && apt --assume-yes upgrade
apt --assume-yes install build-essential sed git wget bash
# Checkout git repository and submodules
# fetch-depth must be 0 to use git describe
# See: https://github.com/marketplace/actions/checkout
- name: Checkout
uses: actions/checkout@v2
with:
submodules: recursive
fetch-depth: 0
# Use official golang package
- name: Install Golang
run: |
wget -q https://go.dev/dl/go1.19.1.linux-amd64.tar.gz
tar -C /usr/local -xzf go1.19.1.linux-amd64.tar.gz
export PATH=/usr/local/go/bin:/usr/local/go/pkg/tool/linux_amd64:$PATH
go version
- name: DEB build MetricStore
id: dpkg-build
run: |
export PATH=/usr/local/go/bin:/usr/local/go/pkg/tool/linux_amd64:$PATH
make DEB
- name: Rename DEB (add '_ubuntu22.04')
id: debrename
run: |
OLD_DEB_NAME=$(echo "${{steps.dpkg-build.outputs.DEB}}" | rev | cut -d '.' -f 2- | rev)
NEW_DEB_FILE="${OLD_DEB_NAME}_ubuntu22.04.deb"
mv "${{steps.dpkg-build.outputs.DEB}}" "${NEW_DEB_FILE}"
echo "::set-output name=DEB::${NEW_DEB_FILE}"
# See: https://github.com/actions/upload-artifact
- name: Save DEB as artifact
uses: actions/upload-artifact@v2
with:
name: cc-metric-store DEB for Ubuntu 22.04
path: ${{ steps.debrename.outputs.DEB }}
#
# Create release with fresh RPMs
#
Release:
runs-on: ubuntu-latest
# We need the RPMs, so add dependency
needs: [AlmaLinux-RPM-build, UBI-8-RPM-build, Ubuntu-focal-build, Ubuntu-jammy-build]
steps:
# See: https://github.com/actions/download-artifact
- name: Download AlmaLinux 8.5 RPM
uses: actions/download-artifact@v2
with:
name: cc-metric-store RPM for AlmaLinux 8.5
- name: Download AlmaLinux 8.5 SRPM
uses: actions/download-artifact@v2
with:
name: cc-metric-store SRPM for AlmaLinux 8.5
- name: Download UBI 8 RPM
uses: actions/download-artifact@v2
with:
name: cc-metric-store RPM for UBI 8
- name: Download UBI 8 SRPM
uses: actions/download-artifact@v2
with:
name: cc-metric-store SRPM for UBI 8
- name: Download Ubuntu 20.04 DEB
uses: actions/download-artifact@v2
with:
name: cc-metric-store DEB for Ubuntu 20.04
- name: Download Ubuntu 22.04 DEB
uses: actions/download-artifact@v2
with:
name: cc-metric-store DEB for Ubuntu 22.04
# The download actions do not publish the name of the downloaded file,
# so we re-use the job outputs of the parent jobs. The files are all
# downloaded to the current folder.
# The gh-release action afterwards does not accept file lists but all
# files have to be listed at 'files'. The step creates one output per
# RPM package (2 per distro)
- name: Set RPM variables
id: files
run: |
ALMA_85_RPM=$(basename "${{ needs.AlmaLinux-RPM-build.outputs.rpm}}")
ALMA_85_SRPM=$(basename "${{ needs.AlmaLinux-RPM-build.outputs.srpm}}")
UBI_8_RPM=$(basename "${{ needs.UBI-8-RPM-build.outputs.rpm}}")
UBI_8_SRPM=$(basename "${{ needs.UBI-8-RPM-build.outputs.srpm}}")
U_2004_DEB=$(basename "${{ needs.Ubuntu-focal-build.outputs.deb}}")
U_2204_DEB=$(basename "${{ needs.Ubuntu-jammy-build.outputs.deb}}")
echo "ALMA_85_RPM::${ALMA_85_RPM}"
echo "ALMA_85_SRPM::${ALMA_85_SRPM}"
echo "UBI_8_RPM::${UBI_8_RPM}"
echo "UBI_8_SRPM::${UBI_8_SRPM}"
echo "U_2004_DEB::${U_2004_DEB}"
echo "U_2204_DEB::${U_2204_DEB}"
echo "::set-output name=ALMA_85_RPM::${ALMA_85_RPM}"
echo "::set-output name=ALMA_85_SRPM::${ALMA_85_SRPM}"
echo "::set-output name=UBI_8_RPM::${UBI_8_RPM}"
echo "::set-output name=UBI_8_SRPM::${UBI_8_SRPM}"
echo "::set-output name=U_2004_DEB::${U_2004_DEB}"
echo "::set-output name=U_2204_DEB::${U_2204_DEB}"
# See: https://github.com/softprops/action-gh-release
- name: Release
uses: softprops/action-gh-release@v1
if: startsWith(github.ref, 'refs/tags/')
with:
name: cc-metric-store-${{github.ref_name}}
files: |
${{ steps.files.outputs.ALMA_85_RPM }}
${{ steps.files.outputs.ALMA_85_SRPM }}
${{ steps.files.outputs.UBI_8_RPM }}
${{ steps.files.outputs.UBI_8_SRPM }}
${{ steps.files.outputs.U_2004_DEB }}
${{ steps.files.outputs.U_2204_DEB }}

.github/workflows/test.yml

@@ -5,14 +5,14 @@ jobs:
runs-on: ubuntu-latest
steps:
- name: Install Go
uses: actions/setup-go@v4
uses: actions/setup-go@v2
with:
go-version: 1.22.x
go-version: 1.16.x
- name: Checkout code
uses: actions/checkout@v3
uses: actions/checkout@v2
- name: Build, Vet & Test
run: |
go build ./...
go vet ./...
go test ./...
go test -v ./...

.gitignore

@@ -4,6 +4,7 @@
*.dll
*.so
*.dylib
/cc-metric-store
# Test binary, built with `go test -c`
*.test
@@ -15,14 +16,5 @@
# vendor/
# Project specific ignores
/cc-metric-store
/var
aditya.creds
test.creds
/config.json
migrateTimestamps.pl
sample.txt
sample_alex.txt
sample_fritz.txt
/configs

.goreleaser.yaml

@@ -1,63 +0,0 @@
before:
hooks:
- go mod tidy
builds:
- env:
- CGO_ENABLED=0
goos:
- linux
goarch:
- amd64
goamd64:
- v3
id: "cc-metric-store"
binary: cc-metric-store
main: ./cmd/cc-metric-store
ldflags:
- -s -w -X main.version={{.Version}}
- -X main.commit={{.Commit}} -X main.date={{.Date}}
tags:
- static_build
archives:
- format: tar.gz
# this name template makes the OS and Arch compatible with the results of uname.
name_template: >-
{{ .ProjectName }}_
{{- title .Os }}_
{{- if eq .Arch "amd64" }}x86_64
{{- else }}{{ .Arch }}{{ end }}
{{- if .Arm }}v{{ .Arm }}{{ end }}
checksum:
name_template: "checksums.txt"
snapshot:
name_template: "{{ incpatch .Version }}-next"
changelog:
sort: asc
filters:
include:
- "^feat:"
- "^fix:"
- "^sec:"
- "^docs:"
groups:
- title: "Dependency updates"
regexp: '^.*?(feat|fix)\(deps\)!?:.+$'
order: 300
- title: "New Features"
regexp: '^.*?feat(\([[:word:]]+\))??!?:.+$'
order: 100
- title: "Security updates"
regexp: '^.*?sec(\([[:word:]]+\))??!?:.+$'
order: 150
- title: "Bug fixes"
regexp: '^.*?fix(\([[:word:]]+\))??!?:.+$'
order: 200
- title: "Documentation updates"
regexp: ^.*?doc(\([[:word:]]+\))??!?:.+$
order: 400
release:
draft: false
footer: |
Please check out the [Release Notes](https://github.com/ClusterCockpit/cc-metric-store/blob/master/ReleaseNotes.md) for further details on breaking changes.
# vim: set ts=2 sw=2 tw=0 fo=cnqoj

Makefile

@@ -1,41 +1,113 @@
TARGET = ./cc-metric-store
VAR = ./var/checkpoints/
VERSION = 0.1.1
GIT_HASH := $(shell git rev-parse --short HEAD || echo 'development')
CURRENT_TIME = $(shell date +"%Y-%m-%d:T%H:%M:%S")
LD_FLAGS = '-s -X main.date=${CURRENT_TIME} -X main.version=${VERSION} -X main.commit=${GIT_HASH}'
.PHONY: clean distclean test swagger $(TARGET)
APP = cc-metric-store
GOSRC_APP := cc-metric-store.go
GOSRC_FILES := api.go \
memstore.go \
archive.go \
debug.go \
float.go \
lineprotocol.go \
selector.go \
stats.go
.NOTPARALLEL:
$(TARGET): config.json $(VAR)
$(info ===> BUILD cc-metric-store)
@go build -ldflags=${LD_FLAGS} ./cmd/cc-metric-store
config.json:
@cp ./configs/config.json config.json
BINDIR ?= bin
$(VAR):
@mkdir -p $(VAR)
swagger:
$(info ===> GENERATE swagger)
@go run github.com/swaggo/swag/cmd/swag init -d ./internal/api,./internal/util -g api.go -o ./api
@mv ./api/docs.go ./internal/api/docs.go
.PHONY: all
all: $(APP)
$(APP): $(GOSRC) $(GOSRC_APP)
go get
go build -o $(APP) $(GOSRC_APP) $(GOSRC_FILES)
install: $(APP)
@WORKSPACE=$(PREFIX)
@if [ -z "$${WORKSPACE}" ]; then exit 1; fi
@mkdir --parents --verbose $${WORKSPACE}/usr/$(BINDIR)
@install -Dpm 755 $(APP) $${WORKSPACE}/usr/$(BINDIR)/$(APP)
@install -Dpm 600 config.json $${WORKSPACE}/etc/$(APP)/$(APP).json
.PHONY: clean
.ONESHELL:
clean:
$(info ===> CLEAN)
@go clean
@rm -f $(TARGET)
rm -f $(APP)
distclean: clean
@rm -rf ./var
@rm -f config.json
.PHONY: fmt
fmt:
go fmt $(GOSRC_APP)
test:
$(info ===> TESTING)
@go clean -testcache
@go build ./...
@go vet ./...
@go test ./...
# Examine Go source code and reports suspicious constructs
.PHONY: vet
vet:
go vet ./...
# Run linter for the Go programming language.
# Using static analysis, it finds bugs and performance issues, offers simplifications, and enforces style rules
.PHONY: staticcheck
staticcheck:
go install honnef.co/go/tools/cmd/staticcheck@latest
$$(go env GOPATH)/bin/staticcheck ./...
.ONESHELL:
.PHONY: RPM
RPM: scripts/cc-metric-store.spec
@WORKSPACE="$${PWD}"
@SPECFILE="$${WORKSPACE}/scripts/cc-metric-store.spec"
# Setup RPM build tree
@eval $$(rpm --eval "ARCH='%{_arch}' RPMDIR='%{_rpmdir}' SOURCEDIR='%{_sourcedir}' SPECDIR='%{_specdir}' SRPMDIR='%{_srcrpmdir}' BUILDDIR='%{_builddir}'")
@mkdir --parents --verbose "$${RPMDIR}" "$${SOURCEDIR}" "$${SPECDIR}" "$${SRPMDIR}" "$${BUILDDIR}"
# Create source tarball
@COMMITISH="HEAD"
@VERS=$$(git describe --tags $${COMMITISH})
@VERS=$${VERS#v}
@VERS=$$(echo $$VERS | sed -e s+'-'+'_'+g)
@if [ "$${VERS}" = "" ]; then VERS="0.0.1"; fi
@eval $$(rpmspec --query --queryformat "NAME='%{name}' VERSION='%{version}' RELEASE='%{release}' NVR='%{NVR}' NVRA='%{NVRA}'" --define="VERS $${VERS}" "$${SPECFILE}")
@PREFIX="$${NAME}-$${VERSION}"
@FORMAT="tar.gz"
@SRCFILE="$${SOURCEDIR}/$${PREFIX}.$${FORMAT}"
@git archive --verbose --format "$${FORMAT}" --prefix="$${PREFIX}/" --output="$${SRCFILE}" $${COMMITISH}
# Build RPM and SRPM
@rpmbuild -ba --define="VERS $${VERS}" --rmsource --clean "$${SPECFILE}"
# Report RPMs and SRPMs when in GitHub Workflow
@if [[ "$${GITHUB_ACTIONS}" == true ]]; then
@ RPMFILE="$${RPMDIR}/$${ARCH}/$${NVRA}.rpm"
@ SRPMFILE="$${SRPMDIR}/$${NVR}.src.rpm"
@ echo "RPM: $${RPMFILE}"
@ echo "SRPM: $${SRPMFILE}"
@ echo "::set-output name=SRPM::$${SRPMFILE}"
@ echo "::set-output name=RPM::$${RPMFILE}"
@fi
.ONESHELL:
.PHONY: DEB
DEB: scripts/cc-metric-store.deb.control $(APP)
@BASEDIR=$${PWD}
@WORKSPACE=$${PWD}/.dpkgbuild
@DEBIANDIR=$${WORKSPACE}/debian
@DEBIANBINDIR=$${WORKSPACE}/DEBIAN
@mkdir --parents --verbose $$WORKSPACE $$DEBIANBINDIR
#@mkdir --parents --verbose $$DEBIANDIR
@CONTROLFILE="$${BASEDIR}/scripts/cc-metric-store.deb.control"
@COMMITISH="HEAD"
@VERS=$$(git describe --tags --abbrev=0 $${COMMITISH})
@VERS=$${VERS#v}
@VERS=$$(echo $$VERS | sed -e s+'-'+'_'+g)
@if [ "$${VERS}" = "" ]; then VERS="0.0.1"; fi
@ARCH=$$(uname -m)
@ARCH=$$(echo $$ARCH | sed -e s+'_'+'-'+g)
@if [ "$${ARCH}" = "x86-64" ]; then ARCH=amd64; fi
@PREFIX="$${NAME}-$${VERSION}_$${ARCH}"
@SIZE_BYTES=$$(du -bcs --exclude=.dpkgbuild "$$WORKSPACE"/ | awk '{print $$1}' | head -1 | sed -e 's/^0\+//')
@SIZE="$$(awk -v size="$$SIZE_BYTES" 'BEGIN {print (size/1024)+1}' | awk '{print int($$0)}')"
#@sed -e s+"{VERSION}"+"$$VERS"+g -e s+"{INSTALLED_SIZE}"+"$$SIZE"+g -e s+"{ARCH}"+"$$ARCH"+g $$CONTROLFILE > $${DEBIANDIR}/control
@sed -e s+"{VERSION}"+"$$VERS"+g -e s+"{INSTALLED_SIZE}"+"$$SIZE"+g -e s+"{ARCH}"+"$$ARCH"+g $$CONTROLFILE > $${DEBIANBINDIR}/control
@make PREFIX=$${WORKSPACE} install
@DEB_FILE="cc-metric-store_$${VERS}_$${ARCH}.deb"
@dpkg-deb -b $${WORKSPACE} "$$DEB_FILE"
@rm -r "$${WORKSPACE}"
@if [ "$${GITHUB_ACTIONS}" = "true" ]; then
@ echo "::set-output name=DEB::$${DEB_FILE}"
@fi

README.md

@@ -2,46 +2,18 @@
[![Build & Test](https://github.com/ClusterCockpit/cc-metric-store/actions/workflows/test.yml/badge.svg)](https://github.com/ClusterCockpit/cc-metric-store/actions/workflows/test.yml)
The cc-metric-store provides a simple in-memory time series database for storing
metrics of cluster nodes at preconfigured intervals. It is meant to be used as
part of the [ClusterCockpit suite](https://github.com/ClusterCockpit). As all
data is kept in-memory (but written to disk as compressed JSON for long term
storage), accessing it is very fast. It also provides topology aware
aggregations over time _and_ nodes/sockets/cpus.
The cc-metric-store provides a simple in-memory time series database for storing metrics of cluster nodes at preconfigured intervals. It is meant to be used as part of the [ClusterCockpit suite](https://github.com/ClusterCockpit). As all data is kept in-memory (but written to disk as compressed JSON for long term storage), accessing it is very fast. It also provides aggregations over time *and* nodes/sockets/cpus.
There are major limitations: Data only gets written to disk at periodic
checkpoints, not as soon as it is received. Also, only the fixed, configured
duration is stored and available.
There are major limitations: Data only gets written to disk at periodic checkpoints, not as soon as it is received.
Go look at the [GitHub
Issues](https://github.com/ClusterCockpit/cc-metric-store/issues) for a progress
overview. The [NATS.io](https://nats.io/) based writing endpoint consumes messages in [this
format of the InfluxDB line
protocol](https://github.com/ClusterCockpit/cc-specifications/blob/master/metrics/lineprotocol_alternative.md).
Go look at the `TODO.md` file and the [GitHub Issues](https://github.com/ClusterCockpit/cc-metric-store/issues) for a progress overview. Things work, but are not properly tested.
The [NATS.io](https://nats.io/) based writing endpoint consumes messages in [this format of the InfluxDB line protocol](https://github.com/ClusterCockpit/cc-specifications/blob/master/metrics/lineprotocol_alternative.md).
## Building
### REST API Endpoints
`cc-metric-store` can be built using the provided `Makefile`.
It supports the following targets:
The REST API is documented in [openapi.yaml](./openapi.yaml) in the OpenAPI 3.0 format.
- `make`: Build the application, copy an example configuration file and generate
checkpoint folders if required.
- `make clean`: Clean the golang build cache and application binary
- `make distclean`: In addition to the clean target also remove the `./var`
folder
- `make swagger`: Regenerate the Swagger files from the source comments.
- `make test`: Run test and basic checks.
## REST API Endpoints
The REST API is documented in [swagger.json](./api/swagger.json). You can
explore and try the REST API using the integrated [SwaggerUI web
interface](http://localhost:8082/swagger).
For more information on the `cc-metric-store` REST API, have a look at the
ClusterCockpit documentation [website](https://clustercockpit.org/docs/reference/cc-metric-store/ccms-rest-api/).
## Run tests
### Run tests
Some benchmarks concurrently access the `MemoryStore`, so enabling the
[Race Detector](https://golang.org/doc/articles/race_detector) might be useful.
@@ -56,16 +28,18 @@ go test -v ./...
go test -bench=. -race -v ./...
```
## What are these selectors mentioned in the code?
### What are these selectors mentioned in the code?
The cc-metric-store works as a time-series database and uses the InfluxDB line
protocol as input format. Unlike InfluxDB, the data is indexed by one single
strictly hierarchical tree structure. A selector is built out of the tags in the
InfluxDB line protocol, and can be used to select a node (not in the sense of a
compute node, can also be a socket, cpu, ...) in that tree. The implementation
calls those nodes `level` to avoid confusion. It is impossible to access data
only by knowing the _socket_ or _cpu_ tag; all higher-up levels have to be
specified as well.
Tags in InfluxDB are used to build indexes over the stored data. InfluxDB tags have no
relation to each other; they do not depend on each other and have no hierarchy.
Different tags build up different indexes (I am no expert at all, but this is how I think they work).
This project also works as a time-series database and uses the InfluxDB line protocol.
Unlike InfluxDB, the data is indexed by one single strictly hierarchical tree structure.
A selector is built out of the tags in the InfluxDB line protocol, and can be used to select
a node (not in the sense of a compute node, can also be a socket, cpu, ...) in that tree.
The implementation calls those nodes `level` to avoid confusion.
It is impossible to access data only by knowing the *socket* or *cpu* tag, all higher up levels have to be specified as well.
This is what the hierarchy currently looks like:
@@ -79,29 +53,49 @@ This is what the hierarchy currently looks like:
    - cpu3
    - cpu4
    - ...
    - gpu1
    - gpu2
  - host2
  - ...
- cluster2
- ...
Example selectors:
1. `["cluster1", "host1", "cpu0"]`: Select only the cpu0 of host1 in cluster1
2. `["cluster1", "host1", ["cpu4", "cpu5", "cpu6", "cpu7"]]`: Select only CPUs 4-7 of host1 in cluster1
3. `["cluster1", "host1"]`: Select the complete node. If querying for a CPU-specific metric such as floats, all CPUs are implied
## Config file
### Config file
You find the configuration options on the ClusterCockpit [website](https://clustercockpit.org/docs/reference/cc-metric-store/ccms-configuration/).
All durations are specified as strings that will be parsed [like this](https://pkg.go.dev/time#ParseDuration) (Allowed suffixes: `s`, `m`, `h`, ...).
## Test the complete setup (excluding cc-backend itself)
- `metrics`: Map of metric-name to objects with the following properties
  - `frequency`: Timestep/Interval/Resolution of this metric
  - `aggregation`: Can be `"sum"`, `"avg"` or `null`
    - `null` means aggregation across nodes is forbidden for this metric
    - `"sum"` means that values from the child levels are summed up for the parent level
    - `"avg"` means that values from the child levels are averaged for the parent level
  - `scope`: Unused at the moment, should be something like `"node"`, `"socket"` or `"hwthread"`
- `nats`:
  - `address`: URL of the NATS.io server, example: "nats://localhost:4222"
  - `username` and `password`: Optional, if provided use those for the connection
  - `subscriptions`:
    - `subscribe-to`: Where to expect the measurements to be published
    - `cluster-tag`: Default value for the cluster tag
- `http-api`:
  - `address`: Address to bind to, for example `0.0.0.0:8080`
  - `https-cert-file` and `https-key-file`: Optional, if provided enable HTTPS using those files as certificate/key
- `jwt-public-key`: Base64 encoded string, use this to verify requests to the HTTP API
- `retention-on-memory`: Keep all values in memory for at least that amount of time
- `checkpoints`:
  - `interval`: Do checkpoints every X seconds/minutes/hours
  - `directory`: Path to a directory
  - `restore`: After a restart, load the last X seconds/minutes/hours of data back into memory
- `archive`:
  - `interval`: Move and compress all checkpoints not needed anymore every X seconds/minutes/hours
  - `directory`: Path to a directory
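Putting the options together, a minimal `config.json` could look like the following sketch. All values are placeholders, and the exact shape of `subscriptions` (here: a list of objects) is an assumption, not taken from the repository:

```json
{
  "metrics": {
    "flops_any": { "frequency": 15, "aggregation": "avg", "scope": "node" }
  },
  "nats": {
    "address": "nats://localhost:4222",
    "subscriptions": [
      { "subscribe-to": "updates", "cluster-tag": "cluster1" }
    ]
  },
  "http-api": {
    "address": "0.0.0.0:8080"
  },
  "jwt-public-key": "<base64 encoded Ed25519 public key>",
  "retention-on-memory": "48h",
  "checkpoints": { "interval": "12h", "directory": "./var/checkpoints", "restore": "48h" },
  "archive": { "interval": "48h", "directory": "./var/archive" }
}
```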
There are two ways for sending data to the cc-metric-store, both of which are
supported by the
[cc-metric-collector](https://github.com/ClusterCockpit/cc-metric-collector).
This example uses NATS, the alternative is to use HTTP.
### Test the complete setup (excluding ClusterCockpit itself)
There are two ways for sending data to the cc-metric-store, both of which are supported by the [cc-metric-collector](https://github.com/ClusterCockpit/cc-metric-collector). This example uses NATS; the alternative is to use HTTP.
```sh
# Only needed once, downloads the docker image
@@ -111,9 +105,7 @@ docker pull nats:latest
docker run -p 4222:4222 -ti nats:latest
```
Second, build and start the
[cc-metric-collector](https://github.com/ClusterCockpit/cc-metric-collector)
using the following as Sink-Config:
Second, build and start the [cc-metric-collector](https://github.com/ClusterCockpit/cc-metric-collector) using the following as Sink-Config:
```json
{
@@ -124,20 +116,18 @@ using the following as Sink-Config:
}
```
Third, build and start the metric store. For this example here, the
`config.json` file already in the repository should work just fine.
Third, build and start the metric store. For this example here, the `config.json` file
already in the repository should work just fine.
```sh
# Assuming you have a clone of this repo in ./cc-metric-store:
cd cc-metric-store
make
go get
go build
./cc-metric-store
```
And finally, use the API to fetch some data. The API is protected by JWT based
authentication if `jwt-public-key` is set in `config.json`. You can use this JWT
for testing:
`eyJ0eXAiOiJKV1QiLCJhbGciOiJFZERTQSJ9.eyJ1c2VyIjoiYWRtaW4iLCJyb2xlcyI6WyJST0xFX0FETUlOIiwiUk9MRV9BTkFMWVNUIiwiUk9MRV9VU0VSIl19.d-3_3FZTsadPjDEdsWrrQ7nS0edMAR4zjl-eK7rJU3HziNBfI9PDHDIpJVHTNN5E5SlLGLFXctWyKAkwhXL-Dw`
And finally, use the API to fetch some data. The API is protected by JWT based authentication if `jwt-public-key` is set in `config.json`. You can use this JWT for testing: `eyJ0eXAiOiJKV1QiLCJhbGciOiJFZERTQSJ9.eyJ1c2VyIjoiYWRtaW4iLCJyb2xlcyI6WyJST0xFX0FETUlOIiwiUk9MRV9BTkFMWVNUIiwiUk9MRV9VU0VSIl19.d-3_3FZTsadPjDEdsWrrQ7nS0edMAR4zjl-eK7rJU3HziNBfI9PDHDIpJVHTNN5E5SlLGLFXctWyKAkwhXL-Dw`
```sh
JWT="eyJ0eXAiOiJKV1QiLCJhbGciOiJFZERTQSJ9.eyJ1c2VyIjoiYWRtaW4iLCJyb2xlcyI6WyJST0xFX0FETUlOIiwiUk9MRV9BTkFMWVNUIiwiUk9MRV9VU0VSIl19.d-3_3FZTsadPjDEdsWrrQ7nS0edMAR4zjl-eK7rJU3HziNBfI9PDHDIpJVHTNN5E5SlLGLFXctWyKAkwhXL-Dw"
@@ -151,13 +141,3 @@ curl -H "Authorization: Bearer $JWT" -D - "http://localhost:8080/api/query" -d "
# ...
```
For debugging there is a debug endpoint to dump the current content to stdout:
```sh
JWT="eyJ0eXAiOiJKV1QiLCJhbGciOiJFZERTQSJ9.eyJ1c2VyIjoiYWRtaW4iLCJyb2xlcyI6WyJST0xFX0FETUlOIiwiUk9MRV9BTkFMWVNUIiwiUk9MRV9VU0VSIl19.d-3_3FZTsadPjDEdsWrrQ7nS0edMAR4zjl-eK7rJU3HziNBfI9PDHDIpJVHTNN5E5SlLGLFXctWyKAkwhXL-Dw"
# If the collector and store and nats-server have been running for at least 60 seconds on the same host, you may run:
curl -H "Authorization: Bearer $JWT" -D - "http://localhost:8080/api/debug"
# ...
```

ReleaseNotes.md

@@ -1,19 +0,0 @@
# `cc-metric-store` version 0.1.1
This is a bugfix release of `cc-metric-store`, the metric time-series cache
implementation of ClusterCockpit.
For release-specific notes, visit the [ClusterCockpit Documentation](https://clustercockpit.org/docs/release/).
Notable changes in version 0.1.0:
- Cleanup of code and restructuring
- Document REST API with Swagger
- Introduce REST API versioning
- Provide Swagger UI test web-frontend
- Introduce re-sampling of metric data
- Support also ms, ns in line protocol
- Support NATS credentials
## Breaking changes
None

TODO.md

@@ -0,0 +1,14 @@
# TODO

- Improve checkpoints/archives
  - Store information in each buffer if already archived
  - Do not create a new checkpoint if all buffers are already archived
- Missing test cases:
  - General tests
  - Check for corner cases that should fail gracefully
  - Write more realistic `ToArchive`/`FromArchive` tests
- Optimization: Once a buffer is full, calculate min, max and avg
  - Calculate averages buffer-wise, weighted by the length of each buffer (see the sketch after this list)
  - Only the head buffer needs to be fully traversed
- Optimization: If aggregating over hwthreads/cores/sockets, cache those results and reuse some of them for new queries aggregating only over the newer data
- ...
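The buffer-wise average mentioned above follows from a simple identity: the mean over all samples equals the sum of per-buffer means weighted by each buffer's sample count, divided by the total count. A minimal Go sketch of that combination step (types and names are hypothetical, not from this repository):

```go
package main

import "fmt"

// bufStats is a hypothetical per-buffer summary: once a buffer is full,
// its average (and min/max) could be precomputed and stored alongside it.
type bufStats struct {
	n   int     // number of samples in the buffer
	avg float64 // precomputed average of those samples
}

// weightedAvg combines per-buffer averages, weighting each by its length.
// This yields the same result as averaging all raw samples directly.
func weightedAvg(bufs []bufStats) float64 {
	total, sum := 0, 0.0
	for _, b := range bufs {
		total += b.n
		sum += float64(b.n) * b.avg
	}
	if total == 0 {
		return 0
	}
	return sum / float64(total)
}

func main() {
	// Two full buffers and a short head buffer:
	// (4*2.0 + 4*4.0 + 2*3.0) / 10 = 3.0
	fmt.Println(weightedAvg([]bufStats{{4, 2.0}, {4, 4.0}, {2, 3.0}}))
}
```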

api.go

@@ -0,0 +1,429 @@
package main
import (
"bufio"
"context"
"crypto/ed25519"
"encoding/base64"
"encoding/json"
"errors"
"fmt"
"io"
"log"
"math"
"net/http"
"strconv"
"strings"
"sync"
"time"
"github.com/golang-jwt/jwt/v4"
"github.com/gorilla/mux"
"github.com/influxdata/line-protocol/v2/lineprotocol"
)
type ApiMetricData struct {
Error *string `json:"error,omitempty"`
From int64 `json:"from"`
To int64 `json:"to"`
Data FloatArray `json:"data,omitempty"`
Avg Float `json:"avg"`
Min Float `json:"min"`
Max Float `json:"max"`
Unit string `json:"unit,omitempty"`
}
// TODO: Optimize this, just like the stats endpoint!
func (data *ApiMetricData) AddStats() {
n := 0
sum, min, max := 0.0, math.MaxFloat64, -math.MaxFloat64
for _, x := range data.Data {
if x.IsNaN() {
continue
}
n += 1
sum += float64(x)
min = math.Min(min, float64(x))
max = math.Max(max, float64(x))
}
if n > 0 {
avg := sum / float64(n)
data.Avg = Float(avg)
data.Min = Float(min)
data.Max = Float(max)
} else {
data.Avg, data.Min, data.Max = NaN, NaN, NaN
}
}
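// Note (explanatory comment, not in the original source): ScaleBy below
// treats a factor of 0 as "not set" rather than scaling all values to zero.
// This matches the `scale-by,omitempty` JSON tag on ApiQuery: an absent
// field decodes to 0, and handleQuery only calls ScaleBy for non-zero factors.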
func (data *ApiMetricData) ScaleBy(f Float) {
if f == 0 || f == 1 {
return
}
data.Avg *= f
data.Min *= f
data.Max *= f
for i := 0; i < len(data.Data); i++ {
data.Data[i] *= f
}
}
func (data *ApiMetricData) PadDataWithNull(from, to int64, metric string) {
minfo, ok := memoryStore.metrics[metric]
if !ok {
return
}
if (data.From / minfo.Frequency) > (from / minfo.Frequency) {
padfront := int((data.From / minfo.Frequency) - (from / minfo.Frequency))
ndata := make([]Float, 0, padfront+len(data.Data))
for i := 0; i < padfront; i++ {
ndata = append(ndata, NaN)
}
for j := 0; j < len(data.Data); j++ {
ndata = append(ndata, data.Data[j])
}
data.Data = ndata
}
}
func handleFree(rw http.ResponseWriter, r *http.Request) {
rawTo := r.URL.Query().Get("to")
if rawTo == "" {
http.Error(rw, "'to' is a required query parameter", http.StatusBadRequest)
return
}
to, err := strconv.ParseInt(rawTo, 10, 64)
if err != nil {
http.Error(rw, err.Error(), http.StatusBadRequest)
return
}
// TODO: lastCheckpoint might be modified by different go-routines.
// Load it using the sync/atomic package?
freeUpTo := lastCheckpoint.Unix()
if to < freeUpTo {
freeUpTo = to
}
if r.Method != http.MethodPost {
http.Error(rw, "Method Not Allowed", http.StatusMethodNotAllowed)
return
}
bodyDec := json.NewDecoder(r.Body)
var selectors [][]string
err = bodyDec.Decode(&selectors)
if err != nil {
http.Error(rw, err.Error(), http.StatusBadRequest)
return
}
n := 0
for _, sel := range selectors {
bn, err := memoryStore.Free(sel, freeUpTo)
if err != nil {
http.Error(rw, err.Error(), http.StatusInternalServerError)
return
}
n += bn
}
rw.WriteHeader(http.StatusOK)
rw.Write([]byte(fmt.Sprintf("buffers freed: %d\n", n)))
}
func handleWrite(rw http.ResponseWriter, r *http.Request) {
if r.Method != http.MethodPost {
http.Error(rw, "Method Not Allowed", http.StatusMethodNotAllowed)
return
}
bytes, err := io.ReadAll(r.Body)
if err != nil {
log.Printf("error while reading request body: %s", err.Error())
http.Error(rw, err.Error(), http.StatusInternalServerError)
return
}
if debugDump != io.Discard {
now := time.Now()
msg := make([]byte, 0, 512)
msg = append(msg, "\n--- local unix time: "...)
msg = strconv.AppendInt(msg, now.Unix(), 10)
msg = append(msg, " ---\n"...)
debugDumpLock.Lock()
defer debugDumpLock.Unlock()
if _, err := debugDump.Write(msg); err != nil {
log.Printf("error while writing to debug dump: %s", err.Error())
}
if _, err := debugDump.Write(bytes); err != nil {
log.Printf("error while writing to debug dump: %s", err.Error())
}
return
}
dec := lineprotocol.NewDecoderWithBytes(bytes)
if err := decodeLine(dec, r.URL.Query().Get("cluster")); err != nil {
log.Printf("/api/write error: %s", err.Error())
http.Error(rw, err.Error(), http.StatusBadRequest)
return
}
rw.WriteHeader(http.StatusOK)
}
type ApiQueryRequest struct {
Cluster string `json:"cluster"`
From int64 `json:"from"`
To int64 `json:"to"`
WithStats bool `json:"with-stats"`
WithUnit bool `json:"with-unit,omitempty"`
WithData bool `json:"with-data"`
WithPadding bool `json:"with-padding"`
Queries []ApiQuery `json:"queries"`
ForAllNodes []string `json:"for-all-nodes"`
}
type ApiQueryResponse struct {
Queries []ApiQuery `json:"queries,omitempty"`
Results [][]ApiMetricData `json:"results"`
}
type ApiQuery struct {
Metric string `json:"metric"`
Hostname string `json:"host"`
Aggregate bool `json:"aggreg"`
ScaleFactor Float `json:"scale-by,omitempty"`
Type *string `json:"type,omitempty"`
TypeIds []string `json:"type-ids,omitempty"`
SubType *string `json:"subtype,omitempty"`
SubTypeIds []string `json:"subtype-ids,omitempty"`
}
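// Note (explanatory comment, not in the original source): Aggregate controls
// how TypeIds are expanded into selectors in handleQuery below. If true, all
// ids are grouped into a single selector element and one aggregated series is
// returned; if false, one selector per id is built and one series per id is
// returned.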
func handleQuery(rw http.ResponseWriter, r *http.Request) {
var err error
var req ApiQueryRequest = ApiQueryRequest{WithStats: true, WithData: true, WithPadding: true, WithUnit: true}
if err := json.NewDecoder(r.Body).Decode(&req); err != nil {
http.Error(rw, err.Error(), http.StatusBadRequest)
return
}
response := ApiQueryResponse{
Results: make([][]ApiMetricData, 0, len(req.Queries)),
}
if req.ForAllNodes != nil {
nodes := memoryStore.ListChildren([]string{req.Cluster})
for _, node := range nodes {
for _, metric := range req.ForAllNodes {
q := ApiQuery{
Metric: metric,
Hostname: node,
}
req.Queries = append(req.Queries, q)
response.Queries = append(response.Queries, q)
}
}
}
for _, query := range req.Queries {
sels := make([]Selector, 0, 1)
if query.Aggregate || query.Type == nil {
sel := Selector{{String: req.Cluster}, {String: query.Hostname}}
if query.Type != nil {
if len(query.TypeIds) == 1 {
sel = append(sel, SelectorElement{String: *query.Type + query.TypeIds[0]})
} else {
ids := make([]string, len(query.TypeIds))
for i, id := range query.TypeIds {
ids[i] = *query.Type + id
}
sel = append(sel, SelectorElement{Group: ids})
}
if query.SubType != nil {
if len(query.SubTypeIds) == 1 {
sel = append(sel, SelectorElement{String: *query.SubType + query.SubTypeIds[0]})
} else {
ids := make([]string, len(query.SubTypeIds))
for i, id := range query.SubTypeIds {
ids[i] = *query.SubType + id
}
sel = append(sel, SelectorElement{Group: ids})
}
}
}
sels = append(sels, sel)
} else {
for _, typeId := range query.TypeIds {
if query.SubType != nil {
for _, subTypeId := range query.SubTypeIds {
sels = append(sels, Selector{
{String: req.Cluster}, {String: query.Hostname},
{String: *query.Type + typeId},
{String: *query.SubType + subTypeId}})
}
} else {
sels = append(sels, Selector{
{String: req.Cluster},
{String: query.Hostname},
{String: *query.Type + typeId}})
}
}
}
// log.Printf("query: %#v\n", query)
// log.Printf("sels: %#v\n", sels)
res := make([]ApiMetricData, 0, len(sels))
for _, sel := range sels {
data := ApiMetricData{}
data.Unit = ""
unit := ""
data.Data, data.From, data.To, unit, err = memoryStore.Read(sel, query.Metric, req.From, req.To)
// log.Printf("data: %#v, %#v, %#v, %#v", data.Data, data.From, data.To, err)
if err != nil {
msg := err.Error()
data.Error = &msg
res = append(res, data)
continue
}
if req.WithStats {
data.AddStats()
}
if query.ScaleFactor != 0 {
data.ScaleBy(query.ScaleFactor)
}
if req.WithPadding {
data.PadDataWithNull(req.From, req.To, query.Metric)
}
if req.WithUnit && len(unit) > 0 {
data.Unit = unit
}
if !req.WithData {
data.Data = nil
}
res = append(res, data)
}
response.Results = append(response.Results, res)
}
rw.Header().Set("Content-Type", "application/json")
bw := bufio.NewWriter(rw)
defer bw.Flush()
if err := json.NewEncoder(bw).Encode(response); err != nil {
log.Print(err)
return
}
}
func authentication(next http.Handler, publicKey ed25519.PublicKey) http.Handler {
cacheLock := sync.RWMutex{}
cache := map[string]*jwt.Token{}
return http.HandlerFunc(func(rw http.ResponseWriter, r *http.Request) {
authheader := r.Header.Get("Authorization")
if authheader == "" || !strings.HasPrefix(authheader, "Bearer ") {
http.Error(rw, "Use JWT Authentication", http.StatusUnauthorized)
return
}
rawtoken := authheader[len("Bearer "):]
cacheLock.RLock()
token, ok := cache[rawtoken]
cacheLock.RUnlock()
if ok && token.Claims.Valid() == nil {
next.ServeHTTP(rw, r)
return
}
// The actual token is ignored for now.
// In case expiration and so on are specified, the Parse function
// already returns an error for expired tokens.
var err error
token, err = jwt.Parse(rawtoken, func(t *jwt.Token) (interface{}, error) {
if t.Method != jwt.SigningMethodEdDSA {
return nil, errors.New("only Ed25519/EdDSA supported")
}
return publicKey, nil
})
if err != nil {
http.Error(rw, err.Error(), http.StatusUnauthorized)
return
}
cacheLock.Lock()
cache[rawtoken] = token
cacheLock.Unlock()
// Let request through...
next.ServeHTTP(rw, r)
})
}
func StartApiServer(ctx context.Context, httpConfig *HttpConfig) error {
r := mux.NewRouter()
r.HandleFunc("/api/free", handleFree)
r.HandleFunc("/api/write", handleWrite)
r.HandleFunc("/api/query", handleQuery)
r.HandleFunc("/api/debug", func(rw http.ResponseWriter, r *http.Request) {
raw := r.URL.Query().Get("selector")
selector := []string{}
if len(raw) != 0 {
selector = strings.Split(raw, ":")
}
if err := memoryStore.DebugDump(bufio.NewWriter(rw), selector); err != nil {
rw.WriteHeader(http.StatusBadRequest)
rw.Write([]byte(err.Error()))
}
})
server := &http.Server{
Handler: r,
Addr: httpConfig.Address,
WriteTimeout: 30 * time.Second,
ReadTimeout: 30 * time.Second,
}
if len(conf.JwtPublicKey) > 0 {
buf, err := base64.StdEncoding.DecodeString(conf.JwtPublicKey)
if err != nil {
return err
}
publicKey := ed25519.PublicKey(buf)
server.Handler = authentication(server.Handler, publicKey)
}
go func() {
if httpConfig.CertFile != "" && httpConfig.KeyFile != "" {
log.Printf("API https endpoint listening on '%s'\n", httpConfig.Address)
err := server.ListenAndServeTLS(httpConfig.CertFile, httpConfig.KeyFile)
if err != nil && err != http.ErrServerClosed {
log.Println(err)
}
} else {
log.Printf("API http endpoint listening on '%s'\n", httpConfig.Address)
err := server.ListenAndServe()
if err != nil && err != http.ErrServerClosed {
log.Println(err)
}
}
}()
for {
<-ctx.Done()
err := server.Shutdown(context.Background())
log.Println("API server shut down")
return err
}
}
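The authentication handler above only accepts Ed25519/EdDSA-signed JWTs verified against the base64-encoded `jwt-public-key`. As a hedged sketch (not part of the repository), a matching key pair and token could be generated like this; the claims are illustrative, since the handler does not inspect claim contents beyond validity:

```go
package main

import (
	"crypto/ed25519"
	"crypto/rand"
	"encoding/base64"
	"fmt"
	"log"

	"github.com/golang-jwt/jwt/v4"
)

func main() {
	// Generate a fresh Ed25519 key pair. The base64-encoded public key goes
	// into config.json as "jwt-public-key"; the private key signs the token.
	pub, priv, err := ed25519.GenerateKey(rand.Reader)
	if err != nil {
		log.Fatal(err)
	}
	token := jwt.NewWithClaims(jwt.SigningMethodEdDSA, jwt.MapClaims{
		"user": "admin", // illustrative claim
	})
	signed, err := token.SignedString(priv)
	if err != nil {
		log.Fatal(err)
	}
	fmt.Println("jwt-public-key:", base64.StdEncoding.EncodeToString(pub))
	fmt.Println("Bearer token:  ", signed)
}
```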

api/swagger.json

@@ -1,456 +0,0 @@
{
"swagger": "2.0",
"info": {
"description": "API for cc-metric-store",
"title": "cc-metric-store REST API",
"contact": {
"name": "ClusterCockpit Project",
"url": "https://clustercockpit.org",
"email": "support@clustercockpit.org"
},
"license": {
"name": "MIT License",
"url": "https://opensource.org/licenses/MIT"
},
"version": "1.0.0"
},
"host": "localhost:8082",
"basePath": "/api/",
"paths": {
"/debug/": {
"post": {
"security": [
{
"ApiKeyAuth": []
}
],
"description": "This endpoint allows the users to print the content of",
"produces": [
"application/json"
],
"tags": [
"debug"
],
"summary": "Debug endpoint",
"parameters": [
{
"type": "string",
"description": "Selector",
"name": "selector",
"in": "query"
}
],
"responses": {
"200": {
"description": "Debug dump",
"schema": {
"type": "string"
}
},
"400": {
"description": "Bad Request",
"schema": {
"$ref": "#/definitions/api.ErrorResponse"
}
},
"401": {
"description": "Unauthorized",
"schema": {
"$ref": "#/definitions/api.ErrorResponse"
}
},
"403": {
"description": "Forbidden",
"schema": {
"$ref": "#/definitions/api.ErrorResponse"
}
},
"500": {
"description": "Internal Server Error",
"schema": {
"$ref": "#/definitions/api.ErrorResponse"
}
}
}
}
},
"/free/": {
"post": {
"security": [
{
"ApiKeyAuth": []
}
],
"description": "This endpoint allows the users to free the Buffers from the",
"produces": [
"application/json"
],
"tags": [
"free"
],
"parameters": [
{
"type": "string",
"description": "up to timestamp",
"name": "to",
"in": "query"
}
],
"responses": {
"200": {
"description": "ok",
"schema": {
"type": "string"
}
},
"400": {
"description": "Bad Request",
"schema": {
"$ref": "#/definitions/api.ErrorResponse"
}
},
"401": {
"description": "Unauthorized",
"schema": {
"$ref": "#/definitions/api.ErrorResponse"
}
},
"403": {
"description": "Forbidden",
"schema": {
"$ref": "#/definitions/api.ErrorResponse"
}
},
"500": {
"description": "Internal Server Error",
"schema": {
"$ref": "#/definitions/api.ErrorResponse"
}
}
}
}
},
"/healthcheck/": {
"get": {
"security": [
{
"ApiKeyAuth": []
}
],
"description": "This endpoint allows the users to check if a node is healthy",
"produces": [
"application/json"
],
"tags": [
"healthcheck"
],
"summary": "HealthCheck endpoint",
"parameters": [
{
"type": "string",
"description": "Selector",
"name": "selector",
"in": "query"
}
],
"responses": {
"200": {
"description": "Debug dump",
"schema": {
"type": "string"
}
},
"400": {
"description": "Bad Request",
"schema": {
"$ref": "#/definitions/api.ErrorResponse"
}
},
"401": {
"description": "Unauthorized",
"schema": {
"$ref": "#/definitions/api.ErrorResponse"
}
},
"403": {
"description": "Forbidden",
"schema": {
"$ref": "#/definitions/api.ErrorResponse"
}
},
"500": {
"description": "Internal Server Error",
"schema": {
"$ref": "#/definitions/api.ErrorResponse"
}
}
}
}
},
"/query/": {
"get": {
"security": [
{
"ApiKeyAuth": []
}
],
"description": "This endpoint allows the users to retrieve data from the",
"consumes": [
"application/json"
],
"produces": [
"application/json"
],
"tags": [
"query"
],
"summary": "Query metrics",
"parameters": [
{
"description": "API query payload object",
"name": "request",
"in": "body",
"required": true,
"schema": {
"$ref": "#/definitions/api.ApiQueryRequest"
}
}
],
"responses": {
"200": {
"description": "API query response object",
"schema": {
"$ref": "#/definitions/api.ApiQueryResponse"
}
},
"400": {
"description": "Bad Request",
"schema": {
"$ref": "#/definitions/api.ErrorResponse"
}
},
"401": {
"description": "Unauthorized",
"schema": {
"$ref": "#/definitions/api.ErrorResponse"
}
},
"403": {
"description": "Forbidden",
"schema": {
"$ref": "#/definitions/api.ErrorResponse"
}
},
"500": {
"description": "Internal Server Error",
"schema": {
"$ref": "#/definitions/api.ErrorResponse"
}
}
}
}
},
"/write/": {
"post": {
"security": [
{
"ApiKeyAuth": []
}
],
"consumes": [
"text/plain"
],
"produces": [
"application/json"
],
"parameters": [
{
"type": "string",
"description": "If the lines in the body do not have a cluster tag, use this value instead.",
"name": "cluster",
"in": "query"
}
],
"responses": {
"200": {
"description": "ok",
"schema": {
"type": "string"
}
},
"400": {
"description": "Bad Request",
"schema": {
"$ref": "#/definitions/api.ErrorResponse"
}
},
"401": {
"description": "Unauthorized",
"schema": {
"$ref": "#/definitions/api.ErrorResponse"
}
},
"403": {
"description": "Forbidden",
"schema": {
"$ref": "#/definitions/api.ErrorResponse"
}
},
"500": {
"description": "Internal Server Error",
"schema": {
"$ref": "#/definitions/api.ErrorResponse"
}
}
}
}
}
},
"definitions": {
"api.ApiMetricData": {
"type": "object",
"properties": {
"avg": {
"type": "number"
},
"data": {
"type": "array",
"items": {
"type": "number"
}
},
"error": {
"type": "string"
},
"from": {
"type": "integer"
},
"max": {
"type": "number"
},
"min": {
"type": "number"
},
"resolution": {
"type": "integer"
},
"to": {
"type": "integer"
}
}
},
"api.ApiQuery": {
"type": "object",
"properties": {
"aggreg": {
"type": "boolean"
},
"host": {
"type": "string"
},
"metric": {
"type": "string"
},
"resolution": {
"type": "integer"
},
"scale-by": {
"type": "number"
},
"subtype": {
"type": "string"
},
"subtype-ids": {
"type": "array",
"items": {
"type": "string"
}
},
"type": {
"type": "string"
},
"type-ids": {
"type": "array",
"items": {
"type": "string"
}
}
}
},
"api.ApiQueryRequest": {
"type": "object",
"properties": {
"cluster": {
"type": "string"
},
"for-all-nodes": {
"type": "array",
"items": {
"type": "string"
}
},
"from": {
"type": "integer"
},
"queries": {
"type": "array",
"items": {
"$ref": "#/definitions/api.ApiQuery"
}
},
"to": {
"type": "integer"
},
"with-data": {
"type": "boolean"
},
"with-padding": {
"type": "boolean"
},
"with-stats": {
"type": "boolean"
}
}
},
"api.ApiQueryResponse": {
"type": "object",
"properties": {
"queries": {
"type": "array",
"items": {
"$ref": "#/definitions/api.ApiQuery"
}
},
"results": {
"type": "array",
"items": {
"type": "array",
"items": {
"$ref": "#/definitions/api.ApiMetricData"
}
}
}
}
},
"api.ErrorResponse": {
"type": "object",
"properties": {
"error": {
"description": "Error Message",
"type": "string"
},
"status": {
"description": "Statustext of Errorcode",
"type": "string"
}
}
}
},
"securityDefinitions": {
"ApiKeyAuth": {
"type": "apiKey",
"name": "X-Auth-Token",
"in": "header"
}
}
}

api/swagger.yaml

@@ -1,294 +0,0 @@
basePath: /api/
definitions:
api.ApiMetricData:
properties:
avg:
type: number
data:
items:
type: number
type: array
error:
type: string
from:
type: integer
max:
type: number
min:
type: number
resolution:
type: integer
to:
type: integer
type: object
api.ApiQuery:
properties:
aggreg:
type: boolean
host:
type: string
metric:
type: string
resolution:
type: integer
scale-by:
type: number
subtype:
type: string
subtype-ids:
items:
type: string
type: array
type:
type: string
type-ids:
items:
type: string
type: array
type: object
api.ApiQueryRequest:
properties:
cluster:
type: string
for-all-nodes:
items:
type: string
type: array
from:
type: integer
queries:
items:
$ref: '#/definitions/api.ApiQuery'
type: array
to:
type: integer
with-data:
type: boolean
with-padding:
type: boolean
with-stats:
type: boolean
type: object
api.ApiQueryResponse:
properties:
queries:
items:
$ref: '#/definitions/api.ApiQuery'
type: array
results:
items:
items:
$ref: '#/definitions/api.ApiMetricData'
type: array
type: array
type: object
api.ErrorResponse:
properties:
error:
description: Error Message
type: string
status:
description: Statustext of Errorcode
type: string
type: object
host: localhost:8082
info:
contact:
email: support@clustercockpit.org
name: ClusterCockpit Project
url: https://clustercockpit.org
description: API for cc-metric-store
license:
name: MIT License
url: https://opensource.org/licenses/MIT
title: cc-metric-store REST API
version: 1.0.0
paths:
/debug/:
post:
description: This endpoint allows the users to print the content of
parameters:
- description: Selector
in: query
name: selector
type: string
produces:
- application/json
responses:
"200":
description: Debug dump
schema:
type: string
"400":
description: Bad Request
schema:
$ref: '#/definitions/api.ErrorResponse'
"401":
description: Unauthorized
schema:
$ref: '#/definitions/api.ErrorResponse'
"403":
description: Forbidden
schema:
$ref: '#/definitions/api.ErrorResponse'
"500":
description: Internal Server Error
schema:
$ref: '#/definitions/api.ErrorResponse'
security:
- ApiKeyAuth: []
summary: Debug endpoint
tags:
- debug
/free/:
post:
description: This endpoint allows the users to free the Buffers from the
parameters:
- description: up to timestamp
in: query
name: to
type: string
produces:
- application/json
responses:
"200":
description: ok
schema:
type: string
"400":
description: Bad Request
schema:
$ref: '#/definitions/api.ErrorResponse'
"401":
description: Unauthorized
schema:
$ref: '#/definitions/api.ErrorResponse'
"403":
description: Forbidden
schema:
$ref: '#/definitions/api.ErrorResponse'
"500":
description: Internal Server Error
schema:
$ref: '#/definitions/api.ErrorResponse'
security:
- ApiKeyAuth: []
tags:
- free
/healthcheck/:
get:
description: This endpoint allows the users to check if a node is healthy
parameters:
- description: Selector
in: query
name: selector
type: string
produces:
- application/json
responses:
"200":
description: Debug dump
schema:
type: string
"400":
description: Bad Request
schema:
$ref: '#/definitions/api.ErrorResponse'
"401":
description: Unauthorized
schema:
$ref: '#/definitions/api.ErrorResponse'
"403":
description: Forbidden
schema:
$ref: '#/definitions/api.ErrorResponse'
"500":
description: Internal Server Error
schema:
$ref: '#/definitions/api.ErrorResponse'
security:
- ApiKeyAuth: []
summary: HealthCheck endpoint
tags:
- healthcheck
/query/:
get:
consumes:
- application/json
description: This endpoint allows the users to retrieve data from the
parameters:
- description: API query payload object
in: body
name: request
required: true
schema:
$ref: '#/definitions/api.ApiQueryRequest'
produces:
- application/json
responses:
"200":
description: API query response object
schema:
$ref: '#/definitions/api.ApiQueryResponse'
"400":
description: Bad Request
schema:
$ref: '#/definitions/api.ErrorResponse'
"401":
description: Unauthorized
schema:
$ref: '#/definitions/api.ErrorResponse'
"403":
description: Forbidden
schema:
$ref: '#/definitions/api.ErrorResponse'
"500":
description: Internal Server Error
schema:
$ref: '#/definitions/api.ErrorResponse'
security:
- ApiKeyAuth: []
summary: Query metrics
tags:
- query
/write/:
post:
consumes:
- text/plain
parameters:
- description: If the lines in the body do not have a cluster tag, use this
value instead.
in: query
name: cluster
type: string
produces:
- application/json
responses:
"200":
description: ok
schema:
type: string
"400":
description: Bad Request
schema:
$ref: '#/definitions/api.ErrorResponse'
"401":
description: Unauthorized
schema:
$ref: '#/definitions/api.ErrorResponse'
"403":
description: Forbidden
schema:
$ref: '#/definitions/api.ErrorResponse'
"500":
description: Internal Server Error
schema:
$ref: '#/definitions/api.ErrorResponse'
security:
- ApiKeyAuth: []
securityDefinitions:
ApiKeyAuth:
in: header
name: X-Auth-Token
type: apiKey
swagger: "2.0"

archive.go

@@ -0,0 +1,601 @@
package main
import (
"archive/zip"
"bufio"
"encoding/json"
"errors"
"fmt"
"io"
"io/fs"
"log"
"os"
"path"
"path/filepath"
"runtime"
"sort"
"strconv"
"strings"
"sync"
"sync/atomic"
)
// Whenever changed, update MarshalJSON as well!
type CheckpointMetrics struct {
Frequency int64 `json:"frequency"`
Start int64 `json:"start"`
Unit string `json:"unit,omitempty"`
Data []Float `json:"data"`
}
// As `Float` implements a custom MarshalJSON() function,
// serializing an array of such types has more overhead
// than one would assume (because of extra allocations, interfaces and so on).
func (cm *CheckpointMetrics) MarshalJSON() ([]byte, error) {
buf := make([]byte, 0, 128+len(cm.Data)*8)
buf = append(buf, `{"frequency":`...)
buf = strconv.AppendInt(buf, cm.Frequency, 10)
buf = append(buf, `,"start":`...)
buf = strconv.AppendInt(buf, cm.Start, 10)
buf = append(buf, fmt.Sprintf(`,"unit":"%s"`, cm.Unit)...)
buf = append(buf, `,"data":[`...)
for i, x := range cm.Data {
if i != 0 {
buf = append(buf, ',')
}
if x.IsNaN() {
buf = append(buf, `null`...)
} else {
buf = strconv.AppendFloat(buf, float64(x), 'f', 1, 32)
}
}
buf = append(buf, `]}`...)
return buf, nil
}
type CheckpointFile struct {
From int64 `json:"from"`
To int64 `json:"to"`
Metrics map[string]*CheckpointMetrics `json:"metrics"`
Children map[string]*CheckpointFile `json:"children"`
}
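// Illustrative only (not from the original source): a checkpoint file built
// from the two structs above could look like this, with hypothetical metric
// names and values:
//
//	{
//	  "from": 1690000000, "to": 1690003600,
//	  "metrics": {
//	    "flops_any": {"frequency": 15, "start": 1690000000, "unit": "F/s", "data": [42.0, null]}
//	  },
//	  "children": {
//	    "cpu0": {"from": 1690000000, "to": 1690003600, "metrics": {}, "children": {}}
//	  }
//	}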
var ErrNoNewData error = errors.New("all data already archived")
var NumWorkers int = 4
func init() {
maxWorkers := 10
NumWorkers = runtime.NumCPU()/2 + 1
if NumWorkers > maxWorkers {
NumWorkers = maxWorkers
}
}
// Metrics stored at the lowest 2 levels are not stored away (root and cluster)!
// On a per-host basis a new JSON file is created. I have no idea if this will scale.
// The good thing: Only one host at a time is locked, so this function can run
// in parallel to writes/reads.
func (m *MemoryStore) ToCheckpoint(dir string, from, to int64) (int, error) {
levels := make([]*level, 0)
selectors := make([][]string, 0)
m.root.lock.RLock()
for sel1, l1 := range m.root.children {
l1.lock.RLock()
for sel2, l2 := range l1.children {
levels = append(levels, l2)
selectors = append(selectors, []string{sel1, sel2})
}
l1.lock.RUnlock()
}
m.root.lock.RUnlock()
type workItem struct {
level *level
dir string
selector []string
}
n, errs := int32(0), int32(0)
var wg sync.WaitGroup
wg.Add(NumWorkers)
work := make(chan workItem, NumWorkers*2)
for worker := 0; worker < NumWorkers; worker++ {
go func() {
defer wg.Done()
for workItem := range work {
if err := workItem.level.toCheckpoint(workItem.dir, from, to, m); err != nil {
if err == ErrNoNewData {
continue
}
log.Printf("error while checkpointing %#v: %s", workItem.selector, err.Error())
atomic.AddInt32(&errs, 1)
} else {
atomic.AddInt32(&n, 1)
}
}
}()
}
for i := 0; i < len(levels); i++ {
dir := path.Join(dir, path.Join(selectors[i]...))
work <- workItem{
level: levels[i],
dir: dir,
selector: selectors[i],
}
}
close(work)
wg.Wait()
if errs > 0 {
return int(n), fmt.Errorf("%d errors happened while creating checkpoints (%d successes)", errs, n)
}
return int(n), nil
}
func (l *level) toCheckpointFile(from, to int64, m *MemoryStore) (*CheckpointFile, error) {
l.lock.RLock()
defer l.lock.RUnlock()
retval := &CheckpointFile{
From: from,
To: to,
Metrics: make(map[string]*CheckpointMetrics),
Children: make(map[string]*CheckpointFile),
}
for metric, minfo := range m.metrics {
b := l.metrics[minfo.offset]
if b == nil {
continue
}
allArchived := true
b.iterFromTo(from, to, func(b *buffer) error {
if !b.archived {
allArchived = false
}
return nil
})
if allArchived {
continue
}
data := make([]Float, (to-from)/b.frequency+1)
data, start, end, err := b.read(from, to, data)
if err != nil {
return nil, err
}
for i := int((end - start) / b.frequency); i < len(data); i++ {
data[i] = NaN
}
retval.Metrics[metric] = &CheckpointMetrics{
Frequency: b.frequency,
Start: start,
Data: data,
Unit: b.unit,
}
}
for name, child := range l.children {
val, err := child.toCheckpointFile(from, to, m)
if err != nil {
return nil, err
}
if val != nil {
retval.Children[name] = val
}
}
if len(retval.Children) == 0 && len(retval.Metrics) == 0 {
return nil, nil
}
return retval, nil
}
func (l *level) toCheckpoint(dir string, from, to int64, m *MemoryStore) error {
cf, err := l.toCheckpointFile(from, to, m)
if err != nil {
return err
}
if cf == nil {
return ErrNoNewData
}
filepath := path.Join(dir, fmt.Sprintf("%d.json", from))
f, err := os.OpenFile(filepath, os.O_CREATE|os.O_WRONLY, 0644)
if err != nil && os.IsNotExist(err) {
err = os.MkdirAll(dir, 0755)
if err == nil {
f, err = os.OpenFile(filepath, os.O_CREATE|os.O_WRONLY, 0644)
}
}
if err != nil {
return err
}
defer f.Close()
bw := bufio.NewWriter(f)
if err = json.NewEncoder(bw).Encode(cf); err != nil {
return err
}
return bw.Flush()
}
// Metrics stored at the lowest 2 levels are not loaded (root and cluster)!
// This function can only be called once and before the very first write or read.
// Different hosts' data is loaded into memory in parallel.
func (m *MemoryStore) FromCheckpoint(dir string, from int64) (int, error) {
var wg sync.WaitGroup
work := make(chan [2]string, NumWorkers)
n, errs := int32(0), int32(0)
wg.Add(NumWorkers)
for worker := 0; worker < NumWorkers; worker++ {
go func() {
defer wg.Done()
for host := range work {
lvl := m.root.findLevelOrCreate(host[:], len(m.metrics))
nn, err := lvl.fromCheckpoint(filepath.Join(dir, host[0], host[1]), from, m)
if err != nil {
// Do not exit here: errors are counted and reported by the caller.
log.Printf("error while loading checkpoints: %s", err.Error())
atomic.AddInt32(&errs, 1)
}
atomic.AddInt32(&n, int32(nn))
}
}()
}
i := 0
clustersDir, err := os.ReadDir(dir)
for _, clusterDir := range clustersDir {
if !clusterDir.IsDir() {
err = errors.New("expected only directories at first level of checkpoints/ directory")
goto done
}
hostsDir, e := os.ReadDir(filepath.Join(dir, clusterDir.Name()))
if e != nil {
err = e
goto done
}
for _, hostDir := range hostsDir {
if !hostDir.IsDir() {
err = errors.New("expected only directories at second level of checkpoints/ directory")
goto done
}
i++
if i%NumWorkers == 0 && i > 100 {
// Forcing garbage collection runs here regularly during the loading of checkpoints
// will decrease the total heap size after loading everything back to memory is done.
// While loading data, the heap will grow fast, so the GC target size will almost
// always double. By forcing GCs here, we can keep it growing more slowly so that
// at the end, less memory is wasted.
runtime.GC()
}
work <- [2]string{clusterDir.Name(), hostDir.Name()}
}
}
done:
close(work)
wg.Wait()
if err != nil {
return int(n), err
}
if errs > 0 {
return int(n), fmt.Errorf("%d errors happened while loading checkpoints (%d successes)", errs, n)
}
return int(n), nil
}
func (l *level) loadFile(cf *CheckpointFile, m *MemoryStore) error {
for name, metric := range cf.Metrics {
n := len(metric.Data)
b := &buffer{
frequency: metric.Frequency,
start: metric.Start,
data: metric.Data[0:n:n], // Space is wasted here :(
unit: metric.Unit,
prev: nil,
next: nil,
archived: true,
}
b.close()
minfo, ok := m.metrics[name]
if !ok {
continue
// return errors.New("Unknown metric: " + name)
}
prev := l.metrics[minfo.offset]
if prev != nil {
if prev.start > b.start {
return errors.New("checkpoint buffers are not in chronological order")
}
b.prev = prev
prev.next = b
}
l.metrics[minfo.offset] = b
}
if len(cf.Children) > 0 && l.children == nil {
l.children = make(map[string]*level)
}
for sel, childCf := range cf.Children {
child, ok := l.children[sel]
if !ok {
child = &level{
metrics: make([]*buffer, len(m.metrics)),
children: nil,
}
l.children[sel] = child
}
if err := child.loadFile(childCf, m); err != nil {
return err
}
}
return nil
}
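// Buffers loaded for the same metric are chained chronologically via the
// prev/next pointers, with l.metrics[offset] always pointing at the most
// recently loaded buffer. A sketch after loading two checkpoint files:
//
//	older buffer <--prev-- newer buffer  (== l.metrics[offset])
//	             --next-->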
func (l *level) fromCheckpoint(dir string, from int64, m *MemoryStore) (int, error) {
direntries, err := os.ReadDir(dir)
if err != nil {
if os.IsNotExist(err) {
return 0, nil
}
return 0, err
}
jsonFiles := make([]fs.DirEntry, 0)
filesLoaded := 0
for _, e := range direntries {
if e.IsDir() {
child := &level{
metrics: make([]*buffer, len(m.metrics)),
children: make(map[string]*level),
}
files, err := child.fromCheckpoint(path.Join(dir, e.Name()), from, m)
filesLoaded += files
if err != nil {
return filesLoaded, err
}
l.children[e.Name()] = child
} else if strings.HasSuffix(e.Name(), ".json") {
jsonFiles = append(jsonFiles, e)
} else {
return filesLoaded, errors.New("unexpected file: " + dir + "/" + e.Name())
}
}
files, err := findFiles(jsonFiles, from, true)
if err != nil {
return filesLoaded, err
}
for _, filename := range files {
f, err := os.Open(path.Join(dir, filename))
if err != nil {
return filesLoaded, err
}
br := bufio.NewReader(f)
cf := &CheckpointFile{}
err = json.NewDecoder(br).Decode(cf)
// Close immediately: a deferred close inside this loop would keep every
// file open until the whole function returns.
f.Close()
if err != nil {
return filesLoaded, err
}
if cf.To != 0 && cf.To < from {
continue
}
if err = l.loadFile(cf, m); err != nil {
return filesLoaded, err
}
filesLoaded += 1
}
return filesLoaded, nil
}
// This will probably get very slow over time!
// A solution could be some sort of index file in which all other files
// and the timespans they cover are listed.
func findFiles(direntries []fs.DirEntry, t int64, findMoreRecentFiles bool) ([]string, error) {
nums := map[string]int64{}
for _, e := range direntries {
ts, err := strconv.ParseInt(strings.TrimSuffix(e.Name(), ".json"), 10, 64)
if err != nil {
return nil, err
}
nums[e.Name()] = ts
}
sort.Slice(direntries, func(i, j int) bool {
a, b := direntries[i], direntries[j]
return nums[a.Name()] < nums[b.Name()]
})
filenames := make([]string, 0)
for i := 0; i < len(direntries); i++ {
e := direntries[i]
ts1 := nums[e.Name()]
if findMoreRecentFiles && t <= ts1 || i == len(direntries)-1 {
filenames = append(filenames, e.Name())
continue
}
enext := direntries[i+1]
ts2 := nums[enext.Name()]
if findMoreRecentFiles {
if ts1 < t && t < ts2 {
filenames = append(filenames, e.Name())
}
} else {
if ts2 < t {
filenames = append(filenames, e.Name())
}
}
}
return filenames, nil
}
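// Worked example (hypothetical directory contents): with entries 100.json,
// 200.json and 300.json,
//
//	findFiles(entries, 250, true)  -> ["200.json", "300.json"]
//	    200.json may still contain data at t=250 (its span is [200,300)),
//	    and the most recent file is always kept.
//	findFiles(entries, 250, false) -> ["100.json", "300.json"]
//	    100.json's span [100,200) lies entirely before t=250; note that the
//	    most recent file is always kept as well, even on this path.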
// ZIP all checkpoint files older than `from` together, write them to the
// `archiveDir` and delete them from the `checkpointsDir`. If `deleteInstead`
// is set, the files are deleted right away instead of being archived.
func ArchiveCheckpoints(checkpointsDir, archiveDir string, from int64, deleteInstead bool) (int, error) {
entries1, err := os.ReadDir(checkpointsDir)
if err != nil {
return 0, err
}
type workItem struct {
cdir, adir string
cluster, host string
}
var wg sync.WaitGroup
n, errs := int32(0), int32(0)
work := make(chan workItem, NumWorkers)
wg.Add(NumWorkers)
for worker := 0; worker < NumWorkers; worker++ {
go func() {
defer wg.Done()
for workItem := range work {
m, err := archiveCheckpoints(workItem.cdir, workItem.adir, from, deleteInstead)
if err != nil {
log.Printf("error while archiving %s/%s: %s", workItem.cluster, workItem.host, err.Error())
atomic.AddInt32(&errs, 1)
}
atomic.AddInt32(&n, int32(m))
}
}()
}
for _, de1 := range entries1 {
entries2, e := os.ReadDir(filepath.Join(checkpointsDir, de1.Name()))
if e != nil {
err = e
}
for _, de2 := range entries2 {
cdir := filepath.Join(checkpointsDir, de1.Name(), de2.Name())
adir := filepath.Join(archiveDir, de1.Name(), de2.Name())
work <- workItem{
adir: adir, cdir: cdir,
cluster: de1.Name(), host: de2.Name(),
}
}
}
close(work)
wg.Wait()
if err != nil {
return int(n), err
}
if errs > 0 {
return int(n), fmt.Errorf("%d errors happened while archiving (%d successes)", errs, n)
}
return int(n), nil
}
// Helper function for `ArchiveCheckpoints`.
func archiveCheckpoints(dir string, archiveDir string, from int64, deleteInstead bool) (int, error) {
entries, err := os.ReadDir(dir)
if err != nil {
return 0, err
}
files, err := findFiles(entries, from, false)
if err != nil {
return 0, err
}
if deleteInstead {
n := 0
for _, checkpoint := range files {
filename := filepath.Join(dir, checkpoint)
if err = os.Remove(filename); err != nil {
return n, err
}
n += 1
}
return n, nil
}
filename := filepath.Join(archiveDir, fmt.Sprintf("%d.zip", from))
f, err := os.OpenFile(filename, os.O_CREATE|os.O_WRONLY, 0644)
if err != nil && os.IsNotExist(err) {
err = os.MkdirAll(archiveDir, 0755)
if err == nil {
f, err = os.OpenFile(filename, os.O_CREATE|os.O_WRONLY, 0644)
}
}
if err != nil {
return 0, err
}
defer f.Close()
bw := bufio.NewWriter(f)
defer bw.Flush()
zw := zip.NewWriter(bw)
defer zw.Close()
n := 0
for _, checkpoint := range files {
filename := filepath.Join(dir, checkpoint)
r, err := os.Open(filename)
if err != nil {
return n, err
}
w, err := zw.Create(checkpoint)
if err != nil {
r.Close()
return n, err
}
_, err = io.Copy(w, r)
// Close immediately: a deferred close inside this loop would keep every
// checkpoint file open until the function returns.
r.Close()
if err != nil {
return n, err
}
if err = os.Remove(filename); err != nil {
return n, err
}
n += 1
}
return n, nil
}
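// Usage sketch for the exported entry point (it mirrors the periodic caller
// in cc-metric-store.go; the cutoff choice below is an assumption):
//
//	t := time.Now().Add(-d) // d = parsed conf.Archive.Interval
//	n, err := ArchiveCheckpoints(conf.Checkpoints.RootDir, conf.Archive.RootDir,
//	    t.Unix(), conf.Archive.DeleteInstead)
//	if err != nil {
//	    log.Printf("archiving failed: %s\n", err.Error())
//	}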

archive_test.go (new file, 54 lines)

@@ -0,0 +1,54 @@
package main
import (
"os"
"path/filepath"
"testing"
"time"
)
func TestArchiveReadDir(t *testing.T) {
memoryStore = NewMemoryStore(map[string]MetricConfig{
"flops_dp": {Frequency: 5},
"flops_sp": {Frequency: 5},
})
data := `{"from":1692628930,"to":1692629003,"metrics":{},"children":{"hwthread0":{"from":1692628930,"to":1692629003,"metrics":{"cpi":{"frequency":5,"start":1692628930,"unit":"","data":[2.2,0.7,2.1,1.6,2.3,0.7,0.9,1.8,0.7,1.7,1.5,1.8,0.9,1.6]},"flops_any":{"frequency":5,"start":1692628930,"unit":"MFLOP/s","data":[0.3,0.0,0.0,0.0,0.0,28.2,5.1,4.7,0.1,0.3,0.0,0.2,0.0,0.0]},"flops_dp":{"frequency":5,"start":1692628931,"unit":"MFLOP/s","data":[0.0,0.0,0.0,0.0,0.0,0.5,0.1,0.0,0.0,0.0,0.0,0.0,0.0,0.0]},"flops_sp":{"frequency":5,"start":1692628930,"unit":"MFLOP/s","data":[0.3,0.0,0.0,0.0,0.0,27.2,4.8,4.7,0.0,0.3,0.0,0.2,0.0,0.0]}},"children":{}},"hwthread1":{"from":1692628930,"to":1692629003,"metrics":{"cpi":{"frequency":5,"start":1692628930,"unit":"","data":[0.9,4.2,4.2,3.8,3.1,0.7,1.3,0.7,1.8,1.6,5.0,3.0,4.9,5.2]},"flops_any":{"frequency":5,"start":1692628930,"unit":"MFLOP/s","data":[0.5,0.0,0.0,0.0,0.0,49.8,5.0,0.1,0.0,0.0,0.0,0.0,0.0,0.0]},"flops_dp":{"frequency":5,"start":1692628931,"unit":"MFLOP/s","data":[0.0,0.0,0.0,0.0,0.0,0.4,0.1,0.0,0.0,0.0,0.0,0.0,0.0,0.0]},"flops_sp":{"frequency":5,"start":1692628930,"unit":"MFLOP/s","data":[0.5,0.0,0.0,0.0,0.0,49.1,4.9,0.1,0.0,0.0,0.0,0.0,0.0,0.0]}},"children":{}},"hwthread2":{"from":1692628930,"to":1692629003,"metrics":{"cpi":{"frequency":5,"start":1692628930,"unit":"","data":[1.8,0.9,2.2,2.1,4.3,1.4,0.8,1.0,1.1,1.6,3.9,4.9,2.7,5.1]},"flops_any":{"frequency":5,"start":1692628930,"unit":"MFLOP/s","data":[0.0,0.0,0.0,0.0,0.0,16.2,4.4,0.0,0.1,0.0,0.0,0.0,0.0,0.0]},"flops_dp":{"frequency":5,"start":1692628931,"unit":"MFLOP/s","data":[0.0,0.0,0.0,0.0,0.0,0.1,0.2,0.0,0.0,0.0,0.0,0.0,0.0,0.0]},"flops_sp":{"frequency":5,"start":1692628930,"unit":"MFLOP/s","data":[0.0,0.0,0.0,0.0,0.0,16.1,3.9,0.0,0.1,0.0,0.0,0.0,0.0,0.0]}},"children":{}},"hwthread3":{"from":1692628930,"to":1692629003,"metrics":{"cpi":{"frequency":5,"start":1692628930,"unit":"","data":[2.0,0.6,6.0,3.5,2.3,0.9,1.2,1.2,1.1,1.0,2.0,0.7,1.1,3.6]},"flops_any":{"frequency":5,"start":1692628930,"unit":"MFLOP/s","data":[0.0,0.0,0.0,0.0,0.0,1.6,11.5,0.0,0.0,0.0,0.0,0.2,0.0,0.0]},"flops_dp":{"frequency":5,"start":1692628931,"unit":"MFLOP/s","data":[0.0,0.0,0.0,0.0,0.0,0.1,0.4,0.0,0.0,0.0,0.0,0.1,0.0,0.0]},"flops_sp":{"frequency":5,"start":1692628930,"unit":"MFLOP/s","data":[0.0,0.0,0.0,0.0,0.0,1.4,10.6,0.0,0.0,0.0,0.0,0.0,0.0,0.0]}},"children":{}},"hwthread4":{"from":1692628930,"to":1692629003,"metrics":{"cpi":{"frequency":5,"start":1692628930,"unit":"","data":[1.0,1.5,1.3,0.9,0.8,0.8,1.2,1.2,1.4,1.0,0.8,2.3,2.7,0.8]},"flops_any":{"frequency":5,"start":1692628930,"unit":"MFLOP/s","data":[0.8,0.0,0.0,0.0,0.0,3.4,1.1,2.2,0.4,0.0,0.0,0.0,0.0,0.0]},"flops_dp":{"frequency":5,"start":1692628931,"unit":"MFLOP/s","data":[0.0,0.0,0.0,0.0,0.0,0.7,0.3,0.2,0.0,0.0,0.0,0.0,0.0,0.0]},"flops_sp":{"frequency":5,"start":1692628930,"unit":"MFLOP/s","data":[0.8,0.0,0.0,0.0,0.0,2.0,0.5,1.8,0.4,0.0,0.0,0.0,0.0,0.0]}},"children":{}},"hwthread5":{"from":1692628930,"to":1692629003,"metrics":{"cpi":{"frequency":5,"start":1692628930,"unit":"","data":[0.9,2.3,1.1,0.7,0.8,1.5,1.0,0.9,0.9,1.0,0.8,1.1,1.0,2.0]},"flops_any":{"frequency":5,"start":1692628930,"unit":"MFLOP/s","data":[0.5,0.0,1.9,0.1,0.3,3.0,13.8,0.0,1.7,1.1,0.0,0.0,0.5,1.0]},"flops_dp":{"frequency":5,"start":1692628931,"unit":"MFLOP/s","data":[0.3,0.0,0.1,0.0,0.2,0.1,0.9,0.0,0.8,0.6,0.0,0.0,0.0,0.0]},"flops_sp":{"frequency":5,"start":1692628930,"unit":"MFLOP/s","data":[0.0,0.0,1.7,0.0,0.0,2.9,12.0,0.0,0.0,0.0,0.0,0.0,0.5,1.0]}},"children":{}},"hwthread6":{"from":1692628930,"to":1692629003,"metrics":{"cpi":{"frequency":5,"start":1692628930,"unit":"","data":[1.4,0.8,0.8,1.2,1.4,0.8,1.2,1.1,1.5,0.9,1.5,1.5,1.4,0.8]},"flops_any":{"frequency":5,"start":1692628930,"unit":"MFLOP/s","data":[0.0,0.1,0.0,3.9,10.6,3.0,1.4,7.0,0.0,1.0,0.2,0.0,0.0,0.0]},"flops_dp":{"frequency":5,"start":1692628931,"unit":"MFLOP/s","data":[0.0,0.1,0.0,0.0,0.0,0.1,0.1,0.2,0.0,0.4,0.0,0.0,0.0,0.0]},"flops_sp":{"frequency":5,"start":1692628930,"unit":"MFLOP/s","data":[0.0,0.0,0.0,3.9,10.6,2.9,1.2,6.6,0.0,0.1,0.2,0.0,0.0,0.0]}},"children":{}},"hwthread7":{"from":1692628930,"to":1692629003,"metrics":{"cpi":{"frequency":5,"start":1692628930,"unit":"","data":[1.2,1.2,1.3,0.8,1.2,1.5,1.0,0.7,1.7,1.8,3.5,0.7,1.5,4.6]},"flops_any":{"frequency":5,"start":1692628930,"unit":"MFLOP/s","data":[0.0,0.0,0.0,0.3,0.0,2.3,4.7,0.0,12.5,0.7,0.2,0.2,0.0,0.0]},"flops_dp":{"frequency":5,"start":1692628931,"unit":"MFLOP/s","data":[0.0,0.0,0.0,0.0,0.0,0.1,0.9,0.0,0.0,0.0,0.0,0.0,0.0,0.0]},"flops_sp":{"frequency":5,"start":1692628930,"unit":"MFLOP/s","data":[0.0,0.0,0.0,0.3,0.0,2.2,2.8,0.0,12.5,0.7,0.2,0.2,0.0,0.0]}},"children":{}}}}`
tmpdir := t.TempDir()
folder := filepath.Join(tmpdir, "testcluster", "nuc")
filename := filepath.Join(folder, "1692628930.json")
err := os.MkdirAll(folder, 0755)
if err != nil {
t.Error(err)
}
f, err := os.Create(filename)
if err != nil {
t.Error(err)
}
f.WriteString(data)
f.Close()
startupTime := time.Unix(1692628930, 0)
_, err = memoryStore.FromCheckpoint(tmpdir, startupTime.Unix())
if err != nil {
t.Error(err)
return
}
querydata := ApiMetricData{}
querysel := Selector{
{String: "testcluster"},
{String: "nuc"},
{String: "hwthread0"}}
querydata.Data, querydata.From, querydata.To, querydata.Unit, err = memoryStore.Read(querysel, "flops_dp", 1692628930, time.Now().Unix())
if err != nil {
t.Error(err)
return
}
if querydata.Unit != "MFLOP/s" {
t.Errorf("Metric flops_dp does not have the right unit")
return
}
}

cc-metric-store.go (new file, 349 lines)

@@ -0,0 +1,349 @@
package main
import (
"bufio"
"context"
"encoding/json"
"flag"
"fmt"
"io"
"log"
"os"
"os/signal"
"runtime"
"runtime/debug"
"sync"
"syscall"
"time"
"github.com/google/gops/agent"
)
// For aggregation over multiple values at different cpus/sockets/..., not time!
type AggregationStrategy int
const (
NoAggregation AggregationStrategy = iota
SumAggregation
AvgAggregation
)
func (as *AggregationStrategy) UnmarshalJSON(data []byte) error {
var str string
if err := json.Unmarshal(data, &str); err != nil {
return err
}
switch str {
case "":
*as = NoAggregation
case "sum":
*as = SumAggregation
case "avg":
*as = AvgAggregation
default:
return fmt.Errorf("invalid aggregation strategy: %#v", str)
}
return nil
}
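// How the strategies map from JSON (values as they appear in the "metrics"
// section of config.json):
//
//	"aggregation": "sum" -> SumAggregation
//	"aggregation": "avg" -> AvgAggregation
//	"aggregation": null  -> NoAggregation (json.Unmarshal leaves the string
//	                        empty for null, so this hits the "" case)
//	anything else        -> error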
type MetricConfig struct {
// Interval in seconds at which measurements will arrive.
Frequency int64 `json:"frequency"`
// Can be 'sum', 'avg' or null. Describes how to aggregate metrics from the same timestep over the hierarchy.
Aggregation AggregationStrategy `json:"aggregation"`
// Private, used internally...
offset int
}
type HttpConfig struct {
// Address to bind to, for example "0.0.0.0:8081"
Address string `json:"address"`
// If not the empty string, use https with this as the certificate file
CertFile string `json:"https-cert-file"`
// If not the empty string, use https with this as the key file
KeyFile string `json:"https-key-file"`
}
type NatsConfig struct {
// Address of the nats server
Address string `json:"address"`
// Username/Password, optional
Username string `json:"username"`
Password string `json:"password"`
Subscriptions []struct {
// Channel name
SubscribeTo string `json:"subscribe-to"`
// Allow lines without a cluster tag, use this as default, optional
ClusterTag string `json:"cluster-tag"`
} `json:"subscriptions"`
}
type Config struct {
Metrics map[string]MetricConfig `json:"metrics"`
RetentionInMemory string `json:"retention-in-memory"`
Nats []*NatsConfig `json:"nats"`
JwtPublicKey string `json:"jwt-public-key"`
HttpConfig *HttpConfig `json:"http-api"`
Checkpoints struct {
Interval string `json:"interval"`
RootDir string `json:"directory"`
Restore string `json:"restore"`
} `json:"checkpoints"`
Archive struct {
Interval string `json:"interval"`
RootDir string `json:"directory"`
DeleteInstead bool `json:"delete-instead"`
} `json:"archive"`
Debug struct {
EnableGops bool `json:"gops"`
DumpToFile string `json:"dump-to-file"`
} `json:"debug"`
}
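// A minimal configuration matching this struct (abridged from the config.json
// added in this commit; omitted keys fall back to their zero values):
//
//	{
//	  "metrics": { "flops_any": { "frequency": 15, "aggregation": "sum" } },
//	  "checkpoints": { "interval": "12h", "directory": "./var/checkpoints", "restore": "48h" },
//	  "archive": { "interval": "168h", "directory": "./var/archive" },
//	  "http-api": { "address": "0.0.0.0:8081" },
//	  "retention-in-memory": "48h"
//	}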
var conf Config
var memoryStore *MemoryStore = nil
var lastCheckpoint time.Time
var debugDumpLock sync.Mutex
var debugDump io.Writer = io.Discard
func loadConfiguration(file string) Config {
var config Config
configFile, err := os.Open(file)
if err != nil {
log.Fatal(err)
}
defer configFile.Close()
dec := json.NewDecoder(configFile)
dec.DisallowUnknownFields()
if err := dec.Decode(&config); err != nil {
log.Fatal(err)
}
return config
}
func intervals(wg *sync.WaitGroup, ctx context.Context) {
wg.Add(3)
// go func() {
// defer wg.Done()
// ticks := time.Tick(30 * time.Minute)
// for {
// select {
// case <-ctx.Done():
// return
// case <-ticks:
// runtime.GC()
// }
// }
// }()
go func() {
defer wg.Done()
d, err := time.ParseDuration(conf.RetentionInMemory)
if err != nil {
log.Fatal(err)
}
if d <= 0 {
return
}
ticks := time.Tick(d / 2)
for {
select {
case <-ctx.Done():
return
case <-ticks:
t := time.Now().Add(-d)
log.Printf("start freeing buffers (older than %s)...\n", t.Format(time.RFC3339))
freed, err := memoryStore.Free(nil, t.Unix())
if err != nil {
log.Printf("freeing up buffers failed: %s\n", err.Error())
} else {
log.Printf("done: %d buffers freed\n", freed)
}
}
}
}()
lastCheckpoint = time.Now()
go func() {
defer wg.Done()
d, err := time.ParseDuration(conf.Checkpoints.Interval)
if err != nil {
log.Fatal(err)
}
if d <= 0 {
return
}
ticks := time.Tick(d)
for {
select {
case <-ctx.Done():
return
case <-ticks:
log.Printf("start checkpointing (starting at %s)...\n", lastCheckpoint.Format(time.RFC3339))
now := time.Now()
n, err := memoryStore.ToCheckpoint(conf.Checkpoints.RootDir,
lastCheckpoint.Unix(), now.Unix())
if err != nil {
log.Printf("checkpointing failed: %s\n", err.Error())
} else {
log.Printf("done: %d checkpoint files created\n", n)
lastCheckpoint = now
}
}
}
}()
go func() {
defer wg.Done()
d, err := time.ParseDuration(conf.Archive.Interval)
if err != nil {
log.Fatal(err)
}
if d <= 0 {
return
}
ticks := time.Tick(d)
for {
select {
case <-ctx.Done():
return
case <-ticks:
t := time.Now().Add(-d)
log.Printf("start archiving checkpoints (older than %s)...\n", t.Format(time.RFC3339))
n, err := ArchiveCheckpoints(conf.Checkpoints.RootDir, conf.Archive.RootDir, t.Unix(), conf.Archive.DeleteInstead)
if err != nil {
log.Printf("archiving failed: %s\n", err.Error())
} else {
log.Printf("done: %d files zipped and moved to archive\n", n)
}
}
}
}()
}
func main() {
var configFile string
var enableGopsAgent bool
flag.StringVar(&configFile, "config", "./config.json", "configuration file")
flag.BoolVar(&enableGopsAgent, "gops", false, "Listen via github.com/google/gops/agent")
flag.Parse()
startupTime := time.Now()
conf = loadConfiguration(configFile)
memoryStore = NewMemoryStore(conf.Metrics)
if enableGopsAgent || conf.Debug.EnableGops {
if err := agent.Listen(agent.Options{}); err != nil {
log.Fatal(err)
}
}
if conf.Debug.DumpToFile != "" {
f, err := os.Create(conf.Debug.DumpToFile)
if err != nil {
log.Fatal(err)
}
debugDump = f
}
d, err := time.ParseDuration(conf.Checkpoints.Restore)
if err != nil {
log.Fatal(err)
}
restoreFrom := startupTime.Add(-d)
log.Printf("Loading checkpoints newer than %s\n", restoreFrom.Format(time.RFC3339))
files, err := memoryStore.FromCheckpoint(conf.Checkpoints.RootDir, restoreFrom.Unix())
loadedData := memoryStore.SizeInBytes() / 1024 / 1024 // In MB
if err != nil {
log.Fatalf("Loading checkpoints failed: %s\n", err.Error())
} else {
log.Printf("Checkpoints loaded (%d files, %d MB, that took %fs)\n", files, loadedData, time.Since(startupTime).Seconds())
}
// Try to use less memory by forcing a GC run here and then
// lowering the target percentage. The default of 100 means
// that a GC is only triggered once the ratio of new allocations
// exceeds the previously active heap.
// Forcing a GC here will set the "previously active heap"
// to a minimum.
runtime.GC()
if loadedData > 1000 && os.Getenv("GOGC") == "" {
debug.SetGCPercent(10)
}
ctx, shutdown := context.WithCancel(context.Background())
var wg sync.WaitGroup
sigs := make(chan os.Signal, 1)
signal.Notify(sigs, syscall.SIGINT, syscall.SIGTERM, syscall.SIGUSR1)
go func() {
for {
sig := <-sigs
if sig == syscall.SIGUSR1 {
memoryStore.DebugDump(bufio.NewWriter(os.Stdout), nil)
continue
}
log.Println("Shuting down...")
shutdown()
}
}()
intervals(&wg, ctx)
wg.Add(1)
go func() {
err := StartApiServer(ctx, conf.HttpConfig)
if err != nil {
log.Fatal(err)
}
wg.Done()
}()
if conf.Nats != nil {
for _, natsConf := range conf.Nats {
// TODO: When multiple nats configs share a URL, do a single connect.
wg.Add(1)
nc := natsConf
go func() {
// err := ReceiveNats(conf.Nats, decodeLine, runtime.NumCPU()-1, ctx)
err := ReceiveNats(nc, decodeLine, 1, ctx)
if err != nil {
log.Fatal(err)
}
wg.Done()
}()
}
}
wg.Wait()
log.Printf("Writing to '%s'...\n", conf.Checkpoints.RootDir)
files, err = memoryStore.ToCheckpoint(conf.Checkpoints.RootDir, lastCheckpoint.Unix(), time.Now().Unix())
if err != nil {
log.Printf("Writing checkpoint failed: %s\n", err.Error())
}
log.Printf("Done! (%d files written)\n", files)
if closer, ok := debugDump.(io.Closer); ok {
if err := closer.Close(); err != nil {
log.Printf("error: %s", err.Error())
}
}
}


@@ -1,181 +0,0 @@
// Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
// All rights reserved.
// Use of this source code is governed by a MIT-style
// license that can be found in the LICENSE file.
package main
import (
"context"
"crypto/tls"
"flag"
"fmt"
"log"
"net"
"net/http"
"os"
"os/signal"
"runtime"
"runtime/debug"
"sync"
"syscall"
"time"
"github.com/ClusterCockpit/cc-metric-store/internal/api"
"github.com/ClusterCockpit/cc-metric-store/internal/avro"
"github.com/ClusterCockpit/cc-metric-store/internal/config"
"github.com/ClusterCockpit/cc-metric-store/internal/memorystore"
"github.com/ClusterCockpit/cc-metric-store/internal/runtimeEnv"
"github.com/google/gops/agent"
httpSwagger "github.com/swaggo/http-swagger"
)
var (
date string
commit string
version string
)
func main() {
var configFile string
var enableGopsAgent, flagVersion, flagDev bool
flag.StringVar(&configFile, "config", "./config.json", "configuration file")
flag.BoolVar(&enableGopsAgent, "gops", false, "Listen via github.com/google/gops/agent")
flag.BoolVar(&flagDev, "dev", false, "Enable development Swagger UI component")
flag.BoolVar(&flagVersion, "version", false, "Show version information and exit")
flag.Parse()
if flagVersion {
fmt.Printf("Version:\t%s\n", version)
fmt.Printf("Git hash:\t%s\n", commit)
fmt.Printf("Build time:\t%s\n", date)
os.Exit(0)
}
startupTime := time.Now()
config.Init(configFile)
memorystore.Init(config.Keys.Metrics)
ms := memorystore.GetMemoryStore()
if enableGopsAgent || config.Keys.Debug.EnableGops {
if err := agent.Listen(agent.Options{}); err != nil {
log.Fatal(err)
}
}
d, err := time.ParseDuration(config.Keys.Checkpoints.Restore)
if err != nil {
log.Fatal(err)
}
restoreFrom := startupTime.Add(-d)
log.Printf("Loading checkpoints newer than %s\n", restoreFrom.Format(time.RFC3339))
files, err := ms.FromCheckpointFiles(config.Keys.Checkpoints.RootDir, restoreFrom.Unix())
loadedData := ms.SizeInBytes() / 1024 / 1024 // In MB
if err != nil {
log.Fatalf("Loading checkpoints failed: %s\n", err.Error())
} else {
log.Printf("Checkpoints loaded (%d files, %d MB, that took %fs)\n", files, loadedData, time.Since(startupTime).Seconds())
}
// Try to use less memory by forcing a GC run here and then
// lowering the target percentage. The default of 100 means
// that a GC is only triggered once the ratio of new allocations
// exceeds the previously active heap.
// Forcing a GC here will set the "previously active heap"
// to a minimum.
runtime.GC()
if loadedData > 1000 && os.Getenv("GOGC") == "" {
debug.SetGCPercent(10)
}
ctx, shutdown := context.WithCancel(context.Background())
var wg sync.WaitGroup
wg.Add(4)
memorystore.Retention(&wg, ctx)
memorystore.Checkpointing(&wg, ctx)
memorystore.Archiving(&wg, ctx)
avro.DataStaging(&wg, ctx)
r := http.NewServeMux()
api.MountRoutes(r)
if flagDev {
log.Print("Enable Swagger UI!")
r.HandleFunc("GET /swagger/", httpSwagger.Handler(
httpSwagger.URL("http://"+config.Keys.HttpConfig.Address+"/swagger/doc.json")))
}
server := &http.Server{
Handler: r,
Addr: config.Keys.HttpConfig.Address,
WriteTimeout: 30 * time.Second,
ReadTimeout: 30 * time.Second,
}
// Start http or https server
listener, err := net.Listen("tcp", config.Keys.HttpConfig.Address)
if err != nil {
log.Fatalf("starting http listener failed: %v", err)
}
if config.Keys.HttpConfig.CertFile != "" && config.Keys.HttpConfig.KeyFile != "" {
cert, err := tls.LoadX509KeyPair(config.Keys.HttpConfig.CertFile, config.Keys.HttpConfig.KeyFile)
if err != nil {
log.Fatalf("loading X509 keypair failed: %v", err)
}
listener = tls.NewListener(listener, &tls.Config{
Certificates: []tls.Certificate{cert},
CipherSuites: []uint16{
tls.TLS_ECDHE_RSA_WITH_AES_128_GCM_SHA256,
tls.TLS_ECDHE_ECDSA_WITH_AES_128_GCM_SHA256,
},
MinVersion: tls.VersionTLS12,
PreferServerCipherSuites: true,
})
fmt.Printf("HTTPS server listening at %s...\n", config.Keys.HttpConfig.Address)
} else {
fmt.Printf("HTTP server listening at %s...\n", config.Keys.HttpConfig.Address)
}
wg.Add(1)
go func() {
defer wg.Done()
if err = server.Serve(listener); err != nil && err != http.ErrServerClosed {
log.Fatalf("starting server failed: %v", err)
}
}()
wg.Add(1)
sigs := make(chan os.Signal, 1)
signal.Notify(sigs, syscall.SIGINT, syscall.SIGTERM)
go func() {
defer wg.Done()
<-sigs
runtimeEnv.SystemdNotifiy(false, "Shutting down ...")
server.Shutdown(context.Background())
shutdown()
memorystore.Shutdown()
}()
if config.Keys.Nats != nil {
for _, natsConf := range config.Keys.Nats {
// TODO: When multiple nats configs share a URL, do a single connect.
wg.Add(1)
nc := natsConf
go func() {
// err := ReceiveNats(conf.Nats, decodeLine, runtime.NumCPU()-1, ctx)
err := api.ReceiveNats(nc, ms, 1, ctx)
if err != nil {
log.Fatal(err)
}
wg.Done()
}()
}
}
runtimeEnv.SystemdNotifiy(true, "running")
wg.Wait()
log.Print("Graceful shutdown completed!")
}

config.json (new file, 27 lines)

@@ -0,0 +1,27 @@
{
"metrics": {
"flops_any": { "frequency": 15, "aggregation": "sum" },
"flops_dp": { "frequency": 15, "aggregation": "sum" },
"flops_sp": { "frequency": 15, "aggregation": "sum" },
"mem_bw": { "frequency": 15, "aggregation": "sum" },
"load_one": { "frequency": 15, "aggregation": null },
"load_five": { "frequency": 15, "aggregation": null }
},
"checkpoints": {
"interval": "12h",
"directory": "./var/checkpoints",
"restore": "48h"
},
"archive": {
"interval": "168h",
"directory": "./var/archive"
},
"http-api": {
"address": "0.0.0.0:8081",
"https-cert-file": null,
"https-key-file": null
},
"retention-in-memory": "48h",
"nats": null,
"jwt-public-key": "kzfYrYy+TzpanWZHJ5qSdMj5uKUWgq74BWhQG6copP0="
}


@@ -1,196 +0,0 @@
{
"metrics": {
"debug_metric": {
"frequency": 60,
"aggregation": "avg"
},
"clock": {
"frequency": 60,
"aggregation": "avg"
},
"cpu_idle": {
"frequency": 60,
"aggregation": "avg"
},
"cpu_iowait": {
"frequency": 60,
"aggregation": "avg"
},
"cpu_irq": {
"frequency": 60,
"aggregation": "avg"
},
"cpu_system": {
"frequency": 60,
"aggregation": "avg"
},
"cpu_user": {
"frequency": 60,
"aggregation": "avg"
},
"nv_mem_util": {
"frequency": 60,
"aggregation": "avg"
},
"nv_temp": {
"frequency": 60,
"aggregation": "avg"
},
"nv_sm_clock": {
"frequency": 60,
"aggregation": "avg"
},
"acc_utilization": {
"frequency": 60,
"aggregation": "avg"
},
"acc_mem_used": {
"frequency": 60,
"aggregation": "sum"
},
"acc_power": {
"frequency": 60,
"aggregation": "sum"
},
"flops_any": {
"frequency": 60,
"aggregation": "sum"
},
"flops_dp": {
"frequency": 60,
"aggregation": "sum"
},
"flops_sp": {
"frequency": 60,
"aggregation": "sum"
},
"ib_recv": {
"frequency": 60,
"aggregation": "sum"
},
"ib_xmit": {
"frequency": 60,
"aggregation": "sum"
},
"ib_recv_pkts": {
"frequency": 60,
"aggregation": "sum"
},
"ib_xmit_pkts": {
"frequency": 60,
"aggregation": "sum"
},
"cpu_power": {
"frequency": 60,
"aggregation": "sum"
},
"core_power": {
"frequency": 60,
"aggregation": "sum"
},
"mem_power": {
"frequency": 60,
"aggregation": "sum"
},
"ipc": {
"frequency": 60,
"aggregation": "avg"
},
"cpu_load": {
"frequency": 60,
"aggregation": null
},
"lustre_close": {
"frequency": 60,
"aggregation": null
},
"lustre_open": {
"frequency": 60,
"aggregation": null
},
"lustre_statfs": {
"frequency": 60,
"aggregation": null
},
"lustre_read_bytes": {
"frequency": 60,
"aggregation": null
},
"lustre_write_bytes": {
"frequency": 60,
"aggregation": null
},
"net_bw": {
"frequency": 60,
"aggregation": null
},
"file_bw": {
"frequency": 60,
"aggregation": null
},
"mem_bw": {
"frequency": 60,
"aggregation": "sum"
},
"mem_cached": {
"frequency": 60,
"aggregation": null
},
"mem_used": {
"frequency": 60,
"aggregation": null
},
"net_bytes_in": {
"frequency": 60,
"aggregation": null
},
"net_bytes_out": {
"frequency": 60,
"aggregation": null
},
"nfs4_read": {
"frequency": 60,
"aggregation": null
},
"nfs4_total": {
"frequency": 60,
"aggregation": null
},
"nfs4_write": {
"frequency": 60,
"aggregation": null
},
"vectorization_ratio": {
"frequency": 60,
"aggregation": "avg"
}
},
"checkpoints": {
"interval": "12h",
"directory": "./var/checkpoints",
"restore": "48h"
},
"archive": {
"interval": "50h",
"directory": "./var/archive"
},
"http-api": {
"address": "localhost:8082",
"https-cert-file": null,
"https-key-file": null
},
"retention-in-memory": "48h",
"nats": [
{
"address": "nats://localhost:4222",
"creds-file-path": "test.creds",
"subscriptions": [
{
"subscribe-to": "ee-hpc-nats",
"cluster-tag": "fritz2"
}
]
}
],
"jwt-public-key": "kzfYrYy+TzpanWZHJ5qSdMj5uKUWgq74BWhQG6copP0="
}


@@ -1,185 +0,0 @@
{
"metrics": {
"debug_metric": {
"frequency": 60,
"aggregation": "avg"
},
"clock": {
"frequency": 60,
"aggregation": "avg"
},
"cpu_idle": {
"frequency": 60,
"aggregation": "avg"
},
"cpu_iowait": {
"frequency": 60,
"aggregation": "avg"
},
"cpu_irq": {
"frequency": 60,
"aggregation": "avg"
},
"cpu_system": {
"frequency": 60,
"aggregation": "avg"
},
"cpu_user": {
"frequency": 60,
"aggregation": "avg"
},
"nv_mem_util": {
"frequency": 60,
"aggregation": "avg"
},
"nv_temp": {
"frequency": 60,
"aggregation": "avg"
},
"nv_sm_clock": {
"frequency": 60,
"aggregation": "avg"
},
"acc_utilization": {
"frequency": 60,
"aggregation": "avg"
},
"acc_mem_used": {
"frequency": 60,
"aggregation": "sum"
},
"acc_power": {
"frequency": 60,
"aggregation": "sum"
},
"flops_any": {
"frequency": 60,
"aggregation": "sum"
},
"flops_dp": {
"frequency": 60,
"aggregation": "sum"
},
"flops_sp": {
"frequency": 60,
"aggregation": "sum"
},
"ib_recv": {
"frequency": 60,
"aggregation": "sum"
},
"ib_xmit": {
"frequency": 60,
"aggregation": "sum"
},
"ib_recv_pkts": {
"frequency": 60,
"aggregation": "sum"
},
"ib_xmit_pkts": {
"frequency": 60,
"aggregation": "sum"
},
"cpu_power": {
"frequency": 60,
"aggregation": "sum"
},
"core_power": {
"frequency": 60,
"aggregation": "sum"
},
"mem_power": {
"frequency": 60,
"aggregation": "sum"
},
"ipc": {
"frequency": 60,
"aggregation": "avg"
},
"cpu_load": {
"frequency": 60,
"aggregation": null
},
"lustre_close": {
"frequency": 60,
"aggregation": null
},
"lustre_open": {
"frequency": 60,
"aggregation": null
},
"lustre_statfs": {
"frequency": 60,
"aggregation": null
},
"lustre_read_bytes": {
"frequency": 60,
"aggregation": null
},
"lustre_write_bytes": {
"frequency": 60,
"aggregation": null
},
"net_bw": {
"frequency": 60,
"aggregation": null
},
"file_bw": {
"frequency": 60,
"aggregation": null
},
"mem_bw": {
"frequency": 60,
"aggregation": "sum"
},
"mem_cached": {
"frequency": 60,
"aggregation": null
},
"mem_used": {
"frequency": 60,
"aggregation": null
},
"net_bytes_in": {
"frequency": 60,
"aggregation": null
},
"net_bytes_out": {
"frequency": 60,
"aggregation": null
},
"nfs4_read": {
"frequency": 60,
"aggregation": null
},
"nfs4_total": {
"frequency": 60,
"aggregation": null
},
"nfs4_write": {
"frequency": 60,
"aggregation": null
},
"vectorization_ratio": {
"frequency": 60,
"aggregation": "avg"
}
},
"checkpoints": {
"interval": "12h",
"directory": "./var/checkpoints",
"restore": "48h"
},
"archive": {
"interval": "50h",
"directory": "./var/archive"
},
"http-api": {
"address": "localhost:8082",
"https-cert-file": null,
"https-key-file": null
},
"retention-in-memory": "48h",
"nats": null,
"jwt-public-key": "kzfYrYy+TzpanWZHJ5qSdMj5uKUWgq74BWhQG6copP0="
}


@@ -1,4 +1,4 @@
package memorystore
package main
import (
"bufio"
@@ -21,6 +21,9 @@ func (b *buffer) debugDump(buf []byte) []byte {
if b.archived {
buf = append(buf, `,"saved":true`...)
}
if b.unit != "" {
buf = append(buf, fmt.Sprintf(`,"unit":"%s"`, b.unit)...)
}
if b.next != nil {
buf = append(buf, `},`...)
} else {
@@ -29,7 +32,7 @@ func (b *buffer) debugDump(buf []byte) []byte {
return buf
}
func (l *Level) debugDump(m *MemoryStore, w *bufio.Writer, lvlname string, buf []byte, depth int) ([]byte, error) {
func (l *level) debugDump(m *MemoryStore, w *bufio.Writer, lvlname string, buf []byte, depth int) ([]byte, error) {
l.lock.RLock()
defer l.lock.RUnlock()
for i := 0; i < depth; i++ {
@@ -40,8 +43,8 @@ func (l *Level) debugDump(m *MemoryStore, w *bufio.Writer, lvlname string, buf [
buf = append(buf, "\":{\n"...)
depth += 1
objitems := 0
for name, mc := range m.Metrics {
if b := l.metrics[mc.Offset]; b != nil {
for name, mc := range m.metrics {
if b := l.metrics[mc.offset]; b != nil {
for i := 0; i < depth; i++ {
buf = append(buf, '\t')
}


@@ -1,6 +0,0 @@
JWT="eyJ0eXAiOiJKV1QiLCJhbGciOiJFZERTQSJ9.eyJ1c2VyIjoiYWRtaW4iLCJyb2xlcyI6WyJST0xFX0FETUlOIiwiUk9MRV9BTkFMWVNUIiwiUk9MRV9VU0VSIl19.d-3_3FZTsadPjDEdsWrrQ7nS0edMAR4zjl-eK7rJU3HziNBfI9PDHDIpJVHTNN5E5SlLGLFXctWyKAkwhXL-Dw"
# If the collector, store, and nats-server have been running for at least 60 seconds on the same host, you may run:
curl -X 'GET' 'http://localhost:8081/api/query/' -H "Authorization: Bearer $JWT" -d "{ \"cluster\": \"alex\", \"from\": 1720879275, \"to\": 1720964715, \"queries\": [{\"metric\": \"cpu_load\",\"host\": \"a0124\"}] }"
# ...


@@ -1,6 +0,0 @@
JWT="eyJ0eXAiOiJKV1QiLCJhbGciOiJFZERTQSJ9.eyJ1c2VyIjoiYWRtaW4iLCJyb2xlcyI6WyJST0xFX0FETUlOIiwiUk9MRV9BTkFMWVNUIiwiUk9MRV9VU0VSIl19.d-3_3FZTsadPjDEdsWrrQ7nS0edMAR4zjl-eK7rJU3HziNBfI9PDHDIpJVHTNN5E5SlLGLFXctWyKAkwhXL-Dw"
# If the collector, store, and nats-server have been running for at least 60 seconds on the same host, you may run:
curl -X 'GET' 'http://localhost:8082/api/debug/' -H "Authorization: Bearer $JWT"
# ...


@@ -1,6 +0,0 @@
JWT="eyJ0eXAiOiJKV1QiLCJhbGciOiJFZERTQSJ9.eyJ1c2VyIjoiYWRtaW4iLCJyb2xlcyI6WyJST0xFX0FETUlOIiwiUk9MRV9BTkFMWVNUIiwiUk9MRV9VU0VSIl19.d-3_3FZTsadPjDEdsWrrQ7nS0edMAR4zjl-eK7rJU3HziNBfI9PDHDIpJVHTNN5E5SlLGLFXctWyKAkwhXL-Dw"
# If the collector, store, and nats-server have been running for at least 60 seconds on the same host, you may run:
curl -X 'POST' 'http://localhost:8082/api/free/?to=1724536800' -H "Authorization: Bearer $JWT" -d "[ [ \"alex\", \"a0329\", \"memoryDomain2\" ], [ \"alex\", \"a0903\" ],[ \"fritz\", \"f0201\" ] ]"
# ...


@@ -1,6 +0,0 @@
JWT="eyJ0eXAiOiJKV1QiLCJhbGciOiJFZERTQSJ9.eyJ1c2VyIjoiYWRtaW4iLCJyb2xlcyI6WyJST0xFX0FETUlOIiwiUk9MRV9BTkFMWVNUIiwiUk9MRV9VU0VSIl19.d-3_3FZTsadPjDEdsWrrQ7nS0edMAR4zjl-eK7rJU3HziNBfI9PDHDIpJVHTNN5E5SlLGLFXctWyKAkwhXL-Dw"
# If the collector, store, and nats-server have been running for at least 60 seconds on the same host, you may run:
curl -X 'GET' 'http://localhost:8082/api/healthcheck/?cluster=alex&node=a0903' -H "Authorization: Bearer $JWT"
# ...


@@ -1,110 +0,0 @@
JWT="eyJ0eXAiOiJKV1QiLCJhbGciOiJFZERTQSJ9.eyJ1c2VyIjoiYWRtaW4iLCJyb2xlcyI6WyJST0xFX0FETUlOIiwiUk9MRV9BTkFMWVNUIiwiUk9MRV9VU0VSIl19.d-3_3FZTsadPjDEdsWrrQ7nS0edMAR4zjl-eK7rJU3HziNBfI9PDHDIpJVHTNN5E5SlLGLFXctWyKAkwhXL-Dw"
# curl -X 'POST' 'http://localhost:8082/api/write/?cluster=alex' -H "Authorization: Bearer $JWT" -d $'cpu_load,cluster=alex,hostname=a042,type=hwthread,type-id=0 value=35.0 1725827464642231296'
rm sample_fritz.txt
rm sample_alex.txt
while true; do
echo "Alex Metrics for hwthread types and type-ids"
timestamp="$(date '+%s')"
echo "Timestamp : "+$timestamp
for metric in cpu_load cpu_user flops_any cpu_irq cpu_system ipc cpu_idle cpu_iowait core_power clock; do
for hostname in a0603 a0903 a0832 a0329 a0702 a0122 a1624 a0731 a0224 a0704 a0631 a0225 a0222 a0427 a0603 a0429 a0833 a0705 a0901 a0601 a0227 a0804 a0322 a0226 a0126 a0129 a0605 a0801 a0934 a1622 a0902 a0428 a0537 a1623 a1722 a0228 a0701 a0326 a0327 a0123 a0321 a1621 a0323 a0124 a0534 a0931 a0324 a0933 a0424 a0905 a0128 a0532 a0805 a0521 a0535 a0932 a0127 a0325 a0633 a0831 a0803 a0426 a0425 a0229 a1721 a0602 a0632 a0223 a0422 a0423 a0536 a0328 a0703 anvme7 a0125 a0221 a0604 a0802 a0522 a0531 a0533 a0904; do
for id in {0..127}; do
echo "$metric,cluster=alex,hostname=$hostname,type=hwthread,type-id=$id value=$((1 + RANDOM % 100)).0 $timestamp" >>sample_alex.txt
done
done
done
curl -X 'POST' 'http://localhost:8082/api/write/?cluster=alex' -H "Authorization: Bearer $JWT" --data-binary @sample_alex.txt
echo "Fritz Metrics for hwthread types and type-ids"
for metric in cpu_load cpu_user flops_any cpu_irq cpu_system ipc cpu_idle cpu_iowait core_power clock; do
for hostname in f0201 f0202 f0203 f0204 f0205 f0206 f0207 f0208 f0209 f0210 f0211 f0212 f0213 f0214 f0215 f0217 f0218 f0219 f0220 f0221 f0222 f0223 f0224 f0225 f0226 f0227 f0228 f0229 f0230 f0231 f0232 f0233 f0234 f0235 f0236 f0237 f0238 f0239 f0240 f0241 f0242 f0243 f0244 f0245 f0246 f0247 f0248 f0249 f0250 f0251 f0252 f0253 f0254 f0255 f0256 f0257 f0258 f0259 f0260 f0261 f0262 f0263 f0264 f0378; do
for id in {0..71}; do
echo "$metric,cluster=fritz,hostname=$hostname,type=hwthread,type-id=$id value=$((1 + RANDOM % 100)).0 $timestamp" >>sample_fritz.txt
done
done
done
curl -X 'POST' 'http://localhost:8082/api/write/?cluster=fritz' -H "Authorization: Bearer $JWT" --data-binary @sample_fritz.txt
rm sample_fritz.txt
rm sample_alex.txt
echo "Alex Metrics for accelerator types and type-ids"
for metric in cpu_load cpu_user flops_any cpu_irq cpu_system ipc cpu_idle cpu_iowait core_power clock; do
for hostname in a0603 a0903 a0832 a0329 a0702 a0122 a1624 a0731 a0224 a0704 a0631 a0225 a0222 a0427 a0603 a0429 a0833 a0705 a0901 a0601 a0227 a0804 a0322 a0226 a0126 a0129 a0605 a0801 a0934 a1622 a0902 a0428 a0537 a1623 a1722 a0228 a0701 a0326 a0327 a0123 a0321 a1621 a0323 a0124 a0534 a0931 a0324 a0933 a0424 a0905 a0128 a0532 a0805 a0521 a0535 a0932 a0127 a0325 a0633 a0831 a0803 a0426 a0425 a0229 a1721 a0602 a0632 a0223 a0422 a0423 a0536 a0328 a0703 anvme7 a0125 a0221 a0604 a0802 a0522 a0531 a0533 a0904; do
for id in 00000000:49:00.0 00000000:0E:00.0 00000000:D1:00.0 00000000:90:00.0 00000000:13:00.0 00000000:96:00.0 00000000:CC:00.0 00000000:4F:00.0; do
echo "$metric,cluster=alex,hostname=$hostname,type=accelerator,type-id=$id value=$((1 + RANDOM % 100)).0 $timestamp" >>sample_alex.txt
done
done
done
curl -X 'POST' 'http://localhost:8082/api/write/?cluster=alex' -H "Authorization: Bearer $JWT" --data-binary @sample_alex.txt
rm sample_alex.txt
echo "Alex Metrics for memoryDomain types and type-ids"
for metric in cpu_load cpu_user flops_any cpu_irq cpu_system ipc cpu_idle cpu_iowait core_power clock; do
for hostname in a0603 a0903 a0832 a0329 a0702 a0122 a1624 a0731 a0224 a0704 a0631 a0225 a0222 a0427 a0603 a0429 a0833 a0705 a0901 a0601 a0227 a0804 a0322 a0226 a0126 a0129 a0605 a0801 a0934 a1622 a0902 a0428 a0537 a1623 a1722 a0228 a0701 a0326 a0327 a0123 a0321 a1621 a0323 a0124 a0534 a0931 a0324 a0933 a0424 a0905 a0128 a0532 a0805 a0521 a0535 a0932 a0127 a0325 a0633 a0831 a0803 a0426 a0425 a0229 a1721 a0602 a0632 a0223 a0422 a0423 a0536 a0328 a0703 anvme7 a0125 a0221 a0604 a0802 a0522 a0531 a0533 a0904; do
for id in {0..7}; do
echo "$metric,cluster=alex,hostname=$hostname,type=memoryDomain,type-id=$id value=$((1 + RANDOM % 100)).0 $timestamp" >>sample_alex.txt
done
done
done
curl -X 'POST' 'http://localhost:8082/api/write/?cluster=alex' -H "Authorization: Bearer $JWT" --data-binary @sample_alex.txt
rm sample_alex.txt
echo "Alex Metrics for socket types and type-ids"
for metric in cpu_load cpu_user flops_any cpu_irq cpu_system ipc cpu_idle cpu_iowait core_power clock; do
for hostname in a0603 a0903 a0832 a0329 a0702 a0122 a1624 a0731 a0224 a0704 a0631 a0225 a0222 a0427 a0603 a0429 a0833 a0705 a0901 a0601 a0227 a0804 a0322 a0226 a0126 a0129 a0605 a0801 a0934 a1622 a0902 a0428 a0537 a1623 a1722 a0228 a0701 a0326 a0327 a0123 a0321 a1621 a0323 a0124 a0534 a0931 a0324 a0933 a0424 a0905 a0128 a0532 a0805 a0521 a0535 a0932 a0127 a0325 a0633 a0831 a0803 a0426 a0425 a0229 a1721 a0602 a0632 a0223 a0422 a0423 a0536 a0328 a0703 anvme7 a0125 a0221 a0604 a0802 a0522 a0531 a0533 a0904; do
for id in {0..1}; do
echo "$metric,cluster=alex,hostname=$hostname,type=socket,type-id=$id value=$((1 + RANDOM % 100)).0 $timestamp" >>sample_alex.txt
done
done
done
curl -X 'POST' 'http://localhost:8082/api/write/?cluster=alex' -H "Authorization: Bearer $JWT" --data-binary @sample_alex.txt
echo "Fritz Metrics for socket types and type-ids"
for metric in cpu_load cpu_user flops_any cpu_irq cpu_system ipc cpu_idle cpu_iowait core_power clock; do
for hostname in f0201 f0202 f0203 f0204 f0205 f0206 f0207 f0208 f0209 f0210 f0211 f0212 f0213 f0214 f0215 f0217 f0218 f0219 f0220 f0221 f0222 f0223 f0224 f0225 f0226 f0227 f0228 f0229 f0230 f0231 f0232 f0233 f0234 f0235 f0236 f0237 f0238 f0239 f0240 f0241 f0242 f0243 f0244 f0245 f0246 f0247 f0248 f0249 f0250 f0251 f0252 f0253 f0254 f0255 f0256 f0257 f0258 f0259 f0260 f0261 f0262 f0263 f0264 f0378; do
for id in {0..1}; do
echo "$metric,cluster=fritz,hostname=$hostname,type=socket,type-id=$id value=$((1 + RANDOM % 100)).0 $timestamp" >>sample_fritz.txt
done
done
done
curl -X 'POST' 'http://localhost:8082/api/write/?cluster=fritz' -H "Authorization: Bearer $JWT" --data-binary @sample_fritz.txt
rm sample_fritz.txt
rm sample_alex.txt
echo "Alex Metrics for nodes"
for metric in cpu_irq cpu_load mem_cached net_bytes_in cpu_user cpu_idle nfs4_read mem_used nfs4_write nfs4_total ib_xmit ib_xmit_pkts net_bytes_out cpu_iowait ib_recv cpu_system ib_recv_pkts; do
for hostname in a0603 a0903 a0832 a0329 a0702 a0122 a1624 a0731 a0224 a0704 a0631 a0225 a0222 a0427 a0603 a0429 a0833 a0705 a0901 a0601 a0227 a0804 a0322 a0226 a0126 a0129 a0605 a0801 a0934 a1622 a0902 a0428 a0537 a1623 a1722 a0228 a0701 a0326 a0327 a0123 a0321 a1621 a0323 a0124 a0534 a0931 a0324 a0933 a0424 a0905 a0128 a0532 a0805 a0521 a0535 a0932 a0127 a0325 a0633 a0831 a0803 a0426 a0425 a0229 a1721 a0602 a0632 a0223 a0422 a0423 a0536 a0328 a0703 anvme7 a0125 a0221 a0604 a0802 a0522 a0531 a0533 a0904; do
echo "$metric,cluster=alex,hostname=$hostname,type=node value=$((1 + RANDOM % 100)).0 $timestamp" >>sample_alex.txt
done
done
curl -X 'POST' 'http://localhost:8082/api/write/?cluster=alex' -H "Authorization: Bearer $JWT" --data-binary @sample_alex.txt
echo "Fritz Metrics for nodes"
for metric in cpu_irq cpu_load mem_cached net_bytes_in cpu_user cpu_idle nfs4_read mem_used nfs4_write nfs4_total ib_xmit ib_xmit_pkts net_bytes_out cpu_iowait ib_recv cpu_system ib_recv_pkts; do
for hostname in f0201 f0202 f0203 f0204 f0205 f0206 f0207 f0208 f0209 f0210 f0211 f0212 f0213 f0214 f0215 f0217 f0218 f0219 f0220 f0221 f0222 f0223 f0224 f0225 f0226 f0227 f0228 f0229 f0230 f0231 f0232 f0233 f0234 f0235 f0236 f0237 f0238 f0239 f0240 f0241 f0242 f0243 f0244 f0245 f0246 f0247 f0248 f0249 f0250 f0251 f0252 f0253 f0254 f0255 f0256 f0257 f0258 f0259 f0260 f0261 f0262 f0263 f0264 f0378; do
echo "$metric,cluster=fritz,hostname=$hostname,type=node value=$((1 + RANDOM % 100)).0 $timestamp" >>sample_fritz.txt
done
done
curl -X 'POST' 'http://localhost:8082/api/write/?cluster=fritz' -H "Authorization: Bearer $JWT" --data-binary @sample_fritz.txt
rm sample_fritz.txt
rm sample_alex.txt
sleep 1m
done
# curl -X 'POST' 'http://localhost:8081/api/write/?cluster=alex' -H "Authorization: Bearer $JWT" -d $'cpu_load,cluster=alex,hostname=a042,type=hwthread,type-id=0 value=35.0 1725827464642231296'


@@ -1,4 +1,4 @@
package util
package main
import (
"math"
@@ -11,10 +11,8 @@ import (
// we have to use our own type which implements encoding/json.Marshaler itself.
type Float float64
var (
NaN Float = Float(math.NaN())
nullAsBytes []byte = []byte("null")
)
var NaN Float = Float(math.NaN())
var nullAsBytes []byte = []byte("null")
func (f Float) IsNaN() bool {
return math.IsNaN(float64(f))
@@ -28,10 +26,6 @@ func (f Float) MarshalJSON() ([]byte, error) {
return strconv.AppendFloat(make([]byte, 0, 10), float64(f), 'f', 3, 64), nil
}
func (f Float) Double() float64 {
return float64(f)
}
func (f *Float) UnmarshalJSON(input []byte) error {
if string(input) == "null" {
*f = NaN
@@ -49,14 +43,6 @@ func (f *Float) UnmarshalJSON(input []byte) error {
// Same as `[]Float`, but can be marshaled to JSON with less allocations.
type FloatArray []Float
func ConvertToFloat(input float64) Float {
if input == -1.0 {
return NaN
} else {
return Float(input)
}
}
func (fa FloatArray) MarshalJSON() ([]byte, error) {
buf := make([]byte, 0, 2+len(fa)*8)
buf = append(buf, '[')
@@ -69,6 +55,7 @@ func (fa FloatArray) MarshalJSON() ([]byte, error) {
buf = append(buf, `null`...)
} else {
buf = strconv.AppendFloat(buf, float64(fa[i]), 'f', 3, 64)
}
}
buf = append(buf, ']')

go.mod (42 lines changed)

@@ -1,39 +1,13 @@
module github.com/ClusterCockpit/cc-metric-store
go 1.22
go 1.16
require (
github.com/golang-jwt/jwt/v4 v4.5.0
github.com/google/gops v0.3.28
github.com/influxdata/line-protocol/v2 v2.2.1
github.com/nats-io/nats.go v1.36.0
github.com/swaggo/http-swagger v1.3.4
github.com/swaggo/swag v1.16.3
)
require (
github.com/KyleBanks/depth v1.2.1 // indirect
github.com/cpuguy83/go-md2man/v2 v2.0.4 // indirect
github.com/go-openapi/jsonpointer v0.21.0 // indirect
github.com/go-openapi/jsonreference v0.21.0 // indirect
github.com/go-openapi/spec v0.21.0 // indirect
github.com/go-openapi/swag v0.23.0 // indirect
github.com/golang/snappy v0.0.1 // indirect
github.com/josharian/intern v1.0.0 // indirect
github.com/klauspost/compress v1.17.9 // indirect
github.com/linkedin/goavro/v2 v2.13.1 // indirect
github.com/mailru/easyjson v0.7.7 // indirect
github.com/nats-io/nkeys v0.4.7 // indirect
github.com/nats-io/nuid v1.0.1 // indirect
github.com/russross/blackfriday/v2 v2.1.0 // indirect
github.com/swaggo/files v1.0.1 // indirect
github.com/urfave/cli/v2 v2.27.1 // indirect
github.com/xrash/smetrics v0.0.0-20240312152122-5f08fbb34913 // indirect
golang.org/x/crypto v0.24.0 // indirect
golang.org/x/net v0.26.0 // indirect
golang.org/x/sys v0.21.0 // indirect
golang.org/x/text v0.16.0 // indirect
golang.org/x/tools v0.22.0 // indirect
gopkg.in/yaml.v3 v3.0.1 // indirect
sigs.k8s.io/yaml v1.4.0 // indirect
github.com/golang-jwt/jwt/v4 v4.0.0
github.com/golang/protobuf v1.5.2 // indirect
github.com/google/gops v0.3.22
github.com/gorilla/mux v1.8.0
github.com/influxdata/line-protocol/v2 v2.2.0
github.com/nats-io/nats-server/v2 v2.2.6 // indirect
github.com/nats-io/nats.go v1.11.0
)

go.sum (178 lines changed)

@@ -1,134 +1,110 @@
github.com/KyleBanks/depth v1.2.1 h1:5h8fQADFrWtarTdtDudMmGsC7GPbOAu6RVB3ffsVFHc=
github.com/KyleBanks/depth v1.2.1/go.mod h1:jzSb9d0L43HxTQfT+oSA1EEp2q+ne2uh6XgeJcm8brE=
github.com/cpuguy83/go-md2man/v2 v2.0.4 h1:wfIWP927BUkWJb2NmU/kNDYIBTh/ziUX91+lVfRxZq4=
github.com/cpuguy83/go-md2man/v2 v2.0.4/go.mod h1:tgQtvFlXSQOSOSIRvRPT7W67SCa46tRHOmNcaadrF8o=
github.com/StackExchange/wmi v1.2.1 h1:VIkavFPXSjcnS+O8yTq7NI32k0R5Aj+v39y29VYDOSA=
github.com/StackExchange/wmi v1.2.1/go.mod h1:rcmrprowKIVzvc+NUiLncP2uuArMWLCbu9SBzvHz7e8=
github.com/creack/pty v1.1.9/go.mod h1:oKZEueFk5CKHvIhNR5MUki03XCEU+Q6VDXinZuGJ33E=
github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c=
github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
github.com/frankban/quicktest v1.11.0/go.mod h1:K+q6oSqb0W0Ininfk863uOk1lMy69l/P6txr3mVT54s=
github.com/frankban/quicktest v1.11.2/go.mod h1:K+q6oSqb0W0Ininfk863uOk1lMy69l/P6txr3mVT54s=
github.com/frankban/quicktest v1.13.0 h1:yNZif1OkDfNoDfb9zZa9aXIpejNR4F23Wely0c+Qdqk=
github.com/frankban/quicktest v1.13.0/go.mod h1:qLE0fzW0VuyUAJgPU19zByoIr0HtCHN/r/VLSOOIySU=
github.com/go-openapi/jsonpointer v0.21.0 h1:YgdVicSA9vH5RiHs9TZW5oyafXZFc6+2Vc1rr/O9oNQ=
github.com/go-openapi/jsonpointer v0.21.0/go.mod h1:IUyH9l/+uyhIYQ/PXVA41Rexl+kOkAPDdXEYns6fzUY=
github.com/go-openapi/jsonreference v0.21.0 h1:Rs+Y7hSXT83Jacb7kFyjn4ijOuVGSvOdF2+tg1TRrwQ=
github.com/go-openapi/jsonreference v0.21.0/go.mod h1:LmZmgsrTkVg9LG4EaHeY8cBDslNPMo06cago5JNLkm4=
github.com/go-openapi/spec v0.21.0 h1:LTVzPc3p/RzRnkQqLRndbAzjY0d0BCL72A6j3CdL9ZY=
github.com/go-openapi/spec v0.21.0/go.mod h1:78u6VdPw81XU44qEWGhtr982gJ5BWg2c0I5XwVMotYk=
github.com/go-openapi/swag v0.23.0 h1:vsEVJDUo2hPJ2tu0/Xc+4noaxyEffXNIs3cOULZ+GrE=
github.com/go-openapi/swag v0.23.0/go.mod h1:esZ8ITTYEsH1V2trKHjAN8Ai7xHb8RV+YSZ577vPjgQ=
github.com/golang-jwt/jwt/v4 v4.5.0 h1:7cYmW1XlMY7h7ii7UhUyChSgS5wUJEnm9uZVTGqOWzg=
github.com/golang-jwt/jwt/v4 v4.5.0/go.mod h1:m21LjoU+eqJr34lmDMbreY2eSTRJ1cv77w39/MY0Ch0=
github.com/golang/snappy v0.0.1 h1:Qgr9rKW7uDUkrbSmQeiDsGa8SjGyCOGtuasMWwvp2P4=
github.com/golang/snappy v0.0.1/go.mod h1:/XxbfmMg8lxefKM7IXC3fBNl/7bRcc72aCRzEWrmP2Q=
github.com/go-ole/go-ole v1.2.5/go.mod h1:pprOEPIfldk/42T2oK7lQ4v4JSDwmV0As9GaiUsvbm0=
github.com/go-ole/go-ole v1.2.6-0.20210915003542-8b1f7f90f6b1 h1:4dntyT+x6QTOSCIrgczbQ+ockAEha0cfxD5Wi0iCzjY=
github.com/go-ole/go-ole v1.2.6-0.20210915003542-8b1f7f90f6b1/go.mod h1:pprOEPIfldk/42T2oK7lQ4v4JSDwmV0As9GaiUsvbm0=
github.com/golang-jwt/jwt v3.2.2+incompatible h1:IfV12K8xAKAnZqdXVzCZ+TOjboZ2keLg81eXfW3O+oY=
github.com/golang-jwt/jwt v3.2.2+incompatible/go.mod h1:8pz2t5EyA70fFQQSrl6XZXzqecmYZeUEB8OUGHkxJ+I=
github.com/golang-jwt/jwt/v4 v4.0.0 h1:RAqyYixv1p7uEnocuy8P1nru5wprCh/MH2BIlW5z5/o=
github.com/golang-jwt/jwt/v4 v4.0.0/go.mod h1:/xlHOz8bRuivTWchD4jCa+NbatV+wEUSzwAxVc6locg=
github.com/golang/protobuf v1.4.0-rc.1/go.mod h1:ceaxUfeHdC40wWswd/P6IGgMaK3YpKi5j83Wpe3EHw8=
github.com/golang/protobuf v1.4.0-rc.1.0.20200221234624-67d41d38c208/go.mod h1:xKAWHe0F5eneWXFV3EuXVDTCmh+JuBKY0li0aMyXATA=
github.com/golang/protobuf v1.4.0-rc.2/go.mod h1:LlEzMj4AhA7rCAGe4KMBDvJI+AwstrUpVNzEA03Pprs=
github.com/golang/protobuf v1.4.0-rc.4.0.20200313231945-b860323f09d0/go.mod h1:WU3c8KckQ9AFe+yFwt9sWVRKCVIyN9cPHBJSNnbL67w=
github.com/golang/protobuf v1.4.0/go.mod h1:jodUvKwWbYaEsadDk5Fwe5c77LiNKVO9IDvqG2KuDX0=
github.com/golang/protobuf v1.4.2/go.mod h1:oDoupMAO8OvCJWAcko0GGGIgR6R6ocIYbsSw735rRwI=
github.com/golang/protobuf v1.5.0/go.mod h1:FsONVRAS9T7sI+LIUmWTfcYkHO4aIWwzhcaSAoJOfIk=
github.com/golang/protobuf v1.5.2 h1:ROPKBNFfQgOUMifHyP+KYbvpjbdoFNs+aK7DXlji0Tw=
github.com/golang/protobuf v1.5.2/go.mod h1:XVQd3VNwM+JqD3oG2Ue2ip4fOMUkwXdXDdiuN0vRsmY=
github.com/google/go-cmp v0.3.0/go.mod h1:8QqcDgzrUqlUb/G2PQTWiueGozuR1884gddMywk6iLU=
github.com/google/go-cmp v0.3.1/go.mod h1:8QqcDgzrUqlUb/G2PQTWiueGozuR1884gddMywk6iLU=
github.com/google/go-cmp v0.4.0/go.mod h1:v8dTdLbMG2kIc/vJvl+f65V22dbkXbowE6jgT/gNBxE=
github.com/google/go-cmp v0.5.2/go.mod h1:v8dTdLbMG2kIc/vJvl+f65V22dbkXbowE6jgT/gNBxE=
github.com/google/go-cmp v0.5.5/go.mod h1:v8dTdLbMG2kIc/vJvl+f65V22dbkXbowE6jgT/gNBxE=
github.com/google/go-cmp v0.5.9/go.mod h1:17dUlkBOakJ0+DkrSSNjCkIjxS6bF9zb3elmeNGIjoY=
github.com/google/go-cmp v0.6.0 h1:ofyhxvXcZhMsU5ulbFiLKl/XBFqE1GSq7atu8tAmTRI=
github.com/google/go-cmp v0.6.0/go.mod h1:17dUlkBOakJ0+DkrSSNjCkIjxS6bF9zb3elmeNGIjoY=
github.com/google/gops v0.3.28 h1:2Xr57tqKAmQYRAfG12E+yLcoa2Y42UJo2lOrUFL9ark=
github.com/google/gops v0.3.28/go.mod h1:6f6+Nl8LcHrzJwi8+p0ii+vmBFSlB4f8cOOkTJ7sk4c=
github.com/google/gops v0.3.22 h1:lyvhDxfPLHAOR2xIYwjPhN387qHxyU21Sk9sz/GhmhQ=
github.com/google/gops v0.3.22/go.mod h1:7diIdLsqpCihPSX3fQagksT/Ku/y4RL9LHTlKyEUDl8=
github.com/gorilla/mux v1.8.0 h1:i40aqfkR1h2SlN9hojwV5ZA91wcXFOvkdNIeFDP5koI=
github.com/gorilla/mux v1.8.0/go.mod h1:DVbg23sWSpFRCP0SfiEN6jmj59UnW/n46BH5rLB71So=
github.com/influxdata/line-protocol-corpus v0.0.0-20210519164801-ca6fa5da0184/go.mod h1:03nmhxzZ7Xk2pdG+lmMd7mHDfeVOYFyhOgwO61qWU98=
github.com/influxdata/line-protocol-corpus v0.0.0-20210922080147-aa28ccfb8937 h1:MHJNQ+p99hFATQm6ORoLmpUCF7ovjwEFshs/NHzAbig=
github.com/influxdata/line-protocol-corpus v0.0.0-20210922080147-aa28ccfb8937/go.mod h1:BKR9c0uHSmRgM/se9JhFHtTT7JTO67X23MtKMHtZcpo=
github.com/influxdata/line-protocol/v2 v2.0.0-20210312151457-c52fdecb625a/go.mod h1:6+9Xt5Sq1rWx+glMgxhcg2c0DUaehK+5TDcPZ76GypY=
github.com/influxdata/line-protocol/v2 v2.1.0/go.mod h1:QKw43hdUBg3GTk2iC3iyCxksNj7PX9aUSeYOYE/ceHY=
github.com/influxdata/line-protocol/v2 v2.2.1 h1:EAPkqJ9Km4uAxtMRgUubJyqAr6zgWM0dznKMLRauQRE=
github.com/influxdata/line-protocol/v2 v2.2.1/go.mod h1:DmB3Cnh+3oxmG6LOBIxce4oaL4CPj3OmMPgvauXh+tM=
github.com/josharian/intern v1.0.0 h1:vlS4z54oSdjm0bgjRigI+G1HpF+tI+9rE5LLzOg8HmY=
github.com/josharian/intern v1.0.0/go.mod h1:5DoeVV0s6jJacbCEi61lwdGj/aVlrQvzHFFd8Hwg//Y=
github.com/klauspost/compress v1.17.9 h1:6KIumPrER1LHsvBVuDa0r5xaG0Es51mhhB9BQB2qeMA=
github.com/klauspost/compress v1.17.9/go.mod h1:Di0epgTjJY877eYKx5yC51cX2A2Vl2ibi7bDH9ttBbw=
github.com/influxdata/line-protocol/v2 v2.2.0 h1:UPmAqE15Hw5zu9E10SYhoXVLWnEJkWnuCbaCiRsA3c0=
github.com/influxdata/line-protocol/v2 v2.2.0/go.mod h1:DmB3Cnh+3oxmG6LOBIxce4oaL4CPj3OmMPgvauXh+tM=
github.com/keybase/go-ps v0.0.0-20190827175125-91aafc93ba19 h1:WjT3fLi9n8YWh/Ih8Q1LHAPsTqGddPcHqscN+PJ3i68=
github.com/keybase/go-ps v0.0.0-20190827175125-91aafc93ba19/go.mod h1:hY+WOq6m2FpbvyrI93sMaypsttvaIL5nhVR92dTMUcQ=
github.com/klauspost/compress v1.11.12 h1:famVnQVu7QwryBN4jNseQdUKES71ZAOnB6UQQJPZvqk=
github.com/klauspost/compress v1.11.12/go.mod h1:aoV0uJVorq1K+umq18yTdKaF57EivdYsUV+/s2qKfXs=
github.com/kr/pretty v0.2.1/go.mod h1:ipq/a2n7PKx3OHsz4KJII5eveXtPO4qwEXGdVfWzfnI=
github.com/kr/pretty v0.3.1 h1:flRD4NNwYAUpkphVc1HcthR4KEIFJ65n8Mw5qdRn3LE=
github.com/kr/pretty v0.3.1/go.mod h1:hoEshYVHaxMs3cyo3Yncou5ZscifuDolrwPKZanG3xk=
github.com/kr/pty v1.1.1/go.mod h1:pFQYn66WHrOpPYNljwOMqo10TkYh1fy3cYio2l3bCsQ=
github.com/kr/text v0.1.0/go.mod h1:4Jbv+DJW3UT/LiOwJeYQe1efqtUx/iVham/4vfdArNI=
github.com/kr/text v0.2.0 h1:5Nx0Ya0ZqY2ygV366QzturHI13Jq95ApcVaJBhpS+AY=
github.com/kr/text v0.2.0/go.mod h1:eLer722TekiGuMkidMxC/pM04lWEeraHUUmBw8l2grE=
github.com/linkedin/goavro/v2 v2.13.1 h1:4qZ5M0QzQFDRqccsroJlgOJznqAS/TpdvXg55h429+I=
github.com/linkedin/goavro/v2 v2.13.1/go.mod h1:KXx+erlq+RPlGSPmLF7xGo6SAbh8sCQ53x064+ioxhk=
github.com/mailru/easyjson v0.7.7 h1:UGYAvKxe3sBsEDzO8ZeWOSlIQfWFlxbzLZe7hwFURr0=
github.com/mailru/easyjson v0.7.7/go.mod h1:xzfreul335JAWq5oZzymOObrkdz5UnU4kGfJJLY9Nlc=
github.com/nats-io/nats.go v1.36.0 h1:suEUPuWzTSse/XhESwqLxXGuj8vGRuPRoG7MoRN/qyU=
github.com/nats-io/nats.go v1.36.0/go.mod h1:Ubdu4Nh9exXdSz0RVWRFBbRfrbSxOYd26oF0wkWclB8=
github.com/nats-io/nkeys v0.4.7 h1:RwNJbbIdYCoClSDNY7QVKZlyb/wfT6ugvFCiKy6vDvI=
github.com/nats-io/nkeys v0.4.7/go.mod h1:kqXRgRDPlGy7nGaEDMuYzmiJCIAAWDK0IMBtDmGD0nc=
github.com/minio/highwayhash v1.0.1 h1:dZ6IIu8Z14VlC0VpfKofAhCy74wu/Qb5gcn52yWoz/0=
github.com/minio/highwayhash v1.0.1/go.mod h1:BQskDq+xkJ12lmlUUi7U0M5Swg3EWR+dLTk+kldvVxY=
github.com/nats-io/jwt v1.2.2 h1:w3GMTO969dFg+UOKTmmyuu7IGdusK+7Ytlt//OYH/uU=
github.com/nats-io/jwt v1.2.2/go.mod h1:/xX356yQA6LuXI9xWW7mZNpxgF2mBmGecH+Fj34sP5Q=
github.com/nats-io/jwt/v2 v2.0.2 h1:ejVCLO8gu6/4bOKIHQpmB5UhhUJfAQw55yvLWpfmKjI=
github.com/nats-io/jwt/v2 v2.0.2/go.mod h1:VRP+deawSXyhNjXmxPCHskrR6Mq50BqpEI5SEcNiGlY=
github.com/nats-io/nats-server/v2 v2.2.6 h1:FPK9wWx9pagxcw14s8W9rlfzfyHm61uNLnJyybZbn48=
github.com/nats-io/nats-server/v2 v2.2.6/go.mod h1:sEnFaxqe09cDmfMgACxZbziXnhQFhwk+aKkZjBBRYrI=
github.com/nats-io/nats.go v1.11.0 h1:L263PZkrmkRJRJT2YHU8GwWWvEvmr9/LUKuJTXsF32k=
github.com/nats-io/nats.go v1.11.0/go.mod h1:BPko4oXsySz4aSWeFgOHLZs3G4Jq4ZAyE6/zMCxRT6w=
github.com/nats-io/nkeys v0.2.0/go.mod h1:XdZpAbhgyyODYqjTawOnIOI7VlbKSarI9Gfy1tqEu/s=
github.com/nats-io/nkeys v0.3.0 h1:cgM5tL53EvYRU+2YLXIK0G2mJtK12Ft9oeooSZMA2G8=
github.com/nats-io/nkeys v0.3.0/go.mod h1:gvUNGjVcM2IPr5rCsRsC6Wb3Hr2CQAm08dsxtV6A5y4=
github.com/nats-io/nuid v1.0.1 h1:5iA8DT8V7q8WK2EScv2padNa/rTESc1KdnPw4TC2paw=
github.com/nats-io/nuid v1.0.1/go.mod h1:19wcPz3Ph3q0Jbyiqsd0kePYG7A95tJPxeL+1OSON2c=
github.com/niemeyer/pretty v0.0.0-20200227124842-a10e7caefd8e/go.mod h1:zD1mROLANZcx1PVRCS0qkT7pwLkGfwJo4zjcN/Tysno=
github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM=
github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4=
github.com/rogpeppe/go-internal v1.11.0 h1:cWPaGQEPrBb5/AsnsZesgZZ9yb1OQ+GOISoDNXVBh4M=
github.com/rogpeppe/go-internal v1.11.0/go.mod h1:ddIwULY96R17DhadqLgMfk9H9tvdUzkipdSkR5nkCZA=
github.com/russross/blackfriday/v2 v2.1.0 h1:JIOH55/0cWyOuilr9/qlrm0BSXldqnqwMsf35Ld67mk=
github.com/russross/blackfriday/v2 v2.1.0/go.mod h1:+Rmxgy9KzJVeS9/2gXHxylqXiyQDYRxCVz55jmeOWTM=
github.com/shirou/gopsutil/v3 v3.21.9 h1:Vn4MUz2uXhqLSiCbGFRc0DILbMVLAY92DSkT8bsYrHg=
github.com/shirou/gopsutil/v3 v3.21.9/go.mod h1:YWp/H8Qs5fVmf17v7JNZzA0mPJ+mS2e9JdiUF9LlKzQ=
github.com/stretchr/objx v0.1.0/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME=
github.com/stretchr/objx v0.4.0/go.mod h1:YvHI0jy2hoMjB+UWwv71VJQ9isScKT/TqJzVSSt89Yw=
github.com/stretchr/testify v1.7.1/go.mod h1:6Fq8oRcR53rry900zMqJjRRixrwX3KX962/h/Wwjteg=
github.com/stretchr/testify v1.7.5/go.mod h1:yNjHg4UonilssWZ8iaSj1OCr/vHnekPRkoO+kdMU+MU=
github.com/stretchr/testify v1.9.0 h1:HtqpIVDClZ4nwg75+f6Lvsy/wHu+3BoSGCbBAcpTsTg=
github.com/stretchr/testify v1.9.0/go.mod h1:r2ic/lqez/lEtzL7wO/rwa5dbSLXVDPFyf8C91i36aY=
github.com/swaggo/files v1.0.1 h1:J1bVJ4XHZNq0I46UU90611i9/YzdrF7x92oX1ig5IdE=
github.com/swaggo/files v1.0.1/go.mod h1:0qXmMNH6sXNf+73t65aKeB+ApmgxdnkQzVTAj2uaMUg=
github.com/swaggo/http-swagger v1.3.4 h1:q7t/XLx0n15H1Q9/tk3Y9L4n210XzJF5WtnDX64a5ww=
github.com/swaggo/http-swagger v1.3.4/go.mod h1:9dAh0unqMBAlbp1uE2Uc2mQTxNMU/ha4UbucIg1MFkQ=
github.com/swaggo/swag v1.16.3 h1:PnCYjPCah8FK4I26l2F/KQ4yz3sILcVUN3cTlBFA9Pg=
github.com/swaggo/swag v1.16.3/go.mod h1:DImHIuOFXKpMFAQjcC7FG4m3Dg4+QuUgUzJmKjI/gRk=
github.com/urfave/cli/v2 v2.27.1 h1:8xSQ6szndafKVRmfyeUMxkNUJQMjL1F2zmsZ+qHpfho=
github.com/urfave/cli/v2 v2.27.1/go.mod h1:8qnjx1vcq5s2/wpsqoZFndg2CE5tNFyrTvS6SinrnYQ=
github.com/xrash/smetrics v0.0.0-20240312152122-5f08fbb34913 h1:+qGGcbkzsfDQNPPe9UDgpxAWQrhbbBXOYJFQDq/dtJw=
github.com/xrash/smetrics v0.0.0-20240312152122-5f08fbb34913/go.mod h1:4aEEwZQutDLsQv2Deui4iYQ6DWTxR14g6m8Wv88+Xqk=
github.com/yuin/goldmark v1.4.13/go.mod h1:6yULJ656Px+3vBD8DxQVa3kxgyrAnzto9xy5taEt/CY=
github.com/stretchr/testify v1.7.0/go.mod h1:6Fq8oRcR53rry900zMqJjRRixrwX3KX962/h/Wwjteg=
github.com/tklauser/go-sysconf v0.3.9 h1:JeUVdAOWhhxVcU6Eqr/ATFHgXk/mmiItdKeJPev3vTo=
github.com/tklauser/go-sysconf v0.3.9/go.mod h1:11DU/5sG7UexIrp/O6g35hrWzu0JxlwQ3LSFUzyeuhs=
github.com/tklauser/numcpus v0.3.0 h1:ILuRUQBtssgnxw0XXIjKUC56fgnOrFoQQ/4+DeU2biQ=
github.com/tklauser/numcpus v0.3.0/go.mod h1:yFGUr7TUHQRAhyqBcEg0Ge34zDBAsIvJJcyE6boqnA8=
github.com/xlab/treeprint v1.1.0 h1:G/1DjNkPpfZCFt9CSh6b5/nY4VimlbHF3Rh4obvtzDk=
github.com/xlab/treeprint v1.1.0/go.mod h1:gj5Gd3gPdKtR1ikdDK6fnFLdmIS0X30kTTuNd/WEJu0=
golang.org/x/crypto v0.0.0-20190308221718-c2843e01d9a2/go.mod h1:djNgcEr1/C05ACkg1iLfiJU5Ep61QUkGW8qpdssI0+w=
golang.org/x/crypto v0.0.0-20210921155107-089bfa567519/go.mod h1:GvvjBRRGRdwPK5ydBHafDWAxML/pGHZbMvKqRZ5+Abc=
golang.org/x/crypto v0.24.0 h1:mnl8DM0o513X8fdIkmyFE/5hTYxbwYOjDS/+rK6qpRI=
golang.org/x/crypto v0.24.0/go.mod h1:Z1PMYSOR5nyMcyAVAIQSKCDwalqy85Aqn1x3Ws4L5DM=
golang.org/x/mod v0.6.0-dev.0.20220419223038-86c51ed26bb4/go.mod h1:jJ57K6gSWd91VN4djpZkiMVwK6gcyfeH4XE8wZrZaV4=
golang.org/x/mod v0.18.0 h1:5+9lSbEzPSdWkH32vYPBwEpX8KwDbM52Ud9xBUvNlb0=
golang.org/x/mod v0.18.0/go.mod h1:hTbmBsO62+eylJbnUtE2MGJUyE7QWk4xUqPFrRgJ+7c=
golang.org/x/net v0.0.0-20190620200207-3b0461eec859/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s=
golang.org/x/crypto v0.0.0-20200323165209-0ec3e9974c59/go.mod h1:LzIPMQfyMNhhGPhUkYOs5KpL4U8rLKemX1yGLhDgUto=
golang.org/x/crypto v0.0.0-20210314154223-e6e6c4f2bb5b h1:wSOdpTq0/eI46Ez/LkDwIsAKA71YP2SRKBODiRWM0as=
golang.org/x/crypto v0.0.0-20210314154223-e6e6c4f2bb5b/go.mod h1:T9bdIzuCu7OtxOm1hfPfRQxPLYneinmdGuTeoZ9dtd4=
golang.org/x/net v0.0.0-20190404232315-eb5bcb51f2a3/go.mod h1:t9HGtf8HONx5eT2rtn7q6eTqICYqUVnKs3thJo3Qplg=
golang.org/x/net v0.0.0-20210226172049-e18ecbb05110/go.mod h1:m0MpNAwzfU5UDzcl9v0D8zg8gWTRqZa9RBIspLL5mdg=
golang.org/x/net v0.0.0-20220722155237-a158d28d115b/go.mod h1:XRhObCWvk6IyKnWLug+ECip1KBveYUHfp+8e9klMJ9c=
golang.org/x/net v0.7.0/go.mod h1:2Tu9+aMcznHK/AK1HMvgo6xiTLG5rD5rZLDS+rp2Bjs=
golang.org/x/net v0.26.0 h1:soB7SVo0PWrY4vPW/+ay0jKDNScG2X9wFeYlXIvJsOQ=
golang.org/x/net v0.26.0/go.mod h1:5YKkiSynbBIh3p6iOc/vibscux0x38BZDkn8sCUPxHE=
golang.org/x/sync v0.0.0-20190423024810-112230192c58/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
golang.org/x/sync v0.0.0-20220722155255-886fb9371eb4/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
golang.org/x/sync v0.7.0 h1:YsImfSBoP9QPYL0xyKJPq0gcaJdG3rInoqxTWbfQu9M=
golang.org/x/sync v0.7.0/go.mod h1:Czt+wKu1gCyEFDUtn0jG5QVvpJ6rzVqr5aXyt9drQfk=
golang.org/x/sys v0.0.0-20190130150945-aca44879d564/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY=
golang.org/x/sys v0.0.0-20190215142949-d0b11bdaac8a/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY=
golang.org/x/sys v0.0.0-20190412213103-97732733099d/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
golang.org/x/sys v0.0.0-20190916202348-b4ddaad3f8a3/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
golang.org/x/sys v0.0.0-20201119102817-f84b799fce68 h1:nxC68pudNYkKU6jWhgrqdreuFiOQWj1Fs7T3VrH4Pjw=
golang.org/x/sys v0.0.0-20201119102817-f84b799fce68/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
golang.org/x/sys v0.0.0-20210615035016-665e8c7367d1/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
golang.org/x/sys v0.0.0-20220520151302-bc2c85ada10a/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
golang.org/x/sys v0.0.0-20220722155257-8c9f86f7a55f/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
golang.org/x/sys v0.5.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
golang.org/x/sys v0.21.0 h1:rF+pYz3DAGSQAxAu1CbC7catZg4ebC4UIeIhKxBZvws=
golang.org/x/sys v0.21.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA=
golang.org/x/sys v0.0.0-20210816074244-15123e1e1f71/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
golang.org/x/sys v0.0.0-20210902050250-f475640dd07b h1:S7hKs0Flbq0bbc9xgYt4stIEG1zNDFqyrPwAX2Wj/sE=
golang.org/x/sys v0.0.0-20210902050250-f475640dd07b/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
golang.org/x/term v0.0.0-20201126162022-7de9c90e9dd1/go.mod h1:bj7SfCRtBDWHUb9snDiAeCFNEtKQo2Wmx5Cou7ajbmo=
golang.org/x/term v0.0.0-20210927222741-03fcf44c2211/go.mod h1:jbD1KX2456YbFQfuXm/mYQcufACuNUgVhRMnK/tPxf8=
golang.org/x/term v0.5.0/go.mod h1:jMB1sMXY+tzblOD4FWmEbocvup2/aLOaQEp7JmGp78k=
golang.org/x/text v0.3.0/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ=
golang.org/x/text v0.3.3/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ=
golang.org/x/text v0.3.7/go.mod h1:u+2+/6zg+i71rQMx5EYifcz6MCKuco9NR6JIITiCfzQ=
golang.org/x/text v0.7.0/go.mod h1:mrYo+phRRbMaCq/xk9113O4dZlRixOauAjOtrjsXDZ8=
golang.org/x/text v0.16.0 h1:a94ExnEXNtEwYLGJSIUxnWoxoRz/ZcCsV63ROupILh4=
golang.org/x/text v0.16.0/go.mod h1:GhwF1Be+LQoKShO3cGOHzqOgRrGaYc9AvblQOmPVHnI=
golang.org/x/time v0.0.0-20200416051211-89c76fbcd5d1 h1:NusfzzA6yGQ+ua51ck7E3omNUX/JuqbFSaRGqU8CcLI=
golang.org/x/time v0.0.0-20200416051211-89c76fbcd5d1/go.mod h1:tRJNPiyCQ0inRvYxbN9jk5I+vvW/OXSQhTDSoE431IQ=
golang.org/x/tools v0.0.0-20180917221912-90fa682c2a6e/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ=
golang.org/x/tools v0.0.0-20191119224855-298f0cb1881e/go.mod h1:b+2E5dAYhXwXZwtnZ6UAqBI28+e2cm9otk0dWdXHAEo=
golang.org/x/tools v0.1.12/go.mod h1:hNGJHUnrk76NpqgfD5Aqm5Crs+Hm0VOH/i9J2+nxYbc=
golang.org/x/tools v0.22.0 h1:gqSGLZqv+AI9lIQzniJ0nZDRG5GBPsSi+DRNHWNz6yA=
golang.org/x/tools v0.22.0/go.mod h1:aCwcsjqvq7Yqt6TNyX7QMU2enbQ/Gt0bo6krSeEri+c=
golang.org/x/xerrors v0.0.0-20190717185122-a985d3407aa7/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0=
golang.org/x/xerrors v0.0.0-20191204190536-9bdfabe68543/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0=
google.golang.org/protobuf v0.0.0-20200109180630-ec00e32a8dfd/go.mod h1:DFci5gLYBciE7Vtevhsrf46CRTquxDuWsQurQQe4oz8=
google.golang.org/protobuf v0.0.0-20200221191635-4d8936d0db64/go.mod h1:kwYJMbMJ01Woi6D6+Kah6886xMZcty6N08ah7+eCXa0=
google.golang.org/protobuf v0.0.0-20200228230310-ab0ca4ff8a60/go.mod h1:cfTl7dwQJ+fmap5saPgwCLgHXTUD7jkjRqWcaiX5VyM=
google.golang.org/protobuf v1.20.1-0.20200309200217-e05f789c0967/go.mod h1:A+miEFZTKqfCUM6K7xSMQL9OKL/b6hQv+e19PK+JZNE=
google.golang.org/protobuf v1.21.0/go.mod h1:47Nbq4nVaFHyn7ilMalzfO3qCViNmqZ2kzikPIcrTAo=
google.golang.org/protobuf v1.23.0/go.mod h1:EGpADcykh3NcUnDUJcl1+ZksZNG86OlYog2l/sGQquU=
google.golang.org/protobuf v1.26.0-rc.1/go.mod h1:jlhhOSvTdKEhbULTjvd4ARK9grFBp09yW+WbY/TyQbw=
google.golang.org/protobuf v1.26.0 h1:bxAC2xTBsZGibn2RTntX0oH50xLsqy1OxA9tTL3p/lk=
google.golang.org/protobuf v1.26.0/go.mod h1:9q0QmTI4eRPtz6boOQmLYwt+qCgq0jsYwAQnmE0givc=
gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0=
gopkg.in/check.v1 v1.0.0-20200227125254-8fa46927fb4f/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0=
gopkg.in/check.v1 v1.0.0-20201130134442-10cb98267c6c h1:Hei/4ADfdWqJk1ZMxUNpqntNwaWcugrBjAiHlqqRiVk=
gopkg.in/check.v1 v1.0.0-20201130134442-10cb98267c6c/go.mod h1:JHkPIbrfpd72SG/EVd6muEfDQjcINNoR0C8j2r3qZ4Q=
gopkg.in/yaml.v3 v3.0.0-20200313102051-9f266ea9e77c/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM=
gopkg.in/yaml.v3 v3.0.0-20200615113413-eeeca48fe776/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM=
gopkg.in/yaml.v3 v3.0.1 h1:fxVm/GzAzEWqLHuvctI91KS9hhNmmWOoWu0XTYJS7CA=
gopkg.in/yaml.v3 v3.0.1/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM=
sigs.k8s.io/yaml v1.4.0 h1:Mk1wCc2gy/F0THH0TAp1QYyJNzRm2KCLy3o5ASXVI5E=
sigs.k8s.io/yaml v1.4.0/go.mod h1:Ejl7/uTz7PSA4eKMyQCUTnhZYNmLIl+5c2lQPGR2BPY=
rsc.io/goversion v1.2.0 h1:SPn+NLTiAG7w30IRK/DKp1BjvpWabYgxlLp/+kx5J8w=
rsc.io/goversion v1.2.0/go.mod h1:Eih9y/uIBS3ulggl7KNJ09xGSLcuNaLgmvvqa07sgfo=

View File

@@ -1,448 +0,0 @@
// Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
// All rights reserved.
// Use of this source code is governed by a MIT-style
// license that can be found in the LICENSE file.
package api
import (
"bufio"
"encoding/json"
"errors"
"fmt"
"io"
"log"
"math"
"net/http"
"strconv"
"strings"
"github.com/ClusterCockpit/cc-metric-store/internal/memorystore"
"github.com/ClusterCockpit/cc-metric-store/internal/util"
"github.com/influxdata/line-protocol/v2/lineprotocol"
)
// @title cc-metric-store REST API
// @version 1.0.0
// @description API for cc-metric-store
// @contact.name ClusterCockpit Project
// @contact.url https://clustercockpit.org
// @contact.email support@clustercockpit.org
// @license.name MIT License
// @license.url https://opensource.org/licenses/MIT
// @host localhost:8082
// @basePath /api/
// @securityDefinitions.apikey ApiKeyAuth
// @in header
// @name X-Auth-Token
// ErrorResponse model
type ErrorResponse struct {
// Status text of the HTTP status code
Status string `json:"status"`
Error string `json:"error"` // Error Message
}
type ApiMetricData struct {
Error *string `json:"error,omitempty"`
Data util.FloatArray `json:"data,omitempty"`
From int64 `json:"from"`
To int64 `json:"to"`
Resolution int64 `json:"resolution"`
Avg util.Float `json:"avg"`
Min util.Float `json:"min"`
Max util.Float `json:"max"`
}
func handleError(err error, statusCode int, rw http.ResponseWriter) {
// log.Warnf("REST ERROR : %s", err.Error())
rw.Header().Add("Content-Type", "application/json")
rw.WriteHeader(statusCode)
json.NewEncoder(rw).Encode(ErrorResponse{
Status: http.StatusText(statusCode),
Error: err.Error(),
})
}
// TODO: Optimize this, just like the stats endpoint!
func (data *ApiMetricData) AddStats() {
n := 0
sum, min, max := 0.0, math.MaxFloat64, -math.MaxFloat64
for _, x := range data.Data {
if x.IsNaN() {
continue
}
n += 1
sum += float64(x)
min = math.Min(min, float64(x))
max = math.Max(max, float64(x))
}
if n > 0 {
avg := sum / float64(n)
data.Avg = util.Float(avg)
data.Min = util.Float(min)
data.Max = util.Float(max)
} else {
data.Avg, data.Min, data.Max = util.NaN, util.NaN, util.NaN
}
}
func (data *ApiMetricData) ScaleBy(f util.Float) {
if f == 0 || f == 1 {
return
}
data.Avg *= f
data.Min *= f
data.Max *= f
for i := 0; i < len(data.Data); i++ {
data.Data[i] *= f
}
}
func (data *ApiMetricData) PadDataWithNull(ms *memorystore.MemoryStore, from, to int64, metric string) {
minfo, ok := ms.Metrics[metric]
if !ok {
return
}
if (data.From / minfo.Frequency) > (from / minfo.Frequency) {
padfront := int((data.From / minfo.Frequency) - (from / minfo.Frequency))
ndata := make([]util.Float, 0, padfront+len(data.Data))
for i := 0; i < padfront; i++ {
ndata = append(ndata, util.NaN)
}
for j := 0; j < len(data.Data); j++ {
ndata = append(ndata, data.Data[j])
}
data.Data = ndata
}
}
// handleFree godoc
// @summary Free buffers
// @tags free
// @description This endpoint allows users to free buffers in the
// metric store up to a given timestamp. Selectors in the request body
// control which buffers are freed, so data below a node can be pruned
// without removing the whole node.
// @produce json
// @param to query string true "up to timestamp"
// @success 200 {string} string "ok"
// @failure 400 {object} api.ErrorResponse "Bad Request"
// @failure 401 {object} api.ErrorResponse "Unauthorized"
// @failure 403 {object} api.ErrorResponse "Forbidden"
// @failure 500 {object} api.ErrorResponse "Internal Server Error"
// @security ApiKeyAuth
// @router /free/ [post]
func handleFree(rw http.ResponseWriter, r *http.Request) {
rawTo := r.URL.Query().Get("to")
if rawTo == "" {
handleError(errors.New("'to' is a required query parameter"), http.StatusBadRequest, rw)
return
}
to, err := strconv.ParseInt(rawTo, 10, 64)
if err != nil {
handleError(err, http.StatusBadRequest, rw)
return
}
// // TODO: lastCheckpoint might be modified by different go-routines.
// // Load it using the sync/atomic package?
// freeUpTo := lastCheckpoint.Unix()
// if to < freeUpTo {
// freeUpTo = to
// }
bodyDec := json.NewDecoder(r.Body)
var selectors [][]string
err = bodyDec.Decode(&selectors)
if err != nil {
http.Error(rw, err.Error(), http.StatusBadRequest)
return
}
ms := memorystore.GetMemoryStore()
n := 0
for _, sel := range selectors {
bn, err := ms.Free(sel, to)
if err != nil {
handleError(err, http.StatusInternalServerError, rw)
return
}
n += bn
}
rw.WriteHeader(http.StatusOK)
fmt.Fprintf(rw, "buffers freed: %d\n", n)
}
// handleWrite godoc
// @summary Receive metrics in InfluxDB line-protocol
// @tags write
// @description Write data to the in-memory store in the InfluxDB line-protocol using [this format](https://github.com/ClusterCockpit/cc-specifications/blob/master/metrics/lineprotocol_alternative.md)
// @accept plain
// @produce json
// @param cluster query string false "If the lines in the body do not have a cluster tag, use this value instead."
// @success 200 {string} string "ok"
// @failure 400 {object} api.ErrorResponse "Bad Request"
// @failure 401 {object} api.ErrorResponse "Unauthorized"
// @failure 403 {object} api.ErrorResponse "Forbidden"
// @failure 500 {object} api.ErrorResponse "Internal Server Error"
// @security ApiKeyAuth
// @router /write/ [post]
func handleWrite(rw http.ResponseWriter, r *http.Request) {
bytes, err := io.ReadAll(r.Body)
rw.Header().Add("Content-Type", "application/json")
if err != nil {
handleError(err, http.StatusInternalServerError, rw)
return
}
ms := memorystore.GetMemoryStore()
dec := lineprotocol.NewDecoderWithBytes(bytes)
if err := decodeLine(dec, ms, r.URL.Query().Get("cluster")); err != nil {
log.Printf("/api/write error: %s", err.Error())
handleError(err, http.StatusBadRequest, rw)
return
}
rw.WriteHeader(http.StatusOK)
}
type ApiQueryRequest struct {
Cluster string `json:"cluster"`
Queries []ApiQuery `json:"queries"`
ForAllNodes []string `json:"for-all-nodes"`
From int64 `json:"from"`
To int64 `json:"to"`
WithStats bool `json:"with-stats"`
WithData bool `json:"with-data"`
WithPadding bool `json:"with-padding"`
}
type ApiQueryResponse struct {
Queries []ApiQuery `json:"queries,omitempty"`
Results [][]ApiMetricData `json:"results"`
}
type ApiQuery struct {
Type *string `json:"type,omitempty"`
SubType *string `json:"subtype,omitempty"`
Metric string `json:"metric"`
Hostname string `json:"host"`
Resolution int64 `json:"resolution"`
TypeIds []string `json:"type-ids,omitempty"`
SubTypeIds []string `json:"subtype-ids,omitempty"`
ScaleFactor util.Float `json:"scale-by,omitempty"`
Aggregate bool `json:"aggreg"`
}
// handleQuery godoc
// @summary Query metrics
// @tags query
// @description This endpoint allows users to retrieve data from the
// in-memory database. The CCMS returns data in JSON format for the
// interval requested by the user.
// @accept json
// @produce json
// @param request body api.ApiQueryRequest true "API query payload object"
// @success 200 {object} api.ApiQueryResponse "API query response object"
// @failure 400 {object} api.ErrorResponse "Bad Request"
// @failure 401 {object} api.ErrorResponse "Unauthorized"
// @failure 403 {object} api.ErrorResponse "Forbidden"
// @failure 500 {object} api.ErrorResponse "Internal Server Error"
// @security ApiKeyAuth
// @router /query/ [get]
func handleQuery(rw http.ResponseWriter, r *http.Request) {
var err error
ver := r.URL.Query().Get("version")
if ver == "" {
ver = "v2"
}
req := ApiQueryRequest{WithStats: true, WithData: true, WithPadding: true}
if err := json.NewDecoder(r.Body).Decode(&req); err != nil {
handleError(err, http.StatusBadRequest, rw)
return
}
ms := memorystore.GetMemoryStore()
response := ApiQueryResponse{
Results: make([][]ApiMetricData, 0, len(req.Queries)),
}
if req.ForAllNodes != nil {
nodes := ms.ListChildren([]string{req.Cluster})
for _, node := range nodes {
for _, metric := range req.ForAllNodes {
q := ApiQuery{
Metric: metric,
Hostname: node,
}
req.Queries = append(req.Queries, q)
response.Queries = append(response.Queries, q)
}
}
}
for _, query := range req.Queries {
sels := make([]util.Selector, 0, 1)
if query.Aggregate || query.Type == nil {
sel := util.Selector{{String: req.Cluster}, {String: query.Hostname}}
if query.Type != nil {
if len(query.TypeIds) == 1 {
sel = append(sel, util.SelectorElement{String: *query.Type + query.TypeIds[0]})
} else {
ids := make([]string, len(query.TypeIds))
for i, id := range query.TypeIds {
ids[i] = *query.Type + id
}
sel = append(sel, util.SelectorElement{Group: ids})
}
if query.SubType != nil {
if len(query.SubTypeIds) == 1 {
sel = append(sel, util.SelectorElement{String: *query.SubType + query.SubTypeIds[0]})
} else {
ids := make([]string, len(query.SubTypeIds))
for i, id := range query.SubTypeIds {
ids[i] = *query.SubType + id
}
sel = append(sel, util.SelectorElement{Group: ids})
}
}
}
sels = append(sels, sel)
} else {
for _, typeId := range query.TypeIds {
if query.SubType != nil {
for _, subTypeId := range query.SubTypeIds {
sels = append(sels, util.Selector{
{String: req.Cluster},
{String: query.Hostname},
{String: *query.Type + typeId},
{String: *query.SubType + subTypeId},
})
}
} else {
sels = append(sels, util.Selector{
{String: req.Cluster},
{String: query.Hostname},
{String: *query.Type + typeId},
})
}
}
}
// log.Printf("query: %#v\n", query)
// log.Printf("sels: %#v\n", sels)
res := make([]ApiMetricData, 0, len(sels))
for _, sel := range sels {
data := ApiMetricData{}
if ver == "v1" {
data.Data, data.From, data.To, data.Resolution, err = ms.Read(sel, query.Metric, req.From, req.To, 0)
} else {
data.Data, data.From, data.To, data.Resolution, err = ms.Read(sel, query.Metric, req.From, req.To, query.Resolution)
}
if err != nil {
msg := err.Error()
data.Error = &msg
res = append(res, data)
continue
}
if req.WithStats {
data.AddStats()
}
if query.ScaleFactor != 0 {
data.ScaleBy(query.ScaleFactor)
}
if req.WithPadding {
data.PadDataWithNull(ms, req.From, req.To, query.Metric)
}
if !req.WithData {
data.Data = nil
}
res = append(res, data)
}
response.Results = append(response.Results, res)
}
rw.Header().Set("Content-Type", "application/json")
bw := bufio.NewWriter(rw)
defer bw.Flush()
if err := json.NewEncoder(bw).Encode(response); err != nil {
log.Printf("handleQuery Response Encode Error: %s", err.Error())
return
}
}
// handleDebug godoc
// @summary Debug endpoint
// @tags debug
// @description This endpoint allows the users to print the content of
// nodes/clusters/metrics to review the state of the data.
// @produce json
// @param selector query string false "Selector"
// @success 200 {string} string "Debug dump"
// @failure 400 {object} api.ErrorResponse "Bad Request"
// @failure 401 {object} api.ErrorResponse "Unauthorized"
// @failure 403 {object} api.ErrorResponse "Forbidden"
// @failure 500 {object} api.ErrorResponse "Internal Server Error"
// @security ApiKeyAuth
// @router /debug/ [post]
func handleDebug(rw http.ResponseWriter, r *http.Request) {
raw := r.URL.Query().Get("selector")
rw.Header().Add("Content-Type", "application/json")
selector := []string{}
if len(raw) != 0 {
selector = strings.Split(raw, ":")
}
ms := memorystore.GetMemoryStore()
if err := ms.DebugDump(bufio.NewWriter(rw), selector); err != nil {
handleError(err, http.StatusBadRequest, rw)
return
}
}
// handleHealthCheck godoc
// @summary HealthCheck endpoint
// @tags healthcheck
// @description This endpoint allows the users to check if a node is healthy
// @produce json
// @param selector query string false "Selector"
// @success 200 {string} string "Debug dump"
// @failure 400 {object} api.ErrorResponse "Bad Request"
// @failure 401 {object} api.ErrorResponse "Unauthorized"
// @failure 403 {object} api.ErrorResponse "Forbidden"
// @failure 500 {object} api.ErrorResponse "Internal Server Error"
// @security ApiKeyAuth
// @router /healthcheck/ [get]
func handleHealthCheck(rw http.ResponseWriter, r *http.Request) {
rawCluster := r.URL.Query().Get("cluster")
rawNode := r.URL.Query().Get("node")
if rawCluster == "" || rawNode == "" {
handleError(errors.New("'cluster' and 'node' are required query parameter"), http.StatusBadRequest, rw)
return
}
rw.Header().Add("Content-Type", "application/json")
selector := []string{rawCluster, rawNode}
ms := memorystore.GetMemoryStore()
if err := ms.HealthCheck(bufio.NewWriter(rw), selector); err != nil {
handleError(err, http.StatusBadRequest, rw)
return
}
}
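
As an aside on the query API above, here is a minimal client sketch (not part of the original sources; host, port, cluster, node name, timestamps and token are placeholders) showing the shape of an ApiQueryRequest body sent to the /api/query/ endpoint:

package main

import (
	"bytes"
	"fmt"
	"io"
	"net/http"
)

func main() {
	// Mirrors api.ApiQueryRequest: one node-level query for one metric.
	body := []byte(`{
		"cluster": "testcluster",
		"from": 1692700000,
		"to": 1692703600,
		"queries": [
			{ "metric": "flops_any", "host": "node001", "resolution": 60 }
		]
	}`)
	// handleQuery is mounted as "GET /api/query/" and reads the request body.
	req, err := http.NewRequest(http.MethodGet, "http://localhost:8082/api/query/", bytes.NewReader(body))
	if err != nil {
		panic(err)
	}
	// Only needed if a jwt-public-key is configured on the server.
	req.Header.Set("Authorization", "Bearer <JWT>")
	resp, err := http.DefaultClient.Do(req)
	if err != nil {
		panic(err)
	}
	defer resp.Body.Close()
	out, _ := io.ReadAll(resp.Body)
	fmt.Println(resp.Status, string(out))
}

The response mirrors ApiQueryResponse: one []ApiMetricData slice per query, with avg/min/max filled in because with-stats defaults to true.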

View File

@@ -1,56 +0,0 @@
package api
import (
"crypto/ed25519"
"errors"
"net/http"
"strings"
"sync"
"github.com/golang-jwt/jwt/v4"
)
func authHandler(next http.Handler, publicKey ed25519.PublicKey) http.Handler {
cacheLock := sync.RWMutex{}
cache := map[string]*jwt.Token{}
return http.HandlerFunc(func(rw http.ResponseWriter, r *http.Request) {
authheader := r.Header.Get("Authorization")
if authheader == "" || !strings.HasPrefix(authheader, "Bearer ") {
http.Error(rw, "Use JWT Authentication", http.StatusUnauthorized)
return
}
rawtoken := authheader[len("Bearer "):]
cacheLock.RLock()
token, ok := cache[rawtoken]
cacheLock.RUnlock()
if ok && token.Claims.Valid() == nil {
next.ServeHTTP(rw, r)
return
}
// The parsed claims are not inspected beyond validity: if expiration
// and similar fields are set, the Parse function already returns an
// error for expired tokens.
var err error
token, err = jwt.Parse(rawtoken, func(t *jwt.Token) (interface{}, error) {
if t.Method != jwt.SigningMethodEdDSA {
return nil, errors.New("only Ed25519/EdDSA supported")
}
return publicKey, nil
})
if err != nil {
http.Error(rw, err.Error(), http.StatusUnauthorized)
return
}
cacheLock.Lock()
cache[rawtoken] = token
cacheLock.Unlock()
// Let request through...
next.ServeHTTP(rw, r)
})
}
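
Since authHandler only accepts Ed25519/EdDSA-signed bearer tokens, here is a hedged sketch of producing a matching key pair and token with github.com/golang-jwt/jwt/v4 (the claims shown are illustrative; the handler checks validity but otherwise ignores their content):

package main

import (
	"crypto/ed25519"
	"crypto/rand"
	"encoding/base64"
	"fmt"
	"time"

	"github.com/golang-jwt/jwt/v4"
)

func main() {
	pub, priv, err := ed25519.GenerateKey(rand.Reader)
	if err != nil {
		panic(err)
	}
	// The server side expects the public key base64-encoded
	// (see the jwt-public-key handling in MountRoutes below).
	fmt.Println("jwt-public-key:", base64.StdEncoding.EncodeToString(pub))
	token := jwt.NewWithClaims(jwt.SigningMethodEdDSA, jwt.MapClaims{
		"exp": time.Now().Add(time.Hour).Unix(), // expiry is enforced by jwt.Parse
	})
	signed, err := token.SignedString(priv)
	if err != nil {
		panic(err)
	}
	fmt.Println("Authorization: Bearer " + signed)
}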

View File

@@ -1,480 +0,0 @@
// Package api Code generated by swaggo/swag. DO NOT EDIT
package api
import "github.com/swaggo/swag"
const docTemplate = `{
"schemes": {{ marshal .Schemes }},
"swagger": "2.0",
"info": {
"description": "{{escape .Description}}",
"title": "{{.Title}}",
"contact": {
"name": "ClusterCockpit Project",
"url": "https://clustercockpit.org",
"email": "support@clustercockpit.org"
},
"license": {
"name": "MIT License",
"url": "https://opensource.org/licenses/MIT"
},
"version": "{{.Version}}"
},
"host": "{{.Host}}",
"basePath": "{{.BasePath}}",
"paths": {
"/debug/": {
"post": {
"security": [
{
"ApiKeyAuth": []
}
],
"description": "This endpoint allows the users to print the content of",
"produces": [
"application/json"
],
"tags": [
"debug"
],
"summary": "Debug endpoint",
"parameters": [
{
"type": "string",
"description": "Selector",
"name": "selector",
"in": "query"
}
],
"responses": {
"200": {
"description": "Debug dump",
"schema": {
"type": "string"
}
},
"400": {
"description": "Bad Request",
"schema": {
"$ref": "#/definitions/api.ErrorResponse"
}
},
"401": {
"description": "Unauthorized",
"schema": {
"$ref": "#/definitions/api.ErrorResponse"
}
},
"403": {
"description": "Forbidden",
"schema": {
"$ref": "#/definitions/api.ErrorResponse"
}
},
"500": {
"description": "Internal Server Error",
"schema": {
"$ref": "#/definitions/api.ErrorResponse"
}
}
}
}
},
"/free/": {
"post": {
"security": [
{
"ApiKeyAuth": []
}
],
"description": "This endpoint allows the users to free the Buffers from the",
"produces": [
"application/json"
],
"tags": [
"free"
],
"parameters": [
{
"type": "string",
"description": "up to timestamp",
"name": "to",
"in": "query"
}
],
"responses": {
"200": {
"description": "ok",
"schema": {
"type": "string"
}
},
"400": {
"description": "Bad Request",
"schema": {
"$ref": "#/definitions/api.ErrorResponse"
}
},
"401": {
"description": "Unauthorized",
"schema": {
"$ref": "#/definitions/api.ErrorResponse"
}
},
"403": {
"description": "Forbidden",
"schema": {
"$ref": "#/definitions/api.ErrorResponse"
}
},
"500": {
"description": "Internal Server Error",
"schema": {
"$ref": "#/definitions/api.ErrorResponse"
}
}
}
}
},
"/healthcheck/": {
"get": {
"security": [
{
"ApiKeyAuth": []
}
],
"description": "This endpoint allows the users to check if a node is healthy",
"produces": [
"application/json"
],
"tags": [
"healthcheck"
],
"summary": "HealthCheck endpoint",
"parameters": [
{
"type": "string",
"description": "Selector",
"name": "selector",
"in": "query"
}
],
"responses": {
"200": {
"description": "Debug dump",
"schema": {
"type": "string"
}
},
"400": {
"description": "Bad Request",
"schema": {
"$ref": "#/definitions/api.ErrorResponse"
}
},
"401": {
"description": "Unauthorized",
"schema": {
"$ref": "#/definitions/api.ErrorResponse"
}
},
"403": {
"description": "Forbidden",
"schema": {
"$ref": "#/definitions/api.ErrorResponse"
}
},
"500": {
"description": "Internal Server Error",
"schema": {
"$ref": "#/definitions/api.ErrorResponse"
}
}
}
}
},
"/query/": {
"get": {
"security": [
{
"ApiKeyAuth": []
}
],
"description": "This endpoint allows the users to retrieve data from the",
"consumes": [
"application/json"
],
"produces": [
"application/json"
],
"tags": [
"query"
],
"summary": "Query metrics",
"parameters": [
{
"description": "API query payload object",
"name": "request",
"in": "body",
"required": true,
"schema": {
"$ref": "#/definitions/api.ApiQueryRequest"
}
}
],
"responses": {
"200": {
"description": "API query response object",
"schema": {
"$ref": "#/definitions/api.ApiQueryResponse"
}
},
"400": {
"description": "Bad Request",
"schema": {
"$ref": "#/definitions/api.ErrorResponse"
}
},
"401": {
"description": "Unauthorized",
"schema": {
"$ref": "#/definitions/api.ErrorResponse"
}
},
"403": {
"description": "Forbidden",
"schema": {
"$ref": "#/definitions/api.ErrorResponse"
}
},
"500": {
"description": "Internal Server Error",
"schema": {
"$ref": "#/definitions/api.ErrorResponse"
}
}
}
}
},
"/write/": {
"post": {
"security": [
{
"ApiKeyAuth": []
}
],
"consumes": [
"text/plain"
],
"produces": [
"application/json"
],
"parameters": [
{
"type": "string",
"description": "If the lines in the body do not have a cluster tag, use this value instead.",
"name": "cluster",
"in": "query"
}
],
"responses": {
"200": {
"description": "ok",
"schema": {
"type": "string"
}
},
"400": {
"description": "Bad Request",
"schema": {
"$ref": "#/definitions/api.ErrorResponse"
}
},
"401": {
"description": "Unauthorized",
"schema": {
"$ref": "#/definitions/api.ErrorResponse"
}
},
"403": {
"description": "Forbidden",
"schema": {
"$ref": "#/definitions/api.ErrorResponse"
}
},
"500": {
"description": "Internal Server Error",
"schema": {
"$ref": "#/definitions/api.ErrorResponse"
}
}
}
}
}
},
"definitions": {
"api.ApiMetricData": {
"type": "object",
"properties": {
"avg": {
"type": "number"
},
"data": {
"type": "array",
"items": {
"type": "number"
}
},
"error": {
"type": "string"
},
"from": {
"type": "integer"
},
"max": {
"type": "number"
},
"min": {
"type": "number"
},
"resolution": {
"type": "integer"
},
"to": {
"type": "integer"
}
}
},
"api.ApiQuery": {
"type": "object",
"properties": {
"aggreg": {
"type": "boolean"
},
"host": {
"type": "string"
},
"metric": {
"type": "string"
},
"resolution": {
"type": "integer"
},
"scale-by": {
"type": "number"
},
"subtype": {
"type": "string"
},
"subtype-ids": {
"type": "array",
"items": {
"type": "string"
}
},
"type": {
"type": "string"
},
"type-ids": {
"type": "array",
"items": {
"type": "string"
}
}
}
},
"api.ApiQueryRequest": {
"type": "object",
"properties": {
"cluster": {
"type": "string"
},
"for-all-nodes": {
"type": "array",
"items": {
"type": "string"
}
},
"from": {
"type": "integer"
},
"queries": {
"type": "array",
"items": {
"$ref": "#/definitions/api.ApiQuery"
}
},
"to": {
"type": "integer"
},
"with-data": {
"type": "boolean"
},
"with-padding": {
"type": "boolean"
},
"with-stats": {
"type": "boolean"
}
}
},
"api.ApiQueryResponse": {
"type": "object",
"properties": {
"queries": {
"type": "array",
"items": {
"$ref": "#/definitions/api.ApiQuery"
}
},
"results": {
"type": "array",
"items": {
"type": "array",
"items": {
"$ref": "#/definitions/api.ApiMetricData"
}
}
}
}
},
"api.ErrorResponse": {
"type": "object",
"properties": {
"error": {
"description": "Error Message",
"type": "string"
},
"status": {
"description": "Statustext of Errorcode",
"type": "string"
}
}
}
},
"securityDefinitions": {
"ApiKeyAuth": {
"type": "apiKey",
"name": "X-Auth-Token",
"in": "header"
}
}
}`
// SwaggerInfo holds exported Swagger Info so clients can modify it
var SwaggerInfo = &swag.Spec{
Version: "1.0.0",
Host: "localhost:8082",
BasePath: "/api/",
Schemes: []string{},
Title: "cc-metric-store REST API",
Description: "API for cc-metric-store",
InfoInstanceName: "swagger",
SwaggerTemplate: docTemplate,
LeftDelim: "{{",
RightDelim: "}}",
}
func init() {
swag.Register(SwaggerInfo.InstanceName(), SwaggerInfo)
}

View File

@@ -1,49 +0,0 @@
// Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
// All rights reserved.
// Use of this source code is governed by a MIT-style
// license that can be found in the LICENSE file.
package api
import (
"crypto/ed25519"
"encoding/base64"
"log"
"net/http"
"github.com/ClusterCockpit/cc-metric-store/internal/config"
)
func MountRoutes(r *http.ServeMux) {
if len(config.Keys.JwtPublicKey) > 0 {
buf, err := base64.StdEncoding.DecodeString(config.Keys.JwtPublicKey)
if err != nil {
log.Fatalf("starting server failed: %v", err)
}
publicKey := ed25519.PublicKey(buf)
// Compatibility
r.Handle("POST /api/free", authHandler(http.HandlerFunc(handleFree), publicKey))
r.Handle("POST /api/write", authHandler(http.HandlerFunc(handleWrite), publicKey))
r.Handle("GET /api/query", authHandler(http.HandlerFunc(handleQuery), publicKey))
r.Handle("GET /api/debug", authHandler(http.HandlerFunc(handleDebug), publicKey))
r.Handle("GET /api/healthcheck", authHandler(http.HandlerFunc(handleHealthCheck), publicKey))
// Refactor
r.Handle("POST /api/free/", authHandler(http.HandlerFunc(handleFree), publicKey))
r.Handle("POST /api/write/", authHandler(http.HandlerFunc(handleWrite), publicKey))
r.Handle("GET /api/query/", authHandler(http.HandlerFunc(handleQuery), publicKey))
r.Handle("GET /api/debug/", authHandler(http.HandlerFunc(handleDebug), publicKey))
r.Handle("GET /api/healthcheck/", authHandler(http.HandlerFunc(handleHealthCheck), publicKey))
} else {
// Compatibility
r.HandleFunc("POST /api/free", handleFree)
r.HandleFunc("POST /api/write", handleWrite)
r.HandleFunc("GET /api/query", handleQuery)
r.HandleFunc("GET /api/debug", handleDebug)
r.HandleFunc("GET /api/healthcheck", handleHealthCheck)
// Refactor
r.HandleFunc("POST /api/free/", handleFree)
r.HandleFunc("POST /api/write/", handleWrite)
r.HandleFunc("GET /api/query/", handleQuery)
r.HandleFunc("GET /api/debug/", handleDebug)
r.HandleFunc("GET /api/healthcheck/", handleHealthCheck)
}
}
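
For context, a minimal sketch of serving these routes from within the module (address and config path are placeholders):

package main

import (
	"log"
	"net/http"

	"github.com/ClusterCockpit/cc-metric-store/internal/api"
	"github.com/ClusterCockpit/cc-metric-store/internal/config"
)

func main() {
	// Load config.Keys first; MountRoutes reads the optional jwt-public-key.
	config.Init("config.json")
	mux := http.NewServeMux()
	api.MountRoutes(mux)
	log.Fatal(http.ListenAndServe("0.0.0.0:8082", mux))
}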

View File

@@ -1,474 +0,0 @@
package avro
import (
"bufio"
"encoding/json"
"errors"
"fmt"
"log"
"os"
"path"
"sort"
"strconv"
"strings"
"sync"
"sync/atomic"
"time"
"github.com/ClusterCockpit/cc-metric-store/internal/config"
"github.com/ClusterCockpit/cc-metric-store/internal/util"
"github.com/linkedin/goavro/v2"
)
var NumWorkers int = 4
var ErrNoNewData error = errors.New("no data in the pool")
func (as *AvroStore) ToCheckpoint(dir string, dumpAll bool) (int, error) {
levels := make([]*AvroLevel, 0)
selectors := make([][]string, 0)
as.root.lock.RLock()
// Cluster
for sel1, l1 := range as.root.children {
l1.lock.RLock()
// Node
for sel2, l2 := range l1.children {
l2.lock.RLock()
// Frequency
for sel3, l3 := range l2.children {
levels = append(levels, l3)
selectors = append(selectors, []string{sel1, sel2, sel3})
}
l2.lock.RUnlock()
}
l1.lock.RUnlock()
}
as.root.lock.RUnlock()
type workItem struct {
level *AvroLevel
dir string
selector []string
}
n, errs := int32(0), int32(0)
var wg sync.WaitGroup
wg.Add(NumWorkers)
work := make(chan workItem, NumWorkers*2)
for range NumWorkers {
go func() {
defer wg.Done()
for workItem := range work {
var from int64 = getTimestamp(workItem.dir)
if err := workItem.level.toCheckpoint(workItem.dir, from, dumpAll); err != nil {
if err == ErrNoNewData {
continue
}
log.Printf("error while checkpointing %#v: %s", workItem.selector, err.Error())
atomic.AddInt32(&errs, 1)
} else {
atomic.AddInt32(&n, 1)
}
}
}()
}
for i := range len(levels) {
dir := path.Join(dir, path.Join(selectors[i]...))
work <- workItem{
level: levels[i],
dir: dir,
selector: selectors[i],
}
}
close(work)
wg.Wait()
if errs > 0 {
return int(n), fmt.Errorf("%d errors happend while creating avro checkpoints (%d successes)", errs, n)
}
return int(n), nil
}
// getTimestamp returns the timestamp to resume from for the given directory.
func getTimestamp(dir string) int64 {
// The directory name encodes the resolution; existing Avro files are
// named <resolution>_<epoch>.avro. Scan the parent directory and return
// the maximum epoch timestamp among matching files.
resolution := path.Base(dir)
dir = path.Dir(dir)
files, err := os.ReadDir(dir)
if err != nil {
return 0
}
var maxTs int64 = 0
if len(files) == 0 {
return 0
}
for _, file := range files {
if file.IsDir() {
continue
}
name := file.Name()
if len(name) < 5 || !strings.HasSuffix(name, ".avro") || !strings.HasPrefix(name, resolution+"_") {
continue
}
ts, err := strconv.ParseInt(name[strings.Index(name, "_")+1:len(name)-5], 10, 64)
if err != nil {
fmt.Printf("error while parsing timestamp: %s\n", err.Error())
continue
}
if ts > maxTs {
maxTs = ts
}
}
interval, _ := time.ParseDuration(config.Keys.Checkpoints.Interval)
// If the newest file is older than one checkpoint interval plus the
// buffer window, start a new file instead of appending to it.
updateTime := time.Unix(maxTs, 0).Add(interval).Add(time.Duration(CheckpointBufferMinutes-1) * time.Minute).Unix()
if updateTime < time.Now().Unix() {
return 0
}
return maxTs
}
func (l *AvroLevel) toCheckpoint(dir string, from int64, dumpAll bool) error {
l.lock.Lock()
defer l.lock.Unlock()
// fmt.Printf("Checkpointing directory: %s\n", dir)
// filepath contains the resolution
int_res, _ := strconv.Atoi(path.Base(dir))
// Find the smallest timestamp in l.data that still holds data.
var minTs int64 = int64(1<<63 - 1)
for ts, dat := range l.data {
if ts < minTs && len(dat) != 0 {
minTs = ts
}
}
if from == 0 && minTs != int64(1<<63-1) {
from = minTs
}
if from == 0 {
return ErrNoNewData
}
var schema string
var codec *goavro.Codec
record_list := make([]map[string]interface{}, 0)
var f *os.File
filePath := dir + fmt.Sprintf("_%d.avro", from)
var err error
fp_, err_ := os.Stat(filePath)
if errors.Is(err_, os.ErrNotExist) {
err = os.MkdirAll(path.Dir(dir), 0o755)
if err != nil {
return fmt.Errorf("failed to create directory: %v", err)
}
} else if fp_.Size() != 0 {
f, err = os.Open(filePath)
if err != nil {
return fmt.Errorf("failed to open existing avro file: %v", err)
}
br := bufio.NewReader(f)
reader, err := goavro.NewOCFReader(br)
if err != nil {
return fmt.Errorf("failed to create OCF reader: %v", err)
}
codec = reader.Codec()
schema = codec.Schema()
f.Close()
}
time_ref := time.Now().Add(time.Duration(-CheckpointBufferMinutes+1) * time.Minute).Unix()
if dumpAll {
time_ref = time.Now().Unix()
}
// Empty values
if len(l.data) == 0 {
// we checkpoint avro files every 60 seconds
repeat := 60 / int_res
for range repeat {
record_list = append(record_list, make(map[string]interface{}))
}
}
readFlag := true
for ts := range l.data {
flag := false
if ts < time_ref {
data := l.data[ts]
schema_gen, err := generateSchema(data)
if err != nil {
return err
}
flag, schema, err = compareSchema(schema, schema_gen)
if err != nil {
return fmt.Errorf("failed to compare read and generated schema: %v", err)
}
if flag && readFlag && !errors.Is(err_, os.ErrNotExist) {
f.Close()
f, err = os.Open(filePath)
if err != nil {
return fmt.Errorf("failed to open Avro file: %v", err)
}
br := bufio.NewReader(f)
ocfReader, err := goavro.NewOCFReader(br)
if err != nil {
return fmt.Errorf("failed to create OCF reader while changing schema: %v", err)
}
for ocfReader.Scan() {
record, err := ocfReader.Read()
if err != nil {
return fmt.Errorf("failed to read record: %v", err)
}
record_list = append(record_list, record.(map[string]interface{}))
}
f.Close()
err = os.Remove(filePath)
if err != nil {
return fmt.Errorf("failed to delete file: %v", err)
}
readFlag = false
}
codec, err = goavro.NewCodec(schema)
if err != nil {
return fmt.Errorf("failed to create codec after merged schema: %v", err)
}
record_list = append(record_list, generateRecord(data))
delete(l.data, ts)
}
}
if len(record_list) == 0 {
return ErrNoNewData
}
f, err = os.OpenFile(filePath, os.O_CREATE|os.O_APPEND|os.O_RDWR, 0o644)
if err != nil {
return fmt.Errorf("failed to append new avro file: %v", err)
}
// fmt.Printf("Codec : %#v\n", codec)
writer, err := goavro.NewOCFWriter(goavro.OCFConfig{
W: f,
Codec: codec,
CompressionName: goavro.CompressionDeflateLabel,
})
if err != nil {
return fmt.Errorf("failed to create OCF writer: %v", err)
}
// Append the new record
if err := writer.Append(record_list); err != nil {
return fmt.Errorf("failed to append record: %v", err)
}
f.Close()
return nil
}
func compareSchema(schemaRead, schemaGen string) (bool, string, error) {
var genSchema, readSchema AvroSchema
if schemaRead == "" {
return false, schemaGen, nil
}
// Unmarshal the schema strings into AvroSchema structs
if err := json.Unmarshal([]byte(schemaGen), &genSchema); err != nil {
return false, "", fmt.Errorf("failed to parse generated schema: %v", err)
}
if err := json.Unmarshal([]byte(schemaRead), &readSchema); err != nil {
return false, "", fmt.Errorf("failed to parse read schema: %v", err)
}
sort.Slice(genSchema.Fields, func(i, j int) bool {
return genSchema.Fields[i].Name < genSchema.Fields[j].Name
})
sort.Slice(readSchema.Fields, func(i, j int) bool {
return readSchema.Fields[i].Name < readSchema.Fields[j].Name
})
// Check if schemas are identical
schemasEqual := true
if len(genSchema.Fields) <= len(readSchema.Fields) {
for i := range genSchema.Fields {
if genSchema.Fields[i].Name != readSchema.Fields[i].Name {
schemasEqual = false
break
}
}
// If schemas are identical, return the read schema
if schemasEqual {
return false, schemaRead, nil
}
}
// Create a map to hold unique fields from both schemas
fieldMap := make(map[string]AvroField)
// Add fields from the read schema
for _, field := range readSchema.Fields {
fieldMap[field.Name] = field
}
// Add or update fields from the generated schema
for _, field := range genSchema.Fields {
fieldMap[field.Name] = field
}
// Create a union schema by collecting fields from the map
var mergedFields []AvroField
for _, field := range fieldMap {
mergedFields = append(mergedFields, field)
}
// Sort fields by name for consistency
sort.Slice(mergedFields, func(i, j int) bool {
return mergedFields[i].Name < mergedFields[j].Name
})
// Create the merged schema
mergedSchema := AvroSchema{
Type: "record",
Name: genSchema.Name,
Fields: mergedFields,
}
// Check if schemas are identical
schemasEqual = len(mergedSchema.Fields) == len(readSchema.Fields)
if schemasEqual {
for i := range mergedSchema.Fields {
if mergedSchema.Fields[i].Name != readSchema.Fields[i].Name {
schemasEqual = false
break
}
}
if schemasEqual {
return false, schemaRead, nil
}
}
// Marshal the merged schema back to JSON
mergedSchemaJson, err := json.Marshal(mergedSchema)
if err != nil {
return false, "", fmt.Errorf("failed to marshal merged schema: %v", err)
}
return true, string(mergedSchemaJson), nil
}
func generateSchema(data map[string]util.Float) (string, error) {
// Define the Avro schema structure
schema := map[string]interface{}{
"type": "record",
"name": "DataRecord",
"fields": []map[string]interface{}{},
}
fieldTracker := make(map[string]struct{})
for key := range data {
if _, exists := fieldTracker[key]; !exists {
key = correctKey(key)
field := map[string]interface{}{
"name": key,
"type": "double",
"default": -1.0,
}
schema["fields"] = append(schema["fields"].([]map[string]interface{}), field)
fieldTracker[key] = struct{}{}
}
}
schemaString, err := json.Marshal(schema)
if err != nil {
return "", fmt.Errorf("failed to marshal schema: %v", err)
}
return string(schemaString), nil
}
func generateRecord(data map[string]util.Float) map[string]interface{} {
record := make(map[string]interface{})
// Iterate through each map in data
for key, value := range data {
key = correctKey(key)
// Set the value in the record
record[key] = value.Double()
}
return record
}
func correctKey(key string) string {
// Avro field names may not contain ':' or '.', so encode them with
// unambiguous placeholder sequences.
key = strings.ReplaceAll(key, ":", "___")
key = strings.ReplaceAll(key, ".", "__")
return key
}
func ReplaceKey(key string) string {
// Inverse of correctKey: decode the placeholder sequences back into
// ':' and '.'.
key = strings.ReplaceAll(key, "___", ":")
key = strings.ReplaceAll(key, "__", ".")
return key
}
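
Avro field names may not contain ':' or '.', which do occur in flattened metric keys, hence the correctKey/ReplaceKey pair above. A standalone round-trip illustration (the key is hypothetical; the substitutions are inlined here because correctKey is unexported):

package main

import (
	"fmt"
	"strings"
)

// encode mirrors correctKey: ':' -> "___", '.' -> "__".
func encode(key string) string {
	key = strings.ReplaceAll(key, ":", "___")
	return strings.ReplaceAll(key, ".", "__")
}

// decode mirrors ReplaceKey: "___" -> ':' first, then "__" -> '.'.
func decode(key string) string {
	key = strings.ReplaceAll(key, "___", ":")
	return strings.ReplaceAll(key, "__", ".")
}

func main() {
	k := "cpu.load:avg" // hypothetical metric key
	enc := encode(k)
	fmt.Println(enc)         // cpu__load___avg
	fmt.Println(decode(enc)) // cpu.load:avg
}

Note the ordering: decoding replaces the longer "___" sequence first so that a "__" inside it is not consumed prematurely.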

View File

@@ -1,80 +0,0 @@
package avro
import (
"context"
"fmt"
"strconv"
"sync"
"github.com/ClusterCockpit/cc-metric-store/internal/config"
)
func DataStaging(wg *sync.WaitGroup, ctx context.Context) {
// Stage incoming line-protocol messages into the Avro level hierarchy.
go func() {
if config.Keys.Checkpoints.FileFormat == "json" {
wg.Done() // Mark this goroutine as done
return // Exit the goroutine
}
defer wg.Done()
var avroLevel *AvroLevel
oldSelector := make([]string, 0)
for {
select {
case <-ctx.Done():
return
case val := <-LineProtocolMessages:
// Fetch the frequency of the metric from the global configuration
freq, err := config.Keys.GetMetricFrequency(val.MetricName)
if err != nil {
fmt.Printf("Error fetching metric frequency: %s\n", err)
continue
}
metricName := ""
for _, selector_name := range val.Selector {
metricName += selector_name + Delimiter
}
metricName += val.MetricName
// Create a new selector for the Avro level
// The selector is a slice of strings that represents the path to the
// Avro level. It is created by appending the cluster, node, and metric
// name to the selector.
var selector []string
selector = append(selector, val.Cluster, val.Node, strconv.FormatInt(freq, 10))
if !testEq(oldSelector, selector) {
// Get the Avro level for the metric
avroLevel = avroStore.root.findAvroLevelOrCreate(selector)
// If the Avro level is nil, create a new one
if avroLevel == nil {
fmt.Printf("Error creating or finding the level with cluster : %s, node : %s, metric : %s\n", val.Cluster, val.Node, val.MetricName)
}
oldSelector = append([]string{}, selector...)
}
avroLevel.addMetric(metricName, val.Value, val.Timestamp, int(freq))
}
}
}()
}
func testEq(a, b []string) bool {
if len(a) != len(b) {
return false
}
for i := range a {
if a[i] != b[i] {
return false
}
}
return true
}
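
A hedged sketch of how a producer might hand a sample to the staging goroutine via LineProtocolMessages (in the real code path this is done by the line-protocol decoder; all values below are placeholders, and the send blocks unless DataStaging is running with a non-JSON checkpoint format):

package ingest // hypothetical package inside this module

import (
	"time"

	"github.com/ClusterCockpit/cc-metric-store/internal/avro"
	"github.com/ClusterCockpit/cc-metric-store/internal/util"
)

func StageSample() {
	avro.LineProtocolMessages <- &avro.AvroStruct{
		MetricName: "flops_any",   // placeholder metric name
		Cluster:    "testcluster", // placeholder cluster
		Node:       "node001",     // placeholder node
		Selector:   []string{"testcluster", "node001"},
		Value:      util.Float(42.0),
		Timestamp:  time.Now().Unix(),
	}
}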

View File

@@ -1,161 +0,0 @@
package avro
import (
"sync"
"github.com/ClusterCockpit/cc-metric-store/internal/util"
)
var LineProtocolMessages = make(chan *AvroStruct)
var Delimiter = "ZZZZZ"
// CheckpointBufferMinutes should always be in minutes.
// It controls how much incoming data is buffered in memory before
// being flushed to an Avro checkpoint.
var CheckpointBufferMinutes = 3
type AvroStruct struct {
MetricName string
Cluster string
Node string
Selector []string
Value util.Float
Timestamp int64
}
type AvroStore struct {
root AvroLevel
}
var avroStore AvroStore
type AvroLevel struct {
children map[string]*AvroLevel
data map[int64]map[string]util.Float
lock sync.RWMutex
}
type AvroField struct {
Name string `json:"name"`
Type interface{} `json:"type"`
Default interface{} `json:"default,omitempty"`
}
type AvroSchema struct {
Type string `json:"type"`
Name string `json:"name"`
Fields []AvroField `json:"fields"`
}
func (l *AvroLevel) findAvroLevelOrCreate(selector []string) *AvroLevel {
if len(selector) == 0 {
return l
}
// Allow concurrent reads:
l.lock.RLock()
var child *AvroLevel
var ok bool
if l.children == nil {
// Children map needs to be created...
l.lock.RUnlock()
} else {
child, ok := l.children[selector[0]]
l.lock.RUnlock()
if ok {
return child.findAvroLevelOrCreate(selector[1:])
}
}
// The level does not exist; take the write lock for unique access:
l.lock.Lock()
// While this thread waited for the write lock, another thread
// could have created the child node.
if l.children != nil {
child, ok = l.children[selector[0]]
if ok {
l.lock.Unlock()
return child.findAvroLevelOrCreate(selector[1:])
}
}
child = &AvroLevel{
data: make(map[int64]map[string]util.Float, 0),
children: nil,
}
if l.children != nil {
l.children[selector[0]] = child
} else {
l.children = map[string]*AvroLevel{selector[0]: child}
}
l.lock.Unlock()
return child.findAvroLevelOrCreate(selector[1:])
}
func (l *AvroLevel) addMetric(metricName string, value util.Float, timestamp int64, Freq int) {
l.lock.Lock()
defer l.lock.Unlock()
KeyCounter := int(CheckpointBufferMinutes * 60 / Freq)
// Create keys in advance for the given amount of time
if len(l.data) != KeyCounter {
if len(l.data) == 0 {
for i := range KeyCounter {
l.data[timestamp+int64(i*Freq)] = make(map[string]util.Float, 0)
}
} else {
//Get the last timestamp
var lastTs int64
for ts := range l.data {
if ts > lastTs {
lastTs = ts
}
}
// Append one more slot after the newest existing timestamp
l.data[lastTs+int64(Freq)] = make(map[string]util.Float, 0)
}
}
closestTs := int64(0)
minDiff := int64(Freq) + 1 // Start with diff just outside the valid range
found := false
// Iterate over the pre-created slots and pick the closest one whose
// epoch timestamp lies within +/- Freq seconds of the sample.
for ts, dat := range l.data {
// Check if timestamp is within range
diff := timestamp - ts
if diff < -int64(Freq) || diff > int64(Freq) {
continue
}
// Metric already present at this timestamp — skip
if _, ok := dat[metricName]; ok {
continue
}
// Check if this is the closest timestamp so far
if Abs(diff) < minDiff {
minDiff = Abs(diff)
closestTs = ts
found = true
}
}
if found {
l.data[closestTs][metricName] = value
}
}
func GetAvroStore() *AvroStore {
return &avroStore
}
// Abs returns the absolute value of x.
func Abs(x int64) int64 {
if x < 0 {
return -x
}
return x
}
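
The slot-matching rule in addMetric deserves a worked example: a sample is stored in the pre-created slot whose timestamp is closest, but only if it lies within one frequency interval. A standalone re-implementation of just that selection (ignoring the already-present check in the original):

package main

import "fmt"

// closestSlot picks the slot timestamp with the smallest |ts - slot|
// that is within +/- freq seconds, as addMetric does; ok is false if
// no slot qualifies.
func closestSlot(slots []int64, ts, freq int64) (slot int64, ok bool) {
	minDiff := freq + 1 // start just outside the valid range
	for _, s := range slots {
		d := ts - s
		if d < 0 {
			d = -d
		}
		if d <= freq && d < minDiff {
			slot, minDiff, ok = s, d, true
		}
	}
	return slot, ok
}

func main() {
	slots := []int64{1000, 1060, 1120} // slots pre-created at freq = 60
	fmt.Println(closestSlot(slots, 1075, 60)) // 1060 true (15s away)
	fmt.Println(closestSlot(slots, 1201, 60)) // 0 false (>60s from every slot)
}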

View File

@@ -1,123 +0,0 @@
package config
import (
"encoding/json"
"fmt"
"log"
"os"
)
// For aggregation over multiple values at different cpus/sockets/..., not time!
type AggregationStrategy int
const (
NoAggregation AggregationStrategy = iota
SumAggregation
AvgAggregation
)
func (as *AggregationStrategy) UnmarshalJSON(data []byte) error {
var str string
if err := json.Unmarshal(data, &str); err != nil {
return err
}
switch str {
case "":
*as = NoAggregation
case "sum":
*as = SumAggregation
case "avg":
*as = AvgAggregation
default:
return fmt.Errorf("invalid aggregation strategy: %#v", str)
}
return nil
}
type MetricConfig struct {
// Interval in seconds at which measurements will arrive.
Frequency int64 `json:"frequency"`
// Can be 'sum', 'avg' or null. Describes how to aggregate metrics from the same timestep over the hierarchy.
Aggregation AggregationStrategy `json:"aggregation"`
// Private, used internally...
Offset int
}
type HttpConfig struct {
// Address to bind to, for example "0.0.0.0:8081"
Address string `json:"address"`
// If not the empty string, use https with this as the certificate file
CertFile string `json:"https-cert-file"`
// If not the empty string, use https with this as the key file
KeyFile string `json:"https-key-file"`
}
type NatsConfig struct {
// Address of the nats server
Address string `json:"address"`
// Username/Password, optional
Username string `json:"username"`
Password string `json:"password"`
// Path to a NATS credentials file, optional
Credsfilepath string `json:"creds-file-path"`
Subscriptions []struct {
// Channel name
SubscribeTo string `json:"subscribe-to"`
// Allow lines without a cluster tag, use this as default, optional
ClusterTag string `json:"cluster-tag"`
} `json:"subscriptions"`
}
type Config struct {
Metrics map[string]MetricConfig `json:"metrics"`
HttpConfig *HttpConfig `json:"http-api"`
Checkpoints struct {
FileFormat string `json:"file-format"`
Interval string `json:"interval"`
RootDir string `json:"directory"`
Restore string `json:"restore"`
} `json:"checkpoints"`
Debug struct {
DumpToFile string `json:"dump-to-file"`
EnableGops bool `json:"gops"`
} `json:"debug"`
RetentionInMemory string `json:"retention-in-memory"`
JwtPublicKey string `json:"jwt-public-key"`
Archive struct {
Interval string `json:"interval"`
RootDir string `json:"directory"`
DeleteInstead bool `json:"delete-instead"`
} `json:"archive"`
Nats []*NatsConfig `json:"nats"`
}
var Keys Config
func Init(file string) {
configFile, err := os.Open(file)
if err != nil {
log.Fatal(err)
}
defer configFile.Close()
dec := json.NewDecoder(configFile)
dec.DisallowUnknownFields()
if err := dec.Decode(&Keys); err != nil {
log.Fatal(err)
}
}
func (c *Config) GetMetricFrequency(metricName string) (int64, error) {
if metric, ok := c.Metrics[metricName]; ok {
return metric.Frequency, nil
}
return 0, fmt.Errorf("metric %s not found", metricName)
}
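
To make the schema concrete, here is a minimal sketch of a configuration file this struct accepts, loaded through Init (all values are illustrative; unknown fields are rejected because of DisallowUnknownFields):

package main

import (
	"fmt"
	"log"
	"os"

	"github.com/ClusterCockpit/cc-metric-store/internal/config"
)

const sample = `{
	"metrics": {
		"flops_any": { "frequency": 60, "aggregation": "avg" }
	},
	"http-api": { "address": "0.0.0.0:8082" },
	"checkpoints": { "file-format": "avro", "interval": "1h", "directory": "./var/checkpoints", "restore": "48h" },
	"archive": { "interval": "24h", "directory": "./var/archive" },
	"retention-in-memory": "48h"
}`

func main() {
	f, err := os.CreateTemp("", "config-*.json")
	if err != nil {
		log.Fatal(err)
	}
	defer os.Remove(f.Name())
	if _, err := f.WriteString(sample); err != nil {
		log.Fatal(err)
	}
	f.Close()
	config.Init(f.Name())
	freq, err := config.Keys.GetMetricFrequency("flops_any")
	if err != nil {
		log.Fatal(err)
	}
	fmt.Println("flops_any frequency:", freq) // 60
}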

View File

@@ -1,185 +0,0 @@
package memorystore
import (
"archive/zip"
"bufio"
"context"
"errors"
"fmt"
"io"
"log"
"os"
"path/filepath"
"sync"
"sync/atomic"
"time"
"github.com/ClusterCockpit/cc-metric-store/internal/config"
)
func Archiving(wg *sync.WaitGroup, ctx context.Context) {
go func() {
defer wg.Done()
d, err := time.ParseDuration(config.Keys.Archive.Interval)
if err != nil {
log.Fatal(err)
}
if d <= 0 {
return
}
ticks := func() <-chan time.Time {
if d <= 0 {
return nil
}
return time.NewTicker(d).C
}()
for {
select {
case <-ctx.Done():
return
case <-ticks:
t := time.Now().Add(-d)
log.Printf("start archiving checkpoints (older than %s)...\n", t.Format(time.RFC3339))
n, err := ArchiveCheckpoints(config.Keys.Checkpoints.RootDir, config.Keys.Archive.RootDir, t.Unix(), config.Keys.Archive.DeleteInstead)
if err != nil {
log.Printf("archiving failed: %s\n", err.Error())
} else {
log.Printf("done: %d files zipped and moved to archive\n", n)
}
}
}
}()
}
var ErrNoNewData error = errors.New("all data already archived")
// ZIP all checkpoint files older than `from` together and write them to the `archiveDir`,
// deleting them from the `checkpointsDir`.
func ArchiveCheckpoints(checkpointsDir, archiveDir string, from int64, deleteInstead bool) (int, error) {
entries1, err := os.ReadDir(checkpointsDir)
if err != nil {
return 0, err
}
type workItem struct {
cdir, adir string
cluster, host string
}
var wg sync.WaitGroup
n, errs := int32(0), int32(0)
work := make(chan workItem, NumWorkers)
wg.Add(NumWorkers)
for worker := 0; worker < NumWorkers; worker++ {
go func() {
defer wg.Done()
for workItem := range work {
m, err := archiveCheckpoints(workItem.cdir, workItem.adir, from, deleteInstead)
if err != nil {
log.Printf("error while archiving %s/%s: %s", workItem.cluster, workItem.host, err.Error())
atomic.AddInt32(&errs, 1)
}
atomic.AddInt32(&n, int32(m))
}
}()
}
for _, de1 := range entries1 {
entries2, e := os.ReadDir(filepath.Join(checkpointsDir, de1.Name()))
if e != nil {
err = e
}
for _, de2 := range entries2 {
cdir := filepath.Join(checkpointsDir, de1.Name(), de2.Name())
adir := filepath.Join(archiveDir, de1.Name(), de2.Name())
work <- workItem{
adir: adir, cdir: cdir,
cluster: de1.Name(), host: de2.Name(),
}
}
}
close(work)
wg.Wait()
if err != nil {
return int(n), err
}
if errs > 0 {
return int(n), fmt.Errorf("%d errors happend while archiving (%d successes)", errs, n)
}
return int(n), nil
}
// Helper function for `ArchiveCheckpoints`.
func archiveCheckpoints(dir string, archiveDir string, from int64, deleteInstead bool) (int, error) {
entries, err := os.ReadDir(dir)
if err != nil {
return 0, err
}
extension := config.Keys.Checkpoints.FileFormat
files, err := findFiles(entries, from, extension, false)
if err != nil {
return 0, err
}
if deleteInstead {
n := 0
for _, checkpoint := range files {
filename := filepath.Join(dir, checkpoint)
if err = os.Remove(filename); err != nil {
return n, err
}
n += 1
}
return n, nil
}
filename := filepath.Join(archiveDir, fmt.Sprintf("%d.zip", from))
f, err := os.OpenFile(filename, os.O_CREATE|os.O_WRONLY, 0o644)
if err != nil && os.IsNotExist(err) {
err = os.MkdirAll(archiveDir, 0o755)
if err == nil {
f, err = os.OpenFile(filename, os.O_CREATE|os.O_WRONLY, 0o644)
}
}
if err != nil {
return 0, err
}
defer f.Close()
bw := bufio.NewWriter(f)
defer bw.Flush()
zw := zip.NewWriter(bw)
defer zw.Close()
n := 0
for _, checkpoint := range files {
filename := filepath.Join(dir, checkpoint)
r, err := os.Open(filename)
if err != nil {
return n, err
}
defer r.Close()
w, err := zw.Create(checkpoint)
if err != nil {
return n, err
}
if _, err = io.Copy(w, r); err != nil {
return n, err
}
if err = os.Remove(filename); err != nil {
return n, err
}
n += 1
}
return n, nil
}
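
The same helper can also be invoked outside the ticker; a hedged one-off sketch (paths are placeholders, and config must be initialized first because archiveCheckpoints reads the checkpoint file format from config.Keys):

package main

import (
	"log"
	"time"

	"github.com/ClusterCockpit/cc-metric-store/internal/config"
	"github.com/ClusterCockpit/cc-metric-store/internal/memorystore"
)

func main() {
	config.Init("config.json")
	// Zip every checkpoint older than 24 hours into ./var/archive.
	cutoff := time.Now().Add(-24 * time.Hour).Unix()
	n, err := memorystore.ArchiveCheckpoints("./var/checkpoints", "./var/archive", cutoff, false)
	if err != nil {
		log.Fatal(err)
	}
	log.Printf("%d checkpoint files archived", n)
}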

View File

@@ -1,233 +0,0 @@
package memorystore
import (
"errors"
"sync"
"github.com/ClusterCockpit/cc-metric-store/internal/util"
)
// Default buffer capacity.
// `buffer.data` will only ever grow up to its capacity; a new link in
// the buffer chain is created when needed, so writes never copy or
// reallocate existing data.
const (
BUFFER_CAP int = 512
)
// So that we can reuse allocations
var bufferPool sync.Pool = sync.Pool{
New: func() interface{} {
return &buffer{
data: make([]util.Float, 0, BUFFER_CAP),
}
},
}
var (
ErrNoData error = errors.New("no data for this metric/level")
ErrDataDoesNotAlign error = errors.New("data from lower granularities does not align")
)
// Each metric on each level has its own buffer.
// This is where the actual values go.
// If `cap(data)` is reached, a new buffer is created and
// becomes the new head of a buffer list.
type buffer struct {
prev *buffer
next *buffer
data []util.Float
frequency int64
start int64
archived bool
closed bool
}
func newBuffer(ts, freq int64) *buffer {
b := bufferPool.Get().(*buffer)
b.frequency = freq
b.start = ts - (freq / 2)
b.prev = nil
b.next = nil
b.archived = false
b.closed = false
b.data = b.data[:0]
return b
}
// If a new buffer was created, the new head is returned;
// otherwise, the existing buffer is returned.
// Normally, only "newer" data should be written, but a value that
// would end up in the same buffer anyway is allowed.
func (b *buffer) write(ts int64, value util.Float) (*buffer, error) {
if ts < b.start {
return nil, errors.New("cannot write value to buffer from past")
}
// idx := int((ts - b.start + (b.frequency / 3)) / b.frequency)
idx := int((ts - b.start) / b.frequency)
if idx >= cap(b.data) {
newbuf := newBuffer(ts, b.frequency)
newbuf.prev = b
b.next = newbuf
b.close()
b = newbuf
idx = 0
}
// Overwriting value or writing value from past
if idx < len(b.data) {
b.data[idx] = value
return b, nil
}
// Fill up unwritten slots with NaN
for i := len(b.data); i < idx; i++ {
b.data = append(b.data, util.NaN)
}
b.data = append(b.data, value)
return b, nil
}
func (b *buffer) end() int64 {
return b.firstWrite() + int64(len(b.data))*b.frequency
}
func (b *buffer) firstWrite() int64 {
return b.start + (b.frequency / 2)
}
func (b *buffer) close() {}
/*
func (b *buffer) close() {
if b.closed {
return
}
b.closed = true
n, sum, min, max := 0, 0., math.MaxFloat64, -math.MaxFloat64
for _, x := range b.data {
if x.IsNaN() {
continue
}
n += 1
f := float64(x)
sum += f
min = math.Min(min, f)
max = math.Max(max, f)
}
b.statisticts.samples = n
if n > 0 {
b.statisticts.avg = Float(sum / float64(n))
b.statisticts.min = Float(min)
b.statisticts.max = Float(max)
} else {
b.statisticts.avg = NaN
b.statisticts.min = NaN
b.statisticts.max = NaN
}
}
*/
// func interpolate(idx int, data []Float) Float {
// if idx == 0 || idx+1 == len(data) {
// return NaN
// }
// return (data[idx-1] + data[idx+1]) / 2.0
// }
// Return all known values from `from` to `to`. Gaps of information are represented as NaN.
// (Simple linear interpolation between the two neighboring cells is
// sketched above but currently disabled.)
// If values at the start or end are missing, the second and third
// return values contain the actual `from`/`to`.
// This function goes back the buffer chain if `from` is older than the currents buffer start.
// The loaded values are added to `data` and `data` is returned, possibly with a shorter length.
// If `data` is not long enough to hold all values, this function will panic!
func (b *buffer) read(from, to int64, data []util.Float) ([]util.Float, int64, int64, error) {
if from < b.firstWrite() {
if b.prev != nil {
return b.prev.read(from, to, data)
}
from = b.firstWrite()
}
i := 0
t := from
for ; t < to; t += b.frequency {
idx := int((t - b.start) / b.frequency)
if idx >= cap(b.data) {
if b.next == nil {
break
}
b = b.next
idx = 0
}
if idx >= len(b.data) {
if b.next == nil || to <= b.next.start {
break
}
data[i] += util.NaN
} else if t < b.start {
data[i] += util.NaN
// } else if b.data[idx].IsNaN() {
// data[i] += interpolate(idx, b.data)
} else {
data[i] += b.data[idx]
}
i++
}
return data[:i], from, t, nil
}
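// Minimal read sketch (assumption, not from the original source). The caller
// pre-sizes `data`; gaps inside the range come back as NaN, and the returned
// from/to may be narrower than requested.
func exampleBufferRead(b *buffer, from, to int64) {
data := make([]util.Float, (to-from)/b.frequency+1)
data, realFrom, realTo, err := b.read(from, to, data)
if err != nil {
return
}
_, _, _ = data, realFrom, realTo
}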
// Returns true if this buffer needs to be freed.
func (b *buffer) free(t int64) (delme bool, n int) {
if b.prev != nil {
delme, m := b.prev.free(t)
n += m
if delme {
b.prev.next = nil
if cap(b.prev.data) == BUFFER_CAP {
bufferPool.Put(b.prev)
}
b.prev = nil
}
}
end := b.end()
if end < t {
return true, n + 1
}
return false, n
}
// Call `callback` on every buffer that contains data in the range from `from` to `to`.
func (b *buffer) iterFromTo(from, to int64, callback func(b *buffer) error) error {
if b == nil {
return nil
}
if err := b.prev.iterFromTo(from, to, callback); err != nil {
return err
}
if from <= b.end() && b.start <= to {
return callback(b)
}
return nil
}
func (b *buffer) count() int64 {
res := int64(len(b.data))
if b.prev != nil {
res += b.prev.count()
}
return res
}

View File

@@ -1,767 +0,0 @@
package memorystore
import (
"bufio"
"context"
"encoding/json"
"errors"
"fmt"
"io/fs"
"log"
"os"
"path"
"path/filepath"
"runtime"
"sort"
"strconv"
"strings"
"sync"
"sync/atomic"
"time"
"github.com/ClusterCockpit/cc-metric-store/internal/avro"
"github.com/ClusterCockpit/cc-metric-store/internal/config"
"github.com/ClusterCockpit/cc-metric-store/internal/util"
"github.com/linkedin/goavro/v2"
)
// Whenever changed, update MarshalJSON as well!
type CheckpointMetrics struct {
Data []util.Float `json:"data"`
Frequency int64 `json:"frequency"`
Start int64 `json:"start"`
}
type CheckpointFile struct {
Metrics map[string]*CheckpointMetrics `json:"metrics"`
Children map[string]*CheckpointFile `json:"children"`
From int64 `json:"from"`
To int64 `json:"to"`
}
var lastCheckpoint time.Time
func Checkpointing(wg *sync.WaitGroup, ctx context.Context) {
lastCheckpoint = time.Now()
if config.Keys.Checkpoints.FileFormat == "json" {
ms := GetMemoryStore()
go func() {
defer wg.Done()
d, err := time.ParseDuration(config.Keys.Checkpoints.Interval)
if err != nil {
log.Fatal(err)
}
if d <= 0 {
return
}
ticks := func() <-chan time.Time {
if d <= 0 {
return nil
}
return time.NewTicker(d).C
}()
for {
select {
case <-ctx.Done():
return
case <-ticks:
log.Printf("start checkpointing (starting at %s)...\n", lastCheckpoint.Format(time.RFC3339))
now := time.Now()
n, err := ms.ToCheckpoint(config.Keys.Checkpoints.RootDir,
lastCheckpoint.Unix(), now.Unix())
if err != nil {
log.Printf("checkpointing failed: %s\n", err.Error())
} else {
log.Printf("done: %d checkpoint files created\n", n)
lastCheckpoint = now
}
}
}
}()
} else {
go func() {
defer wg.Done()
d, _ := time.ParseDuration("1m")
select {
case <-ctx.Done():
return
case <-time.After(time.Duration(avro.CheckpointBufferMinutes) * time.Minute):
// This is the first tick: wait until data for the configured number of minutes has been collected.
avro.GetAvroStore().ToCheckpoint(config.Keys.Checkpoints.RootDir, false)
// log.Printf("Checkpointing %d avro files", count)
}
ticks := func() <-chan time.Time {
if d <= 0 {
return nil
}
return time.NewTicker(d).C
}()
for {
select {
case <-ctx.Done():
return
case <-ticks:
// Regular ticks of 1 minute to write data.
avro.GetAvroStore().ToCheckpoint(config.Keys.Checkpoints.RootDir, false)
// log.Printf("Checkpointing %d avro files", count)
}
}
}()
}
}
// As `Float` implements a custom MarshalJSON() function,
// serializing an array of such types has more overhead
// than one would assume (because of extra allocations, interfaces and so on).
func (cm *CheckpointMetrics) MarshalJSON() ([]byte, error) {
buf := make([]byte, 0, 128+len(cm.Data)*8)
buf = append(buf, `{"frequency":`...)
buf = strconv.AppendInt(buf, cm.Frequency, 10)
buf = append(buf, `,"start":`...)
buf = strconv.AppendInt(buf, cm.Start, 10)
buf = append(buf, `,"data":[`...)
for i, x := range cm.Data {
if i != 0 {
buf = append(buf, ',')
}
if x.IsNaN() {
buf = append(buf, `null`...)
} else {
buf = strconv.AppendFloat(buf, float64(x), 'f', 1, 32)
}
}
buf = append(buf, `]}`...)
return buf, nil
}
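// Hypothetical usage sketch (not in the original file): NaN slots serialize
// to JSON null, all other values with one decimal place.
func exampleCheckpointMarshal() {
cm := &CheckpointMetrics{Frequency: 60, Start: 1000,
Data: []util.Float{1.5, util.NaN, 2.0}}
buf, _ := cm.MarshalJSON()
// string(buf) == `{"frequency":60,"start":1000,"data":[1.5,null,2.0]}`
_ = buf
}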
// Metrics stored at the lowest 2 levels are not stored away (root and cluster)!
// On a per-host basis a new JSON file is created. I have no idea if this will scale.
// The good thing: Only a host at a time is locked, so this function can run
// in parallel to writes/reads.
func (m *MemoryStore) ToCheckpoint(dir string, from, to int64) (int, error) {
levels := make([]*Level, 0)
selectors := make([][]string, 0)
m.root.lock.RLock()
for sel1, l1 := range m.root.children {
l1.lock.RLock()
for sel2, l2 := range l1.children {
levels = append(levels, l2)
selectors = append(selectors, []string{sel1, sel2})
}
l1.lock.RUnlock()
}
m.root.lock.RUnlock()
type workItem struct {
level *Level
dir string
selector []string
}
n, errs := int32(0), int32(0)
var wg sync.WaitGroup
wg.Add(NumWorkers)
work := make(chan workItem, NumWorkers*2)
for worker := 0; worker < NumWorkers; worker++ {
go func() {
defer wg.Done()
for workItem := range work {
if err := workItem.level.toCheckpoint(workItem.dir, from, to, m); err != nil {
if err == ErrNoNewData {
continue
}
log.Printf("error while checkpointing %#v: %s", workItem.selector, err.Error())
atomic.AddInt32(&errs, 1)
} else {
atomic.AddInt32(&n, 1)
}
}
}()
}
for i := 0; i < len(levels); i++ {
dir := path.Join(dir, path.Join(selectors[i]...))
work <- workItem{
level: levels[i],
dir: dir,
selector: selectors[i],
}
}
close(work)
wg.Wait()
if errs > 0 {
return int(n), fmt.Errorf("%d errors happened while creating checkpoints (%d successes)", errs, n)
}
return int(n), nil
}
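// One-off checkpoint call, sketched under the assumption of an initialized
// MemoryStore; files land in <dir>/<cluster>/<host>/<from>.json.
func exampleToCheckpoint(ms *MemoryStore) {
to := time.Now().Unix()
n, err := ms.ToCheckpoint("./var/checkpoints", to-3600, to)
if err != nil {
log.Printf("checkpointing failed: %s", err.Error())
} else {
log.Printf("%d checkpoint files created", n)
}
}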
func (l *Level) toCheckpointFile(from, to int64, m *MemoryStore) (*CheckpointFile, error) {
l.lock.RLock()
defer l.lock.RUnlock()
retval := &CheckpointFile{
From: from,
To: to,
Metrics: make(map[string]*CheckpointMetrics),
Children: make(map[string]*CheckpointFile),
}
for metric, minfo := range m.Metrics {
b := l.metrics[minfo.Offset]
if b == nil {
continue
}
allArchived := true
b.iterFromTo(from, to, func(b *buffer) error {
if !b.archived {
allArchived = false
}
return nil
})
if allArchived {
continue
}
data := make([]util.Float, (to-from)/b.frequency+1)
data, start, end, err := b.read(from, to, data)
if err != nil {
return nil, err
}
for i := int((end - start) / b.frequency); i < len(data); i++ {
data[i] = util.NaN
}
retval.Metrics[metric] = &CheckpointMetrics{
Frequency: b.frequency,
Start: start,
Data: data,
}
}
for name, child := range l.children {
val, err := child.toCheckpointFile(from, to, m)
if err != nil {
return nil, err
}
if val != nil {
retval.Children[name] = val
}
}
if len(retval.Children) == 0 && len(retval.Metrics) == 0 {
return nil, nil
}
return retval, nil
}
func (l *Level) toCheckpoint(dir string, from, to int64, m *MemoryStore) error {
cf, err := l.toCheckpointFile(from, to, m)
if err != nil {
return err
}
if cf == nil {
return ErrNoNewData
}
filepath := path.Join(dir, fmt.Sprintf("%d.json", from))
f, err := os.OpenFile(filepath, os.O_CREATE|os.O_WRONLY, 0o644)
if err != nil && os.IsNotExist(err) {
err = os.MkdirAll(dir, 0o755)
if err == nil {
f, err = os.OpenFile(filepath, os.O_CREATE|os.O_WRONLY, 0o644)
}
}
if err != nil {
return err
}
defer f.Close()
bw := bufio.NewWriter(f)
if err = json.NewEncoder(bw).Encode(cf); err != nil {
return err
}
return bw.Flush()
}
func (m *MemoryStore) FromCheckpoint(dir string, from int64, extension string) (int, error) {
var wg sync.WaitGroup
work := make(chan [2]string, NumWorkers)
n, errs := int32(0), int32(0)
wg.Add(NumWorkers)
for worker := 0; worker < NumWorkers; worker++ {
go func() {
defer wg.Done()
for host := range work {
lvl := m.root.findLevelOrCreate(host[:], len(m.Metrics))
nn, err := lvl.fromCheckpoint(m, filepath.Join(dir, host[0], host[1]), from, extension)
if err != nil {
log.Fatalf("error while loading checkpoints: %s", err.Error())
atomic.AddInt32(&errs, 1)
}
atomic.AddInt32(&n, int32(nn))
}
}()
}
i := 0
clustersDir, err := os.ReadDir(dir)
for _, clusterDir := range clustersDir {
if !clusterDir.IsDir() {
err = errors.New("expected only directories at first level of checkpoints/ directory")
goto done
}
hostsDir, e := os.ReadDir(filepath.Join(dir, clusterDir.Name()))
if e != nil {
err = e
goto done
}
for _, hostDir := range hostsDir {
if !hostDir.IsDir() {
err = errors.New("expected only directories at second level of checkpoints/ directory")
goto done
}
i++
if i%NumWorkers == 0 && i > 100 {
// Forcing garbage collection runs here regularly during the loading of checkpoints
// will decrease the total heap size after loading everything back to memory is done.
// While loading data, the heap grows fast, so the GC target size would double
// almost every cycle. By forcing GCs here, we keep it growing more slowly so that,
// at the end, less memory is wasted.
runtime.GC()
}
work <- [2]string{clusterDir.Name(), hostDir.Name()}
}
}
done:
close(work)
wg.Wait()
if err != nil {
return int(n), err
}
if errs > 0 {
return int(n), fmt.Errorf("%d errors happened while loading checkpoints (%d successes)", errs, n)
}
return int(n), nil
}
// Metrics stored at the lowest 2 levels are not loaded (root and cluster)!
// This function can only be called once and before the very first write or read.
// Different host's data is loaded to memory in parallel.
func (m *MemoryStore) FromCheckpointFiles(dir string, from int64) (int, error) {
if _, err := os.Stat(dir); os.IsNotExist(err) {
// The directory does not exist, so create it using os.MkdirAll()
err := os.MkdirAll(dir, 0755) // 0755 sets the permissions for the directory
if err != nil {
log.Fatalf("Error creating directory: %#v\n", err)
}
fmt.Printf("%#v Directory created successfully.\n", dir)
}
// Config read (replace with your actual config read)
fileFormat := config.Keys.Checkpoints.FileFormat
if fileFormat == "" {
fileFormat = "avro"
}
// Map to easily get the fallback format
oppositeFormat := map[string]string{
"json": "avro",
"avro": "json",
}
// First, attempt to load the specified format
if found, err := checkFilesWithExtension(dir, fileFormat); err != nil {
return 0, fmt.Errorf("error checking files with extension: %v", err)
} else if found {
log.Printf("Loading %s files because fileformat is %s\n", fileFormat, fileFormat)
return m.FromCheckpoint(dir, from, fileFormat)
}
// If not found, attempt the opposite format
altFormat := oppositeFormat[fileFormat]
if found, err := checkFilesWithExtension(dir, altFormat); err != nil {
return 0, fmt.Errorf("error checking files with extension: %v", err)
} else if found {
log.Printf("Loading %s files but fileformat is %s\n", altFormat, fileFormat)
return m.FromCheckpoint(dir, from, altFormat)
}
log.Println("No valid checkpoint files found in the directory.")
return 0, nil
}
func checkFilesWithExtension(dir string, extension string) (bool, error) {
found := false
err := filepath.Walk(dir, func(path string, info os.FileInfo, err error) error {
if err != nil {
return fmt.Errorf("error accessing path %s: %v", path, err)
}
if !info.IsDir() && filepath.Ext(info.Name()) == "."+extension {
found = true
return nil
}
return nil
})
if err != nil {
return false, fmt.Errorf("error walking through directories: %s", err)
}
return found, nil
}
func (l *Level) loadAvroFile(m *MemoryStore, f *os.File, from int64) error {
br := bufio.NewReader(f)
fileName := f.Name()[strings.LastIndex(f.Name(), "/")+1:]
resolution, err := strconv.ParseInt(fileName[0:strings.Index(fileName, "_")], 10, 64)
if err != nil {
return fmt.Errorf("error while reading avro file (resolution parsing) : %s", err)
}
from_timestamp, err := strconv.ParseInt(fileName[strings.Index(fileName, "_")+1:len(fileName)-5], 10, 64)
if err != nil {
return fmt.Errorf("error converting timestamp from the avro file : %s", err)
}
// Same half-frequency offset as in the line protocol path
from_timestamp -= (resolution / 2)
// fmt.Printf("File : %s with resolution : %d\n", fileName, resolution)
var recordCounter int64 = 0
// Create a new OCF reader from the buffered reader
ocfReader, err := goavro.NewOCFReader(br)
if err != nil {
return fmt.Errorf("error while creating OCF reader from avro file : %s", err)
}
metricsData := make(map[string]util.FloatArray)
for ocfReader.Scan() {
datum, err := ocfReader.Read()
if err != nil {
return fmt.Errorf("error while reading avro file : %s", err)
}
record, ok := datum.(map[string]interface{})
if !ok {
return errors.New("failed to assert avro datum as map[string]interface{}")
}
for key, value := range record {
metricsData[key] = append(metricsData[key], util.ConvertToFloat(value.(float64)))
}
recordCounter += 1
}
to := (from_timestamp + (recordCounter / (60 / resolution) * 60))
if to < from {
return nil
}
for key, floatArray := range metricsData {
metricName := avro.ReplaceKey(key)
if strings.Contains(metricName, avro.Delimiter) {
subString := strings.Split(metricName, avro.Delimiter)
lvl := l
for i := 0; i < len(subString)-1; i++ {
sel := subString[i]
if lvl.children == nil {
lvl.children = make(map[string]*Level)
}
child, ok := lvl.children[sel]
if !ok {
child = &Level{
metrics: make([]*buffer, len(m.Metrics)),
children: nil,
}
lvl.children[sel] = child
}
lvl = child
}
leafMetricName := subString[len(subString)-1]
err = lvl.createBuffer(m, leafMetricName, floatArray, from_timestamp, resolution)
if err != nil {
return fmt.Errorf("error while creating buffers from avroReader : %s", err)
}
} else {
err = l.createBuffer(m, metricName, floatArray, from_timestamp, resolution)
if err != nil {
return fmt.Errorf("error while creating buffers from avroReader : %s", err)
}
}
}
return nil
}
func (l *Level) createBuffer(m *MemoryStore, metricName string, floatArray util.FloatArray, from int64, resolution int64) error {
n := len(floatArray)
b := &buffer{
frequency: resolution,
start: from,
data: floatArray[0:n:n],
prev: nil,
next: nil,
archived: true,
}
b.close()
minfo, ok := m.Metrics[metricName]
if !ok {
return nil
// return errors.New("Unknown metric: " + name)
}
prev := l.metrics[minfo.Offset]
if prev == nil {
l.metrics[minfo.Offset] = b
} else {
if prev.start > b.start {
return errors.New("buffers out of order: previous buffer starts after the new one")
}
b.prev = prev
prev.next = b
missingCount := ((int(b.start) - int(prev.start)) - len(prev.data)*int(b.frequency))
if missingCount > 0 {
missingCount /= int(b.frequency)
for range missingCount {
prev.data = append(prev.data, util.NaN)
}
prev.data = prev.data[0:len(prev.data):len(prev.data)]
}
}
l.metrics[minfo.Offset] = b
return nil
}
func (l *Level) loadJsonFile(m *MemoryStore, f *os.File, from int64) error {
br := bufio.NewReader(f)
cf := &CheckpointFile{}
if err := json.NewDecoder(br).Decode(cf); err != nil {
return err
}
if cf.To != 0 && cf.To < from {
return nil
}
if err := l.loadFile(cf, m); err != nil {
return err
}
return nil
}
func (l *Level) loadFile(cf *CheckpointFile, m *MemoryStore) error {
for name, metric := range cf.Metrics {
n := len(metric.Data)
b := &buffer{
frequency: metric.Frequency,
start: metric.Start,
data: metric.Data[0:n:n], // Space is wasted here :(
prev: nil,
next: nil,
archived: true,
}
b.close()
minfo, ok := m.Metrics[name]
if !ok {
continue
// return errors.New("Unknown metric: " + name)
}
prev := l.metrics[minfo.Offset]
if prev == nil {
l.metrics[minfo.Offset] = b
} else {
if prev.start > b.start {
return errors.New("buffers out of order: previous buffer starts after the new one")
}
b.prev = prev
prev.next = b
}
l.metrics[minfo.Offset] = b
}
if len(cf.Children) > 0 && l.children == nil {
l.children = make(map[string]*Level)
}
for sel, childCf := range cf.Children {
child, ok := l.children[sel]
if !ok {
child = &Level{
metrics: make([]*buffer, len(m.Metrics)),
children: nil,
}
l.children[sel] = child
}
if err := child.loadFile(childCf, m); err != nil {
return err
}
}
return nil
}
func (l *Level) fromCheckpoint(m *MemoryStore, dir string, from int64, extension string) (int, error) {
direntries, err := os.ReadDir(dir)
if err != nil {
if os.IsNotExist(err) {
return 0, nil
}
return 0, err
}
allFiles := make([]fs.DirEntry, 0)
filesLoaded := 0
for _, e := range direntries {
if e.IsDir() {
child := &Level{
metrics: make([]*buffer, len(m.Metrics)),
children: make(map[string]*Level),
}
files, err := child.fromCheckpoint(m, path.Join(dir, e.Name()), from, extension)
filesLoaded += files
if err != nil {
return filesLoaded, err
}
l.children[e.Name()] = child
} else if strings.HasSuffix(e.Name(), "."+extension) {
allFiles = append(allFiles, e)
} else {
continue
}
}
files, err := findFiles(allFiles, from, extension, true)
if err != nil {
return filesLoaded, err
}
loaders := map[string]func(*MemoryStore, *os.File, int64) error{
"json": l.loadJsonFile,
"avro": l.loadAvroFile,
}
loader := loaders[extension]
for _, filename := range files {
f, err := os.Open(path.Join(dir, filename))
if err != nil {
return filesLoaded, err
}
defer f.Close()
if err = loader(m, f, from); err != nil {
return filesLoaded, err
}
filesLoaded += 1
}
return filesLoaded, nil
}
// This will probably get very slow over time!
// A solution could be some sort of index file in which all other files
// and the timespans they cover are listed.
func findFiles(direntries []fs.DirEntry, t int64, extension string, findMoreRecentFiles bool) ([]string, error) {
nums := map[string]int64{}
for _, e := range direntries {
if !strings.HasSuffix(e.Name(), "."+extension) {
continue
}
ts, err := strconv.ParseInt(e.Name()[strings.Index(e.Name(), "_")+1:len(e.Name())-5], 10, 64)
if err != nil {
return nil, err
}
nums[e.Name()] = ts
}
sort.Slice(direntries, func(i, j int) bool {
a, b := direntries[i], direntries[j]
return nums[a.Name()] < nums[b.Name()]
})
filenames := make([]string, 0)
for i := 0; i < len(direntries); i++ {
e := direntries[i]
ts1 := nums[e.Name()]
if findMoreRecentFiles && t <= ts1 {
filenames = append(filenames, e.Name())
}
if i == len(direntries)-1 {
continue
}
enext := direntries[i+1]
ts2 := nums[enext.Name()]
if findMoreRecentFiles {
if ts1 < t && t < ts2 {
filenames = append(filenames, e.Name())
}
} else {
if ts2 < t {
filenames = append(filenames, e.Name())
}
}
}
return filenames, nil
}

View File

@@ -1,88 +0,0 @@
package memorystore
import (
"bufio"
"fmt"
"time"
)
// This is a threshold that allows a node to be healthy with a certain number of data points missing.
// Suppose a node does not receive the last 5 data points; the healthCheck endpoint will still report
// the node as healthy. Anything more than 5 missing points in a metric deems that metric unhealthy.
const MaxMissingDataPoints int64 = 5
// This is a threshold that allows up to a certain number of metrics in a node to be unhealthy.
// Works together with MaxMissingDataPoints: if fewer than MaxUnhealthyMetrics metrics (including
// sub-metrics) miss their last MaxMissingDataPoints data points, the node is still considered
// healthy; any more unhealthy metrics deem the node unhealthy.
const MaxUnhealthyMetrics int64 = 5
func (b *buffer) healthCheck() int64 {
// Check if the buffer is empty
if b.data == nil {
return 1
}
buffer_end := b.start + b.frequency*int64(len(b.data))
t := time.Now().Unix()
// Check if the buffer is too old
if t-buffer_end > MaxMissingDataPoints*b.frequency {
return 1
}
return 0
}
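// Worked example (hypothetical values, mirroring the check above): with a
// frequency of 60s and MaxMissingDataPoints = 5, a buffer whose last slot
// ended more than 5*60 = 300 seconds ago counts as unhealthy.
func exampleStale(b *buffer) bool {
bufferEnd := b.start + b.frequency*int64(len(b.data))
return time.Now().Unix()-bufferEnd > MaxMissingDataPoints*b.frequency
}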
func (l *Level) healthCheck(m *MemoryStore, count int64) (int64, error) {
l.lock.RLock()
defer l.lock.RUnlock()
for _, mc := range m.Metrics {
if b := l.metrics[mc.Offset]; b != nil {
count += b.healthCheck()
}
}
for _, lvl := range l.children {
c, err := lvl.healthCheck(m, 0)
if err != nil {
return 0, err
}
count += c
}
return count, nil
}
func (m *MemoryStore) HealthCheck(w *bufio.Writer, selector []string) error {
lvl := m.root.findLevel(selector)
if lvl == nil {
return fmt.Errorf("not found: %#v", selector)
}
buf := make([]byte, 0, 25)
// buf = append(buf, "{"...)
var count int64 = 0
unhealthyMetricsCount, err := lvl.healthCheck(m, count)
if err != nil {
return err
}
if unhealthyMetricsCount < MaxUnhealthyMetrics {
buf = append(buf, "Healthy"...)
} else {
buf = append(buf, "Unhealthy"...)
}
// buf = append(buf, "}\n"...)
if _, err = w.Write(buf); err != nil {
return err
}
return w.Flush()
}

View File

@@ -1,187 +0,0 @@
package memorystore
import (
"sync"
"unsafe"
"github.com/ClusterCockpit/cc-metric-store/internal/util"
)
// Could also be called "node" as this forms a node in a tree structure.
// Called Level because "node" might be confusing here.
// Can be both a leaf or an inner node. In this tree structure, inner nodes can
// also hold data (in `metrics`).
type Level struct {
children map[string]*Level
metrics []*buffer
lock sync.RWMutex
}
// Find the correct level for the given selector, creating it if
// it does not exist. Example selector in the context of the
// ClusterCockpit could be: []string{ "emmy", "host123", "cpu0" }.
// This function would probably benefit a lot from `level.children` being a `sync.Map`?
func (l *Level) findLevelOrCreate(selector []string, nMetrics int) *Level {
if len(selector) == 0 {
return l
}
// Allow concurrent reads:
l.lock.RLock()
var child *Level
var ok bool
if l.children == nil {
// Children map needs to be created...
l.lock.RUnlock()
} else {
child, ok := l.children[selector[0]]
l.lock.RUnlock()
if ok {
return child.findLevelOrCreate(selector[1:], nMetrics)
}
}
// The level does not exist, take the write lock for unique access:
l.lock.Lock()
// While this thread waited for the write lock, another thread
// could have created the child node.
if l.children != nil {
child, ok = l.children[selector[0]]
if ok {
l.lock.Unlock()
return child.findLevelOrCreate(selector[1:], nMetrics)
}
}
child = &Level{
metrics: make([]*buffer, nMetrics),
children: nil,
}
if l.children != nil {
l.children[selector[0]] = child
} else {
l.children = map[string]*Level{selector[0]: child}
}
l.lock.Unlock()
return child.findLevelOrCreate(selector[1:], nMetrics)
}
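// Hypothetical usage (names made up): resolve, and lazily create, the level
// for one CPU of one host. Each missing path segment is created on the way.
func exampleFindLevel(root *Level, nMetrics int) *Level {
return root.findLevelOrCreate([]string{"emmy", "host123", "cpu0"}, nMetrics)
}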
func (l *Level) free(t int64) (int, error) {
l.lock.Lock()
defer l.lock.Unlock()
n := 0
for i, b := range l.metrics {
if b != nil {
delme, m := b.free(t)
n += m
if delme {
if cap(b.data) == BUFFER_CAP {
bufferPool.Put(b)
}
l.metrics[i] = nil
}
}
}
for _, l := range l.children {
m, err := l.free(t)
n += m
if err != nil {
return n, err
}
}
return n, nil
}
func (l *Level) sizeInBytes() int64 {
l.lock.RLock()
defer l.lock.RUnlock()
size := int64(0)
for _, b := range l.metrics {
if b != nil {
size += b.count() * int64(unsafe.Sizeof(util.Float(0)))
}
}
for _, child := range l.children {
size += child.sizeInBytes()
}
return size
}
func (l *Level) findLevel(selector []string) *Level {
if len(selector) == 0 {
return l
}
l.lock.RLock()
defer l.lock.RUnlock()
lvl := l.children[selector[0]]
if lvl == nil {
return nil
}
return lvl.findLevel(selector[1:])
}
func (l *Level) findBuffers(selector util.Selector, offset int, f func(b *buffer) error) error {
l.lock.RLock()
defer l.lock.RUnlock()
if len(selector) == 0 {
b := l.metrics[offset]
if b != nil {
return f(b)
}
for _, lvl := range l.children {
err := lvl.findBuffers(nil, offset, f)
if err != nil {
return err
}
}
return nil
}
sel := selector[0]
if len(sel.String) != 0 && l.children != nil {
lvl, ok := l.children[sel.String]
if ok {
err := lvl.findBuffers(selector[1:], offset, f)
if err != nil {
return err
}
}
return nil
}
if sel.Group != nil && l.children != nil {
for _, key := range sel.Group {
lvl, ok := l.children[key]
if ok {
err := lvl.findBuffers(selector[1:], offset, f)
if err != nil {
return err
}
}
}
return nil
}
if sel.Any && l.children != nil {
for _, lvl := range l.children {
if err := lvl.findBuffers(selector[1:], offset, f); err != nil {
return err
}
}
return nil
}
return nil
}

View File

@@ -1,373 +0,0 @@
package memorystore
import (
"context"
"errors"
"log"
"runtime"
"sync"
"time"
"github.com/ClusterCockpit/cc-metric-store/internal/avro"
"github.com/ClusterCockpit/cc-metric-store/internal/config"
"github.com/ClusterCockpit/cc-metric-store/internal/util"
"github.com/ClusterCockpit/cc-metric-store/pkg/resampler"
)
var (
singleton sync.Once
msInstance *MemoryStore
)
var NumWorkers int = 4
func init() {
maxWorkers := 10
NumWorkers = runtime.NumCPU()/2 + 1
if NumWorkers > maxWorkers {
NumWorkers = maxWorkers
}
}
type Metric struct {
Name string
Value util.Float
MetricConfig config.MetricConfig
}
type MemoryStore struct {
Metrics map[string]config.MetricConfig
root Level
}
// Create a new, initialized instance of a MemoryStore.
// Will panic if values in the metric configurations are invalid.
func Init(metrics map[string]config.MetricConfig) {
singleton.Do(func() {
offset := 0
for key, cfg := range metrics {
if cfg.Frequency == 0 {
panic("invalid frequency")
}
metrics[key] = config.MetricConfig{
Frequency: cfg.Frequency,
Aggregation: cfg.Aggregation,
Offset: offset,
}
offset += 1
}
msInstance = &MemoryStore{
root: Level{
metrics: make([]*buffer, len(metrics)),
children: make(map[string]*Level),
},
Metrics: metrics,
}
})
}
func GetMemoryStore() *MemoryStore {
if msInstance == nil {
log.Fatalf("MemoryStore not initialized!")
}
return msInstance
}
func Shutdown() {
log.Printf("Writing to '%s'...\n", config.Keys.Checkpoints.RootDir)
var files int
var err error
ms := GetMemoryStore()
if config.Keys.Checkpoints.FileFormat == "json" {
files, err = ms.ToCheckpoint(config.Keys.Checkpoints.RootDir, lastCheckpoint.Unix(), time.Now().Unix())
} else {
files, err = avro.GetAvroStore().ToCheckpoint(config.Keys.Checkpoints.RootDir, true)
close(avro.LineProtocolMessages)
}
if err != nil {
log.Printf("Writing checkpoint failed: %s\n", err.Error())
}
log.Printf("Done! (%d files written)\n", files)
// ms.PrintHeirarchy()
}
// func (m *MemoryStore) PrintHeirarchy() {
// m.root.lock.Lock()
// defer m.root.lock.Unlock()
// fmt.Printf("Root : \n")
// for lvl1, sel1 := range m.root.children {
// fmt.Printf("\t%s\n", lvl1)
// for lvl2, sel2 := range sel1.children {
// fmt.Printf("\t\t%s\n", lvl2)
// if lvl1 == "fritz" && lvl2 == "f0201" {
// for name, met := range m.Metrics {
// mt := sel2.metrics[met.Offset]
// fmt.Printf("\t\t\t\t%s\n", name)
// fmt.Printf("\t\t\t\t")
// for mt != nil {
// // if name == "cpu_load" {
// fmt.Printf("%d(%d) -> %#v", mt.start, len(mt.data), mt.data)
// // }
// mt = mt.prev
// }
// fmt.Printf("\n")
// }
// }
// for lvl3, sel3 := range sel2.children {
// if lvl1 == "fritz" && lvl2 == "f0201" && lvl3 == "hwthread70" {
// fmt.Printf("\t\t\t\t\t%s\n", lvl3)
// for name, met := range m.Metrics {
// mt := sel3.metrics[met.Offset]
// fmt.Printf("\t\t\t\t\t\t%s\n", name)
// fmt.Printf("\t\t\t\t\t\t")
// for mt != nil {
// // if name == "clock" {
// fmt.Printf("%d(%d) -> %#v", mt.start, len(mt.data), mt.data)
// mt = mt.prev
// }
// fmt.Printf("\n")
// }
// // for i, _ := range sel3.metrics {
// // fmt.Printf("\t\t\t\t\t%s\n", getName(configmetrics, i))
// // }
// }
// }
// }
// }
// }
func getName(m *MemoryStore, i int) string {
for key, val := range m.Metrics {
if val.Offset == i {
return key
}
}
return ""
}
func Retention(wg *sync.WaitGroup, ctx context.Context) {
ms := GetMemoryStore()
go func() {
defer wg.Done()
d, err := time.ParseDuration(config.Keys.RetentionInMemory)
if err != nil {
log.Fatal(err)
}
if d <= 0 {
return
}
ticks := func() <-chan time.Time {
d := d / 2
if d <= 0 {
return nil
}
return time.NewTicker(d).C
}()
for {
select {
case <-ctx.Done():
return
case <-ticks:
t := time.Now().Add(-d)
log.Printf("start freeing buffers (older than %s)...\n", t.Format(time.RFC3339))
freed, err := ms.Free(nil, t.Unix())
if err != nil {
log.Printf("freeing up buffers failed: %s\n", err.Error())
} else {
log.Printf("done: %d buffers freed\n", freed)
}
}
}
}()
}
// Write all values in `metrics` to the level specified by `selector` for time `ts`.
// Look at `findLevelOrCreate` for how selectors work.
func (m *MemoryStore) Write(selector []string, ts int64, metrics []Metric) error {
var ok bool
for i, metric := range metrics {
if metric.MetricConfig.Frequency == 0 {
metric.MetricConfig, ok = m.Metrics[metric.Name]
if !ok {
metric.MetricConfig.Frequency = 0
}
metrics[i] = metric
}
}
return m.WriteToLevel(&m.root, selector, ts, metrics)
}
func (m *MemoryStore) GetLevel(selector []string) *Level {
return m.root.findLevelOrCreate(selector, len(m.Metrics))
}
// Assumes that `minfo` in `metrics` is filled in!
func (m *MemoryStore) WriteToLevel(l *Level, selector []string, ts int64, metrics []Metric) error {
l = l.findLevelOrCreate(selector, len(m.Metrics))
l.lock.Lock()
defer l.lock.Unlock()
for _, metric := range metrics {
if metric.MetricConfig.Frequency == 0 {
continue
}
b := l.metrics[metric.MetricConfig.Offset]
if b == nil {
// First write to this metric and level
b = newBuffer(ts, metric.MetricConfig.Frequency)
l.metrics[metric.MetricConfig.Offset] = b
}
nb, err := b.write(ts, metric.Value)
if err != nil {
return err
}
// Last write created a new buffer...
if b != nb {
l.metrics[metric.MetricConfig.Offset] = nb
}
}
return nil
}
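// Minimal write sketch (metric name and value are assumptions): one
// node-level sample for a single host at the current timestamp. Write()
// fills in the MetricConfig from the store's configuration.
func exampleWrite(ms *MemoryStore) error {
return ms.Write([]string{"ctest", "host1"}, time.Now().Unix(), []Metric{
{Name: "flops", Value: util.Float(42.0)},
})
}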
// Returns all values for metric `metric` from `from` to `to` for the selected level(s).
// If the level does not hold the metric itself, the data will be aggregated recursively from the children.
// The second and third return values are the actual from/to for the data. Those can be different from
// the range asked for if no data was available.
func (m *MemoryStore) Read(selector util.Selector, metric string, from, to, resolution int64) ([]util.Float, int64, int64, int64, error) {
if from > to {
return nil, 0, 0, 0, errors.New("invalid time range")
}
minfo, ok := m.Metrics[metric]
if !ok {
return nil, 0, 0, 0, errors.New("unknown metric: " + metric)
}
n, data := 0, make([]util.Float, (to-from)/minfo.Frequency+1)
err := m.root.findBuffers(selector, minfo.Offset, func(b *buffer) error {
cdata, cfrom, cto, err := b.read(from, to, data)
if err != nil {
return err
}
if n == 0 {
from, to = cfrom, cto
} else if from != cfrom || to != cto || len(data) != len(cdata) {
missingfront, missingback := int((from-cfrom)/minfo.Frequency), int((to-cto)/minfo.Frequency)
if missingfront != 0 {
return ErrDataDoesNotAlign
}
newlen := len(cdata) - missingback
if newlen < 1 {
return ErrDataDoesNotAlign
}
cdata = cdata[0:newlen]
if len(cdata) != len(data) {
return ErrDataDoesNotAlign
}
from, to = cfrom, cto
}
data = cdata
n += 1
return nil
})
if err != nil {
return nil, 0, 0, 0, err
} else if n == 0 {
return nil, 0, 0, 0, errors.New("metric or host not found")
} else if n > 1 {
if minfo.Aggregation == config.AvgAggregation {
normalize := 1. / util.Float(n)
for i := 0; i < len(data); i++ {
data[i] *= normalize
}
} else if minfo.Aggregation != config.SumAggregation {
return nil, 0, 0, 0, errors.New("invalid aggregation")
}
}
data, resolution, err = resampler.LargestTriangleThreeBucket(data, minfo.Frequency, resolution)
if err != nil {
return nil, 0, 0, 0, err
}
return data, from, to, resolution, nil
}
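// Read sketch (selector and metric name are assumptions): aggregate "flops"
// across all hosts of one cluster at a 60s resolution.
func exampleRead(ms *MemoryStore, from, to int64) ([]util.Float, error) {
sel := util.Selector{{String: "ctest"}, {Any: true}}
data, _, _, _, err := ms.Read(sel, "flops", from, to, 60)
return data, err
}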
// Release all buffers for the selected level and all its children that contain only
// values older than `t`.
func (m *MemoryStore) Free(selector []string, t int64) (int, error) {
return m.GetLevel(selector).free(t)
}
func (m *MemoryStore) FreeAll() error {
for k := range m.root.children {
delete(m.root.children, k)
}
return nil
}
func (m *MemoryStore) SizeInBytes() int64 {
return m.root.sizeInBytes()
}
// Given a selector, return a list of all children of the level selected.
func (m *MemoryStore) ListChildren(selector []string) []string {
lvl := &m.root
for lvl != nil && len(selector) != 0 {
lvl.lock.RLock()
next := lvl.children[selector[0]]
lvl.lock.RUnlock()
lvl = next
selector = selector[1:]
}
if lvl == nil {
return nil
}
lvl.lock.RLock()
defer lvl.lock.RUnlock()
children := make([]string, 0, len(lvl.children))
for child := range lvl.children {
children = append(children, child)
}
return children
}

View File

@@ -1,140 +0,0 @@
// Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
// All rights reserved.
// Use of this source code is governed by a MIT-style
// license that can be found in the LICENSE file.
package runtimeEnv
import (
"bufio"
"errors"
"fmt"
"os"
"os/exec"
"os/user"
"strconv"
"strings"
"syscall"
)
// Very simple and limited .env file reader.
// All variable definitions found are directly
// added to the process's environment.
func LoadEnv(file string) error {
f, err := os.Open(file)
if err != nil {
// log.Error("Error while opening .env file")
return err
}
defer f.Close()
s := bufio.NewScanner(bufio.NewReader(f))
for s.Scan() {
line := s.Text()
if strings.HasPrefix(line, "#") || len(line) == 0 {
continue
}
if strings.Contains(line, "#") {
return errors.New("'#' are only supported at the start of a line")
}
line = strings.TrimPrefix(line, "export ")
parts := strings.SplitN(line, "=", 2)
if len(parts) != 2 {
return fmt.Errorf("RUNTIME/SETUP > unsupported line: %#v", line)
}
key := strings.TrimSpace(parts[0])
val := strings.TrimSpace(parts[1])
if strings.HasPrefix(val, "\"") {
if !strings.HasSuffix(val, "\"") {
return fmt.Errorf("RUNTIME/SETUP > unsupported line: %#v", line)
}
runes := []rune(val[1 : len(val)-1])
sb := strings.Builder{}
for i := 0; i < len(runes); i++ {
if runes[i] == '\\' {
i++
switch runes[i] {
case 'n':
sb.WriteRune('\n')
case 'r':
sb.WriteRune('\r')
case 't':
sb.WriteRune('\t')
case '"':
sb.WriteRune('"')
default:
return fmt.Errorf("RUNTIME/SETUP > unsupported escape sequence in quoted string: backslash %#v", runes[i])
}
continue
}
sb.WriteRune(runes[i])
}
val = sb.String()
}
os.Setenv(key, val)
}
return s.Err()
}
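// Illustrative input accepted by LoadEnv (file contents are assumptions):
//
// # comment lines start with '#'
// export API_TOKEN="line1\nline2"
// PLAIN_VALUE=foo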
// Changes the process's user and group to those
// specified in the config.json. The Go runtime
// takes care of all threads (and not only the calling one)
// executing the underlying system call.
func DropPrivileges(username string, group string) error {
if group != "" {
g, err := user.LookupGroup(group)
if err != nil {
// log.Warn("Error while looking up group")
return err
}
gid, _ := strconv.Atoi(g.Gid)
if err := syscall.Setgid(gid); err != nil {
// log.Warn("Error while setting gid")
return err
}
}
if username != "" {
u, err := user.Lookup(username)
if err != nil {
// log.Warn("Error while looking up user")
return err
}
uid, _ := strconv.Atoi(u.Uid)
if err := syscall.Setuid(uid); err != nil {
// log.Warn("Error while setting uid")
return err
}
}
return nil
}
// If started via systemd, inform systemd that we are running:
// https://www.freedesktop.org/software/systemd/man/sd_notify.html
func SystemdNotifiy(ready bool, status string) {
if os.Getenv("NOTIFY_SOCKET") == "" {
// Not started using systemd
return
}
args := []string{fmt.Sprintf("--pid=%d", os.Getpid())}
if ready {
args = append(args, "--ready")
}
if status != "" {
args = append(args, fmt.Sprintf("--status=%s", status))
}
cmd := exec.Command("systemd-notify", args...)
cmd.Run() // errors ignored on purpose, there is not much to do anyway.
}

View File

@@ -1,51 +0,0 @@
package util
import (
"encoding/json"
"errors"
)
type SelectorElement struct {
String string
Group []string
Any bool
}
func (se *SelectorElement) UnmarshalJSON(input []byte) error {
if input[0] == '"' {
if err := json.Unmarshal(input, &se.String); err != nil {
return err
}
if se.String == "*" {
se.Any = true
se.String = ""
}
return nil
}
if input[0] == '[' {
return json.Unmarshal(input, &se.Group)
}
return errors.New("the Go SelectorElement type can only be a string or an array of strings")
}
func (se *SelectorElement) MarshalJSON() ([]byte, error) {
if se.Any {
return []byte("\"*\""), nil
}
if se.String != "" {
return json.Marshal(se.String)
}
if se.Group != nil {
return json.Marshal(se.Group)
}
return nil, errors.New("a Go Selector must be a non-empty string or a non-empty slice of strings")
}
type Selector []SelectorElement
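// Illustrative JSON forms a Selector can be unmarshaled from (names are
// assumptions):
//
// ["emmy", "host123", "cpu0"] // one string per level
// ["emmy", ["host1", "host2"]] // a group of children at one level
// ["emmy", "*"] // "*" matches any child (Any = true)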

View File

@@ -1,4 +1,4 @@
package api
package main
import (
"context"
@@ -9,19 +9,21 @@ import (
"sync"
"time"
"github.com/ClusterCockpit/cc-metric-store/internal/avro"
"github.com/ClusterCockpit/cc-metric-store/internal/config"
"github.com/ClusterCockpit/cc-metric-store/internal/memorystore"
"github.com/ClusterCockpit/cc-metric-store/internal/util"
"github.com/influxdata/line-protocol/v2/lineprotocol"
"github.com/nats-io/nats.go"
)
type Metric struct {
Name string
Value Float
Unit string
mc MetricConfig
}
// Currently unused, could be used to send messages via raw TCP.
// Each connection is handled in its own goroutine. This is a blocking function.
func ReceiveRaw(ctx context.Context,
listener net.Listener,
handleLine func(*lineprotocol.Decoder, string) error,
) error {
func ReceiveRaw(ctx context.Context, listener net.Listener, handleLine func(*lineprotocol.Decoder, string) error) error {
var wg sync.WaitGroup
wg.Add(1)
@@ -66,7 +68,7 @@ func ReceiveRaw(ctx context.Context,
return
}
log.Printf("ReceiveRaw Error from Address %s: %s", conn.RemoteAddr().String(), err.Error())
log.Printf("%s: %s", conn.RemoteAddr().String(), err.Error())
errmsg := make([]byte, 128)
errmsg = append(errmsg, `error: `...)
errmsg = append(errmsg, err.Error()...)
@@ -83,20 +85,12 @@ func ReceiveRaw(ctx context.Context,
// Connect to a nats server and subscribe to "updates". This is a blocking
// function. handleLine will be called for each line received via nats.
// Send `true` through the done channel for graceful termination.
func ReceiveNats(conf *config.NatsConfig,
ms *memorystore.MemoryStore,
workers int,
ctx context.Context,
) error {
func ReceiveNats(conf *NatsConfig, handleLine func(*lineprotocol.Decoder, string) error, workers int, ctx context.Context) error {
var opts []nats.Option
if conf.Username != "" && conf.Password != "" {
opts = append(opts, nats.UserInfo(conf.Username, conf.Password))
}
if conf.Credsfilepath != "" {
opts = append(opts, nats.UserCredentials(conf.Credsfilepath))
}
nc, err := nats.Connect(conf.Address, opts...)
if err != nil {
return err
@@ -118,7 +112,7 @@ func ReceiveNats(conf *config.NatsConfig,
go func() {
for m := range msgs {
dec := lineprotocol.NewDecoderWithBytes(m.Data)
if err := decodeLine(dec, ms, clusterTag); err != nil {
if err := handleLine(dec, clusterTag); err != nil {
log.Printf("error: %s\n", err.Error())
}
}
@@ -133,7 +127,7 @@ func ReceiveNats(conf *config.NatsConfig,
} else {
sub, err = nc.Subscribe(sc.SubscribeTo, func(m *nats.Msg) {
dec := lineprotocol.NewDecoderWithBytes(m.Data)
if err := decodeLine(dec, ms, clusterTag); err != nil {
if err := handleLine(dec, clusterTag); err != nil {
log.Printf("error: %s\n", err.Error())
}
})
@@ -182,21 +176,18 @@ func reorder(buf, prefix []byte) []byte {
// Decode lines using dec and make write calls to the MemoryStore.
// If a line is missing its cluster tag, use clusterDefault as default.
func decodeLine(dec *lineprotocol.Decoder,
ms *memorystore.MemoryStore,
clusterDefault string,
) error {
func decodeLine(dec *lineprotocol.Decoder, clusterDefault string) error {
// Reduce allocations in loop:
t := time.Now()
metric, metricBuf := memorystore.Metric{}, make([]byte, 0, 16)
metric, metricBuf := Metric{}, make([]byte, 0, 16)
selector := make([]string, 0, 4)
typeBuf, subTypeBuf := make([]byte, 0, 16), make([]byte, 0)
// Optimize for the case where all lines in a "batch" are about the same
// cluster and host. By using `WriteToLevel` (level = host), we do not need
// to take the root- and cluster-level lock as often.
var lvl *memorystore.Level = nil
prevCluster, prevHost := "", ""
var lvl *level = nil
var prevCluster, prevHost string = "", ""
var ok bool
for dec.Next() {
@@ -210,13 +201,13 @@ func decodeLine(dec *lineprotocol.Decoder,
metricBuf = append(metricBuf[:0], rawmeasurement...)
// The go compiler optimizes map[string(byteslice)] lookups:
metric.MetricConfig, ok = ms.Metrics[string(rawmeasurement)]
metric.mc, ok = memoryStore.metrics[string(rawmeasurement)]
if !ok {
continue
}
typeBuf, subTypeBuf := typeBuf[:0], subTypeBuf[:0]
cluster, host := clusterDefault, ""
cluster, host, unit := clusterDefault, "", ""
for {
key, val, err := dec.NextTag()
if err != nil {
@@ -242,6 +233,8 @@ func decodeLine(dec *lineprotocol.Decoder,
host = string(val)
lvl = nil
}
case "unit":
unit = string(val)
case "type":
if string(val) == "node" {
break
@@ -255,13 +248,12 @@ func decodeLine(dec *lineprotocol.Decoder,
}
case "type-id":
typeBuf = append(typeBuf, val...)
case "stype":
// We cannot be sure that the "stype" tag comes before the "stype-id" tag:
case "subtype":
// We cannot be sure that the "subtype" tag comes before the "stype-id" tag:
if len(subTypeBuf) == 0 {
subTypeBuf = append(subTypeBuf, val...)
} else {
subTypeBuf = reorder(subTypeBuf, val)
// subTypeBuf = reorder(typeBuf, val)
subTypeBuf = reorder(typeBuf, val)
}
case "stype-id":
subTypeBuf = append(subTypeBuf, val...)
@@ -275,7 +267,7 @@ func decodeLine(dec *lineprotocol.Decoder,
if lvl == nil {
selector = selector[:2]
selector[0], selector[1] = cluster, host
lvl = ms.GetLevel(selector)
lvl = memoryStore.GetLevel(selector)
prevCluster, prevHost = cluster, host
}
@@ -299,50 +291,26 @@ func decodeLine(dec *lineprotocol.Decoder,
}
if string(key) != "value" {
return fmt.Errorf("host %s: unknown field: '%s' (value: %#v)", host, string(key), val)
return fmt.Errorf("unkown field: '%s' (value: %#v)", string(key), val)
}
if val.Kind() == lineprotocol.Float {
metric.Value = util.Float(val.FloatV())
metric.Value = Float(val.FloatV())
} else if val.Kind() == lineprotocol.Int {
metric.Value = util.Float(val.IntV())
metric.Value = Float(val.IntV())
} else if val.Kind() == lineprotocol.Uint {
metric.Value = util.Float(val.UintV())
metric.Value = Float(val.UintV())
} else {
return fmt.Errorf("host %s: unsupported value type in message: %s", host, val.Kind().String())
return fmt.Errorf("unsupported value type in message: %s", val.Kind().String())
}
metric.Unit = unit
}
if t, err = dec.Time(lineprotocol.Second, t); err != nil {
t = time.Now()
if t, err = dec.Time(lineprotocol.Millisecond, t); err != nil {
t = time.Now()
if t, err = dec.Time(lineprotocol.Microsecond, t); err != nil {
t = time.Now()
if t, err = dec.Time(lineprotocol.Nanosecond, t); err != nil {
return fmt.Errorf("host %s: timestamp : %#v with error : %#v", host, t, err.Error())
}
}
}
return err
}
if err != nil {
return fmt.Errorf("host %s: timestamp : %#v with error : %#v", host, t, err.Error())
}
time := t.Unix()
if config.Keys.Checkpoints.FileFormat != "json" {
avro.LineProtocolMessages <- &avro.AvroStruct{
MetricName: string(metricBuf),
Cluster: cluster,
Node: host,
Selector: append([]string{}, selector...),
Value: metric.Value,
Timestamp: time}
}
if err := ms.WriteToLevel(lvl, selector, time, []memorystore.Metric{metric}); err != nil {
if err := memoryStore.WriteToLevel(lvl, selector, t.Unix(), []Metric{metric}); err != nil {
return err
}
}

144
lineprotocol_test.go Normal file
View File

@@ -0,0 +1,144 @@
package main
import (
"bytes"
"log"
"strconv"
"testing"
"github.com/influxdata/line-protocol/v2/lineprotocol"
)
const TestDataClassicFormat string = `
m1,cluster=ctest,hostname=htest1,type=node value=1 123456789
m2,cluster=ctest,hostname=htest1,type=node value=2 123456789
m3,hostname=htest2,type=node value=3 123456789
m4,cluster=ctest,hostname=htest2,type=core,type-id=1 value=4 123456789
m4,cluster=ctest,hostname=htest2,type-id=2,type=core value=5 123456789
`
const BenchmarkLineBatch string = `
nm1,cluster=ctest,hostname=htest1,type=node value=123.0 123456789
nm2,cluster=ctest,hostname=htest1,type=node value=123.0 123456789
nm3,cluster=ctest,hostname=htest1,type=node value=123.0 123456789
nm4,cluster=ctest,hostname=htest1,type=node value=123.0 123456789
nm5,cluster=ctest,hostname=htest1,type=node value=123.0 123456789
nm6,cluster=ctest,hostname=htest1,type=node value=123.0 123456789
nm7,cluster=ctest,hostname=htest1,type=node value=123.0 123456789
nm8,cluster=ctest,hostname=htest1,type=node value=123.0 123456789
nm9,cluster=ctest,hostname=htest1,type=node value=123.0 123456789
cm1,cluster=ctest,hostname=htest1,type=core,type-id=1 value=234.0 123456789
cm2,cluster=ctest,hostname=htest1,type=core,type-id=1 value=234.0 123456789
cm3,cluster=ctest,hostname=htest1,type=core,type-id=1 value=234.0 123456789
cm4,cluster=ctest,hostname=htest1,type=core,type-id=1 value=234.0 123456789
cm5,cluster=ctest,hostname=htest1,type=core,type-id=1 value=234.0 123456789
cm6,cluster=ctest,hostname=htest1,type=core,type-id=1 value=234.0 123456789
cm7,cluster=ctest,hostname=htest1,type=core,type-id=1 value=234.0 123456789
cm8,cluster=ctest,hostname=htest1,type=core,type-id=1 value=234.0 123456789
cm9,cluster=ctest,hostname=htest1,type=core,type-id=1 value=234.0 123456789
cm1,cluster=ctest,hostname=htest1,type=core,type-id=2 value=345.0 123456789
cm2,cluster=ctest,hostname=htest1,type=core,type-id=2 value=345.0 123456789
cm3,cluster=ctest,hostname=htest1,type=core,type-id=2 value=345.0 123456789
cm4,cluster=ctest,hostname=htest1,type=core,type-id=2 value=345.0 123456789
cm5,cluster=ctest,hostname=htest1,type=core,type-id=2 value=345.0 123456789
cm6,cluster=ctest,hostname=htest1,type=core,type-id=2 value=345.0 123456789
cm7,cluster=ctest,hostname=htest1,type=core,type-id=2 value=345.0 123456789
cm8,cluster=ctest,hostname=htest1,type=core,type-id=2 value=345.0 123456789
cm9,cluster=ctest,hostname=htest1,type=core,type-id=2 value=345.0 123456789
cm1,cluster=ctest,hostname=htest1,type=core,type-id=3 value=456.0 123456789
cm2,cluster=ctest,hostname=htest1,type=core,type-id=3 value=456.0 123456789
cm3,cluster=ctest,hostname=htest1,type=core,type-id=3 value=456.0 123456789
cm4,cluster=ctest,hostname=htest1,type=core,type-id=3 value=456.0 123456789
cm5,cluster=ctest,hostname=htest1,type=core,type-id=3 value=456.0 123456789
cm6,cluster=ctest,hostname=htest1,type=core,type-id=3 value=456.0 123456789
cm7,cluster=ctest,hostname=htest1,type=core,type-id=3 value=456.0 123456789
cm8,cluster=ctest,hostname=htest1,type=core,type-id=3 value=456.0 123456789
cm9,cluster=ctest,hostname=htest1,type=core,type-id=3 value=456.0 123456789
cm1,cluster=ctest,hostname=htest1,type=core,type-id=4 value=567.0 123456789
cm2,cluster=ctest,hostname=htest1,type=core,type-id=4 value=567.0 123456789
cm3,cluster=ctest,hostname=htest1,type=core,type-id=4 value=567.0 123456789
cm4,cluster=ctest,hostname=htest1,type=core,type-id=4 value=567.0 123456789
cm5,cluster=ctest,hostname=htest1,type=core,type-id=4 value=567.0 123456789
cm6,cluster=ctest,hostname=htest1,type=core,type-id=4 value=567.0 123456789
cm7,cluster=ctest,hostname=htest1,type=core,type-id=4 value=567.0 123456789
cm8,cluster=ctest,hostname=htest1,type=core,type-id=4 value=567.0 123456789
cm9,cluster=ctest,hostname=htest1,type=core,type-id=4 value=567.0 123456789
`
func TestLineprotocolDecoder(t *testing.T) {
prevMemoryStore := memoryStore
t.Cleanup(func() {
memoryStore = prevMemoryStore
})
memoryStore = NewMemoryStore(map[string]MetricConfig{
"m1": {Frequency: 1},
"m2": {Frequency: 1},
"m3": {Frequency: 1},
"m4": {Frequency: 1},
})
dec := lineprotocol.NewDecoderWithBytes([]byte(TestDataClassicFormat))
if err := decodeLine(dec, "ctest"); err != nil {
log.Fatal(err)
}
// memoryStore.DebugDump(bufio.NewWriter(os.Stderr))
h1 := memoryStore.GetLevel([]string{"ctest", "htest1"})
h1b1 := h1.metrics[memoryStore.metrics["m1"].offset]
h1b2 := h1.metrics[memoryStore.metrics["m2"].offset]
if h1b1.data[0] != 1.0 || h1b2.data[0] != 2.0 {
log.Fatal()
}
h2 := memoryStore.GetLevel([]string{"ctest", "htest2"})
h2b3 := h2.metrics[memoryStore.metrics["m3"].offset]
if h2b3.data[0] != 3.0 {
log.Fatal()
}
h2c1 := memoryStore.GetLevel([]string{"ctest", "htest2", "core1"})
h2c1b4 := h2c1.metrics[memoryStore.metrics["m4"].offset]
h2c2 := memoryStore.GetLevel([]string{"ctest", "htest2", "core2"})
h2c2b4 := h2c2.metrics[memoryStore.metrics["m4"].offset]
if h2c1b4.data[0] != 4.0 || h2c2b4.data[0] != 5.0 {
log.Fatal()
}
}
func BenchmarkLineprotocolDecoder(b *testing.B) {
b.StopTimer()
memoryStore = NewMemoryStore(map[string]MetricConfig{
"nm1": {Frequency: 1},
"nm2": {Frequency: 1},
"nm3": {Frequency: 1},
"nm4": {Frequency: 1},
"nm5": {Frequency: 1},
"nm6": {Frequency: 1},
"nm7": {Frequency: 1},
"nm8": {Frequency: 1},
"nm9": {Frequency: 1},
"cm1": {Frequency: 1},
"cm2": {Frequency: 1},
"cm3": {Frequency: 1},
"cm4": {Frequency: 1},
"cm5": {Frequency: 1},
"cm6": {Frequency: 1},
"cm7": {Frequency: 1},
"cm8": {Frequency: 1},
"cm9": {Frequency: 1},
})
for i := 0; i < b.N; i++ {
data := []byte(BenchmarkLineBatch)
data = bytes.ReplaceAll(data, []byte("123456789"), []byte(strconv.Itoa(i+123456789)))
dec := lineprotocol.NewDecoderWithBytes(data)
b.StartTimer()
if err := decodeLine(dec, "ctest"); err != nil {
b.Fatal(err)
}
b.StopTimer()
}
}

504
memoryStore_test.go.orig Normal file
View File

@@ -0,0 +1,504 @@
package main
import (
"fmt"
"log"
"math"
"testing"
"github.com/ClusterCockpit/cc-metric-store/lineprotocol"
)
var testMetrics [][]lineprotocol.Metric = [][]lineprotocol.Metric{
{{"flops", 100.5}, {"mem_bw", 2088.67}},
{{"flops", 180.5}, {"mem_bw", 4078.32}, {"mem_capacity", 1020}},
{{"flops", 980.5}, {"mem_bw", 9078.32}, {"mem_capacity", 5010}},
{{"flops", 940.5}, {"mem_bw", 9278.32}, {"mem_capacity", 6010}},
{{"flops", 930.5}, {"mem_bw", 9378.32}, {"mem_capacity", 7010}},
{{"flops", 980.5}, {"mem_bw", 9478.32}, {"mem_capacity", 8010}},
{{"flops", 980.5}, {"mem_bw", 9478.32}, {"mem_capacity", 8010}},
{{"flops", 980.5}, {"mem_bw", 9478.32}, {"mem_capacity", 8010}},
{{"flops", 970.5}, {"mem_bw", 9178.32}, {"mem_capacity", 2010}},
{{"flops", 970.5}, {"mem_bw", 9178.32}, {"mem_capacity", 2010}}}
var testMetricsAlt [][]lineprotocol.Metric = [][]lineprotocol.Metric{
{{"flops", 120.5}, {"mem_bw", 2080.67}},
{{"flops", 130.5}, {"mem_bw", 4071.32}, {"mem_capacity", 1120}},
{{"flops", 940.5}, {"mem_bw", 9072.32}, {"mem_capacity", 5210}},
{{"flops", 950.5}, {"mem_bw", 9273.32}, {"mem_capacity", 6310}},
{{"flops", 960.5}, {"mem_bw", 9374.32}, {"mem_capacity", 7410}},
{{"flops", 970.5}, {"mem_bw", 9475.32}, {"mem_capacity", 8510}},
{{"flops", 990.5}, {"mem_bw", 9476.32}, {"mem_capacity", 8610}},
{{"flops", 910.5}, {"mem_bw", 9477.32}, {"mem_capacity", 8710}},
{{"flops", 920.5}, {"mem_bw", 9178.32}, {"mem_capacity", 2810}},
{{"flops", 930.5}, {"mem_bw", 9179.32}, {"mem_capacity", 2910}}}
func dumpStoreBuffer(s *storeBuffer) {
log.Printf("Start TS %d\n", s.start)
ctr := 0
for _, val := range s.store {
fmt.Printf("%f\t", val)
ctr++
if ctr == 10 {
fmt.Printf("\n")
ctr = 0
}
}
}
func printMemStore(m *MemoryStore) {
log.Println("########################")
log.Printf("Frequency %d, Metrics %d Slots %d\n",
m.frequency, m.numMetrics, m.numSlots)
log.Println("##Offsets")
for key, val := range m.offsets {
log.Printf("\t%s = %d\n", key, val)
}
log.Println("##Containers")
for key, c := range m.containers {
log.Printf("ID %s\n", key)
log.Println("###current")
dumpStoreBuffer(c.current)
log.Println("###next")
dumpStoreBuffer(c.next)
}
log.Println("########################")
}
//############################
//#### Whitebox tests ########
//############################
func TestAddMetricSimple(t *testing.T) {
key := "m1220"
m := newMemoryStore([]string{"flops", "mem_bw", "mem_capacity"}, 10, 60)
// printMemStore(m)
m.AddMetrics(key, 1584022800, testMetrics[0])
m.AddMetrics(key, 1584022890, testMetrics[1])
want := testMetrics[0][0].Value
got := m.containers[key].current.store[0]
if got != want {
t.Errorf("Want %f got %f\n", want, got)
}
want = testMetrics[1][2].Value
got = m.containers[key].current.store[21]
if got != want {
t.Errorf("Want %f got %f\n", want, got)
}
// printMemStore(m)
}
func TestAddMetricReplace(t *testing.T) {
key := "m1220"
m := newMemoryStore([]string{"flops", "mem_bw", "mem_capacity"}, 10, 60)
// printMemStore(m)
m.AddMetrics(key, 1584022800, testMetrics[0])
m.AddMetrics(key, 1584022800, testMetrics[1])
want := testMetrics[1][0].Value
got := m.containers[key].current.store[0]
if got != want {
t.Errorf("Want %f got %f\n", want, got)
}
m.AddMetrics(key, 1584022850, testMetrics[0])
want = testMetrics[0][0].Value
got = m.containers[key].current.store[0]
if got != want {
t.Errorf("Want %f got %f\n", want, got)
}
m.AddMetrics(key, 1584022860, testMetrics[1])
want = testMetrics[0][0].Value
got = m.containers[key].current.store[0]
if got != want {
t.Errorf("Want %f got %f\n", want, got)
}
// printMemStore(m)
}
func TestAddMetricSwitch(t *testing.T) {
key := "m1220"
m := newMemoryStore([]string{"flops", "mem_bw", "mem_capacity"}, 10, 60)
// printMemStore(m)
m.AddMetrics(key, 1584023000, testMetrics[0])
m.AddMetrics(key, 1584023580, testMetrics[1])
want := testMetrics[1][2].Value
got := m.containers[key].current.store[29]
if got != want {
t.Errorf("Want %f got %f\n", want, got)
}
m.AddMetrics(key, 1584023600, testMetrics[2])
want = testMetrics[2][2].Value
got = m.containers[key].current.store[20]
if got != want {
t.Errorf("Want %f got %f\n", want, got)
}
// printMemStore(m)
}
//############################
//#### Blackbox tests ########
//############################
func TestAddMetricOutOfBounds(t *testing.T) {
key := "m1220"
m := newMemoryStore([]string{"flops", "mem_bw", "mem_capacity"}, 30, 60)
err := m.AddMetrics(key, 1584023000, testMetrics[0])
if err != nil {
t.Errorf("Got error 1584023000\n")
}
err = m.AddMetrics(key, 1584026600, testMetrics[0])
if err == nil {
t.Errorf("Got no error 1584026600\n")
}
err = m.AddMetrics(key, 1584021580, testMetrics[1])
if err == nil {
t.Errorf("Got no error 1584021580\n")
}
err = m.AddMetrics(key, 1584024580, testMetrics[1])
if err != nil {
t.Errorf("Got error 1584024580\n")
}
err = m.AddMetrics(key, 1584091580, testMetrics[1])
if err == nil {
t.Errorf("Got no error 1584091580\n")
}
err = m.AddMetrics(key, 1584024780, testMetrics[0])
if err != nil {
t.Errorf("Got error 1584024780\n")
}
}
func TestGetMetricPlainCurrent(t *testing.T) {
key := "m1220"
m := newMemoryStore([]string{"flops", "mem_bw", "mem_capacity"}, 10, 60)
for i := 0; i < len(testMetrics); i++ {
m.AddMetrics(key, int64(1584023000+i*60), testMetrics[i])
}
// printMemStore(m)
val, tsFrom, err := m.GetMetric(key, "flops", 1584023000, 1584023560)
if err != nil {
t.Errorf("Got error\n")
}
if tsFrom != 1584023000 {
t.Errorf("Start ts differs: %d\n", tsFrom)
}
if len(val) != 9 {
t.Errorf("Want 9. Got %d\n", len(val))
}
if val[0] != 100.5 {
t.Errorf("Want 100.5 Got %f\n", val[0])
}
if val[8] != 970.5 {
t.Errorf("Want 970.5 Got %f\n", val[8])
}
}
func TestGetMetricPlainNext(t *testing.T) {
key := "m1220"
m := newMemoryStore([]string{"flops", "mem_bw", "mem_capacity"}, 10, 60)
for i := 0; i < len(testMetrics); i++ {
m.AddMetrics(key, int64(1584023000+i*60), testMetrics[i])
}
for i := 0; i < len(testMetrics); i++ {
m.AddMetrics(key, int64(1584023600+i*60), testMetricsAlt[i])
}
// printMemStore(m)
val, tsFrom, err := m.GetMetric(key, "flops", 1584023000, 1584023560)
if err != nil {
t.Errorf("Got error\n")
}
if tsFrom != 1584023000 {
t.Errorf("Start ts differs: %d\n", tsFrom)
}
if len(val) != 9 {
t.Errorf("Want 9. Got %d\n", len(val))
}
if val[0] != 100.5 {
t.Errorf("Want 100.5 Got %f\n", val[0])
}
if val[8] != 970.5 {
t.Errorf("Want 970.5 Got %f\n", val[8])
}
}
func TestGetMetricGap(t *testing.T) {
key := "m1220"
m := newMemoryStore([]string{"flops", "mem_bw", "mem_capacity"}, 10, 60)
for i := 0; i < len(testMetrics); i++ {
m.AddMetrics(key, int64(1584023000+i*120), testMetrics[i])
}
val, tsFrom, err := m.GetMetric(key, "flops", 1584023000, 1584023600)
if err != nil {
t.Errorf("Got error\n")
}
if tsFrom != 1584023000 {
t.Errorf("Start ts differs: %d\n", tsFrom)
}
if len(val) != 10 {
t.Errorf("Want 10. Got %d\n", len(val))
}
if val[0] != 100.5 {
t.Errorf("Want 100.5 Got %f\n", val[0])
}
if !math.IsNaN(float64(val[1])) {
t.Errorf("Want NaN Got %f\n", val[1])
}
if val[0] != 100.5 {
t.Errorf("Want 100.5 Got %f\n", val[0])
}
// fmt.Println(val)
}
func TestGetMetricSplit(t *testing.T) {
key := "m1220"
m := newMemoryStore([]string{"flops", "mem_bw", "mem_capacity"}, 10, 60)
for i := 0; i < len(testMetrics); i++ {
m.AddMetrics(key, int64(1584023000+i*60), testMetrics[i])
}
for i := 0; i < len(testMetrics); i++ {
m.AddMetrics(key, int64(1584023600+i*60), testMetricsAlt[i])
}
// printMemStore(m)
val, tsFrom, err := m.GetMetric(key, "flops", 1584023200, 1584023860)
if err != nil {
t.Errorf("Got error\n")
}
if tsFrom != 1584023200 {
t.Errorf("Start ts differs: %d\n", tsFrom)
}
if len(val) != 11 {
t.Errorf("Want 11. Got %d\n", len(val))
}
if val[0] != 940.5 {
t.Errorf("Want 940.5 Got %f\n", val[0])
}
if val[10] != 950.5 {
t.Errorf("Want 950.5 Got %f\n", val[10])
}
}
func TestGetMetricExceedNext(t *testing.T) {
key := "m1220"
m := newMemoryStore([]string{"flops", "mem_bw", "mem_capacity"}, 10, 60)
for i := 0; i < len(testMetrics); i++ {
m.AddMetrics(key, int64(1584023000+i*60), testMetrics[i])
}
for i := 0; i < len(testMetrics); i++ {
m.AddMetrics(key, int64(1584023600+i*60), testMetricsAlt[i])
}
// printMemStore(m)
val, tsFrom, err := m.GetMetric(key, "flops", 1584022800, 1584023400)
if err != nil {
t.Errorf("Got error\n")
}
if tsFrom != 1584023000 {
t.Errorf("Start ts differs: %d\n", tsFrom)
}
if len(val) != 6 {
t.Errorf("Want 6. Got %d\n", len(val))
}
if val[0] != 100.5 {
t.Errorf("Want 100.5 Got %f\n", val[0])
}
if val[5] != 980.5 {
t.Errorf("Want 980.5 Got %f\n", val[5])
}
}
func TestGetMetricExceedNextSplit(t *testing.T) {
key := "m1220"
m := newMemoryStore([]string{"flops", "mem_bw", "mem_capacity"}, 10, 60)
for i := 0; i < len(testMetrics); i++ {
m.AddMetrics(key, int64(1584023000+i*60), testMetrics[i])
}
for i := 0; i < len(testMetrics); i++ {
m.AddMetrics(key, int64(1584023600+i*60), testMetricsAlt[i])
}
// printMemStore(m)
val, tsFrom, err := m.GetMetric(key, "flops", 1584022800, 1584023900)
if err != nil {
t.Errorf("Got error\n")
}
if tsFrom != 1584023000 {
t.Errorf("Start ts differs: %d\n", tsFrom)
}
if len(val) != 15 {
t.Errorf("Want 14. Got %d\n", len(val))
}
if val[0] != 100.5 {
t.Errorf("Want 100.5 Got %f\n", val[0])
}
if val[14] != 960.5 {
t.Errorf("Want 960.5 Got %f\n", val[13])
}
}
func TestGetMetricExceedCurrent(t *testing.T) {
key := "m1220"
m := newMemoryStore([]string{"flops", "mem_bw", "mem_capacity"}, 10, 60)
for i := 0; i < len(testMetrics); i++ {
m.AddMetrics(key, int64(1584023000+i*60), testMetrics[i])
}
for i := 0; i < len(testMetrics); i++ {
m.AddMetrics(key, int64(1584023600+i*60), testMetricsAlt[i])
}
// printMemStore(m)
val, tsFrom, err := m.GetMetric(key, "flops", 1584023800, 1584027900)
if err != nil {
t.Errorf("Got error\n")
}
if tsFrom != 1584023800 {
t.Errorf("Start ts differs: %d\n", tsFrom)
}
if len(val) != 7 {
t.Errorf("Want 6. Got %d\n", len(val))
}
if val[0] != 950.5 {
t.Errorf("Want 950.5 Got %f\n", val[0])
}
if val[6] != 930.5 {
t.Errorf("Want 930.5 Got %f\n", val[5])
}
}
func TestGetMetricExceedCurrentSplit(t *testing.T) {
key := "m1220"
m := newMemoryStore([]string{"flops", "mem_bw", "mem_capacity"}, 10, 60)
for i := 0; i < len(testMetrics); i++ {
m.AddMetrics(key, int64(1584023000+i*60), testMetrics[i])
}
for i := 0; i < len(testMetrics); i++ {
m.AddMetrics(key, int64(1584023600+i*60), testMetricsAlt[i])
}
// printMemStore(m)
val, tsFrom, err := m.GetMetric(key, "flops", 1584023120, 1584027900)
if err != nil {
t.Errorf("Got error\n")
}
if tsFrom != 1584023120 {
t.Errorf("Start ts differs: %d\n", tsFrom)
}
if len(val) != 18 {
t.Errorf("Want 18. Got %d\n", len(val))
}
if val[0] != 980.5 {
t.Errorf("Want 950.5 Got %f\n", val[0])
}
if val[17] != 930.5 {
t.Errorf("Want 930.5 Got %f\n", val[17])
}
}
func TestGetMetricExceedBoth(t *testing.T) {
key := "m1220"
m := newMemoryStore([]string{"flops", "mem_bw", "mem_capacity"}, 10, 60)
for i := 0; i < len(testMetrics); i++ {
m.AddMetrics(key, int64(1584023000+i*60), testMetrics[i])
}
for i := 0; i < len(testMetrics); i++ {
m.AddMetrics(key, int64(1584023600+i*60), testMetricsAlt[i])
}
// printMemStore(m)
val, tsFrom, err := m.GetMetric(key, "flops", 1584022800, 1584027900)
if err != nil {
t.Errorf("Got error\n")
}
if tsFrom != 1584023000 {
t.Errorf("Start ts differs: %d\n", tsFrom)
}
if len(val) != 20 {
t.Errorf("Want 20. Got %d\n", len(val))
}
if val[0] != 100.5 {
t.Errorf("Want 950.5 Got %f\n", val[0])
}
if val[19] != 930.5 {
t.Errorf("Want 930.5 Got %f\n", val[17])
}
}
func TestGetMetricOutUpper(t *testing.T) {
key := "m1220"
m := newMemoryStore([]string{"flops", "mem_bw", "mem_capacity"}, 10, 60)
for i := 0; i < len(testMetrics); i++ {
m.AddMetrics(key, int64(1584023000+i*60), testMetrics[i])
}
for i := 0; i < len(testMetrics); i++ {
m.AddMetrics(key, int64(1584023600+i*60), testMetricsAlt[i])
}
// printMemStore(m)
_, _, err := m.GetMetric(key, "flops", 1584032800, 1584037900)
if err == nil {
t.Errorf("Got no error\n")
}
}
func TestGetMetricOutLower(t *testing.T) {
key := "m1220"
m := newMemoryStore([]string{"flops", "mem_bw", "mem_capacity"}, 10, 60)
for i := 0; i < len(testMetrics); i++ {
m.AddMetrics(key, int64(1584023000+i*60), testMetrics[i])
}
for i := 0; i < len(testMetrics); i++ {
m.AddMetrics(key, int64(1584023600+i*60), testMetricsAlt[i])
}
// printMemStore(m)
_, _, err := m.GetMetric(key, "flops", 1584002800, 1584007900)
if err == nil {
t.Errorf("Got no error\n")
}
}

542
memstore.go Normal file
View File

@@ -0,0 +1,542 @@
package main
import (
"errors"
"sync"
"unsafe"
)
// Default buffer capacity.
// `buffer.data` will only ever grow up to its capacity and a new link
// in the buffer chain will be created if needed so that no copying
// of data or reallocation needs to happen on writes.
const (
BUFFER_CAP int = 512
)
// So that we can reuse allocations
var bufferPool sync.Pool = sync.Pool{
New: func() interface{} {
return &buffer{
data: make([]Float, 0, BUFFER_CAP),
}
},
}
var (
ErrNoData error = errors.New("no data for this metric/level")
ErrDataDoesNotAlign error = errors.New("data from lower granularities does not align")
)
// Each metric on each level has its own buffer.
// This is where the actual values go.
// If `cap(data)` is reached, a new buffer is created and
// becomes the new head of a buffer list.
type buffer struct {
frequency int64 // Time between two "slots"
start int64 // Timestamp of when `data[0]` was written.
unit string // Unit for the data in this buffer
data []Float // The slice should never reallocate as `cap(data)` is respected.
prev, next *buffer // `prev` contains older data, `next` newer data.
archived bool // If true, this buffer is already archived
closed bool
/*
statistics struct {
samples int
min Float
max Float
avg Float
}
*/
}
func newBuffer(ts, freq int64, unit string) *buffer {
b := bufferPool.Get().(*buffer)
b.frequency = freq
b.start = ts - (freq / 2)
b.prev = nil
b.next = nil
b.unit = unit
b.archived = false
b.closed = false
b.data = b.data[:0]
return b
}
// If a new buffer was created, the new head is returned.
// Otherwise, the existing buffer is returned.
// Normally, only "newer" data should be written, but if the value would
// end up in the same buffer anyway it is allowed.
func (b *buffer) write(ts int64, value Float) (*buffer, error) {
if ts < b.start {
return nil, errors.New("cannot write value to buffer from past")
}
// idx := int((ts - b.start + (b.frequency / 3)) / b.frequency)
idx := int((ts - b.start) / b.frequency)
if idx >= cap(b.data) {
newbuf := newBuffer(ts, b.frequency, b.unit)
newbuf.prev = b
b.next = newbuf
b.close()
b = newbuf
idx = 0
}
// Overwriting value or writing value from past
if idx < len(b.data) {
b.data[idx] = value
return b, nil
}
// Fill up unwritten slots with NaN
for i := len(b.data); i < idx; i++ {
b.data = append(b.data, NaN)
}
b.data = append(b.data, value)
return b, nil
}
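// Illustrative sketch, not part of the store itself: once a write lands at
// an index past `cap(b.data)`, `write` allocates a new head and chains the
// full buffer behind it via `prev`. Timestamps, frequency and unit here are
// made up for the example.
func exampleBufferChain() {
	b := newBuffer(1000, 10, "F/s")
	for i := 0; i <= BUFFER_CAP; i++ {
		nb, err := b.write(1000+int64(i)*10, Float(i))
		if err != nil {
			panic(err)
		}
		if nb != b {
			// The chain grew: nb is the new head, nb.prev the filled buffer.
			b = nb
		}
	}
}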
func (b *buffer) end() int64 {
return b.firstWrite() + int64(len(b.data))*b.frequency
}
func (b *buffer) firstWrite() int64 {
return b.start + (b.frequency / 2)
}
func (b *buffer) close() {}
/*
func (b *buffer) close() {
if b.closed {
return
}
b.closed = true
n, sum, min, max := 0, 0., math.MaxFloat64, -math.MaxFloat64
for _, x := range b.data {
if x.IsNaN() {
continue
}
n += 1
f := float64(x)
sum += f
min = math.Min(min, f)
max = math.Max(max, f)
}
b.statistics.samples = n
if n > 0 {
b.statistics.avg = Float(sum / float64(n))
b.statistics.min = Float(min)
b.statistics.max = Float(max)
} else {
b.statistics.avg = NaN
b.statistics.min = NaN
b.statistics.max = NaN
}
}
*/
// func interpolate(idx int, data []Float) Float {
// if idx == 0 || idx+1 == len(data) {
// return NaN
// }
// return (data[idx-1] + data[idx+1]) / 2.0
// }
// Return all known values from `from` to `to`. Gaps of information are represented as NaN.
// Simple linear interpolation is done between the two neighboring cells if possible.
// If values at the start or end are missing, instead of NaN values, the second and third
// return values contain the actual `from`/`to`.
// This function goes back through the buffer chain if `from` is older than the current buffer's start.
// The loaded values are added to `data` and `data` is returned, possibly with a shorter length.
// If `data` is not long enough to hold all values, this function will panic!
func (b *buffer) read(from, to int64, data []Float) ([]Float, int64, int64, error) {
if from < b.firstWrite() {
if b.prev != nil {
return b.prev.read(from, to, data)
}
from = b.firstWrite()
}
var i int = 0
var t int64 = from
for ; t < to; t += b.frequency {
idx := int((t - b.start) / b.frequency)
if idx >= cap(b.data) {
if b.next == nil {
break
}
b = b.next
idx = 0
}
if idx >= len(b.data) {
if b.next == nil || to <= b.next.start {
break
}
data[i] += NaN
} else if t < b.start {
data[i] += NaN
// } else if b.data[idx].IsNaN() {
// data[i] += interpolate(idx, b.data)
} else {
data[i] += b.data[idx]
}
i++
}
return data[:i], from, t, nil
}
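// Sketch of the read contract described above (not part of the store):
// `data` must be pre-allocated large enough; the returned slice may be
// shorter, and the returned from/to reflect what was actually available.
func exampleBufferRead(b *buffer) {
	data := make([]Float, 128)
	data, from, to, err := b.read(b.firstWrite(), b.firstWrite()+128*b.frequency, data)
	_, _, _, _ = data, from, to, err
}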
// Returns true if this buffer needs to be freed.
func (b *buffer) free(t int64) (delme bool, n int) {
if b.prev != nil {
delme, m := b.prev.free(t)
n += m
if delme {
b.prev.next = nil
if cap(b.prev.data) == BUFFER_CAP {
bufferPool.Put(b.prev)
}
b.prev = nil
}
}
end := b.end()
if end < t {
return true, n + 1
}
return false, n
}
// Call `callback` on every buffer that contains data in the range from `from` to `to`.
func (b *buffer) iterFromTo(from, to int64, callback func(b *buffer) error) error {
if b == nil {
return nil
}
if err := b.prev.iterFromTo(from, to, callback); err != nil {
return err
}
if from <= b.end() && b.start <= to {
return callback(b)
}
return nil
}
func (b *buffer) count() int64 {
res := int64(len(b.data))
if b.prev != nil {
res += b.prev.count()
}
return res
}
// Could also be called "node" as this forms a node in a tree structure.
// Called level because "node" might be confusing here.
// Can be either a leaf or an inner node. In this tree structure, inner nodes can
// also hold data (in `metrics`).
type level struct {
lock sync.RWMutex
metrics []*buffer // Every level can store metrics.
children map[string]*level // Lower levels.
}
// Find the correct level for the given selector, creating it if
// it does not exist. Example selector in the context of the
// ClusterCockpit could be: []string{ "emmy", "host123", "cpu0" }.
// This function would probably benefit a lot from `level.children` being a `sync.Map`?
func (l *level) findLevelOrCreate(selector []string, nMetrics int) *level {
if len(selector) == 0 {
return l
}
// Allow concurrent reads:
l.lock.RLock()
var child *level
var ok bool
if l.children == nil {
// Children map needs to be created...
l.lock.RUnlock()
} else {
child, ok := l.children[selector[0]]
l.lock.RUnlock()
if ok {
return child.findLevelOrCreate(selector[1:], nMetrics)
}
}
// The level does not exist, take write lock for unique access:
l.lock.Lock()
// While this thread waited for the write lock, another thread
// could have created the child node.
if l.children != nil {
child, ok = l.children[selector[0]]
if ok {
l.lock.Unlock()
return child.findLevelOrCreate(selector[1:], nMetrics)
}
}
child = &level{
metrics: make([]*buffer, nMetrics),
children: nil,
}
if l.children != nil {
l.children[selector[0]] = child
} else {
l.children = map[string]*level{selector[0]: child}
}
l.lock.Unlock()
return child.findLevelOrCreate(selector[1:], nMetrics)
}
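// Sketch of how selectors map onto the tree (all names are placeholders):
// the first call creates the levels "emmy" -> "host123" -> "cpu0", the
// second one only walks the already existing nodes under read locks.
func exampleFindLevelOrCreate(root *level) {
	cpu0 := root.findLevelOrCreate([]string{"emmy", "host123", "cpu0"}, 3)
	host := root.findLevelOrCreate([]string{"emmy", "host123"}, 3)
	_, _ = cpu0, host
}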
func (l *level) free(t int64) (int, error) {
l.lock.Lock()
defer l.lock.Unlock()
n := 0
for i, b := range l.metrics {
if b != nil {
delme, m := b.free(t)
n += m
if delme {
if cap(b.data) == BUFFER_CAP {
bufferPool.Put(b)
}
l.metrics[i] = nil
}
}
}
for _, l := range l.children {
m, err := l.free(t)
n += m
if err != nil {
return n, err
}
}
return n, nil
}
func (l *level) sizeInBytes() int64 {
l.lock.RLock()
defer l.lock.RUnlock()
size := int64(0)
for _, b := range l.metrics {
if b != nil {
size += b.count() * int64(unsafe.Sizeof(Float(0)))
}
}
for _, child := range l.children {
size += child.sizeInBytes()
}
return size
}
type MemoryStore struct {
root level // root of the tree structure
metrics map[string]MetricConfig
}
// Return a new, initialized instance of a MemoryStore.
// Will panic if values in the metric configurations are invalid.
func NewMemoryStore(metrics map[string]MetricConfig) *MemoryStore {
offset := 0
for key, config := range metrics {
if config.Frequency == 0 {
panic("invalid frequency")
}
metrics[key] = MetricConfig{
Frequency: config.Frequency,
Aggregation: config.Aggregation,
offset: offset,
}
offset += 1
}
return &MemoryStore{
root: level{
metrics: make([]*buffer, len(metrics)),
children: make(map[string]*level),
},
metrics: metrics,
}
}
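// Sketch of a typical construction; the metric names, frequencies and
// aggregation strategies are placeholders, not a prescribed configuration.
func exampleNewMemoryStore() *MemoryStore {
	return NewMemoryStore(map[string]MetricConfig{
		"flops_any": {Frequency: 60, Aggregation: AvgAggregation},
		"mem_bw":    {Frequency: 60, Aggregation: SumAggregation},
	})
}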
// Write all values in `metrics` to the level specified by `selector` for time `ts`.
// Look at `findLevelOrCreate` for how selectors work.
func (m *MemoryStore) Write(selector []string, ts int64, metrics []Metric) error {
var ok bool
for i, metric := range metrics {
if metric.mc.Frequency == 0 {
metric.mc, ok = m.metrics[metric.Name]
if !ok {
metric.mc.Frequency = 0
}
metrics[i] = metric
}
}
return m.WriteToLevel(&m.root, selector, ts, metrics)
}
func (m *MemoryStore) GetLevel(selector []string) *level {
return m.root.findLevelOrCreate(selector, len(m.metrics))
}
// Assumes that `minfo` in `metrics` is filled in!
func (m *MemoryStore) WriteToLevel(l *level, selector []string, ts int64, metrics []Metric) error {
l = l.findLevelOrCreate(selector, len(m.metrics))
l.lock.Lock()
defer l.lock.Unlock()
for _, metric := range metrics {
if metric.mc.Frequency == 0 {
continue
}
b := l.metrics[metric.mc.offset]
if b == nil {
// First write to this metric and level
b = newBuffer(ts, metric.mc.Frequency, metric.Unit)
l.metrics[metric.mc.offset] = b
}
nb, err := b.write(ts, metric.Value)
if err != nil {
return err
}
// Last write created a new buffer...
if b != nb {
l.metrics[metric.mc.offset] = nb
}
}
return nil
}
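// Sketch: writing one sample for two metrics at host level. Selector,
// timestamp and values are made up; `Write` looks up each metric's
// configuration by name before delegating to `WriteToLevel`.
func exampleWrite(m *MemoryStore) error {
	return m.Write([]string{"emmy", "host123"}, 1584023000, []Metric{
		{Name: "flops_any", Value: Float(42.0)},
		{Name: "mem_bw", Value: Float(23.0)},
	})
}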
// Returns all values for metric `metric` from `from` to `to` for the selected level(s).
// If the level does not hold the metric itself, the data will be aggregated recursively from the children.
// The second and third return value are the actual from/to for the data. Those can be different from
// the range asked for if no data was available.
func (m *MemoryStore) Read(selector Selector, metric string, from, to int64) ([]Float, int64, int64, string, error) {
var unit string = ""
if from > to {
return nil, 0, 0, "", errors.New("invalid time range")
}
minfo, ok := m.metrics[metric]
if !ok {
return nil, 0, 0, "", errors.New("unkown metric: " + metric)
}
n, data := 0, make([]Float, (to-from)/minfo.Frequency+1)
err := m.root.findBuffers(selector, minfo.offset, func(b *buffer) error {
cdata, cfrom, cto, err := b.read(from, to, data)
if err != nil {
return err
}
unit = b.unit
if n == 0 {
from, to = cfrom, cto
} else if from != cfrom || to != cto || len(data) != len(cdata) {
missingfront, missingback := int((from-cfrom)/minfo.Frequency), int((to-cto)/minfo.Frequency)
if missingfront != 0 {
return ErrDataDoesNotAlign
}
newlen := len(cdata) - missingback
if newlen < 1 {
return ErrDataDoesNotAlign
}
cdata = cdata[0:newlen]
if len(cdata) != len(data) {
return ErrDataDoesNotAlign
}
from, to = cfrom, cto
}
data = cdata
n += 1
return nil
})
if err != nil {
return nil, 0, 0, "", err
} else if n == 0 {
return nil, 0, 0, "", errors.New("metric or host not found")
} else if n > 1 {
if minfo.Aggregation == AvgAggregation {
normalize := 1. / Float(n)
for i := 0; i < len(data); i++ {
data[i] *= normalize
}
} else if minfo.Aggregation != SumAggregation {
return nil, 0, 0, "", errors.New("invalid aggregation")
}
}
return data, from, to, unit, nil
}
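// Sketch: reading with aggregation over children. If "flops_any" was written
// per cpu below "host123", this yields one aggregated series plus the time
// range actually covered; all names and timestamps are placeholders.
func exampleRead(m *MemoryStore) {
	data, from, to, unit, err := m.Read(
		Selector{{String: "emmy"}, {String: "host123"}},
		"flops_any", 1584023000, 1584023600)
	_, _, _, _, _ = data, from, to, unit, err
}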
// Release all buffers for the selected level and all its children that contain only
// values older than `t`.
func (m *MemoryStore) Free(selector []string, t int64) (int, error) {
return m.GetLevel(selector).free(t)
}
func (m *MemoryStore) FreeAll() error {
for k := range m.root.children {
delete(m.root.children, k)
}
return nil
}
func (m *MemoryStore) SizeInBytes() int64 {
return m.root.sizeInBytes()
}
// Given a selector, return a list of all children of the level selected.
func (m *MemoryStore) ListChildren(selector []string) []string {
lvl := &m.root
for lvl != nil && len(selector) != 0 {
lvl.lock.RLock()
next := lvl.children[selector[0]]
lvl.lock.RUnlock()
lvl = next
selector = selector[1:]
}
if lvl == nil {
return nil
}
lvl.lock.RLock()
defer lvl.lock.RUnlock()
children := make([]string, 0, len(lvl.children))
for child := range lvl.children {
children = append(children, child)
}
return children
}

512
memstore_test.go Normal file
View File

@@ -0,0 +1,512 @@
package main
import (
"fmt"
"math"
"math/rand"
"sync"
"testing"
)
func TestMemoryStoreBasics(t *testing.T) {
frequency := int64(10)
start, count := int64(100), int64(5000)
store := NewMemoryStore(map[string]MetricConfig{
"a": {Frequency: frequency},
"b": {Frequency: frequency * 2},
})
for i := int64(0); i < count; i++ {
err := store.Write([]string{"testhost"}, start+i*frequency, []Metric{
{Name: "a", Value: Float(i)},
{Name: "b", Value: Float(i / 2)},
})
if err != nil {
t.Error(err)
return
}
}
sel := Selector{{String: "testhost"}}
adata, from, to, unit, err := store.Read(sel, "a", start, start+count*frequency)
if err != nil || from != start || to != start+count*frequency || unit != "" {
t.Error(err)
return
}
bdata, _, _, unit, err := store.Read(sel, "b", start, start+count*frequency)
if err != nil {
t.Error(err)
return
}
if len(adata) != int(count) || len(bdata) != int(count/2) {
t.Error("unexpected count of returned values")
return
}
for i := 0; i < int(count); i++ {
if adata[i] != Float(i) {
t.Errorf("incorrect value for metric a (%f vs. %f)", adata[i], Float(i))
return
}
}
for i := 0; i < int(count/2); i++ {
if bdata[i] != Float(i) && bdata[i] != Float(i-1) {
t.Errorf("incorrect value for metric b (%f) at index %d", bdata[i], i)
return
}
}
}
func TestMemoryStoreTooMuchWrites(t *testing.T) {
frequency := int64(10)
count := BUFFER_CAP*3 + 10
store := NewMemoryStore(map[string]MetricConfig{
"a": {Frequency: frequency},
"b": {Frequency: frequency * 2},
"c": {Frequency: frequency / 2},
"d": {Frequency: frequency * 3},
})
start := int64(100)
for i := 0; i < count; i++ {
if err := store.Write([]string{"test"}, start+int64(i)*frequency, []Metric{
{Name: "a", Value: Float(i)},
{Name: "b", Value: Float(i / 2)},
{Name: "c", Value: Float(i * 2)},
{Name: "d", Value: Float(i / 3)},
}); err != nil {
t.Fatal(err)
}
}
end := start + int64(count)*frequency
data, from, to, unit, err := store.Read(Selector{{String: "test"}}, "a", start, end)
if len(data) != count || from != start || to != end || err != nil || unit != "" {
t.Fatalf("a: err=%#v, from=%d, to=%d, data=%#v\n", err, from, to, data)
}
data, from, to, unit, err = store.Read(Selector{{String: "test"}}, "b", start, end)
if len(data) != count/2 || from != start || to != end || err != nil || unit != "" {
t.Fatalf("b: err=%#v, from=%d, to=%d, data=%#v\n", err, from, to, data)
}
data, from, to, unit, err = store.Read(Selector{{String: "test"}}, "c", start, end)
if len(data) != count*2-1 || from != start || to != end-frequency/2 || err != nil || unit != "" {
t.Fatalf("c: err=%#v, from=%d, to=%d, data=%#v\n", err, from, to, data)
}
data, from, to, unit, err = store.Read(Selector{{String: "test"}}, "d", start, end)
if len(data) != count/3+1 || from != start || to != end+frequency*2 || err != nil || unit != "" {
t.Errorf("expected: err=nil, from=%d, to=%d, len(data)=%d\n", start, end+frequency*2, count/3)
t.Fatalf("d: err=%#v, from=%d, to=%d, data=%#v\n", err, from, to, data)
}
}
func TestMemoryStoreOutOfBounds(t *testing.T) {
count := 2000
toffset := 1000
store := NewMemoryStore(map[string]MetricConfig{
"a": {Frequency: 60},
})
for i := 0; i < count; i++ {
if err := store.Write([]string{"cluster", "host", "cpu"}, int64(toffset+i*60), []Metric{
{Name: "a", Value: Float(i)},
}); err != nil {
t.Fatal(err)
}
}
sel := Selector{{String: "cluster"}, {String: "host"}, {String: "cpu"}}
data, from, to, unit, err := store.Read(sel, "a", 500, int64(toffset+count*60+500))
if err != nil {
t.Fatal(err)
}
if from/60 != int64(toffset)/60 || to/60 != int64(toffset+count*60)/60 {
t.Fatalf("Got %d-%d, expected %d-%d", from, to, toffset, toffset+count*60)
}
if len(data) != count || data[0] != 0 || data[len(data)-1] != Float((count-1)) {
t.Fatalf("Wrong data (got: %d, %f, %f, expected: %d, %f, %f)",
len(data), data[0], data[len(data)-1], count, 0., Float(count-1))
}
testfrom, testlen := int64(100000000), int64(10000)
data, from, to, unit, err = store.Read(sel, "a", testfrom, testfrom+testlen)
if len(data) != 0 || from != testfrom || to != testfrom || err != nil || unit != "" {
t.Fatal("Unexpected data returned when reading range after valid data")
}
testfrom, testlen = 0, 10
data, from, to, unit, err = store.Read(sel, "a", testfrom, testfrom+testlen)
if len(data) != 0 || from/60 != int64(toffset)/60 || to/60 != int64(toffset)/60 || err != nil || unit != "" {
t.Fatal("Unexpected data returned when reading range before valid data")
}
}
func TestMemoryStoreMissingDatapoints(t *testing.T) {
count := 3000
store := NewMemoryStore(map[string]MetricConfig{
"a": {Frequency: 1},
})
for i := 0; i < count; i++ {
if i%3 != 0 {
continue
}
err := store.Write([]string{"testhost"}, int64(i), []Metric{
{Name: "a", Value: Float(i)},
})
if err != nil {
t.Error(err)
return
}
}
sel := Selector{{String: "testhost"}}
adata, _, _, _, err := store.Read(sel, "a", 0, int64(count))
if err != nil {
t.Error(err)
return
}
if len(adata) != count-2 {
t.Error("unexpected len")
return
}
for i := 0; i < count-2; i++ {
if i%3 == 0 {
if adata[i] != Float(i) {
t.Error("unexpected value")
return
}
} else {
if !math.IsNaN(float64(adata[i])) {
t.Errorf("NaN expected (i = %d, value = %f)\n", i, adata[i])
return
}
}
}
}
func TestMemoryStoreAggregation(t *testing.T) {
count := 3000
store := NewMemoryStore(map[string]MetricConfig{
"a": {Frequency: 1, Aggregation: SumAggregation},
})
for i := 0; i < count; i++ {
err := store.Write([]string{"host0", "cpu0"}, int64(i), []Metric{
{Name: "a", Value: Float(i) / 2.},
})
if err != nil {
t.Error(err)
return
}
err = store.Write([]string{"host0", "cpu1"}, int64(i), []Metric{
{Name: "a", Value: Float(i) * 2.},
})
if err != nil {
t.Error(err)
return
}
}
adata, from, to, _, err := store.Read(Selector{{String: "host0"}}, "a", int64(0), int64(count))
if err != nil {
t.Error(err)
return
}
if len(adata) != count || from != 0 || to != int64(count) {
t.Error("unexpected length or time range of returned data")
return
}
for i := 0; i < count; i++ {
expected := Float(i)/2. + Float(i)*2.
if adata[i] != expected {
t.Errorf("expected: %f, got: %f", expected, adata[i])
return
}
}
}
func TestMemoryStoreStats(t *testing.T) {
count := 3000
store := NewMemoryStore(map[string]MetricConfig{
"a": {Frequency: 1},
"b": {Frequency: 1, Aggregation: AvgAggregation},
})
sel1 := []string{"cluster", "host1"}
sel2 := []string{"cluster", "host2", "left"}
sel3 := []string{"cluster", "host2", "right"}
samples := 0
asum, amin, amax := 0., math.MaxFloat32, -math.MaxFloat32
bsum, bmin, bmax := 0., math.MaxFloat32, -math.MaxFloat32
for i := 0; i < count; i++ {
if i%5 == 0 {
// Skip some writes, test if samples is calculated correctly
continue
}
samples += 1
a := float64(rand.Int()%100 - 50)
asum += a
amin = math.Min(amin, a)
amax = math.Max(amax, a)
b := float64(rand.Int()%100 - 50)
bsum += b * 2
bmin = math.Min(bmin, b)
bmax = math.Max(bmax, b)
store.Write(sel1, int64(i), []Metric{
{Name: "a", Value: Float(a)},
})
store.Write(sel2, int64(i), []Metric{
{Name: "b", Value: Float(b)},
})
store.Write(sel3, int64(i), []Metric{
{Name: "b", Value: Float(b)},
})
}
stats, from, to, err := store.Stats(Selector{{String: "cluster"}, {String: "host1"}}, "a", 0, int64(count))
if err != nil {
t.Fatal(err)
}
if from != 1 || to != int64(count) || stats.Samples != samples {
t.Fatalf("unexpected: from=%d, to=%d, stats.Samples=%d (expected samples=%d)\n", from, to, stats.Samples, samples)
}
if stats.Avg != Float(asum/float64(samples)) || stats.Min != Float(amin) || stats.Max != Float(amax) {
t.Fatalf("wrong stats: %#v\n", stats)
}
stats, from, to, err = store.Stats(Selector{{String: "cluster"}, {String: "host2"}}, "b", 0, int64(count))
if err != nil {
t.Fatal(err)
}
if from != 1 || to != int64(count) || stats.Samples != samples*2 {
t.Fatalf("unexpected: from=%d, to=%d, stats.Samples=%d (expected samples=%d)\n", from, to, stats.Samples, samples*2)
}
if stats.Avg != Float(bsum/float64(samples*2)) || stats.Min != Float(bmin) || stats.Max != Float(bmax) {
t.Fatalf("wrong stats: %#v (expected: avg=%f, min=%f, max=%f)\n", stats, bsum/float64(samples*2), bmin, bmax)
}
}
func TestMemoryStoreArchive(t *testing.T) {
store1 := NewMemoryStore(map[string]MetricConfig{
"a": {Frequency: 1},
"b": {Frequency: 1},
})
count := 2000
for i := 0; i < count; i++ {
err := store1.Write([]string{"cluster", "host", "cpu0"}, 100+int64(i), []Metric{
{Name: "a", Value: Float(i)},
{Name: "b", Value: Float(i * 2)},
})
if err != nil {
t.Error(err)
return
}
}
// store1.DebugDump(bufio.NewWriter(os.Stdout))
archiveRoot := t.TempDir()
_, err := store1.ToCheckpoint(archiveRoot, 100, 100+int64(count/2))
if err != nil {
t.Error(err)
return
}
_, err = store1.ToCheckpoint(archiveRoot, 100+int64(count/2), 100+int64(count))
if err != nil {
t.Error(err)
return
}
store2 := NewMemoryStore(map[string]MetricConfig{
"a": {Frequency: 1},
"b": {Frequency: 1},
})
n, err := store2.FromCheckpoint(archiveRoot, 100)
if err != nil {
t.Error(err)
return
}
sel := Selector{{String: "cluster"}, {String: "host"}, {String: "cpu0"}}
adata, from, to, _, err := store2.Read(sel, "a", 100, int64(100+count))
if err != nil {
t.Error(err)
return
}
if n != 2 || len(adata) != count || from != 100 || to != int64(100+count) {
t.Errorf("unexpected: n=%d, len=%d, from=%d, to=%d\n", n, len(adata), from, to)
return
}
for i := 0; i < count; i++ {
expected := Float(i)
if adata[i] != expected {
t.Errorf("expected: %f, got: %f", expected, adata[i])
}
}
}
func TestMemoryStoreFree(t *testing.T) {
store := NewMemoryStore(map[string]MetricConfig{
"a": {Frequency: 1},
"b": {Frequency: 2},
})
count := 3000
sel := []string{"cluster", "host", "1"}
for i := 0; i < count; i++ {
err := store.Write(sel, int64(i), []Metric{
{Name: "a", Value: Float(i)},
{Name: "b", Value: Float(i)},
})
if err != nil {
t.Fatal(err)
}
}
n, err := store.Free([]string{"cluster", "host"}, int64(BUFFER_CAP*2)+100)
if err != nil {
t.Fatal(err)
}
if n != 3 {
t.Fatal("two buffers expected to be released")
}
adata, from, to, _, err := store.Read(Selector{{String: "cluster"}, {String: "host"}, {String: "1"}}, "a", 0, int64(count))
if err != nil {
t.Fatal(err)
}
if from != int64(BUFFER_CAP*2) || to != int64(count) || len(adata) != count-2*BUFFER_CAP {
t.Fatalf("unexpected values from call to `Read`: from=%d, to=%d, len=%d", from, to, len(adata))
}
// bdata, from, to, err := store.Read(Selector{{String: "cluster"}, {String: "host"}, {String: "1"}}, "b", 0, int64(count))
// if err != nil {
// t.Fatal(err)
// }
// if from != int64(BUFFER_CAP*2) || to != int64(count) || len(bdata) != (count-2*BUFFER_CAP)/2 {
// t.Fatalf("unexpected values from call to `Read`: from=%d (expected: %d), to=%d (expected: %d), len=%d (expected: %d)",
// from, BUFFER_CAP*2, to, count, len(bdata), (count-2*BUFFER_CAP)/2)
// }
if adata[0] != Float(BUFFER_CAP*2) || adata[len(adata)-1] != Float(count-1) {
t.Fatal("wrong values")
}
}
func BenchmarkMemoryStoreConcurrentWrites(b *testing.B) {
frequency := int64(5)
count := b.N
goroutines := 4
store := NewMemoryStore(map[string]MetricConfig{
"a": {Frequency: frequency},
})
var wg sync.WaitGroup
wg.Add(goroutines)
for g := 0; g < goroutines; g++ {
go func(g int) {
host := fmt.Sprintf("host%d", g)
for i := 0; i < count; i++ {
store.Write([]string{"cluster", host, "cpu0"}, int64(i)*frequency, []Metric{
{Name: "a", Value: Float(i)},
})
}
wg.Done()
}(g)
}
wg.Wait()
b.StopTimer()
for g := 0; g < goroutines; g++ {
host := fmt.Sprintf("host%d", g)
sel := Selector{{String: "cluster"}, {String: host}, {String: "cpu0"}}
adata, _, _, _, err := store.Read(sel, "a", 0, int64(count)*frequency)
if err != nil {
b.Error(err)
return
}
if len(adata) != count {
b.Error("unexpected count")
return
}
for i := 0; i < count; i++ {
expected := Float(i)
if adata[i] != expected {
b.Error("incorrect value for metric a")
return
}
}
}
}
func BenchmarkMemoryStoreAggregation(b *testing.B) {
b.StopTimer()
count := 2000
store := NewMemoryStore(map[string]MetricConfig{
"flops_any": {Frequency: 1, Aggregation: AvgAggregation},
})
sel := []string{"testcluster", "host123", "cpu0"}
for i := 0; i < count; i++ {
sel[2] = "cpu0"
err := store.Write(sel, int64(i), []Metric{
{Name: "flops_any", Value: Float(i)},
})
if err != nil {
b.Fatal(err)
}
sel[2] = "cpu1"
err = store.Write(sel, int64(i), []Metric{
{Name: "flops_any", Value: Float(i)},
})
if err != nil {
b.Fatal(err)
}
}
b.StartTimer()
for n := 0; n < b.N; n++ {
data, from, to, _, err := store.Read(Selector{{String: "testcluster"}, {String: "host123"}}, "flops_any", 0, int64(count))
if err != nil {
b.Fatal(err)
}
if len(data) != count || from != 0 || to != int64(count) {
b.Fatal()
}
}
}

148
openapi.yaml Normal file
View File

@@ -0,0 +1,148 @@
# OpenAPI spec describing a subset of the HTTP REST API for the cc-metric-store.
openapi: 3.0.3
info:
title: 'cc-metric-store REST API'
description: 'In-memory time series database for hpc metrics to be used with the [ClusterCockpit](https://github.com/ClusterCockpit) toolsuite'
version: 0.1.0
paths:
'/api/write':
post:
operationId: 'writeMetrics'
description: 'Receives metrics in the influx line-protocol using [this format](https://github.com/ClusterCockpit/cc-specifications/blob/master/metrics/lineprotocol_alternative.md)'
parameters:
- name: cluster
in: query
schema: { type: string }
description: "If the lines in the body do not have a cluster tag, use this value instead."
requestBody:
required: true
content:
'text/plain':
example:
'flops_any,cluster=emmy,hostname=e1001,type=cpu,type-id=0 value=42.0'
responses:
200:
description: 'Everything went fine'
400:
description: 'Bad Request'
'/api/query':
post:
operationId: 'queryMetrics'
description: 'Query metrics'
requestBody:
required: true
content:
'application/json':
schema:
type: object
required: [cluster, from, to]
properties:
cluster:
type: string
from:
type: integer
to:
type: integer
with-stats:
type: boolean
default: true
with-data:
type: boolean
default: true
queries:
type: array
items:
$ref: '#/components/schemas/ApiQuery'
for-all-nodes:
description: 'If not null, add a new query for every known host on that cluster and every metric (at node-scope) specified in this array to the request. This can be used to get a metric for every host in a cluster without knowing the name of every host.'
type: array
items:
type: string
responses:
200:
description: 'Requested data and stats as JSON'
content:
'application/json':
schema:
type: object
properties:
queries:
description: 'Only present if for-all-nodes was used.'
results:
type: array
description: 'Array where each element is a response to the query at that same index in the request'
items:
description: 'If `aggreg` is true, only ever has one element.'
type: array
items:
type: object
properties:
error:
description: 'If not null or undefined, an error happened processing that query'
type: string
nullable: true
data:
type: array
items:
type: number
nullable: true
avg: { type: number }
min: { type: number }
max: { type: number }
400:
description: 'Bad Request'
'/api/free':
post:
operationId: 'freeBuffers'
description: 'Free all buffers containing only data older than `to`'
parameters:
- name: to
in: query
description: 'Unix Timestamp'
required: true
schema:
type: integer
requestBody:
required: true
content:
'application/json':
schema:
type: array
items:
type: array
items:
type: string
responses:
200:
description: 'Everything went fine'
400:
description: 'Bad Request'
components:
schemas:
ApiQuery:
description: 'A single query for a specific metric resulting in one series'
type: object
required: [metric, hostname, aggreg]
properties:
metric:
type: string
hostname:
type: string
type:
description: 'Not required for node-level requests. Usually something like socket, cpu or hwthread.'
type: string
type-ids:
type: array
items:
type: string
aggreg:
type: boolean
description: 'If true, every query result will have exactly one element. Otherwise, the data for every requested type-id/sub-type-id is provided separately'
securitySchemes:
bearerAuth:
type: http
scheme: bearer
bearerFormat: JWT
security:
- bearerAuth: [] # Applies `bearerAuth` globally
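For orientation, a minimal Go client sketch for the query endpoint above. The URL, token, cluster, hostname, and metric name are placeholders; only the request shape follows the schema.

package main

import (
	"fmt"
	"net/http"
	"strings"
)

func main() {
	// Placeholders: URL and token must match a real deployment.
	const token = "<JWT>"
	body := strings.NewReader(`{"cluster":"emmy","from":1584023000,"to":1584023600,` +
		`"queries":[{"metric":"flops_any","hostname":"e1001","aggreg":true}]}`)
	req, err := http.NewRequest(http.MethodPost, "http://localhost:8081/api/query", body)
	if err != nil {
		panic(err)
	}
	req.Header.Set("Authorization", "Bearer "+token)
	res, err := http.DefaultClient.Do(req)
	if err != nil {
		panic(err)
	}
	defer res.Body.Close()
	fmt.Println(res.Status)
}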

View File

@@ -1,122 +0,0 @@
package resampler
import (
"errors"
"fmt"
"math"
"github.com/ClusterCockpit/cc-metric-store/internal/util"
)
func SimpleResampler(data []util.Float, old_frequency int64, new_frequency int64) ([]util.Float, int64, error) {
if old_frequency == 0 || new_frequency == 0 || new_frequency <= old_frequency {
return data, old_frequency, nil
}
if new_frequency%old_frequency != 0 {
return nil, 0, errors.New("new sampling frequency should be multiple of the old frequency")
}
var step int = int(new_frequency / old_frequency)
var new_data_length = len(data) / step
if new_data_length == 0 || len(data) < 100 || new_data_length >= len(data) {
return data, old_frequency, nil
}
new_data := make([]util.Float, new_data_length)
for i := 0; i < new_data_length; i++ {
new_data[i] = data[i*step]
}
return new_data, new_frequency, nil
}
// Inspired by one of the algorithms from https://skemman.is/bitstream/1946/15343/3/SS_MSthesis.pdf
// Adapted from https://github.com/haoel/downsampling/blob/master/core/lttb.go
func LargestTriangleThreeBucket(data []util.Float, old_frequency int64, new_frequency int64) ([]util.Float, int64, error) {
if old_frequency == 0 || new_frequency == 0 || new_frequency <= old_frequency {
return data, old_frequency, nil
}
if new_frequency%old_frequency != 0 {
return nil, 0, fmt.Errorf("new sampling frequency : %d should be multiple of the old frequency : %d", new_frequency, old_frequency)
}
var step int = int(new_frequency / old_frequency)
var new_data_length = len(data) / step
if new_data_length == 0 || len(data) < 100 || new_data_length >= len(data) {
return data, old_frequency, nil
}
new_data := make([]util.Float, 0, new_data_length)
// Bucket size. Leave room for start and end data points
bucketSize := float64(len(data)-2) / float64(new_data_length-2)
new_data = append(new_data, data[0]) // Always add the first point
// We keep three pointers:
// > bucketLow - the current bucket's beginning location
// > bucketMiddle - the current bucket's ending location,
// also the beginning location of the next bucket
// > bucketHigh - the next bucket's ending location.
bucketLow := 1
bucketMiddle := int(math.Floor(bucketSize)) + 1
var prevMaxAreaPoint int
for i := 0; i < new_data_length-2; i++ {
bucketHigh := int(math.Floor(float64(i+2)*bucketSize)) + 1
if bucketHigh >= len(data)-1 {
bucketHigh = len(data) - 2
}
// Calculate point average for next bucket (containing c)
avgPointX, avgPointY := calculateAverageDataPoint(data[bucketMiddle:bucketHigh+1], int64(bucketMiddle))
// Get the range for current bucket
currBucketStart := bucketLow
currBucketEnd := bucketMiddle
// Point a
pointX := prevMaxAreaPoint
pointY := data[prevMaxAreaPoint]
maxArea := -1.0
var maxAreaPoint int
flag_ := 0
for ; currBucketStart < currBucketEnd; currBucketStart++ {
area := calculateTriangleArea(util.Float(pointX), pointY, avgPointX, avgPointY, util.Float(currBucketStart), data[currBucketStart])
if area > maxArea {
maxArea = area
maxAreaPoint = currBucketStart
}
if math.IsNaN(float64(avgPointY)) {
flag_ = 1
}
}
if flag_ == 1 {
new_data = append(new_data, util.NaN) // Pick this point from the bucket
} else {
new_data = append(new_data, data[maxAreaPoint]) // Pick this point from the bucket
}
prevMaxAreaPoint = maxAreaPoint // This MaxArea point is the next's prevMaxAreaPoint
// Move to the next window
bucketLow = bucketMiddle
bucketMiddle = bucketHigh
}
new_data = append(new_data, data[len(data)-1]) // Always add last
return new_data, new_frequency, nil
}

View File

@@ -1,35 +0,0 @@
package resampler
import (
"math"
"github.com/ClusterCockpit/cc-metric-store/internal/util"
)
func calculateTriangleArea(paX, paY, pbX, pbY, pcX, pcY util.Float) float64 {
area := ((paX-pcX)*(pbY-paY) - (paX-pbX)*(pcY-paY)) * 0.5
return math.Abs(float64(area))
}
func calculateAverageDataPoint(points []util.Float, xStart int64) (avgX util.Float, avgY util.Float) {
flag := 0
for _, point := range points {
avgX += util.Float(xStart)
avgY += point
xStart++
if math.IsNaN(float64(point)) {
flag = 1
}
}
l := util.Float(len(points))
avgX /= l
avgY /= l
if flag == 1 {
return avgX, util.NaN
} else {
return avgX, avgY
}
}

View File

@@ -0,0 +1,17 @@
CC_USER=clustercockpit
CC_GROUP=clustercockpit
CC_HOME=/tmp
LOG_DIR=/var/log
DATA_DIR=/var/run/cc-metric-store
MAX_OPEN_FILES=10000
CONF_DIR=/etc/cc-metric-store
CONF_FILE=/etc/cc-metric-store/cc-metric-store.json
RESTART_ON_UPGRADE=true

View File

@@ -0,0 +1,12 @@
Package: cc-metric-store
Version: {VERSION}
Installed-Size: {INSTALLED_SIZE}
Architecture: {ARCH}
Maintainer: thomas.gruber@fau.de
Depends: libc6 (>= 2.2.1)
Build-Depends: debhelper-compat (= 13), git, golang-go
Description: In-memory metric store daemon from the ClusterCockpit suite
Homepage: https://github.com/ClusterCockpit/cc-metric-store
Source: cc-metric-store
Rules-Requires-Root: no

View File

@@ -0,0 +1,141 @@
#! /usr/bin/env bash
# chkconfig: 2345 80 05
# description: ClusterCockpit metric store
# processname: cc-metric-store
# config: /etc/default/cc-metric-store
# pidfile: /var/run/cc-metric-store.pid
### BEGIN INIT INFO
# Provides: cc-metric-store
# Required-Start: $all
# Required-Stop: $remote_fs $syslog
# Default-Start: 2 3 4 5
# Default-Stop: 0 1 6
# Short-Description: Start ClusterCockpit metric store at boot time
### END INIT INFO
PATH=/bin:/usr/bin:/sbin:/usr/sbin
NAME=cc-metric-store
DESC="ClusterCockpit metric store"
DEFAULT=/etc/default/${NAME}
CC_USER=clustercockpit
CC_GROUP=clustercockpit
CONF_DIR=/etc/cc-metric-store
PID_FILE=/var/run/$NAME.pid
DAEMON=/usr/sbin/$NAME
CONF_FILE=${CONF_DIR}/cc-metric-store.json
umask 0027
if [ ! -x $DAEMON ]; then
echo "Program not installed or not executable"
exit 5
fi
. /lib/lsb/init-functions
if [ -r /etc/default/rcS ]; then
. /etc/default/rcS
fi
# overwrite settings from default file
if [ -f "$DEFAULT" ]; then
. "$DEFAULT"
fi
CC_OPTS="--config=${CONF_FILE}"
function checkUser() {
if [ `id -u` -ne 0 ]; then
echo "You need root privileges to run this script"
exit 4
fi
}
case "$1" in
start)
checkUser
log_daemon_msg "Starting $DESC"
pid=`pidofproc -p $PID_FILE $NAME`
if [ -n "$pid" ] ; then
log_begin_msg "Already running."
log_end_msg 0
exit 0
fi
# Prepare environment
touch "$PID_FILE" && chown "$CC_USER":"$CC_GROUP" "$PID_FILE"
if [ -n "$MAX_OPEN_FILES" ]; then
ulimit -n $MAX_OPEN_FILES
fi
# Start Daemon
start-stop-daemon --start -b --chdir "$WORK_DIR" --user "$CC_USER" -c "$CC_USER" --pidfile "$PID_FILE" --exec $DAEMON -- $CC_OPTS
return=$?
if [ $return -eq 0 ]
then
sleep 1
# check if pid file has been written to
if ! [[ -s $PID_FILE ]]; then
log_end_msg 1
exit 1
fi
i=0
timeout=10
# Wait for the process to be properly started before exiting
until { cat "$PID_FILE" | xargs kill -0; } >/dev/null 2>&1
do
sleep 1
i=$(($i + 1))
if [ $i -gt $timeout ]; then
log_end_msg 1
exit 1
fi
done
fi
log_end_msg $return
;;
stop)
checkUser
log_daemon_msg "Stopping $DESC"
if [ -f "$PID_FILE" ]; then
start-stop-daemon --stop --pidfile "$PID_FILE" \
--user "$CC_USER" \
--retry=TERM/20/KILL/5 >/dev/null
ret=$?
if [ $ret -eq 1 ]; then
log_progress_msg "$DESC is not running but pid file exists, cleaning up"
elif [ $ret -eq 3 ]; then
PID="`cat $PID_FILE`"
log_failure_msg "Failed to stop $DESC (pid $PID)"
exit 1
fi
rm -f "$PID_FILE"
else
log_progress_msg "(not running)"
fi
log_end_msg 0
;;
status)
status_of_proc -p $PID_FILE $NAME $NAME && exit 0 || exit $?
;;
restart|force-reload)
if [ -f "$PID_FILE" ]; then
$0 stop
sleep 1
fi
$0 start
;;
*)
log_success_msg "Usage: $0 {start|stop|restart|force-reload|status}"
exit 3
;;
esac

View File

@@ -0,0 +1,62 @@
Name: cc-metric-store
Version: %{VERS}
Release: 1%{?dist}
Summary: In-memory metric database from the ClusterCockpit suite
License: MIT
Source0: %{name}-%{version}.tar.gz
BuildRequires: go-toolset
BuildRequires: systemd-rpm-macros
Provides: %{name} = %{version}
%description
In-memory metric database from the ClusterCockpit suite
%global debug_package %{nil}
%prep
%autosetup
%build
make
%install
# Install cc-metric-store
make PREFIX=%{buildroot} install
# Integrate into system
install -Dpm 0644 scripts/%{name}.service %{buildroot}%{_unitdir}/%{name}.service
install -Dpm 0600 scripts/%{name}.config %{buildroot}%{_sysconfdir}/default/%{name}
install -Dpm 0644 scripts/%{name}.sysusers %{buildroot}%{_sysusersdir}/%{name}.conf
%check
# go test should be here... :)
%pre
%sysusers_create_package scripts/%{name}.sysusers
%post
%systemd_post %{name}.service
%preun
%systemd_preun %{name}.service
%files
# Binary
%attr(-,clustercockpit,clustercockpit) %{_bindir}/%{name}
# Config
%dir %{_sysconfdir}/%{name}
%attr(0600,clustercockpit,clustercockpit) %config(noreplace) %{_sysconfdir}/%{name}/%{name}.json
# Systemd
%{_unitdir}/%{name}.service
%{_sysconfdir}/default/%{name}
%{_sysusersdir}/%{name}.conf
%changelog
* Mon Mar 07 2022 Thomas Gruber - 0.1
- Initial metric store implementation

View File

@@ -0,0 +1,2 @@
#Type Name ID GECOS Home directory Shell
u clustercockpit - "User for ClusterCockpit" /run/cc-metric-collector /sbin/nologin

105
scripts/send-fake-data.go Normal file
View File

@@ -0,0 +1,105 @@
package main
import (
"bytes"
"fmt"
"io"
"log"
"math"
"math/rand"
"net/http"
"time"
)
const token = "eyJ0eXAiOiJKV1QiLCJhbGciOiJFZERTQSJ9.eyJ1c2VyIjoiYWRtaW4iLCJyb2xlcyI6WyJST0xFX0FETUlOIiwiUk9MRV9BTkFMWVNUIiwiUk9MRV9VU0VSIl19.d-3_3FZTsadPjDEdsWrrQ7nS0edMAR4zjl-eK7rJU3HziNBfI9PDHDIpJVHTNN5E5SlLGLFXctWyKAkwhXL-Dw"
const ccmsurl = "http://localhost:8081/api/write"
const cluster = "fakedev"
const sockets = 2
const cpus = 8
const freq = 15 * time.Second
var hosts = []string{"fake001", "fake002", "fake003", "fake004", "fake005"}
var metrics = []struct {
Name string
Type string
AvgValue float64
}{
{"flops_any", "cpu", 10.0},
{"mem_bw", "socket", 50.0},
{"ipc", "cpu", 1.25},
{"cpu_load", "node", 4},
{"mem_used", "node", 20},
}
var states = make([]float64, 0)
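// For reference, one line as built by send() below looks roughly like this
// (influx line protocol: measurement and tags, value field, unix timestamp;
// the timestamp is just an example):
//   flops_any,cluster=fakedev,host=fake001,type=cpu,type-id=0 value=10.500000 1660000000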
func send(client *http.Client, t int64) {
msg := &bytes.Buffer{}
i := 0
for _, host := range hosts {
for _, metric := range metrics {
n := 1
if metric.Type == "socket" {
n = sockets
} else if metric.Type == "cpu" {
n = cpus
}
for j := 0; j < n; j++ {
fmt.Fprintf(msg, "%s,cluster=%s,host=%s,type=%s", metric.Name, cluster, host, metric.Type)
if metric.Type == "socket" {
fmt.Fprintf(msg, ",type-id=%d", j)
} else if metric.Type == "cpu" {
fmt.Fprintf(msg, ",type-id=%d", j)
}
x := metric.AvgValue + math.Sin(states[i])*(metric.AvgValue/10.)
states[i] += 0.1
fmt.Fprintf(msg, " value=%f ", x)
fmt.Fprintf(msg, "%d\n", t)
i++
}
}
}
req, _ := http.NewRequest(http.MethodPost, ccmsurl, msg)
req.Header.Set("Authorization", fmt.Sprintf("Bearer %s", token))
res, err := client.Do(req)
if err != nil {
log.Print(err)
return
}
if res.StatusCode != http.StatusOK {
body, _ := io.ReadAll(res.Body)
log.Printf("%s: %s", res.Status, string(body))
}
}
func main() {
for range hosts {
for _, m := range metrics {
n := 1
if m.Type == "socket" {
n = sockets
} else if m.Type == "cpu" {
n = cpus
}
for i := 0; i < n; i++ {
states = append(states, rand.Float64()*100)
}
}
}
client := &http.Client{}
i := 0
for t := range time.Tick(freq) {
log.Printf("tick... (#%d)", i)
i++
send(client, t.Unix())
}
}

123
selector.go Normal file
View File

@@ -0,0 +1,123 @@
package main
import (
"encoding/json"
"errors"
)
type SelectorElement struct {
Any bool
String string
Group []string
}
func (se *SelectorElement) UnmarshalJSON(input []byte) error {
if input[0] == '"' {
if err := json.Unmarshal(input, &se.String); err != nil {
return err
}
if se.String == "*" {
se.Any = true
se.String = ""
}
return nil
}
if input[0] == '[' {
return json.Unmarshal(input, &se.Group)
}
return errors.New("the Go SelectorElement type can only be a string or an array of strings")
}
func (se *SelectorElement) MarshalJSON() ([]byte, error) {
if se.Any {
return []byte("\"*\""), nil
}
if se.String != "" {
return json.Marshal(se.String)
}
if se.Group != nil {
return json.Marshal(se.Group)
}
return nil, errors.New("a Go Selector must be a non-empty string or a non-empty slice of strings")
}
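// Sketch of the three accepted JSON forms for a selector element: a plain
// string, the wildcard "*" (mapped to Any), or a group of strings.
func exampleSelectorJSON() {
	var sel Selector
	// Matches: emmy -> (host1 or host2) -> any child.
	err := json.Unmarshal([]byte(`["emmy", ["host1", "host2"], "*"]`), &sel)
	_ = err
}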
type Selector []SelectorElement
func (l *level) findLevel(selector []string) *level {
if len(selector) == 0 {
return l
}
l.lock.RLock()
defer l.lock.RUnlock()
lvl := l.children[selector[0]]
if lvl == nil {
return nil
}
return lvl.findLevel(selector[1:])
}
func (l *level) findBuffers(selector Selector, offset int, f func(b *buffer) error) error {
l.lock.RLock()
defer l.lock.RUnlock()
if len(selector) == 0 {
b := l.metrics[offset]
if b != nil {
return f(b)
}
for _, lvl := range l.children {
err := lvl.findBuffers(nil, offset, f)
if err != nil {
return err
}
}
return nil
}
sel := selector[0]
if len(sel.String) != 0 && l.children != nil {
lvl, ok := l.children[sel.String]
if ok {
err := lvl.findBuffers(selector[1:], offset, f)
if err != nil {
return err
}
}
return nil
}
if sel.Group != nil && l.children != nil {
for _, key := range sel.Group {
lvl, ok := l.children[key]
if ok {
err := lvl.findBuffers(selector[1:], offset, f)
if err != nil {
return err
}
}
}
return nil
}
if sel.Any && l.children != nil {
for _, lvl := range l.children {
if err := lvl.findBuffers(selector[1:], offset, f); err != nil {
return err
}
}
return nil
}
return nil
}

View File

@@ -1,18 +1,15 @@
package memorystore
package main
import (
"errors"
"math"
"github.com/ClusterCockpit/cc-metric-store/internal/config"
"github.com/ClusterCockpit/cc-metric-store/internal/util"
)
type Stats struct {
Samples int
Avg util.Float
Min util.Float
Max util.Float
Avg Float
Min Float
Max Float
}
func (b *buffer) stats(from, to int64) (Stats, int64, int64, error) {
@@ -57,28 +54,28 @@ func (b *buffer) stats(from, to int64) (Stats, int64, int64, error) {
return Stats{
Samples: samples,
Avg: util.Float(sum) / util.Float(samples),
Min: util.Float(min),
Max: util.Float(max),
Avg: Float(sum) / Float(samples),
Min: Float(min),
Max: Float(max),
}, from, t, nil
}
// Returns statistics for the requested metric on the selected node/level.
// Data is aggregated to the selected level the same way as in `MemoryStore.Read`.
// If `Stats.Samples` is zero, the statistics should not be considered as valid.
func (m *MemoryStore) Stats(selector util.Selector, metric string, from, to int64) (*Stats, int64, int64, error) {
func (m *MemoryStore) Stats(selector Selector, metric string, from, to int64) (*Stats, int64, int64, error) {
if from > to {
return nil, 0, 0, errors.New("invalid time range")
}
minfo, ok := m.Metrics[metric]
minfo, ok := m.metrics[metric]
if !ok {
return nil, 0, 0, errors.New("unkown metric: " + metric)
}
n, samples := 0, 0
avg, min, max := util.Float(0), math.MaxFloat32, -math.MaxFloat32
err := m.root.findBuffers(selector, minfo.Offset, func(b *buffer) error {
avg, min, max := Float(0), math.MaxFloat32, -math.MaxFloat32
err := m.root.findBuffers(selector, minfo.offset, func(b *buffer) error {
stats, cfrom, cto, err := b.stats(from, to)
if err != nil {
return err
@@ -97,6 +94,7 @@ func (m *MemoryStore) Stats(selector util.Selector, metric string, from, to int6
n += 1
return nil
})
if err != nil {
return nil, 0, 0, err
}
@@ -105,16 +103,16 @@ func (m *MemoryStore) Stats(selector util.Selector, metric string, from, to int6
return nil, 0, 0, ErrNoData
}
if minfo.Aggregation == config.AvgAggregation {
avg /= util.Float(n)
} else if n > 1 && minfo.Aggregation != config.SumAggregation {
if minfo.Aggregation == AvgAggregation {
avg /= Float(n)
} else if n > 1 && minfo.Aggregation != SumAggregation {
return nil, 0, 0, errors.New("invalid aggregation")
}
return &Stats{
Samples: samples,
Avg: avg,
Min: util.Float(min),
Max: util.Float(max),
Min: Float(min),
Max: Float(max),
}, from, to, nil
}
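For orientation, a hedged usage sketch of the statistics path above; `store`, the selector, metric name, and time range are placeholders, and `fmt` is assumed to be imported.

func exampleStats(store *MemoryStore) {
	stats, from, to, err := store.Stats(Selector{{String: "emmy"}, {String: "host123"}}, "flops_any", 1584023000, 1584023600)
	if err != nil || stats.Samples == 0 {
		return // no valid data in the requested range
	}
	fmt.Printf("avg=%f min=%f max=%f over [%d, %d]\n", stats.Avg, stats.Min, stats.Max, from, to)
}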

View File

@@ -1,8 +0,0 @@
//go:build tools
// +build tools
package tools
import (
_ "github.com/swaggo/swag/cmd/swag"
)