mirror of
https://github.com/ClusterCockpit/cc-backend
synced 2026-04-01 13:37:30 +02:00
Compare commits
21 Commits
dependabot
...
main
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
a101f215dc | ||
| 641dc0e3b8 | |||
| b734c1a92a | |||
| c5fe3c5cd9 | |||
| e2910b18b3 | |||
| ed236ec539 | |||
| 82c514b11a | |||
| 66707bbf15 | |||
| fc47b12fed | |||
| 937984d11f | |||
| 3d99aec185 | |||
| 280289185a | |||
| cc3d03bb5b | |||
|
|
5398246a61 | ||
| ac0a4cc39a | |||
|
|
71fc9efec7 | ||
|
|
6e97ac8b28 | ||
|
|
b43c52f5b5 | ||
| 97d65a9e5c | |||
| e759810051 | |||
| b1884fda9d |
38
.golangci.yml
Normal file
38
.golangci.yml
Normal file
@@ -0,0 +1,38 @@
|
||||
version: "2"
|
||||
|
||||
run:
|
||||
timeout: 5m
|
||||
|
||||
formatters:
|
||||
enable:
|
||||
- gofumpt # gofumpt formatter
|
||||
|
||||
settings:
|
||||
gofumpt:
|
||||
module-path: github.com/ClusterCockpit/cc-backend
|
||||
|
||||
linters:
|
||||
enable:
|
||||
- staticcheck # staticcheck = true
|
||||
- unparam # unusedparams = true
|
||||
- nilnil # nilness = true (catches nil returns of nil interface values)
|
||||
- govet # base vet; nilness + unusedwrite via govet analyzers
|
||||
|
||||
settings:
|
||||
staticcheck:
|
||||
checks: ["ST1003"]
|
||||
|
||||
govet:
|
||||
enable:
|
||||
- nilness
|
||||
- unusedwrite
|
||||
|
||||
exclusions:
|
||||
paths:
|
||||
- .git
|
||||
- .vscode
|
||||
- .idea
|
||||
- node_modules
|
||||
- internal/graph/generated # gqlgen generated code
|
||||
- internal/api/docs\.go # swaggo generated code
|
||||
- _test\.go # test files excluded from all linters
|
||||
12
Makefile
12
Makefile
@@ -1,6 +1,6 @@
|
||||
TARGET = ./cc-backend
|
||||
FRONTEND = ./web/frontend
|
||||
VERSION = 1.5.2
|
||||
VERSION = 1.5.3
|
||||
GIT_HASH := $(shell git rev-parse --short HEAD || echo 'development')
|
||||
CURRENT_TIME = $(shell date +"%Y-%m-%d:T%H:%M:%S")
|
||||
LD_FLAGS = '-s -X main.date=${CURRENT_TIME} -X main.version=${VERSION} -X main.commit=${GIT_HASH}'
|
||||
@@ -36,7 +36,7 @@ SVELTE_SRC = $(wildcard $(FRONTEND)/src/*.svelte) \
|
||||
$(wildcard $(FRONTEND)/src/header/*.svelte) \
|
||||
$(wildcard $(FRONTEND)/src/job/*.svelte)
|
||||
|
||||
.PHONY: clean distclean test tags frontend swagger graphql $(TARGET)
|
||||
.PHONY: clean distclean fmt lint test tags frontend swagger graphql $(TARGET)
|
||||
|
||||
.NOTPARALLEL:
|
||||
|
||||
@@ -75,6 +75,14 @@ test:
|
||||
@go vet ./...
|
||||
@go test ./...
|
||||
|
||||
fmt:
|
||||
$(info ===> FORMAT)
|
||||
@gofumpt -l -w .
|
||||
|
||||
lint:
|
||||
$(info ===> LINT)
|
||||
@golangci-lint run ./...
|
||||
|
||||
tags:
|
||||
$(info ===> TAGS)
|
||||
@ctags -R
|
||||
|
||||
20
README.md
20
README.md
@@ -100,6 +100,26 @@ the following targets:
|
||||
frontend source files will result in a complete rebuild.
|
||||
- `make clean`: Clean go build cache and remove binary.
|
||||
- `make test`: Run the tests that are also run in the GitHub workflow setup.
|
||||
- `make fmt`: Format all Go source files using
|
||||
[gofumpt](https://github.com/mvdan/gofumpt), a stricter superset of `gofmt`.
|
||||
Requires `gofumpt` to be installed (see below).
|
||||
- `make lint`: Run [golangci-lint](https://golangci-lint.run/) with the project
|
||||
configuration in `.golangci.yml`. Requires `golangci-lint` to be installed
|
||||
(see below).
|
||||
|
||||
### Installing development tools
|
||||
|
||||
`gofumpt` and `golangci-lint` are not part of the Go module and must be
|
||||
installed separately:
|
||||
|
||||
```sh
|
||||
# Formatter (gofumpt)
|
||||
go install mvdan.cc/gofumpt@latest
|
||||
|
||||
# Linter (golangci-lint) — use the official install script to get a
|
||||
# pre-built binary; installing via `go install` is not supported upstream.
|
||||
curl -sSfL https://raw.githubusercontent.com/golangci/golangci-lint/HEAD/install.sh | sh -s -- -b $(go env GOPATH)/bin
|
||||
```
|
||||
|
||||
A common workflow for setting up cc-backend from scratch is:
|
||||
|
||||
|
||||
@@ -1,4 +1,4 @@
|
||||
# `cc-backend` version 1.5.2
|
||||
# `cc-backend` version 1.5.3
|
||||
|
||||
Supports job archive version 3 and database version 11.
|
||||
|
||||
@@ -15,6 +15,65 @@ While we are confident that the memory issue with the metricstore cleanup move
|
||||
policy is fixed, it is still recommended to use delete policy for cleanup.
|
||||
This is also the default.
|
||||
|
||||
## Changes in 1.5.3
|
||||
|
||||
### Bug fixes
|
||||
|
||||
- **WAL not rotated on partial checkpoint failure**: When binary checkpointing
|
||||
failed for some hosts, WAL files for successfully checkpointed hosts were not
|
||||
rotated and the checkpoint timestamp was not advanced. Partial successes now
|
||||
correctly advance the checkpoint and rotate WAL files for completed hosts.
|
||||
- **Unbounded WAL file growth**: If binary checkpointing consistently failed for
|
||||
a host, its `current.wal` file grew without limit until disk exhaustion. A new
|
||||
`max-wal-size` configuration option (in the `checkpoints` block) allows setting
|
||||
a per-host WAL size cap in bytes. When exceeded, the WAL is force-rotated.
|
||||
Defaults to 0 (unlimited) for backward compatibility.
|
||||
|
||||
- **Doubleranged filter fixes**: Range filters now correctly handle zero as a
|
||||
boundary value. Improved validation and UI text for "more than equal" and
|
||||
"less than equal" range selections.
|
||||
- **Lineprotocol body parsing interrupted**: Switched from `ReadTimeout` to
|
||||
`ReadHeaderTimeout` so that long-running metric submissions are no longer
|
||||
cut off mid-stream.
|
||||
- **Checkpoint archiving continues on error**: A single cluster's archiving
|
||||
failure no longer aborts the entire cleanup operation. Errors are collected
|
||||
and reported per cluster.
|
||||
- **Parquet row group overflow**: Added periodic flush during checkpoint
|
||||
archiving to prevent exceeding the parquet-go 32k column-write limit.
|
||||
- **Removed metrics excluded from subcluster config**: Metrics removed from a
|
||||
subcluster are no longer returned by `GetMetricConfigSubCluster`.
|
||||
|
||||
### MetricStore performance
|
||||
|
||||
- **WAL writer throughput**: Decoupled WAL file flushing from message processing
|
||||
using a periodic 5-second batch flush (up to 4096 messages per cycle),
|
||||
significantly increasing metric ingestion throughput.
|
||||
- **Improved shutdown time**: HTTP shutdown timeout reduced; metricstore and
|
||||
archiver now shut down concurrently. Overall shutdown deadline raised to
|
||||
60 seconds.
|
||||
|
||||
### New features
|
||||
|
||||
- **Manual checkpoint cleanup flag**: New `-cleanup-checkpoints` CLI flag
|
||||
triggers checkpoint cleanup without starting the server, useful for
|
||||
maintenance windows or automated cleanup scripts.
|
||||
- **Explicit node state queries in node view**: Node health and scheduler state
|
||||
are now fetched independently from metric data for fresher status information.
|
||||
|
||||
### New tools
|
||||
|
||||
- **binaryCheckpointReader**: New utility tool (`tools/binaryCheckpointReader`)
|
||||
that reads `.wal` or `.bin` checkpoint files produced by the metricstore
|
||||
WAL/snapshot system and dumps their contents to a human-readable `.txt` file.
|
||||
Useful for debugging and inspecting checkpoint data. Usage:
|
||||
`go run ./tools/binaryCheckpointReader <file.wal|file.bin>`
|
||||
|
||||
### Logging improvements
|
||||
|
||||
- **Reduced tagger log noise**: Missing metrics and expression evaluation errors
|
||||
in the job classification tagger are now logged at debug level instead of
|
||||
error level.
|
||||
|
||||
## Changes in 1.5.2
|
||||
|
||||
### Bug fixes
|
||||
|
||||
@@ -407,21 +407,27 @@ func (s *Server) Start(ctx context.Context) error {
|
||||
}
|
||||
|
||||
func (s *Server) Shutdown(ctx context.Context) {
|
||||
shutdownCtx, cancel := context.WithTimeout(ctx, 5*time.Second)
|
||||
defer cancel()
|
||||
shutdownStart := time.Now()
|
||||
|
||||
natsStart := time.Now()
|
||||
nc := nats.GetClient()
|
||||
if nc != nil {
|
||||
nc.Close()
|
||||
}
|
||||
cclog.Infof("Shutdown: NATS closed (%v)", time.Since(natsStart))
|
||||
|
||||
httpStart := time.Now()
|
||||
shutdownCtx, cancel := context.WithTimeout(ctx, 60*time.Second)
|
||||
defer cancel()
|
||||
if err := s.server.Shutdown(shutdownCtx); err != nil {
|
||||
cclog.Errorf("Server shutdown error: %v", err)
|
||||
}
|
||||
cclog.Infof("Shutdown: HTTP server stopped (%v)", time.Since(httpStart))
|
||||
|
||||
// Run metricstore and archiver shutdown concurrently.
|
||||
// They are independent: metricstore writes .bin snapshots,
|
||||
// archiver flushes pending job archives.
|
||||
storeStart := time.Now()
|
||||
done := make(chan struct{})
|
||||
go func() {
|
||||
defer close(done)
|
||||
@@ -434,7 +440,7 @@ func (s *Server) Shutdown(ctx context.Context) {
|
||||
}
|
||||
|
||||
wg.Go(func() {
|
||||
if err := archiver.Shutdown(10 * time.Second); err != nil {
|
||||
if err := archiver.Shutdown(60 * time.Second); err != nil {
|
||||
cclog.Warnf("Archiver shutdown: %v", err)
|
||||
}
|
||||
})
|
||||
@@ -444,7 +450,10 @@ func (s *Server) Shutdown(ctx context.Context) {
|
||||
|
||||
select {
|
||||
case <-done:
|
||||
cclog.Infof("Shutdown: metricstore + archiver completed (%v)", time.Since(storeStart))
|
||||
case <-time.After(60 * time.Second):
|
||||
cclog.Warn("Shutdown deadline exceeded, forcing exit")
|
||||
cclog.Warnf("Shutdown deadline exceeded after %v, forcing exit", time.Since(shutdownStart))
|
||||
}
|
||||
|
||||
cclog.Infof("Shutdown: total time %v", time.Since(shutdownStart))
|
||||
}
|
||||
|
||||
49
gopls.json
Normal file
49
gopls.json
Normal file
@@ -0,0 +1,49 @@
|
||||
{
|
||||
"gofumpt": true,
|
||||
"codelenses": {
|
||||
"gc_details": false,
|
||||
"generate": true,
|
||||
"regenerate_cgo": true,
|
||||
"run_govulncheck": true,
|
||||
"test": true,
|
||||
"tidy": true,
|
||||
"upgrade_dependency": true,
|
||||
"vendor": true
|
||||
},
|
||||
"hints": {
|
||||
"assignVariableTypes": true,
|
||||
"compositeLiteralFields": true,
|
||||
"compositeLiteralTypes": true,
|
||||
"constantValues": true,
|
||||
"functionTypeParameters": true,
|
||||
"parameterNames": true,
|
||||
"rangeVariableTypes": true
|
||||
},
|
||||
"analyses": {
|
||||
"nilness": true,
|
||||
"unusedparams": true,
|
||||
"unusedwrite": true,
|
||||
"useany": true
|
||||
},
|
||||
"usePlaceholders": true,
|
||||
"completeUnimported": true,
|
||||
"staticcheck": true,
|
||||
"directoryFilters": [
|
||||
"-.git",
|
||||
"-.vscode",
|
||||
"-.idea",
|
||||
"-.vscode-test",
|
||||
"-node_modules",
|
||||
"-web/frontend",
|
||||
"-web/templates",
|
||||
"-var",
|
||||
"-api",
|
||||
"-configs",
|
||||
"-init",
|
||||
"-internal/repository/migrations",
|
||||
"-internal/repository/testdata",
|
||||
"-internal/importer/testdata",
|
||||
"-pkg/archive/testdata"
|
||||
],
|
||||
"semanticTokens": true
|
||||
}
|
||||
@@ -164,12 +164,17 @@ func (auth *Authentication) AuthViaSession(
|
||||
return nil, errors.New("invalid session data")
|
||||
}
|
||||
|
||||
authSourceInt, ok := session.Values["authSource"].(int)
|
||||
if !ok {
|
||||
authSourceInt = int(schema.AuthViaLocalPassword)
|
||||
}
|
||||
|
||||
return &schema.User{
|
||||
Username: username,
|
||||
Projects: projects,
|
||||
Roles: roles,
|
||||
AuthType: schema.AuthSession,
|
||||
AuthSource: -1,
|
||||
AuthSource: schema.AuthSource(authSourceInt),
|
||||
}, nil
|
||||
}
|
||||
|
||||
@@ -319,10 +324,11 @@ func (auth *Authentication) SaveSession(rw http.ResponseWriter, r *http.Request,
|
||||
}
|
||||
session.Options.Secure = false
|
||||
}
|
||||
session.Options.SameSite = http.SameSiteStrictMode
|
||||
session.Options.SameSite = http.SameSiteLaxMode
|
||||
session.Values["username"] = user.Username
|
||||
session.Values["projects"] = user.Projects
|
||||
session.Values["roles"] = user.Roles
|
||||
session.Values["authSource"] = int(user.AuthSource)
|
||||
if err := auth.sessionStore.Save(r, rw, session); err != nil {
|
||||
cclog.Warnf("session save failed: %s", err.Error())
|
||||
http.Error(rw, err.Error(), http.StatusInternalServerError)
|
||||
|
||||
@@ -1072,10 +1072,12 @@ func (r *Resolver) Query() generated.QueryResolver { return &queryResolver{r} }
|
||||
// SubCluster returns generated.SubClusterResolver implementation.
|
||||
func (r *Resolver) SubCluster() generated.SubClusterResolver { return &subClusterResolver{r} }
|
||||
|
||||
type clusterResolver struct{ *Resolver }
|
||||
type jobResolver struct{ *Resolver }
|
||||
type metricValueResolver struct{ *Resolver }
|
||||
type mutationResolver struct{ *Resolver }
|
||||
type nodeResolver struct{ *Resolver }
|
||||
type queryResolver struct{ *Resolver }
|
||||
type subClusterResolver struct{ *Resolver }
|
||||
type (
|
||||
clusterResolver struct{ *Resolver }
|
||||
jobResolver struct{ *Resolver }
|
||||
metricValueResolver struct{ *Resolver }
|
||||
mutationResolver struct{ *Resolver }
|
||||
nodeResolver struct{ *Resolver }
|
||||
queryResolver struct{ *Resolver }
|
||||
subClusterResolver struct{ *Resolver }
|
||||
)
|
||||
|
||||
@@ -236,4 +236,3 @@ func (ccms *CCMetricStore) buildNodeQueries(
|
||||
|
||||
return queries, assignedScope, nil
|
||||
}
|
||||
|
||||
|
||||
@@ -63,10 +63,10 @@ func DefaultConfig() *RepositoryConfig {
|
||||
MaxIdleConnections: 4,
|
||||
ConnectionMaxLifetime: time.Hour,
|
||||
ConnectionMaxIdleTime: 10 * time.Minute,
|
||||
MinRunningJobDuration: 600, // 10 minutes
|
||||
DbCacheSizeMB: 2048, // 2GB per connection
|
||||
DbSoftHeapLimitMB: 16384, // 16GB process-wide
|
||||
BusyTimeoutMs: 60000, // 60 seconds
|
||||
MinRunningJobDuration: 600, // 10 minutes
|
||||
DbCacheSizeMB: 2048, // 2GB per connection
|
||||
DbSoftHeapLimitMB: 16384, // 16GB process-wide
|
||||
BusyTimeoutMs: 60000, // 60 seconds
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -133,10 +133,10 @@ func (pt *prefixedTarget) WriteFile(name string, data []byte) error {
|
||||
// ClusterAwareParquetWriter organizes Parquet output by cluster.
|
||||
// Each cluster gets its own subdirectory with a cluster.json config file.
|
||||
type ClusterAwareParquetWriter struct {
|
||||
target ParquetTarget
|
||||
maxSizeMB int
|
||||
writers map[string]*ParquetWriter
|
||||
clusterCfgs map[string]*schema.Cluster
|
||||
target ParquetTarget
|
||||
maxSizeMB int
|
||||
writers map[string]*ParquetWriter
|
||||
clusterCfgs map[string]*schema.Cluster
|
||||
}
|
||||
|
||||
// NewClusterAwareParquetWriter creates a writer that routes jobs to per-cluster ParquetWriters.
|
||||
|
||||
@@ -3,6 +3,12 @@
|
||||
// Use of this source code is governed by a MIT-style
|
||||
// license that can be found in the LICENSE file.
|
||||
|
||||
// This file implements the cleanup (archiving or deletion) of old checkpoint files.
|
||||
//
|
||||
// The CleanUp worker runs on a timer equal to RetentionInMemory. In "archive" mode
|
||||
// it converts checkpoint files older than the retention window into per-cluster
|
||||
// Parquet files and then deletes the originals. In "delete" mode it simply removes
|
||||
// old checkpoint files.
|
||||
package metricstore
|
||||
|
||||
import (
|
||||
@@ -19,8 +25,12 @@ import (
|
||||
cclog "github.com/ClusterCockpit/cc-lib/v2/ccLogger"
|
||||
)
|
||||
|
||||
// Worker for either Archiving or Deleting files
|
||||
|
||||
// CleanUp starts a background worker that periodically removes or archives
|
||||
// checkpoint files older than the configured retention window.
|
||||
//
|
||||
// In "archive" mode, old checkpoint files are converted to Parquet and stored
|
||||
// under Keys.Cleanup.RootDir. In "delete" mode they are simply removed.
|
||||
// The cleanup interval equals Keys.RetentionInMemory.
|
||||
func CleanUp(wg *sync.WaitGroup, ctx context.Context) {
|
||||
if Keys.Cleanup.Mode == "archive" {
|
||||
cclog.Info("[METRICSTORE]> enable archive cleanup to parquet")
|
||||
|
||||
@@ -146,7 +146,9 @@ var (
|
||||
|
||||
// ErrDataDoesNotAlign indicates that aggregated data from child scopes
|
||||
// does not align with the parent scope's expected timestamps/intervals.
|
||||
ErrDataDoesNotAlign error = errors.New("[METRICSTORE]> data from lower granularities does not align")
|
||||
ErrDataDoesNotAlignMissingFront error = errors.New("[METRICSTORE]> data from lower granularities does not align (missing data prior to start of the buffers)")
|
||||
ErrDataDoesNotAlignMissingBack error = errors.New("[METRICSTORE]> data from lower granularities does not align (missing data after the end of the buffers)")
|
||||
ErrDataDoesNotAlignDataLenMismatch error = errors.New("[METRICSTORE]> data from lower granularities does not align (collected data length is different than expected data length)")
|
||||
)
|
||||
|
||||
// buffer stores time-series data for a single metric at a specific hierarchical level.
|
||||
|
||||
@@ -86,11 +86,12 @@ var (
|
||||
|
||||
// Checkpointing starts a background worker that periodically saves metric data to disk.
|
||||
//
|
||||
// Checkpoints are written every 12 hours (hardcoded).
|
||||
// The checkpoint interval is read from Keys.CheckpointInterval (default: 12 hours).
|
||||
//
|
||||
// Format behaviour:
|
||||
// - "json": Periodic checkpointing every checkpointInterval
|
||||
// - "wal": Periodic binary snapshots + WAL rotation every checkpointInterval
|
||||
// - "json": Writes a JSON snapshot file per host every interval
|
||||
// - "wal": Writes a binary snapshot file per host every interval, then rotates
|
||||
// the current.wal files for all successfully checkpointed hosts
|
||||
func Checkpointing(wg *sync.WaitGroup, ctx context.Context) {
|
||||
lastCheckpointMu.Lock()
|
||||
lastCheckpoint = time.Now()
|
||||
@@ -99,7 +100,6 @@ func Checkpointing(wg *sync.WaitGroup, ctx context.Context) {
|
||||
ms := GetMemoryStore()
|
||||
|
||||
wg.Go(func() {
|
||||
|
||||
d := 12 * time.Hour // default checkpoint interval
|
||||
if Keys.CheckpointInterval != "" {
|
||||
parsed, err := time.ParseDuration(Keys.CheckpointInterval)
|
||||
@@ -126,10 +126,15 @@ func Checkpointing(wg *sync.WaitGroup, ctx context.Context) {
|
||||
cclog.Infof("[METRICSTORE]> start checkpointing (starting at %s)...", from.Format(time.RFC3339))
|
||||
|
||||
if Keys.Checkpoints.FileFormat == "wal" {
|
||||
// Pause WAL writes: the binary snapshot captures all in-memory
|
||||
// data, so WAL records written during checkpoint are redundant
|
||||
// and would be deleted during rotation anyway.
|
||||
walCheckpointActive.Store(true)
|
||||
n, hostDirs, err := ms.ToCheckpointWAL(Keys.Checkpoints.RootDir, from.Unix(), now.Unix())
|
||||
if err != nil {
|
||||
cclog.Errorf("[METRICSTORE]> binary checkpointing failed: %s", err.Error())
|
||||
} else {
|
||||
}
|
||||
if n > 0 {
|
||||
cclog.Infof("[METRICSTORE]> done: %d binary snapshot files created", n)
|
||||
lastCheckpointMu.Lock()
|
||||
lastCheckpoint = now
|
||||
@@ -137,6 +142,8 @@ func Checkpointing(wg *sync.WaitGroup, ctx context.Context) {
|
||||
// Rotate WAL files for successfully checkpointed hosts.
|
||||
RotateWALFiles(hostDirs)
|
||||
}
|
||||
walCheckpointActive.Store(false)
|
||||
walDropped.Store(0)
|
||||
} else {
|
||||
n, err := ms.ToCheckpoint(Keys.Checkpoints.RootDir, from.Unix(), now.Unix())
|
||||
if err != nil {
|
||||
|
||||
@@ -59,11 +59,14 @@ const (
|
||||
// Checkpoints configures periodic persistence of in-memory metric data.
|
||||
//
|
||||
// Fields:
|
||||
// - FileFormat: "json" (human-readable, periodic) or "wal" (binary snapshot + WAL, crash-safe); default is "wal"
|
||||
// - RootDir: Filesystem path for checkpoint files (created if missing)
|
||||
// - FileFormat: "json" (human-readable, periodic) or "wal" (binary snapshot + WAL, crash-safe); default is "wal"
|
||||
// - RootDir: Filesystem path for checkpoint files (created if missing)
|
||||
// - MaxWALSize: Maximum size in bytes for a single host's WAL file; 0 = unlimited (default).
|
||||
// When exceeded the WAL is force-rotated to prevent unbounded disk growth.
|
||||
type Checkpoints struct {
|
||||
FileFormat string `json:"file-format"`
|
||||
RootDir string `json:"directory"`
|
||||
MaxWALSize int64 `json:"max-wal-size,omitempty"`
|
||||
}
|
||||
|
||||
// Debug provides development and profiling options.
|
||||
|
||||
@@ -24,6 +24,11 @@ const configSchema = `{
|
||||
"directory": {
|
||||
"description": "Path in which the checkpointed files should be placed.",
|
||||
"type": "string"
|
||||
},
|
||||
"max-wal-size": {
|
||||
"description": "Maximum size in bytes for a single host's WAL file. When exceeded the WAL is force-rotated to prevent unbounded disk growth. Only applies when file-format is 'wal'. 0 means unlimited (default).",
|
||||
"type": "integer",
|
||||
"minimum": 0
|
||||
}
|
||||
}
|
||||
},
|
||||
|
||||
@@ -8,7 +8,7 @@
|
||||
//
|
||||
// The package organizes metrics in a tree structure (cluster → host → component) and
|
||||
// provides concurrent read/write access to metric data with configurable aggregation strategies.
|
||||
// Background goroutines handle periodic checkpointing (JSON or Avro format), archiving old data,
|
||||
// Background goroutines handle periodic checkpointing (JSON or WAL/binary format), archiving old data,
|
||||
// and enforcing retention policies.
|
||||
//
|
||||
// Key features:
|
||||
@@ -16,7 +16,7 @@
|
||||
// - Hierarchical data organization (selectors)
|
||||
// - Concurrent checkpoint/archive workers
|
||||
// - Support for sum and average aggregation
|
||||
// - NATS integration for metric ingestion
|
||||
// - NATS integration for metric ingestion via InfluxDB line protocol
|
||||
package metricstore
|
||||
|
||||
import (
|
||||
@@ -113,7 +113,8 @@ type MemoryStore struct {
|
||||
// 6. Optionally subscribes to NATS for real-time metric ingestion
|
||||
//
|
||||
// Parameters:
|
||||
// - rawConfig: JSON configuration for the metric store (see MetricStoreConfig)
|
||||
// - rawConfig: JSON configuration for the metric store (see MetricStoreConfig); may be nil to use defaults
|
||||
// - metrics: Map of metric names to their configurations (frequency and aggregation strategy)
|
||||
// - wg: WaitGroup that will be incremented for each background goroutine started
|
||||
//
|
||||
// The function will call cclog.Fatal on critical errors during initialization.
|
||||
@@ -271,19 +272,32 @@ func (ms *MemoryStore) SetNodeProvider(provider NodeProvider) {
|
||||
//
|
||||
// Note: This function blocks until the final checkpoint is written.
|
||||
func Shutdown() {
|
||||
totalStart := time.Now()
|
||||
|
||||
shutdownFuncMu.Lock()
|
||||
defer shutdownFuncMu.Unlock()
|
||||
if shutdownFunc != nil {
|
||||
shutdownFunc()
|
||||
}
|
||||
cclog.Infof("[METRICSTORE]> Background workers cancelled (%v)", time.Since(totalStart))
|
||||
|
||||
if Keys.Checkpoints.FileFormat == "wal" {
|
||||
// Signal producers to stop sending before closing channels,
|
||||
// preventing send-on-closed-channel panics from in-flight NATS workers.
|
||||
walShuttingDown.Store(true)
|
||||
// Brief grace period for in-flight DecodeLine calls to complete.
|
||||
time.Sleep(100 * time.Millisecond)
|
||||
|
||||
for _, ch := range walShardChs {
|
||||
close(ch)
|
||||
}
|
||||
drainStart := time.Now()
|
||||
WaitForWALStagingDrain()
|
||||
cclog.Infof("[METRICSTORE]> WAL staging goroutines exited (%v)", time.Since(drainStart))
|
||||
}
|
||||
|
||||
cclog.Infof("[METRICSTORE]> Writing to '%s'...\n", Keys.Checkpoints.RootDir)
|
||||
cclog.Infof("[METRICSTORE]> Writing checkpoint to '%s'...", Keys.Checkpoints.RootDir)
|
||||
checkpointStart := time.Now()
|
||||
var files int
|
||||
var err error
|
||||
|
||||
@@ -294,19 +308,16 @@ func Shutdown() {
|
||||
lastCheckpointMu.Unlock()
|
||||
|
||||
if Keys.Checkpoints.FileFormat == "wal" {
|
||||
var hostDirs []string
|
||||
files, hostDirs, err = ms.ToCheckpointWAL(Keys.Checkpoints.RootDir, from.Unix(), time.Now().Unix())
|
||||
if err == nil {
|
||||
RotateWALFilesAfterShutdown(hostDirs)
|
||||
}
|
||||
// WAL files are deleted per-host inside ToCheckpointWAL workers.
|
||||
files, _, err = ms.ToCheckpointWAL(Keys.Checkpoints.RootDir, from.Unix(), time.Now().Unix())
|
||||
} else {
|
||||
files, err = ms.ToCheckpoint(Keys.Checkpoints.RootDir, from.Unix(), time.Now().Unix())
|
||||
}
|
||||
|
||||
if err != nil {
|
||||
cclog.Errorf("[METRICSTORE]> Writing checkpoint failed: %s\n", err.Error())
|
||||
cclog.Errorf("[METRICSTORE]> Writing checkpoint failed: %s", err.Error())
|
||||
}
|
||||
cclog.Infof("[METRICSTORE]> Done! (%d files written)\n", files)
|
||||
cclog.Infof("[METRICSTORE]> Done! (%d files written in %v, total shutdown: %v)", files, time.Since(checkpointStart), time.Since(totalStart))
|
||||
}
|
||||
|
||||
// Retention starts a background goroutine that periodically frees old metric data.
|
||||
@@ -702,16 +713,16 @@ func (m *MemoryStore) Read(selector util.Selector, metric string, from, to, reso
|
||||
} else if from != cfrom || to != cto || len(data) != len(cdata) {
|
||||
missingfront, missingback := int((from-cfrom)/minfo.Frequency), int((to-cto)/minfo.Frequency)
|
||||
if missingfront != 0 {
|
||||
return ErrDataDoesNotAlign
|
||||
return ErrDataDoesNotAlignMissingFront
|
||||
}
|
||||
|
||||
newlen := len(cdata) - missingback
|
||||
if newlen < 1 {
|
||||
return ErrDataDoesNotAlign
|
||||
return ErrDataDoesNotAlignMissingBack
|
||||
}
|
||||
cdata = cdata[0:newlen]
|
||||
if len(cdata) != len(data) {
|
||||
return ErrDataDoesNotAlign
|
||||
return ErrDataDoesNotAlignDataLenMismatch
|
||||
}
|
||||
|
||||
from, to = cfrom, cto
|
||||
|
||||
@@ -91,8 +91,10 @@ func (m *MemoryStore) Stats(selector util.Selector, metric string, from, to int6
|
||||
|
||||
if n == 0 {
|
||||
from, to = cfrom, cto
|
||||
} else if from != cfrom || to != cto {
|
||||
return ErrDataDoesNotAlign
|
||||
} else if from != cfrom {
|
||||
return ErrDataDoesNotAlignMissingFront
|
||||
} else if to != cto {
|
||||
return ErrDataDoesNotAlignMissingBack
|
||||
}
|
||||
|
||||
samples += stats.Samples
|
||||
|
||||
@@ -92,6 +92,18 @@ var walShardRotateChs []chan walRotateReq
|
||||
// walNumShards stores the number of shards (set during WALStaging init).
|
||||
var walNumShards int
|
||||
|
||||
// walStagingWg tracks WALStaging goroutine exits for shutdown synchronization.
|
||||
var walStagingWg sync.WaitGroup
|
||||
|
||||
// walShuttingDown is set before closing shard channels to prevent
|
||||
// SendWALMessage from sending on a closed channel (which panics in Go).
|
||||
var walShuttingDown atomic.Bool
|
||||
|
||||
// walCheckpointActive is set during binary checkpoint writes.
|
||||
// While active, SendWALMessage skips sending (returns true) because the
|
||||
// snapshot captures all in-memory data, making WAL writes redundant.
|
||||
var walCheckpointActive atomic.Bool
|
||||
|
||||
// WALMessage represents a single metric write to be appended to the WAL.
|
||||
// Cluster and Node are NOT stored in the WAL record (inferred from file path).
|
||||
type WALMessage struct {
|
||||
@@ -115,12 +127,13 @@ type walFileState struct {
|
||||
f *os.File
|
||||
w *bufio.Writer
|
||||
dirty bool
|
||||
size int64 // approximate bytes written (tracked from open + writes)
|
||||
}
|
||||
|
||||
// walFlushInterval controls how often dirty WAL files are flushed to disk.
|
||||
// Decoupling flushes from message processing lets the consumer run at memory
|
||||
// speed, amortizing syscall overhead across many writes.
|
||||
const walFlushInterval = 5 * time.Second
|
||||
const walFlushInterval = 1 * time.Second
|
||||
|
||||
// walShardIndex computes which shard a message belongs to based on cluster+node.
|
||||
// Uses FNV-1a hash for fast, well-distributed mapping.
|
||||
@@ -133,11 +146,14 @@ func walShardIndex(cluster, node string) int {
|
||||
}
|
||||
|
||||
// SendWALMessage routes a WAL message to the appropriate shard channel.
|
||||
// Returns false if the channel is full (message dropped).
|
||||
// Returns false if the channel is full or shutdown is in progress.
|
||||
func SendWALMessage(msg *WALMessage) bool {
|
||||
if walShardChs == nil {
|
||||
if walShardChs == nil || walShuttingDown.Load() {
|
||||
return false
|
||||
}
|
||||
if walCheckpointActive.Load() {
|
||||
return true // Data safe in memory; snapshot will capture it
|
||||
}
|
||||
shard := walShardIndex(msg.Cluster, msg.Node)
|
||||
select {
|
||||
case walShardChs[shard] <- msg:
|
||||
@@ -171,7 +187,9 @@ func WALStaging(wg *sync.WaitGroup, ctx context.Context) {
|
||||
msgCh := walShardChs[i]
|
||||
rotateCh := walShardRotateChs[i]
|
||||
|
||||
walStagingWg.Add(1)
|
||||
wg.Go(func() {
|
||||
defer walStagingWg.Done()
|
||||
hostFiles := make(map[string]*walFileState)
|
||||
|
||||
defer func() {
|
||||
@@ -205,7 +223,11 @@ func WALStaging(wg *sync.WaitGroup, ctx context.Context) {
|
||||
|
||||
// Write file header magic if file is new (empty).
|
||||
info, err := f.Stat()
|
||||
if err == nil && info.Size() == 0 {
|
||||
var fileSize int64
|
||||
if err == nil {
|
||||
fileSize = info.Size()
|
||||
}
|
||||
if err == nil && fileSize == 0 {
|
||||
var hdr [4]byte
|
||||
binary.LittleEndian.PutUint32(hdr[:], walFileMagic)
|
||||
if _, err := w.Write(hdr[:]); err != nil {
|
||||
@@ -213,9 +235,10 @@ func WALStaging(wg *sync.WaitGroup, ctx context.Context) {
|
||||
f.Close()
|
||||
return nil
|
||||
}
|
||||
fileSize = 4
|
||||
}
|
||||
|
||||
ws = &walFileState{f: f, w: w}
|
||||
ws = &walFileState{f: f, w: w, size: fileSize}
|
||||
hostFiles[hostDir] = ws
|
||||
return ws
|
||||
}
|
||||
@@ -226,9 +249,30 @@ func WALStaging(wg *sync.WaitGroup, ctx context.Context) {
|
||||
if ws == nil {
|
||||
return
|
||||
}
|
||||
if err := writeWALRecordDirect(ws.w, msg); err != nil {
|
||||
|
||||
// Enforce max WAL size: force-rotate before writing if limit is exceeded.
|
||||
// The in-memory store still holds the data; only crash-recovery coverage is lost.
|
||||
if maxSize := Keys.Checkpoints.MaxWALSize; maxSize > 0 && ws.size >= maxSize {
|
||||
cclog.Warnf("[METRICSTORE]> WAL: force-rotating %s (size %d >= limit %d)",
|
||||
hostDir, ws.size, maxSize)
|
||||
ws.w.Flush()
|
||||
ws.f.Close()
|
||||
walPath := path.Join(hostDir, "current.wal")
|
||||
if err := os.Remove(walPath); err != nil && !os.IsNotExist(err) {
|
||||
cclog.Errorf("[METRICSTORE]> WAL: remove %s: %v", walPath, err)
|
||||
}
|
||||
delete(hostFiles, hostDir)
|
||||
ws = getOrOpenWAL(hostDir)
|
||||
if ws == nil {
|
||||
return
|
||||
}
|
||||
}
|
||||
|
||||
n, err := writeWALRecordDirect(ws.w, msg)
|
||||
if err != nil {
|
||||
cclog.Errorf("[METRICSTORE]> WAL: write record: %v", err)
|
||||
}
|
||||
ws.size += int64(n)
|
||||
ws.dirty = true
|
||||
}
|
||||
|
||||
@@ -255,23 +299,6 @@ func WALStaging(wg *sync.WaitGroup, ctx context.Context) {
|
||||
}
|
||||
}
|
||||
|
||||
drain := func() {
|
||||
for {
|
||||
select {
|
||||
case msg, ok := <-msgCh:
|
||||
if !ok {
|
||||
return
|
||||
}
|
||||
processMsg(msg)
|
||||
case req := <-rotateCh:
|
||||
processRotate(req)
|
||||
default:
|
||||
flushDirty()
|
||||
return
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
ticker := time.NewTicker(walFlushInterval)
|
||||
defer ticker.Stop()
|
||||
|
||||
@@ -298,7 +325,10 @@ func WALStaging(wg *sync.WaitGroup, ctx context.Context) {
|
||||
for {
|
||||
select {
|
||||
case <-ctx.Done():
|
||||
drain()
|
||||
// On shutdown, skip draining buffered messages — a full binary
|
||||
// checkpoint will be written from in-memory state, making
|
||||
// buffered WAL records redundant.
|
||||
flushDirty()
|
||||
return
|
||||
case msg, ok := <-msgCh:
|
||||
if !ok {
|
||||
@@ -319,23 +349,45 @@ func WALStaging(wg *sync.WaitGroup, ctx context.Context) {
|
||||
}
|
||||
}
|
||||
|
||||
// WaitForWALStagingDrain blocks until all WALStaging goroutines have exited.
|
||||
// Must be called after closing walShardChs to ensure all file handles are
|
||||
// flushed and closed before checkpoint writes begin.
|
||||
func WaitForWALStagingDrain() {
|
||||
walStagingWg.Wait()
|
||||
}
|
||||
|
||||
// RotateWALFiles sends rotation requests for the given host directories
|
||||
// and blocks until all rotations complete. Each request is routed to the
|
||||
// shard that owns the host directory.
|
||||
//
|
||||
// If shutdown is in progress (WAL staging goroutines may have exited),
|
||||
// rotation is skipped to avoid deadlocking on abandoned channels.
|
||||
func RotateWALFiles(hostDirs []string) {
|
||||
if walShardRotateChs == nil {
|
||||
if walShardRotateChs == nil || walShuttingDown.Load() {
|
||||
return
|
||||
}
|
||||
dones := make([]chan struct{}, len(hostDirs))
|
||||
for i, dir := range hostDirs {
|
||||
dones[i] = make(chan struct{})
|
||||
// Extract cluster/node from hostDir to find the right shard.
|
||||
// hostDir = rootDir/cluster/node
|
||||
deadline := time.After(2 * time.Minute)
|
||||
dones := make([]chan struct{}, 0, len(hostDirs))
|
||||
for _, dir := range hostDirs {
|
||||
done := make(chan struct{})
|
||||
shard := walShardIndexFromDir(dir)
|
||||
walShardRotateChs[shard] <- walRotateReq{hostDir: dir, done: dones[i]}
|
||||
select {
|
||||
case walShardRotateChs[shard] <- walRotateReq{hostDir: dir, done: done}:
|
||||
dones = append(dones, done)
|
||||
case <-deadline:
|
||||
cclog.Warnf("[METRICSTORE]> WAL rotation send timed out, %d of %d hosts remaining",
|
||||
len(hostDirs)-len(dones), len(hostDirs))
|
||||
goto waitDones
|
||||
}
|
||||
}
|
||||
waitDones:
|
||||
for _, done := range dones {
|
||||
<-done
|
||||
select {
|
||||
case <-done:
|
||||
case <-time.After(30 * time.Second):
|
||||
cclog.Warn("[METRICSTORE]> WAL rotation completion timed out, continuing")
|
||||
return
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -348,8 +400,9 @@ func walShardIndexFromDir(hostDir string) int {
|
||||
return walShardIndex(cluster, node)
|
||||
}
|
||||
|
||||
// RotateWALFiles sends rotation requests for the given host directories
|
||||
// and blocks until all rotations complete.
|
||||
// RotateWALFilesAfterShutdown directly removes current.wal files for the given
|
||||
// host directories. Used after shutdown, when WALStaging goroutines have already
|
||||
// exited and the channel-based RotateWALFiles is no longer safe to call.
|
||||
func RotateWALFilesAfterShutdown(hostDirs []string) {
|
||||
for _, dir := range hostDirs {
|
||||
walPath := path.Join(dir, "current.wal")
|
||||
@@ -359,79 +412,66 @@ func RotateWALFilesAfterShutdown(hostDirs []string) {
|
||||
}
|
||||
}
|
||||
|
||||
// writeWALRecordDirect encodes a WAL record directly into the bufio.Writer,
|
||||
// avoiding heap allocations by using a stack-allocated scratch buffer for
|
||||
// the fixed-size header/trailer and computing CRC inline.
|
||||
func writeWALRecordDirect(w *bufio.Writer, msg *WALMessage) error {
|
||||
// Compute payload size.
|
||||
// writeWALRecordDirect encodes a WAL record into a contiguous buffer first,
|
||||
// then writes it to the bufio.Writer in a single call. This prevents partial
|
||||
// records in the write buffer if a write error occurs mid-record (e.g. disk full).
|
||||
// Returns the number of bytes written and any error.
|
||||
func writeWALRecordDirect(w *bufio.Writer, msg *WALMessage) (int, error) {
|
||||
// Compute payload and total record size.
|
||||
payloadSize := 8 + 2 + len(msg.MetricName) + 1 + 4
|
||||
for _, s := range msg.Selector {
|
||||
payloadSize += 1 + len(s)
|
||||
}
|
||||
// Total: 8 (header) + payload + 4 (CRC).
|
||||
totalSize := 8 + payloadSize + 4
|
||||
|
||||
// Write magic + payload length (8 bytes header).
|
||||
var hdr [8]byte
|
||||
binary.LittleEndian.PutUint32(hdr[0:4], walRecordMagic)
|
||||
binary.LittleEndian.PutUint32(hdr[4:8], uint32(payloadSize))
|
||||
if _, err := w.Write(hdr[:]); err != nil {
|
||||
return err
|
||||
// Use stack buffer for typical small records, heap-allocate only for large ones.
|
||||
var stackBuf [256]byte
|
||||
var buf []byte
|
||||
if totalSize <= len(stackBuf) {
|
||||
buf = stackBuf[:totalSize]
|
||||
} else {
|
||||
buf = make([]byte, totalSize)
|
||||
}
|
||||
|
||||
// We need to compute CRC over the payload as we write it.
|
||||
crc := crc32.NewIEEE()
|
||||
// Header: magic + payload length.
|
||||
binary.LittleEndian.PutUint32(buf[0:4], walRecordMagic)
|
||||
binary.LittleEndian.PutUint32(buf[4:8], uint32(payloadSize))
|
||||
|
||||
// Payload starts at offset 8.
|
||||
p := 8
|
||||
|
||||
// Timestamp (8 bytes).
|
||||
var scratch [8]byte
|
||||
binary.LittleEndian.PutUint64(scratch[:8], uint64(msg.Timestamp))
|
||||
crc.Write(scratch[:8])
|
||||
if _, err := w.Write(scratch[:8]); err != nil {
|
||||
return err
|
||||
}
|
||||
binary.LittleEndian.PutUint64(buf[p:p+8], uint64(msg.Timestamp))
|
||||
p += 8
|
||||
|
||||
// Metric name length (2 bytes) + metric name.
|
||||
binary.LittleEndian.PutUint16(scratch[:2], uint16(len(msg.MetricName)))
|
||||
crc.Write(scratch[:2])
|
||||
if _, err := w.Write(scratch[:2]); err != nil {
|
||||
return err
|
||||
}
|
||||
nameBytes := []byte(msg.MetricName)
|
||||
crc.Write(nameBytes)
|
||||
if _, err := w.Write(nameBytes); err != nil {
|
||||
return err
|
||||
}
|
||||
binary.LittleEndian.PutUint16(buf[p:p+2], uint16(len(msg.MetricName)))
|
||||
p += 2
|
||||
p += copy(buf[p:], msg.MetricName)
|
||||
|
||||
// Selector count (1 byte).
|
||||
scratch[0] = byte(len(msg.Selector))
|
||||
crc.Write(scratch[:1])
|
||||
if _, err := w.Write(scratch[:1]); err != nil {
|
||||
return err
|
||||
}
|
||||
buf[p] = byte(len(msg.Selector))
|
||||
p++
|
||||
|
||||
// Selectors (1-byte length + bytes each).
|
||||
for _, sel := range msg.Selector {
|
||||
scratch[0] = byte(len(sel))
|
||||
crc.Write(scratch[:1])
|
||||
if _, err := w.Write(scratch[:1]); err != nil {
|
||||
return err
|
||||
}
|
||||
selBytes := []byte(sel)
|
||||
crc.Write(selBytes)
|
||||
if _, err := w.Write(selBytes); err != nil {
|
||||
return err
|
||||
}
|
||||
buf[p] = byte(len(sel))
|
||||
p++
|
||||
p += copy(buf[p:], sel)
|
||||
}
|
||||
|
||||
// Value (4 bytes, float32 bits).
|
||||
binary.LittleEndian.PutUint32(scratch[:4], math.Float32bits(float32(msg.Value)))
|
||||
crc.Write(scratch[:4])
|
||||
if _, err := w.Write(scratch[:4]); err != nil {
|
||||
return err
|
||||
}
|
||||
binary.LittleEndian.PutUint32(buf[p:p+4], math.Float32bits(float32(msg.Value)))
|
||||
p += 4
|
||||
|
||||
// CRC32 (4 bytes).
|
||||
binary.LittleEndian.PutUint32(scratch[:4], crc.Sum32())
|
||||
_, err := w.Write(scratch[:4])
|
||||
return err
|
||||
// CRC32 over payload (bytes 8..8+payloadSize).
|
||||
crc := crc32.ChecksumIEEE(buf[8 : 8+payloadSize])
|
||||
binary.LittleEndian.PutUint32(buf[p:p+4], crc)
|
||||
|
||||
// Single atomic write to the buffered writer.
|
||||
n, err := w.Write(buf)
|
||||
return n, err
|
||||
}
|
||||
|
||||
// readWALRecord reads one WAL record from the reader.
|
||||
@@ -655,7 +695,10 @@ func (m *MemoryStore) ToCheckpointWAL(dir string, from, to int64) (int, []string
|
||||
selector []string
|
||||
}
|
||||
|
||||
n, errs := int32(0), int32(0)
|
||||
totalWork := len(levels)
|
||||
cclog.Infof("[METRICSTORE]> Starting binary checkpoint for %d hosts with %d workers", totalWork, Keys.NumWorkers)
|
||||
|
||||
n, errs, completed := int32(0), int32(0), int32(0)
|
||||
var successDirs []string
|
||||
var successMu sync.Mutex
|
||||
|
||||
@@ -663,6 +706,22 @@ func (m *MemoryStore) ToCheckpointWAL(dir string, from, to int64) (int, []string
|
||||
wg.Add(Keys.NumWorkers)
|
||||
work := make(chan workItem, Keys.NumWorkers*2)
|
||||
|
||||
// Progress logging goroutine.
|
||||
stopProgress := make(chan struct{})
|
||||
go func() {
|
||||
ticker := time.NewTicker(10 * time.Second)
|
||||
defer ticker.Stop()
|
||||
for {
|
||||
select {
|
||||
case <-ticker.C:
|
||||
cclog.Infof("[METRICSTORE]> Checkpoint progress: %d/%d hosts (%d written, %d errors)",
|
||||
atomic.LoadInt32(&completed), totalWork, atomic.LoadInt32(&n), atomic.LoadInt32(&errs))
|
||||
case <-stopProgress:
|
||||
return
|
||||
}
|
||||
}
|
||||
}()
|
||||
|
||||
for range Keys.NumWorkers {
|
||||
go func() {
|
||||
defer wg.Done()
|
||||
@@ -670,6 +729,7 @@ func (m *MemoryStore) ToCheckpointWAL(dir string, from, to int64) (int, []string
|
||||
err := wi.level.toCheckpointBinary(wi.hostDir, from, to, m)
|
||||
if err != nil {
|
||||
if err == ErrNoNewArchiveData {
|
||||
atomic.AddInt32(&completed, 1)
|
||||
continue
|
||||
}
|
||||
cclog.Errorf("[METRICSTORE]> binary checkpoint error for %s: %v", wi.hostDir, err)
|
||||
@@ -680,6 +740,7 @@ func (m *MemoryStore) ToCheckpointWAL(dir string, from, to int64) (int, []string
|
||||
successDirs = append(successDirs, wi.hostDir)
|
||||
successMu.Unlock()
|
||||
}
|
||||
atomic.AddInt32(&completed, 1)
|
||||
}
|
||||
}()
|
||||
}
|
||||
@@ -694,6 +755,7 @@ func (m *MemoryStore) ToCheckpointWAL(dir string, from, to int64) (int, []string
|
||||
}
|
||||
close(work)
|
||||
wg.Wait()
|
||||
close(stopProgress)
|
||||
|
||||
if errs > 0 {
|
||||
return int(n), successDirs, fmt.Errorf("[METRICSTORE]> %d errors during binary checkpoint (%d successes)", errs, n)
|
||||
|
||||
381
tools/binaryCheckpointReader/binaryCheckpointReader.go
Normal file
381
tools/binaryCheckpointReader/binaryCheckpointReader.go
Normal file
@@ -0,0 +1,381 @@
|
||||
// Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
|
||||
// All rights reserved. This file is part of cc-backend.
|
||||
// Use of this source code is governed by a MIT-style
|
||||
// license that can be found in the LICENSE file.
|
||||
|
||||
// binaryCheckpointReader reads .wal or .bin checkpoint files produced by the
|
||||
// metricstore WAL/snapshot system and dumps their contents to a human-readable
|
||||
// .txt file (same name as input, with .txt extension).
|
||||
//
|
||||
// Usage:
|
||||
//
|
||||
// go run ./tools/binaryCheckpointReader <file.wal|file.bin>
|
||||
package main
|
||||
|
||||
import (
|
||||
"bufio"
|
||||
"encoding/binary"
|
||||
"fmt"
|
||||
"hash/crc32"
|
||||
"io"
|
||||
"math"
|
||||
"os"
|
||||
"path/filepath"
|
||||
"strings"
|
||||
"time"
|
||||
)
|
||||
|
||||
// Magic numbers matching metricstore/walCheckpoint.go.
|
||||
const (
|
||||
walFileMagic = uint32(0xCC1DA701)
|
||||
walRecordMagic = uint32(0xCC1DA7A1)
|
||||
snapFileMagic = uint32(0xCC5B0001)
|
||||
)
|
||||
|
||||
func main() {
|
||||
if len(os.Args) != 2 {
|
||||
fmt.Fprintf(os.Stderr, "Usage: %s <file.wal|file.bin>\n", os.Args[0])
|
||||
os.Exit(1)
|
||||
}
|
||||
|
||||
inputPath := os.Args[1]
|
||||
ext := strings.ToLower(filepath.Ext(inputPath))
|
||||
|
||||
if ext != ".wal" && ext != ".bin" {
|
||||
fmt.Fprintf(os.Stderr, "Error: file must have .wal or .bin extension, got %q\n", ext)
|
||||
os.Exit(1)
|
||||
}
|
||||
|
||||
f, err := os.Open(inputPath)
|
||||
if err != nil {
|
||||
fmt.Fprintf(os.Stderr, "Error opening %s: %v\n", inputPath, err)
|
||||
os.Exit(1)
|
||||
}
|
||||
defer f.Close()
|
||||
|
||||
// Output file: replace extension with .txt
|
||||
outputPath := strings.TrimSuffix(inputPath, filepath.Ext(inputPath)) + ".txt"
|
||||
out, err := os.Create(outputPath)
|
||||
if err != nil {
|
||||
fmt.Fprintf(os.Stderr, "Error creating output %s: %v\n", outputPath, err)
|
||||
os.Exit(1)
|
||||
}
|
||||
defer out.Close()
|
||||
|
||||
w := bufio.NewWriter(out)
|
||||
defer w.Flush()
|
||||
|
||||
switch ext {
|
||||
case ".wal":
|
||||
err = dumpWAL(f, w)
|
||||
case ".bin":
|
||||
err = dumpBinarySnapshot(f, w)
|
||||
}
|
||||
|
||||
if err != nil {
|
||||
fmt.Fprintf(os.Stderr, "Error reading %s: %v\n", inputPath, err)
|
||||
os.Exit(1)
|
||||
}
|
||||
|
||||
w.Flush()
|
||||
fmt.Printf("Output written to %s\n", outputPath)
|
||||
}
|
||||
|
||||
// ---------- WAL reader ----------
|
||||
|
||||
func dumpWAL(f *os.File, w *bufio.Writer) error {
|
||||
br := bufio.NewReader(f)
|
||||
|
||||
// Read and verify file header magic.
|
||||
var fileMagic uint32
|
||||
if err := binary.Read(br, binary.LittleEndian, &fileMagic); err != nil {
|
||||
if err == io.EOF {
|
||||
fmt.Fprintln(w, "WAL file is empty (0 bytes).")
|
||||
return nil
|
||||
}
|
||||
return fmt.Errorf("read file header: %w", err)
|
||||
}
|
||||
|
||||
if fileMagic != walFileMagic {
|
||||
return fmt.Errorf("invalid WAL file magic 0x%08X (expected 0x%08X)", fileMagic, walFileMagic)
|
||||
}
|
||||
|
||||
fmt.Fprintf(w, "=== WAL File Dump ===\n")
|
||||
fmt.Fprintf(w, "File: %s\n", f.Name())
|
||||
fmt.Fprintf(w, "File Magic: 0x%08X (valid)\n\n", fileMagic)
|
||||
|
||||
recordNum := 0
|
||||
for {
|
||||
msg, err := readWALRecord(br)
|
||||
if err != nil {
|
||||
fmt.Fprintf(w, "--- Record #%d: ERROR ---\n", recordNum+1)
|
||||
fmt.Fprintf(w, " Error: %v\n", err)
|
||||
fmt.Fprintf(w, " (stopping replay — likely truncated trailing record)\n\n")
|
||||
break
|
||||
}
|
||||
if msg == nil {
|
||||
break // Clean EOF
|
||||
}
|
||||
|
||||
recordNum++
|
||||
ts := time.Unix(msg.Timestamp, 0).UTC()
|
||||
|
||||
fmt.Fprintf(w, "--- Record #%d ---\n", recordNum)
|
||||
fmt.Fprintf(w, " Timestamp: %d (%s)\n", msg.Timestamp, ts.Format(time.RFC3339))
|
||||
fmt.Fprintf(w, " Metric: %s\n", msg.MetricName)
|
||||
if len(msg.Selector) > 0 {
|
||||
fmt.Fprintf(w, " Selectors: [%s]\n", strings.Join(msg.Selector, ", "))
|
||||
} else {
|
||||
fmt.Fprintf(w, " Selectors: (none)\n")
|
||||
}
|
||||
fmt.Fprintf(w, " Value: %g\n\n", msg.Value)
|
||||
}
|
||||
|
||||
fmt.Fprintf(w, "=== Total valid records: %d ===\n", recordNum)
|
||||
return nil
|
||||
}
|
||||
|
||||
type walMessage struct {
|
||||
MetricName string
|
||||
Selector []string
|
||||
Value float32
|
||||
Timestamp int64
|
||||
}
|
||||
|
||||
func readWALRecord(r io.Reader) (*walMessage, error) {
|
||||
var magic uint32
|
||||
if err := binary.Read(r, binary.LittleEndian, &magic); err != nil {
|
||||
if err == io.EOF {
|
||||
return nil, nil
|
||||
}
|
||||
return nil, fmt.Errorf("read record magic: %w", err)
|
||||
}
|
||||
|
||||
if magic != walRecordMagic {
|
||||
return nil, fmt.Errorf("invalid record magic 0x%08X (expected 0x%08X)", magic, walRecordMagic)
|
||||
}
|
||||
|
||||
var payloadLen uint32
|
||||
if err := binary.Read(r, binary.LittleEndian, &payloadLen); err != nil {
|
||||
return nil, fmt.Errorf("read payload length: %w", err)
|
||||
}
|
||||
|
||||
if payloadLen > 1<<20 {
|
||||
return nil, fmt.Errorf("record payload too large: %d bytes", payloadLen)
|
||||
}
|
||||
|
||||
payload := make([]byte, payloadLen)
|
||||
if _, err := io.ReadFull(r, payload); err != nil {
|
||||
return nil, fmt.Errorf("read payload: %w", err)
|
||||
}
|
||||
|
||||
var storedCRC uint32
|
||||
if err := binary.Read(r, binary.LittleEndian, &storedCRC); err != nil {
|
||||
return nil, fmt.Errorf("read CRC: %w", err)
|
||||
}
|
||||
|
||||
if crc32.ChecksumIEEE(payload) != storedCRC {
|
||||
return nil, fmt.Errorf("CRC mismatch (truncated write or corruption)")
|
||||
}
|
||||
|
||||
return parseWALPayload(payload)
|
||||
}
|
||||
|
||||
func parseWALPayload(payload []byte) (*walMessage, error) {
|
||||
if len(payload) < 8+2+1+4 {
|
||||
return nil, fmt.Errorf("payload too short: %d bytes", len(payload))
|
||||
}
|
||||
|
||||
offset := 0
|
||||
|
||||
// Timestamp (8 bytes).
|
||||
ts := int64(binary.LittleEndian.Uint64(payload[offset : offset+8]))
|
||||
offset += 8
|
||||
|
||||
// Metric name (2-byte length + bytes).
|
||||
if offset+2 > len(payload) {
|
||||
return nil, fmt.Errorf("metric name length overflows payload")
|
||||
}
|
||||
mLen := int(binary.LittleEndian.Uint16(payload[offset : offset+2]))
|
||||
offset += 2
|
||||
|
||||
if offset+mLen > len(payload) {
|
||||
return nil, fmt.Errorf("metric name overflows payload")
|
||||
}
|
||||
metricName := string(payload[offset : offset+mLen])
|
||||
offset += mLen
|
||||
|
||||
// Selector count (1 byte).
|
||||
if offset >= len(payload) {
|
||||
return nil, fmt.Errorf("selector count overflows payload")
|
||||
}
|
||||
selCount := int(payload[offset])
|
||||
offset++
|
||||
|
||||
selectors := make([]string, selCount)
|
||||
for i := range selCount {
|
||||
if offset >= len(payload) {
|
||||
return nil, fmt.Errorf("selector[%d] length overflows payload", i)
|
||||
}
|
||||
sLen := int(payload[offset])
|
||||
offset++
|
||||
|
||||
if offset+sLen > len(payload) {
|
||||
return nil, fmt.Errorf("selector[%d] data overflows payload", i)
|
||||
}
|
||||
selectors[i] = string(payload[offset : offset+sLen])
|
||||
offset += sLen
|
||||
}
|
||||
|
||||
// Value (4 bytes, float32 bits).
|
||||
if offset+4 > len(payload) {
|
||||
return nil, fmt.Errorf("value overflows payload")
|
||||
}
|
||||
bits := binary.LittleEndian.Uint32(payload[offset : offset+4])
|
||||
value := math.Float32frombits(bits)
|
||||
|
||||
return &walMessage{
|
||||
MetricName: metricName,
|
||||
Timestamp: ts,
|
||||
Selector: selectors,
|
||||
Value: value,
|
||||
}, nil
|
||||
}
|
||||
|
||||
// ---------- Binary snapshot reader ----------
|
||||
|
||||
func dumpBinarySnapshot(f *os.File, w *bufio.Writer) error {
|
||||
br := bufio.NewReader(f)
|
||||
|
||||
var magic uint32
|
||||
if err := binary.Read(br, binary.LittleEndian, &magic); err != nil {
|
||||
return fmt.Errorf("read magic: %w", err)
|
||||
}
|
||||
if magic != snapFileMagic {
|
||||
return fmt.Errorf("invalid snapshot magic 0x%08X (expected 0x%08X)", magic, snapFileMagic)
|
||||
}
|
||||
|
||||
var from, to int64
|
||||
if err := binary.Read(br, binary.LittleEndian, &from); err != nil {
|
||||
return fmt.Errorf("read from: %w", err)
|
||||
}
|
||||
if err := binary.Read(br, binary.LittleEndian, &to); err != nil {
|
||||
return fmt.Errorf("read to: %w", err)
|
||||
}
|
||||
|
||||
fromTime := time.Unix(from, 0).UTC()
|
||||
toTime := time.Unix(to, 0).UTC()
|
||||
|
||||
fmt.Fprintf(w, "=== Binary Snapshot Dump ===\n")
|
||||
fmt.Fprintf(w, "File: %s\n", f.Name())
|
||||
fmt.Fprintf(w, "Magic: 0x%08X (valid)\n", magic)
|
||||
fmt.Fprintf(w, "From: %d (%s)\n", from, fromTime.Format(time.RFC3339))
|
||||
fmt.Fprintf(w, "To: %d (%s)\n\n", to, toTime.Format(time.RFC3339))
|
||||
|
||||
return dumpBinaryLevel(br, w, 0)
|
||||
}
|
||||
|
||||
func dumpBinaryLevel(r io.Reader, w *bufio.Writer, depth int) error {
|
||||
indent := strings.Repeat(" ", depth)
|
||||
|
||||
var numMetrics uint32
|
||||
if err := binary.Read(r, binary.LittleEndian, &numMetrics); err != nil {
|
||||
return fmt.Errorf("read num_metrics: %w", err)
|
||||
}
|
||||
|
||||
if numMetrics > 0 {
|
||||
fmt.Fprintf(w, "%sMetrics (%d):\n", indent, numMetrics)
|
||||
}
|
||||
|
||||
for i := range numMetrics {
|
||||
name, err := readString16(r)
|
||||
if err != nil {
|
||||
return fmt.Errorf("read metric name [%d]: %w", i, err)
|
||||
}
|
||||
|
||||
var freq, start int64
|
||||
if err := binary.Read(r, binary.LittleEndian, &freq); err != nil {
|
||||
return fmt.Errorf("read frequency for %s: %w", name, err)
|
||||
}
|
||||
if err := binary.Read(r, binary.LittleEndian, &start); err != nil {
|
||||
return fmt.Errorf("read start for %s: %w", name, err)
|
||||
}
|
||||
|
||||
var numValues uint32
|
||||
if err := binary.Read(r, binary.LittleEndian, &numValues); err != nil {
|
||||
return fmt.Errorf("read num_values for %s: %w", name, err)
|
||||
}
|
||||
|
||||
startTime := time.Unix(start, 0).UTC()
|
||||
|
||||
fmt.Fprintf(w, "%s [%s]\n", indent, name)
|
||||
fmt.Fprintf(w, "%s Frequency: %d s\n", indent, freq)
|
||||
fmt.Fprintf(w, "%s Start: %d (%s)\n", indent, start, startTime.Format(time.RFC3339))
|
||||
fmt.Fprintf(w, "%s Values (%d):", indent, numValues)
|
||||
|
||||
if numValues == 0 {
|
||||
fmt.Fprintln(w, " (none)")
|
||||
} else {
|
||||
fmt.Fprintln(w)
|
||||
// Print values in rows of 10 for readability.
|
||||
for j := range numValues {
|
||||
var bits uint32
|
||||
if err := binary.Read(r, binary.LittleEndian, &bits); err != nil {
|
||||
return fmt.Errorf("read value[%d] for %s: %w", j, name, err)
|
||||
}
|
||||
val := math.Float32frombits(bits)
|
||||
|
||||
if j%10 == 0 {
|
||||
if j > 0 {
|
||||
fmt.Fprintln(w)
|
||||
}
|
||||
// Print the timestamp for this row's first value.
|
||||
rowTS := start + int64(j)*freq
|
||||
fmt.Fprintf(w, "%s [%s] ", indent, time.Unix(rowTS, 0).UTC().Format("15:04:05"))
|
||||
}
|
||||
|
||||
if math.IsNaN(float64(val)) {
|
||||
fmt.Fprintf(w, "NaN ")
|
||||
} else {
|
||||
fmt.Fprintf(w, "%g ", val)
|
||||
}
|
||||
}
|
||||
fmt.Fprintln(w)
|
||||
}
|
||||
}
|
||||
|
||||
var numChildren uint32
|
||||
if err := binary.Read(r, binary.LittleEndian, &numChildren); err != nil {
|
||||
return fmt.Errorf("read num_children: %w", err)
|
||||
}
|
||||
|
||||
if numChildren > 0 {
|
||||
fmt.Fprintf(w, "%sChildren (%d):\n", indent, numChildren)
|
||||
}
|
||||
|
||||
for i := range numChildren {
|
||||
childName, err := readString16(r)
|
||||
if err != nil {
|
||||
return fmt.Errorf("read child name [%d]: %w", i, err)
|
||||
}
|
||||
|
||||
fmt.Fprintf(w, "%s [%s]\n", indent, childName)
|
||||
if err := dumpBinaryLevel(r, w, depth+2); err != nil {
|
||||
return fmt.Errorf("read child %s: %w", childName, err)
|
||||
}
|
||||
}
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
func readString16(r io.Reader) (string, error) {
|
||||
var sLen uint16
|
||||
if err := binary.Read(r, binary.LittleEndian, &sLen); err != nil {
|
||||
return "", err
|
||||
}
|
||||
buf := make([]byte, sLen)
|
||||
if _, err := io.ReadFull(r, buf); err != nil {
|
||||
return "", err
|
||||
}
|
||||
return string(buf), nil
|
||||
}
|
||||
55
web/frontend/package-lock.json
generated
55
web/frontend/package-lock.json
generated
@@ -1,12 +1,12 @@
|
||||
{
|
||||
"name": "cc-frontend",
|
||||
"version": "1.5.2",
|
||||
"version": "1.5.3",
|
||||
"lockfileVersion": 4,
|
||||
"requires": true,
|
||||
"packages": {
|
||||
"": {
|
||||
"name": "cc-frontend",
|
||||
"version": "1.5.2",
|
||||
"version": "1.5.3",
|
||||
"license": "MIT",
|
||||
"dependencies": {
|
||||
"@rollup/plugin-replace": "^6.0.3",
|
||||
@@ -328,6 +328,9 @@
|
||||
"cpu": [
|
||||
"arm"
|
||||
],
|
||||
"libc": [
|
||||
"glibc"
|
||||
],
|
||||
"license": "MIT",
|
||||
"optional": true,
|
||||
"os": [
|
||||
@@ -341,6 +344,9 @@
|
||||
"cpu": [
|
||||
"arm"
|
||||
],
|
||||
"libc": [
|
||||
"musl"
|
||||
],
|
||||
"license": "MIT",
|
||||
"optional": true,
|
||||
"os": [
|
||||
@@ -354,6 +360,9 @@
|
||||
"cpu": [
|
||||
"arm64"
|
||||
],
|
||||
"libc": [
|
||||
"glibc"
|
||||
],
|
||||
"license": "MIT",
|
||||
"optional": true,
|
||||
"os": [
|
||||
@@ -367,6 +376,9 @@
|
||||
"cpu": [
|
||||
"arm64"
|
||||
],
|
||||
"libc": [
|
||||
"musl"
|
||||
],
|
||||
"license": "MIT",
|
||||
"optional": true,
|
||||
"os": [
|
||||
@@ -380,6 +392,9 @@
|
||||
"cpu": [
|
||||
"loong64"
|
||||
],
|
||||
"libc": [
|
||||
"glibc"
|
||||
],
|
||||
"license": "MIT",
|
||||
"optional": true,
|
||||
"os": [
|
||||
@@ -393,6 +408,9 @@
|
||||
"cpu": [
|
||||
"loong64"
|
||||
],
|
||||
"libc": [
|
||||
"musl"
|
||||
],
|
||||
"license": "MIT",
|
||||
"optional": true,
|
||||
"os": [
|
||||
@@ -406,6 +424,9 @@
|
||||
"cpu": [
|
||||
"ppc64"
|
||||
],
|
||||
"libc": [
|
||||
"glibc"
|
||||
],
|
||||
"license": "MIT",
|
||||
"optional": true,
|
||||
"os": [
|
||||
@@ -419,6 +440,9 @@
|
||||
"cpu": [
|
||||
"ppc64"
|
||||
],
|
||||
"libc": [
|
||||
"musl"
|
||||
],
|
||||
"license": "MIT",
|
||||
"optional": true,
|
||||
"os": [
|
||||
@@ -432,6 +456,9 @@
|
||||
"cpu": [
|
||||
"riscv64"
|
||||
],
|
||||
"libc": [
|
||||
"glibc"
|
||||
],
|
||||
"license": "MIT",
|
||||
"optional": true,
|
||||
"os": [
|
||||
@@ -445,6 +472,9 @@
|
||||
"cpu": [
|
||||
"riscv64"
|
||||
],
|
||||
"libc": [
|
||||
"musl"
|
||||
],
|
||||
"license": "MIT",
|
||||
"optional": true,
|
||||
"os": [
|
||||
@@ -458,6 +488,9 @@
|
||||
"cpu": [
|
||||
"s390x"
|
||||
],
|
||||
"libc": [
|
||||
"glibc"
|
||||
],
|
||||
"license": "MIT",
|
||||
"optional": true,
|
||||
"os": [
|
||||
@@ -471,6 +504,9 @@
|
||||
"cpu": [
|
||||
"x64"
|
||||
],
|
||||
"libc": [
|
||||
"glibc"
|
||||
],
|
||||
"license": "MIT",
|
||||
"optional": true,
|
||||
"os": [
|
||||
@@ -484,6 +520,9 @@
|
||||
"cpu": [
|
||||
"x64"
|
||||
],
|
||||
"libc": [
|
||||
"musl"
|
||||
],
|
||||
"license": "MIT",
|
||||
"optional": true,
|
||||
"os": [
|
||||
@@ -959,9 +998,9 @@
|
||||
"license": "MIT"
|
||||
},
|
||||
"node_modules/picomatch": {
|
||||
"version": "4.0.4",
|
||||
"resolved": "https://registry.npmjs.org/picomatch/-/picomatch-4.0.4.tgz",
|
||||
"integrity": "sha512-QP88BAKvMam/3NxH6vj2o21R6MjxZUAd6nlwAS/pnGvN9IVLocLHxGYIzFhg6fUQ+5th6P4dv4eW9jX3DSIj7A==",
|
||||
"version": "4.0.3",
|
||||
"resolved": "https://registry.npmjs.org/picomatch/-/picomatch-4.0.3.tgz",
|
||||
"integrity": "sha512-5gTmgEY/sqK6gFXLIsQNH19lWb4ebPDLA4SdLP7dsWkIXHWlG66oPuVvXSGFPppYZz8ZDZq0dYYrbHfBCVUb1Q==",
|
||||
"license": "MIT",
|
||||
"engines": {
|
||||
"node": ">=12"
|
||||
@@ -1095,9 +1134,9 @@
|
||||
}
|
||||
},
|
||||
"node_modules/rollup-plugin-svelte/node_modules/picomatch": {
|
||||
"version": "2.3.2",
|
||||
"resolved": "https://registry.npmjs.org/picomatch/-/picomatch-2.3.2.tgz",
|
||||
"integrity": "sha512-V7+vQEJ06Z+c5tSye8S+nHUfI51xoXIXjHQ99cQtKUkQqqO1kO/KCJUfZXuB47h/YBlDhah2H3hdUGXn8ie0oA==",
|
||||
"version": "2.3.1",
|
||||
"resolved": "https://registry.npmjs.org/picomatch/-/picomatch-2.3.1.tgz",
|
||||
"integrity": "sha512-JU3teHTNjmE2VCGFzuY8EXzCDVwEqB2a8fsIvwaStHhAWJEeVd1o1QD80CU6+ZdEXXSLbSsuLwJjkCBWqRQUVA==",
|
||||
"dev": true,
|
||||
"license": "MIT",
|
||||
"engines": {
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
{
|
||||
"name": "cc-frontend",
|
||||
"version": "1.5.2",
|
||||
"version": "1.5.3",
|
||||
"license": "MIT",
|
||||
"scripts": {
|
||||
"build": "rollup -c",
|
||||
|
||||
@@ -57,7 +57,7 @@
|
||||
let entries = $state([]);
|
||||
let loading = $state(false);
|
||||
let error = $state(null);
|
||||
let timer = $state(null);
|
||||
let timer = null;
|
||||
|
||||
function levelColor(priority) {
|
||||
if (priority <= 2) return "danger";
|
||||
|
||||
Reference in New Issue
Block a user