9 Commits

18 changed files with 2421 additions and 823 deletions

View File

@@ -10,7 +10,6 @@ import (
"encoding/json" "encoding/json"
"errors" "errors"
"fmt" "fmt"
"io"
"net/http" "net/http"
"strconv" "strconv"
"strings" "strings"
@@ -90,16 +89,17 @@ func freeMetrics(rw http.ResponseWriter, r *http.Request) {
// @security ApiKeyAuth // @security ApiKeyAuth
// @router /write/ [post] // @router /write/ [post]
func writeMetrics(rw http.ResponseWriter, r *http.Request) { func writeMetrics(rw http.ResponseWriter, r *http.Request) {
bytes, err := io.ReadAll(r.Body)
rw.Header().Add("Content-Type", "application/json") rw.Header().Add("Content-Type", "application/json")
if err != nil {
handleError(err, http.StatusInternalServerError, rw)
return
}
// Extract the "cluster" query parameter without allocating a url.Values map.
cluster := queryParam(r.URL.RawQuery, "cluster")
// Stream directly from the request body instead of copying it into a
// temporary buffer via io.ReadAll. The line-protocol decoder supports
// io.Reader natively, so this avoids the largest heap allocation.
ms := metricstore.GetMemoryStore() ms := metricstore.GetMemoryStore()
dec := lineprotocol.NewDecoderWithBytes(bytes) dec := lineprotocol.NewDecoder(r.Body)
if err := metricstore.DecodeLine(dec, ms, r.URL.Query().Get("cluster")); err != nil { if err := metricstore.DecodeLine(dec, ms, cluster); err != nil {
cclog.Errorf("/api/write error: %s", err.Error()) cclog.Errorf("/api/write error: %s", err.Error())
handleError(err, http.StatusBadRequest, rw) handleError(err, http.StatusBadRequest, rw)
return return
@@ -107,6 +107,20 @@ func writeMetrics(rw http.ResponseWriter, r *http.Request) {
rw.WriteHeader(http.StatusOK) rw.WriteHeader(http.StatusOK)
} }
// queryParam extracts a single query-parameter value from a raw query string
// without allocating a url.Values map. Returns "" if the key is not present.
func queryParam(raw, key string) string {
for raw != "" {
var kv string
kv, raw, _ = strings.Cut(raw, "&")
k, v, _ := strings.Cut(kv, "=")
if k == key {
return v
}
}
return ""
}
// handleDebug godoc // handleDebug godoc
// @summary Debug endpoint // @summary Debug endpoint
// @tags debug // @tags debug

View File

@@ -302,7 +302,7 @@ func (api *RestAPI) runTagger(rw http.ResponseWriter, r *http.Request) {
rw.Header().Set("Content-Type", "text/plain") rw.Header().Set("Content-Type", "text/plain")
rw.WriteHeader(http.StatusOK) rw.WriteHeader(http.StatusOK)
if _, err := rw.Write([]byte(fmt.Sprintf("Tagger %s started", name))); err != nil { if _, err := rw.Write(fmt.Appendf(nil, "Tagger %s started", name)); err != nil {
cclog.Errorf("Failed to write response: %v", err) cclog.Errorf("Failed to write response: %v", err)
} }
} }

View File

@@ -501,9 +501,7 @@ func (fsa *FsArchive) Iter(loadMetricData bool) <-chan JobContainer {
var wg sync.WaitGroup var wg sync.WaitGroup
for range numWorkers { for range numWorkers {
wg.Add(1) wg.Go(func() {
go func() {
defer wg.Done()
for jobPath := range jobPaths { for jobPath := range jobPaths {
job, err := loadJobMeta(filepath.Join(jobPath, "meta.json")) job, err := loadJobMeta(filepath.Join(jobPath, "meta.json"))
if err != nil && !errors.Is(err, &jsonschema.ValidationError{}) { if err != nil && !errors.Is(err, &jsonschema.ValidationError{}) {
@@ -529,7 +527,7 @@ func (fsa *FsArchive) Iter(loadMetricData bool) <-chan JobContainer {
ch <- JobContainer{Meta: job, Data: nil} ch <- JobContainer{Meta: job, Data: nil}
} }
} }
}() })
} }
clustersDir, err := os.ReadDir(fsa.path) clustersDir, err := os.ReadDir(fsa.path)

View File

@@ -821,9 +821,7 @@ func (s3a *S3Archive) Iter(loadMetricData bool) <-chan JobContainer {
var wg sync.WaitGroup var wg sync.WaitGroup
for range numWorkers { for range numWorkers {
wg.Add(1) wg.Go(func() {
go func() {
defer wg.Done()
for metaKey := range metaKeys { for metaKey := range metaKeys {
result, err := s3a.client.GetObject(ctx, &s3.GetObjectInput{ result, err := s3a.client.GetObject(ctx, &s3.GetObjectInput{
Bucket: aws.String(s3a.bucket), Bucket: aws.String(s3a.bucket),
@@ -859,7 +857,7 @@ func (s3a *S3Archive) Iter(loadMetricData bool) <-chan JobContainer {
ch <- JobContainer{Meta: job, Data: nil} ch <- JobContainer{Meta: job, Data: nil}
} }
} }
}() })
} }
for _, cluster := range s3a.clusters { for _, cluster := range s3a.clusters {

View File

@@ -576,9 +576,7 @@ func (sa *SqliteArchive) Iter(loadMetricData bool) <-chan JobContainer {
var wg sync.WaitGroup var wg sync.WaitGroup
for range numWorkers { for range numWorkers {
wg.Add(1) wg.Go(func() {
go func() {
defer wg.Done()
for row := range jobRows { for row := range jobRows {
job, err := DecodeJobMeta(bytes.NewReader(row.metaBlob)) job, err := DecodeJobMeta(bytes.NewReader(row.metaBlob))
if err != nil { if err != nil {
@@ -617,7 +615,7 @@ func (sa *SqliteArchive) Iter(loadMetricData bool) <-chan JobContainer {
ch <- JobContainer{Meta: job, Data: nil} ch <- JobContainer{Meta: job, Data: nil}
} }
} }
}() })
} }
for { for {

View File

@@ -6,12 +6,9 @@
package metricstore package metricstore
import ( import (
"archive/zip"
"bufio"
"context" "context"
"errors" "errors"
"fmt" "fmt"
"io"
"os" "os"
"path/filepath" "path/filepath"
"sync" "sync"
@@ -47,11 +44,9 @@ func CleanUp(wg *sync.WaitGroup, ctx context.Context) {
} }
} }
// runWorker takes simple values to configure what it does // cleanUpWorker takes simple values to configure what it does
func cleanUpWorker(wg *sync.WaitGroup, ctx context.Context, interval string, mode string, cleanupDir string, delete bool) { func cleanUpWorker(wg *sync.WaitGroup, ctx context.Context, interval string, mode string, cleanupDir string, delete bool) {
wg.Add(1) wg.Go(func() {
go func() {
defer wg.Done()
d, err := time.ParseDuration(interval) d, err := time.ParseDuration(interval)
if err != nil { if err != nil {
@@ -77,30 +72,39 @@ func cleanUpWorker(wg *sync.WaitGroup, ctx context.Context, interval string, mod
if err != nil { if err != nil {
cclog.Errorf("[METRICSTORE]> %s failed: %s", mode, err.Error()) cclog.Errorf("[METRICSTORE]> %s failed: %s", mode, err.Error())
} else { } else {
if delete && cleanupDir == "" { if delete {
cclog.Infof("[METRICSTORE]> done: %d checkpoints deleted", n) cclog.Infof("[METRICSTORE]> done: %d checkpoints deleted", n)
} else { } else {
cclog.Infof("[METRICSTORE]> done: %d files zipped and moved to archive", n) cclog.Infof("[METRICSTORE]> done: %d checkpoint files archived to parquet", n)
} }
} }
} }
} }
}() })
} }
var ErrNoNewArchiveData error = errors.New("all data already archived") var ErrNoNewArchiveData error = errors.New("all data already archived")
// Delete or ZIP all checkpoint files older than `from` together and write them to the `cleanupDir`, // CleanupCheckpoints deletes or archives all checkpoint files older than `from`.
// deleting/moving them from the `checkpointsDir`. // When archiving, consolidates all hosts per cluster into a single Parquet file.
func CleanupCheckpoints(checkpointsDir, cleanupDir string, from int64, deleteInstead bool) (int, error) { func CleanupCheckpoints(checkpointsDir, cleanupDir string, from int64, deleteInstead bool) (int, error) {
if deleteInstead {
return deleteCheckpoints(checkpointsDir, from)
}
return archiveCheckpoints(checkpointsDir, cleanupDir, from)
}
// deleteCheckpoints removes checkpoint files older than `from` across all clusters/hosts.
func deleteCheckpoints(checkpointsDir string, from int64) (int, error) {
entries1, err := os.ReadDir(checkpointsDir) entries1, err := os.ReadDir(checkpointsDir)
if err != nil { if err != nil {
return 0, err return 0, err
} }
type workItem struct { type workItem struct {
cdir, adir string dir string
cluster, host string cluster, host string
} }
var wg sync.WaitGroup var wg sync.WaitGroup
@@ -111,13 +115,29 @@ func CleanupCheckpoints(checkpointsDir, cleanupDir string, from int64, deleteIns
for worker := 0; worker < Keys.NumWorkers; worker++ { for worker := 0; worker < Keys.NumWorkers; worker++ {
go func() { go func() {
defer wg.Done() defer wg.Done()
for workItem := range work { for item := range work {
m, err := cleanupCheckpoints(workItem.cdir, workItem.adir, from, deleteInstead) entries, err := os.ReadDir(item.dir)
if err != nil { if err != nil {
cclog.Errorf("error while archiving %s/%s: %s", workItem.cluster, workItem.host, err.Error()) cclog.Errorf("error reading %s/%s: %s", item.cluster, item.host, err.Error())
atomic.AddInt32(&errs, 1) atomic.AddInt32(&errs, 1)
continue
}
files, err := findFiles(entries, from, false)
if err != nil {
cclog.Errorf("error finding files in %s/%s: %s", item.cluster, item.host, err.Error())
atomic.AddInt32(&errs, 1)
continue
}
for _, checkpoint := range files {
if err := os.Remove(filepath.Join(item.dir, checkpoint)); err != nil {
cclog.Errorf("error deleting %s/%s/%s: %s", item.cluster, item.host, checkpoint, err.Error())
atomic.AddInt32(&errs, 1)
} else {
atomic.AddInt32(&n, 1)
}
} }
atomic.AddInt32(&n, int32(m))
} }
}() }()
} }
@@ -126,14 +146,14 @@ func CleanupCheckpoints(checkpointsDir, cleanupDir string, from int64, deleteIns
entries2, e := os.ReadDir(filepath.Join(checkpointsDir, de1.Name())) entries2, e := os.ReadDir(filepath.Join(checkpointsDir, de1.Name()))
if e != nil { if e != nil {
err = e err = e
continue
} }
for _, de2 := range entries2 { for _, de2 := range entries2 {
cdir := filepath.Join(checkpointsDir, de1.Name(), de2.Name())
adir := filepath.Join(cleanupDir, de1.Name(), de2.Name())
work <- workItem{ work <- workItem{
adir: adir, cdir: cdir, dir: filepath.Join(checkpointsDir, de1.Name(), de2.Name()),
cluster: de1.Name(), host: de2.Name(), cluster: de1.Name(),
host: de2.Name(),
} }
} }
} }
@@ -144,85 +164,118 @@ func CleanupCheckpoints(checkpointsDir, cleanupDir string, from int64, deleteIns
if err != nil { if err != nil {
return int(n), err return int(n), err
} }
if errs > 0 { if errs > 0 {
return int(n), fmt.Errorf("%d errors happened while archiving (%d successes)", errs, n) return int(n), fmt.Errorf("%d errors happened while deleting (%d successes)", errs, n)
} }
return int(n), nil return int(n), nil
} }
// Helper function for `CleanupCheckpoints`. // archiveCheckpoints archives checkpoint files to Parquet format.
func cleanupCheckpoints(dir string, cleanupDir string, from int64, deleteInstead bool) (int, error) { // Produces one Parquet file per cluster: <cleanupDir>/<cluster>/<timestamp>.parquet
entries, err := os.ReadDir(dir) func archiveCheckpoints(checkpointsDir, cleanupDir string, from int64) (int, error) {
clusterEntries, err := os.ReadDir(checkpointsDir)
if err != nil { if err != nil {
return 0, err return 0, err
} }
files, err := findFiles(entries, from, false) totalFiles := 0
if err != nil {
return 0, err
}
if deleteInstead { for _, clusterEntry := range clusterEntries {
n := 0 if !clusterEntry.IsDir() {
for _, checkpoint := range files { continue
filename := filepath.Join(dir, checkpoint)
if err = os.Remove(filename); err != nil {
return n, err
}
n += 1
} }
return n, nil
}
filename := filepath.Join(cleanupDir, fmt.Sprintf("%d.zip", from)) cluster := clusterEntry.Name()
f, err := os.OpenFile(filename, os.O_CREATE|os.O_WRONLY, CheckpointFilePerms) hostEntries, err := os.ReadDir(filepath.Join(checkpointsDir, cluster))
if err != nil && os.IsNotExist(err) {
err = os.MkdirAll(cleanupDir, CheckpointDirPerms)
if err == nil {
f, err = os.OpenFile(filename, os.O_CREATE|os.O_WRONLY, CheckpointFilePerms)
}
}
if err != nil {
return 0, err
}
defer f.Close()
bw := bufio.NewWriter(f)
defer bw.Flush()
zw := zip.NewWriter(bw)
defer zw.Close()
n := 0
for _, checkpoint := range files {
// Use closure to ensure file is closed immediately after use,
// avoiding file descriptor leak from defer in loop
err := func() error {
filename := filepath.Join(dir, checkpoint)
r, err := os.Open(filename)
if err != nil {
return err
}
defer r.Close()
w, err := zw.Create(checkpoint)
if err != nil {
return err
}
if _, err = io.Copy(w, r); err != nil {
return err
}
if err = os.Remove(filename); err != nil {
return err
}
return nil
}()
if err != nil { if err != nil {
return n, err return totalFiles, err
} }
n += 1
// Collect rows from all hosts in this cluster using worker pool
type hostResult struct {
rows []ParquetMetricRow
files []string // checkpoint filenames to delete after successful write
dir string // checkpoint directory for this host
}
results := make(chan hostResult, len(hostEntries))
work := make(chan struct {
dir, host string
}, Keys.NumWorkers)
var wg sync.WaitGroup
errs := int32(0)
wg.Add(Keys.NumWorkers)
for w := 0; w < Keys.NumWorkers; w++ {
go func() {
defer wg.Done()
for item := range work {
rows, files, err := archiveCheckpointsToParquet(item.dir, cluster, item.host, from)
if err != nil {
cclog.Errorf("[METRICSTORE]> error reading checkpoints for %s/%s: %s", cluster, item.host, err.Error())
atomic.AddInt32(&errs, 1)
continue
}
if len(rows) > 0 {
results <- hostResult{rows: rows, files: files, dir: item.dir}
}
}
}()
}
go func() {
for _, hostEntry := range hostEntries {
if !hostEntry.IsDir() {
continue
}
dir := filepath.Join(checkpointsDir, cluster, hostEntry.Name())
work <- struct {
dir, host string
}{dir: dir, host: hostEntry.Name()}
}
close(work)
wg.Wait()
close(results)
}()
// Collect all rows and file info
var allRows []ParquetMetricRow
var allResults []hostResult
for r := range results {
allRows = append(allRows, r.rows...)
allResults = append(allResults, r)
}
if errs > 0 {
return totalFiles, fmt.Errorf("%d errors reading checkpoints for cluster %s", errs, cluster)
}
if len(allRows) == 0 {
continue
}
// Write one Parquet file per cluster
parquetFile := filepath.Join(cleanupDir, cluster, fmt.Sprintf("%d.parquet", from))
if err := writeParquetArchive(parquetFile, allRows); err != nil {
return totalFiles, fmt.Errorf("writing parquet archive for cluster %s: %w", cluster, err)
}
// Delete archived checkpoint files
for _, result := range allResults {
for _, file := range result.files {
filename := filepath.Join(result.dir, file)
if err := os.Remove(filename); err != nil {
cclog.Warnf("[METRICSTORE]> could not remove archived checkpoint %s: %v", filename, err)
} else {
totalFiles++
}
}
}
cclog.Infof("[METRICSTORE]> archived %d rows from %d files for cluster %s to %s",
len(allRows), totalFiles, cluster, parquetFile)
} }
return n, nil return totalFiles, nil
} }

View File

@@ -1,274 +0,0 @@
// Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
// All rights reserved. This file is part of cc-backend.
// Use of this source code is governed by a MIT-style
// license that can be found in the LICENSE file.
// This file implements the binary checkpoint format for fast loading.
//
// The binary format stores metric data in column-oriented layout (per-metric
// float64 arrays) for maximum load speed. Float32 arrays are read/written
// as raw bytes, avoiding per-element parsing overhead.
//
// File format:
//
// Header (28 bytes):
// magic: [4]byte "CCMS"
// version: uint32 LE
// from: int64 LE
// to: int64 LE
//
// Body (recursive):
// nmetrics: uint32 LE
// Per metric:
// name_len: uint16 LE
// name: []byte
// freq: int64 LE
// start: int64 LE
// nvalues: uint32 LE
// data: []float64 LE (NaN = missing)
// nchildren: uint32 LE
// Per child:
// name_len: uint16 LE
// name: []byte
// (recursive body)
package metricstore
import (
"bufio"
"encoding/binary"
"fmt"
"io"
"os"
"path"
"unsafe"
"github.com/ClusterCockpit/cc-lib/v2/schema"
)
var (
binaryMagic = [4]byte{'C', 'C', 'M', 'S'}
binaryVersion = uint32(1)
binaryByteOrder = binary.LittleEndian
floatSize = int(unsafe.Sizeof(schema.Float(0))) // schema.Float is float64
)
// writeBinaryCheckpoint writes a CheckpointFile to a binary checkpoint file on disk.
func writeBinaryCheckpoint(filePath string, cf *CheckpointFile) error {
f, err := os.OpenFile(filePath, os.O_CREATE|os.O_WRONLY|os.O_TRUNC, CheckpointFilePerms)
if err != nil && os.IsNotExist(err) {
if err2 := os.MkdirAll(path.Dir(filePath), CheckpointDirPerms); err2 != nil {
return err2
}
f, err = os.OpenFile(filePath, os.O_CREATE|os.O_WRONLY|os.O_TRUNC, CheckpointFilePerms)
}
if err != nil {
return err
}
defer f.Close()
bw := bufio.NewWriter(f)
// Write header
if _, err := bw.Write(binaryMagic[:]); err != nil {
return err
}
if err := binary.Write(bw, binaryByteOrder, binaryVersion); err != nil {
return err
}
if err := binary.Write(bw, binaryByteOrder, cf.From); err != nil {
return err
}
if err := binary.Write(bw, binaryByteOrder, cf.To); err != nil {
return err
}
// Write body (metrics + children recursively)
if err := writeBinaryBody(bw, cf); err != nil {
return err
}
return bw.Flush()
}
// writeBinaryBody writes the metrics and children of a CheckpointFile.
func writeBinaryBody(w io.Writer, cf *CheckpointFile) error {
if err := binary.Write(w, binaryByteOrder, uint32(len(cf.Metrics))); err != nil {
return err
}
for name, metric := range cf.Metrics {
nameBytes := []byte(name)
if err := binary.Write(w, binaryByteOrder, uint16(len(nameBytes))); err != nil {
return err
}
if _, err := w.Write(nameBytes); err != nil {
return err
}
if err := binary.Write(w, binaryByteOrder, metric.Frequency); err != nil {
return err
}
if err := binary.Write(w, binaryByteOrder, metric.Start); err != nil {
return err
}
if err := binary.Write(w, binaryByteOrder, uint32(len(metric.Data))); err != nil {
return err
}
if err := writeFloatArray(w, metric.Data); err != nil {
return err
}
}
if err := binary.Write(w, binaryByteOrder, uint32(len(cf.Children))); err != nil {
return err
}
for name, child := range cf.Children {
nameBytes := []byte(name)
if err := binary.Write(w, binaryByteOrder, uint16(len(nameBytes))); err != nil {
return err
}
if _, err := w.Write(nameBytes); err != nil {
return err
}
if err := writeBinaryBody(w, child); err != nil {
return err
}
}
return nil
}
// writeFloatArray writes a schema.Float slice as raw little-endian float64 bytes.
func writeFloatArray(w io.Writer, data []schema.Float) error {
if len(data) == 0 {
return nil
}
buf := unsafe.Slice((*byte)(unsafe.Pointer(&data[0])), len(data)*floatSize)
_, err := w.Write(buf)
return err
}
// loadBinaryFile reads a binary checkpoint file into a CheckpointFile.
func loadBinaryFile(filePath string) (*CheckpointFile, error) {
f, err := os.Open(filePath)
if err != nil {
return nil, err
}
defer f.Close()
br := bufio.NewReader(f)
var magic [4]byte
if _, err := io.ReadFull(br, magic[:]); err != nil {
return nil, fmt.Errorf("reading magic: %w", err)
}
if magic != binaryMagic {
return nil, fmt.Errorf("[METRICSTORE]> invalid binary checkpoint magic in %s", filePath)
}
var version uint32
if err := binary.Read(br, binaryByteOrder, &version); err != nil {
return nil, fmt.Errorf("reading version: %w", err)
}
if version != binaryVersion {
return nil, fmt.Errorf("[METRICSTORE]> unsupported binary checkpoint version %d in %s", version, filePath)
}
cf := &CheckpointFile{}
if err := binary.Read(br, binaryByteOrder, &cf.From); err != nil {
return nil, fmt.Errorf("reading from: %w", err)
}
if err := binary.Read(br, binaryByteOrder, &cf.To); err != nil {
return nil, fmt.Errorf("reading to: %w", err)
}
if err := readBinaryBody(br, cf); err != nil {
return nil, err
}
return cf, nil
}
// readBinaryBody reads the metrics and children of a CheckpointFile.
func readBinaryBody(r io.Reader, cf *CheckpointFile) error {
var nmetrics uint32
if err := binary.Read(r, binaryByteOrder, &nmetrics); err != nil {
return fmt.Errorf("reading metric count: %w", err)
}
cf.Metrics = make(map[string]*CheckpointMetrics, nmetrics)
for range nmetrics {
var nameLen uint16
if err := binary.Read(r, binaryByteOrder, &nameLen); err != nil {
return fmt.Errorf("reading metric name length: %w", err)
}
nameBytes := make([]byte, nameLen)
if _, err := io.ReadFull(r, nameBytes); err != nil {
return fmt.Errorf("reading metric name: %w", err)
}
cm := &CheckpointMetrics{}
if err := binary.Read(r, binaryByteOrder, &cm.Frequency); err != nil {
return fmt.Errorf("reading frequency: %w", err)
}
if err := binary.Read(r, binaryByteOrder, &cm.Start); err != nil {
return fmt.Errorf("reading start: %w", err)
}
var nvalues uint32
if err := binary.Read(r, binaryByteOrder, &nvalues); err != nil {
return fmt.Errorf("reading value count: %w", err)
}
var err error
cm.Data, err = readFloatArray(r, int(nvalues))
if err != nil {
return fmt.Errorf("reading data for %s: %w", string(nameBytes), err)
}
cf.Metrics[string(nameBytes)] = cm
}
var nchildren uint32
if err := binary.Read(r, binaryByteOrder, &nchildren); err != nil {
return fmt.Errorf("reading children count: %w", err)
}
cf.Children = make(map[string]*CheckpointFile, nchildren)
for range nchildren {
var nameLen uint16
if err := binary.Read(r, binaryByteOrder, &nameLen); err != nil {
return fmt.Errorf("reading child name length: %w", err)
}
nameBytes := make([]byte, nameLen)
if _, err := io.ReadFull(r, nameBytes); err != nil {
return fmt.Errorf("reading child name: %w", err)
}
child := &CheckpointFile{}
if err := readBinaryBody(r, child); err != nil {
return fmt.Errorf("reading child %s: %w", string(nameBytes), err)
}
cf.Children[string(nameBytes)] = child
}
return nil
}
// readFloatArray reads n float32 values from raw little-endian bytes.
func readFloatArray(r io.Reader, n int) ([]schema.Float, error) {
if n == 0 {
return nil, nil
}
data := make([]schema.Float, n)
buf := unsafe.Slice((*byte)(unsafe.Pointer(&data[0])), n*floatSize)
if _, err := io.ReadFull(r, buf); err != nil {
return nil, err
}
return data, nil
}

View File

@@ -43,6 +43,7 @@ package metricstore
import ( import (
"errors" "errors"
"sync" "sync"
"time"
"github.com/ClusterCockpit/cc-lib/v2/schema" "github.com/ClusterCockpit/cc-lib/v2/schema"
) )
@@ -53,12 +54,97 @@ import (
// of data or reallocation needs to happen on writes. // of data or reallocation needs to happen on writes.
const BufferCap int = DefaultBufferCapacity const BufferCap int = DefaultBufferCapacity
var bufferPool sync.Pool = sync.Pool{ // maxPoolSize caps the number of buffers held in the pool at any time.
New: func() any { // Prevents unbounded memory growth after large retention-cleanup bursts.
const maxPoolSize = 4096
// BufferPool is the global instance.
// It is initialized immediately when the package loads.
var bufferPool = NewPersistentBufferPool()
type PersistentBufferPool struct {
pool []*buffer
mu sync.Mutex
}
// NewPersistentBufferPool creates a dynamic pool for buffers.
func NewPersistentBufferPool() *PersistentBufferPool {
return &PersistentBufferPool{
pool: make([]*buffer, 0),
}
}
func (p *PersistentBufferPool) Get() *buffer {
p.mu.Lock()
defer p.mu.Unlock()
n := len(p.pool)
if n == 0 {
// Pool is empty, allocate a new one
return &buffer{ return &buffer{
data: make([]schema.Float, 0, BufferCap), data: make([]schema.Float, 0, BufferCap),
} }
}, }
// Reuse existing buffer from the pool
b := p.pool[n-1]
p.pool[n-1] = nil // Avoid memory leak
p.pool = p.pool[:n-1]
return b
}
// Put returns b to the pool. The caller must set b.lastUsed = time.Now().Unix()
// before calling Put so that Clean() can evict idle entries correctly.
func (p *PersistentBufferPool) Put(b *buffer) {
// Reset the buffer before putting it back
b.data = b.data[:0]
p.mu.Lock()
defer p.mu.Unlock()
if len(p.pool) >= maxPoolSize {
// Pool is full; drop the buffer and let GC collect it.
return
}
p.pool = append(p.pool, b)
}
// GetSize returns the exact number of buffers currently sitting in the pool.
func (p *PersistentBufferPool) GetSize() int {
p.mu.Lock()
defer p.mu.Unlock()
return len(p.pool)
}
// Clear drains all buffers currently in the pool, allowing the GC to collect them.
func (p *PersistentBufferPool) Clear() {
p.mu.Lock()
defer p.mu.Unlock()
for i := range p.pool {
p.pool[i] = nil
}
p.pool = p.pool[:0]
}
// Clean removes buffers from the pool that haven't been used in the given duration.
// It uses a simple LRU approach based on the lastUsed timestamp.
func (p *PersistentBufferPool) Clean(threshold int64) {
p.mu.Lock()
defer p.mu.Unlock()
// Filter in place, retaining only buffers returned to the pool recently enough.
active := p.pool[:0]
for _, b := range p.pool {
if b.lastUsed >= threshold {
active = append(active, b)
}
}
// Nullify the rest to prevent memory leaks
for i := len(active); i < len(p.pool); i++ {
p.pool[i] = nil
}
p.pool = active
} }
var ( var (
@@ -94,10 +180,11 @@ type buffer struct {
start int64 start int64
archived bool archived bool
closed bool closed bool
lastUsed int64
} }
func newBuffer(ts, freq int64) *buffer { func newBuffer(ts, freq int64) *buffer {
b := bufferPool.Get().(*buffer) b := bufferPool.Get()
b.frequency = freq b.frequency = freq
b.start = ts - (freq / 2) b.start = ts - (freq / 2)
b.prev = nil b.prev = nil
@@ -184,11 +271,13 @@ func (b *buffer) firstWrite() int64 {
// //
// Panics if 'data' slice is too small to hold all values in [from, to). // Panics if 'data' slice is too small to hold all values in [from, to).
func (b *buffer) read(from, to int64, data []schema.Float) ([]schema.Float, int64, int64, error) { func (b *buffer) read(from, to int64, data []schema.Float) ([]schema.Float, int64, int64, error) {
if from < b.firstWrite() { // Walk back to the buffer that covers 'from', adjusting if we hit the oldest.
if b.prev != nil { for from < b.firstWrite() {
return b.prev.read(from, to, data) if b.prev == nil {
from = b.firstWrite()
break
} }
from = b.firstWrite() b = b.prev
} }
i := 0 i := 0
@@ -200,16 +289,17 @@ func (b *buffer) read(from, to int64, data []schema.Float) ([]schema.Float, int6
break break
} }
b = b.next b = b.next
idx = 0 // Recalculate idx in the new buffer; a gap between buffers may exist.
idx = int((t - b.start) / b.frequency)
} }
if idx >= len(b.data) { if idx >= len(b.data) {
if b.next == nil || to <= b.next.start { if b.next == nil || to <= b.next.start {
break break
} }
data[i] += schema.NaN data[i] += schema.NaN // NaN + anything = NaN; propagates missing data
} else if t < b.start { } else if t < b.start {
data[i] += schema.NaN data[i] += schema.NaN // gap before this buffer's first write
} else { } else {
data[i] += b.data[idx] data[i] += b.data[idx]
} }
@@ -240,6 +330,7 @@ func (b *buffer) free(t int64) (delme bool, n int) {
if cap(b.prev.data) != BufferCap { if cap(b.prev.data) != BufferCap {
b.prev.data = make([]schema.Float, 0, BufferCap) b.prev.data = make([]schema.Float, 0, BufferCap)
} }
b.prev.lastUsed = time.Now().Unix()
bufferPool.Put(b.prev) bufferPool.Put(b.prev)
b.prev = nil b.prev = nil
} }
@@ -266,11 +357,12 @@ func (b *buffer) forceFreeOldest() (delme bool, n int) {
// If the previous buffer signals it should be deleted: // If the previous buffer signals it should be deleted:
if delPrev { if delPrev {
// Clear links on the dying buffer to prevent leaks
b.prev.next = nil b.prev.next = nil
b.prev.data = nil // Release the underlying float slice immediately if cap(b.prev.data) != BufferCap {
b.prev.data = make([]schema.Float, 0, BufferCap)
// Remove the link from the current buffer }
b.prev.lastUsed = time.Now().Unix()
bufferPool.Put(b.prev)
b.prev = nil b.prev = nil
} }
return false, freed return false, freed
@@ -299,21 +391,27 @@ func (b *buffer) iterFromTo(from, to int64, callback func(b *buffer) error) erro
return nil return nil
} }
if err := b.prev.iterFromTo(from, to, callback); err != nil { // Collect overlapping buffers walking backwards (newest → oldest).
return err var matching []*buffer
for cur := b; cur != nil; cur = cur.prev {
if from <= cur.end() && cur.start <= to {
matching = append(matching, cur)
}
} }
if from <= b.end() && b.start <= to { // Invoke callback in chronological order (oldest → newest).
return callback(b) for i := len(matching) - 1; i >= 0; i-- {
if err := callback(matching[i]); err != nil {
return err
}
} }
return nil return nil
} }
func (b *buffer) count() int64 { func (b *buffer) count() int64 {
res := int64(len(b.data)) var res int64
if b.prev != nil { for ; b != nil; b = b.prev {
res += b.prev.count() res += int64(len(b.data))
} }
return res return res
} }

View File

@@ -6,16 +6,15 @@
// This file implements checkpoint persistence for the in-memory metric store. // This file implements checkpoint persistence for the in-memory metric store.
// //
// Checkpoints enable graceful restarts by periodically saving in-memory metric // Checkpoints enable graceful restarts by periodically saving in-memory metric
// data to disk. The checkpoint system supports two write formats: // data to disk in JSON or binary format. The checkpoint system:
// - binary (default): fast loading via raw float32 arrays
// - json: human-readable, slightly slower to load
// //
// Key Features: // Key Features:
// - Periodic background checkpointing via the Checkpointing() worker // - Periodic background checkpointing via the Checkpointing() worker
// - Two format families: JSON (human-readable) and WAL+binary (compact, crash-safe)
// - Parallel checkpoint creation and loading using worker pools // - Parallel checkpoint creation and loading using worker pools
// - Hierarchical file organization: checkpoint_dir/cluster/host/timestamp.{bin|json} // - Hierarchical file organization: checkpoint_dir/cluster/host/timestamp.{json|bin}
// - WAL file: checkpoint_dir/cluster/host/current.wal (append-only, per-entry)
// - Only saves unarchived data (archived data is already persisted elsewhere) // - Only saves unarchived data (archived data is already persisted elsewhere)
// - Automatic format detection during loading (supports bin, json, and legacy avro)
// - GC optimization during loading to prevent excessive heap growth // - GC optimization during loading to prevent excessive heap growth
// //
// Checkpoint Workflow: // Checkpoint Workflow:
@@ -28,8 +27,9 @@
// checkpoints/ // checkpoints/
// cluster1/ // cluster1/
// host001/ // host001/
// 1234567890.bin (timestamp = checkpoint start time) // 1234567890.json (JSON format: full subtree snapshot)
// 1234567950.bin // 1234567890.bin (binary format: full subtree snapshot)
// current.wal (WAL format: append-only per-entry log)
// host002/ // host002/
// ... // ...
package metricstore package metricstore
@@ -56,8 +56,9 @@ import (
) )
const ( const (
CheckpointFilePerms = 0o644 // File permissions for checkpoint files CheckpointFilePerms = 0o644 // File permissions for checkpoint files
CheckpointDirPerms = 0o755 // Directory permissions for checkpoint directories CheckpointDirPerms = 0o755 // Directory permissions for checkpoint directories
GCTriggerInterval = DefaultGCTriggerInterval // Interval for triggering GC during checkpoint loading
) )
// CheckpointMetrics represents metric data in a checkpoint file. // CheckpointMetrics represents metric data in a checkpoint file.
@@ -85,9 +86,9 @@ var (
// Checkpointing starts a background worker that periodically saves metric data to disk. // Checkpointing starts a background worker that periodically saves metric data to disk.
// //
// Checkpoints are written at the configured interval (Keys.Checkpoints.Interval) in // Format behaviour:
// either binary or JSON format. The worker respects context cancellation and signals // - "json": Periodic checkpointing based on Keys.Checkpoints.Interval
// completion via the WaitGroup. // - "wal": Periodic binary snapshots + WAL rotation at Keys.Checkpoints.Interval
func Checkpointing(wg *sync.WaitGroup, ctx context.Context) { func Checkpointing(wg *sync.WaitGroup, ctx context.Context) {
lastCheckpointMu.Lock() lastCheckpointMu.Lock()
lastCheckpoint = time.Now() lastCheckpoint = time.Now()
@@ -96,6 +97,7 @@ func Checkpointing(wg *sync.WaitGroup, ctx context.Context) {
ms := GetMemoryStore() ms := GetMemoryStore()
wg.Go(func() { wg.Go(func() {
d, err := time.ParseDuration(Keys.Checkpoints.Interval) d, err := time.ParseDuration(Keys.Checkpoints.Interval)
if err != nil { if err != nil {
cclog.Fatalf("[METRICSTORE]> invalid checkpoint interval '%s': %s", Keys.Checkpoints.Interval, err.Error()) cclog.Fatalf("[METRICSTORE]> invalid checkpoint interval '%s': %s", Keys.Checkpoints.Interval, err.Error())
@@ -117,170 +119,37 @@ func Checkpointing(wg *sync.WaitGroup, ctx context.Context) {
from := lastCheckpoint from := lastCheckpoint
lastCheckpointMu.Unlock() lastCheckpointMu.Unlock()
cclog.Infof("[METRICSTORE]> start checkpointing (starting at %s)...", from.Format(time.RFC3339))
now := time.Now() now := time.Now()
n, err := ms.ToCheckpoint(Keys.Checkpoints.RootDir, cclog.Infof("[METRICSTORE]> start checkpointing (starting at %s)...", from.Format(time.RFC3339))
from.Unix(), now.Unix())
if err != nil { if Keys.Checkpoints.FileFormat == "wal" {
cclog.Errorf("[METRICSTORE]> checkpointing failed: %s", err.Error()) n, hostDirs, err := ms.ToCheckpointWAL(Keys.Checkpoints.RootDir, from.Unix(), now.Unix())
if err != nil {
cclog.Errorf("[METRICSTORE]> binary checkpointing failed: %s", err.Error())
} else {
cclog.Infof("[METRICSTORE]> done: %d binary snapshot files created", n)
lastCheckpointMu.Lock()
lastCheckpoint = now
lastCheckpointMu.Unlock()
// Rotate WAL files for successfully checkpointed hosts.
RotateWALFiles(hostDirs)
}
} else { } else {
cclog.Infof("[METRICSTORE]> done: %d checkpoint files created", n) n, err := ms.ToCheckpoint(Keys.Checkpoints.RootDir, from.Unix(), now.Unix())
lastCheckpointMu.Lock() if err != nil {
lastCheckpoint = now cclog.Errorf("[METRICSTORE]> checkpointing failed: %s", err.Error())
lastCheckpointMu.Unlock() } else {
cclog.Infof("[METRICSTORE]> done: %d checkpoint files created", n)
lastCheckpointMu.Lock()
lastCheckpoint = now
lastCheckpointMu.Unlock()
}
} }
} }
} }
}) })
} }
// UnmarshalJSON provides optimized JSON decoding for CheckpointMetrics.
//
// Mirrors the optimized MarshalJSON by manually parsing JSON to avoid
// per-element interface dispatch and allocation overhead of the generic
// json.Unmarshal path for []schema.Float.
func (cm *CheckpointMetrics) UnmarshalJSON(input []byte) error {
// Minimal manual JSON parsing for the known structure:
// {"frequency":N,"start":N,"data":[...]}
// Field order may vary, so we parse field names.
if len(input) < 2 || input[0] != '{' {
return fmt.Errorf("expected JSON object")
}
i := 1 // skip '{'
for i < len(input) {
// Skip whitespace
for i < len(input) && (input[i] == ' ' || input[i] == '\t' || input[i] == '\n' || input[i] == '\r') {
i++
}
if i >= len(input) || input[i] == '}' {
break
}
if input[i] == ',' {
i++
continue
}
// Parse field name
if input[i] != '"' {
return fmt.Errorf("expected field name at pos %d", i)
}
i++
nameStart := i
for i < len(input) && input[i] != '"' {
i++
}
fieldName := string(input[nameStart:i])
i++ // skip closing '"'
// Skip ':'
for i < len(input) && (input[i] == ' ' || input[i] == ':') {
i++
}
switch fieldName {
case "frequency":
numStart := i
for i < len(input) && input[i] != ',' && input[i] != '}' {
i++
}
v, err := strconv.ParseInt(string(input[numStart:i]), 10, 64)
if err != nil {
return fmt.Errorf("invalid frequency: %w", err)
}
cm.Frequency = v
case "start":
numStart := i
for i < len(input) && input[i] != ',' && input[i] != '}' {
i++
}
v, err := strconv.ParseInt(string(input[numStart:i]), 10, 64)
if err != nil {
return fmt.Errorf("invalid start: %w", err)
}
cm.Start = v
case "data":
if input[i] != '[' {
return fmt.Errorf("expected '[' for data array at pos %d", i)
}
i++ // skip '['
cm.Data = make([]schema.Float, 0, 256)
for i < len(input) {
// Skip whitespace
for i < len(input) && (input[i] == ' ' || input[i] == '\t' || input[i] == '\n' || input[i] == '\r') {
i++
}
if i >= len(input) {
break
}
if input[i] == ']' {
i++
break
}
if input[i] == ',' {
i++
continue
}
// Parse value: number or null
if input[i] == 'n' {
// "null"
cm.Data = append(cm.Data, schema.NaN)
i += 4
} else {
numStart := i
for i < len(input) && input[i] != ',' && input[i] != ']' && input[i] != ' ' {
i++
}
v, err := strconv.ParseFloat(string(input[numStart:i]), 64)
if err != nil {
return fmt.Errorf("invalid data value: %w", err)
}
cm.Data = append(cm.Data, schema.Float(v))
}
}
default:
// Skip unknown field value
depth := 0
inStr := false
for i < len(input) {
if inStr {
if input[i] == '\\' {
i++
} else if input[i] == '"' {
inStr = false
}
} else {
switch input[i] {
case '"':
inStr = true
case '{', '[':
depth++
case '}', ']':
if depth == 0 {
goto doneSkip
}
depth--
case ',':
if depth == 0 {
goto doneSkip
}
}
}
i++
}
doneSkip:
}
}
return nil
}
// MarshalJSON provides optimized JSON encoding for CheckpointMetrics. // MarshalJSON provides optimized JSON encoding for CheckpointMetrics.
// //
// Since schema.Float has custom MarshalJSON, serializing []Float has significant overhead. // Since schema.Float has custom MarshalJSON, serializing []Float has significant overhead.
@@ -306,7 +175,7 @@ func (cm *CheckpointMetrics) MarshalJSON() ([]byte, error) {
return buf, nil return buf, nil
} }
// ToCheckpoint writes metric data to checkpoint files in parallel. // ToCheckpoint writes metric data to checkpoint files in parallel (JSON format).
// //
// Metrics at root and cluster levels are skipped. One file per host is created. // Metrics at root and cluster levels are skipped. One file per host is created.
// Uses worker pool (Keys.NumWorkers) for parallel processing. Only locks one host // Uses worker pool (Keys.NumWorkers) for parallel processing. Only locks one host
@@ -453,8 +322,7 @@ func (l *Level) toCheckpointFile(from, to int64, m *MemoryStore) (*CheckpointFil
return retval, nil return retval, nil
} }
// toCheckpoint writes a Level's data to a checkpoint file. // toCheckpoint writes a Level's data to a JSON checkpoint file.
// The format (binary or JSON) is determined by Keys.Checkpoints.FileFormat.
// Creates directory if needed. Returns ErrNoNewArchiveData if nothing to save. // Creates directory if needed. Returns ErrNoNewArchiveData if nothing to save.
func (l *Level) toCheckpoint(dir string, from, to int64, m *MemoryStore) error { func (l *Level) toCheckpoint(dir string, from, to int64, m *MemoryStore) error {
cf, err := l.toCheckpointFile(from, to, m) cf, err := l.toCheckpointFile(from, to, m)
@@ -466,23 +334,12 @@ func (l *Level) toCheckpoint(dir string, from, to int64, m *MemoryStore) error {
return ErrNoNewArchiveData return ErrNoNewArchiveData
} }
if Keys.Checkpoints.FileFormat == "json" { filepath := path.Join(dir, fmt.Sprintf("%d.json", from))
return writeJSONCheckpoint(dir, from, cf) f, err := os.OpenFile(filepath, os.O_CREATE|os.O_WRONLY|os.O_TRUNC, CheckpointFilePerms)
}
// Default: binary format
filePath := path.Join(dir, fmt.Sprintf("%d.bin", from))
return writeBinaryCheckpoint(filePath, cf)
}
// writeJSONCheckpoint writes a CheckpointFile in JSON format.
func writeJSONCheckpoint(dir string, from int64, cf *CheckpointFile) error {
filePath := path.Join(dir, fmt.Sprintf("%d.json", from))
f, err := os.OpenFile(filePath, os.O_CREATE|os.O_WRONLY|os.O_TRUNC, CheckpointFilePerms)
if err != nil && os.IsNotExist(err) { if err != nil && os.IsNotExist(err) {
err = os.MkdirAll(dir, CheckpointDirPerms) err = os.MkdirAll(dir, CheckpointDirPerms)
if err == nil { if err == nil {
f, err = os.OpenFile(filePath, os.O_CREATE|os.O_WRONLY|os.O_TRUNC, CheckpointFilePerms) f, err = os.OpenFile(filepath, os.O_CREATE|os.O_WRONLY|os.O_TRUNC, CheckpointFilePerms)
} }
} }
if err != nil { if err != nil {
@@ -499,56 +356,40 @@ func writeJSONCheckpoint(dir string, from int64, cf *CheckpointFile) error {
} }
// enqueueCheckpointHosts traverses checkpoint directory and enqueues cluster/host pairs. // enqueueCheckpointHosts traverses checkpoint directory and enqueues cluster/host pairs.
// Returns the set of cluster names found and any error if directory structure is invalid. // Returns error if directory structure is invalid.
func enqueueCheckpointHosts(dir string, work chan<- [2]string) (map[string]struct{}, error) { func enqueueCheckpointHosts(dir string, work chan<- [2]string) error {
clustersDir, err := os.ReadDir(dir) clustersDir, err := os.ReadDir(dir)
if err != nil { if err != nil {
return nil, err return err
} }
clusters := make(map[string]struct{}, len(clustersDir))
for _, clusterDir := range clustersDir { for _, clusterDir := range clustersDir {
if !clusterDir.IsDir() { if !clusterDir.IsDir() {
return nil, errors.New("[METRICSTORE]> expected only directories at first level of checkpoints/ directory") return errors.New("[METRICSTORE]> expected only directories at first level of checkpoints/ directory")
} }
clusters[clusterDir.Name()] = struct{}{}
hostsDir, err := os.ReadDir(filepath.Join(dir, clusterDir.Name())) hostsDir, err := os.ReadDir(filepath.Join(dir, clusterDir.Name()))
if err != nil { if err != nil {
return nil, err return err
} }
for _, hostDir := range hostsDir { for _, hostDir := range hostsDir {
if !hostDir.IsDir() { if !hostDir.IsDir() {
return nil, errors.New("[METRICSTORE]> expected only directories at second level of checkpoints/ directory") return errors.New("[METRICSTORE]> expected only directories at second level of checkpoints/ directory")
} }
work <- [2]string{clusterDir.Name(), hostDir.Name()} work <- [2]string{clusterDir.Name(), hostDir.Name()}
} }
} }
return clusters, nil return nil
} }
// FromCheckpoint loads checkpoint files from disk into memory in parallel. // FromCheckpoint loads checkpoint files from disk into memory in parallel.
// //
// Pre-creates cluster-level entries to reduce lock contention during parallel loading. // Uses worker pool to load cluster/host combinations. Returns number of files
// Uses worker pool to load cluster/host combinations. Returns number of files loaded and any errors. // loaded and any errors.
func (m *MemoryStore) FromCheckpoint(dir string, from int64) (int, error) { func (m *MemoryStore) FromCheckpoint(dir string, from int64) (int, error) {
// Pre-create cluster-level entries to eliminate write-lock contention on m.root
// during parallel loading. Workers only contend at the cluster level (independent).
clusterDirs, err := os.ReadDir(dir)
if err != nil && !os.IsNotExist(err) {
return 0, err
}
for _, d := range clusterDirs {
if d.IsDir() {
m.root.findLevelOrCreate([]string{d.Name()}, len(m.Metrics))
}
}
var wg sync.WaitGroup var wg sync.WaitGroup
work := make(chan [2]string, Keys.NumWorkers*4) work := make(chan [2]string, Keys.NumWorkers*4)
n, errs := int32(0), int32(0) n, errs := int32(0), int32(0)
@@ -569,7 +410,7 @@ func (m *MemoryStore) FromCheckpoint(dir string, from int64) (int, error) {
}() }()
} }
_, err = enqueueCheckpointHosts(dir, work) err := enqueueCheckpointHosts(dir, work)
close(work) close(work)
wg.Wait() wg.Wait()
@@ -585,13 +426,11 @@ func (m *MemoryStore) FromCheckpoint(dir string, from int64) (int, error) {
// FromCheckpointFiles is the main entry point for loading checkpoints at startup. // FromCheckpointFiles is the main entry point for loading checkpoints at startup.
// //
// Automatically detects checkpoint format (binary, JSON, or legacy Avro).
// Creates checkpoint directory if it doesn't exist. This function must be called // Creates checkpoint directory if it doesn't exist. This function must be called
// before any writes or reads, and can only be called once. // before any writes or reads, and can only be called once.
func (m *MemoryStore) FromCheckpointFiles(dir string, from int64) (int, error) { func (m *MemoryStore) FromCheckpointFiles(dir string, from int64) (int, error) {
if _, err := os.Stat(dir); os.IsNotExist(err) { if _, err := os.Stat(dir); os.IsNotExist(err) {
// The directory does not exist, so create it using os.MkdirAll() err := os.MkdirAll(dir, CheckpointDirPerms)
err := os.MkdirAll(dir, CheckpointDirPerms) // CheckpointDirPerms sets the permissions for the directory
if err != nil { if err != nil {
cclog.Fatalf("[METRICSTORE]> Error creating directory: %#v\n", err) cclog.Fatalf("[METRICSTORE]> Error creating directory: %#v\n", err)
} }
@@ -601,11 +440,10 @@ func (m *MemoryStore) FromCheckpointFiles(dir string, from int64) (int, error) {
return m.FromCheckpoint(dir, from) return m.FromCheckpoint(dir, from)
} }
// loadBinaryCheckpointFile loads a binary checkpoint file into the Level tree. func (l *Level) loadJSONFile(m *MemoryStore, f *os.File, from int64) error {
// Binary files are decoded in the same way as JSON files (via loadFile). br := bufio.NewReader(f)
func (l *Level) loadBinaryCheckpointFile(m *MemoryStore, filePath string, from int64) error { cf := &CheckpointFile{}
cf, err := loadBinaryFile(filePath) if err := json.NewDecoder(br).Decode(cf); err != nil {
if err != nil {
return err return err
} }
@@ -613,7 +451,11 @@ func (l *Level) loadBinaryCheckpointFile(m *MemoryStore, filePath string, from i
return nil return nil
} }
return l.loadFile(cf, m) if err := l.loadFile(cf, m); err != nil {
return err
}
return nil
} }
func (l *Level) loadFile(cf *CheckpointFile, m *MemoryStore) error { func (l *Level) loadFile(cf *CheckpointFile, m *MemoryStore) error {
@@ -669,25 +511,37 @@ func (l *Level) loadFile(cf *CheckpointFile, m *MemoryStore) error {
return nil return nil
} }
// fromCheckpoint loads all checkpoint files (JSON, binary snapshot, WAL) for a
// single host directory. Snapshot files are loaded first (sorted by timestamp),
// then current.wal is replayed on top.
func (l *Level) fromCheckpoint(m *MemoryStore, dir string, from int64) (int, error) { func (l *Level) fromCheckpoint(m *MemoryStore, dir string, from int64) (int, error) {
direntries, err := os.ReadDir(dir) direntries, err := os.ReadDir(dir)
if err != nil { if err != nil {
if os.IsNotExist(err) { if os.IsNotExist(err) {
return 0, nil return 0, nil
} }
return 0, err return 0, err
} }
allFiles := make([]fs.DirEntry, 0, len(direntries)) allFiles := make([]fs.DirEntry, 0)
var walEntry fs.DirEntry
filesLoaded := 0 filesLoaded := 0
for _, e := range direntries { for _, e := range direntries {
if e.IsDir() { if e.IsDir() {
cclog.Warnf("[METRICSTORE]> unexpected subdirectory '%s' in checkpoint dir '%s', skipping", e.Name(), dir) // Legacy: skip subdirectories (only used by old Avro format).
// These are ignored; their data is not loaded.
cclog.Debugf("[METRICSTORE]> skipping subdirectory %s in checkpoint dir %s", e.Name(), dir)
continue continue
} else if strings.HasSuffix(e.Name(), ".bin") || strings.HasSuffix(e.Name(), ".json") {
allFiles = append(allFiles, e)
} }
name := e.Name()
if strings.HasSuffix(name, ".json") || strings.HasSuffix(name, ".bin") {
allFiles = append(allFiles, e)
} else if name == "current.wal" {
walEntry = e
}
// Silently ignore other files (e.g., .tmp, .bin.tmp from interrupted writes).
} }
files, err := findFiles(allFiles, from, true) files, err := findFiles(allFiles, from, true)
@@ -695,109 +549,47 @@ func (l *Level) fromCheckpoint(m *MemoryStore, dir string, from int64) (int, err
return filesLoaded, err return filesLoaded, err
} }
if len(files) == 0 { loaders := map[string]func(*MemoryStore, *os.File, int64) error{
return 0, nil ".json": l.loadJSONFile,
".bin": l.loadBinaryFile,
} }
// Separate files by type
var binFiles, jsonFiles []string
for _, filename := range files { for _, filename := range files {
switch filepath.Ext(filename) { ext := filepath.Ext(filename)
case ".bin": loader := loaders[ext]
binFiles = append(binFiles, filename) if loader == nil {
case ".json": cclog.Warnf("[METRICSTORE]> unknown extension for checkpoint file %s", filename)
jsonFiles = append(jsonFiles, filename) continue
default:
cclog.Warnf("[METRICSTORE]> unknown extension for file %s", filename)
} }
err := func() error {
f, err := os.Open(path.Join(dir, filename))
if err != nil {
return err
}
defer f.Close()
return loader(m, f, from)
}()
if err != nil {
return filesLoaded, err
}
filesLoaded++
} }
// Parallel binary decoding: decode files concurrently, then apply sequentially // Replay WAL after all snapshot files so it fills in data since the last snapshot.
if len(binFiles) > 0 { if walEntry != nil {
type decodedFile struct { err := func() error {
cf *CheckpointFile f, err := os.Open(path.Join(dir, walEntry.Name()))
err error if err != nil {
} return err
decoded := make([]decodedFile, len(binFiles))
var decodeWg sync.WaitGroup
for i, filename := range binFiles {
decodeWg.Add(1)
go func(idx int, fname string) {
defer decodeWg.Done()
cf, err := loadBinaryFile(path.Join(dir, fname))
if err != nil {
decoded[idx] = decodedFile{err: fmt.Errorf("decoding %s: %w", fname, err)}
return
}
decoded[idx] = decodedFile{cf: cf}
}(i, filename)
}
decodeWg.Wait()
for i, d := range decoded {
if d.err != nil {
return filesLoaded, d.err
}
if d.cf.To != 0 && d.cf.To < from {
continue
}
if err := l.loadFile(d.cf, m); err != nil {
return filesLoaded, fmt.Errorf("loading %s: %w", binFiles[i], err)
}
filesLoaded++
}
}
// Parallel JSON decoding: decode files concurrently, then apply sequentially
if len(jsonFiles) > 0 {
type decodedFile struct {
cf *CheckpointFile
err error
}
decoded := make([]decodedFile, len(jsonFiles))
var decodeWg sync.WaitGroup
for i, filename := range jsonFiles {
decodeWg.Add(1)
go func(idx int, fname string) {
defer decodeWg.Done()
f, err := os.Open(path.Join(dir, fname))
if err != nil {
decoded[idx] = decodedFile{err: err}
return
}
defer f.Close()
cf := &CheckpointFile{}
if err := json.NewDecoder(bufio.NewReader(f)).Decode(cf); err != nil {
decoded[idx] = decodedFile{err: fmt.Errorf("decoding %s: %w", fname, err)}
return
}
decoded[idx] = decodedFile{cf: cf}
}(i, filename)
}
decodeWg.Wait()
for i, d := range decoded {
if d.err != nil {
return filesLoaded, d.err
}
if d.cf.To != 0 && d.cf.To < from {
continue
}
if err := l.loadFile(d.cf, m); err != nil {
return filesLoaded, fmt.Errorf("loading %s: %w", jsonFiles[i], err)
} }
defer f.Close()
return l.loadWALFile(m, f, from)
}()
if err != nil {
// WAL errors are non-fatal: the snapshot already loaded the bulk of data.
cclog.Warnf("[METRICSTORE]> WAL replay error for %s: %v (data since last snapshot may be missing)", dir, err)
} else {
filesLoaded++ filesLoaded++
} }
} }
@@ -805,79 +597,69 @@ func (l *Level) fromCheckpoint(m *MemoryStore, dir string, from int64) (int, err
return filesLoaded, nil return filesLoaded, nil
} }
// findFiles filters and sorts checkpoint files by timestamp. // parseTimestampFromFilename extracts a Unix timestamp from a checkpoint filename.
// // Supports ".json" (format: "<ts>.json") and ".bin" (format: "<ts>.bin").
// When findMoreRecentFiles is true, returns files with timestamp >= t (for loading), func parseTimestampFromFilename(name string) (int64, error) {
// plus the immediately preceding file if it straddles the boundary. switch {
// When false, returns files with timestamp <= t (for cleanup). case strings.HasSuffix(name, ".json"):
// return strconv.ParseInt(name[:len(name)-5], 10, 64)
// Filters before sorting so only relevant files are sorted, keeping performance case strings.HasSuffix(name, ".bin"):
// stable regardless of total directory size. return strconv.ParseInt(name[:len(name)-4], 10, 64)
func findFiles(direntries []fs.DirEntry, t int64, findMoreRecentFiles bool) ([]string, error) { default:
type fileEntry struct { return 0, fmt.Errorf("unknown checkpoint extension for file %q", name)
name string
ts int64
} }
}
// Parse timestamps and pre-filter in a single pass // findFiles returns filenames from direntries whose timestamps satisfy the filter.
var candidates []fileEntry // If findMoreRecentFiles is true, returns files with timestamps >= t (plus the
var bestPreceding *fileEntry // Track the file just before the cutoff (for boundary straddling) // last file before t if t falls between two files).
func findFiles(direntries []fs.DirEntry, t int64, findMoreRecentFiles bool) ([]string, error) {
nums := map[string]int64{}
for _, e := range direntries { for _, e := range direntries {
name := e.Name() name := e.Name()
ext := filepath.Ext(name) if !strings.HasSuffix(name, ".json") && !strings.HasSuffix(name, ".bin") {
if ext != ".bin" && ext != ".json" {
continue continue
} }
// Parse timestamp from filename: for .bin and .json it's just "TIMESTAMP.ext" ts, err := parseTimestampFromFilename(name)
baseName := name[:len(name)-len(ext)]
// Handle legacy format with prefix (e.g., "60_TIMESTAMP.avro")
if idx := strings.Index(baseName, "_"); idx >= 0 {
baseName = baseName[idx+1:]
}
ts, err := strconv.ParseInt(baseName, 10, 64)
if err != nil { if err != nil {
return nil, err return nil, err
} }
nums[name] = ts
if findMoreRecentFiles {
if ts >= t {
candidates = append(candidates, fileEntry{name, ts})
} else {
// Track the most recent file before the cutoff for boundary straddling
if bestPreceding == nil || ts > bestPreceding.ts {
bestPreceding = &fileEntry{name, ts}
}
}
} else {
if ts <= t && ts != 0 {
candidates = append(candidates, fileEntry{name, ts})
}
}
} }
// Include the boundary-straddling file if we found one and there are also files after the cutoff sort.Slice(direntries, func(i, j int) bool {
if findMoreRecentFiles && bestPreceding != nil && len(candidates) > 0 { a, b := direntries[i], direntries[j]
candidates = append(candidates, *bestPreceding) return nums[a.Name()] < nums[b.Name()]
} })
if len(candidates) == 0 { if len(nums) == 0 {
// If searching for recent files and we only have a preceding file, include it
if findMoreRecentFiles && bestPreceding != nil {
return []string{bestPreceding.name}, nil
}
return nil, nil return nil, nil
} }
// Sort only the filtered candidates filenames := make([]string, 0)
sort.Slice(candidates, func(i, j int) bool {
return candidates[i].ts < candidates[j].ts
})
filenames := make([]string, len(candidates)) for i, e := range direntries {
for i, c := range candidates { ts1 := nums[e.Name()]
filenames[i] = c.name
if findMoreRecentFiles && t <= ts1 {
filenames = append(filenames, e.Name())
} else if !findMoreRecentFiles && ts1 <= t && ts1 != 0 {
filenames = append(filenames, e.Name())
}
if i == len(direntries)-1 {
continue
}
enext := direntries[i+1]
ts2 := nums[enext.Name()]
if findMoreRecentFiles {
if ts1 < t && t < ts2 {
filenames = append(filenames, e.Name())
}
}
} }
return filenames, nil return filenames, nil

View File

@@ -14,7 +14,7 @@
// ├─ RetentionInMemory: How long to keep data in RAM // ├─ RetentionInMemory: How long to keep data in RAM
// ├─ MemoryCap: Memory limit in bytes (triggers forceFree) // ├─ MemoryCap: Memory limit in bytes (triggers forceFree)
// ├─ Checkpoints: Persistence configuration // ├─ Checkpoints: Persistence configuration
// │ ├─ FileFormat: "binary" or "json" // │ ├─ FileFormat: "json" or "wal"
// │ ├─ Interval: How often to save (e.g., "1h") // │ ├─ Interval: How often to save (e.g., "1h")
// │ └─ RootDir: Checkpoint storage path // │ └─ RootDir: Checkpoint storage path
// ├─ Cleanup: Long-term storage configuration // ├─ Cleanup: Long-term storage configuration
@@ -54,13 +54,14 @@ import (
const ( const (
DefaultMaxWorkers = 10 DefaultMaxWorkers = 10
DefaultBufferCapacity = 512 DefaultBufferCapacity = 512
DefaultGCTriggerInterval = 100
DefaultMemoryUsageTrackerInterval = 1 * time.Hour DefaultMemoryUsageTrackerInterval = 1 * time.Hour
) )
// Checkpoints configures periodic persistence of in-memory metric data. // Checkpoints configures periodic persistence of in-memory metric data.
// //
// Fields: // Fields:
// - FileFormat: "binary" (default, fast loading) or "json" (human-readable) // - FileFormat: "json" (human-readable, periodic) or "wal" (binary snapshot + WAL, crash-safe)
// - Interval: Duration string (e.g., "1h", "30m") between checkpoint saves // - Interval: Duration string (e.g., "1h", "30m") between checkpoint saves
// - RootDir: Filesystem path for checkpoint files (created if missing) // - RootDir: Filesystem path for checkpoint files (created if missing)
type Checkpoints struct { type Checkpoints struct {
@@ -140,7 +141,7 @@ type MetricStoreConfig struct {
// Accessed by Init(), Checkpointing(), and other lifecycle functions. // Accessed by Init(), Checkpointing(), and other lifecycle functions.
var Keys MetricStoreConfig = MetricStoreConfig{ var Keys MetricStoreConfig = MetricStoreConfig{
Checkpoints: Checkpoints{ Checkpoints: Checkpoints{
FileFormat: "binary", FileFormat: "json",
RootDir: "./var/checkpoints", RootDir: "./var/checkpoints",
}, },
Cleanup: &Cleanup{ Cleanup: &Cleanup{

View File

@@ -18,9 +18,8 @@ const configSchema = `{
"type": "object", "type": "object",
"properties": { "properties": {
"file-format": { "file-format": {
"description": "Specify the format for checkpoint files: 'binary' (default, fast loading) or 'json' (human-readable).", "description": "Specify the format for checkpoint files. Two variants: 'json' (human-readable, periodic) and 'wal' (binary snapshot + Write-Ahead Log, crash-safe). Default is 'json'.",
"type": "string", "type": "string"
"enum": ["binary", "json"]
}, },
"interval": { "interval": {
"description": "Interval at which the metrics should be checkpointed.", "description": "Interval at which the metrics should be checkpointed.",

View File

@@ -42,6 +42,7 @@ package metricstore
import ( import (
"sync" "sync"
"time"
"unsafe" "unsafe"
"github.com/ClusterCockpit/cc-lib/v2/schema" "github.com/ClusterCockpit/cc-lib/v2/schema"
@@ -192,6 +193,7 @@ func (l *Level) free(t int64) (int, error) {
if cap(b.data) != BufferCap { if cap(b.data) != BufferCap {
b.data = make([]schema.Float, 0, BufferCap) b.data = make([]schema.Float, 0, BufferCap)
} }
b.lastUsed = time.Now().Unix()
bufferPool.Put(b) bufferPool.Put(b)
l.metrics[i] = nil l.metrics[i] = nil
} }
@@ -236,12 +238,13 @@ func (l *Level) forceFree() (int, error) {
// If delme is true, it means 'b' itself (the head) was the oldest // If delme is true, it means 'b' itself (the head) was the oldest
// and needs to be removed from the slice. // and needs to be removed from the slice.
if delme { if delme {
// Nil out fields to ensure no hanging references
b.next = nil b.next = nil
b.prev = nil b.prev = nil
b.data = nil if cap(b.data) != BufferCap {
b.data = make([]schema.Float, 0, BufferCap)
}
b.lastUsed = time.Now().Unix()
bufferPool.Put(b)
l.metrics[i] = nil l.metrics[i] = nil
} }
} }

View File

@@ -3,6 +3,19 @@
// Use of this source code is governed by a MIT-style // Use of this source code is governed by a MIT-style
// license that can be found in the LICENSE file. // license that can be found in the LICENSE file.
// This file implements ingestion of InfluxDB line-protocol metric data received
// over NATS. Each line encodes one metric sample with the following structure:
//
// <measurement>[,cluster=<c>][,hostname=<h>][,type=<t>][,type-id=<id>][,subtype=<s>][,stype-id=<id>] value=<v> [<timestamp>]
//
// The measurement name identifies the metric (e.g. "cpu_load"). Tags provide
// routing information (cluster, host) and optional sub-device selectors (type,
// subtype). Only one field is expected per line: "value".
//
// After decoding, each sample is:
// 1. Written to the in-memory store via ms.WriteToLevel.
// 2. If the checkpoint format is "wal", also forwarded to the WAL staging
// goroutine via the WALMessages channel for durable write-ahead logging.
package metricstore package metricstore
import ( import (
@@ -18,6 +31,16 @@ import (
"github.com/ClusterCockpit/cc-line-protocol/v2/lineprotocol" "github.com/ClusterCockpit/cc-line-protocol/v2/lineprotocol"
) )
// ReceiveNats subscribes to all configured NATS subjects and feeds incoming
// line-protocol messages into the MemoryStore.
//
// When workers > 1 a pool of goroutines drains a shared channel so that
// multiple messages can be decoded in parallel. With workers == 1 the NATS
// callback decodes inline (no channel overhead, lower latency).
//
// The function blocks until ctx is cancelled and all worker goroutines have
// finished. It returns nil when the NATS client is not configured; callers
// should treat that as a no-op rather than an error.
func ReceiveNats(ms *MemoryStore, func ReceiveNats(ms *MemoryStore,
workers int, workers int,
ctx context.Context, ctx context.Context,
@@ -76,8 +99,13 @@ func ReceiveNats(ms *MemoryStore,
return nil return nil
} }
// Place `prefix` in front of `buf` but if possible, // reorder prepends prefix to buf in-place when buf has enough spare capacity,
// do that inplace in `buf`. // avoiding an allocation. Falls back to a regular append otherwise.
//
// It is used to assemble the "type<type-id>" and "subtype<stype-id>" selector
// strings when the type tag arrives before the type-id tag in the line, so the
// two byte slices need to be concatenated in tag-declaration order regardless
// of wire order.
func reorder(buf, prefix []byte) []byte { func reorder(buf, prefix []byte) []byte {
n := len(prefix) n := len(prefix)
m := len(buf) m := len(buf)
@@ -95,17 +123,41 @@ func reorder(buf, prefix []byte) []byte {
} }
} }
// decodeState holds the per-call scratch buffers used by DecodeLine.
// Instances are recycled via decodeStatePool to avoid repeated allocations
// during high-throughput ingestion.
type decodeState struct { type decodeState struct {
metricBuf []byte // metricBuf holds a copy of the current measurement name (line-protocol
selector []string // measurement field). Copied because dec.Measurement() returns a slice
typeBuf []byte // that is invalidated by the next decoder call.
subTypeBuf []byte metricBuf []byte
prevTypeBytes []byte
prevTypeStr string // selector is the sub-device path passed to WriteToLevel and WALMessage
// (e.g. ["socket0"] or ["socket0", "memctrl1"]). Reused across lines.
selector []string
// typeBuf accumulates the concatenated "type"+"type-id" tag value for the
// current line. Reset at the start of each line's tag-decode loop.
typeBuf []byte
// subTypeBuf accumulates the concatenated "subtype"+"stype-id" tag value.
// Reset at the start of each line's tag-decode loop.
subTypeBuf []byte
// prevTypeBytes / prevTypeStr cache the last seen typeBuf content and its
// string conversion. Because consecutive lines in a batch typically address
// the same sub-device, the cache hit rate is very high and avoids
// repeated []byte→string allocations.
prevTypeBytes []byte
prevTypeStr string
// prevSubTypeBytes / prevSubTypeStr are the same cache for the subtype.
prevSubTypeBytes []byte prevSubTypeBytes []byte
prevSubTypeStr string prevSubTypeStr string
} }
// decodeStatePool recycles decodeState values across DecodeLine calls to
// reduce GC pressure during sustained metric ingestion.
var decodeStatePool = sync.Pool{ var decodeStatePool = sync.Pool{
New: func() any { New: func() any {
return &decodeState{ return &decodeState{
@@ -117,8 +169,28 @@ var decodeStatePool = sync.Pool{
}, },
} }
// Decode lines using dec and make write calls to the MemoryStore. // DecodeLine reads all lines from dec (InfluxDB line-protocol) and writes each
// If a line is missing its cluster tag, use clusterDefault as default. // decoded metric sample into ms.
//
// clusterDefault is used as the cluster name for lines that do not carry a
// "cluster" tag. Callers typically supply the ClusterTag value from the NATS
// subscription configuration.
//
// Performance notes:
// - A decodeState is obtained from decodeStatePool to reuse scratch buffers.
// - The Level pointer (host-level node in the metric tree) is cached across
// consecutive lines that share the same cluster+host pair to avoid
// repeated lock acquisitions on the root and cluster levels.
// - []byte→string conversions for type/subtype selectors are cached via
// prevType*/prevSubType* fields because batches typically repeat the same
// sub-device identifiers.
// - Timestamp parsing tries Second precision first; if that fails it retries
// Millisecond, Microsecond, and Nanosecond in turn. A missing timestamp
// falls back to time.Now().
//
// When the checkpoint format is "wal" each successfully decoded sample is also
// sent to WALMessages so the WAL staging goroutine can persist it durably
// before the next binary snapshot.
func DecodeLine(dec *lineprotocol.Decoder, func DecodeLine(dec *lineprotocol.Decoder,
ms *MemoryStore, ms *MemoryStore,
clusterDefault string, clusterDefault string,
@@ -275,6 +347,17 @@ func DecodeLine(dec *lineprotocol.Decoder,
time := t.Unix() time := t.Unix()
if Keys.Checkpoints.FileFormat == "wal" {
WALMessages <- &WALMessage{
MetricName: string(st.metricBuf),
Cluster: cluster,
Node: host,
Selector: append([]string{}, st.selector...),
Value: metric.Value,
Timestamp: time,
}
}
if err := ms.WriteToLevel(lvl, st.selector, time, []Metric{metric}); err != nil { if err := ms.WriteToLevel(lvl, st.selector, time, []Metric{metric}); err != nil {
return err return err
} }

View File

@@ -8,7 +8,7 @@
// //
// The package organizes metrics in a tree structure (cluster → host → component) and // The package organizes metrics in a tree structure (cluster → host → component) and
// provides concurrent read/write access to metric data with configurable aggregation strategies. // provides concurrent read/write access to metric data with configurable aggregation strategies.
// Background goroutines handle periodic checkpointing (binary or JSON format), archiving old data, // Background goroutines handle periodic checkpointing (JSON or Avro format), archiving old data,
// and enforcing retention policies. // and enforcing retention policies.
// //
// Key features: // Key features:
@@ -151,12 +151,6 @@ func Init(rawConfig json.RawMessage, metrics map[string]MetricConfig, wg *sync.W
restoreFrom := startupTime.Add(-d) restoreFrom := startupTime.Add(-d)
cclog.Infof("[METRICSTORE]> Loading checkpoints newer than %s\n", restoreFrom.Format(time.RFC3339)) cclog.Infof("[METRICSTORE]> Loading checkpoints newer than %s\n", restoreFrom.Format(time.RFC3339))
// Lower GC target during loading to prevent excessive heap growth.
// During checkpoint loading the heap grows rapidly, causing the GC to
// double its target repeatedly. A lower percentage keeps it tighter.
oldGCPercent := debug.SetGCPercent(20)
files, err := ms.FromCheckpointFiles(Keys.Checkpoints.RootDir, restoreFrom.Unix()) files, err := ms.FromCheckpointFiles(Keys.Checkpoints.RootDir, restoreFrom.Unix())
loadedData := ms.SizeInBytes() / 1024 / 1024 // In MB loadedData := ms.SizeInBytes() / 1024 / 1024 // In MB
if err != nil { if err != nil {
@@ -165,16 +159,20 @@ func Init(rawConfig json.RawMessage, metrics map[string]MetricConfig, wg *sync.W
cclog.Infof("[METRICSTORE]> Checkpoints loaded (%d files, %d MB, that took %fs)\n", files, loadedData, time.Since(startupTime).Seconds()) cclog.Infof("[METRICSTORE]> Checkpoints loaded (%d files, %d MB, that took %fs)\n", files, loadedData, time.Since(startupTime).Seconds())
} }
// Restore GC target and force a collection to set a tight baseline // Try to use less memory by forcing a GC run here and then
// for the "previously active heap" size, reducing long-term memory waste. // lowering the target percentage. The default of 100 means
debug.SetGCPercent(oldGCPercent) // that only once the ratio of new allocations execeds the
runtime.GC() // previously active heap, a GC is triggered.
// Forcing a GC here will set the "previously active heap"
// to a minumum.
// runtime.GC()
ctx, shutdown := context.WithCancel(context.Background()) ctx, shutdown := context.WithCancel(context.Background())
Retention(wg, ctx) Retention(wg, ctx)
Checkpointing(wg, ctx) Checkpointing(wg, ctx)
CleanUp(wg, ctx) CleanUp(wg, ctx)
WALStaging(wg, ctx)
MemoryUsageTracker(wg, ctx) MemoryUsageTracker(wg, ctx)
// Note: Signal handling has been removed from this function. // Note: Signal handling has been removed from this function.
@@ -266,7 +264,7 @@ func (ms *MemoryStore) SetNodeProvider(provider NodeProvider) {
// //
// The function will: // The function will:
// 1. Cancel the context to stop all background workers // 1. Cancel the context to stop all background workers
// 2. Close NATS message channels if using Avro format // 2. Close the WAL messages channel if using WAL format
// 3. Write a final checkpoint to preserve in-memory data // 3. Write a final checkpoint to preserve in-memory data
// 4. Log any errors encountered during shutdown // 4. Log any errors encountered during shutdown
// //
@@ -278,10 +276,30 @@ func Shutdown() {
shutdownFunc() shutdownFunc()
} }
if Keys.Checkpoints.FileFormat == "wal" {
close(WALMessages)
}
cclog.Infof("[METRICSTORE]> Writing to '%s'...\n", Keys.Checkpoints.RootDir) cclog.Infof("[METRICSTORE]> Writing to '%s'...\n", Keys.Checkpoints.RootDir)
var files int
var err error
ms := GetMemoryStore() ms := GetMemoryStore()
files, err := ms.ToCheckpoint(Keys.Checkpoints.RootDir, lastCheckpoint.Unix(), time.Now().Unix()) lastCheckpointMu.Lock()
from := lastCheckpoint
lastCheckpointMu.Unlock()
if Keys.Checkpoints.FileFormat == "wal" {
var hostDirs []string
files, hostDirs, err = ms.ToCheckpointWAL(Keys.Checkpoints.RootDir, from.Unix(), time.Now().Unix())
if err == nil {
RotateWALFiles(hostDirs)
}
} else {
files, err = ms.ToCheckpoint(Keys.Checkpoints.RootDir, from.Unix(), time.Now().Unix())
}
if err != nil { if err != nil {
cclog.Errorf("[METRICSTORE]> Writing checkpoint failed: %s\n", err.Error()) cclog.Errorf("[METRICSTORE]> Writing checkpoint failed: %s\n", err.Error())
} }
@@ -302,9 +320,7 @@ func Shutdown() {
func Retention(wg *sync.WaitGroup, ctx context.Context) { func Retention(wg *sync.WaitGroup, ctx context.Context) {
ms := GetMemoryStore() ms := GetMemoryStore()
wg.Add(1) wg.Go(func() {
go func() {
defer wg.Done()
d, err := time.ParseDuration(Keys.RetentionInMemory) d, err := time.ParseDuration(Keys.RetentionInMemory)
if err != nil { if err != nil {
cclog.Fatal(err) cclog.Fatal(err)
@@ -341,9 +357,12 @@ func Retention(wg *sync.WaitGroup, ctx context.Context) {
} }
state.mu.Unlock() state.mu.Unlock()
// Clean up the buffer pool
bufferPool.Clean(state.lastRetentionTime)
} }
} }
}() })
} }
// MemoryUsageTracker starts a background goroutine that monitors memory usage. // MemoryUsageTracker starts a background goroutine that monitors memory usage.
@@ -364,9 +383,7 @@ func Retention(wg *sync.WaitGroup, ctx context.Context) {
func MemoryUsageTracker(wg *sync.WaitGroup, ctx context.Context) { func MemoryUsageTracker(wg *sync.WaitGroup, ctx context.Context) {
ms := GetMemoryStore() ms := GetMemoryStore()
wg.Add(1) wg.Go(func() {
go func() {
defer wg.Done()
d := DefaultMemoryUsageTrackerInterval d := DefaultMemoryUsageTrackerInterval
if d <= 0 { if d <= 0 {
@@ -411,6 +428,9 @@ func MemoryUsageTracker(wg *sync.WaitGroup, ctx context.Context) {
runtime.ReadMemStats(&mem) runtime.ReadMemStats(&mem)
actualMemoryGB = float64(mem.Alloc) / 1e9 actualMemoryGB = float64(mem.Alloc) / 1e9
bufferPool.Clear()
cclog.Infof("[METRICSTORE]> Cleaned up bufferPool\n")
if actualMemoryGB > float64(Keys.MemoryCap) { if actualMemoryGB > float64(Keys.MemoryCap) {
cclog.Warnf("[METRICSTORE]> memory usage %.2f GB exceeds cap %d GB, starting emergency buffer freeing", actualMemoryGB, Keys.MemoryCap) cclog.Warnf("[METRICSTORE]> memory usage %.2f GB exceeds cap %d GB, starting emergency buffer freeing", actualMemoryGB, Keys.MemoryCap)
@@ -452,7 +472,7 @@ func MemoryUsageTracker(wg *sync.WaitGroup, ctx context.Context) {
} }
} }
} }
}() })
} }
// Free removes metric data older than the given time while preserving data for active nodes. // Free removes metric data older than the given time while preserving data for active nodes.

View File

@@ -12,6 +12,526 @@ import (
"github.com/ClusterCockpit/cc-lib/v2/schema" "github.com/ClusterCockpit/cc-lib/v2/schema"
) )
// ─── Buffer pool ─────────────────────────────────────────────────────────────
// TestBufferPoolGetReuse verifies that Get() returns pooled buffers before
// allocating new ones, and that an empty pool allocates a fresh BufferCap buffer.
func TestBufferPoolGetReuse(t *testing.T) {
pool := NewPersistentBufferPool()
original := &buffer{data: make([]schema.Float, 0, BufferCap), lastUsed: time.Now().Unix()}
pool.Put(original)
reused := pool.Get()
if reused != original {
t.Error("Get() should return the previously pooled buffer")
}
if pool.GetSize() != 0 {
t.Errorf("pool size after Get() = %d, want 0", pool.GetSize())
}
// Empty pool must allocate a fresh buffer with the standard capacity.
fresh := pool.Get()
if fresh == nil {
t.Fatal("Get() from empty pool returned nil")
}
if cap(fresh.data) != BufferCap {
t.Errorf("fresh buffer cap = %d, want %d", cap(fresh.data), BufferCap)
}
}
// TestBufferPoolClear verifies that Clear() drains all entries.
func TestBufferPoolClear(t *testing.T) {
pool := NewPersistentBufferPool()
for i := 0; i < 10; i++ {
pool.Put(&buffer{data: make([]schema.Float, 0), lastUsed: time.Now().Unix()})
}
pool.Clear()
if pool.GetSize() != 0 {
t.Errorf("pool size after Clear() = %d, want 0", pool.GetSize())
}
}
// TestBufferPoolMaxSize verifies that Put() silently drops buffers once the
// pool reaches maxPoolSize, preventing unbounded memory growth.
func TestBufferPoolMaxSize(t *testing.T) {
pool := NewPersistentBufferPool()
for i := 0; i < maxPoolSize; i++ {
pool.Put(&buffer{data: make([]schema.Float, 0, BufferCap), lastUsed: time.Now().Unix()})
}
if pool.GetSize() != maxPoolSize {
t.Fatalf("pool size = %d, want %d", pool.GetSize(), maxPoolSize)
}
pool.Put(&buffer{data: make([]schema.Float, 0, BufferCap), lastUsed: time.Now().Unix()})
if pool.GetSize() != maxPoolSize {
t.Errorf("pool size after overflow Put = %d, want %d (should not grow)", pool.GetSize(), maxPoolSize)
}
}
// ─── Buffer helpers ───────────────────────────────────────────────────────────
// TestBufferEndFirstWrite verifies the end() and firstWrite() calculations.
func TestBufferEndFirstWrite(t *testing.T) {
// start=90, freq=10 → firstWrite = 90+5 = 95
b := &buffer{data: make([]schema.Float, 4, BufferCap), frequency: 10, start: 90}
if fw := b.firstWrite(); fw != 95 {
t.Errorf("firstWrite() = %d, want 95", fw)
}
// end = firstWrite + len(data)*freq = 95 + 4*10 = 135
if e := b.end(); e != 135 {
t.Errorf("end() = %d, want 135", e)
}
}
// ─── Buffer write ─────────────────────────────────────────────────────────────
// TestBufferWriteNaNFill verifies that skipped timestamps are filled with NaN.
func TestBufferWriteNaNFill(t *testing.T) {
b := newBuffer(100, 10)
b.write(100, schema.Float(1.0))
// skip 110 and 120
b.write(130, schema.Float(4.0))
if len(b.data) != 4 {
t.Fatalf("len(data) = %d, want 4 (1 value + 2 NaN + 1 value)", len(b.data))
}
if b.data[0] != schema.Float(1.0) {
t.Errorf("data[0] = %v, want 1.0", b.data[0])
}
if !b.data[1].IsNaN() {
t.Errorf("data[1] should be NaN (gap), got %v", b.data[1])
}
if !b.data[2].IsNaN() {
t.Errorf("data[2] should be NaN (gap), got %v", b.data[2])
}
if b.data[3] != schema.Float(4.0) {
t.Errorf("data[3] = %v, want 4.0", b.data[3])
}
}
// TestBufferWriteCapacityOverflow verifies that exceeding capacity creates and
// links a new buffer rather than panicking or silently dropping data.
func TestBufferWriteCapacityOverflow(t *testing.T) {
// Cap=2 so the third write must overflow into a new buffer.
b := &buffer{data: make([]schema.Float, 0, 2), frequency: 10, start: 95}
nb, _ := b.write(100, schema.Float(1.0))
nb, _ = nb.write(110, schema.Float(2.0))
nb, err := nb.write(120, schema.Float(3.0))
if err != nil {
t.Fatalf("write() error = %v", err)
}
if nb == b {
t.Fatal("write() should have returned a new buffer after overflow")
}
if nb.prev != b {
t.Error("new buffer should link back to old via prev")
}
if b.next != nb {
t.Error("old buffer should link forward to new via next")
}
if len(b.data) != 2 {
t.Errorf("old buffer len = %d, want 2 (full)", len(b.data))
}
if nb.data[0] != schema.Float(3.0) {
t.Errorf("new buffer data[0] = %v, want 3.0", nb.data[0])
}
}
// TestBufferWriteOverwrite verifies that writing to an already-occupied index
// replaces the value rather than appending.
func TestBufferWriteOverwrite(t *testing.T) {
b := newBuffer(100, 10)
b.write(100, schema.Float(1.0))
b.write(110, schema.Float(2.0))
// Overwrite the first slot.
b.write(100, schema.Float(99.0))
if len(b.data) != 2 {
t.Errorf("len(data) after overwrite = %d, want 2 (no append)", len(b.data))
}
if b.data[0] != schema.Float(99.0) {
t.Errorf("data[0] after overwrite = %v, want 99.0", b.data[0])
}
}
// ─── Buffer read ──────────────────────────────────────────────────────────────
// TestBufferReadBeforeFirstWrite verifies that 'from' is clamped to firstWrite
// when the requested range starts before any data in the chain.
func TestBufferReadBeforeFirstWrite(t *testing.T) {
b := newBuffer(100, 10) // firstWrite = 100
b.write(100, schema.Float(1.0))
b.write(110, schema.Float(2.0))
data := make([]schema.Float, 10)
result, adjustedFrom, _, err := b.read(50, 120, data)
if err != nil {
t.Fatalf("read() error = %v", err)
}
if adjustedFrom != 100 {
t.Errorf("adjustedFrom = %d, want 100 (clamped to firstWrite)", adjustedFrom)
}
if len(result) != 2 {
t.Errorf("len(result) = %d, want 2", len(result))
}
}
// TestBufferReadChain verifies that read() traverses a multi-buffer chain and
// returns contiguous values from both buffers.
//
// The switch to b.next in read() triggers on idx >= cap(b.data), so b1 must
// be full (len == cap) for the loop to advance to b2 without producing NaN.
func TestBufferReadChain(t *testing.T) {
// b1: cap=3, covers t=100..120. b2: covers t=130..150. b2 is head.
b1 := &buffer{data: make([]schema.Float, 0, 3), frequency: 10, start: 95}
b1.data = append(b1.data, 1.0, 2.0, 3.0) // fills b1: len=cap=3
b2 := &buffer{data: make([]schema.Float, 0, 3), frequency: 10, start: 125}
b2.data = append(b2.data, 4.0, 5.0, 6.0) // t=130,140,150
b2.prev = b1
b1.next = b2
data := make([]schema.Float, 6)
result, from, to, err := b2.read(100, 160, data)
if err != nil {
t.Fatalf("read() error = %v", err)
}
if from != 100 || to != 160 {
t.Errorf("read() from/to = %d/%d, want 100/160", from, to)
}
if len(result) != 6 {
t.Fatalf("len(result) = %d, want 6", len(result))
}
for i, want := range []schema.Float{1, 2, 3, 4, 5, 6} {
if result[i] != want {
t.Errorf("result[%d] = %v, want %v", i, result[i], want)
}
}
}
// TestBufferReadIdxAfterSwitch is a regression test for the index recalculation
// bug after switching to b.next during a read.
//
// When both buffers share the same start time (can happen with checkpoint-loaded
// chains), the old code hardcoded idx=0 after the switch, causing reads at time t
// to return the wrong element from the next buffer.
func TestBufferReadIdxAfterSwitch(t *testing.T) {
// b1: cap=2, both buffers start at 0 (firstWrite=5).
// b1 carries t=5 and t=15; b2 carries t=5,15,25,35 with the same start.
// When reading reaches t=25 the loop overflows b1 (idx=2 >= cap=2) and
// switches to b2. The correct index in b2 is (25-0)/10=2 → b2.data[2]=30.0.
// The old code set idx=0 → b2.data[0]=10.0 (wrong).
b1 := &buffer{data: make([]schema.Float, 0, 2), frequency: 10, start: 0}
b1.data = append(b1.data, schema.Float(1.0), schema.Float(2.0)) // t=5, t=15
b2 := &buffer{data: make([]schema.Float, 0, 10), frequency: 10, start: 0}
b2.data = append(b2.data,
schema.Float(10.0), schema.Float(20.0),
schema.Float(30.0), schema.Float(40.0)) // t=5,15,25,35
b2.prev = b1
b1.next = b2
// from=0 triggers the walkback to b1 (from < b2.firstWrite=5).
// After clamping, the loop runs t=5,15,25,35.
data := make([]schema.Float, 4)
result, _, _, err := b2.read(0, 36, data)
if err != nil {
t.Fatalf("read() error = %v", err)
}
if len(result) < 3 {
t.Fatalf("len(result) = %d, want >= 3", len(result))
}
if result[0] != schema.Float(1.0) {
t.Errorf("result[0] (t=5) = %v, want 1.0 (from b1)", result[0])
}
if result[1] != schema.Float(2.0) {
t.Errorf("result[1] (t=15) = %v, want 2.0 (from b1)", result[1])
}
// This is the critical assertion: old code returned 10.0 (b2.data[0]).
if result[2] != schema.Float(30.0) {
t.Errorf("result[2] (t=25) = %v, want 30.0 (idx recalculation fix)", result[2])
}
}
// TestBufferReadNaNValues verifies that NaN slots written to the buffer are
// returned as NaN during read.
func TestBufferReadNaNValues(t *testing.T) {
b := newBuffer(100, 10)
b.write(100, schema.Float(1.0))
b.write(110, schema.NaN)
b.write(120, schema.Float(3.0))
data := make([]schema.Float, 3)
result, _, _, err := b.read(100, 130, data)
if err != nil {
t.Fatalf("read() error = %v", err)
}
if len(result) != 3 {
t.Fatalf("len(result) = %d, want 3", len(result))
}
if result[0] != schema.Float(1.0) {
t.Errorf("result[0] = %v, want 1.0", result[0])
}
if !result[1].IsNaN() {
t.Errorf("result[1] should be NaN, got %v", result[1])
}
if result[2] != schema.Float(3.0) {
t.Errorf("result[2] = %v, want 3.0", result[2])
}
}
// TestBufferReadAccumulation verifies the += accumulation pattern used for
// aggregation: values are added to whatever was already in the data slice.
func TestBufferReadAccumulation(t *testing.T) {
b := newBuffer(100, 10)
b.write(100, schema.Float(3.0))
b.write(110, schema.Float(5.0))
// Pre-populate data slice (simulates a second metric being summed in).
data := []schema.Float{2.0, 1.0, 0.0}
result, _, _, err := b.read(100, 120, data)
if err != nil {
t.Fatalf("read() error = %v", err)
}
// 2.0+3.0=5.0, 1.0+5.0=6.0
if result[0] != schema.Float(5.0) {
t.Errorf("result[0] = %v, want 5.0 (2+3)", result[0])
}
if result[1] != schema.Float(6.0) {
t.Errorf("result[1] = %v, want 6.0 (1+5)", result[1])
}
}
// ─── Buffer free ─────────────────────────────────────────────────────────────
// newTestPool swaps out the package-level bufferPool for a fresh isolated one
// and returns a cleanup function that restores the original.
func newTestPool(t *testing.T) *PersistentBufferPool {
t.Helper()
pool := NewPersistentBufferPool()
saved := bufferPool
bufferPool = pool
t.Cleanup(func() { bufferPool = saved })
return pool
}
// TestBufferFreeRetention verifies that free() removes buffers whose entire
// time range falls before the retention threshold and returns them to the pool.
func TestBufferFreeRetention(t *testing.T) {
pool := newTestPool(t)
// b1: firstWrite=5, end=25 b2: firstWrite=25, end=45 b3: firstWrite=45, end=65
b1 := &buffer{data: make([]schema.Float, 0, BufferCap), frequency: 10, start: 0}
b1.data = append(b1.data, 1.0, 2.0)
b2 := &buffer{data: make([]schema.Float, 0, BufferCap), frequency: 10, start: 20}
b2.data = append(b2.data, 3.0, 4.0)
b2.prev = b1
b1.next = b2
b3 := &buffer{data: make([]schema.Float, 0, BufferCap), frequency: 10, start: 40}
b3.data = append(b3.data, 5.0, 6.0)
b3.prev = b2
b2.next = b3
// Threshold=30: b1.end()=25 < 30 → freed; b2.end()=45 >= 30 → kept.
delme, n := b3.free(30)
if delme {
t.Error("head buffer b3 should not be marked for deletion")
}
if n != 1 {
t.Errorf("freed count = %d, want 1", n)
}
if b2.prev != nil {
t.Error("b1 should have been unlinked from b2.prev")
}
if b3.prev != b2 {
t.Error("b3 should still reference b2")
}
if pool.GetSize() != 1 {
t.Errorf("pool size = %d, want 1 (b1 returned)", pool.GetSize())
}
}
// TestBufferFreeAll verifies that free() removes all buffers and signals the
// caller to delete the head when the entire chain is older than the threshold.
func TestBufferFreeAll(t *testing.T) {
pool := newTestPool(t)
b1 := &buffer{data: make([]schema.Float, 0, BufferCap), frequency: 10, start: 0}
b1.data = append(b1.data, 1.0, 2.0) // end=25
b2 := &buffer{data: make([]schema.Float, 0, BufferCap), frequency: 10, start: 20}
b2.data = append(b2.data, 3.0, 4.0) // end=45
b2.prev = b1
b1.next = b2
// Threshold=100 > both ends → both should be freed.
delme, n := b2.free(100)
if !delme {
t.Error("head buffer b2 should be marked for deletion when all data is stale")
}
if n != 2 {
t.Errorf("freed count = %d, want 2", n)
}
// b1 was freed inside free(); b2 is returned with delme=true for the caller.
if pool.GetSize() != 1 {
t.Errorf("pool size = %d, want 1 (b1 returned; b2 returned by caller)", pool.GetSize())
}
}
// ─── forceFreeOldest ─────────────────────────────────────────────────────────
// TestForceFreeOldestPoolReturn verifies that forceFreeOldest() returns the
// freed buffer to the pool (regression: previously it was just dropped).
func TestForceFreeOldestPoolReturn(t *testing.T) {
pool := newTestPool(t)
b1 := &buffer{data: make([]schema.Float, 0, BufferCap), frequency: 10, start: 0}
b2 := &buffer{data: make([]schema.Float, 0, BufferCap), frequency: 10, start: 20}
b3 := &buffer{data: make([]schema.Float, 0, BufferCap), frequency: 10, start: 40}
b1.data = append(b1.data, 1.0)
b2.data = append(b2.data, 2.0)
b3.data = append(b3.data, 3.0)
b2.prev = b1
b1.next = b2
b3.prev = b2
b2.next = b3
delme, n := b3.forceFreeOldest()
if delme {
t.Error("head b3 should not be marked for deletion (chain has 3 buffers)")
}
if n != 1 {
t.Errorf("freed count = %d, want 1", n)
}
if b2.prev != nil {
t.Error("b1 should have been unlinked from b2.prev after forceFreeOldest")
}
if b3.prev != b2 {
t.Error("b3 should still link to b2")
}
if pool.GetSize() != 1 {
t.Errorf("pool size = %d, want 1 (b1 returned to pool)", pool.GetSize())
}
}
// TestForceFreeOldestSingleBuffer verifies that forceFreeOldest() returns
// delme=true when the buffer is the only one in the chain.
func TestForceFreeOldestSingleBuffer(t *testing.T) {
b := newBuffer(100, 10)
b.write(100, schema.Float(1.0))
delme, n := b.forceFreeOldest()
if !delme {
t.Error("single-buffer chain: expected delme=true (the buffer IS the oldest)")
}
if n != 1 {
t.Errorf("freed count = %d, want 1", n)
}
}
// ─── iterFromTo ───────────────────────────────────────────────────────────────
// TestBufferIterFromToOrder verifies that iterFromTo invokes the callback in
// chronological order (oldest → newest).
func TestBufferIterFromToOrder(t *testing.T) {
// Each buffer has 2 data points so end() = firstWrite + 2*freq.
b1 := &buffer{data: make([]schema.Float, 2, BufferCap), frequency: 10, start: 0} // end=25
b2 := &buffer{data: make([]schema.Float, 2, BufferCap), frequency: 10, start: 20} // end=45
b3 := &buffer{data: make([]schema.Float, 2, BufferCap), frequency: 10, start: 40} // end=65
b2.prev = b1
b1.next = b2
b3.prev = b2
b2.next = b3
var order []*buffer
err := b3.iterFromTo(0, 100, func(b *buffer) error {
order = append(order, b)
return nil
})
if err != nil {
t.Fatalf("iterFromTo() error = %v", err)
}
if len(order) != 3 {
t.Fatalf("callback count = %d, want 3", len(order))
}
if order[0] != b1 || order[1] != b2 || order[2] != b3 {
t.Error("iterFromTo() did not call callbacks in chronological (oldest→newest) order")
}
}
// TestBufferIterFromToFiltered verifies that iterFromTo only calls the callback
// for buffers whose time range overlaps [from, to].
func TestBufferIterFromToFiltered(t *testing.T) {
// b1: end=25 b2: start=20, end=45 b3: start=40, end=65
b1 := &buffer{data: make([]schema.Float, 2, BufferCap), frequency: 10, start: 0}
b2 := &buffer{data: make([]schema.Float, 2, BufferCap), frequency: 10, start: 20}
b3 := &buffer{data: make([]schema.Float, 2, BufferCap), frequency: 10, start: 40}
b2.prev = b1
b1.next = b2
b3.prev = b2
b2.next = b3
// [30,50]: b1.end=25 < 30 → excluded; b2 and b3 overlap → included.
var visited []*buffer
b3.iterFromTo(30, 50, func(b *buffer) error {
visited = append(visited, b)
return nil
})
if len(visited) != 2 {
t.Fatalf("visited count = %d, want 2 (b2 and b3)", len(visited))
}
if visited[0] != b2 || visited[1] != b3 {
t.Errorf("visited = %v, want [b2, b3]", visited)
}
}
// TestBufferIterFromToNilBuffer verifies that iterFromTo on a nil buffer is a
// safe no-op.
func TestBufferIterFromToNilBuffer(t *testing.T) {
var b *buffer
called := false
err := b.iterFromTo(0, 100, func(_ *buffer) error {
called = true
return nil
})
if err != nil {
t.Errorf("iterFromTo(nil) error = %v, want nil", err)
}
if called {
t.Error("callback should not be called for a nil buffer")
}
}
// ─── count ────────────────────────────────────────────────────────────────────
// TestBufferCount verifies that count() sums data-point lengths across the
// entire chain, including all prev links.
func TestBufferCount(t *testing.T) {
b1 := &buffer{data: make([]schema.Float, 3, BufferCap), frequency: 10, start: 0}
b2 := &buffer{data: make([]schema.Float, 2, BufferCap), frequency: 10, start: 35}
b3 := &buffer{data: make([]schema.Float, 5, BufferCap), frequency: 10, start: 60}
b2.prev = b1
b1.next = b2
b3.prev = b2
b2.next = b3
if got := b3.count(); got != 10 {
t.Errorf("count() = %d, want 10 (3+2+5)", got)
}
// Single buffer.
lone := &buffer{data: make([]schema.Float, 7, BufferCap)}
if got := lone.count(); got != 7 {
t.Errorf("count() single buffer = %d, want 7", got)
}
}
// ─── Existing tests below ────────────────────────────────────────────────────
func TestAssignAggregationStrategy(t *testing.T) { func TestAssignAggregationStrategy(t *testing.T) {
tests := []struct { tests := []struct {
name string name string
@@ -464,3 +984,53 @@ func TestBufferHealthChecks(t *testing.T) {
}) })
} }
} }
func TestBufferPoolClean(t *testing.T) {
// Use a fresh pool for testing
pool := NewPersistentBufferPool()
now := time.Now().Unix()
// Create some buffers and put them in the pool with different lastUsed times
b1 := &buffer{lastUsed: now - 3600, data: make([]schema.Float, 0)} // 1 hour ago
b2 := &buffer{lastUsed: now - 7200, data: make([]schema.Float, 0)} // 2 hours ago
b3 := &buffer{lastUsed: now - 180000, data: make([]schema.Float, 0)} // 50 hours ago
b4 := &buffer{lastUsed: now - 200000, data: make([]schema.Float, 0)} // 55 hours ago
b5 := &buffer{lastUsed: now, data: make([]schema.Float, 0)}
pool.Put(b1)
pool.Put(b2)
pool.Put(b3)
pool.Put(b4)
pool.Put(b5)
if pool.GetSize() != 5 {
t.Fatalf("Expected pool size 5, got %d", pool.GetSize())
}
// Clean buffers older than 48 hours
timeUpdate := time.Now().Add(-48 * time.Hour).Unix()
pool.Clean(timeUpdate)
// Expected: b1, b2, b5 should remain. b3, b4 should be cleaned.
if pool.GetSize() != 3 {
t.Fatalf("Expected pool size 3 after clean, got %d", pool.GetSize())
}
validBufs := map[int64]bool{
b1.lastUsed: true,
b2.lastUsed: true,
b5.lastUsed: true,
}
for i := 0; i < 3; i++ {
b := pool.Get()
if !validBufs[b.lastUsed] {
t.Errorf("Found unexpected buffer with lastUsed %d", b.lastUsed)
}
}
if pool.GetSize() != 0 {
t.Fatalf("Expected pool to be empty, got %d", pool.GetSize())
}
}

View File

@@ -0,0 +1,213 @@
// Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
// All rights reserved. This file is part of cc-backend.
// Use of this source code is governed by a MIT-style
// license that can be found in the LICENSE file.
package metricstore
import (
"bufio"
"encoding/binary"
"encoding/json"
"fmt"
"os"
"path/filepath"
cclog "github.com/ClusterCockpit/cc-lib/v2/ccLogger"
pq "github.com/parquet-go/parquet-go"
)
// ParquetMetricRow is the long-format schema for archived metric data.
// One row per (host, metric, scope, scope_id, timestamp) data point.
// Sorted by (cluster, hostname, metric, timestamp) for optimal compression.
type ParquetMetricRow struct {
Cluster string `parquet:"cluster"`
Hostname string `parquet:"hostname"`
Metric string `parquet:"metric"`
Scope string `parquet:"scope"`
ScopeID string `parquet:"scope_id"`
Timestamp int64 `parquet:"timestamp"`
Frequency int64 `parquet:"frequency"`
Value float32 `parquet:"value"`
}
// flattenCheckpointFile recursively converts a CheckpointFile tree into Parquet rows.
// The scope path is built from the hierarchy: host level is "node", then child names
// map to scope/scope_id (e.g., "socket0" → scope="socket", scope_id="0").
func flattenCheckpointFile(cf *CheckpointFile, cluster, hostname, scope, scopeID string, rows []ParquetMetricRow) []ParquetMetricRow {
for metricName, cm := range cf.Metrics {
ts := cm.Start
for _, v := range cm.Data {
if !v.IsNaN() {
rows = append(rows, ParquetMetricRow{
Cluster: cluster,
Hostname: hostname,
Metric: metricName,
Scope: scope,
ScopeID: scopeID,
Timestamp: ts,
Frequency: cm.Frequency,
Value: float32(v),
})
}
ts += cm.Frequency
}
}
for childName, childCf := range cf.Children {
childScope, childScopeID := parseScopeFromName(childName)
rows = flattenCheckpointFile(childCf, cluster, hostname, childScope, childScopeID, rows)
}
return rows
}
// parseScopeFromName infers scope and scope_id from a child level name.
// Examples: "socket0" → ("socket", "0"), "core12" → ("core", "12"),
// "a0" (accelerator) → ("accelerator", "0").
// If the name doesn't match known patterns, it's used as-is for scope with empty scope_id.
func parseScopeFromName(name string) (string, string) {
prefixes := []struct {
prefix string
scope string
}{
{"socket", "socket"},
{"memoryDomain", "memoryDomain"},
{"core", "core"},
{"hwthread", "hwthread"},
{"cpu", "hwthread"},
{"accelerator", "accelerator"},
}
for _, p := range prefixes {
if len(name) > len(p.prefix) && name[:len(p.prefix)] == p.prefix {
id := name[len(p.prefix):]
if len(id) > 0 && id[0] >= '0' && id[0] <= '9' {
return p.scope, id
}
}
}
return name, ""
}
// writeParquetArchive writes rows to a Parquet file with Zstd compression.
func writeParquetArchive(filename string, rows []ParquetMetricRow) error {
if err := os.MkdirAll(filepath.Dir(filename), CheckpointDirPerms); err != nil {
return fmt.Errorf("creating archive directory: %w", err)
}
f, err := os.OpenFile(filename, os.O_CREATE|os.O_WRONLY|os.O_TRUNC, CheckpointFilePerms)
if err != nil {
return fmt.Errorf("creating parquet file: %w", err)
}
defer f.Close()
bw := bufio.NewWriterSize(f, 1<<20) // 1MB write buffer
writer := pq.NewGenericWriter[ParquetMetricRow](bw,
pq.Compression(&pq.Zstd),
pq.SortingWriterConfig(pq.SortingColumns(
pq.Ascending("cluster"),
pq.Ascending("hostname"),
pq.Ascending("metric"),
pq.Ascending("timestamp"),
)),
)
if _, err := writer.Write(rows); err != nil {
return fmt.Errorf("writing parquet rows: %w", err)
}
if err := writer.Close(); err != nil {
return fmt.Errorf("closing parquet writer: %w", err)
}
if err := bw.Flush(); err != nil {
return fmt.Errorf("flushing parquet file: %w", err)
}
return nil
}
// loadCheckpointFileFromDisk reads a JSON or binary checkpoint file and returns
// a CheckpointFile. Used by the Parquet archiver to read checkpoint data
// before converting it to Parquet format.
func loadCheckpointFileFromDisk(filename string) (*CheckpointFile, error) {
f, err := os.Open(filename)
if err != nil {
return nil, err
}
defer f.Close()
ext := filepath.Ext(filename)
switch ext {
case ".json":
cf := &CheckpointFile{}
br := bufio.NewReader(f)
if err := json.NewDecoder(br).Decode(cf); err != nil {
return nil, fmt.Errorf("decoding JSON checkpoint %s: %w", filename, err)
}
return cf, nil
case ".bin":
br := bufio.NewReader(f)
var magic uint32
if err := binary.Read(br, binary.LittleEndian, &magic); err != nil {
return nil, fmt.Errorf("reading magic from %s: %w", filename, err)
}
if magic != snapFileMagic {
return nil, fmt.Errorf("invalid snapshot magic in %s: 0x%08X", filename, magic)
}
var fileFrom, fileTo int64
if err := binary.Read(br, binary.LittleEndian, &fileFrom); err != nil {
return nil, fmt.Errorf("reading from-timestamp from %s: %w", filename, err)
}
if err := binary.Read(br, binary.LittleEndian, &fileTo); err != nil {
return nil, fmt.Errorf("reading to-timestamp from %s: %w", filename, err)
}
cf, err := readBinaryLevel(br)
if err != nil {
return nil, fmt.Errorf("reading binary level from %s: %w", filename, err)
}
cf.From = fileFrom
cf.To = fileTo
return cf, nil
default:
return nil, fmt.Errorf("unsupported checkpoint extension: %s", ext)
}
}
// archiveCheckpointsToParquet reads checkpoint files for a host directory,
// converts them to Parquet rows. Returns the rows and filenames that were processed.
func archiveCheckpointsToParquet(dir, cluster, host string, from int64) ([]ParquetMetricRow, []string, error) {
entries, err := os.ReadDir(dir)
if err != nil {
return nil, nil, err
}
files, err := findFiles(entries, from, false)
if err != nil {
return nil, nil, err
}
if len(files) == 0 {
return nil, nil, nil
}
var rows []ParquetMetricRow
for _, checkpoint := range files {
filename := filepath.Join(dir, checkpoint)
cf, err := loadCheckpointFileFromDisk(filename)
if err != nil {
cclog.Warnf("[METRICSTORE]> skipping unreadable checkpoint %s: %v", filename, err)
continue
}
rows = flattenCheckpointFile(cf, cluster, host, "node", "", rows)
}
return rows, files, nil
}

View File

@@ -0,0 +1,255 @@
// Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
// All rights reserved. This file is part of cc-backend.
// Use of this source code is governed by a MIT-style
// license that can be found in the LICENSE file.
package metricstore
import (
"encoding/json"
"os"
"path/filepath"
"testing"
"github.com/ClusterCockpit/cc-lib/v2/schema"
pq "github.com/parquet-go/parquet-go"
)
func TestParseScopeFromName(t *testing.T) {
tests := []struct {
name string
wantScope string
wantID string
}{
{"socket0", "socket", "0"},
{"socket12", "socket", "12"},
{"core0", "core", "0"},
{"core127", "core", "127"},
{"cpu0", "hwthread", "0"},
{"hwthread5", "hwthread", "5"},
{"memoryDomain0", "memoryDomain", "0"},
{"accelerator0", "accelerator", "0"},
{"unknown", "unknown", ""},
{"socketX", "socketX", ""}, // not numeric suffix
}
for _, tt := range tests {
t.Run(tt.name, func(t *testing.T) {
scope, id := parseScopeFromName(tt.name)
if scope != tt.wantScope || id != tt.wantID {
t.Errorf("parseScopeFromName(%q) = (%q, %q), want (%q, %q)",
tt.name, scope, id, tt.wantScope, tt.wantID)
}
})
}
}
func TestFlattenCheckpointFile(t *testing.T) {
cf := &CheckpointFile{
From: 1000,
To: 1060,
Metrics: map[string]*CheckpointMetrics{
"cpu_load": {
Frequency: 60,
Start: 1000,
Data: []schema.Float{0.5, 0.7, schema.NaN},
},
},
Children: map[string]*CheckpointFile{
"socket0": {
Metrics: map[string]*CheckpointMetrics{
"mem_bw": {
Frequency: 60,
Start: 1000,
Data: []schema.Float{100.0, schema.NaN, 200.0},
},
},
Children: make(map[string]*CheckpointFile),
},
},
}
rows := flattenCheckpointFile(cf, "fritz", "node001", "node", "", nil)
// cpu_load: 2 non-NaN values at node scope
// mem_bw: 2 non-NaN values at socket0 scope
if len(rows) != 4 {
t.Fatalf("expected 4 rows, got %d", len(rows))
}
// Verify a node-scope row
found := false
for _, r := range rows {
if r.Metric == "cpu_load" && r.Timestamp == 1000 {
found = true
if r.Cluster != "fritz" || r.Hostname != "node001" || r.Scope != "node" || r.Value != 0.5 {
t.Errorf("unexpected row: %+v", r)
}
}
}
if !found {
t.Error("expected cpu_load row at timestamp 1000")
}
// Verify a socket-scope row
found = false
for _, r := range rows {
if r.Metric == "mem_bw" && r.Scope == "socket" && r.ScopeID == "0" {
found = true
}
}
if !found {
t.Error("expected mem_bw row with scope=socket, scope_id=0")
}
}
func TestParquetArchiveRoundtrip(t *testing.T) {
tmpDir := t.TempDir()
// Create checkpoint files on disk (JSON format)
cpDir := filepath.Join(tmpDir, "checkpoints", "testcluster", "node001")
if err := os.MkdirAll(cpDir, 0o755); err != nil {
t.Fatal(err)
}
cf := &CheckpointFile{
From: 1000,
To: 1180,
Metrics: map[string]*CheckpointMetrics{
"cpu_load": {
Frequency: 60,
Start: 1000,
Data: []schema.Float{0.5, 0.7, 0.9},
},
"mem_used": {
Frequency: 60,
Start: 1000,
Data: []schema.Float{45.0, 46.0, 47.0},
},
},
Children: map[string]*CheckpointFile{
"socket0": {
Metrics: map[string]*CheckpointMetrics{
"mem_bw": {
Frequency: 60,
Start: 1000,
Data: []schema.Float{100.0, 110.0, 120.0},
},
},
Children: make(map[string]*CheckpointFile),
},
},
}
// Write JSON checkpoint
cpFile := filepath.Join(cpDir, "1000.json")
data, err := json.Marshal(cf)
if err != nil {
t.Fatal(err)
}
if err := os.WriteFile(cpFile, data, 0o644); err != nil {
t.Fatal(err)
}
// Archive to Parquet
archiveDir := filepath.Join(tmpDir, "archive")
rows, files, err := archiveCheckpointsToParquet(cpDir, "testcluster", "node001", 2000)
if err != nil {
t.Fatal(err)
}
if len(files) != 1 || files[0] != "1000.json" {
t.Fatalf("expected 1 file, got %v", files)
}
parquetFile := filepath.Join(archiveDir, "testcluster", "1000.parquet")
if err := writeParquetArchive(parquetFile, rows); err != nil {
t.Fatal(err)
}
// Read back and verify
f, err := os.Open(parquetFile)
if err != nil {
t.Fatal(err)
}
defer f.Close()
stat, _ := f.Stat()
pf, err := pq.OpenFile(f, stat.Size())
if err != nil {
t.Fatal(err)
}
reader := pq.NewGenericReader[ParquetMetricRow](pf)
readRows := make([]ParquetMetricRow, 100)
n, err := reader.Read(readRows)
if err != nil && n == 0 {
t.Fatal(err)
}
readRows = readRows[:n]
reader.Close()
// We expect: cpu_load(3) + mem_used(3) + mem_bw(3) = 9 rows
if n != 9 {
t.Fatalf("expected 9 rows in parquet file, got %d", n)
}
// Verify cluster and hostname are set correctly
for _, r := range readRows {
if r.Cluster != "testcluster" {
t.Errorf("expected cluster=testcluster, got %s", r.Cluster)
}
if r.Hostname != "node001" {
t.Errorf("expected hostname=node001, got %s", r.Hostname)
}
}
// Verify parquet file is smaller than JSON (compression working)
if stat.Size() == 0 {
t.Error("parquet file is empty")
}
t.Logf("Parquet file size: %d bytes for %d rows", stat.Size(), n)
}
func TestLoadCheckpointFileFromDisk_JSON(t *testing.T) {
tmpDir := t.TempDir()
cf := &CheckpointFile{
From: 1000,
To: 1060,
Metrics: map[string]*CheckpointMetrics{
"test_metric": {
Frequency: 60,
Start: 1000,
Data: []schema.Float{1.0, 2.0, 3.0},
},
},
Children: make(map[string]*CheckpointFile),
}
filename := filepath.Join(tmpDir, "1000.json")
data, err := json.Marshal(cf)
if err != nil {
t.Fatal(err)
}
if err := os.WriteFile(filename, data, 0o644); err != nil {
t.Fatal(err)
}
loaded, err := loadCheckpointFileFromDisk(filename)
if err != nil {
t.Fatal(err)
}
if loaded.From != 1000 || loaded.To != 1060 {
t.Errorf("expected From=1000, To=1060, got From=%d, To=%d", loaded.From, loaded.To)
}
m, ok := loaded.Metrics["test_metric"]
if !ok {
t.Fatal("expected test_metric in loaded checkpoint")
}
if m.Frequency != 60 || m.Start != 1000 || len(m.Data) != 3 {
t.Errorf("unexpected metric data: %+v", m)
}
}

View File

@@ -0,0 +1,787 @@
// Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
// All rights reserved. This file is part of cc-backend.
// Use of this source code is governed by a MIT-style
// license that can be found in the LICENSE file.
// Package metricstore provides walCheckpoint.go: WAL-based checkpoint implementation.
//
// This replaces the Avro shadow tree with an append-only Write-Ahead Log (WAL)
// per host, eliminating the extra memory overhead of the AvroStore and providing
// truly continuous (per-write) crash safety.
//
// # Architecture
//
// Metric write (DecodeLine)
// │
// ├─► WriteToLevel() → main MemoryStore (unchanged)
// │
// └─► WALMessages channel
// │
// ▼
// WALStaging goroutine
// │
// ▼
// checkpoints/cluster/host/current.wal (append-only, binary)
//
// Periodic checkpoint (Checkpointing goroutine):
// 1. Write <timestamp>.bin snapshot (column-oriented, from main tree)
// 2. Signal WALStaging to truncate current.wal per host
//
// On restart (FromCheckpoint):
// 1. Load most recent <timestamp>.bin snapshot
// 2. Replay current.wal (overwrite-safe: buffer.write handles duplicate timestamps)
//
// # WAL Record Format
//
// [4B magic 0xCC1DA7A1][4B payload_len][payload][4B CRC32]
//
// payload:
// [8B timestamp int64]
// [2B metric_name_len uint16][N metric name bytes]
// [1B selector_count uint8]
// per selector: [1B selector_len uint8][M selector bytes]
// [4B value float32 bits]
//
// # Binary Snapshot Format
//
// [4B magic 0xCC5B0001][8B from int64][8B to int64]
// Level tree (recursive):
// [4B num_metrics uint32]
// per metric:
// [2B name_len uint16][N name bytes]
// [8B frequency int64][8B start int64]
// [4B num_values uint32][num_values × 4B float32]
// [4B num_children uint32]
// per child: [2B name_len uint16][N name bytes] + Level (recursive)
package metricstore
import (
"bufio"
"context"
"encoding/binary"
"fmt"
"hash/crc32"
"io"
"math"
"os"
"path"
"strings"
"sync"
"sync/atomic"
cclog "github.com/ClusterCockpit/cc-lib/v2/ccLogger"
"github.com/ClusterCockpit/cc-lib/v2/schema"
)
// Magic numbers for binary formats.
const (
walFileMagic = uint32(0xCC1DA701) // WAL file header magic
walRecordMagic = uint32(0xCC1DA7A1) // WAL record magic
snapFileMagic = uint32(0xCC5B0001) // Binary snapshot magic
)
// WALMessages is the channel for sending metric writes to the WAL staging goroutine.
// Buffered to allow burst writes without blocking the metric ingestion path.
var WALMessages = make(chan *WALMessage, 4096)
// walRotateCh is used by the checkpoint goroutine to request WAL file rotation
// (close, delete, reopen) after a binary snapshot has been written.
var walRotateCh = make(chan walRotateReq, 256)
// WALMessage represents a single metric write to be appended to the WAL.
// Cluster and Node are NOT stored in the WAL record (inferred from file path).
type WALMessage struct {
MetricName string
Cluster string
Node string
Selector []string
Value schema.Float
Timestamp int64
}
// walRotateReq requests WAL file rotation for a specific host directory.
// The done channel is closed by the WAL goroutine when rotation is complete.
type walRotateReq struct {
hostDir string
done chan struct{}
}
// walFileState holds an open WAL file handle for one host directory.
type walFileState struct {
f *os.File
}
// WALStaging starts a background goroutine that receives WALMessage items
// and appends binary WAL records to per-host current.wal files.
// Also handles WAL rotation requests from the checkpoint goroutine.
func WALStaging(wg *sync.WaitGroup, ctx context.Context) {
wg.Go(func() {
if Keys.Checkpoints.FileFormat == "json" {
return
}
hostFiles := make(map[string]*walFileState)
defer func() {
for _, ws := range hostFiles {
if ws.f != nil {
ws.f.Close()
}
}
}()
getOrOpenWAL := func(hostDir string) *os.File {
ws, ok := hostFiles[hostDir]
if ok {
return ws.f
}
if err := os.MkdirAll(hostDir, CheckpointDirPerms); err != nil {
cclog.Errorf("[METRICSTORE]> WAL: mkdir %s: %v", hostDir, err)
return nil
}
walPath := path.Join(hostDir, "current.wal")
f, err := os.OpenFile(walPath, os.O_CREATE|os.O_APPEND|os.O_WRONLY, CheckpointFilePerms)
if err != nil {
cclog.Errorf("[METRICSTORE]> WAL: open %s: %v", walPath, err)
return nil
}
// Write file header magic if file is new (empty).
info, err := f.Stat()
if err == nil && info.Size() == 0 {
var hdr [4]byte
binary.LittleEndian.PutUint32(hdr[:], walFileMagic)
if _, err := f.Write(hdr[:]); err != nil {
cclog.Errorf("[METRICSTORE]> WAL: write header %s: %v", walPath, err)
f.Close()
return nil
}
}
hostFiles[hostDir] = &walFileState{f: f}
return f
}
processMsg := func(msg *WALMessage) {
hostDir := path.Join(Keys.Checkpoints.RootDir, msg.Cluster, msg.Node)
f := getOrOpenWAL(hostDir)
if f == nil {
return
}
if err := writeWALRecord(f, msg); err != nil {
cclog.Errorf("[METRICSTORE]> WAL: write record: %v", err)
}
}
processRotate := func(req walRotateReq) {
ws, ok := hostFiles[req.hostDir]
if ok && ws.f != nil {
ws.f.Close()
walPath := path.Join(req.hostDir, "current.wal")
if err := os.Remove(walPath); err != nil && !os.IsNotExist(err) {
cclog.Errorf("[METRICSTORE]> WAL: remove %s: %v", walPath, err)
}
delete(hostFiles, req.hostDir)
}
close(req.done)
}
drain := func() {
for {
select {
case msg, ok := <-WALMessages:
if !ok {
return
}
processMsg(msg)
case req := <-walRotateCh:
processRotate(req)
default:
return
}
}
}
for {
select {
case <-ctx.Done():
drain()
return
case msg, ok := <-WALMessages:
if !ok {
return
}
processMsg(msg)
case req := <-walRotateCh:
processRotate(req)
}
}
})
}
// RotateWALFiles sends rotation requests for the given host directories
// and blocks until all rotations complete.
func RotateWALFiles(hostDirs []string) {
dones := make([]chan struct{}, len(hostDirs))
for i, dir := range hostDirs {
dones[i] = make(chan struct{})
walRotateCh <- walRotateReq{hostDir: dir, done: dones[i]}
}
for _, done := range dones {
<-done
}
}
// buildWALPayload encodes a WALMessage into a binary payload (without magic/length/CRC).
func buildWALPayload(msg *WALMessage) []byte {
size := 8 + 2 + len(msg.MetricName) + 1 + 4
for _, s := range msg.Selector {
size += 1 + len(s)
}
buf := make([]byte, 0, size)
// Timestamp (8 bytes, little-endian int64)
var ts [8]byte
binary.LittleEndian.PutUint64(ts[:], uint64(msg.Timestamp))
buf = append(buf, ts[:]...)
// Metric name (2-byte length prefix + bytes)
var mLen [2]byte
binary.LittleEndian.PutUint16(mLen[:], uint16(len(msg.MetricName)))
buf = append(buf, mLen[:]...)
buf = append(buf, msg.MetricName...)
// Selector count (1 byte)
buf = append(buf, byte(len(msg.Selector)))
// Selectors (1-byte length prefix + bytes each)
for _, sel := range msg.Selector {
buf = append(buf, byte(len(sel)))
buf = append(buf, sel...)
}
// Value (4 bytes, float32 bit representation)
var val [4]byte
binary.LittleEndian.PutUint32(val[:], math.Float32bits(float32(msg.Value)))
buf = append(buf, val[:]...)
return buf
}
// writeWALRecord appends a binary WAL record to the file.
// Format: [4B magic][4B payload_len][payload][4B CRC32]
func writeWALRecord(f *os.File, msg *WALMessage) error {
payload := buildWALPayload(msg)
crc := crc32.ChecksumIEEE(payload)
record := make([]byte, 0, 4+4+len(payload)+4)
var magic [4]byte
binary.LittleEndian.PutUint32(magic[:], walRecordMagic)
record = append(record, magic[:]...)
var pLen [4]byte
binary.LittleEndian.PutUint32(pLen[:], uint32(len(payload)))
record = append(record, pLen[:]...)
record = append(record, payload...)
var crcBytes [4]byte
binary.LittleEndian.PutUint32(crcBytes[:], crc)
record = append(record, crcBytes[:]...)
_, err := f.Write(record)
return err
}
// readWALRecord reads one WAL record from the reader.
// Returns (nil, nil) on clean EOF. Returns error on data corruption.
// A CRC mismatch indicates a truncated trailing record (expected on crash).
func readWALRecord(r io.Reader) (*WALMessage, error) {
var magic uint32
if err := binary.Read(r, binary.LittleEndian, &magic); err != nil {
if err == io.EOF {
return nil, nil // Clean EOF
}
return nil, fmt.Errorf("read record magic: %w", err)
}
if magic != walRecordMagic {
return nil, fmt.Errorf("invalid record magic 0x%08X (expected 0x%08X)", magic, walRecordMagic)
}
var payloadLen uint32
if err := binary.Read(r, binary.LittleEndian, &payloadLen); err != nil {
return nil, fmt.Errorf("read payload length: %w", err)
}
if payloadLen > 1<<20 { // 1 MB sanity limit
return nil, fmt.Errorf("record payload too large: %d bytes", payloadLen)
}
payload := make([]byte, payloadLen)
if _, err := io.ReadFull(r, payload); err != nil {
return nil, fmt.Errorf("read payload: %w", err)
}
var storedCRC uint32
if err := binary.Read(r, binary.LittleEndian, &storedCRC); err != nil {
return nil, fmt.Errorf("read CRC: %w", err)
}
if crc32.ChecksumIEEE(payload) != storedCRC {
return nil, fmt.Errorf("CRC mismatch (truncated write or corruption)")
}
return parseWALPayload(payload)
}
// parseWALPayload decodes a binary payload into a WALMessage.
func parseWALPayload(payload []byte) (*WALMessage, error) {
if len(payload) < 8+2+1+4 {
return nil, fmt.Errorf("payload too short: %d bytes", len(payload))
}
offset := 0
// Timestamp (8 bytes)
ts := int64(binary.LittleEndian.Uint64(payload[offset : offset+8]))
offset += 8
// Metric name (2-byte length + bytes)
if offset+2 > len(payload) {
return nil, fmt.Errorf("metric name length overflows payload")
}
mLen := int(binary.LittleEndian.Uint16(payload[offset : offset+2]))
offset += 2
if offset+mLen > len(payload) {
return nil, fmt.Errorf("metric name overflows payload")
}
metricName := string(payload[offset : offset+mLen])
offset += mLen
// Selector count (1 byte)
if offset >= len(payload) {
return nil, fmt.Errorf("selector count overflows payload")
}
selCount := int(payload[offset])
offset++
selectors := make([]string, selCount)
for i := range selCount {
if offset >= len(payload) {
return nil, fmt.Errorf("selector[%d] length overflows payload", i)
}
sLen := int(payload[offset])
offset++
if offset+sLen > len(payload) {
return nil, fmt.Errorf("selector[%d] data overflows payload", i)
}
selectors[i] = string(payload[offset : offset+sLen])
offset += sLen
}
// Value (4 bytes, float32 bits)
if offset+4 > len(payload) {
return nil, fmt.Errorf("value overflows payload")
}
bits := binary.LittleEndian.Uint32(payload[offset : offset+4])
value := schema.Float(math.Float32frombits(bits))
return &WALMessage{
MetricName: metricName,
Timestamp: ts,
Selector: selectors,
Value: value,
}, nil
}
// loadWALFile reads a WAL file and replays all valid records into the Level tree.
// l is the host-level node. Corrupt or partial trailing records are silently skipped
// (expected on crash). Records older than 'from' are skipped.
func (l *Level) loadWALFile(m *MemoryStore, f *os.File, from int64) error {
br := bufio.NewReader(f)
// Verify file header magic.
var fileMagic uint32
if err := binary.Read(br, binary.LittleEndian, &fileMagic); err != nil {
if err == io.EOF {
return nil // Empty file, no data
}
return fmt.Errorf("[METRICSTORE]> WAL: read file header: %w", err)
}
if fileMagic != walFileMagic {
return fmt.Errorf("[METRICSTORE]> WAL: invalid file magic 0x%08X (expected 0x%08X)", fileMagic, walFileMagic)
}
// Cache level lookups to avoid repeated tree traversal.
lvlCache := make(map[string]*Level)
for {
msg, err := readWALRecord(br)
if err != nil {
// Truncated trailing record is expected after a crash; stop replaying.
cclog.Debugf("[METRICSTORE]> WAL: stopping replay at corrupted/partial record: %v", err)
break
}
if msg == nil {
break // Clean EOF
}
if msg.Timestamp < from {
continue // Older than retention window
}
minfo, ok := m.Metrics[msg.MetricName]
if !ok {
continue // Unknown metric (config may have changed)
}
// Cache key is the null-separated selector path.
cacheKey := joinSelector(msg.Selector)
lvl, ok := lvlCache[cacheKey]
if !ok {
lvl = l.findLevelOrCreate(msg.Selector, len(m.Metrics))
lvlCache[cacheKey] = lvl
}
// Write directly to the buffer, same as WriteToLevel but without the
// global level lookup (we already have the right level).
lvl.lock.Lock()
b := lvl.metrics[minfo.offset]
if b == nil {
b = newBuffer(msg.Timestamp, minfo.Frequency)
lvl.metrics[minfo.offset] = b
}
nb, writeErr := b.write(msg.Timestamp, msg.Value)
if writeErr == nil && b != nb {
lvl.metrics[minfo.offset] = nb
}
// Ignore write errors for timestamps before buffer start (can happen when
// replaying WAL entries that predate a loaded snapshot's start time).
lvl.lock.Unlock()
}
return nil
}
// joinSelector builds a cache key from a selector slice using null bytes as separators.
func joinSelector(sel []string) string {
if len(sel) == 0 {
return ""
}
var result strings.Builder
result.WriteString(sel[0])
for i := 1; i < len(sel); i++ {
result.WriteString("\x00" + sel[i])
}
return result.String()
}
// ToCheckpointWAL writes binary snapshot files for all hosts in parallel.
// Returns the number of files written, the list of host directories that were
// successfully checkpointed (for WAL rotation), and any errors.
func (m *MemoryStore) ToCheckpointWAL(dir string, from, to int64) (int, []string, error) {
// Collect all cluster/host pairs.
m.root.lock.RLock()
totalHosts := 0
for _, l1 := range m.root.children {
l1.lock.RLock()
totalHosts += len(l1.children)
l1.lock.RUnlock()
}
m.root.lock.RUnlock()
levels := make([]*Level, 0, totalHosts)
selectors := make([][]string, 0, totalHosts)
m.root.lock.RLock()
for sel1, l1 := range m.root.children {
l1.lock.RLock()
for sel2, l2 := range l1.children {
levels = append(levels, l2)
selectors = append(selectors, []string{sel1, sel2})
}
l1.lock.RUnlock()
}
m.root.lock.RUnlock()
type workItem struct {
level *Level
hostDir string
selector []string
}
n, errs := int32(0), int32(0)
var successDirs []string
var successMu sync.Mutex
var wg sync.WaitGroup
wg.Add(Keys.NumWorkers)
work := make(chan workItem, Keys.NumWorkers*2)
for range Keys.NumWorkers {
go func() {
defer wg.Done()
for wi := range work {
err := wi.level.toCheckpointBinary(wi.hostDir, from, to, m)
if err != nil {
if err == ErrNoNewArchiveData {
continue
}
cclog.Errorf("[METRICSTORE]> binary checkpoint error for %s: %v", wi.hostDir, err)
atomic.AddInt32(&errs, 1)
} else {
atomic.AddInt32(&n, 1)
successMu.Lock()
successDirs = append(successDirs, wi.hostDir)
successMu.Unlock()
}
}
}()
}
for i := range levels {
hostDir := path.Join(dir, path.Join(selectors[i]...))
work <- workItem{
level: levels[i],
hostDir: hostDir,
selector: selectors[i],
}
}
close(work)
wg.Wait()
if errs > 0 {
return int(n), successDirs, fmt.Errorf("[METRICSTORE]> %d errors during binary checkpoint (%d successes)", errs, n)
}
return int(n), successDirs, nil
}
// toCheckpointBinary writes a binary snapshot file for a single host-level node.
// Uses atomic rename (write to .tmp then rename) to avoid partial reads on crash.
func (l *Level) toCheckpointBinary(dir string, from, to int64, m *MemoryStore) error {
cf, err := l.toCheckpointFile(from, to, m)
if err != nil {
return err
}
if cf == nil {
return ErrNoNewArchiveData
}
if err := os.MkdirAll(dir, CheckpointDirPerms); err != nil {
return fmt.Errorf("mkdir %s: %w", dir, err)
}
// Write to a temp file first, then rename (atomic on POSIX).
tmpPath := path.Join(dir, fmt.Sprintf("%d.bin.tmp", from))
finalPath := path.Join(dir, fmt.Sprintf("%d.bin", from))
f, err := os.OpenFile(tmpPath, os.O_CREATE|os.O_WRONLY|os.O_TRUNC, CheckpointFilePerms)
if err != nil {
return fmt.Errorf("open binary snapshot %s: %w", tmpPath, err)
}
bw := bufio.NewWriter(f)
if err := writeBinarySnapshotFile(bw, cf); err != nil {
f.Close()
os.Remove(tmpPath)
return fmt.Errorf("write binary snapshot: %w", err)
}
if err := bw.Flush(); err != nil {
f.Close()
os.Remove(tmpPath)
return err
}
f.Close()
return os.Rename(tmpPath, finalPath)
}
// writeBinarySnapshotFile writes the binary snapshot file header and level tree.
func writeBinarySnapshotFile(w io.Writer, cf *CheckpointFile) error {
if err := binary.Write(w, binary.LittleEndian, snapFileMagic); err != nil {
return err
}
if err := binary.Write(w, binary.LittleEndian, cf.From); err != nil {
return err
}
if err := binary.Write(w, binary.LittleEndian, cf.To); err != nil {
return err
}
return writeBinaryLevel(w, cf)
}
// writeBinaryLevel recursively writes a CheckpointFile level in binary format.
func writeBinaryLevel(w io.Writer, cf *CheckpointFile) error {
if err := binary.Write(w, binary.LittleEndian, uint32(len(cf.Metrics))); err != nil {
return err
}
for name, metric := range cf.Metrics {
if err := writeString16(w, name); err != nil {
return err
}
if err := binary.Write(w, binary.LittleEndian, metric.Frequency); err != nil {
return err
}
if err := binary.Write(w, binary.LittleEndian, metric.Start); err != nil {
return err
}
if err := binary.Write(w, binary.LittleEndian, uint32(len(metric.Data))); err != nil {
return err
}
for _, v := range metric.Data {
if err := binary.Write(w, binary.LittleEndian, math.Float32bits(float32(v))); err != nil {
return err
}
}
}
if err := binary.Write(w, binary.LittleEndian, uint32(len(cf.Children))); err != nil {
return err
}
for name, child := range cf.Children {
if err := writeString16(w, name); err != nil {
return err
}
if err := writeBinaryLevel(w, child); err != nil {
return err
}
}
return nil
}
// writeString16 writes a 2-byte length-prefixed string to w.
func writeString16(w io.Writer, s string) error {
if err := binary.Write(w, binary.LittleEndian, uint16(len(s))); err != nil {
return err
}
_, err := io.WriteString(w, s)
return err
}
// loadBinaryFile reads a binary snapshot file and loads data into the Level tree.
// The retention check (from) is applied to the file's 'to' timestamp.
func (l *Level) loadBinaryFile(m *MemoryStore, f *os.File, from int64) error {
br := bufio.NewReader(f)
var magic uint32
if err := binary.Read(br, binary.LittleEndian, &magic); err != nil {
return fmt.Errorf("[METRICSTORE]> binary snapshot: read magic: %w", err)
}
if magic != snapFileMagic {
return fmt.Errorf("[METRICSTORE]> binary snapshot: invalid magic 0x%08X (expected 0x%08X)", magic, snapFileMagic)
}
var fileFrom, fileTo int64
if err := binary.Read(br, binary.LittleEndian, &fileFrom); err != nil {
return fmt.Errorf("[METRICSTORE]> binary snapshot: read from: %w", err)
}
if err := binary.Read(br, binary.LittleEndian, &fileTo); err != nil {
return fmt.Errorf("[METRICSTORE]> binary snapshot: read to: %w", err)
}
if fileTo != 0 && fileTo < from {
return nil // File is older than retention window, skip it
}
cf, err := readBinaryLevel(br)
if err != nil {
return fmt.Errorf("[METRICSTORE]> binary snapshot: read level tree: %w", err)
}
cf.From = fileFrom
cf.To = fileTo
return l.loadFile(cf, m)
}
// readBinaryLevel recursively reads a level from the binary snapshot format.
func readBinaryLevel(r io.Reader) (*CheckpointFile, error) {
cf := &CheckpointFile{
Metrics: make(map[string]*CheckpointMetrics),
Children: make(map[string]*CheckpointFile),
}
var numMetrics uint32
if err := binary.Read(r, binary.LittleEndian, &numMetrics); err != nil {
return nil, fmt.Errorf("read num_metrics: %w", err)
}
for range numMetrics {
name, err := readString16(r)
if err != nil {
return nil, fmt.Errorf("read metric name: %w", err)
}
var freq, start int64
if err := binary.Read(r, binary.LittleEndian, &freq); err != nil {
return nil, fmt.Errorf("read frequency for %s: %w", name, err)
}
if err := binary.Read(r, binary.LittleEndian, &start); err != nil {
return nil, fmt.Errorf("read start for %s: %w", name, err)
}
var numValues uint32
if err := binary.Read(r, binary.LittleEndian, &numValues); err != nil {
return nil, fmt.Errorf("read num_values for %s: %w", name, err)
}
data := make([]schema.Float, numValues)
for i := range numValues {
var bits uint32
if err := binary.Read(r, binary.LittleEndian, &bits); err != nil {
return nil, fmt.Errorf("read value[%d] for %s: %w", i, name, err)
}
data[i] = schema.Float(math.Float32frombits(bits))
}
cf.Metrics[name] = &CheckpointMetrics{
Frequency: freq,
Start: start,
Data: data,
}
}
var numChildren uint32
if err := binary.Read(r, binary.LittleEndian, &numChildren); err != nil {
return nil, fmt.Errorf("read num_children: %w", err)
}
for range numChildren {
childName, err := readString16(r)
if err != nil {
return nil, fmt.Errorf("read child name: %w", err)
}
child, err := readBinaryLevel(r)
if err != nil {
return nil, fmt.Errorf("read child %s: %w", childName, err)
}
cf.Children[childName] = child
}
return cf, nil
}
// readString16 reads a 2-byte length-prefixed string from r.
func readString16(r io.Reader) (string, error) {
var sLen uint16
if err := binary.Read(r, binary.LittleEndian, &sLen); err != nil {
return "", err
}
buf := make([]byte, sLen)
if _, err := io.ReadFull(r, buf); err != nil {
return "", err
}
return string(buf), nil
}