mirror of
https://github.com/ClusterCockpit/cc-backend
synced 2026-01-16 09:41:47 +01:00
258 lines
9.4 KiB
Go
258 lines
9.4 KiB
Go
// Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
|
|
// All rights reserved. This file is part of cc-backend.
|
|
// Use of this source code is governed by a MIT-style
|
|
// license that can be found in the LICENSE file.
|
|
|
|
// Package metricstore provides configuration structures and metric management (config.go).
//
// # Configuration Hierarchy
//
// The metricstore package uses nested configuration structures:
//
//	MetricStoreConfig (Keys)
//	├─ NumWorkers: Parallel checkpoint/archive workers
//	├─ RetentionInMemory: How long to keep data in RAM
//	├─ MemoryCap: Memory limit in bytes (triggers forceFree)
//	├─ Checkpoints: Persistence configuration
//	│  ├─ FileFormat: "avro" or "json"
//	│  ├─ Interval: How often to save (e.g., "1h")
//	│  └─ RootDir: Checkpoint storage path
//	├─ Archive: Long-term storage configuration
//	│  ├─ ArchiveInterval: How often to archive
//	│  ├─ RootDir: Archive storage path
//	│  └─ DeleteInstead: Delete old data instead of archiving
//	├─ Debug: Development/debugging options
//	└─ Subscriptions: NATS topic subscriptions for metric ingestion
//
// # Metric Configuration
//
// Each metric (e.g., "cpu_load", "mem_used") has a MetricConfig entry in the global
// Metrics map, defining:
//
//   - Frequency: Measurement interval in seconds
//   - Aggregation: How to combine values (sum/avg/none) when transforming scopes
//   - offset: Internal index into Level.metrics slice (assigned during Init)
//
// # AggregationStrategy
//
// Determines how to combine metric values when aggregating from finer to coarser scopes:
//
//   - NoAggregation: Do not combine (incompatible scopes)
//   - SumAggregation: Add values (e.g., power consumption: core→socket)
//   - AvgAggregation: Average values (e.g., temperature: core→socket)
|
|
package metricstore
|
|
|
|
import (
|
|
"fmt"
|
|
"time"
|
|
)
|
|
|
|
// Package-wide default values.
//
// NOTE(review): none of these constants is consumed in this file. The notes
// below are inferred from the identifier names and nearby documentation only;
// confirm them against the call sites.
const (
	// DefaultMaxWorkers is presumably the upper bound for the auto-selected
	// worker count (MetricStoreConfig.NumWorkers documents
	// min(NumCPU/2+1, 10)) — TODO confirm.
	DefaultMaxWorkers = 10

	// DefaultBufferCapacity: name suggests a default buffer capacity; the
	// unit (elements vs. bytes) is not visible here.
	DefaultBufferCapacity = 512

	// DefaultGCTriggerInterval: name suggests how often a GC is triggered;
	// what is being counted is not visible here.
	DefaultGCTriggerInterval = 100

	// DefaultAvroWorkers: name suggests the number of workers used for Avro
	// processing.
	DefaultAvroWorkers = 4

	// DefaultCheckpointBufferMin: name suggests a minimum buffer count
	// related to checkpointing.
	DefaultCheckpointBufferMin = 3

	// DefaultAvroCheckpointInterval is a time.Duration of one minute; the
	// name suggests it is the default interval between Avro checkpoints.
	DefaultAvroCheckpointInterval = time.Minute
)
|
|
|
|
// Checkpoints holds the settings for periodically persisting in-memory
// metric data. It is decoded from JSON using the keys given in the field tags.
type Checkpoints struct {
	// FileFormat selects the on-disk encoding: "avro" (binary, compact) or
	// "json" (human-readable, slower).
	FileFormat string `json:"file-format"`

	// Interval is a duration string (e.g., "1h", "30m") giving the time
	// between checkpoint saves.
	Interval string `json:"interval"`

	// RootDir is the filesystem path under which checkpoint files are stored.
	// Note the JSON key is "directory".
	RootDir string `json:"directory"`
}
|
|
|
|
// Debug groups optional switches intended for development and profiling.
type Debug struct {
	// DumpToFile is a path to which checkpoint data is dumped for
	// inspection; an empty string disables the dump.
	DumpToFile string `json:"dump-to-file"`

	// EnableGops turns on the gops agent for live runtime debugging
	// (https://github.com/google/gops). Note the JSON key is "gops".
	EnableGops bool `json:"gops"`
}
|
|
|
|
// Archive holds the settings for long-term handling of old metric data:
// data older than the in-memory retention period is either archived to disk
// or deleted.
type Archive struct {
	// ArchiveInterval is a duration string (e.g., "24h") giving the time
	// between archive operations. Note the JSON key is "interval".
	ArchiveInterval string `json:"interval"`

	// RootDir is the filesystem path under which archived data is stored.
	// Note the JSON key is "directory".
	RootDir string `json:"directory"`

	// DeleteInstead, when true, deletes old data rather than archiving it
	// (saves disk space).
	DeleteInstead bool `json:"delete-instead"`
}
|
|
|
|
// Subscriptions is the list of NATS subscriptions used for metric ingestion.
// Each entry names one subject on which metrics are received from compute
// nodes.
type Subscriptions []struct {
	// SubscribeTo is the NATS subject/channel name
	// (e.g., "metrics.compute.*").
	SubscribeTo string `json:"subscribe-to"`

	// ClusterTag is optional: when set, lines arriving without a cluster tag
	// are accepted and attributed to this cluster.
	ClusterTag string `json:"cluster-tag"`
}
|
|
|
|
// MetricStoreConfig defines the main configuration for the metricstore.
|
|
//
|
|
// Loaded from cc-backend's config.json "metricstore" section. Controls memory usage,
|
|
// persistence, archiving, and metric ingestion.
|
|
//
|
|
// Fields:
|
|
// - NumWorkers: Parallel workers for checkpoint/archive (0 = auto: min(NumCPU/2+1, 10))
|
|
// - RetentionInMemory: Duration string (e.g., "48h") for in-memory data retention
|
|
// - MemoryCap: Max bytes for buffer data (0 = unlimited); triggers forceFree when exceeded
|
|
// - Checkpoints: Periodic persistence configuration
|
|
// - Debug: Development/profiling options (nil = disabled)
|
|
// - Archive: Long-term storage configuration (nil = disabled)
|
|
// - Subscriptions: NATS topics for metric ingestion (nil = polling only)
|
|
type MetricStoreConfig struct {
|
|
// Number of concurrent workers for checkpoint and archive operations.
|
|
// If not set or 0, defaults to min(runtime.NumCPU()/2+1, 10)
|
|
NumWorkers int `json:"num-workers"`
|
|
RetentionInMemory string `json:"retention-in-memory"`
|
|
MemoryCap int `json:"memory-cap"`
|
|
Checkpoints Checkpoints `json:"checkpoints"`
|
|
Debug *Debug `json:"debug"`
|
|
Archive *Archive `json:"archive"`
|
|
Subscriptions *Subscriptions `json:"nats-subscriptions"`
|
|
}
|
|
|
|
// Keys is the global metricstore configuration instance.
|
|
//
|
|
// Initialized with defaults, then overwritten by cc-backend's config.json.
|
|
// Accessed by Init(), Checkpointing(), and other lifecycle functions.
|
|
var Keys MetricStoreConfig = MetricStoreConfig{
|
|
Checkpoints: Checkpoints{
|
|
FileFormat: "avro",
|
|
RootDir: "./var/checkpoints",
|
|
},
|
|
}
|
|
|
|
// AggregationStrategy says how metric values are combined when data is
// transformed from a finer-grained scope (e.g., core) to a coarser one
// (e.g., socket).
//
// This is spatial aggregation across hierarchy levels, not temporal
// (time-based) aggregation.
type AggregationStrategy int

// The available strategies. NoAggregation is the zero value.
const (
	// NoAggregation means values are not combined (incompatible scopes or
	// non-aggregatable metrics).
	NoAggregation AggregationStrategy = iota

	// SumAggregation adds values, e.g. core power summed to socket power.
	SumAggregation

	// AvgAggregation averages values, e.g. core temperatures averaged to a
	// socket temperature.
	AvgAggregation
)
|
|
|
|
// AssignAggregationStrategy parses a string into an AggregationStrategy value.
|
|
//
|
|
// Used when loading metric configurations from JSON/YAML files.
|
|
//
|
|
// Parameters:
|
|
// - str: "sum", "avg", or "" (empty string for NoAggregation)
|
|
//
|
|
// Returns:
|
|
// - AggregationStrategy: Parsed value
|
|
// - error: Non-nil if str is unrecognized
|
|
func AssignAggregationStrategy(str string) (AggregationStrategy, error) {
|
|
switch str {
|
|
case "":
|
|
return NoAggregation, nil
|
|
case "sum":
|
|
return SumAggregation, nil
|
|
case "avg":
|
|
return AvgAggregation, nil
|
|
default:
|
|
return NoAggregation, fmt.Errorf("[METRICSTORE]> unknown aggregation strategy: %s", str)
|
|
}
|
|
}
|
|
|
|
// MetricConfig defines configuration for a single metric type.
|
|
//
|
|
// Stored in the global Metrics map, keyed by metric name (e.g., "cpu_load").
|
|
//
|
|
// Fields:
|
|
// - Frequency: Measurement interval in seconds (e.g., 60 for 1-minute granularity)
|
|
// - Aggregation: How to combine values across hierarchy levels (sum/avg/none)
|
|
// - offset: Internal index into Level.metrics slice (assigned during Init)
|
|
type MetricConfig struct {
|
|
// Interval in seconds at which measurements are stored
|
|
Frequency int64
|
|
|
|
// Can be 'sum', 'avg' or null. Describes how to aggregate metrics from the same timestep over the hierarchy.
|
|
Aggregation AggregationStrategy
|
|
|
|
// Private, used internally...
|
|
offset int
|
|
}
|
|
|
|
// Metrics is the global map of metric configurations.
|
|
//
|
|
// Keyed by metric name (e.g., "cpu_load", "mem_used"). Populated during Init()
|
|
// from cluster configuration and checkpoint restoration. Each MetricConfig.offset
|
|
// corresponds to the buffer slice index in Level.metrics.
|
|
var Metrics map[string]MetricConfig
|
|
|
|
// GetMetricFrequency retrieves the measurement interval for a metric.
|
|
//
|
|
// Parameters:
|
|
// - metricName: Metric name (e.g., "cpu_load")
|
|
//
|
|
// Returns:
|
|
// - int64: Frequency in seconds
|
|
// - error: Non-nil if metric not found in Metrics map
|
|
func GetMetricFrequency(metricName string) (int64, error) {
|
|
if metric, ok := Metrics[metricName]; ok {
|
|
return metric.Frequency, nil
|
|
}
|
|
return 0, fmt.Errorf("[METRICSTORE]> metric %s not found", metricName)
|
|
}
|
|
|
|
// AddMetric registers a new metric or updates an existing one.
|
|
//
|
|
// If the metric already exists with a different frequency, uses the higher frequency
|
|
// (finer granularity). This handles cases where different clusters report the same
|
|
// metric at different intervals.
|
|
//
|
|
// Parameters:
|
|
// - name: Metric name (e.g., "cpu_load")
|
|
// - metric: Configuration (frequency, aggregation strategy)
|
|
//
|
|
// Returns:
|
|
// - error: Always nil (signature for future error handling)
|
|
func AddMetric(name string, metric MetricConfig) error {
|
|
if Metrics == nil {
|
|
Metrics = make(map[string]MetricConfig, 0)
|
|
}
|
|
|
|
if existingMetric, ok := Metrics[name]; ok {
|
|
if existingMetric.Frequency != metric.Frequency {
|
|
if existingMetric.Frequency < metric.Frequency {
|
|
existingMetric.Frequency = metric.Frequency
|
|
Metrics[name] = existingMetric
|
|
}
|
|
}
|
|
} else {
|
|
Metrics[name] = metric
|
|
}
|
|
|
|
return nil
|
|
}
|