// Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
// All rights reserved. This file is part of cc-backend.
// Use of this source code is governed by a MIT-style
// license that can be found in the LICENSE file.

// Package metricstore provides, in config.go, the configuration structures
// and the metric management described below.
//
// # Configuration Hierarchy
//
// The metricstore package uses nested configuration structures:
//
//	MetricStoreConfig (Keys)
//	├─ NumWorkers: Parallel checkpoint/archive workers
//	├─ RetentionInMemory: How long to keep data in RAM
//	├─ MemoryCap: Memory limit in bytes (triggers forceFree)
//	├─ Checkpoints: Persistence configuration
//	│  ├─ FileFormat: "avro" or "json"
//	│  ├─ Interval: How often to save (e.g., "1h")
//	│  └─ RootDir: Checkpoint storage path
//	├─ Archive: Long-term storage configuration
//	│  ├─ ArchiveInterval: How often to archive
//	│  ├─ RootDir: Archive storage path
//	│  └─ DeleteInstead: Delete old data instead of archiving
//	├─ Debug: Development/debugging options
//	└─ Subscriptions: NATS topic subscriptions for metric ingestion
//
// # Metric Configuration
//
// Each metric (e.g., "cpu_load", "mem_used") has a MetricConfig entry in the global
// Metrics map, defining:
//
//   - Frequency: Measurement interval in seconds
//   - Aggregation: How to combine values (sum/avg/none) when transforming scopes
//   - offset: Internal index into Level.metrics slice (assigned during Init)
//
// # AggregationStrategy
//
// Determines how to combine metric values when aggregating from finer to coarser scopes:
//
//   - NoAggregation: Do not combine (incompatible scopes)
//   - SumAggregation: Add values (e.g., power consumption: core→socket)
//   - AvgAggregation: Average values (e.g., temperature: core→socket)
package metricstore

import (
	"fmt"
	"time"
)

// Default tuning values of the metricstore.
//
// NOTE(review): none of these constants is referenced in this file; judging
// by their names they parameterize worker pools, buffers, GC triggering and
// Avro checkpointing elsewhere in the package. DefaultMaxWorkers is presumably
// the upper bound of 10 mentioned for MetricStoreConfig.NumWorkers — confirm
// at the use sites.
const (
	DefaultMaxWorkers             = 10
	DefaultBufferCapacity         = 512
	DefaultGCTriggerInterval      = 100
	DefaultAvroWorkers            = 4
	DefaultCheckpointBufferMin    = 3
	DefaultAvroCheckpointInterval = time.Minute
)

//
// Checkpoints configures periodic persistence of in-memory metric data.
//
// Fields (JSON key in parentheses):
//   - FileFormat ("file-format"): "avro" (default, binary, compact) or "json" (human-readable, slower)
//   - Interval ("interval"): Duration string (e.g., "1h", "30m") between checkpoint saves
//   - RootDir ("directory"): Filesystem path for checkpoint files (created if missing)
//
// Note that RootDir is read from the JSON key "directory", not "root-dir".
type Checkpoints struct {
	FileFormat string `json:"file-format"`
	Interval   string `json:"interval"`
	RootDir    string `json:"directory"`
}

// Debug provides development and profiling options.
//
// Fields (JSON key in parentheses):
//   - DumpToFile ("dump-to-file"): Path to dump checkpoint data for inspection (empty = disabled)
//   - EnableGops ("gops"): Enable gops agent for live runtime debugging (https://github.com/google/gops)
type Debug struct {
	DumpToFile string `json:"dump-to-file"`
	EnableGops bool   `json:"gops"`
}

// Archive configures long-term storage of old metric data.
//
// Data older than RetentionInMemory is archived to disk or deleted.
//
// Fields (JSON key in parentheses):
//   - ArchiveInterval ("interval"): Duration string (e.g., "24h") between archive operations
//   - RootDir ("directory"): Filesystem path for archived data (created if missing)
//   - DeleteInstead ("delete-instead"): If true, delete old data instead of archiving (saves disk space)
type Archive struct {
	ArchiveInterval string `json:"interval"`
	RootDir         string `json:"directory"`
	DeleteInstead   bool   `json:"delete-instead"`
}

// Subscriptions defines NATS topics to subscribe to for metric ingestion.
//
// Each subscription receives metrics via NATS messaging, enabling real-time
// data collection from compute nodes. The type is a slice, so a configuration
// may list any number of subscriptions.
//
// Fields (JSON key in parentheses):
//   - SubscribeTo ("subscribe-to"): NATS subject/channel name (e.g., "metrics.compute.*")
//   - ClusterTag ("cluster-tag"): Default cluster name for metrics without cluster tag (optional)
type Subscriptions []struct {
	// SubscribeTo is the channel name to subscribe to.
	SubscribeTo string `json:"subscribe-to"`

	// ClusterTag is optional. When set, lines without a cluster tag are
	// allowed and this value is used as their default.
	ClusterTag string `json:"cluster-tag"`
}

// MetricStoreConfig defines the main configuration for the metricstore.
// // Loaded from cc-backend's config.json "metricstore" section. Controls memory usage, // persistence, archiving, and metric ingestion. // // Fields: // - NumWorkers: Parallel workers for checkpoint/archive (0 = auto: min(NumCPU/2+1, 10)) // - RetentionInMemory: Duration string (e.g., "48h") for in-memory data retention // - MemoryCap: Max bytes for buffer data (0 = unlimited); triggers forceFree when exceeded // - Checkpoints: Periodic persistence configuration // - Debug: Development/profiling options (nil = disabled) // - Archive: Long-term storage configuration (nil = disabled) // - Subscriptions: NATS topics for metric ingestion (nil = polling only) type MetricStoreConfig struct { // Number of concurrent workers for checkpoint and archive operations. // If not set or 0, defaults to min(runtime.NumCPU()/2+1, 10) NumWorkers int `json:"num-workers"` RetentionInMemory string `json:"retention-in-memory"` MemoryCap int `json:"memory-cap"` Checkpoints Checkpoints `json:"checkpoints"` Debug *Debug `json:"debug"` Archive *Archive `json:"archive"` Subscriptions *Subscriptions `json:"nats-subscriptions"` } // Keys is the global metricstore configuration instance. // // Initialized with defaults, then overwritten by cc-backend's config.json. // Accessed by Init(), Checkpointing(), and other lifecycle functions. var Keys MetricStoreConfig = MetricStoreConfig{ Checkpoints: Checkpoints{ FileFormat: "avro", RootDir: "./var/checkpoints", }, } // AggregationStrategy defines how to combine metric values across hierarchy levels. // // Used when transforming data from finer-grained scopes (e.g., core) to coarser scopes // (e.g., socket). This is SPATIAL aggregation, not TEMPORAL (time-based) aggregation. 
//
// Values:
//   - NoAggregation: Do not aggregate (incompatible scopes or non-aggregatable metrics)
//   - SumAggregation: Add values (e.g., power: sum core power → socket power)
//   - AvgAggregation: Average values (e.g., temperature: average core temps → socket temp)
type AggregationStrategy int

const (
	// NoAggregation means: do not aggregate. It is the zero value.
	NoAggregation AggregationStrategy = iota
	// SumAggregation means: sum values (e.g., power, energy).
	SumAggregation
	// AvgAggregation means: average values (e.g., temperature, utilization).
	AvgAggregation
)

// AssignAggregationStrategy parses a string into an AggregationStrategy value.
//
// Used when loading metric configurations from JSON/YAML files.
//
// Parameters:
//   - str: "sum", "avg", or "" (empty string for NoAggregation)
//
// Returns:
//   - AggregationStrategy: Parsed value; NoAggregation when an error is returned
//   - error: Non-nil if str is unrecognized
func AssignAggregationStrategy(str string) (AggregationStrategy, error) {
	strategy := NoAggregation

	switch str {
	case "":
		// An empty string selects NoAggregation, which is already set.
	case "sum":
		strategy = SumAggregation
	case "avg":
		strategy = AvgAggregation
	default:
		return NoAggregation, fmt.Errorf("[METRICSTORE]> unknown aggregation strategy: %s", str)
	}

	return strategy, nil
}

// MetricConfig defines configuration for a single metric type.
//
// Stored in the global Metrics map, keyed by metric name (e.g., "cpu_load").
type MetricConfig struct {
	// Frequency is the interval in seconds at which measurements are stored
	// (e.g., 60 for 1-minute granularity).
	Frequency int64

	// Aggregation can be 'sum', 'avg' or null. It describes how to aggregate
	// metrics from the same timestep over the hierarchy.
	Aggregation AggregationStrategy

	// offset is private and used internally: the index into the
	// Level.metrics slice (assigned during Init).
	offset int
}

// Metrics is the global map of metric configurations.
//
// Keyed by metric name (e.g., "cpu_load", "mem_used"). Populated during Init()
// from cluster configuration and checkpoint restoration.
Each MetricConfig.offset // corresponds to the buffer slice index in Level.metrics. var Metrics map[string]MetricConfig // GetMetricFrequency retrieves the measurement interval for a metric. // // Parameters: // - metricName: Metric name (e.g., "cpu_load") // // Returns: // - int64: Frequency in seconds // - error: Non-nil if metric not found in Metrics map func GetMetricFrequency(metricName string) (int64, error) { if metric, ok := Metrics[metricName]; ok { return metric.Frequency, nil } return 0, fmt.Errorf("[METRICSTORE]> metric %s not found", metricName) } // AddMetric registers a new metric or updates an existing one. // // If the metric already exists with a different frequency, uses the higher frequency // (finer granularity). This handles cases where different clusters report the same // metric at different intervals. // // Parameters: // - name: Metric name (e.g., "cpu_load") // - metric: Configuration (frequency, aggregation strategy) // // Returns: // - error: Always nil (signature for future error handling) func AddMetric(name string, metric MetricConfig) error { if Metrics == nil { Metrics = make(map[string]MetricConfig, 0) } if existingMetric, ok := Metrics[name]; ok { if existingMetric.Frequency != metric.Frequency { if existingMetric.Frequency < metric.Frequency { existingMetric.Frequency = metric.Frequency Metrics[name] = existingMetric } } } else { Metrics[name] = metric } return nil }