mirror of
https://github.com/ClusterCockpit/cc-backend
synced 2026-01-16 09:41:47 +01:00
Add documentation
This commit is contained in:
@@ -3,6 +3,41 @@
|
||||
// Use of this source code is governed by a MIT-style
|
||||
// license that can be found in the LICENSE file.
|
||||
|
||||
// Package metricstore provides metric storage; this file (level.go) implements its hierarchical tree structure.
|
||||
//
|
||||
// # Level Architecture
|
||||
//
|
||||
// The Level type forms a tree structure where each node represents a level in the
|
||||
// ClusterCockpit hierarchy: cluster → host → socket → core → hwthread, with special
|
||||
// nodes for memory domains and accelerators.
|
||||
//
|
||||
// Structure:
|
||||
//
|
||||
// Root Level (cluster="emmy")
|
||||
// ├─ Level (host="node001")
|
||||
// │ ├─ Level (socket="0")
|
||||
// │ │ ├─ Level (core="0") [stores cpu0 metrics]
|
||||
// │ │ └─ Level (core="1") [stores cpu1 metrics]
|
||||
// │ └─ Level (socket="1")
|
||||
// │ └─ ...
|
||||
// └─ Level (host="node002")
|
||||
// └─ ...
|
||||
//
|
||||
// Each Level can:
|
||||
// - Hold data (metrics slice of buffer pointers)
|
||||
// - Have child nodes (children map[string]*Level)
|
||||
// - Both simultaneously (inner nodes can store aggregated metrics)
|
||||
//
|
||||
// # Selector Paths
|
||||
//
|
||||
// Selectors are hierarchical paths: []string{"cluster", "host", "component"}.
|
||||
// Example: []string{"emmy", "node001", "cpu0"} navigates to the cpu0 core level.
|
||||
//
|
||||
// # Concurrency
|
||||
//
|
||||
// An RWMutex protects the children map and the metrics slice. The read-heavy workload (metric reads)
|
||||
// uses RLock. Writes (new levels, buffer updates) use Lock. Double-checked locking
|
||||
// prevents races during level creation.
|
||||
package metricstore
|
||||
|
||||
import (
|
||||
@@ -12,20 +47,40 @@ import (
|
||||
"github.com/ClusterCockpit/cc-lib/v2/util"
|
||||
)
|
||||
|
||||
// Could also be called "node" as this forms a node in a tree structure.
|
||||
// Called Level because "node" might be confusing here.
|
||||
// Can be either a leaf or an inner node. In this tree structure, inner nodes can
|
||||
// also hold data (in `metrics`).
|
||||
// Level represents a node in the hierarchical metric storage tree.
|
||||
//
|
||||
// Can be either a leaf or an inner node. Inner nodes hold data in 'metrics' for aggregated
|
||||
// values (e.g., socket-level metrics derived from core-level data). Named "Level"
|
||||
// instead of "node" to avoid confusion with cluster nodes (hosts).
|
||||
//
|
||||
// Fields:
|
||||
// - children: Map of child level names to Level pointers (e.g., "cpu0" → Level)
|
||||
// - metrics: Slice of buffer pointers (one per metric, indexed by MetricConfig.offset)
|
||||
// - lock: RWMutex for concurrent access (read-heavy, write-rare)
|
||||
type Level struct {
|
||||
children map[string]*Level
|
||||
metrics []*buffer
|
||||
lock sync.RWMutex
|
||||
}
|
||||
|
||||
// Find the correct level for the given selector, creating it if
|
||||
// it does not exist. Example selector in the context of the
|
||||
// ClusterCockpit could be: []string{ "emmy", "host123", "cpu0" }.
|
||||
// This function would probably benefit a lot from `level.children` being a `sync.Map`?
|
||||
// findLevelOrCreate navigates to or creates the level specified by selector.
|
||||
//
|
||||
// Recursively descends the tree, creating missing levels as needed. Uses double-checked
|
||||
// locking: RLock first (fast path), then Lock if creation needed (slow path), then
|
||||
// re-check after acquiring Lock to handle races.
|
||||
//
|
||||
// Example selector: []string{"emmy", "node001", "cpu0"}
|
||||
// Navigates: root → emmy → node001 → cpu0, creating levels as needed.
|
||||
//
|
||||
// Parameters:
|
||||
// - selector: Hierarchical path (consumed recursively, decreasing depth)
|
||||
// - nMetrics: Number of metric slots to allocate in new levels
|
||||
//
|
||||
// Returns:
|
||||
// - *Level: The target level (existing or newly created)
|
||||
//
|
||||
// Note: sync.Map may improve performance for high-concurrency writes, but current
|
||||
// approach suffices for read-heavy workload.
|
||||
func (l *Level) findLevelOrCreate(selector []string, nMetrics int) *Level {
|
||||
if len(selector) == 0 {
|
||||
return l
|
||||
@@ -72,6 +127,22 @@ func (l *Level) findLevelOrCreate(selector []string, nMetrics int) *Level {
|
||||
return child.findLevelOrCreate(selector[1:], nMetrics)
|
||||
}
|
||||
|
||||
// collectPaths gathers all selector paths at the specified depth in the tree.
|
||||
//
|
||||
// Recursively traverses children, collecting paths when currentDepth+1 == targetDepth.
|
||||
// Each path is a selector that can be used with findLevel() or findBuffers().
|
||||
//
|
||||
// Explicitly copies slices to avoid shared underlying arrays between siblings, preventing
|
||||
// unintended mutations.
|
||||
//
|
||||
// Parameters:
|
||||
// - currentDepth: Depth of current level (0 = root)
|
||||
// - targetDepth: Depth to collect paths from
|
||||
// - currentPath: Path accumulated so far
|
||||
// - results: Output slice (appended to)
|
||||
//
|
||||
// Example: collectPaths(0, 2, []string{}, &results) collects all 2-level paths
|
||||
// like []string{"emmy", "node001"}, []string{"emmy", "node002"}, etc.
|
||||
func (l *Level) collectPaths(currentDepth, targetDepth int, currentPath []string, results *[][]string) {
|
||||
l.lock.RLock()
|
||||
defer l.lock.RUnlock()
|
||||
@@ -95,6 +166,18 @@ func (l *Level) collectPaths(currentDepth, targetDepth int, currentPath []string
|
||||
}
|
||||
}
|
||||
|
||||
// free removes buffers older than the retention threshold from the entire subtree.
|
||||
//
|
||||
// Recursively frees buffers in this level's metrics and all child levels. Buffers
|
||||
// with standard capacity (BufferCap) are returned to the pool. Called by the
|
||||
// retention worker to enforce retention policies.
|
||||
//
|
||||
// Parameters:
|
||||
// - t: Retention threshold timestamp (Unix seconds)
|
||||
//
|
||||
// Returns:
|
||||
// - int: Total number of buffers freed in this subtree
|
||||
// - error: Non-nil on failure (propagated from children)
|
||||
func (l *Level) free(t int64) (int, error) {
|
||||
l.lock.Lock()
|
||||
defer l.lock.Unlock()
|
||||
@@ -124,6 +207,17 @@ func (l *Level) free(t int64) (int, error) {
|
||||
return n, nil
|
||||
}
|
||||
|
||||
// forceFree removes the oldest buffer from each metric chain in the subtree.
|
||||
//
|
||||
// Unlike free(), which removes based on time threshold, this unconditionally removes
|
||||
// the oldest buffer in each chain. Used by MemoryUsageTracker when memory cap is
|
||||
// exceeded and time-based retention is insufficient.
|
||||
//
|
||||
// Recursively processes current level's metrics and all child levels.
|
||||
//
|
||||
// Returns:
|
||||
// - int: Total number of buffers freed in this subtree
|
||||
// - error: Non-nil on failure (propagated from children)
|
||||
func (l *Level) forceFree() (int, error) {
|
||||
l.lock.Lock()
|
||||
defer l.lock.Unlock()
|
||||
@@ -160,6 +254,14 @@ func (l *Level) forceFree() (int, error) {
|
||||
return n, nil
|
||||
}
|
||||
|
||||
// sizeInBytes calculates the total memory usage of all buffers in the subtree.
|
||||
//
|
||||
// Recursively sums buffer data sizes (count of Float values × sizeof(Float)) across
|
||||
// this level's metrics and all child levels. Used by MemoryUsageTracker to enforce
|
||||
// memory cap limits.
|
||||
//
|
||||
// Returns:
|
||||
// - int64: Total bytes used by buffer data in this subtree
|
||||
func (l *Level) sizeInBytes() int64 {
|
||||
l.lock.RLock()
|
||||
defer l.lock.RUnlock()
|
||||
@@ -178,6 +280,16 @@ func (l *Level) sizeInBytes() int64 {
|
||||
return size
|
||||
}
|
||||
|
||||
// findLevel navigates to the level specified by selector, returning nil if not found.
|
||||
//
|
||||
// Read-only variant of findLevelOrCreate. Does not create missing levels.
|
||||
// Recursively descends the tree following the selector path.
|
||||
//
|
||||
// Parameters:
|
||||
// - selector: Hierarchical path (e.g., []string{"emmy", "node001", "cpu0"})
|
||||
//
|
||||
// Returns:
|
||||
// - *Level: The target level, or nil if any component in the path does not exist
|
||||
func (l *Level) findLevel(selector []string) *Level {
|
||||
if len(selector) == 0 {
|
||||
return l
|
||||
@@ -194,6 +306,28 @@ func (l *Level) findLevel(selector []string) *Level {
|
||||
return lvl.findLevel(selector[1:])
|
||||
}
|
||||
|
||||
// findBuffers invokes callback on all buffers matching the selector pattern.
|
||||
//
|
||||
// Supports flexible selector patterns (from cc-lib/util.Selector):
|
||||
// - Exact match: Selector element with String set (e.g., "node001")
|
||||
// - Group match: Selector element with Group set (e.g., ["cpu0", "cpu2", "cpu4"])
|
||||
// - Wildcard: Selector element with Any=true (matches all children)
|
||||
//
|
||||
// Empty selector (len==0) matches current level's buffer at 'offset' and recursively
|
||||
// all descendant buffers at the same offset (used for aggregation queries).
|
||||
//
|
||||
// Parameters:
|
||||
// - selector: Pattern to match (consumed recursively)
|
||||
// - offset: Metric index in metrics slice (from MetricConfig.offset)
|
||||
// - f: Callback invoked on each matching buffer
|
||||
//
|
||||
// Returns:
|
||||
// - error: First error returned by callback, or nil if all succeeded
|
||||
//
|
||||
// Example:
|
||||
//
|
||||
// // Find all cpu0 buffers across all hosts:
|
||||
// l.findBuffers(util.Selector{{Any: true}, {String: "cpu0"}}, metricOffset, callback)
|
||||
func (l *Level) findBuffers(selector util.Selector, offset int, f func(b *buffer) error) error {
|
||||
l.lock.RLock()
|
||||
defer l.lock.RUnlock()
|
||||
|
||||
Reference in New Issue
Block a user