Add documentation

2026-01-16 09:41:47 +01:00 · 2026-01-16 08:27:46 +01:00
parent 93dcfee8c5
commit 9a97d0e8eb
6 changed files with 782 additions and 111 deletions
--- a/internal/metricstore/level.go
+++ b/internal/metricstore/level.go
@@ -3,6 +3,41 @@
 // Use of this source code is governed by a MIT-style
 // license that can be found in the LICENSE file.

+// Package metricstore: this file (level.go) provides the hierarchical tree structure for metric storage.
+//
+// # Level Architecture
+//
+// The Level type forms a tree structure where each node represents a level in the
+// ClusterCockpit hierarchy: cluster → host → socket → core → hwthread, with special
+// nodes for memory domains and accelerators.
+//
+// Structure:
+//
+//	Root Level (cluster="emmy")
+//	├─ Level (host="node001")
+//	│  ├─ Level (socket="0")
+//	│  │  ├─ Level (core="0") [stores cpu0 metrics]
+//	│  │  └─ Level (core="1") [stores cpu1 metrics]
+//	│  └─ Level (socket="1")
+//	│     └─ ...
+//	└─ Level (host="node002")
+//	   └─ ...
+//
+// Each Level can:
+//   - Hold data (metrics slice of buffer pointers)
+//   - Have child nodes (children map[string]*Level)
+//   - Both simultaneously (inner nodes can store aggregated metrics)
+//
+// # Selector Paths
+//
+// Selectors are hierarchical paths: []string{"cluster", "host", "component"}.
+// Example: []string{"emmy", "node001", "cpu0"} navigates to the cpu0 core level.
+//
+// # Concurrency
+//
+// RWMutex protects children map and metrics slice. Read-heavy workload (metric reads)
+// uses RLock. Writes (new levels, buffer updates) use Lock. Double-checked locking
+// prevents races during level creation.
 package metricstore

 import (
@@ -12,20 +47,40 @@ import (
 	"github.com/ClusterCockpit/cc-lib/v2/util"
 )

-// Could also be called "node" as this forms a node in a tree structure.
-// Called Level because "node" might be confusing here.
-// Can be both a leaf or a inner node. In this tree structue, inner nodes can
-// also hold data (in `metrics`).
+// Level represents a node in the hierarchical metric storage tree.
+//
+// Can be either a leaf or an inner node. Inner nodes hold data in 'metrics' for aggregated
+// values (e.g., socket-level metrics derived from core-level data). Named "Level"
+// instead of "node" to avoid confusion with cluster nodes (hosts).
+//
+// Fields:
+//   - children: Map of child level names to Level pointers (e.g., "cpu0" → Level)
+//   - metrics:  Slice of buffer pointers (one per metric, indexed by MetricConfig.offset)
+//   - lock:     RWMutex for concurrent access (read-heavy, write-rare)
 type Level struct {
 	children map[string]*Level
 	metrics  []*buffer
 	lock     sync.RWMutex
 }

-// Find the correct level for the given selector, creating it if
-// it does not exist. Example selector in the context of the
-// ClusterCockpit could be: []string{ "emmy", "host123", "cpu0" }.
-// This function would probably benefit a lot from `level.children` beeing a `sync.Map`?
+// findLevelOrCreate navigates to or creates the level specified by selector.
+//
+// Recursively descends the tree, creating missing levels as needed. Uses double-checked
+// locking: RLock first (fast path), then Lock if creation needed (slow path), then
+// re-check after acquiring Lock to handle races.
+//
+// Example selector: []string{"emmy", "node001", "cpu0"}
+// Navigates: root → emmy → node001 → cpu0, creating levels as needed.
+//
+// Parameters:
+//   - selector: Hierarchical path (consumed recursively, decreasing depth)
+//   - nMetrics: Number of metric slots to allocate in new levels
+//
+// Returns:
+//   - *Level: The target level (existing or newly created)
+//
+// Note: sync.Map may improve performance for high-concurrency writes, but the current
+// approach suffices for a read-heavy workload.
 func (l *Level) findLevelOrCreate(selector []string, nMetrics int) *Level {
 	if len(selector) == 0 {
 		return l
@@ -72,6 +127,22 @@ func (l *Level) findLevelOrCreate(selector []string, nMetrics int) *Level {
 	return child.findLevelOrCreate(selector[1:], nMetrics)
 }

+// collectPaths gathers all selector paths at the specified depth in the tree.
+//
+// Recursively traverses children, collecting paths when currentDepth+1 == targetDepth.
+// Each path is a []string selector usable with findLevel(); findBuffers() takes a util.Selector.
+//
+// Explicitly copies slices to avoid shared underlying arrays between siblings, preventing
+// unintended mutations.
+//
+// Parameters:
+//   - currentDepth: Depth of current level (0 = root)
+//   - targetDepth:  Depth to collect paths from
+//   - currentPath:  Path accumulated so far
+//   - results:      Output slice (appended to)
+//
+// Example: collectPaths(0, 2, []string{}, &results) collects all 2-level paths
+// like []string{"emmy", "node001"}, []string{"emmy", "node002"}, etc.
 func (l *Level) collectPaths(currentDepth, targetDepth int, currentPath []string, results *[][]string) {
 	l.lock.RLock()
 	defer l.lock.RUnlock()
@@ -95,6 +166,18 @@ func (l *Level) collectPaths(currentDepth, targetDepth int, currentPath []string
 	}
 }

+// free removes buffers older than the retention threshold from the entire subtree.
+//
+// Recursively frees buffers in this level's metrics and all child levels. Buffers
+// with standard capacity (BufferCap) are returned to the pool. Called by the
+// retention worker to enforce retention policies.
+//
+// Parameters:
+//   - t: Retention threshold timestamp (Unix seconds)
+//
+// Returns:
+//   - int:   Total number of buffers freed in this subtree
+//   - error: Non-nil on failure (propagated from children)
 func (l *Level) free(t int64) (int, error) {
 	l.lock.Lock()
 	defer l.lock.Unlock()
@@ -124,6 +207,17 @@ func (l *Level) free(t int64) (int, error) {
 	return n, nil
 }

+// forceFree removes the oldest buffer from each metric chain in the subtree.
+//
+// Unlike free(), which removes based on time threshold, this unconditionally removes
+// the oldest buffer in each chain. Used by MemoryUsageTracker when memory cap is
+// exceeded and time-based retention is insufficient.
+//
+// Recursively processes current level's metrics and all child levels.
+//
+// Returns:
+//   - int:   Total number of buffers freed in this subtree
+//   - error: Non-nil on failure (propagated from children)
 func (l *Level) forceFree() (int, error) {
 	l.lock.Lock()
 	defer l.lock.Unlock()
@@ -160,6 +254,14 @@ func (l *Level) forceFree() (int, error) {
 	return n, nil
 }

+// sizeInBytes calculates the total memory usage of all buffers in the subtree.
+//
+// Recursively sums buffer data sizes (count of Float values × sizeof(Float)) across
+// this level's metrics and all child levels. Used by MemoryUsageTracker to enforce
+// memory cap limits.
+//
+// Returns:
+//   - int64: Total bytes used by buffer data in this subtree
 func (l *Level) sizeInBytes() int64 {
 	l.lock.RLock()
 	defer l.lock.RUnlock()
@@ -178,6 +280,16 @@ func (l *Level) sizeInBytes() int64 {
 	return size
 }

+// findLevel navigates to the level specified by selector, returning nil if not found.
+//
+// Read-only variant of findLevelOrCreate. Does not create missing levels.
+// Recursively descends the tree following the selector path.
+//
+// Parameters:
+//   - selector: Hierarchical path (e.g., []string{"emmy", "node001", "cpu0"})
+//
+// Returns:
+//   - *Level: The target level, or nil if any component in the path does not exist
 func (l *Level) findLevel(selector []string) *Level {
 	if len(selector) == 0 {
 		return l
@@ -194,6 +306,28 @@ func (l *Level) findLevel(selector []string) *Level {
 	return lvl.findLevel(selector[1:])
 }

+// findBuffers invokes callback on all buffers matching the selector pattern.
+//
+// Supports flexible selector patterns (from cc-lib/util.Selector):
+//   - Exact match: Selector element with String set (e.g., "node001")
+//   - Group match: Selector element with Group set (e.g., ["cpu0", "cpu2", "cpu4"])
+//   - Wildcard:    Selector element with Any=true (matches all children)
+//
+// Empty selector (len==0) matches current level's buffer at 'offset' and recursively
+// all descendant buffers at the same offset (used for aggregation queries).
+//
+// Parameters:
+//   - selector: Pattern to match (consumed recursively)
+//   - offset:   Metric index in metrics slice (from MetricConfig.offset)
+//   - f:        Callback invoked on each matching buffer
+//
+// Returns:
+//   - error: First error returned by callback, or nil if all succeeded
+//
+// Example:
+//
+//	// Find all cpu0 buffers across all hosts:
+//	l.findBuffers(util.Selector{{Any: true}, {String: "cpu0"}}, metricOffset, callback)
 func (l *Level) findBuffers(selector util.Selector, offset int, f func(b *buffer) error) error {
 	l.lock.RLock()
 	defer l.lock.RUnlock()