Mirror of https://github.com/ClusterCockpit/cc-backend, synced 2026-02-11 13:31:45 +01:00
Add monitoring healthstate support in nodestate API.
@@ -7,11 +7,14 @@ package api
import (
	"fmt"
	"maps"
	"net/http"
	"strings"
	"time"

	"github.com/ClusterCockpit/cc-backend/internal/repository"
	"github.com/ClusterCockpit/cc-backend/pkg/archive"
	"github.com/ClusterCockpit/cc-backend/pkg/metricstore"
	"github.com/ClusterCockpit/cc-lib/v2/schema"
)
@@ -20,6 +23,15 @@ type UpdateNodeStatesRequest struct {
	Cluster string `json:"cluster" example:"fritz"`
}

// metricListToNames converts a map of metric configurations to a list of metric names.
func metricListToNames(metricList map[string]*schema.Metric) []string {
	names := make([]string, 0, len(metricList))
	for name := range metricList {
		names = append(names, name)
	}
	return names
}

// determineState assumes that only one of the relevant scheduler states exists per node.
func determineState(states []string) schema.SchedulerState {
	for _, state := range states {
@@ -62,18 +74,42 @@ func (api *RestAPI) updateNodeStates(rw http.ResponseWriter, r *http.Request) {
		http.StatusBadRequest, rw)
		return
	}
-	repo := repository.GetNodeRepository()
	requestReceived := time.Now().Unix()
+	repo := repository.GetNodeRepository()
+	ms := metricstore.GetMemoryStore()

	m := make(map[string][]string)
	healthStates := make(map[string]metricstore.NodeHealthState)

	for _, node := range req.Nodes {
		if sc, err := archive.GetSubClusterByNode(req.Cluster, node.Hostname); err == nil {
			m[sc] = append(m[sc], node.Hostname)
		}
	}

	for sc, nl := range m {
		if sc != "" {
			metricList := archive.GetMetricConfigSubCluster(req.Cluster, sc)
			metricNames := metricListToNames(metricList)
			if states, err := ms.HealthCheckAlt(req.Cluster, nl, metricNames); err == nil {
				maps.Copy(healthStates, states)
			}
		}
	}

	for _, node := range req.Nodes {
		state := determineState(node.States)
		healthState := schema.MonitoringStateFull
		if hs, ok := healthStates[node.Hostname]; ok {
			healthState = hs.Status
		}
		nodeState := schema.NodeStateDB{
			TimeStamp:       requestReceived,
			NodeState:       state,
			CpusAllocated:   node.CpusAllocated,
			MemoryAllocated: node.MemoryAllocated,
			GpusAllocated:   node.GpusAllocated,
-			HealthState:     schema.MonitoringStateFull,
+			HealthState:     healthState,
			JobsRunning:     node.JobsRunning,
		}
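For context, a sketch of a request body this handler might receive. The shape is hypothetical: only the "cluster" JSON tag appears in this diff, and every per-node field name below is an assumption inferred from the fields the handler reads (Hostname, States, CpusAllocated, MemoryAllocated, GpusAllocated, JobsRunning).

// Hypothetical request body for the node-state update endpoint handled above.
// Field names for the node entries are assumptions, not taken from the source.
package main

import "fmt"

func main() {
	payload := `{
  "cluster": "fritz",
  "nodes": [
    {
      "hostname": "node001",
      "states": ["allocated"],
      "cpusAllocated": 64,
      "memoryAllocated": 256000,
      "gpusAllocated": 0,
      "jobsRunning": 2
    }
  ]
}`
	fmt.Println(payload)
}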
@@ -47,6 +47,45 @@ func (b *buffer) healthCheck() bool {
	return false
}

// healthCheck recursively examines a level and all its children to identify stale or missing metrics.
//
// This routine performs a two-phase check:
//
// Phase 1 - Check metrics at the current level (node-level metrics):
//   - Iterates through all configured metrics in m.Metrics
//   - For each metric, checks whether a buffer exists at l.metrics[mc.offset]
//   - If the buffer exists: calls buffer.healthCheck() to verify data freshness
//   - Stale buffer (data older than MaxMissingDataPoints * frequency) → StaleNodeMetricList
//   - Fresh buffer → healthy, no action
//   - If the buffer is nil: the metric was never written → MissingNodeMetricList
//
// Phase 2 - Recursively check child levels (hardware-level metrics):
//   - Iterates through l.children (e.g., "cpu0", "gpu0", "socket0")
//   - Recursively calls healthCheck() on each child level
//   - Aggregates child results into hardware-specific lists:
//   - Child's StaleNodeMetricList → parent's StaleHardwareMetricList[childName]
//   - Child's MissingNodeMetricList → parent's MissingHardwareMetricList[childName]
//
// The recursive nature means:
//   - Calling on a host level checks host metrics plus all CPU/GPU/socket metrics
//   - Calling on a socket level checks socket metrics plus all core metrics
//   - Leaf levels (e.g., individual cores) only check their own metrics
//
// Parameters:
//   - m: MemoryStore containing the global metric configuration (m.Metrics)
//
// Returns:
//   - List: Categorized lists of stale and missing metrics at this level and below
//   - error: Non-nil only for internal errors during recursion
//
// Concurrency:
//   - Acquires a read lock (RLock) to safely access l.metrics and l.children
//   - The lock is held for the entire duration, including recursive calls
//
// Example for a host level with structure host → [cpu0, cpu1]:
//   - Checks host-level metrics (load, memory) → StaleNodeMetricList / MissingNodeMetricList
//   - Recursively checks cpu0 metrics → results in StaleHardwareMetricList["cpu0"]
//   - Recursively checks cpu1 metrics → results in StaleHardwareMetricList["cpu1"]
func (l *Level) healthCheck(m *MemoryStore) (List, error) {
	l.lock.RLock()
	defer l.lock.RUnlock()
@@ -58,6 +97,7 @@ func (l *Level) healthCheck(m *MemoryStore) (List, error) {
		MissingHardwareMetricList: make(map[string][]string, 0),
	}

	// Phase 1: Check metrics at this level
	for metricName, mc := range m.Metrics {
		if b := l.metrics[mc.offset]; b != nil {
			if b.healthCheck() {
@@ -68,6 +108,7 @@ func (l *Level) healthCheck(m *MemoryStore) (List, error) {
		}
	}

	// Phase 2: Recursively check child levels (hardware components)
	for hardwareMetricName, lvl := range l.children {
		l, err := lvl.healthCheck(m)
		if err != nil {
@@ -85,6 +126,48 @@ func (l *Level) healthCheck(m *MemoryStore) (List, error) {
	return list, nil
}

// HealthCheck performs a health check on a specific node in the metric store.
//
// This routine checks whether metrics for a given node are being received and are up to date.
// It examines both node-level metrics (e.g., load, memory) and hardware-level metrics
// (e.g., CPU, GPU, network) to determine the monitoring state.
//
// Parameters:
//   - selector: Hierarchical path to the target node, typically []string{cluster, hostname}.
//     Example: []string{"emmy", "node001"} navigates to the "node001" host in the "emmy" cluster.
//     The selector must match the hierarchy used during metric ingestion (see Level.findLevelOrCreate).
//   - subcluster: Subcluster name (currently unused, reserved for future filtering)
//
// Returns:
//   - *HeathCheckResponse: Health status with detailed lists of stale/missing metrics
//   - error: Non-nil only for internal errors (not for unhealthy nodes)
//
// Health States:
//   - MonitoringStateFull: All expected metrics are present and up to date
//   - MonitoringStatePartial: Some metrics are stale (data older than MaxMissingDataPoints * frequency)
//   - MonitoringStateFailed: Host not found, or metrics are completely missing
//
// The response includes detailed lists:
//   - StaleNodeMetricList: Node-level metrics with stale data
//   - StaleHardwareMetricList: Hardware-level metrics with stale data (grouped by component)
//   - MissingNodeMetricList: Expected node-level metrics that have no data
//   - MissingHardwareMetricList: Expected hardware-level metrics that have no data (grouped by component)
//
// Example usage:
//
//	selector := []string{"emmy", "node001"}
//	response, err := ms.HealthCheck(selector, "")
//	if err != nil {
//		// Internal error
//	}
//	switch response.Status {
//	case schema.MonitoringStateFull:
//		// All metrics healthy
//	case schema.MonitoringStatePartial:
//		// Check response.list.StaleNodeMetricList for details
//	case schema.MonitoringStateFailed:
//		// Check response.Error or response.list.MissingNodeMetricList
//	}
func (m *MemoryStore) HealthCheck(selector []string, subcluster string) (*HeathCheckResponse, error) {
	response := HeathCheckResponse{
		Status: schema.MonitoringStateFull,
@@ -120,3 +203,276 @@ func (m *MemoryStore) HealthCheck(selector []string, subcluster string) (*HeathC
	return &response, nil
}

// isBufferHealthy checks whether a buffer has received data within the last MaxMissingDataPoints sample intervals.
//
// Returns true if the buffer is healthy (recent data within the threshold), false otherwise.
// A nil buffer or an empty buffer is considered unhealthy.
func (b *buffer) isBufferHealthy() bool {
	// Check if the buffer is empty
	if b == nil || b.data == nil {
		return false
	}

	bufferEnd := b.start + b.frequency*int64(len(b.data))
	t := time.Now().Unix()

	// Check if the buffer has recent data (within the MaxMissingDataPoints threshold)
	if t-bufferEnd > MaxMissingDataPoints*b.frequency {
		return false
	}

	return true
}
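Since the freshness rule in isBufferHealthy is just an arithmetic comparison, here is a minimal, self-contained sketch of it with illustrative numbers; MaxMissingDataPoints = 5 is an assumption taken from the "default: 5 data points" note in GetHealthyMetrics' documentation below.

// Standalone sketch of the staleness rule used by isBufferHealthy above, with
// illustrative numbers only; MaxMissingDataPoints = 5 is assumed.
package main

import "fmt"

func main() {
	const maxMissingDataPoints = int64(5)
	const frequency = int64(10) // seconds between samples

	now := int64(1_000_000)
	bufferEnd := now - 60 // the last stored sample ended 60 seconds ago

	stale := now-bufferEnd > maxMissingDataPoints*frequency
	fmt.Println(stale) // true: 60 > 5*10, so the buffer counts as unhealthy
}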
// countMissingValues counts the number of NaN (missing) values in the most recent data points.
//
// Examines the last MaxMissingDataPoints*2 values in the buffer and counts how many are NaN.
// We check twice the threshold to allow detecting when more than MaxMissingDataPoints are missing.
// If the buffer has fewer values, all available values are examined.
//
// Returns:
//   - int: Number of NaN values found in the examined range
func (b *buffer) countMissingValues() int {
	if b == nil || b.data == nil || len(b.data) == 0 {
		return 0
	}

	// Check twice the threshold to detect degraded metrics
	checkCount := min(int(MaxMissingDataPoints)*2, len(b.data))

	// Count NaN values in the most recent data points
	missingCount := 0
	startIdx := len(b.data) - checkCount
	for i := startIdx; i < len(b.data); i++ {
		if b.data[i].IsNaN() {
			missingCount++
		}
	}

	return missingCount
}
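A small self-contained sketch of how countMissingValues feeds the degraded classification used by getHealthyMetrics below: inspect the last MaxMissingDataPoints*2 samples and flag the metric once more than MaxMissingDataPoints of them are NaN. MaxMissingDataPoints = 5 is assumed, and schema.Float is replaced by plain float64 to keep the example standalone.

// Sketch of the degraded-metric rule; constants and types are stand-ins.
package main

import (
	"fmt"
	"math"
)

// countMissing mirrors countMissingValues above: count NaNs in the most
// recent checkCount samples, where checkCount is twice the threshold.
func countMissing(data []float64, maxMissing int) int {
	checkCount := min(maxMissing*2, len(data))
	missing := 0
	for _, v := range data[len(data)-checkCount:] {
		if math.IsNaN(v) {
			missing++
		}
	}
	return missing
}

func main() {
	const maxMissing = 5
	nan := math.NaN()
	// 5 good samples followed by 6 NaNs, mirroring node002 in the test below.
	data := []float64{1, 1, 1, 1, 1, nan, nan, nan, nan, nan, nan}

	missing := countMissing(data, maxMissing)
	fmt.Println(missing, missing > maxMissing) // 6 true → the metric counts as degraded
}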
// getHealthyMetrics recursively collects healthy and degraded metrics at this level and below.
|
||||
//
|
||||
// A metric is considered:
|
||||
// - Healthy: buffer has recent data within MaxMissingDataPoints threshold AND has few/no NaN values
|
||||
// - Degraded: buffer exists and has recent data, but contains more than MaxMissingDataPoints NaN values
|
||||
//
|
||||
// This routine walks the entire subtree starting from the current level.
|
||||
//
|
||||
// Parameters:
|
||||
// - m: MemoryStore containing the global metric configuration
|
||||
//
|
||||
// Returns:
|
||||
// - []string: Flat list of healthy metric names from this level and all children
|
||||
// - []string: Flat list of degraded metric names (exist but have too many missing values)
|
||||
// - error: Non-nil only for internal errors during recursion
|
||||
//
|
||||
// The routine mirrors healthCheck() but provides more granular classification:
|
||||
// - healthCheck() finds problems (stale/missing)
|
||||
// - getHealthyMetrics() separates healthy from degraded metrics
|
||||
func (l *Level) getHealthyMetrics(m *MemoryStore) ([]string, []string, error) {
|
||||
l.lock.RLock()
|
||||
defer l.lock.RUnlock()
|
||||
|
||||
healthyList := make([]string, 0)
|
||||
degradedList := make([]string, 0)
|
||||
|
||||
// Phase 1: Check metrics at this level
|
||||
for metricName, mc := range m.Metrics {
|
||||
b := l.metrics[mc.offset]
|
||||
if b.isBufferHealthy() {
|
||||
// Buffer has recent data, now check for missing values
|
||||
missingCount := b.countMissingValues()
|
||||
if missingCount > int(MaxMissingDataPoints) {
|
||||
degradedList = append(degradedList, metricName)
|
||||
} else {
|
||||
healthyList = append(healthyList, metricName)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Phase 2: Recursively check child levels (hardware components)
|
||||
for _, lvl := range l.children {
|
||||
childHealthy, childDegraded, err := lvl.getHealthyMetrics(m)
|
||||
if err != nil {
|
||||
return nil, nil, err
|
||||
}
|
||||
|
||||
// Merge child metrics into flat lists
|
||||
healthyList = append(healthyList, childHealthy...)
|
||||
degradedList = append(degradedList, childDegraded...)
|
||||
}
|
||||
|
||||
return healthyList, degradedList, nil
|
||||
}
|
||||
|
||||
// GetHealthyMetrics returns healthy and degraded metrics for a specific node as flat lists.
//
// This routine walks the metric tree starting from the specified node selector
// and collects all metrics that have received data within the last MaxMissingDataPoints
// intervals (default: 5 data points). Metrics are classified into two categories:
//
//   - Healthy: Buffer has recent data AND contains few or no NaN (missing) values
//   - Degraded: Buffer has recent data BUT contains more than MaxMissingDataPoints NaN values
//
// The returned lists include both node-level metrics (e.g., "load", "mem_used") and
// hardware-level metrics (e.g., "cpu_user", "gpu_temp") in flat slices.
//
// Parameters:
//   - selector: Hierarchical path to the target node, typically []string{cluster, hostname}.
//     Example: []string{"emmy", "node001"} navigates to the "node001" host in the "emmy" cluster.
//     The selector must match the hierarchy used during metric ingestion.
//
// Returns:
//   - []string: Flat list of healthy metric names (recent data, few missing values)
//   - []string: Flat list of degraded metric names (recent data, many missing values)
//   - error: Non-nil if the node is not found or an internal error occurs
//
// Example usage:
//
//	selector := []string{"emmy", "node001"}
//	healthyMetrics, degradedMetrics, err := ms.GetHealthyMetrics(selector)
//	if err != nil {
//		// Node not found or internal error
//		return err
//	}
//	fmt.Printf("Healthy metrics: %v\n", healthyMetrics)
//	// Output: ["load", "mem_used", "cpu_user", ...]
//	fmt.Printf("Degraded metrics: %v\n", degradedMetrics)
//	// Output: ["gpu_temp", "network_rx", ...] (metrics with many NaN values)
//
// Note: This routine provides more granular classification than HealthCheck:
//   - HealthCheck reports stale/missing metrics (problems)
//   - GetHealthyMetrics separates fully healthy from degraded metrics (quality levels)
func (m *MemoryStore) GetHealthyMetrics(selector []string) ([]string, []string, error) {
	lvl := m.root.findLevel(selector)
	if lvl == nil {
		return nil, nil, fmt.Errorf("[METRICSTORE]> error while GetHealthyMetrics, host not found: %#v", selector)
	}

	healthyList, degradedList, err := lvl.getHealthyMetrics(m)
	if err != nil {
		return nil, nil, err
	}

	return healthyList, degradedList, nil
}
// NodeHealthState represents the health status of a single node's metrics.
type NodeHealthState struct {
	Status          schema.MonitoringState // Overall health status: Full, Partial, or Failed
	HealthyMetrics  []string               // Metrics with recent data and few missing values
	DegradedMetrics []string               // Metrics with recent data but many missing values
	MissingMetrics  []string               // Expected metrics that are completely missing or stale
}

// HealthCheckAlt performs health checks on multiple nodes and returns their health states.
//
// This routine provides a batch health check interface that evaluates multiple nodes
// against a specific set of expected metrics. For each node, it determines which metrics
// are healthy, degraded, or missing, and assigns an overall health status.
//
// Health Status Classification:
//   - MonitoringStateFull: All expected metrics are healthy (recent data, few missing values)
//   - MonitoringStatePartial: Some metrics are degraded (many missing values) or missing
//   - MonitoringStateFailed: Node not found or all expected metrics are missing/stale
//
// Parameters:
//   - cluster: Cluster name (first element of the selector path)
//   - nodes: List of node hostnames to check
//   - expectedMetrics: List of metric names that should be present on each node
//
// Returns:
//   - map[string]NodeHealthState: Map keyed by hostname containing the health state for each node
//   - error: Non-nil only for internal errors (individual node failures are captured in NodeHealthState)
//
// Example usage:
//
//	cluster := "emmy"
//	nodes := []string{"node001", "node002", "node003"}
//	expectedMetrics := []string{"load", "mem_used", "cpu_user", "cpu_system"}
//	healthStates, err := ms.HealthCheckAlt(cluster, nodes, expectedMetrics)
//	if err != nil {
//		return err
//	}
//	for hostname, state := range healthStates {
//		fmt.Printf("Node %s: %s\n", hostname, state.Status)
//		fmt.Printf("  Healthy: %v\n", state.HealthyMetrics)
//		fmt.Printf("  Degraded: %v\n", state.DegradedMetrics)
//		fmt.Printf("  Missing: %v\n", state.MissingMetrics)
//	}
//
// Note: This routine is optimized for batch operations where you need to check
// the same set of metrics across multiple nodes. For single-node checks with
// all configured metrics, use HealthCheck() instead.
func (m *MemoryStore) HealthCheckAlt(cluster string,
	nodes []string, expectedMetrics []string,
) (map[string]NodeHealthState, error) {
	results := make(map[string]NodeHealthState, len(nodes))

	// Create a set of expected metrics for fast lookup
	expectedSet := make(map[string]bool, len(expectedMetrics))
	for _, metric := range expectedMetrics {
		expectedSet[metric] = true
	}

	// Check each node
	for _, hostname := range nodes {
		selector := []string{cluster, hostname}
		state := NodeHealthState{
			Status:          schema.MonitoringStateFull,
			HealthyMetrics:  make([]string, 0),
			DegradedMetrics: make([]string, 0),
			MissingMetrics:  make([]string, 0),
		}

		// Get healthy and degraded metrics for this node
		healthyList, degradedList, err := m.GetHealthyMetrics(selector)
		if err != nil {
			// Node not found or internal error
			state.Status = schema.MonitoringStateFailed
			state.MissingMetrics = expectedMetrics
			results[hostname] = state
			continue
		}

		// Create sets for fast lookup
		healthySet := make(map[string]bool, len(healthyList))
		for _, metric := range healthyList {
			healthySet[metric] = true
		}
		degradedSet := make(map[string]bool, len(degradedList))
		for _, metric := range degradedList {
			degradedSet[metric] = true
		}

		// Classify each expected metric
		for _, metric := range expectedMetrics {
			if healthySet[metric] {
				state.HealthyMetrics = append(state.HealthyMetrics, metric)
			} else if degradedSet[metric] {
				state.DegradedMetrics = append(state.DegradedMetrics, metric)
			} else {
				state.MissingMetrics = append(state.MissingMetrics, metric)
			}
		}

		// Determine overall health status
		if len(state.MissingMetrics) > 0 || len(state.DegradedMetrics) > 0 {
			if len(state.HealthyMetrics) == 0 {
				// No healthy metrics at all
				state.Status = schema.MonitoringStateFailed
			} else {
				// Some healthy, some degraded/missing
				state.Status = schema.MonitoringStatePartial
			}
		}
		// else: all metrics healthy, status remains MonitoringStateFull

		results[hostname] = state
	}

	return results, nil
}
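To summarize the per-node status decision above, here is a condensed, self-contained sketch of the same classification; the schema.MonitoringState* constants are replaced by local stand-ins so the snippet compiles on its own, and plain counts stand in for the metric name lists.

// Condensed sketch of HealthCheckAlt's status decision; types are stand-ins.
package main

import "fmt"

type monitoringState string

const (
	stateFull    monitoringState = "full"
	statePartial monitoringState = "partial"
	stateFailed  monitoringState = "failed"
)

// classify mirrors the decision above: any degraded or missing metric demotes
// the node, and a node with no healthy metrics at all is reported as failed.
func classify(healthy, degraded, missing int) monitoringState {
	if missing > 0 || degraded > 0 {
		if healthy == 0 {
			return stateFailed
		}
		return statePartial
	}
	return stateFull
}

func main() {
	fmt.Println(classify(4, 0, 0)) // full: every expected metric is healthy
	fmt.Println(classify(2, 2, 0)) // partial: some metrics degraded
	fmt.Println(classify(0, 0, 4)) // failed: nothing healthy at all
}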
@@ -7,6 +7,7 @@ package metricstore
import (
	"testing"
	"time"

	"github.com/ClusterCockpit/cc-lib/v2/schema"
)
@@ -88,3 +89,219 @@ func TestBufferRead(t *testing.T) {
		t.Errorf("buffer.read() len(result) = %d, want 3", len(result))
	}
}
func TestHealthCheckAlt(t *testing.T) {
	// Create a test MemoryStore with some metrics
	metrics := map[string]MetricConfig{
		"load":       {Frequency: 10, Aggregation: AvgAggregation, offset: 0},
		"mem_used":   {Frequency: 10, Aggregation: AvgAggregation, offset: 1},
		"cpu_user":   {Frequency: 10, Aggregation: AvgAggregation, offset: 2},
		"cpu_system": {Frequency: 10, Aggregation: AvgAggregation, offset: 3},
	}

	ms := &MemoryStore{
		Metrics: metrics,
		root: Level{
			metrics:  make([]*buffer, len(metrics)),
			children: make(map[string]*Level),
		},
	}

	// Use recent timestamps (current time minus a small offset)
	now := time.Now().Unix()
	startTime := now - 100 // Start 100 seconds ago to have enough data points

	// Setup test data for node001 - all metrics healthy
	node001 := ms.root.findLevelOrCreate([]string{"testcluster", "node001"}, len(metrics))
	for i := 0; i < len(metrics); i++ {
		node001.metrics[i] = newBuffer(startTime, 10)
		// Write recent data with no NaN values
		for ts := startTime; ts <= now; ts += 10 {
			node001.metrics[i].write(ts, schema.Float(float64(i+1)))
		}
	}

	// Setup test data for node002 - some metrics degraded (many NaN values)
	node002 := ms.root.findLevelOrCreate([]string{"testcluster", "node002"}, len(metrics))
	for i := 0; i < len(metrics); i++ {
		node002.metrics[i] = newBuffer(startTime, 10)
		if i < 2 {
			// First two metrics: healthy (no NaN)
			for ts := startTime; ts <= now; ts += 10 {
				node002.metrics[i].write(ts, schema.Float(float64(i+1)))
			}
		} else {
			// Last two metrics: degraded (many NaN values in recent data)
			// Write real values first, then NaN values at the end
			count := 0
			for ts := startTime; ts <= now; ts += 10 {
				if count < 5 {
					// Write the first 5 real values
					node002.metrics[i].write(ts, schema.Float(float64(i+1)))
				} else {
					// Write NaN for the rest (the last ~6 values will be NaN)
					node002.metrics[i].write(ts, schema.NaN)
				}
				count++
			}
		}
	}

	// Setup test data for node003 - some metrics missing (no buffer)
	node003 := ms.root.findLevelOrCreate([]string{"testcluster", "node003"}, len(metrics))
	// Only create buffers for the first two metrics
	for i := 0; i < 2; i++ {
		node003.metrics[i] = newBuffer(startTime, 10)
		for ts := startTime; ts <= now; ts += 10 {
			node003.metrics[i].write(ts, schema.Float(float64(i+1)))
		}
	}
	// Leave metrics[2] and metrics[3] as nil (missing)

	// node004 doesn't exist at all
	tests := []struct {
		name               string
		cluster            string
		nodes              []string
		expectedMetrics    []string
		wantStates         map[string]schema.MonitoringState
		wantHealthyCounts  map[string]int
		wantDegradedCounts map[string]int
		wantMissingCounts  map[string]int
	}{
		{
			name:            "all metrics healthy",
			cluster:         "testcluster",
			nodes:           []string{"node001"},
			expectedMetrics: []string{"load", "mem_used", "cpu_user", "cpu_system"},
			wantStates: map[string]schema.MonitoringState{
				"node001": schema.MonitoringStateFull,
			},
			wantHealthyCounts:  map[string]int{"node001": 4},
			wantDegradedCounts: map[string]int{"node001": 0},
			wantMissingCounts:  map[string]int{"node001": 0},
		},
		{
			name:            "some metrics degraded",
			cluster:         "testcluster",
			nodes:           []string{"node002"},
			expectedMetrics: []string{"load", "mem_used", "cpu_user", "cpu_system"},
			wantStates: map[string]schema.MonitoringState{
				"node002": schema.MonitoringStatePartial,
			},
			wantHealthyCounts:  map[string]int{"node002": 2},
			wantDegradedCounts: map[string]int{"node002": 2},
			wantMissingCounts:  map[string]int{"node002": 0},
		},
		{
			name:            "some metrics missing",
			cluster:         "testcluster",
			nodes:           []string{"node003"},
			expectedMetrics: []string{"load", "mem_used", "cpu_user", "cpu_system"},
			wantStates: map[string]schema.MonitoringState{
				"node003": schema.MonitoringStatePartial,
			},
			wantHealthyCounts:  map[string]int{"node003": 2},
			wantDegradedCounts: map[string]int{"node003": 0},
			wantMissingCounts:  map[string]int{"node003": 2},
		},
		{
			name:            "node not found",
			cluster:         "testcluster",
			nodes:           []string{"node004"},
			expectedMetrics: []string{"load", "mem_used", "cpu_user", "cpu_system"},
			wantStates: map[string]schema.MonitoringState{
				"node004": schema.MonitoringStateFailed,
			},
			wantHealthyCounts:  map[string]int{"node004": 0},
			wantDegradedCounts: map[string]int{"node004": 0},
			wantMissingCounts:  map[string]int{"node004": 4},
		},
		{
			name:            "multiple nodes mixed states",
			cluster:         "testcluster",
			nodes:           []string{"node001", "node002", "node003", "node004"},
			expectedMetrics: []string{"load", "mem_used"},
			wantStates: map[string]schema.MonitoringState{
				"node001": schema.MonitoringStateFull,
				"node002": schema.MonitoringStateFull,
				"node003": schema.MonitoringStateFull,
				"node004": schema.MonitoringStateFailed,
			},
			wantHealthyCounts: map[string]int{
				"node001": 2,
				"node002": 2,
				"node003": 2,
				"node004": 0,
			},
			wantDegradedCounts: map[string]int{
				"node001": 0,
				"node002": 0,
				"node003": 0,
				"node004": 0,
			},
			wantMissingCounts: map[string]int{
				"node001": 0,
				"node002": 0,
				"node003": 0,
				"node004": 2,
			},
		},
	}
	for _, tt := range tests {
		t.Run(tt.name, func(t *testing.T) {
			results, err := ms.HealthCheckAlt(tt.cluster, tt.nodes, tt.expectedMetrics)
			if err != nil {
				t.Errorf("HealthCheckAlt() error = %v", err)
				return
			}

			// Check that we got results for all nodes
			if len(results) != len(tt.nodes) {
				t.Errorf("HealthCheckAlt() returned %d results, want %d", len(results), len(tt.nodes))
			}

			// Check each node's state
			for _, node := range tt.nodes {
				state, ok := results[node]
				if !ok {
					t.Errorf("HealthCheckAlt() missing result for node %s", node)
					continue
				}

				// Check status
				if wantStatus, ok := tt.wantStates[node]; ok {
					if state.Status != wantStatus {
						t.Errorf("HealthCheckAlt() node %s status = %v, want %v", node, state.Status, wantStatus)
					}
				}

				// Check healthy count
				if wantCount, ok := tt.wantHealthyCounts[node]; ok {
					if len(state.HealthyMetrics) != wantCount {
						t.Errorf("HealthCheckAlt() node %s healthy count = %d, want %d (metrics: %v)",
							node, len(state.HealthyMetrics), wantCount, state.HealthyMetrics)
					}
				}

				// Check degraded count
				if wantCount, ok := tt.wantDegradedCounts[node]; ok {
					if len(state.DegradedMetrics) != wantCount {
						t.Errorf("HealthCheckAlt() node %s degraded count = %d, want %d (metrics: %v)",
							node, len(state.DegradedMetrics), wantCount, state.DegradedMetrics)
					}
				}

				// Check missing count
				if wantCount, ok := tt.wantMissingCounts[node]; ok {
					if len(state.MissingMetrics) != wantCount {
						t.Errorf("HealthCheckAlt() node %s missing count = %d, want %d (metrics: %v)",
							node, len(state.MissingMetrics), wantCount, state.MissingMetrics)
					}
				}
			}
		})
	}
}