Add monitoring health-state support to the nodestate API.

2026-02-03 12:23:24 +01:00
parent e9cd6b4225
commit 00a41373e8
3 changed files with 611 additions and 2 deletions

@@ -7,6 +7,7 @@ package metricstore
import (
"testing"
"time"
"github.com/ClusterCockpit/cc-lib/v2/schema"
)
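
The new schema import supplies the sample type used below (schema.Float, schema.NaN) and the schema.MonitoringState constants that the health states are expressed in.
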
@@ -88,3 +89,219 @@ func TestBufferRead(t *testing.T) {
t.Errorf("buffer.read() len(result) = %d, want 3", len(result))
}
}
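
// TestHealthCheckAlt covers four node scenarios: fully healthy, degraded
// (trailing NaN values in recent data), partially missing (nil metric
// buffers), and a node that is absent from the store entirely.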
func TestHealthCheckAlt(t *testing.T) {
// Create a test MemoryStore with some metrics
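// offset is assumed to be each metric's index into a Level's metrics
// slice; the values here line up with the loop indices used below.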
metrics := map[string]MetricConfig{
"load": {Frequency: 10, Aggregation: AvgAggregation, offset: 0},
"mem_used": {Frequency: 10, Aggregation: AvgAggregation, offset: 1},
"cpu_user": {Frequency: 10, Aggregation: AvgAggregation, offset: 2},
"cpu_system": {Frequency: 10, Aggregation: AvgAggregation, offset: 3},
}
ms := &MemoryStore{
Metrics: metrics,
root: Level{
metrics: make([]*buffer, len(metrics)),
children: make(map[string]*Level),
},
}
// Use recent timestamps (current time minus a small offset)
now := time.Now().Unix()
startTime := now - 100 // Start 100 seconds ago to have enough data points
// Setup test data for node001 - all metrics healthy
node001 := ms.root.findLevelOrCreate([]string{"testcluster", "node001"}, len(metrics))
for i := 0; i < len(metrics); i++ {
node001.metrics[i] = newBuffer(startTime, 10)
// Write recent data with no NaN values
for ts := startTime; ts <= now; ts += 10 {
node001.metrics[i].write(ts, schema.Float(float64(i+1)))
}
}
// Setup test data for node002 - some metrics degraded (many NaN values)
node002 := ms.root.findLevelOrCreate([]string{"testcluster", "node002"}, len(metrics))
for i := 0; i < len(metrics); i++ {
node002.metrics[i] = newBuffer(startTime, 10)
if i < 2 {
// First two metrics: healthy (no NaN)
for ts := startTime; ts <= now; ts += 10 {
node002.metrics[i].write(ts, schema.Float(float64(i+1)))
}
} else {
// Last two metrics: degraded (many NaN values in recent data)
// Write real values first, then NaN values at the end
count := 0
for ts := startTime; ts <= now; ts += 10 {
if count < 5 {
// Write first 5 real values
node002.metrics[i].write(ts, schema.Float(float64(i+1)))
} else {
// Write NaN for the rest (the last 6 of the 11 samples will be NaN)
node002.metrics[i].write(ts, schema.NaN)
}
count++
}
}
}
// Setup test data for node003 - some metrics missing (no buffer)
node003 := ms.root.findLevelOrCreate([]string{"testcluster", "node003"}, len(metrics))
// Only create buffers for the first two metrics
for i := 0; i < 2; i++ {
node003.metrics[i] = newBuffer(startTime, 10)
for ts := startTime; ts <= now; ts += 10 {
node003.metrics[i].write(ts, schema.Float(float64(i+1)))
}
}
// Leave metrics[2] and metrics[3] as nil (missing)
// node004 doesn't exist at all
tests := []struct {
name string
cluster string
nodes []string
expectedMetrics []string
wantStates map[string]schema.MonitoringState
wantHealthyCounts map[string]int
wantDegradedCounts map[string]int
wantMissingCounts map[string]int
}{
{
name: "all metrics healthy",
cluster: "testcluster",
nodes: []string{"node001"},
expectedMetrics: []string{"load", "mem_used", "cpu_user", "cpu_system"},
wantStates: map[string]schema.MonitoringState{
"node001": schema.MonitoringStateFull,
},
wantHealthyCounts: map[string]int{"node001": 4},
wantDegradedCounts: map[string]int{"node001": 0},
wantMissingCounts: map[string]int{"node001": 0},
},
{
name: "some metrics degraded",
cluster: "testcluster",
nodes: []string{"node002"},
expectedMetrics: []string{"load", "mem_used", "cpu_user", "cpu_system"},
wantStates: map[string]schema.MonitoringState{
"node002": schema.MonitoringStatePartial,
},
wantHealthyCounts: map[string]int{"node002": 2},
wantDegradedCounts: map[string]int{"node002": 2},
wantMissingCounts: map[string]int{"node002": 0},
},
{
name: "some metrics missing",
cluster: "testcluster",
nodes: []string{"node003"},
expectedMetrics: []string{"load", "mem_used", "cpu_user", "cpu_system"},
wantStates: map[string]schema.MonitoringState{
"node003": schema.MonitoringStatePartial,
},
wantHealthyCounts: map[string]int{"node003": 2},
wantDegradedCounts: map[string]int{"node003": 0},
wantMissingCounts: map[string]int{"node003": 2},
},
{
name: "node not found",
cluster: "testcluster",
nodes: []string{"node004"},
expectedMetrics: []string{"load", "mem_used", "cpu_user", "cpu_system"},
wantStates: map[string]schema.MonitoringState{
"node004": schema.MonitoringStateFailed,
},
wantHealthyCounts: map[string]int{"node004": 0},
wantDegradedCounts: map[string]int{"node004": 0},
wantMissingCounts: map[string]int{"node004": 4},
},
{
name: "multiple nodes mixed states",
cluster: "testcluster",
nodes: []string{"node001", "node002", "node003", "node004"},
expectedMetrics: []string{"load", "mem_used"},
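// Only load and mem_used are checked here, so node002's and node003's
// problems (confined to cpu_user and cpu_system) do not surface, and
// both roll up as fully healthy.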
wantStates: map[string]schema.MonitoringState{
"node001": schema.MonitoringStateFull,
"node002": schema.MonitoringStateFull,
"node003": schema.MonitoringStateFull,
"node004": schema.MonitoringStateFailed,
},
wantHealthyCounts: map[string]int{
"node001": 2,
"node002": 2,
"node003": 2,
"node004": 0,
},
wantDegradedCounts: map[string]int{
"node001": 0,
"node002": 0,
"node003": 0,
"node004": 0,
},
wantMissingCounts: map[string]int{
"node001": 0,
"node002": 0,
"node003": 0,
"node004": 2,
},
},
}
for _, tt := range tests {
t.Run(tt.name, func(t *testing.T) {
results, err := ms.HealthCheckAlt(tt.cluster, tt.nodes, tt.expectedMetrics)
if err != nil {
t.Errorf("HealthCheckAlt() error = %v", err)
return
}
// Check that we got results for all nodes
if len(results) != len(tt.nodes) {
t.Errorf("HealthCheckAlt() returned %d results, want %d", len(results), len(tt.nodes))
}
// Check each node's state
for _, node := range tt.nodes {
state, ok := results[node]
if !ok {
t.Errorf("HealthCheckAlt() missing result for node %s", node)
continue
}
// Check status
if wantStatus, ok := tt.wantStates[node]; ok {
if state.Status != wantStatus {
t.Errorf("HealthCheckAlt() node %s status = %v, want %v", node, state.Status, wantStatus)
}
}
// Check healthy count
if wantCount, ok := tt.wantHealthyCounts[node]; ok {
if len(state.HealthyMetrics) != wantCount {
t.Errorf("HealthCheckAlt() node %s healthy count = %d, want %d (metrics: %v)",
node, len(state.HealthyMetrics), wantCount, state.HealthyMetrics)
}
}
// Check degraded count
if wantCount, ok := tt.wantDegradedCounts[node]; ok {
if len(state.DegradedMetrics) != wantCount {
t.Errorf("HealthCheckAlt() node %s degraded count = %d, want %d (metrics: %v)",
node, len(state.DegradedMetrics), wantCount, state.DegradedMetrics)
}
}
// Check missing count
if wantCount, ok := tt.wantMissingCounts[node]; ok {
if len(state.MissingMetrics) != wantCount {
t.Errorf("HealthCheckAlt() node %s missing count = %d, want %d (metrics: %v)",
node, len(state.MissingMetrics), wantCount, state.MissingMetrics)
}
}
}
})
}
}
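
The states asserted above follow a simple per-node rollup: all expected metrics healthy gives Full, nothing reported at all gives Failed, and any mixture gives Partial. Below is a minimal, self-contained sketch of that decision logic; the HealthState type, the classify helper, and the string values of the state constants are illustrative assumptions, not part of this commit.

package main

import "fmt"

// Assumed stand-ins for the schema types the test references.
type MonitoringState string

const (
	MonitoringStateFull    MonitoringState = "full"
	MonitoringStatePartial MonitoringState = "partial"
	MonitoringStateFailed  MonitoringState = "failed"
)

// HealthState mirrors the per-node fields the test asserts on.
type HealthState struct {
	Status          MonitoringState
	HealthyMetrics  []string
	DegradedMetrics []string
	MissingMetrics  []string
}

// classify rolls the per-metric buckets up into a node state, matching the
// expectations encoded in the test table.
func classify(healthy, degraded, missing []string) MonitoringState {
	switch {
	case len(degraded) == 0 && len(missing) == 0:
		return MonitoringStateFull // every expected metric is healthy
	case len(healthy) == 0 && len(degraded) == 0:
		return MonitoringStateFailed // nothing reported, e.g. an unknown node
	default:
		return MonitoringStatePartial // any mix of healthy/degraded/missing
	}
}

func main() {
	// node003 from the test: two healthy metrics, two missing buffers.
	hs := HealthState{
		HealthyMetrics: []string{"load", "mem_used"},
		MissingMetrics: []string{"cpu_user", "cpu_system"},
	}
	hs.Status = classify(hs.HealthyMetrics, hs.DegradedMetrics, hs.MissingMetrics)
	fmt.Println(hs.Status) // partial
}

Keeping the rollup a pure function of the three buckets makes the state mapping testable independently of the buffer internals that drive per-metric classification.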