Add monitoring health-state support to the nodestate API.

2026-02-03 12:23:24 +01:00
parent e9cd6b4225
commit 00a41373e8
3 changed files with 611 additions and 2 deletions

@@ -7,6 +7,7 @@ package metricstore
import (
"testing"
"time"
"github.com/ClusterCockpit/cc-lib/v2/schema"
)
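
The new schema import supplies the sample type used below (schema.Float, schema.NaN) and the schema.MonitoringState constants that the health states are expressed in.
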
@@ -88,3 +89,219 @@ func TestBufferRead(t *testing.T) {
t.Errorf("buffer.read() len(result) = %d, want 3", len(result))
}
}
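
// TestHealthCheckAlt covers four node scenarios: fully healthy, degraded
// (trailing NaN values in recent data), partially missing (nil metric
// buffers), and a node that is absent from the store entirely.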
func TestHealthCheckAlt(t *testing.T) {
// Create a test MemoryStore with some metrics
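// offset is assumed to be each metric's index into a Level's metrics
// slice; the values here line up with the loop indices used below.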
metrics := map[string]MetricConfig{
"load": {Frequency: 10, Aggregation: AvgAggregation, offset: 0},
"mem_used": {Frequency: 10, Aggregation: AvgAggregation, offset: 1},
"cpu_user": {Frequency: 10, Aggregation: AvgAggregation, offset: 2},
"cpu_system": {Frequency: 10, Aggregation: AvgAggregation, offset: 3},
}
ms := &MemoryStore{
Metrics: metrics,
root: Level{
metrics: make([]*buffer, len(metrics)),
children: make(map[string]*Level),
},
}
// Use recent timestamps (current time minus a small offset)
now := time.Now().Unix()
startTime := now - 100 // Start 100 seconds ago to have enough data points
// Setup test data for node001 - all metrics healthy
node001 := ms.root.findLevelOrCreate([]string{"testcluster", "node001"}, len(metrics))
for i := 0; i < len(metrics); i++ {
node001.metrics[i] = newBuffer(startTime, 10)
// Write recent data with no NaN values
for ts := startTime; ts <= now; ts += 10 {
node001.metrics[i].write(ts, schema.Float(float64(i+1)))
}
}
// Setup test data for node002 - some metrics degraded (many NaN values)
node002 := ms.root.findLevelOrCreate([]string{"testcluster", "node002"}, len(metrics))
for i := 0; i < len(metrics); i++ {
node002.metrics[i] = newBuffer(startTime, 10)
if i < 2 {
// First two metrics: healthy (no NaN)
for ts := startTime; ts <= now; ts += 10 {
node002.metrics[i].write(ts, schema.Float(float64(i+1)))
}
} else {
// Last two metrics: degraded (many NaN values in recent data)
// Write real values first, then NaN values at the end
count := 0
for ts := startTime; ts <= now; ts += 10 {
if count < 5 {
// Write first 5 real values
node002.metrics[i].write(ts, schema.Float(float64(i+1)))
} else {
// Write NaN for the rest (the last 6 of the 11 samples will be NaN)
node002.metrics[i].write(ts, schema.NaN)
}
count++
}
}
}
// Setup test data for node003 - some metrics missing (no buffer)
node003 := ms.root.findLevelOrCreate([]string{"testcluster", "node003"}, len(metrics))
// Only create buffers for the first two metrics
for i := 0; i < 2; i++ {
node003.metrics[i] = newBuffer(startTime, 10)
for ts := startTime; ts <= now; ts += 10 {
node003.metrics[i].write(ts, schema.Float(float64(i+1)))
}
}
// Leave metrics[2] and metrics[3] as nil (missing)
// node004 doesn't exist at all
tests := []struct {
name string
cluster string
nodes []string
expectedMetrics []string
wantStates map[string]schema.MonitoringState
wantHealthyCounts map[string]int
wantDegradedCounts map[string]int
wantMissingCounts map[string]int
}{
{
name: "all metrics healthy",
cluster: "testcluster",
nodes: []string{"node001"},
expectedMetrics: []string{"load", "mem_used", "cpu_user", "cpu_system"},
wantStates: map[string]schema.MonitoringState{
"node001": schema.MonitoringStateFull,
},
wantHealthyCounts: map[string]int{"node001": 4},
wantDegradedCounts: map[string]int{"node001": 0},
wantMissingCounts: map[string]int{"node001": 0},
},
{
name: "some metrics degraded",
cluster: "testcluster",
nodes: []string{"node002"},
expectedMetrics: []string{"load", "mem_used", "cpu_user", "cpu_system"},
wantStates: map[string]schema.MonitoringState{
"node002": schema.MonitoringStatePartial,
},
wantHealthyCounts: map[string]int{"node002": 2},
wantDegradedCounts: map[string]int{"node002": 2},
wantMissingCounts: map[string]int{"node002": 0},
},
{
name: "some metrics missing",
cluster: "testcluster",
nodes: []string{"node003"},
expectedMetrics: []string{"load", "mem_used", "cpu_user", "cpu_system"},
wantStates: map[string]schema.MonitoringState{
"node003": schema.MonitoringStatePartial,
},
wantHealthyCounts: map[string]int{"node003": 2},
wantDegradedCounts: map[string]int{"node003": 0},
wantMissingCounts: map[string]int{"node003": 2},
},
{
name: "node not found",
cluster: "testcluster",
nodes: []string{"node004"},
expectedMetrics: []string{"load", "mem_used", "cpu_user", "cpu_system"},
wantStates: map[string]schema.MonitoringState{
"node004": schema.MonitoringStateFailed,
},
wantHealthyCounts: map[string]int{"node004": 0},
wantDegradedCounts: map[string]int{"node004": 0},
wantMissingCounts: map[string]int{"node004": 4},
},
{
name: "multiple nodes mixed states",
cluster: "testcluster",
nodes: []string{"node001", "node002", "node003", "node004"},
expectedMetrics: []string{"load", "mem_used"},
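// Only load and mem_used are checked here, so node002's and node003's
// problems (confined to cpu_user and cpu_system) do not surface, and
// both roll up as fully healthy.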
wantStates: map[string]schema.MonitoringState{
"node001": schema.MonitoringStateFull,
"node002": schema.MonitoringStateFull,
"node003": schema.MonitoringStateFull,
"node004": schema.MonitoringStateFailed,
},
wantHealthyCounts: map[string]int{
"node001": 2,
"node002": 2,
"node003": 2,
"node004": 0,
},
wantDegradedCounts: map[string]int{
"node001": 0,
"node002": 0,
"node003": 0,
"node004": 0,
},
wantMissingCounts: map[string]int{
"node001": 0,
"node002": 0,
"node003": 0,
"node004": 2,
},
},
}
for _, tt := range tests {
t.Run(tt.name, func(t *testing.T) {
results, err := ms.HealthCheckAlt(tt.cluster, tt.nodes, tt.expectedMetrics)
if err != nil {
t.Errorf("HealthCheckAlt() error = %v", err)
return
}
// Check that we got results for all nodes
if len(results) != len(tt.nodes) {
t.Errorf("HealthCheckAlt() returned %d results, want %d", len(results), len(tt.nodes))
}
// Check each node's state
for _, node := range tt.nodes {
state, ok := results[node]
if !ok {
t.Errorf("HealthCheckAlt() missing result for node %s", node)
continue
}
// Check status
if wantStatus, ok := tt.wantStates[node]; ok {
if state.Status != wantStatus {
t.Errorf("HealthCheckAlt() node %s status = %v, want %v", node, state.Status, wantStatus)
}
}
// Check healthy count
if wantCount, ok := tt.wantHealthyCounts[node]; ok {
if len(state.HealthyMetrics) != wantCount {
t.Errorf("HealthCheckAlt() node %s healthy count = %d, want %d (metrics: %v)",
node, len(state.HealthyMetrics), wantCount, state.HealthyMetrics)
}
}
// Check degraded count
if wantCount, ok := tt.wantDegradedCounts[node]; ok {
if len(state.DegradedMetrics) != wantCount {
t.Errorf("HealthCheckAlt() node %s degraded count = %d, want %d (metrics: %v)",
node, len(state.DegradedMetrics), wantCount, state.DegradedMetrics)
}
}
// Check missing count
if wantCount, ok := tt.wantMissingCounts[node]; ok {
if len(state.MissingMetrics) != wantCount {
t.Errorf("HealthCheckAlt() node %s missing count = %d, want %d (metrics: %v)",
node, len(state.MissingMetrics), wantCount, state.MissingMetrics)
}
}
}
})
}
}
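
The states asserted above follow a simple per-node rollup: all expected metrics healthy gives Full, nothing reported at all gives Failed, and any mixture gives Partial. Below is a minimal, self-contained sketch of that decision logic; the HealthState type, the classify helper, and the string values of the state constants are illustrative assumptions, not part of this commit.

package main

import "fmt"

// Assumed stand-ins for the schema types the test references.
type MonitoringState string

const (
	MonitoringStateFull    MonitoringState = "full"
	MonitoringStatePartial MonitoringState = "partial"
	MonitoringStateFailed  MonitoringState = "failed"
)

// HealthState mirrors the per-node fields the test asserts on.
type HealthState struct {
	Status          MonitoringState
	HealthyMetrics  []string
	DegradedMetrics []string
	MissingMetrics  []string
}

// classify rolls the per-metric buckets up into a node state, matching the
// expectations encoded in the test table.
func classify(healthy, degraded, missing []string) MonitoringState {
	switch {
	case len(degraded) == 0 && len(missing) == 0:
		return MonitoringStateFull // every expected metric is healthy
	case len(healthy) == 0 && len(degraded) == 0:
		return MonitoringStateFailed // nothing reported, e.g. an unknown node
	default:
		return MonitoringStatePartial // any mix of healthy/degraded/missing
	}
}

func main() {
	// node003 from the test: two healthy metrics, two missing buffers.
	hs := HealthState{
		HealthyMetrics: []string{"load", "mem_used"},
		MissingMetrics: []string{"cpu_user", "cpu_system"},
	}
	hs.Status = classify(hs.HealthyMetrics, hs.DegradedMetrics, hs.MissingMetrics)
	fmt.Println(hs.Status) // partial
}

Keeping the rollup a pure function of the three buckets makes the state mapping testable independently of the buffer internals that drive per-metric classification.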