diff --git a/internal/api/node.go b/internal/api/node.go index c3fe8492..7039a06f 100644 --- a/internal/api/node.go +++ b/internal/api/node.go @@ -7,11 +7,14 @@ package api import ( "fmt" + "maps" "net/http" "strings" "time" "github.com/ClusterCockpit/cc-backend/internal/repository" + "github.com/ClusterCockpit/cc-backend/pkg/archive" + "github.com/ClusterCockpit/cc-backend/pkg/metricstore" "github.com/ClusterCockpit/cc-lib/v2/schema" ) @@ -20,6 +23,15 @@ type UpdateNodeStatesRequest struct { Cluster string `json:"cluster" example:"fritz"` } +// metricListToNames converts a map of metric configurations to a list of metric names +func metricListToNames(metricList map[string]*schema.Metric) []string { + names := make([]string, 0, len(metricList)) + for name := range metricList { + names = append(names, name) + } + return names +} + // this routine assumes that only one of them exists per node func determineState(states []string) schema.SchedulerState { for _, state := range states { @@ -62,18 +74,42 @@ func (api *RestAPI) updateNodeStates(rw http.ResponseWriter, r *http.Request) { http.StatusBadRequest, rw) return } - repo := repository.GetNodeRepository() requestReceived := time.Now().Unix() + repo := repository.GetNodeRepository() + ms := metricstore.GetMemoryStore() + + m := make(map[string][]string) + healthStates := make(map[string]metricstore.NodeHealthState) + + for _, node := range req.Nodes { + if sc, err := archive.GetSubClusterByNode(req.Cluster, node.Hostname); err == nil { + m[sc] = append(m[sc], node.Hostname) + } + } + + for sc, nl := range m { + if sc != "" { + metricList := archive.GetMetricConfigSubCluster(req.Cluster, sc) + metricNames := metricListToNames(metricList) + if states, err := ms.HealthCheckAlt(req.Cluster, nl, metricNames); err == nil { + maps.Copy(healthStates, states) + } + } + } for _, node := range req.Nodes { state := determineState(node.States) + healthState := schema.MonitoringStateFull + if hs, ok := healthStates[node.Hostname]; ok { + healthState = hs.Status + } nodeState := schema.NodeStateDB{ TimeStamp: requestReceived, NodeState: state, CpusAllocated: node.CpusAllocated, MemoryAllocated: node.MemoryAllocated, GpusAllocated: node.GpusAllocated, - HealthState: schema.MonitoringStateFull, + HealthState: healthState, JobsRunning: node.JobsRunning, } diff --git a/pkg/metricstore/healthcheck.go b/pkg/metricstore/healthcheck.go index 3dbf661a..a40394a3 100644 --- a/pkg/metricstore/healthcheck.go +++ b/pkg/metricstore/healthcheck.go @@ -47,6 +47,45 @@ func (b *buffer) healthCheck() bool { return false } +// healthCheck recursively examines a level and all its children to identify stale or missing metrics. 
+// +// This routine performs a two-phase check: +// +// Phase 1 - Check metrics at current level (node-level metrics): +// - Iterates through all configured metrics in m.Metrics +// - For each metric, checks if a buffer exists at l.metrics[mc.offset] +// - If buffer exists: calls buffer.healthCheck() to verify data freshness +// - Stale buffer (data older than MaxMissingDataPoints * frequency) → StaleNodeMetricList +// - Fresh buffer → healthy, no action +// - If buffer is nil: metric was never written → MissingNodeMetricList +// +// Phase 2 - Recursively check child levels (hardware-level metrics): +// - Iterates through l.children (e.g., "cpu0", "gpu0", "socket0") +// - Recursively calls healthCheck() on each child level +// - Aggregates child results into hardware-specific lists: +// - Child's StaleNodeMetricList → parent's StaleHardwareMetricList[childName] +// - Child's MissingNodeMetricList → parent's MissingHardwareMetricList[childName] +// +// The recursive nature means: +// - Calling on a host level checks: host metrics + all CPU/GPU/socket metrics +// - Calling on a socket level checks: socket metrics + all core metrics +// - Leaf levels (e.g., individual cores) only check their own metrics +// +// Parameters: +// - m: MemoryStore containing the global metric configuration (m.Metrics) +// +// Returns: +// - List: Categorized lists of stale and missing metrics at this level and below +// - error: Non-nil only for internal errors during recursion +// +// Concurrency: +// - Acquires read lock (RLock) to safely access l.metrics and l.children +// - Lock held for entire duration including recursive calls +// +// Example for host level with structure: host → [cpu0, cpu1]: +// - Checks host-level metrics (load, memory) → StaleNodeMetricList / MissingNodeMetricList +// - Recursively checks cpu0 metrics → results in StaleHardwareMetricList["cpu0"] +// - Recursively checks cpu1 metrics → results in StaleHardwareMetricList["cpu1"] func (l *Level) healthCheck(m *MemoryStore) (List, error) { l.lock.RLock() defer l.lock.RUnlock() @@ -58,6 +97,7 @@ func (l *Level) healthCheck(m *MemoryStore) (List, error) { MissingHardwareMetricList: make(map[string][]string, 0), } + // Phase 1: Check metrics at this level for metricName, mc := range m.Metrics { if b := l.metrics[mc.offset]; b != nil { if b.healthCheck() { @@ -68,6 +108,7 @@ func (l *Level) healthCheck(m *MemoryStore) (List, error) { } } + // Phase 2: Recursively check child levels (hardware components) for hardwareMetricName, lvl := range l.children { l, err := lvl.healthCheck(m) if err != nil { @@ -85,6 +126,48 @@ func (l *Level) healthCheck(m *MemoryStore) (List, error) { return list, nil } +// HealthCheck performs a health check on a specific node in the metric store. +// +// This routine checks whether metrics for a given node are being received and are up-to-date. +// It examines both node-level metrics (e.g., load, memory) and hardware-level metrics +// (e.g., CPU, GPU, network) to determine the monitoring state. +// +// Parameters: +// - selector: Hierarchical path to the target node, typically []string{cluster, hostname}. +// Example: []string{"emmy", "node001"} navigates to the "node001" host in the "emmy" cluster. +// The selector must match the hierarchy used during metric ingestion (see Level.findLevelOrCreate). 
+// - subcluster: Subcluster name (currently unused, reserved for future filtering) +// +// Returns: +// - *HeathCheckResponse: Health status with detailed lists of stale/missing metrics +// - error: Non-nil only for internal errors (not for unhealthy nodes) +// +// Health States: +// - MonitoringStateFull: All expected metrics are present and up-to-date +// - MonitoringStatePartial: Some metrics are stale (data older than MaxMissingDataPoints * frequency) +// - MonitoringStateFailed: Host not found, or metrics are completely missing +// +// The response includes detailed lists: +// - StaleNodeMetricList: Node-level metrics with stale data +// - StaleHardwareMetricList: Hardware-level metrics with stale data (grouped by component) +// - MissingNodeMetricList: Expected node-level metrics that have no data +// - MissingHardwareMetricList: Expected hardware-level metrics that have no data (grouped by component) +// +// Example usage: +// +// selector := []string{"emmy", "node001"} +// response, err := ms.HealthCheck(selector, "") +// if err != nil { +// // Internal error +// } +// switch response.Status { +// case schema.MonitoringStateFull: +// // All metrics healthy +// case schema.MonitoringStatePartial: +// // Check response.list.StaleNodeMetricList for details +// case schema.MonitoringStateFailed: +// // Check response.Error or response.list.MissingNodeMetricList +// } func (m *MemoryStore) HealthCheck(selector []string, subcluster string) (*HeathCheckResponse, error) { response := HeathCheckResponse{ Status: schema.MonitoringStateFull, @@ -120,3 +203,276 @@ func (m *MemoryStore) HealthCheck(selector []string, subcluster string) (*HeathC return &response, nil } + +// isBufferHealthy checks if a buffer has received data for the last MaxMissingDataPoints. +// +// Returns true if the buffer is healthy (recent data within threshold), false otherwise. +// A nil buffer or empty buffer is considered unhealthy. +func (b *buffer) isBufferHealthy() bool { + // Check if the buffer is empty + if b == nil || b.data == nil { + return false + } + + bufferEnd := b.start + b.frequency*int64(len(b.data)) + t := time.Now().Unix() + + // Check if the buffer has recent data (within MaxMissingDataPoints threshold) + if t-bufferEnd > MaxMissingDataPoints*b.frequency { + return false + } + + return true +} + +// countMissingValues counts the number of NaN (missing) values in the most recent data points. +// +// Examines the last MaxMissingDataPoints*2 values in the buffer and counts how many are NaN. +// We check twice the threshold to allow detecting when more than MaxMissingDataPoints are missing. +// If the buffer has fewer values, examines all available values. +// +// Returns: +// - int: Number of NaN values found in the examined range +func (b *buffer) countMissingValues() int { + if b == nil || b.data == nil || len(b.data) == 0 { + return 0 + } + + // Check twice the threshold to detect degraded metrics + checkCount := min(int(MaxMissingDataPoints)*2, len(b.data)) + + // Count NaN values in the most recent data points + missingCount := 0 + startIdx := len(b.data) - checkCount + for i := startIdx; i < len(b.data); i++ { + if b.data[i].IsNaN() { + missingCount++ + } + } + + return missingCount +} + +// getHealthyMetrics recursively collects healthy and degraded metrics at this level and below. 
+// +// A metric is considered: +// - Healthy: buffer has recent data within MaxMissingDataPoints threshold AND has few/no NaN values +// - Degraded: buffer exists and has recent data, but contains more than MaxMissingDataPoints NaN values +// +// This routine walks the entire subtree starting from the current level. +// +// Parameters: +// - m: MemoryStore containing the global metric configuration +// +// Returns: +// - []string: Flat list of healthy metric names from this level and all children +// - []string: Flat list of degraded metric names (exist but have too many missing values) +// - error: Non-nil only for internal errors during recursion +// +// The routine mirrors healthCheck() but provides more granular classification: +// - healthCheck() finds problems (stale/missing) +// - getHealthyMetrics() separates healthy from degraded metrics +func (l *Level) getHealthyMetrics(m *MemoryStore) ([]string, []string, error) { + l.lock.RLock() + defer l.lock.RUnlock() + + healthyList := make([]string, 0) + degradedList := make([]string, 0) + + // Phase 1: Check metrics at this level + for metricName, mc := range m.Metrics { + b := l.metrics[mc.offset] + if b.isBufferHealthy() { + // Buffer has recent data, now check for missing values + missingCount := b.countMissingValues() + if missingCount > int(MaxMissingDataPoints) { + degradedList = append(degradedList, metricName) + } else { + healthyList = append(healthyList, metricName) + } + } + } + + // Phase 2: Recursively check child levels (hardware components) + for _, lvl := range l.children { + childHealthy, childDegraded, err := lvl.getHealthyMetrics(m) + if err != nil { + return nil, nil, err + } + + // Merge child metrics into flat lists + healthyList = append(healthyList, childHealthy...) + degradedList = append(degradedList, childDegraded...) + } + + return healthyList, degradedList, nil +} + +// GetHealthyMetrics returns healthy and degraded metrics for a specific node as flat lists. +// +// This routine walks the metric tree starting from the specified node selector +// and collects all metrics that have received data within the last MaxMissingDataPoints +// (default: 5 data points). Metrics are classified into two categories: +// +// - Healthy: Buffer has recent data AND contains few/no NaN (missing) values +// - Degraded: Buffer has recent data BUT contains more than MaxMissingDataPoints NaN values +// +// The returned lists include both node-level metrics (e.g., "load", "mem_used") and +// hardware-level metrics (e.g., "cpu_user", "gpu_temp") in flat slices. +// +// Parameters: +// - selector: Hierarchical path to the target node, typically []string{cluster, hostname}. +// Example: []string{"emmy", "node001"} navigates to the "node001" host in the "emmy" cluster. +// The selector must match the hierarchy used during metric ingestion. +// +// Returns: +// - []string: Flat list of healthy metric names (recent data, few missing values) +// - []string: Flat list of degraded metric names (recent data, many missing values) +// - error: Non-nil if the node is not found or internal errors occur +// +// Example usage: +// +// selector := []string{"emmy", "node001"} +// healthyMetrics, degradedMetrics, err := ms.GetHealthyMetrics(selector) +// if err != nil { +// // Node not found or internal error +// return err +// } +// fmt.Printf("Healthy metrics: %v\n", healthyMetrics) +// // Output: ["load", "mem_used", "cpu_user", ...] +// fmt.Printf("Degraded metrics: %v\n", degradedMetrics) +// // Output: ["gpu_temp", "network_rx", ...] 
(metrics with many NaN values) +// +// Note: This routine provides more granular classification than HealthCheck: +// - HealthCheck reports stale/missing metrics (problems) +// - GetHealthyMetrics separates fully healthy from degraded metrics (quality levels) +func (m *MemoryStore) GetHealthyMetrics(selector []string) ([]string, []string, error) { + lvl := m.root.findLevel(selector) + if lvl == nil { + return nil, nil, fmt.Errorf("[METRICSTORE]> error while GetHealthyMetrics, host not found: %#v", selector) + } + + healthyList, degradedList, err := lvl.getHealthyMetrics(m) + if err != nil { + return nil, nil, err + } + + return healthyList, degradedList, nil +} + +// NodeHealthState represents the health status of a single node's metrics. +type NodeHealthState struct { + Status schema.MonitoringState // Overall health status: Full, Partial, or Failed + HealthyMetrics []string // Metrics with recent data and few missing values + DegradedMetrics []string // Metrics with recent data but many missing values + MissingMetrics []string // Expected metrics that are completely missing or stale +} + +// HealthCheckAlt performs health checks on multiple nodes and returns their health states. +// +// This routine provides a batch health check interface that evaluates multiple nodes +// against a specific set of expected metrics. For each node, it determines which metrics +// are healthy, degraded, or missing, and assigns an overall health status. +// +// Health Status Classification: +// - MonitoringStateFull: All expected metrics are healthy (recent data, few missing values) +// - MonitoringStatePartial: Some metrics are degraded (many missing values) or missing +// - MonitoringStateFailed: Node not found or all expected metrics are missing/stale +// +// Parameters: +// - cluster: Cluster name (first element of selector path) +// - nodes: List of node hostnames to check +// - expectedMetrics: List of metric names that should be present on each node +// +// Returns: +// - map[string]NodeHealthState: Map keyed by hostname containing health state for each node +// - error: Non-nil only for internal errors (individual node failures are captured in NodeHealthState) +// +// Example usage: +// +// cluster := "emmy" +// nodes := []string{"node001", "node002", "node003"} +// expectedMetrics := []string{"load", "mem_used", "cpu_user", "cpu_system"} +// healthStates, err := ms.HealthCheckAlt(cluster, nodes, expectedMetrics) +// if err != nil { +// return err +// } +// for hostname, state := range healthStates { +// fmt.Printf("Node %s: %s\n", hostname, state.Status) +// fmt.Printf(" Healthy: %v\n", state.HealthyMetrics) +// fmt.Printf(" Degraded: %v\n", state.DegradedMetrics) +// fmt.Printf(" Missing: %v\n", state.MissingMetrics) +// } +// +// Note: This routine is optimized for batch operations where you need to check +// the same set of metrics across multiple nodes. For single-node checks with +// all configured metrics, use HealthCheck() instead. 
+func (m *MemoryStore) HealthCheckAlt(cluster string, + nodes []string, expectedMetrics []string, +) (map[string]NodeHealthState, error) { + results := make(map[string]NodeHealthState, len(nodes)) + + // Create a set of expected metrics for fast lookup + expectedSet := make(map[string]bool, len(expectedMetrics)) + for _, metric := range expectedMetrics { + expectedSet[metric] = true + } + + // Check each node + for _, hostname := range nodes { + selector := []string{cluster, hostname} + state := NodeHealthState{ + Status: schema.MonitoringStateFull, + HealthyMetrics: make([]string, 0), + DegradedMetrics: make([]string, 0), + MissingMetrics: make([]string, 0), + } + + // Get healthy and degraded metrics for this node + healthyList, degradedList, err := m.GetHealthyMetrics(selector) + if err != nil { + // Node not found or internal error + state.Status = schema.MonitoringStateFailed + state.MissingMetrics = expectedMetrics + results[hostname] = state + continue + } + + // Create sets for fast lookup + healthySet := make(map[string]bool, len(healthyList)) + for _, metric := range healthyList { + healthySet[metric] = true + } + degradedSet := make(map[string]bool, len(degradedList)) + for _, metric := range degradedList { + degradedSet[metric] = true + } + + // Classify each expected metric + for _, metric := range expectedMetrics { + if healthySet[metric] { + state.HealthyMetrics = append(state.HealthyMetrics, metric) + } else if degradedSet[metric] { + state.DegradedMetrics = append(state.DegradedMetrics, metric) + } else { + state.MissingMetrics = append(state.MissingMetrics, metric) + } + } + + // Determine overall health status + if len(state.MissingMetrics) > 0 || len(state.DegradedMetrics) > 0 { + if len(state.HealthyMetrics) == 0 { + // No healthy metrics at all + state.Status = schema.MonitoringStateFailed + } else { + // Some healthy, some degraded/missing + state.Status = schema.MonitoringStatePartial + } + } + // else: all metrics healthy, status remains MonitoringStateFull + + results[hostname] = state + } + + return results, nil +} diff --git a/pkg/metricstore/metricstore_test.go b/pkg/metricstore/metricstore_test.go index 90cec2bd..70ef73f8 100644 --- a/pkg/metricstore/metricstore_test.go +++ b/pkg/metricstore/metricstore_test.go @@ -7,6 +7,7 @@ package metricstore import ( "testing" + "time" "github.com/ClusterCockpit/cc-lib/v2/schema" ) @@ -88,3 +89,219 @@ func TestBufferRead(t *testing.T) { t.Errorf("buffer.read() len(result) = %d, want 3", len(result)) } } + +func TestHealthCheckAlt(t *testing.T) { + // Create a test MemoryStore with some metrics + metrics := map[string]MetricConfig{ + "load": {Frequency: 10, Aggregation: AvgAggregation, offset: 0}, + "mem_used": {Frequency: 10, Aggregation: AvgAggregation, offset: 1}, + "cpu_user": {Frequency: 10, Aggregation: AvgAggregation, offset: 2}, + "cpu_system": {Frequency: 10, Aggregation: AvgAggregation, offset: 3}, + } + + ms := &MemoryStore{ + Metrics: metrics, + root: Level{ + metrics: make([]*buffer, len(metrics)), + children: make(map[string]*Level), + }, + } + + // Use recent timestamps (current time minus a small offset) + now := time.Now().Unix() + startTime := now - 100 // Start 100 seconds ago to have enough data points + + // Setup test data for node001 - all metrics healthy + node001 := ms.root.findLevelOrCreate([]string{"testcluster", "node001"}, len(metrics)) + for i := 0; i < len(metrics); i++ { + node001.metrics[i] = newBuffer(startTime, 10) + // Write recent data with no NaN values + for ts := startTime; ts <= 
now; ts += 10 { + node001.metrics[i].write(ts, schema.Float(float64(i+1))) + } + } + + // Setup test data for node002 - some metrics degraded (many NaN values) + node002 := ms.root.findLevelOrCreate([]string{"testcluster", "node002"}, len(metrics)) + for i := 0; i < len(metrics); i++ { + node002.metrics[i] = newBuffer(startTime, 10) + if i < 2 { + // First two metrics: healthy (no NaN) + for ts := startTime; ts <= now; ts += 10 { + node002.metrics[i].write(ts, schema.Float(float64(i+1))) + } + } else { + // Last two metrics: degraded (many NaN values in recent data) + // Write real values first, then NaN values at the end + count := 0 + for ts := startTime; ts <= now; ts += 10 { + if count < 5 { + // Write first 5 real values + node002.metrics[i].write(ts, schema.Float(float64(i+1))) + } else { + // Write NaN for the rest (last ~6 values will be NaN) + node002.metrics[i].write(ts, schema.NaN) + } + count++ + } + } + } + + // Setup test data for node003 - some metrics missing (no buffer) + node003 := ms.root.findLevelOrCreate([]string{"testcluster", "node003"}, len(metrics)) + // Only create buffers for first two metrics + for i := 0; i < 2; i++ { + node003.metrics[i] = newBuffer(startTime, 10) + for ts := startTime; ts <= now; ts += 10 { + node003.metrics[i].write(ts, schema.Float(float64(i+1))) + } + } + // Leave metrics[2] and metrics[3] as nil (missing) + + // node004 doesn't exist at all + + tests := []struct { + name string + cluster string + nodes []string + expectedMetrics []string + wantStates map[string]schema.MonitoringState + wantHealthyCounts map[string]int + wantDegradedCounts map[string]int + wantMissingCounts map[string]int + }{ + { + name: "all metrics healthy", + cluster: "testcluster", + nodes: []string{"node001"}, + expectedMetrics: []string{"load", "mem_used", "cpu_user", "cpu_system"}, + wantStates: map[string]schema.MonitoringState{ + "node001": schema.MonitoringStateFull, + }, + wantHealthyCounts: map[string]int{"node001": 4}, + wantDegradedCounts: map[string]int{"node001": 0}, + wantMissingCounts: map[string]int{"node001": 0}, + }, + { + name: "some metrics degraded", + cluster: "testcluster", + nodes: []string{"node002"}, + expectedMetrics: []string{"load", "mem_used", "cpu_user", "cpu_system"}, + wantStates: map[string]schema.MonitoringState{ + "node002": schema.MonitoringStatePartial, + }, + wantHealthyCounts: map[string]int{"node002": 2}, + wantDegradedCounts: map[string]int{"node002": 2}, + wantMissingCounts: map[string]int{"node002": 0}, + }, + { + name: "some metrics missing", + cluster: "testcluster", + nodes: []string{"node003"}, + expectedMetrics: []string{"load", "mem_used", "cpu_user", "cpu_system"}, + wantStates: map[string]schema.MonitoringState{ + "node003": schema.MonitoringStatePartial, + }, + wantHealthyCounts: map[string]int{"node003": 2}, + wantDegradedCounts: map[string]int{"node003": 0}, + wantMissingCounts: map[string]int{"node003": 2}, + }, + { + name: "node not found", + cluster: "testcluster", + nodes: []string{"node004"}, + expectedMetrics: []string{"load", "mem_used", "cpu_user", "cpu_system"}, + wantStates: map[string]schema.MonitoringState{ + "node004": schema.MonitoringStateFailed, + }, + wantHealthyCounts: map[string]int{"node004": 0}, + wantDegradedCounts: map[string]int{"node004": 0}, + wantMissingCounts: map[string]int{"node004": 4}, + }, + { + name: "multiple nodes mixed states", + cluster: "testcluster", + nodes: []string{"node001", "node002", "node003", "node004"}, + expectedMetrics: []string{"load", "mem_used"}, + wantStates: 
map[string]schema.MonitoringState{ + "node001": schema.MonitoringStateFull, + "node002": schema.MonitoringStateFull, + "node003": schema.MonitoringStateFull, + "node004": schema.MonitoringStateFailed, + }, + wantHealthyCounts: map[string]int{ + "node001": 2, + "node002": 2, + "node003": 2, + "node004": 0, + }, + wantDegradedCounts: map[string]int{ + "node001": 0, + "node002": 0, + "node003": 0, + "node004": 0, + }, + wantMissingCounts: map[string]int{ + "node001": 0, + "node002": 0, + "node003": 0, + "node004": 2, + }, + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + results, err := ms.HealthCheckAlt(tt.cluster, tt.nodes, tt.expectedMetrics) + if err != nil { + t.Errorf("HealthCheckAlt() error = %v", err) + return + } + + // Check that we got results for all nodes + if len(results) != len(tt.nodes) { + t.Errorf("HealthCheckAlt() returned %d results, want %d", len(results), len(tt.nodes)) + } + + // Check each node's state + for _, node := range tt.nodes { + state, ok := results[node] + if !ok { + t.Errorf("HealthCheckAlt() missing result for node %s", node) + continue + } + + // Check status + if wantStatus, ok := tt.wantStates[node]; ok { + if state.Status != wantStatus { + t.Errorf("HealthCheckAlt() node %s status = %v, want %v", node, state.Status, wantStatus) + } + } + + // Check healthy count + if wantCount, ok := tt.wantHealthyCounts[node]; ok { + if len(state.HealthyMetrics) != wantCount { + t.Errorf("HealthCheckAlt() node %s healthy count = %d, want %d (metrics: %v)", + node, len(state.HealthyMetrics), wantCount, state.HealthyMetrics) + } + } + + // Check degraded count + if wantCount, ok := tt.wantDegradedCounts[node]; ok { + if len(state.DegradedMetrics) != wantCount { + t.Errorf("HealthCheckAlt() node %s degraded count = %d, want %d (metrics: %v)", + node, len(state.DegradedMetrics), wantCount, state.DegradedMetrics) + } + } + + // Check missing count + if wantCount, ok := tt.wantMissingCounts[node]; ok { + if len(state.MissingMetrics) != wantCount { + t.Errorf("HealthCheckAlt() node %s missing count = %d, want %d (metrics: %v)", + node, len(state.MissingMetrics), wantCount, state.MissingMetrics) + } + } + } + }) + } +}
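Reviewer note: a minimal usage sketch of the new batch health-check API, mirroring the integration in `updateNodeStates` above. It relies only on names added or referenced in this patch (`metricstore.GetMemoryStore`, `HealthCheckAlt`, `NodeHealthState`, the `schema.MonitoringState*` constants); the cluster, host, and metric names are hypothetical placeholders.

```go
package main

import (
	"fmt"

	"github.com/ClusterCockpit/cc-backend/pkg/metricstore"
	"github.com/ClusterCockpit/cc-lib/v2/schema"
)

func main() {
	// One HealthCheckAlt call per group of nodes that share the same expected
	// metric list (in updateNodeStates this grouping is done per subcluster).
	ms := metricstore.GetMemoryStore()
	cluster := "fritz"                                               // hypothetical
	nodes := []string{"f0101", "f0102"}                              // hypothetical
	expectedMetrics := []string{"cpu_load", "mem_used", "flops_any"} // hypothetical

	states, err := ms.HealthCheckAlt(cluster, nodes, expectedMetrics)
	if err != nil {
		fmt.Println("health check failed:", err)
		return
	}

	for hostname, state := range states {
		switch state.Status {
		case schema.MonitoringStateFull:
			fmt.Printf("%s: all %d expected metrics healthy\n", hostname, len(state.HealthyMetrics))
		case schema.MonitoringStatePartial:
			fmt.Printf("%s: degraded=%v missing=%v\n", hostname, state.DegradedMetrics, state.MissingMetrics)
		case schema.MonitoringStateFailed:
			fmt.Printf("%s: host not found or no healthy metrics (missing=%v)\n", hostname, state.MissingMetrics)
		}
	}
}
```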
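The health classification added here reduces to two thresholds: a buffer is stale when its last slot is older than `MaxMissingDataPoints * frequency` (isBufferHealthy), and a fresh buffer counts as degraded when more than `MaxMissingDataPoints` of its last `2 * MaxMissingDataPoints` values are NaN (countMissingValues / getHealthyMetrics). The sketch below restates that arithmetic standalone; the frequency, point count, and the value 5 for `MaxMissingDataPoints` are illustrative assumptions, not the store's configured defaults.

```go
package main

import (
	"fmt"
	"time"
)

func main() {
	// Illustrative assumptions, not the configured values.
	var frequency int64 = 60           // seconds per data point
	var maxMissingDataPoints int64 = 5 // stands in for MaxMissingDataPoints

	// Staleness rule: time since the buffer's last slot vs. the threshold.
	now := time.Now().Unix()
	start := now - 40*frequency // buffer started 40 intervals ago
	points := int64(36)         // data points written so far
	bufferEnd := start + frequency*points
	stale := now-bufferEnd > maxMissingDataPoints*frequency
	fmt.Printf("gap=%ds threshold=%ds stale=%v\n",
		now-bufferEnd, maxMissingDataPoints*frequency, stale) // gap=240s threshold=300s stale=false

	// Degradation rule: NaN count within the last 2*MaxMissingDataPoints values.
	nanCount := int64(6)
	degraded := nanCount > maxMissingDataPoints
	fmt.Printf("NaNs in window=%d degraded=%v\n", nanCount, degraded) // degraded=true
}
```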