Add monitoring healthstate support in nodestate API.

2026-03-03 22:57:29 +01:00 · 2026-02-03 12:23:24 +01:00
parent e9cd6b4225
commit 00a41373e8
3 changed files with 611 additions and 2 deletions
--- a/internal/api/node.go
+++ b/internal/api/node.go
@@ -7,11 +7,14 @@ package api

 import (
 	"fmt"
+	"maps"
 	"net/http"
 	"strings"
 	"time"

 	"github.com/ClusterCockpit/cc-backend/internal/repository"
+	"github.com/ClusterCockpit/cc-backend/pkg/archive"
+	"github.com/ClusterCockpit/cc-backend/pkg/metricstore"
 	"github.com/ClusterCockpit/cc-lib/v2/schema"
 )

@@ -20,6 +23,15 @@ type UpdateNodeStatesRequest struct {
 	Cluster string               `json:"cluster" example:"fritz"`
 }

+// metricListToNames returns the keys of metricList as a slice; the order is unspecified.
+func metricListToNames(metricList map[string]*schema.Metric) []string {
+	pos, out := 0, make([]string, len(metricList))
+	for metricName := range metricList {
+		out[pos], pos = metricName, pos+1 // the index operand is evaluated before pos is bumped
+	}
+	return out
+}
+
 // this routine assumes that only one of them exists per node
 func determineState(states []string) schema.SchedulerState {
 	for _, state := range states {
@@ -62,18 +74,42 @@ func (api *RestAPI) updateNodeStates(rw http.ResponseWriter, r *http.Request) {
 			http.StatusBadRequest, rw)
 		return
 	}
-	repo := repository.GetNodeRepository()
 	requestReceived := time.Now().Unix()
+	repo := repository.GetNodeRepository()
+	ms := metricstore.GetMemoryStore()
+
+	m := make(map[string][]string)
+	healthStates := make(map[string]metricstore.NodeHealthState)
+
+	for _, node := range req.Nodes {
+		if sc, err := archive.GetSubClusterByNode(req.Cluster, node.Hostname); err == nil {
+			m[sc] = append(m[sc], node.Hostname)
+		}
+	}
+
+	for sc, nl := range m {
+		if sc != "" {
+			metricList := archive.GetMetricConfigSubCluster(req.Cluster, sc)
+			metricNames := metricListToNames(metricList)
+			if states, err := ms.HealthCheckAlt(req.Cluster, nl, metricNames); err == nil {
+				maps.Copy(healthStates, states)
+			}
+		}
+	}

 	for _, node := range req.Nodes {
 		state := determineState(node.States)
+		healthState := schema.MonitoringStateFull
+		if hs, ok := healthStates[node.Hostname]; ok {
+			healthState = hs.Status
+		}
 		nodeState := schema.NodeStateDB{
 			TimeStamp:       requestReceived,
 			NodeState:       state,
 			CpusAllocated:   node.CpusAllocated,
 			MemoryAllocated: node.MemoryAllocated,
 			GpusAllocated:   node.GpusAllocated,
-			HealthState:     schema.MonitoringStateFull,
+			HealthState:     healthState,
 			JobsRunning:     node.JobsRunning,
 		}

--- a/pkg/metricstore/healthcheck.go
+++ b/pkg/metricstore/healthcheck.go
@@ -47,6 +47,45 @@ func (b *buffer) healthCheck() bool {
 	return false
 }

+// healthCheck recursively examines a level and all its children to identify stale or missing metrics.
+//
+// This routine performs a two-phase check:
+//
+// Phase 1 - Check metrics at current level (node-level metrics):
+//   - Iterates through all configured metrics in m.Metrics
+//   - For each metric, checks if a buffer exists at l.metrics[mc.offset]
+//   - If buffer exists: calls buffer.healthCheck() to verify data freshness
+//   - Stale buffer (data older than MaxMissingDataPoints * frequency) → StaleNodeMetricList
+//   - Fresh buffer → healthy, no action
+//   - If buffer is nil: metric was never written → MissingNodeMetricList
+//
+// Phase 2 - Recursively check child levels (hardware-level metrics):
+//   - Iterates through l.children (e.g., "cpu0", "gpu0", "socket0")
+//   - Recursively calls healthCheck() on each child level
+//   - Aggregates child results into hardware-specific lists:
+//   - Child's StaleNodeMetricList → parent's StaleHardwareMetricList[childName]
+//   - Child's MissingNodeMetricList → parent's MissingHardwareMetricList[childName]
+//
+// The recursive nature means:
+//   - Calling on a host level checks: host metrics + all CPU/GPU/socket metrics
+//   - Calling on a socket level checks: socket metrics + all core metrics
+//   - Leaf levels (e.g., individual cores) only check their own metrics
+//
+// Parameters:
+//   - m: MemoryStore containing the global metric configuration (m.Metrics)
+//
+// Returns:
+//   - List: Categorized lists of stale and missing metrics at this level and below
+//   - error: Non-nil only for internal errors during recursion
+//
+// Concurrency:
+//   - Acquires read lock (RLock) to safely access l.metrics and l.children
+//   - Lock held for entire duration including recursive calls
+//
+// Example for host level with structure: host → [cpu0, cpu1]:
+//   - Checks host-level metrics (load, memory) → StaleNodeMetricList / MissingNodeMetricList
+//   - Recursively checks cpu0 metrics → results in StaleHardwareMetricList["cpu0"]
+//   - Recursively checks cpu1 metrics → results in StaleHardwareMetricList["cpu1"]
 func (l *Level) healthCheck(m *MemoryStore) (List, error) {
 	l.lock.RLock()
 	defer l.lock.RUnlock()
@@ -58,6 +97,7 @@ func (l *Level) healthCheck(m *MemoryStore) (List, error) {
 		MissingHardwareMetricList: make(map[string][]string, 0),
 	}

+	// Phase 1: Check metrics at this level
 	for metricName, mc := range m.Metrics {
 		if b := l.metrics[mc.offset]; b != nil {
 			if b.healthCheck() {
@@ -68,6 +108,7 @@ func (l *Level) healthCheck(m *MemoryStore) (List, error) {
 		}
 	}

+	// Phase 2: Recursively check child levels (hardware components)
 	for hardwareMetricName, lvl := range l.children {
 		l, err := lvl.healthCheck(m)
 		if err != nil {
@@ -85,6 +126,48 @@ func (l *Level) healthCheck(m *MemoryStore) (List, error) {
 	return list, nil
 }

+// HealthCheck performs a health check on a specific node in the metric store.
+//
+// This routine checks whether metrics for a given node are being received and are up-to-date.
+// It examines both node-level metrics (e.g., load, memory) and hardware-level metrics
+// (e.g., CPU, GPU, network) to determine the monitoring state.
+//
+// Parameters:
+//   - selector: Hierarchical path to the target node, typically []string{cluster, hostname}.
+//     Example: []string{"emmy", "node001"} navigates to the "node001" host in the "emmy" cluster.
+//     The selector must match the hierarchy used during metric ingestion (see Level.findLevelOrCreate).
+//   - subcluster: Subcluster name (currently unused, reserved for future filtering)
+//
+// Returns:
+//   - *HeathCheckResponse: Health status with detailed lists of stale/missing metrics
+//   - error: Non-nil only for internal errors (not for unhealthy nodes)
+//
+// Health States:
+//   - MonitoringStateFull: All expected metrics are present and up-to-date
+//   - MonitoringStatePartial: Some metrics are stale (data older than MaxMissingDataPoints * frequency)
+//   - MonitoringStateFailed: Host not found, or metrics are completely missing
+//
+// The response includes detailed lists:
+//   - StaleNodeMetricList: Node-level metrics with stale data
+//   - StaleHardwareMetricList: Hardware-level metrics with stale data (grouped by component)
+//   - MissingNodeMetricList: Expected node-level metrics that have no data
+//   - MissingHardwareMetricList: Expected hardware-level metrics that have no data (grouped by component)
+//
+// Example usage:
+//
+//	selector := []string{"emmy", "node001"}
+//	response, err := ms.HealthCheck(selector, "")
+//	if err != nil {
+//	    // Internal error
+//	}
+//	switch response.Status {
+//	case schema.MonitoringStateFull:
+//	    // All metrics healthy
+//	case schema.MonitoringStatePartial:
+//	    // Check response.list.StaleNodeMetricList for details
+//	case schema.MonitoringStateFailed:
+//	    // Check response.Error or response.list.MissingNodeMetricList
+//	}
 func (m *MemoryStore) HealthCheck(selector []string, subcluster string) (*HeathCheckResponse, error) {
 	response := HeathCheckResponse{
 		Status: schema.MonitoringStateFull,
@@ -120,3 +203,276 @@ func (m *MemoryStore) HealthCheck(selector []string, subcluster string) (*HeathC

 	return &response, nil
 }
+
+// isBufferHealthy reports whether the buffer has received data recently.
+//
+// The end of the buffered data is taken as b.start + b.frequency*len(b.data). The
+// buffer is healthy when the current Unix time is at most
+// MaxMissingDataPoints*b.frequency past that end.
+//
+// A nil receiver and a buffer holding no values (nil or zero-length data) are
+// unhealthy, so the method is safe to call on a missing buffer pointer.
+func (b *buffer) isBufferHealthy() bool {
+	// len() of a nil slice is 0, so this covers both nil and allocated-but-empty data.
+	if b == nil || len(b.data) == 0 {
+		return false
+	}
+
+	bufferEnd := b.start + b.frequency*int64(len(b.data))
+	age := time.Now().Unix() - bufferEnd
+
+	// Healthy as long as no more than MaxMissingDataPoints intervals have elapsed.
+	return age <= MaxMissingDataPoints*b.frequency
+}
+
+// countMissingValues counts the NaN (missing) values among the newest data points.
+//
+// Only the last MaxMissingDataPoints*2 entries of b.data are examined, or all of
+// them if the buffer holds fewer. Looking at twice the threshold lets callers
+// detect that more than MaxMissingDataPoints values are missing.
+//
+// A nil receiver or a buffer without data yields 0.
+//
+// Returns:
+//   - int: Number of NaN values found in the examined range
+func (b *buffer) countMissingValues() int {
+	// len() of a nil slice is 0, so no separate nil check on b.data is needed.
+	if b == nil || len(b.data) == 0 {
+		return 0
+	}
+
+	// Window size: twice the threshold, capped at the number of stored values.
+	window := min(int(MaxMissingDataPoints)*2, len(b.data))
+
+	missing := 0
+	for _, v := range b.data[len(b.data)-window:] {
+		if v.IsNaN() {
+			missing++
+		}
+	}
+	return missing
+}
+
+// getHealthyMetrics collects the healthy and degraded metrics of this level and its subtree.
+//
+// For every metric configured in m.Metrics the buffer at l.metrics[offset] is inspected:
+//   - Healthy: isBufferHealthy() holds and countMissingValues() <= MaxMissingDataPoints
+//   - Degraded: isBufferHealthy() holds but countMissingValues() > MaxMissingDataPoints
+//   - Buffers for which isBufferHealthy() is false end up in neither list
+//
+// Afterwards every child level is visited recursively and its lists are appended,
+// so a metric name can occur several times in the flat result.
+//
+// Parameters:
+//   - m: MemoryStore providing the metric configuration (m.Metrics)
+//
+// Returns:
+//   - []string: Healthy metric names from this level and all children
+//   - []string: Degraded metric names from this level and all children
+//   - error: Propagated from a child level; this level itself never creates one
+//
+// Concurrency:
+//   - The level's read lock is held for the whole call, including the recursion
+func (l *Level) getHealthyMetrics(m *MemoryStore) ([]string, []string, error) {
+	l.lock.RLock()
+	defer l.lock.RUnlock()
+
+	healthy, degraded := make([]string, 0), make([]string, 0)
+
+	// Metrics stored directly on this level.
+	for name, cfg := range m.Metrics {
+		buf := l.metrics[cfg.offset]
+		if !buf.isBufferHealthy() {
+			// No buffer or no recent data: the metric belongs to neither list.
+			continue
+		}
+
+		// Recent data exists; the number of NaN values decides the category.
+		if buf.countMissingValues() > int(MaxMissingDataPoints) {
+			degraded = append(degraded, name)
+			continue
+		}
+		healthy = append(healthy, name)
+	}
+
+	// Child levels (hardware components): flatten their results into ours.
+	for _, child := range l.children {
+		subHealthy, subDegraded, err := child.getHealthyMetrics(m)
+		if err != nil {
+			return nil, nil, err
+		}
+		healthy = append(healthy, subHealthy...)
+		degraded = append(degraded, subDegraded...)
+	}
+
+	return healthy, degraded, nil
+}
+
+// GetHealthyMetrics returns the healthy and degraded metrics of one node as flat lists.
+//
+// The level addressed by selector is looked up below the store root and its
+// whole subtree is walked. Every configured metric whose buffer has recent
+// data is put into one of two categories:
+//
+//   - Healthy: recent data and at most MaxMissingDataPoints NaN (missing) values
+//     among the newest data points
+//   - Degraded: recent data but more than MaxMissingDataPoints NaN values
+//
+// Metrics without a buffer or without recent data appear in neither list.
+// Results of the node level and of all child levels (hardware components) are
+// merged into the same slices, so a metric name may be listed more than once.
+//
+// Parameters:
+//   - selector: Hierarchical path to the target node, typically []string{cluster, hostname}.
+//     Example: []string{"emmy", "node001"} navigates to the "node001" host in the "emmy" cluster.
+//     The selector must match the hierarchy used during metric ingestion.
+//
+// Returns:
+//   - []string: Flat list of healthy metric names (recent data, few missing values)
+//   - []string: Flat list of degraded metric names (recent data, many missing values)
+//   - error: Non-nil if no level exists for selector or the subtree walk fails;
+//     both lists are nil in that case
+//
+// Example usage:
+//
+//	selector := []string{"emmy", "node001"}
+//	healthyMetrics, degradedMetrics, err := ms.GetHealthyMetrics(selector)
+//	if err != nil {
+//	    // Node not found or internal error
+//	    return err
+//	}
+//	fmt.Printf("Healthy metrics: %v\n", healthyMetrics)
+//	// e.g. [load mem_used cpu_user]
+//	fmt.Printf("Degraded metrics: %v\n", degradedMetrics)
+//	// e.g. [gpu_temp network_rx]
+//
+// Note: HealthCheck and GetHealthyMetrics complement each other:
+//   - HealthCheck reports stale/missing metrics (problems)
+//   - GetHealthyMetrics separates fully healthy from degraded metrics (quality levels)
+func (m *MemoryStore) GetHealthyMetrics(selector []string) ([]string, []string, error) {
+	node := m.root.findLevel(selector)
+	if node == nil {
+		return nil, nil, fmt.Errorf("[METRICSTORE]> error while GetHealthyMetrics, host not found: %#v", selector)
+	}
+
+	// The level walk yields (nil, nil, err) on failure, so its results are
+	// handed to the caller unchanged.
+	return node.getHealthyMetrics(m)
+}
+
+// NodeHealthState is the per-node result of HealthCheckAlt: an overall status plus the expected metrics split into three groups.
+type NodeHealthState struct {
+	Status          schema.MonitoringState // Overall status; HealthCheckAlt sets Full, Partial or Failed
+	HealthyMetrics  []string               // Expected metrics with recent data and at most MaxMissingDataPoints NaN values in the checked window
+	DegradedMetrics []string               // Expected metrics with recent data but more than MaxMissingDataPoints NaN values
+	MissingMetrics  []string               // Expected metrics found in neither group (no buffer, no data, or stale data)
+}
+
+// HealthCheckAlt performs health checks on multiple nodes and returns their health states.
+//
+// For each hostname the level {cluster, hostname} is queried via GetHealthyMetrics and
+// every entry of expectedMetrics is sorted into exactly one of the groups healthy,
+// degraded or missing. The overall status follows from these groups.
+//
+// Health Status Classification:
+//   - MonitoringStateFull: All expected metrics are healthy (also when
+//     expectedMetrics is empty and the node exists)
+//   - MonitoringStatePartial: At least one expected metric is healthy and at least
+//     one is degraded or missing
+//   - MonitoringStateFailed: Node not found, or no expected metric is healthy
+//     (all of them are degraded, missing or stale)
+//
+// Parameters:
+//   - cluster: Cluster name (first element of selector path)
+//   - nodes: List of node hostnames to check
+//   - expectedMetrics: List of metric names that should be present on each node;
+//     the slice is only read, never stored in the result
+//
+// Returns:
+//   - map[string]NodeHealthState: Map keyed by hostname containing health state for each node
+//   - error: Currently always nil; per-node failures are captured in NodeHealthState
+//
+// Example usage:
+//
+//	cluster := "emmy"
+//	nodes := []string{"node001", "node002", "node003"}
+//	expectedMetrics := []string{"load", "mem_used", "cpu_user", "cpu_system"}
+//	healthStates, err := ms.HealthCheckAlt(cluster, nodes, expectedMetrics)
+//	if err != nil {
+//	    return err
+//	}
+//	for hostname, state := range healthStates {
+//	    fmt.Printf("Node %s: %s\n", hostname, state.Status)
+//	    fmt.Printf("  Healthy: %v\n", state.HealthyMetrics)
+//	    fmt.Printf("  Degraded: %v\n", state.DegradedMetrics)
+//	    fmt.Printf("  Missing: %v\n", state.MissingMetrics)
+//	}
+//
+// Note: Only the metrics named in expectedMetrics are evaluated, which suits
+// batch checks of the same metric set across many nodes. HealthCheck covers
+// the single-node case and walks all configured metrics instead.
+func (m *MemoryStore) HealthCheckAlt(cluster string,
+	nodes []string, expectedMetrics []string,
+) (map[string]NodeHealthState, error) {
+	results := make(map[string]NodeHealthState, len(nodes))
+
+	for _, hostname := range nodes {
+		state := NodeHealthState{
+			Status:          schema.MonitoringStateFull,
+			HealthyMetrics:  make([]string, 0),
+			DegradedMetrics: make([]string, 0),
+			MissingMetrics:  make([]string, 0),
+		}
+
+		healthyList, degradedList, err := m.GetHealthyMetrics([]string{cluster, hostname})
+		if err != nil {
+			// Node not found or internal error: every expected metric counts as
+			// missing. The names are copied so that the result never shares its
+			// backing array with the caller's slice or with other nodes' results.
+			state.Status = schema.MonitoringStateFailed
+			state.MissingMetrics = append(state.MissingMetrics, expectedMetrics...)
+			results[hostname] = state
+			continue
+		}
+
+		// Sets for constant-time membership tests.
+		healthySet := make(map[string]struct{}, len(healthyList))
+		for _, metric := range healthyList {
+			healthySet[metric] = struct{}{}
+		}
+		degradedSet := make(map[string]struct{}, len(degradedList))
+		for _, metric := range degradedList {
+			degradedSet[metric] = struct{}{}
+		}
+
+		// Classify each expected metric. Healthy is tested first, so a metric
+		// reported both healthy and degraded (e.g. by different child levels)
+		// is listed as healthy.
+		for _, metric := range expectedMetrics {
+			_, isHealthy := healthySet[metric]
+			_, isDegraded := degradedSet[metric]
+			switch {
+			case isHealthy:
+				state.HealthyMetrics = append(state.HealthyMetrics, metric)
+			case isDegraded:
+				state.DegradedMetrics = append(state.DegradedMetrics, metric)
+			default:
+				state.MissingMetrics = append(state.MissingMetrics, metric)
+			}
+		}
+
+		// Derive the overall status from the three groups.
+		switch {
+		case len(state.MissingMetrics) == 0 && len(state.DegradedMetrics) == 0:
+			// All expected metrics are healthy: status stays MonitoringStateFull.
+		case len(state.HealthyMetrics) == 0:
+			state.Status = schema.MonitoringStateFailed
+		default:
+			state.Status = schema.MonitoringStatePartial
+		}
+
+		results[hostname] = state
+	}
+
+	return results, nil
+}
--- a/pkg/metricstore/metricstore_test.go
+++ b/pkg/metricstore/metricstore_test.go
@@ -7,6 +7,7 @@ package metricstore

 import (
 	"testing"
+	"time"

 	"github.com/ClusterCockpit/cc-lib/v2/schema"
 )
@@ -88,3 +89,219 @@ func TestBufferRead(t *testing.T) {
 		t.Errorf("buffer.read() len(result) = %d, want 3", len(result))
 	}
 }
+
+// TestHealthCheckAlt checks the status and the healthy/degraded/missing counts that
+// HealthCheckAlt reports for nodes with healthy, degraded, missing and absent metrics.
+func TestHealthCheckAlt(t *testing.T) {
+	// Create a test MemoryStore with four metrics, one buffer slot (offset) each
+	metrics := map[string]MetricConfig{
+		"load":       {Frequency: 10, Aggregation: AvgAggregation, offset: 0},
+		"mem_used":   {Frequency: 10, Aggregation: AvgAggregation, offset: 1},
+		"cpu_user":   {Frequency: 10, Aggregation: AvgAggregation, offset: 2},
+		"cpu_system": {Frequency: 10, Aggregation: AvgAggregation, offset: 3},
+	}
+
+	ms := &MemoryStore{
+		Metrics: metrics,
+		root: Level{
+			metrics:  make([]*buffer, len(metrics)),
+			children: make(map[string]*Level),
+		},
+	}
+
+	// Use recent timestamps so the freshness check sees current data
+	now := time.Now().Unix()
+	startTime := now - 100 // Start 100 seconds ago to have enough data points
+
+	// Setup test data for node001 - all metrics healthy
+	node001 := ms.root.findLevelOrCreate([]string{"testcluster", "node001"}, len(metrics))
+	for i := 0; i < len(metrics); i++ {
+		node001.metrics[i] = newBuffer(startTime, 10)
+		// Write one real value every 10 seconds up to now (no NaN values)
+		for ts := startTime; ts <= now; ts += 10 {
+			node001.metrics[i].write(ts, schema.Float(float64(i+1)))
+		}
+	}
+
+	// Setup test data for node002 - some metrics degraded (many NaN values)
+	node002 := ms.root.findLevelOrCreate([]string{"testcluster", "node002"}, len(metrics))
+	for i := 0; i < len(metrics); i++ {
+		node002.metrics[i] = newBuffer(startTime, 10)
+		if i < 2 {
+			// First two metrics: healthy (no NaN)
+			for ts := startTime; ts <= now; ts += 10 {
+				node002.metrics[i].write(ts, schema.Float(float64(i+1)))
+			}
+		} else {
+			// Last two metrics: degraded - real values first, then NaN values at the end
+			count := 0
+			for ts := startTime; ts <= now; ts += 10 {
+				if count < 5 {
+					// Write first 5 real values
+					node002.metrics[i].write(ts, schema.Float(float64(i+1)))
+				} else {
+					// Write NaN for the rest (last ~6 values will be NaN)
+					node002.metrics[i].write(ts, schema.NaN)
+				}
+				count++
+			}
+		}
+	}
+
+	// Setup test data for node003 - only the first two metrics get a buffer
+	node003 := ms.root.findLevelOrCreate([]string{"testcluster", "node003"}, len(metrics))
+	for i := 0; i < 2; i++ {
+		node003.metrics[i] = newBuffer(startTime, 10)
+		for ts := startTime; ts <= now; ts += 10 {
+			node003.metrics[i].write(ts, schema.Float(float64(i+1)))
+		}
+	}
+	// Leave metrics[2] and metrics[3] as nil (missing)
+
+	// node004 is never created, so its lookup in the store fails
+
+	tests := []struct {
+		name               string
+		cluster            string
+		nodes              []string
+		expectedMetrics    []string
+		wantStates         map[string]schema.MonitoringState
+		wantHealthyCounts  map[string]int
+		wantDegradedCounts map[string]int
+		wantMissingCounts  map[string]int
+	}{
+		{
+			name:            "all metrics healthy",
+			cluster:         "testcluster",
+			nodes:           []string{"node001"},
+			expectedMetrics: []string{"load", "mem_used", "cpu_user", "cpu_system"},
+			wantStates: map[string]schema.MonitoringState{
+				"node001": schema.MonitoringStateFull,
+			},
+			wantHealthyCounts:  map[string]int{"node001": 4},
+			wantDegradedCounts: map[string]int{"node001": 0},
+			wantMissingCounts:  map[string]int{"node001": 0},
+		},
+		{
+			name:            "some metrics degraded",
+			cluster:         "testcluster",
+			nodes:           []string{"node002"},
+			expectedMetrics: []string{"load", "mem_used", "cpu_user", "cpu_system"},
+			wantStates: map[string]schema.MonitoringState{
+				"node002": schema.MonitoringStatePartial,
+			},
+			wantHealthyCounts:  map[string]int{"node002": 2},
+			wantDegradedCounts: map[string]int{"node002": 2},
+			wantMissingCounts:  map[string]int{"node002": 0},
+		},
+		{
+			name:            "some metrics missing",
+			cluster:         "testcluster",
+			nodes:           []string{"node003"},
+			expectedMetrics: []string{"load", "mem_used", "cpu_user", "cpu_system"},
+			wantStates: map[string]schema.MonitoringState{
+				"node003": schema.MonitoringStatePartial,
+			},
+			wantHealthyCounts:  map[string]int{"node003": 2},
+			wantDegradedCounts: map[string]int{"node003": 0},
+			wantMissingCounts:  map[string]int{"node003": 2},
+		},
+		{
+			name:            "node not found",
+			cluster:         "testcluster",
+			nodes:           []string{"node004"},
+			expectedMetrics: []string{"load", "mem_used", "cpu_user", "cpu_system"},
+			wantStates: map[string]schema.MonitoringState{
+				"node004": schema.MonitoringStateFailed,
+			},
+			wantHealthyCounts:  map[string]int{"node004": 0},
+			wantDegradedCounts: map[string]int{"node004": 0},
+			wantMissingCounts:  map[string]int{"node004": 4},
+		},
+		{
+			name:            "multiple nodes mixed states",
+			cluster:         "testcluster",
+			nodes:           []string{"node001", "node002", "node003", "node004"},
+			expectedMetrics: []string{"load", "mem_used"},
+			wantStates: map[string]schema.MonitoringState{
+				"node001": schema.MonitoringStateFull,
+				"node002": schema.MonitoringStateFull,
+				"node003": schema.MonitoringStateFull,
+				"node004": schema.MonitoringStateFailed,
+			},
+			wantHealthyCounts: map[string]int{
+				"node001": 2,
+				"node002": 2,
+				"node003": 2,
+				"node004": 0,
+			},
+			wantDegradedCounts: map[string]int{
+				"node001": 0,
+				"node002": 0,
+				"node003": 0,
+				"node004": 0,
+			},
+			wantMissingCounts: map[string]int{
+				"node001": 0,
+				"node002": 0,
+				"node003": 0,
+				"node004": 2,
+			},
+		},
+	}
+
+	for _, tt := range tests {
+		t.Run(tt.name, func(t *testing.T) {
+			results, err := ms.HealthCheckAlt(tt.cluster, tt.nodes, tt.expectedMetrics)
+			if err != nil {
+				t.Errorf("HealthCheckAlt() error = %v", err)
+				return
+			}
+
+			// Check that we got exactly one result per requested node
+			if len(results) != len(tt.nodes) {
+				t.Errorf("HealthCheckAlt() returned %d results, want %d", len(results), len(tt.nodes))
+			}
+
+			// Check each node's state; expectations absent from the want maps are skipped
+			for _, node := range tt.nodes {
+				state, ok := results[node]
+				if !ok {
+					t.Errorf("HealthCheckAlt() missing result for node %s", node)
+					continue
+				}
+
+				// Check status
+				if wantStatus, ok := tt.wantStates[node]; ok {
+					if state.Status != wantStatus {
+						t.Errorf("HealthCheckAlt() node %s status = %v, want %v", node, state.Status, wantStatus)
+					}
+				}
+
+				// Check healthy count
+				if wantCount, ok := tt.wantHealthyCounts[node]; ok {
+					if len(state.HealthyMetrics) != wantCount {
+						t.Errorf("HealthCheckAlt() node %s healthy count = %d, want %d (metrics: %v)",
+							node, len(state.HealthyMetrics), wantCount, state.HealthyMetrics)
+					}
+				}
+
+				// Check degraded count
+				if wantCount, ok := tt.wantDegradedCounts[node]; ok {
+					if len(state.DegradedMetrics) != wantCount {
+						t.Errorf("HealthCheckAlt() node %s degraded count = %d, want %d (metrics: %v)",
+							node, len(state.DegradedMetrics), wantCount, state.DegradedMetrics)
+					}
+				}
+
+				// Check missing count
+				if wantCount, ok := tt.wantMissingCounts[node]; ok {
+					if len(state.MissingMetrics) != wantCount {
+						t.Errorf("HealthCheckAlt() node %s missing count = %d, want %d (metrics: %v)",
+							node, len(state.MissingMetrics), wantCount, state.MissingMetrics)
+					}
+				}
+			}
+		})
+	}
+}