From 248f11f4f85b3ddd7b81a444bef080bc9935e0ab Mon Sep 17 00:00:00 2001 From: Jan Eitzinger Date: Tue, 3 Feb 2026 14:55:12 +0100 Subject: [PATCH] Change API of Node HealthState --- internal/api/node.go | 4 +- pkg/metricstore/healthcheck.go | 57 +++++++++-------------- pkg/metricstore/metricstore_test.go | 71 +++-------------------------- 3 files changed, 30 insertions(+), 102 deletions(-) diff --git a/internal/api/node.go b/internal/api/node.go index 7039a06f..853b23e1 100644 --- a/internal/api/node.go +++ b/internal/api/node.go @@ -79,7 +79,7 @@ func (api *RestAPI) updateNodeStates(rw http.ResponseWriter, r *http.Request) { ms := metricstore.GetMemoryStore() m := make(map[string][]string) - healthStates := make(map[string]metricstore.NodeHealthState) + healthStates := make(map[string]schema.MonitoringState) for _, node := range req.Nodes { if sc, err := archive.GetSubClusterByNode(req.Cluster, node.Hostname); err == nil { @@ -101,7 +101,7 @@ func (api *RestAPI) updateNodeStates(rw http.ResponseWriter, r *http.Request) { state := determineState(node.States) healthState := schema.MonitoringStateFull if hs, ok := healthStates[node.Hostname]; ok { - healthState = hs.Status + healthState = hs } nodeState := schema.NodeStateDB{ TimeStamp: requestReceived, diff --git a/pkg/metricstore/healthcheck.go b/pkg/metricstore/healthcheck.go index a40394a3..5ab26466 100644 --- a/pkg/metricstore/healthcheck.go +++ b/pkg/metricstore/healthcheck.go @@ -360,19 +360,11 @@ func (m *MemoryStore) GetHealthyMetrics(selector []string) ([]string, []string, return healthyList, degradedList, nil } -// NodeHealthState represents the health status of a single node's metrics. -type NodeHealthState struct { - Status schema.MonitoringState // Overall health status: Full, Partial, or Failed - HealthyMetrics []string // Metrics with recent data and few missing values - DegradedMetrics []string // Metrics with recent data but many missing values - MissingMetrics []string // Expected metrics that are completely missing or stale -} - -// HealthCheckAlt performs health checks on multiple nodes and returns their health states. +// HealthCheckAlt performs health checks on multiple nodes and returns their monitoring states. // // This routine provides a batch health check interface that evaluates multiple nodes -// against a specific set of expected metrics. For each node, it determines which metrics -// are healthy, degraded, or missing, and assigns an overall health status. +// against a specific set of expected metrics. For each node, it determines the overall +// monitoring state based on which metrics are healthy, degraded, or missing. // // Health Status Classification: // - MonitoringStateFull: All expected metrics are healthy (recent data, few missing values) @@ -385,8 +377,8 @@ type NodeHealthState struct { // - expectedMetrics: List of metric names that should be present on each node // // Returns: -// - map[string]NodeHealthState: Map keyed by hostname containing health state for each node -// - error: Non-nil only for internal errors (individual node failures are captured in NodeHealthState) +// - map[string]schema.MonitoringState: Map keyed by hostname containing monitoring state for each node +// - error: Non-nil only for internal errors (individual node failures are captured as MonitoringStateFailed) // // Example usage: // @@ -398,10 +390,7 @@ type NodeHealthState struct { // return err // } // for hostname, state := range healthStates { -// fmt.Printf("Node %s: %s\n", hostname, state.Status) -// fmt.Printf(" Healthy: %v\n", state.HealthyMetrics) -// fmt.Printf(" Degraded: %v\n", state.DegradedMetrics) -// fmt.Printf(" Missing: %v\n", state.MissingMetrics) +// fmt.Printf("Node %s: %s\n", hostname, state) // } // // Note: This routine is optimized for batch operations where you need to check @@ -409,8 +398,8 @@ type NodeHealthState struct { // all configured metrics, use HealthCheck() instead. func (m *MemoryStore) HealthCheckAlt(cluster string, nodes []string, expectedMetrics []string, -) (map[string]NodeHealthState, error) { - results := make(map[string]NodeHealthState, len(nodes)) +) (map[string]schema.MonitoringState, error) { + results := make(map[string]schema.MonitoringState, len(nodes)) // Create a set of expected metrics for fast lookup expectedSet := make(map[string]bool, len(expectedMetrics)) @@ -421,20 +410,16 @@ func (m *MemoryStore) HealthCheckAlt(cluster string, // Check each node for _, hostname := range nodes { selector := []string{cluster, hostname} - state := NodeHealthState{ - Status: schema.MonitoringStateFull, - HealthyMetrics: make([]string, 0), - DegradedMetrics: make([]string, 0), - MissingMetrics: make([]string, 0), - } + status := schema.MonitoringStateFull + healthyCount := 0 + degradedCount := 0 + missingCount := 0 // Get healthy and degraded metrics for this node healthyList, degradedList, err := m.GetHealthyMetrics(selector) if err != nil { // Node not found or internal error - state.Status = schema.MonitoringStateFailed - state.MissingMetrics = expectedMetrics - results[hostname] = state + results[hostname] = schema.MonitoringStateFailed continue } @@ -451,27 +436,27 @@ func (m *MemoryStore) HealthCheckAlt(cluster string, // Classify each expected metric for _, metric := range expectedMetrics { if healthySet[metric] { - state.HealthyMetrics = append(state.HealthyMetrics, metric) + healthyCount++ } else if degradedSet[metric] { - state.DegradedMetrics = append(state.DegradedMetrics, metric) + degradedCount++ } else { - state.MissingMetrics = append(state.MissingMetrics, metric) + missingCount++ } } // Determine overall health status - if len(state.MissingMetrics) > 0 || len(state.DegradedMetrics) > 0 { - if len(state.HealthyMetrics) == 0 { + if missingCount > 0 || degradedCount > 0 { + if healthyCount == 0 { // No healthy metrics at all - state.Status = schema.MonitoringStateFailed + status = schema.MonitoringStateFailed } else { // Some healthy, some degraded/missing - state.Status = schema.MonitoringStatePartial + status = schema.MonitoringStatePartial } } // else: all metrics healthy, status remains MonitoringStateFull - results[hostname] = state + results[hostname] = status } return results, nil diff --git a/pkg/metricstore/metricstore_test.go b/pkg/metricstore/metricstore_test.go index 70ef73f8..e0fcfea5 100644 --- a/pkg/metricstore/metricstore_test.go +++ b/pkg/metricstore/metricstore_test.go @@ -161,14 +161,11 @@ func TestHealthCheckAlt(t *testing.T) { // node004 doesn't exist at all tests := []struct { - name string - cluster string - nodes []string - expectedMetrics []string - wantStates map[string]schema.MonitoringState - wantHealthyCounts map[string]int - wantDegradedCounts map[string]int - wantMissingCounts map[string]int + name string + cluster string + nodes []string + expectedMetrics []string + wantStates map[string]schema.MonitoringState }{ { name: "all metrics healthy", @@ -178,9 +175,6 @@ func TestHealthCheckAlt(t *testing.T) { wantStates: map[string]schema.MonitoringState{ "node001": schema.MonitoringStateFull, }, - wantHealthyCounts: map[string]int{"node001": 4}, - wantDegradedCounts: map[string]int{"node001": 0}, - wantMissingCounts: map[string]int{"node001": 0}, }, { name: "some metrics degraded", @@ -190,9 +184,6 @@ func TestHealthCheckAlt(t *testing.T) { wantStates: map[string]schema.MonitoringState{ "node002": schema.MonitoringStatePartial, }, - wantHealthyCounts: map[string]int{"node002": 2}, - wantDegradedCounts: map[string]int{"node002": 2}, - wantMissingCounts: map[string]int{"node002": 0}, }, { name: "some metrics missing", @@ -202,9 +193,6 @@ func TestHealthCheckAlt(t *testing.T) { wantStates: map[string]schema.MonitoringState{ "node003": schema.MonitoringStatePartial, }, - wantHealthyCounts: map[string]int{"node003": 2}, - wantDegradedCounts: map[string]int{"node003": 0}, - wantMissingCounts: map[string]int{"node003": 2}, }, { name: "node not found", @@ -214,9 +202,6 @@ func TestHealthCheckAlt(t *testing.T) { wantStates: map[string]schema.MonitoringState{ "node004": schema.MonitoringStateFailed, }, - wantHealthyCounts: map[string]int{"node004": 0}, - wantDegradedCounts: map[string]int{"node004": 0}, - wantMissingCounts: map[string]int{"node004": 4}, }, { name: "multiple nodes mixed states", @@ -229,24 +214,6 @@ func TestHealthCheckAlt(t *testing.T) { "node003": schema.MonitoringStateFull, "node004": schema.MonitoringStateFailed, }, - wantHealthyCounts: map[string]int{ - "node001": 2, - "node002": 2, - "node003": 2, - "node004": 0, - }, - wantDegradedCounts: map[string]int{ - "node001": 0, - "node002": 0, - "node003": 0, - "node004": 0, - }, - wantMissingCounts: map[string]int{ - "node001": 0, - "node002": 0, - "node003": 0, - "node004": 2, - }, }, } @@ -273,32 +240,8 @@ func TestHealthCheckAlt(t *testing.T) { // Check status if wantStatus, ok := tt.wantStates[node]; ok { - if state.Status != wantStatus { - t.Errorf("HealthCheckAlt() node %s status = %v, want %v", node, state.Status, wantStatus) - } - } - - // Check healthy count - if wantCount, ok := tt.wantHealthyCounts[node]; ok { - if len(state.HealthyMetrics) != wantCount { - t.Errorf("HealthCheckAlt() node %s healthy count = %d, want %d (metrics: %v)", - node, len(state.HealthyMetrics), wantCount, state.HealthyMetrics) - } - } - - // Check degraded count - if wantCount, ok := tt.wantDegradedCounts[node]; ok { - if len(state.DegradedMetrics) != wantCount { - t.Errorf("HealthCheckAlt() node %s degraded count = %d, want %d (metrics: %v)", - node, len(state.DegradedMetrics), wantCount, state.DegradedMetrics) - } - } - - // Check missing count - if wantCount, ok := tt.wantMissingCounts[node]; ok { - if len(state.MissingMetrics) != wantCount { - t.Errorf("HealthCheckAlt() node %s missing count = %d, want %d (metrics: %v)", - node, len(state.MissingMetrics), wantCount, state.MissingMetrics) + if state != wantStatus { + t.Errorf("HealthCheckAlt() node %s status = %v, want %v", node, state, wantStatus) } } }