Change API of Node HealthState: HealthCheckAlt now returns a schema.MonitoringState per node instead of a NodeHealthState struct

This commit is contained in:
2026-02-03 14:55:12 +01:00
parent 00a41373e8
commit 248f11f4f8
3 changed files with 30 additions and 102 deletions

View File

@@ -79,7 +79,7 @@ func (api *RestAPI) updateNodeStates(rw http.ResponseWriter, r *http.Request) {
ms := metricstore.GetMemoryStore() ms := metricstore.GetMemoryStore()
m := make(map[string][]string) m := make(map[string][]string)
healthStates := make(map[string]metricstore.NodeHealthState) healthStates := make(map[string]schema.MonitoringState)
for _, node := range req.Nodes { for _, node := range req.Nodes {
if sc, err := archive.GetSubClusterByNode(req.Cluster, node.Hostname); err == nil { if sc, err := archive.GetSubClusterByNode(req.Cluster, node.Hostname); err == nil {
@@ -101,7 +101,7 @@ func (api *RestAPI) updateNodeStates(rw http.ResponseWriter, r *http.Request) {
state := determineState(node.States) state := determineState(node.States)
healthState := schema.MonitoringStateFull healthState := schema.MonitoringStateFull
if hs, ok := healthStates[node.Hostname]; ok { if hs, ok := healthStates[node.Hostname]; ok {
healthState = hs.Status healthState = hs
} }
nodeState := schema.NodeStateDB{ nodeState := schema.NodeStateDB{
TimeStamp: requestReceived, TimeStamp: requestReceived,

View File

@@ -360,19 +360,11 @@ func (m *MemoryStore) GetHealthyMetrics(selector []string) ([]string, []string,
return healthyList, degradedList, nil return healthyList, degradedList, nil
} }
// NodeHealthState represents the health status of a single node's metrics. // HealthCheckAlt performs health checks on multiple nodes and returns their monitoring states.
type NodeHealthState struct {
Status schema.MonitoringState // Overall health status: Full, Partial, or Failed
HealthyMetrics []string // Metrics with recent data and few missing values
DegradedMetrics []string // Metrics with recent data but many missing values
MissingMetrics []string // Expected metrics that are completely missing or stale
}
// HealthCheckAlt performs health checks on multiple nodes and returns their health states.
// //
// This routine provides a batch health check interface that evaluates multiple nodes // This routine provides a batch health check interface that evaluates multiple nodes
// against a specific set of expected metrics. For each node, it determines which metrics // against a specific set of expected metrics. For each node, it determines the overall
// are healthy, degraded, or missing, and assigns an overall health status. // monitoring state based on which metrics are healthy, degraded, or missing.
// //
// Health Status Classification: // Health Status Classification:
// - MonitoringStateFull: All expected metrics are healthy (recent data, few missing values) // - MonitoringStateFull: All expected metrics are healthy (recent data, few missing values)
@@ -385,8 +377,8 @@ type NodeHealthState struct {
// - expectedMetrics: List of metric names that should be present on each node // - expectedMetrics: List of metric names that should be present on each node
// //
// Returns: // Returns:
// - map[string]NodeHealthState: Map keyed by hostname containing health state for each node // - map[string]schema.MonitoringState: Map keyed by hostname containing monitoring state for each node
// - error: Non-nil only for internal errors (individual node failures are captured in NodeHealthState) // - error: Non-nil only for internal errors (individual node failures are captured as MonitoringStateFailed)
// //
// Example usage: // Example usage:
// //
@@ -398,10 +390,7 @@ type NodeHealthState struct {
// return err // return err
// } // }
// for hostname, state := range healthStates { // for hostname, state := range healthStates {
// fmt.Printf("Node %s: %s\n", hostname, state.Status) // fmt.Printf("Node %s: %s\n", hostname, state)
// fmt.Printf(" Healthy: %v\n", state.HealthyMetrics)
// fmt.Printf(" Degraded: %v\n", state.DegradedMetrics)
// fmt.Printf(" Missing: %v\n", state.MissingMetrics)
// } // }
// //
// Note: This routine is optimized for batch operations where you need to check // Note: This routine is optimized for batch operations where you need to check
@@ -409,8 +398,8 @@ type NodeHealthState struct {
// all configured metrics, use HealthCheck() instead. // all configured metrics, use HealthCheck() instead.
func (m *MemoryStore) HealthCheckAlt(cluster string, func (m *MemoryStore) HealthCheckAlt(cluster string,
nodes []string, expectedMetrics []string, nodes []string, expectedMetrics []string,
) (map[string]NodeHealthState, error) { ) (map[string]schema.MonitoringState, error) {
results := make(map[string]NodeHealthState, len(nodes)) results := make(map[string]schema.MonitoringState, len(nodes))
// Create a set of expected metrics for fast lookup // Create a set of expected metrics for fast lookup
expectedSet := make(map[string]bool, len(expectedMetrics)) expectedSet := make(map[string]bool, len(expectedMetrics))
@@ -421,20 +410,16 @@ func (m *MemoryStore) HealthCheckAlt(cluster string,
// Check each node // Check each node
for _, hostname := range nodes { for _, hostname := range nodes {
selector := []string{cluster, hostname} selector := []string{cluster, hostname}
state := NodeHealthState{ status := schema.MonitoringStateFull
Status: schema.MonitoringStateFull, healthyCount := 0
HealthyMetrics: make([]string, 0), degradedCount := 0
DegradedMetrics: make([]string, 0), missingCount := 0
MissingMetrics: make([]string, 0),
}
// Get healthy and degraded metrics for this node // Get healthy and degraded metrics for this node
healthyList, degradedList, err := m.GetHealthyMetrics(selector) healthyList, degradedList, err := m.GetHealthyMetrics(selector)
if err != nil { if err != nil {
// Node not found or internal error // Node not found or internal error
state.Status = schema.MonitoringStateFailed results[hostname] = schema.MonitoringStateFailed
state.MissingMetrics = expectedMetrics
results[hostname] = state
continue continue
} }
@@ -451,27 +436,27 @@ func (m *MemoryStore) HealthCheckAlt(cluster string,
// Classify each expected metric // Classify each expected metric
for _, metric := range expectedMetrics { for _, metric := range expectedMetrics {
if healthySet[metric] { if healthySet[metric] {
state.HealthyMetrics = append(state.HealthyMetrics, metric) healthyCount++
} else if degradedSet[metric] { } else if degradedSet[metric] {
state.DegradedMetrics = append(state.DegradedMetrics, metric) degradedCount++
} else { } else {
state.MissingMetrics = append(state.MissingMetrics, metric) missingCount++
} }
} }
// Determine overall health status // Determine overall health status
if len(state.MissingMetrics) > 0 || len(state.DegradedMetrics) > 0 { if missingCount > 0 || degradedCount > 0 {
if len(state.HealthyMetrics) == 0 { if healthyCount == 0 {
// No healthy metrics at all // No healthy metrics at all
state.Status = schema.MonitoringStateFailed status = schema.MonitoringStateFailed
} else { } else {
// Some healthy, some degraded/missing // Some healthy, some degraded/missing
state.Status = schema.MonitoringStatePartial status = schema.MonitoringStatePartial
} }
} }
// else: all metrics healthy, status remains MonitoringStateFull // else: all metrics healthy, status remains MonitoringStateFull
results[hostname] = state results[hostname] = status
} }
return results, nil return results, nil

View File

@@ -161,14 +161,11 @@ func TestHealthCheckAlt(t *testing.T) {
// node004 doesn't exist at all // node004 doesn't exist at all
tests := []struct { tests := []struct {
name string name string
cluster string cluster string
nodes []string nodes []string
expectedMetrics []string expectedMetrics []string
wantStates map[string]schema.MonitoringState wantStates map[string]schema.MonitoringState
wantHealthyCounts map[string]int
wantDegradedCounts map[string]int
wantMissingCounts map[string]int
}{ }{
{ {
name: "all metrics healthy", name: "all metrics healthy",
@@ -178,9 +175,6 @@ func TestHealthCheckAlt(t *testing.T) {
wantStates: map[string]schema.MonitoringState{ wantStates: map[string]schema.MonitoringState{
"node001": schema.MonitoringStateFull, "node001": schema.MonitoringStateFull,
}, },
wantHealthyCounts: map[string]int{"node001": 4},
wantDegradedCounts: map[string]int{"node001": 0},
wantMissingCounts: map[string]int{"node001": 0},
}, },
{ {
name: "some metrics degraded", name: "some metrics degraded",
@@ -190,9 +184,6 @@ func TestHealthCheckAlt(t *testing.T) {
wantStates: map[string]schema.MonitoringState{ wantStates: map[string]schema.MonitoringState{
"node002": schema.MonitoringStatePartial, "node002": schema.MonitoringStatePartial,
}, },
wantHealthyCounts: map[string]int{"node002": 2},
wantDegradedCounts: map[string]int{"node002": 2},
wantMissingCounts: map[string]int{"node002": 0},
}, },
{ {
name: "some metrics missing", name: "some metrics missing",
@@ -202,9 +193,6 @@ func TestHealthCheckAlt(t *testing.T) {
wantStates: map[string]schema.MonitoringState{ wantStates: map[string]schema.MonitoringState{
"node003": schema.MonitoringStatePartial, "node003": schema.MonitoringStatePartial,
}, },
wantHealthyCounts: map[string]int{"node003": 2},
wantDegradedCounts: map[string]int{"node003": 0},
wantMissingCounts: map[string]int{"node003": 2},
}, },
{ {
name: "node not found", name: "node not found",
@@ -214,9 +202,6 @@ func TestHealthCheckAlt(t *testing.T) {
wantStates: map[string]schema.MonitoringState{ wantStates: map[string]schema.MonitoringState{
"node004": schema.MonitoringStateFailed, "node004": schema.MonitoringStateFailed,
}, },
wantHealthyCounts: map[string]int{"node004": 0},
wantDegradedCounts: map[string]int{"node004": 0},
wantMissingCounts: map[string]int{"node004": 4},
}, },
{ {
name: "multiple nodes mixed states", name: "multiple nodes mixed states",
@@ -229,24 +214,6 @@ func TestHealthCheckAlt(t *testing.T) {
"node003": schema.MonitoringStateFull, "node003": schema.MonitoringStateFull,
"node004": schema.MonitoringStateFailed, "node004": schema.MonitoringStateFailed,
}, },
wantHealthyCounts: map[string]int{
"node001": 2,
"node002": 2,
"node003": 2,
"node004": 0,
},
wantDegradedCounts: map[string]int{
"node001": 0,
"node002": 0,
"node003": 0,
"node004": 0,
},
wantMissingCounts: map[string]int{
"node001": 0,
"node002": 0,
"node003": 0,
"node004": 2,
},
}, },
} }
@@ -273,32 +240,8 @@ func TestHealthCheckAlt(t *testing.T) {
// Check status // Check status
if wantStatus, ok := tt.wantStates[node]; ok { if wantStatus, ok := tt.wantStates[node]; ok {
if state.Status != wantStatus { if state != wantStatus {
t.Errorf("HealthCheckAlt() node %s status = %v, want %v", node, state.Status, wantStatus) t.Errorf("HealthCheckAlt() node %s status = %v, want %v", node, state, wantStatus)
}
}
// Check healthy count
if wantCount, ok := tt.wantHealthyCounts[node]; ok {
if len(state.HealthyMetrics) != wantCount {
t.Errorf("HealthCheckAlt() node %s healthy count = %d, want %d (metrics: %v)",
node, len(state.HealthyMetrics), wantCount, state.HealthyMetrics)
}
}
// Check degraded count
if wantCount, ok := tt.wantDegradedCounts[node]; ok {
if len(state.DegradedMetrics) != wantCount {
t.Errorf("HealthCheckAlt() node %s degraded count = %d, want %d (metrics: %v)",
node, len(state.DegradedMetrics), wantCount, state.DegradedMetrics)
}
}
// Check missing count
if wantCount, ok := tt.wantMissingCounts[node]; ok {
if len(state.MissingMetrics) != wantCount {
t.Errorf("HealthCheckAlt() node %s missing count = %d, want %d (metrics: %v)",
node, len(state.MissingMetrics), wantCount, state.MissingMetrics)
} }
} }
} }