Change API of Node HealthState

2026-02-03 14:55:12 +01:00
parent 00a41373e8
commit 248f11f4f8
3 changed files with 30 additions and 102 deletions
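For quick orientation, a minimal caller-side sketch of the API change this commit makes (signatures and type names are taken from the diff below; the cluster, node, and metric values are placeholders, and the enclosing function setup is assumed):

// Before this commit:
//   HealthCheckAlt(cluster, nodes, expectedMetrics) (map[string]NodeHealthState, error)
// After this commit:
//   HealthCheckAlt(cluster, nodes, expectedMetrics) (map[string]schema.MonitoringState, error)

ms := metricstore.GetMemoryStore()
states, err := ms.HealthCheckAlt("cluster01", []string{"node001", "node002"}, []string{"cpu_load", "mem_used"})
if err != nil {
	return err
}
for hostname, state := range states {
	// state is now the aggregated value directly (Full, Partial, or Failed),
	// instead of a NodeHealthState struct carrying per-metric slices.
	fmt.Printf("Node %s: %s\n", hostname, state)
}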

View File

@@ -79,7 +79,7 @@ func (api *RestAPI) updateNodeStates(rw http.ResponseWriter, r *http.Request) {
ms := metricstore.GetMemoryStore()
m := make(map[string][]string)
- healthStates := make(map[string]metricstore.NodeHealthState)
+ healthStates := make(map[string]schema.MonitoringState)
for _, node := range req.Nodes {
if sc, err := archive.GetSubClusterByNode(req.Cluster, node.Hostname); err == nil {
@@ -101,7 +101,7 @@ func (api *RestAPI) updateNodeStates(rw http.ResponseWriter, r *http.Request) {
state := determineState(node.States)
healthState := schema.MonitoringStateFull
if hs, ok := healthStates[node.Hostname]; ok {
- healthState = hs.Status
+ healthState = hs
}
nodeState := schema.NodeStateDB{
TimeStamp: requestReceived,

View File

@@ -360,19 +360,11 @@ func (m *MemoryStore) GetHealthyMetrics(selector []string) ([]string, []string,
return healthyList, degradedList, nil
}
- // NodeHealthState represents the health status of a single node's metrics.
- type NodeHealthState struct {
- Status schema.MonitoringState // Overall health status: Full, Partial, or Failed
- HealthyMetrics []string // Metrics with recent data and few missing values
- DegradedMetrics []string // Metrics with recent data but many missing values
- MissingMetrics []string // Expected metrics that are completely missing or stale
- }
- // HealthCheckAlt performs health checks on multiple nodes and returns their health states.
+ // HealthCheckAlt performs health checks on multiple nodes and returns their monitoring states.
//
// This routine provides a batch health check interface that evaluates multiple nodes
- // against a specific set of expected metrics. For each node, it determines which metrics
- // are healthy, degraded, or missing, and assigns an overall health status.
+ // against a specific set of expected metrics. For each node, it determines the overall
+ // monitoring state based on which metrics are healthy, degraded, or missing.
//
// Health Status Classification:
// - MonitoringStateFull: All expected metrics are healthy (recent data, few missing values)
@@ -385,8 +377,8 @@ type NodeHealthState struct {
// - expectedMetrics: List of metric names that should be present on each node
//
// Returns:
- // - map[string]NodeHealthState: Map keyed by hostname containing health state for each node
- // - error: Non-nil only for internal errors (individual node failures are captured in NodeHealthState)
+ // - map[string]schema.MonitoringState: Map keyed by hostname containing monitoring state for each node
+ // - error: Non-nil only for internal errors (individual node failures are captured as MonitoringStateFailed)
//
// Example usage:
//
@@ -398,10 +390,7 @@ type NodeHealthState struct {
// return err
// }
// for hostname, state := range healthStates {
// fmt.Printf("Node %s: %s\n", hostname, state.Status)
// fmt.Printf(" Healthy: %v\n", state.HealthyMetrics)
// fmt.Printf(" Degraded: %v\n", state.DegradedMetrics)
// fmt.Printf(" Missing: %v\n", state.MissingMetrics)
// fmt.Printf("Node %s: %s\n", hostname, state)
// }
//
// Note: This routine is optimized for batch operations where you need to check
@@ -409,8 +398,8 @@ type NodeHealthState struct {
// all configured metrics, use HealthCheck() instead.
func (m *MemoryStore) HealthCheckAlt(cluster string,
nodes []string, expectedMetrics []string,
- ) (map[string]NodeHealthState, error) {
- results := make(map[string]NodeHealthState, len(nodes))
+ ) (map[string]schema.MonitoringState, error) {
+ results := make(map[string]schema.MonitoringState, len(nodes))
// Create a set of expected metrics for fast lookup
expectedSet := make(map[string]bool, len(expectedMetrics))
@@ -421,20 +410,16 @@ func (m *MemoryStore) HealthCheckAlt(cluster string,
// Check each node
for _, hostname := range nodes {
selector := []string{cluster, hostname}
- state := NodeHealthState{
- Status: schema.MonitoringStateFull,
- HealthyMetrics: make([]string, 0),
- DegradedMetrics: make([]string, 0),
- MissingMetrics: make([]string, 0),
- }
+ status := schema.MonitoringStateFull
+ healthyCount := 0
+ degradedCount := 0
+ missingCount := 0
// Get healthy and degraded metrics for this node
healthyList, degradedList, err := m.GetHealthyMetrics(selector)
if err != nil {
// Node not found or internal error
- state.Status = schema.MonitoringStateFailed
- state.MissingMetrics = expectedMetrics
- results[hostname] = state
+ results[hostname] = schema.MonitoringStateFailed
continue
}
@@ -451,27 +436,27 @@ func (m *MemoryStore) HealthCheckAlt(cluster string,
// Classify each expected metric
for _, metric := range expectedMetrics {
if healthySet[metric] {
- state.HealthyMetrics = append(state.HealthyMetrics, metric)
+ healthyCount++
} else if degradedSet[metric] {
- state.DegradedMetrics = append(state.DegradedMetrics, metric)
+ degradedCount++
} else {
- state.MissingMetrics = append(state.MissingMetrics, metric)
+ missingCount++
}
}
// Determine overall health status
- if len(state.MissingMetrics) > 0 || len(state.DegradedMetrics) > 0 {
- if len(state.HealthyMetrics) == 0 {
+ if missingCount > 0 || degradedCount > 0 {
+ if healthyCount == 0 {
// No healthy metrics at all
- state.Status = schema.MonitoringStateFailed
+ status = schema.MonitoringStateFailed
} else {
// Some healthy, some degraded/missing
- state.Status = schema.MonitoringStatePartial
+ status = schema.MonitoringStatePartial
}
}
// else: all metrics healthy, status remains MonitoringStateFull
- results[hostname] = state
+ results[hostname] = status
}
return results, nil
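
Callers that still need the per-metric breakdown (previously the HealthyMetrics, DegradedMetrics, and MissingMetrics slices) can pair the new return value with GetHealthyMetrics, which this file already exposes. A minimal sketch under that assumption, reusing the []string{cluster, hostname} selector convention seen above; the surrounding variables (ms, cluster, nodes, expectedMetrics) are assumed to be in scope:

states, err := ms.HealthCheckAlt(cluster, nodes, expectedMetrics)
if err != nil {
	return err
}
for hostname, state := range states {
	if state == schema.MonitoringStateFull {
		continue // all expected metrics healthy, no need to drill down
	}
	// Re-derive the per-metric detail only for partial or failed nodes.
	healthy, degraded, err := ms.GetHealthyMetrics([]string{cluster, hostname})
	if err != nil {
		fmt.Printf("Node %s: %s (no metric detail available: %v)\n", hostname, state, err)
		continue
	}
	fmt.Printf("Node %s: %s (healthy: %v, degraded: %v)\n", hostname, state, healthy, degraded)
}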

View File

@@ -161,14 +161,11 @@ func TestHealthCheckAlt(t *testing.T) {
// node004 doesn't exist at all
tests := []struct {
- name string
- cluster string
- nodes []string
- expectedMetrics []string
- wantStates map[string]schema.MonitoringState
- wantHealthyCounts map[string]int
- wantDegradedCounts map[string]int
- wantMissingCounts map[string]int
+ name string
+ cluster string
+ nodes []string
+ expectedMetrics []string
+ wantStates map[string]schema.MonitoringState
}{
{
name: "all metrics healthy",
@@ -178,9 +175,6 @@ func TestHealthCheckAlt(t *testing.T) {
wantStates: map[string]schema.MonitoringState{
"node001": schema.MonitoringStateFull,
},
wantHealthyCounts: map[string]int{"node001": 4},
wantDegradedCounts: map[string]int{"node001": 0},
wantMissingCounts: map[string]int{"node001": 0},
},
{
name: "some metrics degraded",
@@ -190,9 +184,6 @@ func TestHealthCheckAlt(t *testing.T) {
wantStates: map[string]schema.MonitoringState{
"node002": schema.MonitoringStatePartial,
},
wantHealthyCounts: map[string]int{"node002": 2},
wantDegradedCounts: map[string]int{"node002": 2},
wantMissingCounts: map[string]int{"node002": 0},
},
{
name: "some metrics missing",
@@ -202,9 +193,6 @@ func TestHealthCheckAlt(t *testing.T) {
wantStates: map[string]schema.MonitoringState{
"node003": schema.MonitoringStatePartial,
},
wantHealthyCounts: map[string]int{"node003": 2},
wantDegradedCounts: map[string]int{"node003": 0},
wantMissingCounts: map[string]int{"node003": 2},
},
{
name: "node not found",
@@ -214,9 +202,6 @@ func TestHealthCheckAlt(t *testing.T) {
wantStates: map[string]schema.MonitoringState{
"node004": schema.MonitoringStateFailed,
},
wantHealthyCounts: map[string]int{"node004": 0},
wantDegradedCounts: map[string]int{"node004": 0},
wantMissingCounts: map[string]int{"node004": 4},
},
{
name: "multiple nodes mixed states",
@@ -229,24 +214,6 @@ func TestHealthCheckAlt(t *testing.T) {
"node003": schema.MonitoringStateFull,
"node004": schema.MonitoringStateFailed,
},
- wantHealthyCounts: map[string]int{
- "node001": 2,
- "node002": 2,
- "node003": 2,
- "node004": 0,
- },
- wantDegradedCounts: map[string]int{
- "node001": 0,
- "node002": 0,
- "node003": 0,
- "node004": 0,
- },
- wantMissingCounts: map[string]int{
- "node001": 0,
- "node002": 0,
- "node003": 0,
- "node004": 2,
- },
},
}
@@ -273,32 +240,8 @@ func TestHealthCheckAlt(t *testing.T) {
// Check status
if wantStatus, ok := tt.wantStates[node]; ok {
- if state.Status != wantStatus {
- t.Errorf("HealthCheckAlt() node %s status = %v, want %v", node, state.Status, wantStatus)
- }
- }
- // Check healthy count
- if wantCount, ok := tt.wantHealthyCounts[node]; ok {
- if len(state.HealthyMetrics) != wantCount {
- t.Errorf("HealthCheckAlt() node %s healthy count = %d, want %d (metrics: %v)",
- node, len(state.HealthyMetrics), wantCount, state.HealthyMetrics)
- }
- }
- // Check degraded count
- if wantCount, ok := tt.wantDegradedCounts[node]; ok {
- if len(state.DegradedMetrics) != wantCount {
- t.Errorf("HealthCheckAlt() node %s degraded count = %d, want %d (metrics: %v)",
- node, len(state.DegradedMetrics), wantCount, state.DegradedMetrics)
- }
- }
- // Check missing count
- if wantCount, ok := tt.wantMissingCounts[node]; ok {
- if len(state.MissingMetrics) != wantCount {
- t.Errorf("HealthCheckAlt() node %s missing count = %d, want %d (metrics: %v)",
- node, len(state.MissingMetrics), wantCount, state.MissingMetrics)
+ if state != wantStatus {
+ t.Errorf("HealthCheckAlt() node %s status = %v, want %v", node, state, wantStatus)
}
}
}
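
Assembled from the fragments above, a test case and its per-node assertion under the simplified struct now read roughly as follows. The cluster and expectedMetrics values are elided in the diff, so the placeholders testCluster and testMetrics stand in, and the enclosing range over the returned map is assumed:

{
	name:            "some metrics degraded",
	cluster:         testCluster, // placeholder, actual value not shown in the diff
	nodes:           []string{"node002"},
	expectedMetrics: testMetrics, // placeholder, actual value not shown in the diff
	wantStates: map[string]schema.MonitoringState{
		"node002": schema.MonitoringStatePartial,
	},
},

// ... and the assertion, with state taken directly from the returned map:
for node, state := range states {
	if wantStatus, ok := tt.wantStates[node]; ok {
		if state != wantStatus {
			t.Errorf("HealthCheckAlt() node %s status = %v, want %v", node, state, wantStatus)
		}
	}
}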