mirror of
https://github.com/ClusterCockpit/cc-backend
synced 2026-02-11 13:31:45 +01:00
Change API of Node HealthState
This commit is contained in:
@@ -79,7 +79,7 @@ func (api *RestAPI) updateNodeStates(rw http.ResponseWriter, r *http.Request) {
|
||||
ms := metricstore.GetMemoryStore()
|
||||
|
||||
m := make(map[string][]string)
|
||||
healthStates := make(map[string]metricstore.NodeHealthState)
|
||||
healthStates := make(map[string]schema.MonitoringState)
|
||||
|
||||
for _, node := range req.Nodes {
|
||||
if sc, err := archive.GetSubClusterByNode(req.Cluster, node.Hostname); err == nil {
|
||||
@@ -101,7 +101,7 @@ func (api *RestAPI) updateNodeStates(rw http.ResponseWriter, r *http.Request) {
|
||||
state := determineState(node.States)
|
||||
healthState := schema.MonitoringStateFull
|
||||
if hs, ok := healthStates[node.Hostname]; ok {
|
||||
healthState = hs.Status
|
||||
healthState = hs
|
||||
}
|
||||
nodeState := schema.NodeStateDB{
|
||||
TimeStamp: requestReceived,
|
||||
|
||||
@@ -360,19 +360,11 @@ func (m *MemoryStore) GetHealthyMetrics(selector []string) ([]string, []string,
|
||||
return healthyList, degradedList, nil
|
||||
}
|
||||
|
||||
// NodeHealthState represents the health status of a single node's metrics.
|
||||
type NodeHealthState struct {
|
||||
Status schema.MonitoringState // Overall health status: Full, Partial, or Failed
|
||||
HealthyMetrics []string // Metrics with recent data and few missing values
|
||||
DegradedMetrics []string // Metrics with recent data but many missing values
|
||||
MissingMetrics []string // Expected metrics that are completely missing or stale
|
||||
}
|
||||
|
||||
// HealthCheckAlt performs health checks on multiple nodes and returns their health states.
|
||||
// HealthCheckAlt performs health checks on multiple nodes and returns their monitoring states.
|
||||
//
|
||||
// This routine provides a batch health check interface that evaluates multiple nodes
|
||||
// against a specific set of expected metrics. For each node, it determines which metrics
|
||||
// are healthy, degraded, or missing, and assigns an overall health status.
|
||||
// against a specific set of expected metrics. For each node, it determines the overall
|
||||
// monitoring state based on which metrics are healthy, degraded, or missing.
|
||||
//
|
||||
// Health Status Classification:
|
||||
// - MonitoringStateFull: All expected metrics are healthy (recent data, few missing values)
|
||||
@@ -385,8 +377,8 @@ type NodeHealthState struct {
|
||||
// - expectedMetrics: List of metric names that should be present on each node
|
||||
//
|
||||
// Returns:
|
||||
// - map[string]NodeHealthState: Map keyed by hostname containing health state for each node
|
||||
// - error: Non-nil only for internal errors (individual node failures are captured in NodeHealthState)
|
||||
// - map[string]schema.MonitoringState: Map keyed by hostname containing monitoring state for each node
|
||||
// - error: Non-nil only for internal errors (individual node failures are captured as MonitoringStateFailed)
|
||||
//
|
||||
// Example usage:
|
||||
//
|
||||
@@ -398,10 +390,7 @@ type NodeHealthState struct {
|
||||
// return err
|
||||
// }
|
||||
// for hostname, state := range healthStates {
|
||||
// fmt.Printf("Node %s: %s\n", hostname, state.Status)
|
||||
// fmt.Printf(" Healthy: %v\n", state.HealthyMetrics)
|
||||
// fmt.Printf(" Degraded: %v\n", state.DegradedMetrics)
|
||||
// fmt.Printf(" Missing: %v\n", state.MissingMetrics)
|
||||
// fmt.Printf("Node %s: %s\n", hostname, state)
|
||||
// }
|
||||
//
|
||||
// Note: This routine is optimized for batch operations where you need to check
|
||||
@@ -409,8 +398,8 @@ type NodeHealthState struct {
|
||||
// all configured metrics, use HealthCheck() instead.
|
||||
func (m *MemoryStore) HealthCheckAlt(cluster string,
|
||||
nodes []string, expectedMetrics []string,
|
||||
) (map[string]NodeHealthState, error) {
|
||||
results := make(map[string]NodeHealthState, len(nodes))
|
||||
) (map[string]schema.MonitoringState, error) {
|
||||
results := make(map[string]schema.MonitoringState, len(nodes))
|
||||
|
||||
// Create a set of expected metrics for fast lookup
|
||||
expectedSet := make(map[string]bool, len(expectedMetrics))
|
||||
@@ -421,20 +410,16 @@ func (m *MemoryStore) HealthCheckAlt(cluster string,
|
||||
// Check each node
|
||||
for _, hostname := range nodes {
|
||||
selector := []string{cluster, hostname}
|
||||
state := NodeHealthState{
|
||||
Status: schema.MonitoringStateFull,
|
||||
HealthyMetrics: make([]string, 0),
|
||||
DegradedMetrics: make([]string, 0),
|
||||
MissingMetrics: make([]string, 0),
|
||||
}
|
||||
status := schema.MonitoringStateFull
|
||||
healthyCount := 0
|
||||
degradedCount := 0
|
||||
missingCount := 0
|
||||
|
||||
// Get healthy and degraded metrics for this node
|
||||
healthyList, degradedList, err := m.GetHealthyMetrics(selector)
|
||||
if err != nil {
|
||||
// Node not found or internal error
|
||||
state.Status = schema.MonitoringStateFailed
|
||||
state.MissingMetrics = expectedMetrics
|
||||
results[hostname] = state
|
||||
results[hostname] = schema.MonitoringStateFailed
|
||||
continue
|
||||
}
|
||||
|
||||
@@ -451,27 +436,27 @@ func (m *MemoryStore) HealthCheckAlt(cluster string,
|
||||
// Classify each expected metric
|
||||
for _, metric := range expectedMetrics {
|
||||
if healthySet[metric] {
|
||||
state.HealthyMetrics = append(state.HealthyMetrics, metric)
|
||||
healthyCount++
|
||||
} else if degradedSet[metric] {
|
||||
state.DegradedMetrics = append(state.DegradedMetrics, metric)
|
||||
degradedCount++
|
||||
} else {
|
||||
state.MissingMetrics = append(state.MissingMetrics, metric)
|
||||
missingCount++
|
||||
}
|
||||
}
|
||||
|
||||
// Determine overall health status
|
||||
if len(state.MissingMetrics) > 0 || len(state.DegradedMetrics) > 0 {
|
||||
if len(state.HealthyMetrics) == 0 {
|
||||
if missingCount > 0 || degradedCount > 0 {
|
||||
if healthyCount == 0 {
|
||||
// No healthy metrics at all
|
||||
state.Status = schema.MonitoringStateFailed
|
||||
status = schema.MonitoringStateFailed
|
||||
} else {
|
||||
// Some healthy, some degraded/missing
|
||||
state.Status = schema.MonitoringStatePartial
|
||||
status = schema.MonitoringStatePartial
|
||||
}
|
||||
}
|
||||
// else: all metrics healthy, status remains MonitoringStateFull
|
||||
|
||||
results[hostname] = state
|
||||
results[hostname] = status
|
||||
}
|
||||
|
||||
return results, nil
|
||||
|
||||
@@ -161,14 +161,11 @@ func TestHealthCheckAlt(t *testing.T) {
|
||||
// node004 doesn't exist at all
|
||||
|
||||
tests := []struct {
|
||||
name string
|
||||
cluster string
|
||||
nodes []string
|
||||
expectedMetrics []string
|
||||
wantStates map[string]schema.MonitoringState
|
||||
wantHealthyCounts map[string]int
|
||||
wantDegradedCounts map[string]int
|
||||
wantMissingCounts map[string]int
|
||||
name string
|
||||
cluster string
|
||||
nodes []string
|
||||
expectedMetrics []string
|
||||
wantStates map[string]schema.MonitoringState
|
||||
}{
|
||||
{
|
||||
name: "all metrics healthy",
|
||||
@@ -178,9 +175,6 @@ func TestHealthCheckAlt(t *testing.T) {
|
||||
wantStates: map[string]schema.MonitoringState{
|
||||
"node001": schema.MonitoringStateFull,
|
||||
},
|
||||
wantHealthyCounts: map[string]int{"node001": 4},
|
||||
wantDegradedCounts: map[string]int{"node001": 0},
|
||||
wantMissingCounts: map[string]int{"node001": 0},
|
||||
},
|
||||
{
|
||||
name: "some metrics degraded",
|
||||
@@ -190,9 +184,6 @@ func TestHealthCheckAlt(t *testing.T) {
|
||||
wantStates: map[string]schema.MonitoringState{
|
||||
"node002": schema.MonitoringStatePartial,
|
||||
},
|
||||
wantHealthyCounts: map[string]int{"node002": 2},
|
||||
wantDegradedCounts: map[string]int{"node002": 2},
|
||||
wantMissingCounts: map[string]int{"node002": 0},
|
||||
},
|
||||
{
|
||||
name: "some metrics missing",
|
||||
@@ -202,9 +193,6 @@ func TestHealthCheckAlt(t *testing.T) {
|
||||
wantStates: map[string]schema.MonitoringState{
|
||||
"node003": schema.MonitoringStatePartial,
|
||||
},
|
||||
wantHealthyCounts: map[string]int{"node003": 2},
|
||||
wantDegradedCounts: map[string]int{"node003": 0},
|
||||
wantMissingCounts: map[string]int{"node003": 2},
|
||||
},
|
||||
{
|
||||
name: "node not found",
|
||||
@@ -214,9 +202,6 @@ func TestHealthCheckAlt(t *testing.T) {
|
||||
wantStates: map[string]schema.MonitoringState{
|
||||
"node004": schema.MonitoringStateFailed,
|
||||
},
|
||||
wantHealthyCounts: map[string]int{"node004": 0},
|
||||
wantDegradedCounts: map[string]int{"node004": 0},
|
||||
wantMissingCounts: map[string]int{"node004": 4},
|
||||
},
|
||||
{
|
||||
name: "multiple nodes mixed states",
|
||||
@@ -229,24 +214,6 @@ func TestHealthCheckAlt(t *testing.T) {
|
||||
"node003": schema.MonitoringStateFull,
|
||||
"node004": schema.MonitoringStateFailed,
|
||||
},
|
||||
wantHealthyCounts: map[string]int{
|
||||
"node001": 2,
|
||||
"node002": 2,
|
||||
"node003": 2,
|
||||
"node004": 0,
|
||||
},
|
||||
wantDegradedCounts: map[string]int{
|
||||
"node001": 0,
|
||||
"node002": 0,
|
||||
"node003": 0,
|
||||
"node004": 0,
|
||||
},
|
||||
wantMissingCounts: map[string]int{
|
||||
"node001": 0,
|
||||
"node002": 0,
|
||||
"node003": 0,
|
||||
"node004": 2,
|
||||
},
|
||||
},
|
||||
}
|
||||
|
||||
@@ -273,32 +240,8 @@ func TestHealthCheckAlt(t *testing.T) {
|
||||
|
||||
// Check status
|
||||
if wantStatus, ok := tt.wantStates[node]; ok {
|
||||
if state.Status != wantStatus {
|
||||
t.Errorf("HealthCheckAlt() node %s status = %v, want %v", node, state.Status, wantStatus)
|
||||
}
|
||||
}
|
||||
|
||||
// Check healthy count
|
||||
if wantCount, ok := tt.wantHealthyCounts[node]; ok {
|
||||
if len(state.HealthyMetrics) != wantCount {
|
||||
t.Errorf("HealthCheckAlt() node %s healthy count = %d, want %d (metrics: %v)",
|
||||
node, len(state.HealthyMetrics), wantCount, state.HealthyMetrics)
|
||||
}
|
||||
}
|
||||
|
||||
// Check degraded count
|
||||
if wantCount, ok := tt.wantDegradedCounts[node]; ok {
|
||||
if len(state.DegradedMetrics) != wantCount {
|
||||
t.Errorf("HealthCheckAlt() node %s degraded count = %d, want %d (metrics: %v)",
|
||||
node, len(state.DegradedMetrics), wantCount, state.DegradedMetrics)
|
||||
}
|
||||
}
|
||||
|
||||
// Check missing count
|
||||
if wantCount, ok := tt.wantMissingCounts[node]; ok {
|
||||
if len(state.MissingMetrics) != wantCount {
|
||||
t.Errorf("HealthCheckAlt() node %s missing count = %d, want %d (metrics: %v)",
|
||||
node, len(state.MissingMetrics), wantCount, state.MissingMetrics)
|
||||
if state != wantStatus {
|
||||
t.Errorf("HealthCheckAlt() node %s status = %v, want %v", node, state, wantStatus)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user