mirror of
https://github.com/ClusterCockpit/cc-backend
synced 2026-02-11 21:41:46 +01:00
Change API of Node HealthState
This commit is contained in:
@@ -79,7 +79,7 @@ func (api *RestAPI) updateNodeStates(rw http.ResponseWriter, r *http.Request) {
|
|||||||
ms := metricstore.GetMemoryStore()
|
ms := metricstore.GetMemoryStore()
|
||||||
|
|
||||||
m := make(map[string][]string)
|
m := make(map[string][]string)
|
||||||
healthStates := make(map[string]metricstore.NodeHealthState)
|
healthStates := make(map[string]schema.MonitoringState)
|
||||||
|
|
||||||
for _, node := range req.Nodes {
|
for _, node := range req.Nodes {
|
||||||
if sc, err := archive.GetSubClusterByNode(req.Cluster, node.Hostname); err == nil {
|
if sc, err := archive.GetSubClusterByNode(req.Cluster, node.Hostname); err == nil {
|
||||||
@@ -101,7 +101,7 @@ func (api *RestAPI) updateNodeStates(rw http.ResponseWriter, r *http.Request) {
|
|||||||
state := determineState(node.States)
|
state := determineState(node.States)
|
||||||
healthState := schema.MonitoringStateFull
|
healthState := schema.MonitoringStateFull
|
||||||
if hs, ok := healthStates[node.Hostname]; ok {
|
if hs, ok := healthStates[node.Hostname]; ok {
|
||||||
healthState = hs.Status
|
healthState = hs
|
||||||
}
|
}
|
||||||
nodeState := schema.NodeStateDB{
|
nodeState := schema.NodeStateDB{
|
||||||
TimeStamp: requestReceived,
|
TimeStamp: requestReceived,
|
||||||
|
|||||||
@@ -360,19 +360,11 @@ func (m *MemoryStore) GetHealthyMetrics(selector []string) ([]string, []string,
|
|||||||
return healthyList, degradedList, nil
|
return healthyList, degradedList, nil
|
||||||
}
|
}
|
||||||
|
|
||||||
// NodeHealthState represents the health status of a single node's metrics.
|
// HealthCheckAlt performs health checks on multiple nodes and returns their monitoring states.
|
||||||
type NodeHealthState struct {
|
|
||||||
Status schema.MonitoringState // Overall health status: Full, Partial, or Failed
|
|
||||||
HealthyMetrics []string // Metrics with recent data and few missing values
|
|
||||||
DegradedMetrics []string // Metrics with recent data but many missing values
|
|
||||||
MissingMetrics []string // Expected metrics that are completely missing or stale
|
|
||||||
}
|
|
||||||
|
|
||||||
// HealthCheckAlt performs health checks on multiple nodes and returns their health states.
|
|
||||||
//
|
//
|
||||||
// This routine provides a batch health check interface that evaluates multiple nodes
|
// This routine provides a batch health check interface that evaluates multiple nodes
|
||||||
// against a specific set of expected metrics. For each node, it determines which metrics
|
// against a specific set of expected metrics. For each node, it determines the overall
|
||||||
// are healthy, degraded, or missing, and assigns an overall health status.
|
// monitoring state based on which metrics are healthy, degraded, or missing.
|
||||||
//
|
//
|
||||||
// Health Status Classification:
|
// Health Status Classification:
|
||||||
// - MonitoringStateFull: All expected metrics are healthy (recent data, few missing values)
|
// - MonitoringStateFull: All expected metrics are healthy (recent data, few missing values)
|
||||||
@@ -385,8 +377,8 @@ type NodeHealthState struct {
|
|||||||
// - expectedMetrics: List of metric names that should be present on each node
|
// - expectedMetrics: List of metric names that should be present on each node
|
||||||
//
|
//
|
||||||
// Returns:
|
// Returns:
|
||||||
// - map[string]NodeHealthState: Map keyed by hostname containing health state for each node
|
// - map[string]schema.MonitoringState: Map keyed by hostname containing monitoring state for each node
|
||||||
// - error: Non-nil only for internal errors (individual node failures are captured in NodeHealthState)
|
// - error: Non-nil only for internal errors (individual node failures are captured as MonitoringStateFailed)
|
||||||
//
|
//
|
||||||
// Example usage:
|
// Example usage:
|
||||||
//
|
//
|
||||||
@@ -398,10 +390,7 @@ type NodeHealthState struct {
|
|||||||
// return err
|
// return err
|
||||||
// }
|
// }
|
||||||
// for hostname, state := range healthStates {
|
// for hostname, state := range healthStates {
|
||||||
// fmt.Printf("Node %s: %s\n", hostname, state.Status)
|
// fmt.Printf("Node %s: %s\n", hostname, state)
|
||||||
// fmt.Printf(" Healthy: %v\n", state.HealthyMetrics)
|
|
||||||
// fmt.Printf(" Degraded: %v\n", state.DegradedMetrics)
|
|
||||||
// fmt.Printf(" Missing: %v\n", state.MissingMetrics)
|
|
||||||
// }
|
// }
|
||||||
//
|
//
|
||||||
// Note: This routine is optimized for batch operations where you need to check
|
// Note: This routine is optimized for batch operations where you need to check
|
||||||
@@ -409,8 +398,8 @@ type NodeHealthState struct {
|
|||||||
// all configured metrics, use HealthCheck() instead.
|
// all configured metrics, use HealthCheck() instead.
|
||||||
func (m *MemoryStore) HealthCheckAlt(cluster string,
|
func (m *MemoryStore) HealthCheckAlt(cluster string,
|
||||||
nodes []string, expectedMetrics []string,
|
nodes []string, expectedMetrics []string,
|
||||||
) (map[string]NodeHealthState, error) {
|
) (map[string]schema.MonitoringState, error) {
|
||||||
results := make(map[string]NodeHealthState, len(nodes))
|
results := make(map[string]schema.MonitoringState, len(nodes))
|
||||||
|
|
||||||
// Create a set of expected metrics for fast lookup
|
// Create a set of expected metrics for fast lookup
|
||||||
expectedSet := make(map[string]bool, len(expectedMetrics))
|
expectedSet := make(map[string]bool, len(expectedMetrics))
|
||||||
@@ -421,20 +410,16 @@ func (m *MemoryStore) HealthCheckAlt(cluster string,
|
|||||||
// Check each node
|
// Check each node
|
||||||
for _, hostname := range nodes {
|
for _, hostname := range nodes {
|
||||||
selector := []string{cluster, hostname}
|
selector := []string{cluster, hostname}
|
||||||
state := NodeHealthState{
|
status := schema.MonitoringStateFull
|
||||||
Status: schema.MonitoringStateFull,
|
healthyCount := 0
|
||||||
HealthyMetrics: make([]string, 0),
|
degradedCount := 0
|
||||||
DegradedMetrics: make([]string, 0),
|
missingCount := 0
|
||||||
MissingMetrics: make([]string, 0),
|
|
||||||
}
|
|
||||||
|
|
||||||
// Get healthy and degraded metrics for this node
|
// Get healthy and degraded metrics for this node
|
||||||
healthyList, degradedList, err := m.GetHealthyMetrics(selector)
|
healthyList, degradedList, err := m.GetHealthyMetrics(selector)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
// Node not found or internal error
|
// Node not found or internal error
|
||||||
state.Status = schema.MonitoringStateFailed
|
results[hostname] = schema.MonitoringStateFailed
|
||||||
state.MissingMetrics = expectedMetrics
|
|
||||||
results[hostname] = state
|
|
||||||
continue
|
continue
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -451,27 +436,27 @@ func (m *MemoryStore) HealthCheckAlt(cluster string,
|
|||||||
// Classify each expected metric
|
// Classify each expected metric
|
||||||
for _, metric := range expectedMetrics {
|
for _, metric := range expectedMetrics {
|
||||||
if healthySet[metric] {
|
if healthySet[metric] {
|
||||||
state.HealthyMetrics = append(state.HealthyMetrics, metric)
|
healthyCount++
|
||||||
} else if degradedSet[metric] {
|
} else if degradedSet[metric] {
|
||||||
state.DegradedMetrics = append(state.DegradedMetrics, metric)
|
degradedCount++
|
||||||
} else {
|
} else {
|
||||||
state.MissingMetrics = append(state.MissingMetrics, metric)
|
missingCount++
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// Determine overall health status
|
// Determine overall health status
|
||||||
if len(state.MissingMetrics) > 0 || len(state.DegradedMetrics) > 0 {
|
if missingCount > 0 || degradedCount > 0 {
|
||||||
if len(state.HealthyMetrics) == 0 {
|
if healthyCount == 0 {
|
||||||
// No healthy metrics at all
|
// No healthy metrics at all
|
||||||
state.Status = schema.MonitoringStateFailed
|
status = schema.MonitoringStateFailed
|
||||||
} else {
|
} else {
|
||||||
// Some healthy, some degraded/missing
|
// Some healthy, some degraded/missing
|
||||||
state.Status = schema.MonitoringStatePartial
|
status = schema.MonitoringStatePartial
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
// else: all metrics healthy, status remains MonitoringStateFull
|
// else: all metrics healthy, status remains MonitoringStateFull
|
||||||
|
|
||||||
results[hostname] = state
|
results[hostname] = status
|
||||||
}
|
}
|
||||||
|
|
||||||
return results, nil
|
return results, nil
|
||||||
|
|||||||
@@ -161,14 +161,11 @@ func TestHealthCheckAlt(t *testing.T) {
|
|||||||
// node004 doesn't exist at all
|
// node004 doesn't exist at all
|
||||||
|
|
||||||
tests := []struct {
|
tests := []struct {
|
||||||
name string
|
name string
|
||||||
cluster string
|
cluster string
|
||||||
nodes []string
|
nodes []string
|
||||||
expectedMetrics []string
|
expectedMetrics []string
|
||||||
wantStates map[string]schema.MonitoringState
|
wantStates map[string]schema.MonitoringState
|
||||||
wantHealthyCounts map[string]int
|
|
||||||
wantDegradedCounts map[string]int
|
|
||||||
wantMissingCounts map[string]int
|
|
||||||
}{
|
}{
|
||||||
{
|
{
|
||||||
name: "all metrics healthy",
|
name: "all metrics healthy",
|
||||||
@@ -178,9 +175,6 @@ func TestHealthCheckAlt(t *testing.T) {
|
|||||||
wantStates: map[string]schema.MonitoringState{
|
wantStates: map[string]schema.MonitoringState{
|
||||||
"node001": schema.MonitoringStateFull,
|
"node001": schema.MonitoringStateFull,
|
||||||
},
|
},
|
||||||
wantHealthyCounts: map[string]int{"node001": 4},
|
|
||||||
wantDegradedCounts: map[string]int{"node001": 0},
|
|
||||||
wantMissingCounts: map[string]int{"node001": 0},
|
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
name: "some metrics degraded",
|
name: "some metrics degraded",
|
||||||
@@ -190,9 +184,6 @@ func TestHealthCheckAlt(t *testing.T) {
|
|||||||
wantStates: map[string]schema.MonitoringState{
|
wantStates: map[string]schema.MonitoringState{
|
||||||
"node002": schema.MonitoringStatePartial,
|
"node002": schema.MonitoringStatePartial,
|
||||||
},
|
},
|
||||||
wantHealthyCounts: map[string]int{"node002": 2},
|
|
||||||
wantDegradedCounts: map[string]int{"node002": 2},
|
|
||||||
wantMissingCounts: map[string]int{"node002": 0},
|
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
name: "some metrics missing",
|
name: "some metrics missing",
|
||||||
@@ -202,9 +193,6 @@ func TestHealthCheckAlt(t *testing.T) {
|
|||||||
wantStates: map[string]schema.MonitoringState{
|
wantStates: map[string]schema.MonitoringState{
|
||||||
"node003": schema.MonitoringStatePartial,
|
"node003": schema.MonitoringStatePartial,
|
||||||
},
|
},
|
||||||
wantHealthyCounts: map[string]int{"node003": 2},
|
|
||||||
wantDegradedCounts: map[string]int{"node003": 0},
|
|
||||||
wantMissingCounts: map[string]int{"node003": 2},
|
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
name: "node not found",
|
name: "node not found",
|
||||||
@@ -214,9 +202,6 @@ func TestHealthCheckAlt(t *testing.T) {
|
|||||||
wantStates: map[string]schema.MonitoringState{
|
wantStates: map[string]schema.MonitoringState{
|
||||||
"node004": schema.MonitoringStateFailed,
|
"node004": schema.MonitoringStateFailed,
|
||||||
},
|
},
|
||||||
wantHealthyCounts: map[string]int{"node004": 0},
|
|
||||||
wantDegradedCounts: map[string]int{"node004": 0},
|
|
||||||
wantMissingCounts: map[string]int{"node004": 4},
|
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
name: "multiple nodes mixed states",
|
name: "multiple nodes mixed states",
|
||||||
@@ -229,24 +214,6 @@ func TestHealthCheckAlt(t *testing.T) {
|
|||||||
"node003": schema.MonitoringStateFull,
|
"node003": schema.MonitoringStateFull,
|
||||||
"node004": schema.MonitoringStateFailed,
|
"node004": schema.MonitoringStateFailed,
|
||||||
},
|
},
|
||||||
wantHealthyCounts: map[string]int{
|
|
||||||
"node001": 2,
|
|
||||||
"node002": 2,
|
|
||||||
"node003": 2,
|
|
||||||
"node004": 0,
|
|
||||||
},
|
|
||||||
wantDegradedCounts: map[string]int{
|
|
||||||
"node001": 0,
|
|
||||||
"node002": 0,
|
|
||||||
"node003": 0,
|
|
||||||
"node004": 0,
|
|
||||||
},
|
|
||||||
wantMissingCounts: map[string]int{
|
|
||||||
"node001": 0,
|
|
||||||
"node002": 0,
|
|
||||||
"node003": 0,
|
|
||||||
"node004": 2,
|
|
||||||
},
|
|
||||||
},
|
},
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -273,32 +240,8 @@ func TestHealthCheckAlt(t *testing.T) {
|
|||||||
|
|
||||||
// Check status
|
// Check status
|
||||||
if wantStatus, ok := tt.wantStates[node]; ok {
|
if wantStatus, ok := tt.wantStates[node]; ok {
|
||||||
if state.Status != wantStatus {
|
if state != wantStatus {
|
||||||
t.Errorf("HealthCheckAlt() node %s status = %v, want %v", node, state.Status, wantStatus)
|
t.Errorf("HealthCheckAlt() node %s status = %v, want %v", node, state, wantStatus)
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
// Check healthy count
|
|
||||||
if wantCount, ok := tt.wantHealthyCounts[node]; ok {
|
|
||||||
if len(state.HealthyMetrics) != wantCount {
|
|
||||||
t.Errorf("HealthCheckAlt() node %s healthy count = %d, want %d (metrics: %v)",
|
|
||||||
node, len(state.HealthyMetrics), wantCount, state.HealthyMetrics)
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
// Check degraded count
|
|
||||||
if wantCount, ok := tt.wantDegradedCounts[node]; ok {
|
|
||||||
if len(state.DegradedMetrics) != wantCount {
|
|
||||||
t.Errorf("HealthCheckAlt() node %s degraded count = %d, want %d (metrics: %v)",
|
|
||||||
node, len(state.DegradedMetrics), wantCount, state.DegradedMetrics)
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
// Check missing count
|
|
||||||
if wantCount, ok := tt.wantMissingCounts[node]; ok {
|
|
||||||
if len(state.MissingMetrics) != wantCount {
|
|
||||||
t.Errorf("HealthCheckAlt() node %s missing count = %d, want %d (metrics: %v)",
|
|
||||||
node, len(state.MissingMetrics), wantCount, state.MissingMetrics)
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
Reference in New Issue
Block a user