Exclude down nodes from HealthCheck

Entire-Checkpoint: 0c3347168c79
This commit is contained in:
2026-03-18 11:20:10 +01:00
parent 33beb3c806
commit 8b0881fb17
2 changed files with 44 additions and 14 deletions

View File

@@ -402,12 +402,21 @@ func (api *NatsAPI) processNodestateEvent(msg lp.CCMessage) {
repo := repository.GetNodeRepository()
requestReceived := time.Now().Unix()
// Build nodeList per subcluster for health check
// Pre-compute node states; only include non-down nodes in health check
nodeStates := make(map[string]schema.SchedulerState, len(req.Nodes))
for _, node := range req.Nodes {
nodeStates[node.Hostname] = determineState(node.States)
}
// Build nodeList per subcluster for health check, skipping down nodes
m := make(map[string][]string)
metricNames := make(map[string][]string)
healthResults := make(map[string]metricstore.HealthCheckResult)
for _, node := range req.Nodes {
if nodeStates[node.Hostname] == schema.NodeStateDown {
continue
}
if sc, err := archive.GetSubClusterByNode(req.Cluster, node.Hostname); err == nil {
m[sc] = append(m[sc], node.Hostname)
}
@@ -436,13 +445,18 @@ func (api *NatsAPI) processNodestateEvent(msg lp.CCMessage) {
updates := make([]repository.NodeStateUpdate, 0, len(req.Nodes))
for _, node := range req.Nodes {
state := determineState(node.States)
healthState := schema.MonitoringStateFailed
state := nodeStates[node.Hostname]
var healthState schema.MonitoringState
var healthMetrics string
if state == schema.NodeStateDown {
healthState = schema.MonitoringStateFull
} else {
healthState = schema.MonitoringStateFailed
if result, ok := healthResults[node.Hostname]; ok {
healthState = result.State
healthMetrics = result.HealthMetrics
}
}
nodeState := schema.NodeStateDB{
TimeStamp: requestReceived,
NodeState: state,

View File

@@ -36,7 +36,7 @@ func metricListToNames(metricList map[string]*schema.Metric) []string {
// determineState resolves multiple states to a single state using priority order:
// allocated > reserved > idle > down > mixed.
// Exception: if both idle and down are present, idle is returned.
// Exception: if both idle and down are present, down is returned.
func determineState(states []string) schema.SchedulerState {
stateSet := make(map[string]bool, len(states))
for _, s := range states {
@@ -48,6 +48,8 @@ func determineState(states []string) schema.SchedulerState {
return schema.NodeStateAllocated
case stateSet["reserved"]:
return schema.NodeStateReserved
case stateSet["idle"] && stateSet["down"]:
return schema.NodeStateDown
case stateSet["idle"]:
return schema.NodeStateIdle
case stateSet["down"]:
@@ -84,14 +86,23 @@ func (api *RestAPI) updateNodeStates(rw http.ResponseWriter, r *http.Request) {
requestReceived := time.Now().Unix()
repo := repository.GetNodeRepository()
// Step 1: Pre-compute node states; only include non-down nodes in health check
nodeStates := make(map[string]schema.SchedulerState, len(req.Nodes))
for _, node := range req.Nodes {
nodeStates[node.Hostname] = determineState(node.States)
}
m := make(map[string][]string)
metricNames := make(map[string][]string)
healthResults := make(map[string]metricstore.HealthCheckResult)
startMs := time.Now()
// Step 1: Build nodeList and metricList per subcluster
// Step 2: Build nodeList and metricList per subcluster, skipping down nodes
for _, node := range req.Nodes {
if nodeStates[node.Hostname] == schema.NodeStateDown {
continue
}
if sc, err := archive.GetSubClusterByNode(req.Cluster, node.Hostname); err == nil {
m[sc] = append(m[sc], node.Hostname)
}
@@ -104,7 +115,7 @@ func (api *RestAPI) updateNodeStates(rw http.ResponseWriter, r *http.Request) {
}
}
// Step 2: Determine which metric store to query and perform health check
// Step 3: Determine which metric store to query and perform health check
healthRepo, err := metricdispatch.GetHealthCheckRepo(req.Cluster)
if err != nil {
cclog.Warnf("updateNodeStates: no metric store for cluster %s, skipping health check: %v", req.Cluster, err)
@@ -123,13 +134,18 @@ func (api *RestAPI) updateNodeStates(rw http.ResponseWriter, r *http.Request) {
updates := make([]repository.NodeStateUpdate, 0, len(req.Nodes))
for _, node := range req.Nodes {
state := determineState(node.States)
healthState := schema.MonitoringStateFailed
state := nodeStates[node.Hostname]
var healthState schema.MonitoringState
var healthMetrics string
if state == schema.NodeStateDown {
healthState = schema.MonitoringStateFull
} else {
healthState = schema.MonitoringStateFailed
if result, ok := healthResults[node.Hostname]; ok {
healthState = result.State
healthMetrics = result.HealthMetrics
}
}
nodeState := schema.NodeStateDB{
TimeStamp: requestReceived,
NodeState: state,