mirror of
https://github.com/ClusterCockpit/cc-backend
synced 2026-03-20 23:07:29 +01:00
Exclude down nodes from HealthCheck
Entire-Checkpoint: 0c3347168c79
This commit is contained in:
@@ -402,12 +402,21 @@ func (api *NatsAPI) processNodestateEvent(msg lp.CCMessage) {
|
|||||||
repo := repository.GetNodeRepository()
|
repo := repository.GetNodeRepository()
|
||||||
requestReceived := time.Now().Unix()
|
requestReceived := time.Now().Unix()
|
||||||
|
|
||||||
// Build nodeList per subcluster for health check
|
// Pre-compute node states; only include non-down nodes in health check
|
||||||
|
nodeStates := make(map[string]schema.SchedulerState, len(req.Nodes))
|
||||||
|
for _, node := range req.Nodes {
|
||||||
|
nodeStates[node.Hostname] = determineState(node.States)
|
||||||
|
}
|
||||||
|
|
||||||
|
// Build nodeList per subcluster for health check, skipping down nodes
|
||||||
m := make(map[string][]string)
|
m := make(map[string][]string)
|
||||||
metricNames := make(map[string][]string)
|
metricNames := make(map[string][]string)
|
||||||
healthResults := make(map[string]metricstore.HealthCheckResult)
|
healthResults := make(map[string]metricstore.HealthCheckResult)
|
||||||
|
|
||||||
for _, node := range req.Nodes {
|
for _, node := range req.Nodes {
|
||||||
|
if nodeStates[node.Hostname] == schema.NodeStateDown {
|
||||||
|
continue
|
||||||
|
}
|
||||||
if sc, err := archive.GetSubClusterByNode(req.Cluster, node.Hostname); err == nil {
|
if sc, err := archive.GetSubClusterByNode(req.Cluster, node.Hostname); err == nil {
|
||||||
m[sc] = append(m[sc], node.Hostname)
|
m[sc] = append(m[sc], node.Hostname)
|
||||||
}
|
}
|
||||||
@@ -436,12 +445,17 @@ func (api *NatsAPI) processNodestateEvent(msg lp.CCMessage) {
|
|||||||
|
|
||||||
updates := make([]repository.NodeStateUpdate, 0, len(req.Nodes))
|
updates := make([]repository.NodeStateUpdate, 0, len(req.Nodes))
|
||||||
for _, node := range req.Nodes {
|
for _, node := range req.Nodes {
|
||||||
state := determineState(node.States)
|
state := nodeStates[node.Hostname]
|
||||||
healthState := schema.MonitoringStateFailed
|
var healthState schema.MonitoringState
|
||||||
var healthMetrics string
|
var healthMetrics string
|
||||||
if result, ok := healthResults[node.Hostname]; ok {
|
if state == schema.NodeStateDown {
|
||||||
healthState = result.State
|
healthState = schema.MonitoringStateFull
|
||||||
healthMetrics = result.HealthMetrics
|
} else {
|
||||||
|
healthState = schema.MonitoringStateFailed
|
||||||
|
if result, ok := healthResults[node.Hostname]; ok {
|
||||||
|
healthState = result.State
|
||||||
|
healthMetrics = result.HealthMetrics
|
||||||
|
}
|
||||||
}
|
}
|
||||||
nodeState := schema.NodeStateDB{
|
nodeState := schema.NodeStateDB{
|
||||||
TimeStamp: requestReceived,
|
TimeStamp: requestReceived,
|
||||||
|
|||||||
@@ -36,7 +36,7 @@ func metricListToNames(metricList map[string]*schema.Metric) []string {
|
|||||||
|
|
||||||
// determineState resolves multiple states to a single state using priority order:
|
// determineState resolves multiple states to a single state using priority order:
|
||||||
// allocated > reserved > idle > down > mixed.
|
// allocated > reserved > idle > down > mixed.
|
||||||
// Exception: if both idle and down are present, idle is returned.
|
// Exception: down overrides idle — if both idle and down are present, down is returned.
|
||||||
func determineState(states []string) schema.SchedulerState {
|
func determineState(states []string) schema.SchedulerState {
|
||||||
stateSet := make(map[string]bool, len(states))
|
stateSet := make(map[string]bool, len(states))
|
||||||
for _, s := range states {
|
for _, s := range states {
|
||||||
@@ -48,6 +48,8 @@ func determineState(states []string) schema.SchedulerState {
|
|||||||
return schema.NodeStateAllocated
|
return schema.NodeStateAllocated
|
||||||
case stateSet["reserved"]:
|
case stateSet["reserved"]:
|
||||||
return schema.NodeStateReserved
|
return schema.NodeStateReserved
|
||||||
|
case stateSet["idle"] && stateSet["down"]:
|
||||||
|
return schema.NodeStateDown
|
||||||
case stateSet["idle"]:
|
case stateSet["idle"]:
|
||||||
return schema.NodeStateIdle
|
return schema.NodeStateIdle
|
||||||
case stateSet["down"]:
|
case stateSet["down"]:
|
||||||
@@ -84,14 +86,23 @@ func (api *RestAPI) updateNodeStates(rw http.ResponseWriter, r *http.Request) {
|
|||||||
requestReceived := time.Now().Unix()
|
requestReceived := time.Now().Unix()
|
||||||
repo := repository.GetNodeRepository()
|
repo := repository.GetNodeRepository()
|
||||||
|
|
||||||
|
// Step 1: Pre-compute node states; only include non-down nodes in health check
|
||||||
|
nodeStates := make(map[string]schema.SchedulerState, len(req.Nodes))
|
||||||
|
for _, node := range req.Nodes {
|
||||||
|
nodeStates[node.Hostname] = determineState(node.States)
|
||||||
|
}
|
||||||
|
|
||||||
m := make(map[string][]string)
|
m := make(map[string][]string)
|
||||||
metricNames := make(map[string][]string)
|
metricNames := make(map[string][]string)
|
||||||
healthResults := make(map[string]metricstore.HealthCheckResult)
|
healthResults := make(map[string]metricstore.HealthCheckResult)
|
||||||
|
|
||||||
startMs := time.Now()
|
startMs := time.Now()
|
||||||
|
|
||||||
// Step 1: Build nodeList and metricList per subcluster
|
// Step 2: Build nodeList and metricList per subcluster, skipping down nodes
|
||||||
for _, node := range req.Nodes {
|
for _, node := range req.Nodes {
|
||||||
|
if nodeStates[node.Hostname] == schema.NodeStateDown {
|
||||||
|
continue
|
||||||
|
}
|
||||||
if sc, err := archive.GetSubClusterByNode(req.Cluster, node.Hostname); err == nil {
|
if sc, err := archive.GetSubClusterByNode(req.Cluster, node.Hostname); err == nil {
|
||||||
m[sc] = append(m[sc], node.Hostname)
|
m[sc] = append(m[sc], node.Hostname)
|
||||||
}
|
}
|
||||||
@@ -104,7 +115,7 @@ func (api *RestAPI) updateNodeStates(rw http.ResponseWriter, r *http.Request) {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// Step 2: Determine which metric store to query and perform health check
|
// Step 3: Determine which metric store to query and perform health check
|
||||||
healthRepo, err := metricdispatch.GetHealthCheckRepo(req.Cluster)
|
healthRepo, err := metricdispatch.GetHealthCheckRepo(req.Cluster)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
cclog.Warnf("updateNodeStates: no metric store for cluster %s, skipping health check: %v", req.Cluster, err)
|
cclog.Warnf("updateNodeStates: no metric store for cluster %s, skipping health check: %v", req.Cluster, err)
|
||||||
@@ -123,12 +134,17 @@ func (api *RestAPI) updateNodeStates(rw http.ResponseWriter, r *http.Request) {
|
|||||||
|
|
||||||
updates := make([]repository.NodeStateUpdate, 0, len(req.Nodes))
|
updates := make([]repository.NodeStateUpdate, 0, len(req.Nodes))
|
||||||
for _, node := range req.Nodes {
|
for _, node := range req.Nodes {
|
||||||
state := determineState(node.States)
|
state := nodeStates[node.Hostname]
|
||||||
healthState := schema.MonitoringStateFailed
|
var healthState schema.MonitoringState
|
||||||
var healthMetrics string
|
var healthMetrics string
|
||||||
if result, ok := healthResults[node.Hostname]; ok {
|
if state == schema.NodeStateDown {
|
||||||
healthState = result.State
|
healthState = schema.MonitoringStateFull
|
||||||
healthMetrics = result.HealthMetrics
|
} else {
|
||||||
|
healthState = schema.MonitoringStateFailed
|
||||||
|
if result, ok := healthResults[node.Hostname]; ok {
|
||||||
|
healthState = result.State
|
||||||
|
healthMetrics = result.HealthMetrics
|
||||||
|
}
|
||||||
}
|
}
|
||||||
nodeState := schema.NodeStateDB{
|
nodeState := schema.NodeStateDB{
|
||||||
TimeStamp: requestReceived,
|
TimeStamp: requestReceived,
|
||||||
|
|||||||
Reference in New Issue
Block a user