mirror of
https://github.com/ClusterCockpit/cc-backend
synced 2026-02-11 21:41:46 +01:00
Update HealthCheck again. Still WIP.
This commit is contained in:
@@ -82,12 +82,13 @@ func (l *Level) collectMetricStatus(m *MemoryStore, expectedMetrics []string, he
|
|||||||
// Returns:
|
// Returns:
|
||||||
// - missingList: metrics not found in global config or without any buffer
|
// - missingList: metrics not found in global config or without any buffer
|
||||||
// - degradedList: metrics with at least one stale buffer in the subtree
|
// - degradedList: metrics with at least one stale buffer in the subtree
|
||||||
func (l *Level) getHealthyMetrics(m *MemoryStore, expectedMetrics []string) []string {
|
func (l *Level) getHealthyMetrics(m *MemoryStore, expectedMetrics []string) ([]string, []string) {
|
||||||
healthy := make(map[string]bool, len(expectedMetrics))
|
healthy := make(map[string]bool, len(expectedMetrics))
|
||||||
degraded := make(map[string]bool)
|
degraded := make(map[string]bool)
|
||||||
|
|
||||||
l.collectMetricStatus(m, expectedMetrics, healthy, degraded)
|
l.collectMetricStatus(m, expectedMetrics, healthy, degraded)
|
||||||
|
|
||||||
|
missingList := make([]string, 0)
|
||||||
degradedList := make([]string, 0)
|
degradedList := make([]string, 0)
|
||||||
|
|
||||||
for _, metricName := range expectedMetrics {
|
for _, metricName := range expectedMetrics {
|
||||||
@@ -95,10 +96,14 @@ func (l *Level) getHealthyMetrics(m *MemoryStore, expectedMetrics []string) []st
|
|||||||
continue
|
continue
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if degraded[metricName] {
|
||||||
degradedList = append(degradedList, metricName)
|
degradedList = append(degradedList, metricName)
|
||||||
|
} else {
|
||||||
|
missingList = append(missingList, metricName)
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
return degradedList
|
return degradedList, missingList
|
||||||
}
|
}
|
||||||
|
|
||||||
// GetHealthyMetrics returns missing and degraded metric lists for a node.
|
// GetHealthyMetrics returns missing and degraded metric lists for a node.
|
||||||
@@ -110,14 +115,14 @@ func (l *Level) getHealthyMetrics(m *MemoryStore, expectedMetrics []string) []st
|
|||||||
//
|
//
|
||||||
// Metrics present in expectedMetrics but absent from both returned lists
|
// Metrics present in expectedMetrics but absent from both returned lists
|
||||||
// are considered fully healthy.
|
// are considered fully healthy.
|
||||||
func (m *MemoryStore) GetHealthyMetrics(selector []string, expectedMetrics []string) ([]string, error) {
|
func (m *MemoryStore) GetHealthyMetrics(selector []string, expectedMetrics []string) ([]string, []string, error) {
|
||||||
lvl := m.root.findLevel(selector)
|
lvl := m.root.findLevel(selector)
|
||||||
if lvl == nil {
|
if lvl == nil {
|
||||||
return nil, fmt.Errorf("[METRICSTORE]> GetHealthyMetrics: host not found: %#v", selector)
|
return nil, nil, fmt.Errorf("[METRICSTORE]> GetHealthyMetrics: host not found: %#v", selector)
|
||||||
}
|
}
|
||||||
|
|
||||||
degradedList := lvl.getHealthyMetrics(m, expectedMetrics)
|
degradedList, missingList := lvl.getHealthyMetrics(m, expectedMetrics)
|
||||||
return degradedList, nil
|
return degradedList, missingList, nil
|
||||||
}
|
}
|
||||||
|
|
||||||
// HealthCheck evaluates multiple nodes against a set of expected metrics
|
// HealthCheck evaluates multiple nodes against a set of expected metrics
|
||||||
@@ -135,21 +140,23 @@ func (m *MemoryStore) HealthCheck(cluster string,
|
|||||||
for _, hostname := range nodes {
|
for _, hostname := range nodes {
|
||||||
selector := []string{cluster, hostname}
|
selector := []string{cluster, hostname}
|
||||||
|
|
||||||
degradedList, err := m.GetHealthyMetrics(selector, expectedMetrics)
|
degradedList, missingList, err := m.GetHealthyMetrics(selector, expectedMetrics)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
results[hostname] = schema.MonitoringStateFailed
|
results[hostname] = schema.MonitoringStateFailed
|
||||||
continue
|
continue
|
||||||
}
|
}
|
||||||
|
|
||||||
degradedCount := len(degradedList)
|
degradedCount := len(degradedList)
|
||||||
healthyCount := len(expectedMetrics) - degradedCount
|
missingCount := len(missingList)
|
||||||
|
|
||||||
|
healthyCount := len(expectedMetrics) - degradedCount - missingCount
|
||||||
|
|
||||||
if degradedCount > 0 {
|
if degradedCount > 0 {
|
||||||
cclog.ComponentDebug("metricstore", "HealthCheck: node", hostname, "degraded metrics:", degradedList)
|
cclog.ComponentDebug("metricstore", "HealthCheck: node", hostname, "degraded metrics:", degradedList)
|
||||||
}
|
}
|
||||||
|
|
||||||
switch {
|
switch {
|
||||||
case degradedCount == 0:
|
case degradedCount == 0 && missingCount == 0:
|
||||||
results[hostname] = schema.MonitoringStateFull
|
results[hostname] = schema.MonitoringStateFull
|
||||||
case healthyCount == 0:
|
case healthyCount == 0:
|
||||||
results[hostname] = schema.MonitoringStateFailed
|
results[hostname] = schema.MonitoringStateFailed
|
||||||
|
|||||||
@@ -304,13 +304,15 @@ func TestGetHealthyMetrics(t *testing.T) {
|
|||||||
selector []string
|
selector []string
|
||||||
expectedMetrics []string
|
expectedMetrics []string
|
||||||
wantDegraded []string
|
wantDegraded []string
|
||||||
|
wantMissing []string
|
||||||
wantErr bool
|
wantErr bool
|
||||||
}{
|
}{
|
||||||
{
|
{
|
||||||
name: "mixed health states",
|
name: "mixed health states",
|
||||||
selector: []string{"testcluster", "testnode"},
|
selector: []string{"testcluster", "testnode"},
|
||||||
expectedMetrics: []string{"load", "mem_used", "cpu_user"},
|
expectedMetrics: []string{"load", "mem_used", "cpu_user"},
|
||||||
wantDegraded: []string{"mem_used", "cpu_user"},
|
wantDegraded: []string{"mem_used"},
|
||||||
|
wantMissing: []string{"cpu_user"},
|
||||||
wantErr: false,
|
wantErr: false,
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
@@ -318,6 +320,7 @@ func TestGetHealthyMetrics(t *testing.T) {
|
|||||||
selector: []string{"testcluster", "nonexistent"},
|
selector: []string{"testcluster", "nonexistent"},
|
||||||
expectedMetrics: []string{"load"},
|
expectedMetrics: []string{"load"},
|
||||||
wantDegraded: nil,
|
wantDegraded: nil,
|
||||||
|
wantMissing: nil,
|
||||||
wantErr: true,
|
wantErr: true,
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
@@ -325,13 +328,14 @@ func TestGetHealthyMetrics(t *testing.T) {
|
|||||||
selector: []string{"testcluster", "testnode"},
|
selector: []string{"testcluster", "testnode"},
|
||||||
expectedMetrics: []string{"load"},
|
expectedMetrics: []string{"load"},
|
||||||
wantDegraded: []string{},
|
wantDegraded: []string{},
|
||||||
|
wantMissing: []string{},
|
||||||
wantErr: false,
|
wantErr: false,
|
||||||
},
|
},
|
||||||
}
|
}
|
||||||
|
|
||||||
for _, tt := range tests {
|
for _, tt := range tests {
|
||||||
t.Run(tt.name, func(t *testing.T) {
|
t.Run(tt.name, func(t *testing.T) {
|
||||||
degraded, err := ms.GetHealthyMetrics(tt.selector, tt.expectedMetrics)
|
degraded, missing, err := ms.GetHealthyMetrics(tt.selector, tt.expectedMetrics)
|
||||||
|
|
||||||
if (err != nil) != tt.wantErr {
|
if (err != nil) != tt.wantErr {
|
||||||
t.Errorf("GetHealthyMetrics() error = %v, wantErr %v", err, tt.wantErr)
|
t.Errorf("GetHealthyMetrics() error = %v, wantErr %v", err, tt.wantErr)
|
||||||
@@ -352,6 +356,17 @@ func TestGetHealthyMetrics(t *testing.T) {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Check missing list
|
||||||
|
if len(missing) != len(tt.wantMissing) {
|
||||||
|
t.Errorf("GetHealthyMetrics() missing = %v, want %v", missing, tt.wantMissing)
|
||||||
|
} else {
|
||||||
|
for i, m := range tt.wantMissing {
|
||||||
|
if missing[i] != m {
|
||||||
|
t.Errorf("GetHealthyMetrics() missing[%d] = %v, want %v", i, missing[i], m)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
})
|
})
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
Reference in New Issue
Block a user