Update HealthCheck again. Still WIP.

This commit is contained in:
2026-02-06 16:35:02 +01:00
parent 5579b6f40c
commit a8d385a1ee
2 changed files with 34 additions and 12 deletions

View File

@@ -82,12 +82,13 @@ func (l *Level) collectMetricStatus(m *MemoryStore, expectedMetrics []string, he
// Returns (degradedList first, then missingList): // Returns (degradedList first, then missingList):
// - missingList: metrics not found in global config or without any buffer // - missingList: metrics not found in global config or without any buffer
// - degradedList: metrics with at least one stale buffer in the subtree // - degradedList: metrics with at least one stale buffer in the subtree
func (l *Level) getHealthyMetrics(m *MemoryStore, expectedMetrics []string) []string { func (l *Level) getHealthyMetrics(m *MemoryStore, expectedMetrics []string) ([]string, []string) {
healthy := make(map[string]bool, len(expectedMetrics)) healthy := make(map[string]bool, len(expectedMetrics))
degraded := make(map[string]bool) degraded := make(map[string]bool)
l.collectMetricStatus(m, expectedMetrics, healthy, degraded) l.collectMetricStatus(m, expectedMetrics, healthy, degraded)
missingList := make([]string, 0)
degradedList := make([]string, 0) degradedList := make([]string, 0)
for _, metricName := range expectedMetrics { for _, metricName := range expectedMetrics {
@@ -95,10 +96,14 @@ func (l *Level) getHealthyMetrics(m *MemoryStore, expectedMetrics []string) []st
continue continue
} }
degradedList = append(degradedList, metricName) if degraded[metricName] {
degradedList = append(degradedList, metricName)
} else {
missingList = append(missingList, metricName)
}
} }
return degradedList return degradedList, missingList
} }
// GetHealthyMetrics returns the degraded and missing metric lists, in that order, for a node. // GetHealthyMetrics returns the degraded and missing metric lists, in that order, for a node.
@@ -110,14 +115,14 @@ func (l *Level) getHealthyMetrics(m *MemoryStore, expectedMetrics []string) []st
// //
// Metrics present in expectedMetrics but absent from both returned lists // Metrics present in expectedMetrics but absent from both returned lists
// are considered fully healthy. // are considered fully healthy.
func (m *MemoryStore) GetHealthyMetrics(selector []string, expectedMetrics []string) ([]string, error) { func (m *MemoryStore) GetHealthyMetrics(selector []string, expectedMetrics []string) ([]string, []string, error) {
lvl := m.root.findLevel(selector) lvl := m.root.findLevel(selector)
if lvl == nil { if lvl == nil {
return nil, fmt.Errorf("[METRICSTORE]> GetHealthyMetrics: host not found: %#v", selector) return nil, nil, fmt.Errorf("[METRICSTORE]> GetHealthyMetrics: host not found: %#v", selector)
} }
degradedList := lvl.getHealthyMetrics(m, expectedMetrics) degradedList, missingList := lvl.getHealthyMetrics(m, expectedMetrics)
return degradedList, nil return degradedList, missingList, nil
} }
// HealthCheck evaluates multiple nodes against a set of expected metrics // HealthCheck evaluates multiple nodes against a set of expected metrics
@@ -135,21 +140,23 @@ func (m *MemoryStore) HealthCheck(cluster string,
for _, hostname := range nodes { for _, hostname := range nodes {
selector := []string{cluster, hostname} selector := []string{cluster, hostname}
degradedList, err := m.GetHealthyMetrics(selector, expectedMetrics) degradedList, missingList, err := m.GetHealthyMetrics(selector, expectedMetrics)
if err != nil { if err != nil {
results[hostname] = schema.MonitoringStateFailed results[hostname] = schema.MonitoringStateFailed
continue continue
} }
degradedCount := len(degradedList) degradedCount := len(degradedList)
healthyCount := len(expectedMetrics) - degradedCount missingCount := len(missingList)
healthyCount := len(expectedMetrics) - degradedCount - missingCount
if degradedCount > 0 { if degradedCount > 0 {
cclog.ComponentDebug("metricstore", "HealthCheck: node", hostname, "degraded metrics:", degradedList) cclog.ComponentDebug("metricstore", "HealthCheck: node", hostname, "degraded metrics:", degradedList)
} }
switch { switch {
case degradedCount == 0: case degradedCount == 0 && missingCount == 0:
results[hostname] = schema.MonitoringStateFull results[hostname] = schema.MonitoringStateFull
case healthyCount == 0: case healthyCount == 0:
results[hostname] = schema.MonitoringStateFailed results[hostname] = schema.MonitoringStateFailed

View File

@@ -304,13 +304,15 @@ func TestGetHealthyMetrics(t *testing.T) {
selector []string selector []string
expectedMetrics []string expectedMetrics []string
wantDegraded []string wantDegraded []string
wantMissing []string
wantErr bool wantErr bool
}{ }{
{ {
name: "mixed health states", name: "mixed health states",
selector: []string{"testcluster", "testnode"}, selector: []string{"testcluster", "testnode"},
expectedMetrics: []string{"load", "mem_used", "cpu_user"}, expectedMetrics: []string{"load", "mem_used", "cpu_user"},
wantDegraded: []string{"mem_used", "cpu_user"}, wantDegraded: []string{"mem_used"},
wantMissing: []string{"cpu_user"},
wantErr: false, wantErr: false,
}, },
{ {
@@ -318,6 +320,7 @@ func TestGetHealthyMetrics(t *testing.T) {
selector: []string{"testcluster", "nonexistent"}, selector: []string{"testcluster", "nonexistent"},
expectedMetrics: []string{"load"}, expectedMetrics: []string{"load"},
wantDegraded: nil, wantDegraded: nil,
wantMissing: nil,
wantErr: true, wantErr: true,
}, },
{ {
@@ -325,13 +328,14 @@ func TestGetHealthyMetrics(t *testing.T) {
selector: []string{"testcluster", "testnode"}, selector: []string{"testcluster", "testnode"},
expectedMetrics: []string{"load"}, expectedMetrics: []string{"load"},
wantDegraded: []string{}, wantDegraded: []string{},
wantMissing: []string{},
wantErr: false, wantErr: false,
}, },
} }
for _, tt := range tests { for _, tt := range tests {
t.Run(tt.name, func(t *testing.T) { t.Run(tt.name, func(t *testing.T) {
degraded, err := ms.GetHealthyMetrics(tt.selector, tt.expectedMetrics) degraded, missing, err := ms.GetHealthyMetrics(tt.selector, tt.expectedMetrics)
if (err != nil) != tt.wantErr { if (err != nil) != tt.wantErr {
t.Errorf("GetHealthyMetrics() error = %v, wantErr %v", err, tt.wantErr) t.Errorf("GetHealthyMetrics() error = %v, wantErr %v", err, tt.wantErr)
@@ -352,6 +356,17 @@ func TestGetHealthyMetrics(t *testing.T) {
} }
} }
} }
// Check missing list
if len(missing) != len(tt.wantMissing) {
t.Errorf("GetHealthyMetrics() missing = %v, want %v", missing, tt.wantMissing)
} else {
for i, m := range tt.wantMissing {
if missing[i] != m {
t.Errorf("GetHealthyMetrics() missing[%d] = %v, want %v", i, missing[i], m)
}
}
}
}) })
} }
} }