From a8d385a1eecee2cf838ad846cd65780ffd03edca Mon Sep 17 00:00:00 2001 From: Jan Eitzinger Date: Fri, 6 Feb 2026 16:35:02 +0100 Subject: [PATCH] Update HealthCheck again Still WIP --- pkg/metricstore/healthcheck.go | 27 +++++++++++++++++---------- pkg/metricstore/metricstore_test.go | 19 +++++++++++++++++-- 2 files changed, 34 insertions(+), 12 deletions(-) diff --git a/pkg/metricstore/healthcheck.go b/pkg/metricstore/healthcheck.go index b4be57c4..9527d4a2 100644 --- a/pkg/metricstore/healthcheck.go +++ b/pkg/metricstore/healthcheck.go @@ -82,12 +82,13 @@ func (l *Level) collectMetricStatus(m *MemoryStore, expectedMetrics []string, he // Returns: // - missingList: metrics not found in global config or without any buffer // - degradedList: metrics with at least one stale buffer in the subtree -func (l *Level) getHealthyMetrics(m *MemoryStore, expectedMetrics []string) []string { +func (l *Level) getHealthyMetrics(m *MemoryStore, expectedMetrics []string) ([]string, []string) { healthy := make(map[string]bool, len(expectedMetrics)) degraded := make(map[string]bool) l.collectMetricStatus(m, expectedMetrics, healthy, degraded) + missingList := make([]string, 0) degradedList := make([]string, 0) for _, metricName := range expectedMetrics { @@ -95,10 +96,14 @@ func (l *Level) getHealthyMetrics(m *MemoryStore, expectedMetrics []string) []st continue } - degradedList = append(degradedList, metricName) + if degraded[metricName] { + degradedList = append(degradedList, metricName) + } else { + missingList = append(missingList, metricName) + } } - return degradedList + return degradedList, missingList } // GetHealthyMetrics returns missing and degraded metric lists for a node. @@ -110,14 +115,14 @@ func (l *Level) getHealthyMetrics(m *MemoryStore, expectedMetrics []string) []st // // Metrics present in expectedMetrics but absent from both returned lists // are considered fully healthy. -func (m *MemoryStore) GetHealthyMetrics(selector []string, expectedMetrics []string) ([]string, error) { +func (m *MemoryStore) GetHealthyMetrics(selector []string, expectedMetrics []string) ([]string, []string, error) { lvl := m.root.findLevel(selector) if lvl == nil { - return nil, fmt.Errorf("[METRICSTORE]> GetHealthyMetrics: host not found: %#v", selector) + return nil, nil, fmt.Errorf("[METRICSTORE]> GetHealthyMetrics: host not found: %#v", selector) } - degradedList := lvl.getHealthyMetrics(m, expectedMetrics) - return degradedList, nil + degradedList, missingList := lvl.getHealthyMetrics(m, expectedMetrics) + return degradedList, missingList, nil } // HealthCheck evaluates multiple nodes against a set of expected metrics @@ -135,21 +140,23 @@ func (m *MemoryStore) HealthCheck(cluster string, for _, hostname := range nodes { selector := []string{cluster, hostname} - degradedList, err := m.GetHealthyMetrics(selector, expectedMetrics) + degradedList, missingList, err := m.GetHealthyMetrics(selector, expectedMetrics) if err != nil { results[hostname] = schema.MonitoringStateFailed continue } degradedCount := len(degradedList) - healthyCount := len(expectedMetrics) - degradedCount + missingCount := len(missingList) + + healthyCount := len(expectedMetrics) - degradedCount - missingCount if degradedCount > 0 { cclog.ComponentDebug("metricstore", "HealthCheck: node", hostname, "degraded metrics:", degradedList) } switch { - case degradedCount == 0: + case degradedCount == 0 && missingCount == 0: results[hostname] = schema.MonitoringStateFull case healthyCount == 0: results[hostname] = schema.MonitoringStateFailed diff --git a/pkg/metricstore/metricstore_test.go b/pkg/metricstore/metricstore_test.go index ba9d7ecf..4d68d76c 100644 --- a/pkg/metricstore/metricstore_test.go +++ b/pkg/metricstore/metricstore_test.go @@ -304,13 +304,15 @@ func TestGetHealthyMetrics(t *testing.T) { selector []string expectedMetrics []string wantDegraded []string + wantMissing []string wantErr bool }{ { name: "mixed health states", selector: []string{"testcluster", "testnode"}, expectedMetrics: []string{"load", "mem_used", "cpu_user"}, - wantDegraded: []string{"mem_used", "cpu_user"}, + wantDegraded: []string{"mem_used"}, + wantMissing: []string{"cpu_user"}, wantErr: false, }, { @@ -318,6 +320,7 @@ func TestGetHealthyMetrics(t *testing.T) { selector: []string{"testcluster", "nonexistent"}, expectedMetrics: []string{"load"}, wantDegraded: nil, + wantMissing: nil, wantErr: true, }, { @@ -325,13 +328,14 @@ func TestGetHealthyMetrics(t *testing.T) { selector: []string{"testcluster", "testnode"}, expectedMetrics: []string{"load"}, wantDegraded: []string{}, + wantMissing: []string{}, wantErr: false, }, } for _, tt := range tests { t.Run(tt.name, func(t *testing.T) { - degraded, err := ms.GetHealthyMetrics(tt.selector, tt.expectedMetrics) + degraded, missing, err := ms.GetHealthyMetrics(tt.selector, tt.expectedMetrics) if (err != nil) != tt.wantErr { t.Errorf("GetHealthyMetrics() error = %v, wantErr %v", err, tt.wantErr) @@ -352,6 +356,17 @@ func TestGetHealthyMetrics(t *testing.T) { } } } + + // Check missing list + if len(missing) != len(tt.wantMissing) { + t.Errorf("GetHealthyMetrics() missing = %v, want %v", missing, tt.wantMissing) + } else { + for i, m := range tt.wantMissing { + if missing[i] != m { + t.Errorf("GetHealthyMetrics() missing[%d] = %v, want %v", i, missing[i], m) + } + } + } }) } }