Prersist faulty nodestate metric lists to db

2026-02-18 00:41:46 +01:00 · 2026-02-12 08:48:15 +01:00
parent 90c8fbf07c
commit 865cd3db54
6 changed files with 45 additions and 23 deletions
--- a/pkg/metricstore/healthcheck.go
+++ b/pkg/metricstore/healthcheck.go
@@ -6,6 +6,7 @@
 package metricstore

 import (
+	"encoding/json"
 	"fmt"
 	"time"

@@ -19,6 +20,13 @@ type HealthCheckResponse struct {
 	Error  error
 }

+// HealthCheckResult holds the monitoring state and raw JSON health metrics
+// for a single node as determined by HealthCheck.
+type HealthCheckResult struct {
+	State         schema.MonitoringState
+	HealthMetrics string // JSON: {"missing":[...],"degraded":[...]}
+}
+
 // MaxMissingDataPoints is the threshold for stale data detection.
 // A buffer is considered healthy if the gap between its last data point
 // and the current time is within MaxMissingDataPoints * frequency.
@@ -134,15 +142,15 @@ func (m *MemoryStore) GetHealthyMetrics(selector []string, expectedMetrics []str
 //   - MonitoringStateFailed: node not found, or no healthy metrics at all
 func (m *MemoryStore) HealthCheck(cluster string,
 	nodes []string, expectedMetrics []string,
-) (map[string]schema.MonitoringState, error) {
-	results := make(map[string]schema.MonitoringState, len(nodes))
+) (map[string]HealthCheckResult, error) {
+	results := make(map[string]HealthCheckResult, len(nodes))

 	for _, hostname := range nodes {
 		selector := []string{cluster, hostname}

 		degradedList, missingList, err := m.GetHealthyMetrics(selector, expectedMetrics)
 		if err != nil {
-			results[hostname] = schema.MonitoringStateFailed
+			results[hostname] = HealthCheckResult{State: schema.MonitoringStateFailed}
 			continue
 		}

@@ -158,13 +166,24 @@ func (m *MemoryStore) HealthCheck(cluster string,
 			cclog.ComponentInfo("metricstore", "HealthCheck: node ", hostname, "missing metrics:", missingList)
 		}

+		var state schema.MonitoringState
 		switch {
 		case degradedCount == 0 && missingCount == 0:
-			results[hostname] = schema.MonitoringStateFull
+			state = schema.MonitoringStateFull
 		case healthyCount == 0:
-			results[hostname] = schema.MonitoringStateFailed
+			state = schema.MonitoringStateFailed
 		default:
-			results[hostname] = schema.MonitoringStatePartial
+			state = schema.MonitoringStatePartial
+		}
+
+		hm, _ := json.Marshal(map[string][]string{
+			"missing":  missingList,
+			"degraded": degradedList,
+		})
+
+		results[hostname] = HealthCheckResult{
+			State:         state,
+			HealthMetrics: string(hm),
 		}
 	}