cc-backend/pkg/metricstore/healthcheck.go

// Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
// All rights reserved. This file is part of cc-backend.
// Use of this source code is governed by a MIT-style
// license that can be found in the LICENSE file.
package metricstore

import (
	"fmt"
	"time"

	cclog "github.com/ClusterCockpit/cc-lib/v2/ccLogger"
	"github.com/ClusterCockpit/cc-lib/v2/schema"
)

// HealthCheckResponse represents the result of a health check operation.
type HealthCheckResponse struct {
	Status schema.MonitoringState
	Error  error
}

// MaxMissingDataPoints is the threshold for stale data detection.
// A buffer is considered healthy if the gap between its last data point
// and the current time is within MaxMissingDataPoints * frequency.
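//
// For example (illustrative numbers, not taken from any real cluster config):
// with a 60-second sampling frequency and MaxMissingDataPoints = 5, a gap of
// more than 300 seconds marks the buffer as stale.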
const MaxMissingDataPoints int64 = 5

// bufferExists returns true if the buffer is non-nil and contains data.
func (b *buffer) bufferExists() bool {
	return b != nil && len(b.data) > 0
}

// isBufferHealthy returns true if the buffer has recent data within
// MaxMissingDataPoints * frequency of the current time.
func (b *buffer) isBufferHealthy() bool {
	bufferEnd := b.start + b.frequency*int64(len(b.data))
	t := time.Now().Unix()
	return t-bufferEnd <= MaxMissingDataPoints*b.frequency
}

// collectMetricStatus walks the subtree rooted at l and classifies each
// expected metric into the healthy or degraded map.
//
// Classification rules (evaluated per buffer, pessimistic):
// - A single stale buffer marks the metric as degraded permanently.
// - A healthy buffer only counts if no stale buffer has been seen.
// - Metrics absent from the global config or without any buffer remain
// in neither map and are later reported as missing.
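//
// For example (hypothetical metric and scope): if flops_any has fresh buffers
// on most hwthreads of a node but one hwthread's buffer is stale, flops_any
// ends up classified as degraded for the whole node.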
func (l *Level) collectMetricStatus(m *MemoryStore, expectedMetrics []string, healthy, degraded map[string]bool) {
	l.lock.RLock()
	defer l.lock.RUnlock()

	for _, metricName := range expectedMetrics {
		if degraded[metricName] {
			continue // already degraded, cannot improve
		}

		// Metrics unknown to the global config stay out of both maps and are
		// later reported as missing by the caller.
		mc, ok := m.Metrics[metricName]
		if !ok {
			continue
		}

		b := l.metrics[mc.offset]
		if b.bufferExists() {
			if !b.isBufferHealthy() {
				degraded[metricName] = true
				delete(healthy, metricName)
			} else if !degraded[metricName] {
				healthy[metricName] = true
			}
		}
	}

	for _, lvl := range l.children {
		lvl.collectMetricStatus(m, expectedMetrics, healthy, degraded)
	}
}

// getHealthyMetrics walks the complete subtree rooted at l and classifies
// each expected metric by comparing the collected status against the
// expected list.
//
// Returns, in this order:
// - degradedList: metrics with at least one stale buffer in the subtree
// - missingList: metrics not found in the global config or without any buffer
func (l *Level) getHealthyMetrics(m *MemoryStore, expectedMetrics []string) ([]string, []string) {
	healthy := make(map[string]bool, len(expectedMetrics))
	degraded := make(map[string]bool)
	l.collectMetricStatus(m, expectedMetrics, healthy, degraded)

	missingList := make([]string, 0)
	degradedList := make([]string, 0)
	for _, metricName := range expectedMetrics {
		if healthy[metricName] {
			continue
		}
		if degraded[metricName] {
			degradedList = append(degradedList, metricName)
		} else {
			missingList = append(missingList, metricName)
		}
	}

	return degradedList, missingList
}

// GetHealthyMetrics returns the degraded and missing metric lists, in that
// order, for a single node.
//
// It walks the metric tree starting from the node identified by selector
// and classifies each expected metric:
// - Degraded: at least one stale buffer exists in the subtree
// - Missing: no buffer anywhere in the subtree, or metric not in the global config
//
// Metrics present in expectedMetrics but absent from both returned lists
// are considered fully healthy.
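//
// For example (hypothetical values): for expectedMetrics {"cpu_load", "mem_used"},
// a result of degraded = ["mem_used"] and missing = [] means mem_used has at
// least one stale buffer while cpu_load is fully healthy.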
func (m *MemoryStore) GetHealthyMetrics(selector []string, expectedMetrics []string) ([]string, []string, error) {
	lvl := m.root.findLevel(selector)
	if lvl == nil {
		return nil, nil, fmt.Errorf("[METRICSTORE]> GetHealthyMetrics: host not found: %#v", selector)
	}

	degradedList, missingList := lvl.getHealthyMetrics(m, expectedMetrics)
	return degradedList, missingList, nil
}

// HealthCheck evaluates multiple nodes against a set of expected metrics
// and returns a monitoring state per node.
//
// States:
// - MonitoringStateFull: all expected metrics are healthy
// - MonitoringStatePartial: some metrics are missing or degraded
// - MonitoringStateFailed: node not found, or no healthy metrics at all
func (m *MemoryStore) HealthCheck(cluster string,
	nodes []string, expectedMetrics []string,
) (map[string]schema.MonitoringState, error) {
	results := make(map[string]schema.MonitoringState, len(nodes))

	for _, hostname := range nodes {
		selector := []string{cluster, hostname}
		degradedList, missingList, err := m.GetHealthyMetrics(selector, expectedMetrics)
		if err != nil {
			results[hostname] = schema.MonitoringStateFailed
			continue
		}

		degradedCount := len(degradedList)
		missingCount := len(missingList)
		healthyCount := len(expectedMetrics) - degradedCount - missingCount

		if degradedCount > 0 {
			cclog.ComponentInfo("metricstore", "HealthCheck: node", hostname, "degraded metrics:", degradedList)
		}
		if missingCount > 0 {
			cclog.ComponentInfo("metricstore", "HealthCheck: node", hostname, "missing metrics:", missingList)
		}

		switch {
		case degradedCount == 0 && missingCount == 0:
			results[hostname] = schema.MonitoringStateFull
		case healthyCount == 0:
			results[hostname] = schema.MonitoringStateFailed
		default:
			results[hostname] = schema.MonitoringStatePartial
		}
	}

	return results, nil
}
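
// reportNodeHealth is a minimal usage sketch, not part of the original
// healthcheck.go: it assumes a caller that already holds an initialized
// *MemoryStore, and the cluster, node, and metric names below are
// placeholders. It only illustrates how the per-node states returned by
// HealthCheck might be consumed.
func reportNodeHealth(ms *MemoryStore) {
	nodes := []string{"node001", "node002"}
	expected := []string{"cpu_load", "mem_used", "flops_any"}

	states, err := ms.HealthCheck("fritz", nodes, expected)
	if err != nil {
		cclog.ComponentInfo("metricstore", "HealthCheck returned an error:", err)
		return
	}

	for host, state := range states {
		switch state {
		case schema.MonitoringStateFull:
			cclog.ComponentInfo("metricstore", host, "is fully monitored")
		case schema.MonitoringStatePartial:
			cclog.ComponentInfo("metricstore", host, "is partially monitored")
		default: // schema.MonitoringStateFailed
			cclog.ComponentInfo("metricstore", host, "monitoring failed")
		}
	}
}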