mirror of
https://github.com/ClusterCockpit/cc-backend
synced 2026-02-11 13:31:45 +01:00
173 lines
5.5 KiB
Go
173 lines
5.5 KiB
Go
// Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
|
|
// All rights reserved. This file is part of cc-backend.
|
|
// Use of this source code is governed by a MIT-style
|
|
// license that can be found in the LICENSE file.
|
|
|
|
package metricstore
|
|
|
|
import (
|
|
"fmt"
|
|
"time"
|
|
|
|
cclog "github.com/ClusterCockpit/cc-lib/v2/ccLogger"
|
|
"github.com/ClusterCockpit/cc-lib/v2/schema"
|
|
)
|
|
|
|
// HealthCheckResponse represents the result of a health check operation.
|
|
type HealthCheckResponse struct {
|
|
Status schema.MonitoringState
|
|
Error error
|
|
}
|
|
|
|
// MaxMissingDataPoints bounds how many sampling intervals may elapse after
// the end of a buffer's data before that buffer counts as stale: a buffer is
// healthy while the time since the end of its data does not exceed
// MaxMissingDataPoints * frequency.
const MaxMissingDataPoints = int64(5)
|
|
|
|
// bufferExists returns true if the buffer is non-nil and contains data.
|
|
func (b *buffer) bufferExists() bool {
|
|
if b == nil || b.data == nil || len(b.data) == 0 {
|
|
return false
|
|
}
|
|
|
|
return true
|
|
}
|
|
|
|
// isBufferHealthy returns true if the buffer has recent data within
|
|
// MaxMissingDataPoints * frequency of the current time.
|
|
func (b *buffer) isBufferHealthy() bool {
|
|
bufferEnd := b.start + b.frequency*int64(len(b.data))
|
|
t := time.Now().Unix()
|
|
|
|
return t-bufferEnd <= MaxMissingDataPoints*b.frequency
|
|
}
|
|
|
|
// collectMetricStatus walks the subtree rooted at l and classifies each
|
|
// expected metric into the healthy or degraded map.
|
|
//
|
|
// Classification rules (evaluated per buffer, pessimistic):
|
|
// - A single stale buffer marks the metric as degraded permanently.
|
|
// - A healthy buffer only counts if no stale buffer has been seen.
|
|
// - Metrics absent from the global config or without any buffer remain
|
|
// in neither map and are later reported as missing.
|
|
func (l *Level) collectMetricStatus(m *MemoryStore, expectedMetrics []string, healthy, degraded map[string]bool) {
|
|
l.lock.RLock()
|
|
defer l.lock.RUnlock()
|
|
|
|
for _, metricName := range expectedMetrics {
|
|
if degraded[metricName] {
|
|
continue // already degraded, cannot improve
|
|
}
|
|
mc := m.Metrics[metricName]
|
|
b := l.metrics[mc.offset]
|
|
if b.bufferExists() {
|
|
if !b.isBufferHealthy() {
|
|
degraded[metricName] = true
|
|
delete(healthy, metricName)
|
|
} else if !degraded[metricName] {
|
|
healthy[metricName] = true
|
|
}
|
|
}
|
|
}
|
|
|
|
for _, lvl := range l.children {
|
|
lvl.collectMetricStatus(m, expectedMetrics, healthy, degraded)
|
|
}
|
|
}
|
|
|
|
// getHealthyMetrics walks the complete subtree rooted at l and classifies
|
|
// each expected metric by comparing the collected status against the
|
|
// expected list.
|
|
//
|
|
// Returns:
|
|
// - missingList: metrics not found in global config or without any buffer
|
|
// - degradedList: metrics with at least one stale buffer in the subtree
|
|
func (l *Level) getHealthyMetrics(m *MemoryStore, expectedMetrics []string) ([]string, []string) {
|
|
healthy := make(map[string]bool, len(expectedMetrics))
|
|
degraded := make(map[string]bool)
|
|
|
|
l.collectMetricStatus(m, expectedMetrics, healthy, degraded)
|
|
|
|
missingList := make([]string, 0)
|
|
degradedList := make([]string, 0)
|
|
|
|
for _, metricName := range expectedMetrics {
|
|
if healthy[metricName] {
|
|
continue
|
|
}
|
|
|
|
if degraded[metricName] {
|
|
degradedList = append(degradedList, metricName)
|
|
} else {
|
|
missingList = append(missingList, metricName)
|
|
}
|
|
}
|
|
|
|
return degradedList, missingList
|
|
}
|
|
|
|
// GetHealthyMetrics returns missing and degraded metric lists for a node.
|
|
//
|
|
// It walks the metric tree starting from the node identified by selector
|
|
// and classifies each expected metric:
|
|
// - Missing: no buffer anywhere in the subtree, or metric not in global config
|
|
// - Degraded: at least one stale buffer exists in the subtree
|
|
//
|
|
// Metrics present in expectedMetrics but absent from both returned lists
|
|
// are considered fully healthy.
|
|
func (m *MemoryStore) GetHealthyMetrics(selector []string, expectedMetrics []string) ([]string, []string, error) {
|
|
lvl := m.root.findLevel(selector)
|
|
if lvl == nil {
|
|
return nil, nil, fmt.Errorf("[METRICSTORE]> GetHealthyMetrics: host not found: %#v", selector)
|
|
}
|
|
|
|
degradedList, missingList := lvl.getHealthyMetrics(m, expectedMetrics)
|
|
return degradedList, missingList, nil
|
|
}
|
|
|
|
// HealthCheck evaluates multiple nodes against a set of expected metrics
|
|
// and returns a monitoring state per node.
|
|
//
|
|
// States:
|
|
// - MonitoringStateFull: all expected metrics are healthy
|
|
// - MonitoringStatePartial: some metrics are missing or degraded
|
|
// - MonitoringStateFailed: node not found, or no healthy metrics at all
|
|
func (m *MemoryStore) HealthCheck(cluster string,
|
|
nodes []string, expectedMetrics []string,
|
|
) (map[string]schema.MonitoringState, error) {
|
|
results := make(map[string]schema.MonitoringState, len(nodes))
|
|
|
|
for _, hostname := range nodes {
|
|
selector := []string{cluster, hostname}
|
|
|
|
degradedList, missingList, err := m.GetHealthyMetrics(selector, expectedMetrics)
|
|
if err != nil {
|
|
results[hostname] = schema.MonitoringStateFailed
|
|
continue
|
|
}
|
|
|
|
degradedCount := len(degradedList)
|
|
missingCount := len(missingList)
|
|
|
|
healthyCount := len(expectedMetrics) - degradedCount - missingCount
|
|
|
|
if degradedCount > 0 {
|
|
cclog.ComponentInfo("metricstore", "HealthCheck: node ", hostname, "degraded metrics:", degradedList)
|
|
}
|
|
if missingCount > 0 {
|
|
cclog.ComponentInfo("metricstore", "HealthCheck: node ", hostname, "missing metrics:", missingList)
|
|
}
|
|
|
|
switch {
|
|
case degradedCount == 0 && missingCount == 0:
|
|
results[hostname] = schema.MonitoringStateFull
|
|
case healthyCount == 0:
|
|
results[hostname] = schema.MonitoringStateFailed
|
|
default:
|
|
results[hostname] = schema.MonitoringStatePartial
|
|
}
|
|
}
|
|
|
|
return results, nil
|
|
}
|