cc-backend/pkg/metricstore/healthcheck.go

// Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
// All rights reserved. This file is part of cc-backend.
// Use of this source code is governed by a MIT-style
// license that can be found in the LICENSE file.
package metricstore

import (
	"fmt"
	"time"

	cclog "github.com/ClusterCockpit/cc-lib/v2/ccLogger"
	"github.com/ClusterCockpit/cc-lib/v2/schema"
)

// HealthCheckResponse represents the result of a health check operation.
type HealthCheckResponse struct {
	Status schema.MonitoringState
	Error  error
}

// MaxMissingDataPoints is the threshold for stale data detection.
// A buffer is considered healthy if the gap between its last data point
// and the current time is within MaxMissingDataPoints * frequency.
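//
// For example (illustrative numbers, not taken from any real cluster config):
// with a 60-second sampling frequency and MaxMissingDataPoints = 5, a gap of
// more than 300 seconds marks the buffer as stale.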
const MaxMissingDataPoints int64 = 5

// bufferExists returns true if the buffer is non-nil and contains data.
func (b *buffer) bufferExists() bool {
	return b != nil && len(b.data) > 0
}

// isBufferHealthy returns true if the buffer has recent data within
// MaxMissingDataPoints * frequency of the current time.
func (b *buffer) isBufferHealthy() bool {
	bufferEnd := b.start + b.frequency*int64(len(b.data))
	t := time.Now().Unix()
	return t-bufferEnd <= MaxMissingDataPoints*b.frequency
}

// collectMetricStatus walks the subtree rooted at l and classifies each
// expected metric into the healthy or degraded map.
//
// Classification rules (evaluated per buffer, pessimistic):
// - A single stale buffer marks the metric as degraded permanently.
// - A healthy buffer only counts if no stale buffer has been seen.
// - Metrics absent from the global config or without any buffer remain
// in neither map and are later reported as missing.
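//
// For example (hypothetical metric and scope): if flops_any has fresh buffers
// on most hwthreads of a node but one hwthread's buffer is stale, flops_any
// ends up classified as degraded for the whole node.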
func (l *Level) collectMetricStatus(m *MemoryStore, expectedMetrics []string, healthy, degraded map[string]bool) {
	l.lock.RLock()
	defer l.lock.RUnlock()

	for _, metricName := range expectedMetrics {
		if degraded[metricName] {
			continue // already degraded, cannot improve
		}

		// Metrics unknown to the global config stay out of both maps and are
		// later reported as missing by the caller.
		mc, ok := m.Metrics[metricName]
		if !ok {
			continue
		}

		b := l.metrics[mc.offset]
		if b.bufferExists() {
			if !b.isBufferHealthy() {
				degraded[metricName] = true
				delete(healthy, metricName)
			} else if !degraded[metricName] {
				healthy[metricName] = true
			}
		}
	}

	for _, lvl := range l.children {
		lvl.collectMetricStatus(m, expectedMetrics, healthy, degraded)
	}
}

// getHealthyMetrics walks the complete subtree rooted at l and classifies
// each expected metric by comparing the collected status against the
// expected list.
//
// Returns, in this order:
// - degradedList: metrics with at least one stale buffer in the subtree
// - missingList: metrics not found in the global config or without any buffer
func (l *Level) getHealthyMetrics(m *MemoryStore, expectedMetrics []string) ([]string, []string) {
	healthy := make(map[string]bool, len(expectedMetrics))
	degraded := make(map[string]bool)
	l.collectMetricStatus(m, expectedMetrics, healthy, degraded)

	missingList := make([]string, 0)
	degradedList := make([]string, 0)
	for _, metricName := range expectedMetrics {
		if healthy[metricName] {
			continue
		}
		if degraded[metricName] {
			degradedList = append(degradedList, metricName)
		} else {
			missingList = append(missingList, metricName)
		}
	}

	return degradedList, missingList
}

// GetHealthyMetrics returns the degraded and missing metric lists, in that
// order, for a single node.
//
// It walks the metric tree starting from the node identified by selector
// and classifies each expected metric:
// - Degraded: at least one stale buffer exists in the subtree
// - Missing: no buffer anywhere in the subtree, or metric not in the global config
//
// Metrics present in expectedMetrics but absent from both returned lists
// are considered fully healthy.
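//
// For example (hypothetical values): for expectedMetrics {"cpu_load", "mem_used"},
// a result of degraded = ["mem_used"] and missing = [] means mem_used has at
// least one stale buffer while cpu_load is fully healthy.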
func (m *MemoryStore) GetHealthyMetrics(selector []string, expectedMetrics []string) ([]string, []string, error) {
	lvl := m.root.findLevel(selector)
	if lvl == nil {
		return nil, nil, fmt.Errorf("[METRICSTORE]> GetHealthyMetrics: host not found: %#v", selector)
	}

	degradedList, missingList := lvl.getHealthyMetrics(m, expectedMetrics)
	return degradedList, missingList, nil
}

// HealthCheck evaluates multiple nodes against a set of expected metrics
// and returns a monitoring state per node.
//
// States:
// - MonitoringStateFull: all expected metrics are healthy
// - MonitoringStatePartial: some metrics are missing or degraded
// - MonitoringStateFailed: node not found, or no healthy metrics at all
func (m *MemoryStore) HealthCheck(cluster string,
	nodes []string, expectedMetrics []string,
) (map[string]schema.MonitoringState, error) {
	results := make(map[string]schema.MonitoringState, len(nodes))

	for _, hostname := range nodes {
		selector := []string{cluster, hostname}
		degradedList, missingList, err := m.GetHealthyMetrics(selector, expectedMetrics)
		if err != nil {
			results[hostname] = schema.MonitoringStateFailed
			continue
		}

		degradedCount := len(degradedList)
		missingCount := len(missingList)
		healthyCount := len(expectedMetrics) - degradedCount - missingCount

		if degradedCount > 0 {
			cclog.ComponentInfo("metricstore", "HealthCheck: node", hostname, "degraded metrics:", degradedList)
		}
		if missingCount > 0 {
			cclog.ComponentInfo("metricstore", "HealthCheck: node", hostname, "missing metrics:", missingList)
		}

		switch {
		case degradedCount == 0 && missingCount == 0:
			results[hostname] = schema.MonitoringStateFull
		case healthyCount == 0:
			results[hostname] = schema.MonitoringStateFailed
		default:
			results[hostname] = schema.MonitoringStatePartial
		}
	}

	return results, nil
}
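
// reportNodeHealth is a minimal usage sketch, not part of the original
// healthcheck.go: it assumes a caller that already holds an initialized
// *MemoryStore, and the cluster, node, and metric names below are
// placeholders. It only illustrates how the per-node states returned by
// HealthCheck might be consumed.
func reportNodeHealth(ms *MemoryStore) {
	nodes := []string{"node001", "node002"}
	expected := []string{"cpu_load", "mem_used", "flops_any"}

	states, err := ms.HealthCheck("fritz", nodes, expected)
	if err != nil {
		cclog.ComponentInfo("metricstore", "HealthCheck returned an error:", err)
		return
	}

	for host, state := range states {
		switch state {
		case schema.MonitoringStateFull:
			cclog.ComponentInfo("metricstore", host, "is fully monitored")
		case schema.MonitoringStatePartial:
			cclog.ComponentInfo("metricstore", host, "is partially monitored")
		default: // schema.MonitoringStateFailed
			cclog.ComponentInfo("metricstore", host, "monitoring failed")
		}
	}
}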