// Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
// All rights reserved. This file is part of cc-backend.
// Use of this source code is governed by a MIT-style
// license that can be found in the LICENSE file.

package metricstore

import (
	"fmt"
	"time"

	"github.com/ClusterCockpit/cc-lib/v2/schema"
)

type HeathCheckResponse struct {
	Status schema.MonitoringState
	Error  error
	list   List
}

type List struct {
	StaleNodeMetricList       []string
	StaleHardwareMetricList   map[string][]string
	MissingNodeMetricList     []string
	MissingHardwareMetricList map[string][]string
}

// MaxMissingDataPoints is the number of consecutive data points a node may miss
// before it is considered unhealthy. If a node has not delivered its last 5 data
// points, the healthCheck endpoint still reports it as healthy; once more than 5
// points are missing for a metric, the node is deemed unhealthy.
const MaxMissingDataPoints int64 = 5

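// Illustrative staleness arithmetic (hypothetical numbers): with a 60 s sample
// frequency, a buffer with start = 1000 and 10 stored samples ends at
// bufferEnd = 1000 + 60*10 = 1600. With MaxMissingDataPoints = 5, the buffer is
// treated as stale once time.Now().Unix() exceeds 1600 + 5*60 = 1900, i.e. after
// roughly five missed samples.
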
// healthCheck reports whether this buffer is stale. Note that true means the
// buffer is UNHEALTHY: it either never received data, or its newest data point
// is older than MaxMissingDataPoints * frequency.
func (b *buffer) healthCheck() bool {
	// Check if the buffer is empty
	if b.data == nil {
		return true
	}

	bufferEnd := b.start + b.frequency*int64(len(b.data))
	t := time.Now().Unix()

	// Check if the buffer is too old
	if t-bufferEnd > MaxMissingDataPoints*b.frequency {
		return true
	}

	return false
}

// healthCheck recursively examines a level and all its children to identify stale or missing metrics.
//
// This routine performs a two-phase check:
//
// Phase 1 - Check metrics at the current level (node-level metrics):
//   - Iterates through all configured metrics in m.Metrics
//   - For each metric, checks if a buffer exists at l.metrics[mc.offset]
//   - If the buffer exists: calls buffer.healthCheck() to verify data freshness
//     - Stale buffer (data older than MaxMissingDataPoints * frequency) → StaleNodeMetricList
//     - Fresh buffer → healthy, no action
//   - If the buffer is nil: the metric was never written → MissingNodeMetricList
//
// Phase 2 - Recursively check child levels (hardware-level metrics):
//   - Iterates through l.children (e.g., "cpu0", "gpu0", "socket0")
//   - Recursively calls healthCheck() on each child level
//   - Aggregates child results into hardware-specific lists:
//     - Child's StaleNodeMetricList → parent's StaleHardwareMetricList[childName]
//     - Child's MissingNodeMetricList → parent's MissingHardwareMetricList[childName]
//
// The recursive nature means:
//   - Calling on a host level checks: host metrics + all CPU/GPU/socket metrics
//   - Calling on a socket level checks: socket metrics + all core metrics
//   - Leaf levels (e.g., individual cores) only check their own metrics
//
// Parameters:
//   - m: MemoryStore containing the global metric configuration (m.Metrics)
//
// Returns:
//   - List: Categorized lists of stale and missing metrics at this level and below
//   - error: Non-nil only for internal errors during recursion
//
// Concurrency:
//   - Acquires a read lock (RLock) to safely access l.metrics and l.children
//   - The lock is held for the entire duration, including recursive calls
//
// Example for a host level with structure host → [cpu0, cpu1]:
//   - Checks host-level metrics (load, memory) → StaleNodeMetricList / MissingNodeMetricList
//   - Recursively checks cpu0 metrics → results in StaleHardwareMetricList["cpu0"]
//   - Recursively checks cpu1 metrics → results in StaleHardwareMetricList["cpu1"]
func (l *Level) healthCheck(m *MemoryStore) (List, error) {
	l.lock.RLock()
	defer l.lock.RUnlock()

	list := List{
		StaleNodeMetricList:       make([]string, 0),
		StaleHardwareMetricList:   make(map[string][]string),
		MissingNodeMetricList:     make([]string, 0),
		MissingHardwareMetricList: make(map[string][]string),
	}

	// Phase 1: Check metrics at this level
	for metricName, mc := range m.Metrics {
		if b := l.metrics[mc.offset]; b != nil {
			if b.healthCheck() {
				list.StaleNodeMetricList = append(list.StaleNodeMetricList, metricName)
			}
		} else {
			list.MissingNodeMetricList = append(list.MissingNodeMetricList, metricName)
		}
	}

	// Phase 2: Recursively check child levels (hardware components)
	for childName, child := range l.children {
		childList, err := child.healthCheck(m)
		if err != nil {
			return List{}, err
		}

		if len(childList.StaleNodeMetricList) != 0 {
			list.StaleHardwareMetricList[childName] = childList.StaleNodeMetricList
		}
		if len(childList.MissingNodeMetricList) != 0 {
			list.MissingHardwareMetricList[childName] = childList.MissingNodeMetricList
		}
	}

	return list, nil
}

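// Illustrative result (hypothetical metric and child names): for a host whose
// node-level "load" buffer is stale and whose child level "cpu0" never received
// "cpu_user", healthCheck returns roughly
//
//	List{
//		StaleNodeMetricList:       []string{"load"},
//		StaleHardwareMetricList:   map[string][]string{},
//		MissingNodeMetricList:     []string{},
//		MissingHardwareMetricList: map[string][]string{"cpu0": {"cpu_user"}},
//	}
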
// HealthCheck performs a health check on a specific node in the metric store.
//
// This routine checks whether metrics for a given node are being received and are up-to-date.
// It examines both node-level metrics (e.g., load, memory) and hardware-level metrics
// (e.g., CPU, GPU, network) to determine the monitoring state.
//
// Parameters:
//   - selector: Hierarchical path to the target node, typically []string{cluster, hostname}.
//     Example: []string{"emmy", "node001"} navigates to the "node001" host in the "emmy" cluster.
//     The selector must match the hierarchy used during metric ingestion (see Level.findLevelOrCreate).
//   - subcluster: Subcluster name (currently unused, reserved for future filtering)
//
// Returns:
//   - *HeathCheckResponse: Health status with detailed lists of stale/missing metrics
//   - error: Non-nil only for internal errors (not for unhealthy nodes)
//
// Health States:
//   - MonitoringStateFull: All expected metrics are present and up-to-date
//   - MonitoringStatePartial: Some metrics are stale (data older than MaxMissingDataPoints * frequency)
//   - MonitoringStateFailed: Host not found, or metrics are completely missing
//
// The response includes detailed lists:
//   - StaleNodeMetricList: Node-level metrics with stale data
//   - StaleHardwareMetricList: Hardware-level metrics with stale data (grouped by component)
//   - MissingNodeMetricList: Expected node-level metrics that have no data
//   - MissingHardwareMetricList: Expected hardware-level metrics that have no data (grouped by component)
//
// Example usage:
//
//	selector := []string{"emmy", "node001"}
//	response, err := ms.HealthCheck(selector, "")
//	if err != nil {
//		// Internal error
//	}
//	switch response.Status {
//	case schema.MonitoringStateFull:
//		// All metrics healthy
//	case schema.MonitoringStatePartial:
//		// Check response.list.StaleNodeMetricList for details
//	case schema.MonitoringStateFailed:
//		// Check response.Error or response.list.MissingNodeMetricList
//	}
func (m *MemoryStore) HealthCheck(selector []string, subcluster string) (*HeathCheckResponse, error) {
	response := HeathCheckResponse{
		Status: schema.MonitoringStateFull,
	}

	lvl := m.root.findLevel(selector)
	if lvl == nil {
		response.Status = schema.MonitoringStateFailed
		response.Error = fmt.Errorf("[METRICSTORE]> error while HealthCheck, host not found: %#v", selector)
		return &response, nil
	}

	var err error

	response.list, err = lvl.healthCheck(m)
	if err != nil {
		return nil, err
	}

	if len(response.list.StaleNodeMetricList) != 0 ||
		len(response.list.StaleHardwareMetricList) != 0 {
		response.Status = schema.MonitoringStatePartial
		return &response, nil
	}

	if len(response.list.MissingHardwareMetricList) != 0 ||
		len(response.list.MissingNodeMetricList) != 0 {
		response.Status = schema.MonitoringStateFailed
		return &response, nil
	}

	return &response, nil
}

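// Note on precedence (as implemented above): stale metrics are evaluated before
// missing ones, so a node that has both stale and missing metrics is reported as
// MonitoringStatePartial, not MonitoringStateFailed. For example (hypothetical
// metric names), a host with a stale "load" buffer and a completely missing
// "cpu_user" buffer ends up Partial.
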
// isBufferHealthy checks whether a buffer has received data within the last
// MaxMissingDataPoints sample intervals.
//
// Returns true if the buffer is healthy (recent data within the threshold), false otherwise.
// A nil buffer or an empty buffer is considered unhealthy.
func (b *buffer) isBufferHealthy() bool {
	// Check if the buffer is empty
	if b == nil || b.data == nil {
		return false
	}

	bufferEnd := b.start + b.frequency*int64(len(b.data))
	t := time.Now().Unix()

	// Check if the buffer has recent data (within the MaxMissingDataPoints threshold)
	if t-bufferEnd > MaxMissingDataPoints*b.frequency {
		return false
	}

	return true
}

// countMissingValues counts the number of NaN (missing) values in the most recent data points.
//
// It examines the last MaxMissingDataPoints*2 values in the buffer and counts how many are NaN.
// Twice the threshold is checked so that more than MaxMissingDataPoints missing values can be detected.
// If the buffer holds fewer values, all available values are examined.
//
// Returns:
//   - int: Number of NaN values found in the examined range
func (b *buffer) countMissingValues() int {
	if b == nil || len(b.data) == 0 {
		return 0
	}

	// Check twice the threshold to detect degraded metrics
	checkCount := min(int(MaxMissingDataPoints)*2, len(b.data))

	// Count NaN values in the most recent data points
	missingCount := 0
	startIdx := len(b.data) - checkCount
	for i := startIdx; i < len(b.data); i++ {
		if b.data[i].IsNaN() {
			missingCount++
		}
	}

	return missingCount
}

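// Illustrative example (hypothetical values): with MaxMissingDataPoints = 5 the
// last 10 samples are scanned. If 7 of those samples are NaN, countMissingValues
// returns 7, which exceeds the threshold of 5 and, per the classification
// described for getHealthyMetrics below, would mark the metric as degraded
// rather than healthy.
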
// getHealthyMetrics recursively collects healthy and degraded metrics at this level and below.
//
// A metric is considered:
//   - Healthy: the buffer has recent data within the MaxMissingDataPoints threshold AND has few/no NaN values
//   - Degraded: the buffer is stale or missing, or it has recent data but contains
//     more than MaxMissingDataPoints NaN values
//
// This routine walks the entire subtree starting from the current level.
//
// Parameters:
//   - m: MemoryStore containing the global metric configuration
//
// Returns:
//   - []string: Flat list of healthy metric names from this level and all children
//   - []string: Flat list of degraded metric names (stale, missing, or with too many missing values)
//   - error: Non-nil only for internal errors during recursion
//
// The routine mirrors healthCheck() but provides more granular classification:
//   - healthCheck() finds problems (stale/missing)
//   - getHealthyMetrics() separates healthy from degraded metrics
func (l *Level) getHealthyMetrics(m *MemoryStore) ([]string, []string, error) {
	l.lock.RLock()
	defer l.lock.RUnlock()

	healthyList := make([]string, 0)
	degradedList := make([]string, 0)

	// Phase 1: Check metrics at this level.
	// A metric is healthy only if its buffer has recent data and at most
	// MaxMissingDataPoints of its recent values are NaN; everything else is degraded.
	for metricName, mc := range m.Metrics {
		b := l.metrics[mc.offset]
		if b.isBufferHealthy() && b.countMissingValues() <= int(MaxMissingDataPoints) {
			healthyList = append(healthyList, metricName)
		} else {
			degradedList = append(degradedList, metricName)
		}
	}

	// Phase 2: Recursively check child levels
	for _, lvl := range l.children {
		childHealthy, childDegraded, err := lvl.getHealthyMetrics(m)
		if err != nil {
			return nil, nil, err
		}

		// FIXME: Use a map to collect core level metrics
		// Merge child metrics into flat lists
		healthyList = append(healthyList, childHealthy...)
		degradedList = append(degradedList, childDegraded...)
	}

	return healthyList, degradedList, nil
}

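// Illustrative classification (hypothetical metric names): for a host where "load"
// is fresh with no NaN values, "mem_used" is fresh but has 8 NaN values among its
// last 10 samples, and "gpu_temp" has no buffer at all, getHealthyMetrics would
// return healthy = ["load"] and degraded = ["mem_used", "gpu_temp"].
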
// GetHealthyMetrics returns healthy and degraded metrics for a specific node as flat lists.
//
// This routine walks the metric tree starting from the specified node selector
// and collects all metrics that have received data within the last MaxMissingDataPoints
// (default: 5) data points. Metrics are classified into two categories:
//
//   - Healthy: Buffer has recent data AND contains few/no NaN (missing) values
//   - Degraded: Buffer is stale or missing, or has recent data but contains more
//     than MaxMissingDataPoints NaN values
//
// The returned lists include both node-level metrics (e.g., "load", "mem_used") and
// hardware-level metrics (e.g., "cpu_user", "gpu_temp") in flat slices.
//
// Parameters:
//   - selector: Hierarchical path to the target node, typically []string{cluster, hostname}.
//     Example: []string{"emmy", "node001"} navigates to the "node001" host in the "emmy" cluster.
//     The selector must match the hierarchy used during metric ingestion.
//
// Returns:
//   - []string: Flat list of healthy metric names (recent data, few missing values)
//   - []string: Flat list of degraded metric names
//   - error: Non-nil if the node is not found or an internal error occurs
//
// Example usage:
//
//	selector := []string{"emmy", "node001"}
//	healthyMetrics, degradedMetrics, err := ms.GetHealthyMetrics(selector)
//	if err != nil {
//		// Node not found or internal error
//		return err
//	}
//	fmt.Printf("Healthy metrics: %v\n", healthyMetrics)
//	// Output: ["load", "mem_used", "cpu_user", ...]
//	fmt.Printf("Degraded metrics: %v\n", degradedMetrics)
//	// Output: ["gpu_temp", "network_rx", ...] (metrics with many NaN values)
//
// Note: This routine provides more granular classification than HealthCheck:
//   - HealthCheck reports stale/missing metrics (problems)
//   - GetHealthyMetrics separates fully healthy from degraded metrics (quality levels)
func (m *MemoryStore) GetHealthyMetrics(selector []string) ([]string, []string, error) {
	lvl := m.root.findLevel(selector)
	if lvl == nil {
		return nil, nil, fmt.Errorf("[METRICSTORE]> error while GetHealthyMetrics, host not found: %#v", selector)
	}

	healthyList, degradedList, err := lvl.getHealthyMetrics(m)
	if err != nil {
		return nil, nil, err
	}

	return healthyList, degradedList, nil
}

// HealthCheckAlt performs health checks on multiple nodes and returns their monitoring states.
//
// This routine provides a batch health check interface that evaluates multiple nodes
// against a specific set of expected metrics. For each node, it determines the overall
// monitoring state based on which metrics are healthy, degraded, or missing.
//
// Health Status Classification:
//   - MonitoringStateFull: All expected metrics are healthy (recent data, few missing values)
//   - MonitoringStatePartial: Some metrics are degraded (many missing values) or missing
//   - MonitoringStateFailed: Node not found, or no expected metric is healthy
//
// Parameters:
//   - cluster: Cluster name (first element of the selector path)
//   - nodes: List of node hostnames to check
//   - expectedMetrics: List of metric names that should be present on each node
//
// Returns:
//   - map[string]schema.MonitoringState: Map keyed by hostname containing the monitoring state of each node
//   - error: Non-nil only for internal errors (individual node failures are captured as MonitoringStateFailed)
//
// Example usage:
//
//	cluster := "emmy"
//	nodes := []string{"node001", "node002", "node003"}
//	expectedMetrics := []string{"load", "mem_used", "cpu_user", "cpu_system"}
//	healthStates, err := ms.HealthCheckAlt(cluster, nodes, expectedMetrics)
//	if err != nil {
//		return err
//	}
//	for hostname, state := range healthStates {
//		fmt.Printf("Node %s: %s\n", hostname, state)
//	}
//
// Note: This routine is optimized for batch operations where the same set of metrics
// must be checked across multiple nodes. For single-node checks with all configured
// metrics, use HealthCheck() instead.
func (m *MemoryStore) HealthCheckAlt(cluster string,
	nodes []string, expectedMetrics []string,
) (map[string]schema.MonitoringState, error) {
	results := make(map[string]schema.MonitoringState, len(nodes))

	// Check each node
	for _, hostname := range nodes {
		selector := []string{cluster, hostname}
		status := schema.MonitoringStateFull
		healthyCount := 0
		degradedCount := 0
		missingCount := 0

		// Get healthy and degraded metrics for this node
		healthyList, degradedList, err := m.GetHealthyMetrics(selector)
		if err != nil {
			// Node not found or internal error
			results[hostname] = schema.MonitoringStateFailed
			continue
		}

		// Create sets for fast lookup
		healthySet := make(map[string]bool, len(healthyList))
		for _, metric := range healthyList {
			healthySet[metric] = true
		}
		degradedSet := make(map[string]bool, len(degradedList))
		for _, metric := range degradedList {
			degradedSet[metric] = true
		}

		// Classify each expected metric
		for _, metric := range expectedMetrics {
			if healthySet[metric] {
				healthyCount++
			} else if degradedSet[metric] {
				degradedCount++
			} else {
				missingCount++
			}
		}

		// Determine overall health status
		if missingCount > 0 || degradedCount > 0 {
			if healthyCount == 0 {
				// No healthy metrics at all
				status = schema.MonitoringStateFailed
			} else {
				// Some healthy, some degraded/missing
				status = schema.MonitoringStatePartial
			}
		}
		// else: all metrics healthy, status remains MonitoringStateFull

		results[hostname] = status
	}

	return results, nil
}
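
// Illustrative outcomes (hypothetical metrics): with expectedMetrics = ["load",
// "mem_used"], a node where both metrics are healthy is reported as
// MonitoringStateFull; a node where "load" is healthy but "mem_used" is degraded
// or absent is MonitoringStatePartial; a node where neither metric is healthy is
// MonitoringStateFailed.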