cc-backend/pkg/metricstore/healthcheck.go

// Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
// All rights reserved. This file is part of cc-backend.
// Use of this source code is governed by a MIT-style
// license that can be found in the LICENSE file.

package metricstore

import (
	"fmt"
	"time"

	"github.com/ClusterCockpit/cc-lib/v2/schema"
)

// HeathCheckResponse is the result returned by MemoryStore.HealthCheck.
type HeathCheckResponse struct {
	// Status is the overall monitoring state derived from the health check.
	Status schema.MonitoringState
	// Error carries a descriptive error when the check could not be performed,
	// e.g. when the selected host is not present in the store.
	Error  error
	// list holds the per-metric findings behind Status. It is unexported and
	// therefore only accessible from within this package.
	list   List
}

// List collects the metrics that a health check flagged for one level and for
// its direct child levels.
type List struct {
	// StaleNodeMetricList names the metrics whose buffer on the checked level
	// exists but was reported as stale by buffer.healthCheck.
	StaleNodeMetricList       []string
	// StaleHardwareMetricList maps the name of a direct child level to the
	// stale metrics found on that child. Children without stale metrics have
	// no entry.
	StaleHardwareMetricList   map[string][]string
	// MissingNodeMetricList names the configured metrics for which the checked
	// level has no buffer at all.
	MissingNodeMetricList     []string
	// MissingHardwareMetricList maps the name of a direct child level to the
	// metrics that have no buffer on that child. Children without missing
	// metrics have no entry.
	MissingHardwareMetricList map[string][]string
}

// MaxMissingDataPoints is the number of most recent data points a metric buffer
// may be missing before the health check flags it. As long as the newest data in
// a buffer is no more than MaxMissingDataPoints measurement intervals old, the
// metric is still treated as up to date; once more than that many points are
// missing, the metric is reported as stale.
const MaxMissingDataPoints int64 = 5

// healthCheck reports whether the buffer must be flagged by the health check.
//
// Note the polarity: it returns true for a problematic buffer, i.e. when the
// buffer holds no data at all, or when the end of the time range it covers lies
// more than MaxMissingDataPoints measurement intervals in the past. It returns
// false when the buffer is recent enough.
func (b *buffer) healthCheck() bool {
	// A buffer without any data is always flagged.
	if b.data == nil {
		return true
	}

	// End of the range covered by the stored data points; compared against
	// time.Now().Unix(), so start/frequency are presumably in seconds.
	coveredUntil := b.start + int64(len(b.data))*b.frequency
	age := time.Now().Unix() - coveredUntil

	return age > b.frequency*MaxMissingDataPoints
}

// healthCheck inspects every metric configured in m on this level and on its
// direct children and returns the findings as a List.
//
// For the level itself, a metric without a buffer is recorded as missing and a
// metric whose buffer reports healthCheck() == true is recorded as stale. Each
// child level is checked recursively; its own stale/missing metrics are stored
// under the child's name in the corresponding hardware map. Findings that a
// child collected about its own children are not carried upwards.
//
// The level's read lock is held for the whole call. This function does not
// produce an error itself; a non-nil error can only originate from a child.
func (l *Level) healthCheck(m *MemoryStore) (List, error) {
	l.lock.RLock()
	defer l.lock.RUnlock()

	result := List{
		StaleNodeMetricList:       []string{},
		StaleHardwareMetricList:   map[string][]string{},
		MissingNodeMetricList:     []string{},
		MissingHardwareMetricList: map[string][]string{},
	}

	// Metrics stored directly on this level.
	for name, cfg := range m.Metrics {
		buf := l.metrics[cfg.offset]
		switch {
		case buf == nil:
			result.MissingNodeMetricList = append(result.MissingNodeMetricList, name)
		case buf.healthCheck():
			result.StaleNodeMetricList = append(result.StaleNodeMetricList, name)
		}
	}

	// Metrics stored on the direct children of this level.
	for childName, child := range l.children {
		childResult, err := child.healthCheck(m)
		if err != nil {
			return List{}, err
		}

		if stale := childResult.StaleNodeMetricList; len(stale) > 0 {
			result.StaleHardwareMetricList[childName] = stale
		}
		if missing := childResult.MissingNodeMetricList; len(missing) > 0 {
			result.MissingHardwareMetricList[childName] = missing
		}
	}

	return result, nil
}

// HealthCheck evaluates the monitoring state of the level addressed by selector.
//
// The returned response starts out as schema.MonitoringStateFull and is
// downgraded as follows:
//   - schema.MonitoringStateFailed, with Error set, if no level matches selector;
//   - schema.MonitoringStatePartial if any metric on the level or on one of its
//     direct children is stale;
//   - schema.MonitoringStateFailed if no metric is stale but at least one metric
//     is missing on the level or on one of its direct children.
//
// An unknown selector is reported through the response (Error field), not
// through the error return value; the error return is only non-nil when the
// underlying level check fails, in which case the response is nil.
//
// subcluster is accepted for API compatibility but is not used by this function.
func (m *MemoryStore) HealthCheck(selector []string, subcluster string) (*HeathCheckResponse, error) {
	response := HeathCheckResponse{
		Status: schema.MonitoringStateFull,
	}

	lvl := m.root.findLevel(selector)
	if lvl == nil {
		response.Status = schema.MonitoringStateFailed
		response.Error = fmt.Errorf("[METRICSTORE]> error while HealthCheck, host not found: %#v", selector)
		return &response, nil
	}

	list, err := lvl.healthCheck(m)
	if err != nil {
		return nil, err
	}
	response.list = list

	// NOTE(review): stale metrics take precedence over missing ones, so a host
	// with both stale and missing metrics is reported as partial rather than
	// failed. This mirrors the existing behavior — confirm it is intended.
	switch {
	case len(list.StaleNodeMetricList) != 0 || len(list.StaleHardwareMetricList) != 0:
		response.Status = schema.MonitoringStatePartial
	case len(list.MissingHardwareMetricList) != 0 || len(list.MissingNodeMetricList) != 0:
		response.Status = schema.MonitoringStateFailed
	}

	return &response, nil
}