mirror of
https://github.com/ClusterCockpit/cc-metric-store.git
synced 2025-07-23 05:11:41 +02:00
feat: HealthCheck endpoint for nodes
This commit is contained in:
@@ -413,3 +413,36 @@ func handleDebug(rw http.ResponseWriter, r *http.Request) {
|
||||
return
|
||||
}
|
||||
}
|
||||
|
||||
// handleHealthCheck godoc
|
||||
// @summary HealthCheck endpoint
|
||||
// @tags healthcheck
|
||||
// @description This endpoint allows the users to check if a node is healthy
|
||||
// @produce json
|
||||
// @param selector query string false "Selector"
|
||||
// @success 200 {string} string "Debug dump"
|
||||
// @failure 400 {object} api.ErrorResponse "Bad Request"
|
||||
// @failure 401 {object} api.ErrorResponse "Unauthorized"
|
||||
// @failure 403 {object} api.ErrorResponse "Forbidden"
|
||||
// @failure 500 {object} api.ErrorResponse "Internal Server Error"
|
||||
// @security ApiKeyAuth
|
||||
// @router /healthcheck/ [get]
|
||||
func handleHealthCheck(rw http.ResponseWriter, r *http.Request) {
|
||||
rawCluster := r.URL.Query().Get("cluster")
|
||||
rawNode := r.URL.Query().Get("node")
|
||||
|
||||
if rawCluster == "" || rawNode == "" {
|
||||
handleError(errors.New("'cluster' and 'node' are required query parameter"), http.StatusBadRequest, rw)
|
||||
return
|
||||
}
|
||||
|
||||
rw.Header().Add("Content-Type", "application/json")
|
||||
|
||||
selector := []string{rawCluster, rawNode}
|
||||
|
||||
ms := memorystore.GetMemoryStore()
|
||||
if err := ms.HealthCheck(bufio.NewWriter(rw), selector); err != nil {
|
||||
handleError(err, http.StatusBadRequest, rw)
|
||||
return
|
||||
}
|
||||
}
|
||||
|
@@ -136,6 +136,63 @@ const docTemplate = `{
|
||||
}
|
||||
}
|
||||
},
|
||||
"/healthcheck/": {
|
||||
"get": {
|
||||
"security": [
|
||||
{
|
||||
"ApiKeyAuth": []
|
||||
}
|
||||
],
|
||||
"description": "This endpoint allows the users to check if a node is healthy",
|
||||
"produces": [
|
||||
"application/json"
|
||||
],
|
||||
"tags": [
|
||||
"healthcheck"
|
||||
],
|
||||
"summary": "HealthCheck endpoint",
|
||||
"parameters": [
|
||||
{
|
||||
"type": "string",
|
||||
"description": "Selector",
|
||||
"name": "selector",
|
||||
"in": "query"
|
||||
}
|
||||
],
|
||||
"responses": {
|
||||
"200": {
|
||||
"description": "Debug dump",
|
||||
"schema": {
|
||||
"type": "string"
|
||||
}
|
||||
},
|
||||
"400": {
|
||||
"description": "Bad Request",
|
||||
"schema": {
|
||||
"$ref": "#/definitions/api.ErrorResponse"
|
||||
}
|
||||
},
|
||||
"401": {
|
||||
"description": "Unauthorized",
|
||||
"schema": {
|
||||
"$ref": "#/definitions/api.ErrorResponse"
|
||||
}
|
||||
},
|
||||
"403": {
|
||||
"description": "Forbidden",
|
||||
"schema": {
|
||||
"$ref": "#/definitions/api.ErrorResponse"
|
||||
}
|
||||
},
|
||||
"500": {
|
||||
"description": "Internal Server Error",
|
||||
"schema": {
|
||||
"$ref": "#/definitions/api.ErrorResponse"
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
},
|
||||
"/query/": {
|
||||
"get": {
|
||||
"security": [
|
||||
|
@@ -25,21 +25,25 @@ func MountRoutes(r *http.ServeMux) {
|
||||
r.Handle("POST /api/write", authHandler(http.HandlerFunc(handleWrite), publicKey))
|
||||
r.Handle("GET /api/query", authHandler(http.HandlerFunc(handleQuery), publicKey))
|
||||
r.Handle("GET /api/debug", authHandler(http.HandlerFunc(handleDebug), publicKey))
|
||||
r.Handle("GET /api/healthcheck", authHandler(http.HandlerFunc(handleHealthCheck), publicKey))
|
||||
// Refactor
|
||||
r.Handle("POST /api/free/", authHandler(http.HandlerFunc(handleFree), publicKey))
|
||||
r.Handle("POST /api/write/", authHandler(http.HandlerFunc(handleWrite), publicKey))
|
||||
r.Handle("GET /api/query/", authHandler(http.HandlerFunc(handleQuery), publicKey))
|
||||
r.Handle("GET /api/debug/", authHandler(http.HandlerFunc(handleDebug), publicKey))
|
||||
r.Handle("GET /api/healthcheck/", authHandler(http.HandlerFunc(handleHealthCheck), publicKey))
|
||||
} else {
|
||||
// Compatibility
|
||||
r.HandleFunc("POST /api/free", handleFree)
|
||||
r.HandleFunc("POST /api/write", handleWrite)
|
||||
r.HandleFunc("GET /api/query", handleQuery)
|
||||
r.HandleFunc("GET /api/debug", handleDebug)
|
||||
r.HandleFunc("GET /api/healthcheck", handleHealthCheck)
|
||||
// Refactor
|
||||
r.HandleFunc("POST /api/free/", handleFree)
|
||||
r.HandleFunc("POST /api/write/", handleWrite)
|
||||
r.HandleFunc("GET /api/query/", handleQuery)
|
||||
r.HandleFunc("GET /api/debug/", handleDebug)
|
||||
r.HandleFunc("GET /api/healthcheck/", handleHealthCheck)
|
||||
}
|
||||
}
|
||||
|
88
internal/memorystore/healthcheck.go
Normal file
88
internal/memorystore/healthcheck.go
Normal file
@@ -0,0 +1,88 @@
|
||||
package memorystore
|
||||
|
||||
import (
|
||||
"bufio"
|
||||
"fmt"
|
||||
"time"
|
||||
)
|
||||
|
||||
const (
	// MaxMissingDataPoints is the number of most recent data points a metric
	// may be missing while still being considered healthy. If a metric has
	// not received its last 5 data points it is still healthy; anything more
	// than 5 missing points marks that metric as unhealthy.
	MaxMissingDataPoints int64 = 5

	// MaxUnhealthyMetrics is the number of unhealthy metrics (including
	// submetrics) a node may have while still being considered healthy. It
	// works together with MaxMissingDataPoints: if up to 5 metrics have not
	// received data for more than MaxMissingDataPoints data points, the node
	// is deemed healthy; any more such metrics deem the node unhealthy.
	MaxUnhealthyMetrics int64 = 5
)
|
||||
|
||||
func (b *buffer) healthCheck() int64 {
|
||||
|
||||
// Check if the buffer is empty
|
||||
if b.data == nil {
|
||||
return 1
|
||||
}
|
||||
|
||||
buffer_end := b.start + b.frequency*int64(len(b.data))
|
||||
t := time.Now().Unix()
|
||||
|
||||
// Check if the buffer is too old
|
||||
if t-buffer_end > MaxMissingDataPoints*b.frequency {
|
||||
return 1
|
||||
}
|
||||
|
||||
return 0
|
||||
}
|
||||
|
||||
func (l *Level) healthCheck(m *MemoryStore, count int64) (int64, error) {
|
||||
l.lock.RLock()
|
||||
defer l.lock.RUnlock()
|
||||
|
||||
for _, mc := range m.Metrics {
|
||||
if b := l.metrics[mc.Offset]; b != nil {
|
||||
count += b.healthCheck()
|
||||
}
|
||||
}
|
||||
|
||||
for _, lvl := range l.children {
|
||||
c, err := lvl.healthCheck(m, 0)
|
||||
if err != nil {
|
||||
return 0, err
|
||||
}
|
||||
count += c
|
||||
}
|
||||
|
||||
return count, nil
|
||||
}
|
||||
|
||||
func (m *MemoryStore) HealthCheck(w *bufio.Writer, selector []string) error {
|
||||
lvl := m.root.findLevel(selector)
|
||||
if lvl == nil {
|
||||
return fmt.Errorf("not found: %#v", selector)
|
||||
}
|
||||
|
||||
buf := make([]byte, 0, 25)
|
||||
// buf = append(buf, "{"...)
|
||||
|
||||
var count int64 = 0
|
||||
|
||||
unhealthyMetricsCount, err := lvl.healthCheck(m, count)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
if unhealthyMetricsCount < MaxUnhealthyMetrics {
|
||||
buf = append(buf, "Healthy"...)
|
||||
} else {
|
||||
buf = append(buf, "Unhealthy"...)
|
||||
}
|
||||
|
||||
// buf = append(buf, "}\n"...)
|
||||
|
||||
if _, err = w.Write(buf); err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
return w.Flush()
|
||||
}
|
Reference in New Issue
Block a user