mirror of
https://github.com/ClusterCockpit/cc-backend
synced 2026-02-17 16:31:45 +01:00
Persist faulty nodestate metric lists to db
This commit is contained in:
2
go.mod
2
go.mod
@@ -124,3 +124,5 @@ require (
|
|||||||
gopkg.in/yaml.v3 v3.0.1 // indirect
|
gopkg.in/yaml.v3 v3.0.1 // indirect
|
||||||
sigs.k8s.io/yaml v1.6.0 // indirect
|
sigs.k8s.io/yaml v1.6.0 // indirect
|
||||||
)
|
)
|
||||||
|
|
||||||
|
replace github.com/ClusterCockpit/cc-lib/v2 => ../cc-lib
|
||||||
|
|||||||
6
go.sum
6
go.sum
@@ -4,12 +4,6 @@ github.com/99designs/gqlgen v0.17.85 h1:EkGx3U2FDcxQm8YDLQSpXIAVmpDyZ3IcBMOJi2nH
|
|||||||
github.com/99designs/gqlgen v0.17.85/go.mod h1:yvs8s0bkQlRfqg03YXr3eR4OQUowVhODT/tHzCXnbOU=
|
github.com/99designs/gqlgen v0.17.85/go.mod h1:yvs8s0bkQlRfqg03YXr3eR4OQUowVhODT/tHzCXnbOU=
|
||||||
github.com/Azure/go-ntlmssp v0.0.0-20221128193559-754e69321358 h1:mFRzDkZVAjdal+s7s0MwaRv9igoPqLRdzOLzw/8Xvq8=
|
github.com/Azure/go-ntlmssp v0.0.0-20221128193559-754e69321358 h1:mFRzDkZVAjdal+s7s0MwaRv9igoPqLRdzOLzw/8Xvq8=
|
||||||
github.com/Azure/go-ntlmssp v0.0.0-20221128193559-754e69321358/go.mod h1:chxPXzSsl7ZWRAuOIE23GDNzjWuZquvFlgA8xmpunjU=
|
github.com/Azure/go-ntlmssp v0.0.0-20221128193559-754e69321358/go.mod h1:chxPXzSsl7ZWRAuOIE23GDNzjWuZquvFlgA8xmpunjU=
|
||||||
github.com/ClusterCockpit/cc-lib/v2 v2.2.2 h1:ye4RY57I19c2cXr3XWZBS/QYYgQVeGFvsiu5HkyKq9E=
|
|
||||||
github.com/ClusterCockpit/cc-lib/v2 v2.2.2/go.mod h1:JuxMAuEOaLLNEnnL9U3ejha8kMvsSatLdKPZEgJw6iw=
|
|
||||||
github.com/ClusterCockpit/cc-lib/v2 v2.3.0 h1:69NqCAYCU1r2w6J5Yuxoe8jfR68VLqtWwsWXZ6KTOo4=
|
|
||||||
github.com/ClusterCockpit/cc-lib/v2 v2.3.0/go.mod h1:JuxMAuEOaLLNEnnL9U3ejha8kMvsSatLdKPZEgJw6iw=
|
|
||||||
github.com/ClusterCockpit/cc-lib/v2 v2.4.0 h1:OnZlvqSatg7yCQ2NtSR7AddpUVSiuSMZ8scF1a7nfOk=
|
|
||||||
github.com/ClusterCockpit/cc-lib/v2 v2.4.0/go.mod h1:JuxMAuEOaLLNEnnL9U3ejha8kMvsSatLdKPZEgJw6iw=
|
|
||||||
github.com/DATA-DOG/go-sqlmock v1.5.2 h1:OcvFkGmslmlZibjAjaHm3L//6LiuBgolP7OputlJIzU=
|
github.com/DATA-DOG/go-sqlmock v1.5.2 h1:OcvFkGmslmlZibjAjaHm3L//6LiuBgolP7OputlJIzU=
|
||||||
github.com/DATA-DOG/go-sqlmock v1.5.2/go.mod h1:88MAG/4G7SMwSE3CeA0ZKzrT5CiOU3OJ+JlNzwDqpNU=
|
github.com/DATA-DOG/go-sqlmock v1.5.2/go.mod h1:88MAG/4G7SMwSE3CeA0ZKzrT5CiOU3OJ+JlNzwDqpNU=
|
||||||
github.com/KyleBanks/depth v1.2.1 h1:5h8fQADFrWtarTdtDudMmGsC7GPbOAu6RVB3ffsVFHc=
|
github.com/KyleBanks/depth v1.2.1 h1:5h8fQADFrWtarTdtDudMmGsC7GPbOAu6RVB3ffsVFHc=
|
||||||
|
|||||||
@@ -80,7 +80,7 @@ func (api *RestAPI) updateNodeStates(rw http.ResponseWriter, r *http.Request) {
|
|||||||
ms := metricstore.GetMemoryStore()
|
ms := metricstore.GetMemoryStore()
|
||||||
|
|
||||||
m := make(map[string][]string)
|
m := make(map[string][]string)
|
||||||
healthStates := make(map[string]schema.MonitoringState)
|
healthResults := make(map[string]metricstore.HealthCheckResult)
|
||||||
|
|
||||||
startMs := time.Now()
|
startMs := time.Now()
|
||||||
|
|
||||||
@@ -94,8 +94,8 @@ func (api *RestAPI) updateNodeStates(rw http.ResponseWriter, r *http.Request) {
|
|||||||
if sc != "" {
|
if sc != "" {
|
||||||
metricList := archive.GetMetricConfigSubCluster(req.Cluster, sc)
|
metricList := archive.GetMetricConfigSubCluster(req.Cluster, sc)
|
||||||
metricNames := metricListToNames(metricList)
|
metricNames := metricListToNames(metricList)
|
||||||
if states, err := ms.HealthCheck(req.Cluster, nl, metricNames); err == nil {
|
if results, err := ms.HealthCheck(req.Cluster, nl, metricNames); err == nil {
|
||||||
maps.Copy(healthStates, states)
|
maps.Copy(healthResults, results)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -106,8 +106,10 @@ func (api *RestAPI) updateNodeStates(rw http.ResponseWriter, r *http.Request) {
|
|||||||
for _, node := range req.Nodes {
|
for _, node := range req.Nodes {
|
||||||
state := determineState(node.States)
|
state := determineState(node.States)
|
||||||
healthState := schema.MonitoringStateFailed
|
healthState := schema.MonitoringStateFailed
|
||||||
if hs, ok := healthStates[node.Hostname]; ok {
|
var healthMetrics string
|
||||||
healthState = hs
|
if result, ok := healthResults[node.Hostname]; ok {
|
||||||
|
healthState = result.State
|
||||||
|
healthMetrics = result.HealthMetrics
|
||||||
}
|
}
|
||||||
nodeState := schema.NodeStateDB{
|
nodeState := schema.NodeStateDB{
|
||||||
TimeStamp: requestReceived,
|
TimeStamp: requestReceived,
|
||||||
@@ -116,10 +118,14 @@ func (api *RestAPI) updateNodeStates(rw http.ResponseWriter, r *http.Request) {
|
|||||||
MemoryAllocated: node.MemoryAllocated,
|
MemoryAllocated: node.MemoryAllocated,
|
||||||
GpusAllocated: node.GpusAllocated,
|
GpusAllocated: node.GpusAllocated,
|
||||||
HealthState: healthState,
|
HealthState: healthState,
|
||||||
|
HealthMetrics: healthMetrics,
|
||||||
JobsRunning: node.JobsRunning,
|
JobsRunning: node.JobsRunning,
|
||||||
}
|
}
|
||||||
|
|
||||||
repo.UpdateNodeState(node.Hostname, req.Cluster, &nodeState)
|
if err := repo.UpdateNodeState(node.Hostname, req.Cluster, &nodeState); err != nil {
|
||||||
|
cclog.Errorf("updateNodeStates: updating node state for %s on %s failed: %v",
|
||||||
|
node.Hostname, req.Cluster, err)
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
cclog.Debugf("Timer updateNodeStates, SQLite Inserts: %s", time.Since(startDB))
|
cclog.Debugf("Timer updateNodeStates, SQLite Inserts: %s", time.Since(startDB))
|
||||||
|
|||||||
@@ -169,9 +169,10 @@ func (r *NodeRepository) AddNode(node *schema.NodeDB) (int64, error) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
const NamedNodeStateInsert string = `
|
const NamedNodeStateInsert string = `
|
||||||
INSERT INTO node_state (time_stamp, node_state, health_state, cpus_allocated,
|
INSERT INTO node_state (time_stamp, node_state, health_state, health_metrics,
|
||||||
memory_allocated, gpus_allocated, jobs_running, node_id)
|
cpus_allocated, memory_allocated, gpus_allocated, jobs_running, node_id)
|
||||||
VALUES (:time_stamp, :node_state, :health_state, :cpus_allocated, :memory_allocated, :gpus_allocated, :jobs_running, :node_id);`
|
VALUES (:time_stamp, :node_state, :health_state, :health_metrics,
|
||||||
|
:cpus_allocated, :memory_allocated, :gpus_allocated, :jobs_running, :node_id);`
|
||||||
|
|
||||||
// TODO: Add real Monitoring Health State
|
// TODO: Add real Monitoring Health State
|
||||||
|
|
||||||
|
|||||||
@@ -6,6 +6,7 @@
|
|||||||
package metricstore
|
package metricstore
|
||||||
|
|
||||||
import (
|
import (
|
||||||
|
"encoding/json"
|
||||||
"fmt"
|
"fmt"
|
||||||
"time"
|
"time"
|
||||||
|
|
||||||
@@ -19,6 +20,13 @@ type HealthCheckResponse struct {
|
|||||||
Error error
|
Error error
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// HealthCheckResult holds the monitoring state and raw JSON health metrics
|
||||||
|
// for a single node as determined by HealthCheck.
|
||||||
|
type HealthCheckResult struct {
|
||||||
|
State schema.MonitoringState
|
||||||
|
HealthMetrics string // JSON: {"missing":[...],"degraded":[...]}
|
||||||
|
}
|
||||||
|
|
||||||
// MaxMissingDataPoints is the threshold for stale data detection.
|
// MaxMissingDataPoints is the threshold for stale data detection.
|
||||||
// A buffer is considered healthy if the gap between its last data point
|
// A buffer is considered healthy if the gap between its last data point
|
||||||
// and the current time is within MaxMissingDataPoints * frequency.
|
// and the current time is within MaxMissingDataPoints * frequency.
|
||||||
@@ -134,15 +142,15 @@ func (m *MemoryStore) GetHealthyMetrics(selector []string, expectedMetrics []str
|
|||||||
// - MonitoringStateFailed: node not found, or no healthy metrics at all
|
// - MonitoringStateFailed: node not found, or no healthy metrics at all
|
||||||
func (m *MemoryStore) HealthCheck(cluster string,
|
func (m *MemoryStore) HealthCheck(cluster string,
|
||||||
nodes []string, expectedMetrics []string,
|
nodes []string, expectedMetrics []string,
|
||||||
) (map[string]schema.MonitoringState, error) {
|
) (map[string]HealthCheckResult, error) {
|
||||||
results := make(map[string]schema.MonitoringState, len(nodes))
|
results := make(map[string]HealthCheckResult, len(nodes))
|
||||||
|
|
||||||
for _, hostname := range nodes {
|
for _, hostname := range nodes {
|
||||||
selector := []string{cluster, hostname}
|
selector := []string{cluster, hostname}
|
||||||
|
|
||||||
degradedList, missingList, err := m.GetHealthyMetrics(selector, expectedMetrics)
|
degradedList, missingList, err := m.GetHealthyMetrics(selector, expectedMetrics)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
results[hostname] = schema.MonitoringStateFailed
|
results[hostname] = HealthCheckResult{State: schema.MonitoringStateFailed}
|
||||||
continue
|
continue
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -158,13 +166,24 @@ func (m *MemoryStore) HealthCheck(cluster string,
|
|||||||
cclog.ComponentInfo("metricstore", "HealthCheck: node ", hostname, "missing metrics:", missingList)
|
cclog.ComponentInfo("metricstore", "HealthCheck: node ", hostname, "missing metrics:", missingList)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
var state schema.MonitoringState
|
||||||
switch {
|
switch {
|
||||||
case degradedCount == 0 && missingCount == 0:
|
case degradedCount == 0 && missingCount == 0:
|
||||||
results[hostname] = schema.MonitoringStateFull
|
state = schema.MonitoringStateFull
|
||||||
case healthyCount == 0:
|
case healthyCount == 0:
|
||||||
results[hostname] = schema.MonitoringStateFailed
|
state = schema.MonitoringStateFailed
|
||||||
default:
|
default:
|
||||||
results[hostname] = schema.MonitoringStatePartial
|
state = schema.MonitoringStatePartial
|
||||||
|
}
|
||||||
|
|
||||||
|
hm, _ := json.Marshal(map[string][]string{
|
||||||
|
"missing": missingList,
|
||||||
|
"degraded": degradedList,
|
||||||
|
})
|
||||||
|
|
||||||
|
results[hostname] = HealthCheckResult{
|
||||||
|
State: state,
|
||||||
|
HealthMetrics: string(hm),
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
@@ -253,8 +253,8 @@ func TestHealthCheck(t *testing.T) {
|
|||||||
|
|
||||||
// Check status
|
// Check status
|
||||||
if wantStatus, ok := tt.wantStates[node]; ok {
|
if wantStatus, ok := tt.wantStates[node]; ok {
|
||||||
if state != wantStatus {
|
if state.State != wantStatus {
|
||||||
t.Errorf("HealthCheck() node %s status = %v, want %v", node, state, wantStatus)
|
t.Errorf("HealthCheck() node %s status = %v, want %v", node, state.State, wantStatus)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
Reference in New Issue
Block a user