Merge pull request #481 from ClusterCockpit/dev

Dev
Jan Eitzinger
2026-02-04 19:46:02 +01:00
committed by GitHub
19 changed files with 862 additions and 234 deletions

.gitignore
View File

@@ -13,7 +13,7 @@
 /var/checkpoints*
 migrateTimestamps.pl
-test_ccms_write_api*
+test_ccms_*
 /web/frontend/public/build
 /web/frontend/node_modules

View File

@@ -135,36 +135,3 @@ func debugMetrics(rw http.ResponseWriter, r *http.Request) {
 		return
 	}
 }
-
-// handleHealthCheck godoc
-// @summary HealthCheck endpoint
-// @tags healthcheck
-// @description This endpoint allows the users to check if a node is healthy
-// @produce json
-// @param selector query string false "Selector"
-// @success 200 {string} string "Debug dump"
-// @failure 400 {object} api.ErrorResponse "Bad Request"
-// @failure 401 {object} api.ErrorResponse "Unauthorized"
-// @failure 403 {object} api.ErrorResponse "Forbidden"
-// @failure 500 {object} api.ErrorResponse "Internal Server Error"
-// @security ApiKeyAuth
-// @router /healthcheck/ [get]
-func metricsHealth(rw http.ResponseWriter, r *http.Request) {
-	rawCluster := r.URL.Query().Get("cluster")
-	rawNode := r.URL.Query().Get("node")
-
-	if rawCluster == "" || rawNode == "" {
-		handleError(errors.New("'cluster' and 'node' are required query parameter"), http.StatusBadRequest, rw)
-		return
-	}
-
-	rw.Header().Add("Content-Type", "application/json")
-
-	selector := []string{rawCluster, rawNode}
-
-	ms := metricstore.GetMemoryStore()
-	if err := ms.HealthCheck(bufio.NewWriter(rw), selector); err != nil {
-		handleError(err, http.StatusBadRequest, rw)
-		return
-	}
-}

View File

@@ -324,11 +324,12 @@ func (api *NatsAPI) processNodestateEvent(msg lp.CCMessage) {
 	}
 	repo := repository.GetNodeRepository()
+	requestReceived := time.Now().Unix()
 	for _, node := range req.Nodes {
 		state := determineState(node.States)
 		nodeState := schema.NodeStateDB{
-			TimeStamp:       time.Now().Unix(),
+			TimeStamp:       requestReceived,
 			NodeState:       state,
 			CpusAllocated:   node.CpusAllocated,
 			MemoryAllocated: node.MemoryAllocated,

View File

@@ -7,11 +7,14 @@ package api
 import (
 	"fmt"
+	"maps"
 	"net/http"
 	"strings"
 	"time"
 	"github.com/ClusterCockpit/cc-backend/internal/repository"
+	"github.com/ClusterCockpit/cc-backend/pkg/archive"
+	"github.com/ClusterCockpit/cc-backend/pkg/metricstore"
 	"github.com/ClusterCockpit/cc-lib/v2/schema"
 )
@@ -20,6 +23,15 @@ type UpdateNodeStatesRequest struct {
 	Cluster string `json:"cluster" example:"fritz"`
 }
 
+// metricListToNames converts a map of metric configurations to a list of metric names
+func metricListToNames(metricList map[string]*schema.Metric) []string {
+	names := make([]string, 0, len(metricList))
+	for name := range metricList {
+		names = append(names, name)
+	}
+	return names
+}
+
 // this routine assumes that only one of them exists per node
 func determineState(states []string) schema.SchedulerState {
 	for _, state := range states {
@@ -62,16 +74,42 @@ func (api *RestAPI) updateNodeStates(rw http.ResponseWriter, r *http.Request) {
 			http.StatusBadRequest, rw)
 		return
 	}
 
+	requestReceived := time.Now().Unix()
 	repo := repository.GetNodeRepository()
+	ms := metricstore.GetMemoryStore()
+
+	m := make(map[string][]string)
+	healthStates := make(map[string]schema.MonitoringState)
+
+	for _, node := range req.Nodes {
+		if sc, err := archive.GetSubClusterByNode(req.Cluster, node.Hostname); err == nil {
+			m[sc] = append(m[sc], node.Hostname)
+		}
+	}
+
+	for sc, nl := range m {
+		if sc != "" {
+			metricList := archive.GetMetricConfigSubCluster(req.Cluster, sc)
+			metricNames := metricListToNames(metricList)
+			if states, err := ms.HealthCheck(req.Cluster, nl, metricNames); err == nil {
+				maps.Copy(healthStates, states)
+			}
+		}
+	}
+
 	for _, node := range req.Nodes {
 		state := determineState(node.States)
+
+		healthState := schema.MonitoringStateFailed
+		if hs, ok := healthStates[node.Hostname]; ok {
+			healthState = hs
+		}
+
 		nodeState := schema.NodeStateDB{
-			TimeStamp: time.Now().Unix(), NodeState: state,
+			TimeStamp:       requestReceived,
+			NodeState:       state,
 			CpusAllocated:   node.CpusAllocated,
 			MemoryAllocated: node.MemoryAllocated,
 			GpusAllocated:   node.GpusAllocated,
-			HealthState:     schema.MonitoringStateFull,
+			HealthState:     healthState,
 			JobsRunning:     node.JobsRunning,
 		}
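For illustration, a minimal client sketch against the reworked node-state endpoint. Assumptions not taken from this diff: the /api/nodestates/ base path and port, the JWT header, the JSON tags of the node entries, and the example scheduler state value.

package main

import (
	"bytes"
	"encoding/json"
	"fmt"
	"net/http"
)

// Request shape inferred from UpdateNodeStatesRequest and the node fields used
// above; the exact JSON tags of the node entries are an assumption here.
type nodeState struct {
	Hostname string   `json:"hostname"`
	States   []string `json:"states"`
}

type updateNodeStatesRequest struct {
	Cluster string      `json:"cluster"`
	Nodes   []nodeState `json:"nodes"`
}

func main() {
	body, _ := json.Marshal(updateNodeStatesRequest{
		Cluster: "fritz",
		Nodes:   []nodeState{{Hostname: "node001", States: []string{"allocated"}}},
	})
	// Endpoint path follows the renamed route registered below (/nodestates/);
	// base URL and auth header are placeholders.
	req, _ := http.NewRequest(http.MethodPost, "http://localhost:8080/api/nodestates/", bytes.NewReader(body))
	req.Header.Set("Content-Type", "application/json")
	req.Header.Set("Authorization", "Bearer <JWT>")
	resp, err := http.DefaultClient.Do(req)
	if err != nil {
		fmt.Println("request failed:", err)
		return
	}
	defer resp.Body.Close()
	fmt.Println("status:", resp.Status)
}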

View File

@@ -81,7 +81,7 @@ func (api *RestAPI) MountAPIRoutes(r *mux.Router) {
 	// Cluster List
 	r.HandleFunc("/clusters/", api.getClusters).Methods(http.MethodGet)
 	// Slurm node state
-	r.HandleFunc("/nodestate/", api.updateNodeStates).Methods(http.MethodPost, http.MethodPut)
+	r.HandleFunc("/nodestates/", api.updateNodeStates).Methods(http.MethodPost, http.MethodPut)
 	// Job Handler
 	if config.Keys.APISubjects == nil {
 		cclog.Info("Enabling REST start/stop job API")
@@ -127,12 +127,12 @@ func (api *RestAPI) MountMetricStoreAPIRoutes(r *mux.Router) {
r.HandleFunc("/free", freeMetrics).Methods(http.MethodPost) r.HandleFunc("/free", freeMetrics).Methods(http.MethodPost)
r.HandleFunc("/write", writeMetrics).Methods(http.MethodPost) r.HandleFunc("/write", writeMetrics).Methods(http.MethodPost)
r.HandleFunc("/debug", debugMetrics).Methods(http.MethodGet) r.HandleFunc("/debug", debugMetrics).Methods(http.MethodGet)
r.HandleFunc("/healthcheck", metricsHealth).Methods(http.MethodGet) r.HandleFunc("/healthcheck", api.updateNodeStates).Methods(http.MethodPost)
// Same endpoints but with trailing slash // Same endpoints but with trailing slash
r.HandleFunc("/free/", freeMetrics).Methods(http.MethodPost) r.HandleFunc("/free/", freeMetrics).Methods(http.MethodPost)
r.HandleFunc("/write/", writeMetrics).Methods(http.MethodPost) r.HandleFunc("/write/", writeMetrics).Methods(http.MethodPost)
r.HandleFunc("/debug/", debugMetrics).Methods(http.MethodGet) r.HandleFunc("/debug/", debugMetrics).Methods(http.MethodGet)
r.HandleFunc("/healthcheck/", metricsHealth).Methods(http.MethodGet) r.HandleFunc("/healthcheck/", api.updateNodeStates).Methods(http.MethodPost)
} }
// MountConfigAPIRoutes registers configuration and user management endpoints. // MountConfigAPIRoutes registers configuration and user management endpoints.

View File

@@ -923,15 +923,19 @@ func (r *queryResolver) ClusterMetrics(ctx context.Context, cluster string, metr
 			if !okData && len(ser.Data) != 0 {
 				collectorData[metric] = make([]schema.Float, len(ser.Data))
 			} else if !okData {
-				cclog.Debugf("ClusterMetrics Skip Init: No Data -> %s at %s; Size %d", metric, ser.Hostname, len(ser.Data))
+				cclog.Debugf("[SCHEMARESOLVER] clusterMetrics skip init: no data -> %s at %s; size %d", metric, ser.Hostname, len(ser.Data))
 			}
 			// Sum if init'd and matching size
 			if okData && len(ser.Data) == len(collectorData[metric]) {
 				for i, val := range ser.Data {
+					if val.IsNaN() {
+						continue
+					} else {
 						collectorData[metric][i] += val
 					}
+				}
 			} else if okData {
-				cclog.Debugf("ClusterMetrics Skip Sum: Data Diff -> %s at %s; Want Size %d, Have Size %d", metric, ser.Hostname, len(collectorData[metric]), len(ser.Data))
+				cclog.Debugf("[SCHEMARESOLVER] clusterMetrics skip sum: data diff -> %s at %s; want size %d, have size %d", metric, ser.Hostname, len(collectorData[metric]), len(ser.Data))
 			}
 		}
 	}
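To isolate the NaN guard added above: a small sketch with plain float64 slices (schema.Float in the resolver wraps a float with an IsNaN helper; math.IsNaN stands in for it here).

package main

import (
	"fmt"
	"math"
)

// sumSeries adds src element-wise into dst, skipping NaN samples so that a
// single missing value does not poison the accumulated sum.
func sumSeries(dst, src []float64) {
	if len(dst) != len(src) {
		return // mirrors the "skip sum: data diff" branch above
	}
	for i, val := range src {
		if math.IsNaN(val) {
			continue
		}
		dst[i] += val
	}
}

func main() {
	total := make([]float64, 3)
	sumSeries(total, []float64{1, math.NaN(), 3})
	sumSeries(total, []float64{2, 2, math.NaN()})
	fmt.Println(total) // [3 2 3]
}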

View File

@@ -466,7 +466,7 @@ func (r *JobRepository) JobCountGrouped(
 // AddJobCountGrouped augments existing statistics with additional job counts by category.
 //
 // This method enriches JobsStatistics returned by JobsStatsGrouped or JobCountGrouped
-// with counts of running or short-running jobs, matched by group ID.
+// with counts of running or short-running (based on ShortRunningJobsDuration) jobs, matched by group ID.
 //
 // Parameters:
 //   - ctx: Context for security checks

View File

@@ -158,8 +158,7 @@ func cleanupCheckpoints(dir string, cleanupDir string, from int64, deleteInstead
 		return 0, err
 	}
 
-	extension := Keys.Checkpoints.FileFormat
-	files, err := findFiles(entries, from, extension, false)
+	files, err := findFiles(entries, from, false)
 	if err != nil {
 		return 0, err
 	}

View File

@@ -415,7 +415,7 @@ func enqueueCheckpointHosts(dir string, work chan<- [2]string) error {
 //
 // Uses worker pool to load cluster/host combinations. Periodically triggers GC
 // to prevent excessive heap growth. Returns number of files loaded and any errors.
-func (m *MemoryStore) FromCheckpoint(dir string, from int64, extension string) (int, error) {
+func (m *MemoryStore) FromCheckpoint(dir string, from int64) (int, error) {
 	var wg sync.WaitGroup
 	work := make(chan [2]string, Keys.NumWorkers*4)
 	n, errs := int32(0), int32(0)
@@ -426,7 +426,7 @@ func (m *MemoryStore) FromCheckpoint(dir string, from int64, extension string) (
 			defer wg.Done()
 			for host := range work {
 				lvl := m.root.findLevelOrCreate(host[:], len(m.Metrics))
-				nn, err := lvl.fromCheckpoint(m, filepath.Join(dir, host[0], host[1]), from, extension)
+				nn, err := lvl.fromCheckpoint(m, filepath.Join(dir, host[0], host[1]), from)
 				if err != nil {
 					cclog.Errorf("[METRICSTORE]> error while loading checkpoints for %s/%s: %s", host[0], host[1], err.Error())
 					atomic.AddInt32(&errs, 1)
@@ -465,57 +465,7 @@ func (m *MemoryStore) FromCheckpointFiles(dir string, from int64) (int, error) {
cclog.Debugf("[METRICSTORE]> %#v Directory created successfully", dir) cclog.Debugf("[METRICSTORE]> %#v Directory created successfully", dir)
} }
// Config read (replace with your actual config read) return m.FromCheckpoint(dir, from)
fileFormat := Keys.Checkpoints.FileFormat
if fileFormat == "" {
fileFormat = "avro"
}
// Map to easily get the fallback format
oppositeFormat := map[string]string{
"json": "avro",
"avro": "json",
}
// First, attempt to load the specified format
if found, err := checkFilesWithExtension(dir, fileFormat); err != nil {
return 0, fmt.Errorf("[METRICSTORE]> error checking files with extension: %v", err)
} else if found {
cclog.Infof("[METRICSTORE]> Loading %s files because fileformat is %s", fileFormat, fileFormat)
return m.FromCheckpoint(dir, from, fileFormat)
}
// If not found, attempt the opposite format
altFormat := oppositeFormat[fileFormat]
if found, err := checkFilesWithExtension(dir, altFormat); err != nil {
return 0, fmt.Errorf("[METRICSTORE]> error checking files with extension: %v", err)
} else if found {
cclog.Infof("[METRICSTORE]> Loading %s files but fileformat is %s", altFormat, fileFormat)
return m.FromCheckpoint(dir, from, altFormat)
}
return 0, nil
}
// checkFilesWithExtension walks a directory tree to check if files with the given extension exist.
func checkFilesWithExtension(dir string, extension string) (bool, error) {
found := false
err := filepath.Walk(dir, func(path string, info os.FileInfo, err error) error {
if err != nil {
return fmt.Errorf("[METRICSTORE]> error accessing path %s: %v", path, err)
}
if !info.IsDir() && filepath.Ext(info.Name()) == "."+extension {
found = true
return nil
}
return nil
})
if err != nil {
return false, fmt.Errorf("[METRICSTORE]> error walking through directories: %s", err)
}
return found, nil
} }
func (l *Level) loadAvroFile(m *MemoryStore, f *os.File, from int64) error { func (l *Level) loadAvroFile(m *MemoryStore, f *os.File, from int64) error {
@@ -729,7 +679,7 @@ func (l *Level) loadFile(cf *CheckpointFile, m *MemoryStore) error {
 	return nil
 }
 
-func (l *Level) fromCheckpoint(m *MemoryStore, dir string, from int64, extension string) (int, error) {
+func (l *Level) fromCheckpoint(m *MemoryStore, dir string, from int64) (int, error) {
 	direntries, err := os.ReadDir(dir)
 	if err != nil {
 		if os.IsNotExist(err) {
@@ -748,33 +698,38 @@ func (l *Level) fromCheckpoint(m *MemoryStore, dir string, from int64, extension
 				children: make(map[string]*Level),
 			}
 
-			files, err := child.fromCheckpoint(m, path.Join(dir, e.Name()), from, extension)
+			files, err := child.fromCheckpoint(m, path.Join(dir, e.Name()), from)
 			filesLoaded += files
 			if err != nil {
 				return filesLoaded, err
 			}
 			l.children[e.Name()] = child
-		} else if strings.HasSuffix(e.Name(), "."+extension) {
+		} else if strings.HasSuffix(e.Name(), ".json") || strings.HasSuffix(e.Name(), ".avro") {
 			allFiles = append(allFiles, e)
 		} else {
 			continue
 		}
 	}
 
-	files, err := findFiles(allFiles, from, extension, true)
+	files, err := findFiles(allFiles, from, true)
 	if err != nil {
 		return filesLoaded, err
 	}
 
 	loaders := map[string]func(*MemoryStore, *os.File, int64) error{
-		"json": l.loadJSONFile,
-		"avro": l.loadAvroFile,
+		".json": l.loadJSONFile,
+		".avro": l.loadAvroFile,
 	}
 
-	loader := loaders[extension]
-
 	for _, filename := range files {
+		ext := filepath.Ext(filename)
+		loader := loaders[ext]
+		if loader == nil {
+			cclog.Warnf("Unknown extension for file %s", filename)
+			continue
+		}
+
 		// Use a closure to ensure file is closed immediately after use
 		err := func() error {
 			f, err := os.Open(path.Join(dir, filename))
@@ -798,10 +753,12 @@ func (l *Level) fromCheckpoint(m *MemoryStore, dir string, from int64, extension
 // This will probably get very slow over time!
 // A solution could be some sort of an index file in which all other files
 // and the timespan they contain is listed.
-func findFiles(direntries []fs.DirEntry, t int64, extension string, findMoreRecentFiles bool) ([]string, error) {
+// NOTE: This now assumes that you have distinct timestamps for json and avro files
+// Also, it assumes that the timestamps are not overlapping/self-modified.
+func findFiles(direntries []fs.DirEntry, t int64, findMoreRecentFiles bool) ([]string, error) {
 	nums := map[string]int64{}
 	for _, e := range direntries {
-		if !strings.HasSuffix(e.Name(), "."+extension) {
+		if !strings.HasSuffix(e.Name(), ".json") && !strings.HasSuffix(e.Name(), ".avro") {
 			continue
 		}
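The per-file extension dispatch introduced above, sketched in isolation (file names and loader bodies here are placeholders, not the package's API): checkpoint directories may now contain .json and .avro files side by side, so each file's extension picks the loader instead of a global format setting.

package main

import (
	"fmt"
	"path/filepath"
)

func main() {
	// Loader table keyed by extension, mirroring the loaders map above.
	loaders := map[string]func(name string) error{
		".json": func(name string) error { fmt.Println("load JSON", name); return nil },
		".avro": func(name string) error { fmt.Println("load Avro", name); return nil },
	}
	for _, f := range []string{"1700000000.json", "1700000600.avro", "notes.txt"} {
		loader := loaders[filepath.Ext(f)]
		if loader == nil {
			fmt.Println("unknown extension for file", f) // mirrors the cclog.Warnf branch
			continue
		}
		_ = loader(f)
	}
}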

View File

@@ -6,87 +6,260 @@
 package metricstore
 
 import (
-	"bufio"
+	"cmp"
 	"fmt"
+	"slices"
 	"time"
+
+	"github.com/ClusterCockpit/cc-lib/v2/schema"
 )
// HealthCheckResponse represents the result of a health check operation.
//
// Status indicates the monitoring state (Full, Partial, Failed).
// Error contains any error encountered during the health check.
type HealthCheckResponse struct {
Status schema.MonitoringState
Error error
}
 // MaxMissingDataPoints is a threshold that allows a node to be healthy with certain number of data points missing.
 // Suppose a node does not receive last 5 data points, then healthCheck endpoint will still say a
 // node is healthy. Anything more than 5 missing points in metrics of the node will deem the node unhealthy.
 const MaxMissingDataPoints int64 = 5
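// Worked example (editor's sketch, not part of this file): with a metric
// frequency of 10 seconds, MaxMissingDataPoints = 5 tolerates data that is at
// most 5*10 = 50 seconds old; a buffer whose last sample is older than that
// is treated as stale by the checks below.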
-// MaxUnhealthyMetrics is a threshold which allows upto certain number of metrics in a node to be unhealthly.
-// Works with MaxMissingDataPoints. Say 5 metrics (including submetrics) do not receive the last
-// MaxMissingDataPoints data points, then the node will be deemed healthy. Any more metrics that does
-// not receive data for MaxMissingDataPoints data points will deem the node unhealthy.
-const MaxUnhealthyMetrics int64 = 5
-
-func (b *buffer) healthCheck() int64 {
+// bufferExists reports whether a buffer has been allocated and holds any data.
+//
+// A nil buffer or an empty buffer is considered missing.
+func (b *buffer) bufferExists() bool {
 	// Check if the buffer is empty
-	if b.data == nil {
-		return 1
+	if b == nil || b.data == nil || len(b.data) == 0 {
+		return false
 	}
+	return true
+}
// isBufferHealthy checks if a buffer has received data for the last MaxMissingDataPoints.
//
// Returns true if the buffer is healthy (recent data within threshold), false otherwise.
// A nil buffer or empty buffer is considered unhealthy.
func (b *buffer) isBufferHealthy() bool {
+	// Get the last endtime of the buffer
 	bufferEnd := b.start + b.frequency*int64(len(b.data))
 	t := time.Now().Unix()
-	// Check if the buffer is too old
+	// Check if the buffer has recent data (within MaxMissingDataPoints threshold)
 	if t-bufferEnd > MaxMissingDataPoints*b.frequency {
-		return 1
+		return false
 	}
-	return 0
+	return true
 }
-func (l *Level) healthCheck(m *MemoryStore, count int64) (int64, error) {
+// mergeList merges two lists, sorts them, and removes duplicates.
// Requires 'cmp.Ordered' because we need to sort the data.
func mergeList[string cmp.Ordered](list1, list2 []string) []string {
// 1. Combine both lists
result := append(list1, list2...)
// 2. Sort the combined list
slices.Sort(result)
// 3. Compact removes consecutive duplicates (standard in Go 1.21+)
// e.g. [1, 1, 2, 3, 3] -> [1, 2, 3]
result = slices.Compact(result)
return result
}
// getHealthyMetrics recursively collects healthy and degraded metrics at this level and below.
//
// A metric is considered:
// - Healthy: buffer has recent data within MaxMissingDataPoints threshold AND has few/no NaN values
// - Degraded: buffer exists and has recent data, but contains more than MaxMissingDataPoints NaN values
//
// This routine walks the entire subtree starting from the current level.
//
// Parameters:
// - m: MemoryStore containing the global metric configuration
//
// Returns:
// - []string: Flat list of healthy metric names from this level and all children
// - []string: Flat list of degraded metric names (exist but have too many missing values)
// - error: Non-nil only for internal errors during recursion
//
// The routine mirrors healthCheck() but provides more granular classification:
// - healthCheck() finds problems (stale/missing)
// - getHealthyMetrics() separates healthy from degraded metrics
func (l *Level) getHealthyMetrics(m *MemoryStore, expectedMetrics []string) ([]string, []string, error) {
 	l.lock.RLock()
 	defer l.lock.RUnlock()
-	for _, mc := range m.Metrics {
-		if b := l.metrics[mc.offset]; b != nil {
-			count += b.healthCheck()
+
+	globalMetrics := m.Metrics
+
+	missingList := make([]string, 0)
+	degradedList := make([]string, 0)
+
+	// Phase 1: Check metrics at this level
+	for _, metricName := range expectedMetrics {
+		offset := globalMetrics[metricName].offset
+		b := l.metrics[offset]
+
+		if !b.bufferExists() {
+			missingList = append(missingList, metricName)
+		} else if !b.isBufferHealthy() {
+			degradedList = append(degradedList, metricName)
 		}
 	}
+	// Phase 2: Recursively check child levels
 	for _, lvl := range l.children {
-		c, err := lvl.healthCheck(m, 0)
+		childMissing, childDegraded, err := lvl.getHealthyMetrics(m, expectedMetrics)
 		if err != nil {
-			return 0, err
+			return nil, nil, err
 		}
-		count += c
+
+		missingList = mergeList(missingList, childMissing)
+		degradedList = mergeList(degradedList, childDegraded)
 	}
-	return count, nil
+
+	return missingList, degradedList, nil
 }
-
-func (m *MemoryStore) HealthCheck(w *bufio.Writer, selector []string) error {
// GetHealthyMetrics returns healthy and degraded metrics for a specific node as flat lists.
//
// This routine walks the metric tree starting from the specified node selector
// and collects all metrics that have received data within the last MaxMissingDataPoints
// (default: 5 data points). Metrics are classified into two categories:
//
// - Healthy: Buffer has recent data AND contains few/no NaN (missing) values
// - Degraded: Buffer has recent data BUT contains more than MaxMissingDataPoints NaN values
//
// The returned lists include both node-level metrics (e.g., "load", "mem_used") and
// hardware-level metrics (e.g., "cpu_user", "gpu_temp") in flat slices.
//
// Parameters:
// - selector: Hierarchical path to the target node, typically []string{cluster, hostname}.
// Example: []string{"emmy", "node001"} navigates to the "node001" host in the "emmy" cluster.
// The selector must match the hierarchy used during metric ingestion.
//
// Returns:
// - []string: Flat list of healthy metric names (recent data, few missing values)
// - []string: Flat list of degraded metric names (recent data, many missing values)
// - error: Non-nil if the node is not found or internal errors occur
//
// Example usage:
//
// selector := []string{"emmy", "node001"}
// healthyMetrics, degradedMetrics, err := ms.GetHealthyMetrics(selector)
// if err != nil {
// // Node not found or internal error
// return err
// }
// fmt.Printf("Healthy metrics: %v\n", healthyMetrics)
// // Output: ["load", "mem_used", "cpu_user", ...]
// fmt.Printf("Degraded metrics: %v\n", degradedMetrics)
// // Output: ["gpu_temp", "network_rx", ...] (metrics with many NaN values)
//
// Note: This routine provides more granular classification than HealthCheck:
// - HealthCheck reports stale/missing metrics (problems)
// - GetHealthyMetrics separates fully healthy from degraded metrics (quality levels)
func (m *MemoryStore) GetHealthyMetrics(selector []string, expectedMetrics []string) ([]string, []string, error) {
 	lvl := m.root.findLevel(selector)
 	if lvl == nil {
-		return fmt.Errorf("[METRICSTORE]> not found: %#v", selector)
+		return nil, nil, fmt.Errorf("[METRICSTORE]> error while GetHealthyMetrics, host not found: %#v", selector)
 	}
-	buf := make([]byte, 0, 25)
-	// buf = append(buf, "{"...)
-	var count int64 = 0
-	unhealthyMetricsCount, err := lvl.healthCheck(m, count)
+
+	missingList, degradedList, err := lvl.getHealthyMetrics(m, expectedMetrics)
 	if err != nil {
-		return err
+		return nil, nil, err
 	}
-	if unhealthyMetricsCount < MaxUnhealthyMetrics {
-		buf = append(buf, "Healthy"...)
+
+	return missingList, degradedList, nil
+}
// HealthCheck performs health checks on multiple nodes and returns their monitoring states.
//
// This routine provides a batch health check interface that evaluates multiple nodes
// against a specific set of expected metrics. For each node, it determines the overall
// monitoring state based on which metrics are healthy, degraded, or missing.
//
// Health Status Classification:
// - MonitoringStateFull: All expected metrics are healthy (recent data, few missing values)
// - MonitoringStatePartial: Some metrics are degraded (many missing values) or missing
// - MonitoringStateFailed: Node not found or all expected metrics are missing/stale
//
// Parameters:
// - cluster: Cluster name (first element of selector path)
// - nodes: List of node hostnames to check
// - expectedMetrics: List of metric names that should be present on each node
//
// Returns:
// - map[string]schema.MonitoringState: Map keyed by hostname containing monitoring state for each node
// - error: Non-nil only for internal errors (individual node failures are captured as MonitoringStateFailed)
//
// Example usage:
//
// cluster := "emmy"
// nodes := []string{"node001", "node002", "node003"}
// expectedMetrics := []string{"load", "mem_used", "cpu_user", "cpu_system"}
// healthStates, err := ms.HealthCheck(cluster, nodes, expectedMetrics)
// if err != nil {
// return err
// }
// for hostname, state := range healthStates {
// fmt.Printf("Node %s: %s\n", hostname, state)
// }
//
// Note: This routine is optimized for batch operations where you need to check
// the same set of metrics across multiple nodes.
func (m *MemoryStore) HealthCheck(cluster string,
nodes []string, expectedMetrics []string,
) (map[string]schema.MonitoringState, error) {
results := make(map[string]schema.MonitoringState, len(nodes))
// Create a set of expected metrics for fast lookup
expectedSet := make(map[string]bool, len(expectedMetrics))
for _, metric := range expectedMetrics {
expectedSet[metric] = true
}
// Check each node
for _, hostname := range nodes {
selector := []string{cluster, hostname}
status := schema.MonitoringStateFull
healthyCount := 0
degradedCount := 0
missingCount := 0
// Get healthy and degraded metrics for this node
missingList, degradedList, err := m.GetHealthyMetrics(selector, expectedMetrics)
if err != nil {
// Node not found or internal error
results[hostname] = schema.MonitoringStateFailed
continue
}
missingCount = len(missingList)
degradedCount = len(degradedList)
healthyCount = len(expectedMetrics) - (missingCount + degradedCount)
// Determine overall health status
if missingCount > 0 || degradedCount > 0 {
if healthyCount == 0 {
// No healthy metrics at all
status = schema.MonitoringStateFailed
 			} else {
-				buf = append(buf, "Unhealthy"...)
+				// Some healthy, some degraded/missing
status = schema.MonitoringStatePartial
}
}
// else: all metrics healthy, status remains MonitoringStateFull
results[hostname] = status
 	}
-
-	// buf = append(buf, "}\n"...)
-	if _, err = w.Write(buf); err != nil {
-		return err
-	}
-	return w.Flush()
+
+	return results, nil
 }
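The Full/Partial/Failed rule implemented by HealthCheck above reduces to a small pure function; a sketch under the same rule (names here are illustrative, not part of this package):

package main

import "fmt"

// classify reproduces the decision rule from HealthCheck: all expected metrics
// healthy -> "full", no healthy metric left -> "failed", anything in between -> "partial".
func classify(expected, missing, degraded int) string {
	healthy := expected - (missing + degraded)
	if missing == 0 && degraded == 0 {
		return "full"
	}
	if healthy == 0 {
		return "failed"
	}
	return "partial"
}

func main() {
	fmt.Println(classify(4, 0, 0)) // full
	fmt.Println(classify(4, 1, 1)) // partial
	fmt.Println(classify(4, 2, 2)) // failed
}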

View File

@@ -7,6 +7,7 @@ package metricstore
 import (
 	"testing"
+	"time"
 
 	"github.com/ClusterCockpit/cc-lib/v2/schema"
 )
@@ -88,3 +89,378 @@ func TestBufferRead(t *testing.T) {
t.Errorf("buffer.read() len(result) = %d, want 3", len(result)) t.Errorf("buffer.read() len(result) = %d, want 3", len(result))
} }
} }
func TestHealthCheck(t *testing.T) {
// Create a test MemoryStore with some metrics
metrics := map[string]MetricConfig{
"load": {Frequency: 10, Aggregation: AvgAggregation, offset: 0},
"mem_used": {Frequency: 10, Aggregation: AvgAggregation, offset: 1},
"cpu_user": {Frequency: 10, Aggregation: AvgAggregation, offset: 2},
"cpu_system": {Frequency: 10, Aggregation: AvgAggregation, offset: 3},
}
ms := &MemoryStore{
Metrics: metrics,
root: Level{
metrics: make([]*buffer, len(metrics)),
children: make(map[string]*Level),
},
}
// Use recent timestamps (current time minus a small offset)
now := time.Now().Unix()
startTime := now - 100 // Start 100 seconds ago to have enough data points
// Setup test data for node001 - all metrics healthy (recent data)
node001 := ms.root.findLevelOrCreate([]string{"testcluster", "node001"}, len(metrics))
for i := 0; i < len(metrics); i++ {
node001.metrics[i] = newBuffer(startTime, 10)
// Write recent data up to now
for ts := startTime; ts <= now; ts += 10 {
node001.metrics[i].write(ts, schema.Float(float64(i+1)))
}
}
// Setup test data for node002 - some metrics stale (old data beyond MaxMissingDataPoints threshold)
node002 := ms.root.findLevelOrCreate([]string{"testcluster", "node002"}, len(metrics))
// MaxMissingDataPoints = 5, frequency = 10, so threshold is 50 seconds
staleTime := now - 100 // Data ends 100 seconds ago (well beyond 50 second threshold)
for i := 0; i < len(metrics); i++ {
node002.metrics[i] = newBuffer(staleTime-50, 10)
if i < 2 {
// First two metrics: healthy (recent data)
for ts := startTime; ts <= now; ts += 10 {
node002.metrics[i].write(ts, schema.Float(float64(i+1)))
}
} else {
// Last two metrics: stale (data ends 100 seconds ago)
for ts := staleTime - 50; ts <= staleTime; ts += 10 {
node002.metrics[i].write(ts, schema.Float(float64(i+1)))
}
}
}
// Setup test data for node003 - some metrics missing (no buffer)
node003 := ms.root.findLevelOrCreate([]string{"testcluster", "node003"}, len(metrics))
// Only create buffers for first two metrics
for i := 0; i < 2; i++ {
node003.metrics[i] = newBuffer(startTime, 10)
for ts := startTime; ts <= now; ts += 10 {
node003.metrics[i].write(ts, schema.Float(float64(i+1)))
}
}
// Leave metrics[2] and metrics[3] as nil (missing)
// Setup test data for node005 - all metrics stale
node005 := ms.root.findLevelOrCreate([]string{"testcluster", "node005"}, len(metrics))
for i := 0; i < len(metrics); i++ {
node005.metrics[i] = newBuffer(staleTime-50, 10)
// All metrics have stale data (ends 100 seconds ago)
for ts := staleTime - 50; ts <= staleTime; ts += 10 {
node005.metrics[i].write(ts, schema.Float(float64(i+1)))
}
}
// node004 doesn't exist at all
tests := []struct {
name string
cluster string
nodes []string
expectedMetrics []string
wantStates map[string]schema.MonitoringState
}{
{
name: "all metrics healthy",
cluster: "testcluster",
nodes: []string{"node001"},
expectedMetrics: []string{"load", "mem_used", "cpu_user", "cpu_system"},
wantStates: map[string]schema.MonitoringState{
"node001": schema.MonitoringStateFull,
},
},
{
name: "some metrics stale",
cluster: "testcluster",
nodes: []string{"node002"},
expectedMetrics: []string{"load", "mem_used", "cpu_user", "cpu_system"},
wantStates: map[string]schema.MonitoringState{
"node002": schema.MonitoringStatePartial,
},
},
{
name: "some metrics missing",
cluster: "testcluster",
nodes: []string{"node003"},
expectedMetrics: []string{"load", "mem_used", "cpu_user", "cpu_system"},
wantStates: map[string]schema.MonitoringState{
"node003": schema.MonitoringStatePartial,
},
},
{
name: "node not found",
cluster: "testcluster",
nodes: []string{"node004"},
expectedMetrics: []string{"load", "mem_used", "cpu_user", "cpu_system"},
wantStates: map[string]schema.MonitoringState{
"node004": schema.MonitoringStateFailed,
},
},
{
name: "all metrics stale",
cluster: "testcluster",
nodes: []string{"node005"},
expectedMetrics: []string{"load", "mem_used", "cpu_user", "cpu_system"},
wantStates: map[string]schema.MonitoringState{
"node005": schema.MonitoringStateFailed,
},
},
{
name: "multiple nodes mixed states",
cluster: "testcluster",
nodes: []string{"node001", "node002", "node003", "node004", "node005"},
expectedMetrics: []string{"load", "mem_used"},
wantStates: map[string]schema.MonitoringState{
"node001": schema.MonitoringStateFull,
"node002": schema.MonitoringStateFull, // Only checking first 2 metrics which are healthy
"node003": schema.MonitoringStateFull, // Only checking first 2 metrics which exist
"node004": schema.MonitoringStateFailed, // Node doesn't exist
"node005": schema.MonitoringStateFailed, // Both metrics are stale
},
},
}
for _, tt := range tests {
t.Run(tt.name, func(t *testing.T) {
results, err := ms.HealthCheck(tt.cluster, tt.nodes, tt.expectedMetrics)
if err != nil {
t.Errorf("HealthCheck() error = %v", err)
return
}
// Check that we got results for all nodes
if len(results) != len(tt.nodes) {
t.Errorf("HealthCheck() returned %d results, want %d", len(results), len(tt.nodes))
}
// Check each node's state
for _, node := range tt.nodes {
state, ok := results[node]
if !ok {
t.Errorf("HealthCheck() missing result for node %s", node)
continue
}
// Check status
if wantStatus, ok := tt.wantStates[node]; ok {
if state != wantStatus {
t.Errorf("HealthCheck() node %s status = %v, want %v", node, state, wantStatus)
}
}
}
})
}
}
// TestGetHealthyMetrics tests the GetHealthyMetrics function which returns lists of missing and degraded metrics
func TestGetHealthyMetrics(t *testing.T) {
metrics := map[string]MetricConfig{
"load": {Frequency: 10, Aggregation: AvgAggregation, offset: 0},
"mem_used": {Frequency: 10, Aggregation: AvgAggregation, offset: 1},
"cpu_user": {Frequency: 10, Aggregation: AvgAggregation, offset: 2},
}
ms := &MemoryStore{
Metrics: metrics,
root: Level{
metrics: make([]*buffer, len(metrics)),
children: make(map[string]*Level),
},
}
now := time.Now().Unix()
startTime := now - 100
staleTime := now - 100
// Setup node with mixed health states
node := ms.root.findLevelOrCreate([]string{"testcluster", "testnode"}, len(metrics))
// Metric 0 (load): healthy - recent data
node.metrics[0] = newBuffer(startTime, 10)
for ts := startTime; ts <= now; ts += 10 {
node.metrics[0].write(ts, schema.Float(1.0))
}
// Metric 1 (mem_used): degraded - stale data
node.metrics[1] = newBuffer(staleTime-50, 10)
for ts := staleTime - 50; ts <= staleTime; ts += 10 {
node.metrics[1].write(ts, schema.Float(2.0))
}
// Metric 2 (cpu_user): missing - no buffer (nil)
tests := []struct {
name string
selector []string
expectedMetrics []string
wantMissing []string
wantDegraded []string
wantErr bool
}{
{
name: "mixed health states",
selector: []string{"testcluster", "testnode"},
expectedMetrics: []string{"load", "mem_used", "cpu_user"},
wantMissing: []string{"cpu_user"},
wantDegraded: []string{"mem_used"},
wantErr: false,
},
{
name: "node not found",
selector: []string{"testcluster", "nonexistent"},
expectedMetrics: []string{"load"},
wantMissing: nil,
wantDegraded: nil,
wantErr: true,
},
{
name: "check only healthy metric",
selector: []string{"testcluster", "testnode"},
expectedMetrics: []string{"load"},
wantMissing: []string{},
wantDegraded: []string{},
wantErr: false,
},
}
for _, tt := range tests {
t.Run(tt.name, func(t *testing.T) {
missing, degraded, err := ms.GetHealthyMetrics(tt.selector, tt.expectedMetrics)
if (err != nil) != tt.wantErr {
t.Errorf("GetHealthyMetrics() error = %v, wantErr %v", err, tt.wantErr)
return
}
if tt.wantErr {
return
}
// Check missing list
if len(missing) != len(tt.wantMissing) {
t.Errorf("GetHealthyMetrics() missing = %v, want %v", missing, tt.wantMissing)
} else {
for i, m := range tt.wantMissing {
if missing[i] != m {
t.Errorf("GetHealthyMetrics() missing[%d] = %v, want %v", i, missing[i], m)
}
}
}
// Check degraded list
if len(degraded) != len(tt.wantDegraded) {
t.Errorf("GetHealthyMetrics() degraded = %v, want %v", degraded, tt.wantDegraded)
} else {
for i, d := range tt.wantDegraded {
if degraded[i] != d {
t.Errorf("GetHealthyMetrics() degraded[%d] = %v, want %v", i, degraded[i], d)
}
}
}
})
}
}
// TestBufferHealthChecks tests the buffer-level health check functions
func TestBufferHealthChecks(t *testing.T) {
now := time.Now().Unix()
tests := []struct {
name string
setupBuffer func() *buffer
wantExists bool
wantHealthy bool
description string
}{
{
name: "nil buffer",
setupBuffer: func() *buffer {
return nil
},
wantExists: false,
wantHealthy: false,
description: "nil buffer should not exist and not be healthy",
},
{
name: "empty buffer",
setupBuffer: func() *buffer {
b := newBuffer(now, 10)
b.data = nil
return b
},
wantExists: false,
wantHealthy: false,
description: "empty buffer should not exist and not be healthy",
},
{
name: "healthy buffer with recent data",
setupBuffer: func() *buffer {
b := newBuffer(now-30, 10)
// Write data up to now (within MaxMissingDataPoints * frequency = 50 seconds)
for ts := now - 30; ts <= now; ts += 10 {
b.write(ts, schema.Float(1.0))
}
return b
},
wantExists: true,
wantHealthy: true,
description: "buffer with recent data should be healthy",
},
{
name: "stale buffer beyond threshold",
setupBuffer: func() *buffer {
b := newBuffer(now-200, 10)
// Write data that ends 100 seconds ago (beyond MaxMissingDataPoints * frequency = 50 seconds)
for ts := now - 200; ts <= now-100; ts += 10 {
b.write(ts, schema.Float(1.0))
}
return b
},
wantExists: true,
wantHealthy: false,
description: "buffer with stale data should exist but not be healthy",
},
{
name: "buffer at threshold boundary",
setupBuffer: func() *buffer {
b := newBuffer(now-50, 10)
// Write data that ends exactly at threshold (MaxMissingDataPoints * frequency = 50 seconds)
for ts := now - 50; ts <= now-50; ts += 10 {
b.write(ts, schema.Float(1.0))
}
return b
},
wantExists: true,
wantHealthy: true,
description: "buffer at threshold boundary should still be healthy",
},
}
for _, tt := range tests {
t.Run(tt.name, func(t *testing.T) {
b := tt.setupBuffer()
exists := b.bufferExists()
if exists != tt.wantExists {
t.Errorf("bufferExists() = %v, want %v: %s", exists, tt.wantExists, tt.description)
}
if b != nil && b.data != nil && len(b.data) > 0 {
healthy := b.isBufferHealthy()
if healthy != tt.wantHealthy {
t.Errorf("isBufferHealthy() = %v, want %v: %s", healthy, tt.wantHealthy, tt.description)
}
}
})
}
}

View File

@@ -7,7 +7,7 @@
--> -->
<script> <script>
import { onMount } from "svelte"; import { getContext, onMount } from "svelte";
import { import {
Row, Row,
Col, Col,
@@ -18,6 +18,7 @@
Spinner, Spinner,
InputGroup, InputGroup,
Input, Input,
Tooltip
} from "@sveltestrap/sveltestrap"; } from "@sveltestrap/sveltestrap";
import { import {
queryStore, queryStore,
@@ -29,6 +30,9 @@
scramble, scramble,
scrambleNames, scrambleNames,
} from "./generic/utils.js"; } from "./generic/utils.js";
import {
formatDurationTime
} from "./generic/units.js";
import Filters from "./generic/Filters.svelte"; import Filters from "./generic/Filters.svelte";
/* Svelte 5 Props */ /* Svelte 5 Props */
@@ -40,48 +44,70 @@
/* Const Init */ /* Const Init */
const {} = init(); const {} = init();
const client = getContextClient(); const client = getContextClient();
const shortDuration = getContext("cc-config").jobList_hideShortRunningJobs; // Always configured
/* State Init*/ /* State Init*/
let filterComponent = $state(); // see why here: https://stackoverflow.com/questions/58287729/how-can-i-export-a-function-from-a-svelte-component-that-changes-a-value-in-the let filterComponent = $state(); // see why here: https://stackoverflow.com/questions/58287729/how-can-i-export-a-function-from-a-svelte-component-that-changes-a-value-in-the
let jobFilters = $state([]); let jobFilters = $state([]);
let nameFilter = $state(""); let nameFilter = $state("");
let sorting = $state({ field: "totalJobs", direction: "down" }); let sorting = $state({ field: "totalJobs", direction: "desc" });
/* Derived Vars */ /* Derived Vars */
const fetchRunning = $derived(jobFilters.some(jf => jf?.state?.length == 1 && jf?.state?.includes("running")));
const numCols = $derived.by(() => {
let colbase = 6
if (fetchRunning) {
colbase += 2
}
return colbase
})
let stats = $derived( let stats = $derived(
queryStore({ queryStore({
client: client, client: client,
query: gql` query: gql`
query($jobFilters: [JobFilter!]!) { query($jobFilters: [JobFilter!]!, $fetchRunning: Boolean!) {
rows: jobsStatistics(filter: $jobFilters, groupBy: ${type}) { rows: jobsStatistics(filter: $jobFilters, groupBy: ${type}) {
id id
name name
totalJobs totalJobs
shortJobs
totalCores @include(if: $fetchRunning)
totalAccs @include(if: $fetchRunning)
totalWalltime totalWalltime
totalCoreHours totalCoreHours
totalAccHours totalAccHours
} }
}`, }`,
variables: { jobFilters }, variables: {
jobFilters,
fetchRunning
},
}) })
); );
/* Functions */ /* Functions */
function changeSorting(field) { function changeSorting(newField) {
sorting = { field, direction: sorting?.direction == "down" ? "up" : "down" }; if (sorting.field == newField) {
// Same Field, Change Direction
sorting = { field: newField, direction: sorting.direction == "desc" ? "asc" : "desc" };
} else {
// Change Field, Apply Default Direction
sorting = { field: newField, direction: "desc" };
}
} }
function sort(stats, sorting, nameFilter) { function sort(stats, sorting, nameFilter) {
const idCmp = sorting.direction == "up" const idCmp = sorting.direction == "asc"
? (a, b) => b.id.localeCompare(a.id) ? (a, b) => b.id.localeCompare(a.id)
: (a, b) => a.id.localeCompare(b.id) : (a, b) => a.id.localeCompare(b.id)
// Force empty or undefined strings to the end of the list // Force empty or undefined strings to the end of the list
const nameCmp = sorting.direction == "up" const nameCmp = sorting.direction == "asc"
? (a, b) => !a?.name ? 1 : (!b?.name ? -1 : (b.name.localeCompare(a.name))) ? (a, b) => !a?.name ? 1 : (!b?.name ? -1 : (b.name.localeCompare(a.name)))
: (a, b) => !a?.name ? 1 : (!b?.name ? -1 : (a.name.localeCompare(b.name))) : (a, b) => !a?.name ? 1 : (!b?.name ? -1 : (a.name.localeCompare(b.name)))
const intCmp = sorting.direction == "up" const intCmp = sorting.direction == "asc"
? (a, b) => a[sorting.field] - b[sorting.field] ? (a, b) => a[sorting.field] - b[sorting.field]
: (a, b) => b[sorting.field] - a[sorting.field]; : (a, b) => b[sorting.field] - a[sorting.field];
@@ -141,7 +167,7 @@
> >
{#if sorting?.field == "id"} {#if sorting?.field == "id"}
<!-- Note on Icon-Name: Arrow-indicator always down, only alpha-indicator switches --> <!-- Note on Icon-Name: Arrow-indicator always down, only alpha-indicator switches -->
<Icon name={`sort-alpha-${sorting?.direction == 'down' ? 'down' : 'down-alt'}`} /> <Icon name={`sort-alpha-${sorting?.direction == 'desc' ? 'down' : 'down-alt'}`} />
{:else} {:else}
<Icon name="three-dots-vertical" /> <Icon name="three-dots-vertical" />
{/if} {/if}
@@ -156,7 +182,7 @@
onclick={() => changeSorting("name")} onclick={() => changeSorting("name")}
> >
{#if sorting?.field == "name"} {#if sorting?.field == "name"}
<Icon name={`sort-alpha-${sorting?.direction == 'down' ? 'down' : 'down-alt'}`} /> <Icon name={`sort-alpha-${sorting?.direction == 'desc' ? 'down' : 'down-alt'}`} />
{:else} {:else}
<Icon name="three-dots-vertical" /> <Icon name="three-dots-vertical" />
{/if} {/if}
@@ -172,12 +198,66 @@
> >
{#if sorting?.field == "totalJobs"} {#if sorting?.field == "totalJobs"}
<!-- Note on Icon-Name: Arrow-indicator always down, only numeric-indicator switches --> <!-- Note on Icon-Name: Arrow-indicator always down, only numeric-indicator switches -->
<Icon name={`sort-numeric-${sorting?.direction == 'down' ? 'down-alt' : 'down'}`} /> <Icon name={`sort-numeric-${sorting?.direction == 'desc' ? 'down-alt' : 'down'}`} />
{:else} {:else}
<Icon name="three-dots-vertical" /> <Icon name="three-dots-vertical" />
{/if} {/if}
</Button> </Button>
</th> </th>
<th scope="col">
<span class="mr-1">
Short Jobs
<Icon id="shortjobs-info" style="cursor:help;" size="sm" name="info-circle"/>
</span>
<Tooltip target={`shortjobs-info`} placement="top">
Job duration less than {formatDurationTime(shortDuration)}
</Tooltip>
&#8239; <!-- Narrow Non-Breaking Space -->
<Button
color={sorting.field == "shortJobs" ? "primary" : "light"}
size="sm"
onclick={() => changeSorting("shortJobs")}
>
{#if sorting?.field == "shortJobs"}
<!-- Note on Icon-Name: Arrow-indicator always down, only numeric-indicator switches -->
<Icon name={`sort-numeric-${sorting?.direction == 'desc' ? 'down-alt' : 'down'}`} />
{:else}
<Icon name="three-dots-vertical" />
{/if}
</Button>
</th>
{#if fetchRunning}
<th scope="col">
Total Cores
<Button
color={sorting.field == "totalCores" ? "primary" : "light"}
size="sm"
onclick={() => changeSorting("totalCores")}
>
{#if sorting?.field == "totalJCores"}
<!-- Note on Icon-Name: Arrow-indicator always down, only numeric-indicator switches -->
<Icon name={`sort-numeric-${sorting?.direction == 'desc' ? 'down-alt' : 'down'}`} />
{:else}
<Icon name="three-dots-vertical" />
{/if}
</Button>
</th>
<th scope="col">
Total Accelerators
<Button
color={sorting.field == "totalAccs" ? "primary" : "light"}
size="sm"
onclick={() => changeSorting("totalAccs")}
>
{#if sorting?.field == "totalAccs"}
<!-- Note on Icon-Name: Arrow-indicator always down, only numeric-indicator switches -->
<Icon name={`sort-numeric-${sorting?.direction == 'desc' ? 'down-alt' : 'down'}`} />
{:else}
<Icon name="three-dots-vertical" />
{/if}
</Button>
</th>
{/if}
<th scope="col"> <th scope="col">
Total Walltime Total Walltime
<Button <Button
@@ -186,7 +266,7 @@
onclick={() => changeSorting("totalWalltime")} onclick={() => changeSorting("totalWalltime")}
> >
{#if sorting?.field == "totalWalltime"} {#if sorting?.field == "totalWalltime"}
<Icon name={`sort-numeric-${sorting?.direction == 'down' ? 'down-alt' : 'down'}`} /> <Icon name={`sort-numeric-${sorting?.direction == 'desc' ? 'down-alt' : 'down'}`} />
{:else} {:else}
<Icon name="three-dots-vertical" /> <Icon name="three-dots-vertical" />
{/if} {/if}
@@ -200,7 +280,7 @@
onclick={() => changeSorting("totalCoreHours")} onclick={() => changeSorting("totalCoreHours")}
> >
{#if sorting?.field == "totalCoreHours"} {#if sorting?.field == "totalCoreHours"}
<Icon name={`sort-numeric-${sorting?.direction == 'down' ? 'down-alt' : 'down'}`} /> <Icon name={`sort-numeric-${sorting?.direction == 'desc' ? 'down-alt' : 'down'}`} />
{:else} {:else}
<Icon name="three-dots-vertical" /> <Icon name="three-dots-vertical" />
{/if} {/if}
@@ -214,7 +294,7 @@
onclick={() => changeSorting("totalAccHours")} onclick={() => changeSorting("totalAccHours")}
> >
{#if sorting?.field == "totalAccHours"} {#if sorting?.field == "totalAccHours"}
<Icon name={`sort-numeric-${sorting?.direction == 'down' ? 'down-alt' : 'down'}`} /> <Icon name={`sort-numeric-${sorting?.direction == 'desc' ? 'down-alt' : 'down'}`} />
{:else} {:else}
<Icon name="three-dots-vertical" /> <Icon name="three-dots-vertical" />
{/if} {/if}
@@ -225,11 +305,11 @@
<tbody> <tbody>
{#if $stats.fetching} {#if $stats.fetching}
<tr> <tr>
<td colspan="4" style="text-align: center;"><Spinner secondary /></td> <td colspan={numCols} style="text-align: center;"><Spinner secondary /></td>
</tr> </tr>
{:else if $stats.error} {:else if $stats.error}
<tr> <tr>
<td colspan="4" <td colspan={numCols}
><Card body color="danger" class="mb-3">{$stats.error.message}</Card ><Card body color="danger" class="mb-3">{$stats.error.message}</Card
></td ></td
> >
@@ -260,13 +340,18 @@
> >
{/if} {/if}
<td>{row.totalJobs}</td> <td>{row.totalJobs}</td>
<td>{row.shortJobs}</td>
{#if fetchRunning}
<td>{row.totalCores}</td>
<td>{row.totalAccs}</td>
{/if}
<td>{row.totalWalltime}</td> <td>{row.totalWalltime}</td>
<td>{row.totalCoreHours}</td> <td>{row.totalCoreHours}</td>
<td>{row.totalAccHours}</td> <td>{row.totalAccHours}</td>
</tr> </tr>
{:else} {:else}
<tr> <tr>
<td colspan="4"><i>No {type.toLowerCase()}s/jobs found</i></td> <td colspan={numCols}><i>No {type.toLowerCase()}s/jobs found</i></td>
</tr> </tr>
{/each} {/each}
{/if} {/if}

View File

@@ -32,7 +32,7 @@
 	let {
 		matchedListJobs = $bindable(0),
 		selectedJobs = $bindable([]),
-		metrics = getContext("cc-config").metricConfig_jobListMetrics,
+		metrics = [],
 		sorting = { field: "startTime", type: "col", order: "DESC" },
 		showFootprint = false,
 		filterBuffer = [],
@@ -109,7 +109,7 @@
 	let paging = $derived({ itemsPerPage, page });
 	const plotWidth = $derived.by(() => {
 		return Math.floor(
-			(tableWidth - jobInfoColumnWidth) / (metrics.length + (showFootprint ? 1 : 0)) - 10,
+			(tableWidth - jobInfoColumnWidth) / (metrics.length + (showFootprint ? 2 : 1)) - 10,
 		);
 	});
 	let jobsStore = $derived(queryStore({

View File

@@ -133,7 +133,7 @@
 	}
 </script>
 
-<Card class="mt-1 overflow-auto" style="width: {width}; height: {height}">
+<Card class="mx-2 overflow-auto" style="width: {width}; height: {height}">
 	{#if displayTitle}
 		<CardHeader>
 			<CardTitle class="mb-0 d-flex justify-content-center">

View File

@@ -79,6 +79,7 @@
/* Derived */ /* Derived */
const jobId = $derived(job?.id); const jobId = $derived(job?.id);
const refinedData = $derived($metricsQuery?.data?.jobMetrics ? sortAndSelectScope($metricsQuery.data.jobMetrics) : []);
const scopes = $derived.by(() => { const scopes = $derived.by(() => {
if (job.numNodes == 1) { if (job.numNodes == 1) {
if (job.numAcc >= 1) return ["core", "accelerator"]; if (job.numAcc >= 1) return ["core", "accelerator"];
@@ -202,10 +203,15 @@
/> />
</td> </td>
{/if} {/if}
{#each sortAndSelectScope($metricsQuery.data.jobMetrics) as metric, i (metric?.name || i)} {#each refinedData as metric, i (metric?.name || i)}
<td> <td>
<!-- Subluster Metricconfig remove keyword for jobtables (joblist main, user joblist, project joblist) to be used here as toplevel case--> {#key metric}
{#if metric.disabled == false && metric.data} {#if metric?.data}
{#if metric?.disabled}
<Card body class="mx-2" color="info">
Metric <b>{metric.data.name}</b>: Disabled for subcluster <code>{job.subCluster}</code>
</Card>
{:else}
<MetricPlot <MetricPlot
onZoom={(detail) => handleZoom(detail, metric.data.name)} onZoom={(detail) => handleZoom(detail, metric.data.name)}
height={plotHeight} height={plotHeight}
@@ -222,12 +228,7 @@
zoomState={zoomStates[metric.data.name] || null} zoomState={zoomStates[metric.data.name] || null}
thresholdState={thresholdStates[metric.data.name] || null} thresholdState={thresholdStates[metric.data.name] || null}
/> />
{:else if metric.disabled == true && metric.data} {/if}
<Card body color="info"
>Metric disabled for subcluster <code
>{metric.data.name}:{job.subCluster}</code
></Card
>
{:else} {:else}
<Card body class="mx-2" color="warning"> <Card body class="mx-2" color="warning">
<p>No dataset(s) returned for <b>{metrics[i]}</b></p> <p>No dataset(s) returned for <b>{metrics[i]}</b></p>
@@ -236,6 +237,11 @@
<p class="mb-1">Identical messages in <i>job {job.jobId} row</i>: Host not found.</p> <p class="mb-1">Identical messages in <i>job {job.jobId} row</i>: Host not found.</p>
</Card> </Card>
{/if} {/if}
{/key}
</td>
{:else}
<td>
<Card body class="mx-2">No metrics selected for display.</Card>
</td> </td>
{/each} {/each}
{/if} {/if}

View File

@@ -79,7 +79,7 @@
 	// X
 	let pendingSeries = [
 		{
-			label: "Runtime",
+			label: "Time",
 			value: (u, ts, sidx, didx) =>
 				(didx == null) ? null : formatDurationTime(ts, forNode),
 		}

View File

@@ -34,6 +34,9 @@
/*Const Init */ /*Const Init */
const { query: initq } = init(); const { query: initq } = init();
const useCbColors = getContext("cc-config")?.plotConfiguration_colorblindMode || false const useCbColors = getContext("cc-config")?.plotConfiguration_colorblindMode || false
/* Derived */
const subClusters = $derived($initq?.data?.clusters?.find((c) => c.name == presetCluster)?.subClusters || []);
</script> </script>
<!-- Loading indicator & Refresh --> <!-- Loading indicator & Refresh -->
@@ -66,12 +69,22 @@
</CardBody> </CardBody>
</TabPane> </TabPane>
<TabPane tabId="usage-dash" tab="Usage"> <TabPane tabId="usage-dash" tab="Cluster Usage">
<CardBody> <CardBody>
<UsageDash {presetCluster} {useCbColors}></UsageDash> <UsageDash {presetCluster} {useCbColors}></UsageDash>
</CardBody> </CardBody>
</TabPane> </TabPane>
{#if subClusters?.length > 1}
{#each subClusters.map(sc => sc.name) as scn}
<TabPane tabId="{scn}-usage-dash" tab="{scn.charAt(0).toUpperCase() + scn.slice(1)} Usage">
<CardBody>
<UsageDash {presetCluster} presetSubCluster={scn} {useCbColors}></UsageDash>
</CardBody>
</TabPane>
{/each}
{/if}
<TabPane tabId="metric-dash" tab="Statistics"> <TabPane tabId="metric-dash" tab="Statistics">
<CardBody> <CardBody>
<StatisticsDash {presetCluster} {useCbColors}></StatisticsDash> <StatisticsDash {presetCluster} {useCbColors}></StatisticsDash>

View File

@@ -3,6 +3,9 @@
Properties: Properties:
- `presetCluster String`: The cluster to show status information for - `presetCluster String`: The cluster to show status information for
- `presetSubCluster String?`: The subCluster to show status information for [Default: null]
- `useCbColors Bool?`: Use colorblind friendly colors [Default: false]
- `useAltColors Bool?`: Use alternative color set [Default: false]
--> -->
<script> <script>
@@ -35,6 +38,7 @@
/* Svelte 5 Props */ /* Svelte 5 Props */
let { let {
presetCluster, presetCluster,
presetSubCluster = null,
useCbColors = false, useCbColors = false,
useAltColors = false useAltColors = false
} = $props(); } = $props();
@@ -52,7 +56,12 @@
let numDurationBins = $state("1h"); let numDurationBins = $state("1h");
/* Derived */ /* Derived */
let cluster = $derived(presetCluster) const canvasPrefix = $derived(`${presetCluster}-${presetSubCluster ? presetSubCluster : ''}`)
const statusFilter = $derived(presetSubCluster
? [{ state: ["running"] }, { cluster: { eq: presetCluster} }, { partition: { eq: presetSubCluster } }]
: [{ state: ["running"] }, { cluster: { eq: presetCluster} }]
);
const topJobsQuery = $derived(queryStore({ const topJobsQuery = $derived(queryStore({
client: client, client: client,
query: gql` query: gql`
@@ -82,7 +91,7 @@
} }
`, `,
variables: { variables: {
filter: [{ state: ["running"] }, { cluster: { eq: cluster} }], filter: statusFilter,
paging: pagingState // Top 10 paging: pagingState // Top 10
}, },
requestPolicy: "network-only" requestPolicy: "network-only"
@@ -117,7 +126,7 @@
} }
`, `,
variables: { variables: {
filter: [{ state: ["running"] }, { cluster: { eq: cluster } }], filter: statusFilter,
paging: pagingState paging: pagingState
}, },
requestPolicy: "network-only" requestPolicy: "network-only"
@@ -152,7 +161,7 @@
} }
`, `,
variables: { variables: {
filter: [{ state: ["running"] }, { cluster: { eq: cluster } }], filter: statusFilter,
paging: pagingState paging: pagingState
}, },
requestPolicy: "network-only" requestPolicy: "network-only"
@@ -184,7 +193,7 @@
} }
`, `,
variables: { variables: {
filter: [{ state: ["running"] }, { cluster: { eq: cluster } }], filter: statusFilter,
selectedHistograms: selectedHistograms, // No Metrics requested for node hardware stats selectedHistograms: selectedHistograms, // No Metrics requested for node hardware stats
numDurationBins: numDurationBins, numDurationBins: numDurationBins,
}, },
@@ -264,7 +273,7 @@
</h4> </h4>
<Pie <Pie
{useAltColors} {useAltColors}
canvasId="hpcpie-jobs-users" canvasId="{canvasPrefix}-hpcpie-jobs-users"
size={colWidthJobs * 0.75} size={colWidthJobs * 0.75}
sliceLabel="Jobs" sliceLabel="Jobs"
quantities={$topJobsQuery.data.topUser.map( quantities={$topJobsQuery.data.topUser.map(
@@ -284,14 +293,14 @@
{#each $topJobsQuery.data.topUser as tu, i} {#each $topJobsQuery.data.topUser as tu, i}
<tr> <tr>
<td><Icon name="circle-fill" style="color: {legendColors(i)};" /></td> <td><Icon name="circle-fill" style="color: {legendColors(i)};" /></td>
<td id="topName-jobs-{tu.id}"> <td id="{canvasPrefix}-topName-jobs-{tu.id}">
<a target="_blank" href="/monitoring/user/{tu.id}?cluster={cluster}&state=running" <a target="_blank" href="/monitoring/user/{tu.id}?cluster={presetCluster}{presetSubCluster ? '&partition='+presetSubCluster : ''}&state=running"
>{scrambleNames ? scramble(tu.id) : tu.id} >{scrambleNames ? scramble(tu.id) : tu.id}
</a> </a>
</td> </td>
{#if tu?.name} {#if tu?.name}
<Tooltip <Tooltip
target={`topName-jobs-${tu.id}`} target={`${canvasPrefix}-topName-jobs-${tu.id}`}
placement="left" placement="left"
>{scrambleNames ? scramble(tu.name) : tu.name}</Tooltip >{scrambleNames ? scramble(tu.name) : tu.name}</Tooltip
> >
@@ -308,7 +317,7 @@
</h4> </h4>
<Pie <Pie
{useAltColors} {useAltColors}
canvasId="hpcpie-jobs-projects" canvasId="{canvasPrefix}-hpcpie-jobs-projects"
size={colWidthJobs * 0.75} size={colWidthJobs * 0.75}
sliceLabel={'Jobs'} sliceLabel={'Jobs'}
quantities={$topJobsQuery.data.topProjects.map( quantities={$topJobsQuery.data.topProjects.map(
@@ -328,7 +337,7 @@
<tr> <tr>
<td><Icon name="circle-fill" style="color: {legendColors(i)};" /></td> <td><Icon name="circle-fill" style="color: {legendColors(i)};" /></td>
<td> <td>
<a target="_blank" href="/monitoring/jobs/?cluster={cluster}&state=running&project={tp.id}&projectMatch=eq" <a target="_blank" href="/monitoring/jobs/?cluster={presetCluster}{presetSubCluster ? '&partition='+presetSubCluster : ''}&state=running&project={tp.id}&projectMatch=eq"
>{scrambleNames ? scramble(tp.id) : tp.id} >{scrambleNames ? scramble(tp.id) : tp.id}
</a> </a>
</td> </td>
@@ -368,7 +377,7 @@
</h4> </h4>
<Pie <Pie
{useAltColors} {useAltColors}
canvasId="hpcpie-nodes-users" canvasId="{canvasPrefix}-hpcpie-nodes-users"
size={colWidthNodes * 0.75} size={colWidthNodes * 0.75}
sliceLabel="Nodes" sliceLabel="Nodes"
quantities={$topNodesQuery.data.topUser.map( quantities={$topNodesQuery.data.topUser.map(
@@ -388,14 +397,14 @@
{#each $topNodesQuery.data.topUser as tu, i} {#each $topNodesQuery.data.topUser as tu, i}
<tr> <tr>
<td><Icon name="circle-fill" style="color: {legendColors(i)};" /></td> <td><Icon name="circle-fill" style="color: {legendColors(i)};" /></td>
<td id="topName-nodes-{tu.id}"> <td id="{canvasPrefix}-topName-nodes-{tu.id}">
<a target="_blank" href="/monitoring/user/{tu.id}?cluster={cluster}&state=running" <a target="_blank" href="/monitoring/user/{tu.id}?cluster={presetCluster}{presetSubCluster ? '&partition='+presetSubCluster : ''}&state=running"
>{scrambleNames ? scramble(tu.id) : tu.id} >{scrambleNames ? scramble(tu.id) : tu.id}
</a> </a>
</td> </td>
{#if tu?.name} {#if tu?.name}
<Tooltip <Tooltip
target={`topName-nodes-${tu.id}`} target={`${canvasPrefix}-topName-nodes-${tu.id}`}
placement="left" placement="left"
>{scrambleNames ? scramble(tu.name) : tu.name}</Tooltip >{scrambleNames ? scramble(tu.name) : tu.name}</Tooltip
> >
@@ -412,7 +421,7 @@
</h4> </h4>
<Pie <Pie
{useAltColors} {useAltColors}
canvasId="hpcpie-nodes-projects" canvasId="{canvasPrefix}-hpcpie-nodes-projects"
size={colWidthNodes * 0.75} size={colWidthNodes * 0.75}
sliceLabel={'Nodes'} sliceLabel={'Nodes'}
quantities={$topNodesQuery.data.topProjects.map( quantities={$topNodesQuery.data.topProjects.map(
@@ -432,7 +441,7 @@
<tr> <tr>
<td><Icon name="circle-fill" style="color: {legendColors(i)};" /></td> <td><Icon name="circle-fill" style="color: {legendColors(i)};" /></td>
<td> <td>
<a target="_blank" href="/monitoring/jobs/?cluster={cluster}&state=running&project={tp.id}&projectMatch=eq" <a target="_blank" href="/monitoring/jobs/?cluster={presetCluster}{presetSubCluster ? '&partition='+presetSubCluster : ''}&state=running&project={tp.id}&projectMatch=eq"
>{scrambleNames ? scramble(tp.id) : tp.id} >{scrambleNames ? scramble(tp.id) : tp.id}
</a> </a>
</td> </td>
@@ -472,7 +481,7 @@
</h4> </h4>
<Pie <Pie
{useAltColors} {useAltColors}
canvasId="hpcpie-accs-users" canvasId="{canvasPrefix}-hpcpie-accs-users"
size={colWidthAccs * 0.75} size={colWidthAccs * 0.75}
sliceLabel="GPUs" sliceLabel="GPUs"
quantities={$topAccsQuery.data.topUser.map( quantities={$topAccsQuery.data.topUser.map(
@@ -492,14 +501,14 @@
{#each $topAccsQuery.data.topUser as tu, i} {#each $topAccsQuery.data.topUser as tu, i}
<tr> <tr>
<td><Icon name="circle-fill" style="color: {legendColors(i)};" /></td> <td><Icon name="circle-fill" style="color: {legendColors(i)};" /></td>
<td id="topName-accs-{tu.id}"> <td id="{canvasPrefix}-topName-accs-{tu.id}">
<a target="_blank" href="/monitoring/user/{tu.id}?cluster={cluster}&state=running" <a target="_blank" href="/monitoring/user/{tu.id}?cluster={presetCluster}{presetSubCluster ? '&partition='+presetSubCluster : ''}&state=running"
>{scrambleNames ? scramble(tu.id) : tu.id} >{scrambleNames ? scramble(tu.id) : tu.id}
</a> </a>
</td> </td>
{#if tu?.name} {#if tu?.name}
<Tooltip <Tooltip
target={`topName-accs-${tu.id}`} target={`${canvasPrefix}-topName-accs-${tu.id}`}
placement="left" placement="left"
>{scrambleNames ? scramble(tu.name) : tu.name}</Tooltip >{scrambleNames ? scramble(tu.name) : tu.name}</Tooltip
> >
@@ -516,7 +525,7 @@
</h4> </h4>
<Pie <Pie
{useAltColors} {useAltColors}
canvasId="hpcpie-accs-projects" canvasId="{canvasPrefix}-hpcpie-accs-projects"
size={colWidthAccs * 0.75} size={colWidthAccs * 0.75}
sliceLabel={'GPUs'} sliceLabel={'GPUs'}
quantities={$topAccsQuery.data.topProjects.map( quantities={$topAccsQuery.data.topProjects.map(
@@ -536,7 +545,7 @@
<tr> <tr>
<td><Icon name="circle-fill" style="color: {legendColors(i)};" /></td> <td><Icon name="circle-fill" style="color: {legendColors(i)};" /></td>
<td> <td>
<a target="_blank" href="/monitoring/jobs/?cluster={cluster}&state=running&project={tp.id}&projectMatch=eq" <a target="_blank" href="/monitoring/jobs/?cluster={presetCluster}{presetSubCluster ? '&partition='+presetSubCluster : ''}&state=running&project={tp.id}&projectMatch=eq"
>{scrambleNames ? scramble(tp.id) : tp.id} >{scrambleNames ? scramble(tp.id) : tp.id}
</a> </a>
</td> </td>

View File

@@ -69,9 +69,9 @@
 		})
 	);
 
-	let extendedLegendData = $derived($nodeJobsData?.data ? buildExtendedLegend() : null);
-	let refinedData = $derived(nodeData?.metrics ? sortAndSelectScope(nodeData.metrics) : null);
-	let dataHealth = $derived(refinedData.filter((rd) => rd.disabled === false).map((enabled) => (enabled?.data?.metric?.series?.length > 0)));
+	const extendedLegendData = $derived($nodeJobsData?.data ? buildExtendedLegend() : null);
+	const refinedData = $derived(nodeData?.metrics ? sortAndSelectScope(nodeData.metrics) : []);
+	const dataHealth = $derived(refinedData.filter((rd) => rd.disabled === false).map((enabled) => (enabled?.data?.metric?.series?.length > 0)));
 
 	/* Functions */
 	const selectScope = (nodeMetrics) =>