Merge pull request #481 from ClusterCockpit/dev

Dev
This commit is contained in:
Jan Eitzinger
2026-02-04 19:46:02 +01:00
committed by GitHub
19 changed files with 862 additions and 234 deletions

.gitignore vendored
View File

@@ -13,7 +13,7 @@
/var/checkpoints*
migrateTimestamps.pl
test_ccms_write_api*
test_ccms_*
/web/frontend/public/build
/web/frontend/node_modules

View File

@@ -135,36 +135,3 @@ func debugMetrics(rw http.ResponseWriter, r *http.Request) {
return
}
}
// handleHealthCheck godoc
// @summary HealthCheck endpoint
// @tags healthcheck
// @description This endpoint allows the users to check if a node is healthy
// @produce json
// @param selector query string false "Selector"
// @success 200 {string} string "Debug dump"
// @failure 400 {object} api.ErrorResponse "Bad Request"
// @failure 401 {object} api.ErrorResponse "Unauthorized"
// @failure 403 {object} api.ErrorResponse "Forbidden"
// @failure 500 {object} api.ErrorResponse "Internal Server Error"
// @security ApiKeyAuth
// @router /healthcheck/ [get]
func metricsHealth(rw http.ResponseWriter, r *http.Request) {
rawCluster := r.URL.Query().Get("cluster")
rawNode := r.URL.Query().Get("node")
if rawCluster == "" || rawNode == "" {
handleError(errors.New("'cluster' and 'node' are required query parameter"), http.StatusBadRequest, rw)
return
}
rw.Header().Add("Content-Type", "application/json")
selector := []string{rawCluster, rawNode}
ms := metricstore.GetMemoryStore()
if err := ms.HealthCheck(bufio.NewWriter(rw), selector); err != nil {
handleError(err, http.StatusBadRequest, rw)
return
}
}

View File

@@ -324,11 +324,12 @@ func (api *NatsAPI) processNodestateEvent(msg lp.CCMessage) {
}
repo := repository.GetNodeRepository()
requestReceived := time.Now().Unix()
for _, node := range req.Nodes {
state := determineState(node.States)
nodeState := schema.NodeStateDB{
TimeStamp: time.Now().Unix(),
TimeStamp: requestReceived,
NodeState: state,
CpusAllocated: node.CpusAllocated,
MemoryAllocated: node.MemoryAllocated,

View File

@@ -7,11 +7,14 @@ package api
import (
"fmt"
"maps"
"net/http"
"strings"
"time"
"github.com/ClusterCockpit/cc-backend/internal/repository"
"github.com/ClusterCockpit/cc-backend/pkg/archive"
"github.com/ClusterCockpit/cc-backend/pkg/metricstore"
"github.com/ClusterCockpit/cc-lib/v2/schema"
)
@@ -20,6 +23,15 @@ type UpdateNodeStatesRequest struct {
Cluster string `json:"cluster" example:"fritz"`
}
// metricListToNames converts a map of metric configurations to a list of metric names
func metricListToNames(metricList map[string]*schema.Metric) []string {
names := make([]string, 0, len(metricList))
for name := range metricList {
names = append(names, name)
}
return names
}
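// A minimal usage sketch (hypothetical values): metricListToNames simply extracts the
// configured metric names, e.g. for use as the expectedMetrics argument of the metric
// store health check further below.
//
//	metricList := map[string]*schema.Metric{"load": {}, "mem_used": {}}
//	names := metricListToNames(metricList) // -> []string{"load", "mem_used"} (order not guaranteed)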
// this routine assumes that only one of them exists per node
func determineState(states []string) schema.SchedulerState {
for _, state := range states {
@@ -62,16 +74,42 @@ func (api *RestAPI) updateNodeStates(rw http.ResponseWriter, r *http.Request) {
http.StatusBadRequest, rw)
return
}
requestReceived := time.Now().Unix()
repo := repository.GetNodeRepository()
ms := metricstore.GetMemoryStore()
m := make(map[string][]string)
healthStates := make(map[string]schema.MonitoringState)
for _, node := range req.Nodes {
if sc, err := archive.GetSubClusterByNode(req.Cluster, node.Hostname); err == nil {
m[sc] = append(m[sc], node.Hostname)
}
}
for sc, nl := range m {
if sc != "" {
metricList := archive.GetMetricConfigSubCluster(req.Cluster, sc)
metricNames := metricListToNames(metricList)
if states, err := ms.HealthCheck(req.Cluster, nl, metricNames); err == nil {
maps.Copy(healthStates, states)
}
}
}
for _, node := range req.Nodes {
state := determineState(node.States)
healthState := schema.MonitoringStateFailed
if hs, ok := healthStates[node.Hostname]; ok {
healthState = hs
}
nodeState := schema.NodeStateDB{
TimeStamp: time.Now().Unix(), NodeState: state,
TimeStamp: requestReceived,
NodeState: state,
CpusAllocated: node.CpusAllocated,
MemoryAllocated: node.MemoryAllocated,
GpusAllocated: node.GpusAllocated,
HealthState: schema.MonitoringStateFull,
HealthState: healthState,
JobsRunning: node.JobsRunning,
}

View File

@@ -81,7 +81,7 @@ func (api *RestAPI) MountAPIRoutes(r *mux.Router) {
// Cluster List
r.HandleFunc("/clusters/", api.getClusters).Methods(http.MethodGet)
// Slurm node state
r.HandleFunc("/nodestate/", api.updateNodeStates).Methods(http.MethodPost, http.MethodPut)
r.HandleFunc("/nodestates/", api.updateNodeStates).Methods(http.MethodPost, http.MethodPut)
// Job Handler
if config.Keys.APISubjects == nil {
cclog.Info("Enabling REST start/stop job API")
@@ -127,12 +127,12 @@ func (api *RestAPI) MountMetricStoreAPIRoutes(r *mux.Router) {
r.HandleFunc("/free", freeMetrics).Methods(http.MethodPost)
r.HandleFunc("/write", writeMetrics).Methods(http.MethodPost)
r.HandleFunc("/debug", debugMetrics).Methods(http.MethodGet)
r.HandleFunc("/healthcheck", metricsHealth).Methods(http.MethodGet)
r.HandleFunc("/healthcheck", api.updateNodeStates).Methods(http.MethodPost)
// Same endpoints but with trailing slash
r.HandleFunc("/free/", freeMetrics).Methods(http.MethodPost)
r.HandleFunc("/write/", writeMetrics).Methods(http.MethodPost)
r.HandleFunc("/debug/", debugMetrics).Methods(http.MethodGet)
r.HandleFunc("/healthcheck/", metricsHealth).Methods(http.MethodGet)
r.HandleFunc("/healthcheck/", api.updateNodeStates).Methods(http.MethodPost)
}
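// A minimal client sketch for the renamed endpoint (the base URL, bearer-token auth, and the
// "nodes" field name are assumptions; only the "cluster" field is confirmed by the
// UpdateNodeStatesRequest struct tag):
//
//	body := strings.NewReader(`{"cluster": "fritz", "nodes": []}`)
//	req, _ := http.NewRequest(http.MethodPost, baseURL+"/nodestates/", body)
//	req.Header.Set("Content-Type", "application/json")
//	req.Header.Set("Authorization", "Bearer "+jwt)
//	resp, err := http.DefaultClient.Do(req)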
// MountConfigAPIRoutes registers configuration and user management endpoints.

View File

@@ -923,15 +923,19 @@ func (r *queryResolver) ClusterMetrics(ctx context.Context, cluster string, metr
if !okData && len(ser.Data) != 0 {
collectorData[metric] = make([]schema.Float, len(ser.Data))
} else if !okData {
cclog.Debugf("ClusterMetrics Skip Init: No Data -> %s at %s; Size %d", metric, ser.Hostname, len(ser.Data))
cclog.Debugf("[SCHEMARESOLVER] clusterMetrics skip init: no data -> %s at %s; size %d", metric, ser.Hostname, len(ser.Data))
}
// Sum if init'd and matching size
if okData && len(ser.Data) == len(collectorData[metric]) {
for i, val := range ser.Data {
collectorData[metric][i] += val
if val.IsNaN() {
continue
} else {
collectorData[metric][i] += val
}
}
} else if okData {
cclog.Debugf("ClusterMetrics Skip Sum: Data Diff -> %s at %s; Want Size %d, Have Size %d", metric, ser.Hostname, len(collectorData[metric]), len(ser.Data))
cclog.Debugf("[SCHEMARESOLVER] clusterMetrics skip sum: data diff -> %s at %s; want size %d, have size %d", metric, ser.Hostname, len(collectorData[metric]), len(ser.Data))
}
}
}

View File

@@ -466,7 +466,7 @@ func (r *JobRepository) JobCountGrouped(
// AddJobCountGrouped augments existing statistics with additional job counts by category.
//
// This method enriches JobsStatistics returned by JobsStatsGrouped or JobCountGrouped
// with counts of running or short-running jobs, matched by group ID.
// with counts of running or short-running (based on ShortRunningJobsDuration) jobs, matched by group ID.
//
// Parameters:
// - ctx: Context for security checks

View File

@@ -158,8 +158,7 @@ func cleanupCheckpoints(dir string, cleanupDir string, from int64, deleteInstead
return 0, err
}
extension := Keys.Checkpoints.FileFormat
files, err := findFiles(entries, from, extension, false)
files, err := findFiles(entries, from, false)
if err != nil {
return 0, err
}

View File

@@ -415,7 +415,7 @@ func enqueueCheckpointHosts(dir string, work chan<- [2]string) error {
//
// Uses worker pool to load cluster/host combinations. Periodically triggers GC
// to prevent excessive heap growth. Returns number of files loaded and any errors.
func (m *MemoryStore) FromCheckpoint(dir string, from int64, extension string) (int, error) {
func (m *MemoryStore) FromCheckpoint(dir string, from int64) (int, error) {
var wg sync.WaitGroup
work := make(chan [2]string, Keys.NumWorkers*4)
n, errs := int32(0), int32(0)
@@ -426,7 +426,7 @@ func (m *MemoryStore) FromCheckpoint(dir string, from int64, extension string) (
defer wg.Done()
for host := range work {
lvl := m.root.findLevelOrCreate(host[:], len(m.Metrics))
nn, err := lvl.fromCheckpoint(m, filepath.Join(dir, host[0], host[1]), from, extension)
nn, err := lvl.fromCheckpoint(m, filepath.Join(dir, host[0], host[1]), from)
if err != nil {
cclog.Errorf("[METRICSTORE]> error while loading checkpoints for %s/%s: %s", host[0], host[1], err.Error())
atomic.AddInt32(&errs, 1)
@@ -465,57 +465,7 @@ func (m *MemoryStore) FromCheckpointFiles(dir string, from int64) (int, error) {
cclog.Debugf("[METRICSTORE]> %#v Directory created successfully", dir)
}
// Config read (replace with your actual config read)
fileFormat := Keys.Checkpoints.FileFormat
if fileFormat == "" {
fileFormat = "avro"
}
// Map to easily get the fallback format
oppositeFormat := map[string]string{
"json": "avro",
"avro": "json",
}
// First, attempt to load the specified format
if found, err := checkFilesWithExtension(dir, fileFormat); err != nil {
return 0, fmt.Errorf("[METRICSTORE]> error checking files with extension: %v", err)
} else if found {
cclog.Infof("[METRICSTORE]> Loading %s files because fileformat is %s", fileFormat, fileFormat)
return m.FromCheckpoint(dir, from, fileFormat)
}
// If not found, attempt the opposite format
altFormat := oppositeFormat[fileFormat]
if found, err := checkFilesWithExtension(dir, altFormat); err != nil {
return 0, fmt.Errorf("[METRICSTORE]> error checking files with extension: %v", err)
} else if found {
cclog.Infof("[METRICSTORE]> Loading %s files but fileformat is %s", altFormat, fileFormat)
return m.FromCheckpoint(dir, from, altFormat)
}
return 0, nil
}
// checkFilesWithExtension walks a directory tree to check if files with the given extension exist.
func checkFilesWithExtension(dir string, extension string) (bool, error) {
found := false
err := filepath.Walk(dir, func(path string, info os.FileInfo, err error) error {
if err != nil {
return fmt.Errorf("[METRICSTORE]> error accessing path %s: %v", path, err)
}
if !info.IsDir() && filepath.Ext(info.Name()) == "."+extension {
found = true
return nil
}
return nil
})
if err != nil {
return false, fmt.Errorf("[METRICSTORE]> error walking through directories: %s", err)
}
return found, nil
return m.FromCheckpoint(dir, from)
}
func (l *Level) loadAvroFile(m *MemoryStore, f *os.File, from int64) error {
@@ -729,7 +679,7 @@ func (l *Level) loadFile(cf *CheckpointFile, m *MemoryStore) error {
return nil
}
func (l *Level) fromCheckpoint(m *MemoryStore, dir string, from int64, extension string) (int, error) {
func (l *Level) fromCheckpoint(m *MemoryStore, dir string, from int64) (int, error) {
direntries, err := os.ReadDir(dir)
if err != nil {
if os.IsNotExist(err) {
@@ -748,33 +698,38 @@ func (l *Level) fromCheckpoint(m *MemoryStore, dir string, from int64, extension
children: make(map[string]*Level),
}
files, err := child.fromCheckpoint(m, path.Join(dir, e.Name()), from, extension)
files, err := child.fromCheckpoint(m, path.Join(dir, e.Name()), from)
filesLoaded += files
if err != nil {
return filesLoaded, err
}
l.children[e.Name()] = child
} else if strings.HasSuffix(e.Name(), "."+extension) {
} else if strings.HasSuffix(e.Name(), ".json") || strings.HasSuffix(e.Name(), ".avro") {
allFiles = append(allFiles, e)
} else {
continue
}
}
files, err := findFiles(allFiles, from, extension, true)
files, err := findFiles(allFiles, from, true)
if err != nil {
return filesLoaded, err
}
loaders := map[string]func(*MemoryStore, *os.File, int64) error{
"json": l.loadJSONFile,
"avro": l.loadAvroFile,
".json": l.loadJSONFile,
".avro": l.loadAvroFile,
}
loader := loaders[extension]
for _, filename := range files {
ext := filepath.Ext(filename)
loader := loaders[ext]
if loader == nil {
cclog.Warnf("Unknown extension for file %s", filename)
continue
}
// Use a closure to ensure file is closed immediately after use
err := func() error {
f, err := os.Open(path.Join(dir, filename))
@@ -798,10 +753,12 @@ func (l *Level) fromCheckpoint(m *MemoryStore, dir string, from int64, extension
// This will probably get very slow over time!
// A solution could be some sort of an index file in which all other files
// and the timespan they contain is listed.
func findFiles(direntries []fs.DirEntry, t int64, extension string, findMoreRecentFiles bool) ([]string, error) {
// NOTE: This now assumes that JSON and Avro files have distinct timestamps.
// It also assumes that the timestamps do not overlap and have not been modified by hand.
func findFiles(direntries []fs.DirEntry, t int64, findMoreRecentFiles bool) ([]string, error) {
nums := map[string]int64{}
for _, e := range direntries {
if !strings.HasSuffix(e.Name(), "."+extension) {
if !strings.HasSuffix(e.Name(), ".json") && !strings.HasSuffix(e.Name(), ".avro") {
continue
}

View File

@@ -6,87 +6,260 @@
package metricstore
import (
"bufio"
"cmp"
"fmt"
"slices"
"time"
"github.com/ClusterCockpit/cc-lib/v2/schema"
)
// HealthCheckResponse represents the result of a health check operation.
//
// Status indicates the monitoring state (Full, Partial, Failed).
// Error contains any error encountered during the health check.
type HealthCheckResponse struct {
Status schema.MonitoringState
Error error
}
// MaxMissingDataPoints is a threshold that allows a node to be considered healthy with a certain
// number of data points missing. If a node has not received the last 5 data points, the healthCheck
// endpoint will still report the node as healthy. Any more than 5 missing points in the node's
// metrics will deem the node unhealthy.
const MaxMissingDataPoints int64 = 5
// MaxUnhealthyMetrics is a threshold that allows up to a certain number of metrics in a node to be unhealthy.
// Works together with MaxMissingDataPoints. If up to 5 metrics (including submetrics) have not received the last
// MaxMissingDataPoints data points, the node is still deemed healthy. Any more metrics that have not received
// data for MaxMissingDataPoints data points will deem the node unhealthy.
const MaxUnhealthyMetrics int64 = 5
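// Worked example: for a metric sampled every 10 seconds, the staleness threshold used by
// isBufferHealthy below is MaxMissingDataPoints * frequency = 5 * 10s = 50s, i.e. a buffer
// whose last data point is older than 50 seconds counts as degraded.
//
//	frequency := int64(10)
//	threshold := MaxMissingDataPoints * frequency         // 50 seconds
//	stale := time.Now().Unix()-lastDataPoint > threshold  // lastDataPoint: end of the buffer (hypothetical variable)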
func (b *buffer) healthCheck() int64 {
// bufferExists reports whether a buffer exists and holds any data.
//
// Returns false for a nil buffer or a buffer without data.
func (b *buffer) bufferExists() bool {
// Check if the buffer is empty
if b.data == nil {
return 1
if b == nil || b.data == nil || len(b.data) == 0 {
return false
}
return true
}
// isBufferHealthy checks if a buffer has received data for the last MaxMissingDataPoints.
//
// Returns true if the buffer is healthy (recent data within threshold), false otherwise.
// A nil buffer or empty buffer is considered unhealthy.
func (b *buffer) isBufferHealthy() bool {
// Get the last endtime of the buffer
bufferEnd := b.start + b.frequency*int64(len(b.data))
t := time.Now().Unix()
// Check if the buffer is too old
// Check if the buffer has recent data (within MaxMissingDataPoints threshold)
if t-bufferEnd > MaxMissingDataPoints*b.frequency {
return 1
return false
}
return 0
return true
}
func (l *Level) healthCheck(m *MemoryStore, count int64) (int64, error) {
// mergeList merges two lists, sorts them, and removes duplicates.
// Requires 'cmp.Ordered' because we need to sort the data.
func mergeList[string cmp.Ordered](list1, list2 []string) []string {
// 1. Combine both lists
result := append(list1, list2...)
// 2. Sort the combined list
slices.Sort(result)
// 3. Compact removes consecutive duplicates (standard in Go 1.21+)
// e.g. [1, 1, 2, 3, 3] -> [1, 2, 3]
result = slices.Compact(result)
return result
}
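// A minimal usage sketch: merging the missing-metric lists reported by two child levels of the
// same node removes duplicates and yields a sorted result.
//
//	a := []string{"cpu_user", "load"}
//	b := []string{"cpu_user", "mem_used"}
//	merged := mergeList(a, b) // -> []string{"cpu_user", "load", "mem_used"}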
// getHealthyMetrics recursively collects missing and degraded metrics at this level and below.
//
// A metric is considered:
// - Missing: no buffer exists for the metric at this level
// - Degraded: a buffer exists, but its most recent data is older than MaxMissingDataPoints * frequency
//
// This routine walks the entire subtree starting from the current level.
//
// Parameters:
// - m: MemoryStore containing the global metric configuration
// - expectedMetrics: List of metric names to check at each level
//
// Returns:
// - []string: Flat list of missing metric names from this level and all children
// - []string: Flat list of degraded metric names (buffers exist, but data is stale)
// - error: Non-nil only for internal errors during recursion
//
// The routine provides a more granular classification than a simple pass/fail check:
// - Missing metrics have no buffer at all on this node
// - Degraded metrics have a buffer whose data has gone stale
func (l *Level) getHealthyMetrics(m *MemoryStore, expectedMetrics []string) ([]string, []string, error) {
l.lock.RLock()
defer l.lock.RUnlock()
for _, mc := range m.Metrics {
if b := l.metrics[mc.offset]; b != nil {
count += b.healthCheck()
globalMetrics := m.Metrics
missingList := make([]string, 0)
degradedList := make([]string, 0)
// Phase 1: Check metrics at this level
for _, metricName := range expectedMetrics {
offset := globalMetrics[metricName].offset
b := l.metrics[offset]
if !b.bufferExists() {
missingList = append(missingList, metricName)
} else if !b.isBufferHealthy() {
degradedList = append(degradedList, metricName)
}
}
// Phase 2: Recursively check child levels
for _, lvl := range l.children {
c, err := lvl.healthCheck(m, 0)
childMissing, childDegraded, err := lvl.getHealthyMetrics(m, expectedMetrics)
if err != nil {
return 0, err
return nil, nil, err
}
count += c
missingList = mergeList(missingList, childMissing)
degradedList = mergeList(degradedList, childDegraded)
}
return count, nil
return missingList, degradedList, nil
}
func (m *MemoryStore) HealthCheck(w *bufio.Writer, selector []string) error {
// GetHealthyMetrics returns the missing and degraded metrics for a specific node as flat lists.
//
// This routine walks the metric tree starting from the specified node selector and checks each
// expected metric for data received within the last MaxMissingDataPoints data points
// (default: 5). Expected metrics that fail this check are classified into two categories:
//
// - Missing: No buffer exists for the metric on this node
// - Degraded: A buffer exists, but its most recent data is older than MaxMissingDataPoints * frequency
//
// The returned lists include both node-level metrics (e.g., "load", "mem_used") and
// hardware-level metrics (e.g., "cpu_user", "gpu_temp") in flat slices.
//
// Parameters:
// - selector: Hierarchical path to the target node, typically []string{cluster, hostname}.
// Example: []string{"emmy", "node001"} navigates to the "node001" host in the "emmy" cluster.
// The selector must match the hierarchy used during metric ingestion.
// - expectedMetrics: List of metric names that should be present on the node.
//
// Returns:
// - []string: Flat list of missing metric names (no buffer found)
// - []string: Flat list of degraded metric names (buffer found, but data is stale)
// - error: Non-nil if the node is not found or internal errors occur
//
// Example usage:
//
//	selector := []string{"emmy", "node001"}
//	missingMetrics, degradedMetrics, err := ms.GetHealthyMetrics(selector, expectedMetrics)
//	if err != nil {
//		// Node not found or internal error
//		return err
//	}
//	fmt.Printf("Missing metrics: %v\n", missingMetrics)
//	fmt.Printf("Degraded metrics: %v\n", degradedMetrics)
//
// Note: This routine provides more granular classification than HealthCheck:
// - HealthCheck aggregates the results into a single MonitoringState per node
// - GetHealthyMetrics exposes the underlying lists of missing and degraded metrics
func (m *MemoryStore) GetHealthyMetrics(selector []string, expectedMetrics []string) ([]string, []string, error) {
lvl := m.root.findLevel(selector)
if lvl == nil {
return fmt.Errorf("[METRICSTORE]> not found: %#v", selector)
return nil, nil, fmt.Errorf("[METRICSTORE]> error while GetHealthyMetrics, host not found: %#v", selector)
}
buf := make([]byte, 0, 25)
// buf = append(buf, "{"...)
var count int64 = 0
unhealthyMetricsCount, err := lvl.healthCheck(m, count)
missingList, degradedList, err := lvl.getHealthyMetrics(m, expectedMetrics)
if err != nil {
return err
return nil, nil, err
}
if unhealthyMetricsCount < MaxUnhealthyMetrics {
buf = append(buf, "Healthy"...)
} else {
buf = append(buf, "Unhealthy"...)
}
// buf = append(buf, "}\n"...)
if _, err = w.Write(buf); err != nil {
return err
}
return w.Flush()
return missingList, degradedList, nil
}
// HealthCheck performs health checks on multiple nodes and returns their monitoring states.
//
// This routine provides a batch health check interface that evaluates multiple nodes
// against a specific set of expected metrics. For each node, it determines the overall
// monitoring state based on which metrics are healthy, degraded, or missing.
//
// Health Status Classification:
// - MonitoringStateFull: All expected metrics are healthy (recent data)
// - MonitoringStatePartial: Some expected metrics are degraded (stale data) or missing
// - MonitoringStateFailed: Node not found or all expected metrics are missing/stale
//
// Parameters:
// - cluster: Cluster name (first element of selector path)
// - nodes: List of node hostnames to check
// - expectedMetrics: List of metric names that should be present on each node
//
// Returns:
// - map[string]schema.MonitoringState: Map keyed by hostname containing monitoring state for each node
// - error: Non-nil only for internal errors (individual node failures are captured as MonitoringStateFailed)
//
// Example usage:
//
// cluster := "emmy"
// nodes := []string{"node001", "node002", "node003"}
// expectedMetrics := []string{"load", "mem_used", "cpu_user", "cpu_system"}
// healthStates, err := ms.HealthCheck(cluster, nodes, expectedMetrics)
// if err != nil {
// return err
// }
// for hostname, state := range healthStates {
// fmt.Printf("Node %s: %s\n", hostname, state)
// }
//
// Note: This routine is optimized for batch operations where you need to check
// the same set of metrics across multiple nodes.
func (m *MemoryStore) HealthCheck(cluster string,
nodes []string, expectedMetrics []string,
) (map[string]schema.MonitoringState, error) {
results := make(map[string]schema.MonitoringState, len(nodes))
// Create a set of expected metrics for fast lookup
expectedSet := make(map[string]bool, len(expectedMetrics))
for _, metric := range expectedMetrics {
expectedSet[metric] = true
}
// Check each node
for _, hostname := range nodes {
selector := []string{cluster, hostname}
status := schema.MonitoringStateFull
healthyCount := 0
degradedCount := 0
missingCount := 0
// Get healthy and degraded metrics for this node
missingList, degradedList, err := m.GetHealthyMetrics(selector, expectedMetrics)
if err != nil {
// Node not found or internal error
results[hostname] = schema.MonitoringStateFailed
continue
}
missingCount = len(missingList)
degradedCount = len(degradedList)
healthyCount = len(expectedMetrics) - (missingCount + degradedCount)
// Determine overall health status
if missingCount > 0 || degradedCount > 0 {
if healthyCount == 0 {
// No healthy metrics at all
status = schema.MonitoringStateFailed
} else {
// Some healthy, some degraded/missing
status = schema.MonitoringStatePartial
}
}
// else: all metrics healthy, status remains MonitoringStateFull
results[hostname] = status
}
return results, nil
}

View File

@@ -7,6 +7,7 @@ package metricstore
import (
"testing"
"time"
"github.com/ClusterCockpit/cc-lib/v2/schema"
)
@@ -88,3 +89,378 @@ func TestBufferRead(t *testing.T) {
t.Errorf("buffer.read() len(result) = %d, want 3", len(result))
}
}
func TestHealthCheck(t *testing.T) {
// Create a test MemoryStore with some metrics
metrics := map[string]MetricConfig{
"load": {Frequency: 10, Aggregation: AvgAggregation, offset: 0},
"mem_used": {Frequency: 10, Aggregation: AvgAggregation, offset: 1},
"cpu_user": {Frequency: 10, Aggregation: AvgAggregation, offset: 2},
"cpu_system": {Frequency: 10, Aggregation: AvgAggregation, offset: 3},
}
ms := &MemoryStore{
Metrics: metrics,
root: Level{
metrics: make([]*buffer, len(metrics)),
children: make(map[string]*Level),
},
}
// Use recent timestamps (current time minus a small offset)
now := time.Now().Unix()
startTime := now - 100 // Start 100 seconds ago to have enough data points
// Setup test data for node001 - all metrics healthy (recent data)
node001 := ms.root.findLevelOrCreate([]string{"testcluster", "node001"}, len(metrics))
for i := 0; i < len(metrics); i++ {
node001.metrics[i] = newBuffer(startTime, 10)
// Write recent data up to now
for ts := startTime; ts <= now; ts += 10 {
node001.metrics[i].write(ts, schema.Float(float64(i+1)))
}
}
// Setup test data for node002 - some metrics stale (old data beyond MaxMissingDataPoints threshold)
node002 := ms.root.findLevelOrCreate([]string{"testcluster", "node002"}, len(metrics))
// MaxMissingDataPoints = 5, frequency = 10, so threshold is 50 seconds
staleTime := now - 100 // Data ends 100 seconds ago (well beyond 50 second threshold)
for i := 0; i < len(metrics); i++ {
node002.metrics[i] = newBuffer(staleTime-50, 10)
if i < 2 {
// First two metrics: healthy (recent data)
for ts := startTime; ts <= now; ts += 10 {
node002.metrics[i].write(ts, schema.Float(float64(i+1)))
}
} else {
// Last two metrics: stale (data ends 100 seconds ago)
for ts := staleTime - 50; ts <= staleTime; ts += 10 {
node002.metrics[i].write(ts, schema.Float(float64(i+1)))
}
}
}
// Setup test data for node003 - some metrics missing (no buffer)
node003 := ms.root.findLevelOrCreate([]string{"testcluster", "node003"}, len(metrics))
// Only create buffers for first two metrics
for i := 0; i < 2; i++ {
node003.metrics[i] = newBuffer(startTime, 10)
for ts := startTime; ts <= now; ts += 10 {
node003.metrics[i].write(ts, schema.Float(float64(i+1)))
}
}
// Leave metrics[2] and metrics[3] as nil (missing)
// Setup test data for node005 - all metrics stale
node005 := ms.root.findLevelOrCreate([]string{"testcluster", "node005"}, len(metrics))
for i := 0; i < len(metrics); i++ {
node005.metrics[i] = newBuffer(staleTime-50, 10)
// All metrics have stale data (ends 100 seconds ago)
for ts := staleTime - 50; ts <= staleTime; ts += 10 {
node005.metrics[i].write(ts, schema.Float(float64(i+1)))
}
}
// node004 doesn't exist at all
tests := []struct {
name string
cluster string
nodes []string
expectedMetrics []string
wantStates map[string]schema.MonitoringState
}{
{
name: "all metrics healthy",
cluster: "testcluster",
nodes: []string{"node001"},
expectedMetrics: []string{"load", "mem_used", "cpu_user", "cpu_system"},
wantStates: map[string]schema.MonitoringState{
"node001": schema.MonitoringStateFull,
},
},
{
name: "some metrics stale",
cluster: "testcluster",
nodes: []string{"node002"},
expectedMetrics: []string{"load", "mem_used", "cpu_user", "cpu_system"},
wantStates: map[string]schema.MonitoringState{
"node002": schema.MonitoringStatePartial,
},
},
{
name: "some metrics missing",
cluster: "testcluster",
nodes: []string{"node003"},
expectedMetrics: []string{"load", "mem_used", "cpu_user", "cpu_system"},
wantStates: map[string]schema.MonitoringState{
"node003": schema.MonitoringStatePartial,
},
},
{
name: "node not found",
cluster: "testcluster",
nodes: []string{"node004"},
expectedMetrics: []string{"load", "mem_used", "cpu_user", "cpu_system"},
wantStates: map[string]schema.MonitoringState{
"node004": schema.MonitoringStateFailed,
},
},
{
name: "all metrics stale",
cluster: "testcluster",
nodes: []string{"node005"},
expectedMetrics: []string{"load", "mem_used", "cpu_user", "cpu_system"},
wantStates: map[string]schema.MonitoringState{
"node005": schema.MonitoringStateFailed,
},
},
{
name: "multiple nodes mixed states",
cluster: "testcluster",
nodes: []string{"node001", "node002", "node003", "node004", "node005"},
expectedMetrics: []string{"load", "mem_used"},
wantStates: map[string]schema.MonitoringState{
"node001": schema.MonitoringStateFull,
"node002": schema.MonitoringStateFull, // Only checking first 2 metrics which are healthy
"node003": schema.MonitoringStateFull, // Only checking first 2 metrics which exist
"node004": schema.MonitoringStateFailed, // Node doesn't exist
"node005": schema.MonitoringStateFailed, // Both metrics are stale
},
},
}
for _, tt := range tests {
t.Run(tt.name, func(t *testing.T) {
results, err := ms.HealthCheck(tt.cluster, tt.nodes, tt.expectedMetrics)
if err != nil {
t.Errorf("HealthCheck() error = %v", err)
return
}
// Check that we got results for all nodes
if len(results) != len(tt.nodes) {
t.Errorf("HealthCheck() returned %d results, want %d", len(results), len(tt.nodes))
}
// Check each node's state
for _, node := range tt.nodes {
state, ok := results[node]
if !ok {
t.Errorf("HealthCheck() missing result for node %s", node)
continue
}
// Check status
if wantStatus, ok := tt.wantStates[node]; ok {
if state != wantStatus {
t.Errorf("HealthCheck() node %s status = %v, want %v", node, state, wantStatus)
}
}
}
})
}
}
// TestGetHealthyMetrics tests the GetHealthyMetrics function which returns lists of missing and degraded metrics
func TestGetHealthyMetrics(t *testing.T) {
metrics := map[string]MetricConfig{
"load": {Frequency: 10, Aggregation: AvgAggregation, offset: 0},
"mem_used": {Frequency: 10, Aggregation: AvgAggregation, offset: 1},
"cpu_user": {Frequency: 10, Aggregation: AvgAggregation, offset: 2},
}
ms := &MemoryStore{
Metrics: metrics,
root: Level{
metrics: make([]*buffer, len(metrics)),
children: make(map[string]*Level),
},
}
now := time.Now().Unix()
startTime := now - 100
staleTime := now - 100
// Setup node with mixed health states
node := ms.root.findLevelOrCreate([]string{"testcluster", "testnode"}, len(metrics))
// Metric 0 (load): healthy - recent data
node.metrics[0] = newBuffer(startTime, 10)
for ts := startTime; ts <= now; ts += 10 {
node.metrics[0].write(ts, schema.Float(1.0))
}
// Metric 1 (mem_used): degraded - stale data
node.metrics[1] = newBuffer(staleTime-50, 10)
for ts := staleTime - 50; ts <= staleTime; ts += 10 {
node.metrics[1].write(ts, schema.Float(2.0))
}
// Metric 2 (cpu_user): missing - no buffer (nil)
tests := []struct {
name string
selector []string
expectedMetrics []string
wantMissing []string
wantDegraded []string
wantErr bool
}{
{
name: "mixed health states",
selector: []string{"testcluster", "testnode"},
expectedMetrics: []string{"load", "mem_used", "cpu_user"},
wantMissing: []string{"cpu_user"},
wantDegraded: []string{"mem_used"},
wantErr: false,
},
{
name: "node not found",
selector: []string{"testcluster", "nonexistent"},
expectedMetrics: []string{"load"},
wantMissing: nil,
wantDegraded: nil,
wantErr: true,
},
{
name: "check only healthy metric",
selector: []string{"testcluster", "testnode"},
expectedMetrics: []string{"load"},
wantMissing: []string{},
wantDegraded: []string{},
wantErr: false,
},
}
for _, tt := range tests {
t.Run(tt.name, func(t *testing.T) {
missing, degraded, err := ms.GetHealthyMetrics(tt.selector, tt.expectedMetrics)
if (err != nil) != tt.wantErr {
t.Errorf("GetHealthyMetrics() error = %v, wantErr %v", err, tt.wantErr)
return
}
if tt.wantErr {
return
}
// Check missing list
if len(missing) != len(tt.wantMissing) {
t.Errorf("GetHealthyMetrics() missing = %v, want %v", missing, tt.wantMissing)
} else {
for i, m := range tt.wantMissing {
if missing[i] != m {
t.Errorf("GetHealthyMetrics() missing[%d] = %v, want %v", i, missing[i], m)
}
}
}
// Check degraded list
if len(degraded) != len(tt.wantDegraded) {
t.Errorf("GetHealthyMetrics() degraded = %v, want %v", degraded, tt.wantDegraded)
} else {
for i, d := range tt.wantDegraded {
if degraded[i] != d {
t.Errorf("GetHealthyMetrics() degraded[%d] = %v, want %v", i, degraded[i], d)
}
}
}
})
}
}
// TestBufferHealthChecks tests the buffer-level health check functions
func TestBufferHealthChecks(t *testing.T) {
now := time.Now().Unix()
tests := []struct {
name string
setupBuffer func() *buffer
wantExists bool
wantHealthy bool
description string
}{
{
name: "nil buffer",
setupBuffer: func() *buffer {
return nil
},
wantExists: false,
wantHealthy: false,
description: "nil buffer should not exist and not be healthy",
},
{
name: "empty buffer",
setupBuffer: func() *buffer {
b := newBuffer(now, 10)
b.data = nil
return b
},
wantExists: false,
wantHealthy: false,
description: "empty buffer should not exist and not be healthy",
},
{
name: "healthy buffer with recent data",
setupBuffer: func() *buffer {
b := newBuffer(now-30, 10)
// Write data up to now (within MaxMissingDataPoints * frequency = 50 seconds)
for ts := now - 30; ts <= now; ts += 10 {
b.write(ts, schema.Float(1.0))
}
return b
},
wantExists: true,
wantHealthy: true,
description: "buffer with recent data should be healthy",
},
{
name: "stale buffer beyond threshold",
setupBuffer: func() *buffer {
b := newBuffer(now-200, 10)
// Write data that ends 100 seconds ago (beyond MaxMissingDataPoints * frequency = 50 seconds)
for ts := now - 200; ts <= now-100; ts += 10 {
b.write(ts, schema.Float(1.0))
}
return b
},
wantExists: true,
wantHealthy: false,
description: "buffer with stale data should exist but not be healthy",
},
{
name: "buffer at threshold boundary",
setupBuffer: func() *buffer {
b := newBuffer(now-50, 10)
// Write data that ends exactly at threshold (MaxMissingDataPoints * frequency = 50 seconds)
for ts := now - 50; ts <= now-50; ts += 10 {
b.write(ts, schema.Float(1.0))
}
return b
},
wantExists: true,
wantHealthy: true,
description: "buffer at threshold boundary should still be healthy",
},
}
for _, tt := range tests {
t.Run(tt.name, func(t *testing.T) {
b := tt.setupBuffer()
exists := b.bufferExists()
if exists != tt.wantExists {
t.Errorf("bufferExists() = %v, want %v: %s", exists, tt.wantExists, tt.description)
}
if b != nil && b.data != nil && len(b.data) > 0 {
healthy := b.isBufferHealthy()
if healthy != tt.wantHealthy {
t.Errorf("isBufferHealthy() = %v, want %v: %s", healthy, tt.wantHealthy, tt.description)
}
}
})
}
}

View File

@@ -7,7 +7,7 @@
-->
<script>
import { onMount } from "svelte";
import { getContext, onMount } from "svelte";
import {
Row,
Col,
@@ -18,6 +18,7 @@
Spinner,
InputGroup,
Input,
Tooltip
} from "@sveltestrap/sveltestrap";
import {
queryStore,
@@ -29,6 +30,9 @@
scramble,
scrambleNames,
} from "./generic/utils.js";
import {
formatDurationTime
} from "./generic/units.js";
import Filters from "./generic/Filters.svelte";
/* Svelte 5 Props */
@@ -40,48 +44,70 @@
/* Const Init */
const {} = init();
const client = getContextClient();
const shortDuration = getContext("cc-config").jobList_hideShortRunningJobs; // Always configured
/* State Init*/
let filterComponent = $state(); // see why here: https://stackoverflow.com/questions/58287729/how-can-i-export-a-function-from-a-svelte-component-that-changes-a-value-in-the
let jobFilters = $state([]);
let nameFilter = $state("");
let sorting = $state({ field: "totalJobs", direction: "down" });
let sorting = $state({ field: "totalJobs", direction: "desc" });
/* Derived Vars */
const fetchRunning = $derived(jobFilters.some(jf => jf?.state?.length == 1 && jf?.state?.includes("running")));
const numCols = $derived.by(() => {
let colbase = 6
if (fetchRunning) {
colbase += 2
}
return colbase
})
let stats = $derived(
queryStore({
client: client,
query: gql`
query($jobFilters: [JobFilter!]!) {
query($jobFilters: [JobFilter!]!, $fetchRunning: Boolean!) {
rows: jobsStatistics(filter: $jobFilters, groupBy: ${type}) {
id
name
totalJobs
shortJobs
totalCores @include(if: $fetchRunning)
totalAccs @include(if: $fetchRunning)
totalWalltime
totalCoreHours
totalAccHours
}
}`,
variables: { jobFilters },
variables: {
jobFilters,
fetchRunning
},
})
);
/* Functions */
function changeSorting(field) {
sorting = { field, direction: sorting?.direction == "down" ? "up" : "down" };
function changeSorting(newField) {
if (sorting.field == newField) {
// Same Field, Change Direction
sorting = { field: newField, direction: sorting.direction == "desc" ? "asc" : "desc" };
} else {
// Change Field, Apply Default Direction
sorting = { field: newField, direction: "desc" };
}
}
function sort(stats, sorting, nameFilter) {
const idCmp = sorting.direction == "up"
const idCmp = sorting.direction == "asc"
? (a, b) => b.id.localeCompare(a.id)
: (a, b) => a.id.localeCompare(b.id)
// Force empty or undefined strings to the end of the list
const nameCmp = sorting.direction == "up"
const nameCmp = sorting.direction == "asc"
? (a, b) => !a?.name ? 1 : (!b?.name ? -1 : (b.name.localeCompare(a.name)))
: (a, b) => !a?.name ? 1 : (!b?.name ? -1 : (a.name.localeCompare(b.name)))
const intCmp = sorting.direction == "up"
const intCmp = sorting.direction == "asc"
? (a, b) => a[sorting.field] - b[sorting.field]
: (a, b) => b[sorting.field] - a[sorting.field];
@@ -141,7 +167,7 @@
>
{#if sorting?.field == "id"}
<!-- Note on Icon-Name: Arrow-indicator always down, only alpha-indicator switches -->
<Icon name={`sort-alpha-${sorting?.direction == 'down' ? 'down' : 'down-alt'}`} />
<Icon name={`sort-alpha-${sorting?.direction == 'desc' ? 'down' : 'down-alt'}`} />
{:else}
<Icon name="three-dots-vertical" />
{/if}
@@ -156,7 +182,7 @@
onclick={() => changeSorting("name")}
>
{#if sorting?.field == "name"}
<Icon name={`sort-alpha-${sorting?.direction == 'down' ? 'down' : 'down-alt'}`} />
<Icon name={`sort-alpha-${sorting?.direction == 'desc' ? 'down' : 'down-alt'}`} />
{:else}
<Icon name="three-dots-vertical" />
{/if}
@@ -172,12 +198,66 @@
>
{#if sorting?.field == "totalJobs"}
<!-- Note on Icon-Name: Arrow-indicator always down, only numeric-indicator switches -->
<Icon name={`sort-numeric-${sorting?.direction == 'down' ? 'down-alt' : 'down'}`} />
<Icon name={`sort-numeric-${sorting?.direction == 'desc' ? 'down-alt' : 'down'}`} />
{:else}
<Icon name="three-dots-vertical" />
{/if}
</Button>
</th>
<th scope="col">
<span class="mr-1">
Short Jobs
<Icon id="shortjobs-info" style="cursor:help;" size="sm" name="info-circle"/>
</span>
<Tooltip target={`shortjobs-info`} placement="top">
Job duration less than {formatDurationTime(shortDuration)}
</Tooltip>
&#8239; <!-- Narrow Non-Breaking Space -->
<Button
color={sorting.field == "shortJobs" ? "primary" : "light"}
size="sm"
onclick={() => changeSorting("shortJobs")}
>
{#if sorting?.field == "shortJobs"}
<!-- Note on Icon-Name: Arrow-indicator always down, only numeric-indicator switches -->
<Icon name={`sort-numeric-${sorting?.direction == 'desc' ? 'down-alt' : 'down'}`} />
{:else}
<Icon name="three-dots-vertical" />
{/if}
</Button>
</th>
{#if fetchRunning}
<th scope="col">
Total Cores
<Button
color={sorting.field == "totalCores" ? "primary" : "light"}
size="sm"
onclick={() => changeSorting("totalCores")}
>
{#if sorting?.field == "totalJCores"}
<!-- Note on Icon-Name: Arrow-indicator always down, only numeric-indicator switches -->
<Icon name={`sort-numeric-${sorting?.direction == 'desc' ? 'down-alt' : 'down'}`} />
{:else}
<Icon name="three-dots-vertical" />
{/if}
</Button>
</th>
<th scope="col">
Total Accelerators
<Button
color={sorting.field == "totalAccs" ? "primary" : "light"}
size="sm"
onclick={() => changeSorting("totalAccs")}
>
{#if sorting?.field == "totalAccs"}
<!-- Note on Icon-Name: Arrow-indicator always down, only numeric-indicator switches -->
<Icon name={`sort-numeric-${sorting?.direction == 'desc' ? 'down-alt' : 'down'}`} />
{:else}
<Icon name="three-dots-vertical" />
{/if}
</Button>
</th>
{/if}
<th scope="col">
Total Walltime
<Button
@@ -186,7 +266,7 @@
onclick={() => changeSorting("totalWalltime")}
>
{#if sorting?.field == "totalWalltime"}
<Icon name={`sort-numeric-${sorting?.direction == 'down' ? 'down-alt' : 'down'}`} />
<Icon name={`sort-numeric-${sorting?.direction == 'desc' ? 'down-alt' : 'down'}`} />
{:else}
<Icon name="three-dots-vertical" />
{/if}
@@ -200,7 +280,7 @@
onclick={() => changeSorting("totalCoreHours")}
>
{#if sorting?.field == "totalCoreHours"}
<Icon name={`sort-numeric-${sorting?.direction == 'down' ? 'down-alt' : 'down'}`} />
<Icon name={`sort-numeric-${sorting?.direction == 'desc' ? 'down-alt' : 'down'}`} />
{:else}
<Icon name="three-dots-vertical" />
{/if}
@@ -214,7 +294,7 @@
onclick={() => changeSorting("totalAccHours")}
>
{#if sorting?.field == "totalAccHours"}
<Icon name={`sort-numeric-${sorting?.direction == 'down' ? 'down-alt' : 'down'}`} />
<Icon name={`sort-numeric-${sorting?.direction == 'desc' ? 'down-alt' : 'down'}`} />
{:else}
<Icon name="three-dots-vertical" />
{/if}
@@ -225,11 +305,11 @@
<tbody>
{#if $stats.fetching}
<tr>
<td colspan="4" style="text-align: center;"><Spinner secondary /></td>
<td colspan={numCols} style="text-align: center;"><Spinner secondary /></td>
</tr>
{:else if $stats.error}
<tr>
<td colspan="4"
<td colspan={numCols}
><Card body color="danger" class="mb-3">{$stats.error.message}</Card
></td
>
@@ -260,13 +340,18 @@
>
{/if}
<td>{row.totalJobs}</td>
<td>{row.shortJobs}</td>
{#if fetchRunning}
<td>{row.totalCores}</td>
<td>{row.totalAccs}</td>
{/if}
<td>{row.totalWalltime}</td>
<td>{row.totalCoreHours}</td>
<td>{row.totalAccHours}</td>
</tr>
{:else}
<tr>
<td colspan="4"><i>No {type.toLowerCase()}s/jobs found</i></td>
<td colspan={numCols}><i>No {type.toLowerCase()}s/jobs found</i></td>
</tr>
{/each}
{/if}

View File

@@ -32,7 +32,7 @@
let {
matchedListJobs = $bindable(0),
selectedJobs = $bindable([]),
metrics = getContext("cc-config").metricConfig_jobListMetrics,
metrics = [],
sorting = { field: "startTime", type: "col", order: "DESC" },
showFootprint = false,
filterBuffer = [],
@@ -109,7 +109,7 @@
let paging = $derived({ itemsPerPage, page });
const plotWidth = $derived.by(() => {
return Math.floor(
(tableWidth - jobInfoColumnWidth) / (metrics.length + (showFootprint ? 1 : 0)) - 10,
(tableWidth - jobInfoColumnWidth) / (metrics.length + (showFootprint ? 2 : 1)) - 10,
);
});
let jobsStore = $derived(queryStore({

View File

@@ -133,7 +133,7 @@
}
</script>
<Card class="mt-1 overflow-auto" style="width: {width}; height: {height}">
<Card class="mx-2 overflow-auto" style="width: {width}; height: {height}">
{#if displayTitle}
<CardHeader>
<CardTitle class="mb-0 d-flex justify-content-center">

View File

@@ -79,6 +79,7 @@
/* Derived */
const jobId = $derived(job?.id);
const refinedData = $derived($metricsQuery?.data?.jobMetrics ? sortAndSelectScope($metricsQuery.data.jobMetrics) : []);
const scopes = $derived.by(() => {
if (job.numNodes == 1) {
if (job.numAcc >= 1) return ["core", "accelerator"];
@@ -202,40 +203,45 @@
/>
</td>
{/if}
{#each sortAndSelectScope($metricsQuery.data.jobMetrics) as metric, i (metric?.name || i)}
{#each refinedData as metric, i (metric?.name || i)}
<td>
<!-- Subluster Metricconfig remove keyword for jobtables (joblist main, user joblist, project joblist) to be used here as toplevel case-->
{#if metric.disabled == false && metric.data}
<MetricPlot
onZoom={(detail) => handleZoom(detail, metric.data.name)}
height={plotHeight}
timestep={metric.data.metric.timestep}
scope={metric.data.scope}
series={metric.data.metric.series}
statisticsSeries={metric.data.metric.statisticsSeries}
metric={metric.data.name}
cluster={cluster.find((c) => c.name == job.cluster)}
subCluster={job.subCluster}
isShared={job.shared != "none"}
numhwthreads={job.numHWThreads}
numaccs={job.numAcc}
zoomState={zoomStates[metric.data.name] || null}
thresholdState={thresholdStates[metric.data.name] || null}
/>
{:else if metric.disabled == true && metric.data}
<Card body color="info"
>Metric disabled for subcluster <code
>{metric.data.name}:{job.subCluster}</code
></Card
>
{:else}
<Card body class="mx-2" color="warning">
<p>No dataset(s) returned for <b>{metrics[i]}</b></p>
<p class="mb-1">Metric or host was not found in metric store for cluster <b>{job.cluster}</b>:</p>
<p class="mb-1">Identical messages in <i>{metrics[i]} column</i>: Metric not found.</p>
<p class="mb-1">Identical messages in <i>job {job.jobId} row</i>: Host not found.</p>
</Card>
{/if}
{#key metric}
{#if metric?.data}
{#if metric?.disabled}
<Card body class="mx-2" color="info">
Metric <b>{metric.data.name}</b>: Disabled for subcluster <code>{job.subCluster}</code>
</Card>
{:else}
<MetricPlot
onZoom={(detail) => handleZoom(detail, metric.data.name)}
height={plotHeight}
timestep={metric.data.metric.timestep}
scope={metric.data.scope}
series={metric.data.metric.series}
statisticsSeries={metric.data.metric.statisticsSeries}
metric={metric.data.name}
cluster={cluster.find((c) => c.name == job.cluster)}
subCluster={job.subCluster}
isShared={job.shared != "none"}
numhwthreads={job.numHWThreads}
numaccs={job.numAcc}
zoomState={zoomStates[metric.data.name] || null}
thresholdState={thresholdStates[metric.data.name] || null}
/>
{/if}
{:else}
<Card body class="mx-2" color="warning">
<p>No dataset(s) returned for <b>{metrics[i]}</b></p>
<p class="mb-1">Metric or host was not found in metric store for cluster <b>{job.cluster}</b>:</p>
<p class="mb-1">Identical messages in <i>{metrics[i]} column</i>: Metric not found.</p>
<p class="mb-1">Identical messages in <i>job {job.jobId} row</i>: Host not found.</p>
</Card>
{/if}
{/key}
</td>
{:else}
<td>
<Card body class="mx-2">No metrics selected for display.</Card>
</td>
{/each}
{/if}

View File

@@ -79,7 +79,7 @@
// X
let pendingSeries = [
{
label: "Runtime",
label: "Time",
value: (u, ts, sidx, didx) =>
(didx == null) ? null : formatDurationTime(ts, forNode),
}

View File

@@ -34,6 +34,9 @@
/*Const Init */
const { query: initq } = init();
const useCbColors = getContext("cc-config")?.plotConfiguration_colorblindMode || false
/* Derived */
const subClusters = $derived($initq?.data?.clusters?.find((c) => c.name == presetCluster)?.subClusters || []);
</script>
<!-- Loading indicator & Refresh -->
@@ -66,12 +69,22 @@
</CardBody>
</TabPane>
<TabPane tabId="usage-dash" tab="Usage">
<TabPane tabId="usage-dash" tab="Cluster Usage">
<CardBody>
<UsageDash {presetCluster} {useCbColors}></UsageDash>
</CardBody>
</TabPane>
{#if subClusters?.length > 1}
{#each subClusters.map(sc => sc.name) as scn}
<TabPane tabId="{scn}-usage-dash" tab="{scn.charAt(0).toUpperCase() + scn.slice(1)} Usage">
<CardBody>
<UsageDash {presetCluster} presetSubCluster={scn} {useCbColors}></UsageDash>
</CardBody>
</TabPane>
{/each}
{/if}
<TabPane tabId="metric-dash" tab="Statistics">
<CardBody>
<StatisticsDash {presetCluster} {useCbColors}></StatisticsDash>

View File

@@ -3,6 +3,9 @@
Properties:
- `presetCluster String`: The cluster to show status information for
- `presetSubCluster String?`: The subCluster to show status information for [Default: null]
- `useCbColors Bool?`: Use colorblind friendly colors [Default: false]
- `useAltColors Bool?`: Use alternative color set [Default: false]
-->
<script>
@@ -35,6 +38,7 @@
/* Svelte 5 Props */
let {
presetCluster,
presetSubCluster = null,
useCbColors = false,
useAltColors = false
} = $props();
@@ -52,7 +56,12 @@
let numDurationBins = $state("1h");
/* Derived */
let cluster = $derived(presetCluster)
const canvasPrefix = $derived(`${presetCluster}-${presetSubCluster ? presetSubCluster : ''}`)
const statusFilter = $derived(presetSubCluster
? [{ state: ["running"] }, { cluster: { eq: presetCluster} }, { partition: { eq: presetSubCluster } }]
: [{ state: ["running"] }, { cluster: { eq: presetCluster} }]
);
const topJobsQuery = $derived(queryStore({
client: client,
query: gql`
@@ -82,7 +91,7 @@
}
`,
variables: {
filter: [{ state: ["running"] }, { cluster: { eq: cluster} }],
filter: statusFilter,
paging: pagingState // Top 10
},
requestPolicy: "network-only"
@@ -117,7 +126,7 @@
}
`,
variables: {
filter: [{ state: ["running"] }, { cluster: { eq: cluster } }],
filter: statusFilter,
paging: pagingState
},
requestPolicy: "network-only"
@@ -152,7 +161,7 @@
}
`,
variables: {
filter: [{ state: ["running"] }, { cluster: { eq: cluster } }],
filter: statusFilter,
paging: pagingState
},
requestPolicy: "network-only"
@@ -184,7 +193,7 @@
}
`,
variables: {
filter: [{ state: ["running"] }, { cluster: { eq: cluster } }],
filter: statusFilter,
selectedHistograms: selectedHistograms, // No Metrics requested for node hardware stats
numDurationBins: numDurationBins,
},
@@ -264,7 +273,7 @@
</h4>
<Pie
{useAltColors}
canvasId="hpcpie-jobs-users"
canvasId="{canvasPrefix}-hpcpie-jobs-users"
size={colWidthJobs * 0.75}
sliceLabel="Jobs"
quantities={$topJobsQuery.data.topUser.map(
@@ -284,14 +293,14 @@
{#each $topJobsQuery.data.topUser as tu, i}
<tr>
<td><Icon name="circle-fill" style="color: {legendColors(i)};" /></td>
<td id="topName-jobs-{tu.id}">
<a target="_blank" href="/monitoring/user/{tu.id}?cluster={cluster}&state=running"
<td id="{canvasPrefix}-topName-jobs-{tu.id}">
<a target="_blank" href="/monitoring/user/{tu.id}?cluster={presetCluster}{presetSubCluster ? '&partition='+presetSubCluster : ''}&state=running"
>{scrambleNames ? scramble(tu.id) : tu.id}
</a>
</td>
{#if tu?.name}
<Tooltip
target={`topName-jobs-${tu.id}`}
target={`${canvasPrefix}-topName-jobs-${tu.id}`}
placement="left"
>{scrambleNames ? scramble(tu.name) : tu.name}</Tooltip
>
@@ -308,7 +317,7 @@
</h4>
<Pie
{useAltColors}
canvasId="hpcpie-jobs-projects"
canvasId="{canvasPrefix}-hpcpie-jobs-projects"
size={colWidthJobs * 0.75}
sliceLabel={'Jobs'}
quantities={$topJobsQuery.data.topProjects.map(
@@ -328,7 +337,7 @@
<tr>
<td><Icon name="circle-fill" style="color: {legendColors(i)};" /></td>
<td>
<a target="_blank" href="/monitoring/jobs/?cluster={cluster}&state=running&project={tp.id}&projectMatch=eq"
<a target="_blank" href="/monitoring/jobs/?cluster={presetCluster}{presetSubCluster ? '&partition='+presetSubCluster : ''}&state=running&project={tp.id}&projectMatch=eq"
>{scrambleNames ? scramble(tp.id) : tp.id}
</a>
</td>
@@ -368,7 +377,7 @@
</h4>
<Pie
{useAltColors}
canvasId="hpcpie-nodes-users"
canvasId="{canvasPrefix}-hpcpie-nodes-users"
size={colWidthNodes * 0.75}
sliceLabel="Nodes"
quantities={$topNodesQuery.data.topUser.map(
@@ -388,14 +397,14 @@
{#each $topNodesQuery.data.topUser as tu, i}
<tr>
<td><Icon name="circle-fill" style="color: {legendColors(i)};" /></td>
<td id="topName-nodes-{tu.id}">
<a target="_blank" href="/monitoring/user/{tu.id}?cluster={cluster}&state=running"
<td id="{canvasPrefix}-topName-nodes-{tu.id}">
<a target="_blank" href="/monitoring/user/{tu.id}?cluster={presetCluster}{presetSubCluster ? '&partition='+presetSubCluster : ''}&state=running"
>{scrambleNames ? scramble(tu.id) : tu.id}
</a>
</td>
{#if tu?.name}
<Tooltip
target={`topName-nodes-${tu.id}`}
target={`${canvasPrefix}-topName-nodes-${tu.id}`}
placement="left"
>{scrambleNames ? scramble(tu.name) : tu.name}</Tooltip
>
@@ -412,7 +421,7 @@
</h4>
<Pie
{useAltColors}
canvasId="hpcpie-nodes-projects"
canvasId="{canvasPrefix}-hpcpie-nodes-projects"
size={colWidthNodes * 0.75}
sliceLabel={'Nodes'}
quantities={$topNodesQuery.data.topProjects.map(
@@ -432,7 +441,7 @@
<tr>
<td><Icon name="circle-fill" style="color: {legendColors(i)};" /></td>
<td>
<a target="_blank" href="/monitoring/jobs/?cluster={cluster}&state=running&project={tp.id}&projectMatch=eq"
<a target="_blank" href="/monitoring/jobs/?cluster={presetCluster}{presetSubCluster ? '&partition='+presetSubCluster : ''}&state=running&project={tp.id}&projectMatch=eq"
>{scrambleNames ? scramble(tp.id) : tp.id}
</a>
</td>
@@ -472,7 +481,7 @@
</h4>
<Pie
{useAltColors}
canvasId="hpcpie-accs-users"
canvasId="{canvasPrefix}-hpcpie-accs-users"
size={colWidthAccs * 0.75}
sliceLabel="GPUs"
quantities={$topAccsQuery.data.topUser.map(
@@ -492,14 +501,14 @@
{#each $topAccsQuery.data.topUser as tu, i}
<tr>
<td><Icon name="circle-fill" style="color: {legendColors(i)};" /></td>
<td id="topName-accs-{tu.id}">
<a target="_blank" href="/monitoring/user/{tu.id}?cluster={cluster}&state=running"
<td id="{canvasPrefix}-topName-accs-{tu.id}">
<a target="_blank" href="/monitoring/user/{tu.id}?cluster={presetCluster}{presetSubCluster ? '&partition='+presetSubCluster : ''}&state=running"
>{scrambleNames ? scramble(tu.id) : tu.id}
</a>
</td>
{#if tu?.name}
<Tooltip
target={`topName-accs-${tu.id}`}
target={`${canvasPrefix}-topName-accs-${tu.id}`}
placement="left"
>{scrambleNames ? scramble(tu.name) : tu.name}</Tooltip
>
@@ -516,7 +525,7 @@
</h4>
<Pie
{useAltColors}
canvasId="hpcpie-accs-projects"
canvasId="{canvasPrefix}-hpcpie-accs-projects"
size={colWidthAccs * 0.75}
sliceLabel={'GPUs'}
quantities={$topAccsQuery.data.topProjects.map(
@@ -536,7 +545,7 @@
<tr>
<td><Icon name="circle-fill" style="color: {legendColors(i)};" /></td>
<td>
<a target="_blank" href="/monitoring/jobs/?cluster={cluster}&state=running&project={tp.id}&projectMatch=eq"
<a target="_blank" href="/monitoring/jobs/?cluster={presetCluster}{presetSubCluster ? '&partition='+presetSubCluster : ''}&state=running&project={tp.id}&projectMatch=eq"
>{scrambleNames ? scramble(tp.id) : tp.id}
</a>
</td>

View File

@@ -69,9 +69,9 @@
})
);
let extendedLegendData = $derived($nodeJobsData?.data ? buildExtendedLegend() : null);
let refinedData = $derived(nodeData?.metrics ? sortAndSelectScope(nodeData.metrics) : null);
let dataHealth = $derived(refinedData.filter((rd) => rd.disabled === false).map((enabled) => (enabled?.data?.metric?.series?.length > 0)));
const extendedLegendData = $derived($nodeJobsData?.data ? buildExtendedLegend() : null);
const refinedData = $derived(nodeData?.metrics ? sortAndSelectScope(nodeData.metrics) : []);
const dataHealth = $derived(refinedData.filter((rd) => rd.disabled === false).map((enabled) => (enabled?.data?.metric?.series?.length > 0)));
/* Functions */
const selectScope = (nodeMetrics) =>