.gitignore
@@ -13,7 +13,7 @@
|
||||
/var/checkpoints*
|
||||
|
||||
migrateTimestamps.pl
|
||||
test_ccms_write_api*
|
||||
test_ccms_*
|
||||
|
||||
/web/frontend/public/build
|
||||
/web/frontend/node_modules
|
||||
|
||||
@@ -135,36 +135,3 @@ func debugMetrics(rw http.ResponseWriter, r *http.Request) {
|
||||
return
|
||||
}
|
||||
}
|
||||
|
||||
// handleHealthCheck godoc
|
||||
// @summary HealthCheck endpoint
|
||||
// @tags healthcheck
|
||||
// @description This endpoint allows the users to check if a node is healthy
|
||||
// @produce json
|
||||
// @param selector query string false "Selector"
|
||||
// @success 200 {string} string "Debug dump"
|
||||
// @failure 400 {object} api.ErrorResponse "Bad Request"
|
||||
// @failure 401 {object} api.ErrorResponse "Unauthorized"
|
||||
// @failure 403 {object} api.ErrorResponse "Forbidden"
|
||||
// @failure 500 {object} api.ErrorResponse "Internal Server Error"
|
||||
// @security ApiKeyAuth
|
||||
// @router /healthcheck/ [get]
|
||||
func metricsHealth(rw http.ResponseWriter, r *http.Request) {
|
||||
rawCluster := r.URL.Query().Get("cluster")
|
||||
rawNode := r.URL.Query().Get("node")
|
||||
|
||||
if rawCluster == "" || rawNode == "" {
|
||||
handleError(errors.New("'cluster' and 'node' are required query parameter"), http.StatusBadRequest, rw)
|
||||
return
|
||||
}
|
||||
|
||||
rw.Header().Add("Content-Type", "application/json")
|
||||
|
||||
selector := []string{rawCluster, rawNode}
|
||||
|
||||
ms := metricstore.GetMemoryStore()
|
||||
if err := ms.HealthCheck(bufio.NewWriter(rw), selector); err != nil {
|
||||
handleError(err, http.StatusBadRequest, rw)
|
||||
return
|
||||
}
|
||||
}
|
||||
|
||||
@@ -324,11 +324,12 @@ func (api *NatsAPI) processNodestateEvent(msg lp.CCMessage) {
|
||||
}
|
||||
|
||||
repo := repository.GetNodeRepository()
|
||||
requestReceived := time.Now().Unix()
|
||||
|
||||
for _, node := range req.Nodes {
|
||||
state := determineState(node.States)
|
||||
nodeState := schema.NodeStateDB{
|
||||
TimeStamp: time.Now().Unix(),
|
||||
TimeStamp: requestReceived,
|
||||
NodeState: state,
|
||||
CpusAllocated: node.CpusAllocated,
|
||||
MemoryAllocated: node.MemoryAllocated,
|
||||
|
||||
@@ -7,11 +7,14 @@ package api
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"maps"
|
||||
"net/http"
|
||||
"strings"
|
||||
"time"
|
||||
|
||||
"github.com/ClusterCockpit/cc-backend/internal/repository"
|
||||
"github.com/ClusterCockpit/cc-backend/pkg/archive"
|
||||
"github.com/ClusterCockpit/cc-backend/pkg/metricstore"
|
||||
"github.com/ClusterCockpit/cc-lib/v2/schema"
|
||||
)
|
||||
|
||||
@@ -20,6 +23,15 @@ type UpdateNodeStatesRequest struct {
|
||||
Cluster string `json:"cluster" example:"fritz"`
|
||||
}
|
||||
|
||||
// metricListToNames converts a map of metric configurations to a list of metric names
|
||||
func metricListToNames(metricList map[string]*schema.Metric) []string {
|
||||
names := make([]string, 0, len(metricList))
|
||||
for name := range metricList {
|
||||
names = append(names, name)
|
||||
}
|
||||
return names
|
||||
}
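// Illustrative sketch (not part of the change): map iteration order in Go is
// randomized, so the returned slice is unordered.
//
//	cfg := map[string]*schema.Metric{"load": nil, "mem_used": nil} // hypothetical input
//	metricListToNames(cfg)                                         // ["load", "mem_used"] in some order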
|
||||
|
||||
// determineState assumes that only one scheduler state is present per node at a time
|
||||
func determineState(states []string) schema.SchedulerState {
|
||||
for _, state := range states {
|
||||
@@ -62,16 +74,42 @@ func (api *RestAPI) updateNodeStates(rw http.ResponseWriter, r *http.Request) {
|
||||
http.StatusBadRequest, rw)
|
||||
return
|
||||
}
|
||||
requestReceived := time.Now().Unix()
|
||||
repo := repository.GetNodeRepository()
|
||||
ms := metricstore.GetMemoryStore()
|
||||
|
||||
m := make(map[string][]string)
|
||||
healthStates := make(map[string]schema.MonitoringState)
|
||||
|
||||
for _, node := range req.Nodes {
|
||||
if sc, err := archive.GetSubClusterByNode(req.Cluster, node.Hostname); err == nil {
|
||||
m[sc] = append(m[sc], node.Hostname)
|
||||
}
|
||||
}
|
||||
|
||||
for sc, nl := range m {
|
||||
if sc != "" {
|
||||
metricList := archive.GetMetricConfigSubCluster(req.Cluster, sc)
|
||||
metricNames := metricListToNames(metricList)
|
||||
if states, err := ms.HealthCheck(req.Cluster, nl, metricNames); err == nil {
|
||||
maps.Copy(healthStates, states)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
for _, node := range req.Nodes {
|
||||
state := determineState(node.States)
|
||||
healthState := schema.MonitoringStateFailed
|
||||
if hs, ok := healthStates[node.Hostname]; ok {
|
||||
healthState = hs
|
||||
}
|
||||
nodeState := schema.NodeStateDB{
|
||||
TimeStamp: time.Now().Unix(), NodeState: state,
|
||||
TimeStamp: requestReceived,
|
||||
NodeState: state,
|
||||
CpusAllocated: node.CpusAllocated,
|
||||
MemoryAllocated: node.MemoryAllocated,
|
||||
GpusAllocated: node.GpusAllocated,
|
||||
HealthState: schema.MonitoringStateFull,
|
||||
HealthState: healthState,
|
||||
JobsRunning: node.JobsRunning,
|
||||
}
|
||||
|
||||
|
||||
@@ -81,7 +81,7 @@ func (api *RestAPI) MountAPIRoutes(r *mux.Router) {
|
||||
// Cluster List
|
||||
r.HandleFunc("/clusters/", api.getClusters).Methods(http.MethodGet)
|
||||
// Slurm node state
|
||||
r.HandleFunc("/nodestate/", api.updateNodeStates).Methods(http.MethodPost, http.MethodPut)
|
||||
r.HandleFunc("/nodestates/", api.updateNodeStates).Methods(http.MethodPost, http.MethodPut)
|
||||
// Job Handler
|
||||
if config.Keys.APISubjects == nil {
|
||||
cclog.Info("Enabling REST start/stop job API")
|
||||
@@ -127,12 +127,12 @@ func (api *RestAPI) MountMetricStoreAPIRoutes(r *mux.Router) {
|
||||
r.HandleFunc("/free", freeMetrics).Methods(http.MethodPost)
|
||||
r.HandleFunc("/write", writeMetrics).Methods(http.MethodPost)
|
||||
r.HandleFunc("/debug", debugMetrics).Methods(http.MethodGet)
|
||||
r.HandleFunc("/healthcheck", metricsHealth).Methods(http.MethodGet)
|
||||
r.HandleFunc("/healthcheck", api.updateNodeStates).Methods(http.MethodPost)
|
||||
// Same endpoints but with trailing slash
|
||||
r.HandleFunc("/free/", freeMetrics).Methods(http.MethodPost)
|
||||
r.HandleFunc("/write/", writeMetrics).Methods(http.MethodPost)
|
||||
r.HandleFunc("/debug/", debugMetrics).Methods(http.MethodGet)
|
||||
r.HandleFunc("/healthcheck/", metricsHealth).Methods(http.MethodGet)
|
||||
r.HandleFunc("/healthcheck/", api.updateNodeStates).Methods(http.MethodPost)
|
||||
}
|
||||
|
||||
// MountConfigAPIRoutes registers configuration and user management endpoints.
|
||||
|
||||
@@ -923,15 +923,19 @@ func (r *queryResolver) ClusterMetrics(ctx context.Context, cluster string, metr
|
||||
if !okData && len(ser.Data) != 0 {
|
||||
collectorData[metric] = make([]schema.Float, len(ser.Data))
|
||||
} else if !okData {
|
||||
cclog.Debugf("ClusterMetrics Skip Init: No Data -> %s at %s; Size %d", metric, ser.Hostname, len(ser.Data))
|
||||
cclog.Debugf("[SCHEMARESOLVER] clusterMetrics skip init: no data -> %s at %s; size %d", metric, ser.Hostname, len(ser.Data))
|
||||
}
|
||||
// Sum if init'd and matching size
|
||||
if okData && len(ser.Data) == len(collectorData[metric]) {
|
||||
for i, val := range ser.Data {
|
||||
collectorData[metric][i] += val
|
||||
if val.IsNaN() {
|
||||
continue
|
||||
} else {
|
||||
collectorData[metric][i] += val
|
||||
}
|
||||
}
|
||||
} else if okData {
|
||||
cclog.Debugf("ClusterMetrics Skip Sum: Data Diff -> %s at %s; Want Size %d, Have Size %d", metric, ser.Hostname, len(collectorData[metric]), len(ser.Data))
|
||||
cclog.Debugf("[SCHEMARESOLVER] clusterMetrics skip sum: data diff -> %s at %s; want size %d, have size %d", metric, ser.Hostname, len(collectorData[metric]), len(ser.Data))
|
||||
}
|
||||
}
|
||||
}
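// Illustrative note (not part of the change): skipping NaN samples keeps a single
// host from poisoning the aggregated series. If one host reports NaN at some
// timestamp, the summed value for that timestamp no longer becomes NaN; that
// sample is simply skipped and the remaining hosts are added up as before.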
|
||||
|
||||
@@ -466,7 +466,7 @@ func (r *JobRepository) JobCountGrouped(
|
||||
// AddJobCountGrouped augments existing statistics with additional job counts by category.
|
||||
//
|
||||
// This method enriches JobsStatistics returned by JobsStatsGrouped or JobCountGrouped
|
||||
// with counts of running or short-running jobs, matched by group ID.
|
||||
// with counts of running or short-running (based on ShortRunningJobsDuration) jobs, matched by group ID.
|
||||
//
|
||||
// Parameters:
|
||||
// - ctx: Context for security checks
|
||||
|
||||
@@ -158,8 +158,7 @@ func cleanupCheckpoints(dir string, cleanupDir string, from int64, deleteInstead
|
||||
return 0, err
|
||||
}
|
||||
|
||||
extension := Keys.Checkpoints.FileFormat
|
||||
files, err := findFiles(entries, from, extension, false)
|
||||
files, err := findFiles(entries, from, false)
|
||||
if err != nil {
|
||||
return 0, err
|
||||
}
|
||||
|
||||
@@ -415,7 +415,7 @@ func enqueueCheckpointHosts(dir string, work chan<- [2]string) error {
|
||||
//
|
||||
// Uses worker pool to load cluster/host combinations. Periodically triggers GC
|
||||
// to prevent excessive heap growth. Returns number of files loaded and any errors.
|
||||
func (m *MemoryStore) FromCheckpoint(dir string, from int64, extension string) (int, error) {
|
||||
func (m *MemoryStore) FromCheckpoint(dir string, from int64) (int, error) {
|
||||
var wg sync.WaitGroup
|
||||
work := make(chan [2]string, Keys.NumWorkers*4)
|
||||
n, errs := int32(0), int32(0)
|
||||
@@ -426,7 +426,7 @@ func (m *MemoryStore) FromCheckpoint(dir string, from int64, extension string) (
|
||||
defer wg.Done()
|
||||
for host := range work {
|
||||
lvl := m.root.findLevelOrCreate(host[:], len(m.Metrics))
|
||||
nn, err := lvl.fromCheckpoint(m, filepath.Join(dir, host[0], host[1]), from, extension)
|
||||
nn, err := lvl.fromCheckpoint(m, filepath.Join(dir, host[0], host[1]), from)
|
||||
if err != nil {
|
||||
cclog.Errorf("[METRICSTORE]> error while loading checkpoints for %s/%s: %s", host[0], host[1], err.Error())
|
||||
atomic.AddInt32(&errs, 1)
|
||||
@@ -465,57 +465,7 @@ func (m *MemoryStore) FromCheckpointFiles(dir string, from int64) (int, error) {
|
||||
cclog.Debugf("[METRICSTORE]> %#v Directory created successfully", dir)
|
||||
}
|
||||
|
||||
// Config read (replace with your actual config read)
|
||||
fileFormat := Keys.Checkpoints.FileFormat
|
||||
if fileFormat == "" {
|
||||
fileFormat = "avro"
|
||||
}
|
||||
|
||||
// Map to easily get the fallback format
|
||||
oppositeFormat := map[string]string{
|
||||
"json": "avro",
|
||||
"avro": "json",
|
||||
}
|
||||
|
||||
// First, attempt to load the specified format
|
||||
if found, err := checkFilesWithExtension(dir, fileFormat); err != nil {
|
||||
return 0, fmt.Errorf("[METRICSTORE]> error checking files with extension: %v", err)
|
||||
} else if found {
|
||||
cclog.Infof("[METRICSTORE]> Loading %s files because fileformat is %s", fileFormat, fileFormat)
|
||||
return m.FromCheckpoint(dir, from, fileFormat)
|
||||
}
|
||||
|
||||
// If not found, attempt the opposite format
|
||||
altFormat := oppositeFormat[fileFormat]
|
||||
if found, err := checkFilesWithExtension(dir, altFormat); err != nil {
|
||||
return 0, fmt.Errorf("[METRICSTORE]> error checking files with extension: %v", err)
|
||||
} else if found {
|
||||
cclog.Infof("[METRICSTORE]> Loading %s files but fileformat is %s", altFormat, fileFormat)
|
||||
return m.FromCheckpoint(dir, from, altFormat)
|
||||
}
|
||||
|
||||
return 0, nil
|
||||
}
|
||||
|
||||
// checkFilesWithExtension walks a directory tree to check if files with the given extension exist.
|
||||
func checkFilesWithExtension(dir string, extension string) (bool, error) {
|
||||
found := false
|
||||
|
||||
err := filepath.Walk(dir, func(path string, info os.FileInfo, err error) error {
|
||||
if err != nil {
|
||||
return fmt.Errorf("[METRICSTORE]> error accessing path %s: %v", path, err)
|
||||
}
|
||||
if !info.IsDir() && filepath.Ext(info.Name()) == "."+extension {
|
||||
found = true
|
||||
return nil
|
||||
}
|
||||
return nil
|
||||
})
|
||||
if err != nil {
|
||||
return false, fmt.Errorf("[METRICSTORE]> error walking through directories: %s", err)
|
||||
}
|
||||
|
||||
return found, nil
|
||||
return m.FromCheckpoint(dir, from)
|
||||
}
|
||||
|
||||
func (l *Level) loadAvroFile(m *MemoryStore, f *os.File, from int64) error {
|
||||
@@ -729,7 +679,7 @@ func (l *Level) loadFile(cf *CheckpointFile, m *MemoryStore) error {
|
||||
return nil
|
||||
}
|
||||
|
||||
func (l *Level) fromCheckpoint(m *MemoryStore, dir string, from int64, extension string) (int, error) {
|
||||
func (l *Level) fromCheckpoint(m *MemoryStore, dir string, from int64) (int, error) {
|
||||
direntries, err := os.ReadDir(dir)
|
||||
if err != nil {
|
||||
if os.IsNotExist(err) {
|
||||
@@ -748,33 +698,38 @@ func (l *Level) fromCheckpoint(m *MemoryStore, dir string, from int64, extension
|
||||
children: make(map[string]*Level),
|
||||
}
|
||||
|
||||
files, err := child.fromCheckpoint(m, path.Join(dir, e.Name()), from, extension)
|
||||
files, err := child.fromCheckpoint(m, path.Join(dir, e.Name()), from)
|
||||
filesLoaded += files
|
||||
if err != nil {
|
||||
return filesLoaded, err
|
||||
}
|
||||
|
||||
l.children[e.Name()] = child
|
||||
} else if strings.HasSuffix(e.Name(), "."+extension) {
|
||||
} else if strings.HasSuffix(e.Name(), ".json") || strings.HasSuffix(e.Name(), ".avro") {
|
||||
allFiles = append(allFiles, e)
|
||||
} else {
|
||||
continue
|
||||
}
|
||||
}
|
||||
|
||||
files, err := findFiles(allFiles, from, extension, true)
|
||||
files, err := findFiles(allFiles, from, true)
|
||||
if err != nil {
|
||||
return filesLoaded, err
|
||||
}
|
||||
|
||||
loaders := map[string]func(*MemoryStore, *os.File, int64) error{
|
||||
"json": l.loadJSONFile,
|
||||
"avro": l.loadAvroFile,
|
||||
".json": l.loadJSONFile,
|
||||
".avro": l.loadAvroFile,
|
||||
}
|
||||
|
||||
loader := loaders[extension]
|
||||
|
||||
for _, filename := range files {
|
||||
ext := filepath.Ext(filename)
|
||||
loader := loaders[ext]
|
||||
if loader == nil {
|
||||
cclog.Warnf("Unknown extension for file %s", filename)
|
||||
continue
|
||||
}
|
||||
|
||||
// Use a closure to ensure file is closed immediately after use
|
||||
err := func() error {
|
||||
f, err := os.Open(path.Join(dir, filename))
|
||||
@@ -798,10 +753,12 @@ func (l *Level) fromCheckpoint(m *MemoryStore, dir string, from int64, extension
|
||||
// This will probably get very slow over time!
|
||||
// A solution could be some sort of an index file in which all other files
|
||||
// and the timespan they contain is listed.
|
||||
func findFiles(direntries []fs.DirEntry, t int64, extension string, findMoreRecentFiles bool) ([]string, error) {
|
||||
// NOTE: This now assumes that you have distinct timestamps for json and avro files
|
||||
// Also, it assumes that the timestamps are not overlapping/self-modified.
|
||||
func findFiles(direntries []fs.DirEntry, t int64, findMoreRecentFiles bool) ([]string, error) {
|
||||
nums := map[string]int64{}
|
||||
for _, e := range direntries {
|
||||
if !strings.HasSuffix(e.Name(), "."+extension) {
|
||||
if !strings.HasSuffix(e.Name(), ".json") && !strings.HasSuffix(e.Name(), ".avro") {
|
||||
continue
|
||||
}
|
||||
|
||||
|
||||
@@ -6,87 +6,260 @@
|
||||
package metricstore
|
||||
|
||||
import (
|
||||
"bufio"
|
||||
"cmp"
|
||||
"fmt"
|
||||
"slices"
|
||||
"time"
|
||||
|
||||
"github.com/ClusterCockpit/cc-lib/v2/schema"
|
||||
)
|
||||
|
||||
// HealthCheckResponse represents the result of a health check operation.
|
||||
//
|
||||
// Status indicates the monitoring state (Full, Partial, Failed).
|
||||
// Error contains any error encountered during the health check.
|
||||
type HealthCheckResponse struct {
|
||||
Status schema.MonitoringState
|
||||
Error error
|
||||
}
|
||||
|
||||
// MaxMissingDataPoints is a threshold that allows a node to be considered healthy even when a certain
// number of data points are missing. Suppose a node has not received the last 5 data points; the
// healthCheck endpoint will still report the node as healthy. More than 5 missing points in the
// node's metrics deem the node unhealthy.
const MaxMissingDataPoints int64 = 5

// MaxUnhealthyMetrics is a threshold that allows up to a certain number of metrics on a node to be unhealthy.
// Works together with MaxMissingDataPoints: if up to 5 metrics (including submetrics) are missing the last
// MaxMissingDataPoints data points, the node is still deemed healthy. Any additional metric that misses
// data for MaxMissingDataPoints data points deems the node unhealthy.
const MaxUnhealthyMetrics int64 = 5
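// Worked example (illustrative, not part of the change): with a metric frequency
// of 10 s, a buffer counts as stale once its newest sample is older than
// MaxMissingDataPoints * frequency = 5 * 10 s = 50 s; a node with more than
// MaxUnhealthyMetrics such metrics is intended to be reported as unhealthy.
//
//	staleAfter := MaxMissingDataPoints * frequency // hypothetical locals, 5 * 10 = 50 s
//	isStale := time.Now().Unix()-lastSample > staleAfter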
|
||||
|
||||
func (b *buffer) healthCheck() int64 {
|
||||
// bufferExists reports whether the buffer exists and contains any data.
//
// Returns false for a nil buffer or a buffer without data; callers treat such
// a metric as missing rather than merely degraded.
|
||||
func (b *buffer) bufferExists() bool {
|
||||
// Check if the buffer is empty
|
||||
if b.data == nil {
|
||||
return 1
|
||||
if b == nil || b.data == nil || len(b.data) == 0 {
|
||||
return false
|
||||
}
|
||||
|
||||
return true
|
||||
}
|
||||
|
||||
// isBufferHealthy checks if a buffer has received data for the last MaxMissingDataPoints.
|
||||
//
|
||||
// Returns true if the buffer is healthy (recent data within threshold), false otherwise.
|
||||
// The caller must first ensure the buffer exists (see bufferExists).
|
||||
func (b *buffer) isBufferHealthy() bool {
|
||||
// Get the last endtime of the buffer
|
||||
bufferEnd := b.start + b.frequency*int64(len(b.data))
|
||||
t := time.Now().Unix()
|
||||
|
||||
// Check if the buffer is too old
|
||||
// Check if the buffer has recent data (within MaxMissingDataPoints threshold)
|
||||
if t-bufferEnd > MaxMissingDataPoints*b.frequency {
|
||||
return 1
|
||||
return false
|
||||
}
|
||||
|
||||
return 0
|
||||
return true
|
||||
}
|
||||
|
||||
func (l *Level) healthCheck(m *MemoryStore, count int64) (int64, error) {
|
||||
// mergeList merges two lists, sorts the result, and removes duplicates.
// The 'cmp.Ordered' constraint is required because the merged data is sorted.
func mergeList[T cmp.Ordered](list1, list2 []T) []T {
|
||||
// 1. Combine both lists
|
||||
result := append(list1, list2...)
|
||||
|
||||
// 2. Sort the combined list
|
||||
slices.Sort(result)
|
||||
|
||||
// 3. Compact removes consecutive duplicates (standard in Go 1.21+)
|
||||
// e.g. [1, 1, 2, 3, 3] -> [1, 2, 3]
|
||||
result = slices.Compact(result)
|
||||
|
||||
return result
|
||||
}
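// Example (illustrative, not part of the change):
//
//	mergeList([]string{"load", "cpu_user"}, []string{"cpu_user", "mem_used"})
//	// -> ["cpu_user", "load", "mem_used"] (sorted, duplicates removed)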
|
||||
|
||||
// getHealthyMetrics recursively collects healthy and degraded metrics at this level and below.
|
||||
//
|
||||
// A metric is considered:
//   - Missing: no buffer (and therefore no data) exists for it at this level
//   - Degraded: a buffer exists, but its newest data is older than the MaxMissingDataPoints threshold
//
// This routine walks the entire subtree starting from the current level.
//
// Parameters:
//   - m: MemoryStore containing the global metric configuration
//   - expectedMetrics: Names of the metrics expected to be present at this level and below
//
// Returns:
//   - []string: Flat list of missing metric names from this level and all children
//   - []string: Flat list of degraded metric names (data exists but is stale)
//   - error: Non-nil only for internal errors during recursion
//
// The routine mirrors healthCheck() but provides more granular classification:
//   - healthCheck() finds problems (stale/missing) as a single count
//   - getHealthyMetrics() separates missing from degraded metrics
|
||||
func (l *Level) getHealthyMetrics(m *MemoryStore, expectedMetrics []string) ([]string, []string, error) {
|
||||
l.lock.RLock()
|
||||
defer l.lock.RUnlock()
|
||||
|
||||
for _, mc := range m.Metrics {
|
||||
if b := l.metrics[mc.offset]; b != nil {
|
||||
count += b.healthCheck()
|
||||
globalMetrics := m.Metrics
|
||||
|
||||
missingList := make([]string, 0)
|
||||
degradedList := make([]string, 0)
|
||||
|
||||
// Phase 1: Check metrics at this level
|
||||
for _, metricName := range expectedMetrics {
|
||||
offset := globalMetrics[metricName].offset
|
||||
b := l.metrics[offset]
|
||||
|
||||
if !b.bufferExists() {
|
||||
missingList = append(missingList, metricName)
|
||||
} else if !b.isBufferHealthy() {
|
||||
degradedList = append(degradedList, metricName)
|
||||
}
|
||||
}
|
||||
|
||||
// Phase 2: Recursively check child levels
|
||||
for _, lvl := range l.children {
|
||||
c, err := lvl.healthCheck(m, 0)
|
||||
childMissing, childDegraded, err := lvl.getHealthyMetrics(m, expectedMetrics)
|
||||
if err != nil {
|
||||
return 0, err
|
||||
return nil, nil, err
|
||||
}
|
||||
count += c
|
||||
|
||||
missingList = mergeList(missingList, childMissing)
|
||||
degradedList = mergeList(degradedList, childDegraded)
|
||||
}
|
||||
|
||||
return count, nil
|
||||
return missingList, degradedList, nil
|
||||
}
|
||||
|
||||
func (m *MemoryStore) HealthCheck(w *bufio.Writer, selector []string) error {
|
||||
// GetHealthyMetrics returns the missing and degraded metrics for a specific node as flat lists.
//
// This routine walks the metric tree starting from the specified node selector
// and checks which of the expected metrics have received data within the last
// MaxMissingDataPoints (default: 5) data points. Problematic metrics are
// classified into two categories:
//
//   - Missing: No buffer (and therefore no data) exists for the metric on this node
//   - Degraded: A buffer exists, but its newest data is older than the MaxMissingDataPoints threshold
//
// The returned lists include both node-level metrics (e.g., "load", "mem_used") and
// hardware-level metrics (e.g., "cpu_user", "gpu_temp") in flat slices.
//
// Parameters:
//   - selector: Hierarchical path to the target node, typically []string{cluster, hostname}.
//     Example: []string{"emmy", "node001"} navigates to the "node001" host in the "emmy" cluster.
//     The selector must match the hierarchy used during metric ingestion.
//   - expectedMetrics: Names of the metrics that are expected to be present on the node.
//
// Returns:
//   - []string: Flat list of missing metric names (no data at all)
//   - []string: Flat list of degraded metric names (data exists but is stale)
//   - error: Non-nil if the node is not found or internal errors occur
//
// Example usage:
//
//	selector := []string{"emmy", "node001"}
//	expectedMetrics := []string{"load", "mem_used"}
//	missingMetrics, degradedMetrics, err := ms.GetHealthyMetrics(selector, expectedMetrics)
//	if err != nil {
//		// Node not found or internal error
//		return err
//	}
//	fmt.Printf("Missing metrics: %v\n", missingMetrics)
//	// e.g. ["gpu_temp", ...] (metrics without any data)
//	fmt.Printf("Degraded metrics: %v\n", degradedMetrics)
//	// e.g. ["network_rx", ...] (metrics with stale data)
//
// Note: This routine provides more granular classification than HealthCheck:
//   - HealthCheck aggregates the per-metric results into one MonitoringState per node
//   - GetHealthyMetrics reports the individual missing and degraded metrics
|
||||
func (m *MemoryStore) GetHealthyMetrics(selector []string, expectedMetrics []string) ([]string, []string, error) {
|
||||
lvl := m.root.findLevel(selector)
|
||||
if lvl == nil {
|
||||
return fmt.Errorf("[METRICSTORE]> not found: %#v", selector)
|
||||
return nil, nil, fmt.Errorf("[METRICSTORE]> error while GetHealthyMetrics, host not found: %#v", selector)
|
||||
}
|
||||
|
||||
buf := make([]byte, 0, 25)
|
||||
// buf = append(buf, "{"...)
|
||||
|
||||
var count int64 = 0
|
||||
|
||||
unhealthyMetricsCount, err := lvl.healthCheck(m, count)
|
||||
missingList, degradedList, err := lvl.getHealthyMetrics(m, expectedMetrics)
|
||||
if err != nil {
|
||||
return err
|
||||
return nil, nil, err
|
||||
}
|
||||
|
||||
if unhealthyMetricsCount < MaxUnhealthyMetrics {
|
||||
buf = append(buf, "Healthy"...)
|
||||
} else {
|
||||
buf = append(buf, "Unhealthy"...)
|
||||
}
|
||||
|
||||
// buf = append(buf, "}\n"...)
|
||||
|
||||
if _, err = w.Write(buf); err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
return w.Flush()
|
||||
return missingList, degradedList, nil
|
||||
}
|
||||
|
||||
// HealthCheck performs health checks on multiple nodes and returns their monitoring states.
|
||||
//
|
||||
// This routine provides a batch health check interface that evaluates multiple nodes
|
||||
// against a specific set of expected metrics. For each node, it determines the overall
|
||||
// monitoring state based on which metrics are healthy, degraded, or missing.
|
||||
//
|
||||
// Health Status Classification:
|
||||
// - MonitoringStateFull: All expected metrics are healthy (recent data, few missing values)
|
||||
// - MonitoringStatePartial: Some metrics are degraded (many missing values) or missing
|
||||
// - MonitoringStateFailed: Node not found or all expected metrics are missing/stale
|
||||
//
|
||||
// Parameters:
|
||||
// - cluster: Cluster name (first element of selector path)
|
||||
// - nodes: List of node hostnames to check
|
||||
// - expectedMetrics: List of metric names that should be present on each node
|
||||
//
|
||||
// Returns:
|
||||
// - map[string]schema.MonitoringState: Map keyed by hostname containing monitoring state for each node
|
||||
// - error: Non-nil only for internal errors (individual node failures are captured as MonitoringStateFailed)
|
||||
//
|
||||
// Example usage:
|
||||
//
|
||||
// cluster := "emmy"
|
||||
// nodes := []string{"node001", "node002", "node003"}
|
||||
// expectedMetrics := []string{"load", "mem_used", "cpu_user", "cpu_system"}
|
||||
// healthStates, err := ms.HealthCheck(cluster, nodes, expectedMetrics)
|
||||
// if err != nil {
|
||||
// return err
|
||||
// }
|
||||
// for hostname, state := range healthStates {
|
||||
// fmt.Printf("Node %s: %s\n", hostname, state)
|
||||
// }
|
||||
//
|
||||
// Note: This routine is optimized for batch operations where you need to check
|
||||
// the same set of metrics across multiple nodes.
|
||||
func (m *MemoryStore) HealthCheck(cluster string,
|
||||
nodes []string, expectedMetrics []string,
|
||||
) (map[string]schema.MonitoringState, error) {
|
||||
results := make(map[string]schema.MonitoringState, len(nodes))
|
||||
|
||||
// Create a set of expected metrics for fast lookup
|
||||
expectedSet := make(map[string]bool, len(expectedMetrics))
|
||||
for _, metric := range expectedMetrics {
|
||||
expectedSet[metric] = true
|
||||
}
|
||||
|
||||
// Check each node
|
||||
for _, hostname := range nodes {
|
||||
selector := []string{cluster, hostname}
|
||||
status := schema.MonitoringStateFull
|
||||
healthyCount := 0
|
||||
degradedCount := 0
|
||||
missingCount := 0
|
||||
|
||||
// Get healthy and degraded metrics for this node
|
||||
missingList, degradedList, err := m.GetHealthyMetrics(selector, expectedMetrics)
|
||||
if err != nil {
|
||||
// Node not found or internal error
|
||||
results[hostname] = schema.MonitoringStateFailed
|
||||
continue
|
||||
}
|
||||
|
||||
missingCount = len(missingList)
|
||||
degradedCount = len(degradedList)
|
||||
healthyCount = len(expectedMetrics) - (missingCount + degradedCount)
|
||||
|
||||
// Determine overall health status
|
||||
if missingCount > 0 || degradedCount > 0 {
|
||||
if healthyCount == 0 {
|
||||
// No healthy metrics at all
|
||||
status = schema.MonitoringStateFailed
|
||||
} else {
|
||||
// Some healthy, some degraded/missing
|
||||
status = schema.MonitoringStatePartial
|
||||
}
|
||||
}
|
||||
// else: all metrics healthy, status remains MonitoringStateFull
|
||||
|
||||
results[hostname] = status
|
||||
}
|
||||
|
||||
return results, nil
|
||||
}
|
||||
|
||||
@@ -7,6 +7,7 @@ package metricstore
|
||||
|
||||
import (
|
||||
"testing"
|
||||
"time"
|
||||
|
||||
"github.com/ClusterCockpit/cc-lib/v2/schema"
|
||||
)
|
||||
@@ -88,3 +89,378 @@ func TestBufferRead(t *testing.T) {
|
||||
t.Errorf("buffer.read() len(result) = %d, want 3", len(result))
|
||||
}
|
||||
}
|
||||
|
||||
func TestHealthCheck(t *testing.T) {
|
||||
// Create a test MemoryStore with some metrics
|
||||
metrics := map[string]MetricConfig{
|
||||
"load": {Frequency: 10, Aggregation: AvgAggregation, offset: 0},
|
||||
"mem_used": {Frequency: 10, Aggregation: AvgAggregation, offset: 1},
|
||||
"cpu_user": {Frequency: 10, Aggregation: AvgAggregation, offset: 2},
|
||||
"cpu_system": {Frequency: 10, Aggregation: AvgAggregation, offset: 3},
|
||||
}
|
||||
|
||||
ms := &MemoryStore{
|
||||
Metrics: metrics,
|
||||
root: Level{
|
||||
metrics: make([]*buffer, len(metrics)),
|
||||
children: make(map[string]*Level),
|
||||
},
|
||||
}
|
||||
|
||||
// Use recent timestamps (current time minus a small offset)
|
||||
now := time.Now().Unix()
|
||||
startTime := now - 100 // Start 100 seconds ago to have enough data points
|
||||
|
||||
// Setup test data for node001 - all metrics healthy (recent data)
|
||||
node001 := ms.root.findLevelOrCreate([]string{"testcluster", "node001"}, len(metrics))
|
||||
for i := 0; i < len(metrics); i++ {
|
||||
node001.metrics[i] = newBuffer(startTime, 10)
|
||||
// Write recent data up to now
|
||||
for ts := startTime; ts <= now; ts += 10 {
|
||||
node001.metrics[i].write(ts, schema.Float(float64(i+1)))
|
||||
}
|
||||
}
|
||||
|
||||
// Setup test data for node002 - some metrics stale (old data beyond MaxMissingDataPoints threshold)
|
||||
node002 := ms.root.findLevelOrCreate([]string{"testcluster", "node002"}, len(metrics))
|
||||
// MaxMissingDataPoints = 5, frequency = 10, so threshold is 50 seconds
|
||||
staleTime := now - 100 // Data ends 100 seconds ago (well beyond 50 second threshold)
|
||||
for i := 0; i < len(metrics); i++ {
|
||||
node002.metrics[i] = newBuffer(staleTime-50, 10)
|
||||
if i < 2 {
|
||||
// First two metrics: healthy (recent data)
|
||||
for ts := startTime; ts <= now; ts += 10 {
|
||||
node002.metrics[i].write(ts, schema.Float(float64(i+1)))
|
||||
}
|
||||
} else {
|
||||
// Last two metrics: stale (data ends 100 seconds ago)
|
||||
for ts := staleTime - 50; ts <= staleTime; ts += 10 {
|
||||
node002.metrics[i].write(ts, schema.Float(float64(i+1)))
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Setup test data for node003 - some metrics missing (no buffer)
|
||||
node003 := ms.root.findLevelOrCreate([]string{"testcluster", "node003"}, len(metrics))
|
||||
// Only create buffers for first two metrics
|
||||
for i := 0; i < 2; i++ {
|
||||
node003.metrics[i] = newBuffer(startTime, 10)
|
||||
for ts := startTime; ts <= now; ts += 10 {
|
||||
node003.metrics[i].write(ts, schema.Float(float64(i+1)))
|
||||
}
|
||||
}
|
||||
// Leave metrics[2] and metrics[3] as nil (missing)
|
||||
|
||||
// Setup test data for node005 - all metrics stale
|
||||
node005 := ms.root.findLevelOrCreate([]string{"testcluster", "node005"}, len(metrics))
|
||||
for i := 0; i < len(metrics); i++ {
|
||||
node005.metrics[i] = newBuffer(staleTime-50, 10)
|
||||
// All metrics have stale data (ends 100 seconds ago)
|
||||
for ts := staleTime - 50; ts <= staleTime; ts += 10 {
|
||||
node005.metrics[i].write(ts, schema.Float(float64(i+1)))
|
||||
}
|
||||
}
|
||||
|
||||
// node004 doesn't exist at all
|
||||
|
||||
tests := []struct {
|
||||
name string
|
||||
cluster string
|
||||
nodes []string
|
||||
expectedMetrics []string
|
||||
wantStates map[string]schema.MonitoringState
|
||||
}{
|
||||
{
|
||||
name: "all metrics healthy",
|
||||
cluster: "testcluster",
|
||||
nodes: []string{"node001"},
|
||||
expectedMetrics: []string{"load", "mem_used", "cpu_user", "cpu_system"},
|
||||
wantStates: map[string]schema.MonitoringState{
|
||||
"node001": schema.MonitoringStateFull,
|
||||
},
|
||||
},
|
||||
{
|
||||
name: "some metrics stale",
|
||||
cluster: "testcluster",
|
||||
nodes: []string{"node002"},
|
||||
expectedMetrics: []string{"load", "mem_used", "cpu_user", "cpu_system"},
|
||||
wantStates: map[string]schema.MonitoringState{
|
||||
"node002": schema.MonitoringStatePartial,
|
||||
},
|
||||
},
|
||||
{
|
||||
name: "some metrics missing",
|
||||
cluster: "testcluster",
|
||||
nodes: []string{"node003"},
|
||||
expectedMetrics: []string{"load", "mem_used", "cpu_user", "cpu_system"},
|
||||
wantStates: map[string]schema.MonitoringState{
|
||||
"node003": schema.MonitoringStatePartial,
|
||||
},
|
||||
},
|
||||
{
|
||||
name: "node not found",
|
||||
cluster: "testcluster",
|
||||
nodes: []string{"node004"},
|
||||
expectedMetrics: []string{"load", "mem_used", "cpu_user", "cpu_system"},
|
||||
wantStates: map[string]schema.MonitoringState{
|
||||
"node004": schema.MonitoringStateFailed,
|
||||
},
|
||||
},
|
||||
{
|
||||
name: "all metrics stale",
|
||||
cluster: "testcluster",
|
||||
nodes: []string{"node005"},
|
||||
expectedMetrics: []string{"load", "mem_used", "cpu_user", "cpu_system"},
|
||||
wantStates: map[string]schema.MonitoringState{
|
||||
"node005": schema.MonitoringStateFailed,
|
||||
},
|
||||
},
|
||||
{
|
||||
name: "multiple nodes mixed states",
|
||||
cluster: "testcluster",
|
||||
nodes: []string{"node001", "node002", "node003", "node004", "node005"},
|
||||
expectedMetrics: []string{"load", "mem_used"},
|
||||
wantStates: map[string]schema.MonitoringState{
|
||||
"node001": schema.MonitoringStateFull,
|
||||
"node002": schema.MonitoringStateFull, // Only checking first 2 metrics which are healthy
|
||||
"node003": schema.MonitoringStateFull, // Only checking first 2 metrics which exist
|
||||
"node004": schema.MonitoringStateFailed, // Node doesn't exist
|
||||
"node005": schema.MonitoringStateFailed, // Both metrics are stale
|
||||
},
|
||||
},
|
||||
}
|
||||
|
||||
for _, tt := range tests {
|
||||
t.Run(tt.name, func(t *testing.T) {
|
||||
results, err := ms.HealthCheck(tt.cluster, tt.nodes, tt.expectedMetrics)
|
||||
if err != nil {
|
||||
t.Errorf("HealthCheck() error = %v", err)
|
||||
return
|
||||
}
|
||||
|
||||
// Check that we got results for all nodes
|
||||
if len(results) != len(tt.nodes) {
|
||||
t.Errorf("HealthCheck() returned %d results, want %d", len(results), len(tt.nodes))
|
||||
}
|
||||
|
||||
// Check each node's state
|
||||
for _, node := range tt.nodes {
|
||||
state, ok := results[node]
|
||||
if !ok {
|
||||
t.Errorf("HealthCheck() missing result for node %s", node)
|
||||
continue
|
||||
}
|
||||
|
||||
// Check status
|
||||
if wantStatus, ok := tt.wantStates[node]; ok {
|
||||
if state != wantStatus {
|
||||
t.Errorf("HealthCheck() node %s status = %v, want %v", node, state, wantStatus)
|
||||
}
|
||||
}
|
||||
}
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
// TestGetHealthyMetrics tests the GetHealthyMetrics function which returns lists of missing and degraded metrics
|
||||
func TestGetHealthyMetrics(t *testing.T) {
|
||||
metrics := map[string]MetricConfig{
|
||||
"load": {Frequency: 10, Aggregation: AvgAggregation, offset: 0},
|
||||
"mem_used": {Frequency: 10, Aggregation: AvgAggregation, offset: 1},
|
||||
"cpu_user": {Frequency: 10, Aggregation: AvgAggregation, offset: 2},
|
||||
}
|
||||
|
||||
ms := &MemoryStore{
|
||||
Metrics: metrics,
|
||||
root: Level{
|
||||
metrics: make([]*buffer, len(metrics)),
|
||||
children: make(map[string]*Level),
|
||||
},
|
||||
}
|
||||
|
||||
now := time.Now().Unix()
|
||||
startTime := now - 100
|
||||
staleTime := now - 100
|
||||
|
||||
// Setup node with mixed health states
|
||||
node := ms.root.findLevelOrCreate([]string{"testcluster", "testnode"}, len(metrics))
|
||||
|
||||
// Metric 0 (load): healthy - recent data
|
||||
node.metrics[0] = newBuffer(startTime, 10)
|
||||
for ts := startTime; ts <= now; ts += 10 {
|
||||
node.metrics[0].write(ts, schema.Float(1.0))
|
||||
}
|
||||
|
||||
// Metric 1 (mem_used): degraded - stale data
|
||||
node.metrics[1] = newBuffer(staleTime-50, 10)
|
||||
for ts := staleTime - 50; ts <= staleTime; ts += 10 {
|
||||
node.metrics[1].write(ts, schema.Float(2.0))
|
||||
}
|
||||
|
||||
// Metric 2 (cpu_user): missing - no buffer (nil)
|
||||
|
||||
tests := []struct {
|
||||
name string
|
||||
selector []string
|
||||
expectedMetrics []string
|
||||
wantMissing []string
|
||||
wantDegraded []string
|
||||
wantErr bool
|
||||
}{
|
||||
{
|
||||
name: "mixed health states",
|
||||
selector: []string{"testcluster", "testnode"},
|
||||
expectedMetrics: []string{"load", "mem_used", "cpu_user"},
|
||||
wantMissing: []string{"cpu_user"},
|
||||
wantDegraded: []string{"mem_used"},
|
||||
wantErr: false,
|
||||
},
|
||||
{
|
||||
name: "node not found",
|
||||
selector: []string{"testcluster", "nonexistent"},
|
||||
expectedMetrics: []string{"load"},
|
||||
wantMissing: nil,
|
||||
wantDegraded: nil,
|
||||
wantErr: true,
|
||||
},
|
||||
{
|
||||
name: "check only healthy metric",
|
||||
selector: []string{"testcluster", "testnode"},
|
||||
expectedMetrics: []string{"load"},
|
||||
wantMissing: []string{},
|
||||
wantDegraded: []string{},
|
||||
wantErr: false,
|
||||
},
|
||||
}
|
||||
|
||||
for _, tt := range tests {
|
||||
t.Run(tt.name, func(t *testing.T) {
|
||||
missing, degraded, err := ms.GetHealthyMetrics(tt.selector, tt.expectedMetrics)
|
||||
|
||||
if (err != nil) != tt.wantErr {
|
||||
t.Errorf("GetHealthyMetrics() error = %v, wantErr %v", err, tt.wantErr)
|
||||
return
|
||||
}
|
||||
|
||||
if tt.wantErr {
|
||||
return
|
||||
}
|
||||
|
||||
// Check missing list
|
||||
if len(missing) != len(tt.wantMissing) {
|
||||
t.Errorf("GetHealthyMetrics() missing = %v, want %v", missing, tt.wantMissing)
|
||||
} else {
|
||||
for i, m := range tt.wantMissing {
|
||||
if missing[i] != m {
|
||||
t.Errorf("GetHealthyMetrics() missing[%d] = %v, want %v", i, missing[i], m)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Check degraded list
|
||||
if len(degraded) != len(tt.wantDegraded) {
|
||||
t.Errorf("GetHealthyMetrics() degraded = %v, want %v", degraded, tt.wantDegraded)
|
||||
} else {
|
||||
for i, d := range tt.wantDegraded {
|
||||
if degraded[i] != d {
|
||||
t.Errorf("GetHealthyMetrics() degraded[%d] = %v, want %v", i, degraded[i], d)
|
||||
}
|
||||
}
|
||||
}
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
// TestBufferHealthChecks tests the buffer-level health check functions
|
||||
func TestBufferHealthChecks(t *testing.T) {
|
||||
now := time.Now().Unix()
|
||||
|
||||
tests := []struct {
|
||||
name string
|
||||
setupBuffer func() *buffer
|
||||
wantExists bool
|
||||
wantHealthy bool
|
||||
description string
|
||||
}{
|
||||
{
|
||||
name: "nil buffer",
|
||||
setupBuffer: func() *buffer {
|
||||
return nil
|
||||
},
|
||||
wantExists: false,
|
||||
wantHealthy: false,
|
||||
description: "nil buffer should not exist and not be healthy",
|
||||
},
|
||||
{
|
||||
name: "empty buffer",
|
||||
setupBuffer: func() *buffer {
|
||||
b := newBuffer(now, 10)
|
||||
b.data = nil
|
||||
return b
|
||||
},
|
||||
wantExists: false,
|
||||
wantHealthy: false,
|
||||
description: "empty buffer should not exist and not be healthy",
|
||||
},
|
||||
{
|
||||
name: "healthy buffer with recent data",
|
||||
setupBuffer: func() *buffer {
|
||||
b := newBuffer(now-30, 10)
|
||||
// Write data up to now (within MaxMissingDataPoints * frequency = 50 seconds)
|
||||
for ts := now - 30; ts <= now; ts += 10 {
|
||||
b.write(ts, schema.Float(1.0))
|
||||
}
|
||||
return b
|
||||
},
|
||||
wantExists: true,
|
||||
wantHealthy: true,
|
||||
description: "buffer with recent data should be healthy",
|
||||
},
|
||||
{
|
||||
name: "stale buffer beyond threshold",
|
||||
setupBuffer: func() *buffer {
|
||||
b := newBuffer(now-200, 10)
|
||||
// Write data that ends 100 seconds ago (beyond MaxMissingDataPoints * frequency = 50 seconds)
|
||||
for ts := now - 200; ts <= now-100; ts += 10 {
|
||||
b.write(ts, schema.Float(1.0))
|
||||
}
|
||||
return b
|
||||
},
|
||||
wantExists: true,
|
||||
wantHealthy: false,
|
||||
description: "buffer with stale data should exist but not be healthy",
|
||||
},
|
||||
{
|
||||
name: "buffer at threshold boundary",
|
||||
setupBuffer: func() *buffer {
|
||||
b := newBuffer(now-50, 10)
|
||||
// Write data that ends exactly at threshold (MaxMissingDataPoints * frequency = 50 seconds)
|
||||
for ts := now - 50; ts <= now-50; ts += 10 {
|
||||
b.write(ts, schema.Float(1.0))
|
||||
}
|
||||
return b
|
||||
},
|
||||
wantExists: true,
|
||||
wantHealthy: true,
|
||||
description: "buffer at threshold boundary should still be healthy",
|
||||
},
|
||||
}
|
||||
|
||||
for _, tt := range tests {
|
||||
t.Run(tt.name, func(t *testing.T) {
|
||||
b := tt.setupBuffer()
|
||||
|
||||
exists := b.bufferExists()
|
||||
if exists != tt.wantExists {
|
||||
t.Errorf("bufferExists() = %v, want %v: %s", exists, tt.wantExists, tt.description)
|
||||
}
|
||||
|
||||
if b != nil && b.data != nil && len(b.data) > 0 {
|
||||
healthy := b.isBufferHealthy()
|
||||
if healthy != tt.wantHealthy {
|
||||
t.Errorf("isBufferHealthy() = %v, want %v: %s", healthy, tt.wantHealthy, tt.description)
|
||||
}
|
||||
}
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
@@ -7,7 +7,7 @@
|
||||
-->
|
||||
|
||||
<script>
|
||||
import { onMount } from "svelte";
|
||||
import { getContext, onMount } from "svelte";
|
||||
import {
|
||||
Row,
|
||||
Col,
|
||||
@@ -18,6 +18,7 @@
|
||||
Spinner,
|
||||
InputGroup,
|
||||
Input,
|
||||
Tooltip
|
||||
} from "@sveltestrap/sveltestrap";
|
||||
import {
|
||||
queryStore,
|
||||
@@ -29,6 +30,9 @@
|
||||
scramble,
|
||||
scrambleNames,
|
||||
} from "./generic/utils.js";
|
||||
import {
|
||||
formatDurationTime
|
||||
} from "./generic/units.js";
|
||||
import Filters from "./generic/Filters.svelte";
|
||||
|
||||
/* Svelte 5 Props */
|
||||
@@ -40,48 +44,70 @@
|
||||
/* Const Init */
|
||||
const {} = init();
|
||||
const client = getContextClient();
|
||||
const shortDuration = getContext("cc-config").jobList_hideShortRunningJobs; // Always configured
|
||||
|
||||
/* State Init*/
|
||||
let filterComponent = $state(); // see why here: https://stackoverflow.com/questions/58287729/how-can-i-export-a-function-from-a-svelte-component-that-changes-a-value-in-the
|
||||
let jobFilters = $state([]);
|
||||
let nameFilter = $state("");
|
||||
let sorting = $state({ field: "totalJobs", direction: "down" });
|
||||
let sorting = $state({ field: "totalJobs", direction: "desc" });
|
||||
|
||||
/* Derived Vars */
|
||||
const fetchRunning = $derived(jobFilters.some(jf => jf?.state?.length == 1 && jf?.state?.includes("running")));
|
||||
const numCols = $derived.by(() => {
|
||||
let colbase = 6
|
||||
if (fetchRunning) {
|
||||
colbase += 2
|
||||
}
|
||||
return colbase
|
||||
})
|
||||
|
||||
let stats = $derived(
|
||||
queryStore({
|
||||
client: client,
|
||||
query: gql`
|
||||
query($jobFilters: [JobFilter!]!) {
|
||||
query($jobFilters: [JobFilter!]!, $fetchRunning: Boolean!) {
|
||||
rows: jobsStatistics(filter: $jobFilters, groupBy: ${type}) {
|
||||
id
|
||||
name
|
||||
totalJobs
|
||||
shortJobs
|
||||
totalCores @include(if: $fetchRunning)
|
||||
totalAccs @include(if: $fetchRunning)
|
||||
totalWalltime
|
||||
totalCoreHours
|
||||
totalAccHours
|
||||
}
|
||||
}`,
|
||||
variables: { jobFilters },
|
||||
variables: {
|
||||
jobFilters,
|
||||
fetchRunning
|
||||
},
|
||||
})
|
||||
);
|
||||
|
||||
/* Functions */
|
||||
function changeSorting(field) {
|
||||
sorting = { field, direction: sorting?.direction == "down" ? "up" : "down" };
|
||||
function changeSorting(newField) {
|
||||
if (sorting.field == newField) {
|
||||
// Same Field, Change Direction
|
||||
sorting = { field: newField, direction: sorting.direction == "desc" ? "asc" : "desc" };
|
||||
} else {
|
||||
// Change Field, Apply Default Direction
|
||||
sorting = { field: newField, direction: "desc" };
|
||||
}
|
||||
}
|
||||
|
||||
function sort(stats, sorting, nameFilter) {
|
||||
const idCmp = sorting.direction == "up"
|
||||
const idCmp = sorting.direction == "asc"
|
||||
? (a, b) => b.id.localeCompare(a.id)
|
||||
: (a, b) => a.id.localeCompare(b.id)
|
||||
|
||||
// Force empty or undefined strings to the end of the list
|
||||
const nameCmp = sorting.direction == "up"
|
||||
const nameCmp = sorting.direction == "asc"
|
||||
? (a, b) => !a?.name ? 1 : (!b?.name ? -1 : (b.name.localeCompare(a.name)))
|
||||
: (a, b) => !a?.name ? 1 : (!b?.name ? -1 : (a.name.localeCompare(b.name)))
|
||||
|
||||
const intCmp = sorting.direction == "up"
|
||||
const intCmp = sorting.direction == "asc"
|
||||
? (a, b) => a[sorting.field] - b[sorting.field]
|
||||
: (a, b) => b[sorting.field] - a[sorting.field];
|
||||
|
||||
@@ -141,7 +167,7 @@
|
||||
>
|
||||
{#if sorting?.field == "id"}
|
||||
<!-- Note on Icon-Name: Arrow-indicator always down, only alpha-indicator switches -->
|
||||
<Icon name={`sort-alpha-${sorting?.direction == 'down' ? 'down' : 'down-alt'}`} />
|
||||
<Icon name={`sort-alpha-${sorting?.direction == 'desc' ? 'down' : 'down-alt'}`} />
|
||||
{:else}
|
||||
<Icon name="three-dots-vertical" />
|
||||
{/if}
|
||||
@@ -156,7 +182,7 @@
|
||||
onclick={() => changeSorting("name")}
|
||||
>
|
||||
{#if sorting?.field == "name"}
|
||||
<Icon name={`sort-alpha-${sorting?.direction == 'down' ? 'down' : 'down-alt'}`} />
|
||||
<Icon name={`sort-alpha-${sorting?.direction == 'desc' ? 'down' : 'down-alt'}`} />
|
||||
{:else}
|
||||
<Icon name="three-dots-vertical" />
|
||||
{/if}
|
||||
@@ -172,12 +198,66 @@
|
||||
>
|
||||
{#if sorting?.field == "totalJobs"}
|
||||
<!-- Note on Icon-Name: Arrow-indicator always down, only numeric-indicator switches -->
|
||||
<Icon name={`sort-numeric-${sorting?.direction == 'down' ? 'down-alt' : 'down'}`} />
|
||||
<Icon name={`sort-numeric-${sorting?.direction == 'desc' ? 'down-alt' : 'down'}`} />
|
||||
{:else}
|
||||
<Icon name="three-dots-vertical" />
|
||||
{/if}
|
||||
</Button>
|
||||
</th>
|
||||
<th scope="col">
|
||||
<span class="mr-1">
|
||||
Short Jobs
|
||||
<Icon id="shortjobs-info" style="cursor:help;" size="sm" name="info-circle"/>
|
||||
</span>
|
||||
<Tooltip target={`shortjobs-info`} placement="top">
|
||||
Job duration less than {formatDurationTime(shortDuration)}
|
||||
</Tooltip>
|
||||
  <!-- Narrow Non-Breaking Space -->
|
||||
<Button
|
||||
color={sorting.field == "shortJobs" ? "primary" : "light"}
|
||||
size="sm"
|
||||
onclick={() => changeSorting("shortJobs")}
|
||||
>
|
||||
{#if sorting?.field == "shortJobs"}
|
||||
<!-- Note on Icon-Name: Arrow-indicator always down, only numeric-indicator switches -->
|
||||
<Icon name={`sort-numeric-${sorting?.direction == 'desc' ? 'down-alt' : 'down'}`} />
|
||||
{:else}
|
||||
<Icon name="three-dots-vertical" />
|
||||
{/if}
|
||||
</Button>
|
||||
</th>
|
||||
{#if fetchRunning}
|
||||
<th scope="col">
|
||||
Total Cores
|
||||
<Button
|
||||
color={sorting.field == "totalCores" ? "primary" : "light"}
|
||||
size="sm"
|
||||
onclick={() => changeSorting("totalCores")}
|
||||
>
|
||||
{#if sorting?.field == "totalJCores"}
|
||||
<!-- Note on Icon-Name: Arrow-indicator always down, only numeric-indicator switches -->
|
||||
<Icon name={`sort-numeric-${sorting?.direction == 'desc' ? 'down-alt' : 'down'}`} />
|
||||
{:else}
|
||||
<Icon name="three-dots-vertical" />
|
||||
{/if}
|
||||
</Button>
|
||||
</th>
|
||||
<th scope="col">
|
||||
Total Accelerators
|
||||
<Button
|
||||
color={sorting.field == "totalAccs" ? "primary" : "light"}
|
||||
size="sm"
|
||||
onclick={() => changeSorting("totalAccs")}
|
||||
>
|
||||
{#if sorting?.field == "totalAccs"}
|
||||
<!-- Note on Icon-Name: Arrow-indicator always down, only numeric-indicator switches -->
|
||||
<Icon name={`sort-numeric-${sorting?.direction == 'desc' ? 'down-alt' : 'down'}`} />
|
||||
{:else}
|
||||
<Icon name="three-dots-vertical" />
|
||||
{/if}
|
||||
</Button>
|
||||
</th>
|
||||
{/if}
|
||||
<th scope="col">
|
||||
Total Walltime
|
||||
<Button
|
||||
@@ -186,7 +266,7 @@
|
||||
onclick={() => changeSorting("totalWalltime")}
|
||||
>
|
||||
{#if sorting?.field == "totalWalltime"}
|
||||
<Icon name={`sort-numeric-${sorting?.direction == 'down' ? 'down-alt' : 'down'}`} />
|
||||
<Icon name={`sort-numeric-${sorting?.direction == 'desc' ? 'down-alt' : 'down'}`} />
|
||||
{:else}
|
||||
<Icon name="three-dots-vertical" />
|
||||
{/if}
|
||||
@@ -200,7 +280,7 @@
|
||||
onclick={() => changeSorting("totalCoreHours")}
|
||||
>
|
||||
{#if sorting?.field == "totalCoreHours"}
|
||||
<Icon name={`sort-numeric-${sorting?.direction == 'down' ? 'down-alt' : 'down'}`} />
|
||||
<Icon name={`sort-numeric-${sorting?.direction == 'desc' ? 'down-alt' : 'down'}`} />
|
||||
{:else}
|
||||
<Icon name="three-dots-vertical" />
|
||||
{/if}
|
||||
@@ -214,7 +294,7 @@
|
||||
onclick={() => changeSorting("totalAccHours")}
|
||||
>
|
||||
{#if sorting?.field == "totalAccHours"}
|
||||
<Icon name={`sort-numeric-${sorting?.direction == 'down' ? 'down-alt' : 'down'}`} />
|
||||
<Icon name={`sort-numeric-${sorting?.direction == 'desc' ? 'down-alt' : 'down'}`} />
|
||||
{:else}
|
||||
<Icon name="three-dots-vertical" />
|
||||
{/if}
|
||||
@@ -225,11 +305,11 @@
|
||||
<tbody>
|
||||
{#if $stats.fetching}
|
||||
<tr>
|
||||
<td colspan="4" style="text-align: center;"><Spinner secondary /></td>
|
||||
<td colspan={numCols} style="text-align: center;"><Spinner secondary /></td>
|
||||
</tr>
|
||||
{:else if $stats.error}
|
||||
<tr>
|
||||
<td colspan="4"
|
||||
<td colspan={numCols}
|
||||
><Card body color="danger" class="mb-3">{$stats.error.message}</Card
|
||||
></td
|
||||
>
|
||||
@@ -260,13 +340,18 @@
|
||||
>
|
||||
{/if}
|
||||
<td>{row.totalJobs}</td>
|
||||
<td>{row.shortJobs}</td>
|
||||
{#if fetchRunning}
|
||||
<td>{row.totalCores}</td>
|
||||
<td>{row.totalAccs}</td>
|
||||
{/if}
|
||||
<td>{row.totalWalltime}</td>
|
||||
<td>{row.totalCoreHours}</td>
|
||||
<td>{row.totalAccHours}</td>
|
||||
</tr>
|
||||
{:else}
|
||||
<tr>
|
||||
<td colspan="4"><i>No {type.toLowerCase()}s/jobs found</i></td>
|
||||
<td colspan={numCols}><i>No {type.toLowerCase()}s/jobs found</i></td>
|
||||
</tr>
|
||||
{/each}
|
||||
{/if}
|
||||
|
||||
@@ -32,7 +32,7 @@
|
||||
let {
|
||||
matchedListJobs = $bindable(0),
|
||||
selectedJobs = $bindable([]),
|
||||
metrics = getContext("cc-config").metricConfig_jobListMetrics,
|
||||
metrics = [],
|
||||
sorting = { field: "startTime", type: "col", order: "DESC" },
|
||||
showFootprint = false,
|
||||
filterBuffer = [],
|
||||
@@ -109,7 +109,7 @@
|
||||
let paging = $derived({ itemsPerPage, page });
|
||||
const plotWidth = $derived.by(() => {
|
||||
return Math.floor(
|
||||
(tableWidth - jobInfoColumnWidth) / (metrics.length + (showFootprint ? 1 : 0)) - 10,
|
||||
(tableWidth - jobInfoColumnWidth) / (metrics.length + (showFootprint ? 2 : 1)) - 10,
|
||||
);
|
||||
});
|
||||
let jobsStore = $derived(queryStore({
|
||||
|
||||
@@ -133,7 +133,7 @@
|
||||
}
|
||||
</script>
|
||||
|
||||
<Card class="mt-1 overflow-auto" style="width: {width}; height: {height}">
|
||||
<Card class="mx-2 overflow-auto" style="width: {width}; height: {height}">
|
||||
{#if displayTitle}
|
||||
<CardHeader>
|
||||
<CardTitle class="mb-0 d-flex justify-content-center">
|
||||
|
||||
@@ -79,6 +79,7 @@
|
||||
|
||||
/* Derived */
|
||||
const jobId = $derived(job?.id);
|
||||
const refinedData = $derived($metricsQuery?.data?.jobMetrics ? sortAndSelectScope($metricsQuery.data.jobMetrics) : []);
|
||||
const scopes = $derived.by(() => {
|
||||
if (job.numNodes == 1) {
|
||||
if (job.numAcc >= 1) return ["core", "accelerator"];
|
||||
@@ -202,40 +203,45 @@
|
||||
/>
|
||||
</td>
|
||||
{/if}
|
||||
{#each sortAndSelectScope($metricsQuery.data.jobMetrics) as metric, i (metric?.name || i)}
|
||||
{#each refinedData as metric, i (metric?.name || i)}
|
||||
<td>
|
||||
<!-- Subluster Metricconfig remove keyword for jobtables (joblist main, user joblist, project joblist) to be used here as toplevel case-->
|
||||
{#if metric.disabled == false && metric.data}
|
||||
<MetricPlot
|
||||
onZoom={(detail) => handleZoom(detail, metric.data.name)}
|
||||
height={plotHeight}
|
||||
timestep={metric.data.metric.timestep}
|
||||
scope={metric.data.scope}
|
||||
series={metric.data.metric.series}
|
||||
statisticsSeries={metric.data.metric.statisticsSeries}
|
||||
metric={metric.data.name}
|
||||
cluster={cluster.find((c) => c.name == job.cluster)}
|
||||
subCluster={job.subCluster}
|
||||
isShared={job.shared != "none"}
|
||||
numhwthreads={job.numHWThreads}
|
||||
numaccs={job.numAcc}
|
||||
zoomState={zoomStates[metric.data.name] || null}
|
||||
thresholdState={thresholdStates[metric.data.name] || null}
|
||||
/>
|
||||
{:else if metric.disabled == true && metric.data}
|
||||
<Card body color="info"
|
||||
>Metric disabled for subcluster <code
|
||||
>{metric.data.name}:{job.subCluster}</code
|
||||
></Card
|
||||
>
|
||||
{:else}
|
||||
<Card body class="mx-2" color="warning">
|
||||
<p>No dataset(s) returned for <b>{metrics[i]}</b></p>
|
||||
<p class="mb-1">Metric or host was not found in metric store for cluster <b>{job.cluster}</b>:</p>
|
||||
<p class="mb-1">Identical messages in <i>{metrics[i]} column</i>: Metric not found.</p>
|
||||
<p class="mb-1">Identical messages in <i>job {job.jobId} row</i>: Host not found.</p>
|
||||
</Card>
|
||||
{/if}
|
||||
{#key metric}
|
||||
{#if metric?.data}
|
||||
{#if metric?.disabled}
|
||||
<Card body class="mx-2" color="info">
|
||||
Metric <b>{metric.data.name}</b>: Disabled for subcluster <code>{job.subCluster}</code>
|
||||
</Card>
|
||||
{:else}
|
||||
<MetricPlot
|
||||
onZoom={(detail) => handleZoom(detail, metric.data.name)}
|
||||
height={plotHeight}
|
||||
timestep={metric.data.metric.timestep}
|
||||
scope={metric.data.scope}
|
||||
series={metric.data.metric.series}
|
||||
statisticsSeries={metric.data.metric.statisticsSeries}
|
||||
metric={metric.data.name}
|
||||
cluster={cluster.find((c) => c.name == job.cluster)}
|
||||
subCluster={job.subCluster}
|
||||
isShared={job.shared != "none"}
|
||||
numhwthreads={job.numHWThreads}
|
||||
numaccs={job.numAcc}
|
||||
zoomState={zoomStates[metric.data.name] || null}
|
||||
thresholdState={thresholdStates[metric.data.name] || null}
|
||||
/>
|
||||
{/if}
|
||||
{:else}
|
||||
<Card body class="mx-2" color="warning">
|
||||
<p>No dataset(s) returned for <b>{metrics[i]}</b></p>
|
||||
<p class="mb-1">Metric or host was not found in metric store for cluster <b>{job.cluster}</b>:</p>
|
||||
<p class="mb-1">Identical messages in <i>{metrics[i]} column</i>: Metric not found.</p>
|
||||
<p class="mb-1">Identical messages in <i>job {job.jobId} row</i>: Host not found.</p>
|
||||
</Card>
|
||||
{/if}
|
||||
{/key}
|
||||
</td>
|
||||
{:else}
|
||||
<td>
|
||||
<Card body class="mx-2">No metrics selected for display.</Card>
|
||||
</td>
|
||||
{/each}
|
||||
{/if}
|
||||
|
||||
@@ -79,7 +79,7 @@
|
||||
// X
|
||||
let pendingSeries = [
|
||||
{
|
||||
label: "Runtime",
|
||||
label: "Time",
|
||||
value: (u, ts, sidx, didx) =>
|
||||
(didx == null) ? null : formatDurationTime(ts, forNode),
|
||||
}
|
||||
|
||||
@@ -34,6 +34,9 @@
|
||||
/*Const Init */
|
||||
const { query: initq } = init();
|
||||
const useCbColors = getContext("cc-config")?.plotConfiguration_colorblindMode || false
|
||||
|
||||
/* Derived */
|
||||
const subClusters = $derived($initq?.data?.clusters?.find((c) => c.name == presetCluster)?.subClusters || []);
|
||||
</script>
|
||||
|
||||
<!-- Loading indicator & Refresh -->
|
||||
@@ -66,11 +69,21 @@
|
||||
</CardBody>
|
||||
</TabPane>
|
||||
|
||||
<TabPane tabId="usage-dash" tab="Usage">
|
||||
<TabPane tabId="usage-dash" tab="Cluster Usage">
|
||||
<CardBody>
|
||||
<UsageDash {presetCluster} {useCbColors}></UsageDash>
|
||||
</CardBody>
|
||||
</TabPane>
|
||||
|
||||
{#if subClusters?.length > 1}
|
||||
{#each subClusters.map(sc => sc.name) as scn}
|
||||
<TabPane tabId="{scn}-usage-dash" tab="{scn.charAt(0).toUpperCase() + scn.slice(1)} Usage">
|
||||
<CardBody>
|
||||
<UsageDash {presetCluster} presetSubCluster={scn} {useCbColors}></UsageDash>
|
||||
</CardBody>
|
||||
</TabPane>
|
||||
{/each}
|
||||
{/if}
|
||||
|
||||
<TabPane tabId="metric-dash" tab="Statistics">
|
||||
<CardBody>
|
||||
|
||||
@@ -3,6 +3,9 @@
|
||||
|
||||
Properties:
|
||||
- `presetCluster String`: The cluster to show status information for
|
||||
- `presetSubCluster String?`: The subCluster to show status information for [Default: null]
|
||||
- `useCbColors Bool?`: Use colorblind friendly colors [Default: false]
|
||||
- `useAltColors Bool?`: Use alternative color set [Default: false]
|
||||
-->
|
||||
|
||||
<script>
|
||||
@@ -35,6 +38,7 @@
|
||||
/* Svelte 5 Props */
|
||||
let {
|
||||
presetCluster,
|
||||
presetSubCluster = null,
|
||||
useCbColors = false,
|
||||
useAltColors = false
|
||||
} = $props();
|
||||

@@ -52,7 +56,12 @@
let numDurationBins = $state("1h");

/* Derived */
let cluster = $derived(presetCluster)
const canvasPrefix = $derived(`${presetCluster}-${presetSubCluster ? presetSubCluster : ''}`)

const statusFilter = $derived(presetSubCluster
? [{ state: ["running"] }, { cluster: { eq: presetCluster} }, { partition: { eq: presetSubCluster } }]
: [{ state: ["running"] }, { cluster: { eq: presetCluster} }]
);
const topJobsQuery = $derived(queryStore({
client: client,
query: gql`
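The dashboard queries below now share this single derived filter instead of each building its own; note that the subcluster is matched via the partition filter field. A small sketch of how the derivation resolves (example values assumed, not from the diff):

// assumed example values
const presetCluster = "fritz";
const presetSubCluster = "spr";

const statusFilter = presetSubCluster
  ? [{ state: ["running"] }, { cluster: { eq: presetCluster } }, { partition: { eq: presetSubCluster } }]
  : [{ state: ["running"] }, { cluster: { eq: presetCluster } }];

console.log(JSON.stringify(statusFilter));
// [{"state":["running"]},{"cluster":{"eq":"fritz"}},{"partition":{"eq":"spr"}}]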

@@ -82,7 +91,7 @@
}
`,
variables: {
filter: [{ state: ["running"] }, { cluster: { eq: cluster} }],
filter: statusFilter,
paging: pagingState // Top 10
},
requestPolicy: "network-only"

@@ -117,7 +126,7 @@
}
`,
variables: {
filter: [{ state: ["running"] }, { cluster: { eq: cluster } }],
filter: statusFilter,
paging: pagingState
},
requestPolicy: "network-only"

@@ -152,7 +161,7 @@
}
`,
variables: {
filter: [{ state: ["running"] }, { cluster: { eq: cluster } }],
filter: statusFilter,
paging: pagingState
},
requestPolicy: "network-only"

@@ -184,7 +193,7 @@
}
`,
variables: {
filter: [{ state: ["running"] }, { cluster: { eq: cluster } }],
filter: statusFilter,
selectedHistograms: selectedHistograms, // No Metrics requested for node hardware stats
numDurationBins: numDurationBins,
},

@@ -264,7 +273,7 @@
</h4>
<Pie
{useAltColors}
canvasId="hpcpie-jobs-users"
canvasId="{canvasPrefix}-hpcpie-jobs-users"
size={colWidthJobs * 0.75}
sliceLabel="Jobs"
quantities={$topJobsQuery.data.topUser.map(
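Since the status page can now mount several UsageDash instances (one per subcluster tab), every Pie canvas and tooltip target gets the cluster/subcluster prefix so the DOM ids stay unique across instances. A small sketch of the resulting ids (the helper name and values are illustrative only):

// illustrative helper, not part of the component
const canvasId = (presetCluster, presetSubCluster, suffix) =>
  `${presetCluster}-${presetSubCluster ? presetSubCluster : ''}-${suffix}`;

console.log(canvasId("fritz", null, "hpcpie-jobs-users"));  // "fritz--hpcpie-jobs-users"
console.log(canvasId("fritz", "spr", "hpcpie-jobs-users")); // "fritz-spr-hpcpie-jobs-users"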

@@ -284,14 +293,14 @@
{#each $topJobsQuery.data.topUser as tu, i}
<tr>
<td><Icon name="circle-fill" style="color: {legendColors(i)};" /></td>
<td id="topName-jobs-{tu.id}">
<a target="_blank" href="/monitoring/user/{tu.id}?cluster={cluster}&state=running"
<td id="{canvasPrefix}-topName-jobs-{tu.id}">
<a target="_blank" href="/monitoring/user/{tu.id}?cluster={presetCluster}{presetSubCluster ? '&partition='+presetSubCluster : ''}&state=running"
>{scrambleNames ? scramble(tu.id) : tu.id}
</a>
</td>
{#if tu?.name}
<Tooltip
target={`topName-jobs-${tu.id}`}
target={`${canvasPrefix}-topName-jobs-${tu.id}`}
placement="left"
>{scrambleNames ? scramble(tu.name) : tu.name}</Tooltip
>

@@ -308,7 +317,7 @@
</h4>
<Pie
{useAltColors}
canvasId="hpcpie-jobs-projects"
canvasId="{canvasPrefix}-hpcpie-jobs-projects"
size={colWidthJobs * 0.75}
sliceLabel={'Jobs'}
quantities={$topJobsQuery.data.topProjects.map(

@@ -328,7 +337,7 @@
<tr>
<td><Icon name="circle-fill" style="color: {legendColors(i)};" /></td>
<td>
<a target="_blank" href="/monitoring/jobs/?cluster={cluster}&state=running&project={tp.id}&projectMatch=eq"
<a target="_blank" href="/monitoring/jobs/?cluster={presetCluster}{presetSubCluster ? '&partition='+presetSubCluster : ''}&state=running&project={tp.id}&projectMatch=eq"
>{scrambleNames ? scramble(tp.id) : tp.id}
</a>
</td>

@@ -368,7 +377,7 @@
</h4>
<Pie
{useAltColors}
canvasId="hpcpie-nodes-users"
canvasId="{canvasPrefix}-hpcpie-nodes-users"
size={colWidthNodes * 0.75}
sliceLabel="Nodes"
quantities={$topNodesQuery.data.topUser.map(

@@ -388,14 +397,14 @@
{#each $topNodesQuery.data.topUser as tu, i}
<tr>
<td><Icon name="circle-fill" style="color: {legendColors(i)};" /></td>
<td id="topName-nodes-{tu.id}">
<a target="_blank" href="/monitoring/user/{tu.id}?cluster={cluster}&state=running"
<td id="{canvasPrefix}-topName-nodes-{tu.id}">
<a target="_blank" href="/monitoring/user/{tu.id}?cluster={presetCluster}{presetSubCluster ? '&partition='+presetSubCluster : ''}&state=running"
>{scrambleNames ? scramble(tu.id) : tu.id}
</a>
</td>
{#if tu?.name}
<Tooltip
target={`topName-nodes-${tu.id}`}
target={`${canvasPrefix}-topName-nodes-${tu.id}`}
placement="left"
>{scrambleNames ? scramble(tu.name) : tu.name}</Tooltip
>

@@ -412,7 +421,7 @@
</h4>
<Pie
{useAltColors}
canvasId="hpcpie-nodes-projects"
canvasId="{canvasPrefix}-hpcpie-nodes-projects"
size={colWidthNodes * 0.75}
sliceLabel={'Nodes'}
quantities={$topNodesQuery.data.topProjects.map(

@@ -432,7 +441,7 @@
<tr>
<td><Icon name="circle-fill" style="color: {legendColors(i)};" /></td>
<td>
<a target="_blank" href="/monitoring/jobs/?cluster={cluster}&state=running&project={tp.id}&projectMatch=eq"
<a target="_blank" href="/monitoring/jobs/?cluster={presetCluster}{presetSubCluster ? '&partition='+presetSubCluster : ''}&state=running&project={tp.id}&projectMatch=eq"
>{scrambleNames ? scramble(tp.id) : tp.id}
</a>
</td>

@@ -472,7 +481,7 @@
</h4>
<Pie
{useAltColors}
canvasId="hpcpie-accs-users"
canvasId="{canvasPrefix}-hpcpie-accs-users"
size={colWidthAccs * 0.75}
sliceLabel="GPUs"
quantities={$topAccsQuery.data.topUser.map(

@@ -492,14 +501,14 @@
{#each $topAccsQuery.data.topUser as tu, i}
<tr>
<td><Icon name="circle-fill" style="color: {legendColors(i)};" /></td>
<td id="topName-accs-{tu.id}">
<a target="_blank" href="/monitoring/user/{tu.id}?cluster={cluster}&state=running"
<td id="{canvasPrefix}-topName-accs-{tu.id}">
<a target="_blank" href="/monitoring/user/{tu.id}?cluster={presetCluster}{presetSubCluster ? '&partition='+presetSubCluster : ''}&state=running"
>{scrambleNames ? scramble(tu.id) : tu.id}
</a>
</td>
{#if tu?.name}
<Tooltip
target={`topName-accs-${tu.id}`}
target={`${canvasPrefix}-topName-accs-${tu.id}`}
placement="left"
>{scrambleNames ? scramble(tu.name) : tu.name}</Tooltip
>

@@ -516,7 +525,7 @@
</h4>
<Pie
{useAltColors}
canvasId="hpcpie-accs-projects"
canvasId="{canvasPrefix}-hpcpie-accs-projects"
size={colWidthAccs * 0.75}
sliceLabel={'GPUs'}
quantities={$topAccsQuery.data.topProjects.map(

@@ -536,7 +545,7 @@
<tr>
<td><Icon name="circle-fill" style="color: {legendColors(i)};" /></td>
<td>
<a target="_blank" href="/monitoring/jobs/?cluster={cluster}&state=running&project={tp.id}&projectMatch=eq"
<a target="_blank" href="/monitoring/jobs/?cluster={presetCluster}{presetSubCluster ? '&partition='+presetSubCluster : ''}&state=running&project={tp.id}&projectMatch=eq"
>{scrambleNames ? scramble(tp.id) : tp.id}
</a>
</td>

@@ -69,9 +69,9 @@
})
);

let extendedLegendData = $derived($nodeJobsData?.data ? buildExtendedLegend() : null);
let refinedData = $derived(nodeData?.metrics ? sortAndSelectScope(nodeData.metrics) : null);
let dataHealth = $derived(refinedData.filter((rd) => rd.disabled === false).map((enabled) => (enabled?.data?.metric?.series?.length > 0)));
const extendedLegendData = $derived($nodeJobsData?.data ? buildExtendedLegend() : null);
const refinedData = $derived(nodeData?.metrics ? sortAndSelectScope(nodeData.metrics) : []);
const dataHealth = $derived(refinedData.filter((rd) => rd.disabled === false).map((enabled) => (enabled?.data?.metric?.series?.length > 0)));

/* Functions */
const selectScope = (nodeMetrics) =>
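The switch from let to const matches the other $derived declarations, and the fallback changes from null to an empty array so the dataHealth derivation can always call .filter safely before the node data has loaded. A minimal sketch of the difference, with stubbed data and a stubbed helper for illustration only:

// stubbed stand-ins, not the component's real data or helper
const sortAndSelectScope = (metrics) => metrics;
const nodeData = { metrics: null }; // e.g. query not resolved yet

const refinedOld = nodeData?.metrics ? sortAndSelectScope(nodeData.metrics) : null;
const refinedNew = nodeData?.metrics ? sortAndSelectScope(nodeData.metrics) : [];

// refinedOld.filter(...) would throw: Cannot read properties of null (reading 'filter')
console.log(refinedNew.filter((rd) => rd.disabled === false).length); // 0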