Mirror of https://github.com/ClusterCockpit/cc-backend (synced 2026-02-11 21:41:46 +01:00)

.gitignore (vendored)
@@ -13,7 +13,7 @@
 /var/checkpoints*
 
 migrateTimestamps.pl
-test_ccms_write_api*
+test_ccms_*
 
 /web/frontend/public/build
 /web/frontend/node_modules
@@ -135,36 +135,3 @@ func debugMetrics(rw http.ResponseWriter, r *http.Request) {
     return
   }
 }
-
-// handleHealthCheck godoc
-// @summary HealthCheck endpoint
-// @tags healthcheck
-// @description This endpoint allows the users to check if a node is healthy
-// @produce json
-// @param selector query string false "Selector"
-// @success 200 {string} string "Debug dump"
-// @failure 400 {object} api.ErrorResponse "Bad Request"
-// @failure 401 {object} api.ErrorResponse "Unauthorized"
-// @failure 403 {object} api.ErrorResponse "Forbidden"
-// @failure 500 {object} api.ErrorResponse "Internal Server Error"
-// @security ApiKeyAuth
-// @router /healthcheck/ [get]
-func metricsHealth(rw http.ResponseWriter, r *http.Request) {
-  rawCluster := r.URL.Query().Get("cluster")
-  rawNode := r.URL.Query().Get("node")
-
-  if rawCluster == "" || rawNode == "" {
-    handleError(errors.New("'cluster' and 'node' are required query parameter"), http.StatusBadRequest, rw)
-    return
-  }
-
-  rw.Header().Add("Content-Type", "application/json")
-
-  selector := []string{rawCluster, rawNode}
-
-  ms := metricstore.GetMemoryStore()
-  if err := ms.HealthCheck(bufio.NewWriter(rw), selector); err != nil {
-    handleError(err, http.StatusBadRequest, rw)
-    return
-  }
-}
@@ -324,11 +324,12 @@ func (api *NatsAPI) processNodestateEvent(msg lp.CCMessage) {
   }
 
   repo := repository.GetNodeRepository()
+  requestReceived := time.Now().Unix()
 
   for _, node := range req.Nodes {
     state := determineState(node.States)
     nodeState := schema.NodeStateDB{
-      TimeStamp: time.Now().Unix(),
+      TimeStamp: requestReceived,
       NodeState: state,
       CpusAllocated: node.CpusAllocated,
       MemoryAllocated: node.MemoryAllocated,
@@ -7,11 +7,14 @@ package api
 
 import (
   "fmt"
+  "maps"
   "net/http"
   "strings"
   "time"
 
   "github.com/ClusterCockpit/cc-backend/internal/repository"
+  "github.com/ClusterCockpit/cc-backend/pkg/archive"
+  "github.com/ClusterCockpit/cc-backend/pkg/metricstore"
   "github.com/ClusterCockpit/cc-lib/v2/schema"
 )
 
@@ -20,6 +23,15 @@ type UpdateNodeStatesRequest struct {
   Cluster string `json:"cluster" example:"fritz"`
 }
 
+// metricListToNames converts a map of metric configurations to a list of metric names
+func metricListToNames(metricList map[string]*schema.Metric) []string {
+  names := make([]string, 0, len(metricList))
+  for name := range metricList {
+    names = append(names, name)
+  }
+  return names
+}
+
 // this routine assumes that only one of them exists per node
 func determineState(states []string) schema.SchedulerState {
   for _, state := range states {
@@ -62,16 +74,42 @@ func (api *RestAPI) updateNodeStates(rw http.ResponseWriter, r *http.Request) {
       http.StatusBadRequest, rw)
     return
   }
+  requestReceived := time.Now().Unix()
   repo := repository.GetNodeRepository()
+  ms := metricstore.GetMemoryStore()
+
+  m := make(map[string][]string)
+  healthStates := make(map[string]schema.MonitoringState)
+
+  for _, node := range req.Nodes {
+    if sc, err := archive.GetSubClusterByNode(req.Cluster, node.Hostname); err == nil {
+      m[sc] = append(m[sc], node.Hostname)
+    }
+  }
+
+  for sc, nl := range m {
+    if sc != "" {
+      metricList := archive.GetMetricConfigSubCluster(req.Cluster, sc)
+      metricNames := metricListToNames(metricList)
+      if states, err := ms.HealthCheck(req.Cluster, nl, metricNames); err == nil {
+        maps.Copy(healthStates, states)
+      }
+    }
+  }
 
   for _, node := range req.Nodes {
     state := determineState(node.States)
+    healthState := schema.MonitoringStateFailed
+    if hs, ok := healthStates[node.Hostname]; ok {
+      healthState = hs
+    }
     nodeState := schema.NodeStateDB{
-      TimeStamp: time.Now().Unix(), NodeState: state,
+      TimeStamp: requestReceived,
+      NodeState: state,
      CpusAllocated: node.CpusAllocated,
      MemoryAllocated: node.MemoryAllocated,
      GpusAllocated: node.GpusAllocated,
-      HealthState: schema.MonitoringStateFull,
+      HealthState: healthState,
      JobsRunning: node.JobsRunning,
     }
 
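In the hunk above every node's HealthState starts out as MonitoringStateFailed and is only upgraded when the per-subcluster HealthCheck call produced an entry for that hostname; maps.Copy is what folds each subcluster's result map into the shared healthStates map, overwriting entries on key collisions. A minimal standalone sketch of that overwrite behaviour, with plain strings standing in for schema.MonitoringState and invented map contents:

    package main

    import (
      "fmt"
      "maps"
    )

    func main() {
      // Default assumption: every node is "failed" until a health check says otherwise.
      states := map[string]string{"node001": "failed", "node002": "failed"}
      // Hypothetical result of one per-subcluster health check.
      checked := map[string]string{"node001": "full"}

      maps.Copy(states, checked) // overwrites matching keys, leaves the rest untouched
      fmt.Println(states)        // map[node001:full node002:failed]
    }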
@@ -81,7 +81,7 @@ func (api *RestAPI) MountAPIRoutes(r *mux.Router) {
   // Cluster List
   r.HandleFunc("/clusters/", api.getClusters).Methods(http.MethodGet)
   // Slurm node state
-  r.HandleFunc("/nodestate/", api.updateNodeStates).Methods(http.MethodPost, http.MethodPut)
+  r.HandleFunc("/nodestates/", api.updateNodeStates).Methods(http.MethodPost, http.MethodPut)
   // Job Handler
   if config.Keys.APISubjects == nil {
     cclog.Info("Enabling REST start/stop job API")
@@ -127,12 +127,12 @@ func (api *RestAPI) MountMetricStoreAPIRoutes(r *mux.Router) {
   r.HandleFunc("/free", freeMetrics).Methods(http.MethodPost)
   r.HandleFunc("/write", writeMetrics).Methods(http.MethodPost)
   r.HandleFunc("/debug", debugMetrics).Methods(http.MethodGet)
-  r.HandleFunc("/healthcheck", metricsHealth).Methods(http.MethodGet)
+  r.HandleFunc("/healthcheck", api.updateNodeStates).Methods(http.MethodPost)
   // Same endpoints but with trailing slash
   r.HandleFunc("/free/", freeMetrics).Methods(http.MethodPost)
   r.HandleFunc("/write/", writeMetrics).Methods(http.MethodPost)
   r.HandleFunc("/debug/", debugMetrics).Methods(http.MethodGet)
-  r.HandleFunc("/healthcheck/", metricsHealth).Methods(http.MethodGet)
+  r.HandleFunc("/healthcheck/", api.updateNodeStates).Methods(http.MethodPost)
 }
 
 // MountConfigAPIRoutes registers configuration and user management endpoints.
@@ -923,15 +923,19 @@ func (r *queryResolver) ClusterMetrics(ctx context.Context, cluster string, metr
       if !okData && len(ser.Data) != 0 {
         collectorData[metric] = make([]schema.Float, len(ser.Data))
       } else if !okData {
-        cclog.Debugf("ClusterMetrics Skip Init: No Data -> %s at %s; Size %d", metric, ser.Hostname, len(ser.Data))
+        cclog.Debugf("[SCHEMARESOLVER] clusterMetrics skip init: no data -> %s at %s; size %d", metric, ser.Hostname, len(ser.Data))
       }
       // Sum if init'd and matching size
       if okData && len(ser.Data) == len(collectorData[metric]) {
         for i, val := range ser.Data {
+          if val.IsNaN() {
+            continue
+          } else {
             collectorData[metric][i] += val
           }
+        }
       } else if okData {
-        cclog.Debugf("ClusterMetrics Skip Sum: Data Diff -> %s at %s; Want Size %d, Have Size %d", metric, ser.Hostname, len(collectorData[metric]), len(ser.Data))
+        cclog.Debugf("[SCHEMARESOLVER] clusterMetrics skip sum: data diff -> %s at %s; want size %d, have size %d", metric, ser.Hostname, len(collectorData[metric]), len(ser.Data))
       }
     }
   }
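The new IsNaN guard in the sum matters because one NaN sample would otherwise poison the whole aggregated series: in floating-point arithmetic NaN propagates through every further addition, so the summed metric would look missing even where data exists. A small self-contained illustration, using plain Go floats rather than schema.Float:

    package main

    import (
      "fmt"
      "math"
    )

    func main() {
      series := []float64{1, 2, math.NaN(), 4}

      naive, guarded := 0.0, 0.0
      for _, v := range series {
        naive += v // a single NaN turns the running sum into NaN for good
        if !math.IsNaN(v) {
          guarded += v // skipping NaN keeps the partial sum usable
        }
      }
      fmt.Println(naive, guarded) // NaN 7
    }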
@@ -466,7 +466,7 @@ func (r *JobRepository) JobCountGrouped(
 // AddJobCountGrouped augments existing statistics with additional job counts by category.
 //
 // This method enriches JobsStatistics returned by JobsStatsGrouped or JobCountGrouped
-// with counts of running or short-running jobs, matched by group ID.
+// with counts of running or short-running (based on ShortRunningJobsDuration) jobs, matched by group ID.
 //
 // Parameters:
 //   - ctx: Context for security checks
@@ -158,8 +158,7 @@ func cleanupCheckpoints(dir string, cleanupDir string, from int64, deleteInstead
     return 0, err
   }
 
-  extension := Keys.Checkpoints.FileFormat
-  files, err := findFiles(entries, from, extension, false)
+  files, err := findFiles(entries, from, false)
   if err != nil {
     return 0, err
   }
@@ -415,7 +415,7 @@ func enqueueCheckpointHosts(dir string, work chan<- [2]string) error {
 //
 // Uses worker pool to load cluster/host combinations. Periodically triggers GC
 // to prevent excessive heap growth. Returns number of files loaded and any errors.
-func (m *MemoryStore) FromCheckpoint(dir string, from int64, extension string) (int, error) {
+func (m *MemoryStore) FromCheckpoint(dir string, from int64) (int, error) {
   var wg sync.WaitGroup
   work := make(chan [2]string, Keys.NumWorkers*4)
   n, errs := int32(0), int32(0)
@@ -426,7 +426,7 @@ func (m *MemoryStore) FromCheckpoint(dir string, from int64, extension string) (
       defer wg.Done()
       for host := range work {
         lvl := m.root.findLevelOrCreate(host[:], len(m.Metrics))
-        nn, err := lvl.fromCheckpoint(m, filepath.Join(dir, host[0], host[1]), from, extension)
+        nn, err := lvl.fromCheckpoint(m, filepath.Join(dir, host[0], host[1]), from)
         if err != nil {
           cclog.Errorf("[METRICSTORE]> error while loading checkpoints for %s/%s: %s", host[0], host[1], err.Error())
           atomic.AddInt32(&errs, 1)
@@ -465,57 +465,7 @@ func (m *MemoryStore) FromCheckpointFiles(dir string, from int64) (int, error) {
     cclog.Debugf("[METRICSTORE]> %#v Directory created successfully", dir)
   }
 
-  // Config read (replace with your actual config read)
-  fileFormat := Keys.Checkpoints.FileFormat
-  if fileFormat == "" {
-    fileFormat = "avro"
-  }
-
-  // Map to easily get the fallback format
-  oppositeFormat := map[string]string{
-    "json": "avro",
-    "avro": "json",
-  }
-
-  // First, attempt to load the specified format
-  if found, err := checkFilesWithExtension(dir, fileFormat); err != nil {
-    return 0, fmt.Errorf("[METRICSTORE]> error checking files with extension: %v", err)
-  } else if found {
-    cclog.Infof("[METRICSTORE]> Loading %s files because fileformat is %s", fileFormat, fileFormat)
-    return m.FromCheckpoint(dir, from, fileFormat)
-  }
-
-  // If not found, attempt the opposite format
-  altFormat := oppositeFormat[fileFormat]
-  if found, err := checkFilesWithExtension(dir, altFormat); err != nil {
-    return 0, fmt.Errorf("[METRICSTORE]> error checking files with extension: %v", err)
-  } else if found {
-    cclog.Infof("[METRICSTORE]> Loading %s files but fileformat is %s", altFormat, fileFormat)
-    return m.FromCheckpoint(dir, from, altFormat)
-  }
-
-  return 0, nil
-}
-
-// checkFilesWithExtension walks a directory tree to check if files with the given extension exist.
-func checkFilesWithExtension(dir string, extension string) (bool, error) {
-  found := false
-
-  err := filepath.Walk(dir, func(path string, info os.FileInfo, err error) error {
-    if err != nil {
-      return fmt.Errorf("[METRICSTORE]> error accessing path %s: %v", path, err)
-    }
-    if !info.IsDir() && filepath.Ext(info.Name()) == "."+extension {
-      found = true
-      return nil
-    }
-    return nil
-  })
-  if err != nil {
-    return false, fmt.Errorf("[METRICSTORE]> error walking through directories: %s", err)
-  }
-
-  return found, nil
+  return m.FromCheckpoint(dir, from)
 }
 
 func (l *Level) loadAvroFile(m *MemoryStore, f *os.File, from int64) error {
@@ -729,7 +679,7 @@ func (l *Level) loadFile(cf *CheckpointFile, m *MemoryStore) error {
   return nil
 }
 
-func (l *Level) fromCheckpoint(m *MemoryStore, dir string, from int64, extension string) (int, error) {
+func (l *Level) fromCheckpoint(m *MemoryStore, dir string, from int64) (int, error) {
   direntries, err := os.ReadDir(dir)
   if err != nil {
     if os.IsNotExist(err) {
@@ -748,33 +698,38 @@ func (l *Level) fromCheckpoint(m *MemoryStore, dir string, from int64, extension
         children: make(map[string]*Level),
       }
 
-      files, err := child.fromCheckpoint(m, path.Join(dir, e.Name()), from, extension)
+      files, err := child.fromCheckpoint(m, path.Join(dir, e.Name()), from)
       filesLoaded += files
       if err != nil {
         return filesLoaded, err
       }
 
       l.children[e.Name()] = child
-    } else if strings.HasSuffix(e.Name(), "."+extension) {
+    } else if strings.HasSuffix(e.Name(), ".json") || strings.HasSuffix(e.Name(), ".avro") {
       allFiles = append(allFiles, e)
     } else {
       continue
     }
   }
 
-  files, err := findFiles(allFiles, from, extension, true)
+  files, err := findFiles(allFiles, from, true)
   if err != nil {
     return filesLoaded, err
   }
 
   loaders := map[string]func(*MemoryStore, *os.File, int64) error{
-    "json": l.loadJSONFile,
-    "avro": l.loadAvroFile,
+    ".json": l.loadJSONFile,
+    ".avro": l.loadAvroFile,
   }
 
-  loader := loaders[extension]
-
   for _, filename := range files {
+    ext := filepath.Ext(filename)
+    loader := loaders[ext]
+    if loader == nil {
+      cclog.Warnf("Unknown extension for file %s", filename)
+      continue
+    }
+
     // Use a closure to ensure file is closed immediately after use
     err := func() error {
       f, err := os.Open(path.Join(dir, filename))
@@ -798,10 +753,12 @@ func (l *Level) fromCheckpoint(m *MemoryStore, dir string, from int64, extension
 // This will probably get very slow over time!
 // A solution could be some sort of an index file in which all other files
 // and the timespan they contain is listed.
-func findFiles(direntries []fs.DirEntry, t int64, extension string, findMoreRecentFiles bool) ([]string, error) {
+// NOTE: This now assumes that you have distinct timestamps for json and avro files
+// Also, it assumes that the timestamps are not overlapping/self-modified.
+func findFiles(direntries []fs.DirEntry, t int64, findMoreRecentFiles bool) ([]string, error) {
   nums := map[string]int64{}
   for _, e := range direntries {
-    if !strings.HasSuffix(e.Name(), "."+extension) {
+    if !strings.HasSuffix(e.Name(), ".json") && !strings.HasSuffix(e.Name(), ".avro") {
       continue
     }
 
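With the extension parameter removed, each checkpoint file now selects its own loader from the map keyed by ".json"/".avro"; the keys gained a leading dot because filepath.Ext returns the extension including the dot. A standalone sketch of that dispatch pattern, with invented file names and loader labels rather than the repository code:

    package main

    import (
      "fmt"
      "path/filepath"
    )

    func main() {
      loaders := map[string]string{
        ".json": "loadJSONFile",
        ".avro": "loadAvroFile",
      }
      for _, name := range []string{"1700000000.json", "1700000600.avro", "notes.txt"} {
        if loader, ok := loaders[filepath.Ext(name)]; ok {
          fmt.Println(name, "->", loader)
        } else {
          fmt.Println(name, "-> skipped (unknown extension)")
        }
      }
    }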
@@ -6,87 +6,260 @@
 package metricstore
 
 import (
-  "bufio"
+  "cmp"
   "fmt"
+  "slices"
   "time"
 
+  "github.com/ClusterCockpit/cc-lib/v2/schema"
 )
 
+// HealthCheckResponse represents the result of a health check operation.
+//
+// Status indicates the monitoring state (Full, Partial, Failed).
+// Error contains any error encountered during the health check.
+type HealthCheckResponse struct {
+  Status schema.MonitoringState
+  Error error
+}
+
 // MaxMissingDataPoints is a threshold that allows a node to be healthy with certain number of data points missing.
 // Suppose a node does not receive last 5 data points, then healthCheck endpoint will still say a
 // node is healthy. Anything more than 5 missing points in metrics of the node will deem the node unhealthy.
 const MaxMissingDataPoints int64 = 5
 
-// MaxUnhealthyMetrics is a threshold which allows upto certain number of metrics in a node to be unhealthly.
-// Works with MaxMissingDataPoints. Say 5 metrics (including submetrics) do not receive the last
-// MaxMissingDataPoints data points, then the node will be deemed healthy. Any more metrics that does
-// not receive data for MaxMissingDataPoints data points will deem the node unhealthy.
-const MaxUnhealthyMetrics int64 = 5
-
-func (b *buffer) healthCheck() int64 {
+// isBufferHealthy checks if a buffer has received data for the last MaxMissingDataPoints.
+//
+// Returns true if the buffer is healthy (recent data within threshold), false otherwise.
+// A nil buffer or empty buffer is considered unhealthy.
+func (b *buffer) bufferExists() bool {
   // Check if the buffer is empty
-  if b.data == nil {
-    return 1
+  if b == nil || b.data == nil || len(b.data) == 0 {
+    return false
   }
 
+  return true
+}
+
+// isBufferHealthy checks if a buffer has received data for the last MaxMissingDataPoints.
+//
+// Returns true if the buffer is healthy (recent data within threshold), false otherwise.
+// A nil buffer or empty buffer is considered unhealthy.
+func (b *buffer) isBufferHealthy() bool {
+  // Get the last endtime of the buffer
   bufferEnd := b.start + b.frequency*int64(len(b.data))
   t := time.Now().Unix()
 
-  // Check if the buffer is too old
+  // Check if the buffer has recent data (within MaxMissingDataPoints threshold)
   if t-bufferEnd > MaxMissingDataPoints*b.frequency {
-    return 1
+    return false
   }
 
-  return 0
+  return true
 }
 
-func (l *Level) healthCheck(m *MemoryStore, count int64) (int64, error) {
+// MergeUniqueSorted merges two lists, sorts them, and removes duplicates.
+// Requires 'cmp.Ordered' because we need to sort the data.
+func mergeList[string cmp.Ordered](list1, list2 []string) []string {
+  // 1. Combine both lists
+  result := append(list1, list2...)
+
+  // 2. Sort the combined list
+  slices.Sort(result)
+
+  // 3. Compact removes consecutive duplicates (standard in Go 1.21+)
+  // e.g. [1, 1, 2, 3, 3] -> [1, 2, 3]
+  result = slices.Compact(result)
+
+  return result
+}
+
+// getHealthyMetrics recursively collects healthy and degraded metrics at this level and below.
+//
+// A metric is considered:
+// - Healthy: buffer has recent data within MaxMissingDataPoints threshold AND has few/no NaN values
+// - Degraded: buffer exists and has recent data, but contains more than MaxMissingDataPoints NaN values
+//
+// This routine walks the entire subtree starting from the current level.
+//
+// Parameters:
+// - m: MemoryStore containing the global metric configuration
+//
+// Returns:
+// - []string: Flat list of healthy metric names from this level and all children
+// - []string: Flat list of degraded metric names (exist but have too many missing values)
+// - error: Non-nil only for internal errors during recursion
+//
+// The routine mirrors healthCheck() but provides more granular classification:
+// - healthCheck() finds problems (stale/missing)
+// - getHealthyMetrics() separates healthy from degraded metrics
+func (l *Level) getHealthyMetrics(m *MemoryStore, expectedMetrics []string) ([]string, []string, error) {
   l.lock.RLock()
   defer l.lock.RUnlock()
 
-  for _, mc := range m.Metrics {
-    if b := l.metrics[mc.offset]; b != nil {
-      count += b.healthCheck()
+  globalMetrics := m.Metrics
+
+  missingList := make([]string, 0)
+  degradedList := make([]string, 0)
+
+  // Phase 1: Check metrics at this level
+  for _, metricName := range expectedMetrics {
+    offset := globalMetrics[metricName].offset
+    b := l.metrics[offset]
+
+    if !b.bufferExists() {
+      missingList = append(missingList, metricName)
+    } else if !b.isBufferHealthy() {
+      degradedList = append(degradedList, metricName)
     }
   }
 
+  // Phase 2: Recursively check child levels
   for _, lvl := range l.children {
-    c, err := lvl.healthCheck(m, 0)
+    childMissing, childDegraded, err := lvl.getHealthyMetrics(m, expectedMetrics)
     if err != nil {
-      return 0, err
+      return nil, nil, err
     }
-    count += c
-  }
 
-  return count, nil
+    missingList = mergeList(missingList, childMissing)
+    degradedList = mergeList(degradedList, childDegraded)
+  }
+
+  return missingList, degradedList, nil
 }
 
-func (m *MemoryStore) HealthCheck(w *bufio.Writer, selector []string) error {
+// GetHealthyMetrics returns healthy and degraded metrics for a specific node as flat lists.
+//
+// This routine walks the metric tree starting from the specified node selector
+// and collects all metrics that have received data within the last MaxMissingDataPoints
+// (default: 5 data points). Metrics are classified into two categories:
+//
+// - Healthy: Buffer has recent data AND contains few/no NaN (missing) values
+// - Degraded: Buffer has recent data BUT contains more than MaxMissingDataPoints NaN values
+//
+// The returned lists include both node-level metrics (e.g., "load", "mem_used") and
+// hardware-level metrics (e.g., "cpu_user", "gpu_temp") in flat slices.
+//
+// Parameters:
+// - selector: Hierarchical path to the target node, typically []string{cluster, hostname}.
+//   Example: []string{"emmy", "node001"} navigates to the "node001" host in the "emmy" cluster.
+//   The selector must match the hierarchy used during metric ingestion.
+//
+// Returns:
+// - []string: Flat list of healthy metric names (recent data, few missing values)
+// - []string: Flat list of degraded metric names (recent data, many missing values)
+// - error: Non-nil if the node is not found or internal errors occur
+//
+// Example usage:
+//
+//   selector := []string{"emmy", "node001"}
+//   healthyMetrics, degradedMetrics, err := ms.GetHealthyMetrics(selector)
+//   if err != nil {
+//     // Node not found or internal error
+//     return err
+//   }
+//   fmt.Printf("Healthy metrics: %v\n", healthyMetrics)
+//   // Output: ["load", "mem_used", "cpu_user", ...]
+//   fmt.Printf("Degraded metrics: %v\n", degradedMetrics)
+//   // Output: ["gpu_temp", "network_rx", ...] (metrics with many NaN values)
+//
+// Note: This routine provides more granular classification than HealthCheck:
+// - HealthCheck reports stale/missing metrics (problems)
+// - GetHealthyMetrics separates fully healthy from degraded metrics (quality levels)
+func (m *MemoryStore) GetHealthyMetrics(selector []string, expectedMetrics []string) ([]string, []string, error) {
   lvl := m.root.findLevel(selector)
   if lvl == nil {
-    return fmt.Errorf("[METRICSTORE]> not found: %#v", selector)
+    return nil, nil, fmt.Errorf("[METRICSTORE]> error while GetHealthyMetrics, host not found: %#v", selector)
   }
 
-  buf := make([]byte, 0, 25)
-  // buf = append(buf, "{"...)
-
-  var count int64 = 0
-
-  unhealthyMetricsCount, err := lvl.healthCheck(m, count)
+  missingList, degradedList, err := lvl.getHealthyMetrics(m, expectedMetrics)
   if err != nil {
-    return err
+    return nil, nil, err
   }
 
-  if unhealthyMetricsCount < MaxUnhealthyMetrics {
-    buf = append(buf, "Healthy"...)
-  } else {
-    buf = append(buf, "Unhealthy"...)
-  }
-
-  // buf = append(buf, "}\n"...)
-
-  if _, err = w.Write(buf); err != nil {
-    return err
-  }
-
-  return w.Flush()
+  return missingList, degradedList, nil
+}
+
+// HealthCheck performs health checks on multiple nodes and returns their monitoring states.
+//
+// This routine provides a batch health check interface that evaluates multiple nodes
+// against a specific set of expected metrics. For each node, it determines the overall
+// monitoring state based on which metrics are healthy, degraded, or missing.
+//
+// Health Status Classification:
+// - MonitoringStateFull: All expected metrics are healthy (recent data, few missing values)
+// - MonitoringStatePartial: Some metrics are degraded (many missing values) or missing
+// - MonitoringStateFailed: Node not found or all expected metrics are missing/stale
+//
+// Parameters:
+// - cluster: Cluster name (first element of selector path)
+// - nodes: List of node hostnames to check
+// - expectedMetrics: List of metric names that should be present on each node
+//
+// Returns:
+// - map[string]schema.MonitoringState: Map keyed by hostname containing monitoring state for each node
+// - error: Non-nil only for internal errors (individual node failures are captured as MonitoringStateFailed)
+//
+// Example usage:
+//
+//   cluster := "emmy"
+//   nodes := []string{"node001", "node002", "node003"}
+//   expectedMetrics := []string{"load", "mem_used", "cpu_user", "cpu_system"}
+//   healthStates, err := ms.HealthCheck(cluster, nodes, expectedMetrics)
+//   if err != nil {
+//     return err
+//   }
+//   for hostname, state := range healthStates {
+//     fmt.Printf("Node %s: %s\n", hostname, state)
+//   }
+//
+// Note: This routine is optimized for batch operations where you need to check
+// the same set of metrics across multiple nodes.
+func (m *MemoryStore) HealthCheck(cluster string,
+  nodes []string, expectedMetrics []string,
+) (map[string]schema.MonitoringState, error) {
+  results := make(map[string]schema.MonitoringState, len(nodes))
+
+  // Create a set of expected metrics for fast lookup
+  expectedSet := make(map[string]bool, len(expectedMetrics))
+  for _, metric := range expectedMetrics {
+    expectedSet[metric] = true
+  }
+
+  // Check each node
+  for _, hostname := range nodes {
+    selector := []string{cluster, hostname}
+    status := schema.MonitoringStateFull
+    healthyCount := 0
+    degradedCount := 0
+    missingCount := 0
+
+    // Get healthy and degraded metrics for this node
+    missingList, degradedList, err := m.GetHealthyMetrics(selector, expectedMetrics)
+    if err != nil {
+      // Node not found or internal error
+      results[hostname] = schema.MonitoringStateFailed
+      continue
+    }
+
+    missingCount = len(missingList)
+    degradedCount = len(degradedList)
+    healthyCount = len(expectedMetrics) - (missingCount + degradedCount)
+
+    // Determine overall health status
+    if missingCount > 0 || degradedCount > 0 {
+      if healthyCount == 0 {
+        // No healthy metrics at all
+        status = schema.MonitoringStateFailed
+      } else {
+        // Some healthy, some degraded/missing
+        status = schema.MonitoringStatePartial
+      }
+    }
+    // else: all metrics healthy, status remains MonitoringStateFull
+
+    results[hostname] = status
+  }
+
+  return results, nil
 }
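mergeList is what keeps the recursively collected missing/degraded lists free of duplicates: append both slices, sort, then slices.Compact drops the now-adjacent repeats. A short runnable sketch of the same append/sort/compact steps, with metric names invented for illustration:

    package main

    import (
      "fmt"
      "slices"
    )

    func main() {
      a := []string{"load", "mem_used"}
      b := []string{"mem_used", "gpu_temp"}

      merged := append(append([]string{}, a...), b...)
      slices.Sort(merged)
      merged = slices.Compact(merged) // removes adjacent duplicates after sorting
      fmt.Println(merged)             // [gpu_temp load mem_used]
    }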
@@ -7,6 +7,7 @@ package metricstore
 
 import (
   "testing"
+  "time"
 
   "github.com/ClusterCockpit/cc-lib/v2/schema"
 )
@@ -88,3 +89,378 @@ func TestBufferRead(t *testing.T) {
     t.Errorf("buffer.read() len(result) = %d, want 3", len(result))
   }
 }
+
+func TestHealthCheck(t *testing.T) {
+  // Create a test MemoryStore with some metrics
+  metrics := map[string]MetricConfig{
+    "load": {Frequency: 10, Aggregation: AvgAggregation, offset: 0},
+    "mem_used": {Frequency: 10, Aggregation: AvgAggregation, offset: 1},
+    "cpu_user": {Frequency: 10, Aggregation: AvgAggregation, offset: 2},
+    "cpu_system": {Frequency: 10, Aggregation: AvgAggregation, offset: 3},
+  }
+
+  ms := &MemoryStore{
+    Metrics: metrics,
+    root: Level{
+      metrics: make([]*buffer, len(metrics)),
+      children: make(map[string]*Level),
+    },
+  }
+
+  // Use recent timestamps (current time minus a small offset)
+  now := time.Now().Unix()
+  startTime := now - 100 // Start 100 seconds ago to have enough data points
+
+  // Setup test data for node001 - all metrics healthy (recent data)
+  node001 := ms.root.findLevelOrCreate([]string{"testcluster", "node001"}, len(metrics))
+  for i := 0; i < len(metrics); i++ {
+    node001.metrics[i] = newBuffer(startTime, 10)
+    // Write recent data up to now
+    for ts := startTime; ts <= now; ts += 10 {
+      node001.metrics[i].write(ts, schema.Float(float64(i+1)))
+    }
+  }
+
+  // Setup test data for node002 - some metrics stale (old data beyond MaxMissingDataPoints threshold)
+  node002 := ms.root.findLevelOrCreate([]string{"testcluster", "node002"}, len(metrics))
+  // MaxMissingDataPoints = 5, frequency = 10, so threshold is 50 seconds
+  staleTime := now - 100 // Data ends 100 seconds ago (well beyond 50 second threshold)
+  for i := 0; i < len(metrics); i++ {
+    node002.metrics[i] = newBuffer(staleTime-50, 10)
+    if i < 2 {
+      // First two metrics: healthy (recent data)
+      for ts := startTime; ts <= now; ts += 10 {
+        node002.metrics[i].write(ts, schema.Float(float64(i+1)))
+      }
+    } else {
+      // Last two metrics: stale (data ends 100 seconds ago)
+      for ts := staleTime - 50; ts <= staleTime; ts += 10 {
+        node002.metrics[i].write(ts, schema.Float(float64(i+1)))
+      }
+    }
+  }
+
+  // Setup test data for node003 - some metrics missing (no buffer)
+  node003 := ms.root.findLevelOrCreate([]string{"testcluster", "node003"}, len(metrics))
+  // Only create buffers for first two metrics
+  for i := 0; i < 2; i++ {
+    node003.metrics[i] = newBuffer(startTime, 10)
+    for ts := startTime; ts <= now; ts += 10 {
+      node003.metrics[i].write(ts, schema.Float(float64(i+1)))
+    }
+  }
+  // Leave metrics[2] and metrics[3] as nil (missing)
+
+  // Setup test data for node005 - all metrics stale
+  node005 := ms.root.findLevelOrCreate([]string{"testcluster", "node005"}, len(metrics))
+  for i := 0; i < len(metrics); i++ {
+    node005.metrics[i] = newBuffer(staleTime-50, 10)
+    // All metrics have stale data (ends 100 seconds ago)
+    for ts := staleTime - 50; ts <= staleTime; ts += 10 {
+      node005.metrics[i].write(ts, schema.Float(float64(i+1)))
+    }
+  }
+
+  // node004 doesn't exist at all
+
+  tests := []struct {
+    name string
+    cluster string
+    nodes []string
+    expectedMetrics []string
+    wantStates map[string]schema.MonitoringState
+  }{
+    {
+      name: "all metrics healthy",
+      cluster: "testcluster",
+      nodes: []string{"node001"},
+      expectedMetrics: []string{"load", "mem_used", "cpu_user", "cpu_system"},
+      wantStates: map[string]schema.MonitoringState{
+        "node001": schema.MonitoringStateFull,
+      },
+    },
+    {
+      name: "some metrics stale",
+      cluster: "testcluster",
+      nodes: []string{"node002"},
+      expectedMetrics: []string{"load", "mem_used", "cpu_user", "cpu_system"},
+      wantStates: map[string]schema.MonitoringState{
+        "node002": schema.MonitoringStatePartial,
+      },
+    },
+    {
+      name: "some metrics missing",
+      cluster: "testcluster",
+      nodes: []string{"node003"},
+      expectedMetrics: []string{"load", "mem_used", "cpu_user", "cpu_system"},
+      wantStates: map[string]schema.MonitoringState{
+        "node003": schema.MonitoringStatePartial,
+      },
+    },
+    {
+      name: "node not found",
+      cluster: "testcluster",
+      nodes: []string{"node004"},
+      expectedMetrics: []string{"load", "mem_used", "cpu_user", "cpu_system"},
+      wantStates: map[string]schema.MonitoringState{
+        "node004": schema.MonitoringStateFailed,
+      },
+    },
+    {
+      name: "all metrics stale",
+      cluster: "testcluster",
+      nodes: []string{"node005"},
+      expectedMetrics: []string{"load", "mem_used", "cpu_user", "cpu_system"},
+      wantStates: map[string]schema.MonitoringState{
+        "node005": schema.MonitoringStateFailed,
+      },
+    },
+    {
+      name: "multiple nodes mixed states",
+      cluster: "testcluster",
+      nodes: []string{"node001", "node002", "node003", "node004", "node005"},
+      expectedMetrics: []string{"load", "mem_used"},
+      wantStates: map[string]schema.MonitoringState{
+        "node001": schema.MonitoringStateFull,
+        "node002": schema.MonitoringStateFull, // Only checking first 2 metrics which are healthy
+        "node003": schema.MonitoringStateFull, // Only checking first 2 metrics which exist
+        "node004": schema.MonitoringStateFailed, // Node doesn't exist
+        "node005": schema.MonitoringStateFailed, // Both metrics are stale
+      },
+    },
+  }
+
+  for _, tt := range tests {
+    t.Run(tt.name, func(t *testing.T) {
+      results, err := ms.HealthCheck(tt.cluster, tt.nodes, tt.expectedMetrics)
+      if err != nil {
+        t.Errorf("HealthCheck() error = %v", err)
+        return
+      }
+
+      // Check that we got results for all nodes
+      if len(results) != len(tt.nodes) {
+        t.Errorf("HealthCheck() returned %d results, want %d", len(results), len(tt.nodes))
+      }
+
+      // Check each node's state
+      for _, node := range tt.nodes {
+        state, ok := results[node]
+        if !ok {
+          t.Errorf("HealthCheck() missing result for node %s", node)
+          continue
+        }
+
+        // Check status
+        if wantStatus, ok := tt.wantStates[node]; ok {
+          if state != wantStatus {
+            t.Errorf("HealthCheck() node %s status = %v, want %v", node, state, wantStatus)
+          }
+        }
+      }
+    })
+  }
+}
+
+// TestGetHealthyMetrics tests the GetHealthyMetrics function which returns lists of missing and degraded metrics
+func TestGetHealthyMetrics(t *testing.T) {
+  metrics := map[string]MetricConfig{
+    "load": {Frequency: 10, Aggregation: AvgAggregation, offset: 0},
+    "mem_used": {Frequency: 10, Aggregation: AvgAggregation, offset: 1},
+    "cpu_user": {Frequency: 10, Aggregation: AvgAggregation, offset: 2},
+  }
+
+  ms := &MemoryStore{
+    Metrics: metrics,
+    root: Level{
+      metrics: make([]*buffer, len(metrics)),
+      children: make(map[string]*Level),
+    },
+  }
+
+  now := time.Now().Unix()
+  startTime := now - 100
+  staleTime := now - 100
+
+  // Setup node with mixed health states
+  node := ms.root.findLevelOrCreate([]string{"testcluster", "testnode"}, len(metrics))
+
+  // Metric 0 (load): healthy - recent data
+  node.metrics[0] = newBuffer(startTime, 10)
+  for ts := startTime; ts <= now; ts += 10 {
+    node.metrics[0].write(ts, schema.Float(1.0))
+  }
+
+  // Metric 1 (mem_used): degraded - stale data
+  node.metrics[1] = newBuffer(staleTime-50, 10)
+  for ts := staleTime - 50; ts <= staleTime; ts += 10 {
+    node.metrics[1].write(ts, schema.Float(2.0))
+  }
+
+  // Metric 2 (cpu_user): missing - no buffer (nil)
+
+  tests := []struct {
+    name string
+    selector []string
+    expectedMetrics []string
+    wantMissing []string
+    wantDegraded []string
+    wantErr bool
+  }{
+    {
+      name: "mixed health states",
+      selector: []string{"testcluster", "testnode"},
+      expectedMetrics: []string{"load", "mem_used", "cpu_user"},
+      wantMissing: []string{"cpu_user"},
+      wantDegraded: []string{"mem_used"},
+      wantErr: false,
+    },
+    {
+      name: "node not found",
+      selector: []string{"testcluster", "nonexistent"},
+      expectedMetrics: []string{"load"},
+      wantMissing: nil,
+      wantDegraded: nil,
+      wantErr: true,
+    },
+    {
+      name: "check only healthy metric",
+      selector: []string{"testcluster", "testnode"},
+      expectedMetrics: []string{"load"},
+      wantMissing: []string{},
+      wantDegraded: []string{},
+      wantErr: false,
+    },
+  }
+
+  for _, tt := range tests {
+    t.Run(tt.name, func(t *testing.T) {
+      missing, degraded, err := ms.GetHealthyMetrics(tt.selector, tt.expectedMetrics)
+
+      if (err != nil) != tt.wantErr {
+        t.Errorf("GetHealthyMetrics() error = %v, wantErr %v", err, tt.wantErr)
+        return
+      }
+
+      if tt.wantErr {
+        return
+      }
+
+      // Check missing list
+      if len(missing) != len(tt.wantMissing) {
+        t.Errorf("GetHealthyMetrics() missing = %v, want %v", missing, tt.wantMissing)
+      } else {
+        for i, m := range tt.wantMissing {
+          if missing[i] != m {
+            t.Errorf("GetHealthyMetrics() missing[%d] = %v, want %v", i, missing[i], m)
+          }
+        }
+      }
+
+      // Check degraded list
+      if len(degraded) != len(tt.wantDegraded) {
+        t.Errorf("GetHealthyMetrics() degraded = %v, want %v", degraded, tt.wantDegraded)
+      } else {
+        for i, d := range tt.wantDegraded {
+          if degraded[i] != d {
+            t.Errorf("GetHealthyMetrics() degraded[%d] = %v, want %v", i, degraded[i], d)
+          }
+        }
+      }
+    })
+  }
+}
+
+// TestBufferHealthChecks tests the buffer-level health check functions
+func TestBufferHealthChecks(t *testing.T) {
+  now := time.Now().Unix()
+
+  tests := []struct {
+    name string
+    setupBuffer func() *buffer
+    wantExists bool
+    wantHealthy bool
+    description string
+  }{
+    {
+      name: "nil buffer",
+      setupBuffer: func() *buffer {
+        return nil
+      },
+      wantExists: false,
+      wantHealthy: false,
+      description: "nil buffer should not exist and not be healthy",
+    },
+    {
+      name: "empty buffer",
+      setupBuffer: func() *buffer {
+        b := newBuffer(now, 10)
+        b.data = nil
+        return b
+      },
+      wantExists: false,
+      wantHealthy: false,
+      description: "empty buffer should not exist and not be healthy",
+    },
+    {
+      name: "healthy buffer with recent data",
+      setupBuffer: func() *buffer {
+        b := newBuffer(now-30, 10)
+        // Write data up to now (within MaxMissingDataPoints * frequency = 50 seconds)
+        for ts := now - 30; ts <= now; ts += 10 {
+          b.write(ts, schema.Float(1.0))
+        }
+        return b
+      },
+      wantExists: true,
+      wantHealthy: true,
+      description: "buffer with recent data should be healthy",
+    },
+    {
+      name: "stale buffer beyond threshold",
+      setupBuffer: func() *buffer {
+        b := newBuffer(now-200, 10)
+        // Write data that ends 100 seconds ago (beyond MaxMissingDataPoints * frequency = 50 seconds)
+        for ts := now - 200; ts <= now-100; ts += 10 {
+          b.write(ts, schema.Float(1.0))
+        }
+        return b
+      },
+      wantExists: true,
+      wantHealthy: false,
+      description: "buffer with stale data should exist but not be healthy",
+    },
+    {
+      name: "buffer at threshold boundary",
+      setupBuffer: func() *buffer {
+        b := newBuffer(now-50, 10)
+        // Write data that ends exactly at threshold (MaxMissingDataPoints * frequency = 50 seconds)
+        for ts := now - 50; ts <= now-50; ts += 10 {
+          b.write(ts, schema.Float(1.0))
+        }
+        return b
+      },
+      wantExists: true,
+      wantHealthy: true,
+      description: "buffer at threshold boundary should still be healthy",
+    },
+  }
+
+  for _, tt := range tests {
+    t.Run(tt.name, func(t *testing.T) {
+      b := tt.setupBuffer()
+
+      exists := b.bufferExists()
+      if exists != tt.wantExists {
+        t.Errorf("bufferExists() = %v, want %v: %s", exists, tt.wantExists, tt.description)
+      }
+
+      if b != nil && b.data != nil && len(b.data) > 0 {
+        healthy := b.isBufferHealthy()
+        if healthy != tt.wantHealthy {
+          t.Errorf("isBufferHealthy() = %v, want %v: %s", healthy, tt.wantHealthy, tt.description)
+        }
+      }
+    })
+  }
+}
@@ -7,7 +7,7 @@
 -->
 
 <script>
-  import { onMount } from "svelte";
+  import { getContext, onMount } from "svelte";
   import {
     Row,
     Col,
@@ -18,6 +18,7 @@
     Spinner,
     InputGroup,
     Input,
+    Tooltip
   } from "@sveltestrap/sveltestrap";
   import {
     queryStore,
@@ -29,6 +30,9 @@
     scramble,
     scrambleNames,
   } from "./generic/utils.js";
+  import {
+    formatDurationTime
+  } from "./generic/units.js";
   import Filters from "./generic/Filters.svelte";
 
   /* Svelte 5 Props */
@@ -40,48 +44,70 @@
   /* Const Init */
   const {} = init();
   const client = getContextClient();
+  const shortDuration = getContext("cc-config").jobList_hideShortRunningJobs; // Always configured
 
   /* State Init*/
   let filterComponent = $state(); // see why here: https://stackoverflow.com/questions/58287729/how-can-i-export-a-function-from-a-svelte-component-that-changes-a-value-in-the
   let jobFilters = $state([]);
   let nameFilter = $state("");
-  let sorting = $state({ field: "totalJobs", direction: "down" });
+  let sorting = $state({ field: "totalJobs", direction: "desc" });
 
   /* Derived Vars */
+  const fetchRunning = $derived(jobFilters.some(jf => jf?.state?.length == 1 && jf?.state?.includes("running")));
+  const numCols = $derived.by(() => {
+    let colbase = 6
+    if (fetchRunning) {
+      colbase += 2
+    }
+    return colbase
+  })
+
   let stats = $derived(
     queryStore({
       client: client,
       query: gql`
-        query($jobFilters: [JobFilter!]!) {
+        query($jobFilters: [JobFilter!]!, $fetchRunning: Boolean!) {
           rows: jobsStatistics(filter: $jobFilters, groupBy: ${type}) {
             id
             name
             totalJobs
+            shortJobs
+            totalCores @include(if: $fetchRunning)
+            totalAccs @include(if: $fetchRunning)
             totalWalltime
             totalCoreHours
             totalAccHours
           }
        }`,
-      variables: { jobFilters },
+      variables: {
+        jobFilters,
+        fetchRunning
+      },
     })
   );
 
   /* Functions */
-  function changeSorting(field) {
-    sorting = { field, direction: sorting?.direction == "down" ? "up" : "down" };
+  function changeSorting(newField) {
+    if (sorting.field == newField) {
+      // Same Field, Change Direction
+      sorting = { field: newField, direction: sorting.direction == "desc" ? "asc" : "desc" };
+    } else {
+      // Change Field, Apply Default Direction
+      sorting = { field: newField, direction: "desc" };
+    }
   }
 
   function sort(stats, sorting, nameFilter) {
-    const idCmp = sorting.direction == "up"
+    const idCmp = sorting.direction == "asc"
      ? (a, b) => b.id.localeCompare(a.id)
      : (a, b) => a.id.localeCompare(b.id)
 
    // Force empty or undefined strings to the end of the list
-    const nameCmp = sorting.direction == "up"
+    const nameCmp = sorting.direction == "asc"
      ? (a, b) => !a?.name ? 1 : (!b?.name ? -1 : (b.name.localeCompare(a.name)))
      : (a, b) => !a?.name ? 1 : (!b?.name ? -1 : (a.name.localeCompare(b.name)))
 
-    const intCmp = sorting.direction == "up"
+    const intCmp = sorting.direction == "asc"
      ? (a, b) => a[sorting.field] - b[sorting.field]
      : (a, b) => b[sorting.field] - a[sorting.field];
 
@@ -141,7 +167,7 @@
         >
           {#if sorting?.field == "id"}
            <!-- Note on Icon-Name: Arrow-indicator always down, only alpha-indicator switches -->
-            <Icon name={`sort-alpha-${sorting?.direction == 'down' ? 'down' : 'down-alt'}`} />
+            <Icon name={`sort-alpha-${sorting?.direction == 'desc' ? 'down' : 'down-alt'}`} />
          {:else}
            <Icon name="three-dots-vertical" />
          {/if}
@@ -156,7 +182,7 @@
          onclick={() => changeSorting("name")}
        >
          {#if sorting?.field == "name"}
-            <Icon name={`sort-alpha-${sorting?.direction == 'down' ? 'down' : 'down-alt'}`} />
+            <Icon name={`sort-alpha-${sorting?.direction == 'desc' ? 'down' : 'down-alt'}`} />
          {:else}
            <Icon name="three-dots-vertical" />
          {/if}
@@ -172,12 +198,66 @@
        >
          {#if sorting?.field == "totalJobs"}
            <!-- Note on Icon-Name: Arrow-indicator always down, only numeric-indicator switches -->
-            <Icon name={`sort-numeric-${sorting?.direction == 'down' ? 'down-alt' : 'down'}`} />
+            <Icon name={`sort-numeric-${sorting?.direction == 'desc' ? 'down-alt' : 'down'}`} />
          {:else}
            <Icon name="three-dots-vertical" />
          {/if}
        </Button>
      </th>
+      <th scope="col">
+        <span class="mr-1">
+          Short Jobs
+          <Icon id="shortjobs-info" style="cursor:help;" size="sm" name="info-circle"/>
+        </span>
+        <Tooltip target={`shortjobs-info`} placement="top">
+          Job duration less than {formatDurationTime(shortDuration)}
+        </Tooltip>
+         <!-- Narrow Non-Breaking Space -->
+        <Button
+          color={sorting.field == "shortJobs" ? "primary" : "light"}
+          size="sm"
+          onclick={() => changeSorting("shortJobs")}
+        >
+          {#if sorting?.field == "shortJobs"}
+            <!-- Note on Icon-Name: Arrow-indicator always down, only numeric-indicator switches -->
+            <Icon name={`sort-numeric-${sorting?.direction == 'desc' ? 'down-alt' : 'down'}`} />
+          {:else}
+            <Icon name="three-dots-vertical" />
+          {/if}
+        </Button>
+      </th>
+      {#if fetchRunning}
+        <th scope="col">
+          Total Cores
+          <Button
+            color={sorting.field == "totalCores" ? "primary" : "light"}
+            size="sm"
+            onclick={() => changeSorting("totalCores")}
+          >
+            {#if sorting?.field == "totalJCores"}
+              <!-- Note on Icon-Name: Arrow-indicator always down, only numeric-indicator switches -->
+              <Icon name={`sort-numeric-${sorting?.direction == 'desc' ? 'down-alt' : 'down'}`} />
+            {:else}
+              <Icon name="three-dots-vertical" />
+            {/if}
+          </Button>
+        </th>
+        <th scope="col">
+          Total Accelerators
+          <Button
+            color={sorting.field == "totalAccs" ? "primary" : "light"}
+            size="sm"
+            onclick={() => changeSorting("totalAccs")}
+          >
+            {#if sorting?.field == "totalAccs"}
+              <!-- Note on Icon-Name: Arrow-indicator always down, only numeric-indicator switches -->
+              <Icon name={`sort-numeric-${sorting?.direction == 'desc' ? 'down-alt' : 'down'}`} />
+            {:else}
+              <Icon name="three-dots-vertical" />
+            {/if}
+          </Button>
+        </th>
+      {/if}
      <th scope="col">
        Total Walltime
        <Button
@@ -186,7 +266,7 @@
            onclick={() => changeSorting("totalWalltime")}
          >
            {#if sorting?.field == "totalWalltime"}
-             <Icon name={`sort-numeric-${sorting?.direction == 'down' ? 'down-alt' : 'down'}`} />
+             <Icon name={`sort-numeric-${sorting?.direction == 'desc' ? 'down-alt' : 'down'}`} />
            {:else}
              <Icon name="three-dots-vertical" />
            {/if}
@@ -200,7 +280,7 @@
            onclick={() => changeSorting("totalCoreHours")}
          >
            {#if sorting?.field == "totalCoreHours"}
-             <Icon name={`sort-numeric-${sorting?.direction == 'down' ? 'down-alt' : 'down'}`} />
+             <Icon name={`sort-numeric-${sorting?.direction == 'desc' ? 'down-alt' : 'down'}`} />
            {:else}
              <Icon name="three-dots-vertical" />
            {/if}
@@ -214,7 +294,7 @@
            onclick={() => changeSorting("totalAccHours")}
          >
            {#if sorting?.field == "totalAccHours"}
-             <Icon name={`sort-numeric-${sorting?.direction == 'down' ? 'down-alt' : 'down'}`} />
+             <Icon name={`sort-numeric-${sorting?.direction == 'desc' ? 'down-alt' : 'down'}`} />
            {:else}
              <Icon name="three-dots-vertical" />
            {/if}
@@ -225,11 +305,11 @@
    <tbody>
      {#if $stats.fetching}
        <tr>
-         <td colspan="4" style="text-align: center;"><Spinner secondary /></td>
+         <td colspan={numCols} style="text-align: center;"><Spinner secondary /></td>
        </tr>
      {:else if $stats.error}
        <tr>
-         <td colspan="4"
+         <td colspan={numCols}
            ><Card body color="danger" class="mb-3">{$stats.error.message}</Card
          ></td
        >
@@ -260,13 +340,18 @@
          >
        {/if}
        <td>{row.totalJobs}</td>
+       <td>{row.shortJobs}</td>
+       {#if fetchRunning}
+         <td>{row.totalCores}</td>
+         <td>{row.totalAccs}</td>
+       {/if}
        <td>{row.totalWalltime}</td>
        <td>{row.totalCoreHours}</td>
        <td>{row.totalAccHours}</td>
      </tr>
    {:else}
      <tr>
-       <td colspan="4"><i>No {type.toLowerCase()}s/jobs found</i></td>
+       <td colspan={numCols}><i>No {type.toLowerCase()}s/jobs found</i></td>
      </tr>
    {/each}
  {/if}
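The spinner, error, and empty rows now span `numCols` columns instead of a hard-coded 4; `numCols` itself is defined outside these hunks. A hypothetical sketch of how such a derived column count could look (the name is reused from the markup above, the formula is invented for illustration):

    // Hypothetical sketch only; the actual definition is not part of these hunks.
    // Base columns: id/name, totalJobs, shortJobs, totalWalltime, totalCoreHours, totalAccHours.
    const numCols = $derived(6 + (fetchRunning ? 2 : 0)); // +2 for totalCores / totalAccs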
@@ -32,7 +32,7 @@
  let {
    matchedListJobs = $bindable(0),
    selectedJobs = $bindable([]),
-   metrics = getContext("cc-config").metricConfig_jobListMetrics,
+   metrics = [],
    sorting = { field: "startTime", type: "col", order: "DESC" },
    showFootprint = false,
    filterBuffer = [],
@@ -109,7 +109,7 @@
  let paging = $derived({ itemsPerPage, page });
  const plotWidth = $derived.by(() => {
    return Math.floor(
-     (tableWidth - jobInfoColumnWidth) / (metrics.length + (showFootprint ? 1 : 0)) - 10,
+     (tableWidth - jobInfoColumnWidth) / (metrics.length + (showFootprint ? 2 : 1)) - 10,
    );
  });
  let jobsStore = $derived(queryStore({
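The divisor in `plotWidth` now counts one more column than before. A worked example with invented numbers:

    // Invented values, for illustration only:
    // tableWidth = 1600, jobInfoColumnWidth = 250, metrics.length = 4, showFootprint = true
    // old: Math.floor((1600 - 250) / (4 + 1) - 10) = 260  // px per metric plot
    // new: Math.floor((1600 - 250) / (4 + 2) - 10) = 215  // px per metric plot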
@@ -133,7 +133,7 @@
  }
</script>

- <Card class="mt-1 overflow-auto" style="width: {width}; height: {height}">
+ <Card class="mx-2 overflow-auto" style="width: {width}; height: {height}">
  {#if displayTitle}
    <CardHeader>
      <CardTitle class="mb-0 d-flex justify-content-center">
@@ -79,6 +79,7 @@

  /* Derived */
  const jobId = $derived(job?.id);
+ const refinedData = $derived($metricsQuery?.data?.jobMetrics ? sortAndSelectScope($metricsQuery.data.jobMetrics) : []);
  const scopes = $derived.by(() => {
    if (job.numNodes == 1) {
      if (job.numAcc >= 1) return ["core", "accelerator"];
@@ -202,10 +203,15 @@
              />
            </td>
          {/if}
-         {#each sortAndSelectScope($metricsQuery.data.jobMetrics) as metric, i (metric?.name || i)}
+         {#each refinedData as metric, i (metric?.name || i)}
            <td>
-             <!-- Subluster Metricconfig remove keyword for jobtables (joblist main, user joblist, project joblist) to be used here as toplevel case-->
-             {#if metric.disabled == false && metric.data}
+             {#key metric}
+               {#if metric?.data}
+                 {#if metric?.disabled}
+                   <Card body class="mx-2" color="info">
+                     Metric <b>{metric.data.name}</b>: Disabled for subcluster <code>{job.subCluster}</code>
+                   </Card>
+                 {:else}
                    <MetricPlot
                      onZoom={(detail) => handleZoom(detail, metric.data.name)}
                      height={plotHeight}
@@ -222,12 +228,7 @@
                      zoomState={zoomStates[metric.data.name] || null}
                      thresholdState={thresholdStates[metric.data.name] || null}
                    />
-             {:else if metric.disabled == true && metric.data}
-               <Card body color="info"
-                 >Metric disabled for subcluster <code
-                   >{metric.data.name}:{job.subCluster}</code
-                 ></Card
-               >
+                 {/if}
              {:else}
                <Card body class="mx-2" color="warning">
                  <p>No dataset(s) returned for <b>{metrics[i]}</b></p>
@@ -236,6 +237,11 @@
                  <p class="mb-1">Identical messages in <i>job {job.jobId} row</i>: Host not found.</p>
                </Card>
              {/if}
+           {/key}
+         </td>
+       {:else}
+         <td>
+           <Card body class="mx-2">No metrics selected for display.</Card>
          </td>
        {/each}
      {/if}
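For context, the new `refinedData` rune derives the scoped metric list once per query result instead of calling `sortAndSelectScope` inside the `{#each}` block, and `{#key metric}` re-creates the cell when the metric object changes. A generic sketch of that pattern (names illustrative, not this component's exact markup):

    <script>
      // Sketch only: derive once, then iterate the derived value.
      const refinedData = $derived($query?.data ? transform($query.data) : []);
    </script>

    {#each refinedData as metric, i (metric?.name || i)}
      {#key metric}
        <!-- cell content is torn down and re-rendered when `metric` changes identity -->
      {/key}
    {/each}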
@@ -79,7 +79,7 @@
  // X
  let pendingSeries = [
    {
-     label: "Runtime",
+     label: "Time",
      value: (u, ts, sidx, didx) =>
        (didx == null) ? null : formatDurationTime(ts, forNode),
    }
@@ -34,6 +34,9 @@
  /*Const Init */
  const { query: initq } = init();
  const useCbColors = getContext("cc-config")?.plotConfiguration_colorblindMode || false
+
+ /* Derived */
+ const subClusters = $derived($initq?.data?.clusters?.find((c) => c.name == presetCluster)?.subClusters || []);
</script>

<!-- Loading indicator & Refresh -->
@@ -66,12 +69,22 @@
      </CardBody>
    </TabPane>

-   <TabPane tabId="usage-dash" tab="Usage">
+   <TabPane tabId="usage-dash" tab="Cluster Usage">
      <CardBody>
        <UsageDash {presetCluster} {useCbColors}></UsageDash>
      </CardBody>
    </TabPane>

+   {#if subClusters?.length > 1}
+     {#each subClusters.map(sc => sc.name) as scn}
+       <TabPane tabId="{scn}-usage-dash" tab="{scn.charAt(0).toUpperCase() + scn.slice(1)} Usage">
+         <CardBody>
+           <UsageDash {presetCluster} presetSubCluster={scn} {useCbColors}></UsageDash>
+         </CardBody>
+       </TabPane>
+     {/each}
+   {/if}
+
    <TabPane tabId="metric-dash" tab="Statistics">
      <CardBody>
        <StatisticsDash {presetCluster} {useCbColors}></StatisticsDash>
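With more than one subcluster, the `{#each}` above emits one extra usage tab per subcluster. For example (cluster data invented for illustration):

    // Illustrative data only:
    // $initq.data.clusters = [{ name: "fritz", subClusters: [{ name: "main" }, { name: "spr" }] }]
    // presetCluster = "fritz"  ->  subClusters = [{ name: "main" }, { name: "spr" }]
    // Rendered tabs: "Cluster Usage", "Main Usage", "Spr Usage", "Statistics"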
@@ -3,6 +3,9 @@

  Properties:
  - `presetCluster String`: The cluster to show status information for
+ - `presetSubCluster String?`: The subCluster to show status information for [Default: null]
+ - `useCbColors Bool?`: Use colorblind friendly colors [Default: false]
+ - `useAltColors Bool?`: Use alternative color set [Default: false]
  -->

<script>
@@ -35,6 +38,7 @@
  /* Svelte 5 Props */
  let {
    presetCluster,
+   presetSubCluster = null,
    useCbColors = false,
    useAltColors = false
  } = $props();
@@ -52,7 +56,12 @@
  let numDurationBins = $state("1h");

  /* Derived */
- let cluster = $derived(presetCluster)
+ const canvasPrefix = $derived(`${presetCluster}-${presetSubCluster ? presetSubCluster : ''}`)
+
+ const statusFilter = $derived(presetSubCluster
+   ? [{ state: ["running"] }, { cluster: { eq: presetCluster} }, { partition: { eq: presetSubCluster } }]
+   : [{ state: ["running"] }, { cluster: { eq: presetCluster} }]
+ );
  const topJobsQuery = $derived(queryStore({
    client: client,
    query: gql`
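How the derived `statusFilter` resolves in the two cases (cluster and partition names invented):

    // Illustrative only:
    // presetCluster = "alex", presetSubCluster = null
    //   -> [{ state: ["running"] }, { cluster: { eq: "alex" } }]
    // presetCluster = "alex", presetSubCluster = "a100"
    //   -> [{ state: ["running"] }, { cluster: { eq: "alex" } }, { partition: { eq: "a100" } }]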
@@ -82,7 +91,7 @@
      }
    `,
    variables: {
-     filter: [{ state: ["running"] }, { cluster: { eq: cluster} }],
+     filter: statusFilter,
      paging: pagingState // Top 10
    },
    requestPolicy: "network-only"
@@ -117,7 +126,7 @@
      }
    `,
    variables: {
-     filter: [{ state: ["running"] }, { cluster: { eq: cluster } }],
+     filter: statusFilter,
      paging: pagingState
    },
    requestPolicy: "network-only"
@@ -152,7 +161,7 @@
      }
    `,
    variables: {
-     filter: [{ state: ["running"] }, { cluster: { eq: cluster } }],
+     filter: statusFilter,
      paging: pagingState
    },
    requestPolicy: "network-only"
@@ -184,7 +193,7 @@
      }
    `,
    variables: {
-     filter: [{ state: ["running"] }, { cluster: { eq: cluster } }],
+     filter: statusFilter,
      selectedHistograms: selectedHistograms, // No Metrics requested for node hardware stats
      numDurationBins: numDurationBins,
    },
@@ -264,7 +273,7 @@
    </h4>
    <Pie
      {useAltColors}
-     canvasId="hpcpie-jobs-users"
+     canvasId="{canvasPrefix}-hpcpie-jobs-users"
      size={colWidthJobs * 0.75}
      sliceLabel="Jobs"
      quantities={$topJobsQuery.data.topUser.map(
@@ -284,14 +293,14 @@
    {#each $topJobsQuery.data.topUser as tu, i}
      <tr>
        <td><Icon name="circle-fill" style="color: {legendColors(i)};" /></td>
-       <td id="topName-jobs-{tu.id}">
+       <td id="{canvasPrefix}-topName-jobs-{tu.id}">
-         <a target="_blank" href="/monitoring/user/{tu.id}?cluster={cluster}&state=running"
+         <a target="_blank" href="/monitoring/user/{tu.id}?cluster={presetCluster}{presetSubCluster ? '&partition='+presetSubCluster : ''}&state=running"
            >{scrambleNames ? scramble(tu.id) : tu.id}
          </a>
        </td>
        {#if tu?.name}
          <Tooltip
-           target={`topName-jobs-${tu.id}`}
+           target={`${canvasPrefix}-topName-jobs-${tu.id}`}
            placement="left"
            >{scrambleNames ? scramble(tu.name) : tu.name}</Tooltip
          >
@@ -308,7 +317,7 @@
    </h4>
    <Pie
      {useAltColors}
-     canvasId="hpcpie-jobs-projects"
+     canvasId="{canvasPrefix}-hpcpie-jobs-projects"
      size={colWidthJobs * 0.75}
      sliceLabel={'Jobs'}
      quantities={$topJobsQuery.data.topProjects.map(
@@ -328,7 +337,7 @@
      <tr>
        <td><Icon name="circle-fill" style="color: {legendColors(i)};" /></td>
        <td>
-         <a target="_blank" href="/monitoring/jobs/?cluster={cluster}&state=running&project={tp.id}&projectMatch=eq"
+         <a target="_blank" href="/monitoring/jobs/?cluster={presetCluster}{presetSubCluster ? '&partition='+presetSubCluster : ''}&state=running&project={tp.id}&projectMatch=eq"
            >{scrambleNames ? scramble(tp.id) : tp.id}
          </a>
        </td>
@@ -368,7 +377,7 @@
    </h4>
    <Pie
      {useAltColors}
-     canvasId="hpcpie-nodes-users"
+     canvasId="{canvasPrefix}-hpcpie-nodes-users"
      size={colWidthNodes * 0.75}
      sliceLabel="Nodes"
      quantities={$topNodesQuery.data.topUser.map(
@@ -388,14 +397,14 @@
    {#each $topNodesQuery.data.topUser as tu, i}
      <tr>
        <td><Icon name="circle-fill" style="color: {legendColors(i)};" /></td>
-       <td id="topName-nodes-{tu.id}">
+       <td id="{canvasPrefix}-topName-nodes-{tu.id}">
-         <a target="_blank" href="/monitoring/user/{tu.id}?cluster={cluster}&state=running"
+         <a target="_blank" href="/monitoring/user/{tu.id}?cluster={presetCluster}{presetSubCluster ? '&partition='+presetSubCluster : ''}&state=running"
            >{scrambleNames ? scramble(tu.id) : tu.id}
          </a>
        </td>
        {#if tu?.name}
          <Tooltip
-           target={`topName-nodes-${tu.id}`}
+           target={`${canvasPrefix}-topName-nodes-${tu.id}`}
            placement="left"
            >{scrambleNames ? scramble(tu.name) : tu.name}</Tooltip
          >
@@ -412,7 +421,7 @@
    </h4>
    <Pie
      {useAltColors}
-     canvasId="hpcpie-nodes-projects"
+     canvasId="{canvasPrefix}-hpcpie-nodes-projects"
      size={colWidthNodes * 0.75}
      sliceLabel={'Nodes'}
      quantities={$topNodesQuery.data.topProjects.map(
@@ -432,7 +441,7 @@
      <tr>
        <td><Icon name="circle-fill" style="color: {legendColors(i)};" /></td>
        <td>
-         <a target="_blank" href="/monitoring/jobs/?cluster={cluster}&state=running&project={tp.id}&projectMatch=eq"
+         <a target="_blank" href="/monitoring/jobs/?cluster={presetCluster}{presetSubCluster ? '&partition='+presetSubCluster : ''}&state=running&project={tp.id}&projectMatch=eq"
            >{scrambleNames ? scramble(tp.id) : tp.id}
          </a>
        </td>
@@ -472,7 +481,7 @@
    </h4>
    <Pie
      {useAltColors}
-     canvasId="hpcpie-accs-users"
+     canvasId="{canvasPrefix}-hpcpie-accs-users"
      size={colWidthAccs * 0.75}
      sliceLabel="GPUs"
      quantities={$topAccsQuery.data.topUser.map(
@@ -492,14 +501,14 @@
    {#each $topAccsQuery.data.topUser as tu, i}
      <tr>
        <td><Icon name="circle-fill" style="color: {legendColors(i)};" /></td>
-       <td id="topName-accs-{tu.id}">
+       <td id="{canvasPrefix}-topName-accs-{tu.id}">
-         <a target="_blank" href="/monitoring/user/{tu.id}?cluster={cluster}&state=running"
+         <a target="_blank" href="/monitoring/user/{tu.id}?cluster={presetCluster}{presetSubCluster ? '&partition='+presetSubCluster : ''}&state=running"
            >{scrambleNames ? scramble(tu.id) : tu.id}
          </a>
        </td>
        {#if tu?.name}
          <Tooltip
-           target={`topName-accs-${tu.id}`}
+           target={`${canvasPrefix}-topName-accs-${tu.id}`}
            placement="left"
            >{scrambleNames ? scramble(tu.name) : tu.name}</Tooltip
          >
@@ -516,7 +525,7 @@
    </h4>
    <Pie
      {useAltColors}
-     canvasId="hpcpie-accs-projects"
+     canvasId="{canvasPrefix}-hpcpie-accs-projects"
      size={colWidthAccs * 0.75}
      sliceLabel={'GPUs'}
      quantities={$topAccsQuery.data.topProjects.map(
@@ -536,7 +545,7 @@
      <tr>
        <td><Icon name="circle-fill" style="color: {legendColors(i)};" /></td>
        <td>
-         <a target="_blank" href="/monitoring/jobs/?cluster={cluster}&state=running&project={tp.id}&projectMatch=eq"
+         <a target="_blank" href="/monitoring/jobs/?cluster={presetCluster}{presetSubCluster ? '&partition='+presetSubCluster : ''}&state=running&project={tp.id}&projectMatch=eq"
            >{scrambleNames ? scramble(tp.id) : tp.id}
          </a>
        </td>
@@ -69,9 +69,9 @@
    })
  );

- let extendedLegendData = $derived($nodeJobsData?.data ? buildExtendedLegend() : null);
- let refinedData = $derived(nodeData?.metrics ? sortAndSelectScope(nodeData.metrics) : null);
- let dataHealth = $derived(refinedData.filter((rd) => rd.disabled === false).map((enabled) => (enabled?.data?.metric?.series?.length > 0)));
+ const extendedLegendData = $derived($nodeJobsData?.data ? buildExtendedLegend() : null);
+ const refinedData = $derived(nodeData?.metrics ? sortAndSelectScope(nodeData.metrics) : []);
+ const dataHealth = $derived(refinedData.filter((rd) => rd.disabled === false).map((enabled) => (enabled?.data?.metric?.series?.length > 0)));

  /* Functions */
  const selectScope = (nodeMetrics) =>
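For reference, the `dataHealth` derivation keeps one boolean per enabled metric, true when the metric returned at least one series (input shape assumed from the expression above, values invented):

    // Illustrative only:
    // refinedData = [
    //   { disabled: false, data: { metric: { series: [s1] } } },
    //   { disabled: true,  data: { metric: { series: [s2] } } },
    //   { disabled: false, data: { metric: { series: [] } } },
    // ]
    // -> dataHealth = [true, false]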