From 7123a8c1cc14dbbe7da99a0628f1fdc342399194 Mon Sep 17 00:00:00 2001 From: Jan Eitzinger Date: Fri, 6 Feb 2026 16:04:01 +0100 Subject: [PATCH 1/7] Updated HealthCheck implementation WIP --- pkg/metricstore/healthcheck.go | 271 ++++++++++----------------------- 1 file changed, 80 insertions(+), 191 deletions(-) diff --git a/pkg/metricstore/healthcheck.go b/pkg/metricstore/healthcheck.go index 59c84f79..5ed10fb5 100644 --- a/pkg/metricstore/healthcheck.go +++ b/pkg/metricstore/healthcheck.go @@ -6,9 +6,7 @@ package metricstore import ( - "cmp" "fmt" - "slices" "time" cclog "github.com/ClusterCockpit/cc-lib/v2/ccLogger" @@ -16,25 +14,18 @@ import ( ) // HealthCheckResponse represents the result of a health check operation. -// -// Status indicates the monitoring state (Full, Partial, Failed). -// Error contains any error encountered during the health check. type HealthCheckResponse struct { Status schema.MonitoringState Error error } -// MaxMissingDataPoints is a threshold that allows a node to be healthy with certain number of data points missing. -// Suppose a node does not receive last 5 data points, then healthCheck endpoint will still say a -// node is healthy. Anything more than 5 missing points in metrics of the node will deem the node unhealthy. +// MaxMissingDataPoints is the threshold for stale data detection. +// A buffer is considered healthy if the gap between its last data point +// and the current time is within MaxMissingDataPoints * frequency. const MaxMissingDataPoints int64 = 5 -// isBufferHealthy checks if a buffer has received data for the last MaxMissingDataPoints. -// -// Returns true if the buffer is healthy (recent data within threshold), false otherwise. -// A nil buffer or empty buffer is considered unhealthy. +// bufferExists returns true if the buffer is non-nil and contains data. func (b *buffer) bufferExists() bool { - // Check if the buffer is empty if b == nil || b.data == nil || len(b.data) == 0 { return false } @@ -42,233 +33,131 @@ func (b *buffer) bufferExists() bool { return true } -// isBufferHealthy checks if a buffer has received data for the last MaxMissingDataPoints. -// -// Returns true if the buffer is healthy (recent data within threshold), false otherwise. -// A nil buffer or empty buffer is considered unhealthy. +// isBufferHealthy returns true if the buffer has recent data within +// MaxMissingDataPoints * frequency of the current time. func (b *buffer) isBufferHealthy() bool { - // Get the last endtime of the buffer bufferEnd := b.start + b.frequency*int64(len(b.data)) t := time.Now().Unix() - // Check if the buffer has recent data (within MaxMissingDataPoints threshold) - if t-bufferEnd > MaxMissingDataPoints*b.frequency { - return false - } - - return true + return t-bufferEnd <= MaxMissingDataPoints*b.frequency } -// MergeUniqueSorted merges two lists, sorts them, and removes duplicates. -// Requires 'cmp.Ordered' because we need to sort the data. -func mergeList[string cmp.Ordered](list1, list2 []string) []string { - // 1. Combine both lists - result := append(list1, list2...) - - // 2. Sort the combined list - slices.Sort(result) - - // 3. Compact removes consecutive duplicates (standard in Go 1.21+) - // e.g. [1, 1, 2, 3, 3] -> [1, 2, 3] - result = slices.Compact(result) - - return result -} - -// getHealthyMetrics recursively collects healthy and degraded metrics at this level and below. +// collectMetricStatus walks the subtree rooted at l and classifies each +// expected metric into the healthy or degraded map. 
// -// A metric is considered: -// - Healthy: buffer has recent data within MaxMissingDataPoints threshold AND has few/no NaN values -// - Degraded: buffer exists and has recent data, but contains more than MaxMissingDataPoints NaN values -// -// This routine walks the entire subtree starting from the current level. -// -// Parameters: -// - m: MemoryStore containing the global metric configuration -// -// Returns: -// - []string: Flat list of healthy metric names from this level and all children -// - []string: Flat list of degraded metric names (exist but have too many missing values) -// - error: Non-nil only for internal errors during recursion -// -// The routine mirrors healthCheck() but provides more granular classification: -// - healthCheck() finds problems (stale/missing) -// - getHealthyMetrics() separates healthy from degraded metrics -func (l *Level) getHealthyMetrics(m *MemoryStore, expectedMetrics []string) ([]string, []string, error) { +// Classification rules (evaluated per buffer, pessimistic): +// - A single stale buffer marks the metric as degraded permanently. +// - A healthy buffer only counts if no stale buffer has been seen. +// - Metrics absent from the global config or without any buffer remain +// in neither map and are later reported as missing. +func (l *Level) collectMetricStatus(m *MemoryStore, expectedMetrics []string, healthy, degraded map[string]bool) { l.lock.RLock() defer l.lock.RUnlock() - globalMetrics := m.Metrics + for _, metricName := range expectedMetrics { + if degraded[metricName] { + continue // already degraded, cannot improve + } + mc := m.Metrics[metricName] + b := l.metrics[mc.offset] + if b.bufferExists() { + if !b.isBufferHealthy() { + degraded[metricName] = true + delete(healthy, metricName) + } else if !degraded[metricName] { + healthy[metricName] = true + } + } + } + + for _, lvl := range l.children { + lvl.collectMetricStatus(m, expectedMetrics, healthy, degraded) + } +} + +// getHealthyMetrics walks the complete subtree rooted at l and classifies +// each expected metric by comparing the collected status against the +// expected list. +// +// Returns: +// - missingList: metrics not found in global config or without any buffer +// - degradedList: metrics with at least one stale buffer in the subtree +func (l *Level) getHealthyMetrics(m *MemoryStore, expectedMetrics []string) []string { + healthy := make(map[string]bool, len(expectedMetrics)) + degraded := make(map[string]bool) + + l.collectMetricStatus(m, expectedMetrics, healthy, degraded) - missingList := make([]string, 0) degradedList := make([]string, 0) - // Phase 1: Check metrics at this level for _, metricName := range expectedMetrics { - offset := globalMetrics[metricName].offset - b := l.metrics[offset] + if healthy[metricName] { + continue + } - if !b.bufferExists() { - missingList = append(missingList, metricName) - } else if !b.isBufferHealthy() { + if degraded[metricName] { degradedList = append(degradedList, metricName) } } - // Phase 2: Recursively check child levels - for _, lvl := range l.children { - childMissing, childDegraded, err := lvl.getHealthyMetrics(m, expectedMetrics) - if err != nil { - return nil, nil, err - } - - missingList = mergeList(missingList, childMissing) - degradedList = mergeList(degradedList, childDegraded) - } - - return missingList, degradedList, nil + return degradedList } -// GetHealthyMetrics returns healthy and degraded metrics for a specific node as flat lists. +// GetHealthyMetrics returns missing and degraded metric lists for a node. 
// -// This routine walks the metric tree starting from the specified node selector -// and collects all metrics that have received data within the last MaxMissingDataPoints -// (default: 5 data points). Metrics are classified into two categories: +// It walks the metric tree starting from the node identified by selector +// and classifies each expected metric: +// - Missing: no buffer anywhere in the subtree, or metric not in global config +// - Degraded: at least one stale buffer exists in the subtree // -// - Healthy: Buffer has recent data AND contains few/no NaN (missing) values -// - Degraded: Buffer has recent data BUT contains more than MaxMissingDataPoints NaN values -// -// The returned lists include both node-level metrics (e.g., "load", "mem_used") and -// hardware-level metrics (e.g., "cpu_user", "gpu_temp") in flat slices. -// -// Parameters: -// - selector: Hierarchical path to the target node, typically []string{cluster, hostname}. -// Example: []string{"emmy", "node001"} navigates to the "node001" host in the "emmy" cluster. -// The selector must match the hierarchy used during metric ingestion. -// -// Returns: -// - []string: Flat list of healthy metric names (recent data, few missing values) -// - []string: Flat list of degraded metric names (recent data, many missing values) -// - error: Non-nil if the node is not found or internal errors occur -// -// Example usage: -// -// selector := []string{"emmy", "node001"} -// healthyMetrics, degradedMetrics, err := ms.GetHealthyMetrics(selector) -// if err != nil { -// // Node not found or internal error -// return err -// } -// fmt.Printf("Healthy metrics: %v\n", healthyMetrics) -// // Output: ["load", "mem_used", "cpu_user", ...] -// fmt.Printf("Degraded metrics: %v\n", degradedMetrics) -// // Output: ["gpu_temp", "network_rx", ...] (metrics with many NaN values) -// -// Note: This routine provides more granular classification than HealthCheck: -// - HealthCheck reports stale/missing metrics (problems) -// - GetHealthyMetrics separates fully healthy from degraded metrics (quality levels) -func (m *MemoryStore) GetHealthyMetrics(selector []string, expectedMetrics []string) ([]string, []string, error) { +// Metrics present in expectedMetrics but absent from both returned lists +// are considered fully healthy. +func (m *MemoryStore) GetHealthyMetrics(selector []string, expectedMetrics []string) ([]string, error) { lvl := m.root.findLevel(selector) if lvl == nil { - return nil, nil, fmt.Errorf("[METRICSTORE]> error while GetHealthyMetrics, host not found: %#v", selector) + return nil, fmt.Errorf("[METRICSTORE]> GetHealthyMetrics: host not found: %#v", selector) } - missingList, degradedList, err := lvl.getHealthyMetrics(m, expectedMetrics) - if err != nil { - return nil, nil, err - } - - return missingList, degradedList, nil + degradedList := lvl.getHealthyMetrics(m, expectedMetrics) + return degradedList, nil } -// HealthCheck performs health checks on multiple nodes and returns their monitoring states. +// HealthCheck evaluates multiple nodes against a set of expected metrics +// and returns a monitoring state per node. // -// This routine provides a batch health check interface that evaluates multiple nodes -// against a specific set of expected metrics. For each node, it determines the overall -// monitoring state based on which metrics are healthy, degraded, or missing. 
-// -// Health Status Classification: -// - MonitoringStateFull: All expected metrics are healthy (recent data, few missing values) -// - MonitoringStatePartial: Some metrics are degraded (many missing values) or missing -// - MonitoringStateFailed: Node not found or all expected metrics are missing/stale -// -// Parameters: -// - cluster: Cluster name (first element of selector path) -// - nodes: List of node hostnames to check -// - expectedMetrics: List of metric names that should be present on each node -// -// Returns: -// - map[string]schema.MonitoringState: Map keyed by hostname containing monitoring state for each node -// - error: Non-nil only for internal errors (individual node failures are captured as MonitoringStateFailed) -// -// Example usage: -// -// cluster := "emmy" -// nodes := []string{"node001", "node002", "node003"} -// expectedMetrics := []string{"load", "mem_used", "cpu_user", "cpu_system"} -// healthStates, err := ms.HealthCheck(cluster, nodes, expectedMetrics) -// if err != nil { -// return err -// } -// for hostname, state := range healthStates { -// fmt.Printf("Node %s: %s\n", hostname, state) -// } -// -// Note: This routine is optimized for batch operations where you need to check -// the same set of metrics across multiple nodes. +// States: +// - MonitoringStateFull: all expected metrics are healthy +// - MonitoringStatePartial: some metrics are missing or degraded +// - MonitoringStateFailed: node not found, or no healthy metrics at all func (m *MemoryStore) HealthCheck(cluster string, nodes []string, expectedMetrics []string, ) (map[string]schema.MonitoringState, error) { results := make(map[string]schema.MonitoringState, len(nodes)) - // Create a set of expected metrics for fast lookup - expectedSet := make(map[string]bool, len(expectedMetrics)) - for _, metric := range expectedMetrics { - expectedSet[metric] = true - } - - // Check each node for _, hostname := range nodes { selector := []string{cluster, hostname} - status := schema.MonitoringStateFull - healthyCount := 0 - degradedCount := 0 - missingCount := 0 - // Get healthy and degraded metrics for this node - missingList, degradedList, err := m.GetHealthyMetrics(selector, expectedMetrics) + degradedList, err := m.GetHealthyMetrics(selector, expectedMetrics) if err != nil { - // Node not found or internal error results[hostname] = schema.MonitoringStateFailed continue } - missingCount = len(missingList) - degradedCount = len(degradedList) - uniqueList := mergeList(missingList, degradedList) - healthyCount = len(expectedMetrics) - len(uniqueList) + degradedCount := len(degradedList) + healthyCount := len(expectedMetrics) - degradedCount - // Debug log missing and degraded metrics - if missingCount > 0 { - cclog.ComponentDebug("metricstore", "HealthCheck: node", hostname, "missing metrics:", missingList) - } if degradedCount > 0 { cclog.ComponentDebug("metricstore", "HealthCheck: node", hostname, "degraded metrics:", degradedList) } - // Determine overall health status - if missingCount > 0 || degradedCount > 0 { - if healthyCount == 0 { - // No healthy metrics at all - status = schema.MonitoringStateFailed - } else { - // Some healthy, some degraded/missing - status = schema.MonitoringStatePartial - } + switch { + case degradedCount == 0: + results[hostname] = schema.MonitoringStateFull + case healthyCount == 0: + results[hostname] = schema.MonitoringStateFailed + default: + results[hostname] = schema.MonitoringStatePartial } - // else: all metrics healthy, status remains MonitoringStateFull - - 
results[hostname] = status } return results, nil From 5579b6f40cd15c9002f670a918fd4783a32e2945 Mon Sep 17 00:00:00 2001 From: Jan Eitzinger Date: Fri, 6 Feb 2026 16:11:10 +0100 Subject: [PATCH 2/7] Adopt unit test to new API --- pkg/metricstore/healthcheck.go | 4 +--- pkg/metricstore/metricstore_test.go | 19 ++----------------- 2 files changed, 3 insertions(+), 20 deletions(-) diff --git a/pkg/metricstore/healthcheck.go b/pkg/metricstore/healthcheck.go index 5ed10fb5..b4be57c4 100644 --- a/pkg/metricstore/healthcheck.go +++ b/pkg/metricstore/healthcheck.go @@ -95,9 +95,7 @@ func (l *Level) getHealthyMetrics(m *MemoryStore, expectedMetrics []string) []st continue } - if degraded[metricName] { - degradedList = append(degradedList, metricName) - } + degradedList = append(degradedList, metricName) } return degradedList diff --git a/pkg/metricstore/metricstore_test.go b/pkg/metricstore/metricstore_test.go index f96f49a2..ba9d7ecf 100644 --- a/pkg/metricstore/metricstore_test.go +++ b/pkg/metricstore/metricstore_test.go @@ -303,7 +303,6 @@ func TestGetHealthyMetrics(t *testing.T) { name string selector []string expectedMetrics []string - wantMissing []string wantDegraded []string wantErr bool }{ @@ -311,15 +310,13 @@ func TestGetHealthyMetrics(t *testing.T) { name: "mixed health states", selector: []string{"testcluster", "testnode"}, expectedMetrics: []string{"load", "mem_used", "cpu_user"}, - wantMissing: []string{"cpu_user"}, - wantDegraded: []string{"mem_used"}, + wantDegraded: []string{"mem_used", "cpu_user"}, wantErr: false, }, { name: "node not found", selector: []string{"testcluster", "nonexistent"}, expectedMetrics: []string{"load"}, - wantMissing: nil, wantDegraded: nil, wantErr: true, }, @@ -327,7 +324,6 @@ func TestGetHealthyMetrics(t *testing.T) { name: "check only healthy metric", selector: []string{"testcluster", "testnode"}, expectedMetrics: []string{"load"}, - wantMissing: []string{}, wantDegraded: []string{}, wantErr: false, }, @@ -335,7 +331,7 @@ func TestGetHealthyMetrics(t *testing.T) { for _, tt := range tests { t.Run(tt.name, func(t *testing.T) { - missing, degraded, err := ms.GetHealthyMetrics(tt.selector, tt.expectedMetrics) + degraded, err := ms.GetHealthyMetrics(tt.selector, tt.expectedMetrics) if (err != nil) != tt.wantErr { t.Errorf("GetHealthyMetrics() error = %v, wantErr %v", err, tt.wantErr) @@ -346,17 +342,6 @@ func TestGetHealthyMetrics(t *testing.T) { return } - // Check missing list - if len(missing) != len(tt.wantMissing) { - t.Errorf("GetHealthyMetrics() missing = %v, want %v", missing, tt.wantMissing) - } else { - for i, m := range tt.wantMissing { - if missing[i] != m { - t.Errorf("GetHealthyMetrics() missing[%d] = %v, want %v", i, missing[i], m) - } - } - } - // Check degraded list if len(degraded) != len(tt.wantDegraded) { t.Errorf("GetHealthyMetrics() degraded = %v, want %v", degraded, tt.wantDegraded) From a8d385a1eecee2cf838ad846cd65780ffd03edca Mon Sep 17 00:00:00 2001 From: Jan Eitzinger Date: Fri, 6 Feb 2026 16:35:02 +0100 Subject: [PATCH 3/7] Update HealthCheck again Still WIP --- pkg/metricstore/healthcheck.go | 27 +++++++++++++++++---------- pkg/metricstore/metricstore_test.go | 19 +++++++++++++++++-- 2 files changed, 34 insertions(+), 12 deletions(-) diff --git a/pkg/metricstore/healthcheck.go b/pkg/metricstore/healthcheck.go index b4be57c4..9527d4a2 100644 --- a/pkg/metricstore/healthcheck.go +++ b/pkg/metricstore/healthcheck.go @@ -82,12 +82,13 @@ func (l *Level) collectMetricStatus(m *MemoryStore, expectedMetrics []string, he // Returns: // - 
missingList: metrics not found in global config or without any buffer // - degradedList: metrics with at least one stale buffer in the subtree -func (l *Level) getHealthyMetrics(m *MemoryStore, expectedMetrics []string) []string { +func (l *Level) getHealthyMetrics(m *MemoryStore, expectedMetrics []string) ([]string, []string) { healthy := make(map[string]bool, len(expectedMetrics)) degraded := make(map[string]bool) l.collectMetricStatus(m, expectedMetrics, healthy, degraded) + missingList := make([]string, 0) degradedList := make([]string, 0) for _, metricName := range expectedMetrics { @@ -95,10 +96,14 @@ func (l *Level) getHealthyMetrics(m *MemoryStore, expectedMetrics []string) []st continue } - degradedList = append(degradedList, metricName) + if degraded[metricName] { + degradedList = append(degradedList, metricName) + } else { + missingList = append(missingList, metricName) + } } - return degradedList + return degradedList, missingList } // GetHealthyMetrics returns missing and degraded metric lists for a node. @@ -110,14 +115,14 @@ func (l *Level) getHealthyMetrics(m *MemoryStore, expectedMetrics []string) []st // // Metrics present in expectedMetrics but absent from both returned lists // are considered fully healthy. -func (m *MemoryStore) GetHealthyMetrics(selector []string, expectedMetrics []string) ([]string, error) { +func (m *MemoryStore) GetHealthyMetrics(selector []string, expectedMetrics []string) ([]string, []string, error) { lvl := m.root.findLevel(selector) if lvl == nil { - return nil, fmt.Errorf("[METRICSTORE]> GetHealthyMetrics: host not found: %#v", selector) + return nil, nil, fmt.Errorf("[METRICSTORE]> GetHealthyMetrics: host not found: %#v", selector) } - degradedList := lvl.getHealthyMetrics(m, expectedMetrics) - return degradedList, nil + degradedList, missingList := lvl.getHealthyMetrics(m, expectedMetrics) + return degradedList, missingList, nil } // HealthCheck evaluates multiple nodes against a set of expected metrics @@ -135,21 +140,23 @@ func (m *MemoryStore) HealthCheck(cluster string, for _, hostname := range nodes { selector := []string{cluster, hostname} - degradedList, err := m.GetHealthyMetrics(selector, expectedMetrics) + degradedList, missingList, err := m.GetHealthyMetrics(selector, expectedMetrics) if err != nil { results[hostname] = schema.MonitoringStateFailed continue } degradedCount := len(degradedList) - healthyCount := len(expectedMetrics) - degradedCount + missingCount := len(missingList) + + healthyCount := len(expectedMetrics) - degradedCount - missingCount if degradedCount > 0 { cclog.ComponentDebug("metricstore", "HealthCheck: node", hostname, "degraded metrics:", degradedList) } switch { - case degradedCount == 0: + case degradedCount == 0 && missingCount == 0: results[hostname] = schema.MonitoringStateFull case healthyCount == 0: results[hostname] = schema.MonitoringStateFailed diff --git a/pkg/metricstore/metricstore_test.go b/pkg/metricstore/metricstore_test.go index ba9d7ecf..4d68d76c 100644 --- a/pkg/metricstore/metricstore_test.go +++ b/pkg/metricstore/metricstore_test.go @@ -304,13 +304,15 @@ func TestGetHealthyMetrics(t *testing.T) { selector []string expectedMetrics []string wantDegraded []string + wantMissing []string wantErr bool }{ { name: "mixed health states", selector: []string{"testcluster", "testnode"}, expectedMetrics: []string{"load", "mem_used", "cpu_user"}, - wantDegraded: []string{"mem_used", "cpu_user"}, + wantDegraded: []string{"mem_used"}, + wantMissing: []string{"cpu_user"}, wantErr: false, }, { @@ -318,6 
+320,7 @@ func TestGetHealthyMetrics(t *testing.T) { selector: []string{"testcluster", "nonexistent"}, expectedMetrics: []string{"load"}, wantDegraded: nil, + wantMissing: nil, wantErr: true, }, { @@ -325,13 +328,14 @@ func TestGetHealthyMetrics(t *testing.T) { selector: []string{"testcluster", "testnode"}, expectedMetrics: []string{"load"}, wantDegraded: []string{}, + wantMissing: []string{}, wantErr: false, }, } for _, tt := range tests { t.Run(tt.name, func(t *testing.T) { - degraded, err := ms.GetHealthyMetrics(tt.selector, tt.expectedMetrics) + degraded, missing, err := ms.GetHealthyMetrics(tt.selector, tt.expectedMetrics) if (err != nil) != tt.wantErr { t.Errorf("GetHealthyMetrics() error = %v, wantErr %v", err, tt.wantErr) @@ -352,6 +356,17 @@ func TestGetHealthyMetrics(t *testing.T) { } } } + + // Check missing list + if len(missing) != len(tt.wantMissing) { + t.Errorf("GetHealthyMetrics() missing = %v, want %v", missing, tt.wantMissing) + } else { + for i, m := range tt.wantMissing { + if missing[i] != m { + t.Errorf("GetHealthyMetrics() missing[%d] = %v, want %v", i, missing[i], m) + } + } + } }) } } From c43d4a0f16ace98bb0f6e1a300a9b2c4d12fb265 Mon Sep 17 00:00:00 2001 From: Christoph Kluge Date: Fri, 6 Feb 2026 17:51:39 +0100 Subject: [PATCH 4/7] complete review of context initialization and access, streamlining --- web/frontend/src/Job.root.svelte | 193 +++++++++--------- web/frontend/src/Jobs.root.svelte | 33 ++- web/frontend/src/Node.root.svelte | 43 ++-- web/frontend/src/Systems.root.svelte | 59 +++--- web/frontend/src/User.root.svelte | 62 +++--- web/frontend/src/generic/JobList.svelte | 18 +- .../src/generic/filters/Cluster.svelte | 9 +- .../src/generic/filters/Resources.svelte | 20 +- web/frontend/src/generic/filters/Tags.svelte | 4 +- web/frontend/src/generic/helper/Tag.svelte | 4 +- .../src/generic/helper/TagManagement.svelte | 7 +- .../src/generic/joblist/JobListRow.svelte | 18 +- .../generic/select/HistogramSelection.svelte | 8 +- .../src/generic/select/MetricSelection.svelte | 25 +-- .../src/generic/select/SortSelection.svelte | 24 +-- web/frontend/src/generic/utils.js | 10 +- web/frontend/src/job/JobRoofline.svelte | 6 +- web/frontend/src/job/StatsTab.svelte | 111 +++++----- web/frontend/src/systems/NodeList.svelte | 10 +- web/frontend/src/systems/NodeOverview.svelte | 16 +- .../src/systems/nodelist/NodeListRow.svelte | 3 + 21 files changed, 365 insertions(+), 318 deletions(-) diff --git a/web/frontend/src/Job.root.svelte b/web/frontend/src/Job.root.svelte index a393995f..50de27b5 100644 --- a/web/frontend/src/Job.root.svelte +++ b/web/frontend/src/Job.root.svelte @@ -68,12 +68,8 @@ energyFootprint { hardware, metric, value } } `); - const client = getContextClient(); - const ccconfig = getContext("cc-config"); - const showRoofline = !!ccconfig[`jobView_showRoofline`]; - const showStatsTable = !!ccconfig[`jobView_showStatTable`]; - /* Note: Actual metric data queried in Component, only require base infos here -> reduce backend load by requesting just stats */ + const client = getContextClient(); const query = gql` query ($dbid: ID!, $selectedMetrics: [String!]!, $selectedScopes: [MetricScope!]!) 
{ scopedJobStats(id: $dbid, metrics: $selectedMetrics, scopes: $selectedScopes) { @@ -89,25 +85,68 @@ /* State Init */ let plots = $state({}); let isMetricsSelectionOpen = $state(false); - let selectedMetrics = $state([]); - let selectedScopes = $state([]); let totalMetrics = $state(0); - /* Derived */ - const showSummary = $derived((!!ccconfig[`jobView_showFootprint`] || !!ccconfig[`jobView_showPolarPlot`])) + /* Derived Init Return */ + const thisJob = $derived($initq?.data ? $initq.data.job : null); + + /* Derived Settings */ + const globalMetrics = $derived(thisJob ? getContext("globalMetrics") : null); + const clusterInfo = $derived(thisJob ? getContext("clusters") : null); + const ccconfig = $derived(thisJob ? getContext("cc-config") : null); + const showRoofline = $derived(ccconfig ? !!ccconfig[`jobView_showRoofline`] : false); + const showStatsTable = $derived(ccconfig ? !!ccconfig[`jobView_showStatTable`] : false); + const showSummary = $derived(ccconfig ? (!!ccconfig[`jobView_showFootprint`] || !!ccconfig[`jobView_showPolarPlot`]) : false) + + /* Derived Var Preprocessing*/ + let selectedMetrics = $derived.by(() => { + if(thisJob && ccconfig) { + if (thisJob.cluster) { + if (thisJob.subCluster) { + return ccconfig[`metricConfig_jobViewPlotMetrics:${thisJob.cluster}:${thisJob.subCluster}`] || + ccconfig[`metricConfig_jobViewPlotMetrics:${thisJob.cluster}`] || + ccconfig.metricConfig_jobViewPlotMetrics + } + return ccconfig[`metricConfig_jobViewPlotMetrics:${thisJob.cluster}`] || + ccconfig.metricConfig_jobViewPlotMetrics + } + return ccconfig.metricConfig_jobViewPlotMetrics + } + return []; + }); + + let selectedScopes = $derived.by(() => { + const pendingScopes = ["node"] + if (thisJob) { + const accScopeDefault = [...selectedMetrics].some(function (m) { + const thisCluster = clusterInfo.find((c) => c.name == thisJob.cluster); + const subCluster = thisCluster.subClusters.find((sc) => sc.name == thisJob.subCluster); + return subCluster.metricConfig.find((smc) => smc.name == m)?.scope === "accelerator"; + }); + + + if (accScopeDefault) pendingScopes.push("accelerator") + if (thisJob.numNodes === 1) { + pendingScopes.push("socket") + pendingScopes.push("core") + } + } + return[...new Set(pendingScopes)]; + }); + + /* Derived Query and Postprocessing*/ const jobMetrics = $derived(queryStore({ client: client, query: query, variables: { dbid, selectedMetrics, selectedScopes }, }) ); - + const missingMetrics = $derived.by(() => { - if ($initq?.data && $jobMetrics?.data) { - let job = $initq.data.job; + if (thisJob && $jobMetrics?.data) { let metrics = $jobMetrics.data.scopedJobStats; - let metricNames = $initq.data.globalMetrics.reduce((names, gm) => { - if (gm.availability.find((av) => av.cluster === job.cluster)) { + let metricNames = globalMetrics.reduce((names, gm) => { + if (gm.availability.find((av) => av.cluster === thisJob.cluster)) { names.push(gm.name); } return names; @@ -118,9 +157,10 @@ !metrics.some((jm) => jm.name == metric) && selectedMetrics.includes(metric) && !checkMetricDisabled( + globalMetrics, metric, - $initq.data.job.cluster, - $initq.data.job.subCluster, + thisJob.cluster, + thisJob.subCluster, ), ); } else { @@ -129,17 +169,16 @@ }); const missingHosts = $derived.by(() => { - if ($initq?.data && $jobMetrics?.data) { - let job = $initq.data.job; + if (thisJob && $jobMetrics?.data) { let metrics = $jobMetrics.data.scopedJobStats; - let metricNames = $initq.data.globalMetrics.reduce((names, gm) => { - if (gm.availability.find((av) => av.cluster === job.cluster)) 
{ + let metricNames = globalMetrics.reduce((names, gm) => { + if (gm.availability.find((av) => av.cluster === thisJob.cluster)) { names.push(gm.name); } return names; }, []); - return job.resources + return thisJob.resources .map(({ hostname }) => ({ hostname: hostname, metrics: metricNames.filter( @@ -165,51 +204,19 @@ ? "Loading..." : $initq?.error ? "Error" - : `Job ${$initq.data.job.jobId} - ClusterCockpit`; - }); - - /* On Init */ - getContext("on-init")(() => { - let job = $initq.data.job; - if (!job) return; - const pendingMetrics = ( - ccconfig[`metricConfig_jobViewPlotMetrics:${job.cluster}:${job.subCluster}`] || - ccconfig[`metricConfig_jobViewPlotMetrics:${job.cluster}`] - ) || - $initq.data.globalMetrics.reduce((names, gm) => { - if (gm.availability.find((av) => av.cluster === job.cluster && av.subClusters.includes(job.subCluster))) { - names.push(gm.name); - } - return names; - }, []) - - // Select default Scopes to load: Check before if any metric has accelerator scope by default - const accScopeDefault = [...pendingMetrics].some(function (m) { - const cluster = $initq.data.clusters.find((c) => c.name == job.cluster); - const subCluster = cluster.subClusters.find((sc) => sc.name == job.subCluster); - return subCluster.metricConfig.find((smc) => smc.name == m)?.scope === "accelerator"; - }); - - const pendingScopes = ["node"] - if (accScopeDefault) pendingScopes.push("accelerator") - if (job.numNodes === 1) { - pendingScopes.push("socket") - pendingScopes.push("core") - } - - selectedMetrics = [...new Set(pendingMetrics)]; - selectedScopes = [...new Set(pendingScopes)]; + : `Job ${thisJob.jobId} - ClusterCockpit`; }); /* Functions */ - const orderAndMap = (grouped, selectedMetrics) => - selectedMetrics.map((metric) => ({ + const orderAndMap = (grouped, inputMetrics) => + inputMetrics.map((metric) => ({ metric: metric, data: grouped.find((group) => group[0].name == metric), disabled: checkMetricDisabled( + globalMetrics, metric, - $initq.data.job.cluster, - $initq.data.job.subCluster, + thisJob.cluster, + thisJob.subCluster, ), })); @@ -219,34 +226,34 @@ {#if $initq.error} {$initq.error.message} - {:else if $initq?.data} + {:else if thisJob} - {#if $initq.data?.job?.metaData?.message} + {#if thisJob?.metaData?.message} -
Job {$initq.data?.job?.jobId} ({$initq.data?.job?.cluster})
+
Job {thisJob?.jobId} ({thisJob?.cluster})
The following note was added by administrators:
- {@html $initq.data.job.metaData.message} + {@html thisJob.metaData.message}
{/if} - + - + - {#if $initq.data.job.concurrentJobs != null && $initq.data.job.concurrentJobs.items.length != 0} + {#if thisJob.concurrentJobs != null && thisJob.concurrentJobs.items.length != 0} - {$initq.data.job.concurrentJobs.items.length} Concurrent Jobs + {thisJob.concurrentJobs.items.length} Concurrent Jobs - roles.manager)}/> + roles.manager)}/> {/if} @@ -261,9 +268,9 @@ {#if $initq.error} {$initq.error.message} - {:else if $initq?.data} + {:else if thisJob} {#if showSummary} - + {/if} {:else} @@ -274,9 +281,9 @@ {#if $initq.error} {$initq.error.message} - {:else if $initq?.data} + {:else if thisJob} {#if showRoofline} - + {/if} {:else} @@ -285,10 +292,10 @@ -{#if $initq?.data && $initq.data.job.energyFootprint.length != 0} +{#if thisJob && thisJob?.energyFootprint?.length != 0} - + {/if} @@ -297,7 +304,7 @@ - {#if $initq?.data} + {#if thisJob} {#if job.numNodes > 1 && job.state === "running"} -