From d74465215d85207e7024381ae8ab01c8fadea6f8 Mon Sep 17 00:00:00 2001 From: Christoph Kluge Date: Fri, 6 Mar 2026 10:09:44 +0100 Subject: [PATCH 1/3] simplify and fix adaptive threshold logic --- .../src/generic/joblist/JobListRow.svelte | 2 -- .../src/generic/plots/MetricPlot.svelte | 28 +++++-------------- web/frontend/src/job/Metric.svelte | 4 --- 3 files changed, 7 insertions(+), 27 deletions(-) diff --git a/web/frontend/src/generic/joblist/JobListRow.svelte b/web/frontend/src/generic/joblist/JobListRow.svelte index 3963708f..e9382bee 100644 --- a/web/frontend/src/generic/joblist/JobListRow.svelte +++ b/web/frontend/src/generic/joblist/JobListRow.svelte @@ -234,8 +234,6 @@ cluster={clusterInfos.find((c) => c.name == job.cluster)} subCluster={job.subCluster} isShared={job.shared != "none"} - numhwthreads={job.numHWThreads} - numaccs={job.numAcc} zoomState={zoomStates[metric.data.name] || null} thresholdState={thresholdStates[metric.data.name] || null} {plotSync} diff --git a/web/frontend/src/generic/plots/MetricPlot.svelte b/web/frontend/src/generic/plots/MetricPlot.svelte index 063b43fb..aa38d858 100644 --- a/web/frontend/src/generic/plots/MetricPlot.svelte +++ b/web/frontend/src/generic/plots/MetricPlot.svelte @@ -43,8 +43,6 @@ subCluster, isShared = false, forNode = false, - numhwthreads = 0, - numaccs = 0, zoomState = null, thresholdState = null, extendedLegendData = null, @@ -83,9 +81,7 @@ const thresholds = $derived(findJobAggregationThresholds( subClusterTopology, metricConfig, - scope, - numhwthreads, - numaccs + scope )); const longestSeries = $derived.by(() => { if (useStatsSeries) { @@ -276,9 +272,7 @@ function findJobAggregationThresholds( subClusterTopology, metricConfig, - scope, - numhwthreads, - numaccs + scope ) { if (!subClusterTopology || !metricConfig || !scope) { @@ -303,21 +297,13 @@ } if (metricConfig?.aggregation == "sum") { - // Scale Thresholds - let fraction; - if (numaccs > 0) fraction = subClusterTopology.accelerators.length / 
numaccs; - else if (numhwthreads > 0) fraction = subClusterTopology.core.length / numhwthreads; - else fraction = 1; // Fallback - let divisor; - // Exclusive: Fraction = 1; Shared: Fraction > 1 - if (scope == 'node') divisor = fraction; - // Cap divisor at number of available sockets or domains - else if (scope == 'socket') divisor = (fraction < subClusterTopology.socket.length) ? subClusterTopology.socket.length : fraction; - else if (scope == "memoryDomain") divisor = (fraction < subClusterTopology.memoryDomain.length) ? subClusterTopology.socket.length : fraction; - // Use Maximum Division for Smallest Scopes + if (scope == 'node') divisor = 1 // Node Scope: Always return unscaled (Maximum Scope) + // Partial Scopes: Get from Topologies + else if (scope == 'socket') divisor = subClusterTopology.socket.length; + else if (scope == "memoryDomain") divisor = subClusterTopology.memoryDomain.length; else if (scope == "core") divisor = subClusterTopology.core.length; - else if (scope == "hwthread") divisor = subClusterTopology.core.length; // alt. 
name for core + else if (scope == "hwthread") divisor = subClusterTopology.node.length; else if (scope == "accelerator") divisor = subClusterTopology.accelerators.length; else { console.log('Unknown scope, return default aggregation thresholds for sum', scope) diff --git a/web/frontend/src/job/Metric.svelte b/web/frontend/src/job/Metric.svelte index ca32d9f9..1beb88fb 100644 --- a/web/frontend/src/job/Metric.svelte +++ b/web/frontend/src/job/Metric.svelte @@ -178,8 +178,6 @@ timestep={selectedData.timestep} scope={selectedScope} metric={metricName} - numaccs={job.numAcc} - numhwthreads={job.numHWThreads} series={selectedSeries} {isShared} {zoomState} @@ -194,8 +192,6 @@ timestep={selectedData.timestep} scope={selectedScope} metric={metricName} - numaccs={job.numAcc} - numhwthreads={job.numHWThreads} series={selectedSeries} {isShared} {zoomState} From 88bd83b07e64d949020829976a091929ce7cc8c4 Mon Sep 17 00:00:00 2001 From: Christoph Kluge Date: Fri, 6 Mar 2026 10:19:46 +0100 Subject: [PATCH 2/3] add nullsafe fallbacks --- web/frontend/src/generic/plots/MetricPlot.svelte | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/web/frontend/src/generic/plots/MetricPlot.svelte b/web/frontend/src/generic/plots/MetricPlot.svelte index aa38d858..3969161d 100644 --- a/web/frontend/src/generic/plots/MetricPlot.svelte +++ b/web/frontend/src/generic/plots/MetricPlot.svelte @@ -300,11 +300,11 @@ let divisor; if (scope == 'node') divisor = 1 // Node Scope: Always return unscaled (Maximum Scope) // Partial Scopes: Get from Topologies - else if (scope == 'socket') divisor = subClusterTopology.socket.length; - else if (scope == "memoryDomain") divisor = subClusterTopology.memoryDomain.length; - else if (scope == "core") divisor = subClusterTopology.core.length; - else if (scope == "hwthread") divisor = subClusterTopology.node.length; - else if (scope == "accelerator") divisor = subClusterTopology.accelerators.length; + else if (scope == 'socket') divisor = 
subClusterTopology?.socket?.length || 1; + else if (scope == "memoryDomain") divisor = subClusterTopology?.memoryDomain?.length || 1; + else if (scope == "core") divisor = subClusterTopology?.core?.length || 1; + else if (scope == "hwthread") divisor = subClusterTopology?.node?.length || 1; + else if (scope == "accelerator") divisor = subClusterTopology?.accelerators?.length || 1; else { console.log('Unknown scope, return default aggregation thresholds for sum', scope) divisor = 1; From 70fea39d0342c81e21e497bebb1519c2c98f8313 Mon Sep 17 00:00:00 2001 From: Jan Eitzinger Date: Fri, 6 Mar 2026 10:56:23 +0100 Subject: [PATCH 3/3] Add note on dynamic memory management for restarts --- ReleaseNotes.md | 2 ++ cmd/cc-backend/main.go | 1 + 2 files changed, 3 insertions(+) diff --git a/ReleaseNotes.md b/ReleaseNotes.md index 3d352f20..5447167e 100644 --- a/ReleaseNotes.md +++ b/ReleaseNotes.md @@ -271,6 +271,8 @@ For release specific notes visit the [ClusterCockpit Documentation](https://clus ## Known issues +- The new dynamic memory management is not bulletproof yet across restarts. We + will fix that in a subsequent patch release. - Currently energy footprint metrics of type energy are ignored for calculating total energy. - With energy footprint metrics of type power the unit is ignored and it is diff --git a/cmd/cc-backend/main.go b/cmd/cc-backend/main.go index 5b51b963..57c8d65b 100644 --- a/cmd/cc-backend/main.go +++ b/cmd/cc-backend/main.go @@ -395,6 +395,7 @@ func runServer(ctx context.Context) error { // Set GC percent if not configured if os.Getenv(envGOGC) == "" { + // trigger GC when heap grows 15% above the previous live set debug.SetGCPercent(15) } runtime.SystemdNotify(true, "running")