From 89875db4a9a8d7dc53dacb480a6c144ae847772a Mon Sep 17 00:00:00 2001 From: Christoph Kluge Date: Mon, 22 Dec 2025 10:39:40 +0100 Subject: [PATCH 1/3] dashboard layout fixes --- web/frontend/src/DashPublic.root.svelte | 8 ++++---- web/frontend/src/generic/plots/Stacked.svelte | 2 +- web/frontend/src/status/DashInternal.svelte | 10 ++++++---- 3 files changed, 11 insertions(+), 9 deletions(-) diff --git a/web/frontend/src/DashPublic.root.svelte b/web/frontend/src/DashPublic.root.svelte index 25e2683..c69b28f 100644 --- a/web/frontend/src/DashPublic.root.svelte +++ b/web/frontend/src/DashPublic.root.svelte @@ -338,7 +338,7 @@ - + - + @@ -540,7 +540,7 @@ Date: Mon, 22 Dec 2025 17:26:56 +0100 Subject: [PATCH 2/3] Rework info panel in public dashboard - change to bootstrap grid from table - add infos, use badges - remove non required query --- internal/metricdata/cc-metric-store.go | 30 ++-- web/frontend/src/DashPublic.root.svelte | 219 ++++++++++++++---------- web/frontend/src/generic/units.js | 2 +- 3 files changed, 146 insertions(+), 105 deletions(-) diff --git a/internal/metricdata/cc-metric-store.go b/internal/metricdata/cc-metric-store.go index 6d446d1..be2e956 100644 --- a/internal/metricdata/cc-metric-store.go +++ b/internal/metricdata/cc-metric-store.go @@ -770,21 +770,25 @@ func (ccms *CCMetricStore) LoadNodeData( } mc := archive.GetMetricConfig(cluster, metric) - hostdata[metric] = append(hostdata[metric], &schema.JobMetric{ - Unit: mc.Unit, - Timestep: mc.Timestep, - Series: []schema.Series{ - { - Hostname: query.Hostname, - Data: qdata.Data, - Statistics: schema.MetricStatistics{ - Avg: float64(qdata.Avg), - Min: float64(qdata.Min), - Max: float64(qdata.Max), + if mc != nil { + hostdata[metric] = append(hostdata[metric], &schema.JobMetric{ + Unit: mc.Unit, + Timestep: mc.Timestep, + Series: []schema.Series{ + { + Hostname: query.Hostname, + Data: qdata.Data, + Statistics: schema.MetricStatistics{ + Avg: float64(qdata.Avg), + Min: float64(qdata.Min), + Max: float64(qdata.Max), + }, }, }, - }, - }) + }) + } else { + cclog.Warnf("Metric '%s' not configured for cluster '%s': Skipped in LoadNodeData() Return!", metric, cluster) + } } if len(errors) != 0 { diff --git a/web/frontend/src/DashPublic.root.svelte b/web/frontend/src/DashPublic.root.svelte index c69b28f..fbbf486 100644 --- a/web/frontend/src/DashPublic.root.svelte +++ b/web/frontend/src/DashPublic.root.svelte @@ -30,7 +30,8 @@ Table, Progress, Icon, - Button + Button, + Badge } from "@sveltestrap/sveltestrap"; import Roofline from "./generic/plots/Roofline.svelte"; import Pie, { colors } from "./generic/plots/Pie.svelte"; @@ -85,7 +86,8 @@ query: gql` query ( $cluster: String! - $metrics: [String!] + $nmetrics: [String!] + $cmetrics: [String!] $from: Time! $to: Time! $clusterFrom: Time! @@ -97,7 +99,7 @@ # Node 5 Minute Averages for Roofline nodeMetrics( cluster: $cluster - metrics: $metrics + metrics: $nmetrics from: $from to: $to ) { @@ -106,6 +108,10 @@ metrics { name metric { + unit { + base + prefix + } series { statistics { avg @@ -114,21 +120,6 @@ } } } - # Running Job Metric Average for Rooflines - jobsMetricStats(filter: $jobFilter, metrics: $metrics) { - id - jobId - duration - numNodes - numAccelerators - subCluster - stats { - name - data { - avg - } - } - } # Get Jobs for Per-Node Counts jobs(filter: $jobFilter, order: $sorting, page: $paging) { items { @@ -175,7 +166,7 @@ # ClusterMetrics for doubleMetricPlot clusterMetrics( cluster: $cluster - metrics: $metrics + metrics: $cmetrics from: $clusterFrom to: $to ) { @@ -194,7 +185,8 @@ `, variables: { cluster: presetCluster, - metrics: ["flops_any", "mem_bw"], // Metrics For Cluster Plot and Roofline + nmetrics: ["flops_any", "mem_bw", "cpu_power", "acc_power"], // Metrics For Roofline and Stats + cmetrics: ["flops_any", "mem_bw"], // Metrics For Cluster Plot from: from.toISOString(), clusterFrom: clusterFrom.toISOString(), to: to.toISOString(), @@ -258,6 +250,11 @@ } } + // Get Idle Infos after Sums + if (!rawInfos['idleNodes']) rawInfos['idleNodes'] = rawInfos['totalNodes'] - rawInfos['allocatedNodes']; + if (!rawInfos['idleCores']) rawInfos['idleCores'] = rawInfos['totalCores'] - rawInfos['allocatedCores']; + if (!rawInfos['idleAccs']) rawInfos['idleAccs'] = rawInfos['totalAccs'] - rawInfos['allocatedAccs']; + // Keymetrics (Data on Cluster-Scope) let rawFlops = $statusQuery?.data?.nodeMetrics?.reduce((sum, node) => sum + (node.metrics.find((m) => m.name == 'flops_any')?.metric?.series[0]?.statistics?.avg || 0), @@ -271,6 +268,26 @@ ) || 0; rawInfos['memBwRate'] = Math.floor((rawMemBw * 100) / 100) + let rawCpuPwr = $statusQuery?.data?.nodeMetrics?.reduce((sum, node) => + sum + (node.metrics.find((m) => m.name == 'cpu_power')?.metric?.series[0]?.statistics?.avg || 0), + 0, // Initial Value + ) || 0; + rawInfos['cpuPwr'] = Math.floor((rawCpuPwr * 100) / 100) + if (!rawInfos['cpuPwrUnit']) { + let rawCpuUnit = $statusQuery?.data?.nodeMetrics[0]?.metrics.find((m) => m.name == 'cpu_power')?.metric?.unit || null + rawInfos['cpuPwrUnit'] = rawCpuUnit ? rawCpuUnit.prefix + rawCpuUnit.base : '' + } + + let rawGpuPwr = $statusQuery?.data?.nodeMetrics?.reduce((sum, node) => + sum + (node.metrics.find((m) => m.name == 'acc_power')?.metric?.series[0]?.statistics?.avg || 0), + 0, // Initial Value + ) || 0; + rawInfos['gpuPwr'] = Math.floor((rawGpuPwr * 100) / 100) + if (!rawInfos['gpuPwrUnit']) { + let rawGpuUnit = $statusQuery?.data?.nodeMetrics[0]?.metrics.find((m) => m.name == 'acc_power')?.metric?.unit || null + rawInfos['gpuPwrUnit'] = rawGpuUnit ? rawGpuUnit.prefix + rawGpuUnit.base : '' + } + return rawInfos } else { return {}; @@ -408,79 +425,99 @@ - - - - - -
- - - - - - - - -
- - - - - - - - - - + + + + {clusterInfo?.runningJobs} + +
+ Running Jobs +
+ + + + {clusterInfo?.activeUsers} + +
+ Active Users +
+ + + + {clusterInfo?.allocatedNodes} + +
+ Active Nodes +
+ + + + + + {clusterInfo?.flopRate} {clusterInfo?.flopRateUnit} + +
+ Total Flop Rate +
+ + + + {clusterInfo?.memBwRate} {clusterInfo?.memBwRateUnit} + +
+ Total Memory Bandwidth +
+ {#if clusterInfo?.totalAccs !== 0} - - - - - + + + {clusterInfo?.gpuPwr} {clusterInfo?.gpuPwrUnit} + +
+ Total GPU Power +
+ + {:else} + + + {clusterInfo?.cpuPwr} {clusterInfo?.cpuPwrUnit} + +
+ Total CPU Power +
+ {/if} -
{clusterInfo?.runningJobs} Running Jobs{clusterInfo?.activeUsers} Active Users
- Flop Rate (Any) - - Memory BW Rate -
- {clusterInfo?.flopRate} - {clusterInfo?.flopRateUnit} - - {clusterInfo?.memBwRate} - {clusterInfo?.memBwRateUnit} -
Allocated Nodes
- -
{clusterInfo?.allocatedNodes} / {clusterInfo?.totalNodes} - Nodes
Allocated Cores
- -
{formatNumber(clusterInfo?.allocatedCores)} / {formatNumber(clusterInfo?.totalCores)} - Cores
Allocated Accelerators
- -
{clusterInfo?.allocatedAccs} / {clusterInfo?.totalAccs} - Accelerators
+
+ + + Active Cores + + + + {formatNumber(clusterInfo?.allocatedCores)} + {formatNumber(clusterInfo?.idleCores)} + + + + Idle Cores + + + {#if clusterInfo?.totalAccs !== 0} + + + Active GPU + + + + {formatNumber(clusterInfo?.allocatedAccs)} + {formatNumber(clusterInfo?.idleAccs)} + + + + Idle GPU + + + {/if}
diff --git a/web/frontend/src/generic/units.js b/web/frontend/src/generic/units.js index 1737b97..3e251fb 100644 --- a/web/frontend/src/generic/units.js +++ b/web/frontend/src/generic/units.js @@ -3,7 +3,7 @@ */ const power = [1, 1e3, 1e6, 1e9, 1e12, 1e15, 1e18, 1e21] -const prefix = ['', 'K', 'M', 'G', 'T', 'P', 'E'] +const prefix = ['', 'k', 'M', 'G', 'T', 'P', 'E'] export function formatNumber(x) { if ( isNaN(x) || x == null) { From 0bc26aa1943cf281165084f46a2273c95ffe8d90 Mon Sep 17 00:00:00 2001 From: Jan Eitzinger Date: Tue, 23 Dec 2025 05:56:46 +0100 Subject: [PATCH 3/3] Add error check --- internal/api/nats.go | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/internal/api/nats.go b/internal/api/nats.go index 1bfe905..61cbd97 100644 --- a/internal/api/nats.go +++ b/internal/api/nats.go @@ -224,7 +224,10 @@ func (api *NatsAPI) handleNodeState(subject string, data []byte) { JobsRunning: node.JobsRunning, } - repo.UpdateNodeState(node.Hostname, req.Cluster, &nodeState) + if err := repo.UpdateNodeState(node.Hostname, req.Cluster, &nodeState); err != nil { + cclog.Errorf("NATS %s: updating node state for %s on %s failed: %v", + subject, node.Hostname, req.Cluster, err) + } } cclog.Debugf("NATS %s: updated %d node states for cluster %s", subject, len(req.Nodes), req.Cluster)