Merge branch 'dev' of github.com:ClusterCockpit/cc-backend into dev

This commit is contained in:
2025-12-23 07:56:16 +01:00
6 changed files with 161 additions and 115 deletions

View File

@@ -260,7 +260,10 @@ func (api *NatsAPI) handleNodeState(subject string, data []byte) {
JobsRunning: node.JobsRunning, JobsRunning: node.JobsRunning,
} }
repo.UpdateNodeState(node.Hostname, req.Cluster, &nodeState) if err := repo.UpdateNodeState(node.Hostname, req.Cluster, &nodeState); err != nil {
cclog.Errorf("NATS %s: updating node state for %s on %s failed: %v",
subject, node.Hostname, req.Cluster, err)
}
} }
cclog.Debugf("NATS %s: updated %d node states for cluster %s", subject, len(req.Nodes), req.Cluster) cclog.Debugf("NATS %s: updated %d node states for cluster %s", subject, len(req.Nodes), req.Cluster)

View File

@@ -770,21 +770,25 @@ func (ccms *CCMetricStore) LoadNodeData(
} }
mc := archive.GetMetricConfig(cluster, metric) mc := archive.GetMetricConfig(cluster, metric)
hostdata[metric] = append(hostdata[metric], &schema.JobMetric{ if mc != nil {
Unit: mc.Unit, hostdata[metric] = append(hostdata[metric], &schema.JobMetric{
Timestep: mc.Timestep, Unit: mc.Unit,
Series: []schema.Series{ Timestep: mc.Timestep,
{ Series: []schema.Series{
Hostname: query.Hostname, {
Data: qdata.Data, Hostname: query.Hostname,
Statistics: schema.MetricStatistics{ Data: qdata.Data,
Avg: float64(qdata.Avg), Statistics: schema.MetricStatistics{
Min: float64(qdata.Min), Avg: float64(qdata.Avg),
Max: float64(qdata.Max), Min: float64(qdata.Min),
Max: float64(qdata.Max),
},
}, },
}, },
}, })
}) } else {
cclog.Warnf("Metric '%s' not configured for cluster '%s': Skipped in LoadNodeData() Return!", metric, cluster)
}
} }
if len(errors) != 0 { if len(errors) != 0 {

View File

@@ -30,7 +30,8 @@
Table, Table,
Progress, Progress,
Icon, Icon,
Button Button,
Badge
} from "@sveltestrap/sveltestrap"; } from "@sveltestrap/sveltestrap";
import Roofline from "./generic/plots/Roofline.svelte"; import Roofline from "./generic/plots/Roofline.svelte";
import Pie, { colors } from "./generic/plots/Pie.svelte"; import Pie, { colors } from "./generic/plots/Pie.svelte";
@@ -85,7 +86,8 @@
query: gql` query: gql`
query ( query (
$cluster: String! $cluster: String!
$metrics: [String!] $nmetrics: [String!]
$cmetrics: [String!]
$from: Time! $from: Time!
$to: Time! $to: Time!
$clusterFrom: Time! $clusterFrom: Time!
@@ -97,7 +99,7 @@
# Node 5 Minute Averages for Roofline # Node 5 Minute Averages for Roofline
nodeMetrics( nodeMetrics(
cluster: $cluster cluster: $cluster
metrics: $metrics metrics: $nmetrics
from: $from from: $from
to: $to to: $to
) { ) {
@@ -106,6 +108,10 @@
metrics { metrics {
name name
metric { metric {
unit {
base
prefix
}
series { series {
statistics { statistics {
avg avg
@@ -114,21 +120,6 @@
} }
} }
} }
# Running Job Metric Average for Rooflines
jobsMetricStats(filter: $jobFilter, metrics: $metrics) {
id
jobId
duration
numNodes
numAccelerators
subCluster
stats {
name
data {
avg
}
}
}
# Get Jobs for Per-Node Counts # Get Jobs for Per-Node Counts
jobs(filter: $jobFilter, order: $sorting, page: $paging) { jobs(filter: $jobFilter, order: $sorting, page: $paging) {
items { items {
@@ -175,7 +166,7 @@
# ClusterMetrics for doubleMetricPlot # ClusterMetrics for doubleMetricPlot
clusterMetrics( clusterMetrics(
cluster: $cluster cluster: $cluster
metrics: $metrics metrics: $cmetrics
from: $clusterFrom from: $clusterFrom
to: $to to: $to
) { ) {
@@ -194,7 +185,8 @@
`, `,
variables: { variables: {
cluster: presetCluster, cluster: presetCluster,
metrics: ["flops_any", "mem_bw"], // Metrics For Cluster Plot and Roofline nmetrics: ["flops_any", "mem_bw", "cpu_power", "acc_power"], // Metrics For Roofline and Stats
cmetrics: ["flops_any", "mem_bw"], // Metrics For Cluster Plot
from: from.toISOString(), from: from.toISOString(),
clusterFrom: clusterFrom.toISOString(), clusterFrom: clusterFrom.toISOString(),
to: to.toISOString(), to: to.toISOString(),
@@ -258,6 +250,11 @@
} }
} }
// Get Idle Infos after Sums
if (!rawInfos['idleNodes']) rawInfos['idleNodes'] = rawInfos['totalNodes'] - rawInfos['allocatedNodes'];
if (!rawInfos['idleCores']) rawInfos['idleCores'] = rawInfos['totalCores'] - rawInfos['allocatedCores'];
if (!rawInfos['idleAccs']) rawInfos['idleAccs'] = rawInfos['totalAccs'] - rawInfos['allocatedAccs'];
// Keymetrics (Data on Cluster-Scope) // Keymetrics (Data on Cluster-Scope)
let rawFlops = $statusQuery?.data?.nodeMetrics?.reduce((sum, node) => let rawFlops = $statusQuery?.data?.nodeMetrics?.reduce((sum, node) =>
sum + (node.metrics.find((m) => m.name == 'flops_any')?.metric?.series[0]?.statistics?.avg || 0), sum + (node.metrics.find((m) => m.name == 'flops_any')?.metric?.series[0]?.statistics?.avg || 0),
@@ -271,6 +268,26 @@
) || 0; ) || 0;
rawInfos['memBwRate'] = Math.floor((rawMemBw * 100) / 100) rawInfos['memBwRate'] = Math.floor((rawMemBw * 100) / 100)
let rawCpuPwr = $statusQuery?.data?.nodeMetrics?.reduce((sum, node) =>
sum + (node.metrics.find((m) => m.name == 'cpu_power')?.metric?.series[0]?.statistics?.avg || 0),
0, // Initial Value
) || 0;
rawInfos['cpuPwr'] = Math.floor((rawCpuPwr * 100) / 100)
if (!rawInfos['cpuPwrUnit']) {
let rawCpuUnit = $statusQuery?.data?.nodeMetrics[0]?.metrics.find((m) => m.name == 'cpu_power')?.metric?.unit || null
rawInfos['cpuPwrUnit'] = rawCpuUnit ? rawCpuUnit.prefix + rawCpuUnit.base : ''
}
let rawGpuPwr = $statusQuery?.data?.nodeMetrics?.reduce((sum, node) =>
sum + (node.metrics.find((m) => m.name == 'acc_power')?.metric?.series[0]?.statistics?.avg || 0),
0, // Initial Value
) || 0;
rawInfos['gpuPwr'] = Math.floor((rawGpuPwr * 100) / 100)
if (!rawInfos['gpuPwrUnit']) {
let rawGpuUnit = $statusQuery?.data?.nodeMetrics[0]?.metrics.find((m) => m.name == 'acc_power')?.metric?.unit || null
rawInfos['gpuPwrUnit'] = rawGpuUnit ? rawGpuUnit.prefix + rawGpuUnit.base : ''
}
return rawInfos return rawInfos
} else { } else {
return {}; return {};
@@ -338,7 +355,7 @@
</script> </script>
<Card style="height: 98vh;"> <Card style="height: 98vh;">
<CardBody class="align-content-center p-1"> <CardBody class="align-content-center p-2">
<Row> <Row>
<Col> <Col>
<Refresher <Refresher
@@ -408,79 +425,99 @@
<Col> <!-- Utilization Info Card --> <Col> <!-- Utilization Info Card -->
<Card class="h-100"> <Card class="h-100">
<CardBody> <CardBody>
<Table borderless> <Row class="mb-1">
<tr class="py-2"> <Col xs={4} class="d-inline-flex align-items-center justify-content-center">
<td style="font-size:x-large;">{clusterInfo?.runningJobs} Running Jobs</td> <Badge color="primary" style="font-size:x-large;margin-right:0.25rem;">
<td colspan="2" style="font-size:x-large;">{clusterInfo?.activeUsers} Active Users</td> {clusterInfo?.runningJobs}
</tr> </Badge>
<hr class="my-1"/> <div style="font-size:large;">
<tr class="pt-2"> Running Jobs
<td style="font-size: large;"> </div>
Flop Rate (<span style="cursor: help;" title="Flops[Any] = (Flops[Double] x 2) + Flops[Single]">Any</span>) </Col>
</td> <Col xs={4} class="d-inline-flex align-items-center justify-content-center">
<td colspan="2" style="font-size: large;"> <Badge color="primary" style="font-size:x-large;margin-right:0.25rem;">
Memory BW Rate {clusterInfo?.activeUsers}
</td> </Badge>
</tr> <div style="font-size:large;">
<tr class="pb-2"> Active Users
<td style="font-size:x-large;"> </div>
{clusterInfo?.flopRate} </Col>
{clusterInfo?.flopRateUnit} <Col xs={4} class="d-inline-flex align-items-center justify-content-center">
</td> <Badge color="primary" style="font-size:x-large;margin-right:0.25rem;">
<td colspan="2" style="font-size:x-large;"> {clusterInfo?.allocatedNodes}
{clusterInfo?.memBwRate} </Badge>
{clusterInfo?.memBwRateUnit} <div style="font-size:large;">
</td> Active Nodes
</tr> </div>
<hr class="my-1"/> </Col>
<tr class="py-2"> </Row>
<th scope="col">Allocated Nodes</th> <Row class="mt-1 mb-2">
<td style="min-width: 100px;" <Col xs={4} class="d-inline-flex align-items-center justify-content-center">
><div class="col"> <Badge color="secondary" style="font-size:x-large;margin-right:0.25rem;">
<Progress {clusterInfo?.flopRate} {clusterInfo?.flopRateUnit}
value={clusterInfo?.allocatedNodes} </Badge>
max={clusterInfo?.totalNodes} <div style="font-size:large;">
/> Total Flop Rate
</div></td </div>
> </Col>
<td <Col xs={4} class="d-inline-flex align-items-center justify-content-center">
>{clusterInfo?.allocatedNodes} / {clusterInfo?.totalNodes} <Badge color="secondary" style="font-size:x-large;margin-right:0.25rem;">
Nodes</td {clusterInfo?.memBwRate} {clusterInfo?.memBwRateUnit}
> </Badge>
</tr> <div style="font-size:large;">
<tr class="py-2"> Total Memory Bandwidth
<th scope="col">Allocated Cores</th> </div>
<td style="min-width: 100px;" </Col>
><div class="col">
<Progress
value={clusterInfo?.allocatedCores}
max={clusterInfo?.totalCores}
/>
</div></td
>
<td
>{formatNumber(clusterInfo?.allocatedCores)} / {formatNumber(clusterInfo?.totalCores)}
Cores</td
>
</tr>
{#if clusterInfo?.totalAccs !== 0} {#if clusterInfo?.totalAccs !== 0}
<tr class="py-2"> <Col xs={4} class="d-inline-flex align-items-center justify-content-center">
<th scope="col">Allocated Accelerators</th> <Badge color="secondary" style="font-size:x-large;margin-right:0.25rem;">
<td style="min-width: 100px;" {clusterInfo?.gpuPwr} {clusterInfo?.gpuPwrUnit}
><div class="col"> </Badge>
<Progress <div style="font-size:large;">
value={clusterInfo?.allocatedAccs} Total GPU Power
max={clusterInfo?.totalAccs} </div>
/> </Col>
</div></td {:else}
> <Col xs={4} class="d-inline-flex align-items-center justify-content-center">
<td <Badge color="secondary" style="font-size:x-large;margin-right:0.25rem;">
>{clusterInfo?.allocatedAccs} / {clusterInfo?.totalAccs} {clusterInfo?.cpuPwr} {clusterInfo?.cpuPwrUnit}
Accelerators</td </Badge>
> <div style="font-size:large;">
</tr> Total CPU Power
</div>
</Col>
{/if} {/if}
</Table> </Row>
<Row class="my-1 align-items-baseline">
<Col xs={2} style="font-size:large;">
Active Cores
</Col>
<Col xs={8}>
<Progress multi style="height:2.5rem;font-size:x-large;">
<Progress bar color="success" value={clusterInfo?.allocatedCores}>{formatNumber(clusterInfo?.allocatedCores)}</Progress>
<Progress bar color="light" value={clusterInfo?.idleCores}>{formatNumber(clusterInfo?.idleCores)}</Progress>
</Progress>
</Col>
<Col xs={2} style="font-size:large;">
Idle Cores
</Col>
</Row>
{#if clusterInfo?.totalAccs !== 0}
<Row class="my-1 align-items-baseline">
<Col xs={2} style="font-size:large;">
Active GPU
</Col>
<Col xs={8}>
<Progress multi style="height:2.5rem;font-size:x-large;">
<Progress bar color="success" value={clusterInfo?.allocatedAccs}>{formatNumber(clusterInfo?.allocatedAccs)}</Progress>
<Progress bar color="light" value={clusterInfo?.idleAccs}>{formatNumber(clusterInfo?.idleAccs)}</Progress>
</Progress>
</Col>
<Col xs={2} style="font-size:large;">
Idle GPU
</Col>
</Row>
{/if}
</CardBody> </CardBody>
</Card> </Card>
</Col> </Col>
@@ -506,7 +543,7 @@
useColors={false} useColors={false}
useLegend={false} useLegend={false}
allowSizeChange allowSizeChange
width={colWidthRoof - 10} width={colWidthRoof}
height={300} height={300}
cluster={presetCluster} cluster={presetCluster}
subCluster={clusterInfo?.roofData ? clusterInfo.roofData : null} subCluster={clusterInfo?.roofData ? clusterInfo.roofData : null}
@@ -574,8 +611,8 @@
{#key $statesTimed?.data?.nodeStatesTimed} {#key $statesTimed?.data?.nodeStatesTimed}
<Stacked <Stacked
data={$statesTimed?.data?.nodeStatesTimed} data={$statesTimed?.data?.nodeStatesTimed}
width={colWidthStacked * 0.95} width={colWidthStacked}
xlabel="Time" height={260}
ylabel="Nodes" ylabel="Nodes"
yunit = "#Count" yunit = "#Count"
title = "Cluster Status" title = "Cluster Status"

View File

@@ -23,7 +23,7 @@
width = 0, width = 0,
height = 300, height = 300,
data = null, data = null,
xlabel = "", xlabel = null,
ylabel = "", ylabel = "",
yunit = "", yunit = "",
title = "", title = "",

View File

@@ -3,7 +3,7 @@
*/ */
const power = [1, 1e3, 1e6, 1e9, 1e12, 1e15, 1e18, 1e21] const power = [1, 1e3, 1e6, 1e9, 1e12, 1e15, 1e18, 1e21]
const prefix = ['', 'K', 'M', 'G', 'T', 'P', 'E'] const prefix = ['', 'k', 'M', 'G', 'T', 'P', 'E']
export function formatNumber(x) { export function formatNumber(x) {
if ( isNaN(x) || x == null) { if ( isNaN(x) || x == null) {

View File

@@ -355,7 +355,7 @@
</script> </script>
<Card style="height: 88vh;"> <Card>
<CardBody class="align-content-center"> <CardBody class="align-content-center">
<Row> <Row>
<Col> <Col>
@@ -540,7 +540,7 @@
<Roofline <Roofline
useColors={true} useColors={true}
allowSizeChange allowSizeChange
width={colWidthRoof - 10} width={colWidthRoof}
height={300} height={300}
subCluster={clusterInfo?.roofData ? clusterInfo.roofData : null} subCluster={clusterInfo?.roofData ? clusterInfo.roofData : null}
roofData={transformJobsStatsToData($statusQuery?.data?.jobsMetricStats)} roofData={transformJobsStatsToData($statusQuery?.data?.jobsMetricStats)}
@@ -568,7 +568,8 @@
{#key $statesTimed?.data?.nodeStates} {#key $statesTimed?.data?.nodeStates}
<Stacked <Stacked
data={$statesTimed?.data?.nodeStates} data={$statesTimed?.data?.nodeStates}
width={colWidthStacked1 * 0.95} width={colWidthStacked1}
height={330}
xlabel="Time" xlabel="Time"
ylabel="Nodes" ylabel="Nodes"
yunit = "#Count" yunit = "#Count"
@@ -584,7 +585,8 @@
{#key $statesTimed?.data?.healthStates} {#key $statesTimed?.data?.healthStates}
<Stacked <Stacked
data={$statesTimed?.data?.healthStates} data={$statesTimed?.data?.healthStates}
width={colWidthStacked2 * 0.95} width={colWidthStacked2}
height={330}
xlabel="Time" xlabel="Time"
ylabel="Nodes" ylabel="Nodes"
yunit = "#Count" yunit = "#Count"