Build new statusDash, refine newRoofline data render

This commit is contained in:
Christoph Kluge
2025-08-07 16:10:11 +02:00
parent 71cfb4db77
commit bef832e45b
5 changed files with 345 additions and 121 deletions

View File

@@ -15,7 +15,7 @@
CardBody,
Table,
Progress,
Icon,
// Icon,
} from "@sveltestrap/sveltestrap";
import {
queryStore,
@@ -24,11 +24,11 @@
} from "@urql/svelte";
import {
init,
transformPerNodeDataForRoofline,
// transformPerNodeDataForRoofline,
} from "../generic/utils.js";
import { scaleNumbers } from "../generic/units.js";
import Roofline from "../generic/plots/Roofline.svelte";
import { scaleNumbers, formatTime } from "../generic/units.js";
import NewBubbleRoofline from "../generic/plots/NewBubbleRoofline.svelte";
/* Svelte 5 Props */
let {
@@ -68,9 +68,12 @@
$metrics: [String!]
$from: Time!
$to: Time!
$filter: [JobFilter!]!
$jobFilter: [JobFilter!]!
$nodeFilter: [NodeFilter!]!
$paging: PageRequest!
$sorting: OrderByInput!
) {
# Node 5 Minute Averages for Roofline
nodeMetrics(
cluster: $cluster
metrics: $metrics
@@ -81,27 +84,58 @@
subCluster
metrics {
name
scope
metric {
timestep
unit {
base
prefix
}
series {
data
statistics {
avg
}
}
}
}
}
# Running Job Metric Average for Rooflines
jobsMetricStats(filter: $jobFilter, metrics: $metrics) {
id
jobId
duration
numNodes
numAccelerators
subCluster
stats {
name
data {
avg
}
}
}
# Get Jobs for Per-Node Counts
jobs(filter: $jobFilter, order: $sorting, page: $paging) {
items {
jobId
resources {
hostname
}
}
count
}
# Only counts shared nodes once
allocatedNodes(cluster: $cluster) {
name
count
}
# Get States for Node Roofline; $sorting unused in backend: Use placeholder
nodes(filter: $nodeFilter, order: $sorting) {
count
items {
hostname
cluster
subCluster
nodeState
}
}
# totalNodes includes multiples if shared jobs
jobsStatistics(
filter: $filter
filter: $jobFilter
page: $paging
sortBy: TOTALJOBS
groupBy: SUBCLUSTER
@@ -118,8 +152,10 @@
metrics: ["flops_any", "mem_bw"], // Fixed names for roofline and status bars
from: from.toISOString(),
to: to.toISOString(),
filter: [{ state: ["running"] }, { cluster: { eq: cluster } }],
jobFilter: [{ state: ["running"] }, { cluster: { eq: cluster } }],
nodeFilter: { cluster: { eq: cluster }},
paging: { itemsPerPage: -1, page: 1 }, // Get all: -1
sorting: { field: "startTime", type: "col", order: "DESC" }
},
}));
@@ -170,6 +206,7 @@
});
/* Const Functions */
// New: Sum Up Node Averages
const sumUp = (data, subcluster, metric) =>
data.reduce(
(sum, node) =>
@@ -177,20 +214,132 @@
? sum +
(node.metrics
.find((m) => m.name == metric)
?.metric.series.reduce(
(sum, series) => sum + series.data[series.data.length - 1],
0,
) || 0)
?.metric?.series[0]?.statistics?.avg || 0
)
: sum,
0,
);
// Old: SumUp Metric Time Data
// const sumUp = (data, subcluster, metric) =>
// data.reduce(
// (sum, node) =>
// node.subCluster == subcluster
// ? sum +
// (node.metrics
// .find((m) => m.name == metric)
// ?.metric.series.reduce(
// (sum, series) => sum + series.data[series.data.length - 1],
// 0,
// ) || 0)
// : sum,
// 0,
// );
/* Functions */
/**
 * Convert per-job metric averages into the point format consumed by the roofline plot.
 *
 * For every job the arithmetic intensity (avg "flops_any" / avg "mem_bw") becomes the
 * x value, the "flops_any" average the y value, and the duration (as a fraction of one
 * day, capped at 1.0) the color value. Jobs whose intensity is not a finite number are
 * skipped; this includes jobs that are missing one of the two stats entirely, which
 * previously raised a TypeError and aborted the whole transformation.
 *
 * @param {Array<Object>|null|undefined} subclusterData Job entries carrying `stats`
 *   (list of `{ name, data: { avg } }`) and `duration` (divided by 86400 here, so
 *   presumably seconds - confirm against the backend).
 * @returns {Array|null} `[null, [x, y], c]` or `null` if no job produced a point.
 */
function transformJobsStatsToData(subclusterData) {
  /* c will contain values from 0 to 1 representing the duration */
  const day = 86400.0;
  const x = [];
  const y = [];
  const c = [];

  if (subclusterData) {
    for (const job of subclusterData) {
      // Optional chaining: a job without one of the stats yields undefined here
      // and is dropped by the finiteness check below instead of throwing.
      const f = job.stats?.find((s) => s.name === "flops_any")?.data?.avg;
      const m = job.stats?.find((s) => s.name === "mem_bw")?.data?.avg;

      const intensity = f / m;
      // Number.isFinite is false for NaN and +/-Infinity alike.
      if (!Number.isFinite(intensity)) continue;

      x.push(intensity);
      y.push(f);

      // Long Jobs > 1 Day: Use max Color
      const d = job.duration / day;
      c.push(d > 1.0 ? 1.0 : d);
    }
  } else {
    console.warn("transformJobsStatsToData: metrics for 'mem_bw' and/or 'flops_any' missing!");
  }

  // x, y and c are always pushed together, so checking one length is sufficient.
  if (x.length === 0) return null;
  return [null, [x, y], c]; // for dataformat see roofline.svelte
}
/**
 * Convert per-node metric averages into the point format consumed by the roofline plot.
 *
 * For every node the arithmetic intensity (avg "flops_any" / avg "mem_bw", taken from the
 * first series' statistics) becomes the x value and the "flops_any" average the y value.
 * Exactly one point is emitted per input node, in input order, so the result stays
 * index-aligned with the per-node info array built from the same input. Nodes with a
 * non-finite intensity get x = 0.0 instead of being dropped. This now also covers nodes
 * that lack a metric or have an empty series, which previously raised a TypeError.
 *
 * @param {Array<Object>|null|undefined} subclusterData Node entries carrying `metrics`
 *   (list of `{ name, metric: { series: [{ statistics: { avg } }] } }`).
 * @returns {Array|null} `[null, [x, y]]` or `null` if there are no nodes.
 */
function transformNodesStatsToData(subclusterData) {
  const x = [];
  const y = [];

  if (subclusterData) {
    for (const node of subclusterData) {
      // Optional chaining: a missing metric or empty series yields undefined
      // rather than throwing; the node is then handled like any non-finite intensity.
      const f = node.metrics?.find((s) => s.name === "flops_any")?.metric?.series?.[0]?.statistics?.avg;
      const m = node.metrics?.find((s) => s.name === "mem_bw")?.metric?.series?.[0]?.statistics?.avg;

      let intensity = f / m;
      if (!Number.isFinite(intensity)) {
        // Do not skip the node: that would introduce a mismatch between Data and Info Arrays.
        // Set to Float Zero: Will not show in Log-Plot (Always below render limit)
        intensity = 0.0;
      }

      x.push(intensity);
      y.push(f); // pushed unchanged; undefined when the flops metric is missing
    }
  }

  // x and y are always pushed together, so checking one length is sufficient.
  if (x.length === 0) return null;
  return [null, [x, y]]; // for dataformat see roofline.svelte
}
/**
 * Reduce job entries to the fields shown alongside each roofline point.
 *
 * @param {Array<Object>|null|undefined} subclusterData Job entries with `id`, `jobId`,
 *   `numNodes`, optional `numAccelerators` and `duration`.
 * @returns {Array<Object>} One `{ id, jobId, numNodes, numAcc, duration }` object per
 *   job, in input order; `numAcc` falls back to 0 and `duration` is passed through
 *   `formatTime`. Returns an empty array (after a warning) when the input is missing.
 */
function transformJobsStatsToInfo(subclusterData) {
  // Guard first: nothing to map without input.
  if (!subclusterData) {
    console.warn("transformJobsStatsToInfo: jobInfo missing!")
    return []
  }

  return subclusterData.map((job) => ({
    id: job.id,
    jobId: job.jobId,
    numNodes: job.numNodes,
    // Any falsy accelerator count (missing, null, 0) is reported as 0.
    numAcc: job?.numAccelerators || 0,
    duration: formatTime(job.duration),
  }))
}
/**
 * Build the per-node info records (name, state, running-job count) for the roofline.
 *
 * The node list is taken from the metric entries passed in, *not* from the nodes stored
 * in the DB (original note: use nodes as returned from CCMS, not as saved via the
 * SlurmState-API). State and job counts are then looked up in the `$statusQuery` result.
 *
 * @param {Array<Object>|null|undefined} subClusterData Node metric entries with `host`
 *   and `subCluster`.
 * @returns {Array<Object>} One `{ nodeName, nodeState, numJobs }` object per entry, in
 *   input order. `nodeName` falls back to "unknown", `nodeState` to "notindb" when no
 *   node with the same hostname and subcluster is found, and `numJobs` to 0.
 */
function transformNodesStatsToInfo(subClusterData) {
  const infos = [];
  if (!subClusterData) return infos;

  for (const entry of subClusterData) {
    const nodeName = entry?.host || "unknown";

    // State comes from the node whose hostname and subcluster both match this entry.
    const dbNode = $statusQuery?.data?.nodes?.items?.find(
      (candidate) => candidate.hostname == nodeName && candidate.subCluster == entry.subCluster,
    );
    const nodeState = dbNode?.nodeState || "notindb";

    // A job counts for this node if any of its resources names the node's hostname.
    const jobsOnNode = $statusQuery?.data?.jobs?.items?.filter((job) =>
      job.resources.some((res) => res.hostname == nodeName),
    );
    const numJobs = jobsOnNode?.length || 0;

    infos.push({ nodeName, nodeState, numJobs });
  }

  return infos;
}
</script>
<!-- Gauges & Roofline per Subcluster-->
{#if $initq.data && $statusQuery.data}
{#each $initq.data.clusters.find((c) => c.name == cluster).subClusters as subCluster, i}
<Row cols={{ lg: 2, md: 1 , sm: 1}} class="mb-3 justify-content-center">
<Row cols={{ lg: 3, md: 1 , sm: 1}} class="mb-3 justify-content-center">
<Col class="px-3">
<Card class="h-auto mt-1">
<CardHeader>
@@ -204,6 +353,25 @@
<td colspan="2" style="font-size:x-large;">{activeUsers[subCluster.name]} Active Users</td>
</tr>
<hr class="my-1"/>
<tr class="pt-2">
<td style="font-size: large;">
Flop Rate (<span style="cursor: help;" title="Flops[Any] = (Flops[Double] x 2) + Flops[Single]">Any</span>)
</td>
<td colspan="2" style="font-size: large;">
Memory BW Rate
</td>
</tr>
<tr class="pb-2">
<td style="font-size:x-large;">
{flopRate[subCluster.name]}
{flopRateUnitPrefix[subCluster.name]}{flopRateUnitBase[subCluster.name]}
</td>
<td colspan="2" style="font-size:x-large;">
{memBwRate[subCluster.name]}
{memBwRateUnitPrefix[subCluster.name]}{memBwRateUnitBase[subCluster.name]}
</td>
</tr>
<hr class="my-1"/>
<tr class="py-2">
<th scope="col">Allocated Nodes</th>
<td style="min-width: 100px;"
@@ -236,7 +404,8 @@
>
</tr>
{/if}
<tr class="py-2">
<hr class="my-1"/>
<!-- <tr class="py-2">
<th scope="col"
>Flop Rate (Any) <Icon
name="info-circle"
@@ -280,23 +449,49 @@
memBwRateUnitPrefix[subCluster.name],
)}{memBwRateUnitBase[subCluster.name]} [Max]
</td>
</tr>
</tr> -->
</Table>
</CardBody>
</Card>
</Col>
<Col class="px-3 mt-2 mt-lg-0">
<div bind:clientWidth={plotWidths[i]}>
{#key $statusQuery.data.nodeMetrics}
<Roofline
{#key $statusQuery?.data?.nodeMetrics}
<NewBubbleRoofline
useColors={true}
allowSizeChange
width={plotWidths[i] - 10}
height={300}
cluster={cluster}
subCluster={subCluster}
roofData={transformNodesStatsToData($statusQuery?.data?.nodeMetrics.filter(
(data) => data.subCluster == subCluster.name,
)
)}
nodesData={transformNodesStatsToInfo($statusQuery?.data?.nodeMetrics.filter(
(data) => data.subCluster == subCluster.name,
)
)}
/>
{/key}
</div>
</Col>
<Col class="px-3 mt-2 mt-lg-0">
<div bind:clientWidth={plotWidths[i]}>
{#key $statusQuery?.data?.jobsMetricStats}
<NewBubbleRoofline
useColors={true}
allowSizeChange
width={plotWidths[i] - 10}
height={300}
subCluster={subCluster}
data={transformPerNodeDataForRoofline(
$statusQuery.data.nodeMetrics.filter(
roofData={transformJobsStatsToData($statusQuery?.data?.jobsMetricStats.filter(
(data) => data.subCluster == subCluster.name,
),
)
)}
jobsData={transformJobsStatsToInfo($statusQuery?.data?.jobsMetricStats.filter(
(data) => data.subCluster == subCluster.name,
)
)}
/>
{/key}