mirror of
https://github.com/ClusterCockpit/cc-backend
synced 2025-09-05 00:23:00 +02:00
Merge branch 'hotfix' of https://github.com/ClusterCockpit/cc-backend into hotfix
This commit is contained in:
@@ -1,129 +1,199 @@
|
||||
<script>
|
||||
import { getContext } from 'svelte'
|
||||
import {
|
||||
Card,
|
||||
CardHeader,
|
||||
CardTitle,
|
||||
CardBody,
|
||||
Progress,
|
||||
Icon,
|
||||
Tooltip
|
||||
} from "sveltestrap";
|
||||
import { mean, round } from 'mathjs'
|
||||
|
||||
export let job
|
||||
export let jobMetrics
|
||||
export let view = 'job'
|
||||
export let width = 'auto'
|
||||
|
||||
const clusters = getContext('clusters')
|
||||
const subclusterConfig = clusters.find((c) => c.name == job.cluster).subClusters.find((sc) => sc.name == job.subCluster)
|
||||
|
||||
const footprintMetrics = (job.numAcc !== 0)
|
||||
? (job.exclusive !== 1) // GPU
|
||||
? ['acc_utilization', 'acc_mem_used', 'nv_sm_clock', 'nv_mem_util'] // Shared
|
||||
: ['acc_utilization', 'acc_mem_used', 'nv_sm_clock', 'nv_mem_util'] // Exclusive
|
||||
: (job.exclusive !== 1) // CPU only
|
||||
? ['flops_any', 'mem_used'] // Shared
|
||||
: ['cpu_load', 'flops_any', 'mem_used', 'mem_bw'] // Exclusive
|
||||
|
||||
const footprintData = footprintMetrics.map((fm) => {
|
||||
// Mean: Primarily use backend sourced avgs from job.*, secondarily calculate/read from metricdata
|
||||
let mv = null
|
||||
if (fm === 'cpu_load' && job.loadAvg !== 0) {
|
||||
mv = round(job.loadAvg, 2)
|
||||
} else if (fm === 'flops_any' && job.flopsAnyAvg !== 0) {
|
||||
mv = round(job.flopsAnyAvg, 2)
|
||||
} else if (fm === 'mem_bw' && job.memBwAvg !== 0) {
|
||||
mv = round(job.memBwAvg, 2)
|
||||
} else { // Calculate from jobMetrics
|
||||
const jm = jobMetrics.find((jm) => jm.name === fm && jm.scope === 'node')
|
||||
if (jm?.metric?.statisticsSeries) {
|
||||
mv = round(mean(jm.metric.statisticsSeries.mean), 2)
|
||||
} else if (jm?.metric?.series?.length > 1) {
|
||||
const avgs = jm.metric.series.map(jms => jms.statistics.avg)
|
||||
mv = round(mean(avgs), 2)
|
||||
} else if (jm?.metric?.series) {
|
||||
mv = round(jm.metric.series[0].statistics.avg, 2)
|
||||
} else {
|
||||
mv = 0.0
|
||||
}
|
||||
}
|
||||
|
||||
// Unit
|
||||
const fmc = getContext('metrics')(job.cluster, fm)
|
||||
let unit = ''
|
||||
if (fmc?.unit?.base) unit = fmc.unit.prefix + fmc.unit.base
|
||||
|
||||
// Threshold / -Differences
|
||||
const fmt = findJobThresholds(job, fmc, subclusterConfig)
|
||||
if (fm === 'flops_any') fmt.peak = round((fmt.peak * 0.85), 0)
|
||||
|
||||
// Define basic data
|
||||
const fmBase = {
|
||||
name: fm,
|
||||
unit: unit,
|
||||
avg: mv,
|
||||
max: fmt.peak
|
||||
}
|
||||
|
||||
if (evalFootprint(fm, mv, fmt, 'alert')) {
|
||||
return {
|
||||
...fmBase,
|
||||
color: 'danger',
|
||||
message:`Metric average way ${fm === 'mem_used' ? 'above' : 'below' } expected normal thresholds.`,
|
||||
impact: 3
|
||||
}
|
||||
} else if (evalFootprint(fm, mv, fmt, 'caution')) {
|
||||
return {
|
||||
...fmBase,
|
||||
color: 'warning',
|
||||
message: `Metric average ${fm === 'mem_used' ? 'above' : 'below' } expected normal thresholds.`,
|
||||
impact: 2
|
||||
}
|
||||
} else if (evalFootprint(fm, mv, fmt, 'normal')) {
|
||||
return {
|
||||
...fmBase,
|
||||
color: 'success',
|
||||
message: 'Metric average within expected thresholds.',
|
||||
impact: 1
|
||||
}
|
||||
} else if (evalFootprint(fm, mv, fmt, 'peak')) {
|
||||
return {
|
||||
...fmBase,
|
||||
color: 'info',
|
||||
message: 'Metric average above expected normal thresholds: Check for artifacts recommended.',
|
||||
impact: 0
|
||||
}
|
||||
} else {
|
||||
return {
|
||||
...fmBase,
|
||||
color: 'secondary',
|
||||
message: 'Metric average above expected peak threshold: Check for artifacts!',
|
||||
impact: -1
|
||||
}
|
||||
}
|
||||
})
|
||||
|
||||
function evalFootprint(metric, mean, thresholds, level) {
|
||||
// mem_used has inverse logic regarding threshold levels, notify levels triggered if mean > threshold
|
||||
switch (level) {
|
||||
case 'peak':
|
||||
if (metric === 'mem_used') return false // mem_used over peak -> return false to trigger impact -1
|
||||
else return (mean <= thresholds.peak && mean > thresholds.normal)
|
||||
case 'alert':
|
||||
if (metric === 'mem_used') return (mean <= thresholds.peak && mean >= thresholds.alert)
|
||||
else return (mean <= thresholds.alert && mean >= 0)
|
||||
case 'caution':
|
||||
if (metric === 'mem_used') return (mean < thresholds.alert && mean >= thresholds.caution)
|
||||
else return (mean <= thresholds.caution && mean > thresholds.alert)
|
||||
case 'normal':
|
||||
if (metric === 'mem_used') return (mean < thresholds.caution && mean >= 0)
|
||||
else return (mean <= thresholds.normal && mean > thresholds.caution)
|
||||
default:
|
||||
return false
|
||||
}
|
||||
<script context="module">
|
||||
export function findJobThresholds(job, metricConfig, subClusterConfig) {
|
||||
if (!job || !metricConfig || !subClusterConfig) {
|
||||
console.warn("Argument missing for findJobThresholds!");
|
||||
return null;
|
||||
}
|
||||
|
||||
const subclusterThresholds = metricConfig.subClusters.find(
|
||||
(sc) => sc.name == subClusterConfig.name,
|
||||
);
|
||||
const defaultThresholds = {
|
||||
peak: subclusterThresholds
|
||||
? subclusterThresholds.peak
|
||||
: metricConfig.peak,
|
||||
normal: subclusterThresholds
|
||||
? subclusterThresholds.normal
|
||||
: metricConfig.normal,
|
||||
caution: subclusterThresholds
|
||||
? subclusterThresholds.caution
|
||||
: metricConfig.caution,
|
||||
alert: subclusterThresholds
|
||||
? subclusterThresholds.alert
|
||||
: metricConfig.alert,
|
||||
};
|
||||
|
||||
if (job.exclusive === 1) {
|
||||
// Exclusive: Use as defined
|
||||
return defaultThresholds;
|
||||
} else {
|
||||
// Shared: Handle specifically
|
||||
if (metricConfig.name === "cpu_load") {
|
||||
// Special: Avg Aggregation BUT scaled based on #hwthreads
|
||||
return {
|
||||
peak: job.numHWThreads,
|
||||
normal: job.numHWThreads,
|
||||
caution: defaultThresholds.caution,
|
||||
alert: defaultThresholds.alert,
|
||||
};
|
||||
} else if (metricConfig.aggregation === "avg") {
|
||||
return defaultThresholds;
|
||||
} else if (metricConfig.aggregation === "sum") {
|
||||
const jobFraction =
|
||||
job.numHWThreads / subClusterConfig.topology.node.length;
|
||||
return {
|
||||
peak: round(defaultThresholds.peak * jobFraction, 0),
|
||||
normal: round(defaultThresholds.normal * jobFraction, 0),
|
||||
caution: round(defaultThresholds.caution * jobFraction, 0),
|
||||
alert: round(defaultThresholds.alert * jobFraction, 0),
|
||||
};
|
||||
} else {
|
||||
console.warn(
|
||||
"Missing or unkown aggregation mode (sum/avg) for metric:",
|
||||
metricConfig,
|
||||
);
|
||||
return null;
|
||||
}
|
||||
} // Other job.exclusive cases?
|
||||
}
|
||||
</script>
|
||||
|
||||
<script>
|
||||
import { getContext } from "svelte";
|
||||
import {
|
||||
Card,
|
||||
CardHeader,
|
||||
CardTitle,
|
||||
CardBody,
|
||||
Progress,
|
||||
Icon,
|
||||
Tooltip,
|
||||
} from "@sveltestrap/sveltestrap";
|
||||
import { mean, round } from "mathjs";
|
||||
|
||||
export let job;
|
||||
export let jobMetrics;
|
||||
export let view = "job";
|
||||
export let width = "auto";
|
||||
|
||||
const clusters = getContext("clusters");
|
||||
const subclusterConfig = clusters
|
||||
.find((c) => c.name == job.cluster)
|
||||
.subClusters.find((sc) => sc.name == job.subCluster);
|
||||
|
||||
const footprintMetrics =
|
||||
job.numAcc !== 0
|
||||
? job.exclusive !== 1 // GPU
|
||||
? ["acc_utilization", "acc_mem_used", "nv_sm_clock", "nv_mem_util"] // Shared
|
||||
: ["acc_utilization", "acc_mem_used", "nv_sm_clock", "nv_mem_util"] // Exclusive
|
||||
: (job.exclusive !== 1) // CPU Only
|
||||
? ["flops_any", "mem_used"] // Shared
|
||||
: ["cpu_load", "flops_any", "mem_used", "mem_bw"]; // Exclusive
|
||||
|
||||
const footprintData = footprintMetrics.map((fm) => {
|
||||
// Mean: Primarily use backend sourced avgs from job.*, secondarily calculate/read from metricdata
|
||||
let mv = null;
|
||||
if (fm === "cpu_load" && job.loadAvg !== 0) {
|
||||
mv = round(job.loadAvg, 2);
|
||||
} else if (fm === "flops_any" && job.flopsAnyAvg !== 0) {
|
||||
mv = round(job.flopsAnyAvg, 2);
|
||||
} else if (fm === "mem_bw" && job.memBwAvg !== 0) {
|
||||
mv = round(job.memBwAvg, 2);
|
||||
} else {
|
||||
// Calculate from jobMetrics
|
||||
const jm = jobMetrics.find((jm) => jm.name === fm && jm.scope === "node");
|
||||
if (jm?.metric?.statisticsSeries) {
|
||||
mv = round(mean(jm.metric.statisticsSeries.mean), 2);
|
||||
} else if (jm?.metric?.series?.length > 1) {
|
||||
const avgs = jm.metric.series.map((jms) => jms.statistics.avg);
|
||||
mv = round(mean(avgs), 2);
|
||||
} else if (jm?.metric?.series) {
|
||||
mv = round(jm.metric.series[0].statistics.avg, 2);
|
||||
} else {
|
||||
mv = 0.0;
|
||||
}
|
||||
}
|
||||
|
||||
// Unit
|
||||
const fmc = getContext("metrics")(job.cluster, fm);
|
||||
let unit = "";
|
||||
if (fmc?.unit?.base) unit = fmc.unit.prefix + fmc.unit.base;
|
||||
|
||||
// Threshold / -Differences
|
||||
const fmt = findJobThresholds(job, fmc, subclusterConfig);
|
||||
if (fm === "flops_any") fmt.peak = round(fmt.peak * 0.85, 0);
|
||||
|
||||
// Define basic data
|
||||
const fmBase = {
|
||||
name: fm,
|
||||
unit: unit,
|
||||
avg: mv,
|
||||
max: fmt.peak,
|
||||
};
|
||||
|
||||
if (evalFootprint(fm, mv, fmt, "alert")) {
|
||||
return {
|
||||
...fmBase,
|
||||
color: "danger",
|
||||
message: `Metric average way ${fm === "mem_used" ? "above" : "below"} expected normal thresholds.`,
|
||||
impact: 3,
|
||||
};
|
||||
} else if (evalFootprint(fm, mv, fmt, "caution")) {
|
||||
return {
|
||||
...fmBase,
|
||||
color: "warning",
|
||||
message: `Metric average ${fm === "mem_used" ? "above" : "below"} expected normal thresholds.`,
|
||||
impact: 2,
|
||||
};
|
||||
} else if (evalFootprint(fm, mv, fmt, "normal")) {
|
||||
return {
|
||||
...fmBase,
|
||||
color: "success",
|
||||
message: "Metric average within expected thresholds.",
|
||||
impact: 1,
|
||||
};
|
||||
} else if (evalFootprint(fm, mv, fmt, "peak")) {
|
||||
return {
|
||||
...fmBase,
|
||||
color: "info",
|
||||
message:
|
||||
"Metric average above expected normal thresholds: Check for artifacts recommended.",
|
||||
impact: 0,
|
||||
};
|
||||
} else {
|
||||
return {
|
||||
...fmBase,
|
||||
color: "secondary",
|
||||
message:
|
||||
"Metric average above expected peak threshold: Check for artifacts!",
|
||||
impact: -1,
|
||||
};
|
||||
}
|
||||
});
|
||||
|
||||
function evalFootprint(metric, mean, thresholds, level) {
|
||||
// mem_used has inverse logic regarding threshold levels, notify levels triggered if mean > threshold
|
||||
switch (level) {
|
||||
case "peak":
|
||||
if (metric === "mem_used")
|
||||
return false; // mem_used over peak -> return false to trigger impact -1
|
||||
else return mean <= thresholds.peak && mean > thresholds.normal;
|
||||
case "alert":
|
||||
if (metric === "mem_used")
|
||||
return mean <= thresholds.peak && mean >= thresholds.alert;
|
||||
else return mean <= thresholds.alert && mean >= 0;
|
||||
case "caution":
|
||||
if (metric === "mem_used")
|
||||
return mean < thresholds.alert && mean >= thresholds.caution;
|
||||
else return mean <= thresholds.caution && mean > thresholds.alert;
|
||||
case "normal":
|
||||
if (metric === "mem_used")
|
||||
return mean < thresholds.caution && mean >= 0;
|
||||
else return mean <= thresholds.normal && mean > thresholds.caution;
|
||||
default:
|
||||
return false;
|
||||
}
|
||||
}
|
||||
</script>
|
||||
|
||||
<script context="module">
|
||||
@@ -155,13 +225,7 @@
|
||||
} else if (metricConfig.aggregation === 'avg' ){
|
||||
return defaultThresholds
|
||||
} else if (metricConfig.aggregation === 'sum' ){
|
||||
let jobFraction = 0.0
|
||||
if (job.numAcc > 0) {
|
||||
jobFraction = job.numAcc / subClusterConfig.topology.accelerators.length
|
||||
} else if (job.numHWThreads > 0) {
|
||||
jobFraction = job.numHWThreads / subClusterConfig.topology.node.length
|
||||
}
|
||||
|
||||
const jobFraction = job.numHWThreads / subClusterConfig.topology.node.length
|
||||
return {
|
||||
peak: round((defaultThresholds.peak * jobFraction), 0),
|
||||
normal: round((defaultThresholds.normal * jobFraction), 0),
|
||||
@@ -177,62 +241,67 @@
|
||||
</script>
|
||||
|
||||
<Card class="h-auto mt-1" style="width: {width}px;">
|
||||
{#if view === 'job'}
|
||||
{#if view === "job"}
|
||||
<CardHeader>
|
||||
<CardTitle class="mb-0 d-flex justify-content-center">
|
||||
Core Metrics Footprint
|
||||
</CardTitle>
|
||||
<CardTitle class="mb-0 d-flex justify-content-center">
|
||||
Core Metrics Footprint
|
||||
</CardTitle>
|
||||
</CardHeader>
|
||||
{/if}
|
||||
<CardBody>
|
||||
{#each footprintData as fpd, index}
|
||||
<div class="mb-1 d-flex justify-content-between">
|
||||
<div> <b>{fpd.name}</b></div>
|
||||
<!-- For symmetry, see below ...-->
|
||||
<div
|
||||
class="cursor-help d-inline-flex"
|
||||
id={`footprint-${job.jobId}-${index}`}
|
||||
>
|
||||
<div class="mx-1">
|
||||
<!-- Alerts Only -->
|
||||
{#if fpd.impact === 3 || fpd.impact === -1}
|
||||
<Icon name="exclamation-triangle-fill" class="text-danger" />
|
||||
{:else if fpd.impact === 2}
|
||||
<Icon name="exclamation-triangle" class="text-warning" />
|
||||
{/if}
|
||||
<!-- Emoji for all states-->
|
||||
{#if fpd.impact === 3}
|
||||
<Icon name="emoji-frown" class="text-danger" />
|
||||
{:else if fpd.impact === 2}
|
||||
<Icon name="emoji-neutral" class="text-warning" />
|
||||
{:else if fpd.impact === 1}
|
||||
<Icon name="emoji-smile" class="text-success" />
|
||||
{:else if fpd.impact === 0}
|
||||
<Icon name="emoji-laughing" class="text-info" />
|
||||
{:else if fpd.impact === -1}
|
||||
<Icon name="emoji-dizzy" class="text-danger" />
|
||||
{/if}
|
||||
</div>
|
||||
<div>
|
||||
<!-- Print Values -->
|
||||
{fpd.avg} / {fpd.max}
|
||||
{fpd.unit} <!-- To increase margin to tooltip: No other way manageable ... -->
|
||||
</div>
|
||||
</div>
|
||||
<Tooltip
|
||||
target={`footprint-${job.jobId}-${index}`}
|
||||
placement="right"
|
||||
offset={[0, 20]}>{fpd.message}</Tooltip
|
||||
>
|
||||
</div>
|
||||
<div class="mb-2">
|
||||
<Progress value={fpd.avg} max={fpd.max} color={fpd.color} />
|
||||
</div>
|
||||
{/each}
|
||||
{#if job?.metaData?.message}
|
||||
<hr class="mt-1 mb-2" />
|
||||
{@html job.metaData.message}
|
||||
{/if}
|
||||
<CardBody>
|
||||
{#each footprintData as fpd, index}
|
||||
<div class="mb-1 d-flex justify-content-between">
|
||||
<div> <b>{fpd.name}</b></div> <!-- For symmetry, see below ...-->
|
||||
<div class="cursor-help d-inline-flex" id={`footprint-${job.jobId}-${index}`}>
|
||||
<div class="mx-1">
|
||||
<!-- Alerts Only -->
|
||||
{#if fpd.impact === 3 || fpd.impact === -1}
|
||||
<Icon name="exclamation-triangle-fill" class="text-danger"/>
|
||||
{:else if fpd.impact === 2}
|
||||
<Icon name="exclamation-triangle" class="text-warning"/>
|
||||
{/if}
|
||||
<!-- Emoji for all states-->
|
||||
{#if fpd.impact === 3}
|
||||
<Icon name="emoji-frown" class="text-danger"/>
|
||||
{:else if fpd.impact === 2}
|
||||
<Icon name="emoji-neutral" class="text-warning"/>
|
||||
{:else if fpd.impact === 1}
|
||||
<Icon name="emoji-smile" class="text-success"/>
|
||||
{:else if fpd.impact === 0}
|
||||
<Icon name="emoji-laughing" class="text-info"/>
|
||||
{:else if fpd.impact === -1}
|
||||
<Icon name="emoji-dizzy" class="text-danger"/>
|
||||
{/if}
|
||||
</div>
|
||||
<div>
|
||||
<!-- Print Values -->
|
||||
{fpd.avg} / {fpd.max} {fpd.unit} <!-- To increase margin to tooltip: No other way manageable ... -->
|
||||
</div>
|
||||
</div>
|
||||
<Tooltip target={`footprint-${job.jobId}-${index}`} placement="right" offset={[0, 20]}>{fpd.message}</Tooltip>
|
||||
</div>
|
||||
<div class="mb-2">
|
||||
<Progress
|
||||
value={fpd.avg}
|
||||
max={fpd.max}
|
||||
color={fpd.color}
|
||||
/>
|
||||
</div>
|
||||
{/each}
|
||||
{#if job?.metaData?.message}
|
||||
<hr class="mt-1 mb-2"/>
|
||||
{@html job.metaData.message}
|
||||
{/if}
|
||||
</CardBody>
|
||||
</CardBody>
|
||||
</Card>
|
||||
|
||||
<style>
|
||||
.cursor-help {
|
||||
cursor: help;
|
||||
}
|
||||
.cursor-help {
|
||||
cursor: help;
|
||||
}
|
||||
</style>
|
||||
|
Reference in New Issue
Block a user