Handle artifacts, fix single node footprint flops

This commit is contained in:
Christoph Kluge 2023-11-21 10:27:16 +01:00
parent f8f900151a
commit dc860f8fd9
2 changed files with 74 additions and 10 deletions

View File

@ -1,9 +1,6 @@
<script> <script>
import { getContext } from 'svelte' import { getContext } from 'svelte'
import { import {
Row,
Col,
Spinner,
Card, Card,
CardHeader, CardHeader,
CardTitle, CardTitle,
@ -18,7 +15,7 @@
export let job export let job
export let jobMetrics export let jobMetrics
export let view = 'job' export let view = 'job'
export let width = 200 export let width = 'auto'
const footprintMetrics = ['cpu_load', 'flops_any', 'mem_used', 'mem_bw'] // 'acc_utilization' / missing: energy , move to central config before deployment const footprintMetrics = ['cpu_load', 'flops_any', 'mem_used', 'mem_bw'] // 'acc_utilization' / missing: energy , move to central config before deployment
@ -37,7 +34,6 @@
// console.log("FMTs", footprintMetricThresholds) // console.log("FMTs", footprintMetricThresholds)
const footprintData = footprintMetrics.map((fm) => { const footprintData = footprintMetrics.map((fm) => {
// From JobMetrics: Only Node Scope
const jm = jobMetrics.find((jm) => jm.name === fm && jm.scope === 'node') const jm = jobMetrics.find((jm) => jm.name === fm && jm.scope === 'node')
// ... get Mean // ... get Mean
let mv = null let mv = null
@ -53,14 +49,13 @@
} else { } else {
unit = '' unit = ''
} }
// From MetricConfig: Scope only for scaling -> Not of interest here // From MetricConfig: Scope only for scaling -> Not of interest here
const metricConfig = footprintMetricConfigs.find((fmc) => fmc.name === fm) const metricConfig = footprintMetricConfigs.find((fmc) => fmc.name === fm)
// ... get Thresholds // ... get Thresholds
const levelPeak = fm === 'flops_any' ? round((metricConfig.peak * 0.85), 0) - mv : metricConfig.peak - mv // Scale flops_any down
const levelNormal = metricConfig.normal - mv const levelNormal = metricConfig.normal - mv
const levelCaution = metricConfig.caution - mv const levelCaution = metricConfig.caution - mv
const levelAlert = metricConfig.alert - mv const levelAlert = metricConfig.alert - mv
// Collect // Collect
if (fm !== 'mem_used') { // Alert if usage is low, peak as maximum possible (scaled down for flops_any) if (fm !== 'mem_used') { // Alert if usage is low, peak as maximum possible (scaled down for flops_any)
if (levelAlert > 0) { if (levelAlert > 0) {
@ -93,7 +88,7 @@
message: 'Metric within common levels', message: 'Metric within common levels',
impact: 1 impact: 1
} }
} else { } else if (levelPeak > 0) {
return { return {
name: fm, name: fm,
unit: unit, unit: unit,
@ -103,9 +98,68 @@
message: 'Metric performs better than common levels', message: 'Metric performs better than common levels',
impact: 0 impact: 0
} }
} else { // Possible artifacts - <5% above peak OK, >=5% warning, >=50% danger
const checkData = {
name: fm,
unit: unit,
avg: mv,
max: fm === 'flops_any' ? round((metricConfig.peak * 0.85), 0) : metricConfig.peak
}
if (checkData.avg >= (1.5 * checkData.max)) {
return {
...checkData,
color: 'danger',
message: 'Metric average at least 50% above common peak value: Check data for artifacts!',
impact: -2
}
} else if (checkData.avg >= (1.05 * checkData.max)) {
return {
...checkData,
color: 'warning',
message: 'Metric average at least 5% above common peak value: Check data for artifacts',
impact: -1
}
} else {
return {
...checkData,
color: 'info',
message: 'Metric performs better than common levels',
impact: 0
}
}
} }
} else { // Inverse Logic: Alert if usage is high, Peak is bad and limits execution } else { // Inverse Logic: Alert if usage is high, Peak is bad and limits execution
if (levelAlert <= 0 && levelCaution <= 0 && levelNormal <= 0) { if (levelPeak <= 0 && levelAlert <= 0 && levelCaution <= 0 && levelNormal <= 0) { // Possible artifacts - <5% above peak OK, >=5% warning, >=50% danger
const checkData = {
name: fm,
unit: unit,
avg: mv,
max: metricConfig.peak
}
if (checkData.avg >= (1.5 * checkData.max)) {
return {
...checkData,
color: 'danger',
message: 'Memory usage at least 50% above possible maximum value: Check data for artifacts!',
impact: -2
}
} else if (checkData.avg >= (1.05 * checkData.max)) {
return {
...checkData,
color: 'warning',
message: 'Memory usage at least 5% above possible maximum value: Check data for artifacts!',
impact: -1
}
} else {
return {
...checkData,
color: 'danger',
message: 'Memory usage extremely above common levels!',
impact: 4
}
}
} else if (levelAlert <= 0 && levelCaution <= 0 && levelNormal <= 0) {
return { return {
name: fm, name: fm,
unit: unit, unit: unit,
@ -153,7 +207,7 @@
</script> </script>
<Card class="h-auto mt-1" style="min-width: {width}px;"> <Card class="h-auto mt-1" style="width: {width}px;">
{#if view === 'job'} {#if view === 'job'}
<CardHeader> <CardHeader>
<CardTitle class="mb-0 d-flex justify-content-center"> <CardTitle class="mb-0 d-flex justify-content-center">
@ -172,6 +226,10 @@
<Icon name="exclamation-triangle-fill" class="text-danger"/> <Icon name="exclamation-triangle-fill" class="text-danger"/>
{:else if fpd.impact === 2} {:else if fpd.impact === 2}
<Icon name="exclamation-triangle" class="text-warning"/> <Icon name="exclamation-triangle" class="text-warning"/>
{:else if fpd.impact === -1}
<Icon name="exclamation-triangle" class="text-warning"/>
{:else if fpd.impact === -2}
<Icon name="exclamation-triangle-fill" class="text-danger"/>
{/if} {/if}
<!-- Emoji for all states--> <!-- Emoji for all states-->
{#if fpd.impact === 4} {#if fpd.impact === 4}
@ -184,6 +242,10 @@
<Icon name="emoji-smile" class="text-success"/> <Icon name="emoji-smile" class="text-success"/>
{:else if fpd.impact === 0} {:else if fpd.impact === 0}
<Icon name="emoji-laughing" class="text-info"/> <Icon name="emoji-laughing" class="text-info"/>
{:else if fpd.impact === -1}
<Icon name="emoji-dizzy" class="text-warning"/>
{:else if fpd.impact === -2}
<Icon name="emoji-dizzy" class="text-danger"/>
{/if} {/if}
</div> </div>
<div> <div>

View File

@ -74,8 +74,10 @@
let queryMetrics = null let queryMetrics = null
$: if (showFootprint) { $: if (showFootprint) {
queryMetrics = ['cpu_load', 'flops_any', 'mem_used', 'mem_bw', ...metrics].filter(distinct) queryMetrics = ['cpu_load', 'flops_any', 'mem_used', 'mem_bw', ...metrics].filter(distinct)
scopes = ["node"]
} else { } else {
queryMetrics = [...metrics] queryMetrics = [...metrics]
scopes = [job.numNodes == 1 ? "core" : "node"]
} }
export function refresh() { export function refresh() {