Adapt for accs in shared thresholds

This commit is contained in:
Christoph Kluge 2024-03-14 10:35:14 +01:00
parent ec581e3509
commit 58415ab5c3
3 changed files with 27 additions and 15 deletions

View File

@ -20,12 +20,12 @@
const subclusterConfig = clusters.find((c) => c.name == job.cluster).subClusters.find((sc) => sc.name == job.subCluster) const subclusterConfig = clusters.find((c) => c.name == job.cluster).subClusters.find((sc) => sc.name == job.subCluster)
const footprintMetrics = (job.numAcc !== 0) const footprintMetrics = (job.numAcc !== 0)
? (job.exclusive !== 1) ? (job.exclusive !== 1) // GPU
? ['cpu_load', 'flops_any', 'acc_utilization'] ? ['acc_utilization', 'acc_mem_used', 'nv_sm_clock', 'nv_mem_util'] // Shared
: ['cpu_load', 'flops_any', 'acc_utilization', 'mem_bw'] : ['acc_utilization', 'acc_mem_used', 'nv_sm_clock', 'nv_mem_util'] // Exclusive
: (job.exclusive !== 1) : (job.exclusive !== 1) // CPU only
? ['cpu_load', 'flops_any', 'mem_used'] ? ['flops_any', 'mem_used'] // Shared
: ['cpu_load', 'flops_any', 'mem_used', 'mem_bw'] : ['cpu_load', 'flops_any', 'mem_used', 'mem_bw'] // Exclusive
const footprintData = footprintMetrics.map((fm) => { const footprintData = footprintMetrics.map((fm) => {
// Mean: Primarily use backend sourced avgs from job.*, secondarily calculate/read from metricdata // Mean: Primarily use backend sourced avgs from job.*, secondarily calculate/read from metricdata
@ -155,7 +155,13 @@
} else if (metricConfig.aggregation === 'avg' ){ } else if (metricConfig.aggregation === 'avg' ){
return defaultThresholds return defaultThresholds
} else if (metricConfig.aggregation === 'sum' ){ } else if (metricConfig.aggregation === 'sum' ){
const jobFraction = job.numHWThreads / subClusterConfig.topology.node.length let jobFraction = 0.0
if (job.numAcc > 0) {
jobFraction = job.numAcc / subClusterConfig.topology.accelerators.length
} else if (job.numHWThreads > 0) {
jobFraction = job.numHWThreads / subClusterConfig.topology.node.length
}
return { return {
peak: round((defaultThresholds.peak * jobFraction), 0), peak: round((defaultThresholds.peak * jobFraction), 0),
normal: round((defaultThresholds.normal * jobFraction), 0), normal: round((defaultThresholds.normal * jobFraction), 0),

View File

@ -163,7 +163,8 @@
subCluster={job.subCluster} subCluster={job.subCluster}
isShared={(job.exclusive != 1)} isShared={(job.exclusive != 1)}
resources={job.resources} resources={job.resources}
hwthreads={job.numHWThreads} numhwthreads={job.numHWThreads}
numaccs={job.numAcc}
/> />
{:else if metric.disabled == true && metric.data} {:else if metric.disabled == true && metric.data}
<Card body color="info">Metric disabled for subcluster <code>{metric.data.name}:{job.subCluster}</code></Card> <Card body color="info">Metric disabled for subcluster <code>{metric.data.name}:{job.subCluster}</code></Card>

View File

@ -39,7 +39,8 @@
export let subCluster export let subCluster
export let isShared = false export let isShared = false
export let forNode = false export let forNode = false
export let hwthreads = 0 export let numhwthreads = 0
export let numaccs = 0
if (useStatsSeries == null) if (useStatsSeries == null)
useStatsSeries = statisticsSeries != null useStatsSeries = statisticsSeries != null
@ -54,7 +55,7 @@
const lineWidth = clusterCockpitConfig.plot_general_lineWidth / window.devicePixelRatio const lineWidth = clusterCockpitConfig.plot_general_lineWidth / window.devicePixelRatio
const lineColors = clusterCockpitConfig.plot_general_colorscheme const lineColors = clusterCockpitConfig.plot_general_colorscheme
const backgroundColors = { normal: 'rgba(255, 255, 255, 1.0)', caution: 'rgba(255, 128, 0, 0.3)', alert: 'rgba(255, 0, 0, 0.3)' } const backgroundColors = { normal: 'rgba(255, 255, 255, 1.0)', caution: 'rgba(255, 128, 0, 0.3)', alert: 'rgba(255, 0, 0, 0.3)' }
const thresholds = findThresholds(metricConfig, scope, typeof subCluster == 'string' ? cluster.subClusters.find(sc => sc.name == subCluster) : subCluster, isShared, hwthreads) const thresholds = findThresholds(metricConfig, scope, typeof subCluster == 'string' ? cluster.subClusters.find(sc => sc.name == subCluster) : subCluster, isShared, numhwthreads, numaccs)
// converts the legend into a simple tooltip // converts the legend into a simple tooltip
function legendAsTooltipPlugin({ className, style = { backgroundColor:"rgba(255, 249, 196, 0.92)", color: "black" } } = {}) { function legendAsTooltipPlugin({ className, style = { backgroundColor:"rgba(255, 249, 196, 0.92)", color: "black" } } = {}) {
@ -381,7 +382,7 @@
} }
} }
export function findThresholds(metricConfig, scope, subCluster, isShared, hwthreads) { export function findThresholds(metricConfig, scope, subCluster, isShared, numhwthreads, numaccs) {
// console.log('NAME ' + metricConfig.name + ' / SCOPE ' + scope + ' / SUBCLUSTER ' + subCluster.name) // console.log('NAME ' + metricConfig.name + ' / SCOPE ' + scope + ' / SUBCLUSTER ' + subCluster.name)
if (!metricConfig || !scope || !subCluster) { if (!metricConfig || !scope || !subCluster) {
console.warn('Argument missing for findThresholds!') console.warn('Argument missing for findThresholds!')
@ -409,9 +410,13 @@
} }
let divisor = 1 let divisor = 1
if (isShared == true && hwthreads > 0) { // Shared if (isShared == true) { // Shared
divisor = subCluster.topology.node.length / hwthreads if (numaccs > 0) {
} else if (scope == 'socket') divisor = subCluster.topology.accelerators.length / numaccs
} else if (numhwthreads > 0) {
divisor = subCluster.topology.node.length / numhwthreads
}
else if (scope == 'socket')
divisor = subCluster.topology.socket.length divisor = subCluster.topology.socket.length
else if (scope == 'core') else if (scope == 'core')
divisor = subCluster.topology.core.length divisor = subCluster.topology.core.length
@ -419,7 +424,7 @@
divisor = subCluster.topology.accelerators.length divisor = subCluster.topology.accelerators.length
else if (scope == 'hwthread') else if (scope == 'hwthread')
divisor = subCluster.topology.node.length divisor = subCluster.topology.node.length
else { else
// console.log('TODO: how to calc thresholds for ', scope) // console.log('TODO: how to calc thresholds for ', scope)
return null return null
} }