Adapt for accs in shared thresholds

This commit is contained in:
Christoph Kluge 2024-03-14 10:35:14 +01:00
parent ec581e3509
commit 58415ab5c3
3 changed files with 27 additions and 15 deletions

View File

@ -20,12 +20,12 @@
const subclusterConfig = clusters.find((c) => c.name == job.cluster).subClusters.find((sc) => sc.name == job.subCluster)
const footprintMetrics = (job.numAcc !== 0)
? (job.exclusive !== 1)
? ['cpu_load', 'flops_any', 'acc_utilization']
: ['cpu_load', 'flops_any', 'acc_utilization', 'mem_bw']
: (job.exclusive !== 1)
? ['cpu_load', 'flops_any', 'mem_used']
: ['cpu_load', 'flops_any', 'mem_used', 'mem_bw']
? (job.exclusive !== 1) // GPU
? ['acc_utilization', 'acc_mem_used', 'nv_sm_clock', 'nv_mem_util'] // Shared
: ['acc_utilization', 'acc_mem_used', 'nv_sm_clock', 'nv_mem_util'] // Exclusive
: (job.exclusive !== 1) // CPU only
? ['flops_any', 'mem_used'] // Shared
: ['cpu_load', 'flops_any', 'mem_used', 'mem_bw'] // Exclusive
const footprintData = footprintMetrics.map((fm) => {
// Mean: Primarily use backend sourced avgs from job.*, secondarily calculate/read from metricdata
@ -155,7 +155,13 @@
} else if (metricConfig.aggregation === 'avg' ){
return defaultThresholds
} else if (metricConfig.aggregation === 'sum' ){
const jobFraction = job.numHWThreads / subClusterConfig.topology.node.length
let jobFraction = 0.0
if (job.numAcc > 0) {
jobFraction = job.numAcc / subClusterConfig.topology.accelerators.length
} else if (job.numHWThreads > 0) {
jobFraction = job.numHWThreads / subClusterConfig.topology.node.length
}
return {
peak: round((defaultThresholds.peak * jobFraction), 0),
normal: round((defaultThresholds.normal * jobFraction), 0),

View File

@ -163,7 +163,8 @@
subCluster={job.subCluster}
isShared={(job.exclusive != 1)}
resources={job.resources}
hwthreads={job.numHWThreads}
numhwthreads={job.numHWThreads}
numaccs={job.numAcc}
/>
{:else if metric.disabled == true && metric.data}
<Card body color="info">Metric disabled for subcluster <code>{metric.data.name}:{job.subCluster}</code></Card>

View File

@ -39,7 +39,8 @@
export let subCluster
export let isShared = false
export let forNode = false
export let hwthreads = 0
export let numhwthreads = 0
export let numaccs = 0
if (useStatsSeries == null)
useStatsSeries = statisticsSeries != null
@ -54,7 +55,7 @@
const lineWidth = clusterCockpitConfig.plot_general_lineWidth / window.devicePixelRatio
const lineColors = clusterCockpitConfig.plot_general_colorscheme
const backgroundColors = { normal: 'rgba(255, 255, 255, 1.0)', caution: 'rgba(255, 128, 0, 0.3)', alert: 'rgba(255, 0, 0, 0.3)' }
const thresholds = findThresholds(metricConfig, scope, typeof subCluster == 'string' ? cluster.subClusters.find(sc => sc.name == subCluster) : subCluster, isShared, hwthreads)
const thresholds = findThresholds(metricConfig, scope, typeof subCluster == 'string' ? cluster.subClusters.find(sc => sc.name == subCluster) : subCluster, isShared, numhwthreads, numaccs)
// converts the legend into a simple tooltip
function legendAsTooltipPlugin({ className, style = { backgroundColor:"rgba(255, 249, 196, 0.92)", color: "black" } } = {}) {
@ -381,7 +382,7 @@
}
}
export function findThresholds(metricConfig, scope, subCluster, isShared, hwthreads) {
export function findThresholds(metricConfig, scope, subCluster, isShared, numhwthreads, numaccs) {
// console.log('NAME ' + metricConfig.name + ' / SCOPE ' + scope + ' / SUBCLUSTER ' + subCluster.name)
if (!metricConfig || !scope || !subCluster) {
console.warn('Argument missing for findThresholds!')
@ -409,9 +410,13 @@
}
let divisor = 1
if (isShared == true && hwthreads > 0) { // Shared
divisor = subCluster.topology.node.length / hwthreads
} else if (scope == 'socket')
if (isShared == true) { // Shared
if (numaccs > 0) {
divisor = subCluster.topology.accelerators.length / numaccs
} else if (numhwthreads > 0) {
divisor = subCluster.topology.node.length / numhwthreads
}
else if (scope == 'socket')
divisor = subCluster.topology.socket.length
else if (scope == 'core')
divisor = subCluster.topology.core.length
@ -419,7 +424,7 @@
divisor = subCluster.topology.accelerators.length
else if (scope == 'hwthread')
divisor = subCluster.topology.node.length
else {
else
// console.log('TODO: how to calc thresholds for ', scope)
return null
}