mirror of
https://github.com/ClusterCockpit/cc-backend
synced 2024-12-26 13:29:05 +01:00
Handle accelerated and shared jobs
This commit is contained in:
parent
f7529be3ea
commit
4e375ff32b
@ -20,7 +20,7 @@
|
|||||||
const isAcceleratedJob = (job.numAcc !== 0)
|
const isAcceleratedJob = (job.numAcc !== 0)
|
||||||
const isSharedJob = (job.exclusive !== 1)
|
const isSharedJob = (job.exclusive !== 1)
|
||||||
|
|
||||||
// console.log('JOB', job)
|
console.log('JOB', job)
|
||||||
console.log('ACCELERATED?', isAcceleratedJob)
|
console.log('ACCELERATED?', isAcceleratedJob)
|
||||||
console.log('SHARED?', isSharedJob)
|
console.log('SHARED?', isSharedJob)
|
||||||
|
|
||||||
@ -34,12 +34,15 @@
|
|||||||
> For now: 'acc_util' gegen 'mem_used' für alex
|
> For now: 'acc_util' gegen 'mem_used' für alex
|
||||||
- Energy Metric Missing, muss eingebaut werden
|
- Energy Metric Missing, muss eingebaut werden
|
||||||
- Diese Config in config.json?
|
- Diese Config in config.json?
|
||||||
- Erste 5 / letzte 5 pts für avg auslassen? (Wenn Minimallänge erreicht?) // Peak limited => Hier eigentlich nicht mein Problem, ich zeige nur Daten an, die geliefert werden
|
|
||||||
*/
|
*/
|
||||||
|
|
||||||
const footprintMetrics = isAcceleratedJob ?
|
const footprintMetrics = isAcceleratedJob
|
||||||
['cpu_load', 'flops_any', 'acc_utilization', 'mem_bw'] :
|
? isSharedJob
|
||||||
['cpu_load', 'flops_any', 'mem_used', 'mem_bw']
|
? ['cpu_load', 'flops_any', 'acc_utilization']
|
||||||
|
: ['cpu_load', 'flops_any', 'acc_utilization', 'mem_bw']
|
||||||
|
: isSharedJob
|
||||||
|
? ['cpu_load', 'flops_any', 'mem_used']
|
||||||
|
: ['cpu_load', 'flops_any', 'mem_used', 'mem_bw']
|
||||||
|
|
||||||
console.log('JMs', jobMetrics.filter((jm) => footprintMetrics.includes(jm.name)))
|
console.log('JMs', jobMetrics.filter((jm) => footprintMetrics.includes(jm.name)))
|
||||||
|
|
||||||
@ -60,9 +63,12 @@
|
|||||||
// ... get Mean
|
// ... get Mean
|
||||||
let mv = null
|
let mv = null
|
||||||
if (jm?.metric?.statisticsSeries) {
|
if (jm?.metric?.statisticsSeries) {
|
||||||
mv = round(mean(jm.metric.statisticsSeries.mean), 2) // see above
|
mv = round(mean(jm.metric.statisticsSeries.mean), 2)
|
||||||
} else if (jm?.metric?.series[0]) {
|
} else if (jm?.metric?.series?.length > 1) {
|
||||||
mv = jm.metric.series[0].statistics.avg // see above
|
const avgs = jm.metric.series.map(jms => jms.statistics.avg)
|
||||||
|
mv = round(mean(avgs), 2)
|
||||||
|
} else {
|
||||||
|
mv = jm.metric.series[0].statistics.avg
|
||||||
}
|
}
|
||||||
// ... get Unit
|
// ... get Unit
|
||||||
let unit = null
|
let unit = null
|
||||||
@ -238,15 +244,11 @@
|
|||||||
return null
|
return null
|
||||||
}
|
}
|
||||||
|
|
||||||
if (job.numHWThreads == subClusterConfig.topology.node.length || // Job uses all available HWTs of one node
|
let subclusterThresholds = metricConfig.subClusters.find(sc => sc.name == subClusterConfig.name)
|
||||||
job.numAcc == subClusterConfig.topology.accelerators.length || // Job uses all available GPUs of one node
|
if (job.exclusive === 1) { // Exclusive: Use as defined
|
||||||
metricConfig.aggregation == 'avg' ){ // Metric uses "average" aggregation method
|
console.log('Job is exclusive: Use as defined')
|
||||||
|
|
||||||
console.log('Job uses all available Resources of one node OR uses "average" aggregation method, use unscaled thresholds')
|
|
||||||
|
|
||||||
let subclusterThresholds = metricConfig.subClusters.find(sc => sc.name == subClusterConfig.name)
|
|
||||||
if (subclusterThresholds) {
|
if (subclusterThresholds) {
|
||||||
console.log('subClusterThresholds found, use subCluster specific thresholds:', subclusterThresholds)
|
console.log('subClusterThresholds found: use subCluster specific thresholds', subclusterThresholds)
|
||||||
return {
|
return {
|
||||||
peak: subclusterThresholds.peak,
|
peak: subclusterThresholds.peak,
|
||||||
normal: subclusterThresholds.normal,
|
normal: subclusterThresholds.normal,
|
||||||
@ -254,32 +256,47 @@
|
|||||||
alert: subclusterThresholds.alert
|
alert: subclusterThresholds.alert
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
return {
|
return {
|
||||||
peak: metricConfig.peak,
|
peak: metricConfig.peak,
|
||||||
normal: metricConfig.normal,
|
normal: metricConfig.normal,
|
||||||
caution: metricConfig.caution,
|
caution: metricConfig.caution,
|
||||||
alert: metricConfig.alert
|
alert: metricConfig.alert
|
||||||
}
|
}
|
||||||
}
|
} else { // Shared
|
||||||
|
if (metricConfig.aggregation === 'avg' ){
|
||||||
|
console.log('metric uses "average" aggregation method: use unscaled thresholds except if cpu_load')
|
||||||
|
if (subclusterThresholds) {
|
||||||
|
console.log('subClusterThresholds found: use subCluster specific thresholds', subclusterThresholds)
|
||||||
|
console.log('PEAK/NORMAL USED', metricConfig.name === 'cpu_load' ? job.numHWThreads : subclusterThresholds.peak)
|
||||||
|
return { // If 'cpu_load': Peak/Normal === #HWThreads, keep other thresholds
|
||||||
|
peak: metricConfig.name === 'cpu_load' ? job.numHWThreads : subclusterThresholds.peak,
|
||||||
|
normal: metricConfig.name === 'cpu_load' ? job.numHWThreads : subclusterThresholds.normal,
|
||||||
|
caution: subclusterThresholds.caution,
|
||||||
|
alert: subclusterThresholds.alert
|
||||||
|
}
|
||||||
|
}
|
||||||
|
console.log('PEAK/NORMAL USED', metricConfig.name === 'cpu_load' ? job.numHWThreads : metricConfig.peak)
|
||||||
|
return {
|
||||||
|
peak: metricConfig.name === 'cpu_load' ? job.numHWThreads : metricConfig.peak,
|
||||||
|
normal: metricConfig.name === 'cpu_load' ? job.numHWThreads : metricConfig.normal,
|
||||||
|
caution: metricConfig.caution,
|
||||||
|
alert: metricConfig.alert
|
||||||
|
}
|
||||||
|
} else if (metricConfig.aggregation === 'sum' ){
|
||||||
|
const jobFraction = job.numHWThreads / subClusterConfig.topology.node.length
|
||||||
|
console.log('Fraction', jobFraction)
|
||||||
|
|
||||||
if (metricConfig.aggregation != 'sum') {
|
return {
|
||||||
console.warn('Missing or unkown aggregation mode (sum/avg) for metric:', metricConfig)
|
peak: round((metricConfig.peak * jobFraction), 0),
|
||||||
return null
|
normal: round((metricConfig.normal * jobFraction), 0),
|
||||||
}
|
caution: round((metricConfig.caution * jobFraction), 0),
|
||||||
|
alert: round((metricConfig.alert * jobFraction), 0)
|
||||||
/* Adapt based on numAccs? */
|
}
|
||||||
const jobFraction = job.numHWThreads / subClusterConfig.topology.node.length
|
} else {
|
||||||
//const fractionAcc = job.numAcc / subClusterConfig.topology.accelerators.length
|
console.warn('Missing or unkown aggregation mode (sum/avg) for metric:', metricConfig)
|
||||||
|
return null
|
||||||
console.log('Fraction', jobFraction)
|
}
|
||||||
|
} // Other job.exclusive cases?
|
||||||
return {
|
|
||||||
peak: round((metricConfig.peak * jobFraction), 0),
|
|
||||||
normal: round((metricConfig.normal * jobFraction), 0),
|
|
||||||
caution: round((metricConfig.caution * jobFraction), 0),
|
|
||||||
alert: round((metricConfig.alert * jobFraction), 0)
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
</script>
|
</script>
|
||||||
|
|
||||||
|
@ -72,7 +72,7 @@
|
|||||||
|
|
||||||
let queryMetrics = null
|
let queryMetrics = null
|
||||||
$: if (showFootprint) {
|
$: if (showFootprint) {
|
||||||
queryMetrics = ['cpu_load', 'flops_any', 'mem_used', 'mem_bw', ...metrics].filter(distinct)
|
queryMetrics = ['cpu_load', 'flops_any', 'mem_used', 'mem_bw', 'acc_utilization', ...metrics].filter(distinct)
|
||||||
scopes = ["node"]
|
scopes = ["node"]
|
||||||
} else {
|
} else {
|
||||||
queryMetrics = [...metrics]
|
queryMetrics = [...metrics]
|
||||||
|
Loading…
Reference in New Issue
Block a user