Merge pull request #251 from ClusterCockpit/hotfix

Fix accelerator ID display bugs and footprint aggregation
Jan Eitzinger 2024-03-25 10:35:48 +01:00 committed by GitHub
commit ddd3fad1c6
2 changed files with 62 additions and 55 deletions


@@ -205,7 +205,7 @@ func (ccms *CCMetricStore) LoadData(
jobData[metric][scope] = jobMetric
}
- for _, res := range row {
+ for ndx, res := range row {
if res.Error != nil {
/* Build list for "partial errors", if any */
errors = append(errors, fmt.Sprintf("failed to fetch '%s' from host '%s': %s", query.Metric, query.Hostname, *res.Error))
@@ -215,7 +215,7 @@ func (ccms *CCMetricStore) LoadData(
id := (*string)(nil)
if query.Type != nil {
id = new(string)
- *id = query.TypeIds[0]
+ *id = query.TypeIds[ndx]
}
if res.Avg.IsNaN() || res.Min.IsNaN() || res.Max.IsNaN() {
@@ -312,6 +312,11 @@ func (ccms *CCMetricStore) buildQueries(
// Accelerator -> Accelerator (Use "accelerator" scope if requested scope is lower than node)
if nativeScope == schema.MetricScopeAccelerator && scope.LT(schema.MetricScopeNode) {
+ if scope != schema.MetricScopeAccelerator {
+ // Skip all other caught sub-node scopes
+ continue
+ }
queries = append(queries, ApiQuery{
Metric: remoteName,
Hostname: host.Hostname,


@@ -23,38 +23,25 @@
: metricConfig.alert,
};
- if (job.exclusive === 1) {
- // Exclusive: Use as defined
+ // Job_Exclusivity does not matter, only aggregation
+ if (metricConfig.aggregation === "avg") {
return defaultThresholds;
+ } else if (metricConfig.aggregation === "sum") {
+ const jobFraction =
+ job.numHWThreads / subClusterConfig.topology.node.length;
+ return {
+ peak: round(defaultThresholds.peak * jobFraction, 0),
+ normal: round(defaultThresholds.normal * jobFraction, 0),
+ caution: round(defaultThresholds.caution * jobFraction, 0),
+ alert: round(defaultThresholds.alert * jobFraction, 0),
+ };
} else {
- // Shared: Handle specifically
- if (metricConfig.name === "cpu_load") {
- // Special: Avg Aggregation BUT scaled based on #hwthreads
- return {
- peak: job.numHWThreads,
- normal: job.numHWThreads,
- caution: defaultThresholds.caution,
- alert: defaultThresholds.alert,
- };
- } else if (metricConfig.aggregation === "avg") {
- return defaultThresholds;
- } else if (metricConfig.aggregation === "sum") {
- const jobFraction =
- job.numHWThreads / subClusterConfig.topology.node.length;
- return {
- peak: round(defaultThresholds.peak * jobFraction, 0),
- normal: round(defaultThresholds.normal * jobFraction, 0),
- caution: round(defaultThresholds.caution * jobFraction, 0),
- alert: round(defaultThresholds.alert * jobFraction, 0),
- };
- } else {
- console.warn(
- "Missing or unknown aggregation mode (sum/avg) for metric:",
- metricConfig,
- );
- return null;
- }
- } // Other job.exclusive cases?
+ console.warn(
+ "Missing or unknown aggregation mode (sum/avg) for metric:",
+ metricConfig,
+ );
+ return defaultThresholds;
+ }
}
</script>
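
In the reworked findJobThresholds, thresholds no longer depend on job.exclusive, only on the metric's aggregation mode: "avg" metrics keep the sub-cluster defaults, while "sum" metrics are scaled by the fraction of the node the job occupies. A standalone JavaScript sketch with made-up numbers, assuming subClusterConfig.topology.node holds one entry per hardware thread so its length is the node's HW-thread count; round() here is a local helper, not the one imported by the component.

const round = (v, d = 0) => Number(v.toFixed(d));

const defaultThresholds = { peak: 350, normal: 300, caution: 200, alert: 100 };
const job = { numHWThreads: 18 };
const subClusterConfig = { topology: { node: Array(72).fill(0) } }; // 72 HW threads per node

const jobFraction = job.numHWThreads / subClusterConfig.topology.node.length; // 0.25
const scaled = {
  peak: round(defaultThresholds.peak * jobFraction, 0),       // 88
  normal: round(defaultThresholds.normal * jobFraction, 0),   // 75
  caution: round(defaultThresholds.caution * jobFraction, 0), // 50
  alert: round(defaultThresholds.alert * jobFraction, 0),     // 25
};
console.log(scaled); // { peak: 88, normal: 75, caution: 50, alert: 25 }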
@@ -91,29 +78,6 @@
: ["cpu_load", "flops_any", "mem_used", "mem_bw"]; // Exclusive
const footprintData = footprintMetrics.map((fm) => {
- // Mean: Primarily use backend sourced avgs from job.*, secondarily calculate/read from metricdata
- let mv = null;
- if (fm === "cpu_load" && job.loadAvg !== 0) {
- mv = round(job.loadAvg, 2);
- } else if (fm === "flops_any" && job.flopsAnyAvg !== 0) {
- mv = round(job.flopsAnyAvg, 2);
- } else if (fm === "mem_bw" && job.memBwAvg !== 0) {
- mv = round(job.memBwAvg, 2);
- } else {
- // Calculate from jobMetrics
- const jm = jobMetrics.find((jm) => jm.name === fm && jm.scope === "node");
- if (jm?.metric?.statisticsSeries) {
- mv = round(mean(jm.metric.statisticsSeries.mean), 2);
- } else if (jm?.metric?.series?.length > 1) {
- const avgs = jm.metric.series.map((jms) => jms.statistics.avg);
- mv = round(mean(avgs), 2);
- } else if (jm?.metric?.series) {
- mv = round(jm.metric.series[0].statistics.avg, 2);
- } else {
- mv = 0.0;
- }
- }
// Unit
const fmc = getContext("metrics")(job.cluster, fm);
let unit = "";
@@ -123,6 +87,44 @@
const fmt = findJobThresholds(job, fmc, subclusterConfig);
if (fm === "flops_any") fmt.peak = round(fmt.peak * 0.85, 0);
+ // Value: Primarily use backend sourced avgs from job.*, secondarily calculate/read from metricdata
+ // Exclusivity does not matter
+ let mv = 0.0;
+ if (fmc.aggregation === "avg") {
+ if (fm === "cpu_load" && job.loadAvg !== 0) {
+ mv = round(job.loadAvg, 2);
+ } else if (fm === "flops_any" && job.flopsAnyAvg !== 0) {
+ mv = round(job.flopsAnyAvg, 2);
+ } else if (fm === "mem_bw" && job.memBwAvg !== 0) {
+ mv = round(job.memBwAvg, 2);
+ } else {
+ // Calculate Avg from jobMetrics
+ const jm = jobMetrics.find((jm) => jm.name === fm && jm.scope === "node");
+ if (jm?.metric?.statisticsSeries) {
+ mv = round(mean(jm.metric.statisticsSeries.mean), 2);
+ } else if (jm?.metric?.series?.length > 1) {
+ const avgs = jm.metric.series.map((jms) => jms.statistics.avg);
+ mv = round(mean(avgs), 2);
+ } else if (jm?.metric?.series) {
+ mv = round(jm.metric.series[0].statistics.avg, 2);
+ }
+ }
+ } else if (fmc.aggregation === "sum") {
+ // Calculate Sum from jobMetrics: Sum all node averages
+ const jm = jobMetrics.find((jm) => jm.name === fm && jm.scope === "node");
+ if (jm?.metric?.series?.length > 1) { // More than 1 node
+ const avgs = jm.metric.series.map((jms) => jms.statistics.avg);
+ mv = round(avgs.reduce((a, b) => a + b, 0));
+ } else if (jm?.metric?.series) {
+ mv = round(jm.metric.series[0].statistics.avg, 2);
+ }
+ } else {
+ console.warn(
+ "Missing or unknown aggregation mode (sum/avg) for metric:",
+ metricConfig,
+ );
+ }
// Define basic data
const fmBase = {
name: fm,
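
The footprint value computation is now aggregation-aware as well: for "avg" metrics the backend-provided job averages (loadAvg, flopsAnyAvg, memBwAvg) are preferred, with a fallback to the mean of the node-level series; for "sum" metrics the per-node averages are added up. A compact JavaScript sketch with invented data, showing the two reductions over the same node series; round() and mean() are local helpers standing in for the component's imports.

const round = (v, d = 0) => Number(v.toFixed(d));
const mean = (xs) => xs.reduce((a, b) => a + b, 0) / xs.length;

// A two-node job; each node-scope series carries its own average.
const jm = {
  name: "mem_bw",
  scope: "node",
  metric: {
    series: [
      { statistics: { avg: 78.4 } },
      { statistics: { avg: 81.6 } },
    ],
  },
};

const avgs = jm.metric.series.map((s) => s.statistics.avg);
const asAvg = round(mean(avgs), 2);                   // 80  (used when aggregation === "avg")
const asSum = round(avgs.reduce((a, b) => a + b, 0)); // 160 (used when aggregation === "sum")
console.log({ asAvg, asSum });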