mirror of
https://github.com/ClusterCockpit/cc-backend
synced 2024-12-26 05:19:05 +01:00
fix: fix footprint logic, do not scale thresholds on multi node jobs
This commit is contained in:
parent
b0c0d15505
commit
ab07c7928f
@ -23,6 +23,25 @@
|
|||||||
alert: metricConfig.alert
|
alert: metricConfig.alert
|
||||||
};
|
};
|
||||||
|
|
||||||
|
/*
|
||||||
|
NEW: Footprints should be comparable: Always use Unchanged Single Node Thresholds, except for shared jobs.
|
||||||
|
HW Clocks, HW Temperatures and File/Net IO Thresholds will be scaled down too, even if they are independent.
|
||||||
|
'jf.stats' is one of: avg, min, max -> Always relative to one node's thresholds as configured.
|
||||||
|
*/
|
||||||
|
if (job.exclusive === 1) {
|
||||||
|
return defaultThresholds
|
||||||
|
} else {
|
||||||
|
const topol = getContext("getHardwareTopology")(job.cluster, job.subCluster)
|
||||||
|
const jobFraction = job.numHWThreads / topol.node.length;
|
||||||
|
return {
|
||||||
|
peak: round(defaultThresholds.peak * jobFraction, 0),
|
||||||
|
normal: round(defaultThresholds.normal * jobFraction, 0),
|
||||||
|
caution: round(defaultThresholds.caution * jobFraction, 0),
|
||||||
|
alert: round(defaultThresholds.alert * jobFraction, 0),
|
||||||
|
};
|
||||||
|
}
|
||||||
|
|
||||||
|
/* OLD: Based on Metric Aggregation Setting
|
||||||
// Job_Exclusivity does not matter, only aggregation
|
// Job_Exclusivity does not matter, only aggregation
|
||||||
if (metricConfig.aggregation === "avg") {
|
if (metricConfig.aggregation === "avg") {
|
||||||
return defaultThresholds;
|
return defaultThresholds;
|
||||||
@ -43,6 +62,7 @@
|
|||||||
);
|
);
|
||||||
return defaultThresholds;
|
return defaultThresholds;
|
||||||
}
|
}
|
||||||
|
*/
|
||||||
}
|
}
|
||||||
</script>
|
</script>
|
||||||
|
|
||||||
@ -136,25 +156,25 @@
|
|||||||
return a.impact - b.impact || ((a.name > b.name) ? 1 : ((b.name > a.name) ? -1 : 0));
|
return a.impact - b.impact || ((a.name > b.name) ? 1 : ((b.name > a.name) ? -1 : 0));
|
||||||
});;
|
});;
|
||||||
|
|
||||||
function evalFootprint(mean, thresholds, lowerIsBetter, level) {
|
function evalFootprint(value, thresholds, lowerIsBetter, level) {
|
||||||
// Handle Metrics in which less value is better
|
// Handle Metrics in which less value is better
|
||||||
switch (level) {
|
switch (level) {
|
||||||
case "peak":
|
case "peak":
|
||||||
if (lowerIsBetter)
|
if (lowerIsBetter)
|
||||||
return false; // metric over peak -> return false to trigger impact -1
|
return false; // metric over peak -> return false to trigger impact -1
|
||||||
else return mean <= thresholds.peak && mean > thresholds.normal;
|
else return value <= thresholds.peak && value > thresholds.normal;
|
||||||
case "alert":
|
case "alert":
|
||||||
if (lowerIsBetter)
|
if (lowerIsBetter)
|
||||||
return mean <= thresholds.peak && mean >= thresholds.alert;
|
return value <= thresholds.peak && value >= thresholds.alert;
|
||||||
else return mean <= thresholds.alert && mean >= 0;
|
else return value <= thresholds.alert && value >= 0;
|
||||||
case "caution":
|
case "caution":
|
||||||
if (lowerIsBetter)
|
if (lowerIsBetter)
|
||||||
return mean < thresholds.alert && mean >= thresholds.caution;
|
return value < thresholds.alert && value >= thresholds.caution;
|
||||||
else return mean <= thresholds.caution && mean > thresholds.alert;
|
else return value <= thresholds.caution && value > thresholds.alert;
|
||||||
case "normal":
|
case "normal":
|
||||||
if (lowerIsBetter)
|
if (lowerIsBetter)
|
||||||
return mean < thresholds.caution && mean >= 0;
|
return value < thresholds.caution && value >= 0;
|
||||||
else return mean <= thresholds.normal && mean > thresholds.caution;
|
else return value <= thresholds.normal && value > thresholds.caution;
|
||||||
default:
|
default:
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
@ -23,6 +23,25 @@
|
|||||||
alert: metricConfig.alert
|
alert: metricConfig.alert
|
||||||
};
|
};
|
||||||
|
|
||||||
|
/*
|
||||||
|
NEW: Footprints should be comparable: Always use Unchanged Single Node Thresholds, except for shared jobs.
|
||||||
|
HW Clocks, HW Temperatures and File/Net IO Thresholds will be scaled down too, even if they are independent.
|
||||||
|
'jf.stats' is one of: avg, min, max -> Always relative to one node's thresholds as configured.
|
||||||
|
*/
|
||||||
|
if (job.exclusive === 1) {
|
||||||
|
return defaultThresholds
|
||||||
|
} else {
|
||||||
|
const topol = getContext("getHardwareTopology")(job.cluster, job.subCluster)
|
||||||
|
const jobFraction = job.numHWThreads / topol.node.length;
|
||||||
|
return {
|
||||||
|
peak: round(defaultThresholds.peak * jobFraction, 0),
|
||||||
|
normal: round(defaultThresholds.normal * jobFraction, 0),
|
||||||
|
caution: round(defaultThresholds.caution * jobFraction, 0),
|
||||||
|
alert: round(defaultThresholds.alert * jobFraction, 0),
|
||||||
|
};
|
||||||
|
}
|
||||||
|
|
||||||
|
/* OLD: Based on Metric Aggregation Setting
|
||||||
// Job_Exclusivity does not matter, only aggregation
|
// Job_Exclusivity does not matter, only aggregation
|
||||||
if (metricConfig.aggregation === "avg") {
|
if (metricConfig.aggregation === "avg") {
|
||||||
return defaultThresholds;
|
return defaultThresholds;
|
||||||
@ -43,6 +62,7 @@
|
|||||||
);
|
);
|
||||||
return defaultThresholds;
|
return defaultThresholds;
|
||||||
}
|
}
|
||||||
|
*/
|
||||||
}
|
}
|
||||||
</script>
|
</script>
|
||||||
|
|
||||||
@ -142,25 +162,25 @@
|
|||||||
return a.impact - b.impact || ((a.name > b.name) ? 1 : ((b.name > a.name) ? -1 : 0));
|
return a.impact - b.impact || ((a.name > b.name) ? 1 : ((b.name > a.name) ? -1 : 0));
|
||||||
});;
|
});;
|
||||||
|
|
||||||
function evalFootprint(mean, thresholds, lowerIsBetter, level) {
|
function evalFootprint(value, thresholds, lowerIsBetter, level) {
|
||||||
// Handle Metrics in which less value is better
|
// Handle Metrics in which less value is better
|
||||||
switch (level) {
|
switch (level) {
|
||||||
case "peak":
|
case "peak":
|
||||||
if (lowerIsBetter)
|
if (lowerIsBetter)
|
||||||
return false; // metric over peak -> return false to trigger impact -1
|
return false; // metric over peak -> return false to trigger impact -1
|
||||||
else return mean <= thresholds.peak && mean > thresholds.normal;
|
else return value <= thresholds.peak && value > thresholds.normal;
|
||||||
case "alert":
|
case "alert":
|
||||||
if (lowerIsBetter)
|
if (lowerIsBetter)
|
||||||
return mean <= thresholds.peak && mean >= thresholds.alert;
|
return value <= thresholds.peak && value >= thresholds.alert;
|
||||||
else return mean <= thresholds.alert && mean >= 0;
|
else return value <= thresholds.alert && value >= 0;
|
||||||
case "caution":
|
case "caution":
|
||||||
if (lowerIsBetter)
|
if (lowerIsBetter)
|
||||||
return mean < thresholds.alert && mean >= thresholds.caution;
|
return value < thresholds.alert && value >= thresholds.caution;
|
||||||
else return mean <= thresholds.caution && mean > thresholds.alert;
|
else return value <= thresholds.caution && value > thresholds.alert;
|
||||||
case "normal":
|
case "normal":
|
||||||
if (lowerIsBetter)
|
if (lowerIsBetter)
|
||||||
return mean < thresholds.caution && mean >= 0;
|
return value < thresholds.caution && value >= 0;
|
||||||
else return mean <= thresholds.normal && mean > thresholds.caution;
|
else return value <= thresholds.normal && value > thresholds.caution;
|
||||||
default:
|
default:
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
Loading…
Reference in New Issue
Block a user