Fix bugs in job classifier and tagger infrastructure

This commit is contained in:
2025-05-26 13:08:03 +02:00
parent 3c66840f95
commit f14bdb3068
8 changed files with 105 additions and 47 deletions

View File

@@ -7,27 +7,21 @@
 "job_min_duration_seconds",
 "sampling_interval_seconds"
 ],
-"metrics": [
-"cpu_load"
-],
+"metrics": ["cpu_load"],
 "requirements": [
 "job.exclusive == 1",
 "job.duration > job_min_duration_seconds"
 ],
-"terms": [
-{
-"name": "",
-"load_mean": "cpu_load[cpu_load_pre_cutoff_samples].mean('all')"
-},
+"variables": [
 {
 "name": "load_threshold",
-"expr": "(job.numHwthreads/job.numNodes) * excessivecpuload_threshold_factor"
+"expr": "(job.numCores / job.numNodes) * excessivecpuload_threshold_factor"
 },
 {
 "name": "load_perc",
-"expr": "load_mean / load_threshold"
+"expr": "cpu_load / load_threshold"
 }
 ],
 "rule": "cpu_load > load_threshold",
-"hint": "This job was detected as excessiveload because the average cpu load {{ cpu_load }} falls above the threshold {{ load_threshold }}."
+"hint": "This job was detected as excessiveload because the average cpu load {{.cpu_load}} falls above the threshold {{.load_threshold}}."
 }

View File

@@ -6,9 +6,7 @@
 "job_min_duration_seconds",
 "sampling_interval_seconds"
 ],
-"metrics": [
-"cpu_load"
-],
+"metrics": ["cpu_load"],
 "requirements": [
 "job.exclusive == 1",
 "job.duration > job_min_duration_seconds"
@@ -16,7 +14,7 @@
 "variables": [
 {
 "name": "load_threshold",
-"expr": "job.numHwthreads * lowcpuload_threshold_factor"
+"expr": "job.numCores * lowcpuload_threshold_factor"
 },
 {
 "name": "load_perc",
@@ -24,5 +22,5 @@
 }
 ],
 "rule": "cpu_load < load_threshold",
-"hint": "This job was detected as lowload because the average cpu load {{ cpu_load }} falls below the threshold {{ load_threshold }}."
+"hint": "This job was detected as lowload because the average cpu load {{.cpu_load}} falls below the threshold {{.load_threshold}}."
 }