Fix and extend jobclass rules

This commit is contained in:
2026-02-22 13:27:51 +01:00
parent 86fbecc679
commit c9d8de0d56
6 changed files with 61 additions and 23 deletions

View File

@@ -0,0 +1,25 @@
{
"name": "High memory usage",
"tag": "highmemory",
"parameters": [
"highmemoryusage_threshold_factor",
"job_min_duration_seconds"
],
"metrics": ["mem_used"],
"requirements": [
"job.shared == \"none\"",
"job.duration > job_min_duration_seconds"
],
"variables": [
{
"name": "memory_threshold",
"expr": "mem_used.limits.peak * highmemoryusage_threshold_factor"
},
{
"name": "memory_usage_pct",
"expr": "mem_used.max / mem_used.limits.peak * 100.0"
}
],
"rule": "mem_used.max > memory_threshold",
"hint": "This job used high memory: peak memory usage {{.mem_used.max}} GB ({{.memory_usage_pct}}% of {{.mem_used.limits.peak}} GB node capacity), exceeding the {{.highmemoryusage_threshold_factor}} utilization threshold. Risk of out-of-memory conditions."
}

View File

@@ -3,8 +3,7 @@
"tag": "excessiveload", "tag": "excessiveload",
"parameters": [ "parameters": [
"excessivecpuload_threshold_factor", "excessivecpuload_threshold_factor",
"job_min_duration_seconds", "job_min_duration_seconds"
"sampling_interval_seconds"
], ],
"metrics": ["cpu_load"], "metrics": ["cpu_load"],
"requirements": [ "requirements": [
@@ -15,12 +14,8 @@
{ {
"name": "load_threshold", "name": "load_threshold",
"expr": "cpu_load.limits.peak * excessivecpuload_threshold_factor" "expr": "cpu_load.limits.peak * excessivecpuload_threshold_factor"
},
{
"name": "load_perc",
"expr": "1.0 - (cpu_load.avg / cpu_load.limits.peak)"
} }
], ],
"rule": "cpu_load.avg > load_threshold", "rule": "cpu_load.avg > load_threshold",
"hint": "This job was detected as excessiveload because the average cpu load {{.cpu_load.avg}} falls above the threshold {{.load_threshold}}." "hint": "This job was detected as having excessive CPU load: average cpu load {{.cpu_load.avg}} exceeds the oversubscription threshold {{.load_threshold}} ({{.excessivecpuload_threshold_factor}} \u00d7 {{.cpu_load.limits.peak}} peak cores), indicating CPU contention."
} }

View File

@@ -1,5 +1,5 @@
{ {
"name": "Low ressource utilization", "name": "Low resource utilization",
"tag": "lowutilization", "tag": "lowutilization",
"parameters": ["job_min_duration_seconds"], "parameters": ["job_min_duration_seconds"],
"metrics": ["flops_any", "mem_bw"], "metrics": ["flops_any", "mem_bw"],
@@ -9,14 +9,14 @@
], ],
"variables": [ "variables": [
{ {
"name": "mem_bw_perc", "name": "mem_bw_pct",
"expr": "1.0 - (mem_bw.avg / mem_bw.limits.peak)" "expr": "mem_bw.avg / mem_bw.limits.peak * 100.0"
}, },
{ {
"name": "flops_any_perc", "name": "flops_any_pct",
"expr": "1.0 - (flops_any.avg / flops_any.limits.peak)" "expr": "flops_any.avg / flops_any.limits.peak * 100.0"
} }
], ],
"rule": "flops_any.avg < flops_any.limits.alert && mem_bw.avg < mem_bw.limits.alert", "rule": "flops_any.avg < flops_any.limits.alert && mem_bw.avg < mem_bw.limits.alert",
"hint": "This job was detected as low utilization because the average flop rate {{.flops_any.avg}} falls below the threshold {{.flops_any.limits.alert}}." "hint": "This job shows low resource utilization: FLOP rate {{.flops_any.avg}} GF/s ({{.flops_any_pct}}% of peak) and memory bandwidth {{.mem_bw.avg}} GB/s ({{.mem_bw_pct}}% of peak) are both below their alert thresholds."
} }

View File

@@ -3,8 +3,7 @@
"tag": "lowload", "tag": "lowload",
"parameters": [ "parameters": [
"lowcpuload_threshold_factor", "lowcpuload_threshold_factor",
"job_min_duration_seconds", "job_min_duration_seconds"
"sampling_interval_seconds"
], ],
"metrics": ["cpu_load"], "metrics": ["cpu_load"],
"requirements": [ "requirements": [
@@ -15,12 +14,8 @@
{ {
"name": "load_threshold", "name": "load_threshold",
"expr": "job.numCores * lowcpuload_threshold_factor" "expr": "job.numCores * lowcpuload_threshold_factor"
},
{
"name": "load_perc",
"expr": "1.0 - (cpu_load.avg / cpu_load.limits.peak)"
} }
], ],
"rule": "cpu_load.avg < cpu_load.limits.caution", "rule": "cpu_load.avg < load_threshold",
"hint": "This job was detected as lowload because the average cpu load {{.cpu_load}} falls below the threshold {{.cpu_load.limits.caution}}." "hint": "This job was detected as low CPU load: average cpu load {{.cpu_load.avg}} is below the threshold {{.load_threshold}} ({{.lowcpuload_threshold_factor}} \u00d7 {{.job.numCores}} allocated cores)."
} }

View File

@@ -0,0 +1,22 @@
{
"name": "Memory bandwidth bound",
"tag": "memorybound",
"parameters": ["membound_bw_threshold_factor", "job_min_duration_seconds"],
"metrics": ["mem_bw"],
"requirements": [
"job.shared == \"none\"",
"job.duration > job_min_duration_seconds"
],
"variables": [
{
"name": "mem_bw_threshold",
"expr": "mem_bw.limits.peak * membound_bw_threshold_factor"
},
{
"name": "mem_bw_pct",
"expr": "mem_bw.avg / mem_bw.limits.peak * 100.0"
}
],
"rule": "mem_bw.avg > mem_bw_threshold",
"hint": "This job is memory bandwidth bound: memory bandwidth {{.mem_bw.avg}} GB/s ({{.mem_bw_pct}}% of peak) is within {{.membound_bw_threshold_factor}} of peak bandwidth. Consider improving data reuse or compute intensity."
}

View File

@@ -1,11 +1,12 @@
{ {
"lowcpuload_threshold_factor": 0.9, "lowcpuload_threshold_factor": 0.85,
"excessivecpuload_threshold_factor": 1.1, "excessivecpuload_threshold_factor": 1.2,
"highmemoryusage_threshold_factor": 0.9, "highmemoryusage_threshold_factor": 0.9,
"node_load_imbalance_threshold_factor": 0.1, "node_load_imbalance_threshold_factor": 0.1,
"core_load_imbalance_threshold_factor": 0.1, "core_load_imbalance_threshold_factor": 0.1,
"high_memory_load_threshold_factor": 0.9, "high_memory_load_threshold_factor": 0.9,
"lowgpuload_threshold_factor": 0.7, "lowgpuload_threshold_factor": 0.7,
"membound_bw_threshold_factor": 0.8,
"memory_leak_slope_threshold": 0.1, "memory_leak_slope_threshold": 0.1,
"job_min_duration_seconds": 600.0, "job_min_duration_seconds": 600.0,
"sampling_interval_seconds": 30.0, "sampling_interval_seconds": 30.0,