mirror of
https://github.com/ClusterCockpit/cc-backend
synced 2026-02-24 11:27:30 +01:00
Fix and extend jobclass rules
This commit is contained in:
25
configs/tagger/jobclasses/highMemoryUsage.json
Normal file
25
configs/tagger/jobclasses/highMemoryUsage.json
Normal file
@@ -0,0 +1,25 @@
|
|||||||
|
{
|
||||||
|
"name": "High memory usage",
|
||||||
|
"tag": "highmemory",
|
||||||
|
"parameters": [
|
||||||
|
"highmemoryusage_threshold_factor",
|
||||||
|
"job_min_duration_seconds"
|
||||||
|
],
|
||||||
|
"metrics": ["mem_used"],
|
||||||
|
"requirements": [
|
||||||
|
"job.shared == \"none\"",
|
||||||
|
"job.duration > job_min_duration_seconds"
|
||||||
|
],
|
||||||
|
"variables": [
|
||||||
|
{
|
||||||
|
"name": "memory_threshold",
|
||||||
|
"expr": "mem_used.limits.peak * highmemoryusage_threshold_factor"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"name": "memory_usage_pct",
|
||||||
|
"expr": "mem_used.max / mem_used.limits.peak * 100.0"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"rule": "mem_used.max > memory_threshold",
|
||||||
|
"hint": "This job used high memory: peak memory usage {{.mem_used.max}} GB ({{.memory_usage_pct}}% of {{.mem_used.limits.peak}} GB node capacity), exceeding the {{.highmemoryusage_threshold_factor}} utilization threshold. Risk of out-of-memory conditions."
|
||||||
|
}
|
||||||
@@ -3,8 +3,7 @@
|
|||||||
"tag": "excessiveload",
|
"tag": "excessiveload",
|
||||||
"parameters": [
|
"parameters": [
|
||||||
"excessivecpuload_threshold_factor",
|
"excessivecpuload_threshold_factor",
|
||||||
"job_min_duration_seconds",
|
"job_min_duration_seconds"
|
||||||
"sampling_interval_seconds"
|
|
||||||
],
|
],
|
||||||
"metrics": ["cpu_load"],
|
"metrics": ["cpu_load"],
|
||||||
"requirements": [
|
"requirements": [
|
||||||
@@ -15,12 +14,8 @@
|
|||||||
{
|
{
|
||||||
"name": "load_threshold",
|
"name": "load_threshold",
|
||||||
"expr": "cpu_load.limits.peak * excessivecpuload_threshold_factor"
|
"expr": "cpu_load.limits.peak * excessivecpuload_threshold_factor"
|
||||||
},
|
|
||||||
{
|
|
||||||
"name": "load_perc",
|
|
||||||
"expr": "1.0 - (cpu_load.avg / cpu_load.limits.peak)"
|
|
||||||
}
|
}
|
||||||
],
|
],
|
||||||
"rule": "cpu_load.avg > load_threshold",
|
"rule": "cpu_load.avg > load_threshold",
|
||||||
"hint": "This job was detected as excessiveload because the average cpu load {{.cpu_load.avg}} falls above the threshold {{.load_threshold}}."
|
"hint": "This job was detected as having excessive CPU load: average cpu load {{.cpu_load.avg}} exceeds the oversubscription threshold {{.load_threshold}} ({{.excessivecpuload_threshold_factor}} \u00d7 {{.cpu_load.limits.peak}} peak cores), indicating CPU contention."
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -1,5 +1,5 @@
|
|||||||
{
|
{
|
||||||
"name": "Low ressource utilization",
|
"name": "Low resource utilization",
|
||||||
"tag": "lowutilization",
|
"tag": "lowutilization",
|
||||||
"parameters": ["job_min_duration_seconds"],
|
"parameters": ["job_min_duration_seconds"],
|
||||||
"metrics": ["flops_any", "mem_bw"],
|
"metrics": ["flops_any", "mem_bw"],
|
||||||
@@ -9,14 +9,14 @@
|
|||||||
],
|
],
|
||||||
"variables": [
|
"variables": [
|
||||||
{
|
{
|
||||||
"name": "mem_bw_perc",
|
"name": "mem_bw_pct",
|
||||||
"expr": "1.0 - (mem_bw.avg / mem_bw.limits.peak)"
|
"expr": "mem_bw.avg / mem_bw.limits.peak * 100.0"
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"name": "flops_any_perc",
|
"name": "flops_any_pct",
|
||||||
"expr": "1.0 - (flops_any.avg / flops_any.limits.peak)"
|
"expr": "flops_any.avg / flops_any.limits.peak * 100.0"
|
||||||
}
|
}
|
||||||
],
|
],
|
||||||
"rule": "flops_any.avg < flops_any.limits.alert && mem_bw.avg < mem_bw.limits.alert",
|
"rule": "flops_any.avg < flops_any.limits.alert && mem_bw.avg < mem_bw.limits.alert",
|
||||||
"hint": "This job was detected as low utilization because the average flop rate {{.flops_any.avg}} falls below the threshold {{.flops_any.limits.alert}}."
|
"hint": "This job shows low resource utilization: FLOP rate {{.flops_any.avg}} GF/s ({{.flops_any_pct}}% of peak) and memory bandwidth {{.mem_bw.avg}} GB/s ({{.mem_bw_pct}}% of peak) are both below their alert thresholds."
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -3,8 +3,7 @@
|
|||||||
"tag": "lowload",
|
"tag": "lowload",
|
||||||
"parameters": [
|
"parameters": [
|
||||||
"lowcpuload_threshold_factor",
|
"lowcpuload_threshold_factor",
|
||||||
"job_min_duration_seconds",
|
"job_min_duration_seconds"
|
||||||
"sampling_interval_seconds"
|
|
||||||
],
|
],
|
||||||
"metrics": ["cpu_load"],
|
"metrics": ["cpu_load"],
|
||||||
"requirements": [
|
"requirements": [
|
||||||
@@ -15,12 +14,8 @@
|
|||||||
{
|
{
|
||||||
"name": "load_threshold",
|
"name": "load_threshold",
|
||||||
"expr": "job.numCores * lowcpuload_threshold_factor"
|
"expr": "job.numCores * lowcpuload_threshold_factor"
|
||||||
},
|
|
||||||
{
|
|
||||||
"name": "load_perc",
|
|
||||||
"expr": "1.0 - (cpu_load.avg / cpu_load.limits.peak)"
|
|
||||||
}
|
}
|
||||||
],
|
],
|
||||||
"rule": "cpu_load.avg < cpu_load.limits.caution",
|
"rule": "cpu_load.avg < load_threshold",
|
||||||
"hint": "This job was detected as lowload because the average cpu load {{.cpu_load}} falls below the threshold {{.cpu_load.limits.caution}}."
|
"hint": "This job was detected as having low CPU load: average cpu load {{.cpu_load.avg}} is below the threshold {{.load_threshold}} ({{.lowcpuload_threshold_factor}} \u00d7 {{.job.numCores}} allocated cores)."
|
||||||
}
|
}
|
||||||
|
|||||||
22
configs/tagger/jobclasses/memoryBound.json
Normal file
22
configs/tagger/jobclasses/memoryBound.json
Normal file
@@ -0,0 +1,22 @@
|
|||||||
|
{
|
||||||
|
"name": "Memory bandwidth bound",
|
||||||
|
"tag": "memorybound",
|
||||||
|
"parameters": ["membound_bw_threshold_factor", "job_min_duration_seconds"],
|
||||||
|
"metrics": ["mem_bw"],
|
||||||
|
"requirements": [
|
||||||
|
"job.shared == \"none\"",
|
||||||
|
"job.duration > job_min_duration_seconds"
|
||||||
|
],
|
||||||
|
"variables": [
|
||||||
|
{
|
||||||
|
"name": "mem_bw_threshold",
|
||||||
|
"expr": "mem_bw.limits.peak * membound_bw_threshold_factor"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"name": "mem_bw_pct",
|
||||||
|
"expr": "mem_bw.avg / mem_bw.limits.peak * 100.0"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"rule": "mem_bw.avg > mem_bw_threshold",
|
||||||
|
"hint": "This job is memory bandwidth bound: memory bandwidth {{.mem_bw.avg}} GB/s ({{.mem_bw_pct}}% of peak) exceeds {{.membound_bw_threshold_factor}} \u00d7 peak bandwidth. Consider improving data reuse or compute intensity."
|
||||||
|
}
|
||||||
@@ -1,11 +1,12 @@
|
|||||||
{
|
{
|
||||||
"lowcpuload_threshold_factor": 0.9,
|
"lowcpuload_threshold_factor": 0.85,
|
||||||
"excessivecpuload_threshold_factor": 1.1,
|
"excessivecpuload_threshold_factor": 1.2,
|
||||||
"highmemoryusage_threshold_factor": 0.9,
|
"highmemoryusage_threshold_factor": 0.9,
|
||||||
"node_load_imbalance_threshold_factor": 0.1,
|
"node_load_imbalance_threshold_factor": 0.1,
|
||||||
"core_load_imbalance_threshold_factor": 0.1,
|
"core_load_imbalance_threshold_factor": 0.1,
|
||||||
"high_memory_load_threshold_factor": 0.9,
|
"high_memory_load_threshold_factor": 0.9,
|
||||||
"lowgpuload_threshold_factor": 0.7,
|
"lowgpuload_threshold_factor": 0.7,
|
||||||
|
"membound_bw_threshold_factor": 0.8,
|
||||||
"memory_leak_slope_threshold": 0.1,
|
"memory_leak_slope_threshold": 0.1,
|
||||||
"job_min_duration_seconds": 600.0,
|
"job_min_duration_seconds": 600.0,
|
||||||
"sampling_interval_seconds": 30.0,
|
"sampling_interval_seconds": 30.0,
|
||||||
|
|||||||
Reference in New Issue
Block a user