mirror of
https://github.com/ClusterCockpit/cc-backend
synced 2026-02-24 11:27:30 +01:00
@@ -11,15 +11,11 @@
|
||||
"job.duration > job_min_duration_seconds"
|
||||
],
|
||||
"variables": [
|
||||
{
|
||||
"name": "memory_threshold",
|
||||
"expr": "mem_used.limits.peak * highmemoryusage_threshold_factor"
|
||||
},
|
||||
{
|
||||
"name": "memory_usage_pct",
|
||||
"expr": "mem_used.max / mem_used.limits.peak * 100.0"
|
||||
}
|
||||
],
|
||||
"rule": "mem_used.max > memory_threshold",
|
||||
"rule": "mem_used.max > mem_used.limits.alert",
|
||||
"hint": "This job used high memory: peak memory usage {{.mem_used.max}} GB ({{.memory_usage_pct}}% of {{.mem_used.limits.peak}} GB node capacity), exceeding the {{.highmemoryusage_threshold_factor}} utilization threshold. Risk of out-of-memory conditions."
|
||||
}
|
||||
|
||||
@@ -1,10 +1,7 @@
|
||||
{
|
||||
"name": "Low CPU load",
|
||||
"tag": "lowload",
|
||||
"parameters": [
|
||||
"lowcpuload_threshold_factor",
|
||||
"job_min_duration_seconds"
|
||||
],
|
||||
"parameters": ["lowcpuload_threshold_factor", "job_min_duration_seconds"],
|
||||
"metrics": ["cpu_load"],
|
||||
"requirements": [
|
||||
"job.shared == \"none\"",
|
||||
@@ -13,9 +10,9 @@
|
||||
"variables": [
|
||||
{
|
||||
"name": "load_threshold",
|
||||
"expr": "job.numCores * lowcpuload_threshold_factor"
|
||||
"expr": "cpu_load.limits.peak * lowcpuload_threshold_factor"
|
||||
}
|
||||
],
|
||||
"rule": "cpu_load.avg < load_threshold",
|
||||
"hint": "This job was detected as low CPU load: average cpu load {{.cpu_load.avg}} is below the threshold {{.load_threshold}} ({{.lowcpuload_threshold_factor}} \u00d7 {{.job.numCores}} allocated cores)."
|
||||
"hint": "This job was detected as low CPU load: average cpu load {{.cpu_load.avg}} is below the threshold {{.load_threshold}} ({{.lowcpuload_threshold_factor}} \u00d7 {{.cpu_load.limits.peak}} peak load)."
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user