diff --git a/configs/tagger/jobclasses/highMemoryUsage.json b/configs/tagger/jobclasses/highMemoryUsage.json index 3c10b06f..f241457d 100644 --- a/configs/tagger/jobclasses/highMemoryUsage.json +++ b/configs/tagger/jobclasses/highMemoryUsage.json @@ -11,15 +11,11 @@ "job.duration > job_min_duration_seconds" ], "variables": [ - { - "name": "memory_threshold", - "expr": "mem_used.limits.peak * highmemoryusage_threshold_factor" - }, { "name": "memory_usage_pct", "expr": "mem_used.max / mem_used.limits.peak * 100.0" } ], - "rule": "mem_used.max > memory_threshold", + "rule": "mem_used.max > mem_used.limits.alert", "hint": "This job used high memory: peak memory usage {{.mem_used.max}} GB ({{.memory_usage_pct}}% of {{.mem_used.limits.peak}} GB node capacity), exceeding the {{.highmemoryusage_threshold_factor}} utilization threshold. Risk of out-of-memory conditions." } diff --git a/configs/tagger/jobclasses/lowload.json b/configs/tagger/jobclasses/lowload.json index 7fa3ca3b..767d8f45 100644 --- a/configs/tagger/jobclasses/lowload.json +++ b/configs/tagger/jobclasses/lowload.json @@ -1,10 +1,7 @@ { "name": "Low CPU load", "tag": "lowload", - "parameters": [ - "lowcpuload_threshold_factor", - "job_min_duration_seconds" - ], + "parameters": ["lowcpuload_threshold_factor", "job_min_duration_seconds"], "metrics": ["cpu_load"], "requirements": [ "job.shared == \"none\"", @@ -13,9 +10,9 @@ "variables": [ { "name": "load_threshold", - "expr": "job.numCores * lowcpuload_threshold_factor" + "expr": "cpu_load.limits.peak * lowcpuload_threshold_factor" } ], "rule": "cpu_load.avg < load_threshold", - "hint": "This job was detected as low CPU load: average cpu load {{.cpu_load.avg}} is below the threshold {{.load_threshold}} ({{.lowcpuload_threshold_factor}} \u00d7 {{.job.numCores}} allocated cores)." + "hint": "This job was detected as low CPU load: average cpu load {{.cpu_load.avg}} is below the threshold {{.load_threshold}} ({{.lowcpuload_threshold_factor}})." }