diff --git a/configs/tagger/jobclasses/highMemoryUsage.json b/configs/tagger/jobclasses/highMemoryUsage.json new file mode 100644 index 00000000..3c10b06f --- /dev/null +++ b/configs/tagger/jobclasses/highMemoryUsage.json @@ -0,0 +1,25 @@ +{ + "name": "High memory usage", + "tag": "highmemory", + "parameters": [ + "highmemoryusage_threshold_factor", + "job_min_duration_seconds" + ], + "metrics": ["mem_used"], + "requirements": [ + "job.shared == \"none\"", + "job.duration > job_min_duration_seconds" + ], + "variables": [ + { + "name": "memory_threshold", + "expr": "mem_used.limits.peak * highmemoryusage_threshold_factor" + }, + { + "name": "memory_usage_pct", + "expr": "mem_used.max / mem_used.limits.peak * 100.0" + } + ], + "rule": "mem_used.max > memory_threshold", + "hint": "This job used high memory: peak memory usage {{.mem_used.max}} GB ({{.memory_usage_pct}}% of {{.mem_used.limits.peak}} GB node capacity), exceeding the {{.highmemoryusage_threshold_factor}} utilization threshold. Risk of out-of-memory conditions." +} diff --git a/configs/tagger/jobclasses/highload.json b/configs/tagger/jobclasses/highload.json index 9667011b..a442a3ac 100644 --- a/configs/tagger/jobclasses/highload.json +++ b/configs/tagger/jobclasses/highload.json @@ -3,8 +3,7 @@ "tag": "excessiveload", "parameters": [ "excessivecpuload_threshold_factor", - "job_min_duration_seconds", - "sampling_interval_seconds" + "job_min_duration_seconds" ], "metrics": ["cpu_load"], "requirements": [ @@ -15,12 +14,8 @@ { "name": "load_threshold", "expr": "cpu_load.limits.peak * excessivecpuload_threshold_factor" - }, - { - "name": "load_perc", - "expr": "1.0 - (cpu_load.avg / cpu_load.limits.peak)" } ], "rule": "cpu_load.avg > load_threshold", - "hint": "This job was detected as excessiveload because the average cpu load {{.cpu_load.avg}} falls above the threshold {{.load_threshold}}." 
+ "hint": "This job was detected as having excessive CPU load: average cpu load {{.cpu_load.avg}} exceeds the oversubscription threshold {{.load_threshold}} ({{.excessivecpuload_threshold_factor}} \u00d7 {{.cpu_load.limits.peak}} peak cores), indicating CPU contention." } diff --git a/configs/tagger/jobclasses/lowUtilization.json b/configs/tagger/jobclasses/lowUtilization.json index e84b81da..1d365150 100644 --- a/configs/tagger/jobclasses/lowUtilization.json +++ b/configs/tagger/jobclasses/lowUtilization.json @@ -1,5 +1,5 @@ { - "name": "Low ressource utilization", + "name": "Low resource utilization", "tag": "lowutilization", "parameters": ["job_min_duration_seconds"], "metrics": ["flops_any", "mem_bw"], @@ -9,14 +9,14 @@ ], "variables": [ { - "name": "mem_bw_perc", - "expr": "1.0 - (mem_bw.avg / mem_bw.limits.peak)" + "name": "mem_bw_pct", + "expr": "mem_bw.avg / mem_bw.limits.peak * 100.0" }, { - "name": "flops_any_perc", - "expr": "1.0 - (flops_any.avg / flops_any.limits.peak)" + "name": "flops_any_pct", + "expr": "flops_any.avg / flops_any.limits.peak * 100.0" } ], "rule": "flops_any.avg < flops_any.limits.alert && mem_bw.avg < mem_bw.limits.alert", - "hint": "This job was detected as low utilization because the average flop rate {{.flops_any.avg}} falls below the threshold {{.flops_any.limits.alert}}." + "hint": "This job shows low resource utilization: FLOP rate {{.flops_any.avg}} GF/s ({{.flops_any_pct}}% of peak) and memory bandwidth {{.mem_bw.avg}} GB/s ({{.mem_bw_pct}}% of peak) are both below their alert thresholds." 
} diff --git a/configs/tagger/jobclasses/lowload.json b/configs/tagger/jobclasses/lowload.json index f952da59..7fa3ca3b 100644 --- a/configs/tagger/jobclasses/lowload.json +++ b/configs/tagger/jobclasses/lowload.json @@ -3,8 +3,7 @@ "tag": "lowload", "parameters": [ "lowcpuload_threshold_factor", - "job_min_duration_seconds", - "sampling_interval_seconds" + "job_min_duration_seconds" ], "metrics": ["cpu_load"], "requirements": [ @@ -15,12 +14,8 @@ { "name": "load_threshold", "expr": "job.numCores * lowcpuload_threshold_factor" - }, - { - "name": "load_perc", - "expr": "1.0 - (cpu_load.avg / cpu_load.limits.peak)" } ], - "rule": "cpu_load.avg < cpu_load.limits.caution", - "hint": "This job was detected as lowload because the average cpu load {{.cpu_load}} falls below the threshold {{.cpu_load.limits.caution}}." + "rule": "cpu_load.avg < load_threshold", + "hint": "This job was detected as having low CPU load: average cpu load {{.cpu_load.avg}} is below the threshold {{.load_threshold}} ({{.lowcpuload_threshold_factor}} \u00d7 {{.job.numCores}} allocated cores)." } diff --git a/configs/tagger/jobclasses/memoryBound.json b/configs/tagger/jobclasses/memoryBound.json new file mode 100644 index 00000000..01368c08 --- /dev/null +++ b/configs/tagger/jobclasses/memoryBound.json @@ -0,0 +1,22 @@ +{ + "name": "Memory bandwidth bound", + "tag": "memorybound", + "parameters": ["membound_bw_threshold_factor", "job_min_duration_seconds"], + "metrics": ["mem_bw"], + "requirements": [ + "job.shared == \"none\"", + "job.duration > job_min_duration_seconds" + ], + "variables": [ + { + "name": "mem_bw_threshold", + "expr": "mem_bw.limits.peak * membound_bw_threshold_factor" + }, + { + "name": "mem_bw_pct", + "expr": "mem_bw.avg / mem_bw.limits.peak * 100.0" + } + ], + "rule": "mem_bw.avg > mem_bw_threshold", + "hint": "This job is memory bandwidth bound: memory bandwidth {{.mem_bw.avg}} GB/s ({{.mem_bw_pct}}% of peak) exceeds {{.membound_bw_threshold_factor}} \u00d7 peak bandwidth. 
Consider improving data reuse or compute intensity." +} diff --git a/configs/tagger/jobclasses/parameters.json b/configs/tagger/jobclasses/parameters.json index 39e94c1c..c3fb5cdc 100644 --- a/configs/tagger/jobclasses/parameters.json +++ b/configs/tagger/jobclasses/parameters.json @@ -1,11 +1,12 @@ { - "lowcpuload_threshold_factor": 0.9, - "excessivecpuload_threshold_factor": 1.1, + "lowcpuload_threshold_factor": 0.85, + "excessivecpuload_threshold_factor": 1.2, "highmemoryusage_threshold_factor": 0.9, "node_load_imbalance_threshold_factor": 0.1, "core_load_imbalance_threshold_factor": 0.1, "high_memory_load_threshold_factor": 0.9, "lowgpuload_threshold_factor": 0.7, + "membound_bw_threshold_factor": 0.8, "memory_leak_slope_threshold": 0.1, "job_min_duration_seconds": 600.0, "sampling_interval_seconds": 30.0,