Refactor taggers. Refine Job Hooks. Start job classifier

This commit is contained in:
2025-05-22 07:10:41 +02:00
parent 9abc206d1a
commit ca634bb707
14 changed files with 316 additions and 61 deletions

View File

@@ -0,0 +1,38 @@
{
  "name": "Excessive CPU load",
  "tag": "excessiveload",
  "comment": "Assumptions: all nodes have the same number of cores.",
  "parameters": [
    "excessivecpuload_threshold_factor",
    "job_min_duration_seconds",
    "sampling_interval_seconds"
  ],
  "metrics": [
    "cpu_load"
  ],
  "requirements": [
    "job.exclusive == 1",
    "job.duration > job_min_duration_seconds",
    "required_metrics_min_samples > job_min_duration_seconds / sampling_interval_seconds"
  ],
  "terms": [
    {
      "load_mean": "cpu_load[cpu_load_pre_cutoff_samples:].mean('all')"
    },
    {
      "load_threshold": "(job.numHwthreads/job.numNodes) * excessivecpuload_threshold_factor"
    },
    {
      "highload_nodes": "load_mean > load_threshold"
    },
    {
      "highload": "highload_nodes.any('all')"
    },
    {
      "load_perc": "load_mean / load_threshold"
    }
  ],
  "output": "highload",
  "output_scalar": "load_perc",
  "template": "Job ({{ job.jobId }})\nThis job was detected as excessiveload because the mean cpu load {{ load_mean.array }} falls above the threshold {{ load_threshold }}."
}

View File

@@ -0,0 +1,40 @@
{
  "name": "High memory usage",
  "tag": "high_memory_load",
  "parameters": [
    "high_memory_load_threshold_factor",
    "job_min_duration_seconds",
    "sampling_interval_seconds"
  ],
  "metrics": [
    "mem_used"
  ],
  "requirements": [
    "job.duration > job_min_duration_seconds",
    "required_metrics_min_samples > job_min_duration_seconds / sampling_interval_seconds",
    "hasattr(job, \"allocated_memory\")"
  ],
  "terms": [
    {
      "memory_alloc": "job.allocated_memory"
    },
    {
      "memory_used": "mem_used.max('time')"
    },
    {
      "load_threshold": "memory_alloc * high_memory_load_threshold_factor"
    },
    {
      "high_mem_nodes": "memory_used > load_threshold"
    },
    {
      "high_mem": "high_mem_nodes.any('all')"
    },
    {
      "load_perc": "memory_used / load_threshold"
    }
  ],
  "output": "high_mem",
  "output_scalar": "load_perc",
  "template": "Job ({{ job.jobId }})\nThis job was detected as high_memory_load because the memory usage {{ memory_used.array }} falls above the threshold {{ load_threshold }}."
}

View File

@@ -0,0 +1,36 @@
{
  "name": "Low GPU load",
  "tag": "lowgpuload",
  "parameters": [
    "lowgpuload_threshold_factor",
    "job_min_duration_seconds",
    "sampling_interval_seconds"
  ],
  "metrics": ["nv_util"],
  "requirements": [
    "job.duration > job_min_duration_seconds",
    "required_metrics_min_samples > job_min_duration_seconds / sampling_interval_seconds"
  ],
  "terms": [
    { "load_mean": "nv_util.mean('all')" },
    { "load_threshold": "job.numAcc * lowgpuload_threshold_factor" },
    { "lowload_nodes": "load_mean < load_threshold" },
    { "lowload": "lowload_nodes.any('all')" },
    { "load_perc": "1.0 - (load_mean / load_threshold)" }
  ],
  "output": "lowload",
  "output_scalar": "load_perc",
  "template": "Job ({{ job.jobId }})\nThis job was detected as lowgpuload because the mean gpu load {{ load_mean }} falls below the threshold {{ load_threshold }}."
}

View File

@@ -0,0 +1,38 @@
{
  "name": "Low CPU load",
  "tag": "lowload",
  "parameters": [
    "lowcpuload_threshold_factor",
    "job_min_duration_seconds",
    "sampling_interval_seconds"
  ],
  "metrics": [
    "cpu_load"
  ],
  "requirements": [
    "job.exclusive == 1",
    "job.duration > job_min_duration_seconds",
    "required_metrics_min_samples > job_min_duration_seconds / sampling_interval_seconds"
  ],
  "tagRule": [
    {
      "load_mean": "cpu_load[cpu_load_pre_cutoff_samples:].mean('all')"
    },
    {
      "load_threshold": "(job.numHwthreads/job.numNodes) * lowcpuload_threshold_factor"
    },
    {
      "lowload_nodes": "load_mean < load_threshold"
    },
    {
      "lowload": "lowload_nodes.any('all')"
    },
    {
      "load_perc": "1.0 - (load_mean / load_threshold)"
    }
  ],
  "valueRule": [],
  "output": "lowload",
  "output_scalar": "load_perc",
  "hint": "Job ({{ job.jobId }})\nThis job was detected as lowload because the mean cpu load {{ load_mean }} falls below the threshold {{ load_threshold }}."
}