Revert interface from jobMeta to job type. Extend job classifier tagger. Clean up test rules.

This commit is contained in:
2025-05-23 07:48:27 +02:00
parent ca634bb707
commit 733e3ea9d5
11 changed files with 202 additions and 158 deletions

View File

@@ -12,27 +12,22 @@
],
"requirements": [
"job.exclusive == 1",
"job.duration > job_min_duration_seconds",
"required_metrics_min_samples > job_min_duration_seconds / sampling_interval_seconds"
"job.duration > job_min_duration_seconds"
],
"terms": [
{
"name": "",
"load_mean": "cpu_load[cpu_load_pre_cutoff_samples].mean('all')"
},
{
"load_threshold": "(job.numHwthreads/job.numNodes) * excessivecpuload_threshold_factor"
"name": "load_threshold",
"expr": "(job.numHwthreads/job.numNodes) * excessivecpuload_threshold_factor"
},
{
"highload_nodes": "load_mean > load_threshold"
},
{
"highload": "highload_nodes.any('all')"
},
{
"load_perc": "load_mean / load_threshold"
"name": "load_perc",
"expr": "load_mean / load_threshold"
}
],
"output": "highload",
"output_scalar": "load_perc",
"template": "Job ({{ job.jobId }})\nThis job was detected as excessiveload because the mean cpu load {{ load_mean.array }} falls above the threshold {{ load_threshold }}."
"rule": "cpu_load > load_threshold",
"hint": "This job was detected as excessiveload because the average cpu load {{ cpu_load }} falls above the threshold {{ load_threshold }}."
}

View File

@@ -1,40 +0,0 @@
{
  "name": "High memory usage",
  "tag": "high_memory_load",
  "parameters": [
    "high_memory_load_threshold_factor",
    "job_min_duration_seconds",
    "sampling_interval_seconds"
  ],
  "metrics": [
    "mem_used"
  ],
  "requirements": [
    "job.duration > job_min_duration_seconds",
    "required_metrics_min_samples > job_min_duration_seconds / sampling_interval_seconds",
    "hasattr(job, \"allocated_memory\")"
  ],
  "terms": [
    {
      "memory_alloc": "job.allocated_memory"
    },
    {
      "memory_used": "mem_used.max('time')"
    },
    {
      "load_threshold": "memory_alloc * high_memory_load_threshold_factor"
    },
    {
      "high_mem_nodes": "memory_used > load_threshold"
    },
    {
      "high_mem": "high_mem_nodes.any('all')"
    },
    {
      "load_perc": "memory_used / load_threshold"
    }
  ],
  "output": "high_mem",
  "output_scalar": "load_perc",
  "template": "Job ({{ job.jobId }})\nThis job was detected as high_memory_load because the memory usage {{ memory_used.array }} falls above the threshold {{ load_threshold }}."
}

View File

@@ -1,36 +0,0 @@
{
  "name": "Low GPU load",
  "tag": "lowgpuload",
  "parameters": [
    "lowgpuload_threshold_factor",
    "job_min_duration_seconds",
    "sampling_interval_seconds"
  ],
  "metrics": ["nv_util"],
  "requirements": [
    "job.duration > job_min_duration_seconds",
    "required_metrics_min_samples > job_min_duration_seconds / sampling_interval_seconds"
  ],
  "terms": [
    { "load_mean": "nv_util.mean('all')" },
    { "load_threshold": "job.numAcc * lowgpuload_threshold_factor" },
    { "lowload_nodes": "load_mean < load_threshold" },
    { "lowload": "lowload_nodes.any('all')" },
    { "load_perc": "1.0 - (load_mean / load_threshold)" }
  ],
  "output": "lowload",
  "output_scalar": "load_perc",
  "template": "Job ({{ job.jobId }})\nThis job was detected as lowgpuload because the mean gpu load {{ load_mean }} falls below the threshold {{ load_threshold }}."
}

View File

@@ -11,28 +11,18 @@
],
"requirements": [
"job.exclusive == 1",
"job.duration > job_min_duration_seconds",
"required_metrics_min_samples > job_min_duration_seconds / sampling_interval_seconds"
"job.duration > job_min_duration_seconds"
],
"tagRule": [
"variables": [
{
"load_mean": "cpu_load[cpu_load_pre_cutoff_samples:].mean('all')"
"name": "load_threshold",
"expr": "job.numHwthreads * lowcpuload_threshold_factor"
},
{
"load_threshold": "job.numHwthreads * lowcpuload_threshold_factor"
},
{
"lowload_nodes": "load_mean < load_threshold"
},
{
"lowload": "lowload_nodes.any('all')"
},
{
"load_perc": "1.0 - (load_mean / load_threshold)"
"name": "load_perc",
"expr": "1.0 - (cpu_load / load_threshold)"
}
],
"valueRule": [],
"output": "lowload",
"output_scalar": "load_perc",
"hint": "Job ({{ job.jobId }})\nThis job was detected as lowload because the mean cpu load {{ load_mean }} falls below the threshold {{ load_threshold }}."
"rule": "cpu_load < load_threshold",
"hint": "This job was detected as lowload because the average cpu load {{ cpu_load }} falls below the threshold {{ load_threshold }}."
}

View File

@@ -0,0 +1,14 @@
{
  "lowcpuload_threshold_factor": 0.9,
  "excessivecpuload_threshold_factor": 1.1,
  "highmemoryusage_threshold_factor": 0.9,
  "node_load_imbalance_threshold_factor": 0.1,
  "core_load_imbalance_threshold_factor": 0.1,
  "high_memory_load_threshold_factor": 0.9,
  "lowgpuload_threshold_factor": 0.7,
  "memory_leak_slope_threshold": 0.1,
  "job_min_duration_seconds": 600.0,
  "sampling_interval_seconds": 30.0,
  "cpu_load_pre_cutoff_samples": 11,
  "cpu_load_core_pre_cutoff_samples": 6
}