mirror of
https://github.com/ClusterCockpit/cc-backend
synced 2025-07-24 21:26:08 +02:00
Refactor taggers. Refine Job Hooks. Start job classifier
This commit is contained in:
38
internal/tagger/jobclasses/highload.json
Normal file
38
internal/tagger/jobclasses/highload.json
Normal file
@@ -0,0 +1,38 @@
|
||||
{
|
||||
"name": "Excessive CPU load",
|
||||
"tag": "excessiveload",
|
||||
"comment": "Assumptions: all nodes have the same number of cores.",
|
||||
"parameters": [
|
||||
"excessivecpuload_threshold_factor",
|
||||
"job_min_duration_seconds",
|
||||
"sampling_interval_seconds"
|
||||
],
|
||||
"metrics": [
|
||||
"cpu_load"
|
||||
],
|
||||
"requirements": [
|
||||
"job.exclusive == 1",
|
||||
"job.duration > job_min_duration_seconds",
|
||||
"required_metrics_min_samples > job_min_duration_seconds / sampling_interval_seconds"
|
||||
],
|
||||
"terms": [
|
||||
{
|
||||
"load_mean": "cpu_load[cpu_load_pre_cutoff_samples:].mean('all')"
|
||||
},
|
||||
{
|
||||
"load_threshold": "(job.numHwthreads/job.numNodes) * excessivecpuload_threshold_factor"
|
||||
},
|
||||
{
|
||||
"highload_nodes": "load_mean > load_threshold"
|
||||
},
|
||||
{
|
||||
"highload": "highload_nodes.any('all')"
|
||||
},
|
||||
{
|
||||
"load_perc": "load_mean / load_threshold"
|
||||
}
|
||||
],
|
||||
"output": "highload",
|
||||
"output_scalar": "load_perc",
|
||||
"template": "Job ({{ job.jobId }})\nThis job was detected as excessiveload because the mean cpu load {{ load_mean.array }} falls above the threshold {{ load_threshold }}."
|
||||
}
|
40
internal/tagger/jobclasses/highmem.json
Normal file
40
internal/tagger/jobclasses/highmem.json
Normal file
@@ -0,0 +1,40 @@
|
||||
{
|
||||
"name": "High memory usage",
|
||||
"tag": "high_memory_load",
|
||||
"parameters": [
|
||||
"high_memory_load_threshold_factor",
|
||||
"job_min_duration_seconds",
|
||||
"sampling_interval_seconds"
|
||||
],
|
||||
"metrics": [
|
||||
"mem_used"
|
||||
],
|
||||
"requirements": [
|
||||
"job.duration > job_min_duration_seconds",
|
||||
"required_metrics_min_samples > job_min_duration_seconds / sampling_interval_seconds",
|
||||
"hasattr(job, \"allocated_memory\")"
|
||||
],
|
||||
"terms": [
|
||||
{
|
||||
"memory_alloc": "job.allocated_memory"
|
||||
},
|
||||
{
|
||||
"memory_used": "mem_used.max('time')"
|
||||
},
|
||||
{
|
||||
"load_threshold": "memory_alloc * high_memory_load_threshold_factor"
|
||||
},
|
||||
{
|
||||
"high_mem_nodes": "memory_used > load_threshold"
|
||||
},
|
||||
{
|
||||
"high_mem": "high_mem_nodes.any('all')"
|
||||
},
|
||||
{
|
||||
"load_perc": "memory_used / (memory_alloc * high_memory_load_threshold_factor)"
|
||||
}
|
||||
],
|
||||
"output": "high_mem",
|
||||
"output_scalar": "load_perc",
|
||||
"template": "Job ({{ job.jobId }})\nThis job was detected as high_memory_load because the memory usage {{ memory_used.array }} falls above the threshold {{ load_threshold }}."
|
||||
}
|
36
internal/tagger/jobclasses/lowgpuload.json
Normal file
36
internal/tagger/jobclasses/lowgpuload.json
Normal file
@@ -0,0 +1,36 @@
|
||||
{
|
||||
"name": "Low GPU load",
|
||||
"tag": "lowgpuload",
|
||||
"parameters": [
|
||||
"lowgpuload_threshold_factor",
|
||||
"job_min_duration_seconds",
|
||||
"sampling_interval_seconds"
|
||||
],
|
||||
"metrics": [
|
||||
"nv_util"
|
||||
],
|
||||
"requirements": [
|
||||
"job.duration > job_min_duration_seconds",
|
||||
"required_metrics_min_samples > job_min_duration_seconds / sampling_interval_seconds"
|
||||
],
|
||||
"terms": [
|
||||
{
|
||||
"load_mean": "nv_util.mean('all')"
|
||||
},
|
||||
{
|
||||
"load_threshold": "job.numAcc * lowgpuload_threshold_factor"
|
||||
},
|
||||
{
|
||||
"lowload_nodes": "load_mean < load_threshold"
|
||||
},
|
||||
{
|
||||
"lowload": "lowload_nodes.any('all')"
|
||||
},
|
||||
{
|
||||
"load_perc": "1.0 - (load_mean / load_threshold)"
|
||||
}
|
||||
],
|
||||
"output": "lowload",
|
||||
"output_scalar": "load_perc",
|
||||
"template": "Job ({{ job.jobId }})\nThis job was detected as lowgpuload because the mean gpu load {{ load_mean }} falls below the threshold {{ load_threshold }}."
|
||||
}
|
38
internal/tagger/jobclasses/lowload.json
Normal file
38
internal/tagger/jobclasses/lowload.json
Normal file
@@ -0,0 +1,38 @@
|
||||
{
|
||||
"name": "Low CPU load",
|
||||
"tag": "lowload",
|
||||
"parameters": [
|
||||
"lowcpuload_threshold_factor",
|
||||
"job_min_duration_seconds",
|
||||
"sampling_interval_seconds"
|
||||
],
|
||||
"metrics": [
|
||||
"cpu_load"
|
||||
],
|
||||
"requirements": [
|
||||
"job.exclusive == 1",
|
||||
"job.duration > job_min_duration_seconds",
|
||||
"required_metrics_min_samples > job_min_duration_seconds / sampling_interval_seconds"
|
||||
],
|
||||
"tagRule": [
|
||||
{
|
||||
"load_mean": "cpu_load[cpu_load_pre_cutoff_samples:].mean('all')"
|
||||
},
|
||||
{
|
||||
"load_threshold": "(job.numHwthreads/job.numNodes) * lowcpuload_threshold_factor"
|
||||
},
|
||||
{
|
||||
"lowload_nodes": "load_mean < load_threshold"
|
||||
},
|
||||
{
|
||||
"lowload": "lowload_nodes.any('all')"
|
||||
},
|
||||
{
|
||||
"load_perc": "1.0 - (load_mean / load_threshold)"
|
||||
}
|
||||
],
|
||||
"valueRule": [],
|
||||
"output": "lowload",
|
||||
"output_scalar": "load_perc",
|
||||
"hint": "Job ({{ job.jobId }})\nThis job was detected as lowload because the mean cpu load {{ load_mean }} falls below the threshold {{ load_threshold }}."
|
||||
}
|
Reference in New Issue
Block a user