mirror of
https://github.com/ClusterCockpit/cc-backend
synced 2025-07-24 21:26:08 +02:00
Revert interface from jobMeta to job type. Extend job classifier tagger. Cleanup test rules.
This commit is contained in:
@@ -12,27 +12,22 @@
|
||||
],
|
||||
"requirements": [
|
||||
"job.exclusive == 1",
|
||||
"job.duration > job_min_duration_seconds",
|
||||
"required_metrics_min_samples > job_min_duration_seconds / sampling_interval_seconds"
|
||||
"job.duration > job_min_duration_seconds"
|
||||
],
|
||||
"terms": [
|
||||
{
|
||||
"name": "",
|
||||
"load_mean": "cpu_load[cpu_load_pre_cutoff_samples].mean('all')"
|
||||
},
|
||||
{
|
||||
"load_threshold": "(job.numHwthreads/job.numNodes) * excessivecpuload_threshold_factor"
|
||||
"name": "load_threshold",
|
||||
"expr": "(job.numHwthreads/job.numNodes) * excessivecpuload_threshold_factor"
|
||||
},
|
||||
{
|
||||
"highload_nodes": "load_mean > load_threshold"
|
||||
},
|
||||
{
|
||||
"highload": "highload_nodes.any('all')"
|
||||
},
|
||||
{
|
||||
"load_perc": "load_mean / load_threshold"
|
||||
"name": "load_perc",
|
||||
"expr": "load_mean / load_threshold"
|
||||
}
|
||||
],
|
||||
"output": "highload",
|
||||
"output_scalar": "load_perc",
|
||||
"template": "Job ({{ job.jobId }})\nThis job was detected as excessiveload because the mean cpu load {{ load_mean.array }} falls above the threshold {{ load_threshold }}."
|
||||
"rule": "cpu_load > load_threshold",
|
||||
"hint": "This job was detected as excessiveload because the average cpu load {{ cpu_load }} falls above the threshold {{ load_threshold }}."
|
||||
}
|
||||
|
@@ -1,40 +0,0 @@
|
||||
{
|
||||
"name": "High memory usage",
|
||||
"tag": "high_memory_load",
|
||||
"parameters": [
|
||||
"high_memory_load_threshold_factor",
|
||||
"job_min_duration_seconds",
|
||||
"sampling_interval_seconds"
|
||||
],
|
||||
"metrics": [
|
||||
"mem_used"
|
||||
],
|
||||
"requirements": [
|
||||
"job.duration > job_min_duration_seconds",
|
||||
"required_metrics_min_samples > job_min_duration_seconds / sampling_interval_seconds",
|
||||
"hasattr(job, \"allocated_memory\")"
|
||||
],
|
||||
"terms": [
|
||||
{
|
||||
"memory_alloc": "job.allocated_memory"
|
||||
},
|
||||
{
|
||||
"memory_used": "mem_used.max('time')"
|
||||
},
|
||||
{
|
||||
"load_threshold": "memory_alloc * high_memory_load_threshold_factor"
|
||||
},
|
||||
{
|
||||
"high_mem_nodes": "memory_used > load_threshold"
|
||||
},
|
||||
{
|
||||
"high_mem": "high_mem_nodes.any('all')"
|
||||
},
|
||||
{
|
||||
"load_perc": "memory_used / (memory_alloc * high_memory_load_threshold_factor)"
|
||||
}
|
||||
],
|
||||
"output": "high_mem",
|
||||
"output_scalar": "load_perc",
|
||||
"template": "Job ({{ job.jobId }})\nThis job was detected as high_memory_load because the memory usage {{ high_mem_nodes.array }} falls above the threshold {{ load_threshold }}."
|
||||
}
|
@@ -1,36 +0,0 @@
|
||||
{
|
||||
"name": "Low GPU load",
|
||||
"tag": "lowgpuload",
|
||||
"parameters": [
|
||||
"lowgpuload_threshold_factor",
|
||||
"job_min_duration_seconds",
|
||||
"sampling_interval_seconds"
|
||||
],
|
||||
"metrics": [
|
||||
"nv_util"
|
||||
],
|
||||
"requirements": [
|
||||
"job.duration > job_min_duration_seconds",
|
||||
"required_metrics_min_samples > job_min_duration_seconds / sampling_interval_seconds"
|
||||
],
|
||||
"terms": [
|
||||
{
|
||||
"load_mean": "nv_util.mean('all')"
|
||||
},
|
||||
{
|
||||
"load_threshold": "job.numAcc * lowgpuload_threshold_factor"
|
||||
},
|
||||
{
|
||||
"lowload_nodes": "load_mean < load_threshold"
|
||||
},
|
||||
{
|
||||
"lowload": "lowload_nodes.any('all')"
|
||||
},
|
||||
{
|
||||
"load_perc": "1.0 - (load_mean / load_threshold)"
|
||||
}
|
||||
],
|
||||
"output": "lowload",
|
||||
"output_scalar": "load_perc",
|
||||
"template": "Job ({{ job.jobId }})\nThis job was detected as lowgpuload because the mean gpu load {{ load_mean }} falls below the threshold {{ load_threshold }}."
|
||||
}
|
@@ -11,28 +11,18 @@
|
||||
],
|
||||
"requirements": [
|
||||
"job.exclusive == 1",
|
||||
"job.duration > job_min_duration_seconds",
|
||||
"required_metrics_min_samples > job_min_duration_seconds / sampling_interval_seconds"
|
||||
"job.duration > job_min_duration_seconds"
|
||||
],
|
||||
"tagRule": [
|
||||
"variables": [
|
||||
{
|
||||
"load_mean": "cpu_load[cpu_load_pre_cutoff_samples:].mean('all')"
|
||||
"name": "load_threshold",
|
||||
"expr": "job.numHwthreads * lowcpuload_threshold_factor"
|
||||
},
|
||||
{
|
||||
"load_threshold": "job.numHwthreads * lowcpuload_threshold_factor"
|
||||
},
|
||||
{
|
||||
"lowload_nodes": "load_mean < load_threshold"
|
||||
},
|
||||
{
|
||||
"lowload": "lowload_nodes.any('all')"
|
||||
},
|
||||
{
|
||||
"load_perc": "1.0 - (load_mean / load_threshold)"
|
||||
"name": "load_perc",
|
||||
"expr": "1.0 - (cpu_load / load_threshold)"
|
||||
}
|
||||
],
|
||||
"valueRule": [],
|
||||
"output": "lowload",
|
||||
"output_scalar": "load_perc",
|
||||
"hint": "Job ({{ job.jobId }})\nThis job was detected as lowload because the mean cpu load {{ load_mean }} falls below the threshold {{ load_threshold }}."
|
||||
"rule": "cpu_load < load_threshold",
|
||||
"hint": "This job was detected as lowload because the average cpu load {{ cpu_load }} falls below the threshold {{ load_threshold }}."
|
||||
}
|
||||
|
14
internal/tagger/jobclasses/parameters.json
Normal file
14
internal/tagger/jobclasses/parameters.json
Normal file
@@ -0,0 +1,14 @@
|
||||
{
|
||||
"lowcpuload_threshold_factor": 0.9,
|
||||
"excessivecpuload_threshold_factor": 1.1,
|
||||
"highmemoryusage_threshold_factor": 0.9,
|
||||
"node_load_imbalance_threshold_factor": 0.1,
|
||||
"core_load_imbalance_threshold_factor": 0.1,
|
||||
"high_memory_load_threshold_factor": 0.9,
|
||||
"lowgpuload_threshold_factor": 0.7,
|
||||
"memory_leak_slope_threshold": 0.1,
|
||||
"job_min_duration_seconds": 600.0,
|
||||
"sampling_interval_seconds": 30.0,
|
||||
"cpu_load_pre_cutoff_samples": 11.0,
|
||||
"cpu_load_core_pre_cutoff_samples": 6.0
|
||||
}
|
Reference in New Issue
Block a user