Update config for v1.5.0

This commit is contained in:
Jan Eitzinger
2026-03-04 14:57:11 +01:00
parent 8c8c40b547
commit bd22bfe5e6
13 changed files with 4091 additions and 6825 deletions

View File

@@ -9,25 +9,15 @@ You can find an overview about all clusters
Some systems run with job exclusive nodes, others have node sharing enabled. Some systems run with job exclusive nodes, others have node sharing enabled.
There are CPU systems (Fritz, Meggie, Woody, TinyFat) as well as GPU accelerated There are CPU systems (Fritz, Meggie, Woody, TinyFat) as well as GPU accelerated
clusters (Alex, TinyGPU). clusters (Alex, Helma, TinyGPU).
NHR@FAU uses the following stack: NHR@FAU uses the following stack:
* `cc-metric-collector` as node agent * `cc-metric-collector`
* `cc-metric-store` as temporal metric time series cache. We use one instance
for all clusters.
* `cc-backend` * `cc-backend`
* A homegrown Python script running on the management nodes for providing job * `cc-slurm-adapter`
metadata from Slurm
* Built-in SQLite database for job metadata and user data (currently 50 GB in size)
* Job Archive without retention using compressed data.json files (around 700GB)
Currently all APIs use the regular HTTP protocol, but we plan to switch to NATS for We use the following server with Ubuntu Linux:
all communication.
We also push the metric data to an InfluxDB instance for debugging purposes.
The backend and metric store run on the same dedicated Dell server running
Ubuntu Linux:
* Two Intel Xeon(R) Platinum 8352Y with 32 cores each * Two Intel Xeon(R) Platinum 8352Y with 32 cores each
* 512 GB Main memory capacity * 512 GB Main memory capacity

View File

@@ -1,17 +1,19 @@
[Unit] [Unit]
Description=ClusterCockpit Web Server Description=ClusterCockpit Backend
Documentation=https://clustercockpit.org Documentation=https://github.com/ClusterCockpit/cc-backend
Wants=network-online.target Wants=network-online.target
After=network-online.target After=network-online.target
After=mariadb.service mysql.service
[Service] [Service]
WorkingDirectory=/opt/monitoring/cc-backend WorkingDirectory=/opt/monitoring/cc-backend
Type=notify Type=notify
User=clustercockpit
Group=clustercockpit
NotifyAccess=all NotifyAccess=all
Restart=on-failure Restart=on-failure
RestartSec=30 RestartSec=30
TimeoutStopSec=100 TimeoutStartSec=200
TimeoutStopSec=200
ExecStart=/opt/monitoring/cc-backend/cc-backend -loglevel info -server -config ./config.json ExecStart=/opt/monitoring/cc-backend/cc-backend -loglevel info -server -config ./config.json
[Install] [Install]

View File

@@ -1,26 +1,65 @@
{ {
"addr": "0.0.0.0:443", "main": {
"stop-jobs-exceeding-walltime": 288000, "addr": "127.0.0.1:8050",
"api-allowed-ips": ["*"],
"stop-jobs-exceeding-walltime":288000,
"short-running-jobs-duration": 300, "short-running-jobs-duration": 300,
"resampling": {
"minimum-points": 600,
"trigger": 180,
"resolutions": [240, 60]
},
"nodestate-retention": {
"policy": "move",
"target-kind": "file",
"target-path": "/opt/monitoring/cc-backend/var/nodestate-archive/"
},
"emission-constant": 317,
"enable-job-taggers": true
},
"cron": {
"commit-job-worker": "1m",
"duration-worker": "5m",
"footprint-worker": "10m"
},
"archive": {
"kind": "file",
"path": "./var/job-archive"
},
"auth": {
"ldap": { "ldap": {
"url": "ldaps://hpcldap.rrze.uni-erlangen.de", "url": "ldaps://hpcldap.rrze.uni-erlangen.de",
"user_base": "ou=people,ou=hpc,dc=rrze,dc=uni-erlangen,dc=de", "user-base": "ou=people,ou=hpc,dc=rrze,dc=uni-erlangen,dc=de",
"search_dn": "cn=hpcmonitoring,ou=roadm,ou=profile,ou=hpc,dc=rrze,dc=uni-erlangen,dc=de", "search-dn": "cn=hpcmonitoring,ou=roadm,ou=profile,ou=hpc,dc=rrze,dc=uni-erlangen,dc=de",
"user_bind": "uid={username},ou=people,ou=hpc,dc=rrze,dc=uni-erlangen,dc=de", "user-bind": "uid={username},ou=people,ou=hpc,dc=rrze,dc=uni-erlangen,dc=de",
"user_filter": "(&(objectclass=posixAccount))", "user-filter": "(&(objectclass=posixAccount))",
"sync_interval": "24h" "sync-interval": "24h"
}, },
"jwts": { "jwts": {
"syncUserOnLogin": true, "sync-user-on-login": true,
"updateUserOnLogin": true, "update-user-on-login": true,
"trustedIssuer": "https://portal.hpc.fau.de/", "validate-user": false,
"validateUser": false, "max-age": "1h",
"max-age": "168h" "trusted-issuer": "https://portal.hpc.fau.de/"
}
},
"metric-store": {
"checkpoints": {
"file-format": "wal",
"directory": "./var/metric-checkpoints"
},
"cleanup": {
"mode": "archive",
"directory": "./var/metric-archive"
},
"nats-subscriptions": [
{
"subscribe-to": "ccmetrics.>"
}
],
"retention-in-memory": "24h",
"memory-cap": 200
}, },
"https-cert-file": "/etc/letsencrypt/live/monitoring.nhr.fau.de/fullchain.pem",
"https-key-file": "/etc/letsencrypt/live/monitoring.nhr.fau.de/privkey.pem",
"user": "clustercockpit",
"group": "clustercockpit",
"archive": { "archive": {
"kind": "file", "kind": "file",
"path": "./var/job-archive", "path": "./var/job-archive",
@@ -29,213 +68,10 @@
"policy": "none" "policy": "none"
} }
}, },
"enable-resampling": { "nats": {
"trigger": 30, "address": "nats://monitoring.nhr.fau.de:4222",
"resolutions": [ "username": "metricstore",
600, "password": "XXX"
300,
120,
60
]
}, },
"emission-constant": 317, "ui-file": "uiConfig.json"
"ui-defaults": {
"analysis_view_histogramMetrics": [
"flops_any",
"mem_bw",
"mem_used"
],
"analysis_view_scatterPlotMetrics": [
[
"flops_any",
"mem_bw"
],
[
"flops_any",
"cpu_load"
],
[
"cpu_load",
"mem_bw"
]
],
"job_view_nodestats_selectedMetrics": [
"flops_any",
"mem_bw",
"mem_used"
],
"job_view_polarPlotMetrics": [
"flops_any",
"mem_bw",
"mem_used"
],
"job_view_selectedMetrics": [
"flops_any",
"mem_bw",
"mem_used"
],
"job_view_showFootprint": true,
"job_list_usePaging": false,
"plot_general_colorBackground": true,
"plot_general_colorscheme": [
"#00bfff",
"#0000ff",
"#ff00ff",
"#ff0000",
"#ff8000",
"#ffff00",
"#80ff00"
],
"plot_general_lineWidth": 3,
"plot_list_jobsPerPage": 10,
"plot_list_selectedMetrics": [
"cpu_load",
"mem_used",
"flops_any",
"mem_bw"
],
"plot_view_plotsPerRow": 3,
"plot_view_showPolarplot": true,
"plot_view_showRoofline": true,
"plot_view_showStatTable": true,
"system_view_selectedMetric": "cpu_load",
"analysis_view_selectedTopEntity": "user",
"analysis_view_selectedTopCategory": "totalWalltime",
"status_view_selectedTopUserCategory": "totalJobs",
"status_view_selectedTopProjectCategory": "totalJobs"
},
"clusters": [
{
"name": "fritz",
"metricDataRepository": {
"kind": "cc-metric-store",
"url": "http://localhost:8082",
"token": "-"
},
"filterRanges": {
"numNodes": {
"from": 1,
"to": 64
},
"duration": {
"from": 0,
"to": 86400
},
"startTime": {
"from": "2022-01-01T00:00:00Z",
"to": null
}
}
},
{
"name": "alex",
"metricDataRepository": {
"kind": "cc-metric-store",
"url": "http://localhost:8082",
"token": "-"
},
"filterRanges": {
"numNodes": {
"from": 1,
"to": 64
},
"duration": {
"from": 0,
"to": 86400
},
"startTime": {
"from": "2022-01-01T00:00:00Z",
"to": null
}
}
},
{
"name": "woody",
"metricDataRepository": {
"kind": "cc-metric-store",
"url": "http://localhost:8082",
"token": "-"
},
"filterRanges": {
"numNodes": {
"from": 1,
"to": 1
},
"duration": {
"from": 0,
"to": 172800
},
"startTime": {
"from": "2020-01-01T00:00:00Z",
"to": null
}
}
},
{
"name": "tinyfat",
"metricDataRepository": {
"kind": "cc-metric-store",
"url": "http://localhost:8082",
"token": "-"
},
"filterRanges": {
"numNodes": {
"from": 1,
"to": 1
},
"duration": {
"from": 0,
"to": 172800
},
"startTime": {
"from": "2020-01-01T00:00:00Z",
"to": null
}
}
},
{
"name": "tinygpu",
"metricDataRepository": {
"kind": "cc-metric-store",
"url": "http://localhost:8082",
"token": "-"
},
"filterRanges": {
"numNodes": {
"from": 1,
"to": 1
},
"duration": {
"from": 0,
"to": 172800
},
"startTime": {
"from": "2020-01-01T00:00:00Z",
"to": null
}
}
},
{
"name": "meggie",
"metricDataRepository": {
"kind": "cc-metric-store",
"url": "http://localhost:8082",
"token": "-"
},
"filterRanges": {
"numNodes": {
"from": 1,
"to": 64
},
"duration": {
"from": 0,
"to": 86400
},
"startTime": {
"from": "2018-01-01T00:00:00Z",
"to": null
}
}
}
]
} }

View File

@@ -0,0 +1,59 @@
{
"job-view": {
"show-polar-plot":true,
"show-footprint": true,
"show-roofline": true,
"show-stat-table": true
},
"metric-config": {
"job-list-metrics": ["cpu_load", "flops_any", "mem_bw", "mem_used"],
"job-view-plot-metrics": ["cpu_load", "cpu_user", "flops_any", "mem_bw", "mem_used", "clock", "ipc", "cpu_power", "nfs4_total"],
"job-view-table-metrics": ["mem_bw", "flops_any", "mem_used"],
"clusters": [
{
"name": "fritz",
"job-view-plot-metrics": ["cpu_load", "cpu_user", "flops_any", "vectorization_ratio", "mem_bw", "mem_used", "flops_dp", "flops_sp", "ib_recv", "ib_xmit", "clock", "ipc", "cpu_power", "mem_power", "nfs4_total"]
},
{
"name": "alex",
"job-list-metrics": ["acc_utilization", "acc_mem_used", "cpu_load", "flops_any", "mem_bw", "mem_used"],
"job-view-plot-metrics": ["acc_utilization", "nv_mem_util", "acc_mem_used", "acc_power", "nv_sm_clock", "nv_temp", "cpu_load", "cpu_user", "flops_any", "mem_bw", "mem_used", "clock", "ipc", "cpu_power", "nfs4_total"],
"job-view-table-metrics": ["acc_utilization", "mem_bw", "flops_any", "mem_used"]
},
{
"name": "tinygpu",
"job-list-metrics": ["acc_utilization", "acc_mem_used", "cpu_load", "flops_any", "mem_bw", "mem_used"],
"job-view-plot-metrics": ["acc_utilization", "nv_mem_util", "acc_mem_used", "acc_power", "nv_sm_clock", "nv_temp", "cpu_load", "cpu_user", "flops_any", "mem_bw", "mem_used", "clock", "ipc", "cpu_power", "nfs4_total"],
"job-view-table-metrics": ["acc_utilization", "mem_bw", "flops_any", "mem_used"]
},
{
"name": "helma",
"job-list-metrics": ["acc_utilization", "acc_mem_used", "cpu_load", "flops_any", "mem_bw", "mem_used"],
"job-view-plot-metrics": ["acc_utilization", "nv_mem_util", "acc_mem_used", "acc_power", "nv_sm_clock", "nv_temp", "cpu_load", "cpu_user", "flops_any", "mem_bw", "mem_used", "clock", "ipc", "cpu_power", "ib_recv", "ib_xmit", "nfs4_total"],
"job-view-table-metrics": ["acc_utilization", "mem_bw", "flops_any", "mem_used"],
"sub-clusters": [
{
"name": "cpu",
"job-list-metrics": ["cpu_load", "flops_any", "mem_bw", "mem_used"],
"job-view-plot-metrics": [ "cpu_load", "cpu_user", "flops_any", "mem_bw", "mem_used", "clock", "ipc", "cpu_power", "flops_dp", "flops_sp", "ib_recv", "ib_xmit", "nfs4_total"],
"job-view-table-metrics": ["mem_bw", "flops_any", "mem_used"]
}
]
}
]
},
"plot-configuration": {
"plots-per-row": 3,
"color-background": true,
"line-width": 3,
"color-scheme": [
"#00bfff",
"#0000ff",
"#ff00ff",
"#ff0000",
"#ff8000",
"#ffff00",
"#80ff00"
]
}
}

View File

@@ -1,19 +0,0 @@
[Unit]
Description=ClusterCockpit In-Memory Timeseries Database for Fritz (cc-metric-store)
Documentation=https://github.com/ClusterCockpit/cc-metric-store
Wants=network-online.target
After=network-online.target
[Service]
Type=simple
User=clustercockpit
Group=clustercockpit
Restart=on-failure
RestartSec=30
TimeoutStopSec=100
WorkingDirectory=/opt/monitoring/cc-metric-store/fritz
ExecStart=/opt/monitoring/cc-metric-store/repo/cc-metric-store --config ./config.json
LimitNOFILE=500000
[Install]
WantedBy=multi-user.target

View File

@@ -1,180 +0,0 @@
{
"metrics": {
"clock": {
"frequency": 60,
"aggregation": "avg"
},
"cpu_idle": {
"frequency": 60,
"aggregation": "avg"
},
"cpu_iowait": {
"frequency": 60,
"aggregation": "avg"
},
"cpu_irq": {
"frequency": 60,
"aggregation": "avg"
},
"cpu_system": {
"frequency": 60,
"aggregation": "avg"
},
"cpu_user": {
"frequency": 60,
"aggregation": "avg"
},
"nv_mem_util": {
"frequency": 60,
"aggregation": "avg"
},
"nv_temp": {
"frequency": 60,
"aggregation": "avg"
},
"nv_sm_clock": {
"frequency": 60,
"aggregation": "avg"
},
"acc_utilization": {
"frequency": 60,
"aggregation": "avg"
},
"acc_mem_used": {
"frequency": 60,
"aggregation": "sum"
},
"acc_power": {
"frequency": 60,
"aggregation": "sum"
},
"flops_any": {
"frequency": 60,
"aggregation": "sum"
},
"flops_dp": {
"frequency": 60,
"aggregation": "sum"
},
"flops_sp": {
"frequency": 60,
"aggregation": "sum"
},
"ib_recv": {
"frequency": 60,
"aggregation": "sum"
},
"ib_xmit": {
"frequency": 60,
"aggregation": "sum"
},
"ib_recv_pkts": {
"frequency": 60,
"aggregation": "sum"
},
"ib_xmit_pkts": {
"frequency": 60,
"aggregation": "sum"
},
"cpu_power": {
"frequency": 60,
"aggregation": "sum"
},
"core_power": {
"frequency": 60,
"aggregation": "sum"
},
"mem_power": {
"frequency": 60,
"aggregation": "sum"
},
"ipc": {
"frequency": 60,
"aggregation": "avg"
},
"cpu_load": {
"frequency": 60,
"aggregation": null
},
"lustre_close": {
"frequency": 60,
"aggregation": null
},
"lustre_open": {
"frequency": 60,
"aggregation": null
},
"lustre_statfs": {
"frequency": 60,
"aggregation": null
},
"lustre_read_bytes": {
"frequency": 60,
"aggregation": null
},
"lustre_write_bytes": {
"frequency": 60,
"aggregation": null
},
"net_bw": {
"frequency": 60,
"aggregation": null
},
"file_bw": {
"frequency": 60,
"aggregation": null
},
"mem_bw": {
"frequency": 60,
"aggregation": "sum"
},
"mem_cached": {
"frequency": 60,
"aggregation": null
},
"mem_used": {
"frequency": 60,
"aggregation": null
},
"net_bytes_in": {
"frequency": 60,
"aggregation": null
},
"net_bytes_out": {
"frequency": 60,
"aggregation": null
},
"nfs4_read": {
"frequency": 60,
"aggregation": null
},
"nfs4_total": {
"frequency": 60,
"aggregation": null
},
"nfs4_write": {
"frequency": 60,
"aggregation": null
},
"vectorization_ratio": {
"frequency": 60,
"aggregation": "avg"
}
},
"checkpoints": {
"interval": "12h",
"directory": "/opt/monitoring/cc-metric-store/fritz/checkpoints",
"restore": "48h"
},
"archive": {
"interval": "50h",
"directory": "/opt/monitoring/cc-metric-store/fritz/archive"
},
"http-api": {
"address": "0.0.0.0:8082",
"https-cert-file": null,
"https-key-file": null
},
"retention-in-memory": "48h",
"jwt-public-key": "-"
}

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

View File

@@ -73,6 +73,32 @@
"caution": 10, "caution": 10,
"alert": 2 "alert": 2
}, },
{
"name": "net_bytes_in",
"unit": {
"base": "B/s"
},
"scope": "node",
"aggregation": "sum",
"timestep": 60,
"peak": 50000000,
"normal": 10000000,
"caution": 5000,
"alert": 1000
},
{
"name": "net_bytes_out",
"unit": {
"base": "B/s"
},
"scope": "node",
"aggregation": "sum",
"timestep": 60,
"peak": 50000000,
"normal": 200000,
"caution": 5000,
"alert": 1000
},
{ {
"name": "mem_bw", "name": "mem_bw",
"unit": { "unit": {
@@ -165,9 +191,9 @@
"scope": "node", "scope": "node",
"aggregation": "sum", "aggregation": "sum",
"timestep": 60, "timestep": 60,
"peak": 6, "peak": 50000,
"normal": 4, "normal": 10000,
"caution": 2, "caution": 10,
"alert": 1 "alert": 1
}, },
{ {
@@ -179,10 +205,10 @@
"scope": "node", "scope": "node",
"aggregation": "sum", "aggregation": "sum",
"timestep": 60, "timestep": 60,
"peak": 6, "peak": 50000,
"normal": 4, "normal": 10000,
"caution": 2, "caution": 20,
"alert": 1 "alert": 5
} }
], ],
"subClusters": [ "subClusters": [
@@ -216,140 +242,18 @@
}, },
"topology": { "topology": {
"node": [ "node": [
0, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19
1,
2,
3,
4,
5,
6,
7,
8,
9,
10,
11,
12,
13,
14,
15,
16,
17,
18,
19
], ],
"socket": [ "socket": [
[ [ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9 ],
0, [ 10, 11, 12, 13, 14, 15, 16, 17, 18, 19 ]
1,
2,
3,
4,
5,
6,
7,
8,
9
],
[
10,
11,
12,
13,
14,
15,
16,
17,
18,
19
]
], ],
"memoryDomain": [ "memoryDomain": [
[ [ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9 ],
0, [ 10, 11, 12, 13, 14, 15, 16, 17, 18, 19 ]
1,
2,
3,
4,
5,
6,
7,
8,
9
],
[
10,
11,
12,
13,
14,
15,
16,
17,
18,
19
]
], ],
"core": [ "core": [
[ [ 0 ], [ 1 ], [ 2 ], [ 3 ], [ 4 ], [ 5 ], [ 6 ], [ 7 ], [ 8 ], [ 9 ], [ 10 ], [ 11 ], [ 12 ], [ 13 ], [ 14 ], [ 15 ], [ 16 ], [ 17 ], [ 18 ], [ 19 ]
0
],
[
1
],
[
2
],
[
3
],
[
4
],
[
5
],
[
6
],
[
7
],
[
8
],
[
9
],
[
10
],
[
11
],
[
12
],
[
13
],
[
14
],
[
15
],
[
16
],
[
17
],
[
18
],
[
19
]
] ]
} }
} }

View File

@@ -178,9 +178,9 @@
"scope": "node", "scope": "node",
"aggregation": "sum", "aggregation": "sum",
"timestep": 60, "timestep": 60,
"peak": 6, "peak": 50000,
"normal": 4, "normal": 10000,
"caution": 2, "caution": 10,
"alert": 1 "alert": 1
}, },
{ {
@@ -192,9 +192,9 @@
"scope": "node", "scope": "node",
"aggregation": "sum", "aggregation": "sum",
"timestep": 60, "timestep": 60,
"peak": 6, "peak": 50000,
"normal": 4, "normal": 10000,
"caution": 2, "caution": 10,
"alert": 1 "alert": 1
}, },
{ {
@@ -206,10 +206,10 @@
"scope": "node", "scope": "node",
"aggregation": "sum", "aggregation": "sum",
"timestep": 60, "timestep": 60,
"peak": 6, "peak": 50000,
"normal": 4, "normal": 10000,
"caution": 2, "caution": 20,
"alert": 1 "alert": 5
} }
], ],
"subClusters": [ "subClusters": [

View File

@@ -86,6 +86,32 @@
"caution": 100, "caution": 100,
"alert": 50 "alert": 50
}, },
{
"name": "net_bytes_in",
"unit": {
"base": "B/s"
},
"scope": "node",
"aggregation": "sum",
"timestep": 60,
"peak": 50000000,
"normal": 10000000,
"caution": 5000,
"alert": 1000
},
{
"name": "net_bytes_out",
"unit": {
"base": "B/s"
},
"scope": "node",
"aggregation": "sum",
"timestep": 60,
"peak": 50000000,
"normal": 200000,
"caution": 5000,
"alert": 1000
},
{ {
"name": "mem_bw", "name": "mem_bw",
"unit": { "unit": {
@@ -234,6 +260,7 @@
}, },
"scope": "accelerator", "scope": "accelerator",
"aggregation": "sum", "aggregation": "sum",
"energy": "power",
"timestep": 60, "timestep": 60,
"peak": 400, "peak": 400,
"normal": 200, "normal": 200,
@@ -290,9 +317,9 @@
"scope": "node", "scope": "node",
"aggregation": "sum", "aggregation": "sum",
"timestep": 60, "timestep": 60,
"peak": 6, "peak": 50000,
"normal": 4, "normal": 10000,
"caution": 2, "caution": 10,
"alert": 1 "alert": 1
}, },
{ {
@@ -304,9 +331,9 @@
"scope": "node", "scope": "node",
"aggregation": "sum", "aggregation": "sum",
"timestep": 60, "timestep": 60,
"peak": 6, "peak": 50000,
"normal": 4, "normal": 10000,
"caution": 2, "caution": 10,
"alert": 1 "alert": 1
}, },
{ {
@@ -318,10 +345,10 @@
"scope": "node", "scope": "node",
"aggregation": "sum", "aggregation": "sum",
"timestep": 60, "timestep": 60,
"peak": 6, "peak": 50000,
"normal": 4, "normal": 10000,
"caution": 2, "caution": 20,
"alert": 1 "alert": 5
} }
], ],
"subClusters": [ "subClusters": [

View File

@@ -61,6 +61,7 @@
"aggregation": "sum", "aggregation": "sum",
"footprint": "max", "footprint": "max",
"timestep": 60, "timestep": 60,
"lowerIsBetter": true,
"peak": 32, "peak": 32,
"normal": 16, "normal": 16,
"caution": 28, "caution": 28,
@@ -89,11 +90,11 @@
"normal": 50, "normal": 50,
"caution": 20, "caution": 20,
"alert": 10, "alert": 10,
"footprint": "avg", "footprint":"avg",
"subClusters": [ "subClusters": [
{ {
"name": "icelake", "name": "icelake",
"footprint": "avg", "footprint":"avg",
"peak": 2970, "peak": 2970,
"normal": 1000, "normal": 1000,
"caution": 100, "caution": 100,
@@ -109,10 +110,10 @@
"scope": "node", "scope": "node",
"aggregation": "sum", "aggregation": "sum",
"timestep": 60, "timestep": 60,
"peak": 125000000, "peak": 50000000,
"normal": 125000000, "normal": 10000000,
"caution": 200, "caution": 5000,
"alert": 240 "alert": 1000
}, },
{ {
"name": "net_bytes_out", "name": "net_bytes_out",
@@ -122,10 +123,10 @@
"scope": "node", "scope": "node",
"aggregation": "sum", "aggregation": "sum",
"timestep": 60, "timestep": 60,
"peak": 125000000, "peak": 50000000,
"normal": 125000000, "normal": 200000,
"caution": 200, "caution": 5000,
"alert": 240 "alert": 1000
}, },
{ {
"name": "flops_dp", "name": "flops_dp",
@@ -186,12 +187,12 @@
"normal": 10, "normal": 10,
"caution": 5, "caution": 5,
"alert": 2, "alert": 2,
"footprint": "avg", "footprint":"avg",
"subClusters": [ "subClusters": [
{ {
"name": "icelake", "name": "icelake",
"peak": 350, "peak": 350,
"footprint": "avg", "footprint":"avg",
"normal": 100, "normal": 100,
"caution": 50, "caution": 50,
"alert": 20 "alert": 20
@@ -234,9 +235,9 @@
"scope": "node", "scope": "node",
"aggregation": "sum", "aggregation": "sum",
"timestep": 60, "timestep": 60,
"peak": 6, "peak": 50000,
"normal": 4, "normal": 10000,
"caution": 2, "caution": 10,
"alert": 1 "alert": 1
}, },
{ {
@@ -248,10 +249,10 @@
"scope": "node", "scope": "node",
"aggregation": "sum", "aggregation": "sum",
"timestep": 60, "timestep": 60,
"peak": 6, "peak": 50000,
"normal": 4, "normal": 10000,
"caution": 2, "caution": 20,
"alert": 1 "alert": 5
} }
], ],
"subClusters": [ "subClusters": [
@@ -285,40 +286,16 @@
}, },
"topology": { "topology": {
"node": [ "node": [
0, 0, 1, 2, 3
1,
2,
3
], ],
"socket": [ "socket": [
[ [ 0, 1, 2, 3 ]
0,
1,
2,
3
]
], ],
"memoryDomain": [ "memoryDomain": [
[ [ 0, 1, 2, 3 ]
0,
1,
2,
3
]
], ],
"core": [ "core": [
[ [ 0 ], [ 1 ], [ 2 ], [ 3 ]
0
],
[
1
],
[
2
],
[
3
]
] ]
} }
}, },
@@ -352,40 +329,16 @@
}, },
"topology": { "topology": {
"node": [ "node": [
0, 0, 1, 2, 3
1,
2,
3
], ],
"socket": [ "socket": [
[ [ 0, 1, 2, 3 ]
0,
1,
2,
3
]
], ],
"memoryDomain": [ "memoryDomain": [
[ [ 0, 1, 2, 3 ]
0,
1,
2,
3
]
], ],
"core": [ "core": [
[ [ 0 ], [ 1 ], [ 2 ], [ 3 ]
0
],
[
1
],
[
2
],
[
3
]
] ]
} }
}, },
@@ -419,47 +372,22 @@
}, },
"topology": { "topology": {
"node": [ "node": [
0, 0, 1, 2, 3
1,
2,
3
], ],
"socket": [ "socket": [
[ [ 0, 1, 2, 3 ]
0,
1,
2,
3
]
], ],
"memoryDomain": [ "memoryDomain": [
[ [ 0, 1, 2, 3 ]
0,
1,
2,
3
]
], ],
"core": [ "core": [
[ [ 0 ], [ 1 ], [ 2 ], [ 3 ]
0
],
[
1
],
[
2
],
[
3
]
] ]
} }
}, },
{ {
"name": "icelake", "name": "icelake",
"nodes": "w22[01-35],w23[01-35],w24[01-20],w25[01-20]", "processorType": "Intel(R) Xeon(R) Gold 6326",
"processorType": "Intel Xeon Gold 6326",
"socketsPerNode": 2, "socketsPerNode": 2,
"coresPerSocket": 16, "coresPerSocket": 16,
"threadsPerCore": 1, "threadsPerCore": 1,
@@ -468,475 +396,37 @@
"base": "F/s", "base": "F/s",
"prefix": "G" "prefix": "G"
}, },
"value": 432 "value": 78
}, },
"flopRateSimd": { "flopRateSimd": {
"unit": { "unit": {
"base": "F/s", "base": "F/s",
"prefix": "G" "prefix": "G"
}, },
"value": 9216 "value": 1448
}, },
"memoryBandwidth": { "memoryBandwidth": {
"unit": { "unit": {
"base": "B/s", "base": "B/s",
"prefix": "G" "prefix": "G"
}, },
"value": 350 "value": 274
}, },
"nodes": "w22[01-35],w23[01-35],w24[01-20],w25[01-20]",
"topology": { "topology": {
"node": [ "node": [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31],
0,
1,
2,
3,
4,
5,
6,
7,
8,
9,
10,
11,
12,
13,
14,
15,
16,
17,
18,
19,
20,
21,
22,
23,
24,
25,
26,
27,
28,
29,
30,
31,
32,
33,
34,
35,
36,
37,
38,
39,
40,
41,
42,
43,
44,
45,
46,
47,
48,
49,
50,
51,
52,
53,
54,
55,
56,
57,
58,
59,
60,
61,
62,
63,
64,
65,
66,
67,
68,
69,
70,
71
],
"socket": [ "socket": [
[ [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],
0, [16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31]
1,
2,
3,
4,
5,
6,
7,
8,
9,
10,
11,
12,
13,
14,
15,
16,
17,
18,
19,
20,
21,
22,
23,
24,
25,
26,
27,
28,
29,
30,
31,
32,
33,
34,
35
],
[
36,
37,
38,
39,
40,
41,
42,
43,
44,
45,
46,
47,
48,
49,
50,
51,
52,
53,
54,
55,
56,
57,
58,
59,
60,
61,
62,
63,
64,
65,
66,
67,
68,
69,
70,
71
]
], ],
"memoryDomain": [ "memoryDomain": [
[ [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],
0, [16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31]
1,
2,
3,
4,
5,
6,
7,
8,
9,
10,
11,
12,
13,
14,
15,
16,
17
],
[
18,
19,
20,
21,
22,
23,
24,
25,
26,
27,
28,
29,
30,
31,
32,
33,
34,
35
],
[
36,
37,
38,
39,
40,
41,
42,
43,
44,
45,
46,
47,
48,
49,
50,
51,
52,
53
],
[
54,
55,
56,
57,
58,
59,
60,
61,
62,
63,
64,
65,
66,
67,
68,
69,
70,
71
]
], ],
"core": [ "core": [
[ [0],[1],[2],[3],[4],[5],[6],[7],[8],[9],[10],[11],[12],[13],[14],[15],[16],[17],[18],[19],[20],[21],[22],[23],[24],[25],[26],[27],[28],[29],[30],[31]
0
],
[
1
],
[
2
],
[
3
],
[
4
],
[
5
],
[
6
],
[
7
],
[
8
],
[
9
],
[
10
],
[
11
],
[
12
],
[
13
],
[
14
],
[
15
],
[
16
],
[
17
],
[
18
],
[
19
],
[
20
],
[
21
],
[
22
],
[
23
],
[
24
],
[
25
],
[
26
],
[
27
],
[
28
],
[
29
],
[
30
],
[
31
],
[
32
],
[
33
],
[
34
],
[
35
],
[
36
],
[
37
],
[
38
],
[
39
],
[
40
],
[
41
],
[
42
],
[
43
],
[
44
],
[
45
],
[
46
],
[
47
],
[
48
],
[
49
],
[
50
],
[
51
],
[
52
],
[
53
],
[
54
],
[
55
],
[
56
],
[
57
],
[
58
],
[
59
],
[
60
],
[
61
],
[
62
],
[
63
],
[
64
],
[
65
],
[
66
],
[
67
],
[
68
],
[
69
],
[
70
],
[
71
]
] ]
} }
} }
] ]