diff --git a/nhr@fau/README.md b/nhr@fau/README.md index c7858f8..209b6fd 100644 --- a/nhr@fau/README.md +++ b/nhr@fau/README.md @@ -9,25 +9,15 @@ You can find an overview about all clusters Some systems run with job exclusive nodes, others have node sharing enabled. There are CPU systems (Fritz, Meggie, Woody, TinyFat) as well as GPU accelerated -clusters (Alex, TinyGPU). +clusters (Alex, Helma, TinyGPU). NHR@FAU uses the following stack: -* `cc-metric-collector` as node agent -* `cc-metric-store` as temporal metric time series cache. We use one instance -for all clusters. +* `cc-metric-collector` * `cc-backend` -* A homegrown python script running on the management nodes for providing job -meta data from Slurm -* Builtin sqlite database for job meta and user data (currently 50GB large) -* Job Archive without retention using compressed data.json files (around 700GB) +* `cc-slurm-adapter` -Currently all API use regular HTTP protocol, but we plan to switch to NATS for -all communication. -We also push the metric data to an InfluxDB instance for debugging purposes. 
- -The backend and metric store run on the same dedicated Dell server running -Ubuntu Linux: +We use the following server with Ubuntu Linux: * Two Intel Xeon(R) Platinum 8352Y with 32 cores each * 512 GB Main memory capacity diff --git a/nhr@fau/cc-backend/clustercockpit.service b/nhr@fau/cc-backend/clustercockpit.service index 89bdd42..987057d 100644 --- a/nhr@fau/cc-backend/clustercockpit.service +++ b/nhr@fau/cc-backend/clustercockpit.service @@ -1,18 +1,20 @@ [Unit] -Description=ClusterCockpit Web Server -Documentation=https://clustercockpit.org +Description=ClusterCockpit Backend +Documentation=https://github.com/ClusterCockpit/cc-backend Wants=network-online.target After=network-online.target -After=mariadb.service mysql.service [Service] WorkingDirectory=/opt/monitoring/cc-backend Type=notify +User=clustercockpit +Group=clustercockpit NotifyAccess=all Restart=on-failure RestartSec=30 -TimeoutStopSec=100 -ExecStart=/opt/monitoring/cc-backend/cc-backend -loglevel info -server -config ./config.json +TimeoutStartSec=200 +TimeoutStopSec=200 +ExecStart=/opt/monitoring/cc-backend/cc-backend -loglevel info -server -config ./config.json [Install] WantedBy=multi-user.target diff --git a/nhr@fau/cc-backend/config.json b/nhr@fau/cc-backend/config.json index 062a98e..200be6e 100644 --- a/nhr@fau/cc-backend/config.json +++ b/nhr@fau/cc-backend/config.json @@ -1,241 +1,77 @@ { - "addr": "0.0.0.0:443", - "stop-jobs-exceeding-walltime": 288000, - "short-running-jobs-duration": 300, - "ldap": { - "url": "ldaps://hpcldap.rrze.uni-erlangen.de", - "user_base": "ou=people,ou=hpc,dc=rrze,dc=uni-erlangen,dc=de", - "search_dn": "cn=hpcmonitoring,ou=roadm,ou=profile,ou=hpc,dc=rrze,dc=uni-erlangen,dc=de", - "user_bind": "uid={username},ou=people,ou=hpc,dc=rrze,dc=uni-erlangen,dc=de", - "user_filter": "(&(objectclass=posixAccount))", - "sync_interval": "24h" - }, - "jwts": { - "syncUserOnLogin": true, - "updateUserOnLogin": true, - "trustedIssuer": "https://portal.hpc.fau.de/", - 
"validateUser": false, - "max-age": "168h" - }, - "https-cert-file": "/etc/letsencrypt/live/monitoring.nhr.fau.de/fullchain.pem", - "https-key-file": "/etc/letsencrypt/live/monitoring.nhr.fau.de/privkey.pem", - "user": "clustercockpit", - "group": "clustercockpit", - "archive": { - "kind": "file", - "path": "./var/job-archive", - "compression": 7, - "retention": { - "policy": "none" - } - }, - "enable-resampling": { - "trigger": 30, - "resolutions": [ - 600, - 300, - 120, - 60 - ] - }, - "emission-constant": 317, - "ui-defaults": { - "analysis_view_histogramMetrics": [ - "flops_any", - "mem_bw", - "mem_used" - ], - "analysis_view_scatterPlotMetrics": [ - [ - "flops_any", - "mem_bw" - ], - [ - "flops_any", - "cpu_load" - ], - [ - "cpu_load", - "mem_bw" - ] - ], - "job_view_nodestats_selectedMetrics": [ - "flops_any", - "mem_bw", - "mem_used" - ], - "job_view_polarPlotMetrics": [ - "flops_any", - "mem_bw", - "mem_used" - ], - "job_view_selectedMetrics": [ - "flops_any", - "mem_bw", - "mem_used" - ], - "job_view_showFootprint": true, - "job_list_usePaging": false, - "plot_general_colorBackground": true, - "plot_general_colorscheme": [ - "#00bfff", - "#0000ff", - "#ff00ff", - "#ff0000", - "#ff8000", - "#ffff00", - "#80ff00" - ], - "plot_general_lineWidth": 3, - "plot_list_jobsPerPage": 10, - "plot_list_selectedMetrics": [ - "cpu_load", - "mem_used", - "flops_any", - "mem_bw" - ], - "plot_view_plotsPerRow": 3, - "plot_view_showPolarplot": true, - "plot_view_showRoofline": true, - "plot_view_showStatTable": true, - "system_view_selectedMetric": "cpu_load", - "analysis_view_selectedTopEntity": "user", - "analysis_view_selectedTopCategory": "totalWalltime", - "status_view_selectedTopUserCategory": "totalJobs", - "status_view_selectedTopProjectCategory": "totalJobs" - }, - "clusters": [ - { - "name": "fritz", - "metricDataRepository": { - "kind": "cc-metric-store", - "url": "http://localhost:8082", - "token": "-" - }, - "filterRanges": { - "numNodes": { - "from": 1, - "to": 
64 + "main": { + "addr": "127.0.0.1:8050", + "api-allowed-ips": ["*"], + "stop-jobs-exceeding-walltime":288000, + "short-running-jobs-duration": 300, + "resampling": { + "minimum-points": 600, + "trigger": 180, + "resolutions": [240, 60] }, - "duration": { - "from": 0, - "to": 86400 + "nodestate-retention": { + "policy": "move", + "target-kind": "file", + "target-path": "/opt/monitoring/cc-backend/var/nodestate-archive/" }, - "startTime": { - "from": "2022-01-01T00:00:00Z", - "to": null - } - } + "emission-constant": 317, + "enable-job-taggers": true }, - { - "name": "alex", - "metricDataRepository": { - "kind": "cc-metric-store", - "url": "http://localhost:8082", - "token": "-" - }, - "filterRanges": { - "numNodes": { - "from": 1, - "to": 64 - }, - "duration": { - "from": 0, - "to": 86400 - }, - "startTime": { - "from": "2022-01-01T00:00:00Z", - "to": null - } - } + "cron": { + "commit-job-worker": "1m", + "duration-worker": "5m", + "footprint-worker": "10m" }, - { - "name": "woody", - "metricDataRepository": { - "kind": "cc-metric-store", - "url": "http://localhost:8082", - "token": "-" - }, - "filterRanges": { - "numNodes": { - "from": 1, - "to": 1 - }, - "duration": { - "from": 0, - "to": 172800 - }, - "startTime": { - "from": "2020-01-01T00:00:00Z", - "to": null - } - } + "archive": { + "kind": "file", + "path": "./var/job-archive" }, - { - "name": "tinyfat", - "metricDataRepository": { - "kind": "cc-metric-store", - "url": "http://localhost:8082", - "token": "-" - }, - "filterRanges": { - "numNodes": { - "from": 1, - "to": 1 + "auth": { + "ldap": { + "url": "ldaps://hpcldap.rrze.uni-erlangen.de", + "user-base": "ou=people,ou=hpc,dc=rrze,dc=uni-erlangen,dc=de", + "search-dn": "cn=hpcmonitoring,ou=roadm,ou=profile,ou=hpc,dc=rrze,dc=uni-erlangen,dc=de", + "user-bind": "uid={username},ou=people,ou=hpc,dc=rrze,dc=uni-erlangen,dc=de", + "user-filter": "(&(objectclass=posixAccount))", + "sync-interval": "24h" }, - "duration": { - "from": 0, - "to": 172800 - }, - 
"startTime": { - "from": "2020-01-01T00:00:00Z", - "to": null + "jwts": { + "sync-user-on-login": true, + "update-user-on-login": true, + "validate-user": false, + "max-age": "1h", + "trusted-issuer": "https://portal.hpc.fau.de/" } - } }, - { - "name": "tinygpu", - "metricDataRepository": { - "kind": "cc-metric-store", - "url": "http://localhost:8082", - "token": "-" - }, - "filterRanges": { - "numNodes": { - "from": 1, - "to": 1 + "metric-store": { + "checkpoints": { + "file-format": "wal", + "directory": "./var/metric-checkpoints" }, - "duration": { - "from": 0, - "to": 172800 + "cleanup": { + "mode": "archive", + "directory": "./var/metric-archive" }, - "startTime": { - "from": "2020-01-01T00:00:00Z", - "to": null - } - } + "nats-subscriptions": [ + { + "subscribe-to": "ccmetrics.>" + } + ], + "retention-in-memory": "24h", + "memory-cap": 200 }, - { - "name": "meggie", - "metricDataRepository": { - "kind": "cc-metric-store", - "url": "http://localhost:8082", - "token": "-" - }, - "filterRanges": { - "numNodes": { - "from": 1, - "to": 64 - }, - "duration": { - "from": 0, - "to": 86400 - }, - "startTime": { - "from": "2018-01-01T00:00:00Z", - "to": null + "archive": { + "kind": "file", + "path": "./var/job-archive", + "compression": 7, + "retention": { + "policy": "none" } - } - } - ] + }, + "nats": { + "address": "nats://monitoring.nhr.fau.de:4222", + "username": "metricstore", + "password": "XXX" + }, + "ui-file": "uiConfig.json" } diff --git a/nhr@fau/cc-backend/uiConfig.json b/nhr@fau/cc-backend/uiConfig.json new file mode 100644 index 0000000..999e3db --- /dev/null +++ b/nhr@fau/cc-backend/uiConfig.json @@ -0,0 +1,59 @@ +{ + "job-view": { + "show-polar-plot":true, + "show-footprint": true, + "show-roofline": true, + "show-stat-table": true + }, + "metric-config": { + "job-list-metrics": ["cpu_load", "flops_any", "mem_bw", "mem_used"], + "job-view-plot-metrics": ["cpu_load", "cpu_user", "flops_any", "mem_bw", "mem_used", "clock", "ipc", "cpu_power", 
"nfs4_total"], + "job-view-table-metrics": ["mem_bw", "flops_any", "mem_used"], + "clusters": [ + { + "name": "fritz", + "job-view-plot-metrics": ["cpu_load", "cpu_user", "flops_any", "vectorization_ratio", "mem_bw", "mem_used", "flops_dp", "flops_sp", "ib_recv", "ib_xmit", "clock", "ipc", "cpu_power", "mem_power", "nfs4_total"] + }, + { + "name": "alex", + "job-list-metrics": ["acc_utilization", "acc_mem_used", "cpu_load", "flops_any", "mem_bw", "mem_used"], + "job-view-plot-metrics": ["acc_utilization", "nv_mem_util", "acc_mem_used", "acc_power", "nv_sm_clock", "nv_temp", "cpu_load", "cpu_user", "flops_any", "mem_bw", "mem_used", "clock", "ipc", "cpu_power", "nfs4_total"], + "job-view-table-metrics": ["acc_utilization", "mem_bw", "flops_any", "mem_used"] + }, + { + "name": "tinygpu", + "job-list-metrics": ["acc_utilization", "acc_mem_used", "cpu_load", "flops_any", "mem_bw", "mem_used"], + "job-view-plot-metrics": ["acc_utilization", "nv_mem_util", "acc_mem_used", "acc_power", "nv_sm_clock", "nv_temp", "cpu_load", "cpu_user", "flops_any", "mem_bw", "mem_used", "clock", "ipc", "cpu_power", "nfs4_total"], + "job-view-table-metrics": ["acc_utilization", "mem_bw", "flops_any", "mem_used"] + }, + { + "name": "helma", + "job-list-metrics": ["acc_utilization", "acc_mem_used", "cpu_load", "flops_any", "mem_bw", "mem_used"], + "job-view-plot-metrics": ["acc_utilization", "nv_mem_util", "acc_mem_used", "acc_power", "nv_sm_clock", "nv_temp", "cpu_load", "cpu_user", "flops_any", "mem_bw", "mem_used", "clock", "ipc", "cpu_power", "ib_recv", "ib_xmit", "nfs4_total"], + "job-view-table-metrics": ["acc_utilization", "mem_bw", "flops_any", "mem_used"], + "sub-clusters": [ + { + "name": "cpu", + "job-list-metrics": ["cpu_load", "flops_any", "mem_bw", "mem_used"], + "job-view-plot-metrics": [ "cpu_load", "cpu_user", "flops_any", "mem_bw", "mem_used", "clock", "ipc", "cpu_power", "flops_dp", "flops_sp", "ib_recv", "ib_xmit", "nfs4_total"], + "job-view-table-metrics": ["mem_bw", 
"flops_any", "mem_used"] + } + ] + } + ] + }, + "plot-configuration": { + "plots-per-row": 3, + "color-background": true, + "line-width": 3, + "color-scheme": [ + "#00bfff", + "#0000ff", + "#ff00ff", + "#ff0000", + "#ff8000", + "#ffff00", + "#80ff00" + ] + } +} diff --git a/nhr@fau/cc-metric-store/cc-metric-store.service b/nhr@fau/cc-metric-store/cc-metric-store.service deleted file mode 100644 index 210f2de..0000000 --- a/nhr@fau/cc-metric-store/cc-metric-store.service +++ /dev/null @@ -1,19 +0,0 @@ -[Unit] -Description=ClusterCockpit In-Memory Timeseries Database for Fritz (cc-metric-store) -Documentation=https://github.com/ClusterCockpit/cc-metric-store -Wants=network-online.target -After=network-online.target - -[Service] -Type=simple -User=clustercockpit -Group=clustercockpit -Restart=on-failure -RestartSec=30 -TimeoutStopSec=100 -WorkingDirectory=/opt/monitoring/cc-metric-store/fritz -ExecStart=/opt/monitoring/cc-metric-store/repo/cc-metric-store --config ./config.json -LimitNOFILE=500000 - -[Install] -WantedBy=multi-user.target diff --git a/nhr@fau/cc-metric-store/config.json b/nhr@fau/cc-metric-store/config.json deleted file mode 100644 index 1915608..0000000 --- a/nhr@fau/cc-metric-store/config.json +++ /dev/null @@ -1,180 +0,0 @@ -{ - "metrics": { - "clock": { - "frequency": 60, - "aggregation": "avg" - }, - "cpu_idle": { - "frequency": 60, - "aggregation": "avg" - }, - "cpu_iowait": { - "frequency": 60, - "aggregation": "avg" - }, - "cpu_irq": { - "frequency": 60, - "aggregation": "avg" - }, - "cpu_system": { - "frequency": 60, - "aggregation": "avg" - }, - "cpu_user": { - "frequency": 60, - "aggregation": "avg" - }, - "nv_mem_util": { - "frequency": 60, - "aggregation": "avg" - }, - "nv_temp": { - "frequency": 60, - "aggregation": "avg" - }, - "nv_sm_clock": { - "frequency": 60, - "aggregation": "avg" - }, - "acc_utilization": { - "frequency": 60, - "aggregation": "avg" - }, - "acc_mem_used": { - "frequency": 60, - "aggregation": "sum" - }, - 
"acc_power": { - "frequency": 60, - "aggregation": "sum" - }, - "flops_any": { - "frequency": 60, - "aggregation": "sum" - }, - "flops_dp": { - "frequency": 60, - "aggregation": "sum" - }, - "flops_sp": { - "frequency": 60, - "aggregation": "sum" - }, - "ib_recv": { - "frequency": 60, - "aggregation": "sum" - }, - "ib_xmit": { - "frequency": 60, - "aggregation": "sum" - }, - "ib_recv_pkts": { - "frequency": 60, - "aggregation": "sum" - }, - "ib_xmit_pkts": { - "frequency": 60, - "aggregation": "sum" - }, - "cpu_power": { - "frequency": 60, - "aggregation": "sum" - }, - "core_power": { - "frequency": 60, - "aggregation": "sum" - }, - "mem_power": { - "frequency": 60, - "aggregation": "sum" - }, - "ipc": { - "frequency": 60, - "aggregation": "avg" - }, - "cpu_load": { - "frequency": 60, - "aggregation": null - }, - "lustre_close": { - "frequency": 60, - "aggregation": null - }, - "lustre_open": { - "frequency": 60, - "aggregation": null - }, - "lustre_statfs": { - "frequency": 60, - "aggregation": null - }, - "lustre_read_bytes": { - "frequency": 60, - "aggregation": null - }, - "lustre_write_bytes": { - "frequency": 60, - "aggregation": null - }, - "net_bw": { - "frequency": 60, - "aggregation": null - }, - "file_bw": { - "frequency": 60, - "aggregation": null - }, - "mem_bw": { - "frequency": 60, - "aggregation": "sum" - }, - "mem_cached": { - "frequency": 60, - "aggregation": null - }, - "mem_used": { - "frequency": 60, - "aggregation": null - }, - "net_bytes_in": { - "frequency": 60, - "aggregation": null - }, - "net_bytes_out": { - "frequency": 60, - "aggregation": null - }, - "nfs4_read": { - "frequency": 60, - "aggregation": null - }, - "nfs4_total": { - "frequency": 60, - "aggregation": null - }, - "nfs4_write": { - "frequency": 60, - "aggregation": null - }, - "vectorization_ratio": { - "frequency": 60, - "aggregation": "avg" - } - }, - "checkpoints": { - "interval": "12h", - "directory": "/opt/monitoring/cc-metric-store/fritz/checkpoints", - "restore": 
"48h" - }, - "archive": { - "interval": "50h", - "directory": "/opt/monitoring/cc-metric-store/fritz/archive" - }, - "http-api": { - "address": "0.0.0.0:8082", - "https-cert-file": null, - "https-key-file": null - }, - "retention-in-memory": "48h", - "jwt-public-key": "-" -} diff --git a/nhr@fau/job-archive/cluster-alex.json b/nhr@fau/job-archive/cluster-alex.json index 0356e28..c0e3444 100644 --- a/nhr@fau/job-archive/cluster-alex.json +++ b/nhr@fau/job-archive/cluster-alex.json @@ -1,2809 +1,559 @@ { - "name": "alex", - "metricConfig": [ - { - "name": "cpu_load", - "unit": { - "base": "" - }, - "scope": "node", - "aggregation": "avg", - "footprint": "avg", - "timestep": 60, - "peak": 128, - "normal": 128, - "caution": 10, - "alert": 5 - }, - { - "name": "cpu_user", - "unit": { - "base": "" - }, - "scope": "hwthread", - "aggregation": "avg", - "timestep": 60, - "peak": 100, - "normal": 50, - "caution": 20, - "alert": 10 - }, - { - "name": "mem_used", - "unit": { - "base": "B", - "prefix": "G" - }, - "scope": "node", - "aggregation": "sum", - "footprint": "max", - "timestep": 60, - "peak": 512, - "normal": 128, - "caution": 200, - "alert": 240 - }, - { - "name": "flops_any", - "unit": { - "base": "Flops/s", - "prefix": "G" - }, - "scope": "hwthread", - "aggregation": "sum", - "footprint": "avg", - "timestep": 60, - "peak": 9216, - "normal": 1000, - "caution": 200, - "alert": 50 - }, - { - "name": "net_bytes_in", - "unit": { - "base": "B/s" - }, - "scope": "node", - "aggregation": "sum", - "timestep": 60, - "peak": 125000000, - "normal": 125000000, - "caution": 200, - "alert": 240 - }, - { - "name": "net_bytes_out", - "unit": { - "base": "B/s" - }, - "scope": "node", - "aggregation": "sum", - "timestep": 60, - "peak": 125000000, - "normal": 125000000, - "caution": 200, - "alert": 240 - }, - { - "name": "mem_bw", - "unit": { - "base": "B/s", - "prefix": "G" - }, - "scope": "socket", - "aggregation": "sum", - "footprint": "avg", - "timestep": 60, - "peak": 350, - 
"normal": 100, - "caution": 50, - "alert": 10 - }, - { - "name": "clock", - "unit": { - "base": "Hz", - "prefix": "M" - }, - "scope": "hwthread", - "aggregation": "avg", - "timestep": 60, - "peak": 3000, - "normal": 2400, - "caution": 1800, - "alert": 1200 - }, - { - "name": "core_power", - "unit": { - "base": "W" - }, - "scope": "hwthread", - "aggregation": "sum", - "energy": "power", - "timestep": 60, - "peak": 500, - "normal": 250, - "caution": 100, - "alert": 50 - }, - { - "name": "acc_utilization", - "unit": { - "base": "" - }, - "scope": "accelerator", - "aggregation": "avg", - "footprint": "avg", - "timestep": 60, - "peak": 100, - "normal": 80, - "caution": 50, - "alert": 20 - }, - { - "name": "acc_mem_used", - "unit": { - "base": "B", - "prefix": "M" - }, - "scope": "accelerator", - "aggregation": "sum", - "footprint": "max", - "timestep": 60, - "peak": 320000, - "normal": 160000, - "caution": 80000, - "alert": 40000, - "subClusters": [ + "name": "alex", + "metricConfig": [ { - "name": "a100m80", - "peak": 640000, - "normal": 320000, - "caution": 160000, - "alert": 80000, - "footprint": "max" - } - ] - }, + "name": "cpu_load", + "unit": { + "base": "" + }, + "scope": "node", + "aggregation": "avg", + "footprint": "avg", + "timestep": 60, + "peak": 128, + "normal": 128, + "caution": 10, + "alert": 5 + }, + { + "name": "cpu_user", + "unit": { + "base": "" + }, + "scope": "hwthread", + "aggregation": "avg", + "timestep": 60, + "peak": 100, + "normal": 50, + "caution": 20, + "alert": 10 + }, + { + "name": "mem_used", + "unit": { + "base": "B", + "prefix": "G" + }, + "scope": "node", + "aggregation": "sum", + "footprint": "max", + "timestep": 60, + "lowerIsBetter": true, + "peak": 512, + "normal": 128, + "caution": 200, + "alert": 240 + }, + { + "name": "flops_any", + "unit": { + "base": "Flops/s", + "prefix": "G" + }, + "scope": "hwthread", + "aggregation": "sum", + "footprint": "avg", + "timestep": 60, + "peak": 9216, + "normal": 1000, + "caution": 200, + 
"alert": 50 + }, + { + "name": "net_bytes_in", + "unit": { + "base": "B/s" + }, + "scope": "node", + "aggregation": "sum", + "timestep": 60, + "peak": 50000000, + "normal": 10000000, + "caution": 5000, + "alert": 1000 + }, + { + "name": "net_bytes_out", + "unit": { + "base": "B/s" + }, + "scope": "node", + "aggregation": "sum", + "timestep": 60, + "peak": 50000000, + "normal": 200000, + "caution": 5000, + "alert": 1000 + }, + { + "name": "mem_bw", + "unit": { + "base": "B/s", + "prefix": "G" + }, + "scope": "socket", + "aggregation": "sum", + "footprint": "avg", + "timestep": 60, + "peak": 350, + "normal": 100, + "caution": 50, + "alert": 10 + }, + { + "name": "clock", + "unit": { + "base": "Hz", + "prefix": "M" + }, + "scope": "hwthread", + "aggregation": "avg", + "timestep": 60, + "peak": 3000, + "normal": 2400, + "caution": 1800, + "alert": 1200 + }, + { + "name": "core_power", + "unit": { + "base": "W" + }, + "scope": "hwthread", + "aggregation": "sum", + "energy": "power", + "timestep": 60, + "peak": 500, + "normal": 250, + "caution": 100, + "alert": 50 + }, + { + "name": "acc_utilization", + "unit": { + "base": "" + }, + "scope": "accelerator", + "aggregation": "avg", + "footprint": "avg", + "timestep": 60, + "peak": 100, + "normal": 80, + "caution": 50, + "alert": 20 + }, + { + "name": "acc_mem_used", + "unit": { + "base": "B", + "prefix": "M" + }, + "scope": "accelerator", + "aggregation": "sum", + "footprint": "max", + "timestep": 60, + "lowerIsBetter": true, + "peak": 320000, + "normal": 160000, + "caution": 80000, + "alert": 40000, + "subClusters": [ + { + "name": "a100m80", + "peak": 640000, + "normal": 320000, + "caution": 160000, + "alert": 80000, + "footprint":"max" + } + ] + }, + { + "name": "acc_power", + "unit": { + "base": "W" + }, + "scope": "accelerator", + "aggregation": "sum", + "energy": "power", + "timestep": 60, + "peak": 3200, + "normal": 1600, + "caution": 400, + "alert": 160 + }, + { + "name": "nv_mem_util", + "unit": { + "base": "" + 
}, + "scope": "accelerator", + "aggregation": "avg", + "timestep": 60, + "peak": 100, + "normal": 80, + "caution": 20, + "alert": 10 + }, + { + "name": "nv_temp", + "unit": { + "base": "°C" + }, + "scope": "accelerator", + "aggregation": "avg", + "timestep": 60, + "peak": 40, + "normal": 20, + "caution": 5, + "alert": 2 + }, + { + "name": "nv_sm_clock", + "unit": { + "base": "Hz", + "prefix": "M" + }, + "scope": "accelerator", + "aggregation": "avg", + "timestep": 60, + "peak": 1400, + "normal": 1200, + "caution": 100, + "alert": 50 + }, + { + "name": "cpu_power", + "unit": { + "base": "W" + }, + "scope": "socket", + "aggregation": "sum", + "energy": "power", + "timestep": 60, + "peak": 500, + "normal": 250, + "caution": 100, + "alert": 50 + }, + { + "name": "ipc", + "unit": { + "base": "IPC" + }, + "scope": "hwthread", + "aggregation": "avg", + "timestep": 60, + "peak": 4, + "normal": 2, + "caution": 1, + "alert": 0.5 + }, { - "name": "acc_power", + "name": "nfs4_read", "unit": { - "base": "W" + "base": "IOP", + "prefix": "" }, - "scope": "accelerator", + "scope": "node", "aggregation": "sum", - "energy": "power", "timestep": 60, - "peak": 3200, - "normal": 1600, - "caution": 400, - "alert": 160 + "peak": 50000, + "normal": 10000, + "caution": 10, + "alert": 1 }, { - "name": "nv_mem_util", + "name": "nfs4_total", "unit": { - "base": "" + "base": "IOP", + "prefix": "" }, - "scope": "accelerator", - "aggregation": "avg", + "scope": "node", + "aggregation": "sum", "timestep": 60, - "peak": 100, - "normal": 80, + "peak": 50000, + "normal": 10000, "caution": 20, - "alert": 10 - }, - { - "name": "nv_temp", - "unit": { - "base": "°C" - }, - "scope": "accelerator", - "aggregation": "avg", - "timestep": 60, - "peak": 40, - "normal": 20, - "caution": 5, - "alert": 2 - }, - { - "name": "nv_sm_clock", - "unit": { - "base": "Hz", - "prefix": "M" - }, - "scope": "accelerator", - "aggregation": "avg", - "timestep": 60, - "peak": 1400, - "normal": 1200, - "caution": 100, - 
"alert": 50 - }, - { - "name": "cpu_power", - "unit": { - "base": "W" - }, - "scope": "socket", - "aggregation": "sum", - "energy": "power", - "timestep": 60, - "peak": 500, - "normal": 250, - "caution": 100, - "alert": 50 - }, - { - "name": "ipc", - "unit": { - "base": "IPC" - }, - "scope": "hwthread", - "aggregation": "avg", - "timestep": 60, - "peak": 4, - "normal": 2, - "caution": 1, - "alert": 0.5 + "alert": 5 } - ], - "subClusters": [ - { - "name": "a40", - "nodes": "a[0121-0129],a[0221-0229],a[0321-0329],a[0421-0429],a[0521-0522],a[1621-1624],a[1721-1722]", - "processorType": "AMD Milan", - "socketsPerNode": 2, - "coresPerSocket": 64, - "threadsPerCore": 1, - "flopRateScalar": { - "unit": { - "base": "F/s", - "prefix": "G" + ], + "subClusters": [ + { + "name": "a40", + "nodes": "a[0121-0129],a[0221-0229],a[0321-0329],a[0421-0429],a[0521-0522],a[1621-1624],a[1721-1722]", + "processorType": "AMD Milan", + "socketsPerNode": 2, + "coresPerSocket": 64, + "threadsPerCore": 1, + "flopRateScalar": { + "unit": { + "base": "F/s", + "prefix": "G" + }, + "value": 432 + }, + "flopRateSimd": { + "unit": { + "base": "F/s", + "prefix": "G" + }, + "value": 9216 + }, + "memoryBandwidth": { + "unit": { + "base": "B/s", + "prefix": "G" + }, + "value": 400 + }, + "topology": { + "node": [ + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127 + ], + "socket": [ + [ + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 
39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63 + ], + [ + 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127 + ] + ], + "memoryDomain": [ + [ + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127 + ] + ], + "core": [ + [ 0 ], [ 1 ], [ 2 ], [ 3 ], [ 4 ], [ 5 ], [ 6 ], [ 7 ], [ 8 ], [ 9 ], [ 10 ], [ 11 ], [ 12 ], [ 13 ], [ 14 ], [ 15 ], [ 16 ], [ 17 ], [ 18 ], [ 19 ], [ 20 ], [ 21 ], [ 22 ], [ 23 ], [ 24 ], [ 25 ], [ 26 ], [ 27 ], [ 28 ], [ 29 ], [ 30 ], [ 31 ], [ 32 ], [ 33 ], [ 34 ], [ 35 ], [ 36 ], [ 37 ], [ 38 ], [ 39 ], [ 40 ], [ 41 ], [ 42 ], [ 43 ], [ 44 ], [ 45 ], [ 46 ], [ 47 ], [ 48 ], [ 49 ], [ 50 ], [ 51 ], [ 52 ], [ 53 ], [ 54 ], [ 55 ], [ 56 ], [ 57 ], [ 58 ], [ 59 ], [ 60 ], [ 61 ], [ 62 ], [ 63 ], [ 64 ], [ 65 ], [ 66 ], [ 67 ], [ 68 ], [ 69 ], [ 70 ], [ 71 ], [ 72 ], [ 73 ], [ 74 ], [ 75 ], [ 76 ], [ 77 ], [ 78 ], [ 79 ], [ 80 ], [ 81 ], [ 82 ], [ 83 ], [ 84 ], [ 85 ], [ 86 ], [ 87 ], [ 88 ], [ 89 ], [ 90 ], [ 91 ], [ 92 ], [ 93 ], [ 94 ], [ 95 ], [ 96 ], [ 97 ], [ 98 ], [ 99 ], [ 100 ], [ 101 ], [ 102 ], [ 103 ], [ 104 ], [ 105 ], [ 106 ], [ 107 ], [ 108 ], [ 109 ], [ 110 ], [ 111 ], [ 112 ], [ 113 ], [ 114 ], [ 115 ], [ 116 ], [ 117 ], [ 118 ], [ 119 ], [ 120 ], [ 121 ], [ 122 ], [ 123 ], [ 124 ], [ 125 ], [
126 ], [ 127 ] + ], + "accelerators": [ + { + "id": "00000000:01:00.0", + "type": "Nvidia GPU", + "model": "A40" + }, + { + "id": "00000000:25:00.0", + "type": "Nvidia GPU", + "model": "A40" + }, + { + "id": "00000000:41:00.0", + "type": "Nvidia GPU", + "model": "A40" + }, + { + "id": "00000000:61:00.0", + "type": "Nvidia GPU", + "model": "A40" + }, + { + "id": "00000000:81:00.0", + "type": "Nvidia GPU", + "model": "A40" + }, + { + "id": "00000000:A1:00.0", + "type": "Nvidia GPU", + "model": "A40" + }, + { + "id": "00000000:C1:00.0", + "type": "Nvidia GPU", + "model": "A40" + }, + { + "id": "00000000:E1:00.0", + "type": "Nvidia GPU", + "model": "A40" + } + ] + } }, - "value": 432 - }, - "flopRateSimd": { - "unit": { - "base": "F/s", - "prefix": "G" + { + "name": "a100", + "nodes": "a[0601-0605],a[0701-0705],a[0801-0805],a[0901-0905]", + "processorType": "AMD Milan", + "socketsPerNode": 2, + "coresPerSocket": 64, + "threadsPerCore": 1, + "flopRateScalar": { + "unit": { + "base": "F/s", + "prefix": "G" + }, + "value": 432 + }, + "flopRateSimd": { + "unit": { + "base": "F/s", + "prefix": "G" + }, + "value": 9216 + }, + "memoryBandwidth": { + "unit": { + "base": "B/s", + "prefix": "G" + }, + "value": 400 + }, + "topology": { + "node": [ + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127 + ], + "socket": [ + [ + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 
50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63 + ], + [ + 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127 + ] + ], + "memoryDomain": [ + [ + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127 + ] + ], + "core": [ + [ 0 ], [ 1 ], [ 2 ], [ 3 ], [ 4 ], [ 5 ], [ 6 ], [ 7 ], [ 8 ], [ 9 ], [ 10 ], [ 11 ], [ 12 ], [ 13 ], [ 14 ], [ 15 ], [ 16 ], [ 17 ], [ 18 ], [ 19 ], [ 20 ], [ 21 ], [ 22 ], [ 23 ], [ 24 ], [ 25 ], [ 26 ], [ 27 ], [ 28 ], [ 29 ], [ 30 ], [ 31 ], [ 32 ], [ 33 ], [ 34 ], [ 35 ], [ 36 ], [ 37 ], [ 38 ], [ 39 ], [ 40 ], [ 41 ], [ 42 ], [ 43 ], [ 44 ], [ 45 ], [ 46 ], [ 47 ], [ 48 ], [ 49 ], [ 50 ], [ 51 ], [ 52 ], [ 53 ], [ 54 ], [ 55 ], [ 56 ], [ 57 ], [ 58 ], [ 59 ], [ 60 ], [ 61 ], [ 62 ], [ 63 ], [ 64 ], [ 65 ], [ 66 ], [ 67 ], [ 68 ], [ 69 ], [ 70 ], [ 71 ], [ 72 ], [ 73 ], [ 74 ], [ 75 ], [ 76 ], [ 77 ], [ 78 ], [ 79 ], [ 80 ], [ 81 ], [ 82 ], [ 83 ], [ 84 ], [ 85 ], [ 86 ], [ 87 ], [ 88 ], [ 89 ], [ 90 ], [ 91 ], [ 92 ], [ 93 ], [ 94 ], [ 95 ], [ 96 ], [ 97 ], [ 98 ], [ 99 ], [ 100 ], [ 101 ], [ 102 ], [ 103 ], [ 104 ], [ 105 ], [ 106 ], [ 107 ], [ 108 ], [ 109 ], [ 110 ], [ 111 ], [ 112 ], [ 113 ], [ 114 ], [ 115 ], [ 116 ], [ 117 ], [ 118 ], [ 119 ], [ 120 ], [ 121 ], [ 122 ], [ 123 ], [ 124 ], [ 125 ], [ 126 ], [ 127 ] + ], + "accelerators": [ + { +
"id": "00000000:0E:00.0", + "type": "Nvidia GPU", + "model": "A100" + }, + { + "id": "00000000:13:00.0", + "type": "Nvidia GPU", + "model": "A100" + }, + { + "id": "00000000:49:00.0", + "type": "Nvidia GPU", + "model": "A100" + }, + { + "id": "00000000:4F:00.0", + "type": "Nvidia GPU", + "model": "A100" + }, + { + "id": "00000000:90:00.0", + "type": "Nvidia GPU", + "model": "A100" + }, + { + "id": "00000000:96:00.0", + "type": "Nvidia GPU", + "model": "A100" + }, + { + "id": "00000000:CC:00.0", + "type": "Nvidia GPU", + "model": "A100" + }, + { + "id": "00000000:D1:00.0", + "type": "Nvidia GPU", + "model": "A100" + } + ] + } }, - "value": 9216 - }, - "memoryBandwidth": { - "unit": { - "base": "B/s", - "prefix": "G" - }, - "value": 400 - }, - "topology": { - "node": [ - 0, - 1, - 2, - 3, - 4, - 5, - 6, - 7, - 8, - 9, - 10, - 11, - 12, - 13, - 14, - 15, - 16, - 17, - 18, - 19, - 20, - 21, - 22, - 23, - 24, - 25, - 26, - 27, - 28, - 29, - 30, - 31, - 32, - 33, - 34, - 35, - 36, - 37, - 38, - 39, - 40, - 41, - 42, - 43, - 44, - 45, - 46, - 47, - 48, - 49, - 50, - 51, - 52, - 53, - 54, - 55, - 56, - 57, - 58, - 59, - 60, - 61, - 62, - 63, - 64, - 65, - 66, - 67, - 68, - 69, - 70, - 71, - 72, - 73, - 74, - 75, - 76, - 77, - 78, - 79, - 80, - 81, - 82, - 83, - 84, - 85, - 86, - 87, - 88, - 89, - 90, - 91, - 92, - 93, - 94, - 95, - 96, - 97, - 98, - 99, - 100, - 101, - 102, - 103, - 104, - 105, - 106, - 107, - 108, - 109, - 110, - 111, - 112, - 113, - 114, - 115, - 116, - 117, - 118, - 119, - 120, - 121, - 122, - 123, - 124, - 125, - 126, - 127 - ], - "socket": [ - [ - 0, - 1, - 2, - 3, - 4, - 5, - 6, - 7, - 8, - 9, - 10, - 11, - 12, - 13, - 14, - 15, - 16, - 17, - 18, - 19, - 20, - 21, - 22, - 23, - 24, - 25, - 26, - 27, - 28, - 29, - 30, - 31, - 32, - 33, - 34, - 35, - 36, - 37, - 38, - 39, - 40, - 41, - 42, - 43, - 44, - 45, - 46, - 47, - 48, - 49, - 50, - 51, - 52, - 53, - 54, - 55, - 56, - 57, - 58, - 59, - 60, - 61, - 62, - 63 - ], - [ - 64, - 65, - 66, - 67, - 68, - 
69, - 70, - 71, - 72, - 73, - 74, - 75, - 76, - 77, - 78, - 79, - 80, - 81, - 82, - 83, - 84, - 85, - 86, - 87, - 88, - 89, - 90, - 91, - 92, - 93, - 94, - 95, - 96, - 97, - 98, - 99, - 100, - 101, - 102, - 103, - 104, - 105, - 106, - 107, - 108, - 109, - 110, - 111, - 112, - 113, - 114, - 115, - 116, - 117, - 118, - 119, - 120, - 121, - 122, - 123, - 124, - 125, - 126, - 127 - ] - ], - "memoryDomain": [ - [ - 0, - 1, - 2, - 3, - 4, - 5, - 6, - 7, - 8, - 9, - 10, - 11, - 12, - 13, - 14, - 15, - 16, - 17, - 18, - 19, - 20, - 21, - 22, - 23, - 24, - 25, - 26, - 27, - 28, - 29, - 30, - 31, - 32, - 33, - 34, - 35, - 36, - 37, - 38, - 39, - 40, - 41, - 42, - 43, - 44, - 45, - 46, - 47, - 48, - 49, - 50, - 51, - 52, - 53, - 54, - 55, - 56, - 57, - 58, - 59, - 60, - 61, - 62, - 63, - 64, - 65, - 66, - 67, - 68, - 69, - 70, - 71, - 72, - 73, - 74, - 75, - 76, - 77, - 78, - 79, - 80, - 81, - 82, - 83, - 84, - 85, - 86, - 87, - 88, - 89, - 90, - 91, - 92, - 93, - 94, - 95, - 96, - 97, - 98, - 99, - 100, - 101, - 102, - 103, - 104, - 105, - 106, - 107, - 108, - 109, - 110, - 111, - 112, - 113, - 114, - 115, - 116, - 117, - 118, - 119, - 120, - 121, - 122, - 123, - 124, - 125, - 126, - 127 - ] - ], - "core": [ - [ - 0 - ], - [ - 1 - ], - [ - 2 - ], - [ - 3 - ], - [ - 4 - ], - [ - 5 - ], - [ - 6 - ], - [ - 7 - ], - [ - 8 - ], - [ - 9 - ], - [ - 10 - ], - [ - 11 - ], - [ - 12 - ], - [ - 13 - ], - [ - 14 - ], - [ - 15 - ], - [ - 16 - ], - [ - 17 - ], - [ - 18 - ], - [ - 19 - ], - [ - 20 - ], - [ - 21 - ], - [ - 22 - ], - [ - 23 - ], - [ - 24 - ], - [ - 25 - ], - [ - 26 - ], - [ - 27 - ], - [ - 28 - ], - [ - 29 - ], - [ - 30 - ], - [ - 31 - ], - [ - 32 - ], - [ - 33 - ], - [ - 34 - ], - [ - 35 - ], - [ - 36 - ], - [ - 37 - ], - [ - 38 - ], - [ - 39 - ], - [ - 40 - ], - [ - 41 - ], - [ - 42 - ], - [ - 43 - ], - [ - 44 - ], - [ - 45 - ], - [ - 46 - ], - [ - 47 - ], - [ - 48 - ], - [ - 49 - ], - [ - 50 - ], - [ - 51 - ], - [ - 52 - ], - [ - 53 - ], - [ - 54 - ], - [ - 55 - ], - [ - 
56 - ], - [ - 57 - ], - [ - 58 - ], - [ - 59 - ], - [ - 60 - ], - [ - 61 - ], - [ - 62 - ], - [ - 63 - ], - [ - 64 - ], - [ - 65 - ], - [ - 66 - ], - [ - 67 - ], - [ - 68 - ], - [ - 69 - ], - [ - 70 - ], - [ - 71 - ], - [ - 73 - ], - [ - 74 - ], - [ - 75 - ], - [ - 76 - ], - [ - 77 - ], - [ - 78 - ], - [ - 79 - ], - [ - 80 - ], - [ - 81 - ], - [ - 82 - ], - [ - 83 - ], - [ - 84 - ], - [ - 85 - ], - [ - 86 - ], - [ - 87 - ], - [ - 88 - ], - [ - 89 - ], - [ - 90 - ], - [ - 91 - ], - [ - 92 - ], - [ - 93 - ], - [ - 94 - ], - [ - 95 - ], - [ - 96 - ], - [ - 97 - ], - [ - 98 - ], - [ - 99 - ], - [ - 100 - ], - [ - 101 - ], - [ - 102 - ], - [ - 103 - ], - [ - 104 - ], - [ - 105 - ], - [ - 106 - ], - [ - 107 - ], - [ - 108 - ], - [ - 109 - ], - [ - 110 - ], - [ - 111 - ], - [ - 112 - ], - [ - 113 - ], - [ - 114 - ], - [ - 115 - ], - [ - 116 - ], - [ - 117 - ], - [ - 118 - ], - [ - 119 - ], - [ - 120 - ], - [ - 121 - ], - [ - 122 - ], - [ - 123 - ], - [ - 124 - ], - [ - 125 - ], - [ - 126 - ], - [ - 127 - ] - ], - "accelerators": [ - { - "id": "00000000:01:00.0", - "type": "Nvidia GPU", - "model": "A40" - }, - { - "id": "00000000:25:00.0", - "type": "Nvidia GPU", - "model": "A40" - }, - { - "id": "00000000:41:00.0", - "type": "Nvidia GPU", - "model": "A40" - }, - { - "id": "00000000:61:00.0", - "type": "Nvidia GPU", - "model": "A40" - }, - { - "id": "00000000:81:00.0", - "type": "Nvidia GPU", - "model": "A40" - }, - { - "id": "00000000:A1:00.0", - "type": "Nvidia GPU", - "model": "A40" - }, - { - "id": "00000000:C1:00.0", - "type": "Nvidia GPU", - "model": "A40" - }, - { - "id": "00000000:E1:00.0", - "type": "Nvidia GPU", - "model": "A40" - } - ] - } - }, - { - "name": "a100", - "nodes": "a[0601-0605],a[0701-0705],a[0801-0805],a[0901-0905]", - "processorType": "AMD Milan", - "socketsPerNode": 2, - "coresPerSocket": 64, - "threadsPerCore": 1, - "flopRateScalar": { - "unit": { - "base": "F/s", - "prefix": "G" - }, - "value": 432 - }, - "flopRateSimd": { - "unit": { - "base": 
"F/s", - "prefix": "G" - }, - "value": 9216 - }, - "memoryBandwidth": { - "unit": { - "base": "B/s", - "prefix": "G" - }, - "value": 400 - }, - "topology": { - "node": [ - 0, - 1, - 2, - 3, - 4, - 5, - 6, - 7, - 8, - 9, - 10, - 11, - 12, - 13, - 14, - 15, - 16, - 17, - 18, - 19, - 20, - 21, - 22, - 23, - 24, - 25, - 26, - 27, - 28, - 29, - 30, - 31, - 32, - 33, - 34, - 35, - 36, - 37, - 38, - 39, - 40, - 41, - 42, - 43, - 44, - 45, - 46, - 47, - 48, - 49, - 50, - 51, - 52, - 53, - 54, - 55, - 56, - 57, - 58, - 59, - 60, - 61, - 62, - 63, - 64, - 65, - 66, - 67, - 68, - 69, - 70, - 71, - 72, - 73, - 74, - 75, - 76, - 77, - 78, - 79, - 80, - 81, - 82, - 83, - 84, - 85, - 86, - 87, - 88, - 89, - 90, - 91, - 92, - 93, - 94, - 95, - 96, - 97, - 98, - 99, - 100, - 101, - 102, - 103, - 104, - 105, - 106, - 107, - 108, - 109, - 110, - 111, - 112, - 113, - 114, - 115, - 116, - 117, - 118, - 119, - 120, - 121, - 122, - 123, - 124, - 125, - 126, - 127 - ], - "socket": [ - [ - 0, - 1, - 2, - 3, - 4, - 5, - 6, - 7, - 8, - 9, - 10, - 11, - 12, - 13, - 14, - 15, - 16, - 17, - 18, - 19, - 20, - 21, - 22, - 23, - 24, - 25, - 26, - 27, - 28, - 29, - 30, - 31, - 32, - 33, - 34, - 35, - 36, - 37, - 38, - 39, - 40, - 41, - 42, - 43, - 44, - 45, - 46, - 47, - 48, - 49, - 50, - 51, - 52, - 53, - 54, - 55, - 56, - 57, - 58, - 59, - 60, - 61, - 62, - 63 - ], - [ - 64, - 65, - 66, - 67, - 68, - 69, - 70, - 71, - 72, - 73, - 74, - 75, - 76, - 77, - 78, - 79, - 80, - 81, - 82, - 83, - 84, - 85, - 86, - 87, - 88, - 89, - 90, - 91, - 92, - 93, - 94, - 95, - 96, - 97, - 98, - 99, - 100, - 101, - 102, - 103, - 104, - 105, - 106, - 107, - 108, - 109, - 110, - 111, - 112, - 113, - 114, - 115, - 116, - 117, - 118, - 119, - 120, - 121, - 122, - 123, - 124, - 125, - 126, - 127 - ] - ], - "memoryDomain": [ - [ - 0, - 1, - 2, - 3, - 4, - 5, - 6, - 7, - 8, - 9, - 10, - 11, - 12, - 13, - 14, - 15, - 16, - 17, - 18, - 19, - 20, - 21, - 22, - 23, - 24, - 25, - 26, - 27, - 28, - 29, - 30, - 31, - 32, - 33, - 
34, - 35, - 36, - 37, - 38, - 39, - 40, - 41, - 42, - 43, - 44, - 45, - 46, - 47, - 48, - 49, - 50, - 51, - 52, - 53, - 54, - 55, - 56, - 57, - 58, - 59, - 60, - 61, - 62, - 63, - 64, - 65, - 66, - 67, - 68, - 69, - 70, - 71, - 72, - 73, - 74, - 75, - 76, - 77, - 78, - 79, - 80, - 81, - 82, - 83, - 84, - 85, - 86, - 87, - 88, - 89, - 90, - 91, - 92, - 93, - 94, - 95, - 96, - 97, - 98, - 99, - 100, - 101, - 102, - 103, - 104, - 105, - 106, - 107, - 108, - 109, - 110, - 111, - 112, - 113, - 114, - 115, - 116, - 117, - 118, - 119, - 120, - 121, - 122, - 123, - 124, - 125, - 126, - 127 - ] - ], - "core": [ - [ - 0 - ], - [ - 1 - ], - [ - 2 - ], - [ - 3 - ], - [ - 4 - ], - [ - 5 - ], - [ - 6 - ], - [ - 7 - ], - [ - 8 - ], - [ - 9 - ], - [ - 10 - ], - [ - 11 - ], - [ - 12 - ], - [ - 13 - ], - [ - 14 - ], - [ - 15 - ], - [ - 16 - ], - [ - 17 - ], - [ - 18 - ], - [ - 19 - ], - [ - 20 - ], - [ - 21 - ], - [ - 22 - ], - [ - 23 - ], - [ - 24 - ], - [ - 25 - ], - [ - 26 - ], - [ - 27 - ], - [ - 28 - ], - [ - 29 - ], - [ - 30 - ], - [ - 31 - ], - [ - 32 - ], - [ - 33 - ], - [ - 34 - ], - [ - 35 - ], - [ - 36 - ], - [ - 37 - ], - [ - 38 - ], - [ - 39 - ], - [ - 40 - ], - [ - 41 - ], - [ - 42 - ], - [ - 43 - ], - [ - 44 - ], - [ - 45 - ], - [ - 46 - ], - [ - 47 - ], - [ - 48 - ], - [ - 49 - ], - [ - 50 - ], - [ - 51 - ], - [ - 52 - ], - [ - 53 - ], - [ - 54 - ], - [ - 55 - ], - [ - 56 - ], - [ - 57 - ], - [ - 58 - ], - [ - 59 - ], - [ - 60 - ], - [ - 61 - ], - [ - 62 - ], - [ - 63 - ], - [ - 64 - ], - [ - 65 - ], - [ - 66 - ], - [ - 67 - ], - [ - 68 - ], - [ - 69 - ], - [ - 70 - ], - [ - 71 - ], - [ - 73 - ], - [ - 74 - ], - [ - 75 - ], - [ - 76 - ], - [ - 77 - ], - [ - 78 - ], - [ - 79 - ], - [ - 80 - ], - [ - 81 - ], - [ - 82 - ], - [ - 83 - ], - [ - 84 - ], - [ - 85 - ], - [ - 86 - ], - [ - 87 - ], - [ - 88 - ], - [ - 89 - ], - [ - 90 - ], - [ - 91 - ], - [ - 92 - ], - [ - 93 - ], - [ - 94 - ], - [ - 95 - ], - [ - 96 - ], - [ - 97 - ], - [ - 98 - ], - [ - 99 - ], - [ - 100 - 
], - [ - 101 - ], - [ - 102 - ], - [ - 103 - ], - [ - 104 - ], - [ - 105 - ], - [ - 106 - ], - [ - 107 - ], - [ - 108 - ], - [ - 109 - ], - [ - 110 - ], - [ - 111 - ], - [ - 112 - ], - [ - 113 - ], - [ - 114 - ], - [ - 115 - ], - [ - 116 - ], - [ - 117 - ], - [ - 118 - ], - [ - 119 - ], - [ - 120 - ], - [ - 121 - ], - [ - 122 - ], - [ - 123 - ], - [ - 124 - ], - [ - 125 - ], - [ - 126 - ], - [ - 127 - ] - ], - "accelerators": [ - { - "id": "00000000:0E:00.0", - "type": "Nvidia GPU", - "model": "A100" - }, - { - "id": "00000000:13:00.0", - "type": "Nvidia GPU", - "model": "A100" - }, - { - "id": "00000000:49:00.0", - "type": "Nvidia GPU", - "model": "A100" - }, - { - "id": "00000000:4F:00.0", - "type": "Nvidia GPU", - "model": "A100" - }, - { - "id": "00000000:90:00.0", - "type": "Nvidia GPU", - "model": "A100" - }, - { - "id": "00000000:96:00.0", - "type": "Nvidia GPU", - "model": "A100" - }, - { - "id": "00000000:CC:00.0", - "type": "Nvidia GPU", - "model": "A100" - }, - { - "id": "00000000:D1:00.0", - "type": "Nvidia GPU", - "model": "A100" - } - ] - } - }, - { - "name": "a100m80", - "nodes": "a[0531-0537],a[0631-0633],a0731,a[0831-0833],a[0931-0934]", - "processorType": "AMD Milan", - "socketsPerNode": 2, - "coresPerSocket": 64, - "threadsPerCore": 1, - "flopRateScalar": { - "unit": { - "base": "F/s", - "prefix": "G" - }, - "value": 432 - }, - "flopRateSimd": { - "unit": { - "base": "F/s", - "prefix": "G" - }, - "value": 9216 - }, - "memoryBandwidth": { - "unit": { - "base": "B/s", - "prefix": "G" - }, - "value": 400 - }, - "topology": { - "node": [ - 0, - 1, - 2, - 3, - 4, - 5, - 6, - 7, - 8, - 9, - 10, - 11, - 12, - 13, - 14, - 15, - 16, - 17, - 18, - 19, - 20, - 21, - 22, - 23, - 24, - 25, - 26, - 27, - 28, - 29, - 30, - 31, - 32, - 33, - 34, - 35, - 36, - 37, - 38, - 39, - 40, - 41, - 42, - 43, - 44, - 45, - 46, - 47, - 48, - 49, - 50, - 51, - 52, - 53, - 54, - 55, - 56, - 57, - 58, - 59, - 60, - 61, - 62, - 63, - 64, - 65, - 66, - 67, - 68, - 69, - 70, - 
71, - 72, - 73, - 74, - 75, - 76, - 77, - 78, - 79, - 80, - 81, - 82, - 83, - 84, - 85, - 86, - 87, - 88, - 89, - 90, - 91, - 92, - 93, - 94, - 95, - 96, - 97, - 98, - 99, - 100, - 101, - 102, - 103, - 104, - 105, - 106, - 107, - 108, - 109, - 110, - 111, - 112, - 113, - 114, - 115, - 116, - 117, - 118, - 119, - 120, - 121, - 122, - 123, - 124, - 125, - 126, - 127 - ], - "socket": [ - [ - 0, - 1, - 2, - 3, - 4, - 5, - 6, - 7, - 8, - 9, - 10, - 11, - 12, - 13, - 14, - 15, - 16, - 17, - 18, - 19, - 20, - 21, - 22, - 23, - 24, - 25, - 26, - 27, - 28, - 29, - 30, - 31, - 32, - 33, - 34, - 35, - 36, - 37, - 38, - 39, - 40, - 41, - 42, - 43, - 44, - 45, - 46, - 47, - 48, - 49, - 50, - 51, - 52, - 53, - 54, - 55, - 56, - 57, - 58, - 59, - 60, - 61, - 62, - 63 - ], - [ - 64, - 65, - 66, - 67, - 68, - 69, - 70, - 71, - 72, - 73, - 74, - 75, - 76, - 77, - 78, - 79, - 80, - 81, - 82, - 83, - 84, - 85, - 86, - 87, - 88, - 89, - 90, - 91, - 92, - 93, - 94, - 95, - 96, - 97, - 98, - 99, - 100, - 101, - 102, - 103, - 104, - 105, - 106, - 107, - 108, - 109, - 110, - 111, - 112, - 113, - 114, - 115, - 116, - 117, - 118, - 119, - 120, - 121, - 122, - 123, - 124, - 125, - 126, - 127 - ] - ], - "memoryDomain": [ - [ - 0, - 1, - 2, - 3, - 4, - 5, - 6, - 7, - 8, - 9, - 10, - 11, - 12, - 13, - 14, - 15, - 16, - 17, - 18, - 19, - 20, - 21, - 22, - 23, - 24, - 25, - 26, - 27, - 28, - 29, - 30, - 31, - 32, - 33, - 34, - 35, - 36, - 37, - 38, - 39, - 40, - 41, - 42, - 43, - 44, - 45, - 46, - 47, - 48, - 49, - 50, - 51, - 52, - 53, - 54, - 55, - 56, - 57, - 58, - 59, - 60, - 61, - 62, - 63, - 64, - 65, - 66, - 67, - 68, - 69, - 70, - 71, - 72, - 73, - 74, - 75, - 76, - 77, - 78, - 79, - 80, - 81, - 82, - 83, - 84, - 85, - 86, - 87, - 88, - 89, - 90, - 91, - 92, - 93, - 94, - 95, - 96, - 97, - 98, - 99, - 100, - 101, - 102, - 103, - 104, - 105, - 106, - 107, - 108, - 109, - 110, - 111, - 112, - 113, - 114, - 115, - 116, - 117, - 118, - 119, - 120, - 121, - 122, - 123, - 124, - 125, - 126, - 
127 - ] - ], - "core": [ - [ - 0 - ], - [ - 1 - ], - [ - 2 - ], - [ - 3 - ], - [ - 4 - ], - [ - 5 - ], - [ - 6 - ], - [ - 7 - ], - [ - 8 - ], - [ - 9 - ], - [ - 10 - ], - [ - 11 - ], - [ - 12 - ], - [ - 13 - ], - [ - 14 - ], - [ - 15 - ], - [ - 16 - ], - [ - 17 - ], - [ - 18 - ], - [ - 19 - ], - [ - 20 - ], - [ - 21 - ], - [ - 22 - ], - [ - 23 - ], - [ - 24 - ], - [ - 25 - ], - [ - 26 - ], - [ - 27 - ], - [ - 28 - ], - [ - 29 - ], - [ - 30 - ], - [ - 31 - ], - [ - 32 - ], - [ - 33 - ], - [ - 34 - ], - [ - 35 - ], - [ - 36 - ], - [ - 37 - ], - [ - 38 - ], - [ - 39 - ], - [ - 40 - ], - [ - 41 - ], - [ - 42 - ], - [ - 43 - ], - [ - 44 - ], - [ - 45 - ], - [ - 46 - ], - [ - 47 - ], - [ - 48 - ], - [ - 49 - ], - [ - 50 - ], - [ - 51 - ], - [ - 52 - ], - [ - 53 - ], - [ - 54 - ], - [ - 55 - ], - [ - 56 - ], - [ - 57 - ], - [ - 58 - ], - [ - 59 - ], - [ - 60 - ], - [ - 61 - ], - [ - 62 - ], - [ - 63 - ], - [ - 64 - ], - [ - 65 - ], - [ - 66 - ], - [ - 67 - ], - [ - 68 - ], - [ - 69 - ], - [ - 70 - ], - [ - 71 - ], - [ - 73 - ], - [ - 74 - ], - [ - 75 - ], - [ - 76 - ], - [ - 77 - ], - [ - 78 - ], - [ - 79 - ], - [ - 80 - ], - [ - 81 - ], - [ - 82 - ], - [ - 83 - ], - [ - 84 - ], - [ - 85 - ], - [ - 86 - ], - [ - 87 - ], - [ - 88 - ], - [ - 89 - ], - [ - 90 - ], - [ - 91 - ], - [ - 92 - ], - [ - 93 - ], - [ - 94 - ], - [ - 95 - ], - [ - 96 - ], - [ - 97 - ], - [ - 98 - ], - [ - 99 - ], - [ - 100 - ], - [ - 101 - ], - [ - 102 - ], - [ - 103 - ], - [ - 104 - ], - [ - 105 - ], - [ - 106 - ], - [ - 107 - ], - [ - 108 - ], - [ - 109 - ], - [ - 110 - ], - [ - 111 - ], - [ - 112 - ], - [ - 113 - ], - [ - 114 - ], - [ - 115 - ], - [ - 116 - ], - [ - 117 - ], - [ - 118 - ], - [ - 119 - ], - [ - 120 - ], - [ - 121 - ], - [ - 122 - ], - [ - 123 - ], - [ - 124 - ], - [ - 125 - ], - [ - 126 - ], - [ - 127 - ] - ], - "accelerators": [ - { - "id": "00000000:0E:00.0", - "type": "Nvidia GPU", - "model": "A100" - }, - { - "id": "00000000:13:00.0", - "type": "Nvidia GPU", - "model": "A100" - 
}, - { - "id": "00000000:49:00.0", - "type": "Nvidia GPU", - "model": "A100" - }, - { - "id": "00000000:4F:00.0", - "type": "Nvidia GPU", - "model": "A100" - }, - { - "id": "00000000:90:00.0", - "type": "Nvidia GPU", - "model": "A100" - }, - { - "id": "00000000:96:00.0", - "type": "Nvidia GPU", - "model": "A100" - }, - { - "id": "00000000:CC:00.0", - "type": "Nvidia GPU", - "model": "A100" - }, - { - "id": "00000000:D1:00.0", - "type": "Nvidia GPU", - "model": "A100" - } - ] - } - } - ] + { + "name": "a100m80", + "nodes": "a[0531-0537],a[0631-0633],a0731,a[0831-0833],a[0931-0934]", + "processorType": "AMD Milan", + "socketsPerNode": 2, + "coresPerSocket": 64, + "threadsPerCore": 1, + "flopRateScalar": { + "unit": { + "base": "F/s", + "prefix": "G" + }, + "value": 432 + }, + "flopRateSimd": { + "unit": { + "base": "F/s", + "prefix": "G" + }, + "value": 9216 + }, + "memoryBandwidth": { + "unit": { + "base": "B/s", + "prefix": "G" + }, + "value": 400 + }, + "topology": { + "node": [ + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127 + ], + "socket": [ + [ + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63 + ], + [ + 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 
109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127 + ] + ], + "memoryDomain": [ + [ + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127 + ] + ], + "core": [ + [ 0 ], [ 1 ], [ 2 ], [ 3 ], [ 4 ], [ 5 ], [ 6 ], [ 7 ], [ 8 ], [ 9 ], [ 10 ], [ 11 ], [ 12 ], [ 13 ], [ 14 ], [ 15 ], [ 16 ], [ 17 ], [ 18 ], [ 19 ], [ 20 ], [ 21 ], [ 22 ], [ 23 ], [ 24 ], [ 25 ], [ 26 ], [ 27 ], [ 28 ], [ 29 ], [ 30 ], [ 31 ], [ 32 ], [ 33 ], [ 34 ], [ 35 ], [ 36 ], [ 37 ], [ 38 ], [ 39 ], [ 40 ], [ 41 ], [ 42 ], [ 43 ], [ 44 ], [ 45 ], [ 46 ], [ 47 ], [ 48 ], [ 49 ], [ 50 ], [ 51 ], [ 52 ], [ 53 ], [ 54 ], [ 55 ], [ 56 ], [ 57 ], [ 58 ], [ 59 ], [ 60 ], [ 61 ], [ 62 ], [ 63 ], [ 64 ], [ 65 ], [ 66 ], [ 67 ], [ 68 ], [ 69 ], [ 70 ], [ 71 ], [ 72 ], [ 73 ], [ 74 ], [ 75 ], [ 76 ], [ 77 ], [ 78 ], [ 79 ], [ 80 ], [ 81 ], [ 82 ], [ 83 ], [ 84 ], [ 85 ], [ 86 ], [ 87 ], [ 88 ], [ 89 ], [ 90 ], [ 91 ], [ 92 ], [ 93 ], [ 94 ], [ 95 ], [ 96 ], [ 97 ], [ 98 ], [ 99 ], [ 100 ], [ 101 ], [ 102 ], [ 103 ], [ 104 ], [ 105 ], [ 106 ], [ 107 ], [ 108 ], [ 109 ], [ 110 ], [ 111 ], [ 112 ], [ 113 ], [ 114 ], [ 115 ], [ 116 ], [ 117 ], [ 118 ], [ 119 ], [ 120 ], [ 121 ], [ 122 ], [ 123 ], [ 124 ], [ 125 ], [ 126 ], [ 127 ] + ], + "accelerators": [ + { + "id": "00000000:0E:00.0", + "type": "Nvidia GPU", + "model": "A100" + }, + { + "id": "00000000:13:00.0", + "type": "Nvidia GPU", + "model": "A100" + }, + { + "id": "00000000:49:00.0", + "type": "Nvidia GPU", + "model": "A100" + }, + { + "id":
"00000000:4F:00.0", + "type": "Nvidia GPU", + "model": "A100" + }, + { + "id": "00000000:90:00.0", + "type": "Nvidia GPU", + "model": "A100" + }, + { + "id": "00000000:96:00.0", + "type": "Nvidia GPU", + "model": "A100" + }, + { + "id": "00000000:CC:00.0", + "type": "Nvidia GPU", + "model": "A100" + }, + { + "id": "00000000:D1:00.0", + "type": "Nvidia GPU", + "model": "A100" + } + ] + } + } + ] } diff --git a/nhr@fau/job-archive/cluster-fritz.json b/nhr@fau/job-archive/cluster-fritz.json index 201d6b3..b565706 100644 --- a/nhr@fau/job-archive/cluster-fritz.json +++ b/nhr@fau/job-archive/cluster-fritz.json @@ -1,2293 +1,627 @@ { - "name": "fritz", - "metricConfig": [ - { - "name": "cpu_load", - "unit": { - "base": "" - }, - "scope": "node", - "aggregation": "avg", - "footprint": "avg", - "timestep": 60, - "peak": 72, - "normal": 72, - "caution": 36, - "alert": 20, - "subClusters": [ + "name": "fritz", + "metricConfig": [ { - "name": "spr1tb", - "footprint": "avg", - "peak": 104, - "normal": 104, - "caution": 52, - "alert": 20 + "name": "cpu_load", + "unit": { + "base": "" + }, + "scope": "node", + "aggregation": "avg", + "footprint": "avg", + "timestep": 60, + "peak": 72, + "normal": 72, + "caution": 36, + "alert": 20, + "subClusters": [ + { + "name": "spr1tb", + "footprint": "avg", + "peak": 104, + "normal": 104, + "caution": 52, + "alert": 20 + }, + { + "name": "spr2tb", + "footprint": "avg", + "peak": 104, + "normal": 104, + "caution": 52, + "alert": 20 + } + ] }, { - "name": "spr2tb", - "footprint": "avg", - "peak": 104, - "normal": 104, - "caution": 52, - "alert": 20 + "name": "cpu_user", + "unit": { + "base": "" + }, + "scope": "hwthread", + "aggregation": "avg", + "timestep": 60, + "peak": 100, + "normal": 50, + "caution": 20, + "alert": 10 + }, + { + "name": "mem_used", + "unit": { + "base": "B", + "prefix": "G" + }, + "scope": "node", + "aggregation": "sum", + "footprint": "max", + "timestep": 60, + "lowerIsBetter": true, + "peak": 256, + "normal": 128, + 
"caution": 200, + "alert": 240, + "subClusters": [ + { + "name": "spr1tb", + "footprint": "max", + "peak": 1024, + "normal": 512, + "caution": 900, + "alert": 1000 + }, + { + "name": "spr2tb", + "footprint": "max", + "peak": 2048, + "normal": 1024, + "caution": 1800, + "alert": 2000 + } + ] + }, + { + "name": "flops_any", + "unit": { + "base": "Flops/s", + "prefix": "G" + }, + "scope": "hwthread", + "aggregation": "sum", + "footprint": "avg", + "timestep": 60, + "peak": 5600, + "normal": 1000, + "caution": 200, + "alert": 50, + "subClusters": [ + { + "name": "spr1tb", + "peak": 6656, + "normal": 1500, + "caution": 400, + "alert": 50, + "footprint": "avg" + }, + { + "name": "spr2tb", + "peak": 6656, + "normal": 1500, + "caution": 400, + "alert": 50, + "footprint": "avg" + } + ] + }, + { + "name": "net_bytes_in", + "unit": { + "base": "B/s" + }, + "scope": "node", + "aggregation": "sum", + "timestep": 60, + "peak": 50000000, + "normal": 10000000, + "caution": 5000, + "alert": 1000 + }, + { + "name": "net_bytes_out", + "unit": { + "base": "B/s" + }, + "scope": "node", + "aggregation": "sum", + "timestep": 60, + "peak": 50000000, + "normal": 200000, + "caution": 5000, + "alert": 1000 + }, + { + "name": "flops_sp", + "unit": { + "base": "Flops/s", + "prefix": "G" + }, + "scope": "hwthread", + "aggregation": "sum", + "timestep": 60, + "peak": 5600, + "normal": 1000, + "caution": 200, + "alert": 50, + "subClusters": [ + { + "name": "spr1tb", + "peak": 6656, + "normal": 1500, + "caution": 400, + "alert": 50 + }, + { + "name": "spr2tb", + "peak": 6656, + "normal": 1500, + "caution": 400, + "alert": 50 + } + ] + }, + { + "name": "flops_dp", + "unit": { + "base": "Flops/s", + "prefix": "G" + }, + "scope": "hwthread", + "aggregation": "sum", + "timestep": 60, + "peak": 2300, + "normal": 500, + "caution": 100, + "alert": 50, + "subClusters": [ + { + "name": "spr1tb", + "peak": 3300, + "normal": 750, + "caution": 200, + "alert": 50 + }, + { + "name": "spr2tb", + "peak": 3300, + 
"normal": 750, + "caution": 200, + "alert": 50 + } + ] + }, + { + "name": "mem_bw", + "unit": { + "base": "B/s", + "prefix": "G" + }, + "scope": "socket", + "aggregation": "sum", + "footprint": "avg", + "timestep": 60, + "peak": 350, + "normal": 100, + "caution": 50, + "alert": 10, + "subClusters": [ + { + "name": "spr1tb", + "footprint": "avg", + "peak": 549, + "normal": 200, + "caution": 100, + "alert": 20 + }, + { + "name": "spr2tb", + "footprint": "avg", + "peak": 520, + "normal": 200, + "caution": 100, + "alert": 20 + } + ] + }, + { + "name": "clock", + "unit": { + "base": "Hz", + "prefix": "M" + }, + "scope": "hwthread", + "aggregation": "avg", + "timestep": 60, + "peak": 3000, + "normal": 2400, + "caution": 1800, + "alert": 1200, + "subClusters": [ + { + "name": "spr1tb", + "peak": 3000, + "normal": 2000, + "caution": 1600, + "alert": 1200, + "remove": false + }, + { + "name": "spr2tb", + "peak": 3000, + "normal": 2000, + "caution": 1600, + "alert": 1200, + "remove": false + } + ] + }, + { + "name": "cpu_power", + "unit": { + "base": "W" + }, + "scope": "socket", + "aggregation": "sum", + "energy": "power", + "timestep": 60, + "peak": 500, + "normal": 250, + "caution": 100, + "alert": 50, + "subClusters": [ + { + "name": "spr1tb", + "peak": 700, + "energy": "power", + "normal": 350, + "caution": 150, + "alert": 50 + }, + { + "name": "spr2tb", + "peak": 700, + "energy": "power", + "normal": 350, + "caution": 150, + "alert": 50 + } + ] + }, + { + "name": "mem_power", + "unit": { + "base": "W" + }, + "scope": "socket", + "aggregation": "sum", + "energy": "power", + "timestep": 60, + "peak": 100, + "normal": 50, + "caution": 20, + "alert": 10, + "subClusters": [ + { + "name": "spr1tb", + "peak": 400, + "energy": "power", + "normal": 200, + "caution": 80, + "alert": 40 + }, + { + "name": "spr2tb", + "peak": 800, + "energy": "power", + "normal": 400, + "caution": 160, + "alert": 80 + } + ] + }, + { + "name": "ipc", + "unit": { + "base": "IPC" + }, + "scope": 
"hwthread", + "aggregation": "avg", + "timestep": 60, + "peak": 4, + "normal": 2, + "caution": 1, + "alert": 0.5, + "subClusters": [ + { + "name": "spr1tb", + "peak": 6, + "normal": 2, + "caution": 1, + "alert": 0.5 + }, + { + "name": "spr2tb", + "peak": 6, + "normal": 2, + "caution": 1, + "alert": 0.5 + } + ] + }, + { + "name": "vectorization_ratio", + "unit": { + "base": "" + }, + "scope": "hwthread", + "aggregation": "avg", + "timestep": 60, + "peak": 100, + "normal": 60, + "caution": 40, + "alert": 10, + "subClusters": [ + { + "name": "spr1tb", + "peak": 100, + "normal": 60, + "caution": 40, + "alert": 10 + }, + { + "name": "spr2tb", + "peak": 100, + "normal": 60, + "caution": 40, + "alert": 10 + } + ] + }, + { + "name": "ib_recv", + "unit": { + "base": "B/s" + }, + "scope": "node", + "aggregation": "sum", + "timestep": 60, + "peak": 800000000, + "normal": 200000000, + "caution": 200000, + "alert": 100 + }, + { + "name": "ib_xmit", + "unit": { + "base": "B/s" + }, + "scope": "node", + "aggregation": "sum", + "timestep": 60, + "peak": 800000000, + "normal": 200000000, + "caution": 200000, + "alert": 100 + }, + { + "name": "ib_recv_pkts", + "unit": { + "base": "packets/s" + }, + "scope": "node", + "aggregation": "sum", + "timestep": 60, + "peak": 600000, + "normal": 300000, + "caution": 3000, + "alert": 1 + }, + { + "name": "ib_xmit_pkts", + "unit": { + "base": "packets/s" + }, + "scope": "node", + "aggregation": "sum", + "timestep": 60, + "peak": 600000, + "normal": 300000, + "caution": 3000, + "alert": 1 + }, + { + "name": "nfs4_read", + "unit": { + "base": "IOP", + "prefix": "" + }, + "scope": "node", + "aggregation": "sum", + "timestep": 60, + "peak": 50000, + "normal": 10000, + "caution": 10, + "alert": 1 + }, + { + "name": "nfs4_total", + "unit": { + "base": "IOP", + "prefix": "" + }, + "scope": "node", + "aggregation": "sum", + "timestep": 60, + "peak": 50000, + "normal": 10000, + "caution": 20, + "alert": 5 } - ] - }, - { - "name": "cpu_user", - "unit": { 
- "base": "" - }, - "scope": "hwthread", - "aggregation": "avg", - "timestep": 60, - "peak": 100, - "normal": 50, - "caution": 20, - "alert": 10 - }, - { - "name": "mem_used", - "unit": { - "base": "B", - "prefix": "G" - }, - "scope": "node", - "aggregation": "sum", - "footprint": "max", - "timestep": 60, - "peak": 256, - "normal": 128, - "caution": 200, - "alert": 240, - "subClusters": [ + ], + "subClusters": [ { - "name": "spr1tb", - "footprint": "max", - "peak": 1024, - "normal": 512, - "caution": 900, - "alert": 1000 + "name": "main", + "nodes": "f[0101-0188,0201-0288,0301-0388,0401-0488,0501-0588,0601-0688,0701-0788,0801-0888,0901-0988,1001-1088,1101-1156,1201-1256]", + "processorType": "Intel Icelake", + "socketsPerNode": 2, + "coresPerSocket": 36, + "threadsPerCore": 1, + "flopRateScalar": { + "unit": { + "base": "F/s", + "prefix": "G" + }, + "value": 432 + }, + "flopRateSimd": { + "unit": { + "base": "F/s", + "prefix": "G" + }, + "value": 9216 + }, + "memoryBandwidth": { + "unit": { + "base": "B/s", + "prefix": "G" + }, + "value": 350 + }, + "topology": { + "node": [ + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71 + ], + "socket": [ + [ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35 ], + [ 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71 ] + ], + "memoryDomain": [ + [ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17 ], + [ 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35 ], + [ 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53 ], + [ 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 
71 ] + ], + "core": [ + [ 0 ], [ 1 ], [ 2 ], [ 3 ], [ 4 ], [ 5 ], [ 6 ], [ 7 ], [ 8 ], [ 9 ], [ 10 ], [ 11 ], [ 12 ], [ 13 ], [ 14 ], [ 15 ], [ 16 ], [ 17 ], [ 18 ], [ 19 ], [ 20 ], [ 21 ], [ 22 ], [ 23 ], [ 24 ], [ 25 ], [ 26 ], [ 27 ], [ 28 ], [ 29 ], [ 30 ], [ 31 ], [ 32 ], [ 33 ], [ 34 ], [ 35 ], [ 36 ], [ 37 ], [ 38 ], [ 39 ], [ 40 ], [ 41 ], [ 42 ], [ 43 ], [ 44 ], [ 45 ], [ 46 ], [ 47 ], [ 48 ], [ 49 ], [ 50 ], [ 51 ], [ 52 ], [ 53 ], [ 54 ], [ 55 ], [ 56 ], [ 57 ], [ 58 ], [ 59 ], [ 60 ], [ 61 ], [ 62 ], [ 63 ], [ 64 ], [ 65 ], [ 66 ], [ 67 ], [ 68 ], [ 69 ], [ 70 ], [ 71 ] + ] + } }, { - "name": "spr2tb", - "footprint": "max", - "peak": 2048, - "normal": 1024, - "caution": 1800, - "alert": 2000 + "name": "spr1tb", + "processorType": "Intel(R) Xeon(R) Platinum 8470", + "socketsPerNode": 2, + "coresPerSocket": 52, + "threadsPerCore": 1, + "flopRateScalar": { + "unit": { + "base": "F/s", + "prefix": "G" + }, + "value": 695 + }, + "flopRateSimd": { + "unit": { + "base": "F/s", + "prefix": "G" + }, + "value": 9216 + }, + "memoryBandwidth": { + "unit": { + "base": "B/s", + "prefix": "G" + }, + "value": 549 + }, + "nodes": "f[2157-2180,2257-2280]", + "topology": { + "node": + [ + 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79,80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95,96,97,98,99,100,101,102,103 + ], + "socket": + [ + [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51], + [52,53,54,55,56,57,58,59,60,61,62,63,64,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79,80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95,96,97,98,99,100,101,102,103] + ], + "memoryDomain": [ + [0,1,2,3,4,5,6,7,8,9,10,11,12], + [13,14,15,16,17,18,19,20,21,22,23,24,25], + [26,27,28,29,30,31,32,33,34,35,36,37,38], +
[39,40,41,42,43,44,45,46,47,48,49,50,51], + [52,53,54,55,56,57,58,59,60,61,62,63,64], + [65,66,67,68,69,70,71,72,73,74,75,76,77], + [78,79,80,81,82,83,84,85,86,87,88,89,90], + [91,92,93,94,95,96,97,98,99,100,101,102,103] + ], + "core": [ + [0],[1],[2],[3],[4],[5],[6],[7],[8],[9],[10],[11],[12],[13],[14],[15],[16],[17],[18],[19],[20],[21],[22],[23],[24],[25],[26],[27],[28],[29],[30],[31],[32],[33],[34],[35],[36],[37],[38],[39],[40],[41],[42],[43],[44],[45],[46],[47],[48],[49],[50],[51],[52],[53],[54],[55],[56],[57],[58],[59],[60],[61],[62],[63],[64],[65],[66],[67],[68],[69],[70],[71],[72],[73],[74],[75],[76],[77],[78],[79],[80],[81],[82],[83],[84],[85],[86],[87],[88],[89],[90],[91],[92],[93],[94],[95],[96],[97],[98],[99],[100],[101],[102],[103] + ] + } + }, + { + "name": "spr2tb", + "processorType": "Intel(R) Xeon(R) Platinum 8470", + "socketsPerNode": 2, + "coresPerSocket": 52, + "threadsPerCore": 1, + "flopRateScalar": { + "unit": { + "base": "F/s", + "prefix": "G" + }, + "value": 695 + }, + "flopRateSimd": { + "unit": { + "base": "F/s", + "prefix": "G" + }, + "value": 9216 + }, + "memoryBandwidth": { + "unit": { + "base": "B/s", + "prefix": "G" + }, + "value": 515 + }, + "nodes": "f[2181-2188,2281-2288]", + "topology": { + "node": [ + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103 + ], + "socket": [ + [ + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51 + ], + [ + 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 
77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103 + ] + ], + "memoryDomain": [ + [ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12 ], + [ 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25 ], + [ 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38 ], + [ 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51 ], + [ 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64 ], + [ 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77 ], + [ 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90 ], + [ 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103 ] + ], + "core": [ + [ 0 ], [ 1 ], [ 2 ], [ 3 ], [ 4 ], [ 5 ], [ 6 ], [ 7 ], [ 8 ], [ 9 ], [ 10 ], [ 11 ], [ 12 ], [ 13 ], [ 14 ], [ 15 ], [ 16 ], [ 17 ], [ 18 ], [ 19 ], [ 20 ], [ 21 ], [ 22 ], [ 23 ], [ 24 ], [ 25 ], [ 26 ], [ 27 ], [ 28 ], [ 29 ], [ 30 ], [ 31 ], [ 32 ], [ 33 ], [ 34 ], [ 35 ], [ 36 ], [ 37 ], [ 38 ], [ 39 ], [ 40 ], [ 41 ], [ 42 ], [ 43 ], [ 44 ], [ 45 ], [ 46 ], [ 47 ], [ 48 ], [ 49 ], [ 50 ], [ 51 ], [ 52 ], [ 53 ], [ 54 ], [ 55 ], [ 56 ], [ 57 ], [ 58 ], [ 59 ], [ 60 ], [ 61 ], [ 62 ], [ 63 ], [ 64 ], [ 65 ], [ 66 ], [ 67 ], [ 68 ], [ 69 ], [ 70 ], [ 71 ], [ 72 ], [ 73 ], [ 74 ], [ 75 ], [ 76 ], [ 77 ], [ 78 ], [ 79 ], [ 80 ], [ 81 ], [ 82 ], [ 83 ], [ 84 ], [ 85 ], [ 86 ], [ 87 ], [ 88 ], [ 89 ], [ 90 ], [ 91 ], [ 92 ], [ 93 ], [ 94 ], [ 95 ], [ 96 ], [ 97 ], [ 98 ], [ 99 ], [ 100 ], [ 101 ], [ 102 ], [ 103 ] + ] + } } - ] - }, - { - "name": "flops_any", - "unit": { - "base": "Flops/s", - "prefix": "G" - }, - "scope": "hwthread", - "aggregation": "sum", - "footprint": "avg", - "timestep": 60, - "peak": 5600, - "normal": 1000, - "caution": 200, - "alert": 50, - "subClusters": [ - { - "name": "spr1tb", - "peak": 6656, - "normal": 1500, - "caution": 400, - "alert": 50, - "footprint": "avg" - }, - { - "name": "spr2tb", - "peak": 6656, - "normal": 1500, - "caution": 400, - "alert": 50, - "footprint": "avg" - } - ] - }, - { - "name": "flops_sp", - "unit": { - 
"base": "Flops/s", - "prefix": "G" - }, - "scope": "hwthread", - "aggregation": "sum", - "timestep": 60, - "peak": 5600, - "normal": 1000, - "caution": 200, - "alert": 50, - "subClusters": [ - { - "name": "spr1tb", - "peak": 6656, - "normal": 1500, - "caution": 400, - "alert": 50 - }, - { - "name": "spr2tb", - "peak": 6656, - "normal": 1500, - "caution": 400, - "alert": 50 - } - ] - }, - { - "name": "flops_dp", - "unit": { - "base": "Flops/s", - "prefix": "G" - }, - "scope": "hwthread", - "aggregation": "sum", - "timestep": 60, - "peak": 2300, - "normal": 500, - "caution": 100, - "alert": 50, - "subClusters": [ - { - "name": "spr1tb", - "peak": 3300, - "normal": 750, - "caution": 200, - "alert": 50 - }, - { - "name": "spr2tb", - "peak": 3300, - "normal": 750, - "caution": 200, - "alert": 50 - } - ] - }, - { - "name": "mem_bw", - "unit": { - "base": "B/s", - "prefix": "G" - }, - "scope": "socket", - "aggregation": "sum", - "footprint": "avg", - "timestep": 60, - "peak": 350, - "normal": 100, - "caution": 50, - "alert": 10, - "subClusters": [ - { - "name": "spr1tb", - "footprint": "avg", - "peak": 549, - "normal": 200, - "caution": 100, - "alert": 20 - }, - { - "name": "spr2tb", - "footprint": "avg", - "peak": 520, - "normal": 200, - "caution": 100, - "alert": 20 - } - ] - }, - { - "name": "clock", - "unit": { - "base": "Hz", - "prefix": "M" - }, - "scope": "hwthread", - "aggregation": "avg", - "timestep": 60, - "peak": 3000, - "normal": 2400, - "caution": 1800, - "alert": 1200, - "subClusters": [ - { - "name": "spr1tb", - "peak": 3000, - "normal": 2000, - "caution": 1600, - "alert": 1200, - "remove": false - }, - { - "name": "spr2tb", - "peak": 3000, - "normal": 2000, - "caution": 1600, - "alert": 1200, - "remove": false - } - ] - }, - { - "name": "cpu_power", - "unit": { - "base": "W" - }, - "scope": "socket", - "aggregation": "sum", - "energy": "power", - "timestep": 60, - "peak": 500, - "normal": 250, - "caution": 100, - "alert": 50, - "subClusters": [ - { - 
"name": "spr1tb", - "peak": 700, - "energy": "power", - "normal": 350, - "caution": 150, - "alert": 50 - }, - { - "name": "spr2tb", - "peak": 700, - "energy": "power", - "normal": 350, - "caution": 150, - "alert": 50 - } - ] - }, - { - "name": "mem_power", - "unit": { - "base": "W" - }, - "scope": "socket", - "aggregation": "sum", - "energy": "power", - "timestep": 60, - "peak": 100, - "normal": 50, - "caution": 20, - "alert": 10, - "subClusters": [ - { - "name": "spr1tb", - "peak": 400, - "energy": "power", - "normal": 200, - "caution": 80, - "alert": 40 - }, - { - "name": "spr2tb", - "peak": 800, - "energy": "power", - "normal": 400, - "caution": 160, - "alert": 80 - } - ] - }, - { - "name": "ipc", - "unit": { - "base": "IPC" - }, - "scope": "hwthread", - "aggregation": "avg", - "timestep": 60, - "peak": 4, - "normal": 2, - "caution": 1, - "alert": 0.5, - "subClusters": [ - { - "name": "spr1tb", - "peak": 6, - "normal": 2, - "caution": 1, - "alert": 0.5 - }, - { - "name": "spr2tb", - "peak": 6, - "normal": 2, - "caution": 1, - "alert": 0.5 - } - ] - }, - { - "name": "vectorization_ratio", - "unit": { - "base": "" - }, - "scope": "hwthread", - "aggregation": "avg", - "timestep": 60, - "peak": 100, - "normal": 60, - "caution": 40, - "alert": 10, - "subClusters": [ - { - "name": "spr1tb", - "peak": 100, - "normal": 60, - "caution": 40, - "alert": 10 - }, - { - "name": "spr2tb", - "peak": 100, - "normal": 60, - "caution": 40, - "alert": 10 - } - ] - }, - { - "name": "ib_recv", - "unit": { - "base": "B/s" - }, - "scope": "node", - "aggregation": "sum", - "timestep": 60, - "peak": 1250000, - "normal": 6000000, - "caution": 200, - "alert": 1 - }, - { - "name": "ib_xmit", - "unit": { - "base": "B/s" - }, - "scope": "node", - "aggregation": "sum", - "timestep": 60, - "peak": 1250000, - "normal": 6000000, - "caution": 200, - "alert": 1 - }, - { - "name": "ib_recv_pkts", - "unit": { - "base": "packets/s" - }, - "scope": "node", - "aggregation": "sum", - "timestep": 60, - 
"peak": 6, - "normal": 4, - "caution": 2, - "alert": 1 - }, - { - "name": "ib_xmit_pkts", - "unit": { - "base": "packets/s" - }, - "scope": "node", - "aggregation": "sum", - "timestep": 60, - "peak": 6, - "normal": 4, - "caution": 2, - "alert": 1 - }, - { - "name": "nfs4_read", - "unit": { - "base": "IOP", - "prefix": "" - }, - "scope": "node", - "aggregation": "sum", - "timestep": 60, - "peak": 1000, - "normal": 50, - "caution": 200, - "alert": 500 - }, - { - "name": "nfs4_total", - "unit": { - "base": "IOP", - "prefix": "" - }, - "scope": "node", - "aggregation": "sum", - "timestep": 60, - "peak": 1000, - "normal": 50, - "caution": 200, - "alert": 500 - } - ], - "subClusters": [ - { - "name": "main", - "nodes": "f[0101-0188,0201-0288,0301-0388,0401-0488,0501-0588,0601-0688,0701-0788,0801-0888,0901-0988,1001-1088,1101-1156,1201-1256]", - "processorType": "Intel Icelake", - "socketsPerNode": 2, - "coresPerSocket": 36, - "threadsPerCore": 1, - "flopRateScalar": { - "unit": { - "base": "F/s", - "prefix": "G" - }, - "value": 432 - }, - "flopRateSimd": { - "unit": { - "base": "F/s", - "prefix": "G" - }, - "value": 9216 - }, - "memoryBandwidth": { - "unit": { - "base": "B/s", - "prefix": "G" - }, - "value": 350 - }, - "topology": { - "node": [ - 0, - 1, - 2, - 3, - 4, - 5, - 6, - 7, - 8, - 9, - 10, - 11, - 12, - 13, - 14, - 15, - 16, - 17, - 18, - 19, - 20, - 21, - 22, - 23, - 24, - 25, - 26, - 27, - 28, - 29, - 30, - 31, - 32, - 33, - 34, - 35, - 36, - 37, - 38, - 39, - 40, - 41, - 42, - 43, - 44, - 45, - 46, - 47, - 48, - 49, - 50, - 51, - 52, - 53, - 54, - 55, - 56, - 57, - 58, - 59, - 60, - 61, - 62, - 63, - 64, - 65, - 66, - 67, - 68, - 69, - 70, - 71 - ], - "socket": [ - [ - 0, - 1, - 2, - 3, - 4, - 5, - 6, - 7, - 8, - 9, - 10, - 11, - 12, - 13, - 14, - 15, - 16, - 17, - 18, - 19, - 20, - 21, - 22, - 23, - 24, - 25, - 26, - 27, - 28, - 29, - 30, - 31, - 32, - 33, - 34, - 35 - ], - [ - 36, - 37, - 38, - 39, - 40, - 41, - 42, - 43, - 44, - 45, - 46, - 47, - 48, - 
49, - 50, - 51, - 52, - 53, - 54, - 55, - 56, - 57, - 58, - 59, - 60, - 61, - 62, - 63, - 64, - 65, - 66, - 67, - 68, - 69, - 70, - 71 - ] - ], - "memoryDomain": [ - [ - 0, - 1, - 2, - 3, - 4, - 5, - 6, - 7, - 8, - 9, - 10, - 11, - 12, - 13, - 14, - 15, - 16, - 17 - ], - [ - 18, - 19, - 20, - 21, - 22, - 23, - 24, - 25, - 26, - 27, - 28, - 29, - 30, - 31, - 32, - 33, - 34, - 35 - ], - [ - 36, - 37, - 38, - 39, - 40, - 41, - 42, - 43, - 44, - 45, - 46, - 47, - 48, - 49, - 50, - 51, - 52, - 53 - ], - [ - 54, - 55, - 56, - 57, - 58, - 59, - 60, - 61, - 62, - 63, - 64, - 65, - 66, - 67, - 68, - 69, - 70, - 71 - ] - ], - "core": [ - [ - 0 - ], - [ - 1 - ], - [ - 2 - ], - [ - 3 - ], - [ - 4 - ], - [ - 5 - ], - [ - 6 - ], - [ - 7 - ], - [ - 8 - ], - [ - 9 - ], - [ - 10 - ], - [ - 11 - ], - [ - 12 - ], - [ - 13 - ], - [ - 14 - ], - [ - 15 - ], - [ - 16 - ], - [ - 17 - ], - [ - 18 - ], - [ - 19 - ], - [ - 20 - ], - [ - 21 - ], - [ - 22 - ], - [ - 23 - ], - [ - 24 - ], - [ - 25 - ], - [ - 26 - ], - [ - 27 - ], - [ - 28 - ], - [ - 29 - ], - [ - 30 - ], - [ - 31 - ], - [ - 32 - ], - [ - 33 - ], - [ - 34 - ], - [ - 35 - ], - [ - 36 - ], - [ - 37 - ], - [ - 38 - ], - [ - 39 - ], - [ - 40 - ], - [ - 41 - ], - [ - 42 - ], - [ - 43 - ], - [ - 44 - ], - [ - 45 - ], - [ - 46 - ], - [ - 47 - ], - [ - 48 - ], - [ - 49 - ], - [ - 50 - ], - [ - 51 - ], - [ - 52 - ], - [ - 53 - ], - [ - 54 - ], - [ - 55 - ], - [ - 56 - ], - [ - 57 - ], - [ - 58 - ], - [ - 59 - ], - [ - 60 - ], - [ - 61 - ], - [ - 62 - ], - [ - 63 - ], - [ - 64 - ], - [ - 65 - ], - [ - 66 - ], - [ - 67 - ], - [ - 68 - ], - [ - 69 - ], - [ - 70 - ], - [ - 71 - ] - ] - } - }, - { - "name": "spr1tb", - "processorType": "Intel(R) Xeon(R) Platinum 8470", - "socketsPerNode": 2, - "coresPerSocket": 52, - "threadsPerCore": 1, - "flopRateScalar": { - "unit": { - "base": "F/s", - "prefix": "G" - }, - "value": 695 - }, - "flopRateSimd": { - "unit": { - "base": "F/s", - "prefix": "G" - }, - "value": 9216 - }, - "memoryBandwidth": { - 
"unit": { - "base": "B/s", - "prefix": "G" - }, - "value": 549 - }, - "nodes": "f[2157-2180,2257-2280]", - "topology": { - "node": [ - 0, - 1, - 2, - 3, - 4, - 5, - 6, - 7, - 8, - 9, - 10, - 11, - 12, - 13, - 14, - 15, - 16, - 17, - 18, - 19, - 20, - 21, - 22, - 23, - 24, - 25, - 26, - 27, - 28, - 29, - 30, - 31, - 32, - 33, - 34, - 35, - 36, - 37, - 38, - 39, - 40, - 41, - 42, - 43, - 44, - 45, - 46, - 47, - 48, - 49, - 50, - 5152, - 53, - 54, - 55, - 56, - 57, - 58, - 59, - 60, - 61, - 62, - 63, - 64, - 65, - 66, - 67, - 68, - 69, - 70, - 71, - 72, - 73, - 74, - 75, - 76, - 77, - 78, - 79, - 80, - 81, - 82, - 83, - 84, - 85, - 86, - 87, - 88, - 89, - 90, - 91, - 92, - 93, - 94, - 95, - 96, - 97, - 98, - 99, - 100, - 101, - 102, - 103 - ], - "socket": [ - [ - 0, - 1, - 2, - 3, - 4, - 5, - 6, - 7, - 8, - 9, - 10, - 11, - 12, - 13, - 14, - 15, - 16, - 17, - 18, - 19, - 20, - 21, - 22, - 23, - 24, - 25, - 26, - 27, - 28, - 29, - 30, - 31, - 32, - 33, - 34, - 35, - 36, - 37, - 38, - 39, - 40, - 41, - 42, - 43, - 44, - 45, - 46, - 47, - 48, - 49, - 50, - 51 - ], - [ - 52, - 53, - 54, - 55, - 56, - 57, - 58, - 59, - 60, - 61, - 62, - 63, - 64, - 65, - 66, - 67, - 68, - 69, - 70, - 71, - 72, - 73, - 74, - 75, - 76, - 77, - 78, - 79, - 80, - 81, - 82, - 83, - 84, - 85, - 86, - 87, - 88, - 89, - 90, - 91, - 92, - 93, - 94, - 95, - 96, - 97, - 98, - 99, - 100, - 101, - 102, - 103 - ] - ], - "memoryDomain": [ - [ - 0, - 1, - 2, - 3, - 4, - 5, - 6, - 7, - 8, - 9, - 10, - 11, - 12 - ], - [ - 13, - 14, - 15, - 16, - 17, - 18, - 19, - 20, - 21, - 22, - 23, - 24, - 25 - ], - [ - 26, - 27, - 28, - 29, - 30, - 31, - 32, - 33, - 34, - 35, - 36, - 37, - 38 - ], - [ - 39, - 40, - 41, - 42, - 43, - 44, - 45, - 46, - 47, - 48, - 49, - 50, - 51 - ], - [ - 52, - 53, - 54, - 55, - 56, - 57, - 58, - 59, - 60, - 61, - 62, - 63, - 64 - ], - [ - 65, - 66, - 67, - 68, - 69, - 70, - 71, - 72, - 73, - 74, - 75, - 76, - 77 - ], - [ - 78, - 79, - 80, - 81, - 82, - 83, - 84, - 85, - 86, - 87, - 88, 
- 89, - 90 - ], - [ - 91, - 92, - 93, - 94, - 95, - 96, - 97, - 98, - 99, - 100, - 101, - 102, - 103 - ] - ], - "core": [ - [ - 0 - ], - [ - 1 - ], - [ - 2 - ], - [ - 3 - ], - [ - 4 - ], - [ - 5 - ], - [ - 6 - ], - [ - 7 - ], - [ - 8 - ], - [ - 9 - ], - [ - 10 - ], - [ - 11 - ], - [ - 12 - ], - [ - 13 - ], - [ - 14 - ], - [ - 15 - ], - [ - 16 - ], - [ - 17 - ], - [ - 18 - ], - [ - 19 - ], - [ - 20 - ], - [ - 21 - ], - [ - 22 - ], - [ - 23 - ], - [ - 24 - ], - [ - 25 - ], - [ - 26 - ], - [ - 27 - ], - [ - 28 - ], - [ - 29 - ], - [ - 30 - ], - [ - 31 - ], - [ - 32 - ], - [ - 33 - ], - [ - 34 - ], - [ - 35 - ], - [ - 36 - ], - [ - 37 - ], - [ - 38 - ], - [ - 39 - ], - [ - 40 - ], - [ - 41 - ], - [ - 42 - ], - [ - 43 - ], - [ - 44 - ], - [ - 45 - ], - [ - 46 - ], - [ - 47 - ], - [ - 48 - ], - [ - 49 - ], - [ - 50 - ], - [ - 51 - ], - [ - 52 - ], - [ - 53 - ], - [ - 54 - ], - [ - 55 - ], - [ - 56 - ], - [ - 57 - ], - [ - 58 - ], - [ - 59 - ], - [ - 60 - ], - [ - 61 - ], - [ - 62 - ], - [ - 63 - ], - [ - 64 - ], - [ - 65 - ], - [ - 66 - ], - [ - 67 - ], - [ - 68 - ], - [ - 69 - ], - [ - 70 - ], - [ - 71 - ], - [ - 72 - ], - [ - 73 - ], - [ - 74 - ], - [ - 75 - ], - [ - 76 - ], - [ - 77 - ], - [ - 78 - ], - [ - 79 - ], - [ - 80 - ], - [ - 81 - ], - [ - 82 - ], - [ - 83 - ], - [ - 84 - ], - [ - 85 - ], - [ - 86 - ], - [ - 87 - ], - [ - 88 - ], - [ - 89 - ], - [ - 90 - ], - [ - 91 - ], - [ - 92 - ], - [ - 93 - ], - [ - 94 - ], - [ - 95 - ], - [ - 96 - ], - [ - 97 - ], - [ - 98 - ], - [ - 99 - ], - [ - 100 - ], - [ - 101 - ], - [ - 102 - ], - [ - 103 - ] - ] - } - }, - { - "name": "spr2tb", - "processorType": "Intel(R) Xeon(R) Platinum 8470", - "socketsPerNode": 2, - "coresPerSocket": 52, - "threadsPerCore": 1, - "flopRateScalar": { - "unit": { - "base": "F/s", - "prefix": "G" - }, - "value": 695 - }, - "flopRateSimd": { - "unit": { - "base": "F/s", - "prefix": "G" - }, - "value": 9216 - }, - "memoryBandwidth": { - "unit": { - "base": "B/s", - "prefix": "G" - }, - "value": 
515 - }, - "nodes": "f[2181-2188,2281-2288]", - "topology": { - "node": [ - 0, - 1, - 2, - 3, - 4, - 5, - 6, - 7, - 8, - 9, - 10, - 11, - 12, - 13, - 14, - 15, - 16, - 17, - 18, - 19, - 20, - 21, - 22, - 23, - 24, - 25, - 26, - 27, - 28, - 29, - 30, - 31, - 32, - 33, - 34, - 35, - 36, - 37, - 38, - 39, - 40, - 41, - 42, - 43, - 44, - 45, - 46, - 47, - 48, - 49, - 50, - 51, - 52, - 53, - 54, - 55, - 56, - 57, - 58, - 59, - 60, - 61, - 62, - 63, - 64, - 65, - 66, - 67, - 68, - 69, - 70, - 71, - 72, - 73, - 74, - 75, - 76, - 77, - 78, - 79, - 80, - 81, - 82, - 83, - 84, - 85, - 86, - 87, - 88, - 89, - 90, - 91, - 92, - 93, - 94, - 95, - 96, - 97, - 98, - 99, - 100, - 101, - 102, - 103 - ], - "socket": [ - [ - 0, - 1, - 2, - 3, - 4, - 5, - 6, - 7, - 8, - 9, - 10, - 11, - 12, - 13, - 14, - 15, - 16, - 17, - 18, - 19, - 20, - 21, - 22, - 23, - 24, - 25, - 26, - 27, - 28, - 29, - 30, - 31, - 32, - 33, - 34, - 35, - 36, - 37, - 38, - 39, - 40, - 41, - 42, - 43, - 44, - 45, - 46, - 47, - 48, - 49, - 50, - 51 - ], - [ - 52, - 53, - 54, - 55, - 56, - 57, - 58, - 59, - 60, - 61, - 62, - 63, - 64, - 65, - 66, - 67, - 68, - 69, - 70, - 71, - 72, - 73, - 74, - 75, - 76, - 77, - 78, - 79, - 80, - 81, - 82, - 83, - 84, - 85, - 86, - 87, - 88, - 89, - 90, - 91, - 92, - 93, - 94, - 95, - 96, - 97, - 98, - 99, - 100, - 101, - 102, - 103 - ] - ], - "memoryDomain": [ - [ - 0, - 1, - 2, - 3, - 4, - 5, - 6, - 7, - 8, - 9, - 10, - 11, - 12 - ], - [ - 13, - 14, - 15, - 16, - 17, - 18, - 19, - 20, - 21, - 22, - 23, - 24, - 25 - ], - [ - 26, - 27, - 28, - 29, - 30, - 31, - 32, - 33, - 34, - 35, - 36, - 37, - 38 - ], - [ - 39, - 40, - 41, - 42, - 43, - 44, - 45, - 46, - 47, - 48, - 49, - 50, - 51 - ], - [ - 52, - 53, - 54, - 55, - 56, - 57, - 58, - 59, - 60, - 61, - 62, - 63, - 64 - ], - [ - 65, - 66, - 67, - 68, - 69, - 70, - 71, - 72, - 73, - 74, - 75, - 76, - 77 - ], - [ - 78, - 79, - 80, - 81, - 82, - 83, - 84, - 85, - 86, - 87, - 88, - 89, - 90 - ], - [ - 91, - 92, - 93, - 94, - 95, - 96, 
- 97, - 98, - 99, - 100, - 101, - 102, - 103 - ] - ], - "core": [ - [ - 0 - ], - [ - 1 - ], - [ - 2 - ], - [ - 3 - ], - [ - 4 - ], - [ - 5 - ], - [ - 6 - ], - [ - 7 - ], - [ - 8 - ], - [ - 9 - ], - [ - 10 - ], - [ - 11 - ], - [ - 12 - ], - [ - 13 - ], - [ - 14 - ], - [ - 15 - ], - [ - 16 - ], - [ - 17 - ], - [ - 18 - ], - [ - 19 - ], - [ - 20 - ], - [ - 21 - ], - [ - 22 - ], - [ - 23 - ], - [ - 24 - ], - [ - 25 - ], - [ - 26 - ], - [ - 27 - ], - [ - 28 - ], - [ - 29 - ], - [ - 30 - ], - [ - 31 - ], - [ - 32 - ], - [ - 33 - ], - [ - 34 - ], - [ - 35 - ], - [ - 36 - ], - [ - 37 - ], - [ - 38 - ], - [ - 39 - ], - [ - 40 - ], - [ - 41 - ], - [ - 42 - ], - [ - 43 - ], - [ - 44 - ], - [ - 45 - ], - [ - 46 - ], - [ - 47 - ], - [ - 48 - ], - [ - 49 - ], - [ - 50 - ], - [ - 51 - ], - [ - 52 - ], - [ - 53 - ], - [ - 54 - ], - [ - 55 - ], - [ - 56 - ], - [ - 57 - ], - [ - 58 - ], - [ - 59 - ], - [ - 60 - ], - [ - 61 - ], - [ - 62 - ], - [ - 63 - ], - [ - 64 - ], - [ - 65 - ], - [ - 66 - ], - [ - 67 - ], - [ - 68 - ], - [ - 69 - ], - [ - 70 - ], - [ - 71 - ], - [ - 72 - ], - [ - 73 - ], - [ - 74 - ], - [ - 75 - ], - [ - 76 - ], - [ - 77 - ], - [ - 78 - ], - [ - 79 - ], - [ - 80 - ], - [ - 81 - ], - [ - 82 - ], - [ - 83 - ], - [ - 84 - ], - [ - 85 - ], - [ - 86 - ], - [ - 87 - ], - [ - 88 - ], - [ - 89 - ], - [ - 90 - ], - [ - 91 - ], - [ - 92 - ], - [ - 93 - ], - [ - 94 - ], - [ - 95 - ], - [ - 96 - ], - [ - 97 - ], - [ - 98 - ], - [ - 99 - ], - [ - 100 - ], - [ - 101 - ], - [ - 102 - ], - [ - 103 - ] - ] - } - } - ] + ] } diff --git a/nhr@fau/job-archive/cluster-helma.json b/nhr@fau/job-archive/cluster-helma.json new file mode 100644 index 0000000..9e03454 --- /dev/null +++ b/nhr@fau/job-archive/cluster-helma.json @@ -0,0 +1,2073 @@ +{ + "name": "helma", + "metricConfig": [ + { + "name": "cpu_load", + "unit": { + "base": "" + }, + "scope": "node", + "aggregation": "avg", + "footprint": "avg", + "timestep": 60, + "peak": 128, + "normal": 128, + "caution": 10, + "alert": 5 + }, 
+ { + "name": "cpu_user", + "unit": { + "base": "" + }, + "scope": "hwthread", + "aggregation": "avg", + "timestep": 60, + "peak": 100, + "normal": 50, + "caution": 20, + "alert": 10 + }, + { + "name": "mem_used", + "unit": { + "base": "B", + "prefix": "G" + }, + "scope": "node", + "aggregation": "sum", + "footprint": "max", + "timestep": 60, + "lowerIsBetter": true, + "peak": 512, + "normal": 128, + "caution": 200, + "alert": 240 + }, + { + "name": "flops_any", + "unit": { + "base": "Flops/s", + "prefix": "G" + }, + "scope": "hwthread", + "aggregation": "sum", + "footprint": "avg", + "timestep": 60, + "peak": 9216, + "normal": 1000, + "caution": 200, + "alert": 50 + }, + { + "name": "net_bytes_in", + "unit": { + "base": "B/s" + }, + "scope": "node", + "aggregation": "sum", + "timestep": 60, + "peak": 50000000, + "normal": 10000000, + "caution": 5000, + "alert": 1000 + }, + { + "name": "net_bytes_out", + "unit": { + "base": "B/s" + }, + "scope": "node", + "aggregation": "sum", + "timestep": 60, + "peak": 50000000, + "normal": 200000, + "caution": 5000, + "alert": 1000 + }, + { + "name": "mem_bw", + "unit": { + "base": "B/s", + "prefix": "G" + }, + "scope": "socket", + "aggregation": "sum", + "footprint": "avg", + "timestep": 60, + "peak": 350, + "normal": 100, + "caution": 50, + "alert": 10 + }, + { + "name": "clock", + "unit": { + "base": "Hz", + "prefix": "M" + }, + "scope": "hwthread", + "aggregation": "avg", + "timestep": 60, + "peak": 3000, + "normal": 2400, + "caution": 1800, + "alert": 1200 + }, + { + "name": "core_power", + "unit": { + "base": "W" + }, + "scope": "core", + "aggregation": "sum", + "energy": "power", + "timestep": 60, + "peak": 500, + "normal": 250, + "caution": 100, + "alert": 50 + }, + { + "name": "acc_utilization", + "unit": { + "base": "" + }, + "scope": "accelerator", + "aggregation": "avg", + "footprint": "avg", + "timestep": 60, + "peak": 100, + "normal": 80, + "caution": 50, + "alert": 20 + }, + { + "name": "acc_mem_used", + "unit": { 
+ "base": "B", + "prefix": "M" + }, + "scope": "accelerator", + "aggregation": "sum", + "footprint": "max", + "timestep": 60, + "lowerIsBetter": true, + "peak": 320000, + "normal": 160000, + "caution": 80000, + "alert": 40000 + }, + { + "name": "acc_power", + "unit": { + "base": "W" + }, + "scope": "accelerator", + "aggregation": "sum", + "energy": "power", + "timestep": 60, + "peak": 3200, + "normal": 1600, + "caution": 400, + "alert": 160 + }, + { + "name": "nv_mem_util", + "unit": { + "base": "" + }, + "scope": "accelerator", + "aggregation": "avg", + "timestep": 60, + "peak": 100, + "normal": 80, + "caution": 20, + "alert": 10 + }, + { + "name": "nv_temp", + "unit": { + "base": "°C" + }, + "scope": "accelerator", + "aggregation": "avg", + "timestep": 60, + "peak": 40, + "normal": 20, + "caution": 5, + "alert": 2 + }, + { + "name": "nv_sm_clock", + "unit": { + "base": "Hz", + "prefix": "M" + }, + "scope": "accelerator", + "aggregation": "avg", + "timestep": 60, + "peak": 1400, + "normal": 1200, + "caution": 100, + "alert": 50 + }, + { + "name": "cpu_power", + "unit": { + "base": "W" + }, + "scope": "socket", + "aggregation": "sum", + "energy": "power", + "timestep": 60, + "peak": 500, + "normal": 250, + "caution": 100, + "alert": 50 + }, + { + "name": "ipc", + "unit": { + "base": "IPC" + }, + "scope": "hwthread", + "aggregation": "avg", + "timestep": 60, + "peak": 4, + "normal": 2, + "caution": 1, + "alert": 0.5 + }, + { + "name": "ib_recv", + "unit": { + "base": "B/s" + }, + "scope": "node", + "aggregation": "sum", + "timestep": 60, + "peak": 1250000, + "normal": 6000000, + "caution": 200, + "alert": 1 + }, + { + "name": "ib_xmit", + "unit": { + "base": "B/s" + }, + "scope": "node", + "aggregation": "sum", + "timestep": 60, + "peak": 1250000, + "normal": 6000000, + "caution": 200, + "alert": 1 + }, + { + "name": "ib_recv_pkts", + "unit": { + "base": "packets/s" + }, + "scope": "node", + "aggregation": "sum", + "timestep": 60, + "peak": 6, + "normal": 4, + 
"caution": 2, + "alert": 1 + }, + { + "name": "ib_xmit_pkts", + "unit": { + "base": "packets/s" + }, + "scope": "node", + "aggregation": "sum", + "timestep": 60, + "peak": 6, + "normal": 4, + "caution": 2, + "alert": 1 + }, + { + "name": "nfs4_read", + "unit": { + "base": "IOP", + "prefix": "" + }, + "scope": "node", + "aggregation": "sum", + "timestep": 60, + "peak": 50000, + "normal": 10000, + "caution": 10, + "alert": 1 + }, + { + "name": "nfs4_total", + "unit": { + "base": "IOP", + "prefix": "" + }, + "scope": "node", + "aggregation": "sum", + "timestep": 60, + "peak": 50000, + "normal": 10000, + "caution": 20, + "alert": 5 + } + ], + "subClusters": [ + { + "name": "h100", + "processorType": "AMD EPYC 9554 64-Core Processor", + "socketsPerNode": 2, + "coresPerSocket": 64, + "threadsPerCore": 1, + "flopRateScalar": { + "unit": { + "base": "F/s", + "prefix": "G" + }, + "value": 1018 + }, + "flopRateSimd": { + "unit": { + "base": "F/s", + "prefix": "G" + }, + "value": 5974 + }, + "memoryBandwidth": { + "unit": { + "base": "B/s", + "prefix": "G" + }, + "value": 569 + }, + "nodes": "h11-[01-24],h12-[01-24],h13-[01-24],h14-[01-24]", + "topology": { + "node": [ + 0, + 1, + 2, + 3, + 4, + 5, + 6, + 7, + 8, + 9, + 10, + 11, + 12, + 13, + 14, + 15, + 16, + 17, + 18, + 19, + 20, + 21, + 22, + 23, + 24, + 25, + 26, + 27, + 28, + 29, + 30, + 31, + 32, + 33, + 34, + 35, + 36, + 37, + 38, + 39, + 40, + 41, + 42, + 43, + 44, + 45, + 46, + 47, + 48, + 49, + 50, + 51, + 52, + 53, + 54, + 55, + 56, + 57, + 58, + 59, + 60, + 61, + 62, + 63, + 64, + 65, + 66, + 67, + 68, + 69, + 70, + 71, + 72, + 73, + 74, + 75, + 76, + 77, + 78, + 79, + 80, + 81, + 82, + 83, + 84, + 85, + 86, + 87, + 88, + 89, + 90, + 91, + 92, + 93, + 94, + 95, + 96, + 97, + 98, + 99, + 100, + 101, + 102, + 103, + 104, + 105, + 106, + 107, + 108, + 109, + 110, + 111, + 112, + 113, + 114, + 115, + 116, + 117, + 118, + 119, + 120, + 121, + 122, + 123, + 124, + 125, + 126, + 127 + ], + "socket": [ + [ + 0, + 1, + 2, 
+ 3, + 4, + 5, + 6, + 7, + 8, + 9, + 10, + 11, + 12, + 13, + 14, + 15, + 16, + 17, + 18, + 19, + 20, + 21, + 22, + 23, + 24, + 25, + 26, + 27, + 28, + 29, + 30, + 31, + 32, + 33, + 34, + 35, + 36, + 37, + 38, + 39, + 40, + 41, + 42, + 43, + 44, + 45, + 46, + 47, + 48, + 49, + 50, + 51, + 52, + 53, + 54, + 55, + 56, + 57, + 58, + 59, + 60, + 61, + 62, + 63 + ], + [ + 64, + 65, + 66, + 67, + 68, + 69, + 70, + 71, + 72, + 73, + 74, + 75, + 76, + 77, + 78, + 79, + 80, + 81, + 82, + 83, + 84, + 85, + 86, + 87, + 88, + 89, + 90, + 91, + 92, + 93, + 94, + 95, + 96, + 97, + 98, + 99, + 100, + 101, + 102, + 103, + 104, + 105, + 106, + 107, + 108, + 109, + 110, + 111, + 112, + 113, + 114, + 115, + 116, + 117, + 118, + 119, + 120, + 121, + 122, + 123, + 124, + 125, + 126, + 127 + ] + ], + "memoryDomain": [ + [ + 0, + 1, + 2, + 3, + 4, + 5, + 6, + 7, + 8, + 9, + 10, + 11, + 12, + 13, + 14, + 15 + ], + [ + 16, + 17, + 18, + 19, + 20, + 21, + 22, + 23, + 24, + 25, + 26, + 27, + 28, + 29, + 30, + 31 + ], + [ + 32, + 33, + 34, + 35, + 36, + 37, + 38, + 39, + 40, + 41, + 42, + 43, + 44, + 45, + 46, + 47 + ], + [ + 48, + 49, + 50, + 51, + 52, + 53, + 54, + 55, + 56, + 57, + 58, + 59, + 60, + 61, + 62, + 63 + ], + [ + 64, + 65, + 66, + 67, + 68, + 69, + 70, + 71, + 72, + 73, + 74, + 75, + 76, + 77, + 78, + 79 + ], + [ + 80, + 81, + 82, + 83, + 84, + 85, + 86, + 87, + 88, + 89, + 90, + 91, + 92, + 93, + 94, + 95 + ], + [ + 96, + 97, + 98, + 99, + 100, + 101, + 102, + 103, + 104, + 105, + 106, + 107, + 108, + 109, + 110, + 111 + ], + [ + 112, + 113, + 114, + 115, + 116, + 117, + 118, + 119, + 120, + 121, + 122, + 123, + 124, + 125, + 126, + 127 + ] + ], + "core": [ + [ + 0 + ], + [ + 1 + ], + [ + 2 + ], + [ + 3 + ], + [ + 4 + ], + [ + 5 + ], + [ + 6 + ], + [ + 7 + ], + [ + 8 + ], + [ + 9 + ], + [ + 10 + ], + [ + 11 + ], + [ + 12 + ], + [ + 13 + ], + [ + 14 + ], + [ + 15 + ], + [ + 16 + ], + [ + 17 + ], + [ + 18 + ], + [ + 19 + ], + [ + 20 + ], + [ + 21 + ], + [ + 22 + ], + [ + 23 + ], 
+ [ + 24 + ], + [ + 25 + ], + [ + 26 + ], + [ + 27 + ], + [ + 28 + ], + [ + 29 + ], + [ + 30 + ], + [ + 31 + ], + [ + 32 + ], + [ + 33 + ], + [ + 34 + ], + [ + 35 + ], + [ + 36 + ], + [ + 37 + ], + [ + 38 + ], + [ + 39 + ], + [ + 40 + ], + [ + 41 + ], + [ + 42 + ], + [ + 43 + ], + [ + 44 + ], + [ + 45 + ], + [ + 46 + ], + [ + 47 + ], + [ + 48 + ], + [ + 49 + ], + [ + 50 + ], + [ + 51 + ], + [ + 52 + ], + [ + 53 + ], + [ + 54 + ], + [ + 55 + ], + [ + 56 + ], + [ + 57 + ], + [ + 58 + ], + [ + 59 + ], + [ + 60 + ], + [ + 61 + ], + [ + 62 + ], + [ + 63 + ], + [ + 64 + ], + [ + 65 + ], + [ + 66 + ], + [ + 67 + ], + [ + 68 + ], + [ + 69 + ], + [ + 70 + ], + [ + 71 + ], + [ + 72 + ], + [ + 73 + ], + [ + 74 + ], + [ + 75 + ], + [ + 76 + ], + [ + 77 + ], + [ + 78 + ], + [ + 79 + ], + [ + 80 + ], + [ + 81 + ], + [ + 82 + ], + [ + 83 + ], + [ + 84 + ], + [ + 85 + ], + [ + 86 + ], + [ + 87 + ], + [ + 88 + ], + [ + 89 + ], + [ + 90 + ], + [ + 91 + ], + [ + 92 + ], + [ + 93 + ], + [ + 94 + ], + [ + 95 + ], + [ + 96 + ], + [ + 97 + ], + [ + 98 + ], + [ + 99 + ], + [ + 100 + ], + [ + 101 + ], + [ + 102 + ], + [ + 103 + ], + [ + 104 + ], + [ + 105 + ], + [ + 106 + ], + [ + 107 + ], + [ + 108 + ], + [ + 109 + ], + [ + 110 + ], + [ + 111 + ], + [ + 112 + ], + [ + 113 + ], + [ + 114 + ], + [ + 115 + ], + [ + 116 + ], + [ + 117 + ], + [ + 118 + ], + [ + 119 + ], + [ + 120 + ], + [ + 121 + ], + [ + 122 + ], + [ + 123 + ], + [ + 124 + ], + [ + 125 + ], + [ + 126 + ], + [ + 127 + ] + ], + "accelerators":[ + { + "id": "00000000:06:00.0", + "type": "Nvidia GPU", + "model": "H100" + }, + { + "id": "00000000:26:00.0", + "type": "Nvidia GPU", + "model": "H100" + }, + { + "id": "00000000:A6:00.0", + "type": "Nvidia GPU", + "model": "H100" + }, + { + "id": "00000000:C6:00.0", + "type": "Nvidia GPU", + "model": "H100" + } + ] + } + }, + { + "name": "h200", + "processorType": "AMD EPYC 9554 64-Core Processor", + "socketsPerNode": 2, + "coresPerSocket": 64, + "threadsPerCore": 1, + 
"flopRateScalar": { + "unit": { + "base": "F/s", + "prefix": "G" + }, + "value": 1018 + }, + "flopRateSimd": { + "unit": { + "base": "F/s", + "prefix": "G" + }, + "value": 5974 + }, + "memoryBandwidth": { + "unit": { + "base": "B/s", + "prefix": "G" + }, + "value": 569 + }, + "nodes": "h21-[01-24],h22-[01-24],h23-[01-24],h24-[01-24]", + "topology": { + "node": [ + 0, + 1, + 2, + 3, + 4, + 5, + 6, + 7, + 8, + 9, + 10, + 11, + 12, + 13, + 14, + 15, + 16, + 17, + 18, + 19, + 20, + 21, + 22, + 23, + 24, + 25, + 26, + 27, + 28, + 29, + 30, + 31, + 32, + 33, + 34, + 35, + 36, + 37, + 38, + 39, + 40, + 41, + 42, + 43, + 44, + 45, + 46, + 47, + 48, + 49, + 50, + 51, + 52, + 53, + 54, + 55, + 56, + 57, + 58, + 59, + 60, + 61, + 62, + 63, + 64, + 65, + 66, + 67, + 68, + 69, + 70, + 71, + 72, + 73, + 74, + 75, + 76, + 77, + 78, + 79, + 80, + 81, + 82, + 83, + 84, + 85, + 86, + 87, + 88, + 89, + 90, + 91, + 92, + 93, + 94, + 95, + 96, + 97, + 98, + 99, + 100, + 101, + 102, + 103, + 104, + 105, + 106, + 107, + 108, + 109, + 110, + 111, + 112, + 113, + 114, + 115, + 116, + 117, + 118, + 119, + 120, + 121, + 122, + 123, + 124, + 125, + 126, + 127 + ], + "socket": [ + [ + 0, + 1, + 2, + 3, + 4, + 5, + 6, + 7, + 8, + 9, + 10, + 11, + 12, + 13, + 14, + 15, + 16, + 17, + 18, + 19, + 20, + 21, + 22, + 23, + 24, + 25, + 26, + 27, + 28, + 29, + 30, + 31, + 32, + 33, + 34, + 35, + 36, + 37, + 38, + 39, + 40, + 41, + 42, + 43, + 44, + 45, + 46, + 47, + 48, + 49, + 50, + 51, + 52, + 53, + 54, + 55, + 56, + 57, + 58, + 59, + 60, + 61, + 62, + 63 + ], + [ + 64, + 65, + 66, + 67, + 68, + 69, + 70, + 71, + 72, + 73, + 74, + 75, + 76, + 77, + 78, + 79, + 80, + 81, + 82, + 83, + 84, + 85, + 86, + 87, + 88, + 89, + 90, + 91, + 92, + 93, + 94, + 95, + 96, + 97, + 98, + 99, + 100, + 101, + 102, + 103, + 104, + 105, + 106, + 107, + 108, + 109, + 110, + 111, + 112, + 113, + 114, + 115, + 116, + 117, + 118, + 119, + 120, + 121, + 122, + 123, + 124, + 125, + 126, + 127 + ] + ], + "memoryDomain": [ + [ 
+ 0, + 1, + 2, + 3, + 4, + 5, + 6, + 7, + 8, + 9, + 10, + 11, + 12, + 13, + 14, + 15 + ], + [ + 16, + 17, + 18, + 19, + 20, + 21, + 22, + 23, + 24, + 25, + 26, + 27, + 28, + 29, + 30, + 31 + ], + [ + 32, + 33, + 34, + 35, + 36, + 37, + 38, + 39, + 40, + 41, + 42, + 43, + 44, + 45, + 46, + 47 + ], + [ + 48, + 49, + 50, + 51, + 52, + 53, + 54, + 55, + 56, + 57, + 58, + 59, + 60, + 61, + 62, + 63 + ], + [ + 64, + 65, + 66, + 67, + 68, + 69, + 70, + 71, + 72, + 73, + 74, + 75, + 76, + 77, + 78, + 79 + ], + [ + 80, + 81, + 82, + 83, + 84, + 85, + 86, + 87, + 88, + 89, + 90, + 91, + 92, + 93, + 94, + 95 + ], + [ + 96, + 97, + 98, + 99, + 100, + 101, + 102, + 103, + 104, + 105, + 106, + 107, + 108, + 109, + 110, + 111 + ], + [ + 112, + 113, + 114, + 115, + 116, + 117, + 118, + 119, + 120, + 121, + 122, + 123, + 124, + 125, + 126, + 127 + ] + ], + "core": [ + [ + 0 + ], + [ + 1 + ], + [ + 2 + ], + [ + 3 + ], + [ + 4 + ], + [ + 5 + ], + [ + 6 + ], + [ + 7 + ], + [ + 8 + ], + [ + 9 + ], + [ + 10 + ], + [ + 11 + ], + [ + 12 + ], + [ + 13 + ], + [ + 14 + ], + [ + 15 + ], + [ + 16 + ], + [ + 17 + ], + [ + 18 + ], + [ + 19 + ], + [ + 20 + ], + [ + 21 + ], + [ + 22 + ], + [ + 23 + ], + [ + 24 + ], + [ + 25 + ], + [ + 26 + ], + [ + 27 + ], + [ + 28 + ], + [ + 29 + ], + [ + 30 + ], + [ + 31 + ], + [ + 32 + ], + [ + 33 + ], + [ + 34 + ], + [ + 35 + ], + [ + 36 + ], + [ + 37 + ], + [ + 38 + ], + [ + 39 + ], + [ + 40 + ], + [ + 41 + ], + [ + 42 + ], + [ + 43 + ], + [ + 44 + ], + [ + 45 + ], + [ + 46 + ], + [ + 47 + ], + [ + 48 + ], + [ + 49 + ], + [ + 50 + ], + [ + 51 + ], + [ + 52 + ], + [ + 53 + ], + [ + 54 + ], + [ + 55 + ], + [ + 56 + ], + [ + 57 + ], + [ + 58 + ], + [ + 59 + ], + [ + 60 + ], + [ + 61 + ], + [ + 62 + ], + [ + 63 + ], + [ + 64 + ], + [ + 65 + ], + [ + 66 + ], + [ + 67 + ], + [ + 68 + ], + [ + 69 + ], + [ + 70 + ], + [ + 71 + ], + [ + 72 + ], + [ + 73 + ], + [ + 74 + ], + [ + 75 + ], + [ + 76 + ], + [ + 77 + ], + [ + 78 + ], + [ + 79 + ], + [ + 80 + ], + [ + 81 + ], 
+ [ + 82 + ], + [ + 83 + ], + [ + 84 + ], + [ + 85 + ], + [ + 86 + ], + [ + 87 + ], + [ + 88 + ], + [ + 89 + ], + [ + 90 + ], + [ + 91 + ], + [ + 92 + ], + [ + 93 + ], + [ + 94 + ], + [ + 95 + ], + [ + 96 + ], + [ + 97 + ], + [ + 98 + ], + [ + 99 + ], + [ + 100 + ], + [ + 101 + ], + [ + 102 + ], + [ + 103 + ], + [ + 104 + ], + [ + 105 + ], + [ + 106 + ], + [ + 107 + ], + [ + 108 + ], + [ + 109 + ], + [ + 110 + ], + [ + 111 + ], + [ + 112 + ], + [ + 113 + ], + [ + 114 + ], + [ + 115 + ], + [ + 116 + ], + [ + 117 + ], + [ + 118 + ], + [ + 119 + ], + [ + 120 + ], + [ + 121 + ], + [ + 122 + ], + [ + 123 + ], + [ + 124 + ], + [ + 125 + ], + [ + 126 + ], + [ + 127 + ] + ], + "accelerators":[ + { + "id": "00000000:06:00.0", + "type": "Nvidia GPU", + "model": "H200" + }, + { + "id": "00000000:26:00.0", + "type": "Nvidia GPU", + "model": "H200" + }, + { + "id": "00000000:A6:00.0", + "type": "Nvidia GPU", + "model": "H200" + }, + { + "id": "00000000:C6:00.0", + "type": "Nvidia GPU", + "model": "H200" + } + ] + } + }, + { + "name": "cpu", + "processorType": "AMD EPYC 9965 192-Core Processor", + "socketsPerNode": 2, + "coresPerSocket": 192, + "threadsPerCore": 1, + "flopRateScalar": { + "unit": { + "base": "F/s", + "prefix": "G" + }, + "value": 3412 + }, + "flopRateSimd": { + "unit": { + "base": "F/s", + "prefix": "G" + }, + "value": 39758 + }, + "memoryBandwidth": { + "unit": { + "base": "B/s", + "prefix": "G" + }, + "value": 979 + }, + "nodes": "h31-[01-72],h32-[01-60],h33-[01-60],h34-[01-60],h35-[01-60]", + "topology": { + "node": 
[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79,80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95,96,97,98,99,100,101,102,103,104,105,106,107,108,109,110,111,112,113,114,115,116,117,118,119,120,121,122,123,124,125,126,127,128,129,130,131,132,133,134,135,136,137,138,139,140,141,142,143,144,145,146,147,148,149,150,151,152,153,154,155,156,157,158,159,160,161,162,163,164,165,166,167,168,169,170,171,172,173,174,175,176,177,178,179,180,181,182,183,184,185,186,187,188,189,190,191,192,193,194,195,196,197,198,199,200,201,202,203,204,205,206,207,208,209,210,211,212,213,214,215,216,217,218,219,220,221,222,223,224,225,226,227,228,229,230,231,232,233,234,235,236,237,238,239,240,241,242,243,244,245,246,247,248,249,250,251,252,253,254,255,256,257,258,259,260,261,262,263,264,265,266,267,268,269,270,271,272,273,274,275,276,277,278,279,280,281,282,283,284,285,286,287,288,289,290,291,292,293,294,295,296,297,298,299,300,301,302,303,304,305,306,307,308,309,310,311,312,313,314,315,316,317,318,319,320,321,322,323,324,325,326,327,328,329,330,331,332,333,334,335,336,337,338,339,340,341,342,343,344,345,346,347,348,349,350,351,352,353,354,355,356,357,358,359,360,361,362,363,364,365,366,367,368,369,370,371,372,373,374,375,376,377,378,379,380,381,382,383], + "socket": [ + 
[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79,80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95,96,97,98,99,100,101,102,103,104,105,106,107,108,109,110,111,112,113,114,115,116,117,118,119,120,121,122,123,124,125,126,127,128,129,130,131,132,133,134,135,136,137,138,139,140,141,142,143,144,145,146,147,148,149,150,151,152,153,154,155,156,157,158,159,160,161,162,163,164,165,166,167,168,169,170,171,172,173,174,175,176,177,178,179,180,181,182,183,184,185,186,187,188,189,190,191], + [192,193,194,195,196,197,198,199,200,201,202,203,204,205,206,207,208,209,210,211,212,213,214,215,216,217,218,219,220,221,222,223,224,225,226,227,228,229,230,231,232,233,234,235,236,237,238,239,240,241,242,243,244,245,246,247,248,249,250,251,252,253,254,255,256,257,258,259,260,261,262,263,264,265,266,267,268,269,270,271,272,273,274,275,276,277,278,279,280,281,282,283,284,285,286,287,288,289,290,291,292,293,294,295,296,297,298,299,300,301,302,303,304,305,306,307,308,309,310,311,312,313,314,315,316,317,318,319,320,321,322,323,324,325,326,327,328,329,330,331,332,333,334,335,336,337,338,339,340,341,342,343,344,345,346,347,348,349,350,351,352,353,354,355,356,357,358,359,360,361,362,363,364,365,366,367,368,369,370,371,372,373,374,375,376,377,378,379,380,381,382,383] + ], + "memoryDomain": [ + [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47], + [48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79,80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95], + [96,97,98,99,100,101,102,103,104,105,106,107,108,109,110,111,112,113,114,115,116,117,118,119,120,121,122,123,124,125,126,127,128,129,130,131,132,133,134,135,136,137,138,139,140,141,142,143], + 
[144,145,146,147,148,149,150,151,152,153,154,155,156,157,158,159,160,161,162,163,164,165,166,167,168,169,170,171,172,173,174,175,176,177,178,179,180,181,182,183,184,185,186,187,188,189,190,191], + [192,193,194,195,196,197,198,199,200,201,202,203,204,205,206,207,208,209,210,211,212,213,214,215,216,217,218,219,220,221,222,223,224,225,226,227,228,229,230,231,232,233,234,235,236,237,238,239], + [240,241,242,243,244,245,246,247,248,249,250,251,252,253,254,255,256,257,258,259,260,261,262,263,264,265,266,267,268,269,270,271,272,273,274,275,276,277,278,279,280,281,282,283,284,285,286,287], + [288,289,290,291,292,293,294,295,296,297,298,299,300,301,302,303,304,305,306,307,308,309,310,311,312,313,314,315,316,317,318,319,320,321,322,323,324,325,326,327,328,329,330,331,332,333,334,335], + [336,337,338,339,340,341,342,343,344,345,346,347,348,349,350,351,352,353,354,355,356,357,358,359,360,361,362,363,364,365,366,367,368,369,370,371,372,373,374,375,376,377,378,379,380,381,382,383] + ], + "core": [ + [0],[1],[2],[3],[4],[5],[6],[7],[8],[9],[10],[11],[12],[13],[14],[15],[16],[17],[18],[19],[20],[21],[22],[23],[24],[25],[26],[27],[28],[29],[30],[31],[32],[33],[34],[35],[36],[37],[38],[39],[40],[41],[42],[43],[44],[45],[46],[47],[48],[49],[50],[51],[52],[53],[54],[55],[56],[57],[58],[59],[60],[61],[62],[63],[64],[65],[66],[67],[68],[69],[70],[71],[72],[73],[74],[75],[76],[77],[78],[79],[80],[81],[82],[83],[84],[85],[86],[87],[88],[89],[90],[91],[92],[93],[94],[95],[96],[97],[98],[99],[100],[101],[102],[103],[104],[105],[106],[107],[108],[109],[110],[111],[112],[113],[114],[115],[116],[117],[118],[119],[120],[121],[122],[123],[124],[125],[126],[127],[128],[129],[130],[131],[132],[133],[134],[135],[136],[137],[138],[139],[140],[141],[142],[143],[144],[145],[146],[147],[148],[149],[150],[151],[152],[153],[154],[155],[156],[157],[158],[159],[160],[161],[162],[163],[164],[165],[166],[167],[168],[169],[170],[171],[172],[173],[174],[175],[176],[177],[178],[179],[180],[181],[182],[183],[184]
,[185],[186],[187],[188],[189],[190],[191],[192],[193],[194],[195],[196],[197],[198],[199],[200],[201],[202],[203],[204],[205],[206],[207],[208],[209],[210],[211],[212],[213],[214],[215],[216],[217],[218],[219],[220],[221],[222],[223],[224],[225],[226],[227],[228],[229],[230],[231],[232],[233],[234],[235],[236],[237],[238],[239],[240],[241],[242],[243],[244],[245],[246],[247],[248],[249],[250],[251],[252],[253],[254],[255],[256],[257],[258],[259],[260],[261],[262],[263],[264],[265],[266],[267],[268],[269],[270],[271],[272],[273],[274],[275],[276],[277],[278],[279],[280],[281],[282],[283],[284],[285],[286],[287],[288],[289],[290],[291],[292],[293],[294],[295],[296],[297],[298],[299],[300],[301],[302],[303],[304],[305],[306],[307],[308],[309],[310],[311],[312],[313],[314],[315],[316],[317],[318],[319],[320],[321],[322],[323],[324],[325],[326],[327],[328],[329],[330],[331],[332],[333],[334],[335],[336],[337],[338],[339],[340],[341],[342],[343],[344],[345],[346],[347],[348],[349],[350],[351],[352],[353],[354],[355],[356],[357],[358],[359],[360],[361],[362],[363],[364],[365],[366],[367],[368],[369],[370],[371],[372],[373],[374],[375],[376],[377],[378],[379],[380],[381],[382],[383] + ] + + } + } + ] +} diff --git a/nhr@fau/job-archive/cluster-meggie.json b/nhr@fau/job-archive/cluster-meggie.json index 8894aa2..28bb959 100644 --- a/nhr@fau/job-archive/cluster-meggie.json +++ b/nhr@fau/job-archive/cluster-meggie.json @@ -1,357 +1,261 @@ { - "name": "meggie", - "metricConfig": [ - { - "name": "cpu_load", - "unit": { - "base": "load" - }, - "scope": "node", - "aggregation": "avg", - "footprint": "avg", - "timestep": 60, - "peak": 40, - "normal": 20, - "caution": 15, - "alert": 10 - }, - { - "name": "mem_used", - "unit": { - "base": "B", - "prefix": "G" - }, - "scope": "node", - "aggregation": "sum", - "footprint": "max", - "timestep": 60, - "peak": 64, - "normal": 20, - "caution": 40, - "alert": 55 - }, - { - "name": "flops_any", - "unit": { - "base": "Flops/s", - "prefix": 
"G" - }, - "scope": "node", - "aggregation": "sum", - "footprint": "avg", - "timestep": 60, - "peak": 1536, - "normal": 200, - "caution": 40, - "alert": 4 - }, - { - "name": "flops_sp", - "unit": { - "base": "Flops/s", - "prefix": "G" - }, - "scope": "node", - "aggregation": "sum", - "timestep": 60, - "peak": 1536, - "normal": 100, - "caution": 20, - "alert": 2 - }, - { - "name": "flops_dp", - "unit": { - "base": "Flops/s", - "prefix": "G" - }, - "scope": "node", - "aggregation": "sum", - "timestep": 60, - "peak": 768, - "normal": 50, - "caution": 10, - "alert": 2 - }, - { - "name": "mem_bw", - "unit": { - "base": "B/s", - "prefix": "G" - }, - "scope": "node", - "aggregation": "sum", - "footprint": "avg", - "timestep": 60, - "peak": 140, - "normal": 70, - "caution": 20, - "alert": 5 - }, - { - "name": "clock", - "unit": { - "base": "Hz", - "prefix": "M" - }, - "scope": "node", - "aggregation": "avg", - "timestep": 60, - "peak": 3000, - "normal": 2400, - "caution": 1800, - "alert": 1200 - }, - { - "name": "cpu_power", - "unit": { - "base": "W" - }, - "scope": "socket", - "aggregation": "sum", - "energy": "power", - "timestep": 60, - "peak": 80, - "normal": 30, - "caution": 10, - "alert": 5 - }, - { - "name": "mem_power", - "unit": { - "base": "W" - }, - "scope": "socket", - "aggregation": "sum", - "energy": "power", - "timestep": 60, - "peak": 100, - "normal": 50, - "caution": 20, - "alert": 10 - }, - { - "name": "ipc", - "unit": { - "base": "IPC" - }, - "scope": "node", - "aggregation": "avg", - "timestep": 60, - "peak": 4, - "normal": 2, - "caution": 1, - "alert": 0.5 - }, - { - "name": "vectorization_ratio", - "unit": { - "base": "" - }, - "scope": "hwthread", - "aggregation": "avg", - "timestep": 60, - "peak": 100, - "normal": 60, - "caution": 40, - "alert": 10 - }, - { - "name": "nfs4_read", - "unit": { - "base": "IOP", - "prefix": "" - }, - "scope": "node", - "aggregation": "sum", - "timestep": 60, - "peak": 6, - "normal": 4, - "caution": 2, - "alert": 1 - }, 
- { - "name": "nfs4_total", - "unit": { - "base": "IOP", - "prefix": "" - }, - "scope": "node", - "aggregation": "sum", - "timestep": 60, - "peak": 6, - "normal": 4, - "caution": 2, - "alert": 1 - } - ], - "subClusters": [ - { - "name": "main", - "nodes": "m[0101-0164,0201-0264,0301-0364,0401-0464,0601-0676,0701-0776,0801-0872,0901-0972,1001-1072,1101-1172]", - "processorType": "Intel Broadwell", - "socketsPerNode": 2, - "coresPerSocket": 10, - "threadsPerCore": 1, - "flopRateScalar": { - "unit": { - "base": "F/s", - "prefix": "G" + "name": "meggie", + "metricConfig": [ + { + "name": "cpu_load", + "unit": { + "base": "load" + }, + "scope": "node", + "aggregation": "avg", + "footprint": "avg", + "timestep": 60, + "peak": 40, + "normal": 20, + "caution": 15, + "alert": 10 }, - "value": 96 - }, - "flopRateSimd": { - "unit": { - "base": "F/s", - "prefix": "G" + { + "name": "mem_used", + "unit": { + "base": "B", + "prefix": "G" + }, + "scope": "node", + "aggregation": "sum", + "footprint": "max", + "timestep": 60, + "peak": 64, + "normal": 20, + "caution": 40, + "alert": 55 }, - "value": 1536 - }, - "memoryBandwidth": { - "unit": { - "base": "B/s", - "prefix": "G" + { + "name": "flops_any", + "unit": { + "base": "Flops/s", + "prefix": "G" + }, + "scope": "node", + "aggregation": "sum", + "footprint": "avg", + "timestep": 60, + "peak": 1536, + "normal": 200, + "caution": 40, + "alert": 4 }, - "value": 140 - }, - "topology": { - "node": [ - 0, - 1, - 2, - 3, - 4, - 5, - 6, - 7, - 8, - 9, - 10, - 11, - 12, - 13, - 14, - 15, - 16, - 17, - 18, - 19 - ], - "socket": [ - [ - 0, - 1, - 2, - 3, - 4, - 5, - 6, - 7, - 8, - 9 - ], - [ - 10, - 11, - 12, - 13, - 14, - 15, - 16, - 17, - 18, - 19 - ] - ], - "memoryDomain": [ - [ - 0, - 1, - 2, - 3, - 4, - 5, - 6, - 7, - 8, - 9 - ], - [ - 10, - 11, - 12, - 13, - 14, - 15, - 16, - 17, - 18, - 19 - ] - ], - "core": [ - [ - 0 - ], - [ - 1 - ], - [ - 2 - ], - [ - 3 - ], - [ - 4 - ], - [ - 5 - ], - [ - 6 - ], - [ - 7 - ], - [ - 8 - ], - [ - 
9 - ], - [ - 10 - ], - [ - 11 - ], - [ - 12 - ], - [ - 13 - ], - [ - 14 - ], - [ - 15 - ], - [ - 16 - ], - [ - 17 - ], - [ - 18 - ], - [ - 19 - ] - ] - } - } - ] + { + "name": "flops_sp", + "unit": { + "base": "Flops/s", + "prefix": "G" + }, + "scope": "node", + "aggregation": "sum", + "timestep": 60, + "peak": 1536, + "normal": 100, + "caution": 20, + "alert": 2 + }, + { + "name": "flops_dp", + "unit": { + "base": "Flops/s", + "prefix": "G" + }, + "scope": "node", + "aggregation": "sum", + "timestep": 60, + "peak": 768, + "normal": 50, + "caution": 10, + "alert": 2 + }, + { + "name": "net_bytes_in", + "unit": { + "base": "B/s" + }, + "scope": "node", + "aggregation": "sum", + "timestep": 60, + "peak": 50000000, + "normal": 10000000, + "caution": 5000, + "alert": 1000 + }, + { + "name": "net_bytes_out", + "unit": { + "base": "B/s" + }, + "scope": "node", + "aggregation": "sum", + "timestep": 60, + "peak": 50000000, + "normal": 200000, + "caution": 5000, + "alert": 1000 + }, + { + "name": "mem_bw", + "unit": { + "base": "B/s", + "prefix": "G" + }, + "scope": "node", + "aggregation": "sum", + "footprint": "avg", + "timestep": 60, + "peak": 140, + "normal": 70, + "caution": 20, + "alert": 5 + }, + { + "name": "clock", + "unit": { + "base": "Hz", + "prefix": "M" + }, + "scope": "node", + "aggregation": "avg", + "timestep": 60, + "peak": 3000, + "normal": 2400, + "caution": 1800, + "alert": 1200 + }, + { + "name": "cpu_power", + "unit": { + "base": "W" + }, + "scope": "socket", + "aggregation": "sum", + "energy": "power", + "timestep": 60, + "peak": 80, + "normal": 30, + "caution": 10, + "alert": 5 + }, + { + "name": "mem_power", + "unit": { + "base": "W" + }, + "scope": "socket", + "aggregation": "sum", + "energy": "power", + "timestep": 60, + "peak": 100, + "normal": 50, + "caution": 20, + "alert": 10 + }, + { + "name": "ipc", + "unit": { + "base": "IPC" + }, + "scope": "node", + "aggregation": "avg", + "timestep": 60, + "peak": 4, + "normal": 2, + "caution": 1, + 
"alert": 0.5 + }, + { + "name": "vectorization_ratio", + "unit": { + "base": "" + }, + "scope": "hwthread", + "aggregation": "avg", + "timestep": 60, + "peak": 100, + "normal": 60, + "caution": 40, + "alert": 10 + }, + { + "name": "nfs4_read", + "unit": { + "base": "IOP", + "prefix": "" + }, + "scope": "node", + "aggregation": "sum", + "timestep": 60, + "peak": 50000, + "normal": 10000, + "caution": 10, + "alert": 1 + }, + { + "name": "nfs4_total", + "unit": { + "base": "IOP", + "prefix": "" + }, + "scope": "node", + "aggregation": "sum", + "timestep": 60, + "peak": 50000, + "normal": 10000, + "caution": 20, + "alert": 5 + } + ], + "subClusters": [ + { + "name": "main", + "nodes": "m[0101-0164,0201-0264,0301-0364,0401-0464,0601-0676,0701-0776,0801-0872,0901-0972,1001-1072,1101-1172]", + "processorType": "Intel Broadwell", + "socketsPerNode": 2, + "coresPerSocket": 10, + "threadsPerCore": 1, + "flopRateScalar": { + "unit": { + "base": "F/s", + "prefix": "G" + }, + "value": 96 + }, + "flopRateSimd": { + "unit": { + "base": "F/s", + "prefix": "G" + }, + "value": 1536 + }, + "memoryBandwidth": { + "unit": { + "base": "B/s", + "prefix": "G" + }, + "value": 140 + }, + "topology": { + "node": [ + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19 + ], + "socket": [ + [ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9 ], + [ 10, 11, 12, 13, 14, 15, 16, 17, 18, 19 ] + ], + "memoryDomain": [ + [ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9 ], + [ 10, 11, 12, 13, 14, 15, 16, 17, 18, 19 ] + ], + "core": [ + [ 0 ], [ 1 ], [ 2 ], [ 3 ], [ 4 ], [ 5 ], [ 6 ], [ 7 ], [ 8 ], [ 9 ], [ 10 ], [ 11 ], [ 12 ], [ 13 ], [ 14 ], [ 15 ], [ 16 ], [ 17 ], [ 18 ], [ 19 ] + ] + } + } + ] } diff --git a/nhr@fau/job-archive/cluster-tinyfat.json b/nhr@fau/job-archive/cluster-tinyfat.json index a2a4bf9..a2e1933 100644 --- a/nhr@fau/job-archive/cluster-tinyfat.json +++ b/nhr@fau/job-archive/cluster-tinyfat.json @@ -178,9 +178,9 @@ "scope": "node", "aggregation": "sum", "timestep": 60, - "peak": 6, - "normal": 4, - 
"caution": 2, + "peak": 50000, + "normal": 10000, + "caution": 10, "alert": 1 }, { @@ -192,9 +192,9 @@ "scope": "node", "aggregation": "sum", "timestep": 60, - "peak": 6, - "normal": 4, - "caution": 2, + "peak": 50000, + "normal": 10000, + "caution": 10, "alert": 1 }, { @@ -206,10 +206,10 @@ "scope": "node", "aggregation": "sum", "timestep": 60, - "peak": 6, - "normal": 4, - "caution": 2, - "alert": 1 + "peak": 50000, + "normal": 10000, + "caution": 20, + "alert": 5 } ], "subClusters": [ diff --git a/nhr@fau/job-archive/cluster-tinygpu.json b/nhr@fau/job-archive/cluster-tinygpu.json index 4e44c5d..724da1e 100644 --- a/nhr@fau/job-archive/cluster-tinygpu.json +++ b/nhr@fau/job-archive/cluster-tinygpu.json @@ -86,6 +86,32 @@ "caution": 100, "alert": 50 }, + { + "name": "net_bytes_in", + "unit": { + "base": "B/s" + }, + "scope": "node", + "aggregation": "sum", + "timestep": 60, + "peak": 50000000, + "normal": 10000000, + "caution": 5000, + "alert": 1000 + }, + { + "name": "net_bytes_out", + "unit": { + "base": "B/s" + }, + "scope": "node", + "aggregation": "sum", + "timestep": 60, + "peak": 50000000, + "normal": 200000, + "caution": 5000, + "alert": 1000 + }, { "name": "mem_bw", "unit": { @@ -197,14 +223,14 @@ "caution": 10000, "alert": 5000, "subClusters": [ - { + { "name": "a100", "peak": 160000, "normal": 120000, "caution": 80000, "alert": 40000 }, - { + { "name": "v100", "peak": 128000, "normal": 96000, @@ -234,6 +260,7 @@ }, "scope": "accelerator", "aggregation": "sum", + "energy": "power", "timestep": 60, "peak": 400, "normal": 200, @@ -290,9 +317,9 @@ "scope": "node", "aggregation": "sum", "timestep": 60, - "peak": 6, - "normal": 4, - "caution": 2, + "peak": 50000, + "normal": 10000, + "caution": 10, "alert": 1 }, { @@ -304,9 +331,9 @@ "scope": "node", "aggregation": "sum", "timestep": 60, - "peak": 6, - "normal": 4, - "caution": 2, + "peak": 50000, + "normal": 10000, + "caution": 10, "alert": 1 }, { @@ -318,10 +345,10 @@ "scope": "node", "aggregation": "sum", 
"timestep": 60, - "peak": 6, - "normal": 4, - "caution": 2, - "alert": 1 + "peak": 50000, + "normal": 10000, + "caution": 20, + "alert": 5 } ], "subClusters": [ diff --git a/nhr@fau/job-archive/cluster-woody.json b/nhr@fau/job-archive/cluster-woody.json index ac66988..3845660 100644 --- a/nhr@fau/job-archive/cluster-woody.json +++ b/nhr@fau/job-archive/cluster-woody.json @@ -1,943 +1,433 @@ { - "name": "woody", - "metricConfig": [ - { - "name": "cpu_load", - "unit": { - "base": "" - }, - "scope": "node", - "aggregation": "avg", - "timestep": 60, - "peak": 4, - "normal": 4, - "caution": 4, - "alert": 1, - "footprint": "avg", - "subClusters": [ + "name": "woody", + "metricConfig": [ { - "name": "icelake", - "peak": 32, - "normal": 32, - "caution": 30, - "footprint": "avg", - "alert": 10 - } - ] - }, - { - "name": "cpu_user", - "unit": { - "base": "" - }, - "scope": "hwthread", - "aggregation": "avg", - "timestep": 60, - "peak": 100, - "normal": 50, - "caution": 20, - "alert": 10 - }, - { - "name": "ipc", - "unit": { - "base": "IPC" - }, - "scope": "node", - "aggregation": "avg", - "timestep": 60, - "peak": 4, - "normal": 2, - "caution": 1, - "alert": 0.25 - }, - { - "name": "mem_used", - "unit": { - "base": "B", - "prefix": "G" - }, - "scope": "node", - "aggregation": "sum", - "footprint": "max", - "timestep": 60, - "peak": 32, - "normal": 16, - "caution": 28, - "alert": 30, - "subClusters": [ + "name": "cpu_load", + "unit": { + "base": "" + }, + "scope": "node", + "aggregation": "avg", + "timestep": 60, + "peak": 4, + "normal": 4, + "caution": 4, + "alert": 1, + "footprint": "avg", + "subClusters": [ + { + "name": "icelake", + "peak": 32, + "normal": 32, + "caution": 30, + "footprint": "avg", + "alert": 10 + } + ] + }, { - "name": "icelake", - "peak": 256, - "footprint": "max", - "normal": 128, - "caution": 200, - "alert": 240 - } - ] - }, - { - "name": "flops_any", - "unit": { - "base": "Flops/s", - "prefix": "G" - }, - "scope": "hwthread", - "aggregation": "sum", 
- "timestep": 60, - "peak": 112, - "normal": 50, - "caution": 20, - "alert": 10, - "footprint": "avg", - "subClusters": [ + "name": "cpu_user", + "unit": { + "base": "" + }, + "scope": "hwthread", + "aggregation": "avg", + "timestep": 60, + "peak": 100, + "normal": 50, + "caution": 20, + "alert": 10 + }, { - "name": "icelake", - "footprint": "avg", - "peak": 2970, - "normal": 1000, - "caution": 100, - "alert": 50 - } - ] - }, - { - "name": "net_bytes_in", - "unit": { - "base": "B/s" - }, - "scope": "node", - "aggregation": "sum", - "timestep": 60, - "peak": 125000000, - "normal": 125000000, - "caution": 200, - "alert": 240 - }, - { - "name": "net_bytes_out", - "unit": { - "base": "B/s" - }, - "scope": "node", - "aggregation": "sum", - "timestep": 60, - "peak": 125000000, - "normal": 125000000, - "caution": 200, - "alert": 240 - }, - { - "name": "flops_dp", - "unit": { - "base": "Flops/s", - "prefix": "G" - }, - "scope": "hwthread", - "aggregation": "sum", - "timestep": 60, - "peak": 56, - "normal": 30, - "caution": 15, - "alert": 5, - "subClusters": [ + "name": "ipc", + "unit": { + "base": "IPC" + }, + "scope": "node", + "aggregation": "avg", + "timestep": 60, + "peak": 4, + "normal": 2, + "caution": 1, + "alert": 0.25 + }, { - "name": "icelake", - "peak": 1450, - "normal": 700, - "caution": 100, - "alert": 50 - } - ] - }, - { - "name": "flops_sp", - "unit": { - "base": "Flops/s", - "prefix": "G" - }, - "scope": "hwthread", - "aggregation": "sum", - "timestep": 60, - "peak": 112, - "normal": 50, - "caution": 20, - "alert": 10, - "subClusters": [ + "name": "mem_used", + "unit": { + "base": "B", + "prefix": "G" + }, + "scope": "node", + "aggregation": "sum", + "footprint": "max", + "timestep": 60, + "lowerIsBetter": true, + "peak": 32, + "normal": 16, + "caution": 28, + "alert": 30, + "subClusters": [ + { + "name": "icelake", + "peak": 256, + "footprint": "max", + "normal": 128, + "caution": 200, + "alert": 240 + } + ] + }, { - "name": "icelake", - "peak": 2970, - 
"normal": 1000, - "caution": 100, - "alert": 50 - } - ] - }, - { - "name": "mem_bw", - "unit": { - "base": "B/s", - "prefix": "G" - }, - "scope": "socket", - "aggregation": "sum", - "timestep": 60, - "peak": 24, - "normal": 10, - "caution": 5, - "alert": 2, - "footprint": "avg", - "subClusters": [ + "name": "flops_any", + "unit": { + "base": "Flops/s", + "prefix": "G" + }, + "scope": "hwthread", + "aggregation": "sum", + "timestep": 60, + "peak": 112, + "normal": 50, + "caution": 20, + "alert": 10, + "footprint":"avg", + "subClusters": [ + { + "name": "icelake", + "footprint":"avg", + "peak": 2970, + "normal": 1000, + "caution": 100, + "alert": 50 + } + ] + }, { - "name": "icelake", - "peak": 350, - "footprint": "avg", - "normal": 100, - "caution": 50, - "alert": 20 + "name": "net_bytes_in", + "unit": { + "base": "B/s" + }, + "scope": "node", + "aggregation": "sum", + "timestep": 60, + "peak": 50000000, + "normal": 10000000, + "caution": 5000, + "alert": 1000 + }, + { + "name": "net_bytes_out", + "unit": { + "base": "B/s" + }, + "scope": "node", + "aggregation": "sum", + "timestep": 60, + "peak": 50000000, + "normal": 200000, + "caution": 5000, + "alert": 1000 + }, + { + "name": "flops_dp", + "unit": { + "base": "Flops/s", + "prefix": "G" + }, + "scope": "hwthread", + "aggregation": "sum", + "timestep": 60, + "peak": 56, + "normal": 30, + "caution": 15, + "alert": 5, + "subClusters": [ + { + "name": "icelake", + "peak": 1450, + "normal": 700, + "caution": 100, + "alert": 50 + } + ] + }, + { + "name": "flops_sp", + "unit": { + "base": "Flops/s", + "prefix": "G" + }, + "scope": "hwthread", + "aggregation": "sum", + "timestep": 60, + "peak": 112, + "normal": 50, + "caution": 20, + "alert": 10, + "subClusters": [ + { + "name": "icelake", + "peak": 2970, + "normal": 1000, + "caution": 100, + "alert": 50 + } + ] + }, + { + "name": "mem_bw", + "unit": { + "base": "B/s", + "prefix": "G" + }, + "scope": "socket", + "aggregation": "sum", + "timestep": 60, + "peak": 24, + 
"normal": 10, + "caution": 5, + "alert": 2, + "footprint":"avg", + "subClusters": [ + { + "name": "icelake", + "peak": 350, + "footprint":"avg", + "normal": 100, + "caution": 50, + "alert": 20 + } + ] + }, + { + "name": "clock", + "unit": { + "base": "Hz", + "prefix": "M" + }, + "scope": "hwthread", + "aggregation": "avg", + "timestep": 60, + "peak": 3000, + "normal": 2000, + "caution": 1500, + "alert": 1200 + }, + { + "name": "vectorization_ratio", + "unit": { + "base": "" + }, + "scope": "hwthread", + "aggregation": "avg", + "timestep": 60, + "peak": 100, + "normal": 60, + "caution": 40, + "alert": 10 + }, + { + "name": "nfs4_read", + "unit": { + "base": "IOP", + "prefix": "" + }, + "scope": "node", + "aggregation": "sum", + "timestep": 60, + "peak": 50000, + "normal": 10000, + "caution": 10, + "alert": 1 + }, + { + "name": "nfs4_total", + "unit": { + "base": "IOP", + "prefix": "" + }, + "scope": "node", + "aggregation": "sum", + "timestep": 60, + "peak": 50000, + "normal": 10000, + "caution": 20, + "alert": 5 } - ] - }, - { - "name": "clock", - "unit": { - "base": "Hz", - "prefix": "M" - }, - "scope": "hwthread", - "aggregation": "avg", - "timestep": 60, - "peak": 3000, - "normal": 2000, - "caution": 1500, - "alert": 1200 - }, - { - "name": "vectorization_ratio", - "unit": { - "base": "" - }, - "scope": "hwthread", - "aggregation": "avg", - "timestep": 60, - "peak": 100, - "normal": 60, - "caution": 40, - "alert": 10 - }, - { - "name": "nfs4_read", - "unit": { - "base": "IOP", - "prefix": "" - }, - "scope": "node", - "aggregation": "sum", - "timestep": 60, - "peak": 6, - "normal": 4, - "caution": 2, - "alert": 1 - }, - { - "name": "nfs4_total", - "unit": { - "base": "IOP", - "prefix": "" - }, - "scope": "node", - "aggregation": "sum", - "timestep": 60, - "peak": 6, - "normal": 4, - "caution": 2, - "alert": 1 - } - ], - "subClusters": [ - { - "name": "haswell", - "nodes": "w11[27-45,49-63,69-72]", - "processorType": "Intel Xeon E3-1240 v3", - "socketsPerNode": 1, 
- "coresPerSocket": 4, - "threadsPerCore": 1, - "flopRateScalar": { - "unit": { - "base": "F/s", - "prefix": "G" + ], + "subClusters": [ + { + "name": "haswell", + "nodes": "w11[27-45,49-63,69-72]", + "processorType": "Intel Xeon E3-1240 v3", + "socketsPerNode": 1, + "coresPerSocket": 4, + "threadsPerCore": 1, + "flopRateScalar": { + "unit": { + "base": "F/s", + "prefix": "G" + }, + "value": 14 + }, + "flopRateSimd": { + "unit": { + "base": "F/s", + "prefix": "G" + }, + "value": 112 + }, + "memoryBandwidth": { + "unit": { + "base": "B/s", + "prefix": "G" + }, + "value": 24 + }, + "topology": { + "node": [ + 0, 1, 2, 3 + ], + "socket": [ + [ 0, 1, 2, 3 ] + ], + "memoryDomain": [ + [ 0, 1, 2, 3 ] + ], + "core": [ + [ 0 ], [ 1 ], [ 2 ], [ 3 ] + ] + } }, - "value": 14 - }, - "flopRateSimd": { - "unit": { - "base": "F/s", - "prefix": "G" + { + "name": "skylake", + "nodes": "w12[01-08],w13[01-31,33-56]", + "processorType": "Intel Xeon E3-1240 v5 ", + "socketsPerNode": 1, + "coresPerSocket": 4, + "threadsPerCore": 1, + "flopRateScalar": { + "unit": { + "base": "F/s", + "prefix": "G" + }, + "value": 14 + }, + "flopRateSimd": { + "unit": { + "base": "F/s", + "prefix": "G" + }, + "value": 112 + }, + "memoryBandwidth": { + "unit": { + "base": "B/s", + "prefix": "G" + }, + "value": 24 + }, + "topology": { + "node": [ + 0, 1, 2, 3 + ], + "socket": [ + [ 0, 1, 2, 3 ] + ], + "memoryDomain": [ + [ 0, 1, 2, 3 ] + ], + "core": [ + [ 0 ], [ 1 ], [ 2 ], [ 3 ] + ] + } }, - "value": 112 - }, - "memoryBandwidth": { - "unit": { - "base": "B/s", - "prefix": "G" + { + "name": "kabylake", + "nodes": "w14[01-56],w15[01-05,07-56]", + "processorType": "Intel Xeon E3-1240 v6", + "socketsPerNode": 1, + "coresPerSocket": 4, + "threadsPerCore": 1, + "flopRateScalar": { + "unit": { + "base": "F/s", + "prefix": "G" + }, + "value": 14 + }, + "flopRateSimd": { + "unit": { + "base": "F/s", + "prefix": "G" + }, + "value": 112 + }, + "memoryBandwidth": { + "unit": { + "base": "B/s", + "prefix": "G" + }, + 
"value": 24 + }, + "topology": { + "node": [ + 0, 1, 2, 3 + ], + "socket": [ + [ 0, 1, 2, 3 ] + ], + "memoryDomain": [ + [ 0, 1, 2, 3 ] + ], + "core": [ + [ 0 ], [ 1 ], [ 2 ], [ 3 ] + ] + } }, - "value": 24 - }, - "topology": { - "node": [ - 0, - 1, - 2, - 3 - ], - "socket": [ - [ - 0, - 1, - 2, - 3 - ] - ], - "memoryDomain": [ - [ - 0, - 1, - 2, - 3 - ] - ], - "core": [ - [ - 0 - ], - [ - 1 - ], - [ - 2 - ], - [ - 3 - ] - ] - } - }, - { - "name": "skylake", - "nodes": "w12[01-08],w13[01-31,33-56]", - "processorType": "Intel Xeon E3-1240 v5 ", - "socketsPerNode": 1, - "coresPerSocket": 4, - "threadsPerCore": 1, - "flopRateScalar": { - "unit": { - "base": "F/s", - "prefix": "G" - }, - "value": 14 - }, - "flopRateSimd": { - "unit": { - "base": "F/s", - "prefix": "G" - }, - "value": 112 - }, - "memoryBandwidth": { - "unit": { - "base": "B/s", - "prefix": "G" - }, - "value": 24 - }, - "topology": { - "node": [ - 0, - 1, - 2, - 3 - ], - "socket": [ - [ - 0, - 1, - 2, - 3 - ] - ], - "memoryDomain": [ - [ - 0, - 1, - 2, - 3 - ] - ], - "core": [ - [ - 0 - ], - [ - 1 - ], - [ - 2 - ], - [ - 3 - ] - ] - } - }, - { - "name": "kabylake", - "nodes": "w14[01-56],w15[01-05,07-56]", - "processorType": "Intel Xeon E3-1240 v6", - "socketsPerNode": 1, - "coresPerSocket": 4, - "threadsPerCore": 1, - "flopRateScalar": { - "unit": { - "base": "F/s", - "prefix": "G" - }, - "value": 14 - }, - "flopRateSimd": { - "unit": { - "base": "F/s", - "prefix": "G" - }, - "value": 112 - }, - "memoryBandwidth": { - "unit": { - "base": "B/s", - "prefix": "G" - }, - "value": 24 - }, - "topology": { - "node": [ - 0, - 1, - 2, - 3 - ], - "socket": [ - [ - 0, - 1, - 2, - 3 - ] - ], - "memoryDomain": [ - [ - 0, - 1, - 2, - 3 - ] - ], - "core": [ - [ - 0 - ], - [ - 1 - ], - [ - 2 - ], - [ - 3 - ] - ] - } - }, - { - "name": "icelake", - "nodes": "w22[01-35],w23[01-35],w24[01-20],w25[01-20]", - "processorType": "Intel Xeon Gold 6326", - "socketsPerNode": 2, - "coresPerSocket": 16, - "threadsPerCore": 1, - 
"flopRateScalar": { - "unit": { - "base": "F/s", - "prefix": "G" - }, - "value": 432 - }, - "flopRateSimd": { - "unit": { - "base": "F/s", - "prefix": "G" - }, - "value": 9216 - }, - "memoryBandwidth": { - "unit": { - "base": "B/s", - "prefix": "G" - }, - "value": 350 - }, - "topology": { - "node": [ - 0, - 1, - 2, - 3, - 4, - 5, - 6, - 7, - 8, - 9, - 10, - 11, - 12, - 13, - 14, - 15, - 16, - 17, - 18, - 19, - 20, - 21, - 22, - 23, - 24, - 25, - 26, - 27, - 28, - 29, - 30, - 31, - 32, - 33, - 34, - 35, - 36, - 37, - 38, - 39, - 40, - 41, - 42, - 43, - 44, - 45, - 46, - 47, - 48, - 49, - 50, - 51, - 52, - 53, - 54, - 55, - 56, - 57, - 58, - 59, - 60, - 61, - 62, - 63, - 64, - 65, - 66, - 67, - 68, - 69, - 70, - 71 - ], - "socket": [ - [ - 0, - 1, - 2, - 3, - 4, - 5, - 6, - 7, - 8, - 9, - 10, - 11, - 12, - 13, - 14, - 15, - 16, - 17, - 18, - 19, - 20, - 21, - 22, - 23, - 24, - 25, - 26, - 27, - 28, - 29, - 30, - 31, - 32, - 33, - 34, - 35 - ], - [ - 36, - 37, - 38, - 39, - 40, - 41, - 42, - 43, - 44, - 45, - 46, - 47, - 48, - 49, - 50, - 51, - 52, - 53, - 54, - 55, - 56, - 57, - 58, - 59, - 60, - 61, - 62, - 63, - 64, - 65, - 66, - 67, - 68, - 69, - 70, - 71 - ] - ], - "memoryDomain": [ - [ - 0, - 1, - 2, - 3, - 4, - 5, - 6, - 7, - 8, - 9, - 10, - 11, - 12, - 13, - 14, - 15, - 16, - 17 - ], - [ - 18, - 19, - 20, - 21, - 22, - 23, - 24, - 25, - 26, - 27, - 28, - 29, - 30, - 31, - 32, - 33, - 34, - 35 - ], - [ - 36, - 37, - 38, - 39, - 40, - 41, - 42, - 43, - 44, - 45, - 46, - 47, - 48, - 49, - 50, - 51, - 52, - 53 - ], - [ - 54, - 55, - 56, - 57, - 58, - 59, - 60, - 61, - 62, - 63, - 64, - 65, - 66, - 67, - 68, - 69, - 70, - 71 - ] - ], - "core": [ - [ - 0 - ], - [ - 1 - ], - [ - 2 - ], - [ - 3 - ], - [ - 4 - ], - [ - 5 - ], - [ - 6 - ], - [ - 7 - ], - [ - 8 - ], - [ - 9 - ], - [ - 10 - ], - [ - 11 - ], - [ - 12 - ], - [ - 13 - ], - [ - 14 - ], - [ - 15 - ], - [ - 16 - ], - [ - 17 - ], - [ - 18 - ], - [ - 19 - ], - [ - 20 - ], - [ - 21 - ], - [ - 22 - ], - [ - 23 - ], 
- [ - 24 - ], - [ - 25 - ], - [ - 26 - ], - [ - 27 - ], - [ - 28 - ], - [ - 29 - ], - [ - 30 - ], - [ - 31 - ], - [ - 32 - ], - [ - 33 - ], - [ - 34 - ], - [ - 35 - ], - [ - 36 - ], - [ - 37 - ], - [ - 38 - ], - [ - 39 - ], - [ - 40 - ], - [ - 41 - ], - [ - 42 - ], - [ - 43 - ], - [ - 44 - ], - [ - 45 - ], - [ - 46 - ], - [ - 47 - ], - [ - 48 - ], - [ - 49 - ], - [ - 50 - ], - [ - 51 - ], - [ - 52 - ], - [ - 53 - ], - [ - 54 - ], - [ - 55 - ], - [ - 56 - ], - [ - 57 - ], - [ - 58 - ], - [ - 59 - ], - [ - 60 - ], - [ - 61 - ], - [ - 62 - ], - [ - 63 - ], - [ - 64 - ], - [ - 65 - ], - [ - 66 - ], - [ - 67 - ], - [ - 68 - ], - [ - 69 - ], - [ - 70 - ], - [ - 71 - ] - ] - } - } - ] + { + "name": "icelake", + "processorType": "Intel(R) Xeon(R) Gold 6326", + "socketsPerNode": 2, + "coresPerSocket": 16, + "threadsPerCore": 1, + "flopRateScalar": { + "unit": { + "base": "F/s", + "prefix": "G" + }, + "value": 78 + }, + "flopRateSimd": { + "unit": { + "base": "F/s", + "prefix": "G" + }, + "value": 1448 + }, + "memoryBandwidth": { + "unit": { + "base": "B/s", + "prefix": "G" + }, + "value": 274 + }, + "nodes": "w22[01-35],w23[01-35],w24[01-20],w25[01-20]", + "topology": { + "node": [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31], + "socket": [ + [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15], + [16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31] + ], + "memoryDomain": [ + [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15], + [16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31] + ], + "core": [ + [0],[1],[2],[3],[4],[5],[6],[7],[8],[9],[10],[11],[12],[13],[14],[15],[16],[17],[18],[19],[20],[21],[22],[23],[24],[25],[26],[27],[28],[29],[30],[31] + ] + + } + } + ] }