mirror of
https://github.com/ClusterCockpit/cc-examples.git
synced 2026-03-17 14:07:30 +01:00
Update config for v1.5.0
This commit is contained in:
@@ -9,25 +9,15 @@ You can find an overview about all clusters
|
||||
|
||||
Some systems run with job exclusive nodes, others have node sharing enabled.
|
||||
There are CPU systems (Fritz, Meggie, Woody, TinyFat) as well as GPU accelerated
|
||||
clusters (Alex, TinyGPU).
|
||||
clusters (Alex, Helma, TinyGPU).
|
||||
|
||||
NHR@FAU uses the following stack:
|
||||
|
||||
* `cc-metric-collector` as node agent
|
||||
* `cc-metric-store` as temporal metric time series cache. We use one instance
|
||||
for all clusters.
|
||||
* `cc-metric-collector`
|
||||
* `cc-backend`
|
||||
* A homegrown Python script running on the management nodes for providing job
|
||||
metadata from Slurm
|
||||
* Built-in SQLite database for job metadata and user data (currently 50 GB in size)
|
||||
* Job Archive without retention using compressed data.json files (around 700GB)
|
||||
* `cc-slurm-adapter`
|
||||
|
||||
Currently all APIs use the regular HTTP protocol, but we plan to switch to NATS for
|
||||
all communication.
|
||||
We also push the metric data to an InfluxDB instance for debugging purposes.
|
||||
|
||||
The backend and metric store run on the same dedicated Dell server running
|
||||
Ubuntu Linux:
|
||||
We use the following server with Ubuntu Linux:
|
||||
|
||||
* Two Intel Xeon(R) Platinum 8352Y with 32 cores each
|
||||
* 512 GB Main memory capacity
|
||||
|
||||
@@ -1,18 +1,20 @@
|
||||
[Unit]
|
||||
Description=ClusterCockpit Web Server
|
||||
Documentation=https://clustercockpit.org
|
||||
Description=ClusterCockpit Backend
|
||||
Documentation=https://github.com/ClusterCockpit/cc-backend
|
||||
Wants=network-online.target
|
||||
After=network-online.target
|
||||
After=mariadb.service mysql.service
|
||||
|
||||
[Service]
|
||||
WorkingDirectory=/opt/monitoring/cc-backend
|
||||
Type=notify
|
||||
User=clustercockpit
|
||||
Group=clustercockpit
|
||||
NotifyAccess=all
|
||||
Restart=on-failure
|
||||
RestartSec=30
|
||||
TimeoutStopSec=100
|
||||
ExecStart=/opt/monitoring/cc-backend/cc-backend -loglevel info -server -config ./config.json
|
||||
TimeoutStartSec=200
|
||||
TimeoutStopSec=200
|
||||
ExecStart=/opt/monitoring/cc-backend/cc-backend -loglevel info -server -config ./config.json
|
||||
|
||||
[Install]
|
||||
WantedBy=multi-user.target
|
||||
|
||||
@@ -1,241 +1,77 @@
|
||||
{
|
||||
"addr": "0.0.0.0:443",
|
||||
"stop-jobs-exceeding-walltime": 288000,
|
||||
"short-running-jobs-duration": 300,
|
||||
"ldap": {
|
||||
"url": "ldaps://hpcldap.rrze.uni-erlangen.de",
|
||||
"user_base": "ou=people,ou=hpc,dc=rrze,dc=uni-erlangen,dc=de",
|
||||
"search_dn": "cn=hpcmonitoring,ou=roadm,ou=profile,ou=hpc,dc=rrze,dc=uni-erlangen,dc=de",
|
||||
"user_bind": "uid={username},ou=people,ou=hpc,dc=rrze,dc=uni-erlangen,dc=de",
|
||||
"user_filter": "(&(objectclass=posixAccount))",
|
||||
"sync_interval": "24h"
|
||||
},
|
||||
"jwts": {
|
||||
"syncUserOnLogin": true,
|
||||
"updateUserOnLogin": true,
|
||||
"trustedIssuer": "https://portal.hpc.fau.de/",
|
||||
"validateUser": false,
|
||||
"max-age": "168h"
|
||||
},
|
||||
"https-cert-file": "/etc/letsencrypt/live/monitoring.nhr.fau.de/fullchain.pem",
|
||||
"https-key-file": "/etc/letsencrypt/live/monitoring.nhr.fau.de/privkey.pem",
|
||||
"user": "clustercockpit",
|
||||
"group": "clustercockpit",
|
||||
"archive": {
|
||||
"kind": "file",
|
||||
"path": "./var/job-archive",
|
||||
"compression": 7,
|
||||
"retention": {
|
||||
"policy": "none"
|
||||
}
|
||||
},
|
||||
"enable-resampling": {
|
||||
"trigger": 30,
|
||||
"resolutions": [
|
||||
600,
|
||||
300,
|
||||
120,
|
||||
60
|
||||
]
|
||||
},
|
||||
"emission-constant": 317,
|
||||
"ui-defaults": {
|
||||
"analysis_view_histogramMetrics": [
|
||||
"flops_any",
|
||||
"mem_bw",
|
||||
"mem_used"
|
||||
],
|
||||
"analysis_view_scatterPlotMetrics": [
|
||||
[
|
||||
"flops_any",
|
||||
"mem_bw"
|
||||
],
|
||||
[
|
||||
"flops_any",
|
||||
"cpu_load"
|
||||
],
|
||||
[
|
||||
"cpu_load",
|
||||
"mem_bw"
|
||||
]
|
||||
],
|
||||
"job_view_nodestats_selectedMetrics": [
|
||||
"flops_any",
|
||||
"mem_bw",
|
||||
"mem_used"
|
||||
],
|
||||
"job_view_polarPlotMetrics": [
|
||||
"flops_any",
|
||||
"mem_bw",
|
||||
"mem_used"
|
||||
],
|
||||
"job_view_selectedMetrics": [
|
||||
"flops_any",
|
||||
"mem_bw",
|
||||
"mem_used"
|
||||
],
|
||||
"job_view_showFootprint": true,
|
||||
"job_list_usePaging": false,
|
||||
"plot_general_colorBackground": true,
|
||||
"plot_general_colorscheme": [
|
||||
"#00bfff",
|
||||
"#0000ff",
|
||||
"#ff00ff",
|
||||
"#ff0000",
|
||||
"#ff8000",
|
||||
"#ffff00",
|
||||
"#80ff00"
|
||||
],
|
||||
"plot_general_lineWidth": 3,
|
||||
"plot_list_jobsPerPage": 10,
|
||||
"plot_list_selectedMetrics": [
|
||||
"cpu_load",
|
||||
"mem_used",
|
||||
"flops_any",
|
||||
"mem_bw"
|
||||
],
|
||||
"plot_view_plotsPerRow": 3,
|
||||
"plot_view_showPolarplot": true,
|
||||
"plot_view_showRoofline": true,
|
||||
"plot_view_showStatTable": true,
|
||||
"system_view_selectedMetric": "cpu_load",
|
||||
"analysis_view_selectedTopEntity": "user",
|
||||
"analysis_view_selectedTopCategory": "totalWalltime",
|
||||
"status_view_selectedTopUserCategory": "totalJobs",
|
||||
"status_view_selectedTopProjectCategory": "totalJobs"
|
||||
},
|
||||
"clusters": [
|
||||
{
|
||||
"name": "fritz",
|
||||
"metricDataRepository": {
|
||||
"kind": "cc-metric-store",
|
||||
"url": "http://localhost:8082",
|
||||
"token": "-"
|
||||
},
|
||||
"filterRanges": {
|
||||
"numNodes": {
|
||||
"from": 1,
|
||||
"to": 64
|
||||
"main": {
|
||||
"addr": "127.0.0.1:8050",
|
||||
"api-allowed-ips": ["*"],
|
||||
"stop-jobs-exceeding-walltime":288000,
|
||||
"short-running-jobs-duration": 300,
|
||||
"resampling": {
|
||||
"minimum-points": 600,
|
||||
"trigger": 180,
|
||||
"resolutions": [240, 60]
|
||||
},
|
||||
"duration": {
|
||||
"from": 0,
|
||||
"to": 86400
|
||||
"nodestate-retention": {
|
||||
"policy": "move",
|
||||
"target-kind": "file",
|
||||
"target-path": "/opt/monitoring/cc-backend/var/nodestate-archive/"
|
||||
},
|
||||
"startTime": {
|
||||
"from": "2022-01-01T00:00:00Z",
|
||||
"to": null
|
||||
}
|
||||
}
|
||||
"emission-constant": 317,
|
||||
"enable-job-taggers": true
|
||||
},
|
||||
{
|
||||
"name": "alex",
|
||||
"metricDataRepository": {
|
||||
"kind": "cc-metric-store",
|
||||
"url": "http://localhost:8082",
|
||||
"token": "-"
|
||||
},
|
||||
"filterRanges": {
|
||||
"numNodes": {
|
||||
"from": 1,
|
||||
"to": 64
|
||||
},
|
||||
"duration": {
|
||||
"from": 0,
|
||||
"to": 86400
|
||||
},
|
||||
"startTime": {
|
||||
"from": "2022-01-01T00:00:00Z",
|
||||
"to": null
|
||||
}
|
||||
}
|
||||
"cron": {
|
||||
"commit-job-worker": "1m",
|
||||
"duration-worker": "5m",
|
||||
"footprint-worker": "10m"
|
||||
},
|
||||
{
|
||||
"name": "woody",
|
||||
"metricDataRepository": {
|
||||
"kind": "cc-metric-store",
|
||||
"url": "http://localhost:8082",
|
||||
"token": "-"
|
||||
},
|
||||
"filterRanges": {
|
||||
"numNodes": {
|
||||
"from": 1,
|
||||
"to": 1
|
||||
},
|
||||
"duration": {
|
||||
"from": 0,
|
||||
"to": 172800
|
||||
},
|
||||
"startTime": {
|
||||
"from": "2020-01-01T00:00:00Z",
|
||||
"to": null
|
||||
}
|
||||
}
|
||||
"archive": {
|
||||
"kind": "file",
|
||||
"path": "./var/job-archive"
|
||||
},
|
||||
{
|
||||
"name": "tinyfat",
|
||||
"metricDataRepository": {
|
||||
"kind": "cc-metric-store",
|
||||
"url": "http://localhost:8082",
|
||||
"token": "-"
|
||||
},
|
||||
"filterRanges": {
|
||||
"numNodes": {
|
||||
"from": 1,
|
||||
"to": 1
|
||||
"auth": {
|
||||
"ldap": {
|
||||
"url": "ldaps://hpcldap.rrze.uni-erlangen.de",
|
||||
"user-base": "ou=people,ou=hpc,dc=rrze,dc=uni-erlangen,dc=de",
|
||||
"search-dn": "cn=hpcmonitoring,ou=roadm,ou=profile,ou=hpc,dc=rrze,dc=uni-erlangen,dc=de",
|
||||
"user-bind": "uid={username},ou=people,ou=hpc,dc=rrze,dc=uni-erlangen,dc=de",
|
||||
"user-filter": "(&(objectclass=posixAccount))",
|
||||
"sync-interval": "24h"
|
||||
},
|
||||
"duration": {
|
||||
"from": 0,
|
||||
"to": 172800
|
||||
},
|
||||
"startTime": {
|
||||
"from": "2020-01-01T00:00:00Z",
|
||||
"to": null
|
||||
"jwts": {
|
||||
"sync-user-on-login": true,
|
||||
"update-user-on-login": true,
|
||||
"validate-user": false,
|
||||
"max-age": "1h",
|
||||
"trusted-issuer": "https://portal.hpc.fau.de/"
|
||||
}
|
||||
}
|
||||
},
|
||||
{
|
||||
"name": "tinygpu",
|
||||
"metricDataRepository": {
|
||||
"kind": "cc-metric-store",
|
||||
"url": "http://localhost:8082",
|
||||
"token": "-"
|
||||
},
|
||||
"filterRanges": {
|
||||
"numNodes": {
|
||||
"from": 1,
|
||||
"to": 1
|
||||
"metric-store": {
|
||||
"checkpoints": {
|
||||
"file-format": "wal",
|
||||
"directory": "./var/metric-checkpoints"
|
||||
},
|
||||
"duration": {
|
||||
"from": 0,
|
||||
"to": 172800
|
||||
"cleanup": {
|
||||
"mode": "archive",
|
||||
"directory": "./var/metric-archive"
|
||||
},
|
||||
"startTime": {
|
||||
"from": "2020-01-01T00:00:00Z",
|
||||
"to": null
|
||||
}
|
||||
}
|
||||
"nats-subscriptions": [
|
||||
{
|
||||
"subscribe-to": "ccmetrics.>"
|
||||
}
|
||||
],
|
||||
"retention-in-memory": "24h",
|
||||
"memory-cap": 200
|
||||
},
|
||||
{
|
||||
"name": "meggie",
|
||||
"metricDataRepository": {
|
||||
"kind": "cc-metric-store",
|
||||
"url": "http://localhost:8082",
|
||||
"token": "-"
|
||||
},
|
||||
"filterRanges": {
|
||||
"numNodes": {
|
||||
"from": 1,
|
||||
"to": 64
|
||||
},
|
||||
"duration": {
|
||||
"from": 0,
|
||||
"to": 86400
|
||||
},
|
||||
"startTime": {
|
||||
"from": "2018-01-01T00:00:00Z",
|
||||
"to": null
|
||||
"archive": {
|
||||
"kind": "file",
|
||||
"path": "./var/job-archive",
|
||||
"compression": 7,
|
||||
"retention": {
|
||||
"policy": "none"
|
||||
}
|
||||
}
|
||||
}
|
||||
]
|
||||
},
|
||||
"nats": {
|
||||
"address": "nats://monitoring.nhr.fau.de:4222",
|
||||
"username": "metricstore",
|
||||
"password": "XXX"
|
||||
},
|
||||
"ui-file": "uiConfig.json"
|
||||
}
|
||||
|
||||
59
nhr@fau/cc-backend/uiConfig.json
Normal file
59
nhr@fau/cc-backend/uiConfig.json
Normal file
@@ -0,0 +1,59 @@
|
||||
{
|
||||
"job-view": {
|
||||
"show-polar-plot":true,
|
||||
"show-footprint": true,
|
||||
"show-roofline": true,
|
||||
"show-stat-table": true
|
||||
},
|
||||
"metric-config": {
|
||||
"job-list-metrics": ["cpu_load", "flops_any", "mem_bw", "mem_used"],
|
||||
"job-view-plot-metrics": ["cpu_load", "cpu_user", "flops_any", "mem_bw", "mem_used", "clock", "ipc", "cpu_power", "nfs4_total"],
|
||||
"job-view-table-metrics": ["mem_bw", "flops_any", "mem_used"],
|
||||
"clusters": [
|
||||
{
|
||||
"name": "fritz",
|
||||
"job-view-plot-metrics": ["cpu_load", "cpu_user", "flops_any", "vectorization_ratio", "mem_bw", "mem_used", "flops_dp", "flops_sp", "ib_recv", "ib_xmit", "clock", "ipc", "cpu_power", "mem_power", "nfs4_total"]
|
||||
},
|
||||
{
|
||||
"name": "alex",
|
||||
"job-list-metrics": ["acc_utilization", "acc_mem_used", "cpu_load", "flops_any", "mem_bw", "mem_used"],
|
||||
"job-view-plot-metrics": ["acc_utilization", "nv_mem_util", "acc_mem_used", "acc_power", "nv_sm_clock", "nv_temp", "cpu_load", "cpu_user", "flops_any", "mem_bw", "mem_used", "clock", "ipc", "cpu_power", "nfs4_total"],
|
||||
"job-view-table-metrics": ["acc_utilization", "mem_bw", "flops_any", "mem_used"]
|
||||
},
|
||||
{
|
||||
"name": "tinygpu",
|
||||
"job-list-metrics": ["acc_utilization", "acc_mem_used", "cpu_load", "flops_any", "mem_bw", "mem_used"],
|
||||
"job-view-plot-metrics": ["acc_utilization", "nv_mem_util", "acc_mem_used", "acc_power", "nv_sm_clock", "nv_temp", "cpu_load", "cpu_user", "flops_any", "mem_bw", "mem_used", "clock", "ipc", "cpu_power", "nfs4_total"],
|
||||
"job-view-table-metrics": ["acc_utilization", "mem_bw", "flops_any", "mem_used"]
|
||||
},
|
||||
{
|
||||
"name": "helma",
|
||||
"job-list-metrics": ["acc_utilization", "acc_mem_used", "cpu_load", "flops_any", "mem_bw", "mem_used"],
|
||||
"job-view-plot-metrics": ["acc_utilization", "nv_mem_util", "acc_mem_used", "acc_power", "nv_sm_clock", "nv_temp", "cpu_load", "cpu_user", "flops_any", "mem_bw", "mem_used", "clock", "ipc", "cpu_power", "ib_recv", "ib_xmit", "nfs4_total"],
|
||||
"job-view-table-metrics": ["acc_utilization", "mem_bw", "flops_any", "mem_used"],
|
||||
"sub-clusters": [
|
||||
{
|
||||
"name": "cpu",
|
||||
"job-list-metrics": ["cpu_load", "flops_any", "mem_bw", "mem_used"],
|
||||
"job-view-plot-metrics": [ "cpu_load", "cpu_user", "flops_any", "mem_bw", "mem_used", "clock", "ipc", "cpu_power", "flops_dp", "flops_sp", "ib_recv", "ib_xmit", "nfs4_total"],
|
||||
"job-view-table-metrics": ["mem_bw", "flops_any", "mem_used"]
|
||||
}
|
||||
]
|
||||
}
|
||||
]
|
||||
},
|
||||
"plot-configuration": {
|
||||
"plots-per-row": 3,
|
||||
"color-background": true,
|
||||
"line-width": 3,
|
||||
"color-scheme": [
|
||||
"#00bfff",
|
||||
"#0000ff",
|
||||
"#ff00ff",
|
||||
"#ff0000",
|
||||
"#ff8000",
|
||||
"#ffff00",
|
||||
"#80ff00"
|
||||
]
|
||||
}
|
||||
}
|
||||
@@ -1,19 +0,0 @@
|
||||
[Unit]
|
||||
Description=ClusterCockpit In-Memory Timeseries Database for Fritz (cc-metric-store)
|
||||
Documentation=https://github.com/ClusterCockpit/cc-metric-store
|
||||
Wants=network-online.target
|
||||
After=network-online.target
|
||||
|
||||
[Service]
|
||||
Type=simple
|
||||
User=clustercockpit
|
||||
Group=clustercockpit
|
||||
Restart=on-failure
|
||||
RestartSec=30
|
||||
TimeoutStopSec=100
|
||||
WorkingDirectory=/opt/monitoring/cc-metric-store/fritz
|
||||
ExecStart=/opt/monitoring/cc-metric-store/repo/cc-metric-store --config ./config.json
|
||||
LimitNOFILE=500000
|
||||
|
||||
[Install]
|
||||
WantedBy=multi-user.target
|
||||
@@ -1,180 +0,0 @@
|
||||
{
|
||||
"metrics": {
|
||||
"clock": {
|
||||
"frequency": 60,
|
||||
"aggregation": "avg"
|
||||
},
|
||||
"cpu_idle": {
|
||||
"frequency": 60,
|
||||
"aggregation": "avg"
|
||||
},
|
||||
"cpu_iowait": {
|
||||
"frequency": 60,
|
||||
"aggregation": "avg"
|
||||
},
|
||||
"cpu_irq": {
|
||||
"frequency": 60,
|
||||
"aggregation": "avg"
|
||||
},
|
||||
"cpu_system": {
|
||||
"frequency": 60,
|
||||
"aggregation": "avg"
|
||||
},
|
||||
"cpu_user": {
|
||||
"frequency": 60,
|
||||
"aggregation": "avg"
|
||||
},
|
||||
"nv_mem_util": {
|
||||
"frequency": 60,
|
||||
"aggregation": "avg"
|
||||
},
|
||||
"nv_temp": {
|
||||
"frequency": 60,
|
||||
"aggregation": "avg"
|
||||
},
|
||||
"nv_sm_clock": {
|
||||
"frequency": 60,
|
||||
"aggregation": "avg"
|
||||
},
|
||||
"acc_utilization": {
|
||||
"frequency": 60,
|
||||
"aggregation": "avg"
|
||||
},
|
||||
"acc_mem_used": {
|
||||
"frequency": 60,
|
||||
"aggregation": "sum"
|
||||
},
|
||||
"acc_power": {
|
||||
"frequency": 60,
|
||||
"aggregation": "sum"
|
||||
},
|
||||
"flops_any": {
|
||||
"frequency": 60,
|
||||
"aggregation": "sum"
|
||||
},
|
||||
"flops_dp": {
|
||||
"frequency": 60,
|
||||
"aggregation": "sum"
|
||||
},
|
||||
"flops_sp": {
|
||||
"frequency": 60,
|
||||
"aggregation": "sum"
|
||||
},
|
||||
"ib_recv": {
|
||||
"frequency": 60,
|
||||
"aggregation": "sum"
|
||||
},
|
||||
"ib_xmit": {
|
||||
"frequency": 60,
|
||||
"aggregation": "sum"
|
||||
},
|
||||
"ib_recv_pkts": {
|
||||
"frequency": 60,
|
||||
"aggregation": "sum"
|
||||
},
|
||||
"ib_xmit_pkts": {
|
||||
"frequency": 60,
|
||||
"aggregation": "sum"
|
||||
},
|
||||
"cpu_power": {
|
||||
"frequency": 60,
|
||||
"aggregation": "sum"
|
||||
},
|
||||
"core_power": {
|
||||
"frequency": 60,
|
||||
"aggregation": "sum"
|
||||
},
|
||||
"mem_power": {
|
||||
"frequency": 60,
|
||||
"aggregation": "sum"
|
||||
},
|
||||
"ipc": {
|
||||
"frequency": 60,
|
||||
"aggregation": "avg"
|
||||
},
|
||||
"cpu_load": {
|
||||
"frequency": 60,
|
||||
"aggregation": null
|
||||
},
|
||||
"lustre_close": {
|
||||
"frequency": 60,
|
||||
"aggregation": null
|
||||
},
|
||||
"lustre_open": {
|
||||
"frequency": 60,
|
||||
"aggregation": null
|
||||
},
|
||||
"lustre_statfs": {
|
||||
"frequency": 60,
|
||||
"aggregation": null
|
||||
},
|
||||
"lustre_read_bytes": {
|
||||
"frequency": 60,
|
||||
"aggregation": null
|
||||
},
|
||||
"lustre_write_bytes": {
|
||||
"frequency": 60,
|
||||
"aggregation": null
|
||||
},
|
||||
"net_bw": {
|
||||
"frequency": 60,
|
||||
"aggregation": null
|
||||
},
|
||||
"file_bw": {
|
||||
"frequency": 60,
|
||||
"aggregation": null
|
||||
},
|
||||
"mem_bw": {
|
||||
"frequency": 60,
|
||||
"aggregation": "sum"
|
||||
},
|
||||
"mem_cached": {
|
||||
"frequency": 60,
|
||||
"aggregation": null
|
||||
},
|
||||
"mem_used": {
|
||||
"frequency": 60,
|
||||
"aggregation": null
|
||||
},
|
||||
"net_bytes_in": {
|
||||
"frequency": 60,
|
||||
"aggregation": null
|
||||
},
|
||||
"net_bytes_out": {
|
||||
"frequency": 60,
|
||||
"aggregation": null
|
||||
},
|
||||
"nfs4_read": {
|
||||
"frequency": 60,
|
||||
"aggregation": null
|
||||
},
|
||||
"nfs4_total": {
|
||||
"frequency": 60,
|
||||
"aggregation": null
|
||||
},
|
||||
"nfs4_write": {
|
||||
"frequency": 60,
|
||||
"aggregation": null
|
||||
},
|
||||
"vectorization_ratio": {
|
||||
"frequency": 60,
|
||||
"aggregation": "avg"
|
||||
}
|
||||
},
|
||||
"checkpoints": {
|
||||
"interval": "12h",
|
||||
"directory": "/opt/monitoring/cc-metric-store/fritz/checkpoints",
|
||||
"restore": "48h"
|
||||
},
|
||||
"archive": {
|
||||
"interval": "50h",
|
||||
"directory": "/opt/monitoring/cc-metric-store/fritz/archive"
|
||||
},
|
||||
"http-api": {
|
||||
"address": "0.0.0.0:8082",
|
||||
"https-cert-file": null,
|
||||
"https-key-file": null
|
||||
},
|
||||
"retention-in-memory": "48h",
|
||||
"jwt-public-key": "-"
|
||||
}
|
||||
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
2073
nhr@fau/job-archive/cluster-helma.json
Normal file
2073
nhr@fau/job-archive/cluster-helma.json
Normal file
File diff suppressed because it is too large
Load Diff
@@ -1,357 +1,261 @@
|
||||
{
|
||||
"name": "meggie",
|
||||
"metricConfig": [
|
||||
{
|
||||
"name": "cpu_load",
|
||||
"unit": {
|
||||
"base": "load"
|
||||
},
|
||||
"scope": "node",
|
||||
"aggregation": "avg",
|
||||
"footprint": "avg",
|
||||
"timestep": 60,
|
||||
"peak": 40,
|
||||
"normal": 20,
|
||||
"caution": 15,
|
||||
"alert": 10
|
||||
},
|
||||
{
|
||||
"name": "mem_used",
|
||||
"unit": {
|
||||
"base": "B",
|
||||
"prefix": "G"
|
||||
},
|
||||
"scope": "node",
|
||||
"aggregation": "sum",
|
||||
"footprint": "max",
|
||||
"timestep": 60,
|
||||
"peak": 64,
|
||||
"normal": 20,
|
||||
"caution": 40,
|
||||
"alert": 55
|
||||
},
|
||||
{
|
||||
"name": "flops_any",
|
||||
"unit": {
|
||||
"base": "Flops/s",
|
||||
"prefix": "G"
|
||||
},
|
||||
"scope": "node",
|
||||
"aggregation": "sum",
|
||||
"footprint": "avg",
|
||||
"timestep": 60,
|
||||
"peak": 1536,
|
||||
"normal": 200,
|
||||
"caution": 40,
|
||||
"alert": 4
|
||||
},
|
||||
{
|
||||
"name": "flops_sp",
|
||||
"unit": {
|
||||
"base": "Flops/s",
|
||||
"prefix": "G"
|
||||
},
|
||||
"scope": "node",
|
||||
"aggregation": "sum",
|
||||
"timestep": 60,
|
||||
"peak": 1536,
|
||||
"normal": 100,
|
||||
"caution": 20,
|
||||
"alert": 2
|
||||
},
|
||||
{
|
||||
"name": "flops_dp",
|
||||
"unit": {
|
||||
"base": "Flops/s",
|
||||
"prefix": "G"
|
||||
},
|
||||
"scope": "node",
|
||||
"aggregation": "sum",
|
||||
"timestep": 60,
|
||||
"peak": 768,
|
||||
"normal": 50,
|
||||
"caution": 10,
|
||||
"alert": 2
|
||||
},
|
||||
{
|
||||
"name": "mem_bw",
|
||||
"unit": {
|
||||
"base": "B/s",
|
||||
"prefix": "G"
|
||||
},
|
||||
"scope": "node",
|
||||
"aggregation": "sum",
|
||||
"footprint": "avg",
|
||||
"timestep": 60,
|
||||
"peak": 140,
|
||||
"normal": 70,
|
||||
"caution": 20,
|
||||
"alert": 5
|
||||
},
|
||||
{
|
||||
"name": "clock",
|
||||
"unit": {
|
||||
"base": "Hz",
|
||||
"prefix": "M"
|
||||
},
|
||||
"scope": "node",
|
||||
"aggregation": "avg",
|
||||
"timestep": 60,
|
||||
"peak": 3000,
|
||||
"normal": 2400,
|
||||
"caution": 1800,
|
||||
"alert": 1200
|
||||
},
|
||||
{
|
||||
"name": "cpu_power",
|
||||
"unit": {
|
||||
"base": "W"
|
||||
},
|
||||
"scope": "socket",
|
||||
"aggregation": "sum",
|
||||
"energy": "power",
|
||||
"timestep": 60,
|
||||
"peak": 80,
|
||||
"normal": 30,
|
||||
"caution": 10,
|
||||
"alert": 5
|
||||
},
|
||||
{
|
||||
"name": "mem_power",
|
||||
"unit": {
|
||||
"base": "W"
|
||||
},
|
||||
"scope": "socket",
|
||||
"aggregation": "sum",
|
||||
"energy": "power",
|
||||
"timestep": 60,
|
||||
"peak": 100,
|
||||
"normal": 50,
|
||||
"caution": 20,
|
||||
"alert": 10
|
||||
},
|
||||
{
|
||||
"name": "ipc",
|
||||
"unit": {
|
||||
"base": "IPC"
|
||||
},
|
||||
"scope": "node",
|
||||
"aggregation": "avg",
|
||||
"timestep": 60,
|
||||
"peak": 4,
|
||||
"normal": 2,
|
||||
"caution": 1,
|
||||
"alert": 0.5
|
||||
},
|
||||
{
|
||||
"name": "vectorization_ratio",
|
||||
"unit": {
|
||||
"base": ""
|
||||
},
|
||||
"scope": "hwthread",
|
||||
"aggregation": "avg",
|
||||
"timestep": 60,
|
||||
"peak": 100,
|
||||
"normal": 60,
|
||||
"caution": 40,
|
||||
"alert": 10
|
||||
},
|
||||
{
|
||||
"name": "nfs4_read",
|
||||
"unit": {
|
||||
"base": "IOP",
|
||||
"prefix": ""
|
||||
},
|
||||
"scope": "node",
|
||||
"aggregation": "sum",
|
||||
"timestep": 60,
|
||||
"peak": 6,
|
||||
"normal": 4,
|
||||
"caution": 2,
|
||||
"alert": 1
|
||||
},
|
||||
{
|
||||
"name": "nfs4_total",
|
||||
"unit": {
|
||||
"base": "IOP",
|
||||
"prefix": ""
|
||||
},
|
||||
"scope": "node",
|
||||
"aggregation": "sum",
|
||||
"timestep": 60,
|
||||
"peak": 6,
|
||||
"normal": 4,
|
||||
"caution": 2,
|
||||
"alert": 1
|
||||
}
|
||||
],
|
||||
"subClusters": [
|
||||
{
|
||||
"name": "main",
|
||||
"nodes": "m[0101-0164,0201-0264,0301-0364,0401-0464,0601-0676,0701-0776,0801-0872,0901-0972,1001-1072,1101-1172]",
|
||||
"processorType": "Intel Broadwell",
|
||||
"socketsPerNode": 2,
|
||||
"coresPerSocket": 10,
|
||||
"threadsPerCore": 1,
|
||||
"flopRateScalar": {
|
||||
"unit": {
|
||||
"base": "F/s",
|
||||
"prefix": "G"
|
||||
"name": "meggie",
|
||||
"metricConfig": [
|
||||
{
|
||||
"name": "cpu_load",
|
||||
"unit": {
|
||||
"base": "load"
|
||||
},
|
||||
"scope": "node",
|
||||
"aggregation": "avg",
|
||||
"footprint": "avg",
|
||||
"timestep": 60,
|
||||
"peak": 40,
|
||||
"normal": 20,
|
||||
"caution": 15,
|
||||
"alert": 10
|
||||
},
|
||||
"value": 96
|
||||
},
|
||||
"flopRateSimd": {
|
||||
"unit": {
|
||||
"base": "F/s",
|
||||
"prefix": "G"
|
||||
{
|
||||
"name": "mem_used",
|
||||
"unit": {
|
||||
"base": "B",
|
||||
"prefix": "G"
|
||||
},
|
||||
"scope": "node",
|
||||
"aggregation": "sum",
|
||||
"footprint": "max",
|
||||
"timestep": 60,
|
||||
"peak": 64,
|
||||
"normal": 20,
|
||||
"caution": 40,
|
||||
"alert": 55
|
||||
},
|
||||
"value": 1536
|
||||
},
|
||||
"memoryBandwidth": {
|
||||
"unit": {
|
||||
"base": "B/s",
|
||||
"prefix": "G"
|
||||
{
|
||||
"name": "flops_any",
|
||||
"unit": {
|
||||
"base": "Flops/s",
|
||||
"prefix": "G"
|
||||
},
|
||||
"scope": "node",
|
||||
"aggregation": "sum",
|
||||
"footprint": "avg",
|
||||
"timestep": 60,
|
||||
"peak": 1536,
|
||||
"normal": 200,
|
||||
"caution": 40,
|
||||
"alert": 4
|
||||
},
|
||||
"value": 140
|
||||
},
|
||||
"topology": {
|
||||
"node": [
|
||||
0,
|
||||
1,
|
||||
2,
|
||||
3,
|
||||
4,
|
||||
5,
|
||||
6,
|
||||
7,
|
||||
8,
|
||||
9,
|
||||
10,
|
||||
11,
|
||||
12,
|
||||
13,
|
||||
14,
|
||||
15,
|
||||
16,
|
||||
17,
|
||||
18,
|
||||
19
|
||||
],
|
||||
"socket": [
|
||||
[
|
||||
0,
|
||||
1,
|
||||
2,
|
||||
3,
|
||||
4,
|
||||
5,
|
||||
6,
|
||||
7,
|
||||
8,
|
||||
9
|
||||
],
|
||||
[
|
||||
10,
|
||||
11,
|
||||
12,
|
||||
13,
|
||||
14,
|
||||
15,
|
||||
16,
|
||||
17,
|
||||
18,
|
||||
19
|
||||
]
|
||||
],
|
||||
"memoryDomain": [
|
||||
[
|
||||
0,
|
||||
1,
|
||||
2,
|
||||
3,
|
||||
4,
|
||||
5,
|
||||
6,
|
||||
7,
|
||||
8,
|
||||
9
|
||||
],
|
||||
[
|
||||
10,
|
||||
11,
|
||||
12,
|
||||
13,
|
||||
14,
|
||||
15,
|
||||
16,
|
||||
17,
|
||||
18,
|
||||
19
|
||||
]
|
||||
],
|
||||
"core": [
|
||||
[
|
||||
0
|
||||
],
|
||||
[
|
||||
1
|
||||
],
|
||||
[
|
||||
2
|
||||
],
|
||||
[
|
||||
3
|
||||
],
|
||||
[
|
||||
4
|
||||
],
|
||||
[
|
||||
5
|
||||
],
|
||||
[
|
||||
6
|
||||
],
|
||||
[
|
||||
7
|
||||
],
|
||||
[
|
||||
8
|
||||
],
|
||||
[
|
||||
9
|
||||
],
|
||||
[
|
||||
10
|
||||
],
|
||||
[
|
||||
11
|
||||
],
|
||||
[
|
||||
12
|
||||
],
|
||||
[
|
||||
13
|
||||
],
|
||||
[
|
||||
14
|
||||
],
|
||||
[
|
||||
15
|
||||
],
|
||||
[
|
||||
16
|
||||
],
|
||||
[
|
||||
17
|
||||
],
|
||||
[
|
||||
18
|
||||
],
|
||||
[
|
||||
19
|
||||
]
|
||||
]
|
||||
}
|
||||
}
|
||||
]
|
||||
{
|
||||
"name": "flops_sp",
|
||||
"unit": {
|
||||
"base": "Flops/s",
|
||||
"prefix": "G"
|
||||
},
|
||||
"scope": "node",
|
||||
"aggregation": "sum",
|
||||
"timestep": 60,
|
||||
"peak": 1536,
|
||||
"normal": 100,
|
||||
"caution": 20,
|
||||
"alert": 2
|
||||
},
|
||||
{
|
||||
"name": "flops_dp",
|
||||
"unit": {
|
||||
"base": "Flops/s",
|
||||
"prefix": "G"
|
||||
},
|
||||
"scope": "node",
|
||||
"aggregation": "sum",
|
||||
"timestep": 60,
|
||||
"peak": 768,
|
||||
"normal": 50,
|
||||
"caution": 10,
|
||||
"alert": 2
|
||||
},
|
||||
{
|
||||
"name": "net_bytes_in",
|
||||
"unit": {
|
||||
"base": "B/s"
|
||||
},
|
||||
"scope": "node",
|
||||
"aggregation": "sum",
|
||||
"timestep": 60,
|
||||
"peak": 50000000,
|
||||
"normal": 10000000,
|
||||
"caution": 5000,
|
||||
"alert": 1000
|
||||
},
|
||||
{
|
||||
"name": "net_bytes_out",
|
||||
"unit": {
|
||||
"base": "B/s"
|
||||
},
|
||||
"scope": "node",
|
||||
"aggregation": "sum",
|
||||
"timestep": 60,
|
||||
"peak": 50000000,
|
||||
"normal": 200000,
|
||||
"caution": 5000,
|
||||
"alert": 1000
|
||||
},
|
||||
{
|
||||
"name": "mem_bw",
|
||||
"unit": {
|
||||
"base": "B/s",
|
||||
"prefix": "G"
|
||||
},
|
||||
"scope": "node",
|
||||
"aggregation": "sum",
|
||||
"footprint": "avg",
|
||||
"timestep": 60,
|
||||
"peak": 140,
|
||||
"normal": 70,
|
||||
"caution": 20,
|
||||
"alert": 5
|
||||
},
|
||||
{
|
||||
"name": "clock",
|
||||
"unit": {
|
||||
"base": "Hz",
|
||||
"prefix": "M"
|
||||
},
|
||||
"scope": "node",
|
||||
"aggregation": "avg",
|
||||
"timestep": 60,
|
||||
"peak": 3000,
|
||||
"normal": 2400,
|
||||
"caution": 1800,
|
||||
"alert": 1200
|
||||
},
|
||||
{
|
||||
"name": "cpu_power",
|
||||
"unit": {
|
||||
"base": "W"
|
||||
},
|
||||
"scope": "socket",
|
||||
"aggregation": "sum",
|
||||
"energy": "power",
|
||||
"timestep": 60,
|
||||
"peak": 80,
|
||||
"normal": 30,
|
||||
"caution": 10,
|
||||
"alert": 5
|
||||
},
|
||||
{
|
||||
"name": "mem_power",
|
||||
"unit": {
|
||||
"base": "W"
|
||||
},
|
||||
"scope": "socket",
|
||||
"aggregation": "sum",
|
||||
"energy": "power",
|
||||
"timestep": 60,
|
||||
"peak": 100,
|
||||
"normal": 50,
|
||||
"caution": 20,
|
||||
"alert": 10
|
||||
},
|
||||
{
|
||||
"name": "ipc",
|
||||
"unit": {
|
||||
"base": "IPC"
|
||||
},
|
||||
"scope": "node",
|
||||
"aggregation": "avg",
|
||||
"timestep": 60,
|
||||
"peak": 4,
|
||||
"normal": 2,
|
||||
"caution": 1,
|
||||
"alert": 0.5
|
||||
},
|
||||
{
|
||||
"name": "vectorization_ratio",
|
||||
"unit": {
|
||||
"base": ""
|
||||
},
|
||||
"scope": "hwthread",
|
||||
"aggregation": "avg",
|
||||
"timestep": 60,
|
||||
"peak": 100,
|
||||
"normal": 60,
|
||||
"caution": 40,
|
||||
"alert": 10
|
||||
},
|
||||
{
|
||||
"name": "nfs4_read",
|
||||
"unit": {
|
||||
"base": "IOP",
|
||||
"prefix": ""
|
||||
},
|
||||
"scope": "node",
|
||||
"aggregation": "sum",
|
||||
"timestep": 60,
|
||||
"peak": 50000,
|
||||
"normal": 10000,
|
||||
"caution": 10,
|
||||
"alert": 1
|
||||
},
|
||||
{
|
||||
"name": "nfs4_total",
|
||||
"unit": {
|
||||
"base": "IOP",
|
||||
"prefix": ""
|
||||
},
|
||||
"scope": "node",
|
||||
"aggregation": "sum",
|
||||
"timestep": 60,
|
||||
"peak": 50000,
|
||||
"normal": 10000,
|
||||
"caution": 20,
|
||||
"alert": 5
|
||||
}
|
||||
],
|
||||
"subClusters": [
|
||||
{
|
||||
"name": "main",
|
||||
"nodes": "m[0101-0164,0201-0264,0301-0364,0401-0464,0601-0676,0701-0776,0801-0872,0901-0972,1001-1072,1101-1172]",
|
||||
"processorType": "Intel Broadwell",
|
||||
"socketsPerNode": 2,
|
||||
"coresPerSocket": 10,
|
||||
"threadsPerCore": 1,
|
||||
"flopRateScalar": {
|
||||
"unit": {
|
||||
"base": "F/s",
|
||||
"prefix": "G"
|
||||
},
|
||||
"value": 96
|
||||
},
|
||||
"flopRateSimd": {
|
||||
"unit": {
|
||||
"base": "F/s",
|
||||
"prefix": "G"
|
||||
},
|
||||
"value": 1536
|
||||
},
|
||||
"memoryBandwidth": {
|
||||
"unit": {
|
||||
"base": "B/s",
|
||||
"prefix": "G"
|
||||
},
|
||||
"value": 140
|
||||
},
|
||||
"topology": {
|
||||
"node": [
|
||||
0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19
|
||||
],
|
||||
"socket": [
|
||||
[ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9 ],
|
||||
[ 10, 11, 12, 13, 14, 15, 16, 17, 18, 19 ]
|
||||
],
|
||||
"memoryDomain": [
|
||||
[ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9 ],
|
||||
[ 10, 11, 12, 13, 14, 15, 16, 17, 18, 19 ]
|
||||
],
|
||||
"core": [
|
||||
[ 0 ], [ 1 ], [ 2 ], [ 3 ], [ 4 ], [ 5 ], [ 6 ], [ 7 ], [ 8 ], [ 9 ], [ 10 ], [ 11 ], [ 12 ], [ 13 ], [ 14 ], [ 15 ], [ 16 ], [ 17 ], [ 18 ], [ 19 ]
|
||||
]
|
||||
}
|
||||
}
|
||||
]
|
||||
}
|
||||
|
||||
@@ -178,9 +178,9 @@
|
||||
"scope": "node",
|
||||
"aggregation": "sum",
|
||||
"timestep": 60,
|
||||
"peak": 6,
|
||||
"normal": 4,
|
||||
"caution": 2,
|
||||
"peak": 50000,
|
||||
"normal": 10000,
|
||||
"caution": 10,
|
||||
"alert": 1
|
||||
},
|
||||
{
|
||||
@@ -192,9 +192,9 @@
|
||||
"scope": "node",
|
||||
"aggregation": "sum",
|
||||
"timestep": 60,
|
||||
"peak": 6,
|
||||
"normal": 4,
|
||||
"caution": 2,
|
||||
"peak": 50000,
|
||||
"normal": 10000,
|
||||
"caution": 10,
|
||||
"alert": 1
|
||||
},
|
||||
{
|
||||
@@ -206,10 +206,10 @@
|
||||
"scope": "node",
|
||||
"aggregation": "sum",
|
||||
"timestep": 60,
|
||||
"peak": 6,
|
||||
"normal": 4,
|
||||
"caution": 2,
|
||||
"alert": 1
|
||||
"peak": 50000,
|
||||
"normal": 10000,
|
||||
"caution": 20,
|
||||
"alert": 5
|
||||
}
|
||||
],
|
||||
"subClusters": [
|
||||
|
||||
@@ -86,6 +86,32 @@
|
||||
"caution": 100,
|
||||
"alert": 50
|
||||
},
|
||||
{
|
||||
"name": "net_bytes_in",
|
||||
"unit": {
|
||||
"base": "B/s"
|
||||
},
|
||||
"scope": "node",
|
||||
"aggregation": "sum",
|
||||
"timestep": 60,
|
||||
"peak": 50000000,
|
||||
"normal": 10000000,
|
||||
"caution": 5000,
|
||||
"alert": 1000
|
||||
},
|
||||
{
|
||||
"name": "net_bytes_out",
|
||||
"unit": {
|
||||
"base": "B/s"
|
||||
},
|
||||
"scope": "node",
|
||||
"aggregation": "sum",
|
||||
"timestep": 60,
|
||||
"peak": 50000000,
|
||||
"normal": 200000,
|
||||
"caution": 5000,
|
||||
"alert": 1000
|
||||
},
|
||||
{
|
||||
"name": "mem_bw",
|
||||
"unit": {
|
||||
@@ -197,14 +223,14 @@
|
||||
"caution": 10000,
|
||||
"alert": 5000,
|
||||
"subClusters": [
|
||||
{
|
||||
{
|
||||
"name": "a100",
|
||||
"peak": 160000,
|
||||
"normal": 120000,
|
||||
"caution": 80000,
|
||||
"alert": 40000
|
||||
},
|
||||
{
|
||||
{
|
||||
"name": "v100",
|
||||
"peak": 128000,
|
||||
"normal": 96000,
|
||||
@@ -234,6 +260,7 @@
|
||||
},
|
||||
"scope": "accelerator",
|
||||
"aggregation": "sum",
|
||||
"energy": "power",
|
||||
"timestep": 60,
|
||||
"peak": 400,
|
||||
"normal": 200,
|
||||
@@ -290,9 +317,9 @@
|
||||
"scope": "node",
|
||||
"aggregation": "sum",
|
||||
"timestep": 60,
|
||||
"peak": 6,
|
||||
"normal": 4,
|
||||
"caution": 2,
|
||||
"peak": 50000,
|
||||
"normal": 10000,
|
||||
"caution": 10,
|
||||
"alert": 1
|
||||
},
|
||||
{
|
||||
@@ -304,9 +331,9 @@
|
||||
"scope": "node",
|
||||
"aggregation": "sum",
|
||||
"timestep": 60,
|
||||
"peak": 6,
|
||||
"normal": 4,
|
||||
"caution": 2,
|
||||
"peak": 50000,
|
||||
"normal": 10000,
|
||||
"caution": 10,
|
||||
"alert": 1
|
||||
},
|
||||
{
|
||||
@@ -318,10 +345,10 @@
|
||||
"scope": "node",
|
||||
"aggregation": "sum",
|
||||
"timestep": 60,
|
||||
"peak": 6,
|
||||
"normal": 4,
|
||||
"caution": 2,
|
||||
"alert": 1
|
||||
"peak": 50000,
|
||||
"normal": 10000,
|
||||
"caution": 20,
|
||||
"alert": 5
|
||||
}
|
||||
],
|
||||
"subClusters": [
|
||||
|
||||
File diff suppressed because it is too large
Load Diff
Reference in New Issue
Block a user