8 Commits

Author SHA1 Message Date
Michael Panzlaff
238b49bc21 add cc-slurm-adapter examples 2026-03-04 17:25:26 +01:00
Michael Panzlaff
59aa0a09bb cc-metric-collector: update woody 2026-03-04 17:13:34 +01:00
Michael Panzlaff
ae65147c77 cc-metric-collector: update tinygpu 2026-03-04 17:13:28 +01:00
Michael Panzlaff
e28a9aafd5 cc-metric-collector: update tinyfat 2026-03-04 17:13:17 +01:00
Michael Panzlaff
95d1f1d5e9 cc-metric-collector: update fritz.spr 2026-03-04 17:01:41 +01:00
Michael Panzlaff
771d7964df cc-metric-collector: update fritz 2026-03-04 16:57:28 +01:00
Michael Panzlaff
7d0f09ecb9 cc-metric-collector: update alex 2026-03-04 16:48:30 +01:00
Jan Eitzinger
bd22bfe5e6 Update config for v1.5.0 2026-03-04 14:57:11 +01:00
57 changed files with 5920 additions and 7451 deletions

View File

@@ -9,25 +9,15 @@ You can find an overview about all clusters
Some systems run with job exclusive nodes, others have node sharing enabled. Some systems run with job exclusive nodes, others have node sharing enabled.
There are CPU systems (Fritz, Meggie, Woody, TinyFat) as well as GPU accelerated There are CPU systems (Fritz, Meggie, Woody, TinyFat) as well as GPU accelerated
clusters (Alex, TinyGPU). clusters (Alex, Helma, TinyGPU).
NHR@FAU uses the following stack: NHR@FAU uses the following stack:
* `cc-metric-collector` as node agent * `cc-metric-collector`
* `cc-metric-store` as temporal metric time series cache. We use one instance
for all clusters.
* `cc-backend` * `cc-backend`
* A homegrown python script running on the management nodes for providing job * `cc-slurm-adapter`
meta data from Slurm
* Builtin sqlite database for job meta and user data (currently 50GB large)
* Job Archive without retention using compressed data.json files (around 700GB)
Currently all API use regular HTTP protocol, but we plan to switch to NATS for We use the following server with Ubuntu Linux:
all communication.
We also push the metric data to an InfluxDB instance for debugging purposes.
The backend and metric store run on the same dedicated Dell server running
Ubuntu Linux:
* Two Intel Xeon(R) Platinum 8352Y with 32 cores each * Two Intel Xeon(R) Platinum 8352Y with 32 cores each
* 512 GB Main memory capacity * 512 GB Main memory capacity

View File

@@ -1,18 +1,20 @@
[Unit] [Unit]
Description=ClusterCockpit Web Server Description=ClusterCockpit Backend
Documentation=https://clustercockpit.org Documentation=https://github.com/ClusterCockpit/cc-backend
Wants=network-online.target Wants=network-online.target
After=network-online.target After=network-online.target
After=mariadb.service mysql.service
[Service] [Service]
WorkingDirectory=/opt/monitoring/cc-backend WorkingDirectory=/opt/monitoring/cc-backend
Type=notify Type=notify
User=clustercockpit
Group=clustercockpit
NotifyAccess=all NotifyAccess=all
Restart=on-failure Restart=on-failure
RestartSec=30 RestartSec=30
TimeoutStopSec=100 TimeoutStartSec=200
ExecStart=/opt/monitoring/cc-backend/cc-backend -loglevel info -server -config ./config.json TimeoutStopSec=200
ExecStart=/opt/monitoring/cc-backend/cc-backend -loglevel info -server -config ./config.json
[Install] [Install]
WantedBy=multi-user.target WantedBy=multi-user.target

View File

@@ -1,241 +1,77 @@
{ {
"addr": "0.0.0.0:443", "main": {
"stop-jobs-exceeding-walltime": 288000, "addr": "127.0.0.1:8050",
"short-running-jobs-duration": 300, "api-allowed-ips": ["*"],
"ldap": { "stop-jobs-exceeding-walltime":288000,
"url": "ldaps://hpcldap.rrze.uni-erlangen.de", "short-running-jobs-duration": 300,
"user_base": "ou=people,ou=hpc,dc=rrze,dc=uni-erlangen,dc=de", "resampling": {
"search_dn": "cn=hpcmonitoring,ou=roadm,ou=profile,ou=hpc,dc=rrze,dc=uni-erlangen,dc=de", "minimum-points": 600,
"user_bind": "uid={username},ou=people,ou=hpc,dc=rrze,dc=uni-erlangen,dc=de", "trigger": 180,
"user_filter": "(&(objectclass=posixAccount))", "resolutions": [240, 60]
"sync_interval": "24h"
},
"jwts": {
"syncUserOnLogin": true,
"updateUserOnLogin": true,
"trustedIssuer": "https://portal.hpc.fau.de/",
"validateUser": false,
"max-age": "168h"
},
"https-cert-file": "/etc/letsencrypt/live/monitoring.nhr.fau.de/fullchain.pem",
"https-key-file": "/etc/letsencrypt/live/monitoring.nhr.fau.de/privkey.pem",
"user": "clustercockpit",
"group": "clustercockpit",
"archive": {
"kind": "file",
"path": "./var/job-archive",
"compression": 7,
"retention": {
"policy": "none"
}
},
"enable-resampling": {
"trigger": 30,
"resolutions": [
600,
300,
120,
60
]
},
"emission-constant": 317,
"ui-defaults": {
"analysis_view_histogramMetrics": [
"flops_any",
"mem_bw",
"mem_used"
],
"analysis_view_scatterPlotMetrics": [
[
"flops_any",
"mem_bw"
],
[
"flops_any",
"cpu_load"
],
[
"cpu_load",
"mem_bw"
]
],
"job_view_nodestats_selectedMetrics": [
"flops_any",
"mem_bw",
"mem_used"
],
"job_view_polarPlotMetrics": [
"flops_any",
"mem_bw",
"mem_used"
],
"job_view_selectedMetrics": [
"flops_any",
"mem_bw",
"mem_used"
],
"job_view_showFootprint": true,
"job_list_usePaging": false,
"plot_general_colorBackground": true,
"plot_general_colorscheme": [
"#00bfff",
"#0000ff",
"#ff00ff",
"#ff0000",
"#ff8000",
"#ffff00",
"#80ff00"
],
"plot_general_lineWidth": 3,
"plot_list_jobsPerPage": 10,
"plot_list_selectedMetrics": [
"cpu_load",
"mem_used",
"flops_any",
"mem_bw"
],
"plot_view_plotsPerRow": 3,
"plot_view_showPolarplot": true,
"plot_view_showRoofline": true,
"plot_view_showStatTable": true,
"system_view_selectedMetric": "cpu_load",
"analysis_view_selectedTopEntity": "user",
"analysis_view_selectedTopCategory": "totalWalltime",
"status_view_selectedTopUserCategory": "totalJobs",
"status_view_selectedTopProjectCategory": "totalJobs"
},
"clusters": [
{
"name": "fritz",
"metricDataRepository": {
"kind": "cc-metric-store",
"url": "http://localhost:8082",
"token": "-"
},
"filterRanges": {
"numNodes": {
"from": 1,
"to": 64
}, },
"duration": { "nodestate-retention": {
"from": 0, "policy": "move",
"to": 86400 "target-kind": "file",
"target-path": "/opt/monitoring/cc-backend/var/nodestate-archive/"
}, },
"startTime": { "emission-constant": 317,
"from": "2022-01-01T00:00:00Z", "enable-job-taggers": true
"to": null
}
}
}, },
{ "cron": {
"name": "alex", "commit-job-worker": "1m",
"metricDataRepository": { "duration-worker": "5m",
"kind": "cc-metric-store", "footprint-worker": "10m"
"url": "http://localhost:8082",
"token": "-"
},
"filterRanges": {
"numNodes": {
"from": 1,
"to": 64
},
"duration": {
"from": 0,
"to": 86400
},
"startTime": {
"from": "2022-01-01T00:00:00Z",
"to": null
}
}
}, },
{ "archive": {
"name": "woody", "kind": "file",
"metricDataRepository": { "path": "./var/job-archive"
"kind": "cc-metric-store",
"url": "http://localhost:8082",
"token": "-"
},
"filterRanges": {
"numNodes": {
"from": 1,
"to": 1
},
"duration": {
"from": 0,
"to": 172800
},
"startTime": {
"from": "2020-01-01T00:00:00Z",
"to": null
}
}
}, },
{ "auth": {
"name": "tinyfat", "ldap": {
"metricDataRepository": { "url": "ldaps://hpcldap.rrze.uni-erlangen.de",
"kind": "cc-metric-store", "user-base": "ou=people,ou=hpc,dc=rrze,dc=uni-erlangen,dc=de",
"url": "http://localhost:8082", "search-dn": "cn=hpcmonitoring,ou=roadm,ou=profile,ou=hpc,dc=rrze,dc=uni-erlangen,dc=de",
"token": "-" "user-bind": "uid={username},ou=people,ou=hpc,dc=rrze,dc=uni-erlangen,dc=de",
}, "user-filter": "(&(objectclass=posixAccount))",
"filterRanges": { "sync-interval": "24h"
"numNodes": {
"from": 1,
"to": 1
}, },
"duration": { "jwts": {
"from": 0, "sync-user-on-login": true,
"to": 172800 "update-user-on-login": true,
}, "validate-user": false,
"startTime": { "max-age": "1h",
"from": "2020-01-01T00:00:00Z", "trusted-issuer": "https://portal.hpc.fau.de/"
"to": null
} }
}
}, },
{ "metric-store": {
"name": "tinygpu", "checkpoints": {
"metricDataRepository": { "file-format": "wal",
"kind": "cc-metric-store", "directory": "./var/metric-checkpoints"
"url": "http://localhost:8082",
"token": "-"
},
"filterRanges": {
"numNodes": {
"from": 1,
"to": 1
}, },
"duration": { "cleanup": {
"from": 0, "mode": "archive",
"to": 172800 "directory": "./var/metric-archive"
}, },
"startTime": { "nats-subscriptions": [
"from": "2020-01-01T00:00:00Z", {
"to": null "subscribe-to": "ccmetrics.>"
} }
} ],
"retention-in-memory": "24h",
"memory-cap": 200
}, },
{ "archive": {
"name": "meggie", "kind": "file",
"metricDataRepository": { "path": "./var/job-archive",
"kind": "cc-metric-store", "compression": 7,
"url": "http://localhost:8082", "retention": {
"token": "-" "policy": "none"
},
"filterRanges": {
"numNodes": {
"from": 1,
"to": 64
},
"duration": {
"from": 0,
"to": 86400
},
"startTime": {
"from": "2018-01-01T00:00:00Z",
"to": null
} }
} },
} "nats": {
] "address": "nats://monitoring.nhr.fau.de:4222",
"username": "metricstore",
"password": "XXX"
},
"ui-file": "uiConfig.json"
} }

View File

@@ -0,0 +1,59 @@
{
  "job-view": {
    "show-polar-plot": true,
    "show-footprint": true,
    "show-roofline": true,
    "show-stat-table": true
  },
  "metric-config": {
    "job-list-metrics": ["cpu_load", "flops_any", "mem_bw", "mem_used"],
    "job-view-plot-metrics": ["cpu_load", "cpu_user", "flops_any", "mem_bw", "mem_used", "clock", "ipc", "cpu_power", "nfs4_total"],
    "job-view-table-metrics": ["mem_bw", "flops_any", "mem_used"],
    "clusters": [
      {
        "name": "fritz",
        "job-view-plot-metrics": ["cpu_load", "cpu_user", "flops_any", "vectorization_ratio", "mem_bw", "mem_used", "flops_dp", "flops_sp", "ib_recv", "ib_xmit", "clock", "ipc", "cpu_power", "mem_power", "nfs4_total"]
      },
      {
        "name": "alex",
        "job-list-metrics": ["acc_utilization", "acc_mem_used", "cpu_load", "flops_any", "mem_bw", "mem_used"],
        "job-view-plot-metrics": ["acc_utilization", "nv_mem_util", "acc_mem_used", "acc_power", "nv_sm_clock", "nv_temp", "cpu_load", "cpu_user", "flops_any", "mem_bw", "mem_used", "clock", "ipc", "cpu_power", "nfs4_total"],
        "job-view-table-metrics": ["acc_utilization", "mem_bw", "flops_any", "mem_used"]
      },
      {
        "name": "tinygpu",
        "job-list-metrics": ["acc_utilization", "acc_mem_used", "cpu_load", "flops_any", "mem_bw", "mem_used"],
        "job-view-plot-metrics": ["acc_utilization", "nv_mem_util", "acc_mem_used", "acc_power", "nv_sm_clock", "nv_temp", "cpu_load", "cpu_user", "flops_any", "mem_bw", "mem_used", "clock", "ipc", "cpu_power", "nfs4_total"],
        "job-view-table-metrics": ["acc_utilization", "mem_bw", "flops_any", "mem_used"]
      },
      {
        "name": "helma",
        "job-list-metrics": ["acc_utilization", "acc_mem_used", "cpu_load", "flops_any", "mem_bw", "mem_used"],
        "job-view-plot-metrics": ["acc_utilization", "nv_mem_util", "acc_mem_used", "acc_power", "nv_sm_clock", "nv_temp", "cpu_load", "cpu_user", "flops_any", "mem_bw", "mem_used", "clock", "ipc", "cpu_power", "ib_recv", "ib_xmit", "nfs4_total"],
        "job-view-table-metrics": ["acc_utilization", "mem_bw", "flops_any", "mem_used"],
        "sub-clusters": [
          {
            "name": "cpu",
            "job-list-metrics": ["cpu_load", "flops_any", "mem_bw", "mem_used"],
            "job-view-plot-metrics": ["cpu_load", "cpu_user", "flops_any", "mem_bw", "mem_used", "clock", "ipc", "cpu_power", "flops_dp", "flops_sp", "ib_recv", "ib_xmit", "nfs4_total"],
            "job-view-table-metrics": ["mem_bw", "flops_any", "mem_used"]
          }
        ]
      }
    ]
  },
  "plot-configuration": {
    "plots-per-row": 3,
    "color-background": true,
    "line-width": 3,
    "color-scheme": [
      "#00bfff",
      "#0000ff",
      "#ff00ff",
      "#ff0000",
      "#ff8000",
      "#ffff00",
      "#80ff00"
    ]
  }
}

View File

@@ -27,15 +27,11 @@
"send_derived_values": true "send_derived_values": true
}, },
"diskstat" : {}, "diskstat" : {},
"iostat" : {}, "nfsiostat" : {},
"ibstat" : { "ibstat" : {
"send_abs_values": true, "send_abs_values": true,
"send_derived_values": true "send_derived_values": true
}, },
"ipmistat" : {
"send_abs_values": true,
"send_derived_values": true
},
"tempstat" : { "tempstat" : {
"tag_override" : { "tag_override" : {
"hwmon0" : { "hwmon0" : {

View File

@@ -1,8 +1,10 @@
{ {
"sinks": "/etc/cc-metric-collector/sinks.json", "sinks-file": "/etc/cc-metric-collector/sinks.json",
"collectors" : "/etc/cc-metric-collector/collectors.json", "collectors-file" : "/etc/cc-metric-collector/collectors.json",
"receivers" : "/etc/cc-metric-collector/receivers.json", "receivers-file" : "/etc/cc-metric-collector/receivers.json",
"router" : "/etc/cc-metric-collector/router.json", "router-file" : "/etc/cc-metric-collector/router.json",
"interval": "60s", "main" : {
"duration": "10s" "interval": "60s",
"duration": "10s"
}
} }

View File

@@ -1,58 +1,60 @@
{ {
"add_tags" : [ "process_messages" : {
{ "rename_messages" : {
"key" : "cluster", "load_one" : "cpu_load",
"value" : "alex", "cpu_load_core" : "cpu_load",
"if" : "*" "net_bytes_in_bw" : "net_bytes_in",
} "net_bytes_out_bw" : "net_bytes_out",
], "net_pkts_in_bw" : "net_pkts_in",
"rename_metrics" : { "net_pkts_out_bw" : "net_pkts_out",
"load_one" : "cpu_load", "ib_recv_bw" : "ib_recv",
"cpu_load_core" : "cpu_load", "ib_xmit_bw" : "ib_xmit",
"net_bytes_in_bw" : "net_bytes_in", "ib_recv_pkts_bw": "ib_recv_pkts",
"net_bytes_out_bw" : "net_bytes_out", "ib_xmit_pkts_bw": "ib_xmit_pkts",
"net_pkts_in_bw" : "net_pkts_in", "lustre_read_bytes_diff" : "lustre_read_bytes",
"net_pkts_out_bw" : "net_pkts_out", "lustre_read_requests_diff" : "lustre_read_requests",
"ib_recv_bw" : "ib_recv", "lustre_write_bytes_diff" : "lustre_write_bytes",
"ib_xmit_bw" : "ib_xmit", "lustre_write_requests_diff" : "lustre_write_requests",
"ib_recv_pkts_bw": "ib_recv_pkts", "lustre_open_diff" : "lustre_open",
"ib_xmit_pkts_bw": "ib_xmit_pkts", "lustre_close_diff" : "lustre_close",
"lustre_read_bytes_diff" : "lustre_read_bytes", "lustre_setattr_diff" : "lustre_setattr",
"lustre_read_requests_diff" : "lustre_read_requests", "lustre_getattr_diff" : "lustre_getattr",
"lustre_write_bytes_diff" : "lustre_write_bytes", "lustre_statfs_diff": "lustre_statfs",
"lustre_write_requests_diff" : "lustre_write_requests", "lustre_inode_permission_diff" : "lustre_inode_permission",
"lustre_open_diff" : "lustre_open", "nv_util" : "acc_utilization",
"lustre_close_diff" : "lustre_close", "nv_fb_mem_used" : "acc_mem_used",
"lustre_setattr_diff" : "lustre_setattr", "nv_power_usage" : "acc_power"
"lustre_getattr_diff" : "lustre_getattr", },
"lustre_statfs_diff": "lustre_statfs", "add_tags_if" : [
"lustre_inode_permission_diff" : "lustre_inode_permission", {
"nv_util" : "acc_utilization", "key" : "cluster",
"nv_fb_mem_used" : "acc_mem_used", "value" : "alex",
"nv_power_usage" : "acc_power" "if" : "true"
}
],
"drop_messages" : [
"net_bytes_in",
"net_bytes_out",
"ib_recv",
"ib_xmit",
"ib_recv_pkts",
"ib_xmit_pkts",
"net_pkts_in",
"net_pkts_out",
"lustre_read_bytes",
"lustre_read_requests",
"lustre_write_bytes",
"lustre_write_requests"
],
"change_unit_prefix": {
"name == 'mem_used'": "G",
"name == 'swap_used'": "G",
"name == 'mem_total'": "G",
"name == 'swap_total'": "G",
"name == 'cpufreq'": "M"
},
"normalize_metrics" : true
}, },
"drop_metrics" : [
"net_bytes_in",
"net_bytes_out",
"ib_recv",
"ib_xmit",
"ib_recv_pkts",
"ib_xmit_pkts",
"net_pkts_in",
"net_pkts_out",
"lustre_read_bytes",
"lustre_read_requests",
"lustre_write_bytes",
"lustre_write_requests"
],
"interval_timestamp" : false, "interval_timestamp" : false,
"num_cache_intervals" : 0, "num_cache_intervals" : 0
"change_unit_prefix": {
"mem_used": "G",
"swap_used": "G",
"mem_total": "G",
"swap_total": "G",
"cpufreq": "M"
},
"normalize_metrics" : true
} }

View File

@@ -1,26 +1,27 @@
{ {
"nhrinflux": { "nhrinflux" : {
"type": "influxasync", "type" : "influxasync",
"host": "monitoring-test.nhr.uni-erlangen.de", "host": "monitoring-test.nhr.uni-erlangen.de",
"port": "8086", "port": "8086",
"organization": "ClusterCockpit", "organization" : "ClusterCockpit",
"database": "alex", "database" : "alex",
"password": "XYZ", "password": "XYX",
"ssl": true, "ssl": true,
"meta_as_tags": [ "meta_as_tags" : [
"unit" "unit"
] ]
}, },
"alexstore": { "alexstore" : {
"type": "http", "type" : "http",
"url": "http://monitoring.nhr.fau.de:8082/api/write?cluster=alex", "url" : "http://monitoring.nhr.fau.de:8082/api/write?cluster=alex",
"jwt": "XYZ", "jwt": "XYZ",
"meta_as_tags": [ "meta_as_tags" : [
"unit" "unit"
], ],
"idle_connection_timeout": "60s", "idle_connection_timeout" : "60s",
"flush_delay": "2s", "flush_delay" : "2s",
"max_retries": 1, "max_retries" : 1,
"timeout": "10s" "timeout" : "10s",
} "precision": "s"
}
} }

View File

@@ -1,33 +1,36 @@
{ {
"fritzganglia": { "fritzganglia" : {
"type": "libganglia", "type" : "libganglia",
"gmond_config": "/etc/ganglia/gmond.conf", "gmond_config" : "/etc/ganglia/gmond.conf",
"libganglia_path": "libganglia.so.0", "libganglia_path": "libganglia.so.0",
"add_ganglia_group": true "add_ganglia_group": true
}, },
"nhrinflux": { "nhrinflux" : {
"type": "influxasync", "type" : "influxasync",
"host": "monitoring-test.nhr.uni-erlangen.de", "host": "monitoring-test.nhr.uni-erlangen.de",
"port": "8086", "port": "8086",
"organization": "ClusterCockpit", "organization" : "ClusterCockpit",
"database": "fritz_neu", "database" : "alex",
"password": "XZY", "password": "XYZ",
"ssl": true, "ssl": true,
"meta_as_tags": [ "meta_as_tags" : [
"unit" "unit"
] ]
}, },
"fritzstore": { "alexstore" : {
"type": "http", "type" : "http",
"url": "http://monitoring.nhr.fau.de:8082/api/write?cluster=fritz", "url" : "http://monitoring.nhr.fau.de:8082/api/write?cluster=alex",
"jwt": "XZY", "jwt": "XYZ",
"meta_as_tags": [ "meta_as_tags" : [
"unit" "unit"
], ],
"idle_connection_timeout": "60s" "idle_connection_timeout" : "60s",
}, "flush_delay" : "2s",
"debugstdout": { "max_retries" : 1,
"type": "stdout", "timeout" : "10s"
"output_file": "/tmp/debug.log" },
} "debugstdout" : {
"type": "stdout",
"output_file" : "/tmp/debug.log"
}
} }

View File

@@ -38,6 +38,177 @@
} }
} }
}, },
"cpufreq_cpuinfo": {}, "nfsiostat": {},
"nfsiostat": {} "likwid": {
"force_overwrite" : true,
"invalid_to_zero" : true,
"access_mode" : "accessdaemon",
"accessdaemon_path" : "/apps/likwid/5.3.0-spr/sbin",
"liblikwid_path": "/apps/likwid/5.3.0-spr/lib/liblikwid.so",
"eventsets": [
{
"events": {
"FIXC0": "INSTR_RETIRED_ANY",
"FIXC1": "CPU_CLK_UNHALTED_CORE",
"FIXC2": "CPU_CLK_UNHALTED_REF",
"MBOX0C0": "CAS_COUNT_RD",
"MBOX0C1": "CAS_COUNT_WR",
"MBOX1C0": "CAS_COUNT_RD",
"MBOX1C1": "CAS_COUNT_WR",
"MBOX2C0": "CAS_COUNT_RD",
"MBOX2C1": "CAS_COUNT_WR",
"MBOX3C0": "CAS_COUNT_RD",
"MBOX3C1": "CAS_COUNT_WR",
"MBOX4C0": "CAS_COUNT_RD",
"MBOX4C1": "CAS_COUNT_WR",
"MBOX5C0": "CAS_COUNT_RD",
"MBOX5C1": "CAS_COUNT_WR",
"MBOX6C0": "CAS_COUNT_RD",
"MBOX6C1": "CAS_COUNT_WR",
"MBOX7C0": "CAS_COUNT_RD",
"MBOX7C1": "CAS_COUNT_WR",
"MBOX8C0": "CAS_COUNT_RD",
"MBOX8C1": "CAS_COUNT_WR",
"MBOX9C0": "CAS_COUNT_RD",
"MBOX9C1": "CAS_COUNT_WR",
"MBOX10C0": "CAS_COUNT_RD",
"MBOX10C1": "CAS_COUNT_WR",
"MBOX11C0": "CAS_COUNT_RD",
"MBOX11C1": "CAS_COUNT_WR",
"MBOX12C0": "CAS_COUNT_RD",
"MBOX12C1": "CAS_COUNT_WR",
"MBOX13C0": "CAS_COUNT_RD",
"MBOX13C1": "CAS_COUNT_WR",
"MBOX14C0": "CAS_COUNT_RD",
"MBOX14C1": "CAS_COUNT_WR",
"MBOX15C0": "CAS_COUNT_RD",
"MBOX15C1": "CAS_COUNT_WR",
"PMC0": "FP_ARITH_INST_RETIRED_128B_PACKED_DOUBLE",
"PMC1": "FP_ARITH_INST_RETIRED_SCALAR_DOUBLE",
"PMC2": "FP_ARITH_INST_RETIRED_256B_PACKED_DOUBLE",
"PMC3": "FP_ARITH_INST_RETIRED_512B_PACKED_DOUBLE",
"PWR0": "PWR_PKG_ENERGY",
"PWR3": "PWR_DRAM_ENERGY"
},
"metrics": [
{
"calc": "0.000001*(FIXC1/FIXC2)/inverseClock",
"name": "clock",
"publish": true,
"unit": "MHz",
"type": "hwthread"
},
{
"calc": "FIXC0/FIXC1",
"name": "ipc",
"publish": true,
"type": "hwthread"
},
{
"calc": "PWR0/time",
"name": "cpu_power",
"publish": true,
"unit": "Watt",
"type": "socket"
},
{
"calc": "PWR0",
"name": "cpu_energy",
"publish": true,
"unit": "Joules",
"type": "socket"
},
{
"calc": "PWR3/time",
"name": "mem_power",
"unit": "Watt",
"publish": true,
"type": "socket"
},
{
"calc": "PWR3",
"name": "mem_energy",
"publish": true,
"unit": "Joules",
"type": "socket"
},
{
"calc": "1E-9*(PMC0*2.0+PMC1+PMC2*4.0+PMC3*8.0)/time",
"name": "flops_dp",
"unit": "GFlops/s",
"publish": true,
"type": "hwthread"
},
{
"calc": "1E-9*(MBOX0C0+MBOX1C0+MBOX2C0+MBOX3C0+MBOX4C0+MBOX5C0+MBOX6C0+MBOX7C0+MBOX8C0+MBOX9C0+MBOX10C0+MBOX11C0+MBOX12C0+MBOX13C0+MBOX14C0+MBOX15C0+MBOX0C1+MBOX1C1+MBOX2C1+MBOX3C1+MBOX4C1+MBOX5C1+MBOX6C1+MBOX7C1+MBOX8C1+MBOX9C1+MBOX10C1+MBOX11C1+MBOX12C1+MBOX13C1+MBOX14C1+MBOX15C1)*64.0/time",
"name": "mem_bw",
"unit": "GBytes/s",
"publish": true,
"type": "socket"
},
{
"calc": "PMC0+PMC2+PMC3",
"name": "dp_vec_ins",
"type": "hwthread",
"publish": false
},
{
"calc": "PMC0+PMC1+PMC2+PMC3",
"name": "dp_fp_ins",
"type": "hwthread",
"publish": false
}
]
},
{
"events": {
"FIXC0": "INSTR_RETIRED_ANY",
"FIXC1": "CPU_CLK_UNHALTED_CORE",
"FIXC2": "CPU_CLK_UNHALTED_REF",
"FIXC3": "TOPDOWN_SLOTS",
"PMC0": "FP_ARITH_INST_RETIRED_128B_PACKED_SINGLE",
"PMC1": "FP_ARITH_INST_RETIRED_SCALAR_SINGLE",
"PMC2": "FP_ARITH_INST_RETIRED_256B_PACKED_SINGLE",
"PMC3": "FP_ARITH_INST_RETIRED_512B_PACKED_SINGLE"
},
"metrics": [
{
"calc": "1E-9*(PMC0*4.0+PMC1+PMC2*8.0+PMC3*16.0)/time",
"name": "flops_sp",
"unit": "GFlops/s",
"publish": true,
"type": "hwthread"
},
{
"calc": "PMC0+PMC2+PMC3",
"name": "sp_vec_ins",
"type": "hwthread",
"publish": false
},
{
"calc": "PMC0+PMC1+PMC2+PMC3",
"name": "sp_fp_ins",
"type": "hwthread",
"publish": false
}
]
}
],
"globalmetrics": [
{
"calc": "100*((sp_vec_ins+dp_vec_ins)/(sp_fp_ins+dp_fp_ins))",
"name": "vectorization_ratio",
"unit": "%",
"type": "hwthread",
"publish": true
},
{
"calc": "(flops_dp * 2) + flops_sp",
"name": "flops_any",
"unit": "GFlops/s",
"type": "hwthread",
"publish": true
}
]
}
} }

View File

@@ -1,8 +1,10 @@
{ {
"sinks": "/home/hpc/unrz/unrz139/Work/cc-metric-collector/configs/fritz.spr2/sinks2.json", "sinks-file": "/etc/cc-metric-collector/sinks.json",
"collectors" : "/home/hpc/unrz/unrz139/Work/cc-metric-collector/configs/fritz.spr2/collectors.json", "collectors-file" : "/etc/cc-metric-collector/collectors.json",
"receivers" : "/home/hpc/unrz/unrz139/Work/cc-metric-collector/configs/fritz.spr2/receivers.json", "receivers-file" : "/etc/cc-metric-collector/receivers.json",
"router" : "/home/hpc/unrz/unrz139/Work/cc-metric-collector/configs/fritz.spr2/router.json", "router-file" : "/etc/cc-metric-collector/router.json",
"interval": "60s", "main" : {
"duration": "10s" "interval": "60s",
} "duration": "10s"
}
}

View File

@@ -1,54 +1,56 @@
{ {
"add_tags" : [ "process_messages": {
{ "rename_messages" : {
"key" : "cluster", "load_one" : "cpu_load",
"value" : "fritz", "net_bytes_in_bw" : "net_bytes_in",
"if" : "*" "net_bytes_out_bw" : "net_bytes_out",
} "net_pkts_in_bw" : "net_pkts_in",
], "net_pkts_out_bw" : "net_pkts_out",
"rename_metrics" : { "ib_recv_bw" : "ib_recv",
"load_one" : "cpu_load", "ib_xmit_bw" : "ib_xmit",
"net_bytes_in_bw" : "net_bytes_in", "ib_recv_pkts_bw": "ib_recv_pkts",
"net_bytes_out_bw" : "net_bytes_out", "ib_xmit_pkts_bw": "ib_xmit_pkts",
"net_pkts_in_bw" : "net_pkts_in", "lustre_read_bytes_diff" : "lustre_read_bytes",
"net_pkts_out_bw" : "net_pkts_out", "lustre_read_requests_diff" : "lustre_read_requests",
"ib_recv_bw" : "ib_recv", "lustre_write_bytes_diff" : "lustre_write_bytes",
"ib_xmit_bw" : "ib_xmit", "lustre_write_requests_diff" : "lustre_write_requests",
"ib_recv_pkts_bw": "ib_recv_pkts", "lustre_open_diff" : "lustre_open",
"ib_xmit_pkts_bw": "ib_xmit_pkts", "lustre_close_diff" : "lustre_close",
"lustre_read_bytes_diff" : "lustre_read_bytes", "lustre_setattr_diff" : "lustre_setattr",
"lustre_read_requests_diff" : "lustre_read_requests", "lustre_getattr_diff" : "lustre_getattr",
"lustre_write_bytes_diff" : "lustre_write_bytes", "lustre_statfs_diff": "lustre_statfs",
"lustre_write_requests_diff" : "lustre_write_requests", "lustre_inode_permission_diff" : "lustre_inode_permission",
"lustre_open_diff" : "lustre_open", "cpufreq" : "clock"
"lustre_close_diff" : "lustre_close", },
"lustre_setattr_diff" : "lustre_setattr", "add_tags_if" : [
"lustre_getattr_diff" : "lustre_getattr", {
"lustre_statfs_diff": "lustre_statfs", "key" : "cluster",
"lustre_inode_permission_diff" : "lustre_inode_permission", "value" : "fritz",
"cpufreq" : "clock" "if" : "true"
}
],
"drop_messages" : [
"net_bytes_in",
"net_bytes_out",
"ib_recv",
"ib_xmit",
"ib_recv_pkts",
"ib_xmit_pkts",
"net_pkts_in",
"net_pkts_out",
"lustre_read_bytes",
"lustre_read_requests",
"lustre_write_bytes",
"lustre_write_requests"
],
"change_unit_prefix": {
"name == 'mem_used'": "G",
"name == 'swap_used'": "G",
"name == 'mem_total'": "G",
"name == 'swap_total'": "G"
},
"normalize_metrics" : true
}, },
"drop_metrics" : [
"net_bytes_in",
"net_bytes_out",
"ib_recv",
"ib_xmit",
"ib_recv_pkts",
"ib_xmit_pkts",
"net_pkts_in",
"net_pkts_out",
"lustre_read_bytes",
"lustre_read_requests",
"lustre_write_bytes",
"lustre_write_requests"
],
"interval_timestamp" : false, "interval_timestamp" : false,
"num_cache_intervals" : 0, "num_cache_intervals" : 0
"change_unit_prefix": {
"mem_used": "G",
"swap_used": "G",
"mem_total": "G",
"swap_total": "G"
},
"normalize_metrics" : true
} }

View File

@@ -1,32 +1,27 @@
{ {
"fritzganglia": { "nhrinflux" : {
"type": "libganglia", "type" : "influxasync",
"gmond_config": "/etc/ganglia/gmond.conf", "host": "monitoring-test.nhr.uni-erlangen.de",
"libganglia_path": "libganglia.so.0", "port": "8086",
"add_ganglia_group": true "organization" : "ClusterCockpit",
}, "database" : "fritz_neu",
"nhrinflux": { "password": "XYZ",
"type": "influxasync", "ssl": true,
"host": "monitoring-test.nhr.uni-erlangen.de", "meta_as_tags" : [
"port": "8086", "unit"
"organization": "ClusterCockpit", ]
"database": "fritz_neu", },
"password": "XZY", "fritzstore" : {
"ssl": true, "type" : "http",
"meta_as_tags": [ "url" : "http://monitoring.nhr.fau.de:8082/api/write?cluster=fritz",
"unit" "jwt": "XYZ",
] "meta_as_tags" : [
}, "unit"
"fritzstore": { ],
"type": "http", "idle_connection_timeout": "60s",
"url": "http://monitoring.nhr.fau.de:8082/api/write?cluster=fritz", "flush_delay" : "2s",
"jwt": "XZY", "max_retries" : 1,
"meta_as_tags": [ "timeout" : "10s",
"unit" "precision": "s"
], }
"idle_connection_timeout": "60s",
"flush_delay": "2s",
"max_retries": 1,
"timeout": "10s"
}
} }

View File

@@ -1,8 +1,10 @@
{ {
"sinks": "/etc/cc-metric-collector/sinks.json", "sinks-file": "/etc/cc-metric-collector/sinks.json",
"collectors" : "/etc/cc-metric-collector/collectors.json", "collectors-file" : "/etc/cc-metric-collector/collectors.json",
"receivers" : "/etc/cc-metric-collector/receivers.json", "receivers-file" : "/etc/cc-metric-collector/receivers.json",
"router" : "/etc/cc-metric-collector/router.json", "router-file" : "/etc/cc-metric-collector/router.json",
"interval": "60s", "main" : {
"duration": "10s" "interval": "60s",
"duration": "10s"
}
} }

View File

@@ -1,53 +1,55 @@
{ {
"add_tags" : [ "process_messages": {
{ "rename_messages" : {
"key" : "cluster", "load_one" : "cpu_load",
"value" : "fritz", "net_bytes_in_bw" : "net_bytes_in",
"if" : "*" "net_bytes_out_bw" : "net_bytes_out",
} "net_pkts_in_bw" : "net_pkts_in",
], "net_pkts_out_bw" : "net_pkts_out",
"rename_metrics" : { "ib_recv_bw" : "ib_recv",
"load_one" : "cpu_load", "ib_xmit_bw" : "ib_xmit",
"net_bytes_in_bw" : "net_bytes_in", "ib_recv_pkts_bw": "ib_recv_pkts",
"net_bytes_out_bw" : "net_bytes_out", "ib_xmit_pkts_bw": "ib_xmit_pkts",
"net_pkts_in_bw" : "net_pkts_in", "lustre_read_bytes_diff" : "lustre_read_bytes",
"net_pkts_out_bw" : "net_pkts_out", "lustre_read_requests_diff" : "lustre_read_requests",
"ib_recv_bw" : "ib_recv", "lustre_write_bytes_diff" : "lustre_write_bytes",
"ib_xmit_bw" : "ib_xmit", "lustre_write_requests_diff" : "lustre_write_requests",
"ib_recv_pkts_bw": "ib_recv_pkts", "lustre_open_diff" : "lustre_open",
"ib_xmit_pkts_bw": "ib_xmit_pkts", "lustre_close_diff" : "lustre_close",
"lustre_read_bytes_diff" : "lustre_read_bytes", "lustre_setattr_diff" : "lustre_setattr",
"lustre_read_requests_diff" : "lustre_read_requests", "lustre_getattr_diff" : "lustre_getattr",
"lustre_write_bytes_diff" : "lustre_write_bytes", "lustre_statfs_diff": "lustre_statfs",
"lustre_write_requests_diff" : "lustre_write_requests", "lustre_inode_permission_diff" : "lustre_inode_permission"
"lustre_open_diff" : "lustre_open", },
"lustre_close_diff" : "lustre_close", "add_tags_if" : [
"lustre_setattr_diff" : "lustre_setattr", {
"lustre_getattr_diff" : "lustre_getattr", "key" : "cluster",
"lustre_statfs_diff": "lustre_statfs", "value" : "fritz",
"lustre_inode_permission_diff" : "lustre_inode_permission" "if" : "true"
}
],
"drop_messages" : [
"net_bytes_in",
"net_bytes_out",
"ib_recv",
"ib_xmit",
"ib_recv_pkts",
"ib_xmit_pkts",
"net_pkts_in",
"net_pkts_out",
"lustre_read_bytes",
"lustre_read_requests",
"lustre_write_bytes",
"lustre_write_requests"
],
"change_unit_prefix": {
"name == 'mem_used'": "G",
"name == 'swap_used'": "G",
"name == 'mem_total'": "G",
"name == 'swap_total'": "G"
},
"normalize_metrics" : true
}, },
"drop_metrics" : [
"net_bytes_in",
"net_bytes_out",
"ib_recv",
"ib_xmit",
"ib_recv_pkts",
"ib_xmit_pkts",
"net_pkts_in",
"net_pkts_out",
"lustre_read_bytes",
"lustre_read_requests",
"lustre_write_bytes",
"lustre_write_requests"
],
"interval_timestamp" : false, "interval_timestamp" : false,
"num_cache_intervals" : 0, "num_cache_intervals" : 0
"change_unit_prefix": {
"mem_used": "G",
"swap_used": "G",
"mem_total": "G",
"swap_total": "G"
},
"normalize_metrics" : true
} }

View File

@@ -1,32 +1,27 @@
{ {
"fritzganglia": { "nhrinflux" : {
"type": "libganglia", "type" : "influxasync",
"gmond_config": "/etc/ganglia/gmond.conf", "host": "monitoring-test.nhr.uni-erlangen.de",
"libganglia_path": "libganglia.so.0", "port": "8086",
"add_ganglia_group": true "organization" : "ClusterCockpit",
}, "database" : "fritz_neu",
"nhrinflux": { "password": "XYZ",
"type": "influxasync", "ssl": true,
"host": "monitoring-test.nhr.uni-erlangen.de", "meta_as_tags" : [
"port": "8086", "unit"
"organization": "ClusterCockpit", ]
"database": "fritz_neu", },
"password": "XYZ", "fritzstore" : {
"ssl": true, "type" : "http",
"meta_as_tags": [ "url" : "http://monitoring.nhr.fau.de:8082/api/write?cluster=fritz",
"unit" "jwt": "XYZ",
] "meta_as_tags" : [
}, "unit"
"fritzstore": { ],
"type": "http", "idle_connection_timeout": "60s",
"url": "http://monitoring.nhr.fau.de:8082/api/write?cluster=fritz", "flush_delay" : "2s",
"jwt": "XYZ", "max_retries" : 1,
"meta_as_tags": [ "timeout" : "10s",
"unit" "precision": "s"
], }
"idle_connection_timeout": "60s",
"flush_delay": "2s",
"max_retries": 1,
"timeout": "10s"
}
} }

View File

@@ -1,33 +0,0 @@
{
  "fritzganglia": {
    "type": "libganglia",
    "gmond_config": "/etc/ganglia/gmond.conf",
    "libganglia_path": "libganglia.so.0",
    "add_ganglia_group": true
  },
  "nhrinflux": {
    "type": "influxasync",
    "host": "monitoring-test.nhr.uni-erlangen.de",
    "port": "8086",
    "organization": "ClusterCockpit",
    "database": "fritz_neu",
    "password": "XZY",
    "ssl": true,
    "meta_as_tags": [
      "unit"
    ]
  },
  "fritzstore": {
    "type": "http",
    "url": "http://monitoring.nhr.fau.de:8082/api/write?cluster=fritz",
    "jwt": "XZY",
    "meta_as_tags": [
      "unit"
    ],
    "idle_connection_timeout": "60s"
  },
  "debugstdout": {
    "type": "stdout",
    "output_file": "/tmp/debug.log"
  }
}

View File

@@ -0,0 +1 @@
collectors.bw512.json

View File

@@ -0,0 +1,169 @@
{
"nfs4stat" : {},
"memstat" : {
"numa_stats": true,
"node_stats": true
},
"cpustat" : {},
"loadavg" : {},
"schedstat": {},
"netstat" : {
"include_devices" : [
"eth0",
"eth1",
"eth2",
"enp3s0"
],
"send_abs_values": true,
"send_derived_values": true
},
"diskstat" : {},
"iostat" : {},
"nfsiostat" : {},
"tempstat" : {
"tag_override" : {
"hwmon2" : {
"type" : "socket",
"type-id" : "0"
},
"hwmon3" : {
"type" : "socket",
"type-id" : "1"
}
}
},
"likwid": {
"force_overwrite" : true,
"invalid_to_zero" : true,
"access_mode" : "accessdaemon",
"accessdaemon_path" : "/apps/likwid/system/sbin",
"liblikwid_path": "/apps/likwid/system/lib/liblikwid.so",
"eventsets": [
{
"events": {
"FIXC0": "INSTR_RETIRED_ANY",
"FIXC1": "CPU_CLK_UNHALTED_CORE",
"FIXC2": "CPU_CLK_UNHALTED_REF",
"MBOX0C0": "CAS_COUNT_RD",
"MBOX0C1": "CAS_COUNT_WR",
"MBOX1C0": "CAS_COUNT_RD",
"MBOX1C1": "CAS_COUNT_WR",
"MBOX2C0": "CAS_COUNT_RD",
"MBOX2C1": "CAS_COUNT_WR",
"MBOX3C0": "CAS_COUNT_RD",
"MBOX3C1": "CAS_COUNT_WR",
"MBOX4C0": "CAS_COUNT_RD",
"MBOX4C1": "CAS_COUNT_WR",
"MBOX5C0": "CAS_COUNT_RD",
"MBOX5C1": "CAS_COUNT_WR",
"PMC0": "FP_ARITH_INST_RETIRED_128B_PACKED_DOUBLE",
"PMC1": "FP_ARITH_INST_RETIRED_SCALAR_DOUBLE",
"PMC2": "FP_ARITH_INST_RETIRED_256B_PACKED_DOUBLE",
"PWR0": "PWR_PKG_ENERGY",
"PWR3": "PWR_DRAM_ENERGY"
},
"metrics": [
{
"calc": "1E-6*(FIXC1/FIXC2)/inverseClock",
"name": "clock",
"publish": true,
"unit": "MHz",
"type": "hwthread"
},
{
"calc": "FIXC0/FIXC1",
"name": "ipc",
"publish": true,
"type": "hwthread"
},
{
"calc": "PWR0/time",
"name": "pwr_pkg",
"publish": true,
"unit": "Watt",
"type": "socket"
},
{
"calc": "PWR3/time",
"name": "pwr_dram",
"publish": true,
"unit": "Watt",
"type": "socket"
},
{
"calc": "1E-9*(PMC0*2.0+PMC1+PMC2*4.0)/time",
"name": "flops_dp",
"publish": true,
"unit": "GFlops/s",
"type": "hwthread"
},
{
"calc": "1E-9*(MBOX0C0+MBOX1C0+MBOX2C0+MBOX3C0+MBOX4C0+MBOX5C0+MBOX0C1+MBOX1C1+MBOX2C1+MBOX3C1+MBOX4C1+MBOX5C1)*64.0/time",
"name": "mem_bw",
"publish": true,
"unit": "GBytes/s",
"type": "socket"
},
{
"calc": "PMC0+PMC2",
"name": "dp_vec_ins",
"type": "hwthread",
"publish": false
},
{
"calc": "PMC0+PMC1+PMC2",
"name": "dp_fp_ins",
"type": "hwthread",
"publish": false
}
]
},
{
"events": {
"FIXC0": "INSTR_RETIRED_ANY",
"FIXC1": "CPU_CLK_UNHALTED_CORE",
"FIXC2": "CPU_CLK_UNHALTED_REF",
"PMC0": "FP_ARITH_INST_RETIRED_128B_PACKED_SINGLE",
"PMC1": "FP_ARITH_INST_RETIRED_SCALAR_SINGLE",
"PMC2": "FP_ARITH_INST_RETIRED_256B_PACKED_SINGLE"
},
"metrics": [
{
"calc": "1E-9*(PMC0*4.0+PMC1+PMC2*8.0)/time",
"name": "flops_sp",
"publish": true,
"unit": "GFlops/s",
"type": "hwthread"
},
{
"calc": "PMC0+PMC2",
"name": "sp_vec_ins",
"type": "hwthread",
"publish": false
},
{
"calc": "PMC0+PMC1+PMC2",
"name": "sp_fp_ins",
"type": "hwthread",
"publish": false
}
]
}
],
"globalmetrics": [
{
"calc": "100*((sp_vec_ins+dp_vec_ins)/(sp_fp_ins+dp_fp_ins))",
"name": "vectorization_ratio",
"type": "hwthread",
"publish": true
},
{
"calc": "(flops_dp * 2) + flops_sp",
"name": "flops_any",
"type": "hwthread",
"unit": "GFlops/s",
"publish": true
}
]
}
}

View File

@@ -11,13 +11,15 @@
"include_devices" : [ "include_devices" : [
"eth0", "eth0",
"eth1", "eth1",
"eth2" "eth2",
"enp1s0f0"
], ],
"send_abs_values": true, "send_abs_values": true,
"send_derived_values": true "send_derived_values": true
}, },
"diskstat" : {}, "diskstat" : {},
"iostat" : {}, "iostat" : {},
"nfsiostat" : {},
"tempstat" : { "tempstat" : {
"tag_override" : { "tag_override" : {
"hwmon1" : { "hwmon1" : {
@@ -45,8 +47,8 @@
"PMC1": "CPU_CLOCKS_UNHALTED", "PMC1": "CPU_CLOCKS_UNHALTED",
"PMC2": "RETIRED_SSE_AVX_FLOPS_ALL", "PMC2": "RETIRED_SSE_AVX_FLOPS_ALL",
"PMC3": "MERGE", "PMC3": "MERGE",
"DFC0": "DATA_FROM_LOCAL_DRAM_CHANNEL", "DFC0": "DRAM_CHANNEL_0",
"DFC1": "DATA_TO_LOCAL_DRAM_CHANNEL", "DFC1": "DRAM_CHANNEL_1",
"PWR0": "RAPL_CORE_ENERGY", "PWR0": "RAPL_CORE_ENERGY",
"PWR1": "RAPL_PKG_ENERGY" "PWR1": "RAPL_PKG_ENERGY"
}, },
@@ -87,7 +89,7 @@
}, },
{ {
"name": "mem_bw", "name": "mem_bw",
"calc": "1E-9*(DFC0+DFC1)*64.0/time", "calc": "1E-9*(DFC0+DFC1)*(4.0/(8/4))*64.0/time",
"unit": "Gbyte/s", "unit": "Gbyte/s",
"type": "socket", "type": "socket",
"publish": true "publish": true

View File

@@ -1,8 +1,10 @@
{ {
"sinks": "/etc/cc-metric-collector/sinks.json", "sinks-file": "/etc/cc-metric-collector/sinks.json",
"collectors" : "/etc/cc-metric-collector/collectors.json", "collectors-file" : "/etc/cc-metric-collector/collectors.json",
"receivers" : "/etc/cc-metric-collector/receivers.json", "receivers-file" : "/etc/cc-metric-collector/receivers.json",
"router" : "/etc/cc-metric-collector/router.json", "router-file" : "/etc/cc-metric-collector/router.json",
"interval": "60s", "main" : {
"duration": "10s" "interval": "60s",
} "duration": "10s"
}
}

View File

@@ -1,49 +1,58 @@
{ {
"add_tags" : [ "process_messages": {
{ "rename_messages" : {
"key" : "cluster", "load_one" : "cpu_load",
"value" : "tinyfat", "cpu_load_core" : "cpu_load",
"if" : "*" "net_bytes_in_bw" : "net_bytes_in",
} "net_bytes_out_bw" : "net_bytes_out",
], "net_pkts_in_bw" : "net_pkts_in",
"rename_metrics" : { "net_pkts_out_bw" : "net_pkts_out",
"load_one" : "cpu_load", "ib_recv_bw" : "ib_recv",
"cpu_load_core" : "cpu_load", "ib_xmit_bw" : "ib_xmit",
"net_bytes_in_bw" : "net_bytes_in", "ib_recv_pkts_bw": "ib_recv_pkts",
"net_bytes_out_bw" : "net_bytes_out", "ib_xmit_pkts_bw": "ib_xmit_pkts",
"net_pkts_in_bw" : "net_pkts_in", "lustre_read_bytes_diff" : "lustre_read_bytes",
"net_pkts_out_bw" : "net_pkts_out", "lustre_read_requests_diff" : "lustre_read_requests",
"ib_recv_bw" : "ib_recv", "lustre_write_bytes_diff" : "lustre_write_bytes",
"ib_xmit_bw" : "ib_xmit", "lustre_write_requests_diff" : "lustre_write_requests",
"ib_recv_pkts_bw": "ib_recv_pkts", "lustre_open_diff" : "lustre_open",
"ib_xmit_pkts_bw": "ib_xmit_pkts", "lustre_close_diff" : "lustre_close",
"lustre_read_bytes_diff" : "lustre_read_bytes", "lustre_setattr_diff" : "lustre_setattr",
"lustre_read_requests_diff" : "lustre_read_requests", "lustre_getattr_diff" : "lustre_getattr",
"lustre_write_bytes_diff" : "lustre_write_bytes", "lustre_statfs_diff": "lustre_statfs",
"lustre_write_requests_diff" : "lustre_write_requests", "lustre_inode_permission_diff" : "lustre_inode_permission",
"lustre_open_diff" : "lustre_open", "pwr_pkg": "cpu_power",
"lustre_close_diff" : "lustre_close", "pwr_dram": "mem_power"
"lustre_setattr_diff" : "lustre_setattr", },
"lustre_getattr_diff" : "lustre_getattr", "add_tags_if" : [
"lustre_statfs_diff": "lustre_statfs", {
"lustre_inode_permission_diff" : "lustre_inode_permission", "key" : "cluster",
"pwr_pkg": "cpu_power", "value" : "tinyfat",
"pwr_dram": "mem_power" "if" : "true"
}
],
"drop_messages" : [
"net_bytes_in",
"net_bytes_out",
"ib_recv",
"ib_xmit",
"ib_recv_pkts",
"ib_xmit_pkts",
"net_pkts_in",
"net_pkts_out",
"lustre_read_bytes",
"lustre_read_requests",
"lustre_write_bytes",
"lustre_write_requests"
],
"change_unit_prefix": {
"name == 'mem_used'": "G",
"name == 'swap_used'": "G",
"name == 'mem_total'": "G",
"name == 'swap_total'": "G"
},
"normalize_units" : true
}, },
"drop_metrics" : [
"net_bytes_in",
"net_bytes_out",
"ib_recv",
"ib_xmit",
"ib_recv_pkts",
"ib_xmit_pkts",
"net_pkts_in",
"net_pkts_out",
"lustre_read_bytes",
"lustre_read_requests",
"lustre_write_bytes",
"lustre_write_requests"
],
"interval_timestamp" : false, "interval_timestamp" : false,
"num_cache_intervals" : 0 "num_cache_intervals" : 0
} }

View File

@@ -1,26 +1,53 @@
{ {
"influx": { "influx" : {
"type": "influxasync", "type" : "influxasync",
"host": "monitoring-test.nhr.uni-erlangen.de", "host": "monitoring-test.nhr.uni-erlangen.de",
"port": "8086", "port": "8086",
"organization": "ClusterCockpit", "organization" : "ClusterCockpit",
"database": "tinyfat", "database" : "tinyfat",
"password": "XZY", "password": "XYZ",
"ssl": true, "ssl": true,
"meta_as_tags": [ "process_messages": {
"unit" "move_meta_to_tag_if": [
] {
}, "key": "unit",
"metricstore": { "if": "true"
"type": "http", }
"url": "http://monitoring.nhr.fau.de:8082/api/write?cluster=tinyfat", ]
"jwt": "XYZ",
"meta_as_tags": [
"unit"
],
"idle_connection_timeout": "60s",
"flush_delay": "2s",
"max_retries": 1,
"timeout": "10s"
} }
},
"metricstore" : {
"type" : "http",
"url" : "http://monitoring.nhr.fau.de:8082/api/write?cluster=tinyfat",
"jwt": "XYZ",
"idle_connection_timeout": "60s",
"flush_delay" : "2s",
"max_retries" : 1,
"timeout" : "10s",
"precision": "s",
"process_messages": {
"move_meta_to_tag_if": [
{
"key": "unit",
"if": "true"
}
]
}
},
"tinyfatnats": {
"type": "nats",
"host": "monitoring.nhr.fau.de",
"database": "tinyfat",
"nkey_file": "/etc/cc-metric-collector/nats.nkey",
"flush_delay": "1s",
"precision": "s",
"process_messages": {
"move_meta_to_tag_if": [
{
"key": "unit",
"if": "true"
}
]
}
}
} }

View File

@@ -1,30 +0,0 @@
{
"influx": {
"type": "influxasync",
"host": "monitoring-test.nhr.uni-erlangen.de",
"port": "8086",
"organization": "ClusterCockpit",
"database": "tinyfat",
"password": "XZY",
"ssl": true,
"meta_as_tags": [
"unit"
]
},
"metricstore": {
"type": "http",
"url": "http://monitoring.nhr.fau.de:8082/api/write?cluster=tinyfat",
"jwt": "XZY",
"meta_as_tags": [
"unit"
],
"idle_connection_timeout": "60s",
"flush_delay": "2s",
"max_retries": 1,
"timeout": "10s"
},
"debugstdout": {
"type": "stdout",
"output_file": "/tmp/debug.log"
}
}

View File

@@ -0,0 +1,220 @@
{
"nfs4stat" : {},
"memstat" : {
"numa_stats": true,
"node_stats": true
},
"cpustat" : {},
"loadavg" : {},
"schedstat": {},
"netstat" : {
"include_devices" : [
"eth0"
],
"send_abs_values": true,
"send_derived_values": true
},
"diskstat" : {},
"iostat" : {},
"nfsiostat" : {},
"tempstat" : {
"tag_override" : {
"hwmon0" : {
"type" : "socket",
"type-id" : "0"
},
"hwmon1" : {
"type" : "socket",
"type-id" : "1"
}
}
},
"nvidia" : {
"use_pci_info_as_type_id": true,
"process_mig_devices": true
},
"likwid": {
"force_overwrite" : true,
"invalid_to_zero" : true,
"access_mode" : "accessdaemon",
"accessdaemon_path" : "/apps/likwid/system/sbin",
"liblikwid_path": "/apps/likwid/system/lib/liblikwid.so",
"eventsets": [
{
"events": {
"FIXC0": "INSTR_RETIRED_ANY",
"FIXC1": "CPU_CLK_UNHALTED_CORE",
"FIXC2": "CPU_CLK_UNHALTED_REF",
"MBOX0C0": "CAS_COUNT_RD",
"MBOX0C1": "CAS_COUNT_WR",
"MBOX1C0": "CAS_COUNT_RD",
"MBOX1C1": "CAS_COUNT_WR",
"MBOX2C0": "CAS_COUNT_RD",
"MBOX2C1": "CAS_COUNT_WR",
"MBOX3C0": "CAS_COUNT_RD",
"MBOX3C1": "CAS_COUNT_WR",
"MBOX4C0": "CAS_COUNT_RD",
"MBOX4C1": "CAS_COUNT_WR",
"MBOX5C0": "CAS_COUNT_RD",
"MBOX5C1": "CAS_COUNT_WR",
"PMC0": "FP_ARITH_INST_RETIRED_128B_PACKED_DOUBLE",
"PMC1": "FP_ARITH_INST_RETIRED_SCALAR_DOUBLE",
"PMC2": "FP_ARITH_INST_RETIRED_256B_PACKED_DOUBLE",
"PWR0": "PWR_PKG_ENERGY",
"PWR3": "PWR_DRAM_ENERGY"
},
"metrics": [
{
"calc": "0.000001*(FIXC1/FIXC2)/inverseClock",
"name": "clock",
"publish": true,
"unit": "MHz",
"type": "hwthread"
},
{
"calc": "FIXC0/FIXC1",
"name": "ipc",
"publish": true,
"type": "hwthread"
},
{
"calc": "PWR0/time",
"name": "cpu_power",
"publish": true,
"unit": "Watt",
"type": "socket"
},
{
"calc": "PWR3/time",
"name": "mem_power",
"unit": "Watt",
"publish": true,
"type": "socket"
},
{
"calc": "1E-9*(PMC0*2.0+PMC1+PMC2*4.0)/time",
"name": "flops_dp1",
"unit": "GFlops/s",
"publish": false,
"type": "hwthread"
},
{
"calc": "1E-9*(MBOX0C0+MBOX1C0+MBOX2C0+MBOX3C0+MBOX4C0+MBOX5C0+MBOX0C1+MBOX1C1+MBOX2C1+MBOX3C1+MBOX4C1+MBOX5C1)*64.0/time",
"name": "mem_bw",
"unit": "GBytes/s",
"publish": true,
"type": "socket"
},
{
"calc": "PMC0+PMC2",
"name": "dp_vec_ins",
"type": "hwthread",
"publish": false
},
{
"calc": "PMC0+PMC1+PMC2",
"name": "dp_fp_ins",
"type": "hwthread",
"publish": false
}
]
},
{
"events": {
"FIXC0": "INSTR_RETIRED_ANY",
"FIXC1": "CPU_CLK_UNHALTED_CORE",
"FIXC2": "CPU_CLK_UNHALTED_REF",
"PMC0": "FP_ARITH_INST_RETIRED_128B_PACKED_SINGLE",
"PMC1": "FP_ARITH_INST_RETIRED_SCALAR_SINGLE",
"PMC2": "FP_ARITH_INST_RETIRED_256B_PACKED_SINGLE"
},
"metrics": [
{
"calc": "1E-9*(PMC0*4.0+PMC1+PMC2*8.0)/time",
"name": "flops_sp1",
"publish": false,
"type": "hwthread"
},
{
"calc": "PMC0+PMC2",
"name": "sp_vec_ins",
"type": "hwthread",
"publish": false
},
{
"calc": "PMC0+PMC1+PMC2",
"name": "sp_fp_ins",
"type": "hwthread",
"publish": false
}
]
},
{
"events": {
"FIXC0": "INSTR_RETIRED_ANY",
"FIXC1": "CPU_CLK_UNHALTED_CORE",
"FIXC2": "CPU_CLK_UNHALTED_REF",
"PMC0": "FP_ARITH_INST_RETIRED_512B_PACKED_DOUBLE",
"PMC1": "FP_ARITH_INST_RETIRED_512B_PACKED_SINGLE"
},
"metrics": [
{
"calc": "PMC0",
"name": "dp_avx_512_ins",
"type": "hwthread",
"publish": false
},
{
"calc": "1E-9*(PMC0*8.0)/time",
"name": "flops_dp2",
"type": "hwthread",
"publish": false
},
{
"calc": "PMC1",
"name": "sp_avx_512_ins",
"type": "hwthread",
"publish": false
},
{
"calc": "1E-9*(PMC1*16.0)/time",
"name": "flops_sp2",
"type": "hwthread",
"publish": false
}
]
}
],
"globalmetrics": [
{
"calc": "100*((sp_vec_ins+dp_vec_ins+dp_avx_512_ins+sp_avx_512_ins)/(sp_fp_ins+dp_fp_ins+dp_avx_512_ins+sp_avx_512_ins))",
"name": "vectorization_ratio",
"unit": "%",
"type": "hwthread",
"publish": true
},
{
"calc": "(flops_sp1+flops_sp2)",
"name": "flops_sp",
"unit": "GFlops/s",
"type": "hwthread",
"publish": true
},
{
"calc": "(flops_dp1+flops_dp2)",
"name": "flops_dp",
"unit": "GFlops/s",
"type": "hwthread",
"publish": true
},
{
"calc": "((flops_dp1+flops_dp2) * 2) + (flops_sp1+flops_sp2)",
"name": "flops_any",
"unit": "GFlops/s",
"type": "hwthread",
"publish": true
}
]
}
}

View File

@@ -0,0 +1,180 @@
{
"nfs4stat" : {},
"memstat" : {
"numa_stats": true,
"node_stats": true
},
"cpustat" : {},
"loadavg" : {},
"schedstat": {},
"netstat" : {
"include_devices" : [
"eth0",
"eth1",
"eth2"
],
"send_abs_values": true,
"send_derived_values": true
},
"diskstat" : {},
"iostat" : {},
"nfsiostat" : {},
"tempstat" : {
"tag_override" : {
"hwmon1" : {
"type" : "socket",
"type-id" : "0"
},
"hwmon2" : {
"type" : "socket",
"type-id" : "1"
}
}
},
"ipmistat" : {
"send_abs_values": true,
"send_derived_values": true
},
"nvidia" : {
"use_pci_info_as_type_id": true,
"process_mig_devices": true
},
"likwid": {
"force_overwrite" : true,
"invalid_to_zero" : true,
"access_mode" : "accessdaemon",
"accessdaemon_path" : "/apps/likwid/system/sbin",
"liblikwid_path": "/apps/likwid/system/lib/liblikwid.so",
"eventsets": [
{
"events": {
"FIXC0": "INSTR_RETIRED_ANY",
"FIXC1": "CPU_CLK_UNHALTED_CORE",
"FIXC2": "CPU_CLK_UNHALTED_REF",
"MBOX0C0": "CAS_COUNT_RD",
"MBOX0C1": "CAS_COUNT_WR",
"MBOX1C0": "CAS_COUNT_RD",
"MBOX1C1": "CAS_COUNT_WR",
"MBOX2C0": "CAS_COUNT_RD",
"MBOX2C1": "CAS_COUNT_WR",
"MBOX3C0": "CAS_COUNT_RD",
"MBOX3C1": "CAS_COUNT_WR",
"MBOX4C0": "CAS_COUNT_RD",
"MBOX4C1": "CAS_COUNT_WR",
"MBOX5C0": "CAS_COUNT_RD",
"MBOX5C1": "CAS_COUNT_WR",
"PMC0": "FP_ARITH_INST_RETIRED_128B_PACKED_DOUBLE",
"PMC1": "FP_ARITH_INST_RETIRED_SCALAR_DOUBLE",
"PMC2": "FP_ARITH_INST_RETIRED_256B_PACKED_DOUBLE",
"PMC3": "FP_ARITH_INST_RETIRED_512B_PACKED_DOUBLE",
"PWR0": "PWR_PKG_ENERGY",
"PWR3": "PWR_DRAM_ENERGY"
},
"metrics": [
{
"calc": "0.000001*(FIXC1/FIXC2)/inverseClock",
"name": "clock",
"publish": true,
"unit": "MHz",
"type": "hwthread"
},
{
"calc": "FIXC0/FIXC1",
"name": "ipc",
"publish": true,
"type": "hwthread"
},
{
"calc": "PWR0/time",
"name": "cpu_power",
"publish": true,
"unit": "Watt",
"type": "socket"
},
{
"calc": "PWR3/time",
"name": "mem_power",
"unit": "Watt",
"publish": true,
"type": "socket"
},
{
"calc": "1E-9*(PMC0*2.0+PMC1+PMC2*4.0+PMC3*8.0)/time",
"name": "flops_dp",
"unit": "GFlops/s",
"publish": true,
"type": "hwthread"
},
{
"calc": "1E-9*(MBOX0C0+MBOX1C0+MBOX2C0+MBOX3C0+MBOX4C0+MBOX5C0+MBOX0C1+MBOX1C1+MBOX2C1+MBOX3C1+MBOX4C1+MBOX5C1)*64.0/time",
"name": "mem_bw",
"unit": "GBytes/s",
"publish": true,
"type": "socket"
},
{
"calc": "PMC0+PMC2+PMC3",
"name": "dp_vec_ins",
"type": "hwthread",
"publish": false
},
{
"calc": "PMC0+PMC1+PMC2+PMC3",
"name": "dp_fp_ins",
"type": "hwthread",
"publish": false
}
]
},
{
"events": {
"FIXC0": "INSTR_RETIRED_ANY",
"FIXC1": "CPU_CLK_UNHALTED_CORE",
"FIXC2": "CPU_CLK_UNHALTED_REF",
"PMC0": "FP_ARITH_INST_RETIRED_128B_PACKED_SINGLE",
"PMC1": "FP_ARITH_INST_RETIRED_SCALAR_SINGLE",
"PMC2": "FP_ARITH_INST_RETIRED_256B_PACKED_SINGLE",
"PMC3": "FP_ARITH_INST_RETIRED_512B_PACKED_SINGLE"
},
"metrics": [
{
"calc": "1E-9*(PMC0*4.0+PMC1+PMC2*8.0+PMC3*16.0)/time",
"name": "flops_sp",
"unit": "GFlops/s",
"publish": true,
"type": "hwthread"
},
{
"calc": "PMC0+PMC2+PMC3",
"name": "sp_vec_ins",
"type": "hwthread",
"publish": false
},
{
"calc": "PMC0+PMC1+PMC2+PMC3",
"name": "sp_fp_ins",
"type": "hwthread",
"publish": false
}
]
}
],
"globalmetrics": [
{
"calc": "100*((sp_vec_ins+dp_vec_ins)/(sp_fp_ins+dp_fp_ins))",
"name": "vectorization_ratio",
"unit": "%",
"type": "hwthread",
"publish": true
},
{
"calc": "(flops_dp * 2) + flops_sp",
"name": "flops_any",
"unit": "GFlops/s",
"type": "hwthread",
"publish": true
}
]
}
}

View File

@@ -0,0 +1,104 @@
{
"nfs4stat" : {},
"memstat" : {
"numa_stats": true,
"node_stats": true
},
"cpustat" : {},
"loadavg" : {},
"schedstat": {},
"netstat" : {
"include_devices" : [
"eth0"
],
"send_abs_values": true,
"send_derived_values": true
},
"diskstat" : {},
"iostat" : {},
"nfsiostat" : {},
"tempstat" : {
"tag_override" : {
"hwmon0" : {
"type" : "socket",
"type-id" : "0"
},
"hwmon1" : {
"type" : "socket",
"type-id" : "1"
}
}
},
"nvidia" : {
"use_pci_info_as_type_id": true,
"process_mig_devices": true
},
"likwid": {
"force_overwrite" : true,
"invalid_to_zero" : true,
"access_mode" : "accessdaemon",
"accessdaemon_path" : "/apps/likwid/system/sbin",
"liblikwid_path": "/apps/likwid/system/lib/liblikwid.so",
"eventsets": [
{
"events": {
"FIXC1": "ACTUAL_CPU_CLOCK",
"FIXC2": "MAX_CPU_CLOCK",
"PMC0": "RETIRED_INSTRUCTIONS",
"PMC1": "CPU_CLOCKS_UNHALTED",
"PMC2": "RETIRED_SSE_AVX_FLOPS_ALL",
"PMC3": "MERGE",
"DFC0": "DATA_FROM_LOCAL_DRAM_CHANNEL",
"DFC1": "DATA_TO_LOCAL_DRAM_CHANNEL",
"PWR0": "RAPL_CORE_ENERGY",
"PWR1": "RAPL_PKG_ENERGY"
},
"metrics": [
{
"name": "ipc",
"calc": "PMC0/PMC1",
"type": "hwthread",
"publish": true
},
{
"name": "flops_any",
"calc": "1E-9*PMC2/time",
"unit": "GFlops/s",
"type": "hwthread",
"publish": true
},
{
"name": "clock",
"calc": "1E-6*(FIXC1/FIXC2)/inverseClock",
"type": "hwthread",
"unit": "MHz",
"publish": true
},
{
"name": "pwr_core",
"calc": "PWR0/time",
"unit": "Watt",
"type": "socket",
"publish": true
},
{
"name": "pwr_pkg",
"calc": "PWR1/time",
"type": "socket",
"unit": "Watt",
"publish": true
},
{
"name": "mem_bw",
"calc": "1E-9*(DFC0+DFC1)*64.0/time",
"unit": "Gbyte/s",
"type": "socket",
"publish": true
}
]
}
],
"globalmetrics": []
}
}

View File

@@ -0,0 +1,2 @@
{
}

View File

@@ -0,0 +1,2 @@
{
}

View File

@@ -16,6 +16,7 @@
}, },
"diskstat" : {}, "diskstat" : {},
"iostat" : {}, "iostat" : {},
"nfsiostat" : {},
"tempstat" : { "tempstat" : {
"tag_override" : { "tag_override" : {
"hwmon0" : { "hwmon0" : {

View File

@@ -16,6 +16,7 @@
}, },
"diskstat" : {}, "diskstat" : {},
"iostat" : {}, "iostat" : {},
"nfsiostat" : {},
"tempstat" : { "tempstat" : {
"tag_override" : { "tag_override" : {
"hwmon0" : { "hwmon0" : {

View File

@@ -18,6 +18,7 @@
}, },
"diskstat" : {}, "diskstat" : {},
"iostat" : {}, "iostat" : {},
"nfsiostat" : {},
"tempstat" : { "tempstat" : {
"tag_override" : { "tag_override" : {
"hwmon1" : { "hwmon1" : {

View File

@@ -16,6 +16,7 @@
}, },
"diskstat" : {}, "diskstat" : {},
"iostat" : {}, "iostat" : {},
"nfsiostat" : {},
"tempstat" : { "tempstat" : {
"tag_override" : { "tag_override" : {
"hwmon0" : { "hwmon0" : {

View File

@@ -0,0 +1,220 @@
{
"nfs4stat" : {},
"memstat" : {
"numa_stats": true,
"node_stats": true
},
"cpustat" : {},
"loadavg" : {},
"schedstat": {},
"netstat" : {
"include_devices" : [
"eth0"
],
"send_abs_values": true,
"send_derived_values": true
},
"diskstat" : {},
"iostat" : {},
"nfsiostat" : {},
"tempstat" : {
"tag_override" : {
"hwmon0" : {
"type" : "socket",
"type-id" : "0"
},
"hwmon1" : {
"type" : "socket",
"type-id" : "1"
}
}
},
"nvidia" : {
"use_pci_info_as_type_id": true,
"process_mig_devices": true
},
"likwid": {
"force_overwrite" : true,
"invalid_to_zero" : true,
"access_mode" : "accessdaemon",
"accessdaemon_path" : "/apps/likwid/system/sbin",
"liblikwid_path": "/apps/likwid/system/lib/liblikwid.so",
"eventsets": [
{
"events": {
"FIXC0": "INSTR_RETIRED_ANY",
"FIXC1": "CPU_CLK_UNHALTED_CORE",
"FIXC2": "CPU_CLK_UNHALTED_REF",
"MBOX0C0": "CAS_COUNT_RD",
"MBOX0C1": "CAS_COUNT_WR",
"MBOX1C0": "CAS_COUNT_RD",
"MBOX1C1": "CAS_COUNT_WR",
"MBOX2C0": "CAS_COUNT_RD",
"MBOX2C1": "CAS_COUNT_WR",
"MBOX3C0": "CAS_COUNT_RD",
"MBOX3C1": "CAS_COUNT_WR",
"MBOX4C0": "CAS_COUNT_RD",
"MBOX4C1": "CAS_COUNT_WR",
"MBOX5C0": "CAS_COUNT_RD",
"MBOX5C1": "CAS_COUNT_WR",
"PMC0": "FP_ARITH_INST_RETIRED_128B_PACKED_DOUBLE",
"PMC1": "FP_ARITH_INST_RETIRED_SCALAR_DOUBLE",
"PMC2": "FP_ARITH_INST_RETIRED_256B_PACKED_DOUBLE",
"PWR0": "PWR_PKG_ENERGY",
"PWR3": "PWR_DRAM_ENERGY"
},
"metrics": [
{
"calc": "0.000001*(FIXC1/FIXC2)/inverseClock",
"name": "clock",
"publish": true,
"unit": "MHz",
"type": "hwthread"
},
{
"calc": "FIXC0/FIXC1",
"name": "ipc",
"publish": true,
"type": "hwthread"
},
{
"calc": "PWR0/time",
"name": "cpu_power",
"publish": true,
"unit": "Watt",
"type": "socket"
},
{
"calc": "PWR3/time",
"name": "mem_power",
"unit": "Watt",
"publish": true,
"type": "socket"
},
{
"calc": "1E-9*(PMC0*2.0+PMC1+PMC2*4.0)/time",
"name": "flops_dp1",
"unit": "GFlops/s",
"publish": false,
"type": "hwthread"
},
{
"calc": "1E-9*(MBOX0C0+MBOX1C0+MBOX2C0+MBOX3C0+MBOX4C0+MBOX5C0+MBOX0C1+MBOX1C1+MBOX2C1+MBOX3C1+MBOX4C1+MBOX5C1)*64.0/time",
"name": "mem_bw",
"unit": "GBytes/s",
"publish": true,
"type": "socket"
},
{
"calc": "PMC0+PMC2",
"name": "dp_vec_ins",
"type": "hwthread",
"publish": false
},
{
"calc": "PMC0+PMC1+PMC2",
"name": "dp_fp_ins",
"type": "hwthread",
"publish": false
}
]
},
{
"events": {
"FIXC0": "INSTR_RETIRED_ANY",
"FIXC1": "CPU_CLK_UNHALTED_CORE",
"FIXC2": "CPU_CLK_UNHALTED_REF",
"PMC0": "FP_ARITH_INST_RETIRED_128B_PACKED_SINGLE",
"PMC1": "FP_ARITH_INST_RETIRED_SCALAR_SINGLE",
"PMC2": "FP_ARITH_INST_RETIRED_256B_PACKED_SINGLE"
},
"metrics": [
{
"calc": "1E-9*(PMC0*4.0+PMC1+PMC2*8.0)/time",
"name": "flops_sp1",
"publish": false,
"type": "hwthread"
},
{
"calc": "PMC0+PMC2",
"name": "sp_vec_ins",
"type": "hwthread",
"publish": false
},
{
"calc": "PMC0+PMC1+PMC2",
"name": "sp_fp_ins",
"type": "hwthread",
"publish": false
}
]
},
{
"events": {
"FIXC0": "INSTR_RETIRED_ANY",
"FIXC1": "CPU_CLK_UNHALTED_CORE",
"FIXC2": "CPU_CLK_UNHALTED_REF",
"PMC0": "FP_ARITH_INST_RETIRED_512B_PACKED_DOUBLE",
"PMC1": "FP_ARITH_INST_RETIRED_512B_PACKED_SINGLE"
},
"metrics": [
{
"calc": "PMC0",
"name": "dp_avx_512_ins",
"type": "hwthread",
"publish": false
},
{
"calc": "1E-9*(PMC0*8.0)/time",
"name": "flops_dp2",
"type": "hwthread",
"publish": false
},
{
"calc": "PMC1",
"name": "sp_avx_512_ins",
"type": "hwthread",
"publish": false
},
{
"calc": "1E-9*(PMC1*16.0)/time",
"name": "flops_sp2",
"type": "hwthread",
"publish": false
}
]
}
],
"globalmetrics": [
{
"calc": "100*((sp_vec_ins+dp_vec_ins+dp_avx_512_ins+sp_avx_512_ins)/(sp_fp_ins+dp_fp_ins+dp_avx_512_ins+sp_avx_512_ins))",
"name": "vectorization_ratio",
"unit": "%",
"type": "hwthread",
"publish": true
},
{
"calc": "(flops_sp1+flops_sp2)",
"name": "flops_sp",
"unit": "GFlops/s",
"type": "hwthread",
"publish": true
},
{
"calc": "(flops_dp1+flops_dp2)",
"name": "flops_dp",
"unit": "GFlops/s",
"type": "hwthread",
"publish": true
},
{
"calc": "((flops_dp1+flops_dp2) * 2) + (flops_sp1+flops_sp2)",
"name": "flops_any",
"unit": "GFlops/s",
"type": "hwthread",
"publish": true
}
]
}
}

View File

@@ -1,8 +1,10 @@
{ {
"sinks": "/etc/cc-metric-collector/sinks.json", "sinks-file": "/etc/cc-metric-collector/sinks.json",
"collectors" : "/etc/cc-metric-collector/collectors.json", "collectors-file" : "/etc/cc-metric-collector/collectors.json",
"receivers" : "/etc/cc-metric-collector/receivers.json", "receivers-file" : "/etc/cc-metric-collector/receivers.json",
"router" : "/etc/cc-metric-collector/router.json", "router-file" : "/etc/cc-metric-collector/router.json",
"interval": "60s", "main" : {
"duration": "10s" "interval": "60s",
} "duration": "10s"
}
}

View File

@@ -1,60 +1,62 @@
{ {
"add_tags" : [ "process_messages": {
{ "rename_messages" : {
"key" : "cluster", "load_one" : "cpu_load",
"value" : "tinygpu", "cpu_load_core" : "cpu_load",
"if" : "*" "net_bytes_in_bw" : "net_bytes_in",
} "net_bytes_out_bw" : "net_bytes_out",
], "net_pkts_in_bw" : "net_pkts_in",
"rename_metrics" : { "net_pkts_out_bw" : "net_pkts_out",
"load_one" : "cpu_load", "ib_recv_bw" : "ib_recv",
"cpu_load_core" : "cpu_load", "ib_xmit_bw" : "ib_xmit",
"net_bytes_in_bw" : "net_bytes_in", "ib_recv_pkts_bw": "ib_recv_pkts",
"net_bytes_out_bw" : "net_bytes_out", "ib_xmit_pkts_bw": "ib_xmit_pkts",
"net_pkts_in_bw" : "net_pkts_in", "lustre_read_bytes_diff" : "lustre_read_bytes",
"net_pkts_out_bw" : "net_pkts_out", "lustre_read_requests_diff" : "lustre_read_requests",
"ib_recv_bw" : "ib_recv", "lustre_write_bytes_diff" : "lustre_write_bytes",
"ib_xmit_bw" : "ib_xmit", "lustre_write_requests_diff" : "lustre_write_requests",
"ib_recv_pkts_bw": "ib_recv_pkts", "lustre_open_diff" : "lustre_open",
"ib_xmit_pkts_bw": "ib_xmit_pkts", "lustre_close_diff" : "lustre_close",
"lustre_read_bytes_diff" : "lustre_read_bytes", "lustre_setattr_diff" : "lustre_setattr",
"lustre_read_requests_diff" : "lustre_read_requests", "lustre_getattr_diff" : "lustre_getattr",
"lustre_write_bytes_diff" : "lustre_write_bytes", "lustre_statfs_diff": "lustre_statfs",
"lustre_write_requests_diff" : "lustre_write_requests", "lustre_inode_permission_diff" : "lustre_inode_permission",
"lustre_open_diff" : "lustre_open", "nv_util" : "acc_utilization",
"lustre_close_diff" : "lustre_close", "nv_fb_mem_used" : "acc_mem_used",
"lustre_setattr_diff" : "lustre_setattr", "nv_power_usage" : "acc_power",
"lustre_getattr_diff" : "lustre_getattr", "pwr_pkg": "cpu_power",
"lustre_statfs_diff": "lustre_statfs", "pwr_dram": "mem_power"
"lustre_inode_permission_diff" : "lustre_inode_permission", },
"nv_util" : "acc_utilization", "add_tags_if" : [
"nv_fb_mem_used" : "acc_mem_used", {
"nv_power_usage" : "acc_power", "key" : "cluster",
"pwr_pkg": "cpu_power", "value" : "tinygpu",
"pwr_dram": "mem_power" "if" : "true"
}
],
"drop_messages" : [
"net_bytes_in",
"net_bytes_out",
"ib_recv",
"ib_xmit",
"ib_recv_pkts",
"ib_xmit_pkts",
"net_pkts_in",
"net_pkts_out",
"lustre_read_bytes",
"lustre_read_requests",
"lustre_write_bytes",
"lustre_write_requests"
],
"change_unit_prefix": {
"name == 'mem_used'": "G",
"name == 'swap_used'": "G",
"name == 'mem_total'": "G",
"name == 'swap_total'": "G",
"name == 'cpufreq'": "M"
},
"normalize_units" : true
}, },
"drop_metrics" : [
"net_bytes_in",
"net_bytes_out",
"ib_recv",
"ib_xmit",
"ib_recv_pkts",
"ib_xmit_pkts",
"net_pkts_in",
"net_pkts_out",
"lustre_read_bytes",
"lustre_read_requests",
"lustre_write_bytes",
"lustre_write_requests"
],
"interval_timestamp" : false, "interval_timestamp" : false,
"num_cache_intervals" : 0, "num_cache_intervals" : 0
"change_unit_prefix": {
"mem_used": "G",
"swap_used": "G",
"mem_total": "G",
"swap_total": "G",
"cpufreq": "M"
},
"normalize_metrics" : true
} }

View File

@@ -1,26 +1,53 @@
{ {
"influx": { "influx" : {
"type": "influxasync", "type" : "influxasync",
"host": "monitoring-test.nhr.uni-erlangen.de", "host": "monitoring-test.nhr.uni-erlangen.de",
"port": "8086", "port": "8086",
"organization": "ClusterCockpit", "organization" : "ClusterCockpit",
"database": "tinygpu", "database" : "tinygpu",
"password": "XZY", "password": "XYZ",
"ssl": true, "ssl": true,
"meta_as_tags": [ "process_messages": {
"unit" "move_meta_to_tag_if": [
] {
}, "key": "unit",
"metricstore": { "if": "true"
"type": "http", }
"url": "http://monitoring.nhr.fau.de:8082/api/write?cluster=tinygpu", ]
"jwt": "XZY",
"meta_as_tags": [
"unit"
],
"idle_connection_timeout": "60s",
"flush_delay": "2s",
"max_retries": 1,
"timeout": "10s"
} }
},
"metricstore" : {
"type" : "http",
"url" : "http://monitoring.nhr.fau.de:8082/api/write?cluster=tinygpu",
"jwt": "XYZ",
"idle_connection_timeout": "60s",
"flush_delay" : "2s",
"max_retries" : 1,
"timeout" : "10s",
"precision": "s",
"process_messages": {
"move_meta_to_tag_if": [
{
"key": "unit",
"if": "true"
}
]
}
},
"tinygpunats": {
"type": "nats",
"host": "monitoring.nhr.fau.de",
"database": "tinygpu",
"nkey_file": "/etc/cc-metric-collector/nats.nkey",
"flush_delay": "1s",
"precision": "s",
"process_messages": {
"move_meta_to_tag_if": [
{
"key": "unit",
"if": "true"
}
]
}
}
} }

View File

@@ -1,30 +0,0 @@
{
"influx": {
"type": "influxasync",
"host": "monitoring-test.nhr.uni-erlangen.de",
"port": "8086",
"organization": "ClusterCockpit",
"database": "tinygpu",
"password": "XZY",
"ssl": true,
"meta_as_tags": [
"unit"
]
},
"metricstore": {
"type": "http",
"url": "http://monitoring.nhr.fau.de:8082/api/write?cluster=tinygpu",
"jwt": "XZY",
"meta_as_tags": [
"unit"
],
"idle_connection_timeout": "60s",
"flush_delay": "2s",
"max_retries": 1,
"timeout": "10s"
},
"debugstdout": {
"type": "stdout",
"output_file": "/tmp/debug.log"
}
}

View File

@@ -30,6 +30,7 @@
} }
}, },
"nfs4stat" : {}, "nfs4stat" : {},
"nfsiostat" : {},
"likwid": { "likwid": {
"force_overwrite" : true, "force_overwrite" : true,
"invalid_to_zero" : true, "invalid_to_zero" : true,

View File

@@ -18,6 +18,7 @@
}, },
"tempstat" : {}, "tempstat" : {},
"nfs4stat" : {}, "nfs4stat" : {},
"nfsiostat" : {},
"likwid": { "likwid": {
"force_overwrite" : true, "force_overwrite" : true,
"invalid_to_zero" : true, "invalid_to_zero" : true,

View File

@@ -1,8 +1,10 @@
{ {
"sinks": "/etc/cc-metric-collector/sinks.json", "sinks-file": "/etc/cc-metric-collector/sinks.json",
"collectors" : "/etc/cc-metric-collector/collectors.json", "collectors-file" : "/etc/cc-metric-collector/collectors.json",
"receivers" : "/etc/cc-metric-collector/receivers.json", "receivers-file" : "/etc/cc-metric-collector/receivers.json",
"router" : "/etc/cc-metric-collector/router.json", "router-file" : "/etc/cc-metric-collector/router.json",
"interval": "60s", "main" : {
"duration": "10s" "interval": "60s",
} "duration": "10s"
}
}

View File

@@ -1,54 +1,56 @@
{ {
"add_tags" : [ "process_messages":{
{ "rename_messages" : {
"key" : "cluster", "load_one" : "cpu_load",
"value" : "woodyng", "cpu_load_core" : "cpu_load",
"if" : "*" "net_bytes_in_bw" : "net_bytes_in",
} "net_bytes_out_bw" : "net_bytes_out",
], "net_pkts_in_bw" : "net_pkts_in",
"rename_metrics" : { "net_pkts_out_bw" : "net_pkts_out",
"load_one" : "cpu_load", "ib_recv_bw" : "ib_recv",
"cpu_load_core" : "cpu_load", "ib_xmit_bw" : "ib_xmit",
"net_bytes_in_bw" : "net_bytes_in", "ib_recv_pkts_bw": "ib_recv_pkts",
"net_bytes_out_bw" : "net_bytes_out", "ib_xmit_pkts_bw": "ib_xmit_pkts",
"net_pkts_in_bw" : "net_pkts_in", "lustre_read_bytes_diff" : "lustre_read_bytes",
"net_pkts_out_bw" : "net_pkts_out", "lustre_read_requests_diff" : "lustre_read_requests",
"ib_recv_bw" : "ib_recv", "lustre_write_bytes_diff" : "lustre_write_bytes",
"ib_xmit_bw" : "ib_xmit", "lustre_write_requests_diff" : "lustre_write_requests",
"ib_recv_pkts_bw": "ib_recv_pkts", "lustre_open_diff" : "lustre_open",
"ib_xmit_pkts_bw": "ib_xmit_pkts", "lustre_close_diff" : "lustre_close",
"lustre_read_bytes_diff" : "lustre_read_bytes", "lustre_setattr_diff" : "lustre_setattr",
"lustre_read_requests_diff" : "lustre_read_requests", "lustre_getattr_diff" : "lustre_getattr",
"lustre_write_bytes_diff" : "lustre_write_bytes", "lustre_statfs_diff": "lustre_statfs",
"lustre_write_requests_diff" : "lustre_write_requests", "lustre_inode_permission_diff" : "lustre_inode_permission"
"lustre_open_diff" : "lustre_open", },
"lustre_close_diff" : "lustre_close", "add_tags_if" : [
"lustre_setattr_diff" : "lustre_setattr", {
"lustre_getattr_diff" : "lustre_getattr", "key" : "cluster",
"lustre_statfs_diff": "lustre_statfs", "value" : "woody",
"lustre_inode_permission_diff" : "lustre_inode_permission" "if" : "true"
}
],
"drop_messages" : [
"net_bytes_in",
"net_bytes_out",
"ib_recv",
"ib_xmit",
"ib_recv_pkts",
"ib_xmit_pkts",
"net_pkts_in",
"net_pkts_out",
"lustre_read_bytes",
"lustre_read_requests",
"lustre_write_bytes",
"lustre_write_requests"
],
"change_unit_prefix": {
"name == 'mem_used'": "G",
"name == 'swap_used'": "G",
"name == 'mem_total'": "G",
"name == 'swap_total'": "G"
},
"normalize_metrics" : true
}, },
"drop_metrics" : [
"net_bytes_in",
"net_bytes_out",
"ib_recv",
"ib_xmit",
"ib_recv_pkts",
"ib_xmit_pkts",
"net_pkts_in",
"net_pkts_out",
"lustre_read_bytes",
"lustre_read_requests",
"lustre_write_bytes",
"lustre_write_requests"
],
"interval_timestamp" : true, "interval_timestamp" : true,
"num_cache_intervals" : 0, "num_cache_intervals" : 0
"change_unit_prefix": {
"mem_used": "G",
"swap_used": "G",
"mem_total": "G",
"swap_total": "G"
},
"normalize_metrics" : true
} }

View File

@@ -1,20 +1,21 @@
{ {
"nhrinflux": { "nhrinflux" : {
"type": "influxasync", "type" : "influxasync",
"host": "monitoring-test.nhr.uni-erlangen.de", "host": "monitoring-test.nhr.uni-erlangen.de",
"port": "8086", "port": "8086",
"organization": "ClusterCockpit", "organization" : "ClusterCockpit",
"database": "woodyng", "database" : "woodyng",
"password": "XZY", "password": "XYZ",
"ssl": true "ssl": true
}, },
"woodystore": { "woodystore" : {
"type": "http", "type" : "http",
"url": "http://monitoring.nhr.fau.de:8082/api/write?cluster=woodyng", "url" : "http://monitoring.nhr.fau.de:8082/api/write?cluster=woody",
"jwt": "XZY", "jwt": "XYZ",
"meta_as_tags": [ "meta_as_tags" : [
"unit" "unit"
], ],
"idle_connection_timeout": "60s" "idle_connection_timeout": "60s",
} "precision": "s"
}
} }

View File

@@ -1,19 +0,0 @@
[Unit]
Description=ClusterCockpit In-Memory Timeseries Database for Fritz (cc-metric-store)
Documentation=https://github.com/ClusterCockpit/cc-metric-store
Wants=network-online.target
After=network-online.target
[Service]
Type=simple
User=clustercockpit
Group=clustercockpit
Restart=on-failure
RestartSec=30
TimeoutStopSec=100
WorkingDirectory=/opt/monitoring/cc-metric-store/fritz
ExecStart=/opt/monitoring/cc-metric-store/repo/cc-metric-store --config ./config.json
LimitNOFILE=500000
[Install]
WantedBy=multi-user.target

View File

@@ -1,180 +0,0 @@
{
"metrics": {
"clock": {
"frequency": 60,
"aggregation": "avg"
},
"cpu_idle": {
"frequency": 60,
"aggregation": "avg"
},
"cpu_iowait": {
"frequency": 60,
"aggregation": "avg"
},
"cpu_irq": {
"frequency": 60,
"aggregation": "avg"
},
"cpu_system": {
"frequency": 60,
"aggregation": "avg"
},
"cpu_user": {
"frequency": 60,
"aggregation": "avg"
},
"nv_mem_util": {
"frequency": 60,
"aggregation": "avg"
},
"nv_temp": {
"frequency": 60,
"aggregation": "avg"
},
"nv_sm_clock": {
"frequency": 60,
"aggregation": "avg"
},
"acc_utilization": {
"frequency": 60,
"aggregation": "avg"
},
"acc_mem_used": {
"frequency": 60,
"aggregation": "sum"
},
"acc_power": {
"frequency": 60,
"aggregation": "sum"
},
"flops_any": {
"frequency": 60,
"aggregation": "sum"
},
"flops_dp": {
"frequency": 60,
"aggregation": "sum"
},
"flops_sp": {
"frequency": 60,
"aggregation": "sum"
},
"ib_recv": {
"frequency": 60,
"aggregation": "sum"
},
"ib_xmit": {
"frequency": 60,
"aggregation": "sum"
},
"ib_recv_pkts": {
"frequency": 60,
"aggregation": "sum"
},
"ib_xmit_pkts": {
"frequency": 60,
"aggregation": "sum"
},
"cpu_power": {
"frequency": 60,
"aggregation": "sum"
},
"core_power": {
"frequency": 60,
"aggregation": "sum"
},
"mem_power": {
"frequency": 60,
"aggregation": "sum"
},
"ipc": {
"frequency": 60,
"aggregation": "avg"
},
"cpu_load": {
"frequency": 60,
"aggregation": null
},
"lustre_close": {
"frequency": 60,
"aggregation": null
},
"lustre_open": {
"frequency": 60,
"aggregation": null
},
"lustre_statfs": {
"frequency": 60,
"aggregation": null
},
"lustre_read_bytes": {
"frequency": 60,
"aggregation": null
},
"lustre_write_bytes": {
"frequency": 60,
"aggregation": null
},
"net_bw": {
"frequency": 60,
"aggregation": null
},
"file_bw": {
"frequency": 60,
"aggregation": null
},
"mem_bw": {
"frequency": 60,
"aggregation": "sum"
},
"mem_cached": {
"frequency": 60,
"aggregation": null
},
"mem_used": {
"frequency": 60,
"aggregation": null
},
"net_bytes_in": {
"frequency": 60,
"aggregation": null
},
"net_bytes_out": {
"frequency": 60,
"aggregation": null
},
"nfs4_read": {
"frequency": 60,
"aggregation": null
},
"nfs4_total": {
"frequency": 60,
"aggregation": null
},
"nfs4_write": {
"frequency": 60,
"aggregation": null
},
"vectorization_ratio": {
"frequency": 60,
"aggregation": "avg"
}
},
"checkpoints": {
"interval": "12h",
"directory": "/opt/monitoring/cc-metric-store/fritz/checkpoints",
"restore": "48h"
},
"archive": {
"interval": "50h",
"directory": "/opt/monitoring/cc-metric-store/fritz/archive"
},
"http-api": {
"address": "0.0.0.0:8082",
"https-cert-file": null,
"https-key-file": null
},
"retention-in-memory": "48h",
"jwt-public-key": "-"
}

View File

@@ -0,0 +1,42 @@
{
"ccRestUrl": "https://monitoring.nhr.fau.de",
"ccRestJwt": "XYZ",
"lastRunPath": "/home.local/hpcop/last_run",
"natsServer": "monitoring.nhr.fau.de",
"natsSubject": "alex",
"natsUser": "alex",
"natsPassword": "XYZ",
"slurmPollInterval": 300,
"gpuPciAddrs": {
"^(a0[1-4]\\d\\d|a052\\d|a162\\d|a172\\d)$" : [
"00000000:01:00.0",
"00000000:25:00.0",
"00000000:41:00.0",
"00000000:61:00.0",
"00000000:81:00.0",
"00000000:A1:00.0",
"00000000:C1:00.0",
"00000000:E1:00.0"
],
"^(a0704|a0731|a0832|a0833)$" : [
"00000000:0E:00.0",
"00000000:13:00.0",
"00000000:49:00.0",
"00000000:4F:00.0",
"00000000:91:00.0",
"00000000:97:00.0",
"00000000:CD:00.0",
"00000000:D2:00.0"
],
"^(a0[6-9]\\d\\d|a053\\d)$" : [
"00000000:0E:00.0",
"00000000:13:00.0",
"00000000:49:00.0",
"00000000:4F:00.0",
"00000000:90:00.0",
"00000000:96:00.0",
"00000000:CC:00.0",
"00000000:D1:00.0"
]
}
}

View File

@@ -0,0 +1,19 @@
{
"ccRestUrl": "https://monitoring.nhr.fau.de",
"ccRestJwt": "XYZ",
"lastRunPath": "/home.local/hpcop/last_run",
"natsServer": "monitoring.nhr.fau.de",
"natsSubject": "fritz",
"natsUser": "fritz",
"natsPassword": "XYZ",
"slurmPollInterval": 300,
"ignoreHosts": "^fviz1$",
"gpuPciAddrs": {
"fviz1" : [
"00000000:CE:00.0",
"00000000:CF:00.0",
"00000000:D0:00.0",
"00000000:D1:00.0"
]
}
}

View File

@@ -0,0 +1,24 @@
{
"ccRestUrl": "https://monitoring.nhr.fau.de",
"ccRestJwt": "XYZ",
"lastRunPath": "/home.local/hpcop/last_run",
"natsServer": "monitoring.nhr.fau.de",
"natsSubject": "helma",
"natsUser": "helma-slurm-adapter",
"natsPassword": "XYZ",
"slurmPollInterval": 300,
"gpuPciAddrs": {
"h1[1-4]-[0-9][0-9]" : [
"00000000:06:00.0",
"00000000:26:00.0",
"00000000:A6:00.0",
"00000000:C6:00.0"
],
"h2[0-9]-[0-9][0-9]" : [
"00000000:06:00.0",
"00000000:26:00.0",
"00000000:A6:00.0",
"00000000:C6:00.0"
]
}
}

View File

@@ -0,0 +1,47 @@
{
"ccRestUrl": "https://monitoring.nhr.fau.de",
"ccRestJwt": "XYZ",
"lastRunPath": "/home.local/hpcop/last_run",
"natsServer": "monitoring.nhr.fau.de",
"natsSubject": "tinyx",
"natsUser": "tinyx",
"natsPassword": "XYZ",
"slurmPollInterval": 300,
"ignoreHosts": "^tg0[3-4]\\d$",
"gpuPciAddrs": {
"^tg04[0-9a-z]$" : [
"00000000:02:00.0",
"00000000:03:00.0",
"00000000:82:00.0",
"00000000:83:00.0"
],
"^tg06[0-9a-z]$" : [
"00000000:18:00.0",
"00000000:3B:00.0",
"00000000:86:00.0",
"00000000:AF:00.0"
],
"^tg07[0-9a-z]$" : [
"00000000:18:00.0",
"00000000:3B:00.0",
"00000000:86:00.0",
"00000000:AF:00.0"
],
"^tg08[0-9a-z]$" : [
"00000000:1A:00.0",
"00000000:1B:00.0",
"00000000:3D:00.0",
"00000000:3E:00.0",
"00000000:B1:00.0",
"00000000:B2:00.0",
"00000000:DA:00.0",
"00000000:DB:00.0"
],
"^tg09[0-9a-z]$" : [
"00000000:01:00.0",
"00000000:41:00.0",
"00000000:81:00.0",
"00000000:C1:00.0"
]
}
}

View File

@@ -0,0 +1,11 @@
{
"ccRestUrl": "https://monitoring.nhr.fau.de",
"ccRestJwt": "XYZ",
"lastRunPath": "/home.local/hpcop/last_run",
"natsServer": "monitoring.nhr.fau.de",
"natsSubject": "woody",
"natsUser": "woody",
"natsPassword": "XYZ",
"slurmQueryMaxSpan": 86400,
"slurmPollInterval": 300
}

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

View File

@@ -1,357 +1,261 @@
{ {
"name": "meggie", "name": "meggie",
"metricConfig": [ "metricConfig": [
{ {
"name": "cpu_load", "name": "cpu_load",
"unit": { "unit": {
"base": "load" "base": "load"
}, },
"scope": "node", "scope": "node",
"aggregation": "avg", "aggregation": "avg",
"footprint": "avg", "footprint": "avg",
"timestep": 60, "timestep": 60,
"peak": 40, "peak": 40,
"normal": 20, "normal": 20,
"caution": 15, "caution": 15,
"alert": 10 "alert": 10
},
{
"name": "mem_used",
"unit": {
"base": "B",
"prefix": "G"
},
"scope": "node",
"aggregation": "sum",
"footprint": "max",
"timestep": 60,
"peak": 64,
"normal": 20,
"caution": 40,
"alert": 55
},
{
"name": "flops_any",
"unit": {
"base": "Flops/s",
"prefix": "G"
},
"scope": "node",
"aggregation": "sum",
"footprint": "avg",
"timestep": 60,
"peak": 1536,
"normal": 200,
"caution": 40,
"alert": 4
},
{
"name": "flops_sp",
"unit": {
"base": "Flops/s",
"prefix": "G"
},
"scope": "node",
"aggregation": "sum",
"timestep": 60,
"peak": 1536,
"normal": 100,
"caution": 20,
"alert": 2
},
{
"name": "flops_dp",
"unit": {
"base": "Flops/s",
"prefix": "G"
},
"scope": "node",
"aggregation": "sum",
"timestep": 60,
"peak": 768,
"normal": 50,
"caution": 10,
"alert": 2
},
{
"name": "mem_bw",
"unit": {
"base": "B/s",
"prefix": "G"
},
"scope": "node",
"aggregation": "sum",
"footprint": "avg",
"timestep": 60,
"peak": 140,
"normal": 70,
"caution": 20,
"alert": 5
},
{
"name": "clock",
"unit": {
"base": "Hz",
"prefix": "M"
},
"scope": "node",
"aggregation": "avg",
"timestep": 60,
"peak": 3000,
"normal": 2400,
"caution": 1800,
"alert": 1200
},
{
"name": "cpu_power",
"unit": {
"base": "W"
},
"scope": "socket",
"aggregation": "sum",
"energy": "power",
"timestep": 60,
"peak": 80,
"normal": 30,
"caution": 10,
"alert": 5
},
{
"name": "mem_power",
"unit": {
"base": "W"
},
"scope": "socket",
"aggregation": "sum",
"energy": "power",
"timestep": 60,
"peak": 100,
"normal": 50,
"caution": 20,
"alert": 10
},
{
"name": "ipc",
"unit": {
"base": "IPC"
},
"scope": "node",
"aggregation": "avg",
"timestep": 60,
"peak": 4,
"normal": 2,
"caution": 1,
"alert": 0.5
},
{
"name": "vectorization_ratio",
"unit": {
"base": ""
},
"scope": "hwthread",
"aggregation": "avg",
"timestep": 60,
"peak": 100,
"normal": 60,
"caution": 40,
"alert": 10
},
{
"name": "nfs4_read",
"unit": {
"base": "IOP",
"prefix": ""
},
"scope": "node",
"aggregation": "sum",
"timestep": 60,
"peak": 6,
"normal": 4,
"caution": 2,
"alert": 1
},
{
"name": "nfs4_total",
"unit": {
"base": "IOP",
"prefix": ""
},
"scope": "node",
"aggregation": "sum",
"timestep": 60,
"peak": 6,
"normal": 4,
"caution": 2,
"alert": 1
}
],
"subClusters": [
{
"name": "main",
"nodes": "m[0101-0164,0201-0264,0301-0364,0401-0464,0601-0676,0701-0776,0801-0872,0901-0972,1001-1072,1101-1172]",
"processorType": "Intel Broadwell",
"socketsPerNode": 2,
"coresPerSocket": 10,
"threadsPerCore": 1,
"flopRateScalar": {
"unit": {
"base": "F/s",
"prefix": "G"
}, },
"value": 96 {
}, "name": "mem_used",
"flopRateSimd": { "unit": {
"unit": { "base": "B",
"base": "F/s", "prefix": "G"
"prefix": "G" },
"scope": "node",
"aggregation": "sum",
"footprint": "max",
"timestep": 60,
"peak": 64,
"normal": 20,
"caution": 40,
"alert": 55
}, },
"value": 1536 {
}, "name": "flops_any",
"memoryBandwidth": { "unit": {
"unit": { "base": "Flops/s",
"base": "B/s", "prefix": "G"
"prefix": "G" },
"scope": "node",
"aggregation": "sum",
"footprint": "avg",
"timestep": 60,
"peak": 1536,
"normal": 200,
"caution": 40,
"alert": 4
}, },
"value": 140 {
}, "name": "flops_sp",
"topology": { "unit": {
"node": [ "base": "Flops/s",
0, "prefix": "G"
1, },
2, "scope": "node",
3, "aggregation": "sum",
4, "timestep": 60,
5, "peak": 1536,
6, "normal": 100,
7, "caution": 20,
8, "alert": 2
9, },
10, {
11, "name": "flops_dp",
12, "unit": {
13, "base": "Flops/s",
14, "prefix": "G"
15, },
16, "scope": "node",
17, "aggregation": "sum",
18, "timestep": 60,
19 "peak": 768,
], "normal": 50,
"socket": [ "caution": 10,
[ "alert": 2
0, },
1, {
2, "name": "net_bytes_in",
3, "unit": {
4, "base": "B/s"
5, },
6, "scope": "node",
7, "aggregation": "sum",
8, "timestep": 60,
9 "peak": 50000000,
], "normal": 10000000,
[ "caution": 5000,
10, "alert": 1000
11, },
12, {
13, "name": "net_bytes_out",
14, "unit": {
15, "base": "B/s"
16, },
17, "scope": "node",
18, "aggregation": "sum",
19 "timestep": 60,
] "peak": 50000000,
], "normal": 200000,
"memoryDomain": [ "caution": 5000,
[ "alert": 1000
0, },
1, {
2, "name": "mem_bw",
3, "unit": {
4, "base": "B/s",
5, "prefix": "G"
6, },
7, "scope": "node",
8, "aggregation": "sum",
9 "footprint": "avg",
], "timestep": 60,
[ "peak": 140,
10, "normal": 70,
11, "caution": 20,
12, "alert": 5
13, },
14, {
15, "name": "clock",
16, "unit": {
17, "base": "Hz",
18, "prefix": "M"
19 },
] "scope": "node",
], "aggregation": "avg",
"core": [ "timestep": 60,
[ "peak": 3000,
0 "normal": 2400,
], "caution": 1800,
[ "alert": 1200
1 },
], {
[ "name": "cpu_power",
2 "unit": {
], "base": "W"
[ },
3 "scope": "socket",
], "aggregation": "sum",
[ "energy": "power",
4 "timestep": 60,
], "peak": 80,
[ "normal": 30,
5 "caution": 10,
], "alert": 5
[ },
6 {
], "name": "mem_power",
[ "unit": {
7 "base": "W"
], },
[ "scope": "socket",
8 "aggregation": "sum",
], "energy": "power",
[ "timestep": 60,
9 "peak": 100,
], "normal": 50,
[ "caution": 20,
10 "alert": 10
], },
[ {
11 "name": "ipc",
], "unit": {
[ "base": "IPC"
12 },
], "scope": "node",
[ "aggregation": "avg",
13 "timestep": 60,
], "peak": 4,
[ "normal": 2,
14 "caution": 1,
], "alert": 0.5
[ },
15 {
], "name": "vectorization_ratio",
[ "unit": {
16 "base": ""
], },
[ "scope": "hwthread",
17 "aggregation": "avg",
], "timestep": 60,
[ "peak": 100,
18 "normal": 60,
], "caution": 40,
[ "alert": 10
19 },
] {
] "name": "nfs4_read",
} "unit": {
} "base": "IOP",
] "prefix": ""
},
"scope": "node",
"aggregation": "sum",
"timestep": 60,
"peak": 50000,
"normal": 10000,
"caution": 10,
"alert": 1
},
{
"name": "nfs4_total",
"unit": {
"base": "IOP",
"prefix": ""
},
"scope": "node",
"aggregation": "sum",
"timestep": 60,
"peak": 50000,
"normal": 10000,
"caution": 20,
"alert": 5
}
],
"subClusters": [
{
"name": "main",
"nodes": "m[0101-0164,0201-0264,0301-0364,0401-0464,0601-0676,0701-0776,0801-0872,0901-0972,1001-1072,1101-1172]",
"processorType": "Intel Broadwell",
"socketsPerNode": 2,
"coresPerSocket": 10,
"threadsPerCore": 1,
"flopRateScalar": {
"unit": {
"base": "F/s",
"prefix": "G"
},
"value": 96
},
"flopRateSimd": {
"unit": {
"base": "F/s",
"prefix": "G"
},
"value": 1536
},
"memoryBandwidth": {
"unit": {
"base": "B/s",
"prefix": "G"
},
"value": 140
},
"topology": {
"node": [
0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19
],
"socket": [
[ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9 ],
[ 10, 11, 12, 13, 14, 15, 16, 17, 18, 19 ]
],
"memoryDomain": [
[ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9 ],
[ 10, 11, 12, 13, 14, 15, 16, 17, 18, 19 ]
],
"core": [
[ 0 ], [ 1 ], [ 2 ], [ 3 ], [ 4 ], [ 5 ], [ 6 ], [ 7 ], [ 8 ], [ 9 ], [ 10 ], [ 11 ], [ 12 ], [ 13 ], [ 14 ], [ 15 ], [ 16 ], [ 17 ], [ 18 ], [ 19 ]
]
}
}
]
} }

View File

@@ -178,9 +178,9 @@
"scope": "node", "scope": "node",
"aggregation": "sum", "aggregation": "sum",
"timestep": 60, "timestep": 60,
"peak": 6, "peak": 50000,
"normal": 4, "normal": 10000,
"caution": 2, "caution": 10,
"alert": 1 "alert": 1
}, },
{ {
@@ -192,9 +192,9 @@
"scope": "node", "scope": "node",
"aggregation": "sum", "aggregation": "sum",
"timestep": 60, "timestep": 60,
"peak": 6, "peak": 50000,
"normal": 4, "normal": 10000,
"caution": 2, "caution": 10,
"alert": 1 "alert": 1
}, },
{ {
@@ -206,10 +206,10 @@
"scope": "node", "scope": "node",
"aggregation": "sum", "aggregation": "sum",
"timestep": 60, "timestep": 60,
"peak": 6, "peak": 50000,
"normal": 4, "normal": 10000,
"caution": 2, "caution": 20,
"alert": 1 "alert": 5
} }
], ],
"subClusters": [ "subClusters": [

View File

@@ -86,6 +86,32 @@
"caution": 100, "caution": 100,
"alert": 50 "alert": 50
}, },
{
"name": "net_bytes_in",
"unit": {
"base": "B/s"
},
"scope": "node",
"aggregation": "sum",
"timestep": 60,
"peak": 50000000,
"normal": 10000000,
"caution": 5000,
"alert": 1000
},
{
"name": "net_bytes_out",
"unit": {
"base": "B/s"
},
"scope": "node",
"aggregation": "sum",
"timestep": 60,
"peak": 50000000,
"normal": 200000,
"caution": 5000,
"alert": 1000
},
{ {
"name": "mem_bw", "name": "mem_bw",
"unit": { "unit": {
@@ -197,14 +223,14 @@
"caution": 10000, "caution": 10000,
"alert": 5000, "alert": 5000,
"subClusters": [ "subClusters": [
{ {
"name": "a100", "name": "a100",
"peak": 160000, "peak": 160000,
"normal": 120000, "normal": 120000,
"caution": 80000, "caution": 80000,
"alert": 40000 "alert": 40000
}, },
{ {
"name": "v100", "name": "v100",
"peak": 128000, "peak": 128000,
"normal": 96000, "normal": 96000,
@@ -234,6 +260,7 @@
}, },
"scope": "accelerator", "scope": "accelerator",
"aggregation": "sum", "aggregation": "sum",
"energy": "power",
"timestep": 60, "timestep": 60,
"peak": 400, "peak": 400,
"normal": 200, "normal": 200,
@@ -290,9 +317,9 @@
"scope": "node", "scope": "node",
"aggregation": "sum", "aggregation": "sum",
"timestep": 60, "timestep": 60,
"peak": 6, "peak": 50000,
"normal": 4, "normal": 10000,
"caution": 2, "caution": 10,
"alert": 1 "alert": 1
}, },
{ {
@@ -304,9 +331,9 @@
"scope": "node", "scope": "node",
"aggregation": "sum", "aggregation": "sum",
"timestep": 60, "timestep": 60,
"peak": 6, "peak": 50000,
"normal": 4, "normal": 10000,
"caution": 2, "caution": 10,
"alert": 1 "alert": 1
}, },
{ {
@@ -318,10 +345,10 @@
"scope": "node", "scope": "node",
"aggregation": "sum", "aggregation": "sum",
"timestep": 60, "timestep": 60,
"peak": 6, "peak": 50000,
"normal": 4, "normal": 10000,
"caution": 2, "caution": 20,
"alert": 1 "alert": 5
} }
], ],
"subClusters": [ "subClusters": [

File diff suppressed because it is too large Load Diff