mirror of
https://github.com/ClusterCockpit/cc-examples.git
synced 2025-07-28 07:36:11 +02:00
Rename folder and update config
This commit is contained in:
37
nhr@fau/README.md
Normal file
37
nhr@fau/README.md
Normal file
@@ -0,0 +1,37 @@
|
||||
# ClusterCockpit at NHR@FAU
|
||||
|
||||
NHR@FAU provides a production instance of ClusterCockpit for support personnel
|
||||
and users. Authentication is via an LDAP directory as well as via our HPC Portal
|
||||
(homegrown account management platform) using JWT tokens.
|
||||
|
||||
You can find an overview of all clusters
|
||||
[here](https://doc.nhr.fau.de/clusters/overview/).
|
||||
|
||||
Some systems run with job-exclusive nodes; others have node sharing enabled.
|
||||
There are CPU systems (Fritz, Meggie, Woody, TinyFat) as well as GPU-accelerated
|
||||
clusters (Alex, TinyGPU).
|
||||
|
||||
NHR@FAU uses the following stack:
|
||||
|
||||
* `cc-metric-collector` as node agent
|
||||
* `cc-metric-store` as temporal metric time series cache. We use one instance
|
||||
for all clusters.
|
||||
* `cc-backend`
|
||||
* A homegrown Python script running on the management nodes for providing job
|
||||
metadata from Slurm
|
||||
* Built-in SQLite database for job metadata and user data (currently 50 GB)
|
||||
* Job Archive without retention using compressed data.json files (around 700GB)
|
||||
|
||||
Currently all APIs use the regular HTTP protocol, but we plan to switch to NATS for
|
||||
all communication.
|
||||
We also push the metric data to an InfluxDB instance for debugging purposes.
|
||||
|
||||
The backend and metric store run on the same dedicated Dell server running
|
||||
Ubuntu Linux:
|
||||
|
||||
* Two Intel Xeon(R) Platinum 8352Y with 32 cores each
|
||||
* 512 GB Main memory capacity
|
||||
* An NVMe RAID with two 7 TB disks
|
||||
|
||||
This configuration is probably complete overkill, but we wanted to be on the
|
||||
safe side.
|
18
nhr@fau/cc-backend/clustercockpit.service
Normal file
18
nhr@fau/cc-backend/clustercockpit.service
Normal file
@@ -0,0 +1,18 @@
|
||||
[Unit]
|
||||
Description=ClusterCockpit Web Server
|
||||
Documentation=https://clustercockpit.org
|
||||
Wants=network-online.target
|
||||
After=network-online.target
|
||||
After=mariadb.service mysql.service
|
||||
|
||||
[Service]
|
||||
WorkingDirectory=/opt/monitoring/cc-backend
|
||||
Type=notify
|
||||
NotifyAccess=all
|
||||
Restart=on-failure
|
||||
RestartSec=30
|
||||
TimeoutStopSec=100
|
||||
ExecStart=/opt/monitoring/cc-backend/cc-backend -loglevel info -server -config ./config.json
|
||||
|
||||
[Install]
|
||||
WantedBy=multi-user.target
|
241
nhr@fau/cc-backend/config.json
Normal file
241
nhr@fau/cc-backend/config.json
Normal file
@@ -0,0 +1,241 @@
|
||||
{
|
||||
"addr": "0.0.0.0:443",
|
||||
"stop-jobs-exceeding-walltime": 288000,
|
||||
"short-running-jobs-duration": 300,
|
||||
"ldap": {
|
||||
"url": "ldaps://hpcldap.rrze.uni-erlangen.de",
|
||||
"user_base": "ou=people,ou=hpc,dc=rrze,dc=uni-erlangen,dc=de",
|
||||
"search_dn": "cn=hpcmonitoring,ou=roadm,ou=profile,ou=hpc,dc=rrze,dc=uni-erlangen,dc=de",
|
||||
"user_bind": "uid={username},ou=people,ou=hpc,dc=rrze,dc=uni-erlangen,dc=de",
|
||||
"user_filter": "(&(objectclass=posixAccount))",
|
||||
"sync_interval": "24h"
|
||||
},
|
||||
"jwts": {
|
||||
"syncUserOnLogin": true,
|
||||
"updateUserOnLogin": true,
|
||||
"trustedIssuer": "https://portal.hpc.fau.de/",
|
||||
"validateUser": false,
|
||||
"max-age": "168h"
|
||||
},
|
||||
"https-cert-file": "/etc/letsencrypt/live/monitoring.nhr.fau.de/fullchain.pem",
|
||||
"https-key-file": "/etc/letsencrypt/live/monitoring.nhr.fau.de/privkey.pem",
|
||||
"user": "clustercockpit",
|
||||
"group": "clustercockpit",
|
||||
"archive": {
|
||||
"kind": "file",
|
||||
"path": "./var/job-archive",
|
||||
"compression": 7,
|
||||
"retention": {
|
||||
"policy": "none"
|
||||
}
|
||||
},
|
||||
"enable-resampling": {
|
||||
"trigger": 30,
|
||||
"resolutions": [
|
||||
600,
|
||||
300,
|
||||
120,
|
||||
60
|
||||
]
|
||||
},
|
||||
"emission-constant": 317,
|
||||
"ui-defaults": {
|
||||
"analysis_view_histogramMetrics": [
|
||||
"flops_any",
|
||||
"mem_bw",
|
||||
"mem_used"
|
||||
],
|
||||
"analysis_view_scatterPlotMetrics": [
|
||||
[
|
||||
"flops_any",
|
||||
"mem_bw"
|
||||
],
|
||||
[
|
||||
"flops_any",
|
||||
"cpu_load"
|
||||
],
|
||||
[
|
||||
"cpu_load",
|
||||
"mem_bw"
|
||||
]
|
||||
],
|
||||
"job_view_nodestats_selectedMetrics": [
|
||||
"flops_any",
|
||||
"mem_bw",
|
||||
"mem_used"
|
||||
],
|
||||
"job_view_polarPlotMetrics": [
|
||||
"flops_any",
|
||||
"mem_bw",
|
||||
"mem_used"
|
||||
],
|
||||
"job_view_selectedMetrics": [
|
||||
"flops_any",
|
||||
"mem_bw",
|
||||
"mem_used"
|
||||
],
|
||||
"job_view_showFootprint": true,
|
||||
"job_list_usePaging": false,
|
||||
"plot_general_colorBackground": true,
|
||||
"plot_general_colorscheme": [
|
||||
"#00bfff",
|
||||
"#0000ff",
|
||||
"#ff00ff",
|
||||
"#ff0000",
|
||||
"#ff8000",
|
||||
"#ffff00",
|
||||
"#80ff00"
|
||||
],
|
||||
"plot_general_lineWidth": 3,
|
||||
"plot_list_jobsPerPage": 10,
|
||||
"plot_list_selectedMetrics": [
|
||||
"cpu_load",
|
||||
"mem_used",
|
||||
"flops_any",
|
||||
"mem_bw"
|
||||
],
|
||||
"plot_view_plotsPerRow": 3,
|
||||
"plot_view_showPolarplot": true,
|
||||
"plot_view_showRoofline": true,
|
||||
"plot_view_showStatTable": true,
|
||||
"system_view_selectedMetric": "cpu_load",
|
||||
"analysis_view_selectedTopEntity": "user",
|
||||
"analysis_view_selectedTopCategory": "totalWalltime",
|
||||
"status_view_selectedTopUserCategory": "totalJobs",
|
||||
"status_view_selectedTopProjectCategory": "totalJobs"
|
||||
},
|
||||
"clusters": [
|
||||
{
|
||||
"name": "fritz",
|
||||
"metricDataRepository": {
|
||||
"kind": "cc-metric-store",
|
||||
"url": "http://localhost:8082",
|
||||
"token": "-"
|
||||
},
|
||||
"filterRanges": {
|
||||
"numNodes": {
|
||||
"from": 1,
|
||||
"to": 64
|
||||
},
|
||||
"duration": {
|
||||
"from": 0,
|
||||
"to": 86400
|
||||
},
|
||||
"startTime": {
|
||||
"from": "2022-01-01T00:00:00Z",
|
||||
"to": null
|
||||
}
|
||||
}
|
||||
},
|
||||
{
|
||||
"name": "alex",
|
||||
"metricDataRepository": {
|
||||
"kind": "cc-metric-store",
|
||||
"url": "http://localhost:8082",
|
||||
"token": "-"
|
||||
},
|
||||
"filterRanges": {
|
||||
"numNodes": {
|
||||
"from": 1,
|
||||
"to": 64
|
||||
},
|
||||
"duration": {
|
||||
"from": 0,
|
||||
"to": 86400
|
||||
},
|
||||
"startTime": {
|
||||
"from": "2022-01-01T00:00:00Z",
|
||||
"to": null
|
||||
}
|
||||
}
|
||||
},
|
||||
{
|
||||
"name": "woody",
|
||||
"metricDataRepository": {
|
||||
"kind": "cc-metric-store",
|
||||
"url": "http://localhost:8082",
|
||||
"token": "-"
|
||||
},
|
||||
"filterRanges": {
|
||||
"numNodes": {
|
||||
"from": 1,
|
||||
"to": 1
|
||||
},
|
||||
"duration": {
|
||||
"from": 0,
|
||||
"to": 172800
|
||||
},
|
||||
"startTime": {
|
||||
"from": "2020-01-01T00:00:00Z",
|
||||
"to": null
|
||||
}
|
||||
}
|
||||
},
|
||||
{
|
||||
"name": "tinyfat",
|
||||
"metricDataRepository": {
|
||||
"kind": "cc-metric-store",
|
||||
"url": "http://localhost:8082",
|
||||
"token": "-"
|
||||
},
|
||||
"filterRanges": {
|
||||
"numNodes": {
|
||||
"from": 1,
|
||||
"to": 1
|
||||
},
|
||||
"duration": {
|
||||
"from": 0,
|
||||
"to": 172800
|
||||
},
|
||||
"startTime": {
|
||||
"from": "2020-01-01T00:00:00Z",
|
||||
"to": null
|
||||
}
|
||||
}
|
||||
},
|
||||
{
|
||||
"name": "tinygpu",
|
||||
"metricDataRepository": {
|
||||
"kind": "cc-metric-store",
|
||||
"url": "http://localhost:8082",
|
||||
"token": "-"
|
||||
},
|
||||
"filterRanges": {
|
||||
"numNodes": {
|
||||
"from": 1,
|
||||
"to": 1
|
||||
},
|
||||
"duration": {
|
||||
"from": 0,
|
||||
"to": 172800
|
||||
},
|
||||
"startTime": {
|
||||
"from": "2020-01-01T00:00:00Z",
|
||||
"to": null
|
||||
}
|
||||
}
|
||||
},
|
||||
{
|
||||
"name": "meggie",
|
||||
"metricDataRepository": {
|
||||
"kind": "cc-metric-store",
|
||||
"url": "http://localhost:8082",
|
||||
"token": "-"
|
||||
},
|
||||
"filterRanges": {
|
||||
"numNodes": {
|
||||
"from": 1,
|
||||
"to": 64
|
||||
},
|
||||
"duration": {
|
||||
"from": 0,
|
||||
"to": 86400
|
||||
},
|
||||
"startTime": {
|
||||
"from": "2018-01-01T00:00:00Z",
|
||||
"to": null
|
||||
}
|
||||
}
|
||||
}
|
||||
]
|
||||
}
|
145
nhr@fau/cc-metric-collector/alex/collectors.json
Normal file
145
nhr@fau/cc-metric-collector/alex/collectors.json
Normal file
@@ -0,0 +1,145 @@
|
||||
{
|
||||
"nfs4stat" : {},
|
||||
"memstat" : {
|
||||
"numa_stats": true,
|
||||
"node_stats": true
|
||||
},
|
||||
"cpustat" : {},
|
||||
"loadavg" : {},
|
||||
"schedstat": {},
|
||||
"nvidia" : {
|
||||
"use_pci_info_as_type_id": true
|
||||
},
|
||||
"lustrestat" : {
|
||||
"send_all_metrics" : true,
|
||||
"use_sudo": false,
|
||||
"send_diff_values": true,
|
||||
"send_derived_values": true,
|
||||
"send_abs_values": false
|
||||
},
|
||||
"netstat" : {
|
||||
"include_devices" : [
|
||||
"enp1s0",
|
||||
"enp70s0f0",
|
||||
"enp195s0f0"
|
||||
],
|
||||
"send_abs_values": true,
|
||||
"send_derived_values": true
|
||||
},
|
||||
"diskstat" : {},
|
||||
"iostat" : {},
|
||||
"ibstat" : {
|
||||
"send_abs_values": true,
|
||||
"send_derived_values": true
|
||||
},
|
||||
"ipmistat" : {
|
||||
"send_abs_values": true,
|
||||
"send_derived_values": true
|
||||
},
|
||||
"tempstat" : {
|
||||
"tag_override" : {
|
||||
"hwmon0" : {
|
||||
"type" : "socket",
|
||||
"type-id" : "0"
|
||||
},
|
||||
"hwmon1" : {
|
||||
"type" : "socket",
|
||||
"type-id" : "1"
|
||||
}
|
||||
}
|
||||
},
|
||||
"likwid": {
|
||||
"force_overwrite" : true,
|
||||
"invalid_to_zero" : true,
|
||||
"access_mode" : "accessdaemon",
|
||||
"accessdaemon_path" : "/apps/likwid/system/sbin",
|
||||
"liblikwid_path": "/apps/likwid/system/lib/liblikwid.so",
|
||||
"eventsets": [
|
||||
{
|
||||
"events": {
|
||||
"FIXC1": "ACTUAL_CPU_CLOCK",
|
||||
"FIXC2": "MAX_CPU_CLOCK",
|
||||
"PMC0": "RETIRED_INSTRUCTIONS",
|
||||
"PMC1": "CPU_CLOCKS_UNHALTED",
|
||||
"PMC2": "RETIRED_SSE_AVX_FLOPS_ALL",
|
||||
"PMC3": "MERGE",
|
||||
"DFC0": "DRAM_CHANNEL_0",
|
||||
"DFC1": "DRAM_CHANNEL_1",
|
||||
"DFC2": "DRAM_CHANNEL_2",
|
||||
"DFC3": "DRAM_CHANNEL_3"
|
||||
},
|
||||
"metrics": [
|
||||
{
|
||||
"name": "ipc",
|
||||
"calc": "PMC0/PMC1",
|
||||
"type": "hwthread",
|
||||
"publish": true
|
||||
},
|
||||
{
|
||||
"name": "flops_any",
|
||||
"calc": "1E-9*PMC2/time",
|
||||
"unit": "GFlops/s",
|
||||
"type": "hwthread",
|
||||
"publish": true
|
||||
},
|
||||
{
|
||||
"name": "clock",
|
||||
"calc": "1E-6*(FIXC1/FIXC2)/inverseClock",
|
||||
"type": "hwthread",
|
||||
"unit": "MHz",
|
||||
"publish": true
|
||||
},
|
||||
{
|
||||
"name": "mem1",
|
||||
"calc": "1E-9*(DFC0+DFC1+DFC2+DFC3)*64.0/time",
|
||||
"unit": "Gbyte/s",
|
||||
"type": "socket",
|
||||
"publish": false
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"events": {
|
||||
"DFC0": "DRAM_CHANNEL_4",
|
||||
"DFC1": "DRAM_CHANNEL_5",
|
||||
"DFC2": "DRAM_CHANNEL_6",
|
||||
"DFC3": "DRAM_CHANNEL_7",
|
||||
"PWR0": "RAPL_CORE_ENERGY",
|
||||
"PWR1": "RAPL_PKG_ENERGY"
|
||||
},
|
||||
"metrics": [
|
||||
{
|
||||
"name": "core_power",
|
||||
"calc": "PWR0/time",
|
||||
"unit": "Watt",
|
||||
"type": "hwthread",
|
||||
"publish": true
|
||||
},
|
||||
{
|
||||
"name": "cpu_power",
|
||||
"calc": "PWR1/time",
|
||||
"type": "socket",
|
||||
"unit": "Watt",
|
||||
"publish": true
|
||||
},
|
||||
{
|
||||
"name": "mem2",
|
||||
"calc": "1E-9*(DFC0+DFC1+DFC2+DFC3)*64.0/time",
|
||||
"unit": "Gbyte/s",
|
||||
"type": "socket",
|
||||
"publish": false
|
||||
}
|
||||
]
|
||||
}
|
||||
],
|
||||
"globalmetrics": [
|
||||
{
|
||||
"name": "mem_bw",
|
||||
"calc": "mem1+mem2",
|
||||
"type": "socket",
|
||||
"unit": "Gbyte/s",
|
||||
"publish": true
|
||||
}
|
||||
]
|
||||
}
|
||||
}
|
8
nhr@fau/cc-metric-collector/alex/config.json
Normal file
8
nhr@fau/cc-metric-collector/alex/config.json
Normal file
@@ -0,0 +1,8 @@
|
||||
{
|
||||
"sinks": "/etc/cc-metric-collector/sinks.json",
|
||||
"collectors" : "/etc/cc-metric-collector/collectors.json",
|
||||
"receivers" : "/etc/cc-metric-collector/receivers.json",
|
||||
"router" : "/etc/cc-metric-collector/router.json",
|
||||
"interval": "60s",
|
||||
"duration": "10s"
|
||||
}
|
1
nhr@fau/cc-metric-collector/alex/receivers.json
Normal file
1
nhr@fau/cc-metric-collector/alex/receivers.json
Normal file
@@ -0,0 +1 @@
|
||||
{}
|
58
nhr@fau/cc-metric-collector/alex/router.json
Normal file
58
nhr@fau/cc-metric-collector/alex/router.json
Normal file
@@ -0,0 +1,58 @@
|
||||
{
|
||||
"add_tags" : [
|
||||
{
|
||||
"key" : "cluster",
|
||||
"value" : "alex",
|
||||
"if" : "*"
|
||||
}
|
||||
],
|
||||
"rename_metrics" : {
|
||||
"load_one" : "cpu_load",
|
||||
"cpu_load_core" : "cpu_load",
|
||||
"net_bytes_in_bw" : "net_bytes_in",
|
||||
"net_bytes_out_bw" : "net_bytes_out",
|
||||
"net_pkts_in_bw" : "net_pkts_in",
|
||||
"net_pkts_out_bw" : "net_pkts_out",
|
||||
"ib_recv_bw" : "ib_recv",
|
||||
"ib_xmit_bw" : "ib_xmit",
|
||||
"ib_recv_pkts_bw": "ib_recv_pkts",
|
||||
"ib_xmit_pkts_bw": "ib_xmit_pkts",
|
||||
"lustre_read_bytes_diff" : "lustre_read_bytes",
|
||||
"lustre_read_requests_diff" : "lustre_read_requests",
|
||||
"lustre_write_bytes_diff" : "lustre_write_bytes",
|
||||
"lustre_write_requests_diff" : "lustre_write_requests",
|
||||
"lustre_open_diff" : "lustre_open",
|
||||
"lustre_close_diff" : "lustre_close",
|
||||
"lustre_setattr_diff" : "lustre_setattr",
|
||||
"lustre_getattr_diff" : "lustre_getattr",
|
||||
"lustre_statfs_diff": "lustre_statfs",
|
||||
"lustre_inode_permission_diff" : "lustre_inode_permission",
|
||||
"nv_util" : "acc_utilization",
|
||||
"nv_fb_mem_used" : "acc_mem_used",
|
||||
"nv_power_usage" : "acc_power"
|
||||
},
|
||||
"drop_metrics" : [
|
||||
"net_bytes_in",
|
||||
"net_bytes_out",
|
||||
"ib_recv",
|
||||
"ib_xmit",
|
||||
"ib_recv_pkts",
|
||||
"ib_xmit_pkts",
|
||||
"net_pkts_in",
|
||||
"net_pkts_out",
|
||||
"lustre_read_bytes",
|
||||
"lustre_read_requests",
|
||||
"lustre_write_bytes",
|
||||
"lustre_write_requests"
|
||||
],
|
||||
"interval_timestamp" : false,
|
||||
"num_cache_intervals" : 0,
|
||||
"change_unit_prefix": {
|
||||
"mem_used": "G",
|
||||
"swap_used": "G",
|
||||
"mem_total": "G",
|
||||
"swap_total": "G",
|
||||
"cpufreq": "M"
|
||||
},
|
||||
"normalize_metrics" : true
|
||||
}
|
26
nhr@fau/cc-metric-collector/alex/sinks.json
Normal file
26
nhr@fau/cc-metric-collector/alex/sinks.json
Normal file
@@ -0,0 +1,26 @@
|
||||
{
|
||||
"nhrinflux": {
|
||||
"type": "influxasync",
|
||||
"host": "monitoring-test.nhr.uni-erlangen.de",
|
||||
"port": "8086",
|
||||
"organization": "ClusterCockpit",
|
||||
"database": "alex",
|
||||
"password": "XYZ",
|
||||
"ssl": true,
|
||||
"meta_as_tags": [
|
||||
"unit"
|
||||
]
|
||||
},
|
||||
"alexstore": {
|
||||
"type": "http",
|
||||
"url": "http://monitoring.nhr.fau.de:8082/api/write?cluster=alex",
|
||||
"jwt": "XYZ",
|
||||
"meta_as_tags": [
|
||||
"unit"
|
||||
],
|
||||
"idle_connection_timeout": "60s",
|
||||
"flush_delay": "2s",
|
||||
"max_retries": 1,
|
||||
"timeout": "10s"
|
||||
}
|
||||
}
|
33
nhr@fau/cc-metric-collector/alex/sinks_debug.json
Normal file
33
nhr@fau/cc-metric-collector/alex/sinks_debug.json
Normal file
@@ -0,0 +1,33 @@
|
||||
{
|
||||
"fritzganglia": {
|
||||
"type": "libganglia",
|
||||
"gmond_config": "/etc/ganglia/gmond.conf",
|
||||
"libganglia_path": "libganglia.so.0",
|
||||
"add_ganglia_group": true
|
||||
},
|
||||
"nhrinflux": {
|
||||
"type": "influxasync",
|
||||
"host": "monitoring-test.nhr.uni-erlangen.de",
|
||||
"port": "8086",
|
||||
"organization": "ClusterCockpit",
|
||||
"database": "fritz_neu",
|
||||
"password": "XZY",
|
||||
"ssl": true,
|
||||
"meta_as_tags": [
|
||||
"unit"
|
||||
]
|
||||
},
|
||||
"fritzstore": {
|
||||
"type": "http",
|
||||
"url": "http://monitoring.nhr.fau.de:8082/api/write?cluster=fritz",
|
||||
"jwt": "XZY",
|
||||
"meta_as_tags": [
|
||||
"unit"
|
||||
],
|
||||
"idle_connection_timeout": "60s"
|
||||
},
|
||||
"debugstdout": {
|
||||
"type": "stdout",
|
||||
"output_file": "/tmp/debug.log"
|
||||
}
|
||||
}
|
43
nhr@fau/cc-metric-collector/fritz.spr/collectors.json
Normal file
43
nhr@fau/cc-metric-collector/fritz.spr/collectors.json
Normal file
@@ -0,0 +1,43 @@
|
||||
{
|
||||
"nfs4stat" : {},
|
||||
"memstat" : {
|
||||
"node_stats": true
|
||||
},
|
||||
"cpustat" : {},
|
||||
"loadavg" : {},
|
||||
"lustrestat" : {
|
||||
"send_all_metrics" : true,
|
||||
"use_sudo": false,
|
||||
"send_diff_values": true,
|
||||
"send_derived_values": true,
|
||||
"send_abs_values": false
|
||||
},
|
||||
"netstat" : {
|
||||
"include_devices" : [
|
||||
"enp1s0",
|
||||
"enp22s0"
|
||||
],
|
||||
"send_abs_values": true,
|
||||
"send_derived_values": true
|
||||
},
|
||||
"diskstat" : {},
|
||||
"iostat" : {},
|
||||
"ibstat" : {
|
||||
"send_abs_values": true,
|
||||
"send_derived_values": true
|
||||
},
|
||||
"tempstat" : {
|
||||
"tag_override" : {
|
||||
"hwmon0" : {
|
||||
"type" : "socket",
|
||||
"type-id" : "0"
|
||||
},
|
||||
"hwmon1" : {
|
||||
"type" : "socket",
|
||||
"type-id" : "1"
|
||||
}
|
||||
}
|
||||
},
|
||||
"cpufreq_cpuinfo": {},
|
||||
"nfsiostat": {}
|
||||
}
|
8
nhr@fau/cc-metric-collector/fritz.spr/config.json
Normal file
8
nhr@fau/cc-metric-collector/fritz.spr/config.json
Normal file
@@ -0,0 +1,8 @@
|
||||
{
|
||||
"sinks": "/home/hpc/unrz/unrz139/Work/cc-metric-collector/configs/fritz.spr2/sinks2.json",
|
||||
"collectors" : "/home/hpc/unrz/unrz139/Work/cc-metric-collector/configs/fritz.spr2/collectors.json",
|
||||
"receivers" : "/home/hpc/unrz/unrz139/Work/cc-metric-collector/configs/fritz.spr2/receivers.json",
|
||||
"router" : "/home/hpc/unrz/unrz139/Work/cc-metric-collector/configs/fritz.spr2/router.json",
|
||||
"interval": "60s",
|
||||
"duration": "10s"
|
||||
}
|
1
nhr@fau/cc-metric-collector/fritz.spr/receivers.json
Normal file
1
nhr@fau/cc-metric-collector/fritz.spr/receivers.json
Normal file
@@ -0,0 +1 @@
|
||||
{}
|
54
nhr@fau/cc-metric-collector/fritz.spr/router.json
Normal file
54
nhr@fau/cc-metric-collector/fritz.spr/router.json
Normal file
@@ -0,0 +1,54 @@
|
||||
{
|
||||
"add_tags" : [
|
||||
{
|
||||
"key" : "cluster",
|
||||
"value" : "fritz",
|
||||
"if" : "*"
|
||||
}
|
||||
],
|
||||
"rename_metrics" : {
|
||||
"load_one" : "cpu_load",
|
||||
"net_bytes_in_bw" : "net_bytes_in",
|
||||
"net_bytes_out_bw" : "net_bytes_out",
|
||||
"net_pkts_in_bw" : "net_pkts_in",
|
||||
"net_pkts_out_bw" : "net_pkts_out",
|
||||
"ib_recv_bw" : "ib_recv",
|
||||
"ib_xmit_bw" : "ib_xmit",
|
||||
"ib_recv_pkts_bw": "ib_recv_pkts",
|
||||
"ib_xmit_pkts_bw": "ib_xmit_pkts",
|
||||
"lustre_read_bytes_diff" : "lustre_read_bytes",
|
||||
"lustre_read_requests_diff" : "lustre_read_requests",
|
||||
"lustre_write_bytes_diff" : "lustre_write_bytes",
|
||||
"lustre_write_requests_diff" : "lustre_write_requests",
|
||||
"lustre_open_diff" : "lustre_open",
|
||||
"lustre_close_diff" : "lustre_close",
|
||||
"lustre_setattr_diff" : "lustre_setattr",
|
||||
"lustre_getattr_diff" : "lustre_getattr",
|
||||
"lustre_statfs_diff": "lustre_statfs",
|
||||
"lustre_inode_permission_diff" : "lustre_inode_permission",
|
||||
"cpufreq" : "clock"
|
||||
},
|
||||
"drop_metrics" : [
|
||||
"net_bytes_in",
|
||||
"net_bytes_out",
|
||||
"ib_recv",
|
||||
"ib_xmit",
|
||||
"ib_recv_pkts",
|
||||
"ib_xmit_pkts",
|
||||
"net_pkts_in",
|
||||
"net_pkts_out",
|
||||
"lustre_read_bytes",
|
||||
"lustre_read_requests",
|
||||
"lustre_write_bytes",
|
||||
"lustre_write_requests"
|
||||
],
|
||||
"interval_timestamp" : false,
|
||||
"num_cache_intervals" : 0,
|
||||
"change_unit_prefix": {
|
||||
"mem_used": "G",
|
||||
"swap_used": "G",
|
||||
"mem_total": "G",
|
||||
"swap_total": "G"
|
||||
},
|
||||
"normalize_metrics" : true
|
||||
}
|
32
nhr@fau/cc-metric-collector/fritz.spr/sinks.json
Normal file
32
nhr@fau/cc-metric-collector/fritz.spr/sinks.json
Normal file
@@ -0,0 +1,32 @@
|
||||
{
|
||||
"fritzganglia": {
|
||||
"type": "libganglia",
|
||||
"gmond_config": "/etc/ganglia/gmond.conf",
|
||||
"libganglia_path": "libganglia.so.0",
|
||||
"add_ganglia_group": true
|
||||
},
|
||||
"nhrinflux": {
|
||||
"type": "influxasync",
|
||||
"host": "monitoring-test.nhr.uni-erlangen.de",
|
||||
"port": "8086",
|
||||
"organization": "ClusterCockpit",
|
||||
"database": "fritz_neu",
|
||||
"password": "XZY",
|
||||
"ssl": true,
|
||||
"meta_as_tags": [
|
||||
"unit"
|
||||
]
|
||||
},
|
||||
"fritzstore": {
|
||||
"type": "http",
|
||||
"url": "http://monitoring.nhr.fau.de:8082/api/write?cluster=fritz",
|
||||
"jwt": "XZY",
|
||||
"meta_as_tags": [
|
||||
"unit"
|
||||
],
|
||||
"idle_connection_timeout": "60s",
|
||||
"flush_delay": "2s",
|
||||
"max_retries": 1,
|
||||
"timeout": "10s"
|
||||
}
|
||||
}
|
184
nhr@fau/cc-metric-collector/fritz/collectors.json
Normal file
184
nhr@fau/cc-metric-collector/fritz/collectors.json
Normal file
@@ -0,0 +1,184 @@
|
||||
{
|
||||
"nfs4stat" : {},
|
||||
"memstat" : {
|
||||
"node_stats": true
|
||||
},
|
||||
"cpustat" : {},
|
||||
"loadavg" : {},
|
||||
"lustrestat" : {
|
||||
"send_all_metrics" : true,
|
||||
"use_sudo": false,
|
||||
"send_diff_values": true,
|
||||
"send_derived_values": true,
|
||||
"send_abs_values": false
|
||||
},
|
||||
"netstat" : {
|
||||
"include_devices" : [
|
||||
"enp1s0",
|
||||
"enp22s0"
|
||||
],
|
||||
"send_abs_values": true,
|
||||
"send_derived_values": true
|
||||
},
|
||||
"diskstat" : {},
|
||||
"iostat" : {},
|
||||
"ibstat" : {
|
||||
"send_abs_values": true,
|
||||
"send_derived_values": true
|
||||
},
|
||||
"tempstat" : {
|
||||
"tag_override" : {
|
||||
"hwmon0" : {
|
||||
"type" : "socket",
|
||||
"type-id" : "0"
|
||||
},
|
||||
"hwmon1" : {
|
||||
"type" : "socket",
|
||||
"type-id" : "1"
|
||||
}
|
||||
}
|
||||
},
|
||||
"nfsiostat": {},
|
||||
"likwid": {
|
||||
"force_overwrite" : true,
|
||||
"invalid_to_zero" : true,
|
||||
"access_mode" : "accessdaemon",
|
||||
"accessdaemon_path" : "/apps/likwid/system/sbin",
|
||||
"liblikwid_path": "/apps/likwid/system/lib/liblikwid.so",
|
||||
"eventsets": [
|
||||
{
|
||||
"events": {
|
||||
"FIXC0": "INSTR_RETIRED_ANY",
|
||||
"FIXC1": "CPU_CLK_UNHALTED_CORE",
|
||||
"FIXC2": "CPU_CLK_UNHALTED_REF",
|
||||
"MBOX0C0": "CAS_COUNT_RD",
|
||||
"MBOX0C1": "CAS_COUNT_WR",
|
||||
"MBOX1C0": "CAS_COUNT_RD",
|
||||
"MBOX1C1": "CAS_COUNT_WR",
|
||||
"MBOX2C0": "CAS_COUNT_RD",
|
||||
"MBOX2C1": "CAS_COUNT_WR",
|
||||
"MBOX3C0": "CAS_COUNT_RD",
|
||||
"MBOX3C1": "CAS_COUNT_WR",
|
||||
"MBOX4C0": "CAS_COUNT_RD",
|
||||
"MBOX4C1": "CAS_COUNT_WR",
|
||||
"MBOX5C0": "CAS_COUNT_RD",
|
||||
"MBOX5C1": "CAS_COUNT_WR",
|
||||
"MBOX6C0": "CAS_COUNT_RD",
|
||||
"MBOX6C1": "CAS_COUNT_WR",
|
||||
"MBOX7C0": "CAS_COUNT_RD",
|
||||
"MBOX7C1": "CAS_COUNT_WR",
|
||||
"PMC0": "FP_ARITH_INST_RETIRED_128B_PACKED_DOUBLE",
|
||||
"PMC1": "FP_ARITH_INST_RETIRED_SCALAR_DOUBLE",
|
||||
"PMC2": "FP_ARITH_INST_RETIRED_256B_PACKED_DOUBLE",
|
||||
"PMC3": "FP_ARITH_INST_RETIRED_512B_PACKED_DOUBLE",
|
||||
"PWR0": "PWR_PKG_ENERGY",
|
||||
"PWR3": "PWR_DRAM_ENERGY"
|
||||
},
|
||||
"metrics": [
|
||||
{
|
||||
"calc": "0.000001*(FIXC1/FIXC2)/inverseClock",
|
||||
"name": "clock",
|
||||
"publish": true,
|
||||
"unit": "MHz",
|
||||
"type": "hwthread"
|
||||
},
|
||||
{
|
||||
"calc": "FIXC0/FIXC1",
|
||||
"name": "ipc",
|
||||
"publish": true,
|
||||
"type": "hwthread"
|
||||
},
|
||||
{
|
||||
"calc": "PWR0/time",
|
||||
"name": "cpu_power",
|
||||
"publish": true,
|
||||
"unit": "Watt",
|
||||
"type": "socket"
|
||||
},
|
||||
{
|
||||
"calc": "PWR3/time",
|
||||
"name": "mem_power",
|
||||
"unit": "Watt",
|
||||
"publish": true,
|
||||
"type": "socket"
|
||||
},
|
||||
{
|
||||
"calc": "1E-9*(PMC0*2.0+PMC1+PMC2*4.0+PMC3*8.0)/time",
|
||||
"name": "flops_dp",
|
||||
"unit": "GFlops/s",
|
||||
"publish": true,
|
||||
"type": "hwthread"
|
||||
},
|
||||
{
|
||||
"calc": "1E-9*(MBOX0C0+MBOX1C0+MBOX2C0+MBOX3C0+MBOX4C0+MBOX5C0+MBOX6C0+MBOX7C0+MBOX0C1+MBOX1C1+MBOX2C1+MBOX3C1+MBOX4C1+MBOX5C1+MBOX6C1+MBOX7C1)*64.0/time",
|
||||
"name": "mem_bw",
|
||||
"unit": "GBytes/s",
|
||||
"publish": true,
|
||||
"type": "socket"
|
||||
},
|
||||
{
|
||||
"calc": "PMC0+PMC2+PMC3",
|
||||
"name": "dp_vec_ins",
|
||||
"type": "hwthread",
|
||||
"publish": false
|
||||
},
|
||||
{
|
||||
"calc": "PMC0+PMC1+PMC2+PMC3",
|
||||
"name": "dp_fp_ins",
|
||||
"type": "hwthread",
|
||||
"publish": false
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"events": {
|
||||
"FIXC0": "INSTR_RETIRED_ANY",
|
||||
"FIXC1": "CPU_CLK_UNHALTED_CORE",
|
||||
"FIXC2": "CPU_CLK_UNHALTED_REF",
|
||||
"FIXC3": "TOPDOWN_SLOTS",
|
||||
"PMC0": "FP_ARITH_INST_RETIRED_128B_PACKED_SINGLE",
|
||||
"PMC1": "FP_ARITH_INST_RETIRED_SCALAR_SINGLE",
|
||||
"PMC2": "FP_ARITH_INST_RETIRED_256B_PACKED_SINGLE",
|
||||
"PMC3": "FP_ARITH_INST_RETIRED_512B_PACKED_SINGLE"
|
||||
},
|
||||
"metrics": [
|
||||
{
|
||||
"calc": "1E-9*(PMC0*4.0+PMC1+PMC2*8.0+PMC3*16.0)/time",
|
||||
"name": "flops_sp",
|
||||
"unit": "GFlops/s",
|
||||
"publish": true,
|
||||
"type": "hwthread"
|
||||
},
|
||||
{
|
||||
"calc": "PMC0+PMC2+PMC3",
|
||||
"name": "sp_vec_ins",
|
||||
"type": "hwthread",
|
||||
"publish": false
|
||||
},
|
||||
{
|
||||
"calc": "PMC0+PMC1+PMC2+PMC3",
|
||||
"name": "sp_fp_ins",
|
||||
"type": "hwthread",
|
||||
"publish": false
|
||||
}
|
||||
]
|
||||
}
|
||||
],
|
||||
"globalmetrics": [
|
||||
{
|
||||
"calc": "100*((sp_vec_ins+dp_vec_ins)/(sp_fp_ins+dp_fp_ins))",
|
||||
"name": "vectorization_ratio",
|
||||
"unit": "%",
|
||||
"type": "hwthread",
|
||||
"publish": true
|
||||
},
|
||||
{
|
||||
"calc": "(flops_dp * 2) + flops_sp",
|
||||
"name": "flops_any",
|
||||
"unit": "GFlops/s",
|
||||
"type": "hwthread",
|
||||
"publish": true
|
||||
}
|
||||
]
|
||||
}
|
||||
}
|
8
nhr@fau/cc-metric-collector/fritz/config.json
Normal file
8
nhr@fau/cc-metric-collector/fritz/config.json
Normal file
@@ -0,0 +1,8 @@
|
||||
{
|
||||
"sinks": "/etc/cc-metric-collector/sinks.json",
|
||||
"collectors" : "/etc/cc-metric-collector/collectors.json",
|
||||
"receivers" : "/etc/cc-metric-collector/receivers.json",
|
||||
"router" : "/etc/cc-metric-collector/router.json",
|
||||
"interval": "60s",
|
||||
"duration": "10s"
|
||||
}
|
1
nhr@fau/cc-metric-collector/fritz/receivers.json
Normal file
1
nhr@fau/cc-metric-collector/fritz/receivers.json
Normal file
@@ -0,0 +1 @@
|
||||
{}
|
53
nhr@fau/cc-metric-collector/fritz/router.json
Normal file
53
nhr@fau/cc-metric-collector/fritz/router.json
Normal file
@@ -0,0 +1,53 @@
|
||||
{
|
||||
"add_tags" : [
|
||||
{
|
||||
"key" : "cluster",
|
||||
"value" : "fritz",
|
||||
"if" : "*"
|
||||
}
|
||||
],
|
||||
"rename_metrics" : {
|
||||
"load_one" : "cpu_load",
|
||||
"net_bytes_in_bw" : "net_bytes_in",
|
||||
"net_bytes_out_bw" : "net_bytes_out",
|
||||
"net_pkts_in_bw" : "net_pkts_in",
|
||||
"net_pkts_out_bw" : "net_pkts_out",
|
||||
"ib_recv_bw" : "ib_recv",
|
||||
"ib_xmit_bw" : "ib_xmit",
|
||||
"ib_recv_pkts_bw": "ib_recv_pkts",
|
||||
"ib_xmit_pkts_bw": "ib_xmit_pkts",
|
||||
"lustre_read_bytes_diff" : "lustre_read_bytes",
|
||||
"lustre_read_requests_diff" : "lustre_read_requests",
|
||||
"lustre_write_bytes_diff" : "lustre_write_bytes",
|
||||
"lustre_write_requests_diff" : "lustre_write_requests",
|
||||
"lustre_open_diff" : "lustre_open",
|
||||
"lustre_close_diff" : "lustre_close",
|
||||
"lustre_setattr_diff" : "lustre_setattr",
|
||||
"lustre_getattr_diff" : "lustre_getattr",
|
||||
"lustre_statfs_diff": "lustre_statfs",
|
||||
"lustre_inode_permission_diff" : "lustre_inode_permission"
|
||||
},
|
||||
"drop_metrics" : [
|
||||
"net_bytes_in",
|
||||
"net_bytes_out",
|
||||
"ib_recv",
|
||||
"ib_xmit",
|
||||
"ib_recv_pkts",
|
||||
"ib_xmit_pkts",
|
||||
"net_pkts_in",
|
||||
"net_pkts_out",
|
||||
"lustre_read_bytes",
|
||||
"lustre_read_requests",
|
||||
"lustre_write_bytes",
|
||||
"lustre_write_requests"
|
||||
],
|
||||
"interval_timestamp" : false,
|
||||
"num_cache_intervals" : 0,
|
||||
"change_unit_prefix": {
|
||||
"mem_used": "G",
|
||||
"swap_used": "G",
|
||||
"mem_total": "G",
|
||||
"swap_total": "G"
|
||||
},
|
||||
"normalize_metrics" : true
|
||||
}
|
32
nhr@fau/cc-metric-collector/fritz/sinks.json
Normal file
32
nhr@fau/cc-metric-collector/fritz/sinks.json
Normal file
@@ -0,0 +1,32 @@
|
||||
{
|
||||
"fritzganglia": {
|
||||
"type": "libganglia",
|
||||
"gmond_config": "/etc/ganglia/gmond.conf",
|
||||
"libganglia_path": "libganglia.so.0",
|
||||
"add_ganglia_group": true
|
||||
},
|
||||
"nhrinflux": {
|
||||
"type": "influxasync",
|
||||
"host": "monitoring-test.nhr.uni-erlangen.de",
|
||||
"port": "8086",
|
||||
"organization": "ClusterCockpit",
|
||||
"database": "fritz_neu",
|
||||
"password": "XYZ",
|
||||
"ssl": true,
|
||||
"meta_as_tags": [
|
||||
"unit"
|
||||
]
|
||||
},
|
||||
"fritzstore": {
|
||||
"type": "http",
|
||||
"url": "http://monitoring.nhr.fau.de:8082/api/write?cluster=fritz",
|
||||
"jwt": "XYZ",
|
||||
"meta_as_tags": [
|
||||
"unit"
|
||||
],
|
||||
"idle_connection_timeout": "60s",
|
||||
"flush_delay": "2s",
|
||||
"max_retries": 1,
|
||||
"timeout": "10s"
|
||||
}
|
||||
}
|
33
nhr@fau/cc-metric-collector/fritz/sinks_debug.json
Normal file
33
nhr@fau/cc-metric-collector/fritz/sinks_debug.json
Normal file
@@ -0,0 +1,33 @@
|
||||
{
|
||||
"fritzganglia": {
|
||||
"type": "libganglia",
|
||||
"gmond_config": "/etc/ganglia/gmond.conf",
|
||||
"libganglia_path": "libganglia.so.0",
|
||||
"add_ganglia_group": true
|
||||
},
|
||||
"nhrinflux": {
|
||||
"type": "influxasync",
|
||||
"host": "monitoring-test.nhr.uni-erlangen.de",
|
||||
"port": "8086",
|
||||
"organization": "ClusterCockpit",
|
||||
"database": "fritz_neu",
|
||||
"password": "XZY",
|
||||
"ssl": true,
|
||||
"meta_as_tags": [
|
||||
"unit"
|
||||
]
|
||||
},
|
||||
"fritzstore": {
|
||||
"type": "http",
|
||||
"url": "http://monitoring.nhr.fau.de:8082/api/write?cluster=fritz",
|
||||
"jwt": "XZY",
|
||||
"meta_as_tags": [
|
||||
"unit"
|
||||
],
|
||||
"idle_connection_timeout": "60s"
|
||||
},
|
||||
"debugstdout": {
|
||||
"type": "stdout",
|
||||
"output_file": "/tmp/debug.log"
|
||||
}
|
||||
}
|
0
nhr@fau/cc-metric-collector/meggie-ng/.gitkeep
Normal file
0
nhr@fau/cc-metric-collector/meggie-ng/.gitkeep
Normal file
161
nhr@fau/cc-metric-collector/meggie-ng/collectors.json
Normal file
161
nhr@fau/cc-metric-collector/meggie-ng/collectors.json
Normal file
@@ -0,0 +1,161 @@
|
||||
{
|
||||
"diskstat" : {},
|
||||
"ibstat" : {},
|
||||
"nfs4stat" : {},
|
||||
"cpustat" : {},
|
||||
"loadavg" : {},
|
||||
"iostat" : {},
|
||||
"netstat" : {
|
||||
"include_devices" : [
|
||||
"eno1"
|
||||
],
|
||||
"send_abs_values" : true,
|
||||
"send_derived_values" : true
|
||||
},
|
||||
"tempstat" : {
|
||||
"tag_override" : {
|
||||
"hwmon1" : {
|
||||
"type" : "socket",
|
||||
"type-id" : "0"
|
||||
},
|
||||
"hwmon2" : {
|
||||
"type" : "socket",
|
||||
"type-id" : "1"
|
||||
}
|
||||
}
|
||||
},
|
||||
"memstat" : {
|
||||
"node_stats": true
|
||||
},
|
||||
"likwid": {
|
||||
"force_overwrite" : true,
|
||||
"invalid_to_zero" : true,
|
||||
"liblikwid_path": "/apps/likwid/system/lib/liblikwid.so",
|
||||
"access_mode" : "accessdaemon",
|
||||
"accessdaemon_path" : "/apps/likwid/system/sbin",
|
||||
"eventsets": [
|
||||
{
|
||||
"events": {
|
||||
"FIXC0": "INSTR_RETIRED_ANY",
|
||||
"FIXC1": "CPU_CLK_UNHALTED_CORE",
|
||||
"FIXC2": "CPU_CLK_UNHALTED_REF",
|
||||
"MBOX0C0": "CAS_COUNT_RD",
|
||||
"MBOX0C1": "CAS_COUNT_WR",
|
||||
"MBOX1C0": "CAS_COUNT_RD",
|
||||
"MBOX1C1": "CAS_COUNT_WR",
|
||||
"MBOX2C0": "CAS_COUNT_RD",
|
||||
"MBOX2C1": "CAS_COUNT_WR",
|
||||
"MBOX3C0": "CAS_COUNT_RD",
|
||||
"MBOX3C1": "CAS_COUNT_WR",
|
||||
"PMC0": "FP_ARITH_INST_RETIRED_128B_PACKED_DOUBLE",
|
||||
"PMC1": "FP_ARITH_INST_RETIRED_SCALAR_DOUBLE",
|
||||
"PMC2": "FP_ARITH_INST_RETIRED_256B_PACKED_DOUBLE",
|
||||
"PWR0": "PWR_PKG_ENERGY",
|
||||
"PWR3": "PWR_DRAM_ENERGY"
|
||||
},
|
||||
"metrics": [
|
||||
{
|
||||
"calc": "1E-6*(FIXC1/FIXC2)/inverseClock",
|
||||
"name": "clock",
|
||||
"publish": true,
|
||||
"unit": "MHz",
|
||||
"type": "hwthread"
|
||||
},
|
||||
{
|
||||
"calc": "FIXC0/FIXC1",
|
||||
"name": "ipc",
|
||||
"publish": true,
|
||||
"type": "hwthread"
|
||||
},
|
||||
{
|
||||
"calc": "PWR0/time",
|
||||
"name": "cpu_power",
|
||||
"publish": true,
|
||||
"unit": "Watt",
|
||||
"type": "socket"
|
||||
},
|
||||
{
|
||||
"calc": "PWR3/time",
|
||||
"name": "mem_power",
|
||||
"publish": true,
|
||||
"unit": "Watt",
|
||||
"type": "socket"
|
||||
},
|
||||
{
|
||||
"calc": "1E-9*(PMC0*2.0+PMC1+PMC2*4.0)/time",
|
||||
"name": "flops_dp",
|
||||
"publish": true,
|
||||
"unit": "GFlops/s",
|
||||
"type": "hwthread"
|
||||
},
|
||||
{
|
||||
"calc": "1E-9*(MBOX0C0+MBOX1C0+MBOX2C0+MBOX3C0+MBOX0C1+MBOX1C1+MBOX2C1+MBOX3C1)*64.0/time",
|
||||
"name": "mem_bw",
|
||||
"publish": true,
|
||||
"unit": "GBytes/s",
|
||||
"type": "socket"
|
||||
},
|
||||
{
|
||||
"calc": "PMC0+PMC2",
|
||||
"name": "dp_vec_ins",
|
||||
"type": "hwthread",
|
||||
"publish": false
|
||||
},
|
||||
{
|
||||
"calc": "PMC0+PMC1+PMC2",
|
||||
"name": "dp_fp_ins",
|
||||
"type": "hwthread",
|
||||
"publish": false
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"events": {
|
||||
"FIXC0": "INSTR_RETIRED_ANY",
|
||||
"FIXC1": "CPU_CLK_UNHALTED_CORE",
|
||||
"FIXC2": "CPU_CLK_UNHALTED_REF",
|
||||
"PMC0": "FP_ARITH_INST_RETIRED_128B_PACKED_SINGLE",
|
||||
"PMC1": "FP_ARITH_INST_RETIRED_SCALAR_SINGLE",
|
||||
"PMC2": "FP_ARITH_INST_RETIRED_256B_PACKED_SINGLE"
|
||||
},
|
||||
"metrics": [
|
||||
{
|
||||
"calc": "1E-9*(PMC0*4.0+PMC1+PMC2*8.0)/time",
|
||||
"name": "flops_sp",
|
||||
"unit": "GFlops/s",
|
||||
"publish": true,
|
||||
"type": "hwthread"
|
||||
},
|
||||
{
|
||||
"calc": "PMC0+PMC2",
|
||||
"name": "sp_vec_ins",
|
||||
"type": "hwthread",
|
||||
"publish": false
|
||||
},
|
||||
{
|
||||
"calc": "PMC0+PMC1+PMC2",
|
||||
"name": "sp_fp_ins",
|
||||
"type": "hwthread",
|
||||
"publish": false
|
||||
}
|
||||
]
|
||||
}
|
||||
],
|
||||
"globalmetrics": [
|
||||
{
|
||||
"calc": "100*((sp_vec_ins+dp_vec_ins)/(sp_fp_ins+dp_fp_ins))",
|
||||
"name": "vectorization_ratio",
|
||||
"type": "hwthread",
|
||||
"unit": "%",
|
||||
"publish": true
|
||||
},
|
||||
{
|
||||
"calc": "(flops_dp * 2) + flops_sp",
|
||||
"name": "flops_any",
|
||||
"type": "hwthread",
|
||||
"unit": "GFlops/s",
|
||||
"publish": true
|
||||
}
|
||||
]
|
||||
}
|
||||
}
|
8
nhr@fau/cc-metric-collector/meggie-ng/config.json
Normal file
8
nhr@fau/cc-metric-collector/meggie-ng/config.json
Normal file
@@ -0,0 +1,8 @@
|
||||
{
|
||||
"sinks": "/etc/cc-metric-collector/sinks.json",
|
||||
"collectors" : "/etc/cc-metric-collector/collectors.json",
|
||||
"receivers" : "/etc/cc-metric-collector/receivers.json",
|
||||
"router" : "/etc/cc-metric-collector/router.json",
|
||||
"interval": "60s",
|
||||
"duration": "10s"
|
||||
}
|
1
nhr@fau/cc-metric-collector/meggie-ng/receivers.json
Normal file
1
nhr@fau/cc-metric-collector/meggie-ng/receivers.json
Normal file
@@ -0,0 +1 @@
|
||||
{}
|
53
nhr@fau/cc-metric-collector/meggie-ng/router.json
Normal file
53
nhr@fau/cc-metric-collector/meggie-ng/router.json
Normal file
@@ -0,0 +1,53 @@
|
||||
{
|
||||
"add_tags" : [
|
||||
{
|
||||
"key" : "cluster",
|
||||
"value" : "meggie",
|
||||
"if" : "*"
|
||||
}
|
||||
],
|
||||
"rename_metrics" : {
|
||||
"load_one" : "cpu_load",
|
||||
"net_bytes_in_bw" : "net_bytes_in",
|
||||
"net_bytes_out_bw" : "net_bytes_out",
|
||||
"net_pkts_in_bw" : "net_pkts_in",
|
||||
"net_pkts_out_bw" : "net_pkts_out",
|
||||
"ib_recv_bw" : "ib_recv",
|
||||
"ib_xmit_bw" : "ib_xmit",
|
||||
"ib_recv_pkts_bw": "ib_recv_pkts",
|
||||
"ib_xmit_pkts_bw": "ib_xmit_pkts",
|
||||
"lustre_read_bytes_diff" : "lustre_read_bytes",
|
||||
"lustre_read_requests_diff" : "lustre_read_requests",
|
||||
"lustre_write_bytes_diff" : "lustre_write_bytes",
|
||||
"lustre_write_requests_diff" : "lustre_write_requests",
|
||||
"lustre_open_diff" : "lustre_open",
|
||||
"lustre_close_diff" : "lustre_close",
|
||||
"lustre_setattr_diff" : "lustre_setattr",
|
||||
"lustre_getattr_diff" : "lustre_getattr",
|
||||
"lustre_statfs_diff": "lustre_statfs",
|
||||
"lustre_inode_permission_diff" : "lustre_inode_permission"
|
||||
},
|
||||
"drop_metrics" : [
|
||||
"net_bytes_in",
|
||||
"net_bytes_out",
|
||||
"ib_recv",
|
||||
"ib_xmit",
|
||||
"ib_recv_pkts",
|
||||
"ib_xmit_pkts",
|
||||
"net_pkts_in",
|
||||
"net_pkts_out",
|
||||
"lustre_read_bytes",
|
||||
"lustre_read_requests",
|
||||
"lustre_write_bytes",
|
||||
"lustre_write_requests"
|
||||
],
|
||||
"interval_timestamp" : true,
|
||||
"num_cache_intervals" : 0,
|
||||
"change_unit_prefix": {
|
||||
"mem_used": "G",
|
||||
"swap_used": "G",
|
||||
"mem_total": "G",
|
||||
"swap_total": "G"
|
||||
},
|
||||
"normalize_metrics" : true
|
||||
}
|
26
nhr@fau/cc-metric-collector/meggie-ng/sinks.json
Normal file
26
nhr@fau/cc-metric-collector/meggie-ng/sinks.json
Normal file
@@ -0,0 +1,26 @@
|
||||
{
|
||||
"nhrinflux": {
|
||||
"type": "influxasync",
|
||||
"host": "monitoring-test.nhr.uni-erlangen.de",
|
||||
"port": "8086",
|
||||
"organization": "ClusterCockpit",
|
||||
"database": "meggie",
|
||||
"password": "XZY",
|
||||
"ssl": true,
|
||||
"meta_as_tags": [
|
||||
"unit"
|
||||
]
|
||||
},
|
||||
"fritzstore": {
|
||||
"type": "http",
|
||||
"url": "http://monitoring.nhr.fau.de:8082/api/write?cluster=meggie",
|
||||
"jwt": "XZY",
|
||||
"meta_as_tags": [
|
||||
"unit"
|
||||
],
|
||||
"idle_connection_timeout": "60s",
|
||||
"flush_delay": "2s",
|
||||
"max_retries": 1,
|
||||
"timeout": "10s"
|
||||
}
|
||||
}
|
167
nhr@fau/cc-metric-collector/tinyfat/collectors.bdw.json
Normal file
167
nhr@fau/cc-metric-collector/tinyfat/collectors.bdw.json
Normal file
@@ -0,0 +1,167 @@
|
||||
{
|
||||
"nfs4stat" : {},
|
||||
"memstat" : {
|
||||
"numa_stats": true,
|
||||
"node_stats": true
|
||||
},
|
||||
"cpustat" : {},
|
||||
"loadavg" : {},
|
||||
"schedstat": {},
|
||||
"netstat" : {
|
||||
"include_devices" : [
|
||||
"eth0",
|
||||
"eth1",
|
||||
"eth2"
|
||||
],
|
||||
"send_abs_values": true,
|
||||
"send_derived_values": true
|
||||
},
|
||||
"diskstat" : {},
|
||||
"iostat" : {},
|
||||
"tempstat" : {
|
||||
"tag_override" : {
|
||||
"hwmon2" : {
|
||||
"type" : "socket",
|
||||
"type-id" : "0"
|
||||
},
|
||||
"hwmon3" : {
|
||||
"type" : "socket",
|
||||
"type-id" : "1"
|
||||
}
|
||||
}
|
||||
},
|
||||
"likwid": {
|
||||
"force_overwrite" : true,
|
||||
"invalid_to_zero" : true,
|
||||
"access_mode" : "accessdaemon",
|
||||
"accessdaemon_path" : "/apps/likwid/system/sbin",
|
||||
"liblikwid_path": "/apps/likwid/system/lib/liblikwid.so",
|
||||
"eventsets": [
|
||||
{
|
||||
"events": {
|
||||
"FIXC0": "INSTR_RETIRED_ANY",
|
||||
"FIXC1": "CPU_CLK_UNHALTED_CORE",
|
||||
"FIXC2": "CPU_CLK_UNHALTED_REF",
|
||||
"MBOX0C0": "CAS_COUNT_RD",
|
||||
"MBOX0C1": "CAS_COUNT_WR",
|
||||
"MBOX1C0": "CAS_COUNT_RD",
|
||||
"MBOX1C1": "CAS_COUNT_WR",
|
||||
"MBOX2C0": "CAS_COUNT_RD",
|
||||
"MBOX2C1": "CAS_COUNT_WR",
|
||||
"MBOX3C0": "CAS_COUNT_RD",
|
||||
"MBOX3C1": "CAS_COUNT_WR",
|
||||
"MBOX4C0": "CAS_COUNT_RD",
|
||||
"MBOX4C1": "CAS_COUNT_WR",
|
||||
"MBOX5C0": "CAS_COUNT_RD",
|
||||
"MBOX5C1": "CAS_COUNT_WR",
|
||||
"PMC0": "FP_ARITH_INST_RETIRED_128B_PACKED_DOUBLE",
|
||||
"PMC1": "FP_ARITH_INST_RETIRED_SCALAR_DOUBLE",
|
||||
"PMC2": "FP_ARITH_INST_RETIRED_256B_PACKED_DOUBLE",
|
||||
"PWR0": "PWR_PKG_ENERGY",
|
||||
"PWR3": "PWR_DRAM_ENERGY"
|
||||
},
|
||||
"metrics": [
|
||||
{
|
||||
"calc": "1E-6*(FIXC1/FIXC2)/inverseClock",
|
||||
"name": "clock",
|
||||
"publish": true,
|
||||
"unit": "MHz",
|
||||
"type": "hwthread"
|
||||
},
|
||||
{
|
||||
"calc": "FIXC0/FIXC1",
|
||||
"name": "ipc",
|
||||
"publish": true,
|
||||
"type": "hwthread"
|
||||
},
|
||||
{
|
||||
"calc": "PWR0/time",
|
||||
"name": "pwr_pkg",
|
||||
"publish": true,
|
||||
"unit": "Watt",
|
||||
"type": "socket"
|
||||
},
|
||||
{
|
||||
"calc": "PWR3/time",
|
||||
"name": "pwr_dram",
|
||||
"publish": true,
|
||||
"unit": "Watt",
|
||||
"type": "socket"
|
||||
},
|
||||
{
|
||||
"calc": "1E-9*(PMC0*2.0+PMC1+PMC2*4.0)/time",
|
||||
"name": "flops_dp",
|
||||
"publish": true,
|
||||
"unit": "GFlops/s",
|
||||
"type": "hwthread"
|
||||
},
|
||||
{
|
||||
"calc": "1E-9*(MBOX0C0+MBOX1C0+MBOX2C0+MBOX3C0+MBOX4C0+MBOX5C0+MBOX0C1+MBOX1C1+MBOX2C1+MBOX3C1+MBOX4C1+MBOX5C1)*64.0/time",
|
||||
"name": "mem_bw",
|
||||
"publish": true,
|
||||
"unit": "GBytes/s",
|
||||
"type": "socket"
|
||||
},
|
||||
{
|
||||
"calc": "PMC0+PMC2",
|
||||
"name": "dp_vec_ins",
|
||||
"type": "hwthread",
|
||||
"publish": false
|
||||
},
|
||||
{
|
||||
"calc": "PMC0+PMC1+PMC2",
|
||||
"name": "dp_fp_ins",
|
||||
"type": "hwthread",
|
||||
"publish": false
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"events": {
|
||||
"FIXC0": "INSTR_RETIRED_ANY",
|
||||
"FIXC1": "CPU_CLK_UNHALTED_CORE",
|
||||
"FIXC2": "CPU_CLK_UNHALTED_REF",
|
||||
"PMC0": "FP_ARITH_INST_RETIRED_128B_PACKED_SINGLE",
|
||||
"PMC1": "FP_ARITH_INST_RETIRED_SCALAR_SINGLE",
|
||||
"PMC2": "FP_ARITH_INST_RETIRED_256B_PACKED_SINGLE"
|
||||
},
|
||||
"metrics": [
|
||||
{
|
||||
"calc": "1E-9*(PMC0*4.0+PMC1+PMC2*8.0)/time",
|
||||
"name": "flops_sp",
|
||||
"publish": true,
|
||||
"unit": "GFlops/s",
|
||||
"type": "hwthread"
|
||||
},
|
||||
{
|
||||
"calc": "PMC0+PMC2",
|
||||
"name": "sp_vec_ins",
|
||||
"type": "hwthread",
|
||||
"publish": false
|
||||
},
|
||||
{
|
||||
"calc": "PMC0+PMC1+PMC2",
|
||||
"name": "sp_fp_ins",
|
||||
"type": "hwthread",
|
||||
"publish": false
|
||||
}
|
||||
]
|
||||
}
|
||||
],
|
||||
"globalmetrics": [
|
||||
{
|
||||
"calc": "100*((sp_vec_ins+dp_vec_ins)/(sp_fp_ins+dp_fp_ins))",
|
||||
"name": "vectorization_ratio",
|
||||
"type": "hwthread",
|
||||
"unit": "%",
|
||||
"publish": true
|
||||
},
|
||||
{
|
||||
"calc": "(flops_dp * 2) + flops_sp",
|
||||
"name": "flops_any",
|
||||
"type": "hwthread",
|
||||
"unit": "GFlops/s",
|
||||
"publish": true
|
||||
}
|
||||
]
|
||||
}
|
||||
}
|
100
nhr@fau/cc-metric-collector/tinyfat/collectors.rome.json
Normal file
100
nhr@fau/cc-metric-collector/tinyfat/collectors.rome.json
Normal file
@@ -0,0 +1,100 @@
|
||||
{
|
||||
"nfs4stat" : {},
|
||||
"memstat" : {
|
||||
"numa_stats": true,
|
||||
"node_stats": true
|
||||
},
|
||||
"cpustat" : {},
|
||||
"loadavg" : {},
|
||||
"schedstat": {},
|
||||
"netstat" : {
|
||||
"include_devices" : [
|
||||
"eth0",
|
||||
"eth1",
|
||||
"eth2"
|
||||
],
|
||||
"send_abs_values": true,
|
||||
"send_derived_values": true
|
||||
},
|
||||
"diskstat" : {},
|
||||
"iostat" : {},
|
||||
"tempstat" : {
|
||||
"tag_override" : {
|
||||
"hwmon1" : {
|
||||
"type" : "socket",
|
||||
"type-id" : "0"
|
||||
},
|
||||
"hwmon2" : {
|
||||
"type" : "socket",
|
||||
"type-id" : "1"
|
||||
}
|
||||
}
|
||||
},
|
||||
"likwid": {
|
||||
"force_overwrite" : true,
|
||||
"invalid_to_zero" : true,
|
||||
"access_mode" : "accessdaemon",
|
||||
"accessdaemon_path" : "/apps/likwid/system/sbin",
|
||||
"liblikwid_path": "/apps/likwid/system/lib/liblikwid.so",
|
||||
"eventsets": [
|
||||
{
|
||||
"events": {
|
||||
"FIXC1": "ACTUAL_CPU_CLOCK",
|
||||
"FIXC2": "MAX_CPU_CLOCK",
|
||||
"PMC0": "RETIRED_INSTRUCTIONS",
|
||||
"PMC1": "CPU_CLOCKS_UNHALTED",
|
||||
"PMC2": "RETIRED_SSE_AVX_FLOPS_ALL",
|
||||
"PMC3": "MERGE",
|
||||
"DFC0": "DATA_FROM_LOCAL_DRAM_CHANNEL",
|
||||
"DFC1": "DATA_TO_LOCAL_DRAM_CHANNEL",
|
||||
"PWR0": "RAPL_CORE_ENERGY",
|
||||
"PWR1": "RAPL_PKG_ENERGY"
|
||||
},
|
||||
"metrics": [
|
||||
{
|
||||
"name": "ipc",
|
||||
"calc": "PMC0/PMC1",
|
||||
"type": "hwthread",
|
||||
"publish": true
|
||||
},
|
||||
{
|
||||
"name": "flops_any",
|
||||
"calc": "1E-9*PMC2/time",
|
||||
"unit": "GFlops/s",
|
||||
"type": "hwthread",
|
||||
"publish": true
|
||||
},
|
||||
{
|
||||
"name": "clock",
|
||||
"calc": "1E-6*(FIXC1/FIXC2)/inverseClock",
|
||||
"type": "hwthread",
|
||||
"unit": "MHz",
|
||||
"publish": true
|
||||
},
|
||||
{
|
||||
"name": "pwr_core",
|
||||
"calc": "PWR0/time",
|
||||
"unit": "Watt",
|
||||
"type": "socket",
|
||||
"publish": true
|
||||
},
|
||||
{
|
||||
"name": "pwr_pkg",
|
||||
"calc": "PWR1/time",
|
||||
"type": "socket",
|
||||
"unit": "Watt",
|
||||
"publish": true
|
||||
},
|
||||
{
|
||||
"name": "mem_bw",
|
||||
"calc": "1E-9*(DFC0+DFC1)*64.0/time",
|
||||
"unit": "GBytes/s",
|
||||
"type": "socket",
|
||||
"publish": true
|
||||
}
|
||||
]
|
||||
}
|
||||
],
|
||||
"globalmetrics": []
|
||||
}
|
||||
}
|
8
nhr@fau/cc-metric-collector/tinyfat/config.json
Normal file
8
nhr@fau/cc-metric-collector/tinyfat/config.json
Normal file
@@ -0,0 +1,8 @@
|
||||
{
|
||||
"sinks": "/etc/cc-metric-collector/sinks.json",
|
||||
"collectors" : "/etc/cc-metric-collector/collectors.json",
|
||||
"receivers" : "/etc/cc-metric-collector/receivers.json",
|
||||
"router" : "/etc/cc-metric-collector/router.json",
|
||||
"interval": "60s",
|
||||
"duration": "10s"
|
||||
}
|
1
nhr@fau/cc-metric-collector/tinyfat/receivers.json
Normal file
1
nhr@fau/cc-metric-collector/tinyfat/receivers.json
Normal file
@@ -0,0 +1 @@
|
||||
{}
|
49
nhr@fau/cc-metric-collector/tinyfat/router.json
Normal file
49
nhr@fau/cc-metric-collector/tinyfat/router.json
Normal file
@@ -0,0 +1,49 @@
|
||||
{
|
||||
"add_tags" : [
|
||||
{
|
||||
"key" : "cluster",
|
||||
"value" : "tinyfat",
|
||||
"if" : "*"
|
||||
}
|
||||
],
|
||||
"rename_metrics" : {
|
||||
"load_one" : "cpu_load",
|
||||
"cpu_load_core" : "cpu_load",
|
||||
"net_bytes_in_bw" : "net_bytes_in",
|
||||
"net_bytes_out_bw" : "net_bytes_out",
|
||||
"net_pkts_in_bw" : "net_pkts_in",
|
||||
"net_pkts_out_bw" : "net_pkts_out",
|
||||
"ib_recv_bw" : "ib_recv",
|
||||
"ib_xmit_bw" : "ib_xmit",
|
||||
"ib_recv_pkts_bw": "ib_recv_pkts",
|
||||
"ib_xmit_pkts_bw": "ib_xmit_pkts",
|
||||
"lustre_read_bytes_diff" : "lustre_read_bytes",
|
||||
"lustre_read_requests_diff" : "lustre_read_requests",
|
||||
"lustre_write_bytes_diff" : "lustre_write_bytes",
|
||||
"lustre_write_requests_diff" : "lustre_write_requests",
|
||||
"lustre_open_diff" : "lustre_open",
|
||||
"lustre_close_diff" : "lustre_close",
|
||||
"lustre_setattr_diff" : "lustre_setattr",
|
||||
"lustre_getattr_diff" : "lustre_getattr",
|
||||
"lustre_statfs_diff": "lustre_statfs",
|
||||
"lustre_inode_permission_diff" : "lustre_inode_permission",
|
||||
"pwr_pkg": "cpu_power",
|
||||
"pwr_dram": "mem_power"
|
||||
},
|
||||
"drop_metrics" : [
|
||||
"net_bytes_in",
|
||||
"net_bytes_out",
|
||||
"ib_recv",
|
||||
"ib_xmit",
|
||||
"ib_recv_pkts",
|
||||
"ib_xmit_pkts",
|
||||
"net_pkts_in",
|
||||
"net_pkts_out",
|
||||
"lustre_read_bytes",
|
||||
"lustre_read_requests",
|
||||
"lustre_write_bytes",
|
||||
"lustre_write_requests"
|
||||
],
|
||||
"interval_timestamp" : false,
|
||||
"num_cache_intervals" : 0
|
||||
}
|
26
nhr@fau/cc-metric-collector/tinyfat/sinks.json
Normal file
26
nhr@fau/cc-metric-collector/tinyfat/sinks.json
Normal file
@@ -0,0 +1,26 @@
|
||||
{
|
||||
"influx": {
|
||||
"type": "influxasync",
|
||||
"host": "monitoring-test.nhr.uni-erlangen.de",
|
||||
"port": "8086",
|
||||
"organization": "ClusterCockpit",
|
||||
"database": "tinyfat",
|
||||
"password": "XZY",
|
||||
"ssl": true,
|
||||
"meta_as_tags": [
|
||||
"unit"
|
||||
]
|
||||
},
|
||||
"metricstore": {
|
||||
"type": "http",
|
||||
"url": "http://monitoring.nhr.fau.de:8082/api/write?cluster=tinyfat",
|
||||
"jwt": "XYZ",
|
||||
"meta_as_tags": [
|
||||
"unit"
|
||||
],
|
||||
"idle_connection_timeout": "60s",
|
||||
"flush_delay": "2s",
|
||||
"max_retries": 1,
|
||||
"timeout": "10s"
|
||||
}
|
||||
}
|
30
nhr@fau/cc-metric-collector/tinyfat/sinks_debug.json
Normal file
30
nhr@fau/cc-metric-collector/tinyfat/sinks_debug.json
Normal file
@@ -0,0 +1,30 @@
|
||||
{
|
||||
"influx": {
|
||||
"type": "influxasync",
|
||||
"host": "monitoring-test.nhr.uni-erlangen.de",
|
||||
"port": "8086",
|
||||
"organization": "ClusterCockpit",
|
||||
"database": "tinyfat",
|
||||
"password": "XZY",
|
||||
"ssl": true,
|
||||
"meta_as_tags": [
|
||||
"unit"
|
||||
]
|
||||
},
|
||||
"metricstore": {
|
||||
"type": "http",
|
||||
"url": "http://monitoring.nhr.fau.de:8082/api/write?cluster=tinyfat",
|
||||
"jwt": "XZY",
|
||||
"meta_as_tags": [
|
||||
"unit"
|
||||
],
|
||||
"idle_connection_timeout": "60s",
|
||||
"flush_delay": "2s",
|
||||
"max_retries": 1,
|
||||
"timeout": "10s"
|
||||
},
|
||||
"debugstdout": {
|
||||
"type": "stdout",
|
||||
"output_file": "/tmp/debug.log"
|
||||
}
|
||||
}
|
103
nhr@fau/cc-metric-collector/tinygpu/collectors.rome.a100.json
Normal file
103
nhr@fau/cc-metric-collector/tinygpu/collectors.rome.a100.json
Normal file
@@ -0,0 +1,103 @@
|
||||
{
|
||||
"nfs4stat" : {},
|
||||
"memstat" : {
|
||||
"numa_stats": true,
|
||||
"node_stats": true
|
||||
},
|
||||
"cpustat" : {},
|
||||
"loadavg" : {},
|
||||
"schedstat": {},
|
||||
"netstat" : {
|
||||
"include_devices" : [
|
||||
"eth0"
|
||||
],
|
||||
"send_abs_values": true,
|
||||
"send_derived_values": true
|
||||
},
|
||||
"diskstat" : {},
|
||||
"iostat" : {},
|
||||
"tempstat" : {
|
||||
"tag_override" : {
|
||||
"hwmon0" : {
|
||||
"type" : "socket",
|
||||
"type-id" : "0"
|
||||
},
|
||||
"hwmon1" : {
|
||||
"type" : "socket",
|
||||
"type-id" : "1"
|
||||
}
|
||||
}
|
||||
},
|
||||
"nvidia" : {
|
||||
"use_pci_info_as_type_id": true,
|
||||
"process_mig_devices": true
|
||||
},
|
||||
"likwid": {
|
||||
"force_overwrite" : true,
|
||||
"invalid_to_zero" : true,
|
||||
"access_mode" : "accessdaemon",
|
||||
"accessdaemon_path" : "/apps/likwid/system/sbin",
|
||||
"liblikwid_path": "/apps/likwid/system/lib/liblikwid.so",
|
||||
"eventsets": [
|
||||
{
|
||||
"events": {
|
||||
"FIXC1": "ACTUAL_CPU_CLOCK",
|
||||
"FIXC2": "MAX_CPU_CLOCK",
|
||||
"PMC0": "RETIRED_INSTRUCTIONS",
|
||||
"PMC1": "CPU_CLOCKS_UNHALTED",
|
||||
"PMC2": "RETIRED_SSE_AVX_FLOPS_ALL",
|
||||
"PMC3": "MERGE",
|
||||
"DFC0": "DATA_FROM_LOCAL_DRAM_CHANNEL",
|
||||
"DFC1": "DATA_TO_LOCAL_DRAM_CHANNEL",
|
||||
"PWR0": "RAPL_CORE_ENERGY",
|
||||
"PWR1": "RAPL_PKG_ENERGY"
|
||||
},
|
||||
"metrics": [
|
||||
{
|
||||
"name": "ipc",
|
||||
"calc": "PMC0/PMC1",
|
||||
"type": "hwthread",
|
||||
"publish": true
|
||||
},
|
||||
{
|
||||
"name": "flops_any",
|
||||
"calc": "1E-9*PMC2/time",
|
||||
"unit": "GFlops/s",
|
||||
"type": "hwthread",
|
||||
"publish": true
|
||||
},
|
||||
{
|
||||
"name": "clock",
|
||||
"calc": "1E-6*(FIXC1/FIXC2)/inverseClock",
|
||||
"type": "hwthread",
|
||||
"unit": "MHz",
|
||||
"publish": true
|
||||
},
|
||||
{
|
||||
"name": "pwr_core",
|
||||
"calc": "PWR0/time",
|
||||
"unit": "Watt",
|
||||
"type": "socket",
|
||||
"publish": true
|
||||
},
|
||||
{
|
||||
"name": "pwr_pkg",
|
||||
"calc": "PWR1/time",
|
||||
"type": "socket",
|
||||
"unit": "Watt",
|
||||
"publish": true
|
||||
},
|
||||
{
|
||||
"name": "mem_bw",
|
||||
"calc": "1E-9*(DFC0+DFC1)*64.0/time",
|
||||
"unit": "GBytes/s",
|
||||
"type": "socket",
|
||||
"publish": true
|
||||
}
|
||||
]
|
||||
}
|
||||
],
|
||||
"globalmetrics": []
|
||||
}
|
||||
|
||||
}
|
219
nhr@fau/cc-metric-collector/tinygpu/collectors.skx.2080.json
Normal file
219
nhr@fau/cc-metric-collector/tinygpu/collectors.skx.2080.json
Normal file
@@ -0,0 +1,219 @@
|
||||
{
|
||||
"nfs4stat" : {},
|
||||
"memstat" : {
|
||||
"numa_stats": true,
|
||||
"node_stats": true
|
||||
},
|
||||
"cpustat" : {},
|
||||
"loadavg" : {},
|
||||
"schedstat": {},
|
||||
"netstat" : {
|
||||
"include_devices" : [
|
||||
"eth0"
|
||||
],
|
||||
"send_abs_values": true,
|
||||
"send_derived_values": true
|
||||
},
|
||||
"diskstat" : {},
|
||||
"iostat" : {},
|
||||
"tempstat" : {
|
||||
"tag_override" : {
|
||||
"hwmon0" : {
|
||||
"type" : "socket",
|
||||
"type-id" : "0"
|
||||
},
|
||||
"hwmon1" : {
|
||||
"type" : "socket",
|
||||
"type-id" : "1"
|
||||
}
|
||||
}
|
||||
},
|
||||
"nvidia" : {
|
||||
"use_pci_info_as_type_id": true,
|
||||
"process_mig_devices": true
|
||||
},
|
||||
"likwid": {
|
||||
"force_overwrite" : true,
|
||||
"invalid_to_zero" : true,
|
||||
"access_mode" : "accessdaemon",
|
||||
"accessdaemon_path" : "/apps/likwid/system/sbin",
|
||||
"liblikwid_path": "/apps/likwid/system/lib/liblikwid.so",
|
||||
"eventsets": [
|
||||
{
|
||||
"events": {
|
||||
"FIXC0": "INSTR_RETIRED_ANY",
|
||||
"FIXC1": "CPU_CLK_UNHALTED_CORE",
|
||||
"FIXC2": "CPU_CLK_UNHALTED_REF",
|
||||
"MBOX0C0": "CAS_COUNT_RD",
|
||||
"MBOX0C1": "CAS_COUNT_WR",
|
||||
"MBOX1C0": "CAS_COUNT_RD",
|
||||
"MBOX1C1": "CAS_COUNT_WR",
|
||||
"MBOX2C0": "CAS_COUNT_RD",
|
||||
"MBOX2C1": "CAS_COUNT_WR",
|
||||
"MBOX3C0": "CAS_COUNT_RD",
|
||||
"MBOX3C1": "CAS_COUNT_WR",
|
||||
"MBOX4C0": "CAS_COUNT_RD",
|
||||
"MBOX4C1": "CAS_COUNT_WR",
|
||||
"MBOX5C0": "CAS_COUNT_RD",
|
||||
"MBOX5C1": "CAS_COUNT_WR",
|
||||
"PMC0": "FP_ARITH_INST_RETIRED_128B_PACKED_DOUBLE",
|
||||
"PMC1": "FP_ARITH_INST_RETIRED_SCALAR_DOUBLE",
|
||||
"PMC2": "FP_ARITH_INST_RETIRED_256B_PACKED_DOUBLE",
|
||||
"PWR0": "PWR_PKG_ENERGY",
|
||||
"PWR3": "PWR_DRAM_ENERGY"
|
||||
},
|
||||
"metrics": [
|
||||
{
|
||||
"calc": "0.000001*(FIXC1/FIXC2)/inverseClock",
|
||||
"name": "clock",
|
||||
"publish": true,
|
||||
"unit": "MHz",
|
||||
"type": "hwthread"
|
||||
},
|
||||
{
|
||||
"calc": "FIXC0/FIXC1",
|
||||
"name": "ipc",
|
||||
"publish": true,
|
||||
"type": "hwthread"
|
||||
},
|
||||
{
|
||||
"calc": "PWR0/time",
|
||||
"name": "cpu_power",
|
||||
"publish": true,
|
||||
"unit": "Watt",
|
||||
"type": "socket"
|
||||
},
|
||||
{
|
||||
"calc": "PWR3/time",
|
||||
"name": "mem_power",
|
||||
"unit": "Watt",
|
||||
"publish": true,
|
||||
"type": "socket"
|
||||
},
|
||||
{
|
||||
"calc": "1E-9*(PMC0*2.0+PMC1+PMC2*4.0)/time",
|
||||
"name": "flops_dp1",
|
||||
"unit": "GFlops/s",
|
||||
"publish": false,
|
||||
"type": "hwthread"
|
||||
},
|
||||
{
|
||||
"calc": "1E-9*(MBOX0C0+MBOX1C0+MBOX2C0+MBOX3C0+MBOX4C0+MBOX5C0+MBOX0C1+MBOX1C1+MBOX2C1+MBOX3C1+MBOX4C1+MBOX5C1)*64.0/time",
|
||||
"name": "mem_bw",
|
||||
"unit": "GBytes/s",
|
||||
"publish": true,
|
||||
"type": "socket"
|
||||
},
|
||||
{
|
||||
"calc": "PMC0+PMC2",
|
||||
"name": "dp_vec_ins",
|
||||
"type": "hwthread",
|
||||
"publish": false
|
||||
},
|
||||
{
|
||||
"calc": "PMC0+PMC1+PMC2",
|
||||
"name": "dp_fp_ins",
|
||||
"type": "hwthread",
|
||||
"publish": false
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"events": {
|
||||
"FIXC0": "INSTR_RETIRED_ANY",
|
||||
"FIXC1": "CPU_CLK_UNHALTED_CORE",
|
||||
"FIXC2": "CPU_CLK_UNHALTED_REF",
|
||||
"PMC0": "FP_ARITH_INST_RETIRED_128B_PACKED_SINGLE",
|
||||
"PMC1": "FP_ARITH_INST_RETIRED_SCALAR_SINGLE",
|
||||
"PMC2": "FP_ARITH_INST_RETIRED_256B_PACKED_SINGLE"
|
||||
},
|
||||
"metrics": [
|
||||
{
|
||||
"calc": "1E-9*(PMC0*4.0+PMC1+PMC2*8.0)/time",
|
||||
"name": "flops_sp1",
|
||||
"publish": false,
|
||||
"type": "hwthread"
|
||||
},
|
||||
{
|
||||
"calc": "PMC0+PMC2",
|
||||
"name": "sp_vec_ins",
|
||||
"type": "hwthread",
|
||||
"publish": false
|
||||
},
|
||||
{
|
||||
"calc": "PMC0+PMC1+PMC2",
|
||||
"name": "sp_fp_ins",
|
||||
"type": "hwthread",
|
||||
"publish": false
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"events": {
|
||||
"FIXC0": "INSTR_RETIRED_ANY",
|
||||
"FIXC1": "CPU_CLK_UNHALTED_CORE",
|
||||
"FIXC2": "CPU_CLK_UNHALTED_REF",
|
||||
"PMC0": "FP_ARITH_INST_RETIRED_512B_PACKED_DOUBLE",
|
||||
"PMC1": "FP_ARITH_INST_RETIRED_512B_PACKED_SINGLE"
|
||||
},
|
||||
"metrics": [
|
||||
{
|
||||
"calc": "PMC0",
|
||||
"name": "dp_avx_512_ins",
|
||||
"type": "hwthread",
|
||||
"publish": false
|
||||
},
|
||||
{
|
||||
"calc": "1E-9*(PMC0*8.0)/time",
|
||||
"name": "flops_dp2",
|
||||
"type": "hwthread",
|
||||
"publish": false
|
||||
},
|
||||
{
|
||||
"calc": "PMC1",
|
||||
"name": "sp_avx_512_ins",
|
||||
"type": "hwthread",
|
||||
"publish": false
|
||||
},
|
||||
{
|
||||
"calc": "1E-9*(PMC1*16.0)/time",
|
||||
"name": "flops_sp2",
|
||||
"type": "hwthread",
|
||||
"publish": false
|
||||
}
|
||||
]
|
||||
}
|
||||
],
|
||||
"globalmetrics": [
|
||||
{
|
||||
"calc": "100*((sp_vec_ins+dp_vec_ins+dp_avx_512_ins+sp_avx_512_ins)/(sp_fp_ins+dp_fp_ins+dp_avx_512_ins+sp_avx_512_ins))",
|
||||
"name": "vectorization_ratio",
|
||||
"unit": "%",
|
||||
"type": "hwthread",
|
||||
"publish": true
|
||||
},
|
||||
{
|
||||
"calc": "(flops_sp1+flops_sp2)",
|
||||
"name": "flops_sp",
|
||||
"unit": "GFlops/s",
|
||||
"type": "hwthread",
|
||||
"publish": true
|
||||
},
|
||||
{
|
||||
"calc": "(flops_dp1+flops_dp2)",
|
||||
"name": "flops_dp",
|
||||
"unit": "GFlops/s",
|
||||
"type": "hwthread",
|
||||
"publish": true
|
||||
},
|
||||
{
|
||||
"calc": "((flops_dp1+flops_dp2) * 2) + (flops_sp1+flops_sp2)",
|
||||
"name": "flops_any",
|
||||
"unit": "GFlops/s",
|
||||
"type": "hwthread",
|
||||
"publish": true
|
||||
}
|
||||
]
|
||||
}
|
||||
|
||||
}
|
175
nhr@fau/cc-metric-collector/tinygpu/collectors.skx.3080.json
Normal file
175
nhr@fau/cc-metric-collector/tinygpu/collectors.skx.3080.json
Normal file
@@ -0,0 +1,175 @@
|
||||
{
|
||||
"nfs4stat" : {},
|
||||
"memstat" : {
|
||||
"numa_stats": true,
|
||||
"node_stats": true
|
||||
},
|
||||
"cpustat" : {},
|
||||
"loadavg" : {},
|
||||
"schedstat": {},
|
||||
"netstat" : {
|
||||
"include_devices" : [
|
||||
"eth0",
|
||||
"eth1",
|
||||
"eth2"
|
||||
],
|
||||
"send_abs_values": true,
|
||||
"send_derived_values": true
|
||||
},
|
||||
"diskstat" : {},
|
||||
"iostat" : {},
|
||||
"tempstat" : {
|
||||
"tag_override" : {
|
||||
"hwmon1" : {
|
||||
"type" : "socket",
|
||||
"type-id" : "0"
|
||||
},
|
||||
"hwmon2" : {
|
||||
"type" : "socket",
|
||||
"type-id" : "1"
|
||||
}
|
||||
}
|
||||
},
|
||||
"nvidia" : {
|
||||
"use_pci_info_as_type_id": true,
|
||||
"process_mig_devices": true
|
||||
},
|
||||
"likwid": {
|
||||
"force_overwrite" : true,
|
||||
"invalid_to_zero" : true,
|
||||
"access_mode" : "accessdaemon",
|
||||
"accessdaemon_path" : "/apps/likwid/system/sbin",
|
||||
"liblikwid_path": "/apps/likwid/system/lib/liblikwid.so",
|
||||
"eventsets": [
|
||||
{
|
||||
"events": {
|
||||
"FIXC0": "INSTR_RETIRED_ANY",
|
||||
"FIXC1": "CPU_CLK_UNHALTED_CORE",
|
||||
"FIXC2": "CPU_CLK_UNHALTED_REF",
|
||||
"MBOX0C0": "CAS_COUNT_RD",
|
||||
"MBOX0C1": "CAS_COUNT_WR",
|
||||
"MBOX1C0": "CAS_COUNT_RD",
|
||||
"MBOX1C1": "CAS_COUNT_WR",
|
||||
"MBOX2C0": "CAS_COUNT_RD",
|
||||
"MBOX2C1": "CAS_COUNT_WR",
|
||||
"MBOX3C0": "CAS_COUNT_RD",
|
||||
"MBOX3C1": "CAS_COUNT_WR",
|
||||
"MBOX4C0": "CAS_COUNT_RD",
|
||||
"MBOX4C1": "CAS_COUNT_WR",
|
||||
"MBOX5C0": "CAS_COUNT_RD",
|
||||
"MBOX5C1": "CAS_COUNT_WR",
|
||||
"PMC0": "FP_ARITH_INST_RETIRED_128B_PACKED_DOUBLE",
|
||||
"PMC1": "FP_ARITH_INST_RETIRED_SCALAR_DOUBLE",
|
||||
"PMC2": "FP_ARITH_INST_RETIRED_256B_PACKED_DOUBLE",
|
||||
"PMC3": "FP_ARITH_INST_RETIRED_512B_PACKED_DOUBLE",
|
||||
"PWR0": "PWR_PKG_ENERGY",
|
||||
"PWR3": "PWR_DRAM_ENERGY"
|
||||
},
|
||||
"metrics": [
|
||||
{
|
||||
"calc": "0.000001*(FIXC1/FIXC2)/inverseClock",
|
||||
"name": "clock",
|
||||
"publish": true,
|
||||
"unit": "MHz",
|
||||
"type": "hwthread"
|
||||
},
|
||||
{
|
||||
"calc": "FIXC0/FIXC1",
|
||||
"name": "ipc",
|
||||
"publish": true,
|
||||
"type": "hwthread"
|
||||
},
|
||||
{
|
||||
"calc": "PWR0/time",
|
||||
"name": "cpu_power",
|
||||
"publish": true,
|
||||
"unit": "Watt",
|
||||
"type": "socket"
|
||||
},
|
||||
{
|
||||
"calc": "PWR3/time",
|
||||
"name": "mem_power",
|
||||
"unit": "Watt",
|
||||
"publish": true,
|
||||
"type": "socket"
|
||||
},
|
||||
{
|
||||
"calc": "1E-9*(PMC0*2.0+PMC1+PMC2*4.0+PMC3*8.0)/time",
|
||||
"name": "flops_dp",
|
||||
"unit": "GFlops/s",
|
||||
"publish": true,
|
||||
"type": "hwthread"
|
||||
},
|
||||
{
|
||||
"calc": "1E-9*(MBOX0C0+MBOX1C0+MBOX2C0+MBOX3C0+MBOX4C0+MBOX5C0+MBOX0C1+MBOX1C1+MBOX2C1+MBOX3C1+MBOX4C1+MBOX5C1)*64.0/time",
|
||||
"name": "mem_bw",
|
||||
"unit": "GBytes/s",
|
||||
"publish": true,
|
||||
"type": "socket"
|
||||
},
|
||||
{
|
||||
"calc": "PMC0+PMC2+PMC3",
|
||||
"name": "dp_vec_ins",
|
||||
"type": "hwthread",
|
||||
"publish": false
|
||||
},
|
||||
{
|
||||
"calc": "PMC0+PMC1+PMC2+PMC3",
|
||||
"name": "dp_fp_ins",
|
||||
"type": "hwthread",
|
||||
"publish": false
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"events": {
|
||||
"FIXC0": "INSTR_RETIRED_ANY",
|
||||
"FIXC1": "CPU_CLK_UNHALTED_CORE",
|
||||
"FIXC2": "CPU_CLK_UNHALTED_REF",
|
||||
"PMC0": "FP_ARITH_INST_RETIRED_128B_PACKED_SINGLE",
|
||||
"PMC1": "FP_ARITH_INST_RETIRED_SCALAR_SINGLE",
|
||||
"PMC2": "FP_ARITH_INST_RETIRED_256B_PACKED_SINGLE",
|
||||
"PMC3": "FP_ARITH_INST_RETIRED_512B_PACKED_SINGLE"
|
||||
},
|
||||
"metrics": [
|
||||
{
|
||||
"calc": "1E-9*(PMC0*4.0+PMC1+PMC2*8.0+PMC3*16.0)/time",
|
||||
"name": "flops_sp",
|
||||
"unit": "GFlops/s",
|
||||
"publish": true,
|
||||
"type": "hwthread"
|
||||
},
|
||||
{
|
||||
"calc": "PMC0+PMC2+PMC3",
|
||||
"name": "sp_vec_ins",
|
||||
"type": "hwthread",
|
||||
"publish": false
|
||||
},
|
||||
{
|
||||
"calc": "PMC0+PMC1+PMC2+PMC3",
|
||||
"name": "sp_fp_ins",
|
||||
"type": "hwthread",
|
||||
"publish": false
|
||||
}
|
||||
]
|
||||
}
|
||||
],
|
||||
"globalmetrics": [
|
||||
{
|
||||
"calc": "100*((sp_vec_ins+dp_vec_ins)/(sp_fp_ins+dp_fp_ins))",
|
||||
"name": "vectorization_ratio",
|
||||
"unit": "%",
|
||||
"type": "hwthread",
|
||||
"publish": true
|
||||
},
|
||||
{
|
||||
"calc": "(flops_dp * 2) + flops_sp",
|
||||
"name": "flops_any",
|
||||
"unit": "GFlops/s",
|
||||
"type": "hwthread",
|
||||
"publish": true
|
||||
}
|
||||
]
|
||||
}
|
||||
|
||||
}
|
219
nhr@fau/cc-metric-collector/tinygpu/collectors.skx.v100.json
Normal file
219
nhr@fau/cc-metric-collector/tinygpu/collectors.skx.v100.json
Normal file
@@ -0,0 +1,219 @@
|
||||
{
|
||||
"nfs4stat" : {},
|
||||
"memstat" : {
|
||||
"numa_stats": true,
|
||||
"node_stats": true
|
||||
},
|
||||
"cpustat" : {},
|
||||
"loadavg" : {},
|
||||
"schedstat": {},
|
||||
"netstat" : {
|
||||
"include_devices" : [
|
||||
"eth0"
|
||||
],
|
||||
"send_abs_values": true,
|
||||
"send_derived_values": true
|
||||
},
|
||||
"diskstat" : {},
|
||||
"iostat" : {},
|
||||
"tempstat" : {
|
||||
"tag_override" : {
|
||||
"hwmon0" : {
|
||||
"type" : "socket",
|
||||
"type-id" : "0"
|
||||
},
|
||||
"hwmon1" : {
|
||||
"type" : "socket",
|
||||
"type-id" : "1"
|
||||
}
|
||||
}
|
||||
},
|
||||
"nvidia" : {
|
||||
"use_pci_info_as_type_id": true,
|
||||
"process_mig_devices": true
|
||||
},
|
||||
"likwid": {
|
||||
"force_overwrite" : true,
|
||||
"invalid_to_zero" : true,
|
||||
"access_mode" : "accessdaemon",
|
||||
"accessdaemon_path" : "/apps/likwid/system/sbin",
|
||||
"liblikwid_path": "/apps/likwid/system/lib/liblikwid.so",
|
||||
"eventsets": [
|
||||
{
|
||||
"events": {
|
||||
"FIXC0": "INSTR_RETIRED_ANY",
|
||||
"FIXC1": "CPU_CLK_UNHALTED_CORE",
|
||||
"FIXC2": "CPU_CLK_UNHALTED_REF",
|
||||
"MBOX0C0": "CAS_COUNT_RD",
|
||||
"MBOX0C1": "CAS_COUNT_WR",
|
||||
"MBOX1C0": "CAS_COUNT_RD",
|
||||
"MBOX1C1": "CAS_COUNT_WR",
|
||||
"MBOX2C0": "CAS_COUNT_RD",
|
||||
"MBOX2C1": "CAS_COUNT_WR",
|
||||
"MBOX3C0": "CAS_COUNT_RD",
|
||||
"MBOX3C1": "CAS_COUNT_WR",
|
||||
"MBOX4C0": "CAS_COUNT_RD",
|
||||
"MBOX4C1": "CAS_COUNT_WR",
|
||||
"MBOX5C0": "CAS_COUNT_RD",
|
||||
"MBOX5C1": "CAS_COUNT_WR",
|
||||
"PMC0": "FP_ARITH_INST_RETIRED_128B_PACKED_DOUBLE",
|
||||
"PMC1": "FP_ARITH_INST_RETIRED_SCALAR_DOUBLE",
|
||||
"PMC2": "FP_ARITH_INST_RETIRED_256B_PACKED_DOUBLE",
|
||||
"PWR0": "PWR_PKG_ENERGY",
|
||||
"PWR3": "PWR_DRAM_ENERGY"
|
||||
},
|
||||
"metrics": [
|
||||
{
|
||||
"calc": "0.000001*(FIXC1/FIXC2)/inverseClock",
|
||||
"name": "clock",
|
||||
"publish": true,
|
||||
"unit": "MHz",
|
||||
"type": "hwthread"
|
||||
},
|
||||
{
|
||||
"calc": "FIXC0/FIXC1",
|
||||
"name": "ipc",
|
||||
"publish": true,
|
||||
"type": "hwthread"
|
||||
},
|
||||
{
|
||||
"calc": "PWR0/time",
|
||||
"name": "cpu_power",
|
||||
"publish": true,
|
||||
"unit": "Watt",
|
||||
"type": "socket"
|
||||
},
|
||||
{
|
||||
"calc": "PWR3/time",
|
||||
"name": "mem_power",
|
||||
"unit": "Watt",
|
||||
"publish": true,
|
||||
"type": "socket"
|
||||
},
|
||||
{
|
||||
"calc": "1E-9*(PMC0*2.0+PMC1+PMC2*4.0)/time",
|
||||
"name": "flops_dp1",
|
||||
"unit": "GFlops/s",
|
||||
"publish": false,
|
||||
"type": "hwthread"
|
||||
},
|
||||
{
|
||||
"calc": "1E-9*(MBOX0C0+MBOX1C0+MBOX2C0+MBOX3C0+MBOX4C0+MBOX5C0+MBOX0C1+MBOX1C1+MBOX2C1+MBOX3C1+MBOX4C1+MBOX5C1)*64.0/time",
|
||||
"name": "mem_bw",
|
||||
"unit": "GBytes/s",
|
||||
"publish": true,
|
||||
"type": "socket"
|
||||
},
|
||||
{
|
||||
"calc": "PMC0+PMC2",
|
||||
"name": "dp_vec_ins",
|
||||
"type": "hwthread",
|
||||
"publish": false
|
||||
},
|
||||
{
|
||||
"calc": "PMC0+PMC1+PMC2",
|
||||
"name": "dp_fp_ins",
|
||||
"type": "hwthread",
|
||||
"publish": false
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"events": {
|
||||
"FIXC0": "INSTR_RETIRED_ANY",
|
||||
"FIXC1": "CPU_CLK_UNHALTED_CORE",
|
||||
"FIXC2": "CPU_CLK_UNHALTED_REF",
|
||||
"PMC0": "FP_ARITH_INST_RETIRED_128B_PACKED_SINGLE",
|
||||
"PMC1": "FP_ARITH_INST_RETIRED_SCALAR_SINGLE",
|
||||
"PMC2": "FP_ARITH_INST_RETIRED_256B_PACKED_SINGLE"
|
||||
},
|
||||
"metrics": [
|
||||
{
|
||||
"calc": "1E-9*(PMC0*4.0+PMC1+PMC2*8.0)/time",
|
||||
"name": "flops_sp1",
|
||||
"publish": false,
|
||||
"type": "hwthread"
|
||||
},
|
||||
{
|
||||
"calc": "PMC0+PMC2",
|
||||
"name": "sp_vec_ins",
|
||||
"type": "hwthread",
|
||||
"publish": false
|
||||
},
|
||||
{
|
||||
"calc": "PMC0+PMC1+PMC2",
|
||||
"name": "sp_fp_ins",
|
||||
"type": "hwthread",
|
||||
"publish": false
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"events": {
|
||||
"FIXC0": "INSTR_RETIRED_ANY",
|
||||
"FIXC1": "CPU_CLK_UNHALTED_CORE",
|
||||
"FIXC2": "CPU_CLK_UNHALTED_REF",
|
||||
"PMC0": "FP_ARITH_INST_RETIRED_512B_PACKED_DOUBLE",
|
||||
"PMC1": "FP_ARITH_INST_RETIRED_512B_PACKED_SINGLE"
|
||||
},
|
||||
"metrics": [
|
||||
{
|
||||
"calc": "PMC0",
|
||||
"name": "dp_avx_512_ins",
|
||||
"type": "hwthread",
|
||||
"publish": false
|
||||
},
|
||||
{
|
||||
"calc": "1E-9*(PMC0*8.0)/time",
|
||||
"name": "flops_dp2",
|
||||
"type": "hwthread",
|
||||
"publish": false
|
||||
},
|
||||
{
|
||||
"calc": "PMC1",
|
||||
"name": "sp_avx_512_ins",
|
||||
"type": "hwthread",
|
||||
"publish": false
|
||||
},
|
||||
{
|
||||
"calc": "1E-9*(PMC1*16.0)/time",
|
||||
"name": "flops_sp2",
|
||||
"type": "hwthread",
|
||||
"publish": false
|
||||
}
|
||||
]
|
||||
}
|
||||
],
|
||||
"globalmetrics": [
|
||||
{
|
||||
"calc": "100*((sp_vec_ins+dp_vec_ins+dp_avx_512_ins+sp_avx_512_ins)/(sp_fp_ins+dp_fp_ins+dp_avx_512_ins+sp_avx_512_ins))",
|
||||
"name": "vectorization_ratio",
|
||||
"unit": "%",
|
||||
"type": "hwthread",
|
||||
"publish": true
|
||||
},
|
||||
{
|
||||
"calc": "(flops_sp1+flops_sp2)",
|
||||
"name": "flops_sp",
|
||||
"unit": "GFlops/s",
|
||||
"type": "hwthread",
|
||||
"publish": true
|
||||
},
|
||||
{
|
||||
"calc": "(flops_dp1+flops_dp2)",
|
||||
"name": "flops_dp",
|
||||
"unit": "GFlops/s",
|
||||
"type": "hwthread",
|
||||
"publish": true
|
||||
},
|
||||
{
|
||||
"calc": "((flops_dp1+flops_dp2) * 2) + (flops_sp1+flops_sp2)",
|
||||
"name": "flops_any",
|
||||
"unit": "GFlops/s",
|
||||
"type": "hwthread",
|
||||
"publish": true
|
||||
}
|
||||
]
|
||||
}
|
||||
|
||||
}
|
8
nhr@fau/cc-metric-collector/tinygpu/config.json
Normal file
8
nhr@fau/cc-metric-collector/tinygpu/config.json
Normal file
@@ -0,0 +1,8 @@
|
||||
{
|
||||
"sinks": "/etc/cc-metric-collector/sinks.json",
|
||||
"collectors" : "/etc/cc-metric-collector/collectors.json",
|
||||
"receivers" : "/etc/cc-metric-collector/receivers.json",
|
||||
"router" : "/etc/cc-metric-collector/router.json",
|
||||
"interval": "60s",
|
||||
"duration": "10s"
|
||||
}
|
1
nhr@fau/cc-metric-collector/tinygpu/receivers.json
Normal file
1
nhr@fau/cc-metric-collector/tinygpu/receivers.json
Normal file
@@ -0,0 +1 @@
|
||||
{}
|
60
nhr@fau/cc-metric-collector/tinygpu/router.json
Normal file
60
nhr@fau/cc-metric-collector/tinygpu/router.json
Normal file
@@ -0,0 +1,60 @@
|
||||
{
|
||||
"add_tags" : [
|
||||
{
|
||||
"key" : "cluster",
|
||||
"value" : "tinygpu",
|
||||
"if" : "*"
|
||||
}
|
||||
],
|
||||
"rename_metrics" : {
|
||||
"load_one" : "cpu_load",
|
||||
"cpu_load_core" : "cpu_load",
|
||||
"net_bytes_in_bw" : "net_bytes_in",
|
||||
"net_bytes_out_bw" : "net_bytes_out",
|
||||
"net_pkts_in_bw" : "net_pkts_in",
|
||||
"net_pkts_out_bw" : "net_pkts_out",
|
||||
"ib_recv_bw" : "ib_recv",
|
||||
"ib_xmit_bw" : "ib_xmit",
|
||||
"ib_recv_pkts_bw": "ib_recv_pkts",
|
||||
"ib_xmit_pkts_bw": "ib_xmit_pkts",
|
||||
"lustre_read_bytes_diff" : "lustre_read_bytes",
|
||||
"lustre_read_requests_diff" : "lustre_read_requests",
|
||||
"lustre_write_bytes_diff" : "lustre_write_bytes",
|
||||
"lustre_write_requests_diff" : "lustre_write_requests",
|
||||
"lustre_open_diff" : "lustre_open",
|
||||
"lustre_close_diff" : "lustre_close",
|
||||
"lustre_setattr_diff" : "lustre_setattr",
|
||||
"lustre_getattr_diff" : "lustre_getattr",
|
||||
"lustre_statfs_diff": "lustre_statfs",
|
||||
"lustre_inode_permission_diff" : "lustre_inode_permission",
|
||||
"nv_util" : "acc_utilization",
|
||||
"nv_fb_mem_used" : "acc_mem_used",
|
||||
"nv_power_usage" : "acc_power",
|
||||
"pwr_pkg": "cpu_power",
|
||||
"pwr_dram": "mem_power"
|
||||
},
|
||||
"drop_metrics" : [
|
||||
"net_bytes_in",
|
||||
"net_bytes_out",
|
||||
"ib_recv",
|
||||
"ib_xmit",
|
||||
"ib_recv_pkts",
|
||||
"ib_xmit_pkts",
|
||||
"net_pkts_in",
|
||||
"net_pkts_out",
|
||||
"lustre_read_bytes",
|
||||
"lustre_read_requests",
|
||||
"lustre_write_bytes",
|
||||
"lustre_write_requests"
|
||||
],
|
||||
"interval_timestamp" : false,
|
||||
"num_cache_intervals" : 0,
|
||||
"change_unit_prefix": {
|
||||
"mem_used": "G",
|
||||
"swap_used": "G",
|
||||
"mem_total": "G",
|
||||
"swap_total": "G",
|
||||
"cpufreq": "M"
|
||||
},
|
||||
"normalize_metrics" : true
|
||||
}
|
26
nhr@fau/cc-metric-collector/tinygpu/sinks.json
Normal file
26
nhr@fau/cc-metric-collector/tinygpu/sinks.json
Normal file
@@ -0,0 +1,26 @@
|
||||
{
|
||||
"influx": {
|
||||
"type": "influxasync",
|
||||
"host": "monitoring-test.nhr.uni-erlangen.de",
|
||||
"port": "8086",
|
||||
"organization": "ClusterCockpit",
|
||||
"database": "tinygpu",
|
||||
"password": "XZY",
|
||||
"ssl": true,
|
||||
"meta_as_tags": [
|
||||
"unit"
|
||||
]
|
||||
},
|
||||
"metricstore": {
|
||||
"type": "http",
|
||||
"url": "http://monitoring.nhr.fau.de:8082/api/write?cluster=tinygpu",
|
||||
"jwt": "XZY",
|
||||
"meta_as_tags": [
|
||||
"unit"
|
||||
],
|
||||
"idle_connection_timeout": "60s",
|
||||
"flush_delay": "2s",
|
||||
"max_retries": 1,
|
||||
"timeout": "10s"
|
||||
}
|
||||
}
|
30
nhr@fau/cc-metric-collector/tinygpu/sinks_debug.json
Normal file
30
nhr@fau/cc-metric-collector/tinygpu/sinks_debug.json
Normal file
@@ -0,0 +1,30 @@
|
||||
{
|
||||
"influx": {
|
||||
"type": "influxasync",
|
||||
"host": "monitoring-test.nhr.uni-erlangen.de",
|
||||
"port": "8086",
|
||||
"organization": "ClusterCockpit",
|
||||
"database": "tinygpu",
|
||||
"password": "XZY",
|
||||
"ssl": true,
|
||||
"meta_as_tags": [
|
||||
"unit"
|
||||
]
|
||||
},
|
||||
"metricstore": {
|
||||
"type": "http",
|
||||
"url": "http://monitoring.nhr.fau.de:8082/api/write?cluster=tinygpu",
|
||||
"jwt": "XZY",
|
||||
"meta_as_tags": [
|
||||
"unit"
|
||||
],
|
||||
"idle_connection_timeout": "60s",
|
||||
"flush_delay": "2s",
|
||||
"max_retries": 1,
|
||||
"timeout": "10s"
|
||||
},
|
||||
"debugstdout": {
|
||||
"type": "stdout",
|
||||
"output_file": "/tmp/debug.log"
|
||||
}
|
||||
}
|
176
nhr@fau/cc-metric-collector/woody-ng/collectors.icx.json
Normal file
176
nhr@fau/cc-metric-collector/woody-ng/collectors.icx.json
Normal file
@@ -0,0 +1,176 @@
|
||||
{
|
||||
"diskstat" : {},
|
||||
"iostat" : {},
|
||||
"cpustat": {},
|
||||
"memstat" : {
|
||||
"numa_stats": true,
|
||||
"node_stats": true
|
||||
},
|
||||
"loadavg" : {},
|
||||
"schedstat": {},
|
||||
"netstat" : {
|
||||
"include_devices" : [
|
||||
"enp2s0f0",
|
||||
"eno1",
|
||||
"ens2f0"
|
||||
],
|
||||
"send_abs_values" : true,
|
||||
"send_derived_values" : true
|
||||
},
|
||||
"tempstat" : {
|
||||
"tag_override" : {
|
||||
"hwmon0" : {
|
||||
"type" : "socket",
|
||||
"type-id" : "0"
|
||||
},
|
||||
"hwmon1" : {
|
||||
"type" : "socket",
|
||||
"type-id" : "1"
|
||||
}
|
||||
}
|
||||
},
|
||||
"nfs4stat" : {},
|
||||
"likwid": {
|
||||
"force_overwrite" : true,
|
||||
"invalid_to_zero" : true,
|
||||
"access_mode" : "accessdaemon",
|
||||
"accessdaemon_path" : "/apps/likwid/system/sbin",
|
||||
"liblikwid_path": "/apps/likwid/system/lib/liblikwid.so",
|
||||
"eventsets": [
|
||||
{
|
||||
"events": {
|
||||
"FIXC0": "INSTR_RETIRED_ANY",
|
||||
"FIXC1": "CPU_CLK_UNHALTED_CORE",
|
||||
"FIXC2": "CPU_CLK_UNHALTED_REF",
|
||||
"MBOX0C0": "CAS_COUNT_RD",
|
||||
"MBOX0C1": "CAS_COUNT_WR",
|
||||
"MBOX1C0": "CAS_COUNT_RD",
|
||||
"MBOX1C1": "CAS_COUNT_WR",
|
||||
"MBOX2C0": "CAS_COUNT_RD",
|
||||
"MBOX2C1": "CAS_COUNT_WR",
|
||||
"MBOX3C0": "CAS_COUNT_RD",
|
||||
"MBOX3C1": "CAS_COUNT_WR",
|
||||
"MBOX4C0": "CAS_COUNT_RD",
|
||||
"MBOX4C1": "CAS_COUNT_WR",
|
||||
"MBOX5C0": "CAS_COUNT_RD",
|
||||
"MBOX5C1": "CAS_COUNT_WR",
|
||||
"MBOX6C0": "CAS_COUNT_RD",
|
||||
"MBOX6C1": "CAS_COUNT_WR",
|
||||
"MBOX7C0": "CAS_COUNT_RD",
|
||||
"MBOX7C1": "CAS_COUNT_WR",
|
||||
"PMC0": "FP_ARITH_INST_RETIRED_128B_PACKED_DOUBLE",
|
||||
"PMC1": "FP_ARITH_INST_RETIRED_SCALAR_DOUBLE",
|
||||
"PMC2": "FP_ARITH_INST_RETIRED_256B_PACKED_DOUBLE",
|
||||
"PMC3": "FP_ARITH_INST_RETIRED_512B_PACKED_DOUBLE",
|
||||
"PWR0": "PWR_PKG_ENERGY",
|
||||
"PWR3": "PWR_DRAM_ENERGY"
|
||||
},
|
||||
"metrics": [
|
||||
{
|
||||
"calc": "0.000001*(FIXC1/FIXC2)/inverseClock",
|
||||
"name": "clock",
|
||||
"publish": true,
|
||||
"unit": "MHz",
|
||||
"type": "hwthread"
|
||||
},
|
||||
{
|
||||
"calc": "FIXC0/FIXC1",
|
||||
"name": "ipc",
|
||||
"publish": true,
|
||||
"type": "hwthread"
|
||||
},
|
||||
{
|
||||
"calc": "PWR0/time",
|
||||
"name": "cpu_power",
|
||||
"publish": true,
|
||||
"unit": "Watt",
|
||||
"type": "socket"
|
||||
},
|
||||
{
|
||||
"calc": "PWR3/time",
|
||||
"name": "mem_power",
|
||||
"unit": "Watt",
|
||||
"publish": true,
|
||||
"type": "socket"
|
||||
},
|
||||
{
|
||||
"calc": "1E-9*(PMC0*2.0+PMC1+PMC2*4.0+PMC3*8.0)/time",
|
||||
"name": "flops_dp",
|
||||
"unit": "GFlops/s",
|
||||
"publish": true,
|
||||
"type": "hwthread"
|
||||
},
|
||||
{
|
||||
"calc": "1E-9*(MBOX0C0+MBOX1C0+MBOX2C0+MBOX3C0+MBOX4C0+MBOX5C0+MBOX6C0+MBOX7C0+MBOX0C1+MBOX1C1+MBOX2C1+MBOX3C1+MBOX4C1+MBOX5C1+MBOX6C1+MBOX7C1)*64.0/time",
|
||||
"name": "mem_bw",
|
||||
"unit": "GBytes/s",
|
||||
"publish": true,
|
||||
"type": "socket"
|
||||
},
|
||||
{
|
||||
"calc": "PMC0+PMC2+PMC3",
|
||||
"name": "dp_vec_ins",
|
||||
"type": "hwthread",
|
||||
"publish": false
|
||||
},
|
||||
{
|
||||
"calc": "PMC0+PMC1+PMC2+PMC3",
|
||||
"name": "dp_fp_ins",
|
||||
"type": "hwthread",
|
||||
"publish": false
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"events": {
|
||||
"FIXC0": "INSTR_RETIRED_ANY",
|
||||
"FIXC1": "CPU_CLK_UNHALTED_CORE",
|
||||
"FIXC2": "CPU_CLK_UNHALTED_REF",
|
||||
"FIXC3": "TOPDOWN_SLOTS",
|
||||
"PMC0": "FP_ARITH_INST_RETIRED_128B_PACKED_SINGLE",
|
||||
"PMC1": "FP_ARITH_INST_RETIRED_SCALAR_SINGLE",
|
||||
"PMC2": "FP_ARITH_INST_RETIRED_256B_PACKED_SINGLE",
|
||||
"PMC3": "FP_ARITH_INST_RETIRED_512B_PACKED_SINGLE"
|
||||
},
|
||||
"metrics": [
|
||||
{
|
||||
"calc": "1E-9*(PMC0*4.0+PMC1+PMC2*8.0+PMC3*16.0)/time",
|
||||
"name": "flops_sp",
|
||||
"unit": "GFlops/s",
|
||||
"publish": true,
|
||||
"type": "hwthread"
|
||||
},
|
||||
{
|
||||
"calc": "PMC0+PMC2+PMC3",
|
||||
"name": "sp_vec_ins",
|
||||
"type": "hwthread",
|
||||
"publish": false
|
||||
},
|
||||
{
|
||||
"calc": "PMC0+PMC1+PMC2+PMC3",
|
||||
"name": "sp_fp_ins",
|
||||
"type": "hwthread",
|
||||
"publish": false
|
||||
}
|
||||
]
|
||||
}
|
||||
],
|
||||
"globalmetrics": [
|
||||
{
|
||||
"calc": "100*((sp_vec_ins+dp_vec_ins)/(sp_fp_ins+dp_fp_ins))",
|
||||
"name": "vectorization_ratio",
|
||||
"unit": "%",
|
||||
"type": "hwthread",
|
||||
"publish": true
|
||||
},
|
||||
{
|
||||
"calc": "(flops_dp * 2) + flops_sp",
|
||||
"name": "flops_any",
|
||||
"unit": "GFlops/s",
|
||||
"type": "hwthread",
|
||||
"publish": true
|
||||
}
|
||||
]
|
||||
}
|
||||
|
||||
}
|
147
nhr@fau/cc-metric-collector/woody-ng/collectors.skl.json
Normal file
147
nhr@fau/cc-metric-collector/woody-ng/collectors.skl.json
Normal file
@@ -0,0 +1,147 @@
|
||||
{
|
||||
"diskstat" : {},
|
||||
"iostat" : {},
|
||||
"cpustat": {},
|
||||
"memstat" : {
|
||||
"node_stats": true
|
||||
},
|
||||
"loadavg" : {},
|
||||
"schedstat": {},
|
||||
"netstat" : {
|
||||
"include_devices" : [
|
||||
"enp2s0f0",
|
||||
"eno1",
|
||||
"ens2f0"
|
||||
],
|
||||
"send_abs_values" : true,
|
||||
"send_derived_values" : true
|
||||
},
|
||||
"tempstat" : {},
|
||||
"nfs4stat" : {},
|
||||
"likwid": {
|
||||
"force_overwrite" : true,
|
||||
"invalid_to_zero" : true,
|
||||
"access_mode" : "accessdaemon",
|
||||
"accessdaemon_path" : "/apps/likwid/system/sbin",
|
||||
"liblikwid_path": "/apps/likwid/system/lib/liblikwid.so",
|
||||
"eventsets": [
|
||||
{
|
||||
"events": {
|
||||
"FIXC0": "INSTR_RETIRED_ANY",
|
||||
"FIXC1": "CPU_CLK_UNHALTED_CORE",
|
||||
"FIXC2": "CPU_CLK_UNHALTED_REF",
|
||||
"MBOX0C1": "DRAM_READS",
|
||||
"MBOX0C2": "DRAM_WRITES",
|
||||
"PMC0": "FP_ARITH_INST_RETIRED_128B_PACKED_DOUBLE",
|
||||
"PMC1": "FP_ARITH_INST_RETIRED_SCALAR_DOUBLE",
|
||||
"PMC2": "FP_ARITH_INST_RETIRED_256B_PACKED_DOUBLE",
|
||||
"PWR0": "PWR_PKG_ENERGY",
|
||||
"PWR3": "PWR_DRAM_ENERGY"
|
||||
},
|
||||
"metrics": [
|
||||
{
|
||||
"calc": "1E-6*(FIXC1/FIXC2)/inverseClock",
|
||||
"name": "clock",
|
||||
"unit": "MHz",
|
||||
"publish": true,
|
||||
"type": "hwthread"
|
||||
},
|
||||
{
|
||||
"calc": "FIXC0/FIXC1",
|
||||
"name": "ipc",
|
||||
"publish": true,
|
||||
"type": "hwthread"
|
||||
},
|
||||
{
|
||||
"calc": "PWR0/time",
|
||||
"name": "pwr_pkg",
|
||||
"unit": "Watt",
|
||||
"publish": true,
|
||||
"type": "socket"
|
||||
},
|
||||
{
|
||||
"calc": "PWR3/time",
|
||||
"name": "pwr_dram",
|
||||
"publish": true,
|
||||
"unit": "Watt",
|
||||
"type": "socket"
|
||||
},
|
||||
{
|
||||
"calc": "1E-9*(PMC0*2.0+PMC1+PMC2*4.0)/time",
|
||||
"name": "flops_dp",
|
||||
"publish": true,
|
||||
"unit": "GFlops/s",
|
||||
"type": "hwthread"
|
||||
},
|
||||
{
|
||||
"calc": "1E-9*(MBOX0C1+MBOX0C2)*64.0/time",
|
||||
"name": "mem_bw",
|
||||
"publish": true,
|
||||
"unit": "GBytes/s",
|
||||
"type": "socket"
|
||||
},
|
||||
{
|
||||
"calc": "PMC0+PMC2",
|
||||
"name": "dp_vec_ins",
|
||||
"type": "hwthread",
|
||||
"publish": false
|
||||
},
|
||||
{
|
||||
"calc": "PMC0+PMC1+PMC2",
|
||||
"name": "dp_fp_ins",
|
||||
"type": "hwthread",
|
||||
"publish": false
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"events": {
|
||||
"FIXC0": "INSTR_RETIRED_ANY",
|
||||
"FIXC1": "CPU_CLK_UNHALTED_CORE",
|
||||
"FIXC2": "CPU_CLK_UNHALTED_REF",
|
||||
"PMC0": "FP_ARITH_INST_RETIRED_128B_PACKED_SINGLE",
|
||||
"PMC1": "FP_ARITH_INST_RETIRED_SCALAR_SINGLE",
|
||||
"PMC2": "FP_ARITH_INST_RETIRED_256B_PACKED_SINGLE"
|
||||
},
|
||||
"metrics": [
|
||||
{
|
||||
"calc": "1E-9*(PMC0*4.0+PMC1+PMC2*8.0)/time",
|
||||
"name": "flops_sp",
|
||||
"publish": true,
|
||||
"unit": "GFlops/s",
|
||||
"type": "hwthread"
|
||||
},
|
||||
{
|
||||
"calc": "PMC0+PMC2",
|
||||
"name": "sp_vec_ins",
|
||||
"type": "hwthread",
|
||||
"publish": false
|
||||
},
|
||||
{
|
||||
"calc": "PMC0+PMC1+PMC2",
|
||||
"name": "sp_fp_ins",
|
||||
"type": "hwthread",
|
||||
"publish": false
|
||||
}
|
||||
]
|
||||
}
|
||||
],
|
||||
"globalmetrics": [
|
||||
{
|
||||
"calc": "100*((sp_vec_ins+dp_vec_ins)/(sp_fp_ins+dp_fp_ins))",
|
||||
"name": "vectorization_ratio",
|
||||
"unit": "%",
|
||||
"type": "hwthread",
|
||||
"publish": true
|
||||
},
|
||||
{
|
||||
"calc": "(flops_dp * 2) + flops_sp",
|
||||
"name": "flops_any",
|
||||
"type": "hwthread",
|
||||
"unit": "GFlops/s",
|
||||
"publish": true
|
||||
}
|
||||
]
|
||||
}
|
||||
|
||||
}
|
8
nhr@fau/cc-metric-collector/woody-ng/config.json
Normal file
8
nhr@fau/cc-metric-collector/woody-ng/config.json
Normal file
@@ -0,0 +1,8 @@
|
||||
{
|
||||
"sinks": "/etc/cc-metric-collector/sinks.json",
|
||||
"collectors" : "/etc/cc-metric-collector/collectors.json",
|
||||
"receivers" : "/etc/cc-metric-collector/receivers.json",
|
||||
"router" : "/etc/cc-metric-collector/router.json",
|
||||
"interval": "60s",
|
||||
"duration": "10s"
|
||||
}
|
1
nhr@fau/cc-metric-collector/woody-ng/receivers.json
Normal file
1
nhr@fau/cc-metric-collector/woody-ng/receivers.json
Normal file
@@ -0,0 +1 @@
|
||||
{}
|
54
nhr@fau/cc-metric-collector/woody-ng/router.json
Normal file
54
nhr@fau/cc-metric-collector/woody-ng/router.json
Normal file
@@ -0,0 +1,54 @@
|
||||
{
|
||||
"add_tags" : [
|
||||
{
|
||||
"key" : "cluster",
|
||||
"value" : "woodyng",
|
||||
"if" : "*"
|
||||
}
|
||||
],
|
||||
"rename_metrics" : {
|
||||
"load_one" : "cpu_load",
|
||||
"cpu_load_core" : "cpu_load",
|
||||
"net_bytes_in_bw" : "net_bytes_in",
|
||||
"net_bytes_out_bw" : "net_bytes_out",
|
||||
"net_pkts_in_bw" : "net_pkts_in",
|
||||
"net_pkts_out_bw" : "net_pkts_out",
|
||||
"ib_recv_bw" : "ib_recv",
|
||||
"ib_xmit_bw" : "ib_xmit",
|
||||
"ib_recv_pkts_bw": "ib_recv_pkts",
|
||||
"ib_xmit_pkts_bw": "ib_xmit_pkts",
|
||||
"lustre_read_bytes_diff" : "lustre_read_bytes",
|
||||
"lustre_read_requests_diff" : "lustre_read_requests",
|
||||
"lustre_write_bytes_diff" : "lustre_write_bytes",
|
||||
"lustre_write_requests_diff" : "lustre_write_requests",
|
||||
"lustre_open_diff" : "lustre_open",
|
||||
"lustre_close_diff" : "lustre_close",
|
||||
"lustre_setattr_diff" : "lustre_setattr",
|
||||
"lustre_getattr_diff" : "lustre_getattr",
|
||||
"lustre_statfs_diff": "lustre_statfs",
|
||||
"lustre_inode_permission_diff" : "lustre_inode_permission"
|
||||
},
|
||||
"drop_metrics" : [
|
||||
"net_bytes_in",
|
||||
"net_bytes_out",
|
||||
"ib_recv",
|
||||
"ib_xmit",
|
||||
"ib_recv_pkts",
|
||||
"ib_xmit_pkts",
|
||||
"net_pkts_in",
|
||||
"net_pkts_out",
|
||||
"lustre_read_bytes",
|
||||
"lustre_read_requests",
|
||||
"lustre_write_bytes",
|
||||
"lustre_write_requests"
|
||||
],
|
||||
"interval_timestamp" : true,
|
||||
"num_cache_intervals" : 0,
|
||||
"change_unit_prefix": {
|
||||
"mem_used": "G",
|
||||
"swap_used": "G",
|
||||
"mem_total": "G",
|
||||
"swap_total": "G"
|
||||
},
|
||||
"normalize_metrics" : true
|
||||
}
|
20
nhr@fau/cc-metric-collector/woody-ng/sinks.json
Normal file
20
nhr@fau/cc-metric-collector/woody-ng/sinks.json
Normal file
@@ -0,0 +1,20 @@
|
||||
{
|
||||
"nhrinflux": {
|
||||
"type": "influxasync",
|
||||
"host": "monitoring-test.nhr.uni-erlangen.de",
|
||||
"port": "8086",
|
||||
"organization": "ClusterCockpit",
|
||||
"database": "woodyng",
|
||||
"password": "XZY",
|
||||
"ssl": true
|
||||
},
|
||||
"woodystore": {
|
||||
"type": "http",
|
||||
"url": "http://monitoring.nhr.fau.de:8082/api/write?cluster=woodyng",
|
||||
"jwt": "XZY",
|
||||
"meta_as_tags": [
|
||||
"unit"
|
||||
],
|
||||
"idle_connection_timeout": "60s"
|
||||
}
|
||||
}
|
19
nhr@fau/cc-metric-store/cc-metric-store.service
Normal file
19
nhr@fau/cc-metric-store/cc-metric-store.service
Normal file
@@ -0,0 +1,19 @@
|
||||
[Unit]
|
||||
Description=ClusterCockpit In-Memory Timeseries Database for all NHR@FAU clusters (cc-metric-store)
|
||||
Documentation=https://github.com/ClusterCockpit/cc-metric-store
|
||||
Wants=network-online.target
|
||||
After=network-online.target
|
||||
|
||||
[Service]
|
||||
Type=simple
|
||||
User=clustercockpit
|
||||
Group=clustercockpit
|
||||
Restart=on-failure
|
||||
RestartSec=30
|
||||
TimeoutStopSec=100
|
||||
WorkingDirectory=/opt/monitoring/cc-metric-store/fritz
|
||||
ExecStart=/opt/monitoring/cc-metric-store/repo/cc-metric-store --config ./config.json
|
||||
LimitNOFILE=500000
|
||||
|
||||
[Install]
|
||||
WantedBy=multi-user.target
|
180
nhr@fau/cc-metric-store/config.json
Normal file
180
nhr@fau/cc-metric-store/config.json
Normal file
@@ -0,0 +1,180 @@
|
||||
{
|
||||
"metrics": {
|
||||
"clock": {
|
||||
"frequency": 60,
|
||||
"aggregation": "avg"
|
||||
},
|
||||
"cpu_idle": {
|
||||
"frequency": 60,
|
||||
"aggregation": "avg"
|
||||
},
|
||||
"cpu_iowait": {
|
||||
"frequency": 60,
|
||||
"aggregation": "avg"
|
||||
},
|
||||
"cpu_irq": {
|
||||
"frequency": 60,
|
||||
"aggregation": "avg"
|
||||
},
|
||||
"cpu_system": {
|
||||
"frequency": 60,
|
||||
"aggregation": "avg"
|
||||
},
|
||||
"cpu_user": {
|
||||
"frequency": 60,
|
||||
"aggregation": "avg"
|
||||
},
|
||||
"nv_mem_util": {
|
||||
"frequency": 60,
|
||||
"aggregation": "avg"
|
||||
},
|
||||
"nv_temp": {
|
||||
"frequency": 60,
|
||||
"aggregation": "avg"
|
||||
},
|
||||
"nv_sm_clock": {
|
||||
"frequency": 60,
|
||||
"aggregation": "avg"
|
||||
},
|
||||
"acc_utilization": {
|
||||
"frequency": 60,
|
||||
"aggregation": "avg"
|
||||
},
|
||||
"acc_mem_used": {
|
||||
"frequency": 60,
|
||||
"aggregation": "sum"
|
||||
},
|
||||
"acc_power": {
|
||||
"frequency": 60,
|
||||
"aggregation": "sum"
|
||||
},
|
||||
"flops_any": {
|
||||
"frequency": 60,
|
||||
"aggregation": "sum"
|
||||
},
|
||||
"flops_dp": {
|
||||
"frequency": 60,
|
||||
"aggregation": "sum"
|
||||
},
|
||||
"flops_sp": {
|
||||
"frequency": 60,
|
||||
"aggregation": "sum"
|
||||
},
|
||||
"ib_recv": {
|
||||
"frequency": 60,
|
||||
"aggregation": "sum"
|
||||
},
|
||||
"ib_xmit": {
|
||||
"frequency": 60,
|
||||
"aggregation": "sum"
|
||||
},
|
||||
"ib_recv_pkts": {
|
||||
"frequency": 60,
|
||||
"aggregation": "sum"
|
||||
},
|
||||
"ib_xmit_pkts": {
|
||||
"frequency": 60,
|
||||
"aggregation": "sum"
|
||||
},
|
||||
"cpu_power": {
|
||||
"frequency": 60,
|
||||
"aggregation": "sum"
|
||||
},
|
||||
"core_power": {
|
||||
"frequency": 60,
|
||||
"aggregation": "sum"
|
||||
},
|
||||
"mem_power": {
|
||||
"frequency": 60,
|
||||
"aggregation": "sum"
|
||||
},
|
||||
"ipc": {
|
||||
"frequency": 60,
|
||||
"aggregation": "avg"
|
||||
},
|
||||
"cpu_load": {
|
||||
"frequency": 60,
|
||||
"aggregation": null
|
||||
},
|
||||
"lustre_close": {
|
||||
"frequency": 60,
|
||||
"aggregation": null
|
||||
},
|
||||
"lustre_open": {
|
||||
"frequency": 60,
|
||||
"aggregation": null
|
||||
},
|
||||
"lustre_statfs": {
|
||||
"frequency": 60,
|
||||
"aggregation": null
|
||||
},
|
||||
"lustre_read_bytes": {
|
||||
"frequency": 60,
|
||||
"aggregation": null
|
||||
},
|
||||
"lustre_write_bytes": {
|
||||
"frequency": 60,
|
||||
"aggregation": null
|
||||
},
|
||||
"net_bw": {
|
||||
"frequency": 60,
|
||||
"aggregation": null
|
||||
},
|
||||
"file_bw": {
|
||||
"frequency": 60,
|
||||
"aggregation": null
|
||||
},
|
||||
"mem_bw": {
|
||||
"frequency": 60,
|
||||
"aggregation": "sum"
|
||||
},
|
||||
"mem_cached": {
|
||||
"frequency": 60,
|
||||
"aggregation": null
|
||||
},
|
||||
"mem_used": {
|
||||
"frequency": 60,
|
||||
"aggregation": null
|
||||
},
|
||||
"net_bytes_in": {
|
||||
"frequency": 60,
|
||||
"aggregation": null
|
||||
},
|
||||
"net_bytes_out": {
|
||||
"frequency": 60,
|
||||
"aggregation": null
|
||||
},
|
||||
"nfs4_read": {
|
||||
"frequency": 60,
|
||||
"aggregation": null
|
||||
},
|
||||
"nfs4_total": {
|
||||
"frequency": 60,
|
||||
"aggregation": null
|
||||
},
|
||||
"nfs4_write": {
|
||||
"frequency": 60,
|
||||
"aggregation": null
|
||||
},
|
||||
"vectorization_ratio": {
|
||||
"frequency": 60,
|
||||
"aggregation": "avg"
|
||||
}
|
||||
},
|
||||
"checkpoints": {
|
||||
"interval": "12h",
|
||||
"directory": "/opt/monitoring/cc-metric-store/fritz/checkpoints",
|
||||
"restore": "48h"
|
||||
},
|
||||
"archive": {
|
||||
"interval": "50h",
|
||||
"directory": "/opt/monitoring/cc-metric-store/fritz/archive"
|
||||
},
|
||||
"http-api": {
|
||||
"address": "0.0.0.0:8082",
|
||||
"https-cert-file": null,
|
||||
"https-key-file": null
|
||||
},
|
||||
"retention-in-memory": "48h",
|
||||
"jwt-public-key": "-"
|
||||
}
|
2809
nhr@fau/job-archive/cluster-alex.json
Normal file
2809
nhr@fau/job-archive/cluster-alex.json
Normal file
File diff suppressed because it is too large
Load Diff
2293
nhr@fau/job-archive/cluster-fritz.json
Normal file
2293
nhr@fau/job-archive/cluster-fritz.json
Normal file
File diff suppressed because it is too large
Load Diff
357
nhr@fau/job-archive/cluster-meggie.json
Normal file
357
nhr@fau/job-archive/cluster-meggie.json
Normal file
@@ -0,0 +1,357 @@
|
||||
{
|
||||
"name": "meggie",
|
||||
"metricConfig": [
|
||||
{
|
||||
"name": "cpu_load",
|
||||
"unit": {
|
||||
"base": "load"
|
||||
},
|
||||
"scope": "node",
|
||||
"aggregation": "avg",
|
||||
"footprint": "avg",
|
||||
"timestep": 60,
|
||||
"peak": 40,
|
||||
"normal": 20,
|
||||
"caution": 15,
|
||||
"alert": 10
|
||||
},
|
||||
{
|
||||
"name": "mem_used",
|
||||
"unit": {
|
||||
"base": "B",
|
||||
"prefix": "G"
|
||||
},
|
||||
"scope": "node",
|
||||
"aggregation": "sum",
|
||||
"footprint": "max",
|
||||
"timestep": 60,
|
||||
"peak": 64,
|
||||
"normal": 20,
|
||||
"caution": 40,
|
||||
"alert": 55
|
||||
},
|
||||
{
|
||||
"name": "flops_any",
|
||||
"unit": {
|
||||
"base": "Flops/s",
|
||||
"prefix": "G"
|
||||
},
|
||||
"scope": "node",
|
||||
"aggregation": "sum",
|
||||
"footprint": "avg",
|
||||
"timestep": 60,
|
||||
"peak": 1536,
|
||||
"normal": 200,
|
||||
"caution": 40,
|
||||
"alert": 4
|
||||
},
|
||||
{
|
||||
"name": "flops_sp",
|
||||
"unit": {
|
||||
"base": "Flops/s",
|
||||
"prefix": "G"
|
||||
},
|
||||
"scope": "node",
|
||||
"aggregation": "sum",
|
||||
"timestep": 60,
|
||||
"peak": 1536,
|
||||
"normal": 100,
|
||||
"caution": 20,
|
||||
"alert": 2
|
||||
},
|
||||
{
|
||||
"name": "flops_dp",
|
||||
"unit": {
|
||||
"base": "Flops/s",
|
||||
"prefix": "G"
|
||||
},
|
||||
"scope": "node",
|
||||
"aggregation": "sum",
|
||||
"timestep": 60,
|
||||
"peak": 768,
|
||||
"normal": 50,
|
||||
"caution": 10,
|
||||
"alert": 2
|
||||
},
|
||||
{
|
||||
"name": "mem_bw",
|
||||
"unit": {
|
||||
"base": "B/s",
|
||||
"prefix": "G"
|
||||
},
|
||||
"scope": "node",
|
||||
"aggregation": "sum",
|
||||
"footprint": "avg",
|
||||
"timestep": 60,
|
||||
"peak": 140,
|
||||
"normal": 70,
|
||||
"caution": 20,
|
||||
"alert": 5
|
||||
},
|
||||
{
|
||||
"name": "clock",
|
||||
"unit": {
|
||||
"base": "Hz",
|
||||
"prefix": "M"
|
||||
},
|
||||
"scope": "node",
|
||||
"aggregation": "avg",
|
||||
"timestep": 60,
|
||||
"peak": 3000,
|
||||
"normal": 2400,
|
||||
"caution": 1800,
|
||||
"alert": 1200
|
||||
},
|
||||
{
|
||||
"name": "cpu_power",
|
||||
"unit": {
|
||||
"base": "W"
|
||||
},
|
||||
"scope": "socket",
|
||||
"aggregation": "sum",
|
||||
"energy": "power",
|
||||
"timestep": 60,
|
||||
"peak": 80,
|
||||
"normal": 30,
|
||||
"caution": 10,
|
||||
"alert": 5
|
||||
},
|
||||
{
|
||||
"name": "mem_power",
|
||||
"unit": {
|
||||
"base": "W"
|
||||
},
|
||||
"scope": "socket",
|
||||
"aggregation": "sum",
|
||||
"energy": "power",
|
||||
"timestep": 60,
|
||||
"peak": 100,
|
||||
"normal": 50,
|
||||
"caution": 20,
|
||||
"alert": 10
|
||||
},
|
||||
{
|
||||
"name": "ipc",
|
||||
"unit": {
|
||||
"base": "IPC"
|
||||
},
|
||||
"scope": "node",
|
||||
"aggregation": "avg",
|
||||
"timestep": 60,
|
||||
"peak": 4,
|
||||
"normal": 2,
|
||||
"caution": 1,
|
||||
"alert": 0.5
|
||||
},
|
||||
{
|
||||
"name": "vectorization_ratio",
|
||||
"unit": {
|
||||
"base": ""
|
||||
},
|
||||
"scope": "hwthread",
|
||||
"aggregation": "avg",
|
||||
"timestep": 60,
|
||||
"peak": 100,
|
||||
"normal": 60,
|
||||
"caution": 40,
|
||||
"alert": 10
|
||||
},
|
||||
{
|
||||
"name": "nfs4_read",
|
||||
"unit": {
|
||||
"base": "IOP",
|
||||
"prefix": ""
|
||||
},
|
||||
"scope": "node",
|
||||
"aggregation": "sum",
|
||||
"timestep": 60,
|
||||
"peak": 6,
|
||||
"normal": 4,
|
||||
"caution": 2,
|
||||
"alert": 1
|
||||
},
|
||||
{
|
||||
"name": "nfs4_total",
|
||||
"unit": {
|
||||
"base": "IOP",
|
||||
"prefix": ""
|
||||
},
|
||||
"scope": "node",
|
||||
"aggregation": "sum",
|
||||
"timestep": 60,
|
||||
"peak": 6,
|
||||
"normal": 4,
|
||||
"caution": 2,
|
||||
"alert": 1
|
||||
}
|
||||
],
|
||||
"subClusters": [
|
||||
{
|
||||
"name": "main",
|
||||
"nodes": "m[0101-0164,0201-0264,0301-0364,0401-0464,0601-0676,0701-0776,0801-0872,0901-0972,1001-1072,1101-1172]",
|
||||
"processorType": "Intel Broadwell",
|
||||
"socketsPerNode": 2,
|
||||
"coresPerSocket": 10,
|
||||
"threadsPerCore": 1,
|
||||
"flopRateScalar": {
|
||||
"unit": {
|
||||
"base": "F/s",
|
||||
"prefix": "G"
|
||||
},
|
||||
"value": 96
|
||||
},
|
||||
"flopRateSimd": {
|
||||
"unit": {
|
||||
"base": "F/s",
|
||||
"prefix": "G"
|
||||
},
|
||||
"value": 1536
|
||||
},
|
||||
"memoryBandwidth": {
|
||||
"unit": {
|
||||
"base": "B/s",
|
||||
"prefix": "G"
|
||||
},
|
||||
"value": 140
|
||||
},
|
||||
"topology": {
|
||||
"node": [
|
||||
0,
|
||||
1,
|
||||
2,
|
||||
3,
|
||||
4,
|
||||
5,
|
||||
6,
|
||||
7,
|
||||
8,
|
||||
9,
|
||||
10,
|
||||
11,
|
||||
12,
|
||||
13,
|
||||
14,
|
||||
15,
|
||||
16,
|
||||
17,
|
||||
18,
|
||||
19
|
||||
],
|
||||
"socket": [
|
||||
[
|
||||
0,
|
||||
1,
|
||||
2,
|
||||
3,
|
||||
4,
|
||||
5,
|
||||
6,
|
||||
7,
|
||||
8,
|
||||
9
|
||||
],
|
||||
[
|
||||
10,
|
||||
11,
|
||||
12,
|
||||
13,
|
||||
14,
|
||||
15,
|
||||
16,
|
||||
17,
|
||||
18,
|
||||
19
|
||||
]
|
||||
],
|
||||
"memoryDomain": [
|
||||
[
|
||||
0,
|
||||
1,
|
||||
2,
|
||||
3,
|
||||
4,
|
||||
5,
|
||||
6,
|
||||
7,
|
||||
8,
|
||||
9
|
||||
],
|
||||
[
|
||||
10,
|
||||
11,
|
||||
12,
|
||||
13,
|
||||
14,
|
||||
15,
|
||||
16,
|
||||
17,
|
||||
18,
|
||||
19
|
||||
]
|
||||
],
|
||||
"core": [
|
||||
[
|
||||
0
|
||||
],
|
||||
[
|
||||
1
|
||||
],
|
||||
[
|
||||
2
|
||||
],
|
||||
[
|
||||
3
|
||||
],
|
||||
[
|
||||
4
|
||||
],
|
||||
[
|
||||
5
|
||||
],
|
||||
[
|
||||
6
|
||||
],
|
||||
[
|
||||
7
|
||||
],
|
||||
[
|
||||
8
|
||||
],
|
||||
[
|
||||
9
|
||||
],
|
||||
[
|
||||
10
|
||||
],
|
||||
[
|
||||
11
|
||||
],
|
||||
[
|
||||
12
|
||||
],
|
||||
[
|
||||
13
|
||||
],
|
||||
[
|
||||
14
|
||||
],
|
||||
[
|
||||
15
|
||||
],
|
||||
[
|
||||
16
|
||||
],
|
||||
[
|
||||
17
|
||||
],
|
||||
[
|
||||
18
|
||||
],
|
||||
[
|
||||
19
|
||||
]
|
||||
]
|
||||
}
|
||||
}
|
||||
]
|
||||
}
|
1411
nhr@fau/job-archive/cluster-tinyfat.json
Normal file
1411
nhr@fau/job-archive/cluster-tinyfat.json
Normal file
File diff suppressed because it is too large
Load Diff
2486
nhr@fau/job-archive/cluster-tinygpu.json
Normal file
2486
nhr@fau/job-archive/cluster-tinygpu.json
Normal file
File diff suppressed because it is too large
Load Diff
943
nhr@fau/job-archive/cluster-woody.json
Normal file
943
nhr@fau/job-archive/cluster-woody.json
Normal file
@@ -0,0 +1,943 @@
|
||||
{
|
||||
"name": "woody",
|
||||
"metricConfig": [
|
||||
{
|
||||
"name": "cpu_load",
|
||||
"unit": {
|
||||
"base": ""
|
||||
},
|
||||
"scope": "node",
|
||||
"aggregation": "avg",
|
||||
"timestep": 60,
|
||||
"peak": 4,
|
||||
"normal": 4,
|
||||
"caution": 4,
|
||||
"alert": 1,
|
||||
"footprint": "avg",
|
||||
"subClusters": [
|
||||
{
|
||||
"name": "icelake",
|
||||
"peak": 32,
|
||||
"normal": 32,
|
||||
"caution": 30,
|
||||
"footprint": "avg",
|
||||
"alert": 10
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "cpu_user",
|
||||
"unit": {
|
||||
"base": ""
|
||||
},
|
||||
"scope": "hwthread",
|
||||
"aggregation": "avg",
|
||||
"timestep": 60,
|
||||
"peak": 100,
|
||||
"normal": 50,
|
||||
"caution": 20,
|
||||
"alert": 10
|
||||
},
|
||||
{
|
||||
"name": "ipc",
|
||||
"unit": {
|
||||
"base": "IPC"
|
||||
},
|
||||
"scope": "node",
|
||||
"aggregation": "avg",
|
||||
"timestep": 60,
|
||||
"peak": 4,
|
||||
"normal": 2,
|
||||
"caution": 1,
|
||||
"alert": 0.25
|
||||
},
|
||||
{
|
||||
"name": "mem_used",
|
||||
"unit": {
|
||||
"base": "B",
|
||||
"prefix": "G"
|
||||
},
|
||||
"scope": "node",
|
||||
"aggregation": "sum",
|
||||
"footprint": "max",
|
||||
"timestep": 60,
|
||||
"peak": 32,
|
||||
"normal": 16,
|
||||
"caution": 28,
|
||||
"alert": 30,
|
||||
"subClusters": [
|
||||
{
|
||||
"name": "icelake",
|
||||
"peak": 256,
|
||||
"footprint": "max",
|
||||
"normal": 128,
|
||||
"caution": 200,
|
||||
"alert": 240
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "flops_any",
|
||||
"unit": {
|
||||
"base": "Flops/s",
|
||||
"prefix": "G"
|
||||
},
|
||||
"scope": "hwthread",
|
||||
"aggregation": "sum",
|
||||
"timestep": 60,
|
||||
"peak": 112,
|
||||
"normal": 50,
|
||||
"caution": 20,
|
||||
"alert": 10,
|
||||
"footprint": "avg",
|
||||
"subClusters": [
|
||||
{
|
||||
"name": "icelake",
|
||||
"footprint": "avg",
|
||||
"peak": 2970,
|
||||
"normal": 1000,
|
||||
"caution": 100,
|
||||
"alert": 50
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "net_bytes_in",
|
||||
"unit": {
|
||||
"base": "B/s"
|
||||
},
|
||||
"scope": "node",
|
||||
"aggregation": "sum",
|
||||
"timestep": 60,
|
||||
"peak": 125000000,
|
||||
"normal": 125000000,
|
||||
"caution": 200,
|
||||
"alert": 240
|
||||
},
|
||||
{
|
||||
"name": "net_bytes_out",
|
||||
"unit": {
|
||||
"base": "B/s"
|
||||
},
|
||||
"scope": "node",
|
||||
"aggregation": "sum",
|
||||
"timestep": 60,
|
||||
"peak": 125000000,
|
||||
"normal": 125000000,
|
||||
"caution": 200,
|
||||
"alert": 240
|
||||
},
|
||||
{
|
||||
"name": "flops_dp",
|
||||
"unit": {
|
||||
"base": "Flops/s",
|
||||
"prefix": "G"
|
||||
},
|
||||
"scope": "hwthread",
|
||||
"aggregation": "sum",
|
||||
"timestep": 60,
|
||||
"peak": 56,
|
||||
"normal": 30,
|
||||
"caution": 15,
|
||||
"alert": 5,
|
||||
"subClusters": [
|
||||
{
|
||||
"name": "icelake",
|
||||
"peak": 1450,
|
||||
"normal": 700,
|
||||
"caution": 100,
|
||||
"alert": 50
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "flops_sp",
|
||||
"unit": {
|
||||
"base": "Flops/s",
|
||||
"prefix": "G"
|
||||
},
|
||||
"scope": "hwthread",
|
||||
"aggregation": "sum",
|
||||
"timestep": 60,
|
||||
"peak": 112,
|
||||
"normal": 50,
|
||||
"caution": 20,
|
||||
"alert": 10,
|
||||
"subClusters": [
|
||||
{
|
||||
"name": "icelake",
|
||||
"peak": 2970,
|
||||
"normal": 1000,
|
||||
"caution": 100,
|
||||
"alert": 50
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "mem_bw",
|
||||
"unit": {
|
||||
"base": "B/s",
|
||||
"prefix": "G"
|
||||
},
|
||||
"scope": "socket",
|
||||
"aggregation": "sum",
|
||||
"timestep": 60,
|
||||
"peak": 24,
|
||||
"normal": 10,
|
||||
"caution": 5,
|
||||
"alert": 2,
|
||||
"footprint": "avg",
|
||||
"subClusters": [
|
||||
{
|
||||
"name": "icelake",
|
||||
"peak": 350,
|
||||
"footprint": "avg",
|
||||
"normal": 100,
|
||||
"caution": 50,
|
||||
"alert": 20
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "clock",
|
||||
"unit": {
|
||||
"base": "Hz",
|
||||
"prefix": "M"
|
||||
},
|
||||
"scope": "hwthread",
|
||||
"aggregation": "avg",
|
||||
"timestep": 60,
|
||||
"peak": 3000,
|
||||
"normal": 2000,
|
||||
"caution": 1500,
|
||||
"alert": 1200
|
||||
},
|
||||
{
|
||||
"name": "vectorization_ratio",
|
||||
"unit": {
|
||||
"base": ""
|
||||
},
|
||||
"scope": "hwthread",
|
||||
"aggregation": "avg",
|
||||
"timestep": 60,
|
||||
"peak": 100,
|
||||
"normal": 60,
|
||||
"caution": 40,
|
||||
"alert": 10
|
||||
},
|
||||
{
|
||||
"name": "nfs4_read",
|
||||
"unit": {
|
||||
"base": "IOP",
|
||||
"prefix": ""
|
||||
},
|
||||
"scope": "node",
|
||||
"aggregation": "sum",
|
||||
"timestep": 60,
|
||||
"peak": 6,
|
||||
"normal": 4,
|
||||
"caution": 2,
|
||||
"alert": 1
|
||||
},
|
||||
{
|
||||
"name": "nfs4_total",
|
||||
"unit": {
|
||||
"base": "IOP",
|
||||
"prefix": ""
|
||||
},
|
||||
"scope": "node",
|
||||
"aggregation": "sum",
|
||||
"timestep": 60,
|
||||
"peak": 6,
|
||||
"normal": 4,
|
||||
"caution": 2,
|
||||
"alert": 1
|
||||
}
|
||||
],
|
||||
"subClusters": [
|
||||
{
|
||||
"name": "haswell",
|
||||
"nodes": "w11[27-45,49-63,69-72]",
|
||||
"processorType": "Intel Xeon E3-1240 v3",
|
||||
"socketsPerNode": 1,
|
||||
"coresPerSocket": 4,
|
||||
"threadsPerCore": 1,
|
||||
"flopRateScalar": {
|
||||
"unit": {
|
||||
"base": "F/s",
|
||||
"prefix": "G"
|
||||
},
|
||||
"value": 14
|
||||
},
|
||||
"flopRateSimd": {
|
||||
"unit": {
|
||||
"base": "F/s",
|
||||
"prefix": "G"
|
||||
},
|
||||
"value": 112
|
||||
},
|
||||
"memoryBandwidth": {
|
||||
"unit": {
|
||||
"base": "B/s",
|
||||
"prefix": "G"
|
||||
},
|
||||
"value": 24
|
||||
},
|
||||
"topology": {
|
||||
"node": [
|
||||
0,
|
||||
1,
|
||||
2,
|
||||
3
|
||||
],
|
||||
"socket": [
|
||||
[
|
||||
0,
|
||||
1,
|
||||
2,
|
||||
3
|
||||
]
|
||||
],
|
||||
"memoryDomain": [
|
||||
[
|
||||
0,
|
||||
1,
|
||||
2,
|
||||
3
|
||||
]
|
||||
],
|
||||
"core": [
|
||||
[
|
||||
0
|
||||
],
|
||||
[
|
||||
1
|
||||
],
|
||||
[
|
||||
2
|
||||
],
|
||||
[
|
||||
3
|
||||
]
|
||||
]
|
||||
}
|
||||
},
|
||||
{
|
||||
"name": "skylake",
|
||||
"nodes": "w12[01-08],w13[01-31,33-56]",
|
||||
"processorType": "Intel Xeon E3-1240 v5 ",
|
||||
"socketsPerNode": 1,
|
||||
"coresPerSocket": 4,
|
||||
"threadsPerCore": 1,
|
||||
"flopRateScalar": {
|
||||
"unit": {
|
||||
"base": "F/s",
|
||||
"prefix": "G"
|
||||
},
|
||||
"value": 14
|
||||
},
|
||||
"flopRateSimd": {
|
||||
"unit": {
|
||||
"base": "F/s",
|
||||
"prefix": "G"
|
||||
},
|
||||
"value": 112
|
||||
},
|
||||
"memoryBandwidth": {
|
||||
"unit": {
|
||||
"base": "B/s",
|
||||
"prefix": "G"
|
||||
},
|
||||
"value": 24
|
||||
},
|
||||
"topology": {
|
||||
"node": [
|
||||
0,
|
||||
1,
|
||||
2,
|
||||
3
|
||||
],
|
||||
"socket": [
|
||||
[
|
||||
0,
|
||||
1,
|
||||
2,
|
||||
3
|
||||
]
|
||||
],
|
||||
"memoryDomain": [
|
||||
[
|
||||
0,
|
||||
1,
|
||||
2,
|
||||
3
|
||||
]
|
||||
],
|
||||
"core": [
|
||||
[
|
||||
0
|
||||
],
|
||||
[
|
||||
1
|
||||
],
|
||||
[
|
||||
2
|
||||
],
|
||||
[
|
||||
3
|
||||
]
|
||||
]
|
||||
}
|
||||
},
|
||||
{
|
||||
"name": "kabylake",
|
||||
"nodes": "w14[01-56],w15[01-05,07-56]",
|
||||
"processorType": "Intel Xeon E3-1240 v6",
|
||||
"socketsPerNode": 1,
|
||||
"coresPerSocket": 4,
|
||||
"threadsPerCore": 1,
|
||||
"flopRateScalar": {
|
||||
"unit": {
|
||||
"base": "F/s",
|
||||
"prefix": "G"
|
||||
},
|
||||
"value": 14
|
||||
},
|
||||
"flopRateSimd": {
|
||||
"unit": {
|
||||
"base": "F/s",
|
||||
"prefix": "G"
|
||||
},
|
||||
"value": 112
|
||||
},
|
||||
"memoryBandwidth": {
|
||||
"unit": {
|
||||
"base": "B/s",
|
||||
"prefix": "G"
|
||||
},
|
||||
"value": 24
|
||||
},
|
||||
"topology": {
|
||||
"node": [
|
||||
0,
|
||||
1,
|
||||
2,
|
||||
3
|
||||
],
|
||||
"socket": [
|
||||
[
|
||||
0,
|
||||
1,
|
||||
2,
|
||||
3
|
||||
]
|
||||
],
|
||||
"memoryDomain": [
|
||||
[
|
||||
0,
|
||||
1,
|
||||
2,
|
||||
3
|
||||
]
|
||||
],
|
||||
"core": [
|
||||
[
|
||||
0
|
||||
],
|
||||
[
|
||||
1
|
||||
],
|
||||
[
|
||||
2
|
||||
],
|
||||
[
|
||||
3
|
||||
]
|
||||
]
|
||||
}
|
||||
},
|
||||
{
|
||||
"name": "icelake",
|
||||
"nodes": "w22[01-35],w23[01-35],w24[01-20],w25[01-20]",
|
||||
"processorType": "Intel Xeon Gold 6326",
|
||||
"socketsPerNode": 2,
|
||||
"coresPerSocket": 16,
|
||||
"threadsPerCore": 1,
|
||||
"flopRateScalar": {
|
||||
"unit": {
|
||||
"base": "F/s",
|
||||
"prefix": "G"
|
||||
},
|
||||
"value": 432
|
||||
},
|
||||
"flopRateSimd": {
|
||||
"unit": {
|
||||
"base": "F/s",
|
||||
"prefix": "G"
|
||||
},
|
||||
"value": 9216
|
||||
},
|
||||
"memoryBandwidth": {
|
||||
"unit": {
|
||||
"base": "B/s",
|
||||
"prefix": "G"
|
||||
},
|
||||
"value": 350
|
||||
},
|
||||
"topology": {
|
||||
"node": [
|
||||
0,
|
||||
1,
|
||||
2,
|
||||
3,
|
||||
4,
|
||||
5,
|
||||
6,
|
||||
7,
|
||||
8,
|
||||
9,
|
||||
10,
|
||||
11,
|
||||
12,
|
||||
13,
|
||||
14,
|
||||
15,
|
||||
16,
|
||||
17,
|
||||
18,
|
||||
19,
|
||||
20,
|
||||
21,
|
||||
22,
|
||||
23,
|
||||
24,
|
||||
25,
|
||||
26,
|
||||
27,
|
||||
28,
|
||||
29,
|
||||
30,
|
||||
31,
|
||||
32,
|
||||
33,
|
||||
34,
|
||||
35,
|
||||
36,
|
||||
37,
|
||||
38,
|
||||
39,
|
||||
40,
|
||||
41,
|
||||
42,
|
||||
43,
|
||||
44,
|
||||
45,
|
||||
46,
|
||||
47,
|
||||
48,
|
||||
49,
|
||||
50,
|
||||
51,
|
||||
52,
|
||||
53,
|
||||
54,
|
||||
55,
|
||||
56,
|
||||
57,
|
||||
58,
|
||||
59,
|
||||
60,
|
||||
61,
|
||||
62,
|
||||
63,
|
||||
64,
|
||||
65,
|
||||
66,
|
||||
67,
|
||||
68,
|
||||
69,
|
||||
70,
|
||||
71
|
||||
],
|
||||
"socket": [
|
||||
[
|
||||
0,
|
||||
1,
|
||||
2,
|
||||
3,
|
||||
4,
|
||||
5,
|
||||
6,
|
||||
7,
|
||||
8,
|
||||
9,
|
||||
10,
|
||||
11,
|
||||
12,
|
||||
13,
|
||||
14,
|
||||
15,
|
||||
16,
|
||||
17,
|
||||
18,
|
||||
19,
|
||||
20,
|
||||
21,
|
||||
22,
|
||||
23,
|
||||
24,
|
||||
25,
|
||||
26,
|
||||
27,
|
||||
28,
|
||||
29,
|
||||
30,
|
||||
31,
|
||||
32,
|
||||
33,
|
||||
34,
|
||||
35
|
||||
],
|
||||
[
|
||||
36,
|
||||
37,
|
||||
38,
|
||||
39,
|
||||
40,
|
||||
41,
|
||||
42,
|
||||
43,
|
||||
44,
|
||||
45,
|
||||
46,
|
||||
47,
|
||||
48,
|
||||
49,
|
||||
50,
|
||||
51,
|
||||
52,
|
||||
53,
|
||||
54,
|
||||
55,
|
||||
56,
|
||||
57,
|
||||
58,
|
||||
59,
|
||||
60,
|
||||
61,
|
||||
62,
|
||||
63,
|
||||
64,
|
||||
65,
|
||||
66,
|
||||
67,
|
||||
68,
|
||||
69,
|
||||
70,
|
||||
71
|
||||
]
|
||||
],
|
||||
"memoryDomain": [
|
||||
[
|
||||
0,
|
||||
1,
|
||||
2,
|
||||
3,
|
||||
4,
|
||||
5,
|
||||
6,
|
||||
7,
|
||||
8,
|
||||
9,
|
||||
10,
|
||||
11,
|
||||
12,
|
||||
13,
|
||||
14,
|
||||
15,
|
||||
16,
|
||||
17
|
||||
],
|
||||
[
|
||||
18,
|
||||
19,
|
||||
20,
|
||||
21,
|
||||
22,
|
||||
23,
|
||||
24,
|
||||
25,
|
||||
26,
|
||||
27,
|
||||
28,
|
||||
29,
|
||||
30,
|
||||
31,
|
||||
32,
|
||||
33,
|
||||
34,
|
||||
35
|
||||
],
|
||||
[
|
||||
36,
|
||||
37,
|
||||
38,
|
||||
39,
|
||||
40,
|
||||
41,
|
||||
42,
|
||||
43,
|
||||
44,
|
||||
45,
|
||||
46,
|
||||
47,
|
||||
48,
|
||||
49,
|
||||
50,
|
||||
51,
|
||||
52,
|
||||
53
|
||||
],
|
||||
[
|
||||
54,
|
||||
55,
|
||||
56,
|
||||
57,
|
||||
58,
|
||||
59,
|
||||
60,
|
||||
61,
|
||||
62,
|
||||
63,
|
||||
64,
|
||||
65,
|
||||
66,
|
||||
67,
|
||||
68,
|
||||
69,
|
||||
70,
|
||||
71
|
||||
]
|
||||
],
|
||||
"core": [
|
||||
[
|
||||
0
|
||||
],
|
||||
[
|
||||
1
|
||||
],
|
||||
[
|
||||
2
|
||||
],
|
||||
[
|
||||
3
|
||||
],
|
||||
[
|
||||
4
|
||||
],
|
||||
[
|
||||
5
|
||||
],
|
||||
[
|
||||
6
|
||||
],
|
||||
[
|
||||
7
|
||||
],
|
||||
[
|
||||
8
|
||||
],
|
||||
[
|
||||
9
|
||||
],
|
||||
[
|
||||
10
|
||||
],
|
||||
[
|
||||
11
|
||||
],
|
||||
[
|
||||
12
|
||||
],
|
||||
[
|
||||
13
|
||||
],
|
||||
[
|
||||
14
|
||||
],
|
||||
[
|
||||
15
|
||||
],
|
||||
[
|
||||
16
|
||||
],
|
||||
[
|
||||
17
|
||||
],
|
||||
[
|
||||
18
|
||||
],
|
||||
[
|
||||
19
|
||||
],
|
||||
[
|
||||
20
|
||||
],
|
||||
[
|
||||
21
|
||||
],
|
||||
[
|
||||
22
|
||||
],
|
||||
[
|
||||
23
|
||||
],
|
||||
[
|
||||
24
|
||||
],
|
||||
[
|
||||
25
|
||||
],
|
||||
[
|
||||
26
|
||||
],
|
||||
[
|
||||
27
|
||||
],
|
||||
[
|
||||
28
|
||||
],
|
||||
[
|
||||
29
|
||||
],
|
||||
[
|
||||
30
|
||||
],
|
||||
[
|
||||
31
|
||||
],
|
||||
[
|
||||
32
|
||||
],
|
||||
[
|
||||
33
|
||||
],
|
||||
[
|
||||
34
|
||||
],
|
||||
[
|
||||
35
|
||||
],
|
||||
[
|
||||
36
|
||||
],
|
||||
[
|
||||
37
|
||||
],
|
||||
[
|
||||
38
|
||||
],
|
||||
[
|
||||
39
|
||||
],
|
||||
[
|
||||
40
|
||||
],
|
||||
[
|
||||
41
|
||||
],
|
||||
[
|
||||
42
|
||||
],
|
||||
[
|
||||
43
|
||||
],
|
||||
[
|
||||
44
|
||||
],
|
||||
[
|
||||
45
|
||||
],
|
||||
[
|
||||
46
|
||||
],
|
||||
[
|
||||
47
|
||||
],
|
||||
[
|
||||
48
|
||||
],
|
||||
[
|
||||
49
|
||||
],
|
||||
[
|
||||
50
|
||||
],
|
||||
[
|
||||
51
|
||||
],
|
||||
[
|
||||
52
|
||||
],
|
||||
[
|
||||
53
|
||||
],
|
||||
[
|
||||
54
|
||||
],
|
||||
[
|
||||
55
|
||||
],
|
||||
[
|
||||
56
|
||||
],
|
||||
[
|
||||
57
|
||||
],
|
||||
[
|
||||
58
|
||||
],
|
||||
[
|
||||
59
|
||||
],
|
||||
[
|
||||
60
|
||||
],
|
||||
[
|
||||
61
|
||||
],
|
||||
[
|
||||
62
|
||||
],
|
||||
[
|
||||
63
|
||||
],
|
||||
[
|
||||
64
|
||||
],
|
||||
[
|
||||
65
|
||||
],
|
||||
[
|
||||
66
|
||||
],
|
||||
[
|
||||
67
|
||||
],
|
||||
[
|
||||
68
|
||||
],
|
||||
[
|
||||
69
|
||||
],
|
||||
[
|
||||
70
|
||||
],
|
||||
[
|
||||
71
|
||||
]
|
||||
]
|
||||
}
|
||||
}
|
||||
]
|
||||
}
|
Reference in New Issue
Block a user