diff --git a/README.md b/README.md index a41ba17..52c97a6 100644 --- a/README.md +++ b/README.md @@ -1,2 +1,11 @@ -# cc-examples -Example configurations for ClusterCockpit installations +# Example configurations for ClusterCockpit installations + +This is a collection of overall ClusterCockpit configurations that are used in +production. + +We are aware that the configuration of ClusterCockpit is currently tedious, +requiring edits to many different files with partly redundant information. +Hopefully we can provide something simpler in the future. + +Please note that the subcluster sections in the job archive cluster config files +can (and should) be generated with [this](https://github.com/ClusterCockpit/cc-backend/blob/master/configs/generate-subcluster.pl) provided Perl script. diff --git a/fau-systems/README.md b/fau-systems/README.md new file mode 100644 index 0000000..0347ca8 --- /dev/null +++ b/fau-systems/README.md @@ -0,0 +1,30 @@ +# ClusterCockpit at NHR@FAU + +NHR@FAU provides a production instance of ClusterCockpit for support personnel +and users. Authentication is via an LDAP directory as well as via our HPC Portal +(homegrown account management platform) using JWT tokens. + +You can find an overview of all clusters +[here](https://hpc.fau.de/systems-services/documentation-instructions/). + +Some systems run with exclusive nodes, others have node sharing enabled. +There are CPU systems (Fritz, Meggie, Woody, TinyFat) as well as GPU-enabled +clusters (Alex, TinyGPUs). + +NHR@FAU uses the following stack: +* `cc-metric-collector` as node agent +* `cc-metric-store` as temporal metric timeseries cache. We use one instance for all clusters. +* `cc-backend` +* A homegrown Python script running on the management nodes for providing job +metadata from Slurm + +We also push the metric data to an InfluxDB instance for debugging purposes. 
+ +The backend and metric store run on the same dedicated Dell server running +Ubuntu Linux: +* Two Intel Xeon(R) Platinum 8352Y with 32 cores each +* 512 GB Main memory capacity +* A NVMe Raid with two 7TB disks + +This configuration is probably complete overkill, but we wanted to be on the +safe side. diff --git a/fau-systems/cc-backend/clustercockpit.service b/fau-systems/cc-backend/clustercockpit.service new file mode 100644 index 0000000..cf06726 --- /dev/null +++ b/fau-systems/cc-backend/clustercockpit.service @@ -0,0 +1,18 @@ +[Unit] +Description=ClusterCockpit Web Server (Go edition) +Documentation=https://github.com/ClusterCockpit/cc-backend +Wants=network-online.target +After=network-online.target +After=mariadb.service mysql.service + +[Service] +WorkingDirectory=/opt/monitoring/cc-backend +Type=notify +NotifyAccess=all +Restart=on-failure +RestartSec=30 +TimeoutStopSec=100 +ExecStart=/opt/monitoring/cc-backend/cc-backend -loglevel info -server -config ./config.json + +[Install] +WantedBy=multi-user.target diff --git a/fau-systems/cc-backend/config.json b/fau-systems/cc-backend/config.json new file mode 100644 index 0000000..c0d1587 --- /dev/null +++ b/fau-systems/cc-backend/config.json @@ -0,0 +1,158 @@ +{ + "addr": "0.0.0.0:443", + "stop-jobs-exceeding-walltime": 288000, + "ldap": { + "url": "ldaps://hpcldap.rrze.uni-erlangen.de", + "user_base": "ou=people,ou=hpc,dc=rrze,dc=uni-erlangen,dc=de", + "search_dn": "cn=hpcmonitoring,ou=roadm,ou=profile,ou=hpc,dc=rrze,dc=uni-erlangen,dc=de", + "user_bind": "uid={username},ou=people,ou=hpc,dc=rrze,dc=uni-erlangen,dc=de", + "user_filter": "(&(objectclass=posixAccount)(uid=*))", + "sync_interval": "24h" + }, + "https-cert-file": "/etc/letsencrypt/live/monitoring.nhr.fau.de/fullchain.pem", + "https-key-file": "/etc/letsencrypt/live/monitoring.nhr.fau.de/privkey.pem", + "user": "clustercockpit", + "group": "clustercockpit", + "archive": { + "kind": "file", + "path": "./var/job-archive", + "compression": 7, + 
"retention": { + "policy": "none" + } + }, + "clusters": [ + { + "name": "fritz", + "metricDataRepository": { + "kind": "cc-metric-store", + "url": "http://localhost:8082", + "token": "XZY" + }, + "filterRanges": { + "numNodes": { + "from": 1, + "to": 64 + }, + "duration": { + "from": 0, + "to": 86400 + }, + "startTime": { + "from": "2022-01-01T00:00:00Z", + "to": null + } + } + }, + { + "name": "alex", + "metricDataRepository": { + "kind": "cc-metric-store", + "url": "http://localhost:8082", + "token": "XZY" + }, + "filterRanges": { + "numNodes": { + "from": 1, + "to": 64 + }, + "duration": { + "from": 0, + "to": 86400 + }, + "startTime": { + "from": "2022-01-01T00:00:00Z", + "to": null + } + } + }, + { + "name": "woody", + "metricDataRepository": { + "kind": "cc-metric-store", + "url": "http://localhost:8082", + "token": "XZY" + }, + "filterRanges": { + "numNodes": { + "from": 1, + "to": 1 + }, + "duration": { + "from": 0, + "to": 172800 + }, + "startTime": { + "from": "2020-01-01T00:00:00Z", + "to": null + } + } + }, + { + "name": "tinyfat", + "metricDataRepository": { + "kind": "cc-metric-store", + "url": "http://localhost:8082", + "token": "XZY" + }, + "filterRanges": { + "numNodes": { + "from": 1, + "to": 1 + }, + "duration": { + "from": 0, + "to": 172800 + }, + "startTime": { + "from": "2020-01-01T00:00:00Z", + "to": null + } + } + }, + { + "name": "tinygpu", + "metricDataRepository": { + "kind": "cc-metric-store", + "url": "http://localhost:8082", + "token": "XZY" + }, + "filterRanges": { + "numNodes": { + "from": 1, + "to": 1 + }, + "duration": { + "from": 0, + "to": 172800 + }, + "startTime": { + "from": "2020-01-01T00:00:00Z", + "to": null + } + } + }, + { + "name": "meggie", + "metricDataRepository": { + "kind": "cc-metric-store", + "url": "http://localhost:8082", + "token": "XZY" + }, + "filterRanges": { + "numNodes": { + "from": 1, + "to": 64 + }, + "duration": { + "from": 0, + "to": 86400 + }, + "startTime": { + "from": "2018-01-01T00:00:00Z", + 
"to": null + } + } + } + ] +} diff --git a/fau-systems/cc-metric-collector/alex/collectors.json b/fau-systems/cc-metric-collector/alex/collectors.json new file mode 100644 index 0000000..830f0b8 --- /dev/null +++ b/fau-systems/cc-metric-collector/alex/collectors.json @@ -0,0 +1,145 @@ +{ + "nfs4stat" : {}, + "memstat" : { + "numa_stats": true, + "node_stats": true + }, + "cpustat" : {}, + "loadavg" : {}, + "schedstat": {}, + "nvidia" : { + "use_pci_info_as_type_id": true + }, + "lustrestat" : { + "send_all_metrics" : true, + "use_sudo": false, + "send_diff_values": true, + "send_derived_values": true, + "send_abs_values": false + }, + "netstat" : { + "include_devices" : [ + "enp1s0", + "enp70s0f0", + "enp195s0f0" + ], + "send_abs_values": true, + "send_derived_values": true + }, + "diskstat" : {}, + "iostat" : {}, + "ibstat" : { + "send_abs_values": true, + "send_derived_values": true + }, + "ipmistat" : { + "send_abs_values": true, + "send_derived_values": true + }, + "tempstat" : { + "tag_override" : { + "hwmon0" : { + "type" : "socket", + "type-id" : "0" + }, + "hwmon1" : { + "type" : "socket", + "type-id" : "1" + } + } + }, + "likwid": { + "force_overwrite" : true, + "invalid_to_zero" : true, + "access_mode" : "accessdaemon", + "accessdaemon_path" : "/apps/likwid/system/sbin", + "liblikwid_path": "/apps/likwid/system/lib/liblikwid.so", + "eventsets": [ + { + "events": { + "FIXC1": "ACTUAL_CPU_CLOCK", + "FIXC2": "MAX_CPU_CLOCK", + "PMC0": "RETIRED_INSTRUCTIONS", + "PMC1": "CPU_CLOCKS_UNHALTED", + "PMC2": "RETIRED_SSE_AVX_FLOPS_ALL", + "PMC3": "MERGE", + "DFC0": "DRAM_CHANNEL_0", + "DFC1": "DRAM_CHANNEL_1", + "DFC2": "DRAM_CHANNEL_2", + "DFC3": "DRAM_CHANNEL_3" + }, + "metrics": [ + { + "name": "ipc", + "calc": "PMC0/PMC1", + "type": "hwthread", + "publish": true + }, + { + "name": "flops_any", + "calc": "1E-9*PMC2/time", + "unit": "GFlops/s", + "type": "hwthread", + "publish": true + }, + { + "name": "clock", + "calc": "1E-6*(FIXC1/FIXC2)/inverseClock", + 
"type": "hwthread", + "unit": "MHz", + "publish": true + }, + { + "name": "mem1", + "calc": "1E-9*(DFC0+DFC1+DFC2+DFC3)*64.0/time", + "unit": "Gbyte/s", + "type": "socket", + "publish": false + } + ] + }, + { + "events": { + "DFC0": "DRAM_CHANNEL_4", + "DFC1": "DRAM_CHANNEL_5", + "DFC2": "DRAM_CHANNEL_6", + "DFC3": "DRAM_CHANNEL_7", + "PWR0": "RAPL_CORE_ENERGY", + "PWR1": "RAPL_PKG_ENERGY" + }, + "metrics": [ + { + "name": "core_power", + "calc": "PWR0/time", + "unit": "Watt", + "type": "hwthread", + "publish": true + }, + { + "name": "cpu_power", + "calc": "PWR1/time", + "type": "socket", + "unit": "Watt", + "publish": true + }, + { + "name": "mem2", + "calc": "1E-9*(DFC0+DFC1+DFC2+DFC3)*64.0/time", + "unit": "Gbyte/s", + "type": "socket", + "publish": false + } + ] + } + ], + "globalmetrics": [ + { + "name": "mem_bw", + "calc": "mem1+mem2", + "type": "socket", + "unit": "Gbyte/s", + "publish": true + } + ] + } +} diff --git a/fau-systems/cc-metric-collector/alex/config.json b/fau-systems/cc-metric-collector/alex/config.json new file mode 100644 index 0000000..308b08f --- /dev/null +++ b/fau-systems/cc-metric-collector/alex/config.json @@ -0,0 +1,8 @@ +{ + "sinks": "/etc/cc-metric-collector/sinks.json", + "collectors" : "/etc/cc-metric-collector/collectors.json", + "receivers" : "/etc/cc-metric-collector/receivers.json", + "router" : "/etc/cc-metric-collector/router.json", + "interval": "60s", + "duration": "10s" +} diff --git a/fau-systems/cc-metric-collector/alex/receivers.json b/fau-systems/cc-metric-collector/alex/receivers.json new file mode 100644 index 0000000..0967ef4 --- /dev/null +++ b/fau-systems/cc-metric-collector/alex/receivers.json @@ -0,0 +1 @@ +{} diff --git a/fau-systems/cc-metric-collector/alex/router.json b/fau-systems/cc-metric-collector/alex/router.json new file mode 100644 index 0000000..bb73852 --- /dev/null +++ b/fau-systems/cc-metric-collector/alex/router.json @@ -0,0 +1,58 @@ +{ + "add_tags" : [ + { + "key" : "cluster", + "value" : 
"alex", + "if" : "*" + } + ], + "rename_metrics" : { + "load_one" : "cpu_load", + "cpu_load_core" : "cpu_load", + "net_bytes_in_bw" : "net_bytes_in", + "net_bytes_out_bw" : "net_bytes_out", + "net_pkts_in_bw" : "net_pkts_in", + "net_pkts_out_bw" : "net_pkts_out", + "ib_recv_bw" : "ib_recv", + "ib_xmit_bw" : "ib_xmit", + "ib_recv_pkts_bw": "ib_recv_pkts", + "ib_xmit_pkts_bw": "ib_xmit_pkts", + "lustre_read_bytes_diff" : "lustre_read_bytes", + "lustre_read_requests_diff" : "lustre_read_requests", + "lustre_write_bytes_diff" : "lustre_write_bytes", + "lustre_write_requests_diff" : "lustre_write_requests", + "lustre_open_diff" : "lustre_open", + "lustre_close_diff" : "lustre_close", + "lustre_setattr_diff" : "lustre_setattr", + "lustre_getattr_diff" : "lustre_getattr", + "lustre_statfs_diff": "lustre_statfs", + "lustre_inode_permission_diff" : "lustre_inode_permission", + "nv_util" : "acc_utilization", + "nv_fb_mem_used" : "acc_mem_used", + "nv_power_usage" : "acc_power" + }, + "drop_metrics" : [ + "net_bytes_in", + "net_bytes_out", + "ib_recv", + "ib_xmit", + "ib_recv_pkts", + "ib_xmit_pkts", + "net_pkts_in", + "net_pkts_out", + "lustre_read_bytes", + "lustre_read_requests", + "lustre_write_bytes", + "lustre_write_requests" + ], + "interval_timestamp" : false, + "num_cache_intervals" : 0, + "change_unit_prefix": { + "mem_used": "G", + "swap_used": "G", + "mem_total": "G", + "swap_total": "G", + "cpufreq": "M" + }, + "normalize_metrics" : true +} diff --git a/fau-systems/cc-metric-collector/alex/sinks.json b/fau-systems/cc-metric-collector/alex/sinks.json new file mode 100644 index 0000000..75e092a --- /dev/null +++ b/fau-systems/cc-metric-collector/alex/sinks.json @@ -0,0 +1,26 @@ +{ + "nhrinflux": { + "type": "influxasync", + "host": "monitoring-test.nhr.uni-erlangen.de", + "port": "8086", + "organization": "ClusterCockpit", + "database": "alex", + "password": "XYZ", + "ssl": true, + "meta_as_tags": [ + "unit" + ] + }, + "alexstore": { + "type": "http", + "url": 
"http://monitoring.nhr.fau.de:8082/api/write?cluster=alex", + "jwt": "XYZ", + "meta_as_tags": [ + "unit" + ], + "idle_connection_timeout": "60s", + "flush_delay": "2s", + "max_retries": 1, + "timeout": "10s" + } +} diff --git a/fau-systems/cc-metric-collector/alex/sinks_debug.json b/fau-systems/cc-metric-collector/alex/sinks_debug.json new file mode 100644 index 0000000..5f7215f --- /dev/null +++ b/fau-systems/cc-metric-collector/alex/sinks_debug.json @@ -0,0 +1,33 @@ +{ + "fritzganglia": { + "type": "libganglia", + "gmond_config": "/etc/ganglia/gmond.conf", + "libganglia_path": "libganglia.so.0", + "add_ganglia_group": true + }, + "nhrinflux": { + "type": "influxasync", + "host": "monitoring-test.nhr.uni-erlangen.de", + "port": "8086", + "organization": "ClusterCockpit", + "database": "fritz_neu", + "password": "XZY", + "ssl": true, + "meta_as_tags": [ + "unit" + ] + }, + "fritzstore": { + "type": "http", + "url": "http://monitoring.nhr.fau.de:8082/api/write?cluster=fritz", + "jwt": "XZY", + "meta_as_tags": [ + "unit" + ], + "idle_connection_timeout": "60s" + }, + "debugstdout": { + "type": "stdout", + "output_file": "/tmp/debug.log" + } +} diff --git a/fau-systems/cc-metric-collector/fritz.spr/collectors.json b/fau-systems/cc-metric-collector/fritz.spr/collectors.json new file mode 100644 index 0000000..44a7649 --- /dev/null +++ b/fau-systems/cc-metric-collector/fritz.spr/collectors.json @@ -0,0 +1,43 @@ +{ + "nfs4stat" : {}, + "memstat" : { + "node_stats": true + }, + "cpustat" : {}, + "loadavg" : {}, + "lustrestat" : { + "send_all_metrics" : true, + "use_sudo": false, + "send_diff_values": true, + "send_derived_values": true, + "send_abs_values": false + }, + "netstat" : { + "include_devices" : [ + "enp1s0", + "enp22s0" + ], + "send_abs_values": true, + "send_derived_values": true + }, + "diskstat" : {}, + "iostat" : {}, + "ibstat" : { + "send_abs_values": true, + "send_derived_values": true + }, + "tempstat" : { + "tag_override" : { + "hwmon0" : { + "type" : 
"socket", + "type-id" : "0" + }, + "hwmon1" : { + "type" : "socket", + "type-id" : "1" + } + } + }, + "cpufreq_cpuinfo": {}, + "nfsiostat": {} +} diff --git a/fau-systems/cc-metric-collector/fritz.spr/config.json b/fau-systems/cc-metric-collector/fritz.spr/config.json new file mode 100644 index 0000000..3a91189 --- /dev/null +++ b/fau-systems/cc-metric-collector/fritz.spr/config.json @@ -0,0 +1,8 @@ +{ + "sinks": "/home/hpc/unrz/unrz139/Work/cc-metric-collector/configs/fritz.spr2/sinks2.json", + "collectors" : "/home/hpc/unrz/unrz139/Work/cc-metric-collector/configs/fritz.spr2/collectors.json", + "receivers" : "/home/hpc/unrz/unrz139/Work/cc-metric-collector/configs/fritz.spr2/receivers.json", + "router" : "/home/hpc/unrz/unrz139/Work/cc-metric-collector/configs/fritz.spr2/router.json", + "interval": "60s", + "duration": "10s" +} diff --git a/fau-systems/cc-metric-collector/fritz.spr/receivers.json b/fau-systems/cc-metric-collector/fritz.spr/receivers.json new file mode 100644 index 0000000..0967ef4 --- /dev/null +++ b/fau-systems/cc-metric-collector/fritz.spr/receivers.json @@ -0,0 +1 @@ +{} diff --git a/fau-systems/cc-metric-collector/fritz.spr/router.json b/fau-systems/cc-metric-collector/fritz.spr/router.json new file mode 100644 index 0000000..09b3813 --- /dev/null +++ b/fau-systems/cc-metric-collector/fritz.spr/router.json @@ -0,0 +1,54 @@ +{ + "add_tags" : [ + { + "key" : "cluster", + "value" : "fritz", + "if" : "*" + } + ], + "rename_metrics" : { + "load_one" : "cpu_load", + "net_bytes_in_bw" : "net_bytes_in", + "net_bytes_out_bw" : "net_bytes_out", + "net_pkts_in_bw" : "net_pkts_in", + "net_pkts_out_bw" : "net_pkts_out", + "ib_recv_bw" : "ib_recv", + "ib_xmit_bw" : "ib_xmit", + "ib_recv_pkts_bw": "ib_recv_pkts", + "ib_xmit_pkts_bw": "ib_xmit_pkts", + "lustre_read_bytes_diff" : "lustre_read_bytes", + "lustre_read_requests_diff" : "lustre_read_requests", + "lustre_write_bytes_diff" : "lustre_write_bytes", + "lustre_write_requests_diff" : 
"lustre_write_requests", + "lustre_open_diff" : "lustre_open", + "lustre_close_diff" : "lustre_close", + "lustre_setattr_diff" : "lustre_setattr", + "lustre_getattr_diff" : "lustre_getattr", + "lustre_statfs_diff": "lustre_statfs", + "lustre_inode_permission_diff" : "lustre_inode_permission", + "cpufreq" : "clock" + }, + "drop_metrics" : [ + "net_bytes_in", + "net_bytes_out", + "ib_recv", + "ib_xmit", + "ib_recv_pkts", + "ib_xmit_pkts", + "net_pkts_in", + "net_pkts_out", + "lustre_read_bytes", + "lustre_read_requests", + "lustre_write_bytes", + "lustre_write_requests" + ], + "interval_timestamp" : false, + "num_cache_intervals" : 0, + "change_unit_prefix": { + "mem_used": "G", + "swap_used": "G", + "mem_total": "G", + "swap_total": "G" + }, + "normalize_metrics" : true +} diff --git a/fau-systems/cc-metric-collector/fritz.spr/sinks.json b/fau-systems/cc-metric-collector/fritz.spr/sinks.json new file mode 100644 index 0000000..c7085c7 --- /dev/null +++ b/fau-systems/cc-metric-collector/fritz.spr/sinks.json @@ -0,0 +1,32 @@ +{ + "fritzganglia": { + "type": "libganglia", + "gmond_config": "/etc/ganglia/gmond.conf", + "libganglia_path": "libganglia.so.0", + "add_ganglia_group": true + }, + "nhrinflux": { + "type": "influxasync", + "host": "monitoring-test.nhr.uni-erlangen.de", + "port": "8086", + "organization": "ClusterCockpit", + "database": "fritz_neu", + "password": "XZY", + "ssl": true, + "meta_as_tags": [ + "unit" + ] + }, + "fritzstore": { + "type": "http", + "url": "http://monitoring.nhr.fau.de:8082/api/write?cluster=fritz", + "jwt": "XZY", + "meta_as_tags": [ + "unit" + ], + "idle_connection_timeout": "60s", + "flush_delay": "2s", + "max_retries": 1, + "timeout": "10s" + } +} diff --git a/fau-systems/cc-metric-collector/fritz/collectors.json b/fau-systems/cc-metric-collector/fritz/collectors.json new file mode 100644 index 0000000..4c242bf --- /dev/null +++ b/fau-systems/cc-metric-collector/fritz/collectors.json @@ -0,0 +1,184 @@ +{ + "nfs4stat" : {}, + 
"memstat" : { + "node_stats": true + }, + "cpustat" : {}, + "loadavg" : {}, + "lustrestat" : { + "send_all_metrics" : true, + "use_sudo": false, + "send_diff_values": true, + "send_derived_values": true, + "send_abs_values": false + }, + "netstat" : { + "include_devices" : [ + "enp1s0", + "enp22s0" + ], + "send_abs_values": true, + "send_derived_values": true + }, + "diskstat" : {}, + "iostat" : {}, + "ibstat" : { + "send_abs_values": true, + "send_derived_values": true + }, + "tempstat" : { + "tag_override" : { + "hwmon0" : { + "type" : "socket", + "type-id" : "0" + }, + "hwmon1" : { + "type" : "socket", + "type-id" : "1" + } + } + }, + "nfsiostat": {}, + "likwid": { + "force_overwrite" : true, + "invalid_to_zero" : true, + "access_mode" : "accessdaemon", + "accessdaemon_path" : "/apps/likwid/system/sbin", + "liblikwid_path": "/apps/likwid/system/lib/liblikwid.so", + "eventsets": [ + { + "events": { + "FIXC0": "INSTR_RETIRED_ANY", + "FIXC1": "CPU_CLK_UNHALTED_CORE", + "FIXC2": "CPU_CLK_UNHALTED_REF", + "MBOX0C0": "CAS_COUNT_RD", + "MBOX0C1": "CAS_COUNT_WR", + "MBOX1C0": "CAS_COUNT_RD", + "MBOX1C1": "CAS_COUNT_WR", + "MBOX2C0": "CAS_COUNT_RD", + "MBOX2C1": "CAS_COUNT_WR", + "MBOX3C0": "CAS_COUNT_RD", + "MBOX3C1": "CAS_COUNT_WR", + "MBOX4C0": "CAS_COUNT_RD", + "MBOX4C1": "CAS_COUNT_WR", + "MBOX5C0": "CAS_COUNT_RD", + "MBOX5C1": "CAS_COUNT_WR", + "MBOX6C0": "CAS_COUNT_RD", + "MBOX6C1": "CAS_COUNT_WR", + "MBOX7C0": "CAS_COUNT_RD", + "MBOX7C1": "CAS_COUNT_WR", + "PMC0": "FP_ARITH_INST_RETIRED_128B_PACKED_DOUBLE", + "PMC1": "FP_ARITH_INST_RETIRED_SCALAR_DOUBLE", + "PMC2": "FP_ARITH_INST_RETIRED_256B_PACKED_DOUBLE", + "PMC3": "FP_ARITH_INST_RETIRED_512B_PACKED_DOUBLE", + "PWR0": "PWR_PKG_ENERGY", + "PWR3": "PWR_DRAM_ENERGY" + }, + "metrics": [ + { + "calc": "0.000001*(FIXC1/FIXC2)/inverseClock", + "name": "clock", + "publish": true, + "unit": "MHz", + "type": "hwthread" + }, + { + "calc": "FIXC0/FIXC1", + "name": "ipc", + "publish": true, + "type": "hwthread" + }, + { + 
"calc": "PWR0/time", + "name": "cpu_power", + "publish": true, + "unit": "Watt", + "type": "socket" + }, + { + "calc": "PWR3/time", + "name": "mem_power", + "unit": "Watt", + "publish": true, + "type": "socket" + }, + { + "calc": "1E-9*(PMC0*2.0+PMC1+PMC2*4.0+PMC3*8.0)/time", + "name": "flops_dp", + "unit": "GFlops/s", + "publish": true, + "type": "hwthread" + }, + { + "calc": "1E-9*(MBOX0C0+MBOX1C0+MBOX2C0+MBOX3C0+MBOX4C0+MBOX5C0+MBOX6C0+MBOX7C0+MBOX0C1+MBOX1C1+MBOX2C1+MBOX3C1+MBOX4C1+MBOX5C1+MBOX6C1+MBOX7C1)*64.0/time", + "name": "mem_bw", + "unit": "GBytes/s", + "publish": true, + "type": "socket" + }, + { + "calc": "PMC0+PMC2+PMC3", + "name": "dp_vec_ins", + "type": "hwthread", + "publish": false + }, + { + "calc": "PMC0+PMC1+PMC2+PMC3", + "name": "dp_fp_ins", + "type": "hwthread", + "publish": false + } + ] + }, + { + "events": { + "FIXC0": "INSTR_RETIRED_ANY", + "FIXC1": "CPU_CLK_UNHALTED_CORE", + "FIXC2": "CPU_CLK_UNHALTED_REF", + "FIXC3": "TOPDOWN_SLOTS", + "PMC0": "FP_ARITH_INST_RETIRED_128B_PACKED_SINGLE", + "PMC1": "FP_ARITH_INST_RETIRED_SCALAR_SINGLE", + "PMC2": "FP_ARITH_INST_RETIRED_256B_PACKED_SINGLE", + "PMC3": "FP_ARITH_INST_RETIRED_512B_PACKED_SINGLE" + }, + "metrics": [ + { + "calc": "1E-9*(PMC0*4.0+PMC1+PMC2*8.0+PMC3*16.0)/time", + "name": "flops_sp", + "unit": "GFlops/s", + "publish": true, + "type": "hwthread" + }, + { + "calc": "PMC0+PMC2+PMC3", + "name": "sp_vec_ins", + "type": "hwthread", + "publish": false + }, + { + "calc": "PMC0+PMC1+PMC2+PMC3", + "name": "sp_fp_ins", + "type": "hwthread", + "publish": false + } + ] + } + ], + "globalmetrics": [ + { + "calc": "100*((sp_vec_ins+dp_vec_ins)/(sp_fp_ins+dp_fp_ins))", + "name": "vectorization_ratio", + "unit": "%", + "type": "hwthread", + "publish": true + }, + { + "calc": "(flops_dp * 2) + flops_sp", + "name": "flops_any", + "unit": "GFlops/s", + "type": "hwthread", + "publish": true + } + ] + } +} diff --git a/fau-systems/cc-metric-collector/fritz/config.json 
b/fau-systems/cc-metric-collector/fritz/config.json new file mode 100644 index 0000000..308b08f --- /dev/null +++ b/fau-systems/cc-metric-collector/fritz/config.json @@ -0,0 +1,8 @@ +{ + "sinks": "/etc/cc-metric-collector/sinks.json", + "collectors" : "/etc/cc-metric-collector/collectors.json", + "receivers" : "/etc/cc-metric-collector/receivers.json", + "router" : "/etc/cc-metric-collector/router.json", + "interval": "60s", + "duration": "10s" +} diff --git a/fau-systems/cc-metric-collector/fritz/receivers.json b/fau-systems/cc-metric-collector/fritz/receivers.json new file mode 100644 index 0000000..0967ef4 --- /dev/null +++ b/fau-systems/cc-metric-collector/fritz/receivers.json @@ -0,0 +1 @@ +{} diff --git a/fau-systems/cc-metric-collector/fritz/router.json b/fau-systems/cc-metric-collector/fritz/router.json new file mode 100644 index 0000000..31d11ce --- /dev/null +++ b/fau-systems/cc-metric-collector/fritz/router.json @@ -0,0 +1,53 @@ +{ + "add_tags" : [ + { + "key" : "cluster", + "value" : "fritz", + "if" : "*" + } + ], + "rename_metrics" : { + "load_one" : "cpu_load", + "net_bytes_in_bw" : "net_bytes_in", + "net_bytes_out_bw" : "net_bytes_out", + "net_pkts_in_bw" : "net_pkts_in", + "net_pkts_out_bw" : "net_pkts_out", + "ib_recv_bw" : "ib_recv", + "ib_xmit_bw" : "ib_xmit", + "ib_recv_pkts_bw": "ib_recv_pkts", + "ib_xmit_pkts_bw": "ib_xmit_pkts", + "lustre_read_bytes_diff" : "lustre_read_bytes", + "lustre_read_requests_diff" : "lustre_read_requests", + "lustre_write_bytes_diff" : "lustre_write_bytes", + "lustre_write_requests_diff" : "lustre_write_requests", + "lustre_open_diff" : "lustre_open", + "lustre_close_diff" : "lustre_close", + "lustre_setattr_diff" : "lustre_setattr", + "lustre_getattr_diff" : "lustre_getattr", + "lustre_statfs_diff": "lustre_statfs", + "lustre_inode_permission_diff" : "lustre_inode_permission" + }, + "drop_metrics" : [ + "net_bytes_in", + "net_bytes_out", + "ib_recv", + "ib_xmit", + "ib_recv_pkts", + "ib_xmit_pkts", + "net_pkts_in", 
+ "net_pkts_out", + "lustre_read_bytes", + "lustre_read_requests", + "lustre_write_bytes", + "lustre_write_requests" + ], + "interval_timestamp" : false, + "num_cache_intervals" : 0, + "change_unit_prefix": { + "mem_used": "G", + "swap_used": "G", + "mem_total": "G", + "swap_total": "G" + }, + "normalize_metrics" : true +} diff --git a/fau-systems/cc-metric-collector/fritz/sinks.json b/fau-systems/cc-metric-collector/fritz/sinks.json new file mode 100644 index 0000000..bfe5c66 --- /dev/null +++ b/fau-systems/cc-metric-collector/fritz/sinks.json @@ -0,0 +1,32 @@ +{ + "fritzganglia": { + "type": "libganglia", + "gmond_config": "/etc/ganglia/gmond.conf", + "libganglia_path": "libganglia.so.0", + "add_ganglia_group": true + }, + "nhrinflux": { + "type": "influxasync", + "host": "monitoring-test.nhr.uni-erlangen.de", + "port": "8086", + "organization": "ClusterCockpit", + "database": "fritz_neu", + "password": "XYZ", + "ssl": true, + "meta_as_tags": [ + "unit" + ] + }, + "fritzstore": { + "type": "http", + "url": "http://monitoring.nhr.fau.de:8082/api/write?cluster=fritz", + "jwt": "XYZ", + "meta_as_tags": [ + "unit" + ], + "idle_connection_timeout": "60s", + "flush_delay": "2s", + "max_retries": 1, + "timeout": "10s" + } +} diff --git a/fau-systems/cc-metric-collector/fritz/sinks_debug.json b/fau-systems/cc-metric-collector/fritz/sinks_debug.json new file mode 100644 index 0000000..5f7215f --- /dev/null +++ b/fau-systems/cc-metric-collector/fritz/sinks_debug.json @@ -0,0 +1,33 @@ +{ + "fritzganglia": { + "type": "libganglia", + "gmond_config": "/etc/ganglia/gmond.conf", + "libganglia_path": "libganglia.so.0", + "add_ganglia_group": true + }, + "nhrinflux": { + "type": "influxasync", + "host": "monitoring-test.nhr.uni-erlangen.de", + "port": "8086", + "organization": "ClusterCockpit", + "database": "fritz_neu", + "password": "XZY", + "ssl": true, + "meta_as_tags": [ + "unit" + ] + }, + "fritzstore": { + "type": "http", + "url": 
"http://monitoring.nhr.fau.de:8082/api/write?cluster=fritz", + "jwt": "XZY", + "meta_as_tags": [ + "unit" + ], + "idle_connection_timeout": "60s" + }, + "debugstdout": { + "type": "stdout", + "output_file": "/tmp/debug.log" + } +} diff --git a/fau-systems/cc-metric-collector/meggie-ng/.gitkeep b/fau-systems/cc-metric-collector/meggie-ng/.gitkeep new file mode 100644 index 0000000..e69de29 diff --git a/fau-systems/cc-metric-collector/meggie-ng/collectors.json b/fau-systems/cc-metric-collector/meggie-ng/collectors.json new file mode 100644 index 0000000..0a83eab --- /dev/null +++ b/fau-systems/cc-metric-collector/meggie-ng/collectors.json @@ -0,0 +1,161 @@ +{ + "diskstat" : {}, + "ibstat" : {}, + "nfs4stat" : {}, + "cpustat" : {}, + "loadavg" : {}, + "iostat" : {}, + "netstat" : { + "include_devices" : [ + "eno1" + ], + "send_abs_values" : true, + "send_derived_values" : true + }, + "tempstat" : { + "tag_override" : { + "hwmon1" : { + "type" : "socket", + "type-id" : "0" + }, + "hwmon2" : { + "type" : "socket", + "type-id" : "1" + } + } + }, + "memstat" : { + "node_stats": true + }, + "likwid": { + "force_overwrite" : true, + "invalid_to_zero" : true, + "liblikwid_path": "/apps/likwid/system/lib/liblikwid.so", + "access_mode" : "accessdaemon", + "accessdaemon_path" : "/apps/likwid/system/sbin", + "eventsets": [ + { + "events": { + "FIXC0": "INSTR_RETIRED_ANY", + "FIXC1": "CPU_CLK_UNHALTED_CORE", + "FIXC2": "CPU_CLK_UNHALTED_REF", + "MBOX0C0": "CAS_COUNT_RD", + "MBOX0C1": "CAS_COUNT_WR", + "MBOX1C0": "CAS_COUNT_RD", + "MBOX1C1": "CAS_COUNT_WR", + "MBOX2C0": "CAS_COUNT_RD", + "MBOX2C1": "CAS_COUNT_WR", + "MBOX3C0": "CAS_COUNT_RD", + "MBOX3C1": "CAS_COUNT_WR", + "PMC0": "FP_ARITH_INST_RETIRED_128B_PACKED_DOUBLE", + "PMC1": "FP_ARITH_INST_RETIRED_SCALAR_DOUBLE", + "PMC2": "FP_ARITH_INST_RETIRED_256B_PACKED_DOUBLE", + "PWR0": "PWR_PKG_ENERGY", + "PWR3": "PWR_DRAM_ENERGY" + }, + "metrics": [ + { + "calc": "1E-6*(FIXC1/FIXC2)/inverseClock", + "name": "clock", + "publish": 
true, + "unit": "MHz", + "type": "hwthread" + }, + { + "calc": "FIXC0/FIXC1", + "name": "ipc", + "publish": true, + "type": "hwthread" + }, + { + "calc": "PWR0/time", + "name": "cpu_power", + "publish": true, + "unit": "Watt", + "type": "socket" + }, + { + "calc": "PWR3/time", + "name": "mem_power", + "publish": true, + "unit": "Watt", + "type": "socket" + }, + { + "calc": "1E-9*(PMC0*2.0+PMC1+PMC2*4.0)/time", + "name": "flops_dp", + "publish": true, + "unit": "GFlops/s", + "type": "hwthread" + }, + { + "calc": "1E-9*(MBOX0C0+MBOX1C0+MBOX2C0+MBOX3C0+MBOX0C1+MBOX1C1+MBOX2C1+MBOX3C1)*64.0/time", + "name": "mem_bw", + "publish": true, + "unit": "GBytes/s", + "type": "socket" + }, + { + "calc": "PMC0+PMC2", + "name": "dp_vec_ins", + "type": "hwthread", + "publish": false + }, + { + "calc": "PMC0+PMC1+PMC2", + "name": "dp_fp_ins", + "type": "hwthread", + "publish": false + } + ] + }, + { + "events": { + "FIXC0": "INSTR_RETIRED_ANY", + "FIXC1": "CPU_CLK_UNHALTED_CORE", + "FIXC2": "CPU_CLK_UNHALTED_REF", + "PMC0": "FP_ARITH_INST_RETIRED_128B_PACKED_SINGLE", + "PMC1": "FP_ARITH_INST_RETIRED_SCALAR_SINGLE", + "PMC2": "FP_ARITH_INST_RETIRED_256B_PACKED_SINGLE" + }, + "metrics": [ + { + "calc": "1E-9*(PMC0*4.0+PMC1+PMC2*8.0)/time", + "name": "flops_sp", + "unit": "GFlops/s", + "publish": true, + "type": "hwthread" + }, + { + "calc": "PMC0+PMC2", + "name": "sp_vec_ins", + "type": "hwthread", + "publish": false + }, + { + "calc": "PMC0+PMC1+PMC2", + "name": "sp_fp_ins", + "type": "hwthread", + "publish": false + } + ] + } + ], + "globalmetrics": [ + { + "calc": "100*((sp_vec_ins+dp_vec_ins)/(sp_fp_ins+dp_fp_ins))", + "name": "vectorization_ratio", + "type": "hwthread", + "unit": "%", + "publish": true + }, + { + "calc": "(flops_dp * 2) + flops_sp", + "name": "flops_any", + "type": "hwthread", + "unit": "GFlops/s", + "publish": true + } + ] + } +} diff --git a/fau-systems/cc-metric-collector/meggie-ng/config.json b/fau-systems/cc-metric-collector/meggie-ng/config.json new file 
mode 100644 index 0000000..308b08f --- /dev/null +++ b/fau-systems/cc-metric-collector/meggie-ng/config.json @@ -0,0 +1,8 @@ +{ + "sinks": "/etc/cc-metric-collector/sinks.json", + "collectors" : "/etc/cc-metric-collector/collectors.json", + "receivers" : "/etc/cc-metric-collector/receivers.json", + "router" : "/etc/cc-metric-collector/router.json", + "interval": "60s", + "duration": "10s" +} diff --git a/fau-systems/cc-metric-collector/meggie-ng/receivers.json b/fau-systems/cc-metric-collector/meggie-ng/receivers.json new file mode 100644 index 0000000..0967ef4 --- /dev/null +++ b/fau-systems/cc-metric-collector/meggie-ng/receivers.json @@ -0,0 +1 @@ +{} diff --git a/fau-systems/cc-metric-collector/meggie-ng/router.json b/fau-systems/cc-metric-collector/meggie-ng/router.json new file mode 100644 index 0000000..3eb232e --- /dev/null +++ b/fau-systems/cc-metric-collector/meggie-ng/router.json @@ -0,0 +1,53 @@ +{ + "add_tags" : [ + { + "key" : "cluster", + "value" : "meggie", + "if" : "*" + } + ], + "rename_metrics" : { + "load_one" : "cpu_load", + "net_bytes_in_bw" : "net_bytes_in", + "net_bytes_out_bw" : "net_bytes_out", + "net_pkts_in_bw" : "net_pkts_in", + "net_pkts_out_bw" : "net_pkts_out", + "ib_recv_bw" : "ib_recv", + "ib_xmit_bw" : "ib_xmit", + "ib_recv_pkts_bw": "ib_recv_pkts", + "ib_xmit_pkts_bw": "ib_xmit_pkts", + "lustre_read_bytes_diff" : "lustre_read_bytes", + "lustre_read_requests_diff" : "lustre_read_requests", + "lustre_write_bytes_diff" : "lustre_write_bytes", + "lustre_write_requests_diff" : "lustre_write_requests", + "lustre_open_diff" : "lustre_open", + "lustre_close_diff" : "lustre_close", + "lustre_setattr_diff" : "lustre_setattr", + "lustre_getattr_diff" : "lustre_getattr", + "lustre_statfs_diff": "lustre_statfs", + "lustre_inode_permission_diff" : "lustre_inode_permission" + }, + "drop_metrics" : [ + "net_bytes_in", + "net_bytes_out", + "ib_recv", + "ib_xmit", + "ib_recv_pkts", + "ib_xmit_pkts", + "net_pkts_in", + "net_pkts_out", + 
"lustre_read_bytes", + "lustre_read_requests", + "lustre_write_bytes", + "lustre_write_requests" + ], + "interval_timestamp" : true, + "num_cache_intervals" : 0, + "change_unit_prefix": { + "mem_used": "G", + "swap_used": "G", + "mem_total": "G", + "swap_total": "G" + }, + "normalize_metrics" : true +} diff --git a/fau-systems/cc-metric-collector/meggie-ng/sinks.json b/fau-systems/cc-metric-collector/meggie-ng/sinks.json new file mode 100644 index 0000000..29716bd --- /dev/null +++ b/fau-systems/cc-metric-collector/meggie-ng/sinks.json @@ -0,0 +1,26 @@ +{ + "nhrinflux": { + "type": "influxasync", + "host": "monitoring-test.nhr.uni-erlangen.de", + "port": "8086", + "organization": "ClusterCockpit", + "database": "meggie", + "password": "XZY", + "ssl": true, + "meta_as_tags": [ + "unit" + ] + }, + "fritzstore": { + "type": "http", + "url": "http://monitoring.nhr.fau.de:8082/api/write?cluster=meggie", + "jwt": "XZY", + "meta_as_tags": [ + "unit" + ], + "idle_connection_timeout": "60s", + "flush_delay": "2s", + "max_retries": 1, + "timeout": "10s" + } +} diff --git a/fau-systems/cc-metric-collector/tinyfat/collectors.bdw.json b/fau-systems/cc-metric-collector/tinyfat/collectors.bdw.json new file mode 100644 index 0000000..fee7f70 --- /dev/null +++ b/fau-systems/cc-metric-collector/tinyfat/collectors.bdw.json @@ -0,0 +1,167 @@ +{ + "nfs4stat" : {}, + "memstat" : { + "numa_stats": true, + "node_stats": true + }, + "cpustat" : {}, + "loadavg" : {}, + "schedstat": {}, + "netstat" : { + "include_devices" : [ + "eth0", + "eth1", + "eth2" + ], + "send_abs_values": true, + "send_derived_values": true + }, + "diskstat" : {}, + "iostat" : {}, + "tempstat" : { + "tag_override" : { + "hwmon2" : { + "type" : "socket", + "type-id" : "0" + }, + "hwmon3" : { + "type" : "socket", + "type-id" : "1" + } + } + }, + "likwid": { + "force_overwrite" : true, + "invalid_to_zero" : true, + "access_mode" : "accessdaemon", + "accessdaemon_path" : "/apps/likwid/system/sbin", + "liblikwid_path": 
"/apps/likwid/system/lib/liblikwid.so", + "eventsets": [ + { + "events": { + "FIXC0": "INSTR_RETIRED_ANY", + "FIXC1": "CPU_CLK_UNHALTED_CORE", + "FIXC2": "CPU_CLK_UNHALTED_REF", + "MBOX0C0": "CAS_COUNT_RD", + "MBOX0C1": "CAS_COUNT_WR", + "MBOX1C0": "CAS_COUNT_RD", + "MBOX1C1": "CAS_COUNT_WR", + "MBOX2C0": "CAS_COUNT_RD", + "MBOX2C1": "CAS_COUNT_WR", + "MBOX3C0": "CAS_COUNT_RD", + "MBOX3C1": "CAS_COUNT_WR", + "MBOX4C0": "CAS_COUNT_RD", + "MBOX4C1": "CAS_COUNT_WR", + "MBOX5C0": "CAS_COUNT_RD", + "MBOX5C1": "CAS_COUNT_WR", + "PMC0": "FP_ARITH_INST_RETIRED_128B_PACKED_DOUBLE", + "PMC1": "FP_ARITH_INST_RETIRED_SCALAR_DOUBLE", + "PMC2": "FP_ARITH_INST_RETIRED_256B_PACKED_DOUBLE", + "PWR0": "PWR_PKG_ENERGY", + "PWR3": "PWR_DRAM_ENERGY" + }, + "metrics": [ + { + "calc": "1E-6*(FIXC1/FIXC2)/inverseClock", + "name": "clock", + "publish": true, + "unit": "MHz", + "type": "hwthread" + }, + { + "calc": "FIXC0/FIXC1", + "name": "ipc", + "publish": true, + "type": "hwthread" + }, + { + "calc": "PWR0/time", + "name": "pwr_pkg", + "publish": true, + "unit": "Watt", + "type": "socket" + }, + { + "calc": "PWR3/time", + "name": "pwr_dram", + "publish": true, + "unit": "Watt", + "type": "socket" + }, + { + "calc": "1E-9*(PMC0*2.0+PMC1+PMC2*4.0)/time", + "name": "flops_dp", + "publish": true, + "unit": "GFlops/s", + "type": "hwthread" + }, + { + "calc": "1E-9*(MBOX0C0+MBOX1C0+MBOX2C0+MBOX3C0+MBOX4C0+MBOX5C0+MBOX0C1+MBOX1C1+MBOX2C1+MBOX3C1+MBOX4C1+MBOX5C1)*64.0/time", + "name": "mem_bw", + "publish": true, + "unit": "GBytes/s", + "type": "socket" + }, + { + "calc": "PMC0+PMC2", + "name": "dp_vec_ins", + "type": "hwthread", + "publish": false + }, + { + "calc": "PMC0+PMC1+PMC2", + "name": "dp_fp_ins", + "type": "hwthread", + "publish": false + } + ] + }, + { + "events": { + "FIXC0": "INSTR_RETIRED_ANY", + "FIXC1": "CPU_CLK_UNHALTED_CORE", + "FIXC2": "CPU_CLK_UNHALTED_REF", + "PMC0": "FP_ARITH_INST_RETIRED_128B_PACKED_SINGLE", + "PMC1": "FP_ARITH_INST_RETIRED_SCALAR_SINGLE", + "PMC2": 
"FP_ARITH_INST_RETIRED_256B_PACKED_SINGLE" + }, + "metrics": [ + { + "calc": "1E-9*(PMC0*4.0+PMC1+PMC2*8.0)/time", + "name": "flops_sp", + "publish": true, + "unit": "GFlops/s", + "type": "hwthread" + }, + { + "calc": "PMC0+PMC2", + "name": "sp_vec_ins", + "type": "hwthread", + "publish": false + }, + { + "calc": "PMC0+PMC1+PMC2", + "name": "sp_fp_ins", + "type": "hwthread", + "publish": false + } + ] + } + ], + "globalmetrics": [ + { + "calc": "100*((sp_vec_ins+dp_vec_ins)/(sp_fp_ins+dp_fp_ins))", + "name": "vectorization_ratio", + "type": "hwthread", + "publish": true + }, + { + "calc": "(flops_dp * 2) + flops_sp", + "name": "flops_any", + "type": "hwthread", + "unit": "GFlops/s", + "publish": true + } + ] + } +} diff --git a/fau-systems/cc-metric-collector/tinyfat/collectors.rome.json b/fau-systems/cc-metric-collector/tinyfat/collectors.rome.json new file mode 100644 index 0000000..d3c76c6 --- /dev/null +++ b/fau-systems/cc-metric-collector/tinyfat/collectors.rome.json @@ -0,0 +1,100 @@ +{ + "nfs4stat" : {}, + "memstat" : { + "numa_stats": true, + "node_stats": true + }, + "cpustat" : {}, + "loadavg" : {}, + "schedstat": {}, + "netstat" : { + "include_devices" : [ + "eth0", + "eth1", + "eth2" + ], + "send_abs_values": true, + "send_derived_values": true + }, + "diskstat" : {}, + "iostat" : {}, + "tempstat" : { + "tag_override" : { + "hwmon1" : { + "type" : "socket", + "type-id" : "0" + }, + "hwmon2" : { + "type" : "socket", + "type-id" : "1" + } + } + }, + "likwid": { + "force_overwrite" : true, + "invalid_to_zero" : true, + "access_mode" : "accessdaemon", + "accessdaemon_path" : "/apps/likwid/system/sbin", + "liblikwid_path": "/apps/likwid/system/lib/liblikwid.so", + "eventsets": [ + { + "events": { + "FIXC1": "ACTUAL_CPU_CLOCK", + "FIXC2": "MAX_CPU_CLOCK", + "PMC0": "RETIRED_INSTRUCTIONS", + "PMC1": "CPU_CLOCKS_UNHALTED", + "PMC2": "RETIRED_SSE_AVX_FLOPS_ALL", + "PMC3": "MERGE", + "DFC0": "DATA_FROM_LOCAL_DRAM_CHANNEL", + "DFC1": "DATA_TO_LOCAL_DRAM_CHANNEL", 
+ "PWR0": "RAPL_CORE_ENERGY", + "PWR1": "RAPL_PKG_ENERGY" + }, + "metrics": [ + { + "name": "ipc", + "calc": "PMC0/PMC1", + "type": "hwthread", + "publish": true + }, + { + "name": "flops_any", + "calc": "1E-9*PMC2/time", + "unit": "GFlops/s", + "type": "hwthread", + "publish": true + }, + { + "name": "clock", + "calc": "1E-6*(FIXC1/FIXC2)/inverseClock", + "type": "hwthread", + "unit": "MHz", + "publish": true + }, + { + "name": "pwr_core", + "calc": "PWR0/time", + "unit": "Watt", + "type": "socket", + "publish": true + }, + { + "name": "pwr_pkg", + "calc": "PWR1/time", + "type": "socket", + "unit": "Watt", + "publish": true + }, + { + "name": "mem_bw", + "calc": "1E-9*(DFC0+DFC1)*64.0/time", + "unit": "Gbyte/s", + "type": "socket", + "publish": true + } + ] + } + ], + "globalmetrics": [] + } +} diff --git a/fau-systems/cc-metric-collector/tinyfat/config.json b/fau-systems/cc-metric-collector/tinyfat/config.json new file mode 100644 index 0000000..308b08f --- /dev/null +++ b/fau-systems/cc-metric-collector/tinyfat/config.json @@ -0,0 +1,8 @@ +{ + "sinks": "/etc/cc-metric-collector/sinks.json", + "collectors" : "/etc/cc-metric-collector/collectors.json", + "receivers" : "/etc/cc-metric-collector/receivers.json", + "router" : "/etc/cc-metric-collector/router.json", + "interval": "60s", + "duration": "10s" +} diff --git a/fau-systems/cc-metric-collector/tinyfat/receivers.json b/fau-systems/cc-metric-collector/tinyfat/receivers.json new file mode 100644 index 0000000..0967ef4 --- /dev/null +++ b/fau-systems/cc-metric-collector/tinyfat/receivers.json @@ -0,0 +1 @@ +{} diff --git a/fau-systems/cc-metric-collector/tinyfat/router.json b/fau-systems/cc-metric-collector/tinyfat/router.json new file mode 100644 index 0000000..4a1588e --- /dev/null +++ b/fau-systems/cc-metric-collector/tinyfat/router.json @@ -0,0 +1,49 @@ +{ + "add_tags" : [ + { + "key" : "cluster", + "value" : "tinyfat", + "if" : "*" + } + ], + "rename_metrics" : { + "load_one" : "cpu_load", + "cpu_load_core" 
: "cpu_load", + "net_bytes_in_bw" : "net_bytes_in", + "net_bytes_out_bw" : "net_bytes_out", + "net_pkts_in_bw" : "net_pkts_in", + "net_pkts_out_bw" : "net_pkts_out", + "ib_recv_bw" : "ib_recv", + "ib_xmit_bw" : "ib_xmit", + "ib_recv_pkts_bw": "ib_recv_pkts", + "ib_xmit_pkts_bw": "ib_xmit_pkts", + "lustre_read_bytes_diff" : "lustre_read_bytes", + "lustre_read_requests_diff" : "lustre_read_requests", + "lustre_write_bytes_diff" : "lustre_write_bytes", + "lustre_write_requests_diff" : "lustre_write_requests", + "lustre_open_diff" : "lustre_open", + "lustre_close_diff" : "lustre_close", + "lustre_setattr_diff" : "lustre_setattr", + "lustre_getattr_diff" : "lustre_getattr", + "lustre_statfs_diff": "lustre_statfs", + "lustre_inode_permission_diff" : "lustre_inode_permission", + "pwr_pkg": "cpu_power", + "pwr_dram": "mem_power" + }, + "drop_metrics" : [ + "net_bytes_in", + "net_bytes_out", + "ib_recv", + "ib_xmit", + "ib_recv_pkts", + "ib_xmit_pkts", + "net_pkts_in", + "net_pkts_out", + "lustre_read_bytes", + "lustre_read_requests", + "lustre_write_bytes", + "lustre_write_requests" + ], + "interval_timestamp" : false, + "num_cache_intervals" : 0 +} diff --git a/fau-systems/cc-metric-collector/tinyfat/sinks.json b/fau-systems/cc-metric-collector/tinyfat/sinks.json new file mode 100644 index 0000000..dedcb94 --- /dev/null +++ b/fau-systems/cc-metric-collector/tinyfat/sinks.json @@ -0,0 +1,26 @@ +{ + "influx": { + "type": "influxasync", + "host": "monitoring-test.nhr.uni-erlangen.de", + "port": "8086", + "organization": "ClusterCockpit", + "database": "tinyfat", + "password": "XZY", + "ssl": true, + "meta_as_tags": [ + "unit" + ] + }, + "metricstore": { + "type": "http", + "url": "http://monitoring.nhr.fau.de:8082/api/write?cluster=tinyfat", + "jwt": "XYZ", + "meta_as_tags": [ + "unit" + ], + "idle_connection_timeout": "60s", + "flush_delay": "2s", + "max_retries": 1, + "timeout": "10s" + } +} diff --git a/fau-systems/cc-metric-collector/tinyfat/sinks_debug.json 
b/fau-systems/cc-metric-collector/tinyfat/sinks_debug.json new file mode 100644 index 0000000..3a6359a --- /dev/null +++ b/fau-systems/cc-metric-collector/tinyfat/sinks_debug.json @@ -0,0 +1,30 @@ +{ + "influx": { + "type": "influxasync", + "host": "monitoring-test.nhr.uni-erlangen.de", + "port": "8086", + "organization": "ClusterCockpit", + "database": "tinyfat", + "password": "XZY", + "ssl": true, + "meta_as_tags": [ + "unit" + ] + }, + "metricstore": { + "type": "http", + "url": "http://monitoring.nhr.fau.de:8082/api/write?cluster=tinyfat", + "jwt": "XZY", + "meta_as_tags": [ + "unit" + ], + "idle_connection_timeout": "60s", + "flush_delay": "2s", + "max_retries": 1, + "timeout": "10s" + }, + "debugstdout": { + "type": "stdout", + "output_file": "/tmp/debug.log" + } +} diff --git a/fau-systems/cc-metric-collector/tinygpu/collectors.rome.a100.json b/fau-systems/cc-metric-collector/tinygpu/collectors.rome.a100.json new file mode 100644 index 0000000..5f1ff1c --- /dev/null +++ b/fau-systems/cc-metric-collector/tinygpu/collectors.rome.a100.json @@ -0,0 +1,103 @@ +{ + "nfs4stat" : {}, + "memstat" : { + "numa_stats": true, + "node_stats": true + }, + "cpustat" : {}, + "loadavg" : {}, + "schedstat": {}, + "netstat" : { + "include_devices" : [ + "eth0" + ], + "send_abs_values": true, + "send_derived_values": true + }, + "diskstat" : {}, + "iostat" : {}, + "tempstat" : { + "tag_override" : { + "hwmon0" : { + "type" : "socket", + "type-id" : "0" + }, + "hwmon1" : { + "type" : "socket", + "type-id" : "1" + } + } + }, + "nvidia" : { + "use_pci_info_as_type_id": true, + "process_mig_devices": true + }, + "likwid": { + "force_overwrite" : true, + "invalid_to_zero" : true, + "access_mode" : "accessdaemon", + "accessdaemon_path" : "/apps/likwid/system/sbin", + "liblikwid_path": "/apps/likwid/system/lib/liblikwid.so", + "eventsets": [ + { + "events": { + "FIXC1": "ACTUAL_CPU_CLOCK", + "FIXC2": "MAX_CPU_CLOCK", + "PMC0": "RETIRED_INSTRUCTIONS", + "PMC1": "CPU_CLOCKS_UNHALTED", + 
"PMC2": "RETIRED_SSE_AVX_FLOPS_ALL", + "PMC3": "MERGE", + "DFC0": "DATA_FROM_LOCAL_DRAM_CHANNEL", + "DFC1": "DATA_TO_LOCAL_DRAM_CHANNEL", + "PWR0": "RAPL_CORE_ENERGY", + "PWR1": "RAPL_PKG_ENERGY" + }, + "metrics": [ + { + "name": "ipc", + "calc": "PMC0/PMC1", + "type": "hwthread", + "publish": true + }, + { + "name": "flops_any", + "calc": "1E-9*PMC2/time", + "unit": "GFlops/s", + "type": "hwthread", + "publish": true + }, + { + "name": "clock", + "calc": "1E-6*(FIXC1/FIXC2)/inverseClock", + "type": "hwthread", + "unit": "MHz", + "publish": true + }, + { + "name": "pwr_core", + "calc": "PWR0/time", + "unit": "Watt", + "type": "socket", + "publish": true + }, + { + "name": "pwr_pkg", + "calc": "PWR1/time", + "type": "socket", + "unit": "Watt", + "publish": true + }, + { + "name": "mem_bw", + "calc": "1E-9*(DFC0+DFC1)*64.0/time", + "unit": "Gbyte/s", + "type": "socket", + "publish": true + } + ] + } + ], + "globalmetrics": [] + } + +} diff --git a/fau-systems/cc-metric-collector/tinygpu/collectors.skx.2080.json b/fau-systems/cc-metric-collector/tinygpu/collectors.skx.2080.json new file mode 100644 index 0000000..a06e387 --- /dev/null +++ b/fau-systems/cc-metric-collector/tinygpu/collectors.skx.2080.json @@ -0,0 +1,219 @@ +{ + "nfs4stat" : {}, + "memstat" : { + "numa_stats": true, + "node_stats": true + }, + "cpustat" : {}, + "loadavg" : {}, + "schedstat": {}, + "netstat" : { + "include_devices" : [ + "eth0" + ], + "send_abs_values": true, + "send_derived_values": true + }, + "diskstat" : {}, + "iostat" : {}, + "tempstat" : { + "tag_override" : { + "hwmon0" : { + "type" : "socket", + "type-id" : "0" + }, + "hwmon1" : { + "type" : "socket", + "type-id" : "1" + } + } + }, + "nvidia" : { + "use_pci_info_as_type_id": true, + "process_mig_devices": true + }, + "likwid": { + "force_overwrite" : true, + "invalid_to_zero" : true, + "access_mode" : "accessdaemon", + "accessdaemon_path" : "/apps/likwid/system/sbin", + "liblikwid_path": "/apps/likwid/system/lib/liblikwid.so", + 
"eventsets": [ + { + "events": { + "FIXC0": "INSTR_RETIRED_ANY", + "FIXC1": "CPU_CLK_UNHALTED_CORE", + "FIXC2": "CPU_CLK_UNHALTED_REF", + "MBOX0C0": "CAS_COUNT_RD", + "MBOX0C1": "CAS_COUNT_WR", + "MBOX1C0": "CAS_COUNT_RD", + "MBOX1C1": "CAS_COUNT_WR", + "MBOX2C0": "CAS_COUNT_RD", + "MBOX2C1": "CAS_COUNT_WR", + "MBOX3C0": "CAS_COUNT_RD", + "MBOX3C1": "CAS_COUNT_WR", + "MBOX4C0": "CAS_COUNT_RD", + "MBOX4C1": "CAS_COUNT_WR", + "MBOX5C0": "CAS_COUNT_RD", + "MBOX5C1": "CAS_COUNT_WR", + "PMC0": "FP_ARITH_INST_RETIRED_128B_PACKED_DOUBLE", + "PMC1": "FP_ARITH_INST_RETIRED_SCALAR_DOUBLE", + "PMC2": "FP_ARITH_INST_RETIRED_256B_PACKED_DOUBLE", + "PWR0": "PWR_PKG_ENERGY", + "PWR3": "PWR_DRAM_ENERGY" + }, + "metrics": [ + { + "calc": "0.000001*(FIXC1/FIXC2)/inverseClock", + "name": "clock", + "publish": true, + "unit": "MHz", + "type": "hwthread" + }, + { + "calc": "FIXC0/FIXC1", + "name": "ipc", + "publish": true, + "type": "hwthread" + }, + { + "calc": "PWR0/time", + "name": "cpu_power", + "publish": true, + "unit": "Watt", + "type": "socket" + }, + { + "calc": "PWR3/time", + "name": "mem_power", + "unit": "Watt", + "publish": true, + "type": "socket" + }, + { + "calc": "1E-9*(PMC0*2.0+PMC1+PMC2*4.0)/time", + "name": "flops_dp1", + "unit": "GFlops/s", + "publish": false, + "type": "hwthread" + }, + { + "calc": "1E-9*(MBOX0C0+MBOX1C0+MBOX2C0+MBOX3C0+MBOX4C0+MBOX5C0+MBOX0C1+MBOX1C1+MBOX2C1+MBOX3C1+MBOX4C1+MBOX5C1)*64.0/time", + "name": "mem_bw", + "unit": "GBytes/s", + "publish": true, + "type": "socket" + }, + { + "calc": "PMC0+PMC2", + "name": "dp_vec_ins", + "type": "hwthread", + "publish": false + }, + { + "calc": "PMC0+PMC1+PMC2", + "name": "dp_fp_ins", + "type": "hwthread", + "publish": false + } + ] + }, + { + "events": { + "FIXC0": "INSTR_RETIRED_ANY", + "FIXC1": "CPU_CLK_UNHALTED_CORE", + "FIXC2": "CPU_CLK_UNHALTED_REF", + "PMC0": "FP_ARITH_INST_RETIRED_128B_PACKED_SINGLE", + "PMC1": "FP_ARITH_INST_RETIRED_SCALAR_SINGLE", + "PMC2": 
"FP_ARITH_INST_RETIRED_256B_PACKED_SINGLE" + }, + "metrics": [ + { + "calc": "1E-9*(PMC0*4.0+PMC1+PMC2*8.0)/time", + "name": "flops_sp1", + "publish": false, + "type": "hwthread" + }, + { + "calc": "PMC0+PMC2", + "name": "sp_vec_ins", + "type": "hwthread", + "publish": false + }, + { + "calc": "PMC0+PMC1+PMC2", + "name": "sp_fp_ins", + "type": "hwthread", + "publish": false + } + ] + }, + { + "events": { + "FIXC0": "INSTR_RETIRED_ANY", + "FIXC1": "CPU_CLK_UNHALTED_CORE", + "FIXC2": "CPU_CLK_UNHALTED_REF", + "PMC0": "FP_ARITH_INST_RETIRED_512B_PACKED_DOUBLE", + "PMC1": "FP_ARITH_INST_RETIRED_512B_PACKED_SINGLE" + }, + "metrics": [ + { + "calc": "PMC0", + "name": "dp_avx_512_ins", + "type": "hwthread", + "publish": false + }, + { + "calc": "1E-9*(PMC0*8.0)/time", + "name": "flops_dp2", + "type": "hwthread", + "publish": false + }, + { + "calc": "PMC1", + "name": "sp_avx_512_ins", + "type": "hwthread", + "publish": false + }, + { + "calc": "1E-9*(PMC1*16.0)/time", + "name": "flops_sp2", + "type": "hwthread", + "publish": false + } + ] + } + ], + "globalmetrics": [ + { + "calc": "100*((sp_vec_ins+dp_vec_ins+dp_avx_512_ins+sp_avx_512_ins)/(sp_fp_ins+dp_fp_ins+dp_avx_512_ins+sp_avx_512_ins))", + "name": "vectorization_ratio", + "unit": "%", + "type": "hwthread", + "publish": true + }, + { + "calc": "(flops_sp1+flops_sp2)", + "name": "flops_sp", + "unit": "GFlops/s", + "type": "hwthread", + "publish": true + }, + { + "calc": "(flops_dp1+flops_dp2)", + "name": "flops_dp", + "unit": "GFlops/s", + "type": "hwthread", + "publish": true + }, + { + "calc": "((flops_dp1+flops_dp2) * 2) + (flops_sp1+flops_sp2)", + "name": "flops_any", + "unit": "GFlops/s", + "type": "hwthread", + "publish": true + } + ] + } + +} diff --git a/fau-systems/cc-metric-collector/tinygpu/collectors.skx.3080.json b/fau-systems/cc-metric-collector/tinygpu/collectors.skx.3080.json new file mode 100644 index 0000000..d6c6ae0 --- /dev/null +++ 
b/fau-systems/cc-metric-collector/tinygpu/collectors.skx.3080.json @@ -0,0 +1,175 @@ +{ + "nfs4stat" : {}, + "memstat" : { + "numa_stats": true, + "node_stats": true + }, + "cpustat" : {}, + "loadavg" : {}, + "schedstat": {}, + "netstat" : { + "include_devices" : [ + "eth0", + "eth1", + "eth2" + ], + "send_abs_values": true, + "send_derived_values": true + }, + "diskstat" : {}, + "iostat" : {}, + "tempstat" : { + "tag_override" : { + "hwmon1" : { + "type" : "socket", + "type-id" : "0" + }, + "hwmon2" : { + "type" : "socket", + "type-id" : "1" + } + } + }, + "nvidia" : { + "use_pci_info_as_type_id": true, + "process_mig_devices": true + }, + "likwid": { + "force_overwrite" : true, + "invalid_to_zero" : true, + "access_mode" : "accessdaemon", + "accessdaemon_path" : "/apps/likwid/system/sbin", + "liblikwid_path": "/apps/likwid/system/lib/liblikwid.so", + "eventsets": [ + { + "events": { + "FIXC0": "INSTR_RETIRED_ANY", + "FIXC1": "CPU_CLK_UNHALTED_CORE", + "FIXC2": "CPU_CLK_UNHALTED_REF", + "MBOX0C0": "CAS_COUNT_RD", + "MBOX0C1": "CAS_COUNT_WR", + "MBOX1C0": "CAS_COUNT_RD", + "MBOX1C1": "CAS_COUNT_WR", + "MBOX2C0": "CAS_COUNT_RD", + "MBOX2C1": "CAS_COUNT_WR", + "MBOX3C0": "CAS_COUNT_RD", + "MBOX3C1": "CAS_COUNT_WR", + "MBOX4C0": "CAS_COUNT_RD", + "MBOX4C1": "CAS_COUNT_WR", + "MBOX5C0": "CAS_COUNT_RD", + "MBOX5C1": "CAS_COUNT_WR", + "PMC0": "FP_ARITH_INST_RETIRED_128B_PACKED_DOUBLE", + "PMC1": "FP_ARITH_INST_RETIRED_SCALAR_DOUBLE", + "PMC2": "FP_ARITH_INST_RETIRED_256B_PACKED_DOUBLE", + "PMC3": "FP_ARITH_INST_RETIRED_512B_PACKED_DOUBLE", + "PWR0": "PWR_PKG_ENERGY", + "PWR3": "PWR_DRAM_ENERGY" + }, + "metrics": [ + { + "calc": "0.000001*(FIXC1/FIXC2)/inverseClock", + "name": "clock", + "publish": true, + "unit": "MHz", + "type": "hwthread" + }, + { + "calc": "FIXC0/FIXC1", + "name": "ipc", + "publish": true, + "type": "hwthread" + }, + { + "calc": "PWR0/time", + "name": "cpu_power", + "publish": true, + "unit": "Watt", + "type": "socket" + }, + { + "calc": "PWR3/time", 
+ "name": "mem_power", + "unit": "Watt", + "publish": true, + "type": "socket" + }, + { + "calc": "1E-9*(PMC0*2.0+PMC1+PMC2*4.0+PMC3*8.0)/time", + "name": "flops_dp", + "unit": "GFlops/s", + "publish": true, + "type": "hwthread" + }, + { + "calc": "1E-9*(MBOX0C0+MBOX1C0+MBOX2C0+MBOX3C0+MBOX4C0+MBOX5C0+MBOX0C1+MBOX1C1+MBOX2C1+MBOX3C1+MBOX4C1+MBOX5C1)*64.0/time", + "name": "mem_bw", + "unit": "GBytes/s", + "publish": true, + "type": "socket" + }, + { + "calc": "PMC0+PMC2+PMC3", + "name": "dp_vec_ins", + "type": "hwthread", + "publish": false + }, + { + "calc": "PMC0+PMC1+PMC2+PMC3", + "name": "dp_fp_ins", + "type": "hwthread", + "publish": false + } + ] + }, + { + "events": { + "FIXC0": "INSTR_RETIRED_ANY", + "FIXC1": "CPU_CLK_UNHALTED_CORE", + "FIXC2": "CPU_CLK_UNHALTED_REF", + "PMC0": "FP_ARITH_INST_RETIRED_128B_PACKED_SINGLE", + "PMC1": "FP_ARITH_INST_RETIRED_SCALAR_SINGLE", + "PMC2": "FP_ARITH_INST_RETIRED_256B_PACKED_SINGLE", + "PMC3": "FP_ARITH_INST_RETIRED_512B_PACKED_SINGLE" + }, + "metrics": [ + { + "calc": "1E-9*(PMC0*4.0+PMC1+PMC2*8.0+PMC3*16.0)/time", + "name": "flops_sp", + "unit": "GFlops/s", + "publish": true, + "type": "hwthread" + }, + { + "calc": "PMC0+PMC2+PMC3", + "name": "sp_vec_ins", + "type": "hwthread", + "publish": false + }, + { + "calc": "PMC0+PMC1+PMC2+PMC3", + "name": "sp_fp_ins", + "type": "hwthread", + "publish": false + } + ] + } + ], + "globalmetrics": [ + { + "calc": "100*((sp_vec_ins+dp_vec_ins)/(sp_fp_ins+dp_fp_ins))", + "name": "vectorization_ratio", + "unit": "%", + "type": "hwthread", + "publish": true + }, + { + "calc": "(flops_dp * 2) + flops_sp", + "name": "flops_any", + "unit": "GFlops/s", + "type": "hwthread", + "publish": true + } + ] + } + +} diff --git a/fau-systems/cc-metric-collector/tinygpu/collectors.skx.v100.json b/fau-systems/cc-metric-collector/tinygpu/collectors.skx.v100.json new file mode 100644 index 0000000..a06e387 --- /dev/null +++ b/fau-systems/cc-metric-collector/tinygpu/collectors.skx.v100.json @@ -0,0 
+1,219 @@ +{ + "nfs4stat" : {}, + "memstat" : { + "numa_stats": true, + "node_stats": true + }, + "cpustat" : {}, + "loadavg" : {}, + "schedstat": {}, + "netstat" : { + "include_devices" : [ + "eth0" + ], + "send_abs_values": true, + "send_derived_values": true + }, + "diskstat" : {}, + "iostat" : {}, + "tempstat" : { + "tag_override" : { + "hwmon0" : { + "type" : "socket", + "type-id" : "0" + }, + "hwmon1" : { + "type" : "socket", + "type-id" : "1" + } + } + }, + "nvidia" : { + "use_pci_info_as_type_id": true, + "process_mig_devices": true + }, + "likwid": { + "force_overwrite" : true, + "invalid_to_zero" : true, + "access_mode" : "accessdaemon", + "accessdaemon_path" : "/apps/likwid/system/sbin", + "liblikwid_path": "/apps/likwid/system/lib/liblikwid.so", + "eventsets": [ + { + "events": { + "FIXC0": "INSTR_RETIRED_ANY", + "FIXC1": "CPU_CLK_UNHALTED_CORE", + "FIXC2": "CPU_CLK_UNHALTED_REF", + "MBOX0C0": "CAS_COUNT_RD", + "MBOX0C1": "CAS_COUNT_WR", + "MBOX1C0": "CAS_COUNT_RD", + "MBOX1C1": "CAS_COUNT_WR", + "MBOX2C0": "CAS_COUNT_RD", + "MBOX2C1": "CAS_COUNT_WR", + "MBOX3C0": "CAS_COUNT_RD", + "MBOX3C1": "CAS_COUNT_WR", + "MBOX4C0": "CAS_COUNT_RD", + "MBOX4C1": "CAS_COUNT_WR", + "MBOX5C0": "CAS_COUNT_RD", + "MBOX5C1": "CAS_COUNT_WR", + "PMC0": "FP_ARITH_INST_RETIRED_128B_PACKED_DOUBLE", + "PMC1": "FP_ARITH_INST_RETIRED_SCALAR_DOUBLE", + "PMC2": "FP_ARITH_INST_RETIRED_256B_PACKED_DOUBLE", + "PWR0": "PWR_PKG_ENERGY", + "PWR3": "PWR_DRAM_ENERGY" + }, + "metrics": [ + { + "calc": "0.000001*(FIXC1/FIXC2)/inverseClock", + "name": "clock", + "publish": true, + "unit": "MHz", + "type": "hwthread" + }, + { + "calc": "FIXC0/FIXC1", + "name": "ipc", + "publish": true, + "type": "hwthread" + }, + { + "calc": "PWR0/time", + "name": "cpu_power", + "publish": true, + "unit": "Watt", + "type": "socket" + }, + { + "calc": "PWR3/time", + "name": "mem_power", + "unit": "Watt", + "publish": true, + "type": "socket" + }, + { + "calc": "1E-9*(PMC0*2.0+PMC1+PMC2*4.0)/time", + "name": 
"flops_dp1", + "unit": "GFlops/s", + "publish": false, + "type": "hwthread" + }, + { + "calc": "1E-9*(MBOX0C0+MBOX1C0+MBOX2C0+MBOX3C0+MBOX4C0+MBOX5C0+MBOX0C1+MBOX1C1+MBOX2C1+MBOX3C1+MBOX4C1+MBOX5C1)*64.0/time", + "name": "mem_bw", + "unit": "GBytes/s", + "publish": true, + "type": "socket" + }, + { + "calc": "PMC0+PMC2", + "name": "dp_vec_ins", + "type": "hwthread", + "publish": false + }, + { + "calc": "PMC0+PMC1+PMC2", + "name": "dp_fp_ins", + "type": "hwthread", + "publish": false + } + ] + }, + { + "events": { + "FIXC0": "INSTR_RETIRED_ANY", + "FIXC1": "CPU_CLK_UNHALTED_CORE", + "FIXC2": "CPU_CLK_UNHALTED_REF", + "PMC0": "FP_ARITH_INST_RETIRED_128B_PACKED_SINGLE", + "PMC1": "FP_ARITH_INST_RETIRED_SCALAR_SINGLE", + "PMC2": "FP_ARITH_INST_RETIRED_256B_PACKED_SINGLE" + }, + "metrics": [ + { + "calc": "1E-9*(PMC0*4.0+PMC1+PMC2*8.0)/time", + "name": "flops_sp1", + "publish": false, + "type": "hwthread" + }, + { + "calc": "PMC0+PMC2", + "name": "sp_vec_ins", + "type": "hwthread", + "publish": false + }, + { + "calc": "PMC0+PMC1+PMC2", + "name": "sp_fp_ins", + "type": "hwthread", + "publish": false + } + ] + }, + { + "events": { + "FIXC0": "INSTR_RETIRED_ANY", + "FIXC1": "CPU_CLK_UNHALTED_CORE", + "FIXC2": "CPU_CLK_UNHALTED_REF", + "PMC0": "FP_ARITH_INST_RETIRED_512B_PACKED_DOUBLE", + "PMC1": "FP_ARITH_INST_RETIRED_512B_PACKED_SINGLE" + }, + "metrics": [ + { + "calc": "PMC0", + "name": "dp_avx_512_ins", + "type": "hwthread", + "publish": false + }, + { + "calc": "1E-9*(PMC0*8.0)/time", + "name": "flops_dp2", + "type": "hwthread", + "publish": false + }, + { + "calc": "PMC1", + "name": "sp_avx_512_ins", + "type": "hwthread", + "publish": false + }, + { + "calc": "1E-9*(PMC1*16.0)/time", + "name": "flops_sp2", + "type": "hwthread", + "publish": false + } + ] + } + ], + "globalmetrics": [ + { + "calc": "100*((sp_vec_ins+dp_vec_ins+dp_avx_512_ins+sp_avx_512_ins)/(sp_fp_ins+dp_fp_ins+dp_avx_512_ins+sp_avx_512_ins))", + "name": "vectorization_ratio", + "unit": "%", + 
"type": "hwthread", + "publish": true + }, + { + "calc": "(flops_sp1+flops_sp2)", + "name": "flops_sp", + "unit": "GFlops/s", + "type": "hwthread", + "publish": true + }, + { + "calc": "(flops_dp1+flops_dp2)", + "name": "flops_dp", + "unit": "GFlops/s", + "type": "hwthread", + "publish": true + }, + { + "calc": "((flops_dp1+flops_dp2) * 2) + (flops_sp1+flops_sp2)", + "name": "flops_any", + "unit": "GFlops/s", + "type": "hwthread", + "publish": true + } + ] + } + +} diff --git a/fau-systems/cc-metric-collector/tinygpu/config.json b/fau-systems/cc-metric-collector/tinygpu/config.json new file mode 100644 index 0000000..308b08f --- /dev/null +++ b/fau-systems/cc-metric-collector/tinygpu/config.json @@ -0,0 +1,8 @@ +{ + "sinks": "/etc/cc-metric-collector/sinks.json", + "collectors" : "/etc/cc-metric-collector/collectors.json", + "receivers" : "/etc/cc-metric-collector/receivers.json", + "router" : "/etc/cc-metric-collector/router.json", + "interval": "60s", + "duration": "10s" +} diff --git a/fau-systems/cc-metric-collector/tinygpu/receivers.json b/fau-systems/cc-metric-collector/tinygpu/receivers.json new file mode 100644 index 0000000..0967ef4 --- /dev/null +++ b/fau-systems/cc-metric-collector/tinygpu/receivers.json @@ -0,0 +1 @@ +{} diff --git a/fau-systems/cc-metric-collector/tinygpu/router.json b/fau-systems/cc-metric-collector/tinygpu/router.json new file mode 100644 index 0000000..b7de4a8 --- /dev/null +++ b/fau-systems/cc-metric-collector/tinygpu/router.json @@ -0,0 +1,60 @@ +{ + "add_tags" : [ + { + "key" : "cluster", + "value" : "tinygpu", + "if" : "*" + } + ], + "rename_metrics" : { + "load_one" : "cpu_load", + "cpu_load_core" : "cpu_load", + "net_bytes_in_bw" : "net_bytes_in", + "net_bytes_out_bw" : "net_bytes_out", + "net_pkts_in_bw" : "net_pkts_in", + "net_pkts_out_bw" : "net_pkts_out", + "ib_recv_bw" : "ib_recv", + "ib_xmit_bw" : "ib_xmit", + "ib_recv_pkts_bw": "ib_recv_pkts", + "ib_xmit_pkts_bw": "ib_xmit_pkts", + "lustre_read_bytes_diff" : 
"lustre_read_bytes", + "lustre_read_requests_diff" : "lustre_read_requests", + "lustre_write_bytes_diff" : "lustre_write_bytes", + "lustre_write_requests_diff" : "lustre_write_requests", + "lustre_open_diff" : "lustre_open", + "lustre_close_diff" : "lustre_close", + "lustre_setattr_diff" : "lustre_setattr", + "lustre_getattr_diff" : "lustre_getattr", + "lustre_statfs_diff": "lustre_statfs", + "lustre_inode_permission_diff" : "lustre_inode_permission", + "nv_util" : "acc_utilization", + "nv_fb_mem_used" : "acc_mem_used", + "nv_power_usage" : "acc_power", + "pwr_pkg": "cpu_power", + "pwr_dram": "mem_power" + }, + "drop_metrics" : [ + "net_bytes_in", + "net_bytes_out", + "ib_recv", + "ib_xmit", + "ib_recv_pkts", + "ib_xmit_pkts", + "net_pkts_in", + "net_pkts_out", + "lustre_read_bytes", + "lustre_read_requests", + "lustre_write_bytes", + "lustre_write_requests" + ], + "interval_timestamp" : false, + "num_cache_intervals" : 0, + "change_unit_prefix": { + "mem_used": "G", + "swap_used": "G", + "mem_total": "G", + "swap_total": "G", + "cpufreq": "M" + }, + "normalize_metrics" : true +} diff --git a/fau-systems/cc-metric-collector/tinygpu/sinks.json b/fau-systems/cc-metric-collector/tinygpu/sinks.json new file mode 100644 index 0000000..105f653 --- /dev/null +++ b/fau-systems/cc-metric-collector/tinygpu/sinks.json @@ -0,0 +1,26 @@ +{ + "influx": { + "type": "influxasync", + "host": "monitoring-test.nhr.uni-erlangen.de", + "port": "8086", + "organization": "ClusterCockpit", + "database": "tinygpu", + "password": "XZY", + "ssl": true, + "meta_as_tags": [ + "unit" + ] + }, + "metricstore": { + "type": "http", + "url": "http://monitoring.nhr.fau.de:8082/api/write?cluster=tinygpu", + "jwt": "XZY", + "meta_as_tags": [ + "unit" + ], + "idle_connection_timeout": "60s", + "flush_delay": "2s", + "max_retries": 1, + "timeout": "10s" + } +} diff --git a/fau-systems/cc-metric-collector/tinygpu/sinks_debug.json b/fau-systems/cc-metric-collector/tinygpu/sinks_debug.json new file mode 
100644 index 0000000..02ee30b --- /dev/null +++ b/fau-systems/cc-metric-collector/tinygpu/sinks_debug.json @@ -0,0 +1,30 @@ +{ + "influx": { + "type": "influxasync", + "host": "monitoring-test.nhr.uni-erlangen.de", + "port": "8086", + "organization": "ClusterCockpit", + "database": "tinygpu", + "password": "XZY", + "ssl": true, + "meta_as_tags": [ + "unit" + ] + }, + "metricstore": { + "type": "http", + "url": "http://monitoring.nhr.fau.de:8082/api/write?cluster=tinygpu", + "jwt": "XZY", + "meta_as_tags": [ + "unit" + ], + "idle_connection_timeout": "60s", + "flush_delay": "2s", + "max_retries": 1, + "timeout": "10s" + }, + "debugstdout": { + "type": "stdout", + "output_file": "/tmp/debug.log" + } +} diff --git a/fau-systems/cc-metric-collector/woody-ng/collectors.icx.json b/fau-systems/cc-metric-collector/woody-ng/collectors.icx.json new file mode 100644 index 0000000..d49790b --- /dev/null +++ b/fau-systems/cc-metric-collector/woody-ng/collectors.icx.json @@ -0,0 +1,176 @@ +{ + "diskstat" : {}, + "iostat" : {}, + "cpustat": {}, + "memstat" : { + "numa_stats": true, + "node_stats": true + }, + "loadavg" : {}, + "schedstat": {}, + "netstat" : { + "include_devices" : [ + "enp2s0f0", + "eno1", + "ens2f0" + ], + "send_abs_values" : true, + "send_derived_values" : true + }, + "tempstat" : { + "tag_override" : { + "hwmon0" : { + "type" : "socket", + "type-id" : "0" + }, + "hwmon1" : { + "type" : "socket", + "type-id" : "1" + } + } + }, + "nfs4stat" : {}, + "likwid": { + "force_overwrite" : true, + "invalid_to_zero" : true, + "access_mode" : "accessdaemon", + "accessdaemon_path" : "/apps/likwid/system/sbin", + "liblikwid_path": "/apps/likwid/system/lib/liblikwid.so", + "eventsets": [ + { + "events": { + "FIXC0": "INSTR_RETIRED_ANY", + "FIXC1": "CPU_CLK_UNHALTED_CORE", + "FIXC2": "CPU_CLK_UNHALTED_REF", + "MBOX0C0": "CAS_COUNT_RD", + "MBOX0C1": "CAS_COUNT_WR", + "MBOX1C0": "CAS_COUNT_RD", + "MBOX1C1": "CAS_COUNT_WR", + "MBOX2C0": "CAS_COUNT_RD", + "MBOX2C1": 
"CAS_COUNT_WR", + "MBOX3C0": "CAS_COUNT_RD", + "MBOX3C1": "CAS_COUNT_WR", + "MBOX4C0": "CAS_COUNT_RD", + "MBOX4C1": "CAS_COUNT_WR", + "MBOX5C0": "CAS_COUNT_RD", + "MBOX5C1": "CAS_COUNT_WR", + "MBOX6C0": "CAS_COUNT_RD", + "MBOX6C1": "CAS_COUNT_WR", + "MBOX7C0": "CAS_COUNT_RD", + "MBOX7C1": "CAS_COUNT_WR", + "PMC0": "FP_ARITH_INST_RETIRED_128B_PACKED_DOUBLE", + "PMC1": "FP_ARITH_INST_RETIRED_SCALAR_DOUBLE", + "PMC2": "FP_ARITH_INST_RETIRED_256B_PACKED_DOUBLE", + "PMC3": "FP_ARITH_INST_RETIRED_512B_PACKED_DOUBLE", + "PWR0": "PWR_PKG_ENERGY", + "PWR3": "PWR_DRAM_ENERGY" + }, + "metrics": [ + { + "calc": "0.000001*(FIXC1/FIXC2)/inverseClock", + "name": "clock", + "publish": true, + "unit": "MHz", + "type": "hwthread" + }, + { + "calc": "FIXC0/FIXC1", + "name": "ipc", + "publish": true, + "type": "hwthread" + }, + { + "calc": "PWR0/time", + "name": "cpu_power", + "publish": true, + "unit": "Watt", + "type": "socket" + }, + { + "calc": "PWR3/time", + "name": "mem_power", + "unit": "Watt", + "publish": true, + "type": "socket" + }, + { + "calc": "1E-9*(PMC0*2.0+PMC1+PMC2*4.0+PMC3*8.0)/time", + "name": "flops_dp", + "unit": "GFlops/s", + "publish": true, + "type": "hwthread" + }, + { + "calc": "1E-9*(MBOX0C0+MBOX1C0+MBOX2C0+MBOX3C0+MBOX4C0+MBOX5C0+MBOX6C0+MBOX7C0+MBOX0C1+MBOX1C1+MBOX2C1+MBOX3C1+MBOX4C1+MBOX5C1+MBOX6C1+MBOX7C1)*64.0/time", + "name": "mem_bw", + "unit": "GBytes/s", + "publish": true, + "type": "socket" + }, + { + "calc": "PMC0+PMC2+PMC3", + "name": "dp_vec_ins", + "type": "hwthread", + "publish": false + }, + { + "calc": "PMC0+PMC1+PMC2+PMC3", + "name": "dp_fp_ins", + "type": "hwthread", + "publish": false + } + ] + }, + { + "events": { + "FIXC0": "INSTR_RETIRED_ANY", + "FIXC1": "CPU_CLK_UNHALTED_CORE", + "FIXC2": "CPU_CLK_UNHALTED_REF", + "FIXC3": "TOPDOWN_SLOTS", + "PMC0": "FP_ARITH_INST_RETIRED_128B_PACKED_SINGLE", + "PMC1": "FP_ARITH_INST_RETIRED_SCALAR_SINGLE", + "PMC2": "FP_ARITH_INST_RETIRED_256B_PACKED_SINGLE", + "PMC3": 
"FP_ARITH_INST_RETIRED_512B_PACKED_SINGLE" + }, + "metrics": [ + { + "calc": "1E-9*(PMC0*4.0+PMC1+PMC2*8.0+PMC3*16.0)/time", + "name": "flops_sp", + "unit": "GFlops/s", + "publish": true, + "type": "hwthread" + }, + { + "calc": "PMC0+PMC2+PMC3", + "name": "sp_vec_ins", + "type": "hwthread", + "publish": false + }, + { + "calc": "PMC0+PMC1+PMC2+PMC3", + "name": "sp_fp_ins", + "type": "hwthread", + "publish": false + } + ] + } + ], + "globalmetrics": [ + { + "calc": "100*((sp_vec_ins+dp_vec_ins)/(sp_fp_ins+dp_fp_ins))", + "name": "vectorization_ratio", + "unit": "%", + "type": "hwthread", + "publish": true + }, + { + "calc": "(flops_dp * 2) + flops_sp", + "name": "flops_any", + "unit": "GFlops/s", + "type": "hwthread", + "publish": true + } + ] + } + +} diff --git a/fau-systems/cc-metric-collector/woody-ng/collectors.skl.json b/fau-systems/cc-metric-collector/woody-ng/collectors.skl.json new file mode 100644 index 0000000..ecd51e3 --- /dev/null +++ b/fau-systems/cc-metric-collector/woody-ng/collectors.skl.json @@ -0,0 +1,147 @@ +{ + "diskstat" : {}, + "iostat" : {}, + "cpustat": {}, + "memstat" : { + "node_stats": true + }, + "loadavg" : {}, + "schedstat": {}, + "netstat" : { + "include_devices" : [ + "enp2s0f0", + "eno1", + "ens2f0" + ], + "send_abs_values" : true, + "send_derived_values" : true + }, + "tempstat" : {}, + "nfs4stat" : {}, + "likwid": { + "force_overwrite" : true, + "invalid_to_zero" : true, + "access_mode" : "accessdaemon", + "accessdaemon_path" : "/apps/likwid/system/sbin", + "liblikwid_path": "/apps/likwid/system/lib/liblikwid.so", + "eventsets": [ + { + "events": { + "FIXC0": "INSTR_RETIRED_ANY", + "FIXC1": "CPU_CLK_UNHALTED_CORE", + "FIXC2": "CPU_CLK_UNHALTED_REF", + "MBOX0C1": "DRAM_READS", + "MBOX0C2": "DRAM_WRITES", + "PMC0": "FP_ARITH_INST_RETIRED_128B_PACKED_DOUBLE", + "PMC1": "FP_ARITH_INST_RETIRED_SCALAR_DOUBLE", + "PMC2": "FP_ARITH_INST_RETIRED_256B_PACKED_DOUBLE", + "PWR0": "PWR_PKG_ENERGY", + "PWR3": "PWR_DRAM_ENERGY" + }, + "metrics": 
[ + { + "calc": "1E-6*(FIXC1/FIXC2)/inverseClock", + "name": "clock", + "unit": "MHz", + "publish": true, + "type": "hwthread" + }, + { + "calc": "FIXC0/FIXC1", + "name": "ipc", + "publish": true, + "type": "hwthread" + }, + { + "calc": "PWR0/time", + "name": "pwr_pkg", + "unit": "Watt", + "publish": true, + "type": "socket" + }, + { + "calc": "PWR3/time", + "name": "pwr_dram", + "publish": true, + "unit": "Watt", + "type": "socket" + }, + { + "calc": "1E-9*(PMC0*2.0+PMC1+PMC2*4.0)/time", + "name": "flops_dp", + "publish": true, + "unit": "GFlops/s", + "type": "hwthread" + }, + { + "calc": "1E-9*(MBOX0C1+MBOX0C2)*64.0/time", + "name": "mem_bw", + "publish": true, + "unit": "GBytes/s", + "type": "socket" + }, + { + "calc": "PMC0+PMC2", + "name": "dp_vec_ins", + "type": "hwthread", + "publish": false + }, + { + "calc": "PMC0+PMC1+PMC2", + "name": "dp_fp_ins", + "type": "hwthread", + "publish": false + } + ] + }, + { + "events": { + "FIXC0": "INSTR_RETIRED_ANY", + "FIXC1": "CPU_CLK_UNHALTED_CORE", + "FIXC2": "CPU_CLK_UNHALTED_REF", + "PMC0": "FP_ARITH_INST_RETIRED_128B_PACKED_SINGLE", + "PMC1": "FP_ARITH_INST_RETIRED_SCALAR_SINGLE", + "PMC2": "FP_ARITH_INST_RETIRED_256B_PACKED_SINGLE" + }, + "metrics": [ + { + "calc": "1E-9*(PMC0*4.0+PMC1+PMC2*8.0)/time", + "name": "flops_sp", + "publish": true, + "unit": "GFlops/s", + "type": "hwthread" + }, + { + "calc": "PMC0+PMC2", + "name": "sp_vec_ins", + "type": "hwthread", + "publish": false + }, + { + "calc": "PMC0+PMC1+PMC2", + "name": "sp_fp_ins", + "type": "hwthread", + "publish": false + } + ] + } + ], + "globalmetrics": [ + { + "calc": "100*((sp_vec_ins+dp_vec_ins)/(sp_fp_ins+dp_fp_ins))", + "name": "vectorization_ratio", + "unit": "%", + "type": "hwthread", + "publish": true + }, + { + "calc": "(flops_dp * 2) + flops_sp", + "name": "flops_any", + "type": "hwthread", + "unit": "GFlops/s", + "publish": true + } + ] + } + +} diff --git a/fau-systems/cc-metric-collector/woody-ng/config.json 
b/fau-systems/cc-metric-collector/woody-ng/config.json new file mode 100644 index 0000000..308b08f --- /dev/null +++ b/fau-systems/cc-metric-collector/woody-ng/config.json @@ -0,0 +1,8 @@ +{ + "sinks": "/etc/cc-metric-collector/sinks.json", + "collectors" : "/etc/cc-metric-collector/collectors.json", + "receivers" : "/etc/cc-metric-collector/receivers.json", + "router" : "/etc/cc-metric-collector/router.json", + "interval": "60s", + "duration": "10s" +} diff --git a/fau-systems/cc-metric-collector/woody-ng/receivers.json b/fau-systems/cc-metric-collector/woody-ng/receivers.json new file mode 100644 index 0000000..0967ef4 --- /dev/null +++ b/fau-systems/cc-metric-collector/woody-ng/receivers.json @@ -0,0 +1 @@ +{} diff --git a/fau-systems/cc-metric-collector/woody-ng/router.json b/fau-systems/cc-metric-collector/woody-ng/router.json new file mode 100644 index 0000000..9b311db --- /dev/null +++ b/fau-systems/cc-metric-collector/woody-ng/router.json @@ -0,0 +1,54 @@ +{ + "add_tags" : [ + { + "key" : "cluster", + "value" : "woodyng", + "if" : "*" + } + ], + "rename_metrics" : { + "load_one" : "cpu_load", + "cpu_load_core" : "cpu_load", + "net_bytes_in_bw" : "net_bytes_in", + "net_bytes_out_bw" : "net_bytes_out", + "net_pkts_in_bw" : "net_pkts_in", + "net_pkts_out_bw" : "net_pkts_out", + "ib_recv_bw" : "ib_recv", + "ib_xmit_bw" : "ib_xmit", + "ib_recv_pkts_bw": "ib_recv_pkts", + "ib_xmit_pkts_bw": "ib_xmit_pkts", + "lustre_read_bytes_diff" : "lustre_read_bytes", + "lustre_read_requests_diff" : "lustre_read_requests", + "lustre_write_bytes_diff" : "lustre_write_bytes", + "lustre_write_requests_diff" : "lustre_write_requests", + "lustre_open_diff" : "lustre_open", + "lustre_close_diff" : "lustre_close", + "lustre_setattr_diff" : "lustre_setattr", + "lustre_getattr_diff" : "lustre_getattr", + "lustre_statfs_diff": "lustre_statfs", + "lustre_inode_permission_diff" : "lustre_inode_permission" + }, + "drop_metrics" : [ + "net_bytes_in", + "net_bytes_out", + "ib_recv", + 
"ib_xmit", + "ib_recv_pkts", + "ib_xmit_pkts", + "net_pkts_in", + "net_pkts_out", + "lustre_read_bytes", + "lustre_read_requests", + "lustre_write_bytes", + "lustre_write_requests" + ], + "interval_timestamp" : true, + "num_cache_intervals" : 0, + "change_unit_prefix": { + "mem_used": "G", + "swap_used": "G", + "mem_total": "G", + "swap_total": "G" + }, + "normalize_metrics" : true +} diff --git a/fau-systems/cc-metric-collector/woody-ng/sinks.json b/fau-systems/cc-metric-collector/woody-ng/sinks.json new file mode 100644 index 0000000..984be7d --- /dev/null +++ b/fau-systems/cc-metric-collector/woody-ng/sinks.json @@ -0,0 +1,20 @@ +{ + "nhrinflux": { + "type": "influxasync", + "host": "monitoring-test.nhr.uni-erlangen.de", + "port": "8086", + "organization": "ClusterCockpit", + "database": "woodyng", + "password": "XZY", + "ssl": true + }, + "woodystore": { + "type": "http", + "url": "http://monitoring.nhr.fau.de:8082/api/write?cluster=woodyng", + "jwt": "XZY", + "meta_as_tags": [ + "unit" + ], + "idle_connection_timeout": "60s" + } +} diff --git a/fau-systems/cc-metric-store/cc-metric-store.service b/fau-systems/cc-metric-store/cc-metric-store.service new file mode 100644 index 0000000..210f2de --- /dev/null +++ b/fau-systems/cc-metric-store/cc-metric-store.service @@ -0,0 +1,19 @@ +[Unit] +Description=ClusterCockpit In-Memory Timeseries Database for Fritz (cc-metric-store) +Documentation=https://github.com/ClusterCockpit/cc-metric-store +Wants=network-online.target +After=network-online.target + +[Service] +Type=simple +User=clustercockpit +Group=clustercockpit +Restart=on-failure +RestartSec=30 +TimeoutStopSec=100 +WorkingDirectory=/opt/monitoring/cc-metric-store/fritz +ExecStart=/opt/monitoring/cc-metric-store/repo/cc-metric-store --config ./config.json +LimitNOFILE=500000 + +[Install] +WantedBy=multi-user.target diff --git a/fau-systems/cc-metric-store/config.json b/fau-systems/cc-metric-store/config.json new file mode 100644 index 0000000..272a790 --- 
/dev/null +++ b/fau-systems/cc-metric-store/config.json @@ -0,0 +1,180 @@ +{ + "metrics": { + "clock": { + "frequency": 60, + "aggregation": "avg" + }, + "cpu_idle": { + "frequency": 60, + "aggregation": "avg" + }, + "cpu_iowait": { + "frequency": 60, + "aggregation": "avg" + }, + "cpu_irq": { + "frequency": 60, + "aggregation": "avg" + }, + "cpu_system": { + "frequency": 60, + "aggregation": "avg" + }, + "cpu_user": { + "frequency": 60, + "aggregation": "avg" + }, + "nv_mem_util": { + "frequency": 60, + "aggregation": "avg" + }, + "nv_temp": { + "frequency": 60, + "aggregation": "avg" + }, + "nv_sm_clock": { + "frequency": 60, + "aggregation": "avg" + }, + "acc_utilization": { + "frequency": 60, + "aggregation": "avg" + }, + "acc_mem_used": { + "frequency": 60, + "aggregation": "sum" + }, + "acc_power": { + "frequency": 60, + "aggregation": "sum" + }, + "flops_any": { + "frequency": 60, + "aggregation": "sum" + }, + "flops_dp": { + "frequency": 60, + "aggregation": "sum" + }, + "flops_sp": { + "frequency": 60, + "aggregation": "sum" + }, + "ib_recv": { + "frequency": 60, + "aggregation": "sum" + }, + "ib_xmit": { + "frequency": 60, + "aggregation": "sum" + }, + "ib_recv_pkts": { + "frequency": 60, + "aggregation": "sum" + }, + "ib_xmit_pkts": { + "frequency": 60, + "aggregation": "sum" + }, + "cpu_power": { + "frequency": 60, + "aggregation": "sum" + }, + "core_power": { + "frequency": 60, + "aggregation": "sum" + }, + "mem_power": { + "frequency": 60, + "aggregation": "sum" + }, + "ipc": { + "frequency": 60, + "aggregation": "avg" + }, + "cpu_load": { + "frequency": 60, + "aggregation": null + }, + "lustre_close": { + "frequency": 60, + "aggregation": null + }, + "lustre_open": { + "frequency": 60, + "aggregation": null + }, + "lustre_statfs": { + "frequency": 60, + "aggregation": null + }, + "lustre_read_bytes": { + "frequency": 60, + "aggregation": null + }, + "lustre_write_bytes": { + "frequency": 60, + "aggregation": null + }, + "net_bw": { + "frequency": 60, 
+ "aggregation": null + }, + "file_bw": { + "frequency": 60, + "aggregation": null + }, + "mem_bw": { + "frequency": 60, + "aggregation": "sum" + }, + "mem_cached": { + "frequency": 60, + "aggregation": null + }, + "mem_used": { + "frequency": 60, + "aggregation": null + }, + "net_bytes_in": { + "frequency": 60, + "aggregation": null + }, + "net_bytes_out": { + "frequency": 60, + "aggregation": null + }, + "nfs4_read": { + "frequency": 60, + "aggregation": null + }, + "nfs4_total": { + "frequency": 60, + "aggregation": null + }, + "nfs4_write": { + "frequency": 60, + "aggregation": null + }, + "vectorization_ratio": { + "frequency": 60, + "aggregation": "avg" + } + }, + "checkpoints": { + "interval": "12h", + "directory": "/opt/monitoring/cc-metric-store/fritz/checkpoints", + "restore": "48h" + }, + "archive": { + "interval": "50h", + "directory": "/opt/monitoring/cc-metric-store/fritz/archive" + }, + "http-api": { + "address": "0.0.0.0:8082", + "https-cert-file": null, + "https-key-file": null + }, + "retention-in-memory": "48h", + "jwt-public-key": "XZY" +} diff --git a/fau-systems/job-archive/cluster-alex.json b/fau-systems/job-archive/cluster-alex.json new file mode 100644 index 0000000..a669cba --- /dev/null +++ b/fau-systems/job-archive/cluster-alex.json @@ -0,0 +1,484 @@ +{ + "name": "alex", + "metricConfig": [ + { + "name": "cpu_load", + "unit": { + "base": "" + }, + "scope": "node", + "aggregation": "avg", + "timestep": 60, + "peak": 128, + "normal": 128, + "caution": 10, + "alert": 5 + }, + { + "name": "cpu_user", + "unit": { + "base": "" + }, + "scope": "hwthread", + "aggregation": "avg", + "timestep": 60, + "peak": 100, + "normal": 50, + "caution": 20, + "alert": 10 + }, + { + "name": "mem_used", + "unit": { + "base": "B", + "prefix": "G" + }, + "scope": "node", + "aggregation": "sum", + "timestep": 60, + "peak": 512, + "normal": 128, + "caution": 200, + "alert": 240 + }, + { + "name": "flops_any", + "unit": { + "base": "Flops/s", + "prefix": "G" + }, + 
"scope": "hwthread", + "aggregation": "sum", + "timestep": 60, + "peak": 9216, + "normal": 1000, + "caution": 200, + "alert": 50 + }, + { + "name": "mem_bw", + "unit": { + "base": "B/s", + "prefix": "G" + }, + "scope": "socket", + "aggregation": "sum", + "timestep": 60, + "peak": 350, + "normal": 100, + "caution": 50, + "alert": 10 + }, + { + "name": "clock", + "unit": { + "base": "Hz", + "prefix": "M" + }, + "scope": "hwthread", + "aggregation": "avg", + "timestep": 60, + "peak": 3000, + "normal": 2400, + "caution": 1800, + "alert": 1200 + }, + { + "name": "core_power", + "unit": { + "base": "W" + }, + "scope": "hwthread", + "aggregation": "sum", + "timestep": 60, + "peak": 500, + "normal": 250, + "caution": 100, + "alert": 50 + }, + { + "name": "acc_utilization", + "unit": { + "base": "" + }, + "scope": "accelerator", + "aggregation": "avg", + "timestep": 60, + "peak": 100, + "normal": 80, + "caution": 50, + "alert": 20 + }, + { + "name": "acc_mem_used", + "unit": { + "base": "B", + "prefix": "G" + }, + "scope": "accelerator", + "aggregation": "sum", + "timestep": 60, + "peak": 40, + "normal": 20, + "caution": 10, + "alert": 5 + }, + { + "name": "acc_power", + "unit": { + "base": "W" + }, + "scope": "accelerator", + "aggregation": "sum", + "timestep": 60, + "peak": 400, + "normal": 200, + "caution": 50, + "alert": 20 + }, + { + "name": "nv_mem_util", + "unit": { + "base": "" + }, + "scope": "accelerator", + "aggregation": "avg", + "timestep": 60, + "peak": 100, + "normal": 80, + "caution": 20, + "alert": 10 + }, + { + "name": "nv_temp", + "unit": { + "base": "°C" + }, + "scope": "accelerator", + "aggregation": "avg", + "timestep": 60, + "peak": 40, + "normal": 20, + "caution": 5, + "alert": 2 + }, + { + "name": "nv_sm_clock", + "unit": { + "base": "Hz", + "prefix": "M" + }, + "scope": "accelerator", + "aggregation": "avg", + "timestep": 60, + "peak": 1400, + "normal": 1200, + "caution": 100, + "alert": 50 + }, + { + "name": "cpu_power", + "unit": { + "base": "W" 
+ }, + "scope": "socket", + "aggregation": "sum", + "timestep": 60, + "peak": 500, + "normal": 250, + "caution": 100, + "alert": 50 + }, + { + "name": "ipc", + "unit": { + "base": "IPC" + }, + "scope": "hwthread", + "aggregation": "avg", + "timestep": 60, + "peak": 4, + "normal": 2, + "caution": 1, + "alert": 0.5 + } + ], + "subClusters": [ + { + "name": "a40", + "nodes": "a[0121-0129],a[0221-0229],a[0321-0329],a[0421-0429],a[0521-0522]", + "processorType": "AMD Milan", + "socketsPerNode": 2, + "coresPerSocket": 64, + "threadsPerCore": 1, + "flopRateScalar": { + "unit": { + "base": "F/s", + "prefix": "G" + }, + "value": 432 + }, + "flopRateSimd": { + "unit": { + "base": "F/s", + "prefix": "G" + }, + "value": 9216 + }, + "memoryBandwidth": { + "unit": { + "base": "B/s", + "prefix": "G" + }, + "value": 400 + }, + "topology": { + "node": [ + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127 + ], + "socket": [ + [ + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63 + ], + [ + 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127 + ] + ], + "memoryDomain": [ + [ + 0, 1, 2, 3, 4, 5, 6, 
7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127 + ] + ], + "core": [ + [ 0 ], [ 1 ], [ 2 ], [ 3 ], [ 4 ], [ 5 ], [ 6 ], [ 7 ], [ 8 ], [ 9 ], [ 10 ], [ 11 ], [ 12 ], [ 13 ], [ 14 ], [ 15 ], [ 16 ], [ 17 ], [ 18 ], [ 19 ], [ 20 ], [ 21 ], [ 22 ], [ 23 ], [ 24 ], [ 25 ], [ 26 ], [ 27 ], [ 28 ], [ 29 ], [ 30 ], [ 31 ], [ 32 ], [ 33 ], [ 34 ], [ 35 ], [ 36 ], [ 37 ], [ 38 ], [ 39 ], [ 40 ], [ 41 ], [ 42 ], [ 43 ], [ 44 ], [ 45 ], [ 46 ], [ 47 ], [ 48 ], [ 49 ], [ 50 ], [ 51 ], [ 52 ], [ 53 ], [ 54 ], [ 55 ], [ 56 ], [ 57 ], [ 58 ], [ 59 ], [ 60 ], [ 61 ], [ 62 ], [ 63 ], [ 64 ], [ 65 ], [ 66 ], [ 67 ], [ 68 ], [ 69 ], [ 70 ], [ 71 ], [ 72 ], [ 73 ], [ 74 ], [ 75 ], [ 76 ], [ 77 ], [ 78 ], [ 79 ], [ 80 ], [ 81 ], [ 82 ], [ 83 ], [ 84 ], [ 85 ], [ 86 ], [ 87 ], [ 88 ], [ 89 ], [ 90 ], [ 91 ], [ 92 ], [ 93 ], [ 94 ], [ 95 ], [ 96 ], [ 97 ], [ 98 ], [ 99 ], [ 100 ], [ 101 ], [ 102 ], [ 103 ], [ 104 ], [ 105 ], [ 106 ], [ 107 ], [ 108 ], [ 109 ], [ 110 ], [ 111 ], [ 112 ], [ 113 ], [ 114 ], [ 115 ], [ 116 ], [ 117 ], [ 118 ], [ 119 ], [ 120 ], [ 121 ], [ 122 ], [ 123 ], [ 124 ], [ 125 ], [ 126 ], [ 127 ] + ], + "accelerators": [ + { + "id": "00000000:01:00.0", + "type": "Nvidia GPU", + "model": "A40" + }, + { + "id": "00000000:25:00.0", + "type": "Nvidia GPU", + "model": "A40" + }, + { + "id": "00000000:41:00.0", + "type": "Nvidia GPU", + "model": "A40" + }, + { + "id": "00000000:61:00.0", + "type": "Nvidia GPU", + "model": "A40" + }, + { + "id": "00000000:81:00.0", + "type": "Nvidia GPU", + "model": "A40" + }, + { + "id":
"00000000:A1:00.0", + "type": "Nvidia GPU", + "model": "A40" + }, + { + "id": "00000000:C1:00.0", + "type": "Nvidia GPU", + "model": "A40" + }, + { + "id": "00000000:E1:00.0", + "type": "Nvidia GPU", + "model": "A40" + } + ] + } + }, + { + "name": "a100", + "nodes": "a[0601-0605],a[0701-0705],a[0801-0805],a[0901-0905]", + "processorType": "AMD Milan", + "socketsPerNode": 2, + "coresPerSocket": 64, + "threadsPerCore": 1, + "flopRateScalar": { + "unit": { + "base": "F/s", + "prefix": "G" + }, + "value": 432 + }, + "flopRateSimd": { + "unit": { + "base": "F/s", + "prefix": "G" + }, + "value": 9216 + }, + "memoryBandwidth": { + "unit": { + "base": "B/s", + "prefix": "G" + }, + "value": 400 + }, + "topology": { + "node": [ + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127 + ], + "socket": [ + [ + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63 + ], + [ + 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127 + ] + ], + "memoryDomain": [ + [ + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 
38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127 + ] + ], + "core": [ + [ 0 ], [ 1 ], [ 2 ], [ 3 ], [ 4 ], [ 5 ], [ 6 ], [ 7 ], [ 8 ], [ 9 ], [ 10 ], [ 11 ], [ 12 ], [ 13 ], [ 14 ], [ 15 ], [ 16 ], [ 17 ], [ 18 ], [ 19 ], [ 20 ], [ 21 ], [ 22 ], [ 23 ], [ 24 ], [ 25 ], [ 26 ], [ 27 ], [ 28 ], [ 29 ], [ 30 ], [ 31 ], [ 32 ], [ 33 ], [ 34 ], [ 35 ], [ 36 ], [ 37 ], [ 38 ], [ 39 ], [ 40 ], [ 41 ], [ 42 ], [ 43 ], [ 44 ], [ 45 ], [ 46 ], [ 47 ], [ 48 ], [ 49 ], [ 50 ], [ 51 ], [ 52 ], [ 53 ], [ 54 ], [ 55 ], [ 56 ], [ 57 ], [ 58 ], [ 59 ], [ 60 ], [ 61 ], [ 62 ], [ 63 ], [ 64 ], [ 65 ], [ 66 ], [ 67 ], [ 68 ], [ 69 ], [ 70 ], [ 71 ], [ 72 ], [ 73 ], [ 74 ], [ 75 ], [ 76 ], [ 77 ], [ 78 ], [ 79 ], [ 80 ], [ 81 ], [ 82 ], [ 83 ], [ 84 ], [ 85 ], [ 86 ], [ 87 ], [ 88 ], [ 89 ], [ 90 ], [ 91 ], [ 92 ], [ 93 ], [ 94 ], [ 95 ], [ 96 ], [ 97 ], [ 98 ], [ 99 ], [ 100 ], [ 101 ], [ 102 ], [ 103 ], [ 104 ], [ 105 ], [ 106 ], [ 107 ], [ 108 ], [ 109 ], [ 110 ], [ 111 ], [ 112 ], [ 113 ], [ 114 ], [ 115 ], [ 116 ], [ 117 ], [ 118 ], [ 119 ], [ 120 ], [ 121 ], [ 122 ], [ 123 ], [ 124 ], [ 125 ], [ 126 ], [ 127 ] + ], + "accelerators": [ + { + "id": "00000000:0E:00.0", + "type": "Nvidia GPU", + "model": "A100" + }, + { + "id": "00000000:13:00.0", + "type": "Nvidia GPU", + "model": "A100" + }, + { + "id": "00000000:49:00.0", + "type": "Nvidia GPU", + "model": "A100" + }, + { + "id": "00000000:4F:00.0", + "type": "Nvidia GPU", + "model": "A100" + }, + { + "id": "00000000:90:00.0", + "type": "Nvidia GPU", + "model": "A100" + }, + { + "id": "00000000:96:00.0", + "type": "Nvidia GPU", + "model": "A100" + }, + { + "id": "00000000:CC:00.0", + "type": "Nvidia GPU", +
"model": "A100" + }, + { + "id": "00000000:D1:00.0", + "type": "Nvidia GPU", + "model": "A100" + } + ] + } + }, + { + "name": "a100m80", + "nodes": "a[0531-0537],a[0631-0633],a0831,a[0931-0934]", + "processorType": "AMD Milan", + "socketsPerNode": 2, + "coresPerSocket": 64, + "threadsPerCore": 1, + "flopRateScalar": { + "unit": { + "base": "F/s", + "prefix": "G" + }, + "value": 432 + }, + "flopRateSimd": { + "unit": { + "base": "F/s", + "prefix": "G" + }, + "value": 9216 + }, + "memoryBandwidth": { + "unit": { + "base": "B/s", + "prefix": "G" + }, + "value": 400 + }, + "topology": { + "node": [ + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127 + ], + "socket": [ + [ + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63 + ], + [ + 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127 + ] + ], + "memoryDomain": [ + [ + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 
69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127 + ] + ], + "core": [ + [ 0 ], [ 1 ], [ 2 ], [ 3 ], [ 4 ], [ 5 ], [ 6 ], [ 7 ], [ 8 ], [ 9 ], [ 10 ], [ 11 ], [ 12 ], [ 13 ], [ 14 ], [ 15 ], [ 16 ], [ 17 ], [ 18 ], [ 19 ], [ 20 ], [ 21 ], [ 22 ], [ 23 ], [ 24 ], [ 25 ], [ 26 ], [ 27 ], [ 28 ], [ 29 ], [ 30 ], [ 31 ], [ 32 ], [ 33 ], [ 34 ], [ 35 ], [ 36 ], [ 37 ], [ 38 ], [ 39 ], [ 40 ], [ 41 ], [ 42 ], [ 43 ], [ 44 ], [ 45 ], [ 46 ], [ 47 ], [ 48 ], [ 49 ], [ 50 ], [ 51 ], [ 52 ], [ 53 ], [ 54 ], [ 55 ], [ 56 ], [ 57 ], [ 58 ], [ 59 ], [ 60 ], [ 61 ], [ 62 ], [ 63 ], [ 64 ], [ 65 ], [ 66 ], [ 67 ], [ 68 ], [ 69 ], [ 70 ], [ 71 ], [ 72 ], [ 73 ], [ 74 ], [ 75 ], [ 76 ], [ 77 ], [ 78 ], [ 79 ], [ 80 ], [ 81 ], [ 82 ], [ 83 ], [ 84 ], [ 85 ], [ 86 ], [ 87 ], [ 88 ], [ 89 ], [ 90 ], [ 91 ], [ 92 ], [ 93 ], [ 94 ], [ 95 ], [ 96 ], [ 97 ], [ 98 ], [ 99 ], [ 100 ], [ 101 ], [ 102 ], [ 103 ], [ 104 ], [ 105 ], [ 106 ], [ 107 ], [ 108 ], [ 109 ], [ 110 ], [ 111 ], [ 112 ], [ 113 ], [ 114 ], [ 115 ], [ 116 ], [ 117 ], [ 118 ], [ 119 ], [ 120 ], [ 121 ], [ 122 ], [ 123 ], [ 124 ], [ 125 ], [ 126 ], [ 127 ] + ], + "accelerators": [ + { + "id": "00000000:0E:00.0", + "type": "Nvidia GPU", + "model": "A100" + }, + { + "id": "00000000:13:00.0", + "type": "Nvidia GPU", + "model": "A100" + }, + { + "id": "00000000:49:00.0", + "type": "Nvidia GPU", + "model": "A100" + }, + { + "id": "00000000:4F:00.0", + "type": "Nvidia GPU", + "model": "A100" + }, + { + "id": "00000000:90:00.0", + "type": "Nvidia GPU", + "model": "A100" + }, + { + "id": "00000000:96:00.0", + "type": "Nvidia GPU", + "model": "A100" + }, + { + "id": "00000000:CC:00.0", + "type": "Nvidia GPU", + "model": "A100" + }, + { + "id": "00000000:D1:00.0", + "type": "Nvidia GPU", + "model": "A100" + } + ] + } + } + ] +} \ No
newline at end of file diff --git a/fau-systems/job-archive/cluster-emmy.json b/fau-systems/job-archive/cluster-emmy.json new file mode 100644 index 0000000..59502e5 --- /dev/null +++ b/fau-systems/job-archive/cluster-emmy.json @@ -0,0 +1,164 @@ +{ + "name": "emmy", + "subClusters": [ + { + "name": "main", + "numberOfNode": 560, + "processorType": "Intel IvyBridge", + "socketsPerNode": 2, + "coresPerSocket": 10, + "threadsPerCore": 2, + "flopRateScalar": 88, + "flopRateSimd": 704, + "memoryBandwidth": 80, + "topology": { + "node": [0,20,1,21,2,22,3,23,4,24,5,25,6,26,7,27,8,28,9,29,10,30,11,31,12,32,13,33,14,34,15,35,16,36,17,37,18,38,19,39], + "socket": [ + [0,20,1,21,2,22,3,23,4,24,5,25,6,26,7,27,8,28,9,29], + [10,30,11,31,12,32,13,33,14,34,15,35,16,36,17,37,18,38,19,39] + ], + "memoryDomain": [ + [0,20,1,21,2,22,3,23,4,24,5,25,6,26,7,27,8,28,9,29], + [10,30,11,31,12,32,13,33,14,34,15,35,16,36,17,37,18,38,19,39] + ], + "core": [ + [0,20],[1,21],[2,22],[3,23],[4,24],[5,25],[6,26],[7,27],[8,28],[9,29],[10,30],[11,31],[12,32],[13,33],[14,34],[15,35],[16,36],[17,37],[18,38],[19,39] + ] + } + } + ], + "metricConfig": [ + { + "name": "cpu_load", + "scope": "node", + "unit": "load", + "timestep": 60, + "aggregation": null, + "peak": 40, + "normal": 20, + "caution": 15, + "alert": 10, + "measurement": "data" + }, + { + "name": "mem_used", + "scope": "node", + "unit": "GB", + "timestep": 60, + "aggregation": null, + "peak": 64, + "normal": 20, + "caution": 40, + "alert": 55, + "measurement": "data" + }, + { + "name": "flops_any", + "scope": "node", + "unit": "GF/s", + "timestep": 60, + "aggregation": "sum", + "peak": 704, + "normal": 100, + "caution": 20, + "alert": 2, + "measurement": "data" + }, + { + "name": "flops_sp", + "scope": "node", + "unit": "GF/s", + "timestep": 60, + "aggregation": "sum", + "peak": 704, + "normal": 100, + "caution": 20, + "alert": 2, + "measurement": "data" + }, + { + "name": "flops_dp", + "scope": "node", + "unit": "GF/s", + "timestep": 60, + 
"aggregation": "sum", + "peak": 350, + "normal": 50, + "caution": 10, + "alert": 2, + "measurement": "data" + }, + { + "name": "mem_bw", + "scope": "node", + "unit": "GB/s", + "timestep": 60, + "aggregation": "sum", + "peak": 80, + "normal": 30, + "caution": 10, + "alert": 5, + "measurement": "data" + }, + { + "name": "ipc", + "scope": "node", + "unit": "IPC", + "timestep": 60, + "aggregation": "avg", + "peak": 4, + "normal": 2, + "caution": 1, + "alert": 0.5, + "measurement": "data" + }, + { + "name": "clock", + "scope": "node", + "unit": "MHz", + "timestep": 60, + "aggregation": "avg", + "peak": 3000, + "normal": 2200, + "caution": 1800, + "alert": 1200, + "measurement": "data" + }, + { + "name": "rapl_power", + "scope": "node", + "unit": "W", + "timestep": 60, + "aggregation": "sum", + "peak": 160, + "normal": 120, + "caution": 45, + "alert": 10, + "measurement": "data" + }, + { + "name": "ib_bw", + "scope": "node", + "unit": "GB/s", + "timestep": 60, + "aggregation": null, + "peak": 6, + "normal": 2, + "caution": 1, + "alert": 0.5, + "measurement": "data" + }, + { + "name": "lustre_bw", + "scope": "node", + "unit": "GB/s", + "timestep": 60, + "aggregation": null, + "peak": 4, + "normal": 2, + "caution": 1, + "alert": 0.5, + "measurement": "data" + } + ] +} diff --git a/fau-systems/job-archive/cluster-fritz.json b/fau-systems/job-archive/cluster-fritz.json new file mode 100644 index 0000000..9437a8d --- /dev/null +++ b/fau-systems/job-archive/cluster-fritz.json @@ -0,0 +1,540 @@ +{ + "name": "fritz", + "metricConfig": [ + { + "name": "cpu_load", + "unit": { + "base": "" + }, + "scope": "node", + "aggregation": "avg", + "timestep": 60, + "peak": 72, + "normal": 72, + "caution": 36, + "alert": 20, + "subClusters": [ + { + "name": "spr1tb", + "peak": 104, + "normal": 104, + "caution": 52, + "alert": 20 + }, + { + "name": "spr2tb", + "peak": 104, + "normal": 104, + "caution": 52, + "alert": 20 + } + ] + }, + { + "name": "cpu_user", + "unit": { + "base": "" + }, + 
"scope": "hwthread", + "aggregation": "avg", + "timestep": 60, + "peak": 100, + "normal": 50, + "caution": 20, + "alert": 10 + }, + { + "name": "mem_used", + "unit": { + "base": "B", + "prefix": "G" + }, + "scope": "node", + "aggregation": "sum", + "timestep": 60, + "peak": 256, + "normal": 128, + "caution": 200, + "alert": 240, + "subClusters": [ + { + "name": "spr1tb", + "peak": 1024, + "normal": 512, + "caution": 900, + "alert": 1000 + }, + { + "name": "spr2tb", + "peak": 2048, + "normal": 1024, + "caution": 1800, + "alert": 2000 + } + ] + }, + { + "name": "flops_any", + "unit": { + "base": "Flops/s", + "prefix": "G" + }, + "scope": "hwthread", + "aggregation": "sum", + "timestep": 60, + "peak": 5600, + "normal": 1000, + "caution": 200, + "alert": 50, + "subClusters": [ + { + "name": "spr1tb", + "peak": 6656, + "normal": 1500, + "caution": 400, + "alert": 50, + "remove": true + }, + { + "name": "spr2tb", + "peak": 6656, + "normal": 1500, + "caution": 400, + "alert": 50, + "remove": true + } + ] + }, + { + "name": "flops_sp", + "unit": { + "base": "Flops/s", + "prefix": "G" + }, + "scope": "hwthread", + "aggregation": "sum", + "timestep": 60, + "peak": 5600, + "normal": 1000, + "caution": 200, + "alert": 50, + "subClusters": [ + { + "name": "spr1tb", + "peak": 6656, + "normal": 1500, + "caution": 400, + "alert": 50, + "remove": true + }, + { + "name": "spr2tb", + "peak": 6656, + "normal": 1500, + "caution": 400, + "alert": 50, + "remove": true + } + ] + }, + { + "name": "flops_dp", + "unit": { + "base": "Flops/s", + "prefix": "G" + }, + "scope": "hwthread", + "aggregation": "sum", + "timestep": 60, + "peak": 2300, + "normal": 500, + "caution": 100, + "alert": 50, + "subClusters": [ + { + "name": "spr1tb", + "peak": 3300, + "normal": 750, + "caution": 200, + "alert": 50, + "remove": true + }, + { + "name": "spr2tb", + "peak": 3300, + "normal": 750, + "caution": 200, + "alert": 50, + "remove": true + } + ] + }, + { + "name": "mem_bw", + "unit": { + "base": "B/s", + 
"prefix": "G" + }, + "scope": "socket", + "aggregation": "sum", + "timestep": 60, + "peak": 350, + "normal": 100, + "caution": 50, + "alert": 10, + "subClusters": [ + { + "name": "spr1tb", + "peak": 549, + "normal": 200, + "caution": 100, + "alert": 20, + "remove": true + }, + { + "name": "spr2tb", + "peak": 520, + "normal": 200, + "caution": 100, + "alert": 20, + "remove": true + } + ] + }, + { + "name": "clock", + "unit": { + "base": "Hz", + "prefix": "M" + }, + "scope": "hwthread", + "aggregation": "avg", + "timestep": 60, + "peak": 3000, + "normal": 2400, + "caution": 1800, + "alert": 1200, + "subClusters": [ + { + "name": "spr1tb", + "peak": 549, + "normal": 2000, + "caution": 1600, + "alert": 1200, + "remove": true + }, + { + "name": "spr2tb", + "peak": 520, + "normal": 2000, + "caution": 1600, + "alert": 1200, + "remove": true + } + ] + }, + { + "name": "cpu_power", + "unit": { + "base": "W" + }, + "scope": "socket", + "aggregation": "sum", + "timestep": 60, + "peak": 500, + "normal": 250, + "caution": 100, + "alert": 50 + }, + { + "name": "mem_power", + "unit": { + "base": "W" + }, + "scope": "socket", + "aggregation": "sum", + "timestep": 60, + "peak": 100, + "normal": 50, + "caution": 20, + "alert": 10 + }, + { + "name": "ipc", + "unit": { + "base": "IPC" + }, + "scope": "hwthread", + "aggregation": "avg", + "timestep": 60, + "peak": 4, + "normal": 2, + "caution": 1, + "alert": 0.5 + }, + { + "name": "vectorization_ratio", + "unit": { + "base": "" + }, + "scope": "hwthread", + "aggregation": "avg", + "timestep": 60, + "peak": 100, + "normal": 60, + "caution": 40, + "alert": 10 + }, + { + "name": "ib_recv", + "unit": { + "base": "B/s" + }, + "scope": "node", + "aggregation": "sum", + "timestep": 60, + "peak": 1250000, + "normal": 6000000, + "caution": 200, + "alert": 1 + }, + { + "name": "ib_xmit", + "unit": { + "base": "B/s" + }, + "scope": "node", + "aggregation": "sum", + "timestep": 60, + "peak": 1250000, + "normal": 6000000, + "caution": 200, + 
"alert": 1 + }, + { + "name": "ib_recv_pkts", + "unit": { + "base": "packets/s" + }, + "scope": "node", + "aggregation": "sum", + "timestep": 60, + "peak": 6, + "normal": 4, + "caution": 2, + "alert": 1 + }, + { + "name": "ib_xmit_pkts", + "unit": { + "base": "packets/s" + }, + "scope": "node", + "aggregation": "sum", + "timestep": 60, + "peak": 6, + "normal": 4, + "caution": 2, + "alert": 1 + }, + { + "name": "nfs4_read", + "unit": { + "base": "B/s", + "prefix": "M" + }, + "scope": "node", + "aggregation": "sum", + "timestep": 60, + "peak": 6, + "normal": 4, + "caution": 2, + "alert": 1 + }, + { + "name": "nfs4_write", + "unit": { + "base": "B/s", + "prefix": "M" + }, + "scope": "node", + "aggregation": "sum", + "timestep": 60, + "peak": 6, + "normal": 4, + "caution": 2, + "alert": 1 + }, + { + "name": "nfs4_total", + "unit": { + "base": "B/s", + "prefix": "M" + }, + "scope": "node", + "aggregation": "sum", + "timestep": 60, + "peak": 6, + "normal": 4, + "caution": 2, + "alert": 1 + } + ], + "subClusters": [ + { + "name": "main", + "nodes": "f[0101-0188,0201-0288,0301-0388,0401-0488,0501-0588,0601-0688,0701-0788,0801-0888,0901-0988,1001-1088,1101-1156,1201-1256]", + "processorType": "Intel Icelake", + "socketsPerNode": 2, + "coresPerSocket": 36, + "threadsPerCore": 1, + "flopRateScalar": { + "unit": { + "base": "F/s", + "prefix": "G" + }, + "value": 432 + }, + "flopRateSimd": { + "unit": { + "base": "F/s", + "prefix": "G" + }, + "value": 9216 + }, + "memoryBandwidth": { + "unit": { + "base": "B/s", + "prefix": "G" + }, + "value": 350 + }, + "topology": { + "node": [ + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71 + ], + "socket": [ + [ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 
28, 29, 30, 31, 32, 33, 34, 35 ], + [ 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71 ] + ], + "memoryDomain": [ + [ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17 ], + [ 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35 ], + [ 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53 ], + [ 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71 ] + ], + "core": [ + [ 0 ], [ 1 ], [ 2 ], [ 3 ], [ 4 ], [ 5 ], [ 6 ], [ 7 ], [ 8 ], [ 9 ], [ 10 ], [ 11 ], [ 12 ], [ 13 ], [ 14 ], [ 15 ], [ 16 ], [ 17 ], [ 18 ], [ 19 ], [ 20 ], [ 21 ], [ 22 ], [ 23 ], [ 24 ], [ 25 ], [ 26 ], [ 27 ], [ 28 ], [ 29 ], [ 30 ], [ 31 ], [ 32 ], [ 33 ], [ 34 ], [ 35 ], [ 36 ], [ 37 ], [ 38 ], [ 39 ], [ 40 ], [ 41 ], [ 42 ], [ 43 ], [ 44 ], [ 45 ], [ 46 ], [ 47 ], [ 48 ], [ 49 ], [ 50 ], [ 51 ], [ 52 ], [ 53 ], [ 54 ], [ 55 ], [ 56 ], [ 57 ], [ 58 ], [ 59 ], [ 60 ], [ 61 ], [ 62 ], [ 63 ], [ 64 ], [ 65 ], [ 66 ], [ 67 ], [ 68 ], [ 69 ], [ 70 ], [ 71 ] + ] + } + }, + { + "name": "spr1tb", + "processorType": "Intel(R) Xeon(R) Platinum 8470", + "socketsPerNode": 2, + "coresPerSocket": 52, + "threadsPerCore": 1, + "flopRateScalar": { + "unit": { + "base": "F/s", + "prefix": "G" + }, + "value": 695 + }, + "flopRateSimd": { + "unit": { + "base": "F/s", + "prefix": "G" + }, + "value": 9216 + }, + "memoryBandwidth": { + "unit": { + "base": "B/s", + "prefix": "G" + }, + "value": 549 + }, + "nodes": "f[2157-2188,2257-2288]", + "topology": { + "node": + [ + 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79,80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95,96,97,98,99,100,101,102,103 + ], + "socket": + [ + 
[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51], + [52,53,54,55,56,57,58,59,60,61,62,63,64,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79,80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95,96,97,98,99,100,101,102,103] + ], + "memoryDomain": [ + [0,1,2,3,4,5,6,7,8,9,10,11,12], + [13,14,15,16,17,18,19,20,21,22,23,24,25], + [26,27,28,29,30,31,32,33,34,35,36,37,38], + [39,40,41,42,43,44,45,46,47,48,49,50,51], + [52,53,54,55,56,57,58,59,60,61,62,63,64], + [65,66,67,68,69,70,71,72,73,74,75,76,77], + [78,79,80,81,82,83,84,85,86,87,88,89,90], + [91,92,93,94,95,96,97,98,99,100,101,102,103] + ], + "core": [ + [0],[1],[2],[3],[4],[5],[6],[7],[8],[9],[10],[11],[12],[13],[14],[15],[16],[17],[18],[19],[20],[21],[22],[23],[24],[25],[26],[27],[28],[29],[30],[31],[32],[33],[34],[35],[36],[37],[38],[39],[40],[41],[42],[43],[44],[45],[46],[47],[48],[49],[50],[51],[52],[53],[54],[55],[56],[57],[58],[59],[60],[61],[62],[63],[64],[65],[66],[67],[68],[69],[70],[71],[72],[73],[74],[75],[76],[77],[78],[79],[80],[81],[82],[83],[84],[85],[86],[87],[88],[89],[90],[91],[92],[93],[94],[95],[96],[97],[98],[99],[100],[101],[102],[103] + ] + } + }, + { + "name": "spr2tb", + "processorType": "Intel(R) Xeon(R) Platinum 8470", + "socketsPerNode": 2, + "coresPerSocket": 52, + "threadsPerCore": 1, + "flopRateScalar": { + "unit": { + "base": "F/s", + "prefix": "G" + }, + "value": 695 + }, + "flopRateSimd": { + "unit": { + "base": "F/s", + "prefix": "G" + }, + "value": 9216 + }, + "memoryBandwidth": { + "unit": { + "base": "B/s", + "prefix": "G" + }, + "value": 515 + }, + "nodes": "f[2181-2188,2281-2288]", + "topology": { + "node": [ + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 
72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103 + ], + "socket": [ + [ + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51 + ], + [ + 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103 + ] + ], + "memoryDomain": [ + [ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12 ], + [ 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25 ], + [ 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38 ], + [ 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51 ], + [ 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64 ], + [ 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77 ], + [ 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90 ], + [ 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103 ] + ], + "core": [ + [ 0 ], [ 1 ], [ 2 ], [ 3 ], [ 4 ], [ 5 ], [ 6 ], [ 7 ], [ 8 ], [ 9 ], [ 10 ], [ 11 ], [ 12 ], [ 13 ], [ 14 ], [ 15 ], [ 16 ], [ 17 ], [ 18 ], [ 19 ], [ 20 ], [ 21 ], [ 22 ], [ 23 ], [ 24 ], [ 25 ], [ 26 ], [ 27 ], [ 28 ], [ 29 ], [ 30 ], [ 31 ], [ 32 ], [ 33 ], [ 34 ], [ 35 ], [ 36 ], [ 37 ], [ 38 ], [ 39 ], [ 40 ], [ 41 ], [ 42 ], [ 43 ], [ 44 ], [ 45 ], [ 46 ], [ 47 ], [ 48 ], [ 49 ], [ 50 ], [ 51 ], [ 52 ], [ 53 ], [ 54 ], [ 55 ], [ 56 ], [ 57 ], [ 58 ], [ 59 ], [ 60 ], [ 61 ], [ 62 ], [ 63 ], [ 64 ], [ 65 ], [ 66 ], [ 67 ], [ 68 ], [ 69 ], [ 70 ], [ 71 ], [ 72 ], [ 73 ], [ 74 ], [ 75 ], [ 76 ], [ 77 ], [ 78 ], [ 79 ], [ 80 ], [ 81 ], [ 82 ], [ 83 ], [ 84 ], [ 85 ], [ 86 ], [ 87 ], [ 88 ], [ 89 ], [ 90 ], [ 91 ], [ 92 ], [ 93 ], [ 94 ], [ 95 ], [ 96 ], [ 97 ], [ 98 ], [ 99 ], [ 100 ], [ 101 ], [ 102 ], [ 103 ] + ] + } + } + ] +} \ No newline at end of file diff --git 
a/fau-systems/job-archive/cluster-meggie.json b/fau-systems/job-archive/cluster-meggie.json new file mode 100644 index 0000000..bfaeb99 --- /dev/null +++ b/fau-systems/job-archive/cluster-meggie.json @@ -0,0 +1,243 @@ +{ + "name": "meggie", + "metricConfig": [ + { + "name": "cpu_load", + "unit": { + "base": "load" + }, + "scope": "node", + "aggregation": "avg", + "timestep": 60, + "peak": 40, + "normal": 20, + "caution": 15, + "alert": 10 + }, + { + "name": "mem_used", + "unit": { + "base": "B", + "prefix": "G" + }, + "scope": "node", + "aggregation": "sum", + "timestep": 60, + "peak": 64, + "normal": 20, + "caution": 40, + "alert": 55 + }, + { + "name": "flops_any", + "unit": { + "base": "Flops/s", + "prefix": "G" + }, + "scope": "node", + "aggregation": "sum", + "timestep": 60, + "peak": 1536, + "normal": 200, + "caution": 40, + "alert": 4 + }, + { + "name": "flops_sp", + "unit": { + "base": "Flops/s", + "prefix": "G" + }, + "scope": "node", + "aggregation": "sum", + "timestep": 60, + "peak": 1536, + "normal": 100, + "caution": 20, + "alert": 2 + }, + { + "name": "flops_dp", + "unit": { + "base": "Flops/s", + "prefix": "G" + }, + "scope": "node", + "aggregation": "sum", + "timestep": 60, + "peak": 768, + "normal": 50, + "caution": 10, + "alert": 2 + }, + { + "name": "mem_bw", + "unit": { + "base": "B/s", + "prefix": "G" + }, + "scope": "node", + "aggregation": "sum", + "timestep": 60, + "peak": 140, + "normal": 70, + "caution": 20, + "alert": 5 + }, + { + "name": "clock", + "unit": { + "base": "Hz", + "prefix": "M" + }, + "scope": "node", + "aggregation": "avg", + "timestep": 60, + "peak": 3000, + "normal": 2400, + "caution": 1800, + "alert": 1200 + }, + { + "name": "cpu_power", + "unit": { + "base": "W" + }, + "scope": "socket", + "aggregation": "sum", + "timestep": 60, + "peak": 80, + "normal": 30, + "caution": 10, + "alert": 5 + }, + { + "name": "mem_power", + "unit": { + "base": "W" + }, + "scope": "socket", + "aggregation": "sum", + "timestep": 60, + "peak": 
100, + "normal": 50, + "caution": 20, + "alert": 10 + }, + { + "name": "ipc", + "unit": { + "base": "IPC" + }, + "scope": "node", + "aggregation": "avg", + "timestep": 60, + "peak": 4, + "normal": 2, + "caution": 1, + "alert": 0.5 + }, + { + "name": "vectorization_ratio", + "unit": { + "base": "" + }, + "scope": "hwthread", + "aggregation": "avg", + "timestep": 60, + "peak": 100, + "normal": 60, + "caution": 40, + "alert": 10 + }, + { + "name": "nfs4_read", + "unit": { + "base": "B/s", + "prefix": "M" + }, + "scope": "node", + "aggregation": "sum", + "timestep": 60, + "peak": 6, + "normal": 4, + "caution": 2, + "alert": 1 + }, + { + "name": "nfs4_write", + "unit": { + "base": "B/s", + "prefix": "M" + }, + "scope": "node", + "aggregation": "sum", + "timestep": 60, + "peak": 6, + "normal": 4, + "caution": 2, + "alert": 1 + }, + { + "name": "nfs4_total", + "unit": { + "base": "B/s", + "prefix": "M" + }, + "scope": "node", + "aggregation": "sum", + "timestep": 60, + "peak": 6, + "normal": 4, + "caution": 2, + "alert": 1 + } + ], + "subClusters": [ + { + "name": "main", + "nodes": "m[0101-0164,0201-0264,0301-0364,0401-0464,0601-0676,0701-0776,0801-0872,0901-0972,1001-1072,1101-1172]", + "processorType": "Intel Broadwell", + "socketsPerNode": 2, + "coresPerSocket": 10, + "threadsPerCore": 1, + "flopRateScalar": { + "unit": { + "base": "F/s", + "prefix": "G" + }, + "value": 96 + }, + "flopRateSimd": { + "unit": { + "base": "F/s", + "prefix": "G" + }, + "value": 1536 + }, + "memoryBandwidth": { + "unit": { + "base": "B/s", + "prefix": "G" + }, + "value": 140 + }, + "topology": { + "node": [ + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19 + ], + "socket": [ + [ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9 ], + [ 10, 11, 12, 13, 14, 15, 16, 17, 18, 19 ] + ], + "memoryDomain": [ + [ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9 ], + [ 10, 11, 12, 13, 14, 15, 16, 17, 18, 19 ] + ], + "core": [ + [ 0 ], [ 1 ], [ 2 ], [ 3 ], [ 4 ], [ 5 ], [ 6 ], [ 7 ], [ 8 ], [ 9 ], [ 10 ], [ 11 ], [ 12 ], 
[ 13 ], [ 14 ], [ 15 ], [ 16 ], [ 17 ], [ 18 ], [ 19 ] + ] + } + } + ] +} \ No newline at end of file diff --git a/fau-systems/job-archive/cluster-tinyfat.json b/fau-systems/job-archive/cluster-tinyfat.json new file mode 100644 index 0000000..2d22b11 --- /dev/null +++ b/fau-systems/job-archive/cluster-tinyfat.json @@ -0,0 +1,466 @@ +{ + "name": "tinyfat", + "metricConfig": [ + { + "name": "cpu_load", + "unit": { + "base": "" + }, + "scope": "node", + "aggregation": "avg", + "timestep": 60, + "peak": 128, + "normal": 64, + "caution": 60, + "alert": 20, + "subClusters": [ + { + "name": "broadwell_256gb", + "peak": 24, + "normal": 24, + "caution": 12, + "alert": 10 + }, + { + "name": "broadwell_512gb", + "peak": 56, + "normal": 56, + "caution": 28, + "alert": 20 + } + ] + }, + { + "name": "cpu_user", + "unit": { + "base": "" + }, + "scope": "hwthread", + "aggregation": "avg", + "timestep": 60, + "peak": 100, + "normal": 50, + "caution": 20, + "alert": 10 + }, + { + "name": "mem_used", + "unit": { + "base": "B", + "prefix": "G" + }, + "scope": "node", + "aggregation": "sum", + "timestep": 60, + "peak": 512, + "normal": 256, + "caution": 480, + "alert": 500, + "subClusters": [ + { + "name": "broadwell_256gb", + "peak": 256, + "normal": 128, + "caution": 200, + "alert": 240 + }, + { + "name": "broadwell_512gb", + "peak": 512, + "normal": 256, + "caution": 480, + "alert": 500 + } + ] + }, + { + "name": "flops_any", + "unit": { + "base": "Flops/s", + "prefix": "G" + }, + "scope": "hwthread", + "aggregation": "sum", + "timestep": 60, + "peak": 2560, + "normal": 800, + "caution": 100, + "alert": 20, + "subClusters": [ + { + "name": "broadwell_256gb", + "peak": 653, + "normal": 200, + "caution": 50, + "alert": 10 + }, + { + "name": "broadwell_512gb", + "peak": 1075, + "normal": 500, + "caution": 80, + "alert": 20 + } + ] + }, + { + "name": "flops_sp", + "unit": { + "base": "Flops/s", + "prefix": "G" + }, + "scope": "hwthread", + "aggregation": "sum", + "timestep": 60, + 
"peak": 5600, + "normal": 1000, + "caution": 200, + "alert": 50, + "subClusters": [ + { + "name": "broadwell_256gb", + "peak": 653, + "normal": 200, + "caution": 50, + "alert": 10 + }, + { + "name": "broadwell_512gb", + "peak": 1075, + "normal": 500, + "caution": 80, + "alert": 20 + } + ] + }, + { + "name": "flops_dp", + "unit": { + "base": "Flops/s", + "prefix": "G" + }, + "scope": "hwthread", + "aggregation": "sum", + "timestep": 60, + "peak": 2300, + "normal": 500, + "caution": 100, + "alert": 50, + "subClusters": [ + { + "name": "broadwell_256gb", + "peak": 325, + "normal": 100, + "caution": 25, + "alert": 10 + }, + { + "name": "broadwell_512gb", + "peak": 500, + "normal": 250, + "caution": 40, + "alert": 10 + } + ] + }, + { + "name": "mem_bw", + "unit": { + "base": "B/s", + "prefix": "G" + }, + "scope": "socket", + "aggregation": "sum", + "timestep": 60, + "peak": 278, + "normal": 100, + "caution": 50, + "alert": 10, + "subClusters": [ + { + "name": "broadwell_256gb", + "peak": 114, + "normal": 50, + "caution": 25, + "alert": 10 + }, + { + "name": "broadwell_512gb", + "peak": 128, + "normal": 50, + "caution": 25, + "alert": 10 + } + ] + }, + { + "name": "clock", + "unit": { + "base": "Hz", + "prefix": "M" + }, + "scope": "hwthread", + "aggregation": "avg", + "timestep": 60, + "peak": 3000, + "normal": 2500, + "caution": 1800, + "alert": 1200, + "subClusters": [ + { + "name": "broadwell_256gb", + "peak": 3800, + "normal": 3400, + "caution": 2000, + "alert": 1200 + }, + { + "name": "broadwell_512gb", + "peak": 3000, + "normal": 2400, + "caution": 1800, + "alert": 1200 + } + ] + }, + { + "name": "cpu_power", + "unit": { + "base": "W" + }, + "scope": "socket", + "aggregation": "sum", + "timestep": 60, + "peak": 500, + "normal": 250, + "caution": 100, + "alert": 50 + }, + { + "name": "mem_power", + "unit": { + "base": "W" + }, + "scope": "socket", + "aggregation": "sum", + "timestep": 60, + "peak": 100, + "normal": 50, + "caution": 20, + "alert": 10 + }, + { + 
"name": "ipc", + "unit": { + "base": "IPC" + }, + "scope": "hwthread", + "aggregation": "avg", + "timestep": 60, + "peak": 4, + "normal": 2, + "caution": 1, + "alert": 0.5 + }, + { + "name": "vectorization_ratio", + "unit": { + "base": "" + }, + "scope": "hwthread", + "aggregation": "avg", + "timestep": 60, + "peak": 100, + "normal": 60, + "caution": 40, + "alert": 10 + }, + { + "name": "nfs4_read", + "unit": { + "base": "B/s", + "prefix": "M" + }, + "scope": "node", + "aggregation": "sum", + "timestep": 60, + "peak": 6, + "normal": 4, + "caution": 2, + "alert": 1 + }, + { + "name": "nfs4_write", + "unit": { + "base": "B/s", + "prefix": "M" + }, + "scope": "node", + "aggregation": "sum", + "timestep": 60, + "peak": 6, + "normal": 4, + "caution": 2, + "alert": 1 + }, + { + "name": "nfs4_total", + "unit": { + "base": "B/s", + "prefix": "M" + }, + "scope": "node", + "aggregation": "sum", + "timestep": 60, + "peak": 6, + "normal": 4, + "caution": 2, + "alert": 1 + } + ], + "subClusters": [ + { + "name": "broadwell_512gb", + "nodes": "tf040,tf041,tf042", + "processorType": "Intel(R) Xeon(R) CPU E5-2680 v4 @ 2.40GHz", + "socketsPerNode": 2, + "coresPerSocket": 14, + "threadsPerCore": 2, + "flopRateScalar": { + "unit": { + "base": "F/s", + "prefix": "G" + }, + "value": 158 + }, + "flopRateSimd": { + "unit": { + "base": "F/s", + "prefix": "G" + }, + "value": 1236 + }, + "memoryBandwidth": { + "unit": { + "base": "B/s", + "prefix": "G" + }, + "value": 128 + }, + "topology": { + "node": [ + 0, 28, 1, 29, 2, 30, 3, 31, 4, 32, 5, 33, 6, 34, 7, 35, 8, 36, 9, 37, 10, 38, 11, 39, 12, 40, 13, 41, 14, 42, 15, 43, 16, 44, 17, 45, 18, 46, 19, 47, 20, 48, 21, 49, 22, 50, 23, 51, 24, 52, 25, 53, 26, 54, 27, 55 + ], + "socket": [ + [ 0, 28, 1, 29, 2, 30, 3, 31, 4, 32, 5, 33, 6, 34, 7, 35, 8, 36, 9, 37, 10, 38, 11, 39, 12, 40, 13, 41 ], + [ 14, 42, 15, 43, 16, 44, 17, 45, 18, 46, 19, 47, 20, 48, 21, 49, 22, 50, 23, 51, 24, 52, 25, 53, 26, 54, 27, 55 ] + ], + "memoryDomain": [ + [ 0, 28, 
1, 29, 2, 30, 3, 31, 4, 32, 5, 33, 6, 34 ], + [ 7, 35, 8, 36, 9, 37, 10, 38, 11, 39, 12, 40, 13, 41 ], + [ 14, 42, 15, 43, 16, 44, 17, 45, 18, 46, 19, 47, 20, 48 ], + [ 21, 49, 22, 50, 23, 51, 24, 52, 25, 53, 26, 54, 27, 55 ] + ], + "core": [ + [ 0, 28 ], [ 1, 29 ], [ 2, 30 ], [ 3, 31 ], [ 4, 32 ], [ 5, 33 ], [ 6, 34 ], [ 7, 35 ], [ 8, 36 ], [ 9, 37 ], [ 10, 38 ], [ 11, 39 ], [ 12, 40 ], [ 13, 41 ], [ 14, 42 ], [ 15, 43 ], [ 16, 44 ], [ 17, 45 ], [ 18, 46 ], [ 19, 47 ], [ 20, 48 ], [ 21, 49 ], [ 22, 50 ], [ 23, 51 ], [ 24, 52 ], [ 25, 53 ], [ 26, 54 ], [ 27, 55 ] + ] + } + }, + { + "name": "broadwell_256gb", + "nodes": "tf0[50-57]", + "processorType": "Intel(R) Xeon(R) CPU E5-2643 v4 @ 3.40GHz", + "socketsPerNode": 2, + "coresPerSocket": 6, + "threadsPerCore": 2, + "flopRateScalar": { + "unit": { + "base": "F/s", + "prefix": "G" + }, + "value": 85 + }, + "flopRateSimd": { + "unit": { + "base": "F/s", + "prefix": "G" + }, + "value": 672 + }, + "memoryBandwidth": { + "unit": { + "base": "B/s", + "prefix": "G" + }, + "value": 114 + }, + "topology": { + "node": [ + 0, 12, 1, 13, 2, 14, 3, 15, 4, 16, 5, 17, 6, 18, 7, 19, 8, 20, 9, 21, 10, 22, 11, 23 + ], + "socket": [ + [ 0, 12, 1, 13, 2, 14, 3, 15, 4, 16, 5, 17 ], + [ 6, 18, 7, 19, 8, 20, 9, 21, 10, 22, 11, 23 ] + ], + "memoryDomain": [ + [ 0, 12, 1, 13, 2, 14, 3, 15, 4, 16, 5, 17 ], + [ 6, 18, 7, 19, 8, 20, 9, 21, 10, 22, 11, 23 ] + ], + "core": [ + [ 0, 12 ], [ 1, 13 ], [ 2, 14 ], [ 3, 15 ], [ 4, 16 ], [ 5, 17 ], [ 6, 18 ], [ 7, 19 ], [ 8, 20 ], [ 9, 21 ], [ 10, 22 ], [ 11, 23 ] + ] + } + }, + { + "name": "rome_512gb", + "nodes": "tf0[60-95]", + "processorType": "AMD EPYC 7502 32-Core Processor ", + "socketsPerNode": 2, + "coresPerSocket": 32, + "threadsPerCore": 2, + "flopRateScalar": { + "unit": { + "base": "F/s", + "prefix": "G" + }, + "value": 553 + }, + "flopRateSimd": { + "unit": { + "base": "F/s", + "prefix": "G" + }, + "value": 3198 + }, + "memoryBandwidth": { + "unit": { + "base": "B/s", + "prefix": "G" + }, 
+ "value": 278 + }, + "topology": { + "node": [ + 0, 64, 1, 65, 2, 66, 3, 67, 4, 68, 5, 69, 6, 70, 7, 71, 8, 72, 9, 73, 10, 74, 11, 75, 12, 76, 13, 77, 14, 78, 15, 79, 16, 80, 17, 81, 18, 82, 19, 83, 20, 84, 21, 85, 22, 86, 23, 87, 24, 88, 25, 89, 26, 90, 27, 91, 28, 92, 29, 93, 30, 94, 31, 95, 32, 96, 33, 97, 34, 98, 35, 99, 36, 100, 37, 101, 38, 102, 39, 103, 40, 104, 41, 105, 42, 106, 43, 107, 44, 108, 45, 109, 46, 110, 47, 111, 48, 112, 49, 113, 50, 114, 51, 115, 52, 116, 53, 117, 54, 118, 55, 119, 56, 120, 57, 121, 58, 122, 59, 123, 60, 124, 61, 125, 62, 126, 63, 127 + ], + "socket": [ + [ 0, 64, 1, 65, 2, 66, 3, 67, 4, 68, 5, 69, 6, 70, 7, 71, 8, 72, 9, 73, 10, 74, 11, 75, 12, 76, 13, 77, 14, 78, 15, 79, 16, 80, 17, 81, 18, 82, 19, 83, 20, 84, 21, 85, 22, 86, 23, 87, 24, 88, 25, 89, 26, 90, 27, 91, 28, 92, 29, 93, 30, 94, 31, 95 ], + [ 32, 96, 33, 97, 34, 98, 35, 99, 36, 100, 37, 101, 38, 102, 39, 103, 40, 104, 41, 105, 42, 106, 43, 107, 44, 108, 45, 109, 46, 110, 47, 111, 48, 112, 49, 113, 50, 114, 51, 115, 52, 116, 53, 117, 54, 118, 55, 119, 56, 120, 57, 121, 58, 122, 59, 123, 60, 124, 61, 125, 62, 126, 63, 127 ] + ], + "memoryDomain": [ + [ 0, 64, 1, 65, 2, 66, 3, 67, 4, 68, 5, 69, 6, 70, 7, 71 ], + [ 8, 72, 9, 73, 10, 74, 11, 75, 12, 76, 13, 77, 14, 78, 15, 79 ], + [ 16, 80, 17, 81, 18, 82, 19, 83, 20, 84, 21, 85, 22, 86, 23, 87 ], + [ 24, 88, 25, 89, 26, 90, 27, 91, 28, 92, 29, 93, 30, 94, 31, 95 ], + [ 32, 96, 33, 97, 34, 98, 35, 99, 36, 100, 37, 101, 38, 102, 39, 103 ], + [ 40, 104, 41, 105, 42, 106, 43, 107, 44, 108, 45, 109, 46, 110, 47, 111 ], + [ 48, 112, 49, 113, 50, 114, 51, 115, 52, 116, 53, 117, 54, 118, 55, 119 ], + [ 56, 120, 57, 121, 58, 122, 59, 123, 60, 124, 61, 125, 62, 126, 63, 127 ] + ], + "core": [ + [ 0, 64 ], [ 1, 65 ], [ 2, 66 ], [ 3, 67 ], [ 4, 68 ], [ 5, 69 ], [ 6, 70 ], [ 7, 71 ], [ 8, 72 ], [ 9, 73 ], [ 10, 74 ], [ 11, 75 ], [ 12, 76 ], [ 13, 77 ], [ 14, 78 ], [ 15, 79 ], [ 16, 80 ], [ 17, 81 ], [ 18, 82 ], [ 19, 83 ], [ 20, 84 
], [ 21, 85 ], [ 22, 86 ], [ 23, 87 ], [ 24, 88 ], [ 25, 89 ], [ 26, 90 ], [ 27, 91 ], [ 28, 92 ], [ 29, 93 ], [ 30, 94 ], [ 31, 95 ], [ 32, 96 ], [ 33, 97 ], [ 34, 98 ], [ 35, 99 ], [ 36, 100 ], [ 37, 101 ], [ 38, 102 ], [ 39, 103 ], [ 40, 104 ], [ 41, 105 ], [ 42, 106 ], [ 43, 107 ], [ 44, 108 ], [ 45, 109 ], [ 46, 110 ], [ 47, 111 ], [ 48, 112 ], [ 49, 113 ], [ 50, 114 ], [ 51, 115 ], [ 52, 116 ], [ 53, 117 ], [ 54, 118 ], [ 55, 119 ], [ 56, 120 ], [ 57, 121 ], [ 58, 122 ], [ 59, 123 ], [ 60, 124 ], [ 61, 125 ], [ 62, 126 ], [ 63, 127 ] + ] + } + } + ] +} \ No newline at end of file diff --git a/fau-systems/job-archive/cluster-tinygpu.json b/fau-systems/job-archive/cluster-tinygpu.json new file mode 100644 index 0000000..534b4fe --- /dev/null +++ b/fau-systems/job-archive/cluster-tinygpu.json @@ -0,0 +1,621 @@ +{ + "name": "tinygpu", + "metricConfig": [ + { + "name": "cpu_load", + "unit": { + "base": "" + }, + "scope": "node", + "aggregation": "avg", + "timestep": 60, + "peak": 64, + "normal": 32, + "caution": 30, + "alert": 20, + "subClusters": [ + { + "name": "rtx2080", + "peak": 32, + "normal": 16, + "caution": 14, + "alert": 6 + }, + { + "name": "a100", + "peak": 128, + "normal": 128, + "caution": 60, + "alert": 20 + }, + { + "name": "v100", + "peak": 32, + "normal": 16, + "caution": 14, + "alert": 6 + } + ] + }, + { + "name": "cpu_user", + "unit": { + "base": "" + }, + "scope": "hwthread", + "aggregation": "avg", + "timestep": 60, + "peak": 100, + "normal": 50, + "caution": 20, + "alert": 10 + }, + { + "name": "mem_used", + "unit": { + "base": "B", + "prefix": "G" + }, + "scope": "node", + "aggregation": "sum", + "timestep": 60, + "peak": 96, + "normal": 45, + "caution": 80, + "alert": 90, + "subClusters": [ + { + "name": "rtx3080", + "peak": 384, + "normal": 192, + "caution": 320, + "alert": 375 + }, + { + "name": "a100", + "peak": 512, + "normal": 256, + "caution": 480, + "alert": 500 + } + ] + }, + { + "name": "flops_any", + "unit": { + "base": 
"Flops/s", + "prefix": "G" + }, + "scope": "hwthread", + "aggregation": "sum", + "timestep": 60, + "peak": 5600, + "normal": 1000, + "caution": 200, + "alert": 50 + }, + { + "name": "flops_sp", + "unit": { + "base": "Flops/s", + "prefix": "G" + }, + "scope": "hwthread", + "aggregation": "sum", + "timestep": 60, + "peak": 5600, + "normal": 1000, + "caution": 200, + "alert": 50 + }, + { + "name": "flops_dp", + "unit": { + "base": "Flops/s", + "prefix": "G" + }, + "scope": "hwthread", + "aggregation": "sum", + "timestep": 60, + "peak": 2300, + "normal": 500, + "caution": 100, + "alert": 50 + }, + { + "name": "mem_bw", + "unit": { + "base": "B/s", + "prefix": "G" + }, + "scope": "socket", + "aggregation": "sum", + "timestep": 60, + "peak": 350, + "normal": 100, + "caution": 50, + "alert": 10 + }, + { + "name": "clock", + "unit": { + "base": "Hz", + "prefix": "M" + }, + "scope": "hwthread", + "aggregation": "avg", + "timestep": 60, + "peak": 3000, + "normal": 2400, + "caution": 1800, + "alert": 1200 + }, + { + "name": "cpu_power", + "unit": { + "base": "W" + }, + "scope": "socket", + "aggregation": "sum", + "timestep": 60, + "peak": 500, + "normal": 250, + "caution": 100, + "alert": 50 + }, + { + "name": "mem_power", + "unit": { + "base": "W" + }, + "scope": "socket", + "aggregation": "sum", + "timestep": 60, + "peak": 100, + "normal": 50, + "caution": 20, + "alert": 10 + }, + { + "name": "ipc", + "unit": { + "base": "IPC" + }, + "scope": "hwthread", + "aggregation": "avg", + "timestep": 60, + "peak": 4, + "normal": 2, + "caution": 1, + "alert": 0.5 + }, + { + "name": "vectorization_ratio", + "unit": { + "base": "" + }, + "scope": "hwthread", + "aggregation": "avg", + "timestep": 60, + "peak": 100, + "normal": 60, + "caution": 40, + "alert": 10 + }, + { + "name": "acc_utilization", + "unit": { + "base": "" + }, + "scope": "accelerator", + "aggregation": "avg", + "timestep": 60, + "peak": 100, + "normal": 80, + "caution": 50, + "alert": 20 + }, + { + "name": 
"acc_mem_used", + "unit": { + "base": "B", + "prefix": "G" + }, + "scope": "accelerator", + "aggregation": "sum", + "timestep": 60, + "peak": 40, + "normal": 20, + "caution": 10, + "alert": 5 + }, + { + "name": "acc_power", + "unit": { + "base": "W" + }, + "scope": "accelerator", + "aggregation": "sum", + "timestep": 60, + "peak": 400, + "normal": 200, + "caution": 50, + "alert": 20 + }, + { + "name": "nv_mem_util", + "unit": { + "base": "" + }, + "scope": "accelerator", + "aggregation": "avg", + "timestep": 60, + "peak": 100, + "normal": 80, + "caution": 20, + "alert": 10 + }, + { + "name": "nv_temp", + "unit": { + "base": "B", + "prefix": "G" + }, + "scope": "accelerator", + "aggregation": "sum", + "timestep": 60, + "peak": 40, + "normal": 20, + "caution": 5, + "alert": 2 + }, + { + "name": "nv_sm_clock", + "unit": { + "base": "Hz", + "prefix": "M" + }, + "scope": "accelerator", + "aggregation": "sum", + "timestep": 60, + "peak": 1400, + "normal": 1200, + "caution": 100, + "alert": 50 + }, + { + "name": "nfs4_read", + "unit": { + "base": "B/s", + "prefix": "M" + }, + "scope": "node", + "aggregation": "sum", + "timestep": 60, + "peak": 6, + "normal": 4, + "caution": 2, + "alert": 1 + }, + { + "name": "nfs4_write", + "unit": { + "base": "B/s", + "prefix": "M" + }, + "scope": "node", + "aggregation": "sum", + "timestep": 60, + "peak": 6, + "normal": 4, + "caution": 2, + "alert": 1 + }, + { + "name": "nfs4_total", + "unit": { + "base": "B/s", + "prefix": "M" + }, + "scope": "node", + "aggregation": "sum", + "timestep": 60, + "peak": 6, + "normal": 4, + "caution": 2, + "alert": 1 + } + ], + "subClusters": [ + { + "name": "rtx3080", + "nodes": "tg0[80-86]", + "processorType": "Intel(R) Xeon(R) Gold 6226R CPU @ 2.90GHz", + "socketsPerNode": 2, + "coresPerSocket": 16, + "threadsPerCore": 2, + "flopRateScalar": { + "unit": { + "base": "F/s", + "prefix": "G" + }, + "value": 111 + }, + "flopRateSimd": { + "unit": { + "base": "F/s", + "prefix": "G" + }, + "value": 787 + }, + 
"memoryBandwidth": { + "unit": { + "base": "B/s", + "prefix": "G" + }, + "value": 229 + }, + "topology": { + "node": [ + 0, 32, 1, 33, 2, 34, 3, 35, 4, 36, 5, 37, 6, 38, 7, 39, 8, 40, 9, 41, 10, 42, 11, 43, 12, 44, 13, 45, 14, 46, 15, 47, 16, 48, 17, 49, 18, 50, 19, 51, 20, 52, 21, 53, 22, 54, 23, 55, 24, 56, 25, 57, 26, 58, 27, 59, 28, 60, 29, 61, 30, 62, 31, 63 + ], + "socket": [ + [ 0, 32, 1, 33, 2, 34, 3, 35, 4, 36, 5, 37, 6, 38, 7, 39, 8, 40, 9, 41, 10, 42, 11, 43, 12, 44, 13, 45, 14, 46, 15, 47 ], + [ 16, 48, 17, 49, 18, 50, 19, 51, 20, 52, 21, 53, 22, 54, 23, 55, 24, 56, 25, 57, 26, 58, 27, 59, 28, 60, 29, 61, 30, 62, 31, 63 ] + ], + "memoryDomain": [ + [ 0, 32, 1, 33, 2, 34, 3, 35, 4, 36, 5, 37, 6, 38, 7, 39, 8, 40, 9, 41, 10, 42, 11, 43, 12, 44, 13, 45, 14, 46, 15, 47 ], + [ 16, 48, 17, 49, 18, 50, 19, 51, 20, 52, 21, 53, 22, 54, 23, 55, 24, 56, 25, 57, 26, 58, 27, 59, 28, 60, 29, 61, 30, 62, 31, 63 ] + ], + "core": [ + [ 0, 32 ], [ 1, 33 ], [ 2, 34 ], [ 3, 35 ], [ 4, 36 ], [ 5, 37 ], [ 6, 38 ], [ 7, 39 ], [ 8, 40 ], [ 9, 41 ], [ 10, 42 ], [ 11, 43 ], [ 12, 44 ], [ 13, 45 ], [ 14, 46 ], [ 15, 47 ], [ 16, 48 ], [ 17, 49 ], [ 18, 50 ], [ 19, 51 ], [ 20, 52 ], [ 21, 53 ], [ 22, 54 ], [ 23, 55 ], [ 24, 56 ], [ 25, 57 ], [ 26, 58 ], [ 27, 59 ], [ 28, 60 ], [ 29, 61 ], [ 30, 62 ], [ 31, 63 ] + ], + "accelerators": [ + { + "id": "00000000:1a:00.0", + "type": "Nvidia GPU", + "model": "NVIDIA GeForce RTX 3080" + }, + { + "id": "00000000:1b:00.0", + "type": "Nvidia GPU", + "model": "NVIDIA GeForce RTX 3080" + }, + { + "id": "00000000:3d:00.0", + "type": "Nvidia GPU", + "model": "NVIDIA GeForce RTX 3080" + }, + { + "id": "00000000:3e:00.0", + "type": "Nvidia GPU", + "model": "NVIDIA GeForce RTX 3080" + }, + { + "id": "00000000:b1:00.0", + "type": "Nvidia GPU", + "model": "NVIDIA GeForce RTX 3080" + }, + { + "id": "00000000:b2:00.0", + "type": "Nvidia GPU", + "model": "NVIDIA GeForce RTX 3080" + }, + { + "id": "00000000:da:00.0", + "type": "Nvidia GPU", + "model": 
"NVIDIA GeForce RTX 3080" + }, + { + "id": "00000000:db:00.0", + "type": "Nvidia GPU", + "model": "NVIDIA GeForce RTX 3080" + } + ] + } + }, + { + "name": "rtx2080", + "nodes": "tg0[60-69],tg06a,tg06b", + "processorType": "Intel(R) Xeon(R) Gold 6134 CPU @ 3.20GHz", + "socketsPerNode": 2, + "coresPerSocket": 8, + "threadsPerCore": 2, + "flopRateScalar": { + "unit": { + "base": "F/s", + "prefix": "G" + }, + "value": 47 + }, + "flopRateSimd": { + "unit": { + "base": "F/s", + "prefix": "G" + }, + "value": 326 + }, + "memoryBandwidth": { + "unit": { + "base": "B/s", + "prefix": "G" + }, + "value": 137 + }, + "topology": { + "node": [ + 0, 19, 1, 17, 2, 18, 3, 16, 4, 20, 5, 21, 6, 22, 7, 23, 8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31 + ], + "socket": [ + [ 0, 19, 1, 17, 2, 18, 3, 16, 4, 20, 5, 21, 6, 22, 7, 23 ], + [ 8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31 ] + ], + "memoryDomain": [ + [ 0, 19, 1, 17, 2, 18, 3, 16, 4, 20, 5, 21, 6, 22, 7, 23 ], + [ 8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31 ] + ], + "core": [ + [ 0, 19 ], [ 1, 17 ], [ 2, 18 ], [ 3, 16 ], [ 4, 20 ], [ 5, 21 ], [ 6, 22 ], [ 7, 23 ], [ 8, 24 ], [ 9, 25 ], [ 10, 26 ], [ 11, 27 ], [ 12, 28 ], [ 13, 29 ], [ 14, 30 ], [ 15, 31 ] + ], + "accelerators": [ + { + "id": "00000000:18:00.0", + "type": "Nvidia GPU", + "model": "NVIDIA GeForce RTX 2080 Ti" + }, + { + "id": "00000000:3b:00.0", + "type": "Nvidia GPU", + "model": "NVIDIA GeForce RTX 2080 Ti" + }, + { + "id": "00000000:86:00.0", + "type": "Nvidia GPU", + "model": "NVIDIA GeForce RTX 2080 Ti" + }, + { + "id": "00000000:af:00.0", + "type": "Nvidia GPU", + "model": "NVIDIA GeForce RTX 2080 Ti" + } + ] + } + }, + { + "name": "a100", + "nodes": "tg0[90-97]", + "processorType": "AMD EPYC 7662 64-Core Processor", + "socketsPerNode": 2, + "coresPerSocket": 64, + "threadsPerCore": 1, + "flopRateScalar": { + "unit": { + "base": "F/s", + "prefix": "G" + }, + "value": 987 + }, + "flopRateSimd": { + "unit": { + 
"base": "F/s", + "prefix": "G" + }, + "value": 5660 + }, + "memoryBandwidth": { + "unit": { + "base": "B/s", + "prefix": "G" + }, + "value": 306 + }, + "topology": { + "node": [ + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127 + ], + "socket": [ + [ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63 ], + [ 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127 ] + ], + "memoryDomain": [ + [ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31 ], + [ 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63 ], + [ 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95 ], + [ 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127 ] + ], + "core": [ + [ 0 ], [ 1 ], [ 2 ], [ 3 ], [ 4 ], [ 5 ], [ 6 ], [ 7 ], [ 8 ], [ 9 ], [ 10 ], [ 11 ], [ 12 ], [ 13 ], [ 14 ], [ 15 ], [ 16 ], [ 17 ], 
[ 18 ], [ 19 ], [ 20 ], [ 21 ], [ 22 ], [ 23 ], [ 24 ], [ 25 ], [ 26 ], [ 27 ], [ 28 ], [ 29 ], [ 30 ], [ 31 ], [ 32 ], [ 33 ], [ 34 ], [ 35 ], [ 36 ], [ 37 ], [ 38 ], [ 39 ], [ 40 ], [ 41 ], [ 42 ], [ 43 ], [ 44 ], [ 45 ], [ 46 ], [ 47 ], [ 48 ], [ 49 ], [ 50 ], [ 51 ], [ 52 ], [ 53 ], [ 54 ], [ 55 ], [ 56 ], [ 57 ], [ 58 ], [ 59 ], [ 60 ], [ 61 ], [ 62 ], [ 63 ], [ 64 ], [ 65 ], [ 66 ], [ 67 ], [ 68 ], [ 69 ], [ 70 ], [ 71 ], [ 72 ], [ 73 ], [ 74 ], [ 75 ], [ 76 ], [ 77 ], [ 78 ], [ 79 ], [ 80 ], [ 81 ], [ 82 ], [ 83 ], [ 84 ], [ 85 ], [ 86 ], [ 87 ], [ 88 ], [ 89 ], [ 90 ], [ 91 ], [ 92 ], [ 93 ], [ 94 ], [ 95 ], [ 96 ], [ 97 ], [ 98 ], [ 99 ], [ 100 ], [ 101 ], [ 102 ], [ 103 ], [ 104 ], [ 105 ], [ 106 ], [ 107 ], [ 108 ], [ 109 ], [ 110 ], [ 111 ], [ 112 ], [ 113 ], [ 114 ], [ 115 ], [ 116 ], [ 117 ], [ 118 ], [ 119 ], [ 120 ], [ 121 ], [ 122 ], [ 123 ], [ 124 ], [ 125 ], [ 126 ], [ 127 ] + ], + "accelerators": [ + { + "id": "00000000:01:00.0", + "type": "Nvidia GPU", + "model": "NVIDIA A100-SXM4-40GB" + }, + { + "id": "00000000:41:00.0", + "type": "Nvidia GPU", + "model": "NVIDIA A100-SXM4-40GB" + }, + { + "id": "00000000:81:00.0", + "type": "Nvidia GPU", + "model": "NVIDIA A100-SXM4-40GB" + }, + { + "id": "00000000:c1:00.0", + "type": "Nvidia GPU", + "model": "NVIDIA A100-SXM4-40GB" + } + ] + } + }, + { + "name": "v100", + "nodes": "tg0[71-74]", + "processorType": "Intel(R) Xeon(R) Gold 6134 CPU @ 3.20GHz", + "socketsPerNode": 2, + "coresPerSocket": 8, + "threadsPerCore": 2, + "flopRateScalar": { + "unit": { + "base": "F/s", + "prefix": "G" + }, + "value": 59 + }, + "flopRateSimd": { + "unit": { + "base": "F/s", + "prefix": "G" + }, + "value": 430 + }, + "memoryBandwidth": { + "unit": { + "base": "B/s", + "prefix": "G" + }, + "value": 177 + }, + "topology": { + "node": [ + 0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23, 8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31 + ], + "socket": [ + [ 0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 
7, 23 ], + [ 8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31 ] + ], + "memoryDomain": [ + [ 0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23 ], + [ 8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31 ] + ], + "core": [ + [ 0, 16 ], [ 1, 17 ], [ 2, 18 ], [ 3, 19 ], [ 4, 20 ], [ 5, 21 ], [ 6, 22 ], [ 7, 23 ], [ 8, 24 ], [ 9, 25 ], [ 10, 26 ], [ 11, 27 ], [ 12, 28 ], [ 13, 29 ], [ 14, 30 ], [ 15, 31 ] + ], + "accelerators": [ + { + "id": "00000000:18:00.0", + "type": "Nvidia GPU", + "model": "Tesla V100-PCIE-32GB" + }, + { + "id": "00000000:3b:00.0", + "type": "Nvidia GPU", + "model": "Tesla V100-PCIE-32GB" + }, + { + "id": "00000000:86:00.0", + "type": "Nvidia GPU", + "model": "Tesla V100-PCIE-32GB" + }, + { + "id": "00000000:af:00.0", + "type": "Nvidia GPU", + "model": "Tesla V100-PCIE-32GB" + } + ] + } + } + ] +} \ No newline at end of file diff --git a/fau-systems/job-archive/cluster-woody.json b/fau-systems/job-archive/cluster-woody.json new file mode 100644 index 0000000..50a4045 --- /dev/null +++ b/fau-systems/job-archive/cluster-woody.json @@ -0,0 +1,415 @@ +{ + "name": "woody", + "metricConfig": [ + { + "name": "cpu_load", + "unit": { + "base": "" + }, + "scope": "node", + "aggregation": "avg", + "timestep": 60, + "peak": 4, + "normal": 4, + "caution": 4, + "alert": 1, + "subClusters": [ + { + "name": "icelake", + "peak": 32, + "normal": 32, + "caution": 30, + "alert": 10 + } + ] + }, + { + "name": "cpu_user", + "unit": { + "base": "" + }, + "scope": "hwthread", + "aggregation": "avg", + "timestep": 60, + "peak": 100, + "normal": 50, + "caution": 20, + "alert": 10 + }, + { + "name": "ipc", + "unit": { + "base": "IPC" + }, + "scope": "node", + "aggregation": "avg", + "timestep": 60, + "peak": 4, + "normal": 2, + "caution": 1, + "alert": 0.25 + }, + { + "name": "mem_used", + "unit": { + "base": "B", + "prefix": "G" + }, + "scope": "node", + "aggregation": "sum", + "timestep": 60, + "peak": 32, + "normal": 16, + "caution": 28, + "alert": 30, + "subClusters": [ + { + "name": 
"icelake", + "peak": 256, + "normal": 128, + "caution": 200, + "alert": 240 + } + ] + }, + { + "name": "flops_any", + "unit": { + "base": "Flops/s", + "prefix": "G" + }, + "scope": "hwthread", + "aggregation": "sum", + "timestep": 60, + "peak": 112, + "normal": 50, + "caution": 20, + "alert": 10, + "subClusters": [ + { + "name": "icelake", + "peak": 2970, + "normal": 1000, + "caution": 100, + "alert": 50 + } + ] + }, + { + "name": "flops_dp", + "unit": { + "base": "Flops/s", + "prefix": "G" + }, + "scope": "hwthread", + "aggregation": "sum", + "timestep": 60, + "peak": 56, + "normal": 30, + "caution": 15, + "alert": 5, + "subClusters": [ + { + "name": "icelake", + "peak": 1450, + "normal": 700, + "caution": 100, + "alert": 50 + } + ] + }, + { + "name": "flops_sp", + "unit": { + "base": "Flops/s", + "prefix": "G" + }, + "scope": "hwthread", + "aggregation": "sum", + "timestep": 60, + "peak": 112, + "normal": 50, + "caution": 20, + "alert": 10, + "subClusters": [ + { + "name": "icelake", + "peak": 2970, + "normal": 1000, + "caution": 100, + "alert": 50 + } + ] + }, + { + "name": "mem_bw", + "unit": { + "base": "B/s", + "prefix": "G" + }, + "scope": "socket", + "aggregation": "sum", + "timestep": 60, + "peak": 24, + "normal": 10, + "caution": 5, + "alert": 2, + "subClusters": [ + { + "name": "icelake", + "peak": 350, + "normal": 100, + "caution": 50, + "alert": 20 + } + ] + }, + { + "name": "clock", + "unit": { + "base": "Hz", + "prefix": "M" + }, + "scope": "hwthread", + "aggregation": "avg", + "timestep": 60, + "peak": 3000, + "normal": 2000, + "caution": 1500, + "alert": 1200 + }, + { + "name": "vectorization_ratio", + "unit": { + "base": "" + }, + "scope": "hwthread", + "aggregation": "avg", + "timestep": 60, + "peak": 100, + "normal": 60, + "caution": 40, + "alert": 10 + }, + { + "name": "nfs4_read", + "unit": { + "base": "B/s", + "prefix": "M" + }, + "scope": "node", + "aggregation": "sum", + "timestep": 60, + "peak": 6, + "normal": 4, + "caution": 2, + "alert": 
1 + }, + { + "name": "nfs4_write", + "unit": { + "base": "B/s", + "prefix": "M" + }, + "scope": "node", + "aggregation": "sum", + "timestep": 60, + "peak": 6, + "normal": 4, + "caution": 2, + "alert": 1 + }, + { + "name": "nfs4_total", + "unit": { + "base": "B/s", + "prefix": "M" + }, + "scope": "node", + "aggregation": "sum", + "timestep": 60, + "peak": 6, + "normal": 4, + "caution": 2, + "alert": 1 + } + ], + "subClusters": [ + { + "name": "haswell", + "nodes": "w11[27-45,49-63,69-72]", + "processorType": "Intel Xeon E3-1240 v3", + "socketsPerNode": 1, + "coresPerSocket": 4, + "threadsPerCore": 1, + "flopRateScalar": { + "unit": { + "base": "F/s", + "prefix": "G" + }, + "value": 14 + }, + "flopRateSimd": { + "unit": { + "base": "F/s", + "prefix": "G" + }, + "value": 112 + }, + "memoryBandwidth": { + "unit": { + "base": "B/s", + "prefix": "G" + }, + "value": 24 + }, + "topology": { + "node": [ + 0, 1, 2, 3 + ], + "socket": [ + [ 0, 1, 2, 3 ] + ], + "memoryDomain": [ + [ 0, 1, 2, 3 ] + ], + "core": [ + [ 0 ], [ 1 ], [ 2 ], [ 3 ] + ] + } + }, + { + "name": "skylake", + "nodes": "w12[01-08],w13[01-31,33-56]", + "processorType": "Intel Xeon E3-1240 v5", + "socketsPerNode": 1, + "coresPerSocket": 4, + "threadsPerCore": 1, + "flopRateScalar": { + "unit": { + "base": "F/s", + "prefix": "G" + }, + "value": 14 + }, + "flopRateSimd": { + "unit": { + "base": "F/s", + "prefix": "G" + }, + "value": 112 + }, + "memoryBandwidth": { + "unit": { + "base": "B/s", + "prefix": "G" + }, + "value": 24 + }, + "topology": { + "node": [ + 0, 1, 2, 3 + ], + "socket": [ + [ 0, 1, 2, 3 ] + ], + "memoryDomain": [ + [ 0, 1, 2, 3 ] + ], + "core": [ + [ 0 ], [ 1 ], [ 2 ], [ 3 ] + ] + } + }, + { + "name": "kabylake", + "nodes": "w14[01-56],w15[01-05,07-56]", + "processorType": "Intel Xeon E3-1240 v6", + "socketsPerNode": 1, + "coresPerSocket": 4, + "threadsPerCore": 1, + "flopRateScalar": { + "unit": { + "base": "F/s", + "prefix": "G" + }, + "value": 14 + }, + "flopRateSimd": { + "unit": { + 
"base": "F/s", + "prefix": "G" + }, + "value": 112 + }, + "memoryBandwidth": { + "unit": { + "base": "B/s", + "prefix": "G" + }, + "value": 24 + }, + "topology": { + "node": [ + 0, 1, 2, 3 + ], + "socket": [ + [ 0, 1, 2, 3 ] + ], + "memoryDomain": [ + [ 0, 1, 2, 3 ] + ], + "core": [ + [ 0 ], [ 1 ], [ 2 ], [ 3 ] + ] + } + }, + { + "name": "icelake", + "nodes": "w22[01-35],w23[01-35]", + "processorType": "Intel Xeon Gold 6326", + "socketsPerNode": 2, + "coresPerSocket": 16, + "threadsPerCore": 1, + "flopRateScalar": { + "unit": { + "base": "F/s", + "prefix": "G" + }, + "value": 432 + }, + "flopRateSimd": { + "unit": { + "base": "F/s", + "prefix": "G" + }, + "value": 9216 + }, + "memoryBandwidth": { + "unit": { + "base": "B/s", + "prefix": "G" + }, + "value": 350 + }, + "topology": { + "node": [ + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71 + ], + "socket": [ + [ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35 ], + [ 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71 ] + ], + "memoryDomain": [ + [ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17 ], + [ 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35 ], + [ 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53 ], + [ 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71 ] + ], + "core": [ + [ 0 ], [ 1 ], [ 2 ], [ 3 ], [ 4 ], [ 5 ], [ 6 ], [ 7 ], [ 8 ], [ 9 ], [ 10 ], [ 11 ], [ 12 ], [ 13 ], [ 14 ], [ 15 ], [ 16 ], [ 17 ], [ 18 ], [ 19 ], [ 20 ], [ 21 ], [ 22 ], [ 23 ], [ 24 ], [ 25 ], [ 26 ], [ 27 ], [ 28 ], [ 29 ], [ 30 ], [ 31 ], [ 32 ], [ 33 ], [ 
34 ], [ 35 ], [ 36 ], [ 37 ], [ 38 ], [ 39 ], [ 40 ], [ 41 ], [ 42 ], [ 43 ], [ 44 ], [ 45 ], [ 46 ], [ 47 ], [ 48 ], [ 49 ], [ 50 ], [ 51 ], [ 52 ], [ 53 ], [ 54 ], [ 55 ], [ 56 ], [ 57 ], [ 58 ], [ 59 ], [ 60 ], [ 61 ], [ 62 ], [ 63 ], [ 64 ], [ 65 ], [ 66 ], [ 67 ], [ 68 ], [ 69 ], [ 70 ], [ 71 ] + ] + } + } + ] +} \ No newline at end of file