diff --git a/nhr@fau/cc-metric-collector/tinyfat/collectors.bw256.json b/nhr@fau/cc-metric-collector/tinyfat/collectors.bw256.json new file mode 120000 index 0000000..0980ae3 --- /dev/null +++ b/nhr@fau/cc-metric-collector/tinyfat/collectors.bw256.json @@ -0,0 +1 @@ +collectors.bw512.json \ No newline at end of file diff --git a/nhr@fau/cc-metric-collector/tinyfat/collectors.bw512.json b/nhr@fau/cc-metric-collector/tinyfat/collectors.bw512.json new file mode 100644 index 0000000..0babc6c --- /dev/null +++ b/nhr@fau/cc-metric-collector/tinyfat/collectors.bw512.json @@ -0,0 +1,169 @@ +{ + "nfs4stat" : {}, + "memstat" : { + "numa_stats": true, + "node_stats": true + }, + "cpustat" : {}, + "loadavg" : {}, + "schedstat": {}, + "netstat" : { + "include_devices" : [ + "eth0", + "eth1", + "eth2", + "enp3s0" + ], + "send_abs_values": true, + "send_derived_values": true + }, + "diskstat" : {}, + "iostat" : {}, + "nfsiostat" : {}, + "tempstat" : { + "tag_override" : { + "hwmon2" : { + "type" : "socket", + "type-id" : "0" + }, + "hwmon3" : { + "type" : "socket", + "type-id" : "1" + } + } + }, + "likwid": { + "force_overwrite" : true, + "invalid_to_zero" : true, + "access_mode" : "accessdaemon", + "accessdaemon_path" : "/apps/likwid/system/sbin", + "liblikwid_path": "/apps/likwid/system/lib/liblikwid.so", + "eventsets": [ + { + "events": { + "FIXC0": "INSTR_RETIRED_ANY", + "FIXC1": "CPU_CLK_UNHALTED_CORE", + "FIXC2": "CPU_CLK_UNHALTED_REF", + "MBOX0C0": "CAS_COUNT_RD", + "MBOX0C1": "CAS_COUNT_WR", + "MBOX1C0": "CAS_COUNT_RD", + "MBOX1C1": "CAS_COUNT_WR", + "MBOX2C0": "CAS_COUNT_RD", + "MBOX2C1": "CAS_COUNT_WR", + "MBOX3C0": "CAS_COUNT_RD", + "MBOX3C1": "CAS_COUNT_WR", + "MBOX4C0": "CAS_COUNT_RD", + "MBOX4C1": "CAS_COUNT_WR", + "MBOX5C0": "CAS_COUNT_RD", + "MBOX5C1": "CAS_COUNT_WR", + "PMC0": "FP_ARITH_INST_RETIRED_128B_PACKED_DOUBLE", + "PMC1": "FP_ARITH_INST_RETIRED_SCALAR_DOUBLE", + "PMC2": "FP_ARITH_INST_RETIRED_256B_PACKED_DOUBLE", + "PWR0": "PWR_PKG_ENERGY", + "PWR3": "PWR_DRAM_ENERGY" + }, + "metrics": [ + { + "calc": "1E-6*(FIXC1/FIXC2)/inverseClock", + "name": "clock", + "publish": true, + "unit": "MHz", + "type": "hwthread" + }, + { + "calc": "FIXC0/FIXC1", + "name": "ipc", + "publish": true, + "type": "hwthread" + }, + { + "calc": "PWR0/time", + "name": "pwr_pkg", + "publish": true, + "unit": "Watt", + "type": "socket" + }, + { + "calc": "PWR3/time", + "name": "pwr_dram", + "publish": true, + "unit": "Watt", + "type": "socket" + }, + { + "calc": "1E-9*(PMC0*2.0+PMC1+PMC2*4.0)/time", + "name": "flops_dp", + "publish": true, + "unit": "GFlops/s", + "type": "hwthread" + }, + { + "calc": "1E-9*(MBOX0C0+MBOX1C0+MBOX2C0+MBOX3C0+MBOX4C0+MBOX5C0+MBOX0C1+MBOX1C1+MBOX2C1+MBOX3C1+MBOX4C1+MBOX5C1)*64.0/time", + "name": "mem_bw", + "publish": true, + "unit": "GBytes/s", + "type": "socket" + }, + { + "calc": "PMC0+PMC2", + "name": "dp_vec_ins", + "type": "hwthread", + "publish": false + }, + { + "calc": "PMC0+PMC1+PMC2", + "name": "dp_fp_ins", + "type": "hwthread", + "publish": false + } + ] + }, + { + "events": { + "FIXC0": "INSTR_RETIRED_ANY", + "FIXC1": "CPU_CLK_UNHALTED_CORE", + "FIXC2": "CPU_CLK_UNHALTED_REF", + "PMC0": "FP_ARITH_INST_RETIRED_128B_PACKED_SINGLE", + "PMC1": "FP_ARITH_INST_RETIRED_SCALAR_SINGLE", + "PMC2": "FP_ARITH_INST_RETIRED_256B_PACKED_SINGLE" + }, + "metrics": [ + { + "calc": "1E-9*(PMC0*4.0+PMC1+PMC2*8.0)/time", + "name": "flops_sp", + "publish": true, + "unit": "GFlops/s", + "type": "hwthread" + }, + { + "calc": "PMC0+PMC2", + "name": "sp_vec_ins", + "type": "hwthread", + "publish": false + }, + { + "calc": "PMC0+PMC1+PMC2", + "name": "sp_fp_ins", + "type": "hwthread", + "publish": false + } + ] + } + ], + "globalmetrics": [ + { + "calc": "100*((sp_vec_ins+dp_vec_ins)/(sp_fp_ins+dp_fp_ins))", + "name": "vectorization_ratio", + "type": "hwthread", + "publish": true + }, + { + "calc": "(flops_dp * 2) + flops_sp", + "name": "flops_any", + "type": "hwthread", + "unit": "GFlops/s", + "publish": true + } + ] + } +} diff --git a/nhr@fau/cc-metric-collector/tinyfat/collectors.rome.json b/nhr@fau/cc-metric-collector/tinyfat/collectors.rome.json index d3c76c6..fe002e0 100644 --- a/nhr@fau/cc-metric-collector/tinyfat/collectors.rome.json +++ b/nhr@fau/cc-metric-collector/tinyfat/collectors.rome.json @@ -11,13 +11,15 @@ "include_devices" : [ "eth0", "eth1", - "eth2" + "eth2", + "enp1s0f0" ], "send_abs_values": true, "send_derived_values": true }, "diskstat" : {}, "iostat" : {}, + "nfsiostat" : {}, "tempstat" : { "tag_override" : { "hwmon1" : { @@ -45,8 +47,8 @@ "PMC1": "CPU_CLOCKS_UNHALTED", "PMC2": "RETIRED_SSE_AVX_FLOPS_ALL", "PMC3": "MERGE", - "DFC0": "DATA_FROM_LOCAL_DRAM_CHANNEL", - "DFC1": "DATA_TO_LOCAL_DRAM_CHANNEL", + "DFC0": "DRAM_CHANNEL_0", + "DFC1": "DRAM_CHANNEL_1", "PWR0": "RAPL_CORE_ENERGY", "PWR1": "RAPL_PKG_ENERGY" }, @@ -87,7 +89,7 @@ }, { "name": "mem_bw", - "calc": "1E-9*(DFC0+DFC1)*64.0/time", + "calc": "1E-9*(DFC0+DFC1)*(4.0/(8/4))*64.0/time", "unit": "Gbyte/s", "type": "socket", "publish": true diff --git a/nhr@fau/cc-metric-collector/tinyfat/config.json b/nhr@fau/cc-metric-collector/tinyfat/config.json index 308b08f..d66d9f1 100644 --- a/nhr@fau/cc-metric-collector/tinyfat/config.json +++ b/nhr@fau/cc-metric-collector/tinyfat/config.json @@ -1,8 +1,10 @@ { - "sinks": "/etc/cc-metric-collector/sinks.json", - "collectors" : "/etc/cc-metric-collector/collectors.json", - "receivers" : "/etc/cc-metric-collector/receivers.json", - "router" : "/etc/cc-metric-collector/router.json", - "interval": "60s", - "duration": "10s" -} + "sinks-file": "/etc/cc-metric-collector/sinks.json", + "collectors-file" : "/etc/cc-metric-collector/collectors.json", + "receivers-file" : "/etc/cc-metric-collector/receivers.json", + "router-file" : "/etc/cc-metric-collector/router.json", + "main" : { + "interval": "60s", + "duration": "10s" + } +} \ No newline at end of file diff --git a/nhr@fau/cc-metric-collector/tinyfat/router.json b/nhr@fau/cc-metric-collector/tinyfat/router.json index 4a1588e..a3c0adf 100644 --- a/nhr@fau/cc-metric-collector/tinyfat/router.json +++ b/nhr@fau/cc-metric-collector/tinyfat/router.json @@ -1,49 +1,58 @@ { - "add_tags" : [ - { - "key" : "cluster", - "value" : "tinyfat", - "if" : "*" - } - ], - "rename_metrics" : { - "load_one" : "cpu_load", - "cpu_load_core" : "cpu_load", - "net_bytes_in_bw" : "net_bytes_in", - "net_bytes_out_bw" : "net_bytes_out", - "net_pkts_in_bw" : "net_pkts_in", - "net_pkts_out_bw" : "net_pkts_out", - "ib_recv_bw" : "ib_recv", - "ib_xmit_bw" : "ib_xmit", - "ib_recv_pkts_bw": "ib_recv_pkts", - "ib_xmit_pkts_bw": "ib_xmit_pkts", - "lustre_read_bytes_diff" : "lustre_read_bytes", - "lustre_read_requests_diff" : "lustre_read_requests", - "lustre_write_bytes_diff" : "lustre_write_bytes", - "lustre_write_requests_diff" : "lustre_write_requests", - "lustre_open_diff" : "lustre_open", - "lustre_close_diff" : "lustre_close", - "lustre_setattr_diff" : "lustre_setattr", - "lustre_getattr_diff" : "lustre_getattr", - "lustre_statfs_diff": "lustre_statfs", - "lustre_inode_permission_diff" : "lustre_inode_permission", - "pwr_pkg": "cpu_power", - "pwr_dram": "mem_power" + "process_messages": { + "rename_messages" : { + "load_one" : "cpu_load", + "cpu_load_core" : "cpu_load", + "net_bytes_in_bw" : "net_bytes_in", + "net_bytes_out_bw" : "net_bytes_out", + "net_pkts_in_bw" : "net_pkts_in", + "net_pkts_out_bw" : "net_pkts_out", + "ib_recv_bw" : "ib_recv", + "ib_xmit_bw" : "ib_xmit", + "ib_recv_pkts_bw": "ib_recv_pkts", + "ib_xmit_pkts_bw": "ib_xmit_pkts", + "lustre_read_bytes_diff" : "lustre_read_bytes", + "lustre_read_requests_diff" : "lustre_read_requests", + "lustre_write_bytes_diff" : "lustre_write_bytes", + "lustre_write_requests_diff" : "lustre_write_requests", + "lustre_open_diff" : "lustre_open", + "lustre_close_diff" : "lustre_close", + "lustre_setattr_diff" : "lustre_setattr", + "lustre_getattr_diff" : "lustre_getattr", + "lustre_statfs_diff": "lustre_statfs", + "lustre_inode_permission_diff" : "lustre_inode_permission", + "pwr_pkg": "cpu_power", + "pwr_dram": "mem_power" + }, + "add_tags_if" : [ + { + "key" : "cluster", + "value" : "tinyfat", + "if" : "true" + } + ], + "drop_messages" : [ + "net_bytes_in", + "net_bytes_out", + "ib_recv", + "ib_xmit", + "ib_recv_pkts", + "ib_xmit_pkts", + "net_pkts_in", + "net_pkts_out", + "lustre_read_bytes", + "lustre_read_requests", + "lustre_write_bytes", + "lustre_write_requests" + ], + "change_unit_prefix": { + "name == 'mem_used'": "G", + "name == 'swap_used'": "G", + "name == 'mem_total'": "G", + "name == 'swap_total'": "G" + }, + "normalize_messages" : true }, - "drop_metrics" : [ - "net_bytes_in", - "net_bytes_out", - "ib_recv", - "ib_xmit", - "ib_recv_pkts", - "ib_xmit_pkts", - "net_pkts_in", - "net_pkts_out", - "lustre_read_bytes", - "lustre_read_requests", - "lustre_write_bytes", - "lustre_write_requests" - ], "interval_timestamp" : false, "num_cache_intervals" : 0 } diff --git a/nhr@fau/cc-metric-collector/tinyfat/sinks.json b/nhr@fau/cc-metric-collector/tinyfat/sinks.json index dedcb94..4a084cd 100644 --- a/nhr@fau/cc-metric-collector/tinyfat/sinks.json +++ b/nhr@fau/cc-metric-collector/tinyfat/sinks.json @@ -1,26 +1,53 @@ { - "influx": { - "type": "influxasync", - "host": "monitoring-test.nhr.uni-erlangen.de", - "port": "8086", - "organization": "ClusterCockpit", - "database": "tinyfat", - "password": "XZY", - "ssl": true, - "meta_as_tags": [ - "unit" - ] - }, - "metricstore": { - "type": "http", - "url": "http://monitoring.nhr.fau.de:8082/api/write?cluster=tinyfat", - "jwt": "XYZ", - "meta_as_tags": [ - "unit" - ], - "idle_connection_timeout": "60s", - "flush_delay": "2s", - "max_retries": 1, - "timeout": "10s" + "influx" : { + "type" : "influxasync", + "host": "monitoring-test.nhr.uni-erlangen.de", + "port": "8086", + "organization" : "ClusterCockpit", + "database" : "tinyfat", + "password": "XYZ", + "ssl": true, + "process_messages": { + "move_meta_to_tag_if": [ + { + "key": "unit", + "if": "true" + } + ] } + }, + "metricstore" : { + "type" : "http", + "url" : "http://monitoring.nhr.fau.de:8082/api/write?cluster=tinyfat", + "jwt": "XYZ", + "idle_connection_timeout": "60s", + "flush_delay" : "2s", + "max_retries" : 1, + "timeout" : "10s", + "precision": "s", + "process_messages": { + "move_meta_to_tag_if": [ + { + "key": "unit", + "if": "true" + } + ] + } + }, + "tinyfatnats": { + "type": "nats", + "host": "monitoring.nhr.fau.de", + "database": "tinyfat", + "nkey_file": "/etc/cc-metric-collector/nats.nkey", + "flush_delay": "1s", + "precision": "s", + "process_messages": { + "move_meta_to_tag_if": [ + { + "key": "unit", + "if": "true" + } + ] + } + } } diff --git a/nhr@fau/cc-metric-collector/tinyfat/sinks_debug.json b/nhr@fau/cc-metric-collector/tinyfat/sinks_debug.json deleted file mode 100644 index 3a6359a..0000000 --- a/nhr@fau/cc-metric-collector/tinyfat/sinks_debug.json +++ /dev/null @@ -1,30 +0,0 @@ -{ - "influx": { - "type": "influxasync", - "host": "monitoring-test.nhr.uni-erlangen.de", - "port": "8086", - "organization": "ClusterCockpit", - "database": "tinyfat", - "password": "XZY", - "ssl": true, - "meta_as_tags": [ - "unit" - ] - }, - "metricstore": { - "type": "http", - "url": "http://monitoring.nhr.fau.de:8082/api/write?cluster=tinyfat", - "jwt": "XZY", - "meta_as_tags": [ - "unit" - ], - "idle_connection_timeout": "60s", - "flush_delay": "2s", - "max_retries": 1, - "timeout": "10s" - }, - "debugstdout": { - "type": "stdout", - "output_file": "/tmp/debug.log" - } -}