diff --git a/nhr@fau/cc-metric-collector/tinygpu/collectors.2080ti.json b/nhr@fau/cc-metric-collector/tinygpu/collectors.2080ti.json new file mode 100644 index 0000000..1045a69 --- /dev/null +++ b/nhr@fau/cc-metric-collector/tinygpu/collectors.2080ti.json @@ -0,0 +1,220 @@ +{ + "nfs4stat" : {}, + "memstat" : { + "numa_stats": true, + "node_stats": true + }, + "cpustat" : {}, + "loadavg" : {}, + "schedstat": {}, + "netstat" : { + "include_devices" : [ + "eth0" + ], + "send_abs_values": true, + "send_derived_values": true + }, + "diskstat" : {}, + "iostat" : {}, + "nfsiostat" : {}, + "tempstat" : { + "tag_override" : { + "hwmon0" : { + "type" : "socket", + "type-id" : "0" + }, + "hwmon1" : { + "type" : "socket", + "type-id" : "1" + } + } + }, + "nvidia" : { + "use_pci_info_as_type_id": true, + "process_mig_devices": true + }, + "likwid": { + "force_overwrite" : true, + "invalid_to_zero" : true, + "access_mode" : "accessdaemon", + "accessdaemon_path" : "/apps/likwid/system/sbin", + "liblikwid_path": "/apps/likwid/system/lib/liblikwid.so", + "eventsets": [ + { + "events": { + "FIXC0": "INSTR_RETIRED_ANY", + "FIXC1": "CPU_CLK_UNHALTED_CORE", + "FIXC2": "CPU_CLK_UNHALTED_REF", + "MBOX0C0": "CAS_COUNT_RD", + "MBOX0C1": "CAS_COUNT_WR", + "MBOX1C0": "CAS_COUNT_RD", + "MBOX1C1": "CAS_COUNT_WR", + "MBOX2C0": "CAS_COUNT_RD", + "MBOX2C1": "CAS_COUNT_WR", + "MBOX3C0": "CAS_COUNT_RD", + "MBOX3C1": "CAS_COUNT_WR", + "MBOX4C0": "CAS_COUNT_RD", + "MBOX4C1": "CAS_COUNT_WR", + "MBOX5C0": "CAS_COUNT_RD", + "MBOX5C1": "CAS_COUNT_WR", + "PMC0": "FP_ARITH_INST_RETIRED_128B_PACKED_DOUBLE", + "PMC1": "FP_ARITH_INST_RETIRED_SCALAR_DOUBLE", + "PMC2": "FP_ARITH_INST_RETIRED_256B_PACKED_DOUBLE", + "PWR0": "PWR_PKG_ENERGY", + "PWR3": "PWR_DRAM_ENERGY" + }, + "metrics": [ + { + "calc": "0.000001*(FIXC1/FIXC2)/inverseClock", + "name": "clock", + "publish": true, + "unit": "MHz", + "type": "hwthread" + }, + { + "calc": "FIXC0/FIXC1", + "name": "ipc", + "publish": true, + "type": "hwthread" + }, + { + "calc": "PWR0/time", + "name": "cpu_power", + "publish": true, + "unit": "Watt", + "type": "socket" + }, + { + "calc": "PWR3/time", + "name": "mem_power", + "unit": "Watt", + "publish": true, + "type": "socket" + }, + { + "calc": "1E-9*(PMC0*2.0+PMC1+PMC2*4.0)/time", + "name": "flops_dp1", + "unit": "GFlops/s", + "publish": false, + "type": "hwthread" + }, + { + "calc": "1E-9*(MBOX0C0+MBOX1C0+MBOX2C0+MBOX3C0+MBOX4C0+MBOX5C0+MBOX0C1+MBOX1C1+MBOX2C1+MBOX3C1+MBOX4C1+MBOX5C1)*64.0/time", + "name": "mem_bw", + "unit": "GBytes/s", + "publish": true, + "type": "socket" + }, + { + "calc": "PMC0+PMC2", + "name": "dp_vec_ins", + "type": "hwthread", + "publish": false + }, + { + "calc": "PMC0+PMC1+PMC2", + "name": "dp_fp_ins", + "type": "hwthread", + "publish": false + } + ] + }, + { + "events": { + "FIXC0": "INSTR_RETIRED_ANY", + "FIXC1": "CPU_CLK_UNHALTED_CORE", + "FIXC2": "CPU_CLK_UNHALTED_REF", + "PMC0": "FP_ARITH_INST_RETIRED_128B_PACKED_SINGLE", + "PMC1": "FP_ARITH_INST_RETIRED_SCALAR_SINGLE", + "PMC2": "FP_ARITH_INST_RETIRED_256B_PACKED_SINGLE" + }, + "metrics": [ + { + "calc": "1E-9*(PMC0*4.0+PMC1+PMC2*8.0)/time", + "name": "flops_sp1", + "publish": false, + "type": "hwthread" + }, + { + "calc": "PMC0+PMC2", + "name": "sp_vec_ins", + "type": "hwthread", + "publish": false + }, + { + "calc": "PMC0+PMC1+PMC2", + "name": "sp_fp_ins", + "type": "hwthread", + "publish": false + } + ] + }, + { + "events": { + "FIXC0": "INSTR_RETIRED_ANY", + "FIXC1": "CPU_CLK_UNHALTED_CORE", + "FIXC2": "CPU_CLK_UNHALTED_REF", + "PMC0": "FP_ARITH_INST_RETIRED_512B_PACKED_DOUBLE", + "PMC1": "FP_ARITH_INST_RETIRED_512B_PACKED_SINGLE" + }, + "metrics": [ + { + "calc": "PMC0", + "name": "dp_avx_512_ins", + "type": "hwthread", + "publish": false + }, + { + "calc": "1E-9*(PMC0*8.0)/time", + "name": "flops_dp2", + "type": "hwthread", + "publish": false + }, + { + "calc": "PMC1", + "name": "sp_avx_512_ins", + "type": "hwthread", + "publish": false + }, + { + "calc": "1E-9*(PMC1*16.0)/time", + "name": "flops_sp2", + "type": "hwthread", + "publish": false + } + ] + } + ], + "globalmetrics": [ + { + "calc": "100*((sp_vec_ins+dp_vec_ins+dp_avx_512_ins+sp_avx_512_ins)/(sp_fp_ins+dp_fp_ins+dp_avx_512_ins+sp_avx_512_ins))", + "name": "vectorization_ratio", + "unit": "%", + "type": "hwthread", + "publish": true + }, + { + "calc": "(flops_sp1+flops_sp2)", + "name": "flops_sp", + "unit": "GFlops/s", + "type": "hwthread", + "publish": true + }, + { + "calc": "(flops_dp1+flops_dp2)", + "name": "flops_dp", + "unit": "GFlops/s", + "type": "hwthread", + "publish": true + }, + { + "calc": "((flops_dp1+flops_dp2) * 2) + (flops_sp1+flops_sp2)", + "name": "flops_any", + "unit": "GFlops/s", + "type": "hwthread", + "publish": true + } + ] + } +} + diff --git a/nhr@fau/cc-metric-collector/tinygpu/collectors.3080.json b/nhr@fau/cc-metric-collector/tinygpu/collectors.3080.json new file mode 100644 index 0000000..848d3b7 --- /dev/null +++ b/nhr@fau/cc-metric-collector/tinygpu/collectors.3080.json @@ -0,0 +1,180 @@ +{ + "nfs4stat" : {}, + "memstat" : { + "numa_stats": true, + "node_stats": true + }, + "cpustat" : {}, + "loadavg" : {}, + "schedstat": {}, + "netstat" : { + "include_devices" : [ + "eth0", + "eth1", + "eth2" + ], + "send_abs_values": true, + "send_derived_values": true + }, + "diskstat" : {}, + "iostat" : {}, + "nfsiostat" : {}, + "tempstat" : { + "tag_override" : { + "hwmon1" : { + "type" : "socket", + "type-id" : "0" + }, + "hwmon2" : { + "type" : "socket", + "type-id" : "1" + } + } + }, + "ipmistat" : { + "send_abs_values": true, + "send_derived_values": true + }, + "nvidia" : { + "use_pci_info_as_type_id": true, + "process_mig_devices": true + }, + "likwid": { + "force_overwrite" : true, + "invalid_to_zero" : true, + "access_mode" : "accessdaemon", + "accessdaemon_path" : "/apps/likwid/system/sbin", + "liblikwid_path": "/apps/likwid/system/lib/liblikwid.so", + "eventsets": [ + { + "events": { + "FIXC0": "INSTR_RETIRED_ANY", + "FIXC1": "CPU_CLK_UNHALTED_CORE", + "FIXC2": "CPU_CLK_UNHALTED_REF", + "MBOX0C0": "CAS_COUNT_RD", + "MBOX0C1": "CAS_COUNT_WR", + "MBOX1C0": "CAS_COUNT_RD", + "MBOX1C1": "CAS_COUNT_WR", + "MBOX2C0": "CAS_COUNT_RD", + "MBOX2C1": "CAS_COUNT_WR", + "MBOX3C0": "CAS_COUNT_RD", + "MBOX3C1": "CAS_COUNT_WR", + "MBOX4C0": "CAS_COUNT_RD", + "MBOX4C1": "CAS_COUNT_WR", + "MBOX5C0": "CAS_COUNT_RD", + "MBOX5C1": "CAS_COUNT_WR", + "PMC0": "FP_ARITH_INST_RETIRED_128B_PACKED_DOUBLE", + "PMC1": "FP_ARITH_INST_RETIRED_SCALAR_DOUBLE", + "PMC2": "FP_ARITH_INST_RETIRED_256B_PACKED_DOUBLE", + "PMC3": "FP_ARITH_INST_RETIRED_512B_PACKED_DOUBLE", + "PWR0": "PWR_PKG_ENERGY", + "PWR3": "PWR_DRAM_ENERGY" + }, + "metrics": [ + { + "calc": "0.000001*(FIXC1/FIXC2)/inverseClock", + "name": "clock", + "publish": true, + "unit": "MHz", + "type": "hwthread" + }, + { + "calc": "FIXC0/FIXC1", + "name": "ipc", + "publish": true, + "type": "hwthread" + }, + { + "calc": "PWR0/time", + "name": "cpu_power", + "publish": true, + "unit": "Watt", + "type": "socket" + }, + { + "calc": "PWR3/time", + "name": "mem_power", + "unit": "Watt", + "publish": true, + "type": "socket" + }, + { + "calc": "1E-9*(PMC0*2.0+PMC1+PMC2*4.0+PMC3*8.0)/time", + "name": "flops_dp", + "unit": "GFlops/s", + "publish": true, + "type": "hwthread" + }, + { + "calc": "1E-9*(MBOX0C0+MBOX1C0+MBOX2C0+MBOX3C0+MBOX4C0+MBOX5C0+MBOX0C1+MBOX1C1+MBOX2C1+MBOX3C1+MBOX4C1+MBOX5C1)*64.0/time", + "name": "mem_bw", + "unit": "GBytes/s", + "publish": true, + "type": "socket" + }, + { + "calc": "PMC0+PMC2+PMC3", + "name": "dp_vec_ins", + "type": "hwthread", + "publish": false + }, + { + "calc": "PMC0+PMC1+PMC2+PMC3", + "name": "dp_fp_ins", + "type": "hwthread", + "publish": false + } + ] + }, + { + "events": { + "FIXC0": "INSTR_RETIRED_ANY", + "FIXC1": "CPU_CLK_UNHALTED_CORE", + "FIXC2": "CPU_CLK_UNHALTED_REF", + "PMC0": "FP_ARITH_INST_RETIRED_128B_PACKED_SINGLE", + "PMC1": "FP_ARITH_INST_RETIRED_SCALAR_SINGLE", + "PMC2": "FP_ARITH_INST_RETIRED_256B_PACKED_SINGLE", + "PMC3": "FP_ARITH_INST_RETIRED_512B_PACKED_SINGLE" + }, + "metrics": [ + { + "calc": "1E-9*(PMC0*4.0+PMC1+PMC2*8.0+PMC3*16.0)/time", + "name": "flops_sp", + "unit": "GFlops/s", + "publish": true, + "type": "hwthread" + }, + { + "calc": "PMC0+PMC2+PMC3", + "name": "sp_vec_ins", + "type": "hwthread", + "publish": false + }, + { + "calc": "PMC0+PMC1+PMC2+PMC3", + "name": "sp_fp_ins", + "type": "hwthread", + "publish": false + } + ] + } + ], + "globalmetrics": [ + { + "calc": "100*((sp_vec_ins+dp_vec_ins)/(sp_fp_ins+dp_fp_ins))", + "name": "vectorization_ratio", + "unit": "%", + "type": "hwthread", + "publish": true + }, + { + "calc": "(flops_dp * 2) + flops_sp", + "name": "flops_any", + "unit": "GFlops/s", + "type": "hwthread", + "publish": true + } + ] + } +} + diff --git a/nhr@fau/cc-metric-collector/tinygpu/collectors.a100.json b/nhr@fau/cc-metric-collector/tinygpu/collectors.a100.json new file mode 100644 index 0000000..943f1ab --- /dev/null +++ b/nhr@fau/cc-metric-collector/tinygpu/collectors.a100.json @@ -0,0 +1,104 @@ +{ + "nfs4stat" : {}, + "memstat" : { + "numa_stats": true, + "node_stats": true + }, + "cpustat" : {}, + "loadavg" : {}, + "schedstat": {}, + "netstat" : { + "include_devices" : [ + "eth0" + ], + "send_abs_values": true, + "send_derived_values": true + }, + "diskstat" : {}, + "iostat" : {}, + "nfsiostat" : {}, + "tempstat" : { + "tag_override" : { + "hwmon0" : { + "type" : "socket", + "type-id" : "0" + }, + "hwmon1" : { + "type" : "socket", + "type-id" : "1" + } + } + }, + "nvidia" : { + "use_pci_info_as_type_id": true, + "process_mig_devices": true + }, + "likwid": { + "force_overwrite" : true, + "invalid_to_zero" : true, + "access_mode" : "accessdaemon", + "accessdaemon_path" : "/apps/likwid/system/sbin", + "liblikwid_path": "/apps/likwid/system/lib/liblikwid.so", + "eventsets": [ + { + "events": { + "FIXC1": "ACTUAL_CPU_CLOCK", + "FIXC2": "MAX_CPU_CLOCK", + "PMC0": "RETIRED_INSTRUCTIONS", + "PMC1": "CPU_CLOCKS_UNHALTED", + "PMC2": "RETIRED_SSE_AVX_FLOPS_ALL", + "PMC3": "MERGE", + "DFC0": "DATA_FROM_LOCAL_DRAM_CHANNEL", + "DFC1": "DATA_TO_LOCAL_DRAM_CHANNEL", + "PWR0": "RAPL_CORE_ENERGY", + "PWR1": "RAPL_PKG_ENERGY" + }, + "metrics": [ + { + "name": "ipc", + "calc": "PMC0/PMC1", + "type": "hwthread", + "publish": true + }, + { + "name": "flops_any", + "calc": "1E-9*PMC2/time", + "unit": "GFlops/s", + "type": "hwthread", + "publish": true + }, + { + "name": "clock", + "calc": "1E-6*(FIXC1/FIXC2)/inverseClock", + "type": "hwthread", + "unit": "MHz", + "publish": true + }, + { + "name": "pwr_core", + "calc": "PWR0/time", + "unit": "Watt", + "type": "socket", + "publish": true + }, + { + "name": "pwr_pkg", + "calc": "PWR1/time", + "type": "socket", + "unit": "Watt", + "publish": true + }, + { + "name": "mem_bw", + "calc": "1E-9*(DFC0+DFC1)*64.0/time", + "unit": "Gbyte/s", + "type": "socket", + "publish": true + } + ] + } + ], + "globalmetrics": [] + } +} + diff --git a/nhr@fau/cc-metric-collector/tinygpu/collectors.gtx1080.json b/nhr@fau/cc-metric-collector/tinygpu/collectors.gtx1080.json new file mode 100644 index 0000000..2c63c08 --- /dev/null +++ b/nhr@fau/cc-metric-collector/tinygpu/collectors.gtx1080.json @@ -0,0 +1,2 @@ +{ +} diff --git a/nhr@fau/cc-metric-collector/tinygpu/collectors.h100.json b/nhr@fau/cc-metric-collector/tinygpu/collectors.h100.json new file mode 100644 index 0000000..2c63c08 --- /dev/null +++ b/nhr@fau/cc-metric-collector/tinygpu/collectors.h100.json @@ -0,0 +1,2 @@ +{ +} diff --git a/nhr@fau/cc-metric-collector/tinygpu/collectors.rome.a100.json b/nhr@fau/cc-metric-collector/tinygpu/collectors.rome.a100.json index 5f1ff1c..05b7fd4 100644 --- a/nhr@fau/cc-metric-collector/tinygpu/collectors.rome.a100.json +++ b/nhr@fau/cc-metric-collector/tinygpu/collectors.rome.a100.json @@ -16,6 +16,7 @@ }, "diskstat" : {}, "iostat" : {}, + "nfsiostat" : {}, "tempstat" : { "tag_override" : { "hwmon0" : { diff --git a/nhr@fau/cc-metric-collector/tinygpu/collectors.skx.2080.json b/nhr@fau/cc-metric-collector/tinygpu/collectors.skx.2080.json index a06e387..3ff8ff6 100644 --- a/nhr@fau/cc-metric-collector/tinygpu/collectors.skx.2080.json +++ b/nhr@fau/cc-metric-collector/tinygpu/collectors.skx.2080.json @@ -16,6 +16,7 @@ }, "diskstat" : {}, "iostat" : {}, + "nfsiostat" : {}, "tempstat" : { "tag_override" : { "hwmon0" : { diff --git a/nhr@fau/cc-metric-collector/tinygpu/collectors.skx.3080.json b/nhr@fau/cc-metric-collector/tinygpu/collectors.skx.3080.json index d6c6ae0..f9f69ee 100644 --- a/nhr@fau/cc-metric-collector/tinygpu/collectors.skx.3080.json +++ b/nhr@fau/cc-metric-collector/tinygpu/collectors.skx.3080.json @@ -18,6 +18,7 @@ }, "diskstat" : {}, "iostat" : {}, + "nfsiostat" : {}, "tempstat" : { "tag_override" : { "hwmon1" : { diff --git a/nhr@fau/cc-metric-collector/tinygpu/collectors.skx.v100.json b/nhr@fau/cc-metric-collector/tinygpu/collectors.skx.v100.json index a06e387..3ff8ff6 100644 --- a/nhr@fau/cc-metric-collector/tinygpu/collectors.skx.v100.json +++ b/nhr@fau/cc-metric-collector/tinygpu/collectors.skx.v100.json @@ -16,6 +16,7 @@ }, "diskstat" : {}, "iostat" : {}, + "nfsiostat" : {}, "tempstat" : { "tag_override" : { "hwmon0" : { diff --git a/nhr@fau/cc-metric-collector/tinygpu/collectors.v100.json b/nhr@fau/cc-metric-collector/tinygpu/collectors.v100.json new file mode 100644 index 0000000..1045a69 --- /dev/null +++ b/nhr@fau/cc-metric-collector/tinygpu/collectors.v100.json @@ -0,0 +1,220 @@ +{ + "nfs4stat" : {}, + "memstat" : { + "numa_stats": true, + "node_stats": true + }, + "cpustat" : {}, + "loadavg" : {}, + "schedstat": {}, + "netstat" : { + "include_devices" : [ + "eth0" + ], + "send_abs_values": true, + "send_derived_values": true + }, + "diskstat" : {}, + "iostat" : {}, + "nfsiostat" : {}, + "tempstat" : { + "tag_override" : { + "hwmon0" : { + "type" : "socket", + "type-id" : "0" + }, + "hwmon1" : { + "type" : "socket", + "type-id" : "1" + } + } + }, + "nvidia" : { + "use_pci_info_as_type_id": true, + "process_mig_devices": true + }, + "likwid": { + "force_overwrite" : true, + "invalid_to_zero" : true, + "access_mode" : "accessdaemon", + "accessdaemon_path" : "/apps/likwid/system/sbin", + "liblikwid_path": "/apps/likwid/system/lib/liblikwid.so", + "eventsets": [ + { + "events": { + "FIXC0": "INSTR_RETIRED_ANY", + "FIXC1": "CPU_CLK_UNHALTED_CORE", + "FIXC2": "CPU_CLK_UNHALTED_REF", + "MBOX0C0": "CAS_COUNT_RD", + "MBOX0C1": "CAS_COUNT_WR", + "MBOX1C0": "CAS_COUNT_RD", + "MBOX1C1": "CAS_COUNT_WR", + "MBOX2C0": "CAS_COUNT_RD", + "MBOX2C1": "CAS_COUNT_WR", + "MBOX3C0": "CAS_COUNT_RD", + "MBOX3C1": "CAS_COUNT_WR", + "MBOX4C0": "CAS_COUNT_RD", + "MBOX4C1": "CAS_COUNT_WR", + "MBOX5C0": "CAS_COUNT_RD", + "MBOX5C1": "CAS_COUNT_WR", + "PMC0": "FP_ARITH_INST_RETIRED_128B_PACKED_DOUBLE", + "PMC1": "FP_ARITH_INST_RETIRED_SCALAR_DOUBLE", + "PMC2": "FP_ARITH_INST_RETIRED_256B_PACKED_DOUBLE", + "PWR0": "PWR_PKG_ENERGY", + "PWR3": "PWR_DRAM_ENERGY" + }, + "metrics": [ + { + "calc": "0.000001*(FIXC1/FIXC2)/inverseClock", + "name": "clock", + "publish": true, + "unit": "MHz", + "type": "hwthread" + }, + { + "calc": "FIXC0/FIXC1", + "name": "ipc", + "publish": true, + "type": "hwthread" + }, + { + "calc": "PWR0/time", + "name": "cpu_power", + "publish": true, + "unit": "Watt", + "type": "socket" + }, + { + "calc": "PWR3/time", + "name": "mem_power", + "unit": "Watt", + "publish": true, + "type": "socket" + }, + { + "calc": "1E-9*(PMC0*2.0+PMC1+PMC2*4.0)/time", + "name": "flops_dp1", + "unit": "GFlops/s", + "publish": false, + "type": "hwthread" + }, + { + "calc": "1E-9*(MBOX0C0+MBOX1C0+MBOX2C0+MBOX3C0+MBOX4C0+MBOX5C0+MBOX0C1+MBOX1C1+MBOX2C1+MBOX3C1+MBOX4C1+MBOX5C1)*64.0/time", + "name": "mem_bw", + "unit": "GBytes/s", + "publish": true, + "type": "socket" + }, + { + "calc": "PMC0+PMC2", + "name": "dp_vec_ins", + "type": "hwthread", + "publish": false + }, + { + "calc": "PMC0+PMC1+PMC2", + "name": "dp_fp_ins", + "type": "hwthread", + "publish": false + } + ] + }, + { + "events": { + "FIXC0": "INSTR_RETIRED_ANY", + "FIXC1": "CPU_CLK_UNHALTED_CORE", + "FIXC2": "CPU_CLK_UNHALTED_REF", + "PMC0": "FP_ARITH_INST_RETIRED_128B_PACKED_SINGLE", + "PMC1": "FP_ARITH_INST_RETIRED_SCALAR_SINGLE", + "PMC2": "FP_ARITH_INST_RETIRED_256B_PACKED_SINGLE" + }, + "metrics": [ + { + "calc": "1E-9*(PMC0*4.0+PMC1+PMC2*8.0)/time", + "name": "flops_sp1", + "publish": false, + "type": "hwthread" + }, + { + "calc": "PMC0+PMC2", + "name": "sp_vec_ins", + "type": "hwthread", + "publish": false + }, + { + "calc": "PMC0+PMC1+PMC2", + "name": "sp_fp_ins", + "type": "hwthread", + "publish": false + } + ] + }, + { + "events": { + "FIXC0": "INSTR_RETIRED_ANY", + "FIXC1": "CPU_CLK_UNHALTED_CORE", + "FIXC2": "CPU_CLK_UNHALTED_REF", + "PMC0": "FP_ARITH_INST_RETIRED_512B_PACKED_DOUBLE", + "PMC1": "FP_ARITH_INST_RETIRED_512B_PACKED_SINGLE" + }, + "metrics": [ + { + "calc": "PMC0", + "name": "dp_avx_512_ins", + "type": "hwthread", + "publish": false + }, + { + "calc": "1E-9*(PMC0*8.0)/time", + "name": "flops_dp2", + "type": "hwthread", + "publish": false + }, + { + "calc": "PMC1", + "name": "sp_avx_512_ins", + "type": "hwthread", + "publish": false + }, + { + "calc": "1E-9*(PMC1*16.0)/time", + "name": "flops_sp2", + "type": "hwthread", + "publish": false + } + ] + } + ], + "globalmetrics": [ + { + "calc": "100*((sp_vec_ins+dp_vec_ins+dp_avx_512_ins+sp_avx_512_ins)/(sp_fp_ins+dp_fp_ins+dp_avx_512_ins+sp_avx_512_ins))", + "name": "vectorization_ratio", + "unit": "%", + "type": "hwthread", + "publish": true + }, + { + "calc": "(flops_sp1+flops_sp2)", + "name": "flops_sp", + "unit": "GFlops/s", + "type": "hwthread", + "publish": true + }, + { + "calc": "(flops_dp1+flops_dp2)", + "name": "flops_dp", + "unit": "GFlops/s", + "type": "hwthread", + "publish": true + }, + { + "calc": "((flops_dp1+flops_dp2) * 2) + (flops_sp1+flops_sp2)", + "name": "flops_any", + "unit": "GFlops/s", + "type": "hwthread", + "publish": true + } + ] + } +} + diff --git a/nhr@fau/cc-metric-collector/tinygpu/config.json b/nhr@fau/cc-metric-collector/tinygpu/config.json index 308b08f..d66d9f1 100644 --- a/nhr@fau/cc-metric-collector/tinygpu/config.json +++ b/nhr@fau/cc-metric-collector/tinygpu/config.json @@ -1,8 +1,10 @@ { - "sinks": "/etc/cc-metric-collector/sinks.json", - "collectors" : "/etc/cc-metric-collector/collectors.json", - "receivers" : "/etc/cc-metric-collector/receivers.json", - "router" : "/etc/cc-metric-collector/router.json", - "interval": "60s", - "duration": "10s" -} + "sinks-file": "/etc/cc-metric-collector/sinks.json", + "collectors-file" : "/etc/cc-metric-collector/collectors.json", + "receivers-file" : "/etc/cc-metric-collector/receivers.json", + "router-file" : "/etc/cc-metric-collector/router.json", + "main" : { + "interval": "60s", + "duration": "10s" + } +} \ No newline at end of file diff --git a/nhr@fau/cc-metric-collector/tinygpu/router.json b/nhr@fau/cc-metric-collector/tinygpu/router.json index b7de4a8..3e72621 100644 --- a/nhr@fau/cc-metric-collector/tinygpu/router.json +++ b/nhr@fau/cc-metric-collector/tinygpu/router.json @@ -1,60 +1,62 @@ { - "add_tags" : [ - { - "key" : "cluster", - "value" : "tinygpu", - "if" : "*" - } - ], - "rename_metrics" : { - "load_one" : "cpu_load", - "cpu_load_core" : "cpu_load", - "net_bytes_in_bw" : "net_bytes_in", - "net_bytes_out_bw" : "net_bytes_out", - "net_pkts_in_bw" : "net_pkts_in", - "net_pkts_out_bw" : "net_pkts_out", - "ib_recv_bw" : "ib_recv", - "ib_xmit_bw" : "ib_xmit", - "ib_recv_pkts_bw": "ib_recv_pkts", - "ib_xmit_pkts_bw": "ib_xmit_pkts", - "lustre_read_bytes_diff" : "lustre_read_bytes", - "lustre_read_requests_diff" : "lustre_read_requests", - "lustre_write_bytes_diff" : "lustre_write_bytes", - "lustre_write_requests_diff" : "lustre_write_requests", - "lustre_open_diff" : "lustre_open", - "lustre_close_diff" : "lustre_close", - "lustre_setattr_diff" : "lustre_setattr", - "lustre_getattr_diff" : "lustre_getattr", - "lustre_statfs_diff": "lustre_statfs", - "lustre_inode_permission_diff" : "lustre_inode_permission", - "nv_util" : "acc_utilization", - "nv_fb_mem_used" : "acc_mem_used", - "nv_power_usage" : "acc_power", - "pwr_pkg": "cpu_power", - "pwr_dram": "mem_power" + "process_messages": { + "rename_messages" : { + "load_one" : "cpu_load", + "cpu_load_core" : "cpu_load", + "net_bytes_in_bw" : "net_bytes_in", + "net_bytes_out_bw" : "net_bytes_out", + "net_pkts_in_bw" : "net_pkts_in", + "net_pkts_out_bw" : "net_pkts_out", + "ib_recv_bw" : "ib_recv", + "ib_xmit_bw" : "ib_xmit", + "ib_recv_pkts_bw": "ib_recv_pkts", + "ib_xmit_pkts_bw": "ib_xmit_pkts", + "lustre_read_bytes_diff" : "lustre_read_bytes", + "lustre_read_requests_diff" : "lustre_read_requests", + "lustre_write_bytes_diff" : "lustre_write_bytes", + "lustre_write_requests_diff" : "lustre_write_requests", + "lustre_open_diff" : "lustre_open", + "lustre_close_diff" : "lustre_close", + "lustre_setattr_diff" : "lustre_setattr", + "lustre_getattr_diff" : "lustre_getattr", + "lustre_statfs_diff": "lustre_statfs", + "lustre_inode_permission_diff" : "lustre_inode_permission", + "nv_util" : "acc_utilization", + "nv_fb_mem_used" : "acc_mem_used", + "nv_power_usage" : "acc_power", + "pwr_pkg": "cpu_power", + "pwr_dram": "mem_power" + }, + "add_tags_if" : [ + { + "key" : "cluster", + "value" : "tinygpu", + "if" : "true" + } + ], + "drop_messages" : [ + "net_bytes_in", + "net_bytes_out", + "ib_recv", + "ib_xmit", + "ib_recv_pkts", + "ib_xmit_pkts", + "net_pkts_in", + "net_pkts_out", + "lustre_read_bytes", + "lustre_read_requests", + "lustre_write_bytes", + "lustre_write_requests" + ], + "change_unit_prefix": { + "name == 'mem_used'": "G", + "name == 'swap_used'": "G", + "name == 'mem_total'": "G", + "name == 'swap_total'": "G", + "name == 'cpufreq'": "M" + }, + "normalize_units" : true }, - "drop_metrics" : [ - "net_bytes_in", - "net_bytes_out", - "ib_recv", - "ib_xmit", - "ib_recv_pkts", - "ib_xmit_pkts", - "net_pkts_in", - "net_pkts_out", - "lustre_read_bytes", - "lustre_read_requests", - "lustre_write_bytes", - "lustre_write_requests" - ], "interval_timestamp" : false, - "num_cache_intervals" : 0, - "change_unit_prefix": { - "mem_used": "G", - "swap_used": "G", - "mem_total": "G", - "swap_total": "G", - "cpufreq": "M" - }, - "normalize_metrics" : true + "num_cache_intervals" : 0 } diff --git a/nhr@fau/cc-metric-collector/tinygpu/sinks.json b/nhr@fau/cc-metric-collector/tinygpu/sinks.json index 105f653..e3846c2 100644 --- a/nhr@fau/cc-metric-collector/tinygpu/sinks.json +++ b/nhr@fau/cc-metric-collector/tinygpu/sinks.json @@ -1,26 +1,53 @@ { - "influx": { - "type": "influxasync", - "host": "monitoring-test.nhr.uni-erlangen.de", - "port": "8086", - "organization": "ClusterCockpit", - "database": "tinygpu", - "password": "XZY", - "ssl": true, - "meta_as_tags": [ - "unit" - ] - }, - "metricstore": { - "type": "http", - "url": "http://monitoring.nhr.fau.de:8082/api/write?cluster=tinygpu", - "jwt": "XZY", - "meta_as_tags": [ - "unit" - ], - "idle_connection_timeout": "60s", - "flush_delay": "2s", - "max_retries": 1, - "timeout": "10s" + "influx" : { + "type" : "influxasync", + "host": "monitoring-test.nhr.uni-erlangen.de", + "port": "8086", + "organization" : "ClusterCockpit", + "database" : "tinygpu", + "password": "XYZ", + "ssl": true, + "process_messages": { + "move_meta_to_tag_if": [ + { + "key": "unit", + "if": "true" + } + ] } + }, + "metricstore" : { + "type" : "http", + "url" : "http://monitoring.nhr.fau.de:8082/api/write?cluster=tinygpu", + "jwt": "XYZ", + "idle_connection_timeout": "60s", + "flush_delay" : "2s", + "max_retries" : 1, + "timeout" : "10s", + "precision": "s", + "process_messages": { + "move_meta_to_tag_if": [ + { + "key": "unit", + "if": "true" + } + ] + } + }, + "tinygpunats": { + "type": "nats", + "host": "monitoring.nhr.fau.de", + "database": "tinygpu", + "nkey_file": "/etc/cc-metric-collector/nats.nkey", + "flush_delay": "1s", + "precision": "s", + "process_messages": { + "move_meta_to_tag_if": [ + { + "key": "unit", + "if": "true" + } + ] + } + } } diff --git a/nhr@fau/cc-metric-collector/tinygpu/sinks_debug.json b/nhr@fau/cc-metric-collector/tinygpu/sinks_debug.json deleted file mode 100644 index 02ee30b..0000000 --- a/nhr@fau/cc-metric-collector/tinygpu/sinks_debug.json +++ /dev/null @@ -1,30 +0,0 @@ -{ - "influx": { - "type": "influxasync", - "host": "monitoring-test.nhr.uni-erlangen.de", - "port": "8086", - "organization": "ClusterCockpit", - "database": "tinygpu", - "password": "XZY", - "ssl": true, - "meta_as_tags": [ - "unit" - ] - }, - "metricstore": { - "type": "http", - "url": "http://monitoring.nhr.fau.de:8082/api/write?cluster=tinygpu", - "jwt": "XZY", - "meta_as_tags": [ - "unit" - ], - "idle_connection_timeout": "60s", - "flush_delay": "2s", - "max_retries": 1, - "timeout": "10s" - }, - "debugstdout": { - "type": "stdout", - "output_file": "/tmp/debug.log" - } -}