From 95d1f1d5e9b60bb7e8e8bd2e876c3f3887657289 Mon Sep 17 00:00:00 2001 From: Michael Panzlaff Date: Wed, 4 Mar 2026 17:01:41 +0100 Subject: [PATCH] cc-metric-collector: update fritz.spr --- .../fritz.spr/collectors.json | 175 +++++++++++++++++- .../cc-metric-collector/fritz.spr/config.json | 16 +- .../cc-metric-collector/fritz.spr/router.json | 102 +++++----- .../cc-metric-collector/fritz.spr/sinks.json | 55 +++--- 4 files changed, 259 insertions(+), 89 deletions(-) diff --git a/nhr@fau/cc-metric-collector/fritz.spr/collectors.json b/nhr@fau/cc-metric-collector/fritz.spr/collectors.json index 44a7649..064a518 100644 --- a/nhr@fau/cc-metric-collector/fritz.spr/collectors.json +++ b/nhr@fau/cc-metric-collector/fritz.spr/collectors.json @@ -38,6 +38,177 @@ } } }, - "cpufreq_cpuinfo": {}, - "nfsiostat": {} + "nfsiostat": {}, + "likwid": { + "force_overwrite" : true, + "invalid_to_zero" : true, + "access_mode" : "accessdaemon", + "accessdaemon_path" : "/apps/likwid/5.3.0-spr/sbin", + "liblikwid_path": "/apps/likwid/5.3.0-spr/lib/liblikwid.so", + "eventsets": [ + { + "events": { + "FIXC0": "INSTR_RETIRED_ANY", + "FIXC1": "CPU_CLK_UNHALTED_CORE", + "FIXC2": "CPU_CLK_UNHALTED_REF", + "MBOX0C0": "CAS_COUNT_RD", + "MBOX0C1": "CAS_COUNT_WR", + "MBOX1C0": "CAS_COUNT_RD", + "MBOX1C1": "CAS_COUNT_WR", + "MBOX2C0": "CAS_COUNT_RD", + "MBOX2C1": "CAS_COUNT_WR", + "MBOX3C0": "CAS_COUNT_RD", + "MBOX3C1": "CAS_COUNT_WR", + "MBOX4C0": "CAS_COUNT_RD", + "MBOX4C1": "CAS_COUNT_WR", + "MBOX5C0": "CAS_COUNT_RD", + "MBOX5C1": "CAS_COUNT_WR", + "MBOX6C0": "CAS_COUNT_RD", + "MBOX6C1": "CAS_COUNT_WR", + "MBOX7C0": "CAS_COUNT_RD", + "MBOX7C1": "CAS_COUNT_WR", + "MBOX8C0": "CAS_COUNT_RD", + "MBOX8C1": "CAS_COUNT_WR", + "MBOX9C0": "CAS_COUNT_RD", + "MBOX9C1": "CAS_COUNT_WR", + "MBOX10C0": "CAS_COUNT_RD", + "MBOX10C1": "CAS_COUNT_WR", + "MBOX11C0": "CAS_COUNT_RD", + "MBOX11C1": "CAS_COUNT_WR", + "MBOX12C0": "CAS_COUNT_RD", + "MBOX12C1": "CAS_COUNT_WR", + "MBOX13C0": "CAS_COUNT_RD", + "MBOX13C1": "CAS_COUNT_WR", + "MBOX14C0": "CAS_COUNT_RD", + "MBOX14C1": "CAS_COUNT_WR", + "MBOX15C0": "CAS_COUNT_RD", + "MBOX15C1": "CAS_COUNT_WR", + "PMC0": "FP_ARITH_INST_RETIRED_128B_PACKED_DOUBLE", + "PMC1": "FP_ARITH_INST_RETIRED_SCALAR_DOUBLE", + "PMC2": "FP_ARITH_INST_RETIRED_256B_PACKED_DOUBLE", + "PMC3": "FP_ARITH_INST_RETIRED_512B_PACKED_DOUBLE", + "PWR0": "PWR_PKG_ENERGY", + "PWR3": "PWR_DRAM_ENERGY" + }, + "metrics": [ + { + "calc": "0.000001*(FIXC1/FIXC2)/inverseClock", + "name": "clock", + "publish": true, + "unit": "MHz", + "type": "hwthread" + }, + { + "calc": "FIXC0/FIXC1", + "name": "ipc", + "publish": true, + "type": "hwthread" + }, + { + "calc": "PWR0/time", + "name": "cpu_power", + "publish": true, + "unit": "Watt", + "type": "socket" + }, + { + "calc": "PWR0", + "name": "cpu_energy", + "publish": true, + "unit": "Joules", + "type": "socket" + }, + { + "calc": "PWR3/time", + "name": "mem_power", + "unit": "Watt", + "publish": true, + "type": "socket" + }, + { + "calc": "PWR3", + "name": "mem_energy", + "publish": true, + "unit": "Joules", + "type": "socket" + }, + { + "calc": "1E-9*(PMC0*2.0+PMC1+PMC2*4.0+PMC3*8.0)/time", + "name": "flops_dp", + "unit": "GFlops/s", + "publish": true, + "type": "hwthread" + }, + { + "calc": "1E-9*(MBOX0C0+MBOX1C0+MBOX2C0+MBOX3C0+MBOX4C0+MBOX5C0+MBOX6C0+MBOX7C0+MBOX8C0+MBOX9C0+MBOX10C0+MBOX11C0+MBOX12C0+MBOX13C0+MBOX14C0+MBOX15C0+MBOX0C1+MBOX1C1+MBOX2C1+MBOX3C1+MBOX4C1+MBOX5C1+MBOX6C1+MBOX7C1+MBOX8C1+MBOX9C1+MBOX10C1+MBOX11C1+MBOX12C1+MBOX13C1+MBOX14C1+MBOX15C1)*64.0/time", + "name": "mem_bw", + "unit": "GBytes/s", + "publish": true, + "type": "socket" + }, + { + "calc": "PMC0+PMC2+PMC3", + "name": "dp_vec_ins", + "type": "hwthread", + "publish": false + }, + { + "calc": "PMC0+PMC1+PMC2+PMC3", + "name": "dp_fp_ins", + "type": "hwthread", + "publish": false + } + ] + }, + { + "events": { + "FIXC0": "INSTR_RETIRED_ANY", + "FIXC1": "CPU_CLK_UNHALTED_CORE", + "FIXC2": "CPU_CLK_UNHALTED_REF", + "FIXC3": "TOPDOWN_SLOTS", + "PMC0": "FP_ARITH_INST_RETIRED_128B_PACKED_SINGLE", + "PMC1": "FP_ARITH_INST_RETIRED_SCALAR_SINGLE", + "PMC2": "FP_ARITH_INST_RETIRED_256B_PACKED_SINGLE", + "PMC3": "FP_ARITH_INST_RETIRED_512B_PACKED_SINGLE" + }, + "metrics": [ + { + "calc": "1E-9*(PMC0*4.0+PMC1+PMC2*8.0+PMC3*16.0)/time", + "name": "flops_sp", + "unit": "GFlops/s", + "publish": true, + "type": "hwthread" + }, + { + "calc": "PMC0+PMC2+PMC3", + "name": "sp_vec_ins", + "type": "hwthread", + "publish": false + }, + { + "calc": "PMC0+PMC1+PMC2+PMC3", + "name": "sp_fp_ins", + "type": "hwthread", + "publish": false + } + ] + } + ], + "globalmetrics": [ + { + "calc": "100*((sp_vec_ins+dp_vec_ins)/(sp_fp_ins+dp_fp_ins))", + "name": "vectorization_ratio", + "unit": "%", + "type": "hwthread", + "publish": true + }, + { + "calc": "(flops_dp * 2) + flops_sp", + "name": "flops_any", + "unit": "GFlops/s", + "type": "hwthread", + "publish": true + } + ] + } } diff --git a/nhr@fau/cc-metric-collector/fritz.spr/config.json b/nhr@fau/cc-metric-collector/fritz.spr/config.json index 3a91189..d66d9f1 100644 --- a/nhr@fau/cc-metric-collector/fritz.spr/config.json +++ b/nhr@fau/cc-metric-collector/fritz.spr/config.json @@ -1,8 +1,10 @@ { - "sinks": "/home/hpc/unrz/unrz139/Work/cc-metric-collector/configs/fritz.spr2/sinks2.json", - "collectors" : "/home/hpc/unrz/unrz139/Work/cc-metric-collector/configs/fritz.spr2/collectors.json", - "receivers" : "/home/hpc/unrz/unrz139/Work/cc-metric-collector/configs/fritz.spr2/receivers.json", - "router" : "/home/hpc/unrz/unrz139/Work/cc-metric-collector/configs/fritz.spr2/router.json", - "interval": "60s", - "duration": "10s" -} + "sinks-file": "/etc/cc-metric-collector/sinks.json", + "collectors-file" : "/etc/cc-metric-collector/collectors.json", + "receivers-file" : "/etc/cc-metric-collector/receivers.json", + "router-file" : "/etc/cc-metric-collector/router.json", + "main" : { + "interval": "60s", + "duration": "10s" + } +} \ No newline at end of file diff --git a/nhr@fau/cc-metric-collector/fritz.spr/router.json b/nhr@fau/cc-metric-collector/fritz.spr/router.json index 09b3813..8fb9e9a 100644 --- a/nhr@fau/cc-metric-collector/fritz.spr/router.json +++ b/nhr@fau/cc-metric-collector/fritz.spr/router.json @@ -1,54 +1,56 @@ { - "add_tags" : [ - { - "key" : "cluster", - "value" : "fritz", - "if" : "*" - } - ], - "rename_metrics" : { - "load_one" : "cpu_load", - "net_bytes_in_bw" : "net_bytes_in", - "net_bytes_out_bw" : "net_bytes_out", - "net_pkts_in_bw" : "net_pkts_in", - "net_pkts_out_bw" : "net_pkts_out", - "ib_recv_bw" : "ib_recv", - "ib_xmit_bw" : "ib_xmit", - "ib_recv_pkts_bw": "ib_recv_pkts", - "ib_xmit_pkts_bw": "ib_xmit_pkts", - "lustre_read_bytes_diff" : "lustre_read_bytes", - "lustre_read_requests_diff" : "lustre_read_requests", - "lustre_write_bytes_diff" : "lustre_write_bytes", - "lustre_write_requests_diff" : "lustre_write_requests", - "lustre_open_diff" : "lustre_open", - "lustre_close_diff" : "lustre_close", - "lustre_setattr_diff" : "lustre_setattr", - "lustre_getattr_diff" : "lustre_getattr", - "lustre_statfs_diff": "lustre_statfs", - "lustre_inode_permission_diff" : "lustre_inode_permission", - "cpufreq" : "clock" + "process_messages": { + "rename_messages" : { + "load_one" : "cpu_load", + "net_bytes_in_bw" : "net_bytes_in", + "net_bytes_out_bw" : "net_bytes_out", + "net_pkts_in_bw" : "net_pkts_in", + "net_pkts_out_bw" : "net_pkts_out", + "ib_recv_bw" : "ib_recv", + "ib_xmit_bw" : "ib_xmit", + "ib_recv_pkts_bw": "ib_recv_pkts", + "ib_xmit_pkts_bw": "ib_xmit_pkts", + "lustre_read_bytes_diff" : "lustre_read_bytes", + "lustre_read_requests_diff" : "lustre_read_requests", + "lustre_write_bytes_diff" : "lustre_write_bytes", + "lustre_write_requests_diff" : "lustre_write_requests", + "lustre_open_diff" : "lustre_open", + "lustre_close_diff" : "lustre_close", + "lustre_setattr_diff" : "lustre_setattr", + "lustre_getattr_diff" : "lustre_getattr", + "lustre_statfs_diff": "lustre_statfs", + "lustre_inode_permission_diff" : "lustre_inode_permission", + "cpufreq" : "clock" + }, + "add_tags_if" : [ + { + "key" : "cluster", + "value" : "fritz", + "if" : "true" + } + ], + "drop_messages" : [ + "net_bytes_in", + "net_bytes_out", + "ib_recv", + "ib_xmit", + "ib_recv_pkts", + "ib_xmit_pkts", + "net_pkts_in", + "net_pkts_out", + "lustre_read_bytes", + "lustre_read_requests", + "lustre_write_bytes", + "lustre_write_requests" + ], + "change_unit_prefix": { + "name == 'mem_used'": "G", + "name == 'swap_used'": "G", + "name == 'mem_total'": "G", + "name == 'swap_total'": "G" + }, + "normalize_metrics" : true }, - "drop_metrics" : [ - "net_bytes_in", - "net_bytes_out", - "ib_recv", - "ib_xmit", - "ib_recv_pkts", - "ib_xmit_pkts", - "net_pkts_in", - "net_pkts_out", - "lustre_read_bytes", - "lustre_read_requests", - "lustre_write_bytes", - "lustre_write_requests" - ], "interval_timestamp" : false, - "num_cache_intervals" : 0, - "change_unit_prefix": { - "mem_used": "G", - "swap_used": "G", - "mem_total": "G", - "swap_total": "G" - }, - "normalize_metrics" : true + "num_cache_intervals" : 0 } diff --git a/nhr@fau/cc-metric-collector/fritz.spr/sinks.json b/nhr@fau/cc-metric-collector/fritz.spr/sinks.json index c7085c7..7ee169f 100644 --- a/nhr@fau/cc-metric-collector/fritz.spr/sinks.json +++ b/nhr@fau/cc-metric-collector/fritz.spr/sinks.json @@ -1,32 +1,27 @@ { - "fritzganglia": { - "type": "libganglia", - "gmond_config": "/etc/ganglia/gmond.conf", - "libganglia_path": "libganglia.so.0", - "add_ganglia_group": true - }, - "nhrinflux": { - "type": "influxasync", - "host": "monitoring-test.nhr.uni-erlangen.de", - "port": "8086", - "organization": "ClusterCockpit", - "database": "fritz_neu", - "password": "XZY", - "ssl": true, - "meta_as_tags": [ - "unit" - ] - }, - "fritzstore": { - "type": "http", - "url": "http://monitoring.nhr.fau.de:8082/api/write?cluster=fritz", - "jwt": "XZY", - "meta_as_tags": [ - "unit" - ], - "idle_connection_timeout": "60s", - "flush_delay": "2s", - "max_retries": 1, - "timeout": "10s" - } + "nhrinflux" : { + "type" : "influxasync", + "host": "monitoring-test.nhr.uni-erlangen.de", + "port": "8086", + "organization" : "ClusterCockpit", + "database" : "fritz_neu", + "password": "XYZ", + "ssl": true, + "meta_as_tags" : [ + "unit" + ] + }, + "fritzstore" : { + "type" : "http", + "url" : "http://monitoring.nhr.fau.de:8082/api/write?cluster=fritz", + "jwt": "XYZ", + "meta_as_tags" : [ + "unit" + ], + "idle_connection_timeout": "60s", + "flush_delay" : "2s", + "max_retries" : 1, + "timeout" : "10s", + "precision": "s" + } }