cc-metric-collector: update tinyfat

This commit is contained in:
Michael Panzlaff
2026-03-04 17:09:34 +01:00
parent 95d1f1d5e9
commit e28a9aafd5
7 changed files with 288 additions and 108 deletions

View File

@@ -0,0 +1 @@
collectors.bw512.json

View File

@@ -0,0 +1,169 @@
{
"nfs4stat" : {},
"memstat" : {
"numa_stats": true,
"node_stats": true
},
"cpustat" : {},
"loadavg" : {},
"schedstat": {},
"netstat" : {
"include_devices" : [
"eth0",
"eth1",
"eth2",
"enp3s0"
],
"send_abs_values": true,
"send_derived_values": true
},
"diskstat" : {},
"iostat" : {},
"nfsiostat" : {},
"tempstat" : {
"tag_override" : {
"hwmon2" : {
"type" : "socket",
"type-id" : "0"
},
"hwmon3" : {
"type" : "socket",
"type-id" : "1"
}
}
},
"likwid": {
"force_overwrite" : true,
"invalid_to_zero" : true,
"access_mode" : "accessdaemon",
"accessdaemon_path" : "/apps/likwid/system/sbin",
"liblikwid_path": "/apps/likwid/system/lib/liblikwid.so",
"eventsets": [
{
"events": {
"FIXC0": "INSTR_RETIRED_ANY",
"FIXC1": "CPU_CLK_UNHALTED_CORE",
"FIXC2": "CPU_CLK_UNHALTED_REF",
"MBOX0C0": "CAS_COUNT_RD",
"MBOX0C1": "CAS_COUNT_WR",
"MBOX1C0": "CAS_COUNT_RD",
"MBOX1C1": "CAS_COUNT_WR",
"MBOX2C0": "CAS_COUNT_RD",
"MBOX2C1": "CAS_COUNT_WR",
"MBOX3C0": "CAS_COUNT_RD",
"MBOX3C1": "CAS_COUNT_WR",
"MBOX4C0": "CAS_COUNT_RD",
"MBOX4C1": "CAS_COUNT_WR",
"MBOX5C0": "CAS_COUNT_RD",
"MBOX5C1": "CAS_COUNT_WR",
"PMC0": "FP_ARITH_INST_RETIRED_128B_PACKED_DOUBLE",
"PMC1": "FP_ARITH_INST_RETIRED_SCALAR_DOUBLE",
"PMC2": "FP_ARITH_INST_RETIRED_256B_PACKED_DOUBLE",
"PWR0": "PWR_PKG_ENERGY",
"PWR3": "PWR_DRAM_ENERGY"
},
"metrics": [
{
"calc": "1E-6*(FIXC1/FIXC2)/inverseClock",
"name": "clock",
"publish": true,
"unit": "MHz",
"type": "hwthread"
},
{
"calc": "FIXC0/FIXC1",
"name": "ipc",
"publish": true,
"type": "hwthread"
},
{
"calc": "PWR0/time",
"name": "pwr_pkg",
"publish": true,
"unit": "Watt",
"type": "socket"
},
{
"calc": "PWR3/time",
"name": "pwr_dram",
"publish": true,
"unit": "Watt",
"type": "socket"
},
{
"calc": "1E-9*(PMC0*2.0+PMC1+PMC2*4.0)/time",
"name": "flops_dp",
"publish": true,
"unit": "GFlops/s",
"type": "hwthread"
},
{
"calc": "1E-9*(MBOX0C0+MBOX1C0+MBOX2C0+MBOX3C0+MBOX4C0+MBOX5C0+MBOX0C1+MBOX1C1+MBOX2C1+MBOX3C1+MBOX4C1+MBOX5C1)*64.0/time",
"name": "mem_bw",
"publish": true,
"unit": "GBytes/s",
"type": "socket"
},
{
"calc": "PMC0+PMC2",
"name": "dp_vec_ins",
"type": "hwthread",
"publish": false
},
{
"calc": "PMC0+PMC1+PMC2",
"name": "dp_fp_ins",
"type": "hwthread",
"publish": false
}
]
},
{
"events": {
"FIXC0": "INSTR_RETIRED_ANY",
"FIXC1": "CPU_CLK_UNHALTED_CORE",
"FIXC2": "CPU_CLK_UNHALTED_REF",
"PMC0": "FP_ARITH_INST_RETIRED_128B_PACKED_SINGLE",
"PMC1": "FP_ARITH_INST_RETIRED_SCALAR_SINGLE",
"PMC2": "FP_ARITH_INST_RETIRED_256B_PACKED_SINGLE"
},
"metrics": [
{
"calc": "1E-9*(PMC0*4.0+PMC1+PMC2*8.0)/time",
"name": "flops_sp",
"publish": true,
"unit": "GFlops/s",
"type": "hwthread"
},
{
"calc": "PMC0+PMC2",
"name": "sp_vec_ins",
"type": "hwthread",
"publish": false
},
{
"calc": "PMC0+PMC1+PMC2",
"name": "sp_fp_ins",
"type": "hwthread",
"publish": false
}
]
}
],
"globalmetrics": [
{
"calc": "100*((sp_vec_ins+dp_vec_ins)/(sp_fp_ins+dp_fp_ins))",
"name": "vectorization_ratio",
"type": "hwthread",
"publish": true
},
{
"calc": "(flops_dp * 2) + flops_sp",
"name": "flops_any",
"type": "hwthread",
"unit": "GFlops/s",
"publish": true
}
]
}
}

View File

@@ -11,13 +11,15 @@
"include_devices" : [ "include_devices" : [
"eth0", "eth0",
"eth1", "eth1",
"eth2" "eth2",
"enp1s0f0"
], ],
"send_abs_values": true, "send_abs_values": true,
"send_derived_values": true "send_derived_values": true
}, },
"diskstat" : {}, "diskstat" : {},
"iostat" : {}, "iostat" : {},
"nfsiostat" : {},
"tempstat" : { "tempstat" : {
"tag_override" : { "tag_override" : {
"hwmon1" : { "hwmon1" : {
@@ -45,8 +47,8 @@
"PMC1": "CPU_CLOCKS_UNHALTED", "PMC1": "CPU_CLOCKS_UNHALTED",
"PMC2": "RETIRED_SSE_AVX_FLOPS_ALL", "PMC2": "RETIRED_SSE_AVX_FLOPS_ALL",
"PMC3": "MERGE", "PMC3": "MERGE",
"DFC0": "DATA_FROM_LOCAL_DRAM_CHANNEL", "DFC0": "DRAM_CHANNEL_0",
"DFC1": "DATA_TO_LOCAL_DRAM_CHANNEL", "DFC1": "DRAM_CHANNEL_1",
"PWR0": "RAPL_CORE_ENERGY", "PWR0": "RAPL_CORE_ENERGY",
"PWR1": "RAPL_PKG_ENERGY" "PWR1": "RAPL_PKG_ENERGY"
}, },
@@ -87,7 +89,7 @@
}, },
{ {
"name": "mem_bw", "name": "mem_bw",
"calc": "1E-9*(DFC0+DFC1)*64.0/time", "calc": "1E-9*(DFC0+DFC1)*(4.0/(8/4))*64.0/time",
"unit": "Gbyte/s", "unit": "Gbyte/s",
"type": "socket", "type": "socket",
"publish": true "publish": true

View File

@@ -1,8 +1,10 @@
{ {
"sinks": "/etc/cc-metric-collector/sinks.json", "sinks-file": "/etc/cc-metric-collector/sinks.json",
"collectors" : "/etc/cc-metric-collector/collectors.json", "collectors-file" : "/etc/cc-metric-collector/collectors.json",
"receivers" : "/etc/cc-metric-collector/receivers.json", "receivers-file" : "/etc/cc-metric-collector/receivers.json",
"router" : "/etc/cc-metric-collector/router.json", "router-file" : "/etc/cc-metric-collector/router.json",
"interval": "60s", "main" : {
"duration": "10s" "interval": "60s",
"duration": "10s"
}
} }

View File

@@ -1,49 +1,58 @@
{ {
"add_tags" : [ "process_messages": {
{ "rename_messages" : {
"key" : "cluster", "load_one" : "cpu_load",
"value" : "tinyfat", "cpu_load_core" : "cpu_load",
"if" : "*" "net_bytes_in_bw" : "net_bytes_in",
} "net_bytes_out_bw" : "net_bytes_out",
], "net_pkts_in_bw" : "net_pkts_in",
"rename_metrics" : { "net_pkts_out_bw" : "net_pkts_out",
"load_one" : "cpu_load", "ib_recv_bw" : "ib_recv",
"cpu_load_core" : "cpu_load", "ib_xmit_bw" : "ib_xmit",
"net_bytes_in_bw" : "net_bytes_in", "ib_recv_pkts_bw": "ib_recv_pkts",
"net_bytes_out_bw" : "net_bytes_out", "ib_xmit_pkts_bw": "ib_xmit_pkts",
"net_pkts_in_bw" : "net_pkts_in", "lustre_read_bytes_diff" : "lustre_read_bytes",
"net_pkts_out_bw" : "net_pkts_out", "lustre_read_requests_diff" : "lustre_read_requests",
"ib_recv_bw" : "ib_recv", "lustre_write_bytes_diff" : "lustre_write_bytes",
"ib_xmit_bw" : "ib_xmit", "lustre_write_requests_diff" : "lustre_write_requests",
"ib_recv_pkts_bw": "ib_recv_pkts", "lustre_open_diff" : "lustre_open",
"ib_xmit_pkts_bw": "ib_xmit_pkts", "lustre_close_diff" : "lustre_close",
"lustre_read_bytes_diff" : "lustre_read_bytes", "lustre_setattr_diff" : "lustre_setattr",
"lustre_read_requests_diff" : "lustre_read_requests", "lustre_getattr_diff" : "lustre_getattr",
"lustre_write_bytes_diff" : "lustre_write_bytes", "lustre_statfs_diff": "lustre_statfs",
"lustre_write_requests_diff" : "lustre_write_requests", "lustre_inode_permission_diff" : "lustre_inode_permission",
"lustre_open_diff" : "lustre_open", "pwr_pkg": "cpu_power",
"lustre_close_diff" : "lustre_close", "pwr_dram": "mem_power"
"lustre_setattr_diff" : "lustre_setattr", },
"lustre_getattr_diff" : "lustre_getattr", "add_tags_if" : [
"lustre_statfs_diff": "lustre_statfs", {
"lustre_inode_permission_diff" : "lustre_inode_permission", "key" : "cluster",
"pwr_pkg": "cpu_power", "value" : "tinyfat",
"pwr_dram": "mem_power" "if" : "true"
}
],
"drop_messages" : [
"net_bytes_in",
"net_bytes_out",
"ib_recv",
"ib_xmit",
"ib_recv_pkts",
"ib_xmit_pkts",
"net_pkts_in",
"net_pkts_out",
"lustre_read_bytes",
"lustre_read_requests",
"lustre_write_bytes",
"lustre_write_requests"
],
"change_unit_prefix": {
"name == 'mem_used'": "G",
"name == 'swap_used'": "G",
"name == 'mem_total'": "G",
"name == 'swap_total'": "G"
},
"normalize_messages" : true
}, },
"drop_metrics" : [
"net_bytes_in",
"net_bytes_out",
"ib_recv",
"ib_xmit",
"ib_recv_pkts",
"ib_xmit_pkts",
"net_pkts_in",
"net_pkts_out",
"lustre_read_bytes",
"lustre_read_requests",
"lustre_write_bytes",
"lustre_write_requests"
],
"interval_timestamp" : false, "interval_timestamp" : false,
"num_cache_intervals" : 0 "num_cache_intervals" : 0
} }

View File

@@ -1,26 +1,53 @@
{ {
"influx": { "influx" : {
"type": "influxasync", "type" : "influxasync",
"host": "monitoring-test.nhr.uni-erlangen.de", "host": "monitoring-test.nhr.uni-erlangen.de",
"port": "8086", "port": "8086",
"organization": "ClusterCockpit", "organization" : "ClusterCockpit",
"database": "tinyfat", "database" : "tinyfat",
"password": "XZY", "password": "XYZ",
"ssl": true, "ssl": true,
"meta_as_tags": [ "process_messages": {
"unit" "move_meta_to_tag_if": [
] {
}, "key": "unit",
"metricstore": { "if": "true"
"type": "http", }
"url": "http://monitoring.nhr.fau.de:8082/api/write?cluster=tinyfat", ]
"jwt": "XYZ",
"meta_as_tags": [
"unit"
],
"idle_connection_timeout": "60s",
"flush_delay": "2s",
"max_retries": 1,
"timeout": "10s"
} }
},
"metricstore" : {
"type" : "http",
"url" : "http://monitoring.nhr.fau.de:8082/api/write?cluster=tinyfat",
"jwt": "XYZ",
"idle_connection_timeout": "60s",
"flush_delay" : "2s",
"max_retries" : 1,
"timeout" : "10s",
"precision": "s",
"process_messages": {
"move_meta_to_tag_if": [
{
"key": "unit",
"if": "true"
}
]
}
},
"tinyfatnats": {
"type": "nats",
"host": "monitoring.nhr.fau.de",
"database": "tinyfat",
"nkey_file": "/etc/cc-metric-collector/nats.nkey",
"flush_delay": "1s",
"precision": "s",
"process_messages": {
"move_meta_to_tag_if": [
{
"key": "unit",
"if": "true"
}
]
}
}
} }

View File

@@ -1,30 +0,0 @@
{
"influx": {
"type": "influxasync",
"host": "monitoring-test.nhr.uni-erlangen.de",
"port": "8086",
"organization": "ClusterCockpit",
"database": "tinyfat",
"password": "XZY",
"ssl": true,
"meta_as_tags": [
"unit"
]
},
"metricstore": {
"type": "http",
"url": "http://monitoring.nhr.fau.de:8082/api/write?cluster=tinyfat",
"jwt": "XZY",
"meta_as_tags": [
"unit"
],
"idle_connection_timeout": "60s",
"flush_delay": "2s",
"max_retries": 1,
"timeout": "10s"
},
"debugstdout": {
"type": "stdout",
"output_file": "/tmp/debug.log"
}
}