From 1db5f3b29ad1dc78e6cd057ddfaf256b9608e0ee Mon Sep 17 00:00:00 2001 From: Thomas Gruber Date: Fri, 13 May 2022 14:09:45 +0200 Subject: [PATCH] Rename `cpu` type to `hwthread` (#69) * Rename 'cpu' type to 'hwthread' to avoid naming clashes with MetricStore and CC-Webfrontend --- .github/ci-sinks.json | 4 +- collectors/cpufreqCpuinfoMetric.go | 2 +- collectors/cpufreqCpuinfoMetric.md | 2 +- collectors/cpufreqMetric.go | 2 +- collectors/cpufreqMetric.md | 2 +- collectors/cpustatMetric.go | 2 +- collectors/likwidMetric.md | 50 +++++++++++++++---- collectors/sampleMetric.go | 7 ++- .../metricAggregatorFunctions.go | 2 +- scripts/likwid_perfgroup_to_cc_config.py | 2 +- 10 files changed, 56 insertions(+), 19 deletions(-) diff --git a/.github/ci-sinks.json b/.github/ci-sinks.json index aa8ae80..2b78305 100644 --- a/.github/ci-sinks.json +++ b/.github/ci-sinks.json @@ -1,6 +1,8 @@ { "testoutput" : { "type" : "stdout", - "meta_as_tags" : true + "meta_as_tags" : [ + "unit" + ] } } diff --git a/collectors/cpufreqCpuinfoMetric.go b/collectors/cpufreqCpuinfoMetric.go index 6c3de7a..96ee9c5 100644 --- a/collectors/cpufreqCpuinfoMetric.go +++ b/collectors/cpufreqCpuinfoMetric.go @@ -150,7 +150,7 @@ func (m *CPUFreqCpuInfoCollector) Init(config json.RawMessage) error { t.numNonHT = numNonHT t.numNonHT_int = numNonHT_int t.tagSet = map[string]string{ - "type": "cpu", + "type": "hwthread", "type-id": t.processor, "package_id": t.physicalPackageID, } diff --git a/collectors/cpufreqCpuinfoMetric.md b/collectors/cpufreqCpuinfoMetric.md index 8b0216f..de93176 100644 --- a/collectors/cpufreqCpuinfoMetric.md +++ b/collectors/cpufreqCpuinfoMetric.md @@ -4,7 +4,7 @@ "cpufreq_cpuinfo": {} ``` -The `cpufreq_cpuinfo` collector reads the clock frequency from `/proc/cpuinfo` and outputs a handful **cpu** metrics. +The `cpufreq_cpuinfo` collector reads the clock frequency from `/proc/cpuinfo` and outputs a handful of **hwthread** metrics. 
Metrics: * `cpufreq` diff --git a/collectors/cpufreqMetric.go b/collectors/cpufreqMetric.go index 0bf6d4c..076fdf5 100644 --- a/collectors/cpufreqMetric.go +++ b/collectors/cpufreqMetric.go @@ -161,7 +161,7 @@ func (m *CPUFreqCollector) Init(config json.RawMessage) error { t.numNonHT = numNonHT t.numNonHT_int = numNonHT_int t.tagSet = map[string]string{ - "type": "cpu", + "type": "hwthread", "type-id": t.processor, "package_id": t.physicalPackageID, } diff --git a/collectors/cpufreqMetric.md b/collectors/cpufreqMetric.md index b62d16e..71a6446 100644 --- a/collectors/cpufreqMetric.md +++ b/collectors/cpufreqMetric.md @@ -5,7 +5,7 @@ } ``` -The `cpufreq` collector reads the clock frequency from `/sys/devices/system/cpu/cpu*/cpufreq` and outputs a handful **cpu** metrics. +The `cpufreq` collector reads the clock frequency from `/sys/devices/system/cpu/cpu*/cpufreq` and outputs a handful of **hwthread** metrics. Metrics: * `cpufreq` \ No newline at end of file diff --git a/collectors/cpustatMetric.go b/collectors/cpustatMetric.go index 556aad4..2f2b084 100644 --- a/collectors/cpustatMetric.go +++ b/collectors/cpustatMetric.go @@ -82,7 +82,7 @@ func (m *CpustatCollector) Init(config json.RawMessage) error { if strings.HasPrefix(linefields[0], "cpu") && strings.Compare(linefields[0], "cpu") != 0 { cpustr := strings.TrimLeft(linefields[0], "cpu") cpu, _ := strconv.Atoi(cpustr) - m.cputags[linefields[0]] = map[string]string{"type": "cpu", "type-id": fmt.Sprintf("%d", cpu)} + m.cputags[linefields[0]] = map[string]string{"type": "hwthread", "type-id": fmt.Sprintf("%d", cpu)} num_cpus++ } } diff --git a/collectors/likwidMetric.md b/collectors/likwidMetric.md index 2d622d1..1bb211f 100644 --- a/collectors/likwidMetric.md +++ b/collectors/likwidMetric.md @@ -19,7 +19,7 @@ The `likwid` collector is probably the most complicated collector. 
The LIKWID li "calc": "COUNTER0 + COUNTER1", "publish": false, "unit": "myunit", - "type": "cpu" + "type": "hwthread" } ] } @@ -30,7 +30,7 @@ The `likwid` collector is probably the most complicated collector. The LIKWID li "calc": "sum_01", "publish": true, "unit": "myunit", - "type": "cpu" + "type": "hwthread" } ] } @@ -51,15 +51,15 @@ Additional options: Hardware performance counters are scattered all over the system nowadays. A counter coveres a specific part of the system. While there are hardware thread specific counter for CPU cycles, instructions and so on, some others are specific for a whole CPU socket/package. To address that, the LikwidCollector provides the specification of a `type` for each metric. -- `cpu` : One metric per CPU hardware thread with the tags `"type" : "cpu"` and `"type-id" : "$cpu_id"` +- `hwthread` : One metric per CPU hardware thread with the tags `"type" : "hwthread"` and `"type-id" : "$hwthread_id"` - `socket` : One metric per CPU socket/package with the tags `"type" : "socket"` and `"type-id" : "$socket_id"` -**Note:** You should not specify the `socket` type for a metric that is measured at `cpu` scope and vice versa, so some kind of expert knowledge or lookup work in the [Likwid Wiki](https://github.com/RRZE-HPC/likwid/wiki) is required. Get the scope of each counter from the *Architecture* pages and as soon as one counter in a metric is socket-specific, the whole metric is socket-specific. +**Note:** You cannot specify `socket` scope for a metric that is measured at `hwthread` scope, so some kind of expert knowledge or lookup work in the [Likwid Wiki](https://github.com/RRZE-HPC/likwid/wiki) is required. Get the scope of each counter from the *Architecture* pages and as soon as one counter in a metric is socket-specific, the whole metric is socket-specific. 
As a guideline: -- All counters `FIXCx`, `PMCy` and `TMAz` have the scope `cpu` +- All counters `FIXCx`, `PMCy` and `TMAz` have the scope `hwthread` - All counters names containing `BOX` have the scope `socket` -- All `PWRx` counters have scope `socket`, except `"PWR1" : "RAPL_CORE_ENERGY"` has `cpu` scope (AMD Zen) +- All `PWRx` counters have scope `socket`, except `"PWR1" : "RAPL_CORE_ENERGY"` has `hwthread` scope - All `DFCx` counters have scope `socket` ### Help with the configuration @@ -90,7 +90,7 @@ $ scripts/likwid_perfgroup_to_cc_config.py ICX MEM_DP "name": "Runtime (RDTSC) [s]", "publish": true, "unit": "seconds" - "scope": "cpu" + "scope": "hwthread" }, { "..." : "..." @@ -147,20 +147,20 @@ One might think this does not happen often but often used metrics in the world o { "name": "ipc", "calc": "PMC0/PMC1", - "type": "cpu", + "type": "hwthread", "publish": true }, { "name": "flops_any", "calc": "0.000001*PMC2/time", "unit": "MFlops/s", - "type": "cpu", + "type": "hwthread", "publish": true }, { "name": "clock", "calc": "0.000001*(FIXC1/FIXC2)/inverseClock", - "type": "cpu", + "type": "hwthread", "unit": "MHz", "publish": true }, @@ -219,3 +219,33 @@ One might think this does not happen often but often used metrics in the world o } ``` +### How to get the eventsets and metrics from LIKWID + +The `likwid` collector reads hardware performance counters at a **hwthread** and **socket** level. The configuration looks quite complicated but it is basically copy&paste from [LIKWID's performance groups](https://github.com/RRZE-HPC/likwid/tree/master/groups). The collector made multiple iterations and tried to use the performance groups but it lacked flexibility. The current way of configuration provides the most flexibility. + +The logic is as follows: There are multiple eventsets, each consisting of a list of counters+events and a list of metrics. 
If you compare a common performance group with the example setting above, there is not much difference: +``` +EVENTSET -> "events": { +FIXC1 ACTUAL_CPU_CLOCK -> "FIXC1": "ACTUAL_CPU_CLOCK", +FIXC2 MAX_CPU_CLOCK -> "FIXC2": "MAX_CPU_CLOCK", +PMC0 RETIRED_INSTRUCTIONS -> "PMC0" : "RETIRED_INSTRUCTIONS", +PMC1 CPU_CLOCKS_UNHALTED -> "PMC1" : "CPU_CLOCKS_UNHALTED", +PMC2 RETIRED_SSE_AVX_FLOPS_ALL -> "PMC2": "RETIRED_SSE_AVX_FLOPS_ALL", +PMC3 MERGE -> "PMC3": "MERGE", + -> } +``` + +The metrics follow the same procedure: + +``` +METRICS -> "metrics": [ +IPC PMC0/PMC1 -> { + -> "name" : "IPC", + -> "calc" : "PMC0/PMC1", + -> "scope": "hwthread", + -> "publish": true + -> } + -> ] +``` + +The script `scripts/likwid_perfgroup_to_cc_config.py` might help you. diff --git a/collectors/sampleMetric.go b/collectors/sampleMetric.go index 47078a6..bea4df0 100644 --- a/collectors/sampleMetric.go +++ b/collectors/sampleMetric.go @@ -42,7 +42,12 @@ func (m *SampleCollector) Init(config json.RawMessage) error { // The 'type' tag is always needed, it defines the granulatity of the metric // node -> whole system // socket -> CPU socket (requires socket ID as 'type-id' tag) - // cpu -> single CPU hardware thread (requires cpu ID as 'type-id' tag) + // die -> CPU die (requires CPU die ID as 'type-id' tag) + // memoryDomain -> NUMA domain (requires NUMA domain ID as 'type-id' tag) + // llc -> Last level cache (requires last level cache ID as 'type-id' tag) + // core -> single CPU core that may consist of multiple hardware threads (SMT) (requires core ID as 'type-id' tag) + // hwthread -> single CPU hardware thread (requires hardware thread ID as 'type-id' tag) + // accelerator -> An accelerator device like GPU or FPGA (requires an accelerator ID as 'type-id' tag) m.tags = map[string]string{"type": "node"} // Read in the JSON configuration if len(config) > 0 { diff --git a/internal/metricAggregator/metricAggregatorFunctions.go b/internal/metricAggregator/metricAggregatorFunctions.go 
index 1fbef65..0228047 100644 --- a/internal/metricAggregator/metricAggregatorFunctions.go +++ b/internal/metricAggregator/metricAggregatorFunctions.go @@ -355,7 +355,7 @@ func getCpuListOfType(args ...interface{}) (interface{}, error) { return getCpuListOfNumaDomainFunc(args[1]) case "core": return getCpuListOfCoreFunc(args[1]) - case "cpu": + case "hwthread": var cpu int switch id := args[1].(type) { diff --git a/scripts/likwid_perfgroup_to_cc_config.py b/scripts/likwid_perfgroup_to_cc_config.py index f1c3451..52959ed 100755 --- a/scripts/likwid_perfgroup_to_cc_config.py +++ b/scripts/likwid_perfgroup_to_cc_config.py @@ -39,7 +39,7 @@ def group_to_json(groupfile): llist = re.split("\s+", line) calc = llist[-1] metric = " ".join(llist[:-1]) - scope = "cpu" + scope = "hwthread" if "BOX" in calc: scope = "socket" if "PWR" in calc: