From 07fe17c2d1a280da6ed3f989b6788e9d0d9e5272 Mon Sep 17 00:00:00 2001 From: Thomas Roehl Date: Fri, 18 Mar 2022 13:41:54 +0100 Subject: [PATCH] Rename 'cpu' type to 'hwthread' to avoid naming clashes with MetricStore and CC-Webfrontend --- collectors/cpufreqCpuinfoMetric.go | 2 +- collectors/cpufreqCpuinfoMetric.md | 2 +- collectors/cpufreqMetric.go | 2 +- collectors/cpufreqMetric.md | 2 +- collectors/cpustatMetric.go | 2 +- collectors/likwidMetric.md | 22 +++++++++---------- collectors/sampleMetric.go | 7 +++++- .../metricAggregatorFunctions.go | 2 +- scripts/likwid_perfgroup_to_cc_config.py | 2 +- 9 files changed, 24 insertions(+), 19 deletions(-) diff --git a/collectors/cpufreqCpuinfoMetric.go b/collectors/cpufreqCpuinfoMetric.go index 6c3de7a..96ee9c5 100644 --- a/collectors/cpufreqCpuinfoMetric.go +++ b/collectors/cpufreqCpuinfoMetric.go @@ -150,7 +150,7 @@ func (m *CPUFreqCpuInfoCollector) Init(config json.RawMessage) error { t.numNonHT = numNonHT t.numNonHT_int = numNonHT_int t.tagSet = map[string]string{ - "type": "cpu", + "type": "hwthread", "type-id": t.processor, "package_id": t.physicalPackageID, } diff --git a/collectors/cpufreqCpuinfoMetric.md b/collectors/cpufreqCpuinfoMetric.md index 8b0216f..de93176 100644 --- a/collectors/cpufreqCpuinfoMetric.md +++ b/collectors/cpufreqCpuinfoMetric.md @@ -4,7 +4,7 @@ "cpufreq_cpuinfo": {} ``` -The `cpufreq_cpuinfo` collector reads the clock frequency from `/proc/cpuinfo` and outputs a handful **cpu** metrics. +The `cpufreq_cpuinfo` collector reads the clock frequency from `/proc/cpuinfo` and outputs a handful of **hwthread** metrics. 
Metrics: * `cpufreq` diff --git a/collectors/cpufreqMetric.go b/collectors/cpufreqMetric.go index 0bf6d4c..076fdf5 100644 --- a/collectors/cpufreqMetric.go +++ b/collectors/cpufreqMetric.go @@ -161,7 +161,7 @@ func (m *CPUFreqCollector) Init(config json.RawMessage) error { t.numNonHT = numNonHT t.numNonHT_int = numNonHT_int t.tagSet = map[string]string{ - "type": "cpu", + "type": "hwthread", "type-id": t.processor, "package_id": t.physicalPackageID, } diff --git a/collectors/cpufreqMetric.md b/collectors/cpufreqMetric.md index b62d16e..71a6446 100644 --- a/collectors/cpufreqMetric.md +++ b/collectors/cpufreqMetric.md @@ -5,7 +5,7 @@ } ``` -The `cpufreq` collector reads the clock frequency from `/sys/devices/system/cpu/cpu*/cpufreq` and outputs a handful **cpu** metrics. +The `cpufreq` collector reads the clock frequency from `/sys/devices/system/cpu/cpu*/cpufreq` and outputs a handful of **hwthread** metrics. Metrics: * `cpufreq` \ No newline at end of file diff --git a/collectors/cpustatMetric.go b/collectors/cpustatMetric.go index 556aad4..2f2b084 100644 --- a/collectors/cpustatMetric.go +++ b/collectors/cpustatMetric.go @@ -82,7 +82,7 @@ func (m *CpustatCollector) Init(config json.RawMessage) error { if strings.HasPrefix(linefields[0], "cpu") && strings.Compare(linefields[0], "cpu") != 0 { cpustr := strings.TrimLeft(linefields[0], "cpu") cpu, _ := strconv.Atoi(cpustr) - m.cputags[linefields[0]] = map[string]string{"type": "cpu", "type-id": fmt.Sprintf("%d", cpu)} + m.cputags[linefields[0]] = map[string]string{"type": "hwthread", "type-id": fmt.Sprintf("%d", cpu)} num_cpus++ } } diff --git a/collectors/likwidMetric.md b/collectors/likwidMetric.md index fe28857..86c1dda 100644 --- a/collectors/likwidMetric.md +++ b/collectors/likwidMetric.md @@ -4,7 +4,7 @@ The `likwid` collector is probably the most complicated collector. The LIKWID library is included as static library with *direct* access mode. 
The *direct* access mode is suitable if the daemon is executed by a root user. The static library does not contain the performance groups, so all information needs to be provided in the configuration. The `likwid` configuration consists of two parts, the "eventsets" and "globalmetrics": -- An event set list itself has two parts, the "events" and a set of derivable "metrics". Each of the "events" is a counter:event pair in LIKWID's syntax. The "metrics" are a list of formulas to derive the metric value from the measurements of the "events". Each metric has a name, the formula, a scope and a publish flag. Counter names can be used like variables in the formulas, so `PMC0+PMC1` sums the measurements for the both events configured in the counters `PMC0` and `PMC1`. The scope tells the Collector whether it is a metric for each hardware thread (`cpu`) or each CPU socket (`socket`). You may specify a unit for the metric with `unit`. The last one is the publishing flag. It tells the collector whether a metric should be sent to the router. +- An event set list itself has two parts, the "events" and a set of derivable "metrics". Each of the "events" is a counter:event pair in LIKWID's syntax. The "metrics" are a list of formulas to derive the metric value from the measurements of the "events". Each metric has a name, the formula, a scope and a publish flag. Counter names can be used like variables in the formulas, so `PMC0+PMC1` sums the measurements for the both events configured in the counters `PMC0` and `PMC1`. The scope tells the Collector whether it is a metric for each hardware thread (`hwthread`) or each CPU socket (`socket`). You may specify a unit for the metric with `unit`. The last one is the publishing flag. It tells the collector whether a metric should be sent to the router. - The global metrics are metrics which require data from all event set measurements to be derived. The inputs are the metrics in the event sets. 
Similar to the metrics in the event sets, the global metrics are defined by a name, a formula, a scope and a publish flag. See event set metrics for details. The only difference is that there is no access to the raw event measurements anymore but only to the metrics. So, the idea is to derive a metric in the "eventsets" section and reuse it in the "globalmetrics" part. If you need a metric only for deriving the global metrics, disable forwarding of the event set metrics (`publish=false`). **Be aware** that the combination might be misleading because the "behavior" of a metric changes over time and the multiple measurements might count different computing phases. Similar to the metrics in the eventset, you can specify a metric unit with the `unit` field. Additional options: @@ -20,15 +20,15 @@ Additional options: Hardware performance counters are scattered all over the system nowadays. A counter coveres a specific part of the system. While there are hardware thread specific counter for CPU cycles, instructions and so on, some others are specific for a whole CPU socket/package. To address that, the collector provides the specification of a 'scope' for each metric. -- `cpu` : One metric per CPU hardware thread with the tags `"type" : "cpu"` and `"type-id" : "$cpu_id"` +- `hwthread` : One metric per CPU hardware thread with the tags `"type" : "hwthread"` and `"type-id" : "$hwthread_id"` - `socket` : One metric per CPU socket/package with the tags `"type" : "socket"` and `"type-id" : "$socket_id"` -**Note:** You cannot specify `socket` scope for a metric that is measured at `cpu` scope, so some kind of expert knowledge or lookup work in the [Likwid Wiki](https://github.com/RRZE-HPC/likwid/wiki) is required. Get the scope of each counter from the *Architecture* pages and as soon as one counter in a metric is socket-specific, the whole metric is socket-specific. 
+**Note:** You cannot specify `socket` scope for a metric that is measured at `hwthread` scope, so some kind of expert knowledge or lookup work in the [Likwid Wiki](https://github.com/RRZE-HPC/likwid/wiki) is required. Get the scope of each counter from the *Architecture* pages and as soon as one counter in a metric is socket-specific, the whole metric is socket-specific. As a guideline: -- All counters `FIXCx`, `PMCy` and `TMAz` have the scope `cpu` +- All counters `FIXCx`, `PMCy` and `TMAz` have the scope `hwthread` - All counters names containing `BOX` have the scope `socket` -- All `PWRx` counters have scope `socket`, except `"PWR1" : "RAPL_CORE_ENERGY"` has `cpu` scope +- All `PWRx` counters have scope `socket`, except `"PWR1" : "RAPL_CORE_ENERGY"` has `hwthread` scope - All `DFCx` counters have scope `socket` ### Help with the configuration @@ -58,7 +58,7 @@ $ scripts/likwid_perfgroup_to_cc_config.py ICX MEM_DP "name": "Runtime (RDTSC) [s]", "publish": true, "unit": "seconds" - "scope": "cpu" + "scope": "hwthread" }, { "..." : "..." @@ -108,20 +108,20 @@ $ chwon $CCUSER /var/run/likwid.lock { "name": "ipc", "calc": "PMC0/PMC1", - "type": "cpu", + "type": "hwthread", "publish": true }, { "name": "flops_any", "calc": "0.000001*PMC2/time", "unit": "MFlops/s", - "type": "cpu", + "type": "hwthread", "publish": true }, { "name": "clock", "calc": "0.000001*(FIXC1/FIXC2)/inverseClock", - "type": "cpu", + "type": "hwthread", "unit": "MHz", "publish": true }, @@ -182,7 +182,7 @@ $ chwon $CCUSER /var/run/likwid.lock ### How to get the eventsets and metrics from LIKWID -The `likwid` collector reads hardware performance counters at a **cpu** and **socket** level. The configuration looks quite complicated but it is basically copy&paste from [LIKWID's performance groups](https://github.com/RRZE-HPC/likwid/tree/master/groups). The collector made multiple iterations and tried to use the performance groups but it lacked flexibility. 
The current way of configuration provides most flexibility. +The `likwid` collector reads hardware performance counters at a **hwthread** and **socket** level. The configuration looks quite complicated but it is basically copy&paste from [LIKWID's performance groups](https://github.com/RRZE-HPC/likwid/tree/master/groups). The collector made multiple iterations and tried to use the performance groups but it lacked flexibility. The current way of configuration provides most flexibility. The logic is as following: There are multiple eventsets, each consisting of a list of counters+events and a list of metrics. If you compare a common performance group with the example setting above, there is not much difference: ``` @@ -203,7 +203,7 @@ METRICS -> "metrics": [ IPC PMC0/PMC1 -> { -> "name" : "IPC", -> "calc" : "PMC0/PMC1", - -> "scope": "cpu", + -> "scope": "hwthread", -> "publish": true -> } -> ] diff --git a/collectors/sampleMetric.go b/collectors/sampleMetric.go index 47078a6..bea4df0 100644 --- a/collectors/sampleMetric.go +++ b/collectors/sampleMetric.go @@ -42,7 +42,12 @@ func (m *SampleCollector) Init(config json.RawMessage) error { // The 'type' tag is always needed, it defines the granulatity of the metric // node -> whole system // socket -> CPU socket (requires socket ID as 'type-id' tag) - // cpu -> single CPU hardware thread (requires cpu ID as 'type-id' tag) + // die -> CPU die (requires CPU die ID as 'type-id' tag) + // memoryDomain -> NUMA domain (requires NUMA domain ID as 'type-id' tag) + // llc -> Last level cache (requires last level cache ID as 'type-id' tag) + // core -> single CPU core that may consist of multiple hardware threads (SMT) (requires core ID as 'type-id' tag) + // hwthread -> single CPU hardware thread (requires hardware thread ID as 'type-id' tag) + // accelerator -> An accelerator device like GPU or FPGA (requires an accelerator ID as 'type-id' tag) m.tags = map[string]string{"type": "node"} // Read in the JSON configuration if
len(config) > 0 { diff --git a/internal/metricAggregator/metricAggregatorFunctions.go b/internal/metricAggregator/metricAggregatorFunctions.go index 1fbef65..0228047 100644 --- a/internal/metricAggregator/metricAggregatorFunctions.go +++ b/internal/metricAggregator/metricAggregatorFunctions.go @@ -355,7 +355,7 @@ func getCpuListOfType(args ...interface{}) (interface{}, error) { return getCpuListOfNumaDomainFunc(args[1]) case "core": return getCpuListOfCoreFunc(args[1]) - case "cpu": + case "hwthread": var cpu int switch id := args[1].(type) { diff --git a/scripts/likwid_perfgroup_to_cc_config.py b/scripts/likwid_perfgroup_to_cc_config.py index f1c3451..52959ed 100755 --- a/scripts/likwid_perfgroup_to_cc_config.py +++ b/scripts/likwid_perfgroup_to_cc_config.py @@ -39,7 +39,7 @@ def group_to_json(groupfile): llist = re.split("\s+", line) calc = llist[-1] metric = " ".join(llist[:-1]) - scope = "cpu" + scope = "hwthread" if "BOX" in calc: scope = "socket" if "PWR" in calc: