mirror of
https://github.com/ClusterCockpit/cc-metric-collector.git
synced 2024-11-10 04:27:25 +01:00
Rename cpu
type to hwthread
(#69)
* Rename 'cpu' type to 'hwthread' to avoid naming clashes with MetricStore and CC-Webfrontend
This commit is contained in:
parent
0623691bab
commit
1db5f3b29a
4
.github/ci-sinks.json
vendored
4
.github/ci-sinks.json
vendored
@ -1,6 +1,8 @@
|
|||||||
{
|
{
|
||||||
"testoutput" : {
|
"testoutput" : {
|
||||||
"type" : "stdout",
|
"type" : "stdout",
|
||||||
"meta_as_tags" : true
|
"meta_as_tags" : [
|
||||||
|
"unit"
|
||||||
|
]
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -150,7 +150,7 @@ func (m *CPUFreqCpuInfoCollector) Init(config json.RawMessage) error {
|
|||||||
t.numNonHT = numNonHT
|
t.numNonHT = numNonHT
|
||||||
t.numNonHT_int = numNonHT_int
|
t.numNonHT_int = numNonHT_int
|
||||||
t.tagSet = map[string]string{
|
t.tagSet = map[string]string{
|
||||||
"type": "cpu",
|
"type": "hwthread",
|
||||||
"type-id": t.processor,
|
"type-id": t.processor,
|
||||||
"package_id": t.physicalPackageID,
|
"package_id": t.physicalPackageID,
|
||||||
}
|
}
|
||||||
|
@ -4,7 +4,7 @@
|
|||||||
"cpufreq_cpuinfo": {}
|
"cpufreq_cpuinfo": {}
|
||||||
```
|
```
|
||||||
|
|
||||||
The `cpufreq_cpuinfo` collector reads the clock frequency from `/proc/cpuinfo` and outputs a handful of **cpu** metrics.
|
The `cpufreq_cpuinfo` collector reads the clock frequency from `/proc/cpuinfo` and outputs a handful of **hwthread** metrics.
|
||||||
|
|
||||||
Metrics:
|
Metrics:
|
||||||
* `cpufreq`
|
* `cpufreq`
|
||||||
|
@ -161,7 +161,7 @@ func (m *CPUFreqCollector) Init(config json.RawMessage) error {
|
|||||||
t.numNonHT = numNonHT
|
t.numNonHT = numNonHT
|
||||||
t.numNonHT_int = numNonHT_int
|
t.numNonHT_int = numNonHT_int
|
||||||
t.tagSet = map[string]string{
|
t.tagSet = map[string]string{
|
||||||
"type": "cpu",
|
"type": "hwthread",
|
||||||
"type-id": t.processor,
|
"type-id": t.processor,
|
||||||
"package_id": t.physicalPackageID,
|
"package_id": t.physicalPackageID,
|
||||||
}
|
}
|
||||||
|
@ -5,7 +5,7 @@
|
|||||||
}
|
}
|
||||||
```
|
```
|
||||||
|
|
||||||
The `cpufreq` collector reads the clock frequency from `/sys/devices/system/cpu/cpu*/cpufreq` and outputs a handful of **cpu** metrics.
|
The `cpufreq` collector reads the clock frequency from `/sys/devices/system/cpu/cpu*/cpufreq` and outputs a handful of **hwthread** metrics.
|
||||||
|
|
||||||
Metrics:
|
Metrics:
|
||||||
* `cpufreq`
|
* `cpufreq`
|
@ -82,7 +82,7 @@ func (m *CpustatCollector) Init(config json.RawMessage) error {
|
|||||||
if strings.HasPrefix(linefields[0], "cpu") && strings.Compare(linefields[0], "cpu") != 0 {
|
if strings.HasPrefix(linefields[0], "cpu") && strings.Compare(linefields[0], "cpu") != 0 {
|
||||||
cpustr := strings.TrimLeft(linefields[0], "cpu")
|
cpustr := strings.TrimLeft(linefields[0], "cpu")
|
||||||
cpu, _ := strconv.Atoi(cpustr)
|
cpu, _ := strconv.Atoi(cpustr)
|
||||||
m.cputags[linefields[0]] = map[string]string{"type": "cpu", "type-id": fmt.Sprintf("%d", cpu)}
|
m.cputags[linefields[0]] = map[string]string{"type": "hwthread", "type-id": fmt.Sprintf("%d", cpu)}
|
||||||
num_cpus++
|
num_cpus++
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -19,7 +19,7 @@ The `likwid` collector is probably the most complicated collector. The LIKWID li
|
|||||||
"calc": "COUNTER0 + COUNTER1",
|
"calc": "COUNTER0 + COUNTER1",
|
||||||
"publish": false,
|
"publish": false,
|
||||||
"unit": "myunit",
|
"unit": "myunit",
|
||||||
"type": "cpu"
|
"type": "hwthread"
|
||||||
}
|
}
|
||||||
]
|
]
|
||||||
}
|
}
|
||||||
@ -30,7 +30,7 @@ The `likwid` collector is probably the most complicated collector. The LIKWID li
|
|||||||
"calc": "sum_01",
|
"calc": "sum_01",
|
||||||
"publish": true,
|
"publish": true,
|
||||||
"unit": "myunit",
|
"unit": "myunit",
|
||||||
"type": "cpu"
|
"type": "hwthread"
|
||||||
}
|
}
|
||||||
]
|
]
|
||||||
}
|
}
|
||||||
@ -51,15 +51,15 @@ Additional options:
|
|||||||
|
|
||||||
Hardware performance counters are scattered all over the system nowadays. A counter covers a specific part of the system. While there are hardware-thread-specific counters for CPU cycles, instructions and so on, some others are specific to a whole CPU socket/package. To address that, the LikwidCollector provides the specification of a `type` for each metric.
|
Hardware performance counters are scattered all over the system nowadays. A counter covers a specific part of the system. While there are hardware-thread-specific counters for CPU cycles, instructions and so on, some others are specific to a whole CPU socket/package. To address that, the LikwidCollector provides the specification of a `type` for each metric.
|
||||||
|
|
||||||
- `cpu` : One metric per CPU hardware thread with the tags `"type" : "cpu"` and `"type-id" : "$cpu_id"`
|
- `hwthread` : One metric per CPU hardware thread with the tags `"type" : "hwthread"` and `"type-id" : "$hwthread_id"`
|
||||||
- `socket` : One metric per CPU socket/package with the tags `"type" : "socket"` and `"type-id" : "$socket_id"`
|
- `socket` : One metric per CPU socket/package with the tags `"type" : "socket"` and `"type-id" : "$socket_id"`
|
||||||
|
|
||||||
**Note:** You should not specify the `socket` type for a metric that is measured at `cpu` scope and vice versa, so some kind of expert knowledge or lookup work in the [Likwid Wiki](https://github.com/RRZE-HPC/likwid/wiki) is required. Get the scope of each counter from the *Architecture* pages and as soon as one counter in a metric is socket-specific, the whole metric is socket-specific.
|
**Note:** You cannot specify `socket` scope for a metric that is measured at `hwthread` scope, so some kind of expert knowledge or lookup work in the [Likwid Wiki](https://github.com/RRZE-HPC/likwid/wiki) is required. Get the scope of each counter from the *Architecture* pages and as soon as one counter in a metric is socket-specific, the whole metric is socket-specific.
|
||||||
|
|
||||||
As a guideline:
|
As a guideline:
|
||||||
- All counters `FIXCx`, `PMCy` and `TMAz` have the scope `cpu`
|
- All counters `FIXCx`, `PMCy` and `TMAz` have the scope `hwthread`
|
||||||
- All counter names containing `BOX` have the scope `socket`
|
- All counter names containing `BOX` have the scope `socket`
|
||||||
- All `PWRx` counters have scope `socket`, except `"PWR1" : "RAPL_CORE_ENERGY"` has `cpu` scope (AMD Zen)
|
- All `PWRx` counters have scope `socket`, except `"PWR1" : "RAPL_CORE_ENERGY"` has `hwthread` scope
|
||||||
- All `DFCx` counters have scope `socket`
|
- All `DFCx` counters have scope `socket`
|
||||||
|
|
||||||
### Help with the configuration
|
### Help with the configuration
|
||||||
@ -90,7 +90,7 @@ $ scripts/likwid_perfgroup_to_cc_config.py ICX MEM_DP
|
|||||||
"name": "Runtime (RDTSC) [s]",
|
"name": "Runtime (RDTSC) [s]",
|
||||||
"publish": true,
|
"publish": true,
|
||||||
"unit": "seconds"
|
"unit": "seconds"
|
||||||
"scope": "cpu"
|
"scope": "hwthread"
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"..." : "..."
|
"..." : "..."
|
||||||
@ -147,20 +147,20 @@ One might think this does not happen often but often used metrics in the world o
|
|||||||
{
|
{
|
||||||
"name": "ipc",
|
"name": "ipc",
|
||||||
"calc": "PMC0/PMC1",
|
"calc": "PMC0/PMC1",
|
||||||
"type": "cpu",
|
"type": "hwthread",
|
||||||
"publish": true
|
"publish": true
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"name": "flops_any",
|
"name": "flops_any",
|
||||||
"calc": "0.000001*PMC2/time",
|
"calc": "0.000001*PMC2/time",
|
||||||
"unit": "MFlops/s",
|
"unit": "MFlops/s",
|
||||||
"type": "cpu",
|
"type": "hwthread",
|
||||||
"publish": true
|
"publish": true
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"name": "clock",
|
"name": "clock",
|
||||||
"calc": "0.000001*(FIXC1/FIXC2)/inverseClock",
|
"calc": "0.000001*(FIXC1/FIXC2)/inverseClock",
|
||||||
"type": "cpu",
|
"type": "hwthread",
|
||||||
"unit": "MHz",
|
"unit": "MHz",
|
||||||
"publish": true
|
"publish": true
|
||||||
},
|
},
|
||||||
@ -219,3 +219,33 @@ One might think this does not happen often but often used metrics in the world o
|
|||||||
}
|
}
|
||||||
```
|
```
|
||||||
|
|
||||||
|
### How to get the eventsets and metrics from LIKWID
|
||||||
|
|
||||||
|
The `likwid` collector reads hardware performance counters at a **hwthread** and **socket** level. The configuration looks quite complicated but it is basically copy & paste from [LIKWID's performance groups](https://github.com/RRZE-HPC/likwid/tree/master/groups). The collector went through multiple iterations that tried to use the performance groups directly, but that approach lacked flexibility. The current way of configuration provides the most flexibility.
|
||||||
|
|
||||||
|
The logic is as follows: There are multiple eventsets, each consisting of a list of counters+events and a list of metrics. If you compare a common performance group with the example setting above, there is not much difference:
|
||||||
|
```
|
||||||
|
EVENTSET -> "events": {
|
||||||
|
FIXC1 ACTUAL_CPU_CLOCK -> "FIXC1": "ACTUAL_CPU_CLOCK",
|
||||||
|
FIXC2 MAX_CPU_CLOCK -> "FIXC2": "MAX_CPU_CLOCK",
|
||||||
|
PMC0 RETIRED_INSTRUCTIONS -> "PMC0" : "RETIRED_INSTRUCTIONS",
|
||||||
|
PMC1 CPU_CLOCKS_UNHALTED -> "PMC1" : "CPU_CLOCKS_UNHALTED",
|
||||||
|
PMC2 RETIRED_SSE_AVX_FLOPS_ALL -> "PMC2": "RETIRED_SSE_AVX_FLOPS_ALL",
|
||||||
|
PMC3 MERGE -> "PMC3": "MERGE",
|
||||||
|
-> }
|
||||||
|
```
|
||||||
|
|
||||||
|
The metrics follow the same procedure:
|
||||||
|
|
||||||
|
```
|
||||||
|
METRICS -> "metrics": [
|
||||||
|
IPC PMC0/PMC1 -> {
|
||||||
|
-> "name" : "IPC",
|
||||||
|
-> "calc" : "PMC0/PMC1",
|
||||||
|
-> "scope": "hwthread",
|
||||||
|
-> "publish": true
|
||||||
|
-> }
|
||||||
|
-> ]
|
||||||
|
```
|
||||||
|
|
||||||
|
The script `scripts/likwid_perfgroup_to_cc_config.py` might help you.
|
||||||
|
@ -42,7 +42,12 @@ func (m *SampleCollector) Init(config json.RawMessage) error {
|
|||||||
// The 'type' tag is always needed, it defines the granularity of the metric
|
// The 'type' tag is always needed, it defines the granularity of the metric
|
||||||
// node -> whole system
|
// node -> whole system
|
||||||
// socket -> CPU socket (requires socket ID as 'type-id' tag)
|
// socket -> CPU socket (requires socket ID as 'type-id' tag)
|
||||||
// cpu -> single CPU hardware thread (requires cpu ID as 'type-id' tag)
|
// die -> CPU die (requires CPU die ID as 'type-id' tag)
|
||||||
|
// memoryDomain -> NUMA domain (requires NUMA domain ID as 'type-id' tag)
|
||||||
|
// llc -> Last level cache (requires last level cache ID as 'type-id' tag)
|
||||||
|
// core -> single CPU core that may consist of multiple hardware threads (SMT) (requires core ID as 'type-id' tag)
|
||||||
|
// hwthread -> single CPU hardware thread (requires hardware thread ID as 'type-id' tag)
|
||||||
|
// accelerator -> An accelerator device like GPU or FPGA (requires an accelerator ID as 'type-id' tag)
|
||||||
m.tags = map[string]string{"type": "node"}
|
m.tags = map[string]string{"type": "node"}
|
||||||
// Read in the JSON configuration
|
// Read in the JSON configuration
|
||||||
if len(config) > 0 {
|
if len(config) > 0 {
|
||||||
|
@ -355,7 +355,7 @@ func getCpuListOfType(args ...interface{}) (interface{}, error) {
|
|||||||
return getCpuListOfNumaDomainFunc(args[1])
|
return getCpuListOfNumaDomainFunc(args[1])
|
||||||
case "core":
|
case "core":
|
||||||
return getCpuListOfCoreFunc(args[1])
|
return getCpuListOfCoreFunc(args[1])
|
||||||
case "cpu":
|
case "hwthread":
|
||||||
var cpu int
|
var cpu int
|
||||||
|
|
||||||
switch id := args[1].(type) {
|
switch id := args[1].(type) {
|
||||||
|
@ -39,7 +39,7 @@ def group_to_json(groupfile):
|
|||||||
llist = re.split("\s+", line)
|
llist = re.split("\s+", line)
|
||||||
calc = llist[-1]
|
calc = llist[-1]
|
||||||
metric = " ".join(llist[:-1])
|
metric = " ".join(llist[:-1])
|
||||||
scope = "cpu"
|
scope = "hwthread"
|
||||||
if "BOX" in calc:
|
if "BOX" in calc:
|
||||||
scope = "socket"
|
scope = "socket"
|
||||||
if "PWR" in calc:
|
if "PWR" in calc:
|
||||||
|
Loading…
Reference in New Issue
Block a user