mirror of
https://github.com/ClusterCockpit/cc-metric-collector.git
synced 2025-01-13 23:59:13 +01:00
Add collector for always running energy measurements with LIKWID
This commit is contained in:
parent
6aada60d97
commit
303fe1d80f
@ -41,6 +41,7 @@ var AvailableCollectors = map[string]MetricCollector{
|
|||||||
"self": new(SelfCollector),
|
"self": new(SelfCollector),
|
||||||
"schedstat": new(SchedstatCollector),
|
"schedstat": new(SchedstatCollector),
|
||||||
"nfsiostat": new(NfsIOStatCollector),
|
"nfsiostat": new(NfsIOStatCollector),
|
||||||
|
"likwidenergy": new(LikwidEnergyCollector),
|
||||||
}
|
}
|
||||||
|
|
||||||
// Metric collector manager data structure
|
// Metric collector manager data structure
|
||||||
|
281
collectors/likwidenergyMetric.go
Normal file
281
collectors/likwidenergyMetric.go
Normal file
@ -0,0 +1,281 @@
|
|||||||
|
package collectors
|
||||||
|
|
||||||
|
/*
|
||||||
|
#cgo CFLAGS: -I./likwid
|
||||||
|
#cgo LDFLAGS: -Wl,--unresolved-symbols=ignore-in-object-files
|
||||||
|
#include <stdlib.h>
|
||||||
|
#include <likwid.h>
|
||||||
|
*/
|
||||||
|
import "C"
|
||||||
|
import (
|
||||||
|
"encoding/json"
|
||||||
|
"fmt"
|
||||||
|
"os"
|
||||||
|
"strings"
|
||||||
|
"time"
|
||||||
|
|
||||||
|
cclog "github.com/ClusterCockpit/cc-metric-collector/pkg/ccLogger"
|
||||||
|
lp "github.com/ClusterCockpit/cc-metric-collector/pkg/ccMetric"
|
||||||
|
topo "github.com/ClusterCockpit/cc-metric-collector/pkg/ccTopology"
|
||||||
|
"github.com/NVIDIA/go-nvml/pkg/dl"
|
||||||
|
)
|
||||||
|
|
||||||
|
const (
|
||||||
|
LIKWIDENERGY_LIB_NAME = "liblikwid.so"
|
||||||
|
LIKWIDENERGY_LIB_DL_FLAGS = dl.RTLD_LAZY | dl.RTLD_GLOBAL
|
||||||
|
LIKWIDENERGY_DEF_ACCESSMODE = "direct"
|
||||||
|
LIKWIDENERGY_DEF_LOCKFILE = "/var/run/likwid.lock"
|
||||||
|
)
|
||||||
|
|
||||||
|
// These are the fields we read from the JSON configuration
|
||||||
|
type LikwidEnergyCollectorConfig struct {
|
||||||
|
AccessMode string `json:"access_mode,omitempty"`
|
||||||
|
DaemonPath string `json:"accessdaemon_path,omitempty"`
|
||||||
|
LibraryPath string `json:"liblikwid_path,omitempty"`
|
||||||
|
LockfilePath string `json:"lockfile_path,omitempty"`
|
||||||
|
SendDiff bool `json:"send_difference,omitempty"`
|
||||||
|
SendAbs bool `json:"send_absolute,omitempty"`
|
||||||
|
}
|
||||||
|
|
||||||
|
type LikwidEnergyDomainEntry struct {
|
||||||
|
readcpu int
|
||||||
|
value uint32
|
||||||
|
total uint64
|
||||||
|
tags map[string]string
|
||||||
|
}
|
||||||
|
|
||||||
|
type LikwidEnergyDomain struct {
|
||||||
|
values map[int]LikwidEnergyDomainEntry
|
||||||
|
granularity string
|
||||||
|
metricname string
|
||||||
|
domaintype int
|
||||||
|
energyUnit float64
|
||||||
|
}
|
||||||
|
|
||||||
|
// This contains all variables we need during execution and the variables
|
||||||
|
// defined by metricCollector (name, init, ...)
|
||||||
|
type LikwidEnergyCollector struct {
|
||||||
|
metricCollector
|
||||||
|
config LikwidEnergyCollectorConfig // the configuration structure
|
||||||
|
meta map[string]string // default meta information
|
||||||
|
tags map[string]string // default tags
|
||||||
|
domains map[int]LikwidEnergyDomain
|
||||||
|
}
|
||||||
|
|
||||||
|
// Init initializes the sample collector
|
||||||
|
// Called once by the collector manager
|
||||||
|
// All tags, meta data tags and metrics that do not change over the runtime should be set here
|
||||||
|
func (m *LikwidEnergyCollector) Init(config json.RawMessage) error {
|
||||||
|
var err error = nil
|
||||||
|
// Always set the name early in Init() to use it in cclog.Component* functions
|
||||||
|
m.name = "LikwidEnergyCollector"
|
||||||
|
// This is for later use, also call it early
|
||||||
|
m.setup()
|
||||||
|
// Tell whether the collector should be run in parallel with others (reading files, ...)
|
||||||
|
// or it should be run serially, mostly for collectors actually doing measurements
|
||||||
|
// because they should not measure the execution of the other collectors
|
||||||
|
m.parallel = true
|
||||||
|
// Define meta information sent with each metric
|
||||||
|
// (Can also be dynamic or this is the basic set with extension through AddMeta())
|
||||||
|
m.meta = map[string]string{"source": m.name, "group": "LIKWID", "unit": "Joules"}
|
||||||
|
// Define tags sent with each metric
|
||||||
|
// The 'type' tag is always needed, it defines the granularity of the metric
|
||||||
|
// node -> whole system
|
||||||
|
// socket -> CPU socket (requires socket ID as 'type-id' tag)
|
||||||
|
// die -> CPU die (requires CPU die ID as 'type-id' tag)
|
||||||
|
// memoryDomain -> NUMA domain (requires NUMA domain ID as 'type-id' tag)
|
||||||
|
// llc -> Last level cache (requires last level cache ID as 'type-id' tag)
|
||||||
|
// core -> single CPU core that may consist of multiple hardware threads (SMT) (requires core ID as 'type-id' tag)
|
||||||
|
// hwthtread -> single CPU hardware thread (requires hardware thread ID as 'type-id' tag)
|
||||||
|
// accelerator -> A accelerator device like GPU or FPGA (requires an accelerator ID as 'type-id' tag)
|
||||||
|
m.tags = map[string]string{}
|
||||||
|
// Read in the JSON configuration
|
||||||
|
m.config.AccessMode = LIKWID_DEF_ACCESSMODE
|
||||||
|
m.config.LibraryPath = LIKWID_LIB_NAME
|
||||||
|
m.config.LockfilePath = LIKWID_DEF_LOCKFILE
|
||||||
|
m.config.SendAbs = true
|
||||||
|
m.config.SendDiff = true
|
||||||
|
if len(config) > 0 {
|
||||||
|
err = json.Unmarshal(config, &m.config)
|
||||||
|
if err != nil {
|
||||||
|
cclog.ComponentError(m.name, "Error reading config:", err.Error())
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
}
|
||||||
|
cclog.ComponentDebug(m.name, "Opening ", m.config.LibraryPath)
|
||||||
|
lib := dl.New(m.config.LibraryPath, LIKWID_LIB_DL_FLAGS)
|
||||||
|
if lib == nil {
|
||||||
|
return fmt.Errorf("error instantiating DynamicLibrary for %s", m.config.LibraryPath)
|
||||||
|
}
|
||||||
|
err = lib.Open()
|
||||||
|
if err != nil {
|
||||||
|
return fmt.Errorf("error opening %s: %v", m.config.LibraryPath, err)
|
||||||
|
}
|
||||||
|
cclog.ComponentDebug(m.name, "Init topology ", m.config.AccessMode)
|
||||||
|
ret := C.topology_init()
|
||||||
|
if ret != 0 {
|
||||||
|
return fmt.Errorf("error initializing topology: %d", ret)
|
||||||
|
}
|
||||||
|
cclog.ComponentDebug(m.name, "Setting accessmode ", m.config.AccessMode)
|
||||||
|
switch m.config.AccessMode {
|
||||||
|
case "direct":
|
||||||
|
C.HPMmode(0)
|
||||||
|
case "accessdaemon":
|
||||||
|
if len(m.config.DaemonPath) > 0 {
|
||||||
|
p := os.Getenv("PATH")
|
||||||
|
os.Setenv("PATH", m.config.DaemonPath+":"+p)
|
||||||
|
}
|
||||||
|
C.HPMmode(1)
|
||||||
|
retCode := C.HPMinit()
|
||||||
|
if retCode != 0 {
|
||||||
|
err := fmt.Errorf("C.HPMinit() failed with return code %v", retCode)
|
||||||
|
cclog.ComponentError(m.name, err.Error())
|
||||||
|
}
|
||||||
|
}
|
||||||
|
initCpus := make([]int, 0)
|
||||||
|
ret = C.HPMaddThread(0)
|
||||||
|
if ret != 0 {
|
||||||
|
return fmt.Errorf("error initializing access: %d", ret)
|
||||||
|
}
|
||||||
|
initCpus = append(initCpus, 0)
|
||||||
|
|
||||||
|
cinfo := C.get_cpuInfo()
|
||||||
|
domainnames := make(map[int]string)
|
||||||
|
if cinfo.isIntel == C.int(1) {
|
||||||
|
domainnames[0] = "pkg"
|
||||||
|
domainnames[1] = "pp0"
|
||||||
|
domainnames[2] = "pp1"
|
||||||
|
domainnames[3] = "dram"
|
||||||
|
domainnames[4] = "platform"
|
||||||
|
} else {
|
||||||
|
switch cinfo.family {
|
||||||
|
case 0x17:
|
||||||
|
domainnames[0] = "core"
|
||||||
|
domainnames[1] = "pkg"
|
||||||
|
case 0x19:
|
||||||
|
switch cinfo.model {
|
||||||
|
case 0x01, 0x21, 0x50:
|
||||||
|
domainnames[0] = "core"
|
||||||
|
domainnames[1] = "pkg"
|
||||||
|
case 0x61, 0x11:
|
||||||
|
domainnames[0] = "core"
|
||||||
|
domainnames[1] = "l3"
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Set up everything that the collector requires during the Read() execution
|
||||||
|
// Check files required, test execution of some commands, create data structure
|
||||||
|
// for all topological entities (sockets, NUMA domains, ...)
|
||||||
|
// Return some useful error message in case of any failures
|
||||||
|
cclog.ComponentDebug(m.name, "Initializing Power module")
|
||||||
|
ret = C.power_init(0)
|
||||||
|
if ret == C.int(0) {
|
||||||
|
cclog.ComponentPrint(m.name, "No RAPL support")
|
||||||
|
}
|
||||||
|
m.domains = make(map[int]LikwidEnergyDomain)
|
||||||
|
Pinfo := C.get_powerInfo()
|
||||||
|
for i := 0; i < int(Pinfo.numDomains); i++ {
|
||||||
|
d := Pinfo.domains[C.int(i)]
|
||||||
|
name := domainnames[int(d._type)]
|
||||||
|
domain := LikwidEnergyDomain{
|
||||||
|
values: make(map[int]LikwidEnergyDomainEntry),
|
||||||
|
metricname: fmt.Sprintf("likwidenergy_%s", strings.ToLower(name)),
|
||||||
|
granularity: "socket",
|
||||||
|
domaintype: int(d._type),
|
||||||
|
energyUnit: float64(C.power_getEnergyUnit(C.int(d._type))),
|
||||||
|
}
|
||||||
|
if name == "core" {
|
||||||
|
domain.granularity = "core"
|
||||||
|
}
|
||||||
|
|
||||||
|
for _, c := range topo.GetTypeList(domain.granularity) {
|
||||||
|
clist := topo.GetSocketHwthreads(c)
|
||||||
|
if len(clist) > 0 {
|
||||||
|
var cur C.PowerData
|
||||||
|
if _, ok := intArrayContains(initCpus, clist[0]); !ok {
|
||||||
|
initCpus = append(initCpus, clist[0])
|
||||||
|
C.HPMaddThread(C.int(clist[0]))
|
||||||
|
}
|
||||||
|
cclog.ComponentDebug(m.name, "Reading current value on CPU ", clist[0], " for ", domain.metricname, "on", domain.granularity, c)
|
||||||
|
ret = C.power_start(&cur, C.int(clist[0]), C.PowerType(domain.domaintype))
|
||||||
|
cclog.ComponentDebug(m.name, "Reading ", uint64(cur.before))
|
||||||
|
if ret == 0 {
|
||||||
|
domain.values[c] = LikwidEnergyDomainEntry{
|
||||||
|
readcpu: clist[0],
|
||||||
|
value: uint32(cur.before),
|
||||||
|
total: uint64(cur.before),
|
||||||
|
tags: map[string]string{
|
||||||
|
"type": domain.granularity,
|
||||||
|
"type-id": fmt.Sprintf("%d", c),
|
||||||
|
},
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
cclog.ComponentDebug(m.name, "Adding domain ", domain.metricname, " with granularity ", domain.granularity)
|
||||||
|
m.domains[domain.domaintype] = domain
|
||||||
|
}
|
||||||
|
|
||||||
|
// Set this flag only if everything is initialized properly, all required files exist, ...
|
||||||
|
m.init = true
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
|
||||||
|
// Read collects all metrics belonging to the sample collector
|
||||||
|
// and sends them through the output channel to the collector manager
|
||||||
|
func (m *LikwidEnergyCollector) Read(interval time.Duration, output chan lp.CCMetric) {
|
||||||
|
// Create a sample metric
|
||||||
|
timestamp := time.Now()
|
||||||
|
|
||||||
|
for dt, domain := range m.domains {
|
||||||
|
for i, entry := range domain.values {
|
||||||
|
var cur C.PowerData
|
||||||
|
ret := C.power_start(&cur, C.int(entry.readcpu), C.PowerType(dt))
|
||||||
|
if ret == 0 {
|
||||||
|
now := uint32(cur.before)
|
||||||
|
diff := now - entry.value
|
||||||
|
if now < entry.value {
|
||||||
|
diff = (^uint32(0)) - entry.value
|
||||||
|
diff += now
|
||||||
|
}
|
||||||
|
if m.config.SendDiff {
|
||||||
|
y, err := lp.New(domain.metricname, entry.tags, m.meta, map[string]interface{}{"value": float64(diff) * domain.energyUnit}, timestamp)
|
||||||
|
if err == nil {
|
||||||
|
for k, v := range m.tags {
|
||||||
|
y.AddTag(k, v)
|
||||||
|
}
|
||||||
|
// Send it to output channel
|
||||||
|
output <- y
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if m.config.SendAbs {
|
||||||
|
total := float64(entry.total + uint64(diff))
|
||||||
|
|
||||||
|
y, err := lp.New(fmt.Sprintf("%s_abs", domain.metricname), entry.tags, m.meta, map[string]interface{}{"value": total * domain.energyUnit}, timestamp)
|
||||||
|
if err == nil {
|
||||||
|
for k, v := range m.tags {
|
||||||
|
y.AddTag(k, v)
|
||||||
|
}
|
||||||
|
// Send it to output channel
|
||||||
|
output <- y
|
||||||
|
}
|
||||||
|
}
|
||||||
|
entry.value = uint32(cur.before)
|
||||||
|
entry.total += uint64(diff)
|
||||||
|
domain.values[i] = entry
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
// Close metric collector: close network connection, close files, close libraries, ...
|
||||||
|
// Called once by the collector manager
|
||||||
|
func (m *LikwidEnergyCollector) Close() {
|
||||||
|
C.power_finalize()
|
||||||
|
C.topology_finalize()
|
||||||
|
// Unset flag
|
||||||
|
m.init = false
|
||||||
|
}
|
20
collectors/likwidenergyMetric.md
Normal file
20
collectors/likwidenergyMetric.md
Normal file
@ -0,0 +1,20 @@
|
|||||||
|
## `likwidenergy` collector
|
||||||
|
|
||||||
|
In contrast to the more general [`likwid` collector](./likwidMetric.md), this collector just reads the RAPL counters to provide energy metrics. In contrast to the `likwid` collector, this collector keeps the energy counters running the whole time and not just of a measurement interval. It covers all RAPL domains (`PKG`, `DRAM`, `PP0`, `PP1`, ...). Depending whether the domain is per socket, per L3 segment or per core, metrics are read and send.
|
||||||
|
|
||||||
|
```json
|
||||||
|
{
|
||||||
|
"likwidenergy" : {
|
||||||
|
"liblikwid_path" : "/path/to/liblikwid.so",
|
||||||
|
"accessdaemon_path" : "/folder/that/contains/likwid-accessD",
|
||||||
|
"access_mode" : "direct or accessdaemon",
|
||||||
|
"send_difference": true,
|
||||||
|
"send_absolute": true
|
||||||
|
}
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
The first three entries (`liblikwid_path`, `accessdaemon_path` and `access_mode`) are required to set up the access to the RAPL counters. The `access_mode` = `perf_event` is not supported at the moment.
|
||||||
|
|
||||||
|
With `send_differences` the difference to the last measurement is provided to the system. With `send_absolute`, the absolute value since start of the system is submitted as metric. It reads the counter at initialization and then updates the value after each measurement.
|
||||||
|
|
Loading…
Reference in New Issue
Block a user