mirror of
https://github.com/ClusterCockpit/cc-metric-collector.git
synced 2025-10-24 06:45:07 +02:00
Add running average power limit (RAPL) metric collector
This commit is contained in:
272
collectors/raplMetric.go
Normal file
272
collectors/raplMetric.go
Normal file
@@ -0,0 +1,272 @@
|
|||||||
|
package collectors
|
||||||
|
|
||||||
|
import (
|
||||||
|
"encoding/json"
|
||||||
|
"fmt"
|
||||||
|
"os"
|
||||||
|
"path/filepath"
|
||||||
|
"strconv"
|
||||||
|
"strings"
|
||||||
|
"time"
|
||||||
|
|
||||||
|
cclog "github.com/ClusterCockpit/cc-metric-collector/pkg/ccLogger"
|
||||||
|
lp "github.com/ClusterCockpit/cc-metric-collector/pkg/ccMetric"
|
||||||
|
)
|
||||||
|
|
||||||
|
// RAPLZoneInfo holds the running average power limit (RAPL) monitoring
// attributes of a single zone or sub zone.
type RAPLZoneInfo struct {
	// tags describing the RAPL zone, attached to every metric of the zone:
	//   * id: zone ID, e.g. 0 (zone 0) or 0:1 (zone 0 sub zone 1)
	//   * zone_name: name of the zone, e.g. psys, package-0
	//   * sub_zone_name: name of the sub zone (only set for sub zones),
	//     e.g. dram, core, uncore
	tags map[string]string

	// path to a file containing the zones current energy counter in micro joules
	energyFilepath string

	// last reading of the energy counter in micro joules
	energy int64

	// timestamp when the energy counter was last read
	energyTimestamp time.Time

	// range of the above energy counter in micro joules,
	// used to compensate for a counter wrap around between two readings
	maxEnergyRange int64
}
|
||||||
|
|
||||||
|
type RAPLCollector struct {
|
||||||
|
metricCollector
|
||||||
|
config struct {
|
||||||
|
// Exclude IDs for RAPL zones, e.g.
|
||||||
|
// * 0 for zone 0
|
||||||
|
// * 0:1 for zone 0 subzone 1
|
||||||
|
ExcludeByID []string `json:"exclude_device_by_id,omitempty"`
|
||||||
|
isIDExcluded map[string]bool
|
||||||
|
// Exclude names for RAPL zones, e.g. psys, dram, core, uncore, package-0
|
||||||
|
ExcludeByName []string `json:"exclude_device_by_name,omitempty"`
|
||||||
|
isNameExcluded map[string]bool
|
||||||
|
}
|
||||||
|
RAPLZoneInfo []RAPLZoneInfo
|
||||||
|
meta map[string]string // default meta information
|
||||||
|
}
|
||||||
|
|
||||||
|
// Init initializes the running average power limit (RAPL) collector
|
||||||
|
func (m *RAPLCollector) Init(config json.RawMessage) error {
|
||||||
|
|
||||||
|
// Release resources only needed in Init()
|
||||||
|
defer func() {
|
||||||
|
m.config.ExcludeByID = nil
|
||||||
|
m.config.isIDExcluded = nil
|
||||||
|
m.config.ExcludeByName = nil
|
||||||
|
m.config.isNameExcluded = nil
|
||||||
|
}()
|
||||||
|
|
||||||
|
// Check if already initialized
|
||||||
|
if m.init {
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
|
||||||
|
var err error = nil
|
||||||
|
m.name = "RAPLCollector"
|
||||||
|
m.setup()
|
||||||
|
m.parallel = true
|
||||||
|
m.meta = map[string]string{
|
||||||
|
"source": m.name,
|
||||||
|
"group": "energy",
|
||||||
|
"unit": "Watt",
|
||||||
|
}
|
||||||
|
|
||||||
|
// Read in the JSON configuration
|
||||||
|
if len(config) > 0 {
|
||||||
|
err = json.Unmarshal(config, &m.config)
|
||||||
|
if err != nil {
|
||||||
|
cclog.ComponentError(m.name, "Error reading config:", err.Error())
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Configure excluded RAPL zones
|
||||||
|
m.config.isIDExcluded = make(map[string]bool)
|
||||||
|
if m.config.ExcludeByID != nil {
|
||||||
|
for _, ID := range m.config.ExcludeByID {
|
||||||
|
m.config.isIDExcluded[ID] = true
|
||||||
|
}
|
||||||
|
}
|
||||||
|
m.config.isNameExcluded = make(map[string]bool)
|
||||||
|
if m.config.ExcludeByName != nil {
|
||||||
|
for _, name := range m.config.ExcludeByName {
|
||||||
|
m.config.isNameExcluded[name] = true
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// readZoneInfo reads RAPL monitoring attributes for a zone given by zonePath
|
||||||
|
// See: https://www.kernel.org/doc/html/latest/power/powercap/powercap.html#monitoring-attributes
|
||||||
|
readZoneInfo := func(zonePath string) (z struct {
|
||||||
|
name string // zones name e.g. psys, dram, core, uncore, package-0
|
||||||
|
energyFilepath string // path to a file containing the zones current energy counter in micro joules
|
||||||
|
energy int64 // current reading of the energy counter in micro joules
|
||||||
|
energyTimestamp time.Time // timestamp when energy counter was read
|
||||||
|
maxEnergyRange int64 // Range of the above energy counter in micro-joules
|
||||||
|
ok bool // Are all information available?
|
||||||
|
}) {
|
||||||
|
// zones name e.g. psys, dram, core, uncore, package-0
|
||||||
|
foundName := false
|
||||||
|
if v, err :=
|
||||||
|
os.ReadFile(
|
||||||
|
filepath.Join(zonePath, "name")); err == nil {
|
||||||
|
foundName = true
|
||||||
|
z.name = strings.TrimSpace(string(v))
|
||||||
|
}
|
||||||
|
|
||||||
|
// path to a file containing the zones current energy counter in micro joules
|
||||||
|
z.energyFilepath = filepath.Join(zonePath, "energy_uj")
|
||||||
|
|
||||||
|
// current reading of the energy counter in micro joules
|
||||||
|
foundEnergy := false
|
||||||
|
if v, err := os.ReadFile(z.energyFilepath); err == nil {
|
||||||
|
// timestamp when energy counter was read
|
||||||
|
z.energyTimestamp = time.Now()
|
||||||
|
if i, err := strconv.ParseInt(strings.TrimSpace(string(v)), 10, 64); err == nil {
|
||||||
|
foundEnergy = true
|
||||||
|
z.energy = i
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Range of the above energy counter in micro-joules
|
||||||
|
foundMaxEnergyRange := false
|
||||||
|
if v, err :=
|
||||||
|
os.ReadFile(
|
||||||
|
filepath.Join(zonePath, "max_energy_range_uj")); err == nil {
|
||||||
|
if i, err := strconv.ParseInt(strings.TrimSpace(string(v)), 10, 64); err == nil {
|
||||||
|
foundMaxEnergyRange = true
|
||||||
|
z.maxEnergyRange = i
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Are all information available?
|
||||||
|
z.ok = foundName && foundEnergy && foundMaxEnergyRange
|
||||||
|
|
||||||
|
return
|
||||||
|
}
|
||||||
|
|
||||||
|
powerCapPrefix := "/sys/devices/virtual/powercap"
|
||||||
|
controlType := "intel-rapl"
|
||||||
|
controlTypePath := filepath.Join(powerCapPrefix, controlType)
|
||||||
|
|
||||||
|
// Find all RAPL zones
|
||||||
|
zonePrefix := filepath.Join(controlTypePath, controlType+":")
|
||||||
|
zonesPath, err := filepath.Glob(zonePrefix + "*")
|
||||||
|
if err != nil || zonesPath == nil {
|
||||||
|
return fmt.Errorf("unable to find any zones under %s", controlTypePath)
|
||||||
|
}
|
||||||
|
|
||||||
|
for _, zonePath := range zonesPath {
|
||||||
|
zoneID := strings.TrimPrefix(zonePath, zonePrefix)
|
||||||
|
z := readZoneInfo(zonePath)
|
||||||
|
if z.ok &&
|
||||||
|
!m.config.isIDExcluded[zoneID] &&
|
||||||
|
!m.config.isNameExcluded[z.name] {
|
||||||
|
|
||||||
|
// Add RAPL monitoring attributes for a zone
|
||||||
|
m.RAPLZoneInfo =
|
||||||
|
append(
|
||||||
|
m.RAPLZoneInfo,
|
||||||
|
RAPLZoneInfo{
|
||||||
|
tags: map[string]string{
|
||||||
|
"id": zoneID,
|
||||||
|
"zone_name": z.name,
|
||||||
|
},
|
||||||
|
energyFilepath: z.energyFilepath,
|
||||||
|
energy: z.energy,
|
||||||
|
energyTimestamp: z.energyTimestamp,
|
||||||
|
maxEnergyRange: z.maxEnergyRange,
|
||||||
|
})
|
||||||
|
}
|
||||||
|
|
||||||
|
// find all sub zones for the given zone
|
||||||
|
subZonePrefix := filepath.Join(zonePath, controlType+":"+zoneID+":")
|
||||||
|
subZonesPath, err := filepath.Glob(subZonePrefix + "*")
|
||||||
|
if err != nil || subZonesPath == nil {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
|
||||||
|
for _, subZonePath := range subZonesPath {
|
||||||
|
subZoneID := strings.TrimPrefix(subZonePath, subZonePrefix)
|
||||||
|
sz := readZoneInfo(subZonePath)
|
||||||
|
if len(zoneID) > 0 && len(z.name) > 0 &&
|
||||||
|
sz.ok &&
|
||||||
|
!m.config.isIDExcluded[zoneID+":"+subZoneID] &&
|
||||||
|
!m.config.isNameExcluded[sz.name] {
|
||||||
|
m.RAPLZoneInfo =
|
||||||
|
append(
|
||||||
|
m.RAPLZoneInfo,
|
||||||
|
RAPLZoneInfo{
|
||||||
|
tags: map[string]string{
|
||||||
|
"id": zoneID + ":" + subZoneID,
|
||||||
|
"zone_name": z.name,
|
||||||
|
"sub_zone_name": sz.name,
|
||||||
|
},
|
||||||
|
energyFilepath: sz.energyFilepath,
|
||||||
|
energy: sz.energy,
|
||||||
|
energyTimestamp: sz.energyTimestamp,
|
||||||
|
maxEnergyRange: sz.maxEnergyRange,
|
||||||
|
})
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if m.RAPLZoneInfo == nil {
|
||||||
|
return fmt.Errorf("no running average power limit (RAPL) device found in %s", controlTypePath)
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
// Initialized
|
||||||
|
cclog.ComponentDebug(
|
||||||
|
m.name,
|
||||||
|
"initialized",
|
||||||
|
len(m.RAPLZoneInfo),
|
||||||
|
"zones with running average power limit (RAPL) monitoring attributes")
|
||||||
|
m.init = true
|
||||||
|
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
|
||||||
|
// Read reads running average power limit (RAPL) monitoring attributes for all initialized zones
|
||||||
|
// See: https://www.kernel.org/doc/html/latest/power/powercap/powercap.html#monitoring-attributes
|
||||||
|
func (m *RAPLCollector) Read(interval time.Duration, output chan lp.CCMetric) {
|
||||||
|
|
||||||
|
for i := range m.RAPLZoneInfo {
|
||||||
|
p := &m.RAPLZoneInfo[i]
|
||||||
|
|
||||||
|
// Read current value of the energy counter in micro joules
|
||||||
|
if v, err := os.ReadFile(p.energyFilepath); err == nil {
|
||||||
|
energyTimestamp := time.Now()
|
||||||
|
if i, err := strconv.ParseInt(strings.TrimSpace(string(v)), 10, 64); err == nil {
|
||||||
|
energy := i
|
||||||
|
|
||||||
|
// Compute average power (Δ energy / Δ time)
|
||||||
|
energyDiff := energy - p.energy
|
||||||
|
if energyDiff < 0 {
|
||||||
|
// Handle overflow:
|
||||||
|
// ( p.maxEnergyRange - p.energy ) + energy
|
||||||
|
// = p.maxEnergyRange + ( energy - p.energy )
|
||||||
|
// = p.maxEnergyRange + diffEnergy
|
||||||
|
energyDiff += p.maxEnergyRange
|
||||||
|
}
|
||||||
|
timeDiff := energyTimestamp.Sub(p.energyTimestamp)
|
||||||
|
averagePower := float64(energyDiff) / float64(timeDiff.Microseconds())
|
||||||
|
|
||||||
|
y, err := lp.New(
|
||||||
|
"rapl_average_power",
|
||||||
|
p.tags,
|
||||||
|
m.meta,
|
||||||
|
map[string]interface{}{"value": averagePower},
|
||||||
|
energyTimestamp)
|
||||||
|
if err == nil {
|
||||||
|
output <- y
|
||||||
|
}
|
||||||
|
|
||||||
|
// Save current energy counter state
|
||||||
|
p.energy = energy
|
||||||
|
p.energyTimestamp = energyTimestamp
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Close closes running average power limit (RAPL) metric collector
|
||||||
|
func (m *RAPLCollector) Close() {
|
||||||
|
// Unset flag
|
||||||
|
m.init = false
|
||||||
|
}
|
18
collectors/raplMetric.md
Normal file
18
collectors/raplMetric.md
Normal file
@@ -0,0 +1,18 @@
|
|||||||
|
# Running average power limit (RAPL) metric collector
|
||||||
|
|
||||||
|
This collector reads running average power limit (RAPL) monitoring attributes to compute average power consumption metrics. See <https://www.kernel.org/doc/html/latest/power/powercap/powercap.html#monitoring-attributes>.
|
||||||
|
|
||||||
|
The Likwid metric collector provides similar functionality.
|
||||||
|
|
||||||
|
## Configuration
|
||||||
|
|
||||||
|
```json
|
||||||
|
"rapl": {
|
||||||
|
"exclude_device_by_id": ["0:1", "0:2"],
|
||||||
|
"exclude_device_by_name": ["psys"]
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
## Metrics
|
||||||
|
|
||||||
|
* `rapl_average_power`: average power consumption in Watt. The average is computed over the time interval between the previous measurement and the current measurement.
|
Reference in New Issue
Block a user