From 77fceb17d1d3e99bbabf98ce7d8bfa248507afe6 Mon Sep 17 00:00:00 2001 From: Thomas Roehl Date: Fri, 4 Apr 2025 12:04:27 +0200 Subject: [PATCH] Update rapl collector with powercap limits --- collectors/raplMetric.go | 319 +++++++++++++++++++++++++++------------ collectors/raplMetric.md | 18 ++- 2 files changed, 233 insertions(+), 104 deletions(-) diff --git a/collectors/raplMetric.go b/collectors/raplMetric.go index 7800288..08d73da 100644 --- a/collectors/raplMetric.go +++ b/collectors/raplMetric.go @@ -9,20 +9,29 @@ import ( "strings" "time" - cclog "github.com/ClusterCockpit/cc-metric-collector/pkg/ccLogger" lp "github.com/ClusterCockpit/cc-energy-manager/pkg/cc-message" + cclog "github.com/ClusterCockpit/cc-metric-collector/pkg/ccLogger" ) // running average power limit (RAPL) monitoring attributes for a zone +// Only for Intel systems + type RAPLZoneInfo struct { + energy int64 // current reading of the energy counter in micro joules + maxEnergyRange int64 // Range of the above energy counter in micro-joules + energyTimestamp time.Time // timestamp when energy counter was read + energyFilepath string // path to a file containing the zones current energy counter in micro joules + shortTermFilepath string // path to short term power limit + longTermFilepath string // path to long term power limit + enabledFilepath string // path to check whether limits are enabled + name string + // tags describing the RAPL zone: // * zone_name, subzone_name: e.g. psys, dram, core, uncore, package-0 // * zone_id: e.g. 
0:1 (zone 0 sub zone 1) - tags map[string]string - energyFilepath string // path to a file containing the zones current energy counter in micro joules - energy int64 // current reading of the energy counter in micro joules - energyTimestamp time.Time // timestamp when energy counter was read - maxEnergyRange int64 // Range of the above energy counter in micro-joules + // type=socket for dram, core, uncore, package-* and type=node for psys + // type-id=socket id + tags map[string]string } type RAPLCollector struct { @@ -33,12 +42,40 @@ type RAPLCollector struct { // * 0:1 for zone 0 subzone 1 ExcludeByID []string `json:"exclude_device_by_id,omitempty"` // Exclude names for RAPL zones, e.g. psys, dram, core, uncore, package-0 - ExcludeByName []string `json:"exclude_device_by_name,omitempty"` + ExcludeByName []string `json:"exclude_device_by_name,omitempty"` + SkipEnergyReading bool `json:"skip_energy_reading,omitempty"` + SkipLimitsReading bool `json:"skip_limits_reading,omitempty"` + OnlyEnabledLimits bool `json:"only_enabled_limits,omitempty"` } - RAPLZoneInfo []RAPLZoneInfo + raplZoneInfo []RAPLZoneInfo meta map[string]string // default meta information } +// Get the path to the power limit file for zone selectable by limit name +// Common limit names for Intel systems are +// - long_term +// - short_term +// Does not support AMD as AMD systems do not provide the power limits +// through sysfs +func ZoneLimitFile(folder string, limit_name string) string { + nameGlob := filepath.Join(folder, "constraint_*_name") + candidates, err := filepath.Glob(nameGlob) + if err == nil { + for _, c := range candidates { + if v, err := os.ReadFile(c); err == nil { + if strings.TrimSpace(string(v)) == limit_name { + var i int + n, err := fmt.Sscanf(filepath.Base(c), "constraint_%d_name", &i) + if err == nil && n == 1 { + return filepath.Join(folder, fmt.Sprintf("constraint_%d_power_limit_uw", i)) + } + } + } + } + } + return "" +} + // Init initializes the running average power 
limit (RAPL) collector func (m *RAPLCollector) Init(config json.RawMessage) error { @@ -58,6 +95,9 @@ func (m *RAPLCollector) Init(config json.RawMessage) error { } // Read in the JSON configuration + m.config.SkipEnergyReading = false + m.config.SkipLimitsReading = false + m.config.OnlyEnabledLimits = true if len(config) > 0 { err = json.Unmarshal(config, &m.config) if err != nil { @@ -83,50 +123,62 @@ func (m *RAPLCollector) Init(config json.RawMessage) error { // readZoneInfo reads RAPL monitoring attributes for a zone given by zonePath // See: https://www.kernel.org/doc/html/latest/power/powercap/powercap.html#monitoring-attributes readZoneInfo := func(zonePath string) (z struct { - name string // zones name e.g. psys, dram, core, uncore, package-0 - energyFilepath string // path to a file containing the zones current energy counter in micro joules - energy int64 // current reading of the energy counter in micro joules - energyTimestamp time.Time // timestamp when energy counter was read - maxEnergyRange int64 // Range of the above energy counter in micro-joules - ok bool // Are all information available? + name string // zones name e.g. psys, dram, core, uncore, package-0 + energyFilepath string // path to a file containing the zones current energy counter in micro joules + energy int64 // current reading of the energy counter in micro joules + energyTimestamp time.Time // timestamp when energy counter was read + maxEnergyRange int64 // Range of the above energy counter in micro-joules + shortTermFilepath string + longTermFilepath string + enabledFilepath string }) { // zones name e.g. 
psys, dram, core, uncore, package-0 - foundName := false + if v, err := os.ReadFile( filepath.Join(zonePath, "name")); err == nil { - foundName = true z.name = strings.TrimSpace(string(v)) } - // path to a file containing the zones current energy counter in micro joules - z.energyFilepath = filepath.Join(zonePath, "energy_uj") - - // current reading of the energy counter in micro joules - foundEnergy := false - if v, err := os.ReadFile(z.energyFilepath); err == nil { - // timestamp when energy counter was read - z.energyTimestamp = time.Now() - if i, err := strconv.ParseInt(strings.TrimSpace(string(v)), 10, 64); err == nil { - foundEnergy = true - z.energy = i + if os.Geteuid() == 0 && (!m.config.SkipEnergyReading) { + // path to a file containing the zones current energy counter in micro joules + z.energyFilepath = filepath.Join(zonePath, "energy_uj") + // current reading of the energy counter in micro joules + if v, err := os.ReadFile(z.energyFilepath); err == nil { + if i, err := strconv.ParseInt(strings.TrimSpace(string(v)), 10, 64); err == nil { + z.energy = i + // timestamp when energy counter was read + z.energyTimestamp = time.Now() + } + } else { + cclog.ComponentError(m.name, "Cannot read energy file for ", z.name, ":", err.Error()) } + // Range of the above energy counter in micro-joules + if v, err := + os.ReadFile( + filepath.Join(zonePath, "max_energy_range_uj")); err == nil { + if i, err := strconv.ParseInt(strings.TrimSpace(string(v)), 10, 64); err == nil { + z.maxEnergyRange = i + } + } + } else { + cclog.ComponentInfo(m.name, "Energy readings for", zonePath, "disabled") } - // Range of the above energy counter in micro-joules - foundMaxEnergyRange := false - if v, err := - os.ReadFile( - filepath.Join(zonePath, "max_energy_range_uj")); err == nil { - if i, err := strconv.ParseInt(strings.TrimSpace(string(v)), 10, 64); err == nil { - foundMaxEnergyRange = true - z.maxEnergyRange = i + if !m.config.SkipLimitsReading { + z.shortTermFilepath = 
ZoneLimitFile(zonePath, "short_term") + if _, err := os.Stat(z.shortTermFilepath); err != nil { + z.shortTermFilepath = "" } + z.longTermFilepath = ZoneLimitFile(zonePath, "long_term") + if _, err := os.Stat(z.longTermFilepath); err != nil { + z.longTermFilepath = "" + } + z.enabledFilepath = filepath.Join(zonePath, "enabled") + } else { + cclog.ComponentInfo(m.name, "Power limit readings for", zonePath, "disabled") } - // Are all information available? - z.ok = foundName && foundEnergy && foundMaxEnergyRange - return } @@ -143,25 +195,42 @@ func (m *RAPLCollector) Init(config json.RawMessage) error { for _, zonePath := range zonesPath { zoneID := strings.TrimPrefix(zonePath, zonePrefix) + zonetags := make(map[string]string) + z := readZoneInfo(zonePath) - if z.ok && - !isIDExcluded[zoneID] && + if !isIDExcluded[zoneID] && !isNameExcluded[z.name] { + si := RAPLZoneInfo{ + tags: make(map[string]string), + energyFilepath: z.energyFilepath, + energy: z.energy, + energyTimestamp: z.energyTimestamp, + maxEnergyRange: z.maxEnergyRange, + shortTermFilepath: z.shortTermFilepath, + longTermFilepath: z.longTermFilepath, + enabledFilepath: z.enabledFilepath, + name: z.name, + } + si.tags["type"] = "node" + si.tags["type-id"] = "0" + var pid int = 0 + if strings.HasPrefix(z.name, "package-") { + n, err := fmt.Sscanf(z.name, "package-%d", &pid) + if err == nil && n == 1 { + si.tags["type-id"] = fmt.Sprintf("%d", pid) + si.tags["type"] = "socket" + } + si.name = "pkg" + } // Add RAPL monitoring attributes for a zone - m.RAPLZoneInfo = - append( - m.RAPLZoneInfo, - RAPLZoneInfo{ - tags: map[string]string{ - "id": zoneID, - "zone_name": z.name, - }, - energyFilepath: z.energyFilepath, - energy: z.energy, - energyTimestamp: z.energyTimestamp, - maxEnergyRange: z.maxEnergyRange, - }) + if _, ok1 := si.tags["type"]; ok1 { + if _, ok2 := si.tags["type-id"]; ok2 { + m.raplZoneInfo = append(m.raplZoneInfo, si) + zonetags["type"] = si.tags["type"] + zonetags["type-id"] = 
si.tags["type-id"] + } + } } // find all sub zones for the given zone @@ -174,29 +243,32 @@ func (m *RAPLCollector) Init(config json.RawMessage) error { for _, subZonePath := range subZonesPath { subZoneID := strings.TrimPrefix(subZonePath, subZonePrefix) sz := readZoneInfo(subZonePath) + if len(zoneID) > 0 && len(z.name) > 0 && - sz.ok && !isIDExcluded[zoneID+":"+subZoneID] && !isNameExcluded[sz.name] { - m.RAPLZoneInfo = - append( - m.RAPLZoneInfo, - RAPLZoneInfo{ - tags: map[string]string{ - "id": zoneID + ":" + subZoneID, - "zone_name": z.name, - "sub_zone_name": sz.name, - }, - energyFilepath: sz.energyFilepath, - energy: sz.energy, - energyTimestamp: sz.energyTimestamp, - maxEnergyRange: sz.maxEnergyRange, - }) + + si := RAPLZoneInfo{ + tags: zonetags, + energyFilepath: sz.energyFilepath, + energy: sz.energy, + energyTimestamp: sz.energyTimestamp, + maxEnergyRange: sz.maxEnergyRange, + shortTermFilepath: sz.shortTermFilepath, + longTermFilepath: sz.longTermFilepath, + enabledFilepath: sz.enabledFilepath, + name: sz.name, + } + if _, ok1 := si.tags["type"]; ok1 { + if _, ok2 := si.tags["type-id"]; ok2 { + m.raplZoneInfo = append(m.raplZoneInfo, si) + } + } } } } - if m.RAPLZoneInfo == nil { + if m.raplZoneInfo == nil { return fmt.Errorf("no running average power limit (RAPL) device found in %s", controlTypePath) } @@ -205,7 +277,7 @@ func (m *RAPLCollector) Init(config json.RawMessage) error { cclog.ComponentDebug( m.name, "initialized", - len(m.RAPLZoneInfo), + len(m.raplZoneInfo), "zones with running average power limit (RAPL) monitoring attributes") m.init = true @@ -216,40 +288,89 @@ func (m *RAPLCollector) Init(config json.RawMessage) error { // See: https://www.kernel.org/doc/html/latest/power/powercap/powercap.html#monitoring-attributes func (m *RAPLCollector) Read(interval time.Duration, output chan lp.CCMessage) { - for i := range m.RAPLZoneInfo { - p := &m.RAPLZoneInfo[i] + for i := range m.raplZoneInfo { + p := &m.raplZoneInfo[i] - // Read current 
value of the energy counter in micro joules - if v, err := os.ReadFile(p.energyFilepath); err == nil { - energyTimestamp := time.Now() - if i, err := strconv.ParseInt(strings.TrimSpace(string(v)), 10, 64); err == nil { - energy := i + if os.Geteuid() == 0 && (!m.config.SkipEnergyReading) { + // Read current value of the energy counter in micro joules + if v, err := os.ReadFile(p.energyFilepath); err == nil { + energyTimestamp := time.Now() + if i, err := strconv.ParseInt(strings.TrimSpace(string(v)), 10, 64); err == nil { + energy := i - // Compute average power (Δ energy / Δ time) - energyDiff := energy - p.energy - if energyDiff < 0 { - // Handle overflow: - // ( p.maxEnergyRange - p.energy ) + energy - // = p.maxEnergyRange + ( energy - p.energy ) - // = p.maxEnergyRange + diffEnergy - energyDiff += p.maxEnergyRange + // Compute average power (Δ energy / Δ time) + energyDiff := energy - p.energy + if energyDiff < 0 { + // Handle overflow: + // ( p.maxEnergyRange - p.energy ) + energy + // = p.maxEnergyRange + ( energy - p.energy ) + // = p.maxEnergyRange + diffEnergy + energyDiff += p.maxEnergyRange + } + timeDiff := energyTimestamp.Sub(p.energyTimestamp) + averagePower := float64(energyDiff) / float64(timeDiff.Microseconds()) + + y, err := lp.NewMetric( + fmt.Sprintf("rapl_%s_average_power", p.name), + p.tags, + m.meta, + averagePower, + energyTimestamp) + if err == nil { + output <- y + } + + e, err := lp.NewMetric( + fmt.Sprintf("rapl_%s_energy", p.name), + p.tags, + m.meta, + float64(energyDiff)*1e-3, + energyTimestamp) + if err == nil { + e.AddMeta("unit", "Joules") + output <- e + } + + // Save current energy counter state + p.energy = energy + p.energyTimestamp = energyTimestamp } - timeDiff := energyTimestamp.Sub(p.energyTimestamp) - averagePower := float64(energyDiff) / float64(timeDiff.Microseconds()) - - y, err := lp.NewMessage( - "rapl_average_power", - p.tags, - m.meta, - map[string]interface{}{"value": averagePower}, - energyTimestamp) - if err == 
nil { - output <- y + } + } + // https://www.kernel.org/doc/html/latest/power/powercap/powercap.html#constraints + if !m.config.SkipLimitsReading { + skip := false + if m.config.OnlyEnabledLimits { + if v, err := os.ReadFile(p.enabledFilepath); err == nil { + if strings.TrimSpace(string(v)) == "0" { + skip = true + } + } + } + if !skip { + if len(p.shortTermFilepath) > 0 { + if v, err := os.ReadFile(p.shortTermFilepath); err == nil { + if i, err := strconv.ParseInt(strings.TrimSpace(string(v)), 10, 64); err == nil { + name := fmt.Sprintf("rapl_%s_limit_short_term", p.name) + y, err := lp.NewMetric(name, p.tags, m.meta, i/1e6, time.Now()) + if err == nil { + output <- y + } + } + } } - // Save current energy counter state - p.energy = energy - p.energyTimestamp = energyTimestamp + if len(p.longTermFilepath) > 0 { + if v, err := os.ReadFile(p.longTermFilepath); err == nil { + if i, err := strconv.ParseInt(strings.TrimSpace(string(v)), 10, 64); err == nil { + name := fmt.Sprintf("rapl_%s_limit_long_term", p.name) + y, err := lp.NewMetric(name, p.tags, m.meta, i/1e6, time.Now()) + if err == nil { + output <- y + } + } + } + } } } } diff --git a/collectors/raplMetric.md b/collectors/raplMetric.md index 8eb792f..5d9a78d 100644 --- a/collectors/raplMetric.md +++ b/collectors/raplMetric.md @@ -1,15 +1,23 @@ ## `rapl` collector -This collector reads running average power limit (RAPL) monitoring attributes to compute average power consumption metrics. See . - -The Likwid metric collector provides similar functionality. +This collector reads running average power limit (RAPL) monitoring attributes to compute average power consumption metrics. See . ```json "rapl": { "exclude_device_by_id": ["0:1", "0:2"], - "exclude_device_by_name": ["psys"] + "exclude_device_by_name": ["psys"], + "skip_energy_reading": false, + "skip_limits_reading": false, + "only_enabled_limits": true } ``` Metrics: -* `rapl_average_power`: average power consumption in Watt. 
The average is computed over the entire runtime from the last measurement to the current measurement +* `rapl_<domain>_average_power`: average power consumption in Watt. The average is computed over the entire runtime from the last measurement to the current measurement +* `rapl_<domain>_energy`: Energy consumed since the last measurement +* `rapl_<domain>_limit_short_term`: Short term powercap setting for the domain in Watt +* `rapl_<domain>_limit_long_term`: Long term powercap setting for the domain in Watt + +Only the `rapl_<domain>_average_power` and `rapl_<domain>_energy` metrics require root permissions. The limits can be read by an unprivileged user. Some domains have limits available but they are not enabled. By default, only enabled domain limits are collected. + +Energy and power measurements can also be done with the Likwid metric collector. \ No newline at end of file