From 92e45ca62cc0bff663a2b0dda86da3770b514f7a Mon Sep 17 00:00:00 2001
From: Holger Obermaier <40787752+ho-ob@users.noreply.github.com>
Date: Tue, 15 Nov 2022 17:09:26 +0100
Subject: [PATCH] Add running average power limit (RAPL) metric collector

---
 collectors/raplMetric.go | 277 +++++++++++++++++++++++++++++++++++++++
 collectors/raplMetric.md |  18 +++
 2 files changed, 295 insertions(+)
 create mode 100644 collectors/raplMetric.go
 create mode 100644 collectors/raplMetric.md

diff --git a/collectors/raplMetric.go b/collectors/raplMetric.go
new file mode 100644
index 0000000..529dc87
--- /dev/null
+++ b/collectors/raplMetric.go
@@ -0,0 +1,277 @@
+package collectors
+
+import (
+	"encoding/json"
+	"fmt"
+	"os"
+	"path/filepath"
+	"strconv"
+	"strings"
+	"time"
+
+	cclog "github.com/ClusterCockpit/cc-metric-collector/pkg/ccLogger"
+	lp "github.com/ClusterCockpit/cc-metric-collector/pkg/ccMetric"
+)
+
+// RAPLZoneInfo holds the running average power limit (RAPL) monitoring attributes for a zone
+type RAPLZoneInfo struct {
+	// tags describing the RAPL zone:
+	// * id: e.g. 0 (zone 0) or 0:1 (zone 0 sub zone 1)
+	// * zone_name, sub_zone_name: e.g. psys, dram, core, uncore, package-0
+	tags            map[string]string
+	energyFilepath  string    // path to a file containing the zones current energy counter in micro joules
+	energy          int64     // last reading of the energy counter in micro joules
+	energyTimestamp time.Time // timestamp when energy counter was last read
+	maxEnergyRange  int64     // range of the above energy counter in micro joules
+}
+
+// RAPLCollector computes average power metrics from the energy counters of RAPL zones
+type RAPLCollector struct {
+	metricCollector
+	config struct {
+		// Exclude IDs for RAPL zones, e.g.
+		// * 0 for zone 0
+		// * 0:1 for zone 0 subzone 1
+		ExcludeByID  []string `json:"exclude_device_by_id,omitempty"`
+		isIDExcluded map[string]bool
+		// Exclude names for RAPL zones, e.g. psys, dram, core, uncore, package-0
+		ExcludeByName  []string `json:"exclude_device_by_name,omitempty"`
+		isNameExcluded map[string]bool
+	}
+	RAPLZoneInfo []RAPLZoneInfo
+	meta         map[string]string // default meta information
+}
+
+// readRAPLInt64 reads the file given by path and parses its content
+// (surrounding white space removed) as a decimal 64 bit integer
+func readRAPLInt64(path string) (int64, error) {
+	v, err := os.ReadFile(path)
+	if err != nil {
+		return 0, err
+	}
+	i, err := strconv.ParseInt(strings.TrimSpace(string(v)), 10, 64)
+	if err != nil {
+		return 0, fmt.Errorf("parsing content of %s: %w", path, err)
+	}
+	return i, nil
+}
+
+// Init initializes the running average power limit (RAPL) collector
+//
+// It parses the JSON configuration (exclude lists), searches for RAPL zones and
+// sub zones and stores an initial energy counter reading for each zone which is
+// not excluded. An error is returned if the configuration cannot be parsed or
+// if no usable RAPL zone was found.
+func (m *RAPLCollector) Init(config json.RawMessage) error {
+
+	// Release resources only needed in Init()
+	defer func() {
+		m.config.ExcludeByID = nil
+		m.config.isIDExcluded = nil
+		m.config.ExcludeByName = nil
+		m.config.isNameExcluded = nil
+	}()
+
+	// Check if already initialized
+	if m.init {
+		return nil
+	}
+
+	m.name = "RAPLCollector"
+	m.setup()
+	m.parallel = true
+	m.meta = map[string]string{
+		"source": m.name,
+		"group":  "energy",
+		"unit":   "Watt",
+	}
+
+	// Drop zones left over from a previous Init() / Close() cycle.
+	// Otherwise each zone would be added, and later reported, twice
+	m.RAPLZoneInfo = nil
+
+	// Read in the JSON configuration
+	if len(config) > 0 {
+		if err := json.Unmarshal(config, &m.config); err != nil {
+			cclog.ComponentError(m.name, "Error reading config:", err.Error())
+			return err
+		}
+	}
+
+	// Configure excluded RAPL zones
+	m.config.isIDExcluded = make(map[string]bool, len(m.config.ExcludeByID))
+	for _, ID := range m.config.ExcludeByID {
+		m.config.isIDExcluded[ID] = true
+	}
+	m.config.isNameExcluded = make(map[string]bool, len(m.config.ExcludeByName))
+	for _, name := range m.config.ExcludeByName {
+		m.config.isNameExcluded[name] = true
+	}
+
+	// readZoneInfo reads RAPL monitoring attributes for a zone given by zonePath
+	// See: https://www.kernel.org/doc/html/latest/power/powercap/powercap.html#monitoring-attributes
+	//
+	// Returned are the zones name (e.g. psys, dram, core, uncore, package-0),
+	// the zones energy counter state (tags are left unset) and a flag telling
+	// whether all information is available
+	readZoneInfo := func(zonePath string) (name string, z RAPLZoneInfo, ok bool) {
+		ok = true
+
+		// zones name e.g. psys, dram, core, uncore, package-0
+		if v, err := os.ReadFile(filepath.Join(zonePath, "name")); err == nil {
+			name = strings.TrimSpace(string(v))
+		} else {
+			ok = false
+		}
+
+		// path to a file containing the zones current energy counter in micro joules
+		z.energyFilepath = filepath.Join(zonePath, "energy_uj")
+
+		// current reading of the energy counter in micro joules
+		if v, err := readRAPLInt64(z.energyFilepath); err == nil {
+			z.energy = v
+			// timestamp when energy counter was read
+			z.energyTimestamp = time.Now()
+		} else {
+			ok = false
+		}
+
+		// range of the above energy counter in micro joules
+		if v, err := readRAPLInt64(filepath.Join(zonePath, "max_energy_range_uj")); err == nil {
+			z.maxEnergyRange = v
+		} else {
+			ok = false
+		}
+
+		return name, z, ok
+	}
+
+	powerCapPrefix := "/sys/devices/virtual/powercap"
+	controlType := "intel-rapl"
+	controlTypePath := filepath.Join(powerCapPrefix, controlType)
+
+	// Find all RAPL zones
+	zonePrefix := filepath.Join(controlTypePath, controlType+":")
+	zonesPath, err := filepath.Glob(zonePrefix + "*")
+	if err != nil || len(zonesPath) == 0 {
+		return fmt.Errorf("unable to find any zones under %s", controlTypePath)
+	}
+
+	for _, zonePath := range zonesPath {
+		zoneID := strings.TrimPrefix(zonePath, zonePrefix)
+		zoneName, zone, zoneOK := readZoneInfo(zonePath)
+		if zoneOK &&
+			!m.config.isIDExcluded[zoneID] &&
+			!m.config.isNameExcluded[zoneName] {
+
+			// Add RAPL monitoring attributes for a zone
+			zone.tags = map[string]string{
+				"id":        zoneID,
+				"zone_name": zoneName,
+			}
+			m.RAPLZoneInfo = append(m.RAPLZoneInfo, zone)
+		}
+
+		// find all sub zones for the given zone
+		subZonePrefix := filepath.Join(zonePath, controlType+":"+zoneID+":")
+		subZonesPath, err := filepath.Glob(subZonePrefix + "*")
+		if err != nil {
+			continue
+		}
+
+		for _, subZonePath := range subZonesPath {
+			subZoneID := zoneID + ":" + strings.TrimPrefix(subZonePath, subZonePrefix)
+			subZoneName, subZone, subZoneOK := readZoneInfo(subZonePath)
+			if len(zoneID) > 0 && len(zoneName) > 0 &&
+				subZoneOK &&
+				!m.config.isIDExcluded[subZoneID] &&
+				!m.config.isNameExcluded[subZoneName] {
+
+				// Add RAPL monitoring attributes for a sub zone
+				subZone.tags = map[string]string{
+					"id":            subZoneID,
+					"zone_name":     zoneName,
+					"sub_zone_name": subZoneName,
+				}
+				m.RAPLZoneInfo = append(m.RAPLZoneInfo, subZone)
+			}
+		}
+	}
+
+	if len(m.RAPLZoneInfo) == 0 {
+		return fmt.Errorf("no running average power limit (RAPL) device found in %s", controlTypePath)
+	}
+
+	// Initialized
+	cclog.ComponentDebug(
+		m.name,
+		"initialized",
+		len(m.RAPLZoneInfo),
+		"zones with running average power limit (RAPL) monitoring attributes")
+	m.init = true
+
+	return nil
+}
+
+// Read reads running average power limit (RAPL) monitoring attributes for all initialized zones
+// and sends one rapl_average_power metric per zone to output
+// See: https://www.kernel.org/doc/html/latest/power/powercap/powercap.html#monitoring-attributes
+func (m *RAPLCollector) Read(interval time.Duration, output chan lp.CCMetric) {
+
+	// Nothing to read before Init() succeeded or after Close()
+	if !m.init {
+		return
+	}
+
+	for i := range m.RAPLZoneInfo {
+		p := &m.RAPLZoneInfo[i]
+
+		// Read current value of the energy counter in micro joules
+		energy, err := readRAPLInt64(p.energyFilepath)
+		if err != nil {
+			cclog.ComponentError(m.name, "Read(): failed to read energy counter:", err.Error())
+			continue
+		}
+		energyTimestamp := time.Now()
+
+		// Skip the sample if no time has passed since the last reading,
+		// dividing by a zero time difference would yield +Inf or NaN.
+		// The saved counter state is kept, so no energy gets lost
+		timeDiff := energyTimestamp.Sub(p.energyTimestamp)
+		if timeDiff <= 0 {
+			continue
+		}
+
+		// Compute average power (Δ energy / Δ time)
+		energyDiff := energy - p.energy
+		if energyDiff < 0 {
+			// Handle overflow:
+			// ( p.maxEnergyRange - p.energy ) + energy
+			// = p.maxEnergyRange + ( energy - p.energy )
+			// = p.maxEnergyRange + energyDiff
+			energyDiff += p.maxEnergyRange
+		}
+		// micro joules * 1e-6 / seconds = Watt
+		averagePower := float64(energyDiff) * 1e-6 / timeDiff.Seconds()
+
+		y, err := lp.New(
+			"rapl_average_power",
+			p.tags,
+			m.meta,
+			map[string]interface{}{"value": averagePower},
+			energyTimestamp)
+		if err == nil {
+			output <- y
+		}
+
+		// Save current energy counter state
+		p.energy = energy
+		p.energyTimestamp = energyTimestamp
+	}
+}
+
+// Close closes running average power limit (RAPL) metric collector
+func (m *RAPLCollector) Close() {
+	// Unset flag
+	m.init = false
+}
diff --git a/collectors/raplMetric.md b/collectors/raplMetric.md
new file mode 100644
index 0000000..f857d7c
--- /dev/null
+++ b/collectors/raplMetric.md
@@ -0,0 +1,18 @@
+# Running average power limit (RAPL) metric collector
+
+This collector reads running average power limit (RAPL) monitoring attributes to compute average power consumption metrics. See <https://www.kernel.org/doc/html/latest/power/powercap/powercap.html#monitoring-attributes>.
+
+The Likwid metric collector provides similar functionality.
+
+## Configuration
+
+```json
+  "rapl": {
+    "exclude_device_by_id": ["0:1", "0:2"],
+    "exclude_device_by_name": ["psys"]
+  }
+```
+
+## Metrics
+
+* `rapl_average_power`: average power consumption in Watt. The average is computed over the time elapsed between the previous measurement and the current measurement