Files
cc-metric-collector/collectors/infinibandMetric.go
2026-06-08 14:58:29 +02:00

338 lines
9.8 KiB
Go

// Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
// All rights reserved. This file is part of cc-lib.
// Use of this source code is governed by a MIT-style
// license that can be found in the LICENSE file.
// additional authors:
// Holger Obermaier (NHR@KIT)
package collectors
import (
"bytes"
"encoding/json"
"fmt"
"os"
"path/filepath"
"slices"
"strconv"
"strings"
"time"
cclog "github.com/ClusterCockpit/cc-lib/v2/ccLogger"
lp "github.com/ClusterCockpit/cc-lib/v2/ccMessage"
"golang.org/x/sys/unix"
)
// See: https://www.kernel.org/doc/Documentation/ABI/stable/sysfs-class-infiniband
const (
ibBasePath = "/sys/class/infiniband/"
ibDataUnit = "bytes"
ibDataRateUnit = ibDataUnit + "/sec"
ibPkgUnit = "packets"
ibPkgRateUnit = ibPkgUnit + "/sec"
)
type InfinibandCollectorMetric struct {
name string
path string
unit string
unitRates string
scaleByFourLanes bool
addToIBTotal bool
addToIBTotalPkgs bool
lastState uint64
lastStateAvailable bool
}
type InfinibandCollectorInfo struct {
lid string // IB local Identifier (LID)
device string // IB device
port string // IB device port
portCounterFiles []InfinibandCollectorMetric // mapping counter name -> InfinibandCollectorMetric
tagSet map[string]string // corresponding tag list
}
type InfinibandCollector struct {
metricCollector
config struct {
ExcludeDevices []string `json:"exclude_devices,omitempty"` // IB device to exclude e.g. mlx5_0
SendAbsoluteValues bool `json:"send_abs_values"` // Send absolut values as read from sys filesystem
SendTotalValues bool `json:"send_total_values"` // Send computed total values
SendDerivedValues bool `json:"send_derived_values"` // Send derived values e.g. rates
}
info []InfinibandCollectorInfo
lastTimestamp time.Time // Store time stamp of last tick to derive bandwidths
}
// Init initializes the Infiniband collector by walking through files below ibBasePath
func (m *InfinibandCollector) Init(config json.RawMessage) error {
// Check if already initialized
if m.init {
return nil
}
var err error
m.name = "InfinibandCollector"
if err := m.setup(); err != nil {
return fmt.Errorf("%s Init(): setup() call failed: %w", m.name, err)
}
m.parallel = true
m.meta = map[string]string{
"source": m.name,
"group": "Network",
}
// Set default configuration,
m.config.SendAbsoluteValues = true
m.config.SendDerivedValues = false
// Read configuration file, allow overwriting default config
if len(config) > 0 {
d := json.NewDecoder(bytes.NewReader(config))
d.DisallowUnknownFields()
if err := d.Decode(&m.config); err != nil {
return fmt.Errorf("%s Init(): failed to decode JSON config: %w", m.name, err)
}
}
// Loop for all InfiniBand directories
globPattern := filepath.Join(ibBasePath, "*", "ports", "*")
ibDirs, err := filepath.Glob(globPattern)
if err != nil {
return fmt.Errorf("%s Init(): unable to glob files with pattern %s: %w", m.name, globPattern, err)
}
if ibDirs == nil {
return fmt.Errorf("%s Init(): unable to find any directories with pattern %s", m.name, globPattern)
}
for _, path := range ibDirs {
// Skip, when no LID is assigned
line, err := os.ReadFile(filepath.Join(path, "lid"))
if err != nil {
continue
}
LID := strings.TrimSpace(string(line))
if LID == "0x0" {
continue
}
// Get device and port component
pathSplit := strings.Split(path, string(os.PathSeparator))
device := pathSplit[4]
port := pathSplit[6]
// Skip excluded devices
if slices.Contains(m.config.ExcludeDevices, device) {
continue
}
// Check access to counter files
countersDir := filepath.Join(path, "counters")
portCounterFiles := []InfinibandCollectorMetric{
{
// Total number of data octets, divided by 4 (lanes), received on all VLs.
// This is 64 bit counter
name: "ib_recv",
path: filepath.Join(countersDir, "port_rcv_data"),
unit: ibDataUnit,
unitRates: ibDataRateUnit,
scaleByFourLanes: true,
addToIBTotal: true,
},
{
// Total number of data octets, divided by 4 (lanes), transmitted on all VLs.
// This is 64 bit counter
name: "ib_xmit",
path: filepath.Join(countersDir, "port_xmit_data"),
unit: ibDataUnit,
unitRates: ibDataRateUnit,
scaleByFourLanes: true,
addToIBTotal: true,
},
{
// Total number of packets received on all VLs from this port (this may include packets containing Errors.
// This is 64 bit counter.
name: "ib_recv_pkts",
path: filepath.Join(countersDir, "port_rcv_packets"),
unit: ibPkgUnit,
unitRates: ibPkgRateUnit,
addToIBTotalPkgs: true,
},
{
// Total number of packets transmitted on all VLs from this port. This may include packets with errors.
// This is 64 bit counter.
name: "ib_xmit_pkts",
path: filepath.Join(countersDir, "port_xmit_packets"),
unit: ibPkgUnit,
unitRates: ibPkgRateUnit,
addToIBTotalPkgs: true,
},
}
for _, counter := range portCounterFiles {
err := unix.Access(counter.path, unix.R_OK)
if err != nil {
return fmt.Errorf("%s Init(): unable to access %s: %w", m.name, counter.path, err)
}
}
m.info = append(m.info,
InfinibandCollectorInfo{
lid: LID,
device: device,
port: port,
portCounterFiles: portCounterFiles,
tagSet: map[string]string{
"type": "node",
"device": device,
"port": port,
"lid": LID,
},
})
}
if len(m.info) == 0 {
return fmt.Errorf("%s Init(): found no IB devices", m.name)
}
m.init = true
return nil
}
// Read reads Infiniband counter files below ibBasePath
func (m *InfinibandCollector) Read(interval time.Duration, output chan lp.CCMessage) {
// Check if already initialized
if !m.init {
return
}
// Current time stamp
now := time.Now()
// time difference to last time stamp
timeDiff := now.Sub(m.lastTimestamp).Seconds()
// Save current timestamp
m.lastTimestamp = now
for i := range m.info {
info := &m.info[i]
var ibTotal, ibTotalPkts uint64 // sum of xmit and recv counters
var ibTotalBw, ibTotalPktsBw float64 // sum of xmit and recv rates
var ibTotalBwAvailable, ibTotalPktsBwAvailable bool
for i := range info.portCounterFiles {
counterDef := &info.portCounterFiles[i]
// Read counter file
line, err := os.ReadFile(counterDef.path)
if err != nil {
cclog.ComponentErrorf(
m.name,
"Read(): Failed to read from file '%s': %v", counterDef.path, err)
// Current counter can not be saved as last state
counterDef.lastStateAvailable = false
continue
}
data := strings.TrimSpace(string(line))
// convert counter to uint64
vRawCounter, err := strconv.ParseUint(data, 10, 64)
if err != nil {
cclog.ComponentErrorf(
m.name,
"Read(): Failed to convert Infininiband metrice %s='%s' to uint64: %v", counterDef.name, data, err)
// Current counter can not be saved as last state
counterDef.lastStateAvailable = false
continue
}
vScaledCounter := vRawCounter
if counterDef.scaleByFourLanes {
vScaledCounter *= uint64(4)
}
// Send absolut values
if m.config.SendAbsoluteValues {
if y, err := lp.NewMetric(counterDef.name, info.tagSet, m.meta, vScaledCounter, now); err == nil {
y.AddMeta("unit", counterDef.unit)
output <- y
}
}
// Send derived values
if m.config.SendDerivedValues {
if counterDef.lastStateAvailable {
var rate float64
// uint64 subtraction handles wraparound automatically
// in case vRawCounter < counterDef.lastState we would compute:
// math.MaxUint64 - lastState + vRawCounter + 1
// = (2^64 - 1) - lastState + vRawCounter + 1
// = 2^64 - lastState + vRawCounter
// ≡ vRawCounter - lastState (mod 2^64)
rate = float64(vRawCounter-counterDef.lastState) / timeDiff
if counterDef.scaleByFourLanes {
rate *= float64(4)
}
if y, err := lp.NewMetric(counterDef.name+"_bw", info.tagSet, m.meta, rate, now); err == nil {
y.AddMeta("unit", counterDef.unitRates)
output <- y
}
// Sum up rates for total rates
if m.config.SendTotalValues {
switch {
case counterDef.addToIBTotal:
ibTotalBw += rate
ibTotalBwAvailable = true
case counterDef.addToIBTotalPkgs:
ibTotalPktsBw += rate
ibTotalPktsBwAvailable = true
}
}
}
counterDef.lastState = vRawCounter
counterDef.lastStateAvailable = true
}
// Sum up total values
if m.config.SendTotalValues {
switch {
case counterDef.addToIBTotal:
ibTotal += vScaledCounter
case counterDef.addToIBTotalPkgs:
ibTotalPkts += vScaledCounter
}
}
}
// Send total values
if m.config.SendTotalValues {
if y, err := lp.NewMetric("ib_total", info.tagSet, m.meta, ibTotal, now); err == nil {
y.AddMeta("unit", ibDataUnit)
output <- y
}
if y, err := lp.NewMetric("ib_total_pkts", info.tagSet, m.meta, ibTotalPkts, now); err == nil {
y.AddMeta("unit", ibPkgUnit)
output <- y
}
if m.config.SendDerivedValues && ibTotalBwAvailable {
if y, err := lp.NewMetric("ib_total_bw", info.tagSet, m.meta, ibTotalBw, now); err == nil {
y.AddMeta("unit", ibDataRateUnit)
output <- y
}
}
if m.config.SendDerivedValues && ibTotalPktsBwAvailable {
if y, err := lp.NewMetric("ib_total_pkts_bw", info.tagSet, m.meta, ibTotalPktsBw, now); err == nil {
y.AddMeta("unit", ibPkgRateUnit)
output <- y
}
}
}
}
}
func (m *InfinibandCollector) Close() {
m.init = false
}