// Copyright (C) NHR@FAU, University Erlangen-Nuremberg. // All rights reserved. This file is part of cc-lib. // Use of this source code is governed by a MIT-style // license that can be found in the LICENSE file. // additional authors: // Holger Obermaier (NHR@KIT) package collectors import ( "bytes" "encoding/json" "fmt" "os" "path/filepath" "slices" "strconv" "strings" "time" cclog "github.com/ClusterCockpit/cc-lib/v2/ccLogger" lp "github.com/ClusterCockpit/cc-lib/v2/ccMessage" "golang.org/x/sys/unix" ) // See: https://www.kernel.org/doc/Documentation/ABI/stable/sysfs-class-infiniband const ( ibBasePath = "/sys/class/infiniband/" ibDataUnit = "bytes" ibDataRateUnit = ibDataUnit + "/sec" ibPkgUnit = "packets" ibPkgRateUnit = ibPkgUnit + "/sec" ) type InfinibandCollectorMetric struct { name string path string unit string unitRates string scaleByFourLanes bool addToIBTotal bool addToIBTotalPkgs bool lastState uint64 lastStateAvailable bool } type InfinibandCollectorInfo struct { lid string // IB local Identifier (LID) device string // IB device port string // IB device port portCounterFiles []InfinibandCollectorMetric // mapping counter name -> InfinibandCollectorMetric tagSet map[string]string // corresponding tag list } type InfinibandCollector struct { metricCollector config struct { ExcludeDevices []string `json:"exclude_devices,omitempty"` // IB device to exclude e.g. mlx5_0 SendAbsoluteValues bool `json:"send_abs_values"` // Send absolut values as read from sys filesystem SendTotalValues bool `json:"send_total_values"` // Send computed total values SendDerivedValues bool `json:"send_derived_values"` // Send derived values e.g. rates } info []InfinibandCollectorInfo lastTimestamp time.Time // Store time stamp of last tick to derive bandwidths } // Init initializes the Infiniband collector by walking through files below ibBasePath func (m *InfinibandCollector) Init(config json.RawMessage) error { // Check if already initialized if m.init { return nil } var err error m.name = "InfinibandCollector" if err := m.setup(); err != nil { return fmt.Errorf("%s Init(): setup() call failed: %w", m.name, err) } m.parallel = true m.meta = map[string]string{ "source": m.name, "group": "Network", } // Set default configuration, m.config.SendAbsoluteValues = true m.config.SendDerivedValues = false // Read configuration file, allow overwriting default config if len(config) > 0 { d := json.NewDecoder(bytes.NewReader(config)) d.DisallowUnknownFields() if err := d.Decode(&m.config); err != nil { return fmt.Errorf("%s Init(): failed to decode JSON config: %w", m.name, err) } } // Loop for all InfiniBand directories globPattern := filepath.Join(ibBasePath, "*", "ports", "*") ibDirs, err := filepath.Glob(globPattern) if err != nil { return fmt.Errorf("%s Init(): unable to glob files with pattern %s: %w", m.name, globPattern, err) } if ibDirs == nil { return fmt.Errorf("%s Init(): unable to find any directories with pattern %s", m.name, globPattern) } for _, path := range ibDirs { // Skip, when no LID is assigned line, err := os.ReadFile(filepath.Join(path, "lid")) if err != nil { continue } LID := strings.TrimSpace(string(line)) if LID == "0x0" { continue } // Get device and port component pathSplit := strings.Split(path, string(os.PathSeparator)) device := pathSplit[4] port := pathSplit[6] // Skip excluded devices if slices.Contains(m.config.ExcludeDevices, device) { continue } // Check access to counter files countersDir := filepath.Join(path, "counters") portCounterFiles := []InfinibandCollectorMetric{ { // Total number of data octets, divided by 4 (lanes), received on all VLs. // This is 64 bit counter name: "ib_recv", path: filepath.Join(countersDir, "port_rcv_data"), unit: ibDataUnit, unitRates: ibDataRateUnit, scaleByFourLanes: true, addToIBTotal: true, }, { // Total number of data octets, divided by 4 (lanes), transmitted on all VLs. // This is 64 bit counter name: "ib_xmit", path: filepath.Join(countersDir, "port_xmit_data"), unit: ibDataUnit, unitRates: ibDataRateUnit, scaleByFourLanes: true, addToIBTotal: true, }, { // Total number of packets received on all VLs from this port (this may include packets containing Errors. // This is 64 bit counter. name: "ib_recv_pkts", path: filepath.Join(countersDir, "port_rcv_packets"), unit: ibPkgUnit, unitRates: ibPkgRateUnit, addToIBTotalPkgs: true, }, { // Total number of packets transmitted on all VLs from this port. This may include packets with errors. // This is 64 bit counter. name: "ib_xmit_pkts", path: filepath.Join(countersDir, "port_xmit_packets"), unit: ibPkgUnit, unitRates: ibPkgRateUnit, addToIBTotalPkgs: true, }, } for _, counter := range portCounterFiles { err := unix.Access(counter.path, unix.R_OK) if err != nil { return fmt.Errorf("%s Init(): unable to access %s: %w", m.name, counter.path, err) } } m.info = append(m.info, InfinibandCollectorInfo{ lid: LID, device: device, port: port, portCounterFiles: portCounterFiles, tagSet: map[string]string{ "type": "node", "device": device, "port": port, "lid": LID, }, }) } if len(m.info) == 0 { return fmt.Errorf("%s Init(): found no IB devices", m.name) } m.init = true return nil } // Read reads Infiniband counter files below ibBasePath func (m *InfinibandCollector) Read(interval time.Duration, output chan lp.CCMessage) { // Check if already initialized if !m.init { return } // Current time stamp now := time.Now() // time difference to last time stamp timeDiff := now.Sub(m.lastTimestamp).Seconds() // Save current timestamp m.lastTimestamp = now for i := range m.info { info := &m.info[i] var ibTotal, ibTotalPkts uint64 // sum of xmit and recv counters var ibTotalBw, ibTotalPktsBw float64 // sum of xmit and recv rates var ibTotalBwAvailable, ibTotalPktsBwAvailable bool for i := range info.portCounterFiles { counterDef := &info.portCounterFiles[i] // Read counter file line, err := os.ReadFile(counterDef.path) if err != nil { cclog.ComponentErrorf( m.name, "Read(): Failed to read from file '%s': %v", counterDef.path, err) // Current counter can not be saved as last state counterDef.lastStateAvailable = false continue } data := strings.TrimSpace(string(line)) // convert counter to uint64 vRawCounter, err := strconv.ParseUint(data, 10, 64) if err != nil { cclog.ComponentErrorf( m.name, "Read(): Failed to convert Infininiband metrice %s='%s' to uint64: %v", counterDef.name, data, err) // Current counter can not be saved as last state counterDef.lastStateAvailable = false continue } vScaledCounter := vRawCounter if counterDef.scaleByFourLanes { vScaledCounter *= uint64(4) } // Send absolut values if m.config.SendAbsoluteValues { if y, err := lp.NewMetric(counterDef.name, info.tagSet, m.meta, vScaledCounter, now); err == nil { y.AddMeta("unit", counterDef.unit) output <- y } } // Send derived values if m.config.SendDerivedValues { if counterDef.lastStateAvailable { var rate float64 // uint64 subtraction handles wraparound automatically // in case vRawCounter < counterDef.lastState we would compute: // math.MaxUint64 - lastState + vRawCounter + 1 // = (2^64 - 1) - lastState + vRawCounter + 1 // = 2^64 - lastState + vRawCounter // ≡ vRawCounter - lastState (mod 2^64) rate = float64(vRawCounter-counterDef.lastState) / timeDiff if counterDef.scaleByFourLanes { rate *= float64(4) } if y, err := lp.NewMetric(counterDef.name+"_bw", info.tagSet, m.meta, rate, now); err == nil { y.AddMeta("unit", counterDef.unitRates) output <- y } // Sum up rates for total rates if m.config.SendTotalValues { switch { case counterDef.addToIBTotal: ibTotalBw += rate ibTotalBwAvailable = true case counterDef.addToIBTotalPkgs: ibTotalPktsBw += rate ibTotalPktsBwAvailable = true } } } counterDef.lastState = vRawCounter counterDef.lastStateAvailable = true } // Sum up total values if m.config.SendTotalValues { switch { case counterDef.addToIBTotal: ibTotal += vScaledCounter case counterDef.addToIBTotalPkgs: ibTotalPkts += vScaledCounter } } } // Send total values if m.config.SendTotalValues { if y, err := lp.NewMetric("ib_total", info.tagSet, m.meta, ibTotal, now); err == nil { y.AddMeta("unit", ibDataUnit) output <- y } if y, err := lp.NewMetric("ib_total_pkts", info.tagSet, m.meta, ibTotalPkts, now); err == nil { y.AddMeta("unit", ibPkgUnit) output <- y } if m.config.SendDerivedValues && ibTotalBwAvailable { if y, err := lp.NewMetric("ib_total_bw", info.tagSet, m.meta, ibTotalBw, now); err == nil { y.AddMeta("unit", ibDataRateUnit) output <- y } } if m.config.SendDerivedValues && ibTotalPktsBwAvailable { if y, err := lp.NewMetric("ib_total_pkts_bw", info.tagSet, m.meta, ibTotalPktsBw, now); err == nil { y.AddMeta("unit", ibPkgRateUnit) output <- y } } } } } func (m *InfinibandCollector) Close() { m.init = false }