2021-03-25 15:55:06 +01:00
|
|
|
package collectors
|
|
|
|
|
|
|
|
import (
|
|
|
|
"fmt"
|
2022-01-27 11:08:27 +01:00
|
|
|
"os"
|
2022-01-26 20:18:47 +01:00
|
|
|
|
2022-10-10 11:53:11 +02:00
|
|
|
cclog "github.com/ClusterCockpit/cc-metric-collector/pkg/ccLogger"
|
2024-12-19 23:00:14 +01:00
|
|
|
lp "github.com/ClusterCockpit/cc-energy-manager/pkg/cc-message"
|
2022-01-27 11:08:27 +01:00
|
|
|
"golang.org/x/sys/unix"
|
2022-01-26 20:18:47 +01:00
|
|
|
|
2021-11-25 15:11:39 +01:00
|
|
|
"encoding/json"
|
2021-11-25 14:04:03 +01:00
|
|
|
"path/filepath"
|
2021-03-25 15:55:06 +01:00
|
|
|
"strconv"
|
2021-03-25 17:47:08 +01:00
|
|
|
"strings"
|
2021-03-25 15:55:06 +01:00
|
|
|
"time"
|
|
|
|
)
|
|
|
|
|
2022-03-11 13:48:18 +01:00
|
|
|
// IB_BASEPATH is the directory below which Init looks for InfiniBand device
// ports, using the layout <IB_BASEPATH>/<device>/ports/<port>.
const IB_BASEPATH = "/sys/class/infiniband/"
|
2021-11-25 14:04:03 +01:00
|
|
|
|
2022-04-01 17:14:26 +02:00
|
|
|
// InfinibandCollectorMetric describes a single port counter file together
// with the values most recently read from it.
type InfinibandCollectorMetric struct {
	// name is the metric name, path the counter file to read and unit the
	// unit attached to the metric as meta information.
	name, path, unit string
	// scale is the factor the raw counter value is multiplied with.
	scale int64
	// addToIBTotal / addToIBTotalPkgs select whether the counter is summed
	// into the "ib_total" or the "ib_total_pkts" metric.
	addToIBTotal, addToIBTotalPkgs bool
	// currentState is the scaled value of the latest read, lastState the one
	// of the previous read (initialized to -1 until a value has been stored).
	currentState, lastState int64
}
|
|
|
|
|
2022-01-27 11:08:27 +01:00
|
|
|
type InfinibandCollectorInfo struct {
|
2023-08-29 14:12:49 +02:00
|
|
|
LID string // IB local Identifier (LID)
|
|
|
|
device string // IB device
|
|
|
|
port string // IB device port
|
|
|
|
portCounterFiles []InfinibandCollectorMetric // mapping counter name -> InfinibandCollectorMetric
|
|
|
|
tagSet map[string]string // corresponding tag list
|
2022-01-27 11:08:27 +01:00
|
|
|
}
|
|
|
|
|
2021-03-25 15:55:06 +01:00
|
|
|
type InfinibandCollector struct {
|
2022-01-25 15:37:43 +01:00
|
|
|
metricCollector
|
2022-01-26 20:18:47 +01:00
|
|
|
config struct {
|
2022-03-11 13:48:18 +01:00
|
|
|
ExcludeDevices []string `json:"exclude_devices,omitempty"` // IB device to exclude e.g. mlx5_0
|
|
|
|
SendAbsoluteValues bool `json:"send_abs_values"` // Send absolut values as read from sys filesystem
|
2023-08-29 14:12:49 +02:00
|
|
|
SendTotalValues bool `json:"send_total_values"` // Send computed total values
|
2022-03-11 13:48:18 +01:00
|
|
|
SendDerivedValues bool `json:"send_derived_values"` // Send derived values e.g. rates
|
2022-01-26 20:18:47 +01:00
|
|
|
}
|
2023-08-29 14:12:49 +02:00
|
|
|
info []InfinibandCollectorInfo
|
2022-03-11 13:48:18 +01:00
|
|
|
lastTimestamp time.Time // Store time stamp of last tick to derive bandwidths
|
2021-03-25 15:55:06 +01:00
|
|
|
}
|
|
|
|
|
2022-01-27 11:08:27 +01:00
|
|
|
// Init initializes the Infiniband collector by walking through files below IB_BASEPATH
|
2022-01-25 15:37:43 +01:00
|
|
|
func (m *InfinibandCollector) Init(config json.RawMessage) error {
|
2022-02-07 09:46:19 +01:00
|
|
|
|
|
|
|
// Check if already initialized
|
2022-02-07 15:43:57 +01:00
|
|
|
if m.init {
|
2022-02-07 09:46:19 +01:00
|
|
|
return nil
|
|
|
|
}
|
|
|
|
|
2021-11-25 15:11:39 +01:00
|
|
|
var err error
|
2021-03-25 17:47:08 +01:00
|
|
|
m.name = "InfinibandCollector"
|
2021-03-25 15:55:06 +01:00
|
|
|
m.setup()
|
2022-05-13 14:10:39 +02:00
|
|
|
m.parallel = true
|
2022-01-27 11:08:27 +01:00
|
|
|
m.meta = map[string]string{
|
|
|
|
"source": m.name,
|
|
|
|
"group": "Network",
|
|
|
|
}
|
2022-03-11 13:48:18 +01:00
|
|
|
|
|
|
|
// Set default configuration,
|
|
|
|
m.config.SendAbsoluteValues = true
|
|
|
|
m.config.SendDerivedValues = false
|
|
|
|
// Read configuration file, allow overwriting default config
|
2021-11-25 14:04:03 +01:00
|
|
|
if len(config) > 0 {
|
2021-11-25 15:11:39 +01:00
|
|
|
err = json.Unmarshal(config, &m.config)
|
|
|
|
if err != nil {
|
|
|
|
return err
|
|
|
|
}
|
2021-11-25 14:04:03 +01:00
|
|
|
}
|
2022-01-27 11:08:27 +01:00
|
|
|
|
|
|
|
// Loop for all InfiniBand directories
|
|
|
|
globPattern := filepath.Join(IB_BASEPATH, "*", "ports", "*")
|
|
|
|
ibDirs, err := filepath.Glob(globPattern)
|
|
|
|
if err != nil {
|
2022-03-11 13:48:18 +01:00
|
|
|
return fmt.Errorf("unable to glob files with pattern %s: %v", globPattern, err)
|
2022-01-27 11:08:27 +01:00
|
|
|
}
|
|
|
|
if ibDirs == nil {
|
2022-03-11 13:48:18 +01:00
|
|
|
return fmt.Errorf("unable to find any directories with pattern %s", globPattern)
|
2022-01-27 11:08:27 +01:00
|
|
|
}
|
|
|
|
|
|
|
|
for _, path := range ibDirs {
|
|
|
|
|
|
|
|
// Skip, when no LID is assigned
|
2022-10-09 17:03:38 +02:00
|
|
|
line, err := os.ReadFile(filepath.Join(path, "lid"))
|
2022-02-10 09:28:06 +01:00
|
|
|
if err != nil {
|
|
|
|
continue
|
|
|
|
}
|
|
|
|
LID := strings.TrimSpace(string(line))
|
|
|
|
if LID == "0x0" {
|
2022-01-27 11:08:27 +01:00
|
|
|
continue
|
|
|
|
}
|
|
|
|
|
|
|
|
// Get device and port component
|
|
|
|
pathSplit := strings.Split(path, string(os.PathSeparator))
|
|
|
|
device := pathSplit[4]
|
|
|
|
port := pathSplit[6]
|
|
|
|
|
|
|
|
// Skip excluded devices
|
|
|
|
skip := false
|
|
|
|
for _, excludedDevice := range m.config.ExcludeDevices {
|
|
|
|
if excludedDevice == device {
|
|
|
|
skip = true
|
|
|
|
break
|
2021-11-25 15:11:39 +01:00
|
|
|
}
|
2022-01-27 11:08:27 +01:00
|
|
|
}
|
|
|
|
if skip {
|
|
|
|
continue
|
|
|
|
}
|
|
|
|
|
|
|
|
// Check access to counter files
|
|
|
|
countersDir := filepath.Join(path, "counters")
|
2023-08-29 14:12:49 +02:00
|
|
|
portCounterFiles := []InfinibandCollectorMetric{
|
|
|
|
{
|
|
|
|
name: "ib_recv",
|
|
|
|
path: filepath.Join(countersDir, "port_rcv_data"),
|
|
|
|
unit: "bytes",
|
|
|
|
scale: 4,
|
|
|
|
addToIBTotal: true,
|
|
|
|
lastState: -1,
|
|
|
|
},
|
|
|
|
{
|
|
|
|
name: "ib_xmit",
|
|
|
|
path: filepath.Join(countersDir, "port_xmit_data"),
|
|
|
|
unit: "bytes",
|
|
|
|
scale: 4,
|
|
|
|
addToIBTotal: true,
|
|
|
|
lastState: -1,
|
|
|
|
},
|
|
|
|
{
|
|
|
|
name: "ib_recv_pkts",
|
|
|
|
path: filepath.Join(countersDir, "port_rcv_packets"),
|
|
|
|
unit: "packets",
|
|
|
|
scale: 1,
|
|
|
|
addToIBTotalPkgs: true,
|
|
|
|
lastState: -1,
|
|
|
|
},
|
|
|
|
{
|
|
|
|
name: "ib_xmit_pkts",
|
|
|
|
path: filepath.Join(countersDir, "port_xmit_packets"),
|
|
|
|
unit: "packets",
|
|
|
|
scale: 1,
|
|
|
|
addToIBTotalPkgs: true,
|
|
|
|
lastState: -1,
|
|
|
|
},
|
2022-01-27 11:08:27 +01:00
|
|
|
}
|
2022-04-01 17:14:26 +02:00
|
|
|
for _, counter := range portCounterFiles {
|
|
|
|
err := unix.Access(counter.path, unix.R_OK)
|
2022-01-27 11:08:27 +01:00
|
|
|
if err != nil {
|
2022-04-01 17:14:26 +02:00
|
|
|
return fmt.Errorf("unable to access %s: %v", counter.path, err)
|
2021-11-25 15:11:39 +01:00
|
|
|
}
|
|
|
|
}
|
2022-01-27 11:08:27 +01:00
|
|
|
|
|
|
|
m.info = append(m.info,
|
2023-08-29 14:12:49 +02:00
|
|
|
InfinibandCollectorInfo{
|
2022-01-27 11:08:27 +01:00
|
|
|
LID: LID,
|
|
|
|
device: device,
|
|
|
|
port: port,
|
|
|
|
portCounterFiles: portCounterFiles,
|
|
|
|
tagSet: map[string]string{
|
|
|
|
"type": "node",
|
|
|
|
"device": device,
|
|
|
|
"port": port,
|
|
|
|
"lid": LID,
|
|
|
|
},
|
|
|
|
})
|
2021-11-25 14:04:03 +01:00
|
|
|
}
|
2021-11-25 15:11:39 +01:00
|
|
|
|
2022-01-27 11:08:27 +01:00
|
|
|
if len(m.info) == 0 {
|
2022-03-11 13:48:18 +01:00
|
|
|
return fmt.Errorf("found no IB devices")
|
2021-03-25 15:55:06 +01:00
|
|
|
}
|
2021-03-25 17:47:08 +01:00
|
|
|
|
2022-01-26 20:18:47 +01:00
|
|
|
m.init = true
|
|
|
|
return nil
|
2021-11-25 14:04:03 +01:00
|
|
|
}
|
2021-03-25 17:47:08 +01:00
|
|
|
|
2022-01-27 11:08:27 +01:00
|
|
|
// Read reads Infiniband counter files below IB_BASEPATH
|
2024-12-19 23:00:14 +01:00
|
|
|
func (m *InfinibandCollector) Read(interval time.Duration, output chan lp.CCMessage) {
|
2021-03-25 15:55:06 +01:00
|
|
|
|
2022-01-27 11:08:27 +01:00
|
|
|
// Check if already initialized
|
|
|
|
if !m.init {
|
|
|
|
return
|
|
|
|
}
|
|
|
|
|
2022-03-11 13:48:18 +01:00
|
|
|
// Current time stamp
|
2022-01-27 11:08:27 +01:00
|
|
|
now := time.Now()
|
2022-03-11 13:48:18 +01:00
|
|
|
// time difference to last time stamp
|
|
|
|
timeDiff := now.Sub(m.lastTimestamp).Seconds()
|
|
|
|
// Save current timestamp
|
|
|
|
m.lastTimestamp = now
|
|
|
|
|
2023-08-29 14:12:49 +02:00
|
|
|
for i := range m.info {
|
|
|
|
info := &m.info[i]
|
|
|
|
|
|
|
|
var ib_total, ib_total_pkts int64
|
|
|
|
for i := range info.portCounterFiles {
|
|
|
|
counterDef := &info.portCounterFiles[i]
|
2022-03-11 13:48:18 +01:00
|
|
|
|
|
|
|
// Read counter file
|
2022-10-09 17:03:38 +02:00
|
|
|
line, err := os.ReadFile(counterDef.path)
|
2022-02-10 09:28:06 +01:00
|
|
|
if err != nil {
|
2022-02-07 10:02:38 +01:00
|
|
|
cclog.ComponentError(
|
|
|
|
m.name,
|
2022-04-01 17:14:26 +02:00
|
|
|
fmt.Sprintf("Read(): Failed to read from file '%s': %v", counterDef.path, err))
|
2022-02-07 10:02:38 +01:00
|
|
|
continue
|
|
|
|
}
|
2022-02-10 09:28:06 +01:00
|
|
|
data := strings.TrimSpace(string(line))
|
2022-03-11 13:48:18 +01:00
|
|
|
|
|
|
|
// convert counter to int64
|
2022-02-07 10:02:38 +01:00
|
|
|
v, err := strconv.ParseInt(data, 10, 64)
|
|
|
|
if err != nil {
|
|
|
|
cclog.ComponentError(
|
|
|
|
m.name,
|
2023-08-29 14:12:49 +02:00
|
|
|
fmt.Sprintf("Read(): Failed to convert Infininiband metrice %s='%s' to int64: %v", counterDef.name, data, err))
|
2022-02-07 10:02:38 +01:00
|
|
|
continue
|
|
|
|
}
|
2022-09-09 19:27:20 +02:00
|
|
|
// Scale raw value
|
|
|
|
v *= counterDef.scale
|
2022-03-11 13:48:18 +01:00
|
|
|
|
2023-08-29 14:12:49 +02:00
|
|
|
// Save current state
|
|
|
|
counterDef.currentState = v
|
|
|
|
|
2022-03-11 13:48:18 +01:00
|
|
|
// Send absolut values
|
|
|
|
if m.config.SendAbsoluteValues {
|
2023-08-29 14:12:49 +02:00
|
|
|
if y, err :=
|
2024-12-19 23:00:14 +01:00
|
|
|
lp.NewMessage(
|
2023-08-29 14:12:49 +02:00
|
|
|
counterDef.name,
|
|
|
|
info.tagSet,
|
|
|
|
m.meta,
|
|
|
|
map[string]interface{}{
|
|
|
|
"value": counterDef.currentState,
|
|
|
|
},
|
|
|
|
now); err == nil {
|
2022-04-01 17:14:26 +02:00
|
|
|
y.AddMeta("unit", counterDef.unit)
|
2022-03-11 13:48:18 +01:00
|
|
|
output <- y
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
// Send derived values
|
|
|
|
if m.config.SendDerivedValues {
|
2023-08-29 14:12:49 +02:00
|
|
|
if counterDef.lastState >= 0 {
|
|
|
|
rate := float64((counterDef.currentState - counterDef.lastState)) / timeDiff
|
|
|
|
if y, err :=
|
2024-12-19 23:00:14 +01:00
|
|
|
lp.NewMessage(
|
2023-08-29 14:12:49 +02:00
|
|
|
counterDef.name+"_bw",
|
|
|
|
info.tagSet,
|
|
|
|
m.meta,
|
|
|
|
map[string]interface{}{
|
|
|
|
"value": rate,
|
|
|
|
},
|
|
|
|
now); err == nil {
|
2022-04-01 17:14:26 +02:00
|
|
|
y.AddMeta("unit", counterDef.unit+"/sec")
|
2022-03-11 13:48:18 +01:00
|
|
|
output <- y
|
2023-08-29 14:12:49 +02:00
|
|
|
|
2022-03-11 13:48:18 +01:00
|
|
|
}
|
|
|
|
}
|
2023-08-29 14:12:49 +02:00
|
|
|
counterDef.lastState = counterDef.currentState
|
|
|
|
}
|
|
|
|
|
|
|
|
// Sum up total values
|
|
|
|
if m.config.SendTotalValues {
|
|
|
|
switch {
|
|
|
|
case counterDef.addToIBTotal:
|
|
|
|
ib_total += counterDef.currentState
|
|
|
|
case counterDef.addToIBTotalPkgs:
|
|
|
|
ib_total_pkts += counterDef.currentState
|
|
|
|
}
|
2021-11-25 15:11:39 +01:00
|
|
|
}
|
|
|
|
}
|
2022-02-07 10:02:38 +01:00
|
|
|
|
2023-08-29 14:12:49 +02:00
|
|
|
// Send total values
|
|
|
|
if m.config.SendTotalValues {
|
|
|
|
if y, err :=
|
2024-12-19 23:00:14 +01:00
|
|
|
lp.NewMessage(
|
2023-08-29 14:12:49 +02:00
|
|
|
"ib_total",
|
|
|
|
info.tagSet,
|
|
|
|
m.meta,
|
|
|
|
map[string]interface{}{
|
|
|
|
"value": ib_total,
|
|
|
|
},
|
|
|
|
now); err == nil {
|
|
|
|
y.AddMeta("unit", "bytes")
|
|
|
|
output <- y
|
|
|
|
}
|
|
|
|
|
|
|
|
if y, err :=
|
2024-12-19 23:00:14 +01:00
|
|
|
lp.NewMessage(
|
2023-08-29 14:12:49 +02:00
|
|
|
"ib_total_pkts",
|
|
|
|
info.tagSet,
|
|
|
|
m.meta,
|
|
|
|
map[string]interface{}{
|
|
|
|
"value": ib_total_pkts,
|
|
|
|
},
|
|
|
|
now); err == nil {
|
|
|
|
y.AddMeta("unit", "packets")
|
|
|
|
output <- y
|
|
|
|
}
|
|
|
|
}
|
2021-11-25 15:11:39 +01:00
|
|
|
}
|
2021-03-25 15:55:06 +01:00
|
|
|
}
|
|
|
|
|
|
|
|
func (m *InfinibandCollector) Close() {
|
2021-10-04 15:47:03 +02:00
|
|
|
m.init = false
|
2021-03-25 15:55:06 +01:00
|
|
|
}
|