cc-metric-collector/collectors/infinibandMetric.go

227 lines
6.2 KiB
Go
Raw Normal View History

2021-03-25 15:55:06 +01:00
package collectors
import (
"fmt"
"io/ioutil"
"os"
2022-02-07 10:02:38 +01:00
cclog "github.com/ClusterCockpit/cc-metric-collector/internal/ccLogger"
Modularize the whole thing (#16) * Use channels, add a metric router, split up configuration and use extended version of Influx line protocol internally * Use central timer for collectors and router. Add expressions to router * Add expression to router config * Update entry points * Start with README * Update README for CCMetric * Formatting * Update README.md * Add README for MultiChanTicker * Add README for MultiChanTicker * Update README.md * Add README to metric router * Update main README * Remove SinkEntity type * Update README for sinks * Update go files * Update README for receivers * Update collectors README * Update collectors README * Use seperate page per collector * Fix for tempstat page * Add docs for customcmd collector * Add docs for ipmistat collector * Add docs for topprocs collector * Update customCmdMetric.md * Use seconds when calculating LIKWID metrics * Add IB metrics ib_recv_pkts and ib_xmit_pkts * Drop domain part of host name * Updated to latest stable version of likwid * Define source code dependencies in Makefile * Add GPFS / IBM Spectrum Scale collector * Add vet and staticcheck make targets * Add vet and staticcheck make targets * Avoid go vet warning: struct field tag `json:"..., omitempty"` not compatible with reflect.StructTag.Get: suspicious space in struct tag value struct field tag `json:"...", omitempty` not compatible with reflect.StructTag.Get: key:"value" pairs not separated by spaces * Add sample collector to README.md * Add CPU frequency collector * Avoid staticcheck warning: redundant return statement * Avoid staticcheck warning: unnecessary assignment to the blank identifier * Simplified code * Add CPUFreqCollectorCpuinfo a metric collector to measure the current frequency of the CPUs as obtained from /proc/cpuinfo Only measure on the first hyperthread * Add collector for NFS clients * Move publication of metrics into Flush() for NatsSink * Update GitHub actions * Refactoring * Avoid vet warning: Println arg list ends 
with redundant newline * Avoid vet warning struct field commands has json tag but is not exported * Avoid vet warning: return copies lock value. * Corrected typo * Refactoring * Add go sources in internal/... * Bad separator in Makefile * Fix Infiniband collector Co-authored-by: Holger Obermaier <40787752+ho-ob@users.noreply.github.com>
2022-01-25 15:37:43 +01:00
lp "github.com/ClusterCockpit/cc-metric-collector/internal/ccMetric"
2022-04-02 16:05:52 +02:00
stats "github.com/ClusterCockpit/cc-metric-collector/internal/metricRouter"
"golang.org/x/sys/unix"
2021-11-25 15:11:39 +01:00
"encoding/json"
"path/filepath"
2021-03-25 15:55:06 +01:00
"strconv"
2021-03-25 17:47:08 +01:00
"strings"
2021-03-25 15:55:06 +01:00
"time"
)
// IB_BASEPATH is the directory below which Init searches for InfiniBand
// port directories, using the glob pattern <IB_BASEPATH>/*/ports/*.
const IB_BASEPATH = "/sys/class/infiniband/"
2022-04-01 17:14:26 +02:00
// InfinibandCollectorMetric describes a single port counter:
//   - path: the counter file the value is read from
//   - unit: the unit attached to the resulting metric as "unit" meta information
type InfinibandCollectorMetric struct {
	path, unit string
}
type InfinibandCollectorInfo struct {
2022-04-01 17:14:26 +02:00
LID string // IB local Identifier (LID)
device string // IB device
port string // IB device port
portCounterFiles map[string]InfinibandCollectorMetric // mapping counter name -> InfinibandCollectorMetric
tagSet map[string]string // corresponding tag list
lastState map[string]int64 // State from last measurement
}
2021-03-25 15:55:06 +01:00
type InfinibandCollector struct {
Modularize the whole thing (#16) * Use channels, add a metric router, split up configuration and use extended version of Influx line protocol internally * Use central timer for collectors and router. Add expressions to router * Add expression to router config * Update entry points * Start with README * Update README for CCMetric * Formatting * Update README.md * Add README for MultiChanTicker * Add README for MultiChanTicker * Update README.md * Add README to metric router * Update main README * Remove SinkEntity type * Update README for sinks * Update go files * Update README for receivers * Update collectors README * Update collectors README * Use seperate page per collector * Fix for tempstat page * Add docs for customcmd collector * Add docs for ipmistat collector * Add docs for topprocs collector * Update customCmdMetric.md * Use seconds when calculating LIKWID metrics * Add IB metrics ib_recv_pkts and ib_xmit_pkts * Drop domain part of host name * Updated to latest stable version of likwid * Define source code dependencies in Makefile * Add GPFS / IBM Spectrum Scale collector * Add vet and staticcheck make targets * Add vet and staticcheck make targets * Avoid go vet warning: struct field tag `json:"..., omitempty"` not compatible with reflect.StructTag.Get: suspicious space in struct tag value struct field tag `json:"...", omitempty` not compatible with reflect.StructTag.Get: key:"value" pairs not separated by spaces * Add sample collector to README.md * Add CPU frequency collector * Avoid staticcheck warning: redundant return statement * Avoid staticcheck warning: unnecessary assignment to the blank identifier * Simplified code * Add CPUFreqCollectorCpuinfo a metric collector to measure the current frequency of the CPUs as obtained from /proc/cpuinfo Only measure on the first hyperthread * Add collector for NFS clients * Move publication of metrics into Flush() for NatsSink * Update GitHub actions * Refactoring * Avoid vet warning: Println arg list ends 
with redundant newline * Avoid vet warning struct field commands has json tag but is not exported * Avoid vet warning: return copies lock value. * Corrected typo * Refactoring * Add go sources in internal/... * Bad separator in Makefile * Fix Infiniband collector Co-authored-by: Holger Obermaier <40787752+ho-ob@users.noreply.github.com>
2022-01-25 15:37:43 +01:00
metricCollector
config struct {
ExcludeDevices []string `json:"exclude_devices,omitempty"` // IB device to exclude e.g. mlx5_0
SendAbsoluteValues bool `json:"send_abs_values"` // Send absolut values as read from sys filesystem
SendDerivedValues bool `json:"send_derived_values"` // Send derived values e.g. rates
}
2022-04-02 16:05:52 +02:00
info []*InfinibandCollectorInfo
lastTimestamp time.Time // Store time stamp of last tick to derive bandwidths
statsProcessedMetrics int64
2021-03-25 15:55:06 +01:00
}
// Init initializes the Infiniband collector by walking through files below IB_BASEPATH
Modularize the whole thing (#16) * Use channels, add a metric router, split up configuration and use extended version of Influx line protocol internally * Use central timer for collectors and router. Add expressions to router * Add expression to router config * Update entry points * Start with README * Update README for CCMetric * Formatting * Update README.md * Add README for MultiChanTicker * Add README for MultiChanTicker * Update README.md * Add README to metric router * Update main README * Remove SinkEntity type * Update README for sinks * Update go files * Update README for receivers * Update collectors README * Update collectors README * Use seperate page per collector * Fix for tempstat page * Add docs for customcmd collector * Add docs for ipmistat collector * Add docs for topprocs collector * Update customCmdMetric.md * Use seconds when calculating LIKWID metrics * Add IB metrics ib_recv_pkts and ib_xmit_pkts * Drop domain part of host name * Updated to latest stable version of likwid * Define source code dependencies in Makefile * Add GPFS / IBM Spectrum Scale collector * Add vet and staticcheck make targets * Add vet and staticcheck make targets * Avoid go vet warning: struct field tag `json:"..., omitempty"` not compatible with reflect.StructTag.Get: suspicious space in struct tag value struct field tag `json:"...", omitempty` not compatible with reflect.StructTag.Get: key:"value" pairs not separated by spaces * Add sample collector to README.md * Add CPU frequency collector * Avoid staticcheck warning: redundant return statement * Avoid staticcheck warning: unnecessary assignment to the blank identifier * Simplified code * Add CPUFreqCollectorCpuinfo a metric collector to measure the current frequency of the CPUs as obtained from /proc/cpuinfo Only measure on the first hyperthread * Add collector for NFS clients * Move publication of metrics into Flush() for NatsSink * Update GitHub actions * Refactoring * Avoid vet warning: Println arg list ends 
with redundant newline * Avoid vet warning struct field commands has json tag but is not exported * Avoid vet warning: return copies lock value. * Corrected typo * Refactoring * Add go sources in internal/... * Bad separator in Makefile * Fix Infiniband collector Co-authored-by: Holger Obermaier <40787752+ho-ob@users.noreply.github.com>
2022-01-25 15:37:43 +01:00
func (m *InfinibandCollector) Init(config json.RawMessage) error {
// Check if already initialized
if m.init {
return nil
}
2021-11-25 15:11:39 +01:00
var err error
2021-03-25 17:47:08 +01:00
m.name = "InfinibandCollector"
2021-03-25 15:55:06 +01:00
m.setup()
m.meta = map[string]string{
"source": m.name,
"group": "Network",
}
// Set default configuration,
m.config.SendAbsoluteValues = true
m.config.SendDerivedValues = false
// Read configuration file, allow overwriting default config
if len(config) > 0 {
2021-11-25 15:11:39 +01:00
err = json.Unmarshal(config, &m.config)
if err != nil {
return err
}
}
// Loop for all InfiniBand directories
globPattern := filepath.Join(IB_BASEPATH, "*", "ports", "*")
ibDirs, err := filepath.Glob(globPattern)
if err != nil {
return fmt.Errorf("unable to glob files with pattern %s: %v", globPattern, err)
}
if ibDirs == nil {
return fmt.Errorf("unable to find any directories with pattern %s", globPattern)
}
for _, path := range ibDirs {
// Skip, when no LID is assigned
line, err := ioutil.ReadFile(filepath.Join(path, "lid"))
if err != nil {
continue
}
LID := strings.TrimSpace(string(line))
if LID == "0x0" {
continue
}
// Get device and port component
pathSplit := strings.Split(path, string(os.PathSeparator))
device := pathSplit[4]
port := pathSplit[6]
// Skip excluded devices
skip := false
for _, excludedDevice := range m.config.ExcludeDevices {
if excludedDevice == device {
skip = true
break
2021-11-25 15:11:39 +01:00
}
}
if skip {
continue
}
// Check access to counter files
countersDir := filepath.Join(path, "counters")
2022-04-01 17:14:26 +02:00
portCounterFiles := map[string]InfinibandCollectorMetric{
"ib_recv": {path: filepath.Join(countersDir, "port_rcv_data"), unit: "bytes"},
"ib_xmit": {path: filepath.Join(countersDir, "port_xmit_data"), unit: "bytes"},
"ib_recv_pkts": {path: filepath.Join(countersDir, "port_rcv_packets"), unit: "packets"},
"ib_xmit_pkts": {path: filepath.Join(countersDir, "port_xmit_packets"), unit: "packets"},
}
2022-04-01 17:14:26 +02:00
for _, counter := range portCounterFiles {
err := unix.Access(counter.path, unix.R_OK)
if err != nil {
2022-04-01 17:14:26 +02:00
return fmt.Errorf("unable to access %s: %v", counter.path, err)
2021-11-25 15:11:39 +01:00
}
}
// Initialize last state
lastState := make(map[string]int64)
for counter := range portCounterFiles {
lastState[counter] = -1
}
m.info = append(m.info,
2022-02-15 15:37:25 +01:00
&InfinibandCollectorInfo{
LID: LID,
device: device,
port: port,
portCounterFiles: portCounterFiles,
tagSet: map[string]string{
"type": "node",
"device": device,
"port": port,
"lid": LID,
},
lastState: lastState,
})
}
2021-11-25 15:11:39 +01:00
if len(m.info) == 0 {
return fmt.Errorf("found no IB devices")
2021-03-25 15:55:06 +01:00
}
2022-04-02 16:05:52 +02:00
m.statsProcessedMetrics = 0
m.init = true
return nil
}
2021-03-25 17:47:08 +01:00
// Read reads Infiniband counter files below IB_BASEPATH
func (m *InfinibandCollector) Read(interval time.Duration, output chan lp.CCMetric) {
2021-03-25 15:55:06 +01:00
// Check if already initialized
if !m.init {
return
}
// Current time stamp
now := time.Now()
// time difference to last time stamp
timeDiff := now.Sub(m.lastTimestamp).Seconds()
// Save current timestamp
m.lastTimestamp = now
2022-02-15 15:37:25 +01:00
for _, info := range m.info {
2022-04-01 17:14:26 +02:00
for counterName, counterDef := range info.portCounterFiles {
// Read counter file
2022-04-01 17:14:26 +02:00
line, err := ioutil.ReadFile(counterDef.path)
if err != nil {
2022-02-07 10:02:38 +01:00
cclog.ComponentError(
m.name,
2022-04-01 17:14:26 +02:00
fmt.Sprintf("Read(): Failed to read from file '%s': %v", counterDef.path, err))
2022-02-07 10:02:38 +01:00
continue
}
data := strings.TrimSpace(string(line))
// convert counter to int64
2022-02-07 10:02:38 +01:00
v, err := strconv.ParseInt(data, 10, 64)
if err != nil {
cclog.ComponentError(
m.name,
fmt.Sprintf("Read(): Failed to convert Infininiband metrice %s='%s' to int64: %v", counterName, data, err))
continue
}
// Send absolut values
if m.config.SendAbsoluteValues {
if y, err := lp.New(counterName, info.tagSet, m.meta, map[string]interface{}{"value": v}, now); err == nil {
2022-04-01 17:14:26 +02:00
y.AddMeta("unit", counterDef.unit)
output <- y
2022-04-02 16:05:52 +02:00
m.statsProcessedMetrics++
}
}
// Send derived values
if m.config.SendDerivedValues {
if info.lastState[counterName] >= 0 {
rate := float64((v - info.lastState[counterName])) / timeDiff
if y, err := lp.New(counterName+"_bw", info.tagSet, m.meta, map[string]interface{}{"value": rate}, now); err == nil {
2022-04-01 17:14:26 +02:00
y.AddMeta("unit", counterDef.unit+"/sec")
output <- y
2022-04-02 16:05:52 +02:00
m.statsProcessedMetrics++
}
}
// Save current state
info.lastState[counterName] = v
2021-11-25 15:11:39 +01:00
}
}
2022-02-07 10:02:38 +01:00
2021-11-25 15:11:39 +01:00
}
2022-04-02 16:05:52 +02:00
stats.ComponentStatInt(m.name, "processed_metrics", m.statsProcessedMetrics)
2021-03-25 15:55:06 +01:00
}
func (m *InfinibandCollector) Close() {
2021-10-04 15:47:03 +02:00
m.init = false
2021-03-25 15:55:06 +01:00
}