mirror of
https://github.com/ClusterCockpit/cc-metric-collector.git
synced 2025-04-06 05:35:54 +02:00
add only_metrics. docs: add units
This commit is contained in:
parent
bcfe2b522a
commit
4702ab1570
@ -1,18 +1,17 @@
|
|||||||
package collectors
|
package collectors
|
||||||
|
|
||||||
import (
|
import (
|
||||||
|
"encoding/json"
|
||||||
"fmt"
|
"fmt"
|
||||||
"os"
|
"os"
|
||||||
|
|
||||||
cclog "github.com/ClusterCockpit/cc-metric-collector/pkg/ccLogger"
|
|
||||||
lp "github.com/ClusterCockpit/cc-energy-manager/pkg/cc-message"
|
|
||||||
"golang.org/x/sys/unix"
|
|
||||||
|
|
||||||
"encoding/json"
|
|
||||||
"path/filepath"
|
"path/filepath"
|
||||||
"strconv"
|
"strconv"
|
||||||
"strings"
|
"strings"
|
||||||
"time"
|
"time"
|
||||||
|
|
||||||
|
lp "github.com/ClusterCockpit/cc-lib/ccMessage"
|
||||||
|
cclog "github.com/ClusterCockpit/cc-metric-collector/pkg/ccLogger"
|
||||||
|
"golang.org/x/sys/unix"
|
||||||
)
|
)
|
||||||
|
|
||||||
const IB_BASEPATH = "/sys/class/infiniband/"
|
const IB_BASEPATH = "/sys/class/infiniband/"
|
||||||
@ -32,26 +31,45 @@ type InfinibandCollectorInfo struct {
|
|||||||
LID string // IB local Identifier (LID)
|
LID string // IB local Identifier (LID)
|
||||||
device string // IB device
|
device string // IB device
|
||||||
port string // IB device port
|
port string // IB device port
|
||||||
portCounterFiles []InfinibandCollectorMetric // mapping counter name -> InfinibandCollectorMetric
|
portCounterFiles []InfinibandCollectorMetric // list of counters for this port
|
||||||
tagSet map[string]string // corresponding tag list
|
tagSet map[string]string // tags for this IB port
|
||||||
}
|
}
|
||||||
|
|
||||||
type InfinibandCollector struct {
|
type InfinibandCollector struct {
|
||||||
metricCollector
|
metricCollector
|
||||||
config struct {
|
config struct {
|
||||||
ExcludeDevices []string `json:"exclude_devices,omitempty"` // IB device to exclude e.g. mlx5_0
|
ExcludeDevices []string `json:"exclude_devices,omitempty"`
|
||||||
SendAbsoluteValues bool `json:"send_abs_values"` // Send absolut values as read from sys filesystem
|
ExcludeMetrics []string `json:"exclude_metrics,omitempty"`
|
||||||
SendTotalValues bool `json:"send_total_values"` // Send computed total values
|
OnlyMetrics []string `json:"only_metrics,omitempty"`
|
||||||
SendDerivedValues bool `json:"send_derived_values"` // Send derived values e.g. rates
|
SendAbsoluteValues bool `json:"send_abs_values"`
|
||||||
|
SendTotalValues bool `json:"send_total_values"`
|
||||||
|
SendDerivedValues bool `json:"send_derived_values"`
|
||||||
}
|
}
|
||||||
info []InfinibandCollectorInfo
|
info []InfinibandCollectorInfo
|
||||||
lastTimestamp time.Time // Store time stamp of last tick to derive bandwidths
|
lastTimestamp time.Time // For derived calculations
|
||||||
}
|
}
|
||||||
|
|
||||||
// Init initializes the Infiniband collector by walking through files below IB_BASEPATH
|
// shouldOutput returns true if a metric (or its derived variant) should be forwarded.
|
||||||
func (m *InfinibandCollector) Init(config json.RawMessage) error {
|
func (m *InfinibandCollector) shouldOutput(metricName string) bool {
|
||||||
|
// If only_metrics is set, only metrics with an exact match are allowed.
|
||||||
|
if len(m.config.OnlyMetrics) > 0 {
|
||||||
|
for _, n := range m.config.OnlyMetrics {
|
||||||
|
if n == metricName {
|
||||||
|
return true
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return false
|
||||||
|
}
|
||||||
|
// Otherwise, exclude if present in exclude_metrics.
|
||||||
|
for _, n := range m.config.ExcludeMetrics {
|
||||||
|
if n == metricName {
|
||||||
|
return false
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return true
|
||||||
|
}
|
||||||
|
|
||||||
// Check if already initialized
|
func (m *InfinibandCollector) Init(config json.RawMessage) error {
|
||||||
if m.init {
|
if m.init {
|
||||||
return nil
|
return nil
|
||||||
}
|
}
|
||||||
@ -64,7 +82,6 @@ func (m *InfinibandCollector) Init(config json.RawMessage) error {
|
|||||||
"source": m.name,
|
"source": m.name,
|
||||||
"group": "Network",
|
"group": "Network",
|
||||||
}
|
}
|
||||||
|
|
||||||
// Set default configuration,
|
// Set default configuration,
|
||||||
m.config.SendAbsoluteValues = true
|
m.config.SendAbsoluteValues = true
|
||||||
m.config.SendDerivedValues = false
|
m.config.SendDerivedValues = false
|
||||||
@ -87,9 +104,9 @@ func (m *InfinibandCollector) Init(config json.RawMessage) error {
|
|||||||
}
|
}
|
||||||
|
|
||||||
for _, path := range ibDirs {
|
for _, path := range ibDirs {
|
||||||
|
// Skip when no LID is assigned.
|
||||||
// Skip, when no LID is assigned
|
lidFile := filepath.Join(path, "lid")
|
||||||
line, err := os.ReadFile(filepath.Join(path, "lid"))
|
line, err := os.ReadFile(lidFile)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
continue
|
continue
|
||||||
}
|
}
|
||||||
@ -98,12 +115,15 @@ func (m *InfinibandCollector) Init(config json.RawMessage) error {
|
|||||||
continue
|
continue
|
||||||
}
|
}
|
||||||
|
|
||||||
// Get device and port component
|
// Get device and port components.
|
||||||
pathSplit := strings.Split(path, string(os.PathSeparator))
|
pathSplit := strings.Split(path, string(os.PathSeparator))
|
||||||
|
if len(pathSplit) < 7 {
|
||||||
|
continue
|
||||||
|
}
|
||||||
device := pathSplit[4]
|
device := pathSplit[4]
|
||||||
port := pathSplit[6]
|
port := pathSplit[6]
|
||||||
|
|
||||||
// Skip excluded devices
|
// Skip excluded devices.
|
||||||
skip := false
|
skip := false
|
||||||
for _, excludedDevice := range m.config.ExcludeDevices {
|
for _, excludedDevice := range m.config.ExcludeDevices {
|
||||||
if excludedDevice == device {
|
if excludedDevice == device {
|
||||||
@ -115,7 +135,7 @@ func (m *InfinibandCollector) Init(config json.RawMessage) error {
|
|||||||
continue
|
continue
|
||||||
}
|
}
|
||||||
|
|
||||||
// Check access to counter files
|
// Define the counters for the port.
|
||||||
countersDir := filepath.Join(path, "counters")
|
countersDir := filepath.Join(path, "counters")
|
||||||
portCounterFiles := []InfinibandCollectorMetric{
|
portCounterFiles := []InfinibandCollectorMetric{
|
||||||
{
|
{
|
||||||
@ -158,18 +178,19 @@ func (m *InfinibandCollector) Init(config json.RawMessage) error {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
tagSet := map[string]string{
|
||||||
|
"type": "node",
|
||||||
|
"device": device,
|
||||||
|
"port": port,
|
||||||
|
"lid": LID,
|
||||||
|
}
|
||||||
m.info = append(m.info,
|
m.info = append(m.info,
|
||||||
InfinibandCollectorInfo{
|
InfinibandCollectorInfo{
|
||||||
LID: LID,
|
LID: LID,
|
||||||
device: device,
|
device: device,
|
||||||
port: port,
|
port: port,
|
||||||
portCounterFiles: portCounterFiles,
|
portCounterFiles: portCounterFiles,
|
||||||
tagSet: map[string]string{
|
tagSet: tagSet,
|
||||||
"type": "node",
|
|
||||||
"device": device,
|
|
||||||
"port": port,
|
|
||||||
"lid": LID,
|
|
||||||
},
|
|
||||||
})
|
})
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -178,6 +199,7 @@ func (m *InfinibandCollector) Init(config json.RawMessage) error {
|
|||||||
}
|
}
|
||||||
|
|
||||||
m.init = true
|
m.init = true
|
||||||
|
m.lastTimestamp = time.Now()
|
||||||
return nil
|
return nil
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -198,109 +220,68 @@ func (m *InfinibandCollector) Read(interval time.Duration, output chan lp.CCMess
|
|||||||
|
|
||||||
for i := range m.info {
|
for i := range m.info {
|
||||||
info := &m.info[i]
|
info := &m.info[i]
|
||||||
|
var ib_total, ib_total_pkgs int64
|
||||||
var ib_total, ib_total_pkts int64
|
for j := range info.portCounterFiles {
|
||||||
for i := range info.portCounterFiles {
|
counterDef := &info.portCounterFiles[j]
|
||||||
counterDef := &info.portCounterFiles[i]
|
|
||||||
|
|
||||||
// Read counter file
|
|
||||||
line, err := os.ReadFile(counterDef.path)
|
line, err := os.ReadFile(counterDef.path)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
cclog.ComponentError(
|
cclog.ComponentError(m.name, fmt.Sprintf("Read(): Failed to read from file '%s': %v", counterDef.path, err))
|
||||||
m.name,
|
|
||||||
fmt.Sprintf("Read(): Failed to read from file '%s': %v", counterDef.path, err))
|
|
||||||
continue
|
continue
|
||||||
}
|
}
|
||||||
data := strings.TrimSpace(string(line))
|
data := strings.TrimSpace(string(line))
|
||||||
|
|
||||||
// convert counter to int64
|
|
||||||
v, err := strconv.ParseInt(data, 10, 64)
|
v, err := strconv.ParseInt(data, 10, 64)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
cclog.ComponentError(
|
cclog.ComponentError(m.name, fmt.Sprintf("Read(): Failed to convert counter %s='%s': %v", counterDef.name, data, err))
|
||||||
m.name,
|
|
||||||
fmt.Sprintf("Read(): Failed to convert Infininiband metrice %s='%s' to int64: %v", counterDef.name, data, err))
|
|
||||||
continue
|
continue
|
||||||
}
|
}
|
||||||
// Scale raw value
|
// Scale raw value.
|
||||||
v *= counterDef.scale
|
v *= counterDef.scale
|
||||||
|
|
||||||
// Save current state
|
|
||||||
counterDef.currentState = v
|
counterDef.currentState = v
|
||||||
|
|
||||||
// Send absolut values
|
// Send absolute values.
|
||||||
if m.config.SendAbsoluteValues {
|
if m.config.SendAbsoluteValues && m.shouldOutput(counterDef.name) {
|
||||||
if y, err :=
|
if y, err := lp.NewMessage(counterDef.name, info.tagSet, m.meta, map[string]interface{}{"value": counterDef.currentState}, now); err == nil {
|
||||||
lp.NewMessage(
|
|
||||||
counterDef.name,
|
|
||||||
info.tagSet,
|
|
||||||
m.meta,
|
|
||||||
map[string]interface{}{
|
|
||||||
"value": counterDef.currentState,
|
|
||||||
},
|
|
||||||
now); err == nil {
|
|
||||||
y.AddMeta("unit", counterDef.unit)
|
y.AddMeta("unit", counterDef.unit)
|
||||||
output <- y
|
output <- y
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// Send derived values
|
// Send derived values.
|
||||||
if m.config.SendDerivedValues {
|
if m.config.SendDerivedValues {
|
||||||
if counterDef.lastState >= 0 {
|
if counterDef.lastState >= 0 {
|
||||||
rate := float64((counterDef.currentState - counterDef.lastState)) / timeDiff
|
rate := float64(counterDef.currentState-counterDef.lastState) / timeDiff
|
||||||
if y, err :=
|
if m.shouldOutput(counterDef.name + "_bw") {
|
||||||
lp.NewMessage(
|
if y, err := lp.NewMessage(counterDef.name+"_bw", info.tagSet, m.meta, map[string]interface{}{"value": rate}, now); err == nil {
|
||||||
counterDef.name+"_bw",
|
y.AddMeta("unit", counterDef.unit+"/sec")
|
||||||
info.tagSet,
|
output <- y
|
||||||
m.meta,
|
}
|
||||||
map[string]interface{}{
|
|
||||||
"value": rate,
|
|
||||||
},
|
|
||||||
now); err == nil {
|
|
||||||
y.AddMeta("unit", counterDef.unit+"/sec")
|
|
||||||
output <- y
|
|
||||||
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
counterDef.lastState = counterDef.currentState
|
counterDef.lastState = counterDef.currentState
|
||||||
}
|
}
|
||||||
|
|
||||||
// Sum up total values
|
// Sum up total values if enabled.
|
||||||
if m.config.SendTotalValues {
|
if m.config.SendTotalValues {
|
||||||
switch {
|
if counterDef.addToIBTotal {
|
||||||
case counterDef.addToIBTotal:
|
|
||||||
ib_total += counterDef.currentState
|
ib_total += counterDef.currentState
|
||||||
case counterDef.addToIBTotalPkgs:
|
} else if counterDef.addToIBTotalPkgs {
|
||||||
ib_total_pkts += counterDef.currentState
|
ib_total_pkgs += counterDef.currentState
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
// Send total values.
|
||||||
// Send total values
|
|
||||||
if m.config.SendTotalValues {
|
if m.config.SendTotalValues {
|
||||||
if y, err :=
|
if m.shouldOutput("ib_total") {
|
||||||
lp.NewMessage(
|
if y, err := lp.NewMessage("ib_total", info.tagSet, m.meta, map[string]interface{}{"value": ib_total}, now); err == nil {
|
||||||
"ib_total",
|
y.AddMeta("unit", "bytes")
|
||||||
info.tagSet,
|
output <- y
|
||||||
m.meta,
|
}
|
||||||
map[string]interface{}{
|
|
||||||
"value": ib_total,
|
|
||||||
},
|
|
||||||
now); err == nil {
|
|
||||||
y.AddMeta("unit", "bytes")
|
|
||||||
output <- y
|
|
||||||
}
|
}
|
||||||
|
if m.shouldOutput("ib_total_pkts") {
|
||||||
if y, err :=
|
if y, err := lp.NewMessage("ib_total_pkts", info.tagSet, m.meta, map[string]interface{}{"value": ib_total_pkgs}, now); err == nil {
|
||||||
lp.NewMessage(
|
y.AddMeta("unit", "packets")
|
||||||
"ib_total_pkts",
|
output <- y
|
||||||
info.tagSet,
|
}
|
||||||
m.meta,
|
|
||||||
map[string]interface{}{
|
|
||||||
"value": ib_total_pkts,
|
|
||||||
},
|
|
||||||
now); err == nil {
|
|
||||||
y.AddMeta("unit", "packets")
|
|
||||||
output <- y
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -1,4 +1,3 @@
|
|||||||
|
|
||||||
## `ibstat` collector
|
## `ibstat` collector
|
||||||
|
|
||||||
```json
|
```json
|
||||||
@ -6,30 +5,41 @@
|
|||||||
"exclude_devices": [
|
"exclude_devices": [
|
||||||
"mlx4"
|
"mlx4"
|
||||||
],
|
],
|
||||||
|
"exclude_metrics": [
|
||||||
|
"ib_total"
|
||||||
|
],
|
||||||
|
"only_metrics": [
|
||||||
|
"ib_recv_bw"
|
||||||
|
],
|
||||||
"send_abs_values": true,
|
"send_abs_values": true,
|
||||||
"send_derived_values": true
|
"send_derived_values": true,
|
||||||
|
"send_total_values": true
|
||||||
}
|
}
|
||||||
```
|
```
|
||||||
|
|
||||||
The `ibstat` collector includes all Infiniband devices that can be
|
The ibstat collector includes all InfiniBand devices found under `/sys/class/infiniband/` for which a LID file (`/sys/class/infiniband/<dev>/ports/<port>/lid`) is present.
|
||||||
found below `/sys/class/infiniband/` and where any of the ports provides a
|
Devices can be filtered with the `exclude_devices` option.
|
||||||
LID file (`/sys/class/infiniband/<dev>/ports/<port>/lid`)
|
|
||||||
|
|
||||||
The devices can be filtered with the `exclude_devices` option in the configuration.
|
|
||||||
|
|
||||||
For each found LID the collector reads data through the sysfs files below `/sys/class/infiniband/<device>`. (See: <https://www.kernel.org/doc/Documentation/ABI/stable/sysfs-class-infiniband>)
|
For each found LID the collector reads data through the sysfs files below `/sys/class/infiniband/<device>`. (See: <https://www.kernel.org/doc/Documentation/ABI/stable/sysfs-class-infiniband>)
|
||||||
|
|
||||||
Metrics:
|
Both filtering mechanisms are supported:
|
||||||
|
- `exclude_metrics`: Excludes the specified metrics.
|
||||||
|
- `only_metrics`: If non-empty, only metrics whose name exactly matches a listed entry are sent, and `exclude_metrics` is ignored. Derived metrics must be listed under their own name (e.g. `ib_recv_bw`).
|
||||||
|
|
||||||
* `ib_recv`
|
**Absolute Metrics** (if `send_abs_values` is enabled):
|
||||||
* `ib_xmit`
|
- `ib_recv` (unit: `bytes`)
|
||||||
* `ib_recv_pkts`
|
- `ib_xmit` (unit: `bytes`)
|
||||||
* `ib_xmit_pkts`
|
- `ib_recv_pkts` (unit: `packets`)
|
||||||
* `ib_total = ib_recv + ib_xmit` (if `send_total_values == true`)
|
- `ib_xmit_pkts` (unit: `packets`)
|
||||||
* `ib_total_pkts = ib_recv_pkts + ib_xmit_pkts` (if `send_total_values == true`)
|
|
||||||
* `ib_recv_bw` (if `send_derived_values == true`)
|
**Derived Metrics** (if `send_derived_values` is enabled):
|
||||||
* `ib_xmit_bw` (if `send_derived_values == true`)
|
- `ib_recv_bw` (unit: `bytes/sec`)
|
||||||
* `ib_recv_pkts_bw` (if `send_derived_values == true`)
|
- `ib_xmit_bw` (unit: `bytes/sec`)
|
||||||
* `ib_xmit_pkts_bw` (if `send_derived_values == true`)
|
- `ib_recv_pkts_bw` (unit: `packets/sec`)
|
||||||
|
- `ib_xmit_pkts_bw` (unit: `packets/sec`)
|
||||||
|
|
||||||
|
**Total metrics** (per port, if `send_total_values` is enabled):
|
||||||
|
- `ib_total` = ib_recv + ib_xmit (unit: `bytes`)
|
||||||
|
- `ib_total_pkts` = ib_recv_pkts + ib_xmit_pkts (unit: `packets`)
|
||||||
|
|
||||||
The collector adds a `device` tag to all metrics
|
The collector adds `device`, `port` and `lid` tags to all metrics
|
||||||
|
Loading…
x
Reference in New Issue
Block a user