Merge development branch to main (#141)

* Remove go-toolkit as build requirement for RPM builds if run in CI

* Remove condition around BuildRequires and use go-toolkit for RPM builds

* use go-toolkit for RPM builds

* Install go-toolkit to fulfill build requirements for RPM

* Add golang-race for UBI9 and Alma9

* Fix wrongly named packages

* Fix wrongly named packages

* Fix Release part

* Fix Release part

* Fix documentation of RAPL collector

* Mark all JSON config fields of message processor as omitempty

* Generate HUGO inputs out of Markdown files

* Check creation of CCMessage in NATS receiver

* Use CCMessage FromBytes instead of Influx's decoder

* Rename 'process_message' to 'process_messages' in metricRouter config

This makes the behavior more consistent with the other modules, which
have their MessageProcessor named 'process_messages'. This most likely
was just a typo.

* Add optional interface alias in netstat (#130)

* Check creation of CCMessage in NATS receiver

* add optional interface aliases for netstatMetric

* small fix

---------

Co-authored-by: Thomas Roehl <thomas.roehl@fau.de>
Co-authored-by: exterr2f <Robert.Externbrink@rub.de>
Co-authored-by: Thomas Gruber <Thomas.Roehl@googlemail.com>

* Fix excluded metrics for diskstat and add exclude_mounts (#131)

* Check creation of CCMessage in NATS receiver

* fix excluded metrics and add optional mountpoint exclude

---------

Co-authored-by: Thomas Roehl <thomas.roehl@fau.de>
Co-authored-by: exterr2f <Robert.Externbrink@rub.de>
Co-authored-by: Thomas Gruber <Thomas.Roehl@googlemail.com>

* Add derived values for nfsiostat (#132)

* Check creation of CCMessage in NATS receiver

* add derived_values for nfsiostatMetric

---------

Co-authored-by: Thomas Roehl <thomas.roehl@fau.de>
Co-authored-by: exterr2f <Robert.Externbrink@rub.de>
Co-authored-by: Thomas Gruber <Thomas.Roehl@googlemail.com>

* Add exclude_devices to iostat (#133)

* Check creation of CCMessage in NATS receiver

* add exclude_device for iostatMetric

* add md file

---------

Co-authored-by: Thomas Roehl <thomas.roehl@fau.de>
Co-authored-by: exterr2f <Robert.Externbrink@rub.de>
Co-authored-by: Thomas Gruber <Thomas.Roehl@googlemail.com>

* Add derived_values for numastats (#134)

* Check creation of CCMessage in NATS receiver

* add derived_values for numastats

* change to ccMessage

* remove vim command artefact

---------

Co-authored-by: Thomas Roehl <thomas.roehl@fau.de>
Co-authored-by: exterr2f <Robert.Externbrink@rub.de>
Co-authored-by: Thomas Gruber <Thomas.Roehl@googlemail.com>

* Fix artifacts of not done cc-lib switch

* Fix artifacts in netstat collector of not done cc-lib switch

* Change to cc-lib (#135)

* Change to ccMessage from cc-lib

* Remove local development path

* Use receiver, sinks, ccLogger and ccConfig from cc-lib

* Fix ccLogger import path

* Update CI

* Delete mountpoint when it vanishes, not just its data (#137)

---------

Co-authored-by: Michael Panzlaff <michael.panzlaff@fau.de>
Co-authored-by: brinkcoder <Robert.Externbrink@ruhr-uni-bochum.de>
Co-authored-by: exterr2f <Robert.Externbrink@rub.de>
This commit is contained in:
Thomas Gruber
2025-04-16 13:38:11 +02:00
committed by GitHub
parent c9f2378813
commit bd04e19c96
91 changed files with 628 additions and 8698 deletions

View File

@@ -14,8 +14,8 @@ import (
"strings"
"time"
cclog "github.com/ClusterCockpit/cc-metric-collector/pkg/ccLogger"
lp "github.com/ClusterCockpit/cc-energy-manager/pkg/cc-message"
cclog "github.com/ClusterCockpit/cc-lib/ccLogger"
lp "github.com/ClusterCockpit/cc-lib/ccMessage"
)
const DEFAULT_BEEGFS_CMD = "beegfs-ctl"

View File

@@ -14,8 +14,8 @@ import (
"strings"
"time"
cclog "github.com/ClusterCockpit/cc-metric-collector/pkg/ccLogger"
lp "github.com/ClusterCockpit/cc-energy-manager/pkg/cc-message"
cclog "github.com/ClusterCockpit/cc-lib/ccLogger"
lp "github.com/ClusterCockpit/cc-lib/ccMessage"
)
// Struct for the collector-specific JSON config

View File

@@ -2,12 +2,11 @@ package collectors
import (
"encoding/json"
"os"
"sync"
"time"
lp "github.com/ClusterCockpit/cc-energy-manager/pkg/cc-message"
cclog "github.com/ClusterCockpit/cc-metric-collector/pkg/ccLogger"
cclog "github.com/ClusterCockpit/cc-lib/ccLogger"
lp "github.com/ClusterCockpit/cc-lib/ccMessage"
mct "github.com/ClusterCockpit/cc-metric-collector/pkg/multiChanTicker"
)
@@ -59,7 +58,7 @@ type collectorManager struct {
// Metric collector manager access functions
type CollectorManager interface {
Init(ticker mct.MultiChanTicker, duration time.Duration, wg *sync.WaitGroup, collectConfigFile string) error
Init(ticker mct.MultiChanTicker, duration time.Duration, wg *sync.WaitGroup, collectConfig json.RawMessage) error
AddOutput(output chan lp.CCMessage)
Start()
Close()
@@ -72,7 +71,7 @@ type CollectorManager interface {
// * ticker (from variable ticker)
// * configuration (read from config file in variable collectConfigFile)
// Initialization is done for all configured collectors
func (cm *collectorManager) Init(ticker mct.MultiChanTicker, duration time.Duration, wg *sync.WaitGroup, collectConfigFile string) error {
func (cm *collectorManager) Init(ticker mct.MultiChanTicker, duration time.Duration, wg *sync.WaitGroup, collectConfig json.RawMessage) error {
cm.collectors = make([]MetricCollector, 0)
cm.serial = make([]MetricCollector, 0)
cm.output = nil
@@ -81,15 +80,7 @@ func (cm *collectorManager) Init(ticker mct.MultiChanTicker, duration time.Durat
cm.ticker = ticker
cm.duration = duration
// Read collector config file
configFile, err := os.Open(collectConfigFile)
if err != nil {
cclog.Error(err.Error())
return err
}
defer configFile.Close()
jsonParser := json.NewDecoder(configFile)
err = jsonParser.Decode(&cm.config)
err := json.Unmarshal(collectConfig, &cm.config)
if err != nil {
cclog.Error(err.Error())
return err
@@ -200,9 +191,9 @@ func (cm *collectorManager) Close() {
}
// New creates a new initialized metric collector manager
func New(ticker mct.MultiChanTicker, duration time.Duration, wg *sync.WaitGroup, collectConfigFile string) (CollectorManager, error) {
func New(ticker mct.MultiChanTicker, duration time.Duration, wg *sync.WaitGroup, collectConfig json.RawMessage) (CollectorManager, error) {
cm := new(collectorManager)
err := cm.Init(ticker, duration, wg, collectConfigFile)
err := cm.Init(ticker, duration, wg, collectConfig)
if err != nil {
return nil, err
}

View File

@@ -10,8 +10,8 @@ import (
"strings"
"time"
lp "github.com/ClusterCockpit/cc-energy-manager/pkg/cc-message"
cclog "github.com/ClusterCockpit/cc-metric-collector/pkg/ccLogger"
cclog "github.com/ClusterCockpit/cc-lib/ccLogger"
lp "github.com/ClusterCockpit/cc-lib/ccMessage"
)
// CPUFreqCollector

View File

@@ -9,8 +9,8 @@ import (
"strings"
"time"
cclog "github.com/ClusterCockpit/cc-metric-collector/pkg/ccLogger"
lp "github.com/ClusterCockpit/cc-energy-manager/pkg/cc-message"
cclog "github.com/ClusterCockpit/cc-lib/ccLogger"
lp "github.com/ClusterCockpit/cc-lib/ccMessage"
"github.com/ClusterCockpit/cc-metric-collector/pkg/ccTopology"
"golang.org/x/sys/unix"
)

View File

@@ -9,8 +9,8 @@ import (
"strings"
"time"
lp "github.com/ClusterCockpit/cc-energy-manager/pkg/cc-message"
cclog "github.com/ClusterCockpit/cc-metric-collector/pkg/ccLogger"
cclog "github.com/ClusterCockpit/cc-lib/ccLogger"
lp "github.com/ClusterCockpit/cc-lib/ccMessage"
sysconf "github.com/tklauser/go-sysconf"
)

View File

@@ -9,7 +9,7 @@ import (
"strings"
"time"
lp "github.com/ClusterCockpit/cc-energy-manager/pkg/cc-message"
lp "github.com/ClusterCockpit/cc-lib/ccMessage"
influx "github.com/influxdata/line-protocol"
)

View File

@@ -8,23 +8,21 @@ import (
"syscall"
"time"
cclog "github.com/ClusterCockpit/cc-metric-collector/pkg/ccLogger"
lp "github.com/ClusterCockpit/cc-energy-manager/pkg/cc-message"
cclog "github.com/ClusterCockpit/cc-lib/ccLogger"
lp "github.com/ClusterCockpit/cc-lib/ccMessage"
)
// "log"
const MOUNTFILE = `/proc/self/mounts`
type DiskstatCollectorConfig struct {
ExcludeMetrics []string `json:"exclude_metrics,omitempty"`
ExcludeMounts []string `json:"exclude_mounts,omitempty"`
}
type DiskstatCollector struct {
metricCollector
//matches map[string]int
config IOstatCollectorConfig
//devices map[string]IOstatCollectorEntry
config DiskstatCollectorConfig
allowedMetrics map[string]bool
}
func (m *DiskstatCollector) Init(config json.RawMessage) error {
@@ -33,12 +31,21 @@ func (m *DiskstatCollector) Init(config json.RawMessage) error {
m.meta = map[string]string{"source": m.name, "group": "Disk"}
m.setup()
if len(config) > 0 {
err := json.Unmarshal(config, &m.config)
if err != nil {
if err := json.Unmarshal(config, &m.config); err != nil {
return err
}
}
file, err := os.Open(string(MOUNTFILE))
m.allowedMetrics = map[string]bool{
"disk_total": true,
"disk_free": true,
"part_max_used": true,
}
for _, excl := range m.config.ExcludeMetrics {
if _, ok := m.allowedMetrics[excl]; ok {
m.allowedMetrics[excl] = false
}
}
file, err := os.Open(MOUNTFILE)
if err != nil {
cclog.ComponentError(m.name, err.Error())
return err
@@ -53,7 +60,7 @@ func (m *DiskstatCollector) Read(interval time.Duration, output chan lp.CCMessag
return
}
file, err := os.Open(string(MOUNTFILE))
file, err := os.Open(MOUNTFILE)
if err != nil {
cclog.ComponentError(m.name, err.Error())
return
@@ -62,6 +69,7 @@ func (m *DiskstatCollector) Read(interval time.Duration, output chan lp.CCMessag
part_max_used := uint64(0)
scanner := bufio.NewScanner(file)
mountLoop:
for scanner.Scan() {
line := scanner.Text()
if len(line) == 0 {
@@ -77,13 +85,17 @@ func (m *DiskstatCollector) Read(interval time.Duration, output chan lp.CCMessag
if strings.Contains(linefields[1], "boot") {
continue
}
path := strings.Replace(linefields[1], `\040`, " ", -1)
stat := syscall.Statfs_t{
Blocks: 0,
Bsize: 0,
Bfree: 0,
mountPath := strings.Replace(linefields[1], `\040`, " ", -1)
for _, excl := range m.config.ExcludeMounts {
if strings.Contains(mountPath, excl) {
continue mountLoop
}
}
err := syscall.Statfs(path, &stat)
stat := syscall.Statfs_t{}
err := syscall.Statfs(mountPath, &stat)
if err != nil {
continue
}
@@ -92,16 +104,20 @@ func (m *DiskstatCollector) Read(interval time.Duration, output chan lp.CCMessag
}
tags := map[string]string{"type": "node", "device": linefields[0]}
total := (stat.Blocks * uint64(stat.Bsize)) / uint64(1000000000)
y, err := lp.NewMessage("disk_total", tags, m.meta, map[string]interface{}{"value": total}, time.Now())
if err == nil {
y.AddMeta("unit", "GBytes")
output <- y
if m.allowedMetrics["disk_total"] {
y, err := lp.NewMessage("disk_total", tags, m.meta, map[string]interface{}{"value": total}, time.Now())
if err == nil {
y.AddMeta("unit", "GBytes")
output <- y
}
}
free := (stat.Bfree * uint64(stat.Bsize)) / uint64(1000000000)
y, err = lp.NewMessage("disk_free", tags, m.meta, map[string]interface{}{"value": free}, time.Now())
if err == nil {
y.AddMeta("unit", "GBytes")
output <- y
if m.allowedMetrics["disk_free"] {
y, err := lp.NewMessage("disk_free", tags, m.meta, map[string]interface{}{"value": free}, time.Now())
if err == nil {
y.AddMeta("unit", "GBytes")
output <- y
}
}
if total > 0 {
perc := (100 * (total - free)) / total
@@ -110,10 +126,12 @@ func (m *DiskstatCollector) Read(interval time.Duration, output chan lp.CCMessag
}
}
}
y, err := lp.NewMessage("part_max_used", map[string]string{"type": "node"}, m.meta, map[string]interface{}{"value": int(part_max_used)}, time.Now())
if err == nil {
y.AddMeta("unit", "percent")
output <- y
if m.allowedMetrics["part_max_used"] {
y, err := lp.NewMessage("part_max_used", map[string]string{"type": "node"}, m.meta, map[string]interface{}{"value": int(part_max_used)}, time.Now())
if err == nil {
y.AddMeta("unit", "percent")
output <- y
}
}
}

View File

@@ -6,10 +6,13 @@
"exclude_metrics": [
"disk_total"
],
"exclude_mounts": [
"slurm-tmpfs"
]
}
```
The `diskstat` collector reads data from `/proc/self/mounts` and outputs a handful **node** metrics. If a metric is not required, it can be excluded from forwarding it to the sink.
The `diskstat` collector reads data from `/proc/self/mounts` and outputs a handful **node** metrics. If a metric is not required, it can be excluded from forwarding it to the sink. Additionally, any mount point containing one of the strings specified in `exclude_mounts` will be skipped during metric collection.
Metrics per device (with `device` tag):
* `disk_total` (unit `GBytes`)

View File

@@ -13,8 +13,8 @@ import (
"strings"
"time"
cclog "github.com/ClusterCockpit/cc-metric-collector/pkg/ccLogger"
lp "github.com/ClusterCockpit/cc-energy-manager/pkg/cc-message"
cclog "github.com/ClusterCockpit/cc-lib/ccLogger"
lp "github.com/ClusterCockpit/cc-lib/ccMessage"
)
const DEFAULT_GPFS_CMD = "mmpmon"

View File

@@ -4,8 +4,8 @@ import (
"fmt"
"os"
cclog "github.com/ClusterCockpit/cc-metric-collector/pkg/ccLogger"
lp "github.com/ClusterCockpit/cc-energy-manager/pkg/cc-message"
cclog "github.com/ClusterCockpit/cc-lib/ccLogger"
lp "github.com/ClusterCockpit/cc-lib/ccMessage"
"golang.org/x/sys/unix"
"encoding/json"

View File

@@ -2,24 +2,24 @@ package collectors
import (
"bufio"
"os"
cclog "github.com/ClusterCockpit/cc-metric-collector/pkg/ccLogger"
lp "github.com/ClusterCockpit/cc-energy-manager/pkg/cc-message"
// "log"
"encoding/json"
"errors"
"os"
"strconv"
"strings"
"time"
cclog "github.com/ClusterCockpit/cc-lib/ccLogger"
lp "github.com/ClusterCockpit/cc-lib/ccMessage"
)
// Konstante für den Pfad zu /proc/diskstats
const IOSTATFILE = `/proc/diskstats`
const IOSTAT_SYSFSPATH = `/sys/block`
type IOstatCollectorConfig struct {
ExcludeMetrics []string `json:"exclude_metrics,omitempty"`
// Neues Feld zum Ausschließen von Devices per JSON-Konfiguration
ExcludeDevices []string `json:"exclude_devices,omitempty"`
}
type IOstatCollectorEntry struct {
@@ -76,7 +76,7 @@ func (m *IOstatCollector) Init(config json.RawMessage) error {
if len(m.matches) == 0 {
return errors.New("no metrics to collect")
}
file, err := os.Open(string(IOSTATFILE))
file, err := os.Open(IOSTATFILE)
if err != nil {
cclog.ComponentError(m.name, err.Error())
return err
@@ -87,17 +87,24 @@ func (m *IOstatCollector) Init(config json.RawMessage) error {
for scanner.Scan() {
line := scanner.Text()
linefields := strings.Fields(line)
if len(linefields) < 3 {
continue
}
device := linefields[2]
if strings.Contains(device, "loop") {
continue
}
if _, skip := stringArrayContains(m.config.ExcludeDevices, device); skip {
continue
}
values := make(map[string]int64)
for m := range m.matches {
values[m] = 0
}
m.devices[device] = IOstatCollectorEntry{
tags: map[string]string{
"device": linefields[2],
"device": device,
"type": "node",
},
lastValues: values,
@@ -112,7 +119,7 @@ func (m *IOstatCollector) Read(interval time.Duration, output chan lp.CCMessage)
return
}
file, err := os.Open(string(IOSTATFILE))
file, err := os.Open(IOSTATFILE)
if err != nil {
cclog.ComponentError(m.name, err.Error())
return
@@ -126,10 +133,16 @@ func (m *IOstatCollector) Read(interval time.Duration, output chan lp.CCMessage)
continue
}
linefields := strings.Fields(line)
if len(linefields) < 3 {
continue
}
device := linefields[2]
if strings.Contains(device, "loop") {
continue
}
if _, skip := stringArrayContains(m.config.ExcludeDevices, device); skip {
continue
}
if _, ok := m.devices[device]; !ok {
continue
}

View File

@@ -4,12 +4,17 @@
```json
"iostat": {
"exclude_metrics": [
"read_ms"
"io_read_ms"
],
"exclude_devices": [
"nvme0n1p1",
"nvme0n1p2",
"md127"
]
}
```
The `iostat` collector reads data from `/proc/diskstats` and outputs a handful **node** metrics. If a metric is not required, it can be excluded from forwarding it to the sink.
The `iostat` collector reads data from `/proc/diskstats` and outputs a handful **node** metrics. If a metric or device is not required, it can be excluded from forwarding it to the sink.
Metrics:
* `io_reads`

View File

@@ -13,8 +13,8 @@ import (
"strings"
"time"
cclog "github.com/ClusterCockpit/cc-metric-collector/pkg/ccLogger"
lp "github.com/ClusterCockpit/cc-energy-manager/pkg/cc-message"
cclog "github.com/ClusterCockpit/cc-lib/ccLogger"
lp "github.com/ClusterCockpit/cc-lib/ccMessage"
)
const IPMISENSORS_PATH = `ipmi-sensors`

View File

@@ -24,9 +24,9 @@ import (
"time"
"unsafe"
lp "github.com/ClusterCockpit/cc-energy-manager/pkg/cc-message"
cclog "github.com/ClusterCockpit/cc-lib/ccLogger"
lp "github.com/ClusterCockpit/cc-lib/ccMessage"
agg "github.com/ClusterCockpit/cc-metric-collector/internal/metricAggregator"
cclog "github.com/ClusterCockpit/cc-metric-collector/pkg/ccLogger"
topo "github.com/ClusterCockpit/cc-metric-collector/pkg/ccTopology"
"github.com/NVIDIA/go-nvml/pkg/dl"
"github.com/fsnotify/fsnotify"

View File

@@ -8,8 +8,8 @@ import (
"strings"
"time"
lp "github.com/ClusterCockpit/cc-energy-manager/pkg/cc-message"
cclog "github.com/ClusterCockpit/cc-metric-collector/pkg/ccLogger"
cclog "github.com/ClusterCockpit/cc-lib/ccLogger"
lp "github.com/ClusterCockpit/cc-lib/ccMessage"
)
// LoadavgCollector collects:

View File

@@ -10,8 +10,8 @@ import (
"strings"
"time"
cclog "github.com/ClusterCockpit/cc-metric-collector/pkg/ccLogger"
lp "github.com/ClusterCockpit/cc-energy-manager/pkg/cc-message"
cclog "github.com/ClusterCockpit/cc-lib/ccLogger"
lp "github.com/ClusterCockpit/cc-lib/ccMessage"
)
const LUSTRE_SYSFS = `/sys/fs/lustre`

View File

@@ -12,8 +12,8 @@ import (
"strings"
"time"
cclog "github.com/ClusterCockpit/cc-metric-collector/pkg/ccLogger"
lp "github.com/ClusterCockpit/cc-energy-manager/pkg/cc-message"
cclog "github.com/ClusterCockpit/cc-lib/ccLogger"
lp "github.com/ClusterCockpit/cc-lib/ccMessage"
)
const MEMSTATFILE = "/proc/meminfo"

View File

@@ -5,7 +5,7 @@ import (
"fmt"
"time"
lp "github.com/ClusterCockpit/cc-energy-manager/pkg/cc-message"
lp "github.com/ClusterCockpit/cc-lib/ccMessage"
)
type MetricCollector interface {
@@ -14,7 +14,7 @@ type MetricCollector interface {
Initialized() bool // Is metric collector initialized?
Parallel() bool
Read(duration time.Duration, output chan lp.CCMessage) // Read metrics from metric collector
Close() // Close / finish metric collector
Close() // Close / finish metric collector
}
type metricCollector struct {

View File

@@ -9,16 +9,17 @@ import (
"strings"
"time"
cclog "github.com/ClusterCockpit/cc-metric-collector/pkg/ccLogger"
lp "github.com/ClusterCockpit/cc-energy-manager/pkg/cc-message"
cclog "github.com/ClusterCockpit/cc-lib/ccLogger"
lp "github.com/ClusterCockpit/cc-lib/ccMessage"
)
const NETSTATFILE = "/proc/net/dev"
type NetstatCollectorConfig struct {
IncludeDevices []string `json:"include_devices"`
SendAbsoluteValues bool `json:"send_abs_values"`
SendDerivedValues bool `json:"send_derived_values"`
IncludeDevices []string `json:"include_devices"`
SendAbsoluteValues bool `json:"send_abs_values"`
SendDerivedValues bool `json:"send_derived_values"`
InterfaceAliases map[string][]string `json:"interface_aliases,omitempty"`
}
type NetstatCollectorMetric struct {
@@ -32,9 +33,26 @@ type NetstatCollectorMetric struct {
type NetstatCollector struct {
metricCollector
config NetstatCollectorConfig
matches map[string][]NetstatCollectorMetric
lastTimestamp time.Time
config NetstatCollectorConfig
aliasToCanonical map[string]string
matches map[string][]NetstatCollectorMetric
lastTimestamp time.Time
}
func (m *NetstatCollector) buildAliasMapping() {
m.aliasToCanonical = make(map[string]string)
for canon, aliases := range m.config.InterfaceAliases {
for _, alias := range aliases {
m.aliasToCanonical[alias] = canon
}
}
}
func getCanonicalName(raw string, aliasToCanonical map[string]string) string {
if canon, ok := aliasToCanonical[raw]; ok {
return canon
}
return raw
}
func (m *NetstatCollector) Init(config json.RawMessage) error {
@@ -77,6 +95,8 @@ func (m *NetstatCollector) Init(config json.RawMessage) error {
}
}
m.buildAliasMapping()
// Check access to net statistic file
file, err := os.Open(NETSTATFILE)
if err != nil {
@@ -97,18 +117,20 @@ func (m *NetstatCollector) Init(config json.RawMessage) error {
// Split line into fields
f := strings.Fields(l)
// Get net device entry
dev := strings.Trim(f[0], ": ")
// Get raw and canonical names
raw := strings.Trim(f[0], ": ")
canonical := getCanonicalName(raw, m.aliasToCanonical)
// Check if device is a included device
if _, ok := stringArrayContains(m.config.IncludeDevices, dev); ok {
tags := map[string]string{"stype": "network", "stype-id": dev, "type": "node"}
if _, ok := stringArrayContains(m.config.IncludeDevices, canonical); ok {
// Tag will contain original device name (raw).
tags := map[string]string{"stype": "network", "stype-id": raw, "type": "node"}
meta_unit_byte := map[string]string{"source": m.name, "group": "Network", "unit": "bytes"}
meta_unit_byte_per_sec := map[string]string{"source": m.name, "group": "Network", "unit": "bytes/sec"}
meta_unit_pkts := map[string]string{"source": m.name, "group": "Network", "unit": "packets"}
meta_unit_pkts_per_sec := map[string]string{"source": m.name, "group": "Network", "unit": "packets/sec"}
m.matches[dev] = []NetstatCollectorMetric{
m.matches[canonical] = []NetstatCollectorMetric{
{
name: "net_bytes_in",
index: fieldReceiveBytes,
@@ -143,7 +165,6 @@ func (m *NetstatCollector) Init(config json.RawMessage) error {
},
}
}
}
if len(m.matches) == 0 {
@@ -164,7 +185,7 @@ func (m *NetstatCollector) Read(interval time.Duration, output chan lp.CCMessage
// Save current timestamp
m.lastTimestamp = now
file, err := os.Open(string(NETSTATFILE))
file, err := os.Open(NETSTATFILE)
if err != nil {
cclog.ComponentError(m.name, err.Error())
return
@@ -183,11 +204,12 @@ func (m *NetstatCollector) Read(interval time.Duration, output chan lp.CCMessage
// Split line into fields
f := strings.Fields(l)
// Get net device entry
dev := strings.Trim(f[0], ":")
// Get raw and canonical names
raw := strings.Trim(f[0], ":")
canonical := getCanonicalName(raw, m.aliasToCanonical)
// Check if device is a included device
if devmetrics, ok := m.matches[dev]; ok {
if devmetrics, ok := m.matches[canonical]; ok {
for i := range devmetrics {
metric := &devmetrics[i]

View File

@@ -4,14 +4,19 @@
```json
"netstat": {
"include_devices": [
"eth0"
"eth0",
"eno1"
],
"send_abs_values" : true,
"send_derived_values" : true
"send_abs_values": true,
"send_derived_values": true,
"interface_aliases": {
"eno1": ["eno1np0", "eno1_alt"],
"eth0": ["eth0_alias"]
}
}
```
The `netstat` collector reads data from `/proc/net/dev` and outputs a handful **node** metrics. With the `include_devices` list you can specify which network devices should be measured. **Note**: Most other collectors use an _exclude_ list instead of an include list.
The `netstat` collector reads data from `/proc/net/dev` and outputs a handful **node** metrics. With the `include_devices` list you can specify which network devices should be measured. **Note**: Most other collectors use an _exclude_ list instead of an include list. Optionally, you can define an interface_aliases mapping. For each canonical device (as listed in include_devices), you may provide an array of aliases that may be reported by the system. When an alias is detected, it is preferred for matching, while the output tag stype-id always shows the actual system-reported name.
Metrics:
* `net_bytes_in` (`unit=bytes`)
@@ -23,5 +28,4 @@ Metrics:
* `net_pkts_in_bw` (`unit=packets/sec` if `send_derived_values == true`)
* `net_pkts_out_bw` (`unit=packets/sec` if `send_derived_values == true`)
The device name is added as tag `stype=network,stype-id=<device>`.
The device name is added as tag `stype=network,stype-id=<device>`.

View File

@@ -11,7 +11,7 @@ import (
"strings"
"time"
lp "github.com/ClusterCockpit/cc-energy-manager/pkg/cc-message"
lp "github.com/ClusterCockpit/cc-lib/ccMessage"
)
// First part contains the code for the general NfsCollector.

View File

@@ -9,8 +9,8 @@ import (
"strings"
"time"
cclog "github.com/ClusterCockpit/cc-metric-collector/pkg/ccLogger"
lp "github.com/ClusterCockpit/cc-energy-manager/pkg/cc-message"
cclog "github.com/ClusterCockpit/cc-lib/ccLogger"
lp "github.com/ClusterCockpit/cc-lib/ccMessage"
)
// These are the fields we read from the JSON configuration
@@ -18,17 +18,20 @@ type NfsIOStatCollectorConfig struct {
ExcludeMetrics []string `json:"exclude_metrics,omitempty"`
ExcludeFilesystem []string `json:"exclude_filesystem,omitempty"`
UseServerAddressAsSType bool `json:"use_server_as_stype,omitempty"`
SendAbsoluteValues bool `json:"send_abs_values"`
SendDerivedValues bool `json:"send_derived_values"`
}
// This contains all variables we need during execution and the variables
// defined by metricCollector (name, init, ...)
type NfsIOStatCollector struct {
metricCollector
config NfsIOStatCollectorConfig // the configuration structure
meta map[string]string // default meta information
tags map[string]string // default tags
data map[string]map[string]int64 // data storage for difference calculation
key string // which device info should be used as subtype ID? 'server' or 'mntpoint', see NfsIOStatCollectorConfig.UseServerAddressAsSType
config NfsIOStatCollectorConfig // the configuration structure
meta map[string]string // default meta information
tags map[string]string // default tags
data map[string]map[string]int64 // data storage for difference calculation
key string // which device info should be used as subtype ID? 'server' or 'mntpoint'
lastTimestamp time.Time
}
var deviceRegex = regexp.MustCompile(`device (?P<server>[^ ]+) mounted on (?P<mntpoint>[^ ]+) with fstype nfs(?P<version>\d*) statvers=[\d\.]+`)
@@ -81,7 +84,6 @@ func (m *NfsIOStatCollector) readNfsiostats() map[string]map[string]int64 {
data[current[m.key]][name] = val
}
}
}
current = nil
}
@@ -98,6 +100,9 @@ func (m *NfsIOStatCollector) Init(config json.RawMessage) error {
m.meta = map[string]string{"source": m.name, "group": "NFS", "unit": "bytes"}
m.tags = map[string]string{"type": "node"}
m.config.UseServerAddressAsSType = false
// Set default configuration
m.config.SendAbsoluteValues = true
m.config.SendDerivedValues = false
if len(config) > 0 {
err = json.Unmarshal(config, &m.config)
if err != nil {
@@ -110,12 +115,15 @@ func (m *NfsIOStatCollector) Init(config json.RawMessage) error {
m.key = "server"
}
m.data = m.readNfsiostats()
m.lastTimestamp = time.Now()
m.init = true
return err
}
func (m *NfsIOStatCollector) Read(interval time.Duration, output chan lp.CCMessage) {
timestamp := time.Now()
now := time.Now()
timeDiff := now.Sub(m.lastTimestamp).Seconds()
m.lastTimestamp = now
// Get the current values for all mountpoints
newdata := m.readNfsiostats()
@@ -123,21 +131,30 @@ func (m *NfsIOStatCollector) Read(interval time.Duration, output chan lp.CCMessa
for mntpoint, values := range newdata {
// Was the mount point already present in the last iteration
if old, ok := m.data[mntpoint]; ok {
// Calculate the difference of old and new values
for i := range values {
x := values[i] - old[i]
y, err := lp.NewMessage(fmt.Sprintf("nfsio_%s", i), m.tags, m.meta, map[string]interface{}{"value": x}, timestamp)
if err == nil {
if strings.HasPrefix(i, "page") {
y.AddMeta("unit", "4K_Pages")
for name, newVal := range values {
if m.config.SendAbsoluteValues {
msg, err := lp.NewMessage(fmt.Sprintf("nfsio_%s", name), m.tags, m.meta, map[string]interface{}{"value": newVal}, now)
if err == nil {
msg.AddTag("stype", "filesystem")
msg.AddTag("stype-id", mntpoint)
output <- msg
}
y.AddTag("stype", "filesystem")
y.AddTag("stype-id", mntpoint)
// Send it to output channel
output <- y
}
// Update old to the new value for the next iteration
old[i] = values[i]
if m.config.SendDerivedValues {
rate := float64(newVal-old[name]) / timeDiff
msg, err := lp.NewMessage(fmt.Sprintf("nfsio_%s_bw", name), m.tags, m.meta, map[string]interface{}{"value": rate}, now)
if err == nil {
if strings.HasPrefix(name, "page") {
msg.AddMeta("unit", "4K_pages/s")
} else {
msg.AddMeta("unit", "bytes/sec")
}
msg.AddTag("stype", "filesystem")
msg.AddTag("stype-id", mntpoint)
output <- msg
}
}
old[name] = newVal
}
} else {
// First time we see this mount point, store all values
@@ -154,10 +171,9 @@ func (m *NfsIOStatCollector) Read(interval time.Duration, output chan lp.CCMessa
}
}
if !found {
m.data[mntpoint] = nil
delete(m.data, mntpoint)
}
}
}
func (m *NfsIOStatCollector) Close() {

View File

@@ -3,16 +3,18 @@
```json
"nfsiostat": {
"exclude_metrics": [
"nfsio_oread"
"oread", "pageread"
],
"exclude_filesystems" : [
"/mnt",
"exclude_filesystems": [
"/mnt"
],
"use_server_as_stype": false
"use_server_as_stype": false,
"send_abs_values": false,
"send_derived_values": true
}
```
The `nfsiostat` collector reads data from `/proc/self/mountstats` and outputs a handful **node** metrics for each NFS filesystem. If a metric or filesystem is not required, it can be excluded from forwarding it to the sink.
The `nfsiostat` collector reads data from `/proc/self/mountstats` and outputs a handful **node** metrics for each NFS filesystem. If a metric or filesystem is not required, it can be excluded from forwarding it to the sink. **Note:** When excluding metrics, you must provide the base metric name (e.g. pageread) without the nfsio_ prefix. This exclusion applies to both absolute and derived values.
Metrics:
* `nfsio_nread`: Bytes transferred by normal `read()` calls
@@ -24,4 +26,9 @@ Metrics:
* `nfsio_nfsread`: Bytes transferred for reading from the server
* `nfsio_nfswrite`: Pages transferred by writing to the server
The `nfsiostat` collector adds the mountpoint to the tags as `stype=filesystem,stype-id=<mountpoint>`. If the server address should be used instead of the mountpoint, use the `use_server_as_stype` config setting.
For each of these, if derived values are enabled, an additional metric is sent with the `_bw` suffix, which represents the rate:
* For normal byte metrics: `unit=bytes/sec`
* For page metrics: `unit=4K_pages/s`
The `nfsiostat` collector adds the mountpoint to the tags as `stype=filesystem,stype-id=<mountpoint>`. If the server address should be used instead of the mountpoint, use the `use_server_as_stype` config setting.

View File

@@ -10,10 +10,15 @@ import (
"strings"
"time"
cclog "github.com/ClusterCockpit/cc-metric-collector/pkg/ccLogger"
lp "github.com/ClusterCockpit/cc-energy-manager/pkg/cc-message"
cclog "github.com/ClusterCockpit/cc-lib/ccLogger"
lp "github.com/ClusterCockpit/cc-lib/ccMessage"
)
type NUMAStatsCollectorConfig struct {
SendAbsoluteValues bool `json:"send_abs_values"`
SendDerivedValues bool `json:"send_derived_values"`
}
// Non-Uniform Memory Access (NUMA) policy hit/miss statistics
//
// numa_hit:
@@ -47,13 +52,16 @@ import (
//
// See: https://www.kernel.org/doc/html/latest/admin-guide/numastat.html
type NUMAStatsCollectorTopolgy struct {
file string
tagSet map[string]string
file string
tagSet map[string]string
previousValues map[string]int64
}
type NUMAStatsCollector struct {
metricCollector
topology []NUMAStatsCollectorTopolgy
topology []NUMAStatsCollectorTopolgy
config NUMAStatsCollectorConfig
lastTimestamp time.Time
}
func (m *NUMAStatsCollector) Init(config json.RawMessage) error {
@@ -86,8 +94,9 @@ func (m *NUMAStatsCollector) Init(config json.RawMessage) error {
file := filepath.Join(dir, "numastat")
m.topology = append(m.topology,
NUMAStatsCollectorTopolgy{
file: file,
tagSet: map[string]string{"memoryDomain": node},
file: file,
tagSet: map[string]string{"memoryDomain": node},
previousValues: make(map[string]int64),
})
}
@@ -102,23 +111,27 @@ func (m *NUMAStatsCollector) Read(interval time.Duration, output chan lp.CCMessa
return
}
now := time.Now()
timeDiff := now.Sub(m.lastTimestamp).Seconds()
m.lastTimestamp = now
for i := range m.topology {
// Loop for all NUMA domains
t := &m.topology[i]
now := time.Now()
file, err := os.Open(t.file)
if err != nil {
cclog.ComponentError(
m.name,
fmt.Sprintf("Read(): Failed to open file '%s': %v", t.file, err))
return
continue
}
scanner := bufio.NewScanner(file)
// Read line by line
for scanner.Scan() {
split := strings.Fields(scanner.Text())
line := scanner.Text()
split := strings.Fields(line)
if len(split) != 2 {
continue
}
@@ -130,18 +143,38 @@ func (m *NUMAStatsCollector) Read(interval time.Duration, output chan lp.CCMessa
fmt.Sprintf("Read(): Failed to convert %s='%s' to int64: %v", key, split[1], err))
continue
}
y, err := lp.NewMessage(
"numastats_"+key,
t.tagSet,
m.meta,
map[string]interface{}{"value": value},
now,
)
if err == nil {
output <- y
if m.config.SendAbsoluteValues {
msg, err := lp.NewMessage(
"numastats_"+key,
t.tagSet,
m.meta,
map[string]interface{}{"value": value},
now,
)
if err == nil {
output <- msg
}
}
if m.config.SendDerivedValues {
prev, ok := t.previousValues[key]
if ok {
rate := float64(value-prev) / timeDiff
msg, err := lp.NewMessage(
"numastats_"+key+"_rate",
t.tagSet,
m.meta,
map[string]interface{}{"value": rate},
now,
)
if err == nil {
output <- msg
}
}
t.previousValues[key] = value
}
}
file.Close()
}
}

View File

@@ -2,7 +2,10 @@
## `numastat` collector
```json
"numastats": {}
"numastats": {
"send_abs_values" : true,
"send_derived_values" : true
}
```
The `numastat` collector reads data from `/sys/devices/system/node/node*/numastat` and outputs a handful **memoryDomain** metrics. See: <https://www.kernel.org/doc/html/latest/admin-guide/numastat.html>
@@ -15,3 +18,9 @@ Metrics:
* `numastats_local_node`: A process ran on this node's CPU, and got memory from this node.
* `numastats_other_node`: A process ran on a different node's CPU, and got memory from this node.
* `numastats_interleave_hit`: Interleaving wanted to allocate from this node and succeeded.
* `numastats_numa_hit_rate` (if `send_derived_values == true`): Derived rate value per second.
* `numastats_numa_miss_rate` (if `send_derived_values == true`): Derived rate value per second.
* `numastats_numa_foreign_rate` (if `send_derived_values == true`): Derived rate value per second.
* `numastats_local_node_rate` (if `send_derived_values == true`): Derived rate value per second.
* `numastats_other_node_rate` (if `send_derived_values == true`): Derived rate value per second.
* `numastats_interleave_hit_rate` (if `send_derived_values == true`): Derived rate value per second.

View File

@@ -8,8 +8,8 @@ import (
"strings"
"time"
lp "github.com/ClusterCockpit/cc-energy-manager/pkg/cc-message"
cclog "github.com/ClusterCockpit/cc-metric-collector/pkg/ccLogger"
cclog "github.com/ClusterCockpit/cc-lib/ccLogger"
lp "github.com/ClusterCockpit/cc-lib/ccMessage"
"github.com/NVIDIA/go-nvml/pkg/nvml"
)

View File

@@ -9,8 +9,8 @@ import (
"strings"
"time"
cclog "github.com/ClusterCockpit/cc-metric-collector/pkg/ccLogger"
lp "github.com/ClusterCockpit/cc-energy-manager/pkg/cc-message"
cclog "github.com/ClusterCockpit/cc-lib/ccLogger"
lp "github.com/ClusterCockpit/cc-lib/ccMessage"
)
// running average power limit (RAPL) monitoring attributes for a zone

View File

@@ -1,11 +1,9 @@
# Running average power limit (RAPL) metric collector
## `rapl` collector
This collector reads running average power limit (RAPL) monitoring attributes to compute average power consumption metrics. See <https://www.kernel.org/doc/html/latest/power/powercap/powercap.html#monitoring-attributes>.
The Likwid metric collector provides similar functionality.
## Configuration
```json
"rapl": {
"exclude_device_by_id": ["0:1", "0:2"],
@@ -13,6 +11,5 @@ The Likwid metric collector provides similar functionality.
}
```
## Metrics
Metrics:
* `rapl_average_power`: average power consumption in Watt. The average is computed over the entire runtime from the last measurement to the current measurement

View File

@@ -6,8 +6,8 @@ import (
"fmt"
"time"
cclog "github.com/ClusterCockpit/cc-metric-collector/pkg/ccLogger"
lp "github.com/ClusterCockpit/cc-energy-manager/pkg/cc-message"
cclog "github.com/ClusterCockpit/cc-lib/ccLogger"
lp "github.com/ClusterCockpit/cc-lib/ccMessage"
"github.com/ClusterCockpit/go-rocm-smi/pkg/rocm_smi"
)

View File

@@ -4,8 +4,8 @@ import (
"encoding/json"
"time"
lp "github.com/ClusterCockpit/cc-energy-manager/pkg/cc-message"
cclog "github.com/ClusterCockpit/cc-metric-collector/pkg/ccLogger"
cclog "github.com/ClusterCockpit/cc-lib/ccLogger"
lp "github.com/ClusterCockpit/cc-lib/ccMessage"
)
// These are the fields we read from the JSON configuration

View File

@@ -5,8 +5,8 @@ import (
"sync"
"time"
lp "github.com/ClusterCockpit/cc-energy-manager/pkg/cc-message"
cclog "github.com/ClusterCockpit/cc-metric-collector/pkg/ccLogger"
cclog "github.com/ClusterCockpit/cc-lib/ccLogger"
lp "github.com/ClusterCockpit/cc-lib/ccMessage"
)
// These are the fields we read from the JSON configuration

View File

@@ -10,8 +10,8 @@ import (
"strings"
"time"
cclog "github.com/ClusterCockpit/cc-metric-collector/pkg/ccLogger"
lp "github.com/ClusterCockpit/cc-energy-manager/pkg/cc-message"
cclog "github.com/ClusterCockpit/cc-lib/ccLogger"
lp "github.com/ClusterCockpit/cc-lib/ccMessage"
)
const SCHEDSTATFILE = `/proc/schedstat`

View File

@@ -6,8 +6,8 @@ import (
"syscall"
"time"
cclog "github.com/ClusterCockpit/cc-metric-collector/pkg/ccLogger"
lp "github.com/ClusterCockpit/cc-energy-manager/pkg/cc-message"
cclog "github.com/ClusterCockpit/cc-lib/ccLogger"
lp "github.com/ClusterCockpit/cc-lib/ccMessage"
)
type SelfCollectorConfig struct {

View File

@@ -9,8 +9,8 @@ import (
"strings"
"time"
cclog "github.com/ClusterCockpit/cc-metric-collector/pkg/ccLogger"
lp "github.com/ClusterCockpit/cc-energy-manager/pkg/cc-message"
cclog "github.com/ClusterCockpit/cc-lib/ccLogger"
lp "github.com/ClusterCockpit/cc-lib/ccMessage"
)
// See: https://www.kernel.org/doc/html/latest/hwmon/sysfs-interface.html

View File

@@ -9,7 +9,7 @@ import (
"strings"
"time"
lp "github.com/ClusterCockpit/cc-energy-manager/pkg/cc-message"
lp "github.com/ClusterCockpit/cc-lib/ccMessage"
)
const MAX_NUM_PROCS = 10