mirror of
https://github.com/ClusterCockpit/cc-metric-collector.git
synced 2025-07-23 05:11:40 +02:00
Merge development branch to main (#141)
* Remove go-toolkit as build requirement for RPM builds if run in CI * Remove condition around BuildRequires and use go-toolkit for RPM builds * use go-toolkit for RPM builds * Install go-toolkit to fulfill build requirements for RPM * Add golang-race for UBI9 and Alma9 * Fix wrongly named packages * Fix wrongly named packages * Fix Release part * Fix Release part * Fix documentation of RAPL collector * Mark all JSON config fields of message processor as omitempty * Generate HUGO inputs out of Markdown files * Check creation of CCMessage in NATS receiver * Use CCMessage FromBytes instead of Influx's decoder * Rename 'process_message' to 'process_messages' in metricRouter config This makes the behavior more consistent with the other modules, which have their MessageProcessor named 'process_messages'. This most likely was just a typo. * Add optional interface alias in netstat (#130) * Check creation of CCMessage in NATS receiver * add optional interface aliases for netstatMetric * small fix --------- Co-authored-by: Thomas Roehl <thomas.roehl@fau.de> Co-authored-by: exterr2f <Robert.Externbrink@rub.de> Co-authored-by: Thomas Gruber <Thomas.Roehl@googlemail.com> * Fix excluded metrics for diskstat and add exclude_mounts (#131) * Check creation of CCMessage in NATS receiver * fix excluded metrics and add optional mountpoint exclude --------- Co-authored-by: Thomas Roehl <thomas.roehl@fau.de> Co-authored-by: exterr2f <Robert.Externbrink@rub.de> Co-authored-by: Thomas Gruber <Thomas.Roehl@googlemail.com> * Add derived values for nfsiostat (#132) * Check creation of CCMessage in NATS receiver * add derived_values for nfsiostatMetric --------- Co-authored-by: Thomas Roehl <thomas.roehl@fau.de> Co-authored-by: exterr2f <Robert.Externbrink@rub.de> Co-authored-by: Thomas Gruber <Thomas.Roehl@googlemail.com> * Add exclude_devices to iostat (#133) * Check creation of CCMessage in NATS receiver * add exclude_device for iostatMetric * add md file --------- Co-authored-by: Thomas 
Roehl <thomas.roehl@fau.de> Co-authored-by: exterr2f <Robert.Externbrink@rub.de> Co-authored-by: Thomas Gruber <Thomas.Roehl@googlemail.com> * Add derived_values for numastats (#134) * Check creation of CCMessage in NATS receiver * add derived_values for numastats * change to ccMessage * remove vim command artefact --------- Co-authored-by: Thomas Roehl <thomas.roehl@fau.de> Co-authored-by: exterr2f <Robert.Externbrink@rub.de> Co-authored-by: Thomas Gruber <Thomas.Roehl@googlemail.com> * Fix artifacts of not done cc-lib switch * Fix artifacts in netstat collector of not done cc-lib switch * Change to cc-lib (#135) * Change to ccMessage from cc-lib * Remove local development path * Use receiver, sinks, ccLogger and ccConfig from cc-lib * Fix ccLogger import path * Update CI * Delete mountpoint when it vanishes, not just its data (#137) --------- Co-authored-by: Michael Panzlaff <michael.panzlaff@fau.de> Co-authored-by: brinkcoder <Robert.Externbrink@ruhr-uni-bochum.de> Co-authored-by: exterr2f <Robert.Externbrink@rub.de>
This commit is contained in:
@@ -14,8 +14,8 @@ import (
|
||||
"strings"
|
||||
"time"
|
||||
|
||||
cclog "github.com/ClusterCockpit/cc-metric-collector/pkg/ccLogger"
|
||||
lp "github.com/ClusterCockpit/cc-energy-manager/pkg/cc-message"
|
||||
cclog "github.com/ClusterCockpit/cc-lib/ccLogger"
|
||||
lp "github.com/ClusterCockpit/cc-lib/ccMessage"
|
||||
)
|
||||
|
||||
const DEFAULT_BEEGFS_CMD = "beegfs-ctl"
|
||||
|
@@ -14,8 +14,8 @@ import (
|
||||
"strings"
|
||||
"time"
|
||||
|
||||
cclog "github.com/ClusterCockpit/cc-metric-collector/pkg/ccLogger"
|
||||
lp "github.com/ClusterCockpit/cc-energy-manager/pkg/cc-message"
|
||||
cclog "github.com/ClusterCockpit/cc-lib/ccLogger"
|
||||
lp "github.com/ClusterCockpit/cc-lib/ccMessage"
|
||||
)
|
||||
|
||||
// Struct for the collector-specific JSON config
|
||||
|
@@ -2,12 +2,11 @@ package collectors
|
||||
|
||||
import (
|
||||
"encoding/json"
|
||||
"os"
|
||||
"sync"
|
||||
"time"
|
||||
|
||||
lp "github.com/ClusterCockpit/cc-energy-manager/pkg/cc-message"
|
||||
cclog "github.com/ClusterCockpit/cc-metric-collector/pkg/ccLogger"
|
||||
cclog "github.com/ClusterCockpit/cc-lib/ccLogger"
|
||||
lp "github.com/ClusterCockpit/cc-lib/ccMessage"
|
||||
mct "github.com/ClusterCockpit/cc-metric-collector/pkg/multiChanTicker"
|
||||
)
|
||||
|
||||
@@ -59,7 +58,7 @@ type collectorManager struct {
|
||||
|
||||
// Metric collector manager access functions
|
||||
type CollectorManager interface {
|
||||
Init(ticker mct.MultiChanTicker, duration time.Duration, wg *sync.WaitGroup, collectConfigFile string) error
|
||||
Init(ticker mct.MultiChanTicker, duration time.Duration, wg *sync.WaitGroup, collectConfig json.RawMessage) error
|
||||
AddOutput(output chan lp.CCMessage)
|
||||
Start()
|
||||
Close()
|
||||
@@ -72,7 +71,7 @@ type CollectorManager interface {
|
||||
// * ticker (from variable ticker)
|
||||
// * configuration (raw JSON passed in variable collectConfig)
|
||||
// Initialization is done for all configured collectors
|
||||
func (cm *collectorManager) Init(ticker mct.MultiChanTicker, duration time.Duration, wg *sync.WaitGroup, collectConfigFile string) error {
|
||||
func (cm *collectorManager) Init(ticker mct.MultiChanTicker, duration time.Duration, wg *sync.WaitGroup, collectConfig json.RawMessage) error {
|
||||
cm.collectors = make([]MetricCollector, 0)
|
||||
cm.serial = make([]MetricCollector, 0)
|
||||
cm.output = nil
|
||||
@@ -81,15 +80,7 @@ func (cm *collectorManager) Init(ticker mct.MultiChanTicker, duration time.Durat
|
||||
cm.ticker = ticker
|
||||
cm.duration = duration
|
||||
|
||||
// Read collector config file
|
||||
configFile, err := os.Open(collectConfigFile)
|
||||
if err != nil {
|
||||
cclog.Error(err.Error())
|
||||
return err
|
||||
}
|
||||
defer configFile.Close()
|
||||
jsonParser := json.NewDecoder(configFile)
|
||||
err = jsonParser.Decode(&cm.config)
|
||||
err := json.Unmarshal(collectConfig, &cm.config)
|
||||
if err != nil {
|
||||
cclog.Error(err.Error())
|
||||
return err
|
||||
@@ -200,9 +191,9 @@ func (cm *collectorManager) Close() {
|
||||
}
|
||||
|
||||
// New creates a new initialized metric collector manager
|
||||
func New(ticker mct.MultiChanTicker, duration time.Duration, wg *sync.WaitGroup, collectConfigFile string) (CollectorManager, error) {
|
||||
func New(ticker mct.MultiChanTicker, duration time.Duration, wg *sync.WaitGroup, collectConfig json.RawMessage) (CollectorManager, error) {
|
||||
cm := new(collectorManager)
|
||||
err := cm.Init(ticker, duration, wg, collectConfigFile)
|
||||
err := cm.Init(ticker, duration, wg, collectConfig)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
@@ -10,8 +10,8 @@ import (
|
||||
"strings"
|
||||
"time"
|
||||
|
||||
lp "github.com/ClusterCockpit/cc-energy-manager/pkg/cc-message"
|
||||
cclog "github.com/ClusterCockpit/cc-metric-collector/pkg/ccLogger"
|
||||
cclog "github.com/ClusterCockpit/cc-lib/ccLogger"
|
||||
lp "github.com/ClusterCockpit/cc-lib/ccMessage"
|
||||
)
|
||||
|
||||
// CPUFreqCollector
|
||||
|
@@ -9,8 +9,8 @@ import (
|
||||
"strings"
|
||||
"time"
|
||||
|
||||
cclog "github.com/ClusterCockpit/cc-metric-collector/pkg/ccLogger"
|
||||
lp "github.com/ClusterCockpit/cc-energy-manager/pkg/cc-message"
|
||||
cclog "github.com/ClusterCockpit/cc-lib/ccLogger"
|
||||
lp "github.com/ClusterCockpit/cc-lib/ccMessage"
|
||||
"github.com/ClusterCockpit/cc-metric-collector/pkg/ccTopology"
|
||||
"golang.org/x/sys/unix"
|
||||
)
|
||||
|
@@ -9,8 +9,8 @@ import (
|
||||
"strings"
|
||||
"time"
|
||||
|
||||
lp "github.com/ClusterCockpit/cc-energy-manager/pkg/cc-message"
|
||||
cclog "github.com/ClusterCockpit/cc-metric-collector/pkg/ccLogger"
|
||||
cclog "github.com/ClusterCockpit/cc-lib/ccLogger"
|
||||
lp "github.com/ClusterCockpit/cc-lib/ccMessage"
|
||||
sysconf "github.com/tklauser/go-sysconf"
|
||||
)
|
||||
|
||||
|
@@ -9,7 +9,7 @@ import (
|
||||
"strings"
|
||||
"time"
|
||||
|
||||
lp "github.com/ClusterCockpit/cc-energy-manager/pkg/cc-message"
|
||||
lp "github.com/ClusterCockpit/cc-lib/ccMessage"
|
||||
influx "github.com/influxdata/line-protocol"
|
||||
)
|
||||
|
||||
|
@@ -8,23 +8,21 @@ import (
|
||||
"syscall"
|
||||
"time"
|
||||
|
||||
cclog "github.com/ClusterCockpit/cc-metric-collector/pkg/ccLogger"
|
||||
lp "github.com/ClusterCockpit/cc-energy-manager/pkg/cc-message"
|
||||
cclog "github.com/ClusterCockpit/cc-lib/ccLogger"
|
||||
lp "github.com/ClusterCockpit/cc-lib/ccMessage"
|
||||
)
|
||||
|
||||
// "log"
|
||||
|
||||
const MOUNTFILE = `/proc/self/mounts`
|
||||
|
||||
// DiskstatCollectorConfig is the collector-specific JSON configuration of the
// diskstat collector. Both fields are optional.
type DiskstatCollectorConfig struct {
|
||||
// ExcludeMetrics lists metric names that must not be sent. Init only honours
// the names it knows: "disk_total", "disk_free" and "part_max_used".
ExcludeMetrics []string `json:"exclude_metrics,omitempty"`
|
||||
// ExcludeMounts lists substrings; Read skips every mount point whose path
// contains one of them (substring match, not exact match).
ExcludeMounts []string `json:"exclude_mounts,omitempty"`
|
||||
}
|
||||
|
||||
type DiskstatCollector struct {
|
||||
metricCollector
|
||||
//matches map[string]int
|
||||
config IOstatCollectorConfig
|
||||
//devices map[string]IOstatCollectorEntry
|
||||
config DiskstatCollectorConfig
|
||||
allowedMetrics map[string]bool
|
||||
}
|
||||
|
||||
func (m *DiskstatCollector) Init(config json.RawMessage) error {
|
||||
@@ -33,12 +31,21 @@ func (m *DiskstatCollector) Init(config json.RawMessage) error {
|
||||
m.meta = map[string]string{"source": m.name, "group": "Disk"}
|
||||
m.setup()
|
||||
if len(config) > 0 {
|
||||
err := json.Unmarshal(config, &m.config)
|
||||
if err != nil {
|
||||
if err := json.Unmarshal(config, &m.config); err != nil {
|
||||
return err
|
||||
}
|
||||
}
|
||||
file, err := os.Open(string(MOUNTFILE))
|
||||
m.allowedMetrics = map[string]bool{
|
||||
"disk_total": true,
|
||||
"disk_free": true,
|
||||
"part_max_used": true,
|
||||
}
|
||||
for _, excl := range m.config.ExcludeMetrics {
|
||||
if _, ok := m.allowedMetrics[excl]; ok {
|
||||
m.allowedMetrics[excl] = false
|
||||
}
|
||||
}
|
||||
file, err := os.Open(MOUNTFILE)
|
||||
if err != nil {
|
||||
cclog.ComponentError(m.name, err.Error())
|
||||
return err
|
||||
@@ -53,7 +60,7 @@ func (m *DiskstatCollector) Read(interval time.Duration, output chan lp.CCMessag
|
||||
return
|
||||
}
|
||||
|
||||
file, err := os.Open(string(MOUNTFILE))
|
||||
file, err := os.Open(MOUNTFILE)
|
||||
if err != nil {
|
||||
cclog.ComponentError(m.name, err.Error())
|
||||
return
|
||||
@@ -62,6 +69,7 @@ func (m *DiskstatCollector) Read(interval time.Duration, output chan lp.CCMessag
|
||||
|
||||
part_max_used := uint64(0)
|
||||
scanner := bufio.NewScanner(file)
|
||||
mountLoop:
|
||||
for scanner.Scan() {
|
||||
line := scanner.Text()
|
||||
if len(line) == 0 {
|
||||
@@ -77,13 +85,17 @@ func (m *DiskstatCollector) Read(interval time.Duration, output chan lp.CCMessag
|
||||
if strings.Contains(linefields[1], "boot") {
|
||||
continue
|
||||
}
|
||||
path := strings.Replace(linefields[1], `\040`, " ", -1)
|
||||
stat := syscall.Statfs_t{
|
||||
Blocks: 0,
|
||||
Bsize: 0,
|
||||
Bfree: 0,
|
||||
|
||||
mountPath := strings.Replace(linefields[1], `\040`, " ", -1)
|
||||
|
||||
for _, excl := range m.config.ExcludeMounts {
|
||||
if strings.Contains(mountPath, excl) {
|
||||
continue mountLoop
|
||||
}
|
||||
}
|
||||
err := syscall.Statfs(path, &stat)
|
||||
|
||||
stat := syscall.Statfs_t{}
|
||||
err := syscall.Statfs(mountPath, &stat)
|
||||
if err != nil {
|
||||
continue
|
||||
}
|
||||
@@ -92,16 +104,20 @@ func (m *DiskstatCollector) Read(interval time.Duration, output chan lp.CCMessag
|
||||
}
|
||||
tags := map[string]string{"type": "node", "device": linefields[0]}
|
||||
total := (stat.Blocks * uint64(stat.Bsize)) / uint64(1000000000)
|
||||
y, err := lp.NewMessage("disk_total", tags, m.meta, map[string]interface{}{"value": total}, time.Now())
|
||||
if err == nil {
|
||||
y.AddMeta("unit", "GBytes")
|
||||
output <- y
|
||||
if m.allowedMetrics["disk_total"] {
|
||||
y, err := lp.NewMessage("disk_total", tags, m.meta, map[string]interface{}{"value": total}, time.Now())
|
||||
if err == nil {
|
||||
y.AddMeta("unit", "GBytes")
|
||||
output <- y
|
||||
}
|
||||
}
|
||||
free := (stat.Bfree * uint64(stat.Bsize)) / uint64(1000000000)
|
||||
y, err = lp.NewMessage("disk_free", tags, m.meta, map[string]interface{}{"value": free}, time.Now())
|
||||
if err == nil {
|
||||
y.AddMeta("unit", "GBytes")
|
||||
output <- y
|
||||
if m.allowedMetrics["disk_free"] {
|
||||
y, err := lp.NewMessage("disk_free", tags, m.meta, map[string]interface{}{"value": free}, time.Now())
|
||||
if err == nil {
|
||||
y.AddMeta("unit", "GBytes")
|
||||
output <- y
|
||||
}
|
||||
}
|
||||
if total > 0 {
|
||||
perc := (100 * (total - free)) / total
|
||||
@@ -110,10 +126,12 @@ func (m *DiskstatCollector) Read(interval time.Duration, output chan lp.CCMessag
|
||||
}
|
||||
}
|
||||
}
|
||||
y, err := lp.NewMessage("part_max_used", map[string]string{"type": "node"}, m.meta, map[string]interface{}{"value": int(part_max_used)}, time.Now())
|
||||
if err == nil {
|
||||
y.AddMeta("unit", "percent")
|
||||
output <- y
|
||||
if m.allowedMetrics["part_max_used"] {
|
||||
y, err := lp.NewMessage("part_max_used", map[string]string{"type": "node"}, m.meta, map[string]interface{}{"value": int(part_max_used)}, time.Now())
|
||||
if err == nil {
|
||||
y.AddMeta("unit", "percent")
|
||||
output <- y
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
@@ -6,10 +6,13 @@
|
||||
"exclude_metrics": [
|
||||
"disk_total"
|
||||
],
|
||||
"exclude_mounts": [
|
||||
"slurm-tmpfs"
|
||||
]
|
||||
}
|
||||
```
|
||||
|
||||
The `diskstat` collector reads data from `/proc/self/mounts` and outputs a handful **node** metrics. If a metric is not required, it can be excluded from forwarding it to the sink.
|
||||
The `diskstat` collector reads data from `/proc/self/mounts` and outputs a handful of **node** metrics. If a metric is not required, it can be excluded from being forwarded to the sink. Additionally, any mount point containing one of the strings specified in `exclude_mounts` will be skipped during metric collection.
|
||||
|
||||
Metrics per device (with `device` tag):
|
||||
* `disk_total` (unit `GBytes`)
|
||||
|
@@ -13,8 +13,8 @@ import (
|
||||
"strings"
|
||||
"time"
|
||||
|
||||
cclog "github.com/ClusterCockpit/cc-metric-collector/pkg/ccLogger"
|
||||
lp "github.com/ClusterCockpit/cc-energy-manager/pkg/cc-message"
|
||||
cclog "github.com/ClusterCockpit/cc-lib/ccLogger"
|
||||
lp "github.com/ClusterCockpit/cc-lib/ccMessage"
|
||||
)
|
||||
|
||||
const DEFAULT_GPFS_CMD = "mmpmon"
|
||||
|
@@ -4,8 +4,8 @@ import (
|
||||
"fmt"
|
||||
"os"
|
||||
|
||||
cclog "github.com/ClusterCockpit/cc-metric-collector/pkg/ccLogger"
|
||||
lp "github.com/ClusterCockpit/cc-energy-manager/pkg/cc-message"
|
||||
cclog "github.com/ClusterCockpit/cc-lib/ccLogger"
|
||||
lp "github.com/ClusterCockpit/cc-lib/ccMessage"
|
||||
"golang.org/x/sys/unix"
|
||||
|
||||
"encoding/json"
|
||||
|
@@ -2,24 +2,24 @@ package collectors
|
||||
|
||||
import (
|
||||
"bufio"
|
||||
"os"
|
||||
|
||||
cclog "github.com/ClusterCockpit/cc-metric-collector/pkg/ccLogger"
|
||||
lp "github.com/ClusterCockpit/cc-energy-manager/pkg/cc-message"
|
||||
|
||||
// "log"
|
||||
"encoding/json"
|
||||
"errors"
|
||||
"os"
|
||||
"strconv"
|
||||
"strings"
|
||||
"time"
|
||||
|
||||
cclog "github.com/ClusterCockpit/cc-lib/ccLogger"
|
||||
lp "github.com/ClusterCockpit/cc-lib/ccMessage"
|
||||
)
|
||||
|
||||
// Constant holding the path to /proc/diskstats
|
||||
const IOSTATFILE = `/proc/diskstats`
|
||||
const IOSTAT_SYSFSPATH = `/sys/block`
|
||||
|
||||
// IOstatCollectorConfig is the collector-specific JSON configuration of the
// iostat collector. Both fields are optional.
type IOstatCollectorConfig struct {
|
||||
// ExcludeMetrics lists metrics that should not be collected.
// NOTE(review): the code that applies this list is not visible here —
// confirm the expected metric name format (e.g. "io_read_ms").
ExcludeMetrics []string `json:"exclude_metrics,omitempty"`
|
||||
// ExcludeDevices lists devices to skip; Init and Read ignore every
// /proc/diskstats line whose device name is found in this list.
|
||||
ExcludeDevices []string `json:"exclude_devices,omitempty"`
|
||||
}
|
||||
|
||||
type IOstatCollectorEntry struct {
|
||||
@@ -76,7 +76,7 @@ func (m *IOstatCollector) Init(config json.RawMessage) error {
|
||||
if len(m.matches) == 0 {
|
||||
return errors.New("no metrics to collect")
|
||||
}
|
||||
file, err := os.Open(string(IOSTATFILE))
|
||||
file, err := os.Open(IOSTATFILE)
|
||||
if err != nil {
|
||||
cclog.ComponentError(m.name, err.Error())
|
||||
return err
|
||||
@@ -87,17 +87,24 @@ func (m *IOstatCollector) Init(config json.RawMessage) error {
|
||||
for scanner.Scan() {
|
||||
line := scanner.Text()
|
||||
linefields := strings.Fields(line)
|
||||
if len(linefields) < 3 {
|
||||
continue
|
||||
}
|
||||
device := linefields[2]
|
||||
|
||||
if strings.Contains(device, "loop") {
|
||||
continue
|
||||
}
|
||||
if _, skip := stringArrayContains(m.config.ExcludeDevices, device); skip {
|
||||
continue
|
||||
}
|
||||
values := make(map[string]int64)
|
||||
for m := range m.matches {
|
||||
values[m] = 0
|
||||
}
|
||||
m.devices[device] = IOstatCollectorEntry{
|
||||
tags: map[string]string{
|
||||
"device": linefields[2],
|
||||
"device": device,
|
||||
"type": "node",
|
||||
},
|
||||
lastValues: values,
|
||||
@@ -112,7 +119,7 @@ func (m *IOstatCollector) Read(interval time.Duration, output chan lp.CCMessage)
|
||||
return
|
||||
}
|
||||
|
||||
file, err := os.Open(string(IOSTATFILE))
|
||||
file, err := os.Open(IOSTATFILE)
|
||||
if err != nil {
|
||||
cclog.ComponentError(m.name, err.Error())
|
||||
return
|
||||
@@ -126,10 +133,16 @@ func (m *IOstatCollector) Read(interval time.Duration, output chan lp.CCMessage)
|
||||
continue
|
||||
}
|
||||
linefields := strings.Fields(line)
|
||||
if len(linefields) < 3 {
|
||||
continue
|
||||
}
|
||||
device := linefields[2]
|
||||
if strings.Contains(device, "loop") {
|
||||
continue
|
||||
}
|
||||
if _, skip := stringArrayContains(m.config.ExcludeDevices, device); skip {
|
||||
continue
|
||||
}
|
||||
if _, ok := m.devices[device]; !ok {
|
||||
continue
|
||||
}
|
||||
|
@@ -4,12 +4,17 @@
|
||||
```json
|
||||
"iostat": {
|
||||
"exclude_metrics": [
|
||||
"read_ms"
|
||||
"io_read_ms"
|
||||
],
|
||||
"exclude_devices": [
|
||||
"nvme0n1p1",
|
||||
"nvme0n1p2",
|
||||
"md127"
|
||||
]
|
||||
}
|
||||
```
|
||||
|
||||
The `iostat` collector reads data from `/proc/diskstats` and outputs a handful **node** metrics. If a metric is not required, it can be excluded from forwarding it to the sink.
|
||||
The `iostat` collector reads data from `/proc/diskstats` and outputs a handful of **node** metrics. If a metric or device is not required, it can be excluded from being forwarded to the sink.
|
||||
|
||||
Metrics:
|
||||
* `io_reads`
|
||||
|
@@ -13,8 +13,8 @@ import (
|
||||
"strings"
|
||||
"time"
|
||||
|
||||
cclog "github.com/ClusterCockpit/cc-metric-collector/pkg/ccLogger"
|
||||
lp "github.com/ClusterCockpit/cc-energy-manager/pkg/cc-message"
|
||||
cclog "github.com/ClusterCockpit/cc-lib/ccLogger"
|
||||
lp "github.com/ClusterCockpit/cc-lib/ccMessage"
|
||||
)
|
||||
|
||||
const IPMISENSORS_PATH = `ipmi-sensors`
|
||||
|
@@ -24,9 +24,9 @@ import (
|
||||
"time"
|
||||
"unsafe"
|
||||
|
||||
lp "github.com/ClusterCockpit/cc-energy-manager/pkg/cc-message"
|
||||
cclog "github.com/ClusterCockpit/cc-lib/ccLogger"
|
||||
lp "github.com/ClusterCockpit/cc-lib/ccMessage"
|
||||
agg "github.com/ClusterCockpit/cc-metric-collector/internal/metricAggregator"
|
||||
cclog "github.com/ClusterCockpit/cc-metric-collector/pkg/ccLogger"
|
||||
topo "github.com/ClusterCockpit/cc-metric-collector/pkg/ccTopology"
|
||||
"github.com/NVIDIA/go-nvml/pkg/dl"
|
||||
"github.com/fsnotify/fsnotify"
|
||||
|
@@ -8,8 +8,8 @@ import (
|
||||
"strings"
|
||||
"time"
|
||||
|
||||
lp "github.com/ClusterCockpit/cc-energy-manager/pkg/cc-message"
|
||||
cclog "github.com/ClusterCockpit/cc-metric-collector/pkg/ccLogger"
|
||||
cclog "github.com/ClusterCockpit/cc-lib/ccLogger"
|
||||
lp "github.com/ClusterCockpit/cc-lib/ccMessage"
|
||||
)
|
||||
|
||||
// LoadavgCollector collects:
|
||||
|
@@ -10,8 +10,8 @@ import (
|
||||
"strings"
|
||||
"time"
|
||||
|
||||
cclog "github.com/ClusterCockpit/cc-metric-collector/pkg/ccLogger"
|
||||
lp "github.com/ClusterCockpit/cc-energy-manager/pkg/cc-message"
|
||||
cclog "github.com/ClusterCockpit/cc-lib/ccLogger"
|
||||
lp "github.com/ClusterCockpit/cc-lib/ccMessage"
|
||||
)
|
||||
|
||||
const LUSTRE_SYSFS = `/sys/fs/lustre`
|
||||
|
@@ -12,8 +12,8 @@ import (
|
||||
"strings"
|
||||
"time"
|
||||
|
||||
cclog "github.com/ClusterCockpit/cc-metric-collector/pkg/ccLogger"
|
||||
lp "github.com/ClusterCockpit/cc-energy-manager/pkg/cc-message"
|
||||
cclog "github.com/ClusterCockpit/cc-lib/ccLogger"
|
||||
lp "github.com/ClusterCockpit/cc-lib/ccMessage"
|
||||
)
|
||||
|
||||
const MEMSTATFILE = "/proc/meminfo"
|
||||
|
@@ -5,7 +5,7 @@ import (
|
||||
"fmt"
|
||||
"time"
|
||||
|
||||
lp "github.com/ClusterCockpit/cc-energy-manager/pkg/cc-message"
|
||||
lp "github.com/ClusterCockpit/cc-lib/ccMessage"
|
||||
)
|
||||
|
||||
type MetricCollector interface {
|
||||
@@ -14,7 +14,7 @@ type MetricCollector interface {
|
||||
Initialized() bool // Is metric collector initialized?
|
||||
Parallel() bool
|
||||
Read(duration time.Duration, output chan lp.CCMessage) // Read metrics from metric collector
|
||||
Close() // Close / finish metric collector
|
||||
Close() // Close / finish metric collector
|
||||
}
|
||||
|
||||
type metricCollector struct {
|
||||
|
@@ -9,16 +9,17 @@ import (
|
||||
"strings"
|
||||
"time"
|
||||
|
||||
cclog "github.com/ClusterCockpit/cc-metric-collector/pkg/ccLogger"
|
||||
lp "github.com/ClusterCockpit/cc-energy-manager/pkg/cc-message"
|
||||
cclog "github.com/ClusterCockpit/cc-lib/ccLogger"
|
||||
lp "github.com/ClusterCockpit/cc-lib/ccMessage"
|
||||
)
|
||||
|
||||
const NETSTATFILE = "/proc/net/dev"
|
||||
|
||||
type NetstatCollectorConfig struct {
|
||||
IncludeDevices []string `json:"include_devices"`
|
||||
SendAbsoluteValues bool `json:"send_abs_values"`
|
||||
SendDerivedValues bool `json:"send_derived_values"`
|
||||
IncludeDevices []string `json:"include_devices"`
|
||||
SendAbsoluteValues bool `json:"send_abs_values"`
|
||||
SendDerivedValues bool `json:"send_derived_values"`
|
||||
InterfaceAliases map[string][]string `json:"interface_aliases,omitempty"`
|
||||
}
|
||||
|
||||
type NetstatCollectorMetric struct {
|
||||
@@ -32,9 +33,26 @@ type NetstatCollectorMetric struct {
|
||||
|
||||
type NetstatCollector struct {
|
||||
metricCollector
|
||||
config NetstatCollectorConfig
|
||||
matches map[string][]NetstatCollectorMetric
|
||||
lastTimestamp time.Time
|
||||
config NetstatCollectorConfig
|
||||
aliasToCanonical map[string]string
|
||||
matches map[string][]NetstatCollectorMetric
|
||||
lastTimestamp time.Time
|
||||
}
|
||||
|
||||
// buildAliasMapping (re)creates m.aliasToCanonical, the reverse lookup table
// of the configured InterfaceAliases: the configuration maps a canonical
// device name to its aliases, the table maps every alias back to its
// canonical name. Any previous table content is discarded.
func (m *NetstatCollector) buildAliasMapping() {
|
||||
m.aliasToCanonical = make(map[string]string)
|
||||
for canon, aliases := range m.config.InterfaceAliases {
|
||||
for _, alias := range aliases {
|
||||
// An alias listed under several canonical names keeps whichever entry is
// written last; map iteration order is unspecified, so avoid duplicates.
m.aliasToCanonical[alias] = canon
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// getCanonicalName resolves a device name as reported by the system (raw) to
// its canonical name using the alias lookup table aliasToCanonical.
// If raw is not a known alias, raw itself is returned unchanged. A nil table
// is allowed and behaves like an empty one.
func getCanonicalName(raw string, aliasToCanonical map[string]string) string {
|
||||
if canon, ok := aliasToCanonical[raw]; ok {
|
||||
return canon
|
||||
}
|
||||
// Not an alias: the reported name is already the canonical one.
return raw
|
||||
}
|
||||
|
||||
func (m *NetstatCollector) Init(config json.RawMessage) error {
|
||||
@@ -77,6 +95,8 @@ func (m *NetstatCollector) Init(config json.RawMessage) error {
|
||||
}
|
||||
}
|
||||
|
||||
m.buildAliasMapping()
|
||||
|
||||
// Check access to net statistic file
|
||||
file, err := os.Open(NETSTATFILE)
|
||||
if err != nil {
|
||||
@@ -97,18 +117,20 @@ func (m *NetstatCollector) Init(config json.RawMessage) error {
|
||||
// Split line into fields
|
||||
f := strings.Fields(l)
|
||||
|
||||
// Get net device entry
|
||||
dev := strings.Trim(f[0], ": ")
|
||||
// Get raw and canonical names
|
||||
raw := strings.Trim(f[0], ": ")
|
||||
canonical := getCanonicalName(raw, m.aliasToCanonical)
|
||||
|
||||
// Check if device is a included device
|
||||
if _, ok := stringArrayContains(m.config.IncludeDevices, dev); ok {
|
||||
tags := map[string]string{"stype": "network", "stype-id": dev, "type": "node"}
|
||||
if _, ok := stringArrayContains(m.config.IncludeDevices, canonical); ok {
|
||||
// Tag will contain original device name (raw).
|
||||
tags := map[string]string{"stype": "network", "stype-id": raw, "type": "node"}
|
||||
meta_unit_byte := map[string]string{"source": m.name, "group": "Network", "unit": "bytes"}
|
||||
meta_unit_byte_per_sec := map[string]string{"source": m.name, "group": "Network", "unit": "bytes/sec"}
|
||||
meta_unit_pkts := map[string]string{"source": m.name, "group": "Network", "unit": "packets"}
|
||||
meta_unit_pkts_per_sec := map[string]string{"source": m.name, "group": "Network", "unit": "packets/sec"}
|
||||
|
||||
m.matches[dev] = []NetstatCollectorMetric{
|
||||
m.matches[canonical] = []NetstatCollectorMetric{
|
||||
{
|
||||
name: "net_bytes_in",
|
||||
index: fieldReceiveBytes,
|
||||
@@ -143,7 +165,6 @@ func (m *NetstatCollector) Init(config json.RawMessage) error {
|
||||
},
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
if len(m.matches) == 0 {
|
||||
@@ -164,7 +185,7 @@ func (m *NetstatCollector) Read(interval time.Duration, output chan lp.CCMessage
|
||||
// Save current timestamp
|
||||
m.lastTimestamp = now
|
||||
|
||||
file, err := os.Open(string(NETSTATFILE))
|
||||
file, err := os.Open(NETSTATFILE)
|
||||
if err != nil {
|
||||
cclog.ComponentError(m.name, err.Error())
|
||||
return
|
||||
@@ -183,11 +204,12 @@ func (m *NetstatCollector) Read(interval time.Duration, output chan lp.CCMessage
|
||||
// Split line into fields
|
||||
f := strings.Fields(l)
|
||||
|
||||
// Get net device entry
|
||||
dev := strings.Trim(f[0], ":")
|
||||
// Get raw and canonical names
|
||||
raw := strings.Trim(f[0], ":")
|
||||
canonical := getCanonicalName(raw, m.aliasToCanonical)
|
||||
|
||||
// Check if device is a included device
|
||||
if devmetrics, ok := m.matches[dev]; ok {
|
||||
if devmetrics, ok := m.matches[canonical]; ok {
|
||||
for i := range devmetrics {
|
||||
metric := &devmetrics[i]
|
||||
|
||||
|
@@ -4,14 +4,19 @@
|
||||
```json
|
||||
"netstat": {
|
||||
"include_devices": [
|
||||
"eth0"
|
||||
"eth0",
|
||||
"eno1"
|
||||
],
|
||||
"send_abs_values" : true,
|
||||
"send_derived_values" : true
|
||||
"send_abs_values": true,
|
||||
"send_derived_values": true,
|
||||
"interface_aliases": {
|
||||
"eno1": ["eno1np0", "eno1_alt"],
|
||||
"eth0": ["eth0_alias"]
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
The `netstat` collector reads data from `/proc/net/dev` and outputs a handful **node** metrics. With the `include_devices` list you can specify which network devices should be measured. **Note**: Most other collectors use an _exclude_ list instead of an include list.
|
||||
The `netstat` collector reads data from `/proc/net/dev` and outputs a handful of **node** metrics. With the `include_devices` list you can specify which network devices should be measured. **Note**: Most other collectors use an _exclude_ list instead of an include list. Optionally, you can define an `interface_aliases` mapping. For each canonical device (as listed in `include_devices`), you may provide an array of aliases that may be reported by the system. When the system reports an alias, it is mapped to its canonical name for matching against `include_devices`, while the output tag `stype-id` always shows the actual system-reported name.
|
||||
|
||||
Metrics:
|
||||
* `net_bytes_in` (`unit=bytes`)
|
||||
@@ -23,5 +28,4 @@ Metrics:
|
||||
* `net_pkts_in_bw` (`unit=packets/sec` if `send_derived_values == true`)
|
||||
* `net_pkts_out_bw` (`unit=packets/sec` if `send_derived_values == true`)
|
||||
|
||||
The device name is added as tag `stype=network,stype-id=<device>`.
|
||||
|
||||
The device name is added as tag `stype=network,stype-id=<device>`.
|
@@ -11,7 +11,7 @@ import (
|
||||
"strings"
|
||||
"time"
|
||||
|
||||
lp "github.com/ClusterCockpit/cc-energy-manager/pkg/cc-message"
|
||||
lp "github.com/ClusterCockpit/cc-lib/ccMessage"
|
||||
)
|
||||
|
||||
// First part contains the code for the general NfsCollector.
|
||||
|
@@ -9,8 +9,8 @@ import (
|
||||
"strings"
|
||||
"time"
|
||||
|
||||
cclog "github.com/ClusterCockpit/cc-metric-collector/pkg/ccLogger"
|
||||
lp "github.com/ClusterCockpit/cc-energy-manager/pkg/cc-message"
|
||||
cclog "github.com/ClusterCockpit/cc-lib/ccLogger"
|
||||
lp "github.com/ClusterCockpit/cc-lib/ccMessage"
|
||||
)
|
||||
|
||||
// These are the fields we read from the JSON configuration
|
||||
@@ -18,17 +18,20 @@ type NfsIOStatCollectorConfig struct {
|
||||
ExcludeMetrics []string `json:"exclude_metrics,omitempty"`
|
||||
ExcludeFilesystem []string `json:"exclude_filesystem,omitempty"`
|
||||
UseServerAddressAsSType bool `json:"use_server_as_stype,omitempty"`
|
||||
SendAbsoluteValues bool `json:"send_abs_values"`
|
||||
SendDerivedValues bool `json:"send_derived_values"`
|
||||
}
|
||||
|
||||
// This contains all variables we need during execution and the variables
|
||||
// defined by metricCollector (name, init, ...)
|
||||
type NfsIOStatCollector struct {
|
||||
metricCollector
|
||||
config NfsIOStatCollectorConfig // the configuration structure
|
||||
meta map[string]string // default meta information
|
||||
tags map[string]string // default tags
|
||||
data map[string]map[string]int64 // data storage for difference calculation
|
||||
key string // which device info should be used as subtype ID? 'server' or 'mntpoint', see NfsIOStatCollectorConfig.UseServerAddressAsSType
|
||||
config NfsIOStatCollectorConfig // the configuration structure
|
||||
meta map[string]string // default meta information
|
||||
tags map[string]string // default tags
|
||||
data map[string]map[string]int64 // data storage for difference calculation
|
||||
key string // which device info should be used as subtype ID? 'server' or 'mntpoint'
|
||||
lastTimestamp time.Time
|
||||
}
|
||||
|
||||
var deviceRegex = regexp.MustCompile(`device (?P<server>[^ ]+) mounted on (?P<mntpoint>[^ ]+) with fstype nfs(?P<version>\d*) statvers=[\d\.]+`)
|
||||
@@ -81,7 +84,6 @@ func (m *NfsIOStatCollector) readNfsiostats() map[string]map[string]int64 {
|
||||
data[current[m.key]][name] = val
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
current = nil
|
||||
}
|
||||
@@ -98,6 +100,9 @@ func (m *NfsIOStatCollector) Init(config json.RawMessage) error {
|
||||
m.meta = map[string]string{"source": m.name, "group": "NFS", "unit": "bytes"}
|
||||
m.tags = map[string]string{"type": "node"}
|
||||
m.config.UseServerAddressAsSType = false
|
||||
// Set default configuration
|
||||
m.config.SendAbsoluteValues = true
|
||||
m.config.SendDerivedValues = false
|
||||
if len(config) > 0 {
|
||||
err = json.Unmarshal(config, &m.config)
|
||||
if err != nil {
|
||||
@@ -110,12 +115,15 @@ func (m *NfsIOStatCollector) Init(config json.RawMessage) error {
|
||||
m.key = "server"
|
||||
}
|
||||
m.data = m.readNfsiostats()
|
||||
m.lastTimestamp = time.Now()
|
||||
m.init = true
|
||||
return err
|
||||
}
|
||||
|
||||
func (m *NfsIOStatCollector) Read(interval time.Duration, output chan lp.CCMessage) {
|
||||
timestamp := time.Now()
|
||||
now := time.Now()
|
||||
timeDiff := now.Sub(m.lastTimestamp).Seconds()
|
||||
m.lastTimestamp = now
|
||||
|
||||
// Get the current values for all mountpoints
|
||||
newdata := m.readNfsiostats()
|
||||
@@ -123,21 +131,30 @@ func (m *NfsIOStatCollector) Read(interval time.Duration, output chan lp.CCMessa
|
||||
for mntpoint, values := range newdata {
|
||||
// Was the mount point already present in the last iteration
|
||||
if old, ok := m.data[mntpoint]; ok {
|
||||
// Calculate the difference of old and new values
|
||||
for i := range values {
|
||||
x := values[i] - old[i]
|
||||
y, err := lp.NewMessage(fmt.Sprintf("nfsio_%s", i), m.tags, m.meta, map[string]interface{}{"value": x}, timestamp)
|
||||
if err == nil {
|
||||
if strings.HasPrefix(i, "page") {
|
||||
y.AddMeta("unit", "4K_Pages")
|
||||
for name, newVal := range values {
|
||||
if m.config.SendAbsoluteValues {
|
||||
msg, err := lp.NewMessage(fmt.Sprintf("nfsio_%s", name), m.tags, m.meta, map[string]interface{}{"value": newVal}, now)
|
||||
if err == nil {
|
||||
msg.AddTag("stype", "filesystem")
|
||||
msg.AddTag("stype-id", mntpoint)
|
||||
output <- msg
|
||||
}
|
||||
y.AddTag("stype", "filesystem")
|
||||
y.AddTag("stype-id", mntpoint)
|
||||
// Send it to output channel
|
||||
output <- y
|
||||
}
|
||||
// Update old to the new value for the next iteration
|
||||
old[i] = values[i]
|
||||
if m.config.SendDerivedValues {
|
||||
rate := float64(newVal-old[name]) / timeDiff
|
||||
msg, err := lp.NewMessage(fmt.Sprintf("nfsio_%s_bw", name), m.tags, m.meta, map[string]interface{}{"value": rate}, now)
|
||||
if err == nil {
|
||||
if strings.HasPrefix(name, "page") {
|
||||
msg.AddMeta("unit", "4K_pages/s")
|
||||
} else {
|
||||
msg.AddMeta("unit", "bytes/sec")
|
||||
}
|
||||
msg.AddTag("stype", "filesystem")
|
||||
msg.AddTag("stype-id", mntpoint)
|
||||
output <- msg
|
||||
}
|
||||
}
|
||||
old[name] = newVal
|
||||
}
|
||||
} else {
|
||||
// First time we see this mount point, store all values
|
||||
@@ -154,10 +171,9 @@ func (m *NfsIOStatCollector) Read(interval time.Duration, output chan lp.CCMessa
|
||||
}
|
||||
}
|
||||
if !found {
|
||||
m.data[mntpoint] = nil
|
||||
delete(m.data, mntpoint)
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
func (m *NfsIOStatCollector) Close() {
|
||||
|
@@ -3,16 +3,18 @@
|
||||
```json
|
||||
"nfsiostat": {
|
||||
"exclude_metrics": [
|
||||
"nfsio_oread"
|
||||
"oread", "pageread"
|
||||
],
|
||||
"exclude_filesystems" : [
|
||||
"/mnt",
|
||||
"exclude_filesystems": [
|
||||
"/mnt"
|
||||
],
|
||||
"use_server_as_stype": false
|
||||
"use_server_as_stype": false,
|
||||
"send_abs_values": false,
|
||||
"send_derived_values": true
|
||||
}
|
||||
```
|
||||
|
||||
The `nfsiostat` collector reads data from `/proc/self/mountstats` and outputs a handful **node** metrics for each NFS filesystem. If a metric or filesystem is not required, it can be excluded from forwarding it to the sink.
|
||||
The `nfsiostat` collector reads data from `/proc/self/mountstats` and outputs a handful of **node** metrics for each NFS filesystem. If a metric or filesystem is not required, it can be excluded from being forwarded to the sink. **Note:** When excluding metrics, you must provide the base metric name (e.g. `pageread`) without the `nfsio_` prefix. This exclusion applies to both absolute and derived values.
|
||||
|
||||
Metrics:
|
||||
* `nfsio_nread`: Bytes transferred by normal `read()` calls
|
||||
@@ -24,4 +26,9 @@ Metrics:
|
||||
* `nfsio_nfsread`: Bytes transferred for reading from the server
|
||||
* `nfsio_nfswrite`: Pages transferred by writing to the server
|
||||
|
||||
The `nfsiostat` collector adds the mountpoint to the tags as `stype=filesystem,stype-id=<mountpoint>`. If the server address should be used instead of the mountpoint, use the `use_server_as_stype` config setting.
|
||||
For each of these, if derived values are enabled, an additional metric is sent with the `_bw` suffix, which represents the rate:
|
||||
|
||||
* For normal byte metrics: `unit=bytes/sec`
|
||||
* For page metrics: `unit=4K_pages/s`
|
||||
|
||||
The `nfsiostat` collector adds the mountpoint to the tags as `stype=filesystem,stype-id=<mountpoint>`. If the server address should be used instead of the mountpoint, use the `use_server_as_stype` config setting.
|
||||
|
@@ -10,10 +10,15 @@ import (
|
||||
"strings"
|
||||
"time"
|
||||
|
||||
cclog "github.com/ClusterCockpit/cc-metric-collector/pkg/ccLogger"
|
||||
lp "github.com/ClusterCockpit/cc-energy-manager/pkg/cc-message"
|
||||
cclog "github.com/ClusterCockpit/cc-lib/ccLogger"
|
||||
lp "github.com/ClusterCockpit/cc-lib/ccMessage"
|
||||
)
|
||||
|
||||
type NUMAStatsCollectorConfig struct {
|
||||
SendAbsoluteValues bool `json:"send_abs_values"`
|
||||
SendDerivedValues bool `json:"send_derived_values"`
|
||||
}
|
||||
|
||||
// Non-Uniform Memory Access (NUMA) policy hit/miss statistics
|
||||
//
|
||||
// numa_hit:
|
||||
@@ -47,13 +52,16 @@ import (
|
||||
//
|
||||
// See: https://www.kernel.org/doc/html/latest/admin-guide/numastat.html
|
||||
type NUMAStatsCollectorTopolgy struct {
|
||||
file string
|
||||
tagSet map[string]string
|
||||
file string
|
||||
tagSet map[string]string
|
||||
previousValues map[string]int64
|
||||
}
|
||||
|
||||
type NUMAStatsCollector struct {
|
||||
metricCollector
|
||||
topology []NUMAStatsCollectorTopolgy
|
||||
topology []NUMAStatsCollectorTopolgy
|
||||
config NUMAStatsCollectorConfig
|
||||
lastTimestamp time.Time
|
||||
}
|
||||
|
||||
func (m *NUMAStatsCollector) Init(config json.RawMessage) error {
|
||||
@@ -86,8 +94,9 @@ func (m *NUMAStatsCollector) Init(config json.RawMessage) error {
|
||||
file := filepath.Join(dir, "numastat")
|
||||
m.topology = append(m.topology,
|
||||
NUMAStatsCollectorTopolgy{
|
||||
file: file,
|
||||
tagSet: map[string]string{"memoryDomain": node},
|
||||
file: file,
|
||||
tagSet: map[string]string{"memoryDomain": node},
|
||||
previousValues: make(map[string]int64),
|
||||
})
|
||||
}
|
||||
|
||||
@@ -102,23 +111,27 @@ func (m *NUMAStatsCollector) Read(interval time.Duration, output chan lp.CCMessa
|
||||
return
|
||||
}
|
||||
|
||||
now := time.Now()
|
||||
timeDiff := now.Sub(m.lastTimestamp).Seconds()
|
||||
m.lastTimestamp = now
|
||||
|
||||
for i := range m.topology {
|
||||
// Loop for all NUMA domains
|
||||
t := &m.topology[i]
|
||||
|
||||
now := time.Now()
|
||||
file, err := os.Open(t.file)
|
||||
if err != nil {
|
||||
cclog.ComponentError(
|
||||
m.name,
|
||||
fmt.Sprintf("Read(): Failed to open file '%s': %v", t.file, err))
|
||||
return
|
||||
continue
|
||||
}
|
||||
scanner := bufio.NewScanner(file)
|
||||
|
||||
// Read line by line
|
||||
for scanner.Scan() {
|
||||
split := strings.Fields(scanner.Text())
|
||||
line := scanner.Text()
|
||||
split := strings.Fields(line)
|
||||
if len(split) != 2 {
|
||||
continue
|
||||
}
|
||||
@@ -130,18 +143,38 @@ func (m *NUMAStatsCollector) Read(interval time.Duration, output chan lp.CCMessa
|
||||
fmt.Sprintf("Read(): Failed to convert %s='%s' to int64: %v", key, split[1], err))
|
||||
continue
|
||||
}
|
||||
y, err := lp.NewMessage(
|
||||
"numastats_"+key,
|
||||
t.tagSet,
|
||||
m.meta,
|
||||
map[string]interface{}{"value": value},
|
||||
now,
|
||||
)
|
||||
if err == nil {
|
||||
output <- y
|
||||
|
||||
if m.config.SendAbsoluteValues {
|
||||
msg, err := lp.NewMessage(
|
||||
"numastats_"+key,
|
||||
t.tagSet,
|
||||
m.meta,
|
||||
map[string]interface{}{"value": value},
|
||||
now,
|
||||
)
|
||||
if err == nil {
|
||||
output <- msg
|
||||
}
|
||||
}
|
||||
|
||||
if m.config.SendDerivedValues {
|
||||
prev, ok := t.previousValues[key]
|
||||
if ok {
|
||||
rate := float64(value-prev) / timeDiff
|
||||
msg, err := lp.NewMessage(
|
||||
"numastats_"+key+"_rate",
|
||||
t.tagSet,
|
||||
m.meta,
|
||||
map[string]interface{}{"value": rate},
|
||||
now,
|
||||
)
|
||||
if err == nil {
|
||||
output <- msg
|
||||
}
|
||||
}
|
||||
t.previousValues[key] = value
|
||||
}
|
||||
}
|
||||
|
||||
file.Close()
|
||||
}
|
||||
}
|
||||
|
@@ -2,7 +2,10 @@
|
||||
## `numastat` collector
|
||||
|
||||
```json
|
||||
"numastats": {}
|
||||
"numastats": {
|
||||
"send_abs_values" : true,
|
||||
"send_derived_values" : true
|
||||
}
|
||||
```
|
||||
|
||||
The `numastat` collector reads data from `/sys/devices/system/node/node*/numastat` and outputs a handful of **memoryDomain** metrics. See: <https://www.kernel.org/doc/html/latest/admin-guide/numastat.html>
|
||||
@@ -15,3 +18,9 @@ Metrics:
|
||||
* `numastats_local_node`: A process ran on this node's CPU, and got memory from this node.
|
||||
* `numastats_other_node`: A process ran on a different node's CPU, and got memory from this node.
|
||||
* `numastats_interleave_hit`: Interleaving wanted to allocate from this node and succeeded.
|
||||
* `numastats_numa_hit_rate` (if `send_derived_values == true`): Derived rate value per second.
|
||||
* `numastats_numa_miss_rate` (if `send_derived_values == true`): Derived rate value per second.
|
||||
* `numastats_numa_foreign_rate` (if `send_derived_values == true`): Derived rate value per second.
|
||||
* `numastats_local_node_rate` (if `send_derived_values == true`): Derived rate value per second.
|
||||
* `numastats_other_node_rate` (if `send_derived_values == true`): Derived rate value per second.
|
||||
* `numastats_interleave_hit_rate` (if `send_derived_values == true`): Derived rate value per second.
|
||||
|
@@ -8,8 +8,8 @@ import (
|
||||
"strings"
|
||||
"time"
|
||||
|
||||
lp "github.com/ClusterCockpit/cc-energy-manager/pkg/cc-message"
|
||||
cclog "github.com/ClusterCockpit/cc-metric-collector/pkg/ccLogger"
|
||||
cclog "github.com/ClusterCockpit/cc-lib/ccLogger"
|
||||
lp "github.com/ClusterCockpit/cc-lib/ccMessage"
|
||||
"github.com/NVIDIA/go-nvml/pkg/nvml"
|
||||
)
|
||||
|
||||
|
@@ -9,8 +9,8 @@ import (
|
||||
"strings"
|
||||
"time"
|
||||
|
||||
cclog "github.com/ClusterCockpit/cc-metric-collector/pkg/ccLogger"
|
||||
lp "github.com/ClusterCockpit/cc-energy-manager/pkg/cc-message"
|
||||
cclog "github.com/ClusterCockpit/cc-lib/ccLogger"
|
||||
lp "github.com/ClusterCockpit/cc-lib/ccMessage"
|
||||
)
|
||||
|
||||
// running average power limit (RAPL) monitoring attributes for a zone
|
||||
|
@@ -1,11 +1,9 @@
|
||||
# Running average power limit (RAPL) metric collector
|
||||
## `rapl` collector
|
||||
|
||||
This collector reads running average power limit (RAPL) monitoring attributes to compute average power consumption metrics. See <https://www.kernel.org/doc/html/latest/power/powercap/powercap.html#monitoring-attributes>.
|
||||
|
||||
The Likwid metric collector provides similar functionality.
|
||||
|
||||
## Configuration
|
||||
|
||||
```json
|
||||
"rapl": {
|
||||
"exclude_device_by_id": ["0:1", "0:2"],
|
||||
@@ -13,6 +11,5 @@ The Likwid metric collector provides similar functionality.
|
||||
}
|
||||
```
|
||||
|
||||
## Metrics
|
||||
|
||||
Metrics:
|
||||
* `rapl_average_power`: average power consumption in watts. The average is computed over the entire runtime from the last measurement to the current measurement.
|
||||
|
@@ -6,8 +6,8 @@ import (
|
||||
"fmt"
|
||||
"time"
|
||||
|
||||
cclog "github.com/ClusterCockpit/cc-metric-collector/pkg/ccLogger"
|
||||
lp "github.com/ClusterCockpit/cc-energy-manager/pkg/cc-message"
|
||||
cclog "github.com/ClusterCockpit/cc-lib/ccLogger"
|
||||
lp "github.com/ClusterCockpit/cc-lib/ccMessage"
|
||||
"github.com/ClusterCockpit/go-rocm-smi/pkg/rocm_smi"
|
||||
)
|
||||
|
||||
|
@@ -4,8 +4,8 @@ import (
|
||||
"encoding/json"
|
||||
"time"
|
||||
|
||||
lp "github.com/ClusterCockpit/cc-energy-manager/pkg/cc-message"
|
||||
cclog "github.com/ClusterCockpit/cc-metric-collector/pkg/ccLogger"
|
||||
cclog "github.com/ClusterCockpit/cc-lib/ccLogger"
|
||||
lp "github.com/ClusterCockpit/cc-lib/ccMessage"
|
||||
)
|
||||
|
||||
// These are the fields we read from the JSON configuration
|
||||
|
@@ -5,8 +5,8 @@ import (
|
||||
"sync"
|
||||
"time"
|
||||
|
||||
lp "github.com/ClusterCockpit/cc-energy-manager/pkg/cc-message"
|
||||
cclog "github.com/ClusterCockpit/cc-metric-collector/pkg/ccLogger"
|
||||
cclog "github.com/ClusterCockpit/cc-lib/ccLogger"
|
||||
lp "github.com/ClusterCockpit/cc-lib/ccMessage"
|
||||
)
|
||||
|
||||
// These are the fields we read from the JSON configuration
|
||||
|
@@ -10,8 +10,8 @@ import (
|
||||
"strings"
|
||||
"time"
|
||||
|
||||
cclog "github.com/ClusterCockpit/cc-metric-collector/pkg/ccLogger"
|
||||
lp "github.com/ClusterCockpit/cc-energy-manager/pkg/cc-message"
|
||||
cclog "github.com/ClusterCockpit/cc-lib/ccLogger"
|
||||
lp "github.com/ClusterCockpit/cc-lib/ccMessage"
|
||||
)
|
||||
|
||||
const SCHEDSTATFILE = `/proc/schedstat`
|
||||
|
@@ -6,8 +6,8 @@ import (
|
||||
"syscall"
|
||||
"time"
|
||||
|
||||
cclog "github.com/ClusterCockpit/cc-metric-collector/pkg/ccLogger"
|
||||
lp "github.com/ClusterCockpit/cc-energy-manager/pkg/cc-message"
|
||||
cclog "github.com/ClusterCockpit/cc-lib/ccLogger"
|
||||
lp "github.com/ClusterCockpit/cc-lib/ccMessage"
|
||||
)
|
||||
|
||||
type SelfCollectorConfig struct {
|
||||
|
@@ -9,8 +9,8 @@ import (
|
||||
"strings"
|
||||
"time"
|
||||
|
||||
cclog "github.com/ClusterCockpit/cc-metric-collector/pkg/ccLogger"
|
||||
lp "github.com/ClusterCockpit/cc-energy-manager/pkg/cc-message"
|
||||
cclog "github.com/ClusterCockpit/cc-lib/ccLogger"
|
||||
lp "github.com/ClusterCockpit/cc-lib/ccMessage"
|
||||
)
|
||||
|
||||
// See: https://www.kernel.org/doc/html/latest/hwmon/sysfs-interface.html
|
||||
|
@@ -9,7 +9,7 @@ import (
|
||||
"strings"
|
||||
"time"
|
||||
|
||||
lp "github.com/ClusterCockpit/cc-energy-manager/pkg/cc-message"
|
||||
lp "github.com/ClusterCockpit/cc-lib/ccMessage"
|
||||
)
|
||||
|
||||
const MAX_NUM_PROCS = 10
|
||||
|
Reference in New Issue
Block a user