mirror of
https://github.com/ClusterCockpit/cc-metric-collector.git
synced 2026-06-10 13:57:30 +02:00
Compare commits
55 Commits
v0.7.3
...
cclog_upda
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
c9dfca622f | ||
|
|
3b0638e815 | ||
|
|
037b4f1526 | ||
|
|
bed5491068 | ||
|
|
a2eba41150 | ||
|
|
5d55ee7a77 | ||
|
|
5938368a76 | ||
|
|
077204d39f | ||
|
|
dcc9746df4 | ||
|
|
2c51a3ed72 | ||
|
|
656ea73d12 | ||
|
|
330f923596 | ||
|
|
8e58072ff6 | ||
|
|
0f6fee9db4 | ||
|
|
7585ee7289 | ||
|
|
30b2eb69dd | ||
|
|
2a51bd17f3 | ||
|
|
34d3d8970e | ||
|
|
50c7eba192 | ||
|
|
d215cabb3e | ||
|
|
86da3c15f7 | ||
|
|
93cd397b79 | ||
|
|
65b9c0ea14 | ||
|
|
0ecf06cee7 | ||
|
|
9eaf77db4f | ||
|
|
7cb5d1b47a | ||
|
|
319e71a853 | ||
|
|
1251f9ef6b | ||
|
|
f816f4991b | ||
|
|
e40816eb17 | ||
|
|
b947f98459 | ||
|
|
c328fbf05a | ||
|
|
37ec7c19e6 | ||
|
|
13fc8a53d3 | ||
|
|
1937ef2587 | ||
|
|
35510d3d39 | ||
|
|
ef5e4c2604 | ||
|
|
44401318e4 | ||
|
|
2e60d3111c | ||
|
|
e8734c02db | ||
|
|
54650d40a6 | ||
|
|
e7050834f5 | ||
|
|
893a0d69de | ||
|
|
345119866a | ||
|
|
ec917cf802 | ||
|
|
c7cfc0723b | ||
|
|
4f2685f4c4 | ||
|
|
439bfacfd9 | ||
|
|
cd4ac9c885 | ||
|
|
eeb60ba0df | ||
|
|
a481a34dcd | ||
|
|
b65576431e | ||
|
|
a927565868 | ||
|
|
0b67993eb0 | ||
|
|
4164e3d1a3 |
4
.github/workflows/Release.yml
vendored
4
.github/workflows/Release.yml
vendored
@@ -270,7 +270,7 @@ jobs:
|
||||
- name: Install development packages
|
||||
run: |
|
||||
apt update && apt --assume-yes upgrade
|
||||
apt --assume-yes install build-essential sed git wget bash
|
||||
apt --assume-yes install build-essential sed git wget bash libdrm-dev
|
||||
# Checkout git repository and submodules
|
||||
# fetch-depth must be 0 to use git describe
|
||||
# See: https://github.com/marketplace/actions/checkout
|
||||
@@ -321,7 +321,7 @@ jobs:
|
||||
- name: Install development packages
|
||||
run: |
|
||||
apt update && apt --assume-yes upgrade
|
||||
apt --assume-yes install build-essential sed git wget bash
|
||||
apt --assume-yes install build-essential sed git wget bash libdrm-dev
|
||||
# Checkout git repository and submodules
|
||||
# fetch-depth must be 0 to use git describe
|
||||
# See: https://github.com/marketplace/actions/checkout
|
||||
|
||||
13
Makefile
13
Makefile
@@ -27,6 +27,17 @@ $(APP): $(GOSRC) go.mod
|
||||
$(GOBIN) get
|
||||
$(GOBIN) build -o $(APP) $(GOSRC_APP)
|
||||
|
||||
# -ldflags:
|
||||
# -s : drops the OS symbol table
|
||||
# -w : drops DWARF
|
||||
# -> Panic stack traces still show function names and file:line
|
||||
.PHONY: build-stripped
|
||||
build-stripped:
|
||||
make -C collectors
|
||||
$(GOBIN) get
|
||||
$(GOBIN) build -ldflags "-s -w" -trimpath -o $(APP) $(GOSRC_APP)
|
||||
|
||||
.PHONY: install
|
||||
install: $(APP)
|
||||
@WORKSPACE=$(PREFIX)
|
||||
@if [ -z "$${WORKSPACE}" ]; then exit 1; fi
|
||||
@@ -89,7 +100,7 @@ staticcheck:
|
||||
.PHONY: golangci-lint
|
||||
golangci-lint:
|
||||
$(GOBIN) install github.com/golangci/golangci-lint/v2/cmd/golangci-lint@latest
|
||||
$$($(GOBIN) env GOPATH)/bin/golangci-lint run
|
||||
$$($(GOBIN) env GOPATH)/bin/golangci-lint run --enable errorlint,govet,misspell,modernize,prealloc,staticcheck,unconvert,wastedassign
|
||||
|
||||
.ONESHELL:
|
||||
.PHONY: RPM
|
||||
|
||||
16
README.md
16
README.md
@@ -13,11 +13,7 @@ hugo_path: docs/reference/cc-metric-collector/_index.md
|
||||
|
||||
A node agent for measuring, processing and forwarding node level metrics. It is part of the [ClusterCockpit ecosystem](https://clustercockpit.org/docs/overview/).
|
||||
|
||||
The metric collector sends (and receives) metric in the [InfluxDB line protocol](https://docs.influxdata.com/influxdb/cloud/reference/syntax/line-protocol/) as it provides flexibility while providing a separation between tags (like index columns in relational databases) and fields (like data columns).
|
||||
|
||||
There is a single timer loop that triggers all collectors serially, collects the collectors' data and sends the metrics to the sink. This is done as all data is submitted with a single time stamp. The sinks currently use mostly blocking APIs.
|
||||
|
||||
The receiver runs as a go routine side-by-side with the timer loop and asynchronously forwards received metrics to the sink.
|
||||
The `cc-metric-collector` sends (and maybe receives) metrics in the [InfluxDB line protocol](https://docs.influxdata.com/influxdb/cloud/reference/syntax/line-protocol/) as it provides flexibility while providing a separation between tags (like index columns in relational databases) and fields (like data columns). The `cc-metric-collector` consists of 4 components: collectors, router, sinks and receivers. The collectors read data from the current system and submit metrics to the router. The router can be configured to manipulate the metrics before forwarding them to the sinks. The receivers are also attached to the router like the collectors but they receive data from external source like other `cc-metric-collector` instances.
|
||||
|
||||
|
||||
[](https://doi.org/10.5281/zenodo.7438287)
|
||||
@@ -43,7 +39,7 @@ There is a main configuration file with basic settings that point to the other c
|
||||
}
|
||||
```
|
||||
|
||||
The `interval` defines how often the metrics should be read and send to the sink. The `duration` tells collectors how long one measurement has to take. This is important for some collectors, like the `likwid` collector. For more information, see [here](./docs/configuration.md).
|
||||
The `interval` defines how often the metrics should be read and send to the sink(s). The `duration` tells the collectors how long one measurement has to take. This is important for some collectors, like the `likwid` collector. For more information, see [here](./docs/configuration.md).
|
||||
|
||||
See the component READMEs for their configuration:
|
||||
|
||||
@@ -57,7 +53,7 @@ See the component READMEs for their configuration:
|
||||
```
|
||||
$ git clone git@github.com:ClusterCockpit/cc-metric-collector.git
|
||||
$ make (downloads LIKWID, builds it as static library with 'direct' accessmode and copies all required files for the collector)
|
||||
$ go get (requires at least golang 1.16)
|
||||
$ go get
|
||||
$ make
|
||||
```
|
||||
|
||||
@@ -67,11 +63,13 @@ For more information, see [here](./docs/building.md).
|
||||
|
||||
```
|
||||
$ ./cc-metric-collector --help
|
||||
Usage of metric-collector:
|
||||
Usage of ./cc-metric-collector:
|
||||
-config string
|
||||
Path to configuration file (default "./config.json")
|
||||
-log string
|
||||
Path for logfile (default "stderr")
|
||||
-loglevel string
|
||||
Set log level (default "info")
|
||||
-once
|
||||
Run all collectors only once
|
||||
```
|
||||
@@ -114,7 +112,7 @@ flowchart TD
|
||||
|
||||
# Contributing
|
||||
|
||||
The ClusterCockpit ecosystem is designed to be used by different HPC computing centers. Since configurations and setups differ between the centers, the centers likely have to put some work into the cc-metric-collector to gather all desired metrics.
|
||||
The ClusterCockpit ecosystem is designed to be used by different HPC computing centers. Since configurations and setups differ between the centers, the centers likely have to put some work into `cc-metric-collector` to gather all desired metrics.
|
||||
|
||||
You are free to open an issue to request a collector but we would also be happy about PRs.
|
||||
|
||||
|
||||
@@ -8,6 +8,7 @@
|
||||
package main
|
||||
|
||||
import (
|
||||
"bytes"
|
||||
"encoding/json"
|
||||
"flag"
|
||||
"os"
|
||||
@@ -48,22 +49,22 @@ type RuntimeConfig struct {
|
||||
Sync sync.WaitGroup
|
||||
}
|
||||
|
||||
// ReadCli reads the command line arguments
|
||||
func ReadCli() map[string]string {
|
||||
var m map[string]string
|
||||
cfg := flag.String("config", "./config.json", "Path to configuration file")
|
||||
logfile := flag.String("log", "stderr", "Path for logfile")
|
||||
once := flag.Bool("once", false, "Run all collectors only once")
|
||||
loglevel := flag.String("loglevel", "info", "Set log level")
|
||||
flag.Parse()
|
||||
m = make(map[string]string)
|
||||
m["configfile"] = *cfg
|
||||
m["logfile"] = *logfile
|
||||
m := map[string]string{
|
||||
"configfile": *cfg,
|
||||
"logfile": *logfile,
|
||||
"once": "false",
|
||||
"loglevel": *loglevel,
|
||||
}
|
||||
if *once {
|
||||
m["once"] = "true"
|
||||
} else {
|
||||
m["once"] = "false"
|
||||
}
|
||||
m["loglevel"] = *loglevel
|
||||
return m
|
||||
}
|
||||
|
||||
@@ -120,9 +121,10 @@ func mainFunc() int {
|
||||
|
||||
// Load and check configuration
|
||||
main := ccconf.GetPackageConfig("main")
|
||||
err = json.Unmarshal(main, &rcfg.ConfigFile)
|
||||
if err != nil {
|
||||
cclog.Error("Error reading configuration file ", rcfg.CliArgs["configfile"], ": ", err.Error())
|
||||
d := json.NewDecoder(bytes.NewReader(main))
|
||||
d.DisallowUnknownFields()
|
||||
if err := d.Decode(&rcfg.ConfigFile); err != nil {
|
||||
cclog.Errorf("Error reading configuration file %s: %v", rcfg.CliArgs["configfile"], err)
|
||||
return 1
|
||||
}
|
||||
|
||||
@@ -130,11 +132,11 @@ func mainFunc() int {
|
||||
if len(rcfg.ConfigFile.Interval) > 0 {
|
||||
t, err := time.ParseDuration(rcfg.ConfigFile.Interval)
|
||||
if err != nil {
|
||||
cclog.Error("Configuration value 'interval' no valid duration")
|
||||
cclog.Errorf("Configuration value interval=%s no valid duration", rcfg.ConfigFile.Interval)
|
||||
}
|
||||
rcfg.Interval = t
|
||||
if rcfg.Interval == 0 {
|
||||
cclog.Error("Configuration value 'interval' must be greater than zero")
|
||||
cclog.Errorf("Configuration value interval=%s must be greater than zero", rcfg.ConfigFile.Interval)
|
||||
return 1
|
||||
}
|
||||
}
|
||||
@@ -143,11 +145,11 @@ func mainFunc() int {
|
||||
if len(rcfg.ConfigFile.Duration) > 0 {
|
||||
t, err := time.ParseDuration(rcfg.ConfigFile.Duration)
|
||||
if err != nil {
|
||||
cclog.Error("Configuration value 'duration' no valid duration")
|
||||
cclog.Error("Configuration value duration=%s no valid duration", rcfg.ConfigFile.Duration)
|
||||
}
|
||||
rcfg.Duration = t
|
||||
if rcfg.Duration == 0 {
|
||||
cclog.Error("Configuration value 'duration' must be greater than zero")
|
||||
cclog.Error("Configuration value duration=%s must be greater than zero", rcfg.ConfigFile.Duration)
|
||||
return 1
|
||||
}
|
||||
}
|
||||
|
||||
@@ -59,6 +59,7 @@ In contrast to the configuration files for sinks and receivers, the collectors c
|
||||
* [ ] Aggreate metrics to higher topology entity (sum hwthread metrics to socket metric, ...). Needs to be configurable
|
||||
|
||||
# Contributing own collectors
|
||||
|
||||
A collector reads data from any source, parses it to metrics and submits these metrics to the `metric-collector`. A collector provides three function:
|
||||
|
||||
* `Name() string`: Return the name of the collector
|
||||
@@ -104,8 +105,10 @@ func (m *SampleCollector) Init(config json.RawMessage) error {
|
||||
return fmt.Errorf("%s Init(): setup() call failed: %w", m.name, err)
|
||||
}
|
||||
if len(config) > 0 {
|
||||
if err := json.Unmarshal(config, &m.config); err != nil {
|
||||
return fmt.Errorf("%s Init(): json.Unmarshal() call failed: %w", m.name, err)
|
||||
d := json.NewDecoder(bytes.NewReader(config))
|
||||
d.DisallowUnknownFields()
|
||||
if err := d.Decode(&m.config); err != nil {
|
||||
return fmt.Errorf("%s Init(): Error decoding JSON config: %w", m.name, err)
|
||||
}
|
||||
}
|
||||
m.meta = map[string]string{"source": m.name, "group": "Sample"}
|
||||
|
||||
@@ -32,7 +32,7 @@ const DEFAULT_BEEGFS_CMD = "beegfs-ctl"
|
||||
type BeegfsMetaCollectorConfig struct {
|
||||
Beegfs string `json:"beegfs_path"`
|
||||
ExcludeMetrics []string `json:"exclude_metrics,omitempty"`
|
||||
ExcludeFilesystem []string `json:"exclude_filesystem"`
|
||||
ExcludeFilesystems []string `json:"exclude_filesystem"`
|
||||
}
|
||||
|
||||
type BeegfsMetaCollector struct {
|
||||
@@ -74,9 +74,10 @@ func (m *BeegfsMetaCollector) Init(config json.RawMessage) error {
|
||||
|
||||
// Read JSON configuration
|
||||
if len(config) > 0 {
|
||||
err := json.Unmarshal(config, &m.config)
|
||||
if err != nil {
|
||||
return err
|
||||
d := json.NewDecoder(bytes.NewReader(config))
|
||||
d.DisallowUnknownFields()
|
||||
if err := d.Decode(&m.config); err != nil {
|
||||
return fmt.Errorf("%s Init(): Failed to decode JSON config: %w", m.name, err)
|
||||
}
|
||||
}
|
||||
|
||||
@@ -99,23 +100,23 @@ func (m *BeegfsMetaCollector) Init(config json.RawMessage) error {
|
||||
"filesystem": "",
|
||||
}
|
||||
m.skipFS = make(map[string]struct{})
|
||||
for _, fs := range m.config.ExcludeFilesystem {
|
||||
for _, fs := range m.config.ExcludeFilesystems {
|
||||
m.skipFS[fs] = struct{}{}
|
||||
}
|
||||
|
||||
// Beegfs file system statistics can only be queried by user root
|
||||
user, err := user.Current()
|
||||
if err != nil {
|
||||
return fmt.Errorf("BeegfsMetaCollector.Init(): Failed to get current user: %w", err)
|
||||
return fmt.Errorf("%s Init(): Failed to get current user: %w", m.name, err)
|
||||
}
|
||||
if user.Uid != "0" {
|
||||
return fmt.Errorf("BeegfsMetaCollector.Init(): BeeGFS file system statistics can only be queried by user root")
|
||||
return fmt.Errorf("%s Init(): BeeGFS file system statistics can only be queried by user root", m.name)
|
||||
}
|
||||
|
||||
// Check if beegfs-ctl is in executable search path
|
||||
_, err = exec.LookPath(m.config.Beegfs)
|
||||
if err != nil {
|
||||
return fmt.Errorf("BeegfsMetaCollector.Init(): Failed to find beegfs-ctl binary '%s': %w", m.config.Beegfs, err)
|
||||
return fmt.Errorf("%s Init(): Failed to find beegfs-ctl binary '%s': %w", m.name, m.config.Beegfs, err)
|
||||
}
|
||||
m.init = true
|
||||
return nil
|
||||
@@ -208,16 +209,16 @@ func (m *BeegfsMetaCollector) Read(interval time.Duration, output chan lp.CCMess
|
||||
} else {
|
||||
f1, err := strconv.ParseFloat(m.matches["other"], 32)
|
||||
if err != nil {
|
||||
cclog.ComponentError(
|
||||
cclog.ComponentErrorf(
|
||||
m.name,
|
||||
fmt.Sprintf("Metric (other): Failed to convert str written '%s' to float: %v", m.matches["other"], err))
|
||||
"Metric (other): Failed to convert str written '%s' to float: %v", m.matches["other"], err)
|
||||
continue
|
||||
}
|
||||
f2, err := strconv.ParseFloat(split[i], 32)
|
||||
if err != nil {
|
||||
cclog.ComponentError(
|
||||
cclog.ComponentErrorf(
|
||||
m.name,
|
||||
fmt.Sprintf("Metric (other): Failed to convert str written '%s' to float: %v", m.matches["other"], err))
|
||||
"Metric (other): Failed to convert str written '%s' to float: %v", m.matches["other"], err)
|
||||
continue
|
||||
}
|
||||
m.matches["beegfs_cstorage_other"] = fmt.Sprintf("%f", f1+f2)
|
||||
@@ -226,8 +227,7 @@ func (m *BeegfsMetaCollector) Read(interval time.Duration, output chan lp.CCMess
|
||||
|
||||
for key, data := range m.matches {
|
||||
value, _ := strconv.ParseFloat(data, 32)
|
||||
y, err := lp.NewMessage(key, m.tags, m.meta, map[string]any{"value": value}, time.Now())
|
||||
if err == nil {
|
||||
if y, err := lp.NewMetric(key, m.tags, m.meta, value, time.Now()); err == nil {
|
||||
output <- y
|
||||
}
|
||||
}
|
||||
|
||||
@@ -30,7 +30,7 @@ import (
|
||||
type BeegfsStorageCollectorConfig struct {
|
||||
Beegfs string `json:"beegfs_path"`
|
||||
ExcludeMetrics []string `json:"exclude_metrics,omitempty"`
|
||||
ExcludeFilesystem []string `json:"exclude_filesystem"`
|
||||
ExcludeFilesystems []string `json:"exclude_filesystem"`
|
||||
}
|
||||
|
||||
type BeegfsStorageCollector struct {
|
||||
@@ -67,9 +67,10 @@ func (m *BeegfsStorageCollector) Init(config json.RawMessage) error {
|
||||
|
||||
// Read JSON configuration
|
||||
if len(config) > 0 {
|
||||
err := json.Unmarshal(config, &m.config)
|
||||
if err != nil {
|
||||
return err
|
||||
d := json.NewDecoder(bytes.NewReader(config))
|
||||
d.DisallowUnknownFields()
|
||||
if err := d.Decode(&m.config); err != nil {
|
||||
return fmt.Errorf("%s Init(): failed to decode JSON config: %w", m.name, err)
|
||||
}
|
||||
}
|
||||
|
||||
@@ -92,23 +93,23 @@ func (m *BeegfsStorageCollector) Init(config json.RawMessage) error {
|
||||
"filesystem": "",
|
||||
}
|
||||
m.skipFS = make(map[string]struct{})
|
||||
for _, fs := range m.config.ExcludeFilesystem {
|
||||
for _, fs := range m.config.ExcludeFilesystems {
|
||||
m.skipFS[fs] = struct{}{}
|
||||
}
|
||||
|
||||
// Beegfs file system statistics can only be queried by user root
|
||||
user, err := user.Current()
|
||||
if err != nil {
|
||||
return fmt.Errorf("BeegfsStorageCollector.Init(): Failed to get current user: %w", err)
|
||||
return fmt.Errorf("%s Init(): Failed to get current user: %w", m.name, err)
|
||||
}
|
||||
if user.Uid != "0" {
|
||||
return fmt.Errorf("BeegfsStorageCollector.Init(): BeeGFS file system statistics can only be queried by user root")
|
||||
return fmt.Errorf("%s Init(): BeeGFS file system statistics can only be queried by user root", m.name)
|
||||
}
|
||||
|
||||
// Check if beegfs-ctl is in executable search path
|
||||
_, err = exec.LookPath(m.config.Beegfs)
|
||||
if err != nil {
|
||||
return fmt.Errorf("BeegfsStorageCollector.Init(): Failed to find beegfs-ctl binary '%s': %w", m.config.Beegfs, err)
|
||||
return fmt.Errorf("%s Init(): Failed to find beegfs-ctl binary '%s': %w", m.name, m.config.Beegfs, err)
|
||||
}
|
||||
m.init = true
|
||||
return nil
|
||||
@@ -199,16 +200,16 @@ func (m *BeegfsStorageCollector) Read(interval time.Duration, output chan lp.CCM
|
||||
} else {
|
||||
f1, err := strconv.ParseFloat(m.matches["other"], 32)
|
||||
if err != nil {
|
||||
cclog.ComponentError(
|
||||
cclog.ComponentErrorf(
|
||||
m.name,
|
||||
fmt.Sprintf("Metric (other): Failed to convert str written '%s' to float: %v", m.matches["other"], err))
|
||||
"Metric (other): Failed to convert str written '%s' to float: %v", m.matches["other"], err)
|
||||
continue
|
||||
}
|
||||
f2, err := strconv.ParseFloat(split[i], 32)
|
||||
if err != nil {
|
||||
cclog.ComponentError(
|
||||
cclog.ComponentErrorf(
|
||||
m.name,
|
||||
fmt.Sprintf("Metric (other): Failed to convert str written '%s' to float: %v", m.matches["other"], err))
|
||||
"Metric (other): Failed to convert str written '%s' to float: %v", m.matches["other"], err)
|
||||
continue
|
||||
}
|
||||
m.matches["beegfs_cstorage_other"] = fmt.Sprintf("%f", f1+f2)
|
||||
@@ -217,8 +218,7 @@ func (m *BeegfsStorageCollector) Read(interval time.Duration, output chan lp.CCM
|
||||
|
||||
for key, data := range m.matches {
|
||||
value, _ := strconv.ParseFloat(data, 32)
|
||||
y, err := lp.NewMessage(key, m.tags, m.meta, map[string]any{"value": value}, time.Now())
|
||||
if err == nil {
|
||||
if y, err := lp.NewMetric(key, m.tags, m.meta, value, time.Now()); err == nil {
|
||||
output <- y
|
||||
}
|
||||
}
|
||||
|
||||
@@ -8,6 +8,7 @@
|
||||
package collectors
|
||||
|
||||
import (
|
||||
"bytes"
|
||||
"encoding/json"
|
||||
"fmt"
|
||||
"sync"
|
||||
@@ -48,6 +49,8 @@ var AvailableCollectors = map[string]MetricCollector{
|
||||
"schedstat": new(SchedstatCollector),
|
||||
"nfsiostat": new(NfsIOStatCollector),
|
||||
"slurm_cgroup": new(SlurmCgroupCollector),
|
||||
"smartmon": new(SmartMonCollector),
|
||||
"nvidia_gpm": new(NvidiaGPMCollector),
|
||||
}
|
||||
|
||||
// Metric collector manager data structure
|
||||
@@ -88,26 +91,26 @@ func (cm *collectorManager) Init(ticker mct.MultiChanTicker, duration time.Durat
|
||||
cm.ticker = ticker
|
||||
cm.duration = duration
|
||||
|
||||
err := json.Unmarshal(collectConfig, &cm.config)
|
||||
if err != nil {
|
||||
cclog.Error(err.Error())
|
||||
return err
|
||||
d := json.NewDecoder(bytes.NewReader(collectConfig))
|
||||
d.DisallowUnknownFields()
|
||||
if err := d.Decode(&cm.config); err != nil {
|
||||
return fmt.Errorf("%s Init(): Error decoding collector manager config: %w", "CollectorManager", err)
|
||||
}
|
||||
|
||||
// Initialize configured collectors
|
||||
for collectorName, collectorCfg := range cm.config {
|
||||
if _, found := AvailableCollectors[collectorName]; !found {
|
||||
cclog.ComponentError("CollectorManager", "SKIP unknown collector", collectorName)
|
||||
cclog.ComponentErrorf("CollectorManager", "SKIP unknown collector %s", collectorName)
|
||||
continue
|
||||
}
|
||||
collector := AvailableCollectors[collectorName]
|
||||
|
||||
err = collector.Init(collectorCfg)
|
||||
err := collector.Init(collectorCfg)
|
||||
if err != nil {
|
||||
cclog.ComponentError("CollectorManager", fmt.Sprintf("Collector %s initialization failed: %v", collectorName, err))
|
||||
cclog.ComponentErrorf("CollectorManager", "Collector %s initialization failed: %v", collectorName, err)
|
||||
continue
|
||||
}
|
||||
cclog.ComponentDebug("CollectorManager", "ADD COLLECTOR", collector.Name())
|
||||
cclog.ComponentDebugf("CollectorManager", "ADD COLLECTOR %s", collector.Name())
|
||||
if collector.Parallel() {
|
||||
cm.collectors = append(cm.collectors, collector)
|
||||
} else {
|
||||
@@ -153,7 +156,7 @@ func (cm *collectorManager) Start() {
|
||||
return
|
||||
default:
|
||||
// Read metrics from collector c via goroutine
|
||||
cclog.ComponentDebug("CollectorManager", c.Name(), t)
|
||||
cclog.ComponentDebugf("CollectorManager: Read %s at %v", c.Name(), t)
|
||||
cm.collector_wg.Add(1)
|
||||
go func(myc MetricCollector) {
|
||||
myc.Read(cm.duration, cm.output)
|
||||
@@ -171,7 +174,7 @@ func (cm *collectorManager) Start() {
|
||||
return
|
||||
default:
|
||||
// Read metrics from collector c
|
||||
cclog.ComponentDebug("CollectorManager", c.Name(), t)
|
||||
cclog.ComponentDebugf("CollectorManager: Read %s at %v", c.Name(), t)
|
||||
c.Read(cm.duration, cm.output)
|
||||
}
|
||||
}
|
||||
|
||||
@@ -139,16 +139,16 @@ func (m *CPUFreqCpuInfoCollector) Read(interval time.Duration, output chan lp.CC
|
||||
const cpuInfoFile = "/proc/cpuinfo"
|
||||
file, err := os.Open(cpuInfoFile)
|
||||
if err != nil {
|
||||
cclog.ComponentError(
|
||||
cclog.ComponentErrorf(
|
||||
m.name,
|
||||
fmt.Sprintf("Read(): Failed to open file '%s': %v", cpuInfoFile, err))
|
||||
"Read(): Failed to open file '%s': %v", cpuInfoFile, err)
|
||||
return
|
||||
}
|
||||
defer func() {
|
||||
if err := file.Close(); err != nil {
|
||||
cclog.ComponentError(
|
||||
cclog.ComponentErrorf(
|
||||
m.name,
|
||||
fmt.Sprintf("Read(): Failed to close file '%s': %v", cpuInfoFile, err))
|
||||
"Read(): Failed to close file '%s': %v", cpuInfoFile, err)
|
||||
}
|
||||
}()
|
||||
|
||||
@@ -166,12 +166,12 @@ func (m *CPUFreqCpuInfoCollector) Read(interval time.Duration, output chan lp.CC
|
||||
if !t.isHT {
|
||||
value, err := strconv.ParseFloat(strings.TrimSpace(lineSplit[1]), 64)
|
||||
if err != nil {
|
||||
cclog.ComponentError(
|
||||
cclog.ComponentErrorf(
|
||||
m.name,
|
||||
fmt.Sprintf("Read(): Failed to convert cpu MHz '%s' to float64: %v", lineSplit[1], err))
|
||||
"Read(): Failed to convert cpu MHz '%s' to float64: %v", lineSplit[1], err)
|
||||
return
|
||||
}
|
||||
if y, err := lp.NewMessage("cpufreq", t.tagSet, m.meta, map[string]any{"value": value}, now); err == nil {
|
||||
if y, err := lp.NewMetric("cpufreq", t.tagSet, m.meta, value, now); err == nil {
|
||||
output <- y
|
||||
}
|
||||
}
|
||||
|
||||
@@ -12,7 +12,9 @@ hugo_path: docs/reference/cc-metric-collector/collectors/cpufreq_cpuinfo.md
|
||||
## `cpufreq_cpuinfo` collector
|
||||
|
||||
```json
|
||||
"cpufreq_cpuinfo": {}
|
||||
"cpufreq_cpuinfo": {
|
||||
"exclude_metrics": []
|
||||
}
|
||||
```
|
||||
|
||||
The `cpufreq_cpuinfo` collector reads the clock frequency from `/proc/cpuinfo` and outputs a handful **hwthread** metrics.
|
||||
|
||||
@@ -8,6 +8,7 @@
|
||||
package collectors
|
||||
|
||||
import (
|
||||
"bytes"
|
||||
"encoding/json"
|
||||
"fmt"
|
||||
"os"
|
||||
@@ -54,9 +55,10 @@ func (m *CPUFreqCollector) Init(config json.RawMessage) error {
|
||||
}
|
||||
m.parallel = true
|
||||
if len(config) > 0 {
|
||||
err := json.Unmarshal(config, &m.config)
|
||||
if err != nil {
|
||||
return err
|
||||
d := json.NewDecoder(bytes.NewReader(config))
|
||||
d.DisallowUnknownFields()
|
||||
if err := d.Decode(&m.config); err != nil {
|
||||
return fmt.Errorf("%s Init(): failed to decode JSON config: %w", m.name, err)
|
||||
}
|
||||
}
|
||||
m.meta = map[string]string{
|
||||
@@ -77,7 +79,7 @@ func (m *CPUFreqCollector) Init(config json.RawMessage) error {
|
||||
scalingCurFreqFile := filepath.Join("/sys/devices/system/cpu", fmt.Sprintf("cpu%d", c.CpuID), "cpufreq/scaling_cur_freq")
|
||||
err := unix.Access(scalingCurFreqFile, unix.R_OK)
|
||||
if err != nil {
|
||||
return fmt.Errorf("unable to access file '%s': %w", scalingCurFreqFile, err)
|
||||
return fmt.Errorf("%s Init(): unable to access file '%s': %w", m.name, scalingCurFreqFile, err)
|
||||
}
|
||||
|
||||
m.topology = append(m.topology,
|
||||
@@ -93,10 +95,7 @@ func (m *CPUFreqCollector) Init(config json.RawMessage) error {
|
||||
}
|
||||
|
||||
// Initialized
|
||||
cclog.ComponentDebug(
|
||||
m.name,
|
||||
"initialized",
|
||||
len(m.topology), "non-hyper-threading CPUs")
|
||||
cclog.ComponentDebugf(m.name, "initialized %d non-hyper-threading CPUs")
|
||||
m.init = true
|
||||
return nil
|
||||
}
|
||||
@@ -114,20 +113,18 @@ func (m *CPUFreqCollector) Read(interval time.Duration, output chan lp.CCMessage
|
||||
// Read current frequency
|
||||
line, err := os.ReadFile(t.scalingCurFreqFile)
|
||||
if err != nil {
|
||||
cclog.ComponentError(
|
||||
m.name,
|
||||
fmt.Sprintf("Read(): Failed to read file '%s': %v", t.scalingCurFreqFile, err))
|
||||
cclog.ComponentErrorf(
|
||||
m.name, "Read(): Failed to read file '%s': %v", t.scalingCurFreqFile, err)
|
||||
continue
|
||||
}
|
||||
cpuFreq, err := strconv.ParseInt(strings.TrimSpace(string(line)), 10, 64)
|
||||
if err != nil {
|
||||
cclog.ComponentError(
|
||||
m.name,
|
||||
fmt.Sprintf("Read(): Failed to convert CPU frequency '%s' to int64: %v", line, err))
|
||||
m.name, "Read(): Failed to convert CPU frequency '%s' to int64: %v", line, err)
|
||||
continue
|
||||
}
|
||||
|
||||
if y, err := lp.NewMessage("cpufreq", t.tagSet, m.meta, map[string]any{"value": cpuFreq}, now); err == nil {
|
||||
if y, err := lp.NewMetric("cpufreq", t.tagSet, m.meta, cpuFreq, now); err == nil {
|
||||
output <- y
|
||||
}
|
||||
}
|
||||
|
||||
@@ -9,6 +9,7 @@ package collectors
|
||||
|
||||
import (
|
||||
"bufio"
|
||||
"bytes"
|
||||
"encoding/json"
|
||||
"fmt"
|
||||
"os"
|
||||
@@ -26,6 +27,7 @@ const CPUSTATFILE = `/proc/stat`
|
||||
|
||||
type CpustatCollectorConfig struct {
|
||||
ExcludeMetrics []string `json:"exclude_metrics,omitempty"`
|
||||
excludeNumCPUs bool
|
||||
}
|
||||
|
||||
type CpustatCollector struct {
|
||||
@@ -53,9 +55,10 @@ func (m *CpustatCollector) Init(config json.RawMessage) error {
|
||||
"type": "node",
|
||||
}
|
||||
if len(config) > 0 {
|
||||
err := json.Unmarshal(config, &m.config)
|
||||
if err != nil {
|
||||
return err
|
||||
d := json.NewDecoder(bytes.NewReader(config))
|
||||
d.DisallowUnknownFields()
|
||||
if err := d.Decode(&m.config); err != nil {
|
||||
return fmt.Errorf("%s Init(): Error decoding JSON config: %w", m.name, err)
|
||||
}
|
||||
}
|
||||
matches := map[string]int{
|
||||
@@ -77,21 +80,13 @@ func (m *CpustatCollector) Init(config json.RawMessage) error {
|
||||
m.matches[match] = index
|
||||
}
|
||||
}
|
||||
m.config.excludeNumCPUs = slices.Contains(m.config.ExcludeMetrics, "num_cpus")
|
||||
|
||||
// Check input file
|
||||
file, err := os.Open(string(CPUSTATFILE))
|
||||
file, err := os.Open(CPUSTATFILE)
|
||||
if err != nil {
|
||||
cclog.ComponentError(
|
||||
m.name,
|
||||
fmt.Sprintf("Init(): Failed to open file '%s': %v", string(CPUSTATFILE), err))
|
||||
return fmt.Errorf("%s Init(): Failed to open file '%s': %w", m.name, CPUSTATFILE, err)
|
||||
}
|
||||
defer func() {
|
||||
if err := file.Close(); err != nil {
|
||||
cclog.ComponentError(
|
||||
m.name,
|
||||
fmt.Sprintf("Init(): Failed to close file '%s': %v", string(CPUSTATFILE), err))
|
||||
}
|
||||
}()
|
||||
|
||||
// Pre-generate tags for all CPUs
|
||||
num_cpus := 0
|
||||
@@ -102,11 +97,13 @@ func (m *CpustatCollector) Init(config json.RawMessage) error {
|
||||
line := scanner.Text()
|
||||
linefields := strings.Fields(line)
|
||||
if strings.Compare(linefields[0], "cpu") == 0 {
|
||||
// Kernel system statistics for all CPUs
|
||||
m.olddata["cpu"] = make(map[string]int64)
|
||||
for k, v := range m.matches {
|
||||
m.olddata["cpu"][k], _ = strconv.ParseInt(linefields[v], 0, 64)
|
||||
}
|
||||
} else if strings.HasPrefix(linefields[0], "cpu") && strings.Compare(linefields[0], "cpu") != 0 {
|
||||
// Kernel system statistics per CPU
|
||||
cpustr := strings.TrimLeft(linefields[0], "cpu")
|
||||
cpu, _ := strconv.Atoi(cpustr)
|
||||
m.cputags[linefields[0]] = map[string]string{
|
||||
@@ -120,6 +117,12 @@ func (m *CpustatCollector) Init(config json.RawMessage) error {
|
||||
num_cpus++
|
||||
}
|
||||
}
|
||||
|
||||
// Close file
|
||||
if err := file.Close(); err != nil {
|
||||
return fmt.Errorf("%s Init(): Failed to close file '%s': %w", m.name, CPUSTATFILE, err)
|
||||
}
|
||||
|
||||
m.lastTimestamp = time.Now()
|
||||
m.init = true
|
||||
return nil
|
||||
@@ -142,7 +145,7 @@ func (m *CpustatCollector) parseStatLine(linefields []string, tags map[string]st
|
||||
sum := float64(0)
|
||||
for name, value := range values {
|
||||
sum += value
|
||||
y, err := lp.NewMessage(name, tags, m.meta, map[string]any{"value": value * 100}, now)
|
||||
y, err := lp.NewMetric(name, tags, m.meta, value*100, now)
|
||||
if err == nil {
|
||||
y.AddTag("unit", "Percent")
|
||||
output <- y
|
||||
@@ -150,7 +153,7 @@ func (m *CpustatCollector) parseStatLine(linefields []string, tags map[string]st
|
||||
}
|
||||
if v, ok := values["cpu_idle"]; ok {
|
||||
sum -= v
|
||||
y, err := lp.NewMessage("cpu_used", tags, m.meta, map[string]any{"value": sum * 100}, now)
|
||||
y, err := lp.NewMetric("cpu_used", tags, m.meta, sum*100, now)
|
||||
if err == nil {
|
||||
y.AddTag("unit", "Percent")
|
||||
output <- y
|
||||
@@ -166,17 +169,17 @@ func (m *CpustatCollector) Read(interval time.Duration, output chan lp.CCMessage
|
||||
now := time.Now()
|
||||
tsdelta := now.Sub(m.lastTimestamp)
|
||||
|
||||
file, err := os.Open(string(CPUSTATFILE))
|
||||
file, err := os.Open(CPUSTATFILE)
|
||||
if err != nil {
|
||||
cclog.ComponentError(
|
||||
cclog.ComponentErrorf(
|
||||
m.name,
|
||||
fmt.Sprintf("Read(): Failed to open file '%s': %v", string(CPUSTATFILE), err))
|
||||
"Read(): Failed to open file '%s': %v", CPUSTATFILE, err)
|
||||
}
|
||||
defer func() {
|
||||
if err := file.Close(); err != nil {
|
||||
cclog.ComponentError(
|
||||
cclog.ComponentErrorf(
|
||||
m.name,
|
||||
fmt.Sprintf("Read(): Failed to close file '%s': %v", string(CPUSTATFILE), err))
|
||||
"Read(): Failed to close file '%s': %v", string(CPUSTATFILE), err)
|
||||
}
|
||||
}()
|
||||
|
||||
@@ -192,15 +195,11 @@ func (m *CpustatCollector) Read(interval time.Duration, output chan lp.CCMessage
|
||||
}
|
||||
}
|
||||
|
||||
num_cpus_metric, err := lp.NewMessage("num_cpus",
|
||||
m.nodetags,
|
||||
m.meta,
|
||||
map[string]any{"value": num_cpus},
|
||||
now,
|
||||
)
|
||||
if err == nil {
|
||||
if !m.config.excludeNumCPUs {
|
||||
if num_cpus_metric, err := lp.NewMetric("num_cpus", m.nodetags, m.meta, num_cpus, now); err == nil {
|
||||
output <- num_cpus_metric
|
||||
}
|
||||
}
|
||||
|
||||
m.lastTimestamp = now
|
||||
}
|
||||
|
||||
@@ -8,8 +8,8 @@
|
||||
package collectors
|
||||
|
||||
import (
|
||||
"bytes"
|
||||
"encoding/json"
|
||||
"errors"
|
||||
"fmt"
|
||||
"os"
|
||||
"os/exec"
|
||||
@@ -47,8 +47,10 @@ func (m *CustomCmdCollector) Init(config json.RawMessage) error {
|
||||
|
||||
// Read configuration
|
||||
if len(config) > 0 {
|
||||
if err := json.Unmarshal(config, &m.config); err != nil {
|
||||
return fmt.Errorf("%s Init(): json.Unmarshal() call failed: %w", m.name, err)
|
||||
d := json.NewDecoder(bytes.NewReader(config))
|
||||
d.DisallowUnknownFields()
|
||||
if err := d.Decode(&m.config); err != nil {
|
||||
return fmt.Errorf("%s Init(): Error decoding JSON config: %w", m.name, err)
|
||||
}
|
||||
}
|
||||
|
||||
@@ -62,9 +64,9 @@ func (m *CustomCmdCollector) Init(config json.RawMessage) error {
|
||||
cmdFields := strings.Fields(c)
|
||||
command := exec.Command(cmdFields[0], cmdFields[1:]...)
|
||||
if _, err := command.Output(); err != nil {
|
||||
cclog.ComponentWarn(
|
||||
cclog.ComponentWarnf(
|
||||
m.name,
|
||||
fmt.Sprintf("%s Init(): Execution of command \"%s\" failed: %v", m.name, command.String(), err))
|
||||
"%s Init(): Execution of command \"%s\" failed: %v", m.name, command.String(), err)
|
||||
continue
|
||||
}
|
||||
m.cmdFieldsSlice = append(m.cmdFieldsSlice, cmdFields)
|
||||
@@ -75,14 +77,14 @@ func (m *CustomCmdCollector) Init(config json.RawMessage) error {
|
||||
if _, err := os.ReadFile(fileName); err != nil {
|
||||
cclog.ComponentWarn(
|
||||
m.name,
|
||||
fmt.Sprintf("%s Init(): Reading of file \"%s\" failed: %v", m.name, fileName, err))
|
||||
"%s Init(): Reading of file \"%s\" failed: %v", m.name, fileName, err)
|
||||
continue
|
||||
}
|
||||
m.files = append(m.files, fileName)
|
||||
}
|
||||
|
||||
if len(m.files) == 0 && len(m.cmdFieldsSlice) == 0 {
|
||||
return errors.New("no metrics to collect")
|
||||
return fmt.Errorf("%s Init(): no metrics to collect", m.name)
|
||||
}
|
||||
m.init = true
|
||||
return nil
|
||||
@@ -98,20 +100,18 @@ func (m *CustomCmdCollector) Read(interval time.Duration, output chan lp.CCMessa
|
||||
command := exec.Command(cmdFields[0], cmdFields[1:]...)
|
||||
stdout, err := command.Output()
|
||||
if err != nil {
|
||||
cclog.ComponentError(
|
||||
cclog.ComponentErrorf(
|
||||
m.name,
|
||||
fmt.Sprintf("Read(): Failed to read command output for command \"%s\": %v", command.String(), err),
|
||||
)
|
||||
"Read(): Failed to read command output for command \"%s\": %v", command.String(), err)
|
||||
continue
|
||||
}
|
||||
|
||||
// Read and decode influxDB line-protocol from command output
|
||||
metrics, err := lp.FromBytes(stdout)
|
||||
if err != nil {
|
||||
cclog.ComponentError(
|
||||
cclog.ComponentErrorf(
|
||||
m.name,
|
||||
fmt.Sprintf("Read(): Failed to decode influx Message: %v", err),
|
||||
)
|
||||
"Read(): Failed to decode influx Message: %v", err)
|
||||
continue
|
||||
}
|
||||
for _, metric := range metrics {
|
||||
@@ -126,20 +126,18 @@ func (m *CustomCmdCollector) Read(interval time.Duration, output chan lp.CCMessa
|
||||
for _, filename := range m.files {
|
||||
input, err := os.ReadFile(filename)
|
||||
if err != nil {
|
||||
cclog.ComponentError(
|
||||
cclog.ComponentErrorf(
|
||||
m.name,
|
||||
fmt.Sprintf("Read(): Failed to read file \"%s\": %v\n", filename, err),
|
||||
)
|
||||
"Read(): Failed to read file \"%s\": %v\n", filename, err)
|
||||
continue
|
||||
}
|
||||
|
||||
// Read and decode influxDB line-protocol from file
|
||||
metrics, err := lp.FromBytes(input)
|
||||
if err != nil {
|
||||
cclog.ComponentError(
|
||||
cclog.ComponentErrorf(
|
||||
m.name,
|
||||
fmt.Sprintf("Read(): Failed to decode influx Message: %v", err),
|
||||
)
|
||||
"Read(): Failed to decode influx Message: %v", err)
|
||||
continue
|
||||
}
|
||||
for _, metric := range metrics {
|
||||
|
||||
@@ -9,6 +9,7 @@ package collectors
|
||||
|
||||
import (
|
||||
"bufio"
|
||||
"bytes"
|
||||
"encoding/json"
|
||||
"fmt"
|
||||
"os"
|
||||
@@ -42,8 +43,10 @@ func (m *DiskstatCollector) Init(config json.RawMessage) error {
|
||||
return fmt.Errorf("%s Init(): setup() call failed: %w", m.name, err)
|
||||
}
|
||||
if len(config) > 0 {
|
||||
if err := json.Unmarshal(config, &m.config); err != nil {
|
||||
return err
|
||||
d := json.NewDecoder(bytes.NewReader(config))
|
||||
d.DisallowUnknownFields()
|
||||
if err := d.Decode(&m.config); err != nil {
|
||||
return fmt.Errorf("%s Init(): Error decoding JSON config: %w", m.name, err)
|
||||
}
|
||||
}
|
||||
m.allowedMetrics = map[string]bool{
|
||||
@@ -74,16 +77,16 @@ func (m *DiskstatCollector) Read(interval time.Duration, output chan lp.CCMessag
|
||||
|
||||
file, err := os.Open(MOUNTFILE)
|
||||
if err != nil {
|
||||
cclog.ComponentError(
|
||||
cclog.ComponentErrorf(
|
||||
m.name,
|
||||
fmt.Sprintf("Read(): Failed to open file '%s': %v", MOUNTFILE, err))
|
||||
"Read(): Failed to open file '%s': %v", MOUNTFILE, err)
|
||||
return
|
||||
}
|
||||
defer func() {
|
||||
if err := file.Close(); err != nil {
|
||||
cclog.ComponentError(
|
||||
m.name,
|
||||
fmt.Sprintf("Read(): Failed to close file '%s': %v", MOUNTFILE, err))
|
||||
"Read(): Failed to close file '%s': %v", MOUNTFILE, err)
|
||||
}
|
||||
}()
|
||||
|
||||
@@ -125,30 +128,14 @@ mountLoop:
|
||||
tags := map[string]string{"type": "node", "device": linefields[0]}
|
||||
total := (stat.Blocks * uint64(stat.Bsize)) / uint64(1000_000_000)
|
||||
if m.allowedMetrics["disk_total"] {
|
||||
y, err := lp.NewMessage(
|
||||
"disk_total",
|
||||
tags,
|
||||
m.meta,
|
||||
map[string]any{
|
||||
"value": total,
|
||||
},
|
||||
time.Now())
|
||||
if err == nil {
|
||||
if y, err := lp.NewMetric("disk_total", tags, m.meta, total, time.Now()); err == nil {
|
||||
y.AddMeta("unit", "GBytes")
|
||||
output <- y
|
||||
}
|
||||
}
|
||||
free := (stat.Bfree * uint64(stat.Bsize)) / uint64(1000_000_000)
|
||||
if m.allowedMetrics["disk_free"] {
|
||||
y, err := lp.NewMessage(
|
||||
"disk_free",
|
||||
tags,
|
||||
m.meta,
|
||||
map[string]any{
|
||||
"value": free,
|
||||
},
|
||||
time.Now())
|
||||
if err == nil {
|
||||
if y, err := lp.NewMetric("disk_free", tags, m.meta, free, time.Now()); err == nil {
|
||||
y.AddMeta("unit", "GBytes")
|
||||
output <- y
|
||||
}
|
||||
@@ -161,16 +148,7 @@ mountLoop:
|
||||
}
|
||||
}
|
||||
if m.allowedMetrics["part_max_used"] {
|
||||
y, err := lp.NewMessage(
|
||||
"part_max_used",
|
||||
map[string]string{
|
||||
"type": "node",
|
||||
},
|
||||
m.meta,
|
||||
map[string]any{
|
||||
"value": int(part_max_used),
|
||||
},
|
||||
time.Now())
|
||||
y, err := lp.NewMetric("part_max_used", map[string]string{"type": "node"}, m.meta, int(part_max_used), time.Now())
|
||||
if err == nil {
|
||||
y.AddMeta("unit", "percent")
|
||||
output <- y
|
||||
|
||||
@@ -32,7 +32,7 @@ type GpfsCollectorState map[string]int64
|
||||
|
||||
type GpfsCollectorConfig struct {
|
||||
Mmpmon string `json:"mmpmon_path,omitempty"`
|
||||
ExcludeFilesystem []string `json:"exclude_filesystem,omitempty"`
|
||||
ExcludeFilesystems []string `json:"exclude_filesystem,omitempty"`
|
||||
ExcludeMetrics []string `json:"exclude_metrics,omitempty"`
|
||||
Sudo bool `json:"use_sudo,omitempty"`
|
||||
SendAbsoluteValues bool `json:"send_abs_values,omitempty"`
|
||||
@@ -322,9 +322,10 @@ func (m *GpfsCollector) Init(config json.RawMessage) error {
|
||||
|
||||
// Read JSON configuration
|
||||
if len(config) > 0 {
|
||||
err := json.Unmarshal(config, &m.config)
|
||||
if err != nil {
|
||||
return fmt.Errorf("%s Init(): failed to unmarshal JSON config: %w", m.name, err)
|
||||
d := json.NewDecoder(bytes.NewReader(config))
|
||||
d.DisallowUnknownFields()
|
||||
if err := d.Decode(&m.config); err != nil {
|
||||
return fmt.Errorf("%s Init(): failed to decode JSON config: %w", m.name, err)
|
||||
}
|
||||
}
|
||||
m.meta = map[string]string{
|
||||
@@ -336,7 +337,7 @@ func (m *GpfsCollector) Init(config json.RawMessage) error {
|
||||
"filesystem": "",
|
||||
}
|
||||
m.skipFS = make(map[string]struct{})
|
||||
for _, fs := range m.config.ExcludeFilesystem {
|
||||
for _, fs := range m.config.ExcludeFilesystems {
|
||||
m.skipFS[fs] = struct{}{}
|
||||
}
|
||||
m.lastState = make(map[string]GpfsCollectorState)
|
||||
@@ -346,18 +347,15 @@ func (m *GpfsCollector) Init(config json.RawMessage) error {
|
||||
if !m.config.Sudo {
|
||||
user, err := user.Current()
|
||||
if err != nil {
|
||||
cclog.ComponentError(m.name, "Failed to get current user:", err.Error())
|
||||
return err
|
||||
return fmt.Errorf("%s Init(): failed to get current user: %w", m.name, err)
|
||||
}
|
||||
if user.Uid != "0" {
|
||||
cclog.ComponentError(m.name, "GPFS file system statistics can only be queried by user root")
|
||||
return err
|
||||
return fmt.Errorf("%s Init(): GPFS file system statistics can only be queried by user root", m.name)
|
||||
}
|
||||
} else {
|
||||
p, err := exec.LookPath("sudo")
|
||||
if err != nil {
|
||||
cclog.ComponentError(m.name, "Cannot find 'sudo'")
|
||||
return err
|
||||
return fmt.Errorf("%s Init(): cannot find 'sudo': %w", m.name, err)
|
||||
}
|
||||
m.sudoCmd = p
|
||||
}
|
||||
@@ -373,11 +371,10 @@ func (m *GpfsCollector) Init(config json.RawMessage) error {
|
||||
if err != nil {
|
||||
// if using sudo, exec.lookPath will return EACCES (file mode r-x------), this can be ignored
|
||||
if m.config.Sudo && errors.Is(err, syscall.EACCES) {
|
||||
cclog.ComponentWarn(m.name, fmt.Sprintf("got error looking for mmpmon binary '%s': %v . This is expected when using sudo, continuing.", m.config.Mmpmon, err))
|
||||
cclog.ComponentWarnf(m.name, "got error looking for mmpmon binary '%s': %v . This is expected when using sudo, continuing.", m.config.Mmpmon, err)
|
||||
// the file was given in the config, use it
|
||||
p = m.config.Mmpmon
|
||||
} else {
|
||||
cclog.ComponentError(m.name, fmt.Sprintf("failed to find mmpmon binary '%s': %v", m.config.Mmpmon, err))
|
||||
return fmt.Errorf("%s Init(): failed to find mmpmon binary '%s': %w", m.name, m.config.Mmpmon, err)
|
||||
}
|
||||
}
|
||||
@@ -434,7 +431,7 @@ func (m *GpfsCollector) Init(config json.RawMessage) error {
|
||||
}
|
||||
}
|
||||
if len(m.definitions) == 0 {
|
||||
return errors.New("no metrics to collect")
|
||||
return fmt.Errorf("%s Init(): no metrics to collect", m.name)
|
||||
}
|
||||
|
||||
m.init = true
|
||||
@@ -520,23 +517,23 @@ func (m *GpfsCollector) Read(interval time.Duration, output chan lp.CCMessage) {
|
||||
// return code
|
||||
rc, err := strconv.Atoi(key_value["_rc_"])
|
||||
if err != nil {
|
||||
cclog.ComponentError(m.name, fmt.Sprintf("Read(): Failed to convert return code '%s' to int: %v", key_value["_rc_"], err))
|
||||
cclog.ComponentErrorf(m.name, "Read(): Failed to convert return code '%s' to int: %v", key_value["_rc_"], err)
|
||||
continue
|
||||
}
|
||||
if rc != 0 {
|
||||
cclog.ComponentError(m.name, fmt.Sprintf("Read(): Filesystem '%s' is not ok.", filesystem))
|
||||
cclog.ComponentErrorf(m.name, "Read(): Filesystem '%s' is not ok.", filesystem)
|
||||
continue
|
||||
}
|
||||
|
||||
// timestamp
|
||||
sec, err := strconv.ParseInt(key_value["_t_"], 10, 64)
|
||||
if err != nil {
|
||||
cclog.ComponentError(m.name, fmt.Sprintf("Read(): Failed to convert seconds '%s' to int64: %v", key_value["_t_"], err))
|
||||
cclog.ComponentErrorf(m.name, "Read(): Failed to convert seconds '%s' to int64: %v", key_value["_t_"], err)
|
||||
continue
|
||||
}
|
||||
msec, err := strconv.ParseInt(key_value["_tu_"], 10, 64)
|
||||
if err != nil {
|
||||
cclog.ComponentError(m.name, fmt.Sprintf("Read(): Failed to convert micro seconds '%s' to int64: %v", key_value["_tu_"], err))
|
||||
cclog.ComponentErrorf(m.name, "Read(): Failed to convert micro seconds '%s' to int64: %v", key_value["_tu_"], err)
|
||||
continue
|
||||
}
|
||||
timestamp := time.Unix(sec, msec*1000)
|
||||
@@ -554,7 +551,7 @@ func (m *GpfsCollector) Read(interval time.Duration, output chan lp.CCMessage) {
|
||||
for _, metric := range GpfsAbsMetrics {
|
||||
value, err := strconv.ParseInt(key_value[metric.prefix], 10, 64)
|
||||
if err != nil {
|
||||
cclog.ComponentError(m.name, fmt.Sprintf("Read(): Failed to convert %s '%s' to int64: %v", metric.desc, key_value[metric.prefix], err))
|
||||
cclog.ComponentErrorf(m.name, "Read(): Failed to convert %s '%s' to int64: %v", metric.desc, key_value[metric.prefix], err)
|
||||
continue
|
||||
}
|
||||
newstate[metric.prefix] = value
|
||||
@@ -639,7 +636,7 @@ func (m *GpfsCollector) Read(interval time.Duration, output chan lp.CCMessage) {
|
||||
}
|
||||
} else {
|
||||
// the value could not be computed correctly
|
||||
cclog.ComponentWarn(m.name, fmt.Sprintf("Read(): Could not compute value for filesystem %s of metric %s: vold_ok = %t, vnew_ok = %t", filesystem, metric.name, vold_ok, vnew_ok))
|
||||
cclog.ComponentWarnf(m.name, "Read(): Could not compute value for filesystem %s of metric %s: vold_ok = %t, vnew_ok = %t", filesystem, metric.name, vold_ok, vnew_ok)
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -14,7 +14,7 @@ hugo_path: docs/reference/cc-metric-collector/collectors/gpfs.md
|
||||
```json
|
||||
"gpfs": {
|
||||
"mmpmon_path": "/path/to/mmpmon",
|
||||
"use_sudo": "true",
|
||||
"use_sudo": true,
|
||||
"exclude_filesystem": [
|
||||
"fs1"
|
||||
],
|
||||
@@ -81,3 +81,16 @@ Metrics:
|
||||
* `gpfs_metaops_rate` (if `send_total_values == true` and `send_derived_values == true`)
|
||||
|
||||
The collector adds a `filesystem` tag to all metrics
|
||||
|
||||
`mmpmon` typically require root to run.
|
||||
In order to run `cc-metric-collector` without root priviliges, you can enable `use_sudo`.
|
||||
Add a file like this in `/etc/sudoers.d/` to allow `cc-metric-collector` to run the required command:
|
||||
|
||||
```
|
||||
# Do not log the following sudo commands from monitoring, since this causes a lot of log spam.
|
||||
# However keep log_denied enabled, to detect failures
|
||||
Defaults: monitoring !log_allowed, !pam_session
|
||||
|
||||
# Allow to use mmpmon
|
||||
monitoring ALL = (root) NOPASSWD:/absolute/path/to/mmpmon -p -s
|
||||
```
|
||||
@@ -8,6 +8,7 @@
|
||||
package collectors
|
||||
|
||||
import (
|
||||
"bytes"
|
||||
"encoding/json"
|
||||
"fmt"
|
||||
"os"
|
||||
@@ -22,21 +23,29 @@ import (
|
||||
"golang.org/x/sys/unix"
|
||||
)
|
||||
|
||||
const IB_BASEPATH = "/sys/class/infiniband/"
|
||||
// See: https://www.kernel.org/doc/Documentation/ABI/stable/sysfs-class-infiniband
|
||||
const (
|
||||
ibBasePath = "/sys/class/infiniband/"
|
||||
ibDataUnit = "bytes"
|
||||
ibDataRateUnit = ibDataUnit + "/sec"
|
||||
ibPkgUnit = "packets"
|
||||
ibPkgRateUnit = ibPkgUnit + "/sec"
|
||||
)
|
||||
|
||||
type InfinibandCollectorMetric struct {
|
||||
name string
|
||||
path string
|
||||
unit string
|
||||
scale int64
|
||||
unitRates string
|
||||
scaleByFourLanes bool
|
||||
addToIBTotal bool
|
||||
addToIBTotalPkgs bool
|
||||
currentState int64
|
||||
lastState int64
|
||||
lastState uint64
|
||||
lastStateAvailable bool
|
||||
}
|
||||
|
||||
type InfinibandCollectorInfo struct {
|
||||
LID string // IB local Identifier (LID)
|
||||
lid string // IB local Identifier (LID)
|
||||
device string // IB device
|
||||
port string // IB device port
|
||||
portCounterFiles []InfinibandCollectorMetric // mapping counter name -> InfinibandCollectorMetric
|
||||
@@ -56,7 +65,7 @@ type InfinibandCollector struct {
|
||||
lastTimestamp time.Time // Store time stamp of last tick to derive bandwidths
|
||||
}
|
||||
|
||||
// Init initializes the Infiniband collector by walking through files below IB_BASEPATH
|
||||
// Init initializes the Infiniband collector by walking through files below ibBasePath
|
||||
func (m *InfinibandCollector) Init(config json.RawMessage) error {
|
||||
// Check if already initialized
|
||||
if m.init {
|
||||
@@ -79,14 +88,15 @@ func (m *InfinibandCollector) Init(config json.RawMessage) error {
|
||||
m.config.SendDerivedValues = false
|
||||
// Read configuration file, allow overwriting default config
|
||||
if len(config) > 0 {
|
||||
err = json.Unmarshal(config, &m.config)
|
||||
if err != nil {
|
||||
return err
|
||||
d := json.NewDecoder(bytes.NewReader(config))
|
||||
d.DisallowUnknownFields()
|
||||
if err := d.Decode(&m.config); err != nil {
|
||||
return fmt.Errorf("%s Init(): failed to decode JSON config: %w", m.name, err)
|
||||
}
|
||||
}
|
||||
|
||||
// Loop for all InfiniBand directories
|
||||
globPattern := filepath.Join(IB_BASEPATH, "*", "ports", "*")
|
||||
globPattern := filepath.Join(ibBasePath, "*", "ports", "*")
|
||||
ibDirs, err := filepath.Glob(globPattern)
|
||||
if err != nil {
|
||||
return fmt.Errorf("%s Init(): unable to glob files with pattern %s: %w", m.name, globPattern, err)
|
||||
@@ -121,36 +131,42 @@ func (m *InfinibandCollector) Init(config json.RawMessage) error {
|
||||
countersDir := filepath.Join(path, "counters")
|
||||
portCounterFiles := []InfinibandCollectorMetric{
|
||||
{
|
||||
// Total number of data octets, divided by 4 (lanes), received on all VLs.
|
||||
// This is 64 bit counter
|
||||
name: "ib_recv",
|
||||
path: filepath.Join(countersDir, "port_rcv_data"),
|
||||
unit: "bytes",
|
||||
scale: 4,
|
||||
unit: ibDataUnit,
|
||||
unitRates: ibDataRateUnit,
|
||||
scaleByFourLanes: true,
|
||||
addToIBTotal: true,
|
||||
lastState: -1,
|
||||
},
|
||||
{
|
||||
// Total number of data octets, divided by 4 (lanes), transmitted on all VLs.
|
||||
// This is 64 bit counter
|
||||
name: "ib_xmit",
|
||||
path: filepath.Join(countersDir, "port_xmit_data"),
|
||||
unit: "bytes",
|
||||
scale: 4,
|
||||
unit: ibDataUnit,
|
||||
unitRates: ibDataRateUnit,
|
||||
scaleByFourLanes: true,
|
||||
addToIBTotal: true,
|
||||
lastState: -1,
|
||||
},
|
||||
{
|
||||
// Total number of packets received on all VLs from this port (this may include packets containing Errors.
|
||||
// This is 64 bit counter.
|
||||
name: "ib_recv_pkts",
|
||||
path: filepath.Join(countersDir, "port_rcv_packets"),
|
||||
unit: "packets",
|
||||
scale: 1,
|
||||
unit: ibPkgUnit,
|
||||
unitRates: ibPkgRateUnit,
|
||||
addToIBTotalPkgs: true,
|
||||
lastState: -1,
|
||||
},
|
||||
{
|
||||
// Total number of packets transmitted on all VLs from this port. This may include packets with errors.
|
||||
// This is 64 bit counter.
|
||||
name: "ib_xmit_pkts",
|
||||
path: filepath.Join(countersDir, "port_xmit_packets"),
|
||||
unit: "packets",
|
||||
scale: 1,
|
||||
unit: ibPkgUnit,
|
||||
unitRates: ibPkgRateUnit,
|
||||
addToIBTotalPkgs: true,
|
||||
lastState: -1,
|
||||
},
|
||||
}
|
||||
for _, counter := range portCounterFiles {
|
||||
@@ -162,7 +178,7 @@ func (m *InfinibandCollector) Init(config json.RawMessage) error {
|
||||
|
||||
m.info = append(m.info,
|
||||
InfinibandCollectorInfo{
|
||||
LID: LID,
|
||||
lid: LID,
|
||||
device: device,
|
||||
port: port,
|
||||
portCounterFiles: portCounterFiles,
|
||||
@@ -183,7 +199,7 @@ func (m *InfinibandCollector) Init(config json.RawMessage) error {
|
||||
return nil
|
||||
}
|
||||
|
||||
// Read reads Infiniband counter files below IB_BASEPATH
|
||||
// Read reads Infiniband counter files below ibBasePath
|
||||
func (m *InfinibandCollector) Read(interval time.Duration, output chan lp.CCMessage) {
|
||||
// Check if already initialized
|
||||
if !m.init {
|
||||
@@ -200,44 +216,42 @@ func (m *InfinibandCollector) Read(interval time.Duration, output chan lp.CCMess
|
||||
for i := range m.info {
|
||||
info := &m.info[i]
|
||||
|
||||
var ib_total, ib_total_pkts int64
|
||||
var ibTotal, ibTotalPkts uint64 // sum of xmit and recv counters
|
||||
var ibTotalBw, ibTotalPktsBw float64 // sum of xmit and recv rates
|
||||
var ibTotalBwAvailable, ibTotalPktsBwAvailable bool
|
||||
for i := range info.portCounterFiles {
|
||||
counterDef := &info.portCounterFiles[i]
|
||||
|
||||
// Read counter file
|
||||
line, err := os.ReadFile(counterDef.path)
|
||||
if err != nil {
|
||||
cclog.ComponentError(
|
||||
cclog.ComponentErrorf(
|
||||
m.name,
|
||||
fmt.Sprintf("Read(): Failed to read from file '%s': %v", counterDef.path, err))
|
||||
"Read(): Failed to read from file '%s': %v", counterDef.path, err)
|
||||
// Current counter can not be saved as last state
|
||||
counterDef.lastStateAvailable = false
|
||||
continue
|
||||
}
|
||||
data := strings.TrimSpace(string(line))
|
||||
|
||||
// convert counter to int64
|
||||
v, err := strconv.ParseInt(data, 10, 64)
|
||||
// convert counter to uint64
|
||||
vRawCounter, err := strconv.ParseUint(data, 10, 64)
|
||||
if err != nil {
|
||||
cclog.ComponentError(
|
||||
cclog.ComponentErrorf(
|
||||
m.name,
|
||||
fmt.Sprintf("Read(): Failed to convert Infininiband metrice %s='%s' to int64: %v", counterDef.name, data, err))
|
||||
"Read(): Failed to convert Infininiband metrice %s='%s' to uint64: %v", counterDef.name, data, err)
|
||||
// Current counter can not be saved as last state
|
||||
counterDef.lastStateAvailable = false
|
||||
continue
|
||||
}
|
||||
// Scale raw value
|
||||
v *= counterDef.scale
|
||||
|
||||
// Save current state
|
||||
counterDef.currentState = v
|
||||
vScaledCounter := vRawCounter
|
||||
if counterDef.scaleByFourLanes {
|
||||
vScaledCounter *= uint64(4)
|
||||
}
|
||||
|
||||
// Send absolut values
|
||||
if m.config.SendAbsoluteValues {
|
||||
if y, err := lp.NewMessage(
|
||||
counterDef.name,
|
||||
info.tagSet,
|
||||
m.meta,
|
||||
map[string]any{
|
||||
"value": counterDef.currentState,
|
||||
},
|
||||
now); err == nil {
|
||||
if y, err := lp.NewMetric(counterDef.name, info.tagSet, m.meta, vScaledCounter, now); err == nil {
|
||||
y.AddMeta("unit", counterDef.unit)
|
||||
output <- y
|
||||
}
|
||||
@@ -245,60 +259,75 @@ func (m *InfinibandCollector) Read(interval time.Duration, output chan lp.CCMess
|
||||
|
||||
// Send derived values
|
||||
if m.config.SendDerivedValues {
|
||||
if counterDef.lastState >= 0 {
|
||||
rate := float64((counterDef.currentState - counterDef.lastState)) / timeDiff
|
||||
if y, err := lp.NewMessage(
|
||||
counterDef.name+"_bw",
|
||||
info.tagSet,
|
||||
m.meta,
|
||||
map[string]any{
|
||||
"value": rate,
|
||||
},
|
||||
now); err == nil {
|
||||
y.AddMeta("unit", counterDef.unit+"/sec")
|
||||
if counterDef.lastStateAvailable {
|
||||
var rate float64
|
||||
// uint64 subtraction handles wraparound automatically
|
||||
// in case vRawCounter < counterDef.lastState we would compute:
|
||||
// math.MaxUint64 - lastState + vRawCounter + 1
|
||||
// = (2^64 - 1) - lastState + vRawCounter + 1
|
||||
// = 2^64 - lastState + vRawCounter
|
||||
// ≡ vRawCounter - lastState (mod 2^64)
|
||||
rate = float64(vRawCounter-counterDef.lastState) / timeDiff
|
||||
if counterDef.scaleByFourLanes {
|
||||
rate *= float64(4)
|
||||
}
|
||||
if y, err := lp.NewMetric(counterDef.name+"_bw", info.tagSet, m.meta, rate, now); err == nil {
|
||||
y.AddMeta("unit", counterDef.unitRates)
|
||||
output <- y
|
||||
}
|
||||
|
||||
// Sum up rates for total rates
|
||||
if m.config.SendTotalValues {
|
||||
switch {
|
||||
case counterDef.addToIBTotal:
|
||||
ibTotalBw += rate
|
||||
ibTotalBwAvailable = true
|
||||
case counterDef.addToIBTotalPkgs:
|
||||
ibTotalPktsBw += rate
|
||||
ibTotalPktsBwAvailable = true
|
||||
}
|
||||
}
|
||||
counterDef.lastState = counterDef.currentState
|
||||
}
|
||||
counterDef.lastState = vRawCounter
|
||||
counterDef.lastStateAvailable = true
|
||||
}
|
||||
|
||||
// Sum up total values
|
||||
if m.config.SendTotalValues {
|
||||
switch {
|
||||
case counterDef.addToIBTotal:
|
||||
ib_total += counterDef.currentState
|
||||
ibTotal += vScaledCounter
|
||||
case counterDef.addToIBTotalPkgs:
|
||||
ib_total_pkts += counterDef.currentState
|
||||
ibTotalPkts += vScaledCounter
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Send total values
|
||||
if m.config.SendTotalValues {
|
||||
if y, err := lp.NewMessage(
|
||||
"ib_total",
|
||||
info.tagSet,
|
||||
m.meta,
|
||||
map[string]any{
|
||||
"value": ib_total,
|
||||
},
|
||||
now); err == nil {
|
||||
y.AddMeta("unit", "bytes")
|
||||
if y, err := lp.NewMetric("ib_total", info.tagSet, m.meta, ibTotal, now); err == nil {
|
||||
y.AddMeta("unit", ibDataUnit)
|
||||
output <- y
|
||||
}
|
||||
|
||||
if y, err := lp.NewMessage(
|
||||
"ib_total_pkts",
|
||||
info.tagSet,
|
||||
m.meta,
|
||||
map[string]any{
|
||||
"value": ib_total_pkts,
|
||||
},
|
||||
now); err == nil {
|
||||
y.AddMeta("unit", "packets")
|
||||
if y, err := lp.NewMetric("ib_total_pkts", info.tagSet, m.meta, ibTotalPkts, now); err == nil {
|
||||
y.AddMeta("unit", ibPkgUnit)
|
||||
output <- y
|
||||
}
|
||||
|
||||
if m.config.SendDerivedValues && ibTotalBwAvailable {
|
||||
if y, err := lp.NewMetric("ib_total_bw", info.tagSet, m.meta, ibTotalBw, now); err == nil {
|
||||
y.AddMeta("unit", ibDataRateUnit)
|
||||
output <- y
|
||||
}
|
||||
}
|
||||
|
||||
if m.config.SendDerivedValues && ibTotalPktsBwAvailable {
|
||||
if y, err := lp.NewMetric("ib_total_pkts_bw", info.tagSet, m.meta, ibTotalPktsBw, now); err == nil {
|
||||
y.AddMeta("unit", ibPkgRateUnit)
|
||||
output <- y
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -41,5 +41,7 @@ Metrics:
|
||||
* `ib_xmit_bw` (if `send_derived_values == true`)
|
||||
* `ib_recv_pkts_bw` (if `send_derived_values == true`)
|
||||
* `ib_xmit_pkts_bw` (if `send_derived_values == true`)
|
||||
* `ib_total_bw` (if `send_total_values == true` and `send_derived_values == true`)
|
||||
* `ib_total_pkts_bw` (if `send_total_values == true` and `send_derived_values == true`)
|
||||
|
||||
The collector adds a `device` tag to all metrics
|
||||
|
||||
@@ -9,8 +9,8 @@ package collectors
|
||||
|
||||
import (
|
||||
"bufio"
|
||||
"bytes"
|
||||
"encoding/json"
|
||||
"errors"
|
||||
"fmt"
|
||||
"os"
|
||||
"slices"
|
||||
@@ -44,7 +44,6 @@ type IOstatCollector struct {
|
||||
}
|
||||
|
||||
func (m *IOstatCollector) Init(config json.RawMessage) error {
|
||||
var err error
|
||||
m.name = "IOstatCollector"
|
||||
m.parallel = true
|
||||
m.meta = map[string]string{"source": m.name, "group": "Disk"}
|
||||
@@ -52,9 +51,10 @@ func (m *IOstatCollector) Init(config json.RawMessage) error {
|
||||
return fmt.Errorf("%s Init(): setup() call failed: %w", m.name, err)
|
||||
}
|
||||
if len(config) > 0 {
|
||||
err = json.Unmarshal(config, &m.config)
|
||||
if err != nil {
|
||||
return err
|
||||
d := json.NewDecoder(bytes.NewReader(config))
|
||||
d.DisallowUnknownFields()
|
||||
if err := d.Decode(&m.config); err != nil {
|
||||
return fmt.Errorf("%s Init(): Error decoding JSON config: %w", m.name, err)
|
||||
}
|
||||
}
|
||||
// https://www.kernel.org/doc/html/latest/admin-guide/iostats.html
|
||||
@@ -85,7 +85,7 @@ func (m *IOstatCollector) Init(config json.RawMessage) error {
|
||||
}
|
||||
}
|
||||
if len(m.matches) == 0 {
|
||||
return errors.New("no metrics to collect")
|
||||
return fmt.Errorf("%s Init(): no metrics to collect", m.name)
|
||||
}
|
||||
file, err := os.Open(IOSTATFILE)
|
||||
if err != nil {
|
||||
@@ -135,7 +135,7 @@ func (m *IOstatCollector) Init(config json.RawMessage) error {
|
||||
}
|
||||
|
||||
m.init = true
|
||||
return err
|
||||
return nil
|
||||
}
|
||||
|
||||
func (m *IOstatCollector) Read(interval time.Duration, output chan lp.CCMessage) {
|
||||
@@ -145,16 +145,16 @@ func (m *IOstatCollector) Read(interval time.Duration, output chan lp.CCMessage)
|
||||
|
||||
file, err := os.Open(IOSTATFILE)
|
||||
if err != nil {
|
||||
cclog.ComponentError(
|
||||
cclog.ComponentErrorf(
|
||||
m.name,
|
||||
fmt.Sprintf("Read(): Failed to open file '%s': %v", IOSTATFILE, err))
|
||||
"Read(): Failed to open file '%s': %v", IOSTATFILE, err)
|
||||
return
|
||||
}
|
||||
defer func() {
|
||||
if err := file.Close(); err != nil {
|
||||
cclog.ComponentError(
|
||||
cclog.ComponentErrorf(
|
||||
m.name,
|
||||
fmt.Sprintf("Read(): Failed to close file '%s': %v", IOSTATFILE, err))
|
||||
"Read(): Failed to close file '%s': %v", IOSTATFILE, err)
|
||||
}
|
||||
}()
|
||||
|
||||
|
||||
@@ -11,7 +11,6 @@ import (
|
||||
"bufio"
|
||||
"bytes"
|
||||
"encoding/json"
|
||||
"errors"
|
||||
"fmt"
|
||||
"io"
|
||||
"os/exec"
|
||||
@@ -29,10 +28,11 @@ type IpmiCollector struct {
|
||||
metricCollector
|
||||
|
||||
config struct {
|
||||
ExcludeDevices []string `json:"exclude_devices"`
|
||||
IpmitoolPath string `json:"ipmitool_path"`
|
||||
IpmisensorsPath string `json:"ipmisensors_path"`
|
||||
Sudo bool `json:"use_sudo"`
|
||||
}
|
||||
|
||||
ipmitool string
|
||||
ipmisensors string
|
||||
}
|
||||
@@ -55,57 +55,75 @@ func (m *IpmiCollector) Init(config json.RawMessage) error {
|
||||
// default path to IPMI tools
|
||||
m.config.IpmitoolPath = "ipmitool"
|
||||
m.config.IpmisensorsPath = "ipmi-sensors"
|
||||
|
||||
if len(config) > 0 {
|
||||
err := json.Unmarshal(config, &m.config)
|
||||
if err != nil {
|
||||
return err
|
||||
d := json.NewDecoder(bytes.NewReader(config))
|
||||
d.DisallowUnknownFields()
|
||||
if err := d.Decode(&m.config); err != nil {
|
||||
return fmt.Errorf("%s Init(): Error decoding JSON config: %w", m.name, err)
|
||||
}
|
||||
}
|
||||
// Check if executables ipmitool or ipmisensors are found
|
||||
p, err := exec.LookPath(m.config.IpmitoolPath)
|
||||
if err == nil {
|
||||
command := exec.Command(p)
|
||||
err := command.Run()
|
||||
if err != nil {
|
||||
cclog.ComponentError(m.name, fmt.Sprintf("Failed to execute %s: %v", p, err.Error()))
|
||||
m.ipmitool = ""
|
||||
} else {
|
||||
m.ipmitool = p
|
||||
}
|
||||
}
|
||||
p, err = exec.LookPath(m.config.IpmisensorsPath)
|
||||
if err == nil {
|
||||
command := exec.Command(p)
|
||||
err := command.Run()
|
||||
if err != nil {
|
||||
cclog.ComponentError(m.name, fmt.Sprintf("Failed to execute %s: %v", p, err.Error()))
|
||||
m.ipmisensors = ""
|
||||
} else {
|
||||
m.ipmisensors = p
|
||||
}
|
||||
}
|
||||
if len(m.ipmitool) == 0 && len(m.ipmisensors) == 0 {
|
||||
return errors.New("no usable IPMI reader found")
|
||||
}
|
||||
|
||||
m.ipmitool = m.config.IpmitoolPath
|
||||
m.ipmisensors = m.config.IpmisensorsPath
|
||||
|
||||
// Test if any of the supported backends work
|
||||
var dummyChan chan lp.CCMessage
|
||||
dummyConsumer := func() {
|
||||
for range dummyChan {
|
||||
}
|
||||
}
|
||||
|
||||
// Test if ipmi-sensors works (preferred over ipmitool, because it's faster)
|
||||
var ipmiSensorsErr error
|
||||
if _, ipmiSensorsErr = exec.LookPath(m.ipmisensors); ipmiSensorsErr == nil {
|
||||
dummyChan = make(chan lp.CCMessage)
|
||||
go dummyConsumer()
|
||||
ipmiSensorsErr = m.readIpmiSensors(dummyChan)
|
||||
close(dummyChan)
|
||||
if ipmiSensorsErr == nil {
|
||||
cclog.ComponentDebugf(m.name, "Using ipmi-sensors for ipmistat collector")
|
||||
m.init = true
|
||||
return nil
|
||||
}
|
||||
}
|
||||
cclog.ComponentDebugf(m.name, "Unable to use ipmi-sensors for ipmistat collector: %v", ipmiSensorsErr)
|
||||
m.ipmisensors = ""
|
||||
|
||||
// Test if ipmitool works (may be very slow)
|
||||
var ipmiToolErr error
|
||||
if _, ipmiToolErr = exec.LookPath(m.ipmitool); ipmiToolErr == nil {
|
||||
dummyChan = make(chan lp.CCMessage)
|
||||
go dummyConsumer()
|
||||
ipmiToolErr = m.readIpmiTool(dummyChan)
|
||||
close(dummyChan)
|
||||
if ipmiToolErr == nil {
|
||||
cclog.ComponentDebugf(m.name, "Using ipmitool for ipmistat collector")
|
||||
m.init = true
|
||||
return nil
|
||||
}
|
||||
}
|
||||
m.ipmitool = ""
|
||||
cclog.ComponentDebugf(m.name, "Unable to use ipmitool for ipmistat collector: %v", ipmiToolErr)
|
||||
|
||||
return fmt.Errorf("unable to init neither ipmitool (%w) nor ipmi-sensors (%w)", ipmiToolErr, ipmiSensorsErr)
|
||||
}
|
||||
|
||||
func (m *IpmiCollector) readIpmiTool(cmd string, output chan lp.CCMessage) {
|
||||
func (m *IpmiCollector) readIpmiTool(output chan lp.CCMessage) error {
|
||||
// Setup ipmitool command
|
||||
command := exec.Command(cmd, "sensor")
|
||||
argv := make([]string, 0)
|
||||
if m.config.Sudo {
|
||||
argv = append(argv, "sudo", "-n")
|
||||
}
|
||||
argv = append(argv, m.ipmitool, "sensor")
|
||||
command := exec.Command(argv[0], argv[1:]...)
|
||||
stdout, _ := command.StdoutPipe()
|
||||
errBuf := new(bytes.Buffer)
|
||||
command.Stderr = errBuf
|
||||
|
||||
// start command
|
||||
if err := command.Start(); err != nil {
|
||||
cclog.ComponentError(
|
||||
m.name,
|
||||
fmt.Sprintf("readIpmiTool(): Failed to start command \"%s\": %v", command.String(), err),
|
||||
)
|
||||
return
|
||||
return fmt.Errorf("failed to start command '%s': %w", command.String(), err)
|
||||
}
|
||||
|
||||
// Read command output
|
||||
@@ -115,8 +133,17 @@ func (m *IpmiCollector) readIpmiTool(cmd string, output chan lp.CCMessage) {
|
||||
if len(lv) < 3 {
|
||||
continue
|
||||
}
|
||||
|
||||
if strings.TrimSpace(lv[1]) == "0x0" || strings.TrimSpace(lv[1]) == "na" {
|
||||
// Ignore known non-float values
|
||||
continue
|
||||
}
|
||||
|
||||
v, err := strconv.ParseFloat(strings.TrimSpace(lv[1]), 64)
|
||||
if err == nil {
|
||||
if err != nil {
|
||||
cclog.ComponentErrorf(m.name, "Failed to parse float '%s': %v", lv[1], err)
|
||||
continue
|
||||
}
|
||||
name := strings.ToLower(strings.ReplaceAll(strings.TrimSpace(lv[0]), " ", "_"))
|
||||
unit := strings.TrimSpace(lv[2])
|
||||
switch unit {
|
||||
@@ -130,71 +157,76 @@ func (m *IpmiCollector) readIpmiTool(cmd string, output chan lp.CCMessage) {
|
||||
unit = "Watts"
|
||||
}
|
||||
|
||||
y, err := lp.NewMessage(name, map[string]string{"type": "node"}, m.meta, map[string]any{"value": v}, time.Now())
|
||||
if err == nil {
|
||||
y, err := lp.NewMetric(name, map[string]string{"type": "node"}, m.meta, v, time.Now())
|
||||
if err != nil {
|
||||
cclog.ComponentErrorf(m.name, "Failed to create message: %v", err)
|
||||
continue
|
||||
}
|
||||
y.AddMeta("unit", unit)
|
||||
output <- y
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Wait for command end
|
||||
if err := command.Wait(); err != nil {
|
||||
errMsg, _ := io.ReadAll(errBuf)
|
||||
cclog.ComponentError(
|
||||
m.name,
|
||||
fmt.Sprintf("readIpmiTool(): Failed to wait for the end of command \"%s\": %v\n", command.String(), err),
|
||||
)
|
||||
cclog.ComponentError(m.name, fmt.Sprintf("readIpmiTool(): command stderr: \"%s\"\n", strings.TrimSpace(string(errMsg))))
|
||||
return
|
||||
return fmt.Errorf("failed to complete command '%s': %w (stderr: %s)", command.String(), err, strings.TrimSpace(string(errMsg)))
|
||||
}
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
func (m *IpmiCollector) readIpmiSensors(cmd string, output chan lp.CCMessage) {
|
||||
func (m *IpmiCollector) readIpmiSensors(output chan lp.CCMessage) error {
|
||||
// Setup ipmisensors command
|
||||
command := exec.Command(cmd, "--comma-separated-output", "--sdr-cache-recreate")
|
||||
argv := make([]string, 0)
|
||||
if m.config.Sudo {
|
||||
argv = append(argv, "sudo", "-n")
|
||||
}
|
||||
argv = append(argv, m.ipmisensors, "--comma-separated-output", "--sdr-cache-recreate")
|
||||
command := exec.Command(argv[0], argv[1:]...)
|
||||
stdout, _ := command.StdoutPipe()
|
||||
errBuf := new(bytes.Buffer)
|
||||
command.Stderr = errBuf
|
||||
|
||||
// start command
|
||||
if err := command.Start(); err != nil {
|
||||
cclog.ComponentError(
|
||||
m.name,
|
||||
fmt.Sprintf("readIpmiSensors(): Failed to start command \"%s\": %v", command.String(), err),
|
||||
)
|
||||
return
|
||||
return fmt.Errorf("failed to start command '%s': %w", command.String(), err)
|
||||
}
|
||||
|
||||
// Read command output
|
||||
scanner := bufio.NewScanner(stdout)
|
||||
for scanner.Scan() {
|
||||
lv := strings.Split(scanner.Text(), ",")
|
||||
if len(lv) > 3 {
|
||||
v, err := strconv.ParseFloat(lv[3], 64)
|
||||
if err == nil {
|
||||
if len(lv) <= 3 {
|
||||
continue
|
||||
}
|
||||
if lv[3] == "N/A" || lv[3] == "Reading" {
|
||||
// Ignore known non-float values
|
||||
continue
|
||||
}
|
||||
v, err := strconv.ParseFloat(strings.TrimSpace(lv[3]), 64)
|
||||
if err != nil {
|
||||
cclog.ComponentErrorf(m.name, "Failed to parse float '%s': %v", lv[3], err)
|
||||
continue
|
||||
}
|
||||
name := strings.ToLower(strings.ReplaceAll(lv[1], " ", "_"))
|
||||
y, err := lp.NewMessage(name, map[string]string{"type": "node"}, m.meta, map[string]any{"value": v}, time.Now())
|
||||
if err == nil {
|
||||
y, err := lp.NewMetric(name, map[string]string{"type": "node"}, m.meta, v, time.Now())
|
||||
if err != nil {
|
||||
cclog.ComponentErrorf(m.name, "Failed to create message: %v", err)
|
||||
continue
|
||||
}
|
||||
if len(lv) > 4 {
|
||||
y.AddMeta("unit", lv[4])
|
||||
}
|
||||
output <- y
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Wait for command end
|
||||
if err := command.Wait(); err != nil {
|
||||
errMsg, _ := io.ReadAll(errBuf)
|
||||
cclog.ComponentError(
|
||||
m.name,
|
||||
fmt.Sprintf("readIpmiSensors(): Failed to wait for the end of command \"%s\": %v\n", command.String(), err),
|
||||
)
|
||||
cclog.ComponentError(m.name, fmt.Sprintf("readIpmiSensors(): command stderr: \"%s\"\n", strings.TrimSpace(string(errMsg))))
|
||||
return
|
||||
return fmt.Errorf("failed to complete command '%s': %w (stderr: %s)", command.String(), err, strings.TrimSpace(string(errMsg)))
|
||||
}
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
func (m *IpmiCollector) Read(interval time.Duration, output chan lp.CCMessage) {
|
||||
@@ -203,10 +235,16 @@ func (m *IpmiCollector) Read(interval time.Duration, output chan lp.CCMessage) {
|
||||
return
|
||||
}
|
||||
|
||||
if len(m.config.IpmitoolPath) > 0 {
|
||||
m.readIpmiTool(m.config.IpmitoolPath, output)
|
||||
} else if len(m.config.IpmisensorsPath) > 0 {
|
||||
m.readIpmiSensors(m.config.IpmisensorsPath, output)
|
||||
if len(m.ipmisensors) > 0 {
|
||||
err := m.readIpmiSensors(output)
|
||||
if err != nil {
|
||||
cclog.ComponentErrorf(m.name, "readIpmiSensors() failed: %v", err)
|
||||
}
|
||||
} else if len(m.ipmitool) > 0 {
|
||||
err := m.readIpmiTool(output)
|
||||
if err != nil {
|
||||
cclog.ComponentErrorf(m.name, "readIpmiTool() failed: %v", err)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -15,9 +15,24 @@ hugo_path: docs/reference/cc-metric-collector/collectors/ipmi.md
|
||||
"ipmistat": {
|
||||
"ipmitool_path": "/path/to/ipmitool",
|
||||
"ipmisensors_path": "/path/to/ipmi-sensors",
|
||||
"use_sudo": true
|
||||
}
|
||||
```
|
||||
|
||||
The `ipmistat` collector reads data from `ipmitool` (`ipmitool sensor`) or `ipmi-sensors` (`ipmi-sensors --sdr-cache-recreate --comma-separated-output`).
|
||||
|
||||
The metrics depend on the output of the underlying tools but contain temperature, power and energy metrics.
|
||||
|
||||
`ipmitool` and `ipmi-sensors` typically require root to run.
|
||||
In order to run `cc-metric-collector` without root priviliges, you can enable `use_sudo`.
|
||||
Add a file like this in `/etc/sudoers.d/` to allow `cc-metric-collector` to run the required commands:
|
||||
|
||||
```
|
||||
# Do not log the following sudo commands from monitoring, since this causes a lot of log spam.
|
||||
# However keep log_denied enabled, to detect failures
|
||||
Defaults: monitoring !log_allowed, !pam_session
|
||||
|
||||
# Allow to use ipmitool and ipmi-sensors
|
||||
monitoring ALL = (root) NOPASSWD:/usr/bin/ipmitool sensor
|
||||
monitoring ALL = (root) NOPASSWD:/usr/sbin/ipmi-sensors --comma-separated-output --sdr-cache-recreate
|
||||
```
|
||||
|
||||
@@ -12,12 +12,18 @@ package collectors
|
||||
#cgo LDFLAGS: -Wl,--unresolved-symbols=ignore-in-object-files
|
||||
#include <stdlib.h>
|
||||
#include <likwid.h>
|
||||
|
||||
|
||||
int cc_add_hwthread(int cpu_id) {
|
||||
return HPMaddThread(cpu_id);
|
||||
}
|
||||
|
||||
*/
|
||||
import "C"
|
||||
|
||||
import (
|
||||
"bytes"
|
||||
"encoding/json"
|
||||
"errors"
|
||||
"fmt"
|
||||
"maps"
|
||||
"math"
|
||||
@@ -207,24 +213,25 @@ func (m *LikwidCollector) Init(config json.RawMessage) error {
|
||||
m.config.LibraryPath = LIKWID_LIB_NAME
|
||||
m.config.LockfilePath = LIKWID_DEF_LOCKFILE
|
||||
if len(config) > 0 {
|
||||
err := json.Unmarshal(config, &m.config)
|
||||
if err != nil {
|
||||
return fmt.Errorf("%s Init(): failed to unmarshal JSON config: %w", m.name, err)
|
||||
d := json.NewDecoder(bytes.NewReader(config))
|
||||
d.DisallowUnknownFields()
|
||||
if err := d.Decode(&m.config); err != nil {
|
||||
return fmt.Errorf("%s Init(): Error decoding JSON config: %w", m.name, err)
|
||||
}
|
||||
}
|
||||
lib := dl.New(m.config.LibraryPath, LIKWID_LIB_DL_FLAGS)
|
||||
if lib == nil {
|
||||
return fmt.Errorf("error instantiating DynamicLibrary for %s", m.config.LibraryPath)
|
||||
return fmt.Errorf("%s Init(): error instantiating DynamicLibrary for %s", m.name, m.config.LibraryPath)
|
||||
}
|
||||
err := lib.Open()
|
||||
if err != nil {
|
||||
return fmt.Errorf("error opening %s: %w", m.config.LibraryPath, err)
|
||||
return fmt.Errorf("%s Init(): error opening %s: %w", m.name, m.config.LibraryPath, err)
|
||||
}
|
||||
|
||||
if m.config.ForceOverwrite {
|
||||
cclog.ComponentDebug(m.name, "Set LIKWID_FORCE=1")
|
||||
if err := os.Setenv("LIKWID_FORCE", "1"); err != nil {
|
||||
return fmt.Errorf("error setting environment variable LIKWID_FORCE=1: %w", err)
|
||||
return fmt.Errorf("%s Init(): error setting environment variable LIKWID_FORCE=1: %w", m.name, err)
|
||||
}
|
||||
}
|
||||
if err := m.setup(); err != nil {
|
||||
@@ -260,12 +267,12 @@ func (m *LikwidCollector) Init(config json.RawMessage) error {
|
||||
}
|
||||
for _, metric := range evset.Metrics {
|
||||
// Try to evaluate the metric
|
||||
cclog.ComponentDebug(m.name, "Checking", metric.Name)
|
||||
cclog.ComponentDebugf(m.name, "Checking %s", metric.Name)
|
||||
if !checkMetricType(metric.Type) {
|
||||
cclog.ComponentError(m.name, "Metric", metric.Name, "uses invalid type", metric.Type)
|
||||
cclog.ComponentErrorf(m.name, "Metric %s uses invalid type %s", metric.Name, metric.Type)
|
||||
metric.Calc = ""
|
||||
} else if !testLikwidMetricFormula(metric.Calc, params) {
|
||||
cclog.ComponentError(m.name, "Metric", metric.Name, "cannot be calculated with given counters")
|
||||
cclog.ComponentError(m.name, "Metric %s cannot be calculated with given counters", metric.Name)
|
||||
metric.Calc = ""
|
||||
} else {
|
||||
globalParams = append(globalParams, metric.Name)
|
||||
@@ -280,13 +287,13 @@ func (m *LikwidCollector) Init(config json.RawMessage) error {
|
||||
for _, metric := range m.config.Metrics {
|
||||
// Try to evaluate the global metric
|
||||
if !checkMetricType(metric.Type) {
|
||||
cclog.ComponentError(m.name, "Metric", metric.Name, "uses invalid type", metric.Type)
|
||||
cclog.ComponentErrorf(m.name, "Metric %s uses invalid type %s", metric.Name, metric.Type)
|
||||
metric.Calc = ""
|
||||
} else if !testLikwidMetricFormula(metric.Calc, globalParams) {
|
||||
cclog.ComponentError(m.name, "Metric", metric.Name, "cannot be calculated with given counters")
|
||||
cclog.ComponentError(m.name, "Metric %s cannot be calculated with given counters", metric.Name)
|
||||
metric.Calc = ""
|
||||
} else if !checkMetricType(metric.Type) {
|
||||
cclog.ComponentError(m.name, "Metric", metric.Name, "has invalid type")
|
||||
cclog.ComponentError(m.name, "Metric %s has invalid type", metric.Name)
|
||||
metric.Calc = ""
|
||||
} else {
|
||||
totalMetrics++
|
||||
@@ -295,16 +302,12 @@ func (m *LikwidCollector) Init(config json.RawMessage) error {
|
||||
|
||||
// If no event set could be added, shut down LikwidCollector
|
||||
if totalMetrics == 0 {
|
||||
err := errors.New("no LIKWID eventset or metric usable")
|
||||
cclog.ComponentError(m.name, err.Error())
|
||||
return err
|
||||
return fmt.Errorf("%s Init(): no LIKWID eventset or metric usable", m.name)
|
||||
}
|
||||
|
||||
ret := C.topology_init()
|
||||
if ret != 0 {
|
||||
err := errors.New("failed to initialize topology module")
|
||||
cclog.ComponentError(m.name, err.Error())
|
||||
return err
|
||||
return fmt.Errorf("%s Init(): failed to initialize topology module", m.name)
|
||||
}
|
||||
m.measureThread = thread.New()
|
||||
switch m.config.AccessMode {
|
||||
@@ -319,7 +322,7 @@ func (m *LikwidCollector) Init(config json.RawMessage) error {
|
||||
p = m.config.DaemonPath
|
||||
}
|
||||
if err := os.Setenv("PATH", p); err != nil {
|
||||
return fmt.Errorf("error setting environment variable PATH=%s: %w", p, err)
|
||||
return fmt.Errorf("%s Init(): error setting environment variable PATH=%s: %w", m.name, p, err)
|
||||
}
|
||||
}
|
||||
C.HPMmode(1)
|
||||
@@ -331,7 +334,7 @@ func (m *LikwidCollector) Init(config json.RawMessage) error {
|
||||
for _, c := range m.cpulist {
|
||||
m.measureThread.Call(
|
||||
func() {
|
||||
retCode := C.HPMaddThread(C.uint32_t(c))
|
||||
retCode := C.cc_add_hwthread(C.int(c))
|
||||
if retCode != 0 {
|
||||
err := fmt.Errorf("C.HPMaddThread(%v) failed with return code %v", c, retCode)
|
||||
cclog.ComponentError(m.name, err.Error())
|
||||
@@ -378,16 +381,16 @@ func (m *LikwidCollector) takeMeasurement(evidx int, evset LikwidEventsetConfig,
|
||||
// Watch changes for the lock file ()
|
||||
watcher, err := fsnotify.NewWatcher()
|
||||
if err != nil {
|
||||
cclog.ComponentError(
|
||||
cclog.ComponentErrorf(
|
||||
m.name,
|
||||
fmt.Sprintf("takeMeasurement(): Failed to create a new fsnotify.Watcher: %v", err))
|
||||
"takeMeasurement(): Failed to create a new fsnotify.Watcher: %v", err)
|
||||
return true, err
|
||||
}
|
||||
defer func() {
|
||||
if err := watcher.Close(); err != nil {
|
||||
cclog.ComponentError(
|
||||
cclog.ComponentErrorf(
|
||||
m.name,
|
||||
fmt.Sprintf("takeMeasurement(): Failed to close fsnotify.Watcher: %v", err))
|
||||
"takeMeasurement(): Failed to close fsnotify.Watcher: %v", err)
|
||||
}
|
||||
}()
|
||||
if len(m.config.LockfilePath) > 0 {
|
||||
@@ -600,7 +603,7 @@ func (m *LikwidCollector) calcEventsetMetrics(evset LikwidEventsetConfig, interv
|
||||
if tid >= 0 && len(metric.Calc) > 0 {
|
||||
value, err := agg.EvalFloat64Condition(metric.Calc, evset.results[tid])
|
||||
if err != nil {
|
||||
cclog.ComponentError(m.name, "Calculation for metric", metric.Name, "failed:", err.Error())
|
||||
cclog.ComponentErrorf(m.name, "Calculation for metric %s failed: %s", metric.Name, err.Error())
|
||||
value = 0.0
|
||||
}
|
||||
if m.config.InvalidToZero && (math.IsNaN(value) || math.IsInf(value, 0)) {
|
||||
@@ -765,7 +768,7 @@ func (m *LikwidCollector) calcGlobalMetrics(groups []LikwidEventsetConfig, inter
|
||||
// Evaluate the metric
|
||||
value, err := agg.EvalFloat64Condition(metric.Calc, params)
|
||||
if err != nil {
|
||||
cclog.ComponentError(m.name, "Calculation for metric", metric.Name, "failed:", err.Error())
|
||||
cclog.ComponentErrorf(m.name, "Calculation for metric %s failed: %s", metric.Name, err.Error())
|
||||
value = 0.0
|
||||
}
|
||||
if m.config.InvalidToZero && (math.IsNaN(value) || math.IsInf(value, 0)) {
|
||||
|
||||
@@ -8,6 +8,7 @@
|
||||
package collectors
|
||||
|
||||
import (
|
||||
"bytes"
|
||||
"encoding/json"
|
||||
"fmt"
|
||||
"os"
|
||||
@@ -48,9 +49,10 @@ func (m *LoadavgCollector) Init(config json.RawMessage) error {
|
||||
return fmt.Errorf("%s Init(): setup() call failed: %w", m.name, err)
|
||||
}
|
||||
if len(config) > 0 {
|
||||
err := json.Unmarshal(config, &m.config)
|
||||
if err != nil {
|
||||
return err
|
||||
d := json.NewDecoder(bytes.NewReader(config))
|
||||
d.DisallowUnknownFields()
|
||||
if err := d.Decode(&m.config); err != nil {
|
||||
return fmt.Errorf("%s Init(): Error decoding JSON config: %w", m.name, err)
|
||||
}
|
||||
}
|
||||
m.meta = map[string]string{
|
||||
@@ -63,16 +65,17 @@ func (m *LoadavgCollector) Init(config json.RawMessage) error {
|
||||
"load_five",
|
||||
"load_fifteen",
|
||||
}
|
||||
m.load_skips = make([]bool, len(m.load_matches))
|
||||
m.proc_matches = []string{
|
||||
"proc_run",
|
||||
"proc_total",
|
||||
}
|
||||
m.proc_skips = make([]bool, len(m.proc_matches))
|
||||
|
||||
m.load_skips = make([]bool, len(m.load_matches))
|
||||
for i, name := range m.load_matches {
|
||||
m.load_skips[i] = slices.Contains(m.config.ExcludeMetrics, name)
|
||||
}
|
||||
|
||||
m.proc_skips = make([]bool, len(m.proc_matches))
|
||||
for i, name := range m.proc_matches {
|
||||
m.proc_skips[i] = slices.Contains(m.config.ExcludeMetrics, name)
|
||||
}
|
||||
@@ -86,9 +89,9 @@ func (m *LoadavgCollector) Read(interval time.Duration, output chan lp.CCMessage
|
||||
}
|
||||
buffer, err := os.ReadFile(LOADAVGFILE)
|
||||
if err != nil {
|
||||
cclog.ComponentError(
|
||||
cclog.ComponentErrorf(
|
||||
m.name,
|
||||
fmt.Sprintf("Read(): Failed to read file '%s': %v", LOADAVGFILE, err))
|
||||
"Read(): Failed to read file '%s': %v", LOADAVGFILE, err)
|
||||
return
|
||||
}
|
||||
now := time.Now()
|
||||
@@ -98,15 +101,15 @@ func (m *LoadavgCollector) Read(interval time.Duration, output chan lp.CCMessage
|
||||
for i, name := range m.load_matches {
|
||||
x, err := strconv.ParseFloat(ls[i], 64)
|
||||
if err != nil {
|
||||
cclog.ComponentError(
|
||||
cclog.ComponentErrorf(
|
||||
m.name,
|
||||
fmt.Sprintf("Read(): Failed to convert '%s' to float64: %v", ls[i], err))
|
||||
"Read(): Failed to convert '%s' to float64: %v", ls[i], err)
|
||||
continue
|
||||
}
|
||||
if m.load_skips[i] {
|
||||
continue
|
||||
}
|
||||
y, err := lp.NewMessage(name, m.tags, m.meta, map[string]any{"value": x}, now)
|
||||
y, err := lp.NewMetric(name, m.tags, m.meta, x, now)
|
||||
if err == nil {
|
||||
output <- y
|
||||
}
|
||||
@@ -117,15 +120,15 @@ func (m *LoadavgCollector) Read(interval time.Duration, output chan lp.CCMessage
|
||||
for i, name := range m.proc_matches {
|
||||
x, err := strconv.ParseInt(lv[i], 10, 64)
|
||||
if err != nil {
|
||||
cclog.ComponentError(
|
||||
cclog.ComponentErrorf(
|
||||
m.name,
|
||||
fmt.Sprintf("Read(): Failed to convert '%s' to float64: %v", lv[i], err))
|
||||
"Read(): Failed to convert '%s' to float64: %v", lv[i], err)
|
||||
continue
|
||||
}
|
||||
if m.proc_skips[i] {
|
||||
continue
|
||||
}
|
||||
y, err := lp.NewMessage(name, m.tags, m.meta, map[string]any{"value": x}, now)
|
||||
y, err := lp.NewMetric(name, m.tags, m.meta, x, now)
|
||||
if err == nil {
|
||||
output <- y
|
||||
}
|
||||
|
||||
@@ -8,6 +8,7 @@
|
||||
package collectors
|
||||
|
||||
import (
|
||||
"bytes"
|
||||
"encoding/json"
|
||||
"errors"
|
||||
"fmt"
|
||||
@@ -18,7 +19,6 @@ import (
|
||||
"strings"
|
||||
"time"
|
||||
|
||||
cclog "github.com/ClusterCockpit/cc-lib/v2/ccLogger"
|
||||
lp "github.com/ClusterCockpit/cc-lib/v2/ccMessage"
|
||||
)
|
||||
|
||||
@@ -300,9 +300,10 @@ func (m *LustreCollector) Init(config json.RawMessage) error {
|
||||
m.name = "LustreCollector"
|
||||
m.parallel = true
|
||||
if len(config) > 0 {
|
||||
err = json.Unmarshal(config, &m.config)
|
||||
if err != nil {
|
||||
return err
|
||||
d := json.NewDecoder(bytes.NewReader(config))
|
||||
d.DisallowUnknownFields()
|
||||
if err := d.Decode(&m.config); err != nil {
|
||||
return fmt.Errorf("%s Init(): Error decoding JSON config: %w", m.name, err)
|
||||
}
|
||||
}
|
||||
if err := m.setup(); err != nil {
|
||||
@@ -316,18 +317,15 @@ func (m *LustreCollector) Init(config json.RawMessage) error {
|
||||
if !m.config.Sudo {
|
||||
user, err := user.Current()
|
||||
if err != nil {
|
||||
cclog.ComponentError(m.name, "Failed to get current user:", err.Error())
|
||||
return err
|
||||
return fmt.Errorf("%s Init(): Failed to get current user: %w", m.name, err)
|
||||
}
|
||||
if user.Uid != "0" {
|
||||
cclog.ComponentError(m.name, "Lustre file system statistics can only be queried by user root")
|
||||
return err
|
||||
return fmt.Errorf("%s Init(): Lustre file system statistics can only be queried by user root", m.name)
|
||||
}
|
||||
} else {
|
||||
p, err := exec.LookPath("sudo")
|
||||
if err != nil {
|
||||
cclog.ComponentError(m.name, "Cannot find 'sudo'")
|
||||
return err
|
||||
return fmt.Errorf("%s Init(): Cannot find 'sudo': %w", m.name, err)
|
||||
}
|
||||
m.sudoCmd = p
|
||||
}
|
||||
@@ -336,7 +334,7 @@ func (m *LustreCollector) Init(config json.RawMessage) error {
|
||||
if err != nil {
|
||||
p, err = exec.LookPath(LCTL_CMD)
|
||||
if err != nil {
|
||||
return err
|
||||
return fmt.Errorf("%s Init(): Cannot find %s command: %w", m.name, LCTL_CMD, err)
|
||||
}
|
||||
}
|
||||
m.lctl = p
|
||||
@@ -364,12 +362,12 @@ func (m *LustreCollector) Init(config json.RawMessage) error {
|
||||
}
|
||||
}
|
||||
if len(m.definitions) == 0 {
|
||||
return errors.New("no metrics to collect")
|
||||
return fmt.Errorf("%s Init(): no metrics to collect", m.name)
|
||||
}
|
||||
|
||||
devices := m.getDevices()
|
||||
if len(devices) == 0 {
|
||||
return errors.New("no Lustre devices found")
|
||||
return fmt.Errorf("%s Init(): no Lustre devices found", m.name)
|
||||
}
|
||||
m.stats = make(map[string]map[string]int64)
|
||||
for _, d := range devices {
|
||||
|
||||
@@ -55,3 +55,16 @@ Metrics:
|
||||
* `lustre_inode_permission_diff` (if `send_diff_values == true`)
|
||||
|
||||
This collector adds an `device` tag.
|
||||
|
||||
`lctl` typically require root to run.
|
||||
In order to run `cc-metric-collector` without root priviliges, you can enable `use_sudo`.
|
||||
Add a file like this in `/etc/sudoers.d/` to allow `cc-metric-collector` to run the required command:
|
||||
|
||||
```
|
||||
# Do not log the following sudo commands from monitoring, since this causes a lot of log spam.
|
||||
# However keep log_denied enabled, to detect failures
|
||||
Defaults: monitoring !log_allowed, !pam_session
|
||||
|
||||
# Allow to use lctl
|
||||
monitoring ALL = (root) NOPASSWD:/absolute/path/to/lctl get_param llite.*.stats
|
||||
```
|
||||
@@ -9,8 +9,8 @@ package collectors
|
||||
|
||||
import (
|
||||
"bufio"
|
||||
"bytes"
|
||||
"encoding/json"
|
||||
"errors"
|
||||
"fmt"
|
||||
"os"
|
||||
"path/filepath"
|
||||
@@ -72,7 +72,8 @@ func getStats(filename string) map[string]MemstatStats {
|
||||
for scanner.Scan() {
|
||||
line := scanner.Text()
|
||||
linefields := strings.Fields(line)
|
||||
if len(linefields) == 3 {
|
||||
switch len(linefields) {
|
||||
case 3:
|
||||
v, err := strconv.ParseFloat(linefields[1], 64)
|
||||
if err == nil {
|
||||
stats[strings.Trim(linefields[0], ":")] = MemstatStats{
|
||||
@@ -80,10 +81,10 @@ func getStats(filename string) map[string]MemstatStats {
|
||||
unit: linefields[2],
|
||||
}
|
||||
}
|
||||
} else if len(linefields) == 5 {
|
||||
case 5:
|
||||
v, err := strconv.ParseFloat(linefields[3], 64)
|
||||
if err == nil {
|
||||
cclog.ComponentDebug("getStats", strings.Trim(linefields[2], ":"), v, linefields[4])
|
||||
cclog.ComponentDebug("MemstatCollector", "getStats %s value %v unit %s", strings.Trim(linefields[2], ":"), v, linefields[4])
|
||||
stats[strings.Trim(linefields[2], ":")] = MemstatStats{
|
||||
value: v,
|
||||
unit: linefields[4],
|
||||
@@ -95,18 +96,21 @@ func getStats(filename string) map[string]MemstatStats {
|
||||
}
|
||||
|
||||
func (m *MemstatCollector) Init(config json.RawMessage) error {
|
||||
var err error
|
||||
m.name = "MemstatCollector"
|
||||
m.parallel = true
|
||||
m.config.NodeStats = true
|
||||
m.config.NumaStats = false
|
||||
if len(config) > 0 {
|
||||
err = json.Unmarshal(config, &m.config)
|
||||
if err != nil {
|
||||
return err
|
||||
d := json.NewDecoder(bytes.NewReader(config))
|
||||
d.DisallowUnknownFields()
|
||||
if err := d.Decode(&m.config); err != nil {
|
||||
return fmt.Errorf("%s Init(): Error decoding JSON config: %w", m.name, err)
|
||||
}
|
||||
}
|
||||
m.meta = map[string]string{"source": m.name, "group": "Memory"}
|
||||
m.meta = map[string]string{
|
||||
"source": m.name,
|
||||
"group": "Memory",
|
||||
}
|
||||
m.stats = make(map[string]int64)
|
||||
m.matches = make(map[string]string)
|
||||
m.tags = map[string]string{"type": "node"}
|
||||
@@ -120,10 +124,32 @@ func (m *MemstatCollector) Init(config json.RawMessage) error {
|
||||
"Cached": "mem_cached",
|
||||
"MemAvailable": "mem_available",
|
||||
"SwapFree": "swap_free",
|
||||
"MemShared": "mem_shared",
|
||||
"Shmem": "mem_shared",
|
||||
"Active": "mem_active",
|
||||
"Inactive": "mem_inactive",
|
||||
"Dirty": "mem_dirty",
|
||||
"Writeback": "mem_writeback",
|
||||
"AnonPages": "mem_anon_pages",
|
||||
"Mapped": "mem_mapped",
|
||||
"VmallocTotal": "mem_vmalloc_total",
|
||||
"AnonHugePages": "mem_anon_hugepages",
|
||||
"ShmemHugePages": "mem_shared_hugepages",
|
||||
"ShmemPmdMapped": "mem_shared_pmd_mapped",
|
||||
"HugePages_Total": "mem_hugepages_total",
|
||||
"HugePages_Free": "mem_hugepages_free",
|
||||
"HugePages_Rsvd": "mem_hugepages_reserved",
|
||||
"HugePages_Surp": "mem_hugepages_surplus",
|
||||
"Hugepagesize": "mem_hugepages_size",
|
||||
"DirectMap4k": "mem_direct_mapped_4k",
|
||||
"DirectMap4M": "mem_direct_mapped_4m",
|
||||
"DirectMap2M": "mem_direct_mapped_2m",
|
||||
"DirectMap1G": "mem_direct_mapped_1g",
|
||||
"Mlocked": "mem_locked",
|
||||
"PageTables": "mem_pagetables",
|
||||
"KernelStack": "mem_kernelstack",
|
||||
}
|
||||
for k, v := range matches {
|
||||
if !slices.Contains(m.config.ExcludeMetrics, k) {
|
||||
if !slices.Contains(m.config.ExcludeMetrics, v) {
|
||||
m.matches[k] = v
|
||||
}
|
||||
}
|
||||
@@ -131,8 +157,8 @@ func (m *MemstatCollector) Init(config json.RawMessage) error {
|
||||
if !slices.Contains(m.config.ExcludeMetrics, "mem_used") {
|
||||
m.sendMemUsed = true
|
||||
}
|
||||
if len(m.matches) == 0 {
|
||||
return errors.New("no metrics to collect")
|
||||
if len(m.matches) == 0 && !m.sendMemUsed {
|
||||
return fmt.Errorf("%s Init(): no metrics to collect", m.name)
|
||||
}
|
||||
if err := m.setup(); err != nil {
|
||||
return fmt.Errorf("%s Init(): setup() call failed: %w", m.name, err)
|
||||
@@ -140,7 +166,7 @@ func (m *MemstatCollector) Init(config json.RawMessage) error {
|
||||
|
||||
if m.config.NodeStats {
|
||||
if stats := getStats(MEMSTATFILE); len(stats) == 0 {
|
||||
return fmt.Errorf("cannot read data from file %s", MEMSTATFILE)
|
||||
return fmt.Errorf("%s Init(): cannot read data from file %s", m.name, MEMSTATFILE)
|
||||
}
|
||||
}
|
||||
|
||||
@@ -152,7 +178,7 @@ func (m *MemstatCollector) Init(config json.RawMessage) error {
|
||||
m.nodefiles = make(map[int]MemstatCollectorNode)
|
||||
for _, f := range files {
|
||||
if stats := getStats(f); len(stats) == 0 {
|
||||
return fmt.Errorf("cannot read data from file %s", f)
|
||||
return fmt.Errorf("%s Init(): cannot read data from file %s", m.name, f)
|
||||
}
|
||||
rematch := regex.FindStringSubmatch(f)
|
||||
if len(rematch) == 2 {
|
||||
@@ -172,7 +198,7 @@ func (m *MemstatCollector) Init(config json.RawMessage) error {
|
||||
}
|
||||
}
|
||||
m.init = true
|
||||
return err
|
||||
return nil
|
||||
}
|
||||
|
||||
func (m *MemstatCollector) Read(interval time.Duration, output chan lp.CCMessage) {
|
||||
@@ -191,7 +217,7 @@ func (m *MemstatCollector) Read(interval time.Duration, output chan lp.CCMessage
|
||||
}
|
||||
}
|
||||
|
||||
y, err := lp.NewMessage(name, tags, m.meta, map[string]any{"value": value}, time.Now())
|
||||
y, err := lp.NewMetric(name, tags, m.meta, value, time.Now())
|
||||
if err == nil {
|
||||
if len(unit) > 0 {
|
||||
y.AddMeta("unit", unit)
|
||||
@@ -221,10 +247,16 @@ func (m *MemstatCollector) Read(interval time.Duration, output chan lp.CCMessage
|
||||
unit = cacheVal.unit
|
||||
}
|
||||
}
|
||||
if shmemVal, shmem := stats["Shmem"]; shmem {
|
||||
memUsed -= shmemVal.value
|
||||
if len(shmemVal.unit) > 0 && len(unit) == 0 {
|
||||
unit = shmemVal.unit
|
||||
}
|
||||
}
|
||||
}
|
||||
y, err := lp.NewMessage("mem_used", tags, m.meta, map[string]any{"value": memUsed}, time.Now())
|
||||
}
|
||||
}
|
||||
y, err := lp.NewMetric("mem_used", tags, m.meta, memUsed, time.Now())
|
||||
if err == nil {
|
||||
if len(unit) > 0 {
|
||||
y.AddMeta("unit", unit)
|
||||
|
||||
@@ -32,7 +32,29 @@ Metrics:
|
||||
* `mem_cached`
|
||||
* `mem_available`
|
||||
* `mem_shared`
|
||||
* `mem_active`
|
||||
* `mem_inactive`
|
||||
* `mem_dirty`
|
||||
* `mem_writeback`
|
||||
* `mem_anon_pages`
|
||||
* `mem_mapped`
|
||||
* `mem_vmalloc_total`
|
||||
* `mem_anon_hugepages`
|
||||
* `mem_shared_hugepages`
|
||||
* `mem_shared_pmd_mapped`
|
||||
* `mem_hugepages_total`
|
||||
* `mem_hugepages_free`
|
||||
* `mem_hugepages_reserved`
|
||||
* `mem_hugepages_surplus`
|
||||
* `mem_hugepages_size`
|
||||
* `mem_direct_mapped_4k`
|
||||
* `mem_direct_mapped_2m`
|
||||
* `mem_direct_mapped_4m`
|
||||
* `mem_direct_mapped_1g`
|
||||
* `mem_locked`
|
||||
* `mem_pagetables`
|
||||
* `mem_kernelstack`
|
||||
* `swap_total`
|
||||
* `swap_free`
|
||||
* `mem_used` = `mem_total` - (`mem_free` + `mem_buffers` + `mem_cached`)
|
||||
* `mem_used` = `mem_total` - (`mem_free` + `mem_buffers` + `mem_cached` + `mem_shared`)
|
||||
|
||||
|
||||
@@ -9,6 +9,7 @@ package collectors
|
||||
|
||||
import (
|
||||
"bufio"
|
||||
"bytes"
|
||||
"encoding/json"
|
||||
"fmt"
|
||||
"os"
|
||||
@@ -99,10 +100,10 @@ func (m *NetstatCollector) Init(config json.RawMessage) error {
|
||||
m.config.SendDerivedValues = false
|
||||
// Read configuration file, allow overwriting default config
|
||||
if len(config) > 0 {
|
||||
err := json.Unmarshal(config, &m.config)
|
||||
if err != nil {
|
||||
cclog.ComponentError(m.name, "Error reading config:", err.Error())
|
||||
return err
|
||||
d := json.NewDecoder(bytes.NewReader(config))
|
||||
d.DisallowUnknownFields()
|
||||
if err := d.Decode(&m.config); err != nil {
|
||||
return fmt.Errorf("%s Init(): failed to decode JSON config: %w", m.name, err)
|
||||
}
|
||||
}
|
||||
|
||||
@@ -133,11 +134,31 @@ func (m *NetstatCollector) Init(config json.RawMessage) error {
|
||||
// Check if device is a included device
|
||||
if slices.Contains(m.config.IncludeDevices, canonical) {
|
||||
// Tag will contain original device name (raw).
|
||||
tags := map[string]string{"stype": "network", "stype-id": raw, "type": "node"}
|
||||
meta_unit_byte := map[string]string{"source": m.name, "group": "Network", "unit": "bytes"}
|
||||
meta_unit_byte_per_sec := map[string]string{"source": m.name, "group": "Network", "unit": "bytes/sec"}
|
||||
meta_unit_pkts := map[string]string{"source": m.name, "group": "Network", "unit": "packets"}
|
||||
meta_unit_pkts_per_sec := map[string]string{"source": m.name, "group": "Network", "unit": "packets/sec"}
|
||||
tags := map[string]string{
|
||||
"stype": "network",
|
||||
"stype-id": raw,
|
||||
"type": "node",
|
||||
}
|
||||
meta_unit_byte := map[string]string{
|
||||
"source": m.name,
|
||||
"group": "Network",
|
||||
"unit": "bytes",
|
||||
}
|
||||
meta_unit_byte_per_sec := map[string]string{
|
||||
"source": m.name,
|
||||
"group": "Network",
|
||||
"unit": "bytes/sec",
|
||||
}
|
||||
meta_unit_pkts := map[string]string{
|
||||
"source": m.name,
|
||||
"group": "Network",
|
||||
"unit": "packets",
|
||||
}
|
||||
meta_unit_pkts_per_sec := map[string]string{
|
||||
"source": m.name,
|
||||
"group": "Network",
|
||||
"unit": "packets/sec",
|
||||
}
|
||||
|
||||
m.matches[canonical] = []NetstatCollectorMetric{
|
||||
{
|
||||
@@ -201,16 +222,16 @@ func (m *NetstatCollector) Read(interval time.Duration, output chan lp.CCMessage
|
||||
|
||||
file, err := os.Open(NETSTATFILE)
|
||||
if err != nil {
|
||||
cclog.ComponentError(
|
||||
cclog.ComponentErrorf(
|
||||
m.name,
|
||||
fmt.Sprintf("Read(): Failed to open file '%s': %v", NETSTATFILE, err))
|
||||
"Read(): Failed to open file '%s': %v", NETSTATFILE, err)
|
||||
return
|
||||
}
|
||||
defer func() {
|
||||
if err := file.Close(); err != nil {
|
||||
cclog.ComponentError(
|
||||
cclog.ComponentErrorf(
|
||||
m.name,
|
||||
fmt.Sprintf("Read(): Failed to close file '%s': %v", NETSTATFILE, err))
|
||||
"Read(): Failed to close file '%s': %v", NETSTATFILE, err)
|
||||
}
|
||||
}()
|
||||
|
||||
@@ -241,14 +262,14 @@ func (m *NetstatCollector) Read(interval time.Duration, output chan lp.CCMessage
|
||||
continue
|
||||
}
|
||||
if m.config.SendAbsoluteValues {
|
||||
if y, err := lp.NewMessage(metric.name, metric.tags, metric.meta, map[string]any{"value": v}, now); err == nil {
|
||||
if y, err := lp.NewMetric(metric.name, metric.tags, metric.meta, v, now); err == nil {
|
||||
output <- y
|
||||
}
|
||||
}
|
||||
if m.config.SendDerivedValues {
|
||||
if metric.lastValue >= 0 {
|
||||
rate := float64(v-metric.lastValue) / timeDiff
|
||||
if y, err := lp.NewMessage(metric.name+"_bw", metric.tags, metric.meta_rates, map[string]any{"value": rate}, now); err == nil {
|
||||
if y, err := lp.NewMetric(metric.name+"_bw", metric.tags, metric.meta_rates, rate, now); err == nil {
|
||||
output <- y
|
||||
}
|
||||
}
|
||||
|
||||
@@ -8,6 +8,7 @@
|
||||
package collectors
|
||||
|
||||
import (
|
||||
"bytes"
|
||||
"encoding/json"
|
||||
"fmt"
|
||||
"slices"
|
||||
@@ -45,12 +46,7 @@ type nfsCollector struct {
|
||||
}
|
||||
|
||||
func (m *nfsCollector) updateStats() error {
|
||||
cmd := exec.Command(m.config.Nfsstats, `-l`, `--all`)
|
||||
|
||||
// Wait for cmd end
|
||||
if err := cmd.Wait(); err != nil {
|
||||
return fmt.Errorf("%s updateStats(): %w", m.name, err)
|
||||
}
|
||||
cmd := exec.Command(m.config.Nfsstats, "-l", "--all")
|
||||
|
||||
buffer, err := cmd.Output()
|
||||
if err != nil {
|
||||
@@ -95,9 +91,10 @@ func (m *nfsCollector) MainInit(config json.RawMessage) error {
|
||||
m.config.Nfsstats = string(NFSSTAT_EXEC)
|
||||
// Read JSON configuration
|
||||
if len(config) > 0 {
|
||||
err := json.Unmarshal(config, &m.config)
|
||||
if err != nil {
|
||||
return fmt.Errorf("%s Init(): failed to unmarshal JSON config: %w", m.name, err)
|
||||
d := json.NewDecoder(bytes.NewReader(config))
|
||||
d.DisallowUnknownFields()
|
||||
if err := d.Decode(&m.config); err != nil {
|
||||
return fmt.Errorf("%s Init(): failed to decode JSON config: %w", m.name, err)
|
||||
}
|
||||
}
|
||||
m.meta = map[string]string{
|
||||
@@ -128,10 +125,9 @@ func (m *nfsCollector) Read(interval time.Duration, output chan lp.CCMessage) {
|
||||
timestamp := time.Now()
|
||||
|
||||
if err := m.updateStats(); err != nil {
|
||||
cclog.ComponentError(
|
||||
cclog.ComponentErrorf(
|
||||
m.name,
|
||||
fmt.Sprintf("Read(): updateStats() failed: %v", err),
|
||||
)
|
||||
"Read(): updateStats() failed: %v", err)
|
||||
return
|
||||
}
|
||||
var prefix string
|
||||
@@ -149,16 +145,15 @@ func (m *nfsCollector) Read(interval time.Duration, output chan lp.CCMessage) {
|
||||
continue
|
||||
}
|
||||
|
||||
valueMap := make(map[string]any)
|
||||
if data.current >= 0 && data.last >= 0 {
|
||||
valueMap["value"] = data.current - data.last
|
||||
}
|
||||
y, err := lp.NewMessage(fmt.Sprintf("%s_%s", prefix, name), m.tags, m.meta, valueMap, timestamp)
|
||||
value := data.current - data.last
|
||||
y, err := lp.NewMetric(fmt.Sprintf("%s_%s", prefix, name), m.tags, m.meta, value, timestamp)
|
||||
if err == nil {
|
||||
y.AddMeta("version", m.version)
|
||||
output <- y
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func (m *nfsCollector) Close() {
|
||||
|
||||
@@ -8,6 +8,7 @@
|
||||
package collectors
|
||||
|
||||
import (
|
||||
"bytes"
|
||||
"encoding/json"
|
||||
"fmt"
|
||||
"os"
|
||||
@@ -17,14 +18,13 @@ import (
|
||||
"strings"
|
||||
"time"
|
||||
|
||||
cclog "github.com/ClusterCockpit/cc-lib/v2/ccLogger"
|
||||
lp "github.com/ClusterCockpit/cc-lib/v2/ccMessage"
|
||||
)
|
||||
|
||||
// These are the fields we read from the JSON configuration
|
||||
type NfsIOStatCollectorConfig struct {
|
||||
ExcludeMetrics []string `json:"exclude_metrics,omitempty"`
|
||||
ExcludeFilesystem []string `json:"exclude_filesystem,omitempty"`
|
||||
ExcludeFilesystems []string `json:"exclude_filesystem,omitempty"`
|
||||
UseServerAddressAsSType bool `json:"use_server_as_stype,omitempty"`
|
||||
SendAbsoluteValues bool `json:"send_abs_values"`
|
||||
SendDerivedValues bool `json:"send_derived_values"`
|
||||
@@ -75,7 +75,7 @@ func (m *NfsIOStatCollector) readNfsiostats() map[string]map[string]int64 {
|
||||
// Is this a device line with mount point, remote target and NFS version?
|
||||
dev := resolve_regex_fields(l, deviceRegex)
|
||||
if len(dev) > 0 {
|
||||
if !slices.Contains(m.config.ExcludeFilesystem, dev[m.key]) {
|
||||
if !slices.Contains(m.config.ExcludeFilesystems, dev[m.key]) {
|
||||
current = dev
|
||||
if len(current["version"]) == 0 {
|
||||
current["version"] = "3"
|
||||
@@ -104,7 +104,6 @@ func (m *NfsIOStatCollector) readNfsiostats() map[string]map[string]int64 {
|
||||
}
|
||||
|
||||
func (m *NfsIOStatCollector) Init(config json.RawMessage) error {
|
||||
var err error = nil
|
||||
m.name = "NfsIOStatCollector"
|
||||
if err := m.setup(); err != nil {
|
||||
return fmt.Errorf("%s Init(): setup() call failed: %w", m.name, err)
|
||||
@@ -117,10 +116,10 @@ func (m *NfsIOStatCollector) Init(config json.RawMessage) error {
|
||||
m.config.SendAbsoluteValues = true
|
||||
m.config.SendDerivedValues = false
|
||||
if len(config) > 0 {
|
||||
err = json.Unmarshal(config, &m.config)
|
||||
if err != nil {
|
||||
cclog.ComponentError(m.name, "Error reading config:", err.Error())
|
||||
return err
|
||||
d := json.NewDecoder(bytes.NewReader(config))
|
||||
d.DisallowUnknownFields()
|
||||
if err := d.Decode(&m.config); err != nil {
|
||||
return fmt.Errorf("%s Init(): failed to decode JSON config: %w", m.name, err)
|
||||
}
|
||||
}
|
||||
m.key = "mntpoint"
|
||||
@@ -130,7 +129,7 @@ func (m *NfsIOStatCollector) Init(config json.RawMessage) error {
|
||||
m.data = m.readNfsiostats()
|
||||
m.lastTimestamp = time.Now()
|
||||
m.init = true
|
||||
return err
|
||||
return nil
|
||||
}
|
||||
|
||||
func (m *NfsIOStatCollector) Read(interval time.Duration, output chan lp.CCMessage) {
|
||||
@@ -146,14 +145,7 @@ func (m *NfsIOStatCollector) Read(interval time.Duration, output chan lp.CCMessa
|
||||
if old, ok := m.data[mntpoint]; ok {
|
||||
for name, newVal := range values {
|
||||
if m.config.SendAbsoluteValues {
|
||||
msg, err := lp.NewMessage(
|
||||
"nfsio_"+name,
|
||||
m.tags,
|
||||
m.meta,
|
||||
map[string]any{
|
||||
"value": newVal,
|
||||
},
|
||||
now)
|
||||
msg, err := lp.NewMetric("nfsio_"+name, m.tags, m.meta, newVal, now)
|
||||
if err == nil {
|
||||
msg.AddTag("stype", "filesystem")
|
||||
msg.AddTag("stype-id", mntpoint)
|
||||
@@ -162,7 +154,7 @@ func (m *NfsIOStatCollector) Read(interval time.Duration, output chan lp.CCMessa
|
||||
}
|
||||
if m.config.SendDerivedValues {
|
||||
rate := float64(newVal-old[name]) / timeDiff
|
||||
msg, err := lp.NewMessage(fmt.Sprintf("nfsio_%s_bw", name), m.tags, m.meta, map[string]any{"value": rate}, now)
|
||||
msg, err := lp.NewMetric(fmt.Sprintf("nfsio_%s_bw", name), m.tags, m.meta, rate, now)
|
||||
if err == nil {
|
||||
if strings.HasPrefix(name, "page") {
|
||||
msg.AddMeta("unit", "4K_pages/s")
|
||||
|
||||
@@ -16,7 +16,7 @@ hugo_path: docs/reference/cc-metric-collector/collectors/nfsio.md
|
||||
"exclude_metrics": [
|
||||
"oread", "pageread"
|
||||
],
|
||||
"exclude_filesystems": [
|
||||
"exclude_filesystem": [
|
||||
"/mnt"
|
||||
],
|
||||
"use_server_as_stype": false,
|
||||
|
||||
@@ -2,6 +2,7 @@ package collectors
|
||||
|
||||
import (
|
||||
"bufio"
|
||||
"bytes"
|
||||
"encoding/json"
|
||||
"fmt"
|
||||
"os"
|
||||
@@ -83,9 +84,10 @@ func (m *NUMAStatsCollector) Init(config json.RawMessage) error {
|
||||
|
||||
m.config.SendAbsoluteValues = true
|
||||
if len(config) > 0 {
|
||||
err := json.Unmarshal(config, &m.config)
|
||||
if err != nil {
|
||||
return fmt.Errorf("%s Init(): unable to unmarshal numastat configuration: %w", m.name, err)
|
||||
d := json.NewDecoder(bytes.NewReader(config))
|
||||
d.DisallowUnknownFields()
|
||||
if err := d.Decode(&m.config); err != nil {
|
||||
return fmt.Errorf("%s Init(): Error decoding JSON config: %w", m.name, err)
|
||||
}
|
||||
}
|
||||
|
||||
@@ -115,7 +117,7 @@ func (m *NUMAStatsCollector) Init(config json.RawMessage) error {
|
||||
}
|
||||
|
||||
// Initialized
|
||||
cclog.ComponentDebug(m.name, "initialized", len(m.topology), "NUMA domains")
|
||||
cclog.ComponentDebugf(m.name, "initialized %d NUMA domains", len(m.topology))
|
||||
m.init = true
|
||||
return nil
|
||||
}
|
||||
|
||||
@@ -15,7 +15,7 @@ hugo_path: docs/reference/cc-metric-collector/collectors/numastat.md
|
||||
"numastats": {
|
||||
"send_abs_values" : true,
|
||||
"send_derived_values" : true
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
The `numastat` collector reads data from `/sys/devices/system/node/node*/numastat` and outputs a handful **memoryDomain** metrics. See: <https://www.kernel.org/doc/html/latest/admin-guide/numastat.html>
|
||||
|
||||
396
collectors/nvidiaGPMMetric.go
Normal file
396
collectors/nvidiaGPMMetric.go
Normal file
@@ -0,0 +1,396 @@
|
||||
package collectors
|
||||
|
||||
import (
|
||||
"encoding/json"
|
||||
"errors"
|
||||
"fmt"
|
||||
"slices"
|
||||
"strconv"
|
||||
"strings"
|
||||
"time"
|
||||
|
||||
cclog "github.com/ClusterCockpit/cc-lib/v2/ccLogger"
|
||||
lp "github.com/ClusterCockpit/cc-lib/v2/ccMessage"
|
||||
"github.com/NVIDIA/go-nvml/pkg/nvml"
|
||||
)
|
||||
|
||||
type NvidiaGPMMetricDef struct {
|
||||
name string
|
||||
outname string
|
||||
id nvml.GpmMetricId
|
||||
unit string
|
||||
}
|
||||
|
||||
var NvidiaGPMMetrics []NvidiaGPMMetricDef = []NvidiaGPMMetricDef{
|
||||
{
|
||||
name: "GRAPHICS_UTIL",
|
||||
outname: "nv_gpm_graphics_util",
|
||||
id: nvml.GPM_METRIC_GRAPHICS_UTIL,
|
||||
unit: "%",
|
||||
},
|
||||
{
|
||||
name: "SM_UTIL",
|
||||
outname: "nv_gpm_sm_util",
|
||||
id: nvml.GPM_METRIC_SM_UTIL,
|
||||
unit: "%",
|
||||
},
|
||||
{
|
||||
name: "SM_OCCUPANCY",
|
||||
outname: "nv_gpm_sm_occupancy",
|
||||
id: nvml.GPM_METRIC_SM_OCCUPANCY,
|
||||
unit: "%",
|
||||
},
|
||||
{
|
||||
name: "INTEGER_UTIL",
|
||||
outname: "nv_gpm_integer_util",
|
||||
id: nvml.GPM_METRIC_INTEGER_UTIL,
|
||||
unit: "%",
|
||||
},
|
||||
{
|
||||
name: "ANY_TENSOR_UTIL",
|
||||
outname: "nv_gpm_any_tensor_util",
|
||||
id: nvml.GPM_METRIC_ANY_TENSOR_UTIL,
|
||||
unit: "%",
|
||||
},
|
||||
{
|
||||
name: "DFMA_TENSOR_UTIL",
|
||||
outname: "nv_gpm_dfma_tensor_util",
|
||||
id: nvml.GPM_METRIC_DFMA_TENSOR_UTIL,
|
||||
unit: "%",
|
||||
},
|
||||
{
|
||||
name: "HMMA_TENSOR_UTIL",
|
||||
outname: "nv_gpm_hmma_tensor_util",
|
||||
id: nvml.GPM_METRIC_HMMA_TENSOR_UTIL,
|
||||
unit: "%",
|
||||
},
|
||||
{
|
||||
name: "IMMA_TENSOR_UTIL",
|
||||
outname: "nv_gpm_imma_tensor_util",
|
||||
id: nvml.GPM_METRIC_IMMA_TENSOR_UTIL,
|
||||
unit: "%",
|
||||
},
|
||||
{
|
||||
name: "DRAM_BW_UTIL",
|
||||
outname: "nv_gpm_dram_bw_util",
|
||||
id: nvml.GPM_METRIC_DRAM_BW_UTIL,
|
||||
unit: "%",
|
||||
},
|
||||
{
|
||||
name: "FP64_UTIL",
|
||||
outname: "nv_gpm_fp64_util",
|
||||
id: nvml.GPM_METRIC_FP64_UTIL,
|
||||
unit: "%",
|
||||
},
|
||||
{
|
||||
name: "FP32_UTIL",
|
||||
outname: "nv_gpm_fp32_util",
|
||||
id: nvml.GPM_METRIC_FP32_UTIL,
|
||||
unit: "%",
|
||||
},
|
||||
{
|
||||
name: "FP16_UTIL",
|
||||
outname: "nv_gpm_fp16_util",
|
||||
id: nvml.GPM_METRIC_FP16_UTIL,
|
||||
unit: "%",
|
||||
},
|
||||
}
|
||||
|
||||
type NvidiaGPMCollectorConfig struct {
|
||||
Metrics []string `json:"metrics,omitempty"`
|
||||
ExcludeDevices []string `json:"exclude_devices,omitempty"`
|
||||
AddPciInfoTag bool `json:"add_pci_info_tag,omitempty"`
|
||||
UsePciInfoAsTypeId bool `json:"use_pci_info_as_type_id,omitempty"`
|
||||
AddUuidMeta bool `json:"add_uuid_meta,omitempty"`
|
||||
AddBoardNumberMeta bool `json:"add_board_number_meta,omitempty"`
|
||||
AddSerialMeta bool `json:"add_serial_meta,omitempty"`
|
||||
ProcessMigDevices bool `json:"process_mig_devices,omitempty"`
|
||||
UseUuidForMigDevices bool `json:"use_uuid_for_mig_device,omitempty"`
|
||||
UseSliceForMigDevices bool `json:"use_slice_for_mig_device,omitempty"`
|
||||
}
|
||||
|
||||
type NvidiaGPMCollectorDevice struct {
|
||||
device nvml.Device
|
||||
tags map[string]string
|
||||
meta map[string]string
|
||||
startTime time.Time
|
||||
endTime time.Time
|
||||
measurement nvml.GpmMetricsGetType
|
||||
metricsLookup map[int]NvidiaGPMMetricDef
|
||||
}
|
||||
|
||||
type NvidiaGPMCollector struct {
|
||||
metricCollector
|
||||
|
||||
config NvidiaGPMCollectorConfig
|
||||
gpus []NvidiaGPMCollectorDevice
|
||||
num_gpus int
|
||||
}
|
||||
|
||||
func (m *NvidiaGPMCollector) Init(config json.RawMessage) error {
|
||||
var err error = nil
|
||||
m.name = "NvidiaGPMCollector"
|
||||
m.parallel = true
|
||||
if err := m.setup(); err != nil {
|
||||
return fmt.Errorf("%s Init(): setup() call failed: %w", m.name, err)
|
||||
}
|
||||
if len(config) > 0 {
|
||||
d := json.NewDecoder(strings.NewReader(string(config)))
|
||||
d.DisallowUnknownFields()
|
||||
if err = d.Decode(&m.config); err != nil {
|
||||
return fmt.Errorf("%s Init(): Error decoding JSON config: %w", m.name, err)
|
||||
}
|
||||
}
|
||||
m.meta = map[string]string{
|
||||
"source": m.name,
|
||||
"group": "NvidiaGPM",
|
||||
}
|
||||
|
||||
// Initialize NVIDIA Management Library (NVML)
|
||||
ret := nvml.Init()
|
||||
|
||||
// Error: NVML library not found
|
||||
// (nvml.ErrorString can not be used in this case)
|
||||
if ret == nvml.ERROR_LIBRARY_NOT_FOUND {
|
||||
return fmt.Errorf("%s Init(): NVML library not found", m.name)
|
||||
}
|
||||
if ret != nvml.SUCCESS {
|
||||
err = errors.New(nvml.ErrorString(ret))
|
||||
return fmt.Errorf("%s Init(): Unable to initialize NVML: %w", m.name, err)
|
||||
}
|
||||
|
||||
// Number of NVIDIA GPUs
|
||||
num_gpus, ret := nvml.DeviceGetCount()
|
||||
if ret != nvml.SUCCESS {
|
||||
err = errors.New(nvml.ErrorString(ret))
|
||||
return fmt.Errorf("%s Init(): Unable to get device count: %w", m.name, err)
|
||||
}
|
||||
|
||||
// For all GPUs
|
||||
m.gpus = make([]NvidiaGPMCollectorDevice, 0, num_gpus)
|
||||
for i := range num_gpus {
|
||||
|
||||
// Skip excluded devices by ID
|
||||
str_i := strconv.Itoa(i)
|
||||
if slices.Contains(m.config.ExcludeDevices, str_i) {
|
||||
cclog.ComponentDebugf(m.name, "Skipping excluded device %s", str_i)
|
||||
continue
|
||||
}
|
||||
|
||||
// Get device handle
|
||||
device, ret := nvml.DeviceGetHandleByIndex(i)
|
||||
if ret != nvml.SUCCESS {
|
||||
err = errors.New(nvml.ErrorString(ret))
|
||||
cclog.ComponentErrorf(m.name, "Unable to get device at index %d: %s", i, err.Error())
|
||||
continue
|
||||
}
|
||||
|
||||
supportInfo, ret := nvml.GpmQueryDeviceSupport(device)
|
||||
if ret != nvml.SUCCESS {
|
||||
err = errors.New(nvml.ErrorString(ret))
|
||||
cclog.ComponentErrorf(m.name, "Unable to query GPM support for device at index %d: %s", i, err.Error())
|
||||
continue
|
||||
} else {
|
||||
if supportInfo.IsSupportedDevice == uint32(nvml.FEATURE_DISABLED) {
|
||||
cclog.ComponentErrorf(m.name, "Device at index %d does not support GPM metrics", i)
|
||||
continue
|
||||
}
|
||||
}
|
||||
|
||||
stream, ret := nvml.GpmQueryIfStreamingEnabled(device)
|
||||
if ret != nvml.SUCCESS {
|
||||
err = errors.New(nvml.ErrorString(ret))
|
||||
cclog.ComponentErrorf(m.name, "Unable to query GPM streaming for device at index %d: %s", i, err.Error())
|
||||
continue
|
||||
} else {
|
||||
if stream == uint32(nvml.FEATURE_DISABLED) {
|
||||
ret = nvml.GpmSetStreamingEnabled(device, uint32(nvml.FEATURE_ENABLED))
|
||||
if ret != nvml.SUCCESS {
|
||||
err = errors.New(nvml.ErrorString(ret))
|
||||
cclog.ComponentErrorf(m.name, "Unable to set streaming mode for device at index %d: %s", i, err.Error())
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Get device's PCI info
|
||||
pciInfo, ret := nvml.DeviceGetPciInfo(device)
|
||||
if ret != nvml.SUCCESS {
|
||||
err = errors.New(nvml.ErrorString(ret))
|
||||
cclog.ComponentErrorf(m.name, "Unable to get PCI info for device at index %d: %s", i, err.Error())
|
||||
continue
|
||||
}
|
||||
// Create PCI ID in the common format used by the NVML.
|
||||
pci_id := fmt.Sprintf(
|
||||
nvml.DEVICE_PCI_BUS_ID_FMT,
|
||||
pciInfo.Domain,
|
||||
pciInfo.Bus,
|
||||
pciInfo.Device)
|
||||
|
||||
// Skip excluded devices specified by PCI ID
|
||||
if slices.Contains(m.config.ExcludeDevices, pci_id) {
|
||||
cclog.ComponentDebugf(m.name, "Skipping excluded device %s", pci_id)
|
||||
continue
|
||||
}
|
||||
ss, nvmlErr := nvml.GpmSampleAlloc()
|
||||
if nvmlErr != nvml.SUCCESS {
|
||||
err = errors.New(nvml.ErrorString(ret))
|
||||
cclog.ComponentErrorf(m.name, "Failed to allocate GPM sample for device %d: %s", i, err.Error())
|
||||
continue
|
||||
}
|
||||
es, nvmlErr := nvml.GpmSampleAlloc()
|
||||
if nvmlErr != nvml.SUCCESS {
|
||||
err = errors.New(nvml.ErrorString(ret))
|
||||
cclog.ComponentErrorf(m.name, "Failed to allocate GPM sample for device %d: %s", i, err.Error())
|
||||
continue
|
||||
}
|
||||
|
||||
// Select which value to use as 'type-id'.
|
||||
// The PCI ID is commonly required in SLURM environments because the
|
||||
// numberic IDs used by SLURM and the ones used by NVML might differ
|
||||
// depending on the job type. The PCI ID is more reliable but is commonly
|
||||
// not recorded for a job, so it must be added manually in prologue or epilogue
|
||||
// e.g. to the comment field
|
||||
tid := str_i
|
||||
if m.config.UsePciInfoAsTypeId {
|
||||
tid = pci_id
|
||||
}
|
||||
|
||||
// Now we got all infos together, populate the device list
|
||||
g := NvidiaGPMCollectorDevice{}
|
||||
|
||||
// Add device handle
|
||||
g.device = device
|
||||
|
||||
// Add tags
|
||||
g.tags = map[string]string{
|
||||
"type": "accelerator",
|
||||
"type-id": tid,
|
||||
}
|
||||
|
||||
// Add PCI info as tag if not already used as 'type-id'
|
||||
if m.config.AddPciInfoTag && !m.config.UsePciInfoAsTypeId {
|
||||
g.tags["pci_identifier"] = pci_id
|
||||
}
|
||||
|
||||
g.meta = map[string]string{
|
||||
"source": m.name,
|
||||
"group": "Nvidia",
|
||||
}
|
||||
|
||||
if m.config.AddBoardNumberMeta {
|
||||
board, ret := nvml.DeviceGetBoardPartNumber(device)
|
||||
if ret != nvml.SUCCESS {
|
||||
err = errors.New(nvml.ErrorString(ret))
|
||||
cclog.ComponentError(m.name, "Unable to get boart part number for device at index", i, ":", err.Error())
|
||||
} else {
|
||||
g.meta["board_number"] = board
|
||||
}
|
||||
}
|
||||
if m.config.AddSerialMeta {
|
||||
serial, ret := nvml.DeviceGetSerial(device)
|
||||
if ret != nvml.SUCCESS {
|
||||
err = errors.New(nvml.ErrorString(ret))
|
||||
cclog.ComponentError(m.name, "Unable to get serial number for device at index", i, ":", err.Error())
|
||||
} else {
|
||||
g.meta["serial"] = serial
|
||||
}
|
||||
}
|
||||
if m.config.AddUuidMeta {
|
||||
uuid, ret := nvml.DeviceGetUUID(device)
|
||||
if ret != nvml.SUCCESS {
|
||||
err = errors.New(nvml.ErrorString(ret))
|
||||
cclog.ComponentError(m.name, "Unable to get UUID for device at index", i, ":", err.Error())
|
||||
} else {
|
||||
g.meta["uuid"] = uuid
|
||||
}
|
||||
}
|
||||
|
||||
g.measurement.Sample1 = ss
|
||||
g.measurement.Sample2 = es
|
||||
g.measurement.Version = nvml.GPM_METRICS_GET_VERSION
|
||||
g.metricsLookup = make(map[int]NvidiaGPMMetricDef)
|
||||
metIdx := 0
|
||||
for _, inmetric := range m.config.Metrics {
|
||||
for _, defmetric := range NvidiaGPMMetrics {
|
||||
if inmetric == defmetric.outname || inmetric == defmetric.name {
|
||||
g.measurement.Metrics[metIdx] = nvml.GpmMetric{
|
||||
MetricId: uint32(defmetric.id),
|
||||
}
|
||||
g.metricsLookup[metIdx] = defmetric
|
||||
metIdx += 1
|
||||
}
|
||||
}
|
||||
}
|
||||
g.measurement.NumMetrics = uint32(metIdx)
|
||||
m.gpus = append(m.gpus, g)
|
||||
}
|
||||
cclog.ComponentDebugf(m.name, "Found %d Nvidia GPUs with GPM support", len(m.gpus))
|
||||
m.num_gpus = len(m.gpus)
|
||||
m.init = true
|
||||
return err
|
||||
}
|
||||
|
||||
func (m *NvidiaGPMCollector) Read(interval time.Duration, output chan lp.CCMessage) {
|
||||
var err error
|
||||
if !m.init {
|
||||
return
|
||||
}
|
||||
for i, gpu := range m.gpus {
|
||||
gpu.startTime = time.Now()
|
||||
nvmlErr := gpu.measurement.Sample1.Get(gpu.device)
|
||||
if nvmlErr != nvml.SUCCESS {
|
||||
err = errors.New(nvml.ErrorString(nvmlErr))
|
||||
cclog.ComponentError(m.name, "Unable to get start GPM sample for device at index", i, ":", err.Error())
|
||||
continue
|
||||
}
|
||||
}
|
||||
time.Sleep(interval)
|
||||
|
||||
for i, gpu := range m.gpus {
|
||||
gpu.endTime = time.Now()
|
||||
nvmlErr := gpu.measurement.Sample2.Get(gpu.device)
|
||||
if nvmlErr != nvml.SUCCESS {
|
||||
err = errors.New(nvml.ErrorString(nvmlErr))
|
||||
cclog.ComponentError(m.name, "Unable to get stop GPM sample for device at index", i, ":", err.Error())
|
||||
continue
|
||||
}
|
||||
}
|
||||
|
||||
for i, gpu := range m.gpus {
|
||||
nvmlErr := nvml.GpmMetricsGet(&gpu.measurement)
|
||||
if nvmlErr != nvml.SUCCESS {
|
||||
err = errors.New(nvml.ErrorString(nvmlErr))
|
||||
cclog.ComponentError(m.name, "Unable to get evaluate GPM sample for device at index", i, ":", err.Error())
|
||||
continue
|
||||
}
|
||||
for idx, metricDef := range gpu.metricsLookup {
|
||||
y, err := lp.NewMetric(metricDef.outname, gpu.tags, gpu.meta, gpu.measurement.Metrics[idx].Value, time.Now())
|
||||
if err == nil {
|
||||
y.AddMeta("unit", metricDef.unit)
|
||||
output <- y
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
func (m *NvidiaGPMCollector) Close() {
|
||||
if m.init {
|
||||
for i, gpu := range m.gpus {
|
||||
ret := gpu.measurement.Sample1.Free()
|
||||
if ret != nvml.SUCCESS {
|
||||
err := errors.New(nvml.ErrorString(ret))
|
||||
cclog.ComponentErrorf(m.name, "Unable to free start sample for device at index %d: %s", i, err.Error())
|
||||
}
|
||||
ret = gpu.measurement.Sample2.Free()
|
||||
if ret != nvml.SUCCESS {
|
||||
err := errors.New(nvml.ErrorString(ret))
|
||||
cclog.ComponentErrorf(m.name, "Unable to free stop sample for device at index %d: %s", i, err.Error())
|
||||
}
|
||||
}
|
||||
if ret := nvml.Shutdown(); ret != nvml.SUCCESS {
|
||||
cclog.ComponentError(m.name, "nvml.Shutdown() not successful")
|
||||
}
|
||||
m.init = false
|
||||
}
|
||||
}
|
||||
54
collectors/nvidiaGPMMetric.md
Normal file
54
collectors/nvidiaGPMMetric.md
Normal file
@@ -0,0 +1,54 @@
|
||||
<!--
|
||||
---
|
||||
title: "Nvidia NVML GPM metric collector"
|
||||
description: Collect metrics for Nvidia GPUs using the NVML GPM interface
|
||||
categories: [cc-metric-collector]
|
||||
tags: ['Admin']
|
||||
weight: 2
|
||||
hugo_path: docs/reference/cc-metric-collector/collectors/nvidiaGPM.md
|
||||
---
|
||||
-->
|
||||
|
||||
## `nvidiaGPM` collector
|
||||
|
||||
```json
|
||||
"nvidia_gpm": {
|
||||
"metrics": [
|
||||
"nv_fb_mem_used",
|
||||
"nv_fan"
|
||||
],
|
||||
"exclude_devices": [
|
||||
"0","1", "0000000:ff:01.0"
|
||||
],
|
||||
|
||||
"process_mig_devices": false,
|
||||
"use_pci_info_as_type_id": true,
|
||||
"add_pci_info_tag": false,
|
||||
"add_uuid_meta": false,
|
||||
"add_board_number_meta": false,
|
||||
"add_serial_meta": false,
|
||||
"use_uuid_for_mig_device": false,
|
||||
"use_slice_for_mig_device": false
|
||||
}
|
||||
```
|
||||
|
||||
The `nvidia_gpm` collector can be configured to leave out specific devices with the `exclude_devices` option. It takes IDs as supplied to the NVML with `nvmlDeviceGetHandleByIndex()` or the PCI address in NVML format (`%08X:%02X:%02X.0`). Commonly only the physical GPUs are monitored. If MIG devices should be analyzed as well, set `process_mig_devices` (adds `stype=mig,stype-id=<mig_index>`). With the options `use_uuid_for_mig_device` and `use_slice_for_mig_device`, the `<mig_index>` can be replaced with the UUID (e.g. `MIG-6a9f7cc8-6d5b-5ce0-92de-750edc4d8849`) or the MIG slice name (e.g. `1g.5gb`).
|
||||
|
||||
The metrics sent by the `nvidia_gpm` collector use `accelerator` as `type` tag. For the `type-id`, it uses the device handle index by default. With the `use_pci_info_as_type_id` option, the PCI ID is used instead. If both values should be added as tags, activate the `add_pci_info_tag` option. It uses the device handle index as `type-id` and adds the PCI ID as separate `pci_identifier` tag.
|
||||
|
||||
Optionally, it is possible to add the UUID, the board part number and the serial to the meta informations. They are not sent to the sinks (if not configured otherwise).
|
||||
|
||||
|
||||
Available Metrics:
|
||||
* `nv_gpm_graphics_util`
|
||||
* `nv_gpm_sm_util`
|
||||
* `nv_gpm_sm_occupancy`
|
||||
* `nv_gpm_integer_util`
|
||||
* `nv_gpm_any_tensor_util`
|
||||
* `nv_gpm_dfma_tensor_util`
|
||||
* `nv_gpm_hmma_tensor_util`
|
||||
* `nv_gpm_imma_tensor_util`
|
||||
* `nv_gpm_dram_bw_util`
|
||||
* `nv_gpm_fp64_util`
|
||||
* `nv_gpm_fp32_util`
|
||||
* `nv_gpm_fp16_util`
|
||||
@@ -72,9 +72,10 @@ func (m *NvidiaCollector) Init(config json.RawMessage) error {
|
||||
return fmt.Errorf("%s Init(): setup() call failed: %w", m.name, err)
|
||||
}
|
||||
if len(config) > 0 {
|
||||
err = json.Unmarshal(config, &m.config)
|
||||
if err != nil {
|
||||
return err
|
||||
d := json.NewDecoder(strings.NewReader(string(config)))
|
||||
d.DisallowUnknownFields()
|
||||
if err = d.Decode(&m.config); err != nil {
|
||||
return fmt.Errorf("%s Init(): Error decoding JSON config: %w", m.name, err)
|
||||
}
|
||||
}
|
||||
m.meta = map[string]string{
|
||||
@@ -90,22 +91,18 @@ func (m *NvidiaCollector) Init(config json.RawMessage) error {
|
||||
// Error: NVML library not found
|
||||
// (nvml.ErrorString can not be used in this case)
|
||||
if ret == nvml.ERROR_LIBRARY_NOT_FOUND {
|
||||
err = fmt.Errorf("NVML library not found")
|
||||
cclog.ComponentError(m.name, err.Error())
|
||||
return err
|
||||
return fmt.Errorf("%s Init(): NVML library not found", m.name)
|
||||
}
|
||||
if ret != nvml.SUCCESS {
|
||||
err = errors.New(nvml.ErrorString(ret))
|
||||
cclog.ComponentError(m.name, "Unable to initialize NVML", err.Error())
|
||||
return err
|
||||
return fmt.Errorf("%s Init(): Unable to initialize NVML: %w", m.name, err)
|
||||
}
|
||||
|
||||
// Number of NVIDIA GPUs
|
||||
num_gpus, ret := nvml.DeviceGetCount()
|
||||
if ret != nvml.SUCCESS {
|
||||
err = errors.New(nvml.ErrorString(ret))
|
||||
cclog.ComponentError(m.name, "Unable to get device count", err.Error())
|
||||
return err
|
||||
return fmt.Errorf("%s Init(): Unable to get device count: %w", m.name, err)
|
||||
}
|
||||
|
||||
// For all GPUs
|
||||
@@ -116,7 +113,7 @@ func (m *NvidiaCollector) Init(config json.RawMessage) error {
|
||||
// Skip excluded devices by ID
|
||||
str_i := strconv.Itoa(i)
|
||||
if slices.Contains(m.config.ExcludeDevices, str_i) {
|
||||
cclog.ComponentDebug(m.name, "Skipping excluded device", str_i)
|
||||
cclog.ComponentDebugf(m.name, "Skipping excluded device %s", str_i)
|
||||
continue
|
||||
}
|
||||
|
||||
@@ -124,7 +121,7 @@ func (m *NvidiaCollector) Init(config json.RawMessage) error {
|
||||
device, ret := nvml.DeviceGetHandleByIndex(i)
|
||||
if ret != nvml.SUCCESS {
|
||||
err = errors.New(nvml.ErrorString(ret))
|
||||
cclog.ComponentError(m.name, "Unable to get device at index", i, ":", err.Error())
|
||||
cclog.ComponentErrorf(m.name, "Unable to get device at index %d: %s", i, err.Error())
|
||||
continue
|
||||
}
|
||||
|
||||
@@ -132,7 +129,7 @@ func (m *NvidiaCollector) Init(config json.RawMessage) error {
|
||||
pciInfo, ret := nvml.DeviceGetPciInfo(device)
|
||||
if ret != nvml.SUCCESS {
|
||||
err = errors.New(nvml.ErrorString(ret))
|
||||
cclog.ComponentError(m.name, "Unable to get PCI info for device at index", i, ":", err.Error())
|
||||
cclog.ComponentErrorf(m.name, "Unable to get PCI info for device at index %d: %s", i, err.Error())
|
||||
continue
|
||||
}
|
||||
// Create PCI ID in the common format used by the NVML.
|
||||
@@ -144,7 +141,7 @@ func (m *NvidiaCollector) Init(config json.RawMessage) error {
|
||||
|
||||
// Skip excluded devices specified by PCI ID
|
||||
if slices.Contains(m.config.ExcludeDevices, pci_id) {
|
||||
cclog.ComponentDebug(m.name, "Skipping excluded device", pci_id)
|
||||
cclog.ComponentDebugf(m.name, "Skipping excluded device %s", pci_id)
|
||||
continue
|
||||
}
|
||||
|
||||
@@ -186,7 +183,7 @@ func (m *NvidiaCollector) Init(config json.RawMessage) error {
|
||||
if m.config.AddBoardNumberMeta {
|
||||
board, ret := nvml.DeviceGetBoardPartNumber(device)
|
||||
if ret != nvml.SUCCESS {
|
||||
cclog.ComponentError(m.name, "Unable to get boart part number for device at index", i, ":", err.Error())
|
||||
cclog.ComponentErrorf(m.name, "Unable to get boart part number for device at index %d: %s", i, err.Error())
|
||||
} else {
|
||||
g.meta["board_number"] = board
|
||||
}
|
||||
@@ -194,7 +191,7 @@ func (m *NvidiaCollector) Init(config json.RawMessage) error {
|
||||
if m.config.AddSerialMeta {
|
||||
serial, ret := nvml.DeviceGetSerial(device)
|
||||
if ret != nvml.SUCCESS {
|
||||
cclog.ComponentError(m.name, "Unable to get serial number for device at index", i, ":", err.Error())
|
||||
cclog.ComponentErrorf(m.name, "Unable to get serial number for device at index %d: %s", i, err.Error())
|
||||
} else {
|
||||
g.meta["serial"] = serial
|
||||
}
|
||||
@@ -202,7 +199,7 @@ func (m *NvidiaCollector) Init(config json.RawMessage) error {
|
||||
if m.config.AddUuidMeta {
|
||||
uuid, ret := nvml.DeviceGetUUID(device)
|
||||
if ret != nvml.SUCCESS {
|
||||
cclog.ComponentError(m.name, "Unable to get UUID for device at index", i, ":", err.Error())
|
||||
cclog.ComponentErrorf(m.name, "Unable to get UUID for device at index %d: %s", i, err.Error())
|
||||
} else {
|
||||
g.meta["uuid"] = uuid
|
||||
}
|
||||
@@ -1131,97 +1128,97 @@ func (m *NvidiaCollector) Read(interval time.Duration, output chan lp.CCMessage)
|
||||
}
|
||||
err = readMemoryInfo(device, output)
|
||||
if err != nil {
|
||||
cclog.ComponentDebug(m.name, "readMemoryInfo for device", name, "failed")
|
||||
cclog.ComponentDebugf(m.name, "readMemoryInfo for device %s failed", name)
|
||||
}
|
||||
|
||||
err = readUtilization(device, output)
|
||||
if err != nil {
|
||||
cclog.ComponentDebug(m.name, "readUtilization for device", name, "failed")
|
||||
cclog.ComponentDebugf(m.name, "readUtilization for device %s failed", name)
|
||||
}
|
||||
|
||||
err = readTemp(device, output)
|
||||
if err != nil {
|
||||
cclog.ComponentDebug(m.name, "readTemp for device", name, "failed")
|
||||
cclog.ComponentDebugf(m.name, "readTemp for device %s failed", name)
|
||||
}
|
||||
|
||||
err = readFan(device, output)
|
||||
if err != nil {
|
||||
cclog.ComponentDebug(m.name, "readFan for device", name, "failed")
|
||||
cclog.ComponentDebugf(m.name, "readFan for device %s failed", name)
|
||||
}
|
||||
|
||||
err = readEccMode(device, output)
|
||||
if err != nil {
|
||||
cclog.ComponentDebug(m.name, "readEccMode for device", name, "failed")
|
||||
cclog.ComponentDebugf(m.name, "readEccMode for device %s failed", name)
|
||||
}
|
||||
|
||||
err = readPerfState(device, output)
|
||||
if err != nil {
|
||||
cclog.ComponentDebug(m.name, "readPerfState for device", name, "failed")
|
||||
cclog.ComponentDebugf(m.name, "readPerfState for device %s failed", name)
|
||||
}
|
||||
|
||||
err = readPowerUsage(device, output)
|
||||
if err != nil {
|
||||
cclog.ComponentDebug(m.name, "readPowerUsage for device", name, "failed")
|
||||
cclog.ComponentDebugf(m.name, "readPowerUsage for device %s failed", name)
|
||||
}
|
||||
|
||||
err = readEnergyConsumption(device, output)
|
||||
if err != nil {
|
||||
cclog.ComponentDebug(m.name, "readEnergyConsumption for device", name, "failed")
|
||||
cclog.ComponentDebugf(m.name, "readEnergyConsumption for device %s failed", name)
|
||||
}
|
||||
|
||||
err = readClocks(device, output)
|
||||
if err != nil {
|
||||
cclog.ComponentDebug(m.name, "readClocks for device", name, "failed")
|
||||
cclog.ComponentDebugf(m.name, "readClocks for device %s failed", name)
|
||||
}
|
||||
|
||||
err = readMaxClocks(device, output)
|
||||
if err != nil {
|
||||
cclog.ComponentDebug(m.name, "readMaxClocks for device", name, "failed")
|
||||
cclog.ComponentDebugf(m.name, "readMaxClocks for device %s failed", name)
|
||||
}
|
||||
|
||||
err = readEccErrors(device, output)
|
||||
if err != nil {
|
||||
cclog.ComponentDebug(m.name, "readEccErrors for device", name, "failed")
|
||||
cclog.ComponentDebugf(m.name, "readEccErrors for device %s failed", name)
|
||||
}
|
||||
|
||||
err = readPowerLimit(device, output)
|
||||
if err != nil {
|
||||
cclog.ComponentDebug(m.name, "readPowerLimit for device", name, "failed")
|
||||
cclog.ComponentDebugf(m.name, "readPowerLimit for device %s failed", name)
|
||||
}
|
||||
|
||||
err = readEncUtilization(device, output)
|
||||
if err != nil {
|
||||
cclog.ComponentDebug(m.name, "readEncUtilization for device", name, "failed")
|
||||
cclog.ComponentDebugf(m.name, "readEncUtilization for device %s failed", name)
|
||||
}
|
||||
|
||||
err = readDecUtilization(device, output)
|
||||
if err != nil {
|
||||
cclog.ComponentDebug(m.name, "readDecUtilization for device", name, "failed")
|
||||
cclog.ComponentDebugf(m.name, "readDecUtilization for device %s failed", name)
|
||||
}
|
||||
|
||||
err = readRemappedRows(device, output)
|
||||
if err != nil {
|
||||
cclog.ComponentDebug(m.name, "readRemappedRows for device", name, "failed")
|
||||
cclog.ComponentDebugf(m.name, "readRemappedRows for device %s failed", name)
|
||||
}
|
||||
|
||||
err = readBarMemoryInfo(device, output)
|
||||
if err != nil {
|
||||
cclog.ComponentDebug(m.name, "readBarMemoryInfo for device", name, "failed")
|
||||
cclog.ComponentDebugf(m.name, "readBarMemoryInfo for device %s failed", name)
|
||||
}
|
||||
|
||||
err = readProcessCounts(device, output)
|
||||
if err != nil {
|
||||
cclog.ComponentDebug(m.name, "readProcessCounts for device", name, "failed")
|
||||
cclog.ComponentDebugf(m.name, "readProcessCounts for device %s failed", name)
|
||||
}
|
||||
|
||||
err = readViolationStats(device, output)
|
||||
if err != nil {
|
||||
cclog.ComponentDebug(m.name, "readViolationStats for device", name, "failed")
|
||||
cclog.ComponentDebugf(m.name, "readViolationStats for device %s failed", name)
|
||||
}
|
||||
|
||||
err = readNVLinkStats(device, output)
|
||||
if err != nil {
|
||||
cclog.ComponentDebug(m.name, "readNVLinkStats for device", name, "failed")
|
||||
cclog.ComponentDebugf(m.name, "readNVLinkStats for device %s failed", name)
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1247,7 +1244,7 @@ func (m *NvidiaCollector) Read(interval time.Duration, output chan lp.CCMessage)
|
||||
if maxMig == 0 {
|
||||
continue
|
||||
}
|
||||
cclog.ComponentDebug(m.name, "Reading MIG devices for GPU", i)
|
||||
cclog.ComponentDebugf(m.name, "Reading MIG devices for GPU %d", i)
|
||||
|
||||
for j := range maxMig {
|
||||
mdev, ret := nvml.DeviceGetMigDeviceHandleByIndex(m.gpus[i].device, j)
|
||||
@@ -1271,7 +1268,7 @@ func (m *NvidiaCollector) Read(interval time.Duration, output chan lp.CCMessage)
|
||||
if m.config.UseUuidForMigDevices {
|
||||
uuid, ret := nvml.DeviceGetUUID(mdev)
|
||||
if ret != nvml.SUCCESS {
|
||||
cclog.ComponentError(m.name, "Unable to get UUID for mig device at index", j, ":", err.Error())
|
||||
cclog.ComponentErrorf(m.name, "Unable to get UUID for mig device at index %d: %s", j, err.Error())
|
||||
} else {
|
||||
migDevice.tags["stype-id"] = uuid
|
||||
}
|
||||
|
||||
@@ -8,6 +8,7 @@
|
||||
package collectors
|
||||
|
||||
import (
|
||||
"bytes"
|
||||
"encoding/json"
|
||||
"fmt"
|
||||
"os"
|
||||
@@ -67,10 +68,10 @@ func (m *RAPLCollector) Init(config json.RawMessage) error {
|
||||
|
||||
// Read in the JSON configuration
|
||||
if len(config) > 0 {
|
||||
err := json.Unmarshal(config, &m.config)
|
||||
if err != nil {
|
||||
cclog.ComponentError(m.name, "Error reading config:", err.Error())
|
||||
return err
|
||||
d := json.NewDecoder(bytes.NewReader(config))
|
||||
d.DisallowUnknownFields()
|
||||
if err := d.Decode(&m.config); err != nil {
|
||||
return fmt.Errorf("%s Init(): Error decoding JSON config: %w", m.name, err)
|
||||
}
|
||||
}
|
||||
|
||||
@@ -207,11 +208,10 @@ func (m *RAPLCollector) Init(config json.RawMessage) error {
|
||||
}
|
||||
|
||||
// Initialized
|
||||
cclog.ComponentDebug(
|
||||
cclog.ComponentDebugf(
|
||||
m.name,
|
||||
"initialized",
|
||||
len(m.RAPLZoneInfo),
|
||||
"zones with running average power limit (RAPL) monitoring attributes")
|
||||
"initialized %d zones with running average power limit (RAPL) monitoring attributes",
|
||||
len(m.RAPLZoneInfo))
|
||||
m.init = true
|
||||
|
||||
return err
|
||||
@@ -241,12 +241,7 @@ func (m *RAPLCollector) Read(interval time.Duration, output chan lp.CCMessage) {
|
||||
timeDiff := energyTimestamp.Sub(p.energyTimestamp)
|
||||
averagePower := float64(energyDiff) / float64(timeDiff.Microseconds())
|
||||
|
||||
y, err := lp.NewMessage(
|
||||
"rapl_average_power",
|
||||
p.tags,
|
||||
m.meta,
|
||||
map[string]any{"value": averagePower},
|
||||
energyTimestamp)
|
||||
y, err := lp.NewMetric("rapl_average_power", p.tags, m.meta, averagePower, energyTimestamp)
|
||||
if err == nil {
|
||||
output <- y
|
||||
}
|
||||
|
||||
@@ -8,8 +8,8 @@
|
||||
package collectors
|
||||
|
||||
import (
|
||||
"bytes"
|
||||
"encoding/json"
|
||||
"errors"
|
||||
"fmt"
|
||||
"slices"
|
||||
"strconv"
|
||||
@@ -51,7 +51,6 @@ type RocmSmiCollector struct {
|
||||
// Called once by the collector manager
|
||||
// All tags, meta data tags and metrics that do not change over the runtime should be set here
|
||||
func (m *RocmSmiCollector) Init(config json.RawMessage) error {
|
||||
var err error = nil
|
||||
// Always set the name early in Init() to use it in cclog.Component* functions
|
||||
m.name = "RocmSmiCollector"
|
||||
// This is for later use, also call it early
|
||||
@@ -60,25 +59,21 @@ func (m *RocmSmiCollector) Init(config json.RawMessage) error {
|
||||
}
|
||||
// Read in the JSON configuration
|
||||
if len(config) > 0 {
|
||||
err = json.Unmarshal(config, &m.config)
|
||||
if err != nil {
|
||||
cclog.ComponentError(m.name, "Error reading config:", err.Error())
|
||||
return err
|
||||
d := json.NewDecoder(bytes.NewReader(config))
|
||||
d.DisallowUnknownFields()
|
||||
if err := d.Decode(&m.config); err != nil {
|
||||
return fmt.Errorf("%s Init(): Error decoding JSON config: %w", m.name, err)
|
||||
}
|
||||
}
|
||||
|
||||
ret := rocm_smi.Init()
|
||||
if ret != rocm_smi.STATUS_SUCCESS {
|
||||
err = errors.New("failed to initialize ROCm SMI library")
|
||||
cclog.ComponentError(m.name, err.Error())
|
||||
return err
|
||||
return fmt.Errorf("%s Init(): failed to initialize ROCm SMI library", m.name)
|
||||
}
|
||||
|
||||
numDevs, ret := rocm_smi.NumMonitorDevices()
|
||||
if ret != rocm_smi.STATUS_SUCCESS {
|
||||
err = errors.New("failed to get number of GPUs from ROCm SMI library")
|
||||
cclog.ComponentError(m.name, err.Error())
|
||||
return err
|
||||
return fmt.Errorf("%s Init(): failed to get number of GPUs from ROCm SMI library", m.name)
|
||||
}
|
||||
|
||||
m.devices = make([]RocmSmiCollectorDevice, 0)
|
||||
@@ -90,16 +85,12 @@ func (m *RocmSmiCollector) Init(config json.RawMessage) error {
|
||||
}
|
||||
device, ret := rocm_smi.DeviceGetHandleByIndex(i)
|
||||
if ret != rocm_smi.STATUS_SUCCESS {
|
||||
err = fmt.Errorf("failed to get handle for GPU %d", i)
|
||||
cclog.ComponentError(m.name, err.Error())
|
||||
return err
|
||||
return fmt.Errorf("%s Init(): failed to get get handle for GPU %d", m.name, i)
|
||||
}
|
||||
|
||||
pciInfo, ret := rocm_smi.DeviceGetPciInfo(device)
|
||||
if ret != rocm_smi.STATUS_SUCCESS {
|
||||
err = fmt.Errorf("failed to get PCI information for GPU %d", i)
|
||||
cclog.ComponentError(m.name, err.Error())
|
||||
return err
|
||||
return fmt.Errorf("%s Init(): failed to get PCI information for GPU %d", m.name, i)
|
||||
}
|
||||
|
||||
pciId := fmt.Sprintf(
|
||||
@@ -133,7 +124,7 @@ func (m *RocmSmiCollector) Init(config json.RawMessage) error {
|
||||
if m.config.AddSerialMeta {
|
||||
serial, ret := rocm_smi.DeviceGetSerialNumber(device)
|
||||
if ret != rocm_smi.STATUS_SUCCESS {
|
||||
cclog.ComponentError(m.name, "Unable to get serial number for device at index", i, ":", rocm_smi.StatusStringNoError(ret))
|
||||
cclog.ComponentErrorf(m.name, "Unable to get serial number for device at index %d: %s", i, rocm_smi.StatusStringNoError(ret))
|
||||
} else {
|
||||
dev.meta["serial"] = serial
|
||||
}
|
||||
@@ -149,7 +140,7 @@ func (m *RocmSmiCollector) Init(config json.RawMessage) error {
|
||||
|
||||
// Set this flag only if everything is initialized properly, all required files exist, ...
|
||||
m.init = true
|
||||
return err
|
||||
return nil
|
||||
}
|
||||
|
||||
// Read collects all metrics belonging to the sample collector
|
||||
@@ -161,134 +152,116 @@ func (m *RocmSmiCollector) Read(interval time.Duration, output chan lp.CCMessage
|
||||
for _, dev := range m.devices {
|
||||
metrics, ret := rocm_smi.DeviceGetMetrics(dev.device)
|
||||
if ret != rocm_smi.STATUS_SUCCESS {
|
||||
cclog.ComponentError(m.name, "Unable to get metrics for device at index", dev.index, ":", rocm_smi.StatusStringNoError(ret))
|
||||
cclog.ComponentErrorf(m.name, "Unable to get metrics for device at index %d: %s", dev.index, rocm_smi.StatusStringNoError(ret))
|
||||
continue
|
||||
}
|
||||
|
||||
if !dev.excludeMetrics["rocm_gfx_util"] {
|
||||
value := metrics.Average_gfx_activity
|
||||
y, err := lp.NewMessage("rocm_gfx_util", dev.tags, dev.meta, map[string]any{"value": value}, timestamp)
|
||||
if err == nil {
|
||||
if y, err := lp.NewMetric("rocm_gfx_util", dev.tags, dev.meta, value, timestamp); err == nil {
|
||||
output <- y
|
||||
}
|
||||
}
|
||||
if !dev.excludeMetrics["rocm_umc_util"] {
|
||||
value := metrics.Average_umc_activity
|
||||
y, err := lp.NewMessage("rocm_umc_util", dev.tags, dev.meta, map[string]any{"value": value}, timestamp)
|
||||
if err == nil {
|
||||
if y, err := lp.NewMetric("rocm_umc_util", dev.tags, dev.meta, value, timestamp); err == nil {
|
||||
output <- y
|
||||
}
|
||||
}
|
||||
if !dev.excludeMetrics["rocm_mm_util"] {
|
||||
value := metrics.Average_mm_activity
|
||||
y, err := lp.NewMessage("rocm_mm_util", dev.tags, dev.meta, map[string]any{"value": value}, timestamp)
|
||||
if err == nil {
|
||||
if y, err := lp.NewMetric("rocm_mm_util", dev.tags, dev.meta, value, timestamp); err == nil {
|
||||
output <- y
|
||||
}
|
||||
}
|
||||
if !dev.excludeMetrics["rocm_avg_power"] {
|
||||
value := metrics.Average_socket_power
|
||||
y, err := lp.NewMessage("rocm_avg_power", dev.tags, dev.meta, map[string]any{"value": value}, timestamp)
|
||||
if err == nil {
|
||||
if y, err := lp.NewMetric("rocm_avg_power", dev.tags, dev.meta, value, timestamp); err == nil {
|
||||
output <- y
|
||||
}
|
||||
}
|
||||
if !dev.excludeMetrics["rocm_temp_mem"] {
|
||||
value := metrics.Temperature_mem
|
||||
y, err := lp.NewMessage("rocm_temp_mem", dev.tags, dev.meta, map[string]any{"value": value}, timestamp)
|
||||
if err == nil {
|
||||
if y, err := lp.NewMetric("rocm_temp_mem", dev.tags, dev.meta, value, timestamp); err == nil {
|
||||
output <- y
|
||||
}
|
||||
}
|
||||
if !dev.excludeMetrics["rocm_temp_hotspot"] {
|
||||
value := metrics.Temperature_hotspot
|
||||
y, err := lp.NewMessage("rocm_temp_hotspot", dev.tags, dev.meta, map[string]any{"value": value}, timestamp)
|
||||
if err == nil {
|
||||
if y, err := lp.NewMetric("rocm_temp_hotspot", dev.tags, dev.meta, value, timestamp); err == nil {
|
||||
output <- y
|
||||
}
|
||||
}
|
||||
if !dev.excludeMetrics["rocm_temp_edge"] {
|
||||
value := metrics.Temperature_edge
|
||||
y, err := lp.NewMessage("rocm_temp_edge", dev.tags, dev.meta, map[string]any{"value": value}, timestamp)
|
||||
if err == nil {
|
||||
if y, err := lp.NewMetric("rocm_temp_edge", dev.tags, dev.meta, value, timestamp); err == nil {
|
||||
output <- y
|
||||
}
|
||||
}
|
||||
if !dev.excludeMetrics["rocm_temp_vrgfx"] {
|
||||
value := metrics.Temperature_vrgfx
|
||||
y, err := lp.NewMessage("rocm_temp_vrgfx", dev.tags, dev.meta, map[string]any{"value": value}, timestamp)
|
||||
if err == nil {
|
||||
if y, err := lp.NewMetric("rocm_temp_vrgfx", dev.tags, dev.meta, value, timestamp); err == nil {
|
||||
output <- y
|
||||
}
|
||||
}
|
||||
if !dev.excludeMetrics["rocm_temp_vrsoc"] {
|
||||
value := metrics.Temperature_vrsoc
|
||||
y, err := lp.NewMessage("rocm_temp_vrsoc", dev.tags, dev.meta, map[string]any{"value": value}, timestamp)
|
||||
if err == nil {
|
||||
if y, err := lp.NewMetric("rocm_temp_vrsoc", dev.tags, dev.meta, value, timestamp); err == nil {
|
||||
output <- y
|
||||
}
|
||||
}
|
||||
if !dev.excludeMetrics["rocm_temp_vrmem"] {
|
||||
value := metrics.Temperature_vrmem
|
||||
y, err := lp.NewMessage("rocm_temp_vrmem", dev.tags, dev.meta, map[string]any{"value": value}, timestamp)
|
||||
if err == nil {
|
||||
if y, err := lp.NewMetric("rocm_temp_vrmem", dev.tags, dev.meta, value, timestamp); err == nil {
|
||||
output <- y
|
||||
}
|
||||
}
|
||||
if !dev.excludeMetrics["rocm_gfx_clock"] {
|
||||
value := metrics.Average_gfxclk_frequency
|
||||
y, err := lp.NewMessage("rocm_gfx_clock", dev.tags, dev.meta, map[string]any{"value": value}, timestamp)
|
||||
if err == nil {
|
||||
if y, err := lp.NewMetric("rocm_gfx_clock", dev.tags, dev.meta, value, timestamp); err == nil {
|
||||
output <- y
|
||||
}
|
||||
}
|
||||
if !dev.excludeMetrics["rocm_soc_clock"] {
|
||||
value := metrics.Average_socclk_frequency
|
||||
y, err := lp.NewMessage("rocm_soc_clock", dev.tags, dev.meta, map[string]any{"value": value}, timestamp)
|
||||
if err == nil {
|
||||
if y, err := lp.NewMetric("rocm_soc_clock", dev.tags, dev.meta, value, timestamp); err == nil {
|
||||
output <- y
|
||||
}
|
||||
}
|
||||
if !dev.excludeMetrics["rocm_u_clock"] {
|
||||
value := metrics.Average_uclk_frequency
|
||||
y, err := lp.NewMessage("rocm_u_clock", dev.tags, dev.meta, map[string]any{"value": value}, timestamp)
|
||||
if err == nil {
|
||||
if y, err := lp.NewMetric("rocm_u_clock", dev.tags, dev.meta, value, timestamp); err == nil {
|
||||
output <- y
|
||||
}
|
||||
}
|
||||
if !dev.excludeMetrics["rocm_v0_clock"] {
|
||||
value := metrics.Average_vclk0_frequency
|
||||
y, err := lp.NewMessage("rocm_v0_clock", dev.tags, dev.meta, map[string]any{"value": value}, timestamp)
|
||||
if err == nil {
|
||||
if y, err := lp.NewMetric("rocm_v0_clock", dev.tags, dev.meta, value, timestamp); err == nil {
|
||||
output <- y
|
||||
}
|
||||
}
|
||||
if !dev.excludeMetrics["rocm_v1_clock"] {
|
||||
value := metrics.Average_vclk1_frequency
|
||||
y, err := lp.NewMessage("rocm_v1_clock", dev.tags, dev.meta, map[string]any{"value": value}, timestamp)
|
||||
if err == nil {
|
||||
if y, err := lp.NewMetric("rocm_v1_clock", dev.tags, dev.meta, value, timestamp); err == nil {
|
||||
output <- y
|
||||
}
|
||||
}
|
||||
if !dev.excludeMetrics["rocm_d0_clock"] {
|
||||
value := metrics.Average_dclk0_frequency
|
||||
y, err := lp.NewMessage("rocm_d0_clock", dev.tags, dev.meta, map[string]any{"value": value}, timestamp)
|
||||
if err == nil {
|
||||
if y, err := lp.NewMetric("rocm_d0_clock", dev.tags, dev.meta, value, timestamp); err == nil {
|
||||
output <- y
|
||||
}
|
||||
}
|
||||
if !dev.excludeMetrics["rocm_d1_clock"] {
|
||||
value := metrics.Average_dclk1_frequency
|
||||
y, err := lp.NewMessage("rocm_d1_clock", dev.tags, dev.meta, map[string]any{"value": value}, timestamp)
|
||||
if err == nil {
|
||||
if y, err := lp.NewMetric("rocm_d1_clock", dev.tags, dev.meta, value, timestamp); err == nil {
|
||||
output <- y
|
||||
}
|
||||
}
|
||||
if !dev.excludeMetrics["rocm_temp_hbm"] {
|
||||
for i := range rocm_smi.NUM_HBM_INSTANCES {
|
||||
value := metrics.Temperature_hbm[i]
|
||||
y, err := lp.NewMessage("rocm_temp_hbm", dev.tags, dev.meta, map[string]any{"value": value}, timestamp)
|
||||
if err == nil {
|
||||
if y, err := lp.NewMetric("rocm_temp_hbm", dev.tags, dev.meta, value, timestamp); err == nil {
|
||||
y.AddTag("stype", "device")
|
||||
y.AddTag("stype-id", strconv.Itoa(i))
|
||||
output <- y
|
||||
|
||||
@@ -8,11 +8,11 @@
|
||||
package collectors
|
||||
|
||||
import (
|
||||
"bytes"
|
||||
"encoding/json"
|
||||
"fmt"
|
||||
"time"
|
||||
|
||||
cclog "github.com/ClusterCockpit/cc-lib/v2/ccLogger"
|
||||
lp "github.com/ClusterCockpit/cc-lib/v2/ccMessage"
|
||||
)
|
||||
|
||||
@@ -52,7 +52,10 @@ func (m *SampleCollector) Init(config json.RawMessage) error {
|
||||
m.parallel = true
|
||||
// Define meta information sent with each metric
|
||||
// (Can also be dynamic or this is the basic set with extension through AddMeta())
|
||||
m.meta = map[string]string{"source": m.name, "group": "SAMPLE"}
|
||||
m.meta = map[string]string{
|
||||
"source": m.name,
|
||||
"group": "SAMPLE",
|
||||
}
|
||||
// Define tags sent with each metric
|
||||
// The 'type' tag is always needed, it defines the granularity of the metric
|
||||
// node -> whole system
|
||||
@@ -63,13 +66,15 @@ func (m *SampleCollector) Init(config json.RawMessage) error {
|
||||
// core -> single CPU core that may consist of multiple hardware threads (SMT) (requires core ID as 'type-id' tag)
|
||||
// hwthtread -> single CPU hardware thread (requires hardware thread ID as 'type-id' tag)
|
||||
// accelerator -> A accelerator device like GPU or FPGA (requires an accelerator ID as 'type-id' tag)
|
||||
m.tags = map[string]string{"type": "node"}
|
||||
m.tags = map[string]string{
|
||||
"type": "node",
|
||||
}
|
||||
// Read in the JSON configuration
|
||||
if len(config) > 0 {
|
||||
err = json.Unmarshal(config, &m.config)
|
||||
if err != nil {
|
||||
cclog.ComponentError(m.name, "Error reading config:", err.Error())
|
||||
return err
|
||||
d := json.NewDecoder(bytes.NewReader(config))
|
||||
d.DisallowUnknownFields()
|
||||
if err := d.Decode(&m.config); err != nil {
|
||||
return fmt.Errorf("%s Init(): Error decoding JSON config: %w", m.name, err)
|
||||
}
|
||||
}
|
||||
|
||||
@@ -96,7 +101,7 @@ func (m *SampleCollector) Read(interval time.Duration, output chan lp.CCMessage)
|
||||
// stop := readState()
|
||||
// value = (stop - start) / interval.Seconds()
|
||||
|
||||
y, err := lp.NewMessage("sample_metric", m.tags, m.meta, map[string]any{"value": value}, timestamp)
|
||||
y, err := lp.NewMetric("sample_metric", m.tags, m.meta, value, timestamp)
|
||||
if err == nil {
|
||||
// Send it to output channel
|
||||
output <- y
|
||||
|
||||
@@ -8,6 +8,7 @@
|
||||
package collectors
|
||||
|
||||
import (
|
||||
"bytes"
|
||||
"encoding/json"
|
||||
"fmt"
|
||||
"sync"
|
||||
@@ -47,26 +48,30 @@ func (m *SampleTimerCollector) Init(name string, config json.RawMessage) error {
|
||||
}
|
||||
// Define meta information sent with each metric
|
||||
// (Can also be dynamic or this is the basic set with extension through AddMeta())
|
||||
m.meta = map[string]string{"source": m.name, "group": "SAMPLE"}
|
||||
m.meta = map[string]string{
|
||||
"source": m.name,
|
||||
"group": "SAMPLE",
|
||||
}
|
||||
// Define tags sent with each metric
|
||||
// The 'type' tag is always needed, it defines the granularity of the metric
|
||||
// node -> whole system
|
||||
// socket -> CPU socket (requires socket ID as 'type-id' tag)
|
||||
// cpu -> single CPU hardware thread (requires cpu ID as 'type-id' tag)
|
||||
m.tags = map[string]string{"type": "node"}
|
||||
m.tags = map[string]string{
|
||||
"type": "node",
|
||||
}
|
||||
// Read in the JSON configuration
|
||||
if len(config) > 0 {
|
||||
err = json.Unmarshal(config, &m.config)
|
||||
if err != nil {
|
||||
cclog.ComponentError(m.name, "Error reading config:", err.Error())
|
||||
return err
|
||||
d := json.NewDecoder(bytes.NewReader(config))
|
||||
d.DisallowUnknownFields()
|
||||
if err := d.Decode(&m.config); err != nil {
|
||||
return fmt.Errorf("%s Init(): error decoding JSON config: %w", m.name, err)
|
||||
}
|
||||
}
|
||||
// Parse the read interval duration
|
||||
m.interval, err = time.ParseDuration(m.config.Interval)
|
||||
if err != nil {
|
||||
cclog.ComponentError(m.name, "Error parsing interval:", err.Error())
|
||||
return err
|
||||
return fmt.Errorf("%s Init(): error parsing interval: %w", m.name, err)
|
||||
}
|
||||
|
||||
// Storage for output channel
|
||||
@@ -77,13 +82,11 @@ func (m *SampleTimerCollector) Init(name string, config json.RawMessage) error {
|
||||
m.ticker = time.NewTicker(m.interval)
|
||||
|
||||
// Start the timer loop with return functionality by sending 'true' to the done channel
|
||||
m.wg.Add(1)
|
||||
go func() {
|
||||
m.wg.Go(func() {
|
||||
select {
|
||||
case <-m.done:
|
||||
// Exit the timer loop
|
||||
cclog.ComponentDebug(m.name, "Closing...")
|
||||
m.wg.Done()
|
||||
return
|
||||
case timestamp := <-m.ticker.C:
|
||||
// This is executed every timer tick but we have to wait until the first
|
||||
@@ -92,7 +95,7 @@ func (m *SampleTimerCollector) Init(name string, config json.RawMessage) error {
|
||||
m.ReadMetrics(timestamp)
|
||||
}
|
||||
}
|
||||
}()
|
||||
})
|
||||
|
||||
// Set this flag only if everything is initialized properly, all required files exist, ...
|
||||
m.init = true
|
||||
@@ -111,7 +114,7 @@ func (m *SampleTimerCollector) ReadMetrics(timestamp time.Time) {
|
||||
// stop := readState()
|
||||
// value = (stop - start) / interval.Seconds()
|
||||
|
||||
y, err := lp.NewMessage("sample_metric", m.tags, m.meta, map[string]any{"value": value}, timestamp)
|
||||
y, err := lp.NewMetric("sample_metric", m.tags, m.meta, value, timestamp)
|
||||
if err == nil && m.output != nil {
|
||||
// Send it to output channel if we have a valid channel
|
||||
m.output <- y
|
||||
|
||||
@@ -9,6 +9,7 @@ package collectors
|
||||
|
||||
import (
|
||||
"bufio"
|
||||
"bytes"
|
||||
"encoding/json"
|
||||
"fmt"
|
||||
"os"
|
||||
@@ -66,8 +67,10 @@ func (m *SchedstatCollector) Init(config json.RawMessage) error {
|
||||
|
||||
// Read in the JSON configuration
|
||||
if len(config) > 0 {
|
||||
if err := json.Unmarshal(config, &m.config); err != nil {
|
||||
return fmt.Errorf("%s Init(): Error reading config: %w", m.name, err)
|
||||
d := json.NewDecoder(bytes.NewReader(config))
|
||||
d.DisallowUnknownFields()
|
||||
if err := d.Decode(&m.config); err != nil {
|
||||
return fmt.Errorf("%s Init(): failed to decode JSON config: %w", m.name, err)
|
||||
}
|
||||
}
|
||||
|
||||
@@ -124,7 +127,7 @@ func (m *SchedstatCollector) ParseProcLine(linefields []string, tags map[string]
|
||||
m.olddata[linefields[0]]["waiting"] = waiting
|
||||
value := l_running + l_waiting
|
||||
|
||||
y, err := lp.NewMessage("cpu_load_core", tags, m.meta, map[string]any{"value": value}, now)
|
||||
y, err := lp.NewMetric("cpu_load_core", tags, m.meta, value, now)
|
||||
if err == nil {
|
||||
// Send it to output channel
|
||||
output <- y
|
||||
@@ -144,15 +147,15 @@ func (m *SchedstatCollector) Read(interval time.Duration, output chan lp.CCMessa
|
||||
|
||||
file, err := os.Open(SCHEDSTATFILE)
|
||||
if err != nil {
|
||||
cclog.ComponentError(
|
||||
cclog.ComponentErrorf(
|
||||
m.name,
|
||||
fmt.Sprintf("Read(): Failed to open file '%s': %v", SCHEDSTATFILE, err))
|
||||
"Read(): Failed to open file '%s': %v", SCHEDSTATFILE, err)
|
||||
}
|
||||
defer func() {
|
||||
if err := file.Close(); err != nil {
|
||||
cclog.ComponentError(
|
||||
cclog.ComponentErrorf(
|
||||
m.name,
|
||||
fmt.Sprintf("Read(): Failed to close file '%s': %v", SCHEDSTATFILE, err))
|
||||
"Read(): Failed to close file '%s': %v", SCHEDSTATFILE, err)
|
||||
}
|
||||
}()
|
||||
|
||||
|
||||
@@ -8,13 +8,13 @@
|
||||
package collectors
|
||||
|
||||
import (
|
||||
"bytes"
|
||||
"encoding/json"
|
||||
"fmt"
|
||||
"runtime"
|
||||
"syscall"
|
||||
"time"
|
||||
|
||||
cclog "github.com/ClusterCockpit/cc-lib/v2/ccLogger"
|
||||
lp "github.com/ClusterCockpit/cc-lib/v2/ccMessage"
|
||||
)
|
||||
|
||||
@@ -40,13 +40,18 @@ func (m *SelfCollector) Init(config json.RawMessage) error {
|
||||
return fmt.Errorf("%s Init(): setup() call failed: %w", m.name, err)
|
||||
}
|
||||
m.parallel = true
|
||||
m.meta = map[string]string{"source": m.name, "group": "Self"}
|
||||
m.tags = map[string]string{"type": "node"}
|
||||
m.meta = map[string]string{
|
||||
"source": m.name,
|
||||
"group": "Self",
|
||||
}
|
||||
m.tags = map[string]string{
|
||||
"type": "node",
|
||||
}
|
||||
if len(config) > 0 {
|
||||
err = json.Unmarshal(config, &m.config)
|
||||
if err != nil {
|
||||
cclog.ComponentError(m.name, "Error reading config:", err.Error())
|
||||
return err
|
||||
d := json.NewDecoder(bytes.NewReader(config))
|
||||
d.DisallowUnknownFields()
|
||||
if err := d.Decode(&m.config); err != nil {
|
||||
return fmt.Errorf("%s Init(): Error decoding JSON config: %w", m.name, err)
|
||||
}
|
||||
}
|
||||
m.init = true
|
||||
@@ -60,49 +65,49 @@ func (m *SelfCollector) Read(interval time.Duration, output chan lp.CCMessage) {
|
||||
var memstats runtime.MemStats
|
||||
runtime.ReadMemStats(&memstats)
|
||||
|
||||
y, err := lp.NewMessage("total_alloc", m.tags, m.meta, map[string]any{"value": memstats.TotalAlloc}, timestamp)
|
||||
y, err := lp.NewMetric("total_alloc", m.tags, m.meta, memstats.TotalAlloc, timestamp)
|
||||
if err == nil {
|
||||
y.AddMeta("unit", "Bytes")
|
||||
output <- y
|
||||
}
|
||||
y, err = lp.NewMessage("heap_alloc", m.tags, m.meta, map[string]any{"value": memstats.HeapAlloc}, timestamp)
|
||||
y, err = lp.NewMetric("heap_alloc", m.tags, m.meta, memstats.HeapAlloc, timestamp)
|
||||
if err == nil {
|
||||
y.AddMeta("unit", "Bytes")
|
||||
output <- y
|
||||
}
|
||||
y, err = lp.NewMessage("heap_sys", m.tags, m.meta, map[string]any{"value": memstats.HeapSys}, timestamp)
|
||||
y, err = lp.NewMetric("heap_sys", m.tags, m.meta, memstats.HeapSys, timestamp)
|
||||
if err == nil {
|
||||
y.AddMeta("unit", "Bytes")
|
||||
output <- y
|
||||
}
|
||||
y, err = lp.NewMessage("heap_idle", m.tags, m.meta, map[string]any{"value": memstats.HeapIdle}, timestamp)
|
||||
y, err = lp.NewMetric("heap_idle", m.tags, m.meta, memstats.HeapIdle, timestamp)
|
||||
if err == nil {
|
||||
y.AddMeta("unit", "Bytes")
|
||||
output <- y
|
||||
}
|
||||
y, err = lp.NewMessage("heap_inuse", m.tags, m.meta, map[string]any{"value": memstats.HeapInuse}, timestamp)
|
||||
y, err = lp.NewMetric("heap_inuse", m.tags, m.meta, memstats.HeapInuse, timestamp)
|
||||
if err == nil {
|
||||
y.AddMeta("unit", "Bytes")
|
||||
output <- y
|
||||
}
|
||||
y, err = lp.NewMessage("heap_released", m.tags, m.meta, map[string]any{"value": memstats.HeapReleased}, timestamp)
|
||||
y, err = lp.NewMetric("heap_released", m.tags, m.meta, memstats.HeapReleased, timestamp)
|
||||
if err == nil {
|
||||
y.AddMeta("unit", "Bytes")
|
||||
output <- y
|
||||
}
|
||||
y, err = lp.NewMessage("heap_objects", m.tags, m.meta, map[string]any{"value": memstats.HeapObjects}, timestamp)
|
||||
y, err = lp.NewMetric("heap_objects", m.tags, m.meta, memstats.HeapObjects, timestamp)
|
||||
if err == nil {
|
||||
output <- y
|
||||
}
|
||||
}
|
||||
if m.config.GoRoutines {
|
||||
y, err := lp.NewMessage("num_goroutines", m.tags, m.meta, map[string]any{"value": runtime.NumGoroutine()}, timestamp)
|
||||
y, err := lp.NewMetric("num_goroutines", m.tags, m.meta, runtime.NumGoroutine(), timestamp)
|
||||
if err == nil {
|
||||
output <- y
|
||||
}
|
||||
}
|
||||
if m.config.CgoCalls {
|
||||
y, err := lp.NewMessage("num_cgo_calls", m.tags, m.meta, map[string]any{"value": runtime.NumCgoCall()}, timestamp)
|
||||
y, err := lp.NewMetric("num_cgo_calls", m.tags, m.meta, runtime.NumCgoCall(), timestamp)
|
||||
if err == nil {
|
||||
output <- y
|
||||
}
|
||||
@@ -113,35 +118,35 @@ func (m *SelfCollector) Read(interval time.Duration, output chan lp.CCMessage) {
|
||||
if err == nil {
|
||||
sec, nsec := rusage.Utime.Unix()
|
||||
t := float64(sec) + (float64(nsec) * 1e-9)
|
||||
y, err := lp.NewMessage("rusage_user_time", m.tags, m.meta, map[string]any{"value": t}, timestamp)
|
||||
y, err := lp.NewMetric("rusage_user_time", m.tags, m.meta, t, timestamp)
|
||||
if err == nil {
|
||||
y.AddMeta("unit", "seconds")
|
||||
output <- y
|
||||
}
|
||||
sec, nsec = rusage.Stime.Unix()
|
||||
t = float64(sec) + (float64(nsec) * 1e-9)
|
||||
y, err = lp.NewMessage("rusage_system_time", m.tags, m.meta, map[string]any{"value": t}, timestamp)
|
||||
y, err = lp.NewMetric("rusage_system_time", m.tags, m.meta, t, timestamp)
|
||||
if err == nil {
|
||||
y.AddMeta("unit", "seconds")
|
||||
output <- y
|
||||
}
|
||||
y, err = lp.NewMessage("rusage_vol_ctx_switch", m.tags, m.meta, map[string]any{"value": rusage.Nvcsw}, timestamp)
|
||||
y, err = lp.NewMetric("rusage_vol_ctx_switch", m.tags, m.meta, rusage.Nvcsw, timestamp)
|
||||
if err == nil {
|
||||
output <- y
|
||||
}
|
||||
y, err = lp.NewMessage("rusage_invol_ctx_switch", m.tags, m.meta, map[string]any{"value": rusage.Nivcsw}, timestamp)
|
||||
y, err = lp.NewMetric("rusage_invol_ctx_switch", m.tags, m.meta, rusage.Nivcsw, timestamp)
|
||||
if err == nil {
|
||||
output <- y
|
||||
}
|
||||
y, err = lp.NewMessage("rusage_signals", m.tags, m.meta, map[string]any{"value": rusage.Nsignals}, timestamp)
|
||||
y, err = lp.NewMetric("rusage_signals", m.tags, m.meta, rusage.Nsignals, timestamp)
|
||||
if err == nil {
|
||||
output <- y
|
||||
}
|
||||
y, err = lp.NewMessage("rusage_major_pgfaults", m.tags, m.meta, map[string]any{"value": rusage.Majflt}, timestamp)
|
||||
y, err = lp.NewMetric("rusage_major_pgfaults", m.tags, m.meta, rusage.Majflt, timestamp)
|
||||
if err == nil {
|
||||
output <- y
|
||||
}
|
||||
y, err = lp.NewMessage("rusage_minor_pgfaults", m.tags, m.meta, map[string]any{"value": rusage.Minflt}, timestamp)
|
||||
y, err = lp.NewMetric("rusage_minor_pgfaults", m.tags, m.meta, rusage.Minflt, timestamp)
|
||||
if err == nil {
|
||||
output <- y
|
||||
}
|
||||
|
||||
@@ -119,8 +119,9 @@ func (m *SlurmCgroupCollector) Init(config json.RawMessage) error {
|
||||
m.cgroupBase = defaultCgroupBase
|
||||
|
||||
if len(config) > 0 {
|
||||
err = json.Unmarshal(config, &m.config)
|
||||
if err != nil {
|
||||
d := json.NewDecoder(strings.NewReader(string(config)))
|
||||
d.DisallowUnknownFields()
|
||||
if err = d.Decode(&m.config); err != nil {
|
||||
return fmt.Errorf("%s Init(): Error reading JSON config: %w", m.name, err)
|
||||
}
|
||||
m.excludeMetrics = make(map[string]struct{})
|
||||
@@ -239,7 +240,7 @@ func (m *SlurmCgroupCollector) Read(interval time.Duration, output chan lp.CCMes
|
||||
globPattern := filepath.Join(m.cgroupBase, "job_*")
|
||||
jobDirs, err := filepath.Glob(globPattern)
|
||||
if err != nil {
|
||||
cclog.ComponentError(m.name, "Error globbing job directories:", err.Error())
|
||||
cclog.ComponentErrorf(m.name, "Error globbing job directories: %s", err.Error())
|
||||
return
|
||||
}
|
||||
|
||||
@@ -248,7 +249,7 @@ func (m *SlurmCgroupCollector) Read(interval time.Duration, output chan lp.CCMes
|
||||
|
||||
jobdata, err := m.ReadJobData(jKey)
|
||||
if err != nil {
|
||||
cclog.ComponentError(m.name, "Error reading job data for", jKey, ":", err.Error())
|
||||
cclog.ComponentError(m.name, "Error reading job data for %s: %s", jKey, err.Error())
|
||||
continue
|
||||
}
|
||||
|
||||
|
||||
360
collectors/smartmonMetric.go
Normal file
360
collectors/smartmonMetric.go
Normal file
@@ -0,0 +1,360 @@
|
||||
package collectors
|
||||
|
||||
import (
|
||||
"bytes"
|
||||
"encoding/json"
|
||||
"fmt"
|
||||
"os/exec"
|
||||
"slices"
|
||||
"time"
|
||||
|
||||
cclog "github.com/ClusterCockpit/cc-lib/v2/ccLogger"
|
||||
lp "github.com/ClusterCockpit/cc-lib/v2/ccMessage"
|
||||
)
|
||||
|
||||
type SmartMonCollectorConfig struct {
|
||||
UseSudo bool `json:"use_sudo,omitempty"`
|
||||
ExcludeDevices []string `json:"exclude_devices,omitempty"`
|
||||
ExcludeMetrics []string `json:"excludeMetrics,omitempty"`
|
||||
Devices []struct {
|
||||
Name string `json:"name"`
|
||||
Type string `json:"type"`
|
||||
} `json:"devices,omitempty"`
|
||||
}
|
||||
|
||||
type deviceT struct {
|
||||
Name string `json:"name"`
|
||||
Type string `json:"type"`
|
||||
queryCommand []string
|
||||
}
|
||||
|
||||
type SmartMonCollector struct {
|
||||
metricCollector
|
||||
config SmartMonCollectorConfig // the configuration structure
|
||||
meta map[string]string // default meta information
|
||||
tags map[string]string // default tags
|
||||
devices []deviceT // smartmon devices
|
||||
sudoCmd string // Full path to 'sudo' command
|
||||
smartCtlCmd string // Full path to 'smartctl' command
|
||||
excludeMetric struct {
|
||||
temp,
|
||||
percentUsed,
|
||||
availSpare,
|
||||
dataUnitsRead,
|
||||
dataUnitsWrite,
|
||||
hostReads,
|
||||
hostWrites,
|
||||
powerCycles,
|
||||
powerOn,
|
||||
UnsafeShutdowns,
|
||||
mediaErrors,
|
||||
errlogEntries,
|
||||
warnTempTime,
|
||||
critCompTime bool
|
||||
}
|
||||
}
|
||||
|
||||
func (m *SmartMonCollector) getSmartmonDevices() error {
|
||||
// Use configured devices
|
||||
if len(m.config.Devices) > 0 {
|
||||
for _, configDevice := range m.config.Devices {
|
||||
if !slices.Contains(m.config.ExcludeDevices, configDevice.Name) {
|
||||
d := deviceT{
|
||||
Name: configDevice.Name,
|
||||
Type: configDevice.Type,
|
||||
}
|
||||
if m.config.UseSudo {
|
||||
d.queryCommand = append(d.queryCommand, m.sudoCmd)
|
||||
}
|
||||
d.queryCommand = append(d.queryCommand, m.smartCtlCmd, "--json=c", "--device="+d.Type, "--all", d.Name)
|
||||
|
||||
m.devices = append(m.devices, d)
|
||||
}
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
// Use scan command
|
||||
var scanCmd []string
|
||||
if m.config.UseSudo {
|
||||
scanCmd = append(scanCmd, m.sudoCmd)
|
||||
}
|
||||
scanCmd = append(scanCmd, m.smartCtlCmd, "--scan", "--json=c")
|
||||
command := exec.Command(scanCmd[0], scanCmd[1:]...)
|
||||
|
||||
stdout, err := command.Output()
|
||||
if err != nil {
|
||||
return fmt.Errorf(
|
||||
"%s getSmartmonDevices(): Failed to execute device scan command %s: %w",
|
||||
m.name, command.String(), err)
|
||||
}
|
||||
|
||||
var scanOutput struct {
|
||||
Devices []deviceT `json:"devices"`
|
||||
}
|
||||
err = json.Unmarshal(stdout, &scanOutput)
|
||||
if err != nil {
|
||||
return fmt.Errorf("%s getSmartmonDevices(): Failed to parse JSON output from device scan command: %w",
|
||||
m.name, err)
|
||||
}
|
||||
|
||||
m.devices = make([]deviceT, 0)
|
||||
for _, d := range scanOutput.Devices {
|
||||
if !slices.Contains(m.config.ExcludeDevices, d.Name) {
|
||||
if m.config.UseSudo {
|
||||
d.queryCommand = append(d.queryCommand, m.sudoCmd)
|
||||
}
|
||||
d.queryCommand = append(d.queryCommand, m.smartCtlCmd, "--json=c", "--device="+d.Type, "--all", d.Name)
|
||||
|
||||
m.devices = append(m.devices, d)
|
||||
}
|
||||
}
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
func (m *SmartMonCollector) Init(config json.RawMessage) error {
|
||||
m.name = "SmartMonCollector"
|
||||
if err := m.setup(); err != nil {
|
||||
return fmt.Errorf("%s Init(): setup() call failed: %w", m.name, err)
|
||||
}
|
||||
m.parallel = true
|
||||
m.meta = map[string]string{
|
||||
"source": m.name,
|
||||
"group": "Disk",
|
||||
}
|
||||
m.tags = map[string]string{
|
||||
"type": "node",
|
||||
"stype": "disk",
|
||||
}
|
||||
|
||||
// Read in the JSON configuration
|
||||
if len(config) > 0 {
|
||||
d := json.NewDecoder(bytes.NewReader(config))
|
||||
d.DisallowUnknownFields()
|
||||
if err := d.Decode(&m.config); err != nil {
|
||||
return fmt.Errorf("%s Init(): Error reading config: %w", m.name, err)
|
||||
}
|
||||
}
|
||||
for _, excludeMetric := range m.config.ExcludeMetrics {
|
||||
switch excludeMetric {
|
||||
case "smartmon_temp":
|
||||
m.excludeMetric.temp = true
|
||||
case "smartmon_percent_used":
|
||||
m.excludeMetric.percentUsed = true
|
||||
case "smartmon_avail_spare":
|
||||
m.excludeMetric.availSpare = true
|
||||
case "smartmon_data_units_read":
|
||||
m.excludeMetric.dataUnitsRead = true
|
||||
case "smartmon_data_units_write":
|
||||
m.excludeMetric.dataUnitsWrite = true
|
||||
case "smartmon_host_reads":
|
||||
m.excludeMetric.hostReads = true
|
||||
case "smartmon_host_writes":
|
||||
m.excludeMetric.hostWrites = true
|
||||
case "smartmon_power_cycles":
|
||||
m.excludeMetric.powerCycles = true
|
||||
case "smartmon_power_on":
|
||||
m.excludeMetric.powerOn = true
|
||||
case "smartmon_unsafe_shutdowns":
|
||||
m.excludeMetric.UnsafeShutdowns = true
|
||||
case "smartmon_media_errors":
|
||||
m.excludeMetric.mediaErrors = true
|
||||
case "smartmon_errlog_entries":
|
||||
m.excludeMetric.errlogEntries = true
|
||||
case "smartmon_warn_temp_time":
|
||||
m.excludeMetric.warnTempTime = true
|
||||
case "smartmon_crit_comp_time":
|
||||
m.excludeMetric.critCompTime = true
|
||||
default:
|
||||
return fmt.Errorf("%s Init(): Unknown excluded metric: %s", m.name, excludeMetric)
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
// Check if sudo and smartctl are in search path
|
||||
if m.config.UseSudo {
|
||||
p, err := exec.LookPath("sudo")
|
||||
if err != nil {
|
||||
return fmt.Errorf("%s Init(): No sudo command found in search path: %w", m.name, err)
|
||||
}
|
||||
m.sudoCmd = p
|
||||
}
|
||||
p, err := exec.LookPath("smartctl")
|
||||
if err != nil {
|
||||
return fmt.Errorf("%s Init(): No smartctl command found in search path: %w", m.name, err)
|
||||
}
|
||||
m.smartCtlCmd = p
|
||||
|
||||
if err = m.getSmartmonDevices(); err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
m.init = true
|
||||
return err
|
||||
}
|
||||
|
||||
type SmartMonData struct {
|
||||
SerialNumber string `json:"serial_number"`
|
||||
UserCapacity struct {
|
||||
Blocks int `json:"blocks"`
|
||||
Bytes int `json:"bytes"`
|
||||
} `json:"user_capacity"`
|
||||
HealthLog struct {
|
||||
// Available SMART health information:
|
||||
// sudo smartctl -a --json=c /dev/nvme0 | jq --color-output | less --RAW-CONTROL-CHARS
|
||||
Temperature int `json:"temperature"`
|
||||
PercentageUsed int `json:"percentage_used"`
|
||||
AvailableSpare int `json:"available_spare"`
|
||||
DataUnitsRead int `json:"data_units_read"`
|
||||
DataUnitsWrite int `json:"data_units_written"`
|
||||
HostReads int `json:"host_reads"`
|
||||
HostWrites int `json:"host_writes"`
|
||||
PowerCycles int `json:"power_cycles"`
|
||||
PowerOnHours int `json:"power_on_hours"`
|
||||
UnsafeShutdowns int `json:"unsafe_shutdowns"`
|
||||
MediaErrors int `json:"media_errors"`
|
||||
NumErrorLogEntries int `json:"num_err_log_entries"`
|
||||
WarnTempTime int `json:"warning_temp_time"`
|
||||
CriticalCompTime int `json:"critical_comp_time"`
|
||||
} `json:"nvme_smart_health_information_log"`
|
||||
}
|
||||
|
||||
func (m *SmartMonCollector) Read(interval time.Duration, output chan lp.CCMessage) {
|
||||
timestamp := time.Now()
|
||||
for _, d := range m.devices {
|
||||
var data SmartMonData
|
||||
command := exec.Command(d.queryCommand[0], d.queryCommand[1:]...)
|
||||
|
||||
stdout, err := command.Output()
|
||||
if err != nil {
|
||||
cclog.ComponentErrorf(m.name, "cannot read data for device %s", d.Name)
|
||||
continue
|
||||
}
|
||||
err = json.Unmarshal(stdout, &data)
|
||||
if err != nil {
|
||||
cclog.ComponentErrorf(m.name, "cannot unmarshal data for device %s", d.Name)
|
||||
continue
|
||||
}
|
||||
if !m.excludeMetric.temp {
|
||||
y, err := lp.NewMetric(
|
||||
"smartmon_temp", m.tags, m.meta, data.HealthLog.Temperature, timestamp)
|
||||
if err == nil {
|
||||
y.AddTag("stype-id", d.Name)
|
||||
y.AddMeta("unit", "degC")
|
||||
output <- y
|
||||
}
|
||||
}
|
||||
if !m.excludeMetric.percentUsed {
|
||||
y, err := lp.NewMetric(
|
||||
"smartmon_percent_used", m.tags, m.meta, data.HealthLog.PercentageUsed, timestamp)
|
||||
if err == nil {
|
||||
y.AddTag("stype-id", d.Name)
|
||||
y.AddMeta("unit", "percent")
|
||||
output <- y
|
||||
}
|
||||
}
|
||||
if !m.excludeMetric.availSpare {
|
||||
y, err := lp.NewMetric(
|
||||
"smartmon_avail_spare", m.tags, m.meta, data.HealthLog.AvailableSpare, timestamp)
|
||||
if err == nil {
|
||||
y.AddTag("stype-id", d.Name)
|
||||
y.AddMeta("unit", "percent")
|
||||
output <- y
|
||||
}
|
||||
}
|
||||
if !m.excludeMetric.dataUnitsRead {
|
||||
y, err := lp.NewMetric(
|
||||
"smartmon_data_units_read", m.tags, m.meta, data.HealthLog.DataUnitsRead, timestamp)
|
||||
if err == nil {
|
||||
y.AddTag("stype-id", d.Name)
|
||||
output <- y
|
||||
}
|
||||
}
|
||||
if !m.excludeMetric.dataUnitsWrite {
|
||||
y, err := lp.NewMetric(
|
||||
"smartmon_data_units_write", m.tags, m.meta, data.HealthLog.DataUnitsWrite, timestamp)
|
||||
if err == nil {
|
||||
y.AddTag("stype-id", d.Name)
|
||||
output <- y
|
||||
}
|
||||
}
|
||||
if !m.excludeMetric.hostReads {
|
||||
y, err := lp.NewMetric(
|
||||
"smartmon_host_reads", m.tags, m.meta, data.HealthLog.HostReads, timestamp)
|
||||
if err == nil {
|
||||
y.AddTag("stype-id", d.Name)
|
||||
output <- y
|
||||
}
|
||||
}
|
||||
if !m.excludeMetric.hostWrites {
|
||||
y, err := lp.NewMetric(
|
||||
"smartmon_host_writes", m.tags, m.meta, data.HealthLog.HostWrites, timestamp)
|
||||
if err == nil {
|
||||
y.AddTag("stype-id", d.Name)
|
||||
output <- y
|
||||
}
|
||||
}
|
||||
if !m.excludeMetric.powerCycles {
|
||||
y, err := lp.NewMetric(
|
||||
"smartmon_power_cycles", m.tags, m.meta, data.HealthLog.PowerCycles, timestamp)
|
||||
if err == nil {
|
||||
y.AddTag("stype-id", d.Name)
|
||||
output <- y
|
||||
}
|
||||
}
|
||||
if !m.excludeMetric.powerOn {
|
||||
y, err := lp.NewMetric(
|
||||
"smartmon_power_on", m.tags, m.meta, int64(data.HealthLog.PowerOnHours)*3600, timestamp)
|
||||
if err == nil {
|
||||
y.AddTag("stype-id", d.Name)
|
||||
y.AddMeta("unit", "sec")
|
||||
output <- y
|
||||
}
|
||||
}
|
||||
if !m.excludeMetric.UnsafeShutdowns {
|
||||
y, err := lp.NewMetric(
|
||||
"smartmon_unsafe_shutdowns", m.tags, m.meta, data.HealthLog.UnsafeShutdowns, timestamp)
|
||||
if err == nil {
|
||||
y.AddTag("stype-id", d.Name)
|
||||
output <- y
|
||||
}
|
||||
}
|
||||
if !m.excludeMetric.mediaErrors {
|
||||
y, err := lp.NewMetric(
|
||||
"smartmon_media_errors", m.tags, m.meta, data.HealthLog.MediaErrors, timestamp)
|
||||
if err == nil {
|
||||
y.AddTag("stype-id", d.Name)
|
||||
output <- y
|
||||
}
|
||||
}
|
||||
if !m.excludeMetric.errlogEntries {
|
||||
y, err := lp.NewMetric(
|
||||
"smartmon_errlog_entries", m.tags, m.meta, data.HealthLog.NumErrorLogEntries, timestamp)
|
||||
if err == nil {
|
||||
y.AddTag("stype-id", d.Name)
|
||||
output <- y
|
||||
}
|
||||
}
|
||||
if !m.excludeMetric.warnTempTime {
|
||||
y, err := lp.NewMetric(
|
||||
"smartmon_warn_temp_time", m.tags, m.meta, data.HealthLog.WarnTempTime, timestamp)
|
||||
if err == nil {
|
||||
y.AddTag("stype-id", d.Name)
|
||||
output <- y
|
||||
}
|
||||
}
|
||||
if !m.excludeMetric.critCompTime {
|
||||
y, err := lp.NewMetric(
|
||||
"smartmon_crit_comp_time", m.tags, m.meta, data.HealthLog.CriticalCompTime, timestamp)
|
||||
if err == nil {
|
||||
y.AddTag("stype-id", d.Name)
|
||||
output <- y
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func (m *SmartMonCollector) Close() {
|
||||
m.init = false
|
||||
}
|
||||
67
collectors/smartmonMetric.md
Normal file
67
collectors/smartmonMetric.md
Normal file
@@ -0,0 +1,67 @@
|
||||
<!--
|
||||
---
|
||||
title: smartmon metric collector
|
||||
description: Collect S.M.A.R.T data from NVMEs
|
||||
categories: [cc-metric-collector]
|
||||
tags: ['Admin']
|
||||
weight: 2
|
||||
hugo_path: docs/reference/cc-metric-collector/collectors/smartmonMetric.md
|
||||
---
|
||||
-->
|
||||
|
||||
## `smartmon` collector
|
||||
|
||||
```json
|
||||
"smartmon": {
|
||||
"use_sudo": true,
|
||||
"exclude_devices": [
|
||||
"/dev/sda"
|
||||
],
|
||||
"excludeMetrics": [
|
||||
"smartmon_warn_temp_time",
|
||||
"smartmon_crit_comp_time"
|
||||
],
|
||||
"devices": [
|
||||
{
|
||||
"name": "/dev/nvme0",
|
||||
"type": "nvme"
|
||||
}
|
||||
]
|
||||
}
|
||||
```
|
||||
|
||||
The `smartmon` collector retrieves S.M.A.R.T data from NVMEs via command `smartctl`.
|
||||
|
||||
Available NVMEs can be either automatically detected by a device scan or manually added with the "devices" config option.
|
||||
|
||||
Metrics:
|
||||
|
||||
* `smartmon_temp`: Temperature of the device (`unit=degC`)
|
||||
* `smartmon_avail_spare`: Amount of spare left (`unit=percent`)
|
||||
* `smartmon_percent_used`: Percentage of the device is used (`unit=percent`)
|
||||
* `smartmon_data_units_read`: Read data units
|
||||
* `smartmon_data_units_write`: Written data units
|
||||
* `smartmon_host_reads`: Read operations
|
||||
* `smartmon_host_writes`: Write operations
|
||||
* `smartmon_power_cycles`: Number of power cycles
|
||||
* `smartmon_power_on`: Seconds the device is powered on (`unit=seconds`)
|
||||
* `smartmon_unsafe_shutdowns`: Count of unsafe shutdowns
|
||||
* `smartmon_media_errors`: Media errors of the device
|
||||
* `smartmon_errlog_entries`: Error log entries
|
||||
* `smartmon_warn_temp_time`: Time above the warning temperature threshold
|
||||
* `smartmon_crit_comp_time`: Time above the critical composite temperature threshold
|
||||
|
||||
`smartctl` typically require root to run.
|
||||
In order to run `cc-metric-collector` without root priviliges, you can enable `use_sudo`.
|
||||
Add a file like this in `/etc/sudoers.d/` to allow `cc-metric-collector` to run the required command:
|
||||
|
||||
```
|
||||
# Do not log the following sudo commands from monitoring, since this causes a lot of log spam.
|
||||
# However keep log_denied enabled, to detect failures
|
||||
Defaults: monitoring !log_allowed, !pam_session
|
||||
|
||||
# Allow to use lctl
|
||||
monitoring ALL = (root) NOPASSWD:/absolute/path/to/smartctl --json=c --device=* "--all" *
|
||||
# Or add individual rules for each device
|
||||
# monitoring ALL = (root) NOPASSWD:/absolute/path/to/smartctl --json=c --device=<device_type> "--all" <device>
|
||||
```
|
||||
@@ -8,6 +8,7 @@
|
||||
package collectors
|
||||
|
||||
import (
|
||||
"bytes"
|
||||
"encoding/json"
|
||||
"fmt"
|
||||
"os"
|
||||
@@ -63,9 +64,10 @@ func (m *TempCollector) Init(config json.RawMessage) error {
|
||||
return fmt.Errorf("%s Init(): setup() call failed: %w", m.name, err)
|
||||
}
|
||||
if len(config) > 0 {
|
||||
err := json.Unmarshal(config, &m.config)
|
||||
if err != nil {
|
||||
return fmt.Errorf("%s Init(): failed to unmarshal JSON config: %w", m.name, err)
|
||||
d := json.NewDecoder(bytes.NewReader(config))
|
||||
d.DisallowUnknownFields()
|
||||
if err := d.Decode(&m.config); err != nil {
|
||||
return fmt.Errorf("%s Init(): Error decoding JSON config: %w", m.name, err)
|
||||
}
|
||||
}
|
||||
|
||||
@@ -186,39 +188,27 @@ func (m *TempCollector) Read(interval time.Duration, output chan lp.CCMessage) {
|
||||
// Read sensor file
|
||||
buffer, err := os.ReadFile(sensor.file)
|
||||
if err != nil {
|
||||
cclog.ComponentError(
|
||||
cclog.ComponentErrorf(
|
||||
m.name,
|
||||
fmt.Sprintf("Read(): Failed to read file '%s': %v", sensor.file, err))
|
||||
"Read(): Failed to read file '%s': %v", sensor.file, err)
|
||||
continue
|
||||
}
|
||||
x, err := strconv.ParseInt(strings.TrimSpace(string(buffer)), 10, 64)
|
||||
if err != nil {
|
||||
cclog.ComponentError(
|
||||
cclog.ComponentErrorf(
|
||||
m.name,
|
||||
fmt.Sprintf("Read(): Failed to convert temperature '%s' to int64: %v", buffer, err))
|
||||
"Read(): Failed to convert temperature '%s' to int64: %v", buffer, err)
|
||||
continue
|
||||
}
|
||||
x /= 1000
|
||||
y, err := lp.NewMessage(
|
||||
sensor.metricName,
|
||||
sensor.tags,
|
||||
m.meta,
|
||||
map[string]any{"value": x},
|
||||
time.Now(),
|
||||
)
|
||||
y, err := lp.NewMetric(sensor.metricName, sensor.tags, m.meta, x, time.Now())
|
||||
if err == nil {
|
||||
output <- y
|
||||
}
|
||||
|
||||
// max temperature
|
||||
if m.config.ReportMaxTemp && sensor.maxTemp != 0 {
|
||||
y, err := lp.NewMessage(
|
||||
sensor.maxTempName,
|
||||
sensor.tags,
|
||||
m.meta,
|
||||
map[string]any{"value": sensor.maxTemp},
|
||||
time.Now(),
|
||||
)
|
||||
y, err := lp.NewMetric(sensor.maxTempName, sensor.tags, m.meta, sensor.maxTemp, time.Now())
|
||||
if err == nil {
|
||||
output <- y
|
||||
}
|
||||
@@ -226,13 +216,7 @@ func (m *TempCollector) Read(interval time.Duration, output chan lp.CCMessage) {
|
||||
|
||||
// critical temperature
|
||||
if m.config.ReportCriticalTemp && sensor.critTemp != 0 {
|
||||
y, err := lp.NewMessage(
|
||||
sensor.critTempName,
|
||||
sensor.tags,
|
||||
m.meta,
|
||||
map[string]any{"value": sensor.critTemp},
|
||||
time.Now(),
|
||||
)
|
||||
y, err := lp.NewMetric(sensor.critTempName, sensor.tags, m.meta, sensor.critTemp, time.Now())
|
||||
if err == nil {
|
||||
output <- y
|
||||
}
|
||||
|
||||
@@ -14,10 +14,10 @@ hugo_path: docs/reference/cc-metric-collector/collectors/temp.md
|
||||
|
||||
```json
|
||||
"tempstat": {
|
||||
"tag_override" : {
|
||||
"<device like hwmon1>" : {
|
||||
"type" : "socket",
|
||||
"type-id" : "0"
|
||||
"tag_override": {
|
||||
"<device like hwmon1>": {
|
||||
"type": "socket",
|
||||
"type-id": "0"
|
||||
}
|
||||
},
|
||||
"exclude_metrics": [
|
||||
|
||||
@@ -8,6 +8,7 @@
|
||||
package collectors
|
||||
|
||||
import (
|
||||
"bytes"
|
||||
"encoding/json"
|
||||
"fmt"
|
||||
"os/exec"
|
||||
@@ -46,9 +47,10 @@ func (m *TopProcsCollector) Init(config json.RawMessage) error {
|
||||
"group": "TopProcs",
|
||||
}
|
||||
if len(config) > 0 {
|
||||
err = json.Unmarshal(config, &m.config)
|
||||
if err != nil {
|
||||
return fmt.Errorf("%s Init(): json.Unmarshal() failed: %w", m.name, err)
|
||||
d := json.NewDecoder(bytes.NewReader(config))
|
||||
d.DisallowUnknownFields()
|
||||
if err := d.Decode(&m.config); err != nil {
|
||||
return fmt.Errorf("%s Init(): Error decoding JSON config: %w", m.name, err)
|
||||
}
|
||||
} else {
|
||||
m.config.Num_procs = int(DEFAULT_NUM_PROCS)
|
||||
@@ -75,24 +77,16 @@ func (m *TopProcsCollector) Read(interval time.Duration, output chan lp.CCMessag
|
||||
command := exec.Command("ps", "-Ao", "comm", "--sort=-pcpu")
|
||||
stdout, err := command.Output()
|
||||
if err != nil {
|
||||
cclog.ComponentError(
|
||||
cclog.ComponentErrorf(
|
||||
m.name,
|
||||
fmt.Sprintf("Read(): Failed to read output from command \"%s\": %v", command.String(), err))
|
||||
"Read(): Failed to read output from command \"%s\": %v", command.String(), err)
|
||||
return
|
||||
}
|
||||
|
||||
lines := strings.Split(string(stdout), "\n")
|
||||
for i := 1; i < m.config.Num_procs+1; i++ {
|
||||
name := fmt.Sprintf("topproc%d", i)
|
||||
y, err := lp.NewMessage(
|
||||
name,
|
||||
m.tags,
|
||||
m.meta,
|
||||
map[string]any{
|
||||
"value": lines[i],
|
||||
},
|
||||
time.Now())
|
||||
if err == nil {
|
||||
if y, err := lp.NewMetric(name, m.tags, m.meta, lines[i], time.Now()); err == nil {
|
||||
output <- y
|
||||
}
|
||||
}
|
||||
|
||||
@@ -34,8 +34,8 @@
|
||||
},
|
||||
"numastats": {},
|
||||
"nvidia": {},
|
||||
"schedstat": {
|
||||
},
|
||||
"schedstat": {},
|
||||
"smartmon": {},
|
||||
"tempstat": {
|
||||
"report_max_temperature": true,
|
||||
"report_critical_temperature": true,
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
{
|
||||
"process_messages" : {
|
||||
"add_tag_if": [
|
||||
"add_tags_if": [
|
||||
{
|
||||
"key" : "cluster",
|
||||
"value" : "testcluster",
|
||||
@@ -12,7 +12,7 @@
|
||||
"if" : "name == 'temp_package_id_0'"
|
||||
}
|
||||
],
|
||||
"delete_tag_if": [
|
||||
"delete_meta_if": [
|
||||
{
|
||||
"key" : "unit",
|
||||
"if" : "true"
|
||||
|
||||
33
go.mod
33
go.mod
@@ -3,14 +3,14 @@ module github.com/ClusterCockpit/cc-metric-collector
|
||||
go 1.25.0
|
||||
|
||||
require (
|
||||
github.com/ClusterCockpit/cc-lib/v2 v2.7.0
|
||||
github.com/ClusterCockpit/go-rocm-smi v0.3.0
|
||||
github.com/ClusterCockpit/cc-lib/v2 v2.12.0
|
||||
github.com/ClusterCockpit/go-rocm-smi v0.4.0
|
||||
github.com/NVIDIA/go-nvml v0.13.0-1
|
||||
github.com/PaesslerAG/gval v1.2.4
|
||||
github.com/fsnotify/fsnotify v1.9.0
|
||||
github.com/tklauser/go-sysconf v0.3.16
|
||||
golang.design/x/thread v0.0.0-20210122121316-335e9adffdf1
|
||||
golang.org/x/sys v0.41.0
|
||||
github.com/fsnotify/fsnotify v1.10.1
|
||||
github.com/tklauser/go-sysconf v0.4.0
|
||||
golang.design/x/thread v0.3.2
|
||||
golang.org/x/sys v0.45.0
|
||||
)
|
||||
|
||||
require (
|
||||
@@ -18,28 +18,29 @@ require (
|
||||
github.com/apapsch/go-jsonmerge/v2 v2.0.0 // indirect
|
||||
github.com/beorn7/perks v1.0.1 // indirect
|
||||
github.com/cespare/xxhash/v2 v2.3.0 // indirect
|
||||
github.com/coder/websocket v1.8.14 // indirect
|
||||
github.com/expr-lang/expr v1.17.8 // indirect
|
||||
github.com/google/uuid v1.6.0 // indirect
|
||||
github.com/gorilla/mux v1.8.1 // indirect
|
||||
github.com/influxdata/influxdb-client-go/v2 v2.14.0 // indirect
|
||||
github.com/influxdata/line-protocol v0.0.0-20210922203350-b1ad95c89adf // indirect
|
||||
github.com/klauspost/compress v1.18.4 // indirect
|
||||
github.com/klauspost/compress v1.18.5 // indirect
|
||||
github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822 // indirect
|
||||
github.com/nats-io/nats.go v1.49.0 // indirect
|
||||
github.com/nats-io/nats.go v1.51.0 // indirect
|
||||
github.com/nats-io/nkeys v0.4.15 // indirect
|
||||
github.com/nats-io/nuid v1.0.1 // indirect
|
||||
github.com/oapi-codegen/runtime v1.2.0 // indirect
|
||||
github.com/oapi-codegen/runtime v1.4.0 // indirect
|
||||
github.com/prometheus/client_golang v1.23.2 // indirect
|
||||
github.com/prometheus/client_model v0.6.2 // indirect
|
||||
github.com/prometheus/common v0.67.5 // indirect
|
||||
github.com/prometheus/procfs v0.20.0 // indirect
|
||||
github.com/prometheus/procfs v0.20.1 // indirect
|
||||
github.com/questdb/go-questdb-client/v4 v4.2.0 // indirect
|
||||
github.com/santhosh-tekuri/jsonschema/v5 v5.3.1 // indirect
|
||||
github.com/shopspring/decimal v1.4.0 // indirect
|
||||
github.com/stmcginnis/gofish v0.21.3 // indirect
|
||||
github.com/tklauser/numcpus v0.11.0 // indirect
|
||||
go.yaml.in/yaml/v2 v2.4.3 // indirect
|
||||
golang.org/x/crypto v0.48.0 // indirect
|
||||
golang.org/x/exp v0.0.0-20260218203240-3dfff04db8fa // indirect
|
||||
golang.org/x/net v0.51.0 // indirect
|
||||
github.com/stmcginnis/gofish v0.21.6 // indirect
|
||||
github.com/tklauser/numcpus v0.12.0 // indirect
|
||||
go.yaml.in/yaml/v2 v2.4.4 // indirect
|
||||
golang.org/x/crypto v0.50.0 // indirect
|
||||
golang.org/x/net v0.53.0 // indirect
|
||||
google.golang.org/protobuf v1.36.11 // indirect
|
||||
)
|
||||
|
||||
170
go.sum
170
go.sum
@@ -1,10 +1,17 @@
|
||||
github.com/ClusterCockpit/cc-lib/v2 v2.7.0 h1:EMTShk6rMTR1wlfmQ8SVCawH1OdltUbD3kVQmaW+5pE=
|
||||
github.com/ClusterCockpit/cc-lib/v2 v2.7.0/go.mod h1:0Etx8WMs0lYZ4tiOQizY18CQop+2i3WROvU9rMUxHA4=
|
||||
dario.cat/mergo v1.0.0 h1:AGCNq9Evsj31mOgNPcLyXc+4PNABt905YmuqPYYpBWk=
|
||||
dario.cat/mergo v1.0.0/go.mod h1:uNxQE+84aUszobStD9th8a29P2fMDhsBdgRYvZOxGmk=
|
||||
github.com/Azure/go-ansiterm v0.0.0-20230124172434-306776ec8161 h1:L/gRVlceqvL25UVaW/CKtUDjefjrs0SPonmDGUVOYP0=
|
||||
github.com/Azure/go-ansiterm v0.0.0-20230124172434-306776ec8161/go.mod h1:xomTg63KZ2rFqZQzSB4Vz2SUXa1BpHTVz9L5PTmPC4E=
|
||||
github.com/ClusterCockpit/cc-lib/v2 v2.12.0 h1:ZbGD68nDniuvzFjJCdyYawpCBrabdSyWOg5FFSyFbjQ=
|
||||
github.com/ClusterCockpit/cc-lib/v2 v2.12.0/go.mod h1:ml8xtcYa5WhPM7JDQ+M9/R9ZBxITCR/5xqGJ//GxXJI=
|
||||
github.com/ClusterCockpit/cc-line-protocol/v2 v2.4.0 h1:hIzxgTBWcmCIHtoDKDkSCsKCOCOwUC34sFsbD2wcW0Q=
|
||||
github.com/ClusterCockpit/cc-line-protocol/v2 v2.4.0/go.mod h1:y42qUu+YFmu5fdNuUAS4VbbIKxVjxCvbVqFdpdh8ahY=
|
||||
github.com/ClusterCockpit/go-rocm-smi v0.3.0 h1:1qZnSpG7/NyLtc7AjqnUL9Jb8xtqG1nMVgp69rJfaR8=
|
||||
github.com/ClusterCockpit/go-rocm-smi v0.3.0/go.mod h1:+I3UMeX3OlizXDf1WpGD43W4KGZZGVSGmny6rTeOnWA=
|
||||
github.com/NVIDIA/go-nvml v0.11.6-0/go.mod h1:hy7HYeQy335x6nEss0Ne3PYqleRa6Ct+VKD9RQ4nyFs=
|
||||
github.com/ClusterCockpit/go-rocm-smi v0.4.0 h1:3+bEPrSkjEJcOtt+qBUX48ugDVlOFaKUnXHTef2Ve2Q=
|
||||
github.com/ClusterCockpit/go-rocm-smi v0.4.0/go.mod h1:c19u5vBCcgb7DjL4EWTGSGpo6c79d07r4rxD50z25ng=
|
||||
github.com/Microsoft/go-winio v0.6.1 h1:9/kr64B9VUZrLm5YYwbGtUJnMgqWVOdUAXu6Migciow=
|
||||
github.com/Microsoft/go-winio v0.6.1/go.mod h1:LRdKpFKfdobln8UmuiYcKPot9D2v6svN5+sAH+4kjUM=
|
||||
github.com/Microsoft/hcsshim v0.11.4 h1:68vKo2VN8DE9AdN4tnkWnmdhqdbpUFM8OF3Airm7fz8=
|
||||
github.com/Microsoft/hcsshim v0.11.4/go.mod h1:smjE4dvqPX9Zldna+t5FG3rnoHhaB7QYxPRqGcpAD9w=
|
||||
github.com/NVIDIA/go-nvml v0.13.0-1 h1:OLX8Jq3dONuPOQPC7rndB6+iDmDakw0XTYgzMxObkEw=
|
||||
github.com/NVIDIA/go-nvml v0.13.0-1/go.mod h1:+KNA7c7gIBH7SKSJ1ntlwkfN80zdx8ovl4hrK3LmPt4=
|
||||
github.com/PaesslerAG/gval v1.2.4 h1:rhX7MpjJlcxYwL2eTTYIOBUyEKZ+A96T9vQySWkVUiU=
|
||||
@@ -12,28 +19,52 @@ github.com/PaesslerAG/gval v1.2.4/go.mod h1:XRFLwvmkTEdYziLdaCeCa5ImcGVrfQbeNUbV
|
||||
github.com/PaesslerAG/jsonpath v0.1.0 h1:gADYeifvlqK3R3i2cR5B4DGgxLXIPb3TRTH1mGi0jPI=
|
||||
github.com/PaesslerAG/jsonpath v0.1.0/go.mod h1:4BzmtoM/PI8fPO4aQGIusjGxGir2BzcV0grWtFzq1Y8=
|
||||
github.com/RaveNoX/go-jsoncommentstrip v1.0.0/go.mod h1:78ihd09MekBnJnxpICcwzCMzGrKSKYe4AqU6PDYYpjk=
|
||||
github.com/antithesishq/antithesis-sdk-go v0.5.0-default-no-op h1:Ucf+QxEKMbPogRO5guBNe5cgd9uZgfoJLOYs8WWhtjM=
|
||||
github.com/antithesishq/antithesis-sdk-go v0.5.0-default-no-op/go.mod h1:IUpT2DPAKh6i/YhSbt6Gl3v2yvUZjmKncl7U91fup7E=
|
||||
github.com/antithesishq/antithesis-sdk-go v0.6.0-default-no-op h1:kpBdlEPbRvff0mDD1gk7o9BhI16b9p5yYAXRlidpqJE=
|
||||
github.com/antithesishq/antithesis-sdk-go v0.6.0-default-no-op/go.mod h1:IUpT2DPAKh6i/YhSbt6Gl3v2yvUZjmKncl7U91fup7E=
|
||||
github.com/apapsch/go-jsonmerge/v2 v2.0.0 h1:axGnT1gRIfimI7gJifB699GoE/oq+F2MU7Dml6nw9rQ=
|
||||
github.com/apapsch/go-jsonmerge/v2 v2.0.0/go.mod h1:lvDnEdqiQrp0O42VQGgmlKpxL1AP2+08jFMw88y4klk=
|
||||
github.com/beorn7/perks v1.0.1 h1:VlbKKnNfV8bJzeqoa4cOKqO6bYr3WgKZxO8Z16+hsOM=
|
||||
github.com/beorn7/perks v1.0.1/go.mod h1:G2ZrVWU2WbWT9wwq4/hrbKbnv/1ERSJQ0ibhJ6rlkpw=
|
||||
github.com/bmatcuk/doublestar v1.1.1/go.mod h1:UD6OnuiIn0yFxxA2le/rnRU1G4RaI4UvFv1sNto9p6w=
|
||||
github.com/cenkalti/backoff/v4 v4.2.1 h1:y4OZtCnogmCPw98Zjyt5a6+QwPLGkiQsYW5oUqylYbM=
|
||||
github.com/cenkalti/backoff/v4 v4.2.1/go.mod h1:Y3VNntkOUPxTVeUxJ/G5vcM//AlwfmyYozVcomhLiZE=
|
||||
github.com/cespare/xxhash/v2 v2.3.0 h1:UL815xU9SqsFlibzuggzjXhog7bL6oX9BbNZnL2UFvs=
|
||||
github.com/cespare/xxhash/v2 v2.3.0/go.mod h1:VGX0DQ3Q6kWi7AoAeZDth3/j3BFtOZR5XLFGgcrjCOs=
|
||||
github.com/coder/websocket v1.8.14 h1:9L0p0iKiNOibykf283eHkKUHHrpG7f65OE3BhhO7v9g=
|
||||
github.com/coder/websocket v1.8.14/go.mod h1:NX3SzP+inril6yawo5CQXx8+fk145lPDC6pumgx0mVg=
|
||||
github.com/containerd/containerd v1.7.12 h1:+KQsnv4VnzyxWcfO9mlxxELaoztsDEjOuCMPAuPqgU0=
|
||||
github.com/containerd/containerd v1.7.12/go.mod h1:/5OMpE1p0ylxtEUGY8kuCYkDRzJm9NO1TFMWjUpdevk=
|
||||
github.com/containerd/log v0.1.0 h1:TCJt7ioM2cr/tfR8GPbGf9/VRAX8D2B4PjzCpfX540I=
|
||||
github.com/containerd/log v0.1.0/go.mod h1:VRRf09a7mHDIRezVKTRCrOq78v577GXq3bSa3EhrzVo=
|
||||
github.com/cpuguy83/dockercfg v0.3.1 h1:/FpZ+JaygUR/lZP2NlFI2DVfrOEMAIKP5wWEJdoYe9E=
|
||||
github.com/cpuguy83/dockercfg v0.3.1/go.mod h1:sugsbF4//dDlL/i+S+rtpIWp+5h0BHJHfjj5/jFyUJc=
|
||||
github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
|
||||
github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c=
|
||||
github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
|
||||
github.com/docker/distribution v2.8.2+incompatible h1:T3de5rq0dB1j30rp0sA2rER+m322EBzniBPB6ZIzuh8=
|
||||
github.com/docker/distribution v2.8.2+incompatible/go.mod h1:J2gT2udsDAN96Uj4KfcMRqY0/ypR+oyYUYmja8H+y+w=
|
||||
github.com/docker/docker v24.0.9+incompatible h1:HPGzNmwfLZWdxHqK9/II92pyi1EpYKsAqcl4G0Of9v0=
|
||||
github.com/docker/docker v24.0.9+incompatible/go.mod h1:eEKB0N0r5NX/I1kEveEz05bcu8tLC/8azJZsviup8Sk=
|
||||
github.com/docker/go-connections v0.5.0 h1:USnMq7hx7gwdVZq1L49hLXaFtUdTADjXGp+uj1Br63c=
|
||||
github.com/docker/go-connections v0.5.0/go.mod h1:ov60Kzw0kKElRwhNs9UlUHAE/F9Fe6GLaXnqyDdmEXc=
|
||||
github.com/docker/go-units v0.5.0 h1:69rxXcBk27SvSaaxTtLh/8llcHD8vYHT7WSdRZ/jvr4=
|
||||
github.com/docker/go-units v0.5.0/go.mod h1:fgPhTUdO+D/Jk86RDLlptpiXQzgHJF7gydDDbaIK4Dk=
|
||||
github.com/expr-lang/expr v1.17.8 h1:W1loDTT+0PQf5YteHSTpju2qfUfNoBt4yw9+wOEU9VM=
|
||||
github.com/expr-lang/expr v1.17.8/go.mod h1:8/vRC7+7HBzESEqt5kKpYXxrxkr31SaO8r40VO/1IT4=
|
||||
github.com/frankban/quicktest v1.13.0 h1:yNZif1OkDfNoDfb9zZa9aXIpejNR4F23Wely0c+Qdqk=
|
||||
github.com/frankban/quicktest v1.13.0/go.mod h1:qLE0fzW0VuyUAJgPU19zByoIr0HtCHN/r/VLSOOIySU=
|
||||
github.com/fsnotify/fsnotify v1.9.0 h1:2Ml+OJNzbYCTzsxtv8vKSFD9PbJjmhYF14k/jKC7S9k=
|
||||
github.com/fsnotify/fsnotify v1.9.0/go.mod h1:8jBTzvmWwFyi3Pb8djgCCO5IBqzKJ/Jwo8TRcHyHii0=
|
||||
github.com/fsnotify/fsnotify v1.10.1 h1:b0/UzAf9yR5rhf3RPm9gf3ehBPpf0oZKIjtpKrx59Ho=
|
||||
github.com/fsnotify/fsnotify v1.10.1/go.mod h1:TLheqan6HD6GBK6PrDWyDPBaEV8LspOxvPSjC+bVfgo=
|
||||
github.com/go-ole/go-ole v1.3.0 h1:Dt6ye7+vXGIKZ7Xtk4s6/xVdGDQynvom7xCFEdWr6uE=
|
||||
github.com/go-ole/go-ole v1.3.0/go.mod h1:5LS6F96DhAwUc7C+1HLexzMXY1xGRSryjyPPKW6zv78=
|
||||
github.com/gogo/protobuf v1.3.2 h1:Ov1cvc58UF3b5XjBnZv7+opcTcQFZebYjWzi34vdm4Q=
|
||||
github.com/gogo/protobuf v1.3.2/go.mod h1:P1XiOD3dCwIKUDQYPy72D8LYyHL2YPYrpS2s69NZV8Q=
|
||||
github.com/golang/protobuf v1.5.3 h1:KhyjKVUg7Usr/dYsdSqoFveMYd5ko72D+zANwlG1mmg=
|
||||
github.com/golang/protobuf v1.5.3/go.mod h1:XVQd3VNwM+JqD3oG2Ue2ip4fOMUkwXdXDdiuN0vRsmY=
|
||||
github.com/google/go-cmp v0.7.0 h1:wk8382ETsv4JYUZwIsn6YpYiWiBsYLSJiTsyBybVuN8=
|
||||
github.com/google/go-cmp v0.7.0/go.mod h1:pXiqmnSA92OHEEa9HXL2W4E7lf9JzCmGVUdgjX3N/iU=
|
||||
github.com/google/go-tpm v0.9.7 h1:u89J4tUUeDTlH8xxC3CTW7OHZjbjKoHdQ9W7gCUhtxA=
|
||||
github.com/google/go-tpm v0.9.7/go.mod h1:h9jEsEECg7gtLis0upRBQU+GhYVH6jMjrFxI8u6bVUY=
|
||||
github.com/google/go-tpm v0.9.8 h1:slArAR9Ft+1ybZu0lBwpSmpwhRXaa85hWtMinMyRAWo=
|
||||
github.com/google/go-tpm v0.9.8/go.mod h1:h9jEsEECg7gtLis0upRBQU+GhYVH6jMjrFxI8u6bVUY=
|
||||
github.com/google/uuid v1.6.0 h1:NIvaJDMOsjHA8n1jAhLSgzrAzy1Hgr+hNrb57e+94F0=
|
||||
github.com/google/uuid v1.6.0/go.mod h1:TIyPZe4MgqvfeYDBFedMoGGpEw/LqOeaOT+nhxU+yHo=
|
||||
github.com/gorilla/mux v1.8.1 h1:TuBL49tXwgrFYWhqrNgrUNEY92u81SPhu7sTdzQEiWY=
|
||||
@@ -45,79 +76,126 @@ github.com/influxdata/line-protocol v0.0.0-20210922203350-b1ad95c89adf/go.mod h1
|
||||
github.com/influxdata/line-protocol-corpus v0.0.0-20210922080147-aa28ccfb8937 h1:MHJNQ+p99hFATQm6ORoLmpUCF7ovjwEFshs/NHzAbig=
|
||||
github.com/influxdata/line-protocol-corpus v0.0.0-20210922080147-aa28ccfb8937/go.mod h1:BKR9c0uHSmRgM/se9JhFHtTT7JTO67X23MtKMHtZcpo=
|
||||
github.com/juju/gnuflag v0.0.0-20171113085948-2ce1bb71843d/go.mod h1:2PavIy+JPciBPrBUjwbNvtwB6RQlve+hkpll6QSNmOE=
|
||||
github.com/klauspost/compress v1.18.4 h1:RPhnKRAQ4Fh8zU2FY/6ZFDwTVTxgJ/EMydqSTzE9a2c=
|
||||
github.com/klauspost/compress v1.18.4/go.mod h1:R0h/fSBs8DE4ENlcrlib3PsXS61voFxhIs2DeRhCvJ4=
|
||||
github.com/klauspost/compress v1.18.5 h1:/h1gH5Ce+VWNLSWqPzOVn6XBO+vJbCNGvjoaGBFW2IE=
|
||||
github.com/klauspost/compress v1.18.5/go.mod h1:cwPg85FWrGar70rWktvGQj8/hthj3wpl0PGDogxkrSQ=
|
||||
github.com/kr/pretty v0.3.1 h1:flRD4NNwYAUpkphVc1HcthR4KEIFJ65n8Mw5qdRn3LE=
|
||||
github.com/kr/pretty v0.3.1/go.mod h1:hoEshYVHaxMs3cyo3Yncou5ZscifuDolrwPKZanG3xk=
|
||||
github.com/kr/text v0.2.0 h1:5Nx0Ya0ZqY2ygV366QzturHI13Jq95ApcVaJBhpS+AY=
|
||||
github.com/kr/text v0.2.0/go.mod h1:eLer722TekiGuMkidMxC/pM04lWEeraHUUmBw8l2grE=
|
||||
github.com/kylelemons/godebug v1.1.0 h1:RPNrshWIDI6G2gRW9EHilWtl7Z6Sb1BR0xunSBf0SNc=
|
||||
github.com/kylelemons/godebug v1.1.0/go.mod h1:9/0rRGxNHcop5bhtWyNeEfOS8JIWk580+fNqagV/RAw=
|
||||
github.com/lufia/plan9stats v0.0.0-20230326075908-cb1d2100619a h1:N9zuLhTvBSRt0gWSiJswwQ2HqDmtX/ZCDJURnKUt1Ik=
|
||||
github.com/lufia/plan9stats v0.0.0-20230326075908-cb1d2100619a/go.mod h1:JKx41uQRwqlTZabZc+kILPrO/3jlKnQ2Z8b7YiVw5cE=
|
||||
github.com/magiconair/properties v1.8.7 h1:IeQXZAiQcpL9mgcAe1Nu6cX9LLw6ExEHKjN0VQdvPDY=
|
||||
github.com/magiconair/properties v1.8.7/go.mod h1:Dhd985XPs7jluiymwWYZ0G4Z61jb3vdS329zhj2hYo0=
|
||||
github.com/minio/highwayhash v1.0.4-0.20251030100505-070ab1a87a76 h1:KGuD/pM2JpL9FAYvBrnBBeENKZNh6eNtjqytV6TYjnk=
|
||||
github.com/minio/highwayhash v1.0.4-0.20251030100505-070ab1a87a76/go.mod h1:GGYsuwP/fPD6Y9hMiXuapVvlIUEhFhMTh0rxU3ik1LQ=
|
||||
github.com/moby/patternmatcher v0.6.0 h1:GmP9lR19aU5GqSSFko+5pRqHi+Ohk1O69aFiKkVGiPk=
|
||||
github.com/moby/patternmatcher v0.6.0/go.mod h1:hDPoyOpDY7OrrMDLaYoY3hf52gNCR/YOUYxkhApJIxc=
|
||||
github.com/moby/sys/sequential v0.5.0 h1:OPvI35Lzn9K04PBbCLW0g4LcFAJgHsvXsRyewg5lXtc=
|
||||
github.com/moby/sys/sequential v0.5.0/go.mod h1:tH2cOOs5V9MlPiXcQzRC+eEyab644PWKGRYaaV5ZZlo=
|
||||
github.com/moby/term v0.5.0 h1:xt8Q1nalod/v7BqbG21f8mQPqH+xAaC9C3N3wfWbVP0=
|
||||
github.com/moby/term v0.5.0/go.mod h1:8FzsFHVUBGZdbDsJw/ot+X+d5HLUbvklYLJ9uGfcI3Y=
|
||||
github.com/morikuni/aec v1.0.0 h1:nP9CBfwrvYnBRgY6qfDQkygYDmYwOilePFkwzv4dU8A=
|
||||
github.com/morikuni/aec v1.0.0/go.mod h1:BbKIizmSmc5MMPqRYbxO4ZU0S0+P200+tUnFx7PXmsc=
|
||||
github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822 h1:C3w9PqII01/Oq1c1nUAm88MOHcQC9l5mIlSMApZMrHA=
|
||||
github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822/go.mod h1:+n7T8mK8HuQTcFwEeznm/DIxMOiR9yIdICNftLE1DvQ=
|
||||
github.com/nats-io/jwt/v2 v2.8.0 h1:K7uzyz50+yGZDO5o772eRE7atlcSEENpL7P+b74JV1g=
|
||||
github.com/nats-io/jwt/v2 v2.8.0/go.mod h1:me11pOkwObtcBNR8AiMrUbtVOUGkqYjMQZ6jnSdVUIA=
|
||||
github.com/nats-io/nats-server/v2 v2.12.3 h1:KRv+1n7lddMVgkJPQer+pt36TcO0ENxjilBmeWdjcHs=
|
||||
github.com/nats-io/nats-server/v2 v2.12.3/go.mod h1:MQXjG9WjyXKz9koWzUc3jYUMKD8x3CLmTNy91IQQz3Y=
|
||||
github.com/nats-io/nats.go v1.49.0 h1:yh/WvY59gXqYpgl33ZI+XoVPKyut/IcEaqtsiuTJpoE=
|
||||
github.com/nats-io/nats.go v1.49.0/go.mod h1:fDCn3mN5cY8HooHwE2ukiLb4p4G4ImmzvXyJt+tGwdw=
|
||||
github.com/nats-io/jwt/v2 v2.8.1 h1:V0xpGuD/N8Mi+fQNDynXohVvp7ZztevW5io8CUWlPmU=
|
||||
github.com/nats-io/jwt/v2 v2.8.1/go.mod h1:nWnOEEiVMiKHQpnAy4eXlizVEtSfzacZ1Q43LIRavZg=
|
||||
github.com/nats-io/nats-server/v2 v2.12.7 h1:prQ9cPiWHcnwfT81Wi5lU9LL8TLY+7pxDru6fQYLCQQ=
|
||||
github.com/nats-io/nats-server/v2 v2.12.7/go.mod h1:dOnmkprKMluTmTF7/QHZioxlau3sKHUM/LBPy9AiBPw=
|
||||
github.com/nats-io/nats.go v1.51.0 h1:ByW84XTz6W03GSSsygsZcA+xgKK8vPGaa/FCAAEHnAI=
|
||||
github.com/nats-io/nats.go v1.51.0/go.mod h1:26HypzazeOkyO3/mqd1zZd53STJN0EjCYF9Uy2ZOBno=
|
||||
github.com/nats-io/nkeys v0.4.15 h1:JACV5jRVO9V856KOapQ7x+EY8Jo3qw1vJt/9Jpwzkk4=
|
||||
github.com/nats-io/nkeys v0.4.15/go.mod h1:CpMchTXC9fxA5zrMo4KpySxNjiDVvr8ANOSZdiNfUrs=
|
||||
github.com/nats-io/nuid v1.0.1 h1:5iA8DT8V7q8WK2EScv2padNa/rTESc1KdnPw4TC2paw=
|
||||
github.com/nats-io/nuid v1.0.1/go.mod h1:19wcPz3Ph3q0Jbyiqsd0kePYG7A95tJPxeL+1OSON2c=
|
||||
github.com/oapi-codegen/runtime v1.2.0 h1:RvKc1CVS1QeKSNzO97FBQbSMZyQ8s6rZd+LpmzwHMP4=
|
||||
github.com/oapi-codegen/runtime v1.2.0/go.mod h1:Y7ZhmmlE8ikZOmuHRRndiIm7nf3xcVv+YMweKgG1DT0=
|
||||
github.com/oapi-codegen/runtime v1.4.0 h1:KLOSFOp7UzkbS7Cs1ms6NBEKYr0WmH2wZG0KKbd2er4=
|
||||
github.com/oapi-codegen/runtime v1.4.0/go.mod h1:5sw5fxCDmnOzKNYmkVNF8d34kyUeejJEY8HNT2WaPec=
|
||||
github.com/opencontainers/go-digest v1.0.0 h1:apOUWs51W5PlhuyGyz9FCeeBIOUDA/6nW8Oi/yOhh5U=
|
||||
github.com/opencontainers/go-digest v1.0.0/go.mod h1:0JzlMkj0TRzQZfJkVvzbP0HBR3IKzErnv2BNG4W4MAM=
|
||||
github.com/opencontainers/image-spec v1.1.0-rc5 h1:Ygwkfw9bpDvs+c9E34SdgGOj41dX/cbdlwvlWt0pnFI=
|
||||
github.com/opencontainers/image-spec v1.1.0-rc5/go.mod h1:X4pATf0uXsnn3g5aiGIsVnJBR4mxhKzfwmvK/B2NTm8=
|
||||
github.com/opencontainers/runc v1.1.5 h1:L44KXEpKmfWDcS02aeGm8QNTFXTo2D+8MYGDIJ/GDEs=
|
||||
github.com/opencontainers/runc v1.1.5/go.mod h1:1J5XiS+vdZ3wCyZybsuxXZWGrgSr8fFJHLXuG2PsnNg=
|
||||
github.com/pkg/errors v0.9.1 h1:FEBLx1zS214owpjy7qsBeixbURkuhQAwrK5UwLGTwt4=
|
||||
github.com/pkg/errors v0.9.1/go.mod h1:bwawxfHBFNV+L2hUp1rHADufV3IMtnDRdf1r5NINEl0=
|
||||
github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM=
|
||||
github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4=
|
||||
github.com/power-devops/perfstat v0.0.0-20221212215047-62379fc7944b h1:0LFwY6Q3gMACTjAbMZBjXAqTOzOwFaj2Ld6cjeQ7Rig=
|
||||
github.com/power-devops/perfstat v0.0.0-20221212215047-62379fc7944b/go.mod h1:OmDBASR4679mdNQnz2pUhc2G8CO2JrUAVFDRBDP/hJE=
|
||||
github.com/prometheus/client_golang v1.23.2 h1:Je96obch5RDVy3FDMndoUsjAhG5Edi49h0RJWRi/o0o=
|
||||
github.com/prometheus/client_golang v1.23.2/go.mod h1:Tb1a6LWHB3/SPIzCoaDXI4I8UHKeFTEQ1YCr+0Gyqmg=
|
||||
github.com/prometheus/client_model v0.6.2 h1:oBsgwpGs7iVziMvrGhE53c/GrLUsZdHnqNwqPLxwZyk=
|
||||
github.com/prometheus/client_model v0.6.2/go.mod h1:y3m2F6Gdpfy6Ut/GBsUqTWZqCUvMVzSfMLjcu6wAwpE=
|
||||
github.com/prometheus/common v0.67.5 h1:pIgK94WWlQt1WLwAC5j2ynLaBRDiinoAb86HZHTUGI4=
|
||||
github.com/prometheus/common v0.67.5/go.mod h1:SjE/0MzDEEAyrdr5Gqc6G+sXI67maCxzaT3A2+HqjUw=
|
||||
github.com/prometheus/procfs v0.20.0 h1:AA7aCvjxwAquZAlonN7888f2u4IN8WVeFgBi4k82M4Q=
|
||||
github.com/prometheus/procfs v0.20.0/go.mod h1:o9EMBZGRyvDrSPH1RqdxhojkuXstoe4UlK79eF5TGGo=
|
||||
github.com/prometheus/procfs v0.20.1 h1:XwbrGOIplXW/AU3YhIhLODXMJYyC1isLFfYCsTEycfc=
|
||||
github.com/prometheus/procfs v0.20.1/go.mod h1:o9EMBZGRyvDrSPH1RqdxhojkuXstoe4UlK79eF5TGGo=
|
||||
github.com/questdb/go-questdb-client/v4 v4.2.0 h1:+d0HJwCjUWMj7zmY6qmhoqTJzTyoYKl+LSTYGN0T8T8=
|
||||
github.com/questdb/go-questdb-client/v4 v4.2.0/go.mod h1:/2x93LK1wjM4JX/b5c6q7Yqk22htjWY1lE6p1X8iLbE=
|
||||
github.com/rogpeppe/go-internal v1.10.0 h1:TMyTOH3F/DB16zRVcYyreMH6GnZZrwQVAoYjRBZyWFQ=
|
||||
github.com/rogpeppe/go-internal v1.10.0/go.mod h1:UQnix2H7Ngw/k4C5ijL5+65zddjncjaFoBhdsK/akog=
|
||||
github.com/santhosh-tekuri/jsonschema/v5 v5.3.1 h1:lZUw3E0/J3roVtGQ+SCrUrg3ON6NgVqpn3+iol9aGu4=
|
||||
github.com/santhosh-tekuri/jsonschema/v5 v5.3.1/go.mod h1:uToXkOrWAZ6/Oc07xWQrPOhJotwFIyu2bBVN41fcDUY=
|
||||
github.com/shirou/gopsutil/v3 v3.23.12 h1:z90NtUkp3bMtmICZKpC4+WaknU1eXtp5vtbQ11DgpE4=
|
||||
github.com/shirou/gopsutil/v3 v3.23.12/go.mod h1:1FrWgea594Jp7qmjHUUPlJDTPgcsb9mGnXDxavtikzM=
|
||||
github.com/shoenig/go-m1cpu v0.1.6 h1:nxdKQNcEB6vzgA2E2bvzKIYRuNj7XNJ4S/aRSwKzFtM=
|
||||
github.com/shoenig/go-m1cpu v0.1.6/go.mod h1:1JJMcUBvfNwpq05QDQVAnx3gUHr9IYF7GNg9SUEw2VQ=
|
||||
github.com/shopspring/decimal v1.3.1/go.mod h1:DKyhrW/HYNuLGql+MJL6WCR6knT2jwCFRcu2hWCYk4o=
|
||||
github.com/shopspring/decimal v1.4.0 h1:bxl37RwXBklmTi0C79JfXCEBD1cqqHt0bbgBAGFp81k=
|
||||
github.com/shopspring/decimal v1.4.0/go.mod h1:gawqmDU56v4yIKSwfBSFip1HdCCXN8/+DMd9qYNcwME=
|
||||
github.com/sirupsen/logrus v1.9.3 h1:dueUQJ1C2q9oE3F7wvmSGAaVtTmUizReu6fjN8uqzbQ=
|
||||
github.com/sirupsen/logrus v1.9.3/go.mod h1:naHLuLoDiP4jHNo9R0sCBMtWGeIprob74mVsIT4qYEQ=
|
||||
github.com/spkg/bom v0.0.0-20160624110644-59b7046e48ad/go.mod h1:qLr4V1qq6nMqFKkMo8ZTx3f+BZEkzsRUY10Xsm2mwU0=
|
||||
github.com/stmcginnis/gofish v0.21.3 h1:EBLCHfORnbx7MPw7lplOOVe9QAD1T3XRVz6+a1Z4z5Q=
|
||||
github.com/stmcginnis/gofish v0.21.3/go.mod h1:PzF5i8ecRG9A2ol8XT64npKUunyraJ+7t0kYMpQAtqU=
|
||||
github.com/stmcginnis/gofish v0.21.6 h1:jK3TGD6VANaAHKHypVNfD6io2nPrU+6eF8X4qARsTlY=
|
||||
github.com/stmcginnis/gofish v0.21.6/go.mod h1:PzF5i8ecRG9A2ol8XT64npKUunyraJ+7t0kYMpQAtqU=
|
||||
github.com/stretchr/objx v0.1.0/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME=
|
||||
github.com/stretchr/objx v0.4.0/go.mod h1:YvHI0jy2hoMjB+UWwv71VJQ9isScKT/TqJzVSSt89Yw=
|
||||
github.com/stretchr/objx v0.5.0/go.mod h1:Yh+to48EsGEfYuaHDzXPcE3xhTkx73EhmCGUpEOglKo=
|
||||
github.com/stretchr/objx v0.5.2/go.mod h1:FRsXN1f5AsAjCGJKqEizvkpNtU+EGNCLh3NxZ/8L+MA=
|
||||
github.com/stretchr/testify v1.3.0/go.mod h1:M5WIy9Dh21IEIfnGCwXGc5bZfKNJtfHm1UVUgZn+9EI=
|
||||
github.com/stretchr/testify v1.7.1/go.mod h1:6Fq8oRcR53rry900zMqJjRRixrwX3KX962/h/Wwjteg=
|
||||
github.com/stretchr/testify v1.8.0/go.mod h1:yNjHg4UonilssWZ8iaSj1OCr/vHnekPRkoO+kdMU+MU=
|
||||
github.com/stretchr/testify v1.8.4/go.mod h1:sz/lmYIOXD/1dqDmKjjqLyZ2RngseejIcXlSw2iwfAo=
|
||||
github.com/stretchr/testify v1.10.0/go.mod h1:r2ic/lqez/lEtzL7wO/rwa5dbSLXVDPFyf8C91i36aY=
|
||||
github.com/stretchr/testify v1.11.1 h1:7s2iGBzp5EwR7/aIZr8ao5+dra3wiQyKjjFuvgVKu7U=
|
||||
github.com/stretchr/testify v1.11.1/go.mod h1:wZwfW3scLgRK+23gO65QZefKpKQRnfz6sD981Nm4B6U=
|
||||
github.com/tklauser/go-sysconf v0.3.16 h1:frioLaCQSsF5Cy1jgRBrzr6t502KIIwQ0MArYICU0nA=
|
||||
github.com/tklauser/go-sysconf v0.3.16/go.mod h1:/qNL9xxDhc7tx3HSRsLWNnuzbVfh3e7gh/BmM179nYI=
|
||||
github.com/tklauser/numcpus v0.11.0 h1:nSTwhKH5e1dMNsCdVBukSZrURJRoHbSEQjdEbY+9RXw=
|
||||
github.com/tklauser/numcpus v0.11.0/go.mod h1:z+LwcLq54uWZTX0u/bGobaV34u6V7KNlTZejzM6/3MQ=
|
||||
github.com/testcontainers/testcontainers-go v0.26.0 h1:uqcYdoOHBy1ca7gKODfBd9uTHVK3a7UL848z09MVZ0c=
|
||||
github.com/testcontainers/testcontainers-go v0.26.0/go.mod h1:ICriE9bLX5CLxL9OFQ2N+2N+f+803LNJ1utJb1+Inx0=
|
||||
github.com/tklauser/go-sysconf v0.4.0 h1:7H0uAN+7RkwWRaxhYXDLqa5V3LPrJeV8wmD9dRUgPQU=
|
||||
github.com/tklauser/go-sysconf v0.4.0/go.mod h1:8mTNWyog7H+MpKijp4VmKJAd2bbYQ2zuUwkYRbUArPI=
|
||||
github.com/tklauser/numcpus v0.12.0 h1:NR85qdvHA9pFse3x3weVZ0r0ST8R6l5RHbZrlRaqob4=
|
||||
github.com/tklauser/numcpus v0.12.0/go.mod h1:ABHeXzJnr/qqwguhClkZKT1/8VABcYrsyUiUGobwWJg=
|
||||
github.com/yusufpapurcu/wmi v1.2.3 h1:E1ctvB7uKFMOJw3fdOW32DwGE9I7t++CRUEMKvFoFiw=
|
||||
github.com/yusufpapurcu/wmi v1.2.3/go.mod h1:SBZ9tNy3G9/m5Oi98Zks0QjeHVDvuK0qfxQmPyzfmi0=
|
||||
go.uber.org/goleak v1.3.0 h1:2K3zAYmnTNqV73imy9J1T3WC+gmCePx2hEGkimedGto=
|
||||
go.uber.org/goleak v1.3.0/go.mod h1:CoHD4mav9JJNrW/WLlf7HGZPjdw8EucARQHekz1X6bE=
|
||||
go.yaml.in/yaml/v2 v2.4.3 h1:6gvOSjQoTB3vt1l+CU+tSyi/HOjfOjRLJ4YwYZGwRO0=
|
||||
go.yaml.in/yaml/v2 v2.4.3/go.mod h1:zSxWcmIDjOzPXpjlTTbAsKokqkDNAVtZO0WOMiT90s8=
|
||||
golang.design/x/thread v0.0.0-20210122121316-335e9adffdf1 h1:P7S/GeHBAFEZIYp0ePPs2kHXoazz8q2KsyxHyQVGCJg=
|
||||
golang.design/x/thread v0.0.0-20210122121316-335e9adffdf1/go.mod h1:9CWpnTUmlQkfdpdutA1nNf4iE5lAVt3QZOu0Z6hahBE=
|
||||
golang.org/x/crypto v0.48.0 h1:/VRzVqiRSggnhY7gNRxPauEQ5Drw9haKdM0jqfcCFts=
|
||||
golang.org/x/crypto v0.48.0/go.mod h1:r0kV5h3qnFPlQnBSrULhlsRfryS2pmewsg+XfMgkVos=
|
||||
golang.org/x/exp v0.0.0-20260218203240-3dfff04db8fa h1:Zt3DZoOFFYkKhDT3v7Lm9FDMEV06GpzjG2jrqW+QTE0=
|
||||
golang.org/x/exp v0.0.0-20260218203240-3dfff04db8fa/go.mod h1:K79w1Vqn7PoiZn+TkNpx3BUWUQksGO3JcVX6qIjytmA=
|
||||
golang.org/x/net v0.51.0 h1:94R/GTO7mt3/4wIKpcR5gkGmRLOuE/2hNGeWq/GBIFo=
|
||||
golang.org/x/net v0.51.0/go.mod h1:aamm+2QF5ogm02fjy5Bb7CQ0WMt1/WVM7FtyaTLlA9Y=
|
||||
golang.org/x/sys v0.0.0-20210122093101-04d7465088b8/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
|
||||
golang.org/x/sys v0.41.0 h1:Ivj+2Cp/ylzLiEU89QhWblYnOE9zerudt9Ftecq2C6k=
|
||||
golang.org/x/sys v0.41.0/go.mod h1:OgkHotnGiDImocRcuBABYBEXf8A9a87e/uXjp9XT3ks=
|
||||
golang.org/x/time v0.14.0 h1:MRx4UaLrDotUKUdCIqzPC48t1Y9hANFKIRpNx+Te8PI=
|
||||
golang.org/x/time v0.14.0/go.mod h1:eL/Oa2bBBK0TkX57Fyni+NgnyQQN4LitPmob2Hjnqw4=
|
||||
go.yaml.in/yaml/v2 v2.4.4 h1:tuyd0P+2Ont/d6e2rl3be67goVK4R6deVxCUX5vyPaQ=
|
||||
go.yaml.in/yaml/v2 v2.4.4/go.mod h1:gMZqIpDtDqOfM0uNfy0SkpRhvUryYH0Z6wdMYcacYXQ=
|
||||
golang.design/x/thread v0.3.2 h1:FmD1glspGrQCe6FuQLmSrT6wz2CSzq7vKVDluyiMnqo=
|
||||
golang.design/x/thread v0.3.2/go.mod h1:6+Hi2rMOgMHZdKDWaqNHyWtoFUx1HxZ06LfHPh5Z/hQ=
|
||||
golang.org/x/crypto v0.50.0 h1:zO47/JPrL6vsNkINmLoo/PH1gcxpls50DNogFvB5ZGI=
|
||||
golang.org/x/crypto v0.50.0/go.mod h1:3muZ7vA7PBCE6xgPX7nkzzjiUq87kRItoJQM1Yo8S+Q=
|
||||
golang.org/x/exp v0.0.0-20231005195138-3e424a577f31 h1:9k5exFQKQglLo+RoP+4zMjOFE14P6+vyR0baDAi0Rcs=
|
||||
golang.org/x/exp v0.0.0-20231005195138-3e424a577f31/go.mod h1:S2oDrQGGwySpoQPVqRShND87VCbxmc6bL1Yd2oYrm6k=
|
||||
golang.org/x/mod v0.13.0 h1:I/DsJXRlw/8l/0c24sM9yb0T4z9liZTduXvdAWYiysY=
|
||||
golang.org/x/mod v0.13.0/go.mod h1:hTbmBsO62+eylJbnUtE2MGJUyE7QWk4xUqPFrRgJ+7c=
|
||||
golang.org/x/net v0.53.0 h1:d+qAbo5L0orcWAr0a9JweQpjXF19LMXJE8Ey7hwOdUA=
|
||||
golang.org/x/net v0.53.0/go.mod h1:JvMuJH7rrdiCfbeHoo3fCQU24Lf5JJwT9W3sJFulfgs=
|
||||
golang.org/x/sys v0.45.0 h1:dO4czNzziLiiXplLQgBCEpCvXQ3dnkn0SdaZSYdQ+FY=
|
||||
golang.org/x/sys v0.45.0/go.mod h1:4GL1E5IUh+htKOUEOaiffhrAeqysfVGipDYzABqnCmw=
|
||||
golang.org/x/time v0.15.0 h1:bbrp8t3bGUeFOx08pvsMYRTCVSMk89u4tKbNOZbp88U=
|
||||
golang.org/x/time v0.15.0/go.mod h1:Y4YMaQmXwGQZoFaVFk4YpCt4FLQMYKZe9oeV/f4MSno=
|
||||
golang.org/x/tools v0.14.0 h1:jvNa2pY0M4r62jkRQ6RwEZZyPcymeL9XZMLBbV7U2nc=
|
||||
golang.org/x/tools v0.14.0/go.mod h1:uYBEerGOWcJyEORxN+Ek8+TT266gXkNlHdJBwexUsBg=
|
||||
google.golang.org/genproto/googleapis/rpc v0.0.0-20231002182017-d307bd883b97 h1:6GQBEOdGkX6MMTLT9V+TjtIRZCw9VPD5Z+yHY9wMgS0=
|
||||
google.golang.org/genproto/googleapis/rpc v0.0.0-20231002182017-d307bd883b97/go.mod h1:v7nGkzlmW8P3n/bKmWBn2WpBjpOEx8Q6gMueudAmKfY=
|
||||
google.golang.org/grpc v1.58.3 h1:BjnpXut1btbtgN/6sp+brB2Kbm2LjNXnidYujAVbSoQ=
|
||||
google.golang.org/grpc v1.58.3/go.mod h1:tgX3ZQDlNJGU96V6yHh1T/JeoBQ2TXdr43YbYSsCJk0=
|
||||
google.golang.org/protobuf v1.36.11 h1:fV6ZwhNocDyBLK0dj+fg8ektcVegBBuEolpbTQyBNVE=
|
||||
google.golang.org/protobuf v1.36.11/go.mod h1:HTf+CrKn2C3g5S8VImy6tdcUvCska2kB7j23XfzDpco=
|
||||
gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0=
|
||||
gopkg.in/check.v1 v1.0.0-20201130134442-10cb98267c6c h1:Hei/4ADfdWqJk1ZMxUNpqntNwaWcugrBjAiHlqqRiVk=
|
||||
gopkg.in/check.v1 v1.0.0-20201130134442-10cb98267c6c/go.mod h1:JHkPIbrfpd72SG/EVd6muEfDQjcINNoR0C8j2r3qZ4Q=
|
||||
gopkg.in/yaml.v3 v3.0.0-20200313102051-9f266ea9e77c/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM=
|
||||
gopkg.in/yaml.v3 v3.0.1 h1:fxVm/GzAzEWqLHuvctI91KS9hhNmmWOoWu0XTYJS7CA=
|
||||
gopkg.in/yaml.v3 v3.0.1/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM=
|
||||
|
||||
@@ -94,8 +94,7 @@ func (c *metricAggregator) Init(output chan lp.CCMessage) error {
|
||||
// Set hostname
|
||||
hostname, err := os.Hostname()
|
||||
if err != nil {
|
||||
cclog.Error(err.Error())
|
||||
return err
|
||||
return fmt.Errorf("metricAggregator: failed to get hostname: %w", err)
|
||||
}
|
||||
// Drop domain part of host name
|
||||
c.constants["hostname"] = strings.SplitN(hostname, `.`, 2)[0]
|
||||
|
||||
@@ -8,6 +8,7 @@
|
||||
package metricRouter
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"sync"
|
||||
"time"
|
||||
|
||||
@@ -70,8 +71,7 @@ func (c *metricCache) Init(output chan lp.CCMessage, ticker mct.MultiChanTicker,
|
||||
// The code is executed by the MetricCache goroutine
|
||||
c.aggEngine, err = agg.NewAggregator(c.output)
|
||||
if err != nil {
|
||||
cclog.ComponentError("MetricCache", "Cannot create aggregator")
|
||||
return err
|
||||
return fmt.Errorf("MetricCache: failed to create aggregator: %w", err)
|
||||
}
|
||||
|
||||
return nil
|
||||
|
||||
@@ -8,6 +8,7 @@
|
||||
package metricRouter
|
||||
|
||||
import (
|
||||
"bytes"
|
||||
"encoding/json"
|
||||
"fmt"
|
||||
"maps"
|
||||
@@ -34,19 +35,18 @@ type metricRouterTagConfig struct {
|
||||
|
||||
// Metric router configuration
|
||||
type metricRouterConfig struct {
|
||||
HostnameTagName string `json:"hostname_tag"` // Key name used when adding the hostname to a metric (default 'hostname')
|
||||
AddTags []metricRouterTagConfig `json:"add_tags"` // List of tags that are added when the condition is met
|
||||
DelTags []metricRouterTagConfig `json:"delete_tags"` // List of tags that are removed when the condition is met
|
||||
IntervalAgg []agg.MetricAggregatorIntervalConfig `json:"interval_aggregates"` // List of aggregation function processed at the end of an interval
|
||||
DropMetrics []string `json:"drop_metrics"` // List of metric names to drop. For fine-grained dropping use drop_metrics_if
|
||||
DropMetricsIf []string `json:"drop_metrics_if"` // List of evaluatable terms to drop metrics
|
||||
RenameMetrics map[string]string `json:"rename_metrics"` // Map to rename metric name from key to value
|
||||
IntervalStamp bool `json:"interval_timestamp"` // Update timestamp periodically by ticker each interval?
|
||||
NumCacheIntervals int `json:"num_cache_intervals"` // Number of intervals of cached metrics for evaluation
|
||||
MaxForward int `json:"max_forward"` // Number of maximal forwarded metrics at one select
|
||||
NormalizeUnits bool `json:"normalize_units"` // Check unit meta flag and normalize it using cc-units
|
||||
ChangeUnitPrefix map[string]string `json:"change_unit_prefix"` // Add prefix that should be applied to the metrics
|
||||
// dropMetrics map[string]bool // Internal map for O(1) lookup
|
||||
HostnameTagName string `json:"hostname_tag,omitempty"` // Key name used when adding the hostname to a metric (default 'hostname')
|
||||
AddTags []metricRouterTagConfig `json:"add_tags,omitempty"` // List of tags that are added when the condition is met
|
||||
DelTags []metricRouterTagConfig `json:"delete_tags,omitempty"` // List of tags that are removed when the condition is met
|
||||
IntervalAgg []agg.MetricAggregatorIntervalConfig `json:"interval_aggregates,omitempty"` // List of aggregation function processed at the end of an interval
|
||||
DropMetrics []string `json:"drop_metrics,omitempty"` // List of metric names to drop. For fine-grained dropping use drop_metrics_if
|
||||
DropMetricsIf []string `json:"drop_metrics_if,omitempty"` // List of evaluatable terms to drop metrics
|
||||
RenameMetrics map[string]string `json:"rename_metrics,omitempty"` // Map to rename metric name from key to value
|
||||
IntervalStamp bool `json:"interval_timestamp,omitempty"` // Update timestamp periodically by ticker each interval?
|
||||
NumCacheIntervals int `json:"num_cache_intervals,omitempty"` // Number of intervals of cached metrics for evaluation
|
||||
MaxForward int `json:"max_forward,omitempty"` // Number of maximal forwarded metrics at one select
|
||||
NormalizeUnits bool `json:"normalize_units,omitempty"` // Check unit meta flag and normalize it using cc-units
|
||||
ChangeUnitPrefix map[string]string `json:"change_unit_prefix,omitempty"` // Add prefix that should be applied to the metrics
|
||||
MessageProcessor json.RawMessage `json:"process_messages,omitempty"`
|
||||
}
|
||||
|
||||
@@ -102,18 +102,17 @@ func (r *metricRouter) Init(ticker mct.MultiChanTicker, wg *sync.WaitGroup, rout
|
||||
// Drop domain part of host name
|
||||
r.hostname = strings.SplitN(hostname, `.`, 2)[0]
|
||||
|
||||
err = json.Unmarshal(routerConfig, &r.config)
|
||||
if err != nil {
|
||||
cclog.ComponentError("MetricRouter", err.Error())
|
||||
return err
|
||||
d := json.NewDecoder(bytes.NewReader(routerConfig))
|
||||
d.DisallowUnknownFields()
|
||||
if err := d.Decode(&r.config); err != nil {
|
||||
return fmt.Errorf("failed to decode metric router config: %w", err)
|
||||
}
|
||||
r.maxForward = max(1, r.config.MaxForward)
|
||||
|
||||
if r.config.NumCacheIntervals > 0 {
|
||||
r.cache, err = NewCache(r.cache_input, r.ticker, &r.cachewg, r.config.NumCacheIntervals)
|
||||
if err != nil {
|
||||
cclog.ComponentError("MetricRouter", "MetricCache initialization failed:", err.Error())
|
||||
return err
|
||||
return fmt.Errorf("MetricRouter: failed to initialize MetricCache: %w", err)
|
||||
}
|
||||
for _, agg := range r.config.IntervalAgg {
|
||||
err = r.cache.AddAggregation(agg.Name, agg.Function, agg.Condition, agg.Tags, agg.Meta)
|
||||
@@ -298,7 +297,7 @@ func (r *metricRouter) Start() {
|
||||
|
||||
case timestamp := <-timeChan:
|
||||
r.timestamp = timestamp
|
||||
cclog.ComponentDebug("MetricRouter", "Update timestamp", r.timestamp.UnixNano())
|
||||
cclog.ComponentDebugf("MetricRouter", "Update timestamp %d", r.timestamp.UnixNano())
|
||||
|
||||
case p := <-r.coll_input:
|
||||
coll_forward(p)
|
||||
|
||||
@@ -6,7 +6,7 @@ Installed-Size: {INSTALLED_SIZE}
|
||||
Architecture: {ARCH}
|
||||
Maintainer: thomas.gruber@fau.de
|
||||
Depends: libc6 (>= 2.2.1)
|
||||
Build-Depends: debhelper-compat (= 13), git, golang-go
|
||||
Build-Depends: debhelper-compat (= 13), git, golang-go, libdrm-dev
|
||||
Description: Metric collection daemon from the ClusterCockpit suite
|
||||
Homepage: https://github.com/ClusterCockpit/cc-metric-collector
|
||||
Source: cc-metric-collector
|
||||
|
||||
@@ -29,7 +29,7 @@ make
|
||||
|
||||
|
||||
%install
|
||||
install -Dpm 0750 %{name} %{buildroot}%{_bindir}/%{name}
|
||||
install -Dpm 0755 %{name} %{buildroot}%{_bindir}/%{name}
|
||||
install -Dpm 0600 example-configs/config.json %{buildroot}%{_sysconfdir}/%{name}/%{name}.json
|
||||
install -Dpm 0600 example-configs/collectors.json %{buildroot}%{_sysconfdir}/%{name}/collectors.json
|
||||
install -Dpm 0600 example-configs/sinks.json %{buildroot}%{_sysconfdir}/%{name}/sinks.json
|
||||
@@ -54,7 +54,7 @@ install -Dpm 0644 scripts/%{name}.sysusers %{buildroot}%{_sysusersdir}/%{name}.c
|
||||
|
||||
%files
|
||||
# Binary
|
||||
%attr(-,clustercockpit,clustercockpit) %{_bindir}/%{name}
|
||||
%attr(-,root,root) %{_bindir}/%{name}
|
||||
# Config
|
||||
%dir %{_sysconfdir}/%{name}
|
||||
%attr(0600,clustercockpit,clustercockpit) %config(noreplace) %{_sysconfdir}/%{name}/%{name}.json
|
||||
|
||||
Reference in New Issue
Block a user