mirror of
https://github.com/ClusterCockpit/cc-metric-collector.git
synced 2026-03-15 13:07:28 +01:00
Avoid duplicate error printing
This commit is contained in:
@@ -107,16 +107,16 @@ func (m *BeegfsMetaCollector) Init(config json.RawMessage) error {
|
|||||||
// Beegfs file system statistics can only be queried by user root
|
// Beegfs file system statistics can only be queried by user root
|
||||||
user, err := user.Current()
|
user, err := user.Current()
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return fmt.Errorf("BeegfsMetaCollector.Init(): Failed to get current user: %w", err)
|
return fmt.Errorf("%s Init(): Failed to get current user: %w", m.name, err)
|
||||||
}
|
}
|
||||||
if user.Uid != "0" {
|
if user.Uid != "0" {
|
||||||
return fmt.Errorf("BeegfsMetaCollector.Init(): BeeGFS file system statistics can only be queried by user root")
|
return fmt.Errorf("%s Init(): BeeGFS file system statistics can only be queried by user root", m.name)
|
||||||
}
|
}
|
||||||
|
|
||||||
// Check if beegfs-ctl is in executable search path
|
// Check if beegfs-ctl is in executable search path
|
||||||
_, err = exec.LookPath(m.config.Beegfs)
|
_, err = exec.LookPath(m.config.Beegfs)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return fmt.Errorf("BeegfsMetaCollector.Init(): Failed to find beegfs-ctl binary '%s': %w", m.config.Beegfs, err)
|
return fmt.Errorf("%s Init(): Failed to find beegfs-ctl binary '%s': %w", m.name, m.config.Beegfs, err)
|
||||||
}
|
}
|
||||||
m.init = true
|
m.init = true
|
||||||
return nil
|
return nil
|
||||||
|
|||||||
@@ -100,16 +100,16 @@ func (m *BeegfsStorageCollector) Init(config json.RawMessage) error {
|
|||||||
// Beegfs file system statistics can only be queried by user root
|
// Beegfs file system statistics can only be queried by user root
|
||||||
user, err := user.Current()
|
user, err := user.Current()
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return fmt.Errorf("BeegfsStorageCollector.Init(): Failed to get current user: %w", err)
|
return fmt.Errorf("%s Init(): Failed to get current user: %w", m.name, err)
|
||||||
}
|
}
|
||||||
if user.Uid != "0" {
|
if user.Uid != "0" {
|
||||||
return fmt.Errorf("BeegfsStorageCollector.Init(): BeeGFS file system statistics can only be queried by user root")
|
return fmt.Errorf("%s Init(): BeeGFS file system statistics can only be queried by user root", m.name)
|
||||||
}
|
}
|
||||||
|
|
||||||
// Check if beegfs-ctl is in executable search path
|
// Check if beegfs-ctl is in executable search path
|
||||||
_, err = exec.LookPath(m.config.Beegfs)
|
_, err = exec.LookPath(m.config.Beegfs)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return fmt.Errorf("BeegfsStorageCollector.Init(): Failed to find beegfs-ctl binary '%s': %w", m.config.Beegfs, err)
|
return fmt.Errorf("%s Init(): Failed to find beegfs-ctl binary '%s': %w", m.name, m.config.Beegfs, err)
|
||||||
}
|
}
|
||||||
m.init = true
|
m.init = true
|
||||||
return nil
|
return nil
|
||||||
|
|||||||
@@ -79,7 +79,7 @@ func (m *CPUFreqCollector) Init(config json.RawMessage) error {
|
|||||||
scalingCurFreqFile := filepath.Join("/sys/devices/system/cpu", fmt.Sprintf("cpu%d", c.CpuID), "cpufreq/scaling_cur_freq")
|
scalingCurFreqFile := filepath.Join("/sys/devices/system/cpu", fmt.Sprintf("cpu%d", c.CpuID), "cpufreq/scaling_cur_freq")
|
||||||
err := unix.Access(scalingCurFreqFile, unix.R_OK)
|
err := unix.Access(scalingCurFreqFile, unix.R_OK)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return fmt.Errorf("unable to access file '%s': %w", scalingCurFreqFile, err)
|
return fmt.Errorf("%s Init(): unable to access file '%s': %w", m.name, scalingCurFreqFile, err)
|
||||||
}
|
}
|
||||||
|
|
||||||
m.topology = append(m.topology,
|
m.topology = append(m.topology,
|
||||||
|
|||||||
@@ -10,7 +10,6 @@ package collectors
|
|||||||
import (
|
import (
|
||||||
"bytes"
|
"bytes"
|
||||||
"encoding/json"
|
"encoding/json"
|
||||||
"errors"
|
|
||||||
"fmt"
|
"fmt"
|
||||||
"os"
|
"os"
|
||||||
"os/exec"
|
"os/exec"
|
||||||
@@ -85,7 +84,7 @@ func (m *CustomCmdCollector) Init(config json.RawMessage) error {
|
|||||||
}
|
}
|
||||||
|
|
||||||
if len(m.files) == 0 && len(m.cmdFieldsSlice) == 0 {
|
if len(m.files) == 0 && len(m.cmdFieldsSlice) == 0 {
|
||||||
return errors.New("no metrics to collect")
|
return fmt.Errorf("%s Init(): no metrics to collect", m.name)
|
||||||
}
|
}
|
||||||
m.init = true
|
m.init = true
|
||||||
return nil
|
return nil
|
||||||
|
|||||||
@@ -347,18 +347,15 @@ func (m *GpfsCollector) Init(config json.RawMessage) error {
|
|||||||
if !m.config.Sudo {
|
if !m.config.Sudo {
|
||||||
user, err := user.Current()
|
user, err := user.Current()
|
||||||
if err != nil {
|
if err != nil {
|
||||||
cclog.ComponentError(m.name, "Failed to get current user:", err.Error())
|
return fmt.Errorf("%s Init(): failed to get current user: %w", m.name, err)
|
||||||
return err
|
|
||||||
}
|
}
|
||||||
if user.Uid != "0" {
|
if user.Uid != "0" {
|
||||||
cclog.ComponentError(m.name, "GPFS file system statistics can only be queried by user root")
|
return fmt.Errorf("%s Init(): GPFS file system statistics can only be queried by user root", m.name)
|
||||||
return err
|
|
||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
p, err := exec.LookPath("sudo")
|
p, err := exec.LookPath("sudo")
|
||||||
if err != nil {
|
if err != nil {
|
||||||
cclog.ComponentError(m.name, "Cannot find 'sudo'")
|
return fmt.Errorf("%s Init(): cannot find 'sudo': %w", m.name, err)
|
||||||
return err
|
|
||||||
}
|
}
|
||||||
m.sudoCmd = p
|
m.sudoCmd = p
|
||||||
}
|
}
|
||||||
@@ -434,7 +431,7 @@ func (m *GpfsCollector) Init(config json.RawMessage) error {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
if len(m.definitions) == 0 {
|
if len(m.definitions) == 0 {
|
||||||
return errors.New("no metrics to collect")
|
return fmt.Errorf("%s Init(): no metrics to collect", m.name)
|
||||||
}
|
}
|
||||||
|
|
||||||
m.init = true
|
m.init = true
|
||||||
|
|||||||
@@ -11,7 +11,6 @@ import (
|
|||||||
"bufio"
|
"bufio"
|
||||||
"bytes"
|
"bytes"
|
||||||
"encoding/json"
|
"encoding/json"
|
||||||
"errors"
|
|
||||||
"fmt"
|
"fmt"
|
||||||
"os"
|
"os"
|
||||||
"slices"
|
"slices"
|
||||||
@@ -45,7 +44,6 @@ type IOstatCollector struct {
|
|||||||
}
|
}
|
||||||
|
|
||||||
func (m *IOstatCollector) Init(config json.RawMessage) error {
|
func (m *IOstatCollector) Init(config json.RawMessage) error {
|
||||||
var err error
|
|
||||||
m.name = "IOstatCollector"
|
m.name = "IOstatCollector"
|
||||||
m.parallel = true
|
m.parallel = true
|
||||||
m.meta = map[string]string{"source": m.name, "group": "Disk"}
|
m.meta = map[string]string{"source": m.name, "group": "Disk"}
|
||||||
@@ -87,7 +85,7 @@ func (m *IOstatCollector) Init(config json.RawMessage) error {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
if len(m.matches) == 0 {
|
if len(m.matches) == 0 {
|
||||||
return errors.New("no metrics to collect")
|
return fmt.Errorf("%s Init(): no metrics to collect", m.name)
|
||||||
}
|
}
|
||||||
file, err := os.Open(IOSTATFILE)
|
file, err := os.Open(IOSTATFILE)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
@@ -137,7 +135,7 @@ func (m *IOstatCollector) Init(config json.RawMessage) error {
|
|||||||
}
|
}
|
||||||
|
|
||||||
m.init = true
|
m.init = true
|
||||||
return err
|
return nil
|
||||||
}
|
}
|
||||||
|
|
||||||
func (m *IOstatCollector) Read(interval time.Duration, output chan lp.CCMessage) {
|
func (m *IOstatCollector) Read(interval time.Duration, output chan lp.CCMessage) {
|
||||||
|
|||||||
@@ -11,7 +11,6 @@ import (
|
|||||||
"bufio"
|
"bufio"
|
||||||
"bytes"
|
"bytes"
|
||||||
"encoding/json"
|
"encoding/json"
|
||||||
"errors"
|
|
||||||
"fmt"
|
"fmt"
|
||||||
"io"
|
"io"
|
||||||
"os/exec"
|
"os/exec"
|
||||||
@@ -68,7 +67,7 @@ func (m *IpmiCollector) Init(config json.RawMessage) error {
|
|||||||
command := exec.Command(p)
|
command := exec.Command(p)
|
||||||
err := command.Run()
|
err := command.Run()
|
||||||
if err != nil {
|
if err != nil {
|
||||||
cclog.ComponentError(m.name, fmt.Sprintf("Failed to execute %s: %v", p, err.Error()))
|
cclog.ComponentError(m.name, fmt.Sprintf("Failed to execute %s: %s", p, err.Error()))
|
||||||
m.ipmitool = ""
|
m.ipmitool = ""
|
||||||
} else {
|
} else {
|
||||||
m.ipmitool = p
|
m.ipmitool = p
|
||||||
@@ -79,14 +78,14 @@ func (m *IpmiCollector) Init(config json.RawMessage) error {
|
|||||||
command := exec.Command(p)
|
command := exec.Command(p)
|
||||||
err := command.Run()
|
err := command.Run()
|
||||||
if err != nil {
|
if err != nil {
|
||||||
cclog.ComponentError(m.name, fmt.Sprintf("Failed to execute %s: %v", p, err.Error()))
|
cclog.ComponentError(m.name, fmt.Sprintf("Failed to execute %s: %s", p, err.Error()))
|
||||||
m.ipmisensors = ""
|
m.ipmisensors = ""
|
||||||
} else {
|
} else {
|
||||||
m.ipmisensors = p
|
m.ipmisensors = p
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
if len(m.ipmitool) == 0 && len(m.ipmisensors) == 0 {
|
if len(m.ipmitool) == 0 && len(m.ipmisensors) == 0 {
|
||||||
return errors.New("no usable IPMI reader found")
|
return fmt.Errorf("%s Init(): no usable IPMI reader found", m.name)
|
||||||
}
|
}
|
||||||
|
|
||||||
m.init = true
|
m.init = true
|
||||||
|
|||||||
@@ -18,7 +18,6 @@ import "C"
|
|||||||
import (
|
import (
|
||||||
"bytes"
|
"bytes"
|
||||||
"encoding/json"
|
"encoding/json"
|
||||||
"errors"
|
|
||||||
"fmt"
|
"fmt"
|
||||||
"maps"
|
"maps"
|
||||||
"math"
|
"math"
|
||||||
@@ -216,17 +215,17 @@ func (m *LikwidCollector) Init(config json.RawMessage) error {
|
|||||||
}
|
}
|
||||||
lib := dl.New(m.config.LibraryPath, LIKWID_LIB_DL_FLAGS)
|
lib := dl.New(m.config.LibraryPath, LIKWID_LIB_DL_FLAGS)
|
||||||
if lib == nil {
|
if lib == nil {
|
||||||
return fmt.Errorf("error instantiating DynamicLibrary for %s", m.config.LibraryPath)
|
return fmt.Errorf("%s Init(): error instantiating DynamicLibrary for %s", m.name, m.config.LibraryPath)
|
||||||
}
|
}
|
||||||
err := lib.Open()
|
err := lib.Open()
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return fmt.Errorf("error opening %s: %w", m.config.LibraryPath, err)
|
return fmt.Errorf("%s Init(): error opening %s: %w", m.name, m.config.LibraryPath, err)
|
||||||
}
|
}
|
||||||
|
|
||||||
if m.config.ForceOverwrite {
|
if m.config.ForceOverwrite {
|
||||||
cclog.ComponentDebug(m.name, "Set LIKWID_FORCE=1")
|
cclog.ComponentDebug(m.name, "Set LIKWID_FORCE=1")
|
||||||
if err := os.Setenv("LIKWID_FORCE", "1"); err != nil {
|
if err := os.Setenv("LIKWID_FORCE", "1"); err != nil {
|
||||||
return fmt.Errorf("error setting environment variable LIKWID_FORCE=1: %w", err)
|
return fmt.Errorf("%s Init(): error setting environment variable LIKWID_FORCE=1: %w", m.name, err)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
if err := m.setup(); err != nil {
|
if err := m.setup(); err != nil {
|
||||||
@@ -297,16 +296,12 @@ func (m *LikwidCollector) Init(config json.RawMessage) error {
|
|||||||
|
|
||||||
// If no event set could be added, shut down LikwidCollector
|
// If no event set could be added, shut down LikwidCollector
|
||||||
if totalMetrics == 0 {
|
if totalMetrics == 0 {
|
||||||
err := errors.New("no LIKWID eventset or metric usable")
|
return fmt.Errorf("%s Init(): no LIKWID eventset or metric usable", m.name)
|
||||||
cclog.ComponentError(m.name, err.Error())
|
|
||||||
return err
|
|
||||||
}
|
}
|
||||||
|
|
||||||
ret := C.topology_init()
|
ret := C.topology_init()
|
||||||
if ret != 0 {
|
if ret != 0 {
|
||||||
err := errors.New("failed to initialize topology module")
|
return fmt.Errorf("%s Init(): failed to initialize topology module", m.name)
|
||||||
cclog.ComponentError(m.name, err.Error())
|
|
||||||
return err
|
|
||||||
}
|
}
|
||||||
m.measureThread = thread.New()
|
m.measureThread = thread.New()
|
||||||
switch m.config.AccessMode {
|
switch m.config.AccessMode {
|
||||||
@@ -321,7 +316,7 @@ func (m *LikwidCollector) Init(config json.RawMessage) error {
|
|||||||
p = m.config.DaemonPath
|
p = m.config.DaemonPath
|
||||||
}
|
}
|
||||||
if err := os.Setenv("PATH", p); err != nil {
|
if err := os.Setenv("PATH", p); err != nil {
|
||||||
return fmt.Errorf("error setting environment variable PATH=%s: %w", p, err)
|
return fmt.Errorf("%s Init(): error setting environment variable PATH=%s: %w", m.name, p, err)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
C.HPMmode(1)
|
C.HPMmode(1)
|
||||||
|
|||||||
@@ -19,7 +19,6 @@ import (
|
|||||||
"strings"
|
"strings"
|
||||||
"time"
|
"time"
|
||||||
|
|
||||||
cclog "github.com/ClusterCockpit/cc-lib/v2/ccLogger"
|
|
||||||
lp "github.com/ClusterCockpit/cc-lib/v2/ccMessage"
|
lp "github.com/ClusterCockpit/cc-lib/v2/ccMessage"
|
||||||
)
|
)
|
||||||
|
|
||||||
@@ -318,18 +317,15 @@ func (m *LustreCollector) Init(config json.RawMessage) error {
|
|||||||
if !m.config.Sudo {
|
if !m.config.Sudo {
|
||||||
user, err := user.Current()
|
user, err := user.Current()
|
||||||
if err != nil {
|
if err != nil {
|
||||||
cclog.ComponentError(m.name, "Failed to get current user:", err.Error())
|
return fmt.Errorf("%s Init(): Failed to get current user: %w", m.name, err)
|
||||||
return err
|
|
||||||
}
|
}
|
||||||
if user.Uid != "0" {
|
if user.Uid != "0" {
|
||||||
cclog.ComponentError(m.name, "Lustre file system statistics can only be queried by user root")
|
return fmt.Errorf("%s Init(): Lustre file system statistics can only be queried by user root", m.name)
|
||||||
return err
|
|
||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
p, err := exec.LookPath("sudo")
|
p, err := exec.LookPath("sudo")
|
||||||
if err != nil {
|
if err != nil {
|
||||||
cclog.ComponentError(m.name, "Cannot find 'sudo'")
|
return fmt.Errorf("%s Init(): Cannot find 'sudo': %w", m.name, err)
|
||||||
return err
|
|
||||||
}
|
}
|
||||||
m.sudoCmd = p
|
m.sudoCmd = p
|
||||||
}
|
}
|
||||||
@@ -338,7 +334,7 @@ func (m *LustreCollector) Init(config json.RawMessage) error {
|
|||||||
if err != nil {
|
if err != nil {
|
||||||
p, err = exec.LookPath(LCTL_CMD)
|
p, err = exec.LookPath(LCTL_CMD)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return err
|
return fmt.Errorf("%s Init(): Cannot find %s command: %w", m.name, LCTL_CMD, err)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
m.lctl = p
|
m.lctl = p
|
||||||
@@ -366,12 +362,12 @@ func (m *LustreCollector) Init(config json.RawMessage) error {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
if len(m.definitions) == 0 {
|
if len(m.definitions) == 0 {
|
||||||
return errors.New("no metrics to collect")
|
return fmt.Errorf("%s Init(): no metrics to collect", m.name)
|
||||||
}
|
}
|
||||||
|
|
||||||
devices := m.getDevices()
|
devices := m.getDevices()
|
||||||
if len(devices) == 0 {
|
if len(devices) == 0 {
|
||||||
return errors.New("no Lustre devices found")
|
return fmt.Errorf("%s Init(): no Lustre devices found", m.name)
|
||||||
}
|
}
|
||||||
m.stats = make(map[string]map[string]int64)
|
m.stats = make(map[string]map[string]int64)
|
||||||
for _, d := range devices {
|
for _, d := range devices {
|
||||||
|
|||||||
@@ -11,7 +11,6 @@ import (
|
|||||||
"bufio"
|
"bufio"
|
||||||
"bytes"
|
"bytes"
|
||||||
"encoding/json"
|
"encoding/json"
|
||||||
"errors"
|
|
||||||
"fmt"
|
"fmt"
|
||||||
"os"
|
"os"
|
||||||
"path/filepath"
|
"path/filepath"
|
||||||
@@ -96,7 +95,6 @@ func getStats(filename string) map[string]MemstatStats {
|
|||||||
}
|
}
|
||||||
|
|
||||||
func (m *MemstatCollector) Init(config json.RawMessage) error {
|
func (m *MemstatCollector) Init(config json.RawMessage) error {
|
||||||
var err error
|
|
||||||
m.name = "MemstatCollector"
|
m.name = "MemstatCollector"
|
||||||
m.parallel = true
|
m.parallel = true
|
||||||
m.config.NodeStats = true
|
m.config.NodeStats = true
|
||||||
@@ -134,7 +132,7 @@ func (m *MemstatCollector) Init(config json.RawMessage) error {
|
|||||||
m.sendMemUsed = true
|
m.sendMemUsed = true
|
||||||
}
|
}
|
||||||
if len(m.matches) == 0 {
|
if len(m.matches) == 0 {
|
||||||
return errors.New("no metrics to collect")
|
return fmt.Errorf("%s Init(): no metrics to collect", m.name)
|
||||||
}
|
}
|
||||||
if err := m.setup(); err != nil {
|
if err := m.setup(); err != nil {
|
||||||
return fmt.Errorf("%s Init(): setup() call failed: %w", m.name, err)
|
return fmt.Errorf("%s Init(): setup() call failed: %w", m.name, err)
|
||||||
@@ -142,7 +140,7 @@ func (m *MemstatCollector) Init(config json.RawMessage) error {
|
|||||||
|
|
||||||
if m.config.NodeStats {
|
if m.config.NodeStats {
|
||||||
if stats := getStats(MEMSTATFILE); len(stats) == 0 {
|
if stats := getStats(MEMSTATFILE); len(stats) == 0 {
|
||||||
return fmt.Errorf("cannot read data from file %s", MEMSTATFILE)
|
return fmt.Errorf("%s Init(): cannot read data from file %s", m.name, MEMSTATFILE)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -154,7 +152,7 @@ func (m *MemstatCollector) Init(config json.RawMessage) error {
|
|||||||
m.nodefiles = make(map[int]MemstatCollectorNode)
|
m.nodefiles = make(map[int]MemstatCollectorNode)
|
||||||
for _, f := range files {
|
for _, f := range files {
|
||||||
if stats := getStats(f); len(stats) == 0 {
|
if stats := getStats(f); len(stats) == 0 {
|
||||||
return fmt.Errorf("cannot read data from file %s", f)
|
return fmt.Errorf("%s Init(): cannot read data from file %s", m.name, f)
|
||||||
}
|
}
|
||||||
rematch := regex.FindStringSubmatch(f)
|
rematch := regex.FindStringSubmatch(f)
|
||||||
if len(rematch) == 2 {
|
if len(rematch) == 2 {
|
||||||
@@ -174,7 +172,7 @@ func (m *MemstatCollector) Init(config json.RawMessage) error {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
m.init = true
|
m.init = true
|
||||||
return err
|
return nil
|
||||||
}
|
}
|
||||||
|
|
||||||
func (m *MemstatCollector) Read(interval time.Duration, output chan lp.CCMessage) {
|
func (m *MemstatCollector) Read(interval time.Duration, output chan lp.CCMessage) {
|
||||||
|
|||||||
@@ -134,11 +134,31 @@ func (m *NetstatCollector) Init(config json.RawMessage) error {
|
|||||||
// Check if device is a included device
|
// Check if device is a included device
|
||||||
if slices.Contains(m.config.IncludeDevices, canonical) {
|
if slices.Contains(m.config.IncludeDevices, canonical) {
|
||||||
// Tag will contain original device name (raw).
|
// Tag will contain original device name (raw).
|
||||||
tags := map[string]string{"stype": "network", "stype-id": raw, "type": "node"}
|
tags := map[string]string{
|
||||||
meta_unit_byte := map[string]string{"source": m.name, "group": "Network", "unit": "bytes"}
|
"stype": "network",
|
||||||
meta_unit_byte_per_sec := map[string]string{"source": m.name, "group": "Network", "unit": "bytes/sec"}
|
"stype-id": raw,
|
||||||
meta_unit_pkts := map[string]string{"source": m.name, "group": "Network", "unit": "packets"}
|
"type": "node",
|
||||||
meta_unit_pkts_per_sec := map[string]string{"source": m.name, "group": "Network", "unit": "packets/sec"}
|
}
|
||||||
|
meta_unit_byte := map[string]string{
|
||||||
|
"source": m.name,
|
||||||
|
"group": "Network",
|
||||||
|
"unit": "bytes",
|
||||||
|
}
|
||||||
|
meta_unit_byte_per_sec := map[string]string{
|
||||||
|
"source": m.name,
|
||||||
|
"group": "Network",
|
||||||
|
"unit": "bytes/sec",
|
||||||
|
}
|
||||||
|
meta_unit_pkts := map[string]string{
|
||||||
|
"source": m.name,
|
||||||
|
"group": "Network",
|
||||||
|
"unit": "packets",
|
||||||
|
}
|
||||||
|
meta_unit_pkts_per_sec := map[string]string{
|
||||||
|
"source": m.name,
|
||||||
|
"group": "Network",
|
||||||
|
"unit": "packets/sec",
|
||||||
|
}
|
||||||
|
|
||||||
m.matches[canonical] = []NetstatCollectorMetric{
|
m.matches[canonical] = []NetstatCollectorMetric{
|
||||||
{
|
{
|
||||||
|
|||||||
@@ -104,7 +104,6 @@ func (m *NfsIOStatCollector) readNfsiostats() map[string]map[string]int64 {
|
|||||||
}
|
}
|
||||||
|
|
||||||
func (m *NfsIOStatCollector) Init(config json.RawMessage) error {
|
func (m *NfsIOStatCollector) Init(config json.RawMessage) error {
|
||||||
var err error = nil
|
|
||||||
m.name = "NfsIOStatCollector"
|
m.name = "NfsIOStatCollector"
|
||||||
if err := m.setup(); err != nil {
|
if err := m.setup(); err != nil {
|
||||||
return fmt.Errorf("%s Init(): setup() call failed: %w", m.name, err)
|
return fmt.Errorf("%s Init(): setup() call failed: %w", m.name, err)
|
||||||
@@ -130,7 +129,7 @@ func (m *NfsIOStatCollector) Init(config json.RawMessage) error {
|
|||||||
m.data = m.readNfsiostats()
|
m.data = m.readNfsiostats()
|
||||||
m.lastTimestamp = time.Now()
|
m.lastTimestamp = time.Now()
|
||||||
m.init = true
|
m.init = true
|
||||||
return err
|
return nil
|
||||||
}
|
}
|
||||||
|
|
||||||
func (m *NfsIOStatCollector) Read(interval time.Duration, output chan lp.CCMessage) {
|
func (m *NfsIOStatCollector) Read(interval time.Duration, output chan lp.CCMessage) {
|
||||||
|
|||||||
@@ -91,22 +91,18 @@ func (m *NvidiaCollector) Init(config json.RawMessage) error {
|
|||||||
// Error: NVML library not found
|
// Error: NVML library not found
|
||||||
// (nvml.ErrorString can not be used in this case)
|
// (nvml.ErrorString can not be used in this case)
|
||||||
if ret == nvml.ERROR_LIBRARY_NOT_FOUND {
|
if ret == nvml.ERROR_LIBRARY_NOT_FOUND {
|
||||||
err = fmt.Errorf("NVML library not found")
|
return fmt.Errorf("%s Init(): NVML library not found", m.name)
|
||||||
cclog.ComponentError(m.name, err.Error())
|
|
||||||
return err
|
|
||||||
}
|
}
|
||||||
if ret != nvml.SUCCESS {
|
if ret != nvml.SUCCESS {
|
||||||
err = errors.New(nvml.ErrorString(ret))
|
err = errors.New(nvml.ErrorString(ret))
|
||||||
cclog.ComponentError(m.name, "Unable to initialize NVML", err.Error())
|
return fmt.Errorf("%s Init(): Unable to initialize NVML: %w", m.name, err)
|
||||||
return err
|
|
||||||
}
|
}
|
||||||
|
|
||||||
// Number of NVIDIA GPUs
|
// Number of NVIDIA GPUs
|
||||||
num_gpus, ret := nvml.DeviceGetCount()
|
num_gpus, ret := nvml.DeviceGetCount()
|
||||||
if ret != nvml.SUCCESS {
|
if ret != nvml.SUCCESS {
|
||||||
err = errors.New(nvml.ErrorString(ret))
|
err = errors.New(nvml.ErrorString(ret))
|
||||||
cclog.ComponentError(m.name, "Unable to get device count", err.Error())
|
return fmt.Errorf("%s Init(): Unable to get device count: %w", m.name, err)
|
||||||
return err
|
|
||||||
}
|
}
|
||||||
|
|
||||||
// For all GPUs
|
// For all GPUs
|
||||||
|
|||||||
@@ -10,7 +10,6 @@ package collectors
|
|||||||
import (
|
import (
|
||||||
"bytes"
|
"bytes"
|
||||||
"encoding/json"
|
"encoding/json"
|
||||||
"errors"
|
|
||||||
"fmt"
|
"fmt"
|
||||||
"slices"
|
"slices"
|
||||||
"strconv"
|
"strconv"
|
||||||
@@ -52,7 +51,6 @@ type RocmSmiCollector struct {
|
|||||||
// Called once by the collector manager
|
// Called once by the collector manager
|
||||||
// All tags, meta data tags and metrics that do not change over the runtime should be set here
|
// All tags, meta data tags and metrics that do not change over the runtime should be set here
|
||||||
func (m *RocmSmiCollector) Init(config json.RawMessage) error {
|
func (m *RocmSmiCollector) Init(config json.RawMessage) error {
|
||||||
var err error = nil
|
|
||||||
// Always set the name early in Init() to use it in cclog.Component* functions
|
// Always set the name early in Init() to use it in cclog.Component* functions
|
||||||
m.name = "RocmSmiCollector"
|
m.name = "RocmSmiCollector"
|
||||||
// This is for later use, also call it early
|
// This is for later use, also call it early
|
||||||
@@ -70,16 +68,12 @@ func (m *RocmSmiCollector) Init(config json.RawMessage) error {
|
|||||||
|
|
||||||
ret := rocm_smi.Init()
|
ret := rocm_smi.Init()
|
||||||
if ret != rocm_smi.STATUS_SUCCESS {
|
if ret != rocm_smi.STATUS_SUCCESS {
|
||||||
err = errors.New("failed to initialize ROCm SMI library")
|
return fmt.Errorf("%s Init(): failed to initialize ROCm SMI library", m.name)
|
||||||
cclog.ComponentError(m.name, err.Error())
|
|
||||||
return err
|
|
||||||
}
|
}
|
||||||
|
|
||||||
numDevs, ret := rocm_smi.NumMonitorDevices()
|
numDevs, ret := rocm_smi.NumMonitorDevices()
|
||||||
if ret != rocm_smi.STATUS_SUCCESS {
|
if ret != rocm_smi.STATUS_SUCCESS {
|
||||||
err = errors.New("failed to get number of GPUs from ROCm SMI library")
|
return fmt.Errorf("%s Init(): failed to get number of GPUs from ROCm SMI library", m.name)
|
||||||
cclog.ComponentError(m.name, err.Error())
|
|
||||||
return err
|
|
||||||
}
|
}
|
||||||
|
|
||||||
m.devices = make([]RocmSmiCollectorDevice, 0)
|
m.devices = make([]RocmSmiCollectorDevice, 0)
|
||||||
@@ -91,16 +85,12 @@ func (m *RocmSmiCollector) Init(config json.RawMessage) error {
|
|||||||
}
|
}
|
||||||
device, ret := rocm_smi.DeviceGetHandleByIndex(i)
|
device, ret := rocm_smi.DeviceGetHandleByIndex(i)
|
||||||
if ret != rocm_smi.STATUS_SUCCESS {
|
if ret != rocm_smi.STATUS_SUCCESS {
|
||||||
err = fmt.Errorf("failed to get handle for GPU %d", i)
|
return fmt.Errorf("%s Init(): failed to get get handle for GPU %d", m.name, i)
|
||||||
cclog.ComponentError(m.name, err.Error())
|
|
||||||
return err
|
|
||||||
}
|
}
|
||||||
|
|
||||||
pciInfo, ret := rocm_smi.DeviceGetPciInfo(device)
|
pciInfo, ret := rocm_smi.DeviceGetPciInfo(device)
|
||||||
if ret != rocm_smi.STATUS_SUCCESS {
|
if ret != rocm_smi.STATUS_SUCCESS {
|
||||||
err = fmt.Errorf("failed to get PCI information for GPU %d", i)
|
return fmt.Errorf("%s Init(): failed to get PCI information for GPU %d", m.name, i)
|
||||||
cclog.ComponentError(m.name, err.Error())
|
|
||||||
return err
|
|
||||||
}
|
}
|
||||||
|
|
||||||
pciId := fmt.Sprintf(
|
pciId := fmt.Sprintf(
|
||||||
@@ -150,7 +140,7 @@ func (m *RocmSmiCollector) Init(config json.RawMessage) error {
|
|||||||
|
|
||||||
// Set this flag only if everything is initialized properly, all required files exist, ...
|
// Set this flag only if everything is initialized properly, all required files exist, ...
|
||||||
m.init = true
|
m.init = true
|
||||||
return err
|
return nil
|
||||||
}
|
}
|
||||||
|
|
||||||
// Read collects all metrics belonging to the sample collector
|
// Read collects all metrics belonging to the sample collector
|
||||||
|
|||||||
@@ -94,8 +94,7 @@ func (c *metricAggregator) Init(output chan lp.CCMessage) error {
|
|||||||
// Set hostname
|
// Set hostname
|
||||||
hostname, err := os.Hostname()
|
hostname, err := os.Hostname()
|
||||||
if err != nil {
|
if err != nil {
|
||||||
cclog.Error(err.Error())
|
return fmt.Errorf("metricAggregator: failed to get hostname: %w", err)
|
||||||
return err
|
|
||||||
}
|
}
|
||||||
// Drop domain part of host name
|
// Drop domain part of host name
|
||||||
c.constants["hostname"] = strings.SplitN(hostname, `.`, 2)[0]
|
c.constants["hostname"] = strings.SplitN(hostname, `.`, 2)[0]
|
||||||
|
|||||||
@@ -8,6 +8,7 @@
|
|||||||
package metricRouter
|
package metricRouter
|
||||||
|
|
||||||
import (
|
import (
|
||||||
|
"fmt"
|
||||||
"sync"
|
"sync"
|
||||||
"time"
|
"time"
|
||||||
|
|
||||||
@@ -70,8 +71,7 @@ func (c *metricCache) Init(output chan lp.CCMessage, ticker mct.MultiChanTicker,
|
|||||||
// The code is executed by the MetricCache goroutine
|
// The code is executed by the MetricCache goroutine
|
||||||
c.aggEngine, err = agg.NewAggregator(c.output)
|
c.aggEngine, err = agg.NewAggregator(c.output)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
cclog.ComponentError("MetricCache", "Cannot create aggregator")
|
return fmt.Errorf("MetricCache: failed to create aggregator: %w", err)
|
||||||
return err
|
|
||||||
}
|
}
|
||||||
|
|
||||||
return nil
|
return nil
|
||||||
|
|||||||
@@ -112,8 +112,7 @@ func (r *metricRouter) Init(ticker mct.MultiChanTicker, wg *sync.WaitGroup, rout
|
|||||||
if r.config.NumCacheIntervals > 0 {
|
if r.config.NumCacheIntervals > 0 {
|
||||||
r.cache, err = NewCache(r.cache_input, r.ticker, &r.cachewg, r.config.NumCacheIntervals)
|
r.cache, err = NewCache(r.cache_input, r.ticker, &r.cachewg, r.config.NumCacheIntervals)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
cclog.ComponentError("MetricRouter", "MetricCache initialization failed:", err.Error())
|
return fmt.Errorf("MetricRouter: failed to initialize MetricCache: %w", err)
|
||||||
return err
|
|
||||||
}
|
}
|
||||||
for _, agg := range r.config.IntervalAgg {
|
for _, agg := range r.config.IntervalAgg {
|
||||||
err = r.cache.AddAggregation(agg.Name, agg.Function, agg.Condition, agg.Tags, agg.Meta)
|
err = r.cache.AddAggregation(agg.Name, agg.Function, agg.Condition, agg.Tags, agg.Meta)
|
||||||
|
|||||||
Reference in New Issue
Block a user