mirror of
https://github.com/ClusterCockpit/cc-metric-collector.git
synced 2026-03-15 13:07:28 +01:00
Avoid duplicate error printing
This commit is contained in:
@@ -107,16 +107,16 @@ func (m *BeegfsMetaCollector) Init(config json.RawMessage) error {
|
||||
// Beegfs file system statistics can only be queried by user root
|
||||
user, err := user.Current()
|
||||
if err != nil {
|
||||
return fmt.Errorf("BeegfsMetaCollector.Init(): Failed to get current user: %w", err)
|
||||
return fmt.Errorf("%s Init(): Failed to get current user: %w", m.name, err)
|
||||
}
|
||||
if user.Uid != "0" {
|
||||
return fmt.Errorf("BeegfsMetaCollector.Init(): BeeGFS file system statistics can only be queried by user root")
|
||||
return fmt.Errorf("%s Init(): BeeGFS file system statistics can only be queried by user root", m.name)
|
||||
}
|
||||
|
||||
// Check if beegfs-ctl is in executable search path
|
||||
_, err = exec.LookPath(m.config.Beegfs)
|
||||
if err != nil {
|
||||
return fmt.Errorf("BeegfsMetaCollector.Init(): Failed to find beegfs-ctl binary '%s': %w", m.config.Beegfs, err)
|
||||
return fmt.Errorf("%s Init(): Failed to find beegfs-ctl binary '%s': %w", m.name, m.config.Beegfs, err)
|
||||
}
|
||||
m.init = true
|
||||
return nil
|
||||
|
||||
@@ -100,16 +100,16 @@ func (m *BeegfsStorageCollector) Init(config json.RawMessage) error {
|
||||
// Beegfs file system statistics can only be queried by user root
|
||||
user, err := user.Current()
|
||||
if err != nil {
|
||||
return fmt.Errorf("BeegfsStorageCollector.Init(): Failed to get current user: %w", err)
|
||||
return fmt.Errorf("%s Init(): Failed to get current user: %w", m.name, err)
|
||||
}
|
||||
if user.Uid != "0" {
|
||||
return fmt.Errorf("BeegfsStorageCollector.Init(): BeeGFS file system statistics can only be queried by user root")
|
||||
return fmt.Errorf("%s Init(): BeeGFS file system statistics can only be queried by user root", m.name)
|
||||
}
|
||||
|
||||
// Check if beegfs-ctl is in executable search path
|
||||
_, err = exec.LookPath(m.config.Beegfs)
|
||||
if err != nil {
|
||||
return fmt.Errorf("BeegfsStorageCollector.Init(): Failed to find beegfs-ctl binary '%s': %w", m.config.Beegfs, err)
|
||||
return fmt.Errorf("%s Init(): Failed to find beegfs-ctl binary '%s': %w", m.name, m.config.Beegfs, err)
|
||||
}
|
||||
m.init = true
|
||||
return nil
|
||||
|
||||
@@ -79,7 +79,7 @@ func (m *CPUFreqCollector) Init(config json.RawMessage) error {
|
||||
scalingCurFreqFile := filepath.Join("/sys/devices/system/cpu", fmt.Sprintf("cpu%d", c.CpuID), "cpufreq/scaling_cur_freq")
|
||||
err := unix.Access(scalingCurFreqFile, unix.R_OK)
|
||||
if err != nil {
|
||||
return fmt.Errorf("unable to access file '%s': %w", scalingCurFreqFile, err)
|
||||
return fmt.Errorf("%s Init(): unable to access file '%s': %w", m.name, scalingCurFreqFile, err)
|
||||
}
|
||||
|
||||
m.topology = append(m.topology,
|
||||
|
||||
@@ -10,7 +10,6 @@ package collectors
|
||||
import (
|
||||
"bytes"
|
||||
"encoding/json"
|
||||
"errors"
|
||||
"fmt"
|
||||
"os"
|
||||
"os/exec"
|
||||
@@ -85,7 +84,7 @@ func (m *CustomCmdCollector) Init(config json.RawMessage) error {
|
||||
}
|
||||
|
||||
if len(m.files) == 0 && len(m.cmdFieldsSlice) == 0 {
|
||||
return errors.New("no metrics to collect")
|
||||
return fmt.Errorf("%s Init(): no metrics to collect", m.name)
|
||||
}
|
||||
m.init = true
|
||||
return nil
|
||||
|
||||
@@ -347,18 +347,15 @@ func (m *GpfsCollector) Init(config json.RawMessage) error {
|
||||
if !m.config.Sudo {
|
||||
user, err := user.Current()
|
||||
if err != nil {
|
||||
cclog.ComponentError(m.name, "Failed to get current user:", err.Error())
|
||||
return err
|
||||
return fmt.Errorf("%s Init(): failed to get current user: %w", m.name, err)
|
||||
}
|
||||
if user.Uid != "0" {
|
||||
cclog.ComponentError(m.name, "GPFS file system statistics can only be queried by user root")
|
||||
return err
|
||||
return fmt.Errorf("%s Init(): GPFS file system statistics can only be queried by user root", m.name)
|
||||
}
|
||||
} else {
|
||||
p, err := exec.LookPath("sudo")
|
||||
if err != nil {
|
||||
cclog.ComponentError(m.name, "Cannot find 'sudo'")
|
||||
return err
|
||||
return fmt.Errorf("%s Init(): cannot find 'sudo': %w", m.name, err)
|
||||
}
|
||||
m.sudoCmd = p
|
||||
}
|
||||
@@ -434,7 +431,7 @@ func (m *GpfsCollector) Init(config json.RawMessage) error {
|
||||
}
|
||||
}
|
||||
if len(m.definitions) == 0 {
|
||||
return errors.New("no metrics to collect")
|
||||
return fmt.Errorf("%s Init(): no metrics to collect", m.name)
|
||||
}
|
||||
|
||||
m.init = true
|
||||
|
||||
@@ -11,7 +11,6 @@ import (
|
||||
"bufio"
|
||||
"bytes"
|
||||
"encoding/json"
|
||||
"errors"
|
||||
"fmt"
|
||||
"os"
|
||||
"slices"
|
||||
@@ -45,7 +44,6 @@ type IOstatCollector struct {
|
||||
}
|
||||
|
||||
func (m *IOstatCollector) Init(config json.RawMessage) error {
|
||||
var err error
|
||||
m.name = "IOstatCollector"
|
||||
m.parallel = true
|
||||
m.meta = map[string]string{"source": m.name, "group": "Disk"}
|
||||
@@ -87,7 +85,7 @@ func (m *IOstatCollector) Init(config json.RawMessage) error {
|
||||
}
|
||||
}
|
||||
if len(m.matches) == 0 {
|
||||
return errors.New("no metrics to collect")
|
||||
return fmt.Errorf("%s Init(): no metrics to collect", m.name)
|
||||
}
|
||||
file, err := os.Open(IOSTATFILE)
|
||||
if err != nil {
|
||||
@@ -137,7 +135,7 @@ func (m *IOstatCollector) Init(config json.RawMessage) error {
|
||||
}
|
||||
|
||||
m.init = true
|
||||
return err
|
||||
return nil
|
||||
}
|
||||
|
||||
func (m *IOstatCollector) Read(interval time.Duration, output chan lp.CCMessage) {
|
||||
|
||||
@@ -11,7 +11,6 @@ import (
|
||||
"bufio"
|
||||
"bytes"
|
||||
"encoding/json"
|
||||
"errors"
|
||||
"fmt"
|
||||
"io"
|
||||
"os/exec"
|
||||
@@ -68,7 +67,7 @@ func (m *IpmiCollector) Init(config json.RawMessage) error {
|
||||
command := exec.Command(p)
|
||||
err := command.Run()
|
||||
if err != nil {
|
||||
cclog.ComponentError(m.name, fmt.Sprintf("Failed to execute %s: %v", p, err.Error()))
|
||||
cclog.ComponentError(m.name, fmt.Sprintf("Failed to execute %s: %s", p, err.Error()))
|
||||
m.ipmitool = ""
|
||||
} else {
|
||||
m.ipmitool = p
|
||||
@@ -79,14 +78,14 @@ func (m *IpmiCollector) Init(config json.RawMessage) error {
|
||||
command := exec.Command(p)
|
||||
err := command.Run()
|
||||
if err != nil {
|
||||
cclog.ComponentError(m.name, fmt.Sprintf("Failed to execute %s: %v", p, err.Error()))
|
||||
cclog.ComponentError(m.name, fmt.Sprintf("Failed to execute %s: %s", p, err.Error()))
|
||||
m.ipmisensors = ""
|
||||
} else {
|
||||
m.ipmisensors = p
|
||||
}
|
||||
}
|
||||
if len(m.ipmitool) == 0 && len(m.ipmisensors) == 0 {
|
||||
return errors.New("no usable IPMI reader found")
|
||||
return fmt.Errorf("%s Init(): no usable IPMI reader found", m.name)
|
||||
}
|
||||
|
||||
m.init = true
|
||||
|
||||
@@ -18,7 +18,6 @@ import "C"
|
||||
import (
|
||||
"bytes"
|
||||
"encoding/json"
|
||||
"errors"
|
||||
"fmt"
|
||||
"maps"
|
||||
"math"
|
||||
@@ -216,17 +215,17 @@ func (m *LikwidCollector) Init(config json.RawMessage) error {
|
||||
}
|
||||
lib := dl.New(m.config.LibraryPath, LIKWID_LIB_DL_FLAGS)
|
||||
if lib == nil {
|
||||
return fmt.Errorf("error instantiating DynamicLibrary for %s", m.config.LibraryPath)
|
||||
return fmt.Errorf("%s Init(): error instantiating DynamicLibrary for %s", m.name, m.config.LibraryPath)
|
||||
}
|
||||
err := lib.Open()
|
||||
if err != nil {
|
||||
return fmt.Errorf("error opening %s: %w", m.config.LibraryPath, err)
|
||||
return fmt.Errorf("%s Init(): error opening %s: %w", m.name, m.config.LibraryPath, err)
|
||||
}
|
||||
|
||||
if m.config.ForceOverwrite {
|
||||
cclog.ComponentDebug(m.name, "Set LIKWID_FORCE=1")
|
||||
if err := os.Setenv("LIKWID_FORCE", "1"); err != nil {
|
||||
return fmt.Errorf("error setting environment variable LIKWID_FORCE=1: %w", err)
|
||||
return fmt.Errorf("%s Init(): error setting environment variable LIKWID_FORCE=1: %w", m.name, err)
|
||||
}
|
||||
}
|
||||
if err := m.setup(); err != nil {
|
||||
@@ -297,16 +296,12 @@ func (m *LikwidCollector) Init(config json.RawMessage) error {
|
||||
|
||||
// If no event set could be added, shut down LikwidCollector
|
||||
if totalMetrics == 0 {
|
||||
err := errors.New("no LIKWID eventset or metric usable")
|
||||
cclog.ComponentError(m.name, err.Error())
|
||||
return err
|
||||
return fmt.Errorf("%s Init(): no LIKWID eventset or metric usable", m.name)
|
||||
}
|
||||
|
||||
ret := C.topology_init()
|
||||
if ret != 0 {
|
||||
err := errors.New("failed to initialize topology module")
|
||||
cclog.ComponentError(m.name, err.Error())
|
||||
return err
|
||||
return fmt.Errorf("%s Init(): failed to initialize topology module", m.name)
|
||||
}
|
||||
m.measureThread = thread.New()
|
||||
switch m.config.AccessMode {
|
||||
@@ -321,7 +316,7 @@ func (m *LikwidCollector) Init(config json.RawMessage) error {
|
||||
p = m.config.DaemonPath
|
||||
}
|
||||
if err := os.Setenv("PATH", p); err != nil {
|
||||
return fmt.Errorf("error setting environment variable PATH=%s: %w", p, err)
|
||||
return fmt.Errorf("%s Init(): error setting environment variable PATH=%s: %w", m.name, p, err)
|
||||
}
|
||||
}
|
||||
C.HPMmode(1)
|
||||
|
||||
@@ -19,7 +19,6 @@ import (
|
||||
"strings"
|
||||
"time"
|
||||
|
||||
cclog "github.com/ClusterCockpit/cc-lib/v2/ccLogger"
|
||||
lp "github.com/ClusterCockpit/cc-lib/v2/ccMessage"
|
||||
)
|
||||
|
||||
@@ -318,18 +317,15 @@ func (m *LustreCollector) Init(config json.RawMessage) error {
|
||||
if !m.config.Sudo {
|
||||
user, err := user.Current()
|
||||
if err != nil {
|
||||
cclog.ComponentError(m.name, "Failed to get current user:", err.Error())
|
||||
return err
|
||||
return fmt.Errorf("%s Init(): Failed to get current user: %w", m.name, err)
|
||||
}
|
||||
if user.Uid != "0" {
|
||||
cclog.ComponentError(m.name, "Lustre file system statistics can only be queried by user root")
|
||||
return err
|
||||
return fmt.Errorf("%s Init(): Lustre file system statistics can only be queried by user root", m.name)
|
||||
}
|
||||
} else {
|
||||
p, err := exec.LookPath("sudo")
|
||||
if err != nil {
|
||||
cclog.ComponentError(m.name, "Cannot find 'sudo'")
|
||||
return err
|
||||
return fmt.Errorf("%s Init(): Cannot find 'sudo': %w", m.name, err)
|
||||
}
|
||||
m.sudoCmd = p
|
||||
}
|
||||
@@ -338,7 +334,7 @@ func (m *LustreCollector) Init(config json.RawMessage) error {
|
||||
if err != nil {
|
||||
p, err = exec.LookPath(LCTL_CMD)
|
||||
if err != nil {
|
||||
return err
|
||||
return fmt.Errorf("%s Init(): Cannot find %s command: %w", m.name, LCTL_CMD, err)
|
||||
}
|
||||
}
|
||||
m.lctl = p
|
||||
@@ -366,12 +362,12 @@ func (m *LustreCollector) Init(config json.RawMessage) error {
|
||||
}
|
||||
}
|
||||
if len(m.definitions) == 0 {
|
||||
return errors.New("no metrics to collect")
|
||||
return fmt.Errorf("%s Init(): no metrics to collect", m.name)
|
||||
}
|
||||
|
||||
devices := m.getDevices()
|
||||
if len(devices) == 0 {
|
||||
return errors.New("no Lustre devices found")
|
||||
return fmt.Errorf("%s Init(): no Lustre devices found", m.name)
|
||||
}
|
||||
m.stats = make(map[string]map[string]int64)
|
||||
for _, d := range devices {
|
||||
|
||||
@@ -11,7 +11,6 @@ import (
|
||||
"bufio"
|
||||
"bytes"
|
||||
"encoding/json"
|
||||
"errors"
|
||||
"fmt"
|
||||
"os"
|
||||
"path/filepath"
|
||||
@@ -96,7 +95,6 @@ func getStats(filename string) map[string]MemstatStats {
|
||||
}
|
||||
|
||||
func (m *MemstatCollector) Init(config json.RawMessage) error {
|
||||
var err error
|
||||
m.name = "MemstatCollector"
|
||||
m.parallel = true
|
||||
m.config.NodeStats = true
|
||||
@@ -134,7 +132,7 @@ func (m *MemstatCollector) Init(config json.RawMessage) error {
|
||||
m.sendMemUsed = true
|
||||
}
|
||||
if len(m.matches) == 0 {
|
||||
return errors.New("no metrics to collect")
|
||||
return fmt.Errorf("%s Init(): no metrics to collect", m.name)
|
||||
}
|
||||
if err := m.setup(); err != nil {
|
||||
return fmt.Errorf("%s Init(): setup() call failed: %w", m.name, err)
|
||||
@@ -142,7 +140,7 @@ func (m *MemstatCollector) Init(config json.RawMessage) error {
|
||||
|
||||
if m.config.NodeStats {
|
||||
if stats := getStats(MEMSTATFILE); len(stats) == 0 {
|
||||
return fmt.Errorf("cannot read data from file %s", MEMSTATFILE)
|
||||
return fmt.Errorf("%s Init(): cannot read data from file %s", m.name, MEMSTATFILE)
|
||||
}
|
||||
}
|
||||
|
||||
@@ -154,7 +152,7 @@ func (m *MemstatCollector) Init(config json.RawMessage) error {
|
||||
m.nodefiles = make(map[int]MemstatCollectorNode)
|
||||
for _, f := range files {
|
||||
if stats := getStats(f); len(stats) == 0 {
|
||||
return fmt.Errorf("cannot read data from file %s", f)
|
||||
return fmt.Errorf("%s Init(): cannot read data from file %s", m.name, f)
|
||||
}
|
||||
rematch := regex.FindStringSubmatch(f)
|
||||
if len(rematch) == 2 {
|
||||
@@ -174,7 +172,7 @@ func (m *MemstatCollector) Init(config json.RawMessage) error {
|
||||
}
|
||||
}
|
||||
m.init = true
|
||||
return err
|
||||
return nil
|
||||
}
|
||||
|
||||
func (m *MemstatCollector) Read(interval time.Duration, output chan lp.CCMessage) {
|
||||
|
||||
@@ -134,11 +134,31 @@ func (m *NetstatCollector) Init(config json.RawMessage) error {
|
||||
// Check if device is a included device
|
||||
if slices.Contains(m.config.IncludeDevices, canonical) {
|
||||
// Tag will contain original device name (raw).
|
||||
tags := map[string]string{"stype": "network", "stype-id": raw, "type": "node"}
|
||||
meta_unit_byte := map[string]string{"source": m.name, "group": "Network", "unit": "bytes"}
|
||||
meta_unit_byte_per_sec := map[string]string{"source": m.name, "group": "Network", "unit": "bytes/sec"}
|
||||
meta_unit_pkts := map[string]string{"source": m.name, "group": "Network", "unit": "packets"}
|
||||
meta_unit_pkts_per_sec := map[string]string{"source": m.name, "group": "Network", "unit": "packets/sec"}
|
||||
tags := map[string]string{
|
||||
"stype": "network",
|
||||
"stype-id": raw,
|
||||
"type": "node",
|
||||
}
|
||||
meta_unit_byte := map[string]string{
|
||||
"source": m.name,
|
||||
"group": "Network",
|
||||
"unit": "bytes",
|
||||
}
|
||||
meta_unit_byte_per_sec := map[string]string{
|
||||
"source": m.name,
|
||||
"group": "Network",
|
||||
"unit": "bytes/sec",
|
||||
}
|
||||
meta_unit_pkts := map[string]string{
|
||||
"source": m.name,
|
||||
"group": "Network",
|
||||
"unit": "packets",
|
||||
}
|
||||
meta_unit_pkts_per_sec := map[string]string{
|
||||
"source": m.name,
|
||||
"group": "Network",
|
||||
"unit": "packets/sec",
|
||||
}
|
||||
|
||||
m.matches[canonical] = []NetstatCollectorMetric{
|
||||
{
|
||||
|
||||
@@ -104,7 +104,6 @@ func (m *NfsIOStatCollector) readNfsiostats() map[string]map[string]int64 {
|
||||
}
|
||||
|
||||
func (m *NfsIOStatCollector) Init(config json.RawMessage) error {
|
||||
var err error = nil
|
||||
m.name = "NfsIOStatCollector"
|
||||
if err := m.setup(); err != nil {
|
||||
return fmt.Errorf("%s Init(): setup() call failed: %w", m.name, err)
|
||||
@@ -130,7 +129,7 @@ func (m *NfsIOStatCollector) Init(config json.RawMessage) error {
|
||||
m.data = m.readNfsiostats()
|
||||
m.lastTimestamp = time.Now()
|
||||
m.init = true
|
||||
return err
|
||||
return nil
|
||||
}
|
||||
|
||||
func (m *NfsIOStatCollector) Read(interval time.Duration, output chan lp.CCMessage) {
|
||||
|
||||
@@ -91,22 +91,18 @@ func (m *NvidiaCollector) Init(config json.RawMessage) error {
|
||||
// Error: NVML library not found
|
||||
// (nvml.ErrorString can not be used in this case)
|
||||
if ret == nvml.ERROR_LIBRARY_NOT_FOUND {
|
||||
err = fmt.Errorf("NVML library not found")
|
||||
cclog.ComponentError(m.name, err.Error())
|
||||
return err
|
||||
return fmt.Errorf("%s Init(): NVML library not found", m.name)
|
||||
}
|
||||
if ret != nvml.SUCCESS {
|
||||
err = errors.New(nvml.ErrorString(ret))
|
||||
cclog.ComponentError(m.name, "Unable to initialize NVML", err.Error())
|
||||
return err
|
||||
return fmt.Errorf("%s Init(): Unable to initialize NVML: %w", m.name, err)
|
||||
}
|
||||
|
||||
// Number of NVIDIA GPUs
|
||||
num_gpus, ret := nvml.DeviceGetCount()
|
||||
if ret != nvml.SUCCESS {
|
||||
err = errors.New(nvml.ErrorString(ret))
|
||||
cclog.ComponentError(m.name, "Unable to get device count", err.Error())
|
||||
return err
|
||||
return fmt.Errorf("%s Init(): Unable to get device count: %w", m.name, err)
|
||||
}
|
||||
|
||||
// For all GPUs
|
||||
|
||||
@@ -10,7 +10,6 @@ package collectors
|
||||
import (
|
||||
"bytes"
|
||||
"encoding/json"
|
||||
"errors"
|
||||
"fmt"
|
||||
"slices"
|
||||
"strconv"
|
||||
@@ -52,7 +51,6 @@ type RocmSmiCollector struct {
|
||||
// Called once by the collector manager
|
||||
// All tags, meta data tags and metrics that do not change over the runtime should be set here
|
||||
func (m *RocmSmiCollector) Init(config json.RawMessage) error {
|
||||
var err error = nil
|
||||
// Always set the name early in Init() to use it in cclog.Component* functions
|
||||
m.name = "RocmSmiCollector"
|
||||
// This is for later use, also call it early
|
||||
@@ -70,16 +68,12 @@ func (m *RocmSmiCollector) Init(config json.RawMessage) error {
|
||||
|
||||
ret := rocm_smi.Init()
|
||||
if ret != rocm_smi.STATUS_SUCCESS {
|
||||
err = errors.New("failed to initialize ROCm SMI library")
|
||||
cclog.ComponentError(m.name, err.Error())
|
||||
return err
|
||||
return fmt.Errorf("%s Init(): failed to initialize ROCm SMI library", m.name)
|
||||
}
|
||||
|
||||
numDevs, ret := rocm_smi.NumMonitorDevices()
|
||||
if ret != rocm_smi.STATUS_SUCCESS {
|
||||
err = errors.New("failed to get number of GPUs from ROCm SMI library")
|
||||
cclog.ComponentError(m.name, err.Error())
|
||||
return err
|
||||
return fmt.Errorf("%s Init(): failed to get number of GPUs from ROCm SMI library", m.name)
|
||||
}
|
||||
|
||||
m.devices = make([]RocmSmiCollectorDevice, 0)
|
||||
@@ -91,16 +85,12 @@ func (m *RocmSmiCollector) Init(config json.RawMessage) error {
|
||||
}
|
||||
device, ret := rocm_smi.DeviceGetHandleByIndex(i)
|
||||
if ret != rocm_smi.STATUS_SUCCESS {
|
||||
err = fmt.Errorf("failed to get handle for GPU %d", i)
|
||||
cclog.ComponentError(m.name, err.Error())
|
||||
return err
|
||||
return fmt.Errorf("%s Init(): failed to get get handle for GPU %d", m.name, i)
|
||||
}
|
||||
|
||||
pciInfo, ret := rocm_smi.DeviceGetPciInfo(device)
|
||||
if ret != rocm_smi.STATUS_SUCCESS {
|
||||
err = fmt.Errorf("failed to get PCI information for GPU %d", i)
|
||||
cclog.ComponentError(m.name, err.Error())
|
||||
return err
|
||||
return fmt.Errorf("%s Init(): failed to get PCI information for GPU %d", m.name, i)
|
||||
}
|
||||
|
||||
pciId := fmt.Sprintf(
|
||||
@@ -150,7 +140,7 @@ func (m *RocmSmiCollector) Init(config json.RawMessage) error {
|
||||
|
||||
// Set this flag only if everything is initialized properly, all required files exist, ...
|
||||
m.init = true
|
||||
return err
|
||||
return nil
|
||||
}
|
||||
|
||||
// Read collects all metrics belonging to the sample collector
|
||||
|
||||
Reference in New Issue
Block a user