Avoid duplicate error printing

This commit is contained in:
Holger Obermaier
2026-03-12 10:08:23 +01:00
parent b65576431e
commit a481a34dcd
17 changed files with 71 additions and 86 deletions

View File

@@ -107,16 +107,16 @@ func (m *BeegfsMetaCollector) Init(config json.RawMessage) error {
// Beegfs file system statistics can only be queried by user root
user, err := user.Current()
if err != nil {
return fmt.Errorf("BeegfsMetaCollector.Init(): Failed to get current user: %w", err)
return fmt.Errorf("%s Init(): Failed to get current user: %w", m.name, err)
}
if user.Uid != "0" {
return fmt.Errorf("BeegfsMetaCollector.Init(): BeeGFS file system statistics can only be queried by user root")
return fmt.Errorf("%s Init(): BeeGFS file system statistics can only be queried by user root", m.name)
}
// Check if beegfs-ctl is in executable search path
_, err = exec.LookPath(m.config.Beegfs)
if err != nil {
return fmt.Errorf("BeegfsMetaCollector.Init(): Failed to find beegfs-ctl binary '%s': %w", m.config.Beegfs, err)
return fmt.Errorf("%s Init(): Failed to find beegfs-ctl binary '%s': %w", m.name, m.config.Beegfs, err)
}
m.init = true
return nil

View File

@@ -100,16 +100,16 @@ func (m *BeegfsStorageCollector) Init(config json.RawMessage) error {
// Beegfs file system statistics can only be queried by user root
user, err := user.Current()
if err != nil {
return fmt.Errorf("BeegfsStorageCollector.Init(): Failed to get current user: %w", err)
return fmt.Errorf("%s Init(): Failed to get current user: %w", m.name, err)
}
if user.Uid != "0" {
return fmt.Errorf("BeegfsStorageCollector.Init(): BeeGFS file system statistics can only be queried by user root")
return fmt.Errorf("%s Init(): BeeGFS file system statistics can only be queried by user root", m.name)
}
// Check if beegfs-ctl is in executable search path
_, err = exec.LookPath(m.config.Beegfs)
if err != nil {
return fmt.Errorf("BeegfsStorageCollector.Init(): Failed to find beegfs-ctl binary '%s': %w", m.config.Beegfs, err)
return fmt.Errorf("%s Init(): Failed to find beegfs-ctl binary '%s': %w", m.name, m.config.Beegfs, err)
}
m.init = true
return nil

View File

@@ -79,7 +79,7 @@ func (m *CPUFreqCollector) Init(config json.RawMessage) error {
scalingCurFreqFile := filepath.Join("/sys/devices/system/cpu", fmt.Sprintf("cpu%d", c.CpuID), "cpufreq/scaling_cur_freq")
err := unix.Access(scalingCurFreqFile, unix.R_OK)
if err != nil {
return fmt.Errorf("unable to access file '%s': %w", scalingCurFreqFile, err)
return fmt.Errorf("%s Init(): unable to access file '%s': %w", m.name, scalingCurFreqFile, err)
}
m.topology = append(m.topology,

View File

@@ -10,7 +10,6 @@ package collectors
import (
"bytes"
"encoding/json"
"errors"
"fmt"
"os"
"os/exec"
@@ -85,7 +84,7 @@ func (m *CustomCmdCollector) Init(config json.RawMessage) error {
}
if len(m.files) == 0 && len(m.cmdFieldsSlice) == 0 {
return errors.New("no metrics to collect")
return fmt.Errorf("%s Init(): no metrics to collect", m.name)
}
m.init = true
return nil

View File

@@ -347,18 +347,15 @@ func (m *GpfsCollector) Init(config json.RawMessage) error {
if !m.config.Sudo {
user, err := user.Current()
if err != nil {
cclog.ComponentError(m.name, "Failed to get current user:", err.Error())
return err
return fmt.Errorf("%s Init(): failed to get current user: %w", m.name, err)
}
if user.Uid != "0" {
cclog.ComponentError(m.name, "GPFS file system statistics can only be queried by user root")
return err
return fmt.Errorf("%s Init(): GPFS file system statistics can only be queried by user root", m.name)
}
} else {
p, err := exec.LookPath("sudo")
if err != nil {
cclog.ComponentError(m.name, "Cannot find 'sudo'")
return err
return fmt.Errorf("%s Init(): cannot find 'sudo': %w", m.name, err)
}
m.sudoCmd = p
}
@@ -434,7 +431,7 @@ func (m *GpfsCollector) Init(config json.RawMessage) error {
}
}
if len(m.definitions) == 0 {
return errors.New("no metrics to collect")
return fmt.Errorf("%s Init(): no metrics to collect", m.name)
}
m.init = true

View File

@@ -11,7 +11,6 @@ import (
"bufio"
"bytes"
"encoding/json"
"errors"
"fmt"
"os"
"slices"
@@ -45,7 +44,6 @@ type IOstatCollector struct {
}
func (m *IOstatCollector) Init(config json.RawMessage) error {
var err error
m.name = "IOstatCollector"
m.parallel = true
m.meta = map[string]string{"source": m.name, "group": "Disk"}
@@ -87,7 +85,7 @@ func (m *IOstatCollector) Init(config json.RawMessage) error {
}
}
if len(m.matches) == 0 {
return errors.New("no metrics to collect")
return fmt.Errorf("%s Init(): no metrics to collect", m.name)
}
file, err := os.Open(IOSTATFILE)
if err != nil {
@@ -137,7 +135,7 @@ func (m *IOstatCollector) Init(config json.RawMessage) error {
}
m.init = true
return err
return nil
}
func (m *IOstatCollector) Read(interval time.Duration, output chan lp.CCMessage) {

View File

@@ -11,7 +11,6 @@ import (
"bufio"
"bytes"
"encoding/json"
"errors"
"fmt"
"io"
"os/exec"
@@ -68,7 +67,7 @@ func (m *IpmiCollector) Init(config json.RawMessage) error {
command := exec.Command(p)
err := command.Run()
if err != nil {
cclog.ComponentError(m.name, fmt.Sprintf("Failed to execute %s: %v", p, err.Error()))
cclog.ComponentError(m.name, fmt.Sprintf("Failed to execute %s: %s", p, err.Error()))
m.ipmitool = ""
} else {
m.ipmitool = p
@@ -79,14 +78,14 @@ func (m *IpmiCollector) Init(config json.RawMessage) error {
command := exec.Command(p)
err := command.Run()
if err != nil {
cclog.ComponentError(m.name, fmt.Sprintf("Failed to execute %s: %v", p, err.Error()))
cclog.ComponentError(m.name, fmt.Sprintf("Failed to execute %s: %s", p, err.Error()))
m.ipmisensors = ""
} else {
m.ipmisensors = p
}
}
if len(m.ipmitool) == 0 && len(m.ipmisensors) == 0 {
return errors.New("no usable IPMI reader found")
return fmt.Errorf("%s Init(): no usable IPMI reader found", m.name)
}
m.init = true

View File

@@ -18,7 +18,6 @@ import "C"
import (
"bytes"
"encoding/json"
"errors"
"fmt"
"maps"
"math"
@@ -216,17 +215,17 @@ func (m *LikwidCollector) Init(config json.RawMessage) error {
}
lib := dl.New(m.config.LibraryPath, LIKWID_LIB_DL_FLAGS)
if lib == nil {
return fmt.Errorf("error instantiating DynamicLibrary for %s", m.config.LibraryPath)
return fmt.Errorf("%s Init(): error instantiating DynamicLibrary for %s", m.name, m.config.LibraryPath)
}
err := lib.Open()
if err != nil {
return fmt.Errorf("error opening %s: %w", m.config.LibraryPath, err)
return fmt.Errorf("%s Init(): error opening %s: %w", m.name, m.config.LibraryPath, err)
}
if m.config.ForceOverwrite {
cclog.ComponentDebug(m.name, "Set LIKWID_FORCE=1")
if err := os.Setenv("LIKWID_FORCE", "1"); err != nil {
return fmt.Errorf("error setting environment variable LIKWID_FORCE=1: %w", err)
return fmt.Errorf("%s Init(): error setting environment variable LIKWID_FORCE=1: %w", m.name, err)
}
}
if err := m.setup(); err != nil {
@@ -297,16 +296,12 @@ func (m *LikwidCollector) Init(config json.RawMessage) error {
// If no event set could be added, shut down LikwidCollector
if totalMetrics == 0 {
err := errors.New("no LIKWID eventset or metric usable")
cclog.ComponentError(m.name, err.Error())
return err
return fmt.Errorf("%s Init(): no LIKWID eventset or metric usable", m.name)
}
ret := C.topology_init()
if ret != 0 {
err := errors.New("failed to initialize topology module")
cclog.ComponentError(m.name, err.Error())
return err
return fmt.Errorf("%s Init(): failed to initialize topology module", m.name)
}
m.measureThread = thread.New()
switch m.config.AccessMode {
@@ -321,7 +316,7 @@ func (m *LikwidCollector) Init(config json.RawMessage) error {
p = m.config.DaemonPath
}
if err := os.Setenv("PATH", p); err != nil {
return fmt.Errorf("error setting environment variable PATH=%s: %w", p, err)
return fmt.Errorf("%s Init(): error setting environment variable PATH=%s: %w", m.name, p, err)
}
}
C.HPMmode(1)

View File

@@ -19,7 +19,6 @@ import (
"strings"
"time"
cclog "github.com/ClusterCockpit/cc-lib/v2/ccLogger"
lp "github.com/ClusterCockpit/cc-lib/v2/ccMessage"
)
@@ -318,18 +317,15 @@ func (m *LustreCollector) Init(config json.RawMessage) error {
if !m.config.Sudo {
user, err := user.Current()
if err != nil {
cclog.ComponentError(m.name, "Failed to get current user:", err.Error())
return err
return fmt.Errorf("%s Init(): Failed to get current user: %w", m.name, err)
}
if user.Uid != "0" {
cclog.ComponentError(m.name, "Lustre file system statistics can only be queried by user root")
return err
return fmt.Errorf("%s Init(): Lustre file system statistics can only be queried by user root", m.name)
}
} else {
p, err := exec.LookPath("sudo")
if err != nil {
cclog.ComponentError(m.name, "Cannot find 'sudo'")
return err
return fmt.Errorf("%s Init(): Cannot find 'sudo': %w", m.name, err)
}
m.sudoCmd = p
}
@@ -338,7 +334,7 @@ func (m *LustreCollector) Init(config json.RawMessage) error {
if err != nil {
p, err = exec.LookPath(LCTL_CMD)
if err != nil {
return err
return fmt.Errorf("%s Init(): Cannot find %s command: %w", m.name, LCTL_CMD, err)
}
}
m.lctl = p
@@ -366,12 +362,12 @@ func (m *LustreCollector) Init(config json.RawMessage) error {
}
}
if len(m.definitions) == 0 {
return errors.New("no metrics to collect")
return fmt.Errorf("%s Init(): no metrics to collect", m.name)
}
devices := m.getDevices()
if len(devices) == 0 {
return errors.New("no Lustre devices found")
return fmt.Errorf("%s Init(): no Lustre devices found", m.name)
}
m.stats = make(map[string]map[string]int64)
for _, d := range devices {

View File

@@ -11,7 +11,6 @@ import (
"bufio"
"bytes"
"encoding/json"
"errors"
"fmt"
"os"
"path/filepath"
@@ -96,7 +95,6 @@ func getStats(filename string) map[string]MemstatStats {
}
func (m *MemstatCollector) Init(config json.RawMessage) error {
var err error
m.name = "MemstatCollector"
m.parallel = true
m.config.NodeStats = true
@@ -134,7 +132,7 @@ func (m *MemstatCollector) Init(config json.RawMessage) error {
m.sendMemUsed = true
}
if len(m.matches) == 0 {
return errors.New("no metrics to collect")
return fmt.Errorf("%s Init(): no metrics to collect", m.name)
}
if err := m.setup(); err != nil {
return fmt.Errorf("%s Init(): setup() call failed: %w", m.name, err)
@@ -142,7 +140,7 @@ func (m *MemstatCollector) Init(config json.RawMessage) error {
if m.config.NodeStats {
if stats := getStats(MEMSTATFILE); len(stats) == 0 {
return fmt.Errorf("cannot read data from file %s", MEMSTATFILE)
return fmt.Errorf("%s Init(): cannot read data from file %s", m.name, MEMSTATFILE)
}
}
@@ -154,7 +152,7 @@ func (m *MemstatCollector) Init(config json.RawMessage) error {
m.nodefiles = make(map[int]MemstatCollectorNode)
for _, f := range files {
if stats := getStats(f); len(stats) == 0 {
return fmt.Errorf("cannot read data from file %s", f)
return fmt.Errorf("%s Init(): cannot read data from file %s", m.name, f)
}
rematch := regex.FindStringSubmatch(f)
if len(rematch) == 2 {
@@ -174,7 +172,7 @@ func (m *MemstatCollector) Init(config json.RawMessage) error {
}
}
m.init = true
return err
return nil
}
func (m *MemstatCollector) Read(interval time.Duration, output chan lp.CCMessage) {

View File

@@ -134,11 +134,31 @@ func (m *NetstatCollector) Init(config json.RawMessage) error {
// Check if device is a included device
if slices.Contains(m.config.IncludeDevices, canonical) {
// Tag will contain original device name (raw).
tags := map[string]string{"stype": "network", "stype-id": raw, "type": "node"}
meta_unit_byte := map[string]string{"source": m.name, "group": "Network", "unit": "bytes"}
meta_unit_byte_per_sec := map[string]string{"source": m.name, "group": "Network", "unit": "bytes/sec"}
meta_unit_pkts := map[string]string{"source": m.name, "group": "Network", "unit": "packets"}
meta_unit_pkts_per_sec := map[string]string{"source": m.name, "group": "Network", "unit": "packets/sec"}
tags := map[string]string{
"stype": "network",
"stype-id": raw,
"type": "node",
}
meta_unit_byte := map[string]string{
"source": m.name,
"group": "Network",
"unit": "bytes",
}
meta_unit_byte_per_sec := map[string]string{
"source": m.name,
"group": "Network",
"unit": "bytes/sec",
}
meta_unit_pkts := map[string]string{
"source": m.name,
"group": "Network",
"unit": "packets",
}
meta_unit_pkts_per_sec := map[string]string{
"source": m.name,
"group": "Network",
"unit": "packets/sec",
}
m.matches[canonical] = []NetstatCollectorMetric{
{

View File

@@ -104,7 +104,6 @@ func (m *NfsIOStatCollector) readNfsiostats() map[string]map[string]int64 {
}
func (m *NfsIOStatCollector) Init(config json.RawMessage) error {
var err error = nil
m.name = "NfsIOStatCollector"
if err := m.setup(); err != nil {
return fmt.Errorf("%s Init(): setup() call failed: %w", m.name, err)
@@ -130,7 +129,7 @@ func (m *NfsIOStatCollector) Init(config json.RawMessage) error {
m.data = m.readNfsiostats()
m.lastTimestamp = time.Now()
m.init = true
return err
return nil
}
func (m *NfsIOStatCollector) Read(interval time.Duration, output chan lp.CCMessage) {

View File

@@ -91,22 +91,18 @@ func (m *NvidiaCollector) Init(config json.RawMessage) error {
// Error: NVML library not found
// (nvml.ErrorString can not be used in this case)
if ret == nvml.ERROR_LIBRARY_NOT_FOUND {
err = fmt.Errorf("NVML library not found")
cclog.ComponentError(m.name, err.Error())
return err
return fmt.Errorf("%s Init(): NVML library not found", m.name)
}
if ret != nvml.SUCCESS {
err = errors.New(nvml.ErrorString(ret))
cclog.ComponentError(m.name, "Unable to initialize NVML", err.Error())
return err
return fmt.Errorf("%s Init(): Unable to initialize NVML: %w", m.name, err)
}
// Number of NVIDIA GPUs
num_gpus, ret := nvml.DeviceGetCount()
if ret != nvml.SUCCESS {
err = errors.New(nvml.ErrorString(ret))
cclog.ComponentError(m.name, "Unable to get device count", err.Error())
return err
return fmt.Errorf("%s Init(): Unable to get device count: %w", m.name, err)
}
// For all GPUs

View File

@@ -10,7 +10,6 @@ package collectors
import (
"bytes"
"encoding/json"
"errors"
"fmt"
"slices"
"strconv"
@@ -52,7 +51,6 @@ type RocmSmiCollector struct {
// Called once by the collector manager
// All tags, meta data tags and metrics that do not change over the runtime should be set here
func (m *RocmSmiCollector) Init(config json.RawMessage) error {
var err error = nil
// Always set the name early in Init() to use it in cclog.Component* functions
m.name = "RocmSmiCollector"
// This is for later use, also call it early
@@ -70,16 +68,12 @@ func (m *RocmSmiCollector) Init(config json.RawMessage) error {
ret := rocm_smi.Init()
if ret != rocm_smi.STATUS_SUCCESS {
err = errors.New("failed to initialize ROCm SMI library")
cclog.ComponentError(m.name, err.Error())
return err
return fmt.Errorf("%s Init(): failed to initialize ROCm SMI library", m.name)
}
numDevs, ret := rocm_smi.NumMonitorDevices()
if ret != rocm_smi.STATUS_SUCCESS {
err = errors.New("failed to get number of GPUs from ROCm SMI library")
cclog.ComponentError(m.name, err.Error())
return err
return fmt.Errorf("%s Init(): failed to get number of GPUs from ROCm SMI library", m.name)
}
m.devices = make([]RocmSmiCollectorDevice, 0)
@@ -91,16 +85,12 @@ func (m *RocmSmiCollector) Init(config json.RawMessage) error {
}
device, ret := rocm_smi.DeviceGetHandleByIndex(i)
if ret != rocm_smi.STATUS_SUCCESS {
err = fmt.Errorf("failed to get handle for GPU %d", i)
cclog.ComponentError(m.name, err.Error())
return err
return fmt.Errorf("%s Init(): failed to get get handle for GPU %d", m.name, i)
}
pciInfo, ret := rocm_smi.DeviceGetPciInfo(device)
if ret != rocm_smi.STATUS_SUCCESS {
err = fmt.Errorf("failed to get PCI information for GPU %d", i)
cclog.ComponentError(m.name, err.Error())
return err
return fmt.Errorf("%s Init(): failed to get PCI information for GPU %d", m.name, i)
}
pciId := fmt.Sprintf(
@@ -150,7 +140,7 @@ func (m *RocmSmiCollector) Init(config json.RawMessage) error {
// Set this flag only if everything is initialized properly, all required files exist, ...
m.init = true
return err
return nil
}
// Read collects all metrics belonging to the sample collector