diff --git a/collectors/beegfsmetaMetric.go b/collectors/beegfsmetaMetric.go index da63202..13afa60 100644 --- a/collectors/beegfsmetaMetric.go +++ b/collectors/beegfsmetaMetric.go @@ -107,16 +107,16 @@ func (m *BeegfsMetaCollector) Init(config json.RawMessage) error { // Beegfs file system statistics can only be queried by user root user, err := user.Current() if err != nil { - return fmt.Errorf("BeegfsMetaCollector.Init(): Failed to get current user: %w", err) + return fmt.Errorf("%s Init(): Failed to get current user: %w", m.name, err) } if user.Uid != "0" { - return fmt.Errorf("BeegfsMetaCollector.Init(): BeeGFS file system statistics can only be queried by user root") + return fmt.Errorf("%s Init(): BeeGFS file system statistics can only be queried by user root", m.name) } // Check if beegfs-ctl is in executable search path _, err = exec.LookPath(m.config.Beegfs) if err != nil { - return fmt.Errorf("BeegfsMetaCollector.Init(): Failed to find beegfs-ctl binary '%s': %w", m.config.Beegfs, err) + return fmt.Errorf("%s Init(): Failed to find beegfs-ctl binary '%s': %w", m.name, m.config.Beegfs, err) } m.init = true return nil diff --git a/collectors/beegfsstorageMetric.go b/collectors/beegfsstorageMetric.go index 57a5844..2990a02 100644 --- a/collectors/beegfsstorageMetric.go +++ b/collectors/beegfsstorageMetric.go @@ -100,16 +100,16 @@ func (m *BeegfsStorageCollector) Init(config json.RawMessage) error { // Beegfs file system statistics can only be queried by user root user, err := user.Current() if err != nil { - return fmt.Errorf("BeegfsStorageCollector.Init(): Failed to get current user: %w", err) + return fmt.Errorf("%s Init(): Failed to get current user: %w", m.name, err) } if user.Uid != "0" { - return fmt.Errorf("BeegfsStorageCollector.Init(): BeeGFS file system statistics can only be queried by user root") + return fmt.Errorf("%s Init(): BeeGFS file system statistics can only be queried by user root", m.name) } // Check if beegfs-ctl is in executable search path _, err = exec.LookPath(m.config.Beegfs) if err != nil { - return fmt.Errorf("BeegfsStorageCollector.Init(): Failed to find beegfs-ctl binary '%s': %w", m.config.Beegfs, err) + return fmt.Errorf("%s Init(): Failed to find beegfs-ctl binary '%s': %w", m.name, m.config.Beegfs, err) } m.init = true return nil diff --git a/collectors/cpufreqMetric.go b/collectors/cpufreqMetric.go index 84f828a..116731e 100644 --- a/collectors/cpufreqMetric.go +++ b/collectors/cpufreqMetric.go @@ -79,7 +79,7 @@ func (m *CPUFreqCollector) Init(config json.RawMessage) error { scalingCurFreqFile := filepath.Join("/sys/devices/system/cpu", fmt.Sprintf("cpu%d", c.CpuID), "cpufreq/scaling_cur_freq") err := unix.Access(scalingCurFreqFile, unix.R_OK) if err != nil { - return fmt.Errorf("unable to access file '%s': %w", scalingCurFreqFile, err) + return fmt.Errorf("%s Init(): unable to access file '%s': %w", m.name, scalingCurFreqFile, err) } m.topology = append(m.topology, diff --git a/collectors/customCmdMetric.go b/collectors/customCmdMetric.go index c28e108..f77d92e 100644 --- a/collectors/customCmdMetric.go +++ b/collectors/customCmdMetric.go @@ -10,7 +10,6 @@ package collectors import ( "bytes" "encoding/json" - "errors" "fmt" "os" "os/exec" @@ -85,7 +84,7 @@ func (m *CustomCmdCollector) Init(config json.RawMessage) error { } if len(m.files) == 0 && len(m.cmdFieldsSlice) == 0 { - return errors.New("no metrics to collect") + return fmt.Errorf("%s Init(): no metrics to collect", m.name) } m.init = true return nil diff --git a/collectors/gpfsMetric.go b/collectors/gpfsMetric.go index b20a997..1d1a963 100644 --- a/collectors/gpfsMetric.go +++ b/collectors/gpfsMetric.go @@ -347,18 +347,15 @@ func (m *GpfsCollector) Init(config json.RawMessage) error { if !m.config.Sudo { user, err := user.Current() if err != nil { - cclog.ComponentError(m.name, "Failed to get current user:", err.Error()) - return err + return fmt.Errorf("%s Init(): failed to get current user: %w", m.name, err) } if user.Uid != "0" { - cclog.ComponentError(m.name, "GPFS file system statistics can only be queried by user root") - return err + return fmt.Errorf("%s Init(): GPFS file system statistics can only be queried by user root", m.name) } } else { p, err := exec.LookPath("sudo") if err != nil { - cclog.ComponentError(m.name, "Cannot find 'sudo'") - return err + return fmt.Errorf("%s Init(): cannot find 'sudo': %w", m.name, err) } m.sudoCmd = p } @@ -434,7 +431,7 @@ func (m *GpfsCollector) Init(config json.RawMessage) error { } } if len(m.definitions) == 0 { - return errors.New("no metrics to collect") + return fmt.Errorf("%s Init(): no metrics to collect", m.name) } m.init = true diff --git a/collectors/iostatMetric.go b/collectors/iostatMetric.go index e70bfe8..a71f9b3 100644 --- a/collectors/iostatMetric.go +++ b/collectors/iostatMetric.go @@ -11,7 +11,6 @@ import ( "bufio" "bytes" "encoding/json" - "errors" "fmt" "os" "slices" @@ -45,7 +44,6 @@ type IOstatCollector struct { } func (m *IOstatCollector) Init(config json.RawMessage) error { - var err error m.name = "IOstatCollector" m.parallel = true m.meta = map[string]string{"source": m.name, "group": "Disk"} @@ -87,7 +85,7 @@ func (m *IOstatCollector) Init(config json.RawMessage) error { } } if len(m.matches) == 0 { - return errors.New("no metrics to collect") + return fmt.Errorf("%s Init(): no metrics to collect", m.name) } file, err := os.Open(IOSTATFILE) if err != nil { @@ -137,7 +135,7 @@ func (m *IOstatCollector) Init(config json.RawMessage) error { } m.init = true - return err + return nil } func (m *IOstatCollector) Read(interval time.Duration, output chan lp.CCMessage) { diff --git a/collectors/ipmiMetric.go b/collectors/ipmiMetric.go index a1dbdf5..86b9e28 100644 --- a/collectors/ipmiMetric.go +++ b/collectors/ipmiMetric.go @@ -11,7 +11,6 @@ import ( "bufio" "bytes" "encoding/json" - "errors" "fmt" "io" "os/exec" @@ -68,7 +67,7 @@ func (m *IpmiCollector) Init(config json.RawMessage) error { command := exec.Command(p) err := command.Run() if err != nil { - cclog.ComponentError(m.name, fmt.Sprintf("Failed to execute %s: %v", p, err.Error())) + cclog.ComponentError(m.name, fmt.Sprintf("Failed to execute %s: %s", p, err.Error())) m.ipmitool = "" } else { m.ipmitool = p @@ -79,14 +78,14 @@ func (m *IpmiCollector) Init(config json.RawMessage) error { command := exec.Command(p) err := command.Run() if err != nil { - cclog.ComponentError(m.name, fmt.Sprintf("Failed to execute %s: %v", p, err.Error())) + cclog.ComponentError(m.name, fmt.Sprintf("Failed to execute %s: %s", p, err.Error())) m.ipmisensors = "" } else { m.ipmisensors = p } } if len(m.ipmitool) == 0 && len(m.ipmisensors) == 0 { - return errors.New("no usable IPMI reader found") + return fmt.Errorf("%s Init(): no usable IPMI reader found", m.name) } m.init = true diff --git a/collectors/likwidMetric.go b/collectors/likwidMetric.go index 321307b..d09b2e2 100644 --- a/collectors/likwidMetric.go +++ b/collectors/likwidMetric.go @@ -18,7 +18,6 @@ import "C" import ( "bytes" "encoding/json" - "errors" "fmt" "maps" "math" @@ -216,17 +215,17 @@ func (m *LikwidCollector) Init(config json.RawMessage) error { } lib := dl.New(m.config.LibraryPath, LIKWID_LIB_DL_FLAGS) if lib == nil { - return fmt.Errorf("error instantiating DynamicLibrary for %s", m.config.LibraryPath) + return fmt.Errorf("%s Init(): error instantiating DynamicLibrary for %s", m.name, m.config.LibraryPath) } err := lib.Open() if err != nil { - return fmt.Errorf("error opening %s: %w", m.config.LibraryPath, err) + return fmt.Errorf("%s Init(): error opening %s: %w", m.name, m.config.LibraryPath, err) } if m.config.ForceOverwrite { cclog.ComponentDebug(m.name, "Set LIKWID_FORCE=1") if err := os.Setenv("LIKWID_FORCE", "1"); err != nil { - return fmt.Errorf("error setting environment variable LIKWID_FORCE=1: %w", err) + return fmt.Errorf("%s Init(): error setting environment variable LIKWID_FORCE=1: %w", m.name, err) } } if err := m.setup(); err != nil { @@ -297,16 +296,12 @@ func (m *LikwidCollector) Init(config json.RawMessage) error { // If no event set could be added, shut down LikwidCollector if totalMetrics == 0 { - err := errors.New("no LIKWID eventset or metric usable") - cclog.ComponentError(m.name, err.Error()) - return err + return fmt.Errorf("%s Init(): no LIKWID eventset or metric usable", m.name) } ret := C.topology_init() if ret != 0 { - err := errors.New("failed to initialize topology module") - cclog.ComponentError(m.name, err.Error()) - return err + return fmt.Errorf("%s Init(): failed to initialize topology module", m.name) } m.measureThread = thread.New() switch m.config.AccessMode { @@ -321,7 +316,7 @@ func (m *LikwidCollector) Init(config json.RawMessage) error { p = m.config.DaemonPath } if err := os.Setenv("PATH", p); err != nil { - return fmt.Errorf("error setting environment variable PATH=%s: %w", p, err) + return fmt.Errorf("%s Init(): error setting environment variable PATH=%s: %w", m.name, p, err) } } C.HPMmode(1) diff --git a/collectors/lustreMetric.go b/collectors/lustreMetric.go index fb86106..5eea82f 100644 --- a/collectors/lustreMetric.go +++ b/collectors/lustreMetric.go @@ -19,7 +19,6 @@ import ( "strings" "time" - cclog "github.com/ClusterCockpit/cc-lib/v2/ccLogger" lp "github.com/ClusterCockpit/cc-lib/v2/ccMessage" ) @@ -318,18 +317,15 @@ func (m *LustreCollector) Init(config json.RawMessage) error { if !m.config.Sudo { user, err := user.Current() if err != nil { - cclog.ComponentError(m.name, "Failed to get current user:", err.Error()) - return err + return fmt.Errorf("%s Init(): Failed to get current user: %w", m.name, err) } if user.Uid != "0" { - cclog.ComponentError(m.name, "Lustre file system statistics can only be queried by user root") - return err + return fmt.Errorf("%s Init(): Lustre file system statistics can only be queried by user root", m.name) } } else { p, err := exec.LookPath("sudo") if err != nil { - cclog.ComponentError(m.name, "Cannot find 'sudo'") - return err + return fmt.Errorf("%s Init(): Cannot find 'sudo': %w", m.name, err) } m.sudoCmd = p } @@ -338,7 +334,7 @@ func (m *LustreCollector) Init(config json.RawMessage) error { if err != nil { p, err = exec.LookPath(LCTL_CMD) if err != nil { - return err + return fmt.Errorf("%s Init(): Cannot find %s command: %w", m.name, LCTL_CMD, err) } } m.lctl = p @@ -366,12 +362,12 @@ func (m *LustreCollector) Init(config json.RawMessage) error { } } if len(m.definitions) == 0 { - return errors.New("no metrics to collect") + return fmt.Errorf("%s Init(): no metrics to collect", m.name) } devices := m.getDevices() if len(devices) == 0 { - return errors.New("no Lustre devices found") + return fmt.Errorf("%s Init(): no Lustre devices found", m.name) } m.stats = make(map[string]map[string]int64) for _, d := range devices { diff --git a/collectors/memstatMetric.go b/collectors/memstatMetric.go index e57ead0..271e6fd 100644 --- a/collectors/memstatMetric.go +++ b/collectors/memstatMetric.go @@ -11,7 +11,6 @@ import ( "bufio" "bytes" "encoding/json" - "errors" "fmt" "os" "path/filepath" @@ -96,7 +95,6 @@ func getStats(filename string) map[string]MemstatStats { } func (m *MemstatCollector) Init(config json.RawMessage) error { - var err error m.name = "MemstatCollector" m.parallel = true m.config.NodeStats = true @@ -134,7 +132,7 @@ func (m *MemstatCollector) Init(config json.RawMessage) error { m.sendMemUsed = true } if len(m.matches) == 0 { - return errors.New("no metrics to collect") + return fmt.Errorf("%s Init(): no metrics to collect", m.name) } if err := m.setup(); err != nil { return fmt.Errorf("%s Init(): setup() call failed: %w", m.name, err) @@ -142,7 +140,7 @@ func (m *MemstatCollector) Init(config json.RawMessage) error { if m.config.NodeStats { if stats := getStats(MEMSTATFILE); len(stats) == 0 { - return fmt.Errorf("cannot read data from file %s", MEMSTATFILE) + return fmt.Errorf("%s Init(): cannot read data from file %s", m.name, MEMSTATFILE) } } @@ -154,7 +152,7 @@ func (m *MemstatCollector) Init(config json.RawMessage) error { m.nodefiles = make(map[int]MemstatCollectorNode) for _, f := range files { if stats := getStats(f); len(stats) == 0 { - return fmt.Errorf("cannot read data from file %s", f) + return fmt.Errorf("%s Init(): cannot read data from file %s", m.name, f) } rematch := regex.FindStringSubmatch(f) if len(rematch) == 2 { @@ -174,7 +172,7 @@ func (m *MemstatCollector) Init(config json.RawMessage) error { } } m.init = true - return err + return nil } func (m *MemstatCollector) Read(interval time.Duration, output chan lp.CCMessage) { diff --git a/collectors/netstatMetric.go b/collectors/netstatMetric.go index 75023bd..b8b5b06 100644 --- a/collectors/netstatMetric.go +++ b/collectors/netstatMetric.go @@ -134,11 +134,31 @@ func (m *NetstatCollector) Init(config json.RawMessage) error { // Check if device is a included device if slices.Contains(m.config.IncludeDevices, canonical) { // Tag will contain original device name (raw). - tags := map[string]string{"stype": "network", "stype-id": raw, "type": "node"} - meta_unit_byte := map[string]string{"source": m.name, "group": "Network", "unit": "bytes"} - meta_unit_byte_per_sec := map[string]string{"source": m.name, "group": "Network", "unit": "bytes/sec"} - meta_unit_pkts := map[string]string{"source": m.name, "group": "Network", "unit": "packets"} - meta_unit_pkts_per_sec := map[string]string{"source": m.name, "group": "Network", "unit": "packets/sec"} + tags := map[string]string{ + "stype": "network", + "stype-id": raw, + "type": "node", + } + meta_unit_byte := map[string]string{ + "source": m.name, + "group": "Network", + "unit": "bytes", + } + meta_unit_byte_per_sec := map[string]string{ + "source": m.name, + "group": "Network", + "unit": "bytes/sec", + } + meta_unit_pkts := map[string]string{ + "source": m.name, + "group": "Network", + "unit": "packets", + } + meta_unit_pkts_per_sec := map[string]string{ + "source": m.name, + "group": "Network", + "unit": "packets/sec", + } m.matches[canonical] = []NetstatCollectorMetric{ { diff --git a/collectors/nfsiostatMetric.go b/collectors/nfsiostatMetric.go index 0cfbffe..e9052bb 100644 --- a/collectors/nfsiostatMetric.go +++ b/collectors/nfsiostatMetric.go @@ -104,7 +104,6 @@ func (m *NfsIOStatCollector) readNfsiostats() map[string]map[string]int64 { } func (m *NfsIOStatCollector) Init(config json.RawMessage) error { - var err error = nil m.name = "NfsIOStatCollector" if err := m.setup(); err != nil { return fmt.Errorf("%s Init(): setup() call failed: %w", m.name, err) @@ -130,7 +129,7 @@ func (m *NfsIOStatCollector) Init(config json.RawMessage) error { m.data = m.readNfsiostats() m.lastTimestamp = time.Now() m.init = true - return err + return nil } func (m *NfsIOStatCollector) Read(interval time.Duration, output chan lp.CCMessage) { diff --git a/collectors/nvidiaMetric.go b/collectors/nvidiaMetric.go index 860b93c..f23f2fe 100644 --- a/collectors/nvidiaMetric.go +++ b/collectors/nvidiaMetric.go @@ -91,22 +91,18 @@ func (m *NvidiaCollector) Init(config json.RawMessage) error { // Error: NVML library not found // (nvml.ErrorString can not be used in this case) if ret == nvml.ERROR_LIBRARY_NOT_FOUND { - err = fmt.Errorf("NVML library not found") - cclog.ComponentError(m.name, err.Error()) - return err + return fmt.Errorf("%s Init(): NVML library not found", m.name) } if ret != nvml.SUCCESS { err = errors.New(nvml.ErrorString(ret)) - cclog.ComponentError(m.name, "Unable to initialize NVML", err.Error()) - return err + return fmt.Errorf("%s Init(): Unable to initialize NVML: %w", m.name, err) } // Number of NVIDIA GPUs num_gpus, ret := nvml.DeviceGetCount() if ret != nvml.SUCCESS { err = errors.New(nvml.ErrorString(ret)) - cclog.ComponentError(m.name, "Unable to get device count", err.Error()) - return err + return fmt.Errorf("%s Init(): Unable to get device count: %w", m.name, err) } // For all GPUs diff --git a/collectors/rocmsmiMetric.go b/collectors/rocmsmiMetric.go index c3b6806..c801d29 100644 --- a/collectors/rocmsmiMetric.go +++ b/collectors/rocmsmiMetric.go @@ -10,7 +10,6 @@ package collectors import ( "bytes" "encoding/json" - "errors" "fmt" "slices" "strconv" @@ -52,7 +51,6 @@ type RocmSmiCollector struct { // Called once by the collector manager // All tags, meta data tags and metrics that do not change over the runtime should be set here func (m *RocmSmiCollector) Init(config json.RawMessage) error { - var err error = nil // Always set the name early in Init() to use it in cclog.Component* functions m.name = "RocmSmiCollector" // This is for later use, also call it early @@ -70,16 +68,12 @@ func (m *RocmSmiCollector) Init(config json.RawMessage) error { ret := rocm_smi.Init() if ret != rocm_smi.STATUS_SUCCESS { - err = errors.New("failed to initialize ROCm SMI library") - cclog.ComponentError(m.name, err.Error()) - return err + return fmt.Errorf("%s Init(): failed to initialize ROCm SMI library", m.name) } numDevs, ret := rocm_smi.NumMonitorDevices() if ret != rocm_smi.STATUS_SUCCESS { - err = errors.New("failed to get number of GPUs from ROCm SMI library") - cclog.ComponentError(m.name, err.Error()) - return err + return fmt.Errorf("%s Init(): failed to get number of GPUs from ROCm SMI library", m.name) } m.devices = make([]RocmSmiCollectorDevice, 0) @@ -91,16 +85,12 @@ func (m *RocmSmiCollector) Init(config json.RawMessage) error { } device, ret := rocm_smi.DeviceGetHandleByIndex(i) if ret != rocm_smi.STATUS_SUCCESS { - err = fmt.Errorf("failed to get handle for GPU %d", i) - cclog.ComponentError(m.name, err.Error()) - return err + return fmt.Errorf("%s Init(): failed to get get handle for GPU %d", m.name, i) } pciInfo, ret := rocm_smi.DeviceGetPciInfo(device) if ret != rocm_smi.STATUS_SUCCESS { - err = fmt.Errorf("failed to get PCI information for GPU %d", i) - cclog.ComponentError(m.name, err.Error()) - return err + return fmt.Errorf("%s Init(): failed to get PCI information for GPU %d", m.name, i) } pciId := fmt.Sprintf( @@ -150,7 +140,7 @@ func (m *RocmSmiCollector) Init(config json.RawMessage) error { // Set this flag only if everything is initialized properly, all required files exist, ... m.init = true - return err + return nil } // Read collects all metrics belonging to the sample collector diff --git a/internal/metricAggregator/metricAggregator.go b/internal/metricAggregator/metricAggregator.go index 6d4320a..08c6ea0 100644 --- a/internal/metricAggregator/metricAggregator.go +++ b/internal/metricAggregator/metricAggregator.go @@ -94,8 +94,7 @@ func (c *metricAggregator) Init(output chan lp.CCMessage) error { // Set hostname hostname, err := os.Hostname() if err != nil { - cclog.Error(err.Error()) - return err + return fmt.Errorf("metricAggregator: failed to get hostname: %w", err) } // Drop domain part of host name c.constants["hostname"] = strings.SplitN(hostname, `.`, 2)[0] diff --git a/internal/metricRouter/metricCache.go b/internal/metricRouter/metricCache.go index 4acfa90..9f22bdf 100644 --- a/internal/metricRouter/metricCache.go +++ b/internal/metricRouter/metricCache.go @@ -8,6 +8,7 @@ package metricRouter import ( + "fmt" "sync" "time" @@ -70,8 +71,7 @@ func (c *metricCache) Init(output chan lp.CCMessage, ticker mct.MultiChanTicker, // The code is executed by the MetricCache goroutine c.aggEngine, err = agg.NewAggregator(c.output) if err != nil { - cclog.ComponentError("MetricCache", "Cannot create aggregator") - return err + return fmt.Errorf("MetricCache: failed to create aggregator: %w", err) } return nil diff --git a/internal/metricRouter/metricRouter.go b/internal/metricRouter/metricRouter.go index 7bd312f..718b6aa 100644 --- a/internal/metricRouter/metricRouter.go +++ b/internal/metricRouter/metricRouter.go @@ -112,8 +112,7 @@ func (r *metricRouter) Init(ticker mct.MultiChanTicker, wg *sync.WaitGroup, rout if r.config.NumCacheIntervals > 0 { r.cache, err = NewCache(r.cache_input, r.ticker, &r.cachewg, r.config.NumCacheIntervals) if err != nil { - cclog.ComponentError("MetricRouter", "MetricCache initialization failed:", err.Error()) - return err + return fmt.Errorf("MetricRouter: failed to initialize MetricCache: %w", err) } for _, agg := range r.config.IntervalAgg { err = r.cache.AddAggregation(agg.Name, agg.Function, agg.Condition, agg.Tags, agg.Meta)