Compare commits

...

5 Commits

Author SHA1 Message Date
Thomas Roehl
c9dfca622f Update cclog calls in latest IB Collector 2026-06-08 14:58:29 +02:00
Thomas Roehl
3b0638e815 Merge branch 'main' into cclog_update 2026-06-08 14:57:31 +02:00
Thomas Roehl
037b4f1526 Update cclog calls 2026-06-08 14:52:24 +02:00
Holger Obermaier
bed5491068 Fix Overflows in Infiniband collector (#219)
* Add information about the used infiniband counters
* Change datatype from int64 to uint64
* uint64 subtraction handles wraparound automatically
* Compute total rates by summing up the xmit and recv rates.
This avoids overflows in the raw counters
* Check for cases where the current counter can not be saved as last state
* Use golang variable naming convention (camelCase)
2026-06-08 14:00:09 +02:00
dependabot[bot]
a2eba41150 Bump golang.design/x/thread
Bumps [golang.design/x/thread](https://github.com/golang-design/thread) from 0.0.0-20210122121316-335e9adffdf1 to 0.3.2.
- [Release notes](https://github.com/golang-design/thread/releases)
- [Commits](https://github.com/golang-design/thread/commits/v0.3.2)

---
updated-dependencies:
- dependency-name: golang.design/x/thread
  dependency-version: 0.3.2
  dependency-type: direct:production
  update-type: version-update:semver-minor
...

Signed-off-by: dependabot[bot] <support@github.com>
2026-06-08 13:10:27 +02:00
28 changed files with 243 additions and 219 deletions

View File

@@ -132,11 +132,11 @@ func mainFunc() int {
if len(rcfg.ConfigFile.Interval) > 0 { if len(rcfg.ConfigFile.Interval) > 0 {
t, err := time.ParseDuration(rcfg.ConfigFile.Interval) t, err := time.ParseDuration(rcfg.ConfigFile.Interval)
if err != nil { if err != nil {
cclog.Error("Configuration value 'interval' no valid duration") cclog.Errorf("Configuration value interval=%s no valid duration", rcfg.ConfigFile.Interval)
} }
rcfg.Interval = t rcfg.Interval = t
if rcfg.Interval == 0 { if rcfg.Interval == 0 {
cclog.Error("Configuration value 'interval' must be greater than zero") cclog.Errorf("Configuration value interval=%s must be greater than zero", rcfg.ConfigFile.Interval)
return 1 return 1
} }
} }
@@ -145,11 +145,11 @@ func mainFunc() int {
if len(rcfg.ConfigFile.Duration) > 0 { if len(rcfg.ConfigFile.Duration) > 0 {
t, err := time.ParseDuration(rcfg.ConfigFile.Duration) t, err := time.ParseDuration(rcfg.ConfigFile.Duration)
if err != nil { if err != nil {
cclog.Error("Configuration value 'duration' no valid duration") cclog.Error("Configuration value duration=%s no valid duration", rcfg.ConfigFile.Duration)
} }
rcfg.Duration = t rcfg.Duration = t
if rcfg.Duration == 0 { if rcfg.Duration == 0 {
cclog.Error("Configuration value 'duration' must be greater than zero") cclog.Error("Configuration value duration=%s must be greater than zero", rcfg.ConfigFile.Duration)
return 1 return 1
} }
} }

View File

@@ -209,16 +209,16 @@ func (m *BeegfsMetaCollector) Read(interval time.Duration, output chan lp.CCMess
} else { } else {
f1, err := strconv.ParseFloat(m.matches["other"], 32) f1, err := strconv.ParseFloat(m.matches["other"], 32)
if err != nil { if err != nil {
cclog.ComponentError( cclog.ComponentErrorf(
m.name, m.name,
fmt.Sprintf("Metric (other): Failed to convert str written '%s' to float: %v", m.matches["other"], err)) "Metric (other): Failed to convert str written '%s' to float: %v", m.matches["other"], err)
continue continue
} }
f2, err := strconv.ParseFloat(split[i], 32) f2, err := strconv.ParseFloat(split[i], 32)
if err != nil { if err != nil {
cclog.ComponentError( cclog.ComponentErrorf(
m.name, m.name,
fmt.Sprintf("Metric (other): Failed to convert str written '%s' to float: %v", m.matches["other"], err)) "Metric (other): Failed to convert str written '%s' to float: %v", m.matches["other"], err)
continue continue
} }
m.matches["beegfs_cstorage_other"] = fmt.Sprintf("%f", f1+f2) m.matches["beegfs_cstorage_other"] = fmt.Sprintf("%f", f1+f2)

View File

@@ -200,16 +200,16 @@ func (m *BeegfsStorageCollector) Read(interval time.Duration, output chan lp.CCM
} else { } else {
f1, err := strconv.ParseFloat(m.matches["other"], 32) f1, err := strconv.ParseFloat(m.matches["other"], 32)
if err != nil { if err != nil {
cclog.ComponentError( cclog.ComponentErrorf(
m.name, m.name,
fmt.Sprintf("Metric (other): Failed to convert str written '%s' to float: %v", m.matches["other"], err)) "Metric (other): Failed to convert str written '%s' to float: %v", m.matches["other"], err)
continue continue
} }
f2, err := strconv.ParseFloat(split[i], 32) f2, err := strconv.ParseFloat(split[i], 32)
if err != nil { if err != nil {
cclog.ComponentError( cclog.ComponentErrorf(
m.name, m.name,
fmt.Sprintf("Metric (other): Failed to convert str written '%s' to float: %v", m.matches["other"], err)) "Metric (other): Failed to convert str written '%s' to float: %v", m.matches["other"], err)
continue continue
} }
m.matches["beegfs_cstorage_other"] = fmt.Sprintf("%f", f1+f2) m.matches["beegfs_cstorage_other"] = fmt.Sprintf("%f", f1+f2)

View File

@@ -139,16 +139,16 @@ func (m *CPUFreqCpuInfoCollector) Read(interval time.Duration, output chan lp.CC
const cpuInfoFile = "/proc/cpuinfo" const cpuInfoFile = "/proc/cpuinfo"
file, err := os.Open(cpuInfoFile) file, err := os.Open(cpuInfoFile)
if err != nil { if err != nil {
cclog.ComponentError( cclog.ComponentErrorf(
m.name, m.name,
fmt.Sprintf("Read(): Failed to open file '%s': %v", cpuInfoFile, err)) "Read(): Failed to open file '%s': %v", cpuInfoFile, err)
return return
} }
defer func() { defer func() {
if err := file.Close(); err != nil { if err := file.Close(); err != nil {
cclog.ComponentError( cclog.ComponentErrorf(
m.name, m.name,
fmt.Sprintf("Read(): Failed to close file '%s': %v", cpuInfoFile, err)) "Read(): Failed to close file '%s': %v", cpuInfoFile, err)
} }
}() }()
@@ -166,9 +166,9 @@ func (m *CPUFreqCpuInfoCollector) Read(interval time.Duration, output chan lp.CC
if !t.isHT { if !t.isHT {
value, err := strconv.ParseFloat(strings.TrimSpace(lineSplit[1]), 64) value, err := strconv.ParseFloat(strings.TrimSpace(lineSplit[1]), 64)
if err != nil { if err != nil {
cclog.ComponentError( cclog.ComponentErrorf(
m.name, m.name,
fmt.Sprintf("Read(): Failed to convert cpu MHz '%s' to float64: %v", lineSplit[1], err)) "Read(): Failed to convert cpu MHz '%s' to float64: %v", lineSplit[1], err)
return return
} }
if y, err := lp.NewMetric("cpufreq", t.tagSet, m.meta, value, now); err == nil { if y, err := lp.NewMetric("cpufreq", t.tagSet, m.meta, value, now); err == nil {

View File

@@ -95,10 +95,7 @@ func (m *CPUFreqCollector) Init(config json.RawMessage) error {
} }
// Initialized // Initialized
cclog.ComponentDebug( cclog.ComponentDebugf(m.name, "initialized %d non-hyper-threading CPUs")
m.name,
"initialized",
len(m.topology), "non-hyper-threading CPUs")
m.init = true m.init = true
return nil return nil
} }
@@ -116,16 +113,14 @@ func (m *CPUFreqCollector) Read(interval time.Duration, output chan lp.CCMessage
// Read current frequency // Read current frequency
line, err := os.ReadFile(t.scalingCurFreqFile) line, err := os.ReadFile(t.scalingCurFreqFile)
if err != nil { if err != nil {
cclog.ComponentError( cclog.ComponentErrorf(
m.name, m.name, "Read(): Failed to read file '%s': %v", t.scalingCurFreqFile, err)
fmt.Sprintf("Read(): Failed to read file '%s': %v", t.scalingCurFreqFile, err))
continue continue
} }
cpuFreq, err := strconv.ParseInt(strings.TrimSpace(string(line)), 10, 64) cpuFreq, err := strconv.ParseInt(strings.TrimSpace(string(line)), 10, 64)
if err != nil { if err != nil {
cclog.ComponentError( cclog.ComponentError(
m.name, m.name, "Read(): Failed to convert CPU frequency '%s' to int64: %v", line, err)
fmt.Sprintf("Read(): Failed to convert CPU frequency '%s' to int64: %v", line, err))
continue continue
} }

View File

@@ -171,15 +171,15 @@ func (m *CpustatCollector) Read(interval time.Duration, output chan lp.CCMessage
file, err := os.Open(CPUSTATFILE) file, err := os.Open(CPUSTATFILE)
if err != nil { if err != nil {
cclog.ComponentError( cclog.ComponentErrorf(
m.name, m.name,
fmt.Sprintf("Read(): Failed to open file '%s': %v", CPUSTATFILE, err)) "Read(): Failed to open file '%s': %v", CPUSTATFILE, err)
} }
defer func() { defer func() {
if err := file.Close(); err != nil { if err := file.Close(); err != nil {
cclog.ComponentError( cclog.ComponentErrorf(
m.name, m.name,
fmt.Sprintf("Read(): Failed to close file '%s': %v", string(CPUSTATFILE), err)) "Read(): Failed to close file '%s': %v", string(CPUSTATFILE), err)
} }
}() }()

View File

@@ -64,9 +64,9 @@ func (m *CustomCmdCollector) Init(config json.RawMessage) error {
cmdFields := strings.Fields(c) cmdFields := strings.Fields(c)
command := exec.Command(cmdFields[0], cmdFields[1:]...) command := exec.Command(cmdFields[0], cmdFields[1:]...)
if _, err := command.Output(); err != nil { if _, err := command.Output(); err != nil {
cclog.ComponentWarn( cclog.ComponentWarnf(
m.name, m.name,
fmt.Sprintf("%s Init(): Execution of command \"%s\" failed: %v", m.name, command.String(), err)) "%s Init(): Execution of command \"%s\" failed: %v", m.name, command.String(), err)
continue continue
} }
m.cmdFieldsSlice = append(m.cmdFieldsSlice, cmdFields) m.cmdFieldsSlice = append(m.cmdFieldsSlice, cmdFields)
@@ -77,7 +77,7 @@ func (m *CustomCmdCollector) Init(config json.RawMessage) error {
if _, err := os.ReadFile(fileName); err != nil { if _, err := os.ReadFile(fileName); err != nil {
cclog.ComponentWarn( cclog.ComponentWarn(
m.name, m.name,
fmt.Sprintf("%s Init(): Reading of file \"%s\" failed: %v", m.name, fileName, err)) "%s Init(): Reading of file \"%s\" failed: %v", m.name, fileName, err)
continue continue
} }
m.files = append(m.files, fileName) m.files = append(m.files, fileName)
@@ -100,20 +100,18 @@ func (m *CustomCmdCollector) Read(interval time.Duration, output chan lp.CCMessa
command := exec.Command(cmdFields[0], cmdFields[1:]...) command := exec.Command(cmdFields[0], cmdFields[1:]...)
stdout, err := command.Output() stdout, err := command.Output()
if err != nil { if err != nil {
cclog.ComponentError( cclog.ComponentErrorf(
m.name, m.name,
fmt.Sprintf("Read(): Failed to read command output for command \"%s\": %v", command.String(), err), "Read(): Failed to read command output for command \"%s\": %v", command.String(), err)
)
continue continue
} }
// Read and decode influxDB line-protocol from command output // Read and decode influxDB line-protocol from command output
metrics, err := lp.FromBytes(stdout) metrics, err := lp.FromBytes(stdout)
if err != nil { if err != nil {
cclog.ComponentError( cclog.ComponentErrorf(
m.name, m.name,
fmt.Sprintf("Read(): Failed to decode influx Message: %v", err), "Read(): Failed to decode influx Message: %v", err)
)
continue continue
} }
for _, metric := range metrics { for _, metric := range metrics {
@@ -128,20 +126,18 @@ func (m *CustomCmdCollector) Read(interval time.Duration, output chan lp.CCMessa
for _, filename := range m.files { for _, filename := range m.files {
input, err := os.ReadFile(filename) input, err := os.ReadFile(filename)
if err != nil { if err != nil {
cclog.ComponentError( cclog.ComponentErrorf(
m.name, m.name,
fmt.Sprintf("Read(): Failed to read file \"%s\": %v\n", filename, err), "Read(): Failed to read file \"%s\": %v\n", filename, err)
)
continue continue
} }
// Read and decode influxDB line-protocol from file // Read and decode influxDB line-protocol from file
metrics, err := lp.FromBytes(input) metrics, err := lp.FromBytes(input)
if err != nil { if err != nil {
cclog.ComponentError( cclog.ComponentErrorf(
m.name, m.name,
fmt.Sprintf("Read(): Failed to decode influx Message: %v", err), "Read(): Failed to decode influx Message: %v", err)
)
continue continue
} }
for _, metric := range metrics { for _, metric := range metrics {

View File

@@ -77,16 +77,16 @@ func (m *DiskstatCollector) Read(interval time.Duration, output chan lp.CCMessag
file, err := os.Open(MOUNTFILE) file, err := os.Open(MOUNTFILE)
if err != nil { if err != nil {
cclog.ComponentError( cclog.ComponentErrorf(
m.name, m.name,
fmt.Sprintf("Read(): Failed to open file '%s': %v", MOUNTFILE, err)) "Read(): Failed to open file '%s': %v", MOUNTFILE, err)
return return
} }
defer func() { defer func() {
if err := file.Close(); err != nil { if err := file.Close(); err != nil {
cclog.ComponentError( cclog.ComponentError(
m.name, m.name,
fmt.Sprintf("Read(): Failed to close file '%s': %v", MOUNTFILE, err)) "Read(): Failed to close file '%s': %v", MOUNTFILE, err)
} }
}() }()

View File

@@ -371,7 +371,7 @@ func (m *GpfsCollector) Init(config json.RawMessage) error {
if err != nil { if err != nil {
// if using sudo, exec.lookPath will return EACCES (file mode r-x------), this can be ignored // if using sudo, exec.lookPath will return EACCES (file mode r-x------), this can be ignored
if m.config.Sudo && errors.Is(err, syscall.EACCES) { if m.config.Sudo && errors.Is(err, syscall.EACCES) {
cclog.ComponentWarn(m.name, fmt.Sprintf("got error looking for mmpmon binary '%s': %v . This is expected when using sudo, continuing.", m.config.Mmpmon, err)) cclog.ComponentWarnf(m.name, "got error looking for mmpmon binary '%s': %v . This is expected when using sudo, continuing.", m.config.Mmpmon, err)
// the file was given in the config, use it // the file was given in the config, use it
p = m.config.Mmpmon p = m.config.Mmpmon
} else { } else {
@@ -517,23 +517,23 @@ func (m *GpfsCollector) Read(interval time.Duration, output chan lp.CCMessage) {
// return code // return code
rc, err := strconv.Atoi(key_value["_rc_"]) rc, err := strconv.Atoi(key_value["_rc_"])
if err != nil { if err != nil {
cclog.ComponentError(m.name, fmt.Sprintf("Read(): Failed to convert return code '%s' to int: %v", key_value["_rc_"], err)) cclog.ComponentErrorf(m.name, "Read(): Failed to convert return code '%s' to int: %v", key_value["_rc_"], err)
continue continue
} }
if rc != 0 { if rc != 0 {
cclog.ComponentError(m.name, fmt.Sprintf("Read(): Filesystem '%s' is not ok.", filesystem)) cclog.ComponentErrorf(m.name, "Read(): Filesystem '%s' is not ok.", filesystem)
continue continue
} }
// timestamp // timestamp
sec, err := strconv.ParseInt(key_value["_t_"], 10, 64) sec, err := strconv.ParseInt(key_value["_t_"], 10, 64)
if err != nil { if err != nil {
cclog.ComponentError(m.name, fmt.Sprintf("Read(): Failed to convert seconds '%s' to int64: %v", key_value["_t_"], err)) cclog.ComponentErrorf(m.name, "Read(): Failed to convert seconds '%s' to int64: %v", key_value["_t_"], err)
continue continue
} }
msec, err := strconv.ParseInt(key_value["_tu_"], 10, 64) msec, err := strconv.ParseInt(key_value["_tu_"], 10, 64)
if err != nil { if err != nil {
cclog.ComponentError(m.name, fmt.Sprintf("Read(): Failed to convert micro seconds '%s' to int64: %v", key_value["_tu_"], err)) cclog.ComponentErrorf(m.name, "Read(): Failed to convert micro seconds '%s' to int64: %v", key_value["_tu_"], err)
continue continue
} }
timestamp := time.Unix(sec, msec*1000) timestamp := time.Unix(sec, msec*1000)
@@ -551,7 +551,7 @@ func (m *GpfsCollector) Read(interval time.Duration, output chan lp.CCMessage) {
for _, metric := range GpfsAbsMetrics { for _, metric := range GpfsAbsMetrics {
value, err := strconv.ParseInt(key_value[metric.prefix], 10, 64) value, err := strconv.ParseInt(key_value[metric.prefix], 10, 64)
if err != nil { if err != nil {
cclog.ComponentError(m.name, fmt.Sprintf("Read(): Failed to convert %s '%s' to int64: %v", metric.desc, key_value[metric.prefix], err)) cclog.ComponentErrorf(m.name, "Read(): Failed to convert %s '%s' to int64: %v", metric.desc, key_value[metric.prefix], err)
continue continue
} }
newstate[metric.prefix] = value newstate[metric.prefix] = value
@@ -636,7 +636,7 @@ func (m *GpfsCollector) Read(interval time.Duration, output chan lp.CCMessage) {
} }
} else { } else {
// the value could not be computed correctly // the value could not be computed correctly
cclog.ComponentWarn(m.name, fmt.Sprintf("Read(): Could not compute value for filesystem %s of metric %s: vold_ok = %t, vnew_ok = %t", filesystem, metric.name, vold_ok, vnew_ok)) cclog.ComponentWarnf(m.name, "Read(): Could not compute value for filesystem %s of metric %s: vold_ok = %t, vnew_ok = %t", filesystem, metric.name, vold_ok, vnew_ok)
} }
} }

View File

@@ -23,20 +23,29 @@ import (
"golang.org/x/sys/unix" "golang.org/x/sys/unix"
) )
const IB_BASEPATH = "/sys/class/infiniband/" // See: https://www.kernel.org/doc/Documentation/ABI/stable/sysfs-class-infiniband
const (
ibBasePath = "/sys/class/infiniband/"
ibDataUnit = "bytes"
ibDataRateUnit = ibDataUnit + "/sec"
ibPkgUnit = "packets"
ibPkgRateUnit = ibPkgUnit + "/sec"
)
type InfinibandCollectorMetric struct { type InfinibandCollectorMetric struct {
name string name string
path string path string
unit string unit string
scale int64 unitRates string
scaleByFourLanes bool
addToIBTotal bool addToIBTotal bool
addToIBTotalPkgs bool addToIBTotalPkgs bool
lastState int64 lastState uint64
lastStateAvailable bool
} }
type InfinibandCollectorInfo struct { type InfinibandCollectorInfo struct {
LID string // IB local Identifier (LID) lid string // IB local Identifier (LID)
device string // IB device device string // IB device
port string // IB device port port string // IB device port
portCounterFiles []InfinibandCollectorMetric // mapping counter name -> InfinibandCollectorMetric portCounterFiles []InfinibandCollectorMetric // mapping counter name -> InfinibandCollectorMetric
@@ -56,7 +65,7 @@ type InfinibandCollector struct {
lastTimestamp time.Time // Store time stamp of last tick to derive bandwidths lastTimestamp time.Time // Store time stamp of last tick to derive bandwidths
} }
// Init initializes the Infiniband collector by walking through files below IB_BASEPATH // Init initializes the Infiniband collector by walking through files below ibBasePath
func (m *InfinibandCollector) Init(config json.RawMessage) error { func (m *InfinibandCollector) Init(config json.RawMessage) error {
// Check if already initialized // Check if already initialized
if m.init { if m.init {
@@ -87,7 +96,7 @@ func (m *InfinibandCollector) Init(config json.RawMessage) error {
} }
// Loop for all InfiniBand directories // Loop for all InfiniBand directories
globPattern := filepath.Join(IB_BASEPATH, "*", "ports", "*") globPattern := filepath.Join(ibBasePath, "*", "ports", "*")
ibDirs, err := filepath.Glob(globPattern) ibDirs, err := filepath.Glob(globPattern)
if err != nil { if err != nil {
return fmt.Errorf("%s Init(): unable to glob files with pattern %s: %w", m.name, globPattern, err) return fmt.Errorf("%s Init(): unable to glob files with pattern %s: %w", m.name, globPattern, err)
@@ -122,36 +131,42 @@ func (m *InfinibandCollector) Init(config json.RawMessage) error {
countersDir := filepath.Join(path, "counters") countersDir := filepath.Join(path, "counters")
portCounterFiles := []InfinibandCollectorMetric{ portCounterFiles := []InfinibandCollectorMetric{
{ {
// Total number of data octets, divided by 4 (lanes), received on all VLs.
// This is 64 bit counter
name: "ib_recv", name: "ib_recv",
path: filepath.Join(countersDir, "port_rcv_data"), path: filepath.Join(countersDir, "port_rcv_data"),
unit: "bytes", unit: ibDataUnit,
scale: 4, unitRates: ibDataRateUnit,
scaleByFourLanes: true,
addToIBTotal: true, addToIBTotal: true,
lastState: -1,
}, },
{ {
// Total number of data octets, divided by 4 (lanes), transmitted on all VLs.
// This is 64 bit counter
name: "ib_xmit", name: "ib_xmit",
path: filepath.Join(countersDir, "port_xmit_data"), path: filepath.Join(countersDir, "port_xmit_data"),
unit: "bytes", unit: ibDataUnit,
scale: 4, unitRates: ibDataRateUnit,
scaleByFourLanes: true,
addToIBTotal: true, addToIBTotal: true,
lastState: -1,
}, },
{ {
// Total number of packets received on all VLs from this port (this may include packets containing Errors.
// This is 64 bit counter.
name: "ib_recv_pkts", name: "ib_recv_pkts",
path: filepath.Join(countersDir, "port_rcv_packets"), path: filepath.Join(countersDir, "port_rcv_packets"),
unit: "packets", unit: ibPkgUnit,
scale: 1, unitRates: ibPkgRateUnit,
addToIBTotalPkgs: true, addToIBTotalPkgs: true,
lastState: -1,
}, },
{ {
// Total number of packets transmitted on all VLs from this port. This may include packets with errors.
// This is 64 bit counter.
name: "ib_xmit_pkts", name: "ib_xmit_pkts",
path: filepath.Join(countersDir, "port_xmit_packets"), path: filepath.Join(countersDir, "port_xmit_packets"),
unit: "packets", unit: ibPkgUnit,
scale: 1, unitRates: ibPkgRateUnit,
addToIBTotalPkgs: true, addToIBTotalPkgs: true,
lastState: -1,
}, },
} }
for _, counter := range portCounterFiles { for _, counter := range portCounterFiles {
@@ -163,7 +178,7 @@ func (m *InfinibandCollector) Init(config json.RawMessage) error {
m.info = append(m.info, m.info = append(m.info,
InfinibandCollectorInfo{ InfinibandCollectorInfo{
LID: LID, lid: LID,
device: device, device: device,
port: port, port: port,
portCounterFiles: portCounterFiles, portCounterFiles: portCounterFiles,
@@ -184,7 +199,7 @@ func (m *InfinibandCollector) Init(config json.RawMessage) error {
return nil return nil
} }
// Read reads Infiniband counter files below IB_BASEPATH // Read reads Infiniband counter files below ibBasePath
func (m *InfinibandCollector) Read(interval time.Duration, output chan lp.CCMessage) { func (m *InfinibandCollector) Read(interval time.Duration, output chan lp.CCMessage) {
// Check if already initialized // Check if already initialized
if !m.init { if !m.init {
@@ -201,36 +216,42 @@ func (m *InfinibandCollector) Read(interval time.Duration, output chan lp.CCMess
for i := range m.info { for i := range m.info {
info := &m.info[i] info := &m.info[i]
var ib_total, ib_total_last_state, var ibTotal, ibTotalPkts uint64 // sum of xmit and recv counters
ib_total_pkts, ib_total_pkts_last_state int64 var ibTotalBw, ibTotalPktsBw float64 // sum of xmit and recv rates
var ib_total_last_state_available, ib_total_pkts_last_state_available bool var ibTotalBwAvailable, ibTotalPktsBwAvailable bool
for i := range info.portCounterFiles { for i := range info.portCounterFiles {
counterDef := &info.portCounterFiles[i] counterDef := &info.portCounterFiles[i]
// Read counter file // Read counter file
line, err := os.ReadFile(counterDef.path) line, err := os.ReadFile(counterDef.path)
if err != nil { if err != nil {
cclog.ComponentError( cclog.ComponentErrorf(
m.name, m.name,
fmt.Sprintf("Read(): Failed to read from file '%s': %v", counterDef.path, err)) "Read(): Failed to read from file '%s': %v", counterDef.path, err)
// Current counter can not be saved as last state
counterDef.lastStateAvailable = false
continue continue
} }
data := strings.TrimSpace(string(line)) data := strings.TrimSpace(string(line))
// convert counter to int64 // convert counter to uint64
v, err := strconv.ParseInt(data, 10, 64) vRawCounter, err := strconv.ParseUint(data, 10, 64)
if err != nil { if err != nil {
cclog.ComponentError( cclog.ComponentErrorf(
m.name, m.name,
fmt.Sprintf("Read(): Failed to convert Infininiband metrice %s='%s' to int64: %v", counterDef.name, data, err)) "Read(): Failed to convert Infininiband metrice %s='%s' to uint64: %v", counterDef.name, data, err)
// Current counter can not be saved as last state
counterDef.lastStateAvailable = false
continue continue
} }
// Scale raw value vScaledCounter := vRawCounter
v *= counterDef.scale if counterDef.scaleByFourLanes {
vScaledCounter *= uint64(4)
}
// Send absolut values // Send absolut values
if m.config.SendAbsoluteValues { if m.config.SendAbsoluteValues {
if y, err := lp.NewMetric(counterDef.name, info.tagSet, m.meta, v, now); err == nil { if y, err := lp.NewMetric(counterDef.name, info.tagSet, m.meta, vScaledCounter, now); err == nil {
y.AddMeta("unit", counterDef.unit) y.AddMeta("unit", counterDef.unit)
output <- y output <- y
} }
@@ -238,63 +259,72 @@ func (m *InfinibandCollector) Read(interval time.Duration, output chan lp.CCMess
// Send derived values // Send derived values
if m.config.SendDerivedValues { if m.config.SendDerivedValues {
if counterDef.lastState >= 0 { if counterDef.lastStateAvailable {
rate := float64((v - counterDef.lastState)) / timeDiff var rate float64
// uint64 subtraction handles wraparound automatically
// in case vRawCounter < counterDef.lastState we would compute:
// math.MaxUint64 - lastState + vRawCounter + 1
// = (2^64 - 1) - lastState + vRawCounter + 1
// = 2^64 - lastState + vRawCounter
// ≡ vRawCounter - lastState (mod 2^64)
rate = float64(vRawCounter-counterDef.lastState) / timeDiff
if counterDef.scaleByFourLanes {
rate *= float64(4)
}
if y, err := lp.NewMetric(counterDef.name+"_bw", info.tagSet, m.meta, rate, now); err == nil { if y, err := lp.NewMetric(counterDef.name+"_bw", info.tagSet, m.meta, rate, now); err == nil {
y.AddMeta("unit", counterDef.unit+"/sec") y.AddMeta("unit", counterDef.unitRates)
output <- y output <- y
} }
// Sum up total values of last state // Sum up rates for total rates
if m.config.SendTotalValues { if m.config.SendTotalValues {
switch { switch {
case counterDef.addToIBTotal: case counterDef.addToIBTotal:
ib_total_last_state += counterDef.lastState ibTotalBw += rate
ib_total_last_state_available = true ibTotalBwAvailable = true
case counterDef.addToIBTotalPkgs: case counterDef.addToIBTotalPkgs:
ib_total_pkts_last_state += counterDef.lastState ibTotalPktsBw += rate
ib_total_pkts_last_state_available = true ibTotalPktsBwAvailable = true
} }
} }
} }
counterDef.lastState = v counterDef.lastState = vRawCounter
counterDef.lastStateAvailable = true
} }
// Sum up total values // Sum up total values
if m.config.SendTotalValues { if m.config.SendTotalValues {
switch { switch {
case counterDef.addToIBTotal: case counterDef.addToIBTotal:
ib_total += v ibTotal += vScaledCounter
case counterDef.addToIBTotalPkgs: case counterDef.addToIBTotalPkgs:
ib_total_pkts += v ibTotalPkts += vScaledCounter
} }
} }
} }
// Send total values // Send total values
if m.config.SendTotalValues { if m.config.SendTotalValues {
if y, err := lp.NewMetric("ib_total", info.tagSet, m.meta, ib_total, now); err == nil { if y, err := lp.NewMetric("ib_total", info.tagSet, m.meta, ibTotal, now); err == nil {
y.AddMeta("unit", "bytes") y.AddMeta("unit", ibDataUnit)
output <- y output <- y
} }
if y, err := lp.NewMetric("ib_total_pkts", info.tagSet, m.meta, ib_total_pkts, now); err == nil { if y, err := lp.NewMetric("ib_total_pkts", info.tagSet, m.meta, ibTotalPkts, now); err == nil {
y.AddMeta("unit", "packets") y.AddMeta("unit", ibPkgUnit)
output <- y output <- y
} }
if m.config.SendDerivedValues && ib_total_last_state_available { if m.config.SendDerivedValues && ibTotalBwAvailable {
rate := float64((ib_total - ib_total_last_state)) / timeDiff if y, err := lp.NewMetric("ib_total_bw", info.tagSet, m.meta, ibTotalBw, now); err == nil {
if y, err := lp.NewMetric("ib_total_bw", info.tagSet, m.meta, rate, now); err == nil { y.AddMeta("unit", ibDataRateUnit)
y.AddMeta("unit", "bytes/sec")
output <- y output <- y
} }
} }
if m.config.SendDerivedValues && ib_total_pkts_last_state_available { if m.config.SendDerivedValues && ibTotalPktsBwAvailable {
rate := float64((ib_total_pkts - ib_total_pkts_last_state)) / timeDiff if y, err := lp.NewMetric("ib_total_pkts_bw", info.tagSet, m.meta, ibTotalPktsBw, now); err == nil {
if y, err := lp.NewMetric("ib_total_pkts_bw", info.tagSet, m.meta, rate, now); err == nil { y.AddMeta("unit", ibPkgRateUnit)
y.AddMeta("unit", "packets/sec")
output <- y output <- y
} }
} }

View File

@@ -145,16 +145,16 @@ func (m *IOstatCollector) Read(interval time.Duration, output chan lp.CCMessage)
file, err := os.Open(IOSTATFILE) file, err := os.Open(IOSTATFILE)
if err != nil { if err != nil {
cclog.ComponentError( cclog.ComponentErrorf(
m.name, m.name,
fmt.Sprintf("Read(): Failed to open file '%s': %v", IOSTATFILE, err)) "Read(): Failed to open file '%s': %v", IOSTATFILE, err)
return return
} }
defer func() { defer func() {
if err := file.Close(); err != nil { if err := file.Close(); err != nil {
cclog.ComponentError( cclog.ComponentErrorf(
m.name, m.name,
fmt.Sprintf("Read(): Failed to close file '%s': %v", IOSTATFILE, err)) "Read(): Failed to close file '%s': %v", IOSTATFILE, err)
} }
}() }()

View File

@@ -12,6 +12,12 @@ package collectors
#cgo LDFLAGS: -Wl,--unresolved-symbols=ignore-in-object-files #cgo LDFLAGS: -Wl,--unresolved-symbols=ignore-in-object-files
#include <stdlib.h> #include <stdlib.h>
#include <likwid.h> #include <likwid.h>
int cc_add_hwthread(int cpu_id) {
return HPMaddThread(cpu_id);
}
*/ */
import "C" import "C"
@@ -261,12 +267,12 @@ func (m *LikwidCollector) Init(config json.RawMessage) error {
} }
for _, metric := range evset.Metrics { for _, metric := range evset.Metrics {
// Try to evaluate the metric // Try to evaluate the metric
cclog.ComponentDebug(m.name, "Checking", metric.Name) cclog.ComponentDebugf(m.name, "Checking %s", metric.Name)
if !checkMetricType(metric.Type) { if !checkMetricType(metric.Type) {
cclog.ComponentError(m.name, "Metric", metric.Name, "uses invalid type", metric.Type) cclog.ComponentErrorf(m.name, "Metric %s uses invalid type %s", metric.Name, metric.Type)
metric.Calc = "" metric.Calc = ""
} else if !testLikwidMetricFormula(metric.Calc, params) { } else if !testLikwidMetricFormula(metric.Calc, params) {
cclog.ComponentError(m.name, "Metric", metric.Name, "cannot be calculated with given counters") cclog.ComponentError(m.name, "Metric %s cannot be calculated with given counters", metric.Name)
metric.Calc = "" metric.Calc = ""
} else { } else {
globalParams = append(globalParams, metric.Name) globalParams = append(globalParams, metric.Name)
@@ -281,13 +287,13 @@ func (m *LikwidCollector) Init(config json.RawMessage) error {
for _, metric := range m.config.Metrics { for _, metric := range m.config.Metrics {
// Try to evaluate the global metric // Try to evaluate the global metric
if !checkMetricType(metric.Type) { if !checkMetricType(metric.Type) {
cclog.ComponentError(m.name, "Metric", metric.Name, "uses invalid type", metric.Type) cclog.ComponentErrorf(m.name, "Metric %s uses invalid type %s", metric.Name, metric.Type)
metric.Calc = "" metric.Calc = ""
} else if !testLikwidMetricFormula(metric.Calc, globalParams) { } else if !testLikwidMetricFormula(metric.Calc, globalParams) {
cclog.ComponentError(m.name, "Metric", metric.Name, "cannot be calculated with given counters") cclog.ComponentError(m.name, "Metric %s cannot be calculated with given counters", metric.Name)
metric.Calc = "" metric.Calc = ""
} else if !checkMetricType(metric.Type) { } else if !checkMetricType(metric.Type) {
cclog.ComponentError(m.name, "Metric", metric.Name, "has invalid type") cclog.ComponentError(m.name, "Metric %s has invalid type", metric.Name)
metric.Calc = "" metric.Calc = ""
} else { } else {
totalMetrics++ totalMetrics++
@@ -328,7 +334,7 @@ func (m *LikwidCollector) Init(config json.RawMessage) error {
for _, c := range m.cpulist { for _, c := range m.cpulist {
m.measureThread.Call( m.measureThread.Call(
func() { func() {
retCode := C.HPMaddThread(C.uint32_t(c)) retCode := C.cc_add_hwthread(C.int(c))
if retCode != 0 { if retCode != 0 {
err := fmt.Errorf("C.HPMaddThread(%v) failed with return code %v", c, retCode) err := fmt.Errorf("C.HPMaddThread(%v) failed with return code %v", c, retCode)
cclog.ComponentError(m.name, err.Error()) cclog.ComponentError(m.name, err.Error())
@@ -375,16 +381,16 @@ func (m *LikwidCollector) takeMeasurement(evidx int, evset LikwidEventsetConfig,
// Watch changes for the lock file () // Watch changes for the lock file ()
watcher, err := fsnotify.NewWatcher() watcher, err := fsnotify.NewWatcher()
if err != nil { if err != nil {
cclog.ComponentError( cclog.ComponentErrorf(
m.name, m.name,
fmt.Sprintf("takeMeasurement(): Failed to create a new fsnotify.Watcher: %v", err)) "takeMeasurement(): Failed to create a new fsnotify.Watcher: %v", err)
return true, err return true, err
} }
defer func() { defer func() {
if err := watcher.Close(); err != nil { if err := watcher.Close(); err != nil {
cclog.ComponentError( cclog.ComponentErrorf(
m.name, m.name,
fmt.Sprintf("takeMeasurement(): Failed to close fsnotify.Watcher: %v", err)) "takeMeasurement(): Failed to close fsnotify.Watcher: %v", err)
} }
}() }()
if len(m.config.LockfilePath) > 0 { if len(m.config.LockfilePath) > 0 {
@@ -597,7 +603,7 @@ func (m *LikwidCollector) calcEventsetMetrics(evset LikwidEventsetConfig, interv
if tid >= 0 && len(metric.Calc) > 0 { if tid >= 0 && len(metric.Calc) > 0 {
value, err := agg.EvalFloat64Condition(metric.Calc, evset.results[tid]) value, err := agg.EvalFloat64Condition(metric.Calc, evset.results[tid])
if err != nil { if err != nil {
cclog.ComponentError(m.name, "Calculation for metric", metric.Name, "failed:", err.Error()) cclog.ComponentErrorf(m.name, "Calculation for metric %s failed: %s", metric.Name, err.Error())
value = 0.0 value = 0.0
} }
if m.config.InvalidToZero && (math.IsNaN(value) || math.IsInf(value, 0)) { if m.config.InvalidToZero && (math.IsNaN(value) || math.IsInf(value, 0)) {
@@ -762,7 +768,7 @@ func (m *LikwidCollector) calcGlobalMetrics(groups []LikwidEventsetConfig, inter
// Evaluate the metric // Evaluate the metric
value, err := agg.EvalFloat64Condition(metric.Calc, params) value, err := agg.EvalFloat64Condition(metric.Calc, params)
if err != nil { if err != nil {
cclog.ComponentError(m.name, "Calculation for metric", metric.Name, "failed:", err.Error()) cclog.ComponentErrorf(m.name, "Calculation for metric %s failed: %s", metric.Name, err.Error())
value = 0.0 value = 0.0
} }
if m.config.InvalidToZero && (math.IsNaN(value) || math.IsInf(value, 0)) { if m.config.InvalidToZero && (math.IsNaN(value) || math.IsInf(value, 0)) {

View File

@@ -89,9 +89,9 @@ func (m *LoadavgCollector) Read(interval time.Duration, output chan lp.CCMessage
} }
buffer, err := os.ReadFile(LOADAVGFILE) buffer, err := os.ReadFile(LOADAVGFILE)
if err != nil { if err != nil {
cclog.ComponentError( cclog.ComponentErrorf(
m.name, m.name,
fmt.Sprintf("Read(): Failed to read file '%s': %v", LOADAVGFILE, err)) "Read(): Failed to read file '%s': %v", LOADAVGFILE, err)
return return
} }
now := time.Now() now := time.Now()
@@ -101,9 +101,9 @@ func (m *LoadavgCollector) Read(interval time.Duration, output chan lp.CCMessage
for i, name := range m.load_matches { for i, name := range m.load_matches {
x, err := strconv.ParseFloat(ls[i], 64) x, err := strconv.ParseFloat(ls[i], 64)
if err != nil { if err != nil {
cclog.ComponentError( cclog.ComponentErrorf(
m.name, m.name,
fmt.Sprintf("Read(): Failed to convert '%s' to float64: %v", ls[i], err)) "Read(): Failed to convert '%s' to float64: %v", ls[i], err)
continue continue
} }
if m.load_skips[i] { if m.load_skips[i] {
@@ -120,9 +120,9 @@ func (m *LoadavgCollector) Read(interval time.Duration, output chan lp.CCMessage
for i, name := range m.proc_matches { for i, name := range m.proc_matches {
x, err := strconv.ParseInt(lv[i], 10, 64) x, err := strconv.ParseInt(lv[i], 10, 64)
if err != nil { if err != nil {
cclog.ComponentError( cclog.ComponentErrorf(
m.name, m.name,
fmt.Sprintf("Read(): Failed to convert '%s' to float64: %v", lv[i], err)) "Read(): Failed to convert '%s' to float64: %v", lv[i], err)
continue continue
} }
if m.proc_skips[i] { if m.proc_skips[i] {

View File

@@ -84,7 +84,7 @@ func getStats(filename string) map[string]MemstatStats {
case 5: case 5:
v, err := strconv.ParseFloat(linefields[3], 64) v, err := strconv.ParseFloat(linefields[3], 64)
if err == nil { if err == nil {
cclog.ComponentDebug("getStats", strings.Trim(linefields[2], ":"), v, linefields[4]) cclog.ComponentDebug("MemstatCollector", "getStats %s value %v unit %s", strings.Trim(linefields[2], ":"), v, linefields[4])
stats[strings.Trim(linefields[2], ":")] = MemstatStats{ stats[strings.Trim(linefields[2], ":")] = MemstatStats{
value: v, value: v,
unit: linefields[4], unit: linefields[4],

View File

@@ -222,16 +222,16 @@ func (m *NetstatCollector) Read(interval time.Duration, output chan lp.CCMessage
file, err := os.Open(NETSTATFILE) file, err := os.Open(NETSTATFILE)
if err != nil { if err != nil {
cclog.ComponentError( cclog.ComponentErrorf(
m.name, m.name,
fmt.Sprintf("Read(): Failed to open file '%s': %v", NETSTATFILE, err)) "Read(): Failed to open file '%s': %v", NETSTATFILE, err)
return return
} }
defer func() { defer func() {
if err := file.Close(); err != nil { if err := file.Close(); err != nil {
cclog.ComponentError( cclog.ComponentErrorf(
m.name, m.name,
fmt.Sprintf("Read(): Failed to close file '%s': %v", NETSTATFILE, err)) "Read(): Failed to close file '%s': %v", NETSTATFILE, err)
} }
}() }()

View File

@@ -125,10 +125,9 @@ func (m *nfsCollector) Read(interval time.Duration, output chan lp.CCMessage) {
timestamp := time.Now() timestamp := time.Now()
if err := m.updateStats(); err != nil { if err := m.updateStats(); err != nil {
cclog.ComponentError( cclog.ComponentErrorf(
m.name, m.name,
fmt.Sprintf("Read(): updateStats() failed: %v", err), "Read(): updateStats() failed: %v", err)
)
return return
} }
var prefix string var prefix string

View File

@@ -117,7 +117,7 @@ func (m *NUMAStatsCollector) Init(config json.RawMessage) error {
} }
// Initialized // Initialized
cclog.ComponentDebug(m.name, "initialized", len(m.topology), "NUMA domains") cclog.ComponentDebugf(m.name, "initialized %d NUMA domains", len(m.topology))
m.init = true m.init = true
return nil return nil
} }

View File

@@ -113,7 +113,7 @@ func (m *NvidiaCollector) Init(config json.RawMessage) error {
// Skip excluded devices by ID // Skip excluded devices by ID
str_i := strconv.Itoa(i) str_i := strconv.Itoa(i)
if slices.Contains(m.config.ExcludeDevices, str_i) { if slices.Contains(m.config.ExcludeDevices, str_i) {
cclog.ComponentDebug(m.name, "Skipping excluded device", str_i) cclog.ComponentDebugf(m.name, "Skipping excluded device %s", str_i)
continue continue
} }
@@ -121,7 +121,7 @@ func (m *NvidiaCollector) Init(config json.RawMessage) error {
device, ret := nvml.DeviceGetHandleByIndex(i) device, ret := nvml.DeviceGetHandleByIndex(i)
if ret != nvml.SUCCESS { if ret != nvml.SUCCESS {
err = errors.New(nvml.ErrorString(ret)) err = errors.New(nvml.ErrorString(ret))
cclog.ComponentError(m.name, "Unable to get device at index", i, ":", err.Error()) cclog.ComponentErrorf(m.name, "Unable to get device at index %d: %s", i, err.Error())
continue continue
} }
@@ -129,7 +129,7 @@ func (m *NvidiaCollector) Init(config json.RawMessage) error {
pciInfo, ret := nvml.DeviceGetPciInfo(device) pciInfo, ret := nvml.DeviceGetPciInfo(device)
if ret != nvml.SUCCESS { if ret != nvml.SUCCESS {
err = errors.New(nvml.ErrorString(ret)) err = errors.New(nvml.ErrorString(ret))
cclog.ComponentError(m.name, "Unable to get PCI info for device at index", i, ":", err.Error()) cclog.ComponentErrorf(m.name, "Unable to get PCI info for device at index %d: %s", i, err.Error())
continue continue
} }
// Create PCI ID in the common format used by the NVML. // Create PCI ID in the common format used by the NVML.
@@ -141,7 +141,7 @@ func (m *NvidiaCollector) Init(config json.RawMessage) error {
// Skip excluded devices specified by PCI ID // Skip excluded devices specified by PCI ID
if slices.Contains(m.config.ExcludeDevices, pci_id) { if slices.Contains(m.config.ExcludeDevices, pci_id) {
cclog.ComponentDebug(m.name, "Skipping excluded device", pci_id) cclog.ComponentDebugf(m.name, "Skipping excluded device %s", pci_id)
continue continue
} }
@@ -183,7 +183,7 @@ func (m *NvidiaCollector) Init(config json.RawMessage) error {
if m.config.AddBoardNumberMeta { if m.config.AddBoardNumberMeta {
board, ret := nvml.DeviceGetBoardPartNumber(device) board, ret := nvml.DeviceGetBoardPartNumber(device)
if ret != nvml.SUCCESS { if ret != nvml.SUCCESS {
cclog.ComponentError(m.name, "Unable to get boart part number for device at index", i, ":", err.Error()) cclog.ComponentErrorf(m.name, "Unable to get boart part number for device at index %d: %s", i, err.Error())
} else { } else {
g.meta["board_number"] = board g.meta["board_number"] = board
} }
@@ -191,7 +191,7 @@ func (m *NvidiaCollector) Init(config json.RawMessage) error {
if m.config.AddSerialMeta { if m.config.AddSerialMeta {
serial, ret := nvml.DeviceGetSerial(device) serial, ret := nvml.DeviceGetSerial(device)
if ret != nvml.SUCCESS { if ret != nvml.SUCCESS {
cclog.ComponentError(m.name, "Unable to get serial number for device at index", i, ":", err.Error()) cclog.ComponentErrorf(m.name, "Unable to get serial number for device at index %d: %s", i, err.Error())
} else { } else {
g.meta["serial"] = serial g.meta["serial"] = serial
} }
@@ -199,7 +199,7 @@ func (m *NvidiaCollector) Init(config json.RawMessage) error {
if m.config.AddUuidMeta { if m.config.AddUuidMeta {
uuid, ret := nvml.DeviceGetUUID(device) uuid, ret := nvml.DeviceGetUUID(device)
if ret != nvml.SUCCESS { if ret != nvml.SUCCESS {
cclog.ComponentError(m.name, "Unable to get UUID for device at index", i, ":", err.Error()) cclog.ComponentErrorf(m.name, "Unable to get UUID for device at index %d: %s", i, err.Error())
} else { } else {
g.meta["uuid"] = uuid g.meta["uuid"] = uuid
} }
@@ -1128,97 +1128,97 @@ func (m *NvidiaCollector) Read(interval time.Duration, output chan lp.CCMessage)
} }
err = readMemoryInfo(device, output) err = readMemoryInfo(device, output)
if err != nil { if err != nil {
cclog.ComponentDebug(m.name, "readMemoryInfo for device", name, "failed") cclog.ComponentDebugf(m.name, "readMemoryInfo for device %s failed", name)
} }
err = readUtilization(device, output) err = readUtilization(device, output)
if err != nil { if err != nil {
cclog.ComponentDebug(m.name, "readUtilization for device", name, "failed") cclog.ComponentDebugf(m.name, "readUtilization for device %s failed", name)
} }
err = readTemp(device, output) err = readTemp(device, output)
if err != nil { if err != nil {
cclog.ComponentDebug(m.name, "readTemp for device", name, "failed") cclog.ComponentDebugf(m.name, "readTemp for device %s failed", name)
} }
err = readFan(device, output) err = readFan(device, output)
if err != nil { if err != nil {
cclog.ComponentDebug(m.name, "readFan for device", name, "failed") cclog.ComponentDebugf(m.name, "readFan for device %s failed", name)
} }
err = readEccMode(device, output) err = readEccMode(device, output)
if err != nil { if err != nil {
cclog.ComponentDebug(m.name, "readEccMode for device", name, "failed") cclog.ComponentDebugf(m.name, "readEccMode for device %s failed", name)
} }
err = readPerfState(device, output) err = readPerfState(device, output)
if err != nil { if err != nil {
cclog.ComponentDebug(m.name, "readPerfState for device", name, "failed") cclog.ComponentDebugf(m.name, "readPerfState for device %s failed", name)
} }
err = readPowerUsage(device, output) err = readPowerUsage(device, output)
if err != nil { if err != nil {
cclog.ComponentDebug(m.name, "readPowerUsage for device", name, "failed") cclog.ComponentDebugf(m.name, "readPowerUsage for device %s failed", name)
} }
err = readEnergyConsumption(device, output) err = readEnergyConsumption(device, output)
if err != nil { if err != nil {
cclog.ComponentDebug(m.name, "readEnergyConsumption for device", name, "failed") cclog.ComponentDebugf(m.name, "readEnergyConsumption for device %s failed", name)
} }
err = readClocks(device, output) err = readClocks(device, output)
if err != nil { if err != nil {
cclog.ComponentDebug(m.name, "readClocks for device", name, "failed") cclog.ComponentDebugf(m.name, "readClocks for device %s failed", name)
} }
err = readMaxClocks(device, output) err = readMaxClocks(device, output)
if err != nil { if err != nil {
cclog.ComponentDebug(m.name, "readMaxClocks for device", name, "failed") cclog.ComponentDebugf(m.name, "readMaxClocks for device %s failed", name)
} }
err = readEccErrors(device, output) err = readEccErrors(device, output)
if err != nil { if err != nil {
cclog.ComponentDebug(m.name, "readEccErrors for device", name, "failed") cclog.ComponentDebugf(m.name, "readEccErrors for device %s failed", name)
} }
err = readPowerLimit(device, output) err = readPowerLimit(device, output)
if err != nil { if err != nil {
cclog.ComponentDebug(m.name, "readPowerLimit for device", name, "failed") cclog.ComponentDebugf(m.name, "readPowerLimit for device %s failed", name)
} }
err = readEncUtilization(device, output) err = readEncUtilization(device, output)
if err != nil { if err != nil {
cclog.ComponentDebug(m.name, "readEncUtilization for device", name, "failed") cclog.ComponentDebugf(m.name, "readEncUtilization for device %s failed", name)
} }
err = readDecUtilization(device, output) err = readDecUtilization(device, output)
if err != nil { if err != nil {
cclog.ComponentDebug(m.name, "readDecUtilization for device", name, "failed") cclog.ComponentDebugf(m.name, "readDecUtilization for device %s failed", name)
} }
err = readRemappedRows(device, output) err = readRemappedRows(device, output)
if err != nil { if err != nil {
cclog.ComponentDebug(m.name, "readRemappedRows for device", name, "failed") cclog.ComponentDebugf(m.name, "readRemappedRows for device %s failed", name)
} }
err = readBarMemoryInfo(device, output) err = readBarMemoryInfo(device, output)
if err != nil { if err != nil {
cclog.ComponentDebug(m.name, "readBarMemoryInfo for device", name, "failed") cclog.ComponentDebugf(m.name, "readBarMemoryInfo for device %s failed", name)
} }
err = readProcessCounts(device, output) err = readProcessCounts(device, output)
if err != nil { if err != nil {
cclog.ComponentDebug(m.name, "readProcessCounts for device", name, "failed") cclog.ComponentDebugf(m.name, "readProcessCounts for device %s failed", name)
} }
err = readViolationStats(device, output) err = readViolationStats(device, output)
if err != nil { if err != nil {
cclog.ComponentDebug(m.name, "readViolationStats for device", name, "failed") cclog.ComponentDebugf(m.name, "readViolationStats for device %s failed", name)
} }
err = readNVLinkStats(device, output) err = readNVLinkStats(device, output)
if err != nil { if err != nil {
cclog.ComponentDebug(m.name, "readNVLinkStats for device", name, "failed") cclog.ComponentDebugf(m.name, "readNVLinkStats for device %s failed", name)
} }
} }
@@ -1244,7 +1244,7 @@ func (m *NvidiaCollector) Read(interval time.Duration, output chan lp.CCMessage)
if maxMig == 0 { if maxMig == 0 {
continue continue
} }
cclog.ComponentDebug(m.name, "Reading MIG devices for GPU", i) cclog.ComponentDebugf(m.name, "Reading MIG devices for GPU %d", i)
for j := range maxMig { for j := range maxMig {
mdev, ret := nvml.DeviceGetMigDeviceHandleByIndex(m.gpus[i].device, j) mdev, ret := nvml.DeviceGetMigDeviceHandleByIndex(m.gpus[i].device, j)
@@ -1268,7 +1268,7 @@ func (m *NvidiaCollector) Read(interval time.Duration, output chan lp.CCMessage)
if m.config.UseUuidForMigDevices { if m.config.UseUuidForMigDevices {
uuid, ret := nvml.DeviceGetUUID(mdev) uuid, ret := nvml.DeviceGetUUID(mdev)
if ret != nvml.SUCCESS { if ret != nvml.SUCCESS {
cclog.ComponentError(m.name, "Unable to get UUID for mig device at index", j, ":", err.Error()) cclog.ComponentErrorf(m.name, "Unable to get UUID for mig device at index %d: %s", j, err.Error())
} else { } else {
migDevice.tags["stype-id"] = uuid migDevice.tags["stype-id"] = uuid
} }

View File

@@ -208,11 +208,10 @@ func (m *RAPLCollector) Init(config json.RawMessage) error {
} }
// Initialized // Initialized
cclog.ComponentDebug( cclog.ComponentDebugf(
m.name, m.name,
"initialized", "initialized %d zones with running average power limit (RAPL) monitoring attributes",
len(m.RAPLZoneInfo), len(m.RAPLZoneInfo))
"zones with running average power limit (RAPL) monitoring attributes")
m.init = true m.init = true
return err return err

View File

@@ -124,7 +124,7 @@ func (m *RocmSmiCollector) Init(config json.RawMessage) error {
if m.config.AddSerialMeta { if m.config.AddSerialMeta {
serial, ret := rocm_smi.DeviceGetSerialNumber(device) serial, ret := rocm_smi.DeviceGetSerialNumber(device)
if ret != rocm_smi.STATUS_SUCCESS { if ret != rocm_smi.STATUS_SUCCESS {
cclog.ComponentError(m.name, "Unable to get serial number for device at index", i, ":", rocm_smi.StatusStringNoError(ret)) cclog.ComponentErrorf(m.name, "Unable to get serial number for device at index %d: %s", i, rocm_smi.StatusStringNoError(ret))
} else { } else {
dev.meta["serial"] = serial dev.meta["serial"] = serial
} }
@@ -152,7 +152,7 @@ func (m *RocmSmiCollector) Read(interval time.Duration, output chan lp.CCMessage
for _, dev := range m.devices { for _, dev := range m.devices {
metrics, ret := rocm_smi.DeviceGetMetrics(dev.device) metrics, ret := rocm_smi.DeviceGetMetrics(dev.device)
if ret != rocm_smi.STATUS_SUCCESS { if ret != rocm_smi.STATUS_SUCCESS {
cclog.ComponentError(m.name, "Unable to get metrics for device at index", dev.index, ":", rocm_smi.StatusStringNoError(ret)) cclog.ComponentErrorf(m.name, "Unable to get metrics for device at index %d: %s", dev.index, rocm_smi.StatusStringNoError(ret))
continue continue
} }

View File

@@ -147,15 +147,15 @@ func (m *SchedstatCollector) Read(interval time.Duration, output chan lp.CCMessa
file, err := os.Open(SCHEDSTATFILE) file, err := os.Open(SCHEDSTATFILE)
if err != nil { if err != nil {
cclog.ComponentError( cclog.ComponentErrorf(
m.name, m.name,
fmt.Sprintf("Read(): Failed to open file '%s': %v", SCHEDSTATFILE, err)) "Read(): Failed to open file '%s': %v", SCHEDSTATFILE, err)
} }
defer func() { defer func() {
if err := file.Close(); err != nil { if err := file.Close(); err != nil {
cclog.ComponentError( cclog.ComponentErrorf(
m.name, m.name,
fmt.Sprintf("Read(): Failed to close file '%s': %v", SCHEDSTATFILE, err)) "Read(): Failed to close file '%s': %v", SCHEDSTATFILE, err)
} }
}() }()

View File

@@ -240,7 +240,7 @@ func (m *SlurmCgroupCollector) Read(interval time.Duration, output chan lp.CCMes
globPattern := filepath.Join(m.cgroupBase, "job_*") globPattern := filepath.Join(m.cgroupBase, "job_*")
jobDirs, err := filepath.Glob(globPattern) jobDirs, err := filepath.Glob(globPattern)
if err != nil { if err != nil {
cclog.ComponentError(m.name, "Error globbing job directories:", err.Error()) cclog.ComponentErrorf(m.name, "Error globbing job directories: %s", err.Error())
return return
} }
@@ -249,7 +249,7 @@ func (m *SlurmCgroupCollector) Read(interval time.Duration, output chan lp.CCMes
jobdata, err := m.ReadJobData(jKey) jobdata, err := m.ReadJobData(jKey)
if err != nil { if err != nil {
cclog.ComponentError(m.name, "Error reading job data for", jKey, ":", err.Error()) cclog.ComponentError(m.name, "Error reading job data for %s: %s", jKey, err.Error())
continue continue
} }

View File

@@ -228,12 +228,12 @@ func (m *SmartMonCollector) Read(interval time.Duration, output chan lp.CCMessag
stdout, err := command.Output() stdout, err := command.Output()
if err != nil { if err != nil {
cclog.ComponentError(m.name, "cannot read data for device", d.Name) cclog.ComponentErrorf(m.name, "cannot read data for device %s", d.Name)
continue continue
} }
err = json.Unmarshal(stdout, &data) err = json.Unmarshal(stdout, &data)
if err != nil { if err != nil {
cclog.ComponentError(m.name, "cannot unmarshal data for device", d.Name) cclog.ComponentErrorf(m.name, "cannot unmarshal data for device %s", d.Name)
continue continue
} }
if !m.excludeMetric.temp { if !m.excludeMetric.temp {

View File

@@ -188,16 +188,16 @@ func (m *TempCollector) Read(interval time.Duration, output chan lp.CCMessage) {
// Read sensor file // Read sensor file
buffer, err := os.ReadFile(sensor.file) buffer, err := os.ReadFile(sensor.file)
if err != nil { if err != nil {
cclog.ComponentError( cclog.ComponentErrorf(
m.name, m.name,
fmt.Sprintf("Read(): Failed to read file '%s': %v", sensor.file, err)) "Read(): Failed to read file '%s': %v", sensor.file, err)
continue continue
} }
x, err := strconv.ParseInt(strings.TrimSpace(string(buffer)), 10, 64) x, err := strconv.ParseInt(strings.TrimSpace(string(buffer)), 10, 64)
if err != nil { if err != nil {
cclog.ComponentError( cclog.ComponentErrorf(
m.name, m.name,
fmt.Sprintf("Read(): Failed to convert temperature '%s' to int64: %v", buffer, err)) "Read(): Failed to convert temperature '%s' to int64: %v", buffer, err)
continue continue
} }
x /= 1000 x /= 1000

View File

@@ -77,9 +77,9 @@ func (m *TopProcsCollector) Read(interval time.Duration, output chan lp.CCMessag
command := exec.Command("ps", "-Ao", "comm", "--sort=-pcpu") command := exec.Command("ps", "-Ao", "comm", "--sort=-pcpu")
stdout, err := command.Output() stdout, err := command.Output()
if err != nil { if err != nil {
cclog.ComponentError( cclog.ComponentErrorf(
m.name, m.name,
fmt.Sprintf("Read(): Failed to read output from command \"%s\": %v", command.String(), err)) "Read(): Failed to read output from command \"%s\": %v", command.String(), err)
return return
} }

2
go.mod
View File

@@ -9,7 +9,7 @@ require (
github.com/PaesslerAG/gval v1.2.4 github.com/PaesslerAG/gval v1.2.4
github.com/fsnotify/fsnotify v1.10.1 github.com/fsnotify/fsnotify v1.10.1
github.com/tklauser/go-sysconf v0.4.0 github.com/tklauser/go-sysconf v0.4.0
golang.design/x/thread v0.0.0-20210122121316-335e9adffdf1 golang.design/x/thread v0.3.2
golang.org/x/sys v0.45.0 golang.org/x/sys v0.45.0
) )

5
go.sum
View File

@@ -173,8 +173,8 @@ go.uber.org/goleak v1.3.0 h1:2K3zAYmnTNqV73imy9J1T3WC+gmCePx2hEGkimedGto=
go.uber.org/goleak v1.3.0/go.mod h1:CoHD4mav9JJNrW/WLlf7HGZPjdw8EucARQHekz1X6bE= go.uber.org/goleak v1.3.0/go.mod h1:CoHD4mav9JJNrW/WLlf7HGZPjdw8EucARQHekz1X6bE=
go.yaml.in/yaml/v2 v2.4.4 h1:tuyd0P+2Ont/d6e2rl3be67goVK4R6deVxCUX5vyPaQ= go.yaml.in/yaml/v2 v2.4.4 h1:tuyd0P+2Ont/d6e2rl3be67goVK4R6deVxCUX5vyPaQ=
go.yaml.in/yaml/v2 v2.4.4/go.mod h1:gMZqIpDtDqOfM0uNfy0SkpRhvUryYH0Z6wdMYcacYXQ= go.yaml.in/yaml/v2 v2.4.4/go.mod h1:gMZqIpDtDqOfM0uNfy0SkpRhvUryYH0Z6wdMYcacYXQ=
golang.design/x/thread v0.0.0-20210122121316-335e9adffdf1 h1:P7S/GeHBAFEZIYp0ePPs2kHXoazz8q2KsyxHyQVGCJg= golang.design/x/thread v0.3.2 h1:FmD1glspGrQCe6FuQLmSrT6wz2CSzq7vKVDluyiMnqo=
golang.design/x/thread v0.0.0-20210122121316-335e9adffdf1/go.mod h1:9CWpnTUmlQkfdpdutA1nNf4iE5lAVt3QZOu0Z6hahBE= golang.design/x/thread v0.3.2/go.mod h1:6+Hi2rMOgMHZdKDWaqNHyWtoFUx1HxZ06LfHPh5Z/hQ=
golang.org/x/crypto v0.50.0 h1:zO47/JPrL6vsNkINmLoo/PH1gcxpls50DNogFvB5ZGI= golang.org/x/crypto v0.50.0 h1:zO47/JPrL6vsNkINmLoo/PH1gcxpls50DNogFvB5ZGI=
golang.org/x/crypto v0.50.0/go.mod h1:3muZ7vA7PBCE6xgPX7nkzzjiUq87kRItoJQM1Yo8S+Q= golang.org/x/crypto v0.50.0/go.mod h1:3muZ7vA7PBCE6xgPX7nkzzjiUq87kRItoJQM1Yo8S+Q=
golang.org/x/exp v0.0.0-20231005195138-3e424a577f31 h1:9k5exFQKQglLo+RoP+4zMjOFE14P6+vyR0baDAi0Rcs= golang.org/x/exp v0.0.0-20231005195138-3e424a577f31 h1:9k5exFQKQglLo+RoP+4zMjOFE14P6+vyR0baDAi0Rcs=
@@ -183,7 +183,6 @@ golang.org/x/mod v0.13.0 h1:I/DsJXRlw/8l/0c24sM9yb0T4z9liZTduXvdAWYiysY=
golang.org/x/mod v0.13.0/go.mod h1:hTbmBsO62+eylJbnUtE2MGJUyE7QWk4xUqPFrRgJ+7c= golang.org/x/mod v0.13.0/go.mod h1:hTbmBsO62+eylJbnUtE2MGJUyE7QWk4xUqPFrRgJ+7c=
golang.org/x/net v0.53.0 h1:d+qAbo5L0orcWAr0a9JweQpjXF19LMXJE8Ey7hwOdUA= golang.org/x/net v0.53.0 h1:d+qAbo5L0orcWAr0a9JweQpjXF19LMXJE8Ey7hwOdUA=
golang.org/x/net v0.53.0/go.mod h1:JvMuJH7rrdiCfbeHoo3fCQU24Lf5JJwT9W3sJFulfgs= golang.org/x/net v0.53.0/go.mod h1:JvMuJH7rrdiCfbeHoo3fCQU24Lf5JJwT9W3sJFulfgs=
golang.org/x/sys v0.0.0-20210122093101-04d7465088b8/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
golang.org/x/sys v0.45.0 h1:dO4czNzziLiiXplLQgBCEpCvXQ3dnkn0SdaZSYdQ+FY= golang.org/x/sys v0.45.0 h1:dO4czNzziLiiXplLQgBCEpCvXQ3dnkn0SdaZSYdQ+FY=
golang.org/x/sys v0.45.0/go.mod h1:4GL1E5IUh+htKOUEOaiffhrAeqysfVGipDYzABqnCmw= golang.org/x/sys v0.45.0/go.mod h1:4GL1E5IUh+htKOUEOaiffhrAeqysfVGipDYzABqnCmw=
golang.org/x/time v0.15.0 h1:bbrp8t3bGUeFOx08pvsMYRTCVSMk89u4tKbNOZbp88U= golang.org/x/time v0.15.0 h1:bbrp8t3bGUeFOx08pvsMYRTCVSMk89u4tKbNOZbp88U=

View File

@@ -35,18 +35,18 @@ type metricRouterTagConfig struct {
// Metric router configuration // Metric router configuration
type metricRouterConfig struct { type metricRouterConfig struct {
HostnameTagName string `json:"hostname_tag"` // Key name used when adding the hostname to a metric (default 'hostname') HostnameTagName string `json:"hostname_tag,omitempty"` // Key name used when adding the hostname to a metric (default 'hostname')
AddTags []metricRouterTagConfig `json:"add_tags"` // List of tags that are added when the condition is met AddTags []metricRouterTagConfig `json:"add_tags,omitempty"` // List of tags that are added when the condition is met
DelTags []metricRouterTagConfig `json:"delete_tags"` // List of tags that are removed when the condition is met DelTags []metricRouterTagConfig `json:"delete_tags,omitempty"` // List of tags that are removed when the condition is met
IntervalAgg []agg.MetricAggregatorIntervalConfig `json:"interval_aggregates"` // List of aggregation function processed at the end of an interval IntervalAgg []agg.MetricAggregatorIntervalConfig `json:"interval_aggregates,omitempty"` // List of aggregation function processed at the end of an interval
DropMetrics []string `json:"drop_metrics"` // List of metric names to drop. For fine-grained dropping use drop_metrics_if DropMetrics []string `json:"drop_metrics,omitempty"` // List of metric names to drop. For fine-grained dropping use drop_metrics_if
DropMetricsIf []string `json:"drop_metrics_if"` // List of evaluatable terms to drop metrics DropMetricsIf []string `json:"drop_metrics_if,omitempty"` // List of evaluatable terms to drop metrics
RenameMetrics map[string]string `json:"rename_metrics"` // Map to rename metric name from key to value RenameMetrics map[string]string `json:"rename_metrics,omitempty"` // Map to rename metric name from key to value
IntervalStamp bool `json:"interval_timestamp"` // Update timestamp periodically by ticker each interval? IntervalStamp bool `json:"interval_timestamp,omitempty"` // Update timestamp periodically by ticker each interval?
NumCacheIntervals int `json:"num_cache_intervals"` // Number of intervals of cached metrics for evaluation NumCacheIntervals int `json:"num_cache_intervals,omitempty"` // Number of intervals of cached metrics for evaluation
MaxForward int `json:"max_forward"` // Number of maximal forwarded metrics at one select MaxForward int `json:"max_forward,omitempty"` // Number of maximal forwarded metrics at one select
NormalizeUnits bool `json:"normalize_units"` // Check unit meta flag and normalize it using cc-units NormalizeUnits bool `json:"normalize_units,omitempty"` // Check unit meta flag and normalize it using cc-units
ChangeUnitPrefix map[string]string `json:"change_unit_prefix"` // Add prefix that should be applied to the metrics ChangeUnitPrefix map[string]string `json:"change_unit_prefix,omitempty"` // Add prefix that should be applied to the metrics
MessageProcessor json.RawMessage `json:"process_messages,omitempty"` MessageProcessor json.RawMessage `json:"process_messages,omitempty"`
} }
@@ -297,7 +297,7 @@ func (r *metricRouter) Start() {
case timestamp := <-timeChan: case timestamp := <-timeChan:
r.timestamp = timestamp r.timestamp = timestamp
cclog.ComponentDebug("MetricRouter", "Update timestamp", r.timestamp.UnixNano()) cclog.ComponentDebugf("MetricRouter", "Update timestamp %d", r.timestamp.UnixNano())
case p := <-r.coll_input: case p := <-r.coll_input:
coll_forward(p) coll_forward(p)