mirror of
https://github.com/ClusterCockpit/cc-metric-collector.git
synced 2026-06-09 21:37:29 +02:00
Compare commits
5 Commits
nvidia_gpm
...
cclog_upda
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
c9dfca622f | ||
|
|
3b0638e815 | ||
|
|
037b4f1526 | ||
|
|
bed5491068 | ||
|
|
a2eba41150 |
@@ -132,11 +132,11 @@ func mainFunc() int {
|
||||
if len(rcfg.ConfigFile.Interval) > 0 {
|
||||
t, err := time.ParseDuration(rcfg.ConfigFile.Interval)
|
||||
if err != nil {
|
||||
cclog.Error("Configuration value 'interval' no valid duration")
|
||||
cclog.Errorf("Configuration value interval=%s no valid duration", rcfg.ConfigFile.Interval)
|
||||
}
|
||||
rcfg.Interval = t
|
||||
if rcfg.Interval == 0 {
|
||||
cclog.Error("Configuration value 'interval' must be greater than zero")
|
||||
cclog.Errorf("Configuration value interval=%s must be greater than zero", rcfg.ConfigFile.Interval)
|
||||
return 1
|
||||
}
|
||||
}
|
||||
@@ -145,11 +145,11 @@ func mainFunc() int {
|
||||
if len(rcfg.ConfigFile.Duration) > 0 {
|
||||
t, err := time.ParseDuration(rcfg.ConfigFile.Duration)
|
||||
if err != nil {
|
||||
cclog.Error("Configuration value 'duration' no valid duration")
|
||||
cclog.Error("Configuration value duration=%s no valid duration", rcfg.ConfigFile.Duration)
|
||||
}
|
||||
rcfg.Duration = t
|
||||
if rcfg.Duration == 0 {
|
||||
cclog.Error("Configuration value 'duration' must be greater than zero")
|
||||
cclog.Error("Configuration value duration=%s must be greater than zero", rcfg.ConfigFile.Duration)
|
||||
return 1
|
||||
}
|
||||
}
|
||||
|
||||
@@ -209,16 +209,16 @@ func (m *BeegfsMetaCollector) Read(interval time.Duration, output chan lp.CCMess
|
||||
} else {
|
||||
f1, err := strconv.ParseFloat(m.matches["other"], 32)
|
||||
if err != nil {
|
||||
cclog.ComponentError(
|
||||
cclog.ComponentErrorf(
|
||||
m.name,
|
||||
fmt.Sprintf("Metric (other): Failed to convert str written '%s' to float: %v", m.matches["other"], err))
|
||||
"Metric (other): Failed to convert str written '%s' to float: %v", m.matches["other"], err)
|
||||
continue
|
||||
}
|
||||
f2, err := strconv.ParseFloat(split[i], 32)
|
||||
if err != nil {
|
||||
cclog.ComponentError(
|
||||
cclog.ComponentErrorf(
|
||||
m.name,
|
||||
fmt.Sprintf("Metric (other): Failed to convert str written '%s' to float: %v", m.matches["other"], err))
|
||||
"Metric (other): Failed to convert str written '%s' to float: %v", m.matches["other"], err)
|
||||
continue
|
||||
}
|
||||
m.matches["beegfs_cstorage_other"] = fmt.Sprintf("%f", f1+f2)
|
||||
|
||||
@@ -200,16 +200,16 @@ func (m *BeegfsStorageCollector) Read(interval time.Duration, output chan lp.CCM
|
||||
} else {
|
||||
f1, err := strconv.ParseFloat(m.matches["other"], 32)
|
||||
if err != nil {
|
||||
cclog.ComponentError(
|
||||
cclog.ComponentErrorf(
|
||||
m.name,
|
||||
fmt.Sprintf("Metric (other): Failed to convert str written '%s' to float: %v", m.matches["other"], err))
|
||||
"Metric (other): Failed to convert str written '%s' to float: %v", m.matches["other"], err)
|
||||
continue
|
||||
}
|
||||
f2, err := strconv.ParseFloat(split[i], 32)
|
||||
if err != nil {
|
||||
cclog.ComponentError(
|
||||
cclog.ComponentErrorf(
|
||||
m.name,
|
||||
fmt.Sprintf("Metric (other): Failed to convert str written '%s' to float: %v", m.matches["other"], err))
|
||||
"Metric (other): Failed to convert str written '%s' to float: %v", m.matches["other"], err)
|
||||
continue
|
||||
}
|
||||
m.matches["beegfs_cstorage_other"] = fmt.Sprintf("%f", f1+f2)
|
||||
|
||||
@@ -139,16 +139,16 @@ func (m *CPUFreqCpuInfoCollector) Read(interval time.Duration, output chan lp.CC
|
||||
const cpuInfoFile = "/proc/cpuinfo"
|
||||
file, err := os.Open(cpuInfoFile)
|
||||
if err != nil {
|
||||
cclog.ComponentError(
|
||||
cclog.ComponentErrorf(
|
||||
m.name,
|
||||
fmt.Sprintf("Read(): Failed to open file '%s': %v", cpuInfoFile, err))
|
||||
"Read(): Failed to open file '%s': %v", cpuInfoFile, err)
|
||||
return
|
||||
}
|
||||
defer func() {
|
||||
if err := file.Close(); err != nil {
|
||||
cclog.ComponentError(
|
||||
cclog.ComponentErrorf(
|
||||
m.name,
|
||||
fmt.Sprintf("Read(): Failed to close file '%s': %v", cpuInfoFile, err))
|
||||
"Read(): Failed to close file '%s': %v", cpuInfoFile, err)
|
||||
}
|
||||
}()
|
||||
|
||||
@@ -166,9 +166,9 @@ func (m *CPUFreqCpuInfoCollector) Read(interval time.Duration, output chan lp.CC
|
||||
if !t.isHT {
|
||||
value, err := strconv.ParseFloat(strings.TrimSpace(lineSplit[1]), 64)
|
||||
if err != nil {
|
||||
cclog.ComponentError(
|
||||
cclog.ComponentErrorf(
|
||||
m.name,
|
||||
fmt.Sprintf("Read(): Failed to convert cpu MHz '%s' to float64: %v", lineSplit[1], err))
|
||||
"Read(): Failed to convert cpu MHz '%s' to float64: %v", lineSplit[1], err)
|
||||
return
|
||||
}
|
||||
if y, err := lp.NewMetric("cpufreq", t.tagSet, m.meta, value, now); err == nil {
|
||||
|
||||
@@ -95,10 +95,7 @@ func (m *CPUFreqCollector) Init(config json.RawMessage) error {
|
||||
}
|
||||
|
||||
// Initialized
|
||||
cclog.ComponentDebug(
|
||||
m.name,
|
||||
"initialized",
|
||||
len(m.topology), "non-hyper-threading CPUs")
|
||||
cclog.ComponentDebugf(m.name, "initialized %d non-hyper-threading CPUs")
|
||||
m.init = true
|
||||
return nil
|
||||
}
|
||||
@@ -116,16 +113,14 @@ func (m *CPUFreqCollector) Read(interval time.Duration, output chan lp.CCMessage
|
||||
// Read current frequency
|
||||
line, err := os.ReadFile(t.scalingCurFreqFile)
|
||||
if err != nil {
|
||||
cclog.ComponentError(
|
||||
m.name,
|
||||
fmt.Sprintf("Read(): Failed to read file '%s': %v", t.scalingCurFreqFile, err))
|
||||
cclog.ComponentErrorf(
|
||||
m.name, "Read(): Failed to read file '%s': %v", t.scalingCurFreqFile, err)
|
||||
continue
|
||||
}
|
||||
cpuFreq, err := strconv.ParseInt(strings.TrimSpace(string(line)), 10, 64)
|
||||
if err != nil {
|
||||
cclog.ComponentError(
|
||||
m.name,
|
||||
fmt.Sprintf("Read(): Failed to convert CPU frequency '%s' to int64: %v", line, err))
|
||||
m.name, "Read(): Failed to convert CPU frequency '%s' to int64: %v", line, err)
|
||||
continue
|
||||
}
|
||||
|
||||
|
||||
@@ -171,15 +171,15 @@ func (m *CpustatCollector) Read(interval time.Duration, output chan lp.CCMessage
|
||||
|
||||
file, err := os.Open(CPUSTATFILE)
|
||||
if err != nil {
|
||||
cclog.ComponentError(
|
||||
cclog.ComponentErrorf(
|
||||
m.name,
|
||||
fmt.Sprintf("Read(): Failed to open file '%s': %v", CPUSTATFILE, err))
|
||||
"Read(): Failed to open file '%s': %v", CPUSTATFILE, err)
|
||||
}
|
||||
defer func() {
|
||||
if err := file.Close(); err != nil {
|
||||
cclog.ComponentError(
|
||||
cclog.ComponentErrorf(
|
||||
m.name,
|
||||
fmt.Sprintf("Read(): Failed to close file '%s': %v", string(CPUSTATFILE), err))
|
||||
"Read(): Failed to close file '%s': %v", string(CPUSTATFILE), err)
|
||||
}
|
||||
}()
|
||||
|
||||
|
||||
@@ -64,9 +64,9 @@ func (m *CustomCmdCollector) Init(config json.RawMessage) error {
|
||||
cmdFields := strings.Fields(c)
|
||||
command := exec.Command(cmdFields[0], cmdFields[1:]...)
|
||||
if _, err := command.Output(); err != nil {
|
||||
cclog.ComponentWarn(
|
||||
cclog.ComponentWarnf(
|
||||
m.name,
|
||||
fmt.Sprintf("%s Init(): Execution of command \"%s\" failed: %v", m.name, command.String(), err))
|
||||
"%s Init(): Execution of command \"%s\" failed: %v", m.name, command.String(), err)
|
||||
continue
|
||||
}
|
||||
m.cmdFieldsSlice = append(m.cmdFieldsSlice, cmdFields)
|
||||
@@ -77,7 +77,7 @@ func (m *CustomCmdCollector) Init(config json.RawMessage) error {
|
||||
if _, err := os.ReadFile(fileName); err != nil {
|
||||
cclog.ComponentWarn(
|
||||
m.name,
|
||||
fmt.Sprintf("%s Init(): Reading of file \"%s\" failed: %v", m.name, fileName, err))
|
||||
"%s Init(): Reading of file \"%s\" failed: %v", m.name, fileName, err)
|
||||
continue
|
||||
}
|
||||
m.files = append(m.files, fileName)
|
||||
@@ -100,20 +100,18 @@ func (m *CustomCmdCollector) Read(interval time.Duration, output chan lp.CCMessa
|
||||
command := exec.Command(cmdFields[0], cmdFields[1:]...)
|
||||
stdout, err := command.Output()
|
||||
if err != nil {
|
||||
cclog.ComponentError(
|
||||
cclog.ComponentErrorf(
|
||||
m.name,
|
||||
fmt.Sprintf("Read(): Failed to read command output for command \"%s\": %v", command.String(), err),
|
||||
)
|
||||
"Read(): Failed to read command output for command \"%s\": %v", command.String(), err)
|
||||
continue
|
||||
}
|
||||
|
||||
// Read and decode influxDB line-protocol from command output
|
||||
metrics, err := lp.FromBytes(stdout)
|
||||
if err != nil {
|
||||
cclog.ComponentError(
|
||||
cclog.ComponentErrorf(
|
||||
m.name,
|
||||
fmt.Sprintf("Read(): Failed to decode influx Message: %v", err),
|
||||
)
|
||||
"Read(): Failed to decode influx Message: %v", err)
|
||||
continue
|
||||
}
|
||||
for _, metric := range metrics {
|
||||
@@ -128,20 +126,18 @@ func (m *CustomCmdCollector) Read(interval time.Duration, output chan lp.CCMessa
|
||||
for _, filename := range m.files {
|
||||
input, err := os.ReadFile(filename)
|
||||
if err != nil {
|
||||
cclog.ComponentError(
|
||||
cclog.ComponentErrorf(
|
||||
m.name,
|
||||
fmt.Sprintf("Read(): Failed to read file \"%s\": %v\n", filename, err),
|
||||
)
|
||||
"Read(): Failed to read file \"%s\": %v\n", filename, err)
|
||||
continue
|
||||
}
|
||||
|
||||
// Read and decode influxDB line-protocol from file
|
||||
metrics, err := lp.FromBytes(input)
|
||||
if err != nil {
|
||||
cclog.ComponentError(
|
||||
cclog.ComponentErrorf(
|
||||
m.name,
|
||||
fmt.Sprintf("Read(): Failed to decode influx Message: %v", err),
|
||||
)
|
||||
"Read(): Failed to decode influx Message: %v", err)
|
||||
continue
|
||||
}
|
||||
for _, metric := range metrics {
|
||||
|
||||
@@ -77,16 +77,16 @@ func (m *DiskstatCollector) Read(interval time.Duration, output chan lp.CCMessag
|
||||
|
||||
file, err := os.Open(MOUNTFILE)
|
||||
if err != nil {
|
||||
cclog.ComponentError(
|
||||
cclog.ComponentErrorf(
|
||||
m.name,
|
||||
fmt.Sprintf("Read(): Failed to open file '%s': %v", MOUNTFILE, err))
|
||||
"Read(): Failed to open file '%s': %v", MOUNTFILE, err)
|
||||
return
|
||||
}
|
||||
defer func() {
|
||||
if err := file.Close(); err != nil {
|
||||
cclog.ComponentError(
|
||||
m.name,
|
||||
fmt.Sprintf("Read(): Failed to close file '%s': %v", MOUNTFILE, err))
|
||||
"Read(): Failed to close file '%s': %v", MOUNTFILE, err)
|
||||
}
|
||||
}()
|
||||
|
||||
|
||||
@@ -371,7 +371,7 @@ func (m *GpfsCollector) Init(config json.RawMessage) error {
|
||||
if err != nil {
|
||||
// if using sudo, exec.lookPath will return EACCES (file mode r-x------), this can be ignored
|
||||
if m.config.Sudo && errors.Is(err, syscall.EACCES) {
|
||||
cclog.ComponentWarn(m.name, fmt.Sprintf("got error looking for mmpmon binary '%s': %v . This is expected when using sudo, continuing.", m.config.Mmpmon, err))
|
||||
cclog.ComponentWarnf(m.name, "got error looking for mmpmon binary '%s': %v . This is expected when using sudo, continuing.", m.config.Mmpmon, err)
|
||||
// the file was given in the config, use it
|
||||
p = m.config.Mmpmon
|
||||
} else {
|
||||
@@ -517,23 +517,23 @@ func (m *GpfsCollector) Read(interval time.Duration, output chan lp.CCMessage) {
|
||||
// return code
|
||||
rc, err := strconv.Atoi(key_value["_rc_"])
|
||||
if err != nil {
|
||||
cclog.ComponentError(m.name, fmt.Sprintf("Read(): Failed to convert return code '%s' to int: %v", key_value["_rc_"], err))
|
||||
cclog.ComponentErrorf(m.name, "Read(): Failed to convert return code '%s' to int: %v", key_value["_rc_"], err)
|
||||
continue
|
||||
}
|
||||
if rc != 0 {
|
||||
cclog.ComponentError(m.name, fmt.Sprintf("Read(): Filesystem '%s' is not ok.", filesystem))
|
||||
cclog.ComponentErrorf(m.name, "Read(): Filesystem '%s' is not ok.", filesystem)
|
||||
continue
|
||||
}
|
||||
|
||||
// timestamp
|
||||
sec, err := strconv.ParseInt(key_value["_t_"], 10, 64)
|
||||
if err != nil {
|
||||
cclog.ComponentError(m.name, fmt.Sprintf("Read(): Failed to convert seconds '%s' to int64: %v", key_value["_t_"], err))
|
||||
cclog.ComponentErrorf(m.name, "Read(): Failed to convert seconds '%s' to int64: %v", key_value["_t_"], err)
|
||||
continue
|
||||
}
|
||||
msec, err := strconv.ParseInt(key_value["_tu_"], 10, 64)
|
||||
if err != nil {
|
||||
cclog.ComponentError(m.name, fmt.Sprintf("Read(): Failed to convert micro seconds '%s' to int64: %v", key_value["_tu_"], err))
|
||||
cclog.ComponentErrorf(m.name, "Read(): Failed to convert micro seconds '%s' to int64: %v", key_value["_tu_"], err)
|
||||
continue
|
||||
}
|
||||
timestamp := time.Unix(sec, msec*1000)
|
||||
@@ -551,7 +551,7 @@ func (m *GpfsCollector) Read(interval time.Duration, output chan lp.CCMessage) {
|
||||
for _, metric := range GpfsAbsMetrics {
|
||||
value, err := strconv.ParseInt(key_value[metric.prefix], 10, 64)
|
||||
if err != nil {
|
||||
cclog.ComponentError(m.name, fmt.Sprintf("Read(): Failed to convert %s '%s' to int64: %v", metric.desc, key_value[metric.prefix], err))
|
||||
cclog.ComponentErrorf(m.name, "Read(): Failed to convert %s '%s' to int64: %v", metric.desc, key_value[metric.prefix], err)
|
||||
continue
|
||||
}
|
||||
newstate[metric.prefix] = value
|
||||
@@ -636,7 +636,7 @@ func (m *GpfsCollector) Read(interval time.Duration, output chan lp.CCMessage) {
|
||||
}
|
||||
} else {
|
||||
// the value could not be computed correctly
|
||||
cclog.ComponentWarn(m.name, fmt.Sprintf("Read(): Could not compute value for filesystem %s of metric %s: vold_ok = %t, vnew_ok = %t", filesystem, metric.name, vold_ok, vnew_ok))
|
||||
cclog.ComponentWarnf(m.name, "Read(): Could not compute value for filesystem %s of metric %s: vold_ok = %t, vnew_ok = %t", filesystem, metric.name, vold_ok, vnew_ok)
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -23,20 +23,29 @@ import (
|
||||
"golang.org/x/sys/unix"
|
||||
)
|
||||
|
||||
const IB_BASEPATH = "/sys/class/infiniband/"
|
||||
// See: https://www.kernel.org/doc/Documentation/ABI/stable/sysfs-class-infiniband
|
||||
const (
|
||||
ibBasePath = "/sys/class/infiniband/"
|
||||
ibDataUnit = "bytes"
|
||||
ibDataRateUnit = ibDataUnit + "/sec"
|
||||
ibPkgUnit = "packets"
|
||||
ibPkgRateUnit = ibPkgUnit + "/sec"
|
||||
)
|
||||
|
||||
type InfinibandCollectorMetric struct {
|
||||
name string
|
||||
path string
|
||||
unit string
|
||||
scale int64
|
||||
unitRates string
|
||||
scaleByFourLanes bool
|
||||
addToIBTotal bool
|
||||
addToIBTotalPkgs bool
|
||||
lastState int64
|
||||
lastState uint64
|
||||
lastStateAvailable bool
|
||||
}
|
||||
|
||||
type InfinibandCollectorInfo struct {
|
||||
LID string // IB local Identifier (LID)
|
||||
lid string // IB local Identifier (LID)
|
||||
device string // IB device
|
||||
port string // IB device port
|
||||
portCounterFiles []InfinibandCollectorMetric // mapping counter name -> InfinibandCollectorMetric
|
||||
@@ -56,7 +65,7 @@ type InfinibandCollector struct {
|
||||
lastTimestamp time.Time // Store time stamp of last tick to derive bandwidths
|
||||
}
|
||||
|
||||
// Init initializes the Infiniband collector by walking through files below IB_BASEPATH
|
||||
// Init initializes the Infiniband collector by walking through files below ibBasePath
|
||||
func (m *InfinibandCollector) Init(config json.RawMessage) error {
|
||||
// Check if already initialized
|
||||
if m.init {
|
||||
@@ -87,7 +96,7 @@ func (m *InfinibandCollector) Init(config json.RawMessage) error {
|
||||
}
|
||||
|
||||
// Loop for all InfiniBand directories
|
||||
globPattern := filepath.Join(IB_BASEPATH, "*", "ports", "*")
|
||||
globPattern := filepath.Join(ibBasePath, "*", "ports", "*")
|
||||
ibDirs, err := filepath.Glob(globPattern)
|
||||
if err != nil {
|
||||
return fmt.Errorf("%s Init(): unable to glob files with pattern %s: %w", m.name, globPattern, err)
|
||||
@@ -122,36 +131,42 @@ func (m *InfinibandCollector) Init(config json.RawMessage) error {
|
||||
countersDir := filepath.Join(path, "counters")
|
||||
portCounterFiles := []InfinibandCollectorMetric{
|
||||
{
|
||||
// Total number of data octets, divided by 4 (lanes), received on all VLs.
|
||||
// This is 64 bit counter
|
||||
name: "ib_recv",
|
||||
path: filepath.Join(countersDir, "port_rcv_data"),
|
||||
unit: "bytes",
|
||||
scale: 4,
|
||||
unit: ibDataUnit,
|
||||
unitRates: ibDataRateUnit,
|
||||
scaleByFourLanes: true,
|
||||
addToIBTotal: true,
|
||||
lastState: -1,
|
||||
},
|
||||
{
|
||||
// Total number of data octets, divided by 4 (lanes), transmitted on all VLs.
|
||||
// This is 64 bit counter
|
||||
name: "ib_xmit",
|
||||
path: filepath.Join(countersDir, "port_xmit_data"),
|
||||
unit: "bytes",
|
||||
scale: 4,
|
||||
unit: ibDataUnit,
|
||||
unitRates: ibDataRateUnit,
|
||||
scaleByFourLanes: true,
|
||||
addToIBTotal: true,
|
||||
lastState: -1,
|
||||
},
|
||||
{
|
||||
// Total number of packets received on all VLs from this port (this may include packets containing Errors.
|
||||
// This is 64 bit counter.
|
||||
name: "ib_recv_pkts",
|
||||
path: filepath.Join(countersDir, "port_rcv_packets"),
|
||||
unit: "packets",
|
||||
scale: 1,
|
||||
unit: ibPkgUnit,
|
||||
unitRates: ibPkgRateUnit,
|
||||
addToIBTotalPkgs: true,
|
||||
lastState: -1,
|
||||
},
|
||||
{
|
||||
// Total number of packets transmitted on all VLs from this port. This may include packets with errors.
|
||||
// This is 64 bit counter.
|
||||
name: "ib_xmit_pkts",
|
||||
path: filepath.Join(countersDir, "port_xmit_packets"),
|
||||
unit: "packets",
|
||||
scale: 1,
|
||||
unit: ibPkgUnit,
|
||||
unitRates: ibPkgRateUnit,
|
||||
addToIBTotalPkgs: true,
|
||||
lastState: -1,
|
||||
},
|
||||
}
|
||||
for _, counter := range portCounterFiles {
|
||||
@@ -163,7 +178,7 @@ func (m *InfinibandCollector) Init(config json.RawMessage) error {
|
||||
|
||||
m.info = append(m.info,
|
||||
InfinibandCollectorInfo{
|
||||
LID: LID,
|
||||
lid: LID,
|
||||
device: device,
|
||||
port: port,
|
||||
portCounterFiles: portCounterFiles,
|
||||
@@ -184,7 +199,7 @@ func (m *InfinibandCollector) Init(config json.RawMessage) error {
|
||||
return nil
|
||||
}
|
||||
|
||||
// Read reads Infiniband counter files below IB_BASEPATH
|
||||
// Read reads Infiniband counter files below ibBasePath
|
||||
func (m *InfinibandCollector) Read(interval time.Duration, output chan lp.CCMessage) {
|
||||
// Check if already initialized
|
||||
if !m.init {
|
||||
@@ -201,36 +216,42 @@ func (m *InfinibandCollector) Read(interval time.Duration, output chan lp.CCMess
|
||||
for i := range m.info {
|
||||
info := &m.info[i]
|
||||
|
||||
var ib_total, ib_total_last_state,
|
||||
ib_total_pkts, ib_total_pkts_last_state int64
|
||||
var ib_total_last_state_available, ib_total_pkts_last_state_available bool
|
||||
var ibTotal, ibTotalPkts uint64 // sum of xmit and recv counters
|
||||
var ibTotalBw, ibTotalPktsBw float64 // sum of xmit and recv rates
|
||||
var ibTotalBwAvailable, ibTotalPktsBwAvailable bool
|
||||
for i := range info.portCounterFiles {
|
||||
counterDef := &info.portCounterFiles[i]
|
||||
|
||||
// Read counter file
|
||||
line, err := os.ReadFile(counterDef.path)
|
||||
if err != nil {
|
||||
cclog.ComponentError(
|
||||
cclog.ComponentErrorf(
|
||||
m.name,
|
||||
fmt.Sprintf("Read(): Failed to read from file '%s': %v", counterDef.path, err))
|
||||
"Read(): Failed to read from file '%s': %v", counterDef.path, err)
|
||||
// Current counter can not be saved as last state
|
||||
counterDef.lastStateAvailable = false
|
||||
continue
|
||||
}
|
||||
data := strings.TrimSpace(string(line))
|
||||
|
||||
// convert counter to int64
|
||||
v, err := strconv.ParseInt(data, 10, 64)
|
||||
// convert counter to uint64
|
||||
vRawCounter, err := strconv.ParseUint(data, 10, 64)
|
||||
if err != nil {
|
||||
cclog.ComponentError(
|
||||
cclog.ComponentErrorf(
|
||||
m.name,
|
||||
fmt.Sprintf("Read(): Failed to convert Infininiband metrice %s='%s' to int64: %v", counterDef.name, data, err))
|
||||
"Read(): Failed to convert Infininiband metrice %s='%s' to uint64: %v", counterDef.name, data, err)
|
||||
// Current counter can not be saved as last state
|
||||
counterDef.lastStateAvailable = false
|
||||
continue
|
||||
}
|
||||
// Scale raw value
|
||||
v *= counterDef.scale
|
||||
vScaledCounter := vRawCounter
|
||||
if counterDef.scaleByFourLanes {
|
||||
vScaledCounter *= uint64(4)
|
||||
}
|
||||
|
||||
// Send absolut values
|
||||
if m.config.SendAbsoluteValues {
|
||||
if y, err := lp.NewMetric(counterDef.name, info.tagSet, m.meta, v, now); err == nil {
|
||||
if y, err := lp.NewMetric(counterDef.name, info.tagSet, m.meta, vScaledCounter, now); err == nil {
|
||||
y.AddMeta("unit", counterDef.unit)
|
||||
output <- y
|
||||
}
|
||||
@@ -238,63 +259,72 @@ func (m *InfinibandCollector) Read(interval time.Duration, output chan lp.CCMess
|
||||
|
||||
// Send derived values
|
||||
if m.config.SendDerivedValues {
|
||||
if counterDef.lastState >= 0 {
|
||||
rate := float64((v - counterDef.lastState)) / timeDiff
|
||||
if counterDef.lastStateAvailable {
|
||||
var rate float64
|
||||
// uint64 subtraction handles wraparound automatically
|
||||
// in case vRawCounter < counterDef.lastState we would compute:
|
||||
// math.MaxUint64 - lastState + vRawCounter + 1
|
||||
// = (2^64 - 1) - lastState + vRawCounter + 1
|
||||
// = 2^64 - lastState + vRawCounter
|
||||
// ≡ vRawCounter - lastState (mod 2^64)
|
||||
rate = float64(vRawCounter-counterDef.lastState) / timeDiff
|
||||
if counterDef.scaleByFourLanes {
|
||||
rate *= float64(4)
|
||||
}
|
||||
if y, err := lp.NewMetric(counterDef.name+"_bw", info.tagSet, m.meta, rate, now); err == nil {
|
||||
y.AddMeta("unit", counterDef.unit+"/sec")
|
||||
y.AddMeta("unit", counterDef.unitRates)
|
||||
output <- y
|
||||
}
|
||||
|
||||
// Sum up total values of last state
|
||||
// Sum up rates for total rates
|
||||
if m.config.SendTotalValues {
|
||||
switch {
|
||||
case counterDef.addToIBTotal:
|
||||
ib_total_last_state += counterDef.lastState
|
||||
ib_total_last_state_available = true
|
||||
ibTotalBw += rate
|
||||
ibTotalBwAvailable = true
|
||||
case counterDef.addToIBTotalPkgs:
|
||||
ib_total_pkts_last_state += counterDef.lastState
|
||||
ib_total_pkts_last_state_available = true
|
||||
ibTotalPktsBw += rate
|
||||
ibTotalPktsBwAvailable = true
|
||||
}
|
||||
}
|
||||
}
|
||||
counterDef.lastState = v
|
||||
counterDef.lastState = vRawCounter
|
||||
counterDef.lastStateAvailable = true
|
||||
}
|
||||
|
||||
// Sum up total values
|
||||
if m.config.SendTotalValues {
|
||||
switch {
|
||||
case counterDef.addToIBTotal:
|
||||
ib_total += v
|
||||
ibTotal += vScaledCounter
|
||||
case counterDef.addToIBTotalPkgs:
|
||||
ib_total_pkts += v
|
||||
ibTotalPkts += vScaledCounter
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Send total values
|
||||
if m.config.SendTotalValues {
|
||||
if y, err := lp.NewMetric("ib_total", info.tagSet, m.meta, ib_total, now); err == nil {
|
||||
y.AddMeta("unit", "bytes")
|
||||
if y, err := lp.NewMetric("ib_total", info.tagSet, m.meta, ibTotal, now); err == nil {
|
||||
y.AddMeta("unit", ibDataUnit)
|
||||
output <- y
|
||||
}
|
||||
|
||||
if y, err := lp.NewMetric("ib_total_pkts", info.tagSet, m.meta, ib_total_pkts, now); err == nil {
|
||||
y.AddMeta("unit", "packets")
|
||||
if y, err := lp.NewMetric("ib_total_pkts", info.tagSet, m.meta, ibTotalPkts, now); err == nil {
|
||||
y.AddMeta("unit", ibPkgUnit)
|
||||
output <- y
|
||||
}
|
||||
|
||||
if m.config.SendDerivedValues && ib_total_last_state_available {
|
||||
rate := float64((ib_total - ib_total_last_state)) / timeDiff
|
||||
if y, err := lp.NewMetric("ib_total_bw", info.tagSet, m.meta, rate, now); err == nil {
|
||||
y.AddMeta("unit", "bytes/sec")
|
||||
if m.config.SendDerivedValues && ibTotalBwAvailable {
|
||||
if y, err := lp.NewMetric("ib_total_bw", info.tagSet, m.meta, ibTotalBw, now); err == nil {
|
||||
y.AddMeta("unit", ibDataRateUnit)
|
||||
output <- y
|
||||
}
|
||||
}
|
||||
|
||||
if m.config.SendDerivedValues && ib_total_pkts_last_state_available {
|
||||
rate := float64((ib_total_pkts - ib_total_pkts_last_state)) / timeDiff
|
||||
if y, err := lp.NewMetric("ib_total_pkts_bw", info.tagSet, m.meta, rate, now); err == nil {
|
||||
y.AddMeta("unit", "packets/sec")
|
||||
if m.config.SendDerivedValues && ibTotalPktsBwAvailable {
|
||||
if y, err := lp.NewMetric("ib_total_pkts_bw", info.tagSet, m.meta, ibTotalPktsBw, now); err == nil {
|
||||
y.AddMeta("unit", ibPkgRateUnit)
|
||||
output <- y
|
||||
}
|
||||
}
|
||||
|
||||
@@ -145,16 +145,16 @@ func (m *IOstatCollector) Read(interval time.Duration, output chan lp.CCMessage)
|
||||
|
||||
file, err := os.Open(IOSTATFILE)
|
||||
if err != nil {
|
||||
cclog.ComponentError(
|
||||
cclog.ComponentErrorf(
|
||||
m.name,
|
||||
fmt.Sprintf("Read(): Failed to open file '%s': %v", IOSTATFILE, err))
|
||||
"Read(): Failed to open file '%s': %v", IOSTATFILE, err)
|
||||
return
|
||||
}
|
||||
defer func() {
|
||||
if err := file.Close(); err != nil {
|
||||
cclog.ComponentError(
|
||||
cclog.ComponentErrorf(
|
||||
m.name,
|
||||
fmt.Sprintf("Read(): Failed to close file '%s': %v", IOSTATFILE, err))
|
||||
"Read(): Failed to close file '%s': %v", IOSTATFILE, err)
|
||||
}
|
||||
}()
|
||||
|
||||
|
||||
@@ -12,6 +12,12 @@ package collectors
|
||||
#cgo LDFLAGS: -Wl,--unresolved-symbols=ignore-in-object-files
|
||||
#include <stdlib.h>
|
||||
#include <likwid.h>
|
||||
|
||||
|
||||
int cc_add_hwthread(int cpu_id) {
|
||||
return HPMaddThread(cpu_id);
|
||||
}
|
||||
|
||||
*/
|
||||
import "C"
|
||||
|
||||
@@ -261,12 +267,12 @@ func (m *LikwidCollector) Init(config json.RawMessage) error {
|
||||
}
|
||||
for _, metric := range evset.Metrics {
|
||||
// Try to evaluate the metric
|
||||
cclog.ComponentDebug(m.name, "Checking", metric.Name)
|
||||
cclog.ComponentDebugf(m.name, "Checking %s", metric.Name)
|
||||
if !checkMetricType(metric.Type) {
|
||||
cclog.ComponentError(m.name, "Metric", metric.Name, "uses invalid type", metric.Type)
|
||||
cclog.ComponentErrorf(m.name, "Metric %s uses invalid type %s", metric.Name, metric.Type)
|
||||
metric.Calc = ""
|
||||
} else if !testLikwidMetricFormula(metric.Calc, params) {
|
||||
cclog.ComponentError(m.name, "Metric", metric.Name, "cannot be calculated with given counters")
|
||||
cclog.ComponentError(m.name, "Metric %s cannot be calculated with given counters", metric.Name)
|
||||
metric.Calc = ""
|
||||
} else {
|
||||
globalParams = append(globalParams, metric.Name)
|
||||
@@ -281,13 +287,13 @@ func (m *LikwidCollector) Init(config json.RawMessage) error {
|
||||
for _, metric := range m.config.Metrics {
|
||||
// Try to evaluate the global metric
|
||||
if !checkMetricType(metric.Type) {
|
||||
cclog.ComponentError(m.name, "Metric", metric.Name, "uses invalid type", metric.Type)
|
||||
cclog.ComponentErrorf(m.name, "Metric %s uses invalid type %s", metric.Name, metric.Type)
|
||||
metric.Calc = ""
|
||||
} else if !testLikwidMetricFormula(metric.Calc, globalParams) {
|
||||
cclog.ComponentError(m.name, "Metric", metric.Name, "cannot be calculated with given counters")
|
||||
cclog.ComponentError(m.name, "Metric %s cannot be calculated with given counters", metric.Name)
|
||||
metric.Calc = ""
|
||||
} else if !checkMetricType(metric.Type) {
|
||||
cclog.ComponentError(m.name, "Metric", metric.Name, "has invalid type")
|
||||
cclog.ComponentError(m.name, "Metric %s has invalid type", metric.Name)
|
||||
metric.Calc = ""
|
||||
} else {
|
||||
totalMetrics++
|
||||
@@ -328,7 +334,7 @@ func (m *LikwidCollector) Init(config json.RawMessage) error {
|
||||
for _, c := range m.cpulist {
|
||||
m.measureThread.Call(
|
||||
func() {
|
||||
retCode := C.HPMaddThread(C.uint32_t(c))
|
||||
retCode := C.cc_add_hwthread(C.int(c))
|
||||
if retCode != 0 {
|
||||
err := fmt.Errorf("C.HPMaddThread(%v) failed with return code %v", c, retCode)
|
||||
cclog.ComponentError(m.name, err.Error())
|
||||
@@ -375,16 +381,16 @@ func (m *LikwidCollector) takeMeasurement(evidx int, evset LikwidEventsetConfig,
|
||||
// Watch changes for the lock file ()
|
||||
watcher, err := fsnotify.NewWatcher()
|
||||
if err != nil {
|
||||
cclog.ComponentError(
|
||||
cclog.ComponentErrorf(
|
||||
m.name,
|
||||
fmt.Sprintf("takeMeasurement(): Failed to create a new fsnotify.Watcher: %v", err))
|
||||
"takeMeasurement(): Failed to create a new fsnotify.Watcher: %v", err)
|
||||
return true, err
|
||||
}
|
||||
defer func() {
|
||||
if err := watcher.Close(); err != nil {
|
||||
cclog.ComponentError(
|
||||
cclog.ComponentErrorf(
|
||||
m.name,
|
||||
fmt.Sprintf("takeMeasurement(): Failed to close fsnotify.Watcher: %v", err))
|
||||
"takeMeasurement(): Failed to close fsnotify.Watcher: %v", err)
|
||||
}
|
||||
}()
|
||||
if len(m.config.LockfilePath) > 0 {
|
||||
@@ -597,7 +603,7 @@ func (m *LikwidCollector) calcEventsetMetrics(evset LikwidEventsetConfig, interv
|
||||
if tid >= 0 && len(metric.Calc) > 0 {
|
||||
value, err := agg.EvalFloat64Condition(metric.Calc, evset.results[tid])
|
||||
if err != nil {
|
||||
cclog.ComponentError(m.name, "Calculation for metric", metric.Name, "failed:", err.Error())
|
||||
cclog.ComponentErrorf(m.name, "Calculation for metric %s failed: %s", metric.Name, err.Error())
|
||||
value = 0.0
|
||||
}
|
||||
if m.config.InvalidToZero && (math.IsNaN(value) || math.IsInf(value, 0)) {
|
||||
@@ -762,7 +768,7 @@ func (m *LikwidCollector) calcGlobalMetrics(groups []LikwidEventsetConfig, inter
|
||||
// Evaluate the metric
|
||||
value, err := agg.EvalFloat64Condition(metric.Calc, params)
|
||||
if err != nil {
|
||||
cclog.ComponentError(m.name, "Calculation for metric", metric.Name, "failed:", err.Error())
|
||||
cclog.ComponentErrorf(m.name, "Calculation for metric %s failed: %s", metric.Name, err.Error())
|
||||
value = 0.0
|
||||
}
|
||||
if m.config.InvalidToZero && (math.IsNaN(value) || math.IsInf(value, 0)) {
|
||||
|
||||
@@ -89,9 +89,9 @@ func (m *LoadavgCollector) Read(interval time.Duration, output chan lp.CCMessage
|
||||
}
|
||||
buffer, err := os.ReadFile(LOADAVGFILE)
|
||||
if err != nil {
|
||||
cclog.ComponentError(
|
||||
cclog.ComponentErrorf(
|
||||
m.name,
|
||||
fmt.Sprintf("Read(): Failed to read file '%s': %v", LOADAVGFILE, err))
|
||||
"Read(): Failed to read file '%s': %v", LOADAVGFILE, err)
|
||||
return
|
||||
}
|
||||
now := time.Now()
|
||||
@@ -101,9 +101,9 @@ func (m *LoadavgCollector) Read(interval time.Duration, output chan lp.CCMessage
|
||||
for i, name := range m.load_matches {
|
||||
x, err := strconv.ParseFloat(ls[i], 64)
|
||||
if err != nil {
|
||||
cclog.ComponentError(
|
||||
cclog.ComponentErrorf(
|
||||
m.name,
|
||||
fmt.Sprintf("Read(): Failed to convert '%s' to float64: %v", ls[i], err))
|
||||
"Read(): Failed to convert '%s' to float64: %v", ls[i], err)
|
||||
continue
|
||||
}
|
||||
if m.load_skips[i] {
|
||||
@@ -120,9 +120,9 @@ func (m *LoadavgCollector) Read(interval time.Duration, output chan lp.CCMessage
|
||||
for i, name := range m.proc_matches {
|
||||
x, err := strconv.ParseInt(lv[i], 10, 64)
|
||||
if err != nil {
|
||||
cclog.ComponentError(
|
||||
cclog.ComponentErrorf(
|
||||
m.name,
|
||||
fmt.Sprintf("Read(): Failed to convert '%s' to float64: %v", lv[i], err))
|
||||
"Read(): Failed to convert '%s' to float64: %v", lv[i], err)
|
||||
continue
|
||||
}
|
||||
if m.proc_skips[i] {
|
||||
|
||||
@@ -84,7 +84,7 @@ func getStats(filename string) map[string]MemstatStats {
|
||||
case 5:
|
||||
v, err := strconv.ParseFloat(linefields[3], 64)
|
||||
if err == nil {
|
||||
cclog.ComponentDebug("getStats", strings.Trim(linefields[2], ":"), v, linefields[4])
|
||||
cclog.ComponentDebug("MemstatCollector", "getStats %s value %v unit %s", strings.Trim(linefields[2], ":"), v, linefields[4])
|
||||
stats[strings.Trim(linefields[2], ":")] = MemstatStats{
|
||||
value: v,
|
||||
unit: linefields[4],
|
||||
|
||||
@@ -222,16 +222,16 @@ func (m *NetstatCollector) Read(interval time.Duration, output chan lp.CCMessage
|
||||
|
||||
file, err := os.Open(NETSTATFILE)
|
||||
if err != nil {
|
||||
cclog.ComponentError(
|
||||
cclog.ComponentErrorf(
|
||||
m.name,
|
||||
fmt.Sprintf("Read(): Failed to open file '%s': %v", NETSTATFILE, err))
|
||||
"Read(): Failed to open file '%s': %v", NETSTATFILE, err)
|
||||
return
|
||||
}
|
||||
defer func() {
|
||||
if err := file.Close(); err != nil {
|
||||
cclog.ComponentError(
|
||||
cclog.ComponentErrorf(
|
||||
m.name,
|
||||
fmt.Sprintf("Read(): Failed to close file '%s': %v", NETSTATFILE, err))
|
||||
"Read(): Failed to close file '%s': %v", NETSTATFILE, err)
|
||||
}
|
||||
}()
|
||||
|
||||
|
||||
@@ -125,10 +125,9 @@ func (m *nfsCollector) Read(interval time.Duration, output chan lp.CCMessage) {
|
||||
timestamp := time.Now()
|
||||
|
||||
if err := m.updateStats(); err != nil {
|
||||
cclog.ComponentError(
|
||||
cclog.ComponentErrorf(
|
||||
m.name,
|
||||
fmt.Sprintf("Read(): updateStats() failed: %v", err),
|
||||
)
|
||||
"Read(): updateStats() failed: %v", err)
|
||||
return
|
||||
}
|
||||
var prefix string
|
||||
|
||||
@@ -117,7 +117,7 @@ func (m *NUMAStatsCollector) Init(config json.RawMessage) error {
|
||||
}
|
||||
|
||||
// Initialized
|
||||
cclog.ComponentDebug(m.name, "initialized", len(m.topology), "NUMA domains")
|
||||
cclog.ComponentDebugf(m.name, "initialized %d NUMA domains", len(m.topology))
|
||||
m.init = true
|
||||
return nil
|
||||
}
|
||||
|
||||
@@ -113,7 +113,7 @@ func (m *NvidiaCollector) Init(config json.RawMessage) error {
|
||||
// Skip excluded devices by ID
|
||||
str_i := strconv.Itoa(i)
|
||||
if slices.Contains(m.config.ExcludeDevices, str_i) {
|
||||
cclog.ComponentDebug(m.name, "Skipping excluded device", str_i)
|
||||
cclog.ComponentDebugf(m.name, "Skipping excluded device %s", str_i)
|
||||
continue
|
||||
}
|
||||
|
||||
@@ -121,7 +121,7 @@ func (m *NvidiaCollector) Init(config json.RawMessage) error {
|
||||
device, ret := nvml.DeviceGetHandleByIndex(i)
|
||||
if ret != nvml.SUCCESS {
|
||||
err = errors.New(nvml.ErrorString(ret))
|
||||
cclog.ComponentError(m.name, "Unable to get device at index", i, ":", err.Error())
|
||||
cclog.ComponentErrorf(m.name, "Unable to get device at index %d: %s", i, err.Error())
|
||||
continue
|
||||
}
|
||||
|
||||
@@ -129,7 +129,7 @@ func (m *NvidiaCollector) Init(config json.RawMessage) error {
|
||||
pciInfo, ret := nvml.DeviceGetPciInfo(device)
|
||||
if ret != nvml.SUCCESS {
|
||||
err = errors.New(nvml.ErrorString(ret))
|
||||
cclog.ComponentError(m.name, "Unable to get PCI info for device at index", i, ":", err.Error())
|
||||
cclog.ComponentErrorf(m.name, "Unable to get PCI info for device at index %d: %s", i, err.Error())
|
||||
continue
|
||||
}
|
||||
// Create PCI ID in the common format used by the NVML.
|
||||
@@ -141,7 +141,7 @@ func (m *NvidiaCollector) Init(config json.RawMessage) error {
|
||||
|
||||
// Skip excluded devices specified by PCI ID
|
||||
if slices.Contains(m.config.ExcludeDevices, pci_id) {
|
||||
cclog.ComponentDebug(m.name, "Skipping excluded device", pci_id)
|
||||
cclog.ComponentDebugf(m.name, "Skipping excluded device %s", pci_id)
|
||||
continue
|
||||
}
|
||||
|
||||
@@ -183,7 +183,7 @@ func (m *NvidiaCollector) Init(config json.RawMessage) error {
|
||||
if m.config.AddBoardNumberMeta {
|
||||
board, ret := nvml.DeviceGetBoardPartNumber(device)
|
||||
if ret != nvml.SUCCESS {
|
||||
cclog.ComponentError(m.name, "Unable to get boart part number for device at index", i, ":", err.Error())
|
||||
cclog.ComponentErrorf(m.name, "Unable to get boart part number for device at index %d: %s", i, err.Error())
|
||||
} else {
|
||||
g.meta["board_number"] = board
|
||||
}
|
||||
@@ -191,7 +191,7 @@ func (m *NvidiaCollector) Init(config json.RawMessage) error {
|
||||
if m.config.AddSerialMeta {
|
||||
serial, ret := nvml.DeviceGetSerial(device)
|
||||
if ret != nvml.SUCCESS {
|
||||
cclog.ComponentError(m.name, "Unable to get serial number for device at index", i, ":", err.Error())
|
||||
cclog.ComponentErrorf(m.name, "Unable to get serial number for device at index %d: %s", i, err.Error())
|
||||
} else {
|
||||
g.meta["serial"] = serial
|
||||
}
|
||||
@@ -199,7 +199,7 @@ func (m *NvidiaCollector) Init(config json.RawMessage) error {
|
||||
if m.config.AddUuidMeta {
|
||||
uuid, ret := nvml.DeviceGetUUID(device)
|
||||
if ret != nvml.SUCCESS {
|
||||
cclog.ComponentError(m.name, "Unable to get UUID for device at index", i, ":", err.Error())
|
||||
cclog.ComponentErrorf(m.name, "Unable to get UUID for device at index %d: %s", i, err.Error())
|
||||
} else {
|
||||
g.meta["uuid"] = uuid
|
||||
}
|
||||
@@ -1128,97 +1128,97 @@ func (m *NvidiaCollector) Read(interval time.Duration, output chan lp.CCMessage)
|
||||
}
|
||||
err = readMemoryInfo(device, output)
|
||||
if err != nil {
|
||||
cclog.ComponentDebug(m.name, "readMemoryInfo for device", name, "failed")
|
||||
cclog.ComponentDebugf(m.name, "readMemoryInfo for device %s failed", name)
|
||||
}
|
||||
|
||||
err = readUtilization(device, output)
|
||||
if err != nil {
|
||||
cclog.ComponentDebug(m.name, "readUtilization for device", name, "failed")
|
||||
cclog.ComponentDebugf(m.name, "readUtilization for device %s failed", name)
|
||||
}
|
||||
|
||||
err = readTemp(device, output)
|
||||
if err != nil {
|
||||
cclog.ComponentDebug(m.name, "readTemp for device", name, "failed")
|
||||
cclog.ComponentDebugf(m.name, "readTemp for device %s failed", name)
|
||||
}
|
||||
|
||||
err = readFan(device, output)
|
||||
if err != nil {
|
||||
cclog.ComponentDebug(m.name, "readFan for device", name, "failed")
|
||||
cclog.ComponentDebugf(m.name, "readFan for device %s failed", name)
|
||||
}
|
||||
|
||||
err = readEccMode(device, output)
|
||||
if err != nil {
|
||||
cclog.ComponentDebug(m.name, "readEccMode for device", name, "failed")
|
||||
cclog.ComponentDebugf(m.name, "readEccMode for device %s failed", name)
|
||||
}
|
||||
|
||||
err = readPerfState(device, output)
|
||||
if err != nil {
|
||||
cclog.ComponentDebug(m.name, "readPerfState for device", name, "failed")
|
||||
cclog.ComponentDebugf(m.name, "readPerfState for device %s failed", name)
|
||||
}
|
||||
|
||||
err = readPowerUsage(device, output)
|
||||
if err != nil {
|
||||
cclog.ComponentDebug(m.name, "readPowerUsage for device", name, "failed")
|
||||
cclog.ComponentDebugf(m.name, "readPowerUsage for device %s failed", name)
|
||||
}
|
||||
|
||||
err = readEnergyConsumption(device, output)
|
||||
if err != nil {
|
||||
cclog.ComponentDebug(m.name, "readEnergyConsumption for device", name, "failed")
|
||||
cclog.ComponentDebugf(m.name, "readEnergyConsumption for device %s failed", name)
|
||||
}
|
||||
|
||||
err = readClocks(device, output)
|
||||
if err != nil {
|
||||
cclog.ComponentDebug(m.name, "readClocks for device", name, "failed")
|
||||
cclog.ComponentDebugf(m.name, "readClocks for device %s failed", name)
|
||||
}
|
||||
|
||||
err = readMaxClocks(device, output)
|
||||
if err != nil {
|
||||
cclog.ComponentDebug(m.name, "readMaxClocks for device", name, "failed")
|
||||
cclog.ComponentDebugf(m.name, "readMaxClocks for device %s failed", name)
|
||||
}
|
||||
|
||||
err = readEccErrors(device, output)
|
||||
if err != nil {
|
||||
cclog.ComponentDebug(m.name, "readEccErrors for device", name, "failed")
|
||||
cclog.ComponentDebugf(m.name, "readEccErrors for device %s failed", name)
|
||||
}
|
||||
|
||||
err = readPowerLimit(device, output)
|
||||
if err != nil {
|
||||
cclog.ComponentDebug(m.name, "readPowerLimit for device", name, "failed")
|
||||
cclog.ComponentDebugf(m.name, "readPowerLimit for device %s failed", name)
|
||||
}
|
||||
|
||||
err = readEncUtilization(device, output)
|
||||
if err != nil {
|
||||
cclog.ComponentDebug(m.name, "readEncUtilization for device", name, "failed")
|
||||
cclog.ComponentDebugf(m.name, "readEncUtilization for device %s failed", name)
|
||||
}
|
||||
|
||||
err = readDecUtilization(device, output)
|
||||
if err != nil {
|
||||
cclog.ComponentDebug(m.name, "readDecUtilization for device", name, "failed")
|
||||
cclog.ComponentDebugf(m.name, "readDecUtilization for device %s failed", name)
|
||||
}
|
||||
|
||||
err = readRemappedRows(device, output)
|
||||
if err != nil {
|
||||
cclog.ComponentDebug(m.name, "readRemappedRows for device", name, "failed")
|
||||
cclog.ComponentDebugf(m.name, "readRemappedRows for device %s failed", name)
|
||||
}
|
||||
|
||||
err = readBarMemoryInfo(device, output)
|
||||
if err != nil {
|
||||
cclog.ComponentDebug(m.name, "readBarMemoryInfo for device", name, "failed")
|
||||
cclog.ComponentDebugf(m.name, "readBarMemoryInfo for device %s failed", name)
|
||||
}
|
||||
|
||||
err = readProcessCounts(device, output)
|
||||
if err != nil {
|
||||
cclog.ComponentDebug(m.name, "readProcessCounts for device", name, "failed")
|
||||
cclog.ComponentDebugf(m.name, "readProcessCounts for device %s failed", name)
|
||||
}
|
||||
|
||||
err = readViolationStats(device, output)
|
||||
if err != nil {
|
||||
cclog.ComponentDebug(m.name, "readViolationStats for device", name, "failed")
|
||||
cclog.ComponentDebugf(m.name, "readViolationStats for device %s failed", name)
|
||||
}
|
||||
|
||||
err = readNVLinkStats(device, output)
|
||||
if err != nil {
|
||||
cclog.ComponentDebug(m.name, "readNVLinkStats for device", name, "failed")
|
||||
cclog.ComponentDebugf(m.name, "readNVLinkStats for device %s failed", name)
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1244,7 +1244,7 @@ func (m *NvidiaCollector) Read(interval time.Duration, output chan lp.CCMessage)
|
||||
if maxMig == 0 {
|
||||
continue
|
||||
}
|
||||
cclog.ComponentDebug(m.name, "Reading MIG devices for GPU", i)
|
||||
cclog.ComponentDebugf(m.name, "Reading MIG devices for GPU %d", i)
|
||||
|
||||
for j := range maxMig {
|
||||
mdev, ret := nvml.DeviceGetMigDeviceHandleByIndex(m.gpus[i].device, j)
|
||||
@@ -1268,7 +1268,7 @@ func (m *NvidiaCollector) Read(interval time.Duration, output chan lp.CCMessage)
|
||||
if m.config.UseUuidForMigDevices {
|
||||
uuid, ret := nvml.DeviceGetUUID(mdev)
|
||||
if ret != nvml.SUCCESS {
|
||||
cclog.ComponentError(m.name, "Unable to get UUID for mig device at index", j, ":", err.Error())
|
||||
cclog.ComponentErrorf(m.name, "Unable to get UUID for mig device at index %d: %s", j, err.Error())
|
||||
} else {
|
||||
migDevice.tags["stype-id"] = uuid
|
||||
}
|
||||
|
||||
@@ -208,11 +208,10 @@ func (m *RAPLCollector) Init(config json.RawMessage) error {
|
||||
}
|
||||
|
||||
// Initialized
|
||||
cclog.ComponentDebug(
|
||||
cclog.ComponentDebugf(
|
||||
m.name,
|
||||
"initialized",
|
||||
len(m.RAPLZoneInfo),
|
||||
"zones with running average power limit (RAPL) monitoring attributes")
|
||||
"initialized %d zones with running average power limit (RAPL) monitoring attributes",
|
||||
len(m.RAPLZoneInfo))
|
||||
m.init = true
|
||||
|
||||
return err
|
||||
|
||||
@@ -124,7 +124,7 @@ func (m *RocmSmiCollector) Init(config json.RawMessage) error {
|
||||
if m.config.AddSerialMeta {
|
||||
serial, ret := rocm_smi.DeviceGetSerialNumber(device)
|
||||
if ret != rocm_smi.STATUS_SUCCESS {
|
||||
cclog.ComponentError(m.name, "Unable to get serial number for device at index", i, ":", rocm_smi.StatusStringNoError(ret))
|
||||
cclog.ComponentErrorf(m.name, "Unable to get serial number for device at index %d: %s", i, rocm_smi.StatusStringNoError(ret))
|
||||
} else {
|
||||
dev.meta["serial"] = serial
|
||||
}
|
||||
@@ -152,7 +152,7 @@ func (m *RocmSmiCollector) Read(interval time.Duration, output chan lp.CCMessage
|
||||
for _, dev := range m.devices {
|
||||
metrics, ret := rocm_smi.DeviceGetMetrics(dev.device)
|
||||
if ret != rocm_smi.STATUS_SUCCESS {
|
||||
cclog.ComponentError(m.name, "Unable to get metrics for device at index", dev.index, ":", rocm_smi.StatusStringNoError(ret))
|
||||
cclog.ComponentErrorf(m.name, "Unable to get metrics for device at index %d: %s", dev.index, rocm_smi.StatusStringNoError(ret))
|
||||
continue
|
||||
}
|
||||
|
||||
|
||||
@@ -147,15 +147,15 @@ func (m *SchedstatCollector) Read(interval time.Duration, output chan lp.CCMessa
|
||||
|
||||
file, err := os.Open(SCHEDSTATFILE)
|
||||
if err != nil {
|
||||
cclog.ComponentError(
|
||||
cclog.ComponentErrorf(
|
||||
m.name,
|
||||
fmt.Sprintf("Read(): Failed to open file '%s': %v", SCHEDSTATFILE, err))
|
||||
"Read(): Failed to open file '%s': %v", SCHEDSTATFILE, err)
|
||||
}
|
||||
defer func() {
|
||||
if err := file.Close(); err != nil {
|
||||
cclog.ComponentError(
|
||||
cclog.ComponentErrorf(
|
||||
m.name,
|
||||
fmt.Sprintf("Read(): Failed to close file '%s': %v", SCHEDSTATFILE, err))
|
||||
"Read(): Failed to close file '%s': %v", SCHEDSTATFILE, err)
|
||||
}
|
||||
}()
|
||||
|
||||
|
||||
@@ -240,7 +240,7 @@ func (m *SlurmCgroupCollector) Read(interval time.Duration, output chan lp.CCMes
|
||||
globPattern := filepath.Join(m.cgroupBase, "job_*")
|
||||
jobDirs, err := filepath.Glob(globPattern)
|
||||
if err != nil {
|
||||
cclog.ComponentError(m.name, "Error globbing job directories:", err.Error())
|
||||
cclog.ComponentErrorf(m.name, "Error globbing job directories: %s", err.Error())
|
||||
return
|
||||
}
|
||||
|
||||
@@ -249,7 +249,7 @@ func (m *SlurmCgroupCollector) Read(interval time.Duration, output chan lp.CCMes
|
||||
|
||||
jobdata, err := m.ReadJobData(jKey)
|
||||
if err != nil {
|
||||
cclog.ComponentError(m.name, "Error reading job data for", jKey, ":", err.Error())
|
||||
cclog.ComponentError(m.name, "Error reading job data for %s: %s", jKey, err.Error())
|
||||
continue
|
||||
}
|
||||
|
||||
|
||||
@@ -228,12 +228,12 @@ func (m *SmartMonCollector) Read(interval time.Duration, output chan lp.CCMessag
|
||||
|
||||
stdout, err := command.Output()
|
||||
if err != nil {
|
||||
cclog.ComponentError(m.name, "cannot read data for device", d.Name)
|
||||
cclog.ComponentErrorf(m.name, "cannot read data for device %s", d.Name)
|
||||
continue
|
||||
}
|
||||
err = json.Unmarshal(stdout, &data)
|
||||
if err != nil {
|
||||
cclog.ComponentError(m.name, "cannot unmarshal data for device", d.Name)
|
||||
cclog.ComponentErrorf(m.name, "cannot unmarshal data for device %s", d.Name)
|
||||
continue
|
||||
}
|
||||
if !m.excludeMetric.temp {
|
||||
|
||||
@@ -188,16 +188,16 @@ func (m *TempCollector) Read(interval time.Duration, output chan lp.CCMessage) {
|
||||
// Read sensor file
|
||||
buffer, err := os.ReadFile(sensor.file)
|
||||
if err != nil {
|
||||
cclog.ComponentError(
|
||||
cclog.ComponentErrorf(
|
||||
m.name,
|
||||
fmt.Sprintf("Read(): Failed to read file '%s': %v", sensor.file, err))
|
||||
"Read(): Failed to read file '%s': %v", sensor.file, err)
|
||||
continue
|
||||
}
|
||||
x, err := strconv.ParseInt(strings.TrimSpace(string(buffer)), 10, 64)
|
||||
if err != nil {
|
||||
cclog.ComponentError(
|
||||
cclog.ComponentErrorf(
|
||||
m.name,
|
||||
fmt.Sprintf("Read(): Failed to convert temperature '%s' to int64: %v", buffer, err))
|
||||
"Read(): Failed to convert temperature '%s' to int64: %v", buffer, err)
|
||||
continue
|
||||
}
|
||||
x /= 1000
|
||||
|
||||
@@ -77,9 +77,9 @@ func (m *TopProcsCollector) Read(interval time.Duration, output chan lp.CCMessag
|
||||
command := exec.Command("ps", "-Ao", "comm", "--sort=-pcpu")
|
||||
stdout, err := command.Output()
|
||||
if err != nil {
|
||||
cclog.ComponentError(
|
||||
cclog.ComponentErrorf(
|
||||
m.name,
|
||||
fmt.Sprintf("Read(): Failed to read output from command \"%s\": %v", command.String(), err))
|
||||
"Read(): Failed to read output from command \"%s\": %v", command.String(), err)
|
||||
return
|
||||
}
|
||||
|
||||
|
||||
2
go.mod
2
go.mod
@@ -9,7 +9,7 @@ require (
|
||||
github.com/PaesslerAG/gval v1.2.4
|
||||
github.com/fsnotify/fsnotify v1.10.1
|
||||
github.com/tklauser/go-sysconf v0.4.0
|
||||
golang.design/x/thread v0.0.0-20210122121316-335e9adffdf1
|
||||
golang.design/x/thread v0.3.2
|
||||
golang.org/x/sys v0.45.0
|
||||
)
|
||||
|
||||
|
||||
5
go.sum
5
go.sum
@@ -173,8 +173,8 @@ go.uber.org/goleak v1.3.0 h1:2K3zAYmnTNqV73imy9J1T3WC+gmCePx2hEGkimedGto=
|
||||
go.uber.org/goleak v1.3.0/go.mod h1:CoHD4mav9JJNrW/WLlf7HGZPjdw8EucARQHekz1X6bE=
|
||||
go.yaml.in/yaml/v2 v2.4.4 h1:tuyd0P+2Ont/d6e2rl3be67goVK4R6deVxCUX5vyPaQ=
|
||||
go.yaml.in/yaml/v2 v2.4.4/go.mod h1:gMZqIpDtDqOfM0uNfy0SkpRhvUryYH0Z6wdMYcacYXQ=
|
||||
golang.design/x/thread v0.0.0-20210122121316-335e9adffdf1 h1:P7S/GeHBAFEZIYp0ePPs2kHXoazz8q2KsyxHyQVGCJg=
|
||||
golang.design/x/thread v0.0.0-20210122121316-335e9adffdf1/go.mod h1:9CWpnTUmlQkfdpdutA1nNf4iE5lAVt3QZOu0Z6hahBE=
|
||||
golang.design/x/thread v0.3.2 h1:FmD1glspGrQCe6FuQLmSrT6wz2CSzq7vKVDluyiMnqo=
|
||||
golang.design/x/thread v0.3.2/go.mod h1:6+Hi2rMOgMHZdKDWaqNHyWtoFUx1HxZ06LfHPh5Z/hQ=
|
||||
golang.org/x/crypto v0.50.0 h1:zO47/JPrL6vsNkINmLoo/PH1gcxpls50DNogFvB5ZGI=
|
||||
golang.org/x/crypto v0.50.0/go.mod h1:3muZ7vA7PBCE6xgPX7nkzzjiUq87kRItoJQM1Yo8S+Q=
|
||||
golang.org/x/exp v0.0.0-20231005195138-3e424a577f31 h1:9k5exFQKQglLo+RoP+4zMjOFE14P6+vyR0baDAi0Rcs=
|
||||
@@ -183,7 +183,6 @@ golang.org/x/mod v0.13.0 h1:I/DsJXRlw/8l/0c24sM9yb0T4z9liZTduXvdAWYiysY=
|
||||
golang.org/x/mod v0.13.0/go.mod h1:hTbmBsO62+eylJbnUtE2MGJUyE7QWk4xUqPFrRgJ+7c=
|
||||
golang.org/x/net v0.53.0 h1:d+qAbo5L0orcWAr0a9JweQpjXF19LMXJE8Ey7hwOdUA=
|
||||
golang.org/x/net v0.53.0/go.mod h1:JvMuJH7rrdiCfbeHoo3fCQU24Lf5JJwT9W3sJFulfgs=
|
||||
golang.org/x/sys v0.0.0-20210122093101-04d7465088b8/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
|
||||
golang.org/x/sys v0.45.0 h1:dO4czNzziLiiXplLQgBCEpCvXQ3dnkn0SdaZSYdQ+FY=
|
||||
golang.org/x/sys v0.45.0/go.mod h1:4GL1E5IUh+htKOUEOaiffhrAeqysfVGipDYzABqnCmw=
|
||||
golang.org/x/time v0.15.0 h1:bbrp8t3bGUeFOx08pvsMYRTCVSMk89u4tKbNOZbp88U=
|
||||
|
||||
@@ -35,18 +35,18 @@ type metricRouterTagConfig struct {
|
||||
|
||||
// Metric router configuration
|
||||
type metricRouterConfig struct {
|
||||
HostnameTagName string `json:"hostname_tag"` // Key name used when adding the hostname to a metric (default 'hostname')
|
||||
AddTags []metricRouterTagConfig `json:"add_tags"` // List of tags that are added when the condition is met
|
||||
DelTags []metricRouterTagConfig `json:"delete_tags"` // List of tags that are removed when the condition is met
|
||||
IntervalAgg []agg.MetricAggregatorIntervalConfig `json:"interval_aggregates"` // List of aggregation function processed at the end of an interval
|
||||
DropMetrics []string `json:"drop_metrics"` // List of metric names to drop. For fine-grained dropping use drop_metrics_if
|
||||
DropMetricsIf []string `json:"drop_metrics_if"` // List of evaluatable terms to drop metrics
|
||||
RenameMetrics map[string]string `json:"rename_metrics"` // Map to rename metric name from key to value
|
||||
IntervalStamp bool `json:"interval_timestamp"` // Update timestamp periodically by ticker each interval?
|
||||
NumCacheIntervals int `json:"num_cache_intervals"` // Number of intervals of cached metrics for evaluation
|
||||
MaxForward int `json:"max_forward"` // Number of maximal forwarded metrics at one select
|
||||
NormalizeUnits bool `json:"normalize_units"` // Check unit meta flag and normalize it using cc-units
|
||||
ChangeUnitPrefix map[string]string `json:"change_unit_prefix"` // Add prefix that should be applied to the metrics
|
||||
HostnameTagName string `json:"hostname_tag,omitempty"` // Key name used when adding the hostname to a metric (default 'hostname')
|
||||
AddTags []metricRouterTagConfig `json:"add_tags,omitempty"` // List of tags that are added when the condition is met
|
||||
DelTags []metricRouterTagConfig `json:"delete_tags,omitempty"` // List of tags that are removed when the condition is met
|
||||
IntervalAgg []agg.MetricAggregatorIntervalConfig `json:"interval_aggregates,omitempty"` // List of aggregation function processed at the end of an interval
|
||||
DropMetrics []string `json:"drop_metrics,omitempty"` // List of metric names to drop. For fine-grained dropping use drop_metrics_if
|
||||
DropMetricsIf []string `json:"drop_metrics_if,omitempty"` // List of evaluatable terms to drop metrics
|
||||
RenameMetrics map[string]string `json:"rename_metrics,omitempty"` // Map to rename metric name from key to value
|
||||
IntervalStamp bool `json:"interval_timestamp,omitempty"` // Update timestamp periodically by ticker each interval?
|
||||
NumCacheIntervals int `json:"num_cache_intervals,omitempty"` // Number of intervals of cached metrics for evaluation
|
||||
MaxForward int `json:"max_forward,omitempty"` // Number of maximal forwarded metrics at one select
|
||||
NormalizeUnits bool `json:"normalize_units,omitempty"` // Check unit meta flag and normalize it using cc-units
|
||||
ChangeUnitPrefix map[string]string `json:"change_unit_prefix,omitempty"` // Add prefix that should be applied to the metrics
|
||||
MessageProcessor json.RawMessage `json:"process_messages,omitempty"`
|
||||
}
|
||||
|
||||
@@ -297,7 +297,7 @@ func (r *metricRouter) Start() {
|
||||
|
||||
case timestamp := <-timeChan:
|
||||
r.timestamp = timestamp
|
||||
cclog.ComponentDebug("MetricRouter", "Update timestamp", r.timestamp.UnixNano())
|
||||
cclog.ComponentDebugf("MetricRouter", "Update timestamp %d", r.timestamp.UnixNano())
|
||||
|
||||
case p := <-r.coll_input:
|
||||
coll_forward(p)
|
||||
|
||||
Reference in New Issue
Block a user