mirror of
https://github.com/ClusterCockpit/cc-metric-collector.git
synced 2026-06-11 14:27:31 +02:00
Compare commits
5 Commits
main
...
cclog_upda
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
c9dfca622f | ||
|
|
3b0638e815 | ||
|
|
037b4f1526 | ||
|
|
5d55ee7a77 | ||
|
|
5938368a76 |
@@ -132,11 +132,11 @@ func mainFunc() int {
|
|||||||
if len(rcfg.ConfigFile.Interval) > 0 {
|
if len(rcfg.ConfigFile.Interval) > 0 {
|
||||||
t, err := time.ParseDuration(rcfg.ConfigFile.Interval)
|
t, err := time.ParseDuration(rcfg.ConfigFile.Interval)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
cclog.Error("Configuration value 'interval' no valid duration")
|
cclog.Errorf("Configuration value interval=%s no valid duration", rcfg.ConfigFile.Interval)
|
||||||
}
|
}
|
||||||
rcfg.Interval = t
|
rcfg.Interval = t
|
||||||
if rcfg.Interval == 0 {
|
if rcfg.Interval == 0 {
|
||||||
cclog.Error("Configuration value 'interval' must be greater than zero")
|
cclog.Errorf("Configuration value interval=%s must be greater than zero", rcfg.ConfigFile.Interval)
|
||||||
return 1
|
return 1
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -145,11 +145,11 @@ func mainFunc() int {
|
|||||||
if len(rcfg.ConfigFile.Duration) > 0 {
|
if len(rcfg.ConfigFile.Duration) > 0 {
|
||||||
t, err := time.ParseDuration(rcfg.ConfigFile.Duration)
|
t, err := time.ParseDuration(rcfg.ConfigFile.Duration)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
cclog.Error("Configuration value 'duration' no valid duration")
|
cclog.Error("Configuration value duration=%s no valid duration", rcfg.ConfigFile.Duration)
|
||||||
}
|
}
|
||||||
rcfg.Duration = t
|
rcfg.Duration = t
|
||||||
if rcfg.Duration == 0 {
|
if rcfg.Duration == 0 {
|
||||||
cclog.Error("Configuration value 'duration' must be greater than zero")
|
cclog.Error("Configuration value duration=%s must be greater than zero", rcfg.ConfigFile.Duration)
|
||||||
return 1
|
return 1
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -209,16 +209,16 @@ func (m *BeegfsMetaCollector) Read(interval time.Duration, output chan lp.CCMess
|
|||||||
} else {
|
} else {
|
||||||
f1, err := strconv.ParseFloat(m.matches["other"], 32)
|
f1, err := strconv.ParseFloat(m.matches["other"], 32)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
cclog.ComponentError(
|
cclog.ComponentErrorf(
|
||||||
m.name,
|
m.name,
|
||||||
fmt.Sprintf("Metric (other): Failed to convert str written '%s' to float: %v", m.matches["other"], err))
|
"Metric (other): Failed to convert str written '%s' to float: %v", m.matches["other"], err)
|
||||||
continue
|
continue
|
||||||
}
|
}
|
||||||
f2, err := strconv.ParseFloat(split[i], 32)
|
f2, err := strconv.ParseFloat(split[i], 32)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
cclog.ComponentError(
|
cclog.ComponentErrorf(
|
||||||
m.name,
|
m.name,
|
||||||
fmt.Sprintf("Metric (other): Failed to convert str written '%s' to float: %v", m.matches["other"], err))
|
"Metric (other): Failed to convert str written '%s' to float: %v", m.matches["other"], err)
|
||||||
continue
|
continue
|
||||||
}
|
}
|
||||||
m.matches["beegfs_cstorage_other"] = fmt.Sprintf("%f", f1+f2)
|
m.matches["beegfs_cstorage_other"] = fmt.Sprintf("%f", f1+f2)
|
||||||
|
|||||||
@@ -200,16 +200,16 @@ func (m *BeegfsStorageCollector) Read(interval time.Duration, output chan lp.CCM
|
|||||||
} else {
|
} else {
|
||||||
f1, err := strconv.ParseFloat(m.matches["other"], 32)
|
f1, err := strconv.ParseFloat(m.matches["other"], 32)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
cclog.ComponentError(
|
cclog.ComponentErrorf(
|
||||||
m.name,
|
m.name,
|
||||||
fmt.Sprintf("Metric (other): Failed to convert str written '%s' to float: %v", m.matches["other"], err))
|
"Metric (other): Failed to convert str written '%s' to float: %v", m.matches["other"], err)
|
||||||
continue
|
continue
|
||||||
}
|
}
|
||||||
f2, err := strconv.ParseFloat(split[i], 32)
|
f2, err := strconv.ParseFloat(split[i], 32)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
cclog.ComponentError(
|
cclog.ComponentErrorf(
|
||||||
m.name,
|
m.name,
|
||||||
fmt.Sprintf("Metric (other): Failed to convert str written '%s' to float: %v", m.matches["other"], err))
|
"Metric (other): Failed to convert str written '%s' to float: %v", m.matches["other"], err)
|
||||||
continue
|
continue
|
||||||
}
|
}
|
||||||
m.matches["beegfs_cstorage_other"] = fmt.Sprintf("%f", f1+f2)
|
m.matches["beegfs_cstorage_other"] = fmt.Sprintf("%f", f1+f2)
|
||||||
|
|||||||
@@ -50,6 +50,7 @@ var AvailableCollectors = map[string]MetricCollector{
|
|||||||
"nfsiostat": new(NfsIOStatCollector),
|
"nfsiostat": new(NfsIOStatCollector),
|
||||||
"slurm_cgroup": new(SlurmCgroupCollector),
|
"slurm_cgroup": new(SlurmCgroupCollector),
|
||||||
"smartmon": new(SmartMonCollector),
|
"smartmon": new(SmartMonCollector),
|
||||||
|
"nvidia_gpm": new(NvidiaGPMCollector),
|
||||||
}
|
}
|
||||||
|
|
||||||
// Metric collector manager data structure
|
// Metric collector manager data structure
|
||||||
@@ -99,17 +100,17 @@ func (cm *collectorManager) Init(ticker mct.MultiChanTicker, duration time.Durat
|
|||||||
// Initialize configured collectors
|
// Initialize configured collectors
|
||||||
for collectorName, collectorCfg := range cm.config {
|
for collectorName, collectorCfg := range cm.config {
|
||||||
if _, found := AvailableCollectors[collectorName]; !found {
|
if _, found := AvailableCollectors[collectorName]; !found {
|
||||||
cclog.ComponentError("CollectorManager", "SKIP unknown collector", collectorName)
|
cclog.ComponentErrorf("CollectorManager", "SKIP unknown collector %s", collectorName)
|
||||||
continue
|
continue
|
||||||
}
|
}
|
||||||
collector := AvailableCollectors[collectorName]
|
collector := AvailableCollectors[collectorName]
|
||||||
|
|
||||||
err := collector.Init(collectorCfg)
|
err := collector.Init(collectorCfg)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
cclog.ComponentError("CollectorManager", fmt.Sprintf("Collector %s initialization failed: %v", collectorName, err))
|
cclog.ComponentErrorf("CollectorManager", "Collector %s initialization failed: %v", collectorName, err)
|
||||||
continue
|
continue
|
||||||
}
|
}
|
||||||
cclog.ComponentDebug("CollectorManager", "ADD COLLECTOR", collector.Name())
|
cclog.ComponentDebugf("CollectorManager", "ADD COLLECTOR %s", collector.Name())
|
||||||
if collector.Parallel() {
|
if collector.Parallel() {
|
||||||
cm.collectors = append(cm.collectors, collector)
|
cm.collectors = append(cm.collectors, collector)
|
||||||
} else {
|
} else {
|
||||||
@@ -155,7 +156,7 @@ func (cm *collectorManager) Start() {
|
|||||||
return
|
return
|
||||||
default:
|
default:
|
||||||
// Read metrics from collector c via goroutine
|
// Read metrics from collector c via goroutine
|
||||||
cclog.ComponentDebug("CollectorManager", c.Name(), t)
|
cclog.ComponentDebugf("CollectorManager: Read %s at %v", c.Name(), t)
|
||||||
cm.collector_wg.Add(1)
|
cm.collector_wg.Add(1)
|
||||||
go func(myc MetricCollector) {
|
go func(myc MetricCollector) {
|
||||||
myc.Read(cm.duration, cm.output)
|
myc.Read(cm.duration, cm.output)
|
||||||
@@ -173,7 +174,7 @@ func (cm *collectorManager) Start() {
|
|||||||
return
|
return
|
||||||
default:
|
default:
|
||||||
// Read metrics from collector c
|
// Read metrics from collector c
|
||||||
cclog.ComponentDebug("CollectorManager", c.Name(), t)
|
cclog.ComponentDebugf("CollectorManager: Read %s at %v", c.Name(), t)
|
||||||
c.Read(cm.duration, cm.output)
|
c.Read(cm.duration, cm.output)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -139,16 +139,16 @@ func (m *CPUFreqCpuInfoCollector) Read(interval time.Duration, output chan lp.CC
|
|||||||
const cpuInfoFile = "/proc/cpuinfo"
|
const cpuInfoFile = "/proc/cpuinfo"
|
||||||
file, err := os.Open(cpuInfoFile)
|
file, err := os.Open(cpuInfoFile)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
cclog.ComponentError(
|
cclog.ComponentErrorf(
|
||||||
m.name,
|
m.name,
|
||||||
fmt.Sprintf("Read(): Failed to open file '%s': %v", cpuInfoFile, err))
|
"Read(): Failed to open file '%s': %v", cpuInfoFile, err)
|
||||||
return
|
return
|
||||||
}
|
}
|
||||||
defer func() {
|
defer func() {
|
||||||
if err := file.Close(); err != nil {
|
if err := file.Close(); err != nil {
|
||||||
cclog.ComponentError(
|
cclog.ComponentErrorf(
|
||||||
m.name,
|
m.name,
|
||||||
fmt.Sprintf("Read(): Failed to close file '%s': %v", cpuInfoFile, err))
|
"Read(): Failed to close file '%s': %v", cpuInfoFile, err)
|
||||||
}
|
}
|
||||||
}()
|
}()
|
||||||
|
|
||||||
@@ -166,9 +166,9 @@ func (m *CPUFreqCpuInfoCollector) Read(interval time.Duration, output chan lp.CC
|
|||||||
if !t.isHT {
|
if !t.isHT {
|
||||||
value, err := strconv.ParseFloat(strings.TrimSpace(lineSplit[1]), 64)
|
value, err := strconv.ParseFloat(strings.TrimSpace(lineSplit[1]), 64)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
cclog.ComponentError(
|
cclog.ComponentErrorf(
|
||||||
m.name,
|
m.name,
|
||||||
fmt.Sprintf("Read(): Failed to convert cpu MHz '%s' to float64: %v", lineSplit[1], err))
|
"Read(): Failed to convert cpu MHz '%s' to float64: %v", lineSplit[1], err)
|
||||||
return
|
return
|
||||||
}
|
}
|
||||||
if y, err := lp.NewMetric("cpufreq", t.tagSet, m.meta, value, now); err == nil {
|
if y, err := lp.NewMetric("cpufreq", t.tagSet, m.meta, value, now); err == nil {
|
||||||
|
|||||||
@@ -95,10 +95,7 @@ func (m *CPUFreqCollector) Init(config json.RawMessage) error {
|
|||||||
}
|
}
|
||||||
|
|
||||||
// Initialized
|
// Initialized
|
||||||
cclog.ComponentDebug(
|
cclog.ComponentDebugf(m.name, "initialized %d non-hyper-threading CPUs")
|
||||||
m.name,
|
|
||||||
"initialized",
|
|
||||||
len(m.topology), "non-hyper-threading CPUs")
|
|
||||||
m.init = true
|
m.init = true
|
||||||
return nil
|
return nil
|
||||||
}
|
}
|
||||||
@@ -116,16 +113,14 @@ func (m *CPUFreqCollector) Read(interval time.Duration, output chan lp.CCMessage
|
|||||||
// Read current frequency
|
// Read current frequency
|
||||||
line, err := os.ReadFile(t.scalingCurFreqFile)
|
line, err := os.ReadFile(t.scalingCurFreqFile)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
cclog.ComponentError(
|
cclog.ComponentErrorf(
|
||||||
m.name,
|
m.name, "Read(): Failed to read file '%s': %v", t.scalingCurFreqFile, err)
|
||||||
fmt.Sprintf("Read(): Failed to read file '%s': %v", t.scalingCurFreqFile, err))
|
|
||||||
continue
|
continue
|
||||||
}
|
}
|
||||||
cpuFreq, err := strconv.ParseInt(strings.TrimSpace(string(line)), 10, 64)
|
cpuFreq, err := strconv.ParseInt(strings.TrimSpace(string(line)), 10, 64)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
cclog.ComponentError(
|
cclog.ComponentError(
|
||||||
m.name,
|
m.name, "Read(): Failed to convert CPU frequency '%s' to int64: %v", line, err)
|
||||||
fmt.Sprintf("Read(): Failed to convert CPU frequency '%s' to int64: %v", line, err))
|
|
||||||
continue
|
continue
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
@@ -171,15 +171,15 @@ func (m *CpustatCollector) Read(interval time.Duration, output chan lp.CCMessage
|
|||||||
|
|
||||||
file, err := os.Open(CPUSTATFILE)
|
file, err := os.Open(CPUSTATFILE)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
cclog.ComponentError(
|
cclog.ComponentErrorf(
|
||||||
m.name,
|
m.name,
|
||||||
fmt.Sprintf("Read(): Failed to open file '%s': %v", CPUSTATFILE, err))
|
"Read(): Failed to open file '%s': %v", CPUSTATFILE, err)
|
||||||
}
|
}
|
||||||
defer func() {
|
defer func() {
|
||||||
if err := file.Close(); err != nil {
|
if err := file.Close(); err != nil {
|
||||||
cclog.ComponentError(
|
cclog.ComponentErrorf(
|
||||||
m.name,
|
m.name,
|
||||||
fmt.Sprintf("Read(): Failed to close file '%s': %v", string(CPUSTATFILE), err))
|
"Read(): Failed to close file '%s': %v", string(CPUSTATFILE), err)
|
||||||
}
|
}
|
||||||
}()
|
}()
|
||||||
|
|
||||||
|
|||||||
@@ -64,9 +64,9 @@ func (m *CustomCmdCollector) Init(config json.RawMessage) error {
|
|||||||
cmdFields := strings.Fields(c)
|
cmdFields := strings.Fields(c)
|
||||||
command := exec.Command(cmdFields[0], cmdFields[1:]...)
|
command := exec.Command(cmdFields[0], cmdFields[1:]...)
|
||||||
if _, err := command.Output(); err != nil {
|
if _, err := command.Output(); err != nil {
|
||||||
cclog.ComponentWarn(
|
cclog.ComponentWarnf(
|
||||||
m.name,
|
m.name,
|
||||||
fmt.Sprintf("%s Init(): Execution of command \"%s\" failed: %v", m.name, command.String(), err))
|
"%s Init(): Execution of command \"%s\" failed: %v", m.name, command.String(), err)
|
||||||
continue
|
continue
|
||||||
}
|
}
|
||||||
m.cmdFieldsSlice = append(m.cmdFieldsSlice, cmdFields)
|
m.cmdFieldsSlice = append(m.cmdFieldsSlice, cmdFields)
|
||||||
@@ -77,7 +77,7 @@ func (m *CustomCmdCollector) Init(config json.RawMessage) error {
|
|||||||
if _, err := os.ReadFile(fileName); err != nil {
|
if _, err := os.ReadFile(fileName); err != nil {
|
||||||
cclog.ComponentWarn(
|
cclog.ComponentWarn(
|
||||||
m.name,
|
m.name,
|
||||||
fmt.Sprintf("%s Init(): Reading of file \"%s\" failed: %v", m.name, fileName, err))
|
"%s Init(): Reading of file \"%s\" failed: %v", m.name, fileName, err)
|
||||||
continue
|
continue
|
||||||
}
|
}
|
||||||
m.files = append(m.files, fileName)
|
m.files = append(m.files, fileName)
|
||||||
@@ -100,20 +100,18 @@ func (m *CustomCmdCollector) Read(interval time.Duration, output chan lp.CCMessa
|
|||||||
command := exec.Command(cmdFields[0], cmdFields[1:]...)
|
command := exec.Command(cmdFields[0], cmdFields[1:]...)
|
||||||
stdout, err := command.Output()
|
stdout, err := command.Output()
|
||||||
if err != nil {
|
if err != nil {
|
||||||
cclog.ComponentError(
|
cclog.ComponentErrorf(
|
||||||
m.name,
|
m.name,
|
||||||
fmt.Sprintf("Read(): Failed to read command output for command \"%s\": %v", command.String(), err),
|
"Read(): Failed to read command output for command \"%s\": %v", command.String(), err)
|
||||||
)
|
|
||||||
continue
|
continue
|
||||||
}
|
}
|
||||||
|
|
||||||
// Read and decode influxDB line-protocol from command output
|
// Read and decode influxDB line-protocol from command output
|
||||||
metrics, err := lp.FromBytes(stdout)
|
metrics, err := lp.FromBytes(stdout)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
cclog.ComponentError(
|
cclog.ComponentErrorf(
|
||||||
m.name,
|
m.name,
|
||||||
fmt.Sprintf("Read(): Failed to decode influx Message: %v", err),
|
"Read(): Failed to decode influx Message: %v", err)
|
||||||
)
|
|
||||||
continue
|
continue
|
||||||
}
|
}
|
||||||
for _, metric := range metrics {
|
for _, metric := range metrics {
|
||||||
@@ -128,20 +126,18 @@ func (m *CustomCmdCollector) Read(interval time.Duration, output chan lp.CCMessa
|
|||||||
for _, filename := range m.files {
|
for _, filename := range m.files {
|
||||||
input, err := os.ReadFile(filename)
|
input, err := os.ReadFile(filename)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
cclog.ComponentError(
|
cclog.ComponentErrorf(
|
||||||
m.name,
|
m.name,
|
||||||
fmt.Sprintf("Read(): Failed to read file \"%s\": %v\n", filename, err),
|
"Read(): Failed to read file \"%s\": %v\n", filename, err)
|
||||||
)
|
|
||||||
continue
|
continue
|
||||||
}
|
}
|
||||||
|
|
||||||
// Read and decode influxDB line-protocol from file
|
// Read and decode influxDB line-protocol from file
|
||||||
metrics, err := lp.FromBytes(input)
|
metrics, err := lp.FromBytes(input)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
cclog.ComponentError(
|
cclog.ComponentErrorf(
|
||||||
m.name,
|
m.name,
|
||||||
fmt.Sprintf("Read(): Failed to decode influx Message: %v", err),
|
"Read(): Failed to decode influx Message: %v", err)
|
||||||
)
|
|
||||||
continue
|
continue
|
||||||
}
|
}
|
||||||
for _, metric := range metrics {
|
for _, metric := range metrics {
|
||||||
|
|||||||
@@ -77,16 +77,16 @@ func (m *DiskstatCollector) Read(interval time.Duration, output chan lp.CCMessag
|
|||||||
|
|
||||||
file, err := os.Open(MOUNTFILE)
|
file, err := os.Open(MOUNTFILE)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
cclog.ComponentError(
|
cclog.ComponentErrorf(
|
||||||
m.name,
|
m.name,
|
||||||
fmt.Sprintf("Read(): Failed to open file '%s': %v", MOUNTFILE, err))
|
"Read(): Failed to open file '%s': %v", MOUNTFILE, err)
|
||||||
return
|
return
|
||||||
}
|
}
|
||||||
defer func() {
|
defer func() {
|
||||||
if err := file.Close(); err != nil {
|
if err := file.Close(); err != nil {
|
||||||
cclog.ComponentError(
|
cclog.ComponentError(
|
||||||
m.name,
|
m.name,
|
||||||
fmt.Sprintf("Read(): Failed to close file '%s': %v", MOUNTFILE, err))
|
"Read(): Failed to close file '%s': %v", MOUNTFILE, err)
|
||||||
}
|
}
|
||||||
}()
|
}()
|
||||||
|
|
||||||
|
|||||||
@@ -371,7 +371,7 @@ func (m *GpfsCollector) Init(config json.RawMessage) error {
|
|||||||
if err != nil {
|
if err != nil {
|
||||||
// if using sudo, exec.lookPath will return EACCES (file mode r-x------), this can be ignored
|
// if using sudo, exec.lookPath will return EACCES (file mode r-x------), this can be ignored
|
||||||
if m.config.Sudo && errors.Is(err, syscall.EACCES) {
|
if m.config.Sudo && errors.Is(err, syscall.EACCES) {
|
||||||
cclog.ComponentWarn(m.name, fmt.Sprintf("got error looking for mmpmon binary '%s': %v . This is expected when using sudo, continuing.", m.config.Mmpmon, err))
|
cclog.ComponentWarnf(m.name, "got error looking for mmpmon binary '%s': %v . This is expected when using sudo, continuing.", m.config.Mmpmon, err)
|
||||||
// the file was given in the config, use it
|
// the file was given in the config, use it
|
||||||
p = m.config.Mmpmon
|
p = m.config.Mmpmon
|
||||||
} else {
|
} else {
|
||||||
@@ -517,23 +517,23 @@ func (m *GpfsCollector) Read(interval time.Duration, output chan lp.CCMessage) {
|
|||||||
// return code
|
// return code
|
||||||
rc, err := strconv.Atoi(key_value["_rc_"])
|
rc, err := strconv.Atoi(key_value["_rc_"])
|
||||||
if err != nil {
|
if err != nil {
|
||||||
cclog.ComponentError(m.name, fmt.Sprintf("Read(): Failed to convert return code '%s' to int: %v", key_value["_rc_"], err))
|
cclog.ComponentErrorf(m.name, "Read(): Failed to convert return code '%s' to int: %v", key_value["_rc_"], err)
|
||||||
continue
|
continue
|
||||||
}
|
}
|
||||||
if rc != 0 {
|
if rc != 0 {
|
||||||
cclog.ComponentError(m.name, fmt.Sprintf("Read(): Filesystem '%s' is not ok.", filesystem))
|
cclog.ComponentErrorf(m.name, "Read(): Filesystem '%s' is not ok.", filesystem)
|
||||||
continue
|
continue
|
||||||
}
|
}
|
||||||
|
|
||||||
// timestamp
|
// timestamp
|
||||||
sec, err := strconv.ParseInt(key_value["_t_"], 10, 64)
|
sec, err := strconv.ParseInt(key_value["_t_"], 10, 64)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
cclog.ComponentError(m.name, fmt.Sprintf("Read(): Failed to convert seconds '%s' to int64: %v", key_value["_t_"], err))
|
cclog.ComponentErrorf(m.name, "Read(): Failed to convert seconds '%s' to int64: %v", key_value["_t_"], err)
|
||||||
continue
|
continue
|
||||||
}
|
}
|
||||||
msec, err := strconv.ParseInt(key_value["_tu_"], 10, 64)
|
msec, err := strconv.ParseInt(key_value["_tu_"], 10, 64)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
cclog.ComponentError(m.name, fmt.Sprintf("Read(): Failed to convert micro seconds '%s' to int64: %v", key_value["_tu_"], err))
|
cclog.ComponentErrorf(m.name, "Read(): Failed to convert micro seconds '%s' to int64: %v", key_value["_tu_"], err)
|
||||||
continue
|
continue
|
||||||
}
|
}
|
||||||
timestamp := time.Unix(sec, msec*1000)
|
timestamp := time.Unix(sec, msec*1000)
|
||||||
@@ -551,7 +551,7 @@ func (m *GpfsCollector) Read(interval time.Duration, output chan lp.CCMessage) {
|
|||||||
for _, metric := range GpfsAbsMetrics {
|
for _, metric := range GpfsAbsMetrics {
|
||||||
value, err := strconv.ParseInt(key_value[metric.prefix], 10, 64)
|
value, err := strconv.ParseInt(key_value[metric.prefix], 10, 64)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
cclog.ComponentError(m.name, fmt.Sprintf("Read(): Failed to convert %s '%s' to int64: %v", metric.desc, key_value[metric.prefix], err))
|
cclog.ComponentErrorf(m.name, "Read(): Failed to convert %s '%s' to int64: %v", metric.desc, key_value[metric.prefix], err)
|
||||||
continue
|
continue
|
||||||
}
|
}
|
||||||
newstate[metric.prefix] = value
|
newstate[metric.prefix] = value
|
||||||
@@ -636,7 +636,7 @@ func (m *GpfsCollector) Read(interval time.Duration, output chan lp.CCMessage) {
|
|||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
// the value could not be computed correctly
|
// the value could not be computed correctly
|
||||||
cclog.ComponentWarn(m.name, fmt.Sprintf("Read(): Could not compute value for filesystem %s of metric %s: vold_ok = %t, vnew_ok = %t", filesystem, metric.name, vold_ok, vnew_ok))
|
cclog.ComponentWarnf(m.name, "Read(): Could not compute value for filesystem %s of metric %s: vold_ok = %t, vnew_ok = %t", filesystem, metric.name, vold_ok, vnew_ok)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
@@ -225,9 +225,9 @@ func (m *InfinibandCollector) Read(interval time.Duration, output chan lp.CCMess
|
|||||||
// Read counter file
|
// Read counter file
|
||||||
line, err := os.ReadFile(counterDef.path)
|
line, err := os.ReadFile(counterDef.path)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
cclog.ComponentError(
|
cclog.ComponentErrorf(
|
||||||
m.name,
|
m.name,
|
||||||
fmt.Sprintf("Read(): Failed to read from file '%s': %v", counterDef.path, err))
|
"Read(): Failed to read from file '%s': %v", counterDef.path, err)
|
||||||
// Current counter can not be saved as last state
|
// Current counter can not be saved as last state
|
||||||
counterDef.lastStateAvailable = false
|
counterDef.lastStateAvailable = false
|
||||||
continue
|
continue
|
||||||
@@ -237,9 +237,9 @@ func (m *InfinibandCollector) Read(interval time.Duration, output chan lp.CCMess
|
|||||||
// convert counter to uint64
|
// convert counter to uint64
|
||||||
vRawCounter, err := strconv.ParseUint(data, 10, 64)
|
vRawCounter, err := strconv.ParseUint(data, 10, 64)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
cclog.ComponentError(
|
cclog.ComponentErrorf(
|
||||||
m.name,
|
m.name,
|
||||||
fmt.Sprintf("Read(): Failed to convert Infininiband metrice %s='%s' to uint64: %v", counterDef.name, data, err))
|
"Read(): Failed to convert Infininiband metrice %s='%s' to uint64: %v", counterDef.name, data, err)
|
||||||
// Current counter can not be saved as last state
|
// Current counter can not be saved as last state
|
||||||
counterDef.lastStateAvailable = false
|
counterDef.lastStateAvailable = false
|
||||||
continue
|
continue
|
||||||
|
|||||||
@@ -145,16 +145,16 @@ func (m *IOstatCollector) Read(interval time.Duration, output chan lp.CCMessage)
|
|||||||
|
|
||||||
file, err := os.Open(IOSTATFILE)
|
file, err := os.Open(IOSTATFILE)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
cclog.ComponentError(
|
cclog.ComponentErrorf(
|
||||||
m.name,
|
m.name,
|
||||||
fmt.Sprintf("Read(): Failed to open file '%s': %v", IOSTATFILE, err))
|
"Read(): Failed to open file '%s': %v", IOSTATFILE, err)
|
||||||
return
|
return
|
||||||
}
|
}
|
||||||
defer func() {
|
defer func() {
|
||||||
if err := file.Close(); err != nil {
|
if err := file.Close(); err != nil {
|
||||||
cclog.ComponentError(
|
cclog.ComponentErrorf(
|
||||||
m.name,
|
m.name,
|
||||||
fmt.Sprintf("Read(): Failed to close file '%s': %v", IOSTATFILE, err))
|
"Read(): Failed to close file '%s': %v", IOSTATFILE, err)
|
||||||
}
|
}
|
||||||
}()
|
}()
|
||||||
|
|
||||||
|
|||||||
@@ -12,6 +12,12 @@ package collectors
|
|||||||
#cgo LDFLAGS: -Wl,--unresolved-symbols=ignore-in-object-files
|
#cgo LDFLAGS: -Wl,--unresolved-symbols=ignore-in-object-files
|
||||||
#include <stdlib.h>
|
#include <stdlib.h>
|
||||||
#include <likwid.h>
|
#include <likwid.h>
|
||||||
|
|
||||||
|
|
||||||
|
int cc_add_hwthread(int cpu_id) {
|
||||||
|
return HPMaddThread(cpu_id);
|
||||||
|
}
|
||||||
|
|
||||||
*/
|
*/
|
||||||
import "C"
|
import "C"
|
||||||
|
|
||||||
@@ -261,12 +267,12 @@ func (m *LikwidCollector) Init(config json.RawMessage) error {
|
|||||||
}
|
}
|
||||||
for _, metric := range evset.Metrics {
|
for _, metric := range evset.Metrics {
|
||||||
// Try to evaluate the metric
|
// Try to evaluate the metric
|
||||||
cclog.ComponentDebug(m.name, "Checking", metric.Name)
|
cclog.ComponentDebugf(m.name, "Checking %s", metric.Name)
|
||||||
if !checkMetricType(metric.Type) {
|
if !checkMetricType(metric.Type) {
|
||||||
cclog.ComponentError(m.name, "Metric", metric.Name, "uses invalid type", metric.Type)
|
cclog.ComponentErrorf(m.name, "Metric %s uses invalid type %s", metric.Name, metric.Type)
|
||||||
metric.Calc = ""
|
metric.Calc = ""
|
||||||
} else if !testLikwidMetricFormula(metric.Calc, params) {
|
} else if !testLikwidMetricFormula(metric.Calc, params) {
|
||||||
cclog.ComponentError(m.name, "Metric", metric.Name, "cannot be calculated with given counters")
|
cclog.ComponentError(m.name, "Metric %s cannot be calculated with given counters", metric.Name)
|
||||||
metric.Calc = ""
|
metric.Calc = ""
|
||||||
} else {
|
} else {
|
||||||
globalParams = append(globalParams, metric.Name)
|
globalParams = append(globalParams, metric.Name)
|
||||||
@@ -281,13 +287,13 @@ func (m *LikwidCollector) Init(config json.RawMessage) error {
|
|||||||
for _, metric := range m.config.Metrics {
|
for _, metric := range m.config.Metrics {
|
||||||
// Try to evaluate the global metric
|
// Try to evaluate the global metric
|
||||||
if !checkMetricType(metric.Type) {
|
if !checkMetricType(metric.Type) {
|
||||||
cclog.ComponentError(m.name, "Metric", metric.Name, "uses invalid type", metric.Type)
|
cclog.ComponentErrorf(m.name, "Metric %s uses invalid type %s", metric.Name, metric.Type)
|
||||||
metric.Calc = ""
|
metric.Calc = ""
|
||||||
} else if !testLikwidMetricFormula(metric.Calc, globalParams) {
|
} else if !testLikwidMetricFormula(metric.Calc, globalParams) {
|
||||||
cclog.ComponentError(m.name, "Metric", metric.Name, "cannot be calculated with given counters")
|
cclog.ComponentError(m.name, "Metric %s cannot be calculated with given counters", metric.Name)
|
||||||
metric.Calc = ""
|
metric.Calc = ""
|
||||||
} else if !checkMetricType(metric.Type) {
|
} else if !checkMetricType(metric.Type) {
|
||||||
cclog.ComponentError(m.name, "Metric", metric.Name, "has invalid type")
|
cclog.ComponentError(m.name, "Metric %s has invalid type", metric.Name)
|
||||||
metric.Calc = ""
|
metric.Calc = ""
|
||||||
} else {
|
} else {
|
||||||
totalMetrics++
|
totalMetrics++
|
||||||
@@ -328,7 +334,7 @@ func (m *LikwidCollector) Init(config json.RawMessage) error {
|
|||||||
for _, c := range m.cpulist {
|
for _, c := range m.cpulist {
|
||||||
m.measureThread.Call(
|
m.measureThread.Call(
|
||||||
func() {
|
func() {
|
||||||
retCode := C.HPMaddThread(C.uint32_t(c))
|
retCode := C.cc_add_hwthread(C.int(c))
|
||||||
if retCode != 0 {
|
if retCode != 0 {
|
||||||
err := fmt.Errorf("C.HPMaddThread(%v) failed with return code %v", c, retCode)
|
err := fmt.Errorf("C.HPMaddThread(%v) failed with return code %v", c, retCode)
|
||||||
cclog.ComponentError(m.name, err.Error())
|
cclog.ComponentError(m.name, err.Error())
|
||||||
@@ -375,16 +381,16 @@ func (m *LikwidCollector) takeMeasurement(evidx int, evset LikwidEventsetConfig,
|
|||||||
// Watch changes for the lock file ()
|
// Watch changes for the lock file ()
|
||||||
watcher, err := fsnotify.NewWatcher()
|
watcher, err := fsnotify.NewWatcher()
|
||||||
if err != nil {
|
if err != nil {
|
||||||
cclog.ComponentError(
|
cclog.ComponentErrorf(
|
||||||
m.name,
|
m.name,
|
||||||
fmt.Sprintf("takeMeasurement(): Failed to create a new fsnotify.Watcher: %v", err))
|
"takeMeasurement(): Failed to create a new fsnotify.Watcher: %v", err)
|
||||||
return true, err
|
return true, err
|
||||||
}
|
}
|
||||||
defer func() {
|
defer func() {
|
||||||
if err := watcher.Close(); err != nil {
|
if err := watcher.Close(); err != nil {
|
||||||
cclog.ComponentError(
|
cclog.ComponentErrorf(
|
||||||
m.name,
|
m.name,
|
||||||
fmt.Sprintf("takeMeasurement(): Failed to close fsnotify.Watcher: %v", err))
|
"takeMeasurement(): Failed to close fsnotify.Watcher: %v", err)
|
||||||
}
|
}
|
||||||
}()
|
}()
|
||||||
if len(m.config.LockfilePath) > 0 {
|
if len(m.config.LockfilePath) > 0 {
|
||||||
@@ -597,7 +603,7 @@ func (m *LikwidCollector) calcEventsetMetrics(evset LikwidEventsetConfig, interv
|
|||||||
if tid >= 0 && len(metric.Calc) > 0 {
|
if tid >= 0 && len(metric.Calc) > 0 {
|
||||||
value, err := agg.EvalFloat64Condition(metric.Calc, evset.results[tid])
|
value, err := agg.EvalFloat64Condition(metric.Calc, evset.results[tid])
|
||||||
if err != nil {
|
if err != nil {
|
||||||
cclog.ComponentError(m.name, "Calculation for metric", metric.Name, "failed:", err.Error())
|
cclog.ComponentErrorf(m.name, "Calculation for metric %s failed: %s", metric.Name, err.Error())
|
||||||
value = 0.0
|
value = 0.0
|
||||||
}
|
}
|
||||||
if m.config.InvalidToZero && (math.IsNaN(value) || math.IsInf(value, 0)) {
|
if m.config.InvalidToZero && (math.IsNaN(value) || math.IsInf(value, 0)) {
|
||||||
@@ -762,7 +768,7 @@ func (m *LikwidCollector) calcGlobalMetrics(groups []LikwidEventsetConfig, inter
|
|||||||
// Evaluate the metric
|
// Evaluate the metric
|
||||||
value, err := agg.EvalFloat64Condition(metric.Calc, params)
|
value, err := agg.EvalFloat64Condition(metric.Calc, params)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
cclog.ComponentError(m.name, "Calculation for metric", metric.Name, "failed:", err.Error())
|
cclog.ComponentErrorf(m.name, "Calculation for metric %s failed: %s", metric.Name, err.Error())
|
||||||
value = 0.0
|
value = 0.0
|
||||||
}
|
}
|
||||||
if m.config.InvalidToZero && (math.IsNaN(value) || math.IsInf(value, 0)) {
|
if m.config.InvalidToZero && (math.IsNaN(value) || math.IsInf(value, 0)) {
|
||||||
|
|||||||
@@ -89,9 +89,9 @@ func (m *LoadavgCollector) Read(interval time.Duration, output chan lp.CCMessage
|
|||||||
}
|
}
|
||||||
buffer, err := os.ReadFile(LOADAVGFILE)
|
buffer, err := os.ReadFile(LOADAVGFILE)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
cclog.ComponentError(
|
cclog.ComponentErrorf(
|
||||||
m.name,
|
m.name,
|
||||||
fmt.Sprintf("Read(): Failed to read file '%s': %v", LOADAVGFILE, err))
|
"Read(): Failed to read file '%s': %v", LOADAVGFILE, err)
|
||||||
return
|
return
|
||||||
}
|
}
|
||||||
now := time.Now()
|
now := time.Now()
|
||||||
@@ -101,9 +101,9 @@ func (m *LoadavgCollector) Read(interval time.Duration, output chan lp.CCMessage
|
|||||||
for i, name := range m.load_matches {
|
for i, name := range m.load_matches {
|
||||||
x, err := strconv.ParseFloat(ls[i], 64)
|
x, err := strconv.ParseFloat(ls[i], 64)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
cclog.ComponentError(
|
cclog.ComponentErrorf(
|
||||||
m.name,
|
m.name,
|
||||||
fmt.Sprintf("Read(): Failed to convert '%s' to float64: %v", ls[i], err))
|
"Read(): Failed to convert '%s' to float64: %v", ls[i], err)
|
||||||
continue
|
continue
|
||||||
}
|
}
|
||||||
if m.load_skips[i] {
|
if m.load_skips[i] {
|
||||||
@@ -120,9 +120,9 @@ func (m *LoadavgCollector) Read(interval time.Duration, output chan lp.CCMessage
|
|||||||
for i, name := range m.proc_matches {
|
for i, name := range m.proc_matches {
|
||||||
x, err := strconv.ParseInt(lv[i], 10, 64)
|
x, err := strconv.ParseInt(lv[i], 10, 64)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
cclog.ComponentError(
|
cclog.ComponentErrorf(
|
||||||
m.name,
|
m.name,
|
||||||
fmt.Sprintf("Read(): Failed to convert '%s' to float64: %v", lv[i], err))
|
"Read(): Failed to convert '%s' to float64: %v", lv[i], err)
|
||||||
continue
|
continue
|
||||||
}
|
}
|
||||||
if m.proc_skips[i] {
|
if m.proc_skips[i] {
|
||||||
|
|||||||
@@ -84,7 +84,7 @@ func getStats(filename string) map[string]MemstatStats {
|
|||||||
case 5:
|
case 5:
|
||||||
v, err := strconv.ParseFloat(linefields[3], 64)
|
v, err := strconv.ParseFloat(linefields[3], 64)
|
||||||
if err == nil {
|
if err == nil {
|
||||||
cclog.ComponentDebug("getStats", strings.Trim(linefields[2], ":"), v, linefields[4])
|
cclog.ComponentDebug("MemstatCollector", "getStats %s value %v unit %s", strings.Trim(linefields[2], ":"), v, linefields[4])
|
||||||
stats[strings.Trim(linefields[2], ":")] = MemstatStats{
|
stats[strings.Trim(linefields[2], ":")] = MemstatStats{
|
||||||
value: v,
|
value: v,
|
||||||
unit: linefields[4],
|
unit: linefields[4],
|
||||||
|
|||||||
@@ -222,16 +222,16 @@ func (m *NetstatCollector) Read(interval time.Duration, output chan lp.CCMessage
|
|||||||
|
|
||||||
file, err := os.Open(NETSTATFILE)
|
file, err := os.Open(NETSTATFILE)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
cclog.ComponentError(
|
cclog.ComponentErrorf(
|
||||||
m.name,
|
m.name,
|
||||||
fmt.Sprintf("Read(): Failed to open file '%s': %v", NETSTATFILE, err))
|
"Read(): Failed to open file '%s': %v", NETSTATFILE, err)
|
||||||
return
|
return
|
||||||
}
|
}
|
||||||
defer func() {
|
defer func() {
|
||||||
if err := file.Close(); err != nil {
|
if err := file.Close(); err != nil {
|
||||||
cclog.ComponentError(
|
cclog.ComponentErrorf(
|
||||||
m.name,
|
m.name,
|
||||||
fmt.Sprintf("Read(): Failed to close file '%s': %v", NETSTATFILE, err))
|
"Read(): Failed to close file '%s': %v", NETSTATFILE, err)
|
||||||
}
|
}
|
||||||
}()
|
}()
|
||||||
|
|
||||||
|
|||||||
@@ -125,10 +125,9 @@ func (m *nfsCollector) Read(interval time.Duration, output chan lp.CCMessage) {
|
|||||||
timestamp := time.Now()
|
timestamp := time.Now()
|
||||||
|
|
||||||
if err := m.updateStats(); err != nil {
|
if err := m.updateStats(); err != nil {
|
||||||
cclog.ComponentError(
|
cclog.ComponentErrorf(
|
||||||
m.name,
|
m.name,
|
||||||
fmt.Sprintf("Read(): updateStats() failed: %v", err),
|
"Read(): updateStats() failed: %v", err)
|
||||||
)
|
|
||||||
return
|
return
|
||||||
}
|
}
|
||||||
var prefix string
|
var prefix string
|
||||||
|
|||||||
@@ -117,7 +117,7 @@ func (m *NUMAStatsCollector) Init(config json.RawMessage) error {
|
|||||||
}
|
}
|
||||||
|
|
||||||
// Initialized
|
// Initialized
|
||||||
cclog.ComponentDebug(m.name, "initialized", len(m.topology), "NUMA domains")
|
cclog.ComponentDebugf(m.name, "initialized %d NUMA domains", len(m.topology))
|
||||||
m.init = true
|
m.init = true
|
||||||
return nil
|
return nil
|
||||||
}
|
}
|
||||||
|
|||||||
396
collectors/nvidiaGPMMetric.go
Normal file
396
collectors/nvidiaGPMMetric.go
Normal file
@@ -0,0 +1,396 @@
|
|||||||
|
package collectors
|
||||||
|
|
||||||
|
import (
|
||||||
|
"encoding/json"
|
||||||
|
"errors"
|
||||||
|
"fmt"
|
||||||
|
"slices"
|
||||||
|
"strconv"
|
||||||
|
"strings"
|
||||||
|
"time"
|
||||||
|
|
||||||
|
cclog "github.com/ClusterCockpit/cc-lib/v2/ccLogger"
|
||||||
|
lp "github.com/ClusterCockpit/cc-lib/v2/ccMessage"
|
||||||
|
"github.com/NVIDIA/go-nvml/pkg/nvml"
|
||||||
|
)
|
||||||
|
|
||||||
|
// NvidiaGPMMetricDef describes one GPM metric the collector knows about.
type NvidiaGPMMetricDef struct {
|
||||||
|
// name is the short identifier; Init accepts it in the "metrics" config list.
name string
|
||||||
|
// outname is the name of the emitted metric; Init accepts it in the
// "metrics" config list as well.
outname string
|
||||||
|
// id is the NVML GPM metric ID stored in the measurement request.
id nvml.GpmMetricId
|
||||||
|
// unit is attached to each emitted metric as "unit" meta information.
unit string
|
||||||
|
}
|
||||||
|
|
||||||
|
// NvidiaGPMMetrics is the table of GPM metrics that can be selected through
// the collector's "metrics" configuration. Init compares every configured
// entry against both the name and the outname of each table row.
var NvidiaGPMMetrics []NvidiaGPMMetricDef = []NvidiaGPMMetricDef{
|
||||||
|
{
|
||||||
|
name: "GRAPHICS_UTIL",
|
||||||
|
outname: "nv_gpm_graphics_util",
|
||||||
|
id: nvml.GPM_METRIC_GRAPHICS_UTIL,
|
||||||
|
unit: "%",
|
||||||
|
},
|
||||||
|
{
|
||||||
|
name: "SM_UTIL",
|
||||||
|
outname: "nv_gpm_sm_util",
|
||||||
|
id: nvml.GPM_METRIC_SM_UTIL,
|
||||||
|
unit: "%",
|
||||||
|
},
|
||||||
|
{
|
||||||
|
name: "SM_OCCUPANCY",
|
||||||
|
outname: "nv_gpm_sm_occupancy",
|
||||||
|
id: nvml.GPM_METRIC_SM_OCCUPANCY,
|
||||||
|
unit: "%",
|
||||||
|
},
|
||||||
|
{
|
||||||
|
name: "INTEGER_UTIL",
|
||||||
|
outname: "nv_gpm_integer_util",
|
||||||
|
id: nvml.GPM_METRIC_INTEGER_UTIL,
|
||||||
|
unit: "%",
|
||||||
|
},
|
||||||
|
{
|
||||||
|
name: "ANY_TENSOR_UTIL",
|
||||||
|
outname: "nv_gpm_any_tensor_util",
|
||||||
|
id: nvml.GPM_METRIC_ANY_TENSOR_UTIL,
|
||||||
|
unit: "%",
|
||||||
|
},
|
||||||
|
{
|
||||||
|
name: "DFMA_TENSOR_UTIL",
|
||||||
|
outname: "nv_gpm_dfma_tensor_util",
|
||||||
|
id: nvml.GPM_METRIC_DFMA_TENSOR_UTIL,
|
||||||
|
unit: "%",
|
||||||
|
},
|
||||||
|
{
|
||||||
|
name: "HMMA_TENSOR_UTIL",
|
||||||
|
outname: "nv_gpm_hmma_tensor_util",
|
||||||
|
id: nvml.GPM_METRIC_HMMA_TENSOR_UTIL,
|
||||||
|
unit: "%",
|
||||||
|
},
|
||||||
|
{
|
||||||
|
name: "IMMA_TENSOR_UTIL",
|
||||||
|
outname: "nv_gpm_imma_tensor_util",
|
||||||
|
id: nvml.GPM_METRIC_IMMA_TENSOR_UTIL,
|
||||||
|
unit: "%",
|
||||||
|
},
|
||||||
|
{
|
||||||
|
name: "DRAM_BW_UTIL",
|
||||||
|
outname: "nv_gpm_dram_bw_util",
|
||||||
|
id: nvml.GPM_METRIC_DRAM_BW_UTIL,
|
||||||
|
unit: "%",
|
||||||
|
},
|
||||||
|
{
|
||||||
|
name: "FP64_UTIL",
|
||||||
|
outname: "nv_gpm_fp64_util",
|
||||||
|
id: nvml.GPM_METRIC_FP64_UTIL,
|
||||||
|
unit: "%",
|
||||||
|
},
|
||||||
|
{
|
||||||
|
name: "FP32_UTIL",
|
||||||
|
outname: "nv_gpm_fp32_util",
|
||||||
|
id: nvml.GPM_METRIC_FP32_UTIL,
|
||||||
|
unit: "%",
|
||||||
|
},
|
||||||
|
{
|
||||||
|
name: "FP16_UTIL",
|
||||||
|
outname: "nv_gpm_fp16_util",
|
||||||
|
id: nvml.GPM_METRIC_FP16_UTIL,
|
||||||
|
unit: "%",
|
||||||
|
},
|
||||||
|
}
|
||||||
|
|
||||||
|
// NvidiaGPMCollectorConfig is the JSON configuration of the collector. Init
// decodes it with unknown fields disallowed.
type NvidiaGPMCollectorConfig struct {
|
||||||
|
// Metrics selects entries of NvidiaGPMMetrics by name or outname.
Metrics []string `json:"metrics,omitempty"`
|
||||||
|
// ExcludeDevices lists devices to skip, matched against the device index
// (as decimal string) and against the formatted PCI ID.
ExcludeDevices []string `json:"exclude_devices,omitempty"`
|
||||||
|
// AddPciInfoTag adds the PCI ID as "pci_identifier" tag unless the PCI ID
// is already used as 'type-id'.
AddPciInfoTag bool `json:"add_pci_info_tag,omitempty"`
|
||||||
|
// UsePciInfoAsTypeId uses the PCI ID instead of the device index as 'type-id'.
UsePciInfoAsTypeId bool `json:"use_pci_info_as_type_id,omitempty"`
|
||||||
|
// AddUuidMeta adds the device UUID as "uuid" meta information.
AddUuidMeta bool `json:"add_uuid_meta,omitempty"`
|
||||||
|
// AddBoardNumberMeta adds the board part number as "board_number" meta information.
AddBoardNumberMeta bool `json:"add_board_number_meta,omitempty"`
|
||||||
|
// AddSerialMeta adds the device serial as "serial" meta information.
AddSerialMeta bool `json:"add_serial_meta,omitempty"`
|
||||||
|
// NOTE(review): the three MIG options below are decoded from the
// configuration, but no Init/Read code in this file appears to evaluate
// them - confirm whether MIG handling is still to be implemented.
ProcessMigDevices bool `json:"process_mig_devices,omitempty"`
|
||||||
|
UseUuidForMigDevices bool `json:"use_uuid_for_mig_device,omitempty"`
|
||||||
|
UseSliceForMigDevices bool `json:"use_slice_for_mig_device,omitempty"`
|
||||||
|
}
|
||||||
|
|
||||||
|
// NvidiaGPMCollectorDevice holds the per-GPU state that Init creates and that
// Read and Close use.
type NvidiaGPMCollectorDevice struct {
|
||||||
|
// device is the NVML handle obtained by index in Init.
device nvml.Device
|
||||||
|
// tags are passed to every metric created for this device.
tags map[string]string
|
||||||
|
// meta is the meta information passed to every metric created for this device.
meta map[string]string
|
||||||
|
// startTime is set in Read right before the first sample is taken.
startTime time.Time
|
||||||
|
// endTime is set in Read right before the second sample is taken.
endTime time.Time
|
||||||
|
// measurement carries both sample handles, the requested metric IDs and,
// after evaluation, the metric values.
measurement nvml.GpmMetricsGetType
|
||||||
|
// metricsLookup maps an index into measurement.Metrics to its definition.
metricsLookup map[int]NvidiaGPMMetricDef
|
||||||
|
}
|
||||||
|
|
||||||
|
// NvidiaGPMCollector collects NVML GPM metrics for the GPUs selected in Init.
type NvidiaGPMCollector struct {
|
||||||
|
metricCollector
|
||||||
|
|
||||||
|
// config is the decoded JSON configuration.
config NvidiaGPMCollectorConfig
|
||||||
|
// gpus contains one entry per device that passed all checks in Init.
gpus []NvidiaGPMCollectorDevice
|
||||||
|
// num_gpus is set to len(gpus) at the end of Init.
num_gpus int
|
||||||
|
}
|
||||||
|
|
||||||
|
func (m *NvidiaGPMCollector) Init(config json.RawMessage) error {
|
||||||
|
var err error = nil
|
||||||
|
m.name = "NvidiaGPMCollector"
|
||||||
|
m.parallel = true
|
||||||
|
if err := m.setup(); err != nil {
|
||||||
|
return fmt.Errorf("%s Init(): setup() call failed: %w", m.name, err)
|
||||||
|
}
|
||||||
|
if len(config) > 0 {
|
||||||
|
d := json.NewDecoder(strings.NewReader(string(config)))
|
||||||
|
d.DisallowUnknownFields()
|
||||||
|
if err = d.Decode(&m.config); err != nil {
|
||||||
|
return fmt.Errorf("%s Init(): Error decoding JSON config: %w", m.name, err)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
m.meta = map[string]string{
|
||||||
|
"source": m.name,
|
||||||
|
"group": "NvidiaGPM",
|
||||||
|
}
|
||||||
|
|
||||||
|
// Initialize NVIDIA Management Library (NVML)
|
||||||
|
ret := nvml.Init()
|
||||||
|
|
||||||
|
// Error: NVML library not found
|
||||||
|
// (nvml.ErrorString can not be used in this case)
|
||||||
|
if ret == nvml.ERROR_LIBRARY_NOT_FOUND {
|
||||||
|
return fmt.Errorf("%s Init(): NVML library not found", m.name)
|
||||||
|
}
|
||||||
|
if ret != nvml.SUCCESS {
|
||||||
|
err = errors.New(nvml.ErrorString(ret))
|
||||||
|
return fmt.Errorf("%s Init(): Unable to initialize NVML: %w", m.name, err)
|
||||||
|
}
|
||||||
|
|
||||||
|
// Number of NVIDIA GPUs
|
||||||
|
num_gpus, ret := nvml.DeviceGetCount()
|
||||||
|
if ret != nvml.SUCCESS {
|
||||||
|
err = errors.New(nvml.ErrorString(ret))
|
||||||
|
return fmt.Errorf("%s Init(): Unable to get device count: %w", m.name, err)
|
||||||
|
}
|
||||||
|
|
||||||
|
// For all GPUs
|
||||||
|
m.gpus = make([]NvidiaGPMCollectorDevice, 0, num_gpus)
|
||||||
|
for i := range num_gpus {
|
||||||
|
|
||||||
|
// Skip excluded devices by ID
|
||||||
|
str_i := strconv.Itoa(i)
|
||||||
|
if slices.Contains(m.config.ExcludeDevices, str_i) {
|
||||||
|
cclog.ComponentDebugf(m.name, "Skipping excluded device %s", str_i)
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
|
||||||
|
// Get device handle
|
||||||
|
device, ret := nvml.DeviceGetHandleByIndex(i)
|
||||||
|
if ret != nvml.SUCCESS {
|
||||||
|
err = errors.New(nvml.ErrorString(ret))
|
||||||
|
cclog.ComponentErrorf(m.name, "Unable to get device at index %d: %s", i, err.Error())
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
|
||||||
|
supportInfo, ret := nvml.GpmQueryDeviceSupport(device)
|
||||||
|
if ret != nvml.SUCCESS {
|
||||||
|
err = errors.New(nvml.ErrorString(ret))
|
||||||
|
cclog.ComponentErrorf(m.name, "Unable to query GPM support for device at index %d: %s", i, err.Error())
|
||||||
|
continue
|
||||||
|
} else {
|
||||||
|
if supportInfo.IsSupportedDevice == uint32(nvml.FEATURE_DISABLED) {
|
||||||
|
cclog.ComponentErrorf(m.name, "Device at index %d does not support GPM metrics", i)
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
stream, ret := nvml.GpmQueryIfStreamingEnabled(device)
|
||||||
|
if ret != nvml.SUCCESS {
|
||||||
|
err = errors.New(nvml.ErrorString(ret))
|
||||||
|
cclog.ComponentErrorf(m.name, "Unable to query GPM streaming for device at index %d: %s", i, err.Error())
|
||||||
|
continue
|
||||||
|
} else {
|
||||||
|
if stream == uint32(nvml.FEATURE_DISABLED) {
|
||||||
|
ret = nvml.GpmSetStreamingEnabled(device, uint32(nvml.FEATURE_ENABLED))
|
||||||
|
if ret != nvml.SUCCESS {
|
||||||
|
err = errors.New(nvml.ErrorString(ret))
|
||||||
|
cclog.ComponentErrorf(m.name, "Unable to set streaming mode for device at index %d: %s", i, err.Error())
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Get device's PCI info
|
||||||
|
pciInfo, ret := nvml.DeviceGetPciInfo(device)
|
||||||
|
if ret != nvml.SUCCESS {
|
||||||
|
err = errors.New(nvml.ErrorString(ret))
|
||||||
|
cclog.ComponentErrorf(m.name, "Unable to get PCI info for device at index %d: %s", i, err.Error())
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
// Create PCI ID in the common format used by the NVML.
|
||||||
|
pci_id := fmt.Sprintf(
|
||||||
|
nvml.DEVICE_PCI_BUS_ID_FMT,
|
||||||
|
pciInfo.Domain,
|
||||||
|
pciInfo.Bus,
|
||||||
|
pciInfo.Device)
|
||||||
|
|
||||||
|
// Skip excluded devices specified by PCI ID
|
||||||
|
if slices.Contains(m.config.ExcludeDevices, pci_id) {
|
||||||
|
cclog.ComponentDebugf(m.name, "Skipping excluded device %s", pci_id)
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
ss, nvmlErr := nvml.GpmSampleAlloc()
|
||||||
|
if nvmlErr != nvml.SUCCESS {
|
||||||
|
err = errors.New(nvml.ErrorString(ret))
|
||||||
|
cclog.ComponentErrorf(m.name, "Failed to allocate GPM sample for device %d: %s", i, err.Error())
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
es, nvmlErr := nvml.GpmSampleAlloc()
|
||||||
|
if nvmlErr != nvml.SUCCESS {
|
||||||
|
err = errors.New(nvml.ErrorString(ret))
|
||||||
|
cclog.ComponentErrorf(m.name, "Failed to allocate GPM sample for device %d: %s", i, err.Error())
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
|
||||||
|
// Select which value to use as 'type-id'.
|
||||||
|
// The PCI ID is commonly required in SLURM environments because the
|
||||||
|
// numberic IDs used by SLURM and the ones used by NVML might differ
|
||||||
|
// depending on the job type. The PCI ID is more reliable but is commonly
|
||||||
|
// not recorded for a job, so it must be added manually in prologue or epilogue
|
||||||
|
// e.g. to the comment field
|
||||||
|
tid := str_i
|
||||||
|
if m.config.UsePciInfoAsTypeId {
|
||||||
|
tid = pci_id
|
||||||
|
}
|
||||||
|
|
||||||
|
// Now we got all infos together, populate the device list
|
||||||
|
g := NvidiaGPMCollectorDevice{}
|
||||||
|
|
||||||
|
// Add device handle
|
||||||
|
g.device = device
|
||||||
|
|
||||||
|
// Add tags
|
||||||
|
g.tags = map[string]string{
|
||||||
|
"type": "accelerator",
|
||||||
|
"type-id": tid,
|
||||||
|
}
|
||||||
|
|
||||||
|
// Add PCI info as tag if not already used as 'type-id'
|
||||||
|
if m.config.AddPciInfoTag && !m.config.UsePciInfoAsTypeId {
|
||||||
|
g.tags["pci_identifier"] = pci_id
|
||||||
|
}
|
||||||
|
|
||||||
|
g.meta = map[string]string{
|
||||||
|
"source": m.name,
|
||||||
|
"group": "Nvidia",
|
||||||
|
}
|
||||||
|
|
||||||
|
if m.config.AddBoardNumberMeta {
|
||||||
|
board, ret := nvml.DeviceGetBoardPartNumber(device)
|
||||||
|
if ret != nvml.SUCCESS {
|
||||||
|
err = errors.New(nvml.ErrorString(ret))
|
||||||
|
cclog.ComponentError(m.name, "Unable to get boart part number for device at index", i, ":", err.Error())
|
||||||
|
} else {
|
||||||
|
g.meta["board_number"] = board
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if m.config.AddSerialMeta {
|
||||||
|
serial, ret := nvml.DeviceGetSerial(device)
|
||||||
|
if ret != nvml.SUCCESS {
|
||||||
|
err = errors.New(nvml.ErrorString(ret))
|
||||||
|
cclog.ComponentError(m.name, "Unable to get serial number for device at index", i, ":", err.Error())
|
||||||
|
} else {
|
||||||
|
g.meta["serial"] = serial
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if m.config.AddUuidMeta {
|
||||||
|
uuid, ret := nvml.DeviceGetUUID(device)
|
||||||
|
if ret != nvml.SUCCESS {
|
||||||
|
err = errors.New(nvml.ErrorString(ret))
|
||||||
|
cclog.ComponentError(m.name, "Unable to get UUID for device at index", i, ":", err.Error())
|
||||||
|
} else {
|
||||||
|
g.meta["uuid"] = uuid
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
g.measurement.Sample1 = ss
|
||||||
|
g.measurement.Sample2 = es
|
||||||
|
g.measurement.Version = nvml.GPM_METRICS_GET_VERSION
|
||||||
|
g.metricsLookup = make(map[int]NvidiaGPMMetricDef)
|
||||||
|
metIdx := 0
|
||||||
|
for _, inmetric := range m.config.Metrics {
|
||||||
|
for _, defmetric := range NvidiaGPMMetrics {
|
||||||
|
if inmetric == defmetric.outname || inmetric == defmetric.name {
|
||||||
|
g.measurement.Metrics[metIdx] = nvml.GpmMetric{
|
||||||
|
MetricId: uint32(defmetric.id),
|
||||||
|
}
|
||||||
|
g.metricsLookup[metIdx] = defmetric
|
||||||
|
metIdx += 1
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
g.measurement.NumMetrics = uint32(metIdx)
|
||||||
|
m.gpus = append(m.gpus, g)
|
||||||
|
}
|
||||||
|
cclog.ComponentDebugf(m.name, "Found %d Nvidia GPUs with GPM support", len(m.gpus))
|
||||||
|
m.num_gpus = len(m.gpus)
|
||||||
|
m.init = true
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
|
||||||
|
func (m *NvidiaGPMCollector) Read(interval time.Duration, output chan lp.CCMessage) {
|
||||||
|
var err error
|
||||||
|
if !m.init {
|
||||||
|
return
|
||||||
|
}
|
||||||
|
for i, gpu := range m.gpus {
|
||||||
|
gpu.startTime = time.Now()
|
||||||
|
nvmlErr := gpu.measurement.Sample1.Get(gpu.device)
|
||||||
|
if nvmlErr != nvml.SUCCESS {
|
||||||
|
err = errors.New(nvml.ErrorString(nvmlErr))
|
||||||
|
cclog.ComponentError(m.name, "Unable to get start GPM sample for device at index", i, ":", err.Error())
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
}
|
||||||
|
time.Sleep(interval)
|
||||||
|
|
||||||
|
for i, gpu := range m.gpus {
|
||||||
|
gpu.endTime = time.Now()
|
||||||
|
nvmlErr := gpu.measurement.Sample2.Get(gpu.device)
|
||||||
|
if nvmlErr != nvml.SUCCESS {
|
||||||
|
err = errors.New(nvml.ErrorString(nvmlErr))
|
||||||
|
cclog.ComponentError(m.name, "Unable to get stop GPM sample for device at index", i, ":", err.Error())
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
for i, gpu := range m.gpus {
|
||||||
|
nvmlErr := nvml.GpmMetricsGet(&gpu.measurement)
|
||||||
|
if nvmlErr != nvml.SUCCESS {
|
||||||
|
err = errors.New(nvml.ErrorString(nvmlErr))
|
||||||
|
cclog.ComponentError(m.name, "Unable to get evaluate GPM sample for device at index", i, ":", err.Error())
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
for idx, metricDef := range gpu.metricsLookup {
|
||||||
|
y, err := lp.NewMetric(metricDef.outname, gpu.tags, gpu.meta, gpu.measurement.Metrics[idx].Value, time.Now())
|
||||||
|
if err == nil {
|
||||||
|
y.AddMeta("unit", metricDef.unit)
|
||||||
|
output <- y
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
func (m *NvidiaGPMCollector) Close() {
|
||||||
|
if m.init {
|
||||||
|
for i, gpu := range m.gpus {
|
||||||
|
ret := gpu.measurement.Sample1.Free()
|
||||||
|
if ret != nvml.SUCCESS {
|
||||||
|
err := errors.New(nvml.ErrorString(ret))
|
||||||
|
cclog.ComponentErrorf(m.name, "Unable to free start sample for device at index %d: %s", i, err.Error())
|
||||||
|
}
|
||||||
|
ret = gpu.measurement.Sample2.Free()
|
||||||
|
if ret != nvml.SUCCESS {
|
||||||
|
err := errors.New(nvml.ErrorString(ret))
|
||||||
|
cclog.ComponentErrorf(m.name, "Unable to free stop sample for device at index %d: %s", i, err.Error())
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if ret := nvml.Shutdown(); ret != nvml.SUCCESS {
|
||||||
|
cclog.ComponentError(m.name, "nvml.Shutdown() not successful")
|
||||||
|
}
|
||||||
|
m.init = false
|
||||||
|
}
|
||||||
|
}
|
||||||
54
collectors/nvidiaGPMMetric.md
Normal file
54
collectors/nvidiaGPMMetric.md
Normal file
@@ -0,0 +1,54 @@
|
|||||||
|
<!--
|
||||||
|
---
|
||||||
|
title: "Nvidia NVML GPM metric collector"
|
||||||
|
description: Collect metrics for Nvidia GPUs using the NVML GPM interface
|
||||||
|
categories: [cc-metric-collector]
|
||||||
|
tags: ['Admin']
|
||||||
|
weight: 2
|
||||||
|
hugo_path: docs/reference/cc-metric-collector/collectors/nvidiaGPM.md
|
||||||
|
---
|
||||||
|
-->
|
||||||
|
|
||||||
|
## `nvidiaGPM` collector
|
||||||
|
|
||||||
|
```json
|
||||||
|
"nvidia_gpm": {
|
||||||
|
"metrics": [
|
||||||
|
"nv_gpm_sm_util",
|
||||||
|
"nv_gpm_dram_bw_util"
|
||||||
|
],
|
||||||
|
"exclude_devices": [
|
||||||
|
"0", "1", "00000000:FF:01.0"
|
||||||
|
],
|
||||||
|
|
||||||
|
"process_mig_devices": false,
|
||||||
|
"use_pci_info_as_type_id": true,
|
||||||
|
"add_pci_info_tag": false,
|
||||||
|
"add_uuid_meta": false,
|
||||||
|
"add_board_number_meta": false,
|
||||||
|
"add_serial_meta": false,
|
||||||
|
"use_uuid_for_mig_device": false,
|
||||||
|
"use_slice_for_mig_device": false
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
The `nvidia_gpm` collector can be configured to leave out specific devices with the `exclude_devices` option. It takes IDs as supplied to the NVML with `nvmlDeviceGetHandleByIndex()` or the PCI address in NVML format (`%08X:%02X:%02X.0`). Commonly only the physical GPUs are monitored. If MIG devices should be analyzed as well, set `process_mig_devices` (adds `stype=mig,stype-id=<mig_index>`). With the options `use_uuid_for_mig_device` and `use_slice_for_mig_device`, the `<mig_index>` can be replaced with the UUID (e.g. `MIG-6a9f7cc8-6d5b-5ce0-92de-750edc4d8849`) or the MIG slice name (e.g. `1g.5gb`).
|
||||||
|
|
||||||
|
The metrics sent by the `nvidia_gpm` collector use `accelerator` as `type` tag. For the `type-id`, it uses the device handle index by default. With the `use_pci_info_as_type_id` option, the PCI ID is used instead. If both values should be added as tags, activate the `add_pci_info_tag` option. It uses the device handle index as `type-id` and adds the PCI ID as separate `pci_identifier` tag.
|
||||||
|
|
||||||
|
Optionally, it is possible to add the UUID, the board part number and the serial to the meta information. They are not sent to the sinks (if not configured otherwise).
|
||||||
|
|
||||||
|
|
||||||
|
Available Metrics:
|
||||||
|
* `nv_gpm_graphics_util`
|
||||||
|
* `nv_gpm_sm_util`
|
||||||
|
* `nv_gpm_sm_occupancy`
|
||||||
|
* `nv_gpm_integer_util`
|
||||||
|
* `nv_gpm_any_tensor_util`
|
||||||
|
* `nv_gpm_dfma_tensor_util`
|
||||||
|
* `nv_gpm_hmma_tensor_util`
|
||||||
|
* `nv_gpm_imma_tensor_util`
|
||||||
|
* `nv_gpm_dram_bw_util`
|
||||||
|
* `nv_gpm_fp64_util`
|
||||||
|
* `nv_gpm_fp32_util`
|
||||||
|
* `nv_gpm_fp16_util`
|
||||||
@@ -113,7 +113,7 @@ func (m *NvidiaCollector) Init(config json.RawMessage) error {
|
|||||||
// Skip excluded devices by ID
|
// Skip excluded devices by ID
|
||||||
str_i := strconv.Itoa(i)
|
str_i := strconv.Itoa(i)
|
||||||
if slices.Contains(m.config.ExcludeDevices, str_i) {
|
if slices.Contains(m.config.ExcludeDevices, str_i) {
|
||||||
cclog.ComponentDebug(m.name, "Skipping excluded device", str_i)
|
cclog.ComponentDebugf(m.name, "Skipping excluded device %s", str_i)
|
||||||
continue
|
continue
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -121,7 +121,7 @@ func (m *NvidiaCollector) Init(config json.RawMessage) error {
|
|||||||
device, ret := nvml.DeviceGetHandleByIndex(i)
|
device, ret := nvml.DeviceGetHandleByIndex(i)
|
||||||
if ret != nvml.SUCCESS {
|
if ret != nvml.SUCCESS {
|
||||||
err = errors.New(nvml.ErrorString(ret))
|
err = errors.New(nvml.ErrorString(ret))
|
||||||
cclog.ComponentError(m.name, "Unable to get device at index", i, ":", err.Error())
|
cclog.ComponentErrorf(m.name, "Unable to get device at index %d: %s", i, err.Error())
|
||||||
continue
|
continue
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -129,7 +129,7 @@ func (m *NvidiaCollector) Init(config json.RawMessage) error {
|
|||||||
pciInfo, ret := nvml.DeviceGetPciInfo(device)
|
pciInfo, ret := nvml.DeviceGetPciInfo(device)
|
||||||
if ret != nvml.SUCCESS {
|
if ret != nvml.SUCCESS {
|
||||||
err = errors.New(nvml.ErrorString(ret))
|
err = errors.New(nvml.ErrorString(ret))
|
||||||
cclog.ComponentError(m.name, "Unable to get PCI info for device at index", i, ":", err.Error())
|
cclog.ComponentErrorf(m.name, "Unable to get PCI info for device at index %d: %s", i, err.Error())
|
||||||
continue
|
continue
|
||||||
}
|
}
|
||||||
// Create PCI ID in the common format used by the NVML.
|
// Create PCI ID in the common format used by the NVML.
|
||||||
@@ -141,7 +141,7 @@ func (m *NvidiaCollector) Init(config json.RawMessage) error {
|
|||||||
|
|
||||||
// Skip excluded devices specified by PCI ID
|
// Skip excluded devices specified by PCI ID
|
||||||
if slices.Contains(m.config.ExcludeDevices, pci_id) {
|
if slices.Contains(m.config.ExcludeDevices, pci_id) {
|
||||||
cclog.ComponentDebug(m.name, "Skipping excluded device", pci_id)
|
cclog.ComponentDebugf(m.name, "Skipping excluded device %s", pci_id)
|
||||||
continue
|
continue
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -183,7 +183,7 @@ func (m *NvidiaCollector) Init(config json.RawMessage) error {
|
|||||||
if m.config.AddBoardNumberMeta {
|
if m.config.AddBoardNumberMeta {
|
||||||
board, ret := nvml.DeviceGetBoardPartNumber(device)
|
board, ret := nvml.DeviceGetBoardPartNumber(device)
|
||||||
if ret != nvml.SUCCESS {
|
if ret != nvml.SUCCESS {
|
||||||
cclog.ComponentError(m.name, "Unable to get boart part number for device at index", i, ":", err.Error())
|
cclog.ComponentErrorf(m.name, "Unable to get boart part number for device at index %d: %s", i, err.Error())
|
||||||
} else {
|
} else {
|
||||||
g.meta["board_number"] = board
|
g.meta["board_number"] = board
|
||||||
}
|
}
|
||||||
@@ -191,7 +191,7 @@ func (m *NvidiaCollector) Init(config json.RawMessage) error {
|
|||||||
if m.config.AddSerialMeta {
|
if m.config.AddSerialMeta {
|
||||||
serial, ret := nvml.DeviceGetSerial(device)
|
serial, ret := nvml.DeviceGetSerial(device)
|
||||||
if ret != nvml.SUCCESS {
|
if ret != nvml.SUCCESS {
|
||||||
cclog.ComponentError(m.name, "Unable to get serial number for device at index", i, ":", err.Error())
|
cclog.ComponentErrorf(m.name, "Unable to get serial number for device at index %d: %s", i, err.Error())
|
||||||
} else {
|
} else {
|
||||||
g.meta["serial"] = serial
|
g.meta["serial"] = serial
|
||||||
}
|
}
|
||||||
@@ -199,7 +199,7 @@ func (m *NvidiaCollector) Init(config json.RawMessage) error {
|
|||||||
if m.config.AddUuidMeta {
|
if m.config.AddUuidMeta {
|
||||||
uuid, ret := nvml.DeviceGetUUID(device)
|
uuid, ret := nvml.DeviceGetUUID(device)
|
||||||
if ret != nvml.SUCCESS {
|
if ret != nvml.SUCCESS {
|
||||||
cclog.ComponentError(m.name, "Unable to get UUID for device at index", i, ":", err.Error())
|
cclog.ComponentErrorf(m.name, "Unable to get UUID for device at index %d: %s", i, err.Error())
|
||||||
} else {
|
} else {
|
||||||
g.meta["uuid"] = uuid
|
g.meta["uuid"] = uuid
|
||||||
}
|
}
|
||||||
@@ -1128,97 +1128,97 @@ func (m *NvidiaCollector) Read(interval time.Duration, output chan lp.CCMessage)
|
|||||||
}
|
}
|
||||||
err = readMemoryInfo(device, output)
|
err = readMemoryInfo(device, output)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
cclog.ComponentDebug(m.name, "readMemoryInfo for device", name, "failed")
|
cclog.ComponentDebugf(m.name, "readMemoryInfo for device %s failed", name)
|
||||||
}
|
}
|
||||||
|
|
||||||
err = readUtilization(device, output)
|
err = readUtilization(device, output)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
cclog.ComponentDebug(m.name, "readUtilization for device", name, "failed")
|
cclog.ComponentDebugf(m.name, "readUtilization for device %s failed", name)
|
||||||
}
|
}
|
||||||
|
|
||||||
err = readTemp(device, output)
|
err = readTemp(device, output)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
cclog.ComponentDebug(m.name, "readTemp for device", name, "failed")
|
cclog.ComponentDebugf(m.name, "readTemp for device %s failed", name)
|
||||||
}
|
}
|
||||||
|
|
||||||
err = readFan(device, output)
|
err = readFan(device, output)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
cclog.ComponentDebug(m.name, "readFan for device", name, "failed")
|
cclog.ComponentDebugf(m.name, "readFan for device %s failed", name)
|
||||||
}
|
}
|
||||||
|
|
||||||
err = readEccMode(device, output)
|
err = readEccMode(device, output)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
cclog.ComponentDebug(m.name, "readEccMode for device", name, "failed")
|
cclog.ComponentDebugf(m.name, "readEccMode for device %s failed", name)
|
||||||
}
|
}
|
||||||
|
|
||||||
err = readPerfState(device, output)
|
err = readPerfState(device, output)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
cclog.ComponentDebug(m.name, "readPerfState for device", name, "failed")
|
cclog.ComponentDebugf(m.name, "readPerfState for device %s failed", name)
|
||||||
}
|
}
|
||||||
|
|
||||||
err = readPowerUsage(device, output)
|
err = readPowerUsage(device, output)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
cclog.ComponentDebug(m.name, "readPowerUsage for device", name, "failed")
|
cclog.ComponentDebugf(m.name, "readPowerUsage for device %s failed", name)
|
||||||
}
|
}
|
||||||
|
|
||||||
err = readEnergyConsumption(device, output)
|
err = readEnergyConsumption(device, output)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
cclog.ComponentDebug(m.name, "readEnergyConsumption for device", name, "failed")
|
cclog.ComponentDebugf(m.name, "readEnergyConsumption for device %s failed", name)
|
||||||
}
|
}
|
||||||
|
|
||||||
err = readClocks(device, output)
|
err = readClocks(device, output)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
cclog.ComponentDebug(m.name, "readClocks for device", name, "failed")
|
cclog.ComponentDebugf(m.name, "readClocks for device %s failed", name)
|
||||||
}
|
}
|
||||||
|
|
||||||
err = readMaxClocks(device, output)
|
err = readMaxClocks(device, output)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
cclog.ComponentDebug(m.name, "readMaxClocks for device", name, "failed")
|
cclog.ComponentDebugf(m.name, "readMaxClocks for device %s failed", name)
|
||||||
}
|
}
|
||||||
|
|
||||||
err = readEccErrors(device, output)
|
err = readEccErrors(device, output)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
cclog.ComponentDebug(m.name, "readEccErrors for device", name, "failed")
|
cclog.ComponentDebugf(m.name, "readEccErrors for device %s failed", name)
|
||||||
}
|
}
|
||||||
|
|
||||||
err = readPowerLimit(device, output)
|
err = readPowerLimit(device, output)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
cclog.ComponentDebug(m.name, "readPowerLimit for device", name, "failed")
|
cclog.ComponentDebugf(m.name, "readPowerLimit for device %s failed", name)
|
||||||
}
|
}
|
||||||
|
|
||||||
err = readEncUtilization(device, output)
|
err = readEncUtilization(device, output)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
cclog.ComponentDebug(m.name, "readEncUtilization for device", name, "failed")
|
cclog.ComponentDebugf(m.name, "readEncUtilization for device %s failed", name)
|
||||||
}
|
}
|
||||||
|
|
||||||
err = readDecUtilization(device, output)
|
err = readDecUtilization(device, output)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
cclog.ComponentDebug(m.name, "readDecUtilization for device", name, "failed")
|
cclog.ComponentDebugf(m.name, "readDecUtilization for device %s failed", name)
|
||||||
}
|
}
|
||||||
|
|
||||||
err = readRemappedRows(device, output)
|
err = readRemappedRows(device, output)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
cclog.ComponentDebug(m.name, "readRemappedRows for device", name, "failed")
|
cclog.ComponentDebugf(m.name, "readRemappedRows for device %s failed", name)
|
||||||
}
|
}
|
||||||
|
|
||||||
err = readBarMemoryInfo(device, output)
|
err = readBarMemoryInfo(device, output)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
cclog.ComponentDebug(m.name, "readBarMemoryInfo for device", name, "failed")
|
cclog.ComponentDebugf(m.name, "readBarMemoryInfo for device %s failed", name)
|
||||||
}
|
}
|
||||||
|
|
||||||
err = readProcessCounts(device, output)
|
err = readProcessCounts(device, output)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
cclog.ComponentDebug(m.name, "readProcessCounts for device", name, "failed")
|
cclog.ComponentDebugf(m.name, "readProcessCounts for device %s failed", name)
|
||||||
}
|
}
|
||||||
|
|
||||||
err = readViolationStats(device, output)
|
err = readViolationStats(device, output)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
cclog.ComponentDebug(m.name, "readViolationStats for device", name, "failed")
|
cclog.ComponentDebugf(m.name, "readViolationStats for device %s failed", name)
|
||||||
}
|
}
|
||||||
|
|
||||||
err = readNVLinkStats(device, output)
|
err = readNVLinkStats(device, output)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
cclog.ComponentDebug(m.name, "readNVLinkStats for device", name, "failed")
|
cclog.ComponentDebugf(m.name, "readNVLinkStats for device %s failed", name)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -1244,7 +1244,7 @@ func (m *NvidiaCollector) Read(interval time.Duration, output chan lp.CCMessage)
|
|||||||
if maxMig == 0 {
|
if maxMig == 0 {
|
||||||
continue
|
continue
|
||||||
}
|
}
|
||||||
cclog.ComponentDebug(m.name, "Reading MIG devices for GPU", i)
|
cclog.ComponentDebugf(m.name, "Reading MIG devices for GPU %d", i)
|
||||||
|
|
||||||
for j := range maxMig {
|
for j := range maxMig {
|
||||||
mdev, ret := nvml.DeviceGetMigDeviceHandleByIndex(m.gpus[i].device, j)
|
mdev, ret := nvml.DeviceGetMigDeviceHandleByIndex(m.gpus[i].device, j)
|
||||||
@@ -1268,7 +1268,7 @@ func (m *NvidiaCollector) Read(interval time.Duration, output chan lp.CCMessage)
|
|||||||
if m.config.UseUuidForMigDevices {
|
if m.config.UseUuidForMigDevices {
|
||||||
uuid, ret := nvml.DeviceGetUUID(mdev)
|
uuid, ret := nvml.DeviceGetUUID(mdev)
|
||||||
if ret != nvml.SUCCESS {
|
if ret != nvml.SUCCESS {
|
||||||
cclog.ComponentError(m.name, "Unable to get UUID for mig device at index", j, ":", err.Error())
|
cclog.ComponentErrorf(m.name, "Unable to get UUID for mig device at index %d: %s", j, err.Error())
|
||||||
} else {
|
} else {
|
||||||
migDevice.tags["stype-id"] = uuid
|
migDevice.tags["stype-id"] = uuid
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -208,11 +208,10 @@ func (m *RAPLCollector) Init(config json.RawMessage) error {
|
|||||||
}
|
}
|
||||||
|
|
||||||
// Initialized
|
// Initialized
|
||||||
cclog.ComponentDebug(
|
cclog.ComponentDebugf(
|
||||||
m.name,
|
m.name,
|
||||||
"initialized",
|
"initialized %d zones with running average power limit (RAPL) monitoring attributes",
|
||||||
len(m.RAPLZoneInfo),
|
len(m.RAPLZoneInfo))
|
||||||
"zones with running average power limit (RAPL) monitoring attributes")
|
|
||||||
m.init = true
|
m.init = true
|
||||||
|
|
||||||
return err
|
return err
|
||||||
|
|||||||
@@ -124,7 +124,7 @@ func (m *RocmSmiCollector) Init(config json.RawMessage) error {
|
|||||||
if m.config.AddSerialMeta {
|
if m.config.AddSerialMeta {
|
||||||
serial, ret := rocm_smi.DeviceGetSerialNumber(device)
|
serial, ret := rocm_smi.DeviceGetSerialNumber(device)
|
||||||
if ret != rocm_smi.STATUS_SUCCESS {
|
if ret != rocm_smi.STATUS_SUCCESS {
|
||||||
cclog.ComponentError(m.name, "Unable to get serial number for device at index", i, ":", rocm_smi.StatusStringNoError(ret))
|
cclog.ComponentErrorf(m.name, "Unable to get serial number for device at index %d: %s", i, rocm_smi.StatusStringNoError(ret))
|
||||||
} else {
|
} else {
|
||||||
dev.meta["serial"] = serial
|
dev.meta["serial"] = serial
|
||||||
}
|
}
|
||||||
@@ -152,7 +152,7 @@ func (m *RocmSmiCollector) Read(interval time.Duration, output chan lp.CCMessage
|
|||||||
for _, dev := range m.devices {
|
for _, dev := range m.devices {
|
||||||
metrics, ret := rocm_smi.DeviceGetMetrics(dev.device)
|
metrics, ret := rocm_smi.DeviceGetMetrics(dev.device)
|
||||||
if ret != rocm_smi.STATUS_SUCCESS {
|
if ret != rocm_smi.STATUS_SUCCESS {
|
||||||
cclog.ComponentError(m.name, "Unable to get metrics for device at index", dev.index, ":", rocm_smi.StatusStringNoError(ret))
|
cclog.ComponentErrorf(m.name, "Unable to get metrics for device at index %d: %s", dev.index, rocm_smi.StatusStringNoError(ret))
|
||||||
continue
|
continue
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
@@ -147,15 +147,15 @@ func (m *SchedstatCollector) Read(interval time.Duration, output chan lp.CCMessa
|
|||||||
|
|
||||||
file, err := os.Open(SCHEDSTATFILE)
|
file, err := os.Open(SCHEDSTATFILE)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
cclog.ComponentError(
|
cclog.ComponentErrorf(
|
||||||
m.name,
|
m.name,
|
||||||
fmt.Sprintf("Read(): Failed to open file '%s': %v", SCHEDSTATFILE, err))
|
"Read(): Failed to open file '%s': %v", SCHEDSTATFILE, err)
|
||||||
}
|
}
|
||||||
defer func() {
|
defer func() {
|
||||||
if err := file.Close(); err != nil {
|
if err := file.Close(); err != nil {
|
||||||
cclog.ComponentError(
|
cclog.ComponentErrorf(
|
||||||
m.name,
|
m.name,
|
||||||
fmt.Sprintf("Read(): Failed to close file '%s': %v", SCHEDSTATFILE, err))
|
"Read(): Failed to close file '%s': %v", SCHEDSTATFILE, err)
|
||||||
}
|
}
|
||||||
}()
|
}()
|
||||||
|
|
||||||
|
|||||||
@@ -240,7 +240,7 @@ func (m *SlurmCgroupCollector) Read(interval time.Duration, output chan lp.CCMes
|
|||||||
globPattern := filepath.Join(m.cgroupBase, "job_*")
|
globPattern := filepath.Join(m.cgroupBase, "job_*")
|
||||||
jobDirs, err := filepath.Glob(globPattern)
|
jobDirs, err := filepath.Glob(globPattern)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
cclog.ComponentError(m.name, "Error globbing job directories:", err.Error())
|
cclog.ComponentErrorf(m.name, "Error globbing job directories: %s", err.Error())
|
||||||
return
|
return
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -249,7 +249,7 @@ func (m *SlurmCgroupCollector) Read(interval time.Duration, output chan lp.CCMes
|
|||||||
|
|
||||||
jobdata, err := m.ReadJobData(jKey)
|
jobdata, err := m.ReadJobData(jKey)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
cclog.ComponentError(m.name, "Error reading job data for", jKey, ":", err.Error())
|
cclog.ComponentErrorf(m.name, "Error reading job data for %s: %s", jKey, err.Error())
|
||||||
continue
|
continue
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
@@ -228,12 +228,12 @@ func (m *SmartMonCollector) Read(interval time.Duration, output chan lp.CCMessag
|
|||||||
|
|
||||||
stdout, err := command.Output()
|
stdout, err := command.Output()
|
||||||
if err != nil {
|
if err != nil {
|
||||||
cclog.ComponentError(m.name, "cannot read data for device", d.Name)
|
cclog.ComponentErrorf(m.name, "cannot read data for device %s", d.Name)
|
||||||
continue
|
continue
|
||||||
}
|
}
|
||||||
err = json.Unmarshal(stdout, &data)
|
err = json.Unmarshal(stdout, &data)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
cclog.ComponentError(m.name, "cannot unmarshal data for device", d.Name)
|
cclog.ComponentErrorf(m.name, "cannot unmarshal data for device %s", d.Name)
|
||||||
continue
|
continue
|
||||||
}
|
}
|
||||||
if !m.excludeMetric.temp {
|
if !m.excludeMetric.temp {
|
||||||
|
|||||||
@@ -188,16 +188,16 @@ func (m *TempCollector) Read(interval time.Duration, output chan lp.CCMessage) {
|
|||||||
// Read sensor file
|
// Read sensor file
|
||||||
buffer, err := os.ReadFile(sensor.file)
|
buffer, err := os.ReadFile(sensor.file)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
cclog.ComponentError(
|
cclog.ComponentErrorf(
|
||||||
m.name,
|
m.name,
|
||||||
fmt.Sprintf("Read(): Failed to read file '%s': %v", sensor.file, err))
|
"Read(): Failed to read file '%s': %v", sensor.file, err)
|
||||||
continue
|
continue
|
||||||
}
|
}
|
||||||
x, err := strconv.ParseInt(strings.TrimSpace(string(buffer)), 10, 64)
|
x, err := strconv.ParseInt(strings.TrimSpace(string(buffer)), 10, 64)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
cclog.ComponentError(
|
cclog.ComponentErrorf(
|
||||||
m.name,
|
m.name,
|
||||||
fmt.Sprintf("Read(): Failed to convert temperature '%s' to int64: %v", buffer, err))
|
"Read(): Failed to convert temperature '%s' to int64: %v", buffer, err)
|
||||||
continue
|
continue
|
||||||
}
|
}
|
||||||
x /= 1000
|
x /= 1000
|
||||||
|
|||||||
@@ -77,9 +77,9 @@ func (m *TopProcsCollector) Read(interval time.Duration, output chan lp.CCMessag
|
|||||||
command := exec.Command("ps", "-Ao", "comm", "--sort=-pcpu")
|
command := exec.Command("ps", "-Ao", "comm", "--sort=-pcpu")
|
||||||
stdout, err := command.Output()
|
stdout, err := command.Output()
|
||||||
if err != nil {
|
if err != nil {
|
||||||
cclog.ComponentError(
|
cclog.ComponentErrorf(
|
||||||
m.name,
|
m.name,
|
||||||
fmt.Sprintf("Read(): Failed to read output from command \"%s\": %v", command.String(), err))
|
"Read(): Failed to read output from command \"%s\": %v", command.String(), err)
|
||||||
return
|
return
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
@@ -35,18 +35,18 @@ type metricRouterTagConfig struct {
|
|||||||
|
|
||||||
// Metric router configuration
|
// Metric router configuration
|
||||||
type metricRouterConfig struct {
|
type metricRouterConfig struct {
|
||||||
HostnameTagName string `json:"hostname_tag"` // Key name used when adding the hostname to a metric (default 'hostname')
|
HostnameTagName string `json:"hostname_tag,omitempty"` // Key name used when adding the hostname to a metric (default 'hostname')
|
||||||
AddTags []metricRouterTagConfig `json:"add_tags"` // List of tags that are added when the condition is met
|
AddTags []metricRouterTagConfig `json:"add_tags,omitempty"` // List of tags that are added when the condition is met
|
||||||
DelTags []metricRouterTagConfig `json:"delete_tags"` // List of tags that are removed when the condition is met
|
DelTags []metricRouterTagConfig `json:"delete_tags,omitempty"` // List of tags that are removed when the condition is met
|
||||||
IntervalAgg []agg.MetricAggregatorIntervalConfig `json:"interval_aggregates"` // List of aggregation function processed at the end of an interval
|
IntervalAgg []agg.MetricAggregatorIntervalConfig `json:"interval_aggregates,omitempty"` // List of aggregation function processed at the end of an interval
|
||||||
DropMetrics []string `json:"drop_metrics"` // List of metric names to drop. For fine-grained dropping use drop_metrics_if
|
DropMetrics []string `json:"drop_metrics,omitempty"` // List of metric names to drop. For fine-grained dropping use drop_metrics_if
|
||||||
DropMetricsIf []string `json:"drop_metrics_if"` // List of evaluatable terms to drop metrics
|
DropMetricsIf []string `json:"drop_metrics_if,omitempty"` // List of evaluatable terms to drop metrics
|
||||||
RenameMetrics map[string]string `json:"rename_metrics"` // Map to rename metric name from key to value
|
RenameMetrics map[string]string `json:"rename_metrics,omitempty"` // Map to rename metric name from key to value
|
||||||
IntervalStamp bool `json:"interval_timestamp"` // Update timestamp periodically by ticker each interval?
|
IntervalStamp bool `json:"interval_timestamp,omitempty"` // Update timestamp periodically by ticker each interval?
|
||||||
NumCacheIntervals int `json:"num_cache_intervals"` // Number of intervals of cached metrics for evaluation
|
NumCacheIntervals int `json:"num_cache_intervals,omitempty"` // Number of intervals of cached metrics for evaluation
|
||||||
MaxForward int `json:"max_forward"` // Number of maximal forwarded metrics at one select
|
MaxForward int `json:"max_forward,omitempty"` // Number of maximal forwarded metrics at one select
|
||||||
NormalizeUnits bool `json:"normalize_units"` // Check unit meta flag and normalize it using cc-units
|
NormalizeUnits bool `json:"normalize_units,omitempty"` // Check unit meta flag and normalize it using cc-units
|
||||||
ChangeUnitPrefix map[string]string `json:"change_unit_prefix"` // Add prefix that should be applied to the metrics
|
ChangeUnitPrefix map[string]string `json:"change_unit_prefix,omitempty"` // Add prefix that should be applied to the metrics
|
||||||
MessageProcessor json.RawMessage `json:"process_messages,omitempty"`
|
MessageProcessor json.RawMessage `json:"process_messages,omitempty"`
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -297,7 +297,7 @@ func (r *metricRouter) Start() {
|
|||||||
|
|
||||||
case timestamp := <-timeChan:
|
case timestamp := <-timeChan:
|
||||||
r.timestamp = timestamp
|
r.timestamp = timestamp
|
||||||
cclog.ComponentDebug("MetricRouter", "Update timestamp", r.timestamp.UnixNano())
|
cclog.ComponentDebugf("MetricRouter", "Update timestamp %d", r.timestamp.UnixNano())
|
||||||
|
|
||||||
case p := <-r.coll_input:
|
case p := <-r.coll_input:
|
||||||
coll_forward(p)
|
coll_forward(p)
|
||||||
|
|||||||
Reference in New Issue
Block a user