mirror of
https://github.com/ClusterCockpit/cc-metric-collector.git
synced 2026-06-11 14:27:31 +02:00
Compare commits
11 Commits
fix/libdrm
...
nvidia_gpm
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
5d55ee7a77 | ||
|
|
5938368a76 | ||
|
|
077204d39f | ||
|
|
dcc9746df4 | ||
|
|
2c51a3ed72 | ||
|
|
656ea73d12 | ||
|
|
330f923596 | ||
|
|
8e58072ff6 | ||
|
|
0f6fee9db4 | ||
|
|
7585ee7289 | ||
|
|
30b2eb69dd |
@@ -227,8 +227,7 @@ func (m *BeegfsMetaCollector) Read(interval time.Duration, output chan lp.CCMess
|
|||||||
|
|
||||||
for key, data := range m.matches {
|
for key, data := range m.matches {
|
||||||
value, _ := strconv.ParseFloat(data, 32)
|
value, _ := strconv.ParseFloat(data, 32)
|
||||||
y, err := lp.NewMessage(key, m.tags, m.meta, map[string]any{"value": value}, time.Now())
|
if y, err := lp.NewMetric(key, m.tags, m.meta, value, time.Now()); err == nil {
|
||||||
if err == nil {
|
|
||||||
output <- y
|
output <- y
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -218,8 +218,7 @@ func (m *BeegfsStorageCollector) Read(interval time.Duration, output chan lp.CCM
|
|||||||
|
|
||||||
for key, data := range m.matches {
|
for key, data := range m.matches {
|
||||||
value, _ := strconv.ParseFloat(data, 32)
|
value, _ := strconv.ParseFloat(data, 32)
|
||||||
y, err := lp.NewMessage(key, m.tags, m.meta, map[string]any{"value": value}, time.Now())
|
if y, err := lp.NewMetric(key, m.tags, m.meta, value, time.Now()); err == nil {
|
||||||
if err == nil {
|
|
||||||
output <- y
|
output <- y
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -50,6 +50,7 @@ var AvailableCollectors = map[string]MetricCollector{
|
|||||||
"nfsiostat": new(NfsIOStatCollector),
|
"nfsiostat": new(NfsIOStatCollector),
|
||||||
"slurm_cgroup": new(SlurmCgroupCollector),
|
"slurm_cgroup": new(SlurmCgroupCollector),
|
||||||
"smartmon": new(SmartMonCollector),
|
"smartmon": new(SmartMonCollector),
|
||||||
|
"nvidia_gpm": new(NvidiaGPMCollector),
|
||||||
}
|
}
|
||||||
|
|
||||||
// Metric collector manager data structure
|
// Metric collector manager data structure
|
||||||
@@ -99,17 +100,17 @@ func (cm *collectorManager) Init(ticker mct.MultiChanTicker, duration time.Durat
|
|||||||
// Initialize configured collectors
|
// Initialize configured collectors
|
||||||
for collectorName, collectorCfg := range cm.config {
|
for collectorName, collectorCfg := range cm.config {
|
||||||
if _, found := AvailableCollectors[collectorName]; !found {
|
if _, found := AvailableCollectors[collectorName]; !found {
|
||||||
cclog.ComponentError("CollectorManager", "SKIP unknown collector", collectorName)
|
cclog.ComponentErrorf("CollectorManager", "SKIP unknown collector %s", collectorName)
|
||||||
continue
|
continue
|
||||||
}
|
}
|
||||||
collector := AvailableCollectors[collectorName]
|
collector := AvailableCollectors[collectorName]
|
||||||
|
|
||||||
err := collector.Init(collectorCfg)
|
err := collector.Init(collectorCfg)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
cclog.ComponentError("CollectorManager", fmt.Sprintf("Collector %s initialization failed: %v", collectorName, err))
|
cclog.ComponentErrorf("CollectorManager", "Collector %s initialization failed: %v", collectorName, err)
|
||||||
continue
|
continue
|
||||||
}
|
}
|
||||||
cclog.ComponentDebug("CollectorManager", "ADD COLLECTOR", collector.Name())
|
cclog.ComponentDebugf("CollectorManager", "ADD COLLECTOR %s", collector.Name())
|
||||||
if collector.Parallel() {
|
if collector.Parallel() {
|
||||||
cm.collectors = append(cm.collectors, collector)
|
cm.collectors = append(cm.collectors, collector)
|
||||||
} else {
|
} else {
|
||||||
@@ -155,7 +156,7 @@ func (cm *collectorManager) Start() {
|
|||||||
return
|
return
|
||||||
default:
|
default:
|
||||||
// Read metrics from collector c via goroutine
|
// Read metrics from collector c via goroutine
|
||||||
cclog.ComponentDebug("CollectorManager", c.Name(), t)
|
cclog.ComponentDebugf("CollectorManager: Read %s at %v", c.Name(), t)
|
||||||
cm.collector_wg.Add(1)
|
cm.collector_wg.Add(1)
|
||||||
go func(myc MetricCollector) {
|
go func(myc MetricCollector) {
|
||||||
myc.Read(cm.duration, cm.output)
|
myc.Read(cm.duration, cm.output)
|
||||||
@@ -173,7 +174,7 @@ func (cm *collectorManager) Start() {
|
|||||||
return
|
return
|
||||||
default:
|
default:
|
||||||
// Read metrics from collector c
|
// Read metrics from collector c
|
||||||
cclog.ComponentDebug("CollectorManager", c.Name(), t)
|
cclog.ComponentDebugf("CollectorManager: Read %s at %v", c.Name(), t)
|
||||||
c.Read(cm.duration, cm.output)
|
c.Read(cm.duration, cm.output)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -171,7 +171,7 @@ func (m *CPUFreqCpuInfoCollector) Read(interval time.Duration, output chan lp.CC
|
|||||||
fmt.Sprintf("Read(): Failed to convert cpu MHz '%s' to float64: %v", lineSplit[1], err))
|
fmt.Sprintf("Read(): Failed to convert cpu MHz '%s' to float64: %v", lineSplit[1], err))
|
||||||
return
|
return
|
||||||
}
|
}
|
||||||
if y, err := lp.NewMessage("cpufreq", t.tagSet, m.meta, map[string]any{"value": value}, now); err == nil {
|
if y, err := lp.NewMetric("cpufreq", t.tagSet, m.meta, value, now); err == nil {
|
||||||
output <- y
|
output <- y
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -129,7 +129,7 @@ func (m *CPUFreqCollector) Read(interval time.Duration, output chan lp.CCMessage
|
|||||||
continue
|
continue
|
||||||
}
|
}
|
||||||
|
|
||||||
if y, err := lp.NewMessage("cpufreq", t.tagSet, m.meta, map[string]any{"value": cpuFreq}, now); err == nil {
|
if y, err := lp.NewMetric("cpufreq", t.tagSet, m.meta, cpuFreq, now); err == nil {
|
||||||
output <- y
|
output <- y
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -27,6 +27,7 @@ const CPUSTATFILE = `/proc/stat`
|
|||||||
|
|
||||||
type CpustatCollectorConfig struct {
|
type CpustatCollectorConfig struct {
|
||||||
ExcludeMetrics []string `json:"exclude_metrics,omitempty"`
|
ExcludeMetrics []string `json:"exclude_metrics,omitempty"`
|
||||||
|
excludeNumCPUs bool
|
||||||
}
|
}
|
||||||
|
|
||||||
type CpustatCollector struct {
|
type CpustatCollector struct {
|
||||||
@@ -79,6 +80,7 @@ func (m *CpustatCollector) Init(config json.RawMessage) error {
|
|||||||
m.matches[match] = index
|
m.matches[match] = index
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
m.config.excludeNumCPUs = slices.Contains(m.config.ExcludeMetrics, "num_cpus")
|
||||||
|
|
||||||
// Check input file
|
// Check input file
|
||||||
file, err := os.Open(CPUSTATFILE)
|
file, err := os.Open(CPUSTATFILE)
|
||||||
@@ -95,11 +97,13 @@ func (m *CpustatCollector) Init(config json.RawMessage) error {
|
|||||||
line := scanner.Text()
|
line := scanner.Text()
|
||||||
linefields := strings.Fields(line)
|
linefields := strings.Fields(line)
|
||||||
if strings.Compare(linefields[0], "cpu") == 0 {
|
if strings.Compare(linefields[0], "cpu") == 0 {
|
||||||
|
// Kernel system statistics for all CPUs
|
||||||
m.olddata["cpu"] = make(map[string]int64)
|
m.olddata["cpu"] = make(map[string]int64)
|
||||||
for k, v := range m.matches {
|
for k, v := range m.matches {
|
||||||
m.olddata["cpu"][k], _ = strconv.ParseInt(linefields[v], 0, 64)
|
m.olddata["cpu"][k], _ = strconv.ParseInt(linefields[v], 0, 64)
|
||||||
}
|
}
|
||||||
} else if strings.HasPrefix(linefields[0], "cpu") && strings.Compare(linefields[0], "cpu") != 0 {
|
} else if strings.HasPrefix(linefields[0], "cpu") && strings.Compare(linefields[0], "cpu") != 0 {
|
||||||
|
// Kernel system statistics per CPU
|
||||||
cpustr := strings.TrimLeft(linefields[0], "cpu")
|
cpustr := strings.TrimLeft(linefields[0], "cpu")
|
||||||
cpu, _ := strconv.Atoi(cpustr)
|
cpu, _ := strconv.Atoi(cpustr)
|
||||||
m.cputags[linefields[0]] = map[string]string{
|
m.cputags[linefields[0]] = map[string]string{
|
||||||
@@ -141,7 +145,7 @@ func (m *CpustatCollector) parseStatLine(linefields []string, tags map[string]st
|
|||||||
sum := float64(0)
|
sum := float64(0)
|
||||||
for name, value := range values {
|
for name, value := range values {
|
||||||
sum += value
|
sum += value
|
||||||
y, err := lp.NewMessage(name, tags, m.meta, map[string]any{"value": value * 100}, now)
|
y, err := lp.NewMetric(name, tags, m.meta, value*100, now)
|
||||||
if err == nil {
|
if err == nil {
|
||||||
y.AddTag("unit", "Percent")
|
y.AddTag("unit", "Percent")
|
||||||
output <- y
|
output <- y
|
||||||
@@ -149,7 +153,7 @@ func (m *CpustatCollector) parseStatLine(linefields []string, tags map[string]st
|
|||||||
}
|
}
|
||||||
if v, ok := values["cpu_idle"]; ok {
|
if v, ok := values["cpu_idle"]; ok {
|
||||||
sum -= v
|
sum -= v
|
||||||
y, err := lp.NewMessage("cpu_used", tags, m.meta, map[string]any{"value": sum * 100}, now)
|
y, err := lp.NewMetric("cpu_used", tags, m.meta, sum*100, now)
|
||||||
if err == nil {
|
if err == nil {
|
||||||
y.AddTag("unit", "Percent")
|
y.AddTag("unit", "Percent")
|
||||||
output <- y
|
output <- y
|
||||||
@@ -191,14 +195,10 @@ func (m *CpustatCollector) Read(interval time.Duration, output chan lp.CCMessage
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
num_cpus_metric, err := lp.NewMessage("num_cpus",
|
if !m.config.excludeNumCPUs {
|
||||||
m.nodetags,
|
if num_cpus_metric, err := lp.NewMetric("num_cpus", m.nodetags, m.meta, num_cpus, now); err == nil {
|
||||||
m.meta,
|
output <- num_cpus_metric
|
||||||
map[string]any{"value": num_cpus},
|
}
|
||||||
now,
|
|
||||||
)
|
|
||||||
if err == nil {
|
|
||||||
output <- num_cpus_metric
|
|
||||||
}
|
}
|
||||||
|
|
||||||
m.lastTimestamp = now
|
m.lastTimestamp = now
|
||||||
|
|||||||
@@ -128,30 +128,14 @@ mountLoop:
|
|||||||
tags := map[string]string{"type": "node", "device": linefields[0]}
|
tags := map[string]string{"type": "node", "device": linefields[0]}
|
||||||
total := (stat.Blocks * uint64(stat.Bsize)) / uint64(1000_000_000)
|
total := (stat.Blocks * uint64(stat.Bsize)) / uint64(1000_000_000)
|
||||||
if m.allowedMetrics["disk_total"] {
|
if m.allowedMetrics["disk_total"] {
|
||||||
y, err := lp.NewMessage(
|
if y, err := lp.NewMetric("disk_total", tags, m.meta, total, time.Now()); err == nil {
|
||||||
"disk_total",
|
|
||||||
tags,
|
|
||||||
m.meta,
|
|
||||||
map[string]any{
|
|
||||||
"value": total,
|
|
||||||
},
|
|
||||||
time.Now())
|
|
||||||
if err == nil {
|
|
||||||
y.AddMeta("unit", "GBytes")
|
y.AddMeta("unit", "GBytes")
|
||||||
output <- y
|
output <- y
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
free := (stat.Bfree * uint64(stat.Bsize)) / uint64(1000_000_000)
|
free := (stat.Bfree * uint64(stat.Bsize)) / uint64(1000_000_000)
|
||||||
if m.allowedMetrics["disk_free"] {
|
if m.allowedMetrics["disk_free"] {
|
||||||
y, err := lp.NewMessage(
|
if y, err := lp.NewMetric("disk_free", tags, m.meta, free, time.Now()); err == nil {
|
||||||
"disk_free",
|
|
||||||
tags,
|
|
||||||
m.meta,
|
|
||||||
map[string]any{
|
|
||||||
"value": free,
|
|
||||||
},
|
|
||||||
time.Now())
|
|
||||||
if err == nil {
|
|
||||||
y.AddMeta("unit", "GBytes")
|
y.AddMeta("unit", "GBytes")
|
||||||
output <- y
|
output <- y
|
||||||
}
|
}
|
||||||
@@ -164,16 +148,7 @@ mountLoop:
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
if m.allowedMetrics["part_max_used"] {
|
if m.allowedMetrics["part_max_used"] {
|
||||||
y, err := lp.NewMessage(
|
y, err := lp.NewMetric("part_max_used", map[string]string{"type": "node"}, m.meta, int(part_max_used), time.Now())
|
||||||
"part_max_used",
|
|
||||||
map[string]string{
|
|
||||||
"type": "node",
|
|
||||||
},
|
|
||||||
m.meta,
|
|
||||||
map[string]any{
|
|
||||||
"value": int(part_max_used),
|
|
||||||
},
|
|
||||||
time.Now())
|
|
||||||
if err == nil {
|
if err == nil {
|
||||||
y.AddMeta("unit", "percent")
|
y.AddMeta("unit", "percent")
|
||||||
output <- y
|
output <- y
|
||||||
|
|||||||
@@ -32,7 +32,6 @@ type InfinibandCollectorMetric struct {
|
|||||||
scale int64
|
scale int64
|
||||||
addToIBTotal bool
|
addToIBTotal bool
|
||||||
addToIBTotalPkgs bool
|
addToIBTotalPkgs bool
|
||||||
currentState int64
|
|
||||||
lastState int64
|
lastState int64
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -202,7 +201,9 @@ func (m *InfinibandCollector) Read(interval time.Duration, output chan lp.CCMess
|
|||||||
for i := range m.info {
|
for i := range m.info {
|
||||||
info := &m.info[i]
|
info := &m.info[i]
|
||||||
|
|
||||||
var ib_total, ib_total_pkts int64
|
var ib_total, ib_total_last_state,
|
||||||
|
ib_total_pkts, ib_total_pkts_last_state int64
|
||||||
|
var ib_total_last_state_available, ib_total_pkts_last_state_available bool
|
||||||
for i := range info.portCounterFiles {
|
for i := range info.portCounterFiles {
|
||||||
counterDef := &info.portCounterFiles[i]
|
counterDef := &info.portCounterFiles[i]
|
||||||
|
|
||||||
@@ -227,19 +228,9 @@ func (m *InfinibandCollector) Read(interval time.Duration, output chan lp.CCMess
|
|||||||
// Scale raw value
|
// Scale raw value
|
||||||
v *= counterDef.scale
|
v *= counterDef.scale
|
||||||
|
|
||||||
// Save current state
|
|
||||||
counterDef.currentState = v
|
|
||||||
|
|
||||||
// Send absolute values
|
// Send absolute values
|
||||||
if m.config.SendAbsoluteValues {
|
if m.config.SendAbsoluteValues {
|
||||||
if y, err := lp.NewMessage(
|
if y, err := lp.NewMetric(counterDef.name, info.tagSet, m.meta, v, now); err == nil {
|
||||||
counterDef.name,
|
|
||||||
info.tagSet,
|
|
||||||
m.meta,
|
|
||||||
map[string]any{
|
|
||||||
"value": counterDef.currentState,
|
|
||||||
},
|
|
||||||
now); err == nil {
|
|
||||||
y.AddMeta("unit", counterDef.unit)
|
y.AddMeta("unit", counterDef.unit)
|
||||||
output <- y
|
output <- y
|
||||||
}
|
}
|
||||||
@@ -248,59 +239,65 @@ func (m *InfinibandCollector) Read(interval time.Duration, output chan lp.CCMess
|
|||||||
// Send derived values
|
// Send derived values
|
||||||
if m.config.SendDerivedValues {
|
if m.config.SendDerivedValues {
|
||||||
if counterDef.lastState >= 0 {
|
if counterDef.lastState >= 0 {
|
||||||
rate := float64((counterDef.currentState - counterDef.lastState)) / timeDiff
|
rate := float64((v - counterDef.lastState)) / timeDiff
|
||||||
if y, err := lp.NewMessage(
|
if y, err := lp.NewMetric(counterDef.name+"_bw", info.tagSet, m.meta, rate, now); err == nil {
|
||||||
counterDef.name+"_bw",
|
|
||||||
info.tagSet,
|
|
||||||
m.meta,
|
|
||||||
map[string]any{
|
|
||||||
"value": rate,
|
|
||||||
},
|
|
||||||
now); err == nil {
|
|
||||||
y.AddMeta("unit", counterDef.unit+"/sec")
|
y.AddMeta("unit", counterDef.unit+"/sec")
|
||||||
output <- y
|
output <- y
|
||||||
|
}
|
||||||
|
|
||||||
|
// Sum up total values of last state
|
||||||
|
if m.config.SendTotalValues {
|
||||||
|
switch {
|
||||||
|
case counterDef.addToIBTotal:
|
||||||
|
ib_total_last_state += counterDef.lastState
|
||||||
|
ib_total_last_state_available = true
|
||||||
|
case counterDef.addToIBTotalPkgs:
|
||||||
|
ib_total_pkts_last_state += counterDef.lastState
|
||||||
|
ib_total_pkts_last_state_available = true
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
counterDef.lastState = counterDef.currentState
|
counterDef.lastState = v
|
||||||
}
|
}
|
||||||
|
|
||||||
// Sum up total values
|
// Sum up total values
|
||||||
if m.config.SendTotalValues {
|
if m.config.SendTotalValues {
|
||||||
switch {
|
switch {
|
||||||
case counterDef.addToIBTotal:
|
case counterDef.addToIBTotal:
|
||||||
ib_total += counterDef.currentState
|
ib_total += v
|
||||||
case counterDef.addToIBTotalPkgs:
|
case counterDef.addToIBTotalPkgs:
|
||||||
ib_total_pkts += counterDef.currentState
|
ib_total_pkts += v
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// Send total values
|
// Send total values
|
||||||
if m.config.SendTotalValues {
|
if m.config.SendTotalValues {
|
||||||
if y, err := lp.NewMessage(
|
if y, err := lp.NewMetric("ib_total", info.tagSet, m.meta, ib_total, now); err == nil {
|
||||||
"ib_total",
|
|
||||||
info.tagSet,
|
|
||||||
m.meta,
|
|
||||||
map[string]any{
|
|
||||||
"value": ib_total,
|
|
||||||
},
|
|
||||||
now); err == nil {
|
|
||||||
y.AddMeta("unit", "bytes")
|
y.AddMeta("unit", "bytes")
|
||||||
output <- y
|
output <- y
|
||||||
}
|
}
|
||||||
|
|
||||||
if y, err := lp.NewMessage(
|
if y, err := lp.NewMetric("ib_total_pkts", info.tagSet, m.meta, ib_total_pkts, now); err == nil {
|
||||||
"ib_total_pkts",
|
|
||||||
info.tagSet,
|
|
||||||
m.meta,
|
|
||||||
map[string]any{
|
|
||||||
"value": ib_total_pkts,
|
|
||||||
},
|
|
||||||
now); err == nil {
|
|
||||||
y.AddMeta("unit", "packets")
|
y.AddMeta("unit", "packets")
|
||||||
output <- y
|
output <- y
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if m.config.SendDerivedValues && ib_total_last_state_available {
|
||||||
|
rate := float64((ib_total - ib_total_last_state)) / timeDiff
|
||||||
|
if y, err := lp.NewMetric("ib_total_bw", info.tagSet, m.meta, rate, now); err == nil {
|
||||||
|
y.AddMeta("unit", "bytes/sec")
|
||||||
|
output <- y
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if m.config.SendDerivedValues && ib_total_pkts_last_state_available {
|
||||||
|
rate := float64((ib_total_pkts - ib_total_pkts_last_state)) / timeDiff
|
||||||
|
if y, err := lp.NewMetric("ib_total_pkts_bw", info.tagSet, m.meta, rate, now); err == nil {
|
||||||
|
y.AddMeta("unit", "packets/sec")
|
||||||
|
output <- y
|
||||||
|
}
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -41,5 +41,7 @@ Metrics:
|
|||||||
* `ib_xmit_bw` (if `send_derived_values == true`)
|
* `ib_xmit_bw` (if `send_derived_values == true`)
|
||||||
* `ib_recv_pkts_bw` (if `send_derived_values == true`)
|
* `ib_recv_pkts_bw` (if `send_derived_values == true`)
|
||||||
* `ib_xmit_pkts_bw` (if `send_derived_values == true`)
|
* `ib_xmit_pkts_bw` (if `send_derived_values == true`)
|
||||||
|
* `ib_total_bw` (if `send_total_values == true` and `send_derived_values == true`)
|
||||||
|
* `ib_total_pkts_bw` (if `send_total_values == true` and `send_derived_values == true`)
|
||||||
|
|
||||||
The collector adds a `device` tag to all metrics
|
The collector adds a `device` tag to all metrics
|
||||||
|
|||||||
@@ -28,9 +28,9 @@ type IpmiCollector struct {
|
|||||||
metricCollector
|
metricCollector
|
||||||
|
|
||||||
config struct {
|
config struct {
|
||||||
IpmitoolPath string `json:"ipmitool_path"`
|
IpmitoolPath string `json:"ipmitool_path"`
|
||||||
IpmisensorsPath string `json:"ipmisensors_path"`
|
IpmisensorsPath string `json:"ipmisensors_path"`
|
||||||
Sudo bool `json:"use_sudo"`
|
Sudo bool `json:"use_sudo"`
|
||||||
}
|
}
|
||||||
|
|
||||||
ipmitool string
|
ipmitool string
|
||||||
@@ -157,7 +157,7 @@ func (m *IpmiCollector) readIpmiTool(output chan lp.CCMessage) error {
|
|||||||
unit = "Watts"
|
unit = "Watts"
|
||||||
}
|
}
|
||||||
|
|
||||||
y, err := lp.NewMessage(name, map[string]string{"type": "node"}, m.meta, map[string]any{"value": v}, time.Now())
|
y, err := lp.NewMetric(name, map[string]string{"type": "node"}, m.meta, v, time.Now())
|
||||||
if err != nil {
|
if err != nil {
|
||||||
cclog.ComponentErrorf(m.name, "Failed to create message: %v", err)
|
cclog.ComponentErrorf(m.name, "Failed to create message: %v", err)
|
||||||
continue
|
continue
|
||||||
@@ -209,7 +209,7 @@ func (m *IpmiCollector) readIpmiSensors(output chan lp.CCMessage) error {
|
|||||||
continue
|
continue
|
||||||
}
|
}
|
||||||
name := strings.ToLower(strings.ReplaceAll(lv[1], " ", "_"))
|
name := strings.ToLower(strings.ReplaceAll(lv[1], " ", "_"))
|
||||||
y, err := lp.NewMessage(name, map[string]string{"type": "node"}, m.meta, map[string]any{"value": v}, time.Now())
|
y, err := lp.NewMetric(name, map[string]string{"type": "node"}, m.meta, v, time.Now())
|
||||||
if err != nil {
|
if err != nil {
|
||||||
cclog.ComponentErrorf(m.name, "Failed to create message: %v", err)
|
cclog.ComponentErrorf(m.name, "Failed to create message: %v", err)
|
||||||
continue
|
continue
|
||||||
|
|||||||
@@ -109,7 +109,7 @@ func (m *LoadavgCollector) Read(interval time.Duration, output chan lp.CCMessage
|
|||||||
if m.load_skips[i] {
|
if m.load_skips[i] {
|
||||||
continue
|
continue
|
||||||
}
|
}
|
||||||
y, err := lp.NewMessage(name, m.tags, m.meta, map[string]any{"value": x}, now)
|
y, err := lp.NewMetric(name, m.tags, m.meta, x, now)
|
||||||
if err == nil {
|
if err == nil {
|
||||||
output <- y
|
output <- y
|
||||||
}
|
}
|
||||||
@@ -128,7 +128,7 @@ func (m *LoadavgCollector) Read(interval time.Duration, output chan lp.CCMessage
|
|||||||
if m.proc_skips[i] {
|
if m.proc_skips[i] {
|
||||||
continue
|
continue
|
||||||
}
|
}
|
||||||
y, err := lp.NewMessage(name, m.tags, m.meta, map[string]any{"value": x}, now)
|
y, err := lp.NewMetric(name, m.tags, m.meta, x, now)
|
||||||
if err == nil {
|
if err == nil {
|
||||||
output <- y
|
output <- y
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -72,7 +72,8 @@ func getStats(filename string) map[string]MemstatStats {
|
|||||||
for scanner.Scan() {
|
for scanner.Scan() {
|
||||||
line := scanner.Text()
|
line := scanner.Text()
|
||||||
linefields := strings.Fields(line)
|
linefields := strings.Fields(line)
|
||||||
if len(linefields) == 3 {
|
switch len(linefields) {
|
||||||
|
case 3:
|
||||||
v, err := strconv.ParseFloat(linefields[1], 64)
|
v, err := strconv.ParseFloat(linefields[1], 64)
|
||||||
if err == nil {
|
if err == nil {
|
||||||
stats[strings.Trim(linefields[0], ":")] = MemstatStats{
|
stats[strings.Trim(linefields[0], ":")] = MemstatStats{
|
||||||
@@ -80,7 +81,7 @@ func getStats(filename string) map[string]MemstatStats {
|
|||||||
unit: linefields[2],
|
unit: linefields[2],
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
} else if len(linefields) == 5 {
|
case 5:
|
||||||
v, err := strconv.ParseFloat(linefields[3], 64)
|
v, err := strconv.ParseFloat(linefields[3], 64)
|
||||||
if err == nil {
|
if err == nil {
|
||||||
cclog.ComponentDebug("getStats", strings.Trim(linefields[2], ":"), v, linefields[4])
|
cclog.ComponentDebug("getStats", strings.Trim(linefields[2], ":"), v, linefields[4])
|
||||||
@@ -106,7 +107,10 @@ func (m *MemstatCollector) Init(config json.RawMessage) error {
|
|||||||
return fmt.Errorf("%s Init(): Error decoding JSON config: %w", m.name, err)
|
return fmt.Errorf("%s Init(): Error decoding JSON config: %w", m.name, err)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
m.meta = map[string]string{"source": m.name, "group": "Memory"}
|
m.meta = map[string]string{
|
||||||
|
"source": m.name,
|
||||||
|
"group": "Memory",
|
||||||
|
}
|
||||||
m.stats = make(map[string]int64)
|
m.stats = make(map[string]int64)
|
||||||
m.matches = make(map[string]string)
|
m.matches = make(map[string]string)
|
||||||
m.tags = map[string]string{"type": "node"}
|
m.tags = map[string]string{"type": "node"}
|
||||||
@@ -145,7 +149,7 @@ func (m *MemstatCollector) Init(config json.RawMessage) error {
|
|||||||
"KernelStack": "mem_kernelstack",
|
"KernelStack": "mem_kernelstack",
|
||||||
}
|
}
|
||||||
for k, v := range matches {
|
for k, v := range matches {
|
||||||
if !slices.Contains(m.config.ExcludeMetrics, k) {
|
if !slices.Contains(m.config.ExcludeMetrics, v) {
|
||||||
m.matches[k] = v
|
m.matches[k] = v
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -153,7 +157,7 @@ func (m *MemstatCollector) Init(config json.RawMessage) error {
|
|||||||
if !slices.Contains(m.config.ExcludeMetrics, "mem_used") {
|
if !slices.Contains(m.config.ExcludeMetrics, "mem_used") {
|
||||||
m.sendMemUsed = true
|
m.sendMemUsed = true
|
||||||
}
|
}
|
||||||
if len(m.matches) == 0 {
|
if len(m.matches) == 0 && !m.sendMemUsed {
|
||||||
return fmt.Errorf("%s Init(): no metrics to collect", m.name)
|
return fmt.Errorf("%s Init(): no metrics to collect", m.name)
|
||||||
}
|
}
|
||||||
if err := m.setup(); err != nil {
|
if err := m.setup(); err != nil {
|
||||||
@@ -213,7 +217,7 @@ func (m *MemstatCollector) Read(interval time.Duration, output chan lp.CCMessage
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
y, err := lp.NewMessage(name, tags, m.meta, map[string]any{"value": value}, time.Now())
|
y, err := lp.NewMetric(name, tags, m.meta, value, time.Now())
|
||||||
if err == nil {
|
if err == nil {
|
||||||
if len(unit) > 0 {
|
if len(unit) > 0 {
|
||||||
y.AddMeta("unit", unit)
|
y.AddMeta("unit", unit)
|
||||||
@@ -252,7 +256,7 @@ func (m *MemstatCollector) Read(interval time.Duration, output chan lp.CCMessage
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
y, err := lp.NewMessage("mem_used", tags, m.meta, map[string]any{"value": memUsed}, time.Now())
|
y, err := lp.NewMetric("mem_used", tags, m.meta, memUsed, time.Now())
|
||||||
if err == nil {
|
if err == nil {
|
||||||
if len(unit) > 0 {
|
if len(unit) > 0 {
|
||||||
y.AddMeta("unit", unit)
|
y.AddMeta("unit", unit)
|
||||||
|
|||||||
@@ -262,14 +262,14 @@ func (m *NetstatCollector) Read(interval time.Duration, output chan lp.CCMessage
|
|||||||
continue
|
continue
|
||||||
}
|
}
|
||||||
if m.config.SendAbsoluteValues {
|
if m.config.SendAbsoluteValues {
|
||||||
if y, err := lp.NewMessage(metric.name, metric.tags, metric.meta, map[string]any{"value": v}, now); err == nil {
|
if y, err := lp.NewMetric(metric.name, metric.tags, metric.meta, v, now); err == nil {
|
||||||
output <- y
|
output <- y
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
if m.config.SendDerivedValues {
|
if m.config.SendDerivedValues {
|
||||||
if metric.lastValue >= 0 {
|
if metric.lastValue >= 0 {
|
||||||
rate := float64(v-metric.lastValue) / timeDiff
|
rate := float64(v-metric.lastValue) / timeDiff
|
||||||
if y, err := lp.NewMessage(metric.name+"_bw", metric.tags, metric.meta_rates, map[string]any{"value": rate}, now); err == nil {
|
if y, err := lp.NewMetric(metric.name+"_bw", metric.tags, metric.meta_rates, rate, now); err == nil {
|
||||||
output <- y
|
output <- y
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -146,14 +146,13 @@ func (m *nfsCollector) Read(interval time.Duration, output chan lp.CCMessage) {
|
|||||||
continue
|
continue
|
||||||
}
|
}
|
||||||
|
|
||||||
valueMap := make(map[string]any)
|
|
||||||
if data.current >= 0 && data.last >= 0 {
|
if data.current >= 0 && data.last >= 0 {
|
||||||
valueMap["value"] = data.current - data.last
|
value := data.current - data.last
|
||||||
}
|
y, err := lp.NewMetric(fmt.Sprintf("%s_%s", prefix, name), m.tags, m.meta, value, timestamp)
|
||||||
y, err := lp.NewMessage(fmt.Sprintf("%s_%s", prefix, name), m.tags, m.meta, valueMap, timestamp)
|
if err == nil {
|
||||||
if err == nil {
|
y.AddMeta("version", m.version)
|
||||||
y.AddMeta("version", m.version)
|
output <- y
|
||||||
output <- y
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -145,14 +145,7 @@ func (m *NfsIOStatCollector) Read(interval time.Duration, output chan lp.CCMessa
|
|||||||
if old, ok := m.data[mntpoint]; ok {
|
if old, ok := m.data[mntpoint]; ok {
|
||||||
for name, newVal := range values {
|
for name, newVal := range values {
|
||||||
if m.config.SendAbsoluteValues {
|
if m.config.SendAbsoluteValues {
|
||||||
msg, err := lp.NewMessage(
|
msg, err := lp.NewMetric("nfsio_"+name, m.tags, m.meta, newVal, now)
|
||||||
"nfsio_"+name,
|
|
||||||
m.tags,
|
|
||||||
m.meta,
|
|
||||||
map[string]any{
|
|
||||||
"value": newVal,
|
|
||||||
},
|
|
||||||
now)
|
|
||||||
if err == nil {
|
if err == nil {
|
||||||
msg.AddTag("stype", "filesystem")
|
msg.AddTag("stype", "filesystem")
|
||||||
msg.AddTag("stype-id", mntpoint)
|
msg.AddTag("stype-id", mntpoint)
|
||||||
@@ -161,7 +154,7 @@ func (m *NfsIOStatCollector) Read(interval time.Duration, output chan lp.CCMessa
|
|||||||
}
|
}
|
||||||
if m.config.SendDerivedValues {
|
if m.config.SendDerivedValues {
|
||||||
rate := float64(newVal-old[name]) / timeDiff
|
rate := float64(newVal-old[name]) / timeDiff
|
||||||
msg, err := lp.NewMessage(fmt.Sprintf("nfsio_%s_bw", name), m.tags, m.meta, map[string]any{"value": rate}, now)
|
msg, err := lp.NewMetric(fmt.Sprintf("nfsio_%s_bw", name), m.tags, m.meta, rate, now)
|
||||||
if err == nil {
|
if err == nil {
|
||||||
if strings.HasPrefix(name, "page") {
|
if strings.HasPrefix(name, "page") {
|
||||||
msg.AddMeta("unit", "4K_pages/s")
|
msg.AddMeta("unit", "4K_pages/s")
|
||||||
|
|||||||
396
collectors/nvidiaGPMMetric.go
Normal file
396
collectors/nvidiaGPMMetric.go
Normal file
@@ -0,0 +1,396 @@
|
|||||||
|
package collectors
|
||||||
|
|
||||||
|
import (
|
||||||
|
"encoding/json"
|
||||||
|
"errors"
|
||||||
|
"fmt"
|
||||||
|
"slices"
|
||||||
|
"strconv"
|
||||||
|
"strings"
|
||||||
|
"time"
|
||||||
|
|
||||||
|
cclog "github.com/ClusterCockpit/cc-lib/v2/ccLogger"
|
||||||
|
lp "github.com/ClusterCockpit/cc-lib/v2/ccMessage"
|
||||||
|
"github.com/NVIDIA/go-nvml/pkg/nvml"
|
||||||
|
)
|
||||||
|
|
||||||
|
// NvidiaGPMMetricDef describes one entry of the NvidiaGPMMetrics table:
// a short metric name, an output name, the NVML GPM metric id and a unit.
type NvidiaGPMMetricDef struct {
|
||||||
|
name string // short identifier, e.g. "GRAPHICS_UTIL" in NvidiaGPMMetrics
|
||||||
|
outname string // e.g. "nv_gpm_graphics_util"; presumably the name of the emitted metric - TODO confirm in Read()
|
||||||
|
id nvml.GpmMetricId // NVML GPM metric id, e.g. nvml.GPM_METRIC_GRAPHICS_UTIL
|
||||||
|
unit string // unit string; every entry in NvidiaGPMMetrics uses "%"
|
||||||
|
}
|
||||||
|
|
||||||
|
// NvidiaGPMMetrics is the table of GPM metric definitions known to this file.
// Each entry pairs a short name with an "nv_gpm_"-prefixed output name, the
// corresponding nvml.GPM_METRIC_* id and the unit "%".
// The element type is inferred from the composite literal, so it is not
// repeated in the declaration.
var NvidiaGPMMetrics = []NvidiaGPMMetricDef{
|
||||||
|
{
|
||||||
|
name: "GRAPHICS_UTIL",
|
||||||
|
outname: "nv_gpm_graphics_util",
|
||||||
|
id: nvml.GPM_METRIC_GRAPHICS_UTIL,
|
||||||
|
unit: "%",
|
||||||
|
},
|
||||||
|
{
|
||||||
|
name: "SM_UTIL",
|
||||||
|
outname: "nv_gpm_sm_util",
|
||||||
|
id: nvml.GPM_METRIC_SM_UTIL,
|
||||||
|
unit: "%",
|
||||||
|
},
|
||||||
|
{
|
||||||
|
name: "SM_OCCUPANCY",
|
||||||
|
outname: "nv_gpm_sm_occupancy",
|
||||||
|
id: nvml.GPM_METRIC_SM_OCCUPANCY,
|
||||||
|
unit: "%",
|
||||||
|
},
|
||||||
|
{
|
||||||
|
name: "INTEGER_UTIL",
|
||||||
|
outname: "nv_gpm_integer_util",
|
||||||
|
id: nvml.GPM_METRIC_INTEGER_UTIL,
|
||||||
|
unit: "%",
|
||||||
|
},
|
||||||
|
{
|
||||||
|
name: "ANY_TENSOR_UTIL",
|
||||||
|
outname: "nv_gpm_any_tensor_util",
|
||||||
|
id: nvml.GPM_METRIC_ANY_TENSOR_UTIL,
|
||||||
|
unit: "%",
|
||||||
|
},
|
||||||
|
{
|
||||||
|
name: "DFMA_TENSOR_UTIL",
|
||||||
|
outname: "nv_gpm_dfma_tensor_util",
|
||||||
|
id: nvml.GPM_METRIC_DFMA_TENSOR_UTIL,
|
||||||
|
unit: "%",
|
||||||
|
},
|
||||||
|
{
|
||||||
|
name: "HMMA_TENSOR_UTIL",
|
||||||
|
outname: "nv_gpm_hmma_tensor_util",
|
||||||
|
id: nvml.GPM_METRIC_HMMA_TENSOR_UTIL,
|
||||||
|
unit: "%",
|
||||||
|
},
|
||||||
|
{
|
||||||
|
name: "IMMA_TENSOR_UTIL",
|
||||||
|
outname: "nv_gpm_imma_tensor_util",
|
||||||
|
id: nvml.GPM_METRIC_IMMA_TENSOR_UTIL,
|
||||||
|
unit: "%",
|
||||||
|
},
|
||||||
|
{
|
||||||
|
name: "DRAM_BW_UTIL",
|
||||||
|
outname: "nv_gpm_dram_bw_util",
|
||||||
|
id: nvml.GPM_METRIC_DRAM_BW_UTIL,
|
||||||
|
unit: "%",
|
||||||
|
},
|
||||||
|
{
|
||||||
|
name: "FP64_UTIL",
|
||||||
|
outname: "nv_gpm_fp64_util",
|
||||||
|
id: nvml.GPM_METRIC_FP64_UTIL,
|
||||||
|
unit: "%",
|
||||||
|
},
|
||||||
|
{
|
||||||
|
name: "FP32_UTIL",
|
||||||
|
outname: "nv_gpm_fp32_util",
|
||||||
|
id: nvml.GPM_METRIC_FP32_UTIL,
|
||||||
|
unit: "%",
|
||||||
|
},
|
||||||
|
{
|
||||||
|
name: "FP16_UTIL",
|
||||||
|
outname: "nv_gpm_fp16_util",
|
||||||
|
id: nvml.GPM_METRIC_FP16_UTIL,
|
||||||
|
unit: "%",
|
||||||
|
},
|
||||||
|
}
|
||||||
|
|
||||||
|
// NvidiaGPMCollectorConfig is the JSON configuration of NvidiaGPMCollector.
// Init() decodes it with DisallowUnknownFields(), so unknown keys in the
// collector's JSON config make Init() return an error.
type NvidiaGPMCollectorConfig struct {
|
||||||
|
Metrics []string `json:"metrics,omitempty"` // presumably the GPM metrics to collect - TODO confirm against Init()
|
||||||
|
ExcludeDevices []string `json:"exclude_devices,omitempty"` // Init() skips "excluded devices by ID" using the decimal device index; presumably matched against this list - confirm
|
||||||
|
AddPciInfoTag bool `json:"add_pci_info_tag,omitempty"`
|
||||||
|
UsePciInfoAsTypeId bool `json:"use_pci_info_as_type_id,omitempty"`
|
||||||
|
AddUuidMeta bool `json:"add_uuid_meta,omitempty"`
|
||||||
|
AddBoardNumberMeta bool `json:"add_board_number_meta,omitempty"`
|
||||||
|
AddSerialMeta bool `json:"add_serial_meta,omitempty"`
|
||||||
|
ProcessMigDevices bool `json:"process_mig_devices,omitempty"`
|
||||||
|
UseUuidForMigDevices bool `json:"use_uuid_for_mig_device,omitempty"` // NOTE(review): JSON key ends in singular "_mig_device" while the field name is plural
|
||||||
|
UseSliceForMigDevices bool `json:"use_slice_for_mig_device,omitempty"` // NOTE(review): JSON key ends in singular "_mig_device" while the field name is plural
|
||||||
|
}
|
||||||
|
|
||||||
|
// NvidiaGPMCollectorDevice is the per-device record stored in the gpus slice
// of NvidiaGPMCollector.
type NvidiaGPMCollectorDevice struct {
|
||||||
|
device nvml.Device // NVML device handle
|
||||||
|
tags map[string]string // presumably tags attached to this device's metrics - TODO confirm
|
||||||
|
meta map[string]string // presumably meta information attached to this device's metrics - TODO confirm
|
||||||
|
startTime time.Time // presumably start of the current GPM sampling window - TODO confirm
|
||||||
|
endTime time.Time // presumably end of the current GPM sampling window - TODO confirm
|
||||||
|
measurement nvml.GpmMetricsGetType // NVML GPM metrics structure for this device
|
||||||
|
metricsLookup map[int]NvidiaGPMMetricDef // NOTE(review): meaning of the int key (metric id vs. index) is not visible here - confirm
|
||||||
|
}
|
||||||
|
|
||||||
|
type NvidiaGPMCollector struct {
|
||||||
|
metricCollector
|
||||||
|
|
||||||
|
config NvidiaGPMCollectorConfig
|
||||||
|
gpus []NvidiaGPMCollectorDevice
|
||||||
|
num_gpus int
|
||||||
|
}
|
||||||
|
|
||||||
|
func (m *NvidiaGPMCollector) Init(config json.RawMessage) error {
|
||||||
|
var err error = nil
|
||||||
|
m.name = "NvidiaGPMCollector"
|
||||||
|
m.parallel = true
|
||||||
|
if err := m.setup(); err != nil {
|
||||||
|
return fmt.Errorf("%s Init(): setup() call failed: %w", m.name, err)
|
||||||
|
}
|
||||||
|
if len(config) > 0 {
|
||||||
|
d := json.NewDecoder(strings.NewReader(string(config)))
|
||||||
|
d.DisallowUnknownFields()
|
||||||
|
if err = d.Decode(&m.config); err != nil {
|
||||||
|
return fmt.Errorf("%s Init(): Error decoding JSON config: %w", m.name, err)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
m.meta = map[string]string{
|
||||||
|
"source": m.name,
|
||||||
|
"group": "NvidiaGPM",
|
||||||
|
}
|
||||||
|
|
||||||
|
// Initialize NVIDIA Management Library (NVML)
|
||||||
|
ret := nvml.Init()
|
||||||
|
|
||||||
|
// Error: NVML library not found
|
||||||
|
// (nvml.ErrorString can not be used in this case)
|
||||||
|
if ret == nvml.ERROR_LIBRARY_NOT_FOUND {
|
||||||
|
return fmt.Errorf("%s Init(): NVML library not found", m.name)
|
||||||
|
}
|
||||||
|
if ret != nvml.SUCCESS {
|
||||||
|
err = errors.New(nvml.ErrorString(ret))
|
||||||
|
return fmt.Errorf("%s Init(): Unable to initialize NVML: %w", m.name, err)
|
||||||
|
}
|
||||||
|
|
||||||
|
// Number of NVIDIA GPUs
|
||||||
|
num_gpus, ret := nvml.DeviceGetCount()
|
||||||
|
if ret != nvml.SUCCESS {
|
||||||
|
err = errors.New(nvml.ErrorString(ret))
|
||||||
|
return fmt.Errorf("%s Init(): Unable to get device count: %w", m.name, err)
|
||||||
|
}
|
||||||
|
|
||||||
|
// For all GPUs
|
||||||
|
m.gpus = make([]NvidiaGPMCollectorDevice, 0, num_gpus)
|
||||||
|
for i := range num_gpus {
|
||||||
|
|
||||||
|
// Skip excluded devices by ID
|
||||||
|
str_i := strconv.Itoa(i)
|
||||||
|
if slices.Contains(m.config.ExcludeDevices, str_i) {
|
||||||
|
cclog.ComponentDebugf(m.name, "Skipping excluded device %s", str_i)
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
|
||||||
|
// Get device handle
|
||||||
|
device, ret := nvml.DeviceGetHandleByIndex(i)
|
||||||
|
if ret != nvml.SUCCESS {
|
||||||
|
err = errors.New(nvml.ErrorString(ret))
|
||||||
|
cclog.ComponentErrorf(m.name, "Unable to get device at index %d: %s", i, err.Error())
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
|
||||||
|
supportInfo, ret := nvml.GpmQueryDeviceSupport(device)
|
||||||
|
if ret != nvml.SUCCESS {
|
||||||
|
err = errors.New(nvml.ErrorString(ret))
|
||||||
|
cclog.ComponentErrorf(m.name, "Unable to query GPM support for device at index %d: %s", i, err.Error())
|
||||||
|
continue
|
||||||
|
} else {
|
||||||
|
if supportInfo.IsSupportedDevice == uint32(nvml.FEATURE_DISABLED) {
|
||||||
|
cclog.ComponentErrorf(m.name, "Device at index %d does not support GPM metrics", i)
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
stream, ret := nvml.GpmQueryIfStreamingEnabled(device)
|
||||||
|
if ret != nvml.SUCCESS {
|
||||||
|
err = errors.New(nvml.ErrorString(ret))
|
||||||
|
cclog.ComponentErrorf(m.name, "Unable to query GPM streaming for device at index %d: %s", i, err.Error())
|
||||||
|
continue
|
||||||
|
} else {
|
||||||
|
if stream == uint32(nvml.FEATURE_DISABLED) {
|
||||||
|
ret = nvml.GpmSetStreamingEnabled(device, uint32(nvml.FEATURE_ENABLED))
|
||||||
|
if ret != nvml.SUCCESS {
|
||||||
|
err = errors.New(nvml.ErrorString(ret))
|
||||||
|
cclog.ComponentErrorf(m.name, "Unable to set streaming mode for device at index %d: %s", i, err.Error())
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Get device's PCI info
|
||||||
|
pciInfo, ret := nvml.DeviceGetPciInfo(device)
|
||||||
|
if ret != nvml.SUCCESS {
|
||||||
|
err = errors.New(nvml.ErrorString(ret))
|
||||||
|
cclog.ComponentErrorf(m.name, "Unable to get PCI info for device at index %d: %s", i, err.Error())
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
// Create PCI ID in the common format used by the NVML.
|
||||||
|
pci_id := fmt.Sprintf(
|
||||||
|
nvml.DEVICE_PCI_BUS_ID_FMT,
|
||||||
|
pciInfo.Domain,
|
||||||
|
pciInfo.Bus,
|
||||||
|
pciInfo.Device)
|
||||||
|
|
||||||
|
// Skip excluded devices specified by PCI ID
|
||||||
|
if slices.Contains(m.config.ExcludeDevices, pci_id) {
|
||||||
|
cclog.ComponentDebugf(m.name, "Skipping excluded device %s", pci_id)
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
ss, nvmlErr := nvml.GpmSampleAlloc()
|
||||||
|
if nvmlErr != nvml.SUCCESS {
|
||||||
|
err = errors.New(nvml.ErrorString(ret))
|
||||||
|
cclog.ComponentErrorf(m.name, "Failed to allocate GPM sample for device %d: %s", i, err.Error())
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
es, nvmlErr := nvml.GpmSampleAlloc()
|
||||||
|
if nvmlErr != nvml.SUCCESS {
|
||||||
|
err = errors.New(nvml.ErrorString(ret))
|
||||||
|
cclog.ComponentErrorf(m.name, "Failed to allocate GPM sample for device %d: %s", i, err.Error())
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
|
||||||
|
// Select which value to use as 'type-id'.
|
||||||
|
// The PCI ID is commonly required in SLURM environments because the
|
||||||
|
// numberic IDs used by SLURM and the ones used by NVML might differ
|
||||||
|
// depending on the job type. The PCI ID is more reliable but is commonly
|
||||||
|
// not recorded for a job, so it must be added manually in prologue or epilogue
|
||||||
|
// e.g. to the comment field
|
||||||
|
tid := str_i
|
||||||
|
if m.config.UsePciInfoAsTypeId {
|
||||||
|
tid = pci_id
|
||||||
|
}
|
||||||
|
|
||||||
|
// Now we got all infos together, populate the device list
|
||||||
|
g := NvidiaGPMCollectorDevice{}
|
||||||
|
|
||||||
|
// Add device handle
|
||||||
|
g.device = device
|
||||||
|
|
||||||
|
// Add tags
|
||||||
|
g.tags = map[string]string{
|
||||||
|
"type": "accelerator",
|
||||||
|
"type-id": tid,
|
||||||
|
}
|
||||||
|
|
||||||
|
// Add PCI info as tag if not already used as 'type-id'
|
||||||
|
if m.config.AddPciInfoTag && !m.config.UsePciInfoAsTypeId {
|
||||||
|
g.tags["pci_identifier"] = pci_id
|
||||||
|
}
|
||||||
|
|
||||||
|
g.meta = map[string]string{
|
||||||
|
"source": m.name,
|
||||||
|
"group": "Nvidia",
|
||||||
|
}
|
||||||
|
|
||||||
|
if m.config.AddBoardNumberMeta {
|
||||||
|
board, ret := nvml.DeviceGetBoardPartNumber(device)
|
||||||
|
if ret != nvml.SUCCESS {
|
||||||
|
err = errors.New(nvml.ErrorString(ret))
|
||||||
|
cclog.ComponentError(m.name, "Unable to get boart part number for device at index", i, ":", err.Error())
|
||||||
|
} else {
|
||||||
|
g.meta["board_number"] = board
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if m.config.AddSerialMeta {
|
||||||
|
serial, ret := nvml.DeviceGetSerial(device)
|
||||||
|
if ret != nvml.SUCCESS {
|
||||||
|
err = errors.New(nvml.ErrorString(ret))
|
||||||
|
cclog.ComponentError(m.name, "Unable to get serial number for device at index", i, ":", err.Error())
|
||||||
|
} else {
|
||||||
|
g.meta["serial"] = serial
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if m.config.AddUuidMeta {
|
||||||
|
uuid, ret := nvml.DeviceGetUUID(device)
|
||||||
|
if ret != nvml.SUCCESS {
|
||||||
|
err = errors.New(nvml.ErrorString(ret))
|
||||||
|
cclog.ComponentError(m.name, "Unable to get UUID for device at index", i, ":", err.Error())
|
||||||
|
} else {
|
||||||
|
g.meta["uuid"] = uuid
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
g.measurement.Sample1 = ss
|
||||||
|
g.measurement.Sample2 = es
|
||||||
|
g.measurement.Version = nvml.GPM_METRICS_GET_VERSION
|
||||||
|
g.metricsLookup = make(map[int]NvidiaGPMMetricDef)
|
||||||
|
metIdx := 0
|
||||||
|
for _, inmetric := range m.config.Metrics {
|
||||||
|
for _, defmetric := range NvidiaGPMMetrics {
|
||||||
|
if inmetric == defmetric.outname || inmetric == defmetric.name {
|
||||||
|
g.measurement.Metrics[metIdx] = nvml.GpmMetric{
|
||||||
|
MetricId: uint32(defmetric.id),
|
||||||
|
}
|
||||||
|
g.metricsLookup[metIdx] = defmetric
|
||||||
|
metIdx += 1
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
g.measurement.NumMetrics = uint32(metIdx)
|
||||||
|
m.gpus = append(m.gpus, g)
|
||||||
|
}
|
||||||
|
cclog.ComponentDebugf(m.name, "Found %d Nvidia GPUs with GPM support", len(m.gpus))
|
||||||
|
m.num_gpus = len(m.gpus)
|
||||||
|
m.init = true
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
|
||||||
|
func (m *NvidiaGPMCollector) Read(interval time.Duration, output chan lp.CCMessage) {
|
||||||
|
var err error
|
||||||
|
if !m.init {
|
||||||
|
return
|
||||||
|
}
|
||||||
|
for i, gpu := range m.gpus {
|
||||||
|
gpu.startTime = time.Now()
|
||||||
|
nvmlErr := gpu.measurement.Sample1.Get(gpu.device)
|
||||||
|
if nvmlErr != nvml.SUCCESS {
|
||||||
|
err = errors.New(nvml.ErrorString(nvmlErr))
|
||||||
|
cclog.ComponentError(m.name, "Unable to get start GPM sample for device at index", i, ":", err.Error())
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
}
|
||||||
|
time.Sleep(interval)
|
||||||
|
|
||||||
|
for i, gpu := range m.gpus {
|
||||||
|
gpu.endTime = time.Now()
|
||||||
|
nvmlErr := gpu.measurement.Sample2.Get(gpu.device)
|
||||||
|
if nvmlErr != nvml.SUCCESS {
|
||||||
|
err = errors.New(nvml.ErrorString(nvmlErr))
|
||||||
|
cclog.ComponentError(m.name, "Unable to get stop GPM sample for device at index", i, ":", err.Error())
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
for i, gpu := range m.gpus {
|
||||||
|
nvmlErr := nvml.GpmMetricsGet(&gpu.measurement)
|
||||||
|
if nvmlErr != nvml.SUCCESS {
|
||||||
|
err = errors.New(nvml.ErrorString(nvmlErr))
|
||||||
|
cclog.ComponentError(m.name, "Unable to get evaluate GPM sample for device at index", i, ":", err.Error())
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
for idx, metricDef := range gpu.metricsLookup {
|
||||||
|
y, err := lp.NewMetric(metricDef.outname, gpu.tags, gpu.meta, gpu.measurement.Metrics[idx].Value, time.Now())
|
||||||
|
if err == nil {
|
||||||
|
y.AddMeta("unit", metricDef.unit)
|
||||||
|
output <- y
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
func (m *NvidiaGPMCollector) Close() {
|
||||||
|
if m.init {
|
||||||
|
for i, gpu := range m.gpus {
|
||||||
|
ret := gpu.measurement.Sample1.Free()
|
||||||
|
if ret != nvml.SUCCESS {
|
||||||
|
err := errors.New(nvml.ErrorString(ret))
|
||||||
|
cclog.ComponentErrorf(m.name, "Unable to free start sample for device at index %d: %s", i, err.Error())
|
||||||
|
}
|
||||||
|
ret = gpu.measurement.Sample2.Free()
|
||||||
|
if ret != nvml.SUCCESS {
|
||||||
|
err := errors.New(nvml.ErrorString(ret))
|
||||||
|
cclog.ComponentErrorf(m.name, "Unable to free stop sample for device at index %d: %s", i, err.Error())
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if ret := nvml.Shutdown(); ret != nvml.SUCCESS {
|
||||||
|
cclog.ComponentError(m.name, "nvml.Shutdown() not successful")
|
||||||
|
}
|
||||||
|
m.init = false
|
||||||
|
}
|
||||||
|
}
|
||||||
54
collectors/nvidiaGPMMetric.md
Normal file
54
collectors/nvidiaGPMMetric.md
Normal file
@@ -0,0 +1,54 @@
|
|||||||
|
<!--
|
||||||
|
---
|
||||||
|
title: "Nvidia NVML GPM metric collector"
|
||||||
|
description: Collect metrics for Nvidia GPUs using the NVML GPM interface
|
||||||
|
categories: [cc-metric-collector]
|
||||||
|
tags: ['Admin']
|
||||||
|
weight: 2
|
||||||
|
hugo_path: docs/reference/cc-metric-collector/collectors/nvidiaGPM.md
|
||||||
|
---
|
||||||
|
-->
|
||||||
|
|
||||||
|
## `nvidia_gpm` collector
|
||||||
|
|
||||||
|
```json
  "nvidia_gpm": {
    "metrics": [
      "nv_gpm_sm_util",
      "nv_gpm_dram_bw_util"
    ],
    "exclude_devices": [
      "0", "1", "00000000:FF:01.0"
    ],

    "process_mig_devices": false,
    "use_pci_info_as_type_id": true,
    "add_pci_info_tag": false,
    "add_uuid_meta": false,
    "add_board_number_meta": false,
    "add_serial_meta": false,
    "use_uuid_for_mig_device": false,
    "use_slice_for_mig_device": false
  }
```
|
||||||
|
|
||||||
|
The `nvidia_gpm` collector can be configured to leave out specific devices with the `exclude_devices` option. It takes IDs as supplied to the NVML with `nvmlDeviceGetHandleByIndex()` or the PCI address in NVML format (`%08X:%02X:%02X.0`). Commonly only the physical GPUs are monitored. If MIG devices should be analyzed as well, set `process_mig_devices` (adds `stype=mig,stype-id=<mig_index>`). With the options `use_uuid_for_mig_device` and `use_slice_for_mig_device`, the `<mig_index>` can be replaced with the UUID (e.g. `MIG-6a9f7cc8-6d5b-5ce0-92de-750edc4d8849`) or the MIG slice name (e.g. `1g.5gb`). Note: the MIG-related options are accepted in the configuration, but the current collector implementation does not evaluate them and only monitors the physical GPUs.
|
||||||
|
|
||||||
|
The metrics sent by the `nvidia_gpm` collector use `accelerator` as `type` tag. For the `type-id`, it uses the device handle index by default. With the `use_pci_info_as_type_id` option, the PCI ID is used instead. If both values should be added as tags, activate the `add_pci_info_tag` option. It uses the device handle index as `type-id` and adds the PCI ID as separate `pci_identifier` tag.
|
||||||
|
|
||||||
|
Optionally, it is possible to add the UUID, the board part number and the serial number to the meta information. They are not sent to the sinks (if not configured otherwise).
|
||||||
|
|
||||||
|
|
||||||
|
Available Metrics:
|
||||||
|
* `nv_gpm_graphics_util`
|
||||||
|
* `nv_gpm_sm_util`
|
||||||
|
* `nv_gpm_sm_occupancy`
|
||||||
|
* `nv_gpm_integer_util`
|
||||||
|
* `nv_gpm_any_tensor_util`
|
||||||
|
* `nv_gpm_dfma_tensor_util`
|
||||||
|
* `nv_gpm_hmma_tensor_util`
|
||||||
|
* `nv_gpm_imma_tensor_util`
|
||||||
|
* `nv_gpm_dram_bw_util`
|
||||||
|
* `nv_gpm_fp64_util`
|
||||||
|
* `nv_gpm_fp32_util`
|
||||||
|
* `nv_gpm_fp16_util`
|
||||||
@@ -242,12 +242,7 @@ func (m *RAPLCollector) Read(interval time.Duration, output chan lp.CCMessage) {
|
|||||||
timeDiff := energyTimestamp.Sub(p.energyTimestamp)
|
timeDiff := energyTimestamp.Sub(p.energyTimestamp)
|
||||||
averagePower := float64(energyDiff) / float64(timeDiff.Microseconds())
|
averagePower := float64(energyDiff) / float64(timeDiff.Microseconds())
|
||||||
|
|
||||||
y, err := lp.NewMessage(
|
y, err := lp.NewMetric("rapl_average_power", p.tags, m.meta, averagePower, energyTimestamp)
|
||||||
"rapl_average_power",
|
|
||||||
p.tags,
|
|
||||||
m.meta,
|
|
||||||
map[string]any{"value": averagePower},
|
|
||||||
energyTimestamp)
|
|
||||||
if err == nil {
|
if err == nil {
|
||||||
output <- y
|
output <- y
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -158,128 +158,110 @@ func (m *RocmSmiCollector) Read(interval time.Duration, output chan lp.CCMessage
|
|||||||
|
|
||||||
if !dev.excludeMetrics["rocm_gfx_util"] {
|
if !dev.excludeMetrics["rocm_gfx_util"] {
|
||||||
value := metrics.Average_gfx_activity
|
value := metrics.Average_gfx_activity
|
||||||
y, err := lp.NewMessage("rocm_gfx_util", dev.tags, dev.meta, map[string]any{"value": value}, timestamp)
|
if y, err := lp.NewMetric("rocm_gfx_util", dev.tags, dev.meta, value, timestamp); err == nil {
|
||||||
if err == nil {
|
|
||||||
output <- y
|
output <- y
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
if !dev.excludeMetrics["rocm_umc_util"] {
|
if !dev.excludeMetrics["rocm_umc_util"] {
|
||||||
value := metrics.Average_umc_activity
|
value := metrics.Average_umc_activity
|
||||||
y, err := lp.NewMessage("rocm_umc_util", dev.tags, dev.meta, map[string]any{"value": value}, timestamp)
|
if y, err := lp.NewMetric("rocm_umc_util", dev.tags, dev.meta, value, timestamp); err == nil {
|
||||||
if err == nil {
|
|
||||||
output <- y
|
output <- y
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
if !dev.excludeMetrics["rocm_mm_util"] {
|
if !dev.excludeMetrics["rocm_mm_util"] {
|
||||||
value := metrics.Average_mm_activity
|
value := metrics.Average_mm_activity
|
||||||
y, err := lp.NewMessage("rocm_mm_util", dev.tags, dev.meta, map[string]any{"value": value}, timestamp)
|
if y, err := lp.NewMetric("rocm_mm_util", dev.tags, dev.meta, value, timestamp); err == nil {
|
||||||
if err == nil {
|
|
||||||
output <- y
|
output <- y
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
if !dev.excludeMetrics["rocm_avg_power"] {
|
if !dev.excludeMetrics["rocm_avg_power"] {
|
||||||
value := metrics.Average_socket_power
|
value := metrics.Average_socket_power
|
||||||
y, err := lp.NewMessage("rocm_avg_power", dev.tags, dev.meta, map[string]any{"value": value}, timestamp)
|
if y, err := lp.NewMetric("rocm_avg_power", dev.tags, dev.meta, value, timestamp); err == nil {
|
||||||
if err == nil {
|
|
||||||
output <- y
|
output <- y
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
if !dev.excludeMetrics["rocm_temp_mem"] {
|
if !dev.excludeMetrics["rocm_temp_mem"] {
|
||||||
value := metrics.Temperature_mem
|
value := metrics.Temperature_mem
|
||||||
y, err := lp.NewMessage("rocm_temp_mem", dev.tags, dev.meta, map[string]any{"value": value}, timestamp)
|
if y, err := lp.NewMetric("rocm_temp_mem", dev.tags, dev.meta, value, timestamp); err == nil {
|
||||||
if err == nil {
|
|
||||||
output <- y
|
output <- y
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
if !dev.excludeMetrics["rocm_temp_hotspot"] {
|
if !dev.excludeMetrics["rocm_temp_hotspot"] {
|
||||||
value := metrics.Temperature_hotspot
|
value := metrics.Temperature_hotspot
|
||||||
y, err := lp.NewMessage("rocm_temp_hotspot", dev.tags, dev.meta, map[string]any{"value": value}, timestamp)
|
if y, err := lp.NewMetric("rocm_temp_hotspot", dev.tags, dev.meta, value, timestamp); err == nil {
|
||||||
if err == nil {
|
|
||||||
output <- y
|
output <- y
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
if !dev.excludeMetrics["rocm_temp_edge"] {
|
if !dev.excludeMetrics["rocm_temp_edge"] {
|
||||||
value := metrics.Temperature_edge
|
value := metrics.Temperature_edge
|
||||||
y, err := lp.NewMessage("rocm_temp_edge", dev.tags, dev.meta, map[string]any{"value": value}, timestamp)
|
if y, err := lp.NewMetric("rocm_temp_edge", dev.tags, dev.meta, value, timestamp); err == nil {
|
||||||
if err == nil {
|
|
||||||
output <- y
|
output <- y
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
if !dev.excludeMetrics["rocm_temp_vrgfx"] {
|
if !dev.excludeMetrics["rocm_temp_vrgfx"] {
|
||||||
value := metrics.Temperature_vrgfx
|
value := metrics.Temperature_vrgfx
|
||||||
y, err := lp.NewMessage("rocm_temp_vrgfx", dev.tags, dev.meta, map[string]any{"value": value}, timestamp)
|
if y, err := lp.NewMetric("rocm_temp_vrgfx", dev.tags, dev.meta, value, timestamp); err == nil {
|
||||||
if err == nil {
|
|
||||||
output <- y
|
output <- y
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
if !dev.excludeMetrics["rocm_temp_vrsoc"] {
|
if !dev.excludeMetrics["rocm_temp_vrsoc"] {
|
||||||
value := metrics.Temperature_vrsoc
|
value := metrics.Temperature_vrsoc
|
||||||
y, err := lp.NewMessage("rocm_temp_vrsoc", dev.tags, dev.meta, map[string]any{"value": value}, timestamp)
|
if y, err := lp.NewMetric("rocm_temp_vrsoc", dev.tags, dev.meta, value, timestamp); err == nil {
|
||||||
if err == nil {
|
|
||||||
output <- y
|
output <- y
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
if !dev.excludeMetrics["rocm_temp_vrmem"] {
|
if !dev.excludeMetrics["rocm_temp_vrmem"] {
|
||||||
value := metrics.Temperature_vrmem
|
value := metrics.Temperature_vrmem
|
||||||
y, err := lp.NewMessage("rocm_temp_vrmem", dev.tags, dev.meta, map[string]any{"value": value}, timestamp)
|
if y, err := lp.NewMetric("rocm_temp_vrmem", dev.tags, dev.meta, value, timestamp); err == nil {
|
||||||
if err == nil {
|
|
||||||
output <- y
|
output <- y
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
if !dev.excludeMetrics["rocm_gfx_clock"] {
|
if !dev.excludeMetrics["rocm_gfx_clock"] {
|
||||||
value := metrics.Average_gfxclk_frequency
|
value := metrics.Average_gfxclk_frequency
|
||||||
y, err := lp.NewMessage("rocm_gfx_clock", dev.tags, dev.meta, map[string]any{"value": value}, timestamp)
|
if y, err := lp.NewMetric("rocm_gfx_clock", dev.tags, dev.meta, value, timestamp); err == nil {
|
||||||
if err == nil {
|
|
||||||
output <- y
|
output <- y
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
if !dev.excludeMetrics["rocm_soc_clock"] {
|
if !dev.excludeMetrics["rocm_soc_clock"] {
|
||||||
value := metrics.Average_socclk_frequency
|
value := metrics.Average_socclk_frequency
|
||||||
y, err := lp.NewMessage("rocm_soc_clock", dev.tags, dev.meta, map[string]any{"value": value}, timestamp)
|
if y, err := lp.NewMetric("rocm_soc_clock", dev.tags, dev.meta, value, timestamp); err == nil {
|
||||||
if err == nil {
|
|
||||||
output <- y
|
output <- y
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
if !dev.excludeMetrics["rocm_u_clock"] {
|
if !dev.excludeMetrics["rocm_u_clock"] {
|
||||||
value := metrics.Average_uclk_frequency
|
value := metrics.Average_uclk_frequency
|
||||||
y, err := lp.NewMessage("rocm_u_clock", dev.tags, dev.meta, map[string]any{"value": value}, timestamp)
|
if y, err := lp.NewMetric("rocm_u_clock", dev.tags, dev.meta, value, timestamp); err == nil {
|
||||||
if err == nil {
|
|
||||||
output <- y
|
output <- y
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
if !dev.excludeMetrics["rocm_v0_clock"] {
|
if !dev.excludeMetrics["rocm_v0_clock"] {
|
||||||
value := metrics.Average_vclk0_frequency
|
value := metrics.Average_vclk0_frequency
|
||||||
y, err := lp.NewMessage("rocm_v0_clock", dev.tags, dev.meta, map[string]any{"value": value}, timestamp)
|
if y, err := lp.NewMetric("rocm_v0_clock", dev.tags, dev.meta, value, timestamp); err == nil {
|
||||||
if err == nil {
|
|
||||||
output <- y
|
output <- y
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
if !dev.excludeMetrics["rocm_v1_clock"] {
|
if !dev.excludeMetrics["rocm_v1_clock"] {
|
||||||
value := metrics.Average_vclk1_frequency
|
value := metrics.Average_vclk1_frequency
|
||||||
y, err := lp.NewMessage("rocm_v1_clock", dev.tags, dev.meta, map[string]any{"value": value}, timestamp)
|
if y, err := lp.NewMetric("rocm_v1_clock", dev.tags, dev.meta, value, timestamp); err == nil {
|
||||||
if err == nil {
|
|
||||||
output <- y
|
output <- y
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
if !dev.excludeMetrics["rocm_d0_clock"] {
|
if !dev.excludeMetrics["rocm_d0_clock"] {
|
||||||
value := metrics.Average_dclk0_frequency
|
value := metrics.Average_dclk0_frequency
|
||||||
y, err := lp.NewMessage("rocm_d0_clock", dev.tags, dev.meta, map[string]any{"value": value}, timestamp)
|
if y, err := lp.NewMetric("rocm_d0_clock", dev.tags, dev.meta, value, timestamp); err == nil {
|
||||||
if err == nil {
|
|
||||||
output <- y
|
output <- y
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
if !dev.excludeMetrics["rocm_d1_clock"] {
|
if !dev.excludeMetrics["rocm_d1_clock"] {
|
||||||
value := metrics.Average_dclk1_frequency
|
value := metrics.Average_dclk1_frequency
|
||||||
y, err := lp.NewMessage("rocm_d1_clock", dev.tags, dev.meta, map[string]any{"value": value}, timestamp)
|
if y, err := lp.NewMetric("rocm_d1_clock", dev.tags, dev.meta, value, timestamp); err == nil {
|
||||||
if err == nil {
|
|
||||||
output <- y
|
output <- y
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
if !dev.excludeMetrics["rocm_temp_hbm"] {
|
if !dev.excludeMetrics["rocm_temp_hbm"] {
|
||||||
for i := range rocm_smi.NUM_HBM_INSTANCES {
|
for i := range rocm_smi.NUM_HBM_INSTANCES {
|
||||||
value := metrics.Temperature_hbm[i]
|
value := metrics.Temperature_hbm[i]
|
||||||
y, err := lp.NewMessage("rocm_temp_hbm", dev.tags, dev.meta, map[string]any{"value": value}, timestamp)
|
if y, err := lp.NewMetric("rocm_temp_hbm", dev.tags, dev.meta, value, timestamp); err == nil {
|
||||||
if err == nil {
|
|
||||||
y.AddTag("stype", "device")
|
y.AddTag("stype", "device")
|
||||||
y.AddTag("stype-id", strconv.Itoa(i))
|
y.AddTag("stype-id", strconv.Itoa(i))
|
||||||
output <- y
|
output <- y
|
||||||
|
|||||||
@@ -201,26 +201,14 @@ func (m *TempCollector) Read(interval time.Duration, output chan lp.CCMessage) {
|
|||||||
continue
|
continue
|
||||||
}
|
}
|
||||||
x /= 1000
|
x /= 1000
|
||||||
y, err := lp.NewMessage(
|
y, err := lp.NewMetric(sensor.metricName, sensor.tags, m.meta, x, time.Now())
|
||||||
sensor.metricName,
|
|
||||||
sensor.tags,
|
|
||||||
m.meta,
|
|
||||||
map[string]any{"value": x},
|
|
||||||
time.Now(),
|
|
||||||
)
|
|
||||||
if err == nil {
|
if err == nil {
|
||||||
output <- y
|
output <- y
|
||||||
}
|
}
|
||||||
|
|
||||||
// max temperature
|
// max temperature
|
||||||
if m.config.ReportMaxTemp && sensor.maxTemp != 0 {
|
if m.config.ReportMaxTemp && sensor.maxTemp != 0 {
|
||||||
y, err := lp.NewMessage(
|
y, err := lp.NewMetric(sensor.maxTempName, sensor.tags, m.meta, sensor.maxTemp, time.Now())
|
||||||
sensor.maxTempName,
|
|
||||||
sensor.tags,
|
|
||||||
m.meta,
|
|
||||||
map[string]any{"value": sensor.maxTemp},
|
|
||||||
time.Now(),
|
|
||||||
)
|
|
||||||
if err == nil {
|
if err == nil {
|
||||||
output <- y
|
output <- y
|
||||||
}
|
}
|
||||||
@@ -228,13 +216,7 @@ func (m *TempCollector) Read(interval time.Duration, output chan lp.CCMessage) {
|
|||||||
|
|
||||||
// critical temperature
|
// critical temperature
|
||||||
if m.config.ReportCriticalTemp && sensor.critTemp != 0 {
|
if m.config.ReportCriticalTemp && sensor.critTemp != 0 {
|
||||||
y, err := lp.NewMessage(
|
y, err := lp.NewMetric(sensor.critTempName, sensor.tags, m.meta, sensor.critTemp, time.Now())
|
||||||
sensor.critTempName,
|
|
||||||
sensor.tags,
|
|
||||||
m.meta,
|
|
||||||
map[string]any{"value": sensor.critTemp},
|
|
||||||
time.Now(),
|
|
||||||
)
|
|
||||||
if err == nil {
|
if err == nil {
|
||||||
output <- y
|
output <- y
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -86,15 +86,7 @@ func (m *TopProcsCollector) Read(interval time.Duration, output chan lp.CCMessag
|
|||||||
lines := strings.Split(string(stdout), "\n")
|
lines := strings.Split(string(stdout), "\n")
|
||||||
for i := 1; i < m.config.Num_procs+1; i++ {
|
for i := 1; i < m.config.Num_procs+1; i++ {
|
||||||
name := fmt.Sprintf("topproc%d", i)
|
name := fmt.Sprintf("topproc%d", i)
|
||||||
y, err := lp.NewMessage(
|
if y, err := lp.NewMetric(name, m.tags, m.meta, lines[i], time.Now()); err == nil {
|
||||||
name,
|
|
||||||
m.tags,
|
|
||||||
m.meta,
|
|
||||||
map[string]any{
|
|
||||||
"value": lines[i],
|
|
||||||
},
|
|
||||||
time.Now())
|
|
||||||
if err == nil {
|
|
||||||
output <- y
|
output <- y
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
8
go.mod
8
go.mod
@@ -7,10 +7,10 @@ require (
|
|||||||
github.com/ClusterCockpit/go-rocm-smi v0.4.0
|
github.com/ClusterCockpit/go-rocm-smi v0.4.0
|
||||||
github.com/NVIDIA/go-nvml v0.13.0-1
|
github.com/NVIDIA/go-nvml v0.13.0-1
|
||||||
github.com/PaesslerAG/gval v1.2.4
|
github.com/PaesslerAG/gval v1.2.4
|
||||||
github.com/fsnotify/fsnotify v1.10.0
|
github.com/fsnotify/fsnotify v1.10.1
|
||||||
github.com/tklauser/go-sysconf v0.3.16
|
github.com/tklauser/go-sysconf v0.4.0
|
||||||
golang.design/x/thread v0.0.0-20210122121316-335e9adffdf1
|
golang.design/x/thread v0.0.0-20210122121316-335e9adffdf1
|
||||||
golang.org/x/sys v0.43.0
|
golang.org/x/sys v0.45.0
|
||||||
)
|
)
|
||||||
|
|
||||||
require (
|
require (
|
||||||
@@ -38,7 +38,7 @@ require (
|
|||||||
github.com/santhosh-tekuri/jsonschema/v5 v5.3.1 // indirect
|
github.com/santhosh-tekuri/jsonschema/v5 v5.3.1 // indirect
|
||||||
github.com/shopspring/decimal v1.4.0 // indirect
|
github.com/shopspring/decimal v1.4.0 // indirect
|
||||||
github.com/stmcginnis/gofish v0.21.6 // indirect
|
github.com/stmcginnis/gofish v0.21.6 // indirect
|
||||||
github.com/tklauser/numcpus v0.11.0 // indirect
|
github.com/tklauser/numcpus v0.12.0 // indirect
|
||||||
go.yaml.in/yaml/v2 v2.4.4 // indirect
|
go.yaml.in/yaml/v2 v2.4.4 // indirect
|
||||||
golang.org/x/crypto v0.50.0 // indirect
|
golang.org/x/crypto v0.50.0 // indirect
|
||||||
golang.org/x/net v0.53.0 // indirect
|
golang.org/x/net v0.53.0 // indirect
|
||||||
|
|||||||
16
go.sum
16
go.sum
@@ -53,8 +53,8 @@ github.com/expr-lang/expr v1.17.8 h1:W1loDTT+0PQf5YteHSTpju2qfUfNoBt4yw9+wOEU9VM
|
|||||||
github.com/expr-lang/expr v1.17.8/go.mod h1:8/vRC7+7HBzESEqt5kKpYXxrxkr31SaO8r40VO/1IT4=
|
github.com/expr-lang/expr v1.17.8/go.mod h1:8/vRC7+7HBzESEqt5kKpYXxrxkr31SaO8r40VO/1IT4=
|
||||||
github.com/frankban/quicktest v1.13.0 h1:yNZif1OkDfNoDfb9zZa9aXIpejNR4F23Wely0c+Qdqk=
|
github.com/frankban/quicktest v1.13.0 h1:yNZif1OkDfNoDfb9zZa9aXIpejNR4F23Wely0c+Qdqk=
|
||||||
github.com/frankban/quicktest v1.13.0/go.mod h1:qLE0fzW0VuyUAJgPU19zByoIr0HtCHN/r/VLSOOIySU=
|
github.com/frankban/quicktest v1.13.0/go.mod h1:qLE0fzW0VuyUAJgPU19zByoIr0HtCHN/r/VLSOOIySU=
|
||||||
github.com/fsnotify/fsnotify v1.10.0 h1:Xx/5Ydg9CeBDX/wi4VJqStNtohYjitZhhlHt4h3St1M=
|
github.com/fsnotify/fsnotify v1.10.1 h1:b0/UzAf9yR5rhf3RPm9gf3ehBPpf0oZKIjtpKrx59Ho=
|
||||||
github.com/fsnotify/fsnotify v1.10.0/go.mod h1:TLheqan6HD6GBK6PrDWyDPBaEV8LspOxvPSjC+bVfgo=
|
github.com/fsnotify/fsnotify v1.10.1/go.mod h1:TLheqan6HD6GBK6PrDWyDPBaEV8LspOxvPSjC+bVfgo=
|
||||||
github.com/go-ole/go-ole v1.3.0 h1:Dt6ye7+vXGIKZ7Xtk4s6/xVdGDQynvom7xCFEdWr6uE=
|
github.com/go-ole/go-ole v1.3.0 h1:Dt6ye7+vXGIKZ7Xtk4s6/xVdGDQynvom7xCFEdWr6uE=
|
||||||
github.com/go-ole/go-ole v1.3.0/go.mod h1:5LS6F96DhAwUc7C+1HLexzMXY1xGRSryjyPPKW6zv78=
|
github.com/go-ole/go-ole v1.3.0/go.mod h1:5LS6F96DhAwUc7C+1HLexzMXY1xGRSryjyPPKW6zv78=
|
||||||
github.com/gogo/protobuf v1.3.2 h1:Ov1cvc58UF3b5XjBnZv7+opcTcQFZebYjWzi34vdm4Q=
|
github.com/gogo/protobuf v1.3.2 h1:Ov1cvc58UF3b5XjBnZv7+opcTcQFZebYjWzi34vdm4Q=
|
||||||
@@ -163,10 +163,10 @@ github.com/stretchr/testify v1.11.1 h1:7s2iGBzp5EwR7/aIZr8ao5+dra3wiQyKjjFuvgVKu
|
|||||||
github.com/stretchr/testify v1.11.1/go.mod h1:wZwfW3scLgRK+23gO65QZefKpKQRnfz6sD981Nm4B6U=
|
github.com/stretchr/testify v1.11.1/go.mod h1:wZwfW3scLgRK+23gO65QZefKpKQRnfz6sD981Nm4B6U=
|
||||||
github.com/testcontainers/testcontainers-go v0.26.0 h1:uqcYdoOHBy1ca7gKODfBd9uTHVK3a7UL848z09MVZ0c=
|
github.com/testcontainers/testcontainers-go v0.26.0 h1:uqcYdoOHBy1ca7gKODfBd9uTHVK3a7UL848z09MVZ0c=
|
||||||
github.com/testcontainers/testcontainers-go v0.26.0/go.mod h1:ICriE9bLX5CLxL9OFQ2N+2N+f+803LNJ1utJb1+Inx0=
|
github.com/testcontainers/testcontainers-go v0.26.0/go.mod h1:ICriE9bLX5CLxL9OFQ2N+2N+f+803LNJ1utJb1+Inx0=
|
||||||
github.com/tklauser/go-sysconf v0.3.16 h1:frioLaCQSsF5Cy1jgRBrzr6t502KIIwQ0MArYICU0nA=
|
github.com/tklauser/go-sysconf v0.4.0 h1:7H0uAN+7RkwWRaxhYXDLqa5V3LPrJeV8wmD9dRUgPQU=
|
||||||
github.com/tklauser/go-sysconf v0.3.16/go.mod h1:/qNL9xxDhc7tx3HSRsLWNnuzbVfh3e7gh/BmM179nYI=
|
github.com/tklauser/go-sysconf v0.4.0/go.mod h1:8mTNWyog7H+MpKijp4VmKJAd2bbYQ2zuUwkYRbUArPI=
|
||||||
github.com/tklauser/numcpus v0.11.0 h1:nSTwhKH5e1dMNsCdVBukSZrURJRoHbSEQjdEbY+9RXw=
|
github.com/tklauser/numcpus v0.12.0 h1:NR85qdvHA9pFse3x3weVZ0r0ST8R6l5RHbZrlRaqob4=
|
||||||
github.com/tklauser/numcpus v0.11.0/go.mod h1:z+LwcLq54uWZTX0u/bGobaV34u6V7KNlTZejzM6/3MQ=
|
github.com/tklauser/numcpus v0.12.0/go.mod h1:ABHeXzJnr/qqwguhClkZKT1/8VABcYrsyUiUGobwWJg=
|
||||||
github.com/yusufpapurcu/wmi v1.2.3 h1:E1ctvB7uKFMOJw3fdOW32DwGE9I7t++CRUEMKvFoFiw=
|
github.com/yusufpapurcu/wmi v1.2.3 h1:E1ctvB7uKFMOJw3fdOW32DwGE9I7t++CRUEMKvFoFiw=
|
||||||
github.com/yusufpapurcu/wmi v1.2.3/go.mod h1:SBZ9tNy3G9/m5Oi98Zks0QjeHVDvuK0qfxQmPyzfmi0=
|
github.com/yusufpapurcu/wmi v1.2.3/go.mod h1:SBZ9tNy3G9/m5Oi98Zks0QjeHVDvuK0qfxQmPyzfmi0=
|
||||||
go.uber.org/goleak v1.3.0 h1:2K3zAYmnTNqV73imy9J1T3WC+gmCePx2hEGkimedGto=
|
go.uber.org/goleak v1.3.0 h1:2K3zAYmnTNqV73imy9J1T3WC+gmCePx2hEGkimedGto=
|
||||||
@@ -184,8 +184,8 @@ golang.org/x/mod v0.13.0/go.mod h1:hTbmBsO62+eylJbnUtE2MGJUyE7QWk4xUqPFrRgJ+7c=
|
|||||||
golang.org/x/net v0.53.0 h1:d+qAbo5L0orcWAr0a9JweQpjXF19LMXJE8Ey7hwOdUA=
|
golang.org/x/net v0.53.0 h1:d+qAbo5L0orcWAr0a9JweQpjXF19LMXJE8Ey7hwOdUA=
|
||||||
golang.org/x/net v0.53.0/go.mod h1:JvMuJH7rrdiCfbeHoo3fCQU24Lf5JJwT9W3sJFulfgs=
|
golang.org/x/net v0.53.0/go.mod h1:JvMuJH7rrdiCfbeHoo3fCQU24Lf5JJwT9W3sJFulfgs=
|
||||||
golang.org/x/sys v0.0.0-20210122093101-04d7465088b8/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
|
golang.org/x/sys v0.0.0-20210122093101-04d7465088b8/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
|
||||||
golang.org/x/sys v0.43.0 h1:Rlag2XtaFTxp19wS8MXlJwTvoh8ArU6ezoyFsMyCTNI=
|
golang.org/x/sys v0.45.0 h1:dO4czNzziLiiXplLQgBCEpCvXQ3dnkn0SdaZSYdQ+FY=
|
||||||
golang.org/x/sys v0.43.0/go.mod h1:4GL1E5IUh+htKOUEOaiffhrAeqysfVGipDYzABqnCmw=
|
golang.org/x/sys v0.45.0/go.mod h1:4GL1E5IUh+htKOUEOaiffhrAeqysfVGipDYzABqnCmw=
|
||||||
golang.org/x/time v0.15.0 h1:bbrp8t3bGUeFOx08pvsMYRTCVSMk89u4tKbNOZbp88U=
|
golang.org/x/time v0.15.0 h1:bbrp8t3bGUeFOx08pvsMYRTCVSMk89u4tKbNOZbp88U=
|
||||||
golang.org/x/time v0.15.0/go.mod h1:Y4YMaQmXwGQZoFaVFk4YpCt4FLQMYKZe9oeV/f4MSno=
|
golang.org/x/time v0.15.0/go.mod h1:Y4YMaQmXwGQZoFaVFk4YpCt4FLQMYKZe9oeV/f4MSno=
|
||||||
golang.org/x/tools v0.14.0 h1:jvNa2pY0M4r62jkRQ6RwEZZyPcymeL9XZMLBbV7U2nc=
|
golang.org/x/tools v0.14.0 h1:jvNa2pY0M4r62jkRQ6RwEZZyPcymeL9XZMLBbV7U2nc=
|
||||||
|
|||||||
Reference in New Issue
Block a user