Compare commits

..

1 Commits

Author SHA1 Message Date
Thomas Roehl
0a1c7a9911 Add metric 'nv_util_eff' like nvtop 2026-05-06 18:57:58 +02:00
4 changed files with 39 additions and 17 deletions

View File

@@ -27,7 +27,6 @@ const CPUSTATFILE = `/proc/stat`
// CpustatCollectorConfig is the configuration of the cpustat collector
// (presumably the type behind CpustatCollector's config field; confirm against
// the full struct definition).
type CpustatCollectorConfig struct {
// ExcludeMetrics lists metric names to exclude; it is read from the
// optional JSON key "exclude_metrics".
ExcludeMetrics []string `json:"exclude_metrics,omitempty"`
// excludeNumCPUs is unexported and therefore not part of the JSON config.
// Init sets it to whether ExcludeMetrics contains "num_cpus"; Read only
// sends the "num_cpus" metric when it is false.
excludeNumCPUs bool
}
type CpustatCollector struct {
@@ -80,7 +79,6 @@ func (m *CpustatCollector) Init(config json.RawMessage) error {
m.matches[match] = index
}
}
m.config.excludeNumCPUs = slices.Contains(m.config.ExcludeMetrics, "num_cpus")
// Check input file
file, err := os.Open(CPUSTATFILE)
@@ -97,13 +95,11 @@ func (m *CpustatCollector) Init(config json.RawMessage) error {
line := scanner.Text()
linefields := strings.Fields(line)
if strings.Compare(linefields[0], "cpu") == 0 {
// Kernel system statistics for all CPUs
m.olddata["cpu"] = make(map[string]int64)
for k, v := range m.matches {
m.olddata["cpu"][k], _ = strconv.ParseInt(linefields[v], 0, 64)
}
} else if strings.HasPrefix(linefields[0], "cpu") && strings.Compare(linefields[0], "cpu") != 0 {
// Kernel system statistics per CPU
cpustr := strings.TrimLeft(linefields[0], "cpu")
cpu, _ := strconv.Atoi(cpustr)
m.cputags[linefields[0]] = map[string]string{
@@ -195,10 +191,9 @@ func (m *CpustatCollector) Read(interval time.Duration, output chan lp.CCMessage
}
}
if !m.config.excludeNumCPUs {
if num_cpus_metric, err := lp.NewMetric("num_cpus", m.nodetags, m.meta, num_cpus, now); err == nil {
output <- num_cpus_metric
}
num_cpus_metric, err := lp.NewMetric("num_cpus", m.nodetags, m.meta, num_cpus, now)
if err == nil {
output <- num_cpus_metric
}
m.lastTimestamp = now

View File

@@ -72,8 +72,7 @@ func getStats(filename string) map[string]MemstatStats {
for scanner.Scan() {
line := scanner.Text()
linefields := strings.Fields(line)
switch len(linefields) {
case 3:
if len(linefields) == 3 {
v, err := strconv.ParseFloat(linefields[1], 64)
if err == nil {
stats[strings.Trim(linefields[0], ":")] = MemstatStats{
@@ -81,7 +80,7 @@ func getStats(filename string) map[string]MemstatStats {
unit: linefields[2],
}
}
case 5:
} else if len(linefields) == 5 {
v, err := strconv.ParseFloat(linefields[3], 64)
if err == nil {
cclog.ComponentDebug("getStats", strings.Trim(linefields[2], ":"), v, linefields[4])
@@ -107,10 +106,7 @@ func (m *MemstatCollector) Init(config json.RawMessage) error {
return fmt.Errorf("%s Init(): Error decoding JSON config: %w", m.name, err)
}
}
m.meta = map[string]string{
"source": m.name,
"group": "Memory",
}
m.meta = map[string]string{"source": m.name, "group": "Memory"}
m.stats = make(map[string]int64)
m.matches = make(map[string]string)
m.tags = map[string]string{"type": "node"}
@@ -149,7 +145,7 @@ func (m *MemstatCollector) Init(config json.RawMessage) error {
"KernelStack": "mem_kernelstack",
}
for k, v := range matches {
if !slices.Contains(m.config.ExcludeMetrics, v) {
if !slices.Contains(m.config.ExcludeMetrics, k) {
m.matches[k] = v
}
}
@@ -157,7 +153,7 @@ func (m *MemstatCollector) Init(config json.RawMessage) error {
if !slices.Contains(m.config.ExcludeMetrics, "mem_used") {
m.sendMemUsed = true
}
if len(m.matches) == 0 && !m.sendMemUsed {
if len(m.matches) == 0 {
return fmt.Errorf("%s Init(): no metrics to collect", m.name)
}
if err := m.setup(); err != nil {

View File

@@ -1115,6 +1115,31 @@ func readNVLinkStats(device *NvidiaCollectorDevice, output chan lp.CCMessage) er
return nil
}
// readEfficiency sends the metric "nv_util_eff" for the given device: the GPU
// utilization scaled by the ratio of the current power usage to the enforced
// power limit, capped at 100 and tagged with unit "percent".
//
// Nothing is sent when "nv_util_eff" is set in device.excludeMetrics, when one
// of the NVML queries does not return nvml.SUCCESS, when the reported power
// limit is zero, or when the metric cannot be created. The function always
// returns nil; failures only result in the metric being skipped.
func readEfficiency(device *NvidiaCollectorDevice, output chan lp.CCMessage) error {
	if device.excludeMetrics["nv_util_eff"] {
		return nil
	}

	maxPower, ret := nvml.DeviceGetEnforcedPowerLimit(device.device)
	if ret != nvml.SUCCESS {
		return nil
	}
	// A zero limit would turn the ratio below into +Inf or NaN, and converting
	// such a value to uint32 gives an implementation-dependent result.
	if maxPower == 0 {
		return nil
	}

	curPower, ret := nvml.DeviceGetPowerUsage(device.device)
	if ret != nvml.SUCCESS {
		return nil
	}

	util, ret := nvml.DeviceGetUtilizationRates(device.device)
	if ret != nvml.SUCCESS {
		return nil
	}

	factor := float64(curPower) / float64(maxPower)
	// Cap while still in floating point so that the integer conversion below
	// never sees a value outside the uint32 range.
	scaled := float64(util.Gpu) * factor
	if scaled > 100 {
		scaled = 100
	}
	eff := uint32(scaled)

	y, err := lp.NewMetric("nv_util_eff", device.tags, device.meta, eff, time.Now())
	if err == nil {
		y.AddTag("unit", "percent")
		output <- y
	}
	return nil
}
func (m *NvidiaCollector) Read(interval time.Duration, output chan lp.CCMessage) {
var err error
if !m.init {
@@ -1220,6 +1245,11 @@ func (m *NvidiaCollector) Read(interval time.Duration, output chan lp.CCMessage)
if err != nil {
cclog.ComponentDebug(m.name, "readNVLinkStats for device", name, "failed")
}
err = readEfficiency(device, output)
if err != nil {
cclog.ComponentDebug(m.name, "readEfficiency for device", name, "failed")
}
}
// Actual read loop over all attached Nvidia GPUs

View File

@@ -85,5 +85,6 @@ Metrics:
* `nv_energy`
* `nv_energy_abs`
* `nv_average_power`
* `nv_util_eff` (`nv_util` * (`nv_power_usage` / `nv_power_max_limit`), in percent, capped at 100)
Some metrics add an additional sub-type tag (`stype`); for example, the `nv_nvlink_*` metrics set `stype=nvlink,stype-id=<link_number>`.