From 0a1c7a9911a5b37b6d55b632802ef9c8609fddfa Mon Sep 17 00:00:00 2001
From: Thomas Roehl
Date: Wed, 6 May 2026 18:57:58 +0200
Subject: [PATCH] Add metric 'nv_util_eff' like nvtop

---
 collectors/nvidiaMetric.go | 47 +++++++++++++++++++++++++++++++++++++++++++++++
 collectors/nvidiaMetric.md |  1 +
 2 files changed, 48 insertions(+)

diff --git a/collectors/nvidiaMetric.go b/collectors/nvidiaMetric.go
index f23f2fe..e1d9157 100644
--- a/collectors/nvidiaMetric.go
+++ b/collectors/nvidiaMetric.go
@@ -1115,6 +1115,48 @@ func readNVLinkStats(device *NvidiaCollectorDevice, output chan lp.CCMessage) er
 	return nil
 }
 
+// readEfficiency emits the metric 'nv_util_eff' for a single device: the GPU
+// utilization scaled by the ratio of current power usage to the enforced power
+// limit, truncated to an integer and capped at 100 percent.
+//
+// Nothing is sent when the metric is excluded, when one of the NVML queries does
+// not succeed, when the reported power limit is zero or when the message cannot
+// be created. All of these cases are skipped silently; the result is always nil.
+func readEfficiency(device *NvidiaCollectorDevice, output chan lp.CCMessage) error {
+	if device.excludeMetrics["nv_util_eff"] {
+		return nil
+	}
+
+	maxPower, ret := nvml.DeviceGetEnforcedPowerLimit(device.device)
+	if ret != nvml.SUCCESS || maxPower == 0 {
+		// A zero limit would turn the ratio below into Inf or NaN, and Go leaves
+		// the conversion of such values to an integer implementation-specific.
+		return nil
+	}
+	curPower, ret := nvml.DeviceGetPowerUsage(device.device)
+	if ret != nvml.SUCCESS {
+		return nil
+	}
+	util, ret := nvml.DeviceGetUtilizationRates(device.device)
+	if ret != nvml.SUCCESS {
+		return nil
+	}
+
+	// Cap in floating point first so the integer conversion cannot overflow
+	// when the power usage exceeds the limit by a large factor.
+	scaled := float64(util.Gpu) * (float64(curPower) / float64(maxPower))
+	if scaled > 100 {
+		scaled = 100
+	}
+
+	y, err := lp.NewMetric("nv_util_eff", device.tags, device.meta, uint32(scaled), time.Now())
+	if err == nil {
+		y.AddTag("unit", "percent")
+		output <- y
+	}
+	return nil
+}
+
 func (m *NvidiaCollector) Read(interval time.Duration, output chan lp.CCMessage) {
 	var err error
 	if !m.init {
@@ -1220,6 +1262,11 @@ func (m *NvidiaCollector) Read(interval time.Duration, output chan lp.CCMessage)
 		if err != nil {
 			cclog.ComponentDebug(m.name, "readNVLinkStats for device", name, "failed")
 		}
+
+		err = readEfficiency(device, output)
+		if err != nil {
+			cclog.ComponentDebug(m.name, "readEfficiency for device", name, "failed")
+		}
 	}
 
 	// Actual read loop over all attached Nvidia GPUs
diff --git a/collectors/nvidiaMetric.md b/collectors/nvidiaMetric.md
index 8163bc0..83fdd29 100644
--- a/collectors/nvidiaMetric.md
+++ b/collectors/nvidiaMetric.md
@@ -85,5 +85,6 @@ Metrics:
 * 
`nv_energy`
 * `nv_energy_abs`
 * `nv_average_power`
+* `nv_util_eff` (`nv_util` * (`nv_power_usage` / `nv_power_max_limit`), capped at 100)
 
 Some metrics add the additional sub type tag (`stype`) like the `nv_nvlink_*` metrics set `stype=nvlink,stype-id=`.