mirror of
https://github.com/ClusterCockpit/cc-metric-collector.git
synced 2026-05-11 16:07:30 +02:00
Add metric 'nv_util_eff' like nvtop
This commit is contained in:
@@ -1115,6 +1115,31 @@ func readNVLinkStats(device *NvidiaCollectorDevice, output chan lp.CCMessage) er
|
|||||||
return nil
|
return nil
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func readEfficiency(device *NvidiaCollectorDevice, output chan lp.CCMessage) error {
|
||||||
|
if !device.excludeMetrics["nv_util_eff"] {
|
||||||
|
maxPower, ret := nvml.DeviceGetEnforcedPowerLimit(device.device)
|
||||||
|
if ret == nvml.SUCCESS {
|
||||||
|
curPower, ret := nvml.DeviceGetPowerUsage(device.device)
|
||||||
|
if ret == nvml.SUCCESS {
|
||||||
|
util, ret := nvml.DeviceGetUtilizationRates(device.device)
|
||||||
|
if ret == nvml.SUCCESS {
|
||||||
|
factor := float64(curPower) / float64(maxPower)
|
||||||
|
eff := uint32(float64(util.Gpu) * factor)
|
||||||
|
if eff > 100 {
|
||||||
|
eff = 100
|
||||||
|
}
|
||||||
|
y, err := lp.NewMetric("nv_util_eff", device.tags, device.meta, eff, time.Now())
|
||||||
|
if err == nil {
|
||||||
|
y.AddTag("unit", "percent")
|
||||||
|
output <- y
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
|
||||||
func (m *NvidiaCollector) Read(interval time.Duration, output chan lp.CCMessage) {
|
func (m *NvidiaCollector) Read(interval time.Duration, output chan lp.CCMessage) {
|
||||||
var err error
|
var err error
|
||||||
if !m.init {
|
if !m.init {
|
||||||
@@ -1220,6 +1245,11 @@ func (m *NvidiaCollector) Read(interval time.Duration, output chan lp.CCMessage)
|
|||||||
if err != nil {
|
if err != nil {
|
||||||
cclog.ComponentDebug(m.name, "readNVLinkStats for device", name, "failed")
|
cclog.ComponentDebug(m.name, "readNVLinkStats for device", name, "failed")
|
||||||
}
|
}
|
||||||
|
|
||||||
|
err = readEfficiency(device, output)
|
||||||
|
if err != nil {
|
||||||
|
cclog.ComponentDebug(m.name, "readEfficiency for device", name, "failed")
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// Actual read loop over all attached Nvidia GPUs
|
// Actual read loop over all attached Nvidia GPUs
|
||||||
|
|||||||
@@ -85,5 +85,6 @@ Metrics:
|
|||||||
* `nv_energy`
|
* `nv_energy`
|
||||||
* `nv_energy_abs`
|
* `nv_energy_abs`
|
||||||
* `nv_average_power`
|
* `nv_average_power`
|
||||||
|
* `nv_util_eff` (`nv_util` * (`nv_power_usage` / `nv_power_max_limit`), capped at 100, unit `percent`)
|
||||||
|
|
||||||
Some metrics add the additional sub type tag (`stype`) like the `nv_nvlink_*` metrics set `stype=nvlink,stype-id=<link_number>`.
|
Some metrics add the additional sub type tag (`stype`) like the `nv_nvlink_*` metrics set `stype=nvlink,stype-id=<link_number>`.
|
||||||
|
|||||||
Reference in New Issue
Block a user