mirror of
https://github.com/ClusterCockpit/cc-metric-collector.git
synced 2025-05-11 20:31:43 +02:00
Add energy metrics from NVML to Nvidia NVML collector
This commit is contained in:
parent
f8b2ac0d2c
commit
a606a3af01
@ -27,10 +27,12 @@ type NvidiaCollectorConfig struct {
|
|||||||
}
|
}
|
||||||
|
|
||||||
type NvidiaCollectorDevice struct {
|
type NvidiaCollectorDevice struct {
|
||||||
device nvml.Device
|
device nvml.Device
|
||||||
excludeMetrics map[string]bool
|
excludeMetrics map[string]bool
|
||||||
tags map[string]string
|
tags map[string]string
|
||||||
meta map[string]string
|
meta map[string]string
|
||||||
|
lastEnergyReading uint64
|
||||||
|
lastEnergyTimestamp time.Time
|
||||||
}
|
}
|
||||||
|
|
||||||
type NvidiaCollector struct {
|
type NvidiaCollector struct {
|
||||||
@ -149,6 +151,8 @@ func (m *NvidiaCollector) Init(config json.RawMessage) error {
|
|||||||
|
|
||||||
// Add device handle
|
// Add device handle
|
||||||
g.device = device
|
g.device = device
|
||||||
|
g.lastEnergyReading = 0
|
||||||
|
g.lastEnergyTimestamp = time.Now()
|
||||||
|
|
||||||
// Add tags
|
// Add tags
|
||||||
g.tags = map[string]string{
|
g.tags = map[string]string{
|
||||||
@ -206,7 +210,7 @@ func (m *NvidiaCollector) Init(config json.RawMessage) error {
|
|||||||
return nil
|
return nil
|
||||||
}
|
}
|
||||||
|
|
||||||
func readMemoryInfo(device NvidiaCollectorDevice, output chan lp.CCMessage) error {
|
func readMemoryInfo(device *NvidiaCollectorDevice, output chan lp.CCMessage) error {
|
||||||
if !device.excludeMetrics["nv_fb_mem_total"] || !device.excludeMetrics["nv_fb_mem_used"] || !device.excludeMetrics["nv_fb_mem_reserved"] {
|
if !device.excludeMetrics["nv_fb_mem_total"] || !device.excludeMetrics["nv_fb_mem_used"] || !device.excludeMetrics["nv_fb_mem_reserved"] {
|
||||||
var total uint64
|
var total uint64
|
||||||
var used uint64
|
var used uint64
|
||||||
@ -250,7 +254,7 @@ func readMemoryInfo(device NvidiaCollectorDevice, output chan lp.CCMessage) erro
|
|||||||
return nil
|
return nil
|
||||||
}
|
}
|
||||||
|
|
||||||
func readBarMemoryInfo(device NvidiaCollectorDevice, output chan lp.CCMessage) error {
|
func readBarMemoryInfo(device *NvidiaCollectorDevice, output chan lp.CCMessage) error {
|
||||||
if !device.excludeMetrics["nv_bar1_mem_total"] || !device.excludeMetrics["nv_bar1_mem_used"] {
|
if !device.excludeMetrics["nv_bar1_mem_total"] || !device.excludeMetrics["nv_bar1_mem_used"] {
|
||||||
meminfo, ret := nvml.DeviceGetBAR1MemoryInfo(device.device)
|
meminfo, ret := nvml.DeviceGetBAR1MemoryInfo(device.device)
|
||||||
if ret != nvml.SUCCESS {
|
if ret != nvml.SUCCESS {
|
||||||
@ -277,7 +281,7 @@ func readBarMemoryInfo(device NvidiaCollectorDevice, output chan lp.CCMessage) e
|
|||||||
return nil
|
return nil
|
||||||
}
|
}
|
||||||
|
|
||||||
func readUtilization(device NvidiaCollectorDevice, output chan lp.CCMessage) error {
|
func readUtilization(device *NvidiaCollectorDevice, output chan lp.CCMessage) error {
|
||||||
isMig, ret := nvml.DeviceIsMigDeviceHandle(device.device)
|
isMig, ret := nvml.DeviceIsMigDeviceHandle(device.device)
|
||||||
if ret != nvml.SUCCESS {
|
if ret != nvml.SUCCESS {
|
||||||
err := errors.New(nvml.ErrorString(ret))
|
err := errors.New(nvml.ErrorString(ret))
|
||||||
@ -319,7 +323,7 @@ func readUtilization(device NvidiaCollectorDevice, output chan lp.CCMessage) err
|
|||||||
return nil
|
return nil
|
||||||
}
|
}
|
||||||
|
|
||||||
func readTemp(device NvidiaCollectorDevice, output chan lp.CCMessage) error {
|
func readTemp(device *NvidiaCollectorDevice, output chan lp.CCMessage) error {
|
||||||
if !device.excludeMetrics["nv_temp"] {
|
if !device.excludeMetrics["nv_temp"] {
|
||||||
// Retrieves the current temperature readings for the device, in degrees C.
|
// Retrieves the current temperature readings for the device, in degrees C.
|
||||||
//
|
//
|
||||||
@ -338,7 +342,7 @@ func readTemp(device NvidiaCollectorDevice, output chan lp.CCMessage) error {
|
|||||||
return nil
|
return nil
|
||||||
}
|
}
|
||||||
|
|
||||||
func readFan(device NvidiaCollectorDevice, output chan lp.CCMessage) error {
|
func readFan(device *NvidiaCollectorDevice, output chan lp.CCMessage) error {
|
||||||
if !device.excludeMetrics["nv_fan"] {
|
if !device.excludeMetrics["nv_fan"] {
|
||||||
// Retrieves the intended operating speed of the device's fan.
|
// Retrieves the intended operating speed of the device's fan.
|
||||||
//
|
//
|
||||||
@ -361,7 +365,7 @@ func readFan(device NvidiaCollectorDevice, output chan lp.CCMessage) error {
|
|||||||
return nil
|
return nil
|
||||||
}
|
}
|
||||||
|
|
||||||
// func readFans(device NvidiaCollectorDevice, output chan lp.CCMessage) error {
|
// func readFans(device *NvidiaCollectorDevice, output chan lp.CCMessage) error {
|
||||||
// if !device.excludeMetrics["nv_fan"] {
|
// if !device.excludeMetrics["nv_fan"] {
|
||||||
// numFans, ret := nvml.DeviceGetNumFans(device.device)
|
// numFans, ret := nvml.DeviceGetNumFans(device.device)
|
||||||
// if ret == nvml.SUCCESS {
|
// if ret == nvml.SUCCESS {
|
||||||
@ -382,7 +386,7 @@ func readFan(device NvidiaCollectorDevice, output chan lp.CCMessage) error {
|
|||||||
// return nil
|
// return nil
|
||||||
// }
|
// }
|
||||||
|
|
||||||
func readEccMode(device NvidiaCollectorDevice, output chan lp.CCMessage) error {
|
func readEccMode(device *NvidiaCollectorDevice, output chan lp.CCMessage) error {
|
||||||
if !device.excludeMetrics["nv_ecc_mode"] {
|
if !device.excludeMetrics["nv_ecc_mode"] {
|
||||||
// Retrieves the current and pending ECC modes for the device.
|
// Retrieves the current and pending ECC modes for the device.
|
||||||
//
|
//
|
||||||
@ -416,7 +420,7 @@ func readEccMode(device NvidiaCollectorDevice, output chan lp.CCMessage) error {
|
|||||||
return nil
|
return nil
|
||||||
}
|
}
|
||||||
|
|
||||||
func readPerfState(device NvidiaCollectorDevice, output chan lp.CCMessage) error {
|
func readPerfState(device *NvidiaCollectorDevice, output chan lp.CCMessage) error {
|
||||||
if !device.excludeMetrics["nv_perf_state"] {
|
if !device.excludeMetrics["nv_perf_state"] {
|
||||||
// Retrieves the current performance state for the device.
|
// Retrieves the current performance state for the device.
|
||||||
//
|
//
|
||||||
@ -436,13 +440,16 @@ func readPerfState(device NvidiaCollectorDevice, output chan lp.CCMessage) error
|
|||||||
return nil
|
return nil
|
||||||
}
|
}
|
||||||
|
|
||||||
func readPowerUsage(device NvidiaCollectorDevice, output chan lp.CCMessage) error {
|
func readPowerUsage(device *NvidiaCollectorDevice, output chan lp.CCMessage) error {
|
||||||
if !device.excludeMetrics["nv_power_usage"] {
|
if !device.excludeMetrics["nv_power_usage"] {
|
||||||
// Retrieves power usage for this GPU in milliwatts and its associated circuitry (e.g. memory)
|
// Retrieves power usage for this GPU in milliwatts and its associated circuitry (e.g. memory)
|
||||||
//
|
//
|
||||||
// On Fermi and Kepler GPUs the reading is accurate to within +/- 5% of current power draw.
|
// On Fermi and Kepler GPUs the reading is accurate to within +/- 5% of current power draw.
|
||||||
|
// On Ampere (except GA100) or newer GPUs, the API returns power averaged over 1 sec interval.
|
||||||
|
// On GA100 and older architectures, instantaneous power is returned.
|
||||||
//
|
//
|
||||||
// It is only available if power management mode is supported
|
// It is only available if power management mode is supported.
|
||||||
|
|
||||||
mode, ret := nvml.DeviceGetPowerManagementMode(device.device)
|
mode, ret := nvml.DeviceGetPowerManagementMode(device.device)
|
||||||
if ret != nvml.SUCCESS {
|
if ret != nvml.SUCCESS {
|
||||||
return nil
|
return nil
|
||||||
@ -461,7 +468,54 @@ func readPowerUsage(device NvidiaCollectorDevice, output chan lp.CCMessage) erro
|
|||||||
return nil
|
return nil
|
||||||
}
|
}
|
||||||
|
|
||||||
func readClocks(device NvidiaCollectorDevice, output chan lp.CCMessage) error {
|
func readEnergyConsumption(device *NvidiaCollectorDevice, output chan lp.CCMessage) error {
|
||||||
|
// Retrieves total energy consumption for this GPU in millijoules (mJ) since the driver was last reloaded
|
||||||
|
|
||||||
|
// For Volta or newer fully supported devices.
|
||||||
|
if (!device.excludeMetrics["nv_energy"]) && (!device.excludeMetrics["nv_energy_abs"]) && (!device.excludeMetrics["nv_average_power"]) {
|
||||||
|
now := time.Now()
|
||||||
|
mode, ret := nvml.DeviceGetPowerManagementMode(device.device)
|
||||||
|
if ret != nvml.SUCCESS {
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
if mode == nvml.FEATURE_ENABLED {
|
||||||
|
energy, ret := nvml.DeviceGetTotalEnergyConsumption(device.device)
|
||||||
|
if ret == nvml.SUCCESS {
|
||||||
|
if device.lastEnergyReading != 0 {
|
||||||
|
if !device.excludeMetrics["nv_energy"] {
|
||||||
|
y, err := lp.NewMetric("nv_energy", device.tags, device.meta, (energy-device.lastEnergyReading)/1000, now)
|
||||||
|
if err == nil {
|
||||||
|
y.AddMeta("unit", "Joules")
|
||||||
|
output <- y
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if !device.excludeMetrics["nv_average_power"] {
|
||||||
|
|
||||||
|
energyDiff := (energy - device.lastEnergyReading) / 1000
|
||||||
|
timeDiff := now.Sub(device.lastEnergyTimestamp)
|
||||||
|
y, err := lp.NewMetric("nv_average_power", device.tags, device.meta, energyDiff/uint64(timeDiff.Seconds()), now)
|
||||||
|
if err == nil {
|
||||||
|
y.AddMeta("unit", "watts")
|
||||||
|
output <- y
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if !device.excludeMetrics["nv_energy_abs"] {
|
||||||
|
y, err := lp.NewMetric("nv_energy_abs", device.tags, device.meta, energy/1000, now)
|
||||||
|
if err == nil {
|
||||||
|
y.AddMeta("unit", "Joules")
|
||||||
|
output <- y
|
||||||
|
}
|
||||||
|
}
|
||||||
|
device.lastEnergyReading = energy
|
||||||
|
device.lastEnergyTimestamp = time.Now()
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
|
||||||
|
func readClocks(device *NvidiaCollectorDevice, output chan lp.CCMessage) error {
|
||||||
// Retrieves the current clock speeds for the device.
|
// Retrieves the current clock speeds for the device.
|
||||||
//
|
//
|
||||||
// Available clock information:
|
// Available clock information:
|
||||||
@ -513,7 +567,7 @@ func readClocks(device NvidiaCollectorDevice, output chan lp.CCMessage) error {
|
|||||||
return nil
|
return nil
|
||||||
}
|
}
|
||||||
|
|
||||||
func readMaxClocks(device NvidiaCollectorDevice, output chan lp.CCMessage) error {
|
func readMaxClocks(device *NvidiaCollectorDevice, output chan lp.CCMessage) error {
|
||||||
// Retrieves the maximum clock speeds for the device.
|
// Retrieves the maximum clock speeds for the device.
|
||||||
//
|
//
|
||||||
// Available clock information:
|
// Available clock information:
|
||||||
@ -571,7 +625,7 @@ func readMaxClocks(device NvidiaCollectorDevice, output chan lp.CCMessage) error
|
|||||||
return nil
|
return nil
|
||||||
}
|
}
|
||||||
|
|
||||||
func readEccErrors(device NvidiaCollectorDevice, output chan lp.CCMessage) error {
|
func readEccErrors(device *NvidiaCollectorDevice, output chan lp.CCMessage) error {
|
||||||
if !device.excludeMetrics["nv_ecc_uncorrected_error"] {
|
if !device.excludeMetrics["nv_ecc_uncorrected_error"] {
|
||||||
// Retrieves the total ECC error counts for the device.
|
// Retrieves the total ECC error counts for the device.
|
||||||
//
|
//
|
||||||
@ -602,7 +656,7 @@ func readEccErrors(device NvidiaCollectorDevice, output chan lp.CCMessage) error
|
|||||||
return nil
|
return nil
|
||||||
}
|
}
|
||||||
|
|
||||||
func readPowerLimit(device NvidiaCollectorDevice, output chan lp.CCMessage) error {
|
func readPowerLimit(device *NvidiaCollectorDevice, output chan lp.CCMessage) error {
|
||||||
if !device.excludeMetrics["nv_power_max_limit"] {
|
if !device.excludeMetrics["nv_power_max_limit"] {
|
||||||
// Retrieves the power management limit associated with this device.
|
// Retrieves the power management limit associated with this device.
|
||||||
//
|
//
|
||||||
@ -622,7 +676,7 @@ func readPowerLimit(device NvidiaCollectorDevice, output chan lp.CCMessage) erro
|
|||||||
return nil
|
return nil
|
||||||
}
|
}
|
||||||
|
|
||||||
func readEncUtilization(device NvidiaCollectorDevice, output chan lp.CCMessage) error {
|
func readEncUtilization(device *NvidiaCollectorDevice, output chan lp.CCMessage) error {
|
||||||
isMig, ret := nvml.DeviceIsMigDeviceHandle(device.device)
|
isMig, ret := nvml.DeviceIsMigDeviceHandle(device.device)
|
||||||
if ret != nvml.SUCCESS {
|
if ret != nvml.SUCCESS {
|
||||||
err := errors.New(nvml.ErrorString(ret))
|
err := errors.New(nvml.ErrorString(ret))
|
||||||
@ -649,7 +703,7 @@ func readEncUtilization(device NvidiaCollectorDevice, output chan lp.CCMessage)
|
|||||||
return nil
|
return nil
|
||||||
}
|
}
|
||||||
|
|
||||||
func readDecUtilization(device NvidiaCollectorDevice, output chan lp.CCMessage) error {
|
func readDecUtilization(device *NvidiaCollectorDevice, output chan lp.CCMessage) error {
|
||||||
isMig, ret := nvml.DeviceIsMigDeviceHandle(device.device)
|
isMig, ret := nvml.DeviceIsMigDeviceHandle(device.device)
|
||||||
if ret != nvml.SUCCESS {
|
if ret != nvml.SUCCESS {
|
||||||
err := errors.New(nvml.ErrorString(ret))
|
err := errors.New(nvml.ErrorString(ret))
|
||||||
@ -676,7 +730,7 @@ func readDecUtilization(device NvidiaCollectorDevice, output chan lp.CCMessage)
|
|||||||
return nil
|
return nil
|
||||||
}
|
}
|
||||||
|
|
||||||
func readRemappedRows(device NvidiaCollectorDevice, output chan lp.CCMessage) error {
|
func readRemappedRows(device *NvidiaCollectorDevice, output chan lp.CCMessage) error {
|
||||||
if !device.excludeMetrics["nv_remapped_rows_corrected"] ||
|
if !device.excludeMetrics["nv_remapped_rows_corrected"] ||
|
||||||
!device.excludeMetrics["nv_remapped_rows_uncorrected"] ||
|
!device.excludeMetrics["nv_remapped_rows_uncorrected"] ||
|
||||||
!device.excludeMetrics["nv_remapped_rows_pending"] ||
|
!device.excludeMetrics["nv_remapped_rows_pending"] ||
|
||||||
@ -729,7 +783,7 @@ func readRemappedRows(device NvidiaCollectorDevice, output chan lp.CCMessage) er
|
|||||||
return nil
|
return nil
|
||||||
}
|
}
|
||||||
|
|
||||||
func readProcessCounts(device NvidiaCollectorDevice, output chan lp.CCMessage) error {
|
func readProcessCounts(device *NvidiaCollectorDevice, output chan lp.CCMessage) error {
|
||||||
if !device.excludeMetrics["nv_compute_processes"] {
|
if !device.excludeMetrics["nv_compute_processes"] {
|
||||||
// Get information about processes with a compute context on a device
|
// Get information about processes with a compute context on a device
|
||||||
//
|
//
|
||||||
@ -821,7 +875,7 @@ func readProcessCounts(device NvidiaCollectorDevice, output chan lp.CCMessage) e
|
|||||||
return nil
|
return nil
|
||||||
}
|
}
|
||||||
|
|
||||||
func readViolationStats(device NvidiaCollectorDevice, output chan lp.CCMessage) error {
|
func readViolationStats(device *NvidiaCollectorDevice, output chan lp.CCMessage) error {
|
||||||
var violTime nvml.ViolationTime
|
var violTime nvml.ViolationTime
|
||||||
var ret nvml.Return
|
var ret nvml.Return
|
||||||
|
|
||||||
@ -935,7 +989,7 @@ func readViolationStats(device NvidiaCollectorDevice, output chan lp.CCMessage)
|
|||||||
return nil
|
return nil
|
||||||
}
|
}
|
||||||
|
|
||||||
func readNVLinkStats(device NvidiaCollectorDevice, output chan lp.CCMessage) error {
|
func readNVLinkStats(device *NvidiaCollectorDevice, output chan lp.CCMessage) error {
|
||||||
// Retrieves the specified error counter value
|
// Retrieves the specified error counter value
|
||||||
// Please refer to \a nvmlNvLinkErrorCounter_t for error counters that are available
|
// Please refer to \a nvmlNvLinkErrorCounter_t for error counters that are available
|
||||||
//
|
//
|
||||||
@ -1070,7 +1124,7 @@ func (m *NvidiaCollector) Read(interval time.Duration, output chan lp.CCMessage)
|
|||||||
return
|
return
|
||||||
}
|
}
|
||||||
|
|
||||||
readAll := func(device NvidiaCollectorDevice, output chan lp.CCMessage) {
|
readAll := func(device *NvidiaCollectorDevice, output chan lp.CCMessage) {
|
||||||
name, ret := nvml.DeviceGetName(device.device)
|
name, ret := nvml.DeviceGetName(device.device)
|
||||||
if ret != nvml.SUCCESS {
|
if ret != nvml.SUCCESS {
|
||||||
name = "NoName"
|
name = "NoName"
|
||||||
@ -1110,6 +1164,11 @@ func (m *NvidiaCollector) Read(interval time.Duration, output chan lp.CCMessage)
|
|||||||
cclog.ComponentDebug(m.name, "readPowerUsage for device", name, "failed")
|
cclog.ComponentDebug(m.name, "readPowerUsage for device", name, "failed")
|
||||||
}
|
}
|
||||||
|
|
||||||
|
err = readEnergyConsumption(device, output)
|
||||||
|
if err != nil {
|
||||||
|
cclog.ComponentDebug(m.name, "readEnergyConsumption for device", name, "failed")
|
||||||
|
}
|
||||||
|
|
||||||
err = readClocks(device, output)
|
err = readClocks(device, output)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
cclog.ComponentDebug(m.name, "readClocks for device", name, "failed")
|
cclog.ComponentDebug(m.name, "readClocks for device", name, "failed")
|
||||||
@ -1169,7 +1228,7 @@ func (m *NvidiaCollector) Read(interval time.Duration, output chan lp.CCMessage)
|
|||||||
// Actual read loop over all attached Nvidia GPUs
|
// Actual read loop over all attached Nvidia GPUs
|
||||||
for i := 0; i < m.num_gpus; i++ {
|
for i := 0; i < m.num_gpus; i++ {
|
||||||
|
|
||||||
readAll(m.gpus[i], output)
|
readAll(&m.gpus[i], output)
|
||||||
|
|
||||||
// Iterate over all MIG devices if any
|
// Iterate over all MIG devices if any
|
||||||
if m.config.ProcessMigDevices {
|
if m.config.ProcessMigDevices {
|
||||||
@ -1243,7 +1302,7 @@ func (m *NvidiaCollector) Read(interval time.Duration, output chan lp.CCMessage)
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
readAll(migDevice, output)
|
readAll(&migDevice, output)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
Loading…
x
Reference in New Issue
Block a user