Compare commits

5 Commits

Author SHA1 Message Date
Thomas Roehl
3877e4a0b6 Add energy metrics to Nvidia collector README 2025-04-28 15:36:04 +00:00
Thomas Roehl
a606a3af01 Add energy metrics from NVML to Nvidia NVML collector 2025-04-28 15:29:13 +00:00
Thomas Roehl
f8b2ac0d2c Fix URL to new location of cc-units 2025-04-22 12:48:15 +02:00
Thomas Roehl
ec34b40295 Merge branch 'main' of github.com:ClusterCockpit/cc-metric-collector 2025-04-17 11:38:03 +02:00
Thomas Gruber
03cd965099 Merge develop into main for documentation (#143) 2025-04-17 11:37:47 +02:00
* Fix Release part

* Fix Release part

* Update Hugo integration (#142)
3 changed files with 91 additions and 29 deletions

View File

@@ -27,10 +27,12 @@ type NvidiaCollectorConfig struct {
}
type NvidiaCollectorDevice struct {
-device         nvml.Device
-excludeMetrics map[string]bool
-tags           map[string]string
-meta           map[string]string
+device              nvml.Device
+excludeMetrics      map[string]bool
+tags                map[string]string
+meta                map[string]string
+lastEnergyReading   uint64
+lastEnergyTimestamp time.Time
}
type NvidiaCollector struct {
@@ -149,6 +151,8 @@ func (m *NvidiaCollector) Init(config json.RawMessage) error {
// Add device handle
g.device = device
+g.lastEnergyReading = 0
+g.lastEnergyTimestamp = time.Now()
// Add tags
g.tags = map[string]string{
@@ -206,7 +210,7 @@ func (m *NvidiaCollector) Init(config json.RawMessage) error {
return nil
}
-func readMemoryInfo(device NvidiaCollectorDevice, output chan lp.CCMessage) error {
+func readMemoryInfo(device *NvidiaCollectorDevice, output chan lp.CCMessage) error {
if !device.excludeMetrics["nv_fb_mem_total"] || !device.excludeMetrics["nv_fb_mem_used"] || !device.excludeMetrics["nv_fb_mem_reserved"] {
var total uint64
var used uint64
@@ -250,7 +254,7 @@ func readMemoryInfo(device NvidiaCollectorDevice, output chan lp.CCMessage) erro
return nil
}
-func readBarMemoryInfo(device NvidiaCollectorDevice, output chan lp.CCMessage) error {
+func readBarMemoryInfo(device *NvidiaCollectorDevice, output chan lp.CCMessage) error {
if !device.excludeMetrics["nv_bar1_mem_total"] || !device.excludeMetrics["nv_bar1_mem_used"] {
meminfo, ret := nvml.DeviceGetBAR1MemoryInfo(device.device)
if ret != nvml.SUCCESS {
@@ -277,7 +281,7 @@ func readBarMemoryInfo(device NvidiaCollectorDevice, output chan lp.CCMessage) e
return nil
}
-func readUtilization(device NvidiaCollectorDevice, output chan lp.CCMessage) error {
+func readUtilization(device *NvidiaCollectorDevice, output chan lp.CCMessage) error {
isMig, ret := nvml.DeviceIsMigDeviceHandle(device.device)
if ret != nvml.SUCCESS {
err := errors.New(nvml.ErrorString(ret))
@@ -319,7 +323,7 @@ func readUtilization(device NvidiaCollectorDevice, output chan lp.CCMessage) err
return nil
}
-func readTemp(device NvidiaCollectorDevice, output chan lp.CCMessage) error {
+func readTemp(device *NvidiaCollectorDevice, output chan lp.CCMessage) error {
if !device.excludeMetrics["nv_temp"] {
// Retrieves the current temperature readings for the device, in degrees C.
//
@@ -338,7 +342,7 @@ func readTemp(device NvidiaCollectorDevice, output chan lp.CCMessage) error {
return nil
}
-func readFan(device NvidiaCollectorDevice, output chan lp.CCMessage) error {
+func readFan(device *NvidiaCollectorDevice, output chan lp.CCMessage) error {
if !device.excludeMetrics["nv_fan"] {
// Retrieves the intended operating speed of the device's fan.
//
@@ -361,7 +365,7 @@ func readFan(device NvidiaCollectorDevice, output chan lp.CCMessage) error {
return nil
}
-// func readFans(device NvidiaCollectorDevice, output chan lp.CCMessage) error {
+// func readFans(device *NvidiaCollectorDevice, output chan lp.CCMessage) error {
// if !device.excludeMetrics["nv_fan"] {
// numFans, ret := nvml.DeviceGetNumFans(device.device)
// if ret == nvml.SUCCESS {
@@ -382,7 +386,7 @@ func readFan(device NvidiaCollectorDevice, output chan lp.CCMessage) error {
// return nil
// }
-func readEccMode(device NvidiaCollectorDevice, output chan lp.CCMessage) error {
+func readEccMode(device *NvidiaCollectorDevice, output chan lp.CCMessage) error {
if !device.excludeMetrics["nv_ecc_mode"] {
// Retrieves the current and pending ECC modes for the device.
//
@@ -416,7 +420,7 @@ func readEccMode(device NvidiaCollectorDevice, output chan lp.CCMessage) error {
return nil
}
-func readPerfState(device NvidiaCollectorDevice, output chan lp.CCMessage) error {
+func readPerfState(device *NvidiaCollectorDevice, output chan lp.CCMessage) error {
if !device.excludeMetrics["nv_perf_state"] {
// Retrieves the current performance state for the device.
//
@@ -436,13 +440,16 @@ func readPerfState(device NvidiaCollectorDevice, output chan lp.CCMessage) error
return nil
}
-func readPowerUsage(device NvidiaCollectorDevice, output chan lp.CCMessage) error {
+func readPowerUsage(device *NvidiaCollectorDevice, output chan lp.CCMessage) error {
if !device.excludeMetrics["nv_power_usage"] {
// Retrieves power usage for this GPU in milliwatts and its associated circuitry (e.g. memory)
//
// On Fermi and Kepler GPUs the reading is accurate to within +/- 5% of current power draw.
+// On Ampere (except GA100) or newer GPUs, the API returns power averaged over 1 sec interval.
+// On GA100 and older architectures, instantaneous power is returned.
//
-// It is only available if power management mode is supported
+// It is only available if power management mode is supported.
mode, ret := nvml.DeviceGetPowerManagementMode(device.device)
if ret != nvml.SUCCESS {
return nil
@@ -461,7 +468,54 @@ func readPowerUsage(device NvidiaCollectorDevice, output chan lp.CCMessage) erro
return nil
}
-func readClocks(device NvidiaCollectorDevice, output chan lp.CCMessage) error {
+func readEnergyConsumption(device *NvidiaCollectorDevice, output chan lp.CCMessage) error {
+// Retrieves total energy consumption for this GPU in millijoules (mJ) since the driver was last reloaded
+// For Volta or newer fully supported devices.
+if (!device.excludeMetrics["nv_energy"]) && (!device.excludeMetrics["nv_energy_abs"]) && (!device.excludeMetrics["nv_average_power"]) {
+now := time.Now()
+mode, ret := nvml.DeviceGetPowerManagementMode(device.device)
+if ret != nvml.SUCCESS {
+return nil
+}
+if mode == nvml.FEATURE_ENABLED {
+energy, ret := nvml.DeviceGetTotalEnergyConsumption(device.device)
+if ret == nvml.SUCCESS {
+if device.lastEnergyReading != 0 {
+if !device.excludeMetrics["nv_energy"] {
+y, err := lp.NewMetric("nv_energy", device.tags, device.meta, (energy-device.lastEnergyReading)/1000, now)
+if err == nil {
+y.AddMeta("unit", "Joules")
+output <- y
+}
+}
+if !device.excludeMetrics["nv_average_power"] {
+energyDiff := (energy - device.lastEnergyReading) / 1000
+timeDiff := now.Sub(device.lastEnergyTimestamp)
+y, err := lp.NewMetric("nv_average_power", device.tags, device.meta, energyDiff/uint64(timeDiff.Seconds()), now)
+if err == nil {
+y.AddMeta("unit", "watts")
+output <- y
+}
+}
+}
+if !device.excludeMetrics["nv_energy_abs"] {
+y, err := lp.NewMetric("nv_energy_abs", device.tags, device.meta, energy/1000, now)
+if err == nil {
+y.AddMeta("unit", "Joules")
+output <- y
+}
+}
+device.lastEnergyReading = energy
+device.lastEnergyTimestamp = time.Now()
+}
+}
+}
+return nil
+}
+func readClocks(device *NvidiaCollectorDevice, output chan lp.CCMessage) error {
// Retrieves the current clock speeds for the device.
//
// Available clock information:
@@ -513,7 +567,7 @@ func readClocks(device NvidiaCollectorDevice, output chan lp.CCMessage) error {
return nil
}
-func readMaxClocks(device NvidiaCollectorDevice, output chan lp.CCMessage) error {
+func readMaxClocks(device *NvidiaCollectorDevice, output chan lp.CCMessage) error {
// Retrieves the maximum clock speeds for the device.
//
// Available clock information:
@@ -571,7 +625,7 @@ func readMaxClocks(device NvidiaCollectorDevice, output chan lp.CCMessage) error
return nil
}
-func readEccErrors(device NvidiaCollectorDevice, output chan lp.CCMessage) error {
+func readEccErrors(device *NvidiaCollectorDevice, output chan lp.CCMessage) error {
if !device.excludeMetrics["nv_ecc_uncorrected_error"] {
// Retrieves the total ECC error counts for the device.
//
@@ -602,7 +656,7 @@ func readEccErrors(device NvidiaCollectorDevice, output chan lp.CCMessage) error
return nil
}
-func readPowerLimit(device NvidiaCollectorDevice, output chan lp.CCMessage) error {
+func readPowerLimit(device *NvidiaCollectorDevice, output chan lp.CCMessage) error {
if !device.excludeMetrics["nv_power_max_limit"] {
// Retrieves the power management limit associated with this device.
//
@@ -622,7 +676,7 @@ func readPowerLimit(device NvidiaCollectorDevice, output chan lp.CCMessage) erro
return nil
}
-func readEncUtilization(device NvidiaCollectorDevice, output chan lp.CCMessage) error {
+func readEncUtilization(device *NvidiaCollectorDevice, output chan lp.CCMessage) error {
isMig, ret := nvml.DeviceIsMigDeviceHandle(device.device)
if ret != nvml.SUCCESS {
err := errors.New(nvml.ErrorString(ret))
@@ -649,7 +703,7 @@ func readEncUtilization(device NvidiaCollectorDevice, output chan lp.CCMessage)
return nil
}
-func readDecUtilization(device NvidiaCollectorDevice, output chan lp.CCMessage) error {
+func readDecUtilization(device *NvidiaCollectorDevice, output chan lp.CCMessage) error {
isMig, ret := nvml.DeviceIsMigDeviceHandle(device.device)
if ret != nvml.SUCCESS {
err := errors.New(nvml.ErrorString(ret))
@@ -676,7 +730,7 @@ func readDecUtilization(device NvidiaCollectorDevice, output chan lp.CCMessage)
return nil
}
-func readRemappedRows(device NvidiaCollectorDevice, output chan lp.CCMessage) error {
+func readRemappedRows(device *NvidiaCollectorDevice, output chan lp.CCMessage) error {
if !device.excludeMetrics["nv_remapped_rows_corrected"] ||
!device.excludeMetrics["nv_remapped_rows_uncorrected"] ||
!device.excludeMetrics["nv_remapped_rows_pending"] ||
@@ -729,7 +783,7 @@ func readRemappedRows(device NvidiaCollectorDevice, output chan lp.CCMessage) er
return nil
}
-func readProcessCounts(device NvidiaCollectorDevice, output chan lp.CCMessage) error {
+func readProcessCounts(device *NvidiaCollectorDevice, output chan lp.CCMessage) error {
if !device.excludeMetrics["nv_compute_processes"] {
// Get information about processes with a compute context on a device
//
@@ -821,7 +875,7 @@ func readProcessCounts(device NvidiaCollectorDevice, output chan lp.CCMessage) e
return nil
}
-func readViolationStats(device NvidiaCollectorDevice, output chan lp.CCMessage) error {
+func readViolationStats(device *NvidiaCollectorDevice, output chan lp.CCMessage) error {
var violTime nvml.ViolationTime
var ret nvml.Return
@@ -935,7 +989,7 @@ func readViolationStats(device NvidiaCollectorDevice, output chan lp.CCMessage)
return nil
}
-func readNVLinkStats(device NvidiaCollectorDevice, output chan lp.CCMessage) error {
+func readNVLinkStats(device *NvidiaCollectorDevice, output chan lp.CCMessage) error {
// Retrieves the specified error counter value
// Please refer to \a nvmlNvLinkErrorCounter_t for error counters that are available
//
@@ -1070,7 +1124,7 @@ func (m *NvidiaCollector) Read(interval time.Duration, output chan lp.CCMessage)
return
}
-readAll := func(device NvidiaCollectorDevice, output chan lp.CCMessage) {
+readAll := func(device *NvidiaCollectorDevice, output chan lp.CCMessage) {
name, ret := nvml.DeviceGetName(device.device)
if ret != nvml.SUCCESS {
name = "NoName"
@@ -1110,6 +1164,11 @@ func (m *NvidiaCollector) Read(interval time.Duration, output chan lp.CCMessage)
cclog.ComponentDebug(m.name, "readPowerUsage for device", name, "failed")
}
+err = readEnergyConsumption(device, output)
+if err != nil {
+cclog.ComponentDebug(m.name, "readEnergyConsumption for device", name, "failed")
+}
err = readClocks(device, output)
if err != nil {
cclog.ComponentDebug(m.name, "readClocks for device", name, "failed")
@@ -1169,7 +1228,7 @@ func (m *NvidiaCollector) Read(interval time.Duration, output chan lp.CCMessage)
// Actual read loop over all attached Nvidia GPUs
for i := 0; i < m.num_gpus; i++ {
-readAll(m.gpus[i], output)
+readAll(&m.gpus[i], output)
// Iterate over all MIG devices if any
if m.config.ProcessMigDevices {
@@ -1243,7 +1302,7 @@ func (m *NvidiaCollector) Read(interval time.Duration, output chan lp.CCMessage)
}
}
-readAll(migDevice, output)
+readAll(&migDevice, output)
}
}
}
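
The new `readEnergyConsumption` above turns NVML's cumulative energy counter into three metrics: `nv_energy` (Joules consumed during the interval), `nv_energy_abs` (Joules since the driver was last reloaded) and `nv_average_power` (Watts over the interval). Below is a minimal, self-contained sketch of the same arithmetic — not taken from the commit — using the go-nvml bindings the collector already relies on; the device index and the 10-second sampling pause are illustrative only.

```go
// Sketch: derive the three new metrics from two consecutive NVML readings.
// NVML reports cumulative energy in millijoules since the last driver reload,
// so a delta / 1000 gives Joules, and Joules per elapsed second gives Watts.
package main

import (
	"fmt"
	"time"

	"github.com/NVIDIA/go-nvml/pkg/nvml"
)

func main() {
	if ret := nvml.Init(); ret != nvml.SUCCESS {
		fmt.Println("NVML init failed:", nvml.ErrorString(ret))
		return
	}
	defer nvml.Shutdown()

	device, ret := nvml.DeviceGetHandleByIndex(0) // GPU 0, illustrative
	if ret != nvml.SUCCESS {
		return
	}

	first, ret := nvml.DeviceGetTotalEnergyConsumption(device) // millijoules
	if ret != nvml.SUCCESS {
		return // e.g. pre-Volta devices do not support this query
	}
	start := time.Now()

	time.Sleep(10 * time.Second) // stand-in for the collector's read interval

	second, ret := nvml.DeviceGetTotalEnergyConsumption(device)
	if ret != nvml.SUCCESS {
		return
	}

	energyJ := float64(second-first) / 1000.0      // nv_energy for this interval
	absJ := float64(second) / 1000.0               // nv_energy_abs since driver reload
	watts := energyJ / time.Since(start).Seconds() // nv_average_power
	fmt.Printf("nv_energy=%.1f J  nv_energy_abs=%.1f J  nv_average_power=%.1f W\n",
		energyJ, absJ, watts)
}
```

The sketch uses floating-point seconds to avoid truncation on short intervals; in the collector itself the previous reading and timestamp are kept on the device struct, which is also why the helper functions now take a `*NvidiaCollectorDevice` pointer instead of a copy.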

View File

@@ -82,5 +82,8 @@ Metrics:
* `nv_nvlink_ecc_errors`
* `nv_nvlink_replay_errors`
* `nv_nvlink_recovery_errors`
+* `nv_energy`
+* `nv_energy_abs`
+* `nv_average_power`
Some metrics add the additional sub type tag (`stype`) like the `nv_nvlink_*` metrics set `stype=nvlink,stype-id=<link_number>`.

View File

@@ -236,13 +236,13 @@ __deprecated__
The cc-metric-collector tries to read the data from the system as it is reported. If available, it tries to read the metric unit from the system as well (e.g. from `/proc/meminfo`). The problem is that, depending on the source, the metric units are named differently. Just think about `byte`, `Byte`, `B`, `bytes`, ...
-The [cc-units](https://github.com/ClusterCockpit/cc-units) package provides us a normalization option to use the same metric unit name for all metrics. If this option is set to true, all `unit` meta tags are normalized.
+The [cc-units](https://github.com/ClusterCockpit/cc-lib/ccUnits) package provides us a normalization option to use the same metric unit name for all metrics. If this option is set to true, all `unit` meta tags are normalized.
## The `change_unit_prefix` section
__deprecated__
-It is often the case that metrics are reported by the system using a rather outdated unit prefix (like `/proc/meminfo` still uses kByte even though current memory sizes are in the GByte range). If you want to change the prefix of a unit, you can do that with the help of [cc-units](https://github.com/ClusterCockpit/cc-units). The setting works on the metric name and requires the new prefix for the metric. The cc-units package determines the scaling factor.
+It is often the case that metrics are reported by the system using a rather outdated unit prefix (like `/proc/meminfo` still uses kByte even though current memory sizes are in the GByte range). If you want to change the prefix of a unit, you can do that with the help of [cc-units](https://github.com/ClusterCockpit/cc-lib/ccUnits). The setting works on the metric name and requires the new prefix for the metric. The cc-units package determines the scaling factor.
# Aggregate metric values of the current interval with the `interval_aggregates` option
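
The two deprecated cc-units options described in the hunk above live in the metric router configuration. A hedged sketch of what such a configuration fragment might look like follows, wrapped in a small Go program only so it is runnable; `change_unit_prefix` is the key named in this section, while the `normalize_units` key and the metric names `mem_total`/`mem_used` are assumptions for illustration, not taken from the text above.

```go
// Hedged sketch: parse a router configuration that enables unit normalization
// and changes the unit prefix of two (assumed) metric names to Giga.
package main

import (
	"encoding/json"
	"fmt"
)

func main() {
	routerConfig := []byte(`{
		"normalize_units": true,
		"change_unit_prefix": {
			"mem_total": "G",
			"mem_used": "G"
		}
	}`)

	var cfg struct {
		NormalizeUnits   bool              `json:"normalize_units"`
		ChangeUnitPrefix map[string]string `json:"change_unit_prefix"`
	}
	if err := json.Unmarshal(routerConfig, &cfg); err != nil {
		fmt.Println("invalid configuration:", err)
		return
	}
	fmt.Printf("normalize units: %v, prefix overrides: %v\n",
		cfg.NormalizeUnits, cfg.ChangeUnitPrefix)
}
```

With a mapping like this, a `mem_used` value reported in kByte would be rescaled to GByte (a factor of 10^-6 for decimal prefixes), with cc-units determining the scaling factor and the `unit` meta tag updated accordingly.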