Add metric 'nv_util_eff' like nvtop

2026-07-18 23:50:37 +02:00 · 2026-05-06 18:57:58 +02:00
9 changed files with 56 additions and 485 deletions
@@ -50,7 +50,6 @@ var AvailableCollectors = map[string]MetricCollector{
 	"nfsiostat":       new(NfsIOStatCollector),
 	"slurm_cgroup":    new(SlurmCgroupCollector),
 	"smartmon":        new(SmartMonCollector),
-	"nvidia_gpm":      new(NvidiaGPMCollector),
 }

 // Metric collector manager data structure
@@ -100,17 +99,17 @@ func (cm *collectorManager) Init(ticker mct.MultiChanTicker, duration time.Durat
 	// Initialize configured collectors
 	for collectorName, collectorCfg := range cm.config {
 		if _, found := AvailableCollectors[collectorName]; !found {
-			cclog.ComponentErrorf("CollectorManager", "SKIP unknown collector %s", collectorName)
+			cclog.ComponentError("CollectorManager", "SKIP unknown collector", collectorName)
 			continue
 		}
 		collector := AvailableCollectors[collectorName]

 		err := collector.Init(collectorCfg)
 		if err != nil {
-			cclog.ComponentErrorf("CollectorManager", "Collector %s initialization failed: %v", collectorName, err)
+			cclog.ComponentError("CollectorManager", fmt.Sprintf("Collector %s initialization failed: %v", collectorName, err))
 			continue
 		}
-		cclog.ComponentDebugf("CollectorManager", "ADD COLLECTOR %s", collector.Name())
+		cclog.ComponentDebug("CollectorManager", "ADD COLLECTOR", collector.Name())
 		if collector.Parallel() {
 			cm.collectors = append(cm.collectors, collector)
 		} else {
@@ -156,7 +155,7 @@ func (cm *collectorManager) Start() {
 						return
 					default:
 						// Read metrics from collector c via goroutine
-						cclog.ComponentDebugf("CollectorManager: Read %s at %v", c.Name(), t)
+						cclog.ComponentDebug("CollectorManager", c.Name(), t)
 						cm.collector_wg.Add(1)
 						go func(myc MetricCollector) {
 							myc.Read(cm.duration, cm.output)
@@ -174,7 +173,7 @@ func (cm *collectorManager) Start() {
 						return
 					default:
 						// Read metrics from collector c
-						cclog.ComponentDebugf("CollectorManager: Read %s at %v", c.Name(), t)
+						cclog.ComponentDebug("CollectorManager", c.Name(), t)
 						c.Read(cm.duration, cm.output)
 					}
 				}
@@ -27,7 +27,6 @@ const CPUSTATFILE = `/proc/stat`

 type CpustatCollectorConfig struct {
 	ExcludeMetrics []string `json:"exclude_metrics,omitempty"`
-	excludeNumCPUs bool
 }

 type CpustatCollector struct {
@@ -80,7 +79,6 @@ func (m *CpustatCollector) Init(config json.RawMessage) error {
 			m.matches[match] = index
 		}
 	}
-	m.config.excludeNumCPUs = slices.Contains(m.config.ExcludeMetrics, "num_cpus")

 	// Check input file
 	file, err := os.Open(CPUSTATFILE)
@@ -97,13 +95,11 @@ func (m *CpustatCollector) Init(config json.RawMessage) error {
 		line := scanner.Text()
 		linefields := strings.Fields(line)
 		if strings.Compare(linefields[0], "cpu") == 0 {
-			// Kernel system statistics for all CPUs
 			m.olddata["cpu"] = make(map[string]int64)
 			for k, v := range m.matches {
 				m.olddata["cpu"][k], _ = strconv.ParseInt(linefields[v], 0, 64)
 			}
 		} else if strings.HasPrefix(linefields[0], "cpu") && strings.Compare(linefields[0], "cpu") != 0 {
-			// Kernel system statistics per CPU
 			cpustr := strings.TrimLeft(linefields[0], "cpu")
 			cpu, _ := strconv.Atoi(cpustr)
 			m.cputags[linefields[0]] = map[string]string{
@@ -195,10 +191,9 @@ func (m *CpustatCollector) Read(interval time.Duration, output chan lp.CCMessage
 		}
 	}

-	if !m.config.excludeNumCPUs {
-		if num_cpus_metric, err := lp.NewMetric("num_cpus", m.nodetags, m.meta, num_cpus, now); err == nil {
-			output <- num_cpus_metric
-		}
+	num_cpus_metric, err := lp.NewMetric("num_cpus", m.nodetags, m.meta, num_cpus, now)
+	if err == nil {
+		output <- num_cpus_metric
 	}

 	m.lastTimestamp = now
@@ -72,8 +72,7 @@ func getStats(filename string) map[string]MemstatStats {
 	for scanner.Scan() {
 		line := scanner.Text()
 		linefields := strings.Fields(line)
-		switch len(linefields) {
-		case 3:
+		if len(linefields) == 3 {
 			v, err := strconv.ParseFloat(linefields[1], 64)
 			if err == nil {
 				stats[strings.Trim(linefields[0], ":")] = MemstatStats{
@@ -81,7 +80,7 @@ func getStats(filename string) map[string]MemstatStats {
 					unit:  linefields[2],
 				}
 			}
-		case 5:
+		} else if len(linefields) == 5 {
 			v, err := strconv.ParseFloat(linefields[3], 64)
 			if err == nil {
 				cclog.ComponentDebug("getStats", strings.Trim(linefields[2], ":"), v, linefields[4])
@@ -107,10 +106,7 @@ func (m *MemstatCollector) Init(config json.RawMessage) error {
 			return fmt.Errorf("%s Init(): Error decoding JSON config: %w", m.name, err)
 		}
 	}
-	m.meta = map[string]string{
-		"source": m.name,
-		"group":  "Memory",
-	}
+	m.meta = map[string]string{"source": m.name, "group": "Memory"}
 	m.stats = make(map[string]int64)
 	m.matches = make(map[string]string)
 	m.tags = map[string]string{"type": "node"}
@@ -149,7 +145,7 @@ func (m *MemstatCollector) Init(config json.RawMessage) error {
 		"KernelStack":     "mem_kernelstack",
 	}
 	for k, v := range matches {
-		if !slices.Contains(m.config.ExcludeMetrics, v) {
+		if !slices.Contains(m.config.ExcludeMetrics, k) {
 			m.matches[k] = v
 		}
 	}
@@ -157,7 +153,7 @@ func (m *MemstatCollector) Init(config json.RawMessage) error {
 	if !slices.Contains(m.config.ExcludeMetrics, "mem_used") {
 		m.sendMemUsed = true
 	}
-	if len(m.matches) == 0 && !m.sendMemUsed {
+	if len(m.matches) == 0 {
 		return fmt.Errorf("%s Init(): no metrics to collect", m.name)
 	}
 	if err := m.setup(); err != nil {
@@ -1,396 +0,0 @@
-package collectors
-
-import (
-	"encoding/json"
-	"errors"
-	"fmt"
-	"slices"
-	"strconv"
-	"strings"
-	"time"
-
-	cclog "github.com/ClusterCockpit/cc-lib/v2/ccLogger"
-	lp "github.com/ClusterCockpit/cc-lib/v2/ccMessage"
-	"github.com/NVIDIA/go-nvml/pkg/nvml"
-)
-
-type NvidiaGPMMetricDef struct {
-	name    string
-	outname string
-	id      nvml.GpmMetricId
-	unit    string
-}
-
-var NvidiaGPMMetrics []NvidiaGPMMetricDef = []NvidiaGPMMetricDef{
-	{
-		name:    "GRAPHICS_UTIL",
-		outname: "nv_gpm_graphics_util",
-		id:      nvml.GPM_METRIC_GRAPHICS_UTIL,
-		unit:    "%",
-	},
-	{
-		name:    "SM_UTIL",
-		outname: "nv_gpm_sm_util",
-		id:      nvml.GPM_METRIC_SM_UTIL,
-		unit:    "%",
-	},
-	{
-		name:    "SM_OCCUPANCY",
-		outname: "nv_gpm_sm_occupancy",
-		id:      nvml.GPM_METRIC_SM_OCCUPANCY,
-		unit:    "%",
-	},
-	{
-		name:    "INTEGER_UTIL",
-		outname: "nv_gpm_integer_util",
-		id:      nvml.GPM_METRIC_INTEGER_UTIL,
-		unit:    "%",
-	},
-	{
-		name:    "ANY_TENSOR_UTIL",
-		outname: "nv_gpm_any_tensor_util",
-		id:      nvml.GPM_METRIC_ANY_TENSOR_UTIL,
-		unit:    "%",
-	},
-	{
-		name:    "DFMA_TENSOR_UTIL",
-		outname: "nv_gpm_dfma_tensor_util",
-		id:      nvml.GPM_METRIC_DFMA_TENSOR_UTIL,
-		unit:    "%",
-	},
-	{
-		name:    "HMMA_TENSOR_UTIL",
-		outname: "nv_gpm_hmma_tensor_util",
-		id:      nvml.GPM_METRIC_HMMA_TENSOR_UTIL,
-		unit:    "%",
-	},
-	{
-		name:    "IMMA_TENSOR_UTIL",
-		outname: "nv_gpm_imma_tensor_util",
-		id:      nvml.GPM_METRIC_IMMA_TENSOR_UTIL,
-		unit:    "%",
-	},
-	{
-		name:    "DRAM_BW_UTIL",
-		outname: "nv_gpm_dram_bw_util",
-		id:      nvml.GPM_METRIC_DRAM_BW_UTIL,
-		unit:    "%",
-	},
-	{
-		name:    "FP64_UTIL",
-		outname: "nv_gpm_fp64_util",
-		id:      nvml.GPM_METRIC_FP64_UTIL,
-		unit:    "%",
-	},
-	{
-		name:    "FP32_UTIL",
-		outname: "nv_gpm_fp32_util",
-		id:      nvml.GPM_METRIC_FP32_UTIL,
-		unit:    "%",
-	},
-	{
-		name:    "FP16_UTIL",
-		outname: "nv_gpm_fp16_util",
-		id:      nvml.GPM_METRIC_FP16_UTIL,
-		unit:    "%",
-	},
-}
-
-type NvidiaGPMCollectorConfig struct {
-	Metrics               []string `json:"metrics,omitempty"`
-	ExcludeDevices        []string `json:"exclude_devices,omitempty"`
-	AddPciInfoTag         bool     `json:"add_pci_info_tag,omitempty"`
-	UsePciInfoAsTypeId    bool     `json:"use_pci_info_as_type_id,omitempty"`
-	AddUuidMeta           bool     `json:"add_uuid_meta,omitempty"`
-	AddBoardNumberMeta    bool     `json:"add_board_number_meta,omitempty"`
-	AddSerialMeta         bool     `json:"add_serial_meta,omitempty"`
-	ProcessMigDevices     bool     `json:"process_mig_devices,omitempty"`
-	UseUuidForMigDevices  bool     `json:"use_uuid_for_mig_device,omitempty"`
-	UseSliceForMigDevices bool     `json:"use_slice_for_mig_device,omitempty"`
-}
-
-type NvidiaGPMCollectorDevice struct {
-	device        nvml.Device
-	tags          map[string]string
-	meta          map[string]string
-	startTime     time.Time
-	endTime       time.Time
-	measurement   nvml.GpmMetricsGetType
-	metricsLookup map[int]NvidiaGPMMetricDef
-}
-
-type NvidiaGPMCollector struct {
-	metricCollector
-
-	config   NvidiaGPMCollectorConfig
-	gpus     []NvidiaGPMCollectorDevice
-	num_gpus int
-}
-
-func (m *NvidiaGPMCollector) Init(config json.RawMessage) error {
-	var err error = nil
-	m.name = "NvidiaGPMCollector"
-	m.parallel = true
-	if err := m.setup(); err != nil {
-		return fmt.Errorf("%s Init(): setup() call failed: %w", m.name, err)
-	}
-	if len(config) > 0 {
-		d := json.NewDecoder(strings.NewReader(string(config)))
-		d.DisallowUnknownFields()
-		if err = d.Decode(&m.config); err != nil {
-			return fmt.Errorf("%s Init(): Error decoding JSON config: %w", m.name, err)
-		}
-	}
-	m.meta = map[string]string{
-		"source": m.name,
-		"group":  "NvidiaGPM",
-	}
-
-	// Initialize NVIDIA Management Library (NVML)
-	ret := nvml.Init()
-
-	// Error: NVML library not found
-	// (nvml.ErrorString can not be used in this case)
-	if ret == nvml.ERROR_LIBRARY_NOT_FOUND {
-		return fmt.Errorf("%s Init(): NVML library not found", m.name)
-	}
-	if ret != nvml.SUCCESS {
-		err = errors.New(nvml.ErrorString(ret))
-		return fmt.Errorf("%s Init(): Unable to initialize NVML: %w", m.name, err)
-	}
-
-	// Number of NVIDIA GPUs
-	num_gpus, ret := nvml.DeviceGetCount()
-	if ret != nvml.SUCCESS {
-		err = errors.New(nvml.ErrorString(ret))
-		return fmt.Errorf("%s Init(): Unable to get device count: %w", m.name, err)
-	}
-
-	// For all GPUs
-	m.gpus = make([]NvidiaGPMCollectorDevice, 0, num_gpus)
-	for i := range num_gpus {
-
-		// Skip excluded devices by ID
-		str_i := strconv.Itoa(i)
-		if slices.Contains(m.config.ExcludeDevices, str_i) {
-			cclog.ComponentDebugf(m.name, "Skipping excluded device %s", str_i)
-			continue
-		}
-
-		// Get device handle
-		device, ret := nvml.DeviceGetHandleByIndex(i)
-		if ret != nvml.SUCCESS {
-			err = errors.New(nvml.ErrorString(ret))
-			cclog.ComponentErrorf(m.name, "Unable to get device at index %d: %s", i, err.Error())
-			continue
-		}
-
-		supportInfo, ret := nvml.GpmQueryDeviceSupport(device)
-		if ret != nvml.SUCCESS {
-			err = errors.New(nvml.ErrorString(ret))
-			cclog.ComponentErrorf(m.name, "Unable to query GPM support for device at index %d: %s", i, err.Error())
-			continue
-		} else {
-			if supportInfo.IsSupportedDevice == uint32(nvml.FEATURE_DISABLED) {
-				cclog.ComponentErrorf(m.name, "Device at index %d does not support GPM metrics", i)
-				continue
-			}
-		}
-
-		stream, ret := nvml.GpmQueryIfStreamingEnabled(device)
-		if ret != nvml.SUCCESS {
-			err = errors.New(nvml.ErrorString(ret))
-			cclog.ComponentErrorf(m.name, "Unable to query GPM streaming for device at index %d: %s", i, err.Error())
-			continue
-		} else {
-			if stream == uint32(nvml.FEATURE_DISABLED) {
-				ret = nvml.GpmSetStreamingEnabled(device, uint32(nvml.FEATURE_ENABLED))
-				if ret != nvml.SUCCESS {
-					err = errors.New(nvml.ErrorString(ret))
-					cclog.ComponentErrorf(m.name, "Unable to set streaming mode for device at index %d: %s", i, err.Error())
-				}
-			}
-		}
-
-		// Get device's PCI info
-		pciInfo, ret := nvml.DeviceGetPciInfo(device)
-		if ret != nvml.SUCCESS {
-			err = errors.New(nvml.ErrorString(ret))
-			cclog.ComponentErrorf(m.name, "Unable to get PCI info for device at index %d: %s", i, err.Error())
-			continue
-		}
-		// Create PCI ID in the common format used by the NVML.
-		pci_id := fmt.Sprintf(
-			nvml.DEVICE_PCI_BUS_ID_FMT,
-			pciInfo.Domain,
-			pciInfo.Bus,
-			pciInfo.Device)
-
-		// Skip excluded devices specified by PCI ID
-		if slices.Contains(m.config.ExcludeDevices, pci_id) {
-			cclog.ComponentDebugf(m.name, "Skipping excluded device %s", pci_id)
-			continue
-		}
-		ss, nvmlErr := nvml.GpmSampleAlloc()
-		if nvmlErr != nvml.SUCCESS {
-			err = errors.New(nvml.ErrorString(ret))
-			cclog.ComponentErrorf(m.name, "Failed to allocate GPM sample for device %d: %s", i, err.Error())
-			continue
-		}
-		es, nvmlErr := nvml.GpmSampleAlloc()
-		if nvmlErr != nvml.SUCCESS {
-			err = errors.New(nvml.ErrorString(ret))
-			cclog.ComponentErrorf(m.name, "Failed to allocate GPM sample for device %d: %s", i, err.Error())
-			continue
-		}
-
-		// Select which value to use as 'type-id'.
-		// The PCI ID is commonly required in SLURM environments because the
-		// numberic IDs used by SLURM and the ones used by NVML might differ
-		// depending on the job type. The PCI ID is more reliable but is commonly
-		// not recorded for a job, so it must be added manually in prologue or epilogue
-		// e.g. to the comment field
-		tid := str_i
-		if m.config.UsePciInfoAsTypeId {
-			tid = pci_id
-		}
-
-		// Now we got all infos together, populate the device list
-		g := NvidiaGPMCollectorDevice{}
-
-		// Add device handle
-		g.device = device
-
-		// Add tags
-		g.tags = map[string]string{
-			"type":    "accelerator",
-			"type-id": tid,
-		}
-
-		// Add PCI info as tag if not already used as 'type-id'
-		if m.config.AddPciInfoTag && !m.config.UsePciInfoAsTypeId {
-			g.tags["pci_identifier"] = pci_id
-		}
-
-		g.meta = map[string]string{
-			"source": m.name,
-			"group":  "Nvidia",
-		}
-
-		if m.config.AddBoardNumberMeta {
-			board, ret := nvml.DeviceGetBoardPartNumber(device)
-			if ret != nvml.SUCCESS {
-				err = errors.New(nvml.ErrorString(ret))
-				cclog.ComponentError(m.name, "Unable to get boart part number for device at index", i, ":", err.Error())
-			} else {
-				g.meta["board_number"] = board
-			}
-		}
-		if m.config.AddSerialMeta {
-			serial, ret := nvml.DeviceGetSerial(device)
-			if ret != nvml.SUCCESS {
-				err = errors.New(nvml.ErrorString(ret))
-				cclog.ComponentError(m.name, "Unable to get serial number for device at index", i, ":", err.Error())
-			} else {
-				g.meta["serial"] = serial
-			}
-		}
-		if m.config.AddUuidMeta {
-			uuid, ret := nvml.DeviceGetUUID(device)
-			if ret != nvml.SUCCESS {
-				err = errors.New(nvml.ErrorString(ret))
-				cclog.ComponentError(m.name, "Unable to get UUID for device at index", i, ":", err.Error())
-			} else {
-				g.meta["uuid"] = uuid
-			}
-		}
-
-		g.measurement.Sample1 = ss
-		g.measurement.Sample2 = es
-		g.measurement.Version = nvml.GPM_METRICS_GET_VERSION
-		g.metricsLookup = make(map[int]NvidiaGPMMetricDef)
-		metIdx := 0
-		for _, inmetric := range m.config.Metrics {
-			for _, defmetric := range NvidiaGPMMetrics {
-				if inmetric == defmetric.outname || inmetric == defmetric.name {
-					g.measurement.Metrics[metIdx] = nvml.GpmMetric{
-						MetricId: uint32(defmetric.id),
-					}
-					g.metricsLookup[metIdx] = defmetric
-					metIdx += 1
-				}
-			}
-		}
-		g.measurement.NumMetrics = uint32(metIdx)
-		m.gpus = append(m.gpus, g)
-	}
-	cclog.ComponentDebugf(m.name, "Found %d Nvidia GPUs with GPM support", len(m.gpus))
-	m.num_gpus = len(m.gpus)
-	m.init = true
-	return err
-}
-
-func (m *NvidiaGPMCollector) Read(interval time.Duration, output chan lp.CCMessage) {
-	var err error
-	if !m.init {
-		return
-	}
-	for i, gpu := range m.gpus {
-		gpu.startTime = time.Now()
-		nvmlErr := gpu.measurement.Sample1.Get(gpu.device)
-		if nvmlErr != nvml.SUCCESS {
-			err = errors.New(nvml.ErrorString(nvmlErr))
-			cclog.ComponentError(m.name, "Unable to get start GPM sample for device at index", i, ":", err.Error())
-			continue
-		}
-	}
-	time.Sleep(interval)
-
-	for i, gpu := range m.gpus {
-		gpu.endTime = time.Now()
-		nvmlErr := gpu.measurement.Sample2.Get(gpu.device)
-		if nvmlErr != nvml.SUCCESS {
-			err = errors.New(nvml.ErrorString(nvmlErr))
-			cclog.ComponentError(m.name, "Unable to get stop GPM sample for device at index", i, ":", err.Error())
-			continue
-		}
-	}
-
-	for i, gpu := range m.gpus {
-		nvmlErr := nvml.GpmMetricsGet(&gpu.measurement)
-		if nvmlErr != nvml.SUCCESS {
-			err = errors.New(nvml.ErrorString(nvmlErr))
-			cclog.ComponentError(m.name, "Unable to get evaluate GPM sample for device at index", i, ":", err.Error())
-			continue
-		}
-		for idx, metricDef := range gpu.metricsLookup {
-			y, err := lp.NewMetric(metricDef.outname, gpu.tags, gpu.meta, gpu.measurement.Metrics[idx].Value, time.Now())
-			if err == nil {
-				y.AddMeta("unit", metricDef.unit)
-				output <- y
-			}
-		}
-	}
-
-}
-
-func (m *NvidiaGPMCollector) Close() {
-	if m.init {
-		for i, gpu := range m.gpus {
-			ret := gpu.measurement.Sample1.Free()
-			if ret != nvml.SUCCESS {
-				err := errors.New(nvml.ErrorString(ret))
-				cclog.ComponentErrorf(m.name, "Unable to free start sample for device at index %d: %s", i, err.Error())
-			}
-			ret = gpu.measurement.Sample2.Free()
-			if ret != nvml.SUCCESS {
-				err := errors.New(nvml.ErrorString(ret))
-				cclog.ComponentErrorf(m.name, "Unable to free stop sample for device at index %d: %s", i, err.Error())
-			}
-		}
-		if ret := nvml.Shutdown(); ret != nvml.SUCCESS {
-			cclog.ComponentError(m.name, "nvml.Shutdown() not successful")
-		}
-		m.init = false
-	}
-}
@@ -1,54 +0,0 @@
-<!--
---
-title: "Nvidia NVML GPM metric collector"
-description: Collect metrics for Nvidia GPUs using the NVML GPM interface
-categories: [cc-metric-collector]
-tags: ['Admin']
-weight: 2
-hugo_path: docs/reference/cc-metric-collector/collectors/nvidiaGPM.md
---
-->
-
-## `nvidiaGPM` collector
-
-```json
-  "nvidia_gpm": {
-    "metrics": [
-      "nv_fb_mem_used",
-      "nv_fan"
-    ],
-    "exclude_devices": [
-      "0","1", "0000000:ff:01.0"
-    ],
-    
-    "process_mig_devices": false,
-    "use_pci_info_as_type_id": true,
-    "add_pci_info_tag": false,
-    "add_uuid_meta": false,
-    "add_board_number_meta": false,
-    "add_serial_meta": false,
-    "use_uuid_for_mig_device": false,
-    "use_slice_for_mig_device": false
-  }
-```
-
-The `nvidia_gpm` collector can be configured to leave out specific devices with the `exclude_devices` option. It takes IDs as supplied to the NVML with `nvmlDeviceGetHandleByIndex()` or the PCI address in NVML format (`%08X:%02X:%02X.0`). Commonly only the physical GPUs are monitored. If MIG devices should be analyzed as well, set `process_mig_devices` (adds `stype=mig,stype-id=<mig_index>`). With the options `use_uuid_for_mig_device` and `use_slice_for_mig_device`, the `<mig_index>` can be replaced with the UUID (e.g. `MIG-6a9f7cc8-6d5b-5ce0-92de-750edc4d8849`) or the MIG slice name (e.g. `1g.5gb`).
-
-The metrics sent by the `nvidia_gpm` collector use `accelerator` as `type` tag. For the `type-id`, it uses the device handle index by default. With the `use_pci_info_as_type_id` option, the PCI ID is used instead. If both values should be added as tags, activate the `add_pci_info_tag` option. It uses the device handle index as `type-id` and adds the PCI ID as separate `pci_identifier` tag.
-
-Optionally, it is possible to add the UUID, the board part number and the serial to the meta informations. They are not sent to the sinks (if not configured otherwise).
-
-
-Available Metrics:
-* `nv_gpm_graphics_util`
-* `nv_gpm_sm_util`
-* `nv_gpm_sm_occupancy`
-* `nv_gpm_integer_util`
-* `nv_gpm_any_tensor_util`
-* `nv_gpm_dfma_tensor_util`
-* `nv_gpm_hmma_tensor_util`
-* `nv_gpm_imma_tensor_util`
-* `nv_gpm_dram_bw_util`
-* `nv_gpm_fp64_util`
-* `nv_gpm_fp32_util`
-* `nv_gpm_fp16_util`
@@ -1115,6 +1115,31 @@ func readNVLinkStats(device *NvidiaCollectorDevice, output chan lp.CCMessage) er
 	return nil
 }

+// readEfficiency sends 'nv_util_eff' (GPU utilization scaled by power usage / enforced power limit, capped at 100); it always returns nil.
+func readEfficiency(device *NvidiaCollectorDevice, output chan lp.CCMessage) error {
+	if device.excludeMetrics["nv_util_eff"] {
+		return nil
+	}
+	maxPower, ret := nvml.DeviceGetEnforcedPowerLimit(device.device)
+	if ret != nvml.SUCCESS || maxPower == 0 { // a zero limit would turn the ratio into Inf/NaN
+		return nil
+	}
+	curPower, ret := nvml.DeviceGetPowerUsage(device.device)
+	if ret != nvml.SUCCESS {
+		return nil
+	}
+	util, ret := nvml.DeviceGetUtilizationRates(device.device)
+	if ret != nvml.SUCCESS {
+		return nil
+	}
+	eff := min(float64(util.Gpu)*(float64(curPower)/float64(maxPower)), 100) // clamp before the integer conversion
+	if y, err := lp.NewMetric("nv_util_eff", device.tags, device.meta, uint32(eff), time.Now()); err == nil {
+		y.AddMeta("unit", "%") // attach the unit as metadata so it does not become part of the tag set
+		output <- y
+	}
+	return nil
+}
+
 func (m *NvidiaCollector) Read(interval time.Duration, output chan lp.CCMessage) {
 	var err error
 	if !m.init {
@@ -1220,6 +1245,11 @@ func (m *NvidiaCollector) Read(interval time.Duration, output chan lp.CCMessage)
 		if err != nil {
 			cclog.ComponentDebug(m.name, "readNVLinkStats for device", name, "failed")
 		}
+
+		err = readEfficiency(device, output)
+		if err != nil {
+			cclog.ComponentDebug(m.name, "readEfficiency for device", name, "failed")
+		}
 	}

 	// Actual read loop over all attached Nvidia GPUs
@@ -85,5 +85,6 @@ Metrics:
 * `nv_energy`
 * `nv_energy_abs`
 * `nv_average_power`
+* `nv_util_eff` (`nv_util` * (`nv_power_usage` / `nv_power_max_limit`), capped at 100)

 Some metrics add the additional sub type tag (`stype`) like the `nv_nvlink_*` metrics set `stype=nvlink,stype-id=<link_number>`. 
@@ -7,10 +7,10 @@ require (
 	github.com/ClusterCockpit/go-rocm-smi v0.4.0
 	github.com/NVIDIA/go-nvml v0.13.0-1
 	github.com/PaesslerAG/gval v1.2.4
-	github.com/fsnotify/fsnotify v1.10.1
-	github.com/tklauser/go-sysconf v0.4.0
+	github.com/fsnotify/fsnotify v1.10.0
+	github.com/tklauser/go-sysconf v0.3.16
 	golang.design/x/thread v0.0.0-20210122121316-335e9adffdf1
-	golang.org/x/sys v0.45.0
+	golang.org/x/sys v0.43.0
 )

 require (
@@ -38,7 +38,7 @@ require (
 	github.com/santhosh-tekuri/jsonschema/v5 v5.3.1 // indirect
 	github.com/shopspring/decimal v1.4.0 // indirect
 	github.com/stmcginnis/gofish v0.21.6 // indirect
-	github.com/tklauser/numcpus v0.12.0 // indirect
+	github.com/tklauser/numcpus v0.11.0 // indirect
 	go.yaml.in/yaml/v2 v2.4.4 // indirect
 	golang.org/x/crypto v0.50.0 // indirect
 	golang.org/x/net v0.53.0 // indirect
@@ -53,8 +53,8 @@ github.com/expr-lang/expr v1.17.8 h1:W1loDTT+0PQf5YteHSTpju2qfUfNoBt4yw9+wOEU9VM
 github.com/expr-lang/expr v1.17.8/go.mod h1:8/vRC7+7HBzESEqt5kKpYXxrxkr31SaO8r40VO/1IT4=
 github.com/frankban/quicktest v1.13.0 h1:yNZif1OkDfNoDfb9zZa9aXIpejNR4F23Wely0c+Qdqk=
 github.com/frankban/quicktest v1.13.0/go.mod h1:qLE0fzW0VuyUAJgPU19zByoIr0HtCHN/r/VLSOOIySU=
-github.com/fsnotify/fsnotify v1.10.1 h1:b0/UzAf9yR5rhf3RPm9gf3ehBPpf0oZKIjtpKrx59Ho=
-github.com/fsnotify/fsnotify v1.10.1/go.mod h1:TLheqan6HD6GBK6PrDWyDPBaEV8LspOxvPSjC+bVfgo=
+github.com/fsnotify/fsnotify v1.10.0 h1:Xx/5Ydg9CeBDX/wi4VJqStNtohYjitZhhlHt4h3St1M=
+github.com/fsnotify/fsnotify v1.10.0/go.mod h1:TLheqan6HD6GBK6PrDWyDPBaEV8LspOxvPSjC+bVfgo=
 github.com/go-ole/go-ole v1.3.0 h1:Dt6ye7+vXGIKZ7Xtk4s6/xVdGDQynvom7xCFEdWr6uE=
 github.com/go-ole/go-ole v1.3.0/go.mod h1:5LS6F96DhAwUc7C+1HLexzMXY1xGRSryjyPPKW6zv78=
 github.com/gogo/protobuf v1.3.2 h1:Ov1cvc58UF3b5XjBnZv7+opcTcQFZebYjWzi34vdm4Q=
@@ -163,10 +163,10 @@ github.com/stretchr/testify v1.11.1 h1:7s2iGBzp5EwR7/aIZr8ao5+dra3wiQyKjjFuvgVKu
 github.com/stretchr/testify v1.11.1/go.mod h1:wZwfW3scLgRK+23gO65QZefKpKQRnfz6sD981Nm4B6U=
 github.com/testcontainers/testcontainers-go v0.26.0 h1:uqcYdoOHBy1ca7gKODfBd9uTHVK3a7UL848z09MVZ0c=
 github.com/testcontainers/testcontainers-go v0.26.0/go.mod h1:ICriE9bLX5CLxL9OFQ2N+2N+f+803LNJ1utJb1+Inx0=
-github.com/tklauser/go-sysconf v0.4.0 h1:7H0uAN+7RkwWRaxhYXDLqa5V3LPrJeV8wmD9dRUgPQU=
-github.com/tklauser/go-sysconf v0.4.0/go.mod h1:8mTNWyog7H+MpKijp4VmKJAd2bbYQ2zuUwkYRbUArPI=
-github.com/tklauser/numcpus v0.12.0 h1:NR85qdvHA9pFse3x3weVZ0r0ST8R6l5RHbZrlRaqob4=
-github.com/tklauser/numcpus v0.12.0/go.mod h1:ABHeXzJnr/qqwguhClkZKT1/8VABcYrsyUiUGobwWJg=
+github.com/tklauser/go-sysconf v0.3.16 h1:frioLaCQSsF5Cy1jgRBrzr6t502KIIwQ0MArYICU0nA=
+github.com/tklauser/go-sysconf v0.3.16/go.mod h1:/qNL9xxDhc7tx3HSRsLWNnuzbVfh3e7gh/BmM179nYI=
+github.com/tklauser/numcpus v0.11.0 h1:nSTwhKH5e1dMNsCdVBukSZrURJRoHbSEQjdEbY+9RXw=
+github.com/tklauser/numcpus v0.11.0/go.mod h1:z+LwcLq54uWZTX0u/bGobaV34u6V7KNlTZejzM6/3MQ=
 github.com/yusufpapurcu/wmi v1.2.3 h1:E1ctvB7uKFMOJw3fdOW32DwGE9I7t++CRUEMKvFoFiw=
 github.com/yusufpapurcu/wmi v1.2.3/go.mod h1:SBZ9tNy3G9/m5Oi98Zks0QjeHVDvuK0qfxQmPyzfmi0=
 go.uber.org/goleak v1.3.0 h1:2K3zAYmnTNqV73imy9J1T3WC+gmCePx2hEGkimedGto=
@@ -184,8 +184,8 @@ golang.org/x/mod v0.13.0/go.mod h1:hTbmBsO62+eylJbnUtE2MGJUyE7QWk4xUqPFrRgJ+7c=
 golang.org/x/net v0.53.0 h1:d+qAbo5L0orcWAr0a9JweQpjXF19LMXJE8Ey7hwOdUA=
 golang.org/x/net v0.53.0/go.mod h1:JvMuJH7rrdiCfbeHoo3fCQU24Lf5JJwT9W3sJFulfgs=
 golang.org/x/sys v0.0.0-20210122093101-04d7465088b8/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
-golang.org/x/sys v0.45.0 h1:dO4czNzziLiiXplLQgBCEpCvXQ3dnkn0SdaZSYdQ+FY=
-golang.org/x/sys v0.45.0/go.mod h1:4GL1E5IUh+htKOUEOaiffhrAeqysfVGipDYzABqnCmw=
+golang.org/x/sys v0.43.0 h1:Rlag2XtaFTxp19wS8MXlJwTvoh8ArU6ezoyFsMyCTNI=
+golang.org/x/sys v0.43.0/go.mod h1:4GL1E5IUh+htKOUEOaiffhrAeqysfVGipDYzABqnCmw=
 golang.org/x/time v0.15.0 h1:bbrp8t3bGUeFOx08pvsMYRTCVSMk89u4tKbNOZbp88U=
 golang.org/x/time v0.15.0/go.mod h1:Y4YMaQmXwGQZoFaVFk4YpCt4FLQMYKZe9oeV/f4MSno=
 golang.org/x/tools v0.14.0 h1:jvNa2pY0M4r62jkRQ6RwEZZyPcymeL9XZMLBbV7U2nc=