From 0f7c4712fa30e2d3383fa5aded2607a87e4bf93d Mon Sep 17 00:00:00 2001
From: Thomas Roehl <Thomas.Roehl@googlemail.com>
Date: Thu, 12 May 2022 14:30:08 +0200
Subject: [PATCH] Update NvidiaCollector with new metrics, MIG and NvLink
 support

---
 collectors/nvidiaMetric.go | 1333 +++++++++++++++++++++++++++---------
 collectors/nvidiaMetric.md |   66 +-
 2 files changed, 1065 insertions(+), 334 deletions(-)

diff --git a/collectors/nvidiaMetric.go b/collectors/nvidiaMetric.go
index 24f0855..3828284 100644
--- a/collectors/nvidiaMetric.go
+++ b/collectors/nvidiaMetric.go
@@ -13,22 +13,28 @@ import (
 )
 
 type NvidiaCollectorConfig struct {
-	ExcludeMetrics []string `json:"exclude_metrics,omitempty"`
-	ExcludeDevices []string `json:"exclude_devices,omitempty"`
-	AddPciInfoTag  bool     `json:"add_pci_info_tag,omitempty"`
+	ExcludeMetrics     []string `json:"exclude_metrics,omitempty"`
+	ExcludeDevices     []string `json:"exclude_devices,omitempty"`
+	AddPciInfoTag      bool     `json:"add_pci_info_tag,omitempty"`
+	UsePciInfoAsTypeId bool     `json:"use_pci_info_as_type_id,omitempty"`
+	AddUuidMeta        bool     `json:"add_uuid_meta,omitempty"`
+	AddBoardNumberMeta bool     `json:"add_board_number_meta,omitempty"`
+	AddSerialMeta      bool     `json:"add_serial_meta,omitempty"`
+	ProcessMigDevices  bool     `json:"process_mig_devices,omitempty"`
 }
 
 type NvidiaCollectorDevice struct {
 	device         nvml.Device
 	excludeMetrics map[string]bool
 	tags           map[string]string
+	meta           map[string]string
 }
 
 type NvidiaCollector struct {
 	metricCollector
-	num_gpus int
 	config   NvidiaCollectorConfig
 	gpus     []NvidiaCollectorDevice
+	num_gpus int
 }
 
 func (m *NvidiaCollector) CatchPanic() {
@@ -42,6 +48,7 @@ func (m *NvidiaCollector) Init(config json.RawMessage) error {
 	var err error
 	m.name = "NvidiaCollector"
 	m.config.AddPciInfoTag = false
+	m.config.UsePciInfoAsTypeId = false
 	m.setup()
 	if len(config) > 0 {
 		err = json.Unmarshal(config, &m.config)
@@ -54,7 +61,6 @@ func (m *NvidiaCollector) Init(config json.RawMessage) error {
 		"group":  "Nvidia",
 	}
 
-	m.num_gpus = 0
 	defer m.CatchPanic()
 
 	// Initialize NVIDIA Management Library (NVML)
@@ -74,13 +80,14 @@ func (m *NvidiaCollector) Init(config json.RawMessage) error {
 	}
 
 	// For all GPUs
+	idx := 0
 	m.gpus = make([]NvidiaCollectorDevice, num_gpus)
 	for i := 0; i < num_gpus; i++ {
-		g := &m.gpus[i]
 
-		// Skip excluded devices
+		// Skip excluded devices by ID
 		str_i := fmt.Sprintf("%d", i)
 		if _, skip := stringArrayContains(m.config.ExcludeDevices, str_i); skip {
+			cclog.ComponentDebug(m.name, "Skipping excluded device", str_i)
 			continue
 		}
 
@@ -89,14 +96,85 @@ func (m *NvidiaCollector) Init(config json.RawMessage) error {
 		if ret != nvml.SUCCESS {
 			err = errors.New(nvml.ErrorString(ret))
 			cclog.ComponentError(m.name, "Unable to get device at index", i, ":", err.Error())
-			return err
+			continue
 		}
+
+		// Get device's PCI info
+		pciInfo, ret := nvml.DeviceGetPciInfo(device)
+		if ret != nvml.SUCCESS {
+			err = errors.New(nvml.ErrorString(ret))
+			cclog.ComponentError(m.name, "Unable to get PCI info for device at index", i, ":", err.Error())
+			continue
+		}
+		// Create the PCI ID in the common format used by NVML.
+		pci_id := fmt.Sprintf(
+			nvml.DEVICE_PCI_BUS_ID_FMT,
+			pciInfo.Domain,
+			pciInfo.Bus,
+			pciInfo.Device)
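+		// DEVICE_PCI_BUS_ID_FMT expands to "%08X:%02X:%02X.0",
+		// e.g. "00000000:3B:00.0" (illustrative address)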
+
+		// Skip excluded devices specified by PCI ID
+		if _, skip := stringArrayContains(m.config.ExcludeDevices, pci_id); skip {
+			cclog.ComponentDebug(m.name, "Skipping excluded device", pci_id)
+			continue
+		}
+
+		// Select which value to use as 'type-id'.
+		// The PCI ID is commonly required in SLURM environments because the
+		// numeric IDs used by SLURM and the ones used by NVML might differ
+		// depending on the job type. The PCI ID is more reliable but is commonly
+		// not recorded for a job, so it must be added manually, e.g. to the
+		// comment field in the prologue or epilogue.
+		tid := str_i
+		if m.config.UsePciInfoAsTypeId {
+			tid = pci_id
+		}
+
+		// Now that we have all the information, populate the device list
+		g := &m.gpus[idx]
+
+		// Add device handle
 		g.device = device
 
 		// Add tags
 		g.tags = map[string]string{
 			"type":    "accelerator",
-			"type-id": str_i,
+			"type-id": tid,
+		}
+
+		// Add PCI info as tag if not already used as 'type-id'
+		if m.config.AddPciInfoTag && !m.config.UsePciInfoAsTypeId {
+			g.tags["pci_identifier"] = pci_id
+		}
+
+		g.meta = map[string]string{
+			"source": m.name,
+			"group":  "Nvidia",
+		}
+
+		if m.config.AddBoardNumberMeta {
+			board, ret := nvml.DeviceGetBoardPartNumber(device)
+			if ret != nvml.SUCCESS {
+				cclog.ComponentError(m.name, "Unable to get boart part number for device at index", i, ":", err.Error())
+			} else {
+				g.meta["board_number"] = board
+			}
+		}
+		if m.config.AddSerialMeta {
+			serial, ret := nvml.DeviceGetSerial(device)
+			if ret != nvml.SUCCESS {
+				cclog.ComponentError(m.name, "Unable to get serial number for device at index", i, ":", err.Error())
+			} else {
+				g.meta["serial"] = serial
+			}
+		}
+		if m.config.AddUuidMeta {
+			uuid, ret := nvml.DeviceGetUUID(device)
+			if ret != nvml.SUCCESS {
+				cclog.ComponentError(m.name, "Unable to get UUID for device at index", i, ":", err.Error())
+			} else {
+				g.meta["uuid"] = uuid
+			}
 		}
 
 		// Add excluded metrics
@@ -105,363 +183,982 @@ func (m *NvidiaCollector) Init(config json.RawMessage) error {
 			g.excludeMetrics[e] = true
 		}
 
-		// Add PCI info as tag
-		if m.config.AddPciInfoTag {
-			pciInfo, ret := nvml.DeviceGetPciInfo(g.device)
-			if ret != nvml.SUCCESS {
-				err = errors.New(nvml.ErrorString(ret))
-				cclog.ComponentError(m.name, "Unable to get PCI info for device at index", i, ":", err.Error())
-				return err
-			}
-			g.tags["pci_identifier"] = fmt.Sprintf(
-				"%08X:%02X:%02X.0",
-				pciInfo.Domain,
-				pciInfo.Bus,
-				pciInfo.Device)
-		}
+		// Increment the index for the next device
+		idx++
 	}
+	m.num_gpus = idx
 
 	m.init = true
 	return nil
 }
 
+func readMemoryInfo(device NvidiaCollectorDevice, output chan lp.CCMetric) error {
+	if !device.excludeMetrics["nv_fb_mem_total"] || !device.excludeMetrics["nv_fb_mem_used"] || !device.excludeMetrics["nv_fb_mem_reserved"] {
+		var total uint64
+		var used uint64
+		// Reserved memory is only reported by NVML's v2 memory info query;
+		// with the v1 call used here, v2 stays false and the
+		// nv_fb_mem_reserved metric is never emitted.
+		var reserved uint64 = 0
+		var v2 bool = false
+		meminfo, ret := nvml.DeviceGetMemoryInfo(device.device)
+		if ret != nvml.SUCCESS {
+			err := errors.New(nvml.ErrorString(ret))
+			return err
+		}
+		total = meminfo.Total
+		used = meminfo.Used
+
+		if !device.excludeMetrics["nv_fb_mem_total"] {
+			t := float64(total) / (1024 * 1024)
+			y, err := lp.New("nv_fb_mem_total", device.tags, device.meta, map[string]interface{}{"value": t}, time.Now())
+			if err == nil {
+				y.AddMeta("unit", "MByte")
+				output <- y
+			}
+		}
+
+		if !device.excludeMetrics["nv_fb_mem_used"] {
+			f := float64(used) / (1024 * 1024)
+			y, err := lp.New("nv_fb_mem_used", device.tags, device.meta, map[string]interface{}{"value": f}, time.Now())
+			if err == nil {
+				y.AddMeta("unit", "MByte")
+				output <- y
+			}
+		}
+
+		if v2 && !device.excludeMetrics["nv_fb_mem_reserved"] {
+			r := float64(reserved) / (1024 * 1024)
+			y, err := lp.New("nv_fb_mem_reserved", device.tags, device.meta, map[string]interface{}{"value": r}, time.Now())
+			if err == nil {
+				y.AddMeta("unit", "MByte")
+				output <- y
+			}
+		}
+	}
+	return nil
+}
+
+func readBarMemoryInfo(device NvidiaCollectorDevice, output chan lp.CCMetric) error {
+	if !device.excludeMetrics["nv_bar1_mem_total"] || !device.excludeMetrics["nv_bar1_mem_used"] {
+		meminfo, ret := nvml.DeviceGetBAR1MemoryInfo(device.device)
+		if ret != nvml.SUCCESS {
+			err := errors.New(nvml.ErrorString(ret))
+			return err
+		}
+		if !device.excludeMetrics["nv_bar1_mem_total"] {
+			t := float64(meminfo.Bar1Total) / (1024 * 1024)
+			y, err := lp.New("nv_bar1_mem_total", device.tags, device.meta, map[string]interface{}{"value": t}, time.Now())
+			if err == nil {
+				y.AddMeta("unit", "MByte")
+				output <- y
+			}
+		}
+		if !device.excludeMetrics["nv_bar1_mem_used"] {
+			t := float64(meminfo.Bar1Used) / (1024 * 1024)
+			y, err := lp.New("nv_bar1_mem_used", device.tags, device.meta, map[string]interface{}{"value": t}, time.Now())
+			if err == nil {
+				y.AddMeta("unit", "MByte")
+				output <- y
+			}
+		}
+	}
+	return nil
+}
+
+func readUtilization(device NvidiaCollectorDevice, output chan lp.CCMetric) error {
+	isMig, ret := nvml.DeviceIsMigDeviceHandle(device.device)
+	if ret != nvml.SUCCESS {
+		err := errors.New(nvml.ErrorString(ret))
+		return err
+	}
+	// Device utilization rates are not supported on MIG device handles,
+	// so skip those here
+	if isMig {
+		return nil
+	}
+
+	if !device.excludeMetrics["nv_util"] || !device.excludeMetrics["nv_mem_util"] {
+		// Retrieves the current utilization rates for the device's major subsystems.
+		//
+		// Available utilization rates
+		// * Gpu: Percent of time over the past sample period during which one or more kernels was executing on the GPU.
+		// * Memory: Percent of time over the past sample period during which global (device) memory was being read or written
+		//
+		// Note:
+		// * During driver initialization when ECC is enabled one can see high GPU and Memory Utilization readings.
+		//   This is caused by ECC Memory Scrubbing mechanism that is performed during driver initialization.
+		// * On MIG-enabled GPUs, querying device utilization rates is not currently supported.
+		util, ret := nvml.DeviceGetUtilizationRates(device.device)
+		if ret == nvml.SUCCESS {
+			if !device.excludeMetrics["nv_util"] {
+				y, err := lp.New("nv_util", device.tags, device.meta, map[string]interface{}{"value": float64(util.Gpu)}, time.Now())
+				if err == nil {
+					y.AddMeta("unit", "%")
+					output <- y
+				}
+			}
+			if !device.excludeMetrics["nv_mem_util"] {
+				y, err := lp.New("nv_mem_util", device.tags, device.meta, map[string]interface{}{"value": float64(util.Memory)}, time.Now())
+				if err == nil {
+					y.AddMeta("unit", "%")
+					output <- y
+				}
+			}
+		}
+	}
+	return nil
+}
+
+func readTemp(device NvidiaCollectorDevice, output chan lp.CCMetric) error {
+	if !device.excludeMetrics["nv_temp"] {
+		// Retrieves the current temperature readings for the device, in degrees C.
+		//
+		// Available temperature sensors:
+		// * TEMPERATURE_GPU: Temperature sensor for the GPU die.
+		// * NVML_TEMPERATURE_COUNT
+		temp, ret := nvml.DeviceGetTemperature(device.device, nvml.TEMPERATURE_GPU)
+		if ret == nvml.SUCCESS {
+			y, err := lp.New("nv_temp", device.tags, device.meta, map[string]interface{}{"value": float64(temp)}, time.Now())
+			if err == nil {
+				y.AddMeta("unit", "degC")
+				output <- y
+			}
+		}
+	}
+	return nil
+}
+
+func readFan(device NvidiaCollectorDevice, output chan lp.CCMetric) error {
+	if !device.excludeMetrics["nv_fan"] {
+		// Retrieves the intended operating speed of the device's fan.
+		//
+		// Note: The reported speed is the intended fan speed.
+		// If the fan is physically blocked and unable to spin, the output will not match the actual fan speed.
+		//
+		// For all discrete products with dedicated fans.
+		//
+		// The fan speed is expressed as a percentage of the product's maximum noise tolerance fan speed.
+		// This value may exceed 100% in certain cases.
+		fan, ret := nvml.DeviceGetFanSpeed(device.device)
+		if ret == nvml.SUCCESS {
+			y, err := lp.New("nv_fan", device.tags, device.meta, map[string]interface{}{"value": float64(fan)}, time.Now())
+			if err == nil {
+				y.AddMeta("unit", "%")
+				output <- y
+			}
+		}
+	}
+	return nil
+}
+
+// func readFans(device NvidiaCollectorDevice, output chan lp.CCMetric) error {
+// 	if !device.excludeMetrics["nv_fan"] {
+// 		numFans, ret := nvml.DeviceGetNumFans(device.device)
+// 		if ret == nvml.SUCCESS {
+// 			for i := 0; i < numFans; i++ {
+// 				fan, ret := nvml.DeviceGetFanSpeed_v2(device.device, i)
+// 				if ret == nvml.SUCCESS {
+// 					y, err := lp.New("nv_fan", device.tags, device.meta, map[string]interface{}{"value": float64(fan)}, time.Now())
+// 					if err == nil {
+// 						y.AddMeta("unit", "%")
+// 						y.AddTag("stype", "fan")
+// 						y.AddTag("stype-id", fmt.Sprintf("%d", i))
+// 						output <- y
+// 					}
+// 				}
+// 			}
+// 		}
+// 	}
+// 	return nil
+// }
+
+func readEccMode(device NvidiaCollectorDevice, output chan lp.CCMetric) error {
+	if !device.excludeMetrics["nv_ecc_mode"] {
+		// Retrieves the current and pending ECC modes for the device.
+		//
+		// For Fermi or newer fully supported devices. Only applicable to devices with ECC.
+		// Requires NVML_INFOROM_ECC version 1.0 or higher.
+		//
+		// Changing ECC modes requires a reboot.
+		// The "pending" ECC mode refers to the target mode following the next reboot.
+		_, ecc_pend, ret := nvml.DeviceGetEccMode(device.device)
+		if ret == nvml.SUCCESS {
+			var y lp.CCMetric
+			var err error
+			switch ecc_pend {
+			case nvml.FEATURE_DISABLED:
+				y, err = lp.New("nv_ecc_mode", device.tags, device.meta, map[string]interface{}{"value": "OFF"}, time.Now())
+			case nvml.FEATURE_ENABLED:
+				y, err = lp.New("nv_ecc_mode", device.tags, device.meta, map[string]interface{}{"value": "ON"}, time.Now())
+			default:
+				y, err = lp.New("nv_ecc_mode", device.tags, device.meta, map[string]interface{}{"value": "UNKNOWN"}, time.Now())
+			}
+			if err == nil {
+				output <- y
+			}
+		} else if ret == nvml.ERROR_NOT_SUPPORTED {
+			y, err := lp.New("nv_ecc_mode", device.tags, device.meta, map[string]interface{}{"value": "N/A"}, time.Now())
+			if err == nil {
+				output <- y
+			}
+		}
+	}
+	return nil
+}
+
+func readPerfState(device NvidiaCollectorDevice, output chan lp.CCMetric) error {
+	if !device.excludeMetrics["nv_perf_state"] {
+		// Retrieves the current performance state for the device.
+		//
+		// Allowed PStates:
+		//  0: Maximum Performance.
+		// ..
+		// 15: Minimum Performance.
+		// 32: Unknown performance state.
+		pState, ret := nvml.DeviceGetPerformanceState(device.device)
+		if ret == nvml.SUCCESS {
+			y, err := lp.New("nv_perf_state", device.tags, device.meta, map[string]interface{}{"value": fmt.Sprintf("P%d", int(pState))}, time.Now())
+			if err == nil {
+				output <- y
+			}
+		}
+	}
+	return nil
+}
+
+func readPowerUsage(device NvidiaCollectorDevice, output chan lp.CCMetric) error {
+	if !device.excludeMetrics["nv_power_usage"] {
+		// Retrieves power usage for this GPU in milliwatts and its associated circuitry (e.g. memory)
+		//
+		// On Fermi and Kepler GPUs the reading is accurate to within +/- 5% of current power draw.
+		//
+		// It is only available if power management mode is supported
+		mode, ret := nvml.DeviceGetPowerManagementMode(device.device)
+		if ret != nvml.SUCCESS {
+			return nil
+		}
+		if mode == nvml.FEATURE_ENABLED {
+			power, ret := nvml.DeviceGetPowerUsage(device.device)
+			if ret == nvml.SUCCESS {
+				y, err := lp.New("nv_power_usage", device.tags, device.meta, map[string]interface{}{"value": float64(power) / 1000}, time.Now())
+				if err == nil {
+					y.AddMeta("unit", "watts")
+					output <- y
+				}
+			}
+		}
+	}
+	return nil
+}
+
+func readClocks(device NvidiaCollectorDevice, output chan lp.CCMetric) error {
+	// Retrieves the current clock speeds for the device.
+	//
+	// Available clock information:
+	// * CLOCK_GRAPHICS: Graphics clock domain.
+	// * CLOCK_SM: Streaming Multiprocessor clock domain.
+	// * CLOCK_MEM: Memory clock domain.
+	if !device.excludeMetrics["nv_graphics_clock"] {
+		graphicsClock, ret := nvml.DeviceGetClockInfo(device.device, nvml.CLOCK_GRAPHICS)
+		if ret == nvml.SUCCESS {
+			y, err := lp.New("nv_graphics_clock", device.tags, device.meta, map[string]interface{}{"value": float64(graphicsClock)}, time.Now())
+			if err == nil {
+				y.AddMeta("unit", "MHz")
+				output <- y
+			}
+		}
+	}
+
+	if !device.excludeMetrics["nv_sm_clock"] {
+		smClock, ret := nvml.DeviceGetClockInfo(device.device, nvml.CLOCK_SM)
+		if ret == nvml.SUCCESS {
+			y, err := lp.New("nv_sm_clock", device.tags, device.meta, map[string]interface{}{"value": float64(smClock)}, time.Now())
+			if err == nil {
+				y.AddMeta("unit", "MHz")
+				output <- y
+			}
+		}
+	}
+
+	if !device.excludeMetrics["nv_mem_clock"] {
+		memClock, ret := nvml.DeviceGetClockInfo(device.device, nvml.CLOCK_MEM)
+		if ret == nvml.SUCCESS {
+			y, err := lp.New("nv_mem_clock", device.tags, device.meta, map[string]interface{}{"value": float64(memClock)}, time.Now())
+			if err == nil {
+				y.AddMeta("unit", "MHz")
+				output <- y
+			}
+		}
+	}
+	if !device.excludeMetrics["nv_video_clock"] {
+		videoClock, ret := nvml.DeviceGetClockInfo(device.device, nvml.CLOCK_VIDEO)
+		if ret == nvml.SUCCESS {
+			y, err := lp.New("nv_video_clock", device.tags, device.meta, map[string]interface{}{"value": float64(videoClock)}, time.Now())
+			if err == nil {
+				y.AddMeta("unit", "MHz")
+				output <- y
+			}
+		}
+	}
+	return nil
+}
+
+func readMaxClocks(device NvidiaCollectorDevice, output chan lp.CCMetric) error {
+	// Retrieves the maximum clock speeds for the device.
+	//
+	// Available clock information:
+	// * CLOCK_GRAPHICS: Graphics clock domain.
+	// * CLOCK_SM:       Streaming multiprocessor clock domain.
+	// * CLOCK_MEM:      Memory clock domain.
+	// * CLOCK_VIDEO:    Video encoder/decoder clock domain.
+	// * CLOCK_COUNT:    Count of clock types.
+	//
+	// Note:
+	// On GPUs from the Fermi family, current P0 clocks (reported by nvmlDeviceGetClockInfo) can differ from max clocks by a few MHz.
+	if !device.excludeMetrics["nv_max_graphics_clock"] {
+		max_gclk, ret := nvml.DeviceGetMaxClockInfo(device.device, nvml.CLOCK_GRAPHICS)
+		if ret == nvml.SUCCESS {
+			y, err := lp.New("nv_max_graphics_clock", device.tags, device.meta, map[string]interface{}{"value": float64(max_gclk)}, time.Now())
+			if err == nil {
+				y.AddMeta("unit", "MHz")
+				output <- y
+			}
+		}
+	}
+
+	if !device.excludeMetrics["nv_max_sm_clock"] {
+		maxSmClock, ret := nvml.DeviceGetMaxClockInfo(device.device, nvml.CLOCK_SM)
+		if ret == nvml.SUCCESS {
+			y, err := lp.New("nv_max_sm_clock", device.tags, device.meta, map[string]interface{}{"value": float64(maxSmClock)}, time.Now())
+			if err == nil {
+				y.AddMeta("unit", "MHz")
+				output <- y
+			}
+		}
+	}
+
+	if !device.excludeMetrics["nv_max_mem_clock"] {
+		maxMemClock, ret := nvml.DeviceGetMaxClockInfo(device.device, nvml.CLOCK_MEM)
+		if ret == nvml.SUCCESS {
+			y, err := lp.New("nv_max_mem_clock", device.tags, device.meta, map[string]interface{}{"value": float64(maxMemClock)}, time.Now())
+			if err == nil {
+				y.AddMeta("unit", "MHz")
+				output <- y
+			}
+		}
+	}
+
+	if !device.excludeMetrics["nv_max_video_clock"] {
+		maxVideoClock, ret := nvml.DeviceGetMaxClockInfo(device.device, nvml.CLOCK_VIDEO)
+		if ret == nvml.SUCCESS {
+			y, err := lp.New("nv_max_video_clock", device.tags, device.meta, map[string]interface{}{"value": float64(maxVideoClock)}, time.Now())
+			if err == nil {
+				y.AddMeta("unit", "MHz")
+				output <- y
+			}
+		}
+	}
+	return nil
+}
+
+func readEccErrors(device NvidiaCollectorDevice, output chan lp.CCMetric) error {
+	if !device.excludeMetrics["nv_ecc_uncorrected_error"] {
+		// Retrieves the total ECC error counts for the device.
+		//
+		// For Fermi or newer fully supported devices.
+		// Only applicable to devices with ECC.
+		// Requires NVML_INFOROM_ECC version 1.0 or higher.
+		// Requires ECC Mode to be enabled.
+		//
+		// The total error count is the sum of errors across each of the separate memory systems,
+		// i.e. the total set of errors across the entire device.
+		ecc_db, ret := nvml.DeviceGetTotalEccErrors(device.device, nvml.MEMORY_ERROR_TYPE_UNCORRECTED, nvml.AGGREGATE_ECC)
+		if ret == nvml.SUCCESS {
+			y, err := lp.New("nv_ecc_uncorrected_error", device.tags, device.meta, map[string]interface{}{"value": float64(ecc_db)}, time.Now())
+			if err == nil {
+				output <- y
+			}
+		}
+	}
+	if !device.excludeMetrics["nv_ecc_corrected_error"] {
+		ecc_sb, ret := nvml.DeviceGetTotalEccErrors(device.device, nvml.MEMORY_ERROR_TYPE_CORRECTED, nvml.AGGREGATE_ECC)
+		if ret == nvml.SUCCESS {
+			y, err := lp.New("nv_ecc_corrected_error", device.tags, device.meta, map[string]interface{}{"value": float64(ecc_sb)}, time.Now())
+			if err == nil {
+				output <- y
+			}
+		}
+	}
+	return nil
+}
+
+func readPowerLimit(device NvidiaCollectorDevice, output chan lp.CCMetric) error {
+	if !device.excludeMetrics["nv_power_max_limit"] {
+		// Retrieves the power management limit associated with this device.
+		//
+		// For Fermi or newer fully supported devices.
+		//
+		// The power limit defines the upper boundary for the card's power draw.
+		// If the card's total power draw reaches this limit the power management algorithm kicks in.
+		pwr_limit, ret := nvml.DeviceGetPowerManagementLimit(device.device)
+		if ret == nvml.SUCCESS {
+			y, err := lp.New("nv_power_max_limit", device.tags, device.meta, map[string]interface{}{"value": float64(pwr_limit) / 1000}, time.Now())
+			if err == nil {
+				y.AddMeta("unit", "watts")
+				output <- y
+			}
+		}
+	}
+	return nil
+}
+
+func readEncUtilization(device NvidiaCollectorDevice, output chan lp.CCMetric) error {
+	isMig, ret := nvml.DeviceIsMigDeviceHandle(device.device)
+	if ret != nvml.SUCCESS {
+		err := errors.New(nvml.ErrorString(ret))
+		return err
+	}
+	// Encoder utilization queries are not supported on MIG device handles
+	if isMig {
+		return nil
+	}
+	if !device.excludeMetrics["nv_encoder_util"] {
+		// Retrieves the current utilization and sampling size in microseconds for the Encoder
+		//
+		// For Kepler or newer fully supported devices.
+		//
+		// Note: On MIG-enabled GPUs, querying encoder utilization is not currently supported.
+		enc_util, _, ret := nvml.DeviceGetEncoderUtilization(device.device)
+		if ret == nvml.SUCCESS {
+			y, err := lp.New("nv_encoder_util", device.tags, device.meta, map[string]interface{}{"value": float64(enc_util)}, time.Now())
+			if err == nil {
+				y.AddMeta("unit", "%")
+				output <- y
+			}
+		}
+	}
+	return nil
+}
+
+func readDecUtilization(device NvidiaCollectorDevice, output chan lp.CCMetric) error {
+	isMig, ret := nvml.DeviceIsMigDeviceHandle(device.device)
+	if ret != nvml.SUCCESS {
+		err := errors.New(nvml.ErrorString(ret))
+		return err
+	}
+	// Decoder utilization queries are not supported on MIG device handles
+	if isMig {
+		return nil
+	}
+	if !device.excludeMetrics["nv_decoder_util"] {
+		// Retrieves the current utilization and sampling size in microseconds for the Decoder
+		//
+		// For Kepler or newer fully supported devices.
+		//
+		// Note: On MIG-enabled GPUs, querying decoder utilization is not currently supported.
+		dec_util, _, ret := nvml.DeviceGetDecoderUtilization(device.device)
+		if ret == nvml.SUCCESS {
+			y, err := lp.New("nv_decoder_util", device.tags, device.meta, map[string]interface{}{"value": float64(dec_util)}, time.Now())
+			if err == nil {
+				y.AddMeta("unit", "%")
+				output <- y
+			}
+		}
+	}
+	return nil
+}
+
+func readRemappedRows(device NvidiaCollectorDevice, output chan lp.CCMetric) error {
+	if !device.excludeMetrics["nv_remapped_rows_corrected"] ||
+		!device.excludeMetrics["nv_remapped_rows_uncorrected"] ||
+		!device.excludeMetrics["nv_remapped_rows_pending"] ||
+		!device.excludeMetrics["nv_remapped_rows_failure"] {
+		// Get number of remapped rows. The number of rows reported will be based on the cause of the remapping.
+		// isPending indicates whether or not there are pending remappings.
+		// A reset will be required to actually remap the row.
+		// failureOccurred will be set if a row remapping ever failed in the past.
+		// A pending remapping won't affect future work on the GPU since error-containment and dynamic page blacklisting will take care of that.
+		//
+		// For Ampere or newer fully supported devices.
+		//
+		// Note: On MIG-enabled GPUs with active instances, querying the number of remapped rows is not supported
+		corrected, uncorrected, pending, failure, ret := nvml.DeviceGetRemappedRows(device.device)
+		if ret == nvml.SUCCESS {
+			if !device.excludeMetrics["nv_remapped_rows_corrected"] {
+				y, err := lp.New("nv_remapped_rows_corrected", device.tags, device.meta, map[string]interface{}{"value": float64(corrected)}, time.Now())
+				if err == nil {
+					output <- y
+				}
+			}
+			if !device.excludeMetrics["nv_remapped_rows_uncorrected"] {
+				y, err := lp.New("nv_remapped_rows_corrected", device.tags, device.meta, map[string]interface{}{"value": float64(uncorrected)}, time.Now())
+				if err == nil {
+					output <- y
+				}
+			}
+			if !device.excludeMetrics["nv_remapped_rows_pending"] {
+				var p int = 0
+				if pending {
+					p = 1
+				}
+				y, err := lp.New("nv_remapped_rows_pending", device.tags, device.meta, map[string]interface{}{"value": p}, time.Now())
+				if err == nil {
+					output <- y
+				}
+			}
+			if !device.excludeMetrics["nv_remapped_rows_failure"] {
+				var f int = 0
+				if failure {
+					f = 1
+				}
+				y, err := lp.New("nv_remapped_rows_failure", device.tags, device.meta, map[string]interface{}{"value": f}, time.Now())
+				if err == nil {
+					output <- y
+				}
+			}
+		}
+	}
+	return nil
+}
+
+func readProcessCounts(device NvidiaCollectorDevice, output chan lp.CCMetric) error {
+	if !device.excludeMetrics["nv_compute_processes"] {
+		// Get information about processes with a compute context on a device
+		//
+		// For Fermi &tm; or newer fully supported devices.
+		//
+		// This function returns information only about compute running processes (e.g. CUDA application which have
+		// active context). Any graphics applications (e.g. using OpenGL, DirectX) won't be listed by this function.
+		//
+		// To query the current number of running compute processes, call this function with *infoCount = 0. The
+		// return code will be NVML_ERROR_INSUFFICIENT_SIZE, or NVML_SUCCESS if none are running. For this call
+		// infos is allowed to be NULL.
+		//
+		// The usedGpuMemory field returned is all of the memory used by the application.
+		//
+		// Keep in mind that information returned by this call is dynamic and the number of elements might change in
+		// time. Allocate more space for the infos table in case new compute processes are spawned.
+		//
+		// @note In MIG mode, if device handle is provided, the API returns aggregate information, only if
+		//        the caller has appropriate privileges. Per-instance information can be queried by using
+		//        specific MIG device handles.
+		//        Querying per-instance information using MIG device handles is not supported if the device is in vGPU Host virtualization mode.
+		procList, ret := nvml.DeviceGetComputeRunningProcesses(device.device)
+		if ret == nvml.SUCCESS {
+			y, err := lp.New("nv_compute_processes", device.tags, device.meta, map[string]interface{}{"value": len(procList)}, time.Now())
+			if err == nil {
+				output <- y
+			}
+		}
+	}
+	if !device.excludeMetrics["nv_graphics_processes"] {
+		// Get information about processes with a graphics context on a device
+		//
+		// For Kepler &tm; or newer fully supported devices.
+		//
+		// This function returns information only about graphics based processes
+		// (eg. applications using OpenGL, DirectX)
+		//
+		// To query the current number of running graphics processes, call this function with *infoCount = 0. The
+		// return code will be NVML_ERROR_INSUFFICIENT_SIZE, or NVML_SUCCESS if none are running. For this call
+		// infos is allowed to be NULL.
+		//
+		// The usedGpuMemory field returned is all of the memory used by the application.
+		//
+		// Keep in mind that information returned by this call is dynamic and the number of elements might change in
+		// time. Allocate more space for the infos table in case new graphics processes are spawned.
+		//
+		// @note In MIG mode, if device handle is provided, the API returns aggregate information, only if
+		//       the caller has appropriate privileges. Per-instance information can be queried by using
+		//       specific MIG device handles.
+		//       Querying per-instance information using MIG device handles is not supported if the device is in vGPU Host virtualization mode.
+		procList, ret := nvml.DeviceGetGraphicsRunningProcesses(device.device)
+		if ret == nvml.SUCCESS {
+			y, err := lp.New("nv_graphics_processes", device.tags, device.meta, map[string]interface{}{"value": len(procList)}, time.Now())
+			if err == nil {
+				output <- y
+			}
+		}
+	}
+	// if !device.excludeMetrics["nv_mps_compute_processes"] {
+	// 	// Get information about processes with a MPS compute context on a device
+	// 	//
+	// 	// For Volta &tm; or newer fully supported devices.
+	// 	//
+	// 	// This function returns information only about compute running processes (e.g. CUDA application which have
+	// 	// active context) utilizing MPS. Any graphics applications (e.g. using OpenGL, DirectX) won't be listed by
+	// 	// this function.
+	// 	//
+	// 	// To query the current number of running compute processes, call this function with *infoCount = 0. The
+	// 	// return code will be NVML_ERROR_INSUFFICIENT_SIZE, or NVML_SUCCESS if none are running. For this call
+	// 	// \a infos is allowed to be NULL.
+	// 	//
+	// 	// The usedGpuMemory field returned is all of the memory used by the application.
+	// 	//
+	// 	// Keep in mind that information returned by this call is dynamic and the number of elements might change in
+	// 	// time. Allocate more space for \a infos table in case new compute processes are spawned.
+	// 	//
+	// 	// @note In MIG mode, if device handle is provided, the API returns aggregate information, only if
+	// 	//       the caller has appropriate privileges. Per-instance information can be queried by using
+	// 	//       specific MIG device handles.
+	// 	//       Querying per-instance information using MIG device handles is not supported if the device is in vGPU Host virtualization mode.
+	// 	procList, ret := nvml.DeviceGetMPSComputeRunningProcesses(device.device)
+	// 	if ret == nvml.SUCCESS {
+	// 		y, err := lp.New("nv_mps_compute_processes", device.tags, device.meta, map[string]interface{}{"value": len(procList)}, time.Now())
+	// 		if err == nil {
+	// 			output <- y
+	// 		}
+	// 	}
+	// }
+	return nil
+}
+
+func readViolationStats(device NvidiaCollectorDevice, output chan lp.CCMetric) error {
+	var violTime nvml.ViolationTime
+	var ret nvml.Return
+
+	// Gets the duration of time during which the device was throttled (lower than requested clocks) due to power
+	//  or thermal constraints.
+	//
+	// The method is important to users who are trying to understand whether their GPUs throttle at any point while their applications run.
+	// The difference in violation times at two different reference times indicates a GPU throttling event.
+	//
+	// Violation for thermal capping is not supported at this time.
+	//
+	// For Kepler or newer fully supported devices.
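+	//
+	// ViolationTime is reported in nanoseconds, hence the 1e-9 conversion
+	// to seconds for each metric below.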
+
+	if !device.excludeMetrics["nv_violation_power"] {
+		// How long did power violations cause the GPU to be below application clocks
+		violTime, ret = nvml.DeviceGetViolationStatus(device.device, nvml.PERF_POLICY_POWER)
+		if ret == nvml.SUCCESS {
+			t := float64(violTime.ViolationTime) * 1e-9
+			y, err := lp.New("nv_violation_power", device.tags, device.meta, map[string]interface{}{"value": t}, time.Now())
+			if err == nil {
+				y.AddMeta("unit", "sec")
+				output <- y
+			}
+		}
+	}
+	if !device.excludeMetrics["nv_violation_thermal"] {
+		// How long did thermal violations cause the GPU to be below application clocks
+		violTime, ret = nvml.DeviceGetViolationStatus(device.device, nvml.PERF_POLICY_THERMAL)
+		if ret == nvml.SUCCESS {
+			t := float64(violTime.ViolationTime) * 1e-9
+			y, err := lp.New("nv_violation_thermal", device.tags, device.meta, map[string]interface{}{"value": t}, time.Now())
+			if err == nil {
+				y.AddMeta("unit", "sec")
+				output <- y
+			}
+		}
+	}
+	if !device.excludeMetrics["nv_violation_sync_boost"] {
+		// How long did sync boost cause the GPU to be below application clocks
+		violTime, ret = nvml.DeviceGetViolationStatus(device.device, nvml.PERF_POLICY_SYNC_BOOST)
+		if ret == nvml.SUCCESS {
+			t := float64(violTime.ViolationTime) * 1e-9
+			y, err := lp.New("nv_violation_sync_boost", device.tags, device.meta, map[string]interface{}{"value": t}, time.Now())
+			if err == nil {
+				y.AddMeta("unit", "sec")
+				output <- y
+			}
+		}
+	}
+	if !device.excludeMetrics["nv_violation_board_limit"] {
+		// How long did the board limit cause the GPU to be below application clocks
+		violTime, ret = nvml.DeviceGetViolationStatus(device.device, nvml.PERF_POLICY_BOARD_LIMIT)
+		if ret == nvml.SUCCESS {
+			t := float64(violTime.ViolationTime) * 1e-9
+			y, err := lp.New("nv_violation_board_limit", device.tags, device.meta, map[string]interface{}{"value": t}, time.Now())
+			if err == nil {
+				y.AddMeta("unit", "sec")
+				output <- y
+			}
+		}
+	}
+	if !device.excludeMetrics["nv_violation_low_util"] {
+		// How long did low utilization cause the GPU to be below application clocks
+		violTime, ret = nvml.DeviceGetViolationStatus(device.device, nvml.PERF_POLICY_LOW_UTILIZATION)
+		if ret == nvml.SUCCESS {
+			t := float64(violTime.ViolationTime) * 1e-9
+			y, err := lp.New("nv_violation_low_util", device.tags, device.meta, map[string]interface{}{"value": t}, time.Now())
+			if err == nil {
+				y.AddMeta("unit", "sec")
+				output <- y
+			}
+		}
+	}
+	if !device.excludeMetrics["nv_violation_reliability"] {
+		// How long did the board reliability limit cause the GPU to be below application clocks
+		violTime, ret = nvml.DeviceGetViolationStatus(device.device, nvml.PERF_POLICY_RELIABILITY)
+		if ret == nvml.SUCCESS {
+			t := float64(violTime.ViolationTime) * 1e-9
+			y, err := lp.New("nv_violation_reliability", device.tags, device.meta, map[string]interface{}{"value": t}, time.Now())
+			if err == nil {
+				y.AddMeta("unit", "sec")
+				output <- y
+			}
+		}
+	}
+	if !device.excludeMetrics["nv_violation_below_app_clock"] {
+		// Total time the GPU was held below application clocks by any limiter (all of above)
+		violTime, ret = nvml.DeviceGetViolationStatus(device.device, nvml.PERF_POLICY_TOTAL_APP_CLOCKS)
+		if ret == nvml.SUCCESS {
+			t := float64(violTime.ViolationTime) * 1e-9
+			y, err := lp.New("nv_violation_below_app_clock", device.tags, device.meta, map[string]interface{}{"value": t}, time.Now())
+			if err == nil {
+				y.AddMeta("unit", "sec")
+				output <- y
+			}
+		}
+	}
+	if !device.excludeMetrics["nv_violation_below_base_clock"] {
+		// Total time the GPU was held below base clocks
+		violTime, ret = nvml.DeviceGetViolationStatus(device.device, nvml.PERF_POLICY_TOTAL_BASE_CLOCKS)
+		if ret == nvml.SUCCESS {
+			t := float64(violTime.ViolationTime) * 1e-9
+			y, err := lp.New("nv_violation_below_base_clock", device.tags, device.meta, map[string]interface{}{"value": t}, time.Now())
+			if err == nil {
+				y.AddMeta("unit", "sec")
+				output <- y
+			}
+		}
+	}
+
+	return nil
+}
+
+func readNVLinkStats(device NvidiaCollectorDevice, output chan lp.CCMetric) error {
+	// Retrieves the specified error counter value.
+	// Refer to nvmlNvLinkErrorCounter_t for the error counters that are available.
+	//
+	// For Pascal or newer fully supported devices.
+
+	for i := 0; i < nvml.NVLINK_MAX_LINKS; i++ {
+		state, ret := nvml.DeviceGetNvLinkState(device.device, i)
+		if ret == nvml.SUCCESS {
+			if state == nvml.FEATURE_ENABLED {
+				if !device.excludeMetrics["nv_nvlink_crc_errors"] {
+					// Data link receive data CRC error counter
+					count, ret := nvml.DeviceGetNvLinkErrorCounter(device.device, i, nvml.NVLINK_ERROR_DL_CRC_DATA)
+					if ret == nvml.SUCCESS {
+						y, err := lp.New("nv_nvlink_crc_errors", device.tags, device.meta, map[string]interface{}{"value": count}, time.Now())
+						if err == nil {
+							y.AddTag("stype", "nvlink")
+							y.AddTag("stype-id", fmt.Sprintf("%d", i))
+							output <- y
+						}
+					}
+				}
+				if !device.excludeMetrics["nv_nvlink_ecc_errors"] {
+					// Data link receive data ECC error counter
+					count, ret := nvml.DeviceGetNvLinkErrorCounter(device.device, i, nvml.NVLINK_ERROR_DL_ECC_DATA)
+					if ret == nvml.SUCCESS {
+						y, err := lp.New("nv_nvlink_ecc_errors", device.tags, device.meta, map[string]interface{}{"value": count}, time.Now())
+						if err == nil {
+							y.AddTag("stype", "nvlink")
+							y.AddTag("stype-id", fmt.Sprintf("%d", i))
+							output <- y
+						}
+					}
+				}
+				if !device.excludeMetrics["nv_nvlink_replay_errors"] {
+					// Data link transmit replay error counter
+					count, ret := nvml.DeviceGetNvLinkErrorCounter(device.device, i, nvml.NVLINK_ERROR_DL_REPLAY)
+					if ret == nvml.SUCCESS {
+						y, err := lp.New("nv_nvlink_replay_errors", device.tags, device.meta, map[string]interface{}{"value": count}, time.Now())
+						if err == nil {
+							y.AddTag("stype", "nvlink")
+							y.AddTag("stype-id", fmt.Sprintf("%d", i))
+							output <- y
+						}
+					}
+				}
+				if !device.excludeMetrics["nv_nvlink_recovery_errors"] {
+					// Data link transmit recovery error counter
+					count, ret := nvml.DeviceGetNvLinkErrorCounter(device.device, i, nvml.NVLINK_ERROR_DL_RECOVERY)
+					if ret == nvml.SUCCESS {
+						y, err := lp.New("nv_nvlink_recovery_errors", device.tags, device.meta, map[string]interface{}{"value": count}, time.Now())
+						if err == nil {
+							y.AddTag("stype", "nvlink")
+							y.AddTag("stype-id", fmt.Sprintf("%d", i))
+							output <- y
+						}
+					}
+				}
+				if !device.excludeMetrics["nv_nvlink_crc_flit_errors"] {
+					// Data link receive flow control digit CRC error counter
+					count, ret := nvml.DeviceGetNvLinkErrorCounter(device.device, i, nvml.NVLINK_ERROR_DL_CRC_FLIT)
+					if ret == nvml.SUCCESS {
+						y, err := lp.New("nv_nvlink_crc_flit_errors", device.tags, device.meta, map[string]interface{}{"value": count}, time.Now())
+						if err == nil {
+							y.AddTag("stype", "nvlink")
+							y.AddTag("stype-id", fmt.Sprintf("%d", i))
+							output <- y
+						}
+					}
+				}
+			}
+		}
+	}
+	return nil
+}
+
 func (m *NvidiaCollector) Read(interval time.Duration, output chan lp.CCMetric) {
+	var err error
 	if !m.init {
 		return
 	}
 
-	for i := range m.gpus {
-		device := &m.gpus[i]
-
-		if !device.excludeMetrics["nv_util"] || !device.excludeMetrics["nv_mem_util"] {
-			// Retrieves the current utilization rates for the device's major subsystems.
-			//
-			// Available utilization rates
-			// * Gpu: Percent of time over the past sample period during which one or more kernels was executing on the GPU.
-			// * Memory: Percent of time over the past sample period during which global (device) memory was being read or written
-			//
-			// Note:
-			// * During driver initialization when ECC is enabled one can see high GPU and Memory Utilization readings.
-			//   This is caused by ECC Memory Scrubbing mechanism that is performed during driver initialization.
-			// * On MIG-enabled GPUs, querying device utilization rates is not currently supported.
-			util, ret := nvml.DeviceGetUtilizationRates(device.device)
-			if ret == nvml.SUCCESS {
-				if !device.excludeMetrics["nv_util"] {
-					y, err := lp.New("nv_util", device.tags, m.meta, map[string]interface{}{"value": float64(util.Gpu)}, time.Now())
-					if err == nil {
-						y.AddMeta("unit", "%")
-						output <- y
-					}
-				}
-				if !device.excludeMetrics["nv_mem_util"] {
-					y, err := lp.New("nv_mem_util", device.tags, m.meta, map[string]interface{}{"value": float64(util.Memory)}, time.Now())
-					if err == nil {
-						y.AddMeta("unit", "%")
-						output <- y
-					}
-				}
-			}
+	// Read all enabled metric groups for one device (physical GPU or MIG slice)
+	readAll := func(device NvidiaCollectorDevice, output chan lp.CCMetric) {
+		name, ret := nvml.DeviceGetName(device.device)
+		if ret != nvml.SUCCESS {
+			name = "NoName"
+		}
+		err = readMemoryInfo(device, output)
+		if err != nil {
+			cclog.ComponentDebug(m.name, "readMemoryInfo for device", name, "failed")
 		}
 
-		if !device.excludeMetrics["nv_mem_total"] || !device.excludeMetrics["nv_fb_memory"] {
-			// Retrieves the amount of used, free and total memory available on the device, in bytes.
-			//
-			// Enabling ECC reduces the amount of total available memory, due to the extra required parity bits.
-			//
-			// The reported amount of used memory is equal to the sum of memory allocated by all active channels on the device.
-			//
-			// Available memory info:
-			// * Free: Unallocated FB memory (in bytes).
-			// * Total: Total installed FB memory (in bytes).
-			// * Used: Allocated FB memory (in bytes). Note that the driver/GPU always sets aside a small amount of memory for bookkeeping.
-			//
-			// Note:
-			// In MIG mode, if device handle is provided, the API returns aggregate information, only if the caller has appropriate privileges.
-			// Per-instance information can be queried by using specific MIG device handles.
-			meminfo, ret := nvml.DeviceGetMemoryInfo(device.device)
-			if ret == nvml.SUCCESS {
-				if !device.excludeMetrics["nv_mem_total"] {
-					t := float64(meminfo.Total) / (1024 * 1024)
-					y, err := lp.New("nv_mem_total", device.tags, m.meta, map[string]interface{}{"value": t}, time.Now())
-					if err == nil {
-						y.AddMeta("unit", "MByte")
-						output <- y
-					}
-				}
-
-				if !device.excludeMetrics["nv_fb_memory"] {
-					f := float64(meminfo.Used) / (1024 * 1024)
-					y, err := lp.New("nv_fb_memory", device.tags, m.meta, map[string]interface{}{"value": f}, time.Now())
-					if err == nil {
-						y.AddMeta("unit", "MByte")
-						output <- y
-					}
-				}
-			}
+		err = readUtilization(device, output)
+		if err != nil {
+			cclog.ComponentDebug(m.name, "readUtilization for device", name, "failed")
 		}
 
-		if !device.excludeMetrics["nv_temp"] {
-			// Retrieves the current temperature readings for the device, in degrees C.
-			//
-			// Available temperature sensors:
-			// * TEMPERATURE_GPU: Temperature sensor for the GPU die.
-			// * NVML_TEMPERATURE_COUNT
-			temp, ret := nvml.DeviceGetTemperature(device.device, nvml.TEMPERATURE_GPU)
-			if ret == nvml.SUCCESS {
-				y, err := lp.New("nv_temp", device.tags, m.meta, map[string]interface{}{"value": float64(temp)}, time.Now())
-				if err == nil {
-					y.AddMeta("unit", "degC")
-					output <- y
-				}
-			}
+		err = readTemp(device, output)
+		if err != nil {
+			cclog.ComponentDebug(m.name, "readTemp for device", name, "failed")
 		}
 
-		if !device.excludeMetrics["nv_fan"] {
-			// Retrieves the intended operating speed of the device's fan.
-			//
-			// Note: The reported speed is the intended fan speed.
-			// If the fan is physically blocked and unable to spin, the output will not match the actual fan speed.
-			//
-			// For all discrete products with dedicated fans.
-			//
-			// The fan speed is expressed as a percentage of the product's maximum noise tolerance fan speed.
-			// This value may exceed 100% in certain cases.
-			fan, ret := nvml.DeviceGetFanSpeed(device.device)
-			if ret == nvml.SUCCESS {
-				y, err := lp.New("nv_fan", device.tags, m.meta, map[string]interface{}{"value": float64(fan)}, time.Now())
-				if err == nil {
-					y.AddMeta("unit", "%")
-					output <- y
-				}
-			}
+		err = readFan(device, output)
+		if err != nil {
+			cclog.ComponentDebug(m.name, "readFan for device", name, "failed")
 		}
 
-		if !device.excludeMetrics["nv_ecc_mode"] {
-			// Retrieves the current and pending ECC modes for the device.
-			//
-			// For Fermi or newer fully supported devices. Only applicable to devices with ECC.
-			// Requires NVML_INFOROM_ECC version 1.0 or higher.
-			//
-			// Changing ECC modes requires a reboot.
-			// The "pending" ECC mode refers to the target mode following the next reboot.
-			_, ecc_pend, ret := nvml.DeviceGetEccMode(device.device)
-			if ret == nvml.SUCCESS {
-				var y lp.CCMetric
-				var err error
-				switch ecc_pend {
-				case nvml.FEATURE_DISABLED:
-					y, err = lp.New("nv_ecc_mode", device.tags, m.meta, map[string]interface{}{"value": "OFF"}, time.Now())
-				case nvml.FEATURE_ENABLED:
-					y, err = lp.New("nv_ecc_mode", device.tags, m.meta, map[string]interface{}{"value": "ON"}, time.Now())
-				default:
-					y, err = lp.New("nv_ecc_mode", device.tags, m.meta, map[string]interface{}{"value": "UNKNOWN"}, time.Now())
-				}
-				if err == nil {
-					output <- y
-				}
-			} else if ret == nvml.ERROR_NOT_SUPPORTED {
-				y, err := lp.New("nv_ecc_mode", device.tags, m.meta, map[string]interface{}{"value": "N/A"}, time.Now())
-				if err == nil {
-					output <- y
-				}
-			}
+		err = readEccMode(device, output)
+		if err != nil {
+			cclog.ComponentDebug(m.name, "readEccMode for device", name, "failed")
 		}
 
-		if !device.excludeMetrics["nv_perf_state"] {
-			// Retrieves the current performance state for the device.
-			//
-			// Allowed PStates:
-			//  0: Maximum Performance.
-			// ..
-			// 15: Minimum Performance.
-			// 32: Unknown performance state.
-			pState, ret := nvml.DeviceGetPerformanceState(device.device)
-			if ret == nvml.SUCCESS {
-				y, err := lp.New("nv_perf_state", device.tags, m.meta, map[string]interface{}{"value": fmt.Sprintf("P%d", int(pState))}, time.Now())
-				if err == nil {
-					output <- y
-				}
-			}
+		err = readPerfState(device, output)
+		if err != nil {
+			cclog.ComponentDebug(m.name, "readPerfState for device", name, "failed")
 		}
 
-		if !device.excludeMetrics["nv_power_usage_report"] {
-			// Retrieves power usage for this GPU in milliwatts and its associated circuitry (e.g. memory)
-			//
-			// On Fermi and Kepler GPUs the reading is accurate to within +/- 5% of current power draw.
-			//
-			// It is only available if power management mode is supported
-			power, ret := nvml.DeviceGetPowerUsage(device.device)
-			if ret == nvml.SUCCESS {
-				y, err := lp.New("nv_power_usage_report", device.tags, m.meta, map[string]interface{}{"value": float64(power) / 1000}, time.Now())
-				if err == nil {
-					y.AddMeta("unit", "watts")
-					output <- y
-				}
-			}
+		err = readPowerUsage(device, output)
+		if err != nil {
+			cclog.ComponentDebug(m.name, "readPowerUsage for device", name, "failed")
 		}
 
-		// Retrieves the current clock speeds for the device.
-		//
-		// Available clock information:
-		// * CLOCK_GRAPHICS: Graphics clock domain.
-		// * CLOCK_SM: Streaming Multiprocessor clock domain.
-		// * CLOCK_MEM: Memory clock domain.
-		if !device.excludeMetrics["nv_graphics_clock_report"] {
-			graphicsClock, ret := nvml.DeviceGetClockInfo(device.device, nvml.CLOCK_GRAPHICS)
-			if ret == nvml.SUCCESS {
-				y, err := lp.New("nv_graphics_clock_report", device.tags, m.meta, map[string]interface{}{"value": float64(graphicsClock)}, time.Now())
-				if err == nil {
-					y.AddMeta("unit", "MHz")
-					output <- y
-				}
-			}
+		err = readClocks(device, output)
+		if err != nil {
+			cclog.ComponentDebug(m.name, "readClocks for device", name, "failed")
 		}
 
-		if !device.excludeMetrics["nv_sm_clock_report"] {
-			smCock, ret := nvml.DeviceGetClockInfo(device.device, nvml.CLOCK_SM)
-			if ret == nvml.SUCCESS {
-				y, err := lp.New("nv_sm_clock_report", device.tags, m.meta, map[string]interface{}{"value": float64(smCock)}, time.Now())
-				if err == nil {
-					y.AddMeta("unit", "MHz")
-					output <- y
-				}
-			}
+		err = readMaxClocks(device, output)
+		if err != nil {
+			cclog.ComponentDebug(m.name, "readMaxClocks for device", name, "failed")
 		}
 
-		if !device.excludeMetrics["nv_mem_clock_report"] {
-			memClock, ret := nvml.DeviceGetClockInfo(device.device, nvml.CLOCK_MEM)
-			if ret == nvml.SUCCESS {
-				y, err := lp.New("nv_mem_clock_report", device.tags, m.meta, map[string]interface{}{"value": float64(memClock)}, time.Now())
-				if err == nil {
-					y.AddMeta("unit", "MHz")
-					output <- y
-				}
-			}
+		err = readEccErrors(device, output)
+		if err != nil {
+			cclog.ComponentDebug(m.name, "readEccErrors for device", name, "failed")
 		}
 
-		// Retrieves the maximum clock speeds for the device.
-		//
-		// Available clock information:
-		// * CLOCK_GRAPHICS: Graphics clock domain.
-		// * CLOCK_SM:       Streaming multiprocessor clock domain.
-		// * CLOCK_MEM:      Memory clock domain.
-		// * CLOCK_VIDEO:    Video encoder/decoder clock domain.
-		// * CLOCK_COUNT:    Count of clock types.
-		//
-		// Note:
-		/// On GPUs from Fermi family current P0 clocks (reported by nvmlDeviceGetClockInfo) can differ from max clocks by few MHz.
-		if !device.excludeMetrics["nv_max_graphics_clock"] {
-			max_gclk, ret := nvml.DeviceGetMaxClockInfo(device.device, nvml.CLOCK_GRAPHICS)
-			if ret == nvml.SUCCESS {
-				y, err := lp.New("nv_max_graphics_clock", device.tags, m.meta, map[string]interface{}{"value": float64(max_gclk)}, time.Now())
-				if err == nil {
-					y.AddMeta("unit", "MHz")
-					output <- y
-				}
-			}
+		err = readPowerLimit(device, output)
+		if err != nil {
+			cclog.ComponentDebug(m.name, "readPowerLimit for device", name, "failed")
 		}
 
-		if !device.excludeMetrics["nv_max_sm_clock"] {
-			maxSmClock, ret := nvml.DeviceGetClockInfo(device.device, nvml.CLOCK_SM)
-			if ret == nvml.SUCCESS {
-				y, err := lp.New("nv_max_sm_clock", device.tags, m.meta, map[string]interface{}{"value": float64(maxSmClock)}, time.Now())
-				if err == nil {
-					y.AddMeta("unit", "MHz")
-					output <- y
-				}
-			}
+		err = readEncUtilization(device, output)
+		if err != nil {
+			cclog.ComponentDebug(m.name, "readEncUtilization for device", name, "failed")
 		}
 
-		if !device.excludeMetrics["nv_max_mem_clock"] {
-			maxMemClock, ret := nvml.DeviceGetClockInfo(device.device, nvml.CLOCK_MEM)
-			if ret == nvml.SUCCESS {
-				y, err := lp.New("nv_max_mem_clock", device.tags, m.meta, map[string]interface{}{"value": float64(maxMemClock)}, time.Now())
-				if err == nil {
-					y.AddMeta("unit", "MHz")
-					output <- y
-				}
-			}
+		err = readDecUtilization(device, output)
+		if err != nil {
+			cclog.ComponentDebug(m.name, "readDecUtilization for device", name, "failed")
 		}
 
-		if !device.excludeMetrics["nv_ecc_db_error"] {
-			// Retrieves the total ECC error counts for the device.
-			//
-			// For Fermi or newer fully supported devices.
-			// Only applicable to devices with ECC.
-			// Requires NVML_INFOROM_ECC version 1.0 or higher.
-			// Requires ECC Mode to be enabled.
-			//
-			// The total error count is the sum of errors across each of the separate memory systems,
-			// i.e. the total set of errors across the entire device.
-			ecc_db, ret := nvml.DeviceGetTotalEccErrors(device.device, nvml.MEMORY_ERROR_TYPE_UNCORRECTED, nvml.AGGREGATE_ECC)
-			if ret == nvml.SUCCESS {
-				y, err := lp.New("nv_ecc_db_error", device.tags, m.meta, map[string]interface{}{"value": float64(ecc_db)}, time.Now())
-				if err == nil {
-					output <- y
-				}
-			}
+		err = readRemappedRows(device, output)
+		if err != nil {
+			cclog.ComponentDebug(m.name, "readRemappedRows for device", name, "failed")
 		}
 
-		if !device.excludeMetrics["nv_ecc_sb_error"] {
-			ecc_sb, ret := nvml.DeviceGetTotalEccErrors(device.device, nvml.MEMORY_ERROR_TYPE_CORRECTED, nvml.AGGREGATE_ECC)
-			if ret == nvml.SUCCESS {
-				y, err := lp.New("nv_ecc_sb_error", device.tags, m.meta, map[string]interface{}{"value": float64(ecc_sb)}, time.Now())
-				if err == nil {
-					output <- y
-				}
-			}
+		err = readBarMemoryInfo(device, output)
+		if err != nil {
+			cclog.ComponentDebug(m.name, "readBarMemoryInfo for device", name, "failed")
 		}
 
-		if !device.excludeMetrics["nv_power_man_limit"] {
-			// Retrieves the power management limit associated with this device.
-			//
-			// For Fermi or newer fully supported devices.
-			//
-			// The power limit defines the upper boundary for the card's power draw.
-			// If the card's total power draw reaches this limit the power management algorithm kicks in.
-			pwr_limit, ret := nvml.DeviceGetPowerManagementLimit(device.device)
-			if ret == nvml.SUCCESS {
-				y, err := lp.New("nv_power_man_limit", device.tags, m.meta, map[string]interface{}{"value": float64(pwr_limit) / 1000}, time.Now())
-				if err == nil {
-					y.AddMeta("unit", "watts")
-					output <- y
-				}
-			}
+		err = readProcessCounts(device, output)
+		if err != nil {
+			cclog.ComponentDebug(m.name, "readProcessCounts for device", name, "failed")
 		}
 
-		if !device.excludeMetrics["nv_encoder_util"] {
-			// Retrieves the current utilization and sampling size in microseconds for the Encoder
-			//
-			// For Kepler or newer fully supported devices.
-			//
-			// Note: On MIG-enabled GPUs, querying encoder utilization is not currently supported.
-			enc_util, _, ret := nvml.DeviceGetEncoderUtilization(device.device)
-			if ret == nvml.SUCCESS {
-				y, err := lp.New("nv_encoder_util", device.tags, m.meta, map[string]interface{}{"value": float64(enc_util)}, time.Now())
-				if err == nil {
-					y.AddMeta("unit", "%")
-					output <- y
-				}
-			}
+		err = readViolationStats(device, output)
+		if err != nil {
+			cclog.ComponentDebug(m.name, "readViolationStats for device", name, "failed")
 		}
 
-		if !device.excludeMetrics["nv_decoder_util"] {
-			// Retrieves the current utilization and sampling size in microseconds for the Decoder
-			//
-			// For Kepler or newer fully supported devices.
-			//
-			// Note: On MIG-enabled GPUs, querying decoder utilization is not currently supported.
-			dec_util, _, ret := nvml.DeviceGetDecoderUtilization(device.device)
-			if ret == nvml.SUCCESS {
-				y, err := lp.New("nv_decoder_util", device.tags, m.meta, map[string]interface{}{"value": float64(dec_util)}, time.Now())
-				if err == nil {
-					y.AddMeta("unit", "%")
-					output <- y
-				}
-			}
+		err = readNVLinkStats(device, output)
+		if err != nil {
+			cclog.ComponentDebug(m.name, "readNVLinkStats for device", name, "failed")
 		}
 	}
 
+	// Actual read loop over all attached Nvidia GPUs
+	for i := 0; i < m.num_gpus; i++ {
+
+		readAll(m.gpus[i], output)
+
+		// Iterate over all MIG devices if any
+		if m.config.ProcessMigDevices {
+			current, _, ret := nvml.DeviceGetMigMode(m.gpus[i].device)
+			if ret != nvml.SUCCESS {
+				continue
+			}
+			if current == nvml.DEVICE_MIG_DISABLE {
+				continue
+			}
+
+			maxMig, ret := nvml.DeviceGetMaxMigDeviceCount(m.gpus[i].device)
+			if ret != nvml.SUCCESS {
+				continue
+			}
+			if maxMig == 0 {
+				continue
+			}
+			cclog.ComponentDebug(m.name, "Reading MIG devices for GPU", i)
+
+			for j := 0; j < maxMig; j++ {
+				mdev, ret := nvml.DeviceGetMigDeviceHandleByIndex(m.gpus[i].device, j)
+				if ret != nvml.SUCCESS {
+					continue
+				}
+
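+				// Build a derived device entry for this MIG slice: it inherits
+				// the parent's tags and meta information (plus MIG sub-type tags)
+				// but uses the MIG device handle for all NVML queries.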
+				excludeMetrics := make(map[string]bool)
+				for _, metric := range m.config.ExcludeMetrics {
+					excludeMetrics[metric] = true
+				}
+
+				migDevice := NvidiaCollectorDevice{
+					device:         mdev,
+					tags:           map[string]string{},
+					meta:           map[string]string{},
+					excludeMetrics: excludeMetrics,
+				}
+				for k, v := range m.gpus[i].tags {
+					migDevice.tags[k] = v
+				}
+				m.gpus[i].tags["stype"] = "mig"
+				m.gpus[i].tags["stype-id"] = fmt.Sprintf("%d", j)
+				for k, v := range m.gpus[i].meta {
+					migDevice.meta[k] = v
+				}
+				if _, ok := migDevice.meta["uuid"]; ok {
+					uuid, ret := nvml.DeviceGetUUID(mdev)
+					if ret == nvml.SUCCESS {
+						migDevice.meta["uuid"] = uuid
+					}
+				}
+
+				readAll(migDevice, output)
+			}
+		}
+	}
 }
 
 func (m *NvidiaCollector) Close() {
diff --git a/collectors/nvidiaMetric.md b/collectors/nvidiaMetric.md
index afe8b9e..8cfff32 100644
--- a/collectors/nvidiaMetric.md
+++ b/collectors/nvidiaMetric.md
@@ -3,38 +3,72 @@
 
 ```json
   "nvidia": {
-    "exclude_devices" : [
-      "0","1"
+    "exclude_devices": [
+      "0","1", "0000000:ff:01.0"
     ],
     "exclude_metrics": [
-      "nv_fb_memory",
+      "nv_fb_mem_used",
       "nv_fan"
-    ]
+    ],
+    "process_mig_devices": false,
+    "use_pci_info_as_type_id": true,
+    "add_pci_info_tag": false,
+    "add_uuid_meta": false,
+    "add_board_number_meta": false,
+    "add_serial_meta": false
   }
 ```
 
+The `nvidia` collector can be configured to leave out specific devices with the `exclude_devices` option. It accepts the IDs as supplied to the NVML with `nvmlDeviceGetHandleByIndex()` or the PCI address in NVML format (`%08X:%02X:%02X.0`). Metrics (listed below) that should not be forwarded to the MetricRouter can be excluded with the `exclude_metrics` option. Commonly, only the physical GPUs are monitored. If MIG devices should be monitored as well, set `process_mig_devices` (adds the tags `stype=mig,stype-id=<mig_index>`).
+
+The metrics sent by the `nvidia` collector use `accelerator` as the `type` tag. For the `type-id`, the device handle index is used by default. With the `use_pci_info_as_type_id` option, the PCI ID is used instead. If both values should be added as tags, activate the `add_pci_info_tag` option: the device handle index is then used as `type-id` and the PCI ID is added as a separate `pci_identifier` tag.
+
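+As an illustration (values are hypothetical), a metric with the default numeric `type-id` looks like `nv_util,type=accelerator,type-id=0 value=42 <timestamp>`; with `use_pci_info_as_type_id` enabled it becomes `nv_util,type=accelerator,type-id=00000000:3B:00.0 value=42 <timestamp>`.
+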
+Optionally, it is possible to add the UUID, the board part number and the serial number to the meta information. Meta information is not sent to the sinks (unless configured otherwise).
+
+
 Metrics:
 * `nv_util`
 * `nv_mem_util`
-* `nv_mem_total`
-* `nv_fb_memory`
+* `nv_fb_mem_total`
+* `nv_fb_mem_used`
+* `nv_bar1_mem_total`
+* `nv_bar1_mem_used`
 * `nv_temp`
 * `nv_fan`
 * `nv_ecc_mode`
 * `nv_perf_state`
-* `nv_power_usage_report`
-* `nv_graphics_clock_report`
-* `nv_sm_clock_report`
-* `nv_mem_clock_report`
+* `nv_power_usage`
+* `nv_graphics_clock`
+* `nv_sm_clock`
+* `nv_mem_clock`
+* `nv_video_clock`
 * `nv_max_graphics_clock`
 * `nv_max_sm_clock`
 * `nv_max_mem_clock`
-* `nv_ecc_db_error`
-* `nv_ecc_sb_error`
-* `nv_power_man_limit`
+* `nv_max_video_clock`
+* `nv_ecc_uncorrected_error`
+* `nv_ecc_corrected_error`
+* `nv_power_max_limit`
 * `nv_encoder_util`
 * `nv_decoder_util`
+* `nv_remapped_rows_corrected`
+* `nv_remapped_rows_uncorrected`
+* `nv_remapped_rows_pending`
+* `nv_remapped_rows_failure`
+* `nv_compute_processes`
+* `nv_graphics_processes`
+* `nv_violation_power`
+* `nv_violation_thermal`
+* `nv_violation_sync_boost`
+* `nv_violation_board_limit`
+* `nv_violation_low_util`
+* `nv_violation_reliability`
+* `nv_violation_below_app_clock`
+* `nv_violation_below_base_clock`
+* `nv_nvlink_crc_flit_errors`
+* `nv_nvlink_crc_errors`
+* `nv_nvlink_ecc_errors`
+* `nv_nvlink_replay_errors`
+* `nv_nvlink_recovery_errors`
 
-It uses a separate `type` in the metrics. The output metric looks like this:
-`<name>,type=accelerator,type-id=<nvidia-gpu-id> value=<metric value> <timestamp>`
-
+Some metrics add an additional sub-type tag (`stype`); the `nv_nvlink_*` metrics, for example, set `stype=nvlink,stype-id=<link_number>`.
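+
+For example (hypothetical values): `nv_nvlink_crc_errors,type=accelerator,type-id=0,stype=nvlink,stype-id=1 value=0 <timestamp>`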
\ No newline at end of file