package collectors

import (
	"encoding/json"
	"errors"
	"fmt"
	"log"
	"strings"
	"time"

	lp "github.com/ClusterCockpit/cc-lib/ccMessage"
	cclog "github.com/ClusterCockpit/cc-metric-collector/pkg/ccLogger"
	"github.com/NVIDIA/go-nvml/pkg/nvml"
)

type NvidiaCollectorConfig struct {
	ExcludeMetrics        []string `json:"exclude_metrics,omitempty"`
	OnlyMetrics           []string `json:"only_metrics,omitempty"`
	ExcludeDevices        []string `json:"exclude_devices,omitempty"`
	AddPciInfoTag         bool     `json:"add_pci_info_tag,omitempty"`
	UsePciInfoAsTypeId    bool     `json:"use_pci_info_as_type_id,omitempty"`
	AddUuidMeta           bool     `json:"add_uuid_meta,omitempty"`
	AddBoardNumberMeta    bool     `json:"add_board_number_meta,omitempty"`
	AddSerialMeta         bool     `json:"add_serial_meta,omitempty"`
	ProcessMigDevices     bool     `json:"process_mig_devices,omitempty"`
	UseUuidForMigDevices  bool     `json:"use_uuid_for_mig_device,omitempty"`
	UseSliceForMigDevices bool     `json:"use_slice_for_mig_device,omitempty"`
	UseMemoryInfoV2       bool     `json:"use_memory_info_v2,omitempty"`
	SendDiffValues        bool     `json:"send_diff_values,omitempty"`
}

type NvidiaCollectorDevice struct {
	device         nvml.Device
	excludeMetrics map[string]bool
	tags           map[string]string
	meta           map[string]string
	config         NvidiaCollectorConfig
}

type NvidiaCollector struct {
	metricCollector
	config             NvidiaCollectorConfig
	gpus               []NvidiaCollectorDevice
	num_gpus           int
	prevEccStats       map[string]*eccStats
	prevRemappedStats  map[string]*remappedRowsStats
	prevNVLinkStats    map[string]*nvlinkStats
	prevViolationStats map[string]*violationStats
}

func (m *NvidiaCollector) CatchPanic() {
	if rerr := recover(); rerr != nil {
		log.Print(rerr)
		m.init = false
	}
}

// shouldOutput checks if a metric should be output based on onlyMetrics and excludeMetrics.
func (d *NvidiaCollectorDevice) shouldOutput(metric string) bool {
	if len(d.config.OnlyMetrics) > 0 {
		for _, m := range d.config.OnlyMetrics {
			if m == metric {
				return true
			}
		}
		return false
	}
	return !d.excludeMetrics[metric]
}

type eccStats struct {
	uncorrected uint64
	corrected   uint64
}

type remappedRowsStats struct {
	corrected   int
	uncorrected int
	pending     int
	failure     int
}

type violationStats struct {
	power          float64
	thermal        float64
	syncBoost      float64
	boardLimit     float64
	lowUtil        float64
	reliability    float64
	belowAppClock  float64
	belowBaseClock float64
}

type nvlinkStats struct {
	crcErrors      [nvml.NVLINK_MAX_LINKS]uint64 // Pro NVLink
	eccErrors      [nvml.NVLINK_MAX_LINKS]uint64
	replayErrors   [nvml.NVLINK_MAX_LINKS]uint64
	recoveryErrors [nvml.NVLINK_MAX_LINKS]uint64
	crcFlitErrors  [nvml.NVLINK_MAX_LINKS]uint64
	// Aggregierte Werte für _sum_diff
	aggregateCrcErrors      uint64
	aggregateEccErrors      uint64
	aggregateReplayErrors   uint64
	aggregateRecoveryErrors uint64
	aggregateCrcFlitErrors  uint64
}

func (m *NvidiaCollector) Init(config json.RawMessage) error {
	var err error
	m.name = "NvidiaCollector"
	m.config.AddPciInfoTag = false
	m.config.UsePciInfoAsTypeId = false
	m.config.ProcessMigDevices = false
	m.config.UseUuidForMigDevices = false
	m.config.UseSliceForMigDevices = false
	m.prevEccStats = make(map[string]*eccStats)
	m.prevRemappedStats = make(map[string]*remappedRowsStats)
	m.prevViolationStats = make(map[string]*violationStats)
	m.prevNVLinkStats = make(map[string]*nvlinkStats)
	m.setup()
	if len(config) > 0 {
		err = json.Unmarshal(config, &m.config)
		if err != nil {
			return err
		}
	}
	m.meta = map[string]string{
		"source": m.name,
		"group":  "Nvidia",
	}

	defer m.CatchPanic()

	// Initialize NVIDIA Management Library (NVML)
	ret := nvml.Init()

	// Error: NVML library not found
	// (nvml.ErrorString can not be used in this case)
	if ret == nvml.ERROR_LIBRARY_NOT_FOUND {
		err = fmt.Errorf("NVML library not found")
		cclog.ComponentError(m.name, err.Error())
		return err
	}
	if ret != nvml.SUCCESS {
		err = errors.New(nvml.ErrorString(ret))
		cclog.ComponentError(m.name, "Unable to initialize NVML", err.Error())
		return err
	}

	// Number of NVIDIA GPUs
	num_gpus, ret := nvml.DeviceGetCount()
	if ret != nvml.SUCCESS {
		err = errors.New(nvml.ErrorString(ret))
		cclog.ComponentError(m.name, "Unable to get device count", err.Error())
		return err
	}

	// For all GPUs
	idx := 0
	m.gpus = make([]NvidiaCollectorDevice, num_gpus)
	for i := 0; i < num_gpus; i++ {

		// Skip excluded devices by ID
		str_i := fmt.Sprintf("%d", i)
		if _, skip := stringArrayContains(m.config.ExcludeDevices, str_i); skip {
			cclog.ComponentDebug(m.name, "Skipping excluded device", str_i)
			continue
		}

		// Get device handle
		device, ret := nvml.DeviceGetHandleByIndex(i)
		if ret != nvml.SUCCESS {
			err = errors.New(nvml.ErrorString(ret))
			cclog.ComponentError(m.name, "Unable to get device at index", i, ":", err.Error())
			continue
		}

		// Get device's PCI info
		pciInfo, ret := nvml.DeviceGetPciInfo(device)
		if ret != nvml.SUCCESS {
			err = errors.New(nvml.ErrorString(ret))
			cclog.ComponentError(m.name, "Unable to get PCI info for device at index", i, ":", err.Error())
			continue
		}
		// Create PCI ID in the common format used by the NVML.
		pci_id := fmt.Sprintf(
			nvml.DEVICE_PCI_BUS_ID_FMT,
			pciInfo.Domain,
			pciInfo.Bus,
			pciInfo.Device)

		// Skip excluded devices specified by PCI ID
		if _, skip := stringArrayContains(m.config.ExcludeDevices, pci_id); skip {
			cclog.ComponentDebug(m.name, "Skipping excluded device", pci_id)
			continue
		}

		// Select which value to use as 'type-id'.
		// The PCI ID is commonly required in SLURM environments because the
		// numberic IDs used by SLURM and the ones used by NVML might differ
		// depending on the job type. The PCI ID is more reliable but is commonly
		// not recorded for a job, so it must be added manually in prologue or epilogue
		// e.g. to the comment field
		tid := str_i
		if m.config.UsePciInfoAsTypeId {
			tid = pci_id
		}

		// Now we got all infos together, populate the device list
		g := &m.gpus[idx]

		// Add device handle
		g.device = device

		// Add device config
		g.config = m.config

		// Add tags
		g.tags = map[string]string{
			"type":    "accelerator",
			"type-id": tid,
		}

		// Add PCI info as tag if not already used as 'type-id'
		if m.config.AddPciInfoTag && !m.config.UsePciInfoAsTypeId {
			g.tags["pci_identifier"] = pci_id
		}

		g.meta = map[string]string{
			"source": m.name,
			"group":  "Nvidia",
		}

		if m.config.AddBoardNumberMeta {
			board, ret := nvml.DeviceGetBoardPartNumber(device)
			if ret != nvml.SUCCESS {
				cclog.ComponentError(m.name, "Unable to get board part number for device at index", i, ":", err.Error())
			} else {
				g.meta["board_number"] = board
			}
		}
		if m.config.AddSerialMeta {
			serial, ret := nvml.DeviceGetSerial(device)
			if ret != nvml.SUCCESS {
				cclog.ComponentError(m.name, "Unable to get serial number for device at index", i, ":", err.Error())
			} else {
				g.meta["serial"] = serial
			}
		}
		if m.config.AddUuidMeta {
			uuid, ret := nvml.DeviceGetUUID(device)
			if ret != nvml.SUCCESS {
				cclog.ComponentError(m.name, "Unable to get UUID for device at index", i, ":", err.Error())
			} else {
				g.meta["uuid"] = uuid
			}
		}

		// Add excluded metrics
		g.excludeMetrics = map[string]bool{}
		for _, e := range m.config.ExcludeMetrics {
			g.excludeMetrics[e] = true
		}

		// Increment the index for the next device
		idx++
	}
	m.num_gpus = idx

	m.init = true
	return nil
}

func sendMetric(metricName string, value interface{}, unit string, device NvidiaCollectorDevice, output chan lp.CCMessage, extraTags ...map[string]string) {
	msg, err := lp.NewMessage(metricName, device.tags, device.meta, map[string]interface{}{"value": value}, time.Now())
	if err != nil {
		return
	}
	if unit != "" {
		msg.AddMeta("unit", unit)
	}
	for _, tags := range extraTags {
		for k, v := range tags {
			msg.AddTag(k, v)
		}
	}
	output <- msg
}

func readMemoryInfo(device NvidiaCollectorDevice, config NvidiaCollectorConfig, output chan lp.CCMessage) error {
	// Try to use MemoryInfo_v2 if configured
	if config.UseMemoryInfoV2 {
		meminfoV2, ret := nvml.DeviceGetMemoryInfo_v2(device.device)
		if ret == nvml.SUCCESS {
			if device.shouldOutput("nv_fb_mem_total") {
				sendMetric("nv_fb_mem_total", float64(meminfoV2.Total)/(1024*1024), "MByte", device, output)
			}
			if device.shouldOutput("nv_fb_mem_used") {
				sendMetric("nv_fb_mem_used", float64(meminfoV2.Used)/(1024*1024), "MByte", device, output)
			}
			if device.shouldOutput("nv_fb_mem_reserved") {
				sendMetric("nv_fb_mem_reserved", float64(meminfoV2.Reserved)/(1024*1024), "MByte", device, output)
			}
			return nil
		}
	}

	// Fallback: Use DeviceGetMemoryInfo (v1)
	meminfo, ret := nvml.DeviceGetMemoryInfo(device.device)
	if ret != nvml.SUCCESS {
		return errors.New(nvml.ErrorString(ret))
	}
	if device.shouldOutput("nv_fb_mem_total") {
		sendMetric("nv_fb_mem_total", float64(meminfo.Total)/(1024*1024), "MByte", device, output)
	}
	if device.shouldOutput("nv_fb_mem_used") {
		sendMetric("nv_fb_mem_used", float64(meminfo.Used)/(1024*1024), "MByte", device, output)
	}
	return nil
}

func readBarMemoryInfo(device NvidiaCollectorDevice, config NvidiaCollectorConfig, output chan lp.CCMessage) error {
	meminfo, ret := nvml.DeviceGetBAR1MemoryInfo(device.device)
	if ret != nvml.SUCCESS {
		return errors.New(nvml.ErrorString(ret))
	}
	if device.shouldOutput("nv_bar1_mem_total") {
		sendMetric("nv_bar1_mem_total", float64(meminfo.Bar1Total)/(1024*1024), "MByte", device, output)
	}
	if device.shouldOutput("nv_bar1_mem_used") {
		sendMetric("nv_bar1_mem_used", float64(meminfo.Bar1Used)/(1024*1024), "MByte", device, output)
	}
	return nil
}

func readUtilization(device NvidiaCollectorDevice, config NvidiaCollectorConfig, output chan lp.CCMessage) error {
	isMig, ret := nvml.DeviceIsMigDeviceHandle(device.device)
	if ret != nvml.SUCCESS {
		return errors.New(nvml.ErrorString(ret))
	}
	if isMig {
		return nil
	}
	// Retrieves the current utilization rates for the device's major subsystems.
	//
	// Available utilization rates
	// * Gpu: Percent of time over the past sample period during which one or more kernels was executing on the GPU.
	// * Memory: Percent of time over the past sample period during which global (device) memory was being read or written
	//
	// Note:
	// * During driver initialization when ECC is enabled one can see high GPU and Memory Utilization readings.
	//   This is caused by ECC Memory Scrubbing mechanism that is performed during driver initialization.
	// * On MIG-enabled GPUs, querying device utilization rates is not currently supported.
	util, ret := nvml.DeviceGetUtilizationRates(device.device)
	if ret == nvml.SUCCESS {
		if device.shouldOutput("nv_util") {
			sendMetric("nv_util", float64(util.Gpu), "%", device, output)
		}
		if device.shouldOutput("nv_mem_util") {
			sendMetric("nv_mem_util", float64(util.Memory), "%", device, output)
		}
	}
	return nil
}

func readTemp(device NvidiaCollectorDevice, config NvidiaCollectorConfig, output chan lp.CCMessage) error {
	if device.shouldOutput("nv_temp") {
		// Retrieves the current temperature readings for the device, in degrees C.
		//
		// Available temperature sensors:
		// * TEMPERATURE_GPU: Temperature sensor for the GPU die.
		// * NVML_TEMPERATURE_COUNT
		temp, ret := nvml.DeviceGetTemperature(device.device, nvml.TEMPERATURE_GPU)
		if ret == nvml.SUCCESS {
			sendMetric("nv_temp", float64(temp), "degC", device, output)
		}
	}
	return nil
}

func readFan(device NvidiaCollectorDevice, config NvidiaCollectorConfig, output chan lp.CCMessage) error {
	if !device.shouldOutput("nv_fan") {
		return nil
	}
	// Retrieves the intended operating speed of the device's fan.
	//
	// Note: The reported speed is the intended fan speed.
	// If the fan is physically blocked and unable to spin, the output will not match the actual fan speed.
	//
	// For all discrete products with dedicated fans.
	//
	// The fan speed is expressed as a percentage of the product's maximum noise tolerance fan speed.
	// This value may exceed 100% in certain cases.
	//
	// If more than one fan is found we need to use DeviceGetFanSpeed_v2
	numFans, ret := nvml.DeviceGetNumFans(device.device)
	if ret != nvml.SUCCESS {
		return fmt.Errorf("Error retrieving number of fans: %v", ret)
	}

	if numFans <= 1 {
		fan, ret := nvml.DeviceGetFanSpeed(device.device)
		if ret == nvml.SUCCESS {
			sendMetric("nv_fan", float64(fan), "%", device, output)
		}
	} else {
		for i := 0; i < numFans; i++ {
			fan, ret := nvml.DeviceGetFanSpeed_v2(device.device, i)
			if ret == nvml.SUCCESS {
				sendMetric("nv_fan", float64(fan), "%", device, output, map[string]string{
					"stype":    "fan",
					"stype-id": fmt.Sprintf("%d", i),
				})
			}
		}
	}
	return nil
}

func readEccMode(device NvidiaCollectorDevice, config NvidiaCollectorConfig, output chan lp.CCMessage) error {
	if device.shouldOutput("nv_ecc_mode") {
		// Retrieves the current and pending ECC modes for the device.
		//
		// For Fermi or newer fully supported devices. Only applicable to devices with ECC.
		// Requires NVML_INFOROM_ECC version 1.0 or higher.
		//
		// Changing ECC modes requires a reboot.
		// The "pending" ECC mode refers to the target mode following the next reboot.
		_, eccPend, ret := nvml.DeviceGetEccMode(device.device)
		if ret == nvml.SUCCESS {
			var value string
			switch eccPend {
			case nvml.FEATURE_DISABLED:
				value = "OFF"
			case nvml.FEATURE_ENABLED:
				value = "ON"
			default:
				value = "UNKNOWN"
			}
			sendMetric("nv_ecc_mode", value, "", device, output)
		} else if ret == nvml.ERROR_NOT_SUPPORTED {
			sendMetric("nv_ecc_mode", "N/A", "", device, output)
		}
	}
	return nil
}

func readPerfState(device NvidiaCollectorDevice, config NvidiaCollectorConfig, output chan lp.CCMessage) error {
	if device.shouldOutput("nv_perf_state") {
		// Retrieves the current performance state for the device.
		//
		// Allowed PStates:
		//  0: Maximum Performance.
		// ..
		// 15: Minimum Performance.
		// 32: Unknown performance state.
		pState, ret := nvml.DeviceGetPerformanceState(device.device)
		if ret == nvml.SUCCESS {
			sendMetric("nv_perf_state", fmt.Sprintf("P%d", int(pState)), "", device, output)
		}
	}
	return nil
}

func readPowerUsage(device NvidiaCollectorDevice, config NvidiaCollectorConfig, output chan lp.CCMessage) error {
	if device.shouldOutput("nv_power_usage") {
		// Retrieves power usage for this GPU in milliwatts and its associated circuitry (e.g. memory)
		//
		// On Fermi and Kepler GPUs the reading is accurate to within +/- 5% of current power draw.
		//
		// It is only available if power management mode is supported
		mode, ret := nvml.DeviceGetPowerManagementMode(device.device)
		if ret != nvml.SUCCESS {
			return nil
		}
		if mode == nvml.FEATURE_ENABLED {
			power, ret := nvml.DeviceGetPowerUsage(device.device)
			if ret == nvml.SUCCESS {
				sendMetric("nv_power_usage", float64(power)/1000, "watts", device, output)
			}
		}
	}
	return nil
}

func readClocks(device NvidiaCollectorDevice, config NvidiaCollectorConfig, output chan lp.CCMessage) error {
	clockTypes := []struct {
		metricName string
		clockType  nvml.ClockType
		unit       string
	}{
		{"nv_graphics_clock", nvml.CLOCK_GRAPHICS, "MHz"},
		{"nv_sm_clock", nvml.CLOCK_SM, "MHz"},
		{"nv_mem_clock", nvml.CLOCK_MEM, "MHz"},
		{"nv_video_clock", nvml.CLOCK_VIDEO, "MHz"},
	}
	// Retrieves the current clock speeds for the device.
	//
	// Available clock information:
	// * CLOCK_GRAPHICS: Graphics clock domain.
	// * CLOCK_SM: Streaming Multiprocessor clock domain.
	// * CLOCK_MEM: Memory clock domain.
	for _, ct := range clockTypes {
		if device.shouldOutput(ct.metricName) {
			clock, ret := nvml.DeviceGetClockInfo(device.device, ct.clockType)
			if ret == nvml.SUCCESS {
				sendMetric(ct.metricName, float64(clock), ct.unit, device, output)
			}
		}
	}
	return nil
}

func readMaxClocks(device NvidiaCollectorDevice, config NvidiaCollectorConfig, output chan lp.CCMessage) error {
	clockTypes := []struct {
		metricName string
		clockType  nvml.ClockType
		unit       string
	}{
		{"nv_max_graphics_clock", nvml.CLOCK_GRAPHICS, "MHz"},
		{"nv_max_sm_clock", nvml.CLOCK_SM, "MHz"},
		{"nv_max_mem_clock", nvml.CLOCK_MEM, "MHz"},
		{"nv_max_video_clock", nvml.CLOCK_VIDEO, "MHz"},
	}
	// Retrieves the maximum clock speeds for the device.
	//
	// Available clock information:
	// * CLOCK_GRAPHICS: Graphics clock domain.
	// * CLOCK_SM:       Streaming multiprocessor clock domain.
	// * CLOCK_MEM:      Memory clock domain.
	// * CLOCK_VIDEO:    Video encoder/decoder clock domain.
	// * CLOCK_COUNT:    Count of clock types.
	//
	// Note:
	// On GPUs from Fermi family, current P0 clocks (reported by nvmlDeviceGetClockInfo) can differ from max clocks by a few MHz.
	for _, ct := range clockTypes {
		if device.shouldOutput(ct.metricName) {
			clock, ret := nvml.DeviceGetMaxClockInfo(device.device, ct.clockType)
			if ret == nvml.SUCCESS {
				sendMetric(ct.metricName, float64(clock), ct.unit, device, output)
			}
		}
	}
	return nil
}

func readEccErrors(device NvidiaCollectorDevice, config NvidiaCollectorConfig, output chan lp.CCMessage, prevStats *eccStats, deviceID string) error {
	var currentUncorrected, currentCorrected uint64
	var ret nvml.Return
	// Retrieves the total ECC error counts for the device.
	//
	// For Fermi or newer fully supported devices.
	// Only applicable to devices with ECC.
	// Requires NVML_INFOROM_ECC version 1.0 or higher.
	// Requires ECC Mode to be enabled.
	//
	// The total error count is the sum of errors across each of the separate memory systems,
	// i.e. the total set of errors across the entire device.
	if device.shouldOutput("nv_ecc_uncorrected_error") {
		currentUncorrected, ret = nvml.DeviceGetTotalEccErrors(device.device, nvml.MEMORY_ERROR_TYPE_UNCORRECTED, nvml.AGGREGATE_ECC)
		if ret == nvml.SUCCESS {
			sendMetric("nv_ecc_uncorrected_error", uint64(currentUncorrected), "", device, output)
		}
	}
	if device.shouldOutput("nv_ecc_corrected_error") {
		currentCorrected, ret = nvml.DeviceGetTotalEccErrors(device.device, nvml.MEMORY_ERROR_TYPE_CORRECTED, nvml.AGGREGATE_ECC)
		if ret == nvml.SUCCESS {
			sendMetric("nv_ecc_corrected_error", uint64(currentCorrected), "", device, output)
		}
	}
	if config.SendDiffValues {
		var diffUncorrected, diffCorrected uint64
		if prevStats.uncorrected == 0 && prevStats.corrected == 0 {
			diffUncorrected = 0
			diffCorrected = 0
		} else {
			diffUncorrected = currentUncorrected - prevStats.uncorrected
			diffCorrected = currentCorrected - prevStats.corrected
			if diffUncorrected > currentUncorrected {
				diffUncorrected = 0
			}
			if diffCorrected > currentCorrected {
				diffCorrected = 0
			}
		}
		prevStats.uncorrected = currentUncorrected
		prevStats.corrected = currentCorrected
		if device.shouldOutput("nv_ecc_uncorrected_error_diff") {
			sendMetric("nv_ecc_uncorrected_error_diff", uint64(diffUncorrected), "", device, output)
		}
		if device.shouldOutput("nv_ecc_corrected_error_diff") {
			sendMetric("nv_ecc_corrected_error_diff", uint64(diffCorrected), "", device, output)
		}
	}
	return nil
}

func readPowerLimit(device NvidiaCollectorDevice, config NvidiaCollectorConfig, output chan lp.CCMessage) error {
	if device.shouldOutput("nv_power_max_limit") {
		// Retrieves the power management limit associated with this device.
		//
		// For Fermi or newer fully supported devices.
		//
		// The power limit defines the upper boundary for the card's power draw.
		// If the card's total power draw reaches this limit the power management algorithm kicks in.
		pwrLimit, ret := nvml.DeviceGetPowerManagementLimit(device.device)
		if ret == nvml.SUCCESS {
			sendMetric("nv_power_max_limit", float64(pwrLimit)/1000, "watts", device, output)
		}
	}
	return nil
}

func readEncUtilization(device NvidiaCollectorDevice, config NvidiaCollectorConfig, output chan lp.CCMessage) error {
	isMig, ret := nvml.DeviceIsMigDeviceHandle(device.device)
	if ret != nvml.SUCCESS {
		return errors.New(nvml.ErrorString(ret))
	}
	if isMig {
		return nil
	}
	if device.shouldOutput("nv_encoder_util") {
		// Retrieves the current utilization and sampling size in microseconds for the Encoder
		//
		// For Kepler or newer fully supported devices.
		//
		// Note: On MIG-enabled GPUs, querying encoder utilization is not currently supported.
		encUtil, _, ret := nvml.DeviceGetEncoderUtilization(device.device)
		if ret == nvml.SUCCESS {
			sendMetric("nv_encoder_util", float64(encUtil), "%", device, output)
		}
	}
	return nil
}

func readDecUtilization(device NvidiaCollectorDevice, config NvidiaCollectorConfig, output chan lp.CCMessage) error {
	isMig, ret := nvml.DeviceIsMigDeviceHandle(device.device)
	if ret != nvml.SUCCESS {
		return errors.New(nvml.ErrorString(ret))
	}
	if isMig {
		return nil
	}
	if device.shouldOutput("nv_decoder_util") {
		// Retrieves the current utilization and sampling size in microseconds for the Decoder
		//
		// For Kepler or newer fully supported devices.
		//
		// Note: On MIG-enabled GPUs, querying encoder utilization is not currently supported.
		decUtil, _, ret := nvml.DeviceGetDecoderUtilization(device.device)
		if ret == nvml.SUCCESS {
			sendMetric("nv_decoder_util", float64(decUtil), "%", device, output)
		}
	}
	return nil
}

func readRemappedRows(device NvidiaCollectorDevice, config NvidiaCollectorConfig, output chan lp.CCMessage, prevStats *remappedRowsStats, deviceID string) error {

	// Get number of remapped rows. The number of rows reported will be based on the cause of the remapping.
	// isPending indicates whether or not there are pending remappings.
	// A reset will be required to actually remap the row.
	// failureOccurred will be set if a row remapping ever failed in the past.
	// A pending remapping won't affect future work on the GPU since error-containment and dynamic page blacklisting will take care of that.
	//
	// For Ampere or newer fully supported devices.
	//
	// Note: On MIG-enabled GPUs with active instances, querying the number of remapped rows is not supported
	corrected, uncorrected, pendingBool, failureBool, ret := nvml.DeviceGetRemappedRows(device.device)
	if ret != nvml.SUCCESS {
		return nil
	}
	var pending, failure int
	if pendingBool {
		pending = 1
	}
	if failureBool {
		failure = 1
	}
	if device.shouldOutput("nv_remapped_rows_corrected") {
		sendMetric("nv_remapped_rows_corrected", float64(corrected), "", device, output)
	}
	if device.shouldOutput("nv_remapped_rows_uncorrected") {
		sendMetric("nv_remapped_rows_uncorrected", float64(uncorrected), "", device, output)
	}
	if device.shouldOutput("nv_remapped_rows_pending") {
		sendMetric("nv_remapped_rows_pending", pending, "", device, output)
	}
	if device.shouldOutput("nv_remapped_rows_failure") {
		sendMetric("nv_remapped_rows_failure", failure, "", device, output)
	}
	if config.SendDiffValues {
		var diffCorrected, diffUncorrected, diffPending, diffFailure int
		if prevStats.corrected == 0 && prevStats.uncorrected == 0 && prevStats.pending == 0 && prevStats.failure == 0 {
			diffCorrected = 0
			diffUncorrected = 0
			diffPending = 0
			diffFailure = 0
		} else {
			diffCorrected = corrected - prevStats.corrected
			diffUncorrected = uncorrected - prevStats.uncorrected
			diffPending = pending - prevStats.pending
			diffFailure = failure - prevStats.failure
			if diffCorrected > corrected {
				diffCorrected = 0
			}
			if diffUncorrected > uncorrected {
				diffUncorrected = 0
			}
		}
		prevStats.corrected = corrected
		prevStats.uncorrected = uncorrected
		prevStats.pending = pending
		prevStats.failure = failure
		if device.shouldOutput("nv_remapped_rows_corrected_diff") {
			sendMetric("nv_remapped_rows_corrected_diff", float64(diffCorrected), "", device, output)
		}
		if device.shouldOutput("nv_remapped_rows_uncorrected_diff") {
			sendMetric("nv_remapped_rows_uncorrected_diff", float64(diffUncorrected), "", device, output)
		}
		if device.shouldOutput("nv_remapped_rows_pending_diff") {
			sendMetric("nv_remapped_rows_pending_diff", diffPending, "", device, output)
		}
		if device.shouldOutput("nv_remapped_rows_failure_diff") {
			sendMetric("nv_remapped_rows_failure_diff", diffFailure, "", device, output)
		}
	}
	return nil
}

func readProcessCounts(device NvidiaCollectorDevice, config NvidiaCollectorConfig, output chan lp.CCMessage) error {
	if device.shouldOutput("nv_compute_processes") {
		// Get information about processes with a compute context on a device
		//
		// For Fermi &tm; or newer fully supported devices.
		//
		// This function returns information only about compute running processes (e.g. CUDA application which have
		// active context). Any graphics applications (e.g. using OpenGL, DirectX) won't be listed by this function.
		//
		// To query the current number of running compute processes, call this function with *infoCount = 0. The
		// return code will be NVML_ERROR_INSUFFICIENT_SIZE, or NVML_SUCCESS if none are running. For this call
		// \a infos is allowed to be NULL.
		//
		// The usedGpuMemory field returned is all of the memory used by the application.
		//
		// Keep in mind that information returned by this call is dynamic and the number of elements might change in
		// time. Allocate more space for \a infos table in case new compute processes are spawned.
		//
		// @note In MIG mode, if device handle is provided, the API returns aggregate information, only if
		//        the caller has appropriate privileges. Per-instance information can be queried by using
		//        specific MIG device handles.
		//        Querying per-instance information using MIG device handles is not supported if the device is in vGPU Host virtualization mode.
		procList, ret := nvml.DeviceGetComputeRunningProcesses(device.device)
		if ret == nvml.SUCCESS {
			sendMetric("nv_compute_processes", len(procList), "", device, output)
		}
	}
	if device.shouldOutput("nv_graphics_processes") {
		// Get information about processes with a graphics context on a device
		//
		// For Kepler &tm; or newer fully supported devices.
		//
		// This function returns information only about graphics based processes
		// (eg. applications using OpenGL, DirectX)
		//
		// To query the current number of running graphics processes, call this function with *infoCount = 0. The
		// return code will be NVML_ERROR_INSUFFICIENT_SIZE, or NVML_SUCCESS if none are running. For this call
		// \a infos is allowed to be NULL.
		//
		// The usedGpuMemory field returned is all of the memory used by the application.
		//
		// Keep in mind that information returned by this call is dynamic and the number of elements might change in
		// time. Allocate more space for \a infos table in case new graphics processes are spawned.
		//
		// @note In MIG mode, if device handle is provided, the API returns aggregate information, only if
		//       the caller has appropriate privileges. Per-instance information can be queried by using
		//       specific MIG device handles.
		//       Querying per-instance information using MIG device handles is not supported if the device is in vGPU Host virtualization mode.
		procList, ret := nvml.DeviceGetGraphicsRunningProcesses(device.device)
		if ret == nvml.SUCCESS {
			sendMetric("nv_graphics_processes", len(procList), "", device, output)
		}
	}
	return nil
}

func readViolationStats(device NvidiaCollectorDevice, config NvidiaCollectorConfig, output chan lp.CCMessage, prevStats *violationStats) error {
	type violationMetric struct {
		name   string
		policy nvml.PerfPolicyType
	}

	// Gets the duration of time during which the device was throttled (lower than requested clocks) due to power
	//  or thermal constraints.
	//
	// The method is important to users who are tying to understand if their GPUs throttle at any point during their applications. The
	// difference in violation times at two different reference times gives the indication of GPU throttling event.
	//
	// Violation for thermal capping is not supported at this time.
	//
	// For Kepler  or newer fully supported devices.

	metrics := []violationMetric{
		{"nv_violation_power", nvml.PERF_POLICY_POWER},
		{"nv_violation_thermal", nvml.PERF_POLICY_THERMAL},
		{"nv_violation_sync_boost", nvml.PERF_POLICY_SYNC_BOOST},
		{"nv_violation_board_limit", nvml.PERF_POLICY_BOARD_LIMIT},
		{"nv_violation_low_util", nvml.PERF_POLICY_LOW_UTILIZATION},
		{"nv_violation_reliability", nvml.PERF_POLICY_RELIABILITY},
		{"nv_violation_below_app_clock", nvml.PERF_POLICY_TOTAL_APP_CLOCKS},
		{"nv_violation_below_base_clock", nvml.PERF_POLICY_TOTAL_BASE_CLOCKS},
	}
	for _, mtr := range metrics {
		if !device.shouldOutput(mtr.name) {
			continue
		}
		violTime, ret := nvml.DeviceGetViolationStatus(device.device, mtr.policy)
		if ret != nvml.SUCCESS {
			continue
		}
		currentValue := float64(violTime.ViolationTime) * 1e-9
		sendMetric(mtr.name, currentValue, "sec", device, output)
		if config.SendDiffValues && prevStats != nil {
			var diff float64
			var firstMeasurement bool
			switch mtr.name {
			case "nv_violation_power":
				firstMeasurement = prevStats.power == 0
			case "nv_violation_thermal":
				firstMeasurement = prevStats.thermal == 0
			case "nv_violation_sync_boost":
				firstMeasurement = prevStats.syncBoost == 0
			case "nv_violation_board_limit":
				firstMeasurement = prevStats.boardLimit == 0
			case "nv_violation_low_util":
				firstMeasurement = prevStats.lowUtil == 0
			case "nv_violation_reliability":
				firstMeasurement = prevStats.reliability == 0
			case "nv_violation_below_app_clock":
				firstMeasurement = prevStats.belowAppClock == 0
			case "nv_violation_below_base_clock":
				firstMeasurement = prevStats.belowBaseClock == 0
			}
			if firstMeasurement {
				diff = 0
			} else {
				var prevValue float64
				switch mtr.name {
				case "nv_violation_power":
					prevValue = prevStats.power
				case "nv_violation_thermal":
					prevValue = prevStats.thermal
				case "nv_violation_sync_boost":
					prevValue = prevStats.syncBoost
				case "nv_violation_board_limit":
					prevValue = prevStats.boardLimit
				case "nv_violation_low_util":
					prevValue = prevStats.lowUtil
				case "nv_violation_reliability":
					prevValue = prevStats.reliability
				case "nv_violation_below_app_clock":
					prevValue = prevStats.belowAppClock
				case "nv_violation_below_base_clock":
					prevValue = prevStats.belowBaseClock
				}
				diff = currentValue - prevValue
				if diff < 0 {
					diff = 0
				}
			}
			diffName := mtr.name + "_diff"
			if device.shouldOutput(diffName) {
				sendMetric(diffName, diff, "sec", device, output)
			}
			switch mtr.name {
			case "nv_violation_power":
				prevStats.power = currentValue
			case "nv_violation_thermal":
				prevStats.thermal = currentValue
			case "nv_violation_sync_boost":
				prevStats.syncBoost = currentValue
			case "nv_violation_board_limit":
				prevStats.boardLimit = currentValue
			case "nv_violation_low_util":
				prevStats.lowUtil = currentValue
			case "nv_violation_reliability":
				prevStats.reliability = currentValue
			case "nv_violation_below_app_clock":
				prevStats.belowAppClock = currentValue
			case "nv_violation_below_base_clock":
				prevStats.belowBaseClock = currentValue
			}
		}
	}
	return nil
}

func readNVLinkStats(device NvidiaCollectorDevice, config NvidiaCollectorConfig, output chan lp.CCMessage, prevStats *nvlinkStats, deviceID string) error {
	var aggregate_crc_errors uint64 = 0
	var aggregate_ecc_errors uint64 = 0
	var aggregate_replay_errors uint64 = 0
	var aggregate_recovery_errors uint64 = 0
	var aggregate_crc_flit_errors uint64 = 0

	// Retrieves the specified error counter value
	// Please refer to \a nvmlNvLinkErrorCounter_t for error counters that are available
	//
	// For Pascal &tm; or newer fully supported devices.

	needsMetric := func(base string) bool {
		return device.shouldOutput(base) ||
			device.shouldOutput(base+"_sum") ||
			(config.SendDiffValues && device.shouldOutput(base+"_diff")) ||
			(config.SendDiffValues && device.shouldOutput(base+"_sum_diff"))
	}

	for i := 0; i < nvml.NVLINK_MAX_LINKS; i++ {
		state, ret := nvml.DeviceGetNvLinkState(device.device, i)
		if ret != nvml.SUCCESS {
			continue
		}
		if state != nvml.FEATURE_ENABLED {
			continue
		}

		extraTags := map[string]string{
			"stype":    "nvlink",
			"stype-id": fmt.Sprintf("%d", i),
		}

		if needsMetric("nv_nvlink_crc_errors") {
			count, ret := nvml.DeviceGetNvLinkErrorCounter(device.device, i, nvml.NVLINK_ERROR_DL_CRC_DATA)
			if ret == nvml.SUCCESS {
				aggregate_crc_errors += count
				if device.shouldOutput("nv_nvlink_crc_errors") {
					sendMetric("nv_nvlink_crc_errors", count, "", device, output, extraTags)
				}
				if config.SendDiffValues && device.shouldOutput("nv_nvlink_crc_errors_diff") {
					var diff uint64
					if prevStats.crcErrors[i] == 0 {
						diff = 0
					} else {
						diff = count - prevStats.crcErrors[i]
						if diff > count {
							diff = 0
						}
					}
					sendMetric("nv_nvlink_crc_errors_diff", diff, "", device, output, extraTags)
					prevStats.crcErrors[i] = count
				}
			}
		}

		if needsMetric("nv_nvlink_ecc_errors") {
			count, ret := nvml.DeviceGetNvLinkErrorCounter(device.device, i, nvml.NVLINK_ERROR_DL_ECC_DATA)
			if ret == nvml.SUCCESS {
				aggregate_ecc_errors += count
				if device.shouldOutput("nv_nvlink_ecc_errors") {
					sendMetric("nv_nvlink_ecc_errors", count, "", device, output, extraTags)
				}
				if config.SendDiffValues && device.shouldOutput("nv_nvlink_ecc_errors_diff") {
					var diff uint64
					if prevStats.eccErrors[i] == 0 {
						diff = 0
					} else {
						diff = count - prevStats.eccErrors[i]
						if diff > count {
							diff = 0
						}
					}
					sendMetric("nv_nvlink_ecc_errors_diff", diff, "", device, output, extraTags)
					prevStats.eccErrors[i] = count
				}
			}
		}

		if needsMetric("nv_nvlink_replay_errors") {
			count, ret := nvml.DeviceGetNvLinkErrorCounter(device.device, i, nvml.NVLINK_ERROR_DL_REPLAY)
			if ret == nvml.SUCCESS {
				aggregate_replay_errors += count
				if device.shouldOutput("nv_nvlink_replay_errors") {
					sendMetric("nv_nvlink_replay_errors", count, "", device, output, extraTags)
				}
				if config.SendDiffValues && device.shouldOutput("nv_nvlink_replay_errors_diff") {
					var diff uint64
					if prevStats.replayErrors[i] == 0 {
						diff = 0
					} else {
						diff = count - prevStats.replayErrors[i]
						if diff > count {
							diff = 0
						}
					}
					sendMetric("nv_nvlink_replay_errors_diff", diff, "", device, output, extraTags)
					prevStats.replayErrors[i] = count
				}
			}
		}

		if needsMetric("nv_nvlink_recovery_errors") {
			count, ret := nvml.DeviceGetNvLinkErrorCounter(device.device, i, nvml.NVLINK_ERROR_DL_RECOVERY)
			if ret == nvml.SUCCESS {
				aggregate_recovery_errors += count
				if device.shouldOutput("nv_nvlink_recovery_errors") {
					sendMetric("nv_nvlink_recovery_errors", count, "", device, output, extraTags)
				}
				if config.SendDiffValues && device.shouldOutput("nv_nvlink_recovery_errors_diff") {
					var diff uint64
					if prevStats.recoveryErrors[i] == 0 {
						diff = 0
					} else {
						diff = count - prevStats.recoveryErrors[i]
						if diff > count {
							diff = 0
						}
					}
					sendMetric("nv_nvlink_recovery_errors_diff", diff, "", device, output, extraTags)
					prevStats.recoveryErrors[i] = count
				}
			}
		}

		if needsMetric("nv_nvlink_crc_flit_errors") {
			count, ret := nvml.DeviceGetNvLinkErrorCounter(device.device, i, nvml.NVLINK_ERROR_DL_CRC_FLIT)
			if ret == nvml.SUCCESS {
				aggregate_crc_flit_errors += count
				if device.shouldOutput("nv_nvlink_crc_flit_errors") {
					sendMetric("nv_nvlink_crc_flit_errors", count, "", device, output, extraTags)
				}
				if config.SendDiffValues && device.shouldOutput("nv_nvlink_crc_flit_errors_diff") {
					var diff uint64
					if prevStats.crcFlitErrors[i] == 0 {
						diff = 0
					} else {
						diff = count - prevStats.crcFlitErrors[i]
						if diff > count {
							diff = 0
						}
					}
					sendMetric("nv_nvlink_crc_flit_errors_diff", diff, "", device, output, extraTags)
					prevStats.crcFlitErrors[i] = count
				}
			}
		}
	}

	// Export aggregated values
	if device.shouldOutput("nv_nvlink_crc_errors_sum") {
		sendMetric("nv_nvlink_crc_errors_sum", aggregate_crc_errors, "", device, output, map[string]string{"stype": "nvlink"})
	}
	if device.shouldOutput("nv_nvlink_ecc_errors_sum") {
		sendMetric("nv_nvlink_ecc_errors_sum", aggregate_ecc_errors, "", device, output, map[string]string{"stype": "nvlink"})
	}
	if device.shouldOutput("nv_nvlink_replay_errors_sum") {
		sendMetric("nv_nvlink_replay_errors_sum", aggregate_replay_errors, "", device, output, map[string]string{"stype": "nvlink"})
	}
	if device.shouldOutput("nv_nvlink_recovery_errors_sum") {
		sendMetric("nv_nvlink_recovery_errors_sum", aggregate_recovery_errors, "", device, output, map[string]string{"stype": "nvlink"})
	}
	if device.shouldOutput("nv_nvlink_crc_flit_errors_sum") {
		sendMetric("nv_nvlink_crc_flit_errors_sum", aggregate_crc_flit_errors, "", device, output, map[string]string{"stype": "nvlink"})
	}

	// Export aggregated diff values
	if config.SendDiffValues {
		var diff_crc_sum, diff_ecc_sum, diff_replay_sum, diff_recovery_sum, diff_crc_flit_sum uint64

		// Initialize diffs to 0 for the first measurement
		if prevStats.aggregateCrcErrors == 0 && prevStats.aggregateEccErrors == 0 && prevStats.aggregateReplayErrors == 0 && prevStats.aggregateRecoveryErrors == 0 && prevStats.aggregateCrcFlitErrors == 0 {
			diff_crc_sum = 0
			diff_ecc_sum = 0
			diff_replay_sum = 0
			diff_recovery_sum = 0
			diff_crc_flit_sum = 0
		} else {
			// Compute diffs for sum metrics
			diff_crc_sum = aggregate_crc_errors - prevStats.aggregateCrcErrors
			diff_ecc_sum = aggregate_ecc_errors - prevStats.aggregateEccErrors
			diff_replay_sum = aggregate_replay_errors - prevStats.aggregateReplayErrors
			diff_recovery_sum = aggregate_recovery_errors - prevStats.aggregateRecoveryErrors
			diff_crc_flit_sum = aggregate_crc_flit_errors - prevStats.aggregateCrcFlitErrors

			// Reset diffs to 0 if they exceed current values (e.g., counter reset)
			if diff_crc_sum > aggregate_crc_errors {
				diff_crc_sum = 0
			}
			if diff_ecc_sum > aggregate_ecc_errors {
				diff_ecc_sum = 0
			}
			if diff_replay_sum > aggregate_replay_errors {
				diff_replay_sum = 0
			}
			if diff_recovery_sum > aggregate_recovery_errors {
				diff_recovery_sum = 0
			}
			if diff_crc_flit_sum > aggregate_crc_flit_errors {
				diff_crc_flit_sum = 0
			}
		}

		// Update prevStats with current aggregate values
		prevStats.aggregateCrcErrors = aggregate_crc_errors
		prevStats.aggregateEccErrors = aggregate_ecc_errors
		prevStats.aggregateReplayErrors = aggregate_replay_errors
		prevStats.aggregateRecoveryErrors = aggregate_recovery_errors
		prevStats.aggregateCrcFlitErrors = aggregate_crc_flit_errors

		// Export diff metrics for sum values
		if device.shouldOutput("nv_nvlink_crc_errors_sum_diff") {
			sendMetric("nv_nvlink_crc_errors_sum_diff", diff_crc_sum, "", device, output, map[string]string{"stype": "nvlink"})
		}
		if device.shouldOutput("nv_nvlink_ecc_errors_sum_diff") {
			sendMetric("nv_nvlink_ecc_errors_sum_diff", diff_ecc_sum, "", device, output, map[string]string{"stype": "nvlink"})
		}
		if device.shouldOutput("nv_nvlink_replay_errors_sum_diff") {
			sendMetric("nv_nvlink_replay_errors_sum_diff", diff_replay_sum, "", device, output, map[string]string{"stype": "nvlink"})
		}
		if device.shouldOutput("nv_nvlink_recovery_errors_sum_diff") {
			sendMetric("nv_nvlink_recovery_errors_sum_diff", diff_recovery_sum, "", device, output, map[string]string{"stype": "nvlink"})
		}
		if device.shouldOutput("nv_nvlink_crc_flit_errors_sum_diff") {
			sendMetric("nv_nvlink_crc_flit_errors_sum_diff", diff_crc_flit_sum, "", device, output, map[string]string{"stype": "nvlink"})
		}
	}

	return nil
}

func (m *NvidiaCollector) Read(interval time.Duration, output chan lp.CCMessage) {
	if !m.init {
		return
	}

	// Helper function to get the device name
	deviceName := func(device NvidiaCollectorDevice) string {
		name, ret := nvml.DeviceGetName(device.device)
		if ret != nvml.SUCCESS {
			return "NoName"
		}
		return name
	}

	// Helper function that executes a metric function and logs errors
	processMetric := func(metricName string, f func(NvidiaCollectorDevice, NvidiaCollectorConfig, chan lp.CCMessage) error, device NvidiaCollectorDevice) {
		if err := f(device, m.config, output); err != nil {
			cclog.ComponentDebug(m.name, fmt.Sprintf("%s for device %s failed", metricName, deviceName(device)))
		}
	}

	// Executes all metric functions for a device
	readAll := func(device NvidiaCollectorDevice) {
		processMetric("readMemoryInfo", readMemoryInfo, device)
		processMetric("readUtilization", readUtilization, device)
		processMetric("readTemp", readTemp, device)
		processMetric("readFan", readFan, device)
		processMetric("readEccMode", readEccMode, device)
		processMetric("readPerfState", readPerfState, device)
		processMetric("readPowerUsage", readPowerUsage, device)
		processMetric("readClocks", readClocks, device)
		processMetric("readMaxClocks", readMaxClocks, device)
		processMetric("readPowerLimit", readPowerLimit, device)
		processMetric("readEncUtilization", readEncUtilization, device)
		processMetric("readDecUtilization", readDecUtilization, device)
		processMetric("readBarMemoryInfo", readBarMemoryInfo, device)
		processMetric("readProcessCounts", readProcessCounts, device)
	}

	// Loop over all GPUs
	for i := 0; i < m.num_gpus; i++ {
		readAll(m.gpus[i])
		deviceID := m.gpus[i].tags["type-id"]

		if _, ok := m.prevEccStats[deviceID]; !ok {
			m.prevEccStats[deviceID] = &eccStats{}
		}
		readEccErrors(m.gpus[i], m.config, output, m.prevEccStats[deviceID], deviceID)

		if _, ok := m.prevRemappedStats[deviceID]; !ok {
			m.prevRemappedStats[deviceID] = &remappedRowsStats{}
		}
		readRemappedRows(m.gpus[i], m.config, output, m.prevRemappedStats[deviceID], deviceID)

		if _, ok := m.prevViolationStats[deviceID]; !ok {
			m.prevViolationStats[deviceID] = &violationStats{}
		}
		readViolationStats(m.gpus[i], m.config, output, m.prevViolationStats[deviceID])

		if _, ok := m.prevNVLinkStats[deviceID]; !ok {
			m.prevNVLinkStats[deviceID] = &nvlinkStats{}
		}
		readNVLinkStats(m.gpus[i], m.config, output, m.prevNVLinkStats[deviceID], deviceID)

		// If MIG devices should be processed
		if m.config.ProcessMigDevices {
			current, _, ret := nvml.DeviceGetMigMode(m.gpus[i].device)
			if ret != nvml.SUCCESS || current == nvml.DEVICE_MIG_DISABLE {
				continue
			}

			maxMig, ret := nvml.DeviceGetMaxMigDeviceCount(m.gpus[i].device)
			if ret != nvml.SUCCESS || maxMig == 0 {
				continue
			}
			cclog.ComponentDebug(m.name, "Reading MIG devices for GPU", i)

			for j := 0; j < maxMig; j++ {
				mdev, ret := nvml.DeviceGetMigDeviceHandleByIndex(m.gpus[i].device, j)
				if ret != nvml.SUCCESS {
					continue
				}

				excludeMetrics := make(map[string]bool)
				for _, metric := range m.config.ExcludeMetrics {
					excludeMetrics[metric] = true
				}

				// Initialize the MIG device and copy tags and meta data
				migDevice := NvidiaCollectorDevice{
					device:         mdev,
					tags:           make(map[string]string),
					meta:           make(map[string]string),
					excludeMetrics: excludeMetrics,
					config:         m.config,
				}
				for k, v := range m.gpus[i].tags {
					migDevice.tags[k] = v
				}
				migDevice.tags["stype"] = "mig"
				if m.config.UseUuidForMigDevices {
					uuid, ret := nvml.DeviceGetUUID(mdev)
					if ret != nvml.SUCCESS {
						cclog.ComponentError(m.name, "Unable to get UUID for mig device at index", j, ":", "error occurred")
					} else {
						migDevice.tags["stype-id"] = uuid
					}
				} else if m.config.UseSliceForMigDevices {
					name, ret := nvml.DeviceGetName(m.gpus[i].device)
					if ret == nvml.SUCCESS {
						mname, ret := nvml.DeviceGetName(mdev)
						if ret == nvml.SUCCESS {
							x := strings.Replace(mname, name, "", -1)
							x = strings.Replace(x, "MIG", "", -1)
							x = strings.TrimSpace(x)
							migDevice.tags["stype-id"] = x
						}
					}
				}
				if _, ok := migDevice.tags["stype-id"]; !ok {
					migDevice.tags["stype-id"] = fmt.Sprintf("%d", j)
				}
				for k, v := range m.gpus[i].meta {
					migDevice.meta[k] = v
				}
				if _, ok := migDevice.meta["uuid"]; ok && !m.config.UseUuidForMigDevices {
					uuid, ret := nvml.DeviceGetUUID(mdev)
					if ret == nvml.SUCCESS {
						migDevice.meta["uuid"] = uuid
					}
				}

				// Read all metrics for the MIG device
				readAll(migDevice)
			}
		}
	}
}

func (m *NvidiaCollector) Close() {
	if m.init {
		nvml.Shutdown()
		m.init = false
	}
}