diff --git a/collectors/nvidiaMetric.go b/collectors/nvidiaMetric.go index 24f0855..3828284 100644 --- a/collectors/nvidiaMetric.go +++ b/collectors/nvidiaMetric.go @@ -13,22 +13,28 @@ import ( ) type NvidiaCollectorConfig struct { - ExcludeMetrics []string `json:"exclude_metrics,omitempty"` - ExcludeDevices []string `json:"exclude_devices,omitempty"` - AddPciInfoTag bool `json:"add_pci_info_tag,omitempty"` + ExcludeMetrics []string `json:"exclude_metrics,omitempty"` + ExcludeDevices []string `json:"exclude_devices,omitempty"` + AddPciInfoTag bool `json:"add_pci_info_tag,omitempty"` + UsePciInfoAsTypeId bool `json:"use_pci_info_as_type_id,omitempty"` + AddUuidMeta bool `json:"add_uuid_meta,omitempty"` + AddBoardNumberMeta bool `json:"add_board_number_meta,omitempty"` + AddSerialMeta bool `json:"add_serial_meta,omitempty"` + ProcessMigDevices bool `json:"process_mig_devices,omitempty"` } type NvidiaCollectorDevice struct { device nvml.Device excludeMetrics map[string]bool tags map[string]string + meta map[string]string } type NvidiaCollector struct { metricCollector - num_gpus int config NvidiaCollectorConfig gpus []NvidiaCollectorDevice + num_gpus int } func (m *NvidiaCollector) CatchPanic() { @@ -42,6 +48,7 @@ func (m *NvidiaCollector) Init(config json.RawMessage) error { var err error m.name = "NvidiaCollector" m.config.AddPciInfoTag = false + m.config.UsePciInfoAsTypeId = false m.setup() if len(config) > 0 { err = json.Unmarshal(config, &m.config) @@ -54,7 +61,6 @@ func (m *NvidiaCollector) Init(config json.RawMessage) error { "group": "Nvidia", } - m.num_gpus = 0 defer m.CatchPanic() // Initialize NVIDIA Management Library (NVML) @@ -74,13 +80,14 @@ func (m *NvidiaCollector) Init(config json.RawMessage) error { } // For all GPUs + idx := 0 m.gpus = make([]NvidiaCollectorDevice, num_gpus) for i := 0; i < num_gpus; i++ { - g := &m.gpus[i] - // Skip excluded devices + // Skip excluded devices by ID str_i := fmt.Sprintf("%d", i) if _, skip := 
stringArrayContains(m.config.ExcludeDevices, str_i); skip { + cclog.ComponentDebug(m.name, "Skipping excluded device", str_i) continue } @@ -89,14 +96,85 @@ func (m *NvidiaCollector) Init(config json.RawMessage) error { if ret != nvml.SUCCESS { err = errors.New(nvml.ErrorString(ret)) cclog.ComponentError(m.name, "Unable to get device at index", i, ":", err.Error()) - return err + continue } + + // Get device's PCI info + pciInfo, ret := nvml.DeviceGetPciInfo(device) + if ret != nvml.SUCCESS { + err = errors.New(nvml.ErrorString(ret)) + cclog.ComponentError(m.name, "Unable to get PCI info for device at index", i, ":", err.Error()) + continue + } + // Create PCI ID in the common format used by the NVML. + pci_id := fmt.Sprintf( + nvml.DEVICE_PCI_BUS_ID_FMT, + pciInfo.Domain, + pciInfo.Bus, + pciInfo.Device) + + // Skip excluded devices specified by PCI ID + if _, skip := stringArrayContains(m.config.ExcludeDevices, pci_id); skip { + cclog.ComponentDebug(m.name, "Skipping excluded device", pci_id) + continue + } + + // Select which value to use as 'type-id'. + // The PCI ID is commonly required in SLURM environments because the + // numberic IDs used by SLURM and the ones used by NVML might differ + // depending on the job type. The PCI ID is more reliable but is commonly + // not recorded for a job, so it must be added manually in prologue or epilogue + // e.g. 
to the comment field + tid := str_i + if m.config.UsePciInfoAsTypeId { + tid = pci_id + } + + // Now we got all infos together, populate the device list + g := &m.gpus[idx] + + // Add device handle g.device = device // Add tags g.tags = map[string]string{ "type": "accelerator", - "type-id": str_i, + "type-id": tid, + } + + // Add PCI info as tag if not already used as 'type-id' + if m.config.AddPciInfoTag && !m.config.UsePciInfoAsTypeId { + g.tags["pci_identifier"] = pci_id + } + + g.meta = map[string]string{ + "source": m.name, + "group": "Nvidia", + } + + if m.config.AddBoardNumberMeta { + board, ret := nvml.DeviceGetBoardPartNumber(device) + if ret != nvml.SUCCESS { + cclog.ComponentError(m.name, "Unable to get board part number for device at index", i, ":", nvml.ErrorString(ret)) + } else { + g.meta["board_number"] = board + } + } + if m.config.AddSerialMeta { + serial, ret := nvml.DeviceGetSerial(device) + if ret != nvml.SUCCESS { + cclog.ComponentError(m.name, "Unable to get serial number for device at index", i, ":", nvml.ErrorString(ret)) + } else { + g.meta["serial"] = serial + } + } + if m.config.AddUuidMeta { + uuid, ret := nvml.DeviceGetUUID(device) + if ret != nvml.SUCCESS { + cclog.ComponentError(m.name, "Unable to get UUID for device at index", i, ":", nvml.ErrorString(ret)) + } else { + g.meta["uuid"] = uuid + } } // Add excluded metrics @@ -105,363 +183,982 @@ func (m *NvidiaCollector) Init(config json.RawMessage) error { g.excludeMetrics[e] = true } - // Add PCI info as tag - if m.config.AddPciInfoTag { - pciInfo, ret := nvml.DeviceGetPciInfo(g.device) - if ret != nvml.SUCCESS { - err = errors.New(nvml.ErrorString(ret)) - cclog.ComponentError(m.name, "Unable to get PCI info for device at index", i, ":", err.Error()) - return err - } - g.tags["pci_identifier"] = fmt.Sprintf( - "%08X:%02X:%02X.0", - pciInfo.Domain, - pciInfo.Bus, - pciInfo.Device) - } + // Increment the index for the next device + idx++ } + m.num_gpus = idx m.init = true return nil } +func readMemoryInfo(device 
NvidiaCollectorDevice, output chan lp.CCMetric) error { + if !device.excludeMetrics["nv_fb_mem_total"] || !device.excludeMetrics["nv_fb_mem_used"] || !device.excludeMetrics["nv_fb_mem_reserved"] { + var total uint64 + var used uint64 + var reserved uint64 = 0 + var v2 bool = false + meminfo, ret := nvml.DeviceGetMemoryInfo(device.device) + if ret != nvml.SUCCESS { + err := errors.New(nvml.ErrorString(ret)) + return err + } + total = meminfo.Total + used = meminfo.Used + + if !device.excludeMetrics["nv_fb_mem_total"] { + t := float64(total) / (1024 * 1024) + y, err := lp.New("nv_fb_mem_total", device.tags, device.meta, map[string]interface{}{"value": t}, time.Now()) + if err == nil { + y.AddMeta("unit", "MByte") + output <- y + } + } + + if !device.excludeMetrics["nv_fb_mem_used"] { + f := float64(used) / (1024 * 1024) + y, err := lp.New("nv_fb_mem_used", device.tags, device.meta, map[string]interface{}{"value": f}, time.Now()) + if err == nil { + y.AddMeta("unit", "MByte") + output <- y + } + } + + if v2 && !device.excludeMetrics["nv_fb_mem_reserved"] { + r := float64(reserved) / (1024 * 1024) + y, err := lp.New("nv_fb_mem_reserved", device.tags, device.meta, map[string]interface{}{"value": r}, time.Now()) + if err == nil { + y.AddMeta("unit", "MByte") + output <- y + } + } + } + return nil +} + +func readBarMemoryInfo(device NvidiaCollectorDevice, output chan lp.CCMetric) error { + if !device.excludeMetrics["nv_bar1_mem_total"] || !device.excludeMetrics["nv_bar1_mem_used"] { + meminfo, ret := nvml.DeviceGetBAR1MemoryInfo(device.device) + if ret != nvml.SUCCESS { + err := errors.New(nvml.ErrorString(ret)) + return err + } + if !device.excludeMetrics["nv_bar1_mem_total"] { + t := float64(meminfo.Bar1Total) / (1024 * 1024) + y, err := lp.New("nv_bar1_mem_total", device.tags, device.meta, map[string]interface{}{"value": t}, time.Now()) + if err == nil { + y.AddMeta("unit", "MByte") + output <- y + } + } + if !device.excludeMetrics["nv_bar1_mem_used"] { + t := 
float64(meminfo.Bar1Used) / (1024 * 1024) + y, err := lp.New("nv_bar1_mem_used", device.tags, device.meta, map[string]interface{}{"value": t}, time.Now()) + if err == nil { + y.AddMeta("unit", "MByte") + output <- y + } + } + } + return nil +} + +func readUtilization(device NvidiaCollectorDevice, output chan lp.CCMetric) error { + isMig, ret := nvml.DeviceIsMigDeviceHandle(device.device) + if ret != nvml.SUCCESS { + err := errors.New(nvml.ErrorString(ret)) + return err + } + if isMig { + return nil + } + + if !device.excludeMetrics["nv_util"] || !device.excludeMetrics["nv_mem_util"] { + // Retrieves the current utilization rates for the device's major subsystems. + // + // Available utilization rates + // * Gpu: Percent of time over the past sample period during which one or more kernels was executing on the GPU. + // * Memory: Percent of time over the past sample period during which global (device) memory was being read or written + // + // Note: + // * During driver initialization when ECC is enabled one can see high GPU and Memory Utilization readings. + // This is caused by ECC Memory Scrubbing mechanism that is performed during driver initialization. + // * On MIG-enabled GPUs, querying device utilization rates is not currently supported. 
+ util, ret := nvml.DeviceGetUtilizationRates(device.device) + if ret == nvml.SUCCESS { + if !device.excludeMetrics["nv_util"] { + y, err := lp.New("nv_util", device.tags, device.meta, map[string]interface{}{"value": float64(util.Gpu)}, time.Now()) + if err == nil { + y.AddMeta("unit", "%") + output <- y + } + } + if !device.excludeMetrics["nv_mem_util"] { + y, err := lp.New("nv_mem_util", device.tags, device.meta, map[string]interface{}{"value": float64(util.Memory)}, time.Now()) + if err == nil { + y.AddMeta("unit", "%") + output <- y + } + } + } + } + return nil +} + +func readTemp(device NvidiaCollectorDevice, output chan lp.CCMetric) error { + if !device.excludeMetrics["nv_temp"] { + // Retrieves the current temperature readings for the device, in degrees C. + // + // Available temperature sensors: + // * TEMPERATURE_GPU: Temperature sensor for the GPU die. + // * NVML_TEMPERATURE_COUNT + temp, ret := nvml.DeviceGetTemperature(device.device, nvml.TEMPERATURE_GPU) + if ret == nvml.SUCCESS { + y, err := lp.New("nv_temp", device.tags, device.meta, map[string]interface{}{"value": float64(temp)}, time.Now()) + if err == nil { + y.AddMeta("unit", "degC") + output <- y + } + } + } + return nil +} + +func readFan(device NvidiaCollectorDevice, output chan lp.CCMetric) error { + if !device.excludeMetrics["nv_fan"] { + // Retrieves the intended operating speed of the device's fan. + // + // Note: The reported speed is the intended fan speed. + // If the fan is physically blocked and unable to spin, the output will not match the actual fan speed. + // + // For all discrete products with dedicated fans. + // + // The fan speed is expressed as a percentage of the product's maximum noise tolerance fan speed. + // This value may exceed 100% in certain cases. 
+ fan, ret := nvml.DeviceGetFanSpeed(device.device) + if ret == nvml.SUCCESS { + y, err := lp.New("nv_fan", device.tags, device.meta, map[string]interface{}{"value": float64(fan)}, time.Now()) + if err == nil { + y.AddMeta("unit", "%") + output <- y + } + } + } + return nil +} + +// func readFans(device NvidiaCollectorDevice, output chan lp.CCMetric) error { +// if !device.excludeMetrics["nv_fan"] { +// numFans, ret := nvml.DeviceGetNumFans(device.device) +// if ret == nvml.SUCCESS { +// for i := 0; i < numFans; i++ { +// fan, ret := nvml.DeviceGetFanSpeed_v2(device.device, i) +// if ret == nvml.SUCCESS { +// y, err := lp.New("nv_fan", device.tags, device.meta, map[string]interface{}{"value": float64(fan)}, time.Now()) +// if err == nil { +// y.AddMeta("unit", "%") +// y.AddTag("stype", "fan") +// y.AddTag("stype-id", fmt.Sprintf("%d", i)) +// output <- y +// } +// } +// } +// } +// } +// return nil +// } + +func readEccMode(device NvidiaCollectorDevice, output chan lp.CCMetric) error { + if !device.excludeMetrics["nv_ecc_mode"] { + // Retrieves the current and pending ECC modes for the device. + // + // For Fermi or newer fully supported devices. Only applicable to devices with ECC. + // Requires NVML_INFOROM_ECC version 1.0 or higher. + // + // Changing ECC modes requires a reboot. + // The "pending" ECC mode refers to the target mode following the next reboot. 
+ _, ecc_pend, ret := nvml.DeviceGetEccMode(device.device) + if ret == nvml.SUCCESS { + var y lp.CCMetric + var err error + switch ecc_pend { + case nvml.FEATURE_DISABLED: + y, err = lp.New("nv_ecc_mode", device.tags, device.meta, map[string]interface{}{"value": "OFF"}, time.Now()) + case nvml.FEATURE_ENABLED: + y, err = lp.New("nv_ecc_mode", device.tags, device.meta, map[string]interface{}{"value": "ON"}, time.Now()) + default: + y, err = lp.New("nv_ecc_mode", device.tags, device.meta, map[string]interface{}{"value": "UNKNOWN"}, time.Now()) + } + if err == nil { + output <- y + } + } else if ret == nvml.ERROR_NOT_SUPPORTED { + y, err := lp.New("nv_ecc_mode", device.tags, device.meta, map[string]interface{}{"value": "N/A"}, time.Now()) + if err == nil { + output <- y + } + } + } + return nil +} + +func readPerfState(device NvidiaCollectorDevice, output chan lp.CCMetric) error { + if !device.excludeMetrics["nv_perf_state"] { + // Retrieves the current performance state for the device. + // + // Allowed PStates: + // 0: Maximum Performance. + // .. + // 15: Minimum Performance. + // 32: Unknown performance state. + pState, ret := nvml.DeviceGetPerformanceState(device.device) + if ret == nvml.SUCCESS { + y, err := lp.New("nv_perf_state", device.tags, device.meta, map[string]interface{}{"value": fmt.Sprintf("P%d", int(pState))}, time.Now()) + if err == nil { + output <- y + } + } + } + return nil +} + +func readPowerUsage(device NvidiaCollectorDevice, output chan lp.CCMetric) error { + if !device.excludeMetrics["nv_power_usage"] { + // Retrieves power usage for this GPU in milliwatts and its associated circuitry (e.g. memory) + // + // On Fermi and Kepler GPUs the reading is accurate to within +/- 5% of current power draw. 
+ // + // It is only available if power management mode is supported + mode, ret := nvml.DeviceGetPowerManagementMode(device.device) + if ret != nvml.SUCCESS { + return nil + } + if mode == nvml.FEATURE_ENABLED { + power, ret := nvml.DeviceGetPowerUsage(device.device) + if ret == nvml.SUCCESS { + y, err := lp.New("nv_power_usage", device.tags, device.meta, map[string]interface{}{"value": float64(power) / 1000}, time.Now()) + if err == nil { + y.AddMeta("unit", "watts") + output <- y + } + } + } + } + return nil +} + +func readClocks(device NvidiaCollectorDevice, output chan lp.CCMetric) error { + // Retrieves the current clock speeds for the device. + // + // Available clock information: + // * CLOCK_GRAPHICS: Graphics clock domain. + // * CLOCK_SM: Streaming Multiprocessor clock domain. + // * CLOCK_MEM: Memory clock domain. + if !device.excludeMetrics["nv_graphics_clock"] { + graphicsClock, ret := nvml.DeviceGetClockInfo(device.device, nvml.CLOCK_GRAPHICS) + if ret == nvml.SUCCESS { + y, err := lp.New("nv_graphics_clock", device.tags, device.meta, map[string]interface{}{"value": float64(graphicsClock)}, time.Now()) + if err == nil { + y.AddMeta("unit", "MHz") + output <- y + } + } + } + + if !device.excludeMetrics["nv_sm_clock"] { + smCock, ret := nvml.DeviceGetClockInfo(device.device, nvml.CLOCK_SM) + if ret == nvml.SUCCESS { + y, err := lp.New("nv_sm_clock", device.tags, device.meta, map[string]interface{}{"value": float64(smCock)}, time.Now()) + if err == nil { + y.AddMeta("unit", "MHz") + output <- y + } + } + } + + if !device.excludeMetrics["nv_mem_clock"] { + memClock, ret := nvml.DeviceGetClockInfo(device.device, nvml.CLOCK_MEM) + if ret == nvml.SUCCESS { + y, err := lp.New("nv_mem_clock", device.tags, device.meta, map[string]interface{}{"value": float64(memClock)}, time.Now()) + if err == nil { + y.AddMeta("unit", "MHz") + output <- y + } + } + } + if !device.excludeMetrics["nv_video_clock"] { + memClock, ret := nvml.DeviceGetClockInfo(device.device, 
nvml.CLOCK_VIDEO) + if ret == nvml.SUCCESS { + y, err := lp.New("nv_video_clock", device.tags, device.meta, map[string]interface{}{"value": float64(memClock)}, time.Now()) + if err == nil { + y.AddMeta("unit", "MHz") + output <- y + } + } + } + return nil +} + +func readMaxClocks(device NvidiaCollectorDevice, output chan lp.CCMetric) error { + // Retrieves the maximum clock speeds for the device. + // + // Available clock information: + // * CLOCK_GRAPHICS: Graphics clock domain. + // * CLOCK_SM: Streaming multiprocessor clock domain. + // * CLOCK_MEM: Memory clock domain. + // * CLOCK_VIDEO: Video encoder/decoder clock domain. + // * CLOCK_COUNT: Count of clock types. + // + // Note: + /// On GPUs from Fermi family current P0 clocks (reported by nvmlDeviceGetClockInfo) can differ from max clocks by few MHz. + if !device.excludeMetrics["nv_max_graphics_clock"] { + max_gclk, ret := nvml.DeviceGetMaxClockInfo(device.device, nvml.CLOCK_GRAPHICS) + if ret == nvml.SUCCESS { + y, err := lp.New("nv_max_graphics_clock", device.tags, device.meta, map[string]interface{}{"value": float64(max_gclk)}, time.Now()) + if err == nil { + y.AddMeta("unit", "MHz") + output <- y + } + } + } + + if !device.excludeMetrics["nv_max_sm_clock"] { + maxSmClock, ret := nvml.DeviceGetMaxClockInfo(device.device, nvml.CLOCK_SM) + if ret == nvml.SUCCESS { + y, err := lp.New("nv_max_sm_clock", device.tags, device.meta, map[string]interface{}{"value": float64(maxSmClock)}, time.Now()) + if err == nil { + y.AddMeta("unit", "MHz") + output <- y + } + } + } + + if !device.excludeMetrics["nv_max_mem_clock"] { + maxMemClock, ret := nvml.DeviceGetMaxClockInfo(device.device, nvml.CLOCK_MEM) + if ret == nvml.SUCCESS { + y, err := lp.New("nv_max_mem_clock", device.tags, device.meta, map[string]interface{}{"value": float64(maxMemClock)}, time.Now()) + if err == nil { + y.AddMeta("unit", "MHz") + output <- y + } + } + } + + if !device.excludeMetrics["nv_max_video_clock"] { + maxMemClock, ret := 
nvml.DeviceGetMaxClockInfo(device.device, nvml.CLOCK_VIDEO) + if ret == nvml.SUCCESS { + y, err := lp.New("nv_max_video_clock", device.tags, device.meta, map[string]interface{}{"value": float64(maxMemClock)}, time.Now()) + if err == nil { + y.AddMeta("unit", "MHz") + output <- y + } + } + } + return nil +} + +func readEccErrors(device NvidiaCollectorDevice, output chan lp.CCMetric) error { + if !device.excludeMetrics["nv_ecc_uncorrected_error"] { + // Retrieves the total ECC error counts for the device. + // + // For Fermi or newer fully supported devices. + // Only applicable to devices with ECC. + // Requires NVML_INFOROM_ECC version 1.0 or higher. + // Requires ECC Mode to be enabled. + // + // The total error count is the sum of errors across each of the separate memory systems, + // i.e. the total set of errors across the entire device. + ecc_db, ret := nvml.DeviceGetTotalEccErrors(device.device, nvml.MEMORY_ERROR_TYPE_UNCORRECTED, nvml.AGGREGATE_ECC) + if ret == nvml.SUCCESS { + y, err := lp.New("nv_ecc_uncorrected_error", device.tags, device.meta, map[string]interface{}{"value": float64(ecc_db)}, time.Now()) + if err == nil { + output <- y + } + } + } + if !device.excludeMetrics["nv_ecc_corrected_error"] { + ecc_sb, ret := nvml.DeviceGetTotalEccErrors(device.device, nvml.MEMORY_ERROR_TYPE_CORRECTED, nvml.AGGREGATE_ECC) + if ret == nvml.SUCCESS { + y, err := lp.New("nv_ecc_corrected_error", device.tags, device.meta, map[string]interface{}{"value": float64(ecc_sb)}, time.Now()) + if err == nil { + output <- y + } + } + } + return nil +} + +func readPowerLimit(device NvidiaCollectorDevice, output chan lp.CCMetric) error { + if !device.excludeMetrics["nv_power_max_limit"] { + // Retrieves the power management limit associated with this device. + // + // For Fermi or newer fully supported devices. + // + // The power limit defines the upper boundary for the card's power draw. 
+ // If the card's total power draw reaches this limit the power management algorithm kicks in. + pwr_limit, ret := nvml.DeviceGetPowerManagementLimit(device.device) + if ret == nvml.SUCCESS { + y, err := lp.New("nv_power_max_limit", device.tags, device.meta, map[string]interface{}{"value": float64(pwr_limit) / 1000}, time.Now()) + if err == nil { + y.AddMeta("unit", "watts") + output <- y + } + } + } + return nil +} + +func readEncUtilization(device NvidiaCollectorDevice, output chan lp.CCMetric) error { + isMig, ret := nvml.DeviceIsMigDeviceHandle(device.device) + if ret != nvml.SUCCESS { + err := errors.New(nvml.ErrorString(ret)) + return err + } + if isMig { + return nil + } + if !device.excludeMetrics["nv_encoder_util"] { + // Retrieves the current utilization and sampling size in microseconds for the Encoder + // + // For Kepler or newer fully supported devices. + // + // Note: On MIG-enabled GPUs, querying encoder utilization is not currently supported. + enc_util, _, ret := nvml.DeviceGetEncoderUtilization(device.device) + if ret == nvml.SUCCESS { + y, err := lp.New("nv_encoder_util", device.tags, device.meta, map[string]interface{}{"value": float64(enc_util)}, time.Now()) + if err == nil { + y.AddMeta("unit", "%") + output <- y + } + } + } + return nil +} + +func readDecUtilization(device NvidiaCollectorDevice, output chan lp.CCMetric) error { + isMig, ret := nvml.DeviceIsMigDeviceHandle(device.device) + if ret != nvml.SUCCESS { + err := errors.New(nvml.ErrorString(ret)) + return err + } + if isMig { + return nil + } + if !device.excludeMetrics["nv_decoder_util"] { + // Retrieves the current utilization and sampling size in microseconds for the Decoder + // + // For Kepler or newer fully supported devices. + // + // Note: On MIG-enabled GPUs, querying decoder utilization is not currently supported. 
+ dec_util, _, ret := nvml.DeviceGetDecoderUtilization(device.device) + if ret == nvml.SUCCESS { + y, err := lp.New("nv_decoder_util", device.tags, device.meta, map[string]interface{}{"value": float64(dec_util)}, time.Now()) + if err == nil { + y.AddMeta("unit", "%") + output <- y + } + } + } + return nil +} + +func readRemappedRows(device NvidiaCollectorDevice, output chan lp.CCMetric) error { + if !device.excludeMetrics["nv_remapped_rows_corrected"] || + !device.excludeMetrics["nv_remapped_rows_uncorrected"] || + !device.excludeMetrics["nv_remapped_rows_pending"] || + !device.excludeMetrics["nv_remapped_rows_failure"] { + // Get number of remapped rows. The number of rows reported will be based on the cause of the remapping. + // isPending indicates whether or not there are pending remappings. + // A reset will be required to actually remap the row. + // failureOccurred will be set if a row remapping ever failed in the past. + // A pending remapping won't affect future work on the GPU since error-containment and dynamic page blacklisting will take care of that. + // + // For Ampere or newer fully supported devices. 
+ // + // Note: On MIG-enabled GPUs with active instances, querying the number of remapped rows is not supported + corrected, uncorrected, pending, failure, ret := nvml.DeviceGetRemappedRows(device.device) + if ret == nvml.SUCCESS { + if !device.excludeMetrics["nv_remapped_rows_corrected"] { + y, err := lp.New("nv_remapped_rows_corrected", device.tags, device.meta, map[string]interface{}{"value": float64(corrected)}, time.Now()) + if err == nil { + output <- y + } + } + if !device.excludeMetrics["nv_remapped_rows_uncorrected"] { + y, err := lp.New("nv_remapped_rows_uncorrected", device.tags, device.meta, map[string]interface{}{"value": float64(uncorrected)}, time.Now()) + if err == nil { + output <- y + } + } + if !device.excludeMetrics["nv_remapped_rows_pending"] { + var p int = 0 + if pending { + p = 1 + } + y, err := lp.New("nv_remapped_rows_pending", device.tags, device.meta, map[string]interface{}{"value": p}, time.Now()) + if err == nil { + output <- y + } + } + if !device.excludeMetrics["nv_remapped_rows_failure"] { + var f int = 0 + if failure { + f = 1 + } + y, err := lp.New("nv_remapped_rows_failure", device.tags, device.meta, map[string]interface{}{"value": f}, time.Now()) + if err == nil { + output <- y + } + } + } + } + return nil +} + +func readProcessCounts(device NvidiaCollectorDevice, output chan lp.CCMetric) error { + if !device.excludeMetrics["nv_compute_processes"] { + // Get information about processes with a compute context on a device + // + // For Fermi &tm; or newer fully supported devices. + // + // This function returns information only about compute running processes (e.g. CUDA application which have + // active context). Any graphics applications (e.g. using OpenGL, DirectX) won't be listed by this function. + // + // To query the current number of running compute processes, call this function with *infoCount = 0. The + // return code will be NVML_ERROR_INSUFFICIENT_SIZE, or NVML_SUCCESS if none are running. 
For this call + // \a infos is allowed to be NULL. + // + // The usedGpuMemory field returned is all of the memory used by the application. + // + // Keep in mind that information returned by this call is dynamic and the number of elements might change in + // time. Allocate more space for \a infos table in case new compute processes are spawned. + // + // @note In MIG mode, if device handle is provided, the API returns aggregate information, only if + // the caller has appropriate privileges. Per-instance information can be queried by using + // specific MIG device handles. + // Querying per-instance information using MIG device handles is not supported if the device is in vGPU Host virtualization mode. + procList, ret := nvml.DeviceGetComputeRunningProcesses(device.device) + if ret == nvml.SUCCESS { + y, err := lp.New("nv_compute_processes", device.tags, device.meta, map[string]interface{}{"value": len(procList)}, time.Now()) + if err == nil { + output <- y + } + } + } + if !device.excludeMetrics["nv_graphics_processes"] { + // Get information about processes with a graphics context on a device + // + // For Kepler &tm; or newer fully supported devices. + // + // This function returns information only about graphics based processes + // (eg. applications using OpenGL, DirectX) + // + // To query the current number of running graphics processes, call this function with *infoCount = 0. The + // return code will be NVML_ERROR_INSUFFICIENT_SIZE, or NVML_SUCCESS if none are running. For this call + // \a infos is allowed to be NULL. + // + // The usedGpuMemory field returned is all of the memory used by the application. + // + // Keep in mind that information returned by this call is dynamic and the number of elements might change in + // time. Allocate more space for \a infos table in case new graphics processes are spawned. 
+ // + // @note In MIG mode, if device handle is provided, the API returns aggregate information, only if + // the caller has appropriate privileges. Per-instance information can be queried by using + // specific MIG device handles. + // Querying per-instance information using MIG device handles is not supported if the device is in vGPU Host virtualization mode. + procList, ret := nvml.DeviceGetGraphicsRunningProcesses(device.device) + if ret == nvml.SUCCESS { + y, err := lp.New("nv_graphics_processes", device.tags, device.meta, map[string]interface{}{"value": len(procList)}, time.Now()) + if err == nil { + output <- y + } + } + } + // if !device.excludeMetrics["nv_mps_compute_processes"] { + // // Get information about processes with a MPS compute context on a device + // // + // // For Volta &tm; or newer fully supported devices. + // // + // // This function returns information only about compute running processes (e.g. CUDA application which have + // // active context) utilizing MPS. Any graphics applications (e.g. using OpenGL, DirectX) won't be listed by + // // this function. + // // + // // To query the current number of running compute processes, call this function with *infoCount = 0. The + // // return code will be NVML_ERROR_INSUFFICIENT_SIZE, or NVML_SUCCESS if none are running. For this call + // // \a infos is allowed to be NULL. + // // + // // The usedGpuMemory field returned is all of the memory used by the application. + // // + // // Keep in mind that information returned by this call is dynamic and the number of elements might change in + // // time. Allocate more space for \a infos table in case new compute processes are spawned. + // // + // // @note In MIG mode, if device handle is provided, the API returns aggregate information, only if + // // the caller has appropriate privileges. Per-instance information can be queried by using + // // specific MIG device handles. 
+ // // Querying per-instance information using MIG device handles is not supported if the device is in vGPU Host virtualization mode. + // procList, ret := nvml.DeviceGetMPSComputeRunningProcesses(device.device) + // if ret == nvml.SUCCESS { + // y, err := lp.New("nv_mps_compute_processes", device.tags, device.meta, map[string]interface{}{"value": len(procList)}, time.Now()) + // if err == nil { + // output <- y + // } + // } + // } + return nil +} + +func readViolationStats(device NvidiaCollectorDevice, output chan lp.CCMetric) error { + var violTime nvml.ViolationTime + var ret nvml.Return + + // Gets the duration of time during which the device was throttled (lower than requested clocks) due to power + // or thermal constraints. + // + // The method is important to users who are tying to understand if their GPUs throttle at any point during their applications. The + // difference in violation times at two different reference times gives the indication of GPU throttling event. + // + // Violation for thermal capping is not supported at this time. + // + // For Kepler or newer fully supported devices. 
+ + if !device.excludeMetrics["nv_violation_power"] { + // How long did power violations cause the GPU to be below application clocks + violTime, ret = nvml.DeviceGetViolationStatus(device.device, nvml.PERF_POLICY_POWER) + if ret == nvml.SUCCESS { + t := float64(violTime.ViolationTime) * 1e-9 + y, err := lp.New("nv_violation_power", device.tags, device.meta, map[string]interface{}{"value": t}, time.Now()) + if err == nil { + y.AddMeta("unit", "sec") + output <- y + } + } + } + if !device.excludeMetrics["nv_violation_thermal"] { + // How long did thermal violations cause the GPU to be below application clocks + violTime, ret = nvml.DeviceGetViolationStatus(device.device, nvml.PERF_POLICY_THERMAL) + if ret == nvml.SUCCESS { + t := float64(violTime.ViolationTime) * 1e-9 + y, err := lp.New("nv_violation_thermal", device.tags, device.meta, map[string]interface{}{"value": t}, time.Now()) + if err == nil { + y.AddMeta("unit", "sec") + output <- y + } + } + } + if !device.excludeMetrics["nv_violation_sync_boost"] { + // How long did sync boost cause the GPU to be below application clocks + violTime, ret = nvml.DeviceGetViolationStatus(device.device, nvml.PERF_POLICY_SYNC_BOOST) + if ret == nvml.SUCCESS { + t := float64(violTime.ViolationTime) * 1e-9 + y, err := lp.New("nv_violation_sync_boost", device.tags, device.meta, map[string]interface{}{"value": t}, time.Now()) + if err == nil { + y.AddMeta("unit", "sec") + output <- y + } + } + } + if !device.excludeMetrics["nv_violation_board_limit"] { + // How long did the board limit cause the GPU to be below application clocks + violTime, ret = nvml.DeviceGetViolationStatus(device.device, nvml.PERF_POLICY_BOARD_LIMIT) + if ret == nvml.SUCCESS { + t := float64(violTime.ViolationTime) * 1e-9 + y, err := lp.New("nv_violation_board_limit", device.tags, device.meta, map[string]interface{}{"value": t}, time.Now()) + if err == nil { + y.AddMeta("unit", "sec") + output <- y + } + } + } + if 
!device.excludeMetrics["nv_violation_low_util"] { + // How long did low utilization cause the GPU to be below application clocks + violTime, ret = nvml.DeviceGetViolationStatus(device.device, nvml.PERF_POLICY_LOW_UTILIZATION) + if ret == nvml.SUCCESS { + t := float64(violTime.ViolationTime) * 1e-9 + y, err := lp.New("nv_violation_low_util", device.tags, device.meta, map[string]interface{}{"value": t}, time.Now()) + if err == nil { + y.AddMeta("unit", "sec") + output <- y + } + } + } + if !device.excludeMetrics["nv_violation_reliability"] { + // How long did the board reliability limit cause the GPU to be below application clocks + violTime, ret = nvml.DeviceGetViolationStatus(device.device, nvml.PERF_POLICY_RELIABILITY) + if ret == nvml.SUCCESS { + t := float64(violTime.ViolationTime) * 1e-9 + y, err := lp.New("nv_violation_reliability", device.tags, device.meta, map[string]interface{}{"value": t}, time.Now()) + if err == nil { + y.AddMeta("unit", "sec") + output <- y + } + } + } + if !device.excludeMetrics["nv_violation_below_app_clock"] { + // Total time the GPU was held below application clocks by any limiter (all of above) + violTime, ret = nvml.DeviceGetViolationStatus(device.device, nvml.PERF_POLICY_TOTAL_APP_CLOCKS) + if ret == nvml.SUCCESS { + t := float64(violTime.ViolationTime) * 1e-9 + y, err := lp.New("nv_violation_below_app_clock", device.tags, device.meta, map[string]interface{}{"value": t}, time.Now()) + if err == nil { + y.AddMeta("unit", "sec") + output <- y + } + } + } + if !device.excludeMetrics["nv_violation_below_base_clock"] { + // Total time the GPU was held below base clocks + violTime, ret = nvml.DeviceGetViolationStatus(device.device, nvml.PERF_POLICY_TOTAL_BASE_CLOCKS) + if ret == nvml.SUCCESS { + t := float64(violTime.ViolationTime) * 1e-9 + y, err := lp.New("nv_violation_below_base_clock", device.tags, device.meta, map[string]interface{}{"value": t}, time.Now()) + if err == nil { + y.AddMeta("unit", "sec") + output <- y + } + } + } + + 
return nil +} + +func readNVLinkStats(device NvidiaCollectorDevice, output chan lp.CCMetric) error { + // Retrieves the specified error counter value + // Please refer to \a nvmlNvLinkErrorCounter_t for error counters that are available + // + // For Pascal &tm; or newer fully supported devices. + + for i := 0; i < nvml.NVLINK_MAX_LINKS; i++ { + state, ret := nvml.DeviceGetNvLinkState(device.device, i) + if ret == nvml.SUCCESS { + if state == nvml.FEATURE_ENABLED { + if !device.excludeMetrics["nv_nvlink_crc_errors"] { + // Data link receive data CRC error counter + count, ret := nvml.DeviceGetNvLinkErrorCounter(device.device, i, nvml.NVLINK_ERROR_DL_CRC_DATA) + if ret == nvml.SUCCESS { + y, err := lp.New("nv_nvlink_crc_errors", device.tags, device.meta, map[string]interface{}{"value": count}, time.Now()) + if err == nil { + y.AddTag("stype", "nvlink") + y.AddTag("stype-id", fmt.Sprintf("%d", i)) + output <- y + } + } + } + if !device.excludeMetrics["nv_nvlink_ecc_errors"] { + // Data link receive data ECC error counter + count, ret := nvml.DeviceGetNvLinkErrorCounter(device.device, i, nvml.NVLINK_ERROR_DL_ECC_DATA) + if ret == nvml.SUCCESS { + y, err := lp.New("nv_nvlink_ecc_errors", device.tags, device.meta, map[string]interface{}{"value": count}, time.Now()) + if err == nil { + y.AddTag("stype", "nvlink") + y.AddTag("stype-id", fmt.Sprintf("%d", i)) + output <- y + } + } + } + if !device.excludeMetrics["nv_nvlink_replay_errors"] { + // Data link transmit replay error counter + count, ret := nvml.DeviceGetNvLinkErrorCounter(device.device, i, nvml.NVLINK_ERROR_DL_REPLAY) + if ret == nvml.SUCCESS { + y, err := lp.New("nv_nvlink_replay_errors", device.tags, device.meta, map[string]interface{}{"value": count}, time.Now()) + if err == nil { + y.AddTag("stype", "nvlink") + y.AddTag("stype-id", fmt.Sprintf("%d", i)) + output <- y + } + } + } + if !device.excludeMetrics["nv_nvlink_recovery_errors"] { + // Data link transmit recovery error counter + count, ret := 
nvml.DeviceGetNvLinkErrorCounter(device.device, i, nvml.NVLINK_ERROR_DL_RECOVERY) + if ret == nvml.SUCCESS { + y, err := lp.New("nv_nvlink_recovery_errors", device.tags, device.meta, map[string]interface{}{"value": count}, time.Now()) + if err == nil { + y.AddTag("stype", "nvlink") + y.AddTag("stype-id", fmt.Sprintf("%d", i)) + output <- y + } + } + } + if !device.excludeMetrics["nv_nvlink_crc_flit_errors"] { + // Data link receive flow control digit CRC error counter + count, ret := nvml.DeviceGetNvLinkErrorCounter(device.device, i, nvml.NVLINK_ERROR_DL_CRC_FLIT) + if ret == nvml.SUCCESS { + y, err := lp.New("nv_nvlink_crc_flit_errors", device.tags, device.meta, map[string]interface{}{"value": count}, time.Now()) + if err == nil { + y.AddTag("stype", "nvlink") + y.AddTag("stype-id", fmt.Sprintf("%d", i)) + output <- y + } + } + } + } + } + } + return nil +} + func (m *NvidiaCollector) Read(interval time.Duration, output chan lp.CCMetric) { + var err error if !m.init { return } - for i := range m.gpus { - device := &m.gpus[i] - - if !device.excludeMetrics["nv_util"] || !device.excludeMetrics["nv_mem_util"] { - // Retrieves the current utilization rates for the device's major subsystems. - // - // Available utilization rates - // * Gpu: Percent of time over the past sample period during which one or more kernels was executing on the GPU. - // * Memory: Percent of time over the past sample period during which global (device) memory was being read or written - // - // Note: - // * During driver initialization when ECC is enabled one can see high GPU and Memory Utilization readings. - // This is caused by ECC Memory Scrubbing mechanism that is performed during driver initialization. - // * On MIG-enabled GPUs, querying device utilization rates is not currently supported. 
- util, ret := nvml.DeviceGetUtilizationRates(device.device) - if ret == nvml.SUCCESS { - if !device.excludeMetrics["nv_util"] { - y, err := lp.New("nv_util", device.tags, m.meta, map[string]interface{}{"value": float64(util.Gpu)}, time.Now()) - if err == nil { - y.AddMeta("unit", "%") - output <- y - } - } - if !device.excludeMetrics["nv_mem_util"] { - y, err := lp.New("nv_mem_util", device.tags, m.meta, map[string]interface{}{"value": float64(util.Memory)}, time.Now()) - if err == nil { - y.AddMeta("unit", "%") - output <- y - } - } - } + readAll := func(device NvidiaCollectorDevice, output chan lp.CCMetric) { + name, ret := nvml.DeviceGetName(device.device) + if ret != nvml.SUCCESS { + name = "NoName" + } + err = readMemoryInfo(device, output) + if err != nil { + cclog.ComponentDebug(m.name, "readMemoryInfo for device", name, "failed") } - if !device.excludeMetrics["nv_mem_total"] || !device.excludeMetrics["nv_fb_memory"] { - // Retrieves the amount of used, free and total memory available on the device, in bytes. - // - // Enabling ECC reduces the amount of total available memory, due to the extra required parity bits. - // - // The reported amount of used memory is equal to the sum of memory allocated by all active channels on the device. - // - // Available memory info: - // * Free: Unallocated FB memory (in bytes). - // * Total: Total installed FB memory (in bytes). - // * Used: Allocated FB memory (in bytes). Note that the driver/GPU always sets aside a small amount of memory for bookkeeping. - // - // Note: - // In MIG mode, if device handle is provided, the API returns aggregate information, only if the caller has appropriate privileges. - // Per-instance information can be queried by using specific MIG device handles. 
- meminfo, ret := nvml.DeviceGetMemoryInfo(device.device) - if ret == nvml.SUCCESS { - if !device.excludeMetrics["nv_mem_total"] { - t := float64(meminfo.Total) / (1024 * 1024) - y, err := lp.New("nv_mem_total", device.tags, m.meta, map[string]interface{}{"value": t}, time.Now()) - if err == nil { - y.AddMeta("unit", "MByte") - output <- y - } - } - - if !device.excludeMetrics["nv_fb_memory"] { - f := float64(meminfo.Used) / (1024 * 1024) - y, err := lp.New("nv_fb_memory", device.tags, m.meta, map[string]interface{}{"value": f}, time.Now()) - if err == nil { - y.AddMeta("unit", "MByte") - output <- y - } - } - } + err = readUtilization(device, output) + if err != nil { + cclog.ComponentDebug(m.name, "readUtilization for device", name, "failed") } - if !device.excludeMetrics["nv_temp"] { - // Retrieves the current temperature readings for the device, in degrees C. - // - // Available temperature sensors: - // * TEMPERATURE_GPU: Temperature sensor for the GPU die. - // * NVML_TEMPERATURE_COUNT - temp, ret := nvml.DeviceGetTemperature(device.device, nvml.TEMPERATURE_GPU) - if ret == nvml.SUCCESS { - y, err := lp.New("nv_temp", device.tags, m.meta, map[string]interface{}{"value": float64(temp)}, time.Now()) - if err == nil { - y.AddMeta("unit", "degC") - output <- y - } - } + err = readTemp(device, output) + if err != nil { + cclog.ComponentDebug(m.name, "readTemp for device", name, "failed") } - if !device.excludeMetrics["nv_fan"] { - // Retrieves the intended operating speed of the device's fan. - // - // Note: The reported speed is the intended fan speed. - // If the fan is physically blocked and unable to spin, the output will not match the actual fan speed. - // - // For all discrete products with dedicated fans. - // - // The fan speed is expressed as a percentage of the product's maximum noise tolerance fan speed. - // This value may exceed 100% in certain cases. 
- fan, ret := nvml.DeviceGetFanSpeed(device.device) - if ret == nvml.SUCCESS { - y, err := lp.New("nv_fan", device.tags, m.meta, map[string]interface{}{"value": float64(fan)}, time.Now()) - if err == nil { - y.AddMeta("unit", "%") - output <- y - } - } + err = readFan(device, output) + if err != nil { + cclog.ComponentDebug(m.name, "readFan for device", name, "failed") } - if !device.excludeMetrics["nv_ecc_mode"] { - // Retrieves the current and pending ECC modes for the device. - // - // For Fermi or newer fully supported devices. Only applicable to devices with ECC. - // Requires NVML_INFOROM_ECC version 1.0 or higher. - // - // Changing ECC modes requires a reboot. - // The "pending" ECC mode refers to the target mode following the next reboot. - _, ecc_pend, ret := nvml.DeviceGetEccMode(device.device) - if ret == nvml.SUCCESS { - var y lp.CCMetric - var err error - switch ecc_pend { - case nvml.FEATURE_DISABLED: - y, err = lp.New("nv_ecc_mode", device.tags, m.meta, map[string]interface{}{"value": "OFF"}, time.Now()) - case nvml.FEATURE_ENABLED: - y, err = lp.New("nv_ecc_mode", device.tags, m.meta, map[string]interface{}{"value": "ON"}, time.Now()) - default: - y, err = lp.New("nv_ecc_mode", device.tags, m.meta, map[string]interface{}{"value": "UNKNOWN"}, time.Now()) - } - if err == nil { - output <- y - } - } else if ret == nvml.ERROR_NOT_SUPPORTED { - y, err := lp.New("nv_ecc_mode", device.tags, m.meta, map[string]interface{}{"value": "N/A"}, time.Now()) - if err == nil { - output <- y - } - } + err = readEccMode(device, output) + if err != nil { + cclog.ComponentDebug(m.name, "readEccMode for device", name, "failed") } - if !device.excludeMetrics["nv_perf_state"] { - // Retrieves the current performance state for the device. - // - // Allowed PStates: - // 0: Maximum Performance. - // .. - // 15: Minimum Performance. - // 32: Unknown performance state. 
- pState, ret := nvml.DeviceGetPerformanceState(device.device) - if ret == nvml.SUCCESS { - y, err := lp.New("nv_perf_state", device.tags, m.meta, map[string]interface{}{"value": fmt.Sprintf("P%d", int(pState))}, time.Now()) - if err == nil { - output <- y - } - } + err = readPerfState(device, output) + if err != nil { + cclog.ComponentDebug(m.name, "readPerfState for device", name, "failed") } - if !device.excludeMetrics["nv_power_usage_report"] { - // Retrieves power usage for this GPU in milliwatts and its associated circuitry (e.g. memory) - // - // On Fermi and Kepler GPUs the reading is accurate to within +/- 5% of current power draw. - // - // It is only available if power management mode is supported - power, ret := nvml.DeviceGetPowerUsage(device.device) - if ret == nvml.SUCCESS { - y, err := lp.New("nv_power_usage_report", device.tags, m.meta, map[string]interface{}{"value": float64(power) / 1000}, time.Now()) - if err == nil { - y.AddMeta("unit", "watts") - output <- y - } - } + err = readPowerUsage(device, output) + if err != nil { + cclog.ComponentDebug(m.name, "readPowerUsage for device", name, "failed") } - // Retrieves the current clock speeds for the device. - // - // Available clock information: - // * CLOCK_GRAPHICS: Graphics clock domain. - // * CLOCK_SM: Streaming Multiprocessor clock domain. - // * CLOCK_MEM: Memory clock domain. 
- if !device.excludeMetrics["nv_graphics_clock_report"] { - graphicsClock, ret := nvml.DeviceGetClockInfo(device.device, nvml.CLOCK_GRAPHICS) - if ret == nvml.SUCCESS { - y, err := lp.New("nv_graphics_clock_report", device.tags, m.meta, map[string]interface{}{"value": float64(graphicsClock)}, time.Now()) - if err == nil { - y.AddMeta("unit", "MHz") - output <- y - } - } + err = readClocks(device, output) + if err != nil { + cclog.ComponentDebug(m.name, "readClocks for device", name, "failed") } - if !device.excludeMetrics["nv_sm_clock_report"] { - smCock, ret := nvml.DeviceGetClockInfo(device.device, nvml.CLOCK_SM) - if ret == nvml.SUCCESS { - y, err := lp.New("nv_sm_clock_report", device.tags, m.meta, map[string]interface{}{"value": float64(smCock)}, time.Now()) - if err == nil { - y.AddMeta("unit", "MHz") - output <- y - } - } + err = readMaxClocks(device, output) + if err != nil { + cclog.ComponentDebug(m.name, "readMaxClocks for device", name, "failed") } - if !device.excludeMetrics["nv_mem_clock_report"] { - memClock, ret := nvml.DeviceGetClockInfo(device.device, nvml.CLOCK_MEM) - if ret == nvml.SUCCESS { - y, err := lp.New("nv_mem_clock_report", device.tags, m.meta, map[string]interface{}{"value": float64(memClock)}, time.Now()) - if err == nil { - y.AddMeta("unit", "MHz") - output <- y - } - } + err = readEccErrors(device, output) + if err != nil { + cclog.ComponentDebug(m.name, "readEccErrors for device", name, "failed") } - // Retrieves the maximum clock speeds for the device. - // - // Available clock information: - // * CLOCK_GRAPHICS: Graphics clock domain. - // * CLOCK_SM: Streaming multiprocessor clock domain. - // * CLOCK_MEM: Memory clock domain. - // * CLOCK_VIDEO: Video encoder/decoder clock domain. - // * CLOCK_COUNT: Count of clock types. - // - // Note: - /// On GPUs from Fermi family current P0 clocks (reported by nvmlDeviceGetClockInfo) can differ from max clocks by few MHz. 
- if !device.excludeMetrics["nv_max_graphics_clock"] { - max_gclk, ret := nvml.DeviceGetMaxClockInfo(device.device, nvml.CLOCK_GRAPHICS) - if ret == nvml.SUCCESS { - y, err := lp.New("nv_max_graphics_clock", device.tags, m.meta, map[string]interface{}{"value": float64(max_gclk)}, time.Now()) - if err == nil { - y.AddMeta("unit", "MHz") - output <- y - } - } + err = readPowerLimit(device, output) + if err != nil { + cclog.ComponentDebug(m.name, "readPowerLimit for device", name, "failed") } - if !device.excludeMetrics["nv_max_sm_clock"] { - maxSmClock, ret := nvml.DeviceGetClockInfo(device.device, nvml.CLOCK_SM) - if ret == nvml.SUCCESS { - y, err := lp.New("nv_max_sm_clock", device.tags, m.meta, map[string]interface{}{"value": float64(maxSmClock)}, time.Now()) - if err == nil { - y.AddMeta("unit", "MHz") - output <- y - } - } + err = readEncUtilization(device, output) + if err != nil { + cclog.ComponentDebug(m.name, "readEncUtilization for device", name, "failed") } - if !device.excludeMetrics["nv_max_mem_clock"] { - maxMemClock, ret := nvml.DeviceGetClockInfo(device.device, nvml.CLOCK_MEM) - if ret == nvml.SUCCESS { - y, err := lp.New("nv_max_mem_clock", device.tags, m.meta, map[string]interface{}{"value": float64(maxMemClock)}, time.Now()) - if err == nil { - y.AddMeta("unit", "MHz") - output <- y - } - } + err = readDecUtilization(device, output) + if err != nil { + cclog.ComponentDebug(m.name, "readDecUtilization for device", name, "failed") } - if !device.excludeMetrics["nv_ecc_db_error"] { - // Retrieves the total ECC error counts for the device. - // - // For Fermi or newer fully supported devices. - // Only applicable to devices with ECC. - // Requires NVML_INFOROM_ECC version 1.0 or higher. - // Requires ECC Mode to be enabled. - // - // The total error count is the sum of errors across each of the separate memory systems, - // i.e. the total set of errors across the entire device. 
- ecc_db, ret := nvml.DeviceGetTotalEccErrors(device.device, nvml.MEMORY_ERROR_TYPE_UNCORRECTED, nvml.AGGREGATE_ECC) - if ret == nvml.SUCCESS { - y, err := lp.New("nv_ecc_db_error", device.tags, m.meta, map[string]interface{}{"value": float64(ecc_db)}, time.Now()) - if err == nil { - output <- y - } - } + err = readRemappedRows(device, output) + if err != nil { + cclog.ComponentDebug(m.name, "readRemappedRows for device", name, "failed") } - if !device.excludeMetrics["nv_ecc_sb_error"] { - ecc_sb, ret := nvml.DeviceGetTotalEccErrors(device.device, nvml.MEMORY_ERROR_TYPE_CORRECTED, nvml.AGGREGATE_ECC) - if ret == nvml.SUCCESS { - y, err := lp.New("nv_ecc_sb_error", device.tags, m.meta, map[string]interface{}{"value": float64(ecc_sb)}, time.Now()) - if err == nil { - output <- y - } - } + err = readBarMemoryInfo(device, output) + if err != nil { + cclog.ComponentDebug(m.name, "readBarMemoryInfo for device", name, "failed") } - if !device.excludeMetrics["nv_power_man_limit"] { - // Retrieves the power management limit associated with this device. - // - // For Fermi or newer fully supported devices. - // - // The power limit defines the upper boundary for the card's power draw. - // If the card's total power draw reaches this limit the power management algorithm kicks in. - pwr_limit, ret := nvml.DeviceGetPowerManagementLimit(device.device) - if ret == nvml.SUCCESS { - y, err := lp.New("nv_power_man_limit", device.tags, m.meta, map[string]interface{}{"value": float64(pwr_limit) / 1000}, time.Now()) - if err == nil { - y.AddMeta("unit", "watts") - output <- y - } - } + err = readProcessCounts(device, output) + if err != nil { + cclog.ComponentDebug(m.name, "readProcessCounts for device", name, "failed") } - if !device.excludeMetrics["nv_encoder_util"] { - // Retrieves the current utilization and sampling size in microseconds for the Encoder - // - // For Kepler or newer fully supported devices. 
- // - // Note: On MIG-enabled GPUs, querying encoder utilization is not currently supported. - enc_util, _, ret := nvml.DeviceGetEncoderUtilization(device.device) - if ret == nvml.SUCCESS { - y, err := lp.New("nv_encoder_util", device.tags, m.meta, map[string]interface{}{"value": float64(enc_util)}, time.Now()) - if err == nil { - y.AddMeta("unit", "%") - output <- y - } - } + err = readViolationStats(device, output) + if err != nil { + cclog.ComponentDebug(m.name, "readViolationStats for device", name, "failed") } - if !device.excludeMetrics["nv_decoder_util"] { - // Retrieves the current utilization and sampling size in microseconds for the Decoder - // - // For Kepler or newer fully supported devices. - // - // Note: On MIG-enabled GPUs, querying decoder utilization is not currently supported. - dec_util, _, ret := nvml.DeviceGetDecoderUtilization(device.device) - if ret == nvml.SUCCESS { - y, err := lp.New("nv_decoder_util", device.tags, m.meta, map[string]interface{}{"value": float64(dec_util)}, time.Now()) - if err == nil { - y.AddMeta("unit", "%") - output <- y - } - } + err = readNVLinkStats(device, output) + if err != nil { + cclog.ComponentDebug(m.name, "readNVLinkStats for device", name, "failed") } } + // Actual read loop over all attached Nvidia GPUs + for i := 0; i < m.num_gpus; i++ { + + readAll(m.gpus[i], output) + + // Iterate over all MIG devices if any + if m.config.ProcessMigDevices { + current, _, ret := nvml.DeviceGetMigMode(m.gpus[i].device) + if ret != nvml.SUCCESS { + continue + } + if current == nvml.DEVICE_MIG_DISABLE { + continue + } + + maxMig, ret := nvml.DeviceGetMaxMigDeviceCount(m.gpus[i].device) + if ret != nvml.SUCCESS { + continue + } + if maxMig == 0 { + continue + } + cclog.ComponentDebug(m.name, "Reading MIG devices for GPU", i) + + for j := 0; j < maxMig; j++ { + mdev, ret := nvml.DeviceGetMigDeviceHandleByIndex(m.gpus[i].device, j) + if ret != nvml.SUCCESS { + continue + } + + excludeMetrics := make(map[string]bool) + for 
_, metric := range m.config.ExcludeMetrics { + excludeMetrics[metric] = true + } + + migDevice := NvidiaCollectorDevice{ + device: mdev, + tags: map[string]string{}, + meta: map[string]string{}, + excludeMetrics: excludeMetrics, + } + for k, v := range m.gpus[i].tags { + migDevice.tags[k] = v + } + m.gpus[i].tags["stype"] = "mig" + m.gpus[i].tags["stype-id"] = fmt.Sprintf("%d", j) + for k, v := range m.gpus[i].meta { + migDevice.meta[k] = v + } + if _, ok := migDevice.meta["uuid"]; ok { + uuid, ret := nvml.DeviceGetUUID(mdev) + if ret == nvml.SUCCESS { + migDevice.meta["uuid"] = uuid + } + } + + readAll(migDevice, output) + } + } + } } func (m *NvidiaCollector) Close() { diff --git a/collectors/nvidiaMetric.md b/collectors/nvidiaMetric.md index afe8b9e..8cfff32 100644 --- a/collectors/nvidiaMetric.md +++ b/collectors/nvidiaMetric.md @@ -3,38 +3,72 @@ ```json "nvidia": { - "exclude_devices" : [ - "0","1" + "exclude_devices": [ + "0","1", "0000000:ff:01.0" ], "exclude_metrics": [ - "nv_fb_memory", + "nv_fb_mem_used", "nv_fan" - ] + ], + "process_mig_devices": false, + "use_pci_info_as_type_id": true, + "add_pci_info_tag": false, + "add_uuid_meta": false, + "add_board_number_meta": false, + "add_serial_meta": false } ``` +The `nvidia` collector can be configured to leave out specific devices with the `exclude_devices` option. It takes IDs as supplied to the NVML with `nvmlDeviceGetHandleByIndex()` or the PCI address in NVML format (`%08X:%02X:%02X.0`). Metrics (listed below) that should not be sent to the MetricRouter can be excluded with the `exclude_metrics` option. Commonly only the physical GPUs are monitored. If MIG devices should be analyzed as well, set `process_mig_devices` (adds `stype=mig,stype-id=`). + +The metrics sent by the `nvidia` collector use `accelerator` as `type` tag. For the `type-id`, it uses the device handle index by default. With the `use_pci_info_as_type_id` option, the PCI ID is used instead. 
If both values should be added as tags, activate the `add_pci_info_tag` option. It uses the device handle index as `type-id` and adds the PCI ID as a separate `pci_identifier` tag. + +Optionally, it is possible to add the UUID, the board part number and the serial number to the meta information. Meta information is not sent to the sinks unless configured otherwise. + + Metrics: * `nv_util` * `nv_mem_util` -* `nv_mem_total` -* `nv_fb_memory` +* `nv_fb_mem_total` +* `nv_fb_mem_used` +* `nv_bar1_mem_total` +* `nv_bar1_mem_used` * `nv_temp` * `nv_fan` * `nv_ecc_mode` * `nv_perf_state` -* `nv_power_usage_report` -* `nv_graphics_clock_report` -* `nv_sm_clock_report` -* `nv_mem_clock_report` +* `nv_power_usage` +* `nv_graphics_clock` +* `nv_sm_clock` +* `nv_mem_clock` +* `nv_video_clock` * `nv_max_graphics_clock` * `nv_max_sm_clock` * `nv_max_mem_clock` -* `nv_ecc_db_error` -* `nv_ecc_sb_error` -* `nv_power_man_limit` +* `nv_max_video_clock` +* `nv_ecc_uncorrected_error` +* `nv_ecc_corrected_error` +* `nv_power_max_limit` * `nv_encoder_util` * `nv_decoder_util` +* `nv_remapped_rows_corrected` +* `nv_remapped_rows_uncorrected` +* `nv_remapped_rows_pending` +* `nv_remapped_rows_failure` +* `nv_compute_processes` +* `nv_graphics_processes` +* `nv_violation_power` +* `nv_violation_thermal` +* `nv_violation_sync_boost` +* `nv_violation_board_limit` +* `nv_violation_low_util` +* `nv_violation_reliability` +* `nv_violation_below_app_clock` +* `nv_violation_below_base_clock` +* `nv_nvlink_crc_flit_errors` +* `nv_nvlink_crc_errors` +* `nv_nvlink_ecc_errors` +* `nv_nvlink_replay_errors` +* `nv_nvlink_recovery_errors` -It uses a separate `type` in the metrics. The output metric looks like this: -`,type=accelerator,type-id= value= ` - +Some metrics carry an additional sub-type tag (`stype`); for example, the `nv_nvlink_*` metrics set `stype=nvlink,stype-id=<link index>`. \ No newline at end of file