From 01faa3b5317814dab94432fd9dd5d0647ba6d481 Mon Sep 17 00:00:00 2001 From: Holger Obermaier <40787752+ho-ob@users.noreply.github.com> Date: Tue, 15 Feb 2022 10:57:32 +0100 Subject: [PATCH] Add comments and units to all nvidia metrics --- collectors.json | 31 ++++---- collectors/nvidiaMetric.go | 150 ++++++++++++++++++++++++++++++++----- 2 files changed, 149 insertions(+), 32 deletions(-) diff --git a/collectors.json b/collectors.json index 09731ab..cbfc23d 100644 --- a/collectors.json +++ b/collectors.json @@ -1,23 +1,28 @@ { - "numastats": {}, "cpufreq": {}, "cpufreq_cpuinfo": {}, "gpfs": { - "exclude_filesystem": [ "test_fs" ] + "exclude_filesystem": [ + "test_fs" + ] }, "loadavg": { - "exclude_metrics": [ "proc_total" ] + "exclude_metrics": [ + "proc_total" + ] }, + "numastats": {}, + "nvidia": {}, "tempstat": { - "tag_override": { - "hwmon0" : { - "type" : "socket", - "type-id" : "0" - }, - "hwmon1" : { - "type" : "socket", - "type-id" : "1" + "tag_override": { + "hwmon0": { + "type": "socket", + "type-id": "0" + }, + "hwmon1": { + "type": "socket", + "type-id": "1" + } } - } } -} +} \ No newline at end of file diff --git a/collectors/nvidiaMetric.go b/collectors/nvidiaMetric.go index 27b921a..24f0855 100644 --- a/collectors/nvidiaMetric.go +++ b/collectors/nvidiaMetric.go @@ -134,17 +134,29 @@ func (m *NvidiaCollector) Read(interval time.Duration, output chan lp.CCMetric) device := &m.gpus[i] if !device.excludeMetrics["nv_util"] || !device.excludeMetrics["nv_mem_util"] { + // Retrieves the current utilization rates for the device's major subsystems. + // + // Available utilization rates + // * Gpu: Percent of time over the past sample period during which one or more kernels was executing on the GPU. + // * Memory: Percent of time over the past sample period during which global (device) memory was being read or written + // + // Note: + // * During driver initialization when ECC is enabled one can see high GPU and Memory Utilization readings. 
+ // This is caused by the ECC Memory Scrubbing mechanism that is performed during driver initialization. + // * On MIG-enabled GPUs, querying device utilization rates is not currently supported. util, ret := nvml.DeviceGetUtilizationRates(device.device) if ret == nvml.SUCCESS { if !device.excludeMetrics["nv_util"] { y, err := lp.New("nv_util", device.tags, m.meta, map[string]interface{}{"value": float64(util.Gpu)}, time.Now()) if err == nil { + y.AddMeta("unit", "%") output <- y } } if !device.excludeMetrics["nv_mem_util"] { y, err := lp.New("nv_mem_util", device.tags, m.meta, map[string]interface{}{"value": float64(util.Memory)}, time.Now()) if err == nil { + y.AddMeta("unit", "%") output <- y } } @@ -152,6 +164,20 @@ func (m *NvidiaCollector) Read(interval time.Duration, output chan lp.CCMetric) } if !device.excludeMetrics["nv_mem_total"] || !device.excludeMetrics["nv_fb_memory"] { + // Retrieves the amount of used, free and total memory available on the device, in bytes. + // + // Enabling ECC reduces the amount of total available memory, due to the extra required parity bits. + // + // The reported amount of used memory is equal to the sum of memory allocated by all active channels on the device. + // + // Available memory info: + // * Free: Unallocated FB memory (in bytes). + // * Total: Total installed FB memory (in bytes). + // * Used: Allocated FB memory (in bytes). Note that the driver/GPU always sets aside a small amount of memory for bookkeeping. + // + // Note: + // In MIG mode, if a device handle is provided, the API returns aggregate information, only if the caller has appropriate privileges. + // Per-instance information can be queried by using specific MIG device handles. 
meminfo, ret := nvml.DeviceGetMemoryInfo(device.device) if ret == nvml.SUCCESS { if !device.excludeMetrics["nv_mem_total"] { @@ -175,6 +201,11 @@ func (m *NvidiaCollector) Read(interval time.Duration, output chan lp.CCMetric) } if !device.excludeMetrics["nv_temp"] { + // Retrieves the current temperature readings for the device, in degrees C. + // + // Available temperature sensors: + // * TEMPERATURE_GPU: Temperature sensor for the GPU die. + // * NVML_TEMPERATURE_COUNT temp, ret := nvml.DeviceGetTemperature(device.device, nvml.TEMPERATURE_GPU) if ret == nvml.SUCCESS { y, err := lp.New("nv_temp", device.tags, m.meta, map[string]interface{}{"value": float64(temp)}, time.Now()) @@ -186,33 +217,50 @@ func (m *NvidiaCollector) Read(interval time.Duration, output chan lp.CCMetric) } if !device.excludeMetrics["nv_fan"] { + // Retrieves the intended operating speed of the device's fan. + // + // Note: The reported speed is the intended fan speed. + // If the fan is physically blocked and unable to spin, the output will not match the actual fan speed. + // + // For all discrete products with dedicated fans. + // + // The fan speed is expressed as a percentage of the product's maximum noise tolerance fan speed. + // This value may exceed 100% in certain cases. fan, ret := nvml.DeviceGetFanSpeed(device.device) if ret == nvml.SUCCESS { y, err := lp.New("nv_fan", device.tags, m.meta, map[string]interface{}{"value": float64(fan)}, time.Now()) if err == nil { + y.AddMeta("unit", "%") output <- y } } } if !device.excludeMetrics["nv_ecc_mode"] { + // Retrieves the current and pending ECC modes for the device. + // + // For Fermi or newer fully supported devices. Only applicable to devices with ECC. + // Requires NVML_INFOROM_ECC version 1.0 or higher. + // + // Changing ECC modes requires a reboot. + // The "pending" ECC mode refers to the target mode following the next reboot. 
_, ecc_pend, ret := nvml.DeviceGetEccMode(device.device) if ret == nvml.SUCCESS { var y lp.CCMetric var err error switch ecc_pend { case nvml.FEATURE_DISABLED: - y, err = lp.New("nv_ecc_mode", device.tags, m.meta, map[string]interface{}{"value": string("OFF")}, time.Now()) + y, err = lp.New("nv_ecc_mode", device.tags, m.meta, map[string]interface{}{"value": "OFF"}, time.Now()) case nvml.FEATURE_ENABLED: - y, err = lp.New("nv_ecc_mode", device.tags, m.meta, map[string]interface{}{"value": string("ON")}, time.Now()) + y, err = lp.New("nv_ecc_mode", device.tags, m.meta, map[string]interface{}{"value": "ON"}, time.Now()) default: - y, err = lp.New("nv_ecc_mode", device.tags, m.meta, map[string]interface{}{"value": string("UNKNOWN")}, time.Now()) + y, err = lp.New("nv_ecc_mode", device.tags, m.meta, map[string]interface{}{"value": "UNKNOWN"}, time.Now()) } if err == nil { output <- y } } else if ret == nvml.ERROR_NOT_SUPPORTED { - y, err := lp.New("nv_ecc_mode", device.tags, m.meta, map[string]interface{}{"value": string("N/A")}, time.Now()) + y, err := lp.New("nv_ecc_mode", device.tags, m.meta, map[string]interface{}{"value": "N/A"}, time.Now()) if err == nil { output <- y } @@ -220,9 +268,16 @@ func (m *NvidiaCollector) Read(interval time.Duration, output chan lp.CCMetric) } if !device.excludeMetrics["nv_perf_state"] { - pstate, ret := nvml.DeviceGetPerformanceState(device.device) + // Retrieves the current performance state for the device. + // + // Allowed PStates: + // 0: Maximum Performance. + // .. + // 15: Minimum Performance. + // 32: Unknown performance state. 
+ pState, ret := nvml.DeviceGetPerformanceState(device.device) if ret == nvml.SUCCESS { - y, err := lp.New("nv_perf_state", device.tags, m.meta, map[string]interface{}{"value": fmt.Sprintf("P%d", int(pstate))}, time.Now()) + y, err := lp.New("nv_perf_state", device.tags, m.meta, map[string]interface{}{"value": fmt.Sprintf("P%d", int(pState))}, time.Now()) if err == nil { output <- y } @@ -230,77 +285,115 @@ func (m *NvidiaCollector) Read(interval time.Duration, output chan lp.CCMetric) } if !device.excludeMetrics["nv_power_usage_report"] { + // Retrieves the power usage, in milliwatts, of this GPU and its associated circuitry (e.g. memory). + // + // On Fermi and Kepler GPUs the reading is accurate to within +/- 5% of current power draw. + // + // It is only available if power management mode is supported. power, ret := nvml.DeviceGetPowerUsage(device.device) if ret == nvml.SUCCESS { y, err := lp.New("nv_power_usage_report", device.tags, m.meta, map[string]interface{}{"value": float64(power) / 1000}, time.Now()) if err == nil { + y.AddMeta("unit", "watts") output <- y } } } + // Retrieves the current clock speeds for the device. + // + // Available clock information: + // * CLOCK_GRAPHICS: Graphics clock domain. + // * CLOCK_SM: Streaming Multiprocessor clock domain. + // * CLOCK_MEM: Memory clock domain. 
if !device.excludeMetrics["nv_graphics_clock_report"] { - gclk, ret := nvml.DeviceGetClockInfo(device.device, nvml.CLOCK_GRAPHICS) + graphicsClock, ret := nvml.DeviceGetClockInfo(device.device, nvml.CLOCK_GRAPHICS) if ret == nvml.SUCCESS { - y, err := lp.New("nv_graphics_clock_report", device.tags, m.meta, map[string]interface{}{"value": float64(gclk)}, time.Now()) + y, err := lp.New("nv_graphics_clock_report", device.tags, m.meta, map[string]interface{}{"value": float64(graphicsClock)}, time.Now()) if err == nil { + y.AddMeta("unit", "MHz") output <- y } } } if !device.excludeMetrics["nv_sm_clock_report"] { - smclk, ret := nvml.DeviceGetClockInfo(device.device, nvml.CLOCK_SM) + smCock, ret := nvml.DeviceGetClockInfo(device.device, nvml.CLOCK_SM) if ret == nvml.SUCCESS { - y, err := lp.New("nv_sm_clock_report", device.tags, m.meta, map[string]interface{}{"value": float64(smclk)}, time.Now()) + y, err := lp.New("nv_sm_clock_report", device.tags, m.meta, map[string]interface{}{"value": float64(smCock)}, time.Now()) if err == nil { + y.AddMeta("unit", "MHz") output <- y } } } if !device.excludeMetrics["nv_mem_clock_report"] { - memclk, ret := nvml.DeviceGetClockInfo(device.device, nvml.CLOCK_MEM) + memClock, ret := nvml.DeviceGetClockInfo(device.device, nvml.CLOCK_MEM) if ret == nvml.SUCCESS { - y, err := lp.New("nv_mem_clock_report", device.tags, m.meta, map[string]interface{}{"value": float64(memclk)}, time.Now()) + y, err := lp.New("nv_mem_clock_report", device.tags, m.meta, map[string]interface{}{"value": float64(memClock)}, time.Now()) if err == nil { + y.AddMeta("unit", "MHz") output <- y } } } + // Retrieves the maximum clock speeds for the device. + // + // Available clock information: + // * CLOCK_GRAPHICS: Graphics clock domain. + // * CLOCK_SM: Streaming multiprocessor clock domain. + // * CLOCK_MEM: Memory clock domain. + // * CLOCK_VIDEO: Video encoder/decoder clock domain. + // * CLOCK_COUNT: Count of clock types. 
+ // + // Note: + // On GPUs from the Fermi family, current P0 clocks (reported by nvmlDeviceGetClockInfo) can differ from max clocks by a few MHz. if !device.excludeMetrics["nv_max_graphics_clock"] { max_gclk, ret := nvml.DeviceGetMaxClockInfo(device.device, nvml.CLOCK_GRAPHICS) if ret == nvml.SUCCESS { y, err := lp.New("nv_max_graphics_clock", device.tags, m.meta, map[string]interface{}{"value": float64(max_gclk)}, time.Now()) if err == nil { + y.AddMeta("unit", "MHz") output <- y } } } if !device.excludeMetrics["nv_max_sm_clock"] { - max_smclk, ret := nvml.DeviceGetClockInfo(device.device, nvml.CLOCK_SM) + maxSmClock, ret := nvml.DeviceGetClockInfo(device.device, nvml.CLOCK_SM) if ret == nvml.SUCCESS { - y, err := lp.New("nv_max_sm_clock", device.tags, m.meta, map[string]interface{}{"value": float64(max_smclk)}, time.Now()) + y, err := lp.New("nv_max_sm_clock", device.tags, m.meta, map[string]interface{}{"value": float64(maxSmClock)}, time.Now()) if err == nil { + y.AddMeta("unit", "MHz") output <- y } } } if !device.excludeMetrics["nv_max_mem_clock"] { - max_memclk, ret := nvml.DeviceGetClockInfo(device.device, nvml.CLOCK_MEM) + maxMemClock, ret := nvml.DeviceGetClockInfo(device.device, nvml.CLOCK_MEM) if ret == nvml.SUCCESS { - y, err := lp.New("nv_max_mem_clock", device.tags, m.meta, map[string]interface{}{"value": float64(max_memclk)}, time.Now()) + y, err := lp.New("nv_max_mem_clock", device.tags, m.meta, map[string]interface{}{"value": float64(maxMemClock)}, time.Now()) if err == nil { + y.AddMeta("unit", "MHz") output <- y } } } if !device.excludeMetrics["nv_ecc_db_error"] { - ecc_db, ret := nvml.DeviceGetTotalEccErrors(device.device, 1, 1) + // Retrieves the total ECC error counts for the device. + // + // For Fermi or newer fully supported devices. + // Only applicable to devices with ECC. + // Requires NVML_INFOROM_ECC version 1.0 or higher. + // Requires ECC Mode to be enabled. 
+ // + // The total error count is the sum of errors across each of the separate memory systems, + // i.e. the total set of errors across the entire device. + ecc_db, ret := nvml.DeviceGetTotalEccErrors(device.device, nvml.MEMORY_ERROR_TYPE_UNCORRECTED, nvml.AGGREGATE_ECC) if ret == nvml.SUCCESS { y, err := lp.New("nv_ecc_db_error", device.tags, m.meta, map[string]interface{}{"value": float64(ecc_db)}, time.Now()) if err == nil { @@ -310,7 +403,7 @@ func (m *NvidiaCollector) Read(interval time.Duration, output chan lp.CCMetric) } if !device.excludeMetrics["nv_ecc_sb_error"] { - ecc_sb, ret := nvml.DeviceGetTotalEccErrors(device.device, 0, 1) + ecc_sb, ret := nvml.DeviceGetTotalEccErrors(device.device, nvml.MEMORY_ERROR_TYPE_CORRECTED, nvml.AGGREGATE_ECC) if ret == nvml.SUCCESS { y, err := lp.New("nv_ecc_sb_error", device.tags, m.meta, map[string]interface{}{"value": float64(ecc_sb)}, time.Now()) if err == nil { @@ -320,30 +413,49 @@ func (m *NvidiaCollector) Read(interval time.Duration, output chan lp.CCMetric) } if !device.excludeMetrics["nv_power_man_limit"] { + // Retrieves the power management limit associated with this device. + // + // For Fermi or newer fully supported devices. + // + // The power limit defines the upper boundary for the card's power draw. + // If the card's total power draw reaches this limit the power management algorithm kicks in. pwr_limit, ret := nvml.DeviceGetPowerManagementLimit(device.device) if ret == nvml.SUCCESS { - y, err := lp.New("nv_power_man_limit", device.tags, m.meta, map[string]interface{}{"value": float64(pwr_limit)}, time.Now()) + y, err := lp.New("nv_power_man_limit", device.tags, m.meta, map[string]interface{}{"value": float64(pwr_limit) / 1000}, time.Now()) if err == nil { + y.AddMeta("unit", "watts") output <- y } } } if !device.excludeMetrics["nv_encoder_util"] { + // Retrieves the current utilization and sampling size in microseconds for the Encoder + // + // For Kepler or newer fully supported devices. 
+ // + // Note: On MIG-enabled GPUs, querying encoder utilization is not currently supported. enc_util, _, ret := nvml.DeviceGetEncoderUtilization(device.device) if ret == nvml.SUCCESS { y, err := lp.New("nv_encoder_util", device.tags, m.meta, map[string]interface{}{"value": float64(enc_util)}, time.Now()) if err == nil { + y.AddMeta("unit", "%") output <- y } } } if !device.excludeMetrics["nv_decoder_util"] { + // Retrieves the current utilization and sampling size in microseconds for the Decoder + // + // For Kepler or newer fully supported devices. + // + // Note: On MIG-enabled GPUs, querying decoder utilization is not currently supported. dec_util, _, ret := nvml.DeviceGetDecoderUtilization(device.device) if ret == nvml.SUCCESS { y, err := lp.New("nv_decoder_util", device.tags, m.meta, map[string]interface{}{"value": float64(dec_util)}, time.Now()) if err == nil { + y.AddMeta("unit", "%") output <- y } }