1274 lines
44 KiB
Go

package collectors
import (
"encoding/json"
"errors"
"fmt"
"log"
"strings"
"time"
lp "github.com/ClusterCockpit/cc-lib/ccMessage"
cclog "github.com/ClusterCockpit/cc-metric-collector/pkg/ccLogger"
"github.com/NVIDIA/go-nvml/pkg/nvml"
)
// NvidiaCollectorConfig holds the user-supplied JSON configuration of the
// NVIDIA collector. Every option is optional in the JSON input.
type NvidiaCollectorConfig struct {
	// ExcludeMetrics lists metric names that are not sent.
	ExcludeMetrics []string `json:"exclude_metrics,omitempty"`
	// OnlyMetrics, when non-empty, is an allow list of metric names;
	// it takes precedence over ExcludeMetrics.
	OnlyMetrics []string `json:"only_metrics,omitempty"`
	// ExcludeDevices lists devices to skip, either by NVML index
	// (as decimal string) or by PCI ID.
	ExcludeDevices []string `json:"exclude_devices,omitempty"`

	// AddPciInfoTag adds the PCI ID as tag 'pci_identifier' unless the PCI ID
	// is already used as 'type-id'.
	AddPciInfoTag bool `json:"add_pci_info_tag,omitempty"`
	// UsePciInfoAsTypeId uses the PCI ID instead of the NVML index as 'type-id' tag.
	UsePciInfoAsTypeId bool `json:"use_pci_info_as_type_id,omitempty"`

	// AddUuidMeta adds the device UUID as meta information 'uuid'.
	AddUuidMeta bool `json:"add_uuid_meta,omitempty"`
	// AddBoardNumberMeta adds the board part number as meta information 'board_number'.
	AddBoardNumberMeta bool `json:"add_board_number_meta,omitempty"`
	// AddSerialMeta adds the serial number as meta information 'serial'.
	AddSerialMeta bool `json:"add_serial_meta,omitempty"`

	// MIG related options.
	// NOTE(review): their handling is not visible in this part of the file.
	// Mind the singular 'device' in the last two JSON keys.
	ProcessMigDevices     bool `json:"process_mig_devices,omitempty"`
	UseUuidForMigDevices  bool `json:"use_uuid_for_mig_device,omitempty"`
	UseSliceForMigDevices bool `json:"use_slice_for_mig_device,omitempty"`

	// UseMemoryInfoV2 prefers the v2 memory query, which additionally
	// provides the reserved frame buffer memory.
	UseMemoryInfoV2 bool `json:"use_memory_info_v2,omitempty"`
	// SendDiffValues additionally sends '*_diff' metrics holding the change
	// of a counter since the previous read.
	SendDiffValues bool `json:"send_diff_values,omitempty"`
}
// NvidiaCollectorDevice bundles everything needed to read and publish the
// metrics of a single NVML device.
type NvidiaCollectorDevice struct {
	// device is the NVML handle used for all queries.
	device nvml.Device
	// excludeMetrics is the set of metric names that must not be sent.
	excludeMetrics map[string]bool
	// tags and meta are attached to every message sent for this device.
	tags map[string]string
	meta map[string]string
	// config is a copy of the collector configuration.
	config NvidiaCollectorConfig
}
// NvidiaCollector collects metrics of NVIDIA GPUs through NVML.
type NvidiaCollector struct {
	metricCollector

	// config is the parsed user configuration.
	config NvidiaCollectorConfig

	// gpus is allocated with one slot per device reported by NVML;
	// only the first num_gpus entries are populated.
	gpus     []NvidiaCollectorDevice
	num_gpus int

	// Counter values of the previous read, used to compute '*_diff' metrics.
	// NOTE(review): the maps are presumably keyed by a device ID — confirm in Read.
	prevEccStats       map[string]*eccStats
	prevRemappedStats  map[string]*remappedRowsStats
	prevNVLinkStats    map[string]*nvlinkStats
	prevViolationStats map[string]*violationStats
}
// CatchPanic recovers from a panic, logs the panic value and marks the
// collector as not initialized. It only has an effect when it is itself the
// deferred function, e.g. `defer m.CatchPanic()`.
func (m *NvidiaCollector) CatchPanic() {
	rerr := recover()
	if rerr == nil {
		// No panic in flight, nothing to do
		return
	}
	log.Print(rerr)
	m.init = false
}
// shouldOutput reports whether the given metric is to be sent for this device.
//
// A non-empty OnlyMetrics list acts as allow list and takes precedence:
// the metric is sent only if it is listed there. Otherwise the metric is
// sent unless it is part of the excluded metrics.
func (d *NvidiaCollectorDevice) shouldOutput(metric string) bool {
	allowList := d.config.OnlyMetrics
	if len(allowList) == 0 {
		return !d.excludeMetrics[metric]
	}
	for i := range allowList {
		if allowList[i] == metric {
			return true
		}
	}
	return false
}
// eccStats stores the ECC error counters of the previous read.
type eccStats struct {
	uncorrected uint64 // uncorrected ECC errors
	corrected   uint64 // corrected ECC errors
}
// remappedRowsStats stores the remapped rows information of the previous read.
type remappedRowsStats struct {
	corrected   int // rows remapped due to corrected errors
	uncorrected int // rows remapped due to uncorrected errors
	pending     int // 1 if a remapping is pending, else 0
	failure     int // 1 if a remapping failure occurred, else 0
}
// violationStats stores the violation times (in seconds) of the previous
// read, one value per performance policy.
type violationStats struct {
	power          float64
	thermal        float64
	syncBoost      float64
	boardLimit     float64
	lowUtil        float64
	reliability    float64
	belowAppClock  float64
	belowBaseClock float64
}
// nvlinkStats stores the NVLink error counters of the previous read.
type nvlinkStats struct {
	// Counters per NVLink, indexed by link number
	crcErrors      [nvml.NVLINK_MAX_LINKS]uint64
	eccErrors      [nvml.NVLINK_MAX_LINKS]uint64
	replayErrors   [nvml.NVLINK_MAX_LINKS]uint64
	recoveryErrors [nvml.NVLINK_MAX_LINKS]uint64
	crcFlitErrors  [nvml.NVLINK_MAX_LINKS]uint64

	// Aggregated values used for the '_sum_diff' metrics
	aggregateCrcErrors      uint64
	aggregateEccErrors      uint64
	aggregateReplayErrors   uint64
	aggregateRecoveryErrors uint64
	aggregateCrcFlitErrors  uint64
}
// Init configures the collector and enumerates the NVIDIA GPUs to monitor.
//
// It resets the boolean defaults, allocates the maps holding the previous
// counter values, parses the optional JSON configuration and initializes
// NVML. Every GPU reported by NVML that is not listed in ExcludeDevices
// (by index or by PCI ID) gets an entry in m.gpus holding its handle, tags,
// meta information and excluded metrics.
//
// An error is returned if the configuration cannot be parsed, NVML cannot be
// initialized or the device count cannot be read. Failures affecting a single
// device are logged; the device is skipped if its handle or PCI info cannot
// be read, whereas missing optional meta information is only logged.
// A panic raised after the configuration was parsed is recovered by
// CatchPanic, which clears m.init; as the result is unnamed, Init returns nil
// in that case.
func (m *NvidiaCollector) Init(config json.RawMessage) error {
	var err error
	m.name = "NvidiaCollector"

	// Defaults, may be overwritten by the JSON configuration
	m.config.AddPciInfoTag = false
	m.config.UsePciInfoAsTypeId = false
	m.config.ProcessMigDevices = false
	m.config.UseUuidForMigDevices = false
	m.config.UseSliceForMigDevices = false

	// Storage for previous counter values (used for '*_diff' metrics)
	m.prevEccStats = make(map[string]*eccStats)
	m.prevRemappedStats = make(map[string]*remappedRowsStats)
	m.prevViolationStats = make(map[string]*violationStats)
	m.prevNVLinkStats = make(map[string]*nvlinkStats)

	m.setup()
	if len(config) > 0 {
		err = json.Unmarshal(config, &m.config)
		if err != nil {
			return err
		}
	}
	m.meta = map[string]string{
		"source": m.name,
		"group":  "Nvidia",
	}

	defer m.CatchPanic()

	// Initialize NVIDIA Management Library (NVML)
	ret := nvml.Init()

	// Error: NVML library not found
	// (nvml.ErrorString can not be used in this case)
	if ret == nvml.ERROR_LIBRARY_NOT_FOUND {
		err = fmt.Errorf("NVML library not found")
		cclog.ComponentError(m.name, err.Error())
		return err
	}
	if ret != nvml.SUCCESS {
		err = errors.New(nvml.ErrorString(ret))
		cclog.ComponentError(m.name, "Unable to initialize NVML", err.Error())
		return err
	}

	// Number of NVIDIA GPUs
	num_gpus, ret := nvml.DeviceGetCount()
	if ret != nvml.SUCCESS {
		err = errors.New(nvml.ErrorString(ret))
		cclog.ComponentError(m.name, "Unable to get device count", err.Error())
		return err
	}

	// For all GPUs
	idx := 0
	m.gpus = make([]NvidiaCollectorDevice, num_gpus)
	for i := 0; i < num_gpus; i++ {

		// Skip excluded devices by ID
		str_i := fmt.Sprintf("%d", i)
		if _, skip := stringArrayContains(m.config.ExcludeDevices, str_i); skip {
			cclog.ComponentDebug(m.name, "Skipping excluded device", str_i)
			continue
		}

		// Get device handle
		device, ret := nvml.DeviceGetHandleByIndex(i)
		if ret != nvml.SUCCESS {
			err = errors.New(nvml.ErrorString(ret))
			cclog.ComponentError(m.name, "Unable to get device at index", i, ":", err.Error())
			continue
		}

		// Get device's PCI info
		pciInfo, ret := nvml.DeviceGetPciInfo(device)
		if ret != nvml.SUCCESS {
			err = errors.New(nvml.ErrorString(ret))
			cclog.ComponentError(m.name, "Unable to get PCI info for device at index", i, ":", err.Error())
			continue
		}

		// Create PCI ID in the common format used by the NVML.
		pci_id := fmt.Sprintf(
			nvml.DEVICE_PCI_BUS_ID_FMT,
			pciInfo.Domain,
			pciInfo.Bus,
			pciInfo.Device)

		// Skip excluded devices specified by PCI ID
		if _, skip := stringArrayContains(m.config.ExcludeDevices, pci_id); skip {
			cclog.ComponentDebug(m.name, "Skipping excluded device", pci_id)
			continue
		}

		// Select which value to use as 'type-id'.
		// The PCI ID is commonly required in SLURM environments because the
		// numeric IDs used by SLURM and the ones used by NVML might differ
		// depending on the job type. The PCI ID is more reliable but is commonly
		// not recorded for a job, so it must be added manually in prologue or epilogue
		// e.g. to the comment field
		tid := str_i
		if m.config.UsePciInfoAsTypeId {
			tid = pci_id
		}

		// Now we got all infos together, populate the device list
		g := &m.gpus[idx]

		// Add device handle
		g.device = device

		// Add device config
		g.config = m.config

		// Add tags
		g.tags = map[string]string{
			"type":    "accelerator",
			"type-id": tid,
		}

		// Add PCI info as tag if not already used as 'type-id'
		if m.config.AddPciInfoTag && !m.config.UsePciInfoAsTypeId {
			g.tags["pci_identifier"] = pci_id
		}

		g.meta = map[string]string{
			"source": m.name,
			"group":  "Nvidia",
		}

		// Optional meta information. Failures are reported using the NVML
		// return code of the failed call; 'err' must not be used here as it
		// is either nil or left over from a previous device.
		if m.config.AddBoardNumberMeta {
			board, ret := nvml.DeviceGetBoardPartNumber(device)
			if ret != nvml.SUCCESS {
				cclog.ComponentError(m.name, "Unable to get board part number for device at index", i, ":", nvml.ErrorString(ret))
			} else {
				g.meta["board_number"] = board
			}
		}
		if m.config.AddSerialMeta {
			serial, ret := nvml.DeviceGetSerial(device)
			if ret != nvml.SUCCESS {
				cclog.ComponentError(m.name, "Unable to get serial number for device at index", i, ":", nvml.ErrorString(ret))
			} else {
				g.meta["serial"] = serial
			}
		}
		if m.config.AddUuidMeta {
			uuid, ret := nvml.DeviceGetUUID(device)
			if ret != nvml.SUCCESS {
				cclog.ComponentError(m.name, "Unable to get UUID for device at index", i, ":", nvml.ErrorString(ret))
			} else {
				g.meta["uuid"] = uuid
			}
		}

		// Add excluded metrics
		g.excludeMetrics = map[string]bool{}
		for _, e := range m.config.ExcludeMetrics {
			g.excludeMetrics[e] = true
		}

		// Increment the index for the next device
		idx++
	}
	m.num_gpus = idx

	m.init = true
	return nil
}
// sendMetric creates a message for one metric value of a device and writes
// it to the output channel.
//
// The message carries the device's tags and meta information, the current
// time as timestamp and the value in field "value". A non-empty unit is added
// as meta information "unit". All key/value pairs of extraTags are added as
// additional tags. If the message cannot be created, the error is logged and
// nothing is sent.
func sendMetric(metricName string, value interface{}, unit string, device NvidiaCollectorDevice, output chan lp.CCMessage, extraTags ...map[string]string) {
	msg, err := lp.NewMessage(metricName, device.tags, device.meta, map[string]interface{}{"value": value}, time.Now())
	if err != nil {
		// Do not drop the metric silently, make the failure visible
		cclog.ComponentError("NvidiaCollector", "Unable to create message for metric", metricName, ":", err.Error())
		return
	}
	if unit != "" {
		msg.AddMeta("unit", unit)
	}
	for _, tags := range extraTags {
		for k, v := range tags {
			msg.AddTag(k, v)
		}
	}
	output <- msg
}
// readMemoryInfo sends the frame buffer memory metrics of a device in MByte:
// nv_fb_mem_total, nv_fb_mem_used and, when the v2 query is used,
// nv_fb_mem_reserved.
//
// The v2 query is only tried if config.UseMemoryInfoV2 is set. If it is not
// configured or fails, the v1 query is used; an error is returned only if
// the v1 query fails.
func readMemoryInfo(device NvidiaCollectorDevice, config NvidiaCollectorConfig, output chan lp.CCMessage) error {
	// Sends a byte count as MByte, provided the metric is not filtered out
	emitMByte := func(metric string, numBytes float64) {
		if !device.shouldOutput(metric) {
			return
		}
		sendMetric(metric, numBytes/(1024*1024), "MByte", device, output)
	}

	// Preferred: MemoryInfo_v2 (if configured)
	if config.UseMemoryInfoV2 {
		if infoV2, ret := nvml.DeviceGetMemoryInfo_v2(device.device); ret == nvml.SUCCESS {
			emitMByte("nv_fb_mem_total", float64(infoV2.Total))
			emitMByte("nv_fb_mem_used", float64(infoV2.Used))
			emitMByte("nv_fb_mem_reserved", float64(infoV2.Reserved))
			return nil
		}
	}

	// Fallback: MemoryInfo (v1)
	infoV1, ret := nvml.DeviceGetMemoryInfo(device.device)
	if ret != nvml.SUCCESS {
		return errors.New(nvml.ErrorString(ret))
	}
	emitMByte("nv_fb_mem_total", float64(infoV1.Total))
	emitMByte("nv_fb_mem_used", float64(infoV1.Used))
	return nil
}
// readBarMemoryInfo sends the BAR1 memory metrics of a device in MByte:
// nv_bar1_mem_total and nv_bar1_mem_used.
// An error is returned if the BAR1 memory information cannot be read.
func readBarMemoryInfo(device NvidiaCollectorDevice, config NvidiaCollectorConfig, output chan lp.CCMessage) error {
	bar1, ret := nvml.DeviceGetBAR1MemoryInfo(device.device)
	if ret != nvml.SUCCESS {
		return errors.New(nvml.ErrorString(ret))
	}

	// Sends a byte count as MByte, provided the metric is not filtered out
	emitMByte := func(metric string, numBytes float64) {
		if !device.shouldOutput(metric) {
			return
		}
		sendMetric(metric, numBytes/(1024*1024), "MByte", device, output)
	}
	emitMByte("nv_bar1_mem_total", float64(bar1.Bar1Total))
	emitMByte("nv_bar1_mem_used", float64(bar1.Bar1Used))
	return nil
}
// readUtilization sends the utilization metrics nv_util and nv_mem_util
// (both in percent) of a device.
//
// Available utilization rates
//   - Gpu: Percent of time over the past sample period during which one or more kernels was executing on the GPU.
//   - Memory: Percent of time over the past sample period during which global (device) memory was being read or written
//
// Note:
//   - During driver initialization when ECC is enabled one can see high GPU and Memory Utilization readings.
//     This is caused by ECC Memory Scrubbing mechanism that is performed during driver initialization.
//   - On MIG-enabled GPUs, querying device utilization rates is not currently supported.
//
// MIG device handles are skipped. An error is returned only if it cannot be
// determined whether the handle is a MIG device; a failing utilization query
// just sends nothing.
func readUtilization(device NvidiaCollectorDevice, config NvidiaCollectorConfig, output chan lp.CCMessage) error {
	isMig, ret := nvml.DeviceIsMigDeviceHandle(device.device)
	switch {
	case ret != nvml.SUCCESS:
		return errors.New(nvml.ErrorString(ret))
	case isMig:
		return nil
	}

	rates, ret := nvml.DeviceGetUtilizationRates(device.device)
	if ret != nvml.SUCCESS {
		return nil
	}
	if device.shouldOutput("nv_util") {
		sendMetric("nv_util", float64(rates.Gpu), "%", device, output)
	}
	if device.shouldOutput("nv_mem_util") {
		sendMetric("nv_mem_util", float64(rates.Memory), "%", device, output)
	}
	return nil
}
// readTemp sends metric nv_temp, the temperature of the GPU die sensor
// (TEMPERATURE_GPU) in degrees C. Nothing is sent if the metric is filtered
// out or the query fails. It always returns nil.
func readTemp(device NvidiaCollectorDevice, config NvidiaCollectorConfig, output chan lp.CCMessage) error {
	if !device.shouldOutput("nv_temp") {
		return nil
	}
	degC, ret := nvml.DeviceGetTemperature(device.device, nvml.TEMPERATURE_GPU)
	if ret != nvml.SUCCESS {
		return nil
	}
	sendMetric("nv_temp", float64(degC), "degC", device, output)
	return nil
}
// readFan sends metric nv_fan, the intended operating speed of the device's
// fan(s) in percent.
//
// Note: The reported speed is the intended fan speed.
// If the fan is physically blocked and unable to spin, the output will not match the actual fan speed.
//
// For all discrete products with dedicated fans.
//
// The fan speed is expressed as a percentage of the product's maximum noise tolerance fan speed.
// This value may exceed 100% in certain cases.
//
// With at most one fan a single value is sent. With more than one fan,
// DeviceGetFanSpeed_v2 is used and one value per fan is sent, tagged with
// stype "fan" and the fan index as stype-id.
//
// An error is returned only if the number of fans cannot be read.
func readFan(device NvidiaCollectorDevice, config NvidiaCollectorConfig, output chan lp.CCMessage) error {
	if !device.shouldOutput("nv_fan") {
		return nil
	}

	numFans, ret := nvml.DeviceGetNumFans(device.device)
	if ret != nvml.SUCCESS {
		// Report the NVML error text like the other readers do,
		// instead of formatting the raw return code
		return fmt.Errorf("retrieving number of fans: %s", nvml.ErrorString(ret))
	}

	if numFans <= 1 {
		fan, ret := nvml.DeviceGetFanSpeed(device.device)
		if ret == nvml.SUCCESS {
			sendMetric("nv_fan", float64(fan), "%", device, output)
		}
		return nil
	}

	for i := 0; i < numFans; i++ {
		fan, ret := nvml.DeviceGetFanSpeed_v2(device.device, i)
		if ret != nvml.SUCCESS {
			continue
		}
		sendMetric("nv_fan", float64(fan), "%", device, output, map[string]string{
			"stype":    "fan",
			"stype-id": fmt.Sprintf("%d", i),
		})
	}
	return nil
}
// readEccMode sends metric nv_ecc_mode as string value.
//
// The value reflects the *pending* ECC mode returned by the query, i.e. the
// target mode following the next reboot (changing ECC modes requires a
// reboot); the current mode is discarded. Values are "ON", "OFF" or
// "UNKNOWN"; "N/A" is sent if the query is not supported by the device.
// Any other failure sends nothing. It always returns nil.
//
// For Fermi or newer fully supported devices. Only applicable to devices with ECC.
// Requires NVML_INFOROM_ECC version 1.0 or higher.
func readEccMode(device NvidiaCollectorDevice, config NvidiaCollectorConfig, output chan lp.CCMessage) error {
	if !device.shouldOutput("nv_ecc_mode") {
		return nil
	}

	_, pendingMode, ret := nvml.DeviceGetEccMode(device.device)
	if ret == nvml.SUCCESS {
		mode := "UNKNOWN"
		if pendingMode == nvml.FEATURE_DISABLED {
			mode = "OFF"
		} else if pendingMode == nvml.FEATURE_ENABLED {
			mode = "ON"
		}
		sendMetric("nv_ecc_mode", mode, "", device, output)
		return nil
	}
	if ret == nvml.ERROR_NOT_SUPPORTED {
		sendMetric("nv_ecc_mode", "N/A", "", device, output)
	}
	return nil
}
// readPerfState sends metric nv_perf_state, the current performance state of
// the device formatted as "P<n>".
//
// Allowed PStates:
//
//	0: Maximum Performance.
//	..
//	15: Minimum Performance.
//	32: Unknown performance state.
//
// Nothing is sent if the metric is filtered out or the query fails.
// It always returns nil.
func readPerfState(device NvidiaCollectorDevice, config NvidiaCollectorConfig, output chan lp.CCMessage) error {
	if !device.shouldOutput("nv_perf_state") {
		return nil
	}
	state, ret := nvml.DeviceGetPerformanceState(device.device)
	if ret != nvml.SUCCESS {
		return nil
	}
	label := fmt.Sprintf("P%d", int(state))
	sendMetric("nv_perf_state", label, "", device, output)
	return nil
}
// readPowerUsage sends metric nv_power_usage in watts.
//
// NVML reports the power usage for this GPU and its associated circuitry
// (e.g. memory) in milliwatts; the value is divided by 1000 before sending.
// On Fermi and Kepler GPUs the reading is accurate to within +/- 5% of current power draw.
//
// The power usage is only queried if the power management mode can be read
// and is enabled. Failing queries send nothing. It always returns nil.
func readPowerUsage(device NvidiaCollectorDevice, config NvidiaCollectorConfig, output chan lp.CCMessage) error {
	if !device.shouldOutput("nv_power_usage") {
		return nil
	}

	mode, ret := nvml.DeviceGetPowerManagementMode(device.device)
	if ret != nvml.SUCCESS {
		return nil
	}
	if mode != nvml.FEATURE_ENABLED {
		return nil
	}

	milliwatts, ret := nvml.DeviceGetPowerUsage(device.device)
	if ret != nvml.SUCCESS {
		return nil
	}
	sendMetric("nv_power_usage", float64(milliwatts)/1000, "watts", device, output)
	return nil
}
// readClocks sends the current clock speeds of a device in MHz.
//
// Queried clock domains and their metrics:
//   - CLOCK_GRAPHICS: Graphics clock domain (nv_graphics_clock)
//   - CLOCK_SM: Streaming Multiprocessor clock domain (nv_sm_clock)
//   - CLOCK_MEM: Memory clock domain (nv_mem_clock)
//   - CLOCK_VIDEO: Video clock domain (nv_video_clock)
//
// A domain is only queried if its metric is not filtered out; failing
// queries send nothing. It always returns nil.
func readClocks(device NvidiaCollectorDevice, config NvidiaCollectorConfig, output chan lp.CCMessage) error {
	type clockQuery struct {
		metric string
		domain nvml.ClockType
	}
	const unit = "MHz"

	queries := []clockQuery{
		{metric: "nv_graphics_clock", domain: nvml.CLOCK_GRAPHICS},
		{metric: "nv_sm_clock", domain: nvml.CLOCK_SM},
		{metric: "nv_mem_clock", domain: nvml.CLOCK_MEM},
		{metric: "nv_video_clock", domain: nvml.CLOCK_VIDEO},
	}

	for i := range queries {
		q := queries[i]
		if !device.shouldOutput(q.metric) {
			continue
		}
		freq, ret := nvml.DeviceGetClockInfo(device.device, q.domain)
		if ret != nvml.SUCCESS {
			continue
		}
		sendMetric(q.metric, float64(freq), unit, device, output)
	}
	return nil
}
// readMaxClocks sends the maximum clock speeds of a device in MHz.
//
// Queried clock domains and their metrics:
//   - CLOCK_GRAPHICS: Graphics clock domain (nv_max_graphics_clock)
//   - CLOCK_SM: Streaming multiprocessor clock domain (nv_max_sm_clock)
//   - CLOCK_MEM: Memory clock domain (nv_max_mem_clock)
//   - CLOCK_VIDEO: Video encoder/decoder clock domain (nv_max_video_clock)
//
// Note:
// On GPUs from Fermi family, current P0 clocks (reported by nvmlDeviceGetClockInfo) can differ from max clocks by a few MHz.
//
// A domain is only queried if its metric is not filtered out; failing
// queries send nothing. It always returns nil.
func readMaxClocks(device NvidiaCollectorDevice, config NvidiaCollectorConfig, output chan lp.CCMessage) error {
	type clockQuery struct {
		metric string
		domain nvml.ClockType
	}
	const unit = "MHz"

	queries := []clockQuery{
		{metric: "nv_max_graphics_clock", domain: nvml.CLOCK_GRAPHICS},
		{metric: "nv_max_sm_clock", domain: nvml.CLOCK_SM},
		{metric: "nv_max_mem_clock", domain: nvml.CLOCK_MEM},
		{metric: "nv_max_video_clock", domain: nvml.CLOCK_VIDEO},
	}

	for i := range queries {
		q := queries[i]
		if !device.shouldOutput(q.metric) {
			continue
		}
		freq, ret := nvml.DeviceGetMaxClockInfo(device.device, q.domain)
		if ret != nvml.SUCCESS {
			continue
		}
		sendMetric(q.metric, float64(freq), unit, device, output)
	}
	return nil
}
// readEccErrors sends the total (aggregate) ECC error counts of a device:
// nv_ecc_uncorrected_error and nv_ecc_corrected_error. If
// config.SendDiffValues is set and prevStats is not nil, the change since
// the previous read is sent as nv_ecc_uncorrected_error_diff and
// nv_ecc_corrected_error_diff, and prevStats is updated.
//
// For Fermi or newer fully supported devices.
// Only applicable to devices with ECC.
// Requires NVML_INFOROM_ECC version 1.0 or higher.
// Requires ECC Mode to be enabled.
//
// The total error count is the sum of errors across each of the separate memory systems,
// i.e. the total set of errors across the entire device.
//
// A counter is queried if its base metric or (with diff values enabled) its
// diff metric is requested, so a diff metric can be used without the base
// metric. A counter that could not be read is neither sent, nor diffed, nor
// stored in prevStats.
//
// Diff rules: if both previous values are zero, the read is treated as first
// measurement and both diffs are 0; a counter lower than its previous value
// (e.g. after a reset) also yields a diff of 0.
//
// deviceID is not used. The function always returns nil.
func readEccErrors(device NvidiaCollectorDevice, config NvidiaCollectorConfig, output chan lp.CCMessage, prevStats *eccStats, deviceID string) error {
	// Diff handling requires a place to keep the previous values
	sendDiffs := config.SendDiffValues && prevStats != nil

	wantUncorrected := device.shouldOutput("nv_ecc_uncorrected_error")
	wantCorrected := device.shouldOutput("nv_ecc_corrected_error")
	wantUncorrectedDiff := sendDiffs && device.shouldOutput("nv_ecc_uncorrected_error_diff")
	wantCorrectedDiff := sendDiffs && device.shouldOutput("nv_ecc_corrected_error_diff")

	var (
		currentUncorrected, currentCorrected uint64
		haveUncorrected, haveCorrected       bool
	)

	if wantUncorrected || wantUncorrectedDiff {
		count, ret := nvml.DeviceGetTotalEccErrors(device.device, nvml.MEMORY_ERROR_TYPE_UNCORRECTED, nvml.AGGREGATE_ECC)
		if ret == nvml.SUCCESS {
			currentUncorrected, haveUncorrected = uint64(count), true
			if wantUncorrected {
				sendMetric("nv_ecc_uncorrected_error", currentUncorrected, "", device, output)
			}
		}
	}
	if wantCorrected || wantCorrectedDiff {
		count, ret := nvml.DeviceGetTotalEccErrors(device.device, nvml.MEMORY_ERROR_TYPE_CORRECTED, nvml.AGGREGATE_ECC)
		if ret == nvml.SUCCESS {
			currentCorrected, haveCorrected = uint64(count), true
			if wantCorrected {
				sendMetric("nv_ecc_corrected_error", currentCorrected, "", device, output)
			}
		}
	}

	if !sendDiffs {
		return nil
	}

	// Evaluate the first-measurement heuristic before prevStats gets updated
	firstMeasurement := prevStats.uncorrected == 0 && prevStats.corrected == 0
	counterDiff := func(current, previous uint64) uint64 {
		if firstMeasurement || current < previous {
			// No reference value yet or counter was reset
			return 0
		}
		return current - previous
	}

	if haveUncorrected {
		diff := counterDiff(currentUncorrected, prevStats.uncorrected)
		prevStats.uncorrected = currentUncorrected
		if wantUncorrectedDiff {
			sendMetric("nv_ecc_uncorrected_error_diff", diff, "", device, output)
		}
	}
	if haveCorrected {
		diff := counterDiff(currentCorrected, prevStats.corrected)
		prevStats.corrected = currentCorrected
		if wantCorrectedDiff {
			sendMetric("nv_ecc_corrected_error_diff", diff, "", device, output)
		}
	}
	return nil
}
// readPowerLimit sends metric nv_power_max_limit, the power management limit
// of the device in watts (the queried value is divided by 1000).
//
// For Fermi or newer fully supported devices.
//
// The power limit defines the upper boundary for the card's power draw.
// If the card's total power draw reaches this limit the power management algorithm kicks in.
//
// Nothing is sent if the metric is filtered out or the query fails.
// It always returns nil.
func readPowerLimit(device NvidiaCollectorDevice, config NvidiaCollectorConfig, output chan lp.CCMessage) error {
	if !device.shouldOutput("nv_power_max_limit") {
		return nil
	}
	limit, ret := nvml.DeviceGetPowerManagementLimit(device.device)
	if ret != nvml.SUCCESS {
		return nil
	}
	sendMetric("nv_power_max_limit", float64(limit)/1000, "watts", device, output)
	return nil
}
// readEncUtilization sends metric nv_encoder_util, the current utilization
// of the encoder in percent. The sampling period returned by the query is
// discarded.
//
// For Kepler or newer fully supported devices.
//
// Note: On MIG-enabled GPUs, querying encoder utilization is not currently supported.
//
// MIG device handles are skipped. An error is returned only if it cannot be
// determined whether the handle is a MIG device; a failing utilization query
// just sends nothing.
func readEncUtilization(device NvidiaCollectorDevice, config NvidiaCollectorConfig, output chan lp.CCMessage) error {
	isMig, ret := nvml.DeviceIsMigDeviceHandle(device.device)
	switch {
	case ret != nvml.SUCCESS:
		return errors.New(nvml.ErrorString(ret))
	case isMig:
		return nil
	}

	if !device.shouldOutput("nv_encoder_util") {
		return nil
	}
	utilization, _, ret := nvml.DeviceGetEncoderUtilization(device.device)
	if ret != nvml.SUCCESS {
		return nil
	}
	sendMetric("nv_encoder_util", float64(utilization), "%", device, output)
	return nil
}
// readDecUtilization sends metric nv_decoder_util, the current utilization
// of the decoder in percent. The sampling period returned by the query is
// discarded.
//
// For Kepler or newer fully supported devices.
//
// Note: On MIG-enabled GPUs, querying decoder utilization is not currently supported.
//
// MIG device handles are skipped. An error is returned only if it cannot be
// determined whether the handle is a MIG device; a failing utilization query
// just sends nothing.
func readDecUtilization(device NvidiaCollectorDevice, config NvidiaCollectorConfig, output chan lp.CCMessage) error {
	isMig, ret := nvml.DeviceIsMigDeviceHandle(device.device)
	switch {
	case ret != nvml.SUCCESS:
		return errors.New(nvml.ErrorString(ret))
	case isMig:
		return nil
	}

	if !device.shouldOutput("nv_decoder_util") {
		return nil
	}
	utilization, _, ret := nvml.DeviceGetDecoderUtilization(device.device)
	if ret != nvml.SUCCESS {
		return nil
	}
	sendMetric("nv_decoder_util", float64(utilization), "%", device, output)
	return nil
}
// readRemappedRows sends the remapped rows metrics of a device:
// nv_remapped_rows_corrected, nv_remapped_rows_uncorrected,
// nv_remapped_rows_pending and nv_remapped_rows_failure (the latter two as
// 0/1 flags). If config.SendDiffValues is set and prevStats is not nil, the
// change since the previous read is sent in the corresponding '*_diff'
// metrics and prevStats is updated.
//
// Get number of remapped rows. The number of rows reported will be based on the cause of the remapping.
// isPending indicates whether or not there are pending remappings.
// A reset will be required to actually remap the row.
// failureOccurred will be set if a row remapping ever failed in the past.
// A pending remapping won't affect future work on the GPU since error-containment and dynamic page blacklisting will take care of that.
//
// For Ampere or newer fully supported devices.
//
// Note: On MIG-enabled GPUs with active instances, querying the number of remapped rows is not supported
//
// Diff rules: if all previous values are zero, the read is treated as first
// measurement and all diffs are 0. Negative diffs of the corrected and
// uncorrected row counts are clamped to 0; the pending and failure flag
// diffs are sent as is and may be -1 when a flag got cleared.
//
// deviceID is not used. Nothing is sent if the query fails.
// The function always returns nil.
func readRemappedRows(device NvidiaCollectorDevice, config NvidiaCollectorConfig, output chan lp.CCMessage, prevStats *remappedRowsStats, deviceID string) error {
	corrected, uncorrected, pendingBool, failureBool, ret := nvml.DeviceGetRemappedRows(device.device)
	if ret != nvml.SUCCESS {
		return nil
	}

	// Convert the flags to 0/1
	var pending, failure int
	if pendingBool {
		pending = 1
	}
	if failureBool {
		failure = 1
	}

	if device.shouldOutput("nv_remapped_rows_corrected") {
		sendMetric("nv_remapped_rows_corrected", float64(corrected), "", device, output)
	}
	if device.shouldOutput("nv_remapped_rows_uncorrected") {
		sendMetric("nv_remapped_rows_uncorrected", float64(uncorrected), "", device, output)
	}
	if device.shouldOutput("nv_remapped_rows_pending") {
		sendMetric("nv_remapped_rows_pending", pending, "", device, output)
	}
	if device.shouldOutput("nv_remapped_rows_failure") {
		sendMetric("nv_remapped_rows_failure", failure, "", device, output)
	}

	// Diff handling requires a place to keep the previous values
	if !config.SendDiffValues || prevStats == nil {
		return nil
	}

	var diffCorrected, diffUncorrected, diffPending, diffFailure int
	firstMeasurement := prevStats.corrected == 0 &&
		prevStats.uncorrected == 0 &&
		prevStats.pending == 0 &&
		prevStats.failure == 0
	if !firstMeasurement {
		diffCorrected = corrected - prevStats.corrected
		diffUncorrected = uncorrected - prevStats.uncorrected
		diffPending = pending - prevStats.pending
		diffFailure = failure - prevStats.failure

		// The values are signed, so a decreased counter shows up as a
		// negative diff (not as a wrapped-around large value)
		if diffCorrected < 0 {
			diffCorrected = 0
		}
		if diffUncorrected < 0 {
			diffUncorrected = 0
		}
	}

	prevStats.corrected = corrected
	prevStats.uncorrected = uncorrected
	prevStats.pending = pending
	prevStats.failure = failure

	if device.shouldOutput("nv_remapped_rows_corrected_diff") {
		sendMetric("nv_remapped_rows_corrected_diff", float64(diffCorrected), "", device, output)
	}
	if device.shouldOutput("nv_remapped_rows_uncorrected_diff") {
		sendMetric("nv_remapped_rows_uncorrected_diff", float64(diffUncorrected), "", device, output)
	}
	if device.shouldOutput("nv_remapped_rows_pending_diff") {
		sendMetric("nv_remapped_rows_pending_diff", diffPending, "", device, output)
	}
	if device.shouldOutput("nv_remapped_rows_failure_diff") {
		sendMetric("nv_remapped_rows_failure_diff", diffFailure, "", device, output)
	}
	return nil
}
// readProcessCounts sends the number of processes using a device:
//   - nv_compute_processes: processes with a compute context
//     (e.g. CUDA applications which have an active context).
//     For Fermi or newer fully supported devices.
//   - nv_graphics_processes: processes with a graphics context
//     (e.g. applications using OpenGL, DirectX).
//     For Kepler or newer fully supported devices.
//
// Graphics applications are not listed by the compute query.
//
// Keep in mind that the information returned by the queries is dynamic and
// the number of processes might change in time.
//
// Note: In MIG mode, if a device handle is provided, the API returns aggregate
// information, only if the caller has appropriate privileges. Per-instance
// information can be queried by using specific MIG device handles.
// Querying per-instance information using MIG device handles is not supported
// if the device is in vGPU Host virtualization mode.
//
// A query is only performed if its metric is not filtered out; failing
// queries send nothing. It always returns nil.
func readProcessCounts(device NvidiaCollectorDevice, config NvidiaCollectorConfig, output chan lp.CCMessage) error {
	if device.shouldOutput("nv_compute_processes") {
		if procs, ret := nvml.DeviceGetComputeRunningProcesses(device.device); ret == nvml.SUCCESS {
			sendMetric("nv_compute_processes", len(procs), "", device, output)
		}
	}

	if device.shouldOutput("nv_graphics_processes") {
		if procs, ret := nvml.DeviceGetGraphicsRunningProcesses(device.device); ret == nvml.SUCCESS {
			sendMetric("nv_graphics_processes", len(procs), "", device, output)
		}
	}

	return nil
}
// readViolationStats sends, per performance policy, the duration of time
// (in seconds) during which the device was throttled (lower than requested
// clocks). If config.SendDiffValues is set and prevStats is not nil, the
// change since the previous read is sent as '<metric>_diff' and prevStats is
// updated.
//
// The method is important to users who are trying to understand if their GPUs throttle at any point during their applications. The
// difference in violation times at two different reference times gives the indication of GPU throttling event.
//
// Violation for thermal capping is not supported at this time.
//
// For Kepler or newer fully supported devices.
//
// A policy is queried if its base metric or (with diff values enabled) its
// diff metric is requested, so a diff metric can be used without the base
// metric. Policies whose query fails are skipped.
//
// Diff rules: a previous value of zero is treated as first measurement and
// yields a diff of 0; negative diffs are clamped to 0.
//
// The function always returns nil.
func readViolationStats(device NvidiaCollectorDevice, config NvidiaCollectorConfig, output chan lp.CCMessage, prevStats *violationStats) error {
	type violationMetric struct {
		name   string
		policy nvml.PerfPolicyType
		// prev selects the field holding the previous value of this metric
		prev func(s *violationStats) *float64
	}

	metrics := []violationMetric{
		{"nv_violation_power", nvml.PERF_POLICY_POWER, func(s *violationStats) *float64 { return &s.power }},
		{"nv_violation_thermal", nvml.PERF_POLICY_THERMAL, func(s *violationStats) *float64 { return &s.thermal }},
		{"nv_violation_sync_boost", nvml.PERF_POLICY_SYNC_BOOST, func(s *violationStats) *float64 { return &s.syncBoost }},
		{"nv_violation_board_limit", nvml.PERF_POLICY_BOARD_LIMIT, func(s *violationStats) *float64 { return &s.boardLimit }},
		{"nv_violation_low_util", nvml.PERF_POLICY_LOW_UTILIZATION, func(s *violationStats) *float64 { return &s.lowUtil }},
		{"nv_violation_reliability", nvml.PERF_POLICY_RELIABILITY, func(s *violationStats) *float64 { return &s.reliability }},
		{"nv_violation_below_app_clock", nvml.PERF_POLICY_TOTAL_APP_CLOCKS, func(s *violationStats) *float64 { return &s.belowAppClock }},
		{"nv_violation_below_base_clock", nvml.PERF_POLICY_TOTAL_BASE_CLOCKS, func(s *violationStats) *float64 { return &s.belowBaseClock }},
	}

	// Diff handling requires a place to keep the previous values
	sendDiffs := config.SendDiffValues && prevStats != nil

	for _, mtr := range metrics {
		diffName := mtr.name + "_diff"
		wantBase := device.shouldOutput(mtr.name)
		wantDiff := sendDiffs && device.shouldOutput(diffName)
		if !wantBase && !wantDiff {
			continue
		}

		violTime, ret := nvml.DeviceGetViolationStatus(device.device, mtr.policy)
		if ret != nvml.SUCCESS {
			continue
		}

		// Convert the violation time to seconds
		currentValue := float64(violTime.ViolationTime) * 1e-9
		if wantBase {
			sendMetric(mtr.name, currentValue, "sec", device, output)
		}

		if !sendDiffs {
			continue
		}

		prevValue := mtr.prev(prevStats)
		if wantDiff {
			var diff float64
			// A previous value of zero marks the first measurement
			if *prevValue != 0 {
				diff = currentValue - *prevValue
				if diff < 0 {
					diff = 0
				}
			}
			sendMetric(diffName, diff, "sec", device, output)
		}
		*prevValue = currentValue
	}
	return nil
}
func readNVLinkStats(device NvidiaCollectorDevice, config NvidiaCollectorConfig, output chan lp.CCMessage, prevStats *nvlinkStats, deviceID string) error {
var aggregate_crc_errors uint64 = 0
var aggregate_ecc_errors uint64 = 0
var aggregate_replay_errors uint64 = 0
var aggregate_recovery_errors uint64 = 0
var aggregate_crc_flit_errors uint64 = 0
// Retrieves the specified error counter value
// Please refer to \a nvmlNvLinkErrorCounter_t for error counters that are available
//
// For Pascal &tm; or newer fully supported devices.
needsMetric := func(base string) bool {
return device.shouldOutput(base) ||
device.shouldOutput(base+"_sum") ||
(config.SendDiffValues && device.shouldOutput(base+"_diff")) ||
(config.SendDiffValues && device.shouldOutput(base+"_sum_diff"))
}
for i := 0; i < nvml.NVLINK_MAX_LINKS; i++ {
state, ret := nvml.DeviceGetNvLinkState(device.device, i)
if ret != nvml.SUCCESS {
continue
}
if state != nvml.FEATURE_ENABLED {
continue
}
extraTags := map[string]string{
"stype": "nvlink",
"stype-id": fmt.Sprintf("%d", i),
}
if needsMetric("nv_nvlink_crc_errors") {
count, ret := nvml.DeviceGetNvLinkErrorCounter(device.device, i, nvml.NVLINK_ERROR_DL_CRC_DATA)
if ret == nvml.SUCCESS {
aggregate_crc_errors += count
if device.shouldOutput("nv_nvlink_crc_errors") {
sendMetric("nv_nvlink_crc_errors", count, "", device, output, extraTags)
}
if config.SendDiffValues && device.shouldOutput("nv_nvlink_crc_errors_diff") {
var diff uint64
if prevStats.crcErrors[i] == 0 {
diff = 0
} else {
diff = count - prevStats.crcErrors[i]
if diff > count {
diff = 0
}
}
sendMetric("nv_nvlink_crc_errors_diff", diff, "", device, output, extraTags)
prevStats.crcErrors[i] = count
}
}
}
if needsMetric("nv_nvlink_ecc_errors") {
count, ret := nvml.DeviceGetNvLinkErrorCounter(device.device, i, nvml.NVLINK_ERROR_DL_ECC_DATA)
if ret == nvml.SUCCESS {
aggregate_ecc_errors += count
if device.shouldOutput("nv_nvlink_ecc_errors") {
sendMetric("nv_nvlink_ecc_errors", count, "", device, output, extraTags)
}
if config.SendDiffValues && device.shouldOutput("nv_nvlink_ecc_errors_diff") {
var diff uint64
if prevStats.eccErrors[i] == 0 {
diff = 0
} else {
diff = count - prevStats.eccErrors[i]
if diff > count {
diff = 0
}
}
sendMetric("nv_nvlink_ecc_errors_diff", diff, "", device, output, extraTags)
prevStats.eccErrors[i] = count
}
}
}
if needsMetric("nv_nvlink_replay_errors") {
count, ret := nvml.DeviceGetNvLinkErrorCounter(device.device, i, nvml.NVLINK_ERROR_DL_REPLAY)
if ret == nvml.SUCCESS {
aggregate_replay_errors += count
if device.shouldOutput("nv_nvlink_replay_errors") {
sendMetric("nv_nvlink_replay_errors", count, "", device, output, extraTags)
}
if config.SendDiffValues && device.shouldOutput("nv_nvlink_replay_errors_diff") {
var diff uint64
if prevStats.replayErrors[i] == 0 {
diff = 0
} else {
diff = count - prevStats.replayErrors[i]
if diff > count {
diff = 0
}
}
sendMetric("nv_nvlink_replay_errors_diff", diff, "", device, output, extraTags)
prevStats.replayErrors[i] = count
}
}
}
if needsMetric("nv_nvlink_recovery_errors") {
count, ret := nvml.DeviceGetNvLinkErrorCounter(device.device, i, nvml.NVLINK_ERROR_DL_RECOVERY)
if ret == nvml.SUCCESS {
aggregate_recovery_errors += count
if device.shouldOutput("nv_nvlink_recovery_errors") {
sendMetric("nv_nvlink_recovery_errors", count, "", device, output, extraTags)
}
if config.SendDiffValues && device.shouldOutput("nv_nvlink_recovery_errors_diff") {
var diff uint64
if prevStats.recoveryErrors[i] == 0 {
diff = 0
} else {
diff = count - prevStats.recoveryErrors[i]
if diff > count {
diff = 0
}
}
sendMetric("nv_nvlink_recovery_errors_diff", diff, "", device, output, extraTags)
prevStats.recoveryErrors[i] = count
}
}
}
if needsMetric("nv_nvlink_crc_flit_errors") {
count, ret := nvml.DeviceGetNvLinkErrorCounter(device.device, i, nvml.NVLINK_ERROR_DL_CRC_FLIT)
if ret == nvml.SUCCESS {
aggregate_crc_flit_errors += count
if device.shouldOutput("nv_nvlink_crc_flit_errors") {
sendMetric("nv_nvlink_crc_flit_errors", count, "", device, output, extraTags)
}
if config.SendDiffValues && device.shouldOutput("nv_nvlink_crc_flit_errors_diff") {
var diff uint64
if prevStats.crcFlitErrors[i] == 0 {
diff = 0
} else {
diff = count - prevStats.crcFlitErrors[i]
if diff > count {
diff = 0
}
}
sendMetric("nv_nvlink_crc_flit_errors_diff", diff, "", device, output, extraTags)
prevStats.crcFlitErrors[i] = count
}
}
}
}
// Export aggregated values
if device.shouldOutput("nv_nvlink_crc_errors_sum") {
sendMetric("nv_nvlink_crc_errors_sum", aggregate_crc_errors, "", device, output, map[string]string{"stype": "nvlink"})
}
if device.shouldOutput("nv_nvlink_ecc_errors_sum") {
sendMetric("nv_nvlink_ecc_errors_sum", aggregate_ecc_errors, "", device, output, map[string]string{"stype": "nvlink"})
}
if device.shouldOutput("nv_nvlink_replay_errors_sum") {
sendMetric("nv_nvlink_replay_errors_sum", aggregate_replay_errors, "", device, output, map[string]string{"stype": "nvlink"})
}
if device.shouldOutput("nv_nvlink_recovery_errors_sum") {
sendMetric("nv_nvlink_recovery_errors_sum", aggregate_recovery_errors, "", device, output, map[string]string{"stype": "nvlink"})
}
if device.shouldOutput("nv_nvlink_crc_flit_errors_sum") {
sendMetric("nv_nvlink_crc_flit_errors_sum", aggregate_crc_flit_errors, "", device, output, map[string]string{"stype": "nvlink"})
}
// Export aggregated diff values
if config.SendDiffValues {
var diff_crc_sum, diff_ecc_sum, diff_replay_sum, diff_recovery_sum, diff_crc_flit_sum uint64
// Initialize diffs to 0 for the first measurement
if prevStats.aggregateCrcErrors == 0 && prevStats.aggregateEccErrors == 0 && prevStats.aggregateReplayErrors == 0 && prevStats.aggregateRecoveryErrors == 0 && prevStats.aggregateCrcFlitErrors == 0 {
diff_crc_sum = 0
diff_ecc_sum = 0
diff_replay_sum = 0
diff_recovery_sum = 0
diff_crc_flit_sum = 0
} else {
// Compute diffs for sum metrics
diff_crc_sum = aggregate_crc_errors - prevStats.aggregateCrcErrors
diff_ecc_sum = aggregate_ecc_errors - prevStats.aggregateEccErrors
diff_replay_sum = aggregate_replay_errors - prevStats.aggregateReplayErrors
diff_recovery_sum = aggregate_recovery_errors - prevStats.aggregateRecoveryErrors
diff_crc_flit_sum = aggregate_crc_flit_errors - prevStats.aggregateCrcFlitErrors
// Reset diffs to 0 if they exceed current values (e.g., counter reset)
if diff_crc_sum > aggregate_crc_errors {
diff_crc_sum = 0
}
if diff_ecc_sum > aggregate_ecc_errors {
diff_ecc_sum = 0
}
if diff_replay_sum > aggregate_replay_errors {
diff_replay_sum = 0
}
if diff_recovery_sum > aggregate_recovery_errors {
diff_recovery_sum = 0
}
if diff_crc_flit_sum > aggregate_crc_flit_errors {
diff_crc_flit_sum = 0
}
}
// Update prevStats with current aggregate values
prevStats.aggregateCrcErrors = aggregate_crc_errors
prevStats.aggregateEccErrors = aggregate_ecc_errors
prevStats.aggregateReplayErrors = aggregate_replay_errors
prevStats.aggregateRecoveryErrors = aggregate_recovery_errors
prevStats.aggregateCrcFlitErrors = aggregate_crc_flit_errors
// Export diff metrics for sum values
if device.shouldOutput("nv_nvlink_crc_errors_sum_diff") {
sendMetric("nv_nvlink_crc_errors_sum_diff", diff_crc_sum, "", device, output, map[string]string{"stype": "nvlink"})
}
if device.shouldOutput("nv_nvlink_ecc_errors_sum_diff") {
sendMetric("nv_nvlink_ecc_errors_sum_diff", diff_ecc_sum, "", device, output, map[string]string{"stype": "nvlink"})
}
if device.shouldOutput("nv_nvlink_replay_errors_sum_diff") {
sendMetric("nv_nvlink_replay_errors_sum_diff", diff_replay_sum, "", device, output, map[string]string{"stype": "nvlink"})
}
if device.shouldOutput("nv_nvlink_recovery_errors_sum_diff") {
sendMetric("nv_nvlink_recovery_errors_sum_diff", diff_recovery_sum, "", device, output, map[string]string{"stype": "nvlink"})
}
if device.shouldOutput("nv_nvlink_crc_flit_errors_sum_diff") {
sendMetric("nv_nvlink_crc_flit_errors_sum_diff", diff_crc_flit_sum, "", device, output, map[string]string{"stype": "nvlink"})
}
}
return nil
}
// Read collects one round of metrics for every GPU held by the collector and
// sends the resulting messages to output. It returns immediately when the
// collector is not initialized.
//
// For each GPU the generic per-device readers are executed first, followed by
// the readers that need the previous measurement (ECC errors, remapped rows,
// violation stats, NVLink stats). The previous-measurement state is kept per
// device, keyed by the device's "type-id" tag, and is created lazily on first
// use. When ProcessMigDevices is set, the generic readers are additionally run
// for every MIG device of a GPU that has MIG mode enabled.
//
// The interval argument is not used by this method.
func (m *NvidiaCollector) Read(interval time.Duration, output chan lp.CCMessage) {
	if !m.init {
		return
	}

	// deviceName resolves a printable device name for log messages.
	deviceName := func(device NvidiaCollectorDevice) string {
		name, ret := nvml.DeviceGetName(device.device)
		if ret != nvml.SUCCESS {
			return "NoName"
		}
		return name
	}

	// processMetric runs a single reader and logs a failure, including the
	// returned error, without aborting the remaining readers.
	processMetric := func(metricName string, f func(NvidiaCollectorDevice, NvidiaCollectorConfig, chan lp.CCMessage) error, device NvidiaCollectorDevice) {
		if err := f(device, m.config, output); err != nil {
			cclog.ComponentDebug(m.name, fmt.Sprintf("%s for device %s failed: %v", metricName, deviceName(device), err))
		}
	}

	// Readers that only need the device itself; executed in this order for
	// physical GPUs and MIG devices alike.
	readers := []struct {
		name string
		read func(NvidiaCollectorDevice, NvidiaCollectorConfig, chan lp.CCMessage) error
	}{
		{"readMemoryInfo", readMemoryInfo},
		{"readUtilization", readUtilization},
		{"readTemp", readTemp},
		{"readFan", readFan},
		{"readEccMode", readEccMode},
		{"readPerfState", readPerfState},
		{"readPowerUsage", readPowerUsage},
		{"readClocks", readClocks},
		{"readMaxClocks", readMaxClocks},
		{"readPowerLimit", readPowerLimit},
		{"readEncUtilization", readEncUtilization},
		{"readDecUtilization", readDecUtilization},
		{"readBarMemoryInfo", readBarMemoryInfo},
		{"readProcessCounts", readProcessCounts},
	}
	readAll := func(device NvidiaCollectorDevice) {
		for _, r := range readers {
			processMetric(r.name, r.read, device)
		}
	}

	for i := 0; i < m.num_gpus; i++ {
		gpu := m.gpus[i]
		readAll(gpu)

		// State of the previous measurement is stored per device.
		deviceID := gpu.tags["type-id"]

		eccPrev, ok := m.prevEccStats[deviceID]
		if !ok {
			eccPrev = &eccStats{}
			m.prevEccStats[deviceID] = eccPrev
		}
		readEccErrors(gpu, m.config, output, eccPrev, deviceID)

		remappedPrev, ok := m.prevRemappedStats[deviceID]
		if !ok {
			remappedPrev = &remappedRowsStats{}
			m.prevRemappedStats[deviceID] = remappedPrev
		}
		readRemappedRows(gpu, m.config, output, remappedPrev, deviceID)

		violationPrev, ok := m.prevViolationStats[deviceID]
		if !ok {
			violationPrev = &violationStats{}
			m.prevViolationStats[deviceID] = violationPrev
		}
		readViolationStats(gpu, m.config, output, violationPrev)

		nvlinkPrev, ok := m.prevNVLinkStats[deviceID]
		if !ok {
			nvlinkPrev = &nvlinkStats{}
			m.prevNVLinkStats[deviceID] = nvlinkPrev
		}
		readNVLinkStats(gpu, m.config, output, nvlinkPrev, deviceID)

		if !m.config.ProcessMigDevices {
			continue
		}

		// Skip GPUs where MIG mode cannot be queried, is disabled, or that
		// report no MIG device slots.
		current, _, ret := nvml.DeviceGetMigMode(gpu.device)
		if ret != nvml.SUCCESS || current == nvml.DEVICE_MIG_DISABLE {
			continue
		}
		maxMig, ret := nvml.DeviceGetMaxMigDeviceCount(gpu.device)
		if ret != nvml.SUCCESS || maxMig == 0 {
			continue
		}
		cclog.ComponentDebug(m.name, "Reading MIG devices for GPU", i)

		for j := 0; j < maxMig; j++ {
			mdev, ret := nvml.DeviceGetMigDeviceHandleByIndex(gpu.device, j)
			if ret != nvml.SUCCESS {
				// No MIG device handle available at this index.
				continue
			}

			// Every MIG device gets its own exclude set built from the
			// collector configuration.
			excludeMetrics := make(map[string]bool, len(m.config.ExcludeMetrics))
			for _, metric := range m.config.ExcludeMetrics {
				excludeMetrics[metric] = true
			}

			// The MIG device starts with copies of the parent GPU's tags and
			// meta data so the parent's maps are never modified.
			migDevice := NvidiaCollectorDevice{
				device:         mdev,
				tags:           make(map[string]string, len(gpu.tags)+2),
				meta:           make(map[string]string, len(gpu.meta)),
				excludeMetrics: excludeMetrics,
				config:         m.config,
			}
			for k, v := range gpu.tags {
				migDevice.tags[k] = v
			}
			for k, v := range gpu.meta {
				migDevice.meta[k] = v
			}
			migDevice.tags["stype"] = "mig"

			// Select the sub-type ID: UUID, slice name, or (fallback) the
			// MIG index.
			switch {
			case m.config.UseUuidForMigDevices:
				uuid, ret := nvml.DeviceGetUUID(mdev)
				if ret != nvml.SUCCESS {
					cclog.ComponentError(m.name, "Unable to get UUID for mig device at index", j, ":", nvml.ErrorString(ret))
				} else {
					migDevice.tags["stype-id"] = uuid
				}
			case m.config.UseSliceForMigDevices:
				name, ret := nvml.DeviceGetName(gpu.device)
				if ret == nvml.SUCCESS {
					mname, ret := nvml.DeviceGetName(mdev)
					if ret == nvml.SUCCESS {
						// Strip the parent GPU name and the "MIG" marker from
						// the MIG device name; the remainder is used as ID.
						slice := strings.ReplaceAll(mname, name, "")
						slice = strings.ReplaceAll(slice, "MIG", "")
						migDevice.tags["stype-id"] = strings.TrimSpace(slice)
					}
				}
			}
			if _, ok := migDevice.tags["stype-id"]; !ok {
				migDevice.tags["stype-id"] = fmt.Sprintf("%d", j)
			}

			// If the parent carries a UUID in its meta data and the UUID is
			// not already used as the sub-type ID, replace it with the MIG
			// device's own UUID.
			if _, ok := migDevice.meta["uuid"]; ok && !m.config.UseUuidForMigDevices {
				uuid, ret := nvml.DeviceGetUUID(mdev)
				if ret == nvml.SUCCESS {
					migDevice.meta["uuid"] = uuid
				}
			}

			readAll(migDevice)
		}
	}
}
// Close shuts down NVML if the collector is initialized and then marks the
// collector as uninitialized. A failing shutdown is logged; the collector is
// marked uninitialized in either case. Calling Close on a collector that is
// not initialized does nothing.
func (m *NvidiaCollector) Close() {
	if !m.init {
		return
	}
	if ret := nvml.Shutdown(); ret != nvml.SUCCESS {
		cclog.ComponentError(m.name, "nvml.Shutdown failed:", nvml.ErrorString(ret))
	}
	m.init = false
}