mirror of
https://github.com/ClusterCockpit/cc-metric-collector.git
synced 2024-12-25 15:09:05 +01:00
Add comments and units to all nvidia metrics
This commit is contained in:
parent
14c9d6f792
commit
01faa3b531
@ -1,13 +1,18 @@
|
|||||||
{
|
{
|
||||||
"numastats": {},
|
|
||||||
"cpufreq": {},
|
"cpufreq": {},
|
||||||
"cpufreq_cpuinfo": {},
|
"cpufreq_cpuinfo": {},
|
||||||
"gpfs": {
|
"gpfs": {
|
||||||
"exclude_filesystem": [ "test_fs" ]
|
"exclude_filesystem": [
|
||||||
|
"test_fs"
|
||||||
|
]
|
||||||
},
|
},
|
||||||
"loadavg": {
|
"loadavg": {
|
||||||
"exclude_metrics": [ "proc_total" ]
|
"exclude_metrics": [
|
||||||
|
"proc_total"
|
||||||
|
]
|
||||||
},
|
},
|
||||||
|
"numastats": {},
|
||||||
|
"nvidia": {},
|
||||||
"tempstat": {
|
"tempstat": {
|
||||||
"tag_override": {
|
"tag_override": {
|
||||||
"hwmon0": {
|
"hwmon0": {
|
||||||
|
@ -134,17 +134,29 @@ func (m *NvidiaCollector) Read(interval time.Duration, output chan lp.CCMetric)
|
|||||||
device := &m.gpus[i]
|
device := &m.gpus[i]
|
||||||
|
|
||||||
if !device.excludeMetrics["nv_util"] || !device.excludeMetrics["nv_mem_util"] {
|
if !device.excludeMetrics["nv_util"] || !device.excludeMetrics["nv_mem_util"] {
|
||||||
|
// Retrieves the current utilization rates for the device's major subsystems.
|
||||||
|
//
|
||||||
|
// Available utilization rates
|
||||||
|
// * Gpu: Percent of time over the past sample period during which one or more kernels was executing on the GPU.
|
||||||
|
// * Memory: Percent of time over the past sample period during which global (device) memory was being read or written
|
||||||
|
//
|
||||||
|
// Note:
|
||||||
|
// * During driver initialization when ECC is enabled one can see high GPU and Memory Utilization readings.
|
||||||
|
// This is caused by ECC Memory Scrubbing mechanism that is performed during driver initialization.
|
||||||
|
// * On MIG-enabled GPUs, querying device utilization rates is not currently supported.
|
||||||
util, ret := nvml.DeviceGetUtilizationRates(device.device)
|
util, ret := nvml.DeviceGetUtilizationRates(device.device)
|
||||||
if ret == nvml.SUCCESS {
|
if ret == nvml.SUCCESS {
|
||||||
if !device.excludeMetrics["nv_util"] {
|
if !device.excludeMetrics["nv_util"] {
|
||||||
y, err := lp.New("nv_util", device.tags, m.meta, map[string]interface{}{"value": float64(util.Gpu)}, time.Now())
|
y, err := lp.New("nv_util", device.tags, m.meta, map[string]interface{}{"value": float64(util.Gpu)}, time.Now())
|
||||||
if err == nil {
|
if err == nil {
|
||||||
|
y.AddMeta("unit", "%")
|
||||||
output <- y
|
output <- y
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
if !device.excludeMetrics["nv_mem_util"] {
|
if !device.excludeMetrics["nv_mem_util"] {
|
||||||
y, err := lp.New("nv_mem_util", device.tags, m.meta, map[string]interface{}{"value": float64(util.Memory)}, time.Now())
|
y, err := lp.New("nv_mem_util", device.tags, m.meta, map[string]interface{}{"value": float64(util.Memory)}, time.Now())
|
||||||
if err == nil {
|
if err == nil {
|
||||||
|
y.AddMeta("unit", "%")
|
||||||
output <- y
|
output <- y
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -152,6 +164,20 @@ func (m *NvidiaCollector) Read(interval time.Duration, output chan lp.CCMetric)
|
|||||||
}
|
}
|
||||||
|
|
||||||
if !device.excludeMetrics["nv_mem_total"] || !device.excludeMetrics["nv_fb_memory"] {
|
if !device.excludeMetrics["nv_mem_total"] || !device.excludeMetrics["nv_fb_memory"] {
|
||||||
|
// Retrieves the amount of used, free and total memory available on the device, in bytes.
|
||||||
|
//
|
||||||
|
// Enabling ECC reduces the amount of total available memory, due to the extra required parity bits.
|
||||||
|
//
|
||||||
|
// The reported amount of used memory is equal to the sum of memory allocated by all active channels on the device.
|
||||||
|
//
|
||||||
|
// Available memory info:
|
||||||
|
// * Free: Unallocated FB memory (in bytes).
|
||||||
|
// * Total: Total installed FB memory (in bytes).
|
||||||
|
// * Used: Allocated FB memory (in bytes). Note that the driver/GPU always sets aside a small amount of memory for bookkeeping.
|
||||||
|
//
|
||||||
|
// Note:
|
||||||
|
// In MIG mode, if device handle is provided, the API returns aggregate information, only if the caller has appropriate privileges.
|
||||||
|
// Per-instance information can be queried by using specific MIG device handles.
|
||||||
meminfo, ret := nvml.DeviceGetMemoryInfo(device.device)
|
meminfo, ret := nvml.DeviceGetMemoryInfo(device.device)
|
||||||
if ret == nvml.SUCCESS {
|
if ret == nvml.SUCCESS {
|
||||||
if !device.excludeMetrics["nv_mem_total"] {
|
if !device.excludeMetrics["nv_mem_total"] {
|
||||||
@ -175,6 +201,11 @@ func (m *NvidiaCollector) Read(interval time.Duration, output chan lp.CCMetric)
|
|||||||
}
|
}
|
||||||
|
|
||||||
if !device.excludeMetrics["nv_temp"] {
|
if !device.excludeMetrics["nv_temp"] {
|
||||||
|
// Retrieves the current temperature readings for the device, in degrees C.
|
||||||
|
//
|
||||||
|
// Available temperature sensors:
|
||||||
|
// * TEMPERATURE_GPU: Temperature sensor for the GPU die.
|
||||||
|
// * NVML_TEMPERATURE_COUNT
|
||||||
temp, ret := nvml.DeviceGetTemperature(device.device, nvml.TEMPERATURE_GPU)
|
temp, ret := nvml.DeviceGetTemperature(device.device, nvml.TEMPERATURE_GPU)
|
||||||
if ret == nvml.SUCCESS {
|
if ret == nvml.SUCCESS {
|
||||||
y, err := lp.New("nv_temp", device.tags, m.meta, map[string]interface{}{"value": float64(temp)}, time.Now())
|
y, err := lp.New("nv_temp", device.tags, m.meta, map[string]interface{}{"value": float64(temp)}, time.Now())
|
||||||
@ -186,33 +217,50 @@ func (m *NvidiaCollector) Read(interval time.Duration, output chan lp.CCMetric)
|
|||||||
}
|
}
|
||||||
|
|
||||||
if !device.excludeMetrics["nv_fan"] {
|
if !device.excludeMetrics["nv_fan"] {
|
||||||
|
// Retrieves the intended operating speed of the device's fan.
|
||||||
|
//
|
||||||
|
// Note: The reported speed is the intended fan speed.
|
||||||
|
// If the fan is physically blocked and unable to spin, the output will not match the actual fan speed.
|
||||||
|
//
|
||||||
|
// For all discrete products with dedicated fans.
|
||||||
|
//
|
||||||
|
// The fan speed is expressed as a percentage of the product's maximum noise tolerance fan speed.
|
||||||
|
// This value may exceed 100% in certain cases.
|
||||||
fan, ret := nvml.DeviceGetFanSpeed(device.device)
|
fan, ret := nvml.DeviceGetFanSpeed(device.device)
|
||||||
if ret == nvml.SUCCESS {
|
if ret == nvml.SUCCESS {
|
||||||
y, err := lp.New("nv_fan", device.tags, m.meta, map[string]interface{}{"value": float64(fan)}, time.Now())
|
y, err := lp.New("nv_fan", device.tags, m.meta, map[string]interface{}{"value": float64(fan)}, time.Now())
|
||||||
if err == nil {
|
if err == nil {
|
||||||
|
y.AddMeta("unit", "%")
|
||||||
output <- y
|
output <- y
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
if !device.excludeMetrics["nv_ecc_mode"] {
|
if !device.excludeMetrics["nv_ecc_mode"] {
|
||||||
|
// Retrieves the current and pending ECC modes for the device.
|
||||||
|
//
|
||||||
|
// For Fermi or newer fully supported devices. Only applicable to devices with ECC.
|
||||||
|
// Requires NVML_INFOROM_ECC version 1.0 or higher.
|
||||||
|
//
|
||||||
|
// Changing ECC modes requires a reboot.
|
||||||
|
// The "pending" ECC mode refers to the target mode following the next reboot.
|
||||||
_, ecc_pend, ret := nvml.DeviceGetEccMode(device.device)
|
_, ecc_pend, ret := nvml.DeviceGetEccMode(device.device)
|
||||||
if ret == nvml.SUCCESS {
|
if ret == nvml.SUCCESS {
|
||||||
var y lp.CCMetric
|
var y lp.CCMetric
|
||||||
var err error
|
var err error
|
||||||
switch ecc_pend {
|
switch ecc_pend {
|
||||||
case nvml.FEATURE_DISABLED:
|
case nvml.FEATURE_DISABLED:
|
||||||
y, err = lp.New("nv_ecc_mode", device.tags, m.meta, map[string]interface{}{"value": string("OFF")}, time.Now())
|
y, err = lp.New("nv_ecc_mode", device.tags, m.meta, map[string]interface{}{"value": "OFF"}, time.Now())
|
||||||
case nvml.FEATURE_ENABLED:
|
case nvml.FEATURE_ENABLED:
|
||||||
y, err = lp.New("nv_ecc_mode", device.tags, m.meta, map[string]interface{}{"value": string("ON")}, time.Now())
|
y, err = lp.New("nv_ecc_mode", device.tags, m.meta, map[string]interface{}{"value": "ON"}, time.Now())
|
||||||
default:
|
default:
|
||||||
y, err = lp.New("nv_ecc_mode", device.tags, m.meta, map[string]interface{}{"value": string("UNKNOWN")}, time.Now())
|
y, err = lp.New("nv_ecc_mode", device.tags, m.meta, map[string]interface{}{"value": "UNKNOWN"}, time.Now())
|
||||||
}
|
}
|
||||||
if err == nil {
|
if err == nil {
|
||||||
output <- y
|
output <- y
|
||||||
}
|
}
|
||||||
} else if ret == nvml.ERROR_NOT_SUPPORTED {
|
} else if ret == nvml.ERROR_NOT_SUPPORTED {
|
||||||
y, err := lp.New("nv_ecc_mode", device.tags, m.meta, map[string]interface{}{"value": string("N/A")}, time.Now())
|
y, err := lp.New("nv_ecc_mode", device.tags, m.meta, map[string]interface{}{"value": "N/A"}, time.Now())
|
||||||
if err == nil {
|
if err == nil {
|
||||||
output <- y
|
output <- y
|
||||||
}
|
}
|
||||||
@ -220,9 +268,16 @@ func (m *NvidiaCollector) Read(interval time.Duration, output chan lp.CCMetric)
|
|||||||
}
|
}
|
||||||
|
|
||||||
if !device.excludeMetrics["nv_perf_state"] {
|
if !device.excludeMetrics["nv_perf_state"] {
|
||||||
pstate, ret := nvml.DeviceGetPerformanceState(device.device)
|
// Retrieves the current performance state for the device.
|
||||||
|
//
|
||||||
|
// Allowed PStates:
|
||||||
|
// 0: Maximum Performance.
|
||||||
|
// ..
|
||||||
|
// 15: Minimum Performance.
|
||||||
|
// 32: Unknown performance state.
|
||||||
|
pState, ret := nvml.DeviceGetPerformanceState(device.device)
|
||||||
if ret == nvml.SUCCESS {
|
if ret == nvml.SUCCESS {
|
||||||
y, err := lp.New("nv_perf_state", device.tags, m.meta, map[string]interface{}{"value": fmt.Sprintf("P%d", int(pstate))}, time.Now())
|
y, err := lp.New("nv_perf_state", device.tags, m.meta, map[string]interface{}{"value": fmt.Sprintf("P%d", int(pState))}, time.Now())
|
||||||
if err == nil {
|
if err == nil {
|
||||||
output <- y
|
output <- y
|
||||||
}
|
}
|
||||||
@ -230,77 +285,115 @@ func (m *NvidiaCollector) Read(interval time.Duration, output chan lp.CCMetric)
|
|||||||
}
|
}
|
||||||
|
|
||||||
if !device.excludeMetrics["nv_power_usage_report"] {
|
if !device.excludeMetrics["nv_power_usage_report"] {
|
||||||
|
// Retrieves the power usage, in milliwatts, of this GPU and its associated circuitry (e.g. memory).
|
||||||
|
//
|
||||||
|
// On Fermi and Kepler GPUs the reading is accurate to within +/- 5% of current power draw.
|
||||||
|
//
|
||||||
|
// It is only available if power management mode is supported
|
||||||
power, ret := nvml.DeviceGetPowerUsage(device.device)
|
power, ret := nvml.DeviceGetPowerUsage(device.device)
|
||||||
if ret == nvml.SUCCESS {
|
if ret == nvml.SUCCESS {
|
||||||
y, err := lp.New("nv_power_usage_report", device.tags, m.meta, map[string]interface{}{"value": float64(power) / 1000}, time.Now())
|
y, err := lp.New("nv_power_usage_report", device.tags, m.meta, map[string]interface{}{"value": float64(power) / 1000}, time.Now())
|
||||||
if err == nil {
|
if err == nil {
|
||||||
|
y.AddMeta("unit", "watts")
|
||||||
output <- y
|
output <- y
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Retrieves the current clock speeds for the device.
|
||||||
|
//
|
||||||
|
// Available clock information:
|
||||||
|
// * CLOCK_GRAPHICS: Graphics clock domain.
|
||||||
|
// * CLOCK_SM: Streaming Multiprocessor clock domain.
|
||||||
|
// * CLOCK_MEM: Memory clock domain.
|
||||||
if !device.excludeMetrics["nv_graphics_clock_report"] {
|
if !device.excludeMetrics["nv_graphics_clock_report"] {
|
||||||
gclk, ret := nvml.DeviceGetClockInfo(device.device, nvml.CLOCK_GRAPHICS)
|
graphicsClock, ret := nvml.DeviceGetClockInfo(device.device, nvml.CLOCK_GRAPHICS)
|
||||||
if ret == nvml.SUCCESS {
|
if ret == nvml.SUCCESS {
|
||||||
y, err := lp.New("nv_graphics_clock_report", device.tags, m.meta, map[string]interface{}{"value": float64(gclk)}, time.Now())
|
y, err := lp.New("nv_graphics_clock_report", device.tags, m.meta, map[string]interface{}{"value": float64(graphicsClock)}, time.Now())
|
||||||
if err == nil {
|
if err == nil {
|
||||||
|
y.AddMeta("unit", "MHz")
|
||||||
output <- y
|
output <- y
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
if !device.excludeMetrics["nv_sm_clock_report"] {
|
if !device.excludeMetrics["nv_sm_clock_report"] {
|
||||||
smclk, ret := nvml.DeviceGetClockInfo(device.device, nvml.CLOCK_SM)
|
smCock, ret := nvml.DeviceGetClockInfo(device.device, nvml.CLOCK_SM)
|
||||||
if ret == nvml.SUCCESS {
|
if ret == nvml.SUCCESS {
|
||||||
y, err := lp.New("nv_sm_clock_report", device.tags, m.meta, map[string]interface{}{"value": float64(smclk)}, time.Now())
|
y, err := lp.New("nv_sm_clock_report", device.tags, m.meta, map[string]interface{}{"value": float64(smCock)}, time.Now())
|
||||||
if err == nil {
|
if err == nil {
|
||||||
|
y.AddMeta("unit", "MHz")
|
||||||
output <- y
|
output <- y
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
if !device.excludeMetrics["nv_mem_clock_report"] {
|
if !device.excludeMetrics["nv_mem_clock_report"] {
|
||||||
memclk, ret := nvml.DeviceGetClockInfo(device.device, nvml.CLOCK_MEM)
|
memClock, ret := nvml.DeviceGetClockInfo(device.device, nvml.CLOCK_MEM)
|
||||||
if ret == nvml.SUCCESS {
|
if ret == nvml.SUCCESS {
|
||||||
y, err := lp.New("nv_mem_clock_report", device.tags, m.meta, map[string]interface{}{"value": float64(memclk)}, time.Now())
|
y, err := lp.New("nv_mem_clock_report", device.tags, m.meta, map[string]interface{}{"value": float64(memClock)}, time.Now())
|
||||||
if err == nil {
|
if err == nil {
|
||||||
|
y.AddMeta("unit", "MHz")
|
||||||
output <- y
|
output <- y
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Retrieves the maximum clock speeds for the device.
|
||||||
|
//
|
||||||
|
// Available clock information:
|
||||||
|
// * CLOCK_GRAPHICS: Graphics clock domain.
|
||||||
|
// * CLOCK_SM: Streaming multiprocessor clock domain.
|
||||||
|
// * CLOCK_MEM: Memory clock domain.
|
||||||
|
// * CLOCK_VIDEO: Video encoder/decoder clock domain.
|
||||||
|
// * CLOCK_COUNT: Count of clock types.
|
||||||
|
//
|
||||||
|
// Note:
|
||||||
|
// On GPUs from the Fermi family, current P0 clocks (reported by nvmlDeviceGetClockInfo) can differ from max clocks by a few MHz.
|
||||||
if !device.excludeMetrics["nv_max_graphics_clock"] {
|
if !device.excludeMetrics["nv_max_graphics_clock"] {
|
||||||
max_gclk, ret := nvml.DeviceGetMaxClockInfo(device.device, nvml.CLOCK_GRAPHICS)
|
max_gclk, ret := nvml.DeviceGetMaxClockInfo(device.device, nvml.CLOCK_GRAPHICS)
|
||||||
if ret == nvml.SUCCESS {
|
if ret == nvml.SUCCESS {
|
||||||
y, err := lp.New("nv_max_graphics_clock", device.tags, m.meta, map[string]interface{}{"value": float64(max_gclk)}, time.Now())
|
y, err := lp.New("nv_max_graphics_clock", device.tags, m.meta, map[string]interface{}{"value": float64(max_gclk)}, time.Now())
|
||||||
if err == nil {
|
if err == nil {
|
||||||
|
y.AddMeta("unit", "MHz")
|
||||||
output <- y
|
output <- y
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
if !device.excludeMetrics["nv_max_sm_clock"] {
|
if !device.excludeMetrics["nv_max_sm_clock"] {
|
||||||
max_smclk, ret := nvml.DeviceGetClockInfo(device.device, nvml.CLOCK_SM)
|
maxSmClock, ret := nvml.DeviceGetClockInfo(device.device, nvml.CLOCK_SM)
|
||||||
if ret == nvml.SUCCESS {
|
if ret == nvml.SUCCESS {
|
||||||
y, err := lp.New("nv_max_sm_clock", device.tags, m.meta, map[string]interface{}{"value": float64(max_smclk)}, time.Now())
|
y, err := lp.New("nv_max_sm_clock", device.tags, m.meta, map[string]interface{}{"value": float64(maxSmClock)}, time.Now())
|
||||||
if err == nil {
|
if err == nil {
|
||||||
|
y.AddMeta("unit", "MHz")
|
||||||
output <- y
|
output <- y
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
if !device.excludeMetrics["nv_max_mem_clock"] {
|
if !device.excludeMetrics["nv_max_mem_clock"] {
|
||||||
max_memclk, ret := nvml.DeviceGetClockInfo(device.device, nvml.CLOCK_MEM)
|
maxMemClock, ret := nvml.DeviceGetClockInfo(device.device, nvml.CLOCK_MEM)
|
||||||
if ret == nvml.SUCCESS {
|
if ret == nvml.SUCCESS {
|
||||||
y, err := lp.New("nv_max_mem_clock", device.tags, m.meta, map[string]interface{}{"value": float64(max_memclk)}, time.Now())
|
y, err := lp.New("nv_max_mem_clock", device.tags, m.meta, map[string]interface{}{"value": float64(maxMemClock)}, time.Now())
|
||||||
if err == nil {
|
if err == nil {
|
||||||
|
y.AddMeta("unit", "MHz")
|
||||||
output <- y
|
output <- y
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
if !device.excludeMetrics["nv_ecc_db_error"] {
|
if !device.excludeMetrics["nv_ecc_db_error"] {
|
||||||
ecc_db, ret := nvml.DeviceGetTotalEccErrors(device.device, 1, 1)
|
// Retrieves the total ECC error counts for the device.
|
||||||
|
//
|
||||||
|
// For Fermi or newer fully supported devices.
|
||||||
|
// Only applicable to devices with ECC.
|
||||||
|
// Requires NVML_INFOROM_ECC version 1.0 or higher.
|
||||||
|
// Requires ECC Mode to be enabled.
|
||||||
|
//
|
||||||
|
// The total error count is the sum of errors across each of the separate memory systems,
|
||||||
|
// i.e. the total set of errors across the entire device.
|
||||||
|
ecc_db, ret := nvml.DeviceGetTotalEccErrors(device.device, nvml.MEMORY_ERROR_TYPE_UNCORRECTED, nvml.AGGREGATE_ECC)
|
||||||
if ret == nvml.SUCCESS {
|
if ret == nvml.SUCCESS {
|
||||||
y, err := lp.New("nv_ecc_db_error", device.tags, m.meta, map[string]interface{}{"value": float64(ecc_db)}, time.Now())
|
y, err := lp.New("nv_ecc_db_error", device.tags, m.meta, map[string]interface{}{"value": float64(ecc_db)}, time.Now())
|
||||||
if err == nil {
|
if err == nil {
|
||||||
@ -310,7 +403,7 @@ func (m *NvidiaCollector) Read(interval time.Duration, output chan lp.CCMetric)
|
|||||||
}
|
}
|
||||||
|
|
||||||
if !device.excludeMetrics["nv_ecc_sb_error"] {
|
if !device.excludeMetrics["nv_ecc_sb_error"] {
|
||||||
ecc_sb, ret := nvml.DeviceGetTotalEccErrors(device.device, 0, 1)
|
ecc_sb, ret := nvml.DeviceGetTotalEccErrors(device.device, nvml.MEMORY_ERROR_TYPE_CORRECTED, nvml.AGGREGATE_ECC)
|
||||||
if ret == nvml.SUCCESS {
|
if ret == nvml.SUCCESS {
|
||||||
y, err := lp.New("nv_ecc_sb_error", device.tags, m.meta, map[string]interface{}{"value": float64(ecc_sb)}, time.Now())
|
y, err := lp.New("nv_ecc_sb_error", device.tags, m.meta, map[string]interface{}{"value": float64(ecc_sb)}, time.Now())
|
||||||
if err == nil {
|
if err == nil {
|
||||||
@ -320,30 +413,49 @@ func (m *NvidiaCollector) Read(interval time.Duration, output chan lp.CCMetric)
|
|||||||
}
|
}
|
||||||
|
|
||||||
if !device.excludeMetrics["nv_power_man_limit"] {
|
if !device.excludeMetrics["nv_power_man_limit"] {
|
||||||
|
// Retrieves the power management limit associated with this device.
|
||||||
|
//
|
||||||
|
// For Fermi or newer fully supported devices.
|
||||||
|
//
|
||||||
|
// The power limit defines the upper boundary for the card's power draw.
|
||||||
|
// If the card's total power draw reaches this limit the power management algorithm kicks in.
|
||||||
pwr_limit, ret := nvml.DeviceGetPowerManagementLimit(device.device)
|
pwr_limit, ret := nvml.DeviceGetPowerManagementLimit(device.device)
|
||||||
if ret == nvml.SUCCESS {
|
if ret == nvml.SUCCESS {
|
||||||
y, err := lp.New("nv_power_man_limit", device.tags, m.meta, map[string]interface{}{"value": float64(pwr_limit)}, time.Now())
|
y, err := lp.New("nv_power_man_limit", device.tags, m.meta, map[string]interface{}{"value": float64(pwr_limit) / 1000}, time.Now())
|
||||||
if err == nil {
|
if err == nil {
|
||||||
|
y.AddMeta("unit", "watts")
|
||||||
output <- y
|
output <- y
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
if !device.excludeMetrics["nv_encoder_util"] {
|
if !device.excludeMetrics["nv_encoder_util"] {
|
||||||
|
// Retrieves the current utilization of the Encoder and its sampling size (in microseconds).
|
||||||
|
//
|
||||||
|
// For Kepler or newer fully supported devices.
|
||||||
|
//
|
||||||
|
// Note: On MIG-enabled GPUs, querying encoder utilization is not currently supported.
|
||||||
enc_util, _, ret := nvml.DeviceGetEncoderUtilization(device.device)
|
enc_util, _, ret := nvml.DeviceGetEncoderUtilization(device.device)
|
||||||
if ret == nvml.SUCCESS {
|
if ret == nvml.SUCCESS {
|
||||||
y, err := lp.New("nv_encoder_util", device.tags, m.meta, map[string]interface{}{"value": float64(enc_util)}, time.Now())
|
y, err := lp.New("nv_encoder_util", device.tags, m.meta, map[string]interface{}{"value": float64(enc_util)}, time.Now())
|
||||||
if err == nil {
|
if err == nil {
|
||||||
|
y.AddMeta("unit", "%")
|
||||||
output <- y
|
output <- y
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
if !device.excludeMetrics["nv_decoder_util"] {
|
if !device.excludeMetrics["nv_decoder_util"] {
|
||||||
|
// Retrieves the current utilization of the Decoder and its sampling size (in microseconds).
|
||||||
|
//
|
||||||
|
// For Kepler or newer fully supported devices.
|
||||||
|
//
|
||||||
|
// Note: On MIG-enabled GPUs, querying decoder utilization is not currently supported.
|
||||||
dec_util, _, ret := nvml.DeviceGetDecoderUtilization(device.device)
|
dec_util, _, ret := nvml.DeviceGetDecoderUtilization(device.device)
|
||||||
if ret == nvml.SUCCESS {
|
if ret == nvml.SUCCESS {
|
||||||
y, err := lp.New("nv_decoder_util", device.tags, m.meta, map[string]interface{}{"value": float64(dec_util)}, time.Now())
|
y, err := lp.New("nv_decoder_util", device.tags, m.meta, map[string]interface{}{"value": float64(dec_util)}, time.Now())
|
||||||
if err == nil {
|
if err == nil {
|
||||||
|
y.AddMeta("unit", "%")
|
||||||
output <- y
|
output <- y
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
Loading…
Reference in New Issue
Block a user