2021-05-14 19:22:59 +02:00
|
|
|
package collectors
|
|
|
|
|
|
|
|
import (
|
2021-11-25 15:11:39 +01:00
|
|
|
"encoding/json"
|
2021-05-14 19:22:59 +02:00
|
|
|
"errors"
|
|
|
|
"fmt"
|
|
|
|
"log"
|
|
|
|
"time"
|
2022-01-25 15:37:43 +01:00
|
|
|
lp "github.com/ClusterCockpit/cc-metric-collector/internal/ccMetric"
|
2022-01-21 14:35:52 +01:00
|
|
|
"github.com/NVIDIA/go-nvml/pkg/nvml"
|
2021-05-14 19:22:59 +02:00
|
|
|
)
|
|
|
|
|
2021-11-25 14:04:03 +01:00
|
|
|
// NvidiaCollectorConfig holds the optional JSON configuration of the
// NvidiaCollector. Both lists are matched by exact string comparison.
type NvidiaCollectorConfig struct {
	// ExcludeMetrics lists metric names (e.g. "nv_util") that must not be
	// sent to the output channel.
	ExcludeMetrics []string `json:"exclude_metrics,omitempty"`
	// ExcludeDevices lists GPU indices, written as decimal strings
	// (e.g. "0"), that are skipped entirely when reading.
	ExcludeDevices []string `json:"exclude_devices,omitempty"`
}
|
|
|
|
|
2021-05-14 19:22:59 +02:00
|
|
|
type NvidiaCollector struct {
|
2022-01-25 15:37:43 +01:00
|
|
|
metricCollector
|
2021-05-14 19:22:59 +02:00
|
|
|
num_gpus int
|
2021-11-25 15:11:39 +01:00
|
|
|
config NvidiaCollectorConfig
|
2021-05-14 19:22:59 +02:00
|
|
|
}
|
|
|
|
|
2021-11-26 19:01:47 +01:00
|
|
|
func (m *NvidiaCollector) CatchPanic() {
|
2021-10-04 15:23:43 +02:00
|
|
|
if rerr := recover(); rerr != nil {
|
2021-11-26 19:01:47 +01:00
|
|
|
log.Print(rerr)
|
|
|
|
m.init = false
|
2021-10-04 15:23:43 +02:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2022-01-25 15:37:43 +01:00
|
|
|
func (m *NvidiaCollector) Init(config json.RawMessage) error {
|
2021-11-25 15:11:39 +01:00
|
|
|
var err error
|
2021-05-14 19:22:59 +02:00
|
|
|
m.name = "NvidiaCollector"
|
|
|
|
m.setup()
|
2022-01-25 15:37:43 +01:00
|
|
|
m.meta = map[string]string{"source": m.name, "group": "Nvidia"}
|
2021-11-25 14:04:03 +01:00
|
|
|
if len(config) > 0 {
|
2021-11-25 15:11:39 +01:00
|
|
|
err = json.Unmarshal(config, &m.config)
|
|
|
|
if err != nil {
|
|
|
|
return err
|
|
|
|
}
|
2021-11-25 14:04:03 +01:00
|
|
|
}
|
2021-05-14 19:22:59 +02:00
|
|
|
m.num_gpus = 0
|
2021-10-04 15:23:43 +02:00
|
|
|
defer m.CatchPanic()
|
2021-05-14 19:22:59 +02:00
|
|
|
ret := nvml.Init()
|
|
|
|
if ret != nvml.SUCCESS {
|
2021-11-25 14:04:03 +01:00
|
|
|
err = errors.New(nvml.ErrorString(ret))
|
2021-05-14 19:22:59 +02:00
|
|
|
return err
|
|
|
|
}
|
|
|
|
m.num_gpus, ret = nvml.DeviceGetCount()
|
|
|
|
if ret != nvml.SUCCESS {
|
2021-11-25 14:04:03 +01:00
|
|
|
err = errors.New(nvml.ErrorString(ret))
|
2021-05-14 19:22:59 +02:00
|
|
|
return err
|
|
|
|
}
|
2021-10-04 15:23:43 +02:00
|
|
|
m.init = true
|
2021-05-14 19:22:59 +02:00
|
|
|
return nil
|
|
|
|
}
|
|
|
|
|
2022-01-25 15:37:43 +01:00
|
|
|
func (m *NvidiaCollector) Read(interval time.Duration, output chan lp.CCMetric) {
|
2021-11-25 15:11:39 +01:00
|
|
|
if !m.init {
|
|
|
|
return
|
|
|
|
}
|
2021-05-14 19:22:59 +02:00
|
|
|
for i := 0; i < m.num_gpus; i++ {
|
|
|
|
device, ret := nvml.DeviceGetHandleByIndex(i)
|
|
|
|
if ret != nvml.SUCCESS {
|
|
|
|
log.Fatalf("Unable to get device at index %d: %v", i, nvml.ErrorString(ret))
|
|
|
|
return
|
|
|
|
}
|
2021-11-25 14:04:03 +01:00
|
|
|
_, skip := stringArrayContains(m.config.ExcludeDevices, fmt.Sprintf("%d", i))
|
|
|
|
if skip {
|
2021-11-25 15:11:39 +01:00
|
|
|
continue
|
2021-11-25 14:04:03 +01:00
|
|
|
}
|
2021-10-04 15:23:43 +02:00
|
|
|
tags := map[string]string{"type": "accelerator", "type-id": fmt.Sprintf("%d", i)}
|
2021-05-14 19:22:59 +02:00
|
|
|
|
|
|
|
util, ret := nvml.DeviceGetUtilizationRates(device)
|
|
|
|
if ret == nvml.SUCCESS {
|
2022-01-26 18:45:23 +01:00
|
|
|
_, skip = stringArrayContains(m.config.ExcludeMetrics, "nv_util")
|
|
|
|
y, err := lp.New("nv_util", tags, m.meta, map[string]interface{}{"value": float64(util.Gpu)}, time.Now())
|
2021-11-25 14:04:03 +01:00
|
|
|
if err == nil && !skip {
|
2022-01-25 15:37:43 +01:00
|
|
|
output <- y
|
2021-10-04 15:23:43 +02:00
|
|
|
}
|
2022-01-26 18:45:23 +01:00
|
|
|
_, skip = stringArrayContains(m.config.ExcludeMetrics, "nv_mem_util")
|
|
|
|
y, err = lp.New("nv_mem_util", tags, m.meta, map[string]interface{}{"value": float64(util.Memory)}, time.Now())
|
2021-11-25 14:04:03 +01:00
|
|
|
if err == nil && !skip {
|
2022-01-25 15:37:43 +01:00
|
|
|
output <- y
|
2021-10-04 15:23:43 +02:00
|
|
|
}
|
2021-05-14 19:22:59 +02:00
|
|
|
}
|
|
|
|
|
|
|
|
meminfo, ret := nvml.DeviceGetMemoryInfo(device)
|
|
|
|
if ret == nvml.SUCCESS {
|
2021-10-04 15:23:43 +02:00
|
|
|
t := float64(meminfo.Total) / (1024 * 1024)
|
2022-01-26 18:45:23 +01:00
|
|
|
_, skip = stringArrayContains(m.config.ExcludeMetrics, "nv_mem_total")
|
|
|
|
y, err := lp.New("nv_mem_total", tags, m.meta, map[string]interface{}{"value": t}, time.Now())
|
2021-11-25 14:04:03 +01:00
|
|
|
if err == nil && !skip {
|
2022-01-25 15:37:43 +01:00
|
|
|
y.AddMeta("unit", "MByte")
|
|
|
|
output <- y
|
2021-10-04 15:23:43 +02:00
|
|
|
}
|
|
|
|
f := float64(meminfo.Used) / (1024 * 1024)
|
2022-01-26 18:45:23 +01:00
|
|
|
_, skip = stringArrayContains(m.config.ExcludeMetrics, "nv_fb_memory")
|
|
|
|
y, err = lp.New("nv_fb_memory", tags, m.meta, map[string]interface{}{"value": f}, time.Now())
|
2021-11-25 14:04:03 +01:00
|
|
|
if err == nil && !skip {
|
2022-01-25 15:37:43 +01:00
|
|
|
y.AddMeta("unit", "MByte")
|
|
|
|
output <- y
|
2021-10-04 15:23:43 +02:00
|
|
|
}
|
2021-05-14 19:22:59 +02:00
|
|
|
}
|
|
|
|
|
|
|
|
temp, ret := nvml.DeviceGetTemperature(device, nvml.TEMPERATURE_GPU)
|
|
|
|
if ret == nvml.SUCCESS {
|
2022-01-26 18:45:23 +01:00
|
|
|
_, skip = stringArrayContains(m.config.ExcludeMetrics, "nv_temp")
|
|
|
|
y, err := lp.New("nv_temp", tags, m.meta, map[string]interface{}{"value": float64(temp)}, time.Now())
|
2021-11-25 14:04:03 +01:00
|
|
|
if err == nil && !skip {
|
2022-01-25 15:37:43 +01:00
|
|
|
y.AddMeta("unit", "degC")
|
|
|
|
output <- y
|
2021-10-04 15:23:43 +02:00
|
|
|
}
|
2021-05-14 19:22:59 +02:00
|
|
|
}
|
|
|
|
|
|
|
|
fan, ret := nvml.DeviceGetFanSpeed(device)
|
|
|
|
if ret == nvml.SUCCESS {
|
2022-01-26 18:45:23 +01:00
|
|
|
_, skip = stringArrayContains(m.config.ExcludeMetrics, "nv_fan")
|
|
|
|
y, err := lp.New("nv_fan", tags, m.meta, map[string]interface{}{"value": float64(fan)}, time.Now())
|
2021-11-25 14:04:03 +01:00
|
|
|
if err == nil && !skip {
|
2022-01-25 15:37:43 +01:00
|
|
|
output <- y
|
2021-10-04 15:23:43 +02:00
|
|
|
}
|
2021-05-14 19:22:59 +02:00
|
|
|
}
|
|
|
|
|
|
|
|
_, ecc_pend, ret := nvml.DeviceGetEccMode(device)
|
|
|
|
if ret == nvml.SUCCESS {
|
2022-01-25 15:37:43 +01:00
|
|
|
var y lp.CCMetric
|
2021-10-04 15:23:43 +02:00
|
|
|
var err error
|
2021-05-14 19:22:59 +02:00
|
|
|
switch ecc_pend {
|
|
|
|
case nvml.FEATURE_DISABLED:
|
2022-01-26 18:45:23 +01:00
|
|
|
y, err = lp.New("nv_ecc_mode", tags, m.meta, map[string]interface{}{"value": string("OFF")}, time.Now())
|
2021-05-14 19:22:59 +02:00
|
|
|
case nvml.FEATURE_ENABLED:
|
2022-01-26 18:45:23 +01:00
|
|
|
y, err = lp.New("nv_ecc_mode", tags, m.meta, map[string]interface{}{"value": string("ON")}, time.Now())
|
2021-05-14 19:22:59 +02:00
|
|
|
default:
|
2022-01-26 18:45:23 +01:00
|
|
|
y, err = lp.New("nv_ecc_mode", tags, m.meta, map[string]interface{}{"value": string("UNKNOWN")}, time.Now())
|
2021-10-04 15:23:43 +02:00
|
|
|
}
|
2022-01-26 18:45:23 +01:00
|
|
|
_, skip = stringArrayContains(m.config.ExcludeMetrics, "nv_ecc_mode")
|
2021-11-25 14:04:03 +01:00
|
|
|
if err == nil && !skip {
|
2022-01-25 15:37:43 +01:00
|
|
|
output <- y
|
2021-05-14 19:22:59 +02:00
|
|
|
}
|
|
|
|
} else if ret == nvml.ERROR_NOT_SUPPORTED {
|
2022-01-26 18:45:23 +01:00
|
|
|
_, skip = stringArrayContains(m.config.ExcludeMetrics, "nv_ecc_mode")
|
|
|
|
y, err := lp.New("nv_ecc_mode", tags, m.meta, map[string]interface{}{"value": string("N/A")}, time.Now())
|
2021-11-25 14:04:03 +01:00
|
|
|
if err == nil && !skip {
|
2022-01-25 15:37:43 +01:00
|
|
|
output <- y
|
2021-10-04 15:23:43 +02:00
|
|
|
}
|
2021-05-14 19:22:59 +02:00
|
|
|
}
|
|
|
|
|
|
|
|
pstate, ret := nvml.DeviceGetPerformanceState(device)
|
|
|
|
if ret == nvml.SUCCESS {
|
2022-01-26 18:45:23 +01:00
|
|
|
_, skip = stringArrayContains(m.config.ExcludeMetrics, "nv_perf_state")
|
|
|
|
y, err := lp.New("nv_perf_state", tags, m.meta, map[string]interface{}{"value": fmt.Sprintf("P%d", int(pstate))}, time.Now())
|
2021-11-25 14:04:03 +01:00
|
|
|
if err == nil && !skip {
|
2022-01-25 15:37:43 +01:00
|
|
|
output <- y
|
2021-10-04 15:23:43 +02:00
|
|
|
}
|
2021-05-14 19:22:59 +02:00
|
|
|
}
|
|
|
|
|
|
|
|
power, ret := nvml.DeviceGetPowerUsage(device)
|
|
|
|
if ret == nvml.SUCCESS {
|
2022-01-26 18:45:23 +01:00
|
|
|
_, skip = stringArrayContains(m.config.ExcludeMetrics, "nv_power_usage_report")
|
|
|
|
y, err := lp.New("nv_power_usage_report", tags, m.meta, map[string]interface{}{"value": float64(power) / 1000}, time.Now())
|
2021-11-25 14:04:03 +01:00
|
|
|
if err == nil && !skip {
|
2022-01-25 15:37:43 +01:00
|
|
|
output <- y
|
2021-10-04 15:23:43 +02:00
|
|
|
}
|
2021-05-14 19:22:59 +02:00
|
|
|
}
|
|
|
|
|
|
|
|
gclk, ret := nvml.DeviceGetClockInfo(device, nvml.CLOCK_GRAPHICS)
|
|
|
|
if ret == nvml.SUCCESS {
|
2022-01-26 18:45:23 +01:00
|
|
|
_, skip = stringArrayContains(m.config.ExcludeMetrics, "nv_graphics_clock_report")
|
|
|
|
y, err := lp.New("nv_graphics_clock_report", tags, m.meta, map[string]interface{}{"value": float64(gclk)}, time.Now())
|
2021-11-25 14:04:03 +01:00
|
|
|
if err == nil && !skip {
|
2022-01-25 15:37:43 +01:00
|
|
|
output <- y
|
2021-10-04 15:23:43 +02:00
|
|
|
}
|
2021-05-14 19:22:59 +02:00
|
|
|
}
|
|
|
|
|
|
|
|
smclk, ret := nvml.DeviceGetClockInfo(device, nvml.CLOCK_SM)
|
|
|
|
if ret == nvml.SUCCESS {
|
2022-01-26 18:45:23 +01:00
|
|
|
_, skip = stringArrayContains(m.config.ExcludeMetrics, "nv_sm_clock_report")
|
|
|
|
y, err := lp.New("nv_sm_clock_report", tags, m.meta, map[string]interface{}{"value": float64(smclk)}, time.Now())
|
2021-11-25 14:04:03 +01:00
|
|
|
if err == nil && !skip {
|
2022-01-25 15:37:43 +01:00
|
|
|
output <- y
|
2021-10-04 15:23:43 +02:00
|
|
|
}
|
2021-05-14 19:22:59 +02:00
|
|
|
}
|
|
|
|
|
|
|
|
memclk, ret := nvml.DeviceGetClockInfo(device, nvml.CLOCK_MEM)
|
|
|
|
if ret == nvml.SUCCESS {
|
2022-01-26 18:45:23 +01:00
|
|
|
_, skip = stringArrayContains(m.config.ExcludeMetrics, "nv_mem_clock_report")
|
|
|
|
y, err := lp.New("nv_mem_clock_report", tags, m.meta, map[string]interface{}{"value": float64(memclk)}, time.Now())
|
2021-11-25 14:04:03 +01:00
|
|
|
if err == nil && !skip {
|
2022-01-25 15:37:43 +01:00
|
|
|
output <- y
|
2021-10-04 15:23:43 +02:00
|
|
|
}
|
2021-05-14 19:22:59 +02:00
|
|
|
}
|
|
|
|
|
|
|
|
max_gclk, ret := nvml.DeviceGetMaxClockInfo(device, nvml.CLOCK_GRAPHICS)
|
|
|
|
if ret == nvml.SUCCESS {
|
2022-01-26 18:45:23 +01:00
|
|
|
_, skip = stringArrayContains(m.config.ExcludeMetrics, "nv_max_graphics_clock")
|
|
|
|
y, err := lp.New("nv_max_graphics_clock", tags, m.meta, map[string]interface{}{"value": float64(max_gclk)}, time.Now())
|
2021-11-25 14:04:03 +01:00
|
|
|
if err == nil && !skip {
|
2022-01-25 15:37:43 +01:00
|
|
|
output <- y
|
2021-10-04 15:23:43 +02:00
|
|
|
}
|
2021-05-14 19:22:59 +02:00
|
|
|
}
|
|
|
|
|
|
|
|
max_smclk, ret := nvml.DeviceGetClockInfo(device, nvml.CLOCK_SM)
|
|
|
|
if ret == nvml.SUCCESS {
|
2022-01-26 18:45:23 +01:00
|
|
|
_, skip = stringArrayContains(m.config.ExcludeMetrics, "nv_max_sm_clock")
|
|
|
|
y, err := lp.New("nv_max_sm_clock", tags, m.meta, map[string]interface{}{"value": float64(max_smclk)}, time.Now())
|
2021-11-25 14:04:03 +01:00
|
|
|
if err == nil && !skip {
|
2022-01-25 15:37:43 +01:00
|
|
|
output <- y
|
2021-10-04 15:23:43 +02:00
|
|
|
}
|
2021-05-14 19:22:59 +02:00
|
|
|
}
|
|
|
|
|
|
|
|
max_memclk, ret := nvml.DeviceGetClockInfo(device, nvml.CLOCK_MEM)
|
|
|
|
if ret == nvml.SUCCESS {
|
2022-01-26 18:45:23 +01:00
|
|
|
_, skip = stringArrayContains(m.config.ExcludeMetrics, "nv_max_mem_clock")
|
|
|
|
y, err := lp.New("nv_max_mem_clock", tags, m.meta, map[string]interface{}{"value": float64(max_memclk)}, time.Now())
|
2021-11-25 14:04:03 +01:00
|
|
|
if err == nil && !skip {
|
2022-01-25 15:37:43 +01:00
|
|
|
output <- y
|
2021-10-04 15:23:43 +02:00
|
|
|
}
|
2021-05-14 19:22:59 +02:00
|
|
|
}
|
|
|
|
|
|
|
|
ecc_db, ret := nvml.DeviceGetTotalEccErrors(device, 1, 1)
|
|
|
|
if ret == nvml.SUCCESS {
|
2022-01-26 18:45:23 +01:00
|
|
|
_, skip = stringArrayContains(m.config.ExcludeMetrics, "nv_ecc_db_error")
|
|
|
|
y, err := lp.New("nv_ecc_db_error", tags, m.meta, map[string]interface{}{"value": float64(ecc_db)}, time.Now())
|
2021-11-25 14:04:03 +01:00
|
|
|
if err == nil && !skip {
|
2022-01-25 15:37:43 +01:00
|
|
|
output <- y
|
2021-10-04 15:23:43 +02:00
|
|
|
}
|
2021-05-14 19:22:59 +02:00
|
|
|
}
|
|
|
|
|
|
|
|
ecc_sb, ret := nvml.DeviceGetTotalEccErrors(device, 0, 1)
|
|
|
|
if ret == nvml.SUCCESS {
|
2022-01-26 18:45:23 +01:00
|
|
|
_, skip = stringArrayContains(m.config.ExcludeMetrics, "nv_ecc_sb_error")
|
|
|
|
y, err := lp.New("nv_ecc_sb_error", tags, m.meta, map[string]interface{}{"value": float64(ecc_sb)}, time.Now())
|
2021-11-25 14:04:03 +01:00
|
|
|
if err == nil && !skip {
|
2022-01-25 15:37:43 +01:00
|
|
|
output <- y
|
2021-10-04 15:23:43 +02:00
|
|
|
}
|
2021-05-14 19:22:59 +02:00
|
|
|
}
|
|
|
|
|
|
|
|
pwr_limit, ret := nvml.DeviceGetPowerManagementLimit(device)
|
|
|
|
if ret == nvml.SUCCESS {
|
2022-01-26 18:45:23 +01:00
|
|
|
_, skip = stringArrayContains(m.config.ExcludeMetrics, "nv_power_man_limit")
|
|
|
|
y, err := lp.New("nv_power_man_limit", tags, m.meta, map[string]interface{}{"value": float64(pwr_limit)}, time.Now())
|
2021-11-25 14:04:03 +01:00
|
|
|
if err == nil && !skip {
|
2022-01-25 15:37:43 +01:00
|
|
|
output <- y
|
2021-10-04 15:23:43 +02:00
|
|
|
}
|
2021-05-14 19:22:59 +02:00
|
|
|
}
|
|
|
|
|
|
|
|
enc_util, _, ret := nvml.DeviceGetEncoderUtilization(device)
|
|
|
|
if ret == nvml.SUCCESS {
|
2022-01-26 18:45:23 +01:00
|
|
|
_, skip = stringArrayContains(m.config.ExcludeMetrics, "nv_encoder_util")
|
|
|
|
y, err := lp.New("nv_encoder_util", tags, m.meta, map[string]interface{}{"value": float64(enc_util)}, time.Now())
|
2021-11-25 14:04:03 +01:00
|
|
|
if err == nil && !skip {
|
2022-01-25 15:37:43 +01:00
|
|
|
output <- y
|
2021-10-04 15:23:43 +02:00
|
|
|
}
|
2021-05-14 19:22:59 +02:00
|
|
|
}
|
|
|
|
|
|
|
|
dec_util, _, ret := nvml.DeviceGetDecoderUtilization(device)
|
|
|
|
if ret == nvml.SUCCESS {
|
2022-01-26 18:45:23 +01:00
|
|
|
_, skip = stringArrayContains(m.config.ExcludeMetrics, "nv_decoder_util")
|
|
|
|
y, err := lp.New("nv_decoder_util", tags, m.meta, map[string]interface{}{"value": float64(dec_util)}, time.Now())
|
2021-11-25 14:04:03 +01:00
|
|
|
if err == nil && !skip {
|
2022-01-25 15:37:43 +01:00
|
|
|
output <- y
|
2021-10-04 15:23:43 +02:00
|
|
|
}
|
2021-05-14 19:22:59 +02:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
func (m *NvidiaCollector) Close() {
|
2021-10-04 15:23:43 +02:00
|
|
|
if m.init {
|
|
|
|
nvml.Shutdown()
|
|
|
|
m.init = false
|
|
|
|
}
|
2021-05-14 19:22:59 +02:00
|
|
|
}
|