Exclude metrics and devices in Init() for NvidiaCollector

This commit is contained in:
Thomas Roehl 2022-02-11 14:20:06 +01:00
parent 184d60cc58
commit b15fdf72b9

View File

@@ -6,6 +6,8 @@ import (
"fmt" "fmt"
"log" "log"
"time" "time"
cclog "github.com/ClusterCockpit/cc-metric-collector/internal/ccLogger"
lp "github.com/ClusterCockpit/cc-metric-collector/internal/ccMetric" lp "github.com/ClusterCockpit/cc-metric-collector/internal/ccMetric"
"github.com/NVIDIA/go-nvml/pkg/nvml" "github.com/NVIDIA/go-nvml/pkg/nvml"
) )
@@ -13,12 +15,20 @@ import (
// NvidiaCollectorConfig holds the options of the collector that are read from
// its JSON configuration.
// NOTE(review): lines whose content appears twice carry both the old and the
// new column of a side-by-side diff rendering; single-column lines exist on
// one side of the change only.
type NvidiaCollectorConfig struct { type NvidiaCollectorConfig struct {
// ExcludeMetrics lists metric names (for example "nv_util"); Init copies each
// name into the per-device excludeMetrics map.
ExcludeMetrics []string `json:"exclude_metrics,omitempty"` ExcludeMetrics []string `json:"exclude_metrics,omitempty"`
// ExcludeDevices lists device indices written as decimal strings; Init
// compares each index formatted with "%d" against this list and skips matches.
ExcludeDevices []string `json:"exclude_devices,omitempty"` ExcludeDevices []string `json:"exclude_devices,omitempty"`
// AddPciInfoTag makes Init add a "pci_identifier" tag to every device when
// true; Init sets it to false before the configuration is handled.
AddPciInfoTag bool `json:"add_pci_info_tag,omitempty"`
}
// NvidiaCollectorDevice bundles the per-GPU state that Init prepares once and
// Read reuses on every call.
type NvidiaCollectorDevice struct {
// device is the handle returned by nvml.DeviceGetHandleByIndex in Init.
device nvml.Device
// excludeMetrics receives one true entry per name in config.ExcludeMetrics.
// NOTE(review): the exclude helper in Read returns true when a name is NOT
// present in this map, which looks inverted relative to the field name —
// confirm the intended polarity.
excludeMetrics map[string]bool
// tags is initialised in Init with "type" and "type-id" (plus
// "pci_identifier" when AddPciInfoTag is set) and passed to lp.New in Read.
tags map[string]string
} }
// NvidiaCollector reports per-GPU metrics obtained through NVML.
// NOTE(review): lines whose content appears twice carry both the old and the
// new column of a side-by-side diff rendering.
type NvidiaCollector struct { type NvidiaCollector struct {
// metricCollector is embedded; it is declared outside this view and
// presumably supplies the name, meta, init and setup members used by Init —
// confirm against its definition.
metricCollector metricCollector
// num_gpus was assigned from nvml.DeviceGetCount in the old column of Init.
// NOTE(review): the new column stores the count in a local variable instead,
// so the visible code no longer assigns this field.
num_gpus int num_gpus int
// config is the collector configuration (see NvidiaCollectorConfig).
config NvidiaCollectorConfig config NvidiaCollectorConfig
// gpus holds the devices prepared by Init and is iterated by Read.
// NOTE(review): Init allocates it with length num_gpus but fills only the
// entries of non-excluded devices, so trailing entries stay zero-valued when
// ExcludeDevices matches anything — confirm whether it should be truncated.
gpus []NvidiaCollectorDevice
} }
func (m *NvidiaCollector) CatchPanic() { func (m *NvidiaCollector) CatchPanic() {
@@ -31,6 +41,7 @@ func (m *NvidiaCollector) CatchPanic() {
func (m *NvidiaCollector) Init(config json.RawMessage) error { func (m *NvidiaCollector) Init(config json.RawMessage) error {
var err error var err error
m.name = "NvidiaCollector" m.name = "NvidiaCollector"
m.config.AddPciInfoTag = false
m.setup() m.setup()
m.meta = map[string]string{"source": m.name, "group": "Nvidia"} m.meta = map[string]string{"source": m.name, "group": "Nvidia"}
if len(config) > 0 { if len(config) > 0 {
@@ -44,13 +55,48 @@ func (m *NvidiaCollector) Init(config json.RawMessage) error {
ret := nvml.Init() ret := nvml.Init()
if ret != nvml.SUCCESS { if ret != nvml.SUCCESS {
err = errors.New(nvml.ErrorString(ret)) err = errors.New(nvml.ErrorString(ret))
cclog.ComponentError(m.name, "Unable to initialize NVML", err.Error())
return err return err
} }
m.num_gpus, ret = nvml.DeviceGetCount() num_gpus, ret := nvml.DeviceGetCount()
if ret != nvml.SUCCESS { if ret != nvml.SUCCESS {
err = errors.New(nvml.ErrorString(ret)) err = errors.New(nvml.ErrorString(ret))
cclog.ComponentError(m.name, "Unable to get device count", err.Error())
return err return err
} }
m.gpus = make([]NvidiaCollectorDevice, num_gpus)
idx := 0
for i := 0; i < num_gpus && idx < num_gpus; i++ {
str_i := fmt.Sprintf("%d", i)
if _, skip := stringArrayContains(m.config.ExcludeDevices, str_i); skip {
continue
}
device, ret := nvml.DeviceGetHandleByIndex(i)
if ret != nvml.SUCCESS {
err = errors.New(nvml.ErrorString(ret))
cclog.ComponentError(m.name, "Unable to get device at index", i, ":", err.Error())
return err
}
g := m.gpus[idx]
g.device = device
g.tags = map[string]string{"type": "accelerator", "type-id": str_i}
g.excludeMetrics = map[string]bool{}
for _, e := range m.config.ExcludeMetrics {
g.excludeMetrics[e] = true
}
if m.config.AddPciInfoTag {
pciinfo, ret := nvml.DeviceGetPciInfo(g.device)
if ret != nvml.SUCCESS {
err = errors.New(nvml.ErrorString(ret))
cclog.ComponentError(m.name, "Unable to get pciInfo for device at index", i, ":", err.Error())
return err
}
g.tags["pci_identifier"] = fmt.Sprintf("%08X:%02X:%02X.0", pciinfo.Domain, pciinfo.Bus, pciinfo.Device)
}
m.gpus[idx] = g
idx++
}
m.init = true m.init = true
return nil return nil
} }
@@ -59,207 +105,233 @@ func (m *NvidiaCollector) Read(interval time.Duration, output chan lp.CCMetric)
if !m.init { if !m.init {
return return
} }
for i := 0; i < m.num_gpus; i++ {
device, ret := nvml.DeviceGetHandleByIndex(i)
if ret != nvml.SUCCESS {
log.Fatalf("Unable to get device at index %d: %v", i, nvml.ErrorString(ret))
return
}
_, skip := stringArrayContains(m.config.ExcludeDevices, fmt.Sprintf("%d", i))
if skip {
continue
}
tags := map[string]string{"type": "accelerator", "type-id": fmt.Sprintf("%d", i)}
util, ret := nvml.DeviceGetUtilizationRates(device) for _, device := range m.gpus {
if ret == nvml.SUCCESS {
_, skip = stringArrayContains(m.config.ExcludeMetrics, "nv_util") exclude := func(metric string) bool {
y, err := lp.New("nv_util", tags, m.meta, map[string]interface{}{"value": float64(util.Gpu)}, time.Now()) if _, ok := device.excludeMetrics[metric]; !ok {
if err == nil && !skip { return true
output <- y
} }
_, skip = stringArrayContains(m.config.ExcludeMetrics, "nv_mem_util") return false
y, err = lp.New("nv_mem_util", tags, m.meta, map[string]interface{}{"value": float64(util.Memory)}, time.Now()) }
if err == nil && !skip {
output <- y ex_nv_util := exclude("nv_util")
ex_nv_mem_util := exclude("nv_mem_util")
if (!ex_nv_util) || (!ex_nv_mem_util) {
util, ret := nvml.DeviceGetUtilizationRates(device.device)
if ret == nvml.SUCCESS {
if !ex_nv_util {
y, err := lp.New("nv_util", device.tags, m.meta, map[string]interface{}{"value": float64(util.Gpu)}, time.Now())
if err == nil {
output <- y
}
}
if !ex_nv_mem_util {
y, err := lp.New("nv_mem_util", device.tags, m.meta, map[string]interface{}{"value": float64(util.Memory)}, time.Now())
if err == nil {
output <- y
}
}
} }
} }
meminfo, ret := nvml.DeviceGetMemoryInfo(device) ex_nv_mem_total := exclude("nv_mem_total")
if ret == nvml.SUCCESS { ex_nv_fb_memory := exclude("nv_fb_memory")
t := float64(meminfo.Total) / (1024 * 1024) if (!ex_nv_mem_total) || (!ex_nv_fb_memory) {
_, skip = stringArrayContains(m.config.ExcludeMetrics, "nv_mem_total") meminfo, ret := nvml.DeviceGetMemoryInfo(device.device)
y, err := lp.New("nv_mem_total", tags, m.meta, map[string]interface{}{"value": t}, time.Now()) if ret == nvml.SUCCESS {
if err == nil && !skip { if !ex_nv_mem_total {
y.AddMeta("unit", "MByte") t := float64(meminfo.Total) / (1024 * 1024)
output <- y y, err := lp.New("nv_mem_total", device.tags, m.meta, map[string]interface{}{"value": t}, time.Now())
} if err == nil {
f := float64(meminfo.Used) / (1024 * 1024) y.AddMeta("unit", "MByte")
_, skip = stringArrayContains(m.config.ExcludeMetrics, "nv_fb_memory") output <- y
y, err = lp.New("nv_fb_memory", tags, m.meta, map[string]interface{}{"value": f}, time.Now()) }
if err == nil && !skip { }
y.AddMeta("unit", "MByte")
output <- y if !ex_nv_fb_memory {
f := float64(meminfo.Used) / (1024 * 1024)
y, err := lp.New("nv_fb_memory", device.tags, m.meta, map[string]interface{}{"value": f}, time.Now())
if err == nil {
y.AddMeta("unit", "MByte")
output <- y
}
}
} }
} }
temp, ret := nvml.DeviceGetTemperature(device, nvml.TEMPERATURE_GPU) if !exclude("nv_temp") {
if ret == nvml.SUCCESS { temp, ret := nvml.DeviceGetTemperature(device.device, nvml.TEMPERATURE_GPU)
_, skip = stringArrayContains(m.config.ExcludeMetrics, "nv_temp") if ret == nvml.SUCCESS {
y, err := lp.New("nv_temp", tags, m.meta, map[string]interface{}{"value": float64(temp)}, time.Now()) y, err := lp.New("nv_temp", device.tags, m.meta, map[string]interface{}{"value": float64(temp)}, time.Now())
if err == nil && !skip { if err == nil {
y.AddMeta("unit", "degC") y.AddMeta("unit", "degC")
output <- y output <- y
}
} }
} }
fan, ret := nvml.DeviceGetFanSpeed(device) if !exclude("nv_fan") {
if ret == nvml.SUCCESS { fan, ret := nvml.DeviceGetFanSpeed(device.device)
_, skip = stringArrayContains(m.config.ExcludeMetrics, "nv_fan") if ret == nvml.SUCCESS {
y, err := lp.New("nv_fan", tags, m.meta, map[string]interface{}{"value": float64(fan)}, time.Now()) y, err := lp.New("nv_fan", device.tags, m.meta, map[string]interface{}{"value": float64(fan)}, time.Now())
if err == nil && !skip { if err == nil {
output <- y output <- y
}
} }
} }
_, ecc_pend, ret := nvml.DeviceGetEccMode(device) if !exclude("nv_ecc_mode") {
if ret == nvml.SUCCESS { _, ecc_pend, ret := nvml.DeviceGetEccMode(device.device)
var y lp.CCMetric if ret == nvml.SUCCESS {
var err error var y lp.CCMetric
switch ecc_pend { var err error
case nvml.FEATURE_DISABLED: switch ecc_pend {
y, err = lp.New("nv_ecc_mode", tags, m.meta, map[string]interface{}{"value": string("OFF")}, time.Now()) case nvml.FEATURE_DISABLED:
case nvml.FEATURE_ENABLED: y, err = lp.New("nv_ecc_mode", device.tags, m.meta, map[string]interface{}{"value": string("OFF")}, time.Now())
y, err = lp.New("nv_ecc_mode", tags, m.meta, map[string]interface{}{"value": string("ON")}, time.Now()) case nvml.FEATURE_ENABLED:
default: y, err = lp.New("nv_ecc_mode", device.tags, m.meta, map[string]interface{}{"value": string("ON")}, time.Now())
y, err = lp.New("nv_ecc_mode", tags, m.meta, map[string]interface{}{"value": string("UNKNOWN")}, time.Now()) default:
} y, err = lp.New("nv_ecc_mode", device.tags, m.meta, map[string]interface{}{"value": string("UNKNOWN")}, time.Now())
_, skip = stringArrayContains(m.config.ExcludeMetrics, "nv_ecc_mode") }
if err == nil && !skip { if err == nil {
output <- y output <- y
} }
} else if ret == nvml.ERROR_NOT_SUPPORTED { } else if ret == nvml.ERROR_NOT_SUPPORTED {
_, skip = stringArrayContains(m.config.ExcludeMetrics, "nv_ecc_mode") y, err := lp.New("nv_ecc_mode", device.tags, m.meta, map[string]interface{}{"value": string("N/A")}, time.Now())
y, err := lp.New("nv_ecc_mode", tags, m.meta, map[string]interface{}{"value": string("N/A")}, time.Now()) if err == nil {
if err == nil && !skip { output <- y
output <- y }
} }
} }
pstate, ret := nvml.DeviceGetPerformanceState(device) if !exclude("nv_perf_state") {
if ret == nvml.SUCCESS { pstate, ret := nvml.DeviceGetPerformanceState(device.device)
_, skip = stringArrayContains(m.config.ExcludeMetrics, "nv_perf_state") if ret == nvml.SUCCESS {
y, err := lp.New("nv_perf_state", tags, m.meta, map[string]interface{}{"value": fmt.Sprintf("P%d", int(pstate))}, time.Now()) y, err := lp.New("nv_perf_state", device.tags, m.meta, map[string]interface{}{"value": fmt.Sprintf("P%d", int(pstate))}, time.Now())
if err == nil && !skip { if err == nil {
output <- y output <- y
}
} }
} }
power, ret := nvml.DeviceGetPowerUsage(device) if !exclude("nv_power_usage_report") {
if ret == nvml.SUCCESS { power, ret := nvml.DeviceGetPowerUsage(device.device)
_, skip = stringArrayContains(m.config.ExcludeMetrics, "nv_power_usage_report") if ret == nvml.SUCCESS {
y, err := lp.New("nv_power_usage_report", tags, m.meta, map[string]interface{}{"value": float64(power) / 1000}, time.Now()) y, err := lp.New("nv_power_usage_report", device.tags, m.meta, map[string]interface{}{"value": float64(power) / 1000}, time.Now())
if err == nil && !skip { if err == nil {
output <- y output <- y
}
} }
} }
gclk, ret := nvml.DeviceGetClockInfo(device, nvml.CLOCK_GRAPHICS) if !exclude("nv_graphics_clock_report") {
if ret == nvml.SUCCESS { gclk, ret := nvml.DeviceGetClockInfo(device.device, nvml.CLOCK_GRAPHICS)
_, skip = stringArrayContains(m.config.ExcludeMetrics, "nv_graphics_clock_report") if ret == nvml.SUCCESS {
y, err := lp.New("nv_graphics_clock_report", tags, m.meta, map[string]interface{}{"value": float64(gclk)}, time.Now()) y, err := lp.New("nv_graphics_clock_report", device.tags, m.meta, map[string]interface{}{"value": float64(gclk)}, time.Now())
if err == nil && !skip { if err == nil {
output <- y output <- y
}
} }
} }
smclk, ret := nvml.DeviceGetClockInfo(device, nvml.CLOCK_SM) if !exclude("nv_sm_clock_report") {
if ret == nvml.SUCCESS { smclk, ret := nvml.DeviceGetClockInfo(device.device, nvml.CLOCK_SM)
_, skip = stringArrayContains(m.config.ExcludeMetrics, "nv_sm_clock_report") if ret == nvml.SUCCESS {
y, err := lp.New("nv_sm_clock_report", tags, m.meta, map[string]interface{}{"value": float64(smclk)}, time.Now()) y, err := lp.New("nv_sm_clock_report", device.tags, m.meta, map[string]interface{}{"value": float64(smclk)}, time.Now())
if err == nil && !skip { if err == nil {
output <- y output <- y
}
} }
} }
memclk, ret := nvml.DeviceGetClockInfo(device, nvml.CLOCK_MEM) if !exclude("nv_mem_clock_report") {
if ret == nvml.SUCCESS { memclk, ret := nvml.DeviceGetClockInfo(device.device, nvml.CLOCK_MEM)
_, skip = stringArrayContains(m.config.ExcludeMetrics, "nv_mem_clock_report") if ret == nvml.SUCCESS {
y, err := lp.New("nv_mem_clock_report", tags, m.meta, map[string]interface{}{"value": float64(memclk)}, time.Now()) y, err := lp.New("nv_mem_clock_report", device.tags, m.meta, map[string]interface{}{"value": float64(memclk)}, time.Now())
if err == nil && !skip { if err == nil {
output <- y output <- y
}
} }
} }
max_gclk, ret := nvml.DeviceGetMaxClockInfo(device, nvml.CLOCK_GRAPHICS) if !exclude("nv_max_graphics_clock") {
if ret == nvml.SUCCESS { max_gclk, ret := nvml.DeviceGetMaxClockInfo(device.device, nvml.CLOCK_GRAPHICS)
_, skip = stringArrayContains(m.config.ExcludeMetrics, "nv_max_graphics_clock") if ret == nvml.SUCCESS {
y, err := lp.New("nv_max_graphics_clock", tags, m.meta, map[string]interface{}{"value": float64(max_gclk)}, time.Now()) y, err := lp.New("nv_max_graphics_clock", device.tags, m.meta, map[string]interface{}{"value": float64(max_gclk)}, time.Now())
if err == nil && !skip { if err == nil {
output <- y output <- y
}
} }
} }
max_smclk, ret := nvml.DeviceGetClockInfo(device, nvml.CLOCK_SM) if !exclude("nv_max_sm_clock") {
if ret == nvml.SUCCESS { max_smclk, ret := nvml.DeviceGetClockInfo(device.device, nvml.CLOCK_SM)
_, skip = stringArrayContains(m.config.ExcludeMetrics, "nv_max_sm_clock") if ret == nvml.SUCCESS {
y, err := lp.New("nv_max_sm_clock", tags, m.meta, map[string]interface{}{"value": float64(max_smclk)}, time.Now()) y, err := lp.New("nv_max_sm_clock", device.tags, m.meta, map[string]interface{}{"value": float64(max_smclk)}, time.Now())
if err == nil && !skip { if err == nil {
output <- y output <- y
}
} }
} }
max_memclk, ret := nvml.DeviceGetClockInfo(device, nvml.CLOCK_MEM) if !exclude("nv_max_mem_clock") {
if ret == nvml.SUCCESS { max_memclk, ret := nvml.DeviceGetClockInfo(device.device, nvml.CLOCK_MEM)
_, skip = stringArrayContains(m.config.ExcludeMetrics, "nv_max_mem_clock") if ret == nvml.SUCCESS {
y, err := lp.New("nv_max_mem_clock", tags, m.meta, map[string]interface{}{"value": float64(max_memclk)}, time.Now()) y, err := lp.New("nv_max_mem_clock", device.tags, m.meta, map[string]interface{}{"value": float64(max_memclk)}, time.Now())
if err == nil && !skip { if err == nil {
output <- y output <- y
}
} }
} }
ecc_db, ret := nvml.DeviceGetTotalEccErrors(device, 1, 1) if !exclude("nv_ecc_db_error") {
if ret == nvml.SUCCESS { ecc_db, ret := nvml.DeviceGetTotalEccErrors(device.device, 1, 1)
_, skip = stringArrayContains(m.config.ExcludeMetrics, "nv_ecc_db_error") if ret == nvml.SUCCESS {
y, err := lp.New("nv_ecc_db_error", tags, m.meta, map[string]interface{}{"value": float64(ecc_db)}, time.Now()) y, err := lp.New("nv_ecc_db_error", device.tags, m.meta, map[string]interface{}{"value": float64(ecc_db)}, time.Now())
if err == nil && !skip { if err == nil {
output <- y output <- y
}
} }
} }
ecc_sb, ret := nvml.DeviceGetTotalEccErrors(device, 0, 1) if !exclude("nv_ecc_sb_error") {
if ret == nvml.SUCCESS { ecc_sb, ret := nvml.DeviceGetTotalEccErrors(device.device, 0, 1)
_, skip = stringArrayContains(m.config.ExcludeMetrics, "nv_ecc_sb_error") if ret == nvml.SUCCESS {
y, err := lp.New("nv_ecc_sb_error", tags, m.meta, map[string]interface{}{"value": float64(ecc_sb)}, time.Now()) y, err := lp.New("nv_ecc_sb_error", device.tags, m.meta, map[string]interface{}{"value": float64(ecc_sb)}, time.Now())
if err == nil && !skip { if err == nil {
output <- y output <- y
}
} }
} }
pwr_limit, ret := nvml.DeviceGetPowerManagementLimit(device) if !exclude("nv_power_man_limit") {
if ret == nvml.SUCCESS { pwr_limit, ret := nvml.DeviceGetPowerManagementLimit(device.device)
_, skip = stringArrayContains(m.config.ExcludeMetrics, "nv_power_man_limit") if ret == nvml.SUCCESS {
y, err := lp.New("nv_power_man_limit", tags, m.meta, map[string]interface{}{"value": float64(pwr_limit)}, time.Now()) y, err := lp.New("nv_power_man_limit", device.tags, m.meta, map[string]interface{}{"value": float64(pwr_limit)}, time.Now())
if err == nil && !skip { if err == nil {
output <- y output <- y
}
} }
} }
enc_util, _, ret := nvml.DeviceGetEncoderUtilization(device) if !exclude("nv_encoder_util") {
if ret == nvml.SUCCESS { enc_util, _, ret := nvml.DeviceGetEncoderUtilization(device.device)
_, skip = stringArrayContains(m.config.ExcludeMetrics, "nv_encoder_util") if ret == nvml.SUCCESS {
y, err := lp.New("nv_encoder_util", tags, m.meta, map[string]interface{}{"value": float64(enc_util)}, time.Now()) y, err := lp.New("nv_encoder_util", device.tags, m.meta, map[string]interface{}{"value": float64(enc_util)}, time.Now())
if err == nil && !skip { if err == nil {
output <- y output <- y
}
} }
} }
dec_util, _, ret := nvml.DeviceGetDecoderUtilization(device) if !exclude("nv_decoder_util") {
if ret == nvml.SUCCESS { dec_util, _, ret := nvml.DeviceGetDecoderUtilization(device.device)
_, skip = stringArrayContains(m.config.ExcludeMetrics, "nv_decoder_util") if ret == nvml.SUCCESS {
y, err := lp.New("nv_decoder_util", tags, m.meta, map[string]interface{}{"value": float64(dec_util)}, time.Now()) y, err := lp.New("nv_decoder_util", device.tags, m.meta, map[string]interface{}{"value": float64(dec_util)}, time.Now())
if err == nil && !skip { if err == nil {
output <- y output <- y
}
} }
} }
} }