Use slice element of m.gpus without slice index

This commit is contained in:
Holger Obermaier 2022-02-15 09:23:57 +01:00
parent 5060497abd
commit fcfb58c31c

View File

@ -43,60 +43,84 @@ func (m *NvidiaCollector) Init(config json.RawMessage) error {
m.name = "NvidiaCollector" m.name = "NvidiaCollector"
m.config.AddPciInfoTag = false m.config.AddPciInfoTag = false
m.setup() m.setup()
m.meta = map[string]string{"source": m.name, "group": "Nvidia"}
if len(config) > 0 { if len(config) > 0 {
err = json.Unmarshal(config, &m.config) err = json.Unmarshal(config, &m.config)
if err != nil { if err != nil {
return err return err
} }
} }
m.meta = map[string]string{
"source": m.name,
"group": "Nvidia",
}
m.num_gpus = 0 m.num_gpus = 0
defer m.CatchPanic() defer m.CatchPanic()
// Initialize NVIDIA Management Library (NVML)
ret := nvml.Init() ret := nvml.Init()
if ret != nvml.SUCCESS { if ret != nvml.SUCCESS {
err = errors.New(nvml.ErrorString(ret)) err = errors.New(nvml.ErrorString(ret))
cclog.ComponentError(m.name, "Unable to initialize NVML", err.Error()) cclog.ComponentError(m.name, "Unable to initialize NVML", err.Error())
return err return err
} }
// Number of NVIDIA GPUs
num_gpus, ret := nvml.DeviceGetCount() num_gpus, ret := nvml.DeviceGetCount()
if ret != nvml.SUCCESS { if ret != nvml.SUCCESS {
err = errors.New(nvml.ErrorString(ret)) err = errors.New(nvml.ErrorString(ret))
cclog.ComponentError(m.name, "Unable to get device count", err.Error()) cclog.ComponentError(m.name, "Unable to get device count", err.Error())
return err return err
} }
// For all GPUs
m.gpus = make([]NvidiaCollectorDevice, num_gpus) m.gpus = make([]NvidiaCollectorDevice, num_gpus)
idx := 0 for i := 0; i < num_gpus; i++ {
for i := 0; i < num_gpus && idx < num_gpus; i++ { g := &m.gpus[i]
// Skip excluded devices
str_i := fmt.Sprintf("%d", i) str_i := fmt.Sprintf("%d", i)
if _, skip := stringArrayContains(m.config.ExcludeDevices, str_i); skip { if _, skip := stringArrayContains(m.config.ExcludeDevices, str_i); skip {
continue continue
} }
// Get device handle
device, ret := nvml.DeviceGetHandleByIndex(i) device, ret := nvml.DeviceGetHandleByIndex(i)
if ret != nvml.SUCCESS { if ret != nvml.SUCCESS {
err = errors.New(nvml.ErrorString(ret)) err = errors.New(nvml.ErrorString(ret))
cclog.ComponentError(m.name, "Unable to get device at index", i, ":", err.Error()) cclog.ComponentError(m.name, "Unable to get device at index", i, ":", err.Error())
return err return err
} }
g := m.gpus[idx]
g.device = device g.device = device
g.tags = map[string]string{"type": "accelerator", "type-id": str_i}
// Add tags
g.tags = map[string]string{
"type": "accelerator",
"type-id": str_i,
}
// Add excluded metrics
g.excludeMetrics = map[string]bool{} g.excludeMetrics = map[string]bool{}
for _, e := range m.config.ExcludeMetrics { for _, e := range m.config.ExcludeMetrics {
g.excludeMetrics[e] = true g.excludeMetrics[e] = true
} }
// Add PCI info as tag
if m.config.AddPciInfoTag { if m.config.AddPciInfoTag {
pciinfo, ret := nvml.DeviceGetPciInfo(g.device) pciInfo, ret := nvml.DeviceGetPciInfo(g.device)
if ret != nvml.SUCCESS { if ret != nvml.SUCCESS {
err = errors.New(nvml.ErrorString(ret)) err = errors.New(nvml.ErrorString(ret))
cclog.ComponentError(m.name, "Unable to get pciInfo for device at index", i, ":", err.Error()) cclog.ComponentError(m.name, "Unable to get PCI info for device at index", i, ":", err.Error())
return err return err
} }
g.tags["pci_identifier"] = fmt.Sprintf("%08X:%02X:%02X.0", pciinfo.Domain, pciinfo.Bus, pciinfo.Device) g.tags["pci_identifier"] = fmt.Sprintf(
"%08X:%02X:%02X.0",
pciInfo.Domain,
pciInfo.Bus,
pciInfo.Device)
} }
m.gpus[idx] = g
idx++
} }
m.init = true m.init = true
return nil return nil
} }