mirror of
https://github.com/ClusterCockpit/cc-metric-collector.git
synced 2024-12-25 15:09:05 +01:00
Use slice element of m.gpus without slice index
This commit is contained in:
parent
5060497abd
commit
fcfb58c31c
@ -43,60 +43,84 @@ func (m *NvidiaCollector) Init(config json.RawMessage) error {
|
|||||||
m.name = "NvidiaCollector"
|
m.name = "NvidiaCollector"
|
||||||
m.config.AddPciInfoTag = false
|
m.config.AddPciInfoTag = false
|
||||||
m.setup()
|
m.setup()
|
||||||
m.meta = map[string]string{"source": m.name, "group": "Nvidia"}
|
|
||||||
if len(config) > 0 {
|
if len(config) > 0 {
|
||||||
err = json.Unmarshal(config, &m.config)
|
err = json.Unmarshal(config, &m.config)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return err
|
return err
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
m.meta = map[string]string{
|
||||||
|
"source": m.name,
|
||||||
|
"group": "Nvidia",
|
||||||
|
}
|
||||||
|
|
||||||
m.num_gpus = 0
|
m.num_gpus = 0
|
||||||
defer m.CatchPanic()
|
defer m.CatchPanic()
|
||||||
|
|
||||||
|
// Initialize NVIDIA Management Library (NVML)
|
||||||
ret := nvml.Init()
|
ret := nvml.Init()
|
||||||
if ret != nvml.SUCCESS {
|
if ret != nvml.SUCCESS {
|
||||||
err = errors.New(nvml.ErrorString(ret))
|
err = errors.New(nvml.ErrorString(ret))
|
||||||
cclog.ComponentError(m.name, "Unable to initialize NVML", err.Error())
|
cclog.ComponentError(m.name, "Unable to initialize NVML", err.Error())
|
||||||
return err
|
return err
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Number of NVIDIA GPUs
|
||||||
num_gpus, ret := nvml.DeviceGetCount()
|
num_gpus, ret := nvml.DeviceGetCount()
|
||||||
if ret != nvml.SUCCESS {
|
if ret != nvml.SUCCESS {
|
||||||
err = errors.New(nvml.ErrorString(ret))
|
err = errors.New(nvml.ErrorString(ret))
|
||||||
cclog.ComponentError(m.name, "Unable to get device count", err.Error())
|
cclog.ComponentError(m.name, "Unable to get device count", err.Error())
|
||||||
return err
|
return err
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// For all GPUs
|
||||||
m.gpus = make([]NvidiaCollectorDevice, num_gpus)
|
m.gpus = make([]NvidiaCollectorDevice, num_gpus)
|
||||||
idx := 0
|
for i := 0; i < num_gpus; i++ {
|
||||||
for i := 0; i < num_gpus && idx < num_gpus; i++ {
|
g := &m.gpus[i]
|
||||||
|
|
||||||
|
// Skip excluded devices
|
||||||
str_i := fmt.Sprintf("%d", i)
|
str_i := fmt.Sprintf("%d", i)
|
||||||
if _, skip := stringArrayContains(m.config.ExcludeDevices, str_i); skip {
|
if _, skip := stringArrayContains(m.config.ExcludeDevices, str_i); skip {
|
||||||
continue
|
continue
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Get device handle
|
||||||
device, ret := nvml.DeviceGetHandleByIndex(i)
|
device, ret := nvml.DeviceGetHandleByIndex(i)
|
||||||
if ret != nvml.SUCCESS {
|
if ret != nvml.SUCCESS {
|
||||||
err = errors.New(nvml.ErrorString(ret))
|
err = errors.New(nvml.ErrorString(ret))
|
||||||
cclog.ComponentError(m.name, "Unable to get device at index", i, ":", err.Error())
|
cclog.ComponentError(m.name, "Unable to get device at index", i, ":", err.Error())
|
||||||
return err
|
return err
|
||||||
}
|
}
|
||||||
g := m.gpus[idx]
|
|
||||||
g.device = device
|
g.device = device
|
||||||
g.tags = map[string]string{"type": "accelerator", "type-id": str_i}
|
|
||||||
|
// Add tags
|
||||||
|
g.tags = map[string]string{
|
||||||
|
"type": "accelerator",
|
||||||
|
"type-id": str_i,
|
||||||
|
}
|
||||||
|
|
||||||
|
// Add excluded metrics
|
||||||
g.excludeMetrics = map[string]bool{}
|
g.excludeMetrics = map[string]bool{}
|
||||||
for _, e := range m.config.ExcludeMetrics {
|
for _, e := range m.config.ExcludeMetrics {
|
||||||
g.excludeMetrics[e] = true
|
g.excludeMetrics[e] = true
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Add PCI info as tag
|
||||||
if m.config.AddPciInfoTag {
|
if m.config.AddPciInfoTag {
|
||||||
pciinfo, ret := nvml.DeviceGetPciInfo(g.device)
|
pciInfo, ret := nvml.DeviceGetPciInfo(g.device)
|
||||||
if ret != nvml.SUCCESS {
|
if ret != nvml.SUCCESS {
|
||||||
err = errors.New(nvml.ErrorString(ret))
|
err = errors.New(nvml.ErrorString(ret))
|
||||||
cclog.ComponentError(m.name, "Unable to get pciInfo for device at index", i, ":", err.Error())
|
cclog.ComponentError(m.name, "Unable to get PCI info for device at index", i, ":", err.Error())
|
||||||
return err
|
return err
|
||||||
}
|
}
|
||||||
g.tags["pci_identifier"] = fmt.Sprintf("%08X:%02X:%02X.0", pciinfo.Domain, pciinfo.Bus, pciinfo.Device)
|
g.tags["pci_identifier"] = fmt.Sprintf(
|
||||||
|
"%08X:%02X:%02X.0",
|
||||||
|
pciInfo.Domain,
|
||||||
|
pciInfo.Bus,
|
||||||
|
pciInfo.Device)
|
||||||
}
|
}
|
||||||
m.gpus[idx] = g
|
|
||||||
idx++
|
|
||||||
}
|
}
|
||||||
|
|
||||||
m.init = true
|
m.init = true
|
||||||
return nil
|
return nil
|
||||||
}
|
}
|
||||||
|
Loading…
Reference in New Issue
Block a user