mirror of
https://github.com/ClusterCockpit/cc-metric-collector.git
synced 2024-12-25 23:19:06 +01:00
Option to use MIG slice name as subtype-id in NvidiaCollector
This commit is contained in:
parent
d4c89a4206
commit
500685672b
@ -5,6 +5,7 @@ import (
|
|||||||
"errors"
|
"errors"
|
||||||
"fmt"
|
"fmt"
|
||||||
"log"
|
"log"
|
||||||
|
"strings"
|
||||||
"time"
|
"time"
|
||||||
|
|
||||||
cclog "github.com/ClusterCockpit/cc-metric-collector/internal/ccLogger"
|
cclog "github.com/ClusterCockpit/cc-metric-collector/internal/ccLogger"
|
||||||
@ -13,15 +14,16 @@ import (
|
|||||||
)
|
)
|
||||||
|
|
||||||
type NvidiaCollectorConfig struct {
|
type NvidiaCollectorConfig struct {
|
||||||
ExcludeMetrics []string `json:"exclude_metrics,omitempty"`
|
ExcludeMetrics []string `json:"exclude_metrics,omitempty"`
|
||||||
ExcludeDevices []string `json:"exclude_devices,omitempty"`
|
ExcludeDevices []string `json:"exclude_devices,omitempty"`
|
||||||
AddPciInfoTag bool `json:"add_pci_info_tag,omitempty"`
|
AddPciInfoTag bool `json:"add_pci_info_tag,omitempty"`
|
||||||
UsePciInfoAsTypeId bool `json:"use_pci_info_as_type_id,omitempty"`
|
UsePciInfoAsTypeId bool `json:"use_pci_info_as_type_id,omitempty"`
|
||||||
AddUuidMeta bool `json:"add_uuid_meta,omitempty"`
|
AddUuidMeta bool `json:"add_uuid_meta,omitempty"`
|
||||||
AddBoardNumberMeta bool `json:"add_board_number_meta,omitempty"`
|
AddBoardNumberMeta bool `json:"add_board_number_meta,omitempty"`
|
||||||
AddSerialMeta bool `json:"add_serial_meta,omitempty"`
|
AddSerialMeta bool `json:"add_serial_meta,omitempty"`
|
||||||
ProcessMigDevices bool `json:"process_mig_devices,omitempty"`
|
ProcessMigDevices bool `json:"process_mig_devices,omitempty"`
|
||||||
UseUuidForMigDevices bool `json:"use_uuid_for_mig_device,omitempty"`
|
UseUuidForMigDevices bool `json:"use_uuid_for_mig_device,omitempty"`
|
||||||
|
UseSliceForMigDevices bool `json:"use_slice_for_mig_device,omitempty"`
|
||||||
}
|
}
|
||||||
|
|
||||||
type NvidiaCollectorDevice struct {
|
type NvidiaCollectorDevice struct {
|
||||||
@ -52,6 +54,7 @@ func (m *NvidiaCollector) Init(config json.RawMessage) error {
|
|||||||
m.config.UsePciInfoAsTypeId = false
|
m.config.UsePciInfoAsTypeId = false
|
||||||
m.config.ProcessMigDevices = false
|
m.config.ProcessMigDevices = false
|
||||||
m.config.UseUuidForMigDevices = false
|
m.config.UseUuidForMigDevices = false
|
||||||
|
m.config.UseSliceForMigDevices = false
|
||||||
m.setup()
|
m.setup()
|
||||||
if len(config) > 0 {
|
if len(config) > 0 {
|
||||||
err = json.Unmarshal(config, &m.config)
|
err = json.Unmarshal(config, &m.config)
|
||||||
@ -1147,15 +1150,27 @@ func (m *NvidiaCollector) Read(interval time.Duration, output chan lp.CCMetric)
|
|||||||
migDevice.tags[k] = v
|
migDevice.tags[k] = v
|
||||||
}
|
}
|
||||||
migDevice.tags["stype"] = "mig"
|
migDevice.tags["stype"] = "mig"
|
||||||
if !m.config.UseUuidForMigDevices {
|
if m.config.UseUuidForMigDevices {
|
||||||
migDevice.tags["stype-id"] = fmt.Sprintf("%d", j)
|
|
||||||
} else {
|
|
||||||
uuid, ret := nvml.DeviceGetUUID(mdev)
|
uuid, ret := nvml.DeviceGetUUID(mdev)
|
||||||
if ret != nvml.SUCCESS {
|
if ret != nvml.SUCCESS {
|
||||||
cclog.ComponentError(m.name, "Unable to get UUID for mig device at index", j, ":", err.Error())
|
cclog.ComponentError(m.name, "Unable to get UUID for mig device at index", j, ":", err.Error())
|
||||||
} else {
|
} else {
|
||||||
migDevice.tags["stype-id"] = uuid
|
migDevice.tags["stype-id"] = uuid
|
||||||
}
|
}
|
||||||
|
} else if m.config.UseSliceForMigDevices {
|
||||||
|
name, ret := nvml.DeviceGetName(m.gpus[i].device)
|
||||||
|
if ret == nvml.SUCCESS {
|
||||||
|
mname, ret := nvml.DeviceGetName(mdev)
|
||||||
|
if ret == nvml.SUCCESS {
|
||||||
|
x := strings.Replace(mname, name, "", -1)
|
||||||
|
x = strings.Replace(x, "MIG", "", -1)
|
||||||
|
x = strings.TrimSpace(x)
|
||||||
|
migDevice.tags["stype-id"] = x
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if _, ok := migDevice.tags["stype-id"]; !ok {
|
||||||
|
migDevice.tags["stype-id"] = fmt.Sprintf("%d", j)
|
||||||
}
|
}
|
||||||
for k, v := range m.gpus[i].meta {
|
for k, v := range m.gpus[i].meta {
|
||||||
migDevice.meta[k] = v
|
migDevice.meta[k] = v
|
||||||
|
@ -15,11 +15,13 @@
|
|||||||
"add_pci_info_tag": false,
|
"add_pci_info_tag": false,
|
||||||
"add_uuid_meta": false,
|
"add_uuid_meta": false,
|
||||||
"add_board_number_meta": false,
|
"add_board_number_meta": false,
|
||||||
"add_serial_meta": false
|
"add_serial_meta": false,
|
||||||
|
"use_uuid_for_mig_device": false,
|
||||||
|
"use_slice_for_mig_device": false
|
||||||
}
|
}
|
||||||
```
|
```
|
||||||
|
|
||||||
The `nvidia` collector can be configured to leave out specific devices with the `exclude_devices` option. It takes IDs as supplied to the NVML with `nvmlDeviceGetHandleByIndex()` or the PCI address in NVML format (`%08X:%02X:%02X.0`). Metrics (listed below) that should not be sent to the MetricRouter can be excluded with the `exclude_metrics` option. Commonly only the physical GPUs are monitored. If MIG devices should be analyzed as well, set `process_mig_devices` (adds `stype=mig,stype-id=<mig_index>`).
|
The `nvidia` collector can be configured to leave out specific devices with the `exclude_devices` option. It takes IDs as supplied to the NVML with `nvmlDeviceGetHandleByIndex()` or the PCI address in NVML format (`%08X:%02X:%02X.0`). Metrics (listed below) that should not be sent to the MetricRouter can be excluded with the `exclude_metrics` option. Commonly, only the physical GPUs are monitored. If MIG devices should be analyzed as well, set `process_mig_devices` (adds `stype=mig,stype-id=<mig_index>`). With the options `use_uuid_for_mig_device` and `use_slice_for_mig_device`, the `<mig_index>` can be replaced with the UUID (e.g. `MIG-6a9f7cc8-6d5b-5ce0-92de-750edc4d8849`) or the MIG slice name (e.g. `1g.5gb`), respectively. If both options are set, the UUID is used; if the requested value cannot be obtained, the collector falls back to `<mig_index>`.
|
||||||
|
|
||||||
The metrics sent by the `nvidia` collector use `accelerator` as `type` tag. For the `type-id`, it uses the device handle index by default. With the `use_pci_info_as_type_id` option, the PCI ID is used instead. If both values should be added as tags, activate the `add_pci_info_tag` option. It uses the device handle index as `type-id` and adds the PCI ID as separate `pci_identifier` tag.
|
The metrics sent by the `nvidia` collector use `accelerator` as `type` tag. For the `type-id`, it uses the device handle index by default. With the `use_pci_info_as_type_id` option, the PCI ID is used instead. If both values should be added as tags, activate the `add_pci_info_tag` option. It uses the device handle index as `type-id` and adds the PCI ID as separate `pci_identifier` tag.
|
||||||
|
|
||||||
|
Loading…
Reference in New Issue
Block a user