From 500685672bfb7a29c54676ac5f0650710e473561 Mon Sep 17 00:00:00 2001 From: Thomas Roehl Date: Fri, 13 May 2022 15:26:47 +0200 Subject: [PATCH] Option to use MIG slice name as subtype-id in NvidiaCollector --- collectors/nvidiaMetric.go | 39 ++++++++++++++++++++++++++------------ collectors/nvidiaMetric.md | 6 ++++-- 2 files changed, 31 insertions(+), 14 deletions(-) diff --git a/collectors/nvidiaMetric.go b/collectors/nvidiaMetric.go index 619711f..458ecd4 100644 --- a/collectors/nvidiaMetric.go +++ b/collectors/nvidiaMetric.go @@ -5,6 +5,7 @@ import ( "errors" "fmt" "log" + "strings" "time" cclog "github.com/ClusterCockpit/cc-metric-collector/internal/ccLogger" @@ -13,15 +14,16 @@ import ( ) type NvidiaCollectorConfig struct { - ExcludeMetrics []string `json:"exclude_metrics,omitempty"` - ExcludeDevices []string `json:"exclude_devices,omitempty"` - AddPciInfoTag bool `json:"add_pci_info_tag,omitempty"` - UsePciInfoAsTypeId bool `json:"use_pci_info_as_type_id,omitempty"` - AddUuidMeta bool `json:"add_uuid_meta,omitempty"` - AddBoardNumberMeta bool `json:"add_board_number_meta,omitempty"` - AddSerialMeta bool `json:"add_serial_meta,omitempty"` - ProcessMigDevices bool `json:"process_mig_devices,omitempty"` - UseUuidForMigDevices bool `json:"use_uuid_for_mig_device,omitempty"` + ExcludeMetrics []string `json:"exclude_metrics,omitempty"` + ExcludeDevices []string `json:"exclude_devices,omitempty"` + AddPciInfoTag bool `json:"add_pci_info_tag,omitempty"` + UsePciInfoAsTypeId bool `json:"use_pci_info_as_type_id,omitempty"` + AddUuidMeta bool `json:"add_uuid_meta,omitempty"` + AddBoardNumberMeta bool `json:"add_board_number_meta,omitempty"` + AddSerialMeta bool `json:"add_serial_meta,omitempty"` + ProcessMigDevices bool `json:"process_mig_devices,omitempty"` + UseUuidForMigDevices bool `json:"use_uuid_for_mig_device,omitempty"` + UseSliceForMigDevices bool `json:"use_slice_for_mig_device,omitempty"` } type NvidiaCollectorDevice struct { @@ -52,6 +54,7 @@ func (m 
*NvidiaCollector) Init(config json.RawMessage) error { m.config.UsePciInfoAsTypeId = false m.config.ProcessMigDevices = false m.config.UseUuidForMigDevices = false + m.config.UseSliceForMigDevices = false m.setup() if len(config) > 0 { err = json.Unmarshal(config, &m.config) @@ -1147,15 +1150,27 @@ func (m *NvidiaCollector) Read(interval time.Duration, output chan lp.CCMetric) migDevice.tags[k] = v } migDevice.tags["stype"] = "mig" - if !m.config.UseUuidForMigDevices { - migDevice.tags["stype-id"] = fmt.Sprintf("%d", j) - } else { + if m.config.UseUuidForMigDevices { uuid, ret := nvml.DeviceGetUUID(mdev) if ret != nvml.SUCCESS { cclog.ComponentError(m.name, "Unable to get UUID for mig device at index", j, ":", err.Error()) } else { migDevice.tags["stype-id"] = uuid } + } else if m.config.UseSliceForMigDevices { + name, ret := nvml.DeviceGetName(m.gpus[i].device) + if ret == nvml.SUCCESS { + mname, ret := nvml.DeviceGetName(mdev) + if ret == nvml.SUCCESS { + x := strings.Replace(mname, name, "", -1) + x = strings.Replace(x, "MIG", "", -1) + x = strings.TrimSpace(x) + migDevice.tags["stype-id"] = x + } + } + } + if _, ok := migDevice.tags["stype-id"]; !ok { + migDevice.tags["stype-id"] = fmt.Sprintf("%d", j) } for k, v := range m.gpus[i].meta { migDevice.meta[k] = v diff --git a/collectors/nvidiaMetric.md b/collectors/nvidiaMetric.md index 8cfff32..7f0c416 100644 --- a/collectors/nvidiaMetric.md +++ b/collectors/nvidiaMetric.md @@ -15,11 +15,13 @@ "add_pci_info_tag": false, "add_uuid_meta": false, "add_board_number_meta": false, - "add_serial_meta": false + "add_serial_meta": false, + "use_uuid_for_mig_device": false, + "use_slice_for_mig_device": false } ``` -The `nvidia` collector can be configured to leave out specific devices with the `exclude_devices` option. It takes IDs as supplied to the NVML with `nvmlDeviceGetHandleByIndex()` or the PCI address in NVML format (`%08X:%02X:%02X.0`). 
Metrics (listed below) that should not be sent to the MetricRouter can be excluded with the `exclude_metrics` option. Commonly only the physical GPUs are monitored. If MIG devices should be analyzed as well, set `process_mig_devices` (adds `stype=mig,stype-id=<mig_index>`). +The `nvidia` collector can be configured to leave out specific devices with the `exclude_devices` option. It takes IDs as supplied to the NVML with `nvmlDeviceGetHandleByIndex()` or the PCI address in NVML format (`%08X:%02X:%02X.0`). Metrics (listed below) that should not be sent to the MetricRouter can be excluded with the `exclude_metrics` option. Commonly only the physical GPUs are monitored. If MIG devices should be analyzed as well, set `process_mig_devices` (adds `stype=mig,stype-id=<mig_index>`). With the options `use_uuid_for_mig_device` and `use_slice_for_mig_device`, the `<mig_index>` can be replaced with the UUID (e.g. `MIG-6a9f7cc8-6d5b-5ce0-92de-750edc4d8849`) or the MIG slice name (e.g. `1g.5gb`). The metrics sent by the `nvidia` collector use `accelerator` as `type` tag. For the `type-id`, it uses the device handle index by default. With the `use_pci_info_as_type_id` option, the PCI ID is used instead. If both values should be added as tags, activate the `add_pci_info_tag` option. It uses the device handle index as `type-id` and adds the PCI ID as separate `pci_identifier` tag.