mirror of
				https://github.com/ClusterCockpit/cc-metric-collector.git
				synced 2025-11-04 10:45:06 +01:00 
			
		
		
		
	Option to use MIG slice name as subtype-id in NvidiaCollector
This commit is contained in:
		@@ -5,6 +5,7 @@ import (
 | 
				
			|||||||
	"errors"
 | 
						"errors"
 | 
				
			||||||
	"fmt"
 | 
						"fmt"
 | 
				
			||||||
	"log"
 | 
						"log"
 | 
				
			||||||
 | 
						"strings"
 | 
				
			||||||
	"time"
 | 
						"time"
 | 
				
			||||||
 | 
					
 | 
				
			||||||
	cclog "github.com/ClusterCockpit/cc-metric-collector/internal/ccLogger"
 | 
						cclog "github.com/ClusterCockpit/cc-metric-collector/internal/ccLogger"
 | 
				
			||||||
@@ -22,6 +23,7 @@ type NvidiaCollectorConfig struct {
 | 
				
			|||||||
	AddSerialMeta         bool     `json:"add_serial_meta,omitempty"`
 | 
						AddSerialMeta         bool     `json:"add_serial_meta,omitempty"`
 | 
				
			||||||
	ProcessMigDevices     bool     `json:"process_mig_devices,omitempty"`
 | 
						ProcessMigDevices     bool     `json:"process_mig_devices,omitempty"`
 | 
				
			||||||
	UseUuidForMigDevices  bool     `json:"use_uuid_for_mig_device,omitempty"`
 | 
						UseUuidForMigDevices  bool     `json:"use_uuid_for_mig_device,omitempty"`
 | 
				
			||||||
 | 
						UseSliceForMigDevices bool     `json:"use_slice_for_mig_device,omitempty"`
 | 
				
			||||||
}
 | 
					}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
type NvidiaCollectorDevice struct {
 | 
					type NvidiaCollectorDevice struct {
 | 
				
			||||||
@@ -52,6 +54,7 @@ func (m *NvidiaCollector) Init(config json.RawMessage) error {
 | 
				
			|||||||
	m.config.UsePciInfoAsTypeId = false
 | 
						m.config.UsePciInfoAsTypeId = false
 | 
				
			||||||
	m.config.ProcessMigDevices = false
 | 
						m.config.ProcessMigDevices = false
 | 
				
			||||||
	m.config.UseUuidForMigDevices = false
 | 
						m.config.UseUuidForMigDevices = false
 | 
				
			||||||
 | 
						m.config.UseSliceForMigDevices = false
 | 
				
			||||||
	m.setup()
 | 
						m.setup()
 | 
				
			||||||
	if len(config) > 0 {
 | 
						if len(config) > 0 {
 | 
				
			||||||
		err = json.Unmarshal(config, &m.config)
 | 
							err = json.Unmarshal(config, &m.config)
 | 
				
			||||||
@@ -1147,15 +1150,27 @@ func (m *NvidiaCollector) Read(interval time.Duration, output chan lp.CCMetric)
 | 
				
			|||||||
					migDevice.tags[k] = v
 | 
										migDevice.tags[k] = v
 | 
				
			||||||
				}
 | 
									}
 | 
				
			||||||
				migDevice.tags["stype"] = "mig"
 | 
									migDevice.tags["stype"] = "mig"
 | 
				
			||||||
				if !m.config.UseUuidForMigDevices {
 | 
									if m.config.UseUuidForMigDevices {
 | 
				
			||||||
					migDevice.tags["stype-id"] = fmt.Sprintf("%d", j)
 | 
					 | 
				
			||||||
				} else {
 | 
					 | 
				
			||||||
					uuid, ret := nvml.DeviceGetUUID(mdev)
 | 
										uuid, ret := nvml.DeviceGetUUID(mdev)
 | 
				
			||||||
					if ret != nvml.SUCCESS {
 | 
										if ret != nvml.SUCCESS {
 | 
				
			||||||
						cclog.ComponentError(m.name, "Unable to get UUID for mig device at index", j, ":", err.Error())
 | 
											cclog.ComponentError(m.name, "Unable to get UUID for mig device at index", j, ":", err.Error())
 | 
				
			||||||
					} else {
 | 
										} else {
 | 
				
			||||||
						migDevice.tags["stype-id"] = uuid
 | 
											migDevice.tags["stype-id"] = uuid
 | 
				
			||||||
					}
 | 
										}
 | 
				
			||||||
 | 
									} else if m.config.UseSliceForMigDevices {
 | 
				
			||||||
 | 
										name, ret := nvml.DeviceGetName(m.gpus[i].device)
 | 
				
			||||||
 | 
										if ret == nvml.SUCCESS {
 | 
				
			||||||
 | 
											mname, ret := nvml.DeviceGetName(mdev)
 | 
				
			||||||
 | 
											if ret == nvml.SUCCESS {
 | 
				
			||||||
 | 
												x := strings.Replace(mname, name, "", -1)
 | 
				
			||||||
 | 
												x = strings.Replace(x, "MIG", "", -1)
 | 
				
			||||||
 | 
												x = strings.TrimSpace(x)
 | 
				
			||||||
 | 
												migDevice.tags["stype-id"] = x
 | 
				
			||||||
 | 
											}
 | 
				
			||||||
 | 
										}
 | 
				
			||||||
 | 
									}
 | 
				
			||||||
 | 
									if _, ok := migDevice.tags["stype-id"]; !ok {
 | 
				
			||||||
 | 
										migDevice.tags["stype-id"] = fmt.Sprintf("%d", j)
 | 
				
			||||||
				}
 | 
									}
 | 
				
			||||||
				for k, v := range m.gpus[i].meta {
 | 
									for k, v := range m.gpus[i].meta {
 | 
				
			||||||
					migDevice.meta[k] = v
 | 
										migDevice.meta[k] = v
 | 
				
			||||||
 
 | 
				
			|||||||
@@ -15,11 +15,13 @@
 | 
				
			|||||||
    "add_pci_info_tag": false,
 | 
					    "add_pci_info_tag": false,
 | 
				
			||||||
    "add_uuid_meta": false,
 | 
					    "add_uuid_meta": false,
 | 
				
			||||||
    "add_board_number_meta": false,
 | 
					    "add_board_number_meta": false,
 | 
				
			||||||
    "add_serial_meta": false
 | 
					    "add_serial_meta": false,
 | 
				
			||||||
 | 
					    "use_uuid_for_mig_device": false,
 | 
				
			||||||
 | 
					    "use_slice_for_mig_device": false
 | 
				
			||||||
  }
 | 
					  }
 | 
				
			||||||
```
 | 
					```
 | 
				
			||||||
 | 
					
 | 
				
			||||||
The `nvidia` collector can be configured to leave out specific devices with the `exclude_devices` option. It takes IDs as supplied to the NVML with `nvmlDeviceGetHandleByIndex()` or the PCI address in NVML format (`%08X:%02X:%02X.0`). Metrics (listed below) that should not be sent to the MetricRouter can be excluded with the `exclude_metrics` option. Commonly only the physical GPUs are monitored. If MIG devices should be analyzed as well, set `process_mig_devices` (adds `stype=mig,stype-id=<mig_index>`).
 | 
					The `nvidia` collector can be configured to leave out specific devices with the `exclude_devices` option. It takes IDs as supplied to the NVML with `nvmlDeviceGetHandleByIndex()` or the PCI address in NVML format (`%08X:%02X:%02X.0`). Metrics (listed below) that should not be sent to the MetricRouter can be excluded with the `exclude_metrics` option. Commonly only the physical GPUs are monitored. If MIG devices should be analyzed as well, set `process_mig_devices` (adds `stype=mig,stype-id=<mig_index>`). With the options `use_uuid_for_mig_device` and `use_slice_for_mig_device`, the `<mig_index>` can be replaced with the UUID (e.g. `MIG-6a9f7cc8-6d5b-5ce0-92de-750edc4d8849`) or the MIG slice name (e.g. `1g.5gb`). If both options are enabled, `use_uuid_for_mig_device` takes precedence. If the UUID or the slice name cannot be retrieved, the collector falls back to the MIG index as `stype-id`.
 | 
				
			||||||
 | 
					
 | 
				
			||||||
The metrics sent by the `nvidia` collector use `accelerator` as `type` tag. For the `type-id`, it uses the device handle index by default. With the `use_pci_info_as_type_id` option, the PCI ID is used instead. If both values should be added as tags, activate the `add_pci_info_tag` option. It uses the device handle index as `type-id` and adds the PCI ID as separate `pci_identifier` tag.
 | 
					The metrics sent by the `nvidia` collector use `accelerator` as `type` tag. For the `type-id`, it uses the device handle index by default. With the `use_pci_info_as_type_id` option, the PCI ID is used instead. If both values should be added as tags, activate the `add_pci_info_tag` option. It uses the device handle index as `type-id` and adds the PCI ID as separate `pci_identifier` tag.
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 
 | 
				
			|||||||
		Reference in New Issue
	
	Block a user