From 5938368a7625abdfd081ad4f7d5418fdd6ebacc2 Mon Sep 17 00:00:00 2001 From: Thomas Roehl Date: Tue, 2 Jun 2026 13:52:44 +0200 Subject: [PATCH] Add collector to get Nvidia GPM metrics --- collectors/collectorManager.go | 11 +- collectors/nvidiaGPMMetric.go | 370 +++++++++++++++++++++++++++++++++ collectors/nvidiaGPMMetric.md | 54 +++++ 3 files changed, 430 insertions(+), 5 deletions(-) create mode 100644 collectors/nvidiaGPMMetric.go create mode 100644 collectors/nvidiaGPMMetric.md diff --git a/collectors/collectorManager.go b/collectors/collectorManager.go index 679bdd2..d0b3e43 100644 --- a/collectors/collectorManager.go +++ b/collectors/collectorManager.go @@ -50,6 +50,7 @@ var AvailableCollectors = map[string]MetricCollector{ "nfsiostat": new(NfsIOStatCollector), "slurm_cgroup": new(SlurmCgroupCollector), "smartmon": new(SmartMonCollector), + "nvidia_gpm": new(NvidiaGPMCollector), } // Metric collector manager data structure @@ -99,17 +100,17 @@ func (cm *collectorManager) Init(ticker mct.MultiChanTicker, duration time.Durat // Initialize configured collectors for collectorName, collectorCfg := range cm.config { if _, found := AvailableCollectors[collectorName]; !found { - cclog.ComponentError("CollectorManager", "SKIP unknown collector", collectorName) + cclog.ComponentErrorf("CollectorManager", "SKIP unknown collector %s", collectorName) continue } collector := AvailableCollectors[collectorName] err := collector.Init(collectorCfg) if err != nil { - cclog.ComponentError("CollectorManager", fmt.Sprintf("Collector %s initialization failed: %v", collectorName, err)) + cclog.ComponentErrorf("CollectorManager", "Collector %s initialization failed: %v", collectorName, err) continue } - cclog.ComponentDebug("CollectorManager", "ADD COLLECTOR", collector.Name()) + cclog.ComponentDebugf("CollectorManager", "ADD COLLECTOR %s", collector.Name()) if collector.Parallel() { cm.collectors = append(cm.collectors, collector) } else { @@ -155,7 +156,7 @@ func (cm *collectorManager) Start() { return default: // Read metrics from collector c via goroutine - cclog.ComponentDebug("CollectorManager", c.Name(), t) + cclog.ComponentDebugf("CollectorManager: Read %s at %v", c.Name(), t) cm.collector_wg.Add(1) go func(myc MetricCollector) { myc.Read(cm.duration, cm.output) @@ -173,7 +174,7 @@ func (cm *collectorManager) Start() { return default: // Read metrics from collector c - cclog.ComponentDebug("CollectorManager", c.Name(), t) + cclog.ComponentDebugf("CollectorManager: Read %s at %v", c.Name(), t) c.Read(cm.duration, cm.output) } } diff --git a/collectors/nvidiaGPMMetric.go b/collectors/nvidiaGPMMetric.go new file mode 100644 index 0000000..62bf19f --- /dev/null +++ b/collectors/nvidiaGPMMetric.go @@ -0,0 +1,370 @@ +package collectors + +import ( + "encoding/json" + "errors" + "fmt" + "slices" + "strconv" + "strings" + "time" + + cclog "github.com/ClusterCockpit/cc-lib/v2/ccLogger" + lp "github.com/ClusterCockpit/cc-lib/v2/ccMessage" + "github.com/NVIDIA/go-nvml/pkg/nvml" +) + +type NvidiaGPMMetricDef struct { + name string + outname string + id nvml.GpmMetricId + unit string +} + +var NvidiaGPMMetrics []NvidiaGPMMetricDef = []NvidiaGPMMetricDef{ + { + name: "GRAPHICS_UTIL", + outname: "nv_gpm_graphics_util", + id: nvml.GPM_METRIC_GRAPHICS_UTIL, + unit: "%", + }, + { + name: "SM_UTIL", + outname: "nv_gpm_sm_util", + id: nvml.GPM_METRIC_SM_UTIL, + unit: "%", + }, + { + name: "SM_OCCUPANCY", + outname: "nv_gpm_sm_occupancy", + id: nvml.GPM_METRIC_SM_OCCUPANCY, + unit: "%", + }, + { + name: "INTEGER_UTIL", + outname: "nv_gpm_integer_util", + id: nvml.GPM_METRIC_INTEGER_UTIL, + unit: "%", + }, + { + name: "ANY_TENSOR_UTIL", + outname: "nv_gpm_any_tensor_util", + id: nvml.GPM_METRIC_ANY_TENSOR_UTIL, + unit: "%", + }, + { + name: "DFMA_TENSOR_UTIL", + outname: "nv_gpm_dfma_tensor_util", + id: nvml.GPM_METRIC_DFMA_TENSOR_UTIL, + unit: "%", + }, + { + name: "HMMA_TENSOR_UTIL", + outname: "nv_gpm_hmma_tensor_util", + id: nvml.GPM_METRIC_HMMA_TENSOR_UTIL, + unit: "%", + }, + { + name: "IMMA_TENSOR_UTIL", + outname: "nv_gpm_imma_tensor_util", + id: nvml.GPM_METRIC_IMMA_TENSOR_UTIL, + unit: "%", + }, + { + name: "DRAM_BW_UTIL", + outname: "nv_gpm_dram_bw_util", + id: nvml.GPM_METRIC_DRAM_BW_UTIL, + unit: "%", + }, + { + name: "FP64_UTIL", + outname: "nv_gpm_fp64_util", + id: nvml.GPM_METRIC_FP64_UTIL, + unit: "%", + }, + { + name: "FP32_UTIL", + outname: "nv_gpm_fp32_util", + id: nvml.GPM_METRIC_FP32_UTIL, + unit: "%", + }, + { + name: "FP16_UTIL", + outname: "nv_gpm_fp16_util", + id: nvml.GPM_METRIC_FP16_UTIL, + unit: "%", + }, +} + +type NvidiaGPMCollectorConfig struct { + Metrics []string `json:"metrics,omitempty"` + ExcludeDevices []string `json:"exclude_devices,omitempty"` + AddPciInfoTag bool `json:"add_pci_info_tag,omitempty"` + UsePciInfoAsTypeId bool `json:"use_pci_info_as_type_id,omitempty"` + AddUuidMeta bool `json:"add_uuid_meta,omitempty"` + AddBoardNumberMeta bool `json:"add_board_number_meta,omitempty"` + AddSerialMeta bool `json:"add_serial_meta,omitempty"` + ProcessMigDevices bool `json:"process_mig_devices,omitempty"` + UseUuidForMigDevices bool `json:"use_uuid_for_mig_device,omitempty"` + UseSliceForMigDevices bool `json:"use_slice_for_mig_device,omitempty"` +} + +type NvidiaGPMCollectorDevice struct { + device nvml.Device + tags map[string]string + meta map[string]string + startTime time.Time + endTime time.Time + measurement nvml.GpmMetricsGetType + metricsLookup map[int]NvidiaGPMMetricDef +} + +type NvidiaGPMCollector struct { + metricCollector + + config NvidiaGPMCollectorConfig + gpus []NvidiaGPMCollectorDevice + num_gpus int +} + +func (m *NvidiaGPMCollector) Init(config json.RawMessage) error { + var err error = nil + m.name = "NvidiaGPMCollector" + if err := m.setup(); err != nil { + return fmt.Errorf("%s Init(): setup() call failed: %w", m.name, err) + } + if len(config) > 0 { + d := json.NewDecoder(strings.NewReader(string(config))) + d.DisallowUnknownFields() + if err = d.Decode(&m.config); err != nil { + return fmt.Errorf("%s Init(): Error decoding JSON config: %w", m.name, err) + } + } + m.meta = map[string]string{ + "source": m.name, + "group": "NvidiaGPM", + } + + // Initialize NVIDIA Management Library (NVML) + ret := nvml.Init() + + // Error: NVML library not found + // (nvml.ErrorString can not be used in this case) + if ret == nvml.ERROR_LIBRARY_NOT_FOUND { + return fmt.Errorf("%s Init(): NVML library not found", m.name) + } + if ret != nvml.SUCCESS { + err = errors.New(nvml.ErrorString(ret)) + return fmt.Errorf("%s Init(): Unable to initialize NVML: %w", m.name, err) + } + + // Number of NVIDIA GPUs + num_gpus, ret := nvml.DeviceGetCount() + if ret != nvml.SUCCESS { + err = errors.New(nvml.ErrorString(ret)) + return fmt.Errorf("%s Init(): Unable to get device count: %w", m.name, err) + } + + // For all GPUs + idx := 0 + m.gpus = make([]NvidiaGPMCollectorDevice, 0, num_gpus) + for i := range num_gpus { + + // Skip excluded devices by ID + str_i := strconv.Itoa(i) + if slices.Contains(m.config.ExcludeDevices, str_i) { + cclog.ComponentDebugf(m.name, "Skipping excluded device %s", str_i) + continue + } + + // Get device handle + device, ret := nvml.DeviceGetHandleByIndex(i) + if ret != nvml.SUCCESS { + err = errors.New(nvml.ErrorString(ret)) + cclog.ComponentErrorf(m.name, "Unable to get device at index %d: %s", i, err.Error()) + continue + } + + //supportInfoFunc := nvml.GpmQueryDeviceSupportV(device) + supportInfo, ret := nvml.GpmQueryDeviceSupport(device) + if supportInfo.IsSupportedDevice == uint32(nvml.FEATURE_DISABLED) { + cclog.ComponentErrorf(m.name, "Device at index %d does not support GPM metrics", i) + continue + } + stream, ret := nvml.GpmQueryIfStreamingEnabled(device) + if stream == uint32(nvml.FEATURE_DISABLED) { + nvml.GpmSetStreamingEnabled(device, uint32(nvml.FEATURE_ENABLED)) + } + + // Get device's PCI info + pciInfo, ret := nvml.DeviceGetPciInfo(device) + if ret != nvml.SUCCESS { + err = errors.New(nvml.ErrorString(ret)) + cclog.ComponentErrorf(m.name, "Unable to get PCI info for device at index %d: %s", i, err.Error()) + continue + } + // Create PCI ID in the common format used by the NVML. + pci_id := fmt.Sprintf( + nvml.DEVICE_PCI_BUS_ID_FMT, + pciInfo.Domain, + pciInfo.Bus, + pciInfo.Device) + + // Skip excluded devices specified by PCI ID + if slices.Contains(m.config.ExcludeDevices, pci_id) { + cclog.ComponentDebugf(m.name, "Skipping excluded device %s", pci_id) + continue + } + ss, nvmlErr := nvml.GpmSampleAlloc() + if nvmlErr != nvml.SUCCESS { + err = errors.New(nvml.ErrorString(ret)) + cclog.ComponentErrorf(m.name, "Failed to allocate GPM sample for device %d: %s", i, err.Error()) + continue + } + es, nvmlErr := nvml.GpmSampleAlloc() + if nvmlErr != nvml.SUCCESS { + err = errors.New(nvml.ErrorString(ret)) + cclog.ComponentErrorf(m.name, "Failed to allocate GPM sample for device %d: %s", i, err.Error()) + continue + } + + // Select which value to use as 'type-id'. + // The PCI ID is commonly required in SLURM environments because the + // numberic IDs used by SLURM and the ones used by NVML might differ + // depending on the job type. The PCI ID is more reliable but is commonly + // not recorded for a job, so it must be added manually in prologue or epilogue + // e.g. to the comment field + tid := str_i + if m.config.UsePciInfoAsTypeId { + tid = pci_id + } + + // Now we got all infos together, populate the device list + g := &m.gpus[idx] + + // Add device handle + g.device = device + + // Add tags + g.tags = map[string]string{ + "type": "accelerator", + "type-id": tid, + } + + // Add PCI info as tag if not already used as 'type-id' + if m.config.AddPciInfoTag && !m.config.UsePciInfoAsTypeId { + g.tags["pci_identifier"] = pci_id + } + + g.meta = map[string]string{ + "source": m.name, + "group": "Nvidia", + } + + if m.config.AddBoardNumberMeta { + board, ret := nvml.DeviceGetBoardPartNumber(device) + if ret != nvml.SUCCESS { + err = errors.New(nvml.ErrorString(ret)) + cclog.ComponentError(m.name, "Unable to get boart part number for device at index", i, ":", err.Error()) + } else { + g.meta["board_number"] = board + } + } + if m.config.AddSerialMeta { + serial, ret := nvml.DeviceGetSerial(device) + if ret != nvml.SUCCESS { + err = errors.New(nvml.ErrorString(ret)) + cclog.ComponentError(m.name, "Unable to get serial number for device at index", i, ":", err.Error()) + } else { + g.meta["serial"] = serial + } + } + if m.config.AddUuidMeta { + uuid, ret := nvml.DeviceGetUUID(device) + if ret != nvml.SUCCESS { + err = errors.New(nvml.ErrorString(ret)) + cclog.ComponentError(m.name, "Unable to get UUID for device at index", i, ":", err.Error()) + } else { + g.meta["uuid"] = uuid + } + } + + g.measurement.Sample1 = ss + g.measurement.Sample2 = es + g.measurement.Version = nvml.GPM_METRICS_GET_VERSION + g.metricsLookup = make(map[int]NvidiaGPMMetricDef) + metIdx := 0 + for _, inmetric := range m.config.Metrics { + for _, defmetric := range NvidiaGPMMetrics { + if inmetric == defmetric.outname { + g.measurement.Metrics[metIdx] = nvml.GpmMetric{ + MetricId: uint32(defmetric.id), + } + g.metricsLookup[metIdx] = defmetric + metIdx += 1 + } + } + } + g.measurement.NumMetrics = uint32(metIdx) + } + cclog.ComponentDebugf(m.name, "Found %d Nvidia GPUs with GPM support", len(m.gpus)) + m.init = true + return err +} + +func (m *NvidiaGPMCollector) Read(interval time.Duration, output chan lp.CCMessage) { + var err error + if !m.init { + return + } + for i, gpu := range m.gpus { + gpu.startTime = time.Now() + nvmlErr := gpu.measurement.Sample1.Get(gpu.device) + if nvmlErr != nvml.SUCCESS { + err = errors.New(nvml.ErrorString(nvmlErr)) + cclog.ComponentError(m.name, "Unable to get start GPM sample for device at index", i, ":", err.Error()) + continue + } + } + time.Sleep(interval) + + for i, gpu := range m.gpus { + gpu.endTime = time.Now() + nvmlErr := gpu.measurement.Sample2.Get(gpu.device) + if nvmlErr != nvml.SUCCESS { + err = errors.New(nvml.ErrorString(nvmlErr)) + cclog.ComponentError(m.name, "Unable to get stop GPM sample for device at index", i, ":", err.Error()) + continue + } + } + + for i, gpu := range m.gpus { + nvmlErr := nvml.GpmMetricsGet(&gpu.measurement) + if nvmlErr != nvml.SUCCESS { + err = errors.New(nvml.ErrorString(nvmlErr)) + cclog.ComponentError(m.name, "Unable to get evaluate GPM sample for device at index", i, ":", err.Error()) + continue + } + for idx, metricDef := range gpu.metricsLookup { + y, err := lp.NewMetric(metricDef.outname, gpu.tags, gpu.meta, gpu.measurement.Metrics[idx].Value, time.Now()) + if err == nil { + y.AddMeta("unit", metricDef.unit) + output <- y + } + } + } + +} + +func (m *NvidiaGPMCollector) Close() { + if m.init { + for _, gpu := range m.gpus { + gpu.measurement.Sample1.Free() + gpu.measurement.Sample2.Free() + } + if ret := nvml.Shutdown(); ret != nvml.SUCCESS { + cclog.ComponentError(m.name, "nvml.Shutdown() not successful") + } + m.init = false + } +} diff --git a/collectors/nvidiaGPMMetric.md b/collectors/nvidiaGPMMetric.md new file mode 100644 index 0000000..db4f41a --- /dev/null +++ b/collectors/nvidiaGPMMetric.md @@ -0,0 +1,54 @@ + + +## `nvidiaGPM` collector + +```json + "nvidia_gpm": { + "metrics": [ + "nv_fb_mem_used", + "nv_fan" + ], + "exclude_devices": [ + "0","1", "0000000:ff:01.0" + ], + + "process_mig_devices": false, + "use_pci_info_as_type_id": true, + "add_pci_info_tag": false, + "add_uuid_meta": false, + "add_board_number_meta": false, + "add_serial_meta": false, + "use_uuid_for_mig_device": false, + "use_slice_for_mig_device": false + } +``` + +The `nvidia_gpm` collector can be configured to leave out specific devices with the `exclude_devices` option. It takes IDs as supplied to the NVML with `nvmlDeviceGetHandleByIndex()` or the PCI address in NVML format (`%08X:%02X:%02X.0`). Commonly only the physical GPUs are monitored. If MIG devices should be analyzed as well, set `process_mig_devices` (adds `stype=mig,stype-id=`). With the options `use_uuid_for_mig_device` and `use_slice_for_mig_device`, the `` can be replaced with the UUID (e.g. `MIG-6a9f7cc8-6d5b-5ce0-92de-750edc4d8849`) or the MIG slice name (e.g. `1g.5gb`). + +The metrics sent by the `nvidia_gpm` collector use `accelerator` as `type` tag. For the `type-id`, it uses the device handle index by default. With the `use_pci_info_as_type_id` option, the PCI ID is used instead. If both values should be added as tags, activate the `add_pci_info_tag` option. It uses the device handle index as `type-id` and adds the PCI ID as separate `pci_identifier` tag. + +Optionally, it is possible to add the UUID, the board part number and the serial to the meta informations. They are not sent to the sinks (if not configured otherwise). + + +Available Metrics: +* `nv_gpm_graphics_util` +* `nv_gpm_sm_util` +* `nv_gpm_sm_occupancy` +* `nv_gpm_integer_util` +* `nv_gpm_any_tensor_util` +* `nv_gpm_dfma_tensor_util` +* `nv_gpm_hmma_tensor_util` +* `nv_gpm_imma_tensor_util` +* `nv_gpm_dram_bw_util` +* `nv_gpm_fp64_util` +* `nv_gpm_fp32_util` +* `nv_gpm_fp16_util` \ No newline at end of file