Add collector to get Nvidia GPM metrics

2026-06-10 13:57:30 +02:00 · 2026-06-02 13:52:44 +02:00
parent 077204d39f
commit 5938368a76
3 changed files with 430 additions and 5 deletions
--- a/collectors/collectorManager.go
+++ b/collectors/collectorManager.go
@@ -50,6 +50,7 @@ var AvailableCollectors = map[string]MetricCollector{
 	"nfsiostat":       new(NfsIOStatCollector),
 	"slurm_cgroup":    new(SlurmCgroupCollector),
 	"smartmon":        new(SmartMonCollector),
 	"nvidia_gpm":      new(NvidiaGPMCollector),
 }
 // Metric collector manager data structure
@@ -99,17 +100,17 @@ func (cm *collectorManager) Init(ticker mct.MultiChanTicker, duration time.Durat
 	// Initialize configured collectors
 	for collectorName, collectorCfg := range cm.config {
 		if _, found := AvailableCollectors[collectorName]; !found {
-			cclog.ComponentError("CollectorManager", "SKIP unknown collector", collectorName)
+			cclog.ComponentErrorf("CollectorManager", "SKIP unknown collector %s", collectorName)
 			continue
 		}
 		collector := AvailableCollectors[collectorName]
 		err := collector.Init(collectorCfg)
 		if err != nil {
-			cclog.ComponentError("CollectorManager", fmt.Sprintf("Collector %s initialization failed: %v", collectorName, err))
+			cclog.ComponentErrorf("CollectorManager", "Collector %s initialization failed: %v", collectorName, err)
 			continue
 		}
-		cclog.ComponentDebug("CollectorManager", "ADD COLLECTOR", collector.Name())
+		cclog.ComponentDebugf("CollectorManager", "ADD COLLECTOR %s", collector.Name())
 		if collector.Parallel() {
 			cm.collectors = append(cm.collectors, collector)
 		} else {
@@ -155,7 +156,7 @@ func (cm *collectorManager) Start() {
 						return
 					default:
 						// Read metrics from collector c via goroutine
-						cclog.ComponentDebug("CollectorManager", c.Name(), t)
+						cclog.ComponentDebugf("CollectorManager: Read %s at %v", c.Name(), t)
 						cm.collector_wg.Add(1)
 						go func(myc MetricCollector) {
 							myc.Read(cm.duration, cm.output)
@@ -173,7 +174,7 @@ func (cm *collectorManager) Start() {
 						return
 					default:
 						// Read metrics from collector c
-						cclog.ComponentDebug("CollectorManager", c.Name(), t)
+						cclog.ComponentDebugf("CollectorManager: Read %s at %v", c.Name(), t)
 						c.Read(cm.duration, cm.output)
 					}
 				}
--- a/collectors/nvidiaGPMMetric.go
+++ b/collectors/nvidiaGPMMetric.go
@@ -0,0 +1,370 @@
 package collectors
 import (
 	"encoding/json"
 	"errors"
 	"fmt"
 	"slices"
 	"strconv"
 	"strings"
 	"time"
 	cclog "github.com/ClusterCockpit/cc-lib/v2/ccLogger"
 	lp "github.com/ClusterCockpit/cc-lib/v2/ccMessage"
 	"github.com/NVIDIA/go-nvml/pkg/nvml"
 )
 type NvidiaGPMMetricDef struct {
 	name    string
 	outname string
 	id      nvml.GpmMetricId
 	unit    string
 }
 var NvidiaGPMMetrics []NvidiaGPMMetricDef = []NvidiaGPMMetricDef{
 	{
 		name:    "GRAPHICS_UTIL",
 		outname: "nv_gpm_graphics_util",
 		id:      nvml.GPM_METRIC_GRAPHICS_UTIL,
 		unit:    "%",
 	},
 	{
 		name:    "SM_UTIL",
 		outname: "nv_gpm_sm_util",
 		id:      nvml.GPM_METRIC_SM_UTIL,
 		unit:    "%",
 	},
 	{
 		name:    "SM_OCCUPANCY",
 		outname: "nv_gpm_sm_occupancy",
 		id:      nvml.GPM_METRIC_SM_OCCUPANCY,
 		unit:    "%",
 	},
 	{
 		name:    "INTEGER_UTIL",
 		outname: "nv_gpm_integer_util",
 		id:      nvml.GPM_METRIC_INTEGER_UTIL,
 		unit:    "%",
 	},
 	{
 		name:    "ANY_TENSOR_UTIL",
 		outname: "nv_gpm_any_tensor_util",
 		id:      nvml.GPM_METRIC_ANY_TENSOR_UTIL,
 		unit:    "%",
 	},
 	{
 		name:    "DFMA_TENSOR_UTIL",
 		outname: "nv_gpm_dfma_tensor_util",
 		id:      nvml.GPM_METRIC_DFMA_TENSOR_UTIL,
 		unit:    "%",
 	},
 	{
 		name:    "HMMA_TENSOR_UTIL",
 		outname: "nv_gpm_hmma_tensor_util",
 		id:      nvml.GPM_METRIC_HMMA_TENSOR_UTIL,
 		unit:    "%",
 	},
 	{
 		name:    "IMMA_TENSOR_UTIL",
 		outname: "nv_gpm_imma_tensor_util",
 		id:      nvml.GPM_METRIC_IMMA_TENSOR_UTIL,
 		unit:    "%",
 	},
 	{
 		name:    "DRAM_BW_UTIL",
 		outname: "nv_gpm_dram_bw_util",
 		id:      nvml.GPM_METRIC_DRAM_BW_UTIL,
 		unit:    "%",
 	},
 	{
 		name:    "FP64_UTIL",
 		outname: "nv_gpm_fp64_util",
 		id:      nvml.GPM_METRIC_FP64_UTIL,
 		unit:    "%",
 	},
 	{
 		name:    "FP32_UTIL",
 		outname: "nv_gpm_fp32_util",
 		id:      nvml.GPM_METRIC_FP32_UTIL,
 		unit:    "%",
 	},
 	{
 		name:    "FP16_UTIL",
 		outname: "nv_gpm_fp16_util",
 		id:      nvml.GPM_METRIC_FP16_UTIL,
 		unit:    "%",
 	},
 }
 type NvidiaGPMCollectorConfig struct {
 	Metrics               []string `json:"metrics,omitempty"`
 	ExcludeDevices        []string `json:"exclude_devices,omitempty"`
 	AddPciInfoTag         bool     `json:"add_pci_info_tag,omitempty"`
 	UsePciInfoAsTypeId    bool     `json:"use_pci_info_as_type_id,omitempty"`
 	AddUuidMeta           bool     `json:"add_uuid_meta,omitempty"`
 	AddBoardNumberMeta    bool     `json:"add_board_number_meta,omitempty"`
 	AddSerialMeta         bool     `json:"add_serial_meta,omitempty"`
 	ProcessMigDevices     bool     `json:"process_mig_devices,omitempty"`
 	UseUuidForMigDevices  bool     `json:"use_uuid_for_mig_device,omitempty"`
 	UseSliceForMigDevices bool     `json:"use_slice_for_mig_device,omitempty"`
 }
 type NvidiaGPMCollectorDevice struct {
 	device        nvml.Device
 	tags          map[string]string
 	meta          map[string]string
 	startTime     time.Time
 	endTime       time.Time
 	measurement   nvml.GpmMetricsGetType
 	metricsLookup map[int]NvidiaGPMMetricDef
 }
 type NvidiaGPMCollector struct {
 	metricCollector
 	config   NvidiaGPMCollectorConfig
 	gpus     []NvidiaGPMCollectorDevice
 	num_gpus int
 }
 func (m *NvidiaGPMCollector) Init(config json.RawMessage) error {
 	var err error = nil
 	m.name = "NvidiaGPMCollector"
 	if err := m.setup(); err != nil {
 		return fmt.Errorf("%s Init(): setup() call failed: %w", m.name, err)
 	}
 	if len(config) > 0 {
 		d := json.NewDecoder(strings.NewReader(string(config)))
 		d.DisallowUnknownFields()
 		if err = d.Decode(&m.config); err != nil {
 			return fmt.Errorf("%s Init(): Error decoding JSON config: %w", m.name, err)
 		}
 	}
 	m.meta = map[string]string{
 		"source": m.name,
 		"group":  "NvidiaGPM",
 	}
 	// Initialize NVIDIA Management Library (NVML)
 	ret := nvml.Init()
 	// Error: NVML library not found
 	// (nvml.ErrorString can not be used in this case)
 	if ret == nvml.ERROR_LIBRARY_NOT_FOUND {
 		return fmt.Errorf("%s Init(): NVML library not found", m.name)
 	}
 	if ret != nvml.SUCCESS {
 		err = errors.New(nvml.ErrorString(ret))
 		return fmt.Errorf("%s Init(): Unable to initialize NVML: %w", m.name, err)
 	}
 	// Number of NVIDIA GPUs
 	num_gpus, ret := nvml.DeviceGetCount()
 	if ret != nvml.SUCCESS {
 		err = errors.New(nvml.ErrorString(ret))
 		return fmt.Errorf("%s Init(): Unable to get device count: %w", m.name, err)
 	}
 	// For all GPUs
 	idx := 0
 	m.gpus = make([]NvidiaGPMCollectorDevice, 0, num_gpus)
 	for i := range num_gpus {
 		// Skip excluded devices by ID
 		str_i := strconv.Itoa(i)
 		if slices.Contains(m.config.ExcludeDevices, str_i) {
 			cclog.ComponentDebugf(m.name, "Skipping excluded device %s", str_i)
 			continue
 		}
 		// Get device handle
 		device, ret := nvml.DeviceGetHandleByIndex(i)
 		if ret != nvml.SUCCESS {
 			err = errors.New(nvml.ErrorString(ret))
 			cclog.ComponentErrorf(m.name, "Unable to get device at index %d: %s", i, err.Error())
 			continue
 		}
 		//supportInfoFunc := nvml.GpmQueryDeviceSupportV(device)
 		supportInfo, ret := nvml.GpmQueryDeviceSupport(device)
 		if supportInfo.IsSupportedDevice == uint32(nvml.FEATURE_DISABLED) {
 			cclog.ComponentErrorf(m.name, "Device at index %d does not support GPM metrics", i)
 			continue
 		}
 		stream, ret := nvml.GpmQueryIfStreamingEnabled(device)
 		if stream == uint32(nvml.FEATURE_DISABLED) {
 			nvml.GpmSetStreamingEnabled(device, uint32(nvml.FEATURE_ENABLED))
 		}
 		// Get device's PCI info
 		pciInfo, ret := nvml.DeviceGetPciInfo(device)
 		if ret != nvml.SUCCESS {
 			err = errors.New(nvml.ErrorString(ret))
 			cclog.ComponentErrorf(m.name, "Unable to get PCI info for device at index %d: %s", i, err.Error())
 			continue
 		}
 		// Create PCI ID in the common format used by the NVML.
 		pci_id := fmt.Sprintf(
 			nvml.DEVICE_PCI_BUS_ID_FMT,
 			pciInfo.Domain,
 			pciInfo.Bus,
 			pciInfo.Device)
 		// Skip excluded devices specified by PCI ID
 		if slices.Contains(m.config.ExcludeDevices, pci_id) {
 			cclog.ComponentDebugf(m.name, "Skipping excluded device %s", pci_id)
 			continue
 		}
 		ss, nvmlErr := nvml.GpmSampleAlloc()
 		if nvmlErr != nvml.SUCCESS {
 			err = errors.New(nvml.ErrorString(ret))
 			cclog.ComponentErrorf(m.name, "Failed to allocate GPM sample for device %d: %s", i, err.Error())
 			continue
 		}
 		es, nvmlErr := nvml.GpmSampleAlloc()
 		if nvmlErr != nvml.SUCCESS {
 			err = errors.New(nvml.ErrorString(ret))
 			cclog.ComponentErrorf(m.name, "Failed to allocate GPM sample for device %d: %s", i, err.Error())
 			continue
 		}
 		// Select which value to use as 'type-id'.
 		// The PCI ID is commonly required in SLURM environments because the
 		// numberic IDs used by SLURM and the ones used by NVML might differ
 		// depending on the job type. The PCI ID is more reliable but is commonly
 		// not recorded for a job, so it must be added manually in prologue or epilogue
 		// e.g. to the comment field
 		tid := str_i
 		if m.config.UsePciInfoAsTypeId {
 			tid = pci_id
 		}
 		// Now we got all infos together, populate the device list
 		g := &m.gpus[idx]
 		// Add device handle
 		g.device = device
 		// Add tags
 		g.tags = map[string]string{
 			"type":    "accelerator",
 			"type-id": tid,
 		}
 		// Add PCI info as tag if not already used as 'type-id'
 		if m.config.AddPciInfoTag && !m.config.UsePciInfoAsTypeId {
 			g.tags["pci_identifier"] = pci_id
 		}
 		g.meta = map[string]string{
 			"source": m.name,
 			"group":  "Nvidia",
 		}
 		if m.config.AddBoardNumberMeta {
 			board, ret := nvml.DeviceGetBoardPartNumber(device)
 			if ret != nvml.SUCCESS {
 				err = errors.New(nvml.ErrorString(ret))
 				cclog.ComponentError(m.name, "Unable to get boart part number for device at index", i, ":", err.Error())
 			} else {
 				g.meta["board_number"] = board
 			}
 		}
 		if m.config.AddSerialMeta {
 			serial, ret := nvml.DeviceGetSerial(device)
 			if ret != nvml.SUCCESS {
 				err = errors.New(nvml.ErrorString(ret))
 				cclog.ComponentError(m.name, "Unable to get serial number for device at index", i, ":", err.Error())
 			} else {
 				g.meta["serial"] = serial
 			}
 		}
 		if m.config.AddUuidMeta {
 			uuid, ret := nvml.DeviceGetUUID(device)
 			if ret != nvml.SUCCESS {
 				err = errors.New(nvml.ErrorString(ret))
 				cclog.ComponentError(m.name, "Unable to get UUID for device at index", i, ":", err.Error())
 			} else {
 				g.meta["uuid"] = uuid
 			}
 		}
 		g.measurement.Sample1 = ss
 		g.measurement.Sample2 = es
 		g.measurement.Version = nvml.GPM_METRICS_GET_VERSION
 		g.metricsLookup = make(map[int]NvidiaGPMMetricDef)
 		metIdx := 0
 		for _, inmetric := range m.config.Metrics {
 			for _, defmetric := range NvidiaGPMMetrics {
 				if inmetric == defmetric.outname {
 					g.measurement.Metrics[metIdx] = nvml.GpmMetric{
 						MetricId: uint32(defmetric.id),
 					}
 					g.metricsLookup[metIdx] = defmetric
 					metIdx += 1
 				}
 			}
 		}
 		g.measurement.NumMetrics = uint32(metIdx)
 	}
 	cclog.ComponentDebugf(m.name, "Found %d Nvidia GPUs with GPM support", len(m.gpus))
 	m.init = true
 	return err
 }
 func (m *NvidiaGPMCollector) Read(interval time.Duration, output chan lp.CCMessage) {
 	var err error
 	if !m.init {
 		return
 	}
 	for i, gpu := range m.gpus {
 		gpu.startTime = time.Now()
 		nvmlErr := gpu.measurement.Sample1.Get(gpu.device)
 		if nvmlErr != nvml.SUCCESS {
 			err = errors.New(nvml.ErrorString(nvmlErr))
 			cclog.ComponentError(m.name, "Unable to get start GPM sample for device at index", i, ":", err.Error())
 			continue
 		}
 	}
 	time.Sleep(interval)
 	for i, gpu := range m.gpus {
 		gpu.endTime = time.Now()
 		nvmlErr := gpu.measurement.Sample2.Get(gpu.device)
 		if nvmlErr != nvml.SUCCESS {
 			err = errors.New(nvml.ErrorString(nvmlErr))
 			cclog.ComponentError(m.name, "Unable to get stop GPM sample for device at index", i, ":", err.Error())
 			continue
 		}
 	}
 	for i, gpu := range m.gpus {
 		nvmlErr := nvml.GpmMetricsGet(&gpu.measurement)
 		if nvmlErr != nvml.SUCCESS {
 			err = errors.New(nvml.ErrorString(nvmlErr))
 			cclog.ComponentError(m.name, "Unable to get evaluate GPM sample for device at index", i, ":", err.Error())
 			continue
 		}
 		for idx, metricDef := range gpu.metricsLookup {
 			y, err := lp.NewMetric(metricDef.outname, gpu.tags, gpu.meta, gpu.measurement.Metrics[idx].Value, time.Now())
 			if err == nil {
 				y.AddMeta("unit", metricDef.unit)
 				output <- y
 			}
 		}
 	}
 }
 func (m *NvidiaGPMCollector) Close() {
 	if m.init {
 		for _, gpu := range m.gpus {
 			gpu.measurement.Sample1.Free()
 			gpu.measurement.Sample2.Free()
 		}
 		if ret := nvml.Shutdown(); ret != nvml.SUCCESS {
 			cclog.ComponentError(m.name, "nvml.Shutdown() not successful")
 		}
 		m.init = false
 	}
 }
--- a/collectors/nvidiaGPMMetric.md
+++ b/collectors/nvidiaGPMMetric.md
@@ -0,0 +1,54 @@
 <!--
 ---
 title: "Nvidia NVML GPM metric collector"
 description: Collect metrics for Nvidia GPUs using the NVML GPM interface
 categories: [cc-metric-collector]
 tags: ['Admin']
 weight: 2
 hugo_path: docs/reference/cc-metric-collector/collectors/nvidiaGPM.md
 ---
 -->
 ## `nvidiaGPM` collector
 ```json
  "nvidia_gpm": {
    "metrics": [
      "nv_fb_mem_used",
      "nv_fan"
    ],
    "exclude_devices": [
      "0","1", "0000000:ff:01.0"
    ],
    "process_mig_devices": false,
    "use_pci_info_as_type_id": true,
    "add_pci_info_tag": false,
    "add_uuid_meta": false,
    "add_board_number_meta": false,
    "add_serial_meta": false,
    "use_uuid_for_mig_device": false,
    "use_slice_for_mig_device": false
  }
 ```
 The `nvidia_gpm` collector can be configured to leave out specific devices with the `exclude_devices` option. It takes IDs as supplied to the NVML with `nvmlDeviceGetHandleByIndex()` or the PCI address in NVML format (`%08X:%02X:%02X.0`). Commonly only the physical GPUs are monitored. If MIG devices should be analyzed as well, set `process_mig_devices` (adds `stype=mig,stype-id=<mig_index>`). With the options `use_uuid_for_mig_device` and `use_slice_for_mig_device`, the `<mig_index>` can be replaced with the UUID (e.g. `MIG-6a9f7cc8-6d5b-5ce0-92de-750edc4d8849`) or the MIG slice name (e.g. `1g.5gb`).
 The metrics sent by the `nvidia_gpm` collector use `accelerator` as `type` tag. For the `type-id`, it uses the device handle index by default. With the `use_pci_info_as_type_id` option, the PCI ID is used instead. If both values should be added as tags, activate the `add_pci_info_tag` option. It uses the device handle index as `type-id` and adds the PCI ID as separate `pci_identifier` tag.
 Optionally, it is possible to add the UUID, the board part number and the serial to the meta informations. They are not sent to the sinks (if not configured otherwise).
 Available Metrics:
 * `nv_gpm_graphics_util`
 * `nv_gpm_sm_util`
 * `nv_gpm_sm_occupancy`
 * `nv_gpm_integer_util`
 * `nv_gpm_any_tensor_util`
 * `nv_gpm_dfma_tensor_util`
 * `nv_gpm_hmma_tensor_util`
 * `nv_gpm_imma_tensor_util`
 * `nv_gpm_dram_bw_util`
 * `nv_gpm_fp64_util`
 * `nv_gpm_fp32_util`
 * `nv_gpm_fp16_util`