Add collector to get Nvidia GPM metrics

2026-06-10 05:47:30 +02:00 · 2026-06-02 13:52:44 +02:00
parent 077204d39f
commit 5938368a76
3 changed files with 430 additions and 5 deletions
--- a/collectors/nvidiaGPMMetric.go
+++ b/collectors/nvidiaGPMMetric.go
@@ -0,0 +1,370 @@
+package collectors
+
+import (
+	"encoding/json"
+	"errors"
+	"fmt"
+	"slices"
+	"strconv"
+	"strings"
+	"time"
+
+	cclog "github.com/ClusterCockpit/cc-lib/v2/ccLogger"
+	lp "github.com/ClusterCockpit/cc-lib/v2/ccMessage"
+	"github.com/NVIDIA/go-nvml/pkg/nvml"
+)
+
+type NvidiaGPMMetricDef struct {
+	name    string
+	outname string
+	id      nvml.GpmMetricId
+	unit    string
+}
+
+var NvidiaGPMMetrics []NvidiaGPMMetricDef = []NvidiaGPMMetricDef{
+	{
+		name:    "GRAPHICS_UTIL",
+		outname: "nv_gpm_graphics_util",
+		id:      nvml.GPM_METRIC_GRAPHICS_UTIL,
+		unit:    "%",
+	},
+	{
+		name:    "SM_UTIL",
+		outname: "nv_gpm_sm_util",
+		id:      nvml.GPM_METRIC_SM_UTIL,
+		unit:    "%",
+	},
+	{
+		name:    "SM_OCCUPANCY",
+		outname: "nv_gpm_sm_occupancy",
+		id:      nvml.GPM_METRIC_SM_OCCUPANCY,
+		unit:    "%",
+	},
+	{
+		name:    "INTEGER_UTIL",
+		outname: "nv_gpm_integer_util",
+		id:      nvml.GPM_METRIC_INTEGER_UTIL,
+		unit:    "%",
+	},
+	{
+		name:    "ANY_TENSOR_UTIL",
+		outname: "nv_gpm_any_tensor_util",
+		id:      nvml.GPM_METRIC_ANY_TENSOR_UTIL,
+		unit:    "%",
+	},
+	{
+		name:    "DFMA_TENSOR_UTIL",
+		outname: "nv_gpm_dfma_tensor_util",
+		id:      nvml.GPM_METRIC_DFMA_TENSOR_UTIL,
+		unit:    "%",
+	},
+	{
+		name:    "HMMA_TENSOR_UTIL",
+		outname: "nv_gpm_hmma_tensor_util",
+		id:      nvml.GPM_METRIC_HMMA_TENSOR_UTIL,
+		unit:    "%",
+	},
+	{
+		name:    "IMMA_TENSOR_UTIL",
+		outname: "nv_gpm_imma_tensor_util",
+		id:      nvml.GPM_METRIC_IMMA_TENSOR_UTIL,
+		unit:    "%",
+	},
+	{
+		name:    "DRAM_BW_UTIL",
+		outname: "nv_gpm_dram_bw_util",
+		id:      nvml.GPM_METRIC_DRAM_BW_UTIL,
+		unit:    "%",
+	},
+	{
+		name:    "FP64_UTIL",
+		outname: "nv_gpm_fp64_util",
+		id:      nvml.GPM_METRIC_FP64_UTIL,
+		unit:    "%",
+	},
+	{
+		name:    "FP32_UTIL",
+		outname: "nv_gpm_fp32_util",
+		id:      nvml.GPM_METRIC_FP32_UTIL,
+		unit:    "%",
+	},
+	{
+		name:    "FP16_UTIL",
+		outname: "nv_gpm_fp16_util",
+		id:      nvml.GPM_METRIC_FP16_UTIL,
+		unit:    "%",
+	},
+}
+
+type NvidiaGPMCollectorConfig struct {
+	Metrics               []string `json:"metrics,omitempty"`
+	ExcludeDevices        []string `json:"exclude_devices,omitempty"`
+	AddPciInfoTag         bool     `json:"add_pci_info_tag,omitempty"`
+	UsePciInfoAsTypeId    bool     `json:"use_pci_info_as_type_id,omitempty"`
+	AddUuidMeta           bool     `json:"add_uuid_meta,omitempty"`
+	AddBoardNumberMeta    bool     `json:"add_board_number_meta,omitempty"`
+	AddSerialMeta         bool     `json:"add_serial_meta,omitempty"`
+	ProcessMigDevices     bool     `json:"process_mig_devices,omitempty"`
+	UseUuidForMigDevices  bool     `json:"use_uuid_for_mig_device,omitempty"`
+	UseSliceForMigDevices bool     `json:"use_slice_for_mig_device,omitempty"`
+}
+
+type NvidiaGPMCollectorDevice struct {
+	device        nvml.Device
+	tags          map[string]string
+	meta          map[string]string
+	startTime     time.Time
+	endTime       time.Time
+	measurement   nvml.GpmMetricsGetType
+	metricsLookup map[int]NvidiaGPMMetricDef
+}
+
+type NvidiaGPMCollector struct {
+	metricCollector
+
+	config   NvidiaGPMCollectorConfig
+	gpus     []NvidiaGPMCollectorDevice
+	num_gpus int
+}
+
+func (m *NvidiaGPMCollector) Init(config json.RawMessage) error {
+	var err error = nil
+	m.name = "NvidiaGPMCollector"
+	if err := m.setup(); err != nil {
+		return fmt.Errorf("%s Init(): setup() call failed: %w", m.name, err)
+	}
+	if len(config) > 0 {
+		d := json.NewDecoder(strings.NewReader(string(config)))
+		d.DisallowUnknownFields()
+		if err = d.Decode(&m.config); err != nil {
+			return fmt.Errorf("%s Init(): Error decoding JSON config: %w", m.name, err)
+		}
+	}
+	m.meta = map[string]string{
+		"source": m.name,
+		"group":  "NvidiaGPM",
+	}
+
+	// Initialize NVIDIA Management Library (NVML)
+	ret := nvml.Init()
+
+	// Error: NVML library not found
+	// (nvml.ErrorString can not be used in this case)
+	if ret == nvml.ERROR_LIBRARY_NOT_FOUND {
+		return fmt.Errorf("%s Init(): NVML library not found", m.name)
+	}
+	if ret != nvml.SUCCESS {
+		err = errors.New(nvml.ErrorString(ret))
+		return fmt.Errorf("%s Init(): Unable to initialize NVML: %w", m.name, err)
+	}
+
+	// Number of NVIDIA GPUs
+	num_gpus, ret := nvml.DeviceGetCount()
+	if ret != nvml.SUCCESS {
+		err = errors.New(nvml.ErrorString(ret))
+		return fmt.Errorf("%s Init(): Unable to get device count: %w", m.name, err)
+	}
+
+	// For all GPUs
+	idx := 0
+	m.gpus = make([]NvidiaGPMCollectorDevice, 0, num_gpus)
+	for i := range num_gpus {
+
+		// Skip excluded devices by ID
+		str_i := strconv.Itoa(i)
+		if slices.Contains(m.config.ExcludeDevices, str_i) {
+			cclog.ComponentDebugf(m.name, "Skipping excluded device %s", str_i)
+			continue
+		}
+
+		// Get device handle
+		device, ret := nvml.DeviceGetHandleByIndex(i)
+		if ret != nvml.SUCCESS {
+			err = errors.New(nvml.ErrorString(ret))
+			cclog.ComponentErrorf(m.name, "Unable to get device at index %d: %s", i, err.Error())
+			continue
+		}
+
+		//supportInfoFunc := nvml.GpmQueryDeviceSupportV(device)
+		supportInfo, ret := nvml.GpmQueryDeviceSupport(device)
+		if supportInfo.IsSupportedDevice == uint32(nvml.FEATURE_DISABLED) {
+			cclog.ComponentErrorf(m.name, "Device at index %d does not support GPM metrics", i)
+			continue
+		}
+		stream, ret := nvml.GpmQueryIfStreamingEnabled(device)
+		if stream == uint32(nvml.FEATURE_DISABLED) {
+			nvml.GpmSetStreamingEnabled(device, uint32(nvml.FEATURE_ENABLED))
+		}
+
+		// Get device's PCI info
+		pciInfo, ret := nvml.DeviceGetPciInfo(device)
+		if ret != nvml.SUCCESS {
+			err = errors.New(nvml.ErrorString(ret))
+			cclog.ComponentErrorf(m.name, "Unable to get PCI info for device at index %d: %s", i, err.Error())
+			continue
+		}
+		// Create PCI ID in the common format used by the NVML.
+		pci_id := fmt.Sprintf(
+			nvml.DEVICE_PCI_BUS_ID_FMT,
+			pciInfo.Domain,
+			pciInfo.Bus,
+			pciInfo.Device)
+
+		// Skip excluded devices specified by PCI ID
+		if slices.Contains(m.config.ExcludeDevices, pci_id) {
+			cclog.ComponentDebugf(m.name, "Skipping excluded device %s", pci_id)
+			continue
+		}
+		ss, nvmlErr := nvml.GpmSampleAlloc()
+		if nvmlErr != nvml.SUCCESS {
+			err = errors.New(nvml.ErrorString(ret))
+			cclog.ComponentErrorf(m.name, "Failed to allocate GPM sample for device %d: %s", i, err.Error())
+			continue
+		}
+		es, nvmlErr := nvml.GpmSampleAlloc()
+		if nvmlErr != nvml.SUCCESS {
+			err = errors.New(nvml.ErrorString(ret))
+			cclog.ComponentErrorf(m.name, "Failed to allocate GPM sample for device %d: %s", i, err.Error())
+			continue
+		}
+
+		// Select which value to use as 'type-id'.
+		// The PCI ID is commonly required in SLURM environments because the
+		// numberic IDs used by SLURM and the ones used by NVML might differ
+		// depending on the job type. The PCI ID is more reliable but is commonly
+		// not recorded for a job, so it must be added manually in prologue or epilogue
+		// e.g. to the comment field
+		tid := str_i
+		if m.config.UsePciInfoAsTypeId {
+			tid = pci_id
+		}
+
+		// Now we got all infos together, populate the device list
+		g := &m.gpus[idx]
+
+		// Add device handle
+		g.device = device
+
+		// Add tags
+		g.tags = map[string]string{
+			"type":    "accelerator",
+			"type-id": tid,
+		}
+
+		// Add PCI info as tag if not already used as 'type-id'
+		if m.config.AddPciInfoTag && !m.config.UsePciInfoAsTypeId {
+			g.tags["pci_identifier"] = pci_id
+		}
+
+		g.meta = map[string]string{
+			"source": m.name,
+			"group":  "Nvidia",
+		}
+
+		if m.config.AddBoardNumberMeta {
+			board, ret := nvml.DeviceGetBoardPartNumber(device)
+			if ret != nvml.SUCCESS {
+				err = errors.New(nvml.ErrorString(ret))
+				cclog.ComponentError(m.name, "Unable to get boart part number for device at index", i, ":", err.Error())
+			} else {
+				g.meta["board_number"] = board
+			}
+		}
+		if m.config.AddSerialMeta {
+			serial, ret := nvml.DeviceGetSerial(device)
+			if ret != nvml.SUCCESS {
+				err = errors.New(nvml.ErrorString(ret))
+				cclog.ComponentError(m.name, "Unable to get serial number for device at index", i, ":", err.Error())
+			} else {
+				g.meta["serial"] = serial
+			}
+		}
+		if m.config.AddUuidMeta {
+			uuid, ret := nvml.DeviceGetUUID(device)
+			if ret != nvml.SUCCESS {
+				err = errors.New(nvml.ErrorString(ret))
+				cclog.ComponentError(m.name, "Unable to get UUID for device at index", i, ":", err.Error())
+			} else {
+				g.meta["uuid"] = uuid
+			}
+		}
+
+		g.measurement.Sample1 = ss
+		g.measurement.Sample2 = es
+		g.measurement.Version = nvml.GPM_METRICS_GET_VERSION
+		g.metricsLookup = make(map[int]NvidiaGPMMetricDef)
+		metIdx := 0
+		for _, inmetric := range m.config.Metrics {
+			for _, defmetric := range NvidiaGPMMetrics {
+				if inmetric == defmetric.outname {
+					g.measurement.Metrics[metIdx] = nvml.GpmMetric{
+						MetricId: uint32(defmetric.id),
+					}
+					g.metricsLookup[metIdx] = defmetric
+					metIdx += 1
+				}
+			}
+		}
+		g.measurement.NumMetrics = uint32(metIdx)
+	}
+	cclog.ComponentDebugf(m.name, "Found %d Nvidia GPUs with GPM support", len(m.gpus))
+	m.init = true
+	return err
+}
+
+func (m *NvidiaGPMCollector) Read(interval time.Duration, output chan lp.CCMessage) {
+	var err error
+	if !m.init {
+		return
+	}
+	for i, gpu := range m.gpus {
+		gpu.startTime = time.Now()
+		nvmlErr := gpu.measurement.Sample1.Get(gpu.device)
+		if nvmlErr != nvml.SUCCESS {
+			err = errors.New(nvml.ErrorString(nvmlErr))
+			cclog.ComponentError(m.name, "Unable to get start GPM sample for device at index", i, ":", err.Error())
+			continue
+		}
+	}
+	time.Sleep(interval)
+
+	for i, gpu := range m.gpus {
+		gpu.endTime = time.Now()
+		nvmlErr := gpu.measurement.Sample2.Get(gpu.device)
+		if nvmlErr != nvml.SUCCESS {
+			err = errors.New(nvml.ErrorString(nvmlErr))
+			cclog.ComponentError(m.name, "Unable to get stop GPM sample for device at index", i, ":", err.Error())
+			continue
+		}
+	}
+
+	for i, gpu := range m.gpus {
+		nvmlErr := nvml.GpmMetricsGet(&gpu.measurement)
+		if nvmlErr != nvml.SUCCESS {
+			err = errors.New(nvml.ErrorString(nvmlErr))
+			cclog.ComponentError(m.name, "Unable to get evaluate GPM sample for device at index", i, ":", err.Error())
+			continue
+		}
+		for idx, metricDef := range gpu.metricsLookup {
+			y, err := lp.NewMetric(metricDef.outname, gpu.tags, gpu.meta, gpu.measurement.Metrics[idx].Value, time.Now())
+			if err == nil {
+				y.AddMeta("unit", metricDef.unit)
+				output <- y
+			}
+		}
+	}
+
+}
+
+func (m *NvidiaGPMCollector) Close() {
+	if m.init {
+		for _, gpu := range m.gpus {
+			gpu.measurement.Sample1.Free()
+			gpu.measurement.Sample2.Free()
+		}
+		if ret := nvml.Shutdown(); ret != nvml.SUCCESS {
+			cclog.ComponentError(m.name, "nvml.Shutdown() not successful")
+		}
+		m.init = false
+	}
+}