mirror of
				https://github.com/ClusterCockpit/cc-metric-collector.git
				synced 2025-11-04 02:35:07 +01:00 
			
		
		
		
	AMD ROCm SMI collector (#77)
* Add collector for AMD ROCm SMI metrics * Fix import path * Fix imports * Remove Board Number * store GPU index explicitly * Remove board number from description
This commit is contained in:
		
							
								
								
									
										319
									
								
								collectors/rocmsmiMetric.go
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										319
									
								
								collectors/rocmsmiMetric.go
									
									
									
									
									
										Normal file
									
								
							@@ -0,0 +1,319 @@
 | 
			
		||||
package collectors
 | 
			
		||||
 | 
			
		||||
import (
 | 
			
		||||
	"encoding/json"
 | 
			
		||||
	"errors"
 | 
			
		||||
	"fmt"
 | 
			
		||||
	"time"
 | 
			
		||||
 | 
			
		||||
	cclog "github.com/ClusterCockpit/cc-metric-collector/internal/ccLogger"
 | 
			
		||||
	lp "github.com/ClusterCockpit/cc-metric-collector/internal/ccMetric"
 | 
			
		||||
	"github.com/ClusterCockpit/go-rocm-smi/pkg/rocm_smi"
 | 
			
		||||
)
 | 
			
		||||
 | 
			
		||||
// RocmSmiCollectorConfig is the JSON configuration of the ROCm SMI collector.
// All options are optional.
type RocmSmiCollectorConfig struct {
	// Names of metrics that are not sent
	ExcludeMetrics []string `json:"exclude_metrics,omitempty"`
	// Devices that are not monitored, given either by their index
	// or by their PCI identifier
	ExcludeDevices []string `json:"exclude_devices,omitempty"`

	// Add the PCI identifier as tag 'pci_identifier'
	// (only evaluated if the PCI identifier is not already used as 'type-id')
	AddPciInfoTag bool `json:"add_pci_info_tag,omitempty"`
	// Use the PCI identifier instead of the device index as 'type-id' tag
	UsePciInfoAsTypeId bool `json:"use_pci_info_as_type_id,omitempty"`
	// Add the serial number of the device as meta information 'serial'
	AddSerialMeta bool `json:"add_serial_meta,omitempty"`
}
 | 
			
		||||
 | 
			
		||||
type RocmSmiCollectorDevice struct {
 | 
			
		||||
	device         rocm_smi.DeviceHandle
 | 
			
		||||
	index          int
 | 
			
		||||
	tags           map[string]string // default tags
 | 
			
		||||
	meta           map[string]string // default meta information
 | 
			
		||||
	excludeMetrics map[string]bool   // copy of exclude metrics from config
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
type RocmSmiCollector struct {
 | 
			
		||||
	metricCollector
 | 
			
		||||
	config  RocmSmiCollectorConfig // the configuration structure
 | 
			
		||||
	devices []RocmSmiCollectorDevice
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
// Functions to implement MetricCollector interface
 | 
			
		||||
// Init(...), Read(...), Close()
 | 
			
		||||
// See: metricCollector.go
 | 
			
		||||
 | 
			
		||||
// Init initializes the sample collector
 | 
			
		||||
// Called once by the collector manager
 | 
			
		||||
// All tags, meta data tags and metrics that do not change over the runtime should be set here
 | 
			
		||||
func (m *RocmSmiCollector) Init(config json.RawMessage) error {
 | 
			
		||||
	var err error = nil
 | 
			
		||||
	// Always set the name early in Init() to use it in cclog.Component* functions
 | 
			
		||||
	m.name = "RocmSmiCollector"
 | 
			
		||||
	// This is for later use, also call it early
 | 
			
		||||
	m.setup()
 | 
			
		||||
	// Define meta information sent with each metric
 | 
			
		||||
	// (Can also be dynamic or this is the basic set with extension through AddMeta())
 | 
			
		||||
	//m.meta = map[string]string{"source": m.name, "group": "AMD"}
 | 
			
		||||
	// Define tags sent with each metric
 | 
			
		||||
	// The 'type' tag is always needed, it defines the granulatity of the metric
 | 
			
		||||
	// node -> whole system
 | 
			
		||||
	// socket -> CPU socket (requires socket ID as 'type-id' tag)
 | 
			
		||||
	// cpu -> single CPU hardware thread (requires cpu ID as 'type-id' tag)
 | 
			
		||||
	//m.tags = map[string]string{"type": "node"}
 | 
			
		||||
	// Read in the JSON configuration
 | 
			
		||||
	if len(config) > 0 {
 | 
			
		||||
		err = json.Unmarshal(config, &m.config)
 | 
			
		||||
		if err != nil {
 | 
			
		||||
			cclog.ComponentError(m.name, "Error reading config:", err.Error())
 | 
			
		||||
			return err
 | 
			
		||||
		}
 | 
			
		||||
	}
 | 
			
		||||
 | 
			
		||||
	ret := rocm_smi.Init()
 | 
			
		||||
	if ret != rocm_smi.STATUS_SUCCESS {
 | 
			
		||||
		err = errors.New("Failed to initialize ROCm SMI library")
 | 
			
		||||
		cclog.ComponentError(m.name, err.Error())
 | 
			
		||||
		return err
 | 
			
		||||
	}
 | 
			
		||||
 | 
			
		||||
	numDevs, ret := rocm_smi.NumMonitorDevices()
 | 
			
		||||
	if ret != rocm_smi.STATUS_SUCCESS {
 | 
			
		||||
		err = errors.New("Failed to get number of GPUs from ROCm SMI library")
 | 
			
		||||
		cclog.ComponentError(m.name, err.Error())
 | 
			
		||||
		return err
 | 
			
		||||
	}
 | 
			
		||||
 | 
			
		||||
	exclDev := func(s string) bool {
 | 
			
		||||
		skip_device := false
 | 
			
		||||
		for _, excl := range m.config.ExcludeDevices {
 | 
			
		||||
			if excl == s {
 | 
			
		||||
				skip_device = true
 | 
			
		||||
				break
 | 
			
		||||
			}
 | 
			
		||||
		}
 | 
			
		||||
		return skip_device
 | 
			
		||||
	}
 | 
			
		||||
 | 
			
		||||
	m.devices = make([]RocmSmiCollectorDevice, 0)
 | 
			
		||||
 | 
			
		||||
	for i := 0; i < numDevs; i++ {
 | 
			
		||||
		str_i := fmt.Sprintf("%d", i)
 | 
			
		||||
		if exclDev(str_i) {
 | 
			
		||||
			continue
 | 
			
		||||
		}
 | 
			
		||||
		device, ret := rocm_smi.DeviceGetHandleByIndex(i)
 | 
			
		||||
		if ret != rocm_smi.STATUS_SUCCESS {
 | 
			
		||||
			err = fmt.Errorf("Failed to get handle for GPU %d", i)
 | 
			
		||||
			cclog.ComponentError(m.name, err.Error())
 | 
			
		||||
			return err
 | 
			
		||||
		}
 | 
			
		||||
 | 
			
		||||
		pciInfo, ret := rocm_smi.DeviceGetPciInfo(device)
 | 
			
		||||
		if ret != rocm_smi.STATUS_SUCCESS {
 | 
			
		||||
			err = fmt.Errorf("Failed to get PCI information for GPU %d", i)
 | 
			
		||||
			cclog.ComponentError(m.name, err.Error())
 | 
			
		||||
			return err
 | 
			
		||||
		}
 | 
			
		||||
 | 
			
		||||
		pciId := fmt.Sprintf(
 | 
			
		||||
			"%08X:%02X:%02X.%X",
 | 
			
		||||
			pciInfo.Domain,
 | 
			
		||||
			pciInfo.Bus,
 | 
			
		||||
			pciInfo.Device,
 | 
			
		||||
			pciInfo.Function)
 | 
			
		||||
 | 
			
		||||
		if exclDev(pciId) {
 | 
			
		||||
			continue
 | 
			
		||||
		}
 | 
			
		||||
 | 
			
		||||
		dev := RocmSmiCollectorDevice{
 | 
			
		||||
			device: device,
 | 
			
		||||
			tags: map[string]string{
 | 
			
		||||
				"type":    "accelerator",
 | 
			
		||||
				"type-id": str_i,
 | 
			
		||||
			},
 | 
			
		||||
			meta: map[string]string{
 | 
			
		||||
				"source": m.name,
 | 
			
		||||
				"group":  "AMD",
 | 
			
		||||
			},
 | 
			
		||||
		}
 | 
			
		||||
		if m.config.UsePciInfoAsTypeId {
 | 
			
		||||
			dev.tags["type-id"] = pciId
 | 
			
		||||
		} else if m.config.AddPciInfoTag {
 | 
			
		||||
			dev.tags["pci_identifier"] = pciId
 | 
			
		||||
		}
 | 
			
		||||
 | 
			
		||||
		if m.config.AddSerialMeta {
 | 
			
		||||
			serial, ret := rocm_smi.DeviceGetSerial(device)
 | 
			
		||||
			if ret != rocm_smi.STATUS_SUCCESS {
 | 
			
		||||
				cclog.ComponentError(m.name, "Unable to get serial number for device at index", i, ":", rocm_smi.StatusStringNoError(ret))
 | 
			
		||||
			} else {
 | 
			
		||||
				dev.meta["serial"] = serial
 | 
			
		||||
			}
 | 
			
		||||
		}
 | 
			
		||||
		// Add excluded metrics
 | 
			
		||||
		dev.excludeMetrics = map[string]bool{}
 | 
			
		||||
		for _, e := range m.config.ExcludeMetrics {
 | 
			
		||||
			dev.excludeMetrics[e] = true
 | 
			
		||||
		}
 | 
			
		||||
		dev.index = i
 | 
			
		||||
		m.devices = append(m.devices, dev)
 | 
			
		||||
	}
 | 
			
		||||
 | 
			
		||||
	// Set this flag only if everything is initialized properly, all required files exist, ...
 | 
			
		||||
	m.init = true
 | 
			
		||||
	return err
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
// Read collects all metrics belonging to the sample collector
 | 
			
		||||
// and sends them through the output channel to the collector manager
 | 
			
		||||
func (m *RocmSmiCollector) Read(interval time.Duration, output chan lp.CCMetric) {
 | 
			
		||||
	// Create a sample metric
 | 
			
		||||
	timestamp := time.Now()
 | 
			
		||||
 | 
			
		||||
	for _, dev := range m.devices {
 | 
			
		||||
		metrics, ret := rocm_smi.DeviceGetMetrics(dev.device)
 | 
			
		||||
		if ret != rocm_smi.STATUS_SUCCESS {
 | 
			
		||||
			cclog.ComponentError(m.name, "Unable to get metrics for device at index", dev.index, ":", rocm_smi.StatusStringNoError(ret))
 | 
			
		||||
			continue
 | 
			
		||||
		}
 | 
			
		||||
 | 
			
		||||
		if !dev.excludeMetrics["rocm_gfx_util"] {
 | 
			
		||||
			value := metrics.Average_gfx_activity
 | 
			
		||||
			y, err := lp.New("rocm_gfx_util", dev.tags, dev.meta, map[string]interface{}{"value": value}, timestamp)
 | 
			
		||||
			if err == nil {
 | 
			
		||||
				output <- y
 | 
			
		||||
			}
 | 
			
		||||
		}
 | 
			
		||||
		if !dev.excludeMetrics["rocm_umc_util"] {
 | 
			
		||||
			value := metrics.Average_umc_activity
 | 
			
		||||
			y, err := lp.New("rocm_umc_util", dev.tags, dev.meta, map[string]interface{}{"value": value}, timestamp)
 | 
			
		||||
			if err == nil {
 | 
			
		||||
				output <- y
 | 
			
		||||
			}
 | 
			
		||||
		}
 | 
			
		||||
		if !dev.excludeMetrics["rocm_mm_util"] {
 | 
			
		||||
			value := metrics.Average_mm_activity
 | 
			
		||||
			y, err := lp.New("rocm_mm_util", dev.tags, dev.meta, map[string]interface{}{"value": value}, timestamp)
 | 
			
		||||
			if err == nil {
 | 
			
		||||
				output <- y
 | 
			
		||||
			}
 | 
			
		||||
		}
 | 
			
		||||
		if !dev.excludeMetrics["rocm_avg_power"] {
 | 
			
		||||
			value := metrics.Average_socket_power
 | 
			
		||||
			y, err := lp.New("rocm_avg_power", dev.tags, dev.meta, map[string]interface{}{"value": value}, timestamp)
 | 
			
		||||
			if err == nil {
 | 
			
		||||
				output <- y
 | 
			
		||||
			}
 | 
			
		||||
		}
 | 
			
		||||
		if !dev.excludeMetrics["rocm_temp_mem"] {
 | 
			
		||||
			value := metrics.Temperature_mem
 | 
			
		||||
			y, err := lp.New("rocm_temp_mem", dev.tags, dev.meta, map[string]interface{}{"value": value}, timestamp)
 | 
			
		||||
			if err == nil {
 | 
			
		||||
				output <- y
 | 
			
		||||
			}
 | 
			
		||||
		}
 | 
			
		||||
		if !dev.excludeMetrics["rocm_temp_hotspot"] {
 | 
			
		||||
			value := metrics.Temperature_hotspot
 | 
			
		||||
			y, err := lp.New("rocm_temp_hotspot", dev.tags, dev.meta, map[string]interface{}{"value": value}, timestamp)
 | 
			
		||||
			if err == nil {
 | 
			
		||||
				output <- y
 | 
			
		||||
			}
 | 
			
		||||
		}
 | 
			
		||||
		if !dev.excludeMetrics["rocm_temp_edge"] {
 | 
			
		||||
			value := metrics.Temperature_edge
 | 
			
		||||
			y, err := lp.New("rocm_temp_edge", dev.tags, dev.meta, map[string]interface{}{"value": value}, timestamp)
 | 
			
		||||
			if err == nil {
 | 
			
		||||
				output <- y
 | 
			
		||||
			}
 | 
			
		||||
		}
 | 
			
		||||
		if !dev.excludeMetrics["rocm_temp_vrgfx"] {
 | 
			
		||||
			value := metrics.Temperature_vrgfx
 | 
			
		||||
			y, err := lp.New("rocm_temp_vrgfx", dev.tags, dev.meta, map[string]interface{}{"value": value}, timestamp)
 | 
			
		||||
			if err == nil {
 | 
			
		||||
				output <- y
 | 
			
		||||
			}
 | 
			
		||||
		}
 | 
			
		||||
		if !dev.excludeMetrics["rocm_temp_vrsoc"] {
 | 
			
		||||
			value := metrics.Temperature_vrsoc
 | 
			
		||||
			y, err := lp.New("rocm_temp_vrsoc", dev.tags, dev.meta, map[string]interface{}{"value": value}, timestamp)
 | 
			
		||||
			if err == nil {
 | 
			
		||||
				output <- y
 | 
			
		||||
			}
 | 
			
		||||
		}
 | 
			
		||||
		if !dev.excludeMetrics["rocm_temp_vrmem"] {
 | 
			
		||||
			value := metrics.Temperature_vrmem
 | 
			
		||||
			y, err := lp.New("rocm_temp_vrmem", dev.tags, dev.meta, map[string]interface{}{"value": value}, timestamp)
 | 
			
		||||
			if err == nil {
 | 
			
		||||
				output <- y
 | 
			
		||||
			}
 | 
			
		||||
		}
 | 
			
		||||
		if !dev.excludeMetrics["rocm_gfx_clock"] {
 | 
			
		||||
			value := metrics.Average_gfxclk_frequency
 | 
			
		||||
			y, err := lp.New("rocm_gfx_clock", dev.tags, dev.meta, map[string]interface{}{"value": value}, timestamp)
 | 
			
		||||
			if err == nil {
 | 
			
		||||
				output <- y
 | 
			
		||||
			}
 | 
			
		||||
		}
 | 
			
		||||
		if !dev.excludeMetrics["rocm_soc_clock"] {
 | 
			
		||||
			value := metrics.Average_socclk_frequency
 | 
			
		||||
			y, err := lp.New("rocm_soc_clock", dev.tags, dev.meta, map[string]interface{}{"value": value}, timestamp)
 | 
			
		||||
			if err == nil {
 | 
			
		||||
				output <- y
 | 
			
		||||
			}
 | 
			
		||||
		}
 | 
			
		||||
		if !dev.excludeMetrics["rocm_u_clock"] {
 | 
			
		||||
			value := metrics.Average_uclk_frequency
 | 
			
		||||
			y, err := lp.New("rocm_u_clock", dev.tags, dev.meta, map[string]interface{}{"value": value}, timestamp)
 | 
			
		||||
			if err == nil {
 | 
			
		||||
				output <- y
 | 
			
		||||
			}
 | 
			
		||||
		}
 | 
			
		||||
		if !dev.excludeMetrics["rocm_v0_clock"] {
 | 
			
		||||
			value := metrics.Average_vclk0_frequency
 | 
			
		||||
			y, err := lp.New("rocm_v0_clock", dev.tags, dev.meta, map[string]interface{}{"value": value}, timestamp)
 | 
			
		||||
			if err == nil {
 | 
			
		||||
				output <- y
 | 
			
		||||
			}
 | 
			
		||||
		}
 | 
			
		||||
		if !dev.excludeMetrics["rocm_v1_clock"] {
 | 
			
		||||
			value := metrics.Average_vclk1_frequency
 | 
			
		||||
			y, err := lp.New("rocm_v1_clock", dev.tags, dev.meta, map[string]interface{}{"value": value}, timestamp)
 | 
			
		||||
			if err == nil {
 | 
			
		||||
				output <- y
 | 
			
		||||
			}
 | 
			
		||||
		}
 | 
			
		||||
		if !dev.excludeMetrics["rocm_d0_clock"] {
 | 
			
		||||
			value := metrics.Average_dclk0_frequency
 | 
			
		||||
			y, err := lp.New("rocm_d0_clock", dev.tags, dev.meta, map[string]interface{}{"value": value}, timestamp)
 | 
			
		||||
			if err == nil {
 | 
			
		||||
				output <- y
 | 
			
		||||
			}
 | 
			
		||||
		}
 | 
			
		||||
		if !dev.excludeMetrics["rocm_d1_clock"] {
 | 
			
		||||
			value := metrics.Average_dclk1_frequency
 | 
			
		||||
			y, err := lp.New("rocm_d1_clock", dev.tags, dev.meta, map[string]interface{}{"value": value}, timestamp)
 | 
			
		||||
			if err == nil {
 | 
			
		||||
				output <- y
 | 
			
		||||
			}
 | 
			
		||||
		}
 | 
			
		||||
		if !dev.excludeMetrics["rocm_temp_hbm"] {
 | 
			
		||||
			for i := 0; i < rocm_smi.NUM_HBM_INSTANCES; i++ {
 | 
			
		||||
				value := metrics.Temperature_hbm[i]
 | 
			
		||||
				y, err := lp.New("rocm_temp_hbm", dev.tags, dev.meta, map[string]interface{}{"value": value}, timestamp)
 | 
			
		||||
				if err == nil {
 | 
			
		||||
					y.AddTag("stype", "device")
 | 
			
		||||
					y.AddTag("stype-id", fmt.Sprintf("%d", i))
 | 
			
		||||
					output <- y
 | 
			
		||||
				}
 | 
			
		||||
			}
 | 
			
		||||
		}
 | 
			
		||||
	}
 | 
			
		||||
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
// Close metric collector: close network connection, close files, close libraries, ...
 | 
			
		||||
// Called once by the collector manager
 | 
			
		||||
func (m *RocmSmiCollector) Close() {
 | 
			
		||||
	// Unset flag
 | 
			
		||||
	ret := rocm_smi.Shutdown()
 | 
			
		||||
	if ret != rocm_smi.STATUS_SUCCESS {
 | 
			
		||||
		cclog.ComponentError(m.name, "Failed to shutdown ROCm SMI library")
 | 
			
		||||
	}
 | 
			
		||||
	m.init = false
 | 
			
		||||
}
 | 
			
		||||
		Reference in New Issue
	
	Block a user