package collectors import ( "encoding/json" "errors" "fmt" "time" cclog "github.com/ClusterCockpit/cc-metric-collector/pkg/ccLogger" lp "github.com/ClusterCockpit/cc-metric-collector/pkg/ccMetric" "github.com/ClusterCockpit/go-rocm-smi/pkg/rocm_smi" ) type RocmSmiCollectorConfig struct { ExcludeMetrics []string `json:"exclude_metrics,omitempty"` ExcludeDevices []string `json:"exclude_devices,omitempty"` AddPciInfoTag bool `json:"add_pci_info_tag,omitempty"` UsePciInfoAsTypeId bool `json:"use_pci_info_as_type_id,omitempty"` AddSerialMeta bool `json:"add_serial_meta,omitempty"` } type RocmSmiCollectorDevice struct { device rocm_smi.DeviceHandle index int tags map[string]string // default tags meta map[string]string // default meta information excludeMetrics map[string]bool // copy of exclude metrics from config } type RocmSmiCollector struct { metricCollector config RocmSmiCollectorConfig // the configuration structure devices []RocmSmiCollectorDevice } // Functions to implement MetricCollector interface // Init(...), Read(...), Close() // See: metricCollector.go // Init initializes the sample collector // Called once by the collector manager // All tags, meta data tags and metrics that do not change over the runtime should be set here func (m *RocmSmiCollector) Init(config json.RawMessage) error { var err error = nil // Always set the name early in Init() to use it in cclog.Component* functions m.name = "RocmSmiCollector" // This is for later use, also call it early m.setup() // Define meta information sent with each metric // (Can also be dynamic or this is the basic set with extension through AddMeta()) //m.meta = map[string]string{"source": m.name, "group": "AMD"} // Define tags sent with each metric // The 'type' tag is always needed, it defines the granulatity of the metric // node -> whole system // socket -> CPU socket (requires socket ID as 'type-id' tag) // cpu -> single CPU hardware thread (requires cpu ID as 'type-id' tag) //m.tags = map[string]string{"type": "node"} // Read in the JSON configuration if len(config) > 0 { err = json.Unmarshal(config, &m.config) if err != nil { cclog.ComponentError(m.name, "Error reading config:", err.Error()) return err } } ret := rocm_smi.Init() if ret != rocm_smi.STATUS_SUCCESS { err = errors.New("failed to initialize ROCm SMI library") cclog.ComponentError(m.name, err.Error()) return err } numDevs, ret := rocm_smi.NumMonitorDevices() if ret != rocm_smi.STATUS_SUCCESS { err = errors.New("failed to get number of GPUs from ROCm SMI library") cclog.ComponentError(m.name, err.Error()) return err } exclDev := func(s string) bool { skip_device := false for _, excl := range m.config.ExcludeDevices { if excl == s { skip_device = true break } } return skip_device } m.devices = make([]RocmSmiCollectorDevice, 0) for i := 0; i < numDevs; i++ { str_i := fmt.Sprintf("%d", i) if exclDev(str_i) { continue } device, ret := rocm_smi.DeviceGetHandleByIndex(i) if ret != rocm_smi.STATUS_SUCCESS { err = fmt.Errorf("failed to get handle for GPU %d", i) cclog.ComponentError(m.name, err.Error()) return err } pciInfo, ret := rocm_smi.DeviceGetPciInfo(device) if ret != rocm_smi.STATUS_SUCCESS { err = fmt.Errorf("failed to get PCI information for GPU %d", i) cclog.ComponentError(m.name, err.Error()) return err } pciId := fmt.Sprintf( "%08X:%02X:%02X.%X", pciInfo.Domain, pciInfo.Bus, pciInfo.Device, pciInfo.Function) if exclDev(pciId) { continue } dev := RocmSmiCollectorDevice{ device: device, tags: map[string]string{ "type": "accelerator", "type-id": str_i, }, meta: map[string]string{ "source": m.name, "group": "AMD", }, } if m.config.UsePciInfoAsTypeId { dev.tags["type-id"] = pciId } else if m.config.AddPciInfoTag { dev.tags["pci_identifier"] = pciId } if m.config.AddSerialMeta { serial, ret := rocm_smi.DeviceGetSerialNumber(device) if ret != rocm_smi.STATUS_SUCCESS { cclog.ComponentError(m.name, "Unable to get serial number for device at index", i, ":", rocm_smi.StatusStringNoError(ret)) } else { dev.meta["serial"] = serial } } // Add excluded metrics dev.excludeMetrics = map[string]bool{} for _, e := range m.config.ExcludeMetrics { dev.excludeMetrics[e] = true } dev.index = i m.devices = append(m.devices, dev) } // Set this flag only if everything is initialized properly, all required files exist, ... m.init = true return err } // Read collects all metrics belonging to the sample collector // and sends them through the output channel to the collector manager func (m *RocmSmiCollector) Read(interval time.Duration, output chan lp.CCMetric) { // Create a sample metric timestamp := time.Now() for _, dev := range m.devices { metrics, ret := rocm_smi.DeviceGetMetrics(dev.device) if ret != rocm_smi.STATUS_SUCCESS { cclog.ComponentError(m.name, "Unable to get metrics for device at index", dev.index, ":", rocm_smi.StatusStringNoError(ret)) continue } if !dev.excludeMetrics["rocm_gfx_util"] { value := metrics.Average_gfx_activity y, err := lp.New("rocm_gfx_util", dev.tags, dev.meta, map[string]interface{}{"value": value}, timestamp) if err == nil { output <- y } } if !dev.excludeMetrics["rocm_umc_util"] { value := metrics.Average_umc_activity y, err := lp.New("rocm_umc_util", dev.tags, dev.meta, map[string]interface{}{"value": value}, timestamp) if err == nil { output <- y } } if !dev.excludeMetrics["rocm_mm_util"] { value := metrics.Average_mm_activity y, err := lp.New("rocm_mm_util", dev.tags, dev.meta, map[string]interface{}{"value": value}, timestamp) if err == nil { output <- y } } if !dev.excludeMetrics["rocm_avg_power"] { value := metrics.Average_socket_power y, err := lp.New("rocm_avg_power", dev.tags, dev.meta, map[string]interface{}{"value": value}, timestamp) if err == nil { output <- y } } if !dev.excludeMetrics["rocm_temp_mem"] { value := metrics.Temperature_mem y, err := lp.New("rocm_temp_mem", dev.tags, dev.meta, map[string]interface{}{"value": value}, timestamp) if err == nil { output <- y } } if !dev.excludeMetrics["rocm_temp_hotspot"] { value := metrics.Temperature_hotspot y, err := lp.New("rocm_temp_hotspot", dev.tags, dev.meta, map[string]interface{}{"value": value}, timestamp) if err == nil { output <- y } } if !dev.excludeMetrics["rocm_temp_edge"] { value := metrics.Temperature_edge y, err := lp.New("rocm_temp_edge", dev.tags, dev.meta, map[string]interface{}{"value": value}, timestamp) if err == nil { output <- y } } if !dev.excludeMetrics["rocm_temp_vrgfx"] { value := metrics.Temperature_vrgfx y, err := lp.New("rocm_temp_vrgfx", dev.tags, dev.meta, map[string]interface{}{"value": value}, timestamp) if err == nil { output <- y } } if !dev.excludeMetrics["rocm_temp_vrsoc"] { value := metrics.Temperature_vrsoc y, err := lp.New("rocm_temp_vrsoc", dev.tags, dev.meta, map[string]interface{}{"value": value}, timestamp) if err == nil { output <- y } } if !dev.excludeMetrics["rocm_temp_vrmem"] { value := metrics.Temperature_vrmem y, err := lp.New("rocm_temp_vrmem", dev.tags, dev.meta, map[string]interface{}{"value": value}, timestamp) if err == nil { output <- y } } if !dev.excludeMetrics["rocm_gfx_clock"] { value := metrics.Average_gfxclk_frequency y, err := lp.New("rocm_gfx_clock", dev.tags, dev.meta, map[string]interface{}{"value": value}, timestamp) if err == nil { output <- y } } if !dev.excludeMetrics["rocm_soc_clock"] { value := metrics.Average_socclk_frequency y, err := lp.New("rocm_soc_clock", dev.tags, dev.meta, map[string]interface{}{"value": value}, timestamp) if err == nil { output <- y } } if !dev.excludeMetrics["rocm_u_clock"] { value := metrics.Average_uclk_frequency y, err := lp.New("rocm_u_clock", dev.tags, dev.meta, map[string]interface{}{"value": value}, timestamp) if err == nil { output <- y } } if !dev.excludeMetrics["rocm_v0_clock"] { value := metrics.Average_vclk0_frequency y, err := lp.New("rocm_v0_clock", dev.tags, dev.meta, map[string]interface{}{"value": value}, timestamp) if err == nil { output <- y } } if !dev.excludeMetrics["rocm_v1_clock"] { value := metrics.Average_vclk1_frequency y, err := lp.New("rocm_v1_clock", dev.tags, dev.meta, map[string]interface{}{"value": value}, timestamp) if err == nil { output <- y } } if !dev.excludeMetrics["rocm_d0_clock"] { value := metrics.Average_dclk0_frequency y, err := lp.New("rocm_d0_clock", dev.tags, dev.meta, map[string]interface{}{"value": value}, timestamp) if err == nil { output <- y } } if !dev.excludeMetrics["rocm_d1_clock"] { value := metrics.Average_dclk1_frequency y, err := lp.New("rocm_d1_clock", dev.tags, dev.meta, map[string]interface{}{"value": value}, timestamp) if err == nil { output <- y } } if !dev.excludeMetrics["rocm_temp_hbm"] { for i := 0; i < rocm_smi.NUM_HBM_INSTANCES; i++ { value := metrics.Temperature_hbm[i] y, err := lp.New("rocm_temp_hbm", dev.tags, dev.meta, map[string]interface{}{"value": value}, timestamp) if err == nil { y.AddTag("stype", "device") y.AddTag("stype-id", fmt.Sprintf("%d", i)) output <- y } } } } } // Close metric collector: close network connection, close files, close libraries, ... // Called once by the collector manager func (m *RocmSmiCollector) Close() { // Unset flag ret := rocm_smi.Shutdown() if ret != rocm_smi.STATUS_SUCCESS { cclog.ComponentError(m.name, "Failed to shutdown ROCm SMI library") } m.init = false }