From e13695307f9e5363ce33dec5975963cd01706799 Mon Sep 17 00:00:00 2001 From: Thomas Gruber Date: Wed, 25 May 2022 15:55:43 +0200 Subject: [PATCH] AMD ROCm SMI collector (#77) * Add collector for AMD ROCm SMI metrics * Fix import path * Fix imports * Remove Board Number * store GPU index explicitly * Remove board number from description --- collectors/README.md | 1 + collectors/collectorManager.go | 1 + collectors/rocmsmiMetric.go | 319 +++++++++++++++++++++++++++++++++ collectors/rocmsmiMetric.md | 47 +++++ 4 files changed, 368 insertions(+) create mode 100644 collectors/rocmsmiMetric.go create mode 100644 collectors/rocmsmiMetric.md diff --git a/collectors/README.md b/collectors/README.md index 3fcdd49..10e5105 100644 --- a/collectors/README.md +++ b/collectors/README.md @@ -39,6 +39,7 @@ In contrast to the configuration files for sinks and receivers, the collectors c * [`gpfs`](./gpfsMetric.md) * [`beegfs_meta`](./beegfsmetaMetric.md) * [`beegfs_storage`](./beegfsstorageMetric.md) +* [`rocm_smi`](./rocmsmiMetric.md) ## Todos diff --git a/collectors/collectorManager.go b/collectors/collectorManager.go index 62f6220..49a9db8 100644 --- a/collectors/collectorManager.go +++ b/collectors/collectorManager.go @@ -36,6 +36,7 @@ var AvailableCollectors = map[string]MetricCollector{ "numastats": new(NUMAStatsCollector), "beegfs_meta": new(BeegfsMetaCollector), "beegfs_storage": new(BeegfsStorageCollector), + "rocm_smi": new(RocmSmiCollector), } // Metric collector manager data structure diff --git a/collectors/rocmsmiMetric.go b/collectors/rocmsmiMetric.go new file mode 100644 index 0000000..f2acfbf --- /dev/null +++ b/collectors/rocmsmiMetric.go @@ -0,0 +1,319 @@ +package collectors + +import ( + "encoding/json" + "errors" + "fmt" + "time" + + cclog "github.com/ClusterCockpit/cc-metric-collector/internal/ccLogger" + lp "github.com/ClusterCockpit/cc-metric-collector/internal/ccMetric" + "github.com/ClusterCockpit/go-rocm-smi/pkg/rocm_smi" +) + +type RocmSmiCollectorConfig 
struct { + ExcludeMetrics []string `json:"exclude_metrics,omitempty"` + ExcludeDevices []string `json:"exclude_devices,omitempty"` + AddPciInfoTag bool `json:"add_pci_info_tag,omitempty"` + UsePciInfoAsTypeId bool `json:"use_pci_info_as_type_id,omitempty"` + AddSerialMeta bool `json:"add_serial_meta,omitempty"` +} + +type RocmSmiCollectorDevice struct { + device rocm_smi.DeviceHandle + index int + tags map[string]string // default tags + meta map[string]string // default meta information + excludeMetrics map[string]bool // copy of exclude metrics from config +} + +type RocmSmiCollector struct { + metricCollector + config RocmSmiCollectorConfig // the configuration structure + devices []RocmSmiCollectorDevice +} + +// Functions to implement MetricCollector interface +// Init(...), Read(...), Close() +// See: metricCollector.go + +// Init initializes the sample collector +// Called once by the collector manager +// All tags, meta data tags and metrics that do not change over the runtime should be set here +func (m *RocmSmiCollector) Init(config json.RawMessage) error { + var err error = nil + // Always set the name early in Init() to use it in cclog.Component* functions + m.name = "RocmSmiCollector" + // This is for later use, also call it early + m.setup() + // Define meta information sent with each metric + // (Can also be dynamic or this is the basic set with extension through AddMeta()) + //m.meta = map[string]string{"source": m.name, "group": "AMD"} + // Define tags sent with each metric + // The 'type' tag is always needed, it defines the granulatity of the metric + // node -> whole system + // socket -> CPU socket (requires socket ID as 'type-id' tag) + // cpu -> single CPU hardware thread (requires cpu ID as 'type-id' tag) + //m.tags = map[string]string{"type": "node"} + // Read in the JSON configuration + if len(config) > 0 { + err = json.Unmarshal(config, &m.config) + if err != nil { + cclog.ComponentError(m.name, "Error reading config:", err.Error()) + 
return err + } + } + + ret := rocm_smi.Init() + if ret != rocm_smi.STATUS_SUCCESS { + err = errors.New("Failed to initialize ROCm SMI library") + cclog.ComponentError(m.name, err.Error()) + return err + } + + numDevs, ret := rocm_smi.NumMonitorDevices() + if ret != rocm_smi.STATUS_SUCCESS { + err = errors.New("Failed to get number of GPUs from ROCm SMI library") + cclog.ComponentError(m.name, err.Error()) + return err + } + + exclDev := func(s string) bool { + skip_device := false + for _, excl := range m.config.ExcludeDevices { + if excl == s { + skip_device = true + break + } + } + return skip_device + } + + m.devices = make([]RocmSmiCollectorDevice, 0) + + for i := 0; i < numDevs; i++ { + str_i := fmt.Sprintf("%d", i) + if exclDev(str_i) { + continue + } + device, ret := rocm_smi.DeviceGetHandleByIndex(i) + if ret != rocm_smi.STATUS_SUCCESS { + err = fmt.Errorf("Failed to get handle for GPU %d", i) + cclog.ComponentError(m.name, err.Error()) + return err + } + + pciInfo, ret := rocm_smi.DeviceGetPciInfo(device) + if ret != rocm_smi.STATUS_SUCCESS { + err = fmt.Errorf("Failed to get PCI information for GPU %d", i) + cclog.ComponentError(m.name, err.Error()) + return err + } + + pciId := fmt.Sprintf( + "%08X:%02X:%02X.%X", + pciInfo.Domain, + pciInfo.Bus, + pciInfo.Device, + pciInfo.Function) + + if exclDev(pciId) { + continue + } + + dev := RocmSmiCollectorDevice{ + device: device, + tags: map[string]string{ + "type": "accelerator", + "type-id": str_i, + }, + meta: map[string]string{ + "source": m.name, + "group": "AMD", + }, + } + if m.config.UsePciInfoAsTypeId { + dev.tags["type-id"] = pciId + } else if m.config.AddPciInfoTag { + dev.tags["pci_identifier"] = pciId + } + + if m.config.AddSerialMeta { + serial, ret := rocm_smi.DeviceGetSerial(device) + if ret != rocm_smi.STATUS_SUCCESS { + cclog.ComponentError(m.name, "Unable to get serial number for device at index", i, ":", rocm_smi.StatusStringNoError(ret)) + } else { + dev.meta["serial"] = serial + } + } + // 
Add excluded metrics + dev.excludeMetrics = map[string]bool{} + for _, e := range m.config.ExcludeMetrics { + dev.excludeMetrics[e] = true + } + dev.index = i + m.devices = append(m.devices, dev) + } + + // Set this flag only if everything is initialized properly, all required files exist, ... + m.init = true + return err +} + +// Read collects all metrics belonging to the sample collector +// and sends them through the output channel to the collector manager +func (m *RocmSmiCollector) Read(interval time.Duration, output chan lp.CCMetric) { + // Create a sample metric + timestamp := time.Now() + + for _, dev := range m.devices { + metrics, ret := rocm_smi.DeviceGetMetrics(dev.device) + if ret != rocm_smi.STATUS_SUCCESS { + cclog.ComponentError(m.name, "Unable to get metrics for device at index", dev.index, ":", rocm_smi.StatusStringNoError(ret)) + continue + } + + if !dev.excludeMetrics["rocm_gfx_util"] { + value := metrics.Average_gfx_activity + y, err := lp.New("rocm_gfx_util", dev.tags, dev.meta, map[string]interface{}{"value": value}, timestamp) + if err == nil { + output <- y + } + } + if !dev.excludeMetrics["rocm_umc_util"] { + value := metrics.Average_umc_activity + y, err := lp.New("rocm_umc_util", dev.tags, dev.meta, map[string]interface{}{"value": value}, timestamp) + if err == nil { + output <- y + } + } + if !dev.excludeMetrics["rocm_mm_util"] { + value := metrics.Average_mm_activity + y, err := lp.New("rocm_mm_util", dev.tags, dev.meta, map[string]interface{}{"value": value}, timestamp) + if err == nil { + output <- y + } + } + if !dev.excludeMetrics["rocm_avg_power"] { + value := metrics.Average_socket_power + y, err := lp.New("rocm_avg_power", dev.tags, dev.meta, map[string]interface{}{"value": value}, timestamp) + if err == nil { + output <- y + } + } + if !dev.excludeMetrics["rocm_temp_mem"] { + value := metrics.Temperature_mem + y, err := lp.New("rocm_temp_mem", dev.tags, dev.meta, map[string]interface{}{"value": value}, timestamp) + if err == 
nil { + output <- y + } + } + if !dev.excludeMetrics["rocm_temp_hotspot"] { + value := metrics.Temperature_hotspot + y, err := lp.New("rocm_temp_hotspot", dev.tags, dev.meta, map[string]interface{}{"value": value}, timestamp) + if err == nil { + output <- y + } + } + if !dev.excludeMetrics["rocm_temp_edge"] { + value := metrics.Temperature_edge + y, err := lp.New("rocm_temp_edge", dev.tags, dev.meta, map[string]interface{}{"value": value}, timestamp) + if err == nil { + output <- y + } + } + if !dev.excludeMetrics["rocm_temp_vrgfx"] { + value := metrics.Temperature_vrgfx + y, err := lp.New("rocm_temp_vrgfx", dev.tags, dev.meta, map[string]interface{}{"value": value}, timestamp) + if err == nil { + output <- y + } + } + if !dev.excludeMetrics["rocm_temp_vrsoc"] { + value := metrics.Temperature_vrsoc + y, err := lp.New("rocm_temp_vrsoc", dev.tags, dev.meta, map[string]interface{}{"value": value}, timestamp) + if err == nil { + output <- y + } + } + if !dev.excludeMetrics["rocm_temp_vrmem"] { + value := metrics.Temperature_vrmem + y, err := lp.New("rocm_temp_vrmem", dev.tags, dev.meta, map[string]interface{}{"value": value}, timestamp) + if err == nil { + output <- y + } + } + if !dev.excludeMetrics["rocm_gfx_clock"] { + value := metrics.Average_gfxclk_frequency + y, err := lp.New("rocm_gfx_clock", dev.tags, dev.meta, map[string]interface{}{"value": value}, timestamp) + if err == nil { + output <- y + } + } + if !dev.excludeMetrics["rocm_soc_clock"] { + value := metrics.Average_socclk_frequency + y, err := lp.New("rocm_soc_clock", dev.tags, dev.meta, map[string]interface{}{"value": value}, timestamp) + if err == nil { + output <- y + } + } + if !dev.excludeMetrics["rocm_u_clock"] { + value := metrics.Average_uclk_frequency + y, err := lp.New("rocm_u_clock", dev.tags, dev.meta, map[string]interface{}{"value": value}, timestamp) + if err == nil { + output <- y + } + } + if !dev.excludeMetrics["rocm_v0_clock"] { + value := metrics.Average_vclk0_frequency + y, err := 
lp.New("rocm_v0_clock", dev.tags, dev.meta, map[string]interface{}{"value": value}, timestamp) + if err == nil { + output <- y + } + } + if !dev.excludeMetrics["rocm_v1_clock"] { + value := metrics.Average_vclk1_frequency + y, err := lp.New("rocm_v1_clock", dev.tags, dev.meta, map[string]interface{}{"value": value}, timestamp) + if err == nil { + output <- y + } + } + if !dev.excludeMetrics["rocm_d0_clock"] { + value := metrics.Average_dclk0_frequency + y, err := lp.New("rocm_d0_clock", dev.tags, dev.meta, map[string]interface{}{"value": value}, timestamp) + if err == nil { + output <- y + } + } + if !dev.excludeMetrics["rocm_d1_clock"] { + value := metrics.Average_dclk1_frequency + y, err := lp.New("rocm_d1_clock", dev.tags, dev.meta, map[string]interface{}{"value": value}, timestamp) + if err == nil { + output <- y + } + } + if !dev.excludeMetrics["rocm_temp_hbm"] { + for i := 0; i < rocm_smi.NUM_HBM_INSTANCES; i++ { + value := metrics.Temperature_hbm[i] + y, err := lp.New("rocm_temp_hbm", dev.tags, dev.meta, map[string]interface{}{"value": value}, timestamp) + if err == nil { + y.AddTag("stype", "device") + y.AddTag("stype-id", fmt.Sprintf("%d", i)) + output <- y + } + } + } + } + +} + +// Close metric collector: close network connection, close files, close libraries, ... 
+// Called once by the collector manager +func (m *RocmSmiCollector) Close() { + // Unset flag + ret := rocm_smi.Shutdown() + if ret != rocm_smi.STATUS_SUCCESS { + cclog.ComponentError(m.name, "Failed to shutdown ROCm SMI library") + } + m.init = false +} diff --git a/collectors/rocmsmiMetric.md b/collectors/rocmsmiMetric.md new file mode 100644 index 0000000..9c4da5e --- /dev/null +++ b/collectors/rocmsmiMetric.md @@ -0,0 +1,47 @@ + +## `rocm_smi` collector + +```json + "rocm_smi": { + "exclude_devices": [ + "0","1", "00000000:FF:01.0" + ], + "exclude_metrics": [ + "rocm_mm_util", + "rocm_temp_vrsoc" + ], + "use_pci_info_as_type_id": true, + "add_pci_info_tag": false, + "add_serial_meta": false + } +``` + +The `rocm_smi` collector can be configured to leave out specific devices with the `exclude_devices` option. It takes logical IDs in the list of available devices or the PCI address in a format similar to NVML (`%08X:%02X:%02X.%X`, i.e. domain, bus, device and function as uppercase hexadecimal). Entries are compared as exact strings, so the PCI address must be written exactly in this format. Metrics (listed below) that should not be sent to the MetricRouter can be excluded with the `exclude_metrics` option. + +The metrics sent by the `rocm_smi` collector use `accelerator` as `type` tag. For the `type-id`, it uses the device handle index by default. With the `use_pci_info_as_type_id` option, the PCI ID is used instead. If both values should be added as tags, activate the `add_pci_info_tag` option and leave `use_pci_info_as_type_id` disabled: the collector then uses the device handle index as `type-id` and adds the PCI ID as separate `pci_identifier` tag. + +Optionally, the serial number can be added to the meta information with the `add_serial_meta` option. Meta information is not sent to the sinks (if not configured otherwise). 
+ + +Metrics: +* `rocm_gfx_util` +* `rocm_umc_util` +* `rocm_mm_util` +* `rocm_avg_power` +* `rocm_temp_mem` +* `rocm_temp_hotspot` +* `rocm_temp_edge` +* `rocm_temp_vrgfx` +* `rocm_temp_vrsoc` +* `rocm_temp_vrmem` +* `rocm_gfx_clock` +* `rocm_soc_clock` +* `rocm_u_clock` +* `rocm_v0_clock` +* `rocm_v1_clock` +* `rocm_d0_clock` +* `rocm_d1_clock` +* `rocm_temp_hbm` + + +Some metrics add an additional sub type tag (`stype`); for example, the `rocm_temp_hbm` metrics set `stype=device,stype-id=<HBM instance index>`.