mirror of
https://github.com/ClusterCockpit/cc-metric-collector.git
synced 2024-12-25 15:09:05 +01:00
AMD ROCm SMI collector (#77)
* Add collector for AMD ROCm SMI metrics * Fix import path * Fix imports * Remove Board Number * store GPU index explicitly * Remove board number from description
This commit is contained in:
parent
4ed07cad77
commit
e13695307f
@ -39,6 +39,7 @@ In contrast to the configuration files for sinks and receivers, the collectors c
|
|||||||
* [`gpfs`](./gpfsMetric.md)
|
* [`gpfs`](./gpfsMetric.md)
|
||||||
* [`beegfs_meta`](./beegfsmetaMetric.md)
|
* [`beegfs_meta`](./beegfsmetaMetric.md)
|
||||||
* [`beegfs_storage`](./beegfsstorageMetric.md)
|
* [`beegfs_storage`](./beegfsstorageMetric.md)
|
||||||
|
* [`rocm_smi`](./rocmsmiMetric.md)
|
||||||
|
|
||||||
## Todos
|
## Todos
|
||||||
|
|
||||||
|
@ -36,6 +36,7 @@ var AvailableCollectors = map[string]MetricCollector{
|
|||||||
"numastats": new(NUMAStatsCollector),
|
"numastats": new(NUMAStatsCollector),
|
||||||
"beegfs_meta": new(BeegfsMetaCollector),
|
"beegfs_meta": new(BeegfsMetaCollector),
|
||||||
"beegfs_storage": new(BeegfsStorageCollector),
|
"beegfs_storage": new(BeegfsStorageCollector),
|
||||||
|
"rocm_smi": new(RocmSmiCollector),
|
||||||
}
|
}
|
||||||
|
|
||||||
// Metric collector manager data structure
|
// Metric collector manager data structure
|
||||||
|
319
collectors/rocmsmiMetric.go
Normal file
319
collectors/rocmsmiMetric.go
Normal file
@ -0,0 +1,319 @@
|
|||||||
|
package collectors
|
||||||
|
|
||||||
|
import (
|
||||||
|
"encoding/json"
|
||||||
|
"errors"
|
||||||
|
"fmt"
|
||||||
|
"time"
|
||||||
|
|
||||||
|
cclog "github.com/ClusterCockpit/cc-metric-collector/internal/ccLogger"
|
||||||
|
lp "github.com/ClusterCockpit/cc-metric-collector/internal/ccMetric"
|
||||||
|
"github.com/ClusterCockpit/go-rocm-smi/pkg/rocm_smi"
|
||||||
|
)
|
||||||
|
|
||||||
|
// RocmSmiCollectorConfig holds the JSON configuration of the rocm_smi collector.
// All options are optional; the zero value monitors every device and metric.
type RocmSmiCollectorConfig struct {
	// ExcludeMetrics lists metric names that must not be sent.
	ExcludeMetrics []string `json:"exclude_metrics,omitempty"`
	// ExcludeDevices lists devices to skip, given as device index or PCI ID.
	ExcludeDevices []string `json:"exclude_devices,omitempty"`
	// AddPciInfoTag adds the PCI ID as 'pci_identifier' tag
	// (only evaluated when UsePciInfoAsTypeId is not set).
	AddPciInfoTag bool `json:"add_pci_info_tag,omitempty"`
	// UsePciInfoAsTypeId uses the PCI ID instead of the device index as 'type-id' tag.
	UsePciInfoAsTypeId bool `json:"use_pci_info_as_type_id,omitempty"`
	// AddSerialMeta adds the device serial number to the meta information.
	AddSerialMeta bool `json:"add_serial_meta,omitempty"`
}
|
||||||
|
|
||||||
|
type RocmSmiCollectorDevice struct {
|
||||||
|
device rocm_smi.DeviceHandle
|
||||||
|
index int
|
||||||
|
tags map[string]string // default tags
|
||||||
|
meta map[string]string // default meta information
|
||||||
|
excludeMetrics map[string]bool // copy of exclude metrics from config
|
||||||
|
}
|
||||||
|
|
||||||
|
type RocmSmiCollector struct {
|
||||||
|
metricCollector
|
||||||
|
config RocmSmiCollectorConfig // the configuration structure
|
||||||
|
devices []RocmSmiCollectorDevice
|
||||||
|
}
|
||||||
|
|
||||||
|
// Functions to implement MetricCollector interface
|
||||||
|
// Init(...), Read(...), Close()
|
||||||
|
// See: metricCollector.go
|
||||||
|
|
||||||
|
// Init initializes the sample collector
|
||||||
|
// Called once by the collector manager
|
||||||
|
// All tags, meta data tags and metrics that do not change over the runtime should be set here
|
||||||
|
func (m *RocmSmiCollector) Init(config json.RawMessage) error {
|
||||||
|
var err error = nil
|
||||||
|
// Always set the name early in Init() to use it in cclog.Component* functions
|
||||||
|
m.name = "RocmSmiCollector"
|
||||||
|
// This is for later use, also call it early
|
||||||
|
m.setup()
|
||||||
|
// Define meta information sent with each metric
|
||||||
|
// (Can also be dynamic or this is the basic set with extension through AddMeta())
|
||||||
|
//m.meta = map[string]string{"source": m.name, "group": "AMD"}
|
||||||
|
// Define tags sent with each metric
|
||||||
|
// The 'type' tag is always needed, it defines the granulatity of the metric
|
||||||
|
// node -> whole system
|
||||||
|
// socket -> CPU socket (requires socket ID as 'type-id' tag)
|
||||||
|
// cpu -> single CPU hardware thread (requires cpu ID as 'type-id' tag)
|
||||||
|
//m.tags = map[string]string{"type": "node"}
|
||||||
|
// Read in the JSON configuration
|
||||||
|
if len(config) > 0 {
|
||||||
|
err = json.Unmarshal(config, &m.config)
|
||||||
|
if err != nil {
|
||||||
|
cclog.ComponentError(m.name, "Error reading config:", err.Error())
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
ret := rocm_smi.Init()
|
||||||
|
if ret != rocm_smi.STATUS_SUCCESS {
|
||||||
|
err = errors.New("Failed to initialize ROCm SMI library")
|
||||||
|
cclog.ComponentError(m.name, err.Error())
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
|
||||||
|
numDevs, ret := rocm_smi.NumMonitorDevices()
|
||||||
|
if ret != rocm_smi.STATUS_SUCCESS {
|
||||||
|
err = errors.New("Failed to get number of GPUs from ROCm SMI library")
|
||||||
|
cclog.ComponentError(m.name, err.Error())
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
|
||||||
|
exclDev := func(s string) bool {
|
||||||
|
skip_device := false
|
||||||
|
for _, excl := range m.config.ExcludeDevices {
|
||||||
|
if excl == s {
|
||||||
|
skip_device = true
|
||||||
|
break
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return skip_device
|
||||||
|
}
|
||||||
|
|
||||||
|
m.devices = make([]RocmSmiCollectorDevice, 0)
|
||||||
|
|
||||||
|
for i := 0; i < numDevs; i++ {
|
||||||
|
str_i := fmt.Sprintf("%d", i)
|
||||||
|
if exclDev(str_i) {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
device, ret := rocm_smi.DeviceGetHandleByIndex(i)
|
||||||
|
if ret != rocm_smi.STATUS_SUCCESS {
|
||||||
|
err = fmt.Errorf("Failed to get handle for GPU %d", i)
|
||||||
|
cclog.ComponentError(m.name, err.Error())
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
|
||||||
|
pciInfo, ret := rocm_smi.DeviceGetPciInfo(device)
|
||||||
|
if ret != rocm_smi.STATUS_SUCCESS {
|
||||||
|
err = fmt.Errorf("Failed to get PCI information for GPU %d", i)
|
||||||
|
cclog.ComponentError(m.name, err.Error())
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
|
||||||
|
pciId := fmt.Sprintf(
|
||||||
|
"%08X:%02X:%02X.%X",
|
||||||
|
pciInfo.Domain,
|
||||||
|
pciInfo.Bus,
|
||||||
|
pciInfo.Device,
|
||||||
|
pciInfo.Function)
|
||||||
|
|
||||||
|
if exclDev(pciId) {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
|
||||||
|
dev := RocmSmiCollectorDevice{
|
||||||
|
device: device,
|
||||||
|
tags: map[string]string{
|
||||||
|
"type": "accelerator",
|
||||||
|
"type-id": str_i,
|
||||||
|
},
|
||||||
|
meta: map[string]string{
|
||||||
|
"source": m.name,
|
||||||
|
"group": "AMD",
|
||||||
|
},
|
||||||
|
}
|
||||||
|
if m.config.UsePciInfoAsTypeId {
|
||||||
|
dev.tags["type-id"] = pciId
|
||||||
|
} else if m.config.AddPciInfoTag {
|
||||||
|
dev.tags["pci_identifier"] = pciId
|
||||||
|
}
|
||||||
|
|
||||||
|
if m.config.AddSerialMeta {
|
||||||
|
serial, ret := rocm_smi.DeviceGetSerial(device)
|
||||||
|
if ret != rocm_smi.STATUS_SUCCESS {
|
||||||
|
cclog.ComponentError(m.name, "Unable to get serial number for device at index", i, ":", rocm_smi.StatusStringNoError(ret))
|
||||||
|
} else {
|
||||||
|
dev.meta["serial"] = serial
|
||||||
|
}
|
||||||
|
}
|
||||||
|
// Add excluded metrics
|
||||||
|
dev.excludeMetrics = map[string]bool{}
|
||||||
|
for _, e := range m.config.ExcludeMetrics {
|
||||||
|
dev.excludeMetrics[e] = true
|
||||||
|
}
|
||||||
|
dev.index = i
|
||||||
|
m.devices = append(m.devices, dev)
|
||||||
|
}
|
||||||
|
|
||||||
|
// Set this flag only if everything is initialized properly, all required files exist, ...
|
||||||
|
m.init = true
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
|
||||||
|
// Read collects all metrics belonging to the sample collector
|
||||||
|
// and sends them through the output channel to the collector manager
|
||||||
|
func (m *RocmSmiCollector) Read(interval time.Duration, output chan lp.CCMetric) {
|
||||||
|
// Create a sample metric
|
||||||
|
timestamp := time.Now()
|
||||||
|
|
||||||
|
for _, dev := range m.devices {
|
||||||
|
metrics, ret := rocm_smi.DeviceGetMetrics(dev.device)
|
||||||
|
if ret != rocm_smi.STATUS_SUCCESS {
|
||||||
|
cclog.ComponentError(m.name, "Unable to get metrics for device at index", dev.index, ":", rocm_smi.StatusStringNoError(ret))
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
|
||||||
|
if !dev.excludeMetrics["rocm_gfx_util"] {
|
||||||
|
value := metrics.Average_gfx_activity
|
||||||
|
y, err := lp.New("rocm_gfx_util", dev.tags, dev.meta, map[string]interface{}{"value": value}, timestamp)
|
||||||
|
if err == nil {
|
||||||
|
output <- y
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if !dev.excludeMetrics["rocm_umc_util"] {
|
||||||
|
value := metrics.Average_umc_activity
|
||||||
|
y, err := lp.New("rocm_umc_util", dev.tags, dev.meta, map[string]interface{}{"value": value}, timestamp)
|
||||||
|
if err == nil {
|
||||||
|
output <- y
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if !dev.excludeMetrics["rocm_mm_util"] {
|
||||||
|
value := metrics.Average_mm_activity
|
||||||
|
y, err := lp.New("rocm_mm_util", dev.tags, dev.meta, map[string]interface{}{"value": value}, timestamp)
|
||||||
|
if err == nil {
|
||||||
|
output <- y
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if !dev.excludeMetrics["rocm_avg_power"] {
|
||||||
|
value := metrics.Average_socket_power
|
||||||
|
y, err := lp.New("rocm_avg_power", dev.tags, dev.meta, map[string]interface{}{"value": value}, timestamp)
|
||||||
|
if err == nil {
|
||||||
|
output <- y
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if !dev.excludeMetrics["rocm_temp_mem"] {
|
||||||
|
value := metrics.Temperature_mem
|
||||||
|
y, err := lp.New("rocm_temp_mem", dev.tags, dev.meta, map[string]interface{}{"value": value}, timestamp)
|
||||||
|
if err == nil {
|
||||||
|
output <- y
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if !dev.excludeMetrics["rocm_temp_hotspot"] {
|
||||||
|
value := metrics.Temperature_hotspot
|
||||||
|
y, err := lp.New("rocm_temp_hotspot", dev.tags, dev.meta, map[string]interface{}{"value": value}, timestamp)
|
||||||
|
if err == nil {
|
||||||
|
output <- y
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if !dev.excludeMetrics["rocm_temp_edge"] {
|
||||||
|
value := metrics.Temperature_edge
|
||||||
|
y, err := lp.New("rocm_temp_edge", dev.tags, dev.meta, map[string]interface{}{"value": value}, timestamp)
|
||||||
|
if err == nil {
|
||||||
|
output <- y
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if !dev.excludeMetrics["rocm_temp_vrgfx"] {
|
||||||
|
value := metrics.Temperature_vrgfx
|
||||||
|
y, err := lp.New("rocm_temp_vrgfx", dev.tags, dev.meta, map[string]interface{}{"value": value}, timestamp)
|
||||||
|
if err == nil {
|
||||||
|
output <- y
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if !dev.excludeMetrics["rocm_temp_vrsoc"] {
|
||||||
|
value := metrics.Temperature_vrsoc
|
||||||
|
y, err := lp.New("rocm_temp_vrsoc", dev.tags, dev.meta, map[string]interface{}{"value": value}, timestamp)
|
||||||
|
if err == nil {
|
||||||
|
output <- y
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if !dev.excludeMetrics["rocm_temp_vrmem"] {
|
||||||
|
value := metrics.Temperature_vrmem
|
||||||
|
y, err := lp.New("rocm_temp_vrmem", dev.tags, dev.meta, map[string]interface{}{"value": value}, timestamp)
|
||||||
|
if err == nil {
|
||||||
|
output <- y
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if !dev.excludeMetrics["rocm_gfx_clock"] {
|
||||||
|
value := metrics.Average_gfxclk_frequency
|
||||||
|
y, err := lp.New("rocm_gfx_clock", dev.tags, dev.meta, map[string]interface{}{"value": value}, timestamp)
|
||||||
|
if err == nil {
|
||||||
|
output <- y
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if !dev.excludeMetrics["rocm_soc_clock"] {
|
||||||
|
value := metrics.Average_socclk_frequency
|
||||||
|
y, err := lp.New("rocm_soc_clock", dev.tags, dev.meta, map[string]interface{}{"value": value}, timestamp)
|
||||||
|
if err == nil {
|
||||||
|
output <- y
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if !dev.excludeMetrics["rocm_u_clock"] {
|
||||||
|
value := metrics.Average_uclk_frequency
|
||||||
|
y, err := lp.New("rocm_u_clock", dev.tags, dev.meta, map[string]interface{}{"value": value}, timestamp)
|
||||||
|
if err == nil {
|
||||||
|
output <- y
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if !dev.excludeMetrics["rocm_v0_clock"] {
|
||||||
|
value := metrics.Average_vclk0_frequency
|
||||||
|
y, err := lp.New("rocm_v0_clock", dev.tags, dev.meta, map[string]interface{}{"value": value}, timestamp)
|
||||||
|
if err == nil {
|
||||||
|
output <- y
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if !dev.excludeMetrics["rocm_v1_clock"] {
|
||||||
|
value := metrics.Average_vclk1_frequency
|
||||||
|
y, err := lp.New("rocm_v1_clock", dev.tags, dev.meta, map[string]interface{}{"value": value}, timestamp)
|
||||||
|
if err == nil {
|
||||||
|
output <- y
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if !dev.excludeMetrics["rocm_d0_clock"] {
|
||||||
|
value := metrics.Average_dclk0_frequency
|
||||||
|
y, err := lp.New("rocm_d0_clock", dev.tags, dev.meta, map[string]interface{}{"value": value}, timestamp)
|
||||||
|
if err == nil {
|
||||||
|
output <- y
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if !dev.excludeMetrics["rocm_d1_clock"] {
|
||||||
|
value := metrics.Average_dclk1_frequency
|
||||||
|
y, err := lp.New("rocm_d1_clock", dev.tags, dev.meta, map[string]interface{}{"value": value}, timestamp)
|
||||||
|
if err == nil {
|
||||||
|
output <- y
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if !dev.excludeMetrics["rocm_temp_hbm"] {
|
||||||
|
for i := 0; i < rocm_smi.NUM_HBM_INSTANCES; i++ {
|
||||||
|
value := metrics.Temperature_hbm[i]
|
||||||
|
y, err := lp.New("rocm_temp_hbm", dev.tags, dev.meta, map[string]interface{}{"value": value}, timestamp)
|
||||||
|
if err == nil {
|
||||||
|
y.AddTag("stype", "device")
|
||||||
|
y.AddTag("stype-id", fmt.Sprintf("%d", i))
|
||||||
|
output <- y
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
// Close metric collector: close network connection, close files, close libraries, ...
|
||||||
|
// Called once by the collector manager
|
||||||
|
func (m *RocmSmiCollector) Close() {
|
||||||
|
// Unset flag
|
||||||
|
ret := rocm_smi.Shutdown()
|
||||||
|
if ret != rocm_smi.STATUS_SUCCESS {
|
||||||
|
cclog.ComponentError(m.name, "Failed to shutdown ROCm SMI library")
|
||||||
|
}
|
||||||
|
m.init = false
|
||||||
|
}
|
47
collectors/rocmsmiMetric.md
Normal file
47
collectors/rocmsmiMetric.md
Normal file
@ -0,0 +1,47 @@
|
|||||||
|
|
||||||
|
## `rocm_smi` collector
|
||||||
|
|
||||||
|
```json
|
||||||
|
"rocm_smi": {
|
||||||
|
"exclude_devices": [
|
||||||
|
"0", "1", "00000000:FF:01.0"
|
||||||
|
],
|
||||||
|
"exclude_metrics": [
|
||||||
|
"rocm_mm_util",
|
||||||
|
"rocm_temp_vrsoc"
|
||||||
|
],
|
||||||
|
"use_pci_info_as_type_id": true,
|
||||||
|
"add_pci_info_tag": false,
|
||||||
|
"add_serial_meta": false
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
The `rocm_smi` collector can be configured to leave out specific devices with the `exclude_devices` option. Each entry is either the logical index of the device in the list of available devices or its PCI address in a format similar to NVML (`%08X:%02X:%02X.%X`, i.e. `domain:bus:device.function` in upper-case hexadecimal). Entries are compared as exact strings. Metrics (listed below) that should not be sent to the MetricRouter can be excluded with the `exclude_metrics` option.
|
||||||
|
|
||||||
|
The metrics sent by the `rocm_smi` collector use `accelerator` as `type` tag. For the `type-id`, it uses the device handle index by default. With the `use_pci_info_as_type_id` option, the PCI ID is used instead. If both values should be added as tags, activate the `add_pci_info_tag` option instead: it keeps the device handle index as `type-id` and adds the PCI ID as a separate `pci_identifier` tag. The `add_pci_info_tag` option has no effect when `use_pci_info_as_type_id` is enabled.
|
||||||
|
|
||||||
|
Optionally, the serial number of each device can be added to the meta information with the `add_serial_meta` option. Meta information is not sent to the sinks unless configured otherwise.
|
||||||
|
|
||||||
|
|
||||||
|
Metrics:
|
||||||
|
* `rocm_gfx_util`
|
||||||
|
* `rocm_umc_util`
|
||||||
|
* `rocm_mm_util`
|
||||||
|
* `rocm_avg_power`
|
||||||
|
* `rocm_temp_mem`
|
||||||
|
* `rocm_temp_hotspot`
|
||||||
|
* `rocm_temp_edge`
|
||||||
|
* `rocm_temp_vrgfx`
|
||||||
|
* `rocm_temp_vrsoc`
|
||||||
|
* `rocm_temp_vrmem`
|
||||||
|
* `rocm_gfx_clock`
|
||||||
|
* `rocm_soc_clock`
|
||||||
|
* `rocm_u_clock`
|
||||||
|
* `rocm_v0_clock`
|
||||||
|
* `rocm_v1_clock`
|
||||||
|
* `rocm_d0_clock`
|
||||||
|
* `rocm_d1_clock`
|
||||||
|
* `rocm_temp_hbm`
|
||||||
|
|
||||||
|
|
||||||
|
Some metrics carry an additional sub type tag (`stype`). For example, the `rocm_temp_hbm` metrics set `stype=device,stype-id=<HBM_slice_number>`.
|
Loading…
Reference in New Issue
Block a user