// cc-metric-collector/collectors/nvidiaMetric.go

package collectors

import (
"encoding/json"
"errors"
"fmt"
"log"
"time"

	cclog "github.com/ClusterCockpit/cc-metric-collector/internal/ccLogger"
	lp "github.com/ClusterCockpit/cc-metric-collector/internal/ccMetric"
	"github.com/NVIDIA/go-nvml/pkg/nvml"
)

type NvidiaCollectorConfig struct {
ExcludeMetrics []string `json:"exclude_metrics,omitempty"`
ExcludeDevices []string `json:"exclude_devices,omitempty"`
AddPciInfoTag bool `json:"add_pci_info_tag,omitempty"`
}
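
// A configuration sketch for this collector (assuming it is registered under
// a "nvidia" key in the collectors configuration; the key and values are
// illustrative):
//
//	"nvidia": {
//	    "exclude_devices": ["0"],
//	    "exclude_metrics": ["nv_fan", "nv_ecc_mode"],
//	    "add_pci_info_tag": true
//	}
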
type NvidiaCollectorDevice struct {
device nvml.Device
excludeMetrics map[string]bool
tags map[string]string
}
type NvidiaCollector struct {
	metricCollector
	num_gpus int
	config   NvidiaCollectorConfig
	gpus     []NvidiaCollectorDevice
}
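// CatchPanic recovers from panics raised by the NVML bindings and disables
// the collector so subsequent Read calls become no-ops.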
func (m *NvidiaCollector) CatchPanic() {
	if rerr := recover(); rerr != nil {
		log.Print(rerr)
		m.init = false
	}
}
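// Init parses the JSON configuration, initializes NVML and collects the
// device handles, tags and per-device metric exclusions for all GPUs that
// are not listed in exclude_devices.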
func (m *NvidiaCollector) Init(config json.RawMessage) error {
	var err error
	m.name = "NvidiaCollector"
	// PCI info tags are disabled unless requested in the configuration
	m.config.AddPciInfoTag = false
	m.setup()
if len(config) > 0 {
err = json.Unmarshal(config, &m.config)
if err != nil {
return err
}
}
m.meta = map[string]string{
"source": m.name,
"group": "Nvidia",
}
m.num_gpus = 0
defer m.CatchPanic()
// Initialize NVIDIA Management Library (NVML)
ret := nvml.Init()
if ret != nvml.SUCCESS {
err = errors.New(nvml.ErrorString(ret))
cclog.ComponentError(m.name, "Unable to initialize NVML", err.Error())
return err
}
// Number of NVIDIA GPUs
num_gpus, ret := nvml.DeviceGetCount()
if ret != nvml.SUCCESS {
err = errors.New(nvml.ErrorString(ret))
cclog.ComponentError(m.name, "Unable to get device count", err.Error())
return err
}
	// For all GPUs
	m.gpus = make([]NvidiaCollectorDevice, 0, num_gpus)
	for i := 0; i < num_gpus; i++ {
		// Skip excluded devices; only non-excluded GPUs are appended to m.gpus,
		// so Read never operates on an uninitialized device handle
		str_i := fmt.Sprintf("%d", i)
		if _, skip := stringArrayContains(m.config.ExcludeDevices, str_i); skip {
			continue
		}
		// Get device handle
		device, ret := nvml.DeviceGetHandleByIndex(i)
		if ret != nvml.SUCCESS {
			err = errors.New(nvml.ErrorString(ret))
			cclog.ComponentError(m.name, "Unable to get device at index", i, ":", err.Error())
			return err
		}
		g := NvidiaCollectorDevice{
			device: device,
			// Tags attached to all metrics of this device
			tags: map[string]string{
				"type":    "accelerator",
				"type-id": str_i,
			},
			excludeMetrics: map[string]bool{},
		}
		// Mark excluded metrics
		for _, e := range m.config.ExcludeMetrics {
			g.excludeMetrics[e] = true
		}
		// Add PCI info as tag
		if m.config.AddPciInfoTag {
			pciInfo, ret := nvml.DeviceGetPciInfo(g.device)
			if ret != nvml.SUCCESS {
				err = errors.New(nvml.ErrorString(ret))
				cclog.ComponentError(m.name, "Unable to get PCI info for device at index", i, ":", err.Error())
				return err
			}
			// PCI identifier in extended BDF notation (domain:bus:device.function);
			// the function number is fixed at 0
			g.tags["pci_identifier"] = fmt.Sprintf(
				"%08X:%02X:%02X.0",
				pciInfo.Domain,
				pciInfo.Bus,
				pciInfo.Device)
		}
		m.gpus = append(m.gpus, g)
	}
m.init = true
return nil
}
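// Read queries NVML for every configured device and sends one CCMetric per
// enabled metric to the output channel: utilization, memory, temperature,
// fan speed, ECC mode and error counts, performance state, power and clocks.
// Metrics listed in exclude_metrics are skipped.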
func (m *NvidiaCollector) Read(interval time.Duration, output chan lp.CCMetric) {
	if !m.init {
		return
	}
	for _, device := range m.gpus {
		// exclude returns true if the metric is listed in exclude_metrics.
		// The original check was inverted, which would have skipped every
		// metric not listed in the exclude list.
		exclude := func(metric string) bool {
			return device.excludeMetrics[metric]
		}
ex_nv_util := exclude("nv_util")
ex_nv_mem_util := exclude("nv_mem_util")
if (!ex_nv_util) || (!ex_nv_mem_util) {
util, ret := nvml.DeviceGetUtilizationRates(device.device)
if ret == nvml.SUCCESS {
if !ex_nv_util {
y, err := lp.New("nv_util", device.tags, m.meta, map[string]interface{}{"value": float64(util.Gpu)}, time.Now())
if err == nil {
output <- y
}
}
if !ex_nv_mem_util {
y, err := lp.New("nv_mem_util", device.tags, m.meta, map[string]interface{}{"value": float64(util.Memory)}, time.Now())
if err == nil {
output <- y
}
}
}
}
ex_nv_mem_total := exclude("nv_mem_total")
ex_nv_fb_memory := exclude("nv_fb_memory")
if (!ex_nv_mem_total) || (!ex_nv_fb_memory) {
meminfo, ret := nvml.DeviceGetMemoryInfo(device.device)
if ret == nvml.SUCCESS {
if !ex_nv_mem_total {
t := float64(meminfo.Total) / (1024 * 1024)
y, err := lp.New("nv_mem_total", device.tags, m.meta, map[string]interface{}{"value": t}, time.Now())
if err == nil {
y.AddMeta("unit", "MByte")
output <- y
}
}
if !ex_nv_fb_memory {
f := float64(meminfo.Used) / (1024 * 1024)
y, err := lp.New("nv_fb_memory", device.tags, m.meta, map[string]interface{}{"value": f}, time.Now())
if err == nil {
y.AddMeta("unit", "MByte")
output <- y
}
}
}
}
if !exclude("nv_temp") {
temp, ret := nvml.DeviceGetTemperature(device.device, nvml.TEMPERATURE_GPU)
if ret == nvml.SUCCESS {
y, err := lp.New("nv_temp", device.tags, m.meta, map[string]interface{}{"value": float64(temp)}, time.Now())
if err == nil {
y.AddMeta("unit", "degC")
output <- y
}
}
}
if !exclude("nv_fan") {
fan, ret := nvml.DeviceGetFanSpeed(device.device)
if ret == nvml.SUCCESS {
y, err := lp.New("nv_fan", device.tags, m.meta, map[string]interface{}{"value": float64(fan)}, time.Now())
if err == nil {
output <- y
}
}
}
if !exclude("nv_ecc_mode") {
_, ecc_pend, ret := nvml.DeviceGetEccMode(device.device)
if ret == nvml.SUCCESS {
var y lp.CCMetric
var err error
				switch ecc_pend {
				case nvml.FEATURE_DISABLED:
					y, err = lp.New("nv_ecc_mode", device.tags, m.meta, map[string]interface{}{"value": "OFF"}, time.Now())
				case nvml.FEATURE_ENABLED:
					y, err = lp.New("nv_ecc_mode", device.tags, m.meta, map[string]interface{}{"value": "ON"}, time.Now())
				default:
					y, err = lp.New("nv_ecc_mode", device.tags, m.meta, map[string]interface{}{"value": "UNKNOWN"}, time.Now())
				}
if err == nil {
output <- y
}
} else if ret == nvml.ERROR_NOT_SUPPORTED {
				y, err := lp.New("nv_ecc_mode", device.tags, m.meta, map[string]interface{}{"value": "N/A"}, time.Now())
if err == nil {
output <- y
}
}
}
if !exclude("nv_perf_state") {
pstate, ret := nvml.DeviceGetPerformanceState(device.device)
if ret == nvml.SUCCESS {
y, err := lp.New("nv_perf_state", device.tags, m.meta, map[string]interface{}{"value": fmt.Sprintf("P%d", int(pstate))}, time.Now())
if err == nil {
output <- y
}
}
}
		if !exclude("nv_power_usage_report") {
			// NVML reports power draw in milliwatts; convert to watts
			power, ret := nvml.DeviceGetPowerUsage(device.device)
if ret == nvml.SUCCESS {
y, err := lp.New("nv_power_usage_report", device.tags, m.meta, map[string]interface{}{"value": float64(power) / 1000}, time.Now())
if err == nil {
output <- y
}
}
}
		if !exclude("nv_graphics_clock_report") {
			// NVML reports clock frequencies in MHz
			gclk, ret := nvml.DeviceGetClockInfo(device.device, nvml.CLOCK_GRAPHICS)
if ret == nvml.SUCCESS {
y, err := lp.New("nv_graphics_clock_report", device.tags, m.meta, map[string]interface{}{"value": float64(gclk)}, time.Now())
if err == nil {
output <- y
}
}
}
if !exclude("nv_sm_clock_report") {
smclk, ret := nvml.DeviceGetClockInfo(device.device, nvml.CLOCK_SM)
if ret == nvml.SUCCESS {
y, err := lp.New("nv_sm_clock_report", device.tags, m.meta, map[string]interface{}{"value": float64(smclk)}, time.Now())
if err == nil {
output <- y
}
}
}
if !exclude("nv_mem_clock_report") {
memclk, ret := nvml.DeviceGetClockInfo(device.device, nvml.CLOCK_MEM)
if ret == nvml.SUCCESS {
y, err := lp.New("nv_mem_clock_report", device.tags, m.meta, map[string]interface{}{"value": float64(memclk)}, time.Now())
if err == nil {
output <- y
}
}
}
if !exclude("nv_max_graphics_clock") {
max_gclk, ret := nvml.DeviceGetMaxClockInfo(device.device, nvml.CLOCK_GRAPHICS)
if ret == nvml.SUCCESS {
y, err := lp.New("nv_max_graphics_clock", device.tags, m.meta, map[string]interface{}{"value": float64(max_gclk)}, time.Now())
if err == nil {
output <- y
}
}
}
		if !exclude("nv_max_sm_clock") {
			// Use the max-clock query, matching the metric name (the original
			// mistakenly queried the current SM clock)
			max_smclk, ret := nvml.DeviceGetMaxClockInfo(device.device, nvml.CLOCK_SM)
if ret == nvml.SUCCESS {
y, err := lp.New("nv_max_sm_clock", device.tags, m.meta, map[string]interface{}{"value": float64(max_smclk)}, time.Now())
if err == nil {
output <- y
}
}
}
		if !exclude("nv_max_mem_clock") {
			// Likewise, query the max memory clock rather than the current one
			max_memclk, ret := nvml.DeviceGetMaxClockInfo(device.device, nvml.CLOCK_MEM)
if ret == nvml.SUCCESS {
y, err := lp.New("nv_max_mem_clock", device.tags, m.meta, map[string]interface{}{"value": float64(max_memclk)}, time.Now())
if err == nil {
output <- y
}
}
}
		if !exclude("nv_ecc_db_error") {
			// Aggregate count of uncorrected (double-bit) ECC errors; named
			// constants replace the magic numbers (1, 1)
			ecc_db, ret := nvml.DeviceGetTotalEccErrors(device.device, nvml.MEMORY_ERROR_TYPE_UNCORRECTED, nvml.AGGREGATE_ECC)
if ret == nvml.SUCCESS {
y, err := lp.New("nv_ecc_db_error", device.tags, m.meta, map[string]interface{}{"value": float64(ecc_db)}, time.Now())
if err == nil {
output <- y
}
}
}
		if !exclude("nv_ecc_sb_error") {
			// Aggregate count of corrected (single-bit) ECC errors; named
			// constants replace the magic numbers (0, 1)
			ecc_sb, ret := nvml.DeviceGetTotalEccErrors(device.device, nvml.MEMORY_ERROR_TYPE_CORRECTED, nvml.AGGREGATE_ECC)
if ret == nvml.SUCCESS {
y, err := lp.New("nv_ecc_sb_error", device.tags, m.meta, map[string]interface{}{"value": float64(ecc_sb)}, time.Now())
if err == nil {
output <- y
}
}
}
		if !exclude("nv_power_man_limit") {
			// Power management limit as reported by NVML, in milliwatts
			pwr_limit, ret := nvml.DeviceGetPowerManagementLimit(device.device)
if ret == nvml.SUCCESS {
y, err := lp.New("nv_power_man_limit", device.tags, m.meta, map[string]interface{}{"value": float64(pwr_limit)}, time.Now())
if err == nil {
output <- y
}
}
}
if !exclude("nv_encoder_util") {
enc_util, _, ret := nvml.DeviceGetEncoderUtilization(device.device)
if ret == nvml.SUCCESS {
y, err := lp.New("nv_encoder_util", device.tags, m.meta, map[string]interface{}{"value": float64(enc_util)}, time.Now())
if err == nil {
output <- y
}
}
}
if !exclude("nv_decoder_util") {
dec_util, _, ret := nvml.DeviceGetDecoderUtilization(device.device)
if ret == nvml.SUCCESS {
y, err := lp.New("nv_decoder_util", device.tags, m.meta, map[string]interface{}{"value": float64(dec_util)}, time.Now())
if err == nil {
output <- y
}
}
}
}
}
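
// Close shuts down NVML and marks the collector as uninitialized.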
func (m *NvidiaCollector) Close() {
	if m.init {
		nvml.Shutdown()
		m.init = false
	}
}
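
// A minimal usage sketch under the assumptions of the surrounding collector
// framework (channel size and interval are illustrative; a real caller drains
// the output channel concurrently):
//
//	var m NvidiaCollector
//	if err := m.Init(json.RawMessage(`{"add_pci_info_tag": true}`)); err == nil {
//	    output := make(chan lp.CCMetric, 128)
//	    m.Read(10*time.Second, output)
//	    m.Close()
//	}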