Formatting

This commit is contained in:
Thomas Roehl
2021-11-25 15:11:39 +01:00
parent a88ea4fd43
commit 9232dd9732
15 changed files with 577 additions and 566 deletions

View File

@@ -1,24 +1,24 @@
package collectors
import (
"encoding/json"
"errors"
"fmt"
"github.com/NVIDIA/go-nvml/pkg/nvml"
lp "github.com/influxdata/line-protocol"
"log"
"time"
"encoding/json"
)
type NvidiaCollectorConfig struct {
ExcludeMetrics []string `json:"exclude_metrics, omitempty"`
ExcludeDevices []string `json:"exclude_devices, omitempty"`
ExcludeMetrics []string `json:"exclude_metrics, omitempty"`
ExcludeDevices []string `json:"exclude_devices, omitempty"`
}
type NvidiaCollector struct {
MetricCollector
num_gpus int
config NvidiaCollectorConfig
config NvidiaCollectorConfig
}
func (m *NvidiaCollector) CatchPanic() error {
@@ -32,14 +32,14 @@ func (m *NvidiaCollector) CatchPanic() error {
}
func (m *NvidiaCollector) Init(config []byte) error {
var err error
var err error
m.name = "NvidiaCollector"
m.setup()
if len(config) > 0 {
err = json.Unmarshal(config, &m.config)
if err != nil {
return err
}
err = json.Unmarshal(config, &m.config)
if err != nil {
return err
}
}
m.num_gpus = 0
defer m.CatchPanic()
@@ -58,9 +58,9 @@ func (m *NvidiaCollector) Init(config []byte) error {
}
func (m *NvidiaCollector) Read(interval time.Duration, out *[]lp.MutableMetric) {
if (!m.init) {
return
}
if !m.init {
return
}
for i := 0; i < m.num_gpus; i++ {
device, ret := nvml.DeviceGetHandleByIndex(i)
if ret != nvml.SUCCESS {
@@ -69,13 +69,13 @@ func (m *NvidiaCollector) Read(interval time.Duration, out *[]lp.MutableMetric)
}
_, skip := stringArrayContains(m.config.ExcludeDevices, fmt.Sprintf("%d", i))
if skip {
continue
continue
}
tags := map[string]string{"type": "accelerator", "type-id": fmt.Sprintf("%d", i)}
util, ret := nvml.DeviceGetUtilizationRates(device)
if ret == nvml.SUCCESS {
_, skip = stringArrayContains(m.config.ExcludeMetrics, "util")
_, skip = stringArrayContains(m.config.ExcludeMetrics, "util")
y, err := lp.New("util", tags, map[string]interface{}{"value": float64(util.Gpu)}, time.Now())
if err == nil && !skip {
*out = append(*out, y)
@@ -105,7 +105,7 @@ func (m *NvidiaCollector) Read(interval time.Duration, out *[]lp.MutableMetric)
temp, ret := nvml.DeviceGetTemperature(device, nvml.TEMPERATURE_GPU)
if ret == nvml.SUCCESS {
_, skip = stringArrayContains(m.config.ExcludeMetrics, "temp")
_, skip = stringArrayContains(m.config.ExcludeMetrics, "temp")
y, err := lp.New("temp", tags, map[string]interface{}{"value": float64(temp)}, time.Now())
if err == nil && !skip {
*out = append(*out, y)
@@ -114,7 +114,7 @@ func (m *NvidiaCollector) Read(interval time.Duration, out *[]lp.MutableMetric)
fan, ret := nvml.DeviceGetFanSpeed(device)
if ret == nvml.SUCCESS {
_, skip = stringArrayContains(m.config.ExcludeMetrics, "fan")
_, skip = stringArrayContains(m.config.ExcludeMetrics, "fan")
y, err := lp.New("fan", tags, map[string]interface{}{"value": float64(fan)}, time.Now())
if err == nil && !skip {
*out = append(*out, y)
@@ -138,7 +138,7 @@ func (m *NvidiaCollector) Read(interval time.Duration, out *[]lp.MutableMetric)
*out = append(*out, y)
}
} else if ret == nvml.ERROR_NOT_SUPPORTED {
_, skip = stringArrayContains(m.config.ExcludeMetrics, "ecc_mode")
_, skip = stringArrayContains(m.config.ExcludeMetrics, "ecc_mode")
y, err := lp.New("ecc_mode", tags, map[string]interface{}{"value": string("N/A")}, time.Now())
if err == nil && !skip {
*out = append(*out, y)
@@ -147,7 +147,7 @@ func (m *NvidiaCollector) Read(interval time.Duration, out *[]lp.MutableMetric)
pstate, ret := nvml.DeviceGetPerformanceState(device)
if ret == nvml.SUCCESS {
_, skip = stringArrayContains(m.config.ExcludeMetrics, "perf_state")
_, skip = stringArrayContains(m.config.ExcludeMetrics, "perf_state")
y, err := lp.New("perf_state", tags, map[string]interface{}{"value": fmt.Sprintf("P%d", int(pstate))}, time.Now())
if err == nil && !skip {
*out = append(*out, y)
@@ -156,7 +156,7 @@ func (m *NvidiaCollector) Read(interval time.Duration, out *[]lp.MutableMetric)
power, ret := nvml.DeviceGetPowerUsage(device)
if ret == nvml.SUCCESS {
_, skip = stringArrayContains(m.config.ExcludeMetrics, "power_usage_report")
_, skip = stringArrayContains(m.config.ExcludeMetrics, "power_usage_report")
y, err := lp.New("power_usage_report", tags, map[string]interface{}{"value": float64(power) / 1000}, time.Now())
if err == nil && !skip {
*out = append(*out, y)
@@ -165,7 +165,7 @@ func (m *NvidiaCollector) Read(interval time.Duration, out *[]lp.MutableMetric)
gclk, ret := nvml.DeviceGetClockInfo(device, nvml.CLOCK_GRAPHICS)
if ret == nvml.SUCCESS {
_, skip = stringArrayContains(m.config.ExcludeMetrics, "graphics_clock_report")
_, skip = stringArrayContains(m.config.ExcludeMetrics, "graphics_clock_report")
y, err := lp.New("graphics_clock_report", tags, map[string]interface{}{"value": float64(gclk)}, time.Now())
if err == nil && !skip {
*out = append(*out, y)
@@ -174,7 +174,7 @@ func (m *NvidiaCollector) Read(interval time.Duration, out *[]lp.MutableMetric)
smclk, ret := nvml.DeviceGetClockInfo(device, nvml.CLOCK_SM)
if ret == nvml.SUCCESS {
_, skip = stringArrayContains(m.config.ExcludeMetrics, "sm_clock_report")
_, skip = stringArrayContains(m.config.ExcludeMetrics, "sm_clock_report")
y, err := lp.New("sm_clock_report", tags, map[string]interface{}{"value": float64(smclk)}, time.Now())
if err == nil && !skip {
*out = append(*out, y)
@@ -183,7 +183,7 @@ func (m *NvidiaCollector) Read(interval time.Duration, out *[]lp.MutableMetric)
memclk, ret := nvml.DeviceGetClockInfo(device, nvml.CLOCK_MEM)
if ret == nvml.SUCCESS {
_, skip = stringArrayContains(m.config.ExcludeMetrics, "mem_clock_report")
_, skip = stringArrayContains(m.config.ExcludeMetrics, "mem_clock_report")
y, err := lp.New("mem_clock_report", tags, map[string]interface{}{"value": float64(memclk)}, time.Now())
if err == nil && !skip {
*out = append(*out, y)
@@ -192,7 +192,7 @@ func (m *NvidiaCollector) Read(interval time.Duration, out *[]lp.MutableMetric)
max_gclk, ret := nvml.DeviceGetMaxClockInfo(device, nvml.CLOCK_GRAPHICS)
if ret == nvml.SUCCESS {
_, skip = stringArrayContains(m.config.ExcludeMetrics, "max_graphics_clock")
_, skip = stringArrayContains(m.config.ExcludeMetrics, "max_graphics_clock")
y, err := lp.New("max_graphics_clock", tags, map[string]interface{}{"value": float64(max_gclk)}, time.Now())
if err == nil && !skip {
*out = append(*out, y)
@@ -201,7 +201,7 @@ func (m *NvidiaCollector) Read(interval time.Duration, out *[]lp.MutableMetric)
max_smclk, ret := nvml.DeviceGetClockInfo(device, nvml.CLOCK_SM)
if ret == nvml.SUCCESS {
_, skip = stringArrayContains(m.config.ExcludeMetrics, "max_sm_clock")
_, skip = stringArrayContains(m.config.ExcludeMetrics, "max_sm_clock")
y, err := lp.New("max_sm_clock", tags, map[string]interface{}{"value": float64(max_smclk)}, time.Now())
if err == nil && !skip {
*out = append(*out, y)
@@ -210,7 +210,7 @@ func (m *NvidiaCollector) Read(interval time.Duration, out *[]lp.MutableMetric)
max_memclk, ret := nvml.DeviceGetClockInfo(device, nvml.CLOCK_MEM)
if ret == nvml.SUCCESS {
_, skip = stringArrayContains(m.config.ExcludeMetrics, "max_mem_clock")
_, skip = stringArrayContains(m.config.ExcludeMetrics, "max_mem_clock")
y, err := lp.New("max_mem_clock", tags, map[string]interface{}{"value": float64(max_memclk)}, time.Now())
if err == nil && !skip {
*out = append(*out, y)
@@ -219,7 +219,7 @@ func (m *NvidiaCollector) Read(interval time.Duration, out *[]lp.MutableMetric)
ecc_db, ret := nvml.DeviceGetTotalEccErrors(device, 1, 1)
if ret == nvml.SUCCESS {
_, skip = stringArrayContains(m.config.ExcludeMetrics, "ecc_db_error")
_, skip = stringArrayContains(m.config.ExcludeMetrics, "ecc_db_error")
y, err := lp.New("ecc_db_error", tags, map[string]interface{}{"value": float64(ecc_db)}, time.Now())
if err == nil && !skip {
*out = append(*out, y)
@@ -228,7 +228,7 @@ func (m *NvidiaCollector) Read(interval time.Duration, out *[]lp.MutableMetric)
ecc_sb, ret := nvml.DeviceGetTotalEccErrors(device, 0, 1)
if ret == nvml.SUCCESS {
_, skip = stringArrayContains(m.config.ExcludeMetrics, "ecc_sb_error")
_, skip = stringArrayContains(m.config.ExcludeMetrics, "ecc_sb_error")
y, err := lp.New("ecc_sb_error", tags, map[string]interface{}{"value": float64(ecc_sb)}, time.Now())
if err == nil && !skip {
*out = append(*out, y)
@@ -237,7 +237,7 @@ func (m *NvidiaCollector) Read(interval time.Duration, out *[]lp.MutableMetric)
pwr_limit, ret := nvml.DeviceGetPowerManagementLimit(device)
if ret == nvml.SUCCESS {
_, skip = stringArrayContains(m.config.ExcludeMetrics, "power_man_limit")
_, skip = stringArrayContains(m.config.ExcludeMetrics, "power_man_limit")
y, err := lp.New("power_man_limit", tags, map[string]interface{}{"value": float64(pwr_limit)}, time.Now())
if err == nil && !skip {
*out = append(*out, y)
@@ -246,7 +246,7 @@ func (m *NvidiaCollector) Read(interval time.Duration, out *[]lp.MutableMetric)
enc_util, _, ret := nvml.DeviceGetEncoderUtilization(device)
if ret == nvml.SUCCESS {
_, skip = stringArrayContains(m.config.ExcludeMetrics, "encoder_util")
_, skip = stringArrayContains(m.config.ExcludeMetrics, "encoder_util")
y, err := lp.New("encoder_util", tags, map[string]interface{}{"value": float64(enc_util)}, time.Now())
if err == nil && !skip {
*out = append(*out, y)
@@ -255,7 +255,7 @@ func (m *NvidiaCollector) Read(interval time.Duration, out *[]lp.MutableMetric)
dec_util, _, ret := nvml.DeviceGetDecoderUtilization(device)
if ret == nvml.SUCCESS {
_, skip = stringArrayContains(m.config.ExcludeMetrics, "decoder_util")
_, skip = stringArrayContains(m.config.ExcludeMetrics, "decoder_util")
y, err := lp.New("decoder_util", tags, map[string]interface{}{"value": float64(dec_util)}, time.Now())
if err == nil && !skip {
*out = append(*out, y)