mirror of
https://github.com/ClusterCockpit/cc-metric-collector.git
synced 2025-07-31 08:56:06 +02:00
Collector-specific configuration. LIKWID collector derives metrics itself, Run once CLI option
This commit is contained in:
@@ -7,11 +7,18 @@ import (
|
||||
lp "github.com/influxdata/line-protocol"
|
||||
"log"
|
||||
"time"
|
||||
"encoding/json"
|
||||
)
|
||||
|
||||
type NvidiaCollectorConfig struct {
|
||||
ExcludeMetrics []string `json:"exclude_metrics, omitempty"`
|
||||
ExcludeDevices []string `json:"exclude_devices, omitempty"`
|
||||
}
|
||||
|
||||
type NvidiaCollector struct {
|
||||
MetricCollector
|
||||
num_gpus int
|
||||
config NvidiaCollectorConfig
|
||||
}
|
||||
|
||||
func (m *NvidiaCollector) CatchPanic() error {
|
||||
@@ -24,20 +31,26 @@ func (m *NvidiaCollector) CatchPanic() error {
|
||||
return nil
|
||||
}
|
||||
|
||||
func (m *NvidiaCollector) Init() error {
|
||||
func (m *NvidiaCollector) Init(config []byte) error {
|
||||
var err error
|
||||
m.name = "NvidiaCollector"
|
||||
m.setup()
|
||||
if len(config) > 0 {
|
||||
err = json.Unmarshal(config, &m.config)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
}
|
||||
m.num_gpus = 0
|
||||
defer m.CatchPanic()
|
||||
ret := nvml.Init()
|
||||
if ret != nvml.SUCCESS {
|
||||
err := errors.New(nvml.ErrorString(ret))
|
||||
log.Print(err)
|
||||
err = errors.New(nvml.ErrorString(ret))
|
||||
return err
|
||||
}
|
||||
m.num_gpus, ret = nvml.DeviceGetCount()
|
||||
if ret != nvml.SUCCESS {
|
||||
err := errors.New(nvml.ErrorString(ret))
|
||||
err = errors.New(nvml.ErrorString(ret))
|
||||
return err
|
||||
}
|
||||
m.init = true
|
||||
@@ -45,23 +58,31 @@ func (m *NvidiaCollector) Init() error {
|
||||
}
|
||||
|
||||
func (m *NvidiaCollector) Read(interval time.Duration, out *[]lp.MutableMetric) {
|
||||
|
||||
if (!m.init) {
|
||||
return
|
||||
}
|
||||
for i := 0; i < m.num_gpus; i++ {
|
||||
device, ret := nvml.DeviceGetHandleByIndex(i)
|
||||
if ret != nvml.SUCCESS {
|
||||
log.Fatalf("Unable to get device at index %d: %v", i, nvml.ErrorString(ret))
|
||||
return
|
||||
}
|
||||
_, skip := stringArrayContains(m.config.ExcludeDevices, fmt.Sprintf("%d", i))
|
||||
if skip {
|
||||
continue
|
||||
}
|
||||
tags := map[string]string{"type": "accelerator", "type-id": fmt.Sprintf("%d", i)}
|
||||
|
||||
util, ret := nvml.DeviceGetUtilizationRates(device)
|
||||
if ret == nvml.SUCCESS {
|
||||
_, skip = stringArrayContains(m.config.ExcludeMetrics, "util")
|
||||
y, err := lp.New("util", tags, map[string]interface{}{"value": float64(util.Gpu)}, time.Now())
|
||||
if err == nil {
|
||||
if err == nil && !skip {
|
||||
*out = append(*out, y)
|
||||
}
|
||||
_, skip = stringArrayContains(m.config.ExcludeMetrics, "mem_util")
|
||||
y, err = lp.New("mem_util", tags, map[string]interface{}{"value": float64(util.Memory)}, time.Now())
|
||||
if err == nil {
|
||||
if err == nil && !skip {
|
||||
*out = append(*out, y)
|
||||
}
|
||||
}
|
||||
@@ -69,29 +90,33 @@ func (m *NvidiaCollector) Read(interval time.Duration, out *[]lp.MutableMetric)
|
||||
meminfo, ret := nvml.DeviceGetMemoryInfo(device)
|
||||
if ret == nvml.SUCCESS {
|
||||
t := float64(meminfo.Total) / (1024 * 1024)
|
||||
_, skip = stringArrayContains(m.config.ExcludeMetrics, "mem_total")
|
||||
y, err := lp.New("mem_total", tags, map[string]interface{}{"value": t}, time.Now())
|
||||
if err == nil {
|
||||
if err == nil && !skip {
|
||||
*out = append(*out, y)
|
||||
}
|
||||
f := float64(meminfo.Used) / (1024 * 1024)
|
||||
_, skip = stringArrayContains(m.config.ExcludeMetrics, "fb_memory")
|
||||
y, err = lp.New("fb_memory", tags, map[string]interface{}{"value": f}, time.Now())
|
||||
if err == nil {
|
||||
if err == nil && !skip {
|
||||
*out = append(*out, y)
|
||||
}
|
||||
}
|
||||
|
||||
temp, ret := nvml.DeviceGetTemperature(device, nvml.TEMPERATURE_GPU)
|
||||
if ret == nvml.SUCCESS {
|
||||
_, skip = stringArrayContains(m.config.ExcludeMetrics, "temp")
|
||||
y, err := lp.New("temp", tags, map[string]interface{}{"value": float64(temp)}, time.Now())
|
||||
if err == nil {
|
||||
if err == nil && !skip {
|
||||
*out = append(*out, y)
|
||||
}
|
||||
}
|
||||
|
||||
fan, ret := nvml.DeviceGetFanSpeed(device)
|
||||
if ret == nvml.SUCCESS {
|
||||
_, skip = stringArrayContains(m.config.ExcludeMetrics, "fan")
|
||||
y, err := lp.New("fan", tags, map[string]interface{}{"value": float64(fan)}, time.Now())
|
||||
if err == nil {
|
||||
if err == nil && !skip {
|
||||
*out = append(*out, y)
|
||||
}
|
||||
}
|
||||
@@ -108,116 +133,131 @@ func (m *NvidiaCollector) Read(interval time.Duration, out *[]lp.MutableMetric)
|
||||
default:
|
||||
y, err = lp.New("ecc_mode", tags, map[string]interface{}{"value": string("UNKNOWN")}, time.Now())
|
||||
}
|
||||
if err == nil {
|
||||
_, skip = stringArrayContains(m.config.ExcludeMetrics, "ecc_mode")
|
||||
if err == nil && !skip {
|
||||
*out = append(*out, y)
|
||||
}
|
||||
} else if ret == nvml.ERROR_NOT_SUPPORTED {
|
||||
_, skip = stringArrayContains(m.config.ExcludeMetrics, "ecc_mode")
|
||||
y, err := lp.New("ecc_mode", tags, map[string]interface{}{"value": string("N/A")}, time.Now())
|
||||
if err == nil {
|
||||
if err == nil && !skip {
|
||||
*out = append(*out, y)
|
||||
}
|
||||
}
|
||||
|
||||
pstate, ret := nvml.DeviceGetPerformanceState(device)
|
||||
if ret == nvml.SUCCESS {
|
||||
_, skip = stringArrayContains(m.config.ExcludeMetrics, "perf_state")
|
||||
y, err := lp.New("perf_state", tags, map[string]interface{}{"value": fmt.Sprintf("P%d", int(pstate))}, time.Now())
|
||||
if err == nil {
|
||||
if err == nil && !skip {
|
||||
*out = append(*out, y)
|
||||
}
|
||||
}
|
||||
|
||||
power, ret := nvml.DeviceGetPowerUsage(device)
|
||||
if ret == nvml.SUCCESS {
|
||||
_, skip = stringArrayContains(m.config.ExcludeMetrics, "power_usage_report")
|
||||
y, err := lp.New("power_usage_report", tags, map[string]interface{}{"value": float64(power) / 1000}, time.Now())
|
||||
if err == nil {
|
||||
if err == nil && !skip {
|
||||
*out = append(*out, y)
|
||||
}
|
||||
}
|
||||
|
||||
gclk, ret := nvml.DeviceGetClockInfo(device, nvml.CLOCK_GRAPHICS)
|
||||
if ret == nvml.SUCCESS {
|
||||
_, skip = stringArrayContains(m.config.ExcludeMetrics, "graphics_clock_report")
|
||||
y, err := lp.New("graphics_clock_report", tags, map[string]interface{}{"value": float64(gclk)}, time.Now())
|
||||
if err == nil {
|
||||
if err == nil && !skip {
|
||||
*out = append(*out, y)
|
||||
}
|
||||
}
|
||||
|
||||
smclk, ret := nvml.DeviceGetClockInfo(device, nvml.CLOCK_SM)
|
||||
if ret == nvml.SUCCESS {
|
||||
_, skip = stringArrayContains(m.config.ExcludeMetrics, "sm_clock_report")
|
||||
y, err := lp.New("sm_clock_report", tags, map[string]interface{}{"value": float64(smclk)}, time.Now())
|
||||
if err == nil {
|
||||
if err == nil && !skip {
|
||||
*out = append(*out, y)
|
||||
}
|
||||
}
|
||||
|
||||
memclk, ret := nvml.DeviceGetClockInfo(device, nvml.CLOCK_MEM)
|
||||
if ret == nvml.SUCCESS {
|
||||
_, skip = stringArrayContains(m.config.ExcludeMetrics, "mem_clock_report")
|
||||
y, err := lp.New("mem_clock_report", tags, map[string]interface{}{"value": float64(memclk)}, time.Now())
|
||||
if err == nil {
|
||||
if err == nil && !skip {
|
||||
*out = append(*out, y)
|
||||
}
|
||||
}
|
||||
|
||||
max_gclk, ret := nvml.DeviceGetMaxClockInfo(device, nvml.CLOCK_GRAPHICS)
|
||||
if ret == nvml.SUCCESS {
|
||||
_, skip = stringArrayContains(m.config.ExcludeMetrics, "max_graphics_clock")
|
||||
y, err := lp.New("max_graphics_clock", tags, map[string]interface{}{"value": float64(max_gclk)}, time.Now())
|
||||
if err == nil {
|
||||
if err == nil && !skip {
|
||||
*out = append(*out, y)
|
||||
}
|
||||
}
|
||||
|
||||
max_smclk, ret := nvml.DeviceGetClockInfo(device, nvml.CLOCK_SM)
|
||||
if ret == nvml.SUCCESS {
|
||||
_, skip = stringArrayContains(m.config.ExcludeMetrics, "max_sm_clock")
|
||||
y, err := lp.New("max_sm_clock", tags, map[string]interface{}{"value": float64(max_smclk)}, time.Now())
|
||||
if err == nil {
|
||||
if err == nil && !skip {
|
||||
*out = append(*out, y)
|
||||
}
|
||||
}
|
||||
|
||||
max_memclk, ret := nvml.DeviceGetClockInfo(device, nvml.CLOCK_MEM)
|
||||
if ret == nvml.SUCCESS {
|
||||
_, skip = stringArrayContains(m.config.ExcludeMetrics, "max_mem_clock")
|
||||
y, err := lp.New("max_mem_clock", tags, map[string]interface{}{"value": float64(max_memclk)}, time.Now())
|
||||
if err == nil {
|
||||
if err == nil && !skip {
|
||||
*out = append(*out, y)
|
||||
}
|
||||
}
|
||||
|
||||
ecc_db, ret := nvml.DeviceGetTotalEccErrors(device, 1, 1)
|
||||
if ret == nvml.SUCCESS {
|
||||
_, skip = stringArrayContains(m.config.ExcludeMetrics, "ecc_db_error")
|
||||
y, err := lp.New("ecc_db_error", tags, map[string]interface{}{"value": float64(ecc_db)}, time.Now())
|
||||
if err == nil {
|
||||
if err == nil && !skip {
|
||||
*out = append(*out, y)
|
||||
}
|
||||
}
|
||||
|
||||
ecc_sb, ret := nvml.DeviceGetTotalEccErrors(device, 0, 1)
|
||||
if ret == nvml.SUCCESS {
|
||||
_, skip = stringArrayContains(m.config.ExcludeMetrics, "ecc_sb_error")
|
||||
y, err := lp.New("ecc_sb_error", tags, map[string]interface{}{"value": float64(ecc_sb)}, time.Now())
|
||||
if err == nil {
|
||||
if err == nil && !skip {
|
||||
*out = append(*out, y)
|
||||
}
|
||||
}
|
||||
|
||||
pwr_limit, ret := nvml.DeviceGetPowerManagementLimit(device)
|
||||
if ret == nvml.SUCCESS {
|
||||
_, skip = stringArrayContains(m.config.ExcludeMetrics, "power_man_limit")
|
||||
y, err := lp.New("power_man_limit", tags, map[string]interface{}{"value": float64(pwr_limit)}, time.Now())
|
||||
if err == nil {
|
||||
if err == nil && !skip {
|
||||
*out = append(*out, y)
|
||||
}
|
||||
}
|
||||
|
||||
enc_util, _, ret := nvml.DeviceGetEncoderUtilization(device)
|
||||
if ret == nvml.SUCCESS {
|
||||
_, skip = stringArrayContains(m.config.ExcludeMetrics, "encoder_util")
|
||||
y, err := lp.New("encoder_util", tags, map[string]interface{}{"value": float64(enc_util)}, time.Now())
|
||||
if err == nil {
|
||||
if err == nil && !skip {
|
||||
*out = append(*out, y)
|
||||
}
|
||||
}
|
||||
|
||||
dec_util, _, ret := nvml.DeviceGetDecoderUtilization(device)
|
||||
if ret == nvml.SUCCESS {
|
||||
_, skip = stringArrayContains(m.config.ExcludeMetrics, "decoder_util")
|
||||
y, err := lp.New("decoder_util", tags, map[string]interface{}{"value": float64(dec_util)}, time.Now())
|
||||
if err == nil {
|
||||
if err == nil && !skip {
|
||||
*out = append(*out, y)
|
||||
}
|
||||
}
|
||||
|
Reference in New Issue
Block a user