mirror of
				https://github.com/ClusterCockpit/cc-metric-collector.git
				synced 2025-11-04 02:35:07 +01:00 
			
		
		
		
	Change storage format
This commit is contained in:
		@@ -4,6 +4,7 @@ import (
 | 
			
		||||
	"errors"
 | 
			
		||||
	"fmt"
 | 
			
		||||
	"github.com/NVIDIA/go-nvml/pkg/nvml"
 | 
			
		||||
	lp "github.com/influxdata/line-protocol"
 | 
			
		||||
	"log"
 | 
			
		||||
	"time"
 | 
			
		||||
)
 | 
			
		||||
@@ -13,10 +14,21 @@ type NvidiaCollector struct {
 | 
			
		||||
	num_gpus int
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
func (m *NvidiaCollector) CatchPanic() error {
 | 
			
		||||
 | 
			
		||||
	if rerr := recover(); rerr != nil {
 | 
			
		||||
		log.Print("CatchPanic ", string(rerr.(string)))
 | 
			
		||||
		err := errors.New(rerr.(string))
 | 
			
		||||
		return err
 | 
			
		||||
	}
 | 
			
		||||
	return nil
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
func (m *NvidiaCollector) Init() error {
 | 
			
		||||
	m.name = "NvidiaCollector"
 | 
			
		||||
	m.setup()
 | 
			
		||||
	m.num_gpus = 0
 | 
			
		||||
	defer m.CatchPanic()
 | 
			
		||||
	ret := nvml.Init()
 | 
			
		||||
	if ret != nvml.SUCCESS {
 | 
			
		||||
		err := errors.New(nvml.ErrorString(ret))
 | 
			
		||||
@@ -28,10 +40,11 @@ func (m *NvidiaCollector) Init() error {
 | 
			
		||||
		err := errors.New(nvml.ErrorString(ret))
 | 
			
		||||
		return err
 | 
			
		||||
	}
 | 
			
		||||
	m.init = true
 | 
			
		||||
	return nil
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
func (m *NvidiaCollector) Read(interval time.Duration) {
 | 
			
		||||
func (m *NvidiaCollector) Read(interval time.Duration, out *[]lp.MutableMetric) {
 | 
			
		||||
 | 
			
		||||
	for i := 0; i < m.num_gpus; i++ {
 | 
			
		||||
		device, ret := nvml.DeviceGetHandleByIndex(i)
 | 
			
		||||
@@ -39,113 +52,183 @@ func (m *NvidiaCollector) Read(interval time.Duration) {
 | 
			
		||||
			log.Fatalf("Unable to get device at index %d: %v", i, nvml.ErrorString(ret))
 | 
			
		||||
			return
 | 
			
		||||
		}
 | 
			
		||||
		base := fmt.Sprintf("gpu%d", i)
 | 
			
		||||
		tags := map[string]string{"type": "accelerator", "type-id": fmt.Sprintf("%d", i)}
 | 
			
		||||
 | 
			
		||||
		util, ret := nvml.DeviceGetUtilizationRates(device)
 | 
			
		||||
		if ret == nvml.SUCCESS {
 | 
			
		||||
			m.node[fmt.Sprintf("%s_util", base)] = float64(util.Gpu)
 | 
			
		||||
			m.node[fmt.Sprintf("%s_mem_util", base)] = float64(util.Memory)
 | 
			
		||||
			y, err := lp.New("util", tags, map[string]interface{}{"value": float64(util.Gpu)}, time.Now())
 | 
			
		||||
			if err == nil {
 | 
			
		||||
				*out = append(*out, y)
 | 
			
		||||
			}
 | 
			
		||||
			y, err = lp.New("mem_util", tags, map[string]interface{}{"value": float64(util.Memory)}, time.Now())
 | 
			
		||||
			if err == nil {
 | 
			
		||||
				*out = append(*out, y)
 | 
			
		||||
			}
 | 
			
		||||
		}
 | 
			
		||||
 | 
			
		||||
		meminfo, ret := nvml.DeviceGetMemoryInfo(device)
 | 
			
		||||
		if ret == nvml.SUCCESS {
 | 
			
		||||
			m.node[fmt.Sprintf("%s_mem_total", base)] = float64(meminfo.Total) / (1024 * 1024)
 | 
			
		||||
			m.node[fmt.Sprintf("%s_fb_memory", base)] = float64(meminfo.Used) / (1024 * 1024)
 | 
			
		||||
			t := float64(meminfo.Total) / (1024 * 1024)
 | 
			
		||||
			y, err := lp.New("mem_total", tags, map[string]interface{}{"value": t}, time.Now())
 | 
			
		||||
			if err == nil {
 | 
			
		||||
				*out = append(*out, y)
 | 
			
		||||
			}
 | 
			
		||||
			f := float64(meminfo.Used) / (1024 * 1024)
 | 
			
		||||
			y, err = lp.New("fb_memory", tags, map[string]interface{}{"value": f}, time.Now())
 | 
			
		||||
			if err == nil {
 | 
			
		||||
				*out = append(*out, y)
 | 
			
		||||
			}
 | 
			
		||||
		}
 | 
			
		||||
 | 
			
		||||
		temp, ret := nvml.DeviceGetTemperature(device, nvml.TEMPERATURE_GPU)
 | 
			
		||||
		if ret == nvml.SUCCESS {
 | 
			
		||||
			m.node[fmt.Sprintf("%s_temp", base)] = float64(temp)
 | 
			
		||||
			y, err := lp.New("temp", tags, map[string]interface{}{"value": float64(temp)}, time.Now())
 | 
			
		||||
			if err == nil {
 | 
			
		||||
				*out = append(*out, y)
 | 
			
		||||
			}
 | 
			
		||||
		}
 | 
			
		||||
 | 
			
		||||
		fan, ret := nvml.DeviceGetFanSpeed(device)
 | 
			
		||||
		if ret == nvml.SUCCESS {
 | 
			
		||||
			m.node[fmt.Sprintf("%s_fan", base)] = float64(fan)
 | 
			
		||||
			y, err := lp.New("fan", tags, map[string]interface{}{"value": float64(fan)}, time.Now())
 | 
			
		||||
			if err == nil {
 | 
			
		||||
				*out = append(*out, y)
 | 
			
		||||
			}
 | 
			
		||||
		}
 | 
			
		||||
 | 
			
		||||
		_, ecc_pend, ret := nvml.DeviceGetEccMode(device)
 | 
			
		||||
		if ret == nvml.SUCCESS {
 | 
			
		||||
			var y lp.MutableMetric
 | 
			
		||||
			var err error
 | 
			
		||||
			switch ecc_pend {
 | 
			
		||||
			case nvml.FEATURE_DISABLED:
 | 
			
		||||
				m.node[fmt.Sprintf("%s_ecc_mode", base)] = string("OFF")
 | 
			
		||||
				y, err = lp.New("ecc_mode", tags, map[string]interface{}{"value": string("OFF")}, time.Now())
 | 
			
		||||
			case nvml.FEATURE_ENABLED:
 | 
			
		||||
				m.node[fmt.Sprintf("%s_ecc_mode", base)] = string("ON")
 | 
			
		||||
				y, err = lp.New("ecc_mode", tags, map[string]interface{}{"value": string("ON")}, time.Now())
 | 
			
		||||
			default:
 | 
			
		||||
				m.node[fmt.Sprintf("%s_ecc_mode", base)] = string("UNKNOWN")
 | 
			
		||||
				y, err = lp.New("ecc_mode", tags, map[string]interface{}{"value": string("UNKNOWN")}, time.Now())
 | 
			
		||||
			}
 | 
			
		||||
			if err == nil {
 | 
			
		||||
				*out = append(*out, y)
 | 
			
		||||
			}
 | 
			
		||||
		} else if ret == nvml.ERROR_NOT_SUPPORTED {
 | 
			
		||||
			m.node[fmt.Sprintf("%s_ecc_mode", base)] = string("N/A")
 | 
			
		||||
			y, err := lp.New("ecc_mode", tags, map[string]interface{}{"value": string("N/A")}, time.Now())
 | 
			
		||||
			if err == nil {
 | 
			
		||||
				*out = append(*out, y)
 | 
			
		||||
			}
 | 
			
		||||
		}
 | 
			
		||||
 | 
			
		||||
		pstate, ret := nvml.DeviceGetPerformanceState(device)
 | 
			
		||||
		if ret == nvml.SUCCESS {
 | 
			
		||||
			m.node[fmt.Sprintf("%s_perf_state", base)] = fmt.Sprintf("P%d", int(pstate))
 | 
			
		||||
			y, err := lp.New("perf_state", tags, map[string]interface{}{"value": fmt.Sprintf("P%d", int(pstate))}, time.Now())
 | 
			
		||||
			if err == nil {
 | 
			
		||||
				*out = append(*out, y)
 | 
			
		||||
			}
 | 
			
		||||
		}
 | 
			
		||||
 | 
			
		||||
		power, ret := nvml.DeviceGetPowerUsage(device)
 | 
			
		||||
		if ret == nvml.SUCCESS {
 | 
			
		||||
			m.node[fmt.Sprintf("%s_power_usage_report", base)] = float64(power) / 1000
 | 
			
		||||
			y, err := lp.New("power_usage_report", tags, map[string]interface{}{"value": float64(power) / 1000}, time.Now())
 | 
			
		||||
			if err == nil {
 | 
			
		||||
				*out = append(*out, y)
 | 
			
		||||
			}
 | 
			
		||||
		}
 | 
			
		||||
 | 
			
		||||
		gclk, ret := nvml.DeviceGetClockInfo(device, nvml.CLOCK_GRAPHICS)
 | 
			
		||||
		if ret == nvml.SUCCESS {
 | 
			
		||||
			m.node[fmt.Sprintf("%s_graphics_clock_report", base)] = float64(gclk)
 | 
			
		||||
			y, err := lp.New("graphics_clock_report", tags, map[string]interface{}{"value": float64(gclk)}, time.Now())
 | 
			
		||||
			if err == nil {
 | 
			
		||||
				*out = append(*out, y)
 | 
			
		||||
			}
 | 
			
		||||
		}
 | 
			
		||||
 | 
			
		||||
		smclk, ret := nvml.DeviceGetClockInfo(device, nvml.CLOCK_SM)
 | 
			
		||||
		if ret == nvml.SUCCESS {
 | 
			
		||||
			m.node[fmt.Sprintf("%s_sm_clock_report", base)] = float64(smclk)
 | 
			
		||||
			y, err := lp.New("sm_clock_report", tags, map[string]interface{}{"value": float64(smclk)}, time.Now())
 | 
			
		||||
			if err == nil {
 | 
			
		||||
				*out = append(*out, y)
 | 
			
		||||
			}
 | 
			
		||||
		}
 | 
			
		||||
 | 
			
		||||
		memclk, ret := nvml.DeviceGetClockInfo(device, nvml.CLOCK_MEM)
 | 
			
		||||
		if ret == nvml.SUCCESS {
 | 
			
		||||
			m.node[fmt.Sprintf("%s_mem_clock_report", base)] = float64(memclk)
 | 
			
		||||
			y, err := lp.New("mem_clock_report", tags, map[string]interface{}{"value": float64(memclk)}, time.Now())
 | 
			
		||||
			if err == nil {
 | 
			
		||||
				*out = append(*out, y)
 | 
			
		||||
			}
 | 
			
		||||
		}
 | 
			
		||||
 | 
			
		||||
		max_gclk, ret := nvml.DeviceGetMaxClockInfo(device, nvml.CLOCK_GRAPHICS)
 | 
			
		||||
		if ret == nvml.SUCCESS {
 | 
			
		||||
			m.node[fmt.Sprintf("%s_max_graphics_clock", base)] = float64(max_gclk)
 | 
			
		||||
			y, err := lp.New("max_graphics_clock", tags, map[string]interface{}{"value": float64(max_gclk)}, time.Now())
 | 
			
		||||
			if err == nil {
 | 
			
		||||
				*out = append(*out, y)
 | 
			
		||||
			}
 | 
			
		||||
		}
 | 
			
		||||
 | 
			
		||||
		max_smclk, ret := nvml.DeviceGetClockInfo(device, nvml.CLOCK_SM)
 | 
			
		||||
		if ret == nvml.SUCCESS {
 | 
			
		||||
			m.node[fmt.Sprintf("%s_max_sm_clock", base)] = float64(max_smclk)
 | 
			
		||||
			y, err := lp.New("max_sm_clock", tags, map[string]interface{}{"value": float64(max_smclk)}, time.Now())
 | 
			
		||||
			if err == nil {
 | 
			
		||||
				*out = append(*out, y)
 | 
			
		||||
			}
 | 
			
		||||
		}
 | 
			
		||||
 | 
			
		||||
		max_memclk, ret := nvml.DeviceGetClockInfo(device, nvml.CLOCK_MEM)
 | 
			
		||||
		if ret == nvml.SUCCESS {
 | 
			
		||||
			m.node[fmt.Sprintf("%s_max_mem_clock", base)] = float64(max_memclk)
 | 
			
		||||
			y, err := lp.New("max_mem_clock", tags, map[string]interface{}{"value": float64(max_memclk)}, time.Now())
 | 
			
		||||
			if err == nil {
 | 
			
		||||
				*out = append(*out, y)
 | 
			
		||||
			}
 | 
			
		||||
		}
 | 
			
		||||
 | 
			
		||||
		ecc_db, ret := nvml.DeviceGetTotalEccErrors(device, 1, 1)
 | 
			
		||||
		if ret == nvml.SUCCESS {
 | 
			
		||||
			m.node[fmt.Sprintf("%s_ecc_db_error", base)] = float64(ecc_db)
 | 
			
		||||
			y, err := lp.New("ecc_db_error", tags, map[string]interface{}{"value": float64(ecc_db)}, time.Now())
 | 
			
		||||
			if err == nil {
 | 
			
		||||
				*out = append(*out, y)
 | 
			
		||||
			}
 | 
			
		||||
		}
 | 
			
		||||
 | 
			
		||||
		ecc_sb, ret := nvml.DeviceGetTotalEccErrors(device, 0, 1)
 | 
			
		||||
		if ret == nvml.SUCCESS {
 | 
			
		||||
			m.node[fmt.Sprintf("%s_ecc_sb_error", base)] = float64(ecc_sb)
 | 
			
		||||
			y, err := lp.New("ecc_sb_error", tags, map[string]interface{}{"value": float64(ecc_sb)}, time.Now())
 | 
			
		||||
			if err == nil {
 | 
			
		||||
				*out = append(*out, y)
 | 
			
		||||
			}
 | 
			
		||||
		}
 | 
			
		||||
 | 
			
		||||
		pwr_limit, ret := nvml.DeviceGetPowerManagementLimit(device)
 | 
			
		||||
		if ret == nvml.SUCCESS {
 | 
			
		||||
			m.node[fmt.Sprintf("%s_power_man_limit", base)] = float64(pwr_limit)
 | 
			
		||||
			y, err := lp.New("power_man_limit", tags, map[string]interface{}{"value": float64(pwr_limit)}, time.Now())
 | 
			
		||||
			if err == nil {
 | 
			
		||||
				*out = append(*out, y)
 | 
			
		||||
			}
 | 
			
		||||
		}
 | 
			
		||||
 | 
			
		||||
		enc_util, _, ret := nvml.DeviceGetEncoderUtilization(device)
 | 
			
		||||
		if ret == nvml.SUCCESS {
 | 
			
		||||
			m.node[fmt.Sprintf("%s_power_man_limit", base)] = float64(enc_util)
 | 
			
		||||
			y, err := lp.New("encoder_util", tags, map[string]interface{}{"value": float64(enc_util)}, time.Now())
 | 
			
		||||
			if err == nil {
 | 
			
		||||
				*out = append(*out, y)
 | 
			
		||||
			}
 | 
			
		||||
		}
 | 
			
		||||
 | 
			
		||||
		dec_util, _, ret := nvml.DeviceGetDecoderUtilization(device)
 | 
			
		||||
		if ret == nvml.SUCCESS {
 | 
			
		||||
			m.node[fmt.Sprintf("%s_power_man_limit", base)] = float64(dec_util)
 | 
			
		||||
			y, err := lp.New("decoder_util", tags, map[string]interface{}{"value": float64(dec_util)}, time.Now())
 | 
			
		||||
			if err == nil {
 | 
			
		||||
				*out = append(*out, y)
 | 
			
		||||
			}
 | 
			
		||||
		}
 | 
			
		||||
	}
 | 
			
		||||
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
func (m *NvidiaCollector) Close() {
 | 
			
		||||
	nvml.Shutdown()
 | 
			
		||||
	if m.init {
 | 
			
		||||
		nvml.Shutdown()
 | 
			
		||||
		m.init = false
 | 
			
		||||
	}
 | 
			
		||||
	return
 | 
			
		||||
}
 | 
			
		||||
 
 | 
			
		||||
		Reference in New Issue
	
	Block a user