mirror of
				https://github.com/ClusterCockpit/cc-metric-collector.git
				synced 2025-11-04 02:35:07 +01:00 
			
		
		
		
	Exclude metrics and devices in Init() for NvidiaCollector
This commit is contained in:
		@@ -6,6 +6,8 @@ import (
 | 
			
		||||
	"fmt"
 | 
			
		||||
	"log"
 | 
			
		||||
	"time"
 | 
			
		||||
 | 
			
		||||
	cclog "github.com/ClusterCockpit/cc-metric-collector/internal/ccLogger"
 | 
			
		||||
	lp "github.com/ClusterCockpit/cc-metric-collector/internal/ccMetric"
 | 
			
		||||
	"github.com/NVIDIA/go-nvml/pkg/nvml"
 | 
			
		||||
)
 | 
			
		||||
@@ -13,12 +15,20 @@ import (
 | 
			
		||||
// NvidiaCollectorConfig holds the options of the NvidiaCollector that are
// read from its JSON configuration.
type NvidiaCollectorConfig struct {
	// ExcludeMetrics lists metric names (for example "nv_util"); Init copies
	// them into each device's excludeMetrics set.
	ExcludeMetrics []string `json:"exclude_metrics,omitempty"`
	// ExcludeDevices lists device indices, written as decimal strings, that
	// Init skips.
	ExcludeDevices []string `json:"exclude_devices,omitempty"`
	// AddPciInfoTag makes Init add a "pci_identifier" tag to each device.
	AddPciInfoTag bool `json:"add_pci_info_tag,omitempty"`
}
 | 
			
		||||
 | 
			
		||||
type NvidiaCollectorDevice struct {
 | 
			
		||||
	device         nvml.Device
 | 
			
		||||
	excludeMetrics map[string]bool
 | 
			
		||||
	tags           map[string]string
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
type NvidiaCollector struct {
 | 
			
		||||
	metricCollector
 | 
			
		||||
	num_gpus int
 | 
			
		||||
	config   NvidiaCollectorConfig
 | 
			
		||||
	gpus     []NvidiaCollectorDevice
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
func (m *NvidiaCollector) CatchPanic() {
 | 
			
		||||
@@ -31,6 +41,7 @@ func (m *NvidiaCollector) CatchPanic() {
 | 
			
		||||
func (m *NvidiaCollector) Init(config json.RawMessage) error {
 | 
			
		||||
	var err error
 | 
			
		||||
	m.name = "NvidiaCollector"
 | 
			
		||||
	m.config.AddPciInfoTag = false
 | 
			
		||||
	m.setup()
 | 
			
		||||
	m.meta = map[string]string{"source": m.name, "group": "Nvidia"}
 | 
			
		||||
	if len(config) > 0 {
 | 
			
		||||
@@ -44,13 +55,48 @@ func (m *NvidiaCollector) Init(config json.RawMessage) error {
 | 
			
		||||
	ret := nvml.Init()
 | 
			
		||||
	if ret != nvml.SUCCESS {
 | 
			
		||||
		err = errors.New(nvml.ErrorString(ret))
 | 
			
		||||
		cclog.ComponentError(m.name, "Unable to initialize NVML", err.Error())
 | 
			
		||||
		return err
 | 
			
		||||
	}
 | 
			
		||||
	m.num_gpus, ret = nvml.DeviceGetCount()
 | 
			
		||||
	num_gpus, ret := nvml.DeviceGetCount()
 | 
			
		||||
	if ret != nvml.SUCCESS {
 | 
			
		||||
		err = errors.New(nvml.ErrorString(ret))
 | 
			
		||||
		cclog.ComponentError(m.name, "Unable to get device count", err.Error())
 | 
			
		||||
		return err
 | 
			
		||||
	}
 | 
			
		||||
	// Collect only the devices that are not excluded. The slice starts empty
	// and grows by append, so it never contains zero-valued placeholder
	// entries for excluded devices that Read would otherwise iterate over.
	m.gpus = make([]NvidiaCollectorDevice, 0, num_gpus)
	for i := 0; i < num_gpus; i++ {
		// Devices are matched against the exclude list by decimal index.
		str_i := fmt.Sprintf("%d", i)
		if _, skip := stringArrayContains(m.config.ExcludeDevices, str_i); skip {
			continue
		}

		device, ret := nvml.DeviceGetHandleByIndex(i)
		if ret != nvml.SUCCESS {
			err = errors.New(nvml.ErrorString(ret))
			cclog.ComponentError(m.name, "Unable to get device at index", i, ":", err.Error())
			return err
		}

		g := NvidiaCollectorDevice{
			device:         device,
			tags:           map[string]string{"type": "accelerator", "type-id": str_i},
			excludeMetrics: make(map[string]bool, len(m.config.ExcludeMetrics)),
		}
		for _, e := range m.config.ExcludeMetrics {
			g.excludeMetrics[e] = true
		}

		if m.config.AddPciInfoTag {
			pciinfo, ret := nvml.DeviceGetPciInfo(g.device)
			if ret != nvml.SUCCESS {
				err = errors.New(nvml.ErrorString(ret))
				cclog.ComponentError(m.name, "Unable to get pciInfo for device at index", i, ":", err.Error())
				return err
			}
			g.tags["pci_identifier"] = fmt.Sprintf("%08X:%02X:%02X.0", pciinfo.Domain, pciinfo.Bus, pciinfo.Device)
		}

		m.gpus = append(m.gpus, g)
	}
 | 
			
		||||
	m.init = true
 | 
			
		||||
	return nil
 | 
			
		||||
}
 | 
			
		||||
@@ -59,207 +105,233 @@ func (m *NvidiaCollector) Read(interval time.Duration, output chan lp.CCMetric)
 | 
			
		||||
	if !m.init {
 | 
			
		||||
		return
 | 
			
		||||
	}
 | 
			
		||||
	for i := 0; i < m.num_gpus; i++ {
 | 
			
		||||
		device, ret := nvml.DeviceGetHandleByIndex(i)
 | 
			
		||||
		if ret != nvml.SUCCESS {
 | 
			
		||||
			log.Fatalf("Unable to get device at index %d: %v", i, nvml.ErrorString(ret))
 | 
			
		||||
			return
 | 
			
		||||
		}
 | 
			
		||||
		_, skip := stringArrayContains(m.config.ExcludeDevices, fmt.Sprintf("%d", i))
 | 
			
		||||
		if skip {
 | 
			
		||||
			continue
 | 
			
		||||
		}
 | 
			
		||||
		tags := map[string]string{"type": "accelerator", "type-id": fmt.Sprintf("%d", i)}
 | 
			
		||||
 | 
			
		||||
		util, ret := nvml.DeviceGetUtilizationRates(device)
 | 
			
		||||
		if ret == nvml.SUCCESS {
 | 
			
		||||
			_, skip = stringArrayContains(m.config.ExcludeMetrics, "nv_util")
 | 
			
		||||
			y, err := lp.New("nv_util", tags, m.meta, map[string]interface{}{"value": float64(util.Gpu)}, time.Now())
 | 
			
		||||
			if err == nil && !skip {
 | 
			
		||||
				output <- y
 | 
			
		||||
	for _, device := range m.gpus {
 | 
			
		||||
 | 
			
		||||
		// exclude reports whether metric is listed in this device's
		// excludeMetrics set. Indexing a map with a missing key (or a nil
		// map) yields false, so unlisted metrics are collected.
		exclude := func(metric string) bool {
			return device.excludeMetrics[metric]
		}
 | 
			
		||||
 | 
			
		||||
		ex_nv_util := exclude("nv_util")
 | 
			
		||||
		ex_nv_mem_util := exclude("nv_mem_util")
 | 
			
		||||
		if (!ex_nv_util) || (!ex_nv_mem_util) {
 | 
			
		||||
			util, ret := nvml.DeviceGetUtilizationRates(device.device)
 | 
			
		||||
			if ret == nvml.SUCCESS {
 | 
			
		||||
				if !ex_nv_util {
 | 
			
		||||
					y, err := lp.New("nv_util", device.tags, m.meta, map[string]interface{}{"value": float64(util.Gpu)}, time.Now())
 | 
			
		||||
					if err == nil {
 | 
			
		||||
						output <- y
 | 
			
		||||
					}
 | 
			
		||||
				}
 | 
			
		||||
				if !ex_nv_mem_util {
 | 
			
		||||
					y, err := lp.New("nv_mem_util", device.tags, m.meta, map[string]interface{}{"value": float64(util.Memory)}, time.Now())
 | 
			
		||||
					if err == nil {
 | 
			
		||||
						output <- y
 | 
			
		||||
					}
 | 
			
		||||
				}
 | 
			
		||||
			}
 | 
			
		||||
		}
 | 
			
		||||
 | 
			
		||||
		meminfo, ret := nvml.DeviceGetMemoryInfo(device)
 | 
			
		||||
		if ret == nvml.SUCCESS {
 | 
			
		||||
			t := float64(meminfo.Total) / (1024 * 1024)
 | 
			
		||||
			_, skip = stringArrayContains(m.config.ExcludeMetrics, "nv_mem_total")
 | 
			
		||||
			y, err := lp.New("nv_mem_total", tags, m.meta, map[string]interface{}{"value": t}, time.Now())
 | 
			
		||||
			if err == nil && !skip {
 | 
			
		||||
				y.AddMeta("unit", "MByte")
 | 
			
		||||
				output <- y
 | 
			
		||||
			}
 | 
			
		||||
			f := float64(meminfo.Used) / (1024 * 1024)
 | 
			
		||||
			_, skip = stringArrayContains(m.config.ExcludeMetrics, "nv_fb_memory")
 | 
			
		||||
			y, err = lp.New("nv_fb_memory", tags, m.meta, map[string]interface{}{"value": f}, time.Now())
 | 
			
		||||
			if err == nil && !skip {
 | 
			
		||||
				y.AddMeta("unit", "MByte")
 | 
			
		||||
				output <- y
 | 
			
		||||
		ex_nv_mem_total := exclude("nv_mem_total")
 | 
			
		||||
		ex_nv_fb_memory := exclude("nv_fb_memory")
 | 
			
		||||
		if (!ex_nv_mem_total) || (!ex_nv_fb_memory) {
 | 
			
		||||
			meminfo, ret := nvml.DeviceGetMemoryInfo(device.device)
 | 
			
		||||
			if ret == nvml.SUCCESS {
 | 
			
		||||
				if !ex_nv_mem_total {
 | 
			
		||||
					t := float64(meminfo.Total) / (1024 * 1024)
 | 
			
		||||
					y, err := lp.New("nv_mem_total", device.tags, m.meta, map[string]interface{}{"value": t}, time.Now())
 | 
			
		||||
					if err == nil {
 | 
			
		||||
						y.AddMeta("unit", "MByte")
 | 
			
		||||
						output <- y
 | 
			
		||||
					}
 | 
			
		||||
				}
 | 
			
		||||
 | 
			
		||||
				if !ex_nv_fb_memory {
 | 
			
		||||
					f := float64(meminfo.Used) / (1024 * 1024)
 | 
			
		||||
					y, err := lp.New("nv_fb_memory", device.tags, m.meta, map[string]interface{}{"value": f}, time.Now())
 | 
			
		||||
					if err == nil {
 | 
			
		||||
						y.AddMeta("unit", "MByte")
 | 
			
		||||
						output <- y
 | 
			
		||||
					}
 | 
			
		||||
				}
 | 
			
		||||
			}
 | 
			
		||||
		}
 | 
			
		||||
 | 
			
		||||
		temp, ret := nvml.DeviceGetTemperature(device, nvml.TEMPERATURE_GPU)
 | 
			
		||||
		if ret == nvml.SUCCESS {
 | 
			
		||||
			_, skip = stringArrayContains(m.config.ExcludeMetrics, "nv_temp")
 | 
			
		||||
			y, err := lp.New("nv_temp", tags, m.meta, map[string]interface{}{"value": float64(temp)}, time.Now())
 | 
			
		||||
			if err == nil && !skip {
 | 
			
		||||
				y.AddMeta("unit", "degC")
 | 
			
		||||
				output <- y
 | 
			
		||||
		if !exclude("nv_temp") {
 | 
			
		||||
			temp, ret := nvml.DeviceGetTemperature(device.device, nvml.TEMPERATURE_GPU)
 | 
			
		||||
			if ret == nvml.SUCCESS {
 | 
			
		||||
				y, err := lp.New("nv_temp", device.tags, m.meta, map[string]interface{}{"value": float64(temp)}, time.Now())
 | 
			
		||||
				if err == nil {
 | 
			
		||||
					y.AddMeta("unit", "degC")
 | 
			
		||||
					output <- y
 | 
			
		||||
				}
 | 
			
		||||
			}
 | 
			
		||||
		}
 | 
			
		||||
 | 
			
		||||
		fan, ret := nvml.DeviceGetFanSpeed(device)
 | 
			
		||||
		if ret == nvml.SUCCESS {
 | 
			
		||||
			_, skip = stringArrayContains(m.config.ExcludeMetrics, "nv_fan")
 | 
			
		||||
			y, err := lp.New("nv_fan", tags, m.meta, map[string]interface{}{"value": float64(fan)}, time.Now())
 | 
			
		||||
			if err == nil && !skip {
 | 
			
		||||
				output <- y
 | 
			
		||||
		if !exclude("nv_fan") {
 | 
			
		||||
			fan, ret := nvml.DeviceGetFanSpeed(device.device)
 | 
			
		||||
			if ret == nvml.SUCCESS {
 | 
			
		||||
				y, err := lp.New("nv_fan", device.tags, m.meta, map[string]interface{}{"value": float64(fan)}, time.Now())
 | 
			
		||||
				if err == nil {
 | 
			
		||||
					output <- y
 | 
			
		||||
				}
 | 
			
		||||
			}
 | 
			
		||||
		}
 | 
			
		||||
 | 
			
		||||
		_, ecc_pend, ret := nvml.DeviceGetEccMode(device)
 | 
			
		||||
		if ret == nvml.SUCCESS {
 | 
			
		||||
			var y lp.CCMetric
 | 
			
		||||
			var err error
 | 
			
		||||
			switch ecc_pend {
 | 
			
		||||
			case nvml.FEATURE_DISABLED:
 | 
			
		||||
				y, err = lp.New("nv_ecc_mode", tags, m.meta, map[string]interface{}{"value": string("OFF")}, time.Now())
 | 
			
		||||
			case nvml.FEATURE_ENABLED:
 | 
			
		||||
				y, err = lp.New("nv_ecc_mode", tags, m.meta, map[string]interface{}{"value": string("ON")}, time.Now())
 | 
			
		||||
			default:
 | 
			
		||||
				y, err = lp.New("nv_ecc_mode", tags, m.meta, map[string]interface{}{"value": string("UNKNOWN")}, time.Now())
 | 
			
		||||
			}
 | 
			
		||||
			_, skip = stringArrayContains(m.config.ExcludeMetrics, "nv_ecc_mode")
 | 
			
		||||
			if err == nil && !skip {
 | 
			
		||||
				output <- y
 | 
			
		||||
			}
 | 
			
		||||
		} else if ret == nvml.ERROR_NOT_SUPPORTED {
 | 
			
		||||
			_, skip = stringArrayContains(m.config.ExcludeMetrics, "nv_ecc_mode")
 | 
			
		||||
			y, err := lp.New("nv_ecc_mode", tags, m.meta, map[string]interface{}{"value": string("N/A")}, time.Now())
 | 
			
		||||
			if err == nil && !skip {
 | 
			
		||||
				output <- y
 | 
			
		||||
		if !exclude("nv_ecc_mode") {
 | 
			
		||||
			_, ecc_pend, ret := nvml.DeviceGetEccMode(device.device)
 | 
			
		||||
			if ret == nvml.SUCCESS {
 | 
			
		||||
				var y lp.CCMetric
 | 
			
		||||
				var err error
 | 
			
		||||
				switch ecc_pend {
 | 
			
		||||
				case nvml.FEATURE_DISABLED:
 | 
			
		||||
					y, err = lp.New("nv_ecc_mode", device.tags, m.meta, map[string]interface{}{"value": string("OFF")}, time.Now())
 | 
			
		||||
				case nvml.FEATURE_ENABLED:
 | 
			
		||||
					y, err = lp.New("nv_ecc_mode", device.tags, m.meta, map[string]interface{}{"value": string("ON")}, time.Now())
 | 
			
		||||
				default:
 | 
			
		||||
					y, err = lp.New("nv_ecc_mode", device.tags, m.meta, map[string]interface{}{"value": string("UNKNOWN")}, time.Now())
 | 
			
		||||
				}
 | 
			
		||||
				if err == nil {
 | 
			
		||||
					output <- y
 | 
			
		||||
				}
 | 
			
		||||
			} else if ret == nvml.ERROR_NOT_SUPPORTED {
 | 
			
		||||
				y, err := lp.New("nv_ecc_mode", device.tags, m.meta, map[string]interface{}{"value": string("N/A")}, time.Now())
 | 
			
		||||
				if err == nil {
 | 
			
		||||
					output <- y
 | 
			
		||||
				}
 | 
			
		||||
			}
 | 
			
		||||
		}
 | 
			
		||||
 | 
			
		||||
		pstate, ret := nvml.DeviceGetPerformanceState(device)
 | 
			
		||||
		if ret == nvml.SUCCESS {
 | 
			
		||||
			_, skip = stringArrayContains(m.config.ExcludeMetrics, "nv_perf_state")
 | 
			
		||||
			y, err := lp.New("nv_perf_state", tags, m.meta, map[string]interface{}{"value": fmt.Sprintf("P%d", int(pstate))}, time.Now())
 | 
			
		||||
			if err == nil && !skip {
 | 
			
		||||
				output <- y
 | 
			
		||||
		if !exclude("nv_perf_state") {
 | 
			
		||||
			pstate, ret := nvml.DeviceGetPerformanceState(device.device)
 | 
			
		||||
			if ret == nvml.SUCCESS {
 | 
			
		||||
				y, err := lp.New("nv_perf_state", device.tags, m.meta, map[string]interface{}{"value": fmt.Sprintf("P%d", int(pstate))}, time.Now())
 | 
			
		||||
				if err == nil {
 | 
			
		||||
					output <- y
 | 
			
		||||
				}
 | 
			
		||||
			}
 | 
			
		||||
		}
 | 
			
		||||
 | 
			
		||||
		power, ret := nvml.DeviceGetPowerUsage(device)
 | 
			
		||||
		if ret == nvml.SUCCESS {
 | 
			
		||||
			_, skip = stringArrayContains(m.config.ExcludeMetrics, "nv_power_usage_report")
 | 
			
		||||
			y, err := lp.New("nv_power_usage_report", tags, m.meta, map[string]interface{}{"value": float64(power) / 1000}, time.Now())
 | 
			
		||||
			if err == nil && !skip {
 | 
			
		||||
				output <- y
 | 
			
		||||
		if !exclude("nv_power_usage_report") {
 | 
			
		||||
			power, ret := nvml.DeviceGetPowerUsage(device.device)
 | 
			
		||||
			if ret == nvml.SUCCESS {
 | 
			
		||||
				y, err := lp.New("nv_power_usage_report", device.tags, m.meta, map[string]interface{}{"value": float64(power) / 1000}, time.Now())
 | 
			
		||||
				if err == nil {
 | 
			
		||||
					output <- y
 | 
			
		||||
				}
 | 
			
		||||
			}
 | 
			
		||||
		}
 | 
			
		||||
 | 
			
		||||
		gclk, ret := nvml.DeviceGetClockInfo(device, nvml.CLOCK_GRAPHICS)
 | 
			
		||||
		if ret == nvml.SUCCESS {
 | 
			
		||||
			_, skip = stringArrayContains(m.config.ExcludeMetrics, "nv_graphics_clock_report")
 | 
			
		||||
			y, err := lp.New("nv_graphics_clock_report", tags, m.meta, map[string]interface{}{"value": float64(gclk)}, time.Now())
 | 
			
		||||
			if err == nil && !skip {
 | 
			
		||||
				output <- y
 | 
			
		||||
		if !exclude("nv_graphics_clock_report") {
 | 
			
		||||
			gclk, ret := nvml.DeviceGetClockInfo(device.device, nvml.CLOCK_GRAPHICS)
 | 
			
		||||
			if ret == nvml.SUCCESS {
 | 
			
		||||
				y, err := lp.New("nv_graphics_clock_report", device.tags, m.meta, map[string]interface{}{"value": float64(gclk)}, time.Now())
 | 
			
		||||
				if err == nil {
 | 
			
		||||
					output <- y
 | 
			
		||||
				}
 | 
			
		||||
			}
 | 
			
		||||
		}
 | 
			
		||||
 | 
			
		||||
		smclk, ret := nvml.DeviceGetClockInfo(device, nvml.CLOCK_SM)
 | 
			
		||||
		if ret == nvml.SUCCESS {
 | 
			
		||||
			_, skip = stringArrayContains(m.config.ExcludeMetrics, "nv_sm_clock_report")
 | 
			
		||||
			y, err := lp.New("nv_sm_clock_report", tags, m.meta, map[string]interface{}{"value": float64(smclk)}, time.Now())
 | 
			
		||||
			if err == nil && !skip {
 | 
			
		||||
				output <- y
 | 
			
		||||
		if !exclude("nv_sm_clock_report") {
 | 
			
		||||
			smclk, ret := nvml.DeviceGetClockInfo(device.device, nvml.CLOCK_SM)
 | 
			
		||||
			if ret == nvml.SUCCESS {
 | 
			
		||||
				y, err := lp.New("nv_sm_clock_report", device.tags, m.meta, map[string]interface{}{"value": float64(smclk)}, time.Now())
 | 
			
		||||
				if err == nil {
 | 
			
		||||
					output <- y
 | 
			
		||||
				}
 | 
			
		||||
			}
 | 
			
		||||
		}
 | 
			
		||||
 | 
			
		||||
		memclk, ret := nvml.DeviceGetClockInfo(device, nvml.CLOCK_MEM)
 | 
			
		||||
		if ret == nvml.SUCCESS {
 | 
			
		||||
			_, skip = stringArrayContains(m.config.ExcludeMetrics, "nv_mem_clock_report")
 | 
			
		||||
			y, err := lp.New("nv_mem_clock_report", tags, m.meta, map[string]interface{}{"value": float64(memclk)}, time.Now())
 | 
			
		||||
			if err == nil && !skip {
 | 
			
		||||
				output <- y
 | 
			
		||||
		if !exclude("nv_mem_clock_report") {
 | 
			
		||||
			memclk, ret := nvml.DeviceGetClockInfo(device.device, nvml.CLOCK_MEM)
 | 
			
		||||
			if ret == nvml.SUCCESS {
 | 
			
		||||
				y, err := lp.New("nv_mem_clock_report", device.tags, m.meta, map[string]interface{}{"value": float64(memclk)}, time.Now())
 | 
			
		||||
				if err == nil {
 | 
			
		||||
					output <- y
 | 
			
		||||
				}
 | 
			
		||||
			}
 | 
			
		||||
		}
 | 
			
		||||
 | 
			
		||||
		max_gclk, ret := nvml.DeviceGetMaxClockInfo(device, nvml.CLOCK_GRAPHICS)
 | 
			
		||||
		if ret == nvml.SUCCESS {
 | 
			
		||||
			_, skip = stringArrayContains(m.config.ExcludeMetrics, "nv_max_graphics_clock")
 | 
			
		||||
			y, err := lp.New("nv_max_graphics_clock", tags, m.meta, map[string]interface{}{"value": float64(max_gclk)}, time.Now())
 | 
			
		||||
			if err == nil && !skip {
 | 
			
		||||
				output <- y
 | 
			
		||||
		if !exclude("nv_max_graphics_clock") {
 | 
			
		||||
			max_gclk, ret := nvml.DeviceGetMaxClockInfo(device.device, nvml.CLOCK_GRAPHICS)
 | 
			
		||||
			if ret == nvml.SUCCESS {
 | 
			
		||||
				y, err := lp.New("nv_max_graphics_clock", device.tags, m.meta, map[string]interface{}{"value": float64(max_gclk)}, time.Now())
 | 
			
		||||
				if err == nil {
 | 
			
		||||
					output <- y
 | 
			
		||||
				}
 | 
			
		||||
			}
 | 
			
		||||
		}
 | 
			
		||||
 | 
			
		||||
		max_smclk, ret := nvml.DeviceGetClockInfo(device, nvml.CLOCK_SM)
 | 
			
		||||
		if ret == nvml.SUCCESS {
 | 
			
		||||
			_, skip = stringArrayContains(m.config.ExcludeMetrics, "nv_max_sm_clock")
 | 
			
		||||
			y, err := lp.New("nv_max_sm_clock", tags, m.meta, map[string]interface{}{"value": float64(max_smclk)}, time.Now())
 | 
			
		||||
			if err == nil && !skip {
 | 
			
		||||
				output <- y
 | 
			
		||||
		if !exclude("nv_max_sm_clock") {
			// Query the maximum SM clock. DeviceGetClockInfo would return the
			// current clock, which is already reported as nv_sm_clock_report.
			max_smclk, ret := nvml.DeviceGetMaxClockInfo(device.device, nvml.CLOCK_SM)
			if ret == nvml.SUCCESS {
				y, err := lp.New("nv_max_sm_clock", device.tags, m.meta, map[string]interface{}{"value": float64(max_smclk)}, time.Now())
				if err == nil {
					output <- y
				}
			}
		}
 | 
			
		||||
 | 
			
		||||
		max_memclk, ret := nvml.DeviceGetClockInfo(device, nvml.CLOCK_MEM)
 | 
			
		||||
		if ret == nvml.SUCCESS {
 | 
			
		||||
			_, skip = stringArrayContains(m.config.ExcludeMetrics, "nv_max_mem_clock")
 | 
			
		||||
			y, err := lp.New("nv_max_mem_clock", tags, m.meta, map[string]interface{}{"value": float64(max_memclk)}, time.Now())
 | 
			
		||||
			if err == nil && !skip {
 | 
			
		||||
				output <- y
 | 
			
		||||
		if !exclude("nv_max_mem_clock") {
			// Query the maximum memory clock. DeviceGetClockInfo would return
			// the current clock, which is already reported as
			// nv_mem_clock_report.
			max_memclk, ret := nvml.DeviceGetMaxClockInfo(device.device, nvml.CLOCK_MEM)
			if ret == nvml.SUCCESS {
				y, err := lp.New("nv_max_mem_clock", device.tags, m.meta, map[string]interface{}{"value": float64(max_memclk)}, time.Now())
				if err == nil {
					output <- y
				}
			}
		}
 | 
			
		||||
 | 
			
		||||
		ecc_db, ret := nvml.DeviceGetTotalEccErrors(device, 1, 1)
 | 
			
		||||
		if ret == nvml.SUCCESS {
 | 
			
		||||
			_, skip = stringArrayContains(m.config.ExcludeMetrics, "nv_ecc_db_error")
 | 
			
		||||
			y, err := lp.New("nv_ecc_db_error", tags, m.meta, map[string]interface{}{"value": float64(ecc_db)}, time.Now())
 | 
			
		||||
			if err == nil && !skip {
 | 
			
		||||
				output <- y
 | 
			
		||||
		if !exclude("nv_ecc_db_error") {
 | 
			
		||||
			ecc_db, ret := nvml.DeviceGetTotalEccErrors(device.device, 1, 1)
 | 
			
		||||
			if ret == nvml.SUCCESS {
 | 
			
		||||
				y, err := lp.New("nv_ecc_db_error", device.tags, m.meta, map[string]interface{}{"value": float64(ecc_db)}, time.Now())
 | 
			
		||||
				if err == nil {
 | 
			
		||||
					output <- y
 | 
			
		||||
				}
 | 
			
		||||
			}
 | 
			
		||||
		}
 | 
			
		||||
 | 
			
		||||
		ecc_sb, ret := nvml.DeviceGetTotalEccErrors(device, 0, 1)
 | 
			
		||||
		if ret == nvml.SUCCESS {
 | 
			
		||||
			_, skip = stringArrayContains(m.config.ExcludeMetrics, "nv_ecc_sb_error")
 | 
			
		||||
			y, err := lp.New("nv_ecc_sb_error", tags, m.meta, map[string]interface{}{"value": float64(ecc_sb)}, time.Now())
 | 
			
		||||
			if err == nil && !skip {
 | 
			
		||||
				output <- y
 | 
			
		||||
		if !exclude("nv_ecc_sb_error") {
 | 
			
		||||
			ecc_sb, ret := nvml.DeviceGetTotalEccErrors(device.device, 0, 1)
 | 
			
		||||
			if ret == nvml.SUCCESS {
 | 
			
		||||
				y, err := lp.New("nv_ecc_sb_error", device.tags, m.meta, map[string]interface{}{"value": float64(ecc_sb)}, time.Now())
 | 
			
		||||
				if err == nil {
 | 
			
		||||
					output <- y
 | 
			
		||||
				}
 | 
			
		||||
			}
 | 
			
		||||
		}
 | 
			
		||||
 | 
			
		||||
		pwr_limit, ret := nvml.DeviceGetPowerManagementLimit(device)
 | 
			
		||||
		if ret == nvml.SUCCESS {
 | 
			
		||||
			_, skip = stringArrayContains(m.config.ExcludeMetrics, "nv_power_man_limit")
 | 
			
		||||
			y, err := lp.New("nv_power_man_limit", tags, m.meta, map[string]interface{}{"value": float64(pwr_limit)}, time.Now())
 | 
			
		||||
			if err == nil && !skip {
 | 
			
		||||
				output <- y
 | 
			
		||||
		if !exclude("nv_power_man_limit") {
 | 
			
		||||
			pwr_limit, ret := nvml.DeviceGetPowerManagementLimit(device.device)
 | 
			
		||||
			if ret == nvml.SUCCESS {
 | 
			
		||||
				y, err := lp.New("nv_power_man_limit", device.tags, m.meta, map[string]interface{}{"value": float64(pwr_limit)}, time.Now())
 | 
			
		||||
				if err == nil {
 | 
			
		||||
					output <- y
 | 
			
		||||
				}
 | 
			
		||||
			}
 | 
			
		||||
		}
 | 
			
		||||
 | 
			
		||||
		enc_util, _, ret := nvml.DeviceGetEncoderUtilization(device)
 | 
			
		||||
		if ret == nvml.SUCCESS {
 | 
			
		||||
			_, skip = stringArrayContains(m.config.ExcludeMetrics, "nv_encoder_util")
 | 
			
		||||
			y, err := lp.New("nv_encoder_util", tags, m.meta, map[string]interface{}{"value": float64(enc_util)}, time.Now())
 | 
			
		||||
			if err == nil && !skip {
 | 
			
		||||
				output <- y
 | 
			
		||||
		if !exclude("nv_encoder_util") {
 | 
			
		||||
			enc_util, _, ret := nvml.DeviceGetEncoderUtilization(device.device)
 | 
			
		||||
			if ret == nvml.SUCCESS {
 | 
			
		||||
				y, err := lp.New("nv_encoder_util", device.tags, m.meta, map[string]interface{}{"value": float64(enc_util)}, time.Now())
 | 
			
		||||
				if err == nil {
 | 
			
		||||
					output <- y
 | 
			
		||||
				}
 | 
			
		||||
			}
 | 
			
		||||
		}
 | 
			
		||||
 | 
			
		||||
		dec_util, _, ret := nvml.DeviceGetDecoderUtilization(device)
 | 
			
		||||
		if ret == nvml.SUCCESS {
 | 
			
		||||
			_, skip = stringArrayContains(m.config.ExcludeMetrics, "nv_decoder_util")
 | 
			
		||||
			y, err := lp.New("nv_decoder_util", tags, m.meta, map[string]interface{}{"value": float64(dec_util)}, time.Now())
 | 
			
		||||
			if err == nil && !skip {
 | 
			
		||||
				output <- y
 | 
			
		||||
		if !exclude("nv_decoder_util") {
 | 
			
		||||
			dec_util, _, ret := nvml.DeviceGetDecoderUtilization(device.device)
 | 
			
		||||
			if ret == nvml.SUCCESS {
 | 
			
		||||
				y, err := lp.New("nv_decoder_util", device.tags, m.meta, map[string]interface{}{"value": float64(dec_util)}, time.Now())
 | 
			
		||||
				if err == nil {
 | 
			
		||||
					output <- y
 | 
			
		||||
				}
 | 
			
		||||
			}
 | 
			
		||||
		}
 | 
			
		||||
	}
 | 
			
		||||
 
 | 
			
		||||
		Reference in New Issue
	
	Block a user