mirror of
				https://github.com/ClusterCockpit/cc-metric-collector.git
				synced 2025-11-04 02:35:07 +01:00 
			
		
		
		
	Modularize the whole thing (#16)
* Use channels, add a metric router, split up configuration and use extended version of Influx line protocol internally * Use central timer for collectors and router. Add expressions to router * Add expression to router config * Update entry points * Start with README * Update README for CCMetric * Formatting * Update README.md * Add README for MultiChanTicker * Add README for MultiChanTicker * Update README.md * Add README to metric router * Update main README * Remove SinkEntity type * Update README for sinks * Update go files * Update README for receivers * Update collectors README * Update collectors README * Use separate page per collector * Fix for tempstat page * Add docs for customcmd collector * Add docs for ipmistat collector * Add docs for topprocs collector * Update customCmdMetric.md * Use seconds when calculating LIKWID metrics * Add IB metrics ib_recv_pkts and ib_xmit_pkts * Drop domain part of host name * Updated to latest stable version of likwid * Define source code dependencies in Makefile * Add GPFS / IBM Spectrum Scale collector * Add vet and staticcheck make targets * Add vet and staticcheck make targets * Avoid go vet warning: struct field tag `json:"..., omitempty"` not compatible with reflect.StructTag.Get: suspicious space in struct tag value struct field tag `json:"...", omitempty` not compatible with reflect.StructTag.Get: key:"value" pairs not separated by spaces * Add sample collector to README.md * Add CPU frequency collector * Avoid staticcheck warning: redundant return statement * Avoid staticcheck warning: unnecessary assignment to the blank identifier * Simplified code * Add CPUFreqCollectorCpuinfo a metric collector to measure the current frequency of the CPUs as obtained from /proc/cpuinfo Only measure on the first hyperthread * Add collector for NFS clients * Move publication of metrics into Flush() for NatsSink * Update GitHub actions * Refactoring * Avoid vet warning: Println arg list ends with redundant newline * Avoid vet
warning struct field commands has json tag but is not exported * Avoid vet warning: return copies lock value. * Corrected typo * Refactoring * Add go sources in internal/... * Bad separator in Makefile * Fix Infiniband collector Co-authored-by: Holger Obermaier <40787752+ho-ob@users.noreply.github.com>
This commit is contained in:
		@@ -6,9 +6,8 @@ import (
 | 
			
		||||
	"fmt"
 | 
			
		||||
	"log"
 | 
			
		||||
	"time"
 | 
			
		||||
 | 
			
		||||
	lp "github.com/ClusterCockpit/cc-metric-collector/internal/ccMetric"
 | 
			
		||||
	"github.com/NVIDIA/go-nvml/pkg/nvml"
 | 
			
		||||
	lp "github.com/influxdata/line-protocol"
 | 
			
		||||
)
 | 
			
		||||
 | 
			
		||||
type NvidiaCollectorConfig struct {
 | 
			
		||||
@@ -17,7 +16,7 @@ type NvidiaCollectorConfig struct {
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
type NvidiaCollector struct {
 | 
			
		||||
	MetricCollector
 | 
			
		||||
	metricCollector
 | 
			
		||||
	num_gpus int
 | 
			
		||||
	config   NvidiaCollectorConfig
 | 
			
		||||
}
 | 
			
		||||
@@ -29,10 +28,11 @@ func (m *NvidiaCollector) CatchPanic() {
 | 
			
		||||
	}
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
func (m *NvidiaCollector) Init(config []byte) error {
 | 
			
		||||
func (m *NvidiaCollector) Init(config json.RawMessage) error {
 | 
			
		||||
	var err error
 | 
			
		||||
	m.name = "NvidiaCollector"
 | 
			
		||||
	m.setup()
 | 
			
		||||
	m.meta = map[string]string{"source": m.name, "group": "Nvidia"}
 | 
			
		||||
	if len(config) > 0 {
 | 
			
		||||
		err = json.Unmarshal(config, &m.config)
 | 
			
		||||
		if err != nil {
 | 
			
		||||
@@ -55,7 +55,7 @@ func (m *NvidiaCollector) Init(config []byte) error {
 | 
			
		||||
	return nil
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
func (m *NvidiaCollector) Read(interval time.Duration, out *[]lp.MutableMetric) {
 | 
			
		||||
func (m *NvidiaCollector) Read(interval time.Duration, output chan lp.CCMetric) {
 | 
			
		||||
	if !m.init {
 | 
			
		||||
		return
 | 
			
		||||
	}
 | 
			
		||||
@@ -74,14 +74,14 @@ func (m *NvidiaCollector) Read(interval time.Duration, out *[]lp.MutableMetric)
 | 
			
		||||
		util, ret := nvml.DeviceGetUtilizationRates(device)
 | 
			
		||||
		if ret == nvml.SUCCESS {
 | 
			
		||||
			_, skip = stringArrayContains(m.config.ExcludeMetrics, "util")
 | 
			
		||||
			y, err := lp.New("util", tags, map[string]interface{}{"value": float64(util.Gpu)}, time.Now())
 | 
			
		||||
			y, err := lp.New("util", tags, m.meta, map[string]interface{}{"value": float64(util.Gpu)}, time.Now())
 | 
			
		||||
			if err == nil && !skip {
 | 
			
		||||
				*out = append(*out, y)
 | 
			
		||||
				output <- y
 | 
			
		||||
			}
 | 
			
		||||
			_, skip = stringArrayContains(m.config.ExcludeMetrics, "mem_util")
 | 
			
		||||
			y, err = lp.New("mem_util", tags, map[string]interface{}{"value": float64(util.Memory)}, time.Now())
 | 
			
		||||
			y, err = lp.New("mem_util", tags, m.meta, map[string]interface{}{"value": float64(util.Memory)}, time.Now())
 | 
			
		||||
			if err == nil && !skip {
 | 
			
		||||
				*out = append(*out, y)
 | 
			
		||||
				output <- y
 | 
			
		||||
			}
 | 
			
		||||
		}
 | 
			
		||||
 | 
			
		||||
@@ -89,174 +89,177 @@ func (m *NvidiaCollector) Read(interval time.Duration, out *[]lp.MutableMetric)
 | 
			
		||||
		if ret == nvml.SUCCESS {
 | 
			
		||||
			t := float64(meminfo.Total) / (1024 * 1024)
 | 
			
		||||
			_, skip = stringArrayContains(m.config.ExcludeMetrics, "mem_total")
 | 
			
		||||
			y, err := lp.New("mem_total", tags, map[string]interface{}{"value": t}, time.Now())
 | 
			
		||||
			y, err := lp.New("mem_total", tags, m.meta, map[string]interface{}{"value": t}, time.Now())
 | 
			
		||||
			if err == nil && !skip {
 | 
			
		||||
				*out = append(*out, y)
 | 
			
		||||
				y.AddMeta("unit", "MByte")
 | 
			
		||||
				output <- y
 | 
			
		||||
			}
 | 
			
		||||
			f := float64(meminfo.Used) / (1024 * 1024)
 | 
			
		||||
			_, skip = stringArrayContains(m.config.ExcludeMetrics, "fb_memory")
 | 
			
		||||
			y, err = lp.New("fb_memory", tags, map[string]interface{}{"value": f}, time.Now())
 | 
			
		||||
			y, err = lp.New("fb_memory", tags, m.meta, map[string]interface{}{"value": f}, time.Now())
 | 
			
		||||
			if err == nil && !skip {
 | 
			
		||||
				*out = append(*out, y)
 | 
			
		||||
				y.AddMeta("unit", "MByte")
 | 
			
		||||
				output <- y
 | 
			
		||||
			}
 | 
			
		||||
		}
 | 
			
		||||
 | 
			
		||||
		temp, ret := nvml.DeviceGetTemperature(device, nvml.TEMPERATURE_GPU)
 | 
			
		||||
		if ret == nvml.SUCCESS {
 | 
			
		||||
			_, skip = stringArrayContains(m.config.ExcludeMetrics, "temp")
 | 
			
		||||
			y, err := lp.New("temp", tags, map[string]interface{}{"value": float64(temp)}, time.Now())
 | 
			
		||||
			y, err := lp.New("temp", tags, m.meta, map[string]interface{}{"value": float64(temp)}, time.Now())
 | 
			
		||||
			if err == nil && !skip {
 | 
			
		||||
				*out = append(*out, y)
 | 
			
		||||
				y.AddMeta("unit", "degC")
 | 
			
		||||
				output <- y
 | 
			
		||||
			}
 | 
			
		||||
		}
 | 
			
		||||
 | 
			
		||||
		fan, ret := nvml.DeviceGetFanSpeed(device)
 | 
			
		||||
		if ret == nvml.SUCCESS {
 | 
			
		||||
			_, skip = stringArrayContains(m.config.ExcludeMetrics, "fan")
 | 
			
		||||
			y, err := lp.New("fan", tags, map[string]interface{}{"value": float64(fan)}, time.Now())
 | 
			
		||||
			y, err := lp.New("fan", tags, m.meta, map[string]interface{}{"value": float64(fan)}, time.Now())
 | 
			
		||||
			if err == nil && !skip {
 | 
			
		||||
				*out = append(*out, y)
 | 
			
		||||
				output <- y
 | 
			
		||||
			}
 | 
			
		||||
		}
 | 
			
		||||
 | 
			
		||||
		_, ecc_pend, ret := nvml.DeviceGetEccMode(device)
 | 
			
		||||
		if ret == nvml.SUCCESS {
 | 
			
		||||
			var y lp.MutableMetric
 | 
			
		||||
			var y lp.CCMetric
 | 
			
		||||
			var err error
 | 
			
		||||
			switch ecc_pend {
 | 
			
		||||
			case nvml.FEATURE_DISABLED:
 | 
			
		||||
				y, err = lp.New("ecc_mode", tags, map[string]interface{}{"value": string("OFF")}, time.Now())
 | 
			
		||||
				y, err = lp.New("ecc_mode", tags, m.meta, map[string]interface{}{"value": string("OFF")}, time.Now())
 | 
			
		||||
			case nvml.FEATURE_ENABLED:
 | 
			
		||||
				y, err = lp.New("ecc_mode", tags, map[string]interface{}{"value": string("ON")}, time.Now())
 | 
			
		||||
				y, err = lp.New("ecc_mode", tags, m.meta, map[string]interface{}{"value": string("ON")}, time.Now())
 | 
			
		||||
			default:
 | 
			
		||||
				y, err = lp.New("ecc_mode", tags, map[string]interface{}{"value": string("UNKNOWN")}, time.Now())
 | 
			
		||||
				y, err = lp.New("ecc_mode", tags, m.meta, map[string]interface{}{"value": string("UNKNOWN")}, time.Now())
 | 
			
		||||
			}
 | 
			
		||||
			_, skip = stringArrayContains(m.config.ExcludeMetrics, "ecc_mode")
 | 
			
		||||
			if err == nil && !skip {
 | 
			
		||||
				*out = append(*out, y)
 | 
			
		||||
				output <- y
 | 
			
		||||
			}
 | 
			
		||||
		} else if ret == nvml.ERROR_NOT_SUPPORTED {
 | 
			
		||||
			_, skip = stringArrayContains(m.config.ExcludeMetrics, "ecc_mode")
 | 
			
		||||
			y, err := lp.New("ecc_mode", tags, map[string]interface{}{"value": string("N/A")}, time.Now())
 | 
			
		||||
			y, err := lp.New("ecc_mode", tags, m.meta, map[string]interface{}{"value": string("N/A")}, time.Now())
 | 
			
		||||
			if err == nil && !skip {
 | 
			
		||||
				*out = append(*out, y)
 | 
			
		||||
				output <- y
 | 
			
		||||
			}
 | 
			
		||||
		}
 | 
			
		||||
 | 
			
		||||
		pstate, ret := nvml.DeviceGetPerformanceState(device)
 | 
			
		||||
		if ret == nvml.SUCCESS {
 | 
			
		||||
			_, skip = stringArrayContains(m.config.ExcludeMetrics, "perf_state")
 | 
			
		||||
			y, err := lp.New("perf_state", tags, map[string]interface{}{"value": fmt.Sprintf("P%d", int(pstate))}, time.Now())
 | 
			
		||||
			y, err := lp.New("perf_state", tags, m.meta, map[string]interface{}{"value": fmt.Sprintf("P%d", int(pstate))}, time.Now())
 | 
			
		||||
			if err == nil && !skip {
 | 
			
		||||
				*out = append(*out, y)
 | 
			
		||||
				output <- y
 | 
			
		||||
			}
 | 
			
		||||
		}
 | 
			
		||||
 | 
			
		||||
		power, ret := nvml.DeviceGetPowerUsage(device)
 | 
			
		||||
		if ret == nvml.SUCCESS {
 | 
			
		||||
			_, skip = stringArrayContains(m.config.ExcludeMetrics, "power_usage_report")
 | 
			
		||||
			y, err := lp.New("power_usage_report", tags, map[string]interface{}{"value": float64(power) / 1000}, time.Now())
 | 
			
		||||
			y, err := lp.New("power_usage_report", tags, m.meta, map[string]interface{}{"value": float64(power) / 1000}, time.Now())
 | 
			
		||||
			if err == nil && !skip {
 | 
			
		||||
				*out = append(*out, y)
 | 
			
		||||
				output <- y
 | 
			
		||||
			}
 | 
			
		||||
		}
 | 
			
		||||
 | 
			
		||||
		gclk, ret := nvml.DeviceGetClockInfo(device, nvml.CLOCK_GRAPHICS)
 | 
			
		||||
		if ret == nvml.SUCCESS {
 | 
			
		||||
			_, skip = stringArrayContains(m.config.ExcludeMetrics, "graphics_clock_report")
 | 
			
		||||
			y, err := lp.New("graphics_clock_report", tags, map[string]interface{}{"value": float64(gclk)}, time.Now())
 | 
			
		||||
			y, err := lp.New("graphics_clock_report", tags, m.meta, map[string]interface{}{"value": float64(gclk)}, time.Now())
 | 
			
		||||
			if err == nil && !skip {
 | 
			
		||||
				*out = append(*out, y)
 | 
			
		||||
				output <- y
 | 
			
		||||
			}
 | 
			
		||||
		}
 | 
			
		||||
 | 
			
		||||
		smclk, ret := nvml.DeviceGetClockInfo(device, nvml.CLOCK_SM)
 | 
			
		||||
		if ret == nvml.SUCCESS {
 | 
			
		||||
			_, skip = stringArrayContains(m.config.ExcludeMetrics, "sm_clock_report")
 | 
			
		||||
			y, err := lp.New("sm_clock_report", tags, map[string]interface{}{"value": float64(smclk)}, time.Now())
 | 
			
		||||
			y, err := lp.New("sm_clock_report", tags, m.meta, map[string]interface{}{"value": float64(smclk)}, time.Now())
 | 
			
		||||
			if err == nil && !skip {
 | 
			
		||||
				*out = append(*out, y)
 | 
			
		||||
				output <- y
 | 
			
		||||
			}
 | 
			
		||||
		}
 | 
			
		||||
 | 
			
		||||
		memclk, ret := nvml.DeviceGetClockInfo(device, nvml.CLOCK_MEM)
 | 
			
		||||
		if ret == nvml.SUCCESS {
 | 
			
		||||
			_, skip = stringArrayContains(m.config.ExcludeMetrics, "mem_clock_report")
 | 
			
		||||
			y, err := lp.New("mem_clock_report", tags, map[string]interface{}{"value": float64(memclk)}, time.Now())
 | 
			
		||||
			y, err := lp.New("mem_clock_report", tags, m.meta, map[string]interface{}{"value": float64(memclk)}, time.Now())
 | 
			
		||||
			if err == nil && !skip {
 | 
			
		||||
				*out = append(*out, y)
 | 
			
		||||
				output <- y
 | 
			
		||||
			}
 | 
			
		||||
		}
 | 
			
		||||
 | 
			
		||||
		max_gclk, ret := nvml.DeviceGetMaxClockInfo(device, nvml.CLOCK_GRAPHICS)
 | 
			
		||||
		if ret == nvml.SUCCESS {
 | 
			
		||||
			_, skip = stringArrayContains(m.config.ExcludeMetrics, "max_graphics_clock")
 | 
			
		||||
			y, err := lp.New("max_graphics_clock", tags, map[string]interface{}{"value": float64(max_gclk)}, time.Now())
 | 
			
		||||
			y, err := lp.New("max_graphics_clock", tags, m.meta, map[string]interface{}{"value": float64(max_gclk)}, time.Now())
 | 
			
		||||
			if err == nil && !skip {
 | 
			
		||||
				*out = append(*out, y)
 | 
			
		||||
				output <- y
 | 
			
		||||
			}
 | 
			
		||||
		}
 | 
			
		||||
 | 
			
		||||
		max_smclk, ret := nvml.DeviceGetClockInfo(device, nvml.CLOCK_SM)
 | 
			
		||||
		if ret == nvml.SUCCESS {
 | 
			
		||||
			_, skip = stringArrayContains(m.config.ExcludeMetrics, "max_sm_clock")
 | 
			
		||||
			y, err := lp.New("max_sm_clock", tags, map[string]interface{}{"value": float64(max_smclk)}, time.Now())
 | 
			
		||||
			y, err := lp.New("max_sm_clock", tags, m.meta, map[string]interface{}{"value": float64(max_smclk)}, time.Now())
 | 
			
		||||
			if err == nil && !skip {
 | 
			
		||||
				*out = append(*out, y)
 | 
			
		||||
				output <- y
 | 
			
		||||
			}
 | 
			
		||||
		}
 | 
			
		||||
 | 
			
		||||
		max_memclk, ret := nvml.DeviceGetClockInfo(device, nvml.CLOCK_MEM)
 | 
			
		||||
		if ret == nvml.SUCCESS {
 | 
			
		||||
			_, skip = stringArrayContains(m.config.ExcludeMetrics, "max_mem_clock")
 | 
			
		||||
			y, err := lp.New("max_mem_clock", tags, map[string]interface{}{"value": float64(max_memclk)}, time.Now())
 | 
			
		||||
			y, err := lp.New("max_mem_clock", tags, m.meta, map[string]interface{}{"value": float64(max_memclk)}, time.Now())
 | 
			
		||||
			if err == nil && !skip {
 | 
			
		||||
				*out = append(*out, y)
 | 
			
		||||
				output <- y
 | 
			
		||||
			}
 | 
			
		||||
		}
 | 
			
		||||
 | 
			
		||||
		ecc_db, ret := nvml.DeviceGetTotalEccErrors(device, 1, 1)
 | 
			
		||||
		if ret == nvml.SUCCESS {
 | 
			
		||||
			_, skip = stringArrayContains(m.config.ExcludeMetrics, "ecc_db_error")
 | 
			
		||||
			y, err := lp.New("ecc_db_error", tags, map[string]interface{}{"value": float64(ecc_db)}, time.Now())
 | 
			
		||||
			y, err := lp.New("ecc_db_error", tags, m.meta, map[string]interface{}{"value": float64(ecc_db)}, time.Now())
 | 
			
		||||
			if err == nil && !skip {
 | 
			
		||||
				*out = append(*out, y)
 | 
			
		||||
				output <- y
 | 
			
		||||
			}
 | 
			
		||||
		}
 | 
			
		||||
 | 
			
		||||
		ecc_sb, ret := nvml.DeviceGetTotalEccErrors(device, 0, 1)
 | 
			
		||||
		if ret == nvml.SUCCESS {
 | 
			
		||||
			_, skip = stringArrayContains(m.config.ExcludeMetrics, "ecc_sb_error")
 | 
			
		||||
			y, err := lp.New("ecc_sb_error", tags, map[string]interface{}{"value": float64(ecc_sb)}, time.Now())
 | 
			
		||||
			y, err := lp.New("ecc_sb_error", tags, m.meta, map[string]interface{}{"value": float64(ecc_sb)}, time.Now())
 | 
			
		||||
			if err == nil && !skip {
 | 
			
		||||
				*out = append(*out, y)
 | 
			
		||||
				output <- y
 | 
			
		||||
			}
 | 
			
		||||
		}
 | 
			
		||||
 | 
			
		||||
		pwr_limit, ret := nvml.DeviceGetPowerManagementLimit(device)
 | 
			
		||||
		if ret == nvml.SUCCESS {
 | 
			
		||||
			_, skip = stringArrayContains(m.config.ExcludeMetrics, "power_man_limit")
 | 
			
		||||
			y, err := lp.New("power_man_limit", tags, map[string]interface{}{"value": float64(pwr_limit)}, time.Now())
 | 
			
		||||
			y, err := lp.New("power_man_limit", tags, m.meta, map[string]interface{}{"value": float64(pwr_limit)}, time.Now())
 | 
			
		||||
			if err == nil && !skip {
 | 
			
		||||
				*out = append(*out, y)
 | 
			
		||||
				output <- y
 | 
			
		||||
			}
 | 
			
		||||
		}
 | 
			
		||||
 | 
			
		||||
		enc_util, _, ret := nvml.DeviceGetEncoderUtilization(device)
 | 
			
		||||
		if ret == nvml.SUCCESS {
 | 
			
		||||
			_, skip = stringArrayContains(m.config.ExcludeMetrics, "encoder_util")
 | 
			
		||||
			y, err := lp.New("encoder_util", tags, map[string]interface{}{"value": float64(enc_util)}, time.Now())
 | 
			
		||||
			y, err := lp.New("encoder_util", tags, m.meta, map[string]interface{}{"value": float64(enc_util)}, time.Now())
 | 
			
		||||
			if err == nil && !skip {
 | 
			
		||||
				*out = append(*out, y)
 | 
			
		||||
				output <- y
 | 
			
		||||
			}
 | 
			
		||||
		}
 | 
			
		||||
 | 
			
		||||
		dec_util, _, ret := nvml.DeviceGetDecoderUtilization(device)
 | 
			
		||||
		if ret == nvml.SUCCESS {
 | 
			
		||||
			_, skip = stringArrayContains(m.config.ExcludeMetrics, "decoder_util")
 | 
			
		||||
			y, err := lp.New("decoder_util", tags, map[string]interface{}{"value": float64(dec_util)}, time.Now())
 | 
			
		||||
			y, err := lp.New("decoder_util", tags, m.meta, map[string]interface{}{"value": float64(dec_util)}, time.Now())
 | 
			
		||||
			if err == nil && !skip {
 | 
			
		||||
				*out = append(*out, y)
 | 
			
		||||
				output <- y
 | 
			
		||||
			}
 | 
			
		||||
		}
 | 
			
		||||
	}
 | 
			
		||||
 
 | 
			
		||||
		Reference in New Issue
	
	Block a user