mirror of
				https://github.com/ClusterCockpit/cc-metric-collector.git
				synced 2025-10-24 23:05:06 +02:00 
			
		
		
		
	* InfiniBandCollector: Scale raw readings from octets to bytes * Fix clock frequency coming from LikwidCollector and update docs * Build DEB package for Ubuntu 20.04 for releases * Fix memstat collector with numa_stats option * Remove useless prints from MemstatCollector * Replace ioutils with os and io (#87) * Use lower case for error strings in RocmSmiCollector * move maybe-usable-by-other-cc-components to pkg. Fix all files to use the new paths (#88) * Add collector for monitoring the execution of cc-metric-collector itself (#81) * Add collector to monitor execution of cc-metric-collector itself * Register SelfCollector * Fix import paths for moved packages * Check if at least one CPU with frequency information was detected * Correct type: /proc/stats -> /proc/stat * Update README.md * Run ipmitool asynchron. Improved error handling. * Corrected some typos * Add running average power limit (RAPL) metric collector * Add running average power limit (RAPL) metric collector * Do not mess up with the orignal configuration * * Corrected json config in numastatsMetric.md * Added some debug output to numastatsMetric.go * Fixed computing number of physical packages for non continous physical package IDs (e.g. on Ampere Altra Q80-30) * Fix kernel panic for receiver config with missing receiver type * Add receiver to gather remote IPMI sensor metrics * Added config option to add ipmi-sensors command line options * Add documentaion for IPMI receiver * Update to latest version of included go modules * Add go.mod to App dependency * Try to use common metric tags across hardware vendors * Add IPMI metric: current * remove prefix enumeration like 01-... 
* Add IPMI receiver example configuration to receivers.json * Minimal formating changes * Add hostlist package * Added tests for hostlist Expand() * Use package hostlist to expand a host list * Use package hostlist to expand a host list * Some servers return "ConsumedPowerWatt":65535 instead of "ConsumedPowerWatt":null * Updated to latest package versions * Do not allow unknown fields in JSON configuration file * Add workflow to customize packages to docs * NFS I/O Stats Collector (#91) * Initial version * Delete values for vanished mount points and comments * Fix for Likwid collector (#95) * Run LIKWID in separate thread and check metric type * Change LIKWID collector documentation to use 'type' instead of 'scope' * Re-initialize LIKWID after one read is missing due to lock toggle * Register cc-metric-collector at Zenodo (#93) * Add initial version of Zenodo project file * Orcid ID added * Update .zenodo.json Co-authored-by: Holger Obermaier <holger.obermaier@kit.edu> * Update ipmiMetric.go Co-authored-by: Holger Obermaier <40787752+ho-ob@users.noreply.github.com> Co-authored-by: Holger Obermaier <Holger.Obermaier@kit.edu>
		
			
				
	
	
		
			152 lines
		
	
	
		
			3.2 KiB
		
	
	
	
		
			Go
		
	
	
	
	
	
			
		
		
	
	
			152 lines
		
	
	
		
			3.2 KiB
		
	
	
	
		
			Go
		
	
	
	
	
	
| package collectors
 | |
| 
 | |
| import (
 | |
| 	"bufio"
 | |
| 	"encoding/json"
 | |
| 	"fmt"
 | |
| 	"os"
 | |
| 	"path/filepath"
 | |
| 	"strconv"
 | |
| 	"strings"
 | |
| 	"time"
 | |
| 
 | |
| 	cclog "github.com/ClusterCockpit/cc-metric-collector/pkg/ccLogger"
 | |
| 	lp "github.com/ClusterCockpit/cc-metric-collector/pkg/ccMetric"
 | |
| )
 | |
| 
 | |
| // Non-Uniform Memory Access (NUMA) policy hit/miss statistics
 | |
| //
 | |
| // numa_hit:
 | |
| //
 | |
| //	A process wanted to allocate memory from this node, and succeeded.
 | |
| //
 | |
| // numa_miss:
 | |
| //
 | |
| //	A process wanted to allocate memory from another node,
 | |
| //	but ended up with memory from this node.
 | |
| //
 | |
| // numa_foreign:
 | |
| //
 | |
| //	A process wanted to allocate on this node,
 | |
| //	but ended up with memory from another node.
 | |
| //
 | |
| // local_node:
 | |
| //
 | |
| //	A process ran on this node's CPU,
 | |
| //	and got memory from this node.
 | |
| //
 | |
| // other_node:
 | |
| //
 | |
| //	A process ran on a different node's CPU
 | |
| //	and got memory from this node.
 | |
| //
 | |
| // interleave_hit:
 | |
| //
 | |
| //	Interleaving wanted to allocate from this node
 | |
| //	and succeeded.
 | |
| //
 | |
| // See: https://www.kernel.org/doc/html/latest/admin-guide/numastat.html
 | |
| type NUMAStatsCollectorTopolgy struct { // one entry per NUMA node directory discovered by Init
 | |
| 	file   string // path of the node's "numastat" file, opened on every Read
 | |
| 	tagSet map[string]string // tags passed to every metric of this node; Init sets "memoryDomain"
 | |
| }
 | |
| 
 | |
| type NUMAStatsCollector struct { // collector emitting "numastats_<key>" metrics per NUMA node
 | |
| 	metricCollector // embedded base type, declared elsewhere in the package
 | |
| 	topology []NUMAStatsCollectorTopolgy // filled by Init, iterated by Read
 | |
| }
 | |
| 
 | |
| func (m *NUMAStatsCollector) Init(config json.RawMessage) error {
 | |
| 	// Check if already initialized
 | |
| 	if m.init {
 | |
| 		return nil
 | |
| 	}
 | |
| 
 | |
| 	m.name = "NUMAStatsCollector"
 | |
| 	m.parallel = true
 | |
| 	m.setup()
 | |
| 	m.meta = map[string]string{
 | |
| 		"source": m.name,
 | |
| 		"group":  "NUMA",
 | |
| 	}
 | |
| 
 | |
| 	// Loop for all NUMA node directories
 | |
| 	base := "/sys/devices/system/node/node"
 | |
| 	globPattern := base + "[0-9]*"
 | |
| 	dirs, err := filepath.Glob(globPattern)
 | |
| 	if err != nil {
 | |
| 		return fmt.Errorf("unable to glob files with pattern '%s'", globPattern)
 | |
| 	}
 | |
| 	if dirs == nil {
 | |
| 		return fmt.Errorf("unable to find any files with pattern '%s'", globPattern)
 | |
| 	}
 | |
| 	m.topology = make([]NUMAStatsCollectorTopolgy, 0, len(dirs))
 | |
| 	for _, dir := range dirs {
 | |
| 		node := strings.TrimPrefix(dir, base)
 | |
| 		file := filepath.Join(dir, "numastat")
 | |
| 		m.topology = append(m.topology,
 | |
| 			NUMAStatsCollectorTopolgy{
 | |
| 				file:   file,
 | |
| 				tagSet: map[string]string{"memoryDomain": node},
 | |
| 			})
 | |
| 	}
 | |
| 
 | |
| 	// Initialized
 | |
| 	cclog.ComponentDebug(m.name, "initialized", len(m.topology), "NUMA domains")
 | |
| 	m.init = true
 | |
| 	return nil
 | |
| }
 | |
| 
 | |
| func (m *NUMAStatsCollector) Read(interval time.Duration, output chan lp.CCMetric) {
 | |
| 	if !m.init {
 | |
| 		return
 | |
| 	}
 | |
| 
 | |
| 	for i := range m.topology {
 | |
| 		// Loop for all NUMA domains
 | |
| 		t := &m.topology[i]
 | |
| 
 | |
| 		now := time.Now()
 | |
| 		file, err := os.Open(t.file)
 | |
| 		if err != nil {
 | |
| 			cclog.ComponentError(
 | |
| 				m.name,
 | |
| 				fmt.Sprintf("Read(): Failed to open file '%s': %v", t.file, err))
 | |
| 			return
 | |
| 		}
 | |
| 		scanner := bufio.NewScanner(file)
 | |
| 
 | |
| 		// Read line by line
 | |
| 		for scanner.Scan() {
 | |
| 			split := strings.Fields(scanner.Text())
 | |
| 			if len(split) != 2 {
 | |
| 				continue
 | |
| 			}
 | |
| 			key := split[0]
 | |
| 			value, err := strconv.ParseInt(split[1], 10, 64)
 | |
| 			if err != nil {
 | |
| 				cclog.ComponentError(
 | |
| 					m.name,
 | |
| 					fmt.Sprintf("Read(): Failed to convert %s='%s' to int64: %v", key, split[1], err))
 | |
| 				continue
 | |
| 			}
 | |
| 			y, err := lp.New(
 | |
| 				"numastats_"+key,
 | |
| 				t.tagSet,
 | |
| 				m.meta,
 | |
| 				map[string]interface{}{"value": value},
 | |
| 				now,
 | |
| 			)
 | |
| 			if err == nil {
 | |
| 				output <- y
 | |
| 			}
 | |
| 		}
 | |
| 
 | |
| 		file.Close()
 | |
| 	}
 | |
| }
 | |
| 
 | |
| func (m *NUMAStatsCollector) Close() {
 | |
| 	m.init = false
 | |
| }
 |