mirror of
				https://github.com/ClusterCockpit/cc-metric-collector.git
				synced 2025-11-04 02:35:07 +01:00 
			
		
		
		
	add only_metrics, exclude_metrics, absolute_values, diff_values and derived_values. docs: consistency for list
This commit is contained in:
		@@ -10,8 +10,8 @@ import (
 | 
				
			|||||||
	"strings"
 | 
						"strings"
 | 
				
			||||||
	"time"
 | 
						"time"
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
						lp "github.com/ClusterCockpit/cc-lib/ccMessage"
 | 
				
			||||||
	cclog "github.com/ClusterCockpit/cc-metric-collector/pkg/ccLogger"
 | 
						cclog "github.com/ClusterCockpit/cc-metric-collector/pkg/ccLogger"
 | 
				
			||||||
	lp "github.com/ClusterCockpit/cc-energy-manager/pkg/cc-message"
 | 
					 | 
				
			||||||
)
 | 
					)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
// Non-Uniform Memory Access (NUMA) policy hit/miss statistics
 | 
					// Non-Uniform Memory Access (NUMA) policy hit/miss statistics
 | 
				
			||||||
@@ -46,18 +46,53 @@ import (
 | 
				
			|||||||
//	and succeeded.
 | 
					//	and succeeded.
 | 
				
			||||||
//
 | 
					//
 | 
				
			||||||
// See: https://www.kernel.org/doc/html/latest/admin-guide/numastat.html
 | 
					// See: https://www.kernel.org/doc/html/latest/admin-guide/numastat.html
 | 
				
			||||||
type NUMAStatsCollectorTopolgy struct {
 | 
					type NUMAStatsCollectorConfig struct {
 | 
				
			||||||
	file   string
 | 
						SendAbsoluteValues bool     `json:"send_abs_values"`     // Defaults to true if not provided.
 | 
				
			||||||
	tagSet map[string]string
 | 
						SendDiffValues     bool     `json:"send_diff_values"`    // If true, diff metrics are sent.
 | 
				
			||||||
 | 
						SendDerivedValues  bool     `json:"send_derived_values"` // If true, derived (rate) metrics are sent.
 | 
				
			||||||
 | 
						ExcludeMetrics     []string `json:"exclude_metrics,omitempty"`
 | 
				
			||||||
 | 
						OnlyMetrics        []string `json:"only_metrics,omitempty"`
 | 
				
			||||||
}
 | 
					}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					// NUMAStatsCollectorTopolgy represents a single NUMA domain (identifier spelling kept as declared).
 | 
				
			||||||
 | 
					type NUMAStatsCollectorTopolgy struct {
 | 
				
			||||||
 | 
						file           string
 | 
				
			||||||
 | 
						tagSet         map[string]string
 | 
				
			||||||
 | 
						previousValues map[string]int64
 | 
				
			||||||
 | 
					}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					// NUMAStatsCollector collects NUMA statistics from /sys/devices/system/node/node*/numastat.
 | 
				
			||||||
type NUMAStatsCollector struct {
 | 
					type NUMAStatsCollector struct {
 | 
				
			||||||
	metricCollector
 | 
						metricCollector
 | 
				
			||||||
	topology []NUMAStatsCollectorTopolgy
 | 
						topology      []NUMAStatsCollectorTopolgy
 | 
				
			||||||
 | 
						config        NUMAStatsCollectorConfig
 | 
				
			||||||
 | 
						lastTimestamp time.Time
 | 
				
			||||||
 | 
					}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					type NUMAMetricDefinition struct {
 | 
				
			||||||
 | 
						name string
 | 
				
			||||||
 | 
						unit string
 | 
				
			||||||
 | 
					}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					// shouldOutput returns true if a metric should be forwarded based on only_metrics and exclude_metrics.
 | 
				
			||||||
 | 
					func (m *NUMAStatsCollector) shouldOutput(metricName string) bool {
 | 
				
			||||||
 | 
						if len(m.config.OnlyMetrics) > 0 {
 | 
				
			||||||
 | 
							for _, n := range m.config.OnlyMetrics {
 | 
				
			||||||
 | 
								if n == metricName {
 | 
				
			||||||
 | 
									return true
 | 
				
			||||||
 | 
								}
 | 
				
			||||||
 | 
							}
 | 
				
			||||||
 | 
							return false
 | 
				
			||||||
 | 
						}
 | 
				
			||||||
 | 
						for _, n := range m.config.ExcludeMetrics {
 | 
				
			||||||
 | 
							if n == metricName {
 | 
				
			||||||
 | 
								return false
 | 
				
			||||||
 | 
							}
 | 
				
			||||||
 | 
						}
 | 
				
			||||||
 | 
						return true
 | 
				
			||||||
}
 | 
					}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
func (m *NUMAStatsCollector) Init(config json.RawMessage) error {
 | 
					func (m *NUMAStatsCollector) Init(config json.RawMessage) error {
 | 
				
			||||||
	// Check if already initialized
 | 
					 | 
				
			||||||
	if m.init {
 | 
						if m.init {
 | 
				
			||||||
		return nil
 | 
							return nil
 | 
				
			||||||
	}
 | 
						}
 | 
				
			||||||
@@ -69,8 +104,13 @@ func (m *NUMAStatsCollector) Init(config json.RawMessage) error {
 | 
				
			|||||||
		"source": m.name,
 | 
							"source": m.name,
 | 
				
			||||||
		"group":  "NUMA",
 | 
							"group":  "NUMA",
 | 
				
			||||||
	}
 | 
						}
 | 
				
			||||||
 | 
						// Default configuration: send_abs_values defaults to true.
 | 
				
			||||||
	// Loop for all NUMA node directories
 | 
						m.config.SendAbsoluteValues = true
 | 
				
			||||||
 | 
						if len(config) > 0 {
 | 
				
			||||||
 | 
							if err := json.Unmarshal(config, &m.config); err != nil {
 | 
				
			||||||
 | 
								return err
 | 
				
			||||||
 | 
							}
 | 
				
			||||||
 | 
						}
 | 
				
			||||||
	base := "/sys/devices/system/node/node"
 | 
						base := "/sys/devices/system/node/node"
 | 
				
			||||||
	globPattern := base + "[0-9]*"
 | 
						globPattern := base + "[0-9]*"
 | 
				
			||||||
	dirs, err := filepath.Glob(globPattern)
 | 
						dirs, err := filepath.Glob(globPattern)
 | 
				
			||||||
@@ -84,15 +124,14 @@ func (m *NUMAStatsCollector) Init(config json.RawMessage) error {
 | 
				
			|||||||
	for _, dir := range dirs {
 | 
						for _, dir := range dirs {
 | 
				
			||||||
		node := strings.TrimPrefix(dir, base)
 | 
							node := strings.TrimPrefix(dir, base)
 | 
				
			||||||
		file := filepath.Join(dir, "numastat")
 | 
							file := filepath.Join(dir, "numastat")
 | 
				
			||||||
		m.topology = append(m.topology,
 | 
							m.topology = append(m.topology, NUMAStatsCollectorTopolgy{
 | 
				
			||||||
			NUMAStatsCollectorTopolgy{
 | 
								file:           file,
 | 
				
			||||||
				file:   file,
 | 
								tagSet:         map[string]string{"memoryDomain": node},
 | 
				
			||||||
				tagSet: map[string]string{"memoryDomain": node},
 | 
								previousValues: make(map[string]int64),
 | 
				
			||||||
			})
 | 
							})
 | 
				
			||||||
	}
 | 
						}
 | 
				
			||||||
 | 
					 | 
				
			||||||
	// Initialized
 | 
					 | 
				
			||||||
	cclog.ComponentDebug(m.name, "initialized", len(m.topology), "NUMA domains")
 | 
						cclog.ComponentDebug(m.name, "initialized", len(m.topology), "NUMA domains")
 | 
				
			||||||
 | 
						m.lastTimestamp = time.Now()
 | 
				
			||||||
	m.init = true
 | 
						m.init = true
 | 
				
			||||||
	return nil
 | 
						return nil
 | 
				
			||||||
}
 | 
					}
 | 
				
			||||||
@@ -102,46 +141,62 @@ func (m *NUMAStatsCollector) Read(interval time.Duration, output chan lp.CCMessa
 | 
				
			|||||||
		return
 | 
							return
 | 
				
			||||||
	}
 | 
						}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
	for i := range m.topology {
 | 
						now := time.Now()
 | 
				
			||||||
		// Loop for all NUMA domains
 | 
						timeDiff := now.Sub(m.lastTimestamp).Seconds()
 | 
				
			||||||
		t := &m.topology[i]
 | 
						m.lastTimestamp = now
 | 
				
			||||||
 | 
					
 | 
				
			||||||
		now := time.Now()
 | 
						for i := range m.topology {
 | 
				
			||||||
 | 
							t := &m.topology[i]
 | 
				
			||||||
		file, err := os.Open(t.file)
 | 
							file, err := os.Open(t.file)
 | 
				
			||||||
		if err != nil {
 | 
							if err != nil {
 | 
				
			||||||
			cclog.ComponentError(
 | 
								cclog.ComponentError(m.name, fmt.Sprintf("Read(): Failed to open file '%s': %v", t.file, err))
 | 
				
			||||||
				m.name,
 | 
								continue
 | 
				
			||||||
				fmt.Sprintf("Read(): Failed to open file '%s': %v", t.file, err))
 | 
					 | 
				
			||||||
			return
 | 
					 | 
				
			||||||
		}
 | 
							}
 | 
				
			||||||
		scanner := bufio.NewScanner(file)
 | 
							scanner := bufio.NewScanner(file)
 | 
				
			||||||
 | 
					 | 
				
			||||||
		// Read line by line
 | 
					 | 
				
			||||||
		for scanner.Scan() {
 | 
							for scanner.Scan() {
 | 
				
			||||||
			split := strings.Fields(scanner.Text())
 | 
								line := scanner.Text()
 | 
				
			||||||
 | 
								split := strings.Fields(line)
 | 
				
			||||||
			if len(split) != 2 {
 | 
								if len(split) != 2 {
 | 
				
			||||||
				continue
 | 
									continue
 | 
				
			||||||
			}
 | 
								}
 | 
				
			||||||
			key := split[0]
 | 
								key := split[0]
 | 
				
			||||||
			value, err := strconv.ParseInt(split[1], 10, 64)
 | 
								value, err := strconv.ParseInt(split[1], 10, 64)
 | 
				
			||||||
			if err != nil {
 | 
								if err != nil {
 | 
				
			||||||
				cclog.ComponentError(
 | 
									cclog.ComponentError(m.name, fmt.Sprintf("Read(): Failed to convert %s='%s' to int64: %v", key, split[1], err))
 | 
				
			||||||
					m.name,
 | 
					 | 
				
			||||||
					fmt.Sprintf("Read(): Failed to convert %s='%s' to int64: %v", key, split[1], err))
 | 
					 | 
				
			||||||
				continue
 | 
									continue
 | 
				
			||||||
			}
 | 
								}
 | 
				
			||||||
			y, err := lp.NewMessage(
 | 
								baseName := "numastats_" + key
 | 
				
			||||||
				"numastats_"+key,
 | 
					 | 
				
			||||||
				t.tagSet,
 | 
					 | 
				
			||||||
				m.meta,
 | 
					 | 
				
			||||||
				map[string]interface{}{"value": value},
 | 
					 | 
				
			||||||
				now,
 | 
					 | 
				
			||||||
			)
 | 
					 | 
				
			||||||
			if err == nil {
 | 
					 | 
				
			||||||
				output <- y
 | 
					 | 
				
			||||||
			}
 | 
					 | 
				
			||||||
		}
 | 
					 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
								// Send absolute value if enabled.
 | 
				
			||||||
 | 
								if m.config.SendAbsoluteValues && m.shouldOutput(baseName) {
 | 
				
			||||||
 | 
									msg, err := lp.NewMessage(baseName, t.tagSet, m.meta, map[string]interface{}{"value": value}, now)
 | 
				
			||||||
 | 
									if err == nil {
 | 
				
			||||||
 | 
										msg.AddMeta("unit", "count")
 | 
				
			||||||
 | 
										output <- msg
 | 
				
			||||||
 | 
									}
 | 
				
			||||||
 | 
								}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
								// If a previous value exists, compute diff and derived.
 | 
				
			||||||
 | 
								if prev, ok := t.previousValues[key]; ok {
 | 
				
			||||||
 | 
									diff := value - prev
 | 
				
			||||||
 | 
									if m.config.SendDiffValues && m.shouldOutput(baseName+"_diff") {
 | 
				
			||||||
 | 
										msg, err := lp.NewMessage(baseName+"_diff", t.tagSet, m.meta, map[string]interface{}{"value": diff}, now)
 | 
				
			||||||
 | 
										if err == nil {
 | 
				
			||||||
 | 
											msg.AddMeta("unit", "count")
 | 
				
			||||||
 | 
											output <- msg
 | 
				
			||||||
 | 
										}
 | 
				
			||||||
 | 
									}
 | 
				
			||||||
 | 
									if m.config.SendDerivedValues && m.shouldOutput(baseName+"_rate") {
 | 
				
			||||||
 | 
										rate := float64(value-prev) / timeDiff
 | 
				
			||||||
 | 
										msg, err := lp.NewMessage(baseName+"_rate", t.tagSet, m.meta, map[string]interface{}{"value": rate}, now)
 | 
				
			||||||
 | 
										if err == nil {
 | 
				
			||||||
 | 
											msg.AddMeta("unit", "counts/s")
 | 
				
			||||||
 | 
											output <- msg
 | 
				
			||||||
 | 
										}
 | 
				
			||||||
 | 
									}
 | 
				
			||||||
 | 
								}
 | 
				
			||||||
 | 
								t.previousValues[key] = value
 | 
				
			||||||
 | 
							}
 | 
				
			||||||
		file.Close()
 | 
							file.Close()
 | 
				
			||||||
	}
 | 
						}
 | 
				
			||||||
}
 | 
					}
 | 
				
			||||||
 
 | 
				
			|||||||
@@ -1,17 +1,43 @@
 | 
				
			|||||||
 | 
					 | 
				
			||||||
## `numastat` collector
 | 
					## `numastat` collector
 | 
				
			||||||
 | 
					
 | 
				
			||||||
```json
 | 
					```json
 | 
				
			||||||
  "numastats": {}
 | 
					  "numastats": {
 | 
				
			||||||
 | 
					    "send_abs_values": true,
 | 
				
			||||||
 | 
					    "send_diff_values": true,
 | 
				
			||||||
 | 
					    "send_derived_values": true,
 | 
				
			||||||
 | 
					    "exclude_metrics": [],
 | 
				
			||||||
 | 
					    "only_metrics": []
 | 
				
			||||||
 | 
					  }
 | 
				
			||||||
```
 | 
					```
 | 
				
			||||||
 | 
					
 | 
				
			||||||
The `numastat` collector reads data from `/sys/devices/system/node/node*/numastat` and outputs a handful of **memoryDomain** metrics. See: <https://www.kernel.org/doc/html/latest/admin-guide/numastat.html>
 | 
					The `numastat` collector reads data from `/sys/devices/system/node/node*/numastat` and outputs a handful of **memoryDomain** metrics. See: <https://www.kernel.org/doc/html/latest/admin-guide/numastat.html>
 | 
				
			||||||
 | 
					
 | 
				
			||||||
Metrics:
 | 
					Both filtering mechanisms are supported:
 | 
				
			||||||
 | 
					- `exclude_metrics`: Excludes the specified metrics. Names are matched exactly against the full metric name, including any `_diff` or `_rate` suffix.
 | 
				
			||||||
 | 
					- `only_metrics`: If provided, only the listed metrics are collected. This takes precedence over `exclude_metrics`.
 | 
				
			||||||
 | 
					
 | 
				
			||||||
* `numastats_numa_hit`: A process wanted to allocate memory from this node, and succeeded.
 | 
					Metrics are categorized as follows:
 | 
				
			||||||
* `numastats_numa_miss`: A process wanted to allocate memory from another node, but ended up with memory from this node.
 | 
					
 | 
				
			||||||
* `numastats_numa_foreign`: A process wanted to allocate on this node, but ended up with memory from another node.
 | 
					**Absolute Metrics:** (unit: `count`)
 | 
				
			||||||
* `numastats_local_node`: A process ran on this node's CPU, and got memory from this node.
 | 
					- `numastats_numa_hit`: A process wanted to allocate memory from this node, and succeeded.
 | 
				
			||||||
* `numastats_other_node`: A process ran on a different node's CPU, and got memory from this node.
 | 
					- `numastats_numa_miss`: A process wanted to allocate memory from another node, but ended up with memory from this node.
 | 
				
			||||||
* `numastats_interleave_hit`: Interleaving wanted to allocate from this node and succeeded.
 | 
					- `numastats_numa_foreign`: A process wanted to allocate on this node, but ended up with memory from another node.
 | 
				
			||||||
 | 
					- `numastats_local_node`: A process ran on this node's CPU, and got memory from this node.
 | 
				
			||||||
 | 
					- `numastats_other_node`: A process ran on a different node's CPU, and got memory from this node.
 | 
				
			||||||
 | 
					- `numastats_interleave_hit`: Interleaving wanted to allocate from this node and succeeded.
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					**Diff Metrics:** (unit: `count`)
 | 
				
			||||||
 | 
					- `numastats_numa_hit_diff`
 | 
				
			||||||
 | 
					- `numastats_numa_miss_diff`
 | 
				
			||||||
 | 
					- `numastats_numa_foreign_diff`
 | 
				
			||||||
 | 
					- `numastats_local_node_diff`
 | 
				
			||||||
 | 
					- `numastats_other_node_diff`
 | 
				
			||||||
 | 
					- `numastats_interleave_hit_diff`
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					**Derived Metrics:** (unit: `counts/s`)
 | 
				
			||||||
 | 
					- `numastats_numa_hit_rate`
 | 
				
			||||||
 | 
					- `numastats_numa_miss_rate`
 | 
				
			||||||
 | 
					- `numastats_numa_foreign_rate`
 | 
				
			||||||
 | 
					- `numastats_local_node_rate`
 | 
				
			||||||
 | 
					- `numastats_other_node_rate`
 | 
				
			||||||
 | 
					- `numastats_interleave_hit_rate`
 | 
				
			||||||
 
 | 
				
			|||||||
		Reference in New Issue
	
	Block a user