mirror of
				https://github.com/ClusterCockpit/cc-metric-collector.git
				synced 2025-11-04 02:35:07 +01:00 
			
		
		
		
	add only_metrics, exclude_metrics, absolute_values, diff_values and derived_values. docs: consistency for list
This commit is contained in:
		@@ -10,8 +10,8 @@ import (
 | 
			
		||||
	"strings"
 | 
			
		||||
	"time"
 | 
			
		||||
 | 
			
		||||
	lp "github.com/ClusterCockpit/cc-lib/ccMessage"
 | 
			
		||||
	cclog "github.com/ClusterCockpit/cc-metric-collector/pkg/ccLogger"
 | 
			
		||||
	lp "github.com/ClusterCockpit/cc-energy-manager/pkg/cc-message"
 | 
			
		||||
)
 | 
			
		||||
 | 
			
		||||
// Non-Uniform Memory Access (NUMA) policy hit/miss statistics
 | 
			
		||||
@@ -46,18 +46,53 @@ import (
 | 
			
		||||
//	and succeeded.
 | 
			
		||||
//
 | 
			
		||||
// See: https://www.kernel.org/doc/html/latest/admin-guide/numastat.html
 | 
			
		||||
type NUMAStatsCollectorTopolgy struct {
 | 
			
		||||
	file   string
 | 
			
		||||
	tagSet map[string]string
 | 
			
		||||
// NUMAStatsCollectorConfig holds the JSON-configurable options of the NUMA
// statistics collector.
//
// Metric names used in ExcludeMetrics and OnlyMetrics are compared against the
// full emitted name, i.e. including the "numastats_" prefix and, for diff and
// derived metrics, the "_diff" or "_rate" suffix.
type NUMAStatsCollectorConfig struct {
	// SendAbsoluteValues enables the raw counter metrics.
	// Defaults to true if not provided.
	SendAbsoluteValues bool `json:"send_abs_values"`
	// SendDiffValues enables the "<metric>_diff" metrics (change since the
	// previous read).
	SendDiffValues bool `json:"send_diff_values"`
	// SendDerivedValues enables the "<metric>_rate" metrics (change per
	// second since the previous read).
	SendDerivedValues bool `json:"send_derived_values"`
	// ExcludeMetrics lists metric names that are not forwarded. It is only
	// consulted when OnlyMetrics is empty.
	ExcludeMetrics []string `json:"exclude_metrics,omitempty"`
	// OnlyMetrics, when non-empty, restricts the output to the listed metric
	// names and takes precedence over ExcludeMetrics.
	OnlyMetrics []string `json:"only_metrics,omitempty"`
}
 | 
			
		||||
 | 
			
		||||
// NUMAStatsCollectorTopolgy represents a single NUMA domain.
type NUMAStatsCollectorTopolgy struct {
	// file is the path of the numastat file read for this domain.
	file string
	// tagSet holds the tags attached to every metric of this domain
	// (the "memoryDomain" tag).
	tagSet map[string]string
	// previousValues maps each numastat key to the value seen on the previous
	// read; it is the baseline for the diff and rate metrics.
	previousValues map[string]int64
}
 | 
			
		||||
 | 
			
		||||
// NUMAStatsCollector collects NUMA statistics from /sys/devices/system/node/node*/numastat.
 | 
			
		||||
type NUMAStatsCollector struct {
 | 
			
		||||
	metricCollector
 | 
			
		||||
	topology []NUMAStatsCollectorTopolgy
 | 
			
		||||
	topology      []NUMAStatsCollectorTopolgy
 | 
			
		||||
	config        NUMAStatsCollectorConfig
 | 
			
		||||
	lastTimestamp time.Time
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
// NUMAMetricDefinition describes a metric by its name and unit.
//
// NOTE(review): no use of this type is visible in this part of the file;
// confirm it is still needed.
type NUMAMetricDefinition struct {
	name string
	unit string
}
 | 
			
		||||
 | 
			
		||||
// shouldOutput returns true if a metric should be forwarded based on only_metrics and exclude_metrics.
 | 
			
		||||
func (m *NUMAStatsCollector) shouldOutput(metricName string) bool {
 | 
			
		||||
	if len(m.config.OnlyMetrics) > 0 {
 | 
			
		||||
		for _, n := range m.config.OnlyMetrics {
 | 
			
		||||
			if n == metricName {
 | 
			
		||||
				return true
 | 
			
		||||
			}
 | 
			
		||||
		}
 | 
			
		||||
		return false
 | 
			
		||||
	}
 | 
			
		||||
	for _, n := range m.config.ExcludeMetrics {
 | 
			
		||||
		if n == metricName {
 | 
			
		||||
			return false
 | 
			
		||||
		}
 | 
			
		||||
	}
 | 
			
		||||
	return true
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
func (m *NUMAStatsCollector) Init(config json.RawMessage) error {
 | 
			
		||||
	// Check if already initialized
 | 
			
		||||
	if m.init {
 | 
			
		||||
		return nil
 | 
			
		||||
	}
 | 
			
		||||
@@ -69,8 +104,13 @@ func (m *NUMAStatsCollector) Init(config json.RawMessage) error {
 | 
			
		||||
		"source": m.name,
 | 
			
		||||
		"group":  "NUMA",
 | 
			
		||||
	}
 | 
			
		||||
 | 
			
		||||
	// Loop for all NUMA node directories
 | 
			
		||||
	// Default configuration: send_abs_values defaults to true.
 | 
			
		||||
	m.config.SendAbsoluteValues = true
 | 
			
		||||
	if len(config) > 0 {
 | 
			
		||||
		if err := json.Unmarshal(config, &m.config); err != nil {
 | 
			
		||||
			return err
 | 
			
		||||
		}
 | 
			
		||||
	}
 | 
			
		||||
	base := "/sys/devices/system/node/node"
 | 
			
		||||
	globPattern := base + "[0-9]*"
 | 
			
		||||
	dirs, err := filepath.Glob(globPattern)
 | 
			
		||||
@@ -84,15 +124,14 @@ func (m *NUMAStatsCollector) Init(config json.RawMessage) error {
 | 
			
		||||
	for _, dir := range dirs {
 | 
			
		||||
		node := strings.TrimPrefix(dir, base)
 | 
			
		||||
		file := filepath.Join(dir, "numastat")
 | 
			
		||||
		m.topology = append(m.topology,
 | 
			
		||||
			NUMAStatsCollectorTopolgy{
 | 
			
		||||
				file:   file,
 | 
			
		||||
				tagSet: map[string]string{"memoryDomain": node},
 | 
			
		||||
			})
 | 
			
		||||
		m.topology = append(m.topology, NUMAStatsCollectorTopolgy{
 | 
			
		||||
			file:           file,
 | 
			
		||||
			tagSet:         map[string]string{"memoryDomain": node},
 | 
			
		||||
			previousValues: make(map[string]int64),
 | 
			
		||||
		})
 | 
			
		||||
	}
 | 
			
		||||
 | 
			
		||||
	// Initialized
 | 
			
		||||
	cclog.ComponentDebug(m.name, "initialized", len(m.topology), "NUMA domains")
 | 
			
		||||
	m.lastTimestamp = time.Now()
 | 
			
		||||
	m.init = true
 | 
			
		||||
	return nil
 | 
			
		||||
}
 | 
			
		||||
@@ -102,46 +141,62 @@ func (m *NUMAStatsCollector) Read(interval time.Duration, output chan lp.CCMessa
 | 
			
		||||
		return
 | 
			
		||||
	}
 | 
			
		||||
 | 
			
		||||
	for i := range m.topology {
 | 
			
		||||
		// Loop for all NUMA domains
 | 
			
		||||
		t := &m.topology[i]
 | 
			
		||||
	now := time.Now()
 | 
			
		||||
	timeDiff := now.Sub(m.lastTimestamp).Seconds()
 | 
			
		||||
	m.lastTimestamp = now
 | 
			
		||||
 | 
			
		||||
		now := time.Now()
 | 
			
		||||
	for i := range m.topology {
 | 
			
		||||
		t := &m.topology[i]
 | 
			
		||||
		file, err := os.Open(t.file)
 | 
			
		||||
		if err != nil {
 | 
			
		||||
			cclog.ComponentError(
 | 
			
		||||
				m.name,
 | 
			
		||||
				fmt.Sprintf("Read(): Failed to open file '%s': %v", t.file, err))
 | 
			
		||||
			return
 | 
			
		||||
			cclog.ComponentError(m.name, fmt.Sprintf("Read(): Failed to open file '%s': %v", t.file, err))
 | 
			
		||||
			continue
 | 
			
		||||
		}
 | 
			
		||||
		scanner := bufio.NewScanner(file)
 | 
			
		||||
 | 
			
		||||
		// Read line by line
 | 
			
		||||
		for scanner.Scan() {
 | 
			
		||||
			split := strings.Fields(scanner.Text())
 | 
			
		||||
			line := scanner.Text()
 | 
			
		||||
			split := strings.Fields(line)
 | 
			
		||||
			if len(split) != 2 {
 | 
			
		||||
				continue
 | 
			
		||||
			}
 | 
			
		||||
			key := split[0]
 | 
			
		||||
			value, err := strconv.ParseInt(split[1], 10, 64)
 | 
			
		||||
			if err != nil {
 | 
			
		||||
				cclog.ComponentError(
 | 
			
		||||
					m.name,
 | 
			
		||||
					fmt.Sprintf("Read(): Failed to convert %s='%s' to int64: %v", key, split[1], err))
 | 
			
		||||
				cclog.ComponentError(m.name, fmt.Sprintf("Read(): Failed to convert %s='%s' to int64: %v", key, split[1], err))
 | 
			
		||||
				continue
 | 
			
		||||
			}
 | 
			
		||||
			y, err := lp.NewMessage(
 | 
			
		||||
				"numastats_"+key,
 | 
			
		||||
				t.tagSet,
 | 
			
		||||
				m.meta,
 | 
			
		||||
				map[string]interface{}{"value": value},
 | 
			
		||||
				now,
 | 
			
		||||
			)
 | 
			
		||||
			if err == nil {
 | 
			
		||||
				output <- y
 | 
			
		||||
			}
 | 
			
		||||
		}
 | 
			
		||||
			baseName := "numastats_" + key
 | 
			
		||||
 | 
			
		||||
			// Send absolute value if enabled.
 | 
			
		||||
			if m.config.SendAbsoluteValues && m.shouldOutput(baseName) {
 | 
			
		||||
				msg, err := lp.NewMessage(baseName, t.tagSet, m.meta, map[string]interface{}{"value": value}, now)
 | 
			
		||||
				if err == nil {
 | 
			
		||||
					msg.AddMeta("unit", "count")
 | 
			
		||||
					output <- msg
 | 
			
		||||
				}
 | 
			
		||||
			}
 | 
			
		||||
 | 
			
		||||
			// If a previous value exists, compute diff and derived.
 | 
			
		||||
			if prev, ok := t.previousValues[key]; ok {
 | 
			
		||||
				diff := value - prev
 | 
			
		||||
				if m.config.SendDiffValues && m.shouldOutput(baseName+"_diff") {
 | 
			
		||||
					msg, err := lp.NewMessage(baseName+"_diff", t.tagSet, m.meta, map[string]interface{}{"value": diff}, now)
 | 
			
		||||
					if err == nil {
 | 
			
		||||
						msg.AddMeta("unit", "count")
 | 
			
		||||
						output <- msg
 | 
			
		||||
					}
 | 
			
		||||
				}
 | 
			
		||||
				if m.config.SendDerivedValues && m.shouldOutput(baseName+"_rate") {
 | 
			
		||||
					rate := float64(value-prev) / timeDiff
 | 
			
		||||
					msg, err := lp.NewMessage(baseName+"_rate", t.tagSet, m.meta, map[string]interface{}{"value": rate}, now)
 | 
			
		||||
					if err == nil {
 | 
			
		||||
						msg.AddMeta("unit", "counts/s")
 | 
			
		||||
						output <- msg
 | 
			
		||||
					}
 | 
			
		||||
				}
 | 
			
		||||
			}
 | 
			
		||||
			t.previousValues[key] = value
 | 
			
		||||
		}
 | 
			
		||||
		file.Close()
 | 
			
		||||
	}
 | 
			
		||||
}
 | 
			
		||||
 
 | 
			
		||||
@@ -1,17 +1,43 @@
 | 
			
		||||
 | 
			
		||||
## `numastat` collector
 | 
			
		||||
 | 
			
		||||
```json
 | 
			
		||||
  "numastats": {}
 | 
			
		||||
  "numastats": {
 | 
			
		||||
    "send_abs_values": true,
 | 
			
		||||
    "send_diff_values": true,
 | 
			
		||||
    "send_derived_values": true,
 | 
			
		||||
    "exclude_metrics": [],
 | 
			
		||||
    "only_metrics": []
 | 
			
		||||
  }
 | 
			
		||||
```
 | 
			
		||||
 | 
			
		||||
The `numastat` collector reads data from `/sys/devices/system/node/node*/numastat` and outputs a handful of **memoryDomain** metrics. See: <https://www.kernel.org/doc/html/latest/admin-guide/numastat.html>
 | 
			
		||||
 | 
			
		||||
Metrics:
 | 
			
		||||
Both filtering mechanisms are supported:
 | 
			
		||||
- `exclude_metrics`: Excludes the specified metrics.
 | 
			
		||||
- `only_metrics`: If provided, only the listed metrics are collected. This takes precedence over `exclude_metrics`.
 | 
			
		||||
 | 
			
		||||
* `numastats_numa_hit`: A process wanted to allocate memory from this node, and succeeded.
 | 
			
		||||
* `numastats_numa_miss`: A process wanted to allocate memory from another node, but ended up with memory from this node.
 | 
			
		||||
* `numastats_numa_foreign`: A process wanted to allocate on this node, but ended up with memory from another node.
 | 
			
		||||
* `numastats_local_node`: A process ran on this node's CPU, and got memory from this node.
 | 
			
		||||
* `numastats_other_node`: A process ran on a different node's CPU, and got memory from this node.
 | 
			
		||||
* `numastats_interleave_hit`: Interleaving wanted to allocate from this node and succeeded.
 | 
			
		||||
Metrics are categorized as follows:
 | 
			
		||||
 | 
			
		||||
**Absolute Metrics:** (unit: `count`)
 | 
			
		||||
- `numastats_numa_hit`: A process wanted to allocate memory from this node, and succeeded.
 | 
			
		||||
- `numastats_numa_miss`: A process wanted to allocate memory from another node, but ended up with memory from this node.
 | 
			
		||||
- `numastats_numa_foreign`: A process wanted to allocate on this node, but ended up with memory from another node.
 | 
			
		||||
- `numastats_local_node`: A process ran on this node's CPU, and got memory from this node.
 | 
			
		||||
- `numastats_other_node`: A process ran on a different node's CPU, and got memory from this node.
 | 
			
		||||
- `numastats_interleave_hit`: Interleaving wanted to allocate from this node and succeeded.
 | 
			
		||||
 | 
			
		||||
**Diff Metrics:** (unit: `count`)
 | 
			
		||||
- `numastats_numa_hit_diff`
 | 
			
		||||
- `numastats_numa_miss_diff`
 | 
			
		||||
- `numastats_numa_foreign_diff`
 | 
			
		||||
- `numastats_local_node_diff`
 | 
			
		||||
- `numastats_other_node_diff`
 | 
			
		||||
- `numastats_interleave_hit_diff`
 | 
			
		||||
 | 
			
		||||
**Derived Metrics:** (unit: `counts/s`)
 | 
			
		||||
- `numastats_numa_hit_rate`
 | 
			
		||||
- `numastats_numa_miss_rate`
 | 
			
		||||
- `numastats_numa_foreign_rate`
 | 
			
		||||
- `numastats_local_node_rate`
 | 
			
		||||
- `numastats_other_node_rate`
 | 
			
		||||
- `numastats_interleave_hit_rate`
 | 
			
		||||
 
 | 
			
		||||
		Reference in New Issue
	
	Block a user