From 0e57c8db1cf751ca0b72899dbf176e2258e11733 Mon Sep 17 00:00:00 2001 From: brinkcoder Date: Wed, 19 Feb 2025 11:35:32 +0100 Subject: [PATCH] Add derived_values for numastats (#134) * Check creation of CCMessage in NATS receiver * add derived_values for numastats * change to ccMessage * remove vim command artefact --------- Co-authored-by: Thomas Roehl Co-authored-by: exterr2f Co-authored-by: Thomas Gruber --- collectors/numastatsMetric.go | 71 +++++++++++++++++++++++++---------- collectors/numastatsMetric.md | 11 +++++- 2 files changed, 62 insertions(+), 20 deletions(-) diff --git a/collectors/numastatsMetric.go b/collectors/numastatsMetric.go index 20b000c..094cfe5 100644 --- a/collectors/numastatsMetric.go +++ b/collectors/numastatsMetric.go @@ -11,9 +11,14 @@ import ( "time" cclog "github.com/ClusterCockpit/cc-metric-collector/pkg/ccLogger" - lp "github.com/ClusterCockpit/cc-energy-manager/pkg/cc-message" + lp "github.com/ClusterCockpit/cc-lib/ccMessage" ) +type NUMAStatsCollectorConfig struct { + SendAbsoluteValues bool `json:"send_abs_values"` + SendDerivedValues bool `json:"send_derived_values"` +} + // Non-Uniform Memory Access (NUMA) policy hit/miss statistics // // numa_hit: @@ -47,13 +52,16 @@ import ( // // See: https://www.kernel.org/doc/html/latest/admin-guide/numastat.html type NUMAStatsCollectorTopolgy struct { - file string - tagSet map[string]string + file string + tagSet map[string]string + previousValues map[string]int64 } type NUMAStatsCollector struct { metricCollector - topology []NUMAStatsCollectorTopolgy + topology []NUMAStatsCollectorTopolgy + config NUMAStatsCollectorConfig + lastTimestamp time.Time } func (m *NUMAStatsCollector) Init(config json.RawMessage) error { @@ -86,8 +94,9 @@ func (m *NUMAStatsCollector) Init(config json.RawMessage) error { file := filepath.Join(dir, "numastat") m.topology = append(m.topology, NUMAStatsCollectorTopolgy{ - file: file, - tagSet: map[string]string{"memoryDomain": node}, + file: file, + tagSet: map[string]string{"memoryDomain": node}, + previousValues: make(map[string]int64), }) } @@ -102,23 +111,27 @@ func (m *NUMAStatsCollector) Read(interval time.Duration, output chan lp.CCMessa return } + now := time.Now() + timeDiff := now.Sub(m.lastTimestamp).Seconds() + m.lastTimestamp = now + for i := range m.topology { // Loop for all NUMA domains t := &m.topology[i] - now := time.Now() file, err := os.Open(t.file) if err != nil { cclog.ComponentError( m.name, fmt.Sprintf("Read(): Failed to open file '%s': %v", t.file, err)) - return + continue } scanner := bufio.NewScanner(file) // Read line by line for scanner.Scan() { - split := strings.Fields(scanner.Text()) + line := scanner.Text() + split := strings.Fields(line) if len(split) != 2 { continue } @@ -130,18 +143,38 @@ func (m *NUMAStatsCollector) Read(interval time.Duration, output chan lp.CCMessa fmt.Sprintf("Read(): Failed to convert %s='%s' to int64: %v", key, split[1], err)) continue } - y, err := lp.NewMessage( - "numastats_"+key, - t.tagSet, - m.meta, - map[string]interface{}{"value": value}, - now, - ) - if err == nil { - output <- y + + if m.config.SendAbsoluteValues { + msg, err := lp.NewMessage( + "numastats_"+key, + t.tagSet, + m.meta, + map[string]interface{}{"value": value}, + now, + ) + if err == nil { + output <- msg + } + } + + if m.config.SendDerivedValues { + prev, ok := t.previousValues[key] + if ok { + rate := float64(value-prev) / timeDiff + msg, err := lp.NewMessage( + "numastats_"+key+"_rate", + t.tagSet, + m.meta, + map[string]interface{}{"value": rate}, + now, + ) + if err == nil { + output <- msg + } + } + t.previousValues[key] = value } } - file.Close() } } diff --git a/collectors/numastatsMetric.md b/collectors/numastatsMetric.md index cb9ab2f..b7e038d 100644 --- a/collectors/numastatsMetric.md +++ b/collectors/numastatsMetric.md @@ -2,7 +2,10 @@ ## `numastat` collector ```json - "numastats": {} + "numastats": { + "send_abs_values" : true, + "send_derived_values" : true +} ``` The `numastat` collector reads data from `/sys/devices/system/node/node*/numastat` and outputs a handful **memoryDomain** metrics. See: @@ -15,3 +18,9 @@ Metrics: * `numastats_local_node`: A process ran on this node's CPU, and got memory from this node. * `numastats_other_node`: A process ran on a different node's CPU, and got memory from this node. * `numastats_interleave_hit`: Interleaving wanted to allocate from this node and succeeded. +* `numastats_numa_hit_rate` (if `send_derived_values == true`): Derived rate value per second. +* `numastats_numa_miss_rate` (if `send_derived_values == true`): Derived rate value per second. +* `numastats_numa_foreign_rate` (if `send_derived_values == true`): Derived rate value per second. +* `numastats_local_node_rate` (if `send_derived_values == true`): Derived rate value per second. +* `numastats_other_node_rate` (if `send_derived_values == true`): Derived rate value per second. +* `numastats_interleave_hit_rate` (if `send_derived_values == true`): Derived rate value per second.