mirror of
				https://github.com/ClusterCockpit/cc-metric-collector.git
				synced 2025-10-31 09:05:05 +01:00 
			
		
		
		
	Add derived_values for numastats (#134)
* Check creation of CCMessage in NATS receiver * add derived_values for numastats * change to ccMessage * remove vim command artefact --------- Co-authored-by: Thomas Roehl <thomas.roehl@fau.de> Co-authored-by: exterr2f <Robert.Externbrink@rub.de> Co-authored-by: Thomas Gruber <Thomas.Roehl@googlemail.com>
This commit is contained in:
		| @@ -11,9 +11,14 @@ import ( | ||||
| 	"time" | ||||
|  | ||||
| 	cclog "github.com/ClusterCockpit/cc-metric-collector/pkg/ccLogger" | ||||
| 	lp "github.com/ClusterCockpit/cc-energy-manager/pkg/cc-message" | ||||
| 	lp "github.com/ClusterCockpit/cc-lib/ccMessage" | ||||
| ) | ||||
|  | ||||
| type NUMAStatsCollectorConfig struct { | ||||
| 	SendAbsoluteValues bool `json:"send_abs_values"` | ||||
| 	SendDerivedValues  bool `json:"send_derived_values"` | ||||
| } | ||||
|  | ||||
| // Non-Uniform Memory Access (NUMA) policy hit/miss statistics | ||||
| // | ||||
| // numa_hit: | ||||
| @@ -47,13 +52,16 @@ import ( | ||||
| // | ||||
| // See: https://www.kernel.org/doc/html/latest/admin-guide/numastat.html | ||||
| type NUMAStatsCollectorTopolgy struct { | ||||
| 	file   string | ||||
| 	tagSet map[string]string | ||||
| 	file           string | ||||
| 	tagSet         map[string]string | ||||
| 	previousValues map[string]int64 | ||||
| } | ||||
|  | ||||
| type NUMAStatsCollector struct { | ||||
| 	metricCollector | ||||
| 	topology []NUMAStatsCollectorTopolgy | ||||
| 	topology      []NUMAStatsCollectorTopolgy | ||||
| 	config        NUMAStatsCollectorConfig | ||||
| 	lastTimestamp time.Time | ||||
| } | ||||
|  | ||||
| func (m *NUMAStatsCollector) Init(config json.RawMessage) error { | ||||
| @@ -86,8 +94,9 @@ func (m *NUMAStatsCollector) Init(config json.RawMessage) error { | ||||
| 		file := filepath.Join(dir, "numastat") | ||||
| 		m.topology = append(m.topology, | ||||
| 			NUMAStatsCollectorTopolgy{ | ||||
| 				file:   file, | ||||
| 				tagSet: map[string]string{"memoryDomain": node}, | ||||
| 				file:           file, | ||||
| 				tagSet:         map[string]string{"memoryDomain": node}, | ||||
| 				previousValues: make(map[string]int64), | ||||
| 			}) | ||||
| 	} | ||||
|  | ||||
| @@ -102,23 +111,27 @@ func (m *NUMAStatsCollector) Read(interval time.Duration, output chan lp.CCMessa | ||||
| 		return | ||||
| 	} | ||||
|  | ||||
| 	now := time.Now() | ||||
| 	timeDiff := now.Sub(m.lastTimestamp).Seconds() | ||||
| 	m.lastTimestamp = now | ||||
|  | ||||
| 	for i := range m.topology { | ||||
| 		// Loop for all NUMA domains | ||||
| 		t := &m.topology[i] | ||||
|  | ||||
| 		now := time.Now() | ||||
| 		file, err := os.Open(t.file) | ||||
| 		if err != nil { | ||||
| 			cclog.ComponentError( | ||||
| 				m.name, | ||||
| 				fmt.Sprintf("Read(): Failed to open file '%s': %v", t.file, err)) | ||||
| 			return | ||||
| 			continue | ||||
| 		} | ||||
| 		scanner := bufio.NewScanner(file) | ||||
|  | ||||
| 		// Read line by line | ||||
| 		for scanner.Scan() { | ||||
| 			split := strings.Fields(scanner.Text()) | ||||
| 			line := scanner.Text() | ||||
| 			split := strings.Fields(line) | ||||
| 			if len(split) != 2 { | ||||
| 				continue | ||||
| 			} | ||||
| @@ -130,18 +143,38 @@ func (m *NUMAStatsCollector) Read(interval time.Duration, output chan lp.CCMessa | ||||
| 					fmt.Sprintf("Read(): Failed to convert %s='%s' to int64: %v", key, split[1], err)) | ||||
| 				continue | ||||
| 			} | ||||
| 			y, err := lp.NewMessage( | ||||
| 				"numastats_"+key, | ||||
| 				t.tagSet, | ||||
| 				m.meta, | ||||
| 				map[string]interface{}{"value": value}, | ||||
| 				now, | ||||
| 			) | ||||
| 			if err == nil { | ||||
| 				output <- y | ||||
|  | ||||
| 			if m.config.SendAbsoluteValues { | ||||
| 				msg, err := lp.NewMessage( | ||||
| 					"numastats_"+key, | ||||
| 					t.tagSet, | ||||
| 					m.meta, | ||||
| 					map[string]interface{}{"value": value}, | ||||
| 					now, | ||||
| 				) | ||||
| 				if err == nil { | ||||
| 					output <- msg | ||||
| 				} | ||||
| 			} | ||||
|  | ||||
| 			if m.config.SendDerivedValues { | ||||
| 				prev, ok := t.previousValues[key] | ||||
| 				if ok { | ||||
| 					rate := float64(value-prev) / timeDiff | ||||
| 					msg, err := lp.NewMessage( | ||||
| 						"numastats_"+key+"_rate", | ||||
| 						t.tagSet, | ||||
| 						m.meta, | ||||
| 						map[string]interface{}{"value": rate}, | ||||
| 						now, | ||||
| 					) | ||||
| 					if err == nil { | ||||
| 						output <- msg | ||||
| 					} | ||||
| 				} | ||||
| 				t.previousValues[key] = value | ||||
| 			} | ||||
| 		} | ||||
|  | ||||
| 		file.Close() | ||||
| 	} | ||||
| } | ||||
|   | ||||
| @@ -2,7 +2,10 @@ | ||||
| ## `numastat` collector | ||||
|  | ||||
| ```json | ||||
|   "numastats": {} | ||||
|   "numastats": { | ||||
|     "send_abs_values" : true, | ||||
|     "send_derived_values" : true | ||||
|   } | ||||
| ``` | ||||
|  | ||||
| The `numastat` collector reads data from `/sys/devices/system/node/node*/numastat` and outputs a handful of **memoryDomain** metrics. See: <https://www.kernel.org/doc/html/latest/admin-guide/numastat.html> | ||||
| @@ -15,3 +18,9 @@ Metrics: | ||||
| * `numastats_local_node`: A process ran on this node's CPU, and got memory from this node. | ||||
| * `numastats_other_node`: A process ran on a different node's CPU, and got memory from this node. | ||||
| * `numastats_interleave_hit`: Interleaving wanted to allocate from this node and succeeded. | ||||
| * `numastats_numa_hit_rate` (if `send_derived_values == true`): Change of `numastats_numa_hit` per second since the previous read. | ||||
| * `numastats_numa_miss_rate` (if `send_derived_values == true`): Change of `numastats_numa_miss` per second since the previous read. | ||||
| * `numastats_numa_foreign_rate` (if `send_derived_values == true`): Change of `numastats_numa_foreign` per second since the previous read. | ||||
| * `numastats_local_node_rate` (if `send_derived_values == true`): Change of `numastats_local_node` per second since the previous read. | ||||
| * `numastats_other_node_rate` (if `send_derived_values == true`): Change of `numastats_other_node` per second since the previous read. | ||||
| * `numastats_interleave_hit_rate` (if `send_derived_values == true`): Change of `numastats_interleave_hit` per second since the previous read. | ||||
|   | ||||
		Reference in New Issue
	
	Block a user