mirror of
https://github.com/ClusterCockpit/cc-metric-collector.git
synced 2025-07-29 16:16:06 +02:00
Merge latest development changes to main branch (#79)
* Cleanup: Remove unused code * Use Golang duration parser for 'interval' and 'duration' in main config * Update handling of LIKWID headers. Download only if not already present in the system. Fixes #73 * Units with cc-units (#64) * Add option to normalize units with cc-unit * Add unit conversion to router * Add option to change unit prefix in the router * Add to MetricRouter README * Add order of operations in router to README * Use second add_tags/del_tags only if metric gets renamed * Skip disks in DiskstatCollector that have size=0 * Check readability of sensor files in TempCollector * Fix for --once option * Rename `cpu` type to `hwthread` (#69) * Rename 'cpu' type to 'hwthread' to avoid naming clashes with MetricStore and CC-Webfrontend * Collectors in parallel (#74) * Provide info to CollectorManager whether the collector can be executed in parallel with others * Split serial and parallel collectors. Read in parallel first * Update NvidiaCollector with new metrics, MIG and NvLink support (#75) * CC topology module update (#76) * Rename CPU to hardware thread, write some comments * Do renaming in other parts * Remove CpuList and SocketList function from metricCollector. 
Available in ccTopology * Option to use MIG UUID as subtype-id in NvidiaCollector * Option to use MIG slice name as subtype-id in NvidiaCollector * MetricRouter: Fix JSON in README * Fix for Github Action to really use the selected version * Remove Ganglia installation in runonce Action and add Go 1.18 * Fix daemon options in init script * Add separate go.mod files to use it with deprecated 1.16 * Minor updates for Makefiles * fix string comparison * AMD ROCm SMI collector (#77) * Add collector for AMD ROCm SMI metrics * Fix import path * Fix imports * Remove Board Number * store GPU index explicitly * Remove board number from description * Use http instead of ftp to download likwid * Fix serial number in rocmCollector * Improved http sink (#78) * automatic flush in NatsSink * tweak default options of HttpSink * shorter cirt. section and retries for HttpSink * fix error handling * Remove file added by mistake. * Use http instead of ftp to download likwid * Fix serial number in rocmCollector Co-authored-by: Thomas Roehl <thomas.roehl@fau.de> Co-authored-by: Holger Obermaier <40787752+ho-ob@users.noreply.github.com> Co-authored-by: Lou <lou.knauer@gmx.de>
This commit is contained in:
@@ -29,6 +29,7 @@ func intArrayContains(array []int, str int) (int, bool) {
|
||||
return -1, false
|
||||
}
|
||||
|
||||
// Used internally for sysfs file reads
|
||||
func fileToInt(path string) int {
|
||||
buffer, err := ioutil.ReadFile(path)
|
||||
if err != nil {
|
||||
@@ -47,6 +48,7 @@ func fileToInt(path string) int {
|
||||
return int(id)
|
||||
}
|
||||
|
||||
// Get list of CPU socket IDs
|
||||
func SocketList() []int {
|
||||
buffer, err := ioutil.ReadFile(string(PROCFS_CPUINFO))
|
||||
if err != nil {
|
||||
@@ -54,7 +56,7 @@ func SocketList() []int {
|
||||
return nil
|
||||
}
|
||||
ll := strings.Split(string(buffer), "\n")
|
||||
var packs []int
|
||||
packs := make([]int, 0)
|
||||
for _, line := range ll {
|
||||
if strings.HasPrefix(line, "physical id") {
|
||||
lv := strings.Fields(line)
|
||||
@@ -72,7 +74,8 @@ func SocketList() []int {
|
||||
return packs
|
||||
}
|
||||
|
||||
func CpuList() []int {
|
||||
// Get list of hardware thread IDs in the order of listing in /proc/cpuinfo
|
||||
func HwthreadList() []int {
|
||||
buffer, err := ioutil.ReadFile(string(PROCFS_CPUINFO))
|
||||
if err != nil {
|
||||
log.Print(err)
|
||||
@@ -97,6 +100,13 @@ func CpuList() []int {
|
||||
return cpulist
|
||||
}
|
||||
|
||||
// Get list of hardware thread IDs in the order of listing in /proc/cpuinfo
|
||||
// Deprecated! Use HwthreadList()
|
||||
func CpuList() []int {
|
||||
return HwthreadList()
|
||||
}
|
||||
|
||||
// Get list of CPU core IDs in the order of listing in /proc/cpuinfo
|
||||
func CoreList() []int {
|
||||
buffer, err := ioutil.ReadFile(string(PROCFS_CPUINFO))
|
||||
if err != nil {
|
||||
@@ -122,6 +132,7 @@ func CoreList() []int {
|
||||
return corelist
|
||||
}
|
||||
|
||||
// Get list of NUMA node IDs
|
||||
func NumaNodeList() []int {
|
||||
numaList := make([]int, 0)
|
||||
globPath := filepath.Join(string(SYSFS_NUMABASE), "node*")
|
||||
@@ -156,8 +167,9 @@ func NumaNodeList() []int {
|
||||
return numaList
|
||||
}
|
||||
|
||||
// Get list of CPU die IDs
|
||||
func DieList() []int {
|
||||
cpulist := CpuList()
|
||||
cpulist := HwthreadList()
|
||||
dielist := make([]int, 0)
|
||||
for _, c := range cpulist {
|
||||
diepath := filepath.Join(string(SYSFS_CPUBASE), fmt.Sprintf("cpu%d", c), "topology/die_id")
|
||||
@@ -175,7 +187,27 @@ func DieList() []int {
|
||||
return SocketList()
|
||||
}
|
||||
|
||||
type CpuEntry struct {
|
||||
// Get list of specified type using the naming format inside ClusterCockpit
|
||||
func GetTypeList(topology_type string) []int {
|
||||
switch topology_type {
|
||||
case "node":
|
||||
return []int{0}
|
||||
case "socket":
|
||||
return SocketList()
|
||||
case "die":
|
||||
return DieList()
|
||||
case "memoryDomain":
|
||||
return NumaNodeList()
|
||||
case "core":
|
||||
return CoreList()
|
||||
case "hwthread":
|
||||
return HwthreadList()
|
||||
}
|
||||
return []int{}
|
||||
}
|
||||
|
||||
// Structure holding all information about a hardware thread
|
||||
type HwthreadEntry struct {
|
||||
Cpuid int
|
||||
SMT int
|
||||
Core int
|
||||
@@ -184,25 +216,25 @@ type CpuEntry struct {
|
||||
Die int
|
||||
}
|
||||
|
||||
func CpuData() []CpuEntry {
|
||||
func CpuData() []HwthreadEntry {
|
||||
|
||||
fileToInt := func(path string) int {
|
||||
buffer, err := ioutil.ReadFile(path)
|
||||
if err != nil {
|
||||
log.Print(err)
|
||||
//cclogger.ComponentError("ccTopology", "Reading", path, ":", err.Error())
|
||||
return -1
|
||||
}
|
||||
sbuffer := strings.Replace(string(buffer), "\n", "", -1)
|
||||
var id int64
|
||||
//_, err = fmt.Scanf("%d", sbuffer, &id)
|
||||
id, err = strconv.ParseInt(sbuffer, 10, 32)
|
||||
if err != nil {
|
||||
cclogger.ComponentError("ccTopology", "Parsing", path, ":", sbuffer, err.Error())
|
||||
return -1
|
||||
}
|
||||
return int(id)
|
||||
}
|
||||
// fileToInt := func(path string) int {
|
||||
// buffer, err := ioutil.ReadFile(path)
|
||||
// if err != nil {
|
||||
// log.Print(err)
|
||||
// //cclogger.ComponentError("ccTopology", "Reading", path, ":", err.Error())
|
||||
// return -1
|
||||
// }
|
||||
// sbuffer := strings.Replace(string(buffer), "\n", "", -1)
|
||||
// var id int64
|
||||
// //_, err = fmt.Scanf("%d", sbuffer, &id)
|
||||
// id, err = strconv.ParseInt(sbuffer, 10, 32)
|
||||
// if err != nil {
|
||||
// cclogger.ComponentError("ccTopology", "Parsing", path, ":", sbuffer, err.Error())
|
||||
// return -1
|
||||
// }
|
||||
// return int(id)
|
||||
// }
|
||||
getCore := func(basepath string) int {
|
||||
return fileToInt(fmt.Sprintf("%s/core_id", basepath))
|
||||
}
|
||||
@@ -260,9 +292,9 @@ func CpuData() []CpuEntry {
|
||||
return 0
|
||||
}
|
||||
|
||||
clist := make([]CpuEntry, 0)
|
||||
for _, c := range CpuList() {
|
||||
clist = append(clist, CpuEntry{Cpuid: c})
|
||||
clist := make([]HwthreadEntry, 0)
|
||||
for _, c := range HwthreadList() {
|
||||
clist = append(clist, HwthreadEntry{Cpuid: c})
|
||||
}
|
||||
for i, centry := range clist {
|
||||
centry.Socket = -1
|
||||
@@ -298,6 +330,7 @@ func CpuData() []CpuEntry {
|
||||
return clist
|
||||
}
|
||||
|
||||
// Structure holding basic information about a CPU
|
||||
type CpuInformation struct {
|
||||
NumHWthreads int
|
||||
SMTWidth int
|
||||
@@ -307,6 +340,7 @@ type CpuInformation struct {
|
||||
NumNumaDomains int
|
||||
}
|
||||
|
||||
// Get basic information about the CPU
|
||||
func CpuInfo() CpuInformation {
|
||||
var c CpuInformation
|
||||
|
||||
@@ -342,7 +376,8 @@ func CpuInfo() CpuInformation {
|
||||
return c
|
||||
}
|
||||
|
||||
func GetCpuSocket(cpuid int) int {
|
||||
// Get the CPU socket ID for a given hardware thread ID
|
||||
func GetHwthreadSocket(cpuid int) int {
|
||||
cdata := CpuData()
|
||||
for _, d := range cdata {
|
||||
if d.Cpuid == cpuid {
|
||||
@@ -352,7 +387,8 @@ func GetCpuSocket(cpuid int) int {
|
||||
return -1
|
||||
}
|
||||
|
||||
func GetCpuNumaDomain(cpuid int) int {
|
||||
// Get the NUMA node ID for a given hardware thread ID
|
||||
func GetHwthreadNumaDomain(cpuid int) int {
|
||||
cdata := CpuData()
|
||||
for _, d := range cdata {
|
||||
if d.Cpuid == cpuid {
|
||||
@@ -362,7 +398,8 @@ func GetCpuNumaDomain(cpuid int) int {
|
||||
return -1
|
||||
}
|
||||
|
||||
func GetCpuDie(cpuid int) int {
|
||||
// Get the CPU die ID for a given hardware thread ID
|
||||
func GetHwthreadDie(cpuid int) int {
|
||||
cdata := CpuData()
|
||||
for _, d := range cdata {
|
||||
if d.Cpuid == cpuid {
|
||||
@@ -372,7 +409,8 @@ func GetCpuDie(cpuid int) int {
|
||||
return -1
|
||||
}
|
||||
|
||||
func GetCpuCore(cpuid int) int {
|
||||
// Get the CPU core ID for a given hardware thread ID
|
||||
func GetHwthreadCore(cpuid int) int {
|
||||
cdata := CpuData()
|
||||
for _, d := range cdata {
|
||||
if d.Cpuid == cpuid {
|
||||
@@ -382,7 +420,8 @@ func GetCpuCore(cpuid int) int {
|
||||
return -1
|
||||
}
|
||||
|
||||
func GetSocketCpus(socket int) []int {
|
||||
// Get all hardware thread IDs associated with a CPU socket
|
||||
func GetSocketHwthreads(socket int) []int {
|
||||
all := CpuData()
|
||||
cpulist := make([]int, 0)
|
||||
for _, d := range all {
|
||||
@@ -393,7 +432,8 @@ func GetSocketCpus(socket int) []int {
|
||||
return cpulist
|
||||
}
|
||||
|
||||
func GetNumaDomainCpus(domain int) []int {
|
||||
// Get all hardware thread IDs associated with a NUMA node
|
||||
func GetNumaDomainHwthreads(domain int) []int {
|
||||
all := CpuData()
|
||||
cpulist := make([]int, 0)
|
||||
for _, d := range all {
|
||||
@@ -404,7 +444,8 @@ func GetNumaDomainCpus(domain int) []int {
|
||||
return cpulist
|
||||
}
|
||||
|
||||
func GetDieCpus(die int) []int {
|
||||
// Get all hardware thread IDs associated with a CPU die
|
||||
func GetDieHwthreads(die int) []int {
|
||||
all := CpuData()
|
||||
cpulist := make([]int, 0)
|
||||
for _, d := range all {
|
||||
@@ -415,7 +456,8 @@ func GetDieCpus(die int) []int {
|
||||
return cpulist
|
||||
}
|
||||
|
||||
func GetCoreCpus(core int) []int {
|
||||
// Get all hardware thread IDs associated with a CPU core
|
||||
func GetCoreHwthreads(core int) []int {
|
||||
all := CpuData()
|
||||
cpulist := make([]int, 0)
|
||||
for _, d := range all {
|
||||
|
@@ -246,7 +246,7 @@ func matchfunc(args ...interface{}) (interface{}, error) {
|
||||
func getCpuCoreFunc(args ...interface{}) (interface{}, error) {
|
||||
switch cpuid := args[0].(type) {
|
||||
case int:
|
||||
return topo.GetCpuCore(cpuid), nil
|
||||
return topo.GetHwthreadCore(cpuid), nil
|
||||
}
|
||||
return -1, errors.New("function 'getCpuCore' accepts only an 'int' cpuid")
|
||||
}
|
||||
@@ -255,7 +255,7 @@ func getCpuCoreFunc(args ...interface{}) (interface{}, error) {
|
||||
func getCpuSocketFunc(args ...interface{}) (interface{}, error) {
|
||||
switch cpuid := args[0].(type) {
|
||||
case int:
|
||||
return topo.GetCpuSocket(cpuid), nil
|
||||
return topo.GetHwthreadSocket(cpuid), nil
|
||||
}
|
||||
return -1, errors.New("function 'getCpuCore' accepts only an 'int' cpuid")
|
||||
}
|
||||
@@ -264,7 +264,7 @@ func getCpuSocketFunc(args ...interface{}) (interface{}, error) {
|
||||
func getCpuNumaDomainFunc(args ...interface{}) (interface{}, error) {
|
||||
switch cpuid := args[0].(type) {
|
||||
case int:
|
||||
return topo.GetCpuNumaDomain(cpuid), nil
|
||||
return topo.GetHwthreadNumaDomain(cpuid), nil
|
||||
}
|
||||
return -1, errors.New("function 'getCpuNuma' accepts only an 'int' cpuid")
|
||||
}
|
||||
@@ -273,7 +273,7 @@ func getCpuNumaDomainFunc(args ...interface{}) (interface{}, error) {
|
||||
func getCpuDieFunc(args ...interface{}) (interface{}, error) {
|
||||
switch cpuid := args[0].(type) {
|
||||
case int:
|
||||
return topo.GetCpuDie(cpuid), nil
|
||||
return topo.GetHwthreadDie(cpuid), nil
|
||||
}
|
||||
return -1, errors.New("function 'getCpuDie' accepts only an 'int' cpuid")
|
||||
}
|
||||
@@ -336,7 +336,7 @@ func getCpuListOfDieFunc(args ...interface{}) (interface{}, error) {
|
||||
|
||||
// wrapper function to get a list of all cpuids of the node
|
||||
func getCpuListOfNode(args ...interface{}) (interface{}, error) {
|
||||
return topo.CpuList(), nil
|
||||
return topo.HwthreadList(), nil
|
||||
}
|
||||
|
||||
// helper function to get the cpuid list for a CCMetric type tag set (type and type-id)
|
||||
@@ -348,14 +348,14 @@ func getCpuListOfType(args ...interface{}) (interface{}, error) {
|
||||
case string:
|
||||
switch typ {
|
||||
case "node":
|
||||
return topo.CpuList(), nil
|
||||
return topo.HwthreadList(), nil
|
||||
case "socket":
|
||||
return getCpuListOfSocketFunc(args[1])
|
||||
case "numadomain":
|
||||
return getCpuListOfNumaDomainFunc(args[1])
|
||||
case "core":
|
||||
return getCpuListOfCoreFunc(args[1])
|
||||
case "cpu":
|
||||
case "hwthread":
|
||||
var cpu int
|
||||
|
||||
switch id := args[1].(type) {
|
||||
|
@@ -52,6 +52,11 @@ The CCMetric router sits in between the collectors and the sinks and can be used
|
||||
],
|
||||
"rename_metrics" : {
|
||||
"metric_12345" : "mymetric"
|
||||
},
|
||||
"normalize_units" : true,
|
||||
"change_unit_prefix" : {
|
||||
"mem_used" : "G",
|
||||
"mem_total" : "G"
|
||||
}
|
||||
}
|
||||
```
|
||||
@@ -192,6 +197,14 @@ This option takes a list of evaluable conditions and performs them one after the
|
||||
```
|
||||
The first line is comparable with the example in `drop_metrics`, it drops all metrics starting with `drop_metric_` and ending with a number. The second line drops all metrics of the first hardware thread (**not** recommended)
|
||||
|
||||
# Manipulating the metric units
|
||||
|
||||
## The `normalize_units` option
|
||||
The cc-metric-collector tries to read the data from the system as it is reported. If available, it tries to read the metric unit from the system as well (e.g. from `/proc/meminfo`). The problem is that, depending on the source, the metric units are named differently. Just think about `byte`, `Byte`, `B`, `bytes`, ...
|
||||
The [cc-units](https://github.com/ClusterCockpit/cc-units) package provides a normalization option to use the same metric unit name for all metrics. If this option is set to true, all `unit` meta tags are normalized.
|
||||
|
||||
## The `change_unit_prefix` section
|
||||
It is often the case that metrics are reported by the system using a rather outdated unit prefix (e.g. `/proc/meminfo` still uses kByte even though current memory sizes are in the GByte range). If you want to change the prefix of a unit, you can do that with the help of [cc-units](https://github.com/ClusterCockpit/cc-units). The setting works on the metric name and requires the new prefix for the metric. The cc-units package determines the scaling factor.
|
||||
|
||||
# Aggregate metric values of the current interval with the `interval_aggregates` option
|
||||
|
||||
@@ -239,3 +252,22 @@ Use cases for `interval_aggregates`:
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
# Order of operations
|
||||
|
||||
The router performs the above mentioned options in a specific order. In order to get the logic you want for a specific metric, it is crucial to know the processing order:
|
||||
|
||||
- Add the `hostname` tag (c)
|
||||
- Manipulate the timestamp to the interval timestamp (c,r)
|
||||
- Drop metrics based on `drop_metrics` and `drop_metrics_if` (c,r)
|
||||
- Add tags based on `add_tags` (c,r)
|
||||
- Delete tags based on `del_tags` (c,r)
|
||||
- Rename metric based on `rename_metrics` (c,r)
|
||||
- Add tags based on `add_tags` to still work if the configuration uses the new name (c,r)
|
||||
- Delete tags based on `del_tags` to still work if the configuration uses the new name (c,r)
|
||||
- Normalize units when `normalize_units` is set (c,r)
|
||||
- Convert unit prefix based on `change_unit_prefix` (c,r)
|
||||
|
||||
Legend:
|
||||
- 'c' if metric is coming from a collector
|
||||
- 'r' if metric is coming from a receiver
|
||||
|
@@ -12,6 +12,7 @@ import (
|
||||
lp "github.com/ClusterCockpit/cc-metric-collector/internal/ccMetric"
|
||||
agg "github.com/ClusterCockpit/cc-metric-collector/internal/metricAggregator"
|
||||
mct "github.com/ClusterCockpit/cc-metric-collector/internal/multiChanTicker"
|
||||
units "github.com/ClusterCockpit/cc-units"
|
||||
)
|
||||
|
||||
const ROUTER_MAX_FORWARD = 50
|
||||
@@ -35,6 +36,8 @@ type metricRouterConfig struct {
|
||||
IntervalStamp bool `json:"interval_timestamp"` // Update timestamp periodically by ticker each interval?
|
||||
NumCacheIntervals int `json:"num_cache_intervals"` // Number of intervals of cached metrics for evaluation
|
||||
MaxForward int `json:"max_forward"` // Number of maximal forwarded metrics at one select
|
||||
NormalizeUnits bool `json:"normalize_units"` // Check unit meta flag and normalize it using cc-units
|
||||
ChangeUnitPrefix map[string]string `json:"change_unit_prefix"` // Add prefix that should be applied to the metrics
|
||||
dropMetrics map[string]bool // Internal map for O(1) lookup
|
||||
}
|
||||
|
||||
@@ -207,6 +210,38 @@ func (r *metricRouter) dropMetric(point lp.CCMetric) bool {
|
||||
return false
|
||||
}
|
||||
|
||||
func (r *metricRouter) prepareUnit(point lp.CCMetric) bool {
|
||||
if r.config.NormalizeUnits {
|
||||
if in_unit, ok := point.GetMeta("unit"); ok {
|
||||
u := units.NewUnit(in_unit)
|
||||
if u.Valid() {
|
||||
point.AddMeta("unit", u.Short())
|
||||
}
|
||||
}
|
||||
}
|
||||
if newP, ok := r.config.ChangeUnitPrefix[point.Name()]; ok {
|
||||
|
||||
newPrefix := units.NewPrefix(newP)
|
||||
|
||||
if in_unit, ok := point.GetMeta("unit"); ok && newPrefix != units.InvalidPrefix {
|
||||
u := units.NewUnit(in_unit)
|
||||
if u.Valid() {
|
||||
cclog.ComponentDebug("MetricRouter", "Change prefix to", newP, "for metric", point.Name())
|
||||
conv, out_unit := units.GetUnitPrefixFactor(u, newPrefix)
|
||||
if conv != nil && out_unit.Valid() {
|
||||
if val, ok := point.GetField("value"); ok {
|
||||
point.AddField("value", conv(val))
|
||||
point.AddMeta("unit", out_unit.Short())
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
}
|
||||
|
||||
return true
|
||||
}
|
||||
|
||||
// Start starts the metric router
|
||||
func (r *metricRouter) Start() {
|
||||
// start timer if configured
|
||||
@@ -232,9 +267,11 @@ func (r *metricRouter) Start() {
|
||||
if new, ok := r.config.RenameMetrics[name]; ok {
|
||||
point.SetName(new)
|
||||
point.AddMeta("oldname", name)
|
||||
r.DoAddTags(point)
|
||||
r.DoDelTags(point)
|
||||
}
|
||||
r.DoAddTags(point)
|
||||
r.DoDelTags(point)
|
||||
|
||||
r.prepareUnit(point)
|
||||
|
||||
for _, o := range r.outputs {
|
||||
o <- point
|
||||
|
Reference in New Issue
Block a user