Merge latest development changes to main branch (#79)

* Cleanup: Remove unused code

* Use Golang duration parser for 'interval' and 'duration'
 in main config

* Update handling of LIKWID headers. Download only if not already present in the system. Fixes #73

* Units with cc-units (#64)

* Add option to normalize units with cc-unit

* Add unit conversion to router

* Add option to change unit prefix in the router

* Add to MetricRouter README

* Add order of operations in router to README

* Use second add_tags/del_tags only if metric gets renamed

* Skip disks in DiskstatCollector that have size=0

* Check readability of sensor files in TempCollector

* Fix for --once option

* Rename `cpu` type to `hwthread` (#69)

* Rename 'cpu' type to 'hwthread' to avoid naming clashes with MetricStore and CC-Webfrontend

* Collectors in parallel (#74)

* Provide info to CollectorManager whether the collector can be executed in parallel with others

* Split serial and parallel collectors. Read in parallel first

* Update NvidiaCollector with new metrics, MIG and NvLink support (#75)

* CC topology module update (#76)

* Rename CPU to hardware thread, write some comments

* Do renaming in other parts

* Remove CpuList and SocketList function from metricCollector. Available in ccTopology

* Option to use MIG UUID as subtype-id in NvidiaCollector

* Option to use MIG slice name as subtype-id in NvidiaCollector

* MetricRouter: Fix JSON in README

* Fix for Github Action to really use the selected version

* Remove Ganglia installation in runonce Action and add Go 1.18

* Fix daemon options in init script

* Add separate go.mod files to use it with deprecated 1.16

* Minor updates for Makefiles

* fix string comparison

* AMD ROCm SMI collector (#77)

* Add collector for AMD ROCm SMI metrics

* Fix import path

* Fix imports

* Remove Board Number

* store GPU index explicitly

* Remove board number from description

* Use http instead of ftp to download likwid

* Fix serial number in rocmCollector

* Improved http sink (#78)

* automatic flush in NatsSink

* tweak default options of HttpSink

* shorter cirt. section and retries for HttpSink

* fix error handling

* Remove file added by mistake.

* Use http instead of ftp to download likwid

* Fix serial number in rocmCollector

Co-authored-by: Thomas Roehl <thomas.roehl@fau.de>

Co-authored-by: Holger Obermaier <40787752+ho-ob@users.noreply.github.com>
Co-authored-by: Lou <lou.knauer@gmx.de>
This commit is contained in:
Thomas Gruber
2022-06-08 15:25:40 +02:00
committed by GitHub
parent 186a62a86b
commit 8d85bd53f1
51 changed files with 2097 additions and 705 deletions

View File

@@ -29,6 +29,7 @@ func intArrayContains(array []int, str int) (int, bool) {
return -1, false
}
// Used internally for sysfs file reads
func fileToInt(path string) int {
buffer, err := ioutil.ReadFile(path)
if err != nil {
@@ -47,6 +48,7 @@ func fileToInt(path string) int {
return int(id)
}
// Get list of CPU socket IDs
func SocketList() []int {
buffer, err := ioutil.ReadFile(string(PROCFS_CPUINFO))
if err != nil {
@@ -54,7 +56,7 @@ func SocketList() []int {
return nil
}
ll := strings.Split(string(buffer), "\n")
var packs []int
packs := make([]int, 0)
for _, line := range ll {
if strings.HasPrefix(line, "physical id") {
lv := strings.Fields(line)
@@ -72,7 +74,8 @@ func SocketList() []int {
return packs
}
func CpuList() []int {
// Get list of hardware thread IDs in the order of listing in /proc/cpuinfo
func HwthreadList() []int {
buffer, err := ioutil.ReadFile(string(PROCFS_CPUINFO))
if err != nil {
log.Print(err)
@@ -97,6 +100,13 @@ func CpuList() []int {
return cpulist
}
// Get list of hardware thread IDs in the order of listing in /proc/cpuinfo
// Deprecated! Use HwthreadList()
func CpuList() []int {
return HwthreadList()
}
// Get list of CPU core IDs in the order of listing in /proc/cpuinfo
func CoreList() []int {
buffer, err := ioutil.ReadFile(string(PROCFS_CPUINFO))
if err != nil {
@@ -122,6 +132,7 @@ func CoreList() []int {
return corelist
}
// Get list of NUMA node IDs
func NumaNodeList() []int {
numaList := make([]int, 0)
globPath := filepath.Join(string(SYSFS_NUMABASE), "node*")
@@ -156,8 +167,9 @@ func NumaNodeList() []int {
return numaList
}
// Get list of CPU die IDs
func DieList() []int {
cpulist := CpuList()
cpulist := HwthreadList()
dielist := make([]int, 0)
for _, c := range cpulist {
diepath := filepath.Join(string(SYSFS_CPUBASE), fmt.Sprintf("cpu%d", c), "topology/die_id")
@@ -175,7 +187,27 @@ func DieList() []int {
return SocketList()
}
type CpuEntry struct {
// Get list of specified type using the naming format inside ClusterCockpit
func GetTypeList(topology_type string) []int {
switch topology_type {
case "node":
return []int{0}
case "socket":
return SocketList()
case "die":
return DieList()
case "memoryDomain":
return NumaNodeList()
case "core":
return CoreList()
case "hwthread":
return HwthreadList()
}
return []int{}
}
// Structure holding all information about a hardware thread
type HwthreadEntry struct {
Cpuid int
SMT int
Core int
@@ -184,25 +216,25 @@ type CpuEntry struct {
Die int
}
func CpuData() []CpuEntry {
func CpuData() []HwthreadEntry {
fileToInt := func(path string) int {
buffer, err := ioutil.ReadFile(path)
if err != nil {
log.Print(err)
//cclogger.ComponentError("ccTopology", "Reading", path, ":", err.Error())
return -1
}
sbuffer := strings.Replace(string(buffer), "\n", "", -1)
var id int64
//_, err = fmt.Scanf("%d", sbuffer, &id)
id, err = strconv.ParseInt(sbuffer, 10, 32)
if err != nil {
cclogger.ComponentError("ccTopology", "Parsing", path, ":", sbuffer, err.Error())
return -1
}
return int(id)
}
// fileToInt := func(path string) int {
// buffer, err := ioutil.ReadFile(path)
// if err != nil {
// log.Print(err)
// //cclogger.ComponentError("ccTopology", "Reading", path, ":", err.Error())
// return -1
// }
// sbuffer := strings.Replace(string(buffer), "\n", "", -1)
// var id int64
// //_, err = fmt.Scanf("%d", sbuffer, &id)
// id, err = strconv.ParseInt(sbuffer, 10, 32)
// if err != nil {
// cclogger.ComponentError("ccTopology", "Parsing", path, ":", sbuffer, err.Error())
// return -1
// }
// return int(id)
// }
getCore := func(basepath string) int {
return fileToInt(fmt.Sprintf("%s/core_id", basepath))
}
@@ -260,9 +292,9 @@ func CpuData() []CpuEntry {
return 0
}
clist := make([]CpuEntry, 0)
for _, c := range CpuList() {
clist = append(clist, CpuEntry{Cpuid: c})
clist := make([]HwthreadEntry, 0)
for _, c := range HwthreadList() {
clist = append(clist, HwthreadEntry{Cpuid: c})
}
for i, centry := range clist {
centry.Socket = -1
@@ -298,6 +330,7 @@ func CpuData() []CpuEntry {
return clist
}
// Structure holding basic information about a CPU
type CpuInformation struct {
NumHWthreads int
SMTWidth int
@@ -307,6 +340,7 @@ type CpuInformation struct {
NumNumaDomains int
}
// Get basic information about the CPU
func CpuInfo() CpuInformation {
var c CpuInformation
@@ -342,7 +376,8 @@ func CpuInfo() CpuInformation {
return c
}
func GetCpuSocket(cpuid int) int {
// Get the CPU socket ID for a given hardware thread ID
func GetHwthreadSocket(cpuid int) int {
cdata := CpuData()
for _, d := range cdata {
if d.Cpuid == cpuid {
@@ -352,7 +387,8 @@ func GetCpuSocket(cpuid int) int {
return -1
}
func GetCpuNumaDomain(cpuid int) int {
// Get the NUMA node ID for a given hardware thread ID
func GetHwthreadNumaDomain(cpuid int) int {
cdata := CpuData()
for _, d := range cdata {
if d.Cpuid == cpuid {
@@ -362,7 +398,8 @@ func GetCpuNumaDomain(cpuid int) int {
return -1
}
func GetCpuDie(cpuid int) int {
// Get the CPU die ID for a given hardware thread ID
func GetHwthreadDie(cpuid int) int {
cdata := CpuData()
for _, d := range cdata {
if d.Cpuid == cpuid {
@@ -372,7 +409,8 @@ func GetCpuDie(cpuid int) int {
return -1
}
func GetCpuCore(cpuid int) int {
// Get the CPU core ID for a given hardware thread ID
func GetHwthreadCore(cpuid int) int {
cdata := CpuData()
for _, d := range cdata {
if d.Cpuid == cpuid {
@@ -382,7 +420,8 @@ func GetCpuCore(cpuid int) int {
return -1
}
func GetSocketCpus(socket int) []int {
// Get the all hardware thread ID associated with a CPU socket
func GetSocketHwthreads(socket int) []int {
all := CpuData()
cpulist := make([]int, 0)
for _, d := range all {
@@ -393,7 +432,8 @@ func GetSocketCpus(socket int) []int {
return cpulist
}
func GetNumaDomainCpus(domain int) []int {
// Get the all hardware thread ID associated with a NUMA node
func GetNumaDomainHwthreads(domain int) []int {
all := CpuData()
cpulist := make([]int, 0)
for _, d := range all {
@@ -404,7 +444,8 @@ func GetNumaDomainCpus(domain int) []int {
return cpulist
}
func GetDieCpus(die int) []int {
// Get the all hardware thread ID associated with a CPU die
func GetDieHwthreads(die int) []int {
all := CpuData()
cpulist := make([]int, 0)
for _, d := range all {
@@ -415,7 +456,8 @@ func GetDieCpus(die int) []int {
return cpulist
}
func GetCoreCpus(core int) []int {
// Get the all hardware thread ID associated with a CPU core
func GetCoreHwthreads(core int) []int {
all := CpuData()
cpulist := make([]int, 0)
for _, d := range all {

View File

@@ -246,7 +246,7 @@ func matchfunc(args ...interface{}) (interface{}, error) {
func getCpuCoreFunc(args ...interface{}) (interface{}, error) {
switch cpuid := args[0].(type) {
case int:
return topo.GetCpuCore(cpuid), nil
return topo.GetHwthreadCore(cpuid), nil
}
return -1, errors.New("function 'getCpuCore' accepts only an 'int' cpuid")
}
@@ -255,7 +255,7 @@ func getCpuCoreFunc(args ...interface{}) (interface{}, error) {
func getCpuSocketFunc(args ...interface{}) (interface{}, error) {
switch cpuid := args[0].(type) {
case int:
return topo.GetCpuSocket(cpuid), nil
return topo.GetHwthreadSocket(cpuid), nil
}
return -1, errors.New("function 'getCpuCore' accepts only an 'int' cpuid")
}
@@ -264,7 +264,7 @@ func getCpuSocketFunc(args ...interface{}) (interface{}, error) {
func getCpuNumaDomainFunc(args ...interface{}) (interface{}, error) {
switch cpuid := args[0].(type) {
case int:
return topo.GetCpuNumaDomain(cpuid), nil
return topo.GetHwthreadNumaDomain(cpuid), nil
}
return -1, errors.New("function 'getCpuNuma' accepts only an 'int' cpuid")
}
@@ -273,7 +273,7 @@ func getCpuNumaDomainFunc(args ...interface{}) (interface{}, error) {
func getCpuDieFunc(args ...interface{}) (interface{}, error) {
switch cpuid := args[0].(type) {
case int:
return topo.GetCpuDie(cpuid), nil
return topo.GetHwthreadDie(cpuid), nil
}
return -1, errors.New("function 'getCpuDie' accepts only an 'int' cpuid")
}
@@ -336,7 +336,7 @@ func getCpuListOfDieFunc(args ...interface{}) (interface{}, error) {
// wrapper function to get a list of all cpuids of the node
func getCpuListOfNode(args ...interface{}) (interface{}, error) {
return topo.CpuList(), nil
return topo.HwthreadList(), nil
}
// helper function to get the cpuid list for a CCMetric type tag set (type and type-id)
@@ -348,14 +348,14 @@ func getCpuListOfType(args ...interface{}) (interface{}, error) {
case string:
switch typ {
case "node":
return topo.CpuList(), nil
return topo.HwthreadList(), nil
case "socket":
return getCpuListOfSocketFunc(args[1])
case "numadomain":
return getCpuListOfNumaDomainFunc(args[1])
case "core":
return getCpuListOfCoreFunc(args[1])
case "cpu":
case "hwthread":
var cpu int
switch id := args[1].(type) {

View File

@@ -52,6 +52,11 @@ The CCMetric router sits in between the collectors and the sinks and can be used
],
"rename_metrics" : {
"metric_12345" : "mymetric"
},
"normalize_units" : true,
"change_unit_prefix" : {
"mem_used" : "G",
"mem_total" : "G"
}
}
```
@@ -192,6 +197,14 @@ This option takes a list of evaluable conditions and performs them one after the
```
The first line is comparable with the example in `drop_metrics`, it drops all metrics starting with `drop_metric_` and ending with a number. The second line drops all metrics of the first hardware thread (**not** recommended)
# Manipulating the metric units
## The `normalize_units` option
The cc-metric-collector tries to read the data from the system as it is reported. If available, it tries to read the metric unit from the system as well (e.g. from `/proc/meminfo`). The problem is that, depending on the source, the metric units are named differently. Just think about `byte`, `Byte`, `B`, `bytes`, ...
The [cc-units](https://github.com/ClusterCockpit/cc-units) package provides us a normalization option to use the same metric unit name for all metrics. It this option is set to true, all `unit` meta tags are normalized.
## The `change_unit_prefix` section
It is often the case that metrics are reported by the system using a rather outdated unit prefix (like `/proc/meminfo` still uses kByte despite current memory sizes are in the GByte range). If you want to change the prefix of a unit, you can do that with the help of [cc-units](https://github.com/ClusterCockpit/cc-units). The setting works on the metric name and requires the new prefix for the metric. The cc-units package determines the scaling factor.
# Aggregate metric values of the current interval with the `interval_aggregates` option
@@ -239,3 +252,22 @@ Use cases for `interval_aggregates`:
}
}
```
# Order of operations
The router performs the above mentioned options in a specific order. In order to get the logic you want for a specific metric, it is crucial to know the processing order:
- Add the `hostname` tag (c)
- Manipulate the timestamp to the interval timestamp (c,r)
- Drop metrics based on `drop_metrics` and `drop_metrics_if` (c,r)
- Add tags based on `add_tags` (c,r)
- Delete tags based on `del_tags` (c,r)
- Rename metric based on `rename_metric` (c,r)
- Add tags based on `add_tags` to still work if the configuration uses the new name (c,r)
- Delete tags based on `del_tags` to still work if the configuration uses the new name (c,r)
- Normalize units when `normalize_units` is set (c,r)
- Convert unit prefix based on `change_unit_prefix` (c,r)
Legend:
- 'c' if metric is coming from a collector
- 'r' if metric is coming from a receiver

View File

@@ -12,6 +12,7 @@ import (
lp "github.com/ClusterCockpit/cc-metric-collector/internal/ccMetric"
agg "github.com/ClusterCockpit/cc-metric-collector/internal/metricAggregator"
mct "github.com/ClusterCockpit/cc-metric-collector/internal/multiChanTicker"
units "github.com/ClusterCockpit/cc-units"
)
const ROUTER_MAX_FORWARD = 50
@@ -35,6 +36,8 @@ type metricRouterConfig struct {
IntervalStamp bool `json:"interval_timestamp"` // Update timestamp periodically by ticker each interval?
NumCacheIntervals int `json:"num_cache_intervals"` // Number of intervals of cached metrics for evaluation
MaxForward int `json:"max_forward"` // Number of maximal forwarded metrics at one select
NormalizeUnits bool `json:"normalize_units"` // Check unit meta flag and normalize it using cc-units
ChangeUnitPrefix map[string]string `json:"change_unit_prefix"` // Add prefix that should be applied to the metrics
dropMetrics map[string]bool // Internal map for O(1) lookup
}
@@ -207,6 +210,38 @@ func (r *metricRouter) dropMetric(point lp.CCMetric) bool {
return false
}
func (r *metricRouter) prepareUnit(point lp.CCMetric) bool {
if r.config.NormalizeUnits {
if in_unit, ok := point.GetMeta("unit"); ok {
u := units.NewUnit(in_unit)
if u.Valid() {
point.AddMeta("unit", u.Short())
}
}
}
if newP, ok := r.config.ChangeUnitPrefix[point.Name()]; ok {
newPrefix := units.NewPrefix(newP)
if in_unit, ok := point.GetMeta("unit"); ok && newPrefix != units.InvalidPrefix {
u := units.NewUnit(in_unit)
if u.Valid() {
cclog.ComponentDebug("MetricRouter", "Change prefix to", newP, "for metric", point.Name())
conv, out_unit := units.GetUnitPrefixFactor(u, newPrefix)
if conv != nil && out_unit.Valid() {
if val, ok := point.GetField("value"); ok {
point.AddField("value", conv(val))
point.AddMeta("unit", out_unit.Short())
}
}
}
}
}
return true
}
// Start starts the metric router
func (r *metricRouter) Start() {
// start timer if configured
@@ -232,9 +267,11 @@ func (r *metricRouter) Start() {
if new, ok := r.config.RenameMetrics[name]; ok {
point.SetName(new)
point.AddMeta("oldname", name)
r.DoAddTags(point)
r.DoDelTags(point)
}
r.DoAddTags(point)
r.DoDelTags(point)
r.prepareUnit(point)
for _, o := range r.outputs {
o <- point