From c182d295f4cc2d74cf85f2fa33d325139b7b705e Mon Sep 17 00:00:00 2001 From: Thomas Gruber Date: Tue, 15 Mar 2022 16:38:20 +0100 Subject: [PATCH 1/5] Fix staticcheck warnings (#66) --- collectors/cpufreqCpuinfoMetric.go | 6 +++--- collectors/cpufreqMetric.go | 16 ++++++++-------- collectors/cpustatMetric.go | 9 ++++----- collectors/customCmdMetric.go | 2 +- collectors/ipmiMetric.go | 2 +- collectors/metricCollector.go | 2 +- collectors/tempMetric.go | 6 +++--- collectors/topprocsMetric.go | 4 ++-- 8 files changed, 23 insertions(+), 24 deletions(-) diff --git a/collectors/cpufreqCpuinfoMetric.go b/collectors/cpufreqCpuinfoMetric.go index 44a3b0c..6c3de7a 100644 --- a/collectors/cpufreqCpuinfoMetric.go +++ b/collectors/cpufreqCpuinfoMetric.go @@ -57,7 +57,7 @@ func (m *CPUFreqCpuInfoCollector) Init(config json.RawMessage) error { const cpuInfoFile = "/proc/cpuinfo" file, err := os.Open(cpuInfoFile) if err != nil { - return fmt.Errorf("Failed to open file '%s': %v", cpuInfoFile, err) + return fmt.Errorf("failed to open file '%s': %v", cpuInfoFile, err) } defer file.Close() @@ -106,14 +106,14 @@ func (m *CPUFreqCpuInfoCollector) Init(config json.RawMessage) error { topology.coreID = coreID topology.coreID_int, err = strconv.ParseInt(coreID, 10, 64) if err != nil { - return fmt.Errorf("Unable to convert coreID '%s' to int64: %v", coreID, err) + return fmt.Errorf("unable to convert coreID '%s' to int64: %v", coreID, err) } // Physical package ID topology.physicalPackageID = physicalPackageID topology.physicalPackageID_int, err = strconv.ParseInt(physicalPackageID, 10, 64) if err != nil { - return fmt.Errorf("Unable to convert physicalPackageID '%s' to int64: %v", physicalPackageID, err) + return fmt.Errorf("unable to convert physicalPackageID '%s' to int64: %v", physicalPackageID, err) } // increase maximun socket / package ID, when required diff --git a/collectors/cpufreqMetric.go b/collectors/cpufreqMetric.go index 5146baa..0bf6d4c 100644 --- a/collectors/cpufreqMetric.go +++ b/collectors/cpufreqMetric.go @@ -70,10 +70,10 @@ func (m *CPUFreqCollector) Init(config json.RawMessage) error { globPattern := filepath.Join(baseDir, "cpu[0-9]*") cpuDirs, err := filepath.Glob(globPattern) if err != nil { - return fmt.Errorf("Unable to glob files with pattern '%s': %v", globPattern, err) + return fmt.Errorf("unable to glob files with pattern '%s': %v", globPattern, err) } if cpuDirs == nil { - return fmt.Errorf("Unable to find any files with pattern '%s'", globPattern) + return fmt.Errorf("unable to find any files with pattern '%s'", globPattern) } // Initialize CPU topology @@ -82,38 +82,38 @@ func (m *CPUFreqCollector) Init(config json.RawMessage) error { processor := strings.TrimPrefix(cpuDir, "/sys/devices/system/cpu/cpu") processor_int, err := strconv.ParseInt(processor, 10, 64) if err != nil { - return fmt.Errorf("Unable to convert cpuID '%s' to int64: %v", processor, err) + return fmt.Errorf("unable to convert cpuID '%s' to int64: %v", processor, err) } // Read package ID physicalPackageIDFile := filepath.Join(cpuDir, "topology", "physical_package_id") line, err := ioutil.ReadFile(physicalPackageIDFile) if err != nil { - return fmt.Errorf("Unable to read physical package ID from file '%s': %v", physicalPackageIDFile, err) + return fmt.Errorf("unable to read physical package ID from file '%s': %v", physicalPackageIDFile, err) } physicalPackageID := strings.TrimSpace(string(line)) physicalPackageID_int, err := strconv.ParseInt(physicalPackageID, 10, 64) if err != nil { - return fmt.Errorf("Unable to convert packageID '%s' to int64: %v", physicalPackageID, err) + return fmt.Errorf("unable to convert packageID '%s' to int64: %v", physicalPackageID, err) } // Read core ID coreIDFile := filepath.Join(cpuDir, "topology", "core_id") line, err = ioutil.ReadFile(coreIDFile) if err != nil { - return fmt.Errorf("Unable to read core ID from file '%s': %v", coreIDFile, err) + return fmt.Errorf("unable to read core ID from file '%s': %v", coreIDFile, err) } coreID := strings.TrimSpace(string(line)) coreID_int, err := strconv.ParseInt(coreID, 10, 64) if err != nil { - return fmt.Errorf("Unable to convert coreID '%s' to int64: %v", coreID, err) + return fmt.Errorf("unable to convert coreID '%s' to int64: %v", coreID, err) } // Check access to current frequency file scalingCurFreqFile := filepath.Join(cpuDir, "cpufreq", "scaling_cur_freq") err = unix.Access(scalingCurFreqFile, unix.R_OK) if err != nil { - return fmt.Errorf("Unable to access file '%s': %v", scalingCurFreqFile, err) + return fmt.Errorf("unable to access file '%s': %v", scalingCurFreqFile, err) } t := &m.topology[processor_int] diff --git a/collectors/cpustatMetric.go b/collectors/cpustatMetric.go index 28ae002..556aad4 100644 --- a/collectors/cpustatMetric.go +++ b/collectors/cpustatMetric.go @@ -21,11 +21,10 @@ type CpustatCollectorConfig struct { type CpustatCollector struct { metricCollector - config CpustatCollectorConfig - matches map[string]int - cputags map[string]map[string]string - nodetags map[string]string - num_cpus_metric lp.CCMetric + config CpustatCollectorConfig + matches map[string]int + cputags map[string]map[string]string + nodetags map[string]string } func (m *CpustatCollector) Init(config json.RawMessage) error { diff --git a/collectors/customCmdMetric.go b/collectors/customCmdMetric.go index e978c49..ec2109b 100644 --- a/collectors/customCmdMetric.go +++ b/collectors/customCmdMetric.go @@ -61,7 +61,7 @@ func (m *CustomCmdCollector) Init(config json.RawMessage) error { } } if len(m.files) == 0 && len(m.commands) == 0 { - return errors.New("No metrics to collect") + return errors.New("no metrics to collect") } m.handler = influx.NewMetricHandler() m.parser = influx.NewParser(m.handler) diff --git a/collectors/ipmiMetric.go b/collectors/ipmiMetric.go index e59f407..16b08ef 100644 --- a/collectors/ipmiMetric.go +++ b/collectors/ipmiMetric.go @@ -54,7 +54,7 @@ func (m *IpmiCollector) Init(config json.RawMessage) error { m.ipmisensors = p } if len(m.ipmitool) == 0 && len(m.ipmisensors) == 0 { - return errors.New("No IPMI reader found") + return errors.New("no IPMI reader found") } m.init = true return nil diff --git a/collectors/metricCollector.go b/collectors/metricCollector.go index 747772f..7c04e90 100644 --- a/collectors/metricCollector.go +++ b/collectors/metricCollector.go @@ -125,5 +125,5 @@ func RemoveFromStringList(s []string, r string) ([]string, error) { return append(s[:i], s[i+1:]...), nil } } - return s, fmt.Errorf("No such string in list") + return s, fmt.Errorf("no such string in list") } diff --git a/collectors/tempMetric.go b/collectors/tempMetric.go index bbc5100..7ba8eb1 100644 --- a/collectors/tempMetric.go +++ b/collectors/tempMetric.go @@ -70,10 +70,10 @@ func (m *TempCollector) Init(config json.RawMessage) error { globPattern := filepath.Join("/sys/class/hwmon", "*", "temp*_input") inputFiles, err := filepath.Glob(globPattern) if err != nil { - return fmt.Errorf("Unable to glob files with pattern '%s': %v", globPattern, err) + return fmt.Errorf("unable to glob files with pattern '%s': %v", globPattern, err) } if inputFiles == nil { - return fmt.Errorf("Unable to find any files with pattern '%s'", globPattern) + return fmt.Errorf("unable to find any files with pattern '%s'", globPattern) } // Get sensor name for each temperature sensor file @@ -158,7 +158,7 @@ func (m *TempCollector) Init(config json.RawMessage) error { // Empty sensors map if len(m.sensors) == 0 { - return fmt.Errorf("No temperature sensors found") + return fmt.Errorf("no temperature sensors found") } // Finished initialization diff --git a/collectors/topprocsMetric.go b/collectors/topprocsMetric.go index dd6bff3..408c3cc 100644 --- a/collectors/topprocsMetric.go +++ b/collectors/topprocsMetric.go @@ -39,14 +39,14 @@ func (m *TopProcsCollector) Init(config json.RawMessage) error { m.config.Num_procs = int(DEFAULT_NUM_PROCS) } if m.config.Num_procs <= 0 || m.config.Num_procs > MAX_NUM_PROCS { - return errors.New(fmt.Sprintf("num_procs option must be set in 'topprocs' config (range: 1-%d)", MAX_NUM_PROCS)) + return fmt.Errorf("num_procs option must be set in 'topprocs' config (range: 1-%d)", MAX_NUM_PROCS) } m.setup() command := exec.Command("ps", "-Ao", "comm", "--sort=-pcpu") command.Wait() _, err = command.Output() if err != nil { - return errors.New("Failed to execute command") + return errors.New("failed to execute command") } m.init = true return nil From b66fdd1436331741bac9a4e32b90cdf03879f85b Mon Sep 17 00:00:00 2001 From: Thomas Roehl Date: Wed, 16 Mar 2022 19:04:39 +0100 Subject: [PATCH 2/5] Add missing socket->thread_id map for LikwidCollector --- collectors/likwidMetric.go | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/collectors/likwidMetric.go b/collectors/likwidMetric.go index e0b0d7e..85bd932 100644 --- a/collectors/likwidMetric.go +++ b/collectors/likwidMetric.go @@ -139,7 +139,16 @@ func (m *LikwidCollector) Init(config json.RawMessage) error { for i, c := range cpulist { m.cpulist[i] = C.int(c) m.cpu2tid[c] = i - + } + m.sock2tid = make(map[int]int) + tmp := make([]C.int, 1) + for _, sid := range topo.SocketList() { + cstr := C.CString(fmt.Sprintf("S%d:0", sid)) + ret = C.cpustr_to_cpulist(cstr, &tmp[0], 1) + if ret > 0 { + m.sock2tid[sid] = m.cpu2tid[int(tmp[0])] + } + C.free(unsafe.Pointer(cstr)) } m.results = make(map[int]map[int]map[string]interface{}) m.mresults = make(map[int]map[int]map[string]float64) From 657543dded37d87783db670289a50ed7f81f7254 Mon Sep 17 00:00:00 2001 From: Thomas Roehl Date: Fri, 18 Mar 2022 12:28:52 +0100 Subject: [PATCH 3/5] Ensure max_forward is at least 1 --- internal/metricRouter/metricRouter.go | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/internal/metricRouter/metricRouter.go b/internal/metricRouter/metricRouter.go index 7ad1e7f..f9b3faa 100644 --- a/internal/metricRouter/metricRouter.go +++ b/internal/metricRouter/metricRouter.go @@ -103,7 +103,10 @@ func (r *metricRouter) Init(ticker mct.MultiChanTicker, wg *sync.WaitGroup, rout cclog.ComponentError("MetricRouter", err.Error()) return err } - r.maxForward = r.config.MaxForward + r.maxForward = 1 + if r.config.MaxForward > r.maxForward { + r.maxForward = r.config.MaxForward + } if r.config.NumCacheIntervals > 0 { r.cache, err = NewCache(r.cache_input, r.ticker, &r.cachewg, r.config.NumCacheIntervals) if err != nil { From c5061144809b8c9f9e798b7b7da6ced4a71b17fc Mon Sep 17 00:00:00 2001 From: Thomas Roehl Date: Fri, 18 Mar 2022 12:29:00 +0100 Subject: [PATCH 4/5] Add processing order to MetricRouter README and add missing options --- internal/metricRouter/README.md | 24 ++++++++++++++++++++++++ 1 file changed, 24 insertions(+) diff --git a/internal/metricRouter/README.md b/internal/metricRouter/README.md index 9cd0d6c..fe2d64f 100644 --- a/internal/metricRouter/README.md +++ b/internal/metricRouter/README.md @@ -8,6 +8,8 @@ The CCMetric router sits in between the collectors and the sinks and can be used { "num_cache_intervals" : 1, "interval_timestamp" : true, + "hostname_tag" : "hostname", + "max_forward" : 50, "add_tags" : [ { "key" : "cluster", @@ -55,6 +57,20 @@ The CCMetric router sits in between the collectors and the sinks and can be used ``` There are three main options `add_tags`, `delete_tags` and `interval_timestamp`. `add_tags` and `delete_tags` are lists consisting of dicts with `key`, `value` and `if`. The `value` can be omitted in the `delete_tags` part as it only uses the `key` for removal. The `interval_timestamp` setting means that a unique timestamp is applied to all metrics traversing the router during an interval. + +# Processing order in the router + +- Add the `hostname_tag` tag (if sent by collectors or cache) +- If `interval_timestamp == true`, change time of metrics +- Check if metric should be dropped (`drop_metrics` and `drop_metrics_if`) +- Add tags from `add_tags` +- Delete tags from `del_tags` +- Rename metric based on `rename_metrics` and store old name as `oldname` in meta information +- Add tags from `add_tags` (if you used the new name in the `if` condition) +- Delete tags from `del_tags` (if you used the new name in the `if` condition) +- Send to sinks +- Move to cache (if `num_cache_intervals > 0`) + # The `interval_timestamp` option The collectors' `Read()` functions are not called simultaneously and therefore the metrics gathered in an interval can have different timestamps. If you want to avoid that and have a common timestamp (the beginning of the interval), set this option to `true` and the MetricRouter sets the time. @@ -65,6 +81,14 @@ If the MetricRouter should buffer metrics of intervals in a MetricCache, this op A `num_cache_intervals > 0` is required to use the `interval_aggregates` option. +# The `hostname_tag` option + +By default, the router tags metrics with the hostname for all locally created metrics. The default tag name is `hostname`, but it can be changed if your organization wants anything else + +# The `max_forward` option + +Every time the router receives a metric through any of the channels, it tries to directly read up to `max_forward` metrics from the same channel. This was done as the router thread would go to sleep and wake up with every arriving metric. The default are `50` metrics at once and `max_forward` needs to greater than `1`. + # The `rename_metrics` option In the ClusterCockpit world we specified a set of standard metrics. Since some collectors determine the metric names based on files, execuables and libraries, they might change from system to system (or installation to installtion, OS to OS, ...). In order to get the common names, you can rename incoming metrics before sending them to the sink. If the metric name matches the `oldname`, it is changed to `newname` From 622e94ae0eb4e559b2c3a819e06bf5e17cad2e8e Mon Sep 17 00:00:00 2001 From: Thomas Roehl Date: Tue, 22 Mar 2022 15:58:10 +0100 Subject: [PATCH 5/5] Fix DieList() if system does not support dies. Explicitly set entries in CpuData list --- internal/ccTopology/ccTopology.go | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/internal/ccTopology/ccTopology.go b/internal/ccTopology/ccTopology.go index 958bb45..f68c3f4 100644 --- a/internal/ccTopology/ccTopology.go +++ b/internal/ccTopology/ccTopology.go @@ -169,7 +169,10 @@ func DieList() []int { } } } - return dielist + if len(dielist) > 0 { + return dielist + } + return SocketList() } type CpuEntry struct { @@ -261,7 +264,7 @@ func CpuData() []CpuEntry { for _, c := range CpuList() { clist = append(clist, CpuEntry{Cpuid: c}) } - for _, centry := range clist { + for i, centry := range clist { centry.Socket = -1 centry.Numadomain = -1 centry.Die = -1 @@ -289,6 +292,8 @@ func CpuData() []CpuEntry { // Lookup NUMA domain id centry.Numadomain = getNumaDomain(base) + // Update values in output list + clist[i] = centry } return clist }