From 657543dded37d87783db670289a50ed7f81f7254 Mon Sep 17 00:00:00 2001 From: Thomas Roehl Date: Fri, 18 Mar 2022 12:28:52 +0100 Subject: [PATCH 1/6] Ensure max_forward is at least 1 --- internal/metricRouter/metricRouter.go | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/internal/metricRouter/metricRouter.go b/internal/metricRouter/metricRouter.go index 7ad1e7f..f9b3faa 100644 --- a/internal/metricRouter/metricRouter.go +++ b/internal/metricRouter/metricRouter.go @@ -103,7 +103,10 @@ func (r *metricRouter) Init(ticker mct.MultiChanTicker, wg *sync.WaitGroup, rout cclog.ComponentError("MetricRouter", err.Error()) return err } - r.maxForward = r.config.MaxForward + r.maxForward = 1 + if r.config.MaxForward > r.maxForward { + r.maxForward = r.config.MaxForward + } if r.config.NumCacheIntervals > 0 { r.cache, err = NewCache(r.cache_input, r.ticker, &r.cachewg, r.config.NumCacheIntervals) if err != nil { From c5061144809b8c9f9e798b7b7da6ced4a71b17fc Mon Sep 17 00:00:00 2001 From: Thomas Roehl Date: Fri, 18 Mar 2022 12:29:00 +0100 Subject: [PATCH 2/6] Add processing order to MetricRouter README and add missing options --- internal/metricRouter/README.md | 24 ++++++++++++++++++++++++ 1 file changed, 24 insertions(+) diff --git a/internal/metricRouter/README.md b/internal/metricRouter/README.md index 9cd0d6c..fe2d64f 100644 --- a/internal/metricRouter/README.md +++ b/internal/metricRouter/README.md @@ -8,6 +8,8 @@ The CCMetric router sits in between the collectors and the sinks and can be used { "num_cache_intervals" : 1, "interval_timestamp" : true, + "hostname_tag" : "hostname", + "max_forward" : 50, "add_tags" : [ { "key" : "cluster", @@ -55,6 +57,20 @@ The CCMetric router sits in between the collectors and the sinks and can be used ``` There are three main options `add_tags`, `delete_tags` and `interval_timestamp`. `add_tags` and `delete_tags` are lists consisting of dicts with `key`, `value` and `if`. The `value` can be omitted in the `delete_tags` part as it only uses the `key` for removal. The `interval_timestamp` setting means that a unique timestamp is applied to all metrics traversing the router during an interval. + +# Processing order in the router + +- Add the `hostname_tag` tag (if sent by collectors or cache) +- If `interval_timestamp == true`, change time of metrics +- Check if metric should be dropped (`drop_metrics` and `drop_metrics_if`) +- Add tags from `add_tags` +- Delete tags from `del_tags` +- Rename metric based on `rename_metrics` and store old name as `oldname` in meta information +- Add tags from `add_tags` (if you used the new name in the `if` condition) +- Delete tags from `del_tags` (if you used the new name in the `if` condition) +- Send to sinks +- Move to cache (if `num_cache_intervals > 0`) + # The `interval_timestamp` option The collectors' `Read()` functions are not called simultaneously and therefore the metrics gathered in an interval can have different timestamps. If you want to avoid that and have a common timestamp (the beginning of the interval), set this option to `true` and the MetricRouter sets the time. @@ -65,6 +81,14 @@ If the MetricRouter should buffer metrics of intervals in a MetricCache, this op A `num_cache_intervals > 0` is required to use the `interval_aggregates` option. +# The `hostname_tag` option + +By default, the router tags metrics with the hostname for all locally created metrics. The default tag name is `hostname`, but it can be changed if your organization wants anything else + +# The `max_forward` option + +Every time the router receives a metric through any of the channels, it tries to directly read up to `max_forward` metrics from the same channel. This was done as the router thread would go to sleep and wake up with every arriving metric. The default are `50` metrics at once and `max_forward` needs to greater than `1`. + # The `rename_metrics` option In the ClusterCockpit world we specified a set of standard metrics. Since some collectors determine the metric names based on files, execuables and libraries, they might change from system to system (or installation to installtion, OS to OS, ...). In order to get the common names, you can rename incoming metrics before sending them to the sink. If the metric name matches the `oldname`, it is changed to `newname` From 622e94ae0eb4e559b2c3a819e06bf5e17cad2e8e Mon Sep 17 00:00:00 2001 From: Thomas Roehl Date: Tue, 22 Mar 2022 15:58:10 +0100 Subject: [PATCH 3/6] Fix DieList() if system does not support dies. Explicitly set entries in CpuData list --- internal/ccTopology/ccTopology.go | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/internal/ccTopology/ccTopology.go b/internal/ccTopology/ccTopology.go index 958bb45..f68c3f4 100644 --- a/internal/ccTopology/ccTopology.go +++ b/internal/ccTopology/ccTopology.go @@ -169,7 +169,10 @@ func DieList() []int { } } } - return dielist + if len(dielist) > 0 { + return dielist + } + return SocketList() } type CpuEntry struct { @@ -261,7 +264,7 @@ func CpuData() []CpuEntry { for _, c := range CpuList() { clist = append(clist, CpuEntry{Cpuid: c}) } - for _, centry := range clist { + for i, centry := range clist { centry.Socket = -1 centry.Numadomain = -1 centry.Die = -1 @@ -289,6 +292,8 @@ func CpuData() []CpuEntry { // Lookup NUMA domain id centry.Numadomain = getNumaDomain(base) + // Update values in output list + clist[i] = centry } return clist } From 296225f3a8f4c0738d28195f59387c7512ca9b9d Mon Sep 17 00:00:00 2001 From: Thomas Roehl Date: Thu, 24 Mar 2022 13:50:35 +0100 Subject: [PATCH 4/6] Always export all metrics in NfsCollectors --- collectors/nfsMetric.go | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/collectors/nfsMetric.go b/collectors/nfsMetric.go index 07e684d..c511b0d 100644 --- a/collectors/nfsMetric.go +++ b/collectors/nfsMetric.go @@ -36,7 +36,7 @@ type nfsCollector struct { } func (m *nfsCollector) initStats() error { - cmd := exec.Command(m.config.Nfsstats, `-l`) + cmd := exec.Command(m.config.Nfsstats, `-l`, `--all`) cmd.Wait() buffer, err := cmd.Output() if err == nil { @@ -52,7 +52,7 @@ func (m *nfsCollector) initStats() error { if err == nil { x := m.data[name] x.current = value - x.last = 0 + x.last = value m.data[name] = x } } @@ -63,7 +63,7 @@ func (m *nfsCollector) initStats() error { } func (m *nfsCollector) updateStats() error { - cmd := exec.Command(m.config.Nfsstats, `-l`) + cmd := exec.Command(m.config.Nfsstats, `-l`, `--all`) cmd.Wait() buffer, err := cmd.Output() if err == nil { From e0e91844bced050459e310d97e0e74c7bb4bd6e9 Mon Sep 17 00:00:00 2001 From: Thomas Roehl Date: Thu, 24 Mar 2022 17:56:51 +0100 Subject: [PATCH 5/6] Use late initialization of LIKWID and catch access daemon death. Fixes #70 and fixes #71. --- collectors/likwidMetric.go | 381 ++++++++++++++++++++++++------------- 1 file changed, 251 insertions(+), 130 deletions(-) diff --git a/collectors/likwidMetric.go b/collectors/likwidMetric.go index 85bd932..7113b4f 100644 --- a/collectors/likwidMetric.go +++ b/collectors/likwidMetric.go @@ -15,8 +15,11 @@ import ( "io/ioutil" "math" "os" + "os/signal" "strconv" "strings" + "sync" + "syscall" "time" "unsafe" @@ -46,6 +49,15 @@ type LikwidCollectorEventsetConfig struct { Metrics []LikwidCollectorMetricConfig `json:"metrics"` } +type LikwidEventsetConfig struct { + internal int + gid C.int + eorder []*C.char + estr *C.char + results map[int]map[string]interface{} + metrics map[int]map[string]float64 +} + type LikwidCollectorConfig struct { Eventsets []LikwidCollectorEventsetConfig `json:"eventsets"` Metrics []LikwidCollectorMetricConfig `json:"globalmetrics,omitempty"` @@ -58,17 +70,18 @@ type LikwidCollectorConfig struct { type LikwidCollector struct { metricCollector - cpulist []C.int - cpu2tid map[int]int - sock2tid map[int]int - metrics map[C.int]map[string]int - groups []C.int - config LikwidCollectorConfig - results map[int]map[int]map[string]interface{} - mresults map[int]map[int]map[string]float64 - gmresults map[int]map[string]float64 - basefreq float64 - running bool + cpulist []C.int + cpu2tid map[int]int + sock2tid map[int]int + metrics map[C.int]map[string]int + groups []C.int + config LikwidCollectorConfig + gmresults map[int]map[string]float64 + basefreq float64 + running bool + initialized bool + likwidGroups map[C.int]LikwidEventsetConfig + lock sync.Mutex } type LikwidMetric struct { @@ -86,6 +99,45 @@ func eventsToEventStr(events map[string]string) string { return strings.Join(elist, ",") } +func genLikwidEventSet(input LikwidCollectorEventsetConfig) LikwidEventsetConfig { + tmplist := make([]string, 0) + elist := make([]*C.char, 0) + for k, v := range input.Events { + tmplist = append(tmplist, fmt.Sprintf("%s:%s", v, k)) + c_counter := C.CString(k) + elist = append(elist, c_counter) + } + estr := strings.Join(tmplist, ",") + res := make(map[int]map[string]interface{}) + met := make(map[int]map[string]float64) + for _, i := range topo.CpuList() { + res[i] = make(map[string]interface{}) + for k := range input.Events { + res[i][k] = 0.0 + } + met[i] = make(map[string]float64) + for _, v := range input.Metrics { + res[i][v.Name] = 0.0 + } + } + return LikwidEventsetConfig{ + gid: -1, + eorder: elist, + estr: C.CString(estr), + results: res, + metrics: met, + } +} + +func testLikwidMetricFormula(formula string, params []string) bool { + myparams := make(map[string]interface{}) + for _, p := range params { + myparams[p] = float64(1.0) + } + _, err := agg.EvalFloat64Condition(formula, myparams) + return err == nil +} + func getBaseFreq() float64 { var freq float64 = math.NaN() C.power_init(0) @@ -108,6 +160,8 @@ func getBaseFreq() float64 { func (m *LikwidCollector) Init(config json.RawMessage) error { var ret C.int m.name = "LikwidCollector" + m.initialized = false + m.running = false m.config.AccessMode = LIKWID_DEF_ACCESSMODE m.config.LibraryPath = LIKWID_LIB_NAME if len(config) > 0 { @@ -140,6 +194,15 @@ func (m *LikwidCollector) Init(config json.RawMessage) error { m.cpulist[i] = C.int(c) m.cpu2tid[c] = i } + + cclog.ComponentDebug(m.name, "initialize LIKWID topology") + ret = C.topology_init() + if ret != 0 { + err := errors.New("failed to initialize LIKWID topology") + cclog.ComponentError(m.name, err.Error()) + return err + } + m.sock2tid = make(map[int]int) tmp := make([]C.int, 1) for _, sid := range topo.SocketList() { @@ -150,15 +213,13 @@ func (m *LikwidCollector) Init(config json.RawMessage) error { } C.free(unsafe.Pointer(cstr)) } - m.results = make(map[int]map[int]map[string]interface{}) - m.mresults = make(map[int]map[int]map[string]float64) + m.likwidGroups = make(map[C.int]LikwidEventsetConfig) + + // m.results = make(map[int]map[int]map[string]interface{}) + // m.mresults = make(map[int]map[int]map[string]float64) m.gmresults = make(map[int]map[string]float64) - cclog.ComponentDebug(m.name, "initialize LIKWID topology") - ret = C.topology_init() - if ret != 0 { - err := errors.New("failed to initialize LIKWID topology") - cclog.ComponentError(m.name, err.Error()) - return err + for _, tid := range m.cpu2tid { + m.gmresults[tid] = make(map[string]float64) } switch m.config.AccessMode { @@ -172,79 +233,50 @@ func (m *LikwidCollector) Init(config json.RawMessage) error { C.HPMmode(1) } - cclog.ComponentDebug(m.name, "initialize LIKWID perfmon module") - ret = C.perfmon_init(C.int(len(m.cpulist)), &m.cpulist[0]) - if ret != 0 { - C.topology_finalize() - err := errors.New("failed to initialize LIKWID topology") - cclog.ComponentError(m.name, err.Error()) - return err - } - // This is for the global metrics computation test - globalParams := make(map[string]interface{}) - globalParams["time"] = float64(1.0) - globalParams["inverseClock"] = float64(1.0) - // While adding the events, we test the metrics whether they can be computed at all - for i, evset := range m.config.Eventsets { - var gid C.int - var cstr *C.char + totalMetrics := 0 + // Generate parameter list for the metric computing test + params := make([]string, 0) + params = append(params, "time", "inverseClock") + // Generate parameter list for the global metric computing test + globalParams := make([]string, 0) + globalParams = append(globalParams, "time", "inverseClock") + // We test the eventset metrics whether they can be computed at all + for _, evset := range m.config.Eventsets { if len(evset.Events) > 0 { - estr := eventsToEventStr(evset.Events) - // Generate parameter list for the metric computing test - params := make(map[string]interface{}) - params["time"] = float64(1.0) - params["inverseClock"] = float64(1.0) + params = params[:2] for counter := range evset.Events { - params[counter] = float64(1.0) + params = append(params, counter) } for _, metric := range evset.Metrics { // Try to evaluate the metric - _, err := agg.EvalFloat64Condition(metric.Calc, params) - if err != nil { - cclog.ComponentError(m.name, "Calculation for metric", metric.Name, "failed:", err.Error()) - continue - } - // If the metric is not in the parameter list for the global metrics, add it - if _, ok := globalParams[metric.Name]; !ok { - globalParams[metric.Name] = float64(1.0) + if testLikwidMetricFormula(metric.Calc, params) { + // Add the computable metric to the parameter list for the global metrics + globalParams = append(globalParams, metric.Name) + totalMetrics++ + } else { + metric.Calc = "" } } - // Now we add the list of events to likwid - cstr = C.CString(estr) - gid = C.perfmon_addEventSet(cstr) } else { cclog.ComponentError(m.name, "Invalid Likwid eventset config, no events given") continue } - if gid >= 0 { - m.groups = append(m.groups, gid) - } - C.free(unsafe.Pointer(cstr)) - m.results[i] = make(map[int]map[string]interface{}) - m.mresults[i] = make(map[int]map[string]float64) - for tid := range m.cpulist { - m.results[i][tid] = make(map[string]interface{}) - m.mresults[i][tid] = make(map[string]float64) - if i == 0 { - m.gmresults[tid] = make(map[string]float64) - } - } } for _, metric := range m.config.Metrics { // Try to evaluate the global metric - _, err := agg.EvalFloat64Condition(metric.Calc, globalParams) - if err != nil { - cclog.ComponentError(m.name, "Calculation for metric", metric.Name, "failed:", err.Error()) - continue + if !testLikwidMetricFormula(metric.Calc, globalParams) { + cclog.ComponentError(m.name, "Calculation for metric", metric.Name, "failed") + metric.Calc = "" + } else { + totalMetrics++ } } // If no event set could be added, shut down LikwidCollector - if len(m.groups) == 0 { - C.perfmon_finalize() + if totalMetrics == 0 { C.topology_finalize() - err := errors.New("no LIKWID performance group initialized") + err := errors.New("no LIKWID eventset or metric usable") cclog.ComponentError(m.name, err.Error()) return err } @@ -255,57 +287,71 @@ func (m *LikwidCollector) Init(config json.RawMessage) error { } // take a measurement for 'interval' seconds of event set index 'group' -func (m *LikwidCollector) takeMeasurement(group int, interval time.Duration) error { +func (m *LikwidCollector) takeMeasurement(evset LikwidEventsetConfig, interval time.Duration) (bool, error) { var ret C.int - gid := m.groups[group] - ret = C.perfmon_setupCounters(gid) - if ret != 0 { - gctr := C.GoString(C.perfmon_getGroupName(gid)) - err := fmt.Errorf("failed to setup performance group %d (%s)", gid, gctr) - return err + m.lock.Lock() + if m.initialized { + ret = C.perfmon_setupCounters(evset.gid) + if ret != 0 { + var err error = nil + var skip bool = false + if ret == -37 { + skip = true + } else { + err = fmt.Errorf("failed to setup performance group %d", evset.gid) + } + m.lock.Unlock() + return skip, err + } + ret = C.perfmon_startCounters() + if ret != 0 { + var err error = nil + var skip bool = false + if ret == -37 { + skip = true + } else { + err = fmt.Errorf("failed to setup performance group %d", evset.gid) + } + m.lock.Unlock() + return skip, err + } + m.running = true + time.Sleep(interval) + m.running = false + ret = C.perfmon_stopCounters() + if ret != 0 { + var err error = nil + var skip bool = false + if ret == -37 { + skip = true + } else { + err = fmt.Errorf("failed to setup performance group %d", evset.gid) + } + m.lock.Unlock() + return skip, err + } } - ret = C.perfmon_startCounters() - if ret != 0 { - gctr := C.GoString(C.perfmon_getGroupName(gid)) - err := fmt.Errorf("failed to start performance group %d (%s)", gid, gctr) - return err - } - m.running = true - time.Sleep(interval) - m.running = false - ret = C.perfmon_stopCounters() - if ret != 0 { - gctr := C.GoString(C.perfmon_getGroupName(gid)) - err := fmt.Errorf("failed to stop performance group %d (%s)", gid, gctr) - return err - } - return nil + m.lock.Unlock() + return false, nil } // Get all measurement results for an event set, derive the metric values out of the measurement results and send it -func (m *LikwidCollector) calcEventsetMetrics(group int, interval time.Duration, output chan lp.CCMetric) error { - var eidx C.int - evset := m.config.Eventsets[group] - gid := m.groups[group] +func (m *LikwidCollector) calcEventsetMetrics(evset LikwidEventsetConfig, interval time.Duration, output chan lp.CCMetric) error { invClock := float64(1.0 / m.basefreq) // Go over events and get the results - for eidx = 0; int(eidx) < len(evset.Events); eidx++ { - ctr := C.perfmon_getCounterName(gid, eidx) - gctr := C.GoString(ctr) - + for eidx, counter := range evset.eorder { + gctr := C.GoString(counter) for _, tid := range m.cpu2tid { - if tid >= 0 { - m.results[group][tid]["time"] = interval.Seconds() - m.results[group][tid]["inverseClock"] = invClock - res := C.perfmon_getLastResult(gid, eidx, C.int(tid)) - m.results[group][tid][gctr] = float64(res) - } + res := C.perfmon_getLastResult(evset.gid, C.int(eidx), C.int(tid)) + evset.results[tid][gctr] = float64(res) + evset.results[tid]["time"] = interval.Seconds() + evset.results[tid]["inverseClock"] = invClock } } // Go over the event set metrics, derive the value out of the event:counter values and send it - for _, metric := range evset.Metrics { + for _, metric := range m.config.Eventsets[evset.internal].Metrics { // The metric scope is determined in the Init() function // Get the map scope-id -> tids scopemap := m.cpu2tid @@ -313,13 +359,13 @@ func (m *LikwidCollector) calcEventsetMetrics(group int, interval time.Duration, scopemap = m.sock2tid } for domain, tid := range scopemap { - if tid >= 0 { - value, err := agg.EvalFloat64Condition(metric.Calc, m.results[group][tid]) + if tid >= 0 && len(metric.Calc) > 0 { + value, err := agg.EvalFloat64Condition(metric.Calc, evset.results[tid]) if err != nil { cclog.ComponentError(m.name, "Calculation for metric", metric.Name, "failed:", err.Error()) continue } - m.mresults[group][tid][metric.Name] = value + evset.metrics[tid][metric.Name] = value if m.config.InvalidToZero && math.IsNaN(value) { value = 0.0 } @@ -360,8 +406,8 @@ func (m *LikwidCollector) calcGlobalMetrics(interval time.Duration, output chan if tid >= 0 { // Here we generate parameter list params := make(map[string]interface{}) - for j := range m.groups { - for mname, mres := range m.mresults[j][tid] { + for _, evset := range m.likwidGroups { + for mname, mres := range evset.metrics[tid] { params[mname] = mres } } @@ -401,38 +447,113 @@ func (m *LikwidCollector) calcGlobalMetrics(interval time.Duration, output chan return nil } +func (m *LikwidCollector) LateInit() error { + var ret C.int + cclog.ComponentDebug(m.name, "initialize LIKWID perfmon module") + ret = C.perfmon_init(C.int(len(m.cpulist)), &m.cpulist[0]) + if ret != 0 { + var err error = nil + C.topology_finalize() + if ret != -22 { + err = errors.New("failed to initialize LIKWID perfmon") + cclog.ComponentError(m.name, err.Error()) + } else { + err = errors.New("access to LIKWID perfmon locked") + } + return err + } + + // While adding the events, we test the metrics whether they can be computed at all + for i, evset := range m.config.Eventsets { + var gid C.int + if len(evset.Events) > 0 { + likwidGroup := genLikwidEventSet(evset) + // Now we add the list of events to likwid + gid = C.perfmon_addEventSet(likwidGroup.estr) + if gid >= 0 { + likwidGroup.gid = gid + likwidGroup.internal = i + m.likwidGroups[gid] = likwidGroup + } + } else { + cclog.ComponentError(m.name, "Invalid Likwid eventset config, no events given") + continue + } + + } + + // If no event set could be added, shut down LikwidCollector + if len(m.likwidGroups) == 0 { + C.perfmon_finalize() + C.topology_finalize() + err := errors.New("no LIKWID performance group initialized") + cclog.ComponentError(m.name, err.Error()) + return err + } + sigchan := make(chan os.Signal, 1) + signal.Notify(sigchan, syscall.SIGCHLD) + signal.Notify(sigchan, os.Interrupt) + go func() { + <-sigchan + + signal.Stop(sigchan) + m.initialized = false + }() + m.initialized = true + return nil +} + // main read function taking multiple measurement rounds, each 'interval' seconds long func (m *LikwidCollector) Read(interval time.Duration, output chan lp.CCMetric) { + var skip bool = false + var err error if !m.init { return } - for i := range m.groups { - // measure event set 'i' for 'interval' seconds - err := m.takeMeasurement(i, interval) - if err != nil { - cclog.ComponentError(m.name, err.Error()) + if !m.initialized { + if m.LateInit() != nil { return } - // read measurements and derive event set metrics - m.calcEventsetMetrics(i, interval, output) } - // use the event set metrics to derive the global metrics - m.calcGlobalMetrics(interval, output) + + if m.initialized && !skip { + for _, evset := range m.likwidGroups { + if !skip { + // measure event set 'i' for 'interval' seconds + skip, err = m.takeMeasurement(evset, interval) + if err != nil { + cclog.ComponentError(m.name, err.Error()) + return + } + } + + if !skip { + // read measurements and derive event set metrics + m.calcEventsetMetrics(evset, interval, output) + } + } + if !skip { + // use the event set metrics to derive the global metrics + m.calcGlobalMetrics(interval, output) + } + } } func (m *LikwidCollector) Close() { if m.init { - cclog.ComponentDebug(m.name, "Closing ...") m.init = false - if m.running { - cclog.ComponentDebug(m.name, "Stopping counters") - C.perfmon_stopCounters() + cclog.ComponentDebug(m.name, "Closing ...") + m.lock.Lock() + if m.initialized { + cclog.ComponentDebug(m.name, "Finalize LIKWID perfmon module") + C.perfmon_finalize() + m.initialized = false } - cclog.ComponentDebug(m.name, "Finalize LIKWID perfmon module") - C.perfmon_finalize() + m.lock.Unlock() cclog.ComponentDebug(m.name, "Finalize LIKWID topology module") C.topology_finalize() + cclog.ComponentDebug(m.name, "Closing done") } } From 50479f932587781e51c679a597ad0d30d8a8920a Mon Sep 17 00:00:00 2001 From: Thomas Roehl Date: Thu, 24 Mar 2022 18:12:23 +0100 Subject: [PATCH 6/6] Move all LIKWID related stuff to late initialization routine --- collectors/likwidMetric.go | 86 +++++++++++++++++++++----------------- 1 file changed, 47 insertions(+), 39 deletions(-) diff --git a/collectors/likwidMetric.go b/collectors/likwidMetric.go index 7113b4f..3bbc1b5 100644 --- a/collectors/likwidMetric.go +++ b/collectors/likwidMetric.go @@ -139,13 +139,13 @@ func testLikwidMetricFormula(formula string, params []string) bool { } func getBaseFreq() float64 { + files := []string{ + "/sys/devices/system/cpu/cpu0/cpufreq/bios_limit", + "/sys/devices/system/cpu/cpu0/cpufreq/base_frequency", + } var freq float64 = math.NaN() - C.power_init(0) - info := C.get_powerInfo() - if float64(info.baseFrequency) != 0 { - freq = float64(info.baseFrequency) * 1e6 - } else { - buffer, err := ioutil.ReadFile("/sys/devices/system/cpu/cpu0/cpufreq/bios_limit") + for _, f := range files { + buffer, err := ioutil.ReadFile(f) if err == nil { data := strings.Replace(string(buffer), "\n", "", -1) x, err := strconv.ParseInt(data, 0, 64) @@ -154,11 +154,19 @@ func getBaseFreq() float64 { } } } + + if math.IsNaN(freq) { + C.power_init(0) + info := C.get_powerInfo() + if float64(info.baseFrequency) != 0 { + freq = float64(info.baseFrequency) * 1e6 + } + C.power_finalize() + } return freq } func (m *LikwidCollector) Init(config json.RawMessage) error { - var ret C.int m.name = "LikwidCollector" m.initialized = false m.running = false @@ -195,24 +203,6 @@ func (m *LikwidCollector) Init(config json.RawMessage) error { m.cpu2tid[c] = i } - cclog.ComponentDebug(m.name, "initialize LIKWID topology") - ret = C.topology_init() - if ret != 0 { - err := errors.New("failed to initialize LIKWID topology") - cclog.ComponentError(m.name, err.Error()) - return err - } - - m.sock2tid = make(map[int]int) - tmp := make([]C.int, 1) - for _, sid := range topo.SocketList() { - cstr := C.CString(fmt.Sprintf("S%d:0", sid)) - ret = C.cpustr_to_cpulist(cstr, &tmp[0], 1) - if ret > 0 { - m.sock2tid[sid] = m.cpu2tid[int(tmp[0])] - } - C.free(unsafe.Pointer(cstr)) - } m.likwidGroups = make(map[C.int]LikwidEventsetConfig) // m.results = make(map[int]map[int]map[string]interface{}) @@ -222,17 +212,6 @@ func (m *LikwidCollector) Init(config json.RawMessage) error { m.gmresults[tid] = make(map[string]float64) } - switch m.config.AccessMode { - case "direct": - C.HPMmode(0) - case "accessdaemon": - if len(m.config.DaemonPath) > 0 { - p := os.Getenv("PATH") - os.Setenv("PATH", m.config.DaemonPath+":"+p) - } - C.HPMmode(1) - } - // This is for the global metrics computation test totalMetrics := 0 // Generate parameter list for the metric computing test @@ -275,13 +254,10 @@ func (m *LikwidCollector) Init(config json.RawMessage) error { // If no event set could be added, shut down LikwidCollector if totalMetrics == 0 { - C.topology_finalize() err := errors.New("no LIKWID eventset or metric usable") cclog.ComponentError(m.name, err.Error()) return err } - m.basefreq = getBaseFreq() - cclog.ComponentDebug(m.name, "BaseFreq", m.basefreq) m.init = true return nil } @@ -449,6 +425,38 @@ func (m *LikwidCollector) calcGlobalMetrics(interval time.Duration, output chan func (m *LikwidCollector) LateInit() error { var ret C.int + switch m.config.AccessMode { + case "direct": + C.HPMmode(0) + case "accessdaemon": + if len(m.config.DaemonPath) > 0 { + p := os.Getenv("PATH") + os.Setenv("PATH", m.config.DaemonPath+":"+p) + } + C.HPMmode(1) + } + cclog.ComponentDebug(m.name, "initialize LIKWID topology") + ret = C.topology_init() + if ret != 0 { + err := errors.New("failed to initialize LIKWID topology") + cclog.ComponentError(m.name, err.Error()) + return err + } + + m.sock2tid = make(map[int]int) + tmp := make([]C.int, 1) + for _, sid := range topo.SocketList() { + cstr := C.CString(fmt.Sprintf("S%d:0", sid)) + ret = C.cpustr_to_cpulist(cstr, &tmp[0], 1) + if ret > 0 { + m.sock2tid[sid] = m.cpu2tid[int(tmp[0])] + } + C.free(unsafe.Pointer(cstr)) + } + + m.basefreq = getBaseFreq() + cclog.ComponentDebug(m.name, "BaseFreq", m.basefreq) + cclog.ComponentDebug(m.name, "initialize LIKWID perfmon module") ret = C.perfmon_init(C.int(len(m.cpulist)), &m.cpulist[0]) if ret != 0 {