From ae64eddcc8018e5825242440a70e56144fecdabe Mon Sep 17 00:00:00 2001 From: Thomas Roehl Date: Mon, 21 Feb 2022 14:50:53 +0100 Subject: [PATCH 01/34] Remove doubled import --- collectors/customCmdMetric.go | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/collectors/customCmdMetric.go b/collectors/customCmdMetric.go index 483d2ba..e978c49 100644 --- a/collectors/customCmdMetric.go +++ b/collectors/customCmdMetric.go @@ -9,7 +9,6 @@ import ( "strings" "time" - ccmetric "github.com/ClusterCockpit/cc-metric-collector/internal/ccMetric" lp "github.com/ClusterCockpit/cc-metric-collector/internal/ccMetric" influx "github.com/influxdata/line-protocol" ) @@ -99,7 +98,7 @@ func (m *CustomCmdCollector) Read(interval time.Duration, output chan lp.CCMetri continue } - y := ccmetric.FromInfluxMetric(c) + y := lp.FromInfluxMetric(c) if err == nil { output <- y } @@ -121,7 +120,7 @@ func (m *CustomCmdCollector) Read(interval time.Duration, output chan lp.CCMetri if skip { continue } - y := ccmetric.FromInfluxMetric(f) + y := lp.FromInfluxMetric(f) if err == nil { output <- y } From 622e94ae0eb4e559b2c3a819e06bf5e17cad2e8e Mon Sep 17 00:00:00 2001 From: Thomas Roehl Date: Tue, 22 Mar 2022 15:58:10 +0100 Subject: [PATCH 02/34] Fix DieList() if system does not support dies. 
Explicitly set entries in CpuData list --- internal/ccTopology/ccTopology.go | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/internal/ccTopology/ccTopology.go b/internal/ccTopology/ccTopology.go index 958bb45..f68c3f4 100644 --- a/internal/ccTopology/ccTopology.go +++ b/internal/ccTopology/ccTopology.go @@ -169,7 +169,10 @@ func DieList() []int { } } } - return dielist + if len(dielist) > 0 { + return dielist + } + return SocketList() } type CpuEntry struct { @@ -261,7 +264,7 @@ func CpuData() []CpuEntry { for _, c := range CpuList() { clist = append(clist, CpuEntry{Cpuid: c}) } - for _, centry := range clist { + for i, centry := range clist { centry.Socket = -1 centry.Numadomain = -1 centry.Die = -1 @@ -289,6 +292,8 @@ func CpuData() []CpuEntry { // Lookup NUMA domain id centry.Numadomain = getNumaDomain(base) + // Update values in output list + clist[i] = centry } return clist } From 296225f3a8f4c0738d28195f59387c7512ca9b9d Mon Sep 17 00:00:00 2001 From: Thomas Roehl Date: Thu, 24 Mar 2022 13:50:35 +0100 Subject: [PATCH 03/34] Always export all metrics in NfsCollectors --- collectors/nfsMetric.go | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/collectors/nfsMetric.go b/collectors/nfsMetric.go index 07e684d..c511b0d 100644 --- a/collectors/nfsMetric.go +++ b/collectors/nfsMetric.go @@ -36,7 +36,7 @@ type nfsCollector struct { } func (m *nfsCollector) initStats() error { - cmd := exec.Command(m.config.Nfsstats, `-l`) + cmd := exec.Command(m.config.Nfsstats, `-l`, `--all`) cmd.Wait() buffer, err := cmd.Output() if err == nil { @@ -52,7 +52,7 @@ func (m *nfsCollector) initStats() error { if err == nil { x := m.data[name] x.current = value - x.last = 0 + x.last = value m.data[name] = x } } @@ -63,7 +63,7 @@ func (m *nfsCollector) initStats() error { } func (m *nfsCollector) updateStats() error { - cmd := exec.Command(m.config.Nfsstats, `-l`) + cmd := exec.Command(m.config.Nfsstats, `-l`, `--all`) cmd.Wait() buffer, 
err := cmd.Output() if err == nil { From e0e91844bced050459e310d97e0e74c7bb4bd6e9 Mon Sep 17 00:00:00 2001 From: Thomas Roehl Date: Thu, 24 Mar 2022 17:56:51 +0100 Subject: [PATCH 04/34] Use late initialization of LIKWID and catch access daemon death. Fixes #70 and fixes #71. --- collectors/likwidMetric.go | 381 ++++++++++++++++++++++++------------- 1 file changed, 251 insertions(+), 130 deletions(-) diff --git a/collectors/likwidMetric.go b/collectors/likwidMetric.go index 85bd932..7113b4f 100644 --- a/collectors/likwidMetric.go +++ b/collectors/likwidMetric.go @@ -15,8 +15,11 @@ import ( "io/ioutil" "math" "os" + "os/signal" "strconv" "strings" + "sync" + "syscall" "time" "unsafe" @@ -46,6 +49,15 @@ type LikwidCollectorEventsetConfig struct { Metrics []LikwidCollectorMetricConfig `json:"metrics"` } +type LikwidEventsetConfig struct { + internal int + gid C.int + eorder []*C.char + estr *C.char + results map[int]map[string]interface{} + metrics map[int]map[string]float64 +} + type LikwidCollectorConfig struct { Eventsets []LikwidCollectorEventsetConfig `json:"eventsets"` Metrics []LikwidCollectorMetricConfig `json:"globalmetrics,omitempty"` @@ -58,17 +70,18 @@ type LikwidCollectorConfig struct { type LikwidCollector struct { metricCollector - cpulist []C.int - cpu2tid map[int]int - sock2tid map[int]int - metrics map[C.int]map[string]int - groups []C.int - config LikwidCollectorConfig - results map[int]map[int]map[string]interface{} - mresults map[int]map[int]map[string]float64 - gmresults map[int]map[string]float64 - basefreq float64 - running bool + cpulist []C.int + cpu2tid map[int]int + sock2tid map[int]int + metrics map[C.int]map[string]int + groups []C.int + config LikwidCollectorConfig + gmresults map[int]map[string]float64 + basefreq float64 + running bool + initialized bool + likwidGroups map[C.int]LikwidEventsetConfig + lock sync.Mutex } type LikwidMetric struct { @@ -86,6 +99,45 @@ func eventsToEventStr(events map[string]string) string { return 
strings.Join(elist, ",") } +func genLikwidEventSet(input LikwidCollectorEventsetConfig) LikwidEventsetConfig { + tmplist := make([]string, 0) + elist := make([]*C.char, 0) + for k, v := range input.Events { + tmplist = append(tmplist, fmt.Sprintf("%s:%s", v, k)) + c_counter := C.CString(k) + elist = append(elist, c_counter) + } + estr := strings.Join(tmplist, ",") + res := make(map[int]map[string]interface{}) + met := make(map[int]map[string]float64) + for _, i := range topo.CpuList() { + res[i] = make(map[string]interface{}) + for k := range input.Events { + res[i][k] = 0.0 + } + met[i] = make(map[string]float64) + for _, v := range input.Metrics { + res[i][v.Name] = 0.0 + } + } + return LikwidEventsetConfig{ + gid: -1, + eorder: elist, + estr: C.CString(estr), + results: res, + metrics: met, + } +} + +func testLikwidMetricFormula(formula string, params []string) bool { + myparams := make(map[string]interface{}) + for _, p := range params { + myparams[p] = float64(1.0) + } + _, err := agg.EvalFloat64Condition(formula, myparams) + return err == nil +} + func getBaseFreq() float64 { var freq float64 = math.NaN() C.power_init(0) @@ -108,6 +160,8 @@ func getBaseFreq() float64 { func (m *LikwidCollector) Init(config json.RawMessage) error { var ret C.int m.name = "LikwidCollector" + m.initialized = false + m.running = false m.config.AccessMode = LIKWID_DEF_ACCESSMODE m.config.LibraryPath = LIKWID_LIB_NAME if len(config) > 0 { @@ -140,6 +194,15 @@ func (m *LikwidCollector) Init(config json.RawMessage) error { m.cpulist[i] = C.int(c) m.cpu2tid[c] = i } + + cclog.ComponentDebug(m.name, "initialize LIKWID topology") + ret = C.topology_init() + if ret != 0 { + err := errors.New("failed to initialize LIKWID topology") + cclog.ComponentError(m.name, err.Error()) + return err + } + m.sock2tid = make(map[int]int) tmp := make([]C.int, 1) for _, sid := range topo.SocketList() { @@ -150,15 +213,13 @@ func (m *LikwidCollector) Init(config json.RawMessage) error { } 
C.free(unsafe.Pointer(cstr)) } - m.results = make(map[int]map[int]map[string]interface{}) - m.mresults = make(map[int]map[int]map[string]float64) + m.likwidGroups = make(map[C.int]LikwidEventsetConfig) + + // m.results = make(map[int]map[int]map[string]interface{}) + // m.mresults = make(map[int]map[int]map[string]float64) m.gmresults = make(map[int]map[string]float64) - cclog.ComponentDebug(m.name, "initialize LIKWID topology") - ret = C.topology_init() - if ret != 0 { - err := errors.New("failed to initialize LIKWID topology") - cclog.ComponentError(m.name, err.Error()) - return err + for _, tid := range m.cpu2tid { + m.gmresults[tid] = make(map[string]float64) } switch m.config.AccessMode { @@ -172,79 +233,50 @@ func (m *LikwidCollector) Init(config json.RawMessage) error { C.HPMmode(1) } - cclog.ComponentDebug(m.name, "initialize LIKWID perfmon module") - ret = C.perfmon_init(C.int(len(m.cpulist)), &m.cpulist[0]) - if ret != 0 { - C.topology_finalize() - err := errors.New("failed to initialize LIKWID topology") - cclog.ComponentError(m.name, err.Error()) - return err - } - // This is for the global metrics computation test - globalParams := make(map[string]interface{}) - globalParams["time"] = float64(1.0) - globalParams["inverseClock"] = float64(1.0) - // While adding the events, we test the metrics whether they can be computed at all - for i, evset := range m.config.Eventsets { - var gid C.int - var cstr *C.char + totalMetrics := 0 + // Generate parameter list for the metric computing test + params := make([]string, 0) + params = append(params, "time", "inverseClock") + // Generate parameter list for the global metric computing test + globalParams := make([]string, 0) + globalParams = append(globalParams, "time", "inverseClock") + // We test the eventset metrics whether they can be computed at all + for _, evset := range m.config.Eventsets { if len(evset.Events) > 0 { - estr := eventsToEventStr(evset.Events) - // Generate parameter list for the metric 
computing test - params := make(map[string]interface{}) - params["time"] = float64(1.0) - params["inverseClock"] = float64(1.0) + params = params[:2] for counter := range evset.Events { - params[counter] = float64(1.0) + params = append(params, counter) } for _, metric := range evset.Metrics { // Try to evaluate the metric - _, err := agg.EvalFloat64Condition(metric.Calc, params) - if err != nil { - cclog.ComponentError(m.name, "Calculation for metric", metric.Name, "failed:", err.Error()) - continue - } - // If the metric is not in the parameter list for the global metrics, add it - if _, ok := globalParams[metric.Name]; !ok { - globalParams[metric.Name] = float64(1.0) + if testLikwidMetricFormula(metric.Calc, params) { + // Add the computable metric to the parameter list for the global metrics + globalParams = append(globalParams, metric.Name) + totalMetrics++ + } else { + metric.Calc = "" } } - // Now we add the list of events to likwid - cstr = C.CString(estr) - gid = C.perfmon_addEventSet(cstr) } else { cclog.ComponentError(m.name, "Invalid Likwid eventset config, no events given") continue } - if gid >= 0 { - m.groups = append(m.groups, gid) - } - C.free(unsafe.Pointer(cstr)) - m.results[i] = make(map[int]map[string]interface{}) - m.mresults[i] = make(map[int]map[string]float64) - for tid := range m.cpulist { - m.results[i][tid] = make(map[string]interface{}) - m.mresults[i][tid] = make(map[string]float64) - if i == 0 { - m.gmresults[tid] = make(map[string]float64) - } - } } for _, metric := range m.config.Metrics { // Try to evaluate the global metric - _, err := agg.EvalFloat64Condition(metric.Calc, globalParams) - if err != nil { - cclog.ComponentError(m.name, "Calculation for metric", metric.Name, "failed:", err.Error()) - continue + if !testLikwidMetricFormula(metric.Calc, globalParams) { + cclog.ComponentError(m.name, "Calculation for metric", metric.Name, "failed") + metric.Calc = "" + } else { + totalMetrics++ } } // If no event set could be added, 
shut down LikwidCollector - if len(m.groups) == 0 { - C.perfmon_finalize() + if totalMetrics == 0 { C.topology_finalize() - err := errors.New("no LIKWID performance group initialized") + err := errors.New("no LIKWID eventset or metric usable") cclog.ComponentError(m.name, err.Error()) return err } @@ -255,57 +287,71 @@ func (m *LikwidCollector) Init(config json.RawMessage) error { } // take a measurement for 'interval' seconds of event set index 'group' -func (m *LikwidCollector) takeMeasurement(group int, interval time.Duration) error { +func (m *LikwidCollector) takeMeasurement(evset LikwidEventsetConfig, interval time.Duration) (bool, error) { var ret C.int - gid := m.groups[group] - ret = C.perfmon_setupCounters(gid) - if ret != 0 { - gctr := C.GoString(C.perfmon_getGroupName(gid)) - err := fmt.Errorf("failed to setup performance group %d (%s)", gid, gctr) - return err + m.lock.Lock() + if m.initialized { + ret = C.perfmon_setupCounters(evset.gid) + if ret != 0 { + var err error = nil + var skip bool = false + if ret == -37 { + skip = true + } else { + err = fmt.Errorf("failed to setup performance group %d", evset.gid) + } + m.lock.Unlock() + return skip, err + } + ret = C.perfmon_startCounters() + if ret != 0 { + var err error = nil + var skip bool = false + if ret == -37 { + skip = true + } else { + err = fmt.Errorf("failed to setup performance group %d", evset.gid) + } + m.lock.Unlock() + return skip, err + } + m.running = true + time.Sleep(interval) + m.running = false + ret = C.perfmon_stopCounters() + if ret != 0 { + var err error = nil + var skip bool = false + if ret == -37 { + skip = true + } else { + err = fmt.Errorf("failed to setup performance group %d", evset.gid) + } + m.lock.Unlock() + return skip, err + } } - ret = C.perfmon_startCounters() - if ret != 0 { - gctr := C.GoString(C.perfmon_getGroupName(gid)) - err := fmt.Errorf("failed to start performance group %d (%s)", gid, gctr) - return err - } - m.running = true - time.Sleep(interval) - 
m.running = false - ret = C.perfmon_stopCounters() - if ret != 0 { - gctr := C.GoString(C.perfmon_getGroupName(gid)) - err := fmt.Errorf("failed to stop performance group %d (%s)", gid, gctr) - return err - } - return nil + m.lock.Unlock() + return false, nil } // Get all measurement results for an event set, derive the metric values out of the measurement results and send it -func (m *LikwidCollector) calcEventsetMetrics(group int, interval time.Duration, output chan lp.CCMetric) error { - var eidx C.int - evset := m.config.Eventsets[group] - gid := m.groups[group] +func (m *LikwidCollector) calcEventsetMetrics(evset LikwidEventsetConfig, interval time.Duration, output chan lp.CCMetric) error { invClock := float64(1.0 / m.basefreq) // Go over events and get the results - for eidx = 0; int(eidx) < len(evset.Events); eidx++ { - ctr := C.perfmon_getCounterName(gid, eidx) - gctr := C.GoString(ctr) - + for eidx, counter := range evset.eorder { + gctr := C.GoString(counter) for _, tid := range m.cpu2tid { - if tid >= 0 { - m.results[group][tid]["time"] = interval.Seconds() - m.results[group][tid]["inverseClock"] = invClock - res := C.perfmon_getLastResult(gid, eidx, C.int(tid)) - m.results[group][tid][gctr] = float64(res) - } + res := C.perfmon_getLastResult(evset.gid, C.int(eidx), C.int(tid)) + evset.results[tid][gctr] = float64(res) + evset.results[tid]["time"] = interval.Seconds() + evset.results[tid]["inverseClock"] = invClock } } // Go over the event set metrics, derive the value out of the event:counter values and send it - for _, metric := range evset.Metrics { + for _, metric := range m.config.Eventsets[evset.internal].Metrics { // The metric scope is determined in the Init() function // Get the map scope-id -> tids scopemap := m.cpu2tid @@ -313,13 +359,13 @@ func (m *LikwidCollector) calcEventsetMetrics(group int, interval time.Duration, scopemap = m.sock2tid } for domain, tid := range scopemap { - if tid >= 0 { - value, err := 
agg.EvalFloat64Condition(metric.Calc, m.results[group][tid]) + if tid >= 0 && len(metric.Calc) > 0 { + value, err := agg.EvalFloat64Condition(metric.Calc, evset.results[tid]) if err != nil { cclog.ComponentError(m.name, "Calculation for metric", metric.Name, "failed:", err.Error()) continue } - m.mresults[group][tid][metric.Name] = value + evset.metrics[tid][metric.Name] = value if m.config.InvalidToZero && math.IsNaN(value) { value = 0.0 } @@ -360,8 +406,8 @@ func (m *LikwidCollector) calcGlobalMetrics(interval time.Duration, output chan if tid >= 0 { // Here we generate parameter list params := make(map[string]interface{}) - for j := range m.groups { - for mname, mres := range m.mresults[j][tid] { + for _, evset := range m.likwidGroups { + for mname, mres := range evset.metrics[tid] { params[mname] = mres } } @@ -401,38 +447,113 @@ func (m *LikwidCollector) calcGlobalMetrics(interval time.Duration, output chan return nil } +func (m *LikwidCollector) LateInit() error { + var ret C.int + cclog.ComponentDebug(m.name, "initialize LIKWID perfmon module") + ret = C.perfmon_init(C.int(len(m.cpulist)), &m.cpulist[0]) + if ret != 0 { + var err error = nil + C.topology_finalize() + if ret != -22 { + err = errors.New("failed to initialize LIKWID perfmon") + cclog.ComponentError(m.name, err.Error()) + } else { + err = errors.New("access to LIKWID perfmon locked") + } + return err + } + + // While adding the events, we test the metrics whether they can be computed at all + for i, evset := range m.config.Eventsets { + var gid C.int + if len(evset.Events) > 0 { + likwidGroup := genLikwidEventSet(evset) + // Now we add the list of events to likwid + gid = C.perfmon_addEventSet(likwidGroup.estr) + if gid >= 0 { + likwidGroup.gid = gid + likwidGroup.internal = i + m.likwidGroups[gid] = likwidGroup + } + } else { + cclog.ComponentError(m.name, "Invalid Likwid eventset config, no events given") + continue + } + + } + + // If no event set could be added, shut down LikwidCollector + 
if len(m.likwidGroups) == 0 { + C.perfmon_finalize() + C.topology_finalize() + err := errors.New("no LIKWID performance group initialized") + cclog.ComponentError(m.name, err.Error()) + return err + } + sigchan := make(chan os.Signal, 1) + signal.Notify(sigchan, syscall.SIGCHLD) + signal.Notify(sigchan, os.Interrupt) + go func() { + <-sigchan + + signal.Stop(sigchan) + m.initialized = false + }() + m.initialized = true + return nil +} + // main read function taking multiple measurement rounds, each 'interval' seconds long func (m *LikwidCollector) Read(interval time.Duration, output chan lp.CCMetric) { + var skip bool = false + var err error if !m.init { return } - for i := range m.groups { - // measure event set 'i' for 'interval' seconds - err := m.takeMeasurement(i, interval) - if err != nil { - cclog.ComponentError(m.name, err.Error()) + if !m.initialized { + if m.LateInit() != nil { return } - // read measurements and derive event set metrics - m.calcEventsetMetrics(i, interval, output) } - // use the event set metrics to derive the global metrics - m.calcGlobalMetrics(interval, output) + + if m.initialized && !skip { + for _, evset := range m.likwidGroups { + if !skip { + // measure event set 'i' for 'interval' seconds + skip, err = m.takeMeasurement(evset, interval) + if err != nil { + cclog.ComponentError(m.name, err.Error()) + return + } + } + + if !skip { + // read measurements and derive event set metrics + m.calcEventsetMetrics(evset, interval, output) + } + } + if !skip { + // use the event set metrics to derive the global metrics + m.calcGlobalMetrics(interval, output) + } + } } func (m *LikwidCollector) Close() { if m.init { - cclog.ComponentDebug(m.name, "Closing ...") m.init = false - if m.running { - cclog.ComponentDebug(m.name, "Stopping counters") - C.perfmon_stopCounters() + cclog.ComponentDebug(m.name, "Closing ...") + m.lock.Lock() + if m.initialized { + cclog.ComponentDebug(m.name, "Finalize LIKWID perfmon module") + C.perfmon_finalize() + 
m.initialized = false } - cclog.ComponentDebug(m.name, "Finalize LIKWID perfmon module") - C.perfmon_finalize() + m.lock.Unlock() cclog.ComponentDebug(m.name, "Finalize LIKWID topology module") C.topology_finalize() + cclog.ComponentDebug(m.name, "Closing done") } } From 50479f932587781e51c679a597ad0d30d8a8920a Mon Sep 17 00:00:00 2001 From: Thomas Roehl Date: Thu, 24 Mar 2022 18:12:23 +0100 Subject: [PATCH 05/34] Move all LIKWID related stuff to late initialization routine --- collectors/likwidMetric.go | 86 +++++++++++++++++++++----------------- 1 file changed, 47 insertions(+), 39 deletions(-) diff --git a/collectors/likwidMetric.go b/collectors/likwidMetric.go index 7113b4f..3bbc1b5 100644 --- a/collectors/likwidMetric.go +++ b/collectors/likwidMetric.go @@ -139,13 +139,13 @@ func testLikwidMetricFormula(formula string, params []string) bool { } func getBaseFreq() float64 { + files := []string{ + "/sys/devices/system/cpu/cpu0/cpufreq/bios_limit", + "/sys/devices/system/cpu/cpu0/cpufreq/base_frequency", + } var freq float64 = math.NaN() - C.power_init(0) - info := C.get_powerInfo() - if float64(info.baseFrequency) != 0 { - freq = float64(info.baseFrequency) * 1e6 - } else { - buffer, err := ioutil.ReadFile("/sys/devices/system/cpu/cpu0/cpufreq/bios_limit") + for _, f := range files { + buffer, err := ioutil.ReadFile(f) if err == nil { data := strings.Replace(string(buffer), "\n", "", -1) x, err := strconv.ParseInt(data, 0, 64) @@ -154,11 +154,19 @@ func getBaseFreq() float64 { } } } + + if math.IsNaN(freq) { + C.power_init(0) + info := C.get_powerInfo() + if float64(info.baseFrequency) != 0 { + freq = float64(info.baseFrequency) * 1e6 + } + C.power_finalize() + } return freq } func (m *LikwidCollector) Init(config json.RawMessage) error { - var ret C.int m.name = "LikwidCollector" m.initialized = false m.running = false @@ -195,24 +203,6 @@ func (m *LikwidCollector) Init(config json.RawMessage) error { m.cpu2tid[c] = i } - cclog.ComponentDebug(m.name, 
"initialize LIKWID topology") - ret = C.topology_init() - if ret != 0 { - err := errors.New("failed to initialize LIKWID topology") - cclog.ComponentError(m.name, err.Error()) - return err - } - - m.sock2tid = make(map[int]int) - tmp := make([]C.int, 1) - for _, sid := range topo.SocketList() { - cstr := C.CString(fmt.Sprintf("S%d:0", sid)) - ret = C.cpustr_to_cpulist(cstr, &tmp[0], 1) - if ret > 0 { - m.sock2tid[sid] = m.cpu2tid[int(tmp[0])] - } - C.free(unsafe.Pointer(cstr)) - } m.likwidGroups = make(map[C.int]LikwidEventsetConfig) // m.results = make(map[int]map[int]map[string]interface{}) @@ -222,17 +212,6 @@ func (m *LikwidCollector) Init(config json.RawMessage) error { m.gmresults[tid] = make(map[string]float64) } - switch m.config.AccessMode { - case "direct": - C.HPMmode(0) - case "accessdaemon": - if len(m.config.DaemonPath) > 0 { - p := os.Getenv("PATH") - os.Setenv("PATH", m.config.DaemonPath+":"+p) - } - C.HPMmode(1) - } - // This is for the global metrics computation test totalMetrics := 0 // Generate parameter list for the metric computing test @@ -275,13 +254,10 @@ func (m *LikwidCollector) Init(config json.RawMessage) error { // If no event set could be added, shut down LikwidCollector if totalMetrics == 0 { - C.topology_finalize() err := errors.New("no LIKWID eventset or metric usable") cclog.ComponentError(m.name, err.Error()) return err } - m.basefreq = getBaseFreq() - cclog.ComponentDebug(m.name, "BaseFreq", m.basefreq) m.init = true return nil } @@ -449,6 +425,38 @@ func (m *LikwidCollector) calcGlobalMetrics(interval time.Duration, output chan func (m *LikwidCollector) LateInit() error { var ret C.int + switch m.config.AccessMode { + case "direct": + C.HPMmode(0) + case "accessdaemon": + if len(m.config.DaemonPath) > 0 { + p := os.Getenv("PATH") + os.Setenv("PATH", m.config.DaemonPath+":"+p) + } + C.HPMmode(1) + } + cclog.ComponentDebug(m.name, "initialize LIKWID topology") + ret = C.topology_init() + if ret != 0 { + err := errors.New("failed 
to initialize LIKWID topology") + cclog.ComponentError(m.name, err.Error()) + return err + } + + m.sock2tid = make(map[int]int) + tmp := make([]C.int, 1) + for _, sid := range topo.SocketList() { + cstr := C.CString(fmt.Sprintf("S%d:0", sid)) + ret = C.cpustr_to_cpulist(cstr, &tmp[0], 1) + if ret > 0 { + m.sock2tid[sid] = m.cpu2tid[int(tmp[0])] + } + C.free(unsafe.Pointer(cstr)) + } + + m.basefreq = getBaseFreq() + cclog.ComponentDebug(m.name, "BaseFreq", m.basefreq) + cclog.ComponentDebug(m.name, "initialize LIKWID perfmon module") ret = C.perfmon_init(C.int(len(m.cpulist)), &m.cpulist[0]) if ret != 0 { From 2a014b6fbac56db1ae32db7bedc7af58f8f42dfc Mon Sep 17 00:00:00 2001 From: Thomas Gruber Date: Thu, 31 Mar 2022 11:56:31 +0200 Subject: [PATCH 06/34] Read unit of values from /proc/meminfo (#68) --- collectors/memstatMetric.go | 52 ++++++++++++++++++++++++++++++------- 1 file changed, 42 insertions(+), 10 deletions(-) diff --git a/collectors/memstatMetric.go b/collectors/memstatMetric.go index bd7af5d..c6c7f34 100644 --- a/collectors/memstatMetric.go +++ b/collectors/memstatMetric.go @@ -40,8 +40,13 @@ type MemstatCollector struct { sendMemUsed bool } -func getStats(filename string) map[string]float64 { - stats := make(map[string]float64) +type MemstatStats struct { + value float64 + unit string +} + +func getStats(filename string) map[string]MemstatStats { + stats := make(map[string]MemstatStats) file, err := os.Open(filename) if err != nil { cclog.Error(err.Error()) @@ -55,12 +60,18 @@ func getStats(filename string) map[string]float64 { if len(linefields) == 3 { v, err := strconv.ParseFloat(linefields[1], 64) if err == nil { - stats[strings.Trim(linefields[0], ":")] = v + stats[strings.Trim(linefields[0], ":")] = MemstatStats{ + value: v, + unit: linefields[2], + } } } else if len(linefields) == 5 { v, err := strconv.ParseFloat(linefields[3], 64) if err == nil { - stats[strings.Trim(linefields[0], ":")] = v + stats[strings.Trim(linefields[0], ":")] = 
MemstatStats{ + value: v, + unit: linefields[4], + } } } } @@ -78,7 +89,7 @@ func (m *MemstatCollector) Init(config json.RawMessage) error { return err } } - m.meta = map[string]string{"source": m.name, "group": "Memory", "unit": "GByte"} + m.meta = map[string]string{"source": m.name, "group": "Memory"} m.stats = make(map[string]int64) m.matches = make(map[string]string) m.tags = map[string]string{"type": "node"} @@ -151,30 +162,51 @@ func (m *MemstatCollector) Read(interval time.Duration, output chan lp.CCMetric) return } - sendStats := func(stats map[string]float64, tags map[string]string) { + sendStats := func(stats map[string]MemstatStats, tags map[string]string) { for match, name := range m.matches { var value float64 = 0 + var unit string = "" if v, ok := stats[match]; ok { - value = v + value = v.value + if len(v.unit) > 0 { + unit = v.unit + } } - y, err := lp.New(name, tags, m.meta, map[string]interface{}{"value": value * 1e-6}, time.Now()) + + y, err := lp.New(name, tags, m.meta, map[string]interface{}{"value": value}, time.Now()) if err == nil { + if len(unit) > 0 { + y.AddMeta("unit", unit) + } output <- y } } if m.sendMemUsed { memUsed := 0.0 + unit := "" if totalVal, total := stats["MemTotal"]; total { if freeVal, free := stats["MemFree"]; free { if bufVal, buffers := stats["Buffers"]; buffers { if cacheVal, cached := stats["Cached"]; cached { - memUsed = totalVal - (freeVal + bufVal + cacheVal) + memUsed = totalVal.value - (freeVal.value + bufVal.value + cacheVal.value) + if len(totalVal.unit) > 0 { + unit = totalVal.unit + } else if len(freeVal.unit) > 0 { + unit = freeVal.unit + } else if len(bufVal.unit) > 0 { + unit = bufVal.unit + } else if len(cacheVal.unit) > 0 { + unit = cacheVal.unit + } } } } } - y, err := lp.New("mem_used", tags, m.meta, map[string]interface{}{"value": memUsed * 1e-6}, time.Now()) + y, err := lp.New("mem_used", tags, m.meta, map[string]interface{}{"value": memUsed}, time.Now()) if err == nil { + if len(unit) > 0 { + 
y.AddMeta("unit", unit) + } output <- y } } From 83b4343310f999030daacf7de1ca3e22f3d69d1c Mon Sep 17 00:00:00 2001 From: Thomas Roehl Date: Fri, 1 Apr 2022 17:10:31 +0200 Subject: [PATCH 07/34] Likwid receives signal at first Read, check when re-initializing --- collectors/likwidMetric.go | 33 ++++++++++++++++++++++++++++++--- 1 file changed, 30 insertions(+), 3 deletions(-) diff --git a/collectors/likwidMetric.go b/collectors/likwidMetric.go index 3bbc1b5..d808bad 100644 --- a/collectors/likwidMetric.go +++ b/collectors/likwidMetric.go @@ -16,6 +16,7 @@ import ( "math" "os" "os/signal" + "sort" "strconv" "strings" "sync" @@ -54,6 +55,7 @@ type LikwidEventsetConfig struct { gid C.int eorder []*C.char estr *C.char + go_estr string results map[int]map[string]interface{} metrics map[int]map[string]float64 } @@ -101,8 +103,14 @@ func eventsToEventStr(events map[string]string) string { func genLikwidEventSet(input LikwidCollectorEventsetConfig) LikwidEventsetConfig { tmplist := make([]string, 0) + clist := make([]string, 0) + for k := range input.Events { + clist = append(clist, k) + } + sort.Strings(clist) elist := make([]*C.char, 0) - for k, v := range input.Events { + for _, k := range clist { + v := input.Events[k] tmplist = append(tmplist, fmt.Sprintf("%s:%s", v, k)) c_counter := C.CString(k) elist = append(elist, c_counter) @@ -124,6 +132,7 @@ func genLikwidEventSet(input LikwidCollectorEventsetConfig) LikwidEventsetConfig gid: -1, eorder: elist, estr: C.CString(estr), + go_estr: estr, results: res, metrics: met, } @@ -193,7 +202,7 @@ func (m *LikwidCollector) Init(config json.RawMessage) error { } m.setup() - m.meta = map[string]string{"source": m.name, "group": "PerfCounter"} + m.meta = map[string]string{"group": "PerfCounter"} cclog.ComponentDebug(m.name, "Get cpulist and init maps and lists") cpulist := topo.CpuList() m.cpulist = make([]C.int, len(cpulist)) @@ -425,6 +434,9 @@ func (m *LikwidCollector) calcGlobalMetrics(interval time.Duration, output chan func 
(m *LikwidCollector) LateInit() error { var ret C.int + if m.initialized { + return nil + } switch m.config.AccessMode { case "direct": C.HPMmode(0) @@ -475,7 +487,17 @@ func (m *LikwidCollector) LateInit() error { for i, evset := range m.config.Eventsets { var gid C.int if len(evset.Events) > 0 { + skip := false likwidGroup := genLikwidEventSet(evset) + for _, g := range m.likwidGroups { + if likwidGroup.go_estr == g.go_estr { + skip = true + break + } + } + if skip { + continue + } // Now we add the list of events to likwid gid = C.perfmon_addEventSet(likwidGroup.estr) if gid >= 0 { @@ -520,9 +542,14 @@ func (m *LikwidCollector) Read(interval time.Duration, output chan lp.CCMetric) } if !m.initialized { - if m.LateInit() != nil { + m.lock.Lock() + err = m.LateInit() + if err != nil { + m.lock.Unlock() return } + m.initialized = true + m.lock.Unlock() } if m.initialized && !skip { From 5d25a7bf12e27c576f2caeed24fd51ac4d5417b8 Mon Sep 17 00:00:00 2001 From: Thomas Roehl Date: Fri, 1 Apr 2022 17:14:26 +0200 Subject: [PATCH 08/34] Add units to InfiniBandCollector --- collectors/infinibandMetric.go | 41 ++++++++++++++++++++-------------- 1 file changed, 24 insertions(+), 17 deletions(-) diff --git a/collectors/infinibandMetric.go b/collectors/infinibandMetric.go index 5be095d..274e669 100644 --- a/collectors/infinibandMetric.go +++ b/collectors/infinibandMetric.go @@ -18,13 +18,18 @@ import ( const IB_BASEPATH = "/sys/class/infiniband/" +type InfinibandCollectorMetric struct { + path string + unit string +} + type InfinibandCollectorInfo struct { - LID string // IB local Identifier (LID) - device string // IB device - port string // IB device port - portCounterFiles map[string]string // mapping counter name -> sysfs file - tagSet map[string]string // corresponding tag list - lastState map[string]int64 // State from last measurement + LID string // IB local Identifier (LID) + device string // IB device + port string // IB device port + portCounterFiles 
map[string]InfinibandCollectorMetric // mapping counter name -> InfinibandCollectorMetric + tagSet map[string]string // corresponding tag list + lastState map[string]int64 // State from last measurement } type InfinibandCollector struct { @@ -106,16 +111,16 @@ func (m *InfinibandCollector) Init(config json.RawMessage) error { // Check access to counter files countersDir := filepath.Join(path, "counters") - portCounterFiles := map[string]string{ - "ib_recv": filepath.Join(countersDir, "port_rcv_data"), - "ib_xmit": filepath.Join(countersDir, "port_xmit_data"), - "ib_recv_pkts": filepath.Join(countersDir, "port_rcv_packets"), - "ib_xmit_pkts": filepath.Join(countersDir, "port_xmit_packets"), + portCounterFiles := map[string]InfinibandCollectorMetric{ + "ib_recv": {path: filepath.Join(countersDir, "port_rcv_data"), unit: "bytes"}, + "ib_xmit": {path: filepath.Join(countersDir, "port_xmit_data"), unit: "bytes"}, + "ib_recv_pkts": {path: filepath.Join(countersDir, "port_rcv_packets"), unit: "packets"}, + "ib_xmit_pkts": {path: filepath.Join(countersDir, "port_xmit_packets"), unit: "packets"}, } - for _, counterFile := range portCounterFiles { - err := unix.Access(counterFile, unix.R_OK) + for _, counter := range portCounterFiles { + err := unix.Access(counter.path, unix.R_OK) if err != nil { - return fmt.Errorf("unable to access %s: %v", counterFile, err) + return fmt.Errorf("unable to access %s: %v", counter.path, err) } } @@ -165,14 +170,14 @@ func (m *InfinibandCollector) Read(interval time.Duration, output chan lp.CCMetr m.lastTimestamp = now for _, info := range m.info { - for counterName, counterFile := range info.portCounterFiles { + for counterName, counterDef := range info.portCounterFiles { // Read counter file - line, err := ioutil.ReadFile(counterFile) + line, err := ioutil.ReadFile(counterDef.path) if err != nil { cclog.ComponentError( m.name, - fmt.Sprintf("Read(): Failed to read from file '%s': %v", counterFile, err)) + fmt.Sprintf("Read(): Failed to read 
from file '%s': %v", counterDef.path, err)) continue } data := strings.TrimSpace(string(line)) @@ -189,6 +194,7 @@ func (m *InfinibandCollector) Read(interval time.Duration, output chan lp.CCMetr // Send absolut values if m.config.SendAbsoluteValues { if y, err := lp.New(counterName, info.tagSet, m.meta, map[string]interface{}{"value": v}, now); err == nil { + y.AddMeta("unit", counterDef.unit) output <- y } } @@ -198,6 +204,7 @@ func (m *InfinibandCollector) Read(interval time.Duration, output chan lp.CCMetr if info.lastState[counterName] >= 0 { rate := float64((v - info.lastState[counterName])) / timeDiff if y, err := lp.New(counterName+"_bw", info.tagSet, m.meta, map[string]interface{}{"value": rate}, now); err == nil { + y.AddMeta("unit", counterDef.unit+"/sec") output <- y } } From 7e43e9171e1d133de4c11bfc8703fcf2dbd6a06c Mon Sep 17 00:00:00 2001 From: Thomas Roehl Date: Fri, 1 Apr 2022 17:26:56 +0200 Subject: [PATCH 09/34] Use default options. Overwrite if anything is configured differently. Use seconds as precision --- sinks/influxAsyncSink.go | 47 ++++++++++++++++++++++++++-------------- 1 file changed, 31 insertions(+), 16 deletions(-) diff --git a/sinks/influxAsyncSink.go b/sinks/influxAsyncSink.go index 213f2d6..e22f941 100644 --- a/sinks/influxAsyncSink.go +++ b/sinks/influxAsyncSink.go @@ -28,10 +28,10 @@ type InfluxAsyncSinkConfig struct { BatchSize uint `json:"batch_size,omitempty"` // Interval, in ms, in which is buffer flushed if it has not been already written (by reaching batch size) . 
Default 1000ms FlushInterval uint `json:"flush_interval,omitempty"` - InfluxRetryInterval string `json:"retry_interval"` - InfluxExponentialBase uint `json:"retry_exponential_base"` - InfluxMaxRetries uint `json:"max_retries"` - InfluxMaxRetryTime string `json:"max_retry_time"` + InfluxRetryInterval string `json:"retry_interval,omitempty"` + InfluxExponentialBase uint `json:"retry_exponential_base,omitempty"` + InfluxMaxRetries uint `json:"max_retries,omitempty"` + InfluxMaxRetryTime string `json:"max_retry_time,omitempty"` } type InfluxAsyncSink struct { @@ -60,20 +60,34 @@ func (s *InfluxAsyncSink) connect() error { cclog.ComponentDebug(s.name, "Using URI", uri, "Org", s.config.Organization, "Bucket", s.config.Database) clientOptions := influxdb2.DefaultOptions() if s.config.BatchSize != 0 { + cclog.ComponentDebug(s.name, "Batch size", s.config.BatchSize) clientOptions.SetBatchSize(s.config.BatchSize) } if s.config.FlushInterval != 0 { + cclog.ComponentDebug(s.name, "Flush interval", s.config.FlushInterval) clientOptions.SetFlushInterval(s.config.FlushInterval) } + if s.influxRetryInterval != 0 { + cclog.ComponentDebug(s.name, "MaxRetryInterval", s.influxRetryInterval) + clientOptions.SetMaxRetryInterval(s.influxRetryInterval) + } + if s.influxMaxRetryTime != 0 { + cclog.ComponentDebug(s.name, "MaxRetryTime", s.influxMaxRetryTime) + clientOptions.SetMaxRetryTime(s.influxMaxRetryTime) + } + if s.config.InfluxExponentialBase != 0 { + cclog.ComponentDebug(s.name, "Exponential Base", s.config.InfluxExponentialBase) + clientOptions.SetExponentialBase(s.config.InfluxExponentialBase) + } + if s.config.InfluxMaxRetries != 0 { + cclog.ComponentDebug(s.name, "Max Retries", s.config.InfluxMaxRetries) + clientOptions.SetMaxRetries(s.config.InfluxMaxRetries) + } clientOptions.SetTLSConfig( &tls.Config{ InsecureSkipVerify: true, }, - ) - clientOptions.SetMaxRetryInterval(s.influxRetryInterval) - clientOptions.SetMaxRetryTime(s.influxMaxRetryTime) - 
clientOptions.SetExponentialBase(s.config.InfluxExponentialBase) - clientOptions.SetMaxRetries(s.config.InfluxMaxRetries) + ).SetPrecision(time.Second) s.client = influxdb2.NewClientWithOptions(uri, auth, clientOptions) s.writeApi = s.client.WriteAPI(s.config.Organization, s.config.Database) @@ -110,13 +124,14 @@ func NewInfluxAsyncSink(name string, config json.RawMessage) (Sink, error) { s.name = fmt.Sprintf("InfluxSink(%s)", name) // Set default for maximum number of points sent to server in single request. - s.config.BatchSize = 100 - s.influxRetryInterval = uint(time.Duration(1) * time.Second) - s.config.InfluxRetryInterval = "1s" - s.influxMaxRetryTime = uint(7 * time.Duration(24) * time.Hour) - s.config.InfluxMaxRetryTime = "168h" - s.config.InfluxMaxRetries = 20 - s.config.InfluxExponentialBase = 2 + s.config.BatchSize = 0 + s.influxRetryInterval = 0 + //s.config.InfluxRetryInterval = "1s" + s.influxMaxRetryTime = 0 + //s.config.InfluxMaxRetryTime = "168h" + s.config.InfluxMaxRetries = 0 + s.config.InfluxExponentialBase = 0 + s.config.FlushInterval = 0 // Default retry intervals (in seconds) // 1 2 From a3b9d8a90be2b6dff7eded6afc61ceacf0bb6c8c Mon Sep 17 00:00:00 2001 From: Thomas Roehl Date: Fri, 1 Apr 2022 18:36:54 +0200 Subject: [PATCH 10/34] HttpSink: Use sink name in error outputs --- sinks/httpSink.go | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/sinks/httpSink.go b/sinks/httpSink.go index 398eaf3..7713638 100644 --- a/sinks/httpSink.go +++ b/sinks/httpSink.go @@ -42,13 +42,13 @@ func (s *HttpSink) Write(m lp.CCMetric) error { if s.buffer.Len() == 0 && s.flushDelay != 0 { // This is the first write since the last flush, start the flushTimer! 
if s.flushTimer != nil && s.flushTimer.Stop() { - cclog.ComponentDebug("HttpSink", "unexpected: the flushTimer was already running?") + cclog.ComponentDebug(s.name, "unexpected: the flushTimer was already running?") } // Run a batched flush for all lines that have arrived in the last second s.flushTimer = time.AfterFunc(s.flushDelay, func() { if err := s.Flush(); err != nil { - cclog.ComponentError("HttpSink", "flush failed:", err.Error()) + cclog.ComponentError(s.name, "flush failed:", err.Error()) } }) } @@ -60,6 +60,7 @@ func (s *HttpSink) Write(m lp.CCMetric) error { s.lock.Unlock() // defer does not work here as Flush() takes the lock as well if err != nil { + cclog.ComponentError(s.name, "encoding failed:", err.Error()) return err } @@ -84,6 +85,7 @@ func (s *HttpSink) Flush() error { // Create new request to send buffer req, err := http.NewRequest(http.MethodPost, s.config.URL, s.buffer) if err != nil { + cclog.ComponentError(s.name, "failed to create request:", err.Error()) return err } @@ -100,12 +102,15 @@ func (s *HttpSink) Flush() error { // Handle transport/tcp errors if err != nil { + cclog.ComponentError(s.name, "transport/tcp error:", err.Error()) return err } // Handle application errors if res.StatusCode != http.StatusOK { - return errors.New(res.Status) + err = errors.New(res.Status) + cclog.ComponentError(s.name, "application error:", err.Error()) + return err } return nil @@ -114,7 +119,7 @@ func (s *HttpSink) Flush() error { func (s *HttpSink) Close() { s.flushTimer.Stop() if err := s.Flush(); err != nil { - cclog.ComponentError("HttpSink", "flush failed:", err.Error()) + cclog.ComponentError(s.name, "flush failed:", err.Error()) } s.client.CloseIdleConnections() } From 28348bd1086a9659aa0a0abf713e5dd1903dad26 Mon Sep 17 00:00:00 2001 From: Thomas Roehl Date: Fri, 1 Apr 2022 18:37:45 +0200 Subject: [PATCH 11/34] InfluxSink: Use batch&flush logic from HttpSink --- sinks/influxSink.go | 139 +++++++++++++++++++++++++++++++++----------- 
sinks/influxSink.md | 13 ++--- 2 files changed, 109 insertions(+), 43 deletions(-) diff --git a/sinks/influxSink.go b/sinks/influxSink.go index 1987342..e8b16d8 100644 --- a/sinks/influxSink.go +++ b/sinks/influxSink.go @@ -6,28 +6,32 @@ import ( "encoding/json" "errors" "fmt" + "sync" "time" cclog "github.com/ClusterCockpit/cc-metric-collector/internal/ccLogger" lp "github.com/ClusterCockpit/cc-metric-collector/internal/ccMetric" influxdb2 "github.com/influxdata/influxdb-client-go/v2" influxdb2Api "github.com/influxdata/influxdb-client-go/v2/api" + "github.com/influxdata/influxdb-client-go/v2/api/write" ) type InfluxSinkConfig struct { defaultSinkConfig - Host string `json:"host,omitempty"` - Port string `json:"port,omitempty"` - Database string `json:"database,omitempty"` - User string `json:"user,omitempty"` - Password string `json:"password,omitempty"` - Organization string `json:"organization,omitempty"` - SSL bool `json:"ssl,omitempty"` - RetentionPol string `json:"retention_policy,omitempty"` - InfluxRetryInterval string `json:"retry_interval"` - InfluxExponentialBase uint `json:"retry_exponential_base"` - InfluxMaxRetries uint `json:"max_retries"` - InfluxMaxRetryTime string `json:"max_retry_time"` + Host string `json:"host,omitempty"` + Port string `json:"port,omitempty"` + Database string `json:"database,omitempty"` + User string `json:"user,omitempty"` + Password string `json:"password,omitempty"` + Organization string `json:"organization,omitempty"` + SSL bool `json:"ssl,omitempty"` + FlushDelay string `json:"flush_delay,omitempty"` + BatchSize int `json:"batch_size,omitempty"` + RetentionPol string `json:"retention_policy,omitempty"` + // InfluxRetryInterval string `json:"retry_interval"` + // InfluxExponentialBase uint `json:"retry_exponential_base"` + // InfluxMaxRetries uint `json:"max_retries"` + // InfluxMaxRetryTime string `json:"max_retry_time"` //InfluxMaxRetryDelay string `json:"max_retry_delay"` // It is mentioned in the docs but there is no 
way to set it } @@ -38,6 +42,10 @@ type InfluxSink struct { config InfluxSinkConfig influxRetryInterval uint influxMaxRetryTime uint + batch []*write.Point + flushTimer *time.Timer + flushDelay time.Duration + lock sync.Mutex // Flush() runs in another goroutine, so this lock has to protect the buffer //influxMaxRetryDelay uint } @@ -56,16 +64,31 @@ func (s *InfluxSink) connect() error { } cclog.ComponentDebug(s.name, "Using URI", uri, "Org", s.config.Organization, "Bucket", s.config.Database) clientOptions := influxdb2.DefaultOptions() + + // if s.influxRetryInterval != 0 { + // cclog.ComponentDebug(s.name, "MaxRetryInterval", s.influxRetryInterval) + // clientOptions.SetMaxRetryInterval(s.influxRetryInterval) + // } + // if s.influxMaxRetryTime != 0 { + // cclog.ComponentDebug(s.name, "MaxRetryTime", s.influxMaxRetryTime) + // clientOptions.SetMaxRetryTime(s.influxMaxRetryTime) + // } + // if s.config.InfluxExponentialBase != 0 { + // cclog.ComponentDebug(s.name, "Exponential Base", s.config.InfluxExponentialBase) + // clientOptions.SetExponentialBase(s.config.InfluxExponentialBase) + // } + // if s.config.InfluxMaxRetries != 0 { + // cclog.ComponentDebug(s.name, "Max Retries", s.config.InfluxMaxRetries) + // clientOptions.SetMaxRetries(s.config.InfluxMaxRetries) + // } + clientOptions.SetTLSConfig( &tls.Config{ InsecureSkipVerify: true, }, ) - clientOptions.SetMaxRetryInterval(s.influxRetryInterval) - clientOptions.SetMaxRetryTime(s.influxMaxRetryTime) - clientOptions.SetExponentialBase(s.config.InfluxExponentialBase) - clientOptions.SetMaxRetries(s.config.InfluxMaxRetries) + clientOptions.SetPrecision(time.Second) s.client = influxdb2.NewClientWithOptions(uri, auth, clientOptions) s.writeApi = s.client.WriteAPIBlocking(s.config.Organization, s.config.Database) @@ -80,38 +103,76 @@ func (s *InfluxSink) connect() error { } func (s *InfluxSink) Write(m lp.CCMetric) error { - err := - s.writeApi.WritePoint( - context.Background(), - m.ToPoint(s.meta_as_tags), - ) - 
return err + // err := + // s.writeApi.WritePoint( + // context.Background(), + // m.ToPoint(s.meta_as_tags), + // ) + if len(s.batch) == 0 && s.flushDelay != 0 { + // This is the first write since the last flush, start the flushTimer! + if s.flushTimer != nil && s.flushTimer.Stop() { + cclog.ComponentDebug(s.name, "unexpected: the flushTimer was already running?") + } + + // Run a batched flush for all lines that have arrived in the last second + s.flushTimer = time.AfterFunc(s.flushDelay, func() { + if err := s.Flush(); err != nil { + cclog.ComponentError(s.name, "flush failed:", err.Error()) + } + }) + } + p := m.ToPoint(s.meta_as_tags) + s.lock.Lock() + s.batch = append(s.batch, p) + s.lock.Unlock() + + // Flush synchronously if "flush_delay" is zero + if s.flushDelay == 0 { + return s.Flush() + } + + return nil } func (s *InfluxSink) Flush() error { + s.lock.Lock() + defer s.lock.Unlock() + if len(s.batch) == 0 { + return nil + } + err := s.writeApi.WritePoint(context.Background(), s.batch...) 
+ if err != nil { + cclog.ComponentError(s.name, "flush failed:", err.Error()) + return err + } + s.batch = s.batch[:0] return nil } func (s *InfluxSink) Close() { cclog.ComponentDebug(s.name, "Closing InfluxDB connection") + s.flushTimer.Stop() + s.Flush() s.client.Close() } func NewInfluxSink(name string, config json.RawMessage) (Sink, error) { s := new(InfluxSink) s.name = fmt.Sprintf("InfluxSink(%s)", name) + s.config.BatchSize = 100 + s.config.FlushDelay = "1s" if len(config) > 0 { err := json.Unmarshal(config, &s.config) if err != nil { return nil, err } } - s.influxRetryInterval = uint(time.Duration(1) * time.Second) - s.config.InfluxRetryInterval = "1s" - s.influxMaxRetryTime = uint(7 * time.Duration(24) * time.Hour) - s.config.InfluxMaxRetryTime = "168h" - s.config.InfluxMaxRetries = 20 - s.config.InfluxExponentialBase = 2 + s.influxRetryInterval = 0 + s.influxMaxRetryTime = 0 + // s.config.InfluxRetryInterval = "" + // s.config.InfluxMaxRetryTime = "" + // s.config.InfluxMaxRetries = 0 + // s.config.InfluxExponentialBase = 0 if len(s.config.Host) == 0 || len(s.config.Port) == 0 || @@ -126,15 +187,25 @@ func NewInfluxSink(name string, config json.RawMessage) (Sink, error) { s.meta_as_tags[k] = true } - toUint := func(duration string, def uint) uint { - t, err := time.ParseDuration(duration) + // toUint := func(duration string, def uint) uint { + // if len(duration) > 0 { + // t, err := time.ParseDuration(duration) + // if err == nil { + // return uint(t.Milliseconds()) + // } + // } + // return def + // } + // s.influxRetryInterval = toUint(s.config.InfluxRetryInterval, s.influxRetryInterval) + // s.influxMaxRetryTime = toUint(s.config.InfluxMaxRetryTime, s.influxMaxRetryTime) + + if len(s.config.FlushDelay) > 0 { + t, err := time.ParseDuration(s.config.FlushDelay) if err == nil { - return uint(t.Milliseconds()) + s.flushDelay = t } - return def } - s.influxRetryInterval = toUint(s.config.InfluxRetryInterval, s.influxRetryInterval) - s.influxMaxRetryTime = 
toUint(s.config.InfluxMaxRetryTime, s.influxMaxRetryTime) + s.batch = make([]*write.Point, 0, s.config.BatchSize) // Connect to InfluxDB server if err := s.connect(); err != nil { diff --git a/sinks/influxSink.md b/sinks/influxSink.md index a099895..8f9ce83 100644 --- a/sinks/influxSink.md +++ b/sinks/influxSink.md @@ -17,10 +17,8 @@ The `influxdb` sink uses the official [InfluxDB golang client](https://pkg.go.de "password" : "examplepw", "organization": "myorg", "ssl": true, - "retry_interval" : "1s", - "retry_exponential_base" : 2, - "max_retries": 20, - "max_retry_time" : "168h" + "flush_delay" : "1s", + "batch_size" : 100 } } ``` @@ -34,9 +32,6 @@ The `influxdb` sink uses the official [InfluxDB golang client](https://pkg.go.de - `password`: Password for basic authentification - `organization`: Organization in the InfluxDB - `ssl`: Use SSL connection -- `retry_interval`: Base retry interval for failed write requests, default 1s -- `retry_exponential_base`: The retry interval is exponentially increased with this base, default 2 -- `max_retries`: Maximal number of retry attempts -- `max_retry_time`: Maximal time to retry failed writes, default 168h (one week) +- `flush_delay`: Group metrics coming in to a single batch +- `batch_size`: Maximal batch size -For information about the calculation of the retry interval settings, see [offical influxdb-client-go documentation](https://github.com/influxdata/influxdb-client-go#handling-of-failed-async-writes) \ No newline at end of file From 4d5b1adbc8bbe1d510f02d93af89d31ee4c3dd14 Mon Sep 17 00:00:00 2001 From: Thomas Roehl Date: Mon, 4 Apr 2022 02:26:04 +0200 Subject: [PATCH 12/34] Fix for interval_timestamp option --- internal/metricRouter/metricRouter.go | 38 ++++----------------------- 1 file changed, 5 insertions(+), 33 deletions(-) diff --git a/internal/metricRouter/metricRouter.go b/internal/metricRouter/metricRouter.go index f9b3faa..63d61ee 100644 --- a/internal/metricRouter/metricRouter.go +++ 
b/internal/metricRouter/metricRouter.go @@ -48,7 +48,6 @@ type metricRouter struct { done chan bool // channel to finish / stop metric router wg *sync.WaitGroup // wait group for all goroutines in cc-metric-collector timestamp time.Time // timestamp periodically updated by ticker each interval - timerdone chan bool // channel to finish / stop timestamp updater ticker mct.MultiChanTicker // periodically ticking once each interval config metricRouterConfig // json encoded config for metric router cache MetricCache // pointer to MetricCache @@ -124,29 +123,6 @@ func (r *metricRouter) Init(ticker mct.MultiChanTicker, wg *sync.WaitGroup, rout return nil } -// StartTimer starts a timer which updates timestamp periodically -func (r *metricRouter) StartTimer() { - m := make(chan time.Time) - r.ticker.AddChannel(m) - r.timerdone = make(chan bool) - - r.wg.Add(1) - go func() { - defer r.wg.Done() - for { - select { - case <-r.timerdone: - close(r.timerdone) - cclog.ComponentDebug("MetricRouter", "TIMER DONE") - return - case t := <-m: - r.timestamp = t - } - } - }() - cclog.ComponentDebug("MetricRouter", "TIMER START") -} - func getParamMap(point lp.CCMetric) map[string]interface{} { params := make(map[string]interface{}) params["metric"] = point @@ -235,8 +211,9 @@ func (r *metricRouter) dropMetric(point lp.CCMetric) bool { func (r *metricRouter) Start() { // start timer if configured r.timestamp = time.Now() + timeChan := make(chan time.Time) if r.config.IntervalStamp { - r.StartTimer() + r.ticker.AddChannel(timeChan) } // Router manager is done @@ -316,6 +293,9 @@ func (r *metricRouter) Start() { done() return + case timestamp := <-timeChan: + r.timestamp = timestamp + case p := <-r.coll_input: coll_forward(p) for i := 0; len(r.coll_input) > 0 && i < (r.maxForward-1); i++ { @@ -361,14 +341,6 @@ func (r *metricRouter) Close() { // wait for close of channel r.done <-r.done - // stop timer - if r.config.IntervalStamp { - cclog.ComponentDebug("MetricRouter", "TIMER CLOSE") - 
r.timerdone <- true - // wait for close of channel r.timerdone - <-r.timerdone - } - // stop metric cache if r.config.NumCacheIntervals > 0 { cclog.ComponentDebug("MetricRouter", "CACHE CLOSE") From ecdb4c1bcf3d4b7c90d0d79c7cce5a607b69c47b Mon Sep 17 00:00:00 2001 From: Thomas Roehl Date: Mon, 4 Apr 2022 02:55:44 +0200 Subject: [PATCH 13/34] Add debug message when updating interval_timestep --- internal/metricRouter/metricRouter.go | 1 + 1 file changed, 1 insertion(+) diff --git a/internal/metricRouter/metricRouter.go b/internal/metricRouter/metricRouter.go index 63d61ee..8875d0e 100644 --- a/internal/metricRouter/metricRouter.go +++ b/internal/metricRouter/metricRouter.go @@ -295,6 +295,7 @@ func (r *metricRouter) Start() { case timestamp := <-timeChan: r.timestamp = timestamp + cclog.ComponentDebug("MetricRouter", "Update timestamp", r.timestamp.UnixNano()) case p := <-r.coll_input: coll_forward(p) From 69f7c196593dc7125dc0570c17d2e06cd6bb97ee Mon Sep 17 00:00:00 2001 From: Thomas Roehl Date: Mon, 4 Apr 2022 02:56:23 +0200 Subject: [PATCH 14/34] InfluxAsyncSink: Add custom flush mechanism --- sinks/influxAsyncSink.go | 27 +++++++++++++++++++++++++++ 1 file changed, 27 insertions(+) diff --git a/sinks/influxAsyncSink.go b/sinks/influxAsyncSink.go index e22f941..c2956fc 100644 --- a/sinks/influxAsyncSink.go +++ b/sinks/influxAsyncSink.go @@ -32,6 +32,7 @@ type InfluxAsyncSinkConfig struct { InfluxExponentialBase uint `json:"retry_exponential_base,omitempty"` InfluxMaxRetries uint `json:"max_retries,omitempty"` InfluxMaxRetryTime string `json:"max_retry_time,omitempty"` + CustomFlushInterval string `json:"custom_flush_interval,omitempty"` } type InfluxAsyncSink struct { @@ -42,6 +43,8 @@ type InfluxAsyncSink struct { config InfluxAsyncSinkConfig influxRetryInterval uint influxMaxRetryTime uint + customFlushInterval time.Duration + flushTimer *time.Timer } func (s *InfluxAsyncSink) connect() error { @@ -102,6 +105,15 @@ func (s *InfluxAsyncSink) connect() error { } 
func (s *InfluxAsyncSink) Write(m lp.CCMetric) error { + if s.customFlushInterval != 0 && s.flushTimer == nil { + // Run a batched flush for all lines that have arrived in the defined interval + s.flushTimer = time.AfterFunc(s.customFlushInterval, func() { + if err := s.Flush(); err != nil { + cclog.ComponentError(s.name, "flush failed:", err.Error()) + } + + }) + } s.writeApi.WritePoint( m.ToPoint(s.meta_as_tags), ) @@ -109,7 +121,11 @@ func (s *InfluxAsyncSink) Write(m lp.CCMetric) error { } func (s *InfluxAsyncSink) Flush() error { + cclog.ComponentDebug(s.name, "Flushing") s.writeApi.Flush() + if s.customFlushInterval != 0 && s.flushTimer != nil { + s.flushTimer = nil + } return nil } @@ -132,6 +148,8 @@ func NewInfluxAsyncSink(name string, config json.RawMessage) (Sink, error) { s.config.InfluxMaxRetries = 0 s.config.InfluxExponentialBase = 0 s.config.FlushInterval = 0 + s.config.CustomFlushInterval = "" + s.customFlushInterval = time.Duration(0) // Default retry intervals (in seconds) // 1 2 @@ -183,6 +201,15 @@ func NewInfluxAsyncSink(name string, config json.RawMessage) (Sink, error) { s.influxRetryInterval = toUint(s.config.InfluxRetryInterval, s.influxRetryInterval) s.influxMaxRetryTime = toUint(s.config.InfluxMaxRetryTime, s.influxMaxRetryTime) + // Use a own timer for calling Flush() + if len(s.config.CustomFlushInterval) > 0 { + t, err := time.ParseDuration(s.config.CustomFlushInterval) + if err != nil { + return nil, fmt.Errorf("invalid duration in 'custom_flush_interval': %v", err) + } + s.customFlushInterval = t + } + // Connect to InfluxDB server if err := s.connect(); err != nil { return nil, fmt.Errorf("unable to connect: %v", err) From 70a9530abaa1b9f0280e7c35c0584bb87998b081 Mon Sep 17 00:00:00 2001 From: Thomas Roehl Date: Mon, 4 Apr 2022 11:48:54 +0200 Subject: [PATCH 15/34] Set WriteFailedCallback to get some error message --- sinks/influxAsyncSink.go | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git 
a/sinks/influxAsyncSink.go b/sinks/influxAsyncSink.go index c2956fc..31d127d 100644 --- a/sinks/influxAsyncSink.go +++ b/sinks/influxAsyncSink.go @@ -6,12 +6,14 @@ import ( "encoding/json" "errors" "fmt" + "strings" "time" cclog "github.com/ClusterCockpit/cc-metric-collector/internal/ccLogger" lp "github.com/ClusterCockpit/cc-metric-collector/internal/ccMetric" influxdb2 "github.com/influxdata/influxdb-client-go/v2" influxdb2Api "github.com/influxdata/influxdb-client-go/v2/api" + influxdb2ApiHttp "github.com/influxdata/influxdb-client-go/v2/api/http" ) type InfluxAsyncSinkConfig struct { @@ -33,6 +35,7 @@ type InfluxAsyncSinkConfig struct { InfluxMaxRetries uint `json:"max_retries,omitempty"` InfluxMaxRetryTime string `json:"max_retry_time,omitempty"` CustomFlushInterval string `json:"custom_flush_interval,omitempty"` + MaxRetryAttempts uint `json:"max_retry_attempts,omitempty"` } type InfluxAsyncSink struct { @@ -101,6 +104,11 @@ func (s *InfluxAsyncSink) connect() error { if !ok { return fmt.Errorf("connection to %s not healthy", uri) } + s.writeApi.SetWriteFailedCallback(func(batch string, err influxdb2ApiHttp.Error, retryAttempts uint) bool { + mlist := strings.Split(batch, "\n") + cclog.ComponentError(s.name, fmt.Sprintf("Failed to write batch with %d metrics %d times (max: %d): %s", len(mlist), retryAttempts, s.config.MaxRetryAttempts, err.Error())) + return retryAttempts <= s.config.MaxRetryAttempts + }) return nil } @@ -111,7 +119,6 @@ func (s *InfluxAsyncSink) Write(m lp.CCMetric) error { if err := s.Flush(); err != nil { cclog.ComponentError(s.name, "flush failed:", err.Error()) } - }) } s.writeApi.WritePoint( @@ -150,6 +157,7 @@ func NewInfluxAsyncSink(name string, config json.RawMessage) (Sink, error) { s.config.FlushInterval = 0 s.config.CustomFlushInterval = "" s.customFlushInterval = time.Duration(0) + s.config.MaxRetryAttempts = 1 // Default retry intervals (in seconds) // 1 2 From 7b098e0b1b1d58e9b5c21e0a559e3d0aba035566 Mon Sep 17 00:00:00 2001 
From: Thomas Roehl Date: Mon, 4 Apr 2022 15:16:11 +0200 Subject: [PATCH 16/34] Fix for missing metrics in LikwidCollector if hwthread is inactive --- collectors/likwidMetric.go | 28 +++++++++++++--------------- 1 file changed, 13 insertions(+), 15 deletions(-) diff --git a/collectors/likwidMetric.go b/collectors/likwidMetric.go index d808bad..f2229d1 100644 --- a/collectors/likwidMetric.go +++ b/collectors/likwidMetric.go @@ -329,7 +329,11 @@ func (m *LikwidCollector) calcEventsetMetrics(evset LikwidEventsetConfig, interv gctr := C.GoString(counter) for _, tid := range m.cpu2tid { res := C.perfmon_getLastResult(evset.gid, C.int(eidx), C.int(tid)) - evset.results[tid][gctr] = float64(res) + fres := float64(res) + if m.config.InvalidToZero && (math.IsNaN(fres) || math.IsInf(fres, 0)) { + fres = 0.0 + } + evset.results[tid][gctr] = fres evset.results[tid]["time"] = interval.Seconds() evset.results[tid]["inverseClock"] = invClock } @@ -348,15 +352,12 @@ func (m *LikwidCollector) calcEventsetMetrics(evset LikwidEventsetConfig, interv value, err := agg.EvalFloat64Condition(metric.Calc, evset.results[tid]) if err != nil { cclog.ComponentError(m.name, "Calculation for metric", metric.Name, "failed:", err.Error()) - continue + value = 0.0 + } + if m.config.InvalidToZero && (math.IsNaN(value) || math.IsInf(value, 0)) { + value = 0.0 } evset.metrics[tid][metric.Name] = value - if m.config.InvalidToZero && math.IsNaN(value) { - value = 0.0 - } - if m.config.InvalidToZero && math.IsInf(value, 0) { - value = 0.0 - } // Now we have the result, send it with the proper tags if !math.IsNaN(value) { if metric.Publish { @@ -400,15 +401,12 @@ func (m *LikwidCollector) calcGlobalMetrics(interval time.Duration, output chan value, err := agg.EvalFloat64Condition(metric.Calc, params) if err != nil { cclog.ComponentError(m.name, "Calculation for metric", metric.Name, "failed:", err.Error()) - continue + value = 0.0 + } + if m.config.InvalidToZero && (math.IsNaN(value) || math.IsInf(value,
0)) { + value = 0.0 } m.gmresults[tid][metric.Name] = value - if m.config.InvalidToZero && math.IsNaN(value) { - value = 0.0 - } - if m.config.InvalidToZero && math.IsInf(value, 0) { - value = 0.0 - } // Now we have the result, send it with the proper tags if !math.IsNaN(value) { if metric.Publish { From 017cd58247c7e09bf4fe3b91a78d9506c73f2233 Mon Sep 17 00:00:00 2001 From: Thomas Roehl Date: Tue, 5 Apr 2022 10:57:09 +0200 Subject: [PATCH 17/34] Updating page for LikwidCollector --- collectors/likwidMetric.md | 99 +++++++++++++++++++++----------------- 1 file changed, 54 insertions(+), 45 deletions(-) diff --git a/collectors/likwidMetric.md b/collectors/likwidMetric.md index fe28857..2d622d1 100644 --- a/collectors/likwidMetric.md +++ b/collectors/likwidMetric.md @@ -3,32 +3,63 @@ The `likwid` collector is probably the most complicated collector. The LIKWID library is included as static library with *direct* access mode. The *direct* access mode is suitable if the daemon is executed by a root user. The static library does not contain the performance groups, so all information needs to be provided in the configuration. -The `likwid` configuration consists of two parts, the "eventsets" and "globalmetrics": -- An event set list itself has two parts, the "events" and a set of derivable "metrics". Each of the "events" is a counter:event pair in LIKWID's syntax. The "metrics" are a list of formulas to derive the metric value from the measurements of the "events". Each metric has a name, the formula, a scope and a publish flag. Counter names can be used like variables in the formulas, so `PMC0+PMC1` sums the measurements for the both events configured in the counters `PMC0` and `PMC1`. The scope tells the Collector whether it is a metric for each hardware thread (`cpu`) or each CPU socket (`socket`). You may specify a unit for the metric with `unit`. The last one is the publishing flag. It tells the collector whether a metric should be sent to the router. 
-- The global metrics are metrics which require data from all event set measurements to be derived. The inputs are the metrics in the event sets. Similar to the metrics in the event sets, the global metrics are defined by a name, a formula, a scope and a publish flag. See event set metrics for details. The only difference is that there is no access to the raw event measurements anymore but only to the metrics. So, the idea is to derive a metric in the "eventsets" section and reuse it in the "globalmetrics" part. If you need a metric only for deriving the global metrics, disable forwarding of the event set metrics (`publish=false`). **Be aware** that the combination might be misleading because the "behavior" of a metric changes over time and the multiple measurements might count different computing phases. Similar to the metrics in the eventset, you can specify a metric unit with the `unit` field. +```json + "likwid": { + "force_overwrite" : false, + "invalid_to_zero" : false, + "eventsets": [ + { + "events" : { + "COUNTER0": "EVENT0", + "COUNTER1": "EVENT1" + }, + "metrics" : [ + { + "name": "sum_01", + "calc": "COUNTER0 + COUNTER1", + "publish": false, + "unit": "myunit", + "type": "cpu" + } + ] + } + ], + "globalmetrics" : [ + { + "name": "global_sum", + "calc": "sum_01", + "publish": true, + "unit": "myunit", + "type": "cpu" + } + ] + } +``` + +The `likwid` configuration consists of two parts, the `eventsets` and `globalmetrics`: +- An event set list itself has two parts, the `events` and a set of derivable `metrics`. Each of the `events` is a `counter:event` pair in LIKWID's syntax. The `metrics` are a list of formulas to derive the metric value from the measurements of the `events`' values. Each metric has a name, the formula, a type and a publish flag. There is an optional `unit` field. Counter names can be used like variables in the formulas, so `PMC0+PMC1` sums the measurements for both events configured in the counters `PMC0` and `PMC1`.
You can optionally use `time` for the measurement time and `inverseClock` for `1.0/baseCpuFrequency`. The type tells the LikwidCollector whether it is a metric for each hardware thread (`cpu`) or each CPU socket (`socket`). You may specify a unit for the metric with `unit`. The last one is the publishing flag. It tells the LikwidCollector whether a metric should be sent to the router or is only used internally to compute a global metric. +- The `globalmetrics` are metrics which require data from multiple event set measurements to be derived. The inputs are the metrics in the event sets. Similar to the metrics in the event sets, the global metrics are defined by a name, a formula, a scope and a publish flag. See event set metrics for details. The only difference is that there is no access to the raw event measurements anymore but only to the metrics. Also `time` and `inverseClock` cannot be used anymore. So, the idea is to derive a metric in the `eventsets` section and reuse it in the `globalmetrics` part. If you need a metric only for deriving the global metrics, disable forwarding of the event set metrics (`"publish": false`). **Be aware** that the combination might be misleading because the "behavior" of a metric changes over time and the multiple measurements might count different computing phases. Similar to the metrics in the eventset, you can specify a metric unit with the `unit` field. Additional options: -- `access_mode` : Method to use for hardware performance monitoring (`direct` access as root user, `accessdaemon` for the daemon mode) -- `accessdaemon_path`: Folder with the access daemon `likwid-accessD`, commonly `$LIKWID_INSTALL_LOC/sbin` - `force_overwrite`: Same as setting `LIKWID_FORCE=1`. In case counters are already in-use, LIKWID overwrites their configuration to do its measurements -- `invalid_to_zero`: In some cases, the calculations result in `NaN` or `Inf`. With this option, all `NaN` and `Inf` values are replaces with `0.0`. 
-- `access_mode`: Specify LIKWID access mode: `direct` for direct register access as root user or `accessdaemon` -- `accessdaemon_path`: Folder of the accessDaemon `likwid-accessD` -- `liblikwid_path`: Location of `liblikwid.so` +- `invalid_to_zero`: In some cases, the calculations result in `NaN` or `Inf`. With this option, all `NaN` and `Inf` values are replaced with `0.0`. See below in [separate section](./likwidMetric.md#invalid_to_zero-option) +- `access_mode`: Specify LIKWID access mode: `direct` for direct register access as root user or `accessdaemon`. The access mode `perf_event` is currently untested. +- `accessdaemon_path`: Folder of the accessDaemon `likwid-accessD` (like `/usr/local/sbin`) +- `liblikwid_path`: Location of `liblikwid.so` including file name like `/usr/local/lib/liblikwid.so` ### Available metric scopes -Hardware performance counters are scattered all over the system nowadays. A counter coveres a specific part of the system. While there are hardware thread specific counter for CPU cycles, instructions and so on, some others are specific for a whole CPU socket/package. To address that, the collector provides the specification of a 'scope' for each metric. +Hardware performance counters are scattered all over the system nowadays. A counter covers a specific part of the system. While there are hardware-thread-specific counters for CPU cycles, instructions and so on, some others are specific for a whole CPU socket/package. To address that, the LikwidCollector provides the specification of a `type` for each metric. - `cpu` : One metric per CPU hardware thread with the tags `"type" : "cpu"` and `"type-id" : "$cpu_id"` - `socket` : One metric per CPU socket/package with the tags `"type" : "socket"` and `"type-id" : "$socket_id"` -**Note:** You cannot specify `socket` scope for a metric that is measured at `cpu` scope, so some kind of expert knowledge or lookup work in the [Likwid Wiki](https://github.com/RRZE-HPC/likwid/wiki) is required.
Get the scope of each counter from the *Architecture* pages and as soon as one counter in a metric is socket-specific, the whole metric is socket-specific. +**Note:** You should not specify the `socket` type for a metric that is measured at `cpu` scope and vice versa, so some kind of expert knowledge or lookup work in the [Likwid Wiki](https://github.com/RRZE-HPC/likwid/wiki) is required. Get the scope of each counter from the *Architecture* pages and as soon as one counter in a metric is socket-specific, the whole metric is socket-specific. As a guideline: - All counters `FIXCx`, `PMCy` and `TMAz` have the scope `cpu` - All counters names containing `BOX` have the scope `socket` -- All `PWRx` counters have scope `socket`, except `"PWR1" : "RAPL_CORE_ENERGY"` has `cpu` scope +- All `PWRx` counters have scope `socket`, except `"PWR1" : "RAPL_CORE_ENERGY"` has `cpu` scope (AMD Zen) - All `DFCx` counters have scope `socket` ### Help with the configuration @@ -50,6 +81,7 @@ $ scripts/likwid_perfgroup_to_cc_config.py ICX MEM_DP { "events": { "FIXC0": "INSTR_RETIRED_ANY", + "FIXC1": "CPU_CLK_UNHALTED_CORE", "..." : "..." }, "metrics" : [ @@ -75,21 +107,28 @@ LIKWID checks the file `/var/run/likwid.lock` before performing any interfering Before (SLURM prolog, ...) ``` -$ chwon $JOBUSER /var/run/likwid.lock +$ chown $JOBUSER /var/run/likwid.lock ``` After (SLURM epilog, ...) ``` -$ chwon $CCUSER /var/run/likwid.lock +$ chown $CCUSER /var/run/likwid.lock ``` +### `invalid_to_zero` option +In some cases LIKWID returns `0.0` for some events that are further used in processing and maybe used as divisor in a calculation. After evaluation of a metric, the result might be `NaN` or `+-Inf`. These resulting metrics are commonly not created and forwarded to the router because the [InfluxDB line protocol](https://docs.influxdata.com/influxdb/cloud/reference/syntax/line-protocol/#float) does not support these special floating-point values. 
If you want to have them sent, this option forces these metric values to be `0.0` instead. + +One might think this does not happen often but often used metrics in the world of performance engineering like Instructions-per-Cycle (IPC) or more frequently the actual CPU clock are derived with events like `CPU_CLK_UNHALTED_CORE` (Intel) which do not increment in halted state (as the name implies). There are different power management systems in a chip which can cause a hardware thread to enter such a state. Moreover, if no cycles are executed by the core, many other events are not incremented either (like `INSTR_RETIRED_ANY` for retired instructions and part of IPC). + + ### Example configuration +#### AMD Zen3 ```json "likwid": { "force_overwrite" : false, - "nan_to_zero" : false, + "invalid_to_zero" : false, "eventsets": [ { "events": { @@ -180,33 +219,3 @@ $ chwon $CCUSER /var/run/likwid.lock } ``` -### How to get the eventsets and metrics from LIKWID - -The `likwid` collector reads hardware performance counters at a **cpu** and **socket** level. The configuration looks quite complicated but it is basically copy&paste from [LIKWID's performance groups](https://github.com/RRZE-HPC/likwid/tree/master/groups). The collector made multiple iterations and tried to use the performance groups but it lacked flexibility. The current way of configuration provides most flexibility. - -The logic is as following: There are multiple eventsets, each consisting of a list of counters+events and a list of metrics.
If you compare a common performance group with the example setting above, there is not much difference: -``` -EVENTSET -> "events": { -FIXC1 ACTUAL_CPU_CLOCK -> "FIXC1": "ACTUAL_CPU_CLOCK", -FIXC2 MAX_CPU_CLOCK -> "FIXC2": "MAX_CPU_CLOCK", -PMC0 RETIRED_INSTRUCTIONS -> "PMC0" : "RETIRED_INSTRUCTIONS", -PMC1 CPU_CLOCKS_UNHALTED -> "PMC1" : "CPU_CLOCKS_UNHALTED", -PMC2 RETIRED_SSE_AVX_FLOPS_ALL -> "PMC2": "RETIRED_SSE_AVX_FLOPS_ALL", -PMC3 MERGE -> "PMC3": "MERGE", - -> } -``` - -The metrics are following the same procedure: - -``` -METRICS -> "metrics": [ -IPC PMC0/PMC1 -> { - -> "name" : "IPC", - -> "calc" : "PMC0/PMC1", - -> "scope": "cpu", - -> "publish": true - -> } - -> ] -``` - -The script `scripts/likwid_perfgroup_to_cc_config.py` might help you. \ No newline at end of file From e7b8088c41bd115bfcbec90c750bdd68a6e603da Mon Sep 17 00:00:00 2001 From: Holger Obermaier <40787752+ho-ob@users.noreply.github.com> Date: Tue, 19 Apr 2022 11:42:46 +0200 Subject: [PATCH 18/34] Extended go routine use case in sample receiver --- receivers/sampleReceiver.go | 26 ++++++++++++++++++-------- 1 file changed, 18 insertions(+), 8 deletions(-) diff --git a/receivers/sampleReceiver.go b/receivers/sampleReceiver.go index 2892d56..19d6f25 100644 --- a/receivers/sampleReceiver.go +++ b/receivers/sampleReceiver.go @@ -36,16 +36,26 @@ func (r *SampleReceiver) Start() { // or use own go routine but always make sure it exits // as soon as it gets the signal of the r.done channel + // + // r.done = make(chan bool) // r.wg.Add(1) // go func() { - // for { - // select { - // case <-r.done: - // r.wg.Done() - // return - // } - // } - // r.wg.Done() + // defer r.wg.Done() + // + // // Create ticker + // ticker := time.NewTicker(30 * time.Second) + // defer ticker.Stop() + // + // for { + // readMetric() + // select { + // case <-ticker.C: + // // process ticker event -> continue + // continue + // case <-r.done: + // return + // } + // } // }() } From 
96ee16398e710bf8d8c8e5ddf94a473ce353f0ca Mon Sep 17 00:00:00 2001 From: Holger Obermaier <40787752+ho-ob@users.noreply.github.com> Date: Tue, 19 Apr 2022 11:53:11 +0200 Subject: [PATCH 19/34] Removed unused done channel and wg wait group --- receivers/receiveManager.go | 15 +++++++-------- 1 file changed, 7 insertions(+), 8 deletions(-) diff --git a/receivers/receiveManager.go b/receivers/receiveManager.go index 1c13026..b9a72b9 100644 --- a/receivers/receiveManager.go +++ b/receivers/receiveManager.go @@ -16,8 +16,6 @@ var AvailableReceivers = map[string]func(name string, config json.RawMessage) (R type receiveManager struct { inputs []Receiver output chan lp.CCMetric - done chan bool - wg *sync.WaitGroup config []json.RawMessage } @@ -33,8 +31,6 @@ func (rm *receiveManager) Init(wg *sync.WaitGroup, receiverConfigFile string) er // Initialize struct fields rm.inputs = make([]Receiver, 0) rm.output = nil - rm.done = make(chan bool) - rm.wg = wg rm.config = make([]json.RawMessage, 0) configFile, err := os.Open(receiverConfigFile) @@ -58,7 +54,7 @@ func (rm *receiveManager) Init(wg *sync.WaitGroup, receiverConfigFile string) er } func (rm *receiveManager) Start() { - rm.wg.Add(1) + cclog.ComponentDebug("ReceiveManager", "START") for _, r := range rm.inputs { cclog.ComponentDebug("ReceiveManager", "START", r.Name()) @@ -97,16 +93,19 @@ func (rm *receiveManager) AddOutput(output chan lp.CCMetric) { } func (rm *receiveManager) Close() { + cclog.ComponentDebug("ReceiveManager", "CLOSE") + + // Close all receivers for _, r := range rm.inputs { cclog.ComponentDebug("ReceiveManager", "CLOSE", r.Name()) r.Close() } - rm.wg.Done() - cclog.ComponentDebug("ReceiveManager", "CLOSE") + + cclog.ComponentDebug("ReceiveManager", "DONE") } func New(wg *sync.WaitGroup, receiverConfigFile string) (ReceiveManager, error) { - r := &receiveManager{} + r := new(receiveManager) err := r.Init(wg, receiverConfigFile) if err != nil { return nil, err From 
a1d85fa886aa4e0456364f6c73eacf6b5268dda7 Mon Sep 17 00:00:00 2001 From: Holger Obermaier <40787752+ho-ob@users.noreply.github.com> Date: Tue, 19 Apr 2022 12:05:03 +0200 Subject: [PATCH 20/34] Add redfish receiver --- receivers.json | 17 +++ receivers/receiveManager.go | 3 +- receivers/redfishReceiver.go | 259 +++++++++++++++++++++++++++++++++++ 3 files changed, 278 insertions(+), 1 deletion(-) create mode 100644 receivers/redfishReceiver.go diff --git a/receivers.json b/receivers.json index a27f07d..cd78eb6 100644 --- a/receivers.json +++ b/receivers.json @@ -4,5 +4,22 @@ "address": "nats://my-url", "port" : "4222", "database": "testcluster" + }, + "redfish_recv": { + "type": "redfish", + "client_config": [ + { + "hostname": "my-host-1", + "username": "username-1", + "password": "password-1", + "endpoint": "https://my-endpoint-1" + }, + { + "hostname": "my-host-2", + "username": "username-2", + "password": "password-2", + "endpoint": "https://my-endpoint-2" + } + ] } } diff --git a/receivers/receiveManager.go b/receivers/receiveManager.go index b9a72b9..7a20fac 100644 --- a/receivers/receiveManager.go +++ b/receivers/receiveManager.go @@ -10,7 +10,8 @@ import ( ) var AvailableReceivers = map[string]func(name string, config json.RawMessage) (Receiver, error){ - "nats": NewNatsReceiver, + "nats": NewNatsReceiver, + "redfish": NewRedfishReceiver, } type receiveManager struct { diff --git a/receivers/redfishReceiver.go b/receivers/redfishReceiver.go new file mode 100644 index 0000000..606bdcb --- /dev/null +++ b/receivers/redfishReceiver.go @@ -0,0 +1,259 @@ +package receivers + +import ( + "encoding/json" + "fmt" + "strconv" + "sync" + "time" + + cclog "github.com/ClusterCockpit/cc-metric-collector/internal/ccLogger" + lp "github.com/ClusterCockpit/cc-metric-collector/internal/ccMetric" + + // See: https://pkg.go.dev/github.com/stmcginnis/gofish + "github.com/stmcginnis/gofish" +) + +// RedfishReceiver configuration: +type RedfishReceiver struct { + receiver + config 
struct { + Type string `json:"type"` + Fanout int `json:"fanout,omitempty"` // Default fanout: 64 + Interval int `json:"interval,omitempty"` // Default interval: 30s + + // Client config for each redfish service + ClientConfigs []struct { + Hostname *string `json:"hostname"` + Username *string `json:"username"` + Password *string `json:"password"` + Endpoint *string `json:"endpoint"` + Insecure *bool `json:"insecure,omitempty"` + ExcludeMetrics []string `json:"exclude_metrics,omitempty"` + gofish gofish.ClientConfig + } `json:"client_config"` + } + + done chan bool // channel to finish / stop redfish receiver + wg sync.WaitGroup // wait group for redfish receiver +} + +// Start starts the redfish receiver +func (r *RedfishReceiver) Start() { + cclog.ComponentDebug(r.name, "START") + + // readPowerMetric reads readfish power metric from the endpoint configured in conf + readPowerMetric := func(clientConfigIndex int) error { + + clientConfig := &r.config.ClientConfigs[clientConfigIndex] + + // Connect to redfish service + c, err := gofish.Connect(clientConfig.gofish) + if err != nil { + return fmt.Errorf("readPowerMetric: gofish.Connect(...) 
failed: %v", err) + } + defer c.Logout() + + // Get all chassis managed by this service + chassis_list, err := c.Service.Chassis() + if err != nil { + return fmt.Errorf("readPowerMetric: c.Service.Chassis() failed: %v", err) + } + + for _, chassis := range chassis_list { + timestamp := time.Now() + + // Get power information for each chassis + power, err := chassis.Power() + if err != nil { + return fmt.Errorf("readPowerMetric: chassis.Power() failed: %v", err) + } + + // Read min, max and average consumed watts for each power control + for _, pc := range power.PowerControl { + + // Map of collected metrics + metrics := map[string]float32{ + "average_consumed_watts": pc.PowerMetrics.AverageConsumedWatts, + "min_consumed_watts": pc.PowerMetrics.MinConsumedWatts, + "max_consumed_watts": pc.PowerMetrics.MaxConsumedWatts, + } + intervalInMin := strconv.FormatFloat(float64(pc.PowerMetrics.IntervalInMin), 'f', -1, 32) + + // Metrics to exclude + for _, key := range clientConfig.ExcludeMetrics { + delete(metrics, key) + } + + for name, value := range metrics { + y, err := lp.New( + name, + map[string]string{ + "hostname": *clientConfig.Hostname, + "type": "node", + "power_control_name": pc.Name, + }, + map[string]string{ + "source": r.name, + "group": "Energy", + "interval_in_minutes": intervalInMin, + "unit": "watts", + }, + map[string]interface{}{ + "value": value, + }, + timestamp) + if err == nil { + r.sink <- y + } + } + } + } + + return nil + } + + // doReadPowerMetric read power metrics for all configure redfish services. + // To compensate latencies of the Redfish services a fanout is used. 
+ doReadPowerMetric := func() { + + // Compute fanout to use + realFanout := r.config.Fanout + if len(r.config.ClientConfigs) < realFanout { + realFanout = len(r.config.ClientConfigs) + } + + // Create wait group and input channel for workers + var workerWaitGroup sync.WaitGroup + workerInput := make(chan int, realFanout) + + // Create worker go routines + for i := 0; i < realFanout; i++ { + // Increment worker wait group counter + workerWaitGroup.Add(1) + go func() { + // Decrement worker wait group counter + defer workerWaitGroup.Done() + + // Read power metrics for each client config + for clientConfigIndex := range workerInput { + err := readPowerMetric(clientConfigIndex) + if err != nil { + cclog.ComponentError(r.name, err) + } + } + }() + } + + // Distribute client configs to workers + for i := range r.config.ClientConfigs { + workerInput <- i + } + + // Stop workers and wait for all workers to finish + close(workerInput) + workerWaitGroup.Wait() + } + + // Start redfish receiver + r.wg.Add(1) + go func() { + defer r.wg.Done() + + // Create ticker + ticker := time.NewTicker(time.Duration(r.config.Interval) * time.Second) + defer ticker.Stop() + + for { + doReadPowerMetric() + + select { + case <-ticker.C: + // process ticker event -> continue + continue + case <-r.done: + // process done event + return + } + } + }() + + cclog.ComponentDebug(r.name, "STARTED") +} + +// Close redfish receiver +func (r *RedfishReceiver) Close() { + cclog.ComponentDebug(r.name, "CLOSE") + + // Send the signal and wait + r.done <- true + r.wg.Wait() + + cclog.ComponentDebug(r.name, "DONE") +} + +// New function to create a new instance of the receiver +// Initialize the receiver by giving it a name and reading in the config JSON +func NewRedfishReceiver(name string, config json.RawMessage) (Receiver, error) { + r := new(RedfishReceiver) + + // Set name + r.name = fmt.Sprintf("RedfishReceiver(%s)", name) + + // Create done channel + r.done = make(chan bool) + + // Set defaults in 
r.config + // Allow overwriting these defaults by reading config JSON + r.config.Fanout = 64 + r.config.Interval = 30 + + // Read the redfish receiver specific JSON config + if len(config) > 0 { + err := json.Unmarshal(config, &r.config) + if err != nil { + cclog.ComponentError(r.name, "Error reading config:", err.Error()) + return nil, err + } + } + + // Create gofish client config + for i := range r.config.ClientConfigs { + clientConfig := &r.config.ClientConfigs[i] + gofishConfig := &clientConfig.gofish + + if clientConfig.Hostname == nil { + err := fmt.Errorf("client config number %v requires hostname", i) + cclog.ComponentError(r.name, err) + return nil, err + } + + if clientConfig.Endpoint == nil { + err := fmt.Errorf("client config number %v requires endpoint", i) + cclog.ComponentError(r.name, err) + return nil, err + } + gofishConfig.Endpoint = *clientConfig.Endpoint + + if clientConfig.Username == nil { + err := fmt.Errorf("client config number %v requires username", i) + cclog.ComponentError(r.name, err) + return nil, err + } + gofishConfig.Username = *clientConfig.Username + + if clientConfig.Password == nil { + err := fmt.Errorf("client config number %v requires password", i) + cclog.ComponentError(r.name, err) + return nil, err + } + gofishConfig.Password = *clientConfig.Password + + gofishConfig.Insecure = true + if clientConfig.Insecure != nil { + gofishConfig.Insecure = *clientConfig.Insecure + } + } + + return r, nil +} From 48d34bf564530f51e344979404e383b66654ef49 Mon Sep 17 00:00:00 2001 From: Holger Obermaier <40787752+ho-ob@users.noreply.github.com> Date: Tue, 19 Apr 2022 12:06:53 +0200 Subject: [PATCH 21/34] Adopt sinks.json for new meta_as_tags usage --- sinks.json | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/sinks.json b/sinks.json index 2fdae5a..3e9be6d 100644 --- a/sinks.json +++ b/sinks.json @@ -1,6 +1,8 @@ { - "mystdout" : { - "type" : "stdout", - "meta_as_tags" : true + "mystdout": { + "type": "stdout", 
+ "meta_as_tags": [ + "unit" + ] } -} +} \ No newline at end of file From bf9c7e1830bc0ca3ec3902c385aebbadff5f50a8 Mon Sep 17 00:00:00 2001 From: Holger Obermaier <40787752+ho-ob@users.noreply.github.com> Date: Tue, 19 Apr 2022 12:15:51 +0200 Subject: [PATCH 22/34] Update requirements --- go.mod | 21 +++++++++------------ 1 file changed, 9 insertions(+), 12 deletions(-) diff --git a/go.mod b/go.mod index 130f5cc..07d46f6 100644 --- a/go.mod +++ b/go.mod @@ -3,17 +3,14 @@ module github.com/ClusterCockpit/cc-metric-collector go 1.16 require ( - github.com/NVIDIA/go-nvml v0.11.1-0 - github.com/influxdata/influxdb-client-go/v2 v2.7.0 - github.com/influxdata/line-protocol v0.0.0-20210922203350-b1ad95c89adf - github.com/nats-io/nats.go v1.13.1-0.20211122170419-d7c1d78a50fc - golang.org/x/sys v0.0.0-20220114195835-da31bd327af9 - gopkg.in/Knetic/govaluate.v2 v2.3.0 -) - -require ( + github.com/NVIDIA/go-nvml v0.11.6-0 github.com/PaesslerAG/gval v1.1.2 - github.com/golang/protobuf v1.5.2 // indirect - github.com/nats-io/nats-server/v2 v2.7.0 // indirect - google.golang.org/protobuf v1.27.1 // indirect + github.com/gorilla/mux v1.8.0 + github.com/influxdata/influxdb-client-go/v2 v2.8.1 + github.com/influxdata/line-protocol v0.0.0-20210922203350-b1ad95c89adf + github.com/nats-io/nats-server/v2 v2.8.0 // indirect + github.com/nats-io/nats.go v1.14.0 + github.com/prometheus/client_golang v1.12.1 + github.com/stmcginnis/gofish v0.13.0 + golang.org/x/sys v0.0.0-20220412211240-33da011f77ad ) From 31c5c89a5ababc929d9276624fc49e60a6b4a958 Mon Sep 17 00:00:00 2001 From: Holger Obermaier <40787752+ho-ob@users.noreply.github.com> Date: Tue, 19 Apr 2022 14:01:23 +0200 Subject: [PATCH 23/34] Fix: Close done channel --- receivers/redfishReceiver.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/receivers/redfishReceiver.go b/receivers/redfishReceiver.go index 606bdcb..358093d 100644 --- a/receivers/redfishReceiver.go +++ b/receivers/redfishReceiver.go @@ -186,7 +186,7 
@@ func (r *RedfishReceiver) Close() { cclog.ComponentDebug(r.name, "CLOSE") // Send the signal and wait - r.done <- true + close(r.done) r.wg.Wait() cclog.ComponentDebug(r.name, "DONE") From 8c730955484bad381ba3ebe0ab366f4924db098d Mon Sep 17 00:00:00 2001 From: Holger Obermaier <40787752+ho-ob@users.noreply.github.com> Date: Wed, 20 Apr 2022 09:58:02 +0200 Subject: [PATCH 24/34] Allow to shutdown redfish receiver during metric read --- receivers/redfishReceiver.go | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/receivers/redfishReceiver.go b/receivers/redfishReceiver.go index 358093d..bcf857e 100644 --- a/receivers/redfishReceiver.go +++ b/receivers/redfishReceiver.go @@ -146,7 +146,17 @@ func (r *RedfishReceiver) Start() { } // Distribute client configs to workers + clientConfigLoop: for i := range r.config.ClientConfigs { + // Check done channel status + select { + case _, ok := <-r.done: + if !ok { + break clientConfigLoop + } + default: + } + workerInput <- i } From c2d4272fdf8af5f08249703e134b315d38aac0e5 Mon Sep 17 00:00:00 2001 From: Holger Obermaier <40787752+ho-ob@users.noreply.github.com> Date: Wed, 20 Apr 2022 12:36:45 +0200 Subject: [PATCH 25/34] Clear workerInput channel after done event --- receivers/redfishReceiver.go | 28 ++++++++++++++++++++-------- 1 file changed, 20 insertions(+), 8 deletions(-) diff --git a/receivers/redfishReceiver.go b/receivers/redfishReceiver.go index bcf857e..2ebe507 100644 --- a/receivers/redfishReceiver.go +++ b/receivers/redfishReceiver.go @@ -50,7 +50,18 @@ func (r *RedfishReceiver) Start() { // Connect to redfish service c, err := gofish.Connect(clientConfig.gofish) if err != nil { - return fmt.Errorf("readPowerMetric: gofish.Connect(...) 
failed: %v", err) + c := struct { + Username string + Endpoint string + BasicAuth bool + Insecure bool + }{ + Username: clientConfig.gofish.Username, + Endpoint: clientConfig.gofish.Endpoint, + BasicAuth: clientConfig.gofish.BasicAuth, + Insecure: clientConfig.gofish.Insecure, + } + return fmt.Errorf("readPowerMetric: gofish.Connect(%+v) failed: %v", c, err) } defer c.Logout() @@ -146,18 +157,19 @@ func (r *RedfishReceiver) Start() { } // Distribute client configs to workers - clientConfigLoop: for i := range r.config.ClientConfigs { // Check done channel status select { - case _, ok := <-r.done: - if !ok { - break clientConfigLoop + case workerInput <- i: + case <-r.done: + // process done event + // Stop workers, clear channel and wait for all workers to finish + close(workerInput) + for range workerInput { } - default: + workerWaitGroup.Wait() + return } - - workerInput <- i } // Stop workers and wait for all workers to finish From 9d6d0dbd93bdf7ff802e379b86a5338854b6a3ce Mon Sep 17 00:00:00 2001 From: Holger Obermaier <40787752+ho-ob@users.noreply.github.com> Date: Wed, 20 Apr 2022 14:39:26 +0200 Subject: [PATCH 26/34] Delete empty tags and meta data tags --- receivers/redfishReceiver.go | 72 ++++++++++++++++++++++++++++-------- 1 file changed, 57 insertions(+), 15 deletions(-) diff --git a/receivers/redfishReceiver.go b/receivers/redfishReceiver.go index 2ebe507..82d3819 100644 --- a/receivers/redfishReceiver.go +++ b/receivers/redfishReceiver.go @@ -79,15 +79,30 @@ func (r *RedfishReceiver) Start() { if err != nil { return fmt.Errorf("readPowerMetric: chassis.Power() failed: %v", err) } + if power == nil { + continue + } // Read min, max and average consumed watts for each power control for _, pc := range power.PowerControl { // Map of collected metrics metrics := map[string]float32{ + // PowerConsumedWatts shall represent the actual power being consumed (in + // Watts) by the chassis + "consumed_watts": pc.PowerConsumedWatts, + // AverageConsumedWatts shall 
represent the + // average power level that occurred averaged over the last IntervalInMin + // minutes. "average_consumed_watts": pc.PowerMetrics.AverageConsumedWatts, - "min_consumed_watts": pc.PowerMetrics.MinConsumedWatts, - "max_consumed_watts": pc.PowerMetrics.MaxConsumedWatts, + // MinConsumedWatts shall represent the + // minimum power level in watts that occurred within the last + // IntervalInMin minutes. + "min_consumed_watts": pc.PowerMetrics.MinConsumedWatts, + // MaxConsumedWatts shall represent the + // maximum power level in watts that occurred within the last + // IntervalInMin minutes + "max_consumed_watts": pc.PowerMetrics.MaxConsumedWatts, } intervalInMin := strconv.FormatFloat(float64(pc.PowerMetrics.IntervalInMin), 'f', -1, 32) @@ -97,19 +112,46 @@ func (r *RedfishReceiver) Start() { } for name, value := range metrics { - y, err := lp.New( - name, - map[string]string{ - "hostname": *clientConfig.Hostname, - "type": "node", - "power_control_name": pc.Name, - }, - map[string]string{ - "source": r.name, - "group": "Energy", - "interval_in_minutes": intervalInMin, - "unit": "watts", - }, + // Set tags + tags := map[string]string{ + "hostname": *clientConfig.Hostname, + "type": "node", + // ID uniquely identifies the resource + "id": pc.ID, + // MemberID shall uniquely identify the member within the collection. For + // services supporting Redfish v1.6 or higher, this value shall be the + // zero-based array index. + "member_id": pc.MemberID, + // PhysicalContext shall be a description of the affected device(s) or region + // within the chassis to which this power control applies. 
+ "physical_context": string(pc.PhysicalContext), + // Name + "power_control_name": pc.Name, + } + + // Delete empty tags + for key, value := range tags { + if value == "" { + delete(tags, key) + } + } + + // Set meta data tags + meta := map[string]string{ + "source": r.name, + "group": "Energy", + "interval_in_minutes": intervalInMin, + "unit": "watts", + } + + // Delete empty tags + for key, value := range meta { + if value == "" { + delete(meta, key) + } + } + + y, err := lp.New(name, tags, meta, map[string]interface{}{ "value": value, }, From fb6f6a4daaa567225c443e43cf66a14072ce45f3 Mon Sep 17 00:00:00 2001 From: Holger Obermaier <40787752+ho-ob@users.noreply.github.com> Date: Mon, 2 May 2022 16:57:19 +0200 Subject: [PATCH 27/34] Fix GPFS collector last state handling --- collectors/gpfsMetric.go | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/collectors/gpfsMetric.go b/collectors/gpfsMetric.go index 26fc723..ed63201 100644 --- a/collectors/gpfsMetric.go +++ b/collectors/gpfsMetric.go @@ -70,6 +70,7 @@ func (m *GpfsCollector) Init(config json.RawMessage) error { for _, fs := range m.config.ExcludeFilesystem { m.skipFS[fs] = struct{}{} } + m.lastState = make(map[string]GpfsCollectorLastState) // GPFS / IBM Spectrum Scale file system statistics can only be queried by user root user, err := user.Current() @@ -162,11 +163,16 @@ func (m *GpfsCollector) Read(interval time.Duration, output chan lp.CCMetric) { continue } + // Add filesystem tag m.tags["filesystem"] = filesystem - if _, ok := m.lastState[filesystem]; !ok { - m.lastState[filesystem] = GpfsCollectorLastState{ - bytesRead: -1, - bytesWritten: -1, + + // Create initial last state + if m.config.SendBandwidths { + if _, ok := m.lastState[filesystem]; !ok { + m.lastState[filesystem] = GpfsCollectorLastState{ + bytesRead: -1, + bytesWritten: -1, + } } } From c019f8e7ad3ee2f0ce97b54b35ef22b484c6bace Mon Sep 17 00:00:00 2001 From: Holger Obermaier 
<40787752+ho-ob@users.noreply.github.com> Date: Tue, 3 May 2022 17:55:33 +0200 Subject: [PATCH 28/34] Reuse tags and meta data tags --- receivers/redfishReceiver.go | 77 ++++++++++++++++++------------------ 1 file changed, 39 insertions(+), 38 deletions(-) diff --git a/receivers/redfishReceiver.go b/receivers/redfishReceiver.go index 82d3819..d50cbc7 100644 --- a/receivers/redfishReceiver.go +++ b/receivers/redfishReceiver.go @@ -111,45 +111,46 @@ func (r *RedfishReceiver) Start() { delete(metrics, key) } + // Set tags + tags := map[string]string{ + "hostname": *clientConfig.Hostname, + "type": "node", + // ID uniquely identifies the resource + "id": pc.ID, + // MemberID shall uniquely identify the member within the collection. For + // services supporting Redfish v1.6 or higher, this value shall be the + // zero-based array index. + "member_id": pc.MemberID, + // PhysicalContext shall be a description of the affected device(s) or region + // within the chassis to which this power control applies. + "physical_context": string(pc.PhysicalContext), + // Name + "power_control_name": pc.Name, + } + + // Delete empty tags + for key, value := range tags { + if value == "" { + delete(tags, key) + } + } + + // Set meta data tags + meta := map[string]string{ + "source": r.name, + "group": "Energy", + "interval_in_minutes": intervalInMin, + "unit": "watts", + } + + // Delete empty meta data tags + for key, value := range meta { + if value == "" { + delete(meta, key) + } + } + for name, value := range metrics { - // Set tags - tags := map[string]string{ - "hostname": *clientConfig.Hostname, - "type": "node", - // ID uniquely identifies the resource - "id": pc.ID, - // MemberID shall uniquely identify the member within the collection. For - // services supporting Redfish v1.6 or higher, this value shall be the - // zero-based array index. 
- "member_id": pc.MemberID, - // PhysicalContext shall be a description of the affected device(s) or region - // within the chassis to which this power control applies. - "physical_context": string(pc.PhysicalContext), - // Name - "power_control_name": pc.Name, - } - - // Delete empty tags - for key, value := range tags { - if value == "" { - delete(tags, key) - } - } - - // Set meta data tags - meta := map[string]string{ - "source": r.name, - "group": "Energy", - "interval_in_minutes": intervalInMin, - "unit": "watts", - } - - // Delete empty tags - for key, value := range meta { - if value == "" { - delete(meta, key) - } - } y, err := lp.New(name, tags, meta, map[string]interface{}{ From c35ac9dba840995b32e52ecaa2b3585932f0dde4 Mon Sep 17 00:00:00 2001 From: Holger Obermaier <40787752+ho-ob@users.noreply.github.com> Date: Wed, 4 May 2022 11:28:06 +0200 Subject: [PATCH 29/34] Flush if batch size is reached --- sinks/influxSink.go | 74 +++++++++++++++++++++++++++++++++++++-------- 1 file changed, 61 insertions(+), 13 deletions(-) diff --git a/sinks/influxSink.go b/sinks/influxSink.go index e8b16d8..b382c38 100644 --- a/sinks/influxSink.go +++ b/sinks/influxSink.go @@ -49,20 +49,31 @@ type InfluxSink struct { //influxMaxRetryDelay uint } +// connect connects to the InfluxDB server func (s *InfluxSink) connect() error { - var auth string + + // URI options: + // * http://host:port + // * https://host:port var uri string if s.config.SSL { uri = fmt.Sprintf("https://%s:%s", s.config.Host, s.config.Port) } else { uri = fmt.Sprintf("http://%s:%s", s.config.Host, s.config.Port) } + + // Authentication options: + // * token + // * username:password + var auth string if len(s.config.User) == 0 { auth = s.config.Password } else { auth = fmt.Sprintf("%s:%s", s.config.User, s.config.Password) } cclog.ComponentDebug(s.name, "Using URI", uri, "Org", s.config.Organization, "Bucket", s.config.Database) + + // Set influxDB client options clientOptions := influxdb2.DefaultOptions() 
// if s.influxRetryInterval != 0 { @@ -82,6 +93,7 @@ func (s *InfluxSink) connect() error { // clientOptions.SetMaxRetries(s.config.InfluxMaxRetries) // } + // Do not check InfluxDB certificate clientOptions.SetTLSConfig( &tls.Config{ InsecureSkipVerify: true, @@ -90,8 +102,11 @@ func (s *InfluxSink) connect() error { clientOptions.SetPrecision(time.Second) + // Create new writeAPI s.client = influxdb2.NewClientWithOptions(uri, auth, clientOptions) s.writeApi = s.client.WriteAPIBlocking(s.config.Organization, s.config.Database) + + // Check InfluxDB server accessibility ok, err := s.client.Ping(context.Background()) if err != nil { return err @@ -103,24 +118,22 @@ func (s *InfluxSink) connect() error { } func (s *InfluxSink) Write(m lp.CCMetric) error { - // err := - // s.writeApi.WritePoint( - // context.Background(), - // m.ToPoint(s.meta_as_tags), - // ) + if len(s.batch) == 0 && s.flushDelay != 0 { // This is the first write since the last flush, start the flushTimer! if s.flushTimer != nil && s.flushTimer.Stop() { cclog.ComponentDebug(s.name, "unexpected: the flushTimer was already running?") } - // Run a batched flush for all lines that have arrived in the last second + // Run a batched flush for all lines that have arrived in the last flush delay interval s.flushTimer = time.AfterFunc(s.flushDelay, func() { if err := s.Flush(); err != nil { cclog.ComponentError(s.name, "flush failed:", err.Error()) } }) } + + // Append metric to batch slice p := m.ToPoint(s.meta_as_tags) s.lock.Lock() s.batch = append(s.batch, p) @@ -131,21 +144,39 @@ func (s *InfluxSink) Write(m lp.CCMetric) error { return s.Flush() } + // Flush if batch size is reached + if len(s.batch) == s.config.BatchSize { + return s.Flush() + } + return nil } +// Flush sends all metrics buffered in batch slice to InfluxDB server func (s *InfluxSink) Flush() error { + + // Lock access to batch slice s.lock.Lock() defer s.lock.Unlock() + + // Nothing to do, batch slice is empty if len(s.batch) == 0 { 
return nil } + + // Send metrics from batch slice err := s.writeApi.WritePoint(context.Background(), s.batch...) if err != nil { cclog.ComponentError(s.name, "flush failed:", err.Error()) return err } + + // Clear batch slice + for i := range s.batch { + s.batch[i] = nil + } s.batch = s.batch[:0] + return nil } @@ -156,11 +187,16 @@ func (s *InfluxSink) Close() { s.client.Close() } +// NewInfluxSink creates a new InfluxDB sink func NewInfluxSink(name string, config json.RawMessage) (Sink, error) { s := new(InfluxSink) s.name = fmt.Sprintf("InfluxSink(%s)", name) + + // Set config default values s.config.BatchSize = 100 s.config.FlushDelay = "1s" + + // Read config if len(config) > 0 { err := json.Unmarshal(config, &s.config) if err != nil { @@ -174,13 +210,22 @@ func NewInfluxSink(name string, config json.RawMessage) (Sink, error) { // s.config.InfluxMaxRetries = 0 // s.config.InfluxExponentialBase = 0 - if len(s.config.Host) == 0 || - len(s.config.Port) == 0 || - len(s.config.Database) == 0 || - len(s.config.Organization) == 0 || - len(s.config.Password) == 0 { - return nil, errors.New("not all configuration variables set required by InfluxSink") + if len(s.config.Host) == 0 { + return nil, errors.New("Missing host configuration required by InfluxSink") } + if len(s.config.Port) == 0 { + return nil, errors.New("Missing port configuration required by InfluxSink") + } + if len(s.config.Database) == 0 { + return nil, errors.New("Missing database configuration required by InfluxSink") + } + if len(s.config.Organization) == 0 { + return nil, errors.New("Missing organization configuration required by InfluxSink") + } + if len(s.config.Password) == 0 { + return nil, errors.New("Missing password configuration required by InfluxSink") + } + // Create lookup map to use meta infos as tags in the output metric s.meta_as_tags = make(map[string]bool) for _, k := range s.config.MetaAsTags { @@ -199,12 +244,15 @@ func NewInfluxSink(name string, config json.RawMessage) (Sink, 
error) { // s.influxRetryInterval = toUint(s.config.InfluxRetryInterval, s.influxRetryInterval) // s.influxMaxRetryTime = toUint(s.config.InfluxMaxRetryTime, s.influxMaxRetryTime) + // Configure flush delay duration if len(s.config.FlushDelay) > 0 { t, err := time.ParseDuration(s.config.FlushDelay) if err == nil { s.flushDelay = t } } + + // allocate batch slice s.batch = make([]*write.Point, 0, s.config.BatchSize) // Connect to InfluxDB server From 54d14519ca538870a275e408796b9abd0da847f1 Mon Sep 17 00:00:00 2001 From: Thomas Roehl Date: Wed, 4 May 2022 11:54:34 +0200 Subject: [PATCH 30/34] Skip mount points in DiskstatCollector if statfs() call does not work (bind mounts, ...) --- collectors/diskstatMetric.go | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/collectors/diskstatMetric.go b/collectors/diskstatMetric.go index 16c70ba..4910c83 100644 --- a/collectors/diskstatMetric.go +++ b/collectors/diskstatMetric.go @@ -3,7 +3,6 @@ package collectors import ( "bufio" "encoding/json" - "fmt" "os" "strings" "syscall" @@ -81,8 +80,7 @@ func (m *DiskstatCollector) Read(interval time.Duration, output chan lp.CCMetric stat := syscall.Statfs_t{} err := syscall.Statfs(path, &stat) if err != nil { - fmt.Println(err.Error()) - return + continue } tags := map[string]string{"type": "node", "device": linefields[0]} total := (stat.Blocks * uint64(stat.Bsize)) / uint64(1000000000) From e098c3317985db393d5d64a65a8d63da4d65469e Mon Sep 17 00:00:00 2001 From: Holger Obermaier <40787752+ho-ob@users.noreply.github.com> Date: Wed, 4 May 2022 12:48:46 +0200 Subject: [PATCH 31/34] Add some golang debug options --- scripts/cc-metric-collector.config | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/scripts/cc-metric-collector.config b/scripts/cc-metric-collector.config index 3535ddf..988b0ff 100644 --- a/scripts/cc-metric-collector.config +++ b/scripts/cc-metric-collector.config @@ -15,3 +15,9 @@ CONF_DIR=/etc/cc-metric-collector 
CONF_FILE=/etc/cc-metric-collector/cc-metric-collector.json RESTART_ON_UPGRADE=true + +# Golang runtime debugging. (see: https://pkg.go.dev/runtime) +# GODEBUG=gctrace=1 + +# Golang garbage collection target percentage +# GOGC=100 From ee4bd558f1ba0a6515793085c842f05207d83ad4 Mon Sep 17 00:00:00 2001 From: Holger Obermaier <40787752+ho-ob@users.noreply.github.com> Date: Fri, 6 May 2022 11:44:57 +0200 Subject: [PATCH 32/34] Cleanup: Remove unused code --- sinks/influxAsyncSink.go | 18 ++++--- sinks/influxSink.go | 102 ++++++++++++--------------------------- 2 files changed, 41 insertions(+), 79 deletions(-) diff --git a/sinks/influxAsyncSink.go b/sinks/influxAsyncSink.go index 31d127d..bf88079 100644 --- a/sinks/influxAsyncSink.go +++ b/sinks/influxAsyncSink.go @@ -25,7 +25,6 @@ type InfluxAsyncSinkConfig struct { Password string `json:"password,omitempty"` Organization string `json:"organization,omitempty"` SSL bool `json:"ssl,omitempty"` - RetentionPol string `json:"retention_policy,omitempty"` // Maximum number of points sent to server in single request. Default 5000 BatchSize uint `json:"batch_size,omitempty"` // Interval, in ms, in which is buffer flushed if it has not been already written (by reaching batch size) . 
Default 1000ms @@ -186,12 +185,17 @@ func NewInfluxAsyncSink(name string, config json.RawMessage) (Sink, error) { return nil, err } } - if len(s.config.Host) == 0 || - len(s.config.Port) == 0 || - len(s.config.Database) == 0 || - len(s.config.Organization) == 0 || - len(s.config.Password) == 0 { - return nil, errors.New("not all configuration variables set required by InfluxAsyncSink") + if len(s.config.Port) == 0 { + return nil, errors.New("Missing port configuration required by InfluxSink") + } + if len(s.config.Database) == 0 { + return nil, errors.New("Missing database configuration required by InfluxSink") + } + if len(s.config.Organization) == 0 { + return nil, errors.New("Missing organization configuration required by InfluxSink") + } + if len(s.config.Password) == 0 { + return nil, errors.New("Missing password configuration required by InfluxSink") } // Create lookup map to use meta infos as tags in the output metric s.meta_as_tags = make(map[string]bool) diff --git a/sinks/influxSink.go b/sinks/influxSink.go index b382c38..212647d 100644 --- a/sinks/influxSink.go +++ b/sinks/influxSink.go @@ -16,37 +16,28 @@ import ( "github.com/influxdata/influxdb-client-go/v2/api/write" ) -type InfluxSinkConfig struct { - defaultSinkConfig - Host string `json:"host,omitempty"` - Port string `json:"port,omitempty"` - Database string `json:"database,omitempty"` - User string `json:"user,omitempty"` - Password string `json:"password,omitempty"` - Organization string `json:"organization,omitempty"` - SSL bool `json:"ssl,omitempty"` - FlushDelay string `json:"flush_delay,omitempty"` - BatchSize int `json:"batch_size,omitempty"` - RetentionPol string `json:"retention_policy,omitempty"` - // InfluxRetryInterval string `json:"retry_interval"` - // InfluxExponentialBase uint `json:"retry_exponential_base"` - // InfluxMaxRetries uint `json:"max_retries"` - // InfluxMaxRetryTime string `json:"max_retry_time"` - //InfluxMaxRetryDelay string `json:"max_retry_delay"` // It is 
mentioned in the docs but there is no way to set it -} - type InfluxSink struct { sink - client influxdb2.Client - writeApi influxdb2Api.WriteAPIBlocking - config InfluxSinkConfig - influxRetryInterval uint - influxMaxRetryTime uint - batch []*write.Point - flushTimer *time.Timer - flushDelay time.Duration - lock sync.Mutex // Flush() runs in another goroutine, so this lock has to protect the buffer - //influxMaxRetryDelay uint + client influxdb2.Client + writeApi influxdb2Api.WriteAPIBlocking + config struct { + defaultSinkConfig + Host string `json:"host,omitempty"` + Port string `json:"port,omitempty"` + Database string `json:"database,omitempty"` + User string `json:"user,omitempty"` + Password string `json:"password,omitempty"` + Organization string `json:"organization,omitempty"` + SSL bool `json:"ssl,omitempty"` + // Maximum number of points sent to the server in a single request. Default 100 + BatchSize int `json:"batch_size,omitempty"` + // Interval in which the buffer is flushed if it has not already been written (by reaching the batch size). 
Default 1s + FlushInterval string `json:"flush_delay,omitempty"` + } + batch []*write.Point + flushTimer *time.Timer + flushDelay time.Duration + lock sync.Mutex // Flush() runs in another goroutine, so this lock has to protect the buffer } // connect connects to the InfluxDB server @@ -76,23 +67,6 @@ func (s *InfluxSink) connect() error { // Set influxDB client options clientOptions := influxdb2.DefaultOptions() - // if s.influxRetryInterval != 0 { - // cclog.ComponentDebug(s.name, "MaxRetryInterval", s.influxRetryInterval) - // clientOptions.SetMaxRetryInterval(s.influxRetryInterval) - // } - // if s.influxMaxRetryTime != 0 { - // cclog.ComponentDebug(s.name, "MaxRetryTime", s.influxMaxRetryTime) - // clientOptions.SetMaxRetryTime(s.influxMaxRetryTime) - // } - // if s.config.InfluxExponentialBase != 0 { - // cclog.ComponentDebug(s.name, "Exponential Base", s.config.InfluxExponentialBase) - // clientOptions.SetExponentialBase(s.config.InfluxExponentialBase) - // } - // if s.config.InfluxMaxRetries != 0 { - // cclog.ComponentDebug(s.name, "Max Retries", s.config.InfluxMaxRetries) - // clientOptions.SetMaxRetries(s.config.InfluxMaxRetries) - // } - // Do not check InfluxDB certificate clientOptions.SetTLSConfig( &tls.Config{ @@ -126,11 +100,13 @@ func (s *InfluxSink) Write(m lp.CCMetric) error { } // Run a batched flush for all lines that have arrived in the last flush delay interval - s.flushTimer = time.AfterFunc(s.flushDelay, func() { - if err := s.Flush(); err != nil { - cclog.ComponentError(s.name, "flush failed:", err.Error()) - } - }) + s.flushTimer = time.AfterFunc( + s.flushDelay, + func() { + if err := s.Flush(); err != nil { + cclog.ComponentError(s.name, "flush failed:", err.Error()) + } + }) } // Append metric to batch slice @@ -194,7 +170,7 @@ func NewInfluxSink(name string, config json.RawMessage) (Sink, error) { // Set config default values s.config.BatchSize = 100 - s.config.FlushDelay = "1s" + s.config.FlushInterval = "1s" // Read config if 
len(config) > 0 { @@ -203,12 +179,6 @@ func NewInfluxSink(name string, config json.RawMessage) (Sink, error) { return nil, err } } - s.influxRetryInterval = 0 - s.influxMaxRetryTime = 0 - // s.config.InfluxRetryInterval = "" - // s.config.InfluxMaxRetryTime = "" - // s.config.InfluxMaxRetries = 0 - // s.config.InfluxExponentialBase = 0 if len(s.config.Host) == 0 { return nil, errors.New("Missing host configuration required by InfluxSink") @@ -232,21 +202,9 @@ func NewInfluxSink(name string, config json.RawMessage) (Sink, error) { s.meta_as_tags[k] = true } - // toUint := func(duration string, def uint) uint { - // if len(duration) > 0 { - // t, err := time.ParseDuration(duration) - // if err == nil { - // return uint(t.Milliseconds()) - // } - // } - // return def - // } - // s.influxRetryInterval = toUint(s.config.InfluxRetryInterval, s.influxRetryInterval) - // s.influxMaxRetryTime = toUint(s.config.InfluxMaxRetryTime, s.influxMaxRetryTime) - // Configure flush delay duration - if len(s.config.FlushDelay) > 0 { - t, err := time.ParseDuration(s.config.FlushDelay) + if len(s.config.FlushInterval) > 0 { + t, err := time.ParseDuration(s.config.FlushInterval) if err == nil { s.flushDelay = t } From 8abedac0fee453bccad55b98d40c46a0bf02ca6c Mon Sep 17 00:00:00 2001 From: Thomas Roehl Date: Fri, 13 May 2022 12:33:33 +0200 Subject: [PATCH 33/34] Use Golang duration parser for 'interval' and 'duration' in main config --- .github/ci-config.json | 4 ++-- cc-metric-collector.go | 42 +++++++++++++++++++++++++++++++----------- config.json | 12 ++++++------ 3 files changed, 39 insertions(+), 19 deletions(-) diff --git a/.github/ci-config.json b/.github/ci-config.json index 15b2e6f..1c4ba97 100644 --- a/.github/ci-config.json +++ b/.github/ci-config.json @@ -3,6 +3,6 @@ "collectors" : ".github/ci-collectors.json", "receivers" : ".github/ci-receivers.json", "router" : ".github/ci-router.json", - "interval": 5, - "duration": 1 + "interval": "5s", + "duration": "1s" } diff --git 
a/cc-metric-collector.go b/cc-metric-collector.go index e6388df..6e1f705 100644 --- a/cc-metric-collector.go +++ b/cc-metric-collector.go @@ -22,8 +22,8 @@ import ( ) type CentralConfigFile struct { - Interval int `json:"interval"` - Duration int `json:"duration"` + Interval string `json:"interval"` + Duration string `json:"duration"` CollectorConfigFile string `json:"collectors"` RouterConfigFile string `json:"router"` SinkConfigFile string `json:"sinks"` @@ -173,16 +173,36 @@ func mainFunc() int { cclog.Error("Error reading configuration file ", rcfg.CliArgs["configfile"], ": ", err.Error()) return 1 } - if rcfg.ConfigFile.Interval <= 0 || time.Duration(rcfg.ConfigFile.Interval)*time.Second <= 0 { - cclog.Error("Configuration value 'interval' must be greater than zero") + + // Properly use duration parser with inputs like '60s', '5m' or similar + if len(rcfg.ConfigFile.Interval) > 0 { + t, err := time.ParseDuration(rcfg.ConfigFile.Interval) + if err != nil { + cclog.Error("Configuration value 'interval' no valid duration") + } + rcfg.Interval = t + if rcfg.Interval == 0 { + cclog.Error("Configuration value 'interval' must be greater than zero") + return 1 + } + } + + // Properly use duration parser with inputs like '60s', '5m' or similar + if len(rcfg.ConfigFile.Duration) > 0 { + t, err := time.ParseDuration(rcfg.ConfigFile.Duration) + if err != nil { + cclog.Error("Configuration value 'duration' no valid duration") + } + rcfg.Duration = t + if rcfg.Duration == 0 { + cclog.Error("Configuration value 'duration' must be greater than zero") + return 1 + } + } + if rcfg.Duration > rcfg.Interval { + cclog.Error("The interval should be greater than duration") return 1 } - rcfg.Interval = time.Duration(rcfg.ConfigFile.Interval) * time.Second - if rcfg.ConfigFile.Duration <= 0 || time.Duration(rcfg.ConfigFile.Duration)*time.Second <= 0 { - cclog.Error("Configuration value 'duration' must be greater than zero") - return 1 - } - rcfg.Duration = 
time.Duration(rcfg.ConfigFile.Duration) * time.Second if len(rcfg.ConfigFile.RouterConfigFile) == 0 { cclog.Error("Metric router configuration file must be set") @@ -271,7 +291,7 @@ func mainFunc() int { // Wait until one tick has passed. This is a workaround if rcfg.CliArgs["once"] == "true" { - x := 1.2 * float64(rcfg.ConfigFile.Interval) + x := 1.2 * float64(rcfg.Interval) time.Sleep(time.Duration(int(x)) * time.Second) shutdownSignal <- os.Interrupt } diff --git a/config.json b/config.json index 52f9df1..924bec7 100644 --- a/config.json +++ b/config.json @@ -1,8 +1,8 @@ { - "sinks": "sinks.json", - "collectors" : "collectors.json", - "receivers" : "receivers.json", - "router" : "router.json", - "interval": 10, - "duration": 1 + "sinks": "./sinks.json", + "collectors" : "./collectors.json", + "receivers" : "./receivers.json", + "router" : "./router.json", + "interval": "10s", + "duration": "1s" } From 8068e598188d57cac6e94d6c4936eed03d9d5d3b Mon Sep 17 00:00:00 2001 From: Thomas Roehl Date: Fri, 13 May 2022 13:14:47 +0200 Subject: [PATCH 34/34] Update handling of LIKWID headers. Download only if not already present in the system. 
Fixes #73 --- collectors/Makefile | 36 +++++++++++++++++++++--------------- 1 file changed, 21 insertions(+), 15 deletions(-) diff --git a/collectors/Makefile b/collectors/Makefile index b07bccd..20418ed 100644 --- a/collectors/Makefile +++ b/collectors/Makefile @@ -1,22 +1,28 @@ - -all: likwid - - # LIKWID version LIKWID_VERSION = 5.2.1 +LIKWID_INSTALLED_FOLDER=$(shell dirname $(shell which likwid-topology 2>/dev/null)) + +LIKWID_FOLDER="$(shell pwd)/likwid" + +all: $(LIKWID_FOLDER)/likwid.h .ONESHELL: -.PHONY: likwid -likwid: - INSTALL_FOLDER="$${PWD}/likwid" - BUILD_FOLDER="$${PWD}/likwidbuild" - if [ -d $${INSTALL_FOLDER} ]; then rm -r $${INSTALL_FOLDER}; fi - mkdir --parents --verbose $${INSTALL_FOLDER} $${BUILD_FOLDER} - wget -P "$${BUILD_FOLDER}" ftp://ftp.rrze.uni-erlangen.de/mirrors/likwid/likwid-$(LIKWID_VERSION).tar.gz - tar -C $${BUILD_FOLDER} -xf $${BUILD_FOLDER}/likwid-$(LIKWID_VERSION).tar.gz - install -Dpm 0644 $${BUILD_FOLDER}/likwid-$(LIKWID_VERSION)/src/includes/likwid*.h $${INSTALL_FOLDER}/ - install -Dpm 0644 $${BUILD_FOLDER}/likwid-$(LIKWID_VERSION)/src/includes/bstrlib.h $${INSTALL_FOLDER}/ - rm -r $${BUILD_FOLDER} +.PHONY: $(LIKWID_FOLDER)/likwid.h +$(LIKWID_FOLDER)/likwid.h: + if [ "$(LIKWID_INSTALLED_FOLDER)" != "" ]; then \ + BASE="$(LIKWID_INSTALLED_FOLDER)/../include"; \ + mkdir -p $(LIKWID_FOLDER); \ + cp $$BASE/*.h $(LIKWID_FOLDER); \ + else \ + BUILD_FOLDER="$${PWD}/likwidbuild"; \ + if [ -d $(LIKWID_FOLDER) ]; then rm -r $(LIKWID_FOLDER); fi; \ + mkdir --parents --verbose $(LIKWID_FOLDER) $${BUILD_FOLDER}; \ + wget -P "$${BUILD_FOLDER}" ftp://ftp.rrze.uni-erlangen.de/mirrors/likwid/likwid-$(LIKWID_VERSION).tar.gz; \ + tar -C $${BUILD_FOLDER} -xf $${BUILD_FOLDER}/likwid-$(LIKWID_VERSION).tar.gz; \ + install -Dpm 0644 $${BUILD_FOLDER}/likwid-$(LIKWID_VERSION)/src/includes/likwid*.h $(LIKWID_FOLDER)/; \ + install -Dpm 0644 $${BUILD_FOLDER}/likwid-$(LIKWID_VERSION)/src/includes/bstrlib.h $(LIKWID_FOLDER)/; \ + rm -r $${BUILD_FOLDER}; \ 
+ fi clean: