diff --git a/.github/ci-config.json b/.github/ci-config.json index 15b2e6f..1c4ba97 100644 --- a/.github/ci-config.json +++ b/.github/ci-config.json @@ -3,6 +3,6 @@ "collectors" : ".github/ci-collectors.json", "receivers" : ".github/ci-receivers.json", "router" : ".github/ci-router.json", - "interval": 5, - "duration": 1 + "interval": "5s", + "duration": "1s" } diff --git a/cc-metric-collector.go b/cc-metric-collector.go index e6388df..6e1f705 100644 --- a/cc-metric-collector.go +++ b/cc-metric-collector.go @@ -22,8 +22,8 @@ import ( ) type CentralConfigFile struct { - Interval int `json:"interval"` - Duration int `json:"duration"` + Interval string `json:"interval"` + Duration string `json:"duration"` CollectorConfigFile string `json:"collectors"` RouterConfigFile string `json:"router"` SinkConfigFile string `json:"sinks"` @@ -173,16 +173,36 @@ func mainFunc() int { cclog.Error("Error reading configuration file ", rcfg.CliArgs["configfile"], ": ", err.Error()) return 1 } - if rcfg.ConfigFile.Interval <= 0 || time.Duration(rcfg.ConfigFile.Interval)*time.Second <= 0 { - cclog.Error("Configuration value 'interval' must be greater than zero") + + // Properly use duration parser with inputs like '60s', '5m' or similar + if len(rcfg.ConfigFile.Interval) > 0 { + t, err := time.ParseDuration(rcfg.ConfigFile.Interval) + if err != nil { + cclog.Error("Configuration value 'interval' no valid duration") + } + rcfg.Interval = t + if rcfg.Interval <= 0 { + cclog.Error("Configuration value 'interval' must be greater than zero") + return 1 + } + } + + // Properly use duration parser with inputs like '60s', '5m' or similar + if len(rcfg.ConfigFile.Duration) > 0 { + t, err := time.ParseDuration(rcfg.ConfigFile.Duration) + if err != nil { + cclog.Error("Configuration value 'duration' no valid duration") + } + rcfg.Duration = t + if rcfg.Duration <= 0 { + cclog.Error("Configuration value 'duration' must be greater than zero") + return 1 + } + } + if rcfg.Duration > rcfg.Interval { + 
cclog.Error("The interval should be greater than duration") return 1 } - rcfg.Interval = time.Duration(rcfg.ConfigFile.Interval) * time.Second - if rcfg.ConfigFile.Duration <= 0 || time.Duration(rcfg.ConfigFile.Duration)*time.Second <= 0 { - cclog.Error("Configuration value 'duration' must be greater than zero") - return 1 - } - rcfg.Duration = time.Duration(rcfg.ConfigFile.Duration) * time.Second if len(rcfg.ConfigFile.RouterConfigFile) == 0 { cclog.Error("Metric router configuration file must be set") @@ -271,7 +291,7 @@ func mainFunc() int { // Wait until one tick has passed. This is a workaround if rcfg.CliArgs["once"] == "true" { - x := 1.2 * float64(rcfg.ConfigFile.Interval) + x := 1.2 * rcfg.Interval.Seconds() time.Sleep(time.Duration(int(x)) * time.Second) shutdownSignal <- os.Interrupt } diff --git a/collectors/Makefile b/collectors/Makefile index b07bccd..20418ed 100644 --- a/collectors/Makefile +++ b/collectors/Makefile @@ -1,22 +1,28 @@ - -all: likwid - - # LIKWID version LIKWID_VERSION = 5.2.1 +LIKWID_INSTALLED_FOLDER=$(shell dirname $(shell which likwid-topology 2>/dev/null)) + +LIKWID_FOLDER="$(shell pwd)/likwid" + +all: $(LIKWID_FOLDER)/likwid.h .ONESHELL: -.PHONY: likwid -likwid: - INSTALL_FOLDER="$${PWD}/likwid" - BUILD_FOLDER="$${PWD}/likwidbuild" - if [ -d $${INSTALL_FOLDER} ]; then rm -r $${INSTALL_FOLDER}; fi - mkdir --parents --verbose $${INSTALL_FOLDER} $${BUILD_FOLDER} - wget -P "$${BUILD_FOLDER}" ftp://ftp.rrze.uni-erlangen.de/mirrors/likwid/likwid-$(LIKWID_VERSION).tar.gz - tar -C $${BUILD_FOLDER} -xf $${BUILD_FOLDER}/likwid-$(LIKWID_VERSION).tar.gz - install -Dpm 0644 $${BUILD_FOLDER}/likwid-$(LIKWID_VERSION)/src/includes/likwid*.h $${INSTALL_FOLDER}/ - install -Dpm 0644 $${BUILD_FOLDER}/likwid-$(LIKWID_VERSION)/src/includes/bstrlib.h $${INSTALL_FOLDER}/ - rm -r $${BUILD_FOLDER} +.PHONY: $(LIKWID_FOLDER)/likwid.h +$(LIKWID_FOLDER)/likwid.h: + if [ "$(LIKWID_INSTALLED_FOLDER)" != "" ]; then \ + BASE="$(LIKWID_INSTALLED_FOLDER)/../include"; 
\ + mkdir -p $(LIKWID_FOLDER); \ + cp $$BASE/*.h $(LIKWID_FOLDER); \ + else \ + BUILD_FOLDER="$${PWD}/likwidbuild"; \ + if [ -d $(LIKWID_FOLDER) ]; then rm -r $(LIKWID_FOLDER); fi; \ + mkdir --parents --verbose $(LIKWID_FOLDER) $${BUILD_FOLDER}; \ + wget -P "$${BUILD_FOLDER}" ftp://ftp.rrze.uni-erlangen.de/mirrors/likwid/likwid-$(LIKWID_VERSION).tar.gz; \ + tar -C $${BUILD_FOLDER} -xf $${BUILD_FOLDER}/likwid-$(LIKWID_VERSION).tar.gz; \ + install -Dpm 0644 $${BUILD_FOLDER}/likwid-$(LIKWID_VERSION)/src/includes/likwid*.h $(LIKWID_FOLDER)/; \ + install -Dpm 0644 $${BUILD_FOLDER}/likwid-$(LIKWID_VERSION)/src/includes/bstrlib.h $(LIKWID_FOLDER)/; \ + rm -r $${BUILD_FOLDER}; \ + fi clean: diff --git a/collectors/diskstatMetric.go b/collectors/diskstatMetric.go index 16c70ba..4910c83 100644 --- a/collectors/diskstatMetric.go +++ b/collectors/diskstatMetric.go @@ -3,7 +3,6 @@ package collectors import ( "bufio" "encoding/json" - "fmt" "os" "strings" "syscall" @@ -81,8 +80,7 @@ func (m *DiskstatCollector) Read(interval time.Duration, output chan lp.CCMetric stat := syscall.Statfs_t{} err := syscall.Statfs(path, &stat) if err != nil { - fmt.Println(err.Error()) - return + continue } tags := map[string]string{"type": "node", "device": linefields[0]} total := (stat.Blocks * uint64(stat.Bsize)) / uint64(1000000000) diff --git a/collectors/gpfsMetric.go b/collectors/gpfsMetric.go index 26fc723..ed63201 100644 --- a/collectors/gpfsMetric.go +++ b/collectors/gpfsMetric.go @@ -70,6 +70,7 @@ func (m *GpfsCollector) Init(config json.RawMessage) error { for _, fs := range m.config.ExcludeFilesystem { m.skipFS[fs] = struct{}{} } + m.lastState = make(map[string]GpfsCollectorLastState) // GPFS / IBM Spectrum Scale file system statistics can only be queried by user root user, err := user.Current() @@ -162,11 +163,16 @@ func (m *GpfsCollector) Read(interval time.Duration, output chan lp.CCMetric) { continue } + // Add filesystem tag m.tags["filesystem"] = filesystem - if _, ok := 
m.lastState[filesystem]; !ok { - m.lastState[filesystem] = GpfsCollectorLastState{ - bytesRead: -1, - bytesWritten: -1, + + // Create initial last state + if m.config.SendBandwidths { + if _, ok := m.lastState[filesystem]; !ok { + m.lastState[filesystem] = GpfsCollectorLastState{ + bytesRead: -1, + bytesWritten: -1, + } } } diff --git a/collectors/infinibandMetric.go b/collectors/infinibandMetric.go index 5be095d..274e669 100644 --- a/collectors/infinibandMetric.go +++ b/collectors/infinibandMetric.go @@ -18,13 +18,18 @@ import ( const IB_BASEPATH = "/sys/class/infiniband/" +type InfinibandCollectorMetric struct { + path string + unit string +} + type InfinibandCollectorInfo struct { - LID string // IB local Identifier (LID) - device string // IB device - port string // IB device port - portCounterFiles map[string]string // mapping counter name -> sysfs file - tagSet map[string]string // corresponding tag list - lastState map[string]int64 // State from last measurement + LID string // IB local Identifier (LID) + device string // IB device + port string // IB device port + portCounterFiles map[string]InfinibandCollectorMetric // mapping counter name -> InfinibandCollectorMetric + tagSet map[string]string // corresponding tag list + lastState map[string]int64 // State from last measurement } type InfinibandCollector struct { @@ -106,16 +111,16 @@ func (m *InfinibandCollector) Init(config json.RawMessage) error { // Check access to counter files countersDir := filepath.Join(path, "counters") - portCounterFiles := map[string]string{ - "ib_recv": filepath.Join(countersDir, "port_rcv_data"), - "ib_xmit": filepath.Join(countersDir, "port_xmit_data"), - "ib_recv_pkts": filepath.Join(countersDir, "port_rcv_packets"), - "ib_xmit_pkts": filepath.Join(countersDir, "port_xmit_packets"), + portCounterFiles := map[string]InfinibandCollectorMetric{ + "ib_recv": {path: filepath.Join(countersDir, "port_rcv_data"), unit: "bytes"}, + "ib_xmit": {path: filepath.Join(countersDir, 
"port_xmit_data"), unit: "bytes"}, + "ib_recv_pkts": {path: filepath.Join(countersDir, "port_rcv_packets"), unit: "packets"}, + "ib_xmit_pkts": {path: filepath.Join(countersDir, "port_xmit_packets"), unit: "packets"}, } - for _, counterFile := range portCounterFiles { - err := unix.Access(counterFile, unix.R_OK) + for _, counter := range portCounterFiles { + err := unix.Access(counter.path, unix.R_OK) if err != nil { - return fmt.Errorf("unable to access %s: %v", counterFile, err) + return fmt.Errorf("unable to access %s: %v", counter.path, err) } } @@ -165,14 +170,14 @@ func (m *InfinibandCollector) Read(interval time.Duration, output chan lp.CCMetr m.lastTimestamp = now for _, info := range m.info { - for counterName, counterFile := range info.portCounterFiles { + for counterName, counterDef := range info.portCounterFiles { // Read counter file - line, err := ioutil.ReadFile(counterFile) + line, err := ioutil.ReadFile(counterDef.path) if err != nil { cclog.ComponentError( m.name, - fmt.Sprintf("Read(): Failed to read from file '%s': %v", counterFile, err)) + fmt.Sprintf("Read(): Failed to read from file '%s': %v", counterDef.path, err)) continue } data := strings.TrimSpace(string(line)) @@ -189,6 +194,7 @@ func (m *InfinibandCollector) Read(interval time.Duration, output chan lp.CCMetr // Send absolut values if m.config.SendAbsoluteValues { if y, err := lp.New(counterName, info.tagSet, m.meta, map[string]interface{}{"value": v}, now); err == nil { + y.AddMeta("unit", counterDef.unit) output <- y } } @@ -198,6 +204,7 @@ func (m *InfinibandCollector) Read(interval time.Duration, output chan lp.CCMetr if info.lastState[counterName] >= 0 { rate := float64((v - info.lastState[counterName])) / timeDiff if y, err := lp.New(counterName+"_bw", info.tagSet, m.meta, map[string]interface{}{"value": rate}, now); err == nil { + y.AddMeta("unit", counterDef.unit+"/sec") output <- y } } diff --git a/collectors/likwidMetric.go b/collectors/likwidMetric.go index 85bd932..f2229d1 
100644 --- a/collectors/likwidMetric.go +++ b/collectors/likwidMetric.go @@ -15,8 +15,12 @@ import ( "io/ioutil" "math" "os" + "os/signal" + "sort" "strconv" "strings" + "sync" + "syscall" "time" "unsafe" @@ -46,6 +50,16 @@ type LikwidCollectorEventsetConfig struct { Metrics []LikwidCollectorMetricConfig `json:"metrics"` } +type LikwidEventsetConfig struct { + internal int + gid C.int + eorder []*C.char + estr *C.char + go_estr string + results map[int]map[string]interface{} + metrics map[int]map[string]float64 +} + type LikwidCollectorConfig struct { Eventsets []LikwidCollectorEventsetConfig `json:"eventsets"` Metrics []LikwidCollectorMetricConfig `json:"globalmetrics,omitempty"` @@ -58,17 +72,18 @@ type LikwidCollectorConfig struct { type LikwidCollector struct { metricCollector - cpulist []C.int - cpu2tid map[int]int - sock2tid map[int]int - metrics map[C.int]map[string]int - groups []C.int - config LikwidCollectorConfig - results map[int]map[int]map[string]interface{} - mresults map[int]map[int]map[string]float64 - gmresults map[int]map[string]float64 - basefreq float64 - running bool + cpulist []C.int + cpu2tid map[int]int + sock2tid map[int]int + metrics map[C.int]map[string]int + groups []C.int + config LikwidCollectorConfig + gmresults map[int]map[string]float64 + basefreq float64 + running bool + initialized bool + likwidGroups map[C.int]LikwidEventsetConfig + lock sync.Mutex } type LikwidMetric struct { @@ -86,14 +101,60 @@ func eventsToEventStr(events map[string]string) string { return strings.Join(elist, ",") } +func genLikwidEventSet(input LikwidCollectorEventsetConfig) LikwidEventsetConfig { + tmplist := make([]string, 0) + clist := make([]string, 0) + for k := range input.Events { + clist = append(clist, k) + } + sort.Strings(clist) + elist := make([]*C.char, 0) + for _, k := range clist { + v := input.Events[k] + tmplist = append(tmplist, fmt.Sprintf("%s:%s", v, k)) + c_counter := C.CString(k) + elist = append(elist, c_counter) + } + estr := 
strings.Join(tmplist, ",") + res := make(map[int]map[string]interface{}) + met := make(map[int]map[string]float64) + for _, i := range topo.CpuList() { + res[i] = make(map[string]interface{}) + for k := range input.Events { + res[i][k] = 0.0 + } + met[i] = make(map[string]float64) + for _, v := range input.Metrics { + res[i][v.Name] = 0.0 + } + } + return LikwidEventsetConfig{ + gid: -1, + eorder: elist, + estr: C.CString(estr), + go_estr: estr, + results: res, + metrics: met, + } +} + +func testLikwidMetricFormula(formula string, params []string) bool { + myparams := make(map[string]interface{}) + for _, p := range params { + myparams[p] = float64(1.0) + } + _, err := agg.EvalFloat64Condition(formula, myparams) + return err == nil +} + func getBaseFreq() float64 { + files := []string{ + "/sys/devices/system/cpu/cpu0/cpufreq/bios_limit", + "/sys/devices/system/cpu/cpu0/cpufreq/base_frequency", + } var freq float64 = math.NaN() - C.power_init(0) - info := C.get_powerInfo() - if float64(info.baseFrequency) != 0 { - freq = float64(info.baseFrequency) * 1e6 - } else { - buffer, err := ioutil.ReadFile("/sys/devices/system/cpu/cpu0/cpufreq/bios_limit") + for _, f := range files { + buffer, err := ioutil.ReadFile(f) if err == nil { data := strings.Replace(string(buffer), "\n", "", -1) x, err := strconv.ParseInt(data, 0, 64) @@ -102,12 +163,22 @@ func getBaseFreq() float64 { } } } + + if math.IsNaN(freq) { + C.power_init(0) + info := C.get_powerInfo() + if float64(info.baseFrequency) != 0 { + freq = float64(info.baseFrequency) * 1e6 + } + C.power_finalize() + } return freq } func (m *LikwidCollector) Init(config json.RawMessage) error { - var ret C.int m.name = "LikwidCollector" + m.initialized = false + m.running = false m.config.AccessMode = LIKWID_DEF_ACCESSMODE m.config.LibraryPath = LIKWID_LIB_NAME if len(config) > 0 { @@ -131,7 +202,7 @@ func (m *LikwidCollector) Init(config json.RawMessage) error { } m.setup() - m.meta = map[string]string{"source": m.name, "group": 
"PerfCounter"} + m.meta = map[string]string{"group": "PerfCounter"} cclog.ComponentDebug(m.name, "Get cpulist and init maps and lists") cpulist := topo.CpuList() m.cpulist = make([]C.int, len(cpulist)) @@ -140,172 +211,136 @@ func (m *LikwidCollector) Init(config json.RawMessage) error { m.cpulist[i] = C.int(c) m.cpu2tid[c] = i } - m.sock2tid = make(map[int]int) - tmp := make([]C.int, 1) - for _, sid := range topo.SocketList() { - cstr := C.CString(fmt.Sprintf("S%d:0", sid)) - ret = C.cpustr_to_cpulist(cstr, &tmp[0], 1) - if ret > 0 { - m.sock2tid[sid] = m.cpu2tid[int(tmp[0])] - } - C.free(unsafe.Pointer(cstr)) - } - m.results = make(map[int]map[int]map[string]interface{}) - m.mresults = make(map[int]map[int]map[string]float64) + + m.likwidGroups = make(map[C.int]LikwidEventsetConfig) + + // m.results = make(map[int]map[int]map[string]interface{}) + // m.mresults = make(map[int]map[int]map[string]float64) m.gmresults = make(map[int]map[string]float64) - cclog.ComponentDebug(m.name, "initialize LIKWID topology") - ret = C.topology_init() - if ret != 0 { - err := errors.New("failed to initialize LIKWID topology") - cclog.ComponentError(m.name, err.Error()) - return err - } - - switch m.config.AccessMode { - case "direct": - C.HPMmode(0) - case "accessdaemon": - if len(m.config.DaemonPath) > 0 { - p := os.Getenv("PATH") - os.Setenv("PATH", m.config.DaemonPath+":"+p) - } - C.HPMmode(1) - } - - cclog.ComponentDebug(m.name, "initialize LIKWID perfmon module") - ret = C.perfmon_init(C.int(len(m.cpulist)), &m.cpulist[0]) - if ret != 0 { - C.topology_finalize() - err := errors.New("failed to initialize LIKWID topology") - cclog.ComponentError(m.name, err.Error()) - return err + for _, tid := range m.cpu2tid { + m.gmresults[tid] = make(map[string]float64) } // This is for the global metrics computation test - globalParams := make(map[string]interface{}) - globalParams["time"] = float64(1.0) - globalParams["inverseClock"] = float64(1.0) - // While adding the events, we test 
the metrics whether they can be computed at all - for i, evset := range m.config.Eventsets { - var gid C.int - var cstr *C.char + totalMetrics := 0 + // Generate parameter list for the metric computing test + params := make([]string, 0) + params = append(params, "time", "inverseClock") + // Generate parameter list for the global metric computing test + globalParams := make([]string, 0) + globalParams = append(globalParams, "time", "inverseClock") + // We test the eventset metrics whether they can be computed at all + for _, evset := range m.config.Eventsets { if len(evset.Events) > 0 { - estr := eventsToEventStr(evset.Events) - // Generate parameter list for the metric computing test - params := make(map[string]interface{}) - params["time"] = float64(1.0) - params["inverseClock"] = float64(1.0) + params = params[:2] for counter := range evset.Events { - params[counter] = float64(1.0) + params = append(params, counter) } for _, metric := range evset.Metrics { // Try to evaluate the metric - _, err := agg.EvalFloat64Condition(metric.Calc, params) - if err != nil { - cclog.ComponentError(m.name, "Calculation for metric", metric.Name, "failed:", err.Error()) - continue - } - // If the metric is not in the parameter list for the global metrics, add it - if _, ok := globalParams[metric.Name]; !ok { - globalParams[metric.Name] = float64(1.0) + if testLikwidMetricFormula(metric.Calc, params) { + // Add the computable metric to the parameter list for the global metrics + globalParams = append(globalParams, metric.Name) + totalMetrics++ + } else { + metric.Calc = "" } } - // Now we add the list of events to likwid - cstr = C.CString(estr) - gid = C.perfmon_addEventSet(cstr) } else { cclog.ComponentError(m.name, "Invalid Likwid eventset config, no events given") continue } - if gid >= 0 { - m.groups = append(m.groups, gid) - } - C.free(unsafe.Pointer(cstr)) - m.results[i] = make(map[int]map[string]interface{}) - m.mresults[i] = make(map[int]map[string]float64) - for tid := 
range m.cpulist { - m.results[i][tid] = make(map[string]interface{}) - m.mresults[i][tid] = make(map[string]float64) - if i == 0 { - m.gmresults[tid] = make(map[string]float64) - } - } } for _, metric := range m.config.Metrics { // Try to evaluate the global metric - _, err := agg.EvalFloat64Condition(metric.Calc, globalParams) - if err != nil { - cclog.ComponentError(m.name, "Calculation for metric", metric.Name, "failed:", err.Error()) - continue + if !testLikwidMetricFormula(metric.Calc, globalParams) { + cclog.ComponentError(m.name, "Calculation for metric", metric.Name, "failed") + metric.Calc = "" + } else { + totalMetrics++ } } // If no event set could be added, shut down LikwidCollector - if len(m.groups) == 0 { - C.perfmon_finalize() - C.topology_finalize() - err := errors.New("no LIKWID performance group initialized") + if totalMetrics == 0 { + err := errors.New("no LIKWID eventset or metric usable") cclog.ComponentError(m.name, err.Error()) return err } - m.basefreq = getBaseFreq() - cclog.ComponentDebug(m.name, "BaseFreq", m.basefreq) m.init = true return nil } // take a measurement for 'interval' seconds of event set index 'group' -func (m *LikwidCollector) takeMeasurement(group int, interval time.Duration) error { +func (m *LikwidCollector) takeMeasurement(evset LikwidEventsetConfig, interval time.Duration) (bool, error) { var ret C.int - gid := m.groups[group] - ret = C.perfmon_setupCounters(gid) - if ret != 0 { - gctr := C.GoString(C.perfmon_getGroupName(gid)) - err := fmt.Errorf("failed to setup performance group %d (%s)", gid, gctr) - return err + m.lock.Lock() + if m.initialized { + ret = C.perfmon_setupCounters(evset.gid) + if ret != 0 { + var err error = nil + var skip bool = false + if ret == -37 { + skip = true + } else { + err = fmt.Errorf("failed to setup performance group %d", evset.gid) + } + m.lock.Unlock() + return skip, err + } + ret = C.perfmon_startCounters() + if ret != 0 { + var err error = nil + var skip bool = false + if ret == 
-37 { + skip = true + } else { + err = fmt.Errorf("failed to start performance group %d", evset.gid) + } + m.lock.Unlock() + return skip, err + } + m.running = true + time.Sleep(interval) + m.running = false + ret = C.perfmon_stopCounters() + if ret != 0 { + var err error = nil + var skip bool = false + if ret == -37 { + skip = true + } else { + err = fmt.Errorf("failed to stop performance group %d", evset.gid) + } + m.lock.Unlock() + return skip, err + } } - ret = C.perfmon_startCounters() - if ret != 0 { - gctr := C.GoString(C.perfmon_getGroupName(gid)) - err := fmt.Errorf("failed to start performance group %d (%s)", gid, gctr) - return err - } - m.running = true - time.Sleep(interval) - m.running = false - ret = C.perfmon_stopCounters() - if ret != 0 { - gctr := C.GoString(C.perfmon_getGroupName(gid)) - err := fmt.Errorf("failed to stop performance group %d (%s)", gid, gctr) - return err - } - return nil + m.lock.Unlock() + return false, nil } // Get all measurement results for an event set, derive the metric values out of the measurement results and send it -func (m *LikwidCollector) calcEventsetMetrics(group int, interval time.Duration, output chan lp.CCMetric) error { - var eidx C.int - evset := m.config.Eventsets[group] - gid := m.groups[group] +func (m *LikwidCollector) calcEventsetMetrics(evset LikwidEventsetConfig, interval time.Duration, output chan lp.CCMetric) error { invClock := float64(1.0 / m.basefreq) // Go over events and get the results - for eidx = 0; int(eidx) < len(evset.Events); eidx++ { - ctr := C.perfmon_getCounterName(gid, eidx) - gctr := C.GoString(ctr) - + for eidx, counter := range evset.eorder { + gctr := C.GoString(counter) for _, tid := range m.cpu2tid { - if tid >= 0 { - m.results[group][tid]["time"] = interval.Seconds() - m.results[group][tid]["inverseClock"] = invClock - res := C.perfmon_getLastResult(gid, eidx, C.int(tid)) - m.results[group][tid][gctr] = float64(res) + res := C.perfmon_getLastResult(evset.gid, C.int(eidx), 
C.int(tid)) + fres := float64(res) + if m.config.InvalidToZero && (math.IsNaN(fres) || math.IsInf(fres, 0)) { + fres = 0.0 } + evset.results[tid][gctr] = fres + evset.results[tid]["time"] = interval.Seconds() + evset.results[tid]["inverseClock"] = invClock } } // Go over the event set metrics, derive the value out of the event:counter values and send it - for _, metric := range evset.Metrics { + for _, metric := range m.config.Eventsets[evset.internal].Metrics { // The metric scope is determined in the Init() function // Get the map scope-id -> tids scopemap := m.cpu2tid @@ -313,19 +348,16 @@ func (m *LikwidCollector) calcEventsetMetrics(group int, interval time.Duration, scopemap = m.sock2tid } for domain, tid := range scopemap { - if tid >= 0 { - value, err := agg.EvalFloat64Condition(metric.Calc, m.results[group][tid]) + if tid >= 0 && len(metric.Calc) > 0 { + value, err := agg.EvalFloat64Condition(metric.Calc, evset.results[tid]) if err != nil { cclog.ComponentError(m.name, "Calculation for metric", metric.Name, "failed:", err.Error()) - continue - } - m.mresults[group][tid][metric.Name] = value - if m.config.InvalidToZero && math.IsNaN(value) { value = 0.0 } - if m.config.InvalidToZero && math.IsInf(value, 0) { + if m.config.InvalidToZero && (math.IsNaN(value) || math.IsInf(value, 0)) { value = 0.0 } + evset.metrics[tid][metric.Name] = value // Now we have the result, send it with the proper tags if !math.IsNaN(value) { if metric.Publish { @@ -360,8 +392,8 @@ func (m *LikwidCollector) calcGlobalMetrics(interval time.Duration, output chan if tid >= 0 { // Here we generate parameter list params := make(map[string]interface{}) - for j := range m.groups { - for mname, mres := range m.mresults[j][tid] { + for _, evset := range m.likwidGroups { + for mname, mres := range evset.metrics[tid] { params[mname] = mres } } @@ -369,15 +401,12 @@ func (m *LikwidCollector) calcGlobalMetrics(interval time.Duration, output chan value, err := 
agg.EvalFloat64Condition(metric.Calc, params) if err != nil { cclog.ComponentError(m.name, "Calculation for metric", metric.Name, "failed:", err.Error()) - continue + value = 0.0 + } + if m.config.InvalidToZero && (math.IsNaN(value) || math.IsInf(value, 0)) { + value = 0.0 } m.gmresults[tid][metric.Name] = value - if m.config.InvalidToZero && math.IsNaN(value) { - value = 0.0 - } - if m.config.InvalidToZero && math.IsInf(value, 0) { - value = 0.0 - } // Now we have the result, send it with the proper tags if !math.IsNaN(value) { if metric.Publish { @@ -401,38 +430,163 @@ func (m *LikwidCollector) calcGlobalMetrics(interval time.Duration, output chan return nil } +func (m *LikwidCollector) LateInit() error { + var ret C.int + if m.initialized { + return nil + } + switch m.config.AccessMode { + case "direct": + C.HPMmode(0) + case "accessdaemon": + if len(m.config.DaemonPath) > 0 { + p := os.Getenv("PATH") + os.Setenv("PATH", m.config.DaemonPath+":"+p) + } + C.HPMmode(1) + } + cclog.ComponentDebug(m.name, "initialize LIKWID topology") + ret = C.topology_init() + if ret != 0 { + err := errors.New("failed to initialize LIKWID topology") + cclog.ComponentError(m.name, err.Error()) + return err + } + + m.sock2tid = make(map[int]int) + tmp := make([]C.int, 1) + for _, sid := range topo.SocketList() { + cstr := C.CString(fmt.Sprintf("S%d:0", sid)) + ret = C.cpustr_to_cpulist(cstr, &tmp[0], 1) + if ret > 0 { + m.sock2tid[sid] = m.cpu2tid[int(tmp[0])] + } + C.free(unsafe.Pointer(cstr)) + } + + m.basefreq = getBaseFreq() + cclog.ComponentDebug(m.name, "BaseFreq", m.basefreq) + + cclog.ComponentDebug(m.name, "initialize LIKWID perfmon module") + ret = C.perfmon_init(C.int(len(m.cpulist)), &m.cpulist[0]) + if ret != 0 { + var err error = nil + C.topology_finalize() + if ret != -22 { + err = errors.New("failed to initialize LIKWID perfmon") + cclog.ComponentError(m.name, err.Error()) + } else { + err = errors.New("access to LIKWID perfmon locked") + } + return err + } + + // 
While adding the events, we test the metrics whether they can be computed at all + for i, evset := range m.config.Eventsets { + var gid C.int + if len(evset.Events) > 0 { + skip := false + likwidGroup := genLikwidEventSet(evset) + for _, g := range m.likwidGroups { + if likwidGroup.go_estr == g.go_estr { + skip = true + break + } + } + if skip { + continue + } + // Now we add the list of events to likwid + gid = C.perfmon_addEventSet(likwidGroup.estr) + if gid >= 0 { + likwidGroup.gid = gid + likwidGroup.internal = i + m.likwidGroups[gid] = likwidGroup + } + } else { + cclog.ComponentError(m.name, "Invalid Likwid eventset config, no events given") + continue + } + + } + + // If no event set could be added, shut down LikwidCollector + if len(m.likwidGroups) == 0 { + C.perfmon_finalize() + C.topology_finalize() + err := errors.New("no LIKWID performance group initialized") + cclog.ComponentError(m.name, err.Error()) + return err + } + sigchan := make(chan os.Signal, 1) + signal.Notify(sigchan, syscall.SIGCHLD) + signal.Notify(sigchan, os.Interrupt) + go func() { + <-sigchan + + signal.Stop(sigchan) + m.initialized = false + }() + m.initialized = true + return nil +} + // main read function taking multiple measurement rounds, each 'interval' seconds long func (m *LikwidCollector) Read(interval time.Duration, output chan lp.CCMetric) { + var skip bool = false + var err error if !m.init { return } - for i := range m.groups { - // measure event set 'i' for 'interval' seconds - err := m.takeMeasurement(i, interval) + if !m.initialized { + m.lock.Lock() + err = m.LateInit() if err != nil { - cclog.ComponentError(m.name, err.Error()) + m.lock.Unlock() return } - // read measurements and derive event set metrics - m.calcEventsetMetrics(i, interval, output) + m.initialized = true + m.lock.Unlock() + } + + if m.initialized && !skip { + for _, evset := range m.likwidGroups { + if !skip { + // measure event set 'i' for 'interval' seconds + skip, err = m.takeMeasurement(evset, 
interval) + if err != nil { + cclog.ComponentError(m.name, err.Error()) + return + } + } + + if !skip { + // read measurements and derive event set metrics + m.calcEventsetMetrics(evset, interval, output) + } + } + if !skip { + // use the event set metrics to derive the global metrics + m.calcGlobalMetrics(interval, output) + } } - // use the event set metrics to derive the global metrics - m.calcGlobalMetrics(interval, output) } func (m *LikwidCollector) Close() { if m.init { - cclog.ComponentDebug(m.name, "Closing ...") m.init = false - if m.running { - cclog.ComponentDebug(m.name, "Stopping counters") - C.perfmon_stopCounters() + cclog.ComponentDebug(m.name, "Closing ...") + m.lock.Lock() + if m.initialized { + cclog.ComponentDebug(m.name, "Finalize LIKWID perfmon module") + C.perfmon_finalize() + m.initialized = false } - cclog.ComponentDebug(m.name, "Finalize LIKWID perfmon module") - C.perfmon_finalize() + m.lock.Unlock() cclog.ComponentDebug(m.name, "Finalize LIKWID topology module") C.topology_finalize() + cclog.ComponentDebug(m.name, "Closing done") } } diff --git a/collectors/likwidMetric.md b/collectors/likwidMetric.md index 86c1dda..1bb211f 100644 --- a/collectors/likwidMetric.md +++ b/collectors/likwidMetric.md @@ -3,22 +3,53 @@ The `likwid` collector is probably the most complicated collector. The LIKWID library is included as static library with *direct* access mode. The *direct* access mode is suitable if the daemon is executed by a root user. The static library does not contain the performance groups, so all information needs to be provided in the configuration. -The `likwid` configuration consists of two parts, the "eventsets" and "globalmetrics": -- An event set list itself has two parts, the "events" and a set of derivable "metrics". Each of the "events" is a counter:event pair in LIKWID's syntax. The "metrics" are a list of formulas to derive the metric value from the measurements of the "events". 
Each metric has a name, the formula, a scope and a publish flag. Counter names can be used like variables in the formulas, so `PMC0+PMC1` sums the measurements for the both events configured in the counters `PMC0` and `PMC1`. The scope tells the Collector whether it is a metric for each hardware thread (`hwthread`) or each CPU socket (`socket`). You may specify a unit for the metric with `unit`. The last one is the publishing flag. It tells the collector whether a metric should be sent to the router. -- The global metrics are metrics which require data from all event set measurements to be derived. The inputs are the metrics in the event sets. Similar to the metrics in the event sets, the global metrics are defined by a name, a formula, a scope and a publish flag. See event set metrics for details. The only difference is that there is no access to the raw event measurements anymore but only to the metrics. So, the idea is to derive a metric in the "eventsets" section and reuse it in the "globalmetrics" part. If you need a metric only for deriving the global metrics, disable forwarding of the event set metrics (`publish=false`). **Be aware** that the combination might be misleading because the "behavior" of a metric changes over time and the multiple measurements might count different computing phases. Similar to the metrics in the eventset, you can specify a metric unit with the `unit` field. 
+```json + "likwid": { + "force_overwrite" : false, + "invalid_to_zero" : false, + "eventsets": [ + { + "events" : { + "COUNTER0": "EVENT0", + "COUNTER1": "EVENT1" + }, + "metrics" : [ + { + "name": "sum_01", + "calc": "COUNTER0 + COUNTER1", + "publish": false, + "unit": "myunit", + "type": "hwthread" + } + ] + } + ], + "globalmetrics" : [ + { + "name": "global_sum", + "calc": "sum_01", + "publish": true, + "unit": "myunit", + "type": "hwthread" + } + ] + } +``` + +The `likwid` configuration consists of two parts, the `eventsets` and `globalmetrics`: +- An event set list itself has two parts, the `events` and a set of derivable `metrics`. Each of the `events` is a `counter:event` pair in LIKWID's syntax. The `metrics` are a list of formulas to derive the metric value from the measurements of the `events`' values. Each metric has a name, the formula, a type and a publish flag. There is an optional `unit` field. Counter names can be used like variables in the formulas, so `PMC0+PMC1` sums the measurements for both events configured in the counters `PMC0` and `PMC1`. You can optionally use `time` for the measurement time and `inverseClock` for `1.0/baseCpuFrequency`. The type tells the LikwidCollector whether it is a metric for each hardware thread (`hwthread`) or each CPU socket (`socket`). You may specify a unit for the metric with `unit`. The last one is the publishing flag. It tells the LikwidCollector whether a metric should be sent to the router or is only used internally to compute a global metric. +- The `globalmetrics` are metrics which require data from multiple event set measurements to be derived. The inputs are the metrics in the event sets. Similar to the metrics in the event sets, the global metrics are defined by a name, a formula, a scope and a publish flag. See event set metrics for details. The only difference is that there is no access to the raw event measurements anymore but only to the metrics. 
Also `time` and `inverseClock` cannot be used anymore. So, the idea is to derive a metric in the `eventsets` section and reuse it in the `globalmetrics` part. If you need a metric only for deriving the global metrics, disable forwarding of the event set metrics (`"publish": false`). **Be aware** that the combination might be misleading because the "behavior" of a metric changes over time and the multiple measurements might count different computing phases. Similar to the metrics in the eventset, you can specify a metric unit with the `unit` field. Additional options: -- `access_mode` : Method to use for hardware performance monitoring (`direct` access as root user, `accessdaemon` for the daemon mode) -- `accessdaemon_path`: Folder with the access daemon `likwid-accessD`, commonly `$LIKWID_INSTALL_LOC/sbin` - `force_overwrite`: Same as setting `LIKWID_FORCE=1`. In case counters are already in-use, LIKWID overwrites their configuration to do its measurements -- `invalid_to_zero`: In some cases, the calculations result in `NaN` or `Inf`. With this option, all `NaN` and `Inf` values are replaces with `0.0`. -- `access_mode`: Specify LIKWID access mode: `direct` for direct register access as root user or `accessdaemon` -- `accessdaemon_path`: Folder of the accessDaemon `likwid-accessD` -- `liblikwid_path`: Location of `liblikwid.so` +- `invalid_to_zero`: In some cases, the calculations result in `NaN` or `Inf`. With this option, all `NaN` and `Inf` values are replaced with `0.0`. See below in [separate section](./likwidMetric.md#invalid_to_zero-option) +- `access_mode`: Specify LIKWID access mode: `direct` for direct register access as root user or `accessdaemon`. The access mode `perf_event` is currently untested.
+- `accessdaemon_path`: Folder of the accessDaemon `likwid-accessD` (like `/usr/local/sbin`) +- `liblikwid_path`: Location of `liblikwid.so` including file name like `/usr/local/lib/liblikwid.so` ### Available metric scopes -Hardware performance counters are scattered all over the system nowadays. A counter coveres a specific part of the system. While there are hardware thread specific counter for CPU cycles, instructions and so on, some others are specific for a whole CPU socket/package. To address that, the collector provides the specification of a 'scope' for each metric. +Hardware performance counters are scattered all over the system nowadays. A counter covers a specific part of the system. While there are hardware thread specific counters for CPU cycles, instructions and so on, some others are specific for a whole CPU socket/package. To address that, the LikwidCollector provides the specification of a `type` for each metric. - `hwthread` : One metric per CPU hardware thread with the tags `"type" : "hwthread"` and `"type-id" : "$hwthread_id"` - `socket` : One metric per CPU socket/package with the tags `"type" : "socket"` and `"type-id" : "$socket_id"` @@ -50,6 +81,7 @@ $ scripts/likwid_perfgroup_to_cc_config.py ICX MEM_DP { "events": { "FIXC0": "INSTR_RETIRED_ANY", + "FIXC1": "CPU_CLK_UNHALTED_CORE", "..." : "..." }, "metrics" : [ @@ -75,21 +107,28 @@ LIKWID checks the file `/var/run/likwid.lock` before performing any interfering Before (SLURM prolog, ...) ``` -$ chwon $JOBUSER /var/run/likwid.lock +$ chown $JOBUSER /var/run/likwid.lock ``` After (SLURM epilog, ...) ``` -$ chwon $CCUSER /var/run/likwid.lock +$ chown $CCUSER /var/run/likwid.lock ``` +### `invalid_to_zero` option +In some cases LIKWID returns `0.0` for some events that are further used in processing and may be used as a divisor in a calculation. After evaluation of a metric, the result might be `NaN` or `+-Inf`.
These resulting metrics are commonly not created and forwarded to the router because the [InfluxDB line protocol](https://docs.influxdata.com/influxdb/cloud/reference/syntax/line-protocol/#float) does not support these special floating-point values. If you want to have them sent, this option forces these metric values to be `0.0` instead. + +One might think this does not happen often but often used metrics in the world of performance engineering like Instructions-per-Cycle (IPC) or more frequently the actual CPU clock are derived with events like `CPU_CLK_UNHALTED_CORE` (Intel) which do not increment in halted state (as the name implies). There are different power management systems in a chip which can cause a hardware thread to enter such a state. Moreover, if no cycles are executed by the core, many other events are not incremented either (like `INSTR_RETIRED_ANY` for retired instructions and part of IPC). + + ### Example configuration +#### AMD Zen3 ```json "likwid": { "force_overwrite" : false, - "nan_to_zero" : false, + "invalid_to_zero" : false, "eventsets": [ { "events": { @@ -209,4 +248,4 @@ IPC PMC0/PMC1 -> { -> ] ``` -The script `scripts/likwid_perfgroup_to_cc_config.py` might help you. \ No newline at end of file +The script `scripts/likwid_perfgroup_to_cc_config.py` might help you.
diff --git a/collectors/memstatMetric.go b/collectors/memstatMetric.go index bd7af5d..c6c7f34 100644 --- a/collectors/memstatMetric.go +++ b/collectors/memstatMetric.go @@ -40,8 +40,13 @@ type MemstatCollector struct { sendMemUsed bool } -func getStats(filename string) map[string]float64 { - stats := make(map[string]float64) +type MemstatStats struct { + value float64 + unit string +} + +func getStats(filename string) map[string]MemstatStats { + stats := make(map[string]MemstatStats) file, err := os.Open(filename) if err != nil { cclog.Error(err.Error()) @@ -55,12 +60,18 @@ func getStats(filename string) map[string]float64 { if len(linefields) == 3 { v, err := strconv.ParseFloat(linefields[1], 64) if err == nil { - stats[strings.Trim(linefields[0], ":")] = v + stats[strings.Trim(linefields[0], ":")] = MemstatStats{ + value: v, + unit: linefields[2], + } } } else if len(linefields) == 5 { v, err := strconv.ParseFloat(linefields[3], 64) if err == nil { - stats[strings.Trim(linefields[0], ":")] = v + stats[strings.Trim(linefields[0], ":")] = MemstatStats{ + value: v, + unit: linefields[4], + } } } } @@ -78,7 +89,7 @@ func (m *MemstatCollector) Init(config json.RawMessage) error { return err } } - m.meta = map[string]string{"source": m.name, "group": "Memory", "unit": "GByte"} + m.meta = map[string]string{"source": m.name, "group": "Memory"} m.stats = make(map[string]int64) m.matches = make(map[string]string) m.tags = map[string]string{"type": "node"} @@ -151,30 +162,51 @@ func (m *MemstatCollector) Read(interval time.Duration, output chan lp.CCMetric) return } - sendStats := func(stats map[string]float64, tags map[string]string) { + sendStats := func(stats map[string]MemstatStats, tags map[string]string) { for match, name := range m.matches { var value float64 = 0 + var unit string = "" if v, ok := stats[match]; ok { - value = v + value = v.value + if len(v.unit) > 0 { + unit = v.unit + } } - y, err := lp.New(name, tags, m.meta, map[string]interface{}{"value": value * 
1e-6}, time.Now()) + + y, err := lp.New(name, tags, m.meta, map[string]interface{}{"value": value}, time.Now()) if err == nil { + if len(unit) > 0 { + y.AddMeta("unit", unit) + } output <- y } } if m.sendMemUsed { memUsed := 0.0 + unit := "" if totalVal, total := stats["MemTotal"]; total { if freeVal, free := stats["MemFree"]; free { if bufVal, buffers := stats["Buffers"]; buffers { if cacheVal, cached := stats["Cached"]; cached { - memUsed = totalVal - (freeVal + bufVal + cacheVal) + memUsed = totalVal.value - (freeVal.value + bufVal.value + cacheVal.value) + if len(totalVal.unit) > 0 { + unit = totalVal.unit + } else if len(freeVal.unit) > 0 { + unit = freeVal.unit + } else if len(bufVal.unit) > 0 { + unit = bufVal.unit + } else if len(cacheVal.unit) > 0 { + unit = cacheVal.unit + } } } } } - y, err := lp.New("mem_used", tags, m.meta, map[string]interface{}{"value": memUsed * 1e-6}, time.Now()) + y, err := lp.New("mem_used", tags, m.meta, map[string]interface{}{"value": memUsed}, time.Now()) if err == nil { + if len(unit) > 0 { + y.AddMeta("unit", unit) + } output <- y } } diff --git a/collectors/nfsMetric.go b/collectors/nfsMetric.go index 07e684d..c511b0d 100644 --- a/collectors/nfsMetric.go +++ b/collectors/nfsMetric.go @@ -36,7 +36,7 @@ type nfsCollector struct { } func (m *nfsCollector) initStats() error { - cmd := exec.Command(m.config.Nfsstats, `-l`) + cmd := exec.Command(m.config.Nfsstats, `-l`, `--all`) cmd.Wait() buffer, err := cmd.Output() if err == nil { @@ -52,7 +52,7 @@ func (m *nfsCollector) initStats() error { if err == nil { x := m.data[name] x.current = value - x.last = 0 + x.last = value m.data[name] = x } } @@ -63,7 +63,7 @@ func (m *nfsCollector) initStats() error { } func (m *nfsCollector) updateStats() error { - cmd := exec.Command(m.config.Nfsstats, `-l`) + cmd := exec.Command(m.config.Nfsstats, `-l`, `--all`) cmd.Wait() buffer, err := cmd.Output() if err == nil { diff --git a/config.json b/config.json index 52f9df1..924bec7 100644 --- 
a/config.json +++ b/config.json @@ -1,8 +1,8 @@ { - "sinks": "sinks.json", - "collectors" : "collectors.json", - "receivers" : "receivers.json", - "router" : "router.json", - "interval": 10, - "duration": 1 + "sinks": "./sinks.json", + "collectors" : "./collectors.json", + "receivers" : "./receivers.json", + "router" : "./router.json", + "interval": "10s", + "duration": "1s" } diff --git a/go.mod b/go.mod index 130f5cc..07d46f6 100644 --- a/go.mod +++ b/go.mod @@ -3,17 +3,14 @@ module github.com/ClusterCockpit/cc-metric-collector go 1.16 require ( - github.com/NVIDIA/go-nvml v0.11.1-0 - github.com/influxdata/influxdb-client-go/v2 v2.7.0 - github.com/influxdata/line-protocol v0.0.0-20210922203350-b1ad95c89adf - github.com/nats-io/nats.go v1.13.1-0.20211122170419-d7c1d78a50fc - golang.org/x/sys v0.0.0-20220114195835-da31bd327af9 - gopkg.in/Knetic/govaluate.v2 v2.3.0 -) - -require ( + github.com/NVIDIA/go-nvml v0.11.6-0 github.com/PaesslerAG/gval v1.1.2 - github.com/golang/protobuf v1.5.2 // indirect - github.com/nats-io/nats-server/v2 v2.7.0 // indirect - google.golang.org/protobuf v1.27.1 // indirect + github.com/gorilla/mux v1.8.0 + github.com/influxdata/influxdb-client-go/v2 v2.8.1 + github.com/influxdata/line-protocol v0.0.0-20210922203350-b1ad95c89adf + github.com/nats-io/nats-server/v2 v2.8.0 // indirect + github.com/nats-io/nats.go v1.14.0 + github.com/prometheus/client_golang v1.12.1 + github.com/stmcginnis/gofish v0.13.0 + golang.org/x/sys v0.0.0-20220412211240-33da011f77ad ) diff --git a/internal/ccTopology/ccTopology.go b/internal/ccTopology/ccTopology.go index 958bb45..f68c3f4 100644 --- a/internal/ccTopology/ccTopology.go +++ b/internal/ccTopology/ccTopology.go @@ -169,7 +169,10 @@ func DieList() []int { } } } - return dielist + if len(dielist) > 0 { + return dielist + } + return SocketList() } type CpuEntry struct { @@ -261,7 +264,7 @@ func CpuData() []CpuEntry { for _, c := range CpuList() { clist = append(clist, CpuEntry{Cpuid: c}) } - for _, centry 
:= range clist { + for i, centry := range clist { centry.Socket = -1 centry.Numadomain = -1 centry.Die = -1 @@ -289,6 +292,8 @@ func CpuData() []CpuEntry { // Lookup NUMA domain id centry.Numadomain = getNumaDomain(base) + // Update values in output list + clist[i] = centry } return clist } diff --git a/internal/metricRouter/metricRouter.go b/internal/metricRouter/metricRouter.go index f9b3faa..8875d0e 100644 --- a/internal/metricRouter/metricRouter.go +++ b/internal/metricRouter/metricRouter.go @@ -48,7 +48,6 @@ type metricRouter struct { done chan bool // channel to finish / stop metric router wg *sync.WaitGroup // wait group for all goroutines in cc-metric-collector timestamp time.Time // timestamp periodically updated by ticker each interval - timerdone chan bool // channel to finish / stop timestamp updater ticker mct.MultiChanTicker // periodically ticking once each interval config metricRouterConfig // json encoded config for metric router cache MetricCache // pointer to MetricCache @@ -124,29 +123,6 @@ func (r *metricRouter) Init(ticker mct.MultiChanTicker, wg *sync.WaitGroup, rout return nil } -// StartTimer starts a timer which updates timestamp periodically -func (r *metricRouter) StartTimer() { - m := make(chan time.Time) - r.ticker.AddChannel(m) - r.timerdone = make(chan bool) - - r.wg.Add(1) - go func() { - defer r.wg.Done() - for { - select { - case <-r.timerdone: - close(r.timerdone) - cclog.ComponentDebug("MetricRouter", "TIMER DONE") - return - case t := <-m: - r.timestamp = t - } - } - }() - cclog.ComponentDebug("MetricRouter", "TIMER START") -} - func getParamMap(point lp.CCMetric) map[string]interface{} { params := make(map[string]interface{}) params["metric"] = point @@ -235,8 +211,9 @@ func (r *metricRouter) dropMetric(point lp.CCMetric) bool { func (r *metricRouter) Start() { // start timer if configured r.timestamp = time.Now() + timeChan := make(chan time.Time) if r.config.IntervalStamp { - r.StartTimer() + r.ticker.AddChannel(timeChan) } 
// Router manager is done @@ -316,6 +293,10 @@ func (r *metricRouter) Start() { done() return + case timestamp := <-timeChan: + r.timestamp = timestamp + cclog.ComponentDebug("MetricRouter", "Update timestamp", r.timestamp.UnixNano()) + case p := <-r.coll_input: coll_forward(p) for i := 0; len(r.coll_input) > 0 && i < (r.maxForward-1); i++ { @@ -361,14 +342,6 @@ func (r *metricRouter) Close() { // wait for close of channel r.done <-r.done - // stop timer - if r.config.IntervalStamp { - cclog.ComponentDebug("MetricRouter", "TIMER CLOSE") - r.timerdone <- true - // wait for close of channel r.timerdone - <-r.timerdone - } - // stop metric cache if r.config.NumCacheIntervals > 0 { cclog.ComponentDebug("MetricRouter", "CACHE CLOSE") diff --git a/receivers.json b/receivers.json index a27f07d..cd78eb6 100644 --- a/receivers.json +++ b/receivers.json @@ -4,5 +4,22 @@ "address": "nats://my-url", "port" : "4222", "database": "testcluster" + }, + "redfish_recv": { + "type": "redfish", + "client_config": [ + { + "hostname": "my-host-1", + "username": "username-1", + "password": "password-1", + "endpoint": "https://my-endpoint-1" + }, + { + "hostname": "my-host-2", + "username": "username-2", + "password": "password-2", + "endpoint": "https://my-endpoint-2" + } + ] } } diff --git a/receivers/receiveManager.go b/receivers/receiveManager.go index 1c13026..7a20fac 100644 --- a/receivers/receiveManager.go +++ b/receivers/receiveManager.go @@ -10,14 +10,13 @@ import ( ) var AvailableReceivers = map[string]func(name string, config json.RawMessage) (Receiver, error){ - "nats": NewNatsReceiver, + "nats": NewNatsReceiver, + "redfish": NewRedfishReceiver, } type receiveManager struct { inputs []Receiver output chan lp.CCMetric - done chan bool - wg *sync.WaitGroup config []json.RawMessage } @@ -33,8 +32,6 @@ func (rm *receiveManager) Init(wg *sync.WaitGroup, receiverConfigFile string) er // Initialize struct fields rm.inputs = make([]Receiver, 0) rm.output = nil - rm.done = make(chan 
bool) - rm.wg = wg rm.config = make([]json.RawMessage, 0) configFile, err := os.Open(receiverConfigFile) @@ -58,7 +55,7 @@ func (rm *receiveManager) Init(wg *sync.WaitGroup, receiverConfigFile string) er } func (rm *receiveManager) Start() { - rm.wg.Add(1) + cclog.ComponentDebug("ReceiveManager", "START") for _, r := range rm.inputs { cclog.ComponentDebug("ReceiveManager", "START", r.Name()) @@ -97,16 +94,19 @@ func (rm *receiveManager) AddOutput(output chan lp.CCMetric) { } func (rm *receiveManager) Close() { + cclog.ComponentDebug("ReceiveManager", "CLOSE") + + // Close all receivers for _, r := range rm.inputs { cclog.ComponentDebug("ReceiveManager", "CLOSE", r.Name()) r.Close() } - rm.wg.Done() - cclog.ComponentDebug("ReceiveManager", "CLOSE") + + cclog.ComponentDebug("ReceiveManager", "DONE") } func New(wg *sync.WaitGroup, receiverConfigFile string) (ReceiveManager, error) { - r := &receiveManager{} + r := new(receiveManager) err := r.Init(wg, receiverConfigFile) if err != nil { return nil, err diff --git a/receivers/redfishReceiver.go b/receivers/redfishReceiver.go new file mode 100644 index 0000000..d50cbc7 --- /dev/null +++ b/receivers/redfishReceiver.go @@ -0,0 +1,324 @@ +package receivers + +import ( + "encoding/json" + "fmt" + "strconv" + "sync" + "time" + + cclog "github.com/ClusterCockpit/cc-metric-collector/internal/ccLogger" + lp "github.com/ClusterCockpit/cc-metric-collector/internal/ccMetric" + + // See: https://pkg.go.dev/github.com/stmcginnis/gofish + "github.com/stmcginnis/gofish" +) + +// RedfishReceiver configuration: +type RedfishReceiver struct { + receiver + config struct { + Type string `json:"type"` + Fanout int `json:"fanout,omitempty"` // Default fanout: 64 + Interval int `json:"interval,omitempty"` // Default interval: 30s + + // Client config for each redfish service + ClientConfigs []struct { + Hostname *string `json:"hostname"` + Username *string `json:"username"` + Password *string `json:"password"` + Endpoint *string 
`json:"endpoint"` + Insecure *bool `json:"insecure,omitempty"` + ExcludeMetrics []string `json:"exclude_metrics,omitempty"` + gofish gofish.ClientConfig + } `json:"client_config"` + } + + done chan bool // channel to finish / stop redfish receiver + wg sync.WaitGroup // wait group for redfish receiver +} + +// Start starts the redfish receiver +func (r *RedfishReceiver) Start() { + cclog.ComponentDebug(r.name, "START") + + // readPowerMetric reads readfish power metric from the endpoint configured in conf + readPowerMetric := func(clientConfigIndex int) error { + + clientConfig := &r.config.ClientConfigs[clientConfigIndex] + + // Connect to redfish service + c, err := gofish.Connect(clientConfig.gofish) + if err != nil { + c := struct { + Username string + Endpoint string + BasicAuth bool + Insecure bool + }{ + Username: clientConfig.gofish.Username, + Endpoint: clientConfig.gofish.Endpoint, + BasicAuth: clientConfig.gofish.BasicAuth, + Insecure: clientConfig.gofish.Insecure, + } + return fmt.Errorf("readPowerMetric: gofish.Connect(%+v) failed: %v", c, err) + } + defer c.Logout() + + // Get all chassis managed by this service + chassis_list, err := c.Service.Chassis() + if err != nil { + return fmt.Errorf("readPowerMetric: c.Service.Chassis() failed: %v", err) + } + + for _, chassis := range chassis_list { + timestamp := time.Now() + + // Get power information for each chassis + power, err := chassis.Power() + if err != nil { + return fmt.Errorf("readPowerMetric: chassis.Power() failed: %v", err) + } + if power == nil { + continue + } + + // Read min, max and average consumed watts for each power control + for _, pc := range power.PowerControl { + + // Map of collected metrics + metrics := map[string]float32{ + // PowerConsumedWatts shall represent the actual power being consumed (in + // Watts) by the chassis + "consumed_watts": pc.PowerConsumedWatts, + // AverageConsumedWatts shall represent the + // average power level that occurred averaged over the last 
IntervalInMin + // minutes. + "average_consumed_watts": pc.PowerMetrics.AverageConsumedWatts, + // MinConsumedWatts shall represent the + // minimum power level in watts that occurred within the last + // IntervalInMin minutes. + "min_consumed_watts": pc.PowerMetrics.MinConsumedWatts, + // MaxConsumedWatts shall represent the + // maximum power level in watts that occurred within the last + // IntervalInMin minutes + "max_consumed_watts": pc.PowerMetrics.MaxConsumedWatts, + } + intervalInMin := strconv.FormatFloat(float64(pc.PowerMetrics.IntervalInMin), 'f', -1, 32) + + // Metrics to exclude + for _, key := range clientConfig.ExcludeMetrics { + delete(metrics, key) + } + + // Set tags + tags := map[string]string{ + "hostname": *clientConfig.Hostname, + "type": "node", + // ID uniquely identifies the resource + "id": pc.ID, + // MemberID shall uniquely identify the member within the collection. For + // services supporting Redfish v1.6 or higher, this value shall be the + // zero-based array index. + "member_id": pc.MemberID, + // PhysicalContext shall be a description of the affected device(s) or region + // within the chassis to which this power control applies. + "physical_context": string(pc.PhysicalContext), + // Name + "power_control_name": pc.Name, + } + + // Delete empty tags + for key, value := range tags { + if value == "" { + delete(tags, key) + } + } + + // Set meta data tags + meta := map[string]string{ + "source": r.name, + "group": "Energy", + "interval_in_minutes": intervalInMin, + "unit": "watts", + } + + // Delete empty meta data tags + for key, value := range meta { + if value == "" { + delete(meta, key) + } + } + + for name, value := range metrics { + + y, err := lp.New(name, tags, meta, + map[string]interface{}{ + "value": value, + }, + timestamp) + if err == nil { + r.sink <- y + } + } + } + } + + return nil + } + + // doReadPowerMetric read power metrics for all configure redfish services. 
+ // To compensate latencies of the Redfish services a fanout is used. + doReadPowerMetric := func() { + + // Compute fanout to use + realFanout := r.config.Fanout + if len(r.config.ClientConfigs) < realFanout { + realFanout = len(r.config.ClientConfigs) + } + + // Create wait group and input channel for workers + var workerWaitGroup sync.WaitGroup + workerInput := make(chan int, realFanout) + + // Create worker go routines + for i := 0; i < realFanout; i++ { + // Increment worker wait group counter + workerWaitGroup.Add(1) + go func() { + // Decrement worker wait group counter + defer workerWaitGroup.Done() + + // Read power metrics for each client config + for clientConfigIndex := range workerInput { + err := readPowerMetric(clientConfigIndex) + if err != nil { + cclog.ComponentError(r.name, err) + } + } + }() + } + + // Distribute client configs to workers + for i := range r.config.ClientConfigs { + // Check done channel status + select { + case workerInput <- i: + case <-r.done: + // process done event + // Stop workers, clear channel and wait for all workers to finish + close(workerInput) + for range workerInput { + } + workerWaitGroup.Wait() + return + } + } + + // Stop workers and wait for all workers to finish + close(workerInput) + workerWaitGroup.Wait() + } + + // Start redfish receiver + r.wg.Add(1) + go func() { + defer r.wg.Done() + + // Create ticker + ticker := time.NewTicker(time.Duration(r.config.Interval) * time.Second) + defer ticker.Stop() + + for { + doReadPowerMetric() + + select { + case <-ticker.C: + // process ticker event -> continue + continue + case <-r.done: + // process done event + return + } + } + }() + + cclog.ComponentDebug(r.name, "STARTED") +} + +// Close redfish receiver +func (r *RedfishReceiver) Close() { + cclog.ComponentDebug(r.name, "CLOSE") + + // Send the signal and wait + close(r.done) + r.wg.Wait() + + cclog.ComponentDebug(r.name, "DONE") +} + +// New function to create a new instance of the receiver +// Initialize the 
receiver by giving it a name and reading in the config JSON +func NewRedfishReceiver(name string, config json.RawMessage) (Receiver, error) { + r := new(RedfishReceiver) + + // Set name + r.name = fmt.Sprintf("RedfishReceiver(%s)", name) + + // Create done channel + r.done = make(chan bool) + + // Set defaults in r.config + // Allow overwriting these defaults by reading config JSON + r.config.Fanout = 64 + r.config.Interval = 30 + + // Read the redfish receiver specific JSON config + if len(config) > 0 { + err := json.Unmarshal(config, &r.config) + if err != nil { + cclog.ComponentError(r.name, "Error reading config:", err.Error()) + return nil, err + } + } + + // Create gofish client config + for i := range r.config.ClientConfigs { + clientConfig := &r.config.ClientConfigs[i] + gofishConfig := &clientConfig.gofish + + if clientConfig.Hostname == nil { + err := fmt.Errorf("client config number %v requires hostname", i) + cclog.ComponentError(r.name, err) + return nil, err + } + + if clientConfig.Endpoint == nil { + err := fmt.Errorf("client config number %v requires endpoint", i) + cclog.ComponentError(r.name, err) + return nil, err + } + gofishConfig.Endpoint = *clientConfig.Endpoint + + if clientConfig.Username == nil { + err := fmt.Errorf("client config number %v requires username", i) + cclog.ComponentError(r.name, err) + return nil, err + } + gofishConfig.Username = *clientConfig.Username + + if clientConfig.Password == nil { + err := fmt.Errorf("client config number %v requires password", i) + cclog.ComponentError(r.name, err) + return nil, err + } + gofishConfig.Password = *clientConfig.Password + + gofishConfig.Insecure = true + if clientConfig.Insecure != nil { + gofishConfig.Insecure = *clientConfig.Insecure + } + } + + return r, nil +} diff --git a/receivers/sampleReceiver.go b/receivers/sampleReceiver.go index 2892d56..19d6f25 100644 --- a/receivers/sampleReceiver.go +++ b/receivers/sampleReceiver.go @@ -36,16 +36,26 @@ func (r *SampleReceiver) Start() 
{ // or use own go routine but always make sure it exits // as soon as it gets the signal of the r.done channel + // + // r.done = make(chan bool) // r.wg.Add(1) // go func() { - // for { - // select { - // case <-r.done: - // r.wg.Done() - // return - // } - // } - // r.wg.Done() + // defer r.wg.Done() + // + // // Create ticker + // ticker := time.NewTicker(30 * time.Second) + // defer ticker.Stop() + // + // for { + // readMetric() + // select { + // case <-ticker.C: + // // process ticker event -> continue + // continue + // case <-r.done: + // return + // } + // } // }() } diff --git a/scripts/cc-metric-collector.config b/scripts/cc-metric-collector.config index 3535ddf..988b0ff 100644 --- a/scripts/cc-metric-collector.config +++ b/scripts/cc-metric-collector.config @@ -15,3 +15,9 @@ CONF_DIR=/etc/cc-metric-collector CONF_FILE=/etc/cc-metric-collector/cc-metric-collector.json RESTART_ON_UPGRADE=true + +# Golang runtime debugging. (see: https://pkg.go.dev/runtime) +# GODEBUG=gctrace=1 + +# Golang garbage collection target percentage +# GOGC=100 diff --git a/sinks.json b/sinks.json index 2fdae5a..3e9be6d 100644 --- a/sinks.json +++ b/sinks.json @@ -1,6 +1,8 @@ { - "mystdout" : { - "type" : "stdout", - "meta_as_tags" : true + "mystdout": { + "type": "stdout", + "meta_as_tags": [ + "unit" + ] } -} +} \ No newline at end of file diff --git a/sinks/httpSink.go b/sinks/httpSink.go index 398eaf3..7713638 100644 --- a/sinks/httpSink.go +++ b/sinks/httpSink.go @@ -42,13 +42,13 @@ func (s *HttpSink) Write(m lp.CCMetric) error { if s.buffer.Len() == 0 && s.flushDelay != 0 { // This is the first write since the last flush, start the flushTimer! 
if s.flushTimer != nil && s.flushTimer.Stop() { - cclog.ComponentDebug("HttpSink", "unexpected: the flushTimer was already running?") + cclog.ComponentDebug(s.name, "unexpected: the flushTimer was already running?") } // Run a batched flush for all lines that have arrived in the last second s.flushTimer = time.AfterFunc(s.flushDelay, func() { if err := s.Flush(); err != nil { - cclog.ComponentError("HttpSink", "flush failed:", err.Error()) + cclog.ComponentError(s.name, "flush failed:", err.Error()) } }) } @@ -60,6 +60,7 @@ func (s *HttpSink) Write(m lp.CCMetric) error { s.lock.Unlock() // defer does not work here as Flush() takes the lock as well if err != nil { + cclog.ComponentError(s.name, "encoding failed:", err.Error()) return err } @@ -84,6 +85,7 @@ func (s *HttpSink) Flush() error { // Create new request to send buffer req, err := http.NewRequest(http.MethodPost, s.config.URL, s.buffer) if err != nil { + cclog.ComponentError(s.name, "failed to create request:", err.Error()) return err } @@ -100,12 +102,15 @@ func (s *HttpSink) Flush() error { // Handle transport/tcp errors if err != nil { + cclog.ComponentError(s.name, "transport/tcp error:", err.Error()) return err } // Handle application errors if res.StatusCode != http.StatusOK { - return errors.New(res.Status) + err = errors.New(res.Status) + cclog.ComponentError(s.name, "application error:", err.Error()) + return err } return nil @@ -114,7 +119,7 @@ func (s *HttpSink) Flush() error { func (s *HttpSink) Close() { s.flushTimer.Stop() if err := s.Flush(); err != nil { - cclog.ComponentError("HttpSink", "flush failed:", err.Error()) + cclog.ComponentError(s.name, "flush failed:", err.Error()) } s.client.CloseIdleConnections() } diff --git a/sinks/influxAsyncSink.go b/sinks/influxAsyncSink.go index 213f2d6..bf88079 100644 --- a/sinks/influxAsyncSink.go +++ b/sinks/influxAsyncSink.go @@ -6,12 +6,14 @@ import ( "encoding/json" "errors" "fmt" + "strings" "time" cclog 
"github.com/ClusterCockpit/cc-metric-collector/internal/ccLogger" lp "github.com/ClusterCockpit/cc-metric-collector/internal/ccMetric" influxdb2 "github.com/influxdata/influxdb-client-go/v2" influxdb2Api "github.com/influxdata/influxdb-client-go/v2/api" + influxdb2ApiHttp "github.com/influxdata/influxdb-client-go/v2/api/http" ) type InfluxAsyncSinkConfig struct { @@ -23,15 +25,16 @@ type InfluxAsyncSinkConfig struct { Password string `json:"password,omitempty"` Organization string `json:"organization,omitempty"` SSL bool `json:"ssl,omitempty"` - RetentionPol string `json:"retention_policy,omitempty"` // Maximum number of points sent to server in single request. Default 5000 BatchSize uint `json:"batch_size,omitempty"` // Interval, in ms, in which is buffer flushed if it has not been already written (by reaching batch size) . Default 1000ms FlushInterval uint `json:"flush_interval,omitempty"` - InfluxRetryInterval string `json:"retry_interval"` - InfluxExponentialBase uint `json:"retry_exponential_base"` - InfluxMaxRetries uint `json:"max_retries"` - InfluxMaxRetryTime string `json:"max_retry_time"` + InfluxRetryInterval string `json:"retry_interval,omitempty"` + InfluxExponentialBase uint `json:"retry_exponential_base,omitempty"` + InfluxMaxRetries uint `json:"max_retries,omitempty"` + InfluxMaxRetryTime string `json:"max_retry_time,omitempty"` + CustomFlushInterval string `json:"custom_flush_interval,omitempty"` + MaxRetryAttempts uint `json:"max_retry_attempts,omitempty"` } type InfluxAsyncSink struct { @@ -42,6 +45,8 @@ type InfluxAsyncSink struct { config InfluxAsyncSinkConfig influxRetryInterval uint influxMaxRetryTime uint + customFlushInterval time.Duration + flushTimer *time.Timer } func (s *InfluxAsyncSink) connect() error { @@ -60,20 +65,34 @@ func (s *InfluxAsyncSink) connect() error { cclog.ComponentDebug(s.name, "Using URI", uri, "Org", s.config.Organization, "Bucket", s.config.Database) clientOptions := influxdb2.DefaultOptions() if s.config.BatchSize 
!= 0 { + cclog.ComponentDebug(s.name, "Batch size", s.config.BatchSize) clientOptions.SetBatchSize(s.config.BatchSize) } if s.config.FlushInterval != 0 { + cclog.ComponentDebug(s.name, "Flush interval", s.config.FlushInterval) clientOptions.SetFlushInterval(s.config.FlushInterval) } + if s.influxRetryInterval != 0 { + cclog.ComponentDebug(s.name, "MaxRetryInterval", s.influxRetryInterval) + clientOptions.SetMaxRetryInterval(s.influxRetryInterval) + } + if s.influxMaxRetryTime != 0 { + cclog.ComponentDebug(s.name, "MaxRetryTime", s.influxMaxRetryTime) + clientOptions.SetMaxRetryTime(s.influxMaxRetryTime) + } + if s.config.InfluxExponentialBase != 0 { + cclog.ComponentDebug(s.name, "Exponential Base", s.config.InfluxExponentialBase) + clientOptions.SetExponentialBase(s.config.InfluxExponentialBase) + } + if s.config.InfluxMaxRetries != 0 { + cclog.ComponentDebug(s.name, "Max Retries", s.config.InfluxMaxRetries) + clientOptions.SetMaxRetries(s.config.InfluxMaxRetries) + } clientOptions.SetTLSConfig( &tls.Config{ InsecureSkipVerify: true, }, - ) - clientOptions.SetMaxRetryInterval(s.influxRetryInterval) - clientOptions.SetMaxRetryTime(s.influxMaxRetryTime) - clientOptions.SetExponentialBase(s.config.InfluxExponentialBase) - clientOptions.SetMaxRetries(s.config.InfluxMaxRetries) + ).SetPrecision(time.Second) s.client = influxdb2.NewClientWithOptions(uri, auth, clientOptions) s.writeApi = s.client.WriteAPI(s.config.Organization, s.config.Database) @@ -84,10 +103,23 @@ func (s *InfluxAsyncSink) connect() error { if !ok { return fmt.Errorf("connection to %s not healthy", uri) } + s.writeApi.SetWriteFailedCallback(func(batch string, err influxdb2ApiHttp.Error, retryAttempts uint) bool { + mlist := strings.Split(batch, "\n") + cclog.ComponentError(s.name, fmt.Sprintf("Failed to write batch with %d metrics %d times (max: %d): %s", len(mlist), retryAttempts, s.config.MaxRetryAttempts, err.Error())) + return retryAttempts <= s.config.MaxRetryAttempts + }) return nil } func (s 
*InfluxAsyncSink) Write(m lp.CCMetric) error { + if s.customFlushInterval != 0 && s.flushTimer == nil { + // Run a batched flush for all lines that have arrived in the defined interval + s.flushTimer = time.AfterFunc(s.customFlushInterval, func() { + if err := s.Flush(); err != nil { + cclog.ComponentError(s.name, "flush failed:", err.Error()) + } + }) + } s.writeApi.WritePoint( m.ToPoint(s.meta_as_tags), ) @@ -95,7 +127,11 @@ func (s *InfluxAsyncSink) Write(m lp.CCMetric) error { } func (s *InfluxAsyncSink) Flush() error { + cclog.ComponentDebug(s.name, "Flushing") s.writeApi.Flush() + if s.customFlushInterval != 0 && s.flushTimer != nil { + s.flushTimer = nil + } return nil } @@ -110,13 +146,17 @@ func NewInfluxAsyncSink(name string, config json.RawMessage) (Sink, error) { s.name = fmt.Sprintf("InfluxSink(%s)", name) // Set default for maximum number of points sent to server in single request. - s.config.BatchSize = 100 - s.influxRetryInterval = uint(time.Duration(1) * time.Second) - s.config.InfluxRetryInterval = "1s" - s.influxMaxRetryTime = uint(7 * time.Duration(24) * time.Hour) - s.config.InfluxMaxRetryTime = "168h" - s.config.InfluxMaxRetries = 20 - s.config.InfluxExponentialBase = 2 + s.config.BatchSize = 0 + s.influxRetryInterval = 0 + //s.config.InfluxRetryInterval = "1s" + s.influxMaxRetryTime = 0 + //s.config.InfluxMaxRetryTime = "168h" + s.config.InfluxMaxRetries = 0 + s.config.InfluxExponentialBase = 0 + s.config.FlushInterval = 0 + s.config.CustomFlushInterval = "" + s.customFlushInterval = time.Duration(0) + s.config.MaxRetryAttempts = 1 // Default retry intervals (in seconds) // 1 2 @@ -145,12 +185,17 @@ func NewInfluxAsyncSink(name string, config json.RawMessage) (Sink, error) { return nil, err } } - if len(s.config.Host) == 0 || - len(s.config.Port) == 0 || - len(s.config.Database) == 0 || - len(s.config.Organization) == 0 || - len(s.config.Password) == 0 { - return nil, errors.New("not all configuration variables set required by 
InfluxAsyncSink") + if len(s.config.Port) == 0 { + return nil, errors.New("Missing port configuration required by InfluxSink") + } + if len(s.config.Database) == 0 { + return nil, errors.New("Missing database configuration required by InfluxSink") + } + if len(s.config.Organization) == 0 { + return nil, errors.New("Missing organization configuration required by InfluxSink") + } + if len(s.config.Password) == 0 { + return nil, errors.New("Missing password configuration required by InfluxSink") } // Create lookup map to use meta infos as tags in the output metric s.meta_as_tags = make(map[string]bool) @@ -168,6 +213,15 @@ func NewInfluxAsyncSink(name string, config json.RawMessage) (Sink, error) { s.influxRetryInterval = toUint(s.config.InfluxRetryInterval, s.influxRetryInterval) s.influxMaxRetryTime = toUint(s.config.InfluxMaxRetryTime, s.influxMaxRetryTime) + // Use a own timer for calling Flush() + if len(s.config.CustomFlushInterval) > 0 { + t, err := time.ParseDuration(s.config.CustomFlushInterval) + if err != nil { + return nil, fmt.Errorf("invalid duration in 'custom_flush_interval': %v", err) + } + s.customFlushInterval = t + } + // Connect to InfluxDB server if err := s.connect(); err != nil { return nil, fmt.Errorf("unable to connect: %v", err) diff --git a/sinks/influxSink.go b/sinks/influxSink.go index 1987342..212647d 100644 --- a/sinks/influxSink.go +++ b/sinks/influxSink.go @@ -6,69 +6,81 @@ import ( "encoding/json" "errors" "fmt" + "sync" "time" cclog "github.com/ClusterCockpit/cc-metric-collector/internal/ccLogger" lp "github.com/ClusterCockpit/cc-metric-collector/internal/ccMetric" influxdb2 "github.com/influxdata/influxdb-client-go/v2" influxdb2Api "github.com/influxdata/influxdb-client-go/v2/api" + "github.com/influxdata/influxdb-client-go/v2/api/write" ) -type InfluxSinkConfig struct { - defaultSinkConfig - Host string `json:"host,omitempty"` - Port string `json:"port,omitempty"` - Database string `json:"database,omitempty"` - User string 
`json:"user,omitempty"` - Password string `json:"password,omitempty"` - Organization string `json:"organization,omitempty"` - SSL bool `json:"ssl,omitempty"` - RetentionPol string `json:"retention_policy,omitempty"` - InfluxRetryInterval string `json:"retry_interval"` - InfluxExponentialBase uint `json:"retry_exponential_base"` - InfluxMaxRetries uint `json:"max_retries"` - InfluxMaxRetryTime string `json:"max_retry_time"` - //InfluxMaxRetryDelay string `json:"max_retry_delay"` // It is mentioned in the docs but there is no way to set it -} - type InfluxSink struct { sink - client influxdb2.Client - writeApi influxdb2Api.WriteAPIBlocking - config InfluxSinkConfig - influxRetryInterval uint - influxMaxRetryTime uint - //influxMaxRetryDelay uint + client influxdb2.Client + writeApi influxdb2Api.WriteAPIBlocking + config struct { + defaultSinkConfig + Host string `json:"host,omitempty"` + Port string `json:"port,omitempty"` + Database string `json:"database,omitempty"` + User string `json:"user,omitempty"` + Password string `json:"password,omitempty"` + Organization string `json:"organization,omitempty"` + SSL bool `json:"ssl,omitempty"` + // Maximum number of points sent to server in single request. Default 100 + BatchSize int `json:"batch_size,omitempty"` + // Interval, in which is buffer flushed if it has not been already written (by reaching batch size). 
Default 1s + FlushInterval string `json:"flush_delay,omitempty"` + } + batch []*write.Point + flushTimer *time.Timer + flushDelay time.Duration + lock sync.Mutex // Flush() runs in another goroutine, so this lock has to protect the buffer } +// connect connects to the InfluxDB server func (s *InfluxSink) connect() error { - var auth string + + // URI options: + // * http://host:port + // * https://host:port var uri string if s.config.SSL { uri = fmt.Sprintf("https://%s:%s", s.config.Host, s.config.Port) } else { uri = fmt.Sprintf("http://%s:%s", s.config.Host, s.config.Port) } + + // Authentication options: + // * token + // * username:password + var auth string if len(s.config.User) == 0 { auth = s.config.Password } else { auth = fmt.Sprintf("%s:%s", s.config.User, s.config.Password) } cclog.ComponentDebug(s.name, "Using URI", uri, "Org", s.config.Organization, "Bucket", s.config.Database) + + // Set influxDB client options clientOptions := influxdb2.DefaultOptions() + + // Do not check InfluxDB certificate clientOptions.SetTLSConfig( &tls.Config{ InsecureSkipVerify: true, }, ) - clientOptions.SetMaxRetryInterval(s.influxRetryInterval) - clientOptions.SetMaxRetryTime(s.influxMaxRetryTime) - clientOptions.SetExponentialBase(s.config.InfluxExponentialBase) - clientOptions.SetMaxRetries(s.config.InfluxMaxRetries) + clientOptions.SetPrecision(time.Second) + // Create new writeAPI s.client = influxdb2.NewClientWithOptions(uri, auth, clientOptions) s.writeApi = s.client.WriteAPIBlocking(s.config.Organization, s.config.Database) + + // Check InfluxDB server accessibility ok, err := s.client.Ping(context.Background()) if err != nil { return err @@ -80,61 +92,126 @@ func (s *InfluxSink) connect() error { } func (s *InfluxSink) Write(m lp.CCMetric) error { - err := - s.writeApi.WritePoint( - context.Background(), - m.ToPoint(s.meta_as_tags), - ) - return err + + if len(s.batch) == 0 && s.flushDelay != 0 { + // This is the first write since the last flush, start the 
flushTimer! + if s.flushTimer != nil && s.flushTimer.Stop() { + cclog.ComponentDebug(s.name, "unexpected: the flushTimer was already running?") + } + + // Run a batched flush for all lines that have arrived in the last flush delay interval + s.flushTimer = time.AfterFunc( + s.flushDelay, + func() { + if err := s.Flush(); err != nil { + cclog.ComponentError(s.name, "flush failed:", err.Error()) + } + }) + } + + // Append metric to batch slice + p := m.ToPoint(s.meta_as_tags) + s.lock.Lock() + s.batch = append(s.batch, p) + s.lock.Unlock() + + // Flush synchronously if "flush_delay" is zero + if s.flushDelay == 0 { + return s.Flush() + } + + // Flush if batch size is reached + if len(s.batch) == s.config.BatchSize { + return s.Flush() + } + + return nil } +// Flush sends all metrics buffered in batch slice to InfluxDB server func (s *InfluxSink) Flush() error { + + // Lock access to batch slice + s.lock.Lock() + defer s.lock.Unlock() + + // Nothing to do, batch slice is empty + if len(s.batch) == 0 { + return nil + } + + // Send metrics from batch slice + err := s.writeApi.WritePoint(context.Background(), s.batch...) 
+ if err != nil { + cclog.ComponentError(s.name, "flush failed:", err.Error()) + return err + } + + // Clear batch slice + for i := range s.batch { + s.batch[i] = nil + } + s.batch = s.batch[:0] + return nil } func (s *InfluxSink) Close() { cclog.ComponentDebug(s.name, "Closing InfluxDB connection") + s.flushTimer.Stop() + s.Flush() s.client.Close() } +// NewInfluxSink create a new InfluxDB sink func NewInfluxSink(name string, config json.RawMessage) (Sink, error) { s := new(InfluxSink) s.name = fmt.Sprintf("InfluxSink(%s)", name) + + // Set config default values + s.config.BatchSize = 100 + s.config.FlushInterval = "1s" + + // Read config if len(config) > 0 { err := json.Unmarshal(config, &s.config) if err != nil { return nil, err } } - s.influxRetryInterval = uint(time.Duration(1) * time.Second) - s.config.InfluxRetryInterval = "1s" - s.influxMaxRetryTime = uint(7 * time.Duration(24) * time.Hour) - s.config.InfluxMaxRetryTime = "168h" - s.config.InfluxMaxRetries = 20 - s.config.InfluxExponentialBase = 2 - if len(s.config.Host) == 0 || - len(s.config.Port) == 0 || - len(s.config.Database) == 0 || - len(s.config.Organization) == 0 || - len(s.config.Password) == 0 { - return nil, errors.New("not all configuration variables set required by InfluxSink") + if len(s.config.Host) == 0 { + return nil, errors.New("Missing host configuration required by InfluxSink") } + if len(s.config.Port) == 0 { + return nil, errors.New("Missing port configuration required by InfluxSink") + } + if len(s.config.Database) == 0 { + return nil, errors.New("Missing database configuration required by InfluxSink") + } + if len(s.config.Organization) == 0 { + return nil, errors.New("Missing organization configuration required by InfluxSink") + } + if len(s.config.Password) == 0 { + return nil, errors.New("Missing password configuration required by InfluxSink") + } + // Create lookup map to use meta infos as tags in the output metric s.meta_as_tags = make(map[string]bool) for _, k := range 
s.config.MetaAsTags { s.meta_as_tags[k] = true } - toUint := func(duration string, def uint) uint { - t, err := time.ParseDuration(duration) + // Configure flush delay duration + if len(s.config.FlushInterval) > 0 { + t, err := time.ParseDuration(s.config.FlushInterval) if err == nil { - return uint(t.Milliseconds()) + s.flushDelay = t } - return def } - s.influxRetryInterval = toUint(s.config.InfluxRetryInterval, s.influxRetryInterval) - s.influxMaxRetryTime = toUint(s.config.InfluxMaxRetryTime, s.influxMaxRetryTime) + + // allocate batch slice + s.batch = make([]*write.Point, 0, s.config.BatchSize) // Connect to InfluxDB server if err := s.connect(); err != nil { diff --git a/sinks/influxSink.md b/sinks/influxSink.md index a099895..8f9ce83 100644 --- a/sinks/influxSink.md +++ b/sinks/influxSink.md @@ -17,10 +17,8 @@ The `influxdb` sink uses the official [InfluxDB golang client](https://pkg.go.de "password" : "examplepw", "organization": "myorg", "ssl": true, - "retry_interval" : "1s", - "retry_exponential_base" : 2, - "max_retries": 20, - "max_retry_time" : "168h" + "flush_delay" : "1s", + "batch_size" : 100 } } ``` @@ -34,9 +32,6 @@ The `influxdb` sink uses the official [InfluxDB golang client](https://pkg.go.de - `password`: Password for basic authentification - `organization`: Organization in the InfluxDB - `ssl`: Use SSL connection -- `retry_interval`: Base retry interval for failed write requests, default 1s -- `retry_exponential_base`: The retry interval is exponentially increased with this base, default 2 -- `max_retries`: Maximal number of retry attempts -- `max_retry_time`: Maximal time to retry failed writes, default 168h (one week) +- `flush_delay`: Group metrics coming in to a single batch +- `batch_size`: Maximal batch size -For information about the calculation of the retry interval settings, see [offical influxdb-client-go documentation](https://github.com/influxdata/influxdb-client-go#handling-of-failed-async-writes) \ No newline at end of file