Merge branch 'develop' into main

2025-07-21 20:31:41 +02:00 · 2022-03-31 11:57:19 +02:00 · 2022-03-31 11:47:02 +02:00 · 2022-03-16 19:08:13 +01:00 · 2022-03-15 16:41:11 +01:00 · 2022-03-04 23:34:28 +01:00
17 changed files with 223 additions and 770 deletions
--- a/collectors/diskstatMetric.go
+++ b/collectors/diskstatMetric.go
@@ -3,6 +3,7 @@ package collectors
 import (
 	"bufio"
 	"encoding/json"
 	"fmt"
 	"os"
 	"strings"
 	"syscall"
@@ -80,7 +81,8 @@ func (m *DiskstatCollector) Read(interval time.Duration, output chan lp.CCMetric
 		stat := syscall.Statfs_t{}
 		err := syscall.Statfs(path, &stat)
 		if err != nil {
-			continue
+			fmt.Println(err.Error())
 			return
 		}
 		tags := map[string]string{"type": "node", "device": linefields[0]}
 		total := (stat.Blocks * uint64(stat.Bsize)) / uint64(1000000000)
--- a/collectors/gpfsMetric.go
+++ b/collectors/gpfsMetric.go
@@ -70,7 +70,6 @@ func (m *GpfsCollector) Init(config json.RawMessage) error {
 	for _, fs := range m.config.ExcludeFilesystem {
 		m.skipFS[fs] = struct{}{}
 	}
 	m.lastState = make(map[string]GpfsCollectorLastState)
 	// GPFS / IBM Spectrum Scale file system statistics can only be queried by user root
 	user, err := user.Current()
@@ -163,16 +162,11 @@ func (m *GpfsCollector) Read(interval time.Duration, output chan lp.CCMetric) {
 			continue
 		}
 		// Add filesystem tag
 		m.tags["filesystem"] = filesystem
-
+		if _, ok := m.lastState[filesystem]; !ok {
-		// Create initial last state
+			m.lastState[filesystem] = GpfsCollectorLastState{
-		if m.config.SendBandwidths {
+				bytesRead:    -1,
-			if _, ok := m.lastState[filesystem]; !ok {
+				bytesWritten: -1,
 				m.lastState[filesystem] = GpfsCollectorLastState{
 					bytesRead:    -1,
 					bytesWritten: -1,
 				}
 			}
 		}
--- a/collectors/infinibandMetric.go
+++ b/collectors/infinibandMetric.go
@@ -18,18 +18,13 @@ import (
 const IB_BASEPATH = "/sys/class/infiniband/"
 // InfinibandCollectorMetric describes a single port counter: the file the
 // counter is read from and the unit attached to the metric built from it.
 type InfinibandCollectorMetric struct {
 	path, unit string
 }
 type InfinibandCollectorInfo struct {
-	LID              string                               // IB local Identifier (LID)
+	LID              string            // IB local Identifier (LID)
-	device           string                               // IB device
+	device           string            // IB device
-	port             string                               // IB device port
+	port             string            // IB device port
-	portCounterFiles map[string]InfinibandCollectorMetric // mapping counter name -> InfinibandCollectorMetric
+	portCounterFiles map[string]string // mapping counter name -> sysfs file
-	tagSet           map[string]string                    // corresponding tag list
+	tagSet           map[string]string // corresponding tag list
-	lastState        map[string]int64                     // State from last measurement
+	lastState        map[string]int64  // State from last measurement
 }
 type InfinibandCollector struct {
@@ -111,16 +106,16 @@ func (m *InfinibandCollector) Init(config json.RawMessage) error {
 		// Check access to counter files
 		countersDir := filepath.Join(path, "counters")
-		portCounterFiles := map[string]InfinibandCollectorMetric{
+		portCounterFiles := map[string]string{
-			"ib_recv":      {path: filepath.Join(countersDir, "port_rcv_data"), unit: "bytes"},
+			"ib_recv":      filepath.Join(countersDir, "port_rcv_data"),
-			"ib_xmit":      {path: filepath.Join(countersDir, "port_xmit_data"), unit: "bytes"},
+			"ib_xmit":      filepath.Join(countersDir, "port_xmit_data"),
-			"ib_recv_pkts": {path: filepath.Join(countersDir, "port_rcv_packets"), unit: "packets"},
+			"ib_recv_pkts": filepath.Join(countersDir, "port_rcv_packets"),
-			"ib_xmit_pkts": {path: filepath.Join(countersDir, "port_xmit_packets"), unit: "packets"},
+			"ib_xmit_pkts": filepath.Join(countersDir, "port_xmit_packets"),
 		}
-		for _, counter := range portCounterFiles {
+		for _, counterFile := range portCounterFiles {
-			err := unix.Access(counter.path, unix.R_OK)
+			err := unix.Access(counterFile, unix.R_OK)
 			if err != nil {
-				return fmt.Errorf("unable to access %s: %v", counter.path, err)
+				return fmt.Errorf("unable to access %s: %v", counterFile, err)
 			}
 		}
@@ -170,14 +165,14 @@ func (m *InfinibandCollector) Read(interval time.Duration, output chan lp.CCMetr
 	m.lastTimestamp = now
 	for _, info := range m.info {
-		for counterName, counterDef := range info.portCounterFiles {
+		for counterName, counterFile := range info.portCounterFiles {
 			// Read counter file
-			line, err := ioutil.ReadFile(counterDef.path)
+			line, err := ioutil.ReadFile(counterFile)
 			if err != nil {
 				cclog.ComponentError(
 					m.name,
-					fmt.Sprintf("Read(): Failed to read from file '%s': %v", counterDef.path, err))
+					fmt.Sprintf("Read(): Failed to read from file '%s': %v", counterFile, err))
 				continue
 			}
 			data := strings.TrimSpace(string(line))
@@ -194,7 +189,6 @@ func (m *InfinibandCollector) Read(interval time.Duration, output chan lp.CCMetr
 			// Send absolute values
 			if m.config.SendAbsoluteValues {
 				if y, err := lp.New(counterName, info.tagSet, m.meta, map[string]interface{}{"value": v}, now); err == nil {
 					y.AddMeta("unit", counterDef.unit)
 					output <- y
 				}
 			}
@@ -204,7 +198,6 @@ func (m *InfinibandCollector) Read(interval time.Duration, output chan lp.CCMetr
 				if info.lastState[counterName] >= 0 {
 					rate := float64((v - info.lastState[counterName])) / timeDiff
 					if y, err := lp.New(counterName+"_bw", info.tagSet, m.meta, map[string]interface{}{"value": rate}, now); err == nil {
 						y.AddMeta("unit", counterDef.unit+"/sec")
 						output <- y
 					}
 				}
--- a/collectors/likwidMetric.go
+++ b/collectors/likwidMetric.go
@@ -16,7 +16,6 @@ import (
 	"math"
 	"os"
 	"os/signal"
 	"sort"
 	"strconv"
 	"strings"
 	"sync"
@@ -55,7 +54,6 @@ type LikwidEventsetConfig struct {
 	gid      C.int
 	eorder   []*C.char
 	estr     *C.char
 	go_estr  string
 	results  map[int]map[string]interface{}
 	metrics  map[int]map[string]float64
 }
@@ -103,14 +101,8 @@ func eventsToEventStr(events map[string]string) string {
 func genLikwidEventSet(input LikwidCollectorEventsetConfig) LikwidEventsetConfig {
 	tmplist := make([]string, 0)
 	clist := make([]string, 0)
 	for k := range input.Events {
 		clist = append(clist, k)
 	}
 	sort.Strings(clist)
 	elist := make([]*C.char, 0)
-	for _, k := range clist {
+	for k, v := range input.Events {
 		v := input.Events[k]
 		tmplist = append(tmplist, fmt.Sprintf("%s:%s", v, k))
 		c_counter := C.CString(k)
 		elist = append(elist, c_counter)
@@ -132,7 +124,6 @@ func genLikwidEventSet(input LikwidCollectorEventsetConfig) LikwidEventsetConfig
 		gid:     -1,
 		eorder:  elist,
 		estr:    C.CString(estr),
 		go_estr: estr,
 		results: res,
 		metrics: met,
 	}
@@ -202,7 +193,7 @@ func (m *LikwidCollector) Init(config json.RawMessage) error {
 	}
 	m.setup()
-	m.meta = map[string]string{"group": "PerfCounter"}
+	m.meta = map[string]string{"source": m.name, "group": "PerfCounter"}
 	cclog.ComponentDebug(m.name, "Get cpulist and init maps and lists")
 	cpulist := topo.CpuList()
 	m.cpulist = make([]C.int, len(cpulist))
@@ -329,11 +320,7 @@ func (m *LikwidCollector) calcEventsetMetrics(evset LikwidEventsetConfig, interv
 		gctr := C.GoString(counter)
 		for _, tid := range m.cpu2tid {
 			res := C.perfmon_getLastResult(evset.gid, C.int(eidx), C.int(tid))
-			fres := float64(res)
+			evset.results[tid][gctr] = float64(res)
 			if m.config.InvalidToZero && (math.IsNaN(fres) || math.IsInf(fres, 0)) {
 				fres = 0.0
 			}
 			evset.results[tid][gctr] = fres
 			evset.results[tid]["time"] = interval.Seconds()
 			evset.results[tid]["inverseClock"] = invClock
 		}
@@ -352,12 +339,15 @@ func (m *LikwidCollector) calcEventsetMetrics(evset LikwidEventsetConfig, interv
 				value, err := agg.EvalFloat64Condition(metric.Calc, evset.results[tid])
 				if err != nil {
 					cclog.ComponentError(m.name, "Calculation for metric", metric.Name, "failed:", err.Error())
-					value = 0.0
+					continue
 				}
 				if m.config.InvalidToZero && (math.IsNaN(value) || math.IsInf(value, 0)) {
 					value = 0.0
 				}
 				evset.metrics[tid][metric.Name] = value
 				if m.config.InvalidToZero && math.IsNaN(value) {
 					value = 0.0
 				}
 				if m.config.InvalidToZero && math.IsInf(value, 0) {
 					value = 0.0
 				}
 				// Now we have the result, send it with the proper tags
 				if !math.IsNaN(value) {
 					if metric.Publish {
@@ -401,12 +391,15 @@ func (m *LikwidCollector) calcGlobalMetrics(interval time.Duration, output chan
 				value, err := agg.EvalFloat64Condition(metric.Calc, params)
 				if err != nil {
 					cclog.ComponentError(m.name, "Calculation for metric", metric.Name, "failed:", err.Error())
-					value = 0.0
+					continue
 				}
 				if m.config.InvalidToZero && (math.IsNaN(value) || math.IsInf(value, 0)) {
 					value = 0.0
 				}
 				m.gmresults[tid][metric.Name] = value
 				if m.config.InvalidToZero && math.IsNaN(value) {
 					value = 0.0
 				}
 				if m.config.InvalidToZero && math.IsInf(value, 0) {
 					value = 0.0
 				}
 				// Now we have the result, send it with the proper tags
 				if !math.IsNaN(value) {
 					if metric.Publish {
@@ -432,9 +425,6 @@ func (m *LikwidCollector) calcGlobalMetrics(interval time.Duration, output chan
 func (m *LikwidCollector) LateInit() error {
 	var ret C.int
 	if m.initialized {
 		return nil
 	}
 	switch m.config.AccessMode {
 	case "direct":
 		C.HPMmode(0)
@@ -485,17 +475,7 @@ func (m *LikwidCollector) LateInit() error {
 	for i, evset := range m.config.Eventsets {
 		var gid C.int
 		if len(evset.Events) > 0 {
 			skip := false
 			likwidGroup := genLikwidEventSet(evset)
 			for _, g := range m.likwidGroups {
 				if likwidGroup.go_estr == g.go_estr {
 					skip = true
 					break
 				}
 			}
 			if skip {
 				continue
 			}
 			// Now we add the list of events to likwid
 			gid = C.perfmon_addEventSet(likwidGroup.estr)
 			if gid >= 0 {
@@ -540,14 +520,9 @@ func (m *LikwidCollector) Read(interval time.Duration, output chan lp.CCMetric)
 	}
 	if !m.initialized {
-		m.lock.Lock()
+		if m.LateInit() != nil {
 		err = m.LateInit()
 		if err != nil {
 			m.lock.Unlock()
 			return
 		}
 		m.initialized = true
 		m.lock.Unlock()
 	}
 	if m.initialized && !skip {
--- a/collectors/likwidMetric.md
+++ b/collectors/likwidMetric.md
@@ -3,63 +3,32 @@
 The `likwid` collector is probably the most complicated collector. The LIKWID library is included as static library with *direct* access mode. The *direct* access mode is suitable if the daemon is executed by a root user. The static library does not contain the performance groups, so all information needs to be provided in the configuration.
-```json
+The `likwid` configuration consists of two parts, the "eventsets" and "globalmetrics":
-  "likwid": {
+- An event set list itself has two parts, the "events" and a set of derivable "metrics". Each of the "events" is a counter:event pair in LIKWID's syntax. The "metrics" are a list of formulas to derive the metric value from the measurements of the "events". Each metric has a name, the formula, a scope and a publish flag. Counter names can be used like variables in the formulas, so `PMC0+PMC1` sums the measurements for the both events configured in the counters `PMC0` and `PMC1`. The scope tells the Collector whether it is a metric for each hardware thread (`cpu`) or each CPU socket (`socket`). You may specify a unit for the metric with `unit`. The last one is the publishing flag. It tells the collector whether a metric should be sent to the router.
-    "force_overwrite" : false,
+- The global metrics are metrics which require data from all event set measurements to be derived. The inputs are the metrics in the event sets. Similar to the metrics in the event sets, the global metrics are defined by a name, a formula, a scope and a publish flag. See event set metrics for details. The only difference is that there is no access to the raw event measurements anymore but only to the metrics. So, the idea is to derive a metric in the "eventsets" section and reuse it in the "globalmetrics" part. If you need a metric only for deriving the global metrics, disable forwarding of the event set metrics (`publish=false`). **Be aware** that the combination might be misleading because the "behavior" of a metric changes over time and the multiple measurements might count different computing phases. Similar to the metrics in the eventset, you can specify a metric unit with the `unit` field.
    "invalid_to_zero" : false,
    "eventsets": [
      {
        "events" : {
          "COUNTER0": "EVENT0",
          "COUNTER1": "EVENT1"
        },
        "metrics" : [
          {
            "name": "sum_01",
            "calc": "COUNTER0 + COUNTER1",
            "publish": false,
            "unit": "myunit",
            "type": "cpu"
          }
        ]
      }
     ],
    "globalmetrics" : [
      {
        "name": "global_sum",
        "calc": "sum_01",
        "publish": true,
        "unit": "myunit",
        "type": "cpu"
      }
    ]
  }
 ```
 The `likwid` configuration consists of two parts, the `eventsets` and `globalmetrics`:
 - An event set list itself has two parts, the `events` and a set of derivable `metrics`. Each of the `events` is a `counter:event` pair in LIKWID's syntax. The `metrics` are a list of formulas to derive the metric value from the measurements of the `events`' values. Each metric has a name, the formula, a type and a publish flag. There is an optional `unit` field. Counter names can be used like variables in the formulas, so `PMC0+PMC1` sums the measurements for the both events configured in the counters `PMC0` and `PMC1`. You can optionally use `time` for the measurement time and `inverseClock` for `1.0/baseCpuFrequency`. The type tells the LikwidCollector whether it is a metric for each hardware thread (`cpu`) or each CPU socket (`socket`). You may specify a unit for the metric with `unit`. The last one is the publishing flag. It tells the LikwidCollector whether a metric should be sent to the router or is only used internally to compute a global metric.
 - The `globalmetrics` are metrics which require data from multiple event set measurements to be derived. The inputs are the metrics in the event sets. Similar to the metrics in the event sets, the global metrics are defined by a name, a formula, a scope and a publish flag. See event set metrics for details. The only difference is that there is no access to the raw event measurements anymore but only to the metrics. Also `time` and `inverseClock` cannot be used anymore. So, the idea is to derive a metric in the `eventsets` section and reuse it in the `globalmetrics` part. If you need a metric only for deriving the global metrics, disable forwarding of the event set metrics (`"publish": false`). **Be aware** that the combination might be misleading because the "behavior" of a metric changes over time and the multiple measurements might count different computing phases. Similar to the metrics in the eventset, you can specify a metric unit with the `unit` field.
 Additional options:
 - `access_mode` : Method to use for hardware performance monitoring (`direct` access as root user, `accessdaemon` for the daemon mode)
 - `accessdaemon_path`: Folder with the access daemon `likwid-accessD`, commonly `$LIKWID_INSTALL_LOC/sbin`
 - `force_overwrite`: Same as setting `LIKWID_FORCE=1`. In case counters are already in-use, LIKWID overwrites their configuration to do its measurements
- `invalid_to_zero`: In some cases, the calculations result in `NaN` or `Inf`. With this option, all `NaN` and `Inf` values are replaces with `0.0`. See below in [seperate section](./likwidMetric.md#invalid_to_zero-option)
+- `invalid_to_zero`: In some cases, the calculations result in `NaN` or `Inf`. With this option, all `NaN` and `Inf` values are replaced with `0.0`.
- `access_mode`: Specify LIKWID access mode: `direct` for direct register access as root user or `accessdaemon`. The access mode `perf_event` is current untested.
+- `access_mode`: Specify LIKWID access mode: `direct` for direct register access as root user or `accessdaemon`
- `accessdaemon_path`: Folder of the accessDaemon `likwid-accessD` (like `/usr/local/sbin`)
+- `accessdaemon_path`: Folder of the accessDaemon `likwid-accessD`
- `liblikwid_path`: Location of `liblikwid.so` including file name like `/usr/local/lib/liblikwid.so`
+- `liblikwid_path`: Location of `liblikwid.so`
 ### Available metric scopes
-Hardware performance counters are scattered all over the system nowadays. A counter coveres a specific part of the system. While there are hardware thread specific counter for CPU cycles, instructions and so on, some others are specific for a whole CPU socket/package. To address that, the LikwidCollector provides the specification of a `type` for each metric.
+Hardware performance counters are scattered all over the system nowadays. A counter covers a specific part of the system. While there are hardware-thread-specific counters for CPU cycles, instructions and so on, some others are specific to a whole CPU socket/package. To address that, the collector provides the specification of a 'scope' for each metric.
 - `cpu` : One metric per CPU hardware thread with the tags `"type" : "cpu"` and `"type-id" : "$cpu_id"`
 - `socket` : One metric per CPU socket/package with the tags `"type" : "socket"` and `"type-id" : "$socket_id"`
-**Note:** You should not specify the `socket` type for a metric that is measured at `cpu` scope and vice versa, so some kind of expert knowledge or lookup work in the [Likwid Wiki](https://github.com/RRZE-HPC/likwid/wiki) is required. Get the scope of each counter from the *Architecture* pages and as soon as one counter in a metric is socket-specific, the whole metric is socket-specific.
+**Note:** You cannot specify `socket` scope for a metric that is measured at `cpu` scope, so some kind of expert knowledge or lookup work in the [Likwid Wiki](https://github.com/RRZE-HPC/likwid/wiki) is required. Get the scope of each counter from the *Architecture* pages and as soon as one counter in a metric is socket-specific, the whole metric is socket-specific.
 As a guideline:
 - All counters `FIXCx`, `PMCy` and `TMAz` have the scope `cpu`
 - All counters names containing `BOX` have the scope `socket`
- All `PWRx` counters have scope `socket`, except `"PWR1" : "RAPL_CORE_ENERGY"` has `cpu` scope (AMD Zen)
+- All `PWRx` counters have scope `socket`, except `"PWR1" : "RAPL_CORE_ENERGY"` has `cpu` scope
 - All `DFCx` counters have scope `socket`
 ### Help with the configuration
@@ -81,7 +50,6 @@ $ scripts/likwid_perfgroup_to_cc_config.py ICX MEM_DP
 {
  "events": {
    "FIXC0": "INSTR_RETIRED_ANY",
    "FIXC1": "CPU_CLK_UNHALTED_CORE",
    "..." : "..."
  },
  "metrics" : [
@@ -107,28 +75,21 @@ LIKWID checks the file `/var/run/likwid.lock` before performing any interfering
 Before (SLURM prolog, ...)
 ```
-$ chown $JOBUSER /var/run/likwid.lock
+$ chown $JOBUSER /var/run/likwid.lock
 ```
 After (SLURM epilog, ...)
 ```
-$ chown $CCUSER /var/run/likwid.lock
+$ chown $CCUSER /var/run/likwid.lock
 ```
 ### `invalid_to_zero` option
 In some cases LIKWID returns `0.0` for some events that are further used in processing and maybe used as divisor in a calculation. After evaluation of a metric, the result might be `NaN` or `+-Inf`. These resulting metrics are commonly not created and forwarded to the router because the [InfluxDB line protocol](https://docs.influxdata.com/influxdb/cloud/reference/syntax/line-protocol/#float) does not support these special floating-point values. If you want to have them sent, this option forces these metric values to be `0.0` instead.
 One might think this does not happen often, but frequently used metrics in the world of performance engineering, like Instructions-per-Cycle (IPC) or, more frequently, the actual CPU clock, are derived from events like `CPU_CLK_UNHALTED_CORE` (Intel) which do not increment in halted state (as the name implies). There are different power management systems in a chip which can cause a hardware thread to enter such a state. Moreover, if no cycles are executed by the core, many other events are not incremented either (like `INSTR_RETIRED_ANY` for retired instructions, which is part of IPC).
 ### Example configuration
 #### AMD Zen3
 ```json
  "likwid": {
    "force_overwrite" : false,
-    "invalid_to_zero" : false,
+    "invalid_to_zero" : false,
    "eventsets": [
      {
        "events": {
@@ -219,3 +180,33 @@ One might think this does not happen often but often used metrics in the world o
  }
 ```
 ### How to get the eventsets and metrics from LIKWID
 The `likwid` collector reads hardware performance counters at a **cpu** and **socket** level. The configuration looks quite complicated but it is basically copy&paste from [LIKWID's performance groups](https://github.com/RRZE-HPC/likwid/tree/master/groups). The collector made multiple iterations and tried to use the performance groups but it lacked flexibility. The current way of configuration provides most flexibility.
 The logic is as following: There are multiple eventsets, each consisting of a list of counters+events and a list of metrics. If you compare a common performance group with the example setting above, there is not much difference:
 ```
 EVENTSET                         ->   "events": {
 FIXC1 ACTUAL_CPU_CLOCK           ->     "FIXC1": "ACTUAL_CPU_CLOCK",
 FIXC2 MAX_CPU_CLOCK              ->     "FIXC2": "MAX_CPU_CLOCK",
 PMC0  RETIRED_INSTRUCTIONS       ->     "PMC0" : "RETIRED_INSTRUCTIONS",
 PMC1  CPU_CLOCKS_UNHALTED        ->     "PMC1" : "CPU_CLOCKS_UNHALTED",
 PMC2  RETIRED_SSE_AVX_FLOPS_ALL  ->     "PMC2": "RETIRED_SSE_AVX_FLOPS_ALL",
 PMC3  MERGE                      ->     "PMC3": "MERGE",
                                 ->   }
 ```
 The metrics are following the same procedure:
 ```
 METRICS                          ->   "metrics": [
 IPC   PMC0/PMC1                  ->     {
                                 ->       "name" : "IPC",
                                 ->       "calc" : "PMC0/PMC1",
                                 ->       "scope": "cpu",
                                 ->       "publish": true
                                 ->     }
                                 ->   ]
 ```
 The script `scripts/likwid_perfgroup_to_cc_config.py` might help you.
--- a/go.mod
+++ b/go.mod
@@ -3,14 +3,17 @@ module github.com/ClusterCockpit/cc-metric-collector
 go 1.16
 require (
-	github.com/NVIDIA/go-nvml v0.11.6-0
+	github.com/NVIDIA/go-nvml v0.11.1-0
-	github.com/PaesslerAG/gval v1.1.2
+	github.com/influxdata/influxdb-client-go/v2 v2.7.0
 	github.com/gorilla/mux v1.8.0
 	github.com/influxdata/influxdb-client-go/v2 v2.8.1
 	github.com/influxdata/line-protocol v0.0.0-20210922203350-b1ad95c89adf
-	github.com/nats-io/nats-server/v2 v2.8.0 // indirect
+	github.com/nats-io/nats.go v1.13.1-0.20211122170419-d7c1d78a50fc
-	github.com/nats-io/nats.go v1.14.0
+	golang.org/x/sys v0.0.0-20220114195835-da31bd327af9
-	github.com/prometheus/client_golang v1.12.1
+	gopkg.in/Knetic/govaluate.v2 v2.3.0
-	github.com/stmcginnis/gofish v0.13.0
+)
-	golang.org/x/sys v0.0.0-20220412211240-33da011f77ad
+
 require (
 	github.com/PaesslerAG/gval v1.1.2
 	github.com/golang/protobuf v1.5.2 // indirect
 	github.com/nats-io/nats-server/v2 v2.7.0 // indirect
 	google.golang.org/protobuf v1.27.1 // indirect
 )
--- a/internal/metricRouter/metricRouter.go
+++ b/internal/metricRouter/metricRouter.go
@@ -48,6 +48,7 @@ type metricRouter struct {
 	done        chan bool           // channel to finish / stop metric router
 	wg          *sync.WaitGroup     // wait group for all goroutines in cc-metric-collector
 	timestamp   time.Time           // timestamp periodically updated by ticker each interval
 	timerdone   chan bool           // channel to finish / stop timestamp updater
 	ticker      mct.MultiChanTicker // periodically ticking once each interval
 	config      metricRouterConfig  // json encoded config for metric router
 	cache       MetricCache         // pointer to MetricCache
@@ -123,6 +124,29 @@ func (r *metricRouter) Init(ticker mct.MultiChanTicker, wg *sync.WaitGroup, rout
 	return nil
 }
 // StartTimer registers a fresh channel with the router's ticker and launches
 // a goroutine that stores every tick received on it in r.timestamp. The
 // goroutine is counted in r.wg and runs until a value arrives on r.timerdone.
 func (r *metricRouter) StartTimer() {
 	tickChan := make(chan time.Time)
 	r.ticker.AddChannel(tickChan)
 	r.timerdone = make(chan bool)
 	r.wg.Add(1)

 	// updateLoop keeps r.timestamp in step with the ticker until told to stop.
 	updateLoop := func() {
 		defer r.wg.Done()
 		for {
 			select {
 			case tick := <-tickChan:
 				r.timestamp = tick
 			case <-r.timerdone:
 				// Closing the channel acknowledges the stop request; the
 				// requester waits for this close before proceeding.
 				close(r.timerdone)
 				cclog.ComponentDebug("MetricRouter", "TIMER DONE")
 				return
 			}
 		}
 	}
 	go updateLoop()

 	cclog.ComponentDebug("MetricRouter", "TIMER START")
 }
 func getParamMap(point lp.CCMetric) map[string]interface{} {
 	params := make(map[string]interface{})
 	params["metric"] = point
@@ -211,9 +235,8 @@ func (r *metricRouter) dropMetric(point lp.CCMetric) bool {
 func (r *metricRouter) Start() {
 	// start timer if configured
 	r.timestamp = time.Now()
 	timeChan := make(chan time.Time)
 	if r.config.IntervalStamp {
-		r.ticker.AddChannel(timeChan)
+		r.StartTimer()
 	}
 	// Router manager is done
@@ -293,10 +316,6 @@ func (r *metricRouter) Start() {
 				done()
 				return
 			case timestamp := <-timeChan:
 				r.timestamp = timestamp
 				cclog.ComponentDebug("MetricRouter", "Update timestamp", r.timestamp.UnixNano())
 			case p := <-r.coll_input:
 				coll_forward(p)
 				for i := 0; len(r.coll_input) > 0 && i < (r.maxForward-1); i++ {
@@ -342,6 +361,14 @@ func (r *metricRouter) Close() {
 	// wait for close of channel r.done
 	<-r.done
 	// stop timer
 	if r.config.IntervalStamp {
 		cclog.ComponentDebug("MetricRouter", "TIMER CLOSE")
 		r.timerdone <- true
 		// wait for close of channel r.timerdone
 		<-r.timerdone
 	}
 	// stop metric cache
 	if r.config.NumCacheIntervals > 0 {
 		cclog.ComponentDebug("MetricRouter", "CACHE CLOSE")
--- a/receivers.json
+++ b/receivers.json
@@ -4,22 +4,5 @@
        "address": "nats://my-url",
        "port" : "4222",
        "database": "testcluster"
    },
    "redfish_recv": {
        "type": "redfish",
        "client_config": [
            {
                "hostname": "my-host-1",
                "username": "username-1",
                "password": "password-1",
                "endpoint": "https://my-endpoint-1"
            },
            {
                "hostname": "my-host-2",
                "username": "username-2",
                "password": "password-2",
                "endpoint": "https://my-endpoint-2"
            }
        ]
    }
 }
--- a/receivers/receiveManager.go
+++ b/receivers/receiveManager.go
@@ -10,13 +10,14 @@ import (
 )
 var AvailableReceivers = map[string]func(name string, config json.RawMessage) (Receiver, error){
-	"nats":    NewNatsReceiver,
+	"nats": NewNatsReceiver,
 	"redfish": NewRedfishReceiver,
 }
 // receiveManager bundles the configured receivers with the channel and
 // synchronisation state shared by Init, Start, AddOutput and Close.
 type receiveManager struct {
 	// Receivers iterated over by Start() and Close().
 	inputs []Receiver
 	// Metric channel; Init() resets it to nil.
 	// NOTE(review): presumably assigned by AddOutput() — confirm.
 	output chan lp.CCMetric
 	// Created by Init(); its use is not visible in this excerpt.
 	done   chan bool
 	// Wait group handed in by the caller of New()/Init().
 	wg     *sync.WaitGroup
 	// Raw JSON configuration entries.
 	// NOTE(review): presumably one entry per receiver — confirm.
 	config []json.RawMessage
 }
@@ -32,6 +33,8 @@ func (rm *receiveManager) Init(wg *sync.WaitGroup, receiverConfigFile string) er
 	// Initialize struct fields
 	rm.inputs = make([]Receiver, 0)
 	rm.output = nil
 	rm.done = make(chan bool)
 	rm.wg = wg
 	rm.config = make([]json.RawMessage, 0)
 	configFile, err := os.Open(receiverConfigFile)
@@ -55,7 +58,7 @@ func (rm *receiveManager) Init(wg *sync.WaitGroup, receiverConfigFile string) er
 }
 func (rm *receiveManager) Start() {
-	cclog.ComponentDebug("ReceiveManager", "START")
+	rm.wg.Add(1)
 	for _, r := range rm.inputs {
 		cclog.ComponentDebug("ReceiveManager", "START", r.Name())
@@ -94,19 +97,16 @@ func (rm *receiveManager) AddOutput(output chan lp.CCMetric) {
 }
 func (rm *receiveManager) Close() {
 	cclog.ComponentDebug("ReceiveManager", "CLOSE")
 	// Close all receivers
 	for _, r := range rm.inputs {
 		cclog.ComponentDebug("ReceiveManager", "CLOSE", r.Name())
 		r.Close()
 	}
-
+	rm.wg.Done()
-	cclog.ComponentDebug("ReceiveManager", "DONE")
+	cclog.ComponentDebug("ReceiveManager", "CLOSE")
 }
 func New(wg *sync.WaitGroup, receiverConfigFile string) (ReceiveManager, error) {
-	r := new(receiveManager)
+	r := &receiveManager{}
 	err := r.Init(wg, receiverConfigFile)
 	if err != nil {
 		return nil, err
--- a/receivers/redfishReceiver.go
+++ b/receivers/redfishReceiver.go
@@ -1,324 +0,0 @@
 package receivers
 import (
 	"encoding/json"
 	"fmt"
 	"strconv"
 	"sync"
 	"time"
 	cclog "github.com/ClusterCockpit/cc-metric-collector/internal/ccLogger"
 	lp "github.com/ClusterCockpit/cc-metric-collector/internal/ccMetric"
 	// See: https://pkg.go.dev/github.com/stmcginnis/gofish
 	"github.com/stmcginnis/gofish"
 )
 // RedfishReceiver periodically reads power metrics from a list of redfish
 // services and forwards them to the receiver sink.
 //
 // RedfishReceiver configuration:
 type RedfishReceiver struct {
 	receiver
 	config struct {
 		Type string `json:"type"`
 		// Fanout is the upper limit of worker goroutines that query redfish
 		// services in parallel; Interval is the read period in seconds.
 		Fanout   int    `json:"fanout,omitempty"`   // Default fanout: 64
 		Interval int    `json:"interval,omitempty"` // Default interval: 30s
 		// Client config for each redfish service.
 		// Hostname, Username, Password and Endpoint are pointers so that
 		// NewRedfishReceiver can detect (and reject) missing entries.
 		ClientConfigs []struct {
 			// Hostname is attached as "hostname" tag to every metric
 			Hostname *string `json:"hostname"`
 			Username *string `json:"username"`
 			Password *string `json:"password"`
 			Endpoint *string `json:"endpoint"`
 			// Insecure is optional; NewRedfishReceiver uses true when it is not set
 			Insecure *bool `json:"insecure,omitempty"`
 			// ExcludeMetrics lists metric names that are dropped before sending
 			ExcludeMetrics []string `json:"exclude_metrics,omitempty"`
 			// gofish is the client configuration derived from the fields above
 			// (unexported, therefore never read from JSON)
 			gofish gofish.ClientConfig
 		} `json:"client_config"`
 	}
 	done chan bool      // channel to finish / stop redfish receiver
 	wg   sync.WaitGroup // wait group for redfish receiver
 }
 // Start starts the redfish receiver.
 // It launches one background goroutine that reads the power metrics of all
 // configured redfish services immediately and then again on every ticker
 // event (every r.config.Interval seconds) until the r.done channel signals
 // the end. Start itself returns right after the goroutine was launched.
 func (r *RedfishReceiver) Start() {
 	cclog.ComponentDebug(r.name, "START")
 	// readPowerMetric reads redfish power metrics from the endpoint configured
 	// in r.config.ClientConfigs[clientConfigIndex] and sends them to r.sink
 	readPowerMetric := func(clientConfigIndex int) error {
 		clientConfig := &r.config.ClientConfigs[clientConfigIndex]
 		// Connect to redfish service
 		c, err := gofish.Connect(clientConfig.gofish)
 		if err != nil {
 			// Report only username, endpoint and the auth flags in the error
 			// message; the password is left out
 			c := struct {
 				Username  string
 				Endpoint  string
 				BasicAuth bool
 				Insecure  bool
 			}{
 				Username:  clientConfig.gofish.Username,
 				Endpoint:  clientConfig.gofish.Endpoint,
 				BasicAuth: clientConfig.gofish.BasicAuth,
 				Insecure:  clientConfig.gofish.Insecure,
 			}
 			return fmt.Errorf("readPowerMetric: gofish.Connect(%+v) failed: %v", c, err)
 		}
 		// Log out when this read is finished
 		defer c.Logout()
 		// Get all chassis managed by this service
 		chassis_list, err := c.Service.Chassis()
 		if err != nil {
 			return fmt.Errorf("readPowerMetric: c.Service.Chassis() failed: %v", err)
 		}
 		for _, chassis := range chassis_list {
 			// All metrics of one chassis share the same timestamp
 			timestamp := time.Now()
 			// Get power information for each chassis.
 			// An error here ends the read for this service, remaining
 			// chassis are not queried
 			power, err := chassis.Power()
 			if err != nil {
 				return fmt.Errorf("readPowerMetric: chassis.Power() failed: %v", err)
 			}
 			// Skip chassis without power information
 			if power == nil {
 				continue
 			}
 			// Read min, max and average consumed watts for each power control
 			for _, pc := range power.PowerControl {
 				// Map of collected metrics
 				metrics := map[string]float32{
 					// PowerConsumedWatts shall represent the actual power being consumed (in
 					// Watts) by the chassis
 					"consumed_watts": pc.PowerConsumedWatts,
 					// AverageConsumedWatts shall represent the
 					// average power level that occurred averaged over the last IntervalInMin
 					// minutes.
 					"average_consumed_watts": pc.PowerMetrics.AverageConsumedWatts,
 					// MinConsumedWatts shall represent the
 					// minimum power level in watts that occurred within the last
 					// IntervalInMin minutes.
 					"min_consumed_watts": pc.PowerMetrics.MinConsumedWatts,
 					// MaxConsumedWatts shall represent the
 					// maximum power level in watts that occurred within the last
 					// IntervalInMin minutes
 					"max_consumed_watts": pc.PowerMetrics.MaxConsumedWatts,
 				}
 				// The averaging interval is sent as meta data with each metric
 				intervalInMin := strconv.FormatFloat(float64(pc.PowerMetrics.IntervalInMin), 'f', -1, 32)
 				// Metrics to exclude
 				for _, key := range clientConfig.ExcludeMetrics {
 					delete(metrics, key)
 				}
 				// Set tags
 				tags := map[string]string{
 					"hostname": *clientConfig.Hostname,
 					"type":     "node",
 					// ID uniquely identifies the resource
 					"id": pc.ID,
 					// MemberID shall uniquely identify the member within the collection. For
 					// services supporting Redfish v1.6 or higher, this value shall be the
 					// zero-based array index.
 					"member_id": pc.MemberID,
 					// PhysicalContext shall be a description of the affected device(s) or region
 					// within the chassis to which this power control applies.
 					"physical_context": string(pc.PhysicalContext),
 					// Name
 					"power_control_name": pc.Name,
 				}
 				// Delete empty tags
 				for key, value := range tags {
 					if value == "" {
 						delete(tags, key)
 					}
 				}
 				// Set meta data tags
 				meta := map[string]string{
 					"source":              r.name,
 					"group":               "Energy",
 					"interval_in_minutes": intervalInMin,
 					"unit":                "watts",
 				}
 				// Delete empty meta data tags
 				for key, value := range meta {
 					if value == "" {
 						delete(meta, key)
 					}
 				}
 				// Create one metric per remaining entry and send it to the sink.
 				// Metrics that could not be created are dropped without a message
 				for name, value := range metrics {
 					y, err := lp.New(name, tags, meta,
 						map[string]interface{}{
 							"value": value,
 						},
 						timestamp)
 					if err == nil {
 						r.sink <- y
 					}
 				}
 			}
 		}
 		return nil
 	}
 	// doReadPowerMetric reads power metrics for all configured redfish services.
 	// To compensate latencies of the Redfish services a fanout is used.
 	doReadPowerMetric := func() {
 		// Compute fanout to use: never start more workers than client configs
 		realFanout := r.config.Fanout
 		if len(r.config.ClientConfigs) < realFanout {
 			realFanout = len(r.config.ClientConfigs)
 		}
 		// Create wait group and input channel for workers
 		var workerWaitGroup sync.WaitGroup
 		workerInput := make(chan int, realFanout)
 		// Create worker go routines
 		for i := 0; i < realFanout; i++ {
 			// Increment worker wait group counter
 			workerWaitGroup.Add(1)
 			go func() {
 				// Decrement worker wait group counter
 				defer workerWaitGroup.Done()
 				// Read power metrics for each client config.
 				// The loop ends when workerInput is closed and empty
 				for clientConfigIndex := range workerInput {
 					err := readPowerMetric(clientConfigIndex)
 					if err != nil {
 						cclog.ComponentError(r.name, err)
 					}
 				}
 			}()
 		}
 		// Distribute client configs to workers
 		for i := range r.config.ClientConfigs {
 			// Check done channel status
 			select {
 			case workerInput <- i:
 			case <-r.done:
 				// process done event
 				// Stop workers, clear channel and wait for all workers to finish
 				close(workerInput)
 				for range workerInput {
 				}
 				workerWaitGroup.Wait()
 				return
 			}
 		}
 		// Stop workers and wait for all workers to finish
 		close(workerInput)
 		workerWaitGroup.Wait()
 	}
 	// Start redfish receiver
 	r.wg.Add(1)
 	go func() {
 		defer r.wg.Done()
 		// Create ticker
 		ticker := time.NewTicker(time.Duration(r.config.Interval) * time.Second)
 		defer ticker.Stop()
 		for {
 			// Read first, then wait: the first read happens right at start
 			doReadPowerMetric()
 			select {
 			case <-ticker.C:
 				// process ticker event -> continue
 				continue
 			case <-r.done:
 				// process done event
 				return
 			}
 		}
 	}()
 	cclog.ComponentDebug(r.name, "STARTED")
 }
 // Close stops the redfish receiver.
 // It closes the r.done channel, which is the stop signal checked by the
 // goroutine launched in Start(), and then blocks until r.wg reports that
 // this goroutine has finished.
 func (r *RedfishReceiver) Close() {
 	cclog.ComponentDebug(r.name, "CLOSE")
 	// Send the signal and wait
 	close(r.done)
 	r.wg.Wait()
 	cclog.ComponentDebug(r.name, "DONE")
 }
 // NewRedfishReceiver creates a new instance of the redfish receiver.
 // The receiver is initialized by giving it a name and reading in the config JSON.
 //
 // Defaults (fanout 64, interval 30 seconds) are set first and may be
 // overwritten by the config JSON. An error is returned if
 //   - the config JSON cannot be parsed,
 //   - fanout or interval are not greater than zero,
 //   - a client config lacks hostname, endpoint, username or password.
 func NewRedfishReceiver(name string, config json.RawMessage) (Receiver, error) {
 	r := new(RedfishReceiver)
 	// Set name
 	r.name = fmt.Sprintf("RedfishReceiver(%s)", name)
 	// Create done channel
 	r.done = make(chan bool)
 	// Set defaults in r.config
 	// Allow overwriting these defaults by reading config JSON
 	r.config.Fanout = 64
 	r.config.Interval = 30
 	// Read the redfish receiver specific JSON config
 	if len(config) > 0 {
 		err := json.Unmarshal(config, &r.config)
 		if err != nil {
 			cclog.ComponentError(r.name, "Error reading config:", err.Error())
 			return nil, err
 		}
 	}
 	// Reject values that would break Start(): an explicit zero or negative
 	// value in the config JSON overwrites the defaults set above.
 	// A fanout of zero starts no workers (negative: creating the worker
 	// channel panics), a non-positive interval makes time.NewTicker panic.
 	if r.config.Fanout <= 0 {
 		err := fmt.Errorf("fanout must be greater than zero, got %v", r.config.Fanout)
 		cclog.ComponentError(r.name, err)
 		return nil, err
 	}
 	if r.config.Interval <= 0 {
 		err := fmt.Errorf("interval must be greater than zero, got %v", r.config.Interval)
 		cclog.ComponentError(r.name, err)
 		return nil, err
 	}
 	// Create gofish client config
 	for i := range r.config.ClientConfigs {
 		clientConfig := &r.config.ClientConfigs[i]
 		gofishConfig := &clientConfig.gofish
 		// Hostname is only checked here, it is used as metric tag in Start()
 		if clientConfig.Hostname == nil {
 			err := fmt.Errorf("client config number %v requires hostname", i)
 			cclog.ComponentError(r.name, err)
 			return nil, err
 		}
 		if clientConfig.Endpoint == nil {
 			err := fmt.Errorf("client config number %v requires endpoint", i)
 			cclog.ComponentError(r.name, err)
 			return nil, err
 		}
 		gofishConfig.Endpoint = *clientConfig.Endpoint
 		if clientConfig.Username == nil {
 			err := fmt.Errorf("client config number %v requires username", i)
 			cclog.ComponentError(r.name, err)
 			return nil, err
 		}
 		gofishConfig.Username = *clientConfig.Username
 		if clientConfig.Password == nil {
 			err := fmt.Errorf("client config number %v requires password", i)
 			cclog.ComponentError(r.name, err)
 			return nil, err
 		}
 		gofishConfig.Password = *clientConfig.Password
 		// Insecure defaults to true unless the client config sets it explicitly
 		gofishConfig.Insecure = true
 		if clientConfig.Insecure != nil {
 			gofishConfig.Insecure = *clientConfig.Insecure
 		}
 	}
 	return r, nil
 }
--- a/receivers/sampleReceiver.go
+++ b/receivers/sampleReceiver.go
@@ -36,26 +36,16 @@ func (r *SampleReceiver) Start() {
 	// or use own go routine but always make sure it exits
 	// as soon as it gets the signal of the r.done channel
 	//
 	// r.done = make(chan bool)
 	// r.wg.Add(1)
 	// go func() {
-	//      defer r.wg.Done()
+	// 	for {
-	//
+	// 		select {
-	//      // Create ticker
+	// 		case <-r.done:
-	//      ticker := time.NewTicker(30 * time.Second)
+	// 			r.wg.Done()
-	//      defer ticker.Stop()
+	// 			return
-	//
+	// 		}
-	//      for {
+	// 	}
-	//          readMetric()
+	// 	r.wg.Done()
 	//          select {
 	//          case <-ticker.C:
 	//              // process ticker event -> continue
 	//              continue
 	//          case <-r.done:
 	//              return
 	//          }
 	//      }
 	// }()
 }
--- a/sinks.json
+++ b/sinks.json
@@ -1,8 +1,6 @@
 {
-  "mystdout": {
+  "mystdout" : {
-    "type": "stdout",
+    "type" : "stdout",
-    "meta_as_tags": [
+    "meta_as_tags" : true
      "unit"
    ]
  }
-}
+}
--- a/sinks/httpSink.go
+++ b/sinks/httpSink.go
@@ -22,7 +22,6 @@ type HttpSinkConfig struct {
 	MaxIdleConns    int    `json:"max_idle_connections,omitempty"`
 	IdleConnTimeout string `json:"idle_connection_timeout,omitempty"`
 	FlushDelay      string `json:"flush_delay,omitempty"`
 	BatchSize       int    `json:"batch_size,omitempty"`
 }
 type HttpSink struct {
@@ -37,20 +36,19 @@ type HttpSink struct {
 	idleConnTimeout time.Duration
 	timeout         time.Duration
 	flushDelay      time.Duration
 	batchSize       int
 }
 func (s *HttpSink) Write(m lp.CCMetric) error {
 	if s.buffer.Len() == 0 && s.flushDelay != 0 {
 		// This is the first write since the last flush, start the flushTimer!
 		if s.flushTimer != nil && s.flushTimer.Stop() {
-			cclog.ComponentDebug(s.name, "unexpected: the flushTimer was already running?")
+			cclog.ComponentDebug("HttpSink", "unexpected: the flushTimer was already running?")
 		}
 		// Run a batched flush for all lines that have arrived in the last second
 		s.flushTimer = time.AfterFunc(s.flushDelay, func() {
 			if err := s.Flush(); err != nil {
-				cclog.ComponentError(s.name, "flush failed:", err.Error())
+				cclog.ComponentError("HttpSink", "flush failed:", err.Error())
 			}
 		})
 	}
@@ -59,11 +57,9 @@ func (s *HttpSink) Write(m lp.CCMetric) error {
 	s.lock.Lock()
 	_, err := s.encoder.Encode(p)
 	s.batchSize++
 	s.lock.Unlock() // defer does not work here as Flush() takes the lock as well
 	if err != nil {
 		cclog.ComponentError(s.name, "encoding failed:", err.Error())
 		return err
 	}
@@ -71,9 +67,6 @@ func (s *HttpSink) Write(m lp.CCMetric) error {
 	if s.flushDelay == 0 {
 		return s.Flush()
 	}
 	if s.batchSize == s.config.BatchSize {
 		return s.Flush()
 	}
 	return err
 }
@@ -91,7 +84,6 @@ func (s *HttpSink) Flush() error {
 	// Create new request to send buffer
 	req, err := http.NewRequest(http.MethodPost, s.config.URL, s.buffer)
 	if err != nil {
 		cclog.ComponentError(s.name, "failed to create request:", err.Error())
 		return err
 	}
@@ -105,19 +97,15 @@ func (s *HttpSink) Flush() error {
 	// Clear buffer
 	s.buffer.Reset()
 	s.batchSize = 0
 	// Handle transport/tcp errors
 	if err != nil {
 		cclog.ComponentError(s.name, "transport/tcp error:", err.Error())
 		return err
 	}
 	// Handle application errors
 	if res.StatusCode != http.StatusOK {
-		err = errors.New(res.Status)
+		return errors.New(res.Status)
 		cclog.ComponentError(s.name, "application error:", err.Error())
 		return err
 	}
 	return nil
@@ -126,7 +114,7 @@ func (s *HttpSink) Flush() error {
 func (s *HttpSink) Close() {
 	s.flushTimer.Stop()
 	if err := s.Flush(); err != nil {
-		cclog.ComponentError(s.name, "flush failed:", err.Error())
+		cclog.ComponentError("HttpSink", "flush failed:", err.Error())
 	}
 	s.client.CloseIdleConnections()
 }
@@ -139,7 +127,6 @@ func NewHttpSink(name string, config json.RawMessage) (Sink, error) {
 	s.config.IdleConnTimeout = "5s"
 	s.config.Timeout = "5s"
 	s.config.FlushDelay = "1s"
 	s.config.BatchSize = 100
 	// Read config
 	if len(config) > 0 {
--- a/sinks/httpSink.md
+++ b/sinks/httpSink.md
@@ -15,7 +15,6 @@ The `http` sink uses POST requests to a HTTP server to submit the metrics in the
    "max_idle_connections" : 10,
    "idle_connection_timeout" : "5s",
    "flush_delay": "2s",
    "batch_size" : 100
  }
 }
 ```
@@ -28,4 +27,3 @@ The `http` sink uses POST requests to a HTTP server to submit the metrics in the
 - `max_idle_connections`: Maximum number of idle connections (default 10)
 - `idle_connection_timeout`: Timeout for idle connections (default '5s')
 - `flush_delay`: Batch all writes arriving during this duration (default '1s', batching can be disabled by setting it to 0)
 - `batch_size`: Maximum number of batched metrics. The batch is flushed as soon as either the batch size or the `flush_delay` is reached
--- a/sinks/influxAsyncSink.go
+++ b/sinks/influxAsyncSink.go
@@ -6,14 +6,12 @@ import (
 	"encoding/json"
 	"errors"
 	"fmt"
 	"strings"
 	"time"
 	cclog "github.com/ClusterCockpit/cc-metric-collector/internal/ccLogger"
 	lp "github.com/ClusterCockpit/cc-metric-collector/internal/ccMetric"
 	influxdb2 "github.com/influxdata/influxdb-client-go/v2"
 	influxdb2Api "github.com/influxdata/influxdb-client-go/v2/api"
 	influxdb2ApiHttp "github.com/influxdata/influxdb-client-go/v2/api/http"
 )
 type InfluxAsyncSinkConfig struct {
@@ -30,12 +28,10 @@ type InfluxAsyncSinkConfig struct {
 	BatchSize uint `json:"batch_size,omitempty"`
 	// Interval, in ms, after which the buffer is flushed if it has not already been written (by reaching the batch size). Default 1000ms
 	FlushInterval         uint   `json:"flush_interval,omitempty"`
-	InfluxRetryInterval   string `json:"retry_interval,omitempty"`
+	InfluxRetryInterval   string `json:"retry_interval"`
-	InfluxExponentialBase uint   `json:"retry_exponential_base,omitempty"`
+	InfluxExponentialBase uint   `json:"retry_exponential_base"`
-	InfluxMaxRetries      uint   `json:"max_retries,omitempty"`
+	InfluxMaxRetries      uint   `json:"max_retries"`
-	InfluxMaxRetryTime    string `json:"max_retry_time,omitempty"`
+	InfluxMaxRetryTime    string `json:"max_retry_time"`
 	CustomFlushInterval   string `json:"custom_flush_interval,omitempty"`
 	MaxRetryAttempts      uint   `json:"max_retry_attempts,omitempty"`
 }
 type InfluxAsyncSink struct {
@@ -46,8 +42,6 @@ type InfluxAsyncSink struct {
 	config              InfluxAsyncSinkConfig
 	influxRetryInterval uint
 	influxMaxRetryTime  uint
 	customFlushInterval time.Duration
 	flushTimer          *time.Timer
 }
 func (s *InfluxAsyncSink) connect() error {
@@ -66,34 +60,20 @@ func (s *InfluxAsyncSink) connect() error {
 	cclog.ComponentDebug(s.name, "Using URI", uri, "Org", s.config.Organization, "Bucket", s.config.Database)
 	clientOptions := influxdb2.DefaultOptions()
 	if s.config.BatchSize != 0 {
 		cclog.ComponentDebug(s.name, "Batch size", s.config.BatchSize)
 		clientOptions.SetBatchSize(s.config.BatchSize)
 	}
 	if s.config.FlushInterval != 0 {
 		cclog.ComponentDebug(s.name, "Flush interval", s.config.FlushInterval)
 		clientOptions.SetFlushInterval(s.config.FlushInterval)
 	}
 	if s.influxRetryInterval != 0 {
 		cclog.ComponentDebug(s.name, "MaxRetryInterval", s.influxRetryInterval)
 		clientOptions.SetMaxRetryInterval(s.influxRetryInterval)
 	}
 	if s.influxMaxRetryTime != 0 {
 		cclog.ComponentDebug(s.name, "MaxRetryTime", s.influxMaxRetryTime)
 		clientOptions.SetMaxRetryTime(s.influxMaxRetryTime)
 	}
 	if s.config.InfluxExponentialBase != 0 {
 		cclog.ComponentDebug(s.name, "Exponential Base", s.config.InfluxExponentialBase)
 		clientOptions.SetExponentialBase(s.config.InfluxExponentialBase)
 	}
 	if s.config.InfluxMaxRetries != 0 {
 		cclog.ComponentDebug(s.name, "Max Retries", s.config.InfluxMaxRetries)
 		clientOptions.SetMaxRetries(s.config.InfluxMaxRetries)
 	}
 	clientOptions.SetTLSConfig(
 		&tls.Config{
 			InsecureSkipVerify: true,
 		},
-	).SetPrecision(time.Second)
+	)
 	clientOptions.SetMaxRetryInterval(s.influxRetryInterval)
 	clientOptions.SetMaxRetryTime(s.influxMaxRetryTime)
 	clientOptions.SetExponentialBase(s.config.InfluxExponentialBase)
 	clientOptions.SetMaxRetries(s.config.InfluxMaxRetries)
 	s.client = influxdb2.NewClientWithOptions(uri, auth, clientOptions)
 	s.writeApi = s.client.WriteAPI(s.config.Organization, s.config.Database)
@@ -104,23 +84,10 @@ func (s *InfluxAsyncSink) connect() error {
 	if !ok {
 		return fmt.Errorf("connection to %s not healthy", uri)
 	}
 	s.writeApi.SetWriteFailedCallback(func(batch string, err influxdb2ApiHttp.Error, retryAttempts uint) bool {
 		mlist := strings.Split(batch, "\n")
 		cclog.ComponentError(s.name, fmt.Sprintf("Failed to write batch with %d metrics %d times (max: %d): %s", len(mlist), retryAttempts, s.config.MaxRetryAttempts, err.Error()))
 		return retryAttempts <= s.config.MaxRetryAttempts
 	})
 	return nil
 }
 func (s *InfluxAsyncSink) Write(m lp.CCMetric) error {
 	if s.customFlushInterval != 0 && s.flushTimer == nil {
 		// Run a batched flush for all lines that have arrived in the defined interval
 		s.flushTimer = time.AfterFunc(s.customFlushInterval, func() {
 			if err := s.Flush(); err != nil {
 				cclog.ComponentError(s.name, "flush failed:", err.Error())
 			}
 		})
 	}
 	s.writeApi.WritePoint(
 		m.ToPoint(s.meta_as_tags),
 	)
@@ -128,11 +95,7 @@ func (s *InfluxAsyncSink) Write(m lp.CCMetric) error {
 }
 // Flush asks the write API to send all buffered points and forgets the
 // custom flush timer, if one is in use. It always returns nil.
 func (s *InfluxAsyncSink) Flush() error {
 	cclog.ComponentDebug(s.name, "Flushing")
 	s.writeApi.Flush()
 	// Without custom flush interval or without timer there is nothing to reset
 	if s.customFlushInterval == 0 || s.flushTimer == nil {
 		return nil
 	}
 	// Drop the timer reference
 	s.flushTimer = nil
 	return nil
 }
@@ -147,17 +110,13 @@ func NewInfluxAsyncSink(name string, config json.RawMessage) (Sink, error) {
 	s.name = fmt.Sprintf("InfluxSink(%s)", name)
 	// Set default for maximum number of points sent to server in single request.
-	s.config.BatchSize = 0
+	s.config.BatchSize = 100
-	s.influxRetryInterval = 0
+	s.influxRetryInterval = uint(time.Duration(1) * time.Second)
-	//s.config.InfluxRetryInterval = "1s"
+	s.config.InfluxRetryInterval = "1s"
-	s.influxMaxRetryTime = 0
+	s.influxMaxRetryTime = uint(7 * time.Duration(24) * time.Hour)
-	//s.config.InfluxMaxRetryTime = "168h"
+	s.config.InfluxMaxRetryTime = "168h"
-	s.config.InfluxMaxRetries = 0
+	s.config.InfluxMaxRetries = 20
-	s.config.InfluxExponentialBase = 0
+	s.config.InfluxExponentialBase = 2
 	s.config.FlushInterval = 0
 	s.config.CustomFlushInterval = ""
 	s.customFlushInterval = time.Duration(0)
 	s.config.MaxRetryAttempts = 1
 	// Default retry intervals (in seconds)
 	// 1 2
@@ -209,15 +168,6 @@ func NewInfluxAsyncSink(name string, config json.RawMessage) (Sink, error) {
 	s.influxRetryInterval = toUint(s.config.InfluxRetryInterval, s.influxRetryInterval)
 	s.influxMaxRetryTime = toUint(s.config.InfluxMaxRetryTime, s.influxMaxRetryTime)
 	// Use a own timer for calling Flush()
 	if len(s.config.CustomFlushInterval) > 0 {
 		t, err := time.ParseDuration(s.config.CustomFlushInterval)
 		if err != nil {
 			return nil, fmt.Errorf("invalid duration in 'custom_flush_interval': %v", err)
 		}
 		s.customFlushInterval = t
 	}
 	// Connect to InfluxDB server
 	if err := s.connect(); err != nil {
 		return nil, fmt.Errorf("unable to connect: %v", err)
--- a/sinks/influxSink.go
+++ b/sinks/influxSink.go
@@ -6,32 +6,28 @@ import (
 	"encoding/json"
 	"errors"
 	"fmt"
 	"sync"
 	"time"
 	cclog "github.com/ClusterCockpit/cc-metric-collector/internal/ccLogger"
 	lp "github.com/ClusterCockpit/cc-metric-collector/internal/ccMetric"
 	influxdb2 "github.com/influxdata/influxdb-client-go/v2"
 	influxdb2Api "github.com/influxdata/influxdb-client-go/v2/api"
 	"github.com/influxdata/influxdb-client-go/v2/api/write"
 )
 type InfluxSinkConfig struct {
 	defaultSinkConfig
-	Host         string `json:"host,omitempty"`
+	Host                  string `json:"host,omitempty"`
-	Port         string `json:"port,omitempty"`
+	Port                  string `json:"port,omitempty"`
-	Database     string `json:"database,omitempty"`
+	Database              string `json:"database,omitempty"`
-	User         string `json:"user,omitempty"`
+	User                  string `json:"user,omitempty"`
-	Password     string `json:"password,omitempty"`
+	Password              string `json:"password,omitempty"`
-	Organization string `json:"organization,omitempty"`
+	Organization          string `json:"organization,omitempty"`
-	SSL          bool   `json:"ssl,omitempty"`
+	SSL                   bool   `json:"ssl,omitempty"`
-	FlushDelay   string `json:"flush_delay,omitempty"`
+	RetentionPol          string `json:"retention_policy,omitempty"`
-	BatchSize    int    `json:"batch_size,omitempty"`
+	InfluxRetryInterval   string `json:"retry_interval"`
-	RetentionPol string `json:"retention_policy,omitempty"`
+	InfluxExponentialBase uint   `json:"retry_exponential_base"`
-	// InfluxRetryInterval   string `json:"retry_interval"`
+	InfluxMaxRetries      uint   `json:"max_retries"`
-	// InfluxExponentialBase uint   `json:"retry_exponential_base"`
+	InfluxMaxRetryTime    string `json:"max_retry_time"`
 	// InfluxMaxRetries      uint   `json:"max_retries"`
 	// InfluxMaxRetryTime    string `json:"max_retry_time"`
 	//InfluxMaxRetryDelay  string `json:"max_retry_delay"` // It is mentioned in the docs but there is no way to set it
 }
@@ -42,71 +38,37 @@ type InfluxSink struct {
 	config              InfluxSinkConfig
 	influxRetryInterval uint
 	influxMaxRetryTime  uint
 	batch               []*write.Point
 	flushTimer          *time.Timer
 	flushDelay          time.Duration
 	lock                sync.Mutex // Flush() runs in another goroutine, so this lock has to protect the buffer
 	//influxMaxRetryDelay uint
 }
 // connect connects to the InfluxDB server
 func (s *InfluxSink) connect() error {
-
+	var auth string
 	// URI options:
 	// * http://host:port
 	// * https://host:port
 	var uri string
 	if s.config.SSL {
 		uri = fmt.Sprintf("https://%s:%s", s.config.Host, s.config.Port)
 	} else {
 		uri = fmt.Sprintf("http://%s:%s", s.config.Host, s.config.Port)
 	}
 	// Authentication options:
 	// * token
 	// * username:password
 	var auth string
 	if len(s.config.User) == 0 {
 		auth = s.config.Password
 	} else {
 		auth = fmt.Sprintf("%s:%s", s.config.User, s.config.Password)
 	}
 	cclog.ComponentDebug(s.name, "Using URI", uri, "Org", s.config.Organization, "Bucket", s.config.Database)
 	// Set influxDB client options
 	clientOptions := influxdb2.DefaultOptions()
 	// if s.influxRetryInterval != 0 {
 	// 	cclog.ComponentDebug(s.name, "MaxRetryInterval", s.influxRetryInterval)
 	// 	clientOptions.SetMaxRetryInterval(s.influxRetryInterval)
 	// }
 	// if s.influxMaxRetryTime != 0 {
 	// 	cclog.ComponentDebug(s.name, "MaxRetryTime", s.influxMaxRetryTime)
 	// 	clientOptions.SetMaxRetryTime(s.influxMaxRetryTime)
 	// }
 	// if s.config.InfluxExponentialBase != 0 {
 	// 	cclog.ComponentDebug(s.name, "Exponential Base", s.config.InfluxExponentialBase)
 	// 	clientOptions.SetExponentialBase(s.config.InfluxExponentialBase)
 	// }
 	// if s.config.InfluxMaxRetries != 0 {
 	// 	cclog.ComponentDebug(s.name, "Max Retries", s.config.InfluxMaxRetries)
 	// 	clientOptions.SetMaxRetries(s.config.InfluxMaxRetries)
 	// }
 	// Do not check InfluxDB certificate
 	clientOptions.SetTLSConfig(
 		&tls.Config{
 			InsecureSkipVerify: true,
 		},
 	)
-	clientOptions.SetPrecision(time.Second)
+	clientOptions.SetMaxRetryInterval(s.influxRetryInterval)
 	clientOptions.SetMaxRetryTime(s.influxMaxRetryTime)
 	clientOptions.SetExponentialBase(s.config.InfluxExponentialBase)
 	clientOptions.SetMaxRetries(s.config.InfluxMaxRetries)
 	// Create new writeAPI
 	s.client = influxdb2.NewClientWithOptions(uri, auth, clientOptions)
 	s.writeApi = s.client.WriteAPIBlocking(s.config.Organization, s.config.Database)
 	// Check InfluxDB server accessibility
 	ok, err := s.client.Ping(context.Background())
 	if err != nil {
 		return err
@@ -118,142 +80,61 @@ func (s *InfluxSink) connect() error {
 }
 func (s *InfluxSink) Write(m lp.CCMetric) error {
-
+	err :=
-	if len(s.batch) == 0 && s.flushDelay != 0 {
+		s.writeApi.WritePoint(
-		// This is the first write since the last flush, start the flushTimer!
+			context.Background(),
-		if s.flushTimer != nil && s.flushTimer.Stop() {
+			m.ToPoint(s.meta_as_tags),
-			cclog.ComponentDebug(s.name, "unexpected: the flushTimer was already running?")
+		)
-		}
+	return err
 		// Run a batched flush for all lines that have arrived in the last flush delay interval
 		s.flushTimer = time.AfterFunc(s.flushDelay, func() {
 			if err := s.Flush(); err != nil {
 				cclog.ComponentError(s.name, "flush failed:", err.Error())
 			}
 		})
 	}
 	// Append metric to batch slice
 	p := m.ToPoint(s.meta_as_tags)
 	s.lock.Lock()
 	s.batch = append(s.batch, p)
 	s.lock.Unlock()
 	// Flush synchronously if "flush_delay" is zero
 	if s.flushDelay == 0 {
 		return s.Flush()
 	}
 	// Flush if batch size is reached
 	if len(s.batch) == s.config.BatchSize {
 		return s.Flush()
 	}
 	return nil
 }
 // Flush writes every metric currently held in the batch slice to the
 // InfluxDB server. On success the batch slice is emptied (its capacity is
 // kept), on failure the error is logged and returned and the batch is left
 // untouched.
 func (s *InfluxSink) Flush() error {
 	// The batch slice is shared with Write(), guard it for the whole flush
 	s.lock.Lock()
 	defer s.lock.Unlock()
 	if len(s.batch) > 0 {
 		// Send all buffered metrics in a single call
 		if err := s.writeApi.WritePoint(context.Background(), s.batch...); err != nil {
 			cclog.ComponentError(s.name, "flush failed:", err.Error())
 			return err
 		}
 		// Release the points and reset the length of the batch slice
 		for idx := range s.batch {
 			s.batch[idx] = nil
 		}
 		s.batch = s.batch[:0]
 	}
 	return nil
 }
 // Close stops a pending flush timer, sends the metrics still held in the
 // batch slice and closes the InfluxDB client.
 func (s *InfluxSink) Close() {
 	cclog.ComponentDebug(s.name, "Closing InfluxDB connection")
 	// The flush timer is nil until Write() has started one; calling Stop()
 	// on a nil *time.Timer would panic
 	if s.flushTimer != nil {
 		s.flushTimer.Stop()
 	}
 	// Flush() logs a failed write itself, so its error is not handled again
 	s.Flush()
 	s.client.Close()
 }
 // NewInfluxSink create a new InfluxDB sink
 func NewInfluxSink(name string, config json.RawMessage) (Sink, error) {
 	s := new(InfluxSink)
 	s.name = fmt.Sprintf("InfluxSink(%s)", name)
 	// Set config default values
 	s.config.BatchSize = 100
 	s.config.FlushDelay = "1s"
 	// Read config
 	if len(config) > 0 {
 		err := json.Unmarshal(config, &s.config)
 		if err != nil {
 			return nil, err
 		}
 	}
-	s.influxRetryInterval = 0
+	s.influxRetryInterval = uint(time.Duration(1) * time.Second)
-	s.influxMaxRetryTime = 0
+	s.config.InfluxRetryInterval = "1s"
-	// s.config.InfluxRetryInterval = ""
+	s.influxMaxRetryTime = uint(7 * time.Duration(24) * time.Hour)
-	// s.config.InfluxMaxRetryTime = ""
+	s.config.InfluxMaxRetryTime = "168h"
-	// s.config.InfluxMaxRetries = 0
+	s.config.InfluxMaxRetries = 20
-	// s.config.InfluxExponentialBase = 0
+	s.config.InfluxExponentialBase = 2
-	if len(s.config.Host) == 0 {
+	if len(s.config.Host) == 0 ||
-		return nil, errors.New("Missing host configuration required by InfluxSink")
+		len(s.config.Port) == 0 ||
 		len(s.config.Database) == 0 ||
 		len(s.config.Organization) == 0 ||
 		len(s.config.Password) == 0 {
 		return nil, errors.New("not all configuration variables set required by InfluxSink")
 	}
 	if len(s.config.Port) == 0 {
 		return nil, errors.New("Missing port configuration required by InfluxSink")
 	}
 	if len(s.config.Database) == 0 {
 		return nil, errors.New("Missing database configuration required by InfluxSink")
 	}
 	if len(s.config.Organization) == 0 {
 		return nil, errors.New("Missing organization configuration required by InfluxSink")
 	}
 	if len(s.config.Password) == 0 {
 		return nil, errors.New("Missing password configuration required by InfluxSink")
 	}
 	// Create lookup map to use meta infos as tags in the output metric
 	s.meta_as_tags = make(map[string]bool)
 	for _, k := range s.config.MetaAsTags {
 		s.meta_as_tags[k] = true
 	}
-	// toUint := func(duration string, def uint) uint {
+	toUint := func(duration string, def uint) uint {
-	// 	if len(duration) > 0 {
+		t, err := time.ParseDuration(duration)
 	// 		t, err := time.ParseDuration(duration)
 	// 		if err == nil {
 	// 			return uint(t.Milliseconds())
 	// 		}
 	// 	}
 	// 	return def
 	// }
 	// s.influxRetryInterval = toUint(s.config.InfluxRetryInterval, s.influxRetryInterval)
 	// s.influxMaxRetryTime = toUint(s.config.InfluxMaxRetryTime, s.influxMaxRetryTime)
 	// Configure flush delay duration
 	if len(s.config.FlushDelay) > 0 {
 		t, err := time.ParseDuration(s.config.FlushDelay)
 		if err == nil {
-			s.flushDelay = t
+			return uint(t.Milliseconds())
 		}
 		return def
 	}
-
+	s.influxRetryInterval = toUint(s.config.InfluxRetryInterval, s.influxRetryInterval)
-	// allocate batch slice
+	s.influxMaxRetryTime = toUint(s.config.InfluxMaxRetryTime, s.influxMaxRetryTime)
 	s.batch = make([]*write.Point, 0, s.config.BatchSize)
 	// Connect to InfluxDB server
 	if err := s.connect(); err != nil {
--- a/sinks/influxSink.md
+++ b/sinks/influxSink.md
@@ -17,8 +17,10 @@ The `influxdb` sink uses the official [InfluxDB golang client](https://pkg.go.de
    "password" : "examplepw",
    "organization": "myorg",
    "ssl": true,
-    "flush_delay" : "1s",
+    "retry_interval" : "1s",
-    "batch_size" : 100
+    "retry_exponential_base" : 2,
    "max_retries": 20,
    "max_retry_time" : "168h"
  }
 }
 ```
@@ -32,6 +34,9 @@ The `influxdb` sink uses the official [InfluxDB golang client](https://pkg.go.de
 - `password`: Password for basic authentication
 - `organization`: Organization in the InfluxDB
 - `ssl`: Use SSL connection
- `flush_delay`: Group metrics coming in to a single batch
+- `retry_interval`: Base retry interval for failed write requests, default 1s
- `batch_size`: Maximal batch size
+- `retry_exponential_base`: The retry interval is exponentially increased with this base, default 2
 - `max_retries`: Maximal number of retry attempts
 - `max_retry_time`: Maximal time to retry failed writes, default 168h (one week)
 For information about the calculation of the retry interval settings, see [official influxdb-client-go documentation](https://github.com/influxdata/influxdb-client-go#handling-of-failed-async-writes)
Author	SHA1	Message	Date
Thomas Röhl	4763733d8d	Merge branch 'develop' into main	2022-03-31 11:57:19 +02:00
Thomas Röhl	16e898ecca	Merge branch 'develop' into main	2022-03-31 11:47:02 +02:00
Thomas Roehl	4851382ad7	Merge branch 'develop' into main	2022-03-16 19:08:13 +01:00
Thomas Gruber	3f76947f54	Merge latest developments into main (#67 ) * Update configuration.md Add an additional receiver to have better alignment of components * Change default GpfsCollector command to `mmpmon` (#53) * Set default cmd to 'mmpmon' * Reuse looked up path * Cast const to string * Just download LIKWID to get the headers (#54) * Just download LIKWID to get the headers * Remove perl-Data-Dumper from BuildRequires, only required by LIKWID build * Add HttpReceiver as counterpart to the HttpSink (#49) * Use GBytes as unit for large memory numbers * Make maxForward configurable, save old name in meta in rename metrics and make the hostname tag key configurable * Single release action (#55) Building all RPMs and releasing in a single workflow * Makefile target to build binary-only Debian packages (#61) * Add 'install' and 'DEB' make targets to build binary-only Debian packages * Add control file for DEB builds * Use a single line for bash loop in make clean * Add config options for retry intervals of InfluxDB clients (#59) * Refactoring of LikwidCollector and metric units (#62) * Reduce complexity of LikwidCollector and allow metric units * Add unit to LikwidCollector docu and fix some typos * Make library path configurable * Use old metric name in Ganglia if rename has happened in the router (#60) * Use old metric name if rename has happened in the router * Also check for Ganglia renames for the oldname * Derived metrics (#57) * Add time-based derivatived (e.g. 
bandwidth) to some collectors * Add documentation * Add comments * Fix: Only compute rates with a valid previous state * Only compute rates with a valid previous state * Define const values for net/dev fields * Set default config values * Add comments * Refactor: Consolidate data structures * Refactor: Consolidate data structures * Refactor: Avoid struct deep copy * Refactor: Avoid redundant tag maps * Refactor: Use int64 type for absolut values Co-authored-by: Holger Obermaier <40787752+ho-ob@users.noreply.github.com> * Simplified iota usage * Move unit tag to meta data tags * Derived metrics (#65) * Add time-based derivatived (e.g. bandwidth) to some collectors * Add documentation * Add comments * Fix: Only compute rates with a valid previous state * Only compute rates with a valid previous state * Define const values for net/dev fields * Set default config values * Add comments * Refactor: Consolidate data structures * Refactor: Consolidate data structures * Refactor: Avoid struct deep copy * Refactor: Avoid redundant tag maps * Refactor: Use int64 type for absolut values * Update LustreCollector Co-authored-by: Holger Obermaier <40787752+ho-ob@users.noreply.github.com> * Meta to tags list and map for sinks (#63) * Change ccMetric->Influx functions * Use a meta_as_tags string list in config but create a lookup map afterwards * Add meta as tag logic to sampleSink * Fix staticcheck warnings (#66) Co-authored-by: Holger Obermaier <40787752+ho-ob@users.noreply.github.com>	2022-03-15 16:41:11 +01:00
Thomas Roehl	3157386b3e	Merge branch 'develop' into main	2022-03-04 23:34:28 +01:00
Thomas Roehl	ff08eaeb43	Set proper user for files	2022-03-04 11:49:55 +01:00
Thomas Roehl	64c41be34c	Fix name for ClusterCockpit	2022-03-04 11:37:45 +01:00
Thomas Roehl	f4af520b2a	Fix error print in LustreCollector	2022-03-04 11:32:39 +01:00