Compare commits

..

8 Commits

Author SHA1 Message Date
Thomas Röhl
4763733d8d Merge branch 'develop' into main 2022-03-31 11:57:19 +02:00
Thomas Röhl
16e898ecca Merge branch 'develop' into main 2022-03-31 11:47:02 +02:00
Thomas Roehl
4851382ad7 Merge branch 'develop' into main 2022-03-16 19:08:13 +01:00
Thomas Gruber
3f76947f54 Merge latest developments into main (#67)
* Update configuration.md

Add an additional receiver to have better alignment of components

* Change default GpfsCollector command to `mmpmon` (#53)

* Set default cmd to 'mmpmon'

* Reuse looked up path

* Cast const to string

* Just download LIKWID to get the headers (#54)

* Just download LIKWID to get the headers

* Remove perl-Data-Dumper from BuildRequires, only required by LIKWID build

* Add HttpReceiver as counterpart to the HttpSink (#49)

* Use GBytes as unit for large memory numbers

* Make maxForward configurable, save old name in meta in rename metrics and make the hostname tag key configurable

* Single release action (#55)

Building all RPMs and releasing in a single workflow

* Makefile target to build binary-only Debian packages (#61)

* Add 'install' and 'DEB' make targets to build binary-only Debian packages

* Add control file for DEB builds

* Use a single line for bash loop in make clean

* Add config options for retry intervals of InfluxDB clients (#59)

* Refactoring of LikwidCollector and metric units (#62)

* Reduce complexity of LikwidCollector and allow metric units

* Add unit to LikwidCollector docu and fix some typos

* Make library path configurable

* Use old metric name in Ganglia if rename has happened in the router (#60)

* Use old metric name if rename has happened in the router

* Also check for Ganglia renames for the oldname

* Derived metrics (#57)

* Add time-based derivatived (e.g. bandwidth) to some collectors

* Add documentation

* Add comments

* Fix: Only compute rates with a valid previous state

* Only compute rates with a valid previous state

* Define const values for net/dev fields

* Set default config values

* Add comments

* Refactor: Consolidate data structures

* Refactor: Consolidate data structures

* Refactor: Avoid struct deep copy

* Refactor: Avoid redundant tag maps

* Refactor: Use int64 type for absolut values

Co-authored-by: Holger Obermaier <40787752+ho-ob@users.noreply.github.com>

* Simplified iota usage

* Move unit tag to meta data tags

* Derived metrics (#65)

* Add time-based derivatived (e.g. bandwidth) to some collectors

* Add documentation

* Add comments

* Fix: Only compute rates with a valid previous state

* Only compute rates with a valid previous state

* Define const values for net/dev fields

* Set default config values

* Add comments

* Refactor: Consolidate data structures

* Refactor: Consolidate data structures

* Refactor: Avoid struct deep copy

* Refactor: Avoid redundant tag maps

* Refactor: Use int64 type for absolut values

* Update LustreCollector

Co-authored-by: Holger Obermaier <40787752+ho-ob@users.noreply.github.com>

* Meta to tags list and map for sinks (#63)

* Change ccMetric->Influx functions

* Use a meta_as_tags string list in config but create a lookup map afterwards

* Add meta as tag logic to sampleSink

* Fix staticcheck warnings (#66)

Co-authored-by: Holger Obermaier <40787752+ho-ob@users.noreply.github.com>
2022-03-15 16:41:11 +01:00
Thomas Roehl
3157386b3e Merge branch 'develop' into main 2022-03-04 23:34:28 +01:00
Thomas Roehl
ff08eaeb43 Set proper user for files 2022-03-04 11:49:55 +01:00
Thomas Roehl
64c41be34c Fix name for ClusterCockpit 2022-03-04 11:37:45 +01:00
Thomas Roehl
f4af520b2a Fix error print in LustreCollector 2022-03-04 11:32:39 +01:00
17 changed files with 223 additions and 770 deletions

View File

@@ -3,6 +3,7 @@ package collectors
import ( import (
"bufio" "bufio"
"encoding/json" "encoding/json"
"fmt"
"os" "os"
"strings" "strings"
"syscall" "syscall"
@@ -80,7 +81,8 @@ func (m *DiskstatCollector) Read(interval time.Duration, output chan lp.CCMetric
stat := syscall.Statfs_t{} stat := syscall.Statfs_t{}
err := syscall.Statfs(path, &stat) err := syscall.Statfs(path, &stat)
if err != nil { if err != nil {
continue fmt.Println(err.Error())
return
} }
tags := map[string]string{"type": "node", "device": linefields[0]} tags := map[string]string{"type": "node", "device": linefields[0]}
total := (stat.Blocks * uint64(stat.Bsize)) / uint64(1000000000) total := (stat.Blocks * uint64(stat.Bsize)) / uint64(1000000000)

View File

@@ -70,7 +70,6 @@ func (m *GpfsCollector) Init(config json.RawMessage) error {
for _, fs := range m.config.ExcludeFilesystem { for _, fs := range m.config.ExcludeFilesystem {
m.skipFS[fs] = struct{}{} m.skipFS[fs] = struct{}{}
} }
m.lastState = make(map[string]GpfsCollectorLastState)
// GPFS / IBM Spectrum Scale file system statistics can only be queried by user root // GPFS / IBM Spectrum Scale file system statistics can only be queried by user root
user, err := user.Current() user, err := user.Current()
@@ -163,16 +162,11 @@ func (m *GpfsCollector) Read(interval time.Duration, output chan lp.CCMetric) {
continue continue
} }
// Add filesystem tag
m.tags["filesystem"] = filesystem m.tags["filesystem"] = filesystem
if _, ok := m.lastState[filesystem]; !ok {
// Create initial last state m.lastState[filesystem] = GpfsCollectorLastState{
if m.config.SendBandwidths { bytesRead: -1,
if _, ok := m.lastState[filesystem]; !ok { bytesWritten: -1,
m.lastState[filesystem] = GpfsCollectorLastState{
bytesRead: -1,
bytesWritten: -1,
}
} }
} }

View File

@@ -18,18 +18,13 @@ import (
const IB_BASEPATH = "/sys/class/infiniband/" const IB_BASEPATH = "/sys/class/infiniband/"
// InfinibandCollectorMetric describes one InfiniBand port counter: the file
// the counter value is read from and the unit attached to the resulting metric.
type InfinibandCollectorMetric struct {
path string // counter file below the port's "counters" directory; checked for read access in Init() and read in Read()
unit string // unit of the absolute value (e.g. "bytes", "packets"); "/sec" is appended for the derived "_bw" metrics
}
type InfinibandCollectorInfo struct { type InfinibandCollectorInfo struct {
LID string // IB local Identifier (LID) LID string // IB local Identifier (LID)
device string // IB device device string // IB device
port string // IB device port port string // IB device port
portCounterFiles map[string]InfinibandCollectorMetric // mapping counter name -> InfinibandCollectorMetric portCounterFiles map[string]string // mapping counter name -> sysfs file
tagSet map[string]string // corresponding tag list tagSet map[string]string // corresponding tag list
lastState map[string]int64 // State from last measurement lastState map[string]int64 // State from last measurement
} }
type InfinibandCollector struct { type InfinibandCollector struct {
@@ -111,16 +106,16 @@ func (m *InfinibandCollector) Init(config json.RawMessage) error {
// Check access to counter files // Check access to counter files
countersDir := filepath.Join(path, "counters") countersDir := filepath.Join(path, "counters")
portCounterFiles := map[string]InfinibandCollectorMetric{ portCounterFiles := map[string]string{
"ib_recv": {path: filepath.Join(countersDir, "port_rcv_data"), unit: "bytes"}, "ib_recv": filepath.Join(countersDir, "port_rcv_data"),
"ib_xmit": {path: filepath.Join(countersDir, "port_xmit_data"), unit: "bytes"}, "ib_xmit": filepath.Join(countersDir, "port_xmit_data"),
"ib_recv_pkts": {path: filepath.Join(countersDir, "port_rcv_packets"), unit: "packets"}, "ib_recv_pkts": filepath.Join(countersDir, "port_rcv_packets"),
"ib_xmit_pkts": {path: filepath.Join(countersDir, "port_xmit_packets"), unit: "packets"}, "ib_xmit_pkts": filepath.Join(countersDir, "port_xmit_packets"),
} }
for _, counter := range portCounterFiles { for _, counterFile := range portCounterFiles {
err := unix.Access(counter.path, unix.R_OK) err := unix.Access(counterFile, unix.R_OK)
if err != nil { if err != nil {
return fmt.Errorf("unable to access %s: %v", counter.path, err) return fmt.Errorf("unable to access %s: %v", counterFile, err)
} }
} }
@@ -170,14 +165,14 @@ func (m *InfinibandCollector) Read(interval time.Duration, output chan lp.CCMetr
m.lastTimestamp = now m.lastTimestamp = now
for _, info := range m.info { for _, info := range m.info {
for counterName, counterDef := range info.portCounterFiles { for counterName, counterFile := range info.portCounterFiles {
// Read counter file // Read counter file
line, err := ioutil.ReadFile(counterDef.path) line, err := ioutil.ReadFile(counterFile)
if err != nil { if err != nil {
cclog.ComponentError( cclog.ComponentError(
m.name, m.name,
fmt.Sprintf("Read(): Failed to read from file '%s': %v", counterDef.path, err)) fmt.Sprintf("Read(): Failed to read from file '%s': %v", counterFile, err))
continue continue
} }
data := strings.TrimSpace(string(line)) data := strings.TrimSpace(string(line))
@@ -194,7 +189,6 @@ func (m *InfinibandCollector) Read(interval time.Duration, output chan lp.CCMetr
// Send absolut values // Send absolut values
if m.config.SendAbsoluteValues { if m.config.SendAbsoluteValues {
if y, err := lp.New(counterName, info.tagSet, m.meta, map[string]interface{}{"value": v}, now); err == nil { if y, err := lp.New(counterName, info.tagSet, m.meta, map[string]interface{}{"value": v}, now); err == nil {
y.AddMeta("unit", counterDef.unit)
output <- y output <- y
} }
} }
@@ -204,7 +198,6 @@ func (m *InfinibandCollector) Read(interval time.Duration, output chan lp.CCMetr
if info.lastState[counterName] >= 0 { if info.lastState[counterName] >= 0 {
rate := float64((v - info.lastState[counterName])) / timeDiff rate := float64((v - info.lastState[counterName])) / timeDiff
if y, err := lp.New(counterName+"_bw", info.tagSet, m.meta, map[string]interface{}{"value": rate}, now); err == nil { if y, err := lp.New(counterName+"_bw", info.tagSet, m.meta, map[string]interface{}{"value": rate}, now); err == nil {
y.AddMeta("unit", counterDef.unit+"/sec")
output <- y output <- y
} }
} }

View File

@@ -16,7 +16,6 @@ import (
"math" "math"
"os" "os"
"os/signal" "os/signal"
"sort"
"strconv" "strconv"
"strings" "strings"
"sync" "sync"
@@ -55,7 +54,6 @@ type LikwidEventsetConfig struct {
gid C.int gid C.int
eorder []*C.char eorder []*C.char
estr *C.char estr *C.char
go_estr string
results map[int]map[string]interface{} results map[int]map[string]interface{}
metrics map[int]map[string]float64 metrics map[int]map[string]float64
} }
@@ -103,14 +101,8 @@ func eventsToEventStr(events map[string]string) string {
func genLikwidEventSet(input LikwidCollectorEventsetConfig) LikwidEventsetConfig { func genLikwidEventSet(input LikwidCollectorEventsetConfig) LikwidEventsetConfig {
tmplist := make([]string, 0) tmplist := make([]string, 0)
clist := make([]string, 0)
for k := range input.Events {
clist = append(clist, k)
}
sort.Strings(clist)
elist := make([]*C.char, 0) elist := make([]*C.char, 0)
for _, k := range clist { for k, v := range input.Events {
v := input.Events[k]
tmplist = append(tmplist, fmt.Sprintf("%s:%s", v, k)) tmplist = append(tmplist, fmt.Sprintf("%s:%s", v, k))
c_counter := C.CString(k) c_counter := C.CString(k)
elist = append(elist, c_counter) elist = append(elist, c_counter)
@@ -132,7 +124,6 @@ func genLikwidEventSet(input LikwidCollectorEventsetConfig) LikwidEventsetConfig
gid: -1, gid: -1,
eorder: elist, eorder: elist,
estr: C.CString(estr), estr: C.CString(estr),
go_estr: estr,
results: res, results: res,
metrics: met, metrics: met,
} }
@@ -202,7 +193,7 @@ func (m *LikwidCollector) Init(config json.RawMessage) error {
} }
m.setup() m.setup()
m.meta = map[string]string{"group": "PerfCounter"} m.meta = map[string]string{"source": m.name, "group": "PerfCounter"}
cclog.ComponentDebug(m.name, "Get cpulist and init maps and lists") cclog.ComponentDebug(m.name, "Get cpulist and init maps and lists")
cpulist := topo.CpuList() cpulist := topo.CpuList()
m.cpulist = make([]C.int, len(cpulist)) m.cpulist = make([]C.int, len(cpulist))
@@ -329,11 +320,7 @@ func (m *LikwidCollector) calcEventsetMetrics(evset LikwidEventsetConfig, interv
gctr := C.GoString(counter) gctr := C.GoString(counter)
for _, tid := range m.cpu2tid { for _, tid := range m.cpu2tid {
res := C.perfmon_getLastResult(evset.gid, C.int(eidx), C.int(tid)) res := C.perfmon_getLastResult(evset.gid, C.int(eidx), C.int(tid))
fres := float64(res) evset.results[tid][gctr] = float64(res)
if m.config.InvalidToZero && (math.IsNaN(fres) || math.IsInf(fres, 0)) {
fres = 0.0
}
evset.results[tid][gctr] = fres
evset.results[tid]["time"] = interval.Seconds() evset.results[tid]["time"] = interval.Seconds()
evset.results[tid]["inverseClock"] = invClock evset.results[tid]["inverseClock"] = invClock
} }
@@ -352,12 +339,15 @@ func (m *LikwidCollector) calcEventsetMetrics(evset LikwidEventsetConfig, interv
value, err := agg.EvalFloat64Condition(metric.Calc, evset.results[tid]) value, err := agg.EvalFloat64Condition(metric.Calc, evset.results[tid])
if err != nil { if err != nil {
cclog.ComponentError(m.name, "Calculation for metric", metric.Name, "failed:", err.Error()) cclog.ComponentError(m.name, "Calculation for metric", metric.Name, "failed:", err.Error())
value = 0.0 continue
}
if m.config.InvalidToZero && (math.IsNaN(value) || math.IsInf(value, 0)) {
value = 0.0
} }
evset.metrics[tid][metric.Name] = value evset.metrics[tid][metric.Name] = value
if m.config.InvalidToZero && math.IsNaN(value) {
value = 0.0
}
if m.config.InvalidToZero && math.IsInf(value, 0) {
value = 0.0
}
// Now we have the result, send it with the proper tags // Now we have the result, send it with the proper tags
if !math.IsNaN(value) { if !math.IsNaN(value) {
if metric.Publish { if metric.Publish {
@@ -401,12 +391,15 @@ func (m *LikwidCollector) calcGlobalMetrics(interval time.Duration, output chan
value, err := agg.EvalFloat64Condition(metric.Calc, params) value, err := agg.EvalFloat64Condition(metric.Calc, params)
if err != nil { if err != nil {
cclog.ComponentError(m.name, "Calculation for metric", metric.Name, "failed:", err.Error()) cclog.ComponentError(m.name, "Calculation for metric", metric.Name, "failed:", err.Error())
value = 0.0 continue
}
if m.config.InvalidToZero && (math.IsNaN(value) || math.IsInf(value, 0)) {
value = 0.0
} }
m.gmresults[tid][metric.Name] = value m.gmresults[tid][metric.Name] = value
if m.config.InvalidToZero && math.IsNaN(value) {
value = 0.0
}
if m.config.InvalidToZero && math.IsInf(value, 0) {
value = 0.0
}
// Now we have the result, send it with the proper tags // Now we have the result, send it with the proper tags
if !math.IsNaN(value) { if !math.IsNaN(value) {
if metric.Publish { if metric.Publish {
@@ -432,9 +425,6 @@ func (m *LikwidCollector) calcGlobalMetrics(interval time.Duration, output chan
func (m *LikwidCollector) LateInit() error { func (m *LikwidCollector) LateInit() error {
var ret C.int var ret C.int
if m.initialized {
return nil
}
switch m.config.AccessMode { switch m.config.AccessMode {
case "direct": case "direct":
C.HPMmode(0) C.HPMmode(0)
@@ -485,17 +475,7 @@ func (m *LikwidCollector) LateInit() error {
for i, evset := range m.config.Eventsets { for i, evset := range m.config.Eventsets {
var gid C.int var gid C.int
if len(evset.Events) > 0 { if len(evset.Events) > 0 {
skip := false
likwidGroup := genLikwidEventSet(evset) likwidGroup := genLikwidEventSet(evset)
for _, g := range m.likwidGroups {
if likwidGroup.go_estr == g.go_estr {
skip = true
break
}
}
if skip {
continue
}
// Now we add the list of events to likwid // Now we add the list of events to likwid
gid = C.perfmon_addEventSet(likwidGroup.estr) gid = C.perfmon_addEventSet(likwidGroup.estr)
if gid >= 0 { if gid >= 0 {
@@ -540,14 +520,9 @@ func (m *LikwidCollector) Read(interval time.Duration, output chan lp.CCMetric)
} }
if !m.initialized { if !m.initialized {
m.lock.Lock() if m.LateInit() != nil {
err = m.LateInit()
if err != nil {
m.lock.Unlock()
return return
} }
m.initialized = true
m.lock.Unlock()
} }
if m.initialized && !skip { if m.initialized && !skip {

View File

@@ -3,63 +3,32 @@
The `likwid` collector is probably the most complicated collector. The LIKWID library is included as static library with *direct* access mode. The *direct* access mode is suitable if the daemon is executed by a root user. The static library does not contain the performance groups, so all information needs to be provided in the configuration. The `likwid` collector is probably the most complicated collector. The LIKWID library is included as static library with *direct* access mode. The *direct* access mode is suitable if the daemon is executed by a root user. The static library does not contain the performance groups, so all information needs to be provided in the configuration.
```json The `likwid` configuration consists of two parts, the "eventsets" and "globalmetrics":
"likwid": { - An event set list itself has two parts, the "events" and a set of derivable "metrics". Each of the "events" is a counter:event pair in LIKWID's syntax. The "metrics" are a list of formulas to derive the metric value from the measurements of the "events". Each metric has a name, the formula, a scope and a publish flag. Counter names can be used like variables in the formulas, so `PMC0+PMC1` sums the measurements for the both events configured in the counters `PMC0` and `PMC1`. The scope tells the Collector whether it is a metric for each hardware thread (`cpu`) or each CPU socket (`socket`). You may specify a unit for the metric with `unit`. The last one is the publishing flag. It tells the collector whether a metric should be sent to the router.
"force_overwrite" : false, - The global metrics are metrics which require data from all event set measurements to be derived. The inputs are the metrics in the event sets. Similar to the metrics in the event sets, the global metrics are defined by a name, a formula, a scope and a publish flag. See event set metrics for details. The only difference is that there is no access to the raw event measurements anymore but only to the metrics. So, the idea is to derive a metric in the "eventsets" section and reuse it in the "globalmetrics" part. If you need a metric only for deriving the global metrics, disable forwarding of the event set metrics (`publish=false`). **Be aware** that the combination might be misleading because the "behavior" of a metric changes over time and the multiple measurements might count different computing phases. Similar to the metrics in the eventset, you can specify a metric unit with the `unit` field.
"invalid_to_zero" : false,
"eventsets": [
{
"events" : {
"COUNTER0": "EVENT0",
"COUNTER1": "EVENT1",
},
"metrics" : [
{
"name": "sum_01",
"calc": "COUNTER0 + COUNTER1",
"publish": false,
"unit": "myunit",
"type": "cpu"
}
]
}
]
"globalmetrics" : [
{
"name": "global_sum",
"calc": "sum_01",
"publish": true,
"unit": "myunit",
"type": "cpu"
}
]
}
```
The `likwid` configuration consists of two parts, the `eventsets` and `globalmetrics`:
- An event set list itself has two parts, the `events` and a set of derivable `metrics`. Each of the `events` is a `counter:event` pair in LIKWID's syntax. The `metrics` are a list of formulas to derive the metric value from the measurements of the `events`' values. Each metric has a name, the formula, a type and a publish flag. There is an optional `unit` field. Counter names can be used like variables in the formulas, so `PMC0+PMC1` sums the measurements for both events configured in the counters `PMC0` and `PMC1`. You can optionally use `time` for the measurement time and `inverseClock` for `1.0/baseCpuFrequency`. The type tells the LikwidCollector whether it is a metric for each hardware thread (`cpu`) or each CPU socket (`socket`). You may specify a unit for the metric with `unit`. The last one is the publishing flag. It tells the LikwidCollector whether a metric should be sent to the router or is only used internally to compute a global metric.
- The `globalmetrics` are metrics which require data from multiple event set measurements to be derived. The inputs are the metrics in the event sets. Similar to the metrics in the event sets, the global metrics are defined by a name, a formula, a scope and a publish flag. See event set metrics for details. The only difference is that there is no access to the raw event measurements anymore but only to the metrics. Also `time` and `inverseClock` cannot be used anymore. So, the idea is to derive a metric in the `eventsets` section and reuse it in the `globalmetrics` part. If you need a metric only for deriving the global metrics, disable forwarding of the event set metrics (`"publish": false`). **Be aware** that the combination might be misleading because the "behavior" of a metric changes over time and the multiple measurements might count different computing phases. Similar to the metrics in the eventset, you can specify a metric unit with the `unit` field.
Additional options: Additional options:
- `access_mode` : Method to use for hardware performance monitoring (`direct` access as root user, `accessdaemon` for the daemon mode)
- `accessdaemon_path`: Folder with the access daemon `likwid-accessD`, commonly `$LIKWID_INSTALL_LOC/sbin`
- `force_overwrite`: Same as setting `LIKWID_FORCE=1`. In case counters are already in-use, LIKWID overwrites their configuration to do its measurements - `force_overwrite`: Same as setting `LIKWID_FORCE=1`. In case counters are already in-use, LIKWID overwrites their configuration to do its measurements
- `invalid_to_zero`: In some cases, the calculations result in `NaN` or `Inf`. With this option, all `NaN` and `Inf` values are replaced with `0.0`. See below in [separate section](./likwidMetric.md#invalid_to_zero-option) - `invalid_to_zero`: In some cases, the calculations result in `NaN` or `Inf`. With this option, all `NaN` and `Inf` values are replaced with `0.0`.
- `access_mode`: Specify LIKWID access mode: `direct` for direct register access as root user or `accessdaemon`. The access mode `perf_event` is currently untested. - `access_mode`: Specify LIKWID access mode: `direct` for direct register access as root user or `accessdaemon`
- `accessdaemon_path`: Folder of the accessDaemon `likwid-accessD` (like `/usr/local/sbin`) - `accessdaemon_path`: Folder of the accessDaemon `likwid-accessD`
- `liblikwid_path`: Location of `liblikwid.so` including file name like `/usr/local/lib/liblikwid.so` - `liblikwid_path`: Location of `liblikwid.so`
### Available metric scopes ### Available metric scopes
Hardware performance counters are scattered all over the system nowadays. A counter covers a specific part of the system. While there are hardware thread specific counters for CPU cycles, instructions and so on, some others are specific for a whole CPU socket/package. To address that, the LikwidCollector provides the specification of a `type` for each metric. Hardware performance counters are scattered all over the system nowadays. A counter covers a specific part of the system. While there are hardware thread specific counters for CPU cycles, instructions and so on, some others are specific for a whole CPU socket/package. To address that, the collector provides the specification of a 'scope' for each metric.
- `cpu` : One metric per CPU hardware thread with the tags `"type" : "cpu"` and `"type-id" : "$cpu_id"` - `cpu` : One metric per CPU hardware thread with the tags `"type" : "cpu"` and `"type-id" : "$cpu_id"`
- `socket` : One metric per CPU socket/package with the tags `"type" : "socket"` and `"type-id" : "$socket_id"` - `socket` : One metric per CPU socket/package with the tags `"type" : "socket"` and `"type-id" : "$socket_id"`
**Note:** You should not specify the `socket` type for a metric that is measured at `cpu` scope and vice versa, so some kind of expert knowledge or lookup work in the [Likwid Wiki](https://github.com/RRZE-HPC/likwid/wiki) is required. Get the scope of each counter from the *Architecture* pages and as soon as one counter in a metric is socket-specific, the whole metric is socket-specific. **Note:** You cannot specify `socket` scope for a metric that is measured at `cpu` scope, so some kind of expert knowledge or lookup work in the [Likwid Wiki](https://github.com/RRZE-HPC/likwid/wiki) is required. Get the scope of each counter from the *Architecture* pages and as soon as one counter in a metric is socket-specific, the whole metric is socket-specific.
As a guideline: As a guideline:
- All counters `FIXCx`, `PMCy` and `TMAz` have the scope `cpu` - All counters `FIXCx`, `PMCy` and `TMAz` have the scope `cpu`
- All counter names containing `BOX` have the scope `socket` - All counter names containing `BOX` have the scope `socket`
- All `PWRx` counters have scope `socket`, except `"PWR1" : "RAPL_CORE_ENERGY"` has `cpu` scope (AMD Zen) - All `PWRx` counters have scope `socket`, except `"PWR1" : "RAPL_CORE_ENERGY"` has `cpu` scope
- All `DFCx` counters have scope `socket` - All `DFCx` counters have scope `socket`
### Help with the configuration ### Help with the configuration
@@ -81,7 +50,6 @@ $ scripts/likwid_perfgroup_to_cc_config.py ICX MEM_DP
{ {
"events": { "events": {
"FIXC0": "INSTR_RETIRED_ANY", "FIXC0": "INSTR_RETIRED_ANY",
"FIXC1": "CPU_CLK_UNHALTED_CORE",
"..." : "..." "..." : "..."
}, },
"metrics" : [ "metrics" : [
@@ -107,28 +75,21 @@ LIKWID checks the file `/var/run/likwid.lock` before performing any interfering
Before (SLURM prolog, ...) Before (SLURM prolog, ...)
``` ```
$ chown $JOBUSER /var/run/likwid.lock $ chown $JOBUSER /var/run/likwid.lock
``` ```
After (SLURM epilog, ...) After (SLURM epilog, ...)
``` ```
$ chown $CCUSER /var/run/likwid.lock $ chown $CCUSER /var/run/likwid.lock
``` ```
### `invalid_to_zero` option
In some cases LIKWID returns `0.0` for some events that are further used in processing and maybe used as divisor in a calculation. After evaluation of a metric, the result might be `NaN` or `+-Inf`. These resulting metrics are commonly not created and forwarded to the router because the [InfluxDB line protocol](https://docs.influxdata.com/influxdb/cloud/reference/syntax/line-protocol/#float) does not support these special floating-point values. If you want to have them sent, this option forces these metric values to be `0.0` instead.
One might think this does not happen often, but commonly used metrics in the world of performance engineering like Instructions-per-Cycle (IPC) or, more frequently, the actual CPU clock are derived from events like `CPU_CLK_UNHALTED_CORE` (Intel) which do not increment in halted state (as the name implies). There are different power management systems in a chip which can cause a hardware thread to go into such a state. Moreover, if no cycles are executed by the core, many other events are not incremented either (like `INSTR_RETIRED_ANY` for retired instructions, which is part of IPC).
### Example configuration ### Example configuration
#### AMD Zen3
```json ```json
"likwid": { "likwid": {
"force_overwrite" : false, "force_overwrite" : false,
"invalid_to_zero" : false, "nan_to_zero" : false,
"eventsets": [ "eventsets": [
{ {
"events": { "events": {
@@ -219,3 +180,33 @@ One might think this does not happen often but often used metrics in the world o
} }
``` ```
### How to get the eventsets and metrics from LIKWID
The `likwid` collector reads hardware performance counters at a **cpu** and **socket** level. The configuration looks quite complicated but it is basically copy&paste from [LIKWID's performance groups](https://github.com/RRZE-HPC/likwid/tree/master/groups). The collector made multiple iterations and tried to use the performance groups but it lacked flexibility. The current way of configuration provides most flexibility.
The logic is as follows: There are multiple eventsets, each consisting of a list of counters+events and a list of metrics. If you compare a common performance group with the example setting above, there is not much difference:
```
EVENTSET -> "events": {
FIXC1 ACTUAL_CPU_CLOCK -> "FIXC1": "ACTUAL_CPU_CLOCK",
FIXC2 MAX_CPU_CLOCK -> "FIXC2": "MAX_CPU_CLOCK",
PMC0 RETIRED_INSTRUCTIONS -> "PMC0" : "RETIRED_INSTRUCTIONS",
PMC1 CPU_CLOCKS_UNHALTED -> "PMC1" : "CPU_CLOCKS_UNHALTED",
PMC2 RETIRED_SSE_AVX_FLOPS_ALL -> "PMC2": "RETIRED_SSE_AVX_FLOPS_ALL",
PMC3 MERGE -> "PMC3": "MERGE",
-> }
```
The metrics are following the same procedure:
```
METRICS -> "metrics": [
IPC PMC0/PMC1 -> {
-> "name" : "IPC",
-> "calc" : "PMC0/PMC1",
-> "scope": "cpu",
-> "publish": true
-> }
-> ]
```
The script `scripts/likwid_perfgroup_to_cc_config.py` might help you.

21
go.mod
View File

@@ -3,14 +3,17 @@ module github.com/ClusterCockpit/cc-metric-collector
go 1.16 go 1.16
require ( require (
github.com/NVIDIA/go-nvml v0.11.6-0 github.com/NVIDIA/go-nvml v0.11.1-0
github.com/PaesslerAG/gval v1.1.2 github.com/influxdata/influxdb-client-go/v2 v2.7.0
github.com/gorilla/mux v1.8.0
github.com/influxdata/influxdb-client-go/v2 v2.8.1
github.com/influxdata/line-protocol v0.0.0-20210922203350-b1ad95c89adf github.com/influxdata/line-protocol v0.0.0-20210922203350-b1ad95c89adf
github.com/nats-io/nats-server/v2 v2.8.0 // indirect github.com/nats-io/nats.go v1.13.1-0.20211122170419-d7c1d78a50fc
github.com/nats-io/nats.go v1.14.0 golang.org/x/sys v0.0.0-20220114195835-da31bd327af9
github.com/prometheus/client_golang v1.12.1 gopkg.in/Knetic/govaluate.v2 v2.3.0
github.com/stmcginnis/gofish v0.13.0 )
golang.org/x/sys v0.0.0-20220412211240-33da011f77ad
require (
github.com/PaesslerAG/gval v1.1.2
github.com/golang/protobuf v1.5.2 // indirect
github.com/nats-io/nats-server/v2 v2.7.0 // indirect
google.golang.org/protobuf v1.27.1 // indirect
) )

View File

@@ -48,6 +48,7 @@ type metricRouter struct {
done chan bool // channel to finish / stop metric router done chan bool // channel to finish / stop metric router
wg *sync.WaitGroup // wait group for all goroutines in cc-metric-collector wg *sync.WaitGroup // wait group for all goroutines in cc-metric-collector
timestamp time.Time // timestamp periodically updated by ticker each interval timestamp time.Time // timestamp periodically updated by ticker each interval
timerdone chan bool // channel to finish / stop timestamp updater
ticker mct.MultiChanTicker // periodically ticking once each interval ticker mct.MultiChanTicker // periodically ticking once each interval
config metricRouterConfig // json encoded config for metric router config metricRouterConfig // json encoded config for metric router
cache MetricCache // pointer to MetricCache cache MetricCache // pointer to MetricCache
@@ -123,6 +124,29 @@ func (r *metricRouter) Init(ticker mct.MultiChanTicker, wg *sync.WaitGroup, rout
return nil return nil
} }
// StartTimer hands a fresh time channel to the router's ticker and launches a
// goroutine that stores every value received on that channel in r.timestamp.
// The goroutine is accounted for in r.wg. It ends once a value is received on
// r.timerdone, at which point it closes r.timerdone and returns.
func (r *metricRouter) StartTimer() {
	tickChan := make(chan time.Time)
	r.ticker.AddChannel(tickChan)
	r.timerdone = make(chan bool)

	// updateTimestamp runs until a stop request arrives on r.timerdone.
	updateTimestamp := func() {
		defer r.wg.Done()
		for {
			select {
			case now := <-tickChan:
				// New tick: remember it as the current timestamp
				r.timestamp = now
			case <-r.timerdone:
				// Stop requested: closing the channel signals that the
				// goroutine has finished
				close(r.timerdone)
				cclog.ComponentDebug("MetricRouter", "TIMER DONE")
				return
			}
		}
	}

	r.wg.Add(1)
	go updateTimestamp()
	cclog.ComponentDebug("MetricRouter", "TIMER START")
}
func getParamMap(point lp.CCMetric) map[string]interface{} { func getParamMap(point lp.CCMetric) map[string]interface{} {
params := make(map[string]interface{}) params := make(map[string]interface{})
params["metric"] = point params["metric"] = point
@@ -211,9 +235,8 @@ func (r *metricRouter) dropMetric(point lp.CCMetric) bool {
func (r *metricRouter) Start() { func (r *metricRouter) Start() {
// start timer if configured // start timer if configured
r.timestamp = time.Now() r.timestamp = time.Now()
timeChan := make(chan time.Time)
if r.config.IntervalStamp { if r.config.IntervalStamp {
r.ticker.AddChannel(timeChan) r.StartTimer()
} }
// Router manager is done // Router manager is done
@@ -293,10 +316,6 @@ func (r *metricRouter) Start() {
done() done()
return return
case timestamp := <-timeChan:
r.timestamp = timestamp
cclog.ComponentDebug("MetricRouter", "Update timestamp", r.timestamp.UnixNano())
case p := <-r.coll_input: case p := <-r.coll_input:
coll_forward(p) coll_forward(p)
for i := 0; len(r.coll_input) > 0 && i < (r.maxForward-1); i++ { for i := 0; len(r.coll_input) > 0 && i < (r.maxForward-1); i++ {
@@ -342,6 +361,14 @@ func (r *metricRouter) Close() {
// wait for close of channel r.done // wait for close of channel r.done
<-r.done <-r.done
// stop timer
if r.config.IntervalStamp {
cclog.ComponentDebug("MetricRouter", "TIMER CLOSE")
r.timerdone <- true
// wait for close of channel r.timerdone
<-r.timerdone
}
// stop metric cache // stop metric cache
if r.config.NumCacheIntervals > 0 { if r.config.NumCacheIntervals > 0 {
cclog.ComponentDebug("MetricRouter", "CACHE CLOSE") cclog.ComponentDebug("MetricRouter", "CACHE CLOSE")

View File

@@ -4,22 +4,5 @@
"address": "nats://my-url", "address": "nats://my-url",
"port" : "4222", "port" : "4222",
"database": "testcluster" "database": "testcluster"
},
"redfish_recv": {
"type": "redfish",
"client_config": [
{
"hostname": "my-host-1",
"username": "username-1",
"password": "password-1",
"endpoint": "https://my-endpoint-1"
},
{
"hostname": "my-host-2",
"username": "username-2",
"password": "password-2",
"endpoint": "https://my-endpoint-2"
}
]
} }
} }

View File

@@ -10,13 +10,14 @@ import (
) )
var AvailableReceivers = map[string]func(name string, config json.RawMessage) (Receiver, error){ var AvailableReceivers = map[string]func(name string, config json.RawMessage) (Receiver, error){
"nats": NewNatsReceiver, "nats": NewNatsReceiver,
"redfish": NewRedfishReceiver,
} }
type receiveManager struct { type receiveManager struct {
inputs []Receiver inputs []Receiver
output chan lp.CCMetric output chan lp.CCMetric
done chan bool
wg *sync.WaitGroup
config []json.RawMessage config []json.RawMessage
} }
@@ -32,6 +33,8 @@ func (rm *receiveManager) Init(wg *sync.WaitGroup, receiverConfigFile string) er
// Initialize struct fields // Initialize struct fields
rm.inputs = make([]Receiver, 0) rm.inputs = make([]Receiver, 0)
rm.output = nil rm.output = nil
rm.done = make(chan bool)
rm.wg = wg
rm.config = make([]json.RawMessage, 0) rm.config = make([]json.RawMessage, 0)
configFile, err := os.Open(receiverConfigFile) configFile, err := os.Open(receiverConfigFile)
@@ -55,7 +58,7 @@ func (rm *receiveManager) Init(wg *sync.WaitGroup, receiverConfigFile string) er
} }
func (rm *receiveManager) Start() { func (rm *receiveManager) Start() {
cclog.ComponentDebug("ReceiveManager", "START") rm.wg.Add(1)
for _, r := range rm.inputs { for _, r := range rm.inputs {
cclog.ComponentDebug("ReceiveManager", "START", r.Name()) cclog.ComponentDebug("ReceiveManager", "START", r.Name())
@@ -94,19 +97,16 @@ func (rm *receiveManager) AddOutput(output chan lp.CCMetric) {
} }
func (rm *receiveManager) Close() { func (rm *receiveManager) Close() {
cclog.ComponentDebug("ReceiveManager", "CLOSE")
// Close all receivers
for _, r := range rm.inputs { for _, r := range rm.inputs {
cclog.ComponentDebug("ReceiveManager", "CLOSE", r.Name()) cclog.ComponentDebug("ReceiveManager", "CLOSE", r.Name())
r.Close() r.Close()
} }
rm.wg.Done()
cclog.ComponentDebug("ReceiveManager", "DONE") cclog.ComponentDebug("ReceiveManager", "CLOSE")
} }
func New(wg *sync.WaitGroup, receiverConfigFile string) (ReceiveManager, error) { func New(wg *sync.WaitGroup, receiverConfigFile string) (ReceiveManager, error) {
r := new(receiveManager) r := &receiveManager{}
err := r.Init(wg, receiverConfigFile) err := r.Init(wg, receiverConfigFile)
if err != nil { if err != nil {
return nil, err return nil, err

View File

@@ -1,324 +0,0 @@
package receivers
import (
"encoding/json"
"fmt"
"strconv"
"sync"
"time"
cclog "github.com/ClusterCockpit/cc-metric-collector/internal/ccLogger"
lp "github.com/ClusterCockpit/cc-metric-collector/internal/ccMetric"
// See: https://pkg.go.dev/github.com/stmcginnis/gofish
"github.com/stmcginnis/gofish"
)
// RedfishReceiver is a receiver that reads power metrics from one or more
// redfish services. It bundles the JSON configuration with the channel and
// wait group used to stop the receiver's goroutine.
type RedfishReceiver struct {
	receiver

	// config is filled from the receiver specific JSON config
	config struct {
		Type     string `json:"type"`
		Fanout   int    `json:"fanout,omitempty"`   // Default fanout: 64
		Interval int    `json:"interval,omitempty"` // Default interval: 30s

		// Client config for each redfish service
		ClientConfigs []struct {
			// Pointer fields stay nil when the key is missing in the JSON config
			Hostname       *string  `json:"hostname"`
			Username       *string  `json:"username"`
			Password       *string  `json:"password"`
			Endpoint       *string  `json:"endpoint"`
			Insecure       *bool    `json:"insecure,omitempty"`
			ExcludeMetrics []string `json:"exclude_metrics,omitempty"`

			// gofish client config, passed to gofish.Connect.
			// Unexported, so it is not read from JSON
			gofish gofish.ClientConfig
		} `json:"client_config"`
	}

	done chan bool      // channel to finish / stop redfish receiver
	wg   sync.WaitGroup // wait group for redfish receiver
}
// Start starts the redfish receiver.
// It launches a goroutine that reads the power metrics of all configured
// redfish services once immediately and then again on every tick of a ticker
// running with the configured interval (in seconds), until a done event is
// received on r.done.
func (r *RedfishReceiver) Start() {
	cclog.ComponentDebug(r.name, "START")

	// readPowerMetric reads redfish power metrics from the endpoint configured in
	// r.config.ClientConfigs[clientConfigIndex] and sends them to r.sink
	readPowerMetric := func(clientConfigIndex int) error {
		clientConfig := &r.config.ClientConfigs[clientConfigIndex]

		// Connect to redfish service
		c, err := gofish.Connect(clientConfig.gofish)
		if err != nil {
			// Only include selected fields of the client config in the
			// error message (the password is not part of it)
			c := struct {
				Username  string
				Endpoint  string
				BasicAuth bool
				Insecure  bool
			}{
				Username:  clientConfig.gofish.Username,
				Endpoint:  clientConfig.gofish.Endpoint,
				BasicAuth: clientConfig.gofish.BasicAuth,
				Insecure:  clientConfig.gofish.Insecure,
			}
			return fmt.Errorf("readPowerMetric: gofish.Connect(%+v) failed: %v", c, err)
		}
		// Log out from the redfish service when this read is finished
		defer c.Logout()

		// Get all chassis managed by this service
		chassis_list, err := c.Service.Chassis()
		if err != nil {
			return fmt.Errorf("readPowerMetric: c.Service.Chassis() failed: %v", err)
		}

		for _, chassis := range chassis_list {
			// All metrics of one chassis share the same timestamp
			timestamp := time.Now()

			// Get power information for each chassis
			// An error aborts the read, remaining chassis are not processed
			power, err := chassis.Power()
			if err != nil {
				return fmt.Errorf("readPowerMetric: chassis.Power() failed: %v", err)
			}
			// Skip chassis without power information
			if power == nil {
				continue
			}

			// Read min, max and average consumed watts for each power control
			for _, pc := range power.PowerControl {

				// Map of collected metrics
				metrics := map[string]float32{
					// PowerConsumedWatts shall represent the actual power being consumed (in
					// Watts) by the chassis
					"consumed_watts": pc.PowerConsumedWatts,
					// AverageConsumedWatts shall represent the
					// average power level that occurred averaged over the last IntervalInMin
					// minutes.
					"average_consumed_watts": pc.PowerMetrics.AverageConsumedWatts,
					// MinConsumedWatts shall represent the
					// minimum power level in watts that occurred within the last
					// IntervalInMin minutes.
					"min_consumed_watts": pc.PowerMetrics.MinConsumedWatts,
					// MaxConsumedWatts shall represent the
					// maximum power level in watts that occurred within the last
					// IntervalInMin minutes
					"max_consumed_watts": pc.PowerMetrics.MaxConsumedWatts,
				}
				// Averaging interval as string, it is attached to the meta data below
				intervalInMin := strconv.FormatFloat(float64(pc.PowerMetrics.IntervalInMin), 'f', -1, 32)

				// Metrics to exclude
				for _, key := range clientConfig.ExcludeMetrics {
					delete(metrics, key)
				}

				// Set tags
				tags := map[string]string{
					"hostname": *clientConfig.Hostname,
					"type": "node",
					// ID uniquely identifies the resource
					"id": pc.ID,
					// MemberID shall uniquely identify the member within the collection. For
					// services supporting Redfish v1.6 or higher, this value shall be the
					// zero-based array index.
					"member_id": pc.MemberID,
					// PhysicalContext shall be a description of the affected device(s) or region
					// within the chassis to which this power control applies.
					"physical_context": string(pc.PhysicalContext),
					// Name
					"power_control_name": pc.Name,
				}

				// Delete empty tags
				for key, value := range tags {
					if value == "" {
						delete(tags, key)
					}
				}

				// Set meta data tags
				meta := map[string]string{
					"source": r.name,
					"group": "Energy",
					"interval_in_minutes": intervalInMin,
					"unit": "watts",
				}

				// Delete empty meta data tags
				for key, value := range meta {
					if value == "" {
						delete(meta, key)
					}
				}

				// Create one metric per remaining entry and send it to the sink.
				// Metrics that could not be created are dropped silently
				for name, value := range metrics {
					y, err := lp.New(name, tags, meta,
						map[string]interface{}{
							"value": value,
						},
						timestamp)
					if err == nil {
						r.sink <- y
					}
				}
			}
		}

		return nil
	}

	// doReadPowerMetric reads power metrics for all configured redfish services.
	// To compensate latencies of the Redfish services a fanout is used.
	doReadPowerMetric := func() {

		// Compute fanout to use
		// There is no need for more workers than client configs
		realFanout := r.config.Fanout
		if len(r.config.ClientConfigs) < realFanout {
			realFanout = len(r.config.ClientConfigs)
		}

		// Create wait group and input channel for workers
		var workerWaitGroup sync.WaitGroup
		workerInput := make(chan int, realFanout)

		// Create worker go routines
		for i := 0; i < realFanout; i++ {
			// Increment worker wait group counter
			workerWaitGroup.Add(1)
			go func() {
				// Decrement worker wait group counter
				defer workerWaitGroup.Done()

				// Read power metrics for each client config
				// The loop ends when workerInput is closed and empty
				for clientConfigIndex := range workerInput {
					err := readPowerMetric(clientConfigIndex)
					if err != nil {
						cclog.ComponentError(r.name, err)
					}
				}
			}()
		}

		// Distribute client configs to workers
		for i := range r.config.ClientConfigs {
			// Check done channel status
			select {
			case workerInput <- i:
			case <-r.done:
				// process done event
				// Stop workers, clear channel and wait for all workers to finish
				close(workerInput)
				for range workerInput {
				}
				workerWaitGroup.Wait()
				return
			}
		}

		// Stop workers and wait for all workers to finish
		close(workerInput)
		workerWaitGroup.Wait()
	}

	// Start redfish receiver
	r.wg.Add(1)
	go func() {
		defer r.wg.Done()

		// Create ticker
		ticker := time.NewTicker(time.Duration(r.config.Interval) * time.Second)
		defer ticker.Stop()

		for {
			// Read first, then wait: the first read happens right after start
			doReadPowerMetric()

			select {
			case <-ticker.C:
				// process ticker event -> continue
				continue
			case <-r.done:
				// process done event
				return
			}
		}
	}()

	cclog.ComponentDebug(r.name, "STARTED")
}
// Close stops the redfish receiver.
// It closes the done channel, which signals the goroutine started by Start to
// finish, and then blocks until the receiver's wait group is released.
func (r *RedfishReceiver) Close() {
	cclog.ComponentDebug(r.name, "CLOSE")

	// Send the signal and wait
	// NOTE(review): closing r.done a second time would panic, so Close
	// presumably must be called only once per receiver
	close(r.done)
	r.wg.Wait()

	cclog.ComponentDebug(r.name, "DONE")
}
// NewRedfishReceiver creates a new instance of the redfish receiver.
// The receiver gets the name "RedfishReceiver(<name>)" and is configured by
// the JSON in config, which may overwrite the defaults (fanout: 64,
// interval: 30 seconds).
//
// An error is returned if
//   - the JSON config cannot be parsed,
//   - fanout or interval is not greater than zero,
//   - a client config lacks hostname, endpoint, username or password.
func NewRedfishReceiver(name string, config json.RawMessage) (Receiver, error) {
	r := new(RedfishReceiver)

	// Set name
	r.name = fmt.Sprintf("RedfishReceiver(%s)", name)

	// Create done channel
	r.done = make(chan bool)

	// Set defaults in r.config
	// Allow overwriting these defaults by reading config JSON
	r.config.Fanout = 64
	r.config.Interval = 30

	// Read the redfish receiver specific JSON config
	if len(config) > 0 {
		err := json.Unmarshal(config, &r.config)
		if err != nil {
			cclog.ComponentError(r.name, "Error reading config:", err.Error())
			return nil, err
		}
	}

	// Fanout is used as number of workers and as channel capacity:
	// a negative value would panic, zero would start no worker at all
	if r.config.Fanout <= 0 {
		err := fmt.Errorf("fanout must be greater than zero, got %v", r.config.Fanout)
		cclog.ComponentError(r.name, err)
		return nil, err
	}

	// Interval is used as ticker duration: time.NewTicker panics
	// if the duration is not greater than zero
	if r.config.Interval <= 0 {
		err := fmt.Errorf("interval must be greater than zero, got %v", r.config.Interval)
		cclog.ComponentError(r.name, err)
		return nil, err
	}

	// Create gofish client config
	for i := range r.config.ClientConfigs {
		clientConfig := &r.config.ClientConfigs[i]
		gofishConfig := &clientConfig.gofish

		// Hostname is not part of the gofish config, but it is required
		// as hostname tag of the metrics
		if clientConfig.Hostname == nil {
			err := fmt.Errorf("client config number %v requires hostname", i)
			cclog.ComponentError(r.name, err)
			return nil, err
		}

		if clientConfig.Endpoint == nil {
			err := fmt.Errorf("client config number %v requires endpoint", i)
			cclog.ComponentError(r.name, err)
			return nil, err
		}
		gofishConfig.Endpoint = *clientConfig.Endpoint

		if clientConfig.Username == nil {
			err := fmt.Errorf("client config number %v requires username", i)
			cclog.ComponentError(r.name, err)
			return nil, err
		}
		gofishConfig.Username = *clientConfig.Username

		if clientConfig.Password == nil {
			err := fmt.Errorf("client config number %v requires password", i)
			cclog.ComponentError(r.name, err)
			return nil, err
		}
		gofishConfig.Password = *clientConfig.Password

		// NOTE(review): "insecure" defaults to true when it is not configured;
		// confirm that this default is intended
		gofishConfig.Insecure = true
		if clientConfig.Insecure != nil {
			gofishConfig.Insecure = *clientConfig.Insecure
		}
	}

	return r, nil
}

View File

@@ -36,26 +36,16 @@ func (r *SampleReceiver) Start() {
// or use own go routine but always make sure it exits // or use own go routine but always make sure it exits
// as soon as it gets the signal of the r.done channel // as soon as it gets the signal of the r.done channel
//
// r.done = make(chan bool)
// r.wg.Add(1) // r.wg.Add(1)
// go func() { // go func() {
// defer r.wg.Done() // for {
// // select {
// // Create ticker // case <-r.done:
// ticker := time.NewTicker(30 * time.Second) // r.wg.Done()
// defer ticker.Stop() // return
// // }
// for { // }
// readMetric() // r.wg.Done()
// select {
// case <-ticker.C:
// // process ticker event -> continue
// continue
// case <-r.done:
// return
// }
// }
// }() // }()
} }

View File

@@ -1,8 +1,6 @@
{ {
"mystdout": { "mystdout" : {
"type": "stdout", "type" : "stdout",
"meta_as_tags": [ "meta_as_tags" : true
"unit"
]
} }
} }

View File

@@ -22,7 +22,6 @@ type HttpSinkConfig struct {
MaxIdleConns int `json:"max_idle_connections,omitempty"` MaxIdleConns int `json:"max_idle_connections,omitempty"`
IdleConnTimeout string `json:"idle_connection_timeout,omitempty"` IdleConnTimeout string `json:"idle_connection_timeout,omitempty"`
FlushDelay string `json:"flush_delay,omitempty"` FlushDelay string `json:"flush_delay,omitempty"`
BatchSize int `json:"batch_size,omitempty"`
} }
type HttpSink struct { type HttpSink struct {
@@ -37,20 +36,19 @@ type HttpSink struct {
idleConnTimeout time.Duration idleConnTimeout time.Duration
timeout time.Duration timeout time.Duration
flushDelay time.Duration flushDelay time.Duration
batchSize int
} }
func (s *HttpSink) Write(m lp.CCMetric) error { func (s *HttpSink) Write(m lp.CCMetric) error {
if s.buffer.Len() == 0 && s.flushDelay != 0 { if s.buffer.Len() == 0 && s.flushDelay != 0 {
// This is the first write since the last flush, start the flushTimer! // This is the first write since the last flush, start the flushTimer!
if s.flushTimer != nil && s.flushTimer.Stop() { if s.flushTimer != nil && s.flushTimer.Stop() {
cclog.ComponentDebug(s.name, "unexpected: the flushTimer was already running?") cclog.ComponentDebug("HttpSink", "unexpected: the flushTimer was already running?")
} }
// Run a batched flush for all lines that have arrived in the last second // Run a batched flush for all lines that have arrived in the last second
s.flushTimer = time.AfterFunc(s.flushDelay, func() { s.flushTimer = time.AfterFunc(s.flushDelay, func() {
if err := s.Flush(); err != nil { if err := s.Flush(); err != nil {
cclog.ComponentError(s.name, "flush failed:", err.Error()) cclog.ComponentError("HttpSink", "flush failed:", err.Error())
} }
}) })
} }
@@ -59,11 +57,9 @@ func (s *HttpSink) Write(m lp.CCMetric) error {
s.lock.Lock() s.lock.Lock()
_, err := s.encoder.Encode(p) _, err := s.encoder.Encode(p)
s.batchSize++
s.lock.Unlock() // defer does not work here as Flush() takes the lock as well s.lock.Unlock() // defer does not work here as Flush() takes the lock as well
if err != nil { if err != nil {
cclog.ComponentError(s.name, "encoding failed:", err.Error())
return err return err
} }
@@ -71,9 +67,6 @@ func (s *HttpSink) Write(m lp.CCMetric) error {
if s.flushDelay == 0 { if s.flushDelay == 0 {
return s.Flush() return s.Flush()
} }
if s.batchSize == s.config.BatchSize {
return s.Flush()
}
return err return err
} }
@@ -91,7 +84,6 @@ func (s *HttpSink) Flush() error {
// Create new request to send buffer // Create new request to send buffer
req, err := http.NewRequest(http.MethodPost, s.config.URL, s.buffer) req, err := http.NewRequest(http.MethodPost, s.config.URL, s.buffer)
if err != nil { if err != nil {
cclog.ComponentError(s.name, "failed to create request:", err.Error())
return err return err
} }
@@ -105,19 +97,15 @@ func (s *HttpSink) Flush() error {
// Clear buffer // Clear buffer
s.buffer.Reset() s.buffer.Reset()
s.batchSize = 0
// Handle transport/tcp errors // Handle transport/tcp errors
if err != nil { if err != nil {
cclog.ComponentError(s.name, "transport/tcp error:", err.Error())
return err return err
} }
// Handle application errors // Handle application errors
if res.StatusCode != http.StatusOK { if res.StatusCode != http.StatusOK {
err = errors.New(res.Status) return errors.New(res.Status)
cclog.ComponentError(s.name, "application error:", err.Error())
return err
} }
return nil return nil
@@ -126,7 +114,7 @@ func (s *HttpSink) Flush() error {
func (s *HttpSink) Close() { func (s *HttpSink) Close() {
s.flushTimer.Stop() s.flushTimer.Stop()
if err := s.Flush(); err != nil { if err := s.Flush(); err != nil {
cclog.ComponentError(s.name, "flush failed:", err.Error()) cclog.ComponentError("HttpSink", "flush failed:", err.Error())
} }
s.client.CloseIdleConnections() s.client.CloseIdleConnections()
} }
@@ -139,7 +127,6 @@ func NewHttpSink(name string, config json.RawMessage) (Sink, error) {
s.config.IdleConnTimeout = "5s" s.config.IdleConnTimeout = "5s"
s.config.Timeout = "5s" s.config.Timeout = "5s"
s.config.FlushDelay = "1s" s.config.FlushDelay = "1s"
s.config.BatchSize = 100
// Read config // Read config
if len(config) > 0 { if len(config) > 0 {

View File

@@ -15,7 +15,6 @@ The `http` sink uses POST requests to a HTTP server to submit the metrics in the
"max_idle_connections" : 10, "max_idle_connections" : 10,
"idle_connection_timeout" : "5s", "idle_connection_timeout" : "5s",
"flush_delay": "2s", "flush_delay": "2s",
"batch_size" : 100
} }
} }
``` ```
@@ -28,4 +27,3 @@ The `http` sink uses POST requests to a HTTP server to submit the metrics in the
- `max_idle_connections`: Maximally idle connections (default 10) - `max_idle_connections`: Maximally idle connections (default 10)
- `idle_connection_timeout`: Timeout for idle connections (default '5s') - `idle_connection_timeout`: Timeout for idle connections (default '5s')
- `flush_delay`: Batch all writes arriving in during this duration (default '1s', batching can be disabled by setting it to 0) - `flush_delay`: Batch all writes arriving in during this duration (default '1s', batching can be disabled by setting it to 0)
- `batch_size`: Maximum number of metrics per batch (default 100). The batch is flushed as soon as either the batch size or the `flush_delay` is reached

View File

@@ -6,14 +6,12 @@ import (
"encoding/json" "encoding/json"
"errors" "errors"
"fmt" "fmt"
"strings"
"time" "time"
cclog "github.com/ClusterCockpit/cc-metric-collector/internal/ccLogger" cclog "github.com/ClusterCockpit/cc-metric-collector/internal/ccLogger"
lp "github.com/ClusterCockpit/cc-metric-collector/internal/ccMetric" lp "github.com/ClusterCockpit/cc-metric-collector/internal/ccMetric"
influxdb2 "github.com/influxdata/influxdb-client-go/v2" influxdb2 "github.com/influxdata/influxdb-client-go/v2"
influxdb2Api "github.com/influxdata/influxdb-client-go/v2/api" influxdb2Api "github.com/influxdata/influxdb-client-go/v2/api"
influxdb2ApiHttp "github.com/influxdata/influxdb-client-go/v2/api/http"
) )
type InfluxAsyncSinkConfig struct { type InfluxAsyncSinkConfig struct {
@@ -30,12 +28,10 @@ type InfluxAsyncSinkConfig struct {
BatchSize uint `json:"batch_size,omitempty"` BatchSize uint `json:"batch_size,omitempty"`
// Interval, in ms, in which is buffer flushed if it has not been already written (by reaching batch size) . Default 1000ms // Interval, in ms, in which is buffer flushed if it has not been already written (by reaching batch size) . Default 1000ms
FlushInterval uint `json:"flush_interval,omitempty"` FlushInterval uint `json:"flush_interval,omitempty"`
InfluxRetryInterval string `json:"retry_interval,omitempty"` InfluxRetryInterval string `json:"retry_interval"`
InfluxExponentialBase uint `json:"retry_exponential_base,omitempty"` InfluxExponentialBase uint `json:"retry_exponential_base"`
InfluxMaxRetries uint `json:"max_retries,omitempty"` InfluxMaxRetries uint `json:"max_retries"`
InfluxMaxRetryTime string `json:"max_retry_time,omitempty"` InfluxMaxRetryTime string `json:"max_retry_time"`
CustomFlushInterval string `json:"custom_flush_interval,omitempty"`
MaxRetryAttempts uint `json:"max_retry_attempts,omitempty"`
} }
type InfluxAsyncSink struct { type InfluxAsyncSink struct {
@@ -46,8 +42,6 @@ type InfluxAsyncSink struct {
config InfluxAsyncSinkConfig config InfluxAsyncSinkConfig
influxRetryInterval uint influxRetryInterval uint
influxMaxRetryTime uint influxMaxRetryTime uint
customFlushInterval time.Duration
flushTimer *time.Timer
} }
func (s *InfluxAsyncSink) connect() error { func (s *InfluxAsyncSink) connect() error {
@@ -66,34 +60,20 @@ func (s *InfluxAsyncSink) connect() error {
cclog.ComponentDebug(s.name, "Using URI", uri, "Org", s.config.Organization, "Bucket", s.config.Database) cclog.ComponentDebug(s.name, "Using URI", uri, "Org", s.config.Organization, "Bucket", s.config.Database)
clientOptions := influxdb2.DefaultOptions() clientOptions := influxdb2.DefaultOptions()
if s.config.BatchSize != 0 { if s.config.BatchSize != 0 {
cclog.ComponentDebug(s.name, "Batch size", s.config.BatchSize)
clientOptions.SetBatchSize(s.config.BatchSize) clientOptions.SetBatchSize(s.config.BatchSize)
} }
if s.config.FlushInterval != 0 { if s.config.FlushInterval != 0 {
cclog.ComponentDebug(s.name, "Flush interval", s.config.FlushInterval)
clientOptions.SetFlushInterval(s.config.FlushInterval) clientOptions.SetFlushInterval(s.config.FlushInterval)
} }
if s.influxRetryInterval != 0 {
cclog.ComponentDebug(s.name, "MaxRetryInterval", s.influxRetryInterval)
clientOptions.SetMaxRetryInterval(s.influxRetryInterval)
}
if s.influxMaxRetryTime != 0 {
cclog.ComponentDebug(s.name, "MaxRetryTime", s.influxMaxRetryTime)
clientOptions.SetMaxRetryTime(s.influxMaxRetryTime)
}
if s.config.InfluxExponentialBase != 0 {
cclog.ComponentDebug(s.name, "Exponential Base", s.config.InfluxExponentialBase)
clientOptions.SetExponentialBase(s.config.InfluxExponentialBase)
}
if s.config.InfluxMaxRetries != 0 {
cclog.ComponentDebug(s.name, "Max Retries", s.config.InfluxMaxRetries)
clientOptions.SetMaxRetries(s.config.InfluxMaxRetries)
}
clientOptions.SetTLSConfig( clientOptions.SetTLSConfig(
&tls.Config{ &tls.Config{
InsecureSkipVerify: true, InsecureSkipVerify: true,
}, },
).SetPrecision(time.Second) )
clientOptions.SetMaxRetryInterval(s.influxRetryInterval)
clientOptions.SetMaxRetryTime(s.influxMaxRetryTime)
clientOptions.SetExponentialBase(s.config.InfluxExponentialBase)
clientOptions.SetMaxRetries(s.config.InfluxMaxRetries)
s.client = influxdb2.NewClientWithOptions(uri, auth, clientOptions) s.client = influxdb2.NewClientWithOptions(uri, auth, clientOptions)
s.writeApi = s.client.WriteAPI(s.config.Organization, s.config.Database) s.writeApi = s.client.WriteAPI(s.config.Organization, s.config.Database)
@@ -104,23 +84,10 @@ func (s *InfluxAsyncSink) connect() error {
if !ok { if !ok {
return fmt.Errorf("connection to %s not healthy", uri) return fmt.Errorf("connection to %s not healthy", uri)
} }
s.writeApi.SetWriteFailedCallback(func(batch string, err influxdb2ApiHttp.Error, retryAttempts uint) bool {
mlist := strings.Split(batch, "\n")
cclog.ComponentError(s.name, fmt.Sprintf("Failed to write batch with %d metrics %d times (max: %d): %s", len(mlist), retryAttempts, s.config.MaxRetryAttempts, err.Error()))
return retryAttempts <= s.config.MaxRetryAttempts
})
return nil return nil
} }
func (s *InfluxAsyncSink) Write(m lp.CCMetric) error { func (s *InfluxAsyncSink) Write(m lp.CCMetric) error {
if s.customFlushInterval != 0 && s.flushTimer == nil {
// Run a batched flush for all lines that have arrived in the defined interval
s.flushTimer = time.AfterFunc(s.customFlushInterval, func() {
if err := s.Flush(); err != nil {
cclog.ComponentError(s.name, "flush failed:", err.Error())
}
})
}
s.writeApi.WritePoint( s.writeApi.WritePoint(
m.ToPoint(s.meta_as_tags), m.ToPoint(s.meta_as_tags),
) )
@@ -128,11 +95,7 @@ func (s *InfluxAsyncSink) Write(m lp.CCMetric) error {
} }
func (s *InfluxAsyncSink) Flush() error { func (s *InfluxAsyncSink) Flush() error {
cclog.ComponentDebug(s.name, "Flushing")
s.writeApi.Flush() s.writeApi.Flush()
if s.customFlushInterval != 0 && s.flushTimer != nil {
s.flushTimer = nil
}
return nil return nil
} }
@@ -147,17 +110,13 @@ func NewInfluxAsyncSink(name string, config json.RawMessage) (Sink, error) {
s.name = fmt.Sprintf("InfluxSink(%s)", name) s.name = fmt.Sprintf("InfluxSink(%s)", name)
// Set default for maximum number of points sent to server in single request. // Set default for maximum number of points sent to server in single request.
s.config.BatchSize = 0 s.config.BatchSize = 100
s.influxRetryInterval = 0 s.influxRetryInterval = uint(time.Duration(1) * time.Second)
//s.config.InfluxRetryInterval = "1s" s.config.InfluxRetryInterval = "1s"
s.influxMaxRetryTime = 0 s.influxMaxRetryTime = uint(7 * time.Duration(24) * time.Hour)
//s.config.InfluxMaxRetryTime = "168h" s.config.InfluxMaxRetryTime = "168h"
s.config.InfluxMaxRetries = 0 s.config.InfluxMaxRetries = 20
s.config.InfluxExponentialBase = 0 s.config.InfluxExponentialBase = 2
s.config.FlushInterval = 0
s.config.CustomFlushInterval = ""
s.customFlushInterval = time.Duration(0)
s.config.MaxRetryAttempts = 1
// Default retry intervals (in seconds) // Default retry intervals (in seconds)
// 1 2 // 1 2
@@ -209,15 +168,6 @@ func NewInfluxAsyncSink(name string, config json.RawMessage) (Sink, error) {
s.influxRetryInterval = toUint(s.config.InfluxRetryInterval, s.influxRetryInterval) s.influxRetryInterval = toUint(s.config.InfluxRetryInterval, s.influxRetryInterval)
s.influxMaxRetryTime = toUint(s.config.InfluxMaxRetryTime, s.influxMaxRetryTime) s.influxMaxRetryTime = toUint(s.config.InfluxMaxRetryTime, s.influxMaxRetryTime)
// Use a own timer for calling Flush()
if len(s.config.CustomFlushInterval) > 0 {
t, err := time.ParseDuration(s.config.CustomFlushInterval)
if err != nil {
return nil, fmt.Errorf("invalid duration in 'custom_flush_interval': %v", err)
}
s.customFlushInterval = t
}
// Connect to InfluxDB server // Connect to InfluxDB server
if err := s.connect(); err != nil { if err := s.connect(); err != nil {
return nil, fmt.Errorf("unable to connect: %v", err) return nil, fmt.Errorf("unable to connect: %v", err)

View File

@@ -6,32 +6,28 @@ import (
"encoding/json" "encoding/json"
"errors" "errors"
"fmt" "fmt"
"sync"
"time" "time"
cclog "github.com/ClusterCockpit/cc-metric-collector/internal/ccLogger" cclog "github.com/ClusterCockpit/cc-metric-collector/internal/ccLogger"
lp "github.com/ClusterCockpit/cc-metric-collector/internal/ccMetric" lp "github.com/ClusterCockpit/cc-metric-collector/internal/ccMetric"
influxdb2 "github.com/influxdata/influxdb-client-go/v2" influxdb2 "github.com/influxdata/influxdb-client-go/v2"
influxdb2Api "github.com/influxdata/influxdb-client-go/v2/api" influxdb2Api "github.com/influxdata/influxdb-client-go/v2/api"
"github.com/influxdata/influxdb-client-go/v2/api/write"
) )
type InfluxSinkConfig struct { type InfluxSinkConfig struct {
defaultSinkConfig defaultSinkConfig
Host string `json:"host,omitempty"` Host string `json:"host,omitempty"`
Port string `json:"port,omitempty"` Port string `json:"port,omitempty"`
Database string `json:"database,omitempty"` Database string `json:"database,omitempty"`
User string `json:"user,omitempty"` User string `json:"user,omitempty"`
Password string `json:"password,omitempty"` Password string `json:"password,omitempty"`
Organization string `json:"organization,omitempty"` Organization string `json:"organization,omitempty"`
SSL bool `json:"ssl,omitempty"` SSL bool `json:"ssl,omitempty"`
FlushDelay string `json:"flush_delay,omitempty"` RetentionPol string `json:"retention_policy,omitempty"`
BatchSize int `json:"batch_size,omitempty"` InfluxRetryInterval string `json:"retry_interval"`
RetentionPol string `json:"retention_policy,omitempty"` InfluxExponentialBase uint `json:"retry_exponential_base"`
// InfluxRetryInterval string `json:"retry_interval"` InfluxMaxRetries uint `json:"max_retries"`
// InfluxExponentialBase uint `json:"retry_exponential_base"` InfluxMaxRetryTime string `json:"max_retry_time"`
// InfluxMaxRetries uint `json:"max_retries"`
// InfluxMaxRetryTime string `json:"max_retry_time"`
//InfluxMaxRetryDelay string `json:"max_retry_delay"` // It is mentioned in the docs but there is no way to set it //InfluxMaxRetryDelay string `json:"max_retry_delay"` // It is mentioned in the docs but there is no way to set it
} }
@@ -42,71 +38,37 @@ type InfluxSink struct {
config InfluxSinkConfig config InfluxSinkConfig
influxRetryInterval uint influxRetryInterval uint
influxMaxRetryTime uint influxMaxRetryTime uint
batch []*write.Point
flushTimer *time.Timer
flushDelay time.Duration
lock sync.Mutex // Flush() runs in another goroutine, so this lock has to protect the buffer
//influxMaxRetryDelay uint //influxMaxRetryDelay uint
} }
// connect connects to the InfluxDB server
func (s *InfluxSink) connect() error { func (s *InfluxSink) connect() error {
var auth string
// URI options:
// * http://host:port
// * https://host:port
var uri string var uri string
if s.config.SSL { if s.config.SSL {
uri = fmt.Sprintf("https://%s:%s", s.config.Host, s.config.Port) uri = fmt.Sprintf("https://%s:%s", s.config.Host, s.config.Port)
} else { } else {
uri = fmt.Sprintf("http://%s:%s", s.config.Host, s.config.Port) uri = fmt.Sprintf("http://%s:%s", s.config.Host, s.config.Port)
} }
// Authentication options:
// * token
// * username:password
var auth string
if len(s.config.User) == 0 { if len(s.config.User) == 0 {
auth = s.config.Password auth = s.config.Password
} else { } else {
auth = fmt.Sprintf("%s:%s", s.config.User, s.config.Password) auth = fmt.Sprintf("%s:%s", s.config.User, s.config.Password)
} }
cclog.ComponentDebug(s.name, "Using URI", uri, "Org", s.config.Organization, "Bucket", s.config.Database) cclog.ComponentDebug(s.name, "Using URI", uri, "Org", s.config.Organization, "Bucket", s.config.Database)
// Set influxDB client options
clientOptions := influxdb2.DefaultOptions() clientOptions := influxdb2.DefaultOptions()
// if s.influxRetryInterval != 0 {
// cclog.ComponentDebug(s.name, "MaxRetryInterval", s.influxRetryInterval)
// clientOptions.SetMaxRetryInterval(s.influxRetryInterval)
// }
// if s.influxMaxRetryTime != 0 {
// cclog.ComponentDebug(s.name, "MaxRetryTime", s.influxMaxRetryTime)
// clientOptions.SetMaxRetryTime(s.influxMaxRetryTime)
// }
// if s.config.InfluxExponentialBase != 0 {
// cclog.ComponentDebug(s.name, "Exponential Base", s.config.InfluxExponentialBase)
// clientOptions.SetExponentialBase(s.config.InfluxExponentialBase)
// }
// if s.config.InfluxMaxRetries != 0 {
// cclog.ComponentDebug(s.name, "Max Retries", s.config.InfluxMaxRetries)
// clientOptions.SetMaxRetries(s.config.InfluxMaxRetries)
// }
// Do not check InfluxDB certificate
clientOptions.SetTLSConfig( clientOptions.SetTLSConfig(
&tls.Config{ &tls.Config{
InsecureSkipVerify: true, InsecureSkipVerify: true,
}, },
) )
clientOptions.SetPrecision(time.Second) clientOptions.SetMaxRetryInterval(s.influxRetryInterval)
clientOptions.SetMaxRetryTime(s.influxMaxRetryTime)
clientOptions.SetExponentialBase(s.config.InfluxExponentialBase)
clientOptions.SetMaxRetries(s.config.InfluxMaxRetries)
// Create new writeAPI
s.client = influxdb2.NewClientWithOptions(uri, auth, clientOptions) s.client = influxdb2.NewClientWithOptions(uri, auth, clientOptions)
s.writeApi = s.client.WriteAPIBlocking(s.config.Organization, s.config.Database) s.writeApi = s.client.WriteAPIBlocking(s.config.Organization, s.config.Database)
// Check InfluxDB server accessibility
ok, err := s.client.Ping(context.Background()) ok, err := s.client.Ping(context.Background())
if err != nil { if err != nil {
return err return err
@@ -118,142 +80,61 @@ func (s *InfluxSink) connect() error {
} }
func (s *InfluxSink) Write(m lp.CCMetric) error { func (s *InfluxSink) Write(m lp.CCMetric) error {
err :=
if len(s.batch) == 0 && s.flushDelay != 0 { s.writeApi.WritePoint(
// This is the first write since the last flush, start the flushTimer! context.Background(),
if s.flushTimer != nil && s.flushTimer.Stop() { m.ToPoint(s.meta_as_tags),
cclog.ComponentDebug(s.name, "unexpected: the flushTimer was already running?") )
} return err
// Run a batched flush for all lines that have arrived in the last flush delay interval
s.flushTimer = time.AfterFunc(s.flushDelay, func() {
if err := s.Flush(); err != nil {
cclog.ComponentError(s.name, "flush failed:", err.Error())
}
})
}
// Append metric to batch slice
p := m.ToPoint(s.meta_as_tags)
s.lock.Lock()
s.batch = append(s.batch, p)
s.lock.Unlock()
// Flush synchronously if "flush_delay" is zero
if s.flushDelay == 0 {
return s.Flush()
}
// Flush if batch size is reached
if len(s.batch) == s.config.BatchSize {
return s.Flush()
}
return nil
} }
// Flush sends all metrics buffered in batch slice to InfluxDB server
func (s *InfluxSink) Flush() error { func (s *InfluxSink) Flush() error {
// Lock access to batch slice
s.lock.Lock()
defer s.lock.Unlock()
// Nothing to do, batch slice is empty
if len(s.batch) == 0 {
return nil
}
// Send metrics from batch slice
err := s.writeApi.WritePoint(context.Background(), s.batch...)
if err != nil {
cclog.ComponentError(s.name, "flush failed:", err.Error())
return err
}
// Clear batch slice
for i := range s.batch {
s.batch[i] = nil
}
s.batch = s.batch[:0]
return nil return nil
} }
func (s *InfluxSink) Close() { func (s *InfluxSink) Close() {
cclog.ComponentDebug(s.name, "Closing InfluxDB connection") cclog.ComponentDebug(s.name, "Closing InfluxDB connection")
s.flushTimer.Stop()
s.Flush()
s.client.Close() s.client.Close()
} }
// NewInfluxSink creates a new InfluxDB sink
func NewInfluxSink(name string, config json.RawMessage) (Sink, error) { func NewInfluxSink(name string, config json.RawMessage) (Sink, error) {
s := new(InfluxSink) s := new(InfluxSink)
s.name = fmt.Sprintf("InfluxSink(%s)", name) s.name = fmt.Sprintf("InfluxSink(%s)", name)
// Set config default values
s.config.BatchSize = 100
s.config.FlushDelay = "1s"
// Read config
if len(config) > 0 { if len(config) > 0 {
err := json.Unmarshal(config, &s.config) err := json.Unmarshal(config, &s.config)
if err != nil { if err != nil {
return nil, err return nil, err
} }
} }
s.influxRetryInterval = 0 s.influxRetryInterval = uint(time.Duration(1) * time.Second)
s.influxMaxRetryTime = 0 s.config.InfluxRetryInterval = "1s"
// s.config.InfluxRetryInterval = "" s.influxMaxRetryTime = uint(7 * time.Duration(24) * time.Hour)
// s.config.InfluxMaxRetryTime = "" s.config.InfluxMaxRetryTime = "168h"
// s.config.InfluxMaxRetries = 0 s.config.InfluxMaxRetries = 20
// s.config.InfluxExponentialBase = 0 s.config.InfluxExponentialBase = 2
if len(s.config.Host) == 0 { if len(s.config.Host) == 0 ||
return nil, errors.New("Missing host configuration required by InfluxSink") len(s.config.Port) == 0 ||
len(s.config.Database) == 0 ||
len(s.config.Organization) == 0 ||
len(s.config.Password) == 0 {
return nil, errors.New("not all configuration variables set required by InfluxSink")
} }
if len(s.config.Port) == 0 {
return nil, errors.New("Missing port configuration required by InfluxSink")
}
if len(s.config.Database) == 0 {
return nil, errors.New("Missing database configuration required by InfluxSink")
}
if len(s.config.Organization) == 0 {
return nil, errors.New("Missing organization configuration required by InfluxSink")
}
if len(s.config.Password) == 0 {
return nil, errors.New("Missing password configuration required by InfluxSink")
}
// Create lookup map to use meta infos as tags in the output metric // Create lookup map to use meta infos as tags in the output metric
s.meta_as_tags = make(map[string]bool) s.meta_as_tags = make(map[string]bool)
for _, k := range s.config.MetaAsTags { for _, k := range s.config.MetaAsTags {
s.meta_as_tags[k] = true s.meta_as_tags[k] = true
} }
// toUint := func(duration string, def uint) uint { toUint := func(duration string, def uint) uint {
// if len(duration) > 0 { t, err := time.ParseDuration(duration)
// t, err := time.ParseDuration(duration)
// if err == nil {
// return uint(t.Milliseconds())
// }
// }
// return def
// }
// s.influxRetryInterval = toUint(s.config.InfluxRetryInterval, s.influxRetryInterval)
// s.influxMaxRetryTime = toUint(s.config.InfluxMaxRetryTime, s.influxMaxRetryTime)
// Configure flush delay duration
if len(s.config.FlushDelay) > 0 {
t, err := time.ParseDuration(s.config.FlushDelay)
if err == nil { if err == nil {
s.flushDelay = t return uint(t.Milliseconds())
} }
return def
} }
s.influxRetryInterval = toUint(s.config.InfluxRetryInterval, s.influxRetryInterval)
// allocate batch slice s.influxMaxRetryTime = toUint(s.config.InfluxMaxRetryTime, s.influxMaxRetryTime)
s.batch = make([]*write.Point, 0, s.config.BatchSize)
// Connect to InfluxDB server // Connect to InfluxDB server
if err := s.connect(); err != nil { if err := s.connect(); err != nil {

View File

@@ -17,8 +17,10 @@ The `influxdb` sink uses the official [InfluxDB golang client](https://pkg.go.de
"password" : "examplepw", "password" : "examplepw",
"organization": "myorg", "organization": "myorg",
"ssl": true, "ssl": true,
"flush_delay" : "1s", "retry_interval" : "1s",
"batch_size" : 100 "retry_exponential_base" : 2,
"max_retries": 20,
"max_retry_time" : "168h"
} }
} }
``` ```
@@ -32,6 +34,9 @@ The `influxdb` sink uses the official [InfluxDB golang client](https://pkg.go.de
- `password`: Password for basic authentication - `password`: Password for basic authentication
- `organization`: Organization in the InfluxDB - `organization`: Organization in the InfluxDB
- `ssl`: Use SSL connection - `ssl`: Use SSL connection
- `flush_delay`: Group metrics coming in to a single batch - `retry_interval`: Base retry interval for failed write requests, default 1s
- `batch_size`: Maximal batch size - `retry_exponential_base`: The retry interval is exponentially increased with this base, default 2
- `max_retries`: Maximal number of retry attempts
- `max_retry_time`: Maximal time to retry failed writes, default 168h (one week)
For information about the calculation of the retry interval settings, see the [official influxdb-client-go documentation](https://github.com/influxdata/influxdb-client-go#handling-of-failed-async-writes)