Merge develop into main (#109)

* Add cpu_used (all-cpu_idle) to CpustatCollector

* Update to line-protocol/v2

* Update runonce.yml with Golang 1.20

* Update fsnotify in LIKWID Collector

* Use not a pointer to line-protocol.Encoder

* Simplify Makefile

* Use only as many arguments as required

* Allow sum function to handle non float types

* Allow values to be a slice of type float64, float32, int, int64, int32, bool

* Use generic function to simplify code

* Add missing case for type []int32

* Use generic function to compute minimum

* Use generic function to compute maximum

* Use generic function to compute average

* Add error value to sumAnyType

* Use generic function to compute median

* For older versions of go slices is not part of the installation

* Remove old entries from go.sum

* Use simpler sort function

* Compute metrics ib_total and ib_total_pkts

* Add aggregated metrics.
Add missing units

* Update likwidMetric.go

Fixes a potential bug when `fsnotify.NewWatcher()` fails with an error

* Completly avoid memory allocations in infinibandMetric read()

* Fixed initialization: Initalization and measurements should run in the same thread

* Add safe.directory to Release action

* Fix path after installation to /usr/bin after installation

* ioutil.ReadFile is deprecated: As of Go 1.16, this function simply calls os.ReadFile

* Switch to package slices from the golang 1.21 default library

* Read file line by line

* Read file line by line

* Read file line by line

* Use CamelCase

* Use CamelCase

* Fix function getNumaDomain, it always returned 0

* Avoid type conversion by using Atoi
Avoid copying structs by using pointer access
Increase readability with CamelCase variable names

* Add caching

* Cache CpuData

* Cleanup

* Use init function to initalize cache structure to avoid multi threading problems

* Reuse information from /proc/cpuinfo

* Avoid slice cloning. Directly use the cache

* Add DieList

* Add NumaDomainList and SMTList

* Cleanup

* Add comment

* Lookup core ID from /sys/devices/system/cpu, /proc/cpuinfo is not portable

* Lookup all information from /sys/devices/system/cpu, /proc/cpuinfo is not portable

* Correctly handle lists from /sys

* Add Simultaneous Multithreading siblings

* Replace deprecated thread_siblings_list by core_cpus_list

* Reduce number of required slices

* Allow to send total values per core, socket and node

* Send all metrics with same time stamp
calcEventsetMetrics does only computiation, counter measurement is done before

* Input parameters should be float64 when evaluating to float64

* Send all metrics with same time stamp
calcGlobalMetrics does only computiation, counter measurement is done before

* Remove unused variable gmresults

* Add comments

* Updated go packages

* Add build with golang 1.21

* Switch to checkout action version 4

* Switch to setup-go action version 4

* Add workflow_dispatch to allow manual run of workflow

* Add workflow_dispatch to allow manual run of workflow

* Add release build jobs to runonce.yml

* Switch to golang 1.20 for RHEL based distributions

* Use dnf to download golang

* Remove golang versions before 1.20

* Upgrade Ubuntu focal -> jammy

* Pipe golang tar package directly to tar

* Update golang version

* Fix Ubuntu version number

* Add links to ipmi and redfish receivers

* Fix http server addr format

* github.com/influxdata/line-protocol -> github.com/influxdata/line-protocol/v2/lineprotocol

* Corrected spelling

* Add some comments

* github.com/influxdata/line-protocol -> github.com/influxdata/line-protocol/v2/lineprotocol

* Allow other fields not only field "value"

* Add some basic debugging documentation

* Add some basic debugging documentation

* Use a lock for the flush timer

* Add tags in lexical order as required by AddTag()

* Only access meta data, when it gets used as tag

* Use slice to store lexialicly orderd key value pairs

* Increase golang version requirement to 1.20.

* Avoid package cmp to allow builds with golang v1.20

* Fix: Error NVML library not found did crash
cc-metric-collector with "SIGSEGV: segmentation violation"

* Add config option idle_timeout

* Add basic authentication support

* Add basic authentication support

* Avoid unneccessary memory allocations

* Add documentation for send_*_total values

* Use generic package maps to clone maps

* Reuse flush timer

* Add Influx client options

* Reuse ccTopology functionality

* Do not store unused topology information

* Add batch_size config

* Cleanup

* Use stype and stype-id for the NIC in NetstatCollector

* Wait for concurrent flush operations to finish

* Be more verbose in error messages

* Reverted previous changes.
Made the code to complex without much advantages

* Use line protocol encoder

* Go pkg update

* Stop flush timer, when immediatelly flushing

* Fix: Corrected unlock access to batch slice

* Add config option to specify whether to use GZip compression in influx write requests

* Add asynchron send of encoder metrics

* Use DefaultServeMux instead of github.com/gorilla/mux

* Add config option for HTTP keep-alives

* Be more strict, when parsing json

* Add config option for HTTP request timeout and Retry interval

* Allow more then one background send operation

* Fix %sysusers_create_package args (#108)

%sysusers_create_package requires two arguments. See: https://github.com/systemd/systemd/blob/main/src/rpm/macros.systemd.in#L165

* Add nfsiostat to list of collectors

---------

Co-authored-by: Holger Obermaier <40787752+ho-ob@users.noreply.github.com>
Co-authored-by: Holger Obermaier <holgerob@gmx.de>
Co-authored-by: Obihörnchen <obihoernchende@gmail.com>
This commit is contained in:
Thomas Gruber
2023-12-04 12:21:26 +01:00
committed by GitHub
parent 9df1054e32
commit 6ab45dd3ec
39 changed files with 1835 additions and 1030 deletions

View File

@@ -1,6 +1,7 @@
package sinks
import (
"bytes"
"encoding/json"
"errors"
"fmt"
@@ -94,9 +95,10 @@ func NewGangliaSink(name string, config json.RawMessage) (Sink, error) {
s.config.AddTagsAsDesc = false
s.config.AddGangliaGroup = false
if len(config) > 0 {
err := json.Unmarshal(config, &s.config)
if err != nil {
cclog.ComponentError(s.name, "Error reading config for", s.name, ":", err.Error())
d := json.NewDecoder(bytes.NewReader(config))
d.DisallowUnknownFields()
if err := d.Decode(&s.config); err != nil {
cclog.ComponentError(s.name, "Error reading config:", err.Error())
return nil, err
}
}

View File

@@ -12,92 +12,195 @@ import (
cclog "github.com/ClusterCockpit/cc-metric-collector/pkg/ccLogger"
lp "github.com/ClusterCockpit/cc-metric-collector/pkg/ccMetric"
influx "github.com/influxdata/line-protocol/v2/lineprotocol"
"golang.org/x/exp/slices"
)
type HttpSinkConfig struct {
defaultSinkConfig
URL string `json:"url"`
JWT string `json:"jwt,omitempty"`
Timeout string `json:"timeout,omitempty"`
// The full URL of the endpoint
URL string `json:"url"`
// JSON web tokens for authentication (Using the *Bearer* scheme)
JWT string `json:"jwt,omitempty"`
// Basic authentication
Username string `json:"username"`
Password string `json:"password"`
useBasicAuth bool
// time limit for requests made by the http client
Timeout string `json:"timeout,omitempty"`
timeout time.Duration
// Maximum amount of time an idle (keep-alive) connection will remain idle before closing itself
// should be larger than the measurement interval to keep the connection open
IdleConnTimeout string `json:"idle_connection_timeout,omitempty"`
FlushDelay string `json:"flush_delay,omitempty"`
MaxRetries int `json:"max_retries,omitempty"`
idleConnTimeout time.Duration
// Batch all writes arriving in during this duration
// (default '5s', batching can be disabled by setting it to 0)
FlushDelay string `json:"flush_delay,omitempty"`
flushDelay time.Duration
// Maximum number of retries to connect to the http server (default: 3)
MaxRetries int `json:"max_retries,omitempty"`
}
type key_value_pair struct {
key string
value string
}
type HttpSink struct {
sink
client *http.Client
client *http.Client
// influx line protocol encoder
encoder influx.Encoder
lock sync.Mutex // Flush() runs in another goroutine, so this lock has to protect the buffer
//buffer *bytes.Buffer
flushTimer *time.Timer
config HttpSinkConfig
idleConnTimeout time.Duration
timeout time.Duration
flushDelay time.Duration
// List of tags and meta data tags which should be used as tags
extended_tag_list []key_value_pair
// Flush() runs in another goroutine and accesses the influx line protocol encoder,
// so this encoderLock has to protect the encoder
encoderLock sync.Mutex
// timer to run Flush()
flushTimer *time.Timer
// Lock to assure that only one timer is running at a time
timerLock sync.Mutex
config HttpSinkConfig
}
// Write sends metric m as http message
func (s *HttpSink) Write(m lp.CCMetric) error {
var err error = nil
var firstWriteOfBatch bool = false
p := m.ToPoint(s.meta_as_tags)
s.lock.Lock()
firstWriteOfBatch = len(s.encoder.Bytes()) == 0
v, ok := m.GetField("value")
if ok {
s.encoder.StartLine(p.Name())
for _, v := range p.TagList() {
s.encoder.AddTag(v.Key, v.Value)
}
// Lock for encoder usage
s.encoderLock.Lock()
s.encoder.AddField("value", influx.MustNewValue(v))
s.encoder.EndLine(p.Time())
err = s.encoder.Err()
if err != nil {
cclog.ComponentError(s.name, "encoding failed:", err.Error())
s.lock.Unlock()
return err
// Encode measurement name
s.encoder.StartLine(m.Name())
// copy tags and meta data which should be used as tags
s.extended_tag_list = s.extended_tag_list[:0]
for key, value := range m.Tags() {
s.extended_tag_list =
append(
s.extended_tag_list,
key_value_pair{
key: key,
value: value,
},
)
}
for _, key := range s.config.MetaAsTags {
if value, ok := m.GetMeta(key); ok {
s.extended_tag_list =
append(
s.extended_tag_list,
key_value_pair{
key: key,
value: value,
},
)
}
}
s.lock.Unlock()
if s.flushDelay == 0 {
// Encode tags (they musts be in lexical order)
slices.SortFunc(
s.extended_tag_list,
func(a key_value_pair, b key_value_pair) int {
if a.key < b.key {
return -1
}
if a.key > b.key {
return +1
}
return 0
},
)
for i := range s.extended_tag_list {
s.encoder.AddTag(
s.extended_tag_list[i].key,
s.extended_tag_list[i].value,
)
}
// Encode fields
for key, value := range m.Fields() {
s.encoder.AddField(key, influx.MustNewValue(value))
}
// Encode time stamp
s.encoder.EndLine(m.Time())
// Check for encoder errors
err := s.encoder.Err()
// Unlock encoder usage
s.encoderLock.Unlock()
// Check that encoding worked
if err != nil {
return fmt.Errorf("Encoding failed: %v", err)
}
if s.config.flushDelay == 0 {
// Directly flush if no flush delay is configured
return s.Flush()
}
} else if s.timerLock.TryLock() {
if firstWriteOfBatch {
if s.flushTimer == nil {
s.flushTimer = time.AfterFunc(s.flushDelay, func() {
if err := s.Flush(); err != nil {
cclog.ComponentError(s.name, "flush failed:", err.Error())
}
})
// Setup flush timer when flush delay is configured
// and no other timer is already running
if s.flushTimer != nil {
// Restarting existing flush timer
cclog.ComponentDebug(s.name, "Write(): Restarting flush timer")
s.flushTimer.Reset(s.config.flushDelay)
} else {
s.flushTimer.Reset(s.flushDelay)
// Creating and starting flush timer
cclog.ComponentDebug(s.name, "Write(): Starting new flush timer")
s.flushTimer = time.AfterFunc(
s.config.flushDelay,
func() {
defer s.timerLock.Unlock()
cclog.ComponentDebug(s.name, "Starting flush triggered by flush timer")
if err := s.Flush(); err != nil {
cclog.ComponentError(s.name, "Flush triggered by flush timer: flush failed:", err)
}
})
}
}
return nil
}
// Flush sends all metrics stored in encoder to HTTP server
func (s *HttpSink) Flush() error {
// Own lock for as short as possible: the time it takes to copy the buffer.
s.lock.Lock()
buf := make([]byte, len(s.encoder.Bytes()))
copy(buf, s.encoder.Bytes())
// Lock for encoder usage
// Own lock for as short as possible: the time it takes to clone the buffer.
s.encoderLock.Lock()
buf := slices.Clone(s.encoder.Bytes())
s.encoder.Reset()
s.lock.Unlock()
// Unlock encoder usage
s.encoderLock.Unlock()
if len(buf) == 0 {
return nil
}
cclog.ComponentDebug(s.name, "Flush(): Flushing")
var res *http.Response
for i := 0; i < s.config.MaxRetries; i++ {
// Create new request to send buffer
req, err := http.NewRequest(http.MethodPost, s.config.URL, bytes.NewReader(buf))
if err != nil {
cclog.ComponentError(s.name, "failed to create request:", err.Error())
cclog.ComponentError(s.name, "Flush(): Failed to create HTTP request:", err)
return err
}
@@ -106,10 +209,15 @@ func (s *HttpSink) Flush() error {
req.Header.Set("Authorization", fmt.Sprintf("Bearer %s", s.config.JWT))
}
// Set basic authentication
if s.config.useBasicAuth {
req.SetBasicAuth(s.config.Username, s.config.Password)
}
// Do request
res, err = s.client.Do(req)
if err != nil {
cclog.ComponentError(s.name, "transport/tcp error:", err.Error())
cclog.ComponentError(s.name, "Flush(): transport/tcp error:", err)
// Wait between retries
time.Sleep(time.Duration(i+1) * (time.Second / 2))
continue
@@ -125,7 +233,7 @@ func (s *HttpSink) Flush() error {
// Handle application errors
if res.StatusCode != http.StatusOK {
err := errors.New(res.Status)
cclog.ComponentError(s.name, "application error:", err.Error())
cclog.ComponentError(s.name, "Flush(): Application error:", err)
return err
}
@@ -133,64 +241,93 @@ func (s *HttpSink) Flush() error {
}
func (s *HttpSink) Close() {
s.flushTimer.Stop()
if err := s.Flush(); err != nil {
cclog.ComponentError(s.name, "flush failed:", err.Error())
cclog.ComponentDebug(s.name, "Closing HTTP connection")
// Stop existing timer and immediately flush
if s.flushTimer != nil {
if ok := s.flushTimer.Stop(); ok {
s.timerLock.Unlock()
}
}
// Flush
if err := s.Flush(); err != nil {
cclog.ComponentError(s.name, "Close(): Flush failed:", err)
}
s.client.CloseIdleConnections()
}
// NewHttpSink creates a new http sink
func NewHttpSink(name string, config json.RawMessage) (Sink, error) {
s := new(HttpSink)
// Set default values
s.name = fmt.Sprintf("HttpSink(%s)", name)
s.config.IdleConnTimeout = "120s" // should be larger than the measurement interval.
// should be larger than the measurement interval to keep the connection open
s.config.IdleConnTimeout = "120s"
s.config.Timeout = "5s"
s.config.FlushDelay = "5s"
s.config.MaxRetries = 3
cclog.ComponentDebug(s.name, "init")
cclog.ComponentDebug(s.name, "Init()")
// Read config
if len(config) > 0 {
err := json.Unmarshal(config, &s.config)
if err != nil {
d := json.NewDecoder(bytes.NewReader(config))
d.DisallowUnknownFields()
if err := d.Decode(&s.config); err != nil {
cclog.ComponentError(s.name, "Error reading config:", err.Error())
return nil, err
}
}
if len(s.config.URL) == 0 {
return nil, errors.New("`url` config option is required for HTTP sink")
}
// Check basic authentication config
if len(s.config.Username) > 0 || len(s.config.Password) > 0 {
s.config.useBasicAuth = true
}
if s.config.useBasicAuth && len(s.config.Username) == 0 {
return nil, errors.New("basic authentication requires username")
}
if s.config.useBasicAuth && len(s.config.Password) == 0 {
return nil, errors.New("basic authentication requires password")
}
if len(s.config.IdleConnTimeout) > 0 {
t, err := time.ParseDuration(s.config.IdleConnTimeout)
if err == nil {
cclog.ComponentDebug(s.name, "idleConnTimeout", t)
s.idleConnTimeout = t
cclog.ComponentDebug(s.name, "Init(): idleConnTimeout", t)
s.config.idleConnTimeout = t
}
}
if len(s.config.Timeout) > 0 {
t, err := time.ParseDuration(s.config.Timeout)
if err == nil {
s.timeout = t
cclog.ComponentDebug(s.name, "timeout", t)
s.config.timeout = t
cclog.ComponentDebug(s.name, "Init(): timeout", t)
}
}
if len(s.config.FlushDelay) > 0 {
t, err := time.ParseDuration(s.config.FlushDelay)
if err == nil {
s.flushDelay = t
cclog.ComponentDebug(s.name, "flushDelay", t)
s.config.flushDelay = t
cclog.ComponentDebug(s.name, "Init(): flushDelay", t)
}
}
// Create lookup map to use meta infos as tags in the output metric
s.meta_as_tags = make(map[string]bool)
for _, k := range s.config.MetaAsTags {
s.meta_as_tags[k] = true
// Create http client
s.client = &http.Client{
Transport: &http.Transport{
MaxIdleConns: 1, // We will only ever talk to one host.
IdleConnTimeout: s.config.idleConnTimeout,
},
Timeout: s.config.timeout,
}
tr := &http.Transport{
MaxIdleConns: 1, // We will only ever talk to one host.
IdleConnTimeout: s.idleConnTimeout,
}
s.client = &http.Client{Transport: tr, Timeout: s.timeout}
s.encoder.SetPrecision(influx.Second)
// Configure influx line protocol encoder
s.encoder.SetPrecision(influx.Nanosecond)
s.extended_tag_list = make([]key_value_pair, 0)
return s, nil
}

View File

@@ -13,10 +13,12 @@ The `http` sink uses POST requests to a HTTP server to submit the metrics in the
],
"url" : "https://my-monitoring.example.com:1234/api/write",
"jwt" : "blabla.blabla.blabla",
"username": "myUser",
"password": "myPW",
"timeout": "5s",
"max_idle_connections" : 10,
"idle_connection_timeout" : "5s",
"flush_delay": "2s",
"batch_size": 1000
}
}
```
@@ -24,8 +26,11 @@ The `http` sink uses POST requests to a HTTP server to submit the metrics in the
- `type`: makes the sink an `http` sink
- `meta_as_tags`: Move specific meta information to the tags in the output (optional)
- `url`: The full URL of the endpoint
- `jwt`: JSON web tokens for authentification (Using the *Bearer* scheme)
- `jwt`: JSON web tokens for authentication (Using the *Bearer* scheme)
- `username`: username for basic authentication
- `password`: password for basic authentication
- `timeout`: General timeout for the HTTP client (default '5s')
- `max_idle_connections`: Maximally idle connections (default 10)
- `idle_connection_timeout`: Timeout for idle connections (default '5s')
- `max_retries`: Maximum number of retries to connect to the http server
- `idle_connection_timeout`: Timeout for idle connections (default '120s'). Should be larger than the measurement interval to keep the connection open
- `flush_delay`: Batch all writes arriving in during this duration (default '1s', batching can be disabled by setting it to 0)
- `batch_size`: Maximal batch size. If `batch_size` is reached before the end of `flush_delay`, the metrics are sent without further delay

View File

@@ -1,6 +1,7 @@
package sinks
import (
"bytes"
"context"
"crypto/tls"
"encoding/json"
@@ -180,8 +181,10 @@ func NewInfluxAsyncSink(name string, config json.RawMessage) (Sink, error) {
// 262144 524288
if len(config) > 0 {
err := json.Unmarshal(config, &s.config)
if err != nil {
d := json.NewDecoder(bytes.NewReader(config))
d.DisallowUnknownFields()
if err := d.Decode(&s.config); err != nil {
cclog.ComponentError(s.name, "Error reading config:", err.Error())
return nil, err
}
}

View File

@@ -1,6 +1,7 @@
package sinks
import (
"bytes"
"context"
"crypto/tls"
"encoding/json"
@@ -13,7 +14,8 @@ import (
lp "github.com/ClusterCockpit/cc-metric-collector/pkg/ccMetric"
influxdb2 "github.com/influxdata/influxdb-client-go/v2"
influxdb2Api "github.com/influxdata/influxdb-client-go/v2/api"
"github.com/influxdata/influxdb-client-go/v2/api/write"
influx "github.com/influxdata/line-protocol/v2/lineprotocol"
"golang.org/x/exp/slices"
)
type InfluxSink struct {
@@ -32,20 +34,49 @@ type InfluxSink struct {
// Maximum number of points sent to server in single request.
// Default: 1000
BatchSize int `json:"batch_size,omitempty"`
// Time interval for delayed sending of metrics.
// If the buffers are already filled before the end of this interval,
// the metrics are sent without further delay.
// Default: 1s
FlushInterval string `json:"flush_delay,omitempty"`
// Number of metrics that are dropped when buffer is full
// Default: 100
DropRate int `json:"drop_rate,omitempty"`
flushDelay time.Duration
// Influx client options:
// HTTP request timeout
HTTPRequestTimeout string `json:"http_request_timeout"`
// Retry interval
InfluxRetryInterval string `json:"retry_interval,omitempty"`
// maximum delay between each retry attempt
InfluxMaxRetryInterval string `json:"max_retry_interval,omitempty"`
// base for the exponential retry delay
InfluxExponentialBase uint `json:"retry_exponential_base,omitempty"`
// maximum count of retry attempts of failed writes
InfluxMaxRetries uint `json:"max_retries,omitempty"`
// maximum total retry timeout
InfluxMaxRetryTime string `json:"max_retry_time,omitempty"`
// Specify whether to use GZip compression in write requests
InfluxUseGzip bool `json:"use_gzip"`
}
batch []*write.Point
flushTimer *time.Timer
flushDelay time.Duration
batchMutex sync.Mutex // Flush() runs in another goroutine, so this lock has to protect the buffer
flushTimerMutex sync.Mutex // Ensure only one flush timer is running
// influx line protocol encoder
encoder influx.Encoder
// number of records stored in the encoder
numRecordsInEncoder int
// List of tags and meta data tags which should be used as tags
extended_tag_list []key_value_pair
// Flush() runs in another goroutine and accesses the influx line protocol encoder,
// so this encoderLock has to protect the encoder and numRecordsInEncoder
encoderLock sync.Mutex
// timer to run Flush()
flushTimer *time.Timer
// Lock to assure that only one timer is running at a time
timerLock sync.Mutex
// WaitGroup to ensure only one send operation is running at a time
sendWaitGroup sync.WaitGroup
}
// connect connects to the InfluxDB server
@@ -70,7 +101,7 @@ func (s *InfluxSink) connect() error {
} else {
auth = fmt.Sprintf("%s:%s", s.config.User, s.config.Password)
}
cclog.ComponentDebug(s.name,
cclog.ComponentDebug(s.name, "connect():",
"Using URI='"+uri+"'",
"Org='"+s.config.Organization+"'",
"Bucket='"+s.config.Database+"'")
@@ -78,6 +109,95 @@ func (s *InfluxSink) connect() error {
// Set influxDB client options
clientOptions := influxdb2.DefaultOptions()
// set HTTP request timeout
if len(s.config.HTTPRequestTimeout) > 0 {
if t, err := time.ParseDuration(s.config.HTTPRequestTimeout); err == nil {
httpRequestTimeout := uint(t.Seconds())
clientOptions.SetHTTPRequestTimeout(httpRequestTimeout)
} else {
cclog.ComponentError(s.name, "connect():", "Failed to parse duration for HTTP RequestTimeout: ", s.config.HTTPRequestTimeout)
}
}
cclog.ComponentDebug(
s.name,
"connect():",
"Influx client options HTTPRequestTimeout:",
time.Second*time.Duration(clientOptions.HTTPRequestTimeout()))
// Set retry interval
if len(s.config.InfluxRetryInterval) > 0 {
if t, err := time.ParseDuration(s.config.InfluxRetryInterval); err == nil {
influxRetryInterval := uint(t.Milliseconds())
clientOptions.SetRetryInterval(influxRetryInterval)
} else {
cclog.ComponentError(s.name, "connect():", "Failed to parse duration for Influx RetryInterval: ", s.config.InfluxRetryInterval)
}
}
cclog.ComponentDebug(
s.name,
"connect():",
"Influx client options RetryInterval:",
time.Millisecond*time.Duration(clientOptions.RetryInterval()))
// Set the maximum delay between each retry attempt
if len(s.config.InfluxMaxRetryInterval) > 0 {
if t, err := time.ParseDuration(s.config.InfluxMaxRetryInterval); err == nil {
influxMaxRetryInterval := uint(t.Milliseconds())
clientOptions.SetMaxRetryInterval(influxMaxRetryInterval)
} else {
cclog.ComponentError(s.name, "connect():", "Failed to parse duration for Influx MaxRetryInterval: ", s.config.InfluxMaxRetryInterval)
}
}
cclog.ComponentDebug(
s.name,
"connect():",
"Influx client options MaxRetryInterval:",
time.Millisecond*time.Duration(clientOptions.MaxRetryInterval()))
// Set the base for the exponential retry delay
if s.config.InfluxExponentialBase != 0 {
clientOptions.SetExponentialBase(s.config.InfluxExponentialBase)
}
cclog.ComponentDebug(
s.name,
"connect():",
"Influx client options ExponentialBase:",
clientOptions.ExponentialBase())
// Set maximum count of retry attempts of failed writes
if s.config.InfluxMaxRetries != 0 {
clientOptions.SetMaxRetries(s.config.InfluxMaxRetries)
}
cclog.ComponentDebug(
s.name,
"connect():",
"Influx client options MaxRetries:",
clientOptions.MaxRetries())
// Set the maximum total retry timeout
if len(s.config.InfluxMaxRetryTime) > 0 {
if t, err := time.ParseDuration(s.config.InfluxMaxRetryTime); err == nil {
influxMaxRetryTime := uint(t.Milliseconds())
cclog.ComponentDebug(s.name, "connect():", "MaxRetryTime", s.config.InfluxMaxRetryTime)
clientOptions.SetMaxRetryTime(influxMaxRetryTime)
} else {
cclog.ComponentError(s.name, "connect():", "Failed to parse duration for Influx MaxRetryInterval: ", s.config.InfluxMaxRetryInterval)
}
}
cclog.ComponentDebug(
s.name,
"connect():",
"Influx client options MaxRetryTime:",
time.Millisecond*time.Duration(clientOptions.MaxRetryTime()))
// Specify whether to use GZip compression in write requests
clientOptions.SetUseGZip(s.config.InfluxUseGzip)
cclog.ComponentDebug(
s.name,
"connect():",
"Influx client options UseGZip:",
clientOptions.UseGZip())
// Do not check InfluxDB certificate
clientOptions.SetTLSConfig(
&tls.Config{
@@ -85,7 +205,8 @@ func (s *InfluxSink) connect() error {
},
)
clientOptions.SetPrecision(time.Second)
// Set time precision
clientOptions.SetPrecision(time.Nanosecond)
// Create new writeAPI
s.client = influxdb2.NewClientWithOptions(uri, auth, clientOptions)
@@ -102,95 +223,189 @@ func (s *InfluxSink) connect() error {
return nil
}
// Write sends metric m in influxDB line protocol
func (s *InfluxSink) Write(m lp.CCMetric) error {
if s.flushDelay != 0 && s.flushTimerMutex.TryLock() {
// Run a batched flush for all metrics that arrived in the last flush delay interval
cclog.ComponentDebug(s.name, "Starting new flush timer")
s.flushTimer = time.AfterFunc(
s.flushDelay,
func() {
defer s.flushTimerMutex.Unlock()
cclog.ComponentDebug(s.name, "Starting flush in flush timer")
if err := s.Flush(); err != nil {
cclog.ComponentError(s.name, "Flush timer: flush failed:", err)
}
})
// Lock for encoder usage
s.encoderLock.Lock()
// Encode measurement name
s.encoder.StartLine(m.Name())
// copy tags and meta data which should be used as tags
s.extended_tag_list = s.extended_tag_list[:0]
for key, value := range m.Tags() {
s.extended_tag_list =
append(
s.extended_tag_list,
key_value_pair{
key: key,
value: value,
},
)
}
for _, key := range s.config.MetaAsTags {
if value, ok := m.GetMeta(key); ok {
s.extended_tag_list =
append(
s.extended_tag_list,
key_value_pair{
key: key,
value: value,
},
)
}
}
// Lock access to batch slice
s.batchMutex.Lock()
// batch slice full, dropping oldest metric(s)
// e.g. when previous flushes failed and batch slice was not cleared
if len(s.batch) == s.config.BatchSize {
newSize := s.config.BatchSize - s.config.DropRate
for i := 0; i < newSize; i++ {
s.batch[i] = s.batch[i+s.config.DropRate]
}
for i := newSize; i < s.config.BatchSize; i++ {
s.batch[i] = nil
}
s.batch = s.batch[:newSize]
cclog.ComponentError(s.name, "Batch slice full, dropping", s.config.DropRate, "oldest metric(s)")
// Encode tags (they musts be in lexical order)
slices.SortFunc(
s.extended_tag_list,
func(a key_value_pair, b key_value_pair) int {
if a.key < b.key {
return -1
}
if a.key > b.key {
return +1
}
return 0
},
)
for i := range s.extended_tag_list {
s.encoder.AddTag(
s.extended_tag_list[i].key,
s.extended_tag_list[i].value,
)
}
// Append metric to batch slice
p := m.ToPoint(s.meta_as_tags)
s.batch = append(s.batch, p)
// Encode fields
for key, value := range m.Fields() {
s.encoder.AddField(key, influx.MustNewValue(value))
}
// Flush synchronously if "flush_delay" is zero
// or
// Flush if batch size is reached
if s.flushDelay == 0 ||
len(s.batch) == s.config.BatchSize {
// Unlock access to batch slice
s.batchMutex.Unlock()
// Encode time stamp
s.encoder.EndLine(m.Time())
// Check for encoder errors
if err := s.encoder.Err(); err != nil {
// Unlock encoder usage
s.encoderLock.Unlock()
return fmt.Errorf("Encoding failed: %v", err)
}
s.numRecordsInEncoder++
if s.config.flushDelay == 0 {
// Unlock encoder usage
s.encoderLock.Unlock()
// Directly flush if no flush delay is configured
return s.Flush()
} else if s.numRecordsInEncoder == s.config.BatchSize {
// Unlock encoder usage
s.encoderLock.Unlock()
// Stop flush timer
if s.flushTimer != nil {
if ok := s.flushTimer.Stop(); ok {
cclog.ComponentDebug(s.name, "Write(): Stopped flush timer. Batch size limit reached before flush delay")
s.timerLock.Unlock()
}
}
// Flush if batch size is reached
return s.Flush()
} else if s.timerLock.TryLock() {
// Setup flush timer when flush delay is configured
// and no other timer is already running
if s.flushTimer != nil {
// Restarting existing flush timer
cclog.ComponentDebug(s.name, "Write(): Restarting flush timer")
s.flushTimer.Reset(s.config.flushDelay)
} else {
// Creating and starting flush timer
cclog.ComponentDebug(s.name, "Write(): Starting new flush timer")
s.flushTimer = time.AfterFunc(
s.config.flushDelay,
func() {
defer s.timerLock.Unlock()
cclog.ComponentDebug(s.name, "Starting flush triggered by flush timer")
if err := s.Flush(); err != nil {
cclog.ComponentError(s.name, "Flush triggered by flush timer: flush failed:", err)
}
})
}
}
// Unlock access to batch slice
s.batchMutex.Unlock()
// Unlock encoder usage
s.encoderLock.Unlock()
return nil
}
// Flush sends all metrics buffered in batch slice to InfluxDB server
// Flush sends all metrics stored in encoder to InfluxDB server
func (s *InfluxSink) Flush() error {
cclog.ComponentDebug(s.name, "Flushing")
// Lock access to batch slice
s.batchMutex.Lock()
defer s.batchMutex.Unlock()
// Lock for encoder usage
// Own lock for as short as possible: the time it takes to clone the buffer.
s.encoderLock.Lock()
// Nothing to do, batch slice is empty
if len(s.batch) == 0 {
buf := slices.Clone(s.encoder.Bytes())
numRecordsInBuf := s.numRecordsInEncoder
s.encoder.Reset()
s.numRecordsInEncoder = 0
// Unlock encoder usage
s.encoderLock.Unlock()
if len(buf) == 0 {
return nil
}
// Send metrics from batch slice
err := s.writeApi.WritePoint(context.Background(), s.batch...)
if err != nil {
cclog.ComponentError(s.name, "Flush(): Flush of", len(s.batch), "metrics failed:", err)
return err
}
cclog.ComponentDebug(s.name, "Flush(): Flushing", numRecordsInBuf, "metrics")
// Clear batch slice
for i := range s.batch {
s.batch[i] = nil
}
s.batch = s.batch[:0]
// Asynchron send of encoder metrics
s.sendWaitGroup.Add(1)
go func() {
defer s.sendWaitGroup.Done()
startTime := time.Now()
err := s.writeApi.WriteRecord(context.Background(), string(buf))
if err != nil {
cclog.ComponentError(
s.name,
"Flush():",
"Flush failed:", err,
"(number of records =", numRecordsInBuf,
", buffer size =", len(buf),
", send duration =", time.Since(startTime),
")",
)
return
}
}()
return nil
}
func (s *InfluxSink) Close() {
cclog.ComponentDebug(s.name, "Closing InfluxDB connection")
s.flushTimer.Stop()
s.Flush()
if err := s.Flush(); err != nil {
cclog.ComponentError(s.name, "Close(): Flush failed:", err)
// Stop existing timer and immediately flush
if s.flushTimer != nil {
if ok := s.flushTimer.Stop(); ok {
s.timerLock.Unlock()
}
}
// Flush
if err := s.Flush(); err != nil {
cclog.ComponentError(s.name, "Close():", "Flush failed:", err)
}
// Wait for send operations to finish
s.sendWaitGroup.Wait()
s.client.Close()
}
@@ -202,13 +417,14 @@ func NewInfluxSink(name string, config json.RawMessage) (Sink, error) {
// Set config default values
s.config.BatchSize = 1000
s.config.FlushInterval = "1s"
s.config.DropRate = 100
// Read config
if len(config) > 0 {
err := json.Unmarshal(config, &s.config)
if err != nil {
return s, err
d := json.NewDecoder(bytes.NewReader(config))
d.DisallowUnknownFields()
if err := d.Decode(&s.config); err != nil {
cclog.ComponentError(s.name, "Error reading config:", err.Error())
return nil, err
}
}
@@ -238,28 +454,22 @@ func NewInfluxSink(name string, config json.RawMessage) (Sink, error) {
if len(s.config.FlushInterval) > 0 {
t, err := time.ParseDuration(s.config.FlushInterval)
if err == nil {
s.flushDelay = t
s.config.flushDelay = t
}
}
if !(s.config.BatchSize > 0) {
return s, fmt.Errorf("batch_size=%d in InfluxDB config must be > 0", s.config.BatchSize)
}
if !(s.config.DropRate > 0) {
return s, fmt.Errorf("drop_rate=%d in InfluxDB config must be > 0", s.config.DropRate)
}
if !(s.config.BatchSize > s.config.DropRate) {
return s, fmt.Errorf(
"batch_size=%d must be greater then drop_rate=%d in InfluxDB config",
s.config.BatchSize, s.config.DropRate)
}
// allocate batch slice
s.batch = make([]*write.Point, 0, s.config.BatchSize)
// Connect to InfluxDB server
if err := s.connect(); err != nil {
return s, fmt.Errorf("unable to connect: %v", err)
}
// Configure influx line protocol encoder
s.encoder.SetPrecision(influx.Nanosecond)
s.extended_tag_list = make([]key_value_pair, 0)
return s, nil
}

View File

@@ -2,7 +2,6 @@
The `influxdb` sink uses the official [InfluxDB golang client](https://pkg.go.dev/github.com/influxdata/influxdb-client-go/v2) to write the metrics to an InfluxDB database in a **blocking** fashion. It provides only support for V2 write endpoints (InfluxDB 1.8.0 or later).
### Configuration structure
```json
@@ -17,21 +16,33 @@ The `influxdb` sink uses the official [InfluxDB golang client](https://pkg.go.de
"organization": "myorg",
"ssl": true,
"flush_delay" : "1s",
"batch_size" : 100,
"batch_size" : 1000,
"use_gzip": true
"meta_as_tags" : [],
}
}
```
- `type`: makes the sink an `influxdb` sink
- `database`: All metrics are written to this bucket
- `meta_as_tags`: print all meta information as tags in the output (optional)
- `database`: All metrics are written to this bucket
- `host`: Hostname of the InfluxDB database server
- `port`: Portnumber (as string) of the InfluxDB database server
- `user`: Username for basic authentification
- `password`: Password for basic authentification
- `port`: Port number (as string) of the InfluxDB database server
- `user`: Username for basic authentication
- `password`: Password for basic authentication
- `organization`: Organization in the InfluxDB
- `ssl`: Use SSL connection
- `flush_delay`: Group metrics coming in to a single batch
- `batch_size`: Maximal batch size. If `batch_size` is reached before the end of `flush_delay`, the metrics are sent without further delay
Influx client options:
=======
- `batch_size`: Maximal batch size
- `meta_as_tags`: move meta information keys to tags (optional)
- `http_request_timeout`: HTTP request timeout
- `retry_interval`: retry interval
- `max_retry_interval`: maximum delay between each retry attempt
- `retry_exponential_base`: base for the exponential retry delay
- `max_retries`: maximum count of retry attempts of failed writes
- `max_retry_time`: maximum total retry timeout
- `use_gzip`: Specify whether to use GZip compression in write requests

View File

@@ -66,6 +66,7 @@ void Ganglia_pool_destroy( Ganglia_pool pool );
import "C"
import (
"bytes"
"encoding/json"
"errors"
"fmt"
@@ -233,8 +234,9 @@ func NewLibgangliaSink(name string, config json.RawMessage) (Sink, error) {
s.config.GmondConfig = string(GMOND_CONFIG_FILE)
s.config.GangliaLib = string(GANGLIA_LIB_NAME)
if len(config) > 0 {
err = json.Unmarshal(config, &s.config)
if err != nil {
d := json.NewDecoder(bytes.NewReader(config))
d.DisallowUnknownFields()
if err := d.Decode(&s.config); err != nil {
cclog.ComponentError(s.name, "Error reading config:", err.Error())
return nil, err
}

View File

@@ -108,9 +108,10 @@ func NewNatsSink(name string, config json.RawMessage) (Sink, error) {
s.name = fmt.Sprintf("NatsSink(%s)", name)
s.flushDelay = 10 * time.Second
if len(config) > 0 {
err := json.Unmarshal(config, &s.config)
if err != nil {
cclog.ComponentError(s.name, "Error reading config for", s.name, ":", err.Error())
d := json.NewDecoder(bytes.NewReader(config))
d.DisallowUnknownFields()
if err := d.Decode(&s.config); err != nil {
cclog.ComponentError(s.name, "Error reading config:", err.Error())
return nil, err
}
}

View File

@@ -2,7 +2,6 @@
The `nats` sink publishes all metrics into a NATS network. The publishing key is the database name provided in the configuration file
### Configuration structure
```json
@@ -22,7 +21,7 @@ The `nats` sink publishes all metrics into a NATS network. The publishing key is
- `type`: makes the sink an `nats` sink
- `database`: All metrics are published with this subject
- `host`: Hostname of the NATS server
- `port`: Portnumber (as string) of the NATS server
- `user`: Username for basic authentification
- `password`: Password for basic authentification
- `port`: Port number (as string) of the NATS server
- `user`: Username for basic authentication
- `password`: Password for basic authentication
- `meta_as_tags`: print all meta information as tags in the output (optional)

View File

@@ -1,6 +1,7 @@
package sinks
import (
"bytes"
"context"
"encoding/json"
"errors"
@@ -167,9 +168,10 @@ func NewPrometheusSink(name string, config json.RawMessage) (Sink, error) {
s := new(PrometheusSink)
s.name = "PrometheusSink"
if len(config) > 0 {
err := json.Unmarshal(config, &s.config)
if err != nil {
cclog.ComponentError(s.name, "Error reading config for", s.name, ":", err.Error())
d := json.NewDecoder(bytes.NewReader(config))
d.DisallowUnknownFields()
if err := d.Decode(&s.config); err != nil {
cclog.ComponentError(s.name, "Error reading config:", err.Error())
return nil, err
}
}

View File

@@ -1,6 +1,7 @@
package sinks
import (
"bytes"
"encoding/json"
"fmt"
"log"
@@ -57,8 +58,10 @@ func NewSampleSink(name string, config json.RawMessage) (Sink, error) {
// Read in the config JSON
if len(config) > 0 {
err := json.Unmarshal(config, &s.config)
if err != nil {
d := json.NewDecoder(bytes.NewReader(config))
d.DisallowUnknownFields()
if err := d.Decode(&s.config); err != nil {
cclog.ComponentError(s.name, "Error reading config:", err.Error())
return nil, err
}
}

View File

@@ -157,7 +157,7 @@ func (sm *sinkManager) AddOutput(name string, rawConfig json.RawMessage) error {
}
s, err := AvailableSinks[sinkConfig.Type](name, rawConfig)
if err != nil {
cclog.ComponentError("SinkManager", "SKIP", s.Name(), "initialization failed:", err.Error())
cclog.ComponentError("SinkManager", "SKIP", name, "initialization failed:", err.Error())
return err
}
sm.sinks[name] = s

View File

@@ -1,12 +1,14 @@
package sinks
import (
"bytes"
"encoding/json"
"fmt"
"os"
"strings"
// "time"
cclog "github.com/ClusterCockpit/cc-metric-collector/pkg/ccLogger"
lp "github.com/ClusterCockpit/cc-metric-collector/pkg/ccMetric"
)
@@ -42,8 +44,10 @@ func NewStdoutSink(name string, config json.RawMessage) (Sink, error) {
s := new(StdoutSink)
s.name = fmt.Sprintf("StdoutSink(%s)", name)
if len(config) > 0 {
err := json.Unmarshal(config, &s.config)
if err != nil {
d := json.NewDecoder(bytes.NewReader(config))
d.DisallowUnknownFields()
if err := d.Decode(&s.config); err != nil {
cclog.ComponentError(s.name, "Error reading config:", err.Error())
return nil, err
}
}