Merge branch 'develop' into rename_cpu_type

This commit is contained in:
Thomas Gruber
2022-05-13 13:25:45 +02:00
committed by GitHub
24 changed files with 1139 additions and 412 deletions

View File

@@ -3,6 +3,6 @@
"collectors" : ".github/ci-collectors.json", "collectors" : ".github/ci-collectors.json",
"receivers" : ".github/ci-receivers.json", "receivers" : ".github/ci-receivers.json",
"router" : ".github/ci-router.json", "router" : ".github/ci-router.json",
"interval": 5, "interval": "5s",
"duration": 1 "duration": "1s"
} }

View File

@@ -22,8 +22,8 @@ import (
) )
type CentralConfigFile struct { type CentralConfigFile struct {
Interval int `json:"interval"` Interval string `json:"interval"`
Duration int `json:"duration"` Duration string `json:"duration"`
CollectorConfigFile string `json:"collectors"` CollectorConfigFile string `json:"collectors"`
RouterConfigFile string `json:"router"` RouterConfigFile string `json:"router"`
SinkConfigFile string `json:"sinks"` SinkConfigFile string `json:"sinks"`
@@ -173,16 +173,36 @@ func mainFunc() int {
cclog.Error("Error reading configuration file ", rcfg.CliArgs["configfile"], ": ", err.Error()) cclog.Error("Error reading configuration file ", rcfg.CliArgs["configfile"], ": ", err.Error())
return 1 return 1
} }
if rcfg.ConfigFile.Interval <= 0 || time.Duration(rcfg.ConfigFile.Interval)*time.Second <= 0 {
// Properly use duration parser with inputs like '60s', '5m' or similar
if len(rcfg.ConfigFile.Interval) > 0 {
t, err := time.ParseDuration(rcfg.ConfigFile.Interval)
if err != nil {
cclog.Error("Configuration value 'interval' no valid duration")
}
rcfg.Interval = t
if rcfg.Interval == 0 {
cclog.Error("Configuration value 'interval' must be greater than zero") cclog.Error("Configuration value 'interval' must be greater than zero")
return 1 return 1
} }
rcfg.Interval = time.Duration(rcfg.ConfigFile.Interval) * time.Second }
if rcfg.ConfigFile.Duration <= 0 || time.Duration(rcfg.ConfigFile.Duration)*time.Second <= 0 {
// Properly use duration parser with inputs like '60s', '5m' or similar
if len(rcfg.ConfigFile.Duration) > 0 {
t, err := time.ParseDuration(rcfg.ConfigFile.Duration)
if err != nil {
cclog.Error("Configuration value 'duration' no valid duration")
}
rcfg.Duration = t
if rcfg.Duration == 0 {
cclog.Error("Configuration value 'duration' must be greater than zero") cclog.Error("Configuration value 'duration' must be greater than zero")
return 1 return 1
} }
rcfg.Duration = time.Duration(rcfg.ConfigFile.Duration) * time.Second }
if rcfg.Duration > rcfg.Interval {
cclog.Error("The interval should be greater than duration")
return 1
}
if len(rcfg.ConfigFile.RouterConfigFile) == 0 { if len(rcfg.ConfigFile.RouterConfigFile) == 0 {
cclog.Error("Metric router configuration file must be set") cclog.Error("Metric router configuration file must be set")
@@ -271,7 +291,7 @@ func mainFunc() int {
// Wait until one tick has passed. This is a workaround // Wait until one tick has passed. This is a workaround
if rcfg.CliArgs["once"] == "true" { if rcfg.CliArgs["once"] == "true" {
x := 1.2 * float64(rcfg.ConfigFile.Interval) x := 1.2 * float64(rcfg.Interval)
time.Sleep(time.Duration(int(x)) * time.Second) time.Sleep(time.Duration(int(x)) * time.Second)
shutdownSignal <- os.Interrupt shutdownSignal <- os.Interrupt
} }

View File

@@ -1,22 +1,28 @@
all: likwid
# LIKWID version # LIKWID version
LIKWID_VERSION = 5.2.1 LIKWID_VERSION = 5.2.1
LIKWID_INSTALLED_FOLDER=$(shell dirname $(shell which likwid-topology 2>/dev/null))
LIKWID_FOLDER="$(shell pwd)/likwid"
all: $(LIKWID_FOLDER)/likwid.h
.ONESHELL: .ONESHELL:
.PHONY: likwid .PHONY: $(LIKWID_FOLDER)/likwid.h
likwid: $(LIKWID_FOLDER)/likwid.h:
INSTALL_FOLDER="$${PWD}/likwid" if [ "$(LIKWID_INSTALLED_FOLDER)" != "" ]; then \
BUILD_FOLDER="$${PWD}/likwidbuild" BASE="$(LIKWID_INSTALLED_FOLDER)/../include"; \
if [ -d $${INSTALL_FOLDER} ]; then rm -r $${INSTALL_FOLDER}; fi mkdir -p $(LIKWID_FOLDER); \
mkdir --parents --verbose $${INSTALL_FOLDER} $${BUILD_FOLDER} cp $$BASE/*.h $(LIKWID_FOLDER); \
wget -P "$${BUILD_FOLDER}" ftp://ftp.rrze.uni-erlangen.de/mirrors/likwid/likwid-$(LIKWID_VERSION).tar.gz else \
tar -C $${BUILD_FOLDER} -xf $${BUILD_FOLDER}/likwid-$(LIKWID_VERSION).tar.gz BUILD_FOLDER="$${PWD}/likwidbuild"; \
install -Dpm 0644 $${BUILD_FOLDER}/likwid-$(LIKWID_VERSION)/src/includes/likwid*.h $${INSTALL_FOLDER}/ if [ -d $(LIKWID_FOLDER) ]; then rm -r $(LIKWID_FOLDER); fi; \
install -Dpm 0644 $${BUILD_FOLDER}/likwid-$(LIKWID_VERSION)/src/includes/bstrlib.h $${INSTALL_FOLDER}/ mkdir --parents --verbose $(LIKWID_FOLDER) $${BUILD_FOLDER}; \
rm -r $${BUILD_FOLDER} wget -P "$${BUILD_FOLDER}" ftp://ftp.rrze.uni-erlangen.de/mirrors/likwid/likwid-$(LIKWID_VERSION).tar.gz; \
tar -C $${BUILD_FOLDER} -xf $${BUILD_FOLDER}/likwid-$(LIKWID_VERSION).tar.gz; \
install -Dpm 0644 $${BUILD_FOLDER}/likwid-$(LIKWID_VERSION)/src/includes/likwid*.h $(LIKWID_FOLDER)/; \
install -Dpm 0644 $${BUILD_FOLDER}/likwid-$(LIKWID_VERSION)/src/includes/bstrlib.h $(LIKWID_FOLDER)/; \
rm -r $${BUILD_FOLDER}; \
fi
clean: clean:

View File

@@ -3,7 +3,6 @@ package collectors
import ( import (
"bufio" "bufio"
"encoding/json" "encoding/json"
"fmt"
"os" "os"
"strings" "strings"
"syscall" "syscall"
@@ -81,8 +80,7 @@ func (m *DiskstatCollector) Read(interval time.Duration, output chan lp.CCMetric
stat := syscall.Statfs_t{} stat := syscall.Statfs_t{}
err := syscall.Statfs(path, &stat) err := syscall.Statfs(path, &stat)
if err != nil { if err != nil {
fmt.Println(err.Error()) continue
return
} }
tags := map[string]string{"type": "node", "device": linefields[0]} tags := map[string]string{"type": "node", "device": linefields[0]}
total := (stat.Blocks * uint64(stat.Bsize)) / uint64(1000000000) total := (stat.Blocks * uint64(stat.Bsize)) / uint64(1000000000)

View File

@@ -70,6 +70,7 @@ func (m *GpfsCollector) Init(config json.RawMessage) error {
for _, fs := range m.config.ExcludeFilesystem { for _, fs := range m.config.ExcludeFilesystem {
m.skipFS[fs] = struct{}{} m.skipFS[fs] = struct{}{}
} }
m.lastState = make(map[string]GpfsCollectorLastState)
// GPFS / IBM Spectrum Scale file system statistics can only be queried by user root // GPFS / IBM Spectrum Scale file system statistics can only be queried by user root
user, err := user.Current() user, err := user.Current()
@@ -162,13 +163,18 @@ func (m *GpfsCollector) Read(interval time.Duration, output chan lp.CCMetric) {
continue continue
} }
// Add filesystem tag
m.tags["filesystem"] = filesystem m.tags["filesystem"] = filesystem
// Create initial last state
if m.config.SendBandwidths {
if _, ok := m.lastState[filesystem]; !ok { if _, ok := m.lastState[filesystem]; !ok {
m.lastState[filesystem] = GpfsCollectorLastState{ m.lastState[filesystem] = GpfsCollectorLastState{
bytesRead: -1, bytesRead: -1,
bytesWritten: -1, bytesWritten: -1,
} }
} }
}
// return code // return code
rc, err := strconv.Atoi(key_value["_rc_"]) rc, err := strconv.Atoi(key_value["_rc_"])

View File

@@ -18,11 +18,16 @@ import (
const IB_BASEPATH = "/sys/class/infiniband/" const IB_BASEPATH = "/sys/class/infiniband/"
// InfinibandCollectorMetric describes one InfiniBand port counter: the file
// the value is read from and the unit attached to the metric that is sent.
type InfinibandCollectorMetric struct {
// path is the counter file; it is checked with unix.Access in Init() and
// read with ioutil.ReadFile in Read().
path string
// unit is attached as the "unit" meta value of the absolute metric; the
// derived "_bw" rate metric uses unit + "/sec".
unit string
}
type InfinibandCollectorInfo struct { type InfinibandCollectorInfo struct {
LID string // IB local Identifier (LID) LID string // IB local Identifier (LID)
device string // IB device device string // IB device
port string // IB device port port string // IB device port
portCounterFiles map[string]string // mapping counter name -> sysfs file portCounterFiles map[string]InfinibandCollectorMetric // mapping counter name -> InfinibandCollectorMetric
tagSet map[string]string // corresponding tag list tagSet map[string]string // corresponding tag list
lastState map[string]int64 // State from last measurement lastState map[string]int64 // State from last measurement
} }
@@ -106,16 +111,16 @@ func (m *InfinibandCollector) Init(config json.RawMessage) error {
// Check access to counter files // Check access to counter files
countersDir := filepath.Join(path, "counters") countersDir := filepath.Join(path, "counters")
portCounterFiles := map[string]string{ portCounterFiles := map[string]InfinibandCollectorMetric{
"ib_recv": filepath.Join(countersDir, "port_rcv_data"), "ib_recv": {path: filepath.Join(countersDir, "port_rcv_data"), unit: "bytes"},
"ib_xmit": filepath.Join(countersDir, "port_xmit_data"), "ib_xmit": {path: filepath.Join(countersDir, "port_xmit_data"), unit: "bytes"},
"ib_recv_pkts": filepath.Join(countersDir, "port_rcv_packets"), "ib_recv_pkts": {path: filepath.Join(countersDir, "port_rcv_packets"), unit: "packets"},
"ib_xmit_pkts": filepath.Join(countersDir, "port_xmit_packets"), "ib_xmit_pkts": {path: filepath.Join(countersDir, "port_xmit_packets"), unit: "packets"},
} }
for _, counterFile := range portCounterFiles { for _, counter := range portCounterFiles {
err := unix.Access(counterFile, unix.R_OK) err := unix.Access(counter.path, unix.R_OK)
if err != nil { if err != nil {
return fmt.Errorf("unable to access %s: %v", counterFile, err) return fmt.Errorf("unable to access %s: %v", counter.path, err)
} }
} }
@@ -165,14 +170,14 @@ func (m *InfinibandCollector) Read(interval time.Duration, output chan lp.CCMetr
m.lastTimestamp = now m.lastTimestamp = now
for _, info := range m.info { for _, info := range m.info {
for counterName, counterFile := range info.portCounterFiles { for counterName, counterDef := range info.portCounterFiles {
// Read counter file // Read counter file
line, err := ioutil.ReadFile(counterFile) line, err := ioutil.ReadFile(counterDef.path)
if err != nil { if err != nil {
cclog.ComponentError( cclog.ComponentError(
m.name, m.name,
fmt.Sprintf("Read(): Failed to read from file '%s': %v", counterFile, err)) fmt.Sprintf("Read(): Failed to read from file '%s': %v", counterDef.path, err))
continue continue
} }
data := strings.TrimSpace(string(line)) data := strings.TrimSpace(string(line))
@@ -189,6 +194,7 @@ func (m *InfinibandCollector) Read(interval time.Duration, output chan lp.CCMetr
// Send absolute values // Send absolute values
if m.config.SendAbsoluteValues { if m.config.SendAbsoluteValues {
if y, err := lp.New(counterName, info.tagSet, m.meta, map[string]interface{}{"value": v}, now); err == nil { if y, err := lp.New(counterName, info.tagSet, m.meta, map[string]interface{}{"value": v}, now); err == nil {
y.AddMeta("unit", counterDef.unit)
output <- y output <- y
} }
} }
@@ -198,6 +204,7 @@ func (m *InfinibandCollector) Read(interval time.Duration, output chan lp.CCMetr
if info.lastState[counterName] >= 0 { if info.lastState[counterName] >= 0 {
rate := float64((v - info.lastState[counterName])) / timeDiff rate := float64((v - info.lastState[counterName])) / timeDiff
if y, err := lp.New(counterName+"_bw", info.tagSet, m.meta, map[string]interface{}{"value": rate}, now); err == nil { if y, err := lp.New(counterName+"_bw", info.tagSet, m.meta, map[string]interface{}{"value": rate}, now); err == nil {
y.AddMeta("unit", counterDef.unit+"/sec")
output <- y output <- y
} }
} }

View File

@@ -15,8 +15,12 @@ import (
"io/ioutil" "io/ioutil"
"math" "math"
"os" "os"
"os/signal"
"sort"
"strconv" "strconv"
"strings" "strings"
"sync"
"syscall"
"time" "time"
"unsafe" "unsafe"
@@ -46,6 +50,16 @@ type LikwidCollectorEventsetConfig struct {
Metrics []LikwidCollectorMetricConfig `json:"metrics"` Metrics []LikwidCollectorMetricConfig `json:"metrics"`
} }
// LikwidEventsetConfig is the runtime representation of one configured event
// set: the strings handed to LIKWID plus the storage for the last measurement
// and the metrics derived from it.
type LikwidEventsetConfig struct {
// internal is the index of the originating entry in the collector's
// config.Eventsets slice (assigned in LateInit).
internal int
// gid is the group id returned by perfmon_addEventSet; it is -1 until the
// event set has been registered.
gid C.int
// eorder holds the counter names as C strings, sorted by name. The position
// in this slice is used as the event index when reading results.
eorder []*C.char
// estr is the comma-separated "event:counter" list as a C string, as passed
// to perfmon_addEventSet.
estr *C.char
// go_estr is the Go copy of estr; it is compared to detect duplicate event sets.
go_estr string
// results maps an id to (counter name -> last value). The per-id map is also
// used as the parameter set for evaluating metric formulas and therefore
// carries the "time" and "inverseClock" entries as well.
// NOTE(review): the map is created with the ids from topo.CpuList() but is
// indexed with thread ids in calcEventsetMetrics — confirm these coincide.
results map[int]map[string]interface{}
// metrics maps an id to (metric name -> last derived value); these values
// are the inputs for the global metrics.
metrics map[int]map[string]float64
}
type LikwidCollectorConfig struct { type LikwidCollectorConfig struct {
Eventsets []LikwidCollectorEventsetConfig `json:"eventsets"` Eventsets []LikwidCollectorEventsetConfig `json:"eventsets"`
Metrics []LikwidCollectorMetricConfig `json:"globalmetrics,omitempty"` Metrics []LikwidCollectorMetricConfig `json:"globalmetrics,omitempty"`
@@ -64,11 +78,12 @@ type LikwidCollector struct {
metrics map[C.int]map[string]int metrics map[C.int]map[string]int
groups []C.int groups []C.int
config LikwidCollectorConfig config LikwidCollectorConfig
results map[int]map[int]map[string]interface{}
mresults map[int]map[int]map[string]float64
gmresults map[int]map[string]float64 gmresults map[int]map[string]float64
basefreq float64 basefreq float64
running bool running bool
initialized bool
likwidGroups map[C.int]LikwidEventsetConfig
lock sync.Mutex
} }
type LikwidMetric struct { type LikwidMetric struct {
@@ -86,14 +101,60 @@ func eventsToEventStr(events map[string]string) string {
return strings.Join(elist, ",") return strings.Join(elist, ",")
} }
// genLikwidEventSet builds the runtime representation of one configured
// event set.
//
// The counter names are sorted so that the generated "event:counter" string
// and the counter order do not depend on Go's random map iteration order.
// The returned struct has gid set to -1 (not registered with LIKWID yet).
//
// The counter names and the event string are duplicated with C.CString;
// this function does not free them, so the caller owns that memory.
func genLikwidEventSet(input LikwidCollectorEventsetConfig) LikwidEventsetConfig {
	// Sorted list of counter names
	counters := make([]string, 0, len(input.Events))
	for counter := range input.Events {
		counters = append(counters, counter)
	}
	sort.Strings(counters)

	// One "event:counter" pair and one C string per counter, in sorted order
	pairs := make([]string, 0, len(counters))
	cCounters := make([]*C.char, 0, len(counters))
	for _, counter := range counters {
		pairs = append(pairs, fmt.Sprintf("%s:%s", input.Events[counter], counter))
		cCounters = append(cCounters, C.CString(counter))
	}
	eventString := strings.Join(pairs, ",")

	// Per-CPU storage for raw results and derived metrics
	results := make(map[int]map[string]interface{})
	metrics := make(map[int]map[string]float64)
	for _, cpu := range topo.CpuList() {
		values := make(map[string]interface{})
		for counter := range input.Events {
			values[counter] = 0.0
		}
		// NOTE(review): the metric names are zero-initialised in the results
		// map while the metrics map stays empty — confirm this is intended.
		for _, metric := range input.Metrics {
			values[metric.Name] = 0.0
		}
		results[cpu] = values
		metrics[cpu] = make(map[string]float64)
	}

	return LikwidEventsetConfig{
		gid:     -1,
		eorder:  cCounters,
		estr:    C.CString(eventString),
		go_estr: eventString,
		results: results,
		metrics: metrics,
	}
}
// testLikwidMetricFormula reports whether formula can be evaluated when every
// name in params is available as a variable. Each parameter is bound to the
// dummy value 1.0; only the success of the evaluation matters, the computed
// value is discarded.
func testLikwidMetricFormula(formula string, params []string) bool {
	values := make(map[string]interface{}, len(params))
	for _, name := range params {
		values[name] = float64(1.0)
	}
	if _, err := agg.EvalFloat64Condition(formula, values); err != nil {
		return false
	}
	return true
}
func getBaseFreq() float64 { func getBaseFreq() float64 {
files := []string{
"/sys/devices/system/cpu/cpu0/cpufreq/bios_limit",
"/sys/devices/system/cpu/cpu0/cpufreq/base_frequency",
}
var freq float64 = math.NaN() var freq float64 = math.NaN()
C.power_init(0) for _, f := range files {
info := C.get_powerInfo() buffer, err := ioutil.ReadFile(f)
if float64(info.baseFrequency) != 0 {
freq = float64(info.baseFrequency) * 1e6
} else {
buffer, err := ioutil.ReadFile("/sys/devices/system/cpu/cpu0/cpufreq/bios_limit")
if err == nil { if err == nil {
data := strings.Replace(string(buffer), "\n", "", -1) data := strings.Replace(string(buffer), "\n", "", -1)
x, err := strconv.ParseInt(data, 0, 64) x, err := strconv.ParseInt(data, 0, 64)
@@ -102,12 +163,22 @@ func getBaseFreq() float64 {
} }
} }
} }
if math.IsNaN(freq) {
C.power_init(0)
info := C.get_powerInfo()
if float64(info.baseFrequency) != 0 {
freq = float64(info.baseFrequency) * 1e6
}
C.power_finalize()
}
return freq return freq
} }
func (m *LikwidCollector) Init(config json.RawMessage) error { func (m *LikwidCollector) Init(config json.RawMessage) error {
var ret C.int
m.name = "LikwidCollector" m.name = "LikwidCollector"
m.initialized = false
m.running = false
m.config.AccessMode = LIKWID_DEF_ACCESSMODE m.config.AccessMode = LIKWID_DEF_ACCESSMODE
m.config.LibraryPath = LIKWID_LIB_NAME m.config.LibraryPath = LIKWID_LIB_NAME
if len(config) > 0 { if len(config) > 0 {
@@ -131,7 +202,7 @@ func (m *LikwidCollector) Init(config json.RawMessage) error {
} }
m.setup() m.setup()
m.meta = map[string]string{"source": m.name, "group": "PerfCounter"} m.meta = map[string]string{"group": "PerfCounter"}
cclog.ComponentDebug(m.name, "Get cpulist and init maps and lists") cclog.ComponentDebug(m.name, "Get cpulist and init maps and lists")
cpulist := topo.CpuList() cpulist := topo.CpuList()
m.cpulist = make([]C.int, len(cpulist)) m.cpulist = make([]C.int, len(cpulist))
@@ -140,172 +211,136 @@ func (m *LikwidCollector) Init(config json.RawMessage) error {
m.cpulist[i] = C.int(c) m.cpulist[i] = C.int(c)
m.cpu2tid[c] = i m.cpu2tid[c] = i
} }
m.sock2tid = make(map[int]int)
tmp := make([]C.int, 1) m.likwidGroups = make(map[C.int]LikwidEventsetConfig)
for _, sid := range topo.SocketList() {
cstr := C.CString(fmt.Sprintf("S%d:0", sid)) // m.results = make(map[int]map[int]map[string]interface{})
ret = C.cpustr_to_cpulist(cstr, &tmp[0], 1) // m.mresults = make(map[int]map[int]map[string]float64)
if ret > 0 {
m.sock2tid[sid] = m.cpu2tid[int(tmp[0])]
}
C.free(unsafe.Pointer(cstr))
}
m.results = make(map[int]map[int]map[string]interface{})
m.mresults = make(map[int]map[int]map[string]float64)
m.gmresults = make(map[int]map[string]float64) m.gmresults = make(map[int]map[string]float64)
cclog.ComponentDebug(m.name, "initialize LIKWID topology") for _, tid := range m.cpu2tid {
ret = C.topology_init() m.gmresults[tid] = make(map[string]float64)
if ret != 0 {
err := errors.New("failed to initialize LIKWID topology")
cclog.ComponentError(m.name, err.Error())
return err
}
switch m.config.AccessMode {
case "direct":
C.HPMmode(0)
case "accessdaemon":
if len(m.config.DaemonPath) > 0 {
p := os.Getenv("PATH")
os.Setenv("PATH", m.config.DaemonPath+":"+p)
}
C.HPMmode(1)
}
cclog.ComponentDebug(m.name, "initialize LIKWID perfmon module")
ret = C.perfmon_init(C.int(len(m.cpulist)), &m.cpulist[0])
if ret != 0 {
C.topology_finalize()
err := errors.New("failed to initialize LIKWID topology")
cclog.ComponentError(m.name, err.Error())
return err
} }
// This is for the global metrics computation test // This is for the global metrics computation test
globalParams := make(map[string]interface{}) totalMetrics := 0
globalParams["time"] = float64(1.0)
globalParams["inverseClock"] = float64(1.0)
// While adding the events, we test the metrics whether they can be computed at all
for i, evset := range m.config.Eventsets {
var gid C.int
var cstr *C.char
if len(evset.Events) > 0 {
estr := eventsToEventStr(evset.Events)
// Generate parameter list for the metric computing test // Generate parameter list for the metric computing test
params := make(map[string]interface{}) params := make([]string, 0)
params["time"] = float64(1.0) params = append(params, "time", "inverseClock")
params["inverseClock"] = float64(1.0) // Generate parameter list for the global metric computing test
globalParams := make([]string, 0)
globalParams = append(globalParams, "time", "inverseClock")
// We test the eventset metrics whether they can be computed at all
for _, evset := range m.config.Eventsets {
if len(evset.Events) > 0 {
params = params[:2]
for counter := range evset.Events { for counter := range evset.Events {
params[counter] = float64(1.0) params = append(params, counter)
} }
for _, metric := range evset.Metrics { for _, metric := range evset.Metrics {
// Try to evaluate the metric // Try to evaluate the metric
_, err := agg.EvalFloat64Condition(metric.Calc, params) if testLikwidMetricFormula(metric.Calc, params) {
if err != nil { // Add the computable metric to the parameter list for the global metrics
cclog.ComponentError(m.name, "Calculation for metric", metric.Name, "failed:", err.Error()) globalParams = append(globalParams, metric.Name)
continue totalMetrics++
} } else {
// If the metric is not in the parameter list for the global metrics, add it metric.Calc = ""
if _, ok := globalParams[metric.Name]; !ok {
globalParams[metric.Name] = float64(1.0)
} }
} }
// Now we add the list of events to likwid
cstr = C.CString(estr)
gid = C.perfmon_addEventSet(cstr)
} else { } else {
cclog.ComponentError(m.name, "Invalid Likwid eventset config, no events given") cclog.ComponentError(m.name, "Invalid Likwid eventset config, no events given")
continue continue
} }
if gid >= 0 {
m.groups = append(m.groups, gid)
}
C.free(unsafe.Pointer(cstr))
m.results[i] = make(map[int]map[string]interface{})
m.mresults[i] = make(map[int]map[string]float64)
for tid := range m.cpulist {
m.results[i][tid] = make(map[string]interface{})
m.mresults[i][tid] = make(map[string]float64)
if i == 0 {
m.gmresults[tid] = make(map[string]float64)
}
}
} }
for _, metric := range m.config.Metrics { for _, metric := range m.config.Metrics {
// Try to evaluate the global metric // Try to evaluate the global metric
_, err := agg.EvalFloat64Condition(metric.Calc, globalParams) if !testLikwidMetricFormula(metric.Calc, globalParams) {
if err != nil { cclog.ComponentError(m.name, "Calculation for metric", metric.Name, "failed")
cclog.ComponentError(m.name, "Calculation for metric", metric.Name, "failed:", err.Error()) metric.Calc = ""
continue } else {
totalMetrics++
} }
} }
// If no event set could be added, shut down LikwidCollector // If no event set could be added, shut down LikwidCollector
if len(m.groups) == 0 { if totalMetrics == 0 {
C.perfmon_finalize() err := errors.New("no LIKWID eventset or metric usable")
C.topology_finalize()
err := errors.New("no LIKWID performance group initialized")
cclog.ComponentError(m.name, err.Error()) cclog.ComponentError(m.name, err.Error())
return err return err
} }
m.basefreq = getBaseFreq()
cclog.ComponentDebug(m.name, "BaseFreq", m.basefreq)
m.init = true m.init = true
return nil return nil
} }
// take a measurement for 'interval' seconds of event set index 'group' // take a measurement for 'interval' seconds of event set index 'group'
func (m *LikwidCollector) takeMeasurement(group int, interval time.Duration) error { func (m *LikwidCollector) takeMeasurement(evset LikwidEventsetConfig, interval time.Duration) (bool, error) {
var ret C.int var ret C.int
gid := m.groups[group] m.lock.Lock()
ret = C.perfmon_setupCounters(gid) if m.initialized {
ret = C.perfmon_setupCounters(evset.gid)
if ret != 0 { if ret != 0 {
gctr := C.GoString(C.perfmon_getGroupName(gid)) var err error = nil
err := fmt.Errorf("failed to setup performance group %d (%s)", gid, gctr) var skip bool = false
return err if ret == -37 {
skip = true
} else {
err = fmt.Errorf("failed to setup performance group %d", evset.gid)
}
m.lock.Unlock()
return skip, err
} }
ret = C.perfmon_startCounters() ret = C.perfmon_startCounters()
if ret != 0 { if ret != 0 {
gctr := C.GoString(C.perfmon_getGroupName(gid)) var err error = nil
err := fmt.Errorf("failed to start performance group %d (%s)", gid, gctr) var skip bool = false
return err if ret == -37 {
skip = true
} else {
err = fmt.Errorf("failed to setup performance group %d", evset.gid)
}
m.lock.Unlock()
return skip, err
} }
m.running = true m.running = true
time.Sleep(interval) time.Sleep(interval)
m.running = false m.running = false
ret = C.perfmon_stopCounters() ret = C.perfmon_stopCounters()
if ret != 0 { if ret != 0 {
gctr := C.GoString(C.perfmon_getGroupName(gid)) var err error = nil
err := fmt.Errorf("failed to stop performance group %d (%s)", gid, gctr) var skip bool = false
return err if ret == -37 {
skip = true
} else {
err = fmt.Errorf("failed to setup performance group %d", evset.gid)
} }
return nil m.lock.Unlock()
return skip, err
}
}
m.lock.Unlock()
return false, nil
} }
// Get all measurement results for an event set, derive the metric values out of the measurement results and send it // Get all measurement results for an event set, derive the metric values out of the measurement results and send it
func (m *LikwidCollector) calcEventsetMetrics(group int, interval time.Duration, output chan lp.CCMetric) error { func (m *LikwidCollector) calcEventsetMetrics(evset LikwidEventsetConfig, interval time.Duration, output chan lp.CCMetric) error {
var eidx C.int
evset := m.config.Eventsets[group]
gid := m.groups[group]
invClock := float64(1.0 / m.basefreq) invClock := float64(1.0 / m.basefreq)
// Go over events and get the results // Go over events and get the results
for eidx = 0; int(eidx) < len(evset.Events); eidx++ { for eidx, counter := range evset.eorder {
ctr := C.perfmon_getCounterName(gid, eidx) gctr := C.GoString(counter)
gctr := C.GoString(ctr)
for _, tid := range m.cpu2tid { for _, tid := range m.cpu2tid {
if tid >= 0 { res := C.perfmon_getLastResult(evset.gid, C.int(eidx), C.int(tid))
m.results[group][tid]["time"] = interval.Seconds() fres := float64(res)
m.results[group][tid]["inverseClock"] = invClock if m.config.InvalidToZero && (math.IsNaN(fres) || math.IsInf(fres, 0)) {
res := C.perfmon_getLastResult(gid, eidx, C.int(tid)) fres = 0.0
m.results[group][tid][gctr] = float64(res)
} }
evset.results[tid][gctr] = fres
evset.results[tid]["time"] = interval.Seconds()
evset.results[tid]["inverseClock"] = invClock
} }
} }
// Go over the event set metrics, derive the value out of the event:counter values and send it // Go over the event set metrics, derive the value out of the event:counter values and send it
for _, metric := range evset.Metrics { for _, metric := range m.config.Eventsets[evset.internal].Metrics {
// The metric scope is determined in the Init() function // The metric scope is determined in the Init() function
// Get the map scope-id -> tids // Get the map scope-id -> tids
scopemap := m.cpu2tid scopemap := m.cpu2tid
@@ -313,19 +348,16 @@ func (m *LikwidCollector) calcEventsetMetrics(group int, interval time.Duration,
scopemap = m.sock2tid scopemap = m.sock2tid
} }
for domain, tid := range scopemap { for domain, tid := range scopemap {
if tid >= 0 { if tid >= 0 && len(metric.Calc) > 0 {
value, err := agg.EvalFloat64Condition(metric.Calc, m.results[group][tid]) value, err := agg.EvalFloat64Condition(metric.Calc, evset.results[tid])
if err != nil { if err != nil {
cclog.ComponentError(m.name, "Calculation for metric", metric.Name, "failed:", err.Error()) cclog.ComponentError(m.name, "Calculation for metric", metric.Name, "failed:", err.Error())
continue
}
m.mresults[group][tid][metric.Name] = value
if m.config.InvalidToZero && math.IsNaN(value) {
value = 0.0 value = 0.0
} }
if m.config.InvalidToZero && math.IsInf(value, 0) { if m.config.InvalidToZero && (math.IsNaN(value) || math.IsInf(value, 0)) {
value = 0.0 value = 0.0
} }
evset.metrics[tid][metric.Name] = value
// Now we have the result, send it with the proper tags // Now we have the result, send it with the proper tags
if !math.IsNaN(value) { if !math.IsNaN(value) {
if metric.Publish { if metric.Publish {
@@ -360,8 +392,8 @@ func (m *LikwidCollector) calcGlobalMetrics(interval time.Duration, output chan
if tid >= 0 { if tid >= 0 {
// Here we generate parameter list // Here we generate parameter list
params := make(map[string]interface{}) params := make(map[string]interface{})
for j := range m.groups { for _, evset := range m.likwidGroups {
for mname, mres := range m.mresults[j][tid] { for mname, mres := range evset.metrics[tid] {
params[mname] = mres params[mname] = mres
} }
} }
@@ -369,15 +401,12 @@ func (m *LikwidCollector) calcGlobalMetrics(interval time.Duration, output chan
value, err := agg.EvalFloat64Condition(metric.Calc, params) value, err := agg.EvalFloat64Condition(metric.Calc, params)
if err != nil { if err != nil {
cclog.ComponentError(m.name, "Calculation for metric", metric.Name, "failed:", err.Error()) cclog.ComponentError(m.name, "Calculation for metric", metric.Name, "failed:", err.Error())
continue value = 0.0
}
if m.config.InvalidToZero && (math.IsNaN(value) || math.IsInf(value, 0)) {
value = 0.0
} }
m.gmresults[tid][metric.Name] = value m.gmresults[tid][metric.Name] = value
if m.config.InvalidToZero && math.IsNaN(value) {
value = 0.0
}
if m.config.InvalidToZero && math.IsInf(value, 0) {
value = 0.0
}
// Now we have the result, send it with the proper tags // Now we have the result, send it with the proper tags
if !math.IsNaN(value) { if !math.IsNaN(value) {
if metric.Publish { if metric.Publish {
@@ -401,38 +430,163 @@ func (m *LikwidCollector) calcGlobalMetrics(interval time.Duration, output chan
return nil return nil
} }
// LateInit performs the part of the LIKWID setup that talks to the library:
// it selects the access mode, initializes the LIKWID topology and perfmon
// modules, builds the socket -> thread-id map, determines the base frequency
// and registers all configured event sets.
//
// It is a no-op if the collector is already initialized. On success
// m.initialized is set and nil is returned. An error is returned if the
// topology or perfmon module cannot be initialized or if no event set could
// be registered; in the latter two cases the modules initialized so far are
// finalized again.
//
// Event sets whose event string equals an already registered one are skipped.
// The C strings of skipped or rejected event sets are released here, because
// nothing else keeps a reference to them.
func (m *LikwidCollector) LateInit() error {
	var ret C.int
	if m.initialized {
		return nil
	}

	switch m.config.AccessMode {
	case "direct":
		C.HPMmode(0)
	case "accessdaemon":
		// Make the access daemon discoverable through PATH
		if len(m.config.DaemonPath) > 0 {
			p := os.Getenv("PATH")
			os.Setenv("PATH", m.config.DaemonPath+":"+p)
		}
		C.HPMmode(1)
	}

	cclog.ComponentDebug(m.name, "initialize LIKWID topology")
	ret = C.topology_init()
	if ret != 0 {
		err := errors.New("failed to initialize LIKWID topology")
		cclog.ComponentError(m.name, err.Error())
		return err
	}

	// Resolve the LIKWID CPU string "S<socket>:0" for each socket and map the
	// socket to the thread id of the resulting CPU
	m.sock2tid = make(map[int]int)
	tmp := make([]C.int, 1)
	for _, sid := range topo.SocketList() {
		cstr := C.CString(fmt.Sprintf("S%d:0", sid))
		ret = C.cpustr_to_cpulist(cstr, &tmp[0], 1)
		if ret > 0 {
			m.sock2tid[sid] = m.cpu2tid[int(tmp[0])]
		}
		C.free(unsafe.Pointer(cstr))
	}

	m.basefreq = getBaseFreq()
	cclog.ComponentDebug(m.name, "BaseFreq", m.basefreq)

	cclog.ComponentDebug(m.name, "initialize LIKWID perfmon module")
	ret = C.perfmon_init(C.int(len(m.cpulist)), &m.cpulist[0])
	if ret != 0 {
		var err error = nil
		C.topology_finalize()
		// A return value of -22 is reported as locked access and, unlike
		// other failures, is not logged here
		if ret != -22 {
			err = errors.New("failed to initialize LIKWID perfmon")
			cclog.ComponentError(m.name, err.Error())
		} else {
			err = errors.New("access to LIKWID perfmon locked")
		}
		return err
	}

	// Releases the C strings allocated by genLikwidEventSet for an event set
	// that is not stored in m.likwidGroups
	freeEventSet := func(g LikwidEventsetConfig) {
		for _, c := range g.eorder {
			C.free(unsafe.Pointer(c))
		}
		C.free(unsafe.Pointer(g.estr))
	}

	// Register the configured event sets with LIKWID, skipping duplicates
	for i, evset := range m.config.Eventsets {
		if len(evset.Events) == 0 {
			cclog.ComponentError(m.name, "Invalid Likwid eventset config, no events given")
			continue
		}

		likwidGroup := genLikwidEventSet(evset)

		duplicate := false
		for _, g := range m.likwidGroups {
			if likwidGroup.go_estr == g.go_estr {
				duplicate = true
				break
			}
		}
		if duplicate {
			freeEventSet(likwidGroup)
			continue
		}

		// Now we add the list of events to likwid
		gid := C.perfmon_addEventSet(likwidGroup.estr)
		if gid < 0 {
			cclog.ComponentError(m.name, "Failed to add LIKWID eventset", likwidGroup.go_estr)
			freeEventSet(likwidGroup)
			continue
		}
		likwidGroup.gid = gid
		likwidGroup.internal = i
		m.likwidGroups[gid] = likwidGroup
	}

	// If no event set could be added, shut down LikwidCollector
	if len(m.likwidGroups) == 0 {
		C.perfmon_finalize()
		C.topology_finalize()
		err := errors.New("no LIKWID performance group initialized")
		cclog.ComponentError(m.name, err.Error())
		return err
	}

	// On the first SIGCHLD or interrupt, mark the collector as uninitialized.
	// NOTE(review): this write to m.initialized is not guarded by m.lock —
	// confirm whether that is acceptable.
	sigchan := make(chan os.Signal, 1)
	signal.Notify(sigchan, syscall.SIGCHLD)
	signal.Notify(sigchan, os.Interrupt)
	go func() {
		<-sigchan

		signal.Stop(sigchan)
		m.initialized = false
	}()

	m.initialized = true
	return nil
}
// main read function taking multiple measurement rounds, each 'interval' seconds long // main read function taking multiple measurement rounds, each 'interval' seconds long
func (m *LikwidCollector) Read(interval time.Duration, output chan lp.CCMetric) { func (m *LikwidCollector) Read(interval time.Duration, output chan lp.CCMetric) {
var skip bool = false
var err error
if !m.init { if !m.init {
return return
} }
for i := range m.groups { if !m.initialized {
m.lock.Lock()
err = m.LateInit()
if err != nil {
m.lock.Unlock()
return
}
m.initialized = true
m.lock.Unlock()
}
if m.initialized && !skip {
for _, evset := range m.likwidGroups {
if !skip {
// measure event set 'i' for 'interval' seconds // measure event set 'i' for 'interval' seconds
err := m.takeMeasurement(i, interval) skip, err = m.takeMeasurement(evset, interval)
if err != nil { if err != nil {
cclog.ComponentError(m.name, err.Error()) cclog.ComponentError(m.name, err.Error())
return return
} }
// read measurements and derive event set metrics
m.calcEventsetMetrics(i, interval, output)
} }
if !skip {
// read measurements and derive event set metrics
m.calcEventsetMetrics(evset, interval, output)
}
}
if !skip {
// use the event set metrics to derive the global metrics // use the event set metrics to derive the global metrics
m.calcGlobalMetrics(interval, output) m.calcGlobalMetrics(interval, output)
} }
}
}
func (m *LikwidCollector) Close() { func (m *LikwidCollector) Close() {
if m.init { if m.init {
cclog.ComponentDebug(m.name, "Closing ...")
m.init = false m.init = false
if m.running { cclog.ComponentDebug(m.name, "Closing ...")
cclog.ComponentDebug(m.name, "Stopping counters") m.lock.Lock()
C.perfmon_stopCounters() if m.initialized {
}
cclog.ComponentDebug(m.name, "Finalize LIKWID perfmon module") cclog.ComponentDebug(m.name, "Finalize LIKWID perfmon module")
C.perfmon_finalize() C.perfmon_finalize()
m.initialized = false
}
m.lock.Unlock()
cclog.ComponentDebug(m.name, "Finalize LIKWID topology module") cclog.ComponentDebug(m.name, "Finalize LIKWID topology module")
C.topology_finalize() C.topology_finalize()
cclog.ComponentDebug(m.name, "Closing done") cclog.ComponentDebug(m.name, "Closing done")
} }
} }

View File

@@ -3,22 +3,53 @@
The `likwid` collector is probably the most complicated collector. The LIKWID library is included as static library with *direct* access mode. The *direct* access mode is suitable if the daemon is executed by a root user. The static library does not contain the performance groups, so all information needs to be provided in the configuration. The `likwid` collector is probably the most complicated collector. The LIKWID library is included as static library with *direct* access mode. The *direct* access mode is suitable if the daemon is executed by a root user. The static library does not contain the performance groups, so all information needs to be provided in the configuration.
The `likwid` configuration consists of two parts, the "eventsets" and "globalmetrics": ```json
- An event set list itself has two parts, the "events" and a set of derivable "metrics". Each of the "events" is a counter:event pair in LIKWID's syntax. The "metrics" are a list of formulas to derive the metric value from the measurements of the "events". Each metric has a name, the formula, a scope and a publish flag. Counter names can be used like variables in the formulas, so `PMC0+PMC1` sums the measurements for the both events configured in the counters `PMC0` and `PMC1`. The scope tells the Collector whether it is a metric for each hardware thread (`hwthread`) or each CPU socket (`socket`). You may specify a unit for the metric with `unit`. The last one is the publishing flag. It tells the collector whether a metric should be sent to the router. "likwid": {
- The global metrics are metrics which require data from all event set measurements to be derived. The inputs are the metrics in the event sets. Similar to the metrics in the event sets, the global metrics are defined by a name, a formula, a scope and a publish flag. See event set metrics for details. The only difference is that there is no access to the raw event measurements anymore but only to the metrics. So, the idea is to derive a metric in the "eventsets" section and reuse it in the "globalmetrics" part. If you need a metric only for deriving the global metrics, disable forwarding of the event set metrics (`publish=false`). **Be aware** that the combination might be misleading because the "behavior" of a metric changes over time and the multiple measurements might count different computing phases. Similar to the metrics in the eventset, you can specify a metric unit with the `unit` field. "force_overwrite" : false,
"invalid_to_zero" : false,
"eventsets": [
{
"events" : {
"COUNTER0": "EVENT0",
"COUNTER1": "EVENT1"
},
"metrics" : [
{
"name": "sum_01",
"calc": "COUNTER0 + COUNTER1",
"publish": false,
"unit": "myunit",
"type": "hwthread"
}
]
}
],
"globalmetrics" : [
{
"name": "global_sum",
"calc": "sum_01",
"publish": true,
"unit": "myunit",
"type": "hwthread"
}
]
}
```
The `likwid` configuration consists of two parts, the `eventsets` and `globalmetrics`:
- An event set list itself has two parts, the `events` and a set of derivable `metrics`. Each of the `events` is a `counter:event` pair in LIKWID's syntax. The `metrics` are a list of formulas to derive the metric value from the measured values of the `events`. Each metric has a name, the formula, a type and a publish flag. There is an optional `unit` field. Counter names can be used like variables in the formulas, so `PMC0+PMC1` sums the measurements for both events configured in the counters `PMC0` and `PMC1`. You can optionally use `time` for the measurement time and `inverseClock` for `1.0/baseCpuFrequency`. The type tells the LikwidCollector whether it is a metric for each hardware thread (`hwthread`) or each CPU socket (`socket`). You may specify a unit for the metric with `unit`. The last one is the publishing flag. It tells the LikwidCollector whether a metric should be sent to the router or is only used internally to compute a global metric.
- The `globalmetrics` are metrics which require data from multiple event set measurements to be derived. The inputs are the metrics in the event sets. Similar to the metrics in the event sets, the global metrics are defined by a name, a formula, a type and a publish flag. See event set metrics for details. The only difference is that there is no access to the raw event measurements anymore but only to the metrics. Also `time` and `inverseClock` cannot be used anymore. So, the idea is to derive a metric in the `eventsets` section and reuse it in the `globalmetrics` part. If you need a metric only for deriving the global metrics, disable forwarding of the event set metrics (`"publish": false`). **Be aware** that the combination might be misleading because the "behavior" of a metric changes over time and the multiple measurements might count different computing phases. Similar to the metrics in the event sets, you can specify a metric unit with the `unit` field.
Additional options: Additional options:
- `access_mode` : Method to use for hardware performance monitoring (`direct` access as root user, `accessdaemon` for the daemon mode)
- `accessdaemon_path`: Folder with the access daemon `likwid-accessD`, commonly `$LIKWID_INSTALL_LOC/sbin`
- `force_overwrite`: Same as setting `LIKWID_FORCE=1`. In case counters are already in-use, LIKWID overwrites their configuration to do its measurements - `force_overwrite`: Same as setting `LIKWID_FORCE=1`. In case counters are already in-use, LIKWID overwrites their configuration to do its measurements
- `invalid_to_zero`: In some cases, the calculations result in `NaN` or `Inf`. With this option, all `NaN` and `Inf` values are replaces with `0.0`. - `invalid_to_zero`: In some cases, the calculations result in `NaN` or `Inf`. With this option, all `NaN` and `Inf` values are replaces with `0.0`. See below in [seperate section](./likwidMetric.md#invalid_to_zero-option)
- `access_mode`: Specify LIKWID access mode: `direct` for direct register access as root user or `accessdaemon` - `access_mode`: Specify LIKWID access mode: `direct` for direct register access as root user or `accessdaemon`. The access mode `perf_event` is current untested.
- `accessdaemon_path`: Folder of the accessDaemon `likwid-accessD` - `accessdaemon_path`: Folder of the accessDaemon `likwid-accessD` (like `/usr/local/sbin`)
- `liblikwid_path`: Location of `liblikwid.so` - `liblikwid_path`: Location of `liblikwid.so` including file name like `/usr/local/lib/liblikwid.so`
### Available metric scopes ### Available metric scopes
Hardware performance counters are scattered all over the system nowadays. A counter coveres a specific part of the system. While there are hardware thread specific counter for CPU cycles, instructions and so on, some others are specific for a whole CPU socket/package. To address that, the collector provides the specification of a 'scope' for each metric. Hardware performance counters are scattered all over the system nowadays. A counter coveres a specific part of the system. While there are hardware thread specific counter for CPU cycles, instructions and so on, some others are specific for a whole CPU socket/package. To address that, the LikwidCollector provides the specification of a `type` for each metric.
- `hwthread` : One metric per CPU hardware thread with the tags `"type" : "hwthread"` and `"type-id" : "$hwthread_id"` - `hwthread` : One metric per CPU hardware thread with the tags `"type" : "hwthread"` and `"type-id" : "$hwthread_id"`
- `socket` : One metric per CPU socket/package with the tags `"type" : "socket"` and `"type-id" : "$socket_id"` - `socket` : One metric per CPU socket/package with the tags `"type" : "socket"` and `"type-id" : "$socket_id"`
@@ -50,6 +81,7 @@ $ scripts/likwid_perfgroup_to_cc_config.py ICX MEM_DP
{ {
"events": { "events": {
"FIXC0": "INSTR_RETIRED_ANY", "FIXC0": "INSTR_RETIRED_ANY",
"FIXC1": "CPU_CLK_UNHALTED_CORE",
"..." : "..." "..." : "..."
}, },
"metrics" : [ "metrics" : [
@@ -75,21 +107,28 @@ LIKWID checks the file `/var/run/likwid.lock` before performing any interfering
Before (SLURM prolog, ...) Before (SLURM prolog, ...)
``` ```
$ chwon $JOBUSER /var/run/likwid.lock $ chown $JOBUSER /var/run/likwid.lock
``` ```
After (SLURM epilog, ...) After (SLURM epilog, ...)
``` ```
$ chwon $CCUSER /var/run/likwid.lock $ chown $CCUSER /var/run/likwid.lock
``` ```
### `invalid_to_zero` option
In some cases LIKWID returns `0.0` for some events that are further used in processing and maybe used as divisor in a calculation. After evaluation of a metric, the result might be `NaN` or `+-Inf`. These resulting metrics are commonly not created and forwarded to the router because the [InfluxDB line protocol](https://docs.influxdata.com/influxdb/cloud/reference/syntax/line-protocol/#float) does not support these special floating-point values. If you want to have them sent, this option forces these metric values to be `0.0` instead.
One might think this does not happen often, but commonly used metrics in the world of performance engineering like Instructions-per-Cycle (IPC) or, more frequently, the actual CPU clock are derived with events like `CPU_CLK_UNHALTED_CORE` (Intel) which do not increment in halted state (as the name implies). There are different power management systems in a chip which can cause a hardware thread to go into such a state. Moreover, if no cycles are executed by the core, many other events are not incremented either (like `INSTR_RETIRED_ANY` for retired instructions, which is part of IPC).
### Example configuration ### Example configuration
#### AMD Zen3
```json ```json
"likwid": { "likwid": {
"force_overwrite" : false, "force_overwrite" : false,
"nan_to_zero" : false, "invalid_to_zero" : false,
"eventsets": [ "eventsets": [
{ {
"events": { "events": {

View File

@@ -40,8 +40,13 @@ type MemstatCollector struct {
sendMemUsed bool sendMemUsed bool
} }
func getStats(filename string) map[string]float64 { type MemstatStats struct {
stats := make(map[string]float64) value float64
unit string
}
func getStats(filename string) map[string]MemstatStats {
stats := make(map[string]MemstatStats)
file, err := os.Open(filename) file, err := os.Open(filename)
if err != nil { if err != nil {
cclog.Error(err.Error()) cclog.Error(err.Error())
@@ -55,12 +60,18 @@ func getStats(filename string) map[string]float64 {
if len(linefields) == 3 { if len(linefields) == 3 {
v, err := strconv.ParseFloat(linefields[1], 64) v, err := strconv.ParseFloat(linefields[1], 64)
if err == nil { if err == nil {
stats[strings.Trim(linefields[0], ":")] = v stats[strings.Trim(linefields[0], ":")] = MemstatStats{
value: v,
unit: linefields[2],
}
} }
} else if len(linefields) == 5 { } else if len(linefields) == 5 {
v, err := strconv.ParseFloat(linefields[3], 64) v, err := strconv.ParseFloat(linefields[3], 64)
if err == nil { if err == nil {
stats[strings.Trim(linefields[0], ":")] = v stats[strings.Trim(linefields[0], ":")] = MemstatStats{
value: v,
unit: linefields[4],
}
} }
} }
} }
@@ -78,7 +89,7 @@ func (m *MemstatCollector) Init(config json.RawMessage) error {
return err return err
} }
} }
m.meta = map[string]string{"source": m.name, "group": "Memory", "unit": "GByte"} m.meta = map[string]string{"source": m.name, "group": "Memory"}
m.stats = make(map[string]int64) m.stats = make(map[string]int64)
m.matches = make(map[string]string) m.matches = make(map[string]string)
m.tags = map[string]string{"type": "node"} m.tags = map[string]string{"type": "node"}
@@ -151,30 +162,51 @@ func (m *MemstatCollector) Read(interval time.Duration, output chan lp.CCMetric)
return return
} }
sendStats := func(stats map[string]float64, tags map[string]string) { sendStats := func(stats map[string]MemstatStats, tags map[string]string) {
for match, name := range m.matches { for match, name := range m.matches {
var value float64 = 0 var value float64 = 0
var unit string = ""
if v, ok := stats[match]; ok { if v, ok := stats[match]; ok {
value = v value = v.value
if len(v.unit) > 0 {
unit = v.unit
} }
y, err := lp.New(name, tags, m.meta, map[string]interface{}{"value": value * 1e-6}, time.Now()) }
y, err := lp.New(name, tags, m.meta, map[string]interface{}{"value": value}, time.Now())
if err == nil { if err == nil {
if len(unit) > 0 {
y.AddMeta("unit", unit)
}
output <- y output <- y
} }
} }
if m.sendMemUsed { if m.sendMemUsed {
memUsed := 0.0 memUsed := 0.0
unit := ""
if totalVal, total := stats["MemTotal"]; total { if totalVal, total := stats["MemTotal"]; total {
if freeVal, free := stats["MemFree"]; free { if freeVal, free := stats["MemFree"]; free {
if bufVal, buffers := stats["Buffers"]; buffers { if bufVal, buffers := stats["Buffers"]; buffers {
if cacheVal, cached := stats["Cached"]; cached { if cacheVal, cached := stats["Cached"]; cached {
memUsed = totalVal - (freeVal + bufVal + cacheVal) memUsed = totalVal.value - (freeVal.value + bufVal.value + cacheVal.value)
if len(totalVal.unit) > 0 {
unit = totalVal.unit
} else if len(freeVal.unit) > 0 {
unit = freeVal.unit
} else if len(bufVal.unit) > 0 {
unit = bufVal.unit
} else if len(cacheVal.unit) > 0 {
unit = cacheVal.unit
} }
} }
} }
} }
y, err := lp.New("mem_used", tags, m.meta, map[string]interface{}{"value": memUsed * 1e-6}, time.Now()) }
y, err := lp.New("mem_used", tags, m.meta, map[string]interface{}{"value": memUsed}, time.Now())
if err == nil { if err == nil {
if len(unit) > 0 {
y.AddMeta("unit", unit)
}
output <- y output <- y
} }
} }

View File

@@ -36,7 +36,7 @@ type nfsCollector struct {
} }
func (m *nfsCollector) initStats() error { func (m *nfsCollector) initStats() error {
cmd := exec.Command(m.config.Nfsstats, `-l`) cmd := exec.Command(m.config.Nfsstats, `-l`, `--all`)
cmd.Wait() cmd.Wait()
buffer, err := cmd.Output() buffer, err := cmd.Output()
if err == nil { if err == nil {
@@ -52,7 +52,7 @@ func (m *nfsCollector) initStats() error {
if err == nil { if err == nil {
x := m.data[name] x := m.data[name]
x.current = value x.current = value
x.last = 0 x.last = value
m.data[name] = x m.data[name] = x
} }
} }
@@ -63,7 +63,7 @@ func (m *nfsCollector) initStats() error {
} }
func (m *nfsCollector) updateStats() error { func (m *nfsCollector) updateStats() error {
cmd := exec.Command(m.config.Nfsstats, `-l`) cmd := exec.Command(m.config.Nfsstats, `-l`, `--all`)
cmd.Wait() cmd.Wait()
buffer, err := cmd.Output() buffer, err := cmd.Output()
if err == nil { if err == nil {

View File

@@ -1,8 +1,8 @@
{ {
"sinks": "sinks.json", "sinks": "./sinks.json",
"collectors" : "collectors.json", "collectors" : "./collectors.json",
"receivers" : "receivers.json", "receivers" : "./receivers.json",
"router" : "router.json", "router" : "./router.json",
"interval": 10, "interval": "10s",
"duration": 1 "duration": "1s"
} }

21
go.mod
View File

@@ -3,17 +3,14 @@ module github.com/ClusterCockpit/cc-metric-collector
go 1.16 go 1.16
require ( require (
github.com/NVIDIA/go-nvml v0.11.1-0 github.com/NVIDIA/go-nvml v0.11.6-0
github.com/influxdata/influxdb-client-go/v2 v2.7.0
github.com/influxdata/line-protocol v0.0.0-20210922203350-b1ad95c89adf
github.com/nats-io/nats.go v1.13.1-0.20211122170419-d7c1d78a50fc
golang.org/x/sys v0.0.0-20220114195835-da31bd327af9
gopkg.in/Knetic/govaluate.v2 v2.3.0
)
require (
github.com/PaesslerAG/gval v1.1.2 github.com/PaesslerAG/gval v1.1.2
github.com/golang/protobuf v1.5.2 // indirect github.com/gorilla/mux v1.8.0
github.com/nats-io/nats-server/v2 v2.7.0 // indirect github.com/influxdata/influxdb-client-go/v2 v2.8.1
google.golang.org/protobuf v1.27.1 // indirect github.com/influxdata/line-protocol v0.0.0-20210922203350-b1ad95c89adf
github.com/nats-io/nats-server/v2 v2.8.0 // indirect
github.com/nats-io/nats.go v1.14.0
github.com/prometheus/client_golang v1.12.1
github.com/stmcginnis/gofish v0.13.0
golang.org/x/sys v0.0.0-20220412211240-33da011f77ad
) )

View File

@@ -169,8 +169,11 @@ func DieList() []int {
} }
} }
} }
if len(dielist) > 0 {
return dielist return dielist
} }
return SocketList()
}
type CpuEntry struct { type CpuEntry struct {
Cpuid int Cpuid int
@@ -261,7 +264,7 @@ func CpuData() []CpuEntry {
for _, c := range CpuList() { for _, c := range CpuList() {
clist = append(clist, CpuEntry{Cpuid: c}) clist = append(clist, CpuEntry{Cpuid: c})
} }
for _, centry := range clist { for i, centry := range clist {
centry.Socket = -1 centry.Socket = -1
centry.Numadomain = -1 centry.Numadomain = -1
centry.Die = -1 centry.Die = -1
@@ -289,6 +292,8 @@ func CpuData() []CpuEntry {
// Lookup NUMA domain id // Lookup NUMA domain id
centry.Numadomain = getNumaDomain(base) centry.Numadomain = getNumaDomain(base)
// Update values in output list
clist[i] = centry
} }
return clist return clist
} }

View File

@@ -48,7 +48,6 @@ type metricRouter struct {
done chan bool // channel to finish / stop metric router done chan bool // channel to finish / stop metric router
wg *sync.WaitGroup // wait group for all goroutines in cc-metric-collector wg *sync.WaitGroup // wait group for all goroutines in cc-metric-collector
timestamp time.Time // timestamp periodically updated by ticker each interval timestamp time.Time // timestamp periodically updated by ticker each interval
timerdone chan bool // channel to finish / stop timestamp updater
ticker mct.MultiChanTicker // periodically ticking once each interval ticker mct.MultiChanTicker // periodically ticking once each interval
config metricRouterConfig // json encoded config for metric router config metricRouterConfig // json encoded config for metric router
cache MetricCache // pointer to MetricCache cache MetricCache // pointer to MetricCache
@@ -124,29 +123,6 @@ func (r *metricRouter) Init(ticker mct.MultiChanTicker, wg *sync.WaitGroup, rout
return nil return nil
} }
// StartTimer starts a timer which updates timestamp periodically
func (r *metricRouter) StartTimer() {
m := make(chan time.Time)
r.ticker.AddChannel(m)
r.timerdone = make(chan bool)
r.wg.Add(1)
go func() {
defer r.wg.Done()
for {
select {
case <-r.timerdone:
close(r.timerdone)
cclog.ComponentDebug("MetricRouter", "TIMER DONE")
return
case t := <-m:
r.timestamp = t
}
}
}()
cclog.ComponentDebug("MetricRouter", "TIMER START")
}
func getParamMap(point lp.CCMetric) map[string]interface{} { func getParamMap(point lp.CCMetric) map[string]interface{} {
params := make(map[string]interface{}) params := make(map[string]interface{})
params["metric"] = point params["metric"] = point
@@ -235,8 +211,9 @@ func (r *metricRouter) dropMetric(point lp.CCMetric) bool {
func (r *metricRouter) Start() { func (r *metricRouter) Start() {
// start timer if configured // start timer if configured
r.timestamp = time.Now() r.timestamp = time.Now()
timeChan := make(chan time.Time)
if r.config.IntervalStamp { if r.config.IntervalStamp {
r.StartTimer() r.ticker.AddChannel(timeChan)
} }
// Router manager is done // Router manager is done
@@ -316,6 +293,10 @@ func (r *metricRouter) Start() {
done() done()
return return
case timestamp := <-timeChan:
r.timestamp = timestamp
cclog.ComponentDebug("MetricRouter", "Update timestamp", r.timestamp.UnixNano())
case p := <-r.coll_input: case p := <-r.coll_input:
coll_forward(p) coll_forward(p)
for i := 0; len(r.coll_input) > 0 && i < (r.maxForward-1); i++ { for i := 0; len(r.coll_input) > 0 && i < (r.maxForward-1); i++ {
@@ -361,14 +342,6 @@ func (r *metricRouter) Close() {
// wait for close of channel r.done // wait for close of channel r.done
<-r.done <-r.done
// stop timer
if r.config.IntervalStamp {
cclog.ComponentDebug("MetricRouter", "TIMER CLOSE")
r.timerdone <- true
// wait for close of channel r.timerdone
<-r.timerdone
}
// stop metric cache // stop metric cache
if r.config.NumCacheIntervals > 0 { if r.config.NumCacheIntervals > 0 {
cclog.ComponentDebug("MetricRouter", "CACHE CLOSE") cclog.ComponentDebug("MetricRouter", "CACHE CLOSE")

View File

@@ -4,5 +4,22 @@
"address": "nats://my-url", "address": "nats://my-url",
"port" : "4222", "port" : "4222",
"database": "testcluster" "database": "testcluster"
},
"redfish_recv": {
"type": "redfish",
"client_config": [
{
"hostname": "my-host-1",
"username": "username-1",
"password": "password-1",
"endpoint": "https://my-endpoint-1"
},
{
"hostname": "my-host-2",
"username": "username-2",
"password": "password-2",
"endpoint": "https://my-endpoint-2"
}
]
} }
} }

View File

@@ -11,13 +11,12 @@ import (
var AvailableReceivers = map[string]func(name string, config json.RawMessage) (Receiver, error){ var AvailableReceivers = map[string]func(name string, config json.RawMessage) (Receiver, error){
"nats": NewNatsReceiver, "nats": NewNatsReceiver,
"redfish": NewRedfishReceiver,
} }
type receiveManager struct { type receiveManager struct {
inputs []Receiver inputs []Receiver
output chan lp.CCMetric output chan lp.CCMetric
done chan bool
wg *sync.WaitGroup
config []json.RawMessage config []json.RawMessage
} }
@@ -33,8 +32,6 @@ func (rm *receiveManager) Init(wg *sync.WaitGroup, receiverConfigFile string) er
// Initialize struct fields // Initialize struct fields
rm.inputs = make([]Receiver, 0) rm.inputs = make([]Receiver, 0)
rm.output = nil rm.output = nil
rm.done = make(chan bool)
rm.wg = wg
rm.config = make([]json.RawMessage, 0) rm.config = make([]json.RawMessage, 0)
configFile, err := os.Open(receiverConfigFile) configFile, err := os.Open(receiverConfigFile)
@@ -58,7 +55,7 @@ func (rm *receiveManager) Init(wg *sync.WaitGroup, receiverConfigFile string) er
} }
func (rm *receiveManager) Start() { func (rm *receiveManager) Start() {
rm.wg.Add(1) cclog.ComponentDebug("ReceiveManager", "START")
for _, r := range rm.inputs { for _, r := range rm.inputs {
cclog.ComponentDebug("ReceiveManager", "START", r.Name()) cclog.ComponentDebug("ReceiveManager", "START", r.Name())
@@ -97,16 +94,19 @@ func (rm *receiveManager) AddOutput(output chan lp.CCMetric) {
} }
func (rm *receiveManager) Close() { func (rm *receiveManager) Close() {
cclog.ComponentDebug("ReceiveManager", "CLOSE")
// Close all receivers
for _, r := range rm.inputs { for _, r := range rm.inputs {
cclog.ComponentDebug("ReceiveManager", "CLOSE", r.Name()) cclog.ComponentDebug("ReceiveManager", "CLOSE", r.Name())
r.Close() r.Close()
} }
rm.wg.Done()
cclog.ComponentDebug("ReceiveManager", "CLOSE") cclog.ComponentDebug("ReceiveManager", "DONE")
} }
func New(wg *sync.WaitGroup, receiverConfigFile string) (ReceiveManager, error) { func New(wg *sync.WaitGroup, receiverConfigFile string) (ReceiveManager, error) {
r := &receiveManager{} r := new(receiveManager)
err := r.Init(wg, receiverConfigFile) err := r.Init(wg, receiverConfigFile)
if err != nil { if err != nil {
return nil, err return nil, err

View File

@@ -0,0 +1,324 @@
package receivers
import (
"encoding/json"
"fmt"
"strconv"
"sync"
"time"
cclog "github.com/ClusterCockpit/cc-metric-collector/internal/ccLogger"
lp "github.com/ClusterCockpit/cc-metric-collector/internal/ccMetric"
// See: https://pkg.go.dev/github.com/stmcginnis/gofish
"github.com/stmcginnis/gofish"
)
// RedfishReceiver periodically reads power metrics from a list of redfish
// services and forwards them as CCMetrics.
//
// The config struct is filled from the receiver's JSON configuration by
// NewRedfishReceiver.
type RedfishReceiver struct {
	receiver
	config struct {
		// Receiver type as given in the JSON configuration
		Type string `json:"type"`
		// Upper bound for the number of worker goroutines that query redfish
		// services concurrently. NewRedfishReceiver presets this to 64.
		Fanout int `json:"fanout,omitempty"` // Default fanout: 64
		// Time between two read cycles in seconds (Start multiplies the value
		// with time.Second). NewRedfishReceiver presets this to 30.
		Interval int `json:"interval,omitempty"` // Default interval: 30s
		// Client config for each redfish service
		ClientConfigs []struct {
			// Pointer fields allow NewRedfishReceiver to distinguish a
			// missing key (nil) from an empty value
			Hostname *string `json:"hostname"`
			Username *string `json:"username"`
			Password *string `json:"password"`
			Endpoint *string `json:"endpoint"`
			// When the key is omitted, NewRedfishReceiver sets the gofish
			// Insecure flag to true
			Insecure *bool `json:"insecure,omitempty"`
			// Metric names that are removed before sending
			ExcludeMetrics []string `json:"exclude_metrics,omitempty"`
			// Unexported, therefore not read from JSON; populated by
			// NewRedfishReceiver from the fields above
			gofish gofish.ClientConfig
		} `json:"client_config"`
	}
	done chan bool // channel to finish / stop redfish receiver
	wg sync.WaitGroup // wait group for redfish receiver
}
// Start launches the redfish receiver.
//
// A background goroutine reads the power metrics of all configured redfish
// services once immediately and then once per configured interval, until the
// done channel is closed. Every metric that could be created is sent to
// r.sink.
func (r *RedfishReceiver) Start() {
	cclog.ComponentDebug(r.name, "START")

	// dropEmpty removes every entry with an empty value from a tag / meta map
	dropEmpty := func(kv map[string]string) {
		for k, v := range kv {
			if v == "" {
				delete(kv, k)
			}
		}
	}

	// readPowerMetric reads the redfish power metrics from the service
	// described by the client config with the given index
	readPowerMetric := func(clientConfigIndex int) error {
		cfg := &r.config.ClientConfigs[clientConfigIndex]

		// Open a session with the redfish service
		client, err := gofish.Connect(cfg.gofish)
		if err != nil {
			// Only these connection settings are printed; the password is
			// left out of the error message
			printable := struct {
				Username  string
				Endpoint  string
				BasicAuth bool
				Insecure  bool
			}{
				Username:  cfg.gofish.Username,
				Endpoint:  cfg.gofish.Endpoint,
				BasicAuth: cfg.gofish.BasicAuth,
				Insecure:  cfg.gofish.Insecure,
			}
			return fmt.Errorf("readPowerMetric: gofish.Connect(%+v) failed: %v", printable, err)
		}
		defer client.Logout()

		// All chassis managed by this service
		chassisList, err := client.Service.Chassis()
		if err != nil {
			return fmt.Errorf("readPowerMetric: c.Service.Chassis() failed: %v", err)
		}

		for _, chassis := range chassisList {
			// One timestamp is shared by all metrics of a chassis
			now := time.Now()

			// Power information of this chassis
			power, err := chassis.Power()
			if err != nil {
				return fmt.Errorf("readPowerMetric: chassis.Power() failed: %v", err)
			}
			if power == nil {
				continue
			}

			// Current, average, min and max consumed watts per power control
			for _, pc := range power.PowerControl {

				// Collected metrics, keyed by metric name
				values := map[string]float32{
					// Actual power being consumed (in watts) by the chassis
					"consumed_watts": pc.PowerConsumedWatts,
					// Average power level over the last IntervalInMin minutes
					"average_consumed_watts": pc.PowerMetrics.AverageConsumedWatts,
					// Minimum power level in watts within the last
					// IntervalInMin minutes
					"min_consumed_watts": pc.PowerMetrics.MinConsumedWatts,
					// Maximum power level in watts within the last
					// IntervalInMin minutes
					"max_consumed_watts": pc.PowerMetrics.MaxConsumedWatts,
				}

				// Remove the metrics the user asked to exclude
				for _, excluded := range cfg.ExcludeMetrics {
					delete(values, excluded)
				}

				// Tags; entries without a value are removed again
				tags := map[string]string{
					"hostname": *cfg.Hostname,
					"type":     "node",
					// ID uniquely identifies the resource
					"id": pc.ID,
					// MemberID uniquely identifies the member within the
					// collection
					"member_id": pc.MemberID,
					// Affected device(s) or region within the chassis to
					// which this power control applies
					"physical_context": string(pc.PhysicalContext),
					// Name of the power control
					"power_control_name": pc.Name,
				}
				dropEmpty(tags)

				// Meta data; entries without a value are removed again
				meta := map[string]string{
					"source":              r.name,
					"group":               "Energy",
					"interval_in_minutes": strconv.FormatFloat(float64(pc.PowerMetrics.IntervalInMin), 'f', -1, 32),
					"unit":                "watts",
				}
				dropEmpty(meta)

				// Forward every metric that could be created
				for metricName, metricValue := range values {
					metric, err := lp.New(
						metricName,
						tags,
						meta,
						map[string]interface{}{"value": metricValue},
						now,
					)
					if err != nil {
						continue
					}
					r.sink <- metric
				}
			}
		}

		return nil
	}

	// doReadPowerMetric reads the power metrics of all configured redfish
	// services. The services are queried by a pool of workers so that slow
	// services do not delay the others.
	doReadPowerMetric := func() {

		// Number of workers: the smaller of fanout and number of client configs
		numWorkers := len(r.config.ClientConfigs)
		if r.config.Fanout < numWorkers {
			numWorkers = r.config.Fanout
		}

		var workers sync.WaitGroup
		jobs := make(chan int, numWorkers)

		// Start the workers; each one handles client config indices until the
		// job channel is closed
		for w := 0; w < numWorkers; w++ {
			workers.Add(1)
			go func() {
				defer workers.Done()
				for idx := range jobs {
					if err := readPowerMetric(idx); err != nil {
						cclog.ComponentError(r.name, err)
					}
				}
			}()
		}

		// Hand the client config indices to the workers, but give up as soon
		// as the receiver is asked to stop
		stopped := false
	dispatch:
		for idx := range r.config.ClientConfigs {
			select {
			case jobs <- idx:
			case <-r.done:
				stopped = true
				break dispatch
			}
		}

		// No more jobs: let the workers run out of work
		close(jobs)
		if stopped {
			// Discard the jobs that are still queued
			for range jobs {
			}
		}
		workers.Wait()
	}

	// Background goroutine: read once, then wait for the next tick or for the
	// stop signal
	r.wg.Add(1)
	go func() {
		defer r.wg.Done()

		tick := time.NewTicker(time.Duration(r.config.Interval) * time.Second)
		defer tick.Stop()

		for {
			doReadPowerMetric()

			select {
			case <-r.done:
				// stop requested
				return
			case <-tick.C:
				// next read cycle
			}
		}
	}()

	cclog.ComponentDebug(r.name, "STARTED")
}
// Close stops the redfish receiver.
//
// It closes the done channel, which the goroutines started by Start select
// on, and then blocks until the receiver's wait group reports that the
// background goroutine has finished.
//
// NOTE(review): closing an already closed channel panics in Go, so Close must
// not be called more than once on the same receiver.
func (r *RedfishReceiver) Close() {
	cclog.ComponentDebug(r.name, "CLOSE")

	// Send the signal and wait
	close(r.done)
	r.wg.Wait()

	cclog.ComponentDebug(r.name, "DONE")
}
// NewRedfishReceiver creates a new instance of the redfish receiver.
//
// The receiver is given the name "RedfishReceiver(<name>)" and is configured
// from the JSON in config. Defaults (fanout 64, interval 30 seconds) are set
// first and may be overwritten by the JSON.
//
// An error is returned if
//   - the JSON cannot be decoded,
//   - fanout or interval is not greater than zero (a non-positive interval
//     would make time.NewTicker panic in Start, and a non-positive fanout
//     would leave Start without workers or make the channel creation panic),
//   - a client config lacks hostname, endpoint, username or password.
func NewRedfishReceiver(name string, config json.RawMessage) (Receiver, error) {
	r := new(RedfishReceiver)

	// Set name
	r.name = fmt.Sprintf("RedfishReceiver(%s)", name)

	// Create done channel
	r.done = make(chan bool)

	// Set defaults in r.config
	// Allow overwriting these defaults by reading config JSON
	r.config.Fanout = 64
	r.config.Interval = 30

	// Read the redfish receiver specific JSON config
	if len(config) > 0 {
		err := json.Unmarshal(config, &r.config)
		if err != nil {
			cclog.ComponentError(r.name, "Error reading config:", err.Error())
			return nil, err
		}
	}

	// Reject values that Start cannot work with. The defaults above are only
	// kept when the keys are absent; an explicit 0 or a negative number in
	// the JSON overwrites them.
	if r.config.Fanout <= 0 {
		err := fmt.Errorf("fanout must be greater than zero, got %d", r.config.Fanout)
		cclog.ComponentError(r.name, err)
		return nil, err
	}
	if r.config.Interval <= 0 {
		err := fmt.Errorf("interval must be greater than zero, got %d", r.config.Interval)
		cclog.ComponentError(r.name, err)
		return nil, err
	}

	// Create gofish client config
	for i := range r.config.ClientConfigs {
		clientConfig := &r.config.ClientConfigs[i]
		gofishConfig := &clientConfig.gofish

		// All four keys are mandatory; report the first missing one
		required := []struct {
			key   string
			value *string
		}{
			{"hostname", clientConfig.Hostname},
			{"endpoint", clientConfig.Endpoint},
			{"username", clientConfig.Username},
			{"password", clientConfig.Password},
		}
		for _, field := range required {
			if field.value == nil {
				err := fmt.Errorf("client config number %v requires %s", i, field.key)
				cclog.ComponentError(r.name, err)
				return nil, err
			}
		}

		gofishConfig.Endpoint = *clientConfig.Endpoint
		gofishConfig.Username = *clientConfig.Username
		gofishConfig.Password = *clientConfig.Password

		// NOTE(review): Insecure defaults to true when the key is omitted;
		// presumably this disables TLS certificate verification in gofish,
		// confirm against the gofish ClientConfig documentation. Kept for
		// backward compatibility with existing configurations.
		gofishConfig.Insecure = true
		if clientConfig.Insecure != nil {
			gofishConfig.Insecure = *clientConfig.Insecure
		}
	}

	return r, nil
}

View File

@@ -36,16 +36,26 @@ func (r *SampleReceiver) Start() {
// or use own go routine but always make sure it exits // or use own go routine but always make sure it exits
// as soon as it gets the signal of the r.done channel // as soon as it gets the signal of the r.done channel
//
// r.done = make(chan bool)
// r.wg.Add(1) // r.wg.Add(1)
// go func() { // go func() {
// defer r.wg.Done()
//
// // Create ticker
// ticker := time.NewTicker(30 * time.Second)
// defer ticker.Stop()
//
// for { // for {
// readMetric()
// select { // select {
// case <-ticker.C:
// // process ticker event -> continue
// continue
// case <-r.done: // case <-r.done:
// r.wg.Done()
// return // return
// } // }
// } // }
// r.wg.Done()
// }() // }()
} }

View File

@@ -15,3 +15,9 @@ CONF_DIR=/etc/cc-metric-collector
CONF_FILE=/etc/cc-metric-collector/cc-metric-collector.json CONF_FILE=/etc/cc-metric-collector/cc-metric-collector.json
RESTART_ON_UPGRADE=true RESTART_ON_UPGRADE=true
# Golang runtime debugging. (see: https://pkg.go.dev/runtime)
# GODEBUG=gctrace=1
# Golang garbage collection target percentage
# GOGC=100

View File

@@ -1,6 +1,8 @@
{ {
"mystdout": { "mystdout": {
"type": "stdout", "type": "stdout",
"meta_as_tags" : true "meta_as_tags": [
"unit"
]
} }
} }

View File

@@ -42,13 +42,13 @@ func (s *HttpSink) Write(m lp.CCMetric) error {
if s.buffer.Len() == 0 && s.flushDelay != 0 { if s.buffer.Len() == 0 && s.flushDelay != 0 {
// This is the first write since the last flush, start the flushTimer! // This is the first write since the last flush, start the flushTimer!
if s.flushTimer != nil && s.flushTimer.Stop() { if s.flushTimer != nil && s.flushTimer.Stop() {
cclog.ComponentDebug("HttpSink", "unexpected: the flushTimer was already running?") cclog.ComponentDebug(s.name, "unexpected: the flushTimer was already running?")
} }
// Run a batched flush for all lines that have arrived in the last second // Run a batched flush for all lines that have arrived in the last second
s.flushTimer = time.AfterFunc(s.flushDelay, func() { s.flushTimer = time.AfterFunc(s.flushDelay, func() {
if err := s.Flush(); err != nil { if err := s.Flush(); err != nil {
cclog.ComponentError("HttpSink", "flush failed:", err.Error()) cclog.ComponentError(s.name, "flush failed:", err.Error())
} }
}) })
} }
@@ -60,6 +60,7 @@ func (s *HttpSink) Write(m lp.CCMetric) error {
s.lock.Unlock() // defer does not work here as Flush() takes the lock as well s.lock.Unlock() // defer does not work here as Flush() takes the lock as well
if err != nil { if err != nil {
cclog.ComponentError(s.name, "encoding failed:", err.Error())
return err return err
} }
@@ -84,6 +85,7 @@ func (s *HttpSink) Flush() error {
// Create new request to send buffer // Create new request to send buffer
req, err := http.NewRequest(http.MethodPost, s.config.URL, s.buffer) req, err := http.NewRequest(http.MethodPost, s.config.URL, s.buffer)
if err != nil { if err != nil {
cclog.ComponentError(s.name, "failed to create request:", err.Error())
return err return err
} }
@@ -100,12 +102,15 @@ func (s *HttpSink) Flush() error {
// Handle transport/tcp errors // Handle transport/tcp errors
if err != nil { if err != nil {
cclog.ComponentError(s.name, "transport/tcp error:", err.Error())
return err return err
} }
// Handle application errors // Handle application errors
if res.StatusCode != http.StatusOK { if res.StatusCode != http.StatusOK {
return errors.New(res.Status) err = errors.New(res.Status)
cclog.ComponentError(s.name, "application error:", err.Error())
return err
} }
return nil return nil
@@ -114,7 +119,7 @@ func (s *HttpSink) Flush() error {
func (s *HttpSink) Close() { func (s *HttpSink) Close() {
s.flushTimer.Stop() s.flushTimer.Stop()
if err := s.Flush(); err != nil { if err := s.Flush(); err != nil {
cclog.ComponentError("HttpSink", "flush failed:", err.Error()) cclog.ComponentError(s.name, "flush failed:", err.Error())
} }
s.client.CloseIdleConnections() s.client.CloseIdleConnections()
} }

View File

@@ -6,12 +6,14 @@ import (
"encoding/json" "encoding/json"
"errors" "errors"
"fmt" "fmt"
"strings"
"time" "time"
cclog "github.com/ClusterCockpit/cc-metric-collector/internal/ccLogger" cclog "github.com/ClusterCockpit/cc-metric-collector/internal/ccLogger"
lp "github.com/ClusterCockpit/cc-metric-collector/internal/ccMetric" lp "github.com/ClusterCockpit/cc-metric-collector/internal/ccMetric"
influxdb2 "github.com/influxdata/influxdb-client-go/v2" influxdb2 "github.com/influxdata/influxdb-client-go/v2"
influxdb2Api "github.com/influxdata/influxdb-client-go/v2/api" influxdb2Api "github.com/influxdata/influxdb-client-go/v2/api"
influxdb2ApiHttp "github.com/influxdata/influxdb-client-go/v2/api/http"
) )
type InfluxAsyncSinkConfig struct { type InfluxAsyncSinkConfig struct {
@@ -23,15 +25,16 @@ type InfluxAsyncSinkConfig struct {
Password string `json:"password,omitempty"` Password string `json:"password,omitempty"`
Organization string `json:"organization,omitempty"` Organization string `json:"organization,omitempty"`
SSL bool `json:"ssl,omitempty"` SSL bool `json:"ssl,omitempty"`
RetentionPol string `json:"retention_policy,omitempty"`
// Maximum number of points sent to server in single request. Default 5000 // Maximum number of points sent to server in single request. Default 5000
BatchSize uint `json:"batch_size,omitempty"` BatchSize uint `json:"batch_size,omitempty"`
// Interval, in ms, in which the buffer is flushed if it has not already been written (by reaching batch size). Default 1000ms // Interval, in ms, in which the buffer is flushed if it has not already been written (by reaching batch size). Default 1000ms
FlushInterval uint `json:"flush_interval,omitempty"` FlushInterval uint `json:"flush_interval,omitempty"`
InfluxRetryInterval string `json:"retry_interval"` InfluxRetryInterval string `json:"retry_interval,omitempty"`
InfluxExponentialBase uint `json:"retry_exponential_base"` InfluxExponentialBase uint `json:"retry_exponential_base,omitempty"`
InfluxMaxRetries uint `json:"max_retries"` InfluxMaxRetries uint `json:"max_retries,omitempty"`
InfluxMaxRetryTime string `json:"max_retry_time"` InfluxMaxRetryTime string `json:"max_retry_time,omitempty"`
CustomFlushInterval string `json:"custom_flush_interval,omitempty"`
MaxRetryAttempts uint `json:"max_retry_attempts,omitempty"`
} }
type InfluxAsyncSink struct { type InfluxAsyncSink struct {
@@ -42,6 +45,8 @@ type InfluxAsyncSink struct {
config InfluxAsyncSinkConfig config InfluxAsyncSinkConfig
influxRetryInterval uint influxRetryInterval uint
influxMaxRetryTime uint influxMaxRetryTime uint
customFlushInterval time.Duration
flushTimer *time.Timer
} }
func (s *InfluxAsyncSink) connect() error { func (s *InfluxAsyncSink) connect() error {
@@ -60,20 +65,34 @@ func (s *InfluxAsyncSink) connect() error {
cclog.ComponentDebug(s.name, "Using URI", uri, "Org", s.config.Organization, "Bucket", s.config.Database) cclog.ComponentDebug(s.name, "Using URI", uri, "Org", s.config.Organization, "Bucket", s.config.Database)
clientOptions := influxdb2.DefaultOptions() clientOptions := influxdb2.DefaultOptions()
if s.config.BatchSize != 0 { if s.config.BatchSize != 0 {
cclog.ComponentDebug(s.name, "Batch size", s.config.BatchSize)
clientOptions.SetBatchSize(s.config.BatchSize) clientOptions.SetBatchSize(s.config.BatchSize)
} }
if s.config.FlushInterval != 0 { if s.config.FlushInterval != 0 {
cclog.ComponentDebug(s.name, "Flush interval", s.config.FlushInterval)
clientOptions.SetFlushInterval(s.config.FlushInterval) clientOptions.SetFlushInterval(s.config.FlushInterval)
} }
if s.influxRetryInterval != 0 {
cclog.ComponentDebug(s.name, "MaxRetryInterval", s.influxRetryInterval)
clientOptions.SetMaxRetryInterval(s.influxRetryInterval)
}
if s.influxMaxRetryTime != 0 {
cclog.ComponentDebug(s.name, "MaxRetryTime", s.influxMaxRetryTime)
clientOptions.SetMaxRetryTime(s.influxMaxRetryTime)
}
if s.config.InfluxExponentialBase != 0 {
cclog.ComponentDebug(s.name, "Exponential Base", s.config.InfluxExponentialBase)
clientOptions.SetExponentialBase(s.config.InfluxExponentialBase)
}
if s.config.InfluxMaxRetries != 0 {
cclog.ComponentDebug(s.name, "Max Retries", s.config.InfluxMaxRetries)
clientOptions.SetMaxRetries(s.config.InfluxMaxRetries)
}
clientOptions.SetTLSConfig( clientOptions.SetTLSConfig(
&tls.Config{ &tls.Config{
InsecureSkipVerify: true, InsecureSkipVerify: true,
}, },
) ).SetPrecision(time.Second)
clientOptions.SetMaxRetryInterval(s.influxRetryInterval)
clientOptions.SetMaxRetryTime(s.influxMaxRetryTime)
clientOptions.SetExponentialBase(s.config.InfluxExponentialBase)
clientOptions.SetMaxRetries(s.config.InfluxMaxRetries)
s.client = influxdb2.NewClientWithOptions(uri, auth, clientOptions) s.client = influxdb2.NewClientWithOptions(uri, auth, clientOptions)
s.writeApi = s.client.WriteAPI(s.config.Organization, s.config.Database) s.writeApi = s.client.WriteAPI(s.config.Organization, s.config.Database)
@@ -84,10 +103,23 @@ func (s *InfluxAsyncSink) connect() error {
if !ok { if !ok {
return fmt.Errorf("connection to %s not healthy", uri) return fmt.Errorf("connection to %s not healthy", uri)
} }
s.writeApi.SetWriteFailedCallback(func(batch string, err influxdb2ApiHttp.Error, retryAttempts uint) bool {
mlist := strings.Split(batch, "\n")
cclog.ComponentError(s.name, fmt.Sprintf("Failed to write batch with %d metrics %d times (max: %d): %s", len(mlist), retryAttempts, s.config.MaxRetryAttempts, err.Error()))
return retryAttempts <= s.config.MaxRetryAttempts
})
return nil return nil
} }
func (s *InfluxAsyncSink) Write(m lp.CCMetric) error { func (s *InfluxAsyncSink) Write(m lp.CCMetric) error {
if s.customFlushInterval != 0 && s.flushTimer == nil {
// Run a batched flush for all lines that have arrived in the defined interval
s.flushTimer = time.AfterFunc(s.customFlushInterval, func() {
if err := s.Flush(); err != nil {
cclog.ComponentError(s.name, "flush failed:", err.Error())
}
})
}
s.writeApi.WritePoint( s.writeApi.WritePoint(
m.ToPoint(s.meta_as_tags), m.ToPoint(s.meta_as_tags),
) )
@@ -95,7 +127,11 @@ func (s *InfluxAsyncSink) Write(m lp.CCMetric) error {
} }
func (s *InfluxAsyncSink) Flush() error { func (s *InfluxAsyncSink) Flush() error {
cclog.ComponentDebug(s.name, "Flushing")
s.writeApi.Flush() s.writeApi.Flush()
if s.customFlushInterval != 0 && s.flushTimer != nil {
s.flushTimer = nil
}
return nil return nil
} }
@@ -110,13 +146,17 @@ func NewInfluxAsyncSink(name string, config json.RawMessage) (Sink, error) {
s.name = fmt.Sprintf("InfluxSink(%s)", name) s.name = fmt.Sprintf("InfluxSink(%s)", name)
// Set default for maximum number of points sent to server in single request. // Set default for maximum number of points sent to server in single request.
s.config.BatchSize = 100 s.config.BatchSize = 0
s.influxRetryInterval = uint(time.Duration(1) * time.Second) s.influxRetryInterval = 0
s.config.InfluxRetryInterval = "1s" //s.config.InfluxRetryInterval = "1s"
s.influxMaxRetryTime = uint(7 * time.Duration(24) * time.Hour) s.influxMaxRetryTime = 0
s.config.InfluxMaxRetryTime = "168h" //s.config.InfluxMaxRetryTime = "168h"
s.config.InfluxMaxRetries = 20 s.config.InfluxMaxRetries = 0
s.config.InfluxExponentialBase = 2 s.config.InfluxExponentialBase = 0
s.config.FlushInterval = 0
s.config.CustomFlushInterval = ""
s.customFlushInterval = time.Duration(0)
s.config.MaxRetryAttempts = 1
// Default retry intervals (in seconds) // Default retry intervals (in seconds)
// 1 2 // 1 2
@@ -145,12 +185,17 @@ func NewInfluxAsyncSink(name string, config json.RawMessage) (Sink, error) {
return nil, err return nil, err
} }
} }
if len(s.config.Host) == 0 || if len(s.config.Port) == 0 {
len(s.config.Port) == 0 || return nil, errors.New("Missing port configuration required by InfluxSink")
len(s.config.Database) == 0 || }
len(s.config.Organization) == 0 || if len(s.config.Database) == 0 {
len(s.config.Password) == 0 { return nil, errors.New("Missing database configuration required by InfluxSink")
return nil, errors.New("not all configuration variables set required by InfluxAsyncSink") }
if len(s.config.Organization) == 0 {
return nil, errors.New("Missing organization configuration required by InfluxSink")
}
if len(s.config.Password) == 0 {
return nil, errors.New("Missing password configuration required by InfluxSink")
} }
// Create lookup map to use meta infos as tags in the output metric // Create lookup map to use meta infos as tags in the output metric
s.meta_as_tags = make(map[string]bool) s.meta_as_tags = make(map[string]bool)
@@ -168,6 +213,15 @@ func NewInfluxAsyncSink(name string, config json.RawMessage) (Sink, error) {
s.influxRetryInterval = toUint(s.config.InfluxRetryInterval, s.influxRetryInterval) s.influxRetryInterval = toUint(s.config.InfluxRetryInterval, s.influxRetryInterval)
s.influxMaxRetryTime = toUint(s.config.InfluxMaxRetryTime, s.influxMaxRetryTime) s.influxMaxRetryTime = toUint(s.config.InfluxMaxRetryTime, s.influxMaxRetryTime)
// Use a separate timer for calling Flush()
if len(s.config.CustomFlushInterval) > 0 {
t, err := time.ParseDuration(s.config.CustomFlushInterval)
if err != nil {
return nil, fmt.Errorf("invalid duration in 'custom_flush_interval': %v", err)
}
s.customFlushInterval = t
}
// Connect to InfluxDB server // Connect to InfluxDB server
if err := s.connect(); err != nil { if err := s.connect(); err != nil {
return nil, fmt.Errorf("unable to connect: %v", err) return nil, fmt.Errorf("unable to connect: %v", err)

View File

@@ -6,15 +6,21 @@ import (
"encoding/json" "encoding/json"
"errors" "errors"
"fmt" "fmt"
"sync"
"time" "time"
cclog "github.com/ClusterCockpit/cc-metric-collector/internal/ccLogger" cclog "github.com/ClusterCockpit/cc-metric-collector/internal/ccLogger"
lp "github.com/ClusterCockpit/cc-metric-collector/internal/ccMetric" lp "github.com/ClusterCockpit/cc-metric-collector/internal/ccMetric"
influxdb2 "github.com/influxdata/influxdb-client-go/v2" influxdb2 "github.com/influxdata/influxdb-client-go/v2"
influxdb2Api "github.com/influxdata/influxdb-client-go/v2/api" influxdb2Api "github.com/influxdata/influxdb-client-go/v2/api"
"github.com/influxdata/influxdb-client-go/v2/api/write"
) )
type InfluxSinkConfig struct { type InfluxSink struct {
sink
client influxdb2.Client
writeApi influxdb2Api.WriteAPIBlocking
config struct {
defaultSinkConfig defaultSinkConfig
Host string `json:"host,omitempty"` Host string `json:"host,omitempty"`
Port string `json:"port,omitempty"` Port string `json:"port,omitempty"`
@@ -23,52 +29,58 @@ type InfluxSinkConfig struct {
Password string `json:"password,omitempty"` Password string `json:"password,omitempty"`
Organization string `json:"organization,omitempty"` Organization string `json:"organization,omitempty"`
SSL bool `json:"ssl,omitempty"` SSL bool `json:"ssl,omitempty"`
RetentionPol string `json:"retention_policy,omitempty"` // Maximum number of points sent to server in single request. Default 100
InfluxRetryInterval string `json:"retry_interval"` BatchSize int `json:"batch_size,omitempty"`
InfluxExponentialBase uint `json:"retry_exponential_base"` // Interval, in which is buffer flushed if it has not been already written (by reaching batch size). Default 1s
InfluxMaxRetries uint `json:"max_retries"` FlushInterval string `json:"flush_delay,omitempty"`
InfluxMaxRetryTime string `json:"max_retry_time"` }
//InfluxMaxRetryDelay string `json:"max_retry_delay"` // It is mentioned in the docs but there is no way to set it batch []*write.Point
} flushTimer *time.Timer
flushDelay time.Duration
type InfluxSink struct { lock sync.Mutex // Flush() runs in another goroutine, so this lock has to protect the buffer
sink
client influxdb2.Client
writeApi influxdb2Api.WriteAPIBlocking
config InfluxSinkConfig
influxRetryInterval uint
influxMaxRetryTime uint
//influxMaxRetryDelay uint
} }
// connect connects to the InfluxDB server
func (s *InfluxSink) connect() error { func (s *InfluxSink) connect() error {
var auth string
// URI options:
// * http://host:port
// * https://host:port
var uri string var uri string
if s.config.SSL { if s.config.SSL {
uri = fmt.Sprintf("https://%s:%s", s.config.Host, s.config.Port) uri = fmt.Sprintf("https://%s:%s", s.config.Host, s.config.Port)
} else { } else {
uri = fmt.Sprintf("http://%s:%s", s.config.Host, s.config.Port) uri = fmt.Sprintf("http://%s:%s", s.config.Host, s.config.Port)
} }
// Authentication options:
// * token
// * username:password
var auth string
if len(s.config.User) == 0 { if len(s.config.User) == 0 {
auth = s.config.Password auth = s.config.Password
} else { } else {
auth = fmt.Sprintf("%s:%s", s.config.User, s.config.Password) auth = fmt.Sprintf("%s:%s", s.config.User, s.config.Password)
} }
cclog.ComponentDebug(s.name, "Using URI", uri, "Org", s.config.Organization, "Bucket", s.config.Database) cclog.ComponentDebug(s.name, "Using URI", uri, "Org", s.config.Organization, "Bucket", s.config.Database)
// Set influxDB client options
clientOptions := influxdb2.DefaultOptions() clientOptions := influxdb2.DefaultOptions()
// Do not check InfluxDB certificate
clientOptions.SetTLSConfig( clientOptions.SetTLSConfig(
&tls.Config{ &tls.Config{
InsecureSkipVerify: true, InsecureSkipVerify: true,
}, },
) )
clientOptions.SetMaxRetryInterval(s.influxRetryInterval) clientOptions.SetPrecision(time.Second)
clientOptions.SetMaxRetryTime(s.influxMaxRetryTime)
clientOptions.SetExponentialBase(s.config.InfluxExponentialBase)
clientOptions.SetMaxRetries(s.config.InfluxMaxRetries)
// Create new writeAPI
s.client = influxdb2.NewClientWithOptions(uri, auth, clientOptions) s.client = influxdb2.NewClientWithOptions(uri, auth, clientOptions)
s.writeApi = s.client.WriteAPIBlocking(s.config.Organization, s.config.Database) s.writeApi = s.client.WriteAPIBlocking(s.config.Organization, s.config.Database)
// Check InfluxDB server accessibility
ok, err := s.client.Ping(context.Background()) ok, err := s.client.Ping(context.Background())
if err != nil { if err != nil {
return err return err
@@ -80,61 +92,126 @@ func (s *InfluxSink) connect() error {
} }
func (s *InfluxSink) Write(m lp.CCMetric) error { func (s *InfluxSink) Write(m lp.CCMetric) error {
err :=
s.writeApi.WritePoint( if len(s.batch) == 0 && s.flushDelay != 0 {
context.Background(), // This is the first write since the last flush, start the flushTimer!
m.ToPoint(s.meta_as_tags), if s.flushTimer != nil && s.flushTimer.Stop() {
) cclog.ComponentDebug(s.name, "unexpected: the flushTimer was already running?")
}
// Run a batched flush for all lines that have arrived in the last flush delay interval
s.flushTimer = time.AfterFunc(
s.flushDelay,
func() {
if err := s.Flush(); err != nil {
cclog.ComponentError(s.name, "flush failed:", err.Error())
}
})
}
// Append metric to batch slice
p := m.ToPoint(s.meta_as_tags)
s.lock.Lock()
s.batch = append(s.batch, p)
s.lock.Unlock()
// Flush synchronously if "flush_delay" is zero
if s.flushDelay == 0 {
return s.Flush()
}
// Flush if batch size is reached
if len(s.batch) == s.config.BatchSize {
return s.Flush()
}
return nil
}
// Flush sends all metrics buffered in batch slice to InfluxDB server
func (s *InfluxSink) Flush() error {
// Lock access to batch slice
s.lock.Lock()
defer s.lock.Unlock()
// Nothing to do, batch slice is empty
if len(s.batch) == 0 {
return nil
}
// Send metrics from batch slice
err := s.writeApi.WritePoint(context.Background(), s.batch...)
if err != nil {
cclog.ComponentError(s.name, "flush failed:", err.Error())
return err return err
} }
func (s *InfluxSink) Flush() error { // Clear batch slice
for i := range s.batch {
s.batch[i] = nil
}
s.batch = s.batch[:0]
return nil return nil
} }
func (s *InfluxSink) Close() { func (s *InfluxSink) Close() {
cclog.ComponentDebug(s.name, "Closing InfluxDB connection") cclog.ComponentDebug(s.name, "Closing InfluxDB connection")
s.flushTimer.Stop()
s.Flush()
s.client.Close() s.client.Close()
} }
// NewInfluxSink creates a new InfluxDB sink
func NewInfluxSink(name string, config json.RawMessage) (Sink, error) { func NewInfluxSink(name string, config json.RawMessage) (Sink, error) {
s := new(InfluxSink) s := new(InfluxSink)
s.name = fmt.Sprintf("InfluxSink(%s)", name) s.name = fmt.Sprintf("InfluxSink(%s)", name)
// Set config default values
s.config.BatchSize = 100
s.config.FlushInterval = "1s"
// Read config
if len(config) > 0 { if len(config) > 0 {
err := json.Unmarshal(config, &s.config) err := json.Unmarshal(config, &s.config)
if err != nil { if err != nil {
return nil, err return nil, err
} }
} }
s.influxRetryInterval = uint(time.Duration(1) * time.Second)
s.config.InfluxRetryInterval = "1s"
s.influxMaxRetryTime = uint(7 * time.Duration(24) * time.Hour)
s.config.InfluxMaxRetryTime = "168h"
s.config.InfluxMaxRetries = 20
s.config.InfluxExponentialBase = 2
if len(s.config.Host) == 0 || if len(s.config.Host) == 0 {
len(s.config.Port) == 0 || return nil, errors.New("Missing host configuration required by InfluxSink")
len(s.config.Database) == 0 ||
len(s.config.Organization) == 0 ||
len(s.config.Password) == 0 {
return nil, errors.New("not all configuration variables set required by InfluxSink")
} }
if len(s.config.Port) == 0 {
return nil, errors.New("Missing port configuration required by InfluxSink")
}
if len(s.config.Database) == 0 {
return nil, errors.New("Missing database configuration required by InfluxSink")
}
if len(s.config.Organization) == 0 {
return nil, errors.New("Missing organization configuration required by InfluxSink")
}
if len(s.config.Password) == 0 {
return nil, errors.New("Missing password configuration required by InfluxSink")
}
// Create lookup map to use meta infos as tags in the output metric // Create lookup map to use meta infos as tags in the output metric
s.meta_as_tags = make(map[string]bool) s.meta_as_tags = make(map[string]bool)
for _, k := range s.config.MetaAsTags { for _, k := range s.config.MetaAsTags {
s.meta_as_tags[k] = true s.meta_as_tags[k] = true
} }
toUint := func(duration string, def uint) uint { // Configure flush delay duration
t, err := time.ParseDuration(duration) if len(s.config.FlushInterval) > 0 {
t, err := time.ParseDuration(s.config.FlushInterval)
if err == nil { if err == nil {
return uint(t.Milliseconds()) s.flushDelay = t
} }
return def
} }
s.influxRetryInterval = toUint(s.config.InfluxRetryInterval, s.influxRetryInterval)
s.influxMaxRetryTime = toUint(s.config.InfluxMaxRetryTime, s.influxMaxRetryTime) // allocate batch slice
s.batch = make([]*write.Point, 0, s.config.BatchSize)
// Connect to InfluxDB server // Connect to InfluxDB server
if err := s.connect(); err != nil { if err := s.connect(); err != nil {

View File

@@ -17,10 +17,8 @@ The `influxdb` sink uses the official [InfluxDB golang client](https://pkg.go.de
"password" : "examplepw", "password" : "examplepw",
"organization": "myorg", "organization": "myorg",
"ssl": true, "ssl": true,
"retry_interval" : "1s", "flush_delay" : "1s",
"retry_exponential_base" : 2, "batch_size" : 100
"max_retries": 20,
"max_retry_time" : "168h"
} }
} }
``` ```
@@ -34,9 +32,6 @@ The `influxdb` sink uses the official [InfluxDB golang client](https://pkg.go.de
- `password`: Password for basic authentication - `password`: Password for basic authentication
- `organization`: Organization in the InfluxDB - `organization`: Organization in the InfluxDB
- `ssl`: Use SSL connection - `ssl`: Use SSL connection
- `retry_interval`: Base retry interval for failed write requests, default 1s - `flush_delay`: Group metrics arriving within this duration into a single batch, default 1s
- `retry_exponential_base`: The retry interval is exponentially increased with this base, default 2 - `batch_size`: Maximum number of metrics per batch, default 100
- `max_retries`: Maximal number of retry attempts
- `max_retry_time`: Maximal time to retry failed writes, default 168h (one week)
For information about the calculation of the retry interval settings, see [official influxdb-client-go documentation](https://github.com/influxdata/influxdb-client-go#handling-of-failed-async-writes)