From 7840de7b826e73ecc57bb2e1a72b5215e38d75d0 Mon Sep 17 00:00:00 2001 From: Thomas Gruber Date: Thu, 19 Dec 2024 23:00:14 +0100 Subject: [PATCH] Merge develop branch into main (#123) * Add cpu_used (all-cpu_idle) to CpustatCollector * Update cc-metric-collector.init * Allow selection of timestamp precision in HttpSink * Add comment about precision requirement for cc-metric-store * Fix for API changes in gofish@v0.15.0 * Update requirements to latest version * Read sensors through redfish * Update golang toolchain to 1.21 * Remove stray error check * Update main config in configuration.md * Update Release action to use golang 1.22 stable release, no golang RPMs anymore * Update runonce action to use golang 1.22 stable release, no golang RPMs anymore * Update README.md Use right JSON type in configuration * Update sink's README * Test whether ipmitool or ipmi-sensors can be executed without errors * Little fixes to the prometheus sink (#115) * Add uint64 to float64 cast option * Add prometheus sink to the list of available sinks * Add aggregated counters by gpu for nvlink errors --------- Co-authored-by: Michael Schwarz * Ccmessage migration (#119) * Add cpu_used (all-cpu_idle) to CpustatCollector * Update cc-metric-collector.init * Allow selection of timestamp precision in HttpSink * Add comment about precision requirement for cc-metric-store * Fix for API changes in gofish@v0.15.0 * Update requirements to latest version * Read sensors through redfish * Update golang toolchain to 1.21 * Remove stray error check * Update main config in configuration.md * Update Release action to use golang 1.22 stable release, no golang RPMs anymore * Update runonce action to use golang 1.22 stable release, no golang RPMs anymore * Switch to CCMessage for all files. 
--------- Co-authored-by: Holger Obermaier Co-authored-by: Holger Obermaier <40787752+ho-ob@users.noreply.github.com> * Switch to ccmessage also for latest additions in nvidiaMetric * New Message processor (#118) * Add cpu_used (all-cpu_idle) to CpustatCollector * Update cc-metric-collector.init * Allow selection of timestamp precision in HttpSink * Add comment about precision requirement for cc-metric-store * Fix for API changes in gofish@v0.15.0 * Update requirements to latest version * Read sensors through redfish * Update golang toolchain to 1.21 * Remove stray error check * Update main config in configuration.md * Update Release action to use golang 1.22 stable release, no golang RPMs anymore * Update runonce action to use golang 1.22 stable release, no golang RPMs anymore * New message processor to check whether a message should be dropped or manipulate it in flight * Create a copy of message before manipulation --------- Co-authored-by: Holger Obermaier Co-authored-by: Holger Obermaier <40787752+ho-ob@users.noreply.github.com> * Update collector's Makefile and go.mod/sum files * Use message processor in router, all sinks and all receivers * Add support for credential file (NKEY) to NATS sink and receiver * Fix JSON keys in message processor configuration * Update docs for message processor, router and the default router config file * Add link to expr syntax and fix regex matching docs * Update sample collectors * Minor style change in collector manager * Some helpers for ccTopology * LIKWID collector: write log owner change only once * Fix for metrics without units and reduce debugging messages for messageProcessor * Use shorted hostname for hostname added by router * Define default port for NATS * CPUstat collector: only add unit for applicable metrics * Add precision option to all sinks using Influx's encoder * Add message processor to all sink documentation * Add units to documentation of cpustat collector --------- Co-authored-by: Holger Obermaier 
Co-authored-by: Holger Obermaier <40787752+ho-ob@users.noreply.github.com> Co-authored-by: oscarminus Co-authored-by: Michael Schwarz --- .github/workflows/runonce.yml | 1 - cc-metric-collector.go | 10 +- collectors/Makefile | 4 +- collectors/beegfsmetaMetric.go | 6 +- collectors/beegfsstorageMetric.go | 6 +- collectors/collectorManager.go | 8 +- collectors/cpufreqCpuinfoMetric.go | 8 +- collectors/cpufreqMetric.go | 6 +- collectors/cpustatMetric.go | 16 +- collectors/cpustatMetric.md | 23 +- collectors/customCmdMetric.go | 4 +- collectors/diskstatMetric.go | 10 +- collectors/gpfsMetric.go | 30 +- collectors/infinibandMetric.go | 12 +- collectors/iostatMetric.go | 6 +- collectors/ipmiMetric.go | 12 +- collectors/likwidMetric.go | 55 ++-- collectors/loadavgMetric.go | 18 +- collectors/lustreMetric.go | 12 +- collectors/memstatMetric.go | 8 +- collectors/metricCollector.go | 4 +- collectors/netstatMetric.go | 8 +- collectors/nfsMetric.go | 6 +- collectors/nfsiostatMetric.go | 6 +- collectors/numastatsMetric.go | 6 +- collectors/nvidiaMetric.go | 197 +++++++----- collectors/raplMetric.go | 6 +- collectors/rocmsmiMetric.go | 40 +-- collectors/sampleMetric.go | 8 +- collectors/sampleTimerMetric.go | 8 +- collectors/schedstatMetric.go | 8 +- collectors/selfMetric.go | 36 +-- collectors/tempMetric.go | 10 +- collectors/topprocsMetric.go | 6 +- go.mod | 10 +- go.sum | 26 +- internal/metricAggregator/metricAggregator.go | 32 +- internal/metricRouter/README.md | 31 +- internal/metricRouter/metricCache.go | 28 +- internal/metricRouter/metricRouter.go | 289 +++++++++++------- pkg/ccTopology/ccTopology.go | 38 +++ pkg/messageProcessor/README.md | 228 ++++++++++++-- pkg/messageProcessor/messageProcessor.go | 83 ++--- pkg/messageProcessor/messageProcessorFuncs.go | 55 ++-- receivers/httpReceiver.go | 135 ++++---- receivers/ipmiReceiver.go | 55 +++- receivers/metricReceiver.go | 19 +- receivers/natsReceiver.go | 143 +++++---- receivers/natsReceiver.md | 8 +- 
receivers/prometheusReceiver.go | 4 +- receivers/receiveManager.go | 8 +- receivers/redfishReceiver.go | 73 +++-- receivers/sampleReceiver.go | 27 +- router.json | 39 +-- sinks/gangliaCommon.go | 10 +- sinks/gangliaSink.go | 97 +++--- sinks/gangliaSink.md | 12 +- sinks/httpSink.go | 128 ++++---- sinks/httpSink.md | 18 +- sinks/influxAsyncSink.go | 50 ++- sinks/influxAsyncSink.md | 14 +- sinks/influxSink.go | 147 +++++---- sinks/influxSink.md | 16 +- sinks/libgangliaSink.go | 176 ++++++----- sinks/libgangliaSink.md | 9 +- sinks/metricSink.go | 23 +- sinks/natsSink.go | 134 ++++++-- sinks/natsSink.md | 18 +- sinks/prometheusSink.go | 35 ++- sinks/prometheusSink.md | 8 +- sinks/sampleSink.go | 32 +- sinks/sinkManager.go | 11 +- sinks/stdoutSink.go | 37 ++- sinks/stdoutSink.md | 9 +- 74 files changed, 1902 insertions(+), 1017 deletions(-) diff --git a/.github/workflows/runonce.yml b/.github/workflows/runonce.yml index ddc41b3..182e043 100644 --- a/.github/workflows/runonce.yml +++ b/.github/workflows/runonce.yml @@ -159,7 +159,6 @@ jobs: submodules: recursive fetch-depth: 0 # Use official golang package - # See: https://github.com/marketplace/actions/setup-go-environment - name: Setup Golang uses: actions/setup-go@v4 diff --git a/cc-metric-collector.go b/cc-metric-collector.go index 5544af8..823cb5e 100644 --- a/cc-metric-collector.go +++ b/cc-metric-collector.go @@ -17,7 +17,7 @@ import ( mr "github.com/ClusterCockpit/cc-metric-collector/internal/metricRouter" cclog "github.com/ClusterCockpit/cc-metric-collector/pkg/ccLogger" - lp "github.com/ClusterCockpit/cc-metric-collector/pkg/ccMetric" + lp "github.com/ClusterCockpit/cc-energy-manager/pkg/cc-message" mct "github.com/ClusterCockpit/cc-metric-collector/pkg/multiChanTicker" ) @@ -54,7 +54,7 @@ type RuntimeConfig struct { ReceiveManager receivers.ReceiveManager MultiChanTicker mct.MultiChanTicker - Channels []chan lp.CCMetric + Channels []chan lp.CCMessage Sync sync.WaitGroup } @@ -242,7 +242,7 @@ func mainFunc() int 
{ } // Connect metric router to sink manager - RouterToSinksChannel := make(chan lp.CCMetric, 200) + RouterToSinksChannel := make(chan lp.CCMessage, 200) rcfg.SinkManager.AddInput(RouterToSinksChannel) rcfg.MetricRouter.AddOutput(RouterToSinksChannel) @@ -254,7 +254,7 @@ func mainFunc() int { } // Connect collector manager to metric router - CollectToRouterChannel := make(chan lp.CCMetric, 200) + CollectToRouterChannel := make(chan lp.CCMessage, 200) rcfg.CollectManager.AddOutput(CollectToRouterChannel) rcfg.MetricRouter.AddCollectorInput(CollectToRouterChannel) @@ -267,7 +267,7 @@ func mainFunc() int { } // Connect receive manager to metric router - ReceiveToRouterChannel := make(chan lp.CCMetric, 200) + ReceiveToRouterChannel := make(chan lp.CCMessage, 200) rcfg.ReceiveManager.AddOutput(ReceiveToRouterChannel) rcfg.MetricRouter.AddReceiverInput(ReceiveToRouterChannel) use_recv = true diff --git a/collectors/Makefile b/collectors/Makefile index b73d917..53d6bb9 100644 --- a/collectors/Makefile +++ b/collectors/Makefile @@ -1,5 +1,5 @@ # LIKWID version -LIKWID_VERSION := 5.2.2 +LIKWID_VERSION := 5.4.1 LIKWID_INSTALLED_FOLDER := $(shell dirname $$(which likwid-topology 2>/dev/null) 2>/dev/null) LIKWID_FOLDER := $(CURDIR)/likwid @@ -23,7 +23,7 @@ likwid: mkdir --parents --verbose "$${BUILD_FOLDER}" wget --output-document=- http://ftp.rrze.uni-erlangen.de/mirrors/likwid/likwid-$(LIKWID_VERSION).tar.gz | tar --directory="$${BUILD_FOLDER}" --extract --gz - install -D --verbose --preserve-timestamps --mode=0644 --target-directory="$(LIKWID_FOLDER)" "$${BUILD_FOLDER}/likwid-$(LIKWID_VERSION)/src/includes"/likwid*.h "$${BUILD_FOLDER}/likwid-$(LIKWID_VERSION)/src/includes"/bstrlib.h + install -D --verbose --preserve-timestamps --mode=0644 --target-directory="$(LIKWID_FOLDER)" "$${BUILD_FOLDER}/likwid-$(LIKWID_VERSION)/src/includes"/likwid*.h rm --recursive "$${BUILD_FOLDER}" fi diff --git a/collectors/beegfsmetaMetric.go b/collectors/beegfsmetaMetric.go index 
d202773..553c158 100644 --- a/collectors/beegfsmetaMetric.go +++ b/collectors/beegfsmetaMetric.go @@ -15,7 +15,7 @@ import ( "time" cclog "github.com/ClusterCockpit/cc-metric-collector/pkg/ccLogger" - lp "github.com/ClusterCockpit/cc-metric-collector/pkg/ccMetric" + lp "github.com/ClusterCockpit/cc-energy-manager/pkg/cc-message" ) const DEFAULT_BEEGFS_CMD = "beegfs-ctl" @@ -110,7 +110,7 @@ func (m *BeegfsMetaCollector) Init(config json.RawMessage) error { return nil } -func (m *BeegfsMetaCollector) Read(interval time.Duration, output chan lp.CCMetric) { +func (m *BeegfsMetaCollector) Read(interval time.Duration, output chan lp.CCMessage) { if !m.init { return } @@ -216,7 +216,7 @@ func (m *BeegfsMetaCollector) Read(interval time.Duration, output chan lp.CCMetr for key, data := range m.matches { value, _ := strconv.ParseFloat(data, 32) - y, err := lp.New(key, m.tags, m.meta, map[string]interface{}{"value": value}, time.Now()) + y, err := lp.NewMessage(key, m.tags, m.meta, map[string]interface{}{"value": value}, time.Now()) if err == nil { output <- y } diff --git a/collectors/beegfsstorageMetric.go b/collectors/beegfsstorageMetric.go index be57e0f..2ca5dc9 100644 --- a/collectors/beegfsstorageMetric.go +++ b/collectors/beegfsstorageMetric.go @@ -15,7 +15,7 @@ import ( "time" cclog "github.com/ClusterCockpit/cc-metric-collector/pkg/ccLogger" - lp "github.com/ClusterCockpit/cc-metric-collector/pkg/ccMetric" + lp "github.com/ClusterCockpit/cc-energy-manager/pkg/cc-message" ) // Struct for the collector-specific JSON config @@ -103,7 +103,7 @@ func (m *BeegfsStorageCollector) Init(config json.RawMessage) error { return nil } -func (m *BeegfsStorageCollector) Read(interval time.Duration, output chan lp.CCMetric) { +func (m *BeegfsStorageCollector) Read(interval time.Duration, output chan lp.CCMessage) { if !m.init { return } @@ -208,7 +208,7 @@ func (m *BeegfsStorageCollector) Read(interval time.Duration, output chan lp.CCM for key, data := range m.matches { value, _ := 
strconv.ParseFloat(data, 32) - y, err := lp.New(key, m.tags, m.meta, map[string]interface{}{"value": value}, time.Now()) + y, err := lp.NewMessage(key, m.tags, m.meta, map[string]interface{}{"value": value}, time.Now()) if err == nil { output <- y } diff --git a/collectors/collectorManager.go b/collectors/collectorManager.go index 4cf3435..5d4f1b5 100644 --- a/collectors/collectorManager.go +++ b/collectors/collectorManager.go @@ -6,8 +6,8 @@ import ( "sync" "time" + lp "github.com/ClusterCockpit/cc-energy-manager/pkg/cc-message" cclog "github.com/ClusterCockpit/cc-metric-collector/pkg/ccLogger" - lp "github.com/ClusterCockpit/cc-metric-collector/pkg/ccMetric" mct "github.com/ClusterCockpit/cc-metric-collector/pkg/multiChanTicker" ) @@ -47,7 +47,7 @@ var AvailableCollectors = map[string]MetricCollector{ type collectorManager struct { collectors []MetricCollector // List of metric collectors to read in parallel serial []MetricCollector // List of metric collectors to read serially - output chan lp.CCMetric // Output channels + output chan lp.CCMessage // Output channels done chan bool // channel to finish / stop metric collector manager ticker mct.MultiChanTicker // periodically ticking once each interval duration time.Duration // duration (for metrics that measure over a given duration) @@ -60,7 +60,7 @@ type collectorManager struct { // Metric collector manager access functions type CollectorManager interface { Init(ticker mct.MultiChanTicker, duration time.Duration, wg *sync.WaitGroup, collectConfigFile string) error - AddOutput(output chan lp.CCMetric) + AddOutput(output chan lp.CCMessage) Start() Close() } @@ -187,7 +187,7 @@ func (cm *collectorManager) Start() { } // AddOutput adds the output channel to the metric collector manager -func (cm *collectorManager) AddOutput(output chan lp.CCMetric) { +func (cm *collectorManager) AddOutput(output chan lp.CCMessage) { cm.output = output } diff --git a/collectors/cpufreqCpuinfoMetric.go 
b/collectors/cpufreqCpuinfoMetric.go index ce3e1dd..415e018 100644 --- a/collectors/cpufreqCpuinfoMetric.go +++ b/collectors/cpufreqCpuinfoMetric.go @@ -10,8 +10,8 @@ import ( "strings" "time" + lp "github.com/ClusterCockpit/cc-energy-manager/pkg/cc-message" cclog "github.com/ClusterCockpit/cc-metric-collector/pkg/ccLogger" - lp "github.com/ClusterCockpit/cc-metric-collector/pkg/ccMetric" ) // CPUFreqCollector @@ -112,14 +112,14 @@ func (m *CPUFreqCpuInfoCollector) Init(config json.RawMessage) error { // Check if at least one CPU with frequency information was detected if len(m.topology) == 0 { - return fmt.Errorf("No CPU frequency info found in %s", cpuInfoFile) + return fmt.Errorf("no CPU frequency info found in %s", cpuInfoFile) } m.init = true return nil } -func (m *CPUFreqCpuInfoCollector) Read(interval time.Duration, output chan lp.CCMetric) { +func (m *CPUFreqCpuInfoCollector) Read(interval time.Duration, output chan lp.CCMessage) { // Check if already initialized if !m.init { return @@ -154,7 +154,7 @@ func (m *CPUFreqCpuInfoCollector) Read(interval time.Duration, output chan lp.CC fmt.Sprintf("Read(): Failed to convert cpu MHz '%s' to float64: %v", lineSplit[1], err)) return } - if y, err := lp.New("cpufreq", t.tagSet, m.meta, map[string]interface{}{"value": value}, now); err == nil { + if y, err := lp.NewMessage("cpufreq", t.tagSet, m.meta, map[string]interface{}{"value": value}, now); err == nil { output <- y } } diff --git a/collectors/cpufreqMetric.go b/collectors/cpufreqMetric.go index 61caf10..033445c 100644 --- a/collectors/cpufreqMetric.go +++ b/collectors/cpufreqMetric.go @@ -10,7 +10,7 @@ import ( "time" cclog "github.com/ClusterCockpit/cc-metric-collector/pkg/ccLogger" - lp "github.com/ClusterCockpit/cc-metric-collector/pkg/ccMetric" + lp "github.com/ClusterCockpit/cc-energy-manager/pkg/cc-message" "github.com/ClusterCockpit/cc-metric-collector/pkg/ccTopology" "golang.org/x/sys/unix" ) @@ -91,7 +91,7 @@ func (m *CPUFreqCollector) Init(config 
json.RawMessage) error { return nil } -func (m *CPUFreqCollector) Read(interval time.Duration, output chan lp.CCMetric) { +func (m *CPUFreqCollector) Read(interval time.Duration, output chan lp.CCMessage) { // Check if already initialized if !m.init { return @@ -117,7 +117,7 @@ func (m *CPUFreqCollector) Read(interval time.Duration, output chan lp.CCMetric) continue } - if y, err := lp.New("cpufreq", t.tagSet, m.meta, map[string]interface{}{"value": cpuFreq}, now); err == nil { + if y, err := lp.NewMessage("cpufreq", t.tagSet, m.meta, map[string]interface{}{"value": cpuFreq}, now); err == nil { output <- y } } diff --git a/collectors/cpustatMetric.go b/collectors/cpustatMetric.go index 2b1756f..8126201 100644 --- a/collectors/cpustatMetric.go +++ b/collectors/cpustatMetric.go @@ -9,8 +9,8 @@ import ( "strings" "time" + lp "github.com/ClusterCockpit/cc-energy-manager/pkg/cc-message" cclog "github.com/ClusterCockpit/cc-metric-collector/pkg/ccLogger" - lp "github.com/ClusterCockpit/cc-metric-collector/pkg/ccMetric" sysconf "github.com/tklauser/go-sysconf" ) @@ -34,7 +34,7 @@ func (m *CpustatCollector) Init(config json.RawMessage) error { m.name = "CpustatCollector" m.setup() m.parallel = true - m.meta = map[string]string{"source": m.name, "group": "CPU", "unit": "Percent"} + m.meta = map[string]string{"source": m.name, "group": "CPU"} m.nodetags = map[string]string{"type": "node"} if len(config) > 0 { err := json.Unmarshal(config, &m.config) @@ -105,7 +105,7 @@ func (m *CpustatCollector) Init(config json.RawMessage) error { return nil } -func (m *CpustatCollector) parseStatLine(linefields []string, tags map[string]string, output chan lp.CCMetric, now time.Time, tsdelta time.Duration) { +func (m *CpustatCollector) parseStatLine(linefields []string, tags map[string]string, output chan lp.CCMessage, now time.Time, tsdelta time.Duration) { values := make(map[string]float64) clktck, _ := sysconf.Sysconf(sysconf.SC_CLK_TCK) for match, index := range m.matches { @@ -122,21 
+122,23 @@ func (m *CpustatCollector) parseStatLine(linefields []string, tags map[string]st sum := float64(0) for name, value := range values { sum += value - y, err := lp.New(name, tags, m.meta, map[string]interface{}{"value": value * 100}, now) + y, err := lp.NewMessage(name, tags, m.meta, map[string]interface{}{"value": value * 100}, now) if err == nil { + y.AddTag("unit", "Percent") output <- y } } if v, ok := values["cpu_idle"]; ok { sum -= v - y, err := lp.New("cpu_used", tags, m.meta, map[string]interface{}{"value": sum * 100}, now) + y, err := lp.NewMessage("cpu_used", tags, m.meta, map[string]interface{}{"value": sum * 100}, now) if err == nil { + y.AddTag("unit", "Percent") output <- y } } } -func (m *CpustatCollector) Read(interval time.Duration, output chan lp.CCMetric) { +func (m *CpustatCollector) Read(interval time.Duration, output chan lp.CCMessage) { if !m.init { return } @@ -162,7 +164,7 @@ func (m *CpustatCollector) Read(interval time.Duration, output chan lp.CCMetric) } } - num_cpus_metric, err := lp.New("num_cpus", + num_cpus_metric, err := lp.NewMessage("num_cpus", m.nodetags, m.meta, map[string]interface{}{"value": int(num_cpus)}, diff --git a/collectors/cpustatMetric.md b/collectors/cpustatMetric.md index d160410..f4e0616 100644 --- a/collectors/cpustatMetric.md +++ b/collectors/cpustatMetric.md @@ -13,14 +13,15 @@ The `cpustat` collector reads data from `/proc/stat` and outputs a handful **nod Metrics: -* `cpu_user` -* `cpu_nice` -* `cpu_system` -* `cpu_idle` -* `cpu_iowait` -* `cpu_irq` -* `cpu_softirq` -* `cpu_steal` -* `cpu_guest` -* `cpu_guest_nice` -* `cpu_used` = `cpu_* - cpu_idle` \ No newline at end of file +* `cpu_user` with `unit=Percent` +* `cpu_nice` with `unit=Percent` +* `cpu_system` with `unit=Percent` +* `cpu_idle` with `unit=Percent` +* `cpu_iowait` with `unit=Percent` +* `cpu_irq` with `unit=Percent` +* `cpu_softirq` with `unit=Percent` +* `cpu_steal` with `unit=Percent` +* `cpu_guest` with `unit=Percent` +* 
`cpu_guest_nice` with `unit=Percent` +* `cpu_used` = `cpu_* - cpu_idle` with `unit=Percent` +* `num_cpus` \ No newline at end of file diff --git a/collectors/customCmdMetric.go b/collectors/customCmdMetric.go index ae205f1..c9a50cf 100644 --- a/collectors/customCmdMetric.go +++ b/collectors/customCmdMetric.go @@ -9,7 +9,7 @@ import ( "strings" "time" - lp "github.com/ClusterCockpit/cc-metric-collector/pkg/ccMetric" + lp "github.com/ClusterCockpit/cc-energy-manager/pkg/cc-message" influx "github.com/influxdata/line-protocol" ) @@ -75,7 +75,7 @@ var DefaultTime = func() time.Time { return time.Unix(42, 0) } -func (m *CustomCmdCollector) Read(interval time.Duration, output chan lp.CCMetric) { +func (m *CustomCmdCollector) Read(interval time.Duration, output chan lp.CCMessage) { if !m.init { return } diff --git a/collectors/diskstatMetric.go b/collectors/diskstatMetric.go index d1ec4fc..0298362 100644 --- a/collectors/diskstatMetric.go +++ b/collectors/diskstatMetric.go @@ -9,7 +9,7 @@ import ( "time" cclog "github.com/ClusterCockpit/cc-metric-collector/pkg/ccLogger" - lp "github.com/ClusterCockpit/cc-metric-collector/pkg/ccMetric" + lp "github.com/ClusterCockpit/cc-energy-manager/pkg/cc-message" ) // "log" @@ -48,7 +48,7 @@ func (m *DiskstatCollector) Init(config json.RawMessage) error { return nil } -func (m *DiskstatCollector) Read(interval time.Duration, output chan lp.CCMetric) { +func (m *DiskstatCollector) Read(interval time.Duration, output chan lp.CCMessage) { if !m.init { return } @@ -92,13 +92,13 @@ func (m *DiskstatCollector) Read(interval time.Duration, output chan lp.CCMetric } tags := map[string]string{"type": "node", "device": linefields[0]} total := (stat.Blocks * uint64(stat.Bsize)) / uint64(1000000000) - y, err := lp.New("disk_total", tags, m.meta, map[string]interface{}{"value": total}, time.Now()) + y, err := lp.NewMessage("disk_total", tags, m.meta, map[string]interface{}{"value": total}, time.Now()) if err == nil { y.AddMeta("unit", "GBytes") 
output <- y } free := (stat.Bfree * uint64(stat.Bsize)) / uint64(1000000000) - y, err = lp.New("disk_free", tags, m.meta, map[string]interface{}{"value": free}, time.Now()) + y, err = lp.NewMessage("disk_free", tags, m.meta, map[string]interface{}{"value": free}, time.Now()) if err == nil { y.AddMeta("unit", "GBytes") output <- y @@ -110,7 +110,7 @@ func (m *DiskstatCollector) Read(interval time.Duration, output chan lp.CCMetric } } } - y, err := lp.New("part_max_used", map[string]string{"type": "node"}, m.meta, map[string]interface{}{"value": int(part_max_used)}, time.Now()) + y, err := lp.NewMessage("part_max_used", map[string]string{"type": "node"}, m.meta, map[string]interface{}{"value": int(part_max_used)}, time.Now()) if err == nil { y.AddMeta("unit", "percent") output <- y diff --git a/collectors/gpfsMetric.go b/collectors/gpfsMetric.go index 98720b8..31856cb 100644 --- a/collectors/gpfsMetric.go +++ b/collectors/gpfsMetric.go @@ -14,7 +14,7 @@ import ( "time" cclog "github.com/ClusterCockpit/cc-metric-collector/pkg/ccLogger" - lp "github.com/ClusterCockpit/cc-metric-collector/pkg/ccMetric" + lp "github.com/ClusterCockpit/cc-energy-manager/pkg/cc-message" ) const DEFAULT_GPFS_CMD = "mmpmon" @@ -94,7 +94,7 @@ func (m *GpfsCollector) Init(config json.RawMessage) error { return nil } -func (m *GpfsCollector) Read(interval time.Duration, output chan lp.CCMetric) { +func (m *GpfsCollector) Read(interval time.Duration, output chan lp.CCMessage) { // Check if already initialized if !m.init { return @@ -218,7 +218,7 @@ func (m *GpfsCollector) Read(interval time.Duration, output chan lp.CCMetric) { continue } if y, err := - lp.New( + lp.NewMessage( "gpfs_bytes_read", m.tags, m.meta, @@ -234,7 +234,7 @@ func (m *GpfsCollector) Read(interval time.Duration, output chan lp.CCMetric) { if lastBytesRead := m.lastState[filesystem].bytesRead; lastBytesRead >= 0 { bwRead := float64(bytesRead-lastBytesRead) / timeDiff if y, err := - lp.New( + lp.NewMessage( "gpfs_bw_read", 
m.tags, m.meta, @@ -258,7 +258,7 @@ func (m *GpfsCollector) Read(interval time.Duration, output chan lp.CCMetric) { continue } if y, err := - lp.New( + lp.NewMessage( "gpfs_bytes_written", m.tags, m.meta, @@ -274,7 +274,7 @@ func (m *GpfsCollector) Read(interval time.Duration, output chan lp.CCMetric) { if lastBytesWritten := m.lastState[filesystem].bytesRead; lastBytesWritten >= 0 { bwWrite := float64(bytesWritten-lastBytesWritten) / timeDiff if y, err := - lp.New( + lp.NewMessage( "gpfs_bw_write", m.tags, m.meta, @@ -304,7 +304,7 @@ func (m *GpfsCollector) Read(interval time.Duration, output chan lp.CCMetric) { fmt.Sprintf("Read(): Failed to convert number of opens '%s' to int64: %v", key_value["_oc_"], err)) continue } - if y, err := lp.New("gpfs_num_opens", m.tags, m.meta, map[string]interface{}{"value": numOpens}, timestamp); err == nil { + if y, err := lp.NewMessage("gpfs_num_opens", m.tags, m.meta, map[string]interface{}{"value": numOpens}, timestamp); err == nil { output <- y } @@ -316,7 +316,7 @@ func (m *GpfsCollector) Read(interval time.Duration, output chan lp.CCMetric) { fmt.Sprintf("Read(): Failed to convert number of closes: '%s' to int64: %v", key_value["_cc_"], err)) continue } - if y, err := lp.New("gpfs_num_closes", m.tags, m.meta, map[string]interface{}{"value": numCloses}, timestamp); err == nil { + if y, err := lp.NewMessage("gpfs_num_closes", m.tags, m.meta, map[string]interface{}{"value": numCloses}, timestamp); err == nil { output <- y } @@ -328,7 +328,7 @@ func (m *GpfsCollector) Read(interval time.Duration, output chan lp.CCMetric) { fmt.Sprintf("Read(): Failed to convert number of reads: '%s' to int64: %v", key_value["_rdc_"], err)) continue } - if y, err := lp.New("gpfs_num_reads", m.tags, m.meta, map[string]interface{}{"value": numReads}, timestamp); err == nil { + if y, err := lp.NewMessage("gpfs_num_reads", m.tags, m.meta, map[string]interface{}{"value": numReads}, timestamp); err == nil { output <- y } @@ -340,7 +340,7 @@ func (m 
*GpfsCollector) Read(interval time.Duration, output chan lp.CCMetric) { fmt.Sprintf("Read(): Failed to convert number of writes: '%s' to int64: %v", key_value["_wc_"], err)) continue } - if y, err := lp.New("gpfs_num_writes", m.tags, m.meta, map[string]interface{}{"value": numWrites}, timestamp); err == nil { + if y, err := lp.NewMessage("gpfs_num_writes", m.tags, m.meta, map[string]interface{}{"value": numWrites}, timestamp); err == nil { output <- y } @@ -352,7 +352,7 @@ func (m *GpfsCollector) Read(interval time.Duration, output chan lp.CCMetric) { fmt.Sprintf("Read(): Failed to convert number of read directories: '%s' to int64: %v", key_value["_dir_"], err)) continue } - if y, err := lp.New("gpfs_num_readdirs", m.tags, m.meta, map[string]interface{}{"value": numReaddirs}, timestamp); err == nil { + if y, err := lp.NewMessage("gpfs_num_readdirs", m.tags, m.meta, map[string]interface{}{"value": numReaddirs}, timestamp); err == nil { output <- y } @@ -364,7 +364,7 @@ func (m *GpfsCollector) Read(interval time.Duration, output chan lp.CCMetric) { fmt.Sprintf("Read(): Failed to convert number of inode updates: '%s' to int: %v", key_value["_iu_"], err)) continue } - if y, err := lp.New("gpfs_num_inode_updates", m.tags, m.meta, map[string]interface{}{"value": numInodeUpdates}, timestamp); err == nil { + if y, err := lp.NewMessage("gpfs_num_inode_updates", m.tags, m.meta, map[string]interface{}{"value": numInodeUpdates}, timestamp); err == nil { output <- y } @@ -372,7 +372,7 @@ func (m *GpfsCollector) Read(interval time.Duration, output chan lp.CCMetric) { if m.config.SendTotalValues { bytesTotal := bytesRead + bytesWritten if y, err := - lp.New("gpfs_bytes_total", + lp.NewMessage("gpfs_bytes_total", m.tags, m.meta, map[string]interface{}{ @@ -385,7 +385,7 @@ func (m *GpfsCollector) Read(interval time.Duration, output chan lp.CCMetric) { } iops := numReads + numWrites if y, err := - lp.New("gpfs_iops", + lp.NewMessage("gpfs_iops", m.tags, m.meta, 
map[string]interface{}{ @@ -397,7 +397,7 @@ func (m *GpfsCollector) Read(interval time.Duration, output chan lp.CCMetric) { } metaops := numInodeUpdates + numCloses + numOpens + numReaddirs if y, err := - lp.New("gpfs_metaops", + lp.NewMessage("gpfs_metaops", m.tags, m.meta, map[string]interface{}{ diff --git a/collectors/infinibandMetric.go b/collectors/infinibandMetric.go index 13b76a0..f0d5e49 100644 --- a/collectors/infinibandMetric.go +++ b/collectors/infinibandMetric.go @@ -5,7 +5,7 @@ import ( "os" cclog "github.com/ClusterCockpit/cc-metric-collector/pkg/ccLogger" - lp "github.com/ClusterCockpit/cc-metric-collector/pkg/ccMetric" + lp "github.com/ClusterCockpit/cc-energy-manager/pkg/cc-message" "golang.org/x/sys/unix" "encoding/json" @@ -182,7 +182,7 @@ func (m *InfinibandCollector) Init(config json.RawMessage) error { } // Read reads Infiniband counter files below IB_BASEPATH -func (m *InfinibandCollector) Read(interval time.Duration, output chan lp.CCMetric) { +func (m *InfinibandCollector) Read(interval time.Duration, output chan lp.CCMessage) { // Check if already initialized if !m.init { @@ -230,7 +230,7 @@ func (m *InfinibandCollector) Read(interval time.Duration, output chan lp.CCMetr // Send absolut values if m.config.SendAbsoluteValues { if y, err := - lp.New( + lp.NewMessage( counterDef.name, info.tagSet, m.meta, @@ -248,7 +248,7 @@ func (m *InfinibandCollector) Read(interval time.Duration, output chan lp.CCMetr if counterDef.lastState >= 0 { rate := float64((counterDef.currentState - counterDef.lastState)) / timeDiff if y, err := - lp.New( + lp.NewMessage( counterDef.name+"_bw", info.tagSet, m.meta, @@ -278,7 +278,7 @@ func (m *InfinibandCollector) Read(interval time.Duration, output chan lp.CCMetr // Send total values if m.config.SendTotalValues { if y, err := - lp.New( + lp.NewMessage( "ib_total", info.tagSet, m.meta, @@ -291,7 +291,7 @@ func (m *InfinibandCollector) Read(interval time.Duration, output chan lp.CCMetr } if y, err := - lp.New( + 
lp.NewMessage( "ib_total_pkts", info.tagSet, m.meta, diff --git a/collectors/iostatMetric.go b/collectors/iostatMetric.go index 4d1dbd1..8715d7e 100644 --- a/collectors/iostatMetric.go +++ b/collectors/iostatMetric.go @@ -5,7 +5,7 @@ import ( "os" cclog "github.com/ClusterCockpit/cc-metric-collector/pkg/ccLogger" - lp "github.com/ClusterCockpit/cc-metric-collector/pkg/ccMetric" + lp "github.com/ClusterCockpit/cc-energy-manager/pkg/cc-message" // "log" "encoding/json" @@ -107,7 +107,7 @@ func (m *IOstatCollector) Init(config json.RawMessage) error { return err } -func (m *IOstatCollector) Read(interval time.Duration, output chan lp.CCMetric) { +func (m *IOstatCollector) Read(interval time.Duration, output chan lp.CCMessage) { if !m.init { return } @@ -139,7 +139,7 @@ func (m *IOstatCollector) Read(interval time.Duration, output chan lp.CCMetric) x, err := strconv.ParseInt(linefields[idx], 0, 64) if err == nil { diff := x - entry.lastValues[name] - y, err := lp.New(name, entry.tags, m.meta, map[string]interface{}{"value": int(diff)}, time.Now()) + y, err := lp.NewMessage(name, entry.tags, m.meta, map[string]interface{}{"value": int(diff)}, time.Now()) if err == nil { output <- y } diff --git a/collectors/ipmiMetric.go b/collectors/ipmiMetric.go index a4b4b88..e8aafae 100644 --- a/collectors/ipmiMetric.go +++ b/collectors/ipmiMetric.go @@ -14,7 +14,7 @@ import ( "time" cclog "github.com/ClusterCockpit/cc-metric-collector/pkg/ccLogger" - lp "github.com/ClusterCockpit/cc-metric-collector/pkg/ccMetric" + lp "github.com/ClusterCockpit/cc-energy-manager/pkg/cc-message" ) const IPMISENSORS_PATH = `ipmi-sensors` @@ -83,7 +83,7 @@ func (m *IpmiCollector) Init(config json.RawMessage) error { return nil } -func (m *IpmiCollector) readIpmiTool(cmd string, output chan lp.CCMetric) { +func (m *IpmiCollector) readIpmiTool(cmd string, output chan lp.CCMessage) { // Setup ipmitool command command := exec.Command(cmd, "sensor") @@ -121,7 +121,7 @@ func (m *IpmiCollector) 
readIpmiTool(cmd string, output chan lp.CCMetric) { unit = "Watts" } - y, err := lp.New(name, map[string]string{"type": "node"}, m.meta, map[string]interface{}{"value": v}, time.Now()) + y, err := lp.NewMessage(name, map[string]string{"type": "node"}, m.meta, map[string]interface{}{"value": v}, time.Now()) if err == nil { y.AddMeta("unit", unit) output <- y @@ -141,7 +141,7 @@ func (m *IpmiCollector) readIpmiTool(cmd string, output chan lp.CCMetric) { } } -func (m *IpmiCollector) readIpmiSensors(cmd string, output chan lp.CCMetric) { +func (m *IpmiCollector) readIpmiSensors(cmd string, output chan lp.CCMessage) { command := exec.Command(cmd, "--comma-separated-output", "--sdr-cache-recreate") command.Wait() @@ -159,7 +159,7 @@ func (m *IpmiCollector) readIpmiSensors(cmd string, output chan lp.CCMetric) { v, err := strconv.ParseFloat(lv[3], 64) if err == nil { name := strings.ToLower(strings.Replace(lv[1], " ", "_", -1)) - y, err := lp.New(name, map[string]string{"type": "node"}, m.meta, map[string]interface{}{"value": v}, time.Now()) + y, err := lp.NewMessage(name, map[string]string{"type": "node"}, m.meta, map[string]interface{}{"value": v}, time.Now()) if err == nil { if len(lv) > 4 { y.AddMeta("unit", lv[4]) @@ -171,7 +171,7 @@ func (m *IpmiCollector) readIpmiSensors(cmd string, output chan lp.CCMetric) { } } -func (m *IpmiCollector) Read(interval time.Duration, output chan lp.CCMetric) { +func (m *IpmiCollector) Read(interval time.Duration, output chan lp.CCMessage) { // Check if already initialized if !m.init { diff --git a/collectors/likwidMetric.go b/collectors/likwidMetric.go index 12757c3..797b324 100644 --- a/collectors/likwidMetric.go +++ b/collectors/likwidMetric.go @@ -24,9 +24,9 @@ import ( "time" "unsafe" + lp "github.com/ClusterCockpit/cc-energy-manager/pkg/cc-message" agg "github.com/ClusterCockpit/cc-metric-collector/internal/metricAggregator" cclog "github.com/ClusterCockpit/cc-metric-collector/pkg/ccLogger" - lp 
"github.com/ClusterCockpit/cc-metric-collector/pkg/ccMetric" topo "github.com/ClusterCockpit/cc-metric-collector/pkg/ccTopology" "github.com/NVIDIA/go-nvml/pkg/dl" "github.com/fsnotify/fsnotify" @@ -43,7 +43,7 @@ const ( type LikwidCollectorMetricConfig struct { Name string `json:"name"` // Name of the metric Calc string `json:"calc"` // Calculation for the metric using - Type string `json:"type"` // Metric type (aka node, socket, cpu, ...) + Type string `json:"type"` // Metric type (aka node, socket, hwthread, ...) Publish bool `json:"publish"` SendCoreTotalVal bool `json:"send_core_total_values,omitempty"` SendSocketTotalVal bool `json:"send_socket_total_values,omitempty"` @@ -91,6 +91,8 @@ type LikwidCollector struct { running bool initialized bool needs_reinit bool + myuid int + lock_err_once bool likwidGroups map[C.int]LikwidEventsetConfig lock sync.Mutex measureThread thread.Thread @@ -204,6 +206,7 @@ func (m *LikwidCollector) Init(config json.RawMessage) error { m.initialized = false m.needs_reinit = true m.running = false + m.myuid = os.Getuid() m.config.AccessMode = LIKWID_DEF_ACCESSMODE m.config.LibraryPath = LIKWID_LIB_NAME m.config.LockfilePath = LIKWID_DEF_LOCKFILE @@ -390,14 +393,24 @@ func (m *LikwidCollector) takeMeasurement(evidx int, evset LikwidEventsetConfig, } // Check file ownership uid := info.Sys().(*syscall.Stat_t).Uid - if uid != uint32(os.Getuid()) { + if uid != uint32(m.myuid) { usr, err := user.LookupId(fmt.Sprint(uid)) if err == nil { - return true, fmt.Errorf("Access to performance counters locked by %s", usr.Username) + err = fmt.Errorf("access to performance counters locked by %s", usr.Username) } else { - return true, fmt.Errorf("Access to performance counters locked by %d", uid) + err = fmt.Errorf("access to performance counters locked by %d", uid) } + // delete error if we already returned the error once. 
+ if !m.lock_err_once { + m.lock_err_once = true + } else { + err = nil + } + return true, err } + // reset lock_err_once + m.lock_err_once = false + // Add the lock file to the watcher err = watcher.Add(m.config.LockfilePath) if err != nil { @@ -436,9 +449,7 @@ func (m *LikwidCollector) takeMeasurement(evidx int, evset LikwidEventsetConfig, gid = C.perfmon_addEventSet(evset.estr) } if gid < 0 { - return true, fmt.Errorf("failed to add events %s, error %d", evset.go_estr, gid) - } else { - evset.gid = gid + return true, fmt.Errorf("failed to add events %s, id %d, error %d", evset.go_estr, evidx, gid) } // Setup all performance monitoring counters of an eventSet @@ -549,11 +560,12 @@ func (m *LikwidCollector) takeMeasurement(evidx int, evset LikwidEventsetConfig, } // Get all measurement results for an event set, derive the metric values out of the measurement results and send it -func (m *LikwidCollector) calcEventsetMetrics(evset LikwidEventsetConfig, interval time.Duration, output chan lp.CCMetric) error { +func (m *LikwidCollector) calcEventsetMetrics(evset LikwidEventsetConfig, interval time.Duration, output chan lp.CCMessage) error { invClock := float64(1.0 / m.basefreq) for _, tid := range m.cpu2tid { evset.results[tid]["inverseClock"] = invClock + evset.results[tid]["gotime"] = interval.Seconds() } // Go over the event set metrics, derive the value out of the event:counter values and send it @@ -582,7 +594,7 @@ func (m *LikwidCollector) calcEventsetMetrics(evset LikwidEventsetConfig, interv if !math.IsNaN(value) && metric.Publish { fields := map[string]interface{}{"value": value} y, err := - lp.New( + lp.NewMessage( metric.Name, map[string]string{ "type": metric.Type, @@ -619,7 +631,7 @@ func (m *LikwidCollector) calcEventsetMetrics(evset LikwidEventsetConfig, interv for coreID, value := range totalCoreValues { y, err := - lp.New( + lp.NewMessage( metric.Name, map[string]string{ "type": "core", @@ -656,7 +668,7 @@ func (m *LikwidCollector) 
calcEventsetMetrics(evset LikwidEventsetConfig, interv for socketID, value := range totalSocketValues { y, err := - lp.New( + lp.NewMessage( metric.Name, map[string]string{ "type": "socket", @@ -691,7 +703,7 @@ func (m *LikwidCollector) calcEventsetMetrics(evset LikwidEventsetConfig, interv } y, err := - lp.New( + lp.NewMessage( metric.Name, map[string]string{ "type": "node", @@ -716,7 +728,7 @@ func (m *LikwidCollector) calcEventsetMetrics(evset LikwidEventsetConfig, interv } // Go over the global metrics, derive the value out of the event sets' metric values and send it -func (m *LikwidCollector) calcGlobalMetrics(groups []LikwidEventsetConfig, interval time.Duration, output chan lp.CCMetric) error { +func (m *LikwidCollector) calcGlobalMetrics(groups []LikwidEventsetConfig, interval time.Duration, output chan lp.CCMessage) error { // Send all metrics with same time stamp // This function does only computiation, counter measurement is done before now := time.Now() @@ -737,6 +749,7 @@ func (m *LikwidCollector) calcGlobalMetrics(groups []LikwidEventsetConfig, inter params[mname] = mres } } + params["gotime"] = interval.Seconds() // Evaluate the metric value, err := agg.EvalFloat64Condition(metric.Calc, params) if err != nil { @@ -750,7 +763,7 @@ func (m *LikwidCollector) calcGlobalMetrics(groups []LikwidEventsetConfig, inter if !math.IsNaN(value) { if metric.Publish { y, err := - lp.New( + lp.NewMessage( metric.Name, map[string]string{ "type": metric.Type, @@ -778,7 +791,7 @@ func (m *LikwidCollector) calcGlobalMetrics(groups []LikwidEventsetConfig, inter return nil } -func (m *LikwidCollector) ReadThread(interval time.Duration, output chan lp.CCMetric) { +func (m *LikwidCollector) ReadThread(interval time.Duration, output chan lp.CCMessage) { var err error = nil groups := make([]LikwidEventsetConfig, 0) @@ -798,15 +811,17 @@ func (m *LikwidCollector) ReadThread(interval time.Duration, output chan lp.CCMe if !skip { // read measurements and derive event set metrics 
m.calcEventsetMetrics(e, interval, output) + groups = append(groups, e) } - groups = append(groups, e) } - // calculate global metrics - m.calcGlobalMetrics(groups, interval, output) + if len(groups) > 0 { + // calculate global metrics + m.calcGlobalMetrics(groups, interval, output) + } } // main read function taking multiple measurement rounds, each 'interval' seconds long -func (m *LikwidCollector) Read(interval time.Duration, output chan lp.CCMetric) { +func (m *LikwidCollector) Read(interval time.Duration, output chan lp.CCMessage) { if !m.init { return } diff --git a/collectors/loadavgMetric.go b/collectors/loadavgMetric.go index 887e63e..03bd37e 100644 --- a/collectors/loadavgMetric.go +++ b/collectors/loadavgMetric.go @@ -8,18 +8,16 @@ import ( "strings" "time" + lp "github.com/ClusterCockpit/cc-energy-manager/pkg/cc-message" cclog "github.com/ClusterCockpit/cc-metric-collector/pkg/ccLogger" - lp "github.com/ClusterCockpit/cc-metric-collector/pkg/ccMetric" ) -// // LoadavgCollector collects: // * load average of last 1, 5 & 15 minutes // * number of processes currently runnable // * total number of processes in system // // See: https://www.kernel.org/doc/html/latest/filesystems/proc.html -// const LOADAVGFILE = "/proc/loadavg" type LoadavgCollector struct { @@ -68,17 +66,15 @@ func (m *LoadavgCollector) Init(config json.RawMessage) error { return nil } -func (m *LoadavgCollector) Read(interval time.Duration, output chan lp.CCMetric) { +func (m *LoadavgCollector) Read(interval time.Duration, output chan lp.CCMessage) { if !m.init { return } buffer, err := os.ReadFile(LOADAVGFILE) if err != nil { - if err != nil { - cclog.ComponentError( - m.name, - fmt.Sprintf("Read(): Failed to read file '%s': %v", LOADAVGFILE, err)) - } + cclog.ComponentError( + m.name, + fmt.Sprintf("Read(): Failed to read file '%s': %v", LOADAVGFILE, err)) return } now := time.Now() @@ -96,7 +92,7 @@ func (m *LoadavgCollector) Read(interval time.Duration, output chan lp.CCMetric) if 
m.load_skips[i] { continue } - y, err := lp.New(name, m.tags, m.meta, map[string]interface{}{"value": x}, now) + y, err := lp.NewMessage(name, m.tags, m.meta, map[string]interface{}{"value": x}, now) if err == nil { output <- y } @@ -115,7 +111,7 @@ func (m *LoadavgCollector) Read(interval time.Duration, output chan lp.CCMetric) if m.proc_skips[i] { continue } - y, err := lp.New(name, m.tags, m.meta, map[string]interface{}{"value": x}, now) + y, err := lp.NewMessage(name, m.tags, m.meta, map[string]interface{}{"value": x}, now) if err == nil { output <- y } diff --git a/collectors/lustreMetric.go b/collectors/lustreMetric.go index 02ce572..f6d6ef6 100644 --- a/collectors/lustreMetric.go +++ b/collectors/lustreMetric.go @@ -11,7 +11,7 @@ import ( "time" cclog "github.com/ClusterCockpit/cc-metric-collector/pkg/ccLogger" - lp "github.com/ClusterCockpit/cc-metric-collector/pkg/ccMetric" + lp "github.com/ClusterCockpit/cc-energy-manager/pkg/cc-message" ) const LUSTRE_SYSFS = `/sys/fs/lustre` @@ -377,7 +377,7 @@ func (m *LustreCollector) Init(config json.RawMessage) error { return nil } -func (m *LustreCollector) Read(interval time.Duration, output chan lp.CCMetric) { +func (m *LustreCollector) Read(interval time.Duration, output chan lp.CCMessage) { if !m.init { return } @@ -388,7 +388,7 @@ func (m *LustreCollector) Read(interval time.Duration, output chan lp.CCMetric) for _, def := range m.definitions { var use_x int64 var err error - var y lp.CCMetric + var y lp.CCMessage x, err := getMetricData(data, def.lineprefix, def.lineoffset) if err == nil { use_x = x @@ -399,19 +399,19 @@ func (m *LustreCollector) Read(interval time.Duration, output chan lp.CCMetric) switch def.calc { case "none": value = use_x - y, err = lp.New(def.name, m.tags, m.meta, map[string]interface{}{"value": value}, time.Now()) + y, err = lp.NewMessage(def.name, m.tags, m.meta, map[string]interface{}{"value": value}, time.Now()) case "difference": value = use_x - devData[def.name] if value.(int64) < 
0 { value = 0 } - y, err = lp.New(def.name, m.tags, m.meta, map[string]interface{}{"value": value}, time.Now()) + y, err = lp.NewMessage(def.name, m.tags, m.meta, map[string]interface{}{"value": value}, time.Now()) case "derivative": value = float64(use_x-devData[def.name]) / tdiff.Seconds() if value.(float64) < 0 { value = 0 } - y, err = lp.New(def.name, m.tags, m.meta, map[string]interface{}{"value": value}, time.Now()) + y, err = lp.NewMessage(def.name, m.tags, m.meta, map[string]interface{}{"value": value}, time.Now()) } if err == nil { y.AddTag("device", device) diff --git a/collectors/memstatMetric.go b/collectors/memstatMetric.go index 4aec4c8..795e030 100644 --- a/collectors/memstatMetric.go +++ b/collectors/memstatMetric.go @@ -13,7 +13,7 @@ import ( "time" cclog "github.com/ClusterCockpit/cc-metric-collector/pkg/ccLogger" - lp "github.com/ClusterCockpit/cc-metric-collector/pkg/ccMetric" + lp "github.com/ClusterCockpit/cc-energy-manager/pkg/cc-message" ) const MEMSTATFILE = "/proc/meminfo" @@ -159,7 +159,7 @@ func (m *MemstatCollector) Init(config json.RawMessage) error { return err } -func (m *MemstatCollector) Read(interval time.Duration, output chan lp.CCMetric) { +func (m *MemstatCollector) Read(interval time.Duration, output chan lp.CCMessage) { if !m.init { return } @@ -175,7 +175,7 @@ func (m *MemstatCollector) Read(interval time.Duration, output chan lp.CCMetric) } } - y, err := lp.New(name, tags, m.meta, map[string]interface{}{"value": value}, time.Now()) + y, err := lp.NewMessage(name, tags, m.meta, map[string]interface{}{"value": value}, time.Now()) if err == nil { if len(unit) > 0 { y.AddMeta("unit", unit) @@ -208,7 +208,7 @@ func (m *MemstatCollector) Read(interval time.Duration, output chan lp.CCMetric) } } } - y, err := lp.New("mem_used", tags, m.meta, map[string]interface{}{"value": memUsed}, time.Now()) + y, err := lp.NewMessage("mem_used", tags, m.meta, map[string]interface{}{"value": memUsed}, time.Now()) if err == nil { if len(unit) > 0 
{ y.AddMeta("unit", unit) diff --git a/collectors/metricCollector.go b/collectors/metricCollector.go index f09fa61..b8da524 100644 --- a/collectors/metricCollector.go +++ b/collectors/metricCollector.go @@ -5,7 +5,7 @@ import ( "fmt" "time" - lp "github.com/ClusterCockpit/cc-metric-collector/pkg/ccMetric" + lp "github.com/ClusterCockpit/cc-energy-manager/pkg/cc-message" ) type MetricCollector interface { @@ -13,7 +13,7 @@ type MetricCollector interface { Init(config json.RawMessage) error // Initialize metric collector Initialized() bool // Is metric collector initialized? Parallel() bool - Read(duration time.Duration, output chan lp.CCMetric) // Read metrics from metric collector + Read(duration time.Duration, output chan lp.CCMessage) // Read metrics from metric collector Close() // Close / finish metric collector } diff --git a/collectors/netstatMetric.go b/collectors/netstatMetric.go index 2ae1764..7933e53 100644 --- a/collectors/netstatMetric.go +++ b/collectors/netstatMetric.go @@ -10,7 +10,7 @@ import ( "time" cclog "github.com/ClusterCockpit/cc-metric-collector/pkg/ccLogger" - lp "github.com/ClusterCockpit/cc-metric-collector/pkg/ccMetric" + lp "github.com/ClusterCockpit/cc-energy-manager/pkg/cc-message" ) const NETSTATFILE = "/proc/net/dev" @@ -153,7 +153,7 @@ func (m *NetstatCollector) Init(config json.RawMessage) error { return nil } -func (m *NetstatCollector) Read(interval time.Duration, output chan lp.CCMetric) { +func (m *NetstatCollector) Read(interval time.Duration, output chan lp.CCMessage) { if !m.init { return } @@ -197,14 +197,14 @@ func (m *NetstatCollector) Read(interval time.Duration, output chan lp.CCMetric) continue } if m.config.SendAbsoluteValues { - if y, err := lp.New(metric.name, metric.tags, metric.meta, map[string]interface{}{"value": v}, now); err == nil { + if y, err := lp.NewMessage(metric.name, metric.tags, metric.meta, map[string]interface{}{"value": v}, now); err == nil { output <- y } } if m.config.SendDerivedValues { if 
metric.lastValue >= 0 { rate := float64(v-metric.lastValue) / timeDiff - if y, err := lp.New(metric.name+"_bw", metric.tags, metric.meta_rates, map[string]interface{}{"value": rate}, now); err == nil { + if y, err := lp.NewMessage(metric.name+"_bw", metric.tags, metric.meta_rates, map[string]interface{}{"value": rate}, now); err == nil { output <- y } } diff --git a/collectors/nfsMetric.go b/collectors/nfsMetric.go index 7dca096..019e25c 100644 --- a/collectors/nfsMetric.go +++ b/collectors/nfsMetric.go @@ -11,7 +11,7 @@ import ( "strings" "time" - lp "github.com/ClusterCockpit/cc-metric-collector/pkg/ccMetric" + lp "github.com/ClusterCockpit/cc-energy-manager/pkg/cc-message" ) // First part contains the code for the general NfsCollector. @@ -118,7 +118,7 @@ func (m *nfsCollector) MainInit(config json.RawMessage) error { return nil } -func (m *nfsCollector) Read(interval time.Duration, output chan lp.CCMetric) { +func (m *nfsCollector) Read(interval time.Duration, output chan lp.CCMessage) { if !m.init { return } @@ -140,7 +140,7 @@ func (m *nfsCollector) Read(interval time.Duration, output chan lp.CCMetric) { continue } value := data.current - data.last - y, err := lp.New(fmt.Sprintf("%s_%s", prefix, name), m.tags, m.meta, map[string]interface{}{"value": value}, timestamp) + y, err := lp.NewMessage(fmt.Sprintf("%s_%s", prefix, name), m.tags, m.meta, map[string]interface{}{"value": value}, timestamp) if err == nil { y.AddMeta("version", m.version) output <- y diff --git a/collectors/nfsiostatMetric.go b/collectors/nfsiostatMetric.go index 810215c..09686e9 100644 --- a/collectors/nfsiostatMetric.go +++ b/collectors/nfsiostatMetric.go @@ -10,7 +10,7 @@ import ( "time" cclog "github.com/ClusterCockpit/cc-metric-collector/pkg/ccLogger" - lp "github.com/ClusterCockpit/cc-metric-collector/pkg/ccMetric" + lp "github.com/ClusterCockpit/cc-energy-manager/pkg/cc-message" ) // These are the fields we read from the JSON configuration @@ -114,7 +114,7 @@ func (m 
*NfsIOStatCollector) Init(config json.RawMessage) error { return err } -func (m *NfsIOStatCollector) Read(interval time.Duration, output chan lp.CCMetric) { +func (m *NfsIOStatCollector) Read(interval time.Duration, output chan lp.CCMessage) { timestamp := time.Now() // Get the current values for all mountpoints @@ -126,7 +126,7 @@ func (m *NfsIOStatCollector) Read(interval time.Duration, output chan lp.CCMetri // Calculate the difference of old and new values for i := range values { x := values[i] - old[i] - y, err := lp.New(fmt.Sprintf("nfsio_%s", i), m.tags, m.meta, map[string]interface{}{"value": x}, timestamp) + y, err := lp.NewMessage(fmt.Sprintf("nfsio_%s", i), m.tags, m.meta, map[string]interface{}{"value": x}, timestamp) if err == nil { if strings.HasPrefix(i, "page") { y.AddMeta("unit", "4K_Pages") diff --git a/collectors/numastatsMetric.go b/collectors/numastatsMetric.go index f79b45b..20b000c 100644 --- a/collectors/numastatsMetric.go +++ b/collectors/numastatsMetric.go @@ -11,7 +11,7 @@ import ( "time" cclog "github.com/ClusterCockpit/cc-metric-collector/pkg/ccLogger" - lp "github.com/ClusterCockpit/cc-metric-collector/pkg/ccMetric" + lp "github.com/ClusterCockpit/cc-energy-manager/pkg/cc-message" ) // Non-Uniform Memory Access (NUMA) policy hit/miss statistics @@ -97,7 +97,7 @@ func (m *NUMAStatsCollector) Init(config json.RawMessage) error { return nil } -func (m *NUMAStatsCollector) Read(interval time.Duration, output chan lp.CCMetric) { +func (m *NUMAStatsCollector) Read(interval time.Duration, output chan lp.CCMessage) { if !m.init { return } @@ -130,7 +130,7 @@ func (m *NUMAStatsCollector) Read(interval time.Duration, output chan lp.CCMetri fmt.Sprintf("Read(): Failed to convert %s='%s' to int64: %v", key, split[1], err)) continue } - y, err := lp.New( + y, err := lp.NewMessage( "numastats_"+key, t.tagSet, m.meta, diff --git a/collectors/nvidiaMetric.go b/collectors/nvidiaMetric.go index 369f12b..c28f0ce 100644 --- a/collectors/nvidiaMetric.go 
+++ b/collectors/nvidiaMetric.go @@ -8,8 +8,8 @@ import ( "strings" "time" + lp "github.com/ClusterCockpit/cc-energy-manager/pkg/cc-message" cclog "github.com/ClusterCockpit/cc-metric-collector/pkg/ccLogger" - lp "github.com/ClusterCockpit/cc-metric-collector/pkg/ccMetric" "github.com/NVIDIA/go-nvml/pkg/nvml" ) @@ -206,7 +206,7 @@ func (m *NvidiaCollector) Init(config json.RawMessage) error { return nil } -func readMemoryInfo(device NvidiaCollectorDevice, output chan lp.CCMetric) error { +func readMemoryInfo(device NvidiaCollectorDevice, output chan lp.CCMessage) error { if !device.excludeMetrics["nv_fb_mem_total"] || !device.excludeMetrics["nv_fb_mem_used"] || !device.excludeMetrics["nv_fb_mem_reserved"] { var total uint64 var used uint64 @@ -222,7 +222,7 @@ func readMemoryInfo(device NvidiaCollectorDevice, output chan lp.CCMetric) error if !device.excludeMetrics["nv_fb_mem_total"] { t := float64(total) / (1024 * 1024) - y, err := lp.New("nv_fb_mem_total", device.tags, device.meta, map[string]interface{}{"value": t}, time.Now()) + y, err := lp.NewMessage("nv_fb_mem_total", device.tags, device.meta, map[string]interface{}{"value": t}, time.Now()) if err == nil { y.AddMeta("unit", "MByte") output <- y @@ -231,7 +231,7 @@ func readMemoryInfo(device NvidiaCollectorDevice, output chan lp.CCMetric) error if !device.excludeMetrics["nv_fb_mem_used"] { f := float64(used) / (1024 * 1024) - y, err := lp.New("nv_fb_mem_used", device.tags, device.meta, map[string]interface{}{"value": f}, time.Now()) + y, err := lp.NewMessage("nv_fb_mem_used", device.tags, device.meta, map[string]interface{}{"value": f}, time.Now()) if err == nil { y.AddMeta("unit", "MByte") output <- y @@ -240,7 +240,7 @@ func readMemoryInfo(device NvidiaCollectorDevice, output chan lp.CCMetric) error if v2 && !device.excludeMetrics["nv_fb_mem_reserved"] { r := float64(reserved) / (1024 * 1024) - y, err := lp.New("nv_fb_mem_reserved", device.tags, device.meta, map[string]interface{}{"value": r}, time.Now()) + 
y, err := lp.NewMessage("nv_fb_mem_reserved", device.tags, device.meta, map[string]interface{}{"value": r}, time.Now()) if err == nil { y.AddMeta("unit", "MByte") output <- y @@ -250,7 +250,7 @@ func readMemoryInfo(device NvidiaCollectorDevice, output chan lp.CCMetric) error return nil } -func readBarMemoryInfo(device NvidiaCollectorDevice, output chan lp.CCMetric) error { +func readBarMemoryInfo(device NvidiaCollectorDevice, output chan lp.CCMessage) error { if !device.excludeMetrics["nv_bar1_mem_total"] || !device.excludeMetrics["nv_bar1_mem_used"] { meminfo, ret := nvml.DeviceGetBAR1MemoryInfo(device.device) if ret != nvml.SUCCESS { @@ -259,7 +259,7 @@ func readBarMemoryInfo(device NvidiaCollectorDevice, output chan lp.CCMetric) er } if !device.excludeMetrics["nv_bar1_mem_total"] { t := float64(meminfo.Bar1Total) / (1024 * 1024) - y, err := lp.New("nv_bar1_mem_total", device.tags, device.meta, map[string]interface{}{"value": t}, time.Now()) + y, err := lp.NewMessage("nv_bar1_mem_total", device.tags, device.meta, map[string]interface{}{"value": t}, time.Now()) if err == nil { y.AddMeta("unit", "MByte") output <- y @@ -267,7 +267,7 @@ func readBarMemoryInfo(device NvidiaCollectorDevice, output chan lp.CCMetric) er } if !device.excludeMetrics["nv_bar1_mem_used"] { t := float64(meminfo.Bar1Used) / (1024 * 1024) - y, err := lp.New("nv_bar1_mem_used", device.tags, device.meta, map[string]interface{}{"value": t}, time.Now()) + y, err := lp.NewMessage("nv_bar1_mem_used", device.tags, device.meta, map[string]interface{}{"value": t}, time.Now()) if err == nil { y.AddMeta("unit", "MByte") output <- y @@ -277,7 +277,7 @@ func readBarMemoryInfo(device NvidiaCollectorDevice, output chan lp.CCMetric) er return nil } -func readUtilization(device NvidiaCollectorDevice, output chan lp.CCMetric) error { +func readUtilization(device NvidiaCollectorDevice, output chan lp.CCMessage) error { isMig, ret := nvml.DeviceIsMigDeviceHandle(device.device) if ret != nvml.SUCCESS { err := 
errors.New(nvml.ErrorString(ret)) @@ -301,14 +301,14 @@ func readUtilization(device NvidiaCollectorDevice, output chan lp.CCMetric) erro util, ret := nvml.DeviceGetUtilizationRates(device.device) if ret == nvml.SUCCESS { if !device.excludeMetrics["nv_util"] { - y, err := lp.New("nv_util", device.tags, device.meta, map[string]interface{}{"value": float64(util.Gpu)}, time.Now()) + y, err := lp.NewMessage("nv_util", device.tags, device.meta, map[string]interface{}{"value": float64(util.Gpu)}, time.Now()) if err == nil { y.AddMeta("unit", "%") output <- y } } if !device.excludeMetrics["nv_mem_util"] { - y, err := lp.New("nv_mem_util", device.tags, device.meta, map[string]interface{}{"value": float64(util.Memory)}, time.Now()) + y, err := lp.NewMessage("nv_mem_util", device.tags, device.meta, map[string]interface{}{"value": float64(util.Memory)}, time.Now()) if err == nil { y.AddMeta("unit", "%") output <- y @@ -319,7 +319,7 @@ func readUtilization(device NvidiaCollectorDevice, output chan lp.CCMetric) erro return nil } -func readTemp(device NvidiaCollectorDevice, output chan lp.CCMetric) error { +func readTemp(device NvidiaCollectorDevice, output chan lp.CCMessage) error { if !device.excludeMetrics["nv_temp"] { // Retrieves the current temperature readings for the device, in degrees C. 
// @@ -328,7 +328,7 @@ func readTemp(device NvidiaCollectorDevice, output chan lp.CCMetric) error { // * NVML_TEMPERATURE_COUNT temp, ret := nvml.DeviceGetTemperature(device.device, nvml.TEMPERATURE_GPU) if ret == nvml.SUCCESS { - y, err := lp.New("nv_temp", device.tags, device.meta, map[string]interface{}{"value": float64(temp)}, time.Now()) + y, err := lp.NewMessage("nv_temp", device.tags, device.meta, map[string]interface{}{"value": float64(temp)}, time.Now()) if err == nil { y.AddMeta("unit", "degC") output <- y @@ -338,7 +338,7 @@ func readTemp(device NvidiaCollectorDevice, output chan lp.CCMetric) error { return nil } -func readFan(device NvidiaCollectorDevice, output chan lp.CCMetric) error { +func readFan(device NvidiaCollectorDevice, output chan lp.CCMessage) error { if !device.excludeMetrics["nv_fan"] { // Retrieves the intended operating speed of the device's fan. // @@ -351,7 +351,7 @@ func readFan(device NvidiaCollectorDevice, output chan lp.CCMetric) error { // This value may exceed 100% in certain cases. 
fan, ret := nvml.DeviceGetFanSpeed(device.device) if ret == nvml.SUCCESS { - y, err := lp.New("nv_fan", device.tags, device.meta, map[string]interface{}{"value": float64(fan)}, time.Now()) + y, err := lp.NewMessage("nv_fan", device.tags, device.meta, map[string]interface{}{"value": float64(fan)}, time.Now()) if err == nil { y.AddMeta("unit", "%") output <- y @@ -361,14 +361,14 @@ func readFan(device NvidiaCollectorDevice, output chan lp.CCMetric) error { return nil } -// func readFans(device NvidiaCollectorDevice, output chan lp.CCMetric) error { +// func readFans(device NvidiaCollectorDevice, output chan lp.CCMessage) error { // if !device.excludeMetrics["nv_fan"] { // numFans, ret := nvml.DeviceGetNumFans(device.device) // if ret == nvml.SUCCESS { // for i := 0; i < numFans; i++ { // fan, ret := nvml.DeviceGetFanSpeed_v2(device.device, i) // if ret == nvml.SUCCESS { -// y, err := lp.New("nv_fan", device.tags, device.meta, map[string]interface{}{"value": float64(fan)}, time.Now()) +// y, err := lp.NewMessage("nv_fan", device.tags, device.meta, map[string]interface{}{"value": float64(fan)}, time.Now()) // if err == nil { // y.AddMeta("unit", "%") // y.AddTag("stype", "fan") @@ -382,7 +382,7 @@ func readFan(device NvidiaCollectorDevice, output chan lp.CCMetric) error { // return nil // } -func readEccMode(device NvidiaCollectorDevice, output chan lp.CCMetric) error { +func readEccMode(device NvidiaCollectorDevice, output chan lp.CCMessage) error { if !device.excludeMetrics["nv_ecc_mode"] { // Retrieves the current and pending ECC modes for the device. // @@ -393,21 +393,21 @@ func readEccMode(device NvidiaCollectorDevice, output chan lp.CCMetric) error { // The "pending" ECC mode refers to the target mode following the next reboot. 
_, ecc_pend, ret := nvml.DeviceGetEccMode(device.device) if ret == nvml.SUCCESS { - var y lp.CCMetric + var y lp.CCMessage var err error switch ecc_pend { case nvml.FEATURE_DISABLED: - y, err = lp.New("nv_ecc_mode", device.tags, device.meta, map[string]interface{}{"value": "OFF"}, time.Now()) + y, err = lp.NewMessage("nv_ecc_mode", device.tags, device.meta, map[string]interface{}{"value": "OFF"}, time.Now()) case nvml.FEATURE_ENABLED: - y, err = lp.New("nv_ecc_mode", device.tags, device.meta, map[string]interface{}{"value": "ON"}, time.Now()) + y, err = lp.NewMessage("nv_ecc_mode", device.tags, device.meta, map[string]interface{}{"value": "ON"}, time.Now()) default: - y, err = lp.New("nv_ecc_mode", device.tags, device.meta, map[string]interface{}{"value": "UNKNOWN"}, time.Now()) + y, err = lp.NewMessage("nv_ecc_mode", device.tags, device.meta, map[string]interface{}{"value": "UNKNOWN"}, time.Now()) } if err == nil { output <- y } } else if ret == nvml.ERROR_NOT_SUPPORTED { - y, err := lp.New("nv_ecc_mode", device.tags, device.meta, map[string]interface{}{"value": "N/A"}, time.Now()) + y, err := lp.NewMessage("nv_ecc_mode", device.tags, device.meta, map[string]interface{}{"value": "N/A"}, time.Now()) if err == nil { output <- y } @@ -416,7 +416,7 @@ func readEccMode(device NvidiaCollectorDevice, output chan lp.CCMetric) error { return nil } -func readPerfState(device NvidiaCollectorDevice, output chan lp.CCMetric) error { +func readPerfState(device NvidiaCollectorDevice, output chan lp.CCMessage) error { if !device.excludeMetrics["nv_perf_state"] { // Retrieves the current performance state for the device. // @@ -427,7 +427,7 @@ func readPerfState(device NvidiaCollectorDevice, output chan lp.CCMetric) error // 32: Unknown performance state. 
pState, ret := nvml.DeviceGetPerformanceState(device.device) if ret == nvml.SUCCESS { - y, err := lp.New("nv_perf_state", device.tags, device.meta, map[string]interface{}{"value": fmt.Sprintf("P%d", int(pState))}, time.Now()) + y, err := lp.NewMessage("nv_perf_state", device.tags, device.meta, map[string]interface{}{"value": fmt.Sprintf("P%d", int(pState))}, time.Now()) if err == nil { output <- y } @@ -436,7 +436,7 @@ func readPerfState(device NvidiaCollectorDevice, output chan lp.CCMetric) error return nil } -func readPowerUsage(device NvidiaCollectorDevice, output chan lp.CCMetric) error { +func readPowerUsage(device NvidiaCollectorDevice, output chan lp.CCMessage) error { if !device.excludeMetrics["nv_power_usage"] { // Retrieves power usage for this GPU in milliwatts and its associated circuitry (e.g. memory) // @@ -450,7 +450,7 @@ func readPowerUsage(device NvidiaCollectorDevice, output chan lp.CCMetric) error if mode == nvml.FEATURE_ENABLED { power, ret := nvml.DeviceGetPowerUsage(device.device) if ret == nvml.SUCCESS { - y, err := lp.New("nv_power_usage", device.tags, device.meta, map[string]interface{}{"value": float64(power) / 1000}, time.Now()) + y, err := lp.NewMessage("nv_power_usage", device.tags, device.meta, map[string]interface{}{"value": float64(power) / 1000}, time.Now()) if err == nil { y.AddMeta("unit", "watts") output <- y @@ -461,7 +461,7 @@ func readPowerUsage(device NvidiaCollectorDevice, output chan lp.CCMetric) error return nil } -func readClocks(device NvidiaCollectorDevice, output chan lp.CCMetric) error { +func readClocks(device NvidiaCollectorDevice, output chan lp.CCMessage) error { // Retrieves the current clock speeds for the device. 
// // Available clock information: @@ -471,7 +471,7 @@ func readClocks(device NvidiaCollectorDevice, output chan lp.CCMetric) error { if !device.excludeMetrics["nv_graphics_clock"] { graphicsClock, ret := nvml.DeviceGetClockInfo(device.device, nvml.CLOCK_GRAPHICS) if ret == nvml.SUCCESS { - y, err := lp.New("nv_graphics_clock", device.tags, device.meta, map[string]interface{}{"value": float64(graphicsClock)}, time.Now()) + y, err := lp.NewMessage("nv_graphics_clock", device.tags, device.meta, map[string]interface{}{"value": float64(graphicsClock)}, time.Now()) if err == nil { y.AddMeta("unit", "MHz") output <- y @@ -482,7 +482,7 @@ func readClocks(device NvidiaCollectorDevice, output chan lp.CCMetric) error { if !device.excludeMetrics["nv_sm_clock"] { smCock, ret := nvml.DeviceGetClockInfo(device.device, nvml.CLOCK_SM) if ret == nvml.SUCCESS { - y, err := lp.New("nv_sm_clock", device.tags, device.meta, map[string]interface{}{"value": float64(smCock)}, time.Now()) + y, err := lp.NewMessage("nv_sm_clock", device.tags, device.meta, map[string]interface{}{"value": float64(smCock)}, time.Now()) if err == nil { y.AddMeta("unit", "MHz") output <- y @@ -493,7 +493,7 @@ func readClocks(device NvidiaCollectorDevice, output chan lp.CCMetric) error { if !device.excludeMetrics["nv_mem_clock"] { memClock, ret := nvml.DeviceGetClockInfo(device.device, nvml.CLOCK_MEM) if ret == nvml.SUCCESS { - y, err := lp.New("nv_mem_clock", device.tags, device.meta, map[string]interface{}{"value": float64(memClock)}, time.Now()) + y, err := lp.NewMessage("nv_mem_clock", device.tags, device.meta, map[string]interface{}{"value": float64(memClock)}, time.Now()) if err == nil { y.AddMeta("unit", "MHz") output <- y @@ -503,7 +503,7 @@ func readClocks(device NvidiaCollectorDevice, output chan lp.CCMetric) error { if !device.excludeMetrics["nv_video_clock"] { memClock, ret := nvml.DeviceGetClockInfo(device.device, nvml.CLOCK_VIDEO) if ret == nvml.SUCCESS { - y, err := lp.New("nv_video_clock", 
device.tags, device.meta, map[string]interface{}{"value": float64(memClock)}, time.Now()) + y, err := lp.NewMessage("nv_video_clock", device.tags, device.meta, map[string]interface{}{"value": float64(memClock)}, time.Now()) if err == nil { y.AddMeta("unit", "MHz") output <- y @@ -513,7 +513,7 @@ func readClocks(device NvidiaCollectorDevice, output chan lp.CCMetric) error { return nil } -func readMaxClocks(device NvidiaCollectorDevice, output chan lp.CCMetric) error { +func readMaxClocks(device NvidiaCollectorDevice, output chan lp.CCMessage) error { // Retrieves the maximum clock speeds for the device. // // Available clock information: @@ -528,7 +528,7 @@ func readMaxClocks(device NvidiaCollectorDevice, output chan lp.CCMetric) error if !device.excludeMetrics["nv_max_graphics_clock"] { max_gclk, ret := nvml.DeviceGetMaxClockInfo(device.device, nvml.CLOCK_GRAPHICS) if ret == nvml.SUCCESS { - y, err := lp.New("nv_max_graphics_clock", device.tags, device.meta, map[string]interface{}{"value": float64(max_gclk)}, time.Now()) + y, err := lp.NewMessage("nv_max_graphics_clock", device.tags, device.meta, map[string]interface{}{"value": float64(max_gclk)}, time.Now()) if err == nil { y.AddMeta("unit", "MHz") output <- y @@ -539,7 +539,7 @@ func readMaxClocks(device NvidiaCollectorDevice, output chan lp.CCMetric) error if !device.excludeMetrics["nv_max_sm_clock"] { maxSmClock, ret := nvml.DeviceGetClockInfo(device.device, nvml.CLOCK_SM) if ret == nvml.SUCCESS { - y, err := lp.New("nv_max_sm_clock", device.tags, device.meta, map[string]interface{}{"value": float64(maxSmClock)}, time.Now()) + y, err := lp.NewMessage("nv_max_sm_clock", device.tags, device.meta, map[string]interface{}{"value": float64(maxSmClock)}, time.Now()) if err == nil { y.AddMeta("unit", "MHz") output <- y @@ -550,7 +550,7 @@ func readMaxClocks(device NvidiaCollectorDevice, output chan lp.CCMetric) error if !device.excludeMetrics["nv_max_mem_clock"] { maxMemClock, ret := 
nvml.DeviceGetClockInfo(device.device, nvml.CLOCK_MEM) if ret == nvml.SUCCESS { - y, err := lp.New("nv_max_mem_clock", device.tags, device.meta, map[string]interface{}{"value": float64(maxMemClock)}, time.Now()) + y, err := lp.NewMessage("nv_max_mem_clock", device.tags, device.meta, map[string]interface{}{"value": float64(maxMemClock)}, time.Now()) if err == nil { y.AddMeta("unit", "MHz") output <- y @@ -561,7 +561,7 @@ func readMaxClocks(device NvidiaCollectorDevice, output chan lp.CCMetric) error if !device.excludeMetrics["nv_max_video_clock"] { maxMemClock, ret := nvml.DeviceGetClockInfo(device.device, nvml.CLOCK_VIDEO) if ret == nvml.SUCCESS { - y, err := lp.New("nv_max_video_clock", device.tags, device.meta, map[string]interface{}{"value": float64(maxMemClock)}, time.Now()) + y, err := lp.NewMessage("nv_max_video_clock", device.tags, device.meta, map[string]interface{}{"value": float64(maxMemClock)}, time.Now()) if err == nil { y.AddMeta("unit", "MHz") output <- y @@ -571,7 +571,7 @@ func readMaxClocks(device NvidiaCollectorDevice, output chan lp.CCMetric) error return nil } -func readEccErrors(device NvidiaCollectorDevice, output chan lp.CCMetric) error { +func readEccErrors(device NvidiaCollectorDevice, output chan lp.CCMessage) error { if !device.excludeMetrics["nv_ecc_uncorrected_error"] { // Retrieves the total ECC error counts for the device. // @@ -584,7 +584,7 @@ func readEccErrors(device NvidiaCollectorDevice, output chan lp.CCMetric) error // i.e. the total set of errors across the entire device. 
ecc_db, ret := nvml.DeviceGetTotalEccErrors(device.device, nvml.MEMORY_ERROR_TYPE_UNCORRECTED, nvml.AGGREGATE_ECC) if ret == nvml.SUCCESS { - y, err := lp.New("nv_ecc_uncorrected_error", device.tags, device.meta, map[string]interface{}{"value": float64(ecc_db)}, time.Now()) + y, err := lp.NewMessage("nv_ecc_uncorrected_error", device.tags, device.meta, map[string]interface{}{"value": float64(ecc_db)}, time.Now()) if err == nil { output <- y } @@ -593,7 +593,7 @@ func readEccErrors(device NvidiaCollectorDevice, output chan lp.CCMetric) error if !device.excludeMetrics["nv_ecc_corrected_error"] { ecc_sb, ret := nvml.DeviceGetTotalEccErrors(device.device, nvml.MEMORY_ERROR_TYPE_CORRECTED, nvml.AGGREGATE_ECC) if ret == nvml.SUCCESS { - y, err := lp.New("nv_ecc_corrected_error", device.tags, device.meta, map[string]interface{}{"value": float64(ecc_sb)}, time.Now()) + y, err := lp.NewMessage("nv_ecc_corrected_error", device.tags, device.meta, map[string]interface{}{"value": float64(ecc_sb)}, time.Now()) if err == nil { output <- y } @@ -602,7 +602,7 @@ func readEccErrors(device NvidiaCollectorDevice, output chan lp.CCMetric) error return nil } -func readPowerLimit(device NvidiaCollectorDevice, output chan lp.CCMetric) error { +func readPowerLimit(device NvidiaCollectorDevice, output chan lp.CCMessage) error { if !device.excludeMetrics["nv_power_max_limit"] { // Retrieves the power management limit associated with this device. // @@ -612,7 +612,7 @@ func readPowerLimit(device NvidiaCollectorDevice, output chan lp.CCMetric) error // If the card's total power draw reaches this limit the power management algorithm kicks in. 
pwr_limit, ret := nvml.DeviceGetPowerManagementLimit(device.device) if ret == nvml.SUCCESS { - y, err := lp.New("nv_power_max_limit", device.tags, device.meta, map[string]interface{}{"value": float64(pwr_limit) / 1000}, time.Now()) + y, err := lp.NewMessage("nv_power_max_limit", device.tags, device.meta, map[string]interface{}{"value": float64(pwr_limit) / 1000}, time.Now()) if err == nil { y.AddMeta("unit", "watts") output <- y @@ -622,7 +622,7 @@ func readPowerLimit(device NvidiaCollectorDevice, output chan lp.CCMetric) error return nil } -func readEncUtilization(device NvidiaCollectorDevice, output chan lp.CCMetric) error { +func readEncUtilization(device NvidiaCollectorDevice, output chan lp.CCMessage) error { isMig, ret := nvml.DeviceIsMigDeviceHandle(device.device) if ret != nvml.SUCCESS { err := errors.New(nvml.ErrorString(ret)) @@ -639,7 +639,7 @@ func readEncUtilization(device NvidiaCollectorDevice, output chan lp.CCMetric) e // Note: On MIG-enabled GPUs, querying encoder utilization is not currently supported. 
enc_util, _, ret := nvml.DeviceGetEncoderUtilization(device.device) if ret == nvml.SUCCESS { - y, err := lp.New("nv_encoder_util", device.tags, device.meta, map[string]interface{}{"value": float64(enc_util)}, time.Now()) + y, err := lp.NewMessage("nv_encoder_util", device.tags, device.meta, map[string]interface{}{"value": float64(enc_util)}, time.Now()) if err == nil { y.AddMeta("unit", "%") output <- y @@ -649,7 +649,7 @@ func readEncUtilization(device NvidiaCollectorDevice, output chan lp.CCMetric) e return nil } -func readDecUtilization(device NvidiaCollectorDevice, output chan lp.CCMetric) error { +func readDecUtilization(device NvidiaCollectorDevice, output chan lp.CCMessage) error { isMig, ret := nvml.DeviceIsMigDeviceHandle(device.device) if ret != nvml.SUCCESS { err := errors.New(nvml.ErrorString(ret)) @@ -666,7 +666,7 @@ func readDecUtilization(device NvidiaCollectorDevice, output chan lp.CCMetric) e // Note: On MIG-enabled GPUs, querying encoder utilization is not currently supported. 
dec_util, _, ret := nvml.DeviceGetDecoderUtilization(device.device) if ret == nvml.SUCCESS { - y, err := lp.New("nv_decoder_util", device.tags, device.meta, map[string]interface{}{"value": float64(dec_util)}, time.Now()) + y, err := lp.NewMessage("nv_decoder_util", device.tags, device.meta, map[string]interface{}{"value": float64(dec_util)}, time.Now()) if err == nil { y.AddMeta("unit", "%") output <- y @@ -676,7 +676,7 @@ func readDecUtilization(device NvidiaCollectorDevice, output chan lp.CCMetric) e return nil } -func readRemappedRows(device NvidiaCollectorDevice, output chan lp.CCMetric) error { +func readRemappedRows(device NvidiaCollectorDevice, output chan lp.CCMessage) error { if !device.excludeMetrics["nv_remapped_rows_corrected"] || !device.excludeMetrics["nv_remapped_rows_uncorrected"] || !device.excludeMetrics["nv_remapped_rows_pending"] || @@ -693,13 +693,13 @@ func readRemappedRows(device NvidiaCollectorDevice, output chan lp.CCMetric) err corrected, uncorrected, pending, failure, ret := nvml.DeviceGetRemappedRows(device.device) if ret == nvml.SUCCESS { if !device.excludeMetrics["nv_remapped_rows_corrected"] { - y, err := lp.New("nv_remapped_rows_corrected", device.tags, device.meta, map[string]interface{}{"value": float64(corrected)}, time.Now()) + y, err := lp.NewMessage("nv_remapped_rows_corrected", device.tags, device.meta, map[string]interface{}{"value": float64(corrected)}, time.Now()) if err == nil { output <- y } } if !device.excludeMetrics["nv_remapped_rows_uncorrected"] { - y, err := lp.New("nv_remapped_rows_corrected", device.tags, device.meta, map[string]interface{}{"value": float64(uncorrected)}, time.Now()) + y, err := lp.NewMessage("nv_remapped_rows_uncorrected", device.tags, device.meta, map[string]interface{}{"value": float64(uncorrected)}, time.Now()) if err == nil { output <- y } @@ -709,7 +709,7 @@ func readRemappedRows(device NvidiaCollectorDevice, output chan lp.CCMetric) err if pending { p = 1 } - y, err :=
lp.New("nv_remapped_rows_pending", device.tags, device.meta, map[string]interface{}{"value": p}, time.Now()) + y, err := lp.NewMessage("nv_remapped_rows_pending", device.tags, device.meta, map[string]interface{}{"value": p}, time.Now()) if err == nil { output <- y } @@ -719,7 +719,7 @@ func readRemappedRows(device NvidiaCollectorDevice, output chan lp.CCMetric) err if failure { f = 1 } - y, err := lp.New("nv_remapped_rows_failure", device.tags, device.meta, map[string]interface{}{"value": f}, time.Now()) + y, err := lp.NewMessage("nv_remapped_rows_failure", device.tags, device.meta, map[string]interface{}{"value": f}, time.Now()) if err == nil { output <- y } @@ -729,7 +729,7 @@ func readRemappedRows(device NvidiaCollectorDevice, output chan lp.CCMetric) err return nil } -func readProcessCounts(device NvidiaCollectorDevice, output chan lp.CCMetric) error { +func readProcessCounts(device NvidiaCollectorDevice, output chan lp.CCMessage) error { if !device.excludeMetrics["nv_compute_processes"] { // Get information about processes with a compute context on a device // @@ -753,7 +753,7 @@ func readProcessCounts(device NvidiaCollectorDevice, output chan lp.CCMetric) er // Querying per-instance information using MIG device handles is not supported if the device is in vGPU Host virtualization mode. procList, ret := nvml.DeviceGetComputeRunningProcesses(device.device) if ret == nvml.SUCCESS { - y, err := lp.New("nv_compute_processes", device.tags, device.meta, map[string]interface{}{"value": len(procList)}, time.Now()) + y, err := lp.NewMessage("nv_compute_processes", device.tags, device.meta, map[string]interface{}{"value": len(procList)}, time.Now()) if err == nil { output <- y } @@ -782,7 +782,7 @@ func readProcessCounts(device NvidiaCollectorDevice, output chan lp.CCMetric) er // Querying per-instance information using MIG device handles is not supported if the device is in vGPU Host virtualization mode. 
procList, ret := nvml.DeviceGetGraphicsRunningProcesses(device.device) if ret == nvml.SUCCESS { - y, err := lp.New("nv_graphics_processes", device.tags, device.meta, map[string]interface{}{"value": len(procList)}, time.Now()) + y, err := lp.NewMessage("nv_graphics_processes", device.tags, device.meta, map[string]interface{}{"value": len(procList)}, time.Now()) if err == nil { output <- y } @@ -812,7 +812,7 @@ func readProcessCounts(device NvidiaCollectorDevice, output chan lp.CCMetric) er // // Querying per-instance information using MIG device handles is not supported if the device is in vGPU Host virtualization mode. // procList, ret := nvml.DeviceGetMPSComputeRunningProcesses(device.device) // if ret == nvml.SUCCESS { - // y, err := lp.New("nv_mps_compute_processes", device.tags, device.meta, map[string]interface{}{"value": len(procList)}, time.Now()) + // y, err := lp.NewMessage("nv_mps_compute_processes", device.tags, device.meta, map[string]interface{}{"value": len(procList)}, time.Now()) // if err == nil { // output <- y // } @@ -821,7 +821,7 @@ func readProcessCounts(device NvidiaCollectorDevice, output chan lp.CCMetric) er return nil } -func readViolationStats(device NvidiaCollectorDevice, output chan lp.CCMetric) error { +func readViolationStats(device NvidiaCollectorDevice, output chan lp.CCMessage) error { var violTime nvml.ViolationTime var ret nvml.Return @@ -840,7 +840,7 @@ func readViolationStats(device NvidiaCollectorDevice, output chan lp.CCMetric) e violTime, ret = nvml.DeviceGetViolationStatus(device.device, nvml.PERF_POLICY_POWER) if ret == nvml.SUCCESS { t := float64(violTime.ViolationTime) * 1e-9 - y, err := lp.New("nv_violation_power", device.tags, device.meta, map[string]interface{}{"value": t}, time.Now()) + y, err := lp.NewMessage("nv_violation_power", device.tags, device.meta, map[string]interface{}{"value": t}, time.Now()) if err == nil { y.AddMeta("unit", "sec") output <- y @@ -852,7 +852,7 @@ func readViolationStats(device 
NvidiaCollectorDevice, output chan lp.CCMetric) e violTime, ret = nvml.DeviceGetViolationStatus(device.device, nvml.PERF_POLICY_THERMAL) if ret == nvml.SUCCESS { t := float64(violTime.ViolationTime) * 1e-9 - y, err := lp.New("nv_violation_thermal", device.tags, device.meta, map[string]interface{}{"value": t}, time.Now()) + y, err := lp.NewMessage("nv_violation_thermal", device.tags, device.meta, map[string]interface{}{"value": t}, time.Now()) if err == nil { y.AddMeta("unit", "sec") output <- y @@ -864,7 +864,7 @@ func readViolationStats(device NvidiaCollectorDevice, output chan lp.CCMetric) e violTime, ret = nvml.DeviceGetViolationStatus(device.device, nvml.PERF_POLICY_SYNC_BOOST) if ret == nvml.SUCCESS { t := float64(violTime.ViolationTime) * 1e-9 - y, err := lp.New("nv_violation_sync_boost", device.tags, device.meta, map[string]interface{}{"value": t}, time.Now()) + y, err := lp.NewMessage("nv_violation_sync_boost", device.tags, device.meta, map[string]interface{}{"value": t}, time.Now()) if err == nil { y.AddMeta("unit", "sec") output <- y @@ -876,7 +876,7 @@ func readViolationStats(device NvidiaCollectorDevice, output chan lp.CCMetric) e violTime, ret = nvml.DeviceGetViolationStatus(device.device, nvml.PERF_POLICY_BOARD_LIMIT) if ret == nvml.SUCCESS { t := float64(violTime.ViolationTime) * 1e-9 - y, err := lp.New("nv_violation_board_limit", device.tags, device.meta, map[string]interface{}{"value": t}, time.Now()) + y, err := lp.NewMessage("nv_violation_board_limit", device.tags, device.meta, map[string]interface{}{"value": t}, time.Now()) if err == nil { y.AddMeta("unit", "sec") output <- y @@ -888,7 +888,7 @@ func readViolationStats(device NvidiaCollectorDevice, output chan lp.CCMetric) e violTime, ret = nvml.DeviceGetViolationStatus(device.device, nvml.PERF_POLICY_LOW_UTILIZATION) if ret == nvml.SUCCESS { t := float64(violTime.ViolationTime) * 1e-9 - y, err := lp.New("nv_violation_low_util", device.tags, device.meta, map[string]interface{}{"value": t}, 
time.Now()) + y, err := lp.NewMessage("nv_violation_low_util", device.tags, device.meta, map[string]interface{}{"value": t}, time.Now()) if err == nil { y.AddMeta("unit", "sec") output <- y @@ -900,7 +900,7 @@ func readViolationStats(device NvidiaCollectorDevice, output chan lp.CCMetric) e violTime, ret = nvml.DeviceGetViolationStatus(device.device, nvml.PERF_POLICY_RELIABILITY) if ret == nvml.SUCCESS { t := float64(violTime.ViolationTime) * 1e-9 - y, err := lp.New("nv_violation_reliability", device.tags, device.meta, map[string]interface{}{"value": t}, time.Now()) + y, err := lp.NewMessage("nv_violation_reliability", device.tags, device.meta, map[string]interface{}{"value": t}, time.Now()) if err == nil { y.AddMeta("unit", "sec") output <- y @@ -912,7 +912,7 @@ func readViolationStats(device NvidiaCollectorDevice, output chan lp.CCMetric) e violTime, ret = nvml.DeviceGetViolationStatus(device.device, nvml.PERF_POLICY_TOTAL_APP_CLOCKS) if ret == nvml.SUCCESS { t := float64(violTime.ViolationTime) * 1e-9 - y, err := lp.New("nv_violation_below_app_clock", device.tags, device.meta, map[string]interface{}{"value": t}, time.Now()) + y, err := lp.NewMessage("nv_violation_below_app_clock", device.tags, device.meta, map[string]interface{}{"value": t}, time.Now()) if err == nil { y.AddMeta("unit", "sec") output <- y @@ -924,7 +924,7 @@ func readViolationStats(device NvidiaCollectorDevice, output chan lp.CCMetric) e violTime, ret = nvml.DeviceGetViolationStatus(device.device, nvml.PERF_POLICY_TOTAL_BASE_CLOCKS) if ret == nvml.SUCCESS { t := float64(violTime.ViolationTime) * 1e-9 - y, err := lp.New("nv_violation_below_base_clock", device.tags, device.meta, map[string]interface{}{"value": t}, time.Now()) + y, err := lp.NewMessage("nv_violation_below_base_clock", device.tags, device.meta, map[string]interface{}{"value": t}, time.Now()) if err == nil { y.AddMeta("unit", "sec") output <- y @@ -935,12 +935,18 @@ func readViolationStats(device NvidiaCollectorDevice, output chan 
lp.CCMetric) e return nil } -func readNVLinkStats(device NvidiaCollectorDevice, output chan lp.CCMetric) error { +func readNVLinkStats(device NvidiaCollectorDevice, output chan lp.CCMessage) error { // Retrieves the specified error counter value // Please refer to \a nvmlNvLinkErrorCounter_t for error counters that are available // // For Pascal &tm; or newer fully supported devices. + var aggregate_crc_errors uint64 = 0 + var aggregate_ecc_errors uint64 = 0 + var aggregate_replay_errors uint64 = 0 + var aggregate_recovery_errors uint64 = 0 + var aggregate_crc_flit_errors uint64 = 0 + for i := 0; i < nvml.NVLINK_MAX_LINKS; i++ { state, ret := nvml.DeviceGetNvLinkState(device.device, i) if ret == nvml.SUCCESS { @@ -948,8 +954,9 @@ func readNVLinkStats(device NvidiaCollectorDevice, output chan lp.CCMetric) erro if !device.excludeMetrics["nv_nvlink_crc_errors"] { // Data link receive data CRC error counter count, ret := nvml.DeviceGetNvLinkErrorCounter(device.device, i, nvml.NVLINK_ERROR_DL_CRC_DATA) if ret == nvml.SUCCESS { + aggregate_crc_errors = aggregate_crc_errors + count - y, err := lp.New("nv_nvlink_crc_errors", device.tags, device.meta, map[string]interface{}{"value": count}, time.Now()) + y, err := lp.NewMessage("nv_nvlink_crc_errors", device.tags, device.meta, map[string]interface{}{"value": count}, time.Now()) if err == nil { y.AddTag("stype", "nvlink") y.AddTag("stype-id", fmt.Sprintf("%d", i)) @@ -960,8 +967,9 @@ func readNVLinkStats(device NvidiaCollectorDevice, output chan lp.CCMetric) erro if !device.excludeMetrics["nv_nvlink_ecc_errors"] { // Data link receive data ECC error counter count, ret := nvml.DeviceGetNvLinkErrorCounter(device.device, i, nvml.NVLINK_ERROR_DL_ECC_DATA) if ret == nvml.SUCCESS { + aggregate_ecc_errors = aggregate_ecc_errors + count - y, err := lp.New("nv_nvlink_ecc_errors", device.tags, device.meta,
map[string]interface{}{"value": count}, time.Now()) if err == nil { y.AddTag("stype", "nvlink") y.AddTag("stype-id", fmt.Sprintf("%d", i)) @@ -972,8 +980,9 @@ func readNVLinkStats(device NvidiaCollectorDevice, output chan lp.CCMetric) erro if !device.excludeMetrics["nv_nvlink_replay_errors"] { // Data link transmit replay error counter count, ret := nvml.DeviceGetNvLinkErrorCounter(device.device, i, nvml.NVLINK_ERROR_DL_REPLAY) if ret == nvml.SUCCESS { + aggregate_replay_errors = aggregate_replay_errors + count - y, err := lp.New("nv_nvlink_replay_errors", device.tags, device.meta, map[string]interface{}{"value": count}, time.Now()) + y, err := lp.NewMessage("nv_nvlink_replay_errors", device.tags, device.meta, map[string]interface{}{"value": count}, time.Now()) if err == nil { y.AddTag("stype", "nvlink") y.AddTag("stype-id", fmt.Sprintf("%d", i)) @@ -984,8 +993,9 @@ func readNVLinkStats(device NvidiaCollectorDevice, output chan lp.CCMetric) erro if !device.excludeMetrics["nv_nvlink_recovery_errors"] { // Data link transmit recovery error counter count, ret := nvml.DeviceGetNvLinkErrorCounter(device.device, i, nvml.NVLINK_ERROR_DL_RECOVERY) if ret == nvml.SUCCESS { + aggregate_recovery_errors = aggregate_recovery_errors + count - y, err := lp.New("nv_nvlink_recovery_errors", device.tags, device.meta, map[string]interface{}{"value": count}, time.Now()) + y, err := lp.NewMessage("nv_nvlink_recovery_errors", device.tags, device.meta, map[string]interface{}{"value": count}, time.Now()) if err == nil { y.AddTag("stype", "nvlink") y.AddTag("stype-id", fmt.Sprintf("%d", i)) @@ -996,8 +1006,9 @@ func readNVLinkStats(device NvidiaCollectorDevice, output chan lp.CCMetric) erro if !device.excludeMetrics["nv_nvlink_crc_flit_errors"] { // Data link receive flow control digit CRC error counter count, ret := nvml.DeviceGetNvLinkErrorCounter(device.device, i, nvml.NVLINK_ERROR_DL_CRC_FLIT) if ret == nvml.SUCCESS { + aggregate_crc_flit_errors = aggregate_crc_flit_errors + count - y,
err := lp.New("nv_nvlink_crc_flit_errors", device.tags, device.meta, map[string]interface{}{"value": count}, time.Now()) + y, err := lp.NewMessage("nv_nvlink_crc_flit_errors", device.tags, device.meta, map[string]interface{}{"value": count}, time.Now()) if err == nil { y.AddTag("stype", "nvlink") y.AddTag("stype-id", fmt.Sprintf("%d", i)) @@ -1008,16 +1019,58 @@ func readNVLinkStats(device NvidiaCollectorDevice, output chan lp.CCMetric) erro } } } + + // Export aggregated values + if !device.excludeMetrics["nv_nvlink_crc_errors"] { + // Data link receive data CRC error counter + y, err := lp.NewMessage("nv_nvlink_crc_errors_sum", device.tags, device.meta, map[string]interface{}{"value": aggregate_crc_errors}, time.Now()) + if err == nil { + y.AddTag("stype", "nvlink") + output <- y + } + } + if !device.excludeMetrics["nv_nvlink_ecc_errors"] { + // Data link receive data ECC error counter + y, err := lp.NewMessage("nv_nvlink_ecc_errors_sum", device.tags, device.meta, map[string]interface{}{"value": aggregate_ecc_errors}, time.Now()) + if err == nil { + y.AddTag("stype", "nvlink") + output <- y + } + } + if !device.excludeMetrics["nv_nvlink_replay_errors"] { + // Data link transmit replay error counter + y, err := lp.NewMessage("nv_nvlink_replay_errors_sum", device.tags, device.meta, map[string]interface{}{"value": aggregate_replay_errors}, time.Now()) + if err == nil { + y.AddTag("stype", "nvlink") + output <- y + } + } + if !device.excludeMetrics["nv_nvlink_recovery_errors"] { + // Data link transmit recovery error counter + y, err := lp.NewMessage("nv_nvlink_recovery_errors_sum", device.tags, device.meta, map[string]interface{}{"value": aggregate_recovery_errors}, time.Now()) + if err == nil { + y.AddTag("stype", "nvlink") + output <- y + } + } + if !device.excludeMetrics["nv_nvlink_crc_flit_errors"] { + // Data link receive flow control digit CRC error counter + y, err := lp.NewMessage("nv_nvlink_crc_flit_errors_sum", device.tags, device.meta,
map[string]interface{}{"value": aggregate_crc_flit_errors}, time.Now()) + if err == nil { + y.AddTag("stype", "nvlink") + output <- y + } + } return nil } -func (m *NvidiaCollector) Read(interval time.Duration, output chan lp.CCMetric) { +func (m *NvidiaCollector) Read(interval time.Duration, output chan lp.CCMessage) { var err error if !m.init { return } - readAll := func(device NvidiaCollectorDevice, output chan lp.CCMetric) { + readAll := func(device NvidiaCollectorDevice, output chan lp.CCMessage) { name, ret := nvml.DeviceGetName(device.device) if ret != nvml.SUCCESS { name = "NoName" diff --git a/collectors/raplMetric.go b/collectors/raplMetric.go index 57c456b..7800288 100644 --- a/collectors/raplMetric.go +++ b/collectors/raplMetric.go @@ -10,7 +10,7 @@ import ( "time" cclog "github.com/ClusterCockpit/cc-metric-collector/pkg/ccLogger" - lp "github.com/ClusterCockpit/cc-metric-collector/pkg/ccMetric" + lp "github.com/ClusterCockpit/cc-energy-manager/pkg/cc-message" ) // running average power limit (RAPL) monitoring attributes for a zone @@ -214,7 +214,7 @@ func (m *RAPLCollector) Init(config json.RawMessage) error { // Read reads running average power limit (RAPL) monitoring attributes for all initialized zones // See: https://www.kernel.org/doc/html/latest/power/powercap/powercap.html#monitoring-attributes -func (m *RAPLCollector) Read(interval time.Duration, output chan lp.CCMetric) { +func (m *RAPLCollector) Read(interval time.Duration, output chan lp.CCMessage) { for i := range m.RAPLZoneInfo { p := &m.RAPLZoneInfo[i] @@ -237,7 +237,7 @@ func (m *RAPLCollector) Read(interval time.Duration, output chan lp.CCMetric) { timeDiff := energyTimestamp.Sub(p.energyTimestamp) averagePower := float64(energyDiff) / float64(timeDiff.Microseconds()) - y, err := lp.New( + y, err := lp.NewMessage( "rapl_average_power", p.tags, m.meta, diff --git a/collectors/rocmsmiMetric.go b/collectors/rocmsmiMetric.go index 9d8625d..32b3eca 100644 --- a/collectors/rocmsmiMetric.go 
+++ b/collectors/rocmsmiMetric.go @@ -7,7 +7,7 @@ import ( "time" cclog "github.com/ClusterCockpit/cc-metric-collector/pkg/ccLogger" - lp "github.com/ClusterCockpit/cc-metric-collector/pkg/ccMetric" + lp "github.com/ClusterCockpit/cc-energy-manager/pkg/cc-message" "github.com/ClusterCockpit/go-rocm-smi/pkg/rocm_smi" ) @@ -162,7 +162,7 @@ func (m *RocmSmiCollector) Init(config json.RawMessage) error { // Read collects all metrics belonging to the sample collector // and sends them through the output channel to the collector manager -func (m *RocmSmiCollector) Read(interval time.Duration, output chan lp.CCMetric) { +func (m *RocmSmiCollector) Read(interval time.Duration, output chan lp.CCMessage) { // Create a sample metric timestamp := time.Now() @@ -175,119 +175,119 @@ func (m *RocmSmiCollector) Read(interval time.Duration, output chan lp.CCMetric) if !dev.excludeMetrics["rocm_gfx_util"] { value := metrics.Average_gfx_activity - y, err := lp.New("rocm_gfx_util", dev.tags, dev.meta, map[string]interface{}{"value": value}, timestamp) + y, err := lp.NewMessage("rocm_gfx_util", dev.tags, dev.meta, map[string]interface{}{"value": value}, timestamp) if err == nil { output <- y } } if !dev.excludeMetrics["rocm_umc_util"] { value := metrics.Average_umc_activity - y, err := lp.New("rocm_umc_util", dev.tags, dev.meta, map[string]interface{}{"value": value}, timestamp) + y, err := lp.NewMessage("rocm_umc_util", dev.tags, dev.meta, map[string]interface{}{"value": value}, timestamp) if err == nil { output <- y } } if !dev.excludeMetrics["rocm_mm_util"] { value := metrics.Average_mm_activity - y, err := lp.New("rocm_mm_util", dev.tags, dev.meta, map[string]interface{}{"value": value}, timestamp) + y, err := lp.NewMessage("rocm_mm_util", dev.tags, dev.meta, map[string]interface{}{"value": value}, timestamp) if err == nil { output <- y } } if !dev.excludeMetrics["rocm_avg_power"] { value := metrics.Average_socket_power - y, err := lp.New("rocm_avg_power", dev.tags, dev.meta, 
map[string]interface{}{"value": value}, timestamp) + y, err := lp.NewMessage("rocm_avg_power", dev.tags, dev.meta, map[string]interface{}{"value": value}, timestamp) if err == nil { output <- y } } if !dev.excludeMetrics["rocm_temp_mem"] { value := metrics.Temperature_mem - y, err := lp.New("rocm_temp_mem", dev.tags, dev.meta, map[string]interface{}{"value": value}, timestamp) + y, err := lp.NewMessage("rocm_temp_mem", dev.tags, dev.meta, map[string]interface{}{"value": value}, timestamp) if err == nil { output <- y } } if !dev.excludeMetrics["rocm_temp_hotspot"] { value := metrics.Temperature_hotspot - y, err := lp.New("rocm_temp_hotspot", dev.tags, dev.meta, map[string]interface{}{"value": value}, timestamp) + y, err := lp.NewMessage("rocm_temp_hotspot", dev.tags, dev.meta, map[string]interface{}{"value": value}, timestamp) if err == nil { output <- y } } if !dev.excludeMetrics["rocm_temp_edge"] { value := metrics.Temperature_edge - y, err := lp.New("rocm_temp_edge", dev.tags, dev.meta, map[string]interface{}{"value": value}, timestamp) + y, err := lp.NewMessage("rocm_temp_edge", dev.tags, dev.meta, map[string]interface{}{"value": value}, timestamp) if err == nil { output <- y } } if !dev.excludeMetrics["rocm_temp_vrgfx"] { value := metrics.Temperature_vrgfx - y, err := lp.New("rocm_temp_vrgfx", dev.tags, dev.meta, map[string]interface{}{"value": value}, timestamp) + y, err := lp.NewMessage("rocm_temp_vrgfx", dev.tags, dev.meta, map[string]interface{}{"value": value}, timestamp) if err == nil { output <- y } } if !dev.excludeMetrics["rocm_temp_vrsoc"] { value := metrics.Temperature_vrsoc - y, err := lp.New("rocm_temp_vrsoc", dev.tags, dev.meta, map[string]interface{}{"value": value}, timestamp) + y, err := lp.NewMessage("rocm_temp_vrsoc", dev.tags, dev.meta, map[string]interface{}{"value": value}, timestamp) if err == nil { output <- y } } if !dev.excludeMetrics["rocm_temp_vrmem"] { value := metrics.Temperature_vrmem - y, err := lp.New("rocm_temp_vrmem", 
dev.tags, dev.meta, map[string]interface{}{"value": value}, timestamp) + y, err := lp.NewMessage("rocm_temp_vrmem", dev.tags, dev.meta, map[string]interface{}{"value": value}, timestamp) if err == nil { output <- y } } if !dev.excludeMetrics["rocm_gfx_clock"] { value := metrics.Average_gfxclk_frequency - y, err := lp.New("rocm_gfx_clock", dev.tags, dev.meta, map[string]interface{}{"value": value}, timestamp) + y, err := lp.NewMessage("rocm_gfx_clock", dev.tags, dev.meta, map[string]interface{}{"value": value}, timestamp) if err == nil { output <- y } } if !dev.excludeMetrics["rocm_soc_clock"] { value := metrics.Average_socclk_frequency - y, err := lp.New("rocm_soc_clock", dev.tags, dev.meta, map[string]interface{}{"value": value}, timestamp) + y, err := lp.NewMessage("rocm_soc_clock", dev.tags, dev.meta, map[string]interface{}{"value": value}, timestamp) if err == nil { output <- y } } if !dev.excludeMetrics["rocm_u_clock"] { value := metrics.Average_uclk_frequency - y, err := lp.New("rocm_u_clock", dev.tags, dev.meta, map[string]interface{}{"value": value}, timestamp) + y, err := lp.NewMessage("rocm_u_clock", dev.tags, dev.meta, map[string]interface{}{"value": value}, timestamp) if err == nil { output <- y } } if !dev.excludeMetrics["rocm_v0_clock"] { value := metrics.Average_vclk0_frequency - y, err := lp.New("rocm_v0_clock", dev.tags, dev.meta, map[string]interface{}{"value": value}, timestamp) + y, err := lp.NewMessage("rocm_v0_clock", dev.tags, dev.meta, map[string]interface{}{"value": value}, timestamp) if err == nil { output <- y } } if !dev.excludeMetrics["rocm_v1_clock"] { value := metrics.Average_vclk1_frequency - y, err := lp.New("rocm_v1_clock", dev.tags, dev.meta, map[string]interface{}{"value": value}, timestamp) + y, err := lp.NewMessage("rocm_v1_clock", dev.tags, dev.meta, map[string]interface{}{"value": value}, timestamp) if err == nil { output <- y } } if !dev.excludeMetrics["rocm_d0_clock"] { value := metrics.Average_dclk0_frequency - y, err := 
lp.New("rocm_d0_clock", dev.tags, dev.meta, map[string]interface{}{"value": value}, timestamp) + y, err := lp.NewMessage("rocm_d0_clock", dev.tags, dev.meta, map[string]interface{}{"value": value}, timestamp) if err == nil { output <- y } } if !dev.excludeMetrics["rocm_d1_clock"] { value := metrics.Average_dclk1_frequency - y, err := lp.New("rocm_d1_clock", dev.tags, dev.meta, map[string]interface{}{"value": value}, timestamp) + y, err := lp.NewMessage("rocm_d1_clock", dev.tags, dev.meta, map[string]interface{}{"value": value}, timestamp) if err == nil { output <- y } @@ -295,7 +295,7 @@ func (m *RocmSmiCollector) Read(interval time.Duration, output chan lp.CCMetric) if !dev.excludeMetrics["rocm_temp_hbm"] { for i := 0; i < rocm_smi.NUM_HBM_INSTANCES; i++ { value := metrics.Temperature_hbm[i] - y, err := lp.New("rocm_temp_hbm", dev.tags, dev.meta, map[string]interface{}{"value": value}, timestamp) + y, err := lp.NewMessage("rocm_temp_hbm", dev.tags, dev.meta, map[string]interface{}{"value": value}, timestamp) if err == nil { y.AddTag("stype", "device") y.AddTag("stype-id", fmt.Sprintf("%d", i)) diff --git a/collectors/sampleMetric.go b/collectors/sampleMetric.go index 4c3ac66..73f76ef 100644 --- a/collectors/sampleMetric.go +++ b/collectors/sampleMetric.go @@ -4,8 +4,8 @@ import ( "encoding/json" "time" + lp "github.com/ClusterCockpit/cc-energy-manager/pkg/cc-message" cclog "github.com/ClusterCockpit/cc-metric-collector/pkg/ccLogger" - lp "github.com/ClusterCockpit/cc-metric-collector/pkg/ccMetric" ) // These are the fields we read from the JSON configuration @@ -32,7 +32,7 @@ type SampleCollector struct { func (m *SampleCollector) Init(config json.RawMessage) error { var err error = nil // Always set the name early in Init() to use it in cclog.Component* functions - m.name = "InternalCollector" + m.name = "SampleCollector" // This is for later use, also call it early m.setup() // Tell whether the collector should be run in parallel with others (reading files, ...) 
@@ -74,7 +74,7 @@ func (m *SampleCollector) Init(config json.RawMessage) error { // Read collects all metrics belonging to the sample collector // and sends them through the output channel to the collector manager -func (m *SampleCollector) Read(interval time.Duration, output chan lp.CCMetric) { +func (m *SampleCollector) Read(interval time.Duration, output chan lp.CCMessage) { // Create a sample metric timestamp := time.Now() @@ -85,7 +85,7 @@ func (m *SampleCollector) Read(interval time.Duration, output chan lp.CCMetric) // stop := readState() // value = (stop - start) / interval.Seconds() - y, err := lp.New("sample_metric", m.tags, m.meta, map[string]interface{}{"value": value}, timestamp) + y, err := lp.NewMessage("sample_metric", m.tags, m.meta, map[string]interface{}{"value": value}, timestamp) if err == nil { // Send it to output channel output <- y diff --git a/collectors/sampleTimerMetric.go b/collectors/sampleTimerMetric.go index dfac808..24c062b 100644 --- a/collectors/sampleTimerMetric.go +++ b/collectors/sampleTimerMetric.go @@ -5,8 +5,8 @@ import ( "sync" "time" + lp "github.com/ClusterCockpit/cc-energy-manager/pkg/cc-message" cclog "github.com/ClusterCockpit/cc-metric-collector/pkg/ccLogger" - lp "github.com/ClusterCockpit/cc-metric-collector/pkg/ccMetric" ) // These are the fields we read from the JSON configuration @@ -25,7 +25,7 @@ type SampleTimerCollector struct { config SampleTimerCollectorConfig // the configuration structure interval time.Duration // the interval parsed from configuration ticker *time.Ticker // own timer - output chan lp.CCMetric // own internal output channel + output chan lp.CCMessage // own internal output channel } func (m *SampleTimerCollector) Init(name string, config json.RawMessage) error { @@ -100,14 +100,14 @@ func (m *SampleTimerCollector) ReadMetrics(timestamp time.Time) { // stop := readState() // value = (stop - start) / interval.Seconds() - y, err := lp.New("sample_metric", m.tags, m.meta, 
map[string]interface{}{"value": value}, timestamp) + y, err := lp.NewMessage("sample_metric", m.tags, m.meta, map[string]interface{}{"value": value}, timestamp) if err == nil && m.output != nil { // Send it to output channel if we have a valid channel m.output <- y } } -func (m *SampleTimerCollector) Read(interval time.Duration, output chan lp.CCMetric) { +func (m *SampleTimerCollector) Read(interval time.Duration, output chan lp.CCMessage) { // Capture output channel m.output = output } diff --git a/collectors/schedstatMetric.go b/collectors/schedstatMetric.go index 8c010ed..d098804 100644 --- a/collectors/schedstatMetric.go +++ b/collectors/schedstatMetric.go @@ -11,7 +11,7 @@ import ( "time" cclog "github.com/ClusterCockpit/cc-metric-collector/pkg/ccLogger" - lp "github.com/ClusterCockpit/cc-metric-collector/pkg/ccMetric" + lp "github.com/ClusterCockpit/cc-energy-manager/pkg/cc-message" ) const SCHEDSTATFILE = `/proc/schedstat` @@ -96,7 +96,7 @@ func (m *SchedstatCollector) Init(config json.RawMessage) error { return err } -func (m *SchedstatCollector) ParseProcLine(linefields []string, tags map[string]string, output chan lp.CCMetric, now time.Time, tsdelta time.Duration) { +func (m *SchedstatCollector) ParseProcLine(linefields []string, tags map[string]string, output chan lp.CCMessage, now time.Time, tsdelta time.Duration) { running, _ := strconv.ParseInt(linefields[7], 10, 64) waiting, _ := strconv.ParseInt(linefields[8], 10, 64) diff_running := running - m.olddata[linefields[0]]["running"] @@ -109,7 +109,7 @@ func (m *SchedstatCollector) ParseProcLine(linefields []string, tags map[string] m.olddata[linefields[0]]["waiting"] = waiting value := l_running + l_waiting - y, err := lp.New("cpu_load_core", tags, m.meta, map[string]interface{}{"value": value}, now) + y, err := lp.NewMessage("cpu_load_core", tags, m.meta, map[string]interface{}{"value": value}, now) if err == nil { // Send it to output channel output <- y @@ -118,7 +118,7 @@ func (m 
*SchedstatCollector) ParseProcLine(linefields []string, tags map[string] // Read collects all metrics belonging to the sample collector // and sends them through the output channel to the collector manager -func (m *SchedstatCollector) Read(interval time.Duration, output chan lp.CCMetric) { +func (m *SchedstatCollector) Read(interval time.Duration, output chan lp.CCMessage) { if !m.init { return } diff --git a/collectors/selfMetric.go b/collectors/selfMetric.go index 4fc95c0..6dd99db 100644 --- a/collectors/selfMetric.go +++ b/collectors/selfMetric.go @@ -7,7 +7,7 @@ import ( "time" cclog "github.com/ClusterCockpit/cc-metric-collector/pkg/ccLogger" - lp "github.com/ClusterCockpit/cc-metric-collector/pkg/ccMetric" + lp "github.com/ClusterCockpit/cc-energy-manager/pkg/cc-message" ) type SelfCollectorConfig struct { @@ -42,56 +42,56 @@ func (m *SelfCollector) Init(config json.RawMessage) error { return err } -func (m *SelfCollector) Read(interval time.Duration, output chan lp.CCMetric) { +func (m *SelfCollector) Read(interval time.Duration, output chan lp.CCMessage) { timestamp := time.Now() if m.config.MemStats { var memstats runtime.MemStats runtime.ReadMemStats(&memstats) - y, err := lp.New("total_alloc", m.tags, m.meta, map[string]interface{}{"value": memstats.TotalAlloc}, timestamp) + y, err := lp.NewMessage("total_alloc", m.tags, m.meta, map[string]interface{}{"value": memstats.TotalAlloc}, timestamp) if err == nil { y.AddMeta("unit", "Bytes") output <- y } - y, err = lp.New("heap_alloc", m.tags, m.meta, map[string]interface{}{"value": memstats.HeapAlloc}, timestamp) + y, err = lp.NewMessage("heap_alloc", m.tags, m.meta, map[string]interface{}{"value": memstats.HeapAlloc}, timestamp) if err == nil { y.AddMeta("unit", "Bytes") output <- y } - y, err = lp.New("heap_sys", m.tags, m.meta, map[string]interface{}{"value": memstats.HeapSys}, timestamp) + y, err = lp.NewMessage("heap_sys", m.tags, m.meta, map[string]interface{}{"value": memstats.HeapSys}, timestamp) if 
err == nil { y.AddMeta("unit", "Bytes") output <- y } - y, err = lp.New("heap_idle", m.tags, m.meta, map[string]interface{}{"value": memstats.HeapIdle}, timestamp) + y, err = lp.NewMessage("heap_idle", m.tags, m.meta, map[string]interface{}{"value": memstats.HeapIdle}, timestamp) if err == nil { y.AddMeta("unit", "Bytes") output <- y } - y, err = lp.New("heap_inuse", m.tags, m.meta, map[string]interface{}{"value": memstats.HeapInuse}, timestamp) + y, err = lp.NewMessage("heap_inuse", m.tags, m.meta, map[string]interface{}{"value": memstats.HeapInuse}, timestamp) if err == nil { y.AddMeta("unit", "Bytes") output <- y } - y, err = lp.New("heap_released", m.tags, m.meta, map[string]interface{}{"value": memstats.HeapReleased}, timestamp) + y, err = lp.NewMessage("heap_released", m.tags, m.meta, map[string]interface{}{"value": memstats.HeapReleased}, timestamp) if err == nil { y.AddMeta("unit", "Bytes") output <- y } - y, err = lp.New("heap_objects", m.tags, m.meta, map[string]interface{}{"value": memstats.HeapObjects}, timestamp) + y, err = lp.NewMessage("heap_objects", m.tags, m.meta, map[string]interface{}{"value": memstats.HeapObjects}, timestamp) if err == nil { output <- y } } if m.config.GoRoutines { - y, err := lp.New("num_goroutines", m.tags, m.meta, map[string]interface{}{"value": runtime.NumGoroutine()}, timestamp) + y, err := lp.NewMessage("num_goroutines", m.tags, m.meta, map[string]interface{}{"value": runtime.NumGoroutine()}, timestamp) if err == nil { output <- y } } if m.config.CgoCalls { - y, err := lp.New("num_cgo_calls", m.tags, m.meta, map[string]interface{}{"value": runtime.NumCgoCall()}, timestamp) + y, err := lp.NewMessage("num_cgo_calls", m.tags, m.meta, map[string]interface{}{"value": runtime.NumCgoCall()}, timestamp) if err == nil { output <- y } @@ -102,35 +102,35 @@ func (m *SelfCollector) Read(interval time.Duration, output chan lp.CCMetric) { if err == nil { sec, nsec := rusage.Utime.Unix() t := float64(sec) + (float64(nsec) * 1e-9) - y, 
err := lp.New("rusage_user_time", m.tags, m.meta, map[string]interface{}{"value": t}, timestamp) + y, err := lp.NewMessage("rusage_user_time", m.tags, m.meta, map[string]interface{}{"value": t}, timestamp) if err == nil { y.AddMeta("unit", "seconds") output <- y } sec, nsec = rusage.Stime.Unix() t = float64(sec) + (float64(nsec) * 1e-9) - y, err = lp.New("rusage_system_time", m.tags, m.meta, map[string]interface{}{"value": t}, timestamp) + y, err = lp.NewMessage("rusage_system_time", m.tags, m.meta, map[string]interface{}{"value": t}, timestamp) if err == nil { y.AddMeta("unit", "seconds") output <- y } - y, err = lp.New("rusage_vol_ctx_switch", m.tags, m.meta, map[string]interface{}{"value": rusage.Nvcsw}, timestamp) + y, err = lp.NewMessage("rusage_vol_ctx_switch", m.tags, m.meta, map[string]interface{}{"value": rusage.Nvcsw}, timestamp) if err == nil { output <- y } - y, err = lp.New("rusage_invol_ctx_switch", m.tags, m.meta, map[string]interface{}{"value": rusage.Nivcsw}, timestamp) + y, err = lp.NewMessage("rusage_invol_ctx_switch", m.tags, m.meta, map[string]interface{}{"value": rusage.Nivcsw}, timestamp) if err == nil { output <- y } - y, err = lp.New("rusage_signals", m.tags, m.meta, map[string]interface{}{"value": rusage.Nsignals}, timestamp) + y, err = lp.NewMessage("rusage_signals", m.tags, m.meta, map[string]interface{}{"value": rusage.Nsignals}, timestamp) if err == nil { output <- y } - y, err = lp.New("rusage_major_pgfaults", m.tags, m.meta, map[string]interface{}{"value": rusage.Majflt}, timestamp) + y, err = lp.NewMessage("rusage_major_pgfaults", m.tags, m.meta, map[string]interface{}{"value": rusage.Majflt}, timestamp) if err == nil { output <- y } - y, err = lp.New("rusage_minor_pgfaults", m.tags, m.meta, map[string]interface{}{"value": rusage.Minflt}, timestamp) + y, err = lp.NewMessage("rusage_minor_pgfaults", m.tags, m.meta, map[string]interface{}{"value": rusage.Minflt}, timestamp) if err == nil { output <- y } diff --git 
a/collectors/tempMetric.go b/collectors/tempMetric.go index 303be4a..6ee9829 100644 --- a/collectors/tempMetric.go +++ b/collectors/tempMetric.go @@ -10,7 +10,7 @@ import ( "time" cclog "github.com/ClusterCockpit/cc-metric-collector/pkg/ccLogger" - lp "github.com/ClusterCockpit/cc-metric-collector/pkg/ccMetric" + lp "github.com/ClusterCockpit/cc-energy-manager/pkg/cc-message" ) // See: https://www.kernel.org/doc/html/latest/hwmon/sysfs-interface.html @@ -171,7 +171,7 @@ func (m *TempCollector) Init(config json.RawMessage) error { return nil } -func (m *TempCollector) Read(interval time.Duration, output chan lp.CCMetric) { +func (m *TempCollector) Read(interval time.Duration, output chan lp.CCMessage) { for _, sensor := range m.sensors { // Read sensor file @@ -190,7 +190,7 @@ func (m *TempCollector) Read(interval time.Duration, output chan lp.CCMetric) { continue } x /= 1000 - y, err := lp.New( + y, err := lp.NewMessage( sensor.metricName, sensor.tags, m.meta, @@ -203,7 +203,7 @@ func (m *TempCollector) Read(interval time.Duration, output chan lp.CCMetric) { // max temperature if m.config.ReportMaxTemp && sensor.maxTemp != 0 { - y, err := lp.New( + y, err := lp.NewMessage( sensor.maxTempName, sensor.tags, m.meta, @@ -217,7 +217,7 @@ func (m *TempCollector) Read(interval time.Duration, output chan lp.CCMetric) { // critical temperature if m.config.ReportCriticalTemp && sensor.critTemp != 0 { - y, err := lp.New( + y, err := lp.NewMessage( sensor.critTempName, sensor.tags, m.meta, diff --git a/collectors/topprocsMetric.go b/collectors/topprocsMetric.go index 08dbae0..edfd755 100644 --- a/collectors/topprocsMetric.go +++ b/collectors/topprocsMetric.go @@ -9,7 +9,7 @@ import ( "strings" "time" - lp "github.com/ClusterCockpit/cc-metric-collector/pkg/ccMetric" + lp "github.com/ClusterCockpit/cc-energy-manager/pkg/cc-message" ) const MAX_NUM_PROCS = 10 @@ -53,7 +53,7 @@ func (m *TopProcsCollector) Init(config json.RawMessage) error { return nil } -func (m 
*TopProcsCollector) Read(interval time.Duration, output chan lp.CCMetric) { +func (m *TopProcsCollector) Read(interval time.Duration, output chan lp.CCMessage) { if !m.init { return } @@ -68,7 +68,7 @@ func (m *TopProcsCollector) Read(interval time.Duration, output chan lp.CCMetric lines := strings.Split(string(stdout), "\n") for i := 1; i < m.config.Num_procs+1; i++ { name := fmt.Sprintf("topproc%d", i) - y, err := lp.New(name, m.tags, m.meta, map[string]interface{}{"value": string(lines[i])}, time.Now()) + y, err := lp.NewMessage(name, m.tags, m.meta, map[string]interface{}{"value": string(lines[i])}, time.Now()) if err == nil { output <- y } diff --git a/go.mod b/go.mod index f6b7275..bc79294 100644 --- a/go.mod +++ b/go.mod @@ -1,23 +1,27 @@ module github.com/ClusterCockpit/cc-metric-collector -go 1.21 +go 1.21.1 + +toolchain go1.22.1 require ( + github.com/ClusterCockpit/cc-energy-manager v0.0.0-20240709142550-dd446f7ab900 github.com/ClusterCockpit/cc-units v0.4.0 github.com/ClusterCockpit/go-rocm-smi v0.3.0 github.com/NVIDIA/go-nvml v0.12.0-2 github.com/PaesslerAG/gval v1.2.2 + github.com/expr-lang/expr v1.16.9 github.com/fsnotify/fsnotify v1.7.0 github.com/gorilla/mux v1.8.1 github.com/influxdata/influxdb-client-go/v2 v2.13.0 github.com/influxdata/line-protocol v0.0.0-20210922203350-b1ad95c89adf github.com/influxdata/line-protocol/v2 v2.2.1 - github.com/nats-io/nats.go v1.33.1 + github.com/nats-io/nats.go v1.36.0 github.com/prometheus/client_golang v1.19.0 github.com/stmcginnis/gofish v0.15.0 github.com/tklauser/go-sysconf v0.3.13 golang.design/x/thread v0.0.0-20210122121316-335e9adffdf1 - golang.org/x/exp v0.0.0-20240222234643-814bf88cf225 + golang.org/x/exp v0.0.0-20240613232115-7f521ea00fb8 golang.org/x/sys v0.18.0 ) diff --git a/go.sum b/go.sum index 896df0b..dcc7656 100644 --- a/go.sum +++ b/go.sum @@ -1,3 +1,5 @@ +github.com/ClusterCockpit/cc-energy-manager v0.0.0-20240709142550-dd446f7ab900 h1:6+WNav16uWTEDC09hkZKEHfBhtc91p/ZcjgCtyntuIg= 
+github.com/ClusterCockpit/cc-energy-manager v0.0.0-20240709142550-dd446f7ab900/go.mod h1:EbYeC5t+Y0kW1Q1pP2n9zMqbeYEJITG8YGvAUihXVn4= github.com/ClusterCockpit/cc-units v0.4.0 h1:zP5DOu99GmErW0tCDf0gcLrlWt42RQ9dpoONEOh4cI0= github.com/ClusterCockpit/cc-units v0.4.0/go.mod h1:3S3PAhAayS3pbgcT4q9Vn9VJw22Op51X0YimtG77zBw= github.com/ClusterCockpit/go-rocm-smi v0.3.0 h1:1qZnSpG7/NyLtc7AjqnUL9Jb8xtqG1nMVgp69rJfaR8= @@ -7,6 +9,7 @@ github.com/NVIDIA/go-nvml v0.12.0-2 h1:Sg239yy7jmopu/cuvYauoMj9fOpcGMngxVxxS1EBX github.com/NVIDIA/go-nvml v0.12.0-2/go.mod h1:7ruy85eOM73muOc/I37euONSwEyFqZsv5ED9AogD4G0= github.com/PaesslerAG/gval v1.2.2 h1:Y7iBzhgE09IGTt5QgGQ2IdaYYYOU134YGHBThD+wm9E= github.com/PaesslerAG/gval v1.2.2/go.mod h1:XRFLwvmkTEdYziLdaCeCa5ImcGVrfQbeNUbVR+C6xac= +github.com/PaesslerAG/jsonpath v0.1.0 h1:gADYeifvlqK3R3i2cR5B4DGgxLXIPb3TRTH1mGi0jPI= github.com/PaesslerAG/jsonpath v0.1.0/go.mod h1:4BzmtoM/PI8fPO4aQGIusjGxGir2BzcV0grWtFzq1Y8= github.com/RaveNoX/go-jsoncommentstrip v1.0.0/go.mod h1:78ihd09MekBnJnxpICcwzCMzGrKSKYe4AqU6PDYYpjk= github.com/apapsch/go-jsonmerge/v2 v2.0.0 h1:axGnT1gRIfimI7gJifB699GoE/oq+F2MU7Dml6nw9rQ= @@ -18,14 +21,20 @@ github.com/cespare/xxhash/v2 v2.2.0 h1:DC2CZ1Ep5Y4k3ZQ899DldepgrayRUGE6BBZ/cd9Cj github.com/cespare/xxhash/v2 v2.2.0/go.mod h1:VGX0DQ3Q6kWi7AoAeZDth3/j3BFtOZR5XLFGgcrjCOs= github.com/creack/pty v1.1.9/go.mod h1:oKZEueFk5CKHvIhNR5MUki03XCEU+Q6VDXinZuGJ33E= github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= +github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c= github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= +github.com/expr-lang/expr v1.16.9 h1:WUAzmR0JNI9JCiF0/ewwHB1gmcGw5wW7nWt8gc6PpCI= +github.com/expr-lang/expr v1.16.9/go.mod h1:8/vRC7+7HBzESEqt5kKpYXxrxkr31SaO8r40VO/1IT4= github.com/frankban/quicktest v1.11.0/go.mod h1:K+q6oSqb0W0Ininfk863uOk1lMy69l/P6txr3mVT54s= github.com/frankban/quicktest v1.11.2/go.mod 
h1:K+q6oSqb0W0Ininfk863uOk1lMy69l/P6txr3mVT54s= +github.com/frankban/quicktest v1.13.0 h1:yNZif1OkDfNoDfb9zZa9aXIpejNR4F23Wely0c+Qdqk= github.com/frankban/quicktest v1.13.0/go.mod h1:qLE0fzW0VuyUAJgPU19zByoIr0HtCHN/r/VLSOOIySU= github.com/fsnotify/fsnotify v1.7.0 h1:8JEhPFa5W2WU7YfeZzPNqzMP6Lwt7L2715Ggo0nosvA= github.com/fsnotify/fsnotify v1.7.0/go.mod h1:40Bi/Hjc2AVfZrqy+aj+yEI+/bRxZnMJyTJwOpGvigM= github.com/google/go-cmp v0.5.2/go.mod h1:v8dTdLbMG2kIc/vJvl+f65V22dbkXbowE6jgT/gNBxE= github.com/google/go-cmp v0.5.5/go.mod h1:v8dTdLbMG2kIc/vJvl+f65V22dbkXbowE6jgT/gNBxE= +github.com/google/go-cmp v0.6.0 h1:ofyhxvXcZhMsU5ulbFiLKl/XBFqE1GSq7atu8tAmTRI= +github.com/google/go-cmp v0.6.0/go.mod h1:17dUlkBOakJ0+DkrSSNjCkIjxS6bF9zb3elmeNGIjoY= github.com/google/uuid v1.6.0 h1:NIvaJDMOsjHA8n1jAhLSgzrAzy1Hgr+hNrb57e+94F0= github.com/google/uuid v1.6.0/go.mod h1:TIyPZe4MgqvfeYDBFedMoGGpEw/LqOeaOT+nhxU+yHo= github.com/gorilla/mux v1.8.1 h1:TuBL49tXwgrFYWhqrNgrUNEY92u81SPhu7sTdzQEiWY= @@ -35,6 +44,7 @@ github.com/influxdata/influxdb-client-go/v2 v2.13.0/go.mod h1:k+spCbt9hcvqvUiz0s github.com/influxdata/line-protocol v0.0.0-20210922203350-b1ad95c89adf h1:7JTmneyiNEwVBOHSjoMxiWAqB992atOeepeFYegn5RU= github.com/influxdata/line-protocol v0.0.0-20210922203350-b1ad95c89adf/go.mod h1:xaLFMmpvUxqXtVkUJfg9QmT88cDaCJ3ZKgdZ78oO8Qo= github.com/influxdata/line-protocol-corpus v0.0.0-20210519164801-ca6fa5da0184/go.mod h1:03nmhxzZ7Xk2pdG+lmMd7mHDfeVOYFyhOgwO61qWU98= +github.com/influxdata/line-protocol-corpus v0.0.0-20210922080147-aa28ccfb8937 h1:MHJNQ+p99hFATQm6ORoLmpUCF7ovjwEFshs/NHzAbig= github.com/influxdata/line-protocol-corpus v0.0.0-20210922080147-aa28ccfb8937/go.mod h1:BKR9c0uHSmRgM/se9JhFHtTT7JTO67X23MtKMHtZcpo= github.com/influxdata/line-protocol/v2 v2.0.0-20210312151457-c52fdecb625a/go.mod h1:6+9Xt5Sq1rWx+glMgxhcg2c0DUaehK+5TDcPZ76GypY= github.com/influxdata/line-protocol/v2 v2.1.0/go.mod h1:QKw43hdUBg3GTk2iC3iyCxksNj7PX9aUSeYOYE/ceHY= @@ -44,11 +54,14 @@ github.com/juju/gnuflag 
v0.0.0-20171113085948-2ce1bb71843d/go.mod h1:2PavIy+JPci github.com/klauspost/compress v1.17.7 h1:ehO88t2UGzQK66LMdE8tibEd1ErmzZjNEqWkjLAKQQg= github.com/klauspost/compress v1.17.7/go.mod h1:Di0epgTjJY877eYKx5yC51cX2A2Vl2ibi7bDH9ttBbw= github.com/kr/pretty v0.2.1/go.mod h1:ipq/a2n7PKx3OHsz4KJII5eveXtPO4qwEXGdVfWzfnI= +github.com/kr/pretty v0.3.1 h1:flRD4NNwYAUpkphVc1HcthR4KEIFJ65n8Mw5qdRn3LE= +github.com/kr/pretty v0.3.1/go.mod h1:hoEshYVHaxMs3cyo3Yncou5ZscifuDolrwPKZanG3xk= github.com/kr/pty v1.1.1/go.mod h1:pFQYn66WHrOpPYNljwOMqo10TkYh1fy3cYio2l3bCsQ= github.com/kr/text v0.1.0/go.mod h1:4Jbv+DJW3UT/LiOwJeYQe1efqtUx/iVham/4vfdArNI= +github.com/kr/text v0.2.0 h1:5Nx0Ya0ZqY2ygV366QzturHI13Jq95ApcVaJBhpS+AY= github.com/kr/text v0.2.0/go.mod h1:eLer722TekiGuMkidMxC/pM04lWEeraHUUmBw8l2grE= -github.com/nats-io/nats.go v1.33.1 h1:8TxLZZ/seeEfR97qV0/Bl939tpDnt2Z2fK3HkPypj70= -github.com/nats-io/nats.go v1.33.1/go.mod h1:Ubdu4Nh9exXdSz0RVWRFBbRfrbSxOYd26oF0wkWclB8= +github.com/nats-io/nats.go v1.36.0 h1:suEUPuWzTSse/XhESwqLxXGuj8vGRuPRoG7MoRN/qyU= +github.com/nats-io/nats.go v1.36.0/go.mod h1:Ubdu4Nh9exXdSz0RVWRFBbRfrbSxOYd26oF0wkWclB8= github.com/nats-io/nkeys v0.4.7 h1:RwNJbbIdYCoClSDNY7QVKZlyb/wfT6ugvFCiKy6vDvI= github.com/nats-io/nkeys v0.4.7/go.mod h1:kqXRgRDPlGy7nGaEDMuYzmiJCIAAWDK0IMBtDmGD0nc= github.com/nats-io/nuid v1.0.1 h1:5iA8DT8V7q8WK2EScv2padNa/rTESc1KdnPw4TC2paw= @@ -56,6 +69,7 @@ github.com/nats-io/nuid v1.0.1/go.mod h1:19wcPz3Ph3q0Jbyiqsd0kePYG7A95tJPxeL+1OS github.com/niemeyer/pretty v0.0.0-20200227124842-a10e7caefd8e/go.mod h1:zD1mROLANZcx1PVRCS0qkT7pwLkGfwJo4zjcN/Tysno= github.com/oapi-codegen/runtime v1.1.1 h1:EXLHh0DXIJnWhdRPN2w4MXAzFyE4CskzhNLUmtpMYro= github.com/oapi-codegen/runtime v1.1.1/go.mod h1:SK9X900oXmPWilYR5/WKPzt3Kqxn/uS/+lbpREv+eCg= +github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM= github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4= 
github.com/prometheus/client_golang v1.19.0 h1:ygXvpU1AoN1MhdzckN+PyD9QJOSD4x7kmXYlnfbA6JU= github.com/prometheus/client_golang v1.19.0/go.mod h1:ZRM9uEAypZakd+q/x7+gmsvXdURP+DABIEIjnmDdp+k= @@ -65,6 +79,8 @@ github.com/prometheus/common v0.49.0 h1:ToNTdK4zSnPVJmh698mGFkDor9wBI/iGaJy5dbH1 github.com/prometheus/common v0.49.0/go.mod h1:Kxm+EULxRbUkjGU6WFsQqo3ORzB4tyKvlWFOE9mB2sE= github.com/prometheus/procfs v0.12.0 h1:jluTpSng7V9hY0O2R9DzzJHYb2xULk9VTR1V1R/k6Bo= github.com/prometheus/procfs v0.12.0/go.mod h1:pcuDEFsWDnvcgNzo4EEweacyhjeA9Zk3cnaOZAZEfOo= +github.com/rogpeppe/go-internal v1.10.0 h1:TMyTOH3F/DB16zRVcYyreMH6GnZZrwQVAoYjRBZyWFQ= +github.com/rogpeppe/go-internal v1.10.0/go.mod h1:UQnix2H7Ngw/k4C5ijL5+65zddjncjaFoBhdsK/akog= github.com/shopspring/decimal v1.3.1 h1:2Usl1nmF/WZucqkFZhnfFYxxxu8LG21F6nPQBE5gKV8= github.com/shopspring/decimal v1.3.1/go.mod h1:DKyhrW/HYNuLGql+MJL6WCR6knT2jwCFRcu2hWCYk4o= github.com/spkg/bom v0.0.0-20160624110644-59b7046e48ad/go.mod h1:qLr4V1qq6nMqFKkMo8ZTx3f+BZEkzsRUY10Xsm2mwU0= @@ -76,6 +92,7 @@ github.com/stretchr/objx v0.5.0/go.mod h1:Yh+to48EsGEfYuaHDzXPcE3xhTkx73EhmCGUpE github.com/stretchr/testify v1.3.0/go.mod h1:M5WIy9Dh21IEIfnGCwXGc5bZfKNJtfHm1UVUgZn+9EI= github.com/stretchr/testify v1.7.1/go.mod h1:6Fq8oRcR53rry900zMqJjRRixrwX3KX962/h/Wwjteg= github.com/stretchr/testify v1.8.0/go.mod h1:yNjHg4UonilssWZ8iaSj1OCr/vHnekPRkoO+kdMU+MU= +github.com/stretchr/testify v1.8.4 h1:CcVxjf3Q8PM0mHUKJCdn+eZZtm5yQwehR5yeSVQQcUk= github.com/stretchr/testify v1.8.4/go.mod h1:sz/lmYIOXD/1dqDmKjjqLyZ2RngseejIcXlSw2iwfAo= github.com/tklauser/go-sysconf v0.3.13 h1:GBUpcahXSpR2xN01jhkNAbTLRk2Yzgggk8IM08lq3r4= github.com/tklauser/go-sysconf v0.3.13/go.mod h1:zwleP4Q4OehZHGn4CYZDipCgg9usW5IJePewFCGVEa0= @@ -85,8 +102,8 @@ golang.design/x/thread v0.0.0-20210122121316-335e9adffdf1 h1:P7S/GeHBAFEZIYp0ePP golang.design/x/thread v0.0.0-20210122121316-335e9adffdf1/go.mod h1:9CWpnTUmlQkfdpdutA1nNf4iE5lAVt3QZOu0Z6hahBE= golang.org/x/crypto v0.21.0 
h1:X31++rzVUdKhX5sWmSOFZxx8UW/ldWx55cbf08iNAMA= golang.org/x/crypto v0.21.0/go.mod h1:0BP7YvVV9gBbVKyeTG0Gyn+gZm94bibOW5BjDEYAOMs= -golang.org/x/exp v0.0.0-20240222234643-814bf88cf225 h1:LfspQV/FYTatPTr/3HzIcmiUFH7PGP+OQ6mgDYo3yuQ= -golang.org/x/exp v0.0.0-20240222234643-814bf88cf225/go.mod h1:CxmFvTBINI24O/j8iY7H1xHzx2i4OsyguNBmN/uPtqc= +golang.org/x/exp v0.0.0-20240613232115-7f521ea00fb8 h1:yixxcjnhBmY0nkL253HFVIm0JsFHwrHdT3Yh6szTnfY= +golang.org/x/exp v0.0.0-20240613232115-7f521ea00fb8/go.mod h1:jj3sYF3dwk5D+ghuXyeI3r5MFf+NT2An6/9dOA95KSI= golang.org/x/net v0.22.0 h1:9sGLhx7iRIHEiX0oAJ3MRZMUCElJgy7Br1nO+AMN3Tc= golang.org/x/net v0.22.0/go.mod h1:JKghWKKOSdJwpW2GEx0Ja7fmaKnMsbu+MWVZTokSYmg= golang.org/x/sys v0.0.0-20210122093101-04d7465088b8/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= @@ -99,4 +116,5 @@ gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8 gopkg.in/check.v1 v1.0.0-20200227125254-8fa46927fb4f/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0= gopkg.in/yaml.v3 v3.0.0-20200313102051-9f266ea9e77c/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM= gopkg.in/yaml.v3 v3.0.0-20200615113413-eeeca48fe776/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM= +gopkg.in/yaml.v3 v3.0.1 h1:fxVm/GzAzEWqLHuvctI91KS9hhNmmWOoWu0XTYJS7CA= gopkg.in/yaml.v3 v3.0.1/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM= diff --git a/internal/metricAggregator/metricAggregator.go b/internal/metricAggregator/metricAggregator.go index 170b13e..c300df5 100644 --- a/internal/metricAggregator/metricAggregator.go +++ b/internal/metricAggregator/metricAggregator.go @@ -11,7 +11,7 @@ import ( cclog "github.com/ClusterCockpit/cc-metric-collector/pkg/ccLogger" - lp "github.com/ClusterCockpit/cc-metric-collector/pkg/ccMetric" + lp "github.com/ClusterCockpit/cc-energy-manager/pkg/cc-message" topo "github.com/ClusterCockpit/cc-metric-collector/pkg/ccTopology" "github.com/PaesslerAG/gval" @@ -31,14 +31,14 @@ type metricAggregator struct { 
functions []*MetricAggregatorIntervalConfig constants map[string]interface{} language gval.Language - output chan lp.CCMetric + output chan lp.CCMessage } type MetricAggregator interface { AddAggregation(name, function, condition string, tags, meta map[string]string) error DeleteAggregation(name string) error - Init(output chan lp.CCMetric) error - Eval(starttime time.Time, endtime time.Time, metrics []lp.CCMetric) + Init(output chan lp.CCMessage) error + Eval(starttime time.Time, endtime time.Time, metrics []lp.CCMessage) } var metricCacheLanguage = gval.NewLanguage( @@ -74,7 +74,7 @@ var evaluables = struct { mapping: make(map[string]gval.Evaluable), } -func (c *metricAggregator) Init(output chan lp.CCMetric) error { +func (c *metricAggregator) Init(output chan lp.CCMessage) error { c.output = output c.functions = make([]*MetricAggregatorIntervalConfig, 0) c.constants = make(map[string]interface{}) @@ -112,7 +112,7 @@ func (c *metricAggregator) Init(output chan lp.CCMetric) error { return nil } -func (c *metricAggregator) Eval(starttime time.Time, endtime time.Time, metrics []lp.CCMetric) { +func (c *metricAggregator) Eval(starttime time.Time, endtime time.Time, metrics []lp.CCMessage) { vars := make(map[string]interface{}) for k, v := range c.constants { vars[k] = v @@ -127,7 +127,7 @@ func (c *metricAggregator) Eval(starttime time.Time, endtime time.Time, metrics var valuesInt32 []int32 var valuesInt64 []int64 var valuesBool []bool - matches := make([]lp.CCMetric, 0) + matches := make([]lp.CCMessage, 0) for _, m := range metrics { vars["metric"] = m //value, err := gval.Evaluate(f.Condition, vars, c.language) @@ -216,7 +216,7 @@ func (c *metricAggregator) Eval(starttime time.Time, endtime time.Time, metrics break } - copy_tags := func(tags map[string]string, metrics []lp.CCMetric) map[string]string { + copy_tags := func(tags map[string]string, metrics []lp.CCMessage) map[string]string { out := make(map[string]string) for key, value := range tags { switch value 
{ @@ -233,7 +233,7 @@ func (c *metricAggregator) Eval(starttime time.Time, endtime time.Time, metrics } return out } - copy_meta := func(meta map[string]string, metrics []lp.CCMetric) map[string]string { + copy_meta := func(meta map[string]string, metrics []lp.CCMessage) map[string]string { out := make(map[string]string) for key, value := range meta { switch value { @@ -253,18 +253,18 @@ func (c *metricAggregator) Eval(starttime time.Time, endtime time.Time, metrics tags := copy_tags(f.Tags, matches) meta := copy_meta(f.Meta, matches) - var m lp.CCMetric + var m lp.CCMessage switch t := value.(type) { case float64: - m, err = lp.New(f.Name, tags, meta, map[string]interface{}{"value": t}, starttime) + m, err = lp.NewMessage(f.Name, tags, meta, map[string]interface{}{"value": t}, starttime) case float32: - m, err = lp.New(f.Name, tags, meta, map[string]interface{}{"value": t}, starttime) + m, err = lp.NewMessage(f.Name, tags, meta, map[string]interface{}{"value": t}, starttime) case int: - m, err = lp.New(f.Name, tags, meta, map[string]interface{}{"value": t}, starttime) + m, err = lp.NewMessage(f.Name, tags, meta, map[string]interface{}{"value": t}, starttime) case int64: - m, err = lp.New(f.Name, tags, meta, map[string]interface{}{"value": t}, starttime) + m, err = lp.NewMessage(f.Name, tags, meta, map[string]interface{}{"value": t}, starttime) case string: - m, err = lp.New(f.Name, tags, meta, map[string]interface{}{"value": t}, starttime) + m, err = lp.NewMessage(f.Name, tags, meta, map[string]interface{}{"value": t}, starttime) default: cclog.ComponentError("MetricCache", "Gval returned invalid type", t, "skipping metric", f.Name) } @@ -389,7 +389,7 @@ func EvalFloat64Condition(condition string, params map[string]float64) (float64, return value, err } -func NewAggregator(output chan lp.CCMetric) (MetricAggregator, error) { +func NewAggregator(output chan lp.CCMessage) (MetricAggregator, error) { a := new(metricAggregator) err := a.Init(output) if err != nil { 
diff --git a/internal/metricRouter/README.md b/internal/metricRouter/README.md index ed99b51..546ac62 100644 --- a/internal/metricRouter/README.md +++ b/internal/metricRouter/README.md @@ -1,15 +1,21 @@ # CC Metric Router -The CCMetric router sits in between the collectors and the sinks and can be used to add and remove tags to/from traversing [CCMetrics](../ccMetric/README.md). +The CCMetric router sits in between the collectors and the sinks and can be used to add and remove tags to/from traversing [CCMessages](https://pkg.go.dev/github.com/ClusterCockpit/cc-energy-manager@v0.0.0-20240919152819-92a17f2da4f7/pkg/cc-message). + # Configuration +**Note**: Use the [message processor configuration](../../pkg/messageProcessor/README.md) with option `process_messages`. + ```json { "num_cache_intervals" : 1, "interval_timestamp" : true, "hostname_tag" : "hostname", "max_forward" : 50, + "process_messages": { + "see": "pkg/messageProcessor/README.md" + }, "add_tags" : [ { "key" : "cluster", @@ -63,6 +69,8 @@ The CCMetric router sits in between the collectors and the sinks and can be used There are three main options `add_tags`, `delete_tags` and `interval_timestamp`. `add_tags` and `delete_tags` are lists consisting of dicts with `key`, `value` and `if`. The `value` can be omitted in the `delete_tags` part as it only uses the `key` for removal. The `interval_timestamp` setting means that a unique timestamp is applied to all metrics traversing the router during an interval. +**Note**: Use the [message processor configuration](../../pkg/messageProcessor/README.md) (option `process_messages`) instead of `add_tags`, `delete_tags`, `drop_metrics`, `drop_metrics_if`, `rename_metrics`, `normalize_units` and `change_unit_prefix`. These options are deprecated and will be removed in future versions. Until then, they are added to the message processor.
+ # Processing order in the router - Add the `hostname_tag` tag (if sent by collectors or cache) @@ -96,6 +104,8 @@ Every time the router receives a metric through any of the channels, it tries to # The `rename_metrics` option +__deprecated__ + In the ClusterCockpit world we specified a set of standard metrics. Since some collectors determine the metric names based on files, execuables and libraries, they might change from system to system (or installation to installtion, OS to OS, ...). In order to get the common names, you can rename incoming metrics before sending them to the sink. If the metric name matches the `oldname`, it is changed to `newname` ```json @@ -107,6 +117,8 @@ In the ClusterCockpit world we specified a set of standard metrics. Since some c # Conditional manipulation of tags (`add_tags` and `del_tags`) +__deprecated__ + Common config format: ```json { @@ -118,6 +130,8 @@ Common config format: ## The `del_tags` option +__deprecated__ + The collectors are free to add whatever `key=value` pair to the metric tags (although the usage of tags should be minimized). If you want to delete a tag afterwards, you can do that. When the `if` condition matches on a metric, the `key` is removed from the metric's tags. If you want to remove a tag for all metrics, use the condition wildcard `*`. The `value` field can be omitted in the `del_tags` case. @@ -129,6 +143,8 @@ Never delete tags: ## The `add_tags` option +__deprecated__ + In some cases, metrics should be tagged or an existing tag changed based on some condition. This can be done in the `add_tags` section. When the `if` condition evaluates to `true`, the tag `key` is added or gets changed to the new `value`. If the CCMetric name is equal to `temp_package_id_0`, it adds an additional tag `test=testing` to the metric. @@ -170,6 +186,8 @@ In some cases, you want to drop a metric and don't get it forwarded to the sinks ## The `drop_metrics` section +__deprecated__ + The argument is a list of metric names. 
No futher checks are performed, only a comparison of the metric name ```json @@ -185,6 +203,8 @@ The example drops all metrics with the name `drop_metric_1` and `drop_metric_2`. ## The `drop_metrics_if` section +__deprecated__ + This option takes a list of evaluable conditions and performs them one after the other on **all** metrics incoming from the collectors and the metric cache (aka `interval_aggregates`). ```json @@ -200,15 +220,22 @@ The first line is comparable with the example in `drop_metrics`, it drops all me # Manipulating the metric units ## The `normalize_units` option + +__deprecated__ + + The cc-metric-collector tries to read the data from the system as it is reported. If available, it tries to read the metric unit from the system as well (e.g. from `/proc/meminfo`). The problem is that, depending on the source, the metric units are named differently. Just think about `byte`, `Byte`, `B`, `bytes`, ... The [cc-units](https://github.com/ClusterCockpit/cc-units) package provides us a normalization option to use the same metric unit name for all metrics. It this option is set to true, all `unit` meta tags are normalized. ## The `change_unit_prefix` section + +__deprecated__ + It is often the case that metrics are reported by the system using a rather outdated unit prefix (like `/proc/meminfo` still uses kByte despite current memory sizes are in the GByte range). If you want to change the prefix of a unit, you can do that with the help of [cc-units](https://github.com/ClusterCockpit/cc-units). The setting works on the metric name and requires the new prefix for the metric. The cc-units package determines the scaling factor. 
# Aggregate metric values of the current interval with the `interval_aggregates` option -**Note:** `interval_aggregates` works only if `num_cache_intervals` > 0 +**Note:** `interval_aggregates` works only if `num_cache_intervals` > 0 and is **experimental** In some cases, you need to derive new metrics based on the metrics arriving during an interval. This can be done in the `interval_aggregates` section. The logic is similar to the other metric manipulation and filtering options. A cache stores all metrics that arrive during an interval. At the beginning of the *next* interval, the list of metrics is submitted to the MetricAggregator. It derives new metrics and submits them back to the MetricRouter, so they are sent in the next interval but have the timestamp of the previous interval beginning. diff --git a/internal/metricRouter/metricCache.go b/internal/metricRouter/metricCache.go index 81e69a9..b0fcd3d 100644 --- a/internal/metricRouter/metricCache.go +++ b/internal/metricRouter/metricCache.go @@ -7,7 +7,7 @@ import ( cclog "github.com/ClusterCockpit/cc-metric-collector/pkg/ccLogger" agg "github.com/ClusterCockpit/cc-metric-collector/internal/metricAggregator" - lp "github.com/ClusterCockpit/cc-metric-collector/pkg/ccMetric" + lp "github.com/ClusterCockpit/cc-energy-manager/pkg/cc-message" mct "github.com/ClusterCockpit/cc-metric-collector/pkg/multiChanTicker" ) @@ -16,7 +16,7 @@ type metricCachePeriod struct { stopstamp time.Time numMetrics int sizeMetrics int - metrics []lp.CCMetric + metrics []lp.CCMessage } // Metric cache data structure @@ -29,21 +29,21 @@ type metricCache struct { ticker mct.MultiChanTicker tickchan chan time.Time done chan bool - output chan lp.CCMetric + output chan lp.CCMessage aggEngine agg.MetricAggregator } type MetricCache interface { - Init(output chan lp.CCMetric, ticker mct.MultiChanTicker, wg *sync.WaitGroup, numPeriods int) error + Init(output chan lp.CCMessage, ticker mct.MultiChanTicker, wg *sync.WaitGroup, numPeriods int) 
error Start() - Add(metric lp.CCMetric) - GetPeriod(index int) (time.Time, time.Time, []lp.CCMetric) + Add(metric lp.CCMessage) + GetPeriod(index int) (time.Time, time.Time, []lp.CCMessage) AddAggregation(name, function, condition string, tags, meta map[string]string) error DeleteAggregation(name string) error Close() } -func (c *metricCache) Init(output chan lp.CCMetric, ticker mct.MultiChanTicker, wg *sync.WaitGroup, numPeriods int) error { +func (c *metricCache) Init(output chan lp.CCMessage, ticker mct.MultiChanTicker, wg *sync.WaitGroup, numPeriods int) error { var err error = nil c.done = make(chan bool) c.wg = wg @@ -55,7 +55,7 @@ func (c *metricCache) Init(output chan lp.CCMetric, ticker mct.MultiChanTicker, p := new(metricCachePeriod) p.numMetrics = 0 p.sizeMetrics = 0 - p.metrics = make([]lp.CCMetric, 0) + p.metrics = make([]lp.CCMessage, 0) c.intervals = append(c.intervals, p) } @@ -124,7 +124,7 @@ func (c *metricCache) Start() { // Add a metric to the cache. The interval is defined by the global timer (rotate() in Start()) // The intervals list is used as round-robin buffer and the metric list grows dynamically and // to avoid reallocations -func (c *metricCache) Add(metric lp.CCMetric) { +func (c *metricCache) Add(metric lp.CCMessage) { if c.curPeriod >= 0 && c.curPeriod < c.numPeriods { c.lock.Lock() p := c.intervals[c.curPeriod] @@ -153,10 +153,10 @@ func (c *metricCache) DeleteAggregation(name string) error { // Get all metrics of a interval. The index is the difference to the current interval, so index=0 // is the current one, index=1 the last interval and so on. Returns and empty array if a wrong index // is given (negative index, index larger than configured number of total intervals, ...) 
-func (c *metricCache) GetPeriod(index int) (time.Time, time.Time, []lp.CCMetric) { +func (c *metricCache) GetPeriod(index int) (time.Time, time.Time, []lp.CCMessage) { var start time.Time = time.Now() var stop time.Time = time.Now() - var metrics []lp.CCMetric + var metrics []lp.CCMessage if index >= 0 && index < c.numPeriods { pindex := c.curPeriod - index if pindex < 0 { @@ -168,10 +168,10 @@ func (c *metricCache) GetPeriod(index int) (time.Time, time.Time, []lp.CCMetric) metrics = c.intervals[pindex].metrics //return c.intervals[pindex].startstamp, c.intervals[pindex].stopstamp, c.intervals[pindex].metrics } else { - metrics = make([]lp.CCMetric, 0) + metrics = make([]lp.CCMessage, 0) } } else { - metrics = make([]lp.CCMetric, 0) + metrics = make([]lp.CCMessage, 0) } return start, stop, metrics } @@ -182,7 +182,7 @@ func (c *metricCache) Close() { c.done <- true } -func NewCache(output chan lp.CCMetric, ticker mct.MultiChanTicker, wg *sync.WaitGroup, numPeriods int) (MetricCache, error) { +func NewCache(output chan lp.CCMessage, ticker mct.MultiChanTicker, wg *sync.WaitGroup, numPeriods int) (MetricCache, error) { c := new(metricCache) err := c.Init(output, ticker, wg, numPeriods) if err != nil { diff --git a/internal/metricRouter/metricRouter.go b/internal/metricRouter/metricRouter.go index 32ac0f3..e30e436 100644 --- a/internal/metricRouter/metricRouter.go +++ b/internal/metricRouter/metricRouter.go @@ -2,6 +2,7 @@ package metricRouter import ( "encoding/json" + "fmt" "os" "strings" "sync" @@ -9,10 +10,10 @@ import ( cclog "github.com/ClusterCockpit/cc-metric-collector/pkg/ccLogger" + lp "github.com/ClusterCockpit/cc-energy-manager/pkg/cc-message" agg "github.com/ClusterCockpit/cc-metric-collector/internal/metricAggregator" - lp "github.com/ClusterCockpit/cc-metric-collector/pkg/ccMetric" + mp "github.com/ClusterCockpit/cc-metric-collector/pkg/messageProcessor" mct "github.com/ClusterCockpit/cc-metric-collector/pkg/multiChanTicker" - units 
"github.com/ClusterCockpit/cc-units" ) const ROUTER_MAX_FORWARD = 50 @@ -38,16 +39,17 @@ type metricRouterConfig struct { MaxForward int `json:"max_forward"` // Number of maximal forwarded metrics at one select NormalizeUnits bool `json:"normalize_units"` // Check unit meta flag and normalize it using cc-units ChangeUnitPrefix map[string]string `json:"change_unit_prefix"` // Add prefix that should be applied to the metrics - dropMetrics map[string]bool // Internal map for O(1) lookup + // dropMetrics map[string]bool // Internal map for O(1) lookup + MessageProcessor json.RawMessage `json:"process_message,omitempty"` } // Metric router data structure type metricRouter struct { hostname string // Hostname used in tags - coll_input chan lp.CCMetric // Input channel from CollectorManager - recv_input chan lp.CCMetric // Input channel from ReceiveManager - cache_input chan lp.CCMetric // Input channel from MetricCache - outputs []chan lp.CCMetric // List of all output channels + coll_input chan lp.CCMessage // Input channel from CollectorManager + recv_input chan lp.CCMessage // Input channel from ReceiveManager + cache_input chan lp.CCMessage // Input channel from MetricCache + outputs []chan lp.CCMessage // List of all output channels done chan bool // channel to finish / stop metric router wg *sync.WaitGroup // wait group for all goroutines in cc-metric-collector timestamp time.Time // timestamp periodically updated by ticker each interval @@ -56,14 +58,15 @@ type metricRouter struct { cache MetricCache // pointer to MetricCache cachewg sync.WaitGroup // wait group for MetricCache maxForward int // number of metrics to forward maximally in one iteration + mp mp.MessageProcessor } // MetricRouter access functions type MetricRouter interface { Init(ticker mct.MultiChanTicker, wg *sync.WaitGroup, routerConfigFile string) error - AddCollectorInput(input chan lp.CCMetric) - AddReceiverInput(input chan lp.CCMetric) - AddOutput(output chan lp.CCMetric) + 
AddCollectorInput(input chan lp.CCMessage) + AddReceiverInput(input chan lp.CCMessage) + AddOutput(output chan lp.CCMessage) Start() Close() } @@ -75,9 +78,9 @@ type MetricRouter interface { // * ticker (from variable ticker) // * configuration (read from config file in variable routerConfigFile) func (r *metricRouter) Init(ticker mct.MultiChanTicker, wg *sync.WaitGroup, routerConfigFile string) error { - r.outputs = make([]chan lp.CCMetric, 0) + r.outputs = make([]chan lp.CCMessage, 0) r.done = make(chan bool) - r.cache_input = make(chan lp.CCMetric) + r.cache_input = make(chan lp.CCMessage) r.wg = wg r.ticker = ticker r.config.MaxForward = ROUTER_MAX_FORWARD @@ -119,14 +122,56 @@ func (r *metricRouter) Init(ticker mct.MultiChanTicker, wg *sync.WaitGroup, rout r.cache.AddAggregation(agg.Name, agg.Function, agg.Condition, agg.Tags, agg.Meta) } } - r.config.dropMetrics = make(map[string]bool) - for _, mname := range r.config.DropMetrics { - r.config.dropMetrics[mname] = true + p, err := mp.NewMessageProcessor() + if err != nil { + return fmt.Errorf("initialization of message processor failed: %v", err.Error()) } + r.mp = p + + if len(r.config.MessageProcessor) > 0 { + err = r.mp.FromConfigJSON(r.config.MessageProcessor) + if err != nil { + return fmt.Errorf("failed parsing JSON for message processor: %v", err.Error()) + } + } + for _, mname := range r.config.DropMetrics { + r.mp.AddDropMessagesByName(mname) + } + for _, cond := range r.config.DropMetricsIf { + r.mp.AddDropMessagesByCondition(cond) + } + for _, data := range r.config.AddTags { + cond := data.Condition + if cond == "*" { + cond = "true" + } + r.mp.AddAddTagsByCondition(cond, data.Key, data.Value) + } + for _, data := range r.config.DelTags { + cond := data.Condition + if cond == "*" { + cond = "true" + } + r.mp.AddDeleteTagsByCondition(cond, data.Key, data.Value) + } + for oldname, newname := range r.config.RenameMetrics { + r.mp.AddRenameMetricByName(oldname, newname) + } + for metricName, prefix := 
range r.config.ChangeUnitPrefix { + r.mp.AddChangeUnitPrefix(fmt.Sprintf("name == '%s'", metricName), prefix) + } + r.mp.SetNormalizeUnits(r.config.NormalizeUnits) + + r.mp.AddAddTagsByCondition("true", r.config.HostnameTagName, r.hostname) + + // r.config.dropMetrics = make(map[string]bool) + // for _, mname := range r.config.DropMetrics { + // r.config.dropMetrics[mname] = true + // } return nil } -func getParamMap(point lp.CCMetric) map[string]interface{} { +func getParamMap(point lp.CCMessage) map[string]interface{} { params := make(map[string]interface{}) params["metric"] = point params["name"] = point.Name() @@ -144,7 +189,7 @@ func getParamMap(point lp.CCMetric) map[string]interface{} { } // DoAddTags adds a tag when condition is fullfiled -func (r *metricRouter) DoAddTags(point lp.CCMetric) { +func (r *metricRouter) DoAddTags(point lp.CCMessage) { var conditionMatches bool for _, m := range r.config.AddTags { if m.Condition == "*" { @@ -166,81 +211,81 @@ func (r *metricRouter) DoAddTags(point lp.CCMetric) { } // DoDelTags removes a tag when condition is fullfiled -func (r *metricRouter) DoDelTags(point lp.CCMetric) { - var conditionMatches bool - for _, m := range r.config.DelTags { - if m.Condition == "*" { - // Condition is always matched - conditionMatches = true - } else { - // Evaluate condition - var err error - conditionMatches, err = agg.EvalBoolCondition(m.Condition, getParamMap(point)) - if err != nil { - cclog.ComponentError("MetricRouter", err.Error()) - conditionMatches = false - } - } - if conditionMatches { - point.RemoveTag(m.Key) - } - } -} +// func (r *metricRouter) DoDelTags(point lp.CCMessage) { +// var conditionMatches bool +// for _, m := range r.config.DelTags { +// if m.Condition == "*" { +// // Condition is always matched +// conditionMatches = true +// } else { +// // Evaluate condition +// var err error +// conditionMatches, err = agg.EvalBoolCondition(m.Condition, getParamMap(point)) +// if err != nil { +// 
cclog.ComponentError("MetricRouter", err.Error()) +// conditionMatches = false +// } +// } +// if conditionMatches { +// point.RemoveTag(m.Key) +// } +// } +// } // Conditional test whether a metric should be dropped -func (r *metricRouter) dropMetric(point lp.CCMetric) bool { - // Simple drop check - if conditionMatches, ok := r.config.dropMetrics[point.Name()]; ok { - return conditionMatches - } +// func (r *metricRouter) dropMetric(point lp.CCMessage) bool { +// // Simple drop check +// if conditionMatches, ok := r.config.dropMetrics[point.Name()]; ok { +// return conditionMatches +// } - // Checking the dropping conditions - for _, m := range r.config.DropMetricsIf { - conditionMatches, err := agg.EvalBoolCondition(m, getParamMap(point)) - if err != nil { - cclog.ComponentError("MetricRouter", err.Error()) - conditionMatches = false - } - if conditionMatches { - return conditionMatches - } - } +// // Checking the dropping conditions +// for _, m := range r.config.DropMetricsIf { +// conditionMatches, err := agg.EvalBoolCondition(m, getParamMap(point)) +// if err != nil { +// cclog.ComponentError("MetricRouter", err.Error()) +// conditionMatches = false +// } +// if conditionMatches { +// return conditionMatches +// } +// } - // No dropping condition met - return false -} +// // No dropping condition met +// return false +// } -func (r *metricRouter) prepareUnit(point lp.CCMetric) bool { - if r.config.NormalizeUnits { - if in_unit, ok := point.GetMeta("unit"); ok { - u := units.NewUnit(in_unit) - if u.Valid() { - point.AddMeta("unit", u.Short()) - } - } - } - if newP, ok := r.config.ChangeUnitPrefix[point.Name()]; ok { +// func (r *metricRouter) prepareUnit(point lp.CCMessage) bool { +// if r.config.NormalizeUnits { +// if in_unit, ok := point.GetMeta("unit"); ok { +// u := units.NewUnit(in_unit) +// if u.Valid() { +// point.AddMeta("unit", u.Short()) +// } +// } +// } +// if newP, ok := r.config.ChangeUnitPrefix[point.Name()]; ok { - newPrefix := 
units.NewPrefix(newP) +// newPrefix := units.NewPrefix(newP) - if in_unit, ok := point.GetMeta("unit"); ok && newPrefix != units.InvalidPrefix { - u := units.NewUnit(in_unit) - if u.Valid() { - cclog.ComponentDebug("MetricRouter", "Change prefix to", newP, "for metric", point.Name()) - conv, out_unit := units.GetUnitPrefixFactor(u, newPrefix) - if conv != nil && out_unit.Valid() { - if val, ok := point.GetField("value"); ok { - point.AddField("value", conv(val)) - point.AddMeta("unit", out_unit.Short()) - } - } - } +// if in_unit, ok := point.GetMeta("unit"); ok && newPrefix != units.InvalidPrefix { +// u := units.NewUnit(in_unit) +// if u.Valid() { +// cclog.ComponentDebug("MetricRouter", "Change prefix to", newP, "for metric", point.Name()) +// conv, out_unit := units.GetUnitPrefixFactor(u, newPrefix) +// if conv != nil && out_unit.Valid() { +// if val, ok := point.GetField("value"); ok { +// point.AddField("value", conv(val)) +// point.AddMeta("unit", out_unit.Short()) +// } +// } +// } - } - } +// } +// } - return true -} +// return true +// } // Start starts the metric router func (r *metricRouter) Start() { @@ -259,59 +304,75 @@ func (r *metricRouter) Start() { // Forward takes a received metric, adds or deletes tags // and forwards it to the output channels - forward := func(point lp.CCMetric) { - cclog.ComponentDebug("MetricRouter", "FORWARD", point) - r.DoAddTags(point) - r.DoDelTags(point) - name := point.Name() - if new, ok := r.config.RenameMetrics[name]; ok { - point.SetName(new) - point.AddMeta("oldname", name) - r.DoAddTags(point) - r.DoDelTags(point) - } + // forward := func(point lp.CCMessage) { + // cclog.ComponentDebug("MetricRouter", "FORWARD", point) + // r.DoAddTags(point) + // r.DoDelTags(point) + // name := point.Name() + // if new, ok := r.config.RenameMetrics[name]; ok { + // point.SetName(new) + // point.AddMeta("oldname", name) + // r.DoAddTags(point) + // r.DoDelTags(point) + // } - r.prepareUnit(point) + // r.prepareUnit(point) - for 
_, o := range r.outputs { - o <- point - } - } + // for _, o := range r.outputs { + // o <- point + // } + // } // Foward message received from collector channel - coll_forward := func(p lp.CCMetric) { + coll_forward := func(p lp.CCMessage) { // receive from metric collector - p.AddTag(r.config.HostnameTagName, r.hostname) + //p.AddTag(r.config.HostnameTagName, r.hostname) if r.config.IntervalStamp { p.SetTime(r.timestamp) } - if !r.dropMetric(p) { - forward(p) + m, err := r.mp.ProcessMessage(p) + if err == nil && m != nil { + for _, o := range r.outputs { + o <- m + } } + // if !r.dropMetric(p) { + // for _, o := range r.outputs { + // o <- point + // } + // } // even if the metric is dropped, it is stored in the cache for // aggregations if r.config.NumCacheIntervals > 0 { - r.cache.Add(p) + r.cache.Add(m) } } // Forward message received from receivers channel - recv_forward := func(p lp.CCMetric) { + recv_forward := func(p lp.CCMessage) { // receive from receive manager if r.config.IntervalStamp { p.SetTime(r.timestamp) } - if !r.dropMetric(p) { - forward(p) + m, err := r.mp.ProcessMessage(p) + if err == nil && m != nil { + for _, o := range r.outputs { + o <- m + } } + // if !r.dropMetric(p) { + // forward(p) + // } } // Forward message received from cache channel - cache_forward := func(p lp.CCMetric) { + cache_forward := func(p lp.CCMessage) { // receive from metric collector - if !r.dropMetric(p) { - p.AddTag(r.config.HostnameTagName, r.hostname) - forward(p) + m, err := r.mp.ProcessMessage(p) + if err == nil && m != nil { + for _, o := range r.outputs { + o <- m + } } } @@ -358,17 +419,17 @@ func (r *metricRouter) Start() { } // AddCollectorInput adds a channel between metric collector and metric router -func (r *metricRouter) AddCollectorInput(input chan lp.CCMetric) { +func (r *metricRouter) AddCollectorInput(input chan lp.CCMessage) { r.coll_input = input } // AddReceiverInput adds a channel between metric receiver and metric router -func (r 
*metricRouter) AddReceiverInput(input chan lp.CCMetric) { +func (r *metricRouter) AddReceiverInput(input chan lp.CCMessage) { r.recv_input = input } // AddOutput adds a output channel to the metric router -func (r *metricRouter) AddOutput(output chan lp.CCMetric) { +func (r *metricRouter) AddOutput(output chan lp.CCMessage) { r.outputs = append(r.outputs, output) } diff --git a/pkg/ccTopology/ccTopology.go b/pkg/ccTopology/ccTopology.go index a3ebec6..d180682 100644 --- a/pkg/ccTopology/ccTopology.go +++ b/pkg/ccTopology/ccTopology.go @@ -296,6 +296,25 @@ func GetTypeList(topology_type string) []int { return []int{} } +func GetTypeId(hwt HwthreadEntry, topology_type string) (int, error) { + var err error = nil + switch topology_type { + case "node": + return 0, err + case "socket": + return hwt.Socket, err + case "die": + return hwt.Die, err + case "memoryDomain": + return hwt.NumaDomain, err + case "core": + return hwt.Core, err + case "hwthread": + return hwt.CpuID, err + } + return -1, fmt.Errorf("unknown topology type '%s'", topology_type) +} + // CpuData returns CPU data for each hardware thread func CpuData() []HwthreadEntry { // return a deep copy to protect cache data @@ -423,3 +442,22 @@ func GetCoreHwthreads(core int) []int { } return cpuList } + +// GetTypeList gets the list of specified type using the naming format inside ClusterCockpit +func GetTypeHwthreads(topology_type string, id int) []int { + switch topology_type { + case "node": + return HwthreadList() + case "socket": + return GetSocketHwthreads(id) + case "die": + return GetDieHwthreads(id) + case "memoryDomain": + return GetNumaDomainHwthreads(id) + case "core": + return GetCoreHwthreads(id) + case "hwthread": + return []int{id} + } + return []int{} +} diff --git a/pkg/messageProcessor/README.md b/pkg/messageProcessor/README.md index 2fef020..36ff8ed 100644 --- a/pkg/messageProcessor/README.md +++ b/pkg/messageProcessor/README.md @@ -10,6 +10,8 @@ lack of flexibility caused some trouble: > 
resolution for some metrics. The issue was basically the `mem_used` metric showing the currently used memory of the node. Ganglia wants it in `kByte` as provided > by the Linux operating system but CC wanted it in `GByte`. +With the message processor, the Ganglia sinks can apply the unit prefix changes individually and name the metrics as required by Ganglia. + ## For developers Whenever you receive or are about to send a message out, you should provide some processing. @@ -20,41 +22,211 @@ New operations can be added to the message processor at runtime. Of course, they or some fields in a configuration file for the processing. The message processor uses the following configuration -```golang -type messageProcessorConfig struct { - DropMessages []string `json:"drop_messages"` // List of metric names to drop. For fine-grained dropping use drop_messages_if - DropMessagesIf []string `json:"drop_messages_if"` // List of evaluatable terms to drop messages - RenameMessages map[string]string `json:"rename_messages"` // Map to rename metric name from key to value - NormalizeUnits bool `json:"normalize_units"` // Check unit meta flag and normalize it using cc-units - ChangeUnitPrefix map[string]string `json:"change_unit_prefix"` // Add prefix that should be applied to the messages + +```json +{ + "drop_messages": [ + "name_of_message_to_drop" + ], + "drop_messages_if": [ + "condition_when_to_drop_message", + "name == 'drop_this'", + "tag.hostname == 'this_host'", + "meta.unit != 'MB'" + ], + "rename_messages" : { + "old_message_name" : "new_message_name" + }, + "rename_messages_if": { + "condition_when_to_rename_message" : "new_name" + }, + "add_tags_if": [ + { + "if" : "condition_when_to_add_tag", + "key": "name_for_new_tag", + "value": "new_tag_value" + } + ], + "delete_tags_if": [ + { + "if" : "condition_when_to_delete_tag", + "key": "name_of_tag" + } + ], + "add_meta_if": [ + { + "if" : "condition_when_to_add_meta_info", + "key": "name_for_new_meta_info", + "value": 
"new_meta_info_value" + } + ], + "delete_meta_if": [ + { + "if" : "condition_when_to_delete_meta_info", + "key": "name_of_meta_info" + } + ], + "add_field_if": [ + { + "if" : "condition_when_to_add_field", + "key": "name_for_new_field", + "value": "new_field_value_but_only_string_at_the_moment" + } + ], + "delete_field_if": [ + { + "if" : "condition_when_to_delete_field", + "key": "name_of_field" + } + ], + "move_tag_to_meta_if": [ + { + "if" : "condition_when_to_move_tag_to_meta_info_including_its_value", + "key": "name_of_tag", + "value": "name_of_meta_info" + } + ], + "move_tag_to_field_if": [ + { + "if" : "condition_when_to_move_tag_to_fields_including_its_value", + "key": "name_of_tag", + "value": "name_of_field" + } + ], + "move_meta_to_tag_if": [ + { + "if" : "condition_when_to_move_meta_info_to_tags_including_its_value", + "key": "name_of_meta_info", + "value": "name_of_tag" + } + ], + "move_meta_to_field_if": [ + { + "if" : "condition_when_to_move_meta_info_to_fields_including_its_value", + "key": "name_of_tag", + "value": "name_of_meta_info" + } + ], + "move_field_to_tag_if": [ + { + "if" : "condition_when_to_move_field_to_tags_including_its_stringified_value", + "key": "name_of_field", + "value": "name_of_tag" + } + ], + "move_field_to_meta_if": [ + { + "if" : "condition_when_to_move_field_to_meta_info_including_its_stringified_value", + "key": "name_of_field", + "value": "name_of_meta_info" + } + ], + "drop_by_message_type": [ + "metric", + "event", + "log", + "control" + ], + "change_unit_prefix": { + "name == 'metric_with_wrong_unit_prefix'" : "G", + "only_if_messagetype == 'metric'": "T" + }, + "normalize_units": true, + "add_base_env": { + "MY_CONSTANT_FOR_CUSTOM_CONDITIONS": 1.0, + "output_value_for_test_metrics": 42.0, + }, + "stage_order": [ + "rename_messages_if", + "drop_messages" + ] } ``` +The options `change_unit_prefix` and `normalize_units` are only applied to CCMetrics. 
It is not possible to delete the field related to each message type as defined in [cc-specification](https://github.com/ClusterCockpit/cc-specifications/tree/master/interfaces/lineprotocol). In short: +- CCMetrics always have to have a field named `value` +- CCEvents always have to have a field named `event` +- CCLogs always have to have a field named `log` +- CCControl messages always have to have a field named `control` + +With `add_base_env`, one can specify mykey=myvalue pairs that can be used in conditions like `tag.type == mykey`. + +The order in which each message is processed can be specified with the `stage_order` option. The stage names are the keys in the JSON configuration, thus `change_unit_prefix`, `move_field_to_meta_if`, etc. Stages can be listed multiple times. + +### Using the component In order to load the configuration from a `json.RawMessage`: ```golang -mp, _ := NewMessageProcessor() - +mp, err := NewMessageProcessor() +if err != nil { + log.Error("failed to create new message processor") +} mp.FromConfigJSON(configJson) ``` -### Using the component After initialization and adding the different operations, the `ProcessMessage()` function applies all operations and returns whether the message should be dropped.
```golang m := lp.CCMetric{} -drop, err := mp.ProcessMessage(m) -if !drop { - // process further +x, err := mp.ProcessMessage(m) +if err != nil { + // handle error +} +if x != nil { + // process x further +} else { + // this message got dropped } ``` -#### Overhead +Single operations can be added and removed at runtime +```golang +type MessageProcessor interface { + // Functions to set the execution order of the processing stages + SetStages([]string) error + DefaultStages() []string + // Function to add variables to the base evaluation environment + AddBaseEnv(env map[string]interface{}) error + // Functions to add and remove rules + AddDropMessagesByName(name string) error + RemoveDropMessagesByName(name string) + AddDropMessagesByCondition(condition string) error + RemoveDropMessagesByCondition(condition string) + AddRenameMetricByCondition(condition string, name string) error + RemoveRenameMetricByCondition(condition string) + AddRenameMetricByName(from, to string) error + RemoveRenameMetricByName(from string) + SetNormalizeUnits(settings bool) + AddChangeUnitPrefix(condition string, prefix string) error + RemoveChangeUnitPrefix(condition string) + AddAddTagsByCondition(condition, key, value string) error + RemoveAddTagsByCondition(condition string) + AddDeleteTagsByCondition(condition, key, value string) error + RemoveDeleteTagsByCondition(condition string) + AddAddMetaByCondition(condition, key, value string) error + RemoveAddMetaByCondition(condition string) + AddDeleteMetaByCondition(condition, key, value string) error + RemoveDeleteMetaByCondition(condition string) + AddMoveTagToMeta(condition, key, value string) error + RemoveMoveTagToMeta(condition string) + AddMoveTagToFields(condition, key, value string) error + RemoveMoveTagToFields(condition string) + AddMoveMetaToTags(condition, key, value string) error + RemoveMoveMetaToTags(condition string) + AddMoveMetaToFields(condition, key, value string) error + RemoveMoveMetaToFields(condition string) + 
AddMoveFieldToTags(condition, key, value string) error + RemoveMoveFieldToTags(condition string) + AddMoveFieldToMeta(condition, key, value string) error + RemoveMoveFieldToMeta(condition string) + // Read in a JSON configuration + FromConfigJSON(config json.RawMessage) error + ProcessMessage(m lp2.CCMessage) (lp2.CCMessage, error) + // Processing functions for legacy CCMetric and current CCMessage + ProcessMetric(m lp.CCMetric) (lp2.CCMessage, error) +} +``` -The operations taking conditions are pre-processed, which is commonly the time consuming part but, of course, with each added operation, the time to process a message -increases. - -## For users ### Syntax for evaluatable terms @@ -62,17 +234,19 @@ The message processor uses `gval` for evaluating the terms. It provides a basic Accessible for operations are - `name` of the message -- `timestamp` of the message -- `type`, `type-id` of the message (also `tag_type` and `tag_type-id`) -- `stype`, `stype-id` of the message (if message has theses tags, also `tag_stype` and `tag_stype-id`) +- `timestamp` or `time` of the message +- `type`, `type-id` of the message (also `tag_type`, `tag_type-id` and `tag_typeid`) +- `stype`, `stype-id` of the message (if message has theses tags, also `tag_stype`, `tag_stype-id` and `tag_stypeid`) - `value` for a CCMetric message (also `field_value`) - `event` for a CCEvent message (also `field_event`) - `control` for a CCControl message (also `field_control`) - `log` for a CCLog message (also `field_log`) +- `messagetype` or `msgtype`. Possible values `event`, `metric`, `log` and `control`. -Generally, all tags are accessible with `tag_`, all meta information with `meta_` and fields with `field_`. +Generally, all tags are accessible with `tag_`, `tags_` or `tags.`. Similarly for all fields with `field[s]?[_.]`. For meta information `meta[_.]` (there is no `metas[_.]`). -- Comparing strings: `==`, `!=`, `match(str, regex)` (use `%` instead of `\`!) 
+The [syntax of `expr`](https://expr-lang.org/docs/language-definition) is accepted with some additions: +- Comparing strings: `==`, `!=`, `str matches regex` (use `%` instead of `\`!) - Combining conditions: `&&`, `||` - Comparing numbers: `==`, `!=`, `<`, `>`, `<=`, `>=` - Test lists: ` in ` @@ -82,3 +256,11 @@ Often the operations are written in JSON files for loading them at startup. In J - use `''` instead of `""` for strings - for the regexes, use `%` instead of `\` + +For operations that should be applied on all messages, use the condition `true`. + +### Overhead + +The operations taking conditions are pre-processed, which is commonly the time consuming part but, of course, with each added operation, the time to process a message +increases. Moreover, the processing creates a copy of the message. + diff --git a/pkg/messageProcessor/messageProcessor.go b/pkg/messageProcessor/messageProcessor.go index 3a3d1cf..9bcc54a 100644 --- a/pkg/messageProcessor/messageProcessor.go +++ b/pkg/messageProcessor/messageProcessor.go @@ -6,18 +6,19 @@ import ( "strings" "sync" - lp2 "github.com/ClusterCockpit/cc-energy-manager/pkg/cc-message" + lp "github.com/ClusterCockpit/cc-energy-manager/pkg/cc-message" cclog "github.com/ClusterCockpit/cc-metric-collector/pkg/ccLogger" - lp "github.com/ClusterCockpit/cc-metric-collector/pkg/ccMetric" + lplegacy "github.com/ClusterCockpit/cc-metric-collector/pkg/ccMetric" + "github.com/expr-lang/expr" "github.com/expr-lang/expr/vm" ) // Message processor add/delete tag/meta configuration type messageProcessorTagConfig struct { - Key string `json:"key"` // Tag name - Value string `json:"value"` // Tag value - Condition string `json:"if"` // Condition for adding or removing corresponding tag + Key string `json:"key"` // Tag name + Value string `json:"value,omitempty"` // Tag value + Condition string `json:"if"` // Condition for adding or removing corresponding tag } type messageProcessorConfig struct { @@ -32,8 +33,8 @@ type 
messageProcessorConfig struct { DelTagsIf []messageProcessorTagConfig `json:"delete_tags_if"` // List of tags that are removed when the condition is met AddMetaIf []messageProcessorTagConfig `json:"add_meta_if"` // List of meta infos that are added when the condition is met DelMetaIf []messageProcessorTagConfig `json:"delete_meta_if"` // List of meta infos that are removed when the condition is met - AddFieldIf []messageProcessorTagConfig `json:"add_fields_if"` // List of fields that are added when the condition is met - DelFieldIf []messageProcessorTagConfig `json:"delete_fields_if"` // List of fields that are removed when the condition is met + AddFieldIf []messageProcessorTagConfig `json:"add_field_if"` // List of fields that are added when the condition is met + DelFieldIf []messageProcessorTagConfig `json:"delete_field_if"` // List of fields that are removed when the condition is met DropByType []string `json:"drop_by_message_type"` // List of message types that should be dropped MoveTagToMeta []messageProcessorTagConfig `json:"move_tag_to_meta_if"` MoveTagToField []messageProcessorTagConfig `json:"move_tag_to_field_if"` @@ -117,8 +118,8 @@ type MessageProcessor interface { // Read in a JSON configuration FromConfigJSON(config json.RawMessage) error // Processing functions for legacy CCMetric and current CCMessage - ProcessMetric(m lp.CCMetric) (lp2.CCMessage, error) - ProcessMessage(m lp2.CCMessage) (lp2.CCMessage, error) + ProcessMetric(m lplegacy.CCMetric) (lp.CCMessage, error) + ProcessMessage(m lp.CCMessage) (lp.CCMessage, error) //EvalToBool(condition string, parameters map[string]interface{}) (bool, error) //EvalToFloat64(condition string, parameters map[string]interface{}) (float64, error) //EvalToString(condition string, parameters map[string]interface{}) (string, error) @@ -261,8 +262,8 @@ var baseenv = map[string]interface{}{ "log": "", }, "timestamp": 1234567890, - "msg": lp2.EmptyMessage(), - "message": lp2.EmptyMessage(), + "msg": 
lp.EmptyMessage(), + "message": lp.EmptyMessage(), } func addBaseEnvWalker(values map[string]interface{}) map[string]interface{} { @@ -759,8 +760,8 @@ func (mp *messageProcessor) FromConfigJSON(config json.RawMessage) error { return nil } -func (mp *messageProcessor) ProcessMetric(metric lp.CCMetric) (lp2.CCMessage, error) { - m, err := lp2.NewMessage( +func (mp *messageProcessor) ProcessMetric(metric lplegacy.CCMetric) (lp.CCMessage, error) { + m, err := lp.NewMessage( metric.Name(), metric.Tags(), metric.Meta(), @@ -774,9 +775,9 @@ func (mp *messageProcessor) ProcessMetric(metric lp.CCMetric) (lp2.CCMessage, er } -func (mp *messageProcessor) ProcessMessage(m lp2.CCMessage) (lp2.CCMessage, error) { +func (mp *messageProcessor) ProcessMessage(m lp.CCMessage) (lp.CCMessage, error) { var err error = nil - var out lp2.CCMessage = lp2.FromMessage(m) + var out lp.CCMessage = lp.FromMessage(m) name := out.Name() @@ -802,45 +803,45 @@ func (mp *messageProcessor) ProcessMessage(m lp2.CCMessage) (lp2.CCMessage, erro switch s { case STAGENAME_DROP_BY_NAME: if len(mp.dropMessages) > 0 { - cclog.ComponentDebug("MessageProcessor", "Dropping by message name ", name) + //cclog.ComponentDebug("MessageProcessor", "Dropping by message name ", name) if _, ok := mp.dropMessages[name]; ok { - cclog.ComponentDebug("MessageProcessor", "Drop") + //cclog.ComponentDebug("MessageProcessor", "Drop") return nil, nil } } case STAGENAME_DROP_BY_TYPE: if len(mp.dropTypes) > 0 { - cclog.ComponentDebug("MessageProcessor", "Dropping by message type") + //cclog.ComponentDebug("MessageProcessor", "Dropping by message type") if _, ok := mp.dropTypes[params["messagetype"].(string)]; ok { - cclog.ComponentDebug("MessageProcessor", "Drop") + //cclog.ComponentDebug("MessageProcessor", "Drop") return nil, nil } } case STAGENAME_DROP_IF: if len(mp.dropMessagesIf) > 0 { - cclog.ComponentDebug("MessageProcessor", "Dropping by condition") + //cclog.ComponentDebug("MessageProcessor", "Dropping by condition") 
drop, err := dropMessagesIf(¶ms, &mp.dropMessagesIf) if err != nil { return out, fmt.Errorf("failed to evaluate: %v", err.Error()) } if drop { - cclog.ComponentDebug("MessageProcessor", "Drop") + //cclog.ComponentDebug("MessageProcessor", "Drop") return nil, nil } } case STAGENAME_RENAME_BY_NAME: if len(mp.renameMessages) > 0 { - cclog.ComponentDebug("MessageProcessor", "Renaming by name match") + //cclog.ComponentDebug("MessageProcessor", "Renaming by name match") if newname, ok := mp.renameMessages[name]; ok { - cclog.ComponentDebug("MessageProcessor", "Rename to", newname) + //cclog.ComponentDebug("MessageProcessor", "Rename to", newname) out.SetName(newname) - cclog.ComponentDebug("MessageProcessor", "Add old name as 'oldname' to meta", name) + //cclog.ComponentDebug("MessageProcessor", "Add old name as 'oldname' to meta", name) out.AddMeta("oldname", name) } } case STAGENAME_RENAME_IF: if len(mp.renameMessagesIf) > 0 { - cclog.ComponentDebug("MessageProcessor", "Renaming by condition") + //cclog.ComponentDebug("MessageProcessor", "Renaming by condition") _, err := renameMessagesIf(out, ¶ms, &mp.renameMessagesIf) if err != nil { return out, fmt.Errorf("failed to evaluate: %v", err.Error()) @@ -848,7 +849,7 @@ func (mp *messageProcessor) ProcessMessage(m lp2.CCMessage) (lp2.CCMessage, erro } case STAGENAME_ADD_TAG: if len(mp.addTagsIf) > 0 { - cclog.ComponentDebug("MessageProcessor", "Adding tags") + //cclog.ComponentDebug("MessageProcessor", "Adding tags") _, err = addTagIf(out, ¶ms, &mp.addTagsIf) if err != nil { return out, fmt.Errorf("failed to evaluate: %v", err.Error()) @@ -856,7 +857,7 @@ func (mp *messageProcessor) ProcessMessage(m lp2.CCMessage) (lp2.CCMessage, erro } case STAGENAME_DELETE_TAG: if len(mp.deleteTagsIf) > 0 { - cclog.ComponentDebug("MessageProcessor", "Delete tags") + //cclog.ComponentDebug("MessageProcessor", "Delete tags") _, err = deleteTagIf(out, ¶ms, &mp.deleteTagsIf) if err != nil { return out, fmt.Errorf("failed to evaluate: %v", 
err.Error()) @@ -864,7 +865,7 @@ func (mp *messageProcessor) ProcessMessage(m lp2.CCMessage) (lp2.CCMessage, erro } case STAGENAME_ADD_META: if len(mp.addMetaIf) > 0 { - cclog.ComponentDebug("MessageProcessor", "Adding meta information") + //cclog.ComponentDebug("MessageProcessor", "Adding meta information") _, err = addMetaIf(out, ¶ms, &mp.addMetaIf) if err != nil { return out, fmt.Errorf("failed to evaluate: %v", err.Error()) @@ -872,7 +873,7 @@ func (mp *messageProcessor) ProcessMessage(m lp2.CCMessage) (lp2.CCMessage, erro } case STAGENAME_DELETE_META: if len(mp.deleteMetaIf) > 0 { - cclog.ComponentDebug("MessageProcessor", "Delete meta information") + //cclog.ComponentDebug("MessageProcessor", "Delete meta information") _, err = deleteMetaIf(out, ¶ms, &mp.deleteMetaIf) if err != nil { return out, fmt.Errorf("failed to evaluate: %v", err.Error()) @@ -880,7 +881,7 @@ func (mp *messageProcessor) ProcessMessage(m lp2.CCMessage) (lp2.CCMessage, erro } case STAGENAME_ADD_FIELD: if len(mp.addFieldIf) > 0 { - cclog.ComponentDebug("MessageProcessor", "Adding fields") + //cclog.ComponentDebug("MessageProcessor", "Adding fields") _, err = addFieldIf(out, ¶ms, &mp.addFieldIf) if err != nil { return out, fmt.Errorf("failed to evaluate: %v", err.Error()) @@ -888,7 +889,7 @@ func (mp *messageProcessor) ProcessMessage(m lp2.CCMessage) (lp2.CCMessage, erro } case STAGENAME_DELETE_FIELD: if len(mp.deleteFieldIf) > 0 { - cclog.ComponentDebug("MessageProcessor", "Delete fields") + //cclog.ComponentDebug("MessageProcessor", "Delete fields") _, err = deleteFieldIf(out, ¶ms, &mp.deleteFieldIf) if err != nil { return out, fmt.Errorf("failed to evaluate: %v", err.Error()) @@ -896,7 +897,7 @@ func (mp *messageProcessor) ProcessMessage(m lp2.CCMessage) (lp2.CCMessage, erro } case STAGENAME_MOVE_TAG_META: if len(mp.moveTagToMeta) > 0 { - cclog.ComponentDebug("MessageProcessor", "Move tag to meta") + //cclog.ComponentDebug("MessageProcessor", "Move tag to meta") _, err := 
moveTagToMeta(out, ¶ms, &mp.moveTagToMeta) if err != nil { return out, fmt.Errorf("failed to evaluate: %v", err.Error()) @@ -904,7 +905,7 @@ func (mp *messageProcessor) ProcessMessage(m lp2.CCMessage) (lp2.CCMessage, erro } case STAGENAME_MOVE_TAG_FIELD: if len(mp.moveTagToField) > 0 { - cclog.ComponentDebug("MessageProcessor", "Move tag to fields") + //cclog.ComponentDebug("MessageProcessor", "Move tag to fields") _, err := moveTagToField(out, ¶ms, &mp.moveTagToField) if err != nil { return out, fmt.Errorf("failed to evaluate: %v", err.Error()) @@ -912,7 +913,7 @@ func (mp *messageProcessor) ProcessMessage(m lp2.CCMessage) (lp2.CCMessage, erro } case STAGENAME_MOVE_META_TAG: if len(mp.moveMetaToTag) > 0 { - cclog.ComponentDebug("MessageProcessor", "Move meta to tags") + //cclog.ComponentDebug("MessageProcessor", "Move meta to tags") _, err := moveMetaToTag(out, ¶ms, &mp.moveMetaToTag) if err != nil { return out, fmt.Errorf("failed to evaluate: %v", err.Error()) @@ -920,7 +921,7 @@ func (mp *messageProcessor) ProcessMessage(m lp2.CCMessage) (lp2.CCMessage, erro } case STAGENAME_MOVE_META_FIELD: if len(mp.moveMetaToField) > 0 { - cclog.ComponentDebug("MessageProcessor", "Move meta to fields") + //cclog.ComponentDebug("MessageProcessor", "Move meta to fields") _, err := moveMetaToField(out, ¶ms, &mp.moveMetaToField) if err != nil { return out, fmt.Errorf("failed to evaluate: %v", err.Error()) @@ -928,7 +929,7 @@ func (mp *messageProcessor) ProcessMessage(m lp2.CCMessage) (lp2.CCMessage, erro } case STAGENAME_MOVE_FIELD_META: if len(mp.moveFieldToMeta) > 0 { - cclog.ComponentDebug("MessageProcessor", "Move field to meta") + //cclog.ComponentDebug("MessageProcessor", "Move field to meta") _, err := moveFieldToMeta(out, ¶ms, &mp.moveFieldToMeta) if err != nil { return out, fmt.Errorf("failed to evaluate: %v", err.Error()) @@ -936,7 +937,7 @@ func (mp *messageProcessor) ProcessMessage(m lp2.CCMessage) (lp2.CCMessage, erro } case STAGENAME_MOVE_FIELD_TAG: if 
len(mp.moveFieldToTag) > 0 { - cclog.ComponentDebug("MessageProcessor", "Move field to tags") + //cclog.ComponentDebug("MessageProcessor", "Move field to tags") _, err := moveFieldToTag(out, ¶ms, &mp.moveFieldToTag) if err != nil { return out, fmt.Errorf("failed to evaluate: %v", err.Error()) @@ -944,8 +945,8 @@ func (mp *messageProcessor) ProcessMessage(m lp2.CCMessage) (lp2.CCMessage, erro } case STAGENAME_NORMALIZE_UNIT: if mp.normalizeUnits { - cclog.ComponentDebug("MessageProcessor", "Normalize units") - if lp2.IsMetric(out) { + //cclog.ComponentDebug("MessageProcessor", "Normalize units") + if lp.IsMetric(out) { _, err := normalizeUnits(out) if err != nil { return out, fmt.Errorf("failed to evaluate: %v", err.Error()) @@ -957,8 +958,8 @@ func (mp *messageProcessor) ProcessMessage(m lp2.CCMessage) (lp2.CCMessage, erro case STAGENAME_CHANGE_UNIT_PREFIX: if len(mp.changeUnitPrefix) > 0 { - cclog.ComponentDebug("MessageProcessor", "Change unit prefix") - if lp2.IsMetric(out) { + //cclog.ComponentDebug("MessageProcessor", "Change unit prefix") + if lp.IsMetric(out) { _, err := changeUnitPrefix(out, ¶ms, &mp.changeUnitPrefix) if err != nil { return out, fmt.Errorf("failed to evaluate: %v", err.Error()) diff --git a/pkg/messageProcessor/messageProcessorFuncs.go b/pkg/messageProcessor/messageProcessorFuncs.go index 23c261e..8fa5661 100644 --- a/pkg/messageProcessor/messageProcessorFuncs.go +++ b/pkg/messageProcessor/messageProcessorFuncs.go @@ -5,7 +5,6 @@ import ( "fmt" lp2 "github.com/ClusterCockpit/cc-energy-manager/pkg/cc-message" - cclog "github.com/ClusterCockpit/cc-metric-collector/pkg/ccLogger" units "github.com/ClusterCockpit/cc-units" "github.com/expr-lang/expr" "github.com/expr-lang/expr/vm" @@ -26,49 +25,47 @@ func moveInMessage(message lp2.CCMessage, params *map[string]interface{}, checks if err != nil { return false, fmt.Errorf("failed to evaluate: %v", err.Error()) } - cclog.ComponentDebug("MessageProcessor", "Move from", from, "to", to) + 
//cclog.ComponentDebug("MessageProcessor", "Move from", from, "to", to) if value.(bool) { var v string var ok bool = false switch from { case MESSAGE_LOCATION_TAGS: - cclog.ComponentDebug("MessageProcessor", "Getting tag key", data.Key) + //cclog.ComponentDebug("MessageProcessor", "Getting tag key", data.Key) v, ok = message.GetTag(data.Key) case MESSAGE_LOCATION_META: - cclog.ComponentDebug("MessageProcessor", "Getting meta key", data.Key) - cclog.ComponentDebug("MessageProcessor", message.Meta()) + //cclog.ComponentDebug("MessageProcessor", "Getting meta key", data.Key) + //cclog.ComponentDebug("MessageProcessor", message.Meta()) v, ok = message.GetMeta(data.Key) case MESSAGE_LOCATION_FIELDS: var x interface{} - cclog.ComponentDebug("MessageProcessor", "Getting field key", data.Key) + //cclog.ComponentDebug("MessageProcessor", "Getting field key", data.Key) x, ok = message.GetField(data.Key) v = fmt.Sprintf("%v", x) } if ok { switch from { case MESSAGE_LOCATION_TAGS: - cclog.ComponentDebug("MessageProcessor", "Removing tag key", data.Key) + //cclog.ComponentDebug("MessageProcessor", "Removing tag key", data.Key) message.RemoveTag(data.Key) case MESSAGE_LOCATION_META: - cclog.ComponentDebug("MessageProcessor", "Removing meta key", data.Key) + //cclog.ComponentDebug("MessageProcessor", "Removing meta key", data.Key) message.RemoveMeta(data.Key) case MESSAGE_LOCATION_FIELDS: - cclog.ComponentDebug("MessageProcessor", "Removing field key", data.Key) + //cclog.ComponentDebug("MessageProcessor", "Removing field key", data.Key) message.RemoveField(data.Key) } switch to { case MESSAGE_LOCATION_TAGS: - cclog.ComponentDebug("MessageProcessor", "Adding tag", data.Value, "->", v) + //cclog.ComponentDebug("MessageProcessor", "Adding tag", data.Value, "->", v) message.AddTag(data.Value, v) case MESSAGE_LOCATION_META: - cclog.ComponentDebug("MessageProcessor", "Adding meta", data.Value, "->", v) + //cclog.ComponentDebug("MessageProcessor", "Adding meta", data.Value, "->", v) 
message.AddMeta(data.Value, v) case MESSAGE_LOCATION_FIELDS: - cclog.ComponentDebug("MessageProcessor", "Adding field", data.Value, "->", v) + //cclog.ComponentDebug("MessageProcessor", "Adding field", data.Value, "->", v) message.AddField(data.Value, v) } - } else { - return false, fmt.Errorf("failed to get message entry: %s", data.Key) } } } @@ -88,14 +85,14 @@ func deleteIf(message lp2.CCMessage, params *map[string]interface{}, checks *map case "value", "event", "log", "control": return false, errors.New("cannot delete protected fields") default: - cclog.ComponentDebug("MessageProcessor", "Removing field for", data.Key) + //cclog.ComponentDebug("MessageProcessor", "Removing field for", data.Key) message.RemoveField(data.Key) } case MESSAGE_LOCATION_TAGS: - cclog.ComponentDebug("MessageProcessor", "Removing tag for", data.Key) + //cclog.ComponentDebug("MessageProcessor", "Removing tag for", data.Key) message.RemoveTag(data.Key) case MESSAGE_LOCATION_META: - cclog.ComponentDebug("MessageProcessor", "Removing meta for", data.Key) + //cclog.ComponentDebug("MessageProcessor", "Removing meta for", data.Key) message.RemoveMeta(data.Key) } } @@ -112,13 +109,13 @@ func addIf(message lp2.CCMessage, params *map[string]interface{}, checks *map[*v if value.(bool) { switch location { case MESSAGE_LOCATION_FIELDS: - cclog.ComponentDebug("MessageProcessor", "Adding field", data.Value, "->", data.Value) + //cclog.ComponentDebug("MessageProcessor", "Adding field", data.Value, "->", data.Value) message.AddField(data.Key, data.Value) case MESSAGE_LOCATION_TAGS: - cclog.ComponentDebug("MessageProcessor", "Adding tag", data.Value, "->", data.Value) + //cclog.ComponentDebug("MessageProcessor", "Adding tag", data.Value, "->", data.Value) message.AddTag(data.Key, data.Value) case MESSAGE_LOCATION_META: - cclog.ComponentDebug("MessageProcessor", "Adding meta", data.Value, "->", data.Value) + //cclog.ComponentDebug("MessageProcessor", "Adding meta", data.Value, "->", data.Value) 
message.AddMeta(data.Key, data.Value) } } @@ -191,13 +188,13 @@ func normalizeUnits(message lp2.CCMessage) (bool, error) { if in_unit, ok := message.GetMeta("unit"); ok { u := units.NewUnit(in_unit) if u.Valid() { - cclog.ComponentDebug("MessageProcessor", "Update unit with", u.Short()) + //cclog.ComponentDebug("MessageProcessor", "Update unit with", u.Short()) message.AddMeta("unit", u.Short()) } } else if in_unit, ok := message.GetTag("unit"); ok { u := units.NewUnit(in_unit) if u.Valid() { - cclog.ComponentDebug("MessageProcessor", "Update unit with", u.Short()) + //cclog.ComponentDebug("MessageProcessor", "Update unit with", u.Short()) message.AddTag("unit", u.Short()) } } @@ -212,15 +209,15 @@ func changeUnitPrefix(message lp2.CCMessage, params *map[string]interface{}, che } if value.(bool) { newPrefix := units.NewPrefix(n) - cclog.ComponentDebug("MessageProcessor", "Condition matches, change to prefix", newPrefix.String()) + //cclog.ComponentDebug("MessageProcessor", "Condition matches, change to prefix", newPrefix.String()) if in_unit, ok := message.GetMeta("unit"); ok && newPrefix != units.InvalidPrefix { u := units.NewUnit(in_unit) if u.Valid() { - cclog.ComponentDebug("MessageProcessor", "Input unit", u.Short()) + //cclog.ComponentDebug("MessageProcessor", "Input unit", u.Short()) conv, out_unit := units.GetUnitPrefixFactor(u, newPrefix) if conv != nil && out_unit.Valid() { if val, ok := message.GetField("value"); ok { - cclog.ComponentDebug("MessageProcessor", "Update unit with", out_unit.Short()) + //cclog.ComponentDebug("MessageProcessor", "Update unit with", out_unit.Short()) message.AddField("value", conv(val)) message.AddMeta("unit", out_unit.Short()) } @@ -230,11 +227,11 @@ func changeUnitPrefix(message lp2.CCMessage, params *map[string]interface{}, che } else if in_unit, ok := message.GetTag("unit"); ok && newPrefix != units.InvalidPrefix { u := units.NewUnit(in_unit) if u.Valid() { - cclog.ComponentDebug("MessageProcessor", "Input unit", 
u.Short()) + //cclog.ComponentDebug("MessageProcessor", "Input unit", u.Short()) conv, out_unit := units.GetUnitPrefixFactor(u, newPrefix) if conv != nil && out_unit.Valid() { if val, ok := message.GetField("value"); ok { - cclog.ComponentDebug("MessageProcessor", "Update unit with", out_unit.Short()) + //cclog.ComponentDebug("MessageProcessor", "Update unit with", out_unit.Short()) message.AddField("value", conv(val)) message.AddTag("unit", out_unit.Short()) } @@ -255,9 +252,9 @@ func renameMessagesIf(message lp2.CCMessage, params *map[string]interface{}, che } if value.(bool) { old := message.Name() - cclog.ComponentDebug("MessageProcessor", "Rename to", n) + //cclog.ComponentDebug("MessageProcessor", "Rename to", n) message.SetName(n) - cclog.ComponentDebug("MessageProcessor", "Add old name as 'oldname' to meta", old) + //cclog.ComponentDebug("MessageProcessor", "Add old name as 'oldname' to meta", old) message.AddMeta("oldname", old) } } diff --git a/receivers/httpReceiver.go b/receivers/httpReceiver.go index f37c629..d7965c6 100644 --- a/receivers/httpReceiver.go +++ b/receivers/httpReceiver.go @@ -10,15 +10,16 @@ import ( "sync" "time" + lp "github.com/ClusterCockpit/cc-energy-manager/pkg/cc-message" cclog "github.com/ClusterCockpit/cc-metric-collector/pkg/ccLogger" - lp "github.com/ClusterCockpit/cc-metric-collector/pkg/ccMetric" + mp "github.com/ClusterCockpit/cc-metric-collector/pkg/messageProcessor" influx "github.com/influxdata/line-protocol/v2/lineprotocol" ) const HTTP_RECEIVER_PORT = "8080" type HttpReceiverConfig struct { - Type string `json:"type"` + defaultReceiverConfig Addr string `json:"address"` Port string `json:"port"` Path string `json:"path"` @@ -39,7 +40,7 @@ type HttpReceiverConfig struct { type HttpReceiver struct { receiver - meta map[string]string + //meta map[string]string config HttpReceiverConfig server *http.Server wg sync.WaitGroup @@ -85,8 +86,20 @@ func (r *HttpReceiver) Init(name string, config json.RawMessage) error { if 
r.config.useBasicAuth && len(r.config.Password) == 0 { return errors.New("basic authentication requires password") } + msgp, err := mp.NewMessageProcessor() + if err != nil { + return fmt.Errorf("initialization of message processor failed: %v", err.Error()) + } + r.mp = msgp + if len(r.config.MessageProcessor) > 0 { + err = r.mp.FromConfigJSON(r.config.MessageProcessor) + if err != nil { + return fmt.Errorf("failed parsing JSON for message processor: %v", err.Error()) + } + } + r.mp.AddAddMetaByCondition("true", "source", r.name) - r.meta = map[string]string{"source": r.name} + //r.meta = map[string]string{"source": r.name} p := r.config.Path if !strings.HasPrefix(p, "/") { p = "/" + p @@ -137,80 +150,82 @@ func (r *HttpReceiver) ServerHttp(w http.ResponseWriter, req *http.Request) { return } } + if r.sink != nil { + d := influx.NewDecoder(req.Body) + for d.Next() { - d := influx.NewDecoder(req.Body) - for d.Next() { - - // Decode measurement name - measurement, err := d.Measurement() - if err != nil { - msg := "ServerHttp: Failed to decode measurement: " + err.Error() - cclog.ComponentError(r.name, msg) - http.Error(w, msg, http.StatusInternalServerError) - return - } - - // Decode tags - tags := make(map[string]string) - for { - key, value, err := d.NextTag() + // Decode measurement name + measurement, err := d.Measurement() if err != nil { - msg := "ServerHttp: Failed to decode tag: " + err.Error() + msg := "ServerHttp: Failed to decode measurement: " + err.Error() cclog.ComponentError(r.name, msg) http.Error(w, msg, http.StatusInternalServerError) return } - if key == nil { - break - } - tags[string(key)] = string(value) - } - // Decode fields - fields := make(map[string]interface{}) - for { - key, value, err := d.NextField() + // Decode tags + tags := make(map[string]string) + for { + key, value, err := d.NextTag() + if err != nil { + msg := "ServerHttp: Failed to decode tag: " + err.Error() + cclog.ComponentError(r.name, msg) + http.Error(w, msg, 
http.StatusInternalServerError) + return + } + if key == nil { + break + } + tags[string(key)] = string(value) + } + + // Decode fields + fields := make(map[string]interface{}) + for { + key, value, err := d.NextField() + if err != nil { + msg := "ServerHttp: Failed to decode field: " + err.Error() + cclog.ComponentError(r.name, msg) + http.Error(w, msg, http.StatusInternalServerError) + return + } + if key == nil { + break + } + fields[string(key)] = value.Interface() + } + + // Decode time stamp + t, err := d.Time(influx.Nanosecond, time.Time{}) if err != nil { - msg := "ServerHttp: Failed to decode field: " + err.Error() + msg := "ServerHttp: Failed to decode time stamp: " + err.Error() cclog.ComponentError(r.name, msg) http.Error(w, msg, http.StatusInternalServerError) return } - if key == nil { - break - } - fields[string(key)] = value.Interface() - } - // Decode time stamp - t, err := d.Time(influx.Nanosecond, time.Time{}) + y, _ := lp.NewMessage( + string(measurement), + tags, + nil, + fields, + t, + ) + + m, err := r.mp.ProcessMessage(y) + if err == nil && m != nil { + r.sink <- m + } + + } + // Check for IO errors + err := d.Err() if err != nil { - msg := "ServerHttp: Failed to decode time stamp: " + err.Error() + msg := "ServerHttp: Failed to decode: " + err.Error() cclog.ComponentError(r.name, msg) http.Error(w, msg, http.StatusInternalServerError) return } - - y, _ := lp.New( - string(measurement), - tags, - r.meta, - fields, - t, - ) - - if r.sink != nil { - r.sink <- y - } - } - - // Check for IO errors - err := d.Err() - if err != nil { - msg := "ServerHttp: Failed to decode: " + err.Error() - cclog.ComponentError(r.name, msg) - http.Error(w, msg, http.StatusInternalServerError) - return } w.WriteHeader(http.StatusOK) diff --git a/receivers/ipmiReceiver.go b/receivers/ipmiReceiver.go index e3d544a..9f045ee 100644 --- a/receivers/ipmiReceiver.go +++ b/receivers/ipmiReceiver.go @@ -13,9 +13,10 @@ import ( "sync" "time" + lp 
"github.com/ClusterCockpit/cc-energy-manager/pkg/cc-message" cclog "github.com/ClusterCockpit/cc-metric-collector/pkg/ccLogger" - lp "github.com/ClusterCockpit/cc-metric-collector/pkg/ccMetric" "github.com/ClusterCockpit/cc-metric-collector/pkg/hostlist" + mp "github.com/ClusterCockpit/cc-metric-collector/pkg/messageProcessor" ) type IPMIReceiverClientConfig struct { @@ -31,11 +32,13 @@ type IPMIReceiverClientConfig struct { Password string // Password to use for authentication CLIOptions []string // Additional command line options for ipmi-sensors isExcluded map[string]bool // is metric excluded + mp mp.MessageProcessor } type IPMIReceiver struct { receiver config struct { + defaultReceiverConfig Interval time.Duration // Client config for each IPMI hosts @@ -43,10 +46,11 @@ type IPMIReceiver struct { } // Storage for static information - meta map[string]string + //meta map[string]string done chan bool // channel to finish / stop IPMI receiver wg sync.WaitGroup // wait group for IPMI receiver + mp mp.MessageProcessor } // doReadMetrics reads metrics from all configure IPMI hosts. 
@@ -213,7 +217,7 @@ func (r *IPMIReceiver) doReadMetric() { continue } - y, err := lp.New( + y, err := lp.NewMessage( metric, map[string]string{ "hostname": host, @@ -230,7 +234,14 @@ func (r *IPMIReceiver) doReadMetric() { }, time.Now()) if err == nil { - r.sink <- y + mc, err := clientConfig.mp.ProcessMessage(y) + if err == nil && mc != nil { + m, err := r.mp.ProcessMessage(mc) + if err == nil && m != nil { + r.sink <- m + } + } + } } @@ -296,11 +307,12 @@ func (r *IPMIReceiver) Close() { // NewIPMIReceiver creates a new instance of the redfish receiver // Initialize the receiver by giving it a name and reading in the config JSON func NewIPMIReceiver(name string, config json.RawMessage) (Receiver, error) { + var err error r := new(IPMIReceiver) // Config options from config file configJSON := struct { - Type string `json:"type"` + defaultReceiverConfig // How often the IPMI sensor metrics should be read and send to the sink (default: 30 s) IntervalString string `json:"interval,omitempty"` @@ -331,7 +343,8 @@ func NewIPMIReceiver(name string, config json.RawMessage) (Receiver, error) { ExcludeMetrics []string `json:"exclude_metrics,omitempty"` // Additional command line options for ipmi-sensors - CLIOptions []string `json:"cli_options,omitempty"` + CLIOptions []string `json:"cli_options,omitempty"` + MessageProcessor json.RawMessage `json:"process_messages,omitempty"` } `json:"client_config"` }{ // Set defaults values @@ -347,8 +360,15 @@ func NewIPMIReceiver(name string, config json.RawMessage) (Receiver, error) { // Create done channel r.done = make(chan bool) + p, err := mp.NewMessageProcessor() + if err != nil { + return nil, fmt.Errorf("initialization of message processor failed: %v", err.Error()) + } + r.mp = p + // Set static information - r.meta = map[string]string{"source": r.name} + //r.meta = map[string]string{"source": r.name} + r.mp.AddAddMetaByCondition("true", "source", r.name) // Read the IPMI receiver specific JSON config if len(config) > 0 { @@ 
-360,12 +380,18 @@ func NewIPMIReceiver(name string, config json.RawMessage) (Receiver, error) { } } + if len(r.config.MessageProcessor) > 0 { + err = r.mp.FromConfigJSON(r.config.MessageProcessor) + if err != nil { + return nil, fmt.Errorf("failed parsing JSON for message processor: %v", err.Error()) + } + } // Convert interval string representation to duration - var err error + r.config.Interval, err = time.ParseDuration(configJSON.IntervalString) if err != nil { err := fmt.Errorf( - "Failed to parse duration string interval='%s': %w", + "failed to parse duration string interval='%s': %w", configJSON.IntervalString, err, ) @@ -506,6 +532,16 @@ func NewIPMIReceiver(name string, config json.RawMessage) (Receiver, error) { for _, key := range configJSON.ExcludeMetrics { isExcluded[key] = true } + p, err := mp.NewMessageProcessor() + if err != nil { + return nil, fmt.Errorf("initialization of message processor failed: %v", err.Error()) + } + if len(clientConfigJSON.MessageProcessor) > 0 { + err = p.FromConfigJSON(clientConfigJSON.MessageProcessor) + if err != nil { + return nil, fmt.Errorf("failed parsing JSON for message processor: %v", err.Error()) + } + } r.config.ClientConfigs = append( r.config.ClientConfigs, @@ -520,6 +556,7 @@ func NewIPMIReceiver(name string, config json.RawMessage) (Receiver, error) { Password: password, CLIOptions: cliOptions, isExcluded: isExcluded, + mp: p, }) } diff --git a/receivers/metricReceiver.go b/receivers/metricReceiver.go index 1edef4e..609eab7 100644 --- a/receivers/metricReceiver.go +++ b/receivers/metricReceiver.go @@ -1,11 +1,15 @@ package receivers import ( - lp "github.com/ClusterCockpit/cc-metric-collector/pkg/ccMetric" + "encoding/json" + + lp "github.com/ClusterCockpit/cc-energy-manager/pkg/cc-message" + mp "github.com/ClusterCockpit/cc-metric-collector/pkg/messageProcessor" ) type defaultReceiverConfig struct { - Type string `json:"type"` + Type string `json:"type"` + MessageProcessor json.RawMessage 
`json:"process_messages,omitempty"` } // Receiver configuration: Listen address, port @@ -19,14 +23,15 @@ type ReceiverConfig struct { type receiver struct { name string - sink chan lp.CCMetric + sink chan lp.CCMessage + mp mp.MessageProcessor } type Receiver interface { Start() - Close() // Close / finish metric receiver - Name() string // Name of the metric receiver - SetSink(sink chan lp.CCMetric) // Set sink channel + Close() // Close / finish metric receiver + Name() string // Name of the metric receiver + SetSink(sink chan lp.CCMessage) // Set sink channel } // Name returns the name of the metric receiver @@ -35,6 +40,6 @@ func (r *receiver) Name() string { } // SetSink set the sink channel -func (r *receiver) SetSink(sink chan lp.CCMetric) { +func (r *receiver) SetSink(sink chan lp.CCMessage) { r.sink = sink } diff --git a/receivers/natsReceiver.go b/receivers/natsReceiver.go index ea0cc3b..ffb6dab 100644 --- a/receivers/natsReceiver.go +++ b/receivers/natsReceiver.go @@ -4,25 +4,30 @@ import ( "encoding/json" "errors" "fmt" + "os" "time" + lp "github.com/ClusterCockpit/cc-energy-manager/pkg/cc-message" cclog "github.com/ClusterCockpit/cc-metric-collector/pkg/ccLogger" - lp "github.com/ClusterCockpit/cc-metric-collector/pkg/ccMetric" + mp "github.com/ClusterCockpit/cc-metric-collector/pkg/messageProcessor" influx "github.com/influxdata/line-protocol/v2/lineprotocol" nats "github.com/nats-io/nats.go" ) type NatsReceiverConfig struct { - Type string `json:"type"` + defaultReceiverConfig Addr string `json:"address"` Port string `json:"port"` Subject string `json:"subject"` + User string `json:"user,omitempty"` + Password string `json:"password,omitempty"` + NkeyFile string `json:"nkey_file,omitempty"` } type NatsReceiver struct { receiver - nc *nats.Conn - meta map[string]string + nc *nats.Conn + //meta map[string]string config NatsReceiverConfig } @@ -36,65 +41,68 @@ func (r *NatsReceiver) Start() { // _NatsReceive receives subscribed messages from the NATS 
server func (r *NatsReceiver) _NatsReceive(m *nats.Msg) { - d := influx.NewDecoderWithBytes(m.Data) - for d.Next() { + if r.sink != nil { + d := influx.NewDecoderWithBytes(m.Data) + for d.Next() { - // Decode measurement name - measurement, err := d.Measurement() - if err != nil { - msg := "_NatsReceive: Failed to decode measurement: " + err.Error() - cclog.ComponentError(r.name, msg) - return - } - - // Decode tags - tags := make(map[string]string) - for { - key, value, err := d.NextTag() + // Decode measurement name + measurement, err := d.Measurement() if err != nil { - msg := "_NatsReceive: Failed to decode tag: " + err.Error() + msg := "_NatsReceive: Failed to decode measurement: " + err.Error() cclog.ComponentError(r.name, msg) return } - if key == nil { - break - } - tags[string(key)] = string(value) - } - // Decode fields - fields := make(map[string]interface{}) - for { - key, value, err := d.NextField() + // Decode tags + tags := make(map[string]string) + for { + key, value, err := d.NextTag() + if err != nil { + msg := "_NatsReceive: Failed to decode tag: " + err.Error() + cclog.ComponentError(r.name, msg) + return + } + if key == nil { + break + } + tags[string(key)] = string(value) + } + + // Decode fields + fields := make(map[string]interface{}) + for { + key, value, err := d.NextField() + if err != nil { + msg := "_NatsReceive: Failed to decode field: " + err.Error() + cclog.ComponentError(r.name, msg) + return + } + if key == nil { + break + } + fields[string(key)] = value.Interface() + } + + // Decode time stamp + t, err := d.Time(influx.Nanosecond, time.Time{}) if err != nil { - msg := "_NatsReceive: Failed to decode field: " + err.Error() + msg := "_NatsReceive: Failed to decode time: " + err.Error() cclog.ComponentError(r.name, msg) return } - if key == nil { - break + + y, _ := lp.NewMessage( + string(measurement), + tags, + nil, + fields, + t, + ) + + m, err := r.mp.ProcessMessage(y) + if err == nil && m != nil { + r.sink <- m } - 
fields[string(key)] = value.Interface() - } - - // Decode time stamp - t, err := d.Time(influx.Nanosecond, time.Time{}) - if err != nil { - msg := "_NatsReceive: Failed to decode time: " + err.Error() - cclog.ComponentError(r.name, msg) - return - } - - y, _ := lp.New( - string(measurement), - tags, - r.meta, - fields, - t, - ) - - if r.sink != nil { - r.sink <- y } } } @@ -109,6 +117,7 @@ func (r *NatsReceiver) Close() { // NewNatsReceiver creates a new Receiver which subscribes to messages from a NATS server func NewNatsReceiver(name string, config json.RawMessage) (Receiver, error) { + var uinfo nats.Option = nil r := new(NatsReceiver) r.name = fmt.Sprintf("NatsReceiver(%s)", name) @@ -127,16 +136,40 @@ func NewNatsReceiver(name string, config json.RawMessage) (Receiver, error) { len(r.config.Subject) == 0 { return nil, errors.New("not all configuration variables set required by NatsReceiver") } + p, err := mp.NewMessageProcessor() + if err != nil { + return nil, fmt.Errorf("initialization of message processor failed: %v", err.Error()) + } + r.mp = p + if len(r.config.MessageProcessor) > 0 { + err = r.mp.FromConfigJSON(r.config.MessageProcessor) + if err != nil { + return nil, fmt.Errorf("failed parsing JSON for message processor: %v", err.Error()) + } + } // Set metadata - r.meta = map[string]string{ - "source": r.name, + // r.meta = map[string]string{ + // "source": r.name, + // } + r.mp.AddAddMetaByCondition("true", "source", r.name) + + if len(r.config.User) > 0 && len(r.config.Password) > 0 { + uinfo = nats.UserInfo(r.config.User, r.config.Password) + } else if len(r.config.NkeyFile) > 0 { + _, err := os.Stat(r.config.NkeyFile) + if err == nil { + uinfo = nats.UserCredentials(r.config.NkeyFile) + } else { + cclog.ComponentError(r.name, "NKEY file", r.config.NkeyFile, "does not exist: %v", err.Error()) + return nil, err + } } // Connect to NATS server url := fmt.Sprintf("nats://%s:%s", r.config.Addr, r.config.Port) cclog.ComponentDebug(r.name, 
"NewNatsReceiver", url, "Subject", r.config.Subject) - if nc, err := nats.Connect(url); err == nil { + if nc, err := nats.Connect(url, uinfo); err == nil { r.nc = nc } else { r.nc = nil diff --git a/receivers/natsReceiver.md b/receivers/natsReceiver.md index d0b2166..0882dcf 100644 --- a/receivers/natsReceiver.md +++ b/receivers/natsReceiver.md @@ -10,7 +10,10 @@ The `nats` receiver can be used receive metrics from the NATS network. The `nats "type": "nats", "address" : "nats-server.example.org", "port" : "4222", - "subject" : "subject" + "subject" : "subject", + "user": "natsuser", + "password": "natssecret", + "nkey_file": "/path/to/nkey_file" } } ``` @@ -19,6 +22,9 @@ The `nats` receiver can be used receive metrics from the NATS network. The `nats - `address`: Address of the NATS control server - `port`: Port of the NATS control server - `subject`: Subscribes to this subject and receive metrics +- `user`: Connect to nats using this user +- `password`: Connect to nats using this password +- `nkey_file`: Path to credentials file with NKEY ### Debugging diff --git a/receivers/prometheusReceiver.go b/receivers/prometheusReceiver.go index 7846c1d..3e777a8 100644 --- a/receivers/prometheusReceiver.go +++ b/receivers/prometheusReceiver.go @@ -13,7 +13,7 @@ import ( "time" cclog "github.com/ClusterCockpit/cc-metric-collector/pkg/ccLogger" - lp "github.com/ClusterCockpit/cc-metric-collector/pkg/ccMetric" + lp "github.com/ClusterCockpit/cc-energy-manager/pkg/cc-message" ) type PrometheusReceiverConfig struct { @@ -74,7 +74,7 @@ func (r *PrometheusReceiver) Start() { } value, err := strconv.ParseFloat(lineSplit[1], 64) if err == nil { - y, err := lp.New(name, tags, r.meta, map[string]interface{}{"value": value}, t) + y, err := lp.NewMessage(name, tags, r.meta, map[string]interface{}{"value": value}, t) if err == nil { r.sink <- y } diff --git a/receivers/receiveManager.go b/receivers/receiveManager.go index 47325b2..ac3d998 100644 --- a/receivers/receiveManager.go +++ 
b/receivers/receiveManager.go @@ -7,7 +7,7 @@ import ( "sync" cclog "github.com/ClusterCockpit/cc-metric-collector/pkg/ccLogger" - lp "github.com/ClusterCockpit/cc-metric-collector/pkg/ccMetric" + lp "github.com/ClusterCockpit/cc-energy-manager/pkg/cc-message" ) var AvailableReceivers = map[string]func(name string, config json.RawMessage) (Receiver, error){ @@ -19,14 +19,14 @@ var AvailableReceivers = map[string]func(name string, config json.RawMessage) (R type receiveManager struct { inputs []Receiver - output chan lp.CCMetric + output chan lp.CCMessage config []json.RawMessage } type ReceiveManager interface { Init(wg *sync.WaitGroup, receiverConfigFile string) error AddInput(name string, rawConfig json.RawMessage) error - AddOutput(output chan lp.CCMetric) + AddOutput(output chan lp.CCMessage) Start() Close() } @@ -93,7 +93,7 @@ func (rm *receiveManager) AddInput(name string, rawConfig json.RawMessage) error return nil } -func (rm *receiveManager) AddOutput(output chan lp.CCMetric) { +func (rm *receiveManager) AddOutput(output chan lp.CCMessage) { rm.output = output for _, r := range rm.inputs { r.SetSink(rm.output) diff --git a/receivers/redfishReceiver.go b/receivers/redfishReceiver.go index 7a712b5..b237231 100644 --- a/receivers/redfishReceiver.go +++ b/receivers/redfishReceiver.go @@ -13,9 +13,10 @@ import ( "sync" "time" + lp "github.com/ClusterCockpit/cc-energy-manager/pkg/cc-message" cclog "github.com/ClusterCockpit/cc-metric-collector/pkg/ccLogger" - lp "github.com/ClusterCockpit/cc-metric-collector/pkg/ccMetric" "github.com/ClusterCockpit/cc-metric-collector/pkg/hostlist" + mp "github.com/ClusterCockpit/cc-metric-collector/pkg/messageProcessor" // See: https://pkg.go.dev/github.com/stmcginnis/gofish "github.com/stmcginnis/gofish" @@ -42,6 +43,8 @@ type RedfishReceiverClientConfig struct { readSensorURLs map[string][]string gofish gofish.ClientConfig + + mp mp.MessageProcessor } // RedfishReceiver configuration: @@ -49,6 +52,7 @@ type RedfishReceiver 
struct { receiver config struct { + defaultReceiverConfig fanout int Interval time.Duration HttpTimeout time.Duration @@ -79,13 +83,19 @@ func setMetricValue(value any) map[string]interface{} { } // sendMetric sends the metric through the sink channel -func (r *RedfishReceiver) sendMetric(name string, tags map[string]string, meta map[string]string, value any, timestamp time.Time) { +func (r *RedfishReceiver) sendMetric(mp mp.MessageProcessor, name string, tags map[string]string, meta map[string]string, value any, timestamp time.Time) { deleteEmptyTags(tags) deleteEmptyTags(meta) - y, err := lp.New(name, tags, meta, setMetricValue(value), timestamp) + y, err := lp.NewMessage(name, tags, meta, setMetricValue(value), timestamp) if err == nil { - r.sink <- y + mc, err := mp.ProcessMessage(y) + if err == nil && mc != nil { + m, err := r.mp.ProcessMessage(mc) + if err == nil && m != nil { + r.sink <- m + } + } } } @@ -119,7 +129,7 @@ func (r *RedfishReceiver) readSensors( "unit": "degC", } - r.sendMetric("temperature", tags, meta, sensor.Reading, time.Now()) + r.sendMetric(clientConfig.mp, "temperature", tags, meta, sensor.Reading, time.Now()) } writeFanSpeedSensor := func(sensor *redfish.Sensor) { @@ -145,7 +155,7 @@ func (r *RedfishReceiver) readSensors( "unit": string(sensor.ReadingUnits), } - r.sendMetric("fan_speed", tags, meta, sensor.Reading, time.Now()) + r.sendMetric(clientConfig.mp, "fan_speed", tags, meta, sensor.Reading, time.Now()) } writePowerSensor := func(sensor *redfish.Sensor) { @@ -172,7 +182,7 @@ func (r *RedfishReceiver) readSensors( "unit": "watts", } - r.sendMetric("power", tags, meta, sensor.Reading, time.Now()) + r.sendMetric(clientConfig.mp, "power", tags, meta, sensor.Reading, time.Now()) } if _, ok := clientConfig.readSensorURLs[chassis.ID]; !ok { @@ -340,7 +350,7 @@ func (r *RedfishReceiver) readThermalMetrics( // ReadingCelsius shall be the current value of the temperature sensor's reading. 
value := temperature.ReadingCelsius - r.sendMetric("temperature", tags, meta, value, timestamp) + r.sendMetric(clientConfig.mp, "temperature", tags, meta, value, timestamp) } for _, fan := range thermal.Fans { @@ -381,7 +391,7 @@ func (r *RedfishReceiver) readThermalMetrics( "unit": string(fan.ReadingUnits), } - r.sendMetric("fan_speed", tags, meta, fan.Reading, timestamp) + r.sendMetric(clientConfig.mp, "fan_speed", tags, meta, fan.Reading, timestamp) } return nil @@ -479,7 +489,7 @@ func (r *RedfishReceiver) readPowerMetrics( } for name, value := range metrics { - r.sendMetric(name, tags, meta, value, timestamp) + r.sendMetric(clientConfig.mp, name, tags, meta, value, timestamp) } } @@ -561,7 +571,7 @@ func (r *RedfishReceiver) readProcessorMetrics( if !clientConfig.isExcluded[namePower] && // Some servers return "ConsumedPowerWatt":65535 instead of "ConsumedPowerWatt":null processorMetrics.ConsumedPowerWatt != 65535 { - r.sendMetric(namePower, tags, metaPower, processorMetrics.ConsumedPowerWatt, timestamp) + r.sendMetric(clientConfig.mp, namePower, tags, metaPower, processorMetrics.ConsumedPowerWatt, timestamp) } // Set meta data tags metaThermal := map[string]string{ @@ -573,7 +583,7 @@ func (r *RedfishReceiver) readProcessorMetrics( nameThermal := "temperature" if !clientConfig.isExcluded[nameThermal] { - r.sendMetric(nameThermal, tags, metaThermal, processorMetrics.TemperatureCelsius, timestamp) + r.sendMetric(clientConfig.mp, nameThermal, tags, metaThermal, processorMetrics.TemperatureCelsius, timestamp) } return nil } @@ -776,11 +786,13 @@ func (r *RedfishReceiver) Close() { // NewRedfishReceiver creates a new instance of the redfish receiver // Initialize the receiver by giving it a name and reading in the config JSON func NewRedfishReceiver(name string, config json.RawMessage) (Receiver, error) { + var err error r := new(RedfishReceiver) // Config options from config file configJSON := struct { - Type string `json:"type"` + Type string `json:"type"` + 
MessageProcessor json.RawMessage `json:"process_messages,omitempty"` // Maximum number of simultaneous redfish connections (default: 64) Fanout int `json:"fanout,omitempty"` @@ -820,7 +832,8 @@ func NewRedfishReceiver(name string, config json.RawMessage) (Receiver, error) { DisableThermalMetrics bool `json:"disable_thermal_metrics"` // Per client excluded metrics - ExcludeMetrics []string `json:"exclude_metrics,omitempty"` + ExcludeMetrics []string `json:"exclude_metrics,omitempty"` + MessageProcessor json.RawMessage `json:"process_messages,omitempty"` } `json:"client_config"` }{ // Set defaults values @@ -846,13 +859,24 @@ func NewRedfishReceiver(name string, config json.RawMessage) (Receiver, error) { return nil, err } } + p, err := mp.NewMessageProcessor() + if err != nil { + return nil, fmt.Errorf("initialization of message processor failed: %v", err.Error()) + } + r.mp = p + if len(r.config.MessageProcessor) > 0 { + err = r.mp.FromConfigJSON(r.config.MessageProcessor) + if err != nil { + return nil, fmt.Errorf("failed parsing JSON for message processor: %v", err.Error()) + } + } // Convert interval string representation to duration - var err error + r.config.Interval, err = time.ParseDuration(configJSON.IntervalString) if err != nil { err := fmt.Errorf( - "Failed to parse duration string interval='%s': %w", + "failed to parse duration string interval='%s': %w", configJSON.IntervalString, err, ) @@ -864,7 +888,7 @@ func NewRedfishReceiver(name string, config json.RawMessage) (Receiver, error) { r.config.HttpTimeout, err = time.ParseDuration(configJSON.HttpTimeoutString) if err != nil { err := fmt.Errorf( - "Failed to parse duration string http_timeout='%s': %w", + "failed to parse duration string http_timeout='%s': %w", configJSON.HttpTimeoutString, err, ) @@ -948,6 +972,18 @@ func NewRedfishReceiver(name string, config json.RawMessage) (Receiver, error) { for _, key := range configJSON.ExcludeMetrics { isExcluded[key] = true } + p, err = 
mp.NewMessageProcessor() + if err != nil { + cclog.ComponentError(r.name, err.Error()) + return nil, err + } + if len(clientConfigJSON.MessageProcessor) > 0 { + err = p.FromConfigJSON(clientConfigJSON.MessageProcessor) + if err != nil { + cclog.ComponentError(r.name, err.Error()) + return nil, err + } + } hostList, err := hostlist.Expand(clientConfigJSON.HostList) if err != nil { @@ -978,6 +1014,7 @@ func NewRedfishReceiver(name string, config json.RawMessage) (Receiver, error) { Endpoint: endpoint, HTTPClient: httpClient, }, + mp: p, }) } @@ -1002,7 +1039,7 @@ func NewRedfishReceiver(name string, config json.RawMessage) (Receiver, error) { for i := range r.config.ClientConfigs { host := r.config.ClientConfigs[i].Hostname if isDuplicate[host] { - err := fmt.Errorf("Found duplicate client config for host %s", host) + err := fmt.Errorf("found duplicate client config for host %s", host) cclog.ComponentError(r.name, err) return nil, err } diff --git a/receivers/sampleReceiver.go b/receivers/sampleReceiver.go index 86e68cd..8fe7e1c 100644 --- a/receivers/sampleReceiver.go +++ b/receivers/sampleReceiver.go @@ -5,11 +5,13 @@ import ( "fmt" cclog "github.com/ClusterCockpit/cc-metric-collector/pkg/ccLogger" + mp "github.com/ClusterCockpit/cc-metric-collector/pkg/messageProcessor" ) // SampleReceiver configuration: receiver type, listen address, port +// The defaultReceiverConfig contains the keys 'type' and 'process_messages' type SampleReceiverConfig struct { - Type string `json:"type"` + defaultReceiverConfig Addr string `json:"address"` Port string `json:"port"` } @@ -19,7 +21,6 @@ type SampleReceiver struct { config SampleReceiverConfig // Storage for static information - meta map[string]string // Use in case of own go routine // done chan bool // wg sync.WaitGroup @@ -79,8 +80,19 @@ func NewSampleReceiver(name string, config json.RawMessage) (Receiver, error) { // The name should be chosen in such a way that different instances of SampleReceiver can be distinguished 
r.name = fmt.Sprintf("SampleReceiver(%s)", name) + // create new message processor + p, err := mp.NewMessageProcessor() + if err != nil { + cclog.ComponentError(r.name, "Initialization of message processor failed:", err.Error()) + return nil, fmt.Errorf("initialization of message processor failed: %v", err.Error()) + } + r.mp = p // Set static information - r.meta = map[string]string{"source": r.name} + err = r.mp.AddAddMetaByCondition("true", "source", r.name) + if err != nil { + cclog.ComponentError(r.name, fmt.Sprintf("Failed to add static information source=%s:", r.name), err.Error()) + return nil, fmt.Errorf("failed to add static information source=%s: %v", r.name, err.Error()) + } // Set defaults in r.config // Allow overwriting these defaults by reading config JSON @@ -94,6 +106,15 @@ func NewSampleReceiver(name string, config json.RawMessage) (Receiver, error) { } } + // Add message processor config + if len(r.config.MessageProcessor) > 0 { + err = r.mp.FromConfigJSON(r.config.MessageProcessor) + if err != nil { + cclog.ComponentError(r.name, "Failed parsing JSON for message processor:", err.Error()) + return nil, fmt.Errorf("failed parsing JSON for message processor: %v", err.Error()) + } + } + // Check that all required fields in the configuration are set // Use 'if len(r.config.Option) > 0' for strings diff --git a/router.json b/router.json index a9f8714..3eae5ee 100644 --- a/router.json +++ b/router.json @@ -1,22 +1,23 @@ { - "add_tags" : [ - { - "key" : "cluster", - "value" : "testcluster", - "if" : "*" - }, - { - "key" : "test", - "value" : "testing", - "if" : "name == 'temp_package_id_0'" - } - ], - "delete_tags" : [ - { - "key" : "unit", - "value" : "*", - "if" : "*" - } - ], + "process_messages" : { + "add_tag_if": [ + { + "key" : "cluster", + "value" : "testcluster", + "if" : "true" + }, + { + "key" : "test", + "value" : "testing", + "if" : "name == 'temp_package_id_0'" + } + ], + "delete_tag_if": [ + { + "key" : "unit", + "if" : "true" + } + ] + 
}, "interval_timestamp" : true } diff --git a/sinks/gangliaCommon.go b/sinks/gangliaCommon.go index 1c846f0..010b94a 100644 --- a/sinks/gangliaCommon.go +++ b/sinks/gangliaCommon.go @@ -4,10 +4,10 @@ import ( "fmt" "strings" - lp "github.com/ClusterCockpit/cc-metric-collector/pkg/ccMetric" + lp "github.com/ClusterCockpit/cc-energy-manager/pkg/cc-message" ) -func GangliaMetricName(point lp.CCMetric) string { +func GangliaMetricName(point lp.CCMessage) string { name := point.Name() metricType, typeOK := point.GetTag("type") metricTid, tidOk := point.GetTag("type-id") @@ -39,7 +39,7 @@ func GangliaMetricRename(name string) string { return name } -func GangliaSlopeType(point lp.CCMetric) uint { +func GangliaSlopeType(point lp.CCMessage) uint { name := point.Name() if name == "mem_total" || name == "swap_total" { return 0 @@ -151,7 +151,7 @@ type GangliaMetricConfig struct { Name string } -func GetCommonGangliaConfig(point lp.CCMetric) GangliaMetricConfig { +func GetCommonGangliaConfig(point lp.CCMessage) GangliaMetricConfig { mname := GangliaMetricRename(point.Name()) if oldname, ok := point.GetMeta("oldname"); ok { mname = GangliaMetricRename(oldname) @@ -207,7 +207,7 @@ func GetCommonGangliaConfig(point lp.CCMetric) GangliaMetricConfig { } } -func GetGangliaConfig(point lp.CCMetric) GangliaMetricConfig { +func GetGangliaConfig(point lp.CCMessage) GangliaMetricConfig { mname := GangliaMetricRename(point.Name()) if oldname, ok := point.GetMeta("oldname"); ok { mname = GangliaMetricRename(oldname) diff --git a/sinks/gangliaSink.go b/sinks/gangliaSink.go index b510003..e716ae4 100644 --- a/sinks/gangliaSink.go +++ b/sinks/gangliaSink.go @@ -10,8 +10,9 @@ import ( // "time" "os/exec" + lp "github.com/ClusterCockpit/cc-energy-manager/pkg/cc-message" cclog "github.com/ClusterCockpit/cc-metric-collector/pkg/ccLogger" - lp "github.com/ClusterCockpit/cc-metric-collector/pkg/ccMetric" + mp "github.com/ClusterCockpit/cc-metric-collector/pkg/messageProcessor" ) const GMETRIC_EXEC 
= `gmetric` @@ -35,50 +36,53 @@ type GangliaSink struct { config GangliaSinkConfig } -func (s *GangliaSink) Write(point lp.CCMetric) error { +func (s *GangliaSink) Write(msg lp.CCMessage) error { var err error = nil //var tagsstr []string var argstr []string - // Get metric config (type, value, ... in suitable format) - conf := GetCommonGangliaConfig(point) - if len(conf.Type) == 0 { - conf = GetGangliaConfig(point) - } - if len(conf.Type) == 0 { - return fmt.Errorf("metric %q (Ganglia name %q) has no 'value' field", point.Name(), conf.Name) - } + point, err := s.mp.ProcessMessage(msg) + if err == nil && point != nil { + // Get metric config (type, value, ... in suitable format) + conf := GetCommonGangliaConfig(point) + if len(conf.Type) == 0 { + conf = GetGangliaConfig(point) + } + if len(conf.Type) == 0 { + return fmt.Errorf("metric %q (Ganglia name %q) has no 'value' field", point.Name(), conf.Name) + } - if s.config.AddGangliaGroup { - argstr = append(argstr, fmt.Sprintf("--group=%s", conf.Group)) - } - if s.config.AddUnits && len(conf.Unit) > 0 { - argstr = append(argstr, fmt.Sprintf("--units=%s", conf.Unit)) - } + if s.config.AddGangliaGroup { + argstr = append(argstr, fmt.Sprintf("--group=%s", conf.Group)) + } + if s.config.AddUnits && len(conf.Unit) > 0 { + argstr = append(argstr, fmt.Sprintf("--units=%s", conf.Unit)) + } - if len(s.config.ClusterName) > 0 { - argstr = append(argstr, fmt.Sprintf("--cluster=%s", s.config.ClusterName)) - } - // if s.config.AddTagsAsDesc && len(tagsstr) > 0 { - // argstr = append(argstr, fmt.Sprintf("--desc=%q", strings.Join(tagsstr, ","))) - // } - if len(s.gmetric_config) > 0 { - argstr = append(argstr, fmt.Sprintf("--conf=%s", s.gmetric_config)) - } - if s.config.AddTypeToName { - argstr = append(argstr, fmt.Sprintf("--name=%s", GangliaMetricName(point))) - } else { - argstr = append(argstr, fmt.Sprintf("--name=%s", conf.Name)) - } - argstr = append(argstr, fmt.Sprintf("--slope=%s", conf.Slope)) - argstr = append(argstr, 
fmt.Sprintf("--value=%s", conf.Value)) - argstr = append(argstr, fmt.Sprintf("--type=%s", conf.Type)) - argstr = append(argstr, fmt.Sprintf("--tmax=%d", conf.Tmax)) + if len(s.config.ClusterName) > 0 { + argstr = append(argstr, fmt.Sprintf("--cluster=%s", s.config.ClusterName)) + } + // if s.config.AddTagsAsDesc && len(tagsstr) > 0 { + // argstr = append(argstr, fmt.Sprintf("--desc=%q", strings.Join(tagsstr, ","))) + // } + if len(s.gmetric_config) > 0 { + argstr = append(argstr, fmt.Sprintf("--conf=%s", s.gmetric_config)) + } + if s.config.AddTypeToName { + argstr = append(argstr, fmt.Sprintf("--name=%s", GangliaMetricName(point))) + } else { + argstr = append(argstr, fmt.Sprintf("--name=%s", conf.Name)) + } + argstr = append(argstr, fmt.Sprintf("--slope=%s", conf.Slope)) + argstr = append(argstr, fmt.Sprintf("--value=%s", conf.Value)) + argstr = append(argstr, fmt.Sprintf("--type=%s", conf.Type)) + argstr = append(argstr, fmt.Sprintf("--tmax=%d", conf.Tmax)) - cclog.ComponentDebug(s.name, s.gmetric_path, strings.Join(argstr, " ")) - command := exec.Command(s.gmetric_path, argstr...) - command.Wait() - _, err = command.Output() + cclog.ComponentDebug(s.name, s.gmetric_path, strings.Join(argstr, " ")) + command := exec.Command(s.gmetric_path, argstr...) 
+ command.Wait() + _, err = command.Output() + } return err } @@ -104,6 +108,13 @@ func NewGangliaSink(name string, config json.RawMessage) (Sink, error) { } s.gmetric_path = "" s.gmetric_config = "" + + p, err := mp.NewMessageProcessor() + if err != nil { + return nil, fmt.Errorf("initialization of message processor failed: %v", err.Error()) + } + s.mp = p + if len(s.config.GmetricPath) > 0 { p, err := exec.LookPath(s.config.GmetricPath) if err == nil { @@ -122,5 +133,15 @@ func NewGangliaSink(name string, config json.RawMessage) (Sink, error) { if len(s.config.GmetricConfig) > 0 { s.gmetric_config = s.config.GmetricConfig } + if len(s.config.MessageProcessor) > 0 { + err = s.mp.FromConfigJSON(s.config.MessageProcessor) + if err != nil { + return nil, fmt.Errorf("failed parsing JSON for message processor: %v", err.Error()) + } + } + for _, k := range s.config.MetaAsTags { + s.mp.AddMoveMetaToTags("true", k, k) + } + return s, nil } diff --git a/sinks/gangliaSink.md b/sinks/gangliaSink.md index 9b77ac9..7bee887 100644 --- a/sinks/gangliaSink.md +++ b/sinks/gangliaSink.md @@ -8,14 +8,18 @@ The `ganglia` sink uses the `gmetric` tool of the [Ganglia Monitoring System](ht { "": { "type": "ganglia", - "meta_as_tags" : true, "gmetric_path" : "/path/to/gmetric", - "add_ganglia_group" : true + "add_ganglia_group" : true, + "process_messages" : { + "see" : "docs of message processor for valid fields" + }, + "meta_as_tags" : [] } } ``` - `type`: makes the sink an `ganglia` sink -- `meta_as_tags`: print all meta information as tags in the output (optional) - `gmetric_path`: Path to `gmetric` executable (optional). If not given, the sink searches in `$PATH` for `gmetric`. -- `add_ganglia_group`: Add `--group=X` based on meta information to the `gmetric` call. Some old versions of `gmetric` do not support the `--group` option. \ No newline at end of file +- `add_ganglia_group`: Add `--group=X` based on meta information to the `gmetric` call. 
Some old versions of `gmetric` do not support the `--group` option. +- `process_messages`: Process messages with given rules before progressing or dropping, see [here](../pkg/messageProcessor/README.md) (optional) +- `meta_as_tags`: print all meta information as tags in the output (deprecated, optional) \ No newline at end of file diff --git a/sinks/httpSink.go b/sinks/httpSink.go index 5bad35a..44d6dea 100644 --- a/sinks/httpSink.go +++ b/sinks/httpSink.go @@ -9,8 +9,9 @@ import ( "sync" "time" + lp "github.com/ClusterCockpit/cc-energy-manager/pkg/cc-message" cclog "github.com/ClusterCockpit/cc-metric-collector/pkg/ccLogger" - lp "github.com/ClusterCockpit/cc-metric-collector/pkg/ccMetric" + mp "github.com/ClusterCockpit/cc-metric-collector/pkg/messageProcessor" influx "github.com/influxdata/line-protocol/v2/lineprotocol" "golang.org/x/exp/slices" ) @@ -75,28 +76,20 @@ type HttpSink struct { } // Write sends metric m as http message -func (s *HttpSink) Write(m lp.CCMetric) error { +func (s *HttpSink) Write(msg lp.CCMessage) error { - // Lock for encoder usage - s.encoderLock.Lock() + // submit m only after applying processing/dropping rules + m, err := s.mp.ProcessMessage(msg) + if err == nil && m != nil { + // Lock for encoder usage + s.encoderLock.Lock() - // Encode measurement name - s.encoder.StartLine(m.Name()) + // Encode measurement name + s.encoder.StartLine(m.Name()) - // copy tags and meta data which should be used as tags - s.extended_tag_list = s.extended_tag_list[:0] - for key, value := range m.Tags() { - s.extended_tag_list = - append( - s.extended_tag_list, - key_value_pair{ - key: key, - value: value, - }, - ) - } - for _, key := range s.config.MetaAsTags { - if value, ok := m.GetMeta(key); ok { + // copy tags and meta data which should be used as tags + s.extended_tag_list = s.extended_tag_list[:0] + for key, value := range m.Tags() { s.extended_tag_list = append( s.extended_tag_list, @@ -106,45 +99,57 @@ func (s *HttpSink) Write(m lp.CCMetric) 
error { }, ) } - } + // for _, key := range s.config.MetaAsTags { + // if value, ok := m.GetMeta(key); ok { + // s.extended_tag_list = + // append( + // s.extended_tag_list, + // key_value_pair{ + // key: key, + // value: value, + // }, + // ) + // } + // } - // Encode tags (they musts be in lexical order) - slices.SortFunc( - s.extended_tag_list, - func(a key_value_pair, b key_value_pair) int { - if a.key < b.key { - return -1 - } - if a.key > b.key { - return +1 - } - return 0 - }, - ) - for i := range s.extended_tag_list { - s.encoder.AddTag( - s.extended_tag_list[i].key, - s.extended_tag_list[i].value, + // Encode tags (they musts be in lexical order) + slices.SortFunc( + s.extended_tag_list, + func(a key_value_pair, b key_value_pair) int { + if a.key < b.key { + return -1 + } + if a.key > b.key { + return +1 + } + return 0 + }, ) - } + for i := range s.extended_tag_list { + s.encoder.AddTag( + s.extended_tag_list[i].key, + s.extended_tag_list[i].value, + ) + } - // Encode fields - for key, value := range m.Fields() { - s.encoder.AddField(key, influx.MustNewValue(value)) - } + // Encode fields + for key, value := range m.Fields() { + s.encoder.AddField(key, influx.MustNewValue(value)) + } - // Encode time stamp - s.encoder.EndLine(m.Time()) + // Encode time stamp + s.encoder.EndLine(m.Time()) - // Check for encoder errors - err := s.encoder.Err() + // Check for encoder errors + err := s.encoder.Err() - // Unlock encoder usage - s.encoderLock.Unlock() + // Unlock encoder usage + s.encoderLock.Unlock() - // Check that encoding worked - if err != nil { - return fmt.Errorf("encoding failed: %v", err) + // Check that encoding worked + if err != nil { + return fmt.Errorf("encoding failed: %v", err) + } } if s.config.flushDelay == 0 { @@ -271,7 +276,7 @@ func NewHttpSink(name string, config json.RawMessage) (Sink, error) { s.config.Timeout = "5s" s.config.FlushDelay = "5s" s.config.MaxRetries = 3 - s.config.Precision = "ns" + s.config.Precision = "s" 
cclog.ComponentDebug(s.name, "Init()") // Read config @@ -297,6 +302,11 @@ func NewHttpSink(name string, config json.RawMessage) (Sink, error) { if s.config.useBasicAuth && len(s.config.Password) == 0 { return nil, errors.New("basic authentication requires password") } + p, err := mp.NewMessageProcessor() + if err != nil { + return nil, fmt.Errorf("initialization of message processor failed: %v", err.Error()) + } + s.mp = p if len(s.config.IdleConnTimeout) > 0 { t, err := time.ParseDuration(s.config.IdleConnTimeout) @@ -319,7 +329,17 @@ func NewHttpSink(name string, config json.RawMessage) (Sink, error) { cclog.ComponentDebug(s.name, "Init(): flushDelay", t) } } - precision := influx.Nanosecond + if len(s.config.MessageProcessor) > 0 { + err = p.FromConfigJSON(s.config.MessageProcessor) + if err != nil { + return nil, fmt.Errorf("failed parsing JSON for message processor: %v", err.Error()) + } + } + for _, k := range s.config.MetaAsTags { + s.mp.AddMoveMetaToTags("true", k, k) + } + + precision := influx.Second if len(s.config.Precision) > 0 { switch s.config.Precision { case "s": diff --git a/sinks/httpSink.md b/sinks/httpSink.md index 7d77ddf..5c6aded 100644 --- a/sinks/httpSink.md +++ b/sinks/httpSink.md @@ -8,9 +8,6 @@ The `http` sink uses POST requests to a HTTP server to submit the metrics in the { "": { "type": "http", - "meta_as_tags" : [ - "meta-key" - ], "url" : "https://my-monitoring.example.com:1234/api/write", "jwt" : "blabla.blabla.blabla", "username": "myUser", @@ -19,13 +16,16 @@ The `http` sink uses POST requests to a HTTP server to submit the metrics in the "idle_connection_timeout" : "5s", "flush_delay": "2s", "batch_size": 1000, - "precision": "s" + "precision": "s", + "process_messages" : { + "see" : "docs of message processor for valid fields" + }, + "meta_as_tags" : [] } } ``` - `type`: makes the sink an `http` sink -- `meta_as_tags`: Move specific meta information to the tags in the output (optional) - `url`: The full URL of the endpoint - 
`jwt`: JSON web tokens for authentication (Using the *Bearer* scheme) - `username`: username for basic authentication @@ -35,8 +35,10 @@ The `http` sink uses POST requests to a HTTP server to submit the metrics in the - `idle_connection_timeout`: Timeout for idle connections (default '120s'). Should be larger than the measurement interval to keep the connection open - `flush_delay`: Batch all writes arriving in during this duration (default '1s', batching can be disabled by setting it to 0) - `batch_size`: Maximal batch size. If `batch_size` is reached before the end of `flush_delay`, the metrics are sent without further delay -- `precision`: Precision of the timestamp. Valid values are 's', 'ms', 'us' and 'ns'. (default is 'ns') +- `precision`: Precision of the timestamp. Valid values are 's', 'ms', 'us' and 'ns'. (default is 's') +- `process_messages`: Process messages with given rules before progressing or dropping, see [here](../pkg/messageProcessor/README.md) (optional) +- `meta_as_tags`: print all meta information as tags in the output (deprecated, optional) -### Using HttpSink for communication with cc-metric-store +### Using `http` sink for communication with cc-metric-store -The cc-metric-store only accepts metrics with a timestamp precision in seconds, so it is required to set `"precision": "s"`. \ No newline at end of file +The cc-metric-store only accepts metrics with a timestamp precision in seconds, so it is required to use `"precision": "s"`. 
\ No newline at end of file diff --git a/sinks/influxAsyncSink.go b/sinks/influxAsyncSink.go index b8555c6..e496627 100644 --- a/sinks/influxAsyncSink.go +++ b/sinks/influxAsyncSink.go @@ -10,8 +10,9 @@ import ( "strings" "time" + lp "github.com/ClusterCockpit/cc-energy-manager/pkg/cc-message" cclog "github.com/ClusterCockpit/cc-metric-collector/pkg/ccLogger" - lp "github.com/ClusterCockpit/cc-metric-collector/pkg/ccMetric" + mp "github.com/ClusterCockpit/cc-metric-collector/pkg/messageProcessor" influxdb2 "github.com/influxdata/influxdb-client-go/v2" influxdb2Api "github.com/influxdata/influxdb-client-go/v2/api" influxdb2ApiHttp "github.com/influxdata/influxdb-client-go/v2/api/http" @@ -36,6 +37,8 @@ type InfluxAsyncSinkConfig struct { InfluxMaxRetryTime string `json:"max_retry_time,omitempty"` CustomFlushInterval string `json:"custom_flush_interval,omitempty"` MaxRetryAttempts uint `json:"max_retry_attempts,omitempty"` + // Timestamp precision + Precision string `json:"precision,omitempty"` } type InfluxAsyncSink struct { @@ -93,7 +96,22 @@ func (s *InfluxAsyncSink) connect() error { &tls.Config{ InsecureSkipVerify: true, }, - ).SetPrecision(time.Second) + ) + + precision := time.Second + if len(s.config.Precision) > 0 { + switch s.config.Precision { + case "s": + precision = time.Second + case "ms": + precision = time.Millisecond + case "us": + precision = time.Microsecond + case "ns": + precision = time.Nanosecond + } + } + clientOptions.SetPrecision(precision) s.client = influxdb2.NewClientWithOptions(uri, auth, clientOptions) s.writeApi = s.client.WriteAPI(s.config.Organization, s.config.Database) @@ -112,7 +130,7 @@ func (s *InfluxAsyncSink) connect() error { return nil } -func (s *InfluxAsyncSink) Write(m lp.CCMetric) error { +func (s *InfluxAsyncSink) Write(m lp.CCMessage) error { if s.customFlushInterval != 0 && s.flushTimer == nil { // Run a batched flush for all lines that have arrived in the defined interval s.flushTimer = 
time.AfterFunc(s.customFlushInterval, func() { @@ -121,9 +139,10 @@ func (s *InfluxAsyncSink) Write(m lp.CCMetric) error { } }) } - s.writeApi.WritePoint( - m.ToPoint(s.meta_as_tags), - ) + msg, err := s.mp.ProcessMessage(m) + if err == nil && msg != nil { + s.writeApi.WritePoint(msg.ToPoint(nil)) + } return nil } @@ -158,6 +177,7 @@ func NewInfluxAsyncSink(name string, config json.RawMessage) (Sink, error) { s.config.CustomFlushInterval = "" s.customFlushInterval = time.Duration(0) s.config.MaxRetryAttempts = 1 + s.config.Precision = "s" // Default retry intervals (in seconds) // 1 2 @@ -200,10 +220,24 @@ func NewInfluxAsyncSink(name string, config json.RawMessage) (Sink, error) { if len(s.config.Password) == 0 { return nil, errors.New("missing password configuration required by InfluxSink") } + p, err := mp.NewMessageProcessor() + if err != nil { + return nil, fmt.Errorf("initialization of message processor failed: %v", err.Error()) + } + s.mp = p + if len(s.config.MessageProcessor) > 0 { + err = s.mp.FromConfigJSON(s.config.MessageProcessor) + if err != nil { + return nil, fmt.Errorf("failed parsing JSON for message processor: %v", err.Error()) + } + } // Create lookup map to use meta infos as tags in the output metric - s.meta_as_tags = make(map[string]bool) + // s.meta_as_tags = make(map[string]bool) + // for _, k := range s.config.MetaAsTags { + // s.meta_as_tags[k] = true + // } for _, k := range s.config.MetaAsTags { - s.meta_as_tags[k] = true + s.mp.AddMoveMetaToTags("true", k, k) } toUint := func(duration string, def uint) uint { diff --git a/sinks/influxAsyncSink.md b/sinks/influxAsyncSink.md index ddcf4b4..cedcf35 100644 --- a/sinks/influxAsyncSink.md +++ b/sinks/influxAsyncSink.md @@ -19,9 +19,13 @@ The `influxasync` sink uses the official [InfluxDB golang client](https://pkg.go "batch_size": 200, "retry_interval" : "1s", "retry_exponential_base" : 2, + "precision": "s", "max_retries": 20, "max_retry_time" : "168h", - "meta_as_tags" : [], + 
"process_messages" : { + "see" : "docs of message processor for valid fields" + }, + "meta_as_tags" : [] } } ``` @@ -39,6 +43,12 @@ The `influxasync` sink uses the official [InfluxDB golang client](https://pkg.go - `retry_exponential_base`: The retry interval is exponentially increased with this base, default 2 - `max_retries`: Maximal number of retry attempts - `max_retry_time`: Maximal time to retry failed writes, default 168h (one week) -- `meta_as_tags`: move meta information keys to tags (optional) +- `precision`: Precision of the timestamp. Valid values are 's', 'ms', 'us' and 'ns'. (default is 's') +- `process_messages`: Process messages with given rules before progressing or dropping, see [here](../pkg/messageProcessor/README.md) (optional) +- `meta_as_tags`: print all meta information as tags in the output (deprecated, optional) For information about the calculation of the retry interval settings, see [offical influxdb-client-go documentation](https://github.com/influxdata/influxdb-client-go#handling-of-failed-async-writes) + +### Using `influxasync` sink for communication with cc-metric-store + +The cc-metric-store only accepts metrics with a timestamp precision in seconds, so it is required to use `"precision": "s"`. 
\ No newline at end of file diff --git a/sinks/influxSink.go b/sinks/influxSink.go index a9d46a5..a3dba6a 100644 --- a/sinks/influxSink.go +++ b/sinks/influxSink.go @@ -10,8 +10,9 @@ import ( "sync" "time" + lp "github.com/ClusterCockpit/cc-energy-manager/pkg/cc-message" cclog "github.com/ClusterCockpit/cc-metric-collector/pkg/ccLogger" - lp "github.com/ClusterCockpit/cc-metric-collector/pkg/ccMetric" + mp "github.com/ClusterCockpit/cc-metric-collector/pkg/messageProcessor" influxdb2 "github.com/influxdata/influxdb-client-go/v2" influxdb2Api "github.com/influxdata/influxdb-client-go/v2/api" influx "github.com/influxdata/line-protocol/v2/lineprotocol" @@ -58,6 +59,8 @@ type InfluxSink struct { InfluxMaxRetryTime string `json:"max_retry_time,omitempty"` // Specify whether to use GZip compression in write requests InfluxUseGzip bool `json:"use_gzip"` + // Timestamp precision + Precision string `json:"precision,omitempty"` } // influx line protocol encoder @@ -206,7 +209,20 @@ func (s *InfluxSink) connect() error { ) // Set time precision - clientOptions.SetPrecision(time.Nanosecond) + precision := time.Second + if len(s.config.Precision) > 0 { + switch s.config.Precision { + case "s": + precision = time.Second + case "ms": + precision = time.Millisecond + case "us": + precision = time.Microsecond + case "ns": + precision = time.Nanosecond + } + } + clientOptions.SetPrecision(precision) // Create new writeAPI s.client = influxdb2.NewClientWithOptions(uri, auth, clientOptions) @@ -224,28 +240,19 @@ func (s *InfluxSink) connect() error { } // Write sends metric m in influxDB line protocol -func (s *InfluxSink) Write(m lp.CCMetric) error { +func (s *InfluxSink) Write(msg lp.CCMessage) error { - // Lock for encoder usage - s.encoderLock.Lock() + m, err := s.mp.ProcessMessage(msg) + if err == nil && m != nil { + // Lock for encoder usage + s.encoderLock.Lock() - // Encode measurement name - s.encoder.StartLine(m.Name()) + // Encode measurement name + 
s.encoder.StartLine(m.Name()) - // copy tags and meta data which should be used as tags - s.extended_tag_list = s.extended_tag_list[:0] - for key, value := range m.Tags() { - s.extended_tag_list = - append( - s.extended_tag_list, - key_value_pair{ - key: key, - value: value, - }, - ) - } - for _, key := range s.config.MetaAsTags { - if value, ok := m.GetMeta(key); ok { + // copy tags and meta data which should be used as tags + s.extended_tag_list = s.extended_tag_list[:0] + for key, value := range m.Tags() { s.extended_tag_list = append( s.extended_tag_list, @@ -255,45 +262,57 @@ func (s *InfluxSink) Write(m lp.CCMetric) error { }, ) } - } + // for _, key := range s.config.MetaAsTags { + // if value, ok := m.GetMeta(key); ok { + // s.extended_tag_list = + // append( + // s.extended_tag_list, + // key_value_pair{ + // key: key, + // value: value, + // }, + // ) + // } + // } - // Encode tags (they musts be in lexical order) - slices.SortFunc( - s.extended_tag_list, - func(a key_value_pair, b key_value_pair) int { - if a.key < b.key { - return -1 - } - if a.key > b.key { - return +1 - } - return 0 - }, - ) - for i := range s.extended_tag_list { - s.encoder.AddTag( - s.extended_tag_list[i].key, - s.extended_tag_list[i].value, + // Encode tags (they musts be in lexical order) + slices.SortFunc( + s.extended_tag_list, + func(a key_value_pair, b key_value_pair) int { + if a.key < b.key { + return -1 + } + if a.key > b.key { + return +1 + } + return 0 + }, ) + for i := range s.extended_tag_list { + s.encoder.AddTag( + s.extended_tag_list[i].key, + s.extended_tag_list[i].value, + ) + } + + // Encode fields + for key, value := range m.Fields() { + s.encoder.AddField(key, influx.MustNewValue(value)) + } + + // Encode time stamp + s.encoder.EndLine(m.Time()) + + // Check for encoder errors + if err := s.encoder.Err(); err != nil { + // Unlock encoder usage + s.encoderLock.Unlock() + + return fmt.Errorf("encoding failed: %v", err) + } + s.numRecordsInEncoder++ } - // Encode 
fields - for key, value := range m.Fields() { - s.encoder.AddField(key, influx.MustNewValue(value)) - } - - // Encode time stamp - s.encoder.EndLine(m.Time()) - - // Check for encoder errors - if err := s.encoder.Err(); err != nil { - // Unlock encoder usage - s.encoderLock.Unlock() - - return fmt.Errorf("Encoding failed: %v", err) - } - s.numRecordsInEncoder++ - if s.config.flushDelay == 0 { // Unlock encoder usage s.encoderLock.Unlock() @@ -417,6 +436,7 @@ func NewInfluxSink(name string, config json.RawMessage) (Sink, error) { // Set config default values s.config.BatchSize = 1000 s.config.FlushInterval = "1s" + s.config.Precision = "s" // Read config if len(config) > 0 { @@ -443,11 +463,20 @@ func NewInfluxSink(name string, config json.RawMessage) (Sink, error) { if len(s.config.Password) == 0 { return s, errors.New("missing password configuration required by InfluxSink") } + p, err := mp.NewMessageProcessor() + if err != nil { + return nil, fmt.Errorf("initialization of message processor failed: %v", err.Error()) + } + s.mp = p - // Create lookup map to use meta infos as tags in the output metric - s.meta_as_tags = make(map[string]bool) + if len(s.config.MessageProcessor) > 0 { + err = p.FromConfigJSON(s.config.MessageProcessor) + if err != nil { + return nil, fmt.Errorf("failed parsing JSON for message processor: %v", err.Error()) + } + } for _, k := range s.config.MetaAsTags { - s.meta_as_tags[k] = true + s.mp.AddMoveMetaToTags("true", k, k) } // Configure flush delay duration diff --git a/sinks/influxSink.md b/sinks/influxSink.md index 99390f5..acd0d06 100644 --- a/sinks/influxSink.md +++ b/sinks/influxSink.md @@ -17,14 +17,17 @@ The `influxdb` sink uses the official [InfluxDB golang client](https://pkg.go.de "ssl": true, "flush_delay" : "1s", "batch_size" : 1000, - "use_gzip": true - "meta_as_tags" : [], + "use_gzip": true, + "precision": "s", + "process_messages" : { + "see" : "docs of message processor for valid fields" + }, + "meta_as_tags" : [] } } ``` 
- `type`: makes the sink an `influxdb` sink -- `meta_as_tags`: print all meta information as tags in the output (optional) - `database`: All metrics are written to this bucket - `host`: Hostname of the InfluxDB database server - `port`: Port number (as string) of the InfluxDB database server @@ -34,6 +37,9 @@ The `influxdb` sink uses the official [InfluxDB golang client](https://pkg.go.de - `ssl`: Use SSL connection - `flush_delay`: Group metrics coming in to a single batch - `batch_size`: Maximal batch size. If `batch_size` is reached before the end of `flush_delay`, the metrics are sent without further delay +- `precision`: Precision of the timestamp. Valid values are 's', 'ms', 'us' and 'ns'. (default is 's') +- `process_messages`: Process messages with given rules before progressing or dropping, see [here](../pkg/messageProcessor/README.md) (optional) +- `meta_as_tags`: print all meta information as tags in the output (deprecated, optional) Influx client options: ======= @@ -46,3 +52,7 @@ Influx client options: - `max_retries`: maximum count of retry attempts of failed writes - `max_retry_time`: maximum total retry timeout - `use_gzip`: Specify whether to use GZip compression in write requests + +### Using `influxdb` sink for communication with cc-metric-store + +The cc-metric-store only accepts metrics with a timestamp precision in seconds, so it is required to use `"precision": "s"`. 
\ No newline at end of file diff --git a/sinks/libgangliaSink.go b/sinks/libgangliaSink.go index 145b490..4700eee 100644 --- a/sinks/libgangliaSink.go +++ b/sinks/libgangliaSink.go @@ -72,8 +72,9 @@ import ( "fmt" "unsafe" + lp "github.com/ClusterCockpit/cc-energy-manager/pkg/cc-message" cclog "github.com/ClusterCockpit/cc-metric-collector/pkg/ccLogger" - lp "github.com/ClusterCockpit/cc-metric-collector/pkg/ccMetric" + mp "github.com/ClusterCockpit/cc-metric-collector/pkg/messageProcessor" "github.com/NVIDIA/go-nvml/pkg/dl" ) @@ -110,99 +111,102 @@ type LibgangliaSink struct { cstrCache map[string]*C.char } -func (s *LibgangliaSink) Write(point lp.CCMetric) error { +func (s *LibgangliaSink) Write(msg lp.CCMessage) error { var err error = nil var c_name *C.char var c_value *C.char var c_type *C.char var c_unit *C.char - // helper function for looking up C strings in the cache - lookup := func(key string) *C.char { - if _, exist := s.cstrCache[key]; !exist { - s.cstrCache[key] = C.CString(key) + point, err := s.mp.ProcessMessage(msg) + if err == nil && point != nil { + // helper function for looking up C strings in the cache + lookup := func(key string) *C.char { + if _, exist := s.cstrCache[key]; !exist { + s.cstrCache[key] = C.CString(key) + } + return s.cstrCache[key] } - return s.cstrCache[key] - } - conf := GetCommonGangliaConfig(point) - if len(conf.Type) == 0 { - conf = GetGangliaConfig(point) - } - if len(conf.Type) == 0 { - return fmt.Errorf("metric %q (Ganglia name %q) has no 'value' field", point.Name(), conf.Name) - } + conf := GetCommonGangliaConfig(point) + if len(conf.Type) == 0 { + conf = GetGangliaConfig(point) + } + if len(conf.Type) == 0 { + return fmt.Errorf("metric %q (Ganglia name %q) has no 'value' field", point.Name(), conf.Name) + } - if s.config.AddTypeToName { - conf.Name = GangliaMetricName(point) - } + if s.config.AddTypeToName { + conf.Name = GangliaMetricName(point) + } - c_value = C.CString(conf.Value) - c_type = lookup(conf.Type) - 
c_name = lookup(conf.Name) + c_value = C.CString(conf.Value) + c_type = lookup(conf.Type) + c_name = lookup(conf.Name) - // Add unit - unit := "" - if s.config.AddUnits { - unit = conf.Unit - } - c_unit = lookup(unit) + // Add unit + unit := "" + if s.config.AddUnits { + unit = conf.Unit + } + c_unit = lookup(unit) - // Determine the slope of the metric. Ganglia's own collector mostly use - // 'both' but the mem and swap total uses 'zero'. - slope_type := C.GANGLIA_SLOPE_BOTH - switch conf.Slope { - case "zero": - slope_type = C.GANGLIA_SLOPE_ZERO - case "both": - slope_type = C.GANGLIA_SLOPE_BOTH - } + // Determine the slope of the metric. Ganglia's own collector mostly use + // 'both' but the mem and swap total uses 'zero'. + slope_type := C.GANGLIA_SLOPE_BOTH + switch conf.Slope { + case "zero": + slope_type = C.GANGLIA_SLOPE_ZERO + case "both": + slope_type = C.GANGLIA_SLOPE_BOTH + } - // Create a new Ganglia metric - gmetric := C.Ganglia_metric_create(s.global_context) - // Set name, value, type and unit in the Ganglia metric - // The default slope_type is both directions, so up and down. Some metrics want 'zero' slope, probably constant. - // The 'tmax' value is by default 300. - rval := C.int(0) - rval = C.Ganglia_metric_set(gmetric, c_name, c_value, c_type, c_unit, C.uint(slope_type), C.uint(conf.Tmax), 0) - switch rval { - case 1: + // Create a new Ganglia metric + gmetric := C.Ganglia_metric_create(s.global_context) + // Set name, value, type and unit in the Ganglia metric + // The default slope_type is both directions, so up and down. Some metrics want 'zero' slope, probably constant. + // The 'tmax' value is by default 300. 
+ rval := C.int(0) + rval = C.Ganglia_metric_set(gmetric, c_name, c_value, c_type, c_unit, C.uint(slope_type), C.uint(conf.Tmax), 0) + switch rval { + case 1: + C.free(unsafe.Pointer(c_value)) + return errors.New("invalid parameters") + case 2: + C.free(unsafe.Pointer(c_value)) + return errors.New("one of your parameters has an invalid character '\"'") + case 3: + C.free(unsafe.Pointer(c_value)) + return fmt.Errorf("the type parameter \"%s\" is not a valid type", conf.Type) + case 4: + C.free(unsafe.Pointer(c_value)) + return fmt.Errorf("the value parameter \"%s\" does not represent a number", conf.Value) + default: + } + + // Set the cluster name, otherwise it takes it from the configuration file + if len(s.config.ClusterName) > 0 { + C.Ganglia_metadata_add(gmetric, lookup("CLUSTER"), lookup(s.config.ClusterName)) + } + // Set the group metadata in the Ganglia metric if configured + if s.config.AddGangliaGroup { + c_group := lookup(conf.Group) + C.Ganglia_metadata_add(gmetric, lookup("GROUP"), c_group) + } + + // Now we send the metric + // gmetric does provide some more options like description and other options + // but they are not provided by the collectors + rval = C.Ganglia_metric_send(gmetric, s.send_channels) + if rval != 0 { + err = fmt.Errorf("there was an error sending metric %s to %d of the send channels ", point.Name(), rval) + // fall throuph to use Ganglia_metric_destroy from common cleanup + } + // Cleanup Ganglia metric + C.Ganglia_metric_destroy(gmetric) + // Free the value C string, the only one not stored in the cache C.free(unsafe.Pointer(c_value)) - return errors.New("invalid parameters") - case 2: - C.free(unsafe.Pointer(c_value)) - return errors.New("one of your parameters has an invalid character '\"'") - case 3: - C.free(unsafe.Pointer(c_value)) - return fmt.Errorf("the type parameter \"%s\" is not a valid type", conf.Type) - case 4: - C.free(unsafe.Pointer(c_value)) - return fmt.Errorf("the value parameter \"%s\" does not represent a 
number", conf.Value) - default: } - - // Set the cluster name, otherwise it takes it from the configuration file - if len(s.config.ClusterName) > 0 { - C.Ganglia_metadata_add(gmetric, lookup("CLUSTER"), lookup(s.config.ClusterName)) - } - // Set the group metadata in the Ganglia metric if configured - if s.config.AddGangliaGroup { - c_group := lookup(conf.Group) - C.Ganglia_metadata_add(gmetric, lookup("GROUP"), c_group) - } - - // Now we send the metric - // gmetric does provide some more options like description and other options - // but they are not provided by the collectors - rval = C.Ganglia_metric_send(gmetric, s.send_channels) - if rval != 0 { - err = fmt.Errorf("there was an error sending metric %s to %d of the send channels ", point.Name(), rval) - // fall throuph to use Ganglia_metric_destroy from common cleanup - } - // Cleanup Ganglia metric - C.Ganglia_metric_destroy(gmetric) - // Free the value C string, the only one not stored in the cache - C.free(unsafe.Pointer(c_value)) return err } @@ -241,6 +245,20 @@ func NewLibgangliaSink(name string, config json.RawMessage) (Sink, error) { return nil, err } } + p, err := mp.NewMessageProcessor() + if err != nil { + return nil, fmt.Errorf("initialization of message processor failed: %v", err.Error()) + } + s.mp = p + if len(s.config.MessageProcessor) > 0 { + err = s.mp.FromConfigJSON(s.config.MessageProcessor) + if err != nil { + return nil, fmt.Errorf("failed parsing JSON for message processor: %v", err.Error()) + } + } + for _, k := range s.config.MetaAsTags { + s.mp.AddMoveMetaToTags("true", k, k) + } lib := dl.New(s.config.GangliaLib, GANGLIA_LIB_DL_FLAGS) if lib == nil { return nil, fmt.Errorf("error instantiating DynamicLibrary for %s", s.config.GangliaLib) diff --git a/sinks/libgangliaSink.md b/sinks/libgangliaSink.md index a0dede7..5d4e91f 100644 --- a/sinks/libgangliaSink.md +++ b/sinks/libgangliaSink.md @@ -15,18 +15,23 @@ The `libganglia` sink has probably less overhead compared to the `ganglia` 
sink "cluster_name": "MyCluster", "add_ganglia_group" : true, "add_type_to_name": true, - "add_units" : true + "add_units" : true, + "process_messages" : { + "see" : "docs of message processor for valid fields" + }, + "meta_as_tags" : [] } } ``` - `type`: makes the sink an `libganglia` sink -- `meta_as_tags`: print all meta information as tags in the output (optional) - `gmond_config`: Path to the Ganglia configuration file `gmond.conf` (default: `/etc/ganglia/gmond.conf`) - `cluster_name`: Set a cluster name for the metric. If not set, it is taken from `gmond_config` - `add_ganglia_group`: Add a Ganglia metric group based on meta information. Some old versions of `gmetric` do not support the `--group` option - `add_type_to_name`: Ganglia commonly uses only node-level metrics but with cc-metric-collector, there are metrics for cpus, memory domains, CPU sockets and the whole node. In order to get eeng, this option prefixes the metric name with `_` or `device_` depending on the metric tags and meta information. 
For metrics of the whole node `type=node`, no prefix is added - `add_units`: Add metric value unit if there is a `unit` entry in the metric tags or meta information +- `process_messages`: Process messages with given rules before progressing or dropping, see [here](../pkg/messageProcessor/README.md) (optional) +- `meta_as_tags`: print all meta information as tags in the output (deprecated, optional) ### Ganglia Installation diff --git a/sinks/metricSink.go b/sinks/metricSink.go index 2fd429c..4cac04b 100644 --- a/sinks/metricSink.go +++ b/sinks/metricSink.go @@ -1,24 +1,29 @@ package sinks import ( - lp "github.com/ClusterCockpit/cc-metric-collector/pkg/ccMetric" + "encoding/json" + + lp "github.com/ClusterCockpit/cc-energy-manager/pkg/cc-message" + mp "github.com/ClusterCockpit/cc-metric-collector/pkg/messageProcessor" ) type defaultSinkConfig struct { - MetaAsTags []string `json:"meta_as_tags,omitempty"` - Type string `json:"type"` + MetaAsTags []string `json:"meta_as_tags,omitempty"` + MessageProcessor json.RawMessage `json:"process_messages,omitempty"` + Type string `json:"type"` } type sink struct { - meta_as_tags map[string]bool // Use meta data tags as tags - name string // Name of the sink + meta_as_tags map[string]bool // Use meta data tags as tags + mp mp.MessageProcessor // message processor for the sink + name string // Name of the sink } type Sink interface { - Write(point lp.CCMetric) error // Write metric to the sink - Flush() error // Flush buffered metrics - Close() // Close / finish metric sink - Name() string // Name of the metric sink + Write(point lp.CCMessage) error // Write metric to the sink + Flush() error // Flush buffered metrics + Close() // Close / finish metric sink + Name() string // Name of the metric sink } // Name returns the name of the metric sink diff --git a/sinks/natsSink.go b/sinks/natsSink.go index db446ca..1982bfe 100644 --- a/sinks/natsSink.go +++ b/sinks/natsSink.go @@ -5,13 +5,16 @@ import ( "encoding/json" "errors" "fmt" 
+ "os" "sync" "time" + lp "github.com/ClusterCockpit/cc-energy-manager/pkg/cc-message" cclog "github.com/ClusterCockpit/cc-metric-collector/pkg/ccLogger" - lp "github.com/ClusterCockpit/cc-metric-collector/pkg/ccMetric" - influx "github.com/influxdata/line-protocol" + mp "github.com/ClusterCockpit/cc-metric-collector/pkg/messageProcessor" + influx "github.com/influxdata/line-protocol/v2/lineprotocol" nats "github.com/nats-io/nats.go" + "golang.org/x/exp/slices" ) type NatsSinkConfig struct { @@ -22,18 +25,24 @@ type NatsSinkConfig struct { User string `json:"user,omitempty"` Password string `json:"password,omitempty"` FlushDelay string `json:"flush_delay,omitempty"` + NkeyFile string `json:"nkey_file,omitempty"` + // Timestamp precision + Precision string `json:"precision,omitempty"` } + type NatsSink struct { sink client *nats.Conn - encoder *influx.Encoder + encoder influx.Encoder buffer *bytes.Buffer config NatsSinkConfig lock sync.Mutex flushDelay time.Duration flushTimer *time.Timer + + extended_tag_list []key_value_pair } func (s *NatsSink) connect() error { @@ -42,6 +51,13 @@ func (s *NatsSink) connect() error { var nc *nats.Conn if len(s.config.User) > 0 && len(s.config.Password) > 0 { uinfo = nats.UserInfo(s.config.User, s.config.Password) + } else if len(s.config.NkeyFile) > 0 { + if _, err := os.Stat(s.config.NkeyFile); err == nil { + uinfo = nats.UserCredentials(s.config.NkeyFile) + } else { + cclog.ComponentError(s.name, "NKEY file", s.config.NkeyFile, "does not exist: %v", err.Error()) + return err + } } uri := fmt.Sprintf("nats://%s:%s", s.config.Host, s.config.Port) cclog.ComponentDebug(s.name, "Connect to", uri) @@ -59,13 +75,61 @@ func (s *NatsSink) connect() error { return nil } -func (s *NatsSink) Write(m lp.CCMetric) error { - s.lock.Lock() - _, err := s.encoder.Encode(m.ToPoint(s.meta_as_tags)) - s.lock.Unlock() - if err != nil { - cclog.ComponentError(s.name, "Write:", err.Error()) - return err +func (s *NatsSink) Write(m lp.CCMessage) error 
{ + msg, err := s.mp.ProcessMessage(m) + if err == nil && msg != nil { + s.lock.Lock() + // Encode measurement name + s.encoder.StartLine(msg.Name()) + + // copy tags and meta data which should be used as tags + s.extended_tag_list = s.extended_tag_list[:0] + for key, value := range m.Tags() { + s.extended_tag_list = + append( + s.extended_tag_list, + key_value_pair{ + key: key, + value: value, + }, + ) + } + // Encode tags (they musts be in lexical order) + slices.SortFunc( + s.extended_tag_list, + func(a key_value_pair, b key_value_pair) int { + if a.key < b.key { + return -1 + } + if a.key > b.key { + return +1 + } + return 0 + }, + ) + for i := range s.extended_tag_list { + s.encoder.AddTag( + s.extended_tag_list[i].key, + s.extended_tag_list[i].value, + ) + } + + // Encode fields + for key, value := range msg.Fields() { + s.encoder.AddField(key, influx.MustNewValue(value)) + } + + // Encode time stamp + s.encoder.EndLine(msg.Time()) + + // Check for encoder errors + err := s.encoder.Err() + + s.lock.Unlock() + if err != nil { + cclog.ComponentError(s.name, "Write:", err.Error()) + return err + } } if s.flushDelay == 0 { @@ -83,14 +147,13 @@ func (s *NatsSink) Write(m lp.CCMetric) error { func (s *NatsSink) Flush() error { s.lock.Lock() - buf := append([]byte{}, s.buffer.Bytes()...) 
// copy bytes - s.buffer.Reset() + buf := slices.Clone(s.encoder.Bytes()) + s.encoder.Reset() s.lock.Unlock() if len(buf) == 0 { return nil } - if err := s.client.Publish(s.config.Subject, buf); err != nil { cclog.ComponentError(s.name, "Flush:", err.Error()) return err @@ -107,6 +170,8 @@ func NewNatsSink(name string, config json.RawMessage) (Sink, error) { s := new(NatsSink) s.name = fmt.Sprintf("NatsSink(%s)", name) s.flushDelay = 10 * time.Second + s.config.Port = "4222" + s.config.Precision = "s" if len(config) > 0 { d := json.NewDecoder(bytes.NewReader(config)) d.DisallowUnknownFields() @@ -120,17 +185,41 @@ func NewNatsSink(name string, config json.RawMessage) (Sink, error) { len(s.config.Subject) == 0 { return nil, errors.New("not all configuration variables set required by NatsSink") } - // Create lookup map to use meta infos as tags in the output metric - s.meta_as_tags = make(map[string]bool) - for _, k := range s.config.MetaAsTags { - s.meta_as_tags[k] = true + p, err := mp.NewMessageProcessor() + if err != nil { + return nil, fmt.Errorf("initialization of message processor failed: %v", err.Error()) } + s.mp = p + if len(s.config.MessageProcessor) > 0 { + err = s.mp.FromConfigJSON(s.config.MessageProcessor) + if err != nil { + return nil, fmt.Errorf("failed parsing JSON for message processor: %v", err.Error()) + } + } + // Create lookup map to use meta infos as tags in the output metric + for _, k := range s.config.MetaAsTags { + s.mp.AddMoveMetaToTags("true", k, k) + } + precision := influx.Second + if len(s.config.Precision) > 0 { + switch s.config.Precision { + case "s": + precision = influx.Second + case "ms": + precision = influx.Millisecond + case "us": + precision = influx.Microsecond + case "ns": + precision = influx.Nanosecond + } + } + + // s.meta_as_tags = make(map[string]bool) + // for _, k := range s.config.MetaAsTags { + // s.meta_as_tags[k] = true + // } // Setup Influx line protocol - s.buffer = &bytes.Buffer{} - s.buffer.Grow(1025) - 
s.encoder = influx.NewEncoder(s.buffer) - s.encoder.SetPrecision(time.Second) - s.encoder.SetMaxLineBytes(1024) + s.encoder.SetPrecision(precision) // Setup infos for connection if err := s.connect(); err != nil { return nil, fmt.Errorf("unable to connect: %v", err) @@ -144,6 +233,7 @@ func NewNatsSink(name string, config json.RawMessage) (Sink, error) { return nil, err } } + s.extended_tag_list = make([]key_value_pair, 0) return s, nil } diff --git a/sinks/natsSink.md b/sinks/natsSink.md index 4c7d9d0..8bafbb0 100644 --- a/sinks/natsSink.md +++ b/sinks/natsSink.md @@ -13,7 +13,13 @@ The `nats` sink publishes all metrics into a NATS network. The publishing key is "port": "4222", "user": "exampleuser", "password" : "examplepw", - "meta_as_tags" : [], + "nkey_file": "/path/to/nkey_file", + "flush_delay": "10s", + "precision": "s", + "process_messages" : { + "see" : "docs of message processor for valid fields" + }, + "meta_as_tags" : [] } } ``` @@ -24,4 +30,12 @@ The `nats` sink publishes all metrics into a NATS network. The publishing key is - `port`: Port number (as string) of the NATS server - `user`: Username for basic authentication - `password`: Password for basic authentication -- `meta_as_tags`: print all meta information as tags in the output (optional) +- `nkey_file`: Path to credentials file with NKEY +- `flush_delay`: Maximum time until metrics are sent out +- `precision`: Precision of the timestamp. Valid values are 's', 'ms', 'us' and 'ns'. (default is 's') +- `process_messages`: Process messages with given rules before progressing or dropping, see [here](../pkg/messageProcessor/README.md) (optional) +- `meta_as_tags`: print all meta information as tags in the output (deprecated, optional) + +### Using `nats` sink for communication with cc-metric-store + +The cc-metric-store only accepts metrics with a timestamp precision in seconds, so it is required to use `"precision": "s"`. 
\ No newline at end of file diff --git a/sinks/prometheusSink.go b/sinks/prometheusSink.go index 7d792cd..677cfd6 100644 --- a/sinks/prometheusSink.go +++ b/sinks/prometheusSink.go @@ -10,8 +10,9 @@ import ( "strings" "sync" + lp "github.com/ClusterCockpit/cc-energy-manager/pkg/cc-message" cclog "github.com/ClusterCockpit/cc-metric-collector/pkg/ccLogger" - lp "github.com/ClusterCockpit/cc-metric-collector/pkg/ccMetric" + mp "github.com/ClusterCockpit/cc-metric-collector/pkg/messageProcessor" "github.com/gorilla/mux" "github.com/prometheus/client_golang/prometheus" "github.com/prometheus/client_golang/prometheus/promhttp" @@ -49,11 +50,13 @@ func intToFloat64(input interface{}) (float64, error) { return float64(value), nil case int64: return float64(value), nil + case uint64: + return float64(value), nil } return 0, errors.New("cannot cast value to float64") } -func getLabelValue(metric lp.CCMetric) []string { +func getLabelValue(metric lp.CCMessage) []string { labelValues := []string{} if tid, tidok := metric.GetTag("type-id"); tidok && metric.HasTag("type") { labelValues = append(labelValues, tid) @@ -66,7 +69,7 @@ func getLabelValue(metric lp.CCMetric) []string { return labelValues } -func getLabelNames(metric lp.CCMetric) []string { +func getLabelNames(metric lp.CCMessage) []string { labelNames := []string{} if t, tok := metric.GetTag("type"); tok && metric.HasTag("type-id") { labelNames = append(labelNames, t) @@ -79,7 +82,7 @@ func getLabelNames(metric lp.CCMetric) []string { return labelNames } -func (s *PrometheusSink) newMetric(metric lp.CCMetric) error { +func (s *PrometheusSink) newMetric(metric lp.CCMessage) error { var value float64 = 0 name := metric.Name() opts := prometheus.GaugeOpts{ @@ -117,7 +120,7 @@ func (s *PrometheusSink) newMetric(metric lp.CCMetric) error { return nil } -func (s *PrometheusSink) updateMetric(metric lp.CCMetric) error { +func (s *PrometheusSink) updateMetric(metric lp.CCMessage) error { var value float64 = 0.0 name := 
metric.Name() labelValues := getLabelValue(metric) @@ -150,8 +153,12 @@ func (s *PrometheusSink) updateMetric(metric lp.CCMetric) error { return nil } -func (s *PrometheusSink) Write(m lp.CCMetric) error { - return s.updateMetric(m) +func (s *PrometheusSink) Write(m lp.CCMessage) error { + msg, err := s.mp.ProcessMessage(m) + if err == nil && msg != nil { + err = s.updateMetric(m) + } + return err } func (s *PrometheusSink) Flush() error { @@ -180,6 +187,20 @@ func NewPrometheusSink(name string, config json.RawMessage) (Sink, error) { cclog.ComponentError(s.name, err.Error()) return nil, err } + p, err := mp.NewMessageProcessor() + if err != nil { + return nil, fmt.Errorf("initialization of message processor failed: %v", err.Error()) + } + s.mp = p + if len(s.config.MessageProcessor) > 0 { + err = p.FromConfigJSON(s.config.MessageProcessor) + if err != nil { + return nil, fmt.Errorf("failed parsing JSON for message processor: %v", err.Error()) + } + } + for _, k := range s.config.MetaAsTags { + s.mp.AddMoveMetaToTags("true", k, k) + } s.labelMetrics = make(map[string]*prometheus.GaugeVec) s.nodeMetrics = make(map[string]prometheus.Gauge) s.promWg.Add(1) diff --git a/sinks/prometheusSink.md b/sinks/prometheusSink.md index 0996c43..de1a107 100644 --- a/sinks/prometheusSink.md +++ b/sinks/prometheusSink.md @@ -11,7 +11,11 @@ The `prometheus` sink publishes all metrics via an HTTP server ready to be scrap "type": "prometheus", "host": "localhost", "port": "8080", - "path": "metrics" + "path": "metrics", + "process_messages" : { + "see" : "docs of message processor for valid fields" + }, + "meta_as_tags" : [] } } ``` @@ -21,3 +25,5 @@ The `prometheus` sink publishes all metrics via an HTTP server ready to be scrap - `port`: Portnumber (as string) for the HTTP server - `path`: Path where the metrics should be servered. The metrics will be published at `host`:`port`/`path` - `group_as_namespace`: Most metrics contain a group as meta information like 'memory', 'load'. 
With this the metric names are extended to `group`_`name` if possible. +- `process_messages`: Process messages with given rules before progressing or dropping, see [here](../pkg/messageProcessor/README.md) (optional) +- `meta_as_tags`: print all meta information as tags in the output (deprecated, optional) \ No newline at end of file diff --git a/sinks/sampleSink.go b/sinks/sampleSink.go index fe4719d..24e2911 100644 --- a/sinks/sampleSink.go +++ b/sinks/sampleSink.go @@ -6,8 +6,9 @@ import ( "fmt" "log" + lp "github.com/ClusterCockpit/cc-energy-manager/pkg/cc-message" cclog "github.com/ClusterCockpit/cc-metric-collector/pkg/ccLogger" - lp "github.com/ClusterCockpit/cc-metric-collector/pkg/ccMetric" + mp "github.com/ClusterCockpit/cc-metric-collector/pkg/messageProcessor" ) type SampleSinkConfig struct { @@ -28,9 +29,14 @@ type SampleSink struct { // See: metricSink.go // Code to submit a single CCMetric to the sink -func (s *SampleSink) Write(point lp.CCMetric) error { +func (s *SampleSink) Write(point lp.CCMessage) error { // based on s.meta_as_tags use meta infos as tags - log.Print(point) + // moreover, submit the point to the message processor + // to apply drop/modify rules + msg, err := s.mp.ProcessMessage(point) + if err == nil && msg != nil { + log.Print(msg) + } return nil } @@ -66,10 +72,24 @@ func NewSampleSink(name string, config json.RawMessage) (Sink, error) { } } - // Create lookup map to use meta infos as tags in the output metric - s.meta_as_tags = make(map[string]bool) + // Initialize and configure the message processor + p, err := mp.NewMessageProcessor() + if err != nil { + return nil, fmt.Errorf("initialization of message processor failed: %v", err.Error()) + } + s.mp = p + + // Add message processor configuration + if len(s.config.MessageProcessor) > 0 { + err = p.FromConfigJSON(s.config.MessageProcessor) + if err != nil { + return nil, fmt.Errorf("failed parsing JSON for message processor: %v", err.Error()) + } + } + // Add rules to move 
meta information to tag space + // Replacing the legacy 'meta_as_tags' configuration for _, k := range s.config.MetaAsTags { - s.meta_as_tags[k] = true + s.mp.AddMoveMetaToTags("true", k, k) } // Check if all required fields in the config are set diff --git a/sinks/sinkManager.go b/sinks/sinkManager.go index cd2680f..487a693 100644 --- a/sinks/sinkManager.go +++ b/sinks/sinkManager.go @@ -7,7 +7,7 @@ import ( "sync" cclog "github.com/ClusterCockpit/cc-metric-collector/pkg/ccLogger" - lp "github.com/ClusterCockpit/cc-metric-collector/pkg/ccMetric" + lp "github.com/ClusterCockpit/cc-energy-manager/pkg/cc-message" ) const SINK_MAX_FORWARD = 50 @@ -21,11 +21,12 @@ var AvailableSinks = map[string]func(name string, config json.RawMessage) (Sink, "influxdb": NewInfluxSink, "influxasync": NewInfluxAsyncSink, "http": NewHttpSink, + "prometheus": NewPrometheusSink, } // Metric collector manager data structure type sinkManager struct { - input chan lp.CCMetric // input channel + input chan lp.CCMessage // input channel done chan bool // channel to finish / stop metric sink manager wg *sync.WaitGroup // wait group for all goroutines in cc-metric-collector sinks map[string]Sink // Mapping sink name to sink @@ -35,7 +36,7 @@ type sinkManager struct { // Sink manager access functions type SinkManager interface { Init(wg *sync.WaitGroup, sinkConfigFile string) error - AddInput(input chan lp.CCMetric) + AddInput(input chan lp.CCMessage) AddOutput(name string, config json.RawMessage) error Start() Close() @@ -107,7 +108,7 @@ func (sm *sinkManager) Start() { cclog.ComponentDebug("SinkManager", "DONE") } - toTheSinks := func(p lp.CCMetric) { + toTheSinks := func(p lp.CCMessage) { // Send received metric to all outputs cclog.ComponentDebug("SinkManager", "WRITE", p) for _, s := range sm.sinks { @@ -138,7 +139,7 @@ func (sm *sinkManager) Start() { } // AddInput adds the input channel to the sink manager -func (sm *sinkManager) AddInput(input chan lp.CCMetric) { +func (sm *sinkManager) 
AddInput(input chan lp.CCMessage) { sm.input = input } diff --git a/sinks/stdoutSink.go b/sinks/stdoutSink.go index c235a9b..a95e866 100644 --- a/sinks/stdoutSink.go +++ b/sinks/stdoutSink.go @@ -8,8 +8,9 @@ import ( "strings" // "time" + lp "github.com/ClusterCockpit/cc-energy-manager/pkg/cc-message" cclog "github.com/ClusterCockpit/cc-metric-collector/pkg/ccLogger" - lp "github.com/ClusterCockpit/cc-metric-collector/pkg/ccMetric" + mp "github.com/ClusterCockpit/cc-metric-collector/pkg/messageProcessor" ) type StdoutSink struct { @@ -21,11 +22,14 @@ type StdoutSink struct { } } -func (s *StdoutSink) Write(m lp.CCMetric) error { - fmt.Fprint( - s.output, - m.ToLineProtocol(s.meta_as_tags), - ) +func (s *StdoutSink) Write(m lp.CCMessage) error { + msg, err := s.mp.ProcessMessage(m) + if err == nil && msg != nil { + fmt.Fprint( + s.output, + msg.ToLineProtocol(s.meta_as_tags), + ) + } return nil } @@ -41,6 +45,7 @@ func (s *StdoutSink) Close() { } func NewStdoutSink(name string, config json.RawMessage) (Sink, error) { + s := new(StdoutSink) s.name = fmt.Sprintf("StdoutSink(%s)", name) if len(config) > 0 { @@ -51,6 +56,11 @@ func NewStdoutSink(name string, config json.RawMessage) (Sink, error) { return nil, err } } + p, err := mp.NewMessageProcessor() + if err != nil { + return nil, fmt.Errorf("initialization of message processor failed: %v", err.Error()) + } + s.mp = p s.output = os.Stdout if len(s.config.Output) > 0 { @@ -67,10 +77,21 @@ func NewStdoutSink(name string, config json.RawMessage) (Sink, error) { s.output = f } } + + // Add message processor configuration + if len(s.config.MessageProcessor) > 0 { + err = s.mp.FromConfigJSON(s.config.MessageProcessor) + if err != nil { + return nil, fmt.Errorf("failed parsing JSON for message processor: %v", err.Error()) + } + } // Create lookup map to use meta infos as tags in the output metric - s.meta_as_tags = make(map[string]bool) + // s.meta_as_tags = make(map[string]bool) + // for _, k := range s.config.MetaAsTags 
{ + // s.meta_as_tags[k] = true + // } for _, k := range s.config.MetaAsTags { - s.meta_as_tags[k] = true + s.mp.AddMoveMetaToTags("true", k, k) } return s, nil diff --git a/sinks/stdoutSink.md b/sinks/stdoutSink.md index 3fe3308..aef1db1 100644 --- a/sinks/stdoutSink.md +++ b/sinks/stdoutSink.md @@ -10,7 +10,11 @@ The `stdout` sink is the most simple sink provided by cc-metric-collector. It wr "": { "type": "stdout", "meta_as_tags" : [], - "output_file" : "mylogfile.log" + "output_file" : "mylogfile.log", + "process_messages" : { + "see" : "docs of message processor for valid fields" + }, + "meta_as_tags" : [] } } ``` @@ -18,5 +22,6 @@ The `stdout` sink is the most simple sink provided by cc-metric-collector. It wr - `type`: makes the sink an `stdout` sink - `meta_as_tags`: print meta information as tags in the output (optional) - `output_file`: Write all data to the selected file (optional). There are two 'special' files: `stdout` and `stderr`. If this option is not provided, the default value is `stdout` - +- `process_messages`: Process messages with given rules before passing them on or dropping them, see [here](../pkg/messageProcessor/README.md) (optional) +- `meta_as_tags`: print all meta information as tags in the output (deprecated, optional)