Merge develop branch into main (#123)

* Add cpu_used (all-cpu_idle) to CpustatCollector

* Update cc-metric-collector.init

* Allow selection of timestamp precision in HttpSink

* Add comment about precision requirement for cc-metric-store

* Fix for API changes in gofish@v0.15.0

* Update requirements to latest version

* Read sensors through redfish

* Update golang toolchain to 1.21

* Remove stray error check

* Update main config in configuration.md

* Update Release action to use golang 1.22 stable release, no golang RPMs anymore

* Update runonce action to use golang 1.22 stable release, no golang RPMs anymore

* Update README.md

Use right JSON type in configuration

* Update sink's README

* Test whether ipmitool or ipmi-sensors can be executed without errors

* Little fixes to the prometheus sink (#115)

* Add uint64 to float64 cast option

* Add prometheus sink to the list of available sinks

* Add aggregated counters by gpu for nvlink errors

---------

Co-authored-by: Michael Schwarz <schwarz@uni-paderborn.de>

* Ccmessage migration (#119)

* Add cpu_used (all-cpu_idle) to CpustatCollector

* Update cc-metric-collector.init

* Allow selection of timestamp precision in HttpSink

* Add comment about precision requirement for cc-metric-store

* Fix for API changes in gofish@v0.15.0

* Update requirements to latest version

* Read sensors through redfish

* Update golang toolchain to 1.21

* Remove stray error check

* Update main config in configuration.md

* Update Release action to use golang 1.22 stable release, no golang RPMs anymore

* Update runonce action to use golang 1.22 stable release, no golang RPMs anymore

* Switch to CCMessage for all files.

---------

Co-authored-by: Holger Obermaier <Holger.Obermaier@kit.edu>
Co-authored-by: Holger Obermaier <40787752+ho-ob@users.noreply.github.com>

* Switch to ccmessage also for latest additions in nvidiaMetric

* New Message processor (#118)

* Add cpu_used (all-cpu_idle) to CpustatCollector

* Update cc-metric-collector.init

* Allow selection of timestamp precision in HttpSink

* Add comment about precision requirement for cc-metric-store

* Fix for API changes in gofish@v0.15.0

* Update requirements to latest version

* Read sensors through redfish

* Update golang toolchain to 1.21

* Remove stray error check

* Update main config in configuration.md

* Update Release action to use golang 1.22 stable release, no golang RPMs anymore

* Update runonce action to use golang 1.22 stable release, no golang RPMs anymore

* New message processor to check whether a message should be dropped or manipulate it in flight

* Create a copy of message before manipulation

---------

Co-authored-by: Holger Obermaier <Holger.Obermaier@kit.edu>
Co-authored-by: Holger Obermaier <40787752+ho-ob@users.noreply.github.com>

* Update collector's Makefile and go.mod/sum files

* Use message processor in router, all sinks and all receivers

* Add support for credential file (NKEY) to NATS sink and receiver

* Fix JSON keys in message processor configuration

* Update docs for message processor, router and the default router config file

* Add link to expr syntax and fix regex matching docs

* Update sample collectors

* Minor style change in collector manager

* Some helpers for ccTopology

* LIKWID collector: write log owner change only once

* Fix for metrics without units and reduce debugging messages for messageProcessor

* Use shorted hostname for hostname added by router

* Define default port for NATS

* CPUstat collector: only add unit for applicable metrics

* Add precision option to all sinks using Influx's encoder

* Add message processor to all sink documentation

* Add units to documentation of cpustat collector

---------

Co-authored-by: Holger Obermaier <Holger.Obermaier@kit.edu>
Co-authored-by: Holger Obermaier <40787752+ho-ob@users.noreply.github.com>
Co-authored-by: oscarminus <me@oscarminus.de>
Co-authored-by: Michael Schwarz <schwarz@uni-paderborn.de>
This commit is contained in:
Thomas Gruber
2024-12-19 23:00:14 +01:00
committed by GitHub
parent 21646e1edf
commit 7840de7b82
74 changed files with 1902 additions and 1017 deletions

View File

@@ -1,5 +1,5 @@
# LIKWID version
LIKWID_VERSION := 5.2.2
LIKWID_VERSION := 5.4.1
LIKWID_INSTALLED_FOLDER := $(shell dirname $$(which likwid-topology 2>/dev/null) 2>/dev/null)
LIKWID_FOLDER := $(CURDIR)/likwid
@@ -23,7 +23,7 @@ likwid:
mkdir --parents --verbose "$${BUILD_FOLDER}"
wget --output-document=- http://ftp.rrze.uni-erlangen.de/mirrors/likwid/likwid-$(LIKWID_VERSION).tar.gz |
tar --directory="$${BUILD_FOLDER}" --extract --gz
install -D --verbose --preserve-timestamps --mode=0644 --target-directory="$(LIKWID_FOLDER)" "$${BUILD_FOLDER}/likwid-$(LIKWID_VERSION)/src/includes"/likwid*.h "$${BUILD_FOLDER}/likwid-$(LIKWID_VERSION)/src/includes"/bstrlib.h
install -D --verbose --preserve-timestamps --mode=0644 --target-directory="$(LIKWID_FOLDER)" "$${BUILD_FOLDER}/likwid-$(LIKWID_VERSION)/src/includes"/likwid*.h
rm --recursive "$${BUILD_FOLDER}"
fi

View File

@@ -15,7 +15,7 @@ import (
"time"
cclog "github.com/ClusterCockpit/cc-metric-collector/pkg/ccLogger"
lp "github.com/ClusterCockpit/cc-metric-collector/pkg/ccMetric"
lp "github.com/ClusterCockpit/cc-energy-manager/pkg/cc-message"
)
const DEFAULT_BEEGFS_CMD = "beegfs-ctl"
@@ -110,7 +110,7 @@ func (m *BeegfsMetaCollector) Init(config json.RawMessage) error {
return nil
}
func (m *BeegfsMetaCollector) Read(interval time.Duration, output chan lp.CCMetric) {
func (m *BeegfsMetaCollector) Read(interval time.Duration, output chan lp.CCMessage) {
if !m.init {
return
}
@@ -216,7 +216,7 @@ func (m *BeegfsMetaCollector) Read(interval time.Duration, output chan lp.CCMetr
for key, data := range m.matches {
value, _ := strconv.ParseFloat(data, 32)
y, err := lp.New(key, m.tags, m.meta, map[string]interface{}{"value": value}, time.Now())
y, err := lp.NewMessage(key, m.tags, m.meta, map[string]interface{}{"value": value}, time.Now())
if err == nil {
output <- y
}

View File

@@ -15,7 +15,7 @@ import (
"time"
cclog "github.com/ClusterCockpit/cc-metric-collector/pkg/ccLogger"
lp "github.com/ClusterCockpit/cc-metric-collector/pkg/ccMetric"
lp "github.com/ClusterCockpit/cc-energy-manager/pkg/cc-message"
)
// Struct for the collector-specific JSON config
@@ -103,7 +103,7 @@ func (m *BeegfsStorageCollector) Init(config json.RawMessage) error {
return nil
}
func (m *BeegfsStorageCollector) Read(interval time.Duration, output chan lp.CCMetric) {
func (m *BeegfsStorageCollector) Read(interval time.Duration, output chan lp.CCMessage) {
if !m.init {
return
}
@@ -208,7 +208,7 @@ func (m *BeegfsStorageCollector) Read(interval time.Duration, output chan lp.CCM
for key, data := range m.matches {
value, _ := strconv.ParseFloat(data, 32)
y, err := lp.New(key, m.tags, m.meta, map[string]interface{}{"value": value}, time.Now())
y, err := lp.NewMessage(key, m.tags, m.meta, map[string]interface{}{"value": value}, time.Now())
if err == nil {
output <- y
}

View File

@@ -6,8 +6,8 @@ import (
"sync"
"time"
lp "github.com/ClusterCockpit/cc-energy-manager/pkg/cc-message"
cclog "github.com/ClusterCockpit/cc-metric-collector/pkg/ccLogger"
lp "github.com/ClusterCockpit/cc-metric-collector/pkg/ccMetric"
mct "github.com/ClusterCockpit/cc-metric-collector/pkg/multiChanTicker"
)
@@ -47,7 +47,7 @@ var AvailableCollectors = map[string]MetricCollector{
type collectorManager struct {
collectors []MetricCollector // List of metric collectors to read in parallel
serial []MetricCollector // List of metric collectors to read serially
output chan lp.CCMetric // Output channels
output chan lp.CCMessage // Output channels
done chan bool // channel to finish / stop metric collector manager
ticker mct.MultiChanTicker // periodically ticking once each interval
duration time.Duration // duration (for metrics that measure over a given duration)
@@ -60,7 +60,7 @@ type collectorManager struct {
// Metric collector manager access functions
type CollectorManager interface {
Init(ticker mct.MultiChanTicker, duration time.Duration, wg *sync.WaitGroup, collectConfigFile string) error
AddOutput(output chan lp.CCMetric)
AddOutput(output chan lp.CCMessage)
Start()
Close()
}
@@ -187,7 +187,7 @@ func (cm *collectorManager) Start() {
}
// AddOutput adds the output channel to the metric collector manager
func (cm *collectorManager) AddOutput(output chan lp.CCMetric) {
func (cm *collectorManager) AddOutput(output chan lp.CCMessage) {
cm.output = output
}

View File

@@ -10,8 +10,8 @@ import (
"strings"
"time"
lp "github.com/ClusterCockpit/cc-energy-manager/pkg/cc-message"
cclog "github.com/ClusterCockpit/cc-metric-collector/pkg/ccLogger"
lp "github.com/ClusterCockpit/cc-metric-collector/pkg/ccMetric"
)
// CPUFreqCollector
@@ -112,14 +112,14 @@ func (m *CPUFreqCpuInfoCollector) Init(config json.RawMessage) error {
// Check if at least one CPU with frequency information was detected
if len(m.topology) == 0 {
return fmt.Errorf("No CPU frequency info found in %s", cpuInfoFile)
return fmt.Errorf("no CPU frequency info found in %s", cpuInfoFile)
}
m.init = true
return nil
}
func (m *CPUFreqCpuInfoCollector) Read(interval time.Duration, output chan lp.CCMetric) {
func (m *CPUFreqCpuInfoCollector) Read(interval time.Duration, output chan lp.CCMessage) {
// Check if already initialized
if !m.init {
return
@@ -154,7 +154,7 @@ func (m *CPUFreqCpuInfoCollector) Read(interval time.Duration, output chan lp.CC
fmt.Sprintf("Read(): Failed to convert cpu MHz '%s' to float64: %v", lineSplit[1], err))
return
}
if y, err := lp.New("cpufreq", t.tagSet, m.meta, map[string]interface{}{"value": value}, now); err == nil {
if y, err := lp.NewMessage("cpufreq", t.tagSet, m.meta, map[string]interface{}{"value": value}, now); err == nil {
output <- y
}
}

View File

@@ -10,7 +10,7 @@ import (
"time"
cclog "github.com/ClusterCockpit/cc-metric-collector/pkg/ccLogger"
lp "github.com/ClusterCockpit/cc-metric-collector/pkg/ccMetric"
lp "github.com/ClusterCockpit/cc-energy-manager/pkg/cc-message"
"github.com/ClusterCockpit/cc-metric-collector/pkg/ccTopology"
"golang.org/x/sys/unix"
)
@@ -91,7 +91,7 @@ func (m *CPUFreqCollector) Init(config json.RawMessage) error {
return nil
}
func (m *CPUFreqCollector) Read(interval time.Duration, output chan lp.CCMetric) {
func (m *CPUFreqCollector) Read(interval time.Duration, output chan lp.CCMessage) {
// Check if already initialized
if !m.init {
return
@@ -117,7 +117,7 @@ func (m *CPUFreqCollector) Read(interval time.Duration, output chan lp.CCMetric)
continue
}
if y, err := lp.New("cpufreq", t.tagSet, m.meta, map[string]interface{}{"value": cpuFreq}, now); err == nil {
if y, err := lp.NewMessage("cpufreq", t.tagSet, m.meta, map[string]interface{}{"value": cpuFreq}, now); err == nil {
output <- y
}
}

View File

@@ -9,8 +9,8 @@ import (
"strings"
"time"
lp "github.com/ClusterCockpit/cc-energy-manager/pkg/cc-message"
cclog "github.com/ClusterCockpit/cc-metric-collector/pkg/ccLogger"
lp "github.com/ClusterCockpit/cc-metric-collector/pkg/ccMetric"
sysconf "github.com/tklauser/go-sysconf"
)
@@ -34,7 +34,7 @@ func (m *CpustatCollector) Init(config json.RawMessage) error {
m.name = "CpustatCollector"
m.setup()
m.parallel = true
m.meta = map[string]string{"source": m.name, "group": "CPU", "unit": "Percent"}
m.meta = map[string]string{"source": m.name, "group": "CPU"}
m.nodetags = map[string]string{"type": "node"}
if len(config) > 0 {
err := json.Unmarshal(config, &m.config)
@@ -105,7 +105,7 @@ func (m *CpustatCollector) Init(config json.RawMessage) error {
return nil
}
func (m *CpustatCollector) parseStatLine(linefields []string, tags map[string]string, output chan lp.CCMetric, now time.Time, tsdelta time.Duration) {
func (m *CpustatCollector) parseStatLine(linefields []string, tags map[string]string, output chan lp.CCMessage, now time.Time, tsdelta time.Duration) {
values := make(map[string]float64)
clktck, _ := sysconf.Sysconf(sysconf.SC_CLK_TCK)
for match, index := range m.matches {
@@ -122,21 +122,23 @@ func (m *CpustatCollector) parseStatLine(linefields []string, tags map[string]st
sum := float64(0)
for name, value := range values {
sum += value
y, err := lp.New(name, tags, m.meta, map[string]interface{}{"value": value * 100}, now)
y, err := lp.NewMessage(name, tags, m.meta, map[string]interface{}{"value": value * 100}, now)
if err == nil {
y.AddTag("unit", "Percent")
output <- y
}
}
if v, ok := values["cpu_idle"]; ok {
sum -= v
y, err := lp.New("cpu_used", tags, m.meta, map[string]interface{}{"value": sum * 100}, now)
y, err := lp.NewMessage("cpu_used", tags, m.meta, map[string]interface{}{"value": sum * 100}, now)
if err == nil {
y.AddTag("unit", "Percent")
output <- y
}
}
}
func (m *CpustatCollector) Read(interval time.Duration, output chan lp.CCMetric) {
func (m *CpustatCollector) Read(interval time.Duration, output chan lp.CCMessage) {
if !m.init {
return
}
@@ -162,7 +164,7 @@ func (m *CpustatCollector) Read(interval time.Duration, output chan lp.CCMetric)
}
}
num_cpus_metric, err := lp.New("num_cpus",
num_cpus_metric, err := lp.NewMessage("num_cpus",
m.nodetags,
m.meta,
map[string]interface{}{"value": int(num_cpus)},

View File

@@ -13,14 +13,15 @@ The `cpustat` collector reads data from `/proc/stat` and outputs a handful **nod
Metrics:
* `cpu_user`
* `cpu_nice`
* `cpu_system`
* `cpu_idle`
* `cpu_iowait`
* `cpu_irq`
* `cpu_softirq`
* `cpu_steal`
* `cpu_guest`
* `cpu_guest_nice`
* `cpu_used` = `cpu_* - cpu_idle`
* `cpu_user` with `unit=Percent`
* `cpu_nice` with `unit=Percent`
* `cpu_system` with `unit=Percent`
* `cpu_idle` with `unit=Percent`
* `cpu_iowait` with `unit=Percent`
* `cpu_irq` with `unit=Percent`
* `cpu_softirq` with `unit=Percent`
* `cpu_steal` with `unit=Percent`
* `cpu_guest` with `unit=Percent`
* `cpu_guest_nice` with `unit=Percent`
* `cpu_used` = `cpu_* - cpu_idle` with `unit=Percent`
* `num_cpus`

View File

@@ -9,7 +9,7 @@ import (
"strings"
"time"
lp "github.com/ClusterCockpit/cc-metric-collector/pkg/ccMetric"
lp "github.com/ClusterCockpit/cc-energy-manager/pkg/cc-message"
influx "github.com/influxdata/line-protocol"
)
@@ -75,7 +75,7 @@ var DefaultTime = func() time.Time {
return time.Unix(42, 0)
}
func (m *CustomCmdCollector) Read(interval time.Duration, output chan lp.CCMetric) {
func (m *CustomCmdCollector) Read(interval time.Duration, output chan lp.CCMessage) {
if !m.init {
return
}

View File

@@ -9,7 +9,7 @@ import (
"time"
cclog "github.com/ClusterCockpit/cc-metric-collector/pkg/ccLogger"
lp "github.com/ClusterCockpit/cc-metric-collector/pkg/ccMetric"
lp "github.com/ClusterCockpit/cc-energy-manager/pkg/cc-message"
)
// "log"
@@ -48,7 +48,7 @@ func (m *DiskstatCollector) Init(config json.RawMessage) error {
return nil
}
func (m *DiskstatCollector) Read(interval time.Duration, output chan lp.CCMetric) {
func (m *DiskstatCollector) Read(interval time.Duration, output chan lp.CCMessage) {
if !m.init {
return
}
@@ -92,13 +92,13 @@ func (m *DiskstatCollector) Read(interval time.Duration, output chan lp.CCMetric
}
tags := map[string]string{"type": "node", "device": linefields[0]}
total := (stat.Blocks * uint64(stat.Bsize)) / uint64(1000000000)
y, err := lp.New("disk_total", tags, m.meta, map[string]interface{}{"value": total}, time.Now())
y, err := lp.NewMessage("disk_total", tags, m.meta, map[string]interface{}{"value": total}, time.Now())
if err == nil {
y.AddMeta("unit", "GBytes")
output <- y
}
free := (stat.Bfree * uint64(stat.Bsize)) / uint64(1000000000)
y, err = lp.New("disk_free", tags, m.meta, map[string]interface{}{"value": free}, time.Now())
y, err = lp.NewMessage("disk_free", tags, m.meta, map[string]interface{}{"value": free}, time.Now())
if err == nil {
y.AddMeta("unit", "GBytes")
output <- y
@@ -110,7 +110,7 @@ func (m *DiskstatCollector) Read(interval time.Duration, output chan lp.CCMetric
}
}
}
y, err := lp.New("part_max_used", map[string]string{"type": "node"}, m.meta, map[string]interface{}{"value": int(part_max_used)}, time.Now())
y, err := lp.NewMessage("part_max_used", map[string]string{"type": "node"}, m.meta, map[string]interface{}{"value": int(part_max_used)}, time.Now())
if err == nil {
y.AddMeta("unit", "percent")
output <- y

View File

@@ -14,7 +14,7 @@ import (
"time"
cclog "github.com/ClusterCockpit/cc-metric-collector/pkg/ccLogger"
lp "github.com/ClusterCockpit/cc-metric-collector/pkg/ccMetric"
lp "github.com/ClusterCockpit/cc-energy-manager/pkg/cc-message"
)
const DEFAULT_GPFS_CMD = "mmpmon"
@@ -94,7 +94,7 @@ func (m *GpfsCollector) Init(config json.RawMessage) error {
return nil
}
func (m *GpfsCollector) Read(interval time.Duration, output chan lp.CCMetric) {
func (m *GpfsCollector) Read(interval time.Duration, output chan lp.CCMessage) {
// Check if already initialized
if !m.init {
return
@@ -218,7 +218,7 @@ func (m *GpfsCollector) Read(interval time.Duration, output chan lp.CCMetric) {
continue
}
if y, err :=
lp.New(
lp.NewMessage(
"gpfs_bytes_read",
m.tags,
m.meta,
@@ -234,7 +234,7 @@ func (m *GpfsCollector) Read(interval time.Duration, output chan lp.CCMetric) {
if lastBytesRead := m.lastState[filesystem].bytesRead; lastBytesRead >= 0 {
bwRead := float64(bytesRead-lastBytesRead) / timeDiff
if y, err :=
lp.New(
lp.NewMessage(
"gpfs_bw_read",
m.tags,
m.meta,
@@ -258,7 +258,7 @@ func (m *GpfsCollector) Read(interval time.Duration, output chan lp.CCMetric) {
continue
}
if y, err :=
lp.New(
lp.NewMessage(
"gpfs_bytes_written",
m.tags,
m.meta,
@@ -274,7 +274,7 @@ func (m *GpfsCollector) Read(interval time.Duration, output chan lp.CCMetric) {
if lastBytesWritten := m.lastState[filesystem].bytesRead; lastBytesWritten >= 0 {
bwWrite := float64(bytesWritten-lastBytesWritten) / timeDiff
if y, err :=
lp.New(
lp.NewMessage(
"gpfs_bw_write",
m.tags,
m.meta,
@@ -304,7 +304,7 @@ func (m *GpfsCollector) Read(interval time.Duration, output chan lp.CCMetric) {
fmt.Sprintf("Read(): Failed to convert number of opens '%s' to int64: %v", key_value["_oc_"], err))
continue
}
if y, err := lp.New("gpfs_num_opens", m.tags, m.meta, map[string]interface{}{"value": numOpens}, timestamp); err == nil {
if y, err := lp.NewMessage("gpfs_num_opens", m.tags, m.meta, map[string]interface{}{"value": numOpens}, timestamp); err == nil {
output <- y
}
@@ -316,7 +316,7 @@ func (m *GpfsCollector) Read(interval time.Duration, output chan lp.CCMetric) {
fmt.Sprintf("Read(): Failed to convert number of closes: '%s' to int64: %v", key_value["_cc_"], err))
continue
}
if y, err := lp.New("gpfs_num_closes", m.tags, m.meta, map[string]interface{}{"value": numCloses}, timestamp); err == nil {
if y, err := lp.NewMessage("gpfs_num_closes", m.tags, m.meta, map[string]interface{}{"value": numCloses}, timestamp); err == nil {
output <- y
}
@@ -328,7 +328,7 @@ func (m *GpfsCollector) Read(interval time.Duration, output chan lp.CCMetric) {
fmt.Sprintf("Read(): Failed to convert number of reads: '%s' to int64: %v", key_value["_rdc_"], err))
continue
}
if y, err := lp.New("gpfs_num_reads", m.tags, m.meta, map[string]interface{}{"value": numReads}, timestamp); err == nil {
if y, err := lp.NewMessage("gpfs_num_reads", m.tags, m.meta, map[string]interface{}{"value": numReads}, timestamp); err == nil {
output <- y
}
@@ -340,7 +340,7 @@ func (m *GpfsCollector) Read(interval time.Duration, output chan lp.CCMetric) {
fmt.Sprintf("Read(): Failed to convert number of writes: '%s' to int64: %v", key_value["_wc_"], err))
continue
}
if y, err := lp.New("gpfs_num_writes", m.tags, m.meta, map[string]interface{}{"value": numWrites}, timestamp); err == nil {
if y, err := lp.NewMessage("gpfs_num_writes", m.tags, m.meta, map[string]interface{}{"value": numWrites}, timestamp); err == nil {
output <- y
}
@@ -352,7 +352,7 @@ func (m *GpfsCollector) Read(interval time.Duration, output chan lp.CCMetric) {
fmt.Sprintf("Read(): Failed to convert number of read directories: '%s' to int64: %v", key_value["_dir_"], err))
continue
}
if y, err := lp.New("gpfs_num_readdirs", m.tags, m.meta, map[string]interface{}{"value": numReaddirs}, timestamp); err == nil {
if y, err := lp.NewMessage("gpfs_num_readdirs", m.tags, m.meta, map[string]interface{}{"value": numReaddirs}, timestamp); err == nil {
output <- y
}
@@ -364,7 +364,7 @@ func (m *GpfsCollector) Read(interval time.Duration, output chan lp.CCMetric) {
fmt.Sprintf("Read(): Failed to convert number of inode updates: '%s' to int: %v", key_value["_iu_"], err))
continue
}
if y, err := lp.New("gpfs_num_inode_updates", m.tags, m.meta, map[string]interface{}{"value": numInodeUpdates}, timestamp); err == nil {
if y, err := lp.NewMessage("gpfs_num_inode_updates", m.tags, m.meta, map[string]interface{}{"value": numInodeUpdates}, timestamp); err == nil {
output <- y
}
@@ -372,7 +372,7 @@ func (m *GpfsCollector) Read(interval time.Duration, output chan lp.CCMetric) {
if m.config.SendTotalValues {
bytesTotal := bytesRead + bytesWritten
if y, err :=
lp.New("gpfs_bytes_total",
lp.NewMessage("gpfs_bytes_total",
m.tags,
m.meta,
map[string]interface{}{
@@ -385,7 +385,7 @@ func (m *GpfsCollector) Read(interval time.Duration, output chan lp.CCMetric) {
}
iops := numReads + numWrites
if y, err :=
lp.New("gpfs_iops",
lp.NewMessage("gpfs_iops",
m.tags,
m.meta,
map[string]interface{}{
@@ -397,7 +397,7 @@ func (m *GpfsCollector) Read(interval time.Duration, output chan lp.CCMetric) {
}
metaops := numInodeUpdates + numCloses + numOpens + numReaddirs
if y, err :=
lp.New("gpfs_metaops",
lp.NewMessage("gpfs_metaops",
m.tags,
m.meta,
map[string]interface{}{

View File

@@ -5,7 +5,7 @@ import (
"os"
cclog "github.com/ClusterCockpit/cc-metric-collector/pkg/ccLogger"
lp "github.com/ClusterCockpit/cc-metric-collector/pkg/ccMetric"
lp "github.com/ClusterCockpit/cc-energy-manager/pkg/cc-message"
"golang.org/x/sys/unix"
"encoding/json"
@@ -182,7 +182,7 @@ func (m *InfinibandCollector) Init(config json.RawMessage) error {
}
// Read reads Infiniband counter files below IB_BASEPATH
func (m *InfinibandCollector) Read(interval time.Duration, output chan lp.CCMetric) {
func (m *InfinibandCollector) Read(interval time.Duration, output chan lp.CCMessage) {
// Check if already initialized
if !m.init {
@@ -230,7 +230,7 @@ func (m *InfinibandCollector) Read(interval time.Duration, output chan lp.CCMetr
// Send absolut values
if m.config.SendAbsoluteValues {
if y, err :=
lp.New(
lp.NewMessage(
counterDef.name,
info.tagSet,
m.meta,
@@ -248,7 +248,7 @@ func (m *InfinibandCollector) Read(interval time.Duration, output chan lp.CCMetr
if counterDef.lastState >= 0 {
rate := float64((counterDef.currentState - counterDef.lastState)) / timeDiff
if y, err :=
lp.New(
lp.NewMessage(
counterDef.name+"_bw",
info.tagSet,
m.meta,
@@ -278,7 +278,7 @@ func (m *InfinibandCollector) Read(interval time.Duration, output chan lp.CCMetr
// Send total values
if m.config.SendTotalValues {
if y, err :=
lp.New(
lp.NewMessage(
"ib_total",
info.tagSet,
m.meta,
@@ -291,7 +291,7 @@ func (m *InfinibandCollector) Read(interval time.Duration, output chan lp.CCMetr
}
if y, err :=
lp.New(
lp.NewMessage(
"ib_total_pkts",
info.tagSet,
m.meta,

View File

@@ -5,7 +5,7 @@ import (
"os"
cclog "github.com/ClusterCockpit/cc-metric-collector/pkg/ccLogger"
lp "github.com/ClusterCockpit/cc-metric-collector/pkg/ccMetric"
lp "github.com/ClusterCockpit/cc-energy-manager/pkg/cc-message"
// "log"
"encoding/json"
@@ -107,7 +107,7 @@ func (m *IOstatCollector) Init(config json.RawMessage) error {
return err
}
func (m *IOstatCollector) Read(interval time.Duration, output chan lp.CCMetric) {
func (m *IOstatCollector) Read(interval time.Duration, output chan lp.CCMessage) {
if !m.init {
return
}
@@ -139,7 +139,7 @@ func (m *IOstatCollector) Read(interval time.Duration, output chan lp.CCMetric)
x, err := strconv.ParseInt(linefields[idx], 0, 64)
if err == nil {
diff := x - entry.lastValues[name]
y, err := lp.New(name, entry.tags, m.meta, map[string]interface{}{"value": int(diff)}, time.Now())
y, err := lp.NewMessage(name, entry.tags, m.meta, map[string]interface{}{"value": int(diff)}, time.Now())
if err == nil {
output <- y
}

View File

@@ -14,7 +14,7 @@ import (
"time"
cclog "github.com/ClusterCockpit/cc-metric-collector/pkg/ccLogger"
lp "github.com/ClusterCockpit/cc-metric-collector/pkg/ccMetric"
lp "github.com/ClusterCockpit/cc-energy-manager/pkg/cc-message"
)
const IPMISENSORS_PATH = `ipmi-sensors`
@@ -83,7 +83,7 @@ func (m *IpmiCollector) Init(config json.RawMessage) error {
return nil
}
func (m *IpmiCollector) readIpmiTool(cmd string, output chan lp.CCMetric) {
func (m *IpmiCollector) readIpmiTool(cmd string, output chan lp.CCMessage) {
// Setup ipmitool command
command := exec.Command(cmd, "sensor")
@@ -121,7 +121,7 @@ func (m *IpmiCollector) readIpmiTool(cmd string, output chan lp.CCMetric) {
unit = "Watts"
}
y, err := lp.New(name, map[string]string{"type": "node"}, m.meta, map[string]interface{}{"value": v}, time.Now())
y, err := lp.NewMessage(name, map[string]string{"type": "node"}, m.meta, map[string]interface{}{"value": v}, time.Now())
if err == nil {
y.AddMeta("unit", unit)
output <- y
@@ -141,7 +141,7 @@ func (m *IpmiCollector) readIpmiTool(cmd string, output chan lp.CCMetric) {
}
}
func (m *IpmiCollector) readIpmiSensors(cmd string, output chan lp.CCMetric) {
func (m *IpmiCollector) readIpmiSensors(cmd string, output chan lp.CCMessage) {
command := exec.Command(cmd, "--comma-separated-output", "--sdr-cache-recreate")
command.Wait()
@@ -159,7 +159,7 @@ func (m *IpmiCollector) readIpmiSensors(cmd string, output chan lp.CCMetric) {
v, err := strconv.ParseFloat(lv[3], 64)
if err == nil {
name := strings.ToLower(strings.Replace(lv[1], " ", "_", -1))
y, err := lp.New(name, map[string]string{"type": "node"}, m.meta, map[string]interface{}{"value": v}, time.Now())
y, err := lp.NewMessage(name, map[string]string{"type": "node"}, m.meta, map[string]interface{}{"value": v}, time.Now())
if err == nil {
if len(lv) > 4 {
y.AddMeta("unit", lv[4])
@@ -171,7 +171,7 @@ func (m *IpmiCollector) readIpmiSensors(cmd string, output chan lp.CCMetric) {
}
}
func (m *IpmiCollector) Read(interval time.Duration, output chan lp.CCMetric) {
func (m *IpmiCollector) Read(interval time.Duration, output chan lp.CCMessage) {
// Check if already initialized
if !m.init {

View File

@@ -24,9 +24,9 @@ import (
"time"
"unsafe"
lp "github.com/ClusterCockpit/cc-energy-manager/pkg/cc-message"
agg "github.com/ClusterCockpit/cc-metric-collector/internal/metricAggregator"
cclog "github.com/ClusterCockpit/cc-metric-collector/pkg/ccLogger"
lp "github.com/ClusterCockpit/cc-metric-collector/pkg/ccMetric"
topo "github.com/ClusterCockpit/cc-metric-collector/pkg/ccTopology"
"github.com/NVIDIA/go-nvml/pkg/dl"
"github.com/fsnotify/fsnotify"
@@ -43,7 +43,7 @@ const (
type LikwidCollectorMetricConfig struct {
Name string `json:"name"` // Name of the metric
Calc string `json:"calc"` // Calculation for the metric using
Type string `json:"type"` // Metric type (aka node, socket, cpu, ...)
Type string `json:"type"` // Metric type (aka node, socket, hwthread, ...)
Publish bool `json:"publish"`
SendCoreTotalVal bool `json:"send_core_total_values,omitempty"`
SendSocketTotalVal bool `json:"send_socket_total_values,omitempty"`
@@ -91,6 +91,8 @@ type LikwidCollector struct {
running bool
initialized bool
needs_reinit bool
myuid int
lock_err_once bool
likwidGroups map[C.int]LikwidEventsetConfig
lock sync.Mutex
measureThread thread.Thread
@@ -204,6 +206,7 @@ func (m *LikwidCollector) Init(config json.RawMessage) error {
m.initialized = false
m.needs_reinit = true
m.running = false
m.myuid = os.Getuid()
m.config.AccessMode = LIKWID_DEF_ACCESSMODE
m.config.LibraryPath = LIKWID_LIB_NAME
m.config.LockfilePath = LIKWID_DEF_LOCKFILE
@@ -390,14 +393,24 @@ func (m *LikwidCollector) takeMeasurement(evidx int, evset LikwidEventsetConfig,
}
// Check file ownership
uid := info.Sys().(*syscall.Stat_t).Uid
if uid != uint32(os.Getuid()) {
if uid != uint32(m.myuid) {
usr, err := user.LookupId(fmt.Sprint(uid))
if err == nil {
return true, fmt.Errorf("Access to performance counters locked by %s", usr.Username)
err = fmt.Errorf("access to performance counters locked by %s", usr.Username)
} else {
return true, fmt.Errorf("Access to performance counters locked by %d", uid)
err = fmt.Errorf("access to performance counters locked by %d", uid)
}
// delete error if we already returned the error once.
if !m.lock_err_once {
m.lock_err_once = true
} else {
err = nil
}
return true, err
}
// reset lock_err_once
m.lock_err_once = false
// Add the lock file to the watcher
err = watcher.Add(m.config.LockfilePath)
if err != nil {
@@ -436,9 +449,7 @@ func (m *LikwidCollector) takeMeasurement(evidx int, evset LikwidEventsetConfig,
gid = C.perfmon_addEventSet(evset.estr)
}
if gid < 0 {
return true, fmt.Errorf("failed to add events %s, error %d", evset.go_estr, gid)
} else {
evset.gid = gid
return true, fmt.Errorf("failed to add events %s, id %d, error %d", evset.go_estr, evidx, gid)
}
// Setup all performance monitoring counters of an eventSet
@@ -549,11 +560,12 @@ func (m *LikwidCollector) takeMeasurement(evidx int, evset LikwidEventsetConfig,
}
// Get all measurement results for an event set, derive the metric values out of the measurement results and send it
func (m *LikwidCollector) calcEventsetMetrics(evset LikwidEventsetConfig, interval time.Duration, output chan lp.CCMetric) error {
func (m *LikwidCollector) calcEventsetMetrics(evset LikwidEventsetConfig, interval time.Duration, output chan lp.CCMessage) error {
invClock := float64(1.0 / m.basefreq)
for _, tid := range m.cpu2tid {
evset.results[tid]["inverseClock"] = invClock
evset.results[tid]["gotime"] = interval.Seconds()
}
// Go over the event set metrics, derive the value out of the event:counter values and send it
@@ -582,7 +594,7 @@ func (m *LikwidCollector) calcEventsetMetrics(evset LikwidEventsetConfig, interv
if !math.IsNaN(value) && metric.Publish {
fields := map[string]interface{}{"value": value}
y, err :=
lp.New(
lp.NewMessage(
metric.Name,
map[string]string{
"type": metric.Type,
@@ -619,7 +631,7 @@ func (m *LikwidCollector) calcEventsetMetrics(evset LikwidEventsetConfig, interv
for coreID, value := range totalCoreValues {
y, err :=
lp.New(
lp.NewMessage(
metric.Name,
map[string]string{
"type": "core",
@@ -656,7 +668,7 @@ func (m *LikwidCollector) calcEventsetMetrics(evset LikwidEventsetConfig, interv
for socketID, value := range totalSocketValues {
y, err :=
lp.New(
lp.NewMessage(
metric.Name,
map[string]string{
"type": "socket",
@@ -691,7 +703,7 @@ func (m *LikwidCollector) calcEventsetMetrics(evset LikwidEventsetConfig, interv
}
y, err :=
lp.New(
lp.NewMessage(
metric.Name,
map[string]string{
"type": "node",
@@ -716,7 +728,7 @@ func (m *LikwidCollector) calcEventsetMetrics(evset LikwidEventsetConfig, interv
}
// Go over the global metrics, derive the value out of the event sets' metric values and send it
func (m *LikwidCollector) calcGlobalMetrics(groups []LikwidEventsetConfig, interval time.Duration, output chan lp.CCMetric) error {
func (m *LikwidCollector) calcGlobalMetrics(groups []LikwidEventsetConfig, interval time.Duration, output chan lp.CCMessage) error {
// Send all metrics with same time stamp
// This function does only computiation, counter measurement is done before
now := time.Now()
@@ -737,6 +749,7 @@ func (m *LikwidCollector) calcGlobalMetrics(groups []LikwidEventsetConfig, inter
params[mname] = mres
}
}
params["gotime"] = interval.Seconds()
// Evaluate the metric
value, err := agg.EvalFloat64Condition(metric.Calc, params)
if err != nil {
@@ -750,7 +763,7 @@ func (m *LikwidCollector) calcGlobalMetrics(groups []LikwidEventsetConfig, inter
if !math.IsNaN(value) {
if metric.Publish {
y, err :=
lp.New(
lp.NewMessage(
metric.Name,
map[string]string{
"type": metric.Type,
@@ -778,7 +791,7 @@ func (m *LikwidCollector) calcGlobalMetrics(groups []LikwidEventsetConfig, inter
return nil
}
func (m *LikwidCollector) ReadThread(interval time.Duration, output chan lp.CCMetric) {
func (m *LikwidCollector) ReadThread(interval time.Duration, output chan lp.CCMessage) {
var err error = nil
groups := make([]LikwidEventsetConfig, 0)
@@ -798,15 +811,17 @@ func (m *LikwidCollector) ReadThread(interval time.Duration, output chan lp.CCMe
if !skip {
// read measurements and derive event set metrics
m.calcEventsetMetrics(e, interval, output)
groups = append(groups, e)
}
groups = append(groups, e)
}
// calculate global metrics
m.calcGlobalMetrics(groups, interval, output)
if len(groups) > 0 {
// calculate global metrics
m.calcGlobalMetrics(groups, interval, output)
}
}
// main read function taking multiple measurement rounds, each 'interval' seconds long
func (m *LikwidCollector) Read(interval time.Duration, output chan lp.CCMetric) {
func (m *LikwidCollector) Read(interval time.Duration, output chan lp.CCMessage) {
if !m.init {
return
}

View File

@@ -8,18 +8,16 @@ import (
"strings"
"time"
lp "github.com/ClusterCockpit/cc-energy-manager/pkg/cc-message"
cclog "github.com/ClusterCockpit/cc-metric-collector/pkg/ccLogger"
lp "github.com/ClusterCockpit/cc-metric-collector/pkg/ccMetric"
)
//
// LoadavgCollector collects:
// * load average of last 1, 5 & 15 minutes
// * number of processes currently runnable
// * total number of processes in system
//
// See: https://www.kernel.org/doc/html/latest/filesystems/proc.html
//
const LOADAVGFILE = "/proc/loadavg"
type LoadavgCollector struct {
@@ -68,17 +66,15 @@ func (m *LoadavgCollector) Init(config json.RawMessage) error {
return nil
}
func (m *LoadavgCollector) Read(interval time.Duration, output chan lp.CCMetric) {
func (m *LoadavgCollector) Read(interval time.Duration, output chan lp.CCMessage) {
if !m.init {
return
}
buffer, err := os.ReadFile(LOADAVGFILE)
if err != nil {
if err != nil {
cclog.ComponentError(
m.name,
fmt.Sprintf("Read(): Failed to read file '%s': %v", LOADAVGFILE, err))
}
cclog.ComponentError(
m.name,
fmt.Sprintf("Read(): Failed to read file '%s': %v", LOADAVGFILE, err))
return
}
now := time.Now()
@@ -96,7 +92,7 @@ func (m *LoadavgCollector) Read(interval time.Duration, output chan lp.CCMetric)
if m.load_skips[i] {
continue
}
y, err := lp.New(name, m.tags, m.meta, map[string]interface{}{"value": x}, now)
y, err := lp.NewMessage(name, m.tags, m.meta, map[string]interface{}{"value": x}, now)
if err == nil {
output <- y
}
@@ -115,7 +111,7 @@ func (m *LoadavgCollector) Read(interval time.Duration, output chan lp.CCMetric)
if m.proc_skips[i] {
continue
}
y, err := lp.New(name, m.tags, m.meta, map[string]interface{}{"value": x}, now)
y, err := lp.NewMessage(name, m.tags, m.meta, map[string]interface{}{"value": x}, now)
if err == nil {
output <- y
}

View File

@@ -11,7 +11,7 @@ import (
"time"
cclog "github.com/ClusterCockpit/cc-metric-collector/pkg/ccLogger"
lp "github.com/ClusterCockpit/cc-metric-collector/pkg/ccMetric"
lp "github.com/ClusterCockpit/cc-energy-manager/pkg/cc-message"
)
const LUSTRE_SYSFS = `/sys/fs/lustre`
@@ -377,7 +377,7 @@ func (m *LustreCollector) Init(config json.RawMessage) error {
return nil
}
func (m *LustreCollector) Read(interval time.Duration, output chan lp.CCMetric) {
func (m *LustreCollector) Read(interval time.Duration, output chan lp.CCMessage) {
if !m.init {
return
}
@@ -388,7 +388,7 @@ func (m *LustreCollector) Read(interval time.Duration, output chan lp.CCMetric)
for _, def := range m.definitions {
var use_x int64
var err error
var y lp.CCMetric
var y lp.CCMessage
x, err := getMetricData(data, def.lineprefix, def.lineoffset)
if err == nil {
use_x = x
@@ -399,19 +399,19 @@ func (m *LustreCollector) Read(interval time.Duration, output chan lp.CCMetric)
switch def.calc {
case "none":
value = use_x
y, err = lp.New(def.name, m.tags, m.meta, map[string]interface{}{"value": value}, time.Now())
y, err = lp.NewMessage(def.name, m.tags, m.meta, map[string]interface{}{"value": value}, time.Now())
case "difference":
value = use_x - devData[def.name]
if value.(int64) < 0 {
value = 0
}
y, err = lp.New(def.name, m.tags, m.meta, map[string]interface{}{"value": value}, time.Now())
y, err = lp.NewMessage(def.name, m.tags, m.meta, map[string]interface{}{"value": value}, time.Now())
case "derivative":
value = float64(use_x-devData[def.name]) / tdiff.Seconds()
if value.(float64) < 0 {
value = 0
}
y, err = lp.New(def.name, m.tags, m.meta, map[string]interface{}{"value": value}, time.Now())
y, err = lp.NewMessage(def.name, m.tags, m.meta, map[string]interface{}{"value": value}, time.Now())
}
if err == nil {
y.AddTag("device", device)

View File

@@ -13,7 +13,7 @@ import (
"time"
cclog "github.com/ClusterCockpit/cc-metric-collector/pkg/ccLogger"
lp "github.com/ClusterCockpit/cc-metric-collector/pkg/ccMetric"
lp "github.com/ClusterCockpit/cc-energy-manager/pkg/cc-message"
)
const MEMSTATFILE = "/proc/meminfo"
@@ -159,7 +159,7 @@ func (m *MemstatCollector) Init(config json.RawMessage) error {
return err
}
func (m *MemstatCollector) Read(interval time.Duration, output chan lp.CCMetric) {
func (m *MemstatCollector) Read(interval time.Duration, output chan lp.CCMessage) {
if !m.init {
return
}
@@ -175,7 +175,7 @@ func (m *MemstatCollector) Read(interval time.Duration, output chan lp.CCMetric)
}
}
y, err := lp.New(name, tags, m.meta, map[string]interface{}{"value": value}, time.Now())
y, err := lp.NewMessage(name, tags, m.meta, map[string]interface{}{"value": value}, time.Now())
if err == nil {
if len(unit) > 0 {
y.AddMeta("unit", unit)
@@ -208,7 +208,7 @@ func (m *MemstatCollector) Read(interval time.Duration, output chan lp.CCMetric)
}
}
}
y, err := lp.New("mem_used", tags, m.meta, map[string]interface{}{"value": memUsed}, time.Now())
y, err := lp.NewMessage("mem_used", tags, m.meta, map[string]interface{}{"value": memUsed}, time.Now())
if err == nil {
if len(unit) > 0 {
y.AddMeta("unit", unit)

View File

@@ -5,7 +5,7 @@ import (
"fmt"
"time"
lp "github.com/ClusterCockpit/cc-metric-collector/pkg/ccMetric"
lp "github.com/ClusterCockpit/cc-energy-manager/pkg/cc-message"
)
type MetricCollector interface {
@@ -13,7 +13,7 @@ type MetricCollector interface {
Init(config json.RawMessage) error // Initialize metric collector
Initialized() bool // Is metric collector initialized?
Parallel() bool
Read(duration time.Duration, output chan lp.CCMetric) // Read metrics from metric collector
Read(duration time.Duration, output chan lp.CCMessage) // Read metrics from metric collector
Close() // Close / finish metric collector
}

View File

@@ -10,7 +10,7 @@ import (
"time"
cclog "github.com/ClusterCockpit/cc-metric-collector/pkg/ccLogger"
lp "github.com/ClusterCockpit/cc-metric-collector/pkg/ccMetric"
lp "github.com/ClusterCockpit/cc-energy-manager/pkg/cc-message"
)
const NETSTATFILE = "/proc/net/dev"
@@ -153,7 +153,7 @@ func (m *NetstatCollector) Init(config json.RawMessage) error {
return nil
}
func (m *NetstatCollector) Read(interval time.Duration, output chan lp.CCMetric) {
func (m *NetstatCollector) Read(interval time.Duration, output chan lp.CCMessage) {
if !m.init {
return
}
@@ -197,14 +197,14 @@ func (m *NetstatCollector) Read(interval time.Duration, output chan lp.CCMetric)
continue
}
if m.config.SendAbsoluteValues {
if y, err := lp.New(metric.name, metric.tags, metric.meta, map[string]interface{}{"value": v}, now); err == nil {
if y, err := lp.NewMessage(metric.name, metric.tags, metric.meta, map[string]interface{}{"value": v}, now); err == nil {
output <- y
}
}
if m.config.SendDerivedValues {
if metric.lastValue >= 0 {
rate := float64(v-metric.lastValue) / timeDiff
if y, err := lp.New(metric.name+"_bw", metric.tags, metric.meta_rates, map[string]interface{}{"value": rate}, now); err == nil {
if y, err := lp.NewMessage(metric.name+"_bw", metric.tags, metric.meta_rates, map[string]interface{}{"value": rate}, now); err == nil {
output <- y
}
}

View File

@@ -11,7 +11,7 @@ import (
"strings"
"time"
lp "github.com/ClusterCockpit/cc-metric-collector/pkg/ccMetric"
lp "github.com/ClusterCockpit/cc-energy-manager/pkg/cc-message"
)
// First part contains the code for the general NfsCollector.
@@ -118,7 +118,7 @@ func (m *nfsCollector) MainInit(config json.RawMessage) error {
return nil
}
func (m *nfsCollector) Read(interval time.Duration, output chan lp.CCMetric) {
func (m *nfsCollector) Read(interval time.Duration, output chan lp.CCMessage) {
if !m.init {
return
}
@@ -140,7 +140,7 @@ func (m *nfsCollector) Read(interval time.Duration, output chan lp.CCMetric) {
continue
}
value := data.current - data.last
y, err := lp.New(fmt.Sprintf("%s_%s", prefix, name), m.tags, m.meta, map[string]interface{}{"value": value}, timestamp)
y, err := lp.NewMessage(fmt.Sprintf("%s_%s", prefix, name), m.tags, m.meta, map[string]interface{}{"value": value}, timestamp)
if err == nil {
y.AddMeta("version", m.version)
output <- y

View File

@@ -10,7 +10,7 @@ import (
"time"
cclog "github.com/ClusterCockpit/cc-metric-collector/pkg/ccLogger"
lp "github.com/ClusterCockpit/cc-metric-collector/pkg/ccMetric"
lp "github.com/ClusterCockpit/cc-energy-manager/pkg/cc-message"
)
// These are the fields we read from the JSON configuration
@@ -114,7 +114,7 @@ func (m *NfsIOStatCollector) Init(config json.RawMessage) error {
return err
}
func (m *NfsIOStatCollector) Read(interval time.Duration, output chan lp.CCMetric) {
func (m *NfsIOStatCollector) Read(interval time.Duration, output chan lp.CCMessage) {
timestamp := time.Now()
// Get the current values for all mountpoints
@@ -126,7 +126,7 @@ func (m *NfsIOStatCollector) Read(interval time.Duration, output chan lp.CCMetri
// Calculate the difference of old and new values
for i := range values {
x := values[i] - old[i]
y, err := lp.New(fmt.Sprintf("nfsio_%s", i), m.tags, m.meta, map[string]interface{}{"value": x}, timestamp)
y, err := lp.NewMessage(fmt.Sprintf("nfsio_%s", i), m.tags, m.meta, map[string]interface{}{"value": x}, timestamp)
if err == nil {
if strings.HasPrefix(i, "page") {
y.AddMeta("unit", "4K_Pages")

View File

@@ -11,7 +11,7 @@ import (
"time"
cclog "github.com/ClusterCockpit/cc-metric-collector/pkg/ccLogger"
lp "github.com/ClusterCockpit/cc-metric-collector/pkg/ccMetric"
lp "github.com/ClusterCockpit/cc-energy-manager/pkg/cc-message"
)
// Non-Uniform Memory Access (NUMA) policy hit/miss statistics
@@ -97,7 +97,7 @@ func (m *NUMAStatsCollector) Init(config json.RawMessage) error {
return nil
}
func (m *NUMAStatsCollector) Read(interval time.Duration, output chan lp.CCMetric) {
func (m *NUMAStatsCollector) Read(interval time.Duration, output chan lp.CCMessage) {
if !m.init {
return
}
@@ -130,7 +130,7 @@ func (m *NUMAStatsCollector) Read(interval time.Duration, output chan lp.CCMetri
fmt.Sprintf("Read(): Failed to convert %s='%s' to int64: %v", key, split[1], err))
continue
}
y, err := lp.New(
y, err := lp.NewMessage(
"numastats_"+key,
t.tagSet,
m.meta,

View File

@@ -8,8 +8,8 @@ import (
"strings"
"time"
lp "github.com/ClusterCockpit/cc-energy-manager/pkg/cc-message"
cclog "github.com/ClusterCockpit/cc-metric-collector/pkg/ccLogger"
lp "github.com/ClusterCockpit/cc-metric-collector/pkg/ccMetric"
"github.com/NVIDIA/go-nvml/pkg/nvml"
)
@@ -206,7 +206,7 @@ func (m *NvidiaCollector) Init(config json.RawMessage) error {
return nil
}
func readMemoryInfo(device NvidiaCollectorDevice, output chan lp.CCMetric) error {
func readMemoryInfo(device NvidiaCollectorDevice, output chan lp.CCMessage) error {
if !device.excludeMetrics["nv_fb_mem_total"] || !device.excludeMetrics["nv_fb_mem_used"] || !device.excludeMetrics["nv_fb_mem_reserved"] {
var total uint64
var used uint64
@@ -222,7 +222,7 @@ func readMemoryInfo(device NvidiaCollectorDevice, output chan lp.CCMetric) error
if !device.excludeMetrics["nv_fb_mem_total"] {
t := float64(total) / (1024 * 1024)
y, err := lp.New("nv_fb_mem_total", device.tags, device.meta, map[string]interface{}{"value": t}, time.Now())
y, err := lp.NewMessage("nv_fb_mem_total", device.tags, device.meta, map[string]interface{}{"value": t}, time.Now())
if err == nil {
y.AddMeta("unit", "MByte")
output <- y
@@ -231,7 +231,7 @@ func readMemoryInfo(device NvidiaCollectorDevice, output chan lp.CCMetric) error
if !device.excludeMetrics["nv_fb_mem_used"] {
f := float64(used) / (1024 * 1024)
y, err := lp.New("nv_fb_mem_used", device.tags, device.meta, map[string]interface{}{"value": f}, time.Now())
y, err := lp.NewMessage("nv_fb_mem_used", device.tags, device.meta, map[string]interface{}{"value": f}, time.Now())
if err == nil {
y.AddMeta("unit", "MByte")
output <- y
@@ -240,7 +240,7 @@ func readMemoryInfo(device NvidiaCollectorDevice, output chan lp.CCMetric) error
if v2 && !device.excludeMetrics["nv_fb_mem_reserved"] {
r := float64(reserved) / (1024 * 1024)
y, err := lp.New("nv_fb_mem_reserved", device.tags, device.meta, map[string]interface{}{"value": r}, time.Now())
y, err := lp.NewMessage("nv_fb_mem_reserved", device.tags, device.meta, map[string]interface{}{"value": r}, time.Now())
if err == nil {
y.AddMeta("unit", "MByte")
output <- y
@@ -250,7 +250,7 @@ func readMemoryInfo(device NvidiaCollectorDevice, output chan lp.CCMetric) error
return nil
}
func readBarMemoryInfo(device NvidiaCollectorDevice, output chan lp.CCMetric) error {
func readBarMemoryInfo(device NvidiaCollectorDevice, output chan lp.CCMessage) error {
if !device.excludeMetrics["nv_bar1_mem_total"] || !device.excludeMetrics["nv_bar1_mem_used"] {
meminfo, ret := nvml.DeviceGetBAR1MemoryInfo(device.device)
if ret != nvml.SUCCESS {
@@ -259,7 +259,7 @@ func readBarMemoryInfo(device NvidiaCollectorDevice, output chan lp.CCMetric) er
}
if !device.excludeMetrics["nv_bar1_mem_total"] {
t := float64(meminfo.Bar1Total) / (1024 * 1024)
y, err := lp.New("nv_bar1_mem_total", device.tags, device.meta, map[string]interface{}{"value": t}, time.Now())
y, err := lp.NewMessage("nv_bar1_mem_total", device.tags, device.meta, map[string]interface{}{"value": t}, time.Now())
if err == nil {
y.AddMeta("unit", "MByte")
output <- y
@@ -267,7 +267,7 @@ func readBarMemoryInfo(device NvidiaCollectorDevice, output chan lp.CCMetric) er
}
if !device.excludeMetrics["nv_bar1_mem_used"] {
t := float64(meminfo.Bar1Used) / (1024 * 1024)
y, err := lp.New("nv_bar1_mem_used", device.tags, device.meta, map[string]interface{}{"value": t}, time.Now())
y, err := lp.NewMessage("nv_bar1_mem_used", device.tags, device.meta, map[string]interface{}{"value": t}, time.Now())
if err == nil {
y.AddMeta("unit", "MByte")
output <- y
@@ -277,7 +277,7 @@ func readBarMemoryInfo(device NvidiaCollectorDevice, output chan lp.CCMetric) er
return nil
}
func readUtilization(device NvidiaCollectorDevice, output chan lp.CCMetric) error {
func readUtilization(device NvidiaCollectorDevice, output chan lp.CCMessage) error {
isMig, ret := nvml.DeviceIsMigDeviceHandle(device.device)
if ret != nvml.SUCCESS {
err := errors.New(nvml.ErrorString(ret))
@@ -301,14 +301,14 @@ func readUtilization(device NvidiaCollectorDevice, output chan lp.CCMetric) erro
util, ret := nvml.DeviceGetUtilizationRates(device.device)
if ret == nvml.SUCCESS {
if !device.excludeMetrics["nv_util"] {
y, err := lp.New("nv_util", device.tags, device.meta, map[string]interface{}{"value": float64(util.Gpu)}, time.Now())
y, err := lp.NewMessage("nv_util", device.tags, device.meta, map[string]interface{}{"value": float64(util.Gpu)}, time.Now())
if err == nil {
y.AddMeta("unit", "%")
output <- y
}
}
if !device.excludeMetrics["nv_mem_util"] {
y, err := lp.New("nv_mem_util", device.tags, device.meta, map[string]interface{}{"value": float64(util.Memory)}, time.Now())
y, err := lp.NewMessage("nv_mem_util", device.tags, device.meta, map[string]interface{}{"value": float64(util.Memory)}, time.Now())
if err == nil {
y.AddMeta("unit", "%")
output <- y
@@ -319,7 +319,7 @@ func readUtilization(device NvidiaCollectorDevice, output chan lp.CCMetric) erro
return nil
}
func readTemp(device NvidiaCollectorDevice, output chan lp.CCMetric) error {
func readTemp(device NvidiaCollectorDevice, output chan lp.CCMessage) error {
if !device.excludeMetrics["nv_temp"] {
// Retrieves the current temperature readings for the device, in degrees C.
//
@@ -328,7 +328,7 @@ func readTemp(device NvidiaCollectorDevice, output chan lp.CCMetric) error {
// * NVML_TEMPERATURE_COUNT
temp, ret := nvml.DeviceGetTemperature(device.device, nvml.TEMPERATURE_GPU)
if ret == nvml.SUCCESS {
y, err := lp.New("nv_temp", device.tags, device.meta, map[string]interface{}{"value": float64(temp)}, time.Now())
y, err := lp.NewMessage("nv_temp", device.tags, device.meta, map[string]interface{}{"value": float64(temp)}, time.Now())
if err == nil {
y.AddMeta("unit", "degC")
output <- y
@@ -338,7 +338,7 @@ func readTemp(device NvidiaCollectorDevice, output chan lp.CCMetric) error {
return nil
}
func readFan(device NvidiaCollectorDevice, output chan lp.CCMetric) error {
func readFan(device NvidiaCollectorDevice, output chan lp.CCMessage) error {
if !device.excludeMetrics["nv_fan"] {
// Retrieves the intended operating speed of the device's fan.
//
@@ -351,7 +351,7 @@ func readFan(device NvidiaCollectorDevice, output chan lp.CCMetric) error {
// This value may exceed 100% in certain cases.
fan, ret := nvml.DeviceGetFanSpeed(device.device)
if ret == nvml.SUCCESS {
y, err := lp.New("nv_fan", device.tags, device.meta, map[string]interface{}{"value": float64(fan)}, time.Now())
y, err := lp.NewMessage("nv_fan", device.tags, device.meta, map[string]interface{}{"value": float64(fan)}, time.Now())
if err == nil {
y.AddMeta("unit", "%")
output <- y
@@ -361,14 +361,14 @@ func readFan(device NvidiaCollectorDevice, output chan lp.CCMetric) error {
return nil
}
// func readFans(device NvidiaCollectorDevice, output chan lp.CCMetric) error {
// func readFans(device NvidiaCollectorDevice, output chan lp.CCMessage) error {
// if !device.excludeMetrics["nv_fan"] {
// numFans, ret := nvml.DeviceGetNumFans(device.device)
// if ret == nvml.SUCCESS {
// for i := 0; i < numFans; i++ {
// fan, ret := nvml.DeviceGetFanSpeed_v2(device.device, i)
// if ret == nvml.SUCCESS {
// y, err := lp.New("nv_fan", device.tags, device.meta, map[string]interface{}{"value": float64(fan)}, time.Now())
// y, err := lp.NewMessage("nv_fan", device.tags, device.meta, map[string]interface{}{"value": float64(fan)}, time.Now())
// if err == nil {
// y.AddMeta("unit", "%")
// y.AddTag("stype", "fan")
@@ -382,7 +382,7 @@ func readFan(device NvidiaCollectorDevice, output chan lp.CCMetric) error {
// return nil
// }
func readEccMode(device NvidiaCollectorDevice, output chan lp.CCMetric) error {
func readEccMode(device NvidiaCollectorDevice, output chan lp.CCMessage) error {
if !device.excludeMetrics["nv_ecc_mode"] {
// Retrieves the current and pending ECC modes for the device.
//
@@ -393,21 +393,21 @@ func readEccMode(device NvidiaCollectorDevice, output chan lp.CCMetric) error {
// The "pending" ECC mode refers to the target mode following the next reboot.
_, ecc_pend, ret := nvml.DeviceGetEccMode(device.device)
if ret == nvml.SUCCESS {
var y lp.CCMetric
var y lp.CCMessage
var err error
switch ecc_pend {
case nvml.FEATURE_DISABLED:
y, err = lp.New("nv_ecc_mode", device.tags, device.meta, map[string]interface{}{"value": "OFF"}, time.Now())
y, err = lp.NewMessage("nv_ecc_mode", device.tags, device.meta, map[string]interface{}{"value": "OFF"}, time.Now())
case nvml.FEATURE_ENABLED:
y, err = lp.New("nv_ecc_mode", device.tags, device.meta, map[string]interface{}{"value": "ON"}, time.Now())
y, err = lp.NewMessage("nv_ecc_mode", device.tags, device.meta, map[string]interface{}{"value": "ON"}, time.Now())
default:
y, err = lp.New("nv_ecc_mode", device.tags, device.meta, map[string]interface{}{"value": "UNKNOWN"}, time.Now())
y, err = lp.NewMessage("nv_ecc_mode", device.tags, device.meta, map[string]interface{}{"value": "UNKNOWN"}, time.Now())
}
if err == nil {
output <- y
}
} else if ret == nvml.ERROR_NOT_SUPPORTED {
y, err := lp.New("nv_ecc_mode", device.tags, device.meta, map[string]interface{}{"value": "N/A"}, time.Now())
y, err := lp.NewMessage("nv_ecc_mode", device.tags, device.meta, map[string]interface{}{"value": "N/A"}, time.Now())
if err == nil {
output <- y
}
@@ -416,7 +416,7 @@ func readEccMode(device NvidiaCollectorDevice, output chan lp.CCMetric) error {
return nil
}
func readPerfState(device NvidiaCollectorDevice, output chan lp.CCMetric) error {
func readPerfState(device NvidiaCollectorDevice, output chan lp.CCMessage) error {
if !device.excludeMetrics["nv_perf_state"] {
// Retrieves the current performance state for the device.
//
@@ -427,7 +427,7 @@ func readPerfState(device NvidiaCollectorDevice, output chan lp.CCMetric) error
// 32: Unknown performance state.
pState, ret := nvml.DeviceGetPerformanceState(device.device)
if ret == nvml.SUCCESS {
y, err := lp.New("nv_perf_state", device.tags, device.meta, map[string]interface{}{"value": fmt.Sprintf("P%d", int(pState))}, time.Now())
y, err := lp.NewMessage("nv_perf_state", device.tags, device.meta, map[string]interface{}{"value": fmt.Sprintf("P%d", int(pState))}, time.Now())
if err == nil {
output <- y
}
@@ -436,7 +436,7 @@ func readPerfState(device NvidiaCollectorDevice, output chan lp.CCMetric) error
return nil
}
func readPowerUsage(device NvidiaCollectorDevice, output chan lp.CCMetric) error {
func readPowerUsage(device NvidiaCollectorDevice, output chan lp.CCMessage) error {
if !device.excludeMetrics["nv_power_usage"] {
// Retrieves power usage for this GPU in milliwatts and its associated circuitry (e.g. memory)
//
@@ -450,7 +450,7 @@ func readPowerUsage(device NvidiaCollectorDevice, output chan lp.CCMetric) error
if mode == nvml.FEATURE_ENABLED {
power, ret := nvml.DeviceGetPowerUsage(device.device)
if ret == nvml.SUCCESS {
y, err := lp.New("nv_power_usage", device.tags, device.meta, map[string]interface{}{"value": float64(power) / 1000}, time.Now())
y, err := lp.NewMessage("nv_power_usage", device.tags, device.meta, map[string]interface{}{"value": float64(power) / 1000}, time.Now())
if err == nil {
y.AddMeta("unit", "watts")
output <- y
@@ -461,7 +461,7 @@ func readPowerUsage(device NvidiaCollectorDevice, output chan lp.CCMetric) error
return nil
}
func readClocks(device NvidiaCollectorDevice, output chan lp.CCMetric) error {
func readClocks(device NvidiaCollectorDevice, output chan lp.CCMessage) error {
// Retrieves the current clock speeds for the device.
//
// Available clock information:
@@ -471,7 +471,7 @@ func readClocks(device NvidiaCollectorDevice, output chan lp.CCMetric) error {
if !device.excludeMetrics["nv_graphics_clock"] {
graphicsClock, ret := nvml.DeviceGetClockInfo(device.device, nvml.CLOCK_GRAPHICS)
if ret == nvml.SUCCESS {
y, err := lp.New("nv_graphics_clock", device.tags, device.meta, map[string]interface{}{"value": float64(graphicsClock)}, time.Now())
y, err := lp.NewMessage("nv_graphics_clock", device.tags, device.meta, map[string]interface{}{"value": float64(graphicsClock)}, time.Now())
if err == nil {
y.AddMeta("unit", "MHz")
output <- y
@@ -482,7 +482,7 @@ func readClocks(device NvidiaCollectorDevice, output chan lp.CCMetric) error {
if !device.excludeMetrics["nv_sm_clock"] {
smCock, ret := nvml.DeviceGetClockInfo(device.device, nvml.CLOCK_SM)
if ret == nvml.SUCCESS {
y, err := lp.New("nv_sm_clock", device.tags, device.meta, map[string]interface{}{"value": float64(smCock)}, time.Now())
y, err := lp.NewMessage("nv_sm_clock", device.tags, device.meta, map[string]interface{}{"value": float64(smCock)}, time.Now())
if err == nil {
y.AddMeta("unit", "MHz")
output <- y
@@ -493,7 +493,7 @@ func readClocks(device NvidiaCollectorDevice, output chan lp.CCMetric) error {
if !device.excludeMetrics["nv_mem_clock"] {
memClock, ret := nvml.DeviceGetClockInfo(device.device, nvml.CLOCK_MEM)
if ret == nvml.SUCCESS {
y, err := lp.New("nv_mem_clock", device.tags, device.meta, map[string]interface{}{"value": float64(memClock)}, time.Now())
y, err := lp.NewMessage("nv_mem_clock", device.tags, device.meta, map[string]interface{}{"value": float64(memClock)}, time.Now())
if err == nil {
y.AddMeta("unit", "MHz")
output <- y
@@ -503,7 +503,7 @@ func readClocks(device NvidiaCollectorDevice, output chan lp.CCMetric) error {
if !device.excludeMetrics["nv_video_clock"] {
memClock, ret := nvml.DeviceGetClockInfo(device.device, nvml.CLOCK_VIDEO)
if ret == nvml.SUCCESS {
y, err := lp.New("nv_video_clock", device.tags, device.meta, map[string]interface{}{"value": float64(memClock)}, time.Now())
y, err := lp.NewMessage("nv_video_clock", device.tags, device.meta, map[string]interface{}{"value": float64(memClock)}, time.Now())
if err == nil {
y.AddMeta("unit", "MHz")
output <- y
@@ -513,7 +513,7 @@ func readClocks(device NvidiaCollectorDevice, output chan lp.CCMetric) error {
return nil
}
func readMaxClocks(device NvidiaCollectorDevice, output chan lp.CCMetric) error {
func readMaxClocks(device NvidiaCollectorDevice, output chan lp.CCMessage) error {
// Retrieves the maximum clock speeds for the device.
//
// Available clock information:
@@ -528,7 +528,7 @@ func readMaxClocks(device NvidiaCollectorDevice, output chan lp.CCMetric) error
if !device.excludeMetrics["nv_max_graphics_clock"] {
max_gclk, ret := nvml.DeviceGetMaxClockInfo(device.device, nvml.CLOCK_GRAPHICS)
if ret == nvml.SUCCESS {
y, err := lp.New("nv_max_graphics_clock", device.tags, device.meta, map[string]interface{}{"value": float64(max_gclk)}, time.Now())
y, err := lp.NewMessage("nv_max_graphics_clock", device.tags, device.meta, map[string]interface{}{"value": float64(max_gclk)}, time.Now())
if err == nil {
y.AddMeta("unit", "MHz")
output <- y
@@ -539,7 +539,7 @@ func readMaxClocks(device NvidiaCollectorDevice, output chan lp.CCMetric) error
if !device.excludeMetrics["nv_max_sm_clock"] {
maxSmClock, ret := nvml.DeviceGetClockInfo(device.device, nvml.CLOCK_SM)
if ret == nvml.SUCCESS {
y, err := lp.New("nv_max_sm_clock", device.tags, device.meta, map[string]interface{}{"value": float64(maxSmClock)}, time.Now())
y, err := lp.NewMessage("nv_max_sm_clock", device.tags, device.meta, map[string]interface{}{"value": float64(maxSmClock)}, time.Now())
if err == nil {
y.AddMeta("unit", "MHz")
output <- y
@@ -550,7 +550,7 @@ func readMaxClocks(device NvidiaCollectorDevice, output chan lp.CCMetric) error
if !device.excludeMetrics["nv_max_mem_clock"] {
maxMemClock, ret := nvml.DeviceGetClockInfo(device.device, nvml.CLOCK_MEM)
if ret == nvml.SUCCESS {
y, err := lp.New("nv_max_mem_clock", device.tags, device.meta, map[string]interface{}{"value": float64(maxMemClock)}, time.Now())
y, err := lp.NewMessage("nv_max_mem_clock", device.tags, device.meta, map[string]interface{}{"value": float64(maxMemClock)}, time.Now())
if err == nil {
y.AddMeta("unit", "MHz")
output <- y
@@ -561,7 +561,7 @@ func readMaxClocks(device NvidiaCollectorDevice, output chan lp.CCMetric) error
if !device.excludeMetrics["nv_max_video_clock"] {
maxMemClock, ret := nvml.DeviceGetClockInfo(device.device, nvml.CLOCK_VIDEO)
if ret == nvml.SUCCESS {
y, err := lp.New("nv_max_video_clock", device.tags, device.meta, map[string]interface{}{"value": float64(maxMemClock)}, time.Now())
y, err := lp.NewMessage("nv_max_video_clock", device.tags, device.meta, map[string]interface{}{"value": float64(maxMemClock)}, time.Now())
if err == nil {
y.AddMeta("unit", "MHz")
output <- y
@@ -571,7 +571,7 @@ func readMaxClocks(device NvidiaCollectorDevice, output chan lp.CCMetric) error
return nil
}
func readEccErrors(device NvidiaCollectorDevice, output chan lp.CCMetric) error {
func readEccErrors(device NvidiaCollectorDevice, output chan lp.CCMessage) error {
if !device.excludeMetrics["nv_ecc_uncorrected_error"] {
// Retrieves the total ECC error counts for the device.
//
@@ -584,7 +584,7 @@ func readEccErrors(device NvidiaCollectorDevice, output chan lp.CCMetric) error
// i.e. the total set of errors across the entire device.
ecc_db, ret := nvml.DeviceGetTotalEccErrors(device.device, nvml.MEMORY_ERROR_TYPE_UNCORRECTED, nvml.AGGREGATE_ECC)
if ret == nvml.SUCCESS {
y, err := lp.New("nv_ecc_uncorrected_error", device.tags, device.meta, map[string]interface{}{"value": float64(ecc_db)}, time.Now())
y, err := lp.NewMessage("nv_ecc_uncorrected_error", device.tags, device.meta, map[string]interface{}{"value": float64(ecc_db)}, time.Now())
if err == nil {
output <- y
}
@@ -593,7 +593,7 @@ func readEccErrors(device NvidiaCollectorDevice, output chan lp.CCMetric) error
if !device.excludeMetrics["nv_ecc_corrected_error"] {
ecc_sb, ret := nvml.DeviceGetTotalEccErrors(device.device, nvml.MEMORY_ERROR_TYPE_CORRECTED, nvml.AGGREGATE_ECC)
if ret == nvml.SUCCESS {
y, err := lp.New("nv_ecc_corrected_error", device.tags, device.meta, map[string]interface{}{"value": float64(ecc_sb)}, time.Now())
y, err := lp.NewMessage("nv_ecc_corrected_error", device.tags, device.meta, map[string]interface{}{"value": float64(ecc_sb)}, time.Now())
if err == nil {
output <- y
}
@@ -602,7 +602,7 @@ func readEccErrors(device NvidiaCollectorDevice, output chan lp.CCMetric) error
return nil
}
func readPowerLimit(device NvidiaCollectorDevice, output chan lp.CCMetric) error {
func readPowerLimit(device NvidiaCollectorDevice, output chan lp.CCMessage) error {
if !device.excludeMetrics["nv_power_max_limit"] {
// Retrieves the power management limit associated with this device.
//
@@ -612,7 +612,7 @@ func readPowerLimit(device NvidiaCollectorDevice, output chan lp.CCMetric) error
// If the card's total power draw reaches this limit the power management algorithm kicks in.
pwr_limit, ret := nvml.DeviceGetPowerManagementLimit(device.device)
if ret == nvml.SUCCESS {
y, err := lp.New("nv_power_max_limit", device.tags, device.meta, map[string]interface{}{"value": float64(pwr_limit) / 1000}, time.Now())
y, err := lp.NewMessage("nv_power_max_limit", device.tags, device.meta, map[string]interface{}{"value": float64(pwr_limit) / 1000}, time.Now())
if err == nil {
y.AddMeta("unit", "watts")
output <- y
@@ -622,7 +622,7 @@ func readPowerLimit(device NvidiaCollectorDevice, output chan lp.CCMetric) error
return nil
}
func readEncUtilization(device NvidiaCollectorDevice, output chan lp.CCMetric) error {
func readEncUtilization(device NvidiaCollectorDevice, output chan lp.CCMessage) error {
isMig, ret := nvml.DeviceIsMigDeviceHandle(device.device)
if ret != nvml.SUCCESS {
err := errors.New(nvml.ErrorString(ret))
@@ -639,7 +639,7 @@ func readEncUtilization(device NvidiaCollectorDevice, output chan lp.CCMetric) e
// Note: On MIG-enabled GPUs, querying encoder utilization is not currently supported.
enc_util, _, ret := nvml.DeviceGetEncoderUtilization(device.device)
if ret == nvml.SUCCESS {
y, err := lp.New("nv_encoder_util", device.tags, device.meta, map[string]interface{}{"value": float64(enc_util)}, time.Now())
y, err := lp.NewMessage("nv_encoder_util", device.tags, device.meta, map[string]interface{}{"value": float64(enc_util)}, time.Now())
if err == nil {
y.AddMeta("unit", "%")
output <- y
@@ -649,7 +649,7 @@ func readEncUtilization(device NvidiaCollectorDevice, output chan lp.CCMetric) e
return nil
}
func readDecUtilization(device NvidiaCollectorDevice, output chan lp.CCMetric) error {
func readDecUtilization(device NvidiaCollectorDevice, output chan lp.CCMessage) error {
isMig, ret := nvml.DeviceIsMigDeviceHandle(device.device)
if ret != nvml.SUCCESS {
err := errors.New(nvml.ErrorString(ret))
@@ -666,7 +666,7 @@ func readDecUtilization(device NvidiaCollectorDevice, output chan lp.CCMetric) e
// Note: On MIG-enabled GPUs, querying encoder utilization is not currently supported.
dec_util, _, ret := nvml.DeviceGetDecoderUtilization(device.device)
if ret == nvml.SUCCESS {
y, err := lp.New("nv_decoder_util", device.tags, device.meta, map[string]interface{}{"value": float64(dec_util)}, time.Now())
y, err := lp.NewMessage("nv_decoder_util", device.tags, device.meta, map[string]interface{}{"value": float64(dec_util)}, time.Now())
if err == nil {
y.AddMeta("unit", "%")
output <- y
@@ -676,7 +676,7 @@ func readDecUtilization(device NvidiaCollectorDevice, output chan lp.CCMetric) e
return nil
}
func readRemappedRows(device NvidiaCollectorDevice, output chan lp.CCMetric) error {
func readRemappedRows(device NvidiaCollectorDevice, output chan lp.CCMessage) error {
if !device.excludeMetrics["nv_remapped_rows_corrected"] ||
!device.excludeMetrics["nv_remapped_rows_uncorrected"] ||
!device.excludeMetrics["nv_remapped_rows_pending"] ||
@@ -693,13 +693,13 @@ func readRemappedRows(device NvidiaCollectorDevice, output chan lp.CCMetric) err
corrected, uncorrected, pending, failure, ret := nvml.DeviceGetRemappedRows(device.device)
if ret == nvml.SUCCESS {
if !device.excludeMetrics["nv_remapped_rows_corrected"] {
y, err := lp.New("nv_remapped_rows_corrected", device.tags, device.meta, map[string]interface{}{"value": float64(corrected)}, time.Now())
y, err := lp.NewMessage("nv_remapped_rows_corrected", device.tags, device.meta, map[string]interface{}{"value": float64(corrected)}, time.Now())
if err == nil {
output <- y
}
}
if !device.excludeMetrics["nv_remapped_rows_uncorrected"] {
y, err := lp.New("nv_remapped_rows_corrected", device.tags, device.meta, map[string]interface{}{"value": float64(uncorrected)}, time.Now())
y, err := lp.NewMessage("nv_remapped_rows_corrected", device.tags, device.meta, map[string]interface{}{"value": float64(uncorrected)}, time.Now())
if err == nil {
output <- y
}
@@ -709,7 +709,7 @@ func readRemappedRows(device NvidiaCollectorDevice, output chan lp.CCMetric) err
if pending {
p = 1
}
y, err := lp.New("nv_remapped_rows_pending", device.tags, device.meta, map[string]interface{}{"value": p}, time.Now())
y, err := lp.NewMessage("nv_remapped_rows_pending", device.tags, device.meta, map[string]interface{}{"value": p}, time.Now())
if err == nil {
output <- y
}
@@ -719,7 +719,7 @@ func readRemappedRows(device NvidiaCollectorDevice, output chan lp.CCMetric) err
if failure {
f = 1
}
y, err := lp.New("nv_remapped_rows_failure", device.tags, device.meta, map[string]interface{}{"value": f}, time.Now())
y, err := lp.NewMessage("nv_remapped_rows_failure", device.tags, device.meta, map[string]interface{}{"value": f}, time.Now())
if err == nil {
output <- y
}
@@ -729,7 +729,7 @@ func readRemappedRows(device NvidiaCollectorDevice, output chan lp.CCMetric) err
return nil
}
func readProcessCounts(device NvidiaCollectorDevice, output chan lp.CCMetric) error {
func readProcessCounts(device NvidiaCollectorDevice, output chan lp.CCMessage) error {
if !device.excludeMetrics["nv_compute_processes"] {
// Get information about processes with a compute context on a device
//
@@ -753,7 +753,7 @@ func readProcessCounts(device NvidiaCollectorDevice, output chan lp.CCMetric) er
// Querying per-instance information using MIG device handles is not supported if the device is in vGPU Host virtualization mode.
procList, ret := nvml.DeviceGetComputeRunningProcesses(device.device)
if ret == nvml.SUCCESS {
y, err := lp.New("nv_compute_processes", device.tags, device.meta, map[string]interface{}{"value": len(procList)}, time.Now())
y, err := lp.NewMessage("nv_compute_processes", device.tags, device.meta, map[string]interface{}{"value": len(procList)}, time.Now())
if err == nil {
output <- y
}
@@ -782,7 +782,7 @@ func readProcessCounts(device NvidiaCollectorDevice, output chan lp.CCMetric) er
// Querying per-instance information using MIG device handles is not supported if the device is in vGPU Host virtualization mode.
procList, ret := nvml.DeviceGetGraphicsRunningProcesses(device.device)
if ret == nvml.SUCCESS {
y, err := lp.New("nv_graphics_processes", device.tags, device.meta, map[string]interface{}{"value": len(procList)}, time.Now())
y, err := lp.NewMessage("nv_graphics_processes", device.tags, device.meta, map[string]interface{}{"value": len(procList)}, time.Now())
if err == nil {
output <- y
}
@@ -812,7 +812,7 @@ func readProcessCounts(device NvidiaCollectorDevice, output chan lp.CCMetric) er
// // Querying per-instance information using MIG device handles is not supported if the device is in vGPU Host virtualization mode.
// procList, ret := nvml.DeviceGetMPSComputeRunningProcesses(device.device)
// if ret == nvml.SUCCESS {
// y, err := lp.New("nv_mps_compute_processes", device.tags, device.meta, map[string]interface{}{"value": len(procList)}, time.Now())
// y, err := lp.NewMessage("nv_mps_compute_processes", device.tags, device.meta, map[string]interface{}{"value": len(procList)}, time.Now())
// if err == nil {
// output <- y
// }
@@ -821,7 +821,7 @@ func readProcessCounts(device NvidiaCollectorDevice, output chan lp.CCMetric) er
return nil
}
func readViolationStats(device NvidiaCollectorDevice, output chan lp.CCMetric) error {
func readViolationStats(device NvidiaCollectorDevice, output chan lp.CCMessage) error {
var violTime nvml.ViolationTime
var ret nvml.Return
@@ -840,7 +840,7 @@ func readViolationStats(device NvidiaCollectorDevice, output chan lp.CCMetric) e
violTime, ret = nvml.DeviceGetViolationStatus(device.device, nvml.PERF_POLICY_POWER)
if ret == nvml.SUCCESS {
t := float64(violTime.ViolationTime) * 1e-9
y, err := lp.New("nv_violation_power", device.tags, device.meta, map[string]interface{}{"value": t}, time.Now())
y, err := lp.NewMessage("nv_violation_power", device.tags, device.meta, map[string]interface{}{"value": t}, time.Now())
if err == nil {
y.AddMeta("unit", "sec")
output <- y
@@ -852,7 +852,7 @@ func readViolationStats(device NvidiaCollectorDevice, output chan lp.CCMetric) e
violTime, ret = nvml.DeviceGetViolationStatus(device.device, nvml.PERF_POLICY_THERMAL)
if ret == nvml.SUCCESS {
t := float64(violTime.ViolationTime) * 1e-9
y, err := lp.New("nv_violation_thermal", device.tags, device.meta, map[string]interface{}{"value": t}, time.Now())
y, err := lp.NewMessage("nv_violation_thermal", device.tags, device.meta, map[string]interface{}{"value": t}, time.Now())
if err == nil {
y.AddMeta("unit", "sec")
output <- y
@@ -864,7 +864,7 @@ func readViolationStats(device NvidiaCollectorDevice, output chan lp.CCMetric) e
violTime, ret = nvml.DeviceGetViolationStatus(device.device, nvml.PERF_POLICY_SYNC_BOOST)
if ret == nvml.SUCCESS {
t := float64(violTime.ViolationTime) * 1e-9
y, err := lp.New("nv_violation_sync_boost", device.tags, device.meta, map[string]interface{}{"value": t}, time.Now())
y, err := lp.NewMessage("nv_violation_sync_boost", device.tags, device.meta, map[string]interface{}{"value": t}, time.Now())
if err == nil {
y.AddMeta("unit", "sec")
output <- y
@@ -876,7 +876,7 @@ func readViolationStats(device NvidiaCollectorDevice, output chan lp.CCMetric) e
violTime, ret = nvml.DeviceGetViolationStatus(device.device, nvml.PERF_POLICY_BOARD_LIMIT)
if ret == nvml.SUCCESS {
t := float64(violTime.ViolationTime) * 1e-9
y, err := lp.New("nv_violation_board_limit", device.tags, device.meta, map[string]interface{}{"value": t}, time.Now())
y, err := lp.NewMessage("nv_violation_board_limit", device.tags, device.meta, map[string]interface{}{"value": t}, time.Now())
if err == nil {
y.AddMeta("unit", "sec")
output <- y
@@ -888,7 +888,7 @@ func readViolationStats(device NvidiaCollectorDevice, output chan lp.CCMetric) e
violTime, ret = nvml.DeviceGetViolationStatus(device.device, nvml.PERF_POLICY_LOW_UTILIZATION)
if ret == nvml.SUCCESS {
t := float64(violTime.ViolationTime) * 1e-9
y, err := lp.New("nv_violation_low_util", device.tags, device.meta, map[string]interface{}{"value": t}, time.Now())
y, err := lp.NewMessage("nv_violation_low_util", device.tags, device.meta, map[string]interface{}{"value": t}, time.Now())
if err == nil {
y.AddMeta("unit", "sec")
output <- y
@@ -900,7 +900,7 @@ func readViolationStats(device NvidiaCollectorDevice, output chan lp.CCMetric) e
violTime, ret = nvml.DeviceGetViolationStatus(device.device, nvml.PERF_POLICY_RELIABILITY)
if ret == nvml.SUCCESS {
t := float64(violTime.ViolationTime) * 1e-9
y, err := lp.New("nv_violation_reliability", device.tags, device.meta, map[string]interface{}{"value": t}, time.Now())
y, err := lp.NewMessage("nv_violation_reliability", device.tags, device.meta, map[string]interface{}{"value": t}, time.Now())
if err == nil {
y.AddMeta("unit", "sec")
output <- y
@@ -912,7 +912,7 @@ func readViolationStats(device NvidiaCollectorDevice, output chan lp.CCMetric) e
violTime, ret = nvml.DeviceGetViolationStatus(device.device, nvml.PERF_POLICY_TOTAL_APP_CLOCKS)
if ret == nvml.SUCCESS {
t := float64(violTime.ViolationTime) * 1e-9
y, err := lp.New("nv_violation_below_app_clock", device.tags, device.meta, map[string]interface{}{"value": t}, time.Now())
y, err := lp.NewMessage("nv_violation_below_app_clock", device.tags, device.meta, map[string]interface{}{"value": t}, time.Now())
if err == nil {
y.AddMeta("unit", "sec")
output <- y
@@ -924,7 +924,7 @@ func readViolationStats(device NvidiaCollectorDevice, output chan lp.CCMetric) e
violTime, ret = nvml.DeviceGetViolationStatus(device.device, nvml.PERF_POLICY_TOTAL_BASE_CLOCKS)
if ret == nvml.SUCCESS {
t := float64(violTime.ViolationTime) * 1e-9
y, err := lp.New("nv_violation_below_base_clock", device.tags, device.meta, map[string]interface{}{"value": t}, time.Now())
y, err := lp.NewMessage("nv_violation_below_base_clock", device.tags, device.meta, map[string]interface{}{"value": t}, time.Now())
if err == nil {
y.AddMeta("unit", "sec")
output <- y
@@ -935,12 +935,18 @@ func readViolationStats(device NvidiaCollectorDevice, output chan lp.CCMetric) e
return nil
}
func readNVLinkStats(device NvidiaCollectorDevice, output chan lp.CCMetric) error {
func readNVLinkStats(device NvidiaCollectorDevice, output chan lp.CCMessage) error {
// Retrieves the specified error counter value
// Please refer to \a nvmlNvLinkErrorCounter_t for error counters that are available
//
// For Pascal &tm; or newer fully supported devices.
var aggregate_crc_errors uint64 = 0
var aggregate_ecc_errors uint64 = 0
var aggregate_replay_errors uint64 = 0
var aggregate_recovery_errors uint64 = 0
var aggregate_crc_flit_errors uint64 = 0
for i := 0; i < nvml.NVLINK_MAX_LINKS; i++ {
state, ret := nvml.DeviceGetNvLinkState(device.device, i)
if ret == nvml.SUCCESS {
@@ -948,8 +954,9 @@ func readNVLinkStats(device NvidiaCollectorDevice, output chan lp.CCMetric) erro
if !device.excludeMetrics["nv_nvlink_crc_errors"] {
// Data link receive data CRC error counter
count, ret := nvml.DeviceGetNvLinkErrorCounter(device.device, i, nvml.NVLINK_ERROR_DL_CRC_DATA)
aggregate_crc_errors = aggregate_crc_errors + count
if ret == nvml.SUCCESS {
y, err := lp.New("nv_nvlink_crc_errors", device.tags, device.meta, map[string]interface{}{"value": count}, time.Now())
y, err := lp.NewMessage("nv_nvlink_crc_errors", device.tags, device.meta, map[string]interface{}{"value": count}, time.Now())
if err == nil {
y.AddTag("stype", "nvlink")
y.AddTag("stype-id", fmt.Sprintf("%d", i))
@@ -960,8 +967,9 @@ func readNVLinkStats(device NvidiaCollectorDevice, output chan lp.CCMetric) erro
if !device.excludeMetrics["nv_nvlink_ecc_errors"] {
// Data link receive data ECC error counter
count, ret := nvml.DeviceGetNvLinkErrorCounter(device.device, i, nvml.NVLINK_ERROR_DL_ECC_DATA)
aggregate_ecc_errors = aggregate_ecc_errors + count
if ret == nvml.SUCCESS {
y, err := lp.New("nv_nvlink_ecc_errors", device.tags, device.meta, map[string]interface{}{"value": count}, time.Now())
y, err := lp.NewMessage("nv_nvlink_ecc_errors", device.tags, device.meta, map[string]interface{}{"value": count}, time.Now())
if err == nil {
y.AddTag("stype", "nvlink")
y.AddTag("stype-id", fmt.Sprintf("%d", i))
@@ -972,8 +980,9 @@ func readNVLinkStats(device NvidiaCollectorDevice, output chan lp.CCMetric) erro
if !device.excludeMetrics["nv_nvlink_replay_errors"] {
// Data link transmit replay error counter
count, ret := nvml.DeviceGetNvLinkErrorCounter(device.device, i, nvml.NVLINK_ERROR_DL_REPLAY)
aggregate_replay_errors = aggregate_replay_errors + count
if ret == nvml.SUCCESS {
y, err := lp.New("nv_nvlink_replay_errors", device.tags, device.meta, map[string]interface{}{"value": count}, time.Now())
y, err := lp.NewMessage("nv_nvlink_replay_errors", device.tags, device.meta, map[string]interface{}{"value": count}, time.Now())
if err == nil {
y.AddTag("stype", "nvlink")
y.AddTag("stype-id", fmt.Sprintf("%d", i))
@@ -984,8 +993,9 @@ func readNVLinkStats(device NvidiaCollectorDevice, output chan lp.CCMetric) erro
if !device.excludeMetrics["nv_nvlink_recovery_errors"] {
// Data link transmit recovery error counter
count, ret := nvml.DeviceGetNvLinkErrorCounter(device.device, i, nvml.NVLINK_ERROR_DL_RECOVERY)
aggregate_recovery_errors = aggregate_recovery_errors + count
if ret == nvml.SUCCESS {
y, err := lp.New("nv_nvlink_recovery_errors", device.tags, device.meta, map[string]interface{}{"value": count}, time.Now())
y, err := lp.NewMessage("nv_nvlink_recovery_errors", device.tags, device.meta, map[string]interface{}{"value": count}, time.Now())
if err == nil {
y.AddTag("stype", "nvlink")
y.AddTag("stype-id", fmt.Sprintf("%d", i))
@@ -996,8 +1006,9 @@ func readNVLinkStats(device NvidiaCollectorDevice, output chan lp.CCMetric) erro
if !device.excludeMetrics["nv_nvlink_crc_flit_errors"] {
// Data link receive flow control digit CRC error counter
count, ret := nvml.DeviceGetNvLinkErrorCounter(device.device, i, nvml.NVLINK_ERROR_DL_CRC_FLIT)
aggregate_crc_flit_errors = aggregate_crc_flit_errors + count
if ret == nvml.SUCCESS {
y, err := lp.New("nv_nvlink_crc_flit_errors", device.tags, device.meta, map[string]interface{}{"value": count}, time.Now())
y, err := lp.NewMessage("nv_nvlink_crc_flit_errors", device.tags, device.meta, map[string]interface{}{"value": count}, time.Now())
if err == nil {
y.AddTag("stype", "nvlink")
y.AddTag("stype-id", fmt.Sprintf("%d", i))
@@ -1008,16 +1019,58 @@ func readNVLinkStats(device NvidiaCollectorDevice, output chan lp.CCMetric) erro
}
}
}
// Export aggegated values
if !device.excludeMetrics["nv_nvlink_crc_errors"] {
// Data link receive data CRC error counter
y, err := lp.NewMessage("nv_nvlink_crc_errors_sum", device.tags, device.meta, map[string]interface{}{"value": aggregate_crc_errors}, time.Now())
if err == nil {
y.AddTag("stype", "nvlink")
output <- y
}
}
if !device.excludeMetrics["nv_nvlink_ecc_errors"] {
// Data link receive data ECC error counter
y, err := lp.NewMessage("nv_nvlink_ecc_errors_sum", device.tags, device.meta, map[string]interface{}{"value": aggregate_ecc_errors}, time.Now())
if err == nil {
y.AddTag("stype", "nvlink")
output <- y
}
}
if !device.excludeMetrics["nv_nvlink_replay_errors"] {
// Data link transmit replay error counter
y, err := lp.NewMessage("nv_nvlink_replay_errors_sum", device.tags, device.meta, map[string]interface{}{"value": aggregate_replay_errors}, time.Now())
if err == nil {
y.AddTag("stype", "nvlink")
output <- y
}
}
if !device.excludeMetrics["nv_nvlink_recovery_errors"] {
// Data link transmit recovery error counter
y, err := lp.NewMessage("nv_nvlink_recovery_errors_sum", device.tags, device.meta, map[string]interface{}{"value": aggregate_recovery_errors}, time.Now())
if err == nil {
y.AddTag("stype", "nvlink")
output <- y
}
}
if !device.excludeMetrics["nv_nvlink_crc_flit_errors"] {
// Data link receive flow control digit CRC error counter
y, err := lp.NewMessage("nv_nvlink_crc_flit_errors_sum", device.tags, device.meta, map[string]interface{}{"value": aggregate_crc_flit_errors}, time.Now())
if err == nil {
y.AddTag("stype", "nvlink")
output <- y
}
}
return nil
}
func (m *NvidiaCollector) Read(interval time.Duration, output chan lp.CCMetric) {
func (m *NvidiaCollector) Read(interval time.Duration, output chan lp.CCMessage) {
var err error
if !m.init {
return
}
readAll := func(device NvidiaCollectorDevice, output chan lp.CCMetric) {
readAll := func(device NvidiaCollectorDevice, output chan lp.CCMessage) {
name, ret := nvml.DeviceGetName(device.device)
if ret != nvml.SUCCESS {
name = "NoName"

View File

@@ -10,7 +10,7 @@ import (
"time"
cclog "github.com/ClusterCockpit/cc-metric-collector/pkg/ccLogger"
lp "github.com/ClusterCockpit/cc-metric-collector/pkg/ccMetric"
lp "github.com/ClusterCockpit/cc-energy-manager/pkg/cc-message"
)
// running average power limit (RAPL) monitoring attributes for a zone
@@ -214,7 +214,7 @@ func (m *RAPLCollector) Init(config json.RawMessage) error {
// Read reads running average power limit (RAPL) monitoring attributes for all initialized zones
// See: https://www.kernel.org/doc/html/latest/power/powercap/powercap.html#monitoring-attributes
func (m *RAPLCollector) Read(interval time.Duration, output chan lp.CCMetric) {
func (m *RAPLCollector) Read(interval time.Duration, output chan lp.CCMessage) {
for i := range m.RAPLZoneInfo {
p := &m.RAPLZoneInfo[i]
@@ -237,7 +237,7 @@ func (m *RAPLCollector) Read(interval time.Duration, output chan lp.CCMetric) {
timeDiff := energyTimestamp.Sub(p.energyTimestamp)
averagePower := float64(energyDiff) / float64(timeDiff.Microseconds())
y, err := lp.New(
y, err := lp.NewMessage(
"rapl_average_power",
p.tags,
m.meta,

View File

@@ -7,7 +7,7 @@ import (
"time"
cclog "github.com/ClusterCockpit/cc-metric-collector/pkg/ccLogger"
lp "github.com/ClusterCockpit/cc-metric-collector/pkg/ccMetric"
lp "github.com/ClusterCockpit/cc-energy-manager/pkg/cc-message"
"github.com/ClusterCockpit/go-rocm-smi/pkg/rocm_smi"
)
@@ -162,7 +162,7 @@ func (m *RocmSmiCollector) Init(config json.RawMessage) error {
// Read collects all metrics belonging to the sample collector
// and sends them through the output channel to the collector manager
func (m *RocmSmiCollector) Read(interval time.Duration, output chan lp.CCMetric) {
func (m *RocmSmiCollector) Read(interval time.Duration, output chan lp.CCMessage) {
// Create a sample metric
timestamp := time.Now()
@@ -175,119 +175,119 @@ func (m *RocmSmiCollector) Read(interval time.Duration, output chan lp.CCMetric)
if !dev.excludeMetrics["rocm_gfx_util"] {
value := metrics.Average_gfx_activity
y, err := lp.New("rocm_gfx_util", dev.tags, dev.meta, map[string]interface{}{"value": value}, timestamp)
y, err := lp.NewMessage("rocm_gfx_util", dev.tags, dev.meta, map[string]interface{}{"value": value}, timestamp)
if err == nil {
output <- y
}
}
if !dev.excludeMetrics["rocm_umc_util"] {
value := metrics.Average_umc_activity
y, err := lp.New("rocm_umc_util", dev.tags, dev.meta, map[string]interface{}{"value": value}, timestamp)
y, err := lp.NewMessage("rocm_umc_util", dev.tags, dev.meta, map[string]interface{}{"value": value}, timestamp)
if err == nil {
output <- y
}
}
if !dev.excludeMetrics["rocm_mm_util"] {
value := metrics.Average_mm_activity
y, err := lp.New("rocm_mm_util", dev.tags, dev.meta, map[string]interface{}{"value": value}, timestamp)
y, err := lp.NewMessage("rocm_mm_util", dev.tags, dev.meta, map[string]interface{}{"value": value}, timestamp)
if err == nil {
output <- y
}
}
if !dev.excludeMetrics["rocm_avg_power"] {
value := metrics.Average_socket_power
y, err := lp.New("rocm_avg_power", dev.tags, dev.meta, map[string]interface{}{"value": value}, timestamp)
y, err := lp.NewMessage("rocm_avg_power", dev.tags, dev.meta, map[string]interface{}{"value": value}, timestamp)
if err == nil {
output <- y
}
}
if !dev.excludeMetrics["rocm_temp_mem"] {
value := metrics.Temperature_mem
y, err := lp.New("rocm_temp_mem", dev.tags, dev.meta, map[string]interface{}{"value": value}, timestamp)
y, err := lp.NewMessage("rocm_temp_mem", dev.tags, dev.meta, map[string]interface{}{"value": value}, timestamp)
if err == nil {
output <- y
}
}
if !dev.excludeMetrics["rocm_temp_hotspot"] {
value := metrics.Temperature_hotspot
y, err := lp.New("rocm_temp_hotspot", dev.tags, dev.meta, map[string]interface{}{"value": value}, timestamp)
y, err := lp.NewMessage("rocm_temp_hotspot", dev.tags, dev.meta, map[string]interface{}{"value": value}, timestamp)
if err == nil {
output <- y
}
}
if !dev.excludeMetrics["rocm_temp_edge"] {
value := metrics.Temperature_edge
y, err := lp.New("rocm_temp_edge", dev.tags, dev.meta, map[string]interface{}{"value": value}, timestamp)
y, err := lp.NewMessage("rocm_temp_edge", dev.tags, dev.meta, map[string]interface{}{"value": value}, timestamp)
if err == nil {
output <- y
}
}
if !dev.excludeMetrics["rocm_temp_vrgfx"] {
value := metrics.Temperature_vrgfx
y, err := lp.New("rocm_temp_vrgfx", dev.tags, dev.meta, map[string]interface{}{"value": value}, timestamp)
y, err := lp.NewMessage("rocm_temp_vrgfx", dev.tags, dev.meta, map[string]interface{}{"value": value}, timestamp)
if err == nil {
output <- y
}
}
if !dev.excludeMetrics["rocm_temp_vrsoc"] {
value := metrics.Temperature_vrsoc
y, err := lp.New("rocm_temp_vrsoc", dev.tags, dev.meta, map[string]interface{}{"value": value}, timestamp)
y, err := lp.NewMessage("rocm_temp_vrsoc", dev.tags, dev.meta, map[string]interface{}{"value": value}, timestamp)
if err == nil {
output <- y
}
}
if !dev.excludeMetrics["rocm_temp_vrmem"] {
value := metrics.Temperature_vrmem
y, err := lp.New("rocm_temp_vrmem", dev.tags, dev.meta, map[string]interface{}{"value": value}, timestamp)
y, err := lp.NewMessage("rocm_temp_vrmem", dev.tags, dev.meta, map[string]interface{}{"value": value}, timestamp)
if err == nil {
output <- y
}
}
if !dev.excludeMetrics["rocm_gfx_clock"] {
value := metrics.Average_gfxclk_frequency
y, err := lp.New("rocm_gfx_clock", dev.tags, dev.meta, map[string]interface{}{"value": value}, timestamp)
y, err := lp.NewMessage("rocm_gfx_clock", dev.tags, dev.meta, map[string]interface{}{"value": value}, timestamp)
if err == nil {
output <- y
}
}
if !dev.excludeMetrics["rocm_soc_clock"] {
value := metrics.Average_socclk_frequency
y, err := lp.New("rocm_soc_clock", dev.tags, dev.meta, map[string]interface{}{"value": value}, timestamp)
y, err := lp.NewMessage("rocm_soc_clock", dev.tags, dev.meta, map[string]interface{}{"value": value}, timestamp)
if err == nil {
output <- y
}
}
if !dev.excludeMetrics["rocm_u_clock"] {
value := metrics.Average_uclk_frequency
y, err := lp.New("rocm_u_clock", dev.tags, dev.meta, map[string]interface{}{"value": value}, timestamp)
y, err := lp.NewMessage("rocm_u_clock", dev.tags, dev.meta, map[string]interface{}{"value": value}, timestamp)
if err == nil {
output <- y
}
}
if !dev.excludeMetrics["rocm_v0_clock"] {
value := metrics.Average_vclk0_frequency
y, err := lp.New("rocm_v0_clock", dev.tags, dev.meta, map[string]interface{}{"value": value}, timestamp)
y, err := lp.NewMessage("rocm_v0_clock", dev.tags, dev.meta, map[string]interface{}{"value": value}, timestamp)
if err == nil {
output <- y
}
}
if !dev.excludeMetrics["rocm_v1_clock"] {
value := metrics.Average_vclk1_frequency
y, err := lp.New("rocm_v1_clock", dev.tags, dev.meta, map[string]interface{}{"value": value}, timestamp)
y, err := lp.NewMessage("rocm_v1_clock", dev.tags, dev.meta, map[string]interface{}{"value": value}, timestamp)
if err == nil {
output <- y
}
}
if !dev.excludeMetrics["rocm_d0_clock"] {
value := metrics.Average_dclk0_frequency
y, err := lp.New("rocm_d0_clock", dev.tags, dev.meta, map[string]interface{}{"value": value}, timestamp)
y, err := lp.NewMessage("rocm_d0_clock", dev.tags, dev.meta, map[string]interface{}{"value": value}, timestamp)
if err == nil {
output <- y
}
}
if !dev.excludeMetrics["rocm_d1_clock"] {
value := metrics.Average_dclk1_frequency
y, err := lp.New("rocm_d1_clock", dev.tags, dev.meta, map[string]interface{}{"value": value}, timestamp)
y, err := lp.NewMessage("rocm_d1_clock", dev.tags, dev.meta, map[string]interface{}{"value": value}, timestamp)
if err == nil {
output <- y
}
@@ -295,7 +295,7 @@ func (m *RocmSmiCollector) Read(interval time.Duration, output chan lp.CCMetric)
if !dev.excludeMetrics["rocm_temp_hbm"] {
for i := 0; i < rocm_smi.NUM_HBM_INSTANCES; i++ {
value := metrics.Temperature_hbm[i]
y, err := lp.New("rocm_temp_hbm", dev.tags, dev.meta, map[string]interface{}{"value": value}, timestamp)
y, err := lp.NewMessage("rocm_temp_hbm", dev.tags, dev.meta, map[string]interface{}{"value": value}, timestamp)
if err == nil {
y.AddTag("stype", "device")
y.AddTag("stype-id", fmt.Sprintf("%d", i))

View File

@@ -4,8 +4,8 @@ import (
"encoding/json"
"time"
lp "github.com/ClusterCockpit/cc-energy-manager/pkg/cc-message"
cclog "github.com/ClusterCockpit/cc-metric-collector/pkg/ccLogger"
lp "github.com/ClusterCockpit/cc-metric-collector/pkg/ccMetric"
)
// These are the fields we read from the JSON configuration
@@ -32,7 +32,7 @@ type SampleCollector struct {
func (m *SampleCollector) Init(config json.RawMessage) error {
var err error = nil
// Always set the name early in Init() to use it in cclog.Component* functions
m.name = "InternalCollector"
m.name = "SampleCollector"
// This is for later use, also call it early
m.setup()
// Tell whether the collector should be run in parallel with others (reading files, ...)
@@ -74,7 +74,7 @@ func (m *SampleCollector) Init(config json.RawMessage) error {
// Read collects all metrics belonging to the sample collector
// and sends them through the output channel to the collector manager
func (m *SampleCollector) Read(interval time.Duration, output chan lp.CCMetric) {
func (m *SampleCollector) Read(interval time.Duration, output chan lp.CCMessage) {
// Create a sample metric
timestamp := time.Now()
@@ -85,7 +85,7 @@ func (m *SampleCollector) Read(interval time.Duration, output chan lp.CCMetric)
// stop := readState()
// value = (stop - start) / interval.Seconds()
y, err := lp.New("sample_metric", m.tags, m.meta, map[string]interface{}{"value": value}, timestamp)
y, err := lp.NewMessage("sample_metric", m.tags, m.meta, map[string]interface{}{"value": value}, timestamp)
if err == nil {
// Send it to output channel
output <- y

View File

@@ -5,8 +5,8 @@ import (
"sync"
"time"
lp "github.com/ClusterCockpit/cc-energy-manager/pkg/cc-message"
cclog "github.com/ClusterCockpit/cc-metric-collector/pkg/ccLogger"
lp "github.com/ClusterCockpit/cc-metric-collector/pkg/ccMetric"
)
// These are the fields we read from the JSON configuration
@@ -25,7 +25,7 @@ type SampleTimerCollector struct {
config SampleTimerCollectorConfig // the configuration structure
interval time.Duration // the interval parsed from configuration
ticker *time.Ticker // own timer
output chan lp.CCMetric // own internal output channel
output chan lp.CCMessage // own internal output channel
}
func (m *SampleTimerCollector) Init(name string, config json.RawMessage) error {
@@ -100,14 +100,14 @@ func (m *SampleTimerCollector) ReadMetrics(timestamp time.Time) {
// stop := readState()
// value = (stop - start) / interval.Seconds()
y, err := lp.New("sample_metric", m.tags, m.meta, map[string]interface{}{"value": value}, timestamp)
y, err := lp.NewMessage("sample_metric", m.tags, m.meta, map[string]interface{}{"value": value}, timestamp)
if err == nil && m.output != nil {
// Send it to output channel if we have a valid channel
m.output <- y
}
}
func (m *SampleTimerCollector) Read(interval time.Duration, output chan lp.CCMetric) {
func (m *SampleTimerCollector) Read(interval time.Duration, output chan lp.CCMessage) {
// Capture output channel
m.output = output
}

View File

@@ -11,7 +11,7 @@ import (
"time"
cclog "github.com/ClusterCockpit/cc-metric-collector/pkg/ccLogger"
lp "github.com/ClusterCockpit/cc-metric-collector/pkg/ccMetric"
lp "github.com/ClusterCockpit/cc-energy-manager/pkg/cc-message"
)
const SCHEDSTATFILE = `/proc/schedstat`
@@ -96,7 +96,7 @@ func (m *SchedstatCollector) Init(config json.RawMessage) error {
return err
}
func (m *SchedstatCollector) ParseProcLine(linefields []string, tags map[string]string, output chan lp.CCMetric, now time.Time, tsdelta time.Duration) {
func (m *SchedstatCollector) ParseProcLine(linefields []string, tags map[string]string, output chan lp.CCMessage, now time.Time, tsdelta time.Duration) {
running, _ := strconv.ParseInt(linefields[7], 10, 64)
waiting, _ := strconv.ParseInt(linefields[8], 10, 64)
diff_running := running - m.olddata[linefields[0]]["running"]
@@ -109,7 +109,7 @@ func (m *SchedstatCollector) ParseProcLine(linefields []string, tags map[string]
m.olddata[linefields[0]]["waiting"] = waiting
value := l_running + l_waiting
y, err := lp.New("cpu_load_core", tags, m.meta, map[string]interface{}{"value": value}, now)
y, err := lp.NewMessage("cpu_load_core", tags, m.meta, map[string]interface{}{"value": value}, now)
if err == nil {
// Send it to output channel
output <- y
@@ -118,7 +118,7 @@ func (m *SchedstatCollector) ParseProcLine(linefields []string, tags map[string]
// Read collects all metrics belonging to the sample collector
// and sends them through the output channel to the collector manager
func (m *SchedstatCollector) Read(interval time.Duration, output chan lp.CCMetric) {
func (m *SchedstatCollector) Read(interval time.Duration, output chan lp.CCMessage) {
if !m.init {
return
}

View File

@@ -7,7 +7,7 @@ import (
"time"
cclog "github.com/ClusterCockpit/cc-metric-collector/pkg/ccLogger"
lp "github.com/ClusterCockpit/cc-metric-collector/pkg/ccMetric"
lp "github.com/ClusterCockpit/cc-energy-manager/pkg/cc-message"
)
type SelfCollectorConfig struct {
@@ -42,56 +42,56 @@ func (m *SelfCollector) Init(config json.RawMessage) error {
return err
}
func (m *SelfCollector) Read(interval time.Duration, output chan lp.CCMetric) {
func (m *SelfCollector) Read(interval time.Duration, output chan lp.CCMessage) {
timestamp := time.Now()
if m.config.MemStats {
var memstats runtime.MemStats
runtime.ReadMemStats(&memstats)
y, err := lp.New("total_alloc", m.tags, m.meta, map[string]interface{}{"value": memstats.TotalAlloc}, timestamp)
y, err := lp.NewMessage("total_alloc", m.tags, m.meta, map[string]interface{}{"value": memstats.TotalAlloc}, timestamp)
if err == nil {
y.AddMeta("unit", "Bytes")
output <- y
}
y, err = lp.New("heap_alloc", m.tags, m.meta, map[string]interface{}{"value": memstats.HeapAlloc}, timestamp)
y, err = lp.NewMessage("heap_alloc", m.tags, m.meta, map[string]interface{}{"value": memstats.HeapAlloc}, timestamp)
if err == nil {
y.AddMeta("unit", "Bytes")
output <- y
}
y, err = lp.New("heap_sys", m.tags, m.meta, map[string]interface{}{"value": memstats.HeapSys}, timestamp)
y, err = lp.NewMessage("heap_sys", m.tags, m.meta, map[string]interface{}{"value": memstats.HeapSys}, timestamp)
if err == nil {
y.AddMeta("unit", "Bytes")
output <- y
}
y, err = lp.New("heap_idle", m.tags, m.meta, map[string]interface{}{"value": memstats.HeapIdle}, timestamp)
y, err = lp.NewMessage("heap_idle", m.tags, m.meta, map[string]interface{}{"value": memstats.HeapIdle}, timestamp)
if err == nil {
y.AddMeta("unit", "Bytes")
output <- y
}
y, err = lp.New("heap_inuse", m.tags, m.meta, map[string]interface{}{"value": memstats.HeapInuse}, timestamp)
y, err = lp.NewMessage("heap_inuse", m.tags, m.meta, map[string]interface{}{"value": memstats.HeapInuse}, timestamp)
if err == nil {
y.AddMeta("unit", "Bytes")
output <- y
}
y, err = lp.New("heap_released", m.tags, m.meta, map[string]interface{}{"value": memstats.HeapReleased}, timestamp)
y, err = lp.NewMessage("heap_released", m.tags, m.meta, map[string]interface{}{"value": memstats.HeapReleased}, timestamp)
if err == nil {
y.AddMeta("unit", "Bytes")
output <- y
}
y, err = lp.New("heap_objects", m.tags, m.meta, map[string]interface{}{"value": memstats.HeapObjects}, timestamp)
y, err = lp.NewMessage("heap_objects", m.tags, m.meta, map[string]interface{}{"value": memstats.HeapObjects}, timestamp)
if err == nil {
output <- y
}
}
if m.config.GoRoutines {
y, err := lp.New("num_goroutines", m.tags, m.meta, map[string]interface{}{"value": runtime.NumGoroutine()}, timestamp)
y, err := lp.NewMessage("num_goroutines", m.tags, m.meta, map[string]interface{}{"value": runtime.NumGoroutine()}, timestamp)
if err == nil {
output <- y
}
}
if m.config.CgoCalls {
y, err := lp.New("num_cgo_calls", m.tags, m.meta, map[string]interface{}{"value": runtime.NumCgoCall()}, timestamp)
y, err := lp.NewMessage("num_cgo_calls", m.tags, m.meta, map[string]interface{}{"value": runtime.NumCgoCall()}, timestamp)
if err == nil {
output <- y
}
@@ -102,35 +102,35 @@ func (m *SelfCollector) Read(interval time.Duration, output chan lp.CCMetric) {
if err == nil {
sec, nsec := rusage.Utime.Unix()
t := float64(sec) + (float64(nsec) * 1e-9)
y, err := lp.New("rusage_user_time", m.tags, m.meta, map[string]interface{}{"value": t}, timestamp)
y, err := lp.NewMessage("rusage_user_time", m.tags, m.meta, map[string]interface{}{"value": t}, timestamp)
if err == nil {
y.AddMeta("unit", "seconds")
output <- y
}
sec, nsec = rusage.Stime.Unix()
t = float64(sec) + (float64(nsec) * 1e-9)
y, err = lp.New("rusage_system_time", m.tags, m.meta, map[string]interface{}{"value": t}, timestamp)
y, err = lp.NewMessage("rusage_system_time", m.tags, m.meta, map[string]interface{}{"value": t}, timestamp)
if err == nil {
y.AddMeta("unit", "seconds")
output <- y
}
y, err = lp.New("rusage_vol_ctx_switch", m.tags, m.meta, map[string]interface{}{"value": rusage.Nvcsw}, timestamp)
y, err = lp.NewMessage("rusage_vol_ctx_switch", m.tags, m.meta, map[string]interface{}{"value": rusage.Nvcsw}, timestamp)
if err == nil {
output <- y
}
y, err = lp.New("rusage_invol_ctx_switch", m.tags, m.meta, map[string]interface{}{"value": rusage.Nivcsw}, timestamp)
y, err = lp.NewMessage("rusage_invol_ctx_switch", m.tags, m.meta, map[string]interface{}{"value": rusage.Nivcsw}, timestamp)
if err == nil {
output <- y
}
y, err = lp.New("rusage_signals", m.tags, m.meta, map[string]interface{}{"value": rusage.Nsignals}, timestamp)
y, err = lp.NewMessage("rusage_signals", m.tags, m.meta, map[string]interface{}{"value": rusage.Nsignals}, timestamp)
if err == nil {
output <- y
}
y, err = lp.New("rusage_major_pgfaults", m.tags, m.meta, map[string]interface{}{"value": rusage.Majflt}, timestamp)
y, err = lp.NewMessage("rusage_major_pgfaults", m.tags, m.meta, map[string]interface{}{"value": rusage.Majflt}, timestamp)
if err == nil {
output <- y
}
y, err = lp.New("rusage_minor_pgfaults", m.tags, m.meta, map[string]interface{}{"value": rusage.Minflt}, timestamp)
y, err = lp.NewMessage("rusage_minor_pgfaults", m.tags, m.meta, map[string]interface{}{"value": rusage.Minflt}, timestamp)
if err == nil {
output <- y
}

View File

@@ -10,7 +10,7 @@ import (
"time"
cclog "github.com/ClusterCockpit/cc-metric-collector/pkg/ccLogger"
lp "github.com/ClusterCockpit/cc-metric-collector/pkg/ccMetric"
lp "github.com/ClusterCockpit/cc-energy-manager/pkg/cc-message"
)
// See: https://www.kernel.org/doc/html/latest/hwmon/sysfs-interface.html
@@ -171,7 +171,7 @@ func (m *TempCollector) Init(config json.RawMessage) error {
return nil
}
func (m *TempCollector) Read(interval time.Duration, output chan lp.CCMetric) {
func (m *TempCollector) Read(interval time.Duration, output chan lp.CCMessage) {
for _, sensor := range m.sensors {
// Read sensor file
@@ -190,7 +190,7 @@ func (m *TempCollector) Read(interval time.Duration, output chan lp.CCMetric) {
continue
}
x /= 1000
y, err := lp.New(
y, err := lp.NewMessage(
sensor.metricName,
sensor.tags,
m.meta,
@@ -203,7 +203,7 @@ func (m *TempCollector) Read(interval time.Duration, output chan lp.CCMetric) {
// max temperature
if m.config.ReportMaxTemp && sensor.maxTemp != 0 {
y, err := lp.New(
y, err := lp.NewMessage(
sensor.maxTempName,
sensor.tags,
m.meta,
@@ -217,7 +217,7 @@ func (m *TempCollector) Read(interval time.Duration, output chan lp.CCMetric) {
// critical temperature
if m.config.ReportCriticalTemp && sensor.critTemp != 0 {
y, err := lp.New(
y, err := lp.NewMessage(
sensor.critTempName,
sensor.tags,
m.meta,

View File

@@ -9,7 +9,7 @@ import (
"strings"
"time"
lp "github.com/ClusterCockpit/cc-metric-collector/pkg/ccMetric"
lp "github.com/ClusterCockpit/cc-energy-manager/pkg/cc-message"
)
const MAX_NUM_PROCS = 10
@@ -53,7 +53,7 @@ func (m *TopProcsCollector) Init(config json.RawMessage) error {
return nil
}
func (m *TopProcsCollector) Read(interval time.Duration, output chan lp.CCMetric) {
func (m *TopProcsCollector) Read(interval time.Duration, output chan lp.CCMessage) {
if !m.init {
return
}
@@ -68,7 +68,7 @@ func (m *TopProcsCollector) Read(interval time.Duration, output chan lp.CCMetric
lines := strings.Split(string(stdout), "\n")
for i := 1; i < m.config.Num_procs+1; i++ {
name := fmt.Sprintf("topproc%d", i)
y, err := lp.New(name, m.tags, m.meta, map[string]interface{}{"value": string(lines[i])}, time.Now())
y, err := lp.NewMessage(name, m.tags, m.meta, map[string]interface{}{"value": string(lines[i])}, time.Now())
if err == nil {
output <- y
}