Merge develop branch into main (#106)

* Add cpu_used (all-cpu_idle) to CpustatCollector

* Update to line-protocol/v2

* Update runonce.yml with Golang 1.20

* Update fsnotify in LIKWID Collector

* Use not a pointer to line-protocol.Encoder

* Simplify Makefile

* Use only as many arguments as required

* Allow sum function to handle non float types

* Allow values to be a slice of type float64, float32, int, int64, int32, bool

* Use generic function to simplify code

* Add missing case for type []int32

* Use generic function to compute minimum

* Use generic function to compute maximum

* Use generic function to compute average

* Add error value to sumAnyType

* Use generic function to compute median

* For older versions of go slices is not part of the installation

* Remove old entries from go.sum

* Use simpler sort function

* Compute metrics ib_total and ib_total_pkts

* Add aggregated metrics.
Add missing units

* Update likwidMetric.go

Fixes a potential bug when `fsnotify.NewWatcher()` fails with an error

* Completly avoid memory allocations in infinibandMetric read()

* Fixed initialization: Initalization and measurements should run in the same thread

---------

Co-authored-by: Holger Obermaier <40787752+ho-ob@users.noreply.github.com>
This commit is contained in:
Thomas Gruber
2023-08-29 14:12:49 +02:00
committed by GitHub
parent 3d7bb4cdd7
commit 195d0794b0
17 changed files with 746 additions and 839 deletions

View File

@@ -1,31 +1,33 @@
# LIKWID version
LIKWID_VERSION = 5.2.2
LIKWID_INSTALLED_FOLDER=$(shell dirname $(shell which likwid-topology 2>/dev/null) 2>/dev/null)
LIKWID_VERSION := 5.2.2
LIKWID_INSTALLED_FOLDER := $(shell dirname $$(which likwid-topology 2>/dev/null) 2>/dev/null)
LIKWID_FOLDER="$(shell pwd)/likwid"
LIKWID_FOLDER := $(CURDIR)/likwid
all: $(LIKWID_FOLDER)/likwid.h
all: likwid
.ONESHELL:
.PHONY: $(LIKWID_FOLDER)/likwid.h
$(LIKWID_FOLDER)/likwid.h:
if [ "$(LIKWID_INSTALLED_FOLDER)" != "" ]; then \
BASE="$(LIKWID_INSTALLED_FOLDER)/../include"; \
mkdir -p $(LIKWID_FOLDER); \
cp $$BASE/*.h $(LIKWID_FOLDER); \
else \
BUILD_FOLDER="$${PWD}/likwidbuild"; \
if [ -d $(LIKWID_FOLDER) ]; then rm -r $(LIKWID_FOLDER); fi; \
mkdir --parents --verbose $(LIKWID_FOLDER) $${BUILD_FOLDER}; \
wget -P "$${BUILD_FOLDER}" http://ftp.rrze.uni-erlangen.de/mirrors/likwid/likwid-$(LIKWID_VERSION).tar.gz; \
tar -C $${BUILD_FOLDER} -xf $${BUILD_FOLDER}/likwid-$(LIKWID_VERSION).tar.gz; \
install -Dpm 0644 $${BUILD_FOLDER}/likwid-$(LIKWID_VERSION)/src/includes/likwid*.h $(LIKWID_FOLDER)/; \
install -Dpm 0644 $${BUILD_FOLDER}/likwid-$(LIKWID_VERSION)/src/includes/bstrlib.h $(LIKWID_FOLDER)/; \
rm -r $${BUILD_FOLDER}; \
.PHONY: likwid
likwid:
if [ -n "$(LIKWID_INSTALLED_FOLDER)" ]; then
# Using likwid include files from system installation
INCLUDE_DIR="$(LIKWID_INSTALLED_FOLDER)/../include"
mkdir --parents --verbose "$(LIKWID_FOLDER)"
cp "$${INCLUDE_DIR}"/*.h "$(LIKWID_FOLDER)"
else
# Using likwid include files from downloaded tar archive
if [ -d "$(LIKWID_FOLDER)" ]; then
rm --recursive "$(LIKWID_FOLDER)"
fi
BUILD_FOLDER="$${PWD}/likwidbuild"
mkdir --parents --verbose "$${BUILD_FOLDER}"
wget --output-document=- http://ftp.rrze.uni-erlangen.de/mirrors/likwid/likwid-$(LIKWID_VERSION).tar.gz |
tar --directory="$${BUILD_FOLDER}" --extract --gz
install -D --verbose --preserve-timestamps --mode=0644 --target-directory="$(LIKWID_FOLDER)" "$${BUILD_FOLDER}/likwid-$(LIKWID_VERSION)/src/includes"/likwid*.h "$${BUILD_FOLDER}/likwid-$(LIKWID_VERSION)/src/includes"/bstrlib.h
rm --recursive "$${BUILD_FOLDER}"
fi
.PHONY: clean
clean:
rm -rf likwid
.PHONY: clean

View File

@@ -119,12 +119,21 @@ func (m *CpustatCollector) parseStatLine(linefields []string, tags map[string]st
}
}
sum := float64(0)
for name, value := range values {
sum += value
y, err := lp.New(name, tags, m.meta, map[string]interface{}{"value": value * 100}, now)
if err == nil {
output <- y
}
}
if v, ok := values["cpu_idle"]; ok {
sum -= v
y, err := lp.New("cpu_used", tags, m.meta, map[string]interface{}{"value": sum * 100}, now)
if err == nil {
output <- y
}
}
}
func (m *CpustatCollector) Read(interval time.Duration, output chan lp.CCMetric) {

View File

@@ -23,3 +23,4 @@ Metrics:
* `cpu_steal`
* `cpu_guest`
* `cpu_guest_nice`
* `cpu_used` = `cpu_* - cpu_idle`

View File

@@ -31,6 +31,7 @@ type GpfsCollector struct {
Mmpmon string `json:"mmpmon_path,omitempty"`
ExcludeFilesystem []string `json:"exclude_filesystem,omitempty"`
SendBandwidths bool `json:"send_bandwidths"`
SendTotalValues bool `json:"send_total_values"`
}
skipFS map[string]struct{}
lastTimestamp time.Time // Store time stamp of last tick to derive bandwidths
@@ -216,13 +217,33 @@ func (m *GpfsCollector) Read(interval time.Duration, output chan lp.CCMetric) {
fmt.Sprintf("Read(): Failed to convert bytes read '%s' to int64: %v", key_value["_br_"], err))
continue
}
if y, err := lp.New("gpfs_bytes_read", m.tags, m.meta, map[string]interface{}{"value": bytesRead}, timestamp); err == nil {
if y, err :=
lp.New(
"gpfs_bytes_read",
m.tags,
m.meta,
map[string]interface{}{
"value": bytesRead,
},
timestamp,
); err == nil {
y.AddMeta("unit", "bytes")
output <- y
}
if m.config.SendBandwidths {
if lastBytesRead := m.lastState[filesystem].bytesRead; lastBytesRead >= 0 {
bwRead := float64(bytesRead-lastBytesRead) / timeDiff
if y, err := lp.New("gpfs_bw_read", m.tags, m.meta, map[string]interface{}{"value": bwRead}, timestamp); err == nil {
if y, err :=
lp.New(
"gpfs_bw_read",
m.tags,
m.meta,
map[string]interface{}{
"value": bwRead,
},
timestamp,
); err == nil {
y.AddMeta("unit", "bytes/sec")
output <- y
}
}
@@ -236,13 +257,33 @@ func (m *GpfsCollector) Read(interval time.Duration, output chan lp.CCMetric) {
fmt.Sprintf("Read(): Failed to convert bytes written '%s' to int64: %v", key_value["_bw_"], err))
continue
}
if y, err := lp.New("gpfs_bytes_written", m.tags, m.meta, map[string]interface{}{"value": bytesWritten}, timestamp); err == nil {
if y, err :=
lp.New(
"gpfs_bytes_written",
m.tags,
m.meta,
map[string]interface{}{
"value": bytesWritten,
},
timestamp,
); err == nil {
y.AddMeta("unit", "bytes")
output <- y
}
if m.config.SendBandwidths {
if lastBytesWritten := m.lastState[filesystem].bytesRead; lastBytesWritten >= 0 {
bwWrite := float64(bytesWritten-lastBytesWritten) / timeDiff
if y, err := lp.New("gpfs_bw_write", m.tags, m.meta, map[string]interface{}{"value": bwWrite}, timestamp); err == nil {
if y, err :=
lp.New(
"gpfs_bw_write",
m.tags,
m.meta,
map[string]interface{}{
"value": bwWrite,
},
timestamp,
); err == nil {
y.AddMeta("unit", "bytes/sec")
output <- y
}
}
@@ -326,6 +367,47 @@ func (m *GpfsCollector) Read(interval time.Duration, output chan lp.CCMetric) {
if y, err := lp.New("gpfs_num_inode_updates", m.tags, m.meta, map[string]interface{}{"value": numInodeUpdates}, timestamp); err == nil {
output <- y
}
// Total values
if m.config.SendTotalValues {
bytesTotal := bytesRead + bytesWritten
if y, err :=
lp.New("gpfs_bytes_total",
m.tags,
m.meta,
map[string]interface{}{
"value": bytesTotal,
},
timestamp,
); err == nil {
y.AddMeta("unit", "bytes")
output <- y
}
iops := numReads + numWrites
if y, err :=
lp.New("gpfs_iops",
m.tags,
m.meta,
map[string]interface{}{
"value": iops,
},
timestamp,
); err == nil {
output <- y
}
metaops := numInodeUpdates + numCloses + numOpens + numReaddirs
if y, err :=
lp.New("gpfs_metaops",
m.tags,
m.meta,
map[string]interface{}{
"value": metaops,
},
timestamp,
); err == nil {
output <- y
}
}
}
}

View File

@@ -6,7 +6,8 @@
"exclude_filesystem": [
"fs1"
],
"send_bandwidths" : true
"send_bandwidths": true,
"send_total_values": true
}
```
@@ -26,8 +27,12 @@ Metrics:
* `gpfs_num_opens`
* `gpfs_num_closes`
* `gpfs_num_reads`
* `gpfs_num_writes`
* `gpfs_num_readdirs`
* `gpfs_num_inode_updates`
* `gpfs_bytes_total = gpfs_bytes_read + gpfs_bytes_written` (if `send_total_values == true`)
* `gpfs_iops = gpfs_num_reads + gpfs_num_writes` (if `send_total_values == true`)
* `gpfs_metaops = gpfs_num_inode_updates + gpfs_num_closes + gpfs_num_opens + gpfs_num_readdirs` (if `send_total_values == true`)
* `gpfs_bw_read` (if `send_bandwidths == true`)
* `gpfs_bw_write` (if `send_bandwidths == true`)

View File

@@ -18,18 +18,22 @@ import (
const IB_BASEPATH = "/sys/class/infiniband/"
type InfinibandCollectorMetric struct {
path string
unit string
scale int64
name string
path string
unit string
scale int64
addToIBTotal bool
addToIBTotalPkgs bool
currentState int64
lastState int64
}
type InfinibandCollectorInfo struct {
LID string // IB local Identifier (LID)
device string // IB device
port string // IB device port
portCounterFiles map[string]InfinibandCollectorMetric // mapping counter name -> InfinibandCollectorMetric
tagSet map[string]string // corresponding tag list
lastState map[string]int64 // State from last measurement
LID string // IB local Identifier (LID)
device string // IB device
port string // IB device port
portCounterFiles []InfinibandCollectorMetric // mapping counter name -> InfinibandCollectorMetric
tagSet map[string]string // corresponding tag list
}
type InfinibandCollector struct {
@@ -37,9 +41,10 @@ type InfinibandCollector struct {
config struct {
ExcludeDevices []string `json:"exclude_devices,omitempty"` // IB device to exclude e.g. mlx5_0
SendAbsoluteValues bool `json:"send_abs_values"` // Send absolut values as read from sys filesystem
SendTotalValues bool `json:"send_total_values"` // Send computed total values
SendDerivedValues bool `json:"send_derived_values"` // Send derived values e.g. rates
}
info []*InfinibandCollectorInfo
info []InfinibandCollectorInfo
lastTimestamp time.Time // Store time stamp of last tick to derive bandwidths
}
@@ -112,11 +117,39 @@ func (m *InfinibandCollector) Init(config json.RawMessage) error {
// Check access to counter files
countersDir := filepath.Join(path, "counters")
portCounterFiles := map[string]InfinibandCollectorMetric{
"ib_recv": {path: filepath.Join(countersDir, "port_rcv_data"), unit: "bytes", scale: 4},
"ib_xmit": {path: filepath.Join(countersDir, "port_xmit_data"), unit: "bytes", scale: 4},
"ib_recv_pkts": {path: filepath.Join(countersDir, "port_rcv_packets"), unit: "packets", scale: 1},
"ib_xmit_pkts": {path: filepath.Join(countersDir, "port_xmit_packets"), unit: "packets", scale: 1},
portCounterFiles := []InfinibandCollectorMetric{
{
name: "ib_recv",
path: filepath.Join(countersDir, "port_rcv_data"),
unit: "bytes",
scale: 4,
addToIBTotal: true,
lastState: -1,
},
{
name: "ib_xmit",
path: filepath.Join(countersDir, "port_xmit_data"),
unit: "bytes",
scale: 4,
addToIBTotal: true,
lastState: -1,
},
{
name: "ib_recv_pkts",
path: filepath.Join(countersDir, "port_rcv_packets"),
unit: "packets",
scale: 1,
addToIBTotalPkgs: true,
lastState: -1,
},
{
name: "ib_xmit_pkts",
path: filepath.Join(countersDir, "port_xmit_packets"),
unit: "packets",
scale: 1,
addToIBTotalPkgs: true,
lastState: -1,
},
}
for _, counter := range portCounterFiles {
err := unix.Access(counter.path, unix.R_OK)
@@ -125,14 +158,8 @@ func (m *InfinibandCollector) Init(config json.RawMessage) error {
}
}
// Initialize last state
lastState := make(map[string]int64)
for counter := range portCounterFiles {
lastState[counter] = -1
}
m.info = append(m.info,
&InfinibandCollectorInfo{
InfinibandCollectorInfo{
LID: LID,
device: device,
port: port,
@@ -143,7 +170,6 @@ func (m *InfinibandCollector) Init(config json.RawMessage) error {
"port": port,
"lid": LID,
},
lastState: lastState,
})
}
@@ -170,8 +196,12 @@ func (m *InfinibandCollector) Read(interval time.Duration, output chan lp.CCMetr
// Save current timestamp
m.lastTimestamp = now
for _, info := range m.info {
for counterName, counterDef := range info.portCounterFiles {
for i := range m.info {
info := &m.info[i]
var ib_total, ib_total_pkts int64
for i := range info.portCounterFiles {
counterDef := &info.portCounterFiles[i]
// Read counter file
line, err := os.ReadFile(counterDef.path)
@@ -188,15 +218,26 @@ func (m *InfinibandCollector) Read(interval time.Duration, output chan lp.CCMetr
if err != nil {
cclog.ComponentError(
m.name,
fmt.Sprintf("Read(): Failed to convert Infininiband metrice %s='%s' to int64: %v", counterName, data, err))
fmt.Sprintf("Read(): Failed to convert Infininiband metrice %s='%s' to int64: %v", counterDef.name, data, err))
continue
}
// Scale raw value
v *= counterDef.scale
// Save current state
counterDef.currentState = v
// Send absolut values
if m.config.SendAbsoluteValues {
if y, err := lp.New(counterName, info.tagSet, m.meta, map[string]interface{}{"value": v}, now); err == nil {
if y, err :=
lp.New(
counterDef.name,
info.tagSet,
m.meta,
map[string]interface{}{
"value": counterDef.currentState,
},
now); err == nil {
y.AddMeta("unit", counterDef.unit)
output <- y
}
@@ -204,18 +245,64 @@ func (m *InfinibandCollector) Read(interval time.Duration, output chan lp.CCMetr
// Send derived values
if m.config.SendDerivedValues {
if info.lastState[counterName] >= 0 {
rate := float64((v - info.lastState[counterName])) / timeDiff
if y, err := lp.New(counterName+"_bw", info.tagSet, m.meta, map[string]interface{}{"value": rate}, now); err == nil {
if counterDef.lastState >= 0 {
rate := float64((counterDef.currentState - counterDef.lastState)) / timeDiff
if y, err :=
lp.New(
counterDef.name+"_bw",
info.tagSet,
m.meta,
map[string]interface{}{
"value": rate,
},
now); err == nil {
y.AddMeta("unit", counterDef.unit+"/sec")
output <- y
}
}
// Save current state
info.lastState[counterName] = v
counterDef.lastState = counterDef.currentState
}
// Sum up total values
if m.config.SendTotalValues {
switch {
case counterDef.addToIBTotal:
ib_total += counterDef.currentState
case counterDef.addToIBTotalPkgs:
ib_total_pkts += counterDef.currentState
}
}
}
// Send total values
if m.config.SendTotalValues {
if y, err :=
lp.New(
"ib_total",
info.tagSet,
m.meta,
map[string]interface{}{
"value": ib_total,
},
now); err == nil {
y.AddMeta("unit", "bytes")
output <- y
}
if y, err :=
lp.New(
"ib_total_pkts",
info.tagSet,
m.meta,
map[string]interface{}{
"value": ib_total_pkts,
},
now); err == nil {
y.AddMeta("unit", "packets")
output <- y
}
}
}
}

View File

@@ -17,13 +17,16 @@ LID file (`/sys/class/infiniband/<dev>/ports/<port>/lid`)
The devices can be filtered with the `exclude_devices` option in the configuration.
For each found LID the collector reads data through the sysfs files below `/sys/class/infiniband/<device>`.
For each found LID the collector reads data through the sysfs files below `/sys/class/infiniband/<device>`. (See: <https://www.kernel.org/doc/Documentation/ABI/stable/sysfs-class-infiniband>)
Metrics:
* `ib_recv`
* `ib_xmit`
* `ib_recv_pkts`
* `ib_xmit_pkts`
* `ib_total = ib_recv + ib_xmit` (if `send_total_values == true`)
* `ib_total_pkts = ib_recv_pkts + ib_xmit_pkts` (if `send_total_values == true`)
* `ib_recv_bw` (if `send_derived_values == true`)
* `ib_xmit_bw` (if `send_derived_values == true`)
* `ib_recv_pkts_bw` (if `send_derived_values == true`)

View File

@@ -30,7 +30,7 @@ import (
topo "github.com/ClusterCockpit/cc-metric-collector/pkg/ccTopology"
"github.com/NVIDIA/go-nvml/pkg/dl"
"golang.design/x/thread"
fsnotify "gopkg.in/fsnotify.v0"
fsnotify "gopkg.in/fsnotify.v1"
)
const (
@@ -306,17 +306,36 @@ func (m *LikwidCollector) Init(config json.RawMessage) error {
cclog.ComponentError(m.name, err.Error())
return err
}
m.measureThread = thread.New()
switch m.config.AccessMode {
case "direct":
C.HPMmode(0)
m.measureThread.Call(
func() {
C.HPMmode(0)
})
case "accessdaemon":
if len(m.config.DaemonPath) > 0 {
p := os.Getenv("PATH")
os.Setenv("PATH", m.config.DaemonPath+":"+p)
}
C.HPMmode(1)
m.measureThread.Call(
func() {
C.HPMmode(1)
retCode := C.HPMinit()
if retCode != 0 {
err := fmt.Errorf("C.HPMinit() failed with return code %v", retCode)
cclog.ComponentError(m.name, err.Error())
}
})
for _, c := range m.cpulist {
C.HPMaddThread(c)
m.measureThread.Call(
func() {
retCode := C.HPMaddThread(c)
if retCode != 0 {
err := fmt.Errorf("C.HPMaddThread(%v) failed with return code %v", c, retCode)
cclog.ComponentError(m.name, err.Error())
}
})
}
}
m.sock2tid = make(map[int]int)
@@ -331,7 +350,6 @@ func (m *LikwidCollector) Init(config json.RawMessage) error {
}
m.basefreq = getBaseFreq()
m.measureThread = thread.New()
m.init = true
return nil
}
@@ -344,6 +362,7 @@ func (m *LikwidCollector) takeMeasurement(evidx int, evset LikwidEventsetConfig,
watcher, err := fsnotify.NewWatcher()
if err != nil {
cclog.ComponentError(m.name, err.Error())
return true, err
}
defer watcher.Close()
if len(m.config.LockfilePath) > 0 {
@@ -360,7 +379,7 @@ func (m *LikwidCollector) takeMeasurement(evidx int, evset LikwidEventsetConfig,
return true, fmt.Errorf("Access to performance counters locked by %d", stat.Uid)
}
}
err = watcher.Watch(m.config.LockfilePath)
err = watcher.Add(m.config.LockfilePath)
if err != nil {
cclog.ComponentError(m.name, err.Error())
}
@@ -368,9 +387,9 @@ func (m *LikwidCollector) takeMeasurement(evidx int, evset LikwidEventsetConfig,
m.lock.Lock()
defer m.lock.Unlock()
select {
case e := <-watcher.Event:
case e := <-watcher.Events:
ret = -1
if !e.IsAttrib() {
if e.Op != fsnotify.Chmod {
ret = C.perfmon_init(C.int(len(m.cpulist)), &m.cpulist[0])
}
default:
@@ -384,9 +403,9 @@ func (m *LikwidCollector) takeMeasurement(evidx int, evset LikwidEventsetConfig,
select {
case <-sigchan:
gid = -1
case e := <-watcher.Event:
case e := <-watcher.Events:
gid = -1
if !e.IsAttrib() {
if e.Op != fsnotify.Chmod {
gid = C.perfmon_addEventSet(evset.estr)
}
default:
@@ -401,8 +420,8 @@ func (m *LikwidCollector) takeMeasurement(evidx int, evset LikwidEventsetConfig,
select {
case <-sigchan:
ret = -1
case e := <-watcher.Event:
if !e.IsAttrib() {
case e := <-watcher.Events:
if e.Op != fsnotify.Chmod {
ret = C.perfmon_setupCounters(gid)
}
default:
@@ -414,8 +433,8 @@ func (m *LikwidCollector) takeMeasurement(evidx int, evset LikwidEventsetConfig,
select {
case <-sigchan:
ret = -1
case e := <-watcher.Event:
if !e.IsAttrib() {
case e := <-watcher.Events:
if e.Op != fsnotify.Chmod {
ret = C.perfmon_startCounters()
}
default:
@@ -427,8 +446,8 @@ func (m *LikwidCollector) takeMeasurement(evidx int, evset LikwidEventsetConfig,
select {
case <-sigchan:
ret = -1
case e := <-watcher.Event:
if !e.IsAttrib() {
case e := <-watcher.Events:
if e.Op != fsnotify.Chmod {
ret = C.perfmon_readCounters()
}
default:
@@ -441,8 +460,8 @@ func (m *LikwidCollector) takeMeasurement(evidx int, evset LikwidEventsetConfig,
select {
case <-sigchan:
ret = -1
case e := <-watcher.Event:
if !e.IsAttrib() {
case e := <-watcher.Events:
if e.Op != fsnotify.Chmod {
ret = C.perfmon_readCounters()
}
default:
@@ -468,8 +487,8 @@ func (m *LikwidCollector) takeMeasurement(evidx int, evset LikwidEventsetConfig,
select {
case <-sigchan:
ret = -1
case e := <-watcher.Event:
if !e.IsAttrib() {
case e := <-watcher.Events:
if e.Op != fsnotify.Chmod {
ret = C.perfmon_stopCounters()
}
default:
@@ -480,8 +499,8 @@ func (m *LikwidCollector) takeMeasurement(evidx int, evset LikwidEventsetConfig,
}
signal.Stop(sigchan)
select {
case e := <-watcher.Event:
if !e.IsAttrib() {
case e := <-watcher.Events:
if e.Op != fsnotify.Chmod {
C.perfmon_finalize()
}
default:
@@ -589,7 +608,6 @@ func (m *LikwidCollector) calcGlobalMetrics(groups []LikwidEventsetConfig, inter
return nil
}
func (m *LikwidCollector) ReadThread(interval time.Duration, output chan lp.CCMetric) {
var err error = nil
groups := make([]LikwidEventsetConfig, 0)