mirror of
https://github.com/ClusterCockpit/cc-metric-collector.git
synced 2025-07-20 20:01:40 +02:00
Compare commits
43 Commits
Author | SHA1 | Date | |
---|---|---|---|
|
155d1b9acf | ||
|
c9b9752b6a | ||
|
3c8a5e434f | ||
|
efd4f5feb4 | ||
|
a1f4dd6a6c | ||
|
d55e579195 | ||
|
b78e83b055 | ||
|
56b41a9e57 | ||
|
ae98807ace | ||
|
31a8e63d72 | ||
|
6f1f33f3a5 | ||
|
a29f0c7e3b | ||
|
4fb6ac0140 | ||
|
5918f96fd8 | ||
|
8cb87a2165 | ||
|
3e91a37dee | ||
|
ed68baeada | ||
|
888db31dbf | ||
|
c938d32629 | ||
|
d5daf54d4f | ||
|
18bffd7c14 | ||
|
bd0105b370 | ||
|
b1a8674c4c | ||
|
234ad3c54e | ||
|
7bb80780e0 | ||
|
e66d52bb32 | ||
|
9840d0193d | ||
|
ce7eef8d30 | ||
|
92e45ca62c | ||
|
fd10a279fc | ||
|
9e63d0ea59 | ||
|
76bb033a88 | ||
|
deb1bcfa2f | ||
|
7a67d5e25f | ||
|
9ae0806aa9 | ||
|
4bd71224df | ||
|
6bf3bfd10a | ||
|
0fbff00996 | ||
|
8849824ba9 | ||
|
ed511b7c09 | ||
|
a0acf01dc3 | ||
|
58461f1f72 | ||
|
c09d8fb118 |
32
Makefile
32
Makefile
@@ -84,7 +84,7 @@ RPM: scripts/cc-metric-collector.spec
|
|||||||
@COMMITISH="HEAD"
|
@COMMITISH="HEAD"
|
||||||
@VERS=$$(git describe --tags $${COMMITISH})
|
@VERS=$$(git describe --tags $${COMMITISH})
|
||||||
@VERS=$${VERS#v}
|
@VERS=$${VERS#v}
|
||||||
@VERS=$$(echo $${VERS} | sed -e s+'-'+'_'+g)
|
@VERS=$$(echo $$VERS | sed -e s+'-'+'_'+g)
|
||||||
@eval $$(rpmspec --query --queryformat "NAME='%{name}' VERSION='%{version}' RELEASE='%{release}' NVR='%{NVR}' NVRA='%{NVRA}'" --define="VERS $${VERS}" "$${SPECFILE}")
|
@eval $$(rpmspec --query --queryformat "NAME='%{name}' VERSION='%{version}' RELEASE='%{release}' NVR='%{NVR}' NVRA='%{NVRA}'" --define="VERS $${VERS}" "$${SPECFILE}")
|
||||||
@PREFIX="$${NAME}-$${VERSION}"
|
@PREFIX="$${NAME}-$${VERSION}"
|
||||||
@FORMAT="tar.gz"
|
@FORMAT="tar.gz"
|
||||||
@@ -96,8 +96,10 @@ RPM: scripts/cc-metric-collector.spec
|
|||||||
@if [[ "$${GITHUB_ACTIONS}" == true ]]; then
|
@if [[ "$${GITHUB_ACTIONS}" == true ]]; then
|
||||||
@ RPMFILE="$${RPMDIR}/$${ARCH}/$${NVRA}.rpm"
|
@ RPMFILE="$${RPMDIR}/$${ARCH}/$${NVRA}.rpm"
|
||||||
@ SRPMFILE="$${SRPMDIR}/$${NVR}.src.rpm"
|
@ SRPMFILE="$${SRPMDIR}/$${NVR}.src.rpm"
|
||||||
@ echo "SRPM=$${SRPMFILE}" >> $${GITHUB_OUTPUT}
|
@ echo "RPM: $${RPMFILE}"
|
||||||
@ echo "RPM=$${RPMFILE}" >> $${GITHUB_OUTPUT}
|
@ echo "SRPM: $${SRPMFILE}"
|
||||||
|
@ echo "::set-output name=SRPM::$${SRPMFILE}"
|
||||||
|
@ echo "::set-output name=RPM::$${RPMFILE}"
|
||||||
@fi
|
@fi
|
||||||
|
|
||||||
.PHONY: DEB
|
.PHONY: DEB
|
||||||
@@ -106,25 +108,29 @@ DEB: scripts/cc-metric-collector.deb.control $(APP)
|
|||||||
@WORKSPACE=$${PWD}/.dpkgbuild
|
@WORKSPACE=$${PWD}/.dpkgbuild
|
||||||
@DEBIANDIR=$${WORKSPACE}/debian
|
@DEBIANDIR=$${WORKSPACE}/debian
|
||||||
@DEBIANBINDIR=$${WORKSPACE}/DEBIAN
|
@DEBIANBINDIR=$${WORKSPACE}/DEBIAN
|
||||||
@mkdir --parents --verbose $${WORKSPACE} $${DEBIANBINDIR}
|
@mkdir --parents --verbose $$WORKSPACE $$DEBIANBINDIR
|
||||||
#@mkdir --parents --verbose $$DEBIANDIR
|
#@mkdir --parents --verbose $$DEBIANDIR
|
||||||
@CONTROLFILE="$${BASEDIR}/scripts/cc-metric-collector.deb.control"
|
@CONTROLFILE="$${BASEDIR}/scripts/cc-metric-collector.deb.control"
|
||||||
@COMMITISH="HEAD"
|
@COMMITISH="HEAD"
|
||||||
|
@git describe --tags --abbrev=0 $${COMMITISH}
|
||||||
@VERS=$$(git describe --tags --abbrev=0 $${COMMITISH})
|
@VERS=$$(git describe --tags --abbrev=0 $${COMMITISH})
|
||||||
@if [ -z "$${VERS}" ]; then VERS=${GITHUB_REF_NAME}; fi
|
@if [ -z "$$VERS" ]; then VERS=${GITHUB_REF_NAME}; fi
|
||||||
@VERS=$${VERS#v}
|
@VERS=$${VERS#v}
|
||||||
@VERS=$$(echo $${VERS} | sed -e s+'-'+'_'+g)
|
@VERS=$$(echo $$VERS | sed -e s+'-'+'_'+g)
|
||||||
@ARCH=$$(uname -m)
|
@ARCH=$$(uname -m)
|
||||||
@ARCH=$$(echo $${ARCH} | sed -e s+'_'+'-'+g)
|
@ARCH=$$(echo $$ARCH | sed -e s+'_'+'-'+g)
|
||||||
@if [ "$${ARCH}" = "x86-64" ]; then ARCH=amd64; fi
|
|
||||||
@PREFIX="$${NAME}-$${VERSION}_$${ARCH}"
|
@PREFIX="$${NAME}-$${VERSION}_$${ARCH}"
|
||||||
@SIZE_BYTES=$$(du -bcs --exclude=.dpkgbuild "$${WORKSPACE}"/ | awk '{print $$1}' | head -1 | sed -e 's/^0\+//')
|
@SIZE_BYTES=$$(du -bcs --exclude=.dpkgbuild "$$WORKSPACE"/ | awk '{print $$1}' | head -1 | sed -e 's/^0\+//')
|
||||||
@SIZE="$$(awk -v size="$${SIZE_BYTES}" 'BEGIN {print (size/1024)+1}' | awk '{print int($$0)}')"
|
@SIZE="$$(awk -v size="$$SIZE_BYTES" 'BEGIN {print (size/1024)+1}' | awk '{print int($$0)}')"
|
||||||
@sed -e s+"{VERSION}"+"$${VERS}"+g -e s+"{INSTALLED_SIZE}"+"$${SIZE}"+g -e s+"{ARCH}"+"$${ARCH}"+g $${CONTROLFILE} > $${DEBIANBINDIR}/control
|
#@sed -e s+"{VERSION}"+"$$VERS"+g -e s+"{INSTALLED_SIZE}"+"$$SIZE"+g -e s+"{ARCH}"+"$$ARCH"+g $$CONTROLFILE > $${DEBIANDIR}/control
|
||||||
|
@echo "Version: $$VERS"
|
||||||
|
@echo "Size: $$SIZE"
|
||||||
|
@echo "Arch: $$ARCH"
|
||||||
|
@sed -e s+"{VERSION}"+"$$VERS"+g -e s+"{INSTALLED_SIZE}"+"$$SIZE"+g -e s+"{ARCH}"+"$$ARCH"+g $$CONTROLFILE > $${DEBIANBINDIR}/control
|
||||||
@make PREFIX=$${WORKSPACE} install
|
@make PREFIX=$${WORKSPACE} install
|
||||||
@DEB_FILE="cc-metric-collector_$${VERS}_$${ARCH}.deb"
|
@DEB_FILE="cc-metric-collector_$${VERS}_$${ARCH}.deb"
|
||||||
@dpkg-deb -b $${WORKSPACE} "$${DEB_FILE}"
|
@dpkg-deb -b $${WORKSPACE} "$$DEB_FILE"
|
||||||
@if [ "$${GITHUB_ACTIONS}" = "true" ]; then
|
@if [ "$${GITHUB_ACTIONS}" = "true" ]; then
|
||||||
@ echo "DEB=$${DEB_FILE}" >> $${GITHUB_OUTPUT}
|
@ echo "::set-output name=DEB::$${DEB_FILE}"
|
||||||
@fi
|
@fi
|
||||||
@rm -r "$${WORKSPACE}"
|
@rm -r "$${WORKSPACE}"
|
||||||
|
@@ -8,10 +8,6 @@ There is a single timer loop that triggers all collectors serially, collects the
|
|||||||
|
|
||||||
The receiver runs as a go routine side-by-side with the timer loop and asynchronously forwards received metrics to the sink.
|
The receiver runs as a go routine side-by-side with the timer loop and asynchronously forwards received metrics to the sink.
|
||||||
|
|
||||||
|
|
||||||
[](https://doi.org/10.5281/zenodo.7438287)
|
|
||||||
|
|
||||||
|
|
||||||
# Configuration
|
# Configuration
|
||||||
|
|
||||||
Configuration is implemented using a single json document that is distributed over network and may be persisted as file.
|
Configuration is implemented using a single json document that is distributed over network and may be persisted as file.
|
||||||
|
@@ -1,5 +1,5 @@
|
|||||||
# LIKWID version
|
# LIKWID version
|
||||||
LIKWID_VERSION = 5.2.2
|
LIKWID_VERSION = 5.2.1
|
||||||
LIKWID_INSTALLED_FOLDER=$(shell dirname $(shell which likwid-topology 2>/dev/null) 2>/dev/null)
|
LIKWID_INSTALLED_FOLDER=$(shell dirname $(shell which likwid-topology 2>/dev/null) 2>/dev/null)
|
||||||
|
|
||||||
LIKWID_FOLDER="$(shell pwd)/likwid"
|
LIKWID_FOLDER="$(shell pwd)/likwid"
|
||||||
|
@@ -15,7 +15,6 @@ import (
|
|||||||
"math"
|
"math"
|
||||||
"os"
|
"os"
|
||||||
"os/signal"
|
"os/signal"
|
||||||
"os/user"
|
|
||||||
"sort"
|
"sort"
|
||||||
"strconv"
|
"strconv"
|
||||||
"strings"
|
"strings"
|
||||||
@@ -30,14 +29,12 @@ import (
|
|||||||
topo "github.com/ClusterCockpit/cc-metric-collector/pkg/ccTopology"
|
topo "github.com/ClusterCockpit/cc-metric-collector/pkg/ccTopology"
|
||||||
"github.com/NVIDIA/go-nvml/pkg/dl"
|
"github.com/NVIDIA/go-nvml/pkg/dl"
|
||||||
"golang.design/x/thread"
|
"golang.design/x/thread"
|
||||||
fsnotify "gopkg.in/fsnotify.v0"
|
|
||||||
)
|
)
|
||||||
|
|
||||||
const (
|
const (
|
||||||
LIKWID_LIB_NAME = "liblikwid.so"
|
LIKWID_LIB_NAME = "liblikwid.so"
|
||||||
LIKWID_LIB_DL_FLAGS = dl.RTLD_LAZY | dl.RTLD_GLOBAL
|
LIKWID_LIB_DL_FLAGS = dl.RTLD_LAZY | dl.RTLD_GLOBAL
|
||||||
LIKWID_DEF_ACCESSMODE = "direct"
|
LIKWID_DEF_ACCESSMODE = "direct"
|
||||||
LIKWID_DEF_LOCKFILE = "/var/run/likwid.lock"
|
|
||||||
)
|
)
|
||||||
|
|
||||||
type LikwidCollectorMetricConfig struct {
|
type LikwidCollectorMetricConfig struct {
|
||||||
@@ -71,7 +68,6 @@ type LikwidCollectorConfig struct {
|
|||||||
AccessMode string `json:"access_mode,omitempty"`
|
AccessMode string `json:"access_mode,omitempty"`
|
||||||
DaemonPath string `json:"accessdaemon_path,omitempty"`
|
DaemonPath string `json:"accessdaemon_path,omitempty"`
|
||||||
LibraryPath string `json:"liblikwid_path,omitempty"`
|
LibraryPath string `json:"liblikwid_path,omitempty"`
|
||||||
LockfilePath string `json:"lockfile_path,omitempty"`
|
|
||||||
}
|
}
|
||||||
|
|
||||||
type LikwidCollector struct {
|
type LikwidCollector struct {
|
||||||
@@ -86,7 +82,7 @@ type LikwidCollector struct {
|
|||||||
basefreq float64
|
basefreq float64
|
||||||
running bool
|
running bool
|
||||||
initialized bool
|
initialized bool
|
||||||
needs_reinit bool
|
needs_reinit bool
|
||||||
likwidGroups map[C.int]LikwidEventsetConfig
|
likwidGroups map[C.int]LikwidEventsetConfig
|
||||||
lock sync.Mutex
|
lock sync.Mutex
|
||||||
measureThread thread.Thread
|
measureThread thread.Thread
|
||||||
@@ -202,7 +198,6 @@ func (m *LikwidCollector) Init(config json.RawMessage) error {
|
|||||||
m.running = false
|
m.running = false
|
||||||
m.config.AccessMode = LIKWID_DEF_ACCESSMODE
|
m.config.AccessMode = LIKWID_DEF_ACCESSMODE
|
||||||
m.config.LibraryPath = LIKWID_LIB_NAME
|
m.config.LibraryPath = LIKWID_LIB_NAME
|
||||||
m.config.LockfilePath = LIKWID_DEF_LOCKFILE
|
|
||||||
if len(config) > 0 {
|
if len(config) > 0 {
|
||||||
err := json.Unmarshal(config, &m.config)
|
err := json.Unmarshal(config, &m.config)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
@@ -260,16 +255,12 @@ func (m *LikwidCollector) Init(config json.RawMessage) error {
|
|||||||
}
|
}
|
||||||
for _, metric := range evset.Metrics {
|
for _, metric := range evset.Metrics {
|
||||||
// Try to evaluate the metric
|
// Try to evaluate the metric
|
||||||
cclog.ComponentDebug(m.name, "Checking", metric.Name)
|
if testLikwidMetricFormula(metric.Calc, params) && checkMetricType(metric.Type) {
|
||||||
if !checkMetricType(metric.Type) {
|
// Add the computable metric to the parameter list for the global metrics
|
||||||
cclog.ComponentError(m.name, "Metric", metric.Name, "uses invalid type", metric.Type)
|
|
||||||
metric.Calc = ""
|
|
||||||
} else if !testLikwidMetricFormula(metric.Calc, params) {
|
|
||||||
cclog.ComponentError(m.name, "Metric", metric.Name, "cannot be calculated with given counters")
|
|
||||||
metric.Calc = ""
|
|
||||||
} else {
|
|
||||||
globalParams = append(globalParams, metric.Name)
|
globalParams = append(globalParams, metric.Name)
|
||||||
totalMetrics++
|
totalMetrics++
|
||||||
|
} else {
|
||||||
|
metric.Calc = ""
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
@@ -279,11 +270,8 @@ func (m *LikwidCollector) Init(config json.RawMessage) error {
|
|||||||
}
|
}
|
||||||
for _, metric := range m.config.Metrics {
|
for _, metric := range m.config.Metrics {
|
||||||
// Try to evaluate the global metric
|
// Try to evaluate the global metric
|
||||||
if !checkMetricType(metric.Type) {
|
if !testLikwidMetricFormula(metric.Calc, globalParams) {
|
||||||
cclog.ComponentError(m.name, "Metric", metric.Name, "uses invalid type", metric.Type)
|
cclog.ComponentError(m.name, "Calculation for metric", metric.Name, "failed")
|
||||||
metric.Calc = ""
|
|
||||||
} else if !testLikwidMetricFormula(metric.Calc, globalParams) {
|
|
||||||
cclog.ComponentError(m.name, "Metric", metric.Name, "cannot be calculated with given counters")
|
|
||||||
metric.Calc = ""
|
metric.Calc = ""
|
||||||
} else if !checkMetricType(metric.Type) {
|
} else if !checkMetricType(metric.Type) {
|
||||||
cclog.ComponentError(m.name, "Metric", metric.Name, "has invalid type")
|
cclog.ComponentError(m.name, "Metric", metric.Name, "has invalid type")
|
||||||
@@ -299,194 +287,77 @@ func (m *LikwidCollector) Init(config json.RawMessage) error {
|
|||||||
cclog.ComponentError(m.name, err.Error())
|
cclog.ComponentError(m.name, err.Error())
|
||||||
return err
|
return err
|
||||||
}
|
}
|
||||||
|
|
||||||
ret := C.topology_init()
|
|
||||||
if ret != 0 {
|
|
||||||
err := errors.New("failed to initialize topology module")
|
|
||||||
cclog.ComponentError(m.name, err.Error())
|
|
||||||
return err
|
|
||||||
}
|
|
||||||
switch m.config.AccessMode {
|
|
||||||
case "direct":
|
|
||||||
C.HPMmode(0)
|
|
||||||
case "accessdaemon":
|
|
||||||
if len(m.config.DaemonPath) > 0 {
|
|
||||||
p := os.Getenv("PATH")
|
|
||||||
os.Setenv("PATH", m.config.DaemonPath+":"+p)
|
|
||||||
}
|
|
||||||
C.HPMmode(1)
|
|
||||||
for _, c := range m.cpulist {
|
|
||||||
C.HPMaddThread(c)
|
|
||||||
}
|
|
||||||
}
|
|
||||||
m.sock2tid = make(map[int]int)
|
|
||||||
tmp := make([]C.int, 1)
|
|
||||||
for _, sid := range topo.SocketList() {
|
|
||||||
cstr := C.CString(fmt.Sprintf("S%d:0", sid))
|
|
||||||
ret = C.cpustr_to_cpulist(cstr, &tmp[0], 1)
|
|
||||||
if ret > 0 {
|
|
||||||
m.sock2tid[sid] = m.cpu2tid[int(tmp[0])]
|
|
||||||
}
|
|
||||||
C.free(unsafe.Pointer(cstr))
|
|
||||||
}
|
|
||||||
|
|
||||||
m.basefreq = getBaseFreq()
|
|
||||||
m.measureThread = thread.New()
|
m.measureThread = thread.New()
|
||||||
m.init = true
|
m.init = true
|
||||||
return nil
|
return nil
|
||||||
}
|
}
|
||||||
|
|
||||||
// take a measurement for 'interval' seconds of event set index 'group'
|
// take a measurement for 'interval' seconds of event set index 'group'
|
||||||
func (m *LikwidCollector) takeMeasurement(evidx int, evset LikwidEventsetConfig, interval time.Duration) (bool, error) {
|
func (m *LikwidCollector) takeMeasurement(evset LikwidEventsetConfig, interval time.Duration) (bool, error) {
|
||||||
var ret C.int
|
var ret C.int
|
||||||
var gid C.int = -1
|
|
||||||
sigchan := make(chan os.Signal, 1)
|
|
||||||
watcher, err := fsnotify.NewWatcher()
|
|
||||||
if err != nil {
|
|
||||||
cclog.ComponentError(m.name, err.Error())
|
|
||||||
}
|
|
||||||
defer watcher.Close()
|
|
||||||
if len(m.config.LockfilePath) > 0 {
|
|
||||||
info, err := os.Stat(m.config.LockfilePath)
|
|
||||||
if err != nil {
|
|
||||||
return true, err
|
|
||||||
}
|
|
||||||
stat := info.Sys().(*syscall.Stat_t)
|
|
||||||
if stat.Uid != uint32(os.Getuid()) {
|
|
||||||
usr, err := user.LookupId(strconv.FormatUint(uint64(stat.Uid), 10))
|
|
||||||
if err == nil {
|
|
||||||
return true, fmt.Errorf("Access to performance counters locked by %s", usr.Username)
|
|
||||||
} else {
|
|
||||||
return true, fmt.Errorf("Access to performance counters locked by %d", stat.Uid)
|
|
||||||
}
|
|
||||||
}
|
|
||||||
err = watcher.Watch(m.config.LockfilePath)
|
|
||||||
if err != nil {
|
|
||||||
cclog.ComponentError(m.name, err.Error())
|
|
||||||
}
|
|
||||||
}
|
|
||||||
m.lock.Lock()
|
m.lock.Lock()
|
||||||
defer m.lock.Unlock()
|
if m.initialized {
|
||||||
select {
|
ret = C.perfmon_setupCounters(evset.gid)
|
||||||
case e := <-watcher.Event:
|
if ret != 0 {
|
||||||
ret = -1
|
var err error = nil
|
||||||
if !e.IsAttrib() {
|
var skip bool = false
|
||||||
ret = C.perfmon_init(C.int(len(m.cpulist)), &m.cpulist[0])
|
cclog.ComponentDebug(m.name, "Setup returns", ret)
|
||||||
}
|
if ret == -37 {
|
||||||
default:
|
skip = true
|
||||||
ret = C.perfmon_init(C.int(len(m.cpulist)), &m.cpulist[0])
|
} else {
|
||||||
}
|
err = fmt.Errorf("failed to setup performance group %d", evset.gid)
|
||||||
if ret != 0 {
|
|
||||||
return true, fmt.Errorf("failed to initialize library, error %d", ret)
|
|
||||||
}
|
|
||||||
signal.Notify(sigchan, os.Interrupt)
|
|
||||||
signal.Notify(sigchan, syscall.SIGCHLD)
|
|
||||||
select {
|
|
||||||
case <-sigchan:
|
|
||||||
gid = -1
|
|
||||||
case e := <-watcher.Event:
|
|
||||||
gid = -1
|
|
||||||
if !e.IsAttrib() {
|
|
||||||
gid = C.perfmon_addEventSet(evset.estr)
|
|
||||||
}
|
|
||||||
default:
|
|
||||||
gid = C.perfmon_addEventSet(evset.estr)
|
|
||||||
}
|
|
||||||
if gid < 0 {
|
|
||||||
return true, fmt.Errorf("failed to add events %s, error %d", evset.go_estr, gid)
|
|
||||||
} else {
|
|
||||||
evset.gid = gid
|
|
||||||
//m.likwidGroups[gid] = evset
|
|
||||||
}
|
|
||||||
select {
|
|
||||||
case <-sigchan:
|
|
||||||
ret = -1
|
|
||||||
case e := <-watcher.Event:
|
|
||||||
if !e.IsAttrib() {
|
|
||||||
ret = C.perfmon_setupCounters(gid)
|
|
||||||
}
|
|
||||||
default:
|
|
||||||
ret = C.perfmon_setupCounters(gid)
|
|
||||||
}
|
|
||||||
if ret != 0 {
|
|
||||||
return true, fmt.Errorf("failed to setup events '%s', error %d", evset.go_estr, ret)
|
|
||||||
}
|
|
||||||
select {
|
|
||||||
case <-sigchan:
|
|
||||||
ret = -1
|
|
||||||
case e := <-watcher.Event:
|
|
||||||
if !e.IsAttrib() {
|
|
||||||
ret = C.perfmon_startCounters()
|
|
||||||
}
|
|
||||||
default:
|
|
||||||
ret = C.perfmon_startCounters()
|
|
||||||
}
|
|
||||||
if ret != 0 {
|
|
||||||
return true, fmt.Errorf("failed to start events '%s', error %d", evset.go_estr, ret)
|
|
||||||
}
|
|
||||||
select {
|
|
||||||
case <-sigchan:
|
|
||||||
ret = -1
|
|
||||||
case e := <-watcher.Event:
|
|
||||||
if !e.IsAttrib() {
|
|
||||||
ret = C.perfmon_readCounters()
|
|
||||||
}
|
|
||||||
default:
|
|
||||||
ret = C.perfmon_readCounters()
|
|
||||||
}
|
|
||||||
if ret != 0 {
|
|
||||||
return true, fmt.Errorf("failed to read events '%s', error %d", evset.go_estr, ret)
|
|
||||||
}
|
|
||||||
time.Sleep(interval)
|
|
||||||
select {
|
|
||||||
case <-sigchan:
|
|
||||||
ret = -1
|
|
||||||
case e := <-watcher.Event:
|
|
||||||
if !e.IsAttrib() {
|
|
||||||
ret = C.perfmon_readCounters()
|
|
||||||
}
|
|
||||||
default:
|
|
||||||
ret = C.perfmon_readCounters()
|
|
||||||
}
|
|
||||||
if ret != 0 {
|
|
||||||
return true, fmt.Errorf("failed to read events '%s', error %d", evset.go_estr, ret)
|
|
||||||
}
|
|
||||||
for eidx, counter := range evset.eorder {
|
|
||||||
gctr := C.GoString(counter)
|
|
||||||
for _, tid := range m.cpu2tid {
|
|
||||||
res := C.perfmon_getLastResult(gid, C.int(eidx), C.int(tid))
|
|
||||||
fres := float64(res)
|
|
||||||
if m.config.InvalidToZero && (math.IsNaN(fres) || math.IsInf(fres, 0)) {
|
|
||||||
fres = 0.0
|
|
||||||
}
|
}
|
||||||
evset.results[tid][gctr] = fres
|
m.lock.Unlock()
|
||||||
|
return skip, err
|
||||||
}
|
}
|
||||||
}
|
m.running = true
|
||||||
for _, tid := range m.cpu2tid {
|
ret = C.perfmon_startCounters()
|
||||||
evset.results[tid]["time"] = float64(C.perfmon_getLastTimeOfGroup(gid))
|
if ret != 0 {
|
||||||
}
|
var err error = nil
|
||||||
select {
|
var skip bool = false
|
||||||
case <-sigchan:
|
if ret == -37 {
|
||||||
ret = -1
|
skip = true
|
||||||
case e := <-watcher.Event:
|
} else {
|
||||||
if !e.IsAttrib() {
|
err = fmt.Errorf("failed to setup performance group %d", evset.gid)
|
||||||
ret = C.perfmon_stopCounters()
|
}
|
||||||
|
m.lock.Unlock()
|
||||||
|
return skip, err
|
||||||
}
|
}
|
||||||
default:
|
ret = C.perfmon_readCounters()
|
||||||
|
time.Sleep(interval)
|
||||||
|
m.running = false
|
||||||
ret = C.perfmon_stopCounters()
|
ret = C.perfmon_stopCounters()
|
||||||
}
|
if ret != 0 {
|
||||||
if ret != 0 {
|
var err error = nil
|
||||||
return true, fmt.Errorf("failed to stop events '%s', error %d", evset.go_estr, ret)
|
var skip bool = false
|
||||||
}
|
if ret == -37 {
|
||||||
signal.Stop(sigchan)
|
skip = true
|
||||||
select {
|
} else {
|
||||||
case e := <-watcher.Event:
|
err = fmt.Errorf("failed to setup performance group %d", evset.gid)
|
||||||
if !e.IsAttrib() {
|
}
|
||||||
C.perfmon_finalize()
|
m.lock.Unlock()
|
||||||
|
return skip, err
|
||||||
|
}
|
||||||
|
m.running = false
|
||||||
|
runtime := float64(C.perfmon_getLastTimeOfGroup(evset.gid))
|
||||||
|
// Go over events and get the results
|
||||||
|
for eidx, counter := range evset.eorder {
|
||||||
|
gctr := C.GoString(counter)
|
||||||
|
for _, tid := range m.cpu2tid {
|
||||||
|
res := C.perfmon_getLastResult(evset.gid, C.int(eidx), C.int(tid))
|
||||||
|
fres := float64(res)
|
||||||
|
if m.config.InvalidToZero && (math.IsNaN(fres) || math.IsInf(fres, 0)) {
|
||||||
|
cclog.ComponentDebug(m.name, "Sanitize", gctr, "to zero")
|
||||||
|
fres = 0.0
|
||||||
|
}
|
||||||
|
evset.results[tid][gctr] = fres
|
||||||
|
}
|
||||||
|
}
|
||||||
|
for _, tid := range m.cpu2tid {
|
||||||
|
evset.results[tid]["time"] = runtime
|
||||||
}
|
}
|
||||||
default:
|
|
||||||
C.perfmon_finalize()
|
|
||||||
}
|
}
|
||||||
|
m.lock.Unlock()
|
||||||
return false, nil
|
return false, nil
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -541,7 +412,7 @@ func (m *LikwidCollector) calcEventsetMetrics(evset LikwidEventsetConfig, interv
|
|||||||
}
|
}
|
||||||
|
|
||||||
// Go over the global metrics, derive the value out of the event sets' metric values and send it
|
// Go over the global metrics, derive the value out of the event sets' metric values and send it
|
||||||
func (m *LikwidCollector) calcGlobalMetrics(groups []LikwidEventsetConfig, interval time.Duration, output chan lp.CCMetric) error {
|
func (m *LikwidCollector) calcGlobalMetrics(interval time.Duration, output chan lp.CCMetric) error {
|
||||||
for _, metric := range m.config.Metrics {
|
for _, metric := range m.config.Metrics {
|
||||||
scopemap := m.cpu2tid
|
scopemap := m.cpu2tid
|
||||||
if metric.Type == "socket" {
|
if metric.Type == "socket" {
|
||||||
@@ -551,7 +422,7 @@ func (m *LikwidCollector) calcGlobalMetrics(groups []LikwidEventsetConfig, inter
|
|||||||
if tid >= 0 {
|
if tid >= 0 {
|
||||||
// Here we generate parameter list
|
// Here we generate parameter list
|
||||||
params := make(map[string]interface{})
|
params := make(map[string]interface{})
|
||||||
for _, evset := range groups {
|
for _, evset := range m.likwidGroups {
|
||||||
for mname, mres := range evset.metrics[tid] {
|
for mname, mres := range evset.metrics[tid] {
|
||||||
params[mname] = mres
|
params[mname] = mres
|
||||||
}
|
}
|
||||||
@@ -565,7 +436,7 @@ func (m *LikwidCollector) calcGlobalMetrics(groups []LikwidEventsetConfig, inter
|
|||||||
if m.config.InvalidToZero && (math.IsNaN(value) || math.IsInf(value, 0)) {
|
if m.config.InvalidToZero && (math.IsNaN(value) || math.IsInf(value, 0)) {
|
||||||
value = 0.0
|
value = 0.0
|
||||||
}
|
}
|
||||||
//m.gmresults[tid][metric.Name] = value
|
m.gmresults[tid][metric.Name] = value
|
||||||
// Now we have the result, send it with the proper tags
|
// Now we have the result, send it with the proper tags
|
||||||
if !math.IsNaN(value) {
|
if !math.IsNaN(value) {
|
||||||
if metric.Publish {
|
if metric.Publish {
|
||||||
@@ -589,53 +460,203 @@ func (m *LikwidCollector) calcGlobalMetrics(groups []LikwidEventsetConfig, inter
|
|||||||
return nil
|
return nil
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func (m *LikwidCollector) ReInit() error {
|
||||||
func (m *LikwidCollector) ReadThread(interval time.Duration, output chan lp.CCMetric) {
|
C.perfmon_finalize()
|
||||||
var err error = nil
|
ret := C.perfmon_init(C.int(len(m.cpulist)), &m.cpulist[0])
|
||||||
groups := make([]LikwidEventsetConfig, 0)
|
if ret != 0 {
|
||||||
|
return nil
|
||||||
for evidx, evset := range m.config.Eventsets {
|
}
|
||||||
e := genLikwidEventSet(evset)
|
for i, evset := range m.config.Eventsets {
|
||||||
e.internal = evidx
|
var gid C.int
|
||||||
skip := false
|
if len(evset.Events) > 0 {
|
||||||
if !skip {
|
//skip := false
|
||||||
// measure event set 'i' for 'interval' seconds
|
likwidGroup := genLikwidEventSet(evset)
|
||||||
skip, err = m.takeMeasurement(evidx, e, interval)
|
gid = C.perfmon_addEventSet(likwidGroup.estr)
|
||||||
if err != nil {
|
if gid >= 0 {
|
||||||
cclog.ComponentError(m.name, err.Error())
|
likwidGroup.gid = gid
|
||||||
return
|
likwidGroup.internal = i
|
||||||
|
m.likwidGroups[gid] = likwidGroup
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
if !skip {
|
|
||||||
// read measurements and derive event set metrics
|
|
||||||
m.calcEventsetMetrics(e, interval, output)
|
|
||||||
}
|
|
||||||
groups = append(groups, e)
|
|
||||||
}
|
}
|
||||||
// calculate global metrics
|
return nil
|
||||||
m.calcGlobalMetrics(groups, interval, output)
|
}
|
||||||
|
|
||||||
|
func (m *LikwidCollector) LateInit() error {
|
||||||
|
var ret C.int
|
||||||
|
if m.initialized {
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
switch m.config.AccessMode {
|
||||||
|
case "direct":
|
||||||
|
C.HPMmode(0)
|
||||||
|
case "accessdaemon":
|
||||||
|
if len(m.config.DaemonPath) > 0 {
|
||||||
|
p := os.Getenv("PATH")
|
||||||
|
os.Setenv("PATH", m.config.DaemonPath+":"+p)
|
||||||
|
}
|
||||||
|
C.HPMmode(1)
|
||||||
|
for _, c := range m.cpulist {
|
||||||
|
C.HPMaddThread(c)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
cclog.ComponentDebug(m.name, "initialize LIKWID topology")
|
||||||
|
ret = C.topology_init()
|
||||||
|
if ret != 0 {
|
||||||
|
err := errors.New("failed to initialize LIKWID topology")
|
||||||
|
cclog.ComponentError(m.name, err.Error())
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
|
||||||
|
m.sock2tid = make(map[int]int)
|
||||||
|
tmp := make([]C.int, 1)
|
||||||
|
for _, sid := range topo.SocketList() {
|
||||||
|
cstr := C.CString(fmt.Sprintf("S%d:0", sid))
|
||||||
|
ret = C.cpustr_to_cpulist(cstr, &tmp[0], 1)
|
||||||
|
if ret > 0 {
|
||||||
|
m.sock2tid[sid] = m.cpu2tid[int(tmp[0])]
|
||||||
|
}
|
||||||
|
C.free(unsafe.Pointer(cstr))
|
||||||
|
}
|
||||||
|
|
||||||
|
m.basefreq = getBaseFreq()
|
||||||
|
cclog.ComponentDebug(m.name, "BaseFreq", m.basefreq)
|
||||||
|
|
||||||
|
if m.needs_reinit {
|
||||||
|
m.ReInit()
|
||||||
|
m.needs_reinit = false
|
||||||
|
}
|
||||||
|
|
||||||
|
// cclog.ComponentDebug(m.name, "initialize LIKWID perfmon module")
|
||||||
|
// ret = C.perfmon_init(C.int(len(m.cpulist)), &m.cpulist[0])
|
||||||
|
// if ret != 0 {
|
||||||
|
// var err error = nil
|
||||||
|
// C.topology_finalize()
|
||||||
|
// if ret != -22 {
|
||||||
|
// err = errors.New("failed to initialize LIKWID perfmon")
|
||||||
|
// cclog.ComponentError(m.name, err.Error())
|
||||||
|
// } else {
|
||||||
|
// err = errors.New("access to LIKWID perfmon locked")
|
||||||
|
// }
|
||||||
|
// return err
|
||||||
|
// }
|
||||||
|
|
||||||
|
// // While adding the events, we test the metrics whether they can be computed at all
|
||||||
|
// for i, evset := range m.config.Eventsets {
|
||||||
|
// var gid C.int
|
||||||
|
// if len(evset.Events) > 0 {
|
||||||
|
// //skip := false
|
||||||
|
// likwidGroup := genLikwidEventSet(evset)
|
||||||
|
// // for _, g := range m.likwidGroups {
|
||||||
|
// // if likwidGroup.go_estr == g.go_estr {
|
||||||
|
// // skip = true
|
||||||
|
// // break
|
||||||
|
// // }
|
||||||
|
// // }
|
||||||
|
// // if skip {
|
||||||
|
// // continue
|
||||||
|
// // }
|
||||||
|
// // Now we add the list of events to likwid
|
||||||
|
// gid = C.perfmon_addEventSet(likwidGroup.estr)
|
||||||
|
// if gid >= 0 {
|
||||||
|
// likwidGroup.gid = gid
|
||||||
|
// likwidGroup.internal = i
|
||||||
|
// m.likwidGroups[gid] = likwidGroup
|
||||||
|
// }
|
||||||
|
// } else {
|
||||||
|
// cclog.ComponentError(m.name, "Invalid Likwid eventset config, no events given")
|
||||||
|
// continue
|
||||||
|
// }
|
||||||
|
|
||||||
|
// }
|
||||||
|
|
||||||
|
// If no event set could be added, shut down LikwidCollector
|
||||||
|
if len(m.likwidGroups) == 0 {
|
||||||
|
C.perfmon_finalize()
|
||||||
|
C.topology_finalize()
|
||||||
|
err := errors.New("no LIKWID performance group initialized")
|
||||||
|
cclog.ComponentError(m.name, err.Error())
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
sigchan := make(chan os.Signal, 1)
|
||||||
|
signal.Notify(sigchan, syscall.SIGCHLD)
|
||||||
|
signal.Notify(sigchan, os.Interrupt)
|
||||||
|
go func() {
|
||||||
|
<-sigchan
|
||||||
|
|
||||||
|
signal.Stop(sigchan)
|
||||||
|
m.initialized = false
|
||||||
|
}()
|
||||||
|
m.initialized = true
|
||||||
|
return nil
|
||||||
}
|
}
|
||||||
|
|
||||||
// main read function taking multiple measurement rounds, each 'interval' seconds long
|
// main read function taking multiple measurement rounds, each 'interval' seconds long
|
||||||
func (m *LikwidCollector) Read(interval time.Duration, output chan lp.CCMetric) {
|
func (m *LikwidCollector) Read(interval time.Duration, output chan lp.CCMetric) {
|
||||||
//var skip bool = false
|
var skip bool = false
|
||||||
//var err error
|
var err error
|
||||||
if !m.init {
|
if !m.init {
|
||||||
return
|
return
|
||||||
}
|
}
|
||||||
|
|
||||||
m.measureThread.Call(func() {
|
m.measureThread.Call(func() {
|
||||||
m.ReadThread(interval, output)
|
if !m.initialized {
|
||||||
|
m.lock.Lock()
|
||||||
|
err = m.LateInit()
|
||||||
|
if err != nil {
|
||||||
|
m.lock.Unlock()
|
||||||
|
cclog.ComponentError(m.name, "lateinit failed")
|
||||||
|
return
|
||||||
|
}
|
||||||
|
m.initialized = true
|
||||||
|
m.lock.Unlock()
|
||||||
|
skip = true
|
||||||
|
}
|
||||||
|
|
||||||
|
if m.initialized && !skip {
|
||||||
|
time := interval
|
||||||
|
for _, evset := range m.likwidGroups {
|
||||||
|
if !skip {
|
||||||
|
// measure event set 'i' for 'interval' seconds
|
||||||
|
skip, err = m.takeMeasurement(evset, interval)
|
||||||
|
if err != nil {
|
||||||
|
cclog.ComponentError(m.name, err.Error())
|
||||||
|
return
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if !skip {
|
||||||
|
// read measurements and derive event set metrics
|
||||||
|
m.calcEventsetMetrics(evset, time, output)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if !skip {
|
||||||
|
// use the event set metrics to derive the global metrics
|
||||||
|
m.calcGlobalMetrics(time, output)
|
||||||
|
}
|
||||||
|
if skip {
|
||||||
|
m.needs_reinit = true
|
||||||
|
m.initialized = false
|
||||||
|
}
|
||||||
|
}
|
||||||
})
|
})
|
||||||
}
|
}
|
||||||
|
|
||||||
func (m *LikwidCollector) Close() {
|
func (m *LikwidCollector) Close() {
|
||||||
if m.init {
|
if m.init {
|
||||||
m.init = false
|
m.init = false
|
||||||
|
cclog.ComponentDebug(m.name, "Closing ...")
|
||||||
m.lock.Lock()
|
m.lock.Lock()
|
||||||
m.measureThread.Terminate()
|
if m.initialized {
|
||||||
m.initialized = false
|
cclog.ComponentDebug(m.name, "Finalize LIKWID perfmon module")
|
||||||
|
C.perfmon_finalize()
|
||||||
|
m.initialized = false
|
||||||
|
}
|
||||||
m.lock.Unlock()
|
m.lock.Unlock()
|
||||||
|
cclog.ComponentDebug(m.name, "Finalize LIKWID topology module")
|
||||||
C.topology_finalize()
|
C.topology_finalize()
|
||||||
|
|
||||||
|
cclog.ComponentDebug(m.name, "Closing done")
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@@ -10,7 +10,6 @@ The `likwid` collector is probably the most complicated collector. The LIKWID li
|
|||||||
"liblikwid_path" : "/path/to/liblikwid.so",
|
"liblikwid_path" : "/path/to/liblikwid.so",
|
||||||
"accessdaemon_path" : "/folder/that/contains/likwid-accessD",
|
"accessdaemon_path" : "/folder/that/contains/likwid-accessD",
|
||||||
"access_mode" : "direct or accessdaemon or perf_event",
|
"access_mode" : "direct or accessdaemon or perf_event",
|
||||||
"lockfile_path" : "/var/run/likwid.lock",
|
|
||||||
"eventsets": [
|
"eventsets": [
|
||||||
{
|
{
|
||||||
"events" : {
|
"events" : {
|
||||||
@@ -50,7 +49,6 @@ Additional options:
|
|||||||
- `access_mode`: Specify LIKWID access mode: `direct` for direct register access as root user or `accessdaemon`. The access mode `perf_event` is current untested.
|
- `access_mode`: Specify LIKWID access mode: `direct` for direct register access as root user or `accessdaemon`. The access mode `perf_event` is current untested.
|
||||||
- `accessdaemon_path`: Folder of the accessDaemon `likwid-accessD` (like `/usr/local/sbin`)
|
- `accessdaemon_path`: Folder of the accessDaemon `likwid-accessD` (like `/usr/local/sbin`)
|
||||||
- `liblikwid_path`: Location of `liblikwid.so` including file name like `/usr/local/lib/liblikwid.so`
|
- `liblikwid_path`: Location of `liblikwid.so` including file name like `/usr/local/lib/liblikwid.so`
|
||||||
- `lockfile_path`: Location of LIKWID's lock file if multiple tools should access the hardware counters. Default `/var/run/likwid.lock`
|
|
||||||
|
|
||||||
### Available metric types
|
### Available metric types
|
||||||
|
|
||||||
|
@@ -10,8 +10,6 @@ BuildRequires: go-toolset
|
|||||||
BuildRequires: systemd-rpm-macros
|
BuildRequires: systemd-rpm-macros
|
||||||
# for header downloads
|
# for header downloads
|
||||||
BuildRequires: wget
|
BuildRequires: wget
|
||||||
# Recommended when using the sysusers_create_package macro
|
|
||||||
Requires(pre): /usr/bin/systemd-sysusers
|
|
||||||
|
|
||||||
Provides: %{name} = %{version}
|
Provides: %{name} = %{version}
|
||||||
|
|
||||||
|
Reference in New Issue
Block a user