From ff0833c413689a7ec22e8ce102a541517c9a82fc Mon Sep 17 00:00:00 2001 From: Thomas Gruber Date: Tue, 20 Dec 2022 13:04:24 +0100 Subject: [PATCH] Push LIKWID collector fix into main (#98) * InfiniBandCollector: Scale raw readings from octets to bytes * Fix clock frequency coming from LikwidCollector and update docs * Build DEB package for Ubuntu 20.04 for releases * Fix memstat collector with numa_stats option * Remove useless prints from MemstatCollector * Replace ioutils with os and io (#87) * Use lower case for error strings in RocmSmiCollector * move maybe-usable-by-other-cc-components to pkg. Fix all files to use the new paths (#88) * Add collector for monitoring the execution of cc-metric-collector itself (#81) * Add collector to monitor execution of cc-metric-collector itself * Register SelfCollector * Fix import paths for moved packages * Check if at least one CPU with frequency information was detected * Correct type: /proc/stats -> /proc/stat * Update README.md * Run ipmitool asynchron. Improved error handling. * Corrected some typos * Add running average power limit (RAPL) metric collector * Add running average power limit (RAPL) metric collector * Do not mess up with the orignal configuration * * Corrected json config in numastatsMetric.md * Added some debug output to numastatsMetric.go * Fixed computing number of physical packages for non continous physical package IDs (e.g. on Ampere Altra Q80-30) * Fix kernel panic for receiver config with missing receiver type * Add receiver to gather remote IPMI sensor metrics * Added config option to add ipmi-sensors command line options * Add documentaion for IPMI receiver * Update to latest version of included go modules * Add go.mod to App dependency * Try to use common metric tags across hardware vendors * Add IPMI metric: current * remove prefix enumeration like 01-... * Add IPMI receiver example configuration to receivers.json * Minimal formating changes * Add hostlist package * Added tests for hostlist Expand() * Use package hostlist to expand a host list * Use package hostlist to expand a host list * Some servers return "ConsumedPowerWatt":65535 instead of "ConsumedPowerWatt":null * Updated to latest package versions * Do not allow unknown fields in JSON configuration file * Add workflow to customize packages to docs * NFS I/O Stats Collector (#91) * Initial version * Delete values for vanished mount points and comments * Fix for Likwid collector (#95) * Run LIKWID in separate thread and check metric type * Change LIKWID collector documentation to use 'type' instead of 'scope' * Re-initialize LIKWID after one read is missing due to lock toggle * Register cc-metric-collector at Zenodo (#93) * Add initial version of Zenodo project file * Orcid ID added * Update .zenodo.json Co-authored-by: Holger Obermaier * Update ipmiMetric.go * Use latest LIKWID version for builds * Update README.md * Remove development stuff from Makefile * Add Requires(pre) to RPM SPEC file * Use curly brackets in packaging make targets * Fix for LIKWID collector with separate measurement thread and inotify watcher on the LIKWID lock (#97) Co-authored-by: Holger Obermaier <40787752+ho-ob@users.noreply.github.com> Co-authored-by: Holger Obermaier --- Makefile | 32 +-- README.md | 4 + collectors/Makefile | 2 +- collectors/likwidMetric.go | 469 +++++++++++++++---------------- collectors/likwidMetric.md | 2 + scripts/cc-metric-collector.spec | 2 + 6 files changed, 246 insertions(+), 265 deletions(-) diff --git a/Makefile b/Makefile index 721e20f..b010cd8 100644 --- a/Makefile +++ b/Makefile @@ -84,7 +84,7 @@ RPM: scripts/cc-metric-collector.spec @COMMITISH="HEAD" @VERS=$$(git describe --tags $${COMMITISH}) @VERS=$${VERS#v} - @VERS=$$(echo $$VERS | sed -e s+'-'+'_'+g) + @VERS=$$(echo $${VERS} | sed -e s+'-'+'_'+g) @eval $$(rpmspec --query --queryformat "NAME='%{name}' VERSION='%{version}' RELEASE='%{release}' NVR='%{NVR}' NVRA='%{NVRA}'" --define="VERS $${VERS}" "$${SPECFILE}") @PREFIX="$${NAME}-$${VERSION}" @FORMAT="tar.gz" @@ -96,10 +96,8 @@ RPM: scripts/cc-metric-collector.spec @if [[ "$${GITHUB_ACTIONS}" == true ]]; then @ RPMFILE="$${RPMDIR}/$${ARCH}/$${NVRA}.rpm" @ SRPMFILE="$${SRPMDIR}/$${NVR}.src.rpm" - @ echo "RPM: $${RPMFILE}" - @ echo "SRPM: $${SRPMFILE}" - @ echo "::set-output name=SRPM::$${SRPMFILE}" - @ echo "::set-output name=RPM::$${RPMFILE}" + @ echo "SRPM=$${SRPMFILE}" >> $${GITHUB_OUTPUT} + @ echo "RPM=$${RPMFILE}" >> $${GITHUB_OUTPUT} @fi .PHONY: DEB @@ -108,29 +106,25 @@ DEB: scripts/cc-metric-collector.deb.control $(APP) @WORKSPACE=$${PWD}/.dpkgbuild @DEBIANDIR=$${WORKSPACE}/debian @DEBIANBINDIR=$${WORKSPACE}/DEBIAN - @mkdir --parents --verbose $$WORKSPACE $$DEBIANBINDIR + @mkdir --parents --verbose $${WORKSPACE} $${DEBIANBINDIR} #@mkdir --parents --verbose $$DEBIANDIR @CONTROLFILE="$${BASEDIR}/scripts/cc-metric-collector.deb.control" @COMMITISH="HEAD" - @git describe --tags --abbrev=0 $${COMMITISH} @VERS=$$(git describe --tags --abbrev=0 $${COMMITISH}) - @if [ -z "$$VERS" ]; then VERS=${GITHUB_REF_NAME}; fi + @if [ -z "$${VERS}" ]; then VERS=${GITHUB_REF_NAME}; fi @VERS=$${VERS#v} - @VERS=$$(echo $$VERS | sed -e s+'-'+'_'+g) + @VERS=$$(echo $${VERS} | sed -e s+'-'+'_'+g) @ARCH=$$(uname -m) - @ARCH=$$(echo $$ARCH | sed -e s+'_'+'-'+g) + @ARCH=$$(echo $${ARCH} | sed -e s+'_'+'-'+g) + @if [ "$${ARCH}" = "x86-64" ]; then ARCH=amd64; fi @PREFIX="$${NAME}-$${VERSION}_$${ARCH}" - @SIZE_BYTES=$$(du -bcs --exclude=.dpkgbuild "$$WORKSPACE"/ | awk '{print $$1}' | head -1 | sed -e 's/^0\+//') - @SIZE="$$(awk -v size="$$SIZE_BYTES" 'BEGIN {print (size/1024)+1}' | awk '{print int($$0)}')" - #@sed -e s+"{VERSION}"+"$$VERS"+g -e s+"{INSTALLED_SIZE}"+"$$SIZE"+g -e s+"{ARCH}"+"$$ARCH"+g $$CONTROLFILE > $${DEBIANDIR}/control - @echo "Version: $$VERS" - @echo "Size: $$SIZE" - @echo "Arch: $$ARCH" - @sed -e s+"{VERSION}"+"$$VERS"+g -e s+"{INSTALLED_SIZE}"+"$$SIZE"+g -e s+"{ARCH}"+"$$ARCH"+g $$CONTROLFILE > $${DEBIANBINDIR}/control + @SIZE_BYTES=$$(du -bcs --exclude=.dpkgbuild "$${WORKSPACE}"/ | awk '{print $$1}' | head -1 | sed -e 's/^0\+//') + @SIZE="$$(awk -v size="$${SIZE_BYTES}" 'BEGIN {print (size/1024)+1}' | awk '{print int($$0)}')" + @sed -e s+"{VERSION}"+"$${VERS}"+g -e s+"{INSTALLED_SIZE}"+"$${SIZE}"+g -e s+"{ARCH}"+"$${ARCH}"+g $${CONTROLFILE} > $${DEBIANBINDIR}/control @make PREFIX=$${WORKSPACE} install @DEB_FILE="cc-metric-collector_$${VERS}_$${ARCH}.deb" - @dpkg-deb -b $${WORKSPACE} "$$DEB_FILE" + @dpkg-deb -b $${WORKSPACE} "$${DEB_FILE}" @if [ "$${GITHUB_ACTIONS}" = "true" ]; then - @ echo "::set-output name=DEB::$${DEB_FILE}" + @ echo "DEB=$${DEB_FILE}" >> $${GITHUB_OUTPUT} @fi @rm -r "$${WORKSPACE}" diff --git a/README.md b/README.md index 65bde1d..07f5fd4 100644 --- a/README.md +++ b/README.md @@ -8,6 +8,10 @@ There is a single timer loop that triggers all collectors serially, collects the The receiver runs as a go routine side-by-side with the timer loop and asynchronously forwards received metrics to the sink. + +[![DOI](https://zenodo.org/badge/DOI/10.5281/zenodo.7438287.svg)](https://doi.org/10.5281/zenodo.7438287) + + # Configuration Configuration is implemented using a single json document that is distributed over network and may be persisted as file. diff --git a/collectors/Makefile b/collectors/Makefile index 37e8e67..de98eb6 100644 --- a/collectors/Makefile +++ b/collectors/Makefile @@ -1,5 +1,5 @@ # LIKWID version -LIKWID_VERSION = 5.2.1 +LIKWID_VERSION = 5.2.2 LIKWID_INSTALLED_FOLDER=$(shell dirname $(shell which likwid-topology 2>/dev/null) 2>/dev/null) LIKWID_FOLDER="$(shell pwd)/likwid" diff --git a/collectors/likwidMetric.go b/collectors/likwidMetric.go index 68749d2..339c5ba 100644 --- a/collectors/likwidMetric.go +++ b/collectors/likwidMetric.go @@ -15,6 +15,7 @@ import ( "math" "os" "os/signal" + "os/user" "sort" "strconv" "strings" @@ -29,12 +30,14 @@ import ( topo "github.com/ClusterCockpit/cc-metric-collector/pkg/ccTopology" "github.com/NVIDIA/go-nvml/pkg/dl" "golang.design/x/thread" + fsnotify "gopkg.in/fsnotify.v0" ) const ( LIKWID_LIB_NAME = "liblikwid.so" LIKWID_LIB_DL_FLAGS = dl.RTLD_LAZY | dl.RTLD_GLOBAL LIKWID_DEF_ACCESSMODE = "direct" + LIKWID_DEF_LOCKFILE = "/var/run/likwid.lock" ) type LikwidCollectorMetricConfig struct { @@ -68,6 +71,7 @@ type LikwidCollectorConfig struct { AccessMode string `json:"access_mode,omitempty"` DaemonPath string `json:"accessdaemon_path,omitempty"` LibraryPath string `json:"liblikwid_path,omitempty"` + LockfilePath string `json:"lockfile_path,omitempty"` } type LikwidCollector struct { @@ -82,7 +86,7 @@ type LikwidCollector struct { basefreq float64 running bool initialized bool - needs_reinit bool + needs_reinit bool likwidGroups map[C.int]LikwidEventsetConfig lock sync.Mutex measureThread thread.Thread @@ -198,6 +202,7 @@ func (m *LikwidCollector) Init(config json.RawMessage) error { m.running = false m.config.AccessMode = LIKWID_DEF_ACCESSMODE m.config.LibraryPath = LIKWID_LIB_NAME + m.config.LockfilePath = LIKWID_DEF_LOCKFILE if len(config) > 0 { err := json.Unmarshal(config, &m.config) if err != nil { @@ -255,12 +260,16 @@ func (m *LikwidCollector) Init(config json.RawMessage) error { } for _, metric := range evset.Metrics { // Try to evaluate the metric - if testLikwidMetricFormula(metric.Calc, params) && checkMetricType(metric.Type) { - // Add the computable metric to the parameter list for the global metrics + cclog.ComponentDebug(m.name, "Checking", metric.Name) + if !checkMetricType(metric.Type) { + cclog.ComponentError(m.name, "Metric", metric.Name, "uses invalid type", metric.Type) + metric.Calc = "" + } else if !testLikwidMetricFormula(metric.Calc, params) { + cclog.ComponentError(m.name, "Metric", metric.Name, "cannot be calculated with given counters") + metric.Calc = "" + } else { globalParams = append(globalParams, metric.Name) totalMetrics++ - } else { - metric.Calc = "" } } } else { @@ -270,8 +279,11 @@ func (m *LikwidCollector) Init(config json.RawMessage) error { } for _, metric := range m.config.Metrics { // Try to evaluate the global metric - if !testLikwidMetricFormula(metric.Calc, globalParams) { - cclog.ComponentError(m.name, "Calculation for metric", metric.Name, "failed") + if !checkMetricType(metric.Type) { + cclog.ComponentError(m.name, "Metric", metric.Name, "uses invalid type", metric.Type) + metric.Calc = "" + } else if !testLikwidMetricFormula(metric.Calc, globalParams) { + cclog.ComponentError(m.name, "Metric", metric.Name, "cannot be calculated with given counters") metric.Calc = "" } else if !checkMetricType(metric.Type) { cclog.ComponentError(m.name, "Metric", metric.Name, "has invalid type") @@ -287,77 +299,194 @@ func (m *LikwidCollector) Init(config json.RawMessage) error { cclog.ComponentError(m.name, err.Error()) return err } + + ret := C.topology_init() + if ret != 0 { + err := errors.New("failed to initialize topology module") + cclog.ComponentError(m.name, err.Error()) + return err + } + switch m.config.AccessMode { + case "direct": + C.HPMmode(0) + case "accessdaemon": + if len(m.config.DaemonPath) > 0 { + p := os.Getenv("PATH") + os.Setenv("PATH", m.config.DaemonPath+":"+p) + } + C.HPMmode(1) + for _, c := range m.cpulist { + C.HPMaddThread(c) + } + } + m.sock2tid = make(map[int]int) + tmp := make([]C.int, 1) + for _, sid := range topo.SocketList() { + cstr := C.CString(fmt.Sprintf("S%d:0", sid)) + ret = C.cpustr_to_cpulist(cstr, &tmp[0], 1) + if ret > 0 { + m.sock2tid[sid] = m.cpu2tid[int(tmp[0])] + } + C.free(unsafe.Pointer(cstr)) + } + + m.basefreq = getBaseFreq() m.measureThread = thread.New() m.init = true return nil } // take a measurement for 'interval' seconds of event set index 'group' -func (m *LikwidCollector) takeMeasurement(evset LikwidEventsetConfig, interval time.Duration) (bool, error) { +func (m *LikwidCollector) takeMeasurement(evidx int, evset LikwidEventsetConfig, interval time.Duration) (bool, error) { var ret C.int - m.lock.Lock() - if m.initialized { - ret = C.perfmon_setupCounters(evset.gid) - if ret != 0 { - var err error = nil - var skip bool = false - cclog.ComponentDebug(m.name, "Setup returns", ret) - if ret == -37 { - skip = true + var gid C.int = -1 + sigchan := make(chan os.Signal, 1) + watcher, err := fsnotify.NewWatcher() + if err != nil { + cclog.ComponentError(m.name, err.Error()) + } + defer watcher.Close() + if len(m.config.LockfilePath) > 0 { + info, err := os.Stat(m.config.LockfilePath) + if err != nil { + return true, err + } + stat := info.Sys().(*syscall.Stat_t) + if stat.Uid != uint32(os.Getuid()) { + usr, err := user.LookupId(strconv.FormatUint(uint64(stat.Uid), 10)) + if err == nil { + return true, fmt.Errorf("Access to performance counters locked by %s", usr.Username) } else { - err = fmt.Errorf("failed to setup performance group %d", evset.gid) - } - m.lock.Unlock() - return skip, err - } - m.running = true - ret = C.perfmon_startCounters() - if ret != 0 { - var err error = nil - var skip bool = false - if ret == -37 { - skip = true - } else { - err = fmt.Errorf("failed to setup performance group %d", evset.gid) - } - m.lock.Unlock() - return skip, err - } - ret = C.perfmon_readCounters() - time.Sleep(interval) - m.running = false - ret = C.perfmon_stopCounters() - if ret != 0 { - var err error = nil - var skip bool = false - if ret == -37 { - skip = true - } else { - err = fmt.Errorf("failed to setup performance group %d", evset.gid) - } - m.lock.Unlock() - return skip, err - } - m.running = false - runtime := float64(C.perfmon_getLastTimeOfGroup(evset.gid)) - // Go over events and get the results - for eidx, counter := range evset.eorder { - gctr := C.GoString(counter) - for _, tid := range m.cpu2tid { - res := C.perfmon_getLastResult(evset.gid, C.int(eidx), C.int(tid)) - fres := float64(res) - if m.config.InvalidToZero && (math.IsNaN(fres) || math.IsInf(fres, 0)) { - cclog.ComponentDebug(m.name, "Sanitize", gctr, "to zero") - fres = 0.0 - } - evset.results[tid][gctr] = fres + return true, fmt.Errorf("Access to performance counters locked by %d", stat.Uid) } } - for _, tid := range m.cpu2tid { - evset.results[tid]["time"] = runtime + err = watcher.Watch(m.config.LockfilePath) + if err != nil { + cclog.ComponentError(m.name, err.Error()) } } - m.lock.Unlock() + m.lock.Lock() + defer m.lock.Unlock() + select { + case e := <-watcher.Event: + ret = -1 + if !e.IsAttrib() { + ret = C.perfmon_init(C.int(len(m.cpulist)), &m.cpulist[0]) + } + default: + ret = C.perfmon_init(C.int(len(m.cpulist)), &m.cpulist[0]) + } + if ret != 0 { + return true, fmt.Errorf("failed to initialize library, error %d", ret) + } + signal.Notify(sigchan, os.Interrupt) + signal.Notify(sigchan, syscall.SIGCHLD) + select { + case <-sigchan: + gid = -1 + case e := <-watcher.Event: + gid = -1 + if !e.IsAttrib() { + gid = C.perfmon_addEventSet(evset.estr) + } + default: + gid = C.perfmon_addEventSet(evset.estr) + } + if gid < 0 { + return true, fmt.Errorf("failed to add events %s, error %d", evset.go_estr, gid) + } else { + evset.gid = gid + //m.likwidGroups[gid] = evset + } + select { + case <-sigchan: + ret = -1 + case e := <-watcher.Event: + if !e.IsAttrib() { + ret = C.perfmon_setupCounters(gid) + } + default: + ret = C.perfmon_setupCounters(gid) + } + if ret != 0 { + return true, fmt.Errorf("failed to setup events '%s', error %d", evset.go_estr, ret) + } + select { + case <-sigchan: + ret = -1 + case e := <-watcher.Event: + if !e.IsAttrib() { + ret = C.perfmon_startCounters() + } + default: + ret = C.perfmon_startCounters() + } + if ret != 0 { + return true, fmt.Errorf("failed to start events '%s', error %d", evset.go_estr, ret) + } + select { + case <-sigchan: + ret = -1 + case e := <-watcher.Event: + if !e.IsAttrib() { + ret = C.perfmon_readCounters() + } + default: + ret = C.perfmon_readCounters() + } + if ret != 0 { + return true, fmt.Errorf("failed to read events '%s', error %d", evset.go_estr, ret) + } + time.Sleep(interval) + select { + case <-sigchan: + ret = -1 + case e := <-watcher.Event: + if !e.IsAttrib() { + ret = C.perfmon_readCounters() + } + default: + ret = C.perfmon_readCounters() + } + if ret != 0 { + return true, fmt.Errorf("failed to read events '%s', error %d", evset.go_estr, ret) + } + for eidx, counter := range evset.eorder { + gctr := C.GoString(counter) + for _, tid := range m.cpu2tid { + res := C.perfmon_getLastResult(gid, C.int(eidx), C.int(tid)) + fres := float64(res) + if m.config.InvalidToZero && (math.IsNaN(fres) || math.IsInf(fres, 0)) { + fres = 0.0 + } + evset.results[tid][gctr] = fres + } + } + for _, tid := range m.cpu2tid { + evset.results[tid]["time"] = float64(C.perfmon_getLastTimeOfGroup(gid)) + } + select { + case <-sigchan: + ret = -1 + case e := <-watcher.Event: + if !e.IsAttrib() { + ret = C.perfmon_stopCounters() + } + default: + ret = C.perfmon_stopCounters() + } + if ret != 0 { + return true, fmt.Errorf("failed to stop events '%s', error %d", evset.go_estr, ret) + } + signal.Stop(sigchan) + select { + case e := <-watcher.Event: + if !e.IsAttrib() { + C.perfmon_finalize() + } + default: + C.perfmon_finalize() + } return false, nil } @@ -412,7 +541,7 @@ func (m *LikwidCollector) calcEventsetMetrics(evset LikwidEventsetConfig, interv } // Go over the global metrics, derive the value out of the event sets' metric values and send it -func (m *LikwidCollector) calcGlobalMetrics(interval time.Duration, output chan lp.CCMetric) error { +func (m *LikwidCollector) calcGlobalMetrics(groups []LikwidEventsetConfig, interval time.Duration, output chan lp.CCMetric) error { for _, metric := range m.config.Metrics { scopemap := m.cpu2tid if metric.Type == "socket" { @@ -422,7 +551,7 @@ func (m *LikwidCollector) calcGlobalMetrics(interval time.Duration, output chan if tid >= 0 { // Here we generate parameter list params := make(map[string]interface{}) - for _, evset := range m.likwidGroups { + for _, evset := range groups { for mname, mres := range evset.metrics[tid] { params[mname] = mres } @@ -436,7 +565,7 @@ func (m *LikwidCollector) calcGlobalMetrics(interval time.Duration, output chan if m.config.InvalidToZero && (math.IsNaN(value) || math.IsInf(value, 0)) { value = 0.0 } - m.gmresults[tid][metric.Name] = value + //m.gmresults[tid][metric.Name] = value // Now we have the result, send it with the proper tags if !math.IsNaN(value) { if metric.Publish { @@ -460,203 +589,53 @@ func (m *LikwidCollector) calcGlobalMetrics(interval time.Duration, output chan return nil } -func (m *LikwidCollector) ReInit() error { - C.perfmon_finalize() - ret := C.perfmon_init(C.int(len(m.cpulist)), &m.cpulist[0]) - if ret != 0 { - return nil - } - for i, evset := range m.config.Eventsets { - var gid C.int - if len(evset.Events) > 0 { - //skip := false - likwidGroup := genLikwidEventSet(evset) - gid = C.perfmon_addEventSet(likwidGroup.estr) - if gid >= 0 { - likwidGroup.gid = gid - likwidGroup.internal = i - m.likwidGroups[gid] = likwidGroup + +func (m *LikwidCollector) ReadThread(interval time.Duration, output chan lp.CCMetric) { + var err error = nil + groups := make([]LikwidEventsetConfig, 0) + + for evidx, evset := range m.config.Eventsets { + e := genLikwidEventSet(evset) + e.internal = evidx + skip := false + if !skip { + // measure event set 'i' for 'interval' seconds + skip, err = m.takeMeasurement(evidx, e, interval) + if err != nil { + cclog.ComponentError(m.name, err.Error()) + return } } - } - return nil -} -func (m *LikwidCollector) LateInit() error { - var ret C.int - if m.initialized { - return nil - } - switch m.config.AccessMode { - case "direct": - C.HPMmode(0) - case "accessdaemon": - if len(m.config.DaemonPath) > 0 { - p := os.Getenv("PATH") - os.Setenv("PATH", m.config.DaemonPath+":"+p) - } - C.HPMmode(1) - for _, c := range m.cpulist { - C.HPMaddThread(c) + if !skip { + // read measurements and derive event set metrics + m.calcEventsetMetrics(e, interval, output) } + groups = append(groups, e) } - cclog.ComponentDebug(m.name, "initialize LIKWID topology") - ret = C.topology_init() - if ret != 0 { - err := errors.New("failed to initialize LIKWID topology") - cclog.ComponentError(m.name, err.Error()) - return err - } - - m.sock2tid = make(map[int]int) - tmp := make([]C.int, 1) - for _, sid := range topo.SocketList() { - cstr := C.CString(fmt.Sprintf("S%d:0", sid)) - ret = C.cpustr_to_cpulist(cstr, &tmp[0], 1) - if ret > 0 { - m.sock2tid[sid] = m.cpu2tid[int(tmp[0])] - } - C.free(unsafe.Pointer(cstr)) - } - - m.basefreq = getBaseFreq() - cclog.ComponentDebug(m.name, "BaseFreq", m.basefreq) - - if m.needs_reinit { - m.ReInit() - m.needs_reinit = false - } - - // cclog.ComponentDebug(m.name, "initialize LIKWID perfmon module") - // ret = C.perfmon_init(C.int(len(m.cpulist)), &m.cpulist[0]) - // if ret != 0 { - // var err error = nil - // C.topology_finalize() - // if ret != -22 { - // err = errors.New("failed to initialize LIKWID perfmon") - // cclog.ComponentError(m.name, err.Error()) - // } else { - // err = errors.New("access to LIKWID perfmon locked") - // } - // return err - // } - - // // While adding the events, we test the metrics whether they can be computed at all - // for i, evset := range m.config.Eventsets { - // var gid C.int - // if len(evset.Events) > 0 { - // //skip := false - // likwidGroup := genLikwidEventSet(evset) - // // for _, g := range m.likwidGroups { - // // if likwidGroup.go_estr == g.go_estr { - // // skip = true - // // break - // // } - // // } - // // if skip { - // // continue - // // } - // // Now we add the list of events to likwid - // gid = C.perfmon_addEventSet(likwidGroup.estr) - // if gid >= 0 { - // likwidGroup.gid = gid - // likwidGroup.internal = i - // m.likwidGroups[gid] = likwidGroup - // } - // } else { - // cclog.ComponentError(m.name, "Invalid Likwid eventset config, no events given") - // continue - // } - - // } - - // If no event set could be added, shut down LikwidCollector - if len(m.likwidGroups) == 0 { - C.perfmon_finalize() - C.topology_finalize() - err := errors.New("no LIKWID performance group initialized") - cclog.ComponentError(m.name, err.Error()) - return err - } - sigchan := make(chan os.Signal, 1) - signal.Notify(sigchan, syscall.SIGCHLD) - signal.Notify(sigchan, os.Interrupt) - go func() { - <-sigchan - - signal.Stop(sigchan) - m.initialized = false - }() - m.initialized = true - return nil + // calculate global metrics + m.calcGlobalMetrics(groups, interval, output) } // main read function taking multiple measurement rounds, each 'interval' seconds long func (m *LikwidCollector) Read(interval time.Duration, output chan lp.CCMetric) { - var skip bool = false - var err error + //var skip bool = false + //var err error if !m.init { return } - m.measureThread.Call(func() { - if !m.initialized { - m.lock.Lock() - err = m.LateInit() - if err != nil { - m.lock.Unlock() - cclog.ComponentError(m.name, "lateinit failed") - return - } - m.initialized = true - m.lock.Unlock() - skip = true - } - - if m.initialized && !skip { - time := interval - for _, evset := range m.likwidGroups { - if !skip { - // measure event set 'i' for 'interval' seconds - skip, err = m.takeMeasurement(evset, interval) - if err != nil { - cclog.ComponentError(m.name, err.Error()) - return - } - } - - if !skip { - // read measurements and derive event set metrics - m.calcEventsetMetrics(evset, time, output) - } - } - - if !skip { - // use the event set metrics to derive the global metrics - m.calcGlobalMetrics(time, output) - } - if skip { - m.needs_reinit = true - m.initialized = false - } - } + m.ReadThread(interval, output) }) } func (m *LikwidCollector) Close() { if m.init { m.init = false - cclog.ComponentDebug(m.name, "Closing ...") m.lock.Lock() - if m.initialized { - cclog.ComponentDebug(m.name, "Finalize LIKWID perfmon module") - C.perfmon_finalize() - m.initialized = false - } + m.measureThread.Terminate() + m.initialized = false m.lock.Unlock() - cclog.ComponentDebug(m.name, "Finalize LIKWID topology module") C.topology_finalize() - - cclog.ComponentDebug(m.name, "Closing done") } } diff --git a/collectors/likwidMetric.md b/collectors/likwidMetric.md index c080027..62ef3c5 100644 --- a/collectors/likwidMetric.md +++ b/collectors/likwidMetric.md @@ -10,6 +10,7 @@ The `likwid` collector is probably the most complicated collector. The LIKWID li "liblikwid_path" : "/path/to/liblikwid.so", "accessdaemon_path" : "/folder/that/contains/likwid-accessD", "access_mode" : "direct or accessdaemon or perf_event", + "lockfile_path" : "/var/run/likwid.lock", "eventsets": [ { "events" : { @@ -49,6 +50,7 @@ Additional options: - `access_mode`: Specify LIKWID access mode: `direct` for direct register access as root user or `accessdaemon`. The access mode `perf_event` is current untested. - `accessdaemon_path`: Folder of the accessDaemon `likwid-accessD` (like `/usr/local/sbin`) - `liblikwid_path`: Location of `liblikwid.so` including file name like `/usr/local/lib/liblikwid.so` +- `lockfile_path`: Location of LIKWID's lock file if multiple tools should access the hardware counters. Default `/var/run/likwid.lock` ### Available metric types diff --git a/scripts/cc-metric-collector.spec b/scripts/cc-metric-collector.spec index 9d55b4f..b1b405e 100644 --- a/scripts/cc-metric-collector.spec +++ b/scripts/cc-metric-collector.spec @@ -10,6 +10,8 @@ BuildRequires: go-toolset BuildRequires: systemd-rpm-macros # for header downloads BuildRequires: wget +# Recommended when using the sysusers_create_package macro +Requires(pre): /usr/bin/systemd-sysusers Provides: %{name} = %{version}