Dynamically load liblikwid (#40)

* Check whether LIKWID library is present

* Generalize nan_to_zero option to invalid_to_zero including +Inf, -Inf and NaN

* Remove double error printing and return if measurements do not work
This commit is contained in:
Thomas Gruber 2022-02-21 13:29:33 +01:00 committed by GitHub
parent ea5b3bdbd6
commit f683f2e6da
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
2 changed files with 25 additions and 12 deletions

View File

@ -2,7 +2,7 @@ package collectors
/* /*
#cgo CFLAGS: -I./likwid #cgo CFLAGS: -I./likwid
#cgo LDFLAGS: -L./likwid -llikwid -llikwid-hwloc -lm #cgo LDFLAGS: -L./likwid -llikwid -llikwid-hwloc -lm -Wl,--unresolved-symbols=ignore-in-object-files
#include <stdlib.h> #include <stdlib.h>
#include <likwid.h> #include <likwid.h>
*/ */
@ -25,6 +25,7 @@ import (
lp "github.com/ClusterCockpit/cc-metric-collector/internal/ccMetric" lp "github.com/ClusterCockpit/cc-metric-collector/internal/ccMetric"
topo "github.com/ClusterCockpit/cc-metric-collector/internal/ccTopology" topo "github.com/ClusterCockpit/cc-metric-collector/internal/ccTopology"
agg "github.com/ClusterCockpit/cc-metric-collector/internal/metricAggregator" agg "github.com/ClusterCockpit/cc-metric-collector/internal/metricAggregator"
"github.com/NVIDIA/go-nvml/pkg/dl"
) )
type MetricScope string type MetricScope string
@ -69,6 +70,11 @@ func GetAllMetricScopes() []MetricScope {
return []MetricScope{"cpu" /*, "core", "llc", "numadomain", "die",*/, "socket", "node"} return []MetricScope{"cpu" /*, "core", "llc", "numadomain", "die",*/, "socket", "node"}
} }
const (
LIKWID_LIB_NAME = "liblikwid.so"
LIKWID_LIB_DL_FLAGS = dl.RTLD_LAZY | dl.RTLD_GLOBAL
)
type LikwidCollectorMetricConfig struct { type LikwidCollectorMetricConfig struct {
Name string `json:"name"` // Name of the metric Name string `json:"name"` // Name of the metric
Calc string `json:"calc"` // Calculation for the metric using Calc string `json:"calc"` // Calculation for the metric using
@ -88,7 +94,7 @@ type LikwidCollectorConfig struct {
Eventsets []LikwidCollectorEventsetConfig `json:"eventsets"` Eventsets []LikwidCollectorEventsetConfig `json:"eventsets"`
Metrics []LikwidCollectorMetricConfig `json:"globalmetrics,omitempty"` Metrics []LikwidCollectorMetricConfig `json:"globalmetrics,omitempty"`
ForceOverwrite bool `json:"force_overwrite,omitempty"` ForceOverwrite bool `json:"force_overwrite,omitempty"`
NanToZero bool `json:"nan_to_zero,omitempty"` InvalidToZero bool `json:"invalid_to_zero,omitempty"`
} }
type LikwidCollector struct { type LikwidCollector struct {
@ -260,6 +266,10 @@ func (m *LikwidCollector) Init(config json.RawMessage) error {
return err return err
} }
} }
lib := dl.New(LIKWID_LIB_NAME, LIKWID_LIB_DL_FLAGS)
if lib == nil {
return fmt.Errorf("error instantiating DynamicLibrary for %s", LIKWID_LIB_NAME)
}
if m.config.ForceOverwrite { if m.config.ForceOverwrite {
cclog.ComponentDebug(m.name, "Set LIKWID_FORCE=1") cclog.ComponentDebug(m.name, "Set LIKWID_FORCE=1")
os.Setenv("LIKWID_FORCE", "1") os.Setenv("LIKWID_FORCE", "1")
@ -374,15 +384,13 @@ func (m *LikwidCollector) takeMeasurement(group int, interval time.Duration) err
ret = C.perfmon_setupCounters(gid) ret = C.perfmon_setupCounters(gid)
if ret != 0 { if ret != 0 {
gctr := C.GoString(C.perfmon_getGroupName(gid)) gctr := C.GoString(C.perfmon_getGroupName(gid))
err := fmt.Errorf("failed to setup performance group %s", gctr) err := fmt.Errorf("failed to setup performance group %d (%s)", gid, gctr)
cclog.ComponentError(m.name, err.Error())
return err return err
} }
ret = C.perfmon_startCounters() ret = C.perfmon_startCounters()
if ret != 0 { if ret != 0 {
gctr := C.GoString(C.perfmon_getGroupName(gid)) gctr := C.GoString(C.perfmon_getGroupName(gid))
err := fmt.Errorf("failed to start performance group %s", gctr) err := fmt.Errorf("failed to start performance group %d (%s)", gid, gctr)
cclog.ComponentError(m.name, err.Error())
return err return err
} }
m.running = true m.running = true
@ -391,8 +399,7 @@ func (m *LikwidCollector) takeMeasurement(group int, interval time.Duration) err
ret = C.perfmon_stopCounters() ret = C.perfmon_stopCounters()
if ret != 0 { if ret != 0 {
gctr := C.GoString(C.perfmon_getGroupName(gid)) gctr := C.GoString(C.perfmon_getGroupName(gid))
err := fmt.Errorf("failed to stop performance group %s", gctr) err := fmt.Errorf("failed to stop performance group %d (%s)", gid, gctr)
cclog.ComponentError(m.name, err.Error())
return err return err
} }
return nil return nil
@ -439,7 +446,10 @@ func (m *LikwidCollector) calcEventsetMetrics(group int, interval time.Duration,
continue continue
} }
m.mresults[group][tid][metric.Name] = value m.mresults[group][tid][metric.Name] = value
if m.config.NanToZero && math.IsNaN(value) { if m.config.InvalidToZero && math.IsNaN(value) {
value = 0.0
}
if m.config.InvalidToZero && math.IsInf(value, 0) {
value = 0.0 value = 0.0
} }
// Now we have the result, send it with the proper tags // Now we have the result, send it with the proper tags
@ -483,7 +493,10 @@ func (m *LikwidCollector) calcGlobalMetrics(interval time.Duration, output chan
continue continue
} }
m.gmresults[tid][metric.Name] = value m.gmresults[tid][metric.Name] = value
if m.config.NanToZero && math.IsNaN(value) { if m.config.InvalidToZero && math.IsNaN(value) {
value = 0.0
}
if m.config.InvalidToZero && math.IsInf(value, 0) {
value = 0.0 value = 0.0
} }
// Now we have the result, send it with the proper tags // Now we have the result, send it with the proper tags
@ -517,7 +530,7 @@ func (m *LikwidCollector) Read(interval time.Duration, output chan lp.CCMetric)
err := m.takeMeasurement(i, interval) err := m.takeMeasurement(i, interval)
if err != nil { if err != nil {
cclog.ComponentError(m.name, err.Error()) cclog.ComponentError(m.name, err.Error())
continue return
} }
// read measurements and derive event set metrics // read measurements and derive event set metrics
m.calcEventsetMetrics(i, interval, output) m.calcEventsetMetrics(i, interval, output)

View File

@ -9,7 +9,7 @@ The `likwid` configuration consists of two parts, the "eventsets" and "globalmet
Additional options: Additional options:
- `force_overwrite`: Same as setting `LIKWID_FORCE=1`. In case counters are already in-use, LIKWID overwrites their configuration to do its measurements - `force_overwrite`: Same as setting `LIKWID_FORCE=1`. In case counters are already in-use, LIKWID overwrites their configuration to do its measurements
- `nan_to_zero`: In some cases, the calculations result in `NaN`. With this option, all `NaN` values are replaced with `0.0`. - `invalid_to_zero`: In some cases, the calculations result in `NaN` or `Inf`. With this option, all `NaN` and `Inf` values are replaced with `0.0`.
### Available metric scopes ### Available metric scopes