mirror of
https://github.com/ClusterCockpit/cc-metric-collector.git
synced 2024-12-25 23:19:06 +01:00
Dynamically load liblikwid (#40)
* Check whether LIKWID library is present * Generalize nan_to_zero option to invalid_to_zero including +Inf, -Inf and NaN * Remove double error printing and return if measurements do not work
This commit is contained in:
parent
ea5b3bdbd6
commit
f683f2e6da
@ -2,7 +2,7 @@ package collectors
|
|||||||
|
|
||||||
/*
|
/*
|
||||||
#cgo CFLAGS: -I./likwid
|
#cgo CFLAGS: -I./likwid
|
||||||
#cgo LDFLAGS: -L./likwid -llikwid -llikwid-hwloc -lm
|
#cgo LDFLAGS: -L./likwid -llikwid -llikwid-hwloc -lm -Wl,--unresolved-symbols=ignore-in-object-files
|
||||||
#include <stdlib.h>
|
#include <stdlib.h>
|
||||||
#include <likwid.h>
|
#include <likwid.h>
|
||||||
*/
|
*/
|
||||||
@ -25,6 +25,7 @@ import (
|
|||||||
lp "github.com/ClusterCockpit/cc-metric-collector/internal/ccMetric"
|
lp "github.com/ClusterCockpit/cc-metric-collector/internal/ccMetric"
|
||||||
topo "github.com/ClusterCockpit/cc-metric-collector/internal/ccTopology"
|
topo "github.com/ClusterCockpit/cc-metric-collector/internal/ccTopology"
|
||||||
agg "github.com/ClusterCockpit/cc-metric-collector/internal/metricAggregator"
|
agg "github.com/ClusterCockpit/cc-metric-collector/internal/metricAggregator"
|
||||||
|
"github.com/NVIDIA/go-nvml/pkg/dl"
|
||||||
)
|
)
|
||||||
|
|
||||||
type MetricScope string
|
type MetricScope string
|
||||||
@ -69,6 +70,11 @@ func GetAllMetricScopes() []MetricScope {
|
|||||||
return []MetricScope{"cpu" /*, "core", "llc", "numadomain", "die",*/, "socket", "node"}
|
return []MetricScope{"cpu" /*, "core", "llc", "numadomain", "die",*/, "socket", "node"}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
const (
|
||||||
|
LIKWID_LIB_NAME = "liblikwid.so"
|
||||||
|
LIKWID_LIB_DL_FLAGS = dl.RTLD_LAZY | dl.RTLD_GLOBAL
|
||||||
|
)
|
||||||
|
|
||||||
type LikwidCollectorMetricConfig struct {
|
type LikwidCollectorMetricConfig struct {
|
||||||
Name string `json:"name"` // Name of the metric
|
Name string `json:"name"` // Name of the metric
|
||||||
Calc string `json:"calc"` // Calculation for the metric using
|
Calc string `json:"calc"` // Calculation for the metric using
|
||||||
@ -88,7 +94,7 @@ type LikwidCollectorConfig struct {
|
|||||||
Eventsets []LikwidCollectorEventsetConfig `json:"eventsets"`
|
Eventsets []LikwidCollectorEventsetConfig `json:"eventsets"`
|
||||||
Metrics []LikwidCollectorMetricConfig `json:"globalmetrics,omitempty"`
|
Metrics []LikwidCollectorMetricConfig `json:"globalmetrics,omitempty"`
|
||||||
ForceOverwrite bool `json:"force_overwrite,omitempty"`
|
ForceOverwrite bool `json:"force_overwrite,omitempty"`
|
||||||
NanToZero bool `json:"nan_to_zero,omitempty"`
|
InvalidToZero bool `json:"invalid_to_zero,omitempty"`
|
||||||
}
|
}
|
||||||
|
|
||||||
type LikwidCollector struct {
|
type LikwidCollector struct {
|
||||||
@ -260,6 +266,10 @@ func (m *LikwidCollector) Init(config json.RawMessage) error {
|
|||||||
return err
|
return err
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
lib := dl.New(LIKWID_LIB_NAME, LIKWID_LIB_DL_FLAGS)
|
||||||
|
if lib == nil {
|
||||||
|
return fmt.Errorf("error instantiating DynamicLibrary for %s", LIKWID_LIB_NAME)
|
||||||
|
}
|
||||||
if m.config.ForceOverwrite {
|
if m.config.ForceOverwrite {
|
||||||
cclog.ComponentDebug(m.name, "Set LIKWID_FORCE=1")
|
cclog.ComponentDebug(m.name, "Set LIKWID_FORCE=1")
|
||||||
os.Setenv("LIKWID_FORCE", "1")
|
os.Setenv("LIKWID_FORCE", "1")
|
||||||
@ -374,15 +384,13 @@ func (m *LikwidCollector) takeMeasurement(group int, interval time.Duration) err
|
|||||||
ret = C.perfmon_setupCounters(gid)
|
ret = C.perfmon_setupCounters(gid)
|
||||||
if ret != 0 {
|
if ret != 0 {
|
||||||
gctr := C.GoString(C.perfmon_getGroupName(gid))
|
gctr := C.GoString(C.perfmon_getGroupName(gid))
|
||||||
err := fmt.Errorf("failed to setup performance group %s", gctr)
|
err := fmt.Errorf("failed to setup performance group %d (%s)", gid, gctr)
|
||||||
cclog.ComponentError(m.name, err.Error())
|
|
||||||
return err
|
return err
|
||||||
}
|
}
|
||||||
ret = C.perfmon_startCounters()
|
ret = C.perfmon_startCounters()
|
||||||
if ret != 0 {
|
if ret != 0 {
|
||||||
gctr := C.GoString(C.perfmon_getGroupName(gid))
|
gctr := C.GoString(C.perfmon_getGroupName(gid))
|
||||||
err := fmt.Errorf("failed to start performance group %s", gctr)
|
err := fmt.Errorf("failed to start performance group %d (%s)", gid, gctr)
|
||||||
cclog.ComponentError(m.name, err.Error())
|
|
||||||
return err
|
return err
|
||||||
}
|
}
|
||||||
m.running = true
|
m.running = true
|
||||||
@ -391,8 +399,7 @@ func (m *LikwidCollector) takeMeasurement(group int, interval time.Duration) err
|
|||||||
ret = C.perfmon_stopCounters()
|
ret = C.perfmon_stopCounters()
|
||||||
if ret != 0 {
|
if ret != 0 {
|
||||||
gctr := C.GoString(C.perfmon_getGroupName(gid))
|
gctr := C.GoString(C.perfmon_getGroupName(gid))
|
||||||
err := fmt.Errorf("failed to stop performance group %s", gctr)
|
err := fmt.Errorf("failed to stop performance group %d (%s)", gid, gctr)
|
||||||
cclog.ComponentError(m.name, err.Error())
|
|
||||||
return err
|
return err
|
||||||
}
|
}
|
||||||
return nil
|
return nil
|
||||||
@ -439,7 +446,10 @@ func (m *LikwidCollector) calcEventsetMetrics(group int, interval time.Duration,
|
|||||||
continue
|
continue
|
||||||
}
|
}
|
||||||
m.mresults[group][tid][metric.Name] = value
|
m.mresults[group][tid][metric.Name] = value
|
||||||
if m.config.NanToZero && math.IsNaN(value) {
|
if m.config.InvalidToZero && math.IsNaN(value) {
|
||||||
|
value = 0.0
|
||||||
|
}
|
||||||
|
if m.config.InvalidToZero && math.IsInf(value, 0) {
|
||||||
value = 0.0
|
value = 0.0
|
||||||
}
|
}
|
||||||
// Now we have the result, send it with the proper tags
|
// Now we have the result, send it with the proper tags
|
||||||
@ -483,7 +493,10 @@ func (m *LikwidCollector) calcGlobalMetrics(interval time.Duration, output chan
|
|||||||
continue
|
continue
|
||||||
}
|
}
|
||||||
m.gmresults[tid][metric.Name] = value
|
m.gmresults[tid][metric.Name] = value
|
||||||
if m.config.NanToZero && math.IsNaN(value) {
|
if m.config.InvalidToZero && math.IsNaN(value) {
|
||||||
|
value = 0.0
|
||||||
|
}
|
||||||
|
if m.config.InvalidToZero && math.IsInf(value, 0) {
|
||||||
value = 0.0
|
value = 0.0
|
||||||
}
|
}
|
||||||
// Now we have the result, send it with the proper tags
|
// Now we have the result, send it with the proper tags
|
||||||
@ -517,7 +530,7 @@ func (m *LikwidCollector) Read(interval time.Duration, output chan lp.CCMetric)
|
|||||||
err := m.takeMeasurement(i, interval)
|
err := m.takeMeasurement(i, interval)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
cclog.ComponentError(m.name, err.Error())
|
cclog.ComponentError(m.name, err.Error())
|
||||||
continue
|
return
|
||||||
}
|
}
|
||||||
// read measurements and derive event set metrics
|
// read measurements and derive event set metrics
|
||||||
m.calcEventsetMetrics(i, interval, output)
|
m.calcEventsetMetrics(i, interval, output)
|
||||||
|
@ -9,7 +9,7 @@ The `likwid` configuration consists of two parts, the "eventsets" and "globalmet
|
|||||||
|
|
||||||
Additional options:
|
Additional options:
|
||||||
- `force_overwrite`: Same as setting `LIKWID_FORCE=1`. In case counters are already in-use, LIKWID overwrites their configuration to do its measurements
|
- `force_overwrite`: Same as setting `LIKWID_FORCE=1`. In case counters are already in-use, LIKWID overwrites their configuration to do its measurements
|
||||||
- `nan_to_zero`: In some cases, the calculations result in `NaN`. With this option, all `NaN` values are replaced with `0.0`.
|
- `invalid_to_zero`: In some cases, the calculations result in `NaN` or `Inf`. With this option, all `NaN` and `Inf` values are replaced with `0.0`.
|
||||||
|
|
||||||
### Available metric scopes
|
### Available metric scopes
|
||||||
|
|
||||||
|
Loading…
Reference in New Issue
Block a user