mirror of
https://github.com/ClusterCockpit/cc-metric-collector.git
synced 2025-04-05 21:25:55 +02:00
Run LIKWID in separate thread and check metric type
This commit is contained in:
parent
5918f96fd8
commit
a6699bbe75
@ -28,6 +28,7 @@ import (
|
||||
lp "github.com/ClusterCockpit/cc-metric-collector/pkg/ccMetric"
|
||||
topo "github.com/ClusterCockpit/cc-metric-collector/pkg/ccTopology"
|
||||
"github.com/NVIDIA/go-nvml/pkg/dl"
|
||||
"golang.design/x/thread"
|
||||
)
|
||||
|
||||
const (
|
||||
@ -71,18 +72,19 @@ type LikwidCollectorConfig struct {
|
||||
|
||||
type LikwidCollector struct {
|
||||
metricCollector
|
||||
cpulist []C.int
|
||||
cpu2tid map[int]int
|
||||
sock2tid map[int]int
|
||||
metrics map[C.int]map[string]int
|
||||
groups []C.int
|
||||
config LikwidCollectorConfig
|
||||
gmresults map[int]map[string]float64
|
||||
basefreq float64
|
||||
running bool
|
||||
initialized bool
|
||||
likwidGroups map[C.int]LikwidEventsetConfig
|
||||
lock sync.Mutex
|
||||
cpulist []C.int
|
||||
cpu2tid map[int]int
|
||||
sock2tid map[int]int
|
||||
metrics map[C.int]map[string]int
|
||||
groups []C.int
|
||||
config LikwidCollectorConfig
|
||||
gmresults map[int]map[string]float64
|
||||
basefreq float64
|
||||
running bool
|
||||
initialized bool
|
||||
likwidGroups map[C.int]LikwidEventsetConfig
|
||||
lock sync.Mutex
|
||||
measureThread thread.Thread
|
||||
}
|
||||
|
||||
type LikwidMetric struct {
|
||||
@ -92,6 +94,18 @@ type LikwidMetric struct {
|
||||
group_idx int
|
||||
}
|
||||
|
||||
func checkMetricType(t string) bool {
|
||||
valid := map[string]bool{
|
||||
"node": true,
|
||||
"socket": true,
|
||||
"hwthread": true,
|
||||
"core": true,
|
||||
"memoryDomain": true,
|
||||
}
|
||||
_, ok := valid[t]
|
||||
return ok
|
||||
}
|
||||
|
||||
func eventsToEventStr(events map[string]string) string {
|
||||
elist := make([]string, 0)
|
||||
for k, v := range events {
|
||||
@ -239,7 +253,7 @@ func (m *LikwidCollector) Init(config json.RawMessage) error {
|
||||
}
|
||||
for _, metric := range evset.Metrics {
|
||||
// Try to evaluate the metric
|
||||
if testLikwidMetricFormula(metric.Calc, params) {
|
||||
if testLikwidMetricFormula(metric.Calc, params) && checkMetricType(metric.Type) {
|
||||
// Add the computable metric to the parameter list for the global metrics
|
||||
globalParams = append(globalParams, metric.Name)
|
||||
totalMetrics++
|
||||
@ -257,6 +271,9 @@ func (m *LikwidCollector) Init(config json.RawMessage) error {
|
||||
if !testLikwidMetricFormula(metric.Calc, globalParams) {
|
||||
cclog.ComponentError(m.name, "Calculation for metric", metric.Name, "failed")
|
||||
metric.Calc = ""
|
||||
} else if !checkMetricType(metric.Type) {
|
||||
cclog.ComponentError(m.name, "Metric", metric.Name, "has invalid type")
|
||||
metric.Calc = ""
|
||||
} else {
|
||||
totalMetrics++
|
||||
}
|
||||
@ -268,6 +285,7 @@ func (m *LikwidCollector) Init(config json.RawMessage) error {
|
||||
cclog.ComponentError(m.name, err.Error())
|
||||
return err
|
||||
}
|
||||
m.measureThread = thread.New()
|
||||
m.init = true
|
||||
return nil
|
||||
}
|
||||
@ -289,6 +307,7 @@ func (m *LikwidCollector) takeMeasurement(evset LikwidEventsetConfig, interval t
|
||||
m.lock.Unlock()
|
||||
return skip, err
|
||||
}
|
||||
m.running = true
|
||||
ret = C.perfmon_startCounters()
|
||||
if ret != 0 {
|
||||
var err error = nil
|
||||
@ -301,7 +320,7 @@ func (m *LikwidCollector) takeMeasurement(evset LikwidEventsetConfig, interval t
|
||||
m.lock.Unlock()
|
||||
return skip, err
|
||||
}
|
||||
m.running = true
|
||||
ret = C.perfmon_readCounters()
|
||||
time.Sleep(interval)
|
||||
m.running = false
|
||||
ret = C.perfmon_stopCounters()
|
||||
@ -316,6 +335,25 @@ func (m *LikwidCollector) takeMeasurement(evset LikwidEventsetConfig, interval t
|
||||
m.lock.Unlock()
|
||||
return skip, err
|
||||
}
|
||||
m.running = false
|
||||
runtime := float64(C.perfmon_getLastTimeOfGroup(evset.gid))
|
||||
// Go over events and get the results
|
||||
for eidx, counter := range evset.eorder {
|
||||
gctr := C.GoString(counter)
|
||||
for _, tid := range m.cpu2tid {
|
||||
res := C.perfmon_getLastResult(evset.gid, C.int(eidx), C.int(tid))
|
||||
fres := float64(res)
|
||||
if m.config.InvalidToZero && (math.IsNaN(fres) || math.IsInf(fres, 0)) {
|
||||
cclog.ComponentDebug(m.name, "Sanitize", gctr, "to zero")
|
||||
fres = 0.0
|
||||
}
|
||||
evset.results[tid][gctr] = fres
|
||||
}
|
||||
}
|
||||
for _, tid := range m.cpu2tid {
|
||||
evset.results[tid]["time"] = runtime
|
||||
}
|
||||
|
||||
}
|
||||
m.lock.Unlock()
|
||||
return false, nil
|
||||
@ -325,19 +363,8 @@ func (m *LikwidCollector) takeMeasurement(evset LikwidEventsetConfig, interval t
|
||||
func (m *LikwidCollector) calcEventsetMetrics(evset LikwidEventsetConfig, interval time.Duration, output chan lp.CCMetric) error {
|
||||
invClock := float64(1.0 / m.basefreq)
|
||||
|
||||
// Go over events and get the results
|
||||
for eidx, counter := range evset.eorder {
|
||||
gctr := C.GoString(counter)
|
||||
for _, tid := range m.cpu2tid {
|
||||
res := C.perfmon_getLastResult(evset.gid, C.int(eidx), C.int(tid))
|
||||
fres := float64(res)
|
||||
if m.config.InvalidToZero && (math.IsNaN(fres) || math.IsInf(fres, 0)) {
|
||||
fres = 0.0
|
||||
}
|
||||
evset.results[tid][gctr] = fres
|
||||
evset.results[tid]["time"] = interval.Seconds()
|
||||
evset.results[tid]["inverseClock"] = invClock
|
||||
}
|
||||
for _, tid := range m.cpu2tid {
|
||||
evset.results[tid]["inverseClock"] = invClock
|
||||
}
|
||||
|
||||
// Go over the event set metrics, derive the value out of the event:counter values and send it
|
||||
@ -445,6 +472,9 @@ func (m *LikwidCollector) LateInit() error {
|
||||
os.Setenv("PATH", m.config.DaemonPath+":"+p)
|
||||
}
|
||||
C.HPMmode(1)
|
||||
for _, c := range m.cpulist {
|
||||
C.HPMaddThread(c)
|
||||
}
|
||||
}
|
||||
cclog.ComponentDebug(m.name, "initialize LIKWID topology")
|
||||
ret = C.topology_init()
|
||||
@ -540,38 +570,44 @@ func (m *LikwidCollector) Read(interval time.Duration, output chan lp.CCMetric)
|
||||
return
|
||||
}
|
||||
|
||||
if !m.initialized {
|
||||
m.lock.Lock()
|
||||
err = m.LateInit()
|
||||
if err != nil {
|
||||
m.measureThread.Call(func() {
|
||||
if !m.initialized {
|
||||
m.lock.Lock()
|
||||
err = m.LateInit()
|
||||
if err != nil {
|
||||
m.lock.Unlock()
|
||||
cclog.ComponentError(m.name, "lateinit failed")
|
||||
return
|
||||
}
|
||||
m.initialized = true
|
||||
m.lock.Unlock()
|
||||
return
|
||||
skip = true
|
||||
}
|
||||
m.initialized = true
|
||||
m.lock.Unlock()
|
||||
}
|
||||
|
||||
if m.initialized && !skip {
|
||||
for _, evset := range m.likwidGroups {
|
||||
if !skip {
|
||||
// measure event set 'i' for 'interval' seconds
|
||||
skip, err = m.takeMeasurement(evset, interval)
|
||||
if err != nil {
|
||||
cclog.ComponentError(m.name, err.Error())
|
||||
return
|
||||
if m.initialized && !skip {
|
||||
time := interval
|
||||
for _, evset := range m.likwidGroups {
|
||||
if !skip {
|
||||
// measure event set 'i' for 'interval' seconds
|
||||
skip, err = m.takeMeasurement(evset, interval)
|
||||
if err != nil {
|
||||
cclog.ComponentError(m.name, err.Error())
|
||||
return
|
||||
}
|
||||
}
|
||||
|
||||
if !skip {
|
||||
// read measurements and derive event set metrics
|
||||
m.calcEventsetMetrics(evset, time, output)
|
||||
}
|
||||
}
|
||||
|
||||
if !skip {
|
||||
// read measurements and derive event set metrics
|
||||
m.calcEventsetMetrics(evset, interval, output)
|
||||
// use the event set metrics to derive the global metrics
|
||||
m.calcGlobalMetrics(time, output)
|
||||
}
|
||||
}
|
||||
if !skip {
|
||||
// use the event set metrics to derive the global metrics
|
||||
m.calcGlobalMetrics(interval, output)
|
||||
}
|
||||
}
|
||||
})
|
||||
}
|
||||
|
||||
func (m *LikwidCollector) Close() {
|
||||
|
Loading…
x
Reference in New Issue
Block a user