diff --git a/collectors/likwidMetric.go b/collectors/likwidMetric.go index 265d84c..07f00f6 100644 --- a/collectors/likwidMetric.go +++ b/collectors/likwidMetric.go @@ -28,6 +28,7 @@ import ( lp "github.com/ClusterCockpit/cc-metric-collector/pkg/ccMetric" topo "github.com/ClusterCockpit/cc-metric-collector/pkg/ccTopology" "github.com/NVIDIA/go-nvml/pkg/dl" + "golang.design/x/thread" ) const ( @@ -71,18 +72,19 @@ type LikwidCollectorConfig struct { type LikwidCollector struct { metricCollector - cpulist []C.int - cpu2tid map[int]int - sock2tid map[int]int - metrics map[C.int]map[string]int - groups []C.int - config LikwidCollectorConfig - gmresults map[int]map[string]float64 - basefreq float64 - running bool - initialized bool - likwidGroups map[C.int]LikwidEventsetConfig - lock sync.Mutex + cpulist []C.int + cpu2tid map[int]int + sock2tid map[int]int + metrics map[C.int]map[string]int + groups []C.int + config LikwidCollectorConfig + gmresults map[int]map[string]float64 + basefreq float64 + running bool + initialized bool + likwidGroups map[C.int]LikwidEventsetConfig + lock sync.Mutex + measureThread thread.Thread } type LikwidMetric struct { @@ -92,6 +94,18 @@ type LikwidMetric struct { group_idx int } +func checkMetricType(t string) bool { + valid := map[string]bool{ + "node": true, + "socket": true, + "hwthread": true, + "core": true, + "memoryDomain": true, + } + _, ok := valid[t] + return ok +} + func eventsToEventStr(events map[string]string) string { elist := make([]string, 0) for k, v := range events { @@ -239,7 +253,7 @@ func (m *LikwidCollector) Init(config json.RawMessage) error { } for _, metric := range evset.Metrics { // Try to evaluate the metric - if testLikwidMetricFormula(metric.Calc, params) { + if testLikwidMetricFormula(metric.Calc, params) && checkMetricType(metric.Type) { // Add the computable metric to the parameter list for the global metrics globalParams = append(globalParams, metric.Name) totalMetrics++ @@ -257,6 +271,9 @@ func (m *LikwidCollector) Init(config json.RawMessage) error { if !testLikwidMetricFormula(metric.Calc, globalParams) { cclog.ComponentError(m.name, "Calculation for metric", metric.Name, "failed") metric.Calc = "" + } else if !checkMetricType(metric.Type) { + cclog.ComponentError(m.name, "Metric", metric.Name, "has invalid type") + metric.Calc = "" } else { totalMetrics++ } @@ -268,6 +285,7 @@ func (m *LikwidCollector) Init(config json.RawMessage) error { cclog.ComponentError(m.name, err.Error()) return err } + m.measureThread = thread.New() m.init = true return nil } @@ -289,6 +307,7 @@ func (m *LikwidCollector) takeMeasurement(evset LikwidEventsetConfig, interval t m.lock.Unlock() return skip, err } + m.running = true ret = C.perfmon_startCounters() if ret != 0 { var err error = nil @@ -301,7 +320,7 @@ func (m *LikwidCollector) takeMeasurement(evset LikwidEventsetConfig, interval t m.lock.Unlock() return skip, err } - m.running = true + ret = C.perfmon_readCounters() time.Sleep(interval) m.running = false ret = C.perfmon_stopCounters() @@ -316,6 +335,25 @@ func (m *LikwidCollector) takeMeasurement(evset LikwidEventsetConfig, interval t m.lock.Unlock() return skip, err } + m.running = false + runtime := float64(C.perfmon_getLastTimeOfGroup(evset.gid)) + // Go over events and get the results + for eidx, counter := range evset.eorder { + gctr := C.GoString(counter) + for _, tid := range m.cpu2tid { + res := C.perfmon_getLastResult(evset.gid, C.int(eidx), C.int(tid)) + fres := float64(res) + if m.config.InvalidToZero && (math.IsNaN(fres) || math.IsInf(fres, 0)) { + cclog.ComponentDebug(m.name, "Sanitize", gctr, "to zero") + fres = 0.0 + } + evset.results[tid][gctr] = fres + } + } + for _, tid := range m.cpu2tid { + evset.results[tid]["time"] = runtime + } + } m.lock.Unlock() return false, nil @@ -325,19 +363,8 @@ func (m *LikwidCollector) takeMeasurement(evset LikwidEventsetConfig, interval t func (m *LikwidCollector) calcEventsetMetrics(evset LikwidEventsetConfig, interval time.Duration, output chan lp.CCMetric) error { invClock := float64(1.0 / m.basefreq) - // Go over events and get the results - for eidx, counter := range evset.eorder { - gctr := C.GoString(counter) - for _, tid := range m.cpu2tid { - res := C.perfmon_getLastResult(evset.gid, C.int(eidx), C.int(tid)) - fres := float64(res) - if m.config.InvalidToZero && (math.IsNaN(fres) || math.IsInf(fres, 0)) { - fres = 0.0 - } - evset.results[tid][gctr] = fres - evset.results[tid]["time"] = interval.Seconds() - evset.results[tid]["inverseClock"] = invClock - } + for _, tid := range m.cpu2tid { + evset.results[tid]["inverseClock"] = invClock } // Go over the event set metrics, derive the value out of the event:counter values and send it @@ -445,6 +472,9 @@ func (m *LikwidCollector) LateInit() error { os.Setenv("PATH", m.config.DaemonPath+":"+p) } C.HPMmode(1) + for _, c := range m.cpulist { + C.HPMaddThread(c) + } } cclog.ComponentDebug(m.name, "initialize LIKWID topology") ret = C.topology_init() @@ -540,38 +570,44 @@ func (m *LikwidCollector) Read(interval time.Duration, output chan lp.CCMetric) return } - if !m.initialized { - m.lock.Lock() - err = m.LateInit() - if err != nil { + m.measureThread.Call(func() { + if !m.initialized { + m.lock.Lock() + err = m.LateInit() + if err != nil { + m.lock.Unlock() + cclog.ComponentError(m.name, "lateinit failed") + return + } + m.initialized = true m.lock.Unlock() - return + skip = true } - m.initialized = true - m.lock.Unlock() - } - if m.initialized && !skip { - for _, evset := range m.likwidGroups { - if !skip { - // measure event set 'i' for 'interval' seconds - skip, err = m.takeMeasurement(evset, interval) - if err != nil { - cclog.ComponentError(m.name, err.Error()) - return + if m.initialized && !skip { + time := interval + for _, evset := range m.likwidGroups { + if !skip { + // measure event set 'i' for 'interval' seconds + skip, err = m.takeMeasurement(evset, interval) + if err != nil { + cclog.ComponentError(m.name, err.Error()) + return + } + } + + if !skip { + // read measurements and derive event set metrics + m.calcEventsetMetrics(evset, time, output) } } if !skip { - // read measurements and derive event set metrics - m.calcEventsetMetrics(evset, interval, output) + // use the event set metrics to derive the global metrics + m.calcGlobalMetrics(time, output) } } - if !skip { - // use the event set metrics to derive the global metrics - m.calcGlobalMetrics(interval, output) - } - } + }) } func (m *LikwidCollector) Close() {