mirror of
https://github.com/ClusterCockpit/cc-metric-collector.git
synced 2025-04-06 13:35:55 +02:00
Re-initialize LIKWID after one read is missing due to lock toggle
This commit is contained in:
parent
8f372f0274
commit
35b25a6ce8
@ -82,6 +82,7 @@ type LikwidCollector struct {
|
|||||||
basefreq float64
|
basefreq float64
|
||||||
running bool
|
running bool
|
||||||
initialized bool
|
initialized bool
|
||||||
|
needs_reinit bool
|
||||||
likwidGroups map[C.int]LikwidEventsetConfig
|
likwidGroups map[C.int]LikwidEventsetConfig
|
||||||
lock sync.Mutex
|
lock sync.Mutex
|
||||||
measureThread thread.Thread
|
measureThread thread.Thread
|
||||||
@ -193,6 +194,7 @@ func (m *LikwidCollector) Init(config json.RawMessage) error {
|
|||||||
m.name = "LikwidCollector"
|
m.name = "LikwidCollector"
|
||||||
m.parallel = false
|
m.parallel = false
|
||||||
m.initialized = false
|
m.initialized = false
|
||||||
|
m.needs_reinit = true
|
||||||
m.running = false
|
m.running = false
|
||||||
m.config.AccessMode = LIKWID_DEF_ACCESSMODE
|
m.config.AccessMode = LIKWID_DEF_ACCESSMODE
|
||||||
m.config.LibraryPath = LIKWID_LIB_NAME
|
m.config.LibraryPath = LIKWID_LIB_NAME
|
||||||
@ -299,6 +301,7 @@ func (m *LikwidCollector) takeMeasurement(evset LikwidEventsetConfig, interval t
|
|||||||
if ret != 0 {
|
if ret != 0 {
|
||||||
var err error = nil
|
var err error = nil
|
||||||
var skip bool = false
|
var skip bool = false
|
||||||
|
cclog.ComponentDebug(m.name, "Setup returns", ret)
|
||||||
if ret == -37 {
|
if ret == -37 {
|
||||||
skip = true
|
skip = true
|
||||||
} else {
|
} else {
|
||||||
@ -353,7 +356,6 @@ func (m *LikwidCollector) takeMeasurement(evset LikwidEventsetConfig, interval t
|
|||||||
for _, tid := range m.cpu2tid {
|
for _, tid := range m.cpu2tid {
|
||||||
evset.results[tid]["time"] = runtime
|
evset.results[tid]["time"] = runtime
|
||||||
}
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
m.lock.Unlock()
|
m.lock.Unlock()
|
||||||
return false, nil
|
return false, nil
|
||||||
@ -458,6 +460,28 @@ func (m *LikwidCollector) calcGlobalMetrics(interval time.Duration, output chan
|
|||||||
return nil
|
return nil
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func (m *LikwidCollector) ReInit() error {
|
||||||
|
C.perfmon_finalize()
|
||||||
|
ret := C.perfmon_init(C.int(len(m.cpulist)), &m.cpulist[0])
|
||||||
|
if ret != 0 {
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
for i, evset := range m.config.Eventsets {
|
||||||
|
var gid C.int
|
||||||
|
if len(evset.Events) > 0 {
|
||||||
|
//skip := false
|
||||||
|
likwidGroup := genLikwidEventSet(evset)
|
||||||
|
gid = C.perfmon_addEventSet(likwidGroup.estr)
|
||||||
|
if gid >= 0 {
|
||||||
|
likwidGroup.gid = gid
|
||||||
|
likwidGroup.internal = i
|
||||||
|
m.likwidGroups[gid] = likwidGroup
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
|
||||||
func (m *LikwidCollector) LateInit() error {
|
func (m *LikwidCollector) LateInit() error {
|
||||||
var ret C.int
|
var ret C.int
|
||||||
if m.initialized {
|
if m.initialized {
|
||||||
@ -498,48 +522,53 @@ func (m *LikwidCollector) LateInit() error {
|
|||||||
m.basefreq = getBaseFreq()
|
m.basefreq = getBaseFreq()
|
||||||
cclog.ComponentDebug(m.name, "BaseFreq", m.basefreq)
|
cclog.ComponentDebug(m.name, "BaseFreq", m.basefreq)
|
||||||
|
|
||||||
cclog.ComponentDebug(m.name, "initialize LIKWID perfmon module")
|
if m.needs_reinit {
|
||||||
ret = C.perfmon_init(C.int(len(m.cpulist)), &m.cpulist[0])
|
m.ReInit()
|
||||||
if ret != 0 {
|
m.needs_reinit = false
|
||||||
var err error = nil
|
|
||||||
C.topology_finalize()
|
|
||||||
if ret != -22 {
|
|
||||||
err = errors.New("failed to initialize LIKWID perfmon")
|
|
||||||
cclog.ComponentError(m.name, err.Error())
|
|
||||||
} else {
|
|
||||||
err = errors.New("access to LIKWID perfmon locked")
|
|
||||||
}
|
|
||||||
return err
|
|
||||||
}
|
}
|
||||||
|
|
||||||
// While adding the events, we test the metrics whether they can be computed at all
|
// cclog.ComponentDebug(m.name, "initialize LIKWID perfmon module")
|
||||||
for i, evset := range m.config.Eventsets {
|
// ret = C.perfmon_init(C.int(len(m.cpulist)), &m.cpulist[0])
|
||||||
var gid C.int
|
// if ret != 0 {
|
||||||
if len(evset.Events) > 0 {
|
// var err error = nil
|
||||||
skip := false
|
// C.topology_finalize()
|
||||||
likwidGroup := genLikwidEventSet(evset)
|
// if ret != -22 {
|
||||||
for _, g := range m.likwidGroups {
|
// err = errors.New("failed to initialize LIKWID perfmon")
|
||||||
if likwidGroup.go_estr == g.go_estr {
|
// cclog.ComponentError(m.name, err.Error())
|
||||||
skip = true
|
// } else {
|
||||||
break
|
// err = errors.New("access to LIKWID perfmon locked")
|
||||||
}
|
// }
|
||||||
}
|
// return err
|
||||||
if skip {
|
// }
|
||||||
continue
|
|
||||||
}
|
|
||||||
// Now we add the list of events to likwid
|
|
||||||
gid = C.perfmon_addEventSet(likwidGroup.estr)
|
|
||||||
if gid >= 0 {
|
|
||||||
likwidGroup.gid = gid
|
|
||||||
likwidGroup.internal = i
|
|
||||||
m.likwidGroups[gid] = likwidGroup
|
|
||||||
}
|
|
||||||
} else {
|
|
||||||
cclog.ComponentError(m.name, "Invalid Likwid eventset config, no events given")
|
|
||||||
continue
|
|
||||||
}
|
|
||||||
|
|
||||||
}
|
// // While adding the events, we test the metrics whether they can be computed at all
|
||||||
|
// for i, evset := range m.config.Eventsets {
|
||||||
|
// var gid C.int
|
||||||
|
// if len(evset.Events) > 0 {
|
||||||
|
// //skip := false
|
||||||
|
// likwidGroup := genLikwidEventSet(evset)
|
||||||
|
// // for _, g := range m.likwidGroups {
|
||||||
|
// // if likwidGroup.go_estr == g.go_estr {
|
||||||
|
// // skip = true
|
||||||
|
// // break
|
||||||
|
// // }
|
||||||
|
// // }
|
||||||
|
// // if skip {
|
||||||
|
// // continue
|
||||||
|
// // }
|
||||||
|
// // Now we add the list of events to likwid
|
||||||
|
// gid = C.perfmon_addEventSet(likwidGroup.estr)
|
||||||
|
// if gid >= 0 {
|
||||||
|
// likwidGroup.gid = gid
|
||||||
|
// likwidGroup.internal = i
|
||||||
|
// m.likwidGroups[gid] = likwidGroup
|
||||||
|
// }
|
||||||
|
// } else {
|
||||||
|
// cclog.ComponentError(m.name, "Invalid Likwid eventset config, no events given")
|
||||||
|
// continue
|
||||||
|
// }
|
||||||
|
|
||||||
|
// }
|
||||||
|
|
||||||
// If no event set could be added, shut down LikwidCollector
|
// If no event set could be added, shut down LikwidCollector
|
||||||
if len(m.likwidGroups) == 0 {
|
if len(m.likwidGroups) == 0 {
|
||||||
@ -606,6 +635,10 @@ func (m *LikwidCollector) Read(interval time.Duration, output chan lp.CCMetric)
|
|||||||
// use the event set metrics to derive the global metrics
|
// use the event set metrics to derive the global metrics
|
||||||
m.calcGlobalMetrics(time, output)
|
m.calcGlobalMetrics(time, output)
|
||||||
}
|
}
|
||||||
|
if skip {
|
||||||
|
m.needs_reinit = true
|
||||||
|
m.initialized = false
|
||||||
|
}
|
||||||
}
|
}
|
||||||
})
|
})
|
||||||
}
|
}
|
||||||
|
Loading…
x
Reference in New Issue
Block a user