mirror of
https://github.com/ClusterCockpit/cc-metric-collector.git
synced 2024-12-25 23:19:06 +01:00
Merge branch 'develop' of github.com:ClusterCockpit/cc-metric-collector into develop
This commit is contained in:
commit
f1d3cabdc6
@ -15,8 +15,11 @@ import (
|
|||||||
"io/ioutil"
|
"io/ioutil"
|
||||||
"math"
|
"math"
|
||||||
"os"
|
"os"
|
||||||
|
"os/signal"
|
||||||
"strconv"
|
"strconv"
|
||||||
"strings"
|
"strings"
|
||||||
|
"sync"
|
||||||
|
"syscall"
|
||||||
"time"
|
"time"
|
||||||
"unsafe"
|
"unsafe"
|
||||||
|
|
||||||
@ -46,6 +49,15 @@ type LikwidCollectorEventsetConfig struct {
|
|||||||
Metrics []LikwidCollectorMetricConfig `json:"metrics"`
|
Metrics []LikwidCollectorMetricConfig `json:"metrics"`
|
||||||
}
|
}
|
||||||
|
|
||||||
|
type LikwidEventsetConfig struct {
|
||||||
|
internal int
|
||||||
|
gid C.int
|
||||||
|
eorder []*C.char
|
||||||
|
estr *C.char
|
||||||
|
results map[int]map[string]interface{}
|
||||||
|
metrics map[int]map[string]float64
|
||||||
|
}
|
||||||
|
|
||||||
type LikwidCollectorConfig struct {
|
type LikwidCollectorConfig struct {
|
||||||
Eventsets []LikwidCollectorEventsetConfig `json:"eventsets"`
|
Eventsets []LikwidCollectorEventsetConfig `json:"eventsets"`
|
||||||
Metrics []LikwidCollectorMetricConfig `json:"globalmetrics,omitempty"`
|
Metrics []LikwidCollectorMetricConfig `json:"globalmetrics,omitempty"`
|
||||||
@ -58,17 +70,18 @@ type LikwidCollectorConfig struct {
|
|||||||
|
|
||||||
type LikwidCollector struct {
|
type LikwidCollector struct {
|
||||||
metricCollector
|
metricCollector
|
||||||
cpulist []C.int
|
cpulist []C.int
|
||||||
cpu2tid map[int]int
|
cpu2tid map[int]int
|
||||||
sock2tid map[int]int
|
sock2tid map[int]int
|
||||||
metrics map[C.int]map[string]int
|
metrics map[C.int]map[string]int
|
||||||
groups []C.int
|
groups []C.int
|
||||||
config LikwidCollectorConfig
|
config LikwidCollectorConfig
|
||||||
results map[int]map[int]map[string]interface{}
|
gmresults map[int]map[string]float64
|
||||||
mresults map[int]map[int]map[string]float64
|
basefreq float64
|
||||||
gmresults map[int]map[string]float64
|
running bool
|
||||||
basefreq float64
|
initialized bool
|
||||||
running bool
|
likwidGroups map[C.int]LikwidEventsetConfig
|
||||||
|
lock sync.Mutex
|
||||||
}
|
}
|
||||||
|
|
||||||
type LikwidMetric struct {
|
type LikwidMetric struct {
|
||||||
@ -86,14 +99,53 @@ func eventsToEventStr(events map[string]string) string {
|
|||||||
return strings.Join(elist, ",")
|
return strings.Join(elist, ",")
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func genLikwidEventSet(input LikwidCollectorEventsetConfig) LikwidEventsetConfig {
|
||||||
|
tmplist := make([]string, 0)
|
||||||
|
elist := make([]*C.char, 0)
|
||||||
|
for k, v := range input.Events {
|
||||||
|
tmplist = append(tmplist, fmt.Sprintf("%s:%s", v, k))
|
||||||
|
c_counter := C.CString(k)
|
||||||
|
elist = append(elist, c_counter)
|
||||||
|
}
|
||||||
|
estr := strings.Join(tmplist, ",")
|
||||||
|
res := make(map[int]map[string]interface{})
|
||||||
|
met := make(map[int]map[string]float64)
|
||||||
|
for _, i := range topo.CpuList() {
|
||||||
|
res[i] = make(map[string]interface{})
|
||||||
|
for k := range input.Events {
|
||||||
|
res[i][k] = 0.0
|
||||||
|
}
|
||||||
|
met[i] = make(map[string]float64)
|
||||||
|
for _, v := range input.Metrics {
|
||||||
|
res[i][v.Name] = 0.0
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return LikwidEventsetConfig{
|
||||||
|
gid: -1,
|
||||||
|
eorder: elist,
|
||||||
|
estr: C.CString(estr),
|
||||||
|
results: res,
|
||||||
|
metrics: met,
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func testLikwidMetricFormula(formula string, params []string) bool {
|
||||||
|
myparams := make(map[string]interface{})
|
||||||
|
for _, p := range params {
|
||||||
|
myparams[p] = float64(1.0)
|
||||||
|
}
|
||||||
|
_, err := agg.EvalFloat64Condition(formula, myparams)
|
||||||
|
return err == nil
|
||||||
|
}
|
||||||
|
|
||||||
func getBaseFreq() float64 {
|
func getBaseFreq() float64 {
|
||||||
|
files := []string{
|
||||||
|
"/sys/devices/system/cpu/cpu0/cpufreq/bios_limit",
|
||||||
|
"/sys/devices/system/cpu/cpu0/cpufreq/base_frequency",
|
||||||
|
}
|
||||||
var freq float64 = math.NaN()
|
var freq float64 = math.NaN()
|
||||||
C.power_init(0)
|
for _, f := range files {
|
||||||
info := C.get_powerInfo()
|
buffer, err := ioutil.ReadFile(f)
|
||||||
if float64(info.baseFrequency) != 0 {
|
|
||||||
freq = float64(info.baseFrequency) * 1e6
|
|
||||||
} else {
|
|
||||||
buffer, err := ioutil.ReadFile("/sys/devices/system/cpu/cpu0/cpufreq/bios_limit")
|
|
||||||
if err == nil {
|
if err == nil {
|
||||||
data := strings.Replace(string(buffer), "\n", "", -1)
|
data := strings.Replace(string(buffer), "\n", "", -1)
|
||||||
x, err := strconv.ParseInt(data, 0, 64)
|
x, err := strconv.ParseInt(data, 0, 64)
|
||||||
@ -102,12 +154,22 @@ func getBaseFreq() float64 {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if math.IsNaN(freq) {
|
||||||
|
C.power_init(0)
|
||||||
|
info := C.get_powerInfo()
|
||||||
|
if float64(info.baseFrequency) != 0 {
|
||||||
|
freq = float64(info.baseFrequency) * 1e6
|
||||||
|
}
|
||||||
|
C.power_finalize()
|
||||||
|
}
|
||||||
return freq
|
return freq
|
||||||
}
|
}
|
||||||
|
|
||||||
func (m *LikwidCollector) Init(config json.RawMessage) error {
|
func (m *LikwidCollector) Init(config json.RawMessage) error {
|
||||||
var ret C.int
|
|
||||||
m.name = "LikwidCollector"
|
m.name = "LikwidCollector"
|
||||||
|
m.initialized = false
|
||||||
|
m.running = false
|
||||||
m.config.AccessMode = LIKWID_DEF_ACCESSMODE
|
m.config.AccessMode = LIKWID_DEF_ACCESSMODE
|
||||||
m.config.LibraryPath = LIKWID_LIB_NAME
|
m.config.LibraryPath = LIKWID_LIB_NAME
|
||||||
if len(config) > 0 {
|
if len(config) > 0 {
|
||||||
@ -140,172 +202,132 @@ func (m *LikwidCollector) Init(config json.RawMessage) error {
|
|||||||
m.cpulist[i] = C.int(c)
|
m.cpulist[i] = C.int(c)
|
||||||
m.cpu2tid[c] = i
|
m.cpu2tid[c] = i
|
||||||
}
|
}
|
||||||
m.sock2tid = make(map[int]int)
|
|
||||||
tmp := make([]C.int, 1)
|
m.likwidGroups = make(map[C.int]LikwidEventsetConfig)
|
||||||
for _, sid := range topo.SocketList() {
|
|
||||||
cstr := C.CString(fmt.Sprintf("S%d:0", sid))
|
// m.results = make(map[int]map[int]map[string]interface{})
|
||||||
ret = C.cpustr_to_cpulist(cstr, &tmp[0], 1)
|
// m.mresults = make(map[int]map[int]map[string]float64)
|
||||||
if ret > 0 {
|
|
||||||
m.sock2tid[sid] = m.cpu2tid[int(tmp[0])]
|
|
||||||
}
|
|
||||||
C.free(unsafe.Pointer(cstr))
|
|
||||||
}
|
|
||||||
m.results = make(map[int]map[int]map[string]interface{})
|
|
||||||
m.mresults = make(map[int]map[int]map[string]float64)
|
|
||||||
m.gmresults = make(map[int]map[string]float64)
|
m.gmresults = make(map[int]map[string]float64)
|
||||||
cclog.ComponentDebug(m.name, "initialize LIKWID topology")
|
for _, tid := range m.cpu2tid {
|
||||||
ret = C.topology_init()
|
m.gmresults[tid] = make(map[string]float64)
|
||||||
if ret != 0 {
|
|
||||||
err := errors.New("failed to initialize LIKWID topology")
|
|
||||||
cclog.ComponentError(m.name, err.Error())
|
|
||||||
return err
|
|
||||||
}
|
|
||||||
|
|
||||||
switch m.config.AccessMode {
|
|
||||||
case "direct":
|
|
||||||
C.HPMmode(0)
|
|
||||||
case "accessdaemon":
|
|
||||||
if len(m.config.DaemonPath) > 0 {
|
|
||||||
p := os.Getenv("PATH")
|
|
||||||
os.Setenv("PATH", m.config.DaemonPath+":"+p)
|
|
||||||
}
|
|
||||||
C.HPMmode(1)
|
|
||||||
}
|
|
||||||
|
|
||||||
cclog.ComponentDebug(m.name, "initialize LIKWID perfmon module")
|
|
||||||
ret = C.perfmon_init(C.int(len(m.cpulist)), &m.cpulist[0])
|
|
||||||
if ret != 0 {
|
|
||||||
C.topology_finalize()
|
|
||||||
err := errors.New("failed to initialize LIKWID topology")
|
|
||||||
cclog.ComponentError(m.name, err.Error())
|
|
||||||
return err
|
|
||||||
}
|
}
|
||||||
|
|
||||||
// This is for the global metrics computation test
|
// This is for the global metrics computation test
|
||||||
globalParams := make(map[string]interface{})
|
totalMetrics := 0
|
||||||
globalParams["time"] = float64(1.0)
|
// Generate parameter list for the metric computing test
|
||||||
globalParams["inverseClock"] = float64(1.0)
|
params := make([]string, 0)
|
||||||
// While adding the events, we test the metrics whether they can be computed at all
|
params = append(params, "time", "inverseClock")
|
||||||
for i, evset := range m.config.Eventsets {
|
// Generate parameter list for the global metric computing test
|
||||||
var gid C.int
|
globalParams := make([]string, 0)
|
||||||
var cstr *C.char
|
globalParams = append(globalParams, "time", "inverseClock")
|
||||||
|
// We test the eventset metrics whether they can be computed at all
|
||||||
|
for _, evset := range m.config.Eventsets {
|
||||||
if len(evset.Events) > 0 {
|
if len(evset.Events) > 0 {
|
||||||
estr := eventsToEventStr(evset.Events)
|
params = params[:2]
|
||||||
// Generate parameter list for the metric computing test
|
|
||||||
params := make(map[string]interface{})
|
|
||||||
params["time"] = float64(1.0)
|
|
||||||
params["inverseClock"] = float64(1.0)
|
|
||||||
for counter := range evset.Events {
|
for counter := range evset.Events {
|
||||||
params[counter] = float64(1.0)
|
params = append(params, counter)
|
||||||
}
|
}
|
||||||
for _, metric := range evset.Metrics {
|
for _, metric := range evset.Metrics {
|
||||||
// Try to evaluate the metric
|
// Try to evaluate the metric
|
||||||
_, err := agg.EvalFloat64Condition(metric.Calc, params)
|
if testLikwidMetricFormula(metric.Calc, params) {
|
||||||
if err != nil {
|
// Add the computable metric to the parameter list for the global metrics
|
||||||
cclog.ComponentError(m.name, "Calculation for metric", metric.Name, "failed:", err.Error())
|
globalParams = append(globalParams, metric.Name)
|
||||||
continue
|
totalMetrics++
|
||||||
}
|
} else {
|
||||||
// If the metric is not in the parameter list for the global metrics, add it
|
metric.Calc = ""
|
||||||
if _, ok := globalParams[metric.Name]; !ok {
|
|
||||||
globalParams[metric.Name] = float64(1.0)
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
// Now we add the list of events to likwid
|
|
||||||
cstr = C.CString(estr)
|
|
||||||
gid = C.perfmon_addEventSet(cstr)
|
|
||||||
} else {
|
} else {
|
||||||
cclog.ComponentError(m.name, "Invalid Likwid eventset config, no events given")
|
cclog.ComponentError(m.name, "Invalid Likwid eventset config, no events given")
|
||||||
continue
|
continue
|
||||||
}
|
}
|
||||||
if gid >= 0 {
|
|
||||||
m.groups = append(m.groups, gid)
|
|
||||||
}
|
|
||||||
C.free(unsafe.Pointer(cstr))
|
|
||||||
m.results[i] = make(map[int]map[string]interface{})
|
|
||||||
m.mresults[i] = make(map[int]map[string]float64)
|
|
||||||
for tid := range m.cpulist {
|
|
||||||
m.results[i][tid] = make(map[string]interface{})
|
|
||||||
m.mresults[i][tid] = make(map[string]float64)
|
|
||||||
if i == 0 {
|
|
||||||
m.gmresults[tid] = make(map[string]float64)
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
for _, metric := range m.config.Metrics {
|
for _, metric := range m.config.Metrics {
|
||||||
// Try to evaluate the global metric
|
// Try to evaluate the global metric
|
||||||
_, err := agg.EvalFloat64Condition(metric.Calc, globalParams)
|
if !testLikwidMetricFormula(metric.Calc, globalParams) {
|
||||||
if err != nil {
|
cclog.ComponentError(m.name, "Calculation for metric", metric.Name, "failed")
|
||||||
cclog.ComponentError(m.name, "Calculation for metric", metric.Name, "failed:", err.Error())
|
metric.Calc = ""
|
||||||
continue
|
} else {
|
||||||
|
totalMetrics++
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// If no event set could be added, shut down LikwidCollector
|
// If no event set could be added, shut down LikwidCollector
|
||||||
if len(m.groups) == 0 {
|
if totalMetrics == 0 {
|
||||||
C.perfmon_finalize()
|
err := errors.New("no LIKWID eventset or metric usable")
|
||||||
C.topology_finalize()
|
|
||||||
err := errors.New("no LIKWID performance group initialized")
|
|
||||||
cclog.ComponentError(m.name, err.Error())
|
cclog.ComponentError(m.name, err.Error())
|
||||||
return err
|
return err
|
||||||
}
|
}
|
||||||
m.basefreq = getBaseFreq()
|
|
||||||
cclog.ComponentDebug(m.name, "BaseFreq", m.basefreq)
|
|
||||||
m.init = true
|
m.init = true
|
||||||
return nil
|
return nil
|
||||||
}
|
}
|
||||||
|
|
||||||
// take a measurement for 'interval' seconds of event set index 'group'
|
// take a measurement for 'interval' seconds of event set index 'group'
|
||||||
func (m *LikwidCollector) takeMeasurement(group int, interval time.Duration) error {
|
func (m *LikwidCollector) takeMeasurement(evset LikwidEventsetConfig, interval time.Duration) (bool, error) {
|
||||||
var ret C.int
|
var ret C.int
|
||||||
gid := m.groups[group]
|
m.lock.Lock()
|
||||||
ret = C.perfmon_setupCounters(gid)
|
if m.initialized {
|
||||||
if ret != 0 {
|
ret = C.perfmon_setupCounters(evset.gid)
|
||||||
gctr := C.GoString(C.perfmon_getGroupName(gid))
|
if ret != 0 {
|
||||||
err := fmt.Errorf("failed to setup performance group %d (%s)", gid, gctr)
|
var err error = nil
|
||||||
return err
|
var skip bool = false
|
||||||
|
if ret == -37 {
|
||||||
|
skip = true
|
||||||
|
} else {
|
||||||
|
err = fmt.Errorf("failed to setup performance group %d", evset.gid)
|
||||||
|
}
|
||||||
|
m.lock.Unlock()
|
||||||
|
return skip, err
|
||||||
|
}
|
||||||
|
ret = C.perfmon_startCounters()
|
||||||
|
if ret != 0 {
|
||||||
|
var err error = nil
|
||||||
|
var skip bool = false
|
||||||
|
if ret == -37 {
|
||||||
|
skip = true
|
||||||
|
} else {
|
||||||
|
err = fmt.Errorf("failed to setup performance group %d", evset.gid)
|
||||||
|
}
|
||||||
|
m.lock.Unlock()
|
||||||
|
return skip, err
|
||||||
|
}
|
||||||
|
m.running = true
|
||||||
|
time.Sleep(interval)
|
||||||
|
m.running = false
|
||||||
|
ret = C.perfmon_stopCounters()
|
||||||
|
if ret != 0 {
|
||||||
|
var err error = nil
|
||||||
|
var skip bool = false
|
||||||
|
if ret == -37 {
|
||||||
|
skip = true
|
||||||
|
} else {
|
||||||
|
err = fmt.Errorf("failed to setup performance group %d", evset.gid)
|
||||||
|
}
|
||||||
|
m.lock.Unlock()
|
||||||
|
return skip, err
|
||||||
|
}
|
||||||
}
|
}
|
||||||
ret = C.perfmon_startCounters()
|
m.lock.Unlock()
|
||||||
if ret != 0 {
|
return false, nil
|
||||||
gctr := C.GoString(C.perfmon_getGroupName(gid))
|
|
||||||
err := fmt.Errorf("failed to start performance group %d (%s)", gid, gctr)
|
|
||||||
return err
|
|
||||||
}
|
|
||||||
m.running = true
|
|
||||||
time.Sleep(interval)
|
|
||||||
m.running = false
|
|
||||||
ret = C.perfmon_stopCounters()
|
|
||||||
if ret != 0 {
|
|
||||||
gctr := C.GoString(C.perfmon_getGroupName(gid))
|
|
||||||
err := fmt.Errorf("failed to stop performance group %d (%s)", gid, gctr)
|
|
||||||
return err
|
|
||||||
}
|
|
||||||
return nil
|
|
||||||
}
|
}
|
||||||
|
|
||||||
// Get all measurement results for an event set, derive the metric values out of the measurement results and send it
|
// Get all measurement results for an event set, derive the metric values out of the measurement results and send it
|
||||||
func (m *LikwidCollector) calcEventsetMetrics(group int, interval time.Duration, output chan lp.CCMetric) error {
|
func (m *LikwidCollector) calcEventsetMetrics(evset LikwidEventsetConfig, interval time.Duration, output chan lp.CCMetric) error {
|
||||||
var eidx C.int
|
|
||||||
evset := m.config.Eventsets[group]
|
|
||||||
gid := m.groups[group]
|
|
||||||
invClock := float64(1.0 / m.basefreq)
|
invClock := float64(1.0 / m.basefreq)
|
||||||
|
|
||||||
// Go over events and get the results
|
// Go over events and get the results
|
||||||
for eidx = 0; int(eidx) < len(evset.Events); eidx++ {
|
for eidx, counter := range evset.eorder {
|
||||||
ctr := C.perfmon_getCounterName(gid, eidx)
|
gctr := C.GoString(counter)
|
||||||
gctr := C.GoString(ctr)
|
|
||||||
|
|
||||||
for _, tid := range m.cpu2tid {
|
for _, tid := range m.cpu2tid {
|
||||||
if tid >= 0 {
|
res := C.perfmon_getLastResult(evset.gid, C.int(eidx), C.int(tid))
|
||||||
m.results[group][tid]["time"] = interval.Seconds()
|
evset.results[tid][gctr] = float64(res)
|
||||||
m.results[group][tid]["inverseClock"] = invClock
|
evset.results[tid]["time"] = interval.Seconds()
|
||||||
res := C.perfmon_getLastResult(gid, eidx, C.int(tid))
|
evset.results[tid]["inverseClock"] = invClock
|
||||||
m.results[group][tid][gctr] = float64(res)
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// Go over the event set metrics, derive the value out of the event:counter values and send it
|
// Go over the event set metrics, derive the value out of the event:counter values and send it
|
||||||
for _, metric := range evset.Metrics {
|
for _, metric := range m.config.Eventsets[evset.internal].Metrics {
|
||||||
// The metric scope is determined in the Init() function
|
// The metric scope is determined in the Init() function
|
||||||
// Get the map scope-id -> tids
|
// Get the map scope-id -> tids
|
||||||
scopemap := m.cpu2tid
|
scopemap := m.cpu2tid
|
||||||
@ -313,13 +335,13 @@ func (m *LikwidCollector) calcEventsetMetrics(group int, interval time.Duration,
|
|||||||
scopemap = m.sock2tid
|
scopemap = m.sock2tid
|
||||||
}
|
}
|
||||||
for domain, tid := range scopemap {
|
for domain, tid := range scopemap {
|
||||||
if tid >= 0 {
|
if tid >= 0 && len(metric.Calc) > 0 {
|
||||||
value, err := agg.EvalFloat64Condition(metric.Calc, m.results[group][tid])
|
value, err := agg.EvalFloat64Condition(metric.Calc, evset.results[tid])
|
||||||
if err != nil {
|
if err != nil {
|
||||||
cclog.ComponentError(m.name, "Calculation for metric", metric.Name, "failed:", err.Error())
|
cclog.ComponentError(m.name, "Calculation for metric", metric.Name, "failed:", err.Error())
|
||||||
continue
|
continue
|
||||||
}
|
}
|
||||||
m.mresults[group][tid][metric.Name] = value
|
evset.metrics[tid][metric.Name] = value
|
||||||
if m.config.InvalidToZero && math.IsNaN(value) {
|
if m.config.InvalidToZero && math.IsNaN(value) {
|
||||||
value = 0.0
|
value = 0.0
|
||||||
}
|
}
|
||||||
@ -360,8 +382,8 @@ func (m *LikwidCollector) calcGlobalMetrics(interval time.Duration, output chan
|
|||||||
if tid >= 0 {
|
if tid >= 0 {
|
||||||
// Here we generate parameter list
|
// Here we generate parameter list
|
||||||
params := make(map[string]interface{})
|
params := make(map[string]interface{})
|
||||||
for j := range m.groups {
|
for _, evset := range m.likwidGroups {
|
||||||
for mname, mres := range m.mresults[j][tid] {
|
for mname, mres := range evset.metrics[tid] {
|
||||||
params[mname] = mres
|
params[mname] = mres
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -401,38 +423,145 @@ func (m *LikwidCollector) calcGlobalMetrics(interval time.Duration, output chan
|
|||||||
return nil
|
return nil
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func (m *LikwidCollector) LateInit() error {
|
||||||
|
var ret C.int
|
||||||
|
switch m.config.AccessMode {
|
||||||
|
case "direct":
|
||||||
|
C.HPMmode(0)
|
||||||
|
case "accessdaemon":
|
||||||
|
if len(m.config.DaemonPath) > 0 {
|
||||||
|
p := os.Getenv("PATH")
|
||||||
|
os.Setenv("PATH", m.config.DaemonPath+":"+p)
|
||||||
|
}
|
||||||
|
C.HPMmode(1)
|
||||||
|
}
|
||||||
|
cclog.ComponentDebug(m.name, "initialize LIKWID topology")
|
||||||
|
ret = C.topology_init()
|
||||||
|
if ret != 0 {
|
||||||
|
err := errors.New("failed to initialize LIKWID topology")
|
||||||
|
cclog.ComponentError(m.name, err.Error())
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
|
||||||
|
m.sock2tid = make(map[int]int)
|
||||||
|
tmp := make([]C.int, 1)
|
||||||
|
for _, sid := range topo.SocketList() {
|
||||||
|
cstr := C.CString(fmt.Sprintf("S%d:0", sid))
|
||||||
|
ret = C.cpustr_to_cpulist(cstr, &tmp[0], 1)
|
||||||
|
if ret > 0 {
|
||||||
|
m.sock2tid[sid] = m.cpu2tid[int(tmp[0])]
|
||||||
|
}
|
||||||
|
C.free(unsafe.Pointer(cstr))
|
||||||
|
}
|
||||||
|
|
||||||
|
m.basefreq = getBaseFreq()
|
||||||
|
cclog.ComponentDebug(m.name, "BaseFreq", m.basefreq)
|
||||||
|
|
||||||
|
cclog.ComponentDebug(m.name, "initialize LIKWID perfmon module")
|
||||||
|
ret = C.perfmon_init(C.int(len(m.cpulist)), &m.cpulist[0])
|
||||||
|
if ret != 0 {
|
||||||
|
var err error = nil
|
||||||
|
C.topology_finalize()
|
||||||
|
if ret != -22 {
|
||||||
|
err = errors.New("failed to initialize LIKWID perfmon")
|
||||||
|
cclog.ComponentError(m.name, err.Error())
|
||||||
|
} else {
|
||||||
|
err = errors.New("access to LIKWID perfmon locked")
|
||||||
|
}
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
|
||||||
|
// While adding the events, we test the metrics whether they can be computed at all
|
||||||
|
for i, evset := range m.config.Eventsets {
|
||||||
|
var gid C.int
|
||||||
|
if len(evset.Events) > 0 {
|
||||||
|
likwidGroup := genLikwidEventSet(evset)
|
||||||
|
// Now we add the list of events to likwid
|
||||||
|
gid = C.perfmon_addEventSet(likwidGroup.estr)
|
||||||
|
if gid >= 0 {
|
||||||
|
likwidGroup.gid = gid
|
||||||
|
likwidGroup.internal = i
|
||||||
|
m.likwidGroups[gid] = likwidGroup
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
cclog.ComponentError(m.name, "Invalid Likwid eventset config, no events given")
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
// If no event set could be added, shut down LikwidCollector
|
||||||
|
if len(m.likwidGroups) == 0 {
|
||||||
|
C.perfmon_finalize()
|
||||||
|
C.topology_finalize()
|
||||||
|
err := errors.New("no LIKWID performance group initialized")
|
||||||
|
cclog.ComponentError(m.name, err.Error())
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
sigchan := make(chan os.Signal, 1)
|
||||||
|
signal.Notify(sigchan, syscall.SIGCHLD)
|
||||||
|
signal.Notify(sigchan, os.Interrupt)
|
||||||
|
go func() {
|
||||||
|
<-sigchan
|
||||||
|
|
||||||
|
signal.Stop(sigchan)
|
||||||
|
m.initialized = false
|
||||||
|
}()
|
||||||
|
m.initialized = true
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
|
||||||
// main read function taking multiple measurement rounds, each 'interval' seconds long
|
// main read function taking multiple measurement rounds, each 'interval' seconds long
|
||||||
func (m *LikwidCollector) Read(interval time.Duration, output chan lp.CCMetric) {
|
func (m *LikwidCollector) Read(interval time.Duration, output chan lp.CCMetric) {
|
||||||
|
var skip bool = false
|
||||||
|
var err error
|
||||||
if !m.init {
|
if !m.init {
|
||||||
return
|
return
|
||||||
}
|
}
|
||||||
|
|
||||||
for i := range m.groups {
|
if !m.initialized {
|
||||||
// measure event set 'i' for 'interval' seconds
|
if m.LateInit() != nil {
|
||||||
err := m.takeMeasurement(i, interval)
|
|
||||||
if err != nil {
|
|
||||||
cclog.ComponentError(m.name, err.Error())
|
|
||||||
return
|
return
|
||||||
}
|
}
|
||||||
// read measurements and derive event set metrics
|
|
||||||
m.calcEventsetMetrics(i, interval, output)
|
|
||||||
}
|
}
|
||||||
// use the event set metrics to derive the global metrics
|
|
||||||
m.calcGlobalMetrics(interval, output)
|
if m.initialized && !skip {
|
||||||
|
for _, evset := range m.likwidGroups {
|
||||||
|
if !skip {
|
||||||
|
// measure event set 'i' for 'interval' seconds
|
||||||
|
skip, err = m.takeMeasurement(evset, interval)
|
||||||
|
if err != nil {
|
||||||
|
cclog.ComponentError(m.name, err.Error())
|
||||||
|
return
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if !skip {
|
||||||
|
// read measurements and derive event set metrics
|
||||||
|
m.calcEventsetMetrics(evset, interval, output)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if !skip {
|
||||||
|
// use the event set metrics to derive the global metrics
|
||||||
|
m.calcGlobalMetrics(interval, output)
|
||||||
|
}
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
func (m *LikwidCollector) Close() {
|
func (m *LikwidCollector) Close() {
|
||||||
if m.init {
|
if m.init {
|
||||||
cclog.ComponentDebug(m.name, "Closing ...")
|
|
||||||
m.init = false
|
m.init = false
|
||||||
if m.running {
|
cclog.ComponentDebug(m.name, "Closing ...")
|
||||||
cclog.ComponentDebug(m.name, "Stopping counters")
|
m.lock.Lock()
|
||||||
C.perfmon_stopCounters()
|
if m.initialized {
|
||||||
|
cclog.ComponentDebug(m.name, "Finalize LIKWID perfmon module")
|
||||||
|
C.perfmon_finalize()
|
||||||
|
m.initialized = false
|
||||||
}
|
}
|
||||||
cclog.ComponentDebug(m.name, "Finalize LIKWID perfmon module")
|
m.lock.Unlock()
|
||||||
C.perfmon_finalize()
|
|
||||||
cclog.ComponentDebug(m.name, "Finalize LIKWID topology module")
|
cclog.ComponentDebug(m.name, "Finalize LIKWID topology module")
|
||||||
C.topology_finalize()
|
C.topology_finalize()
|
||||||
|
|
||||||
cclog.ComponentDebug(m.name, "Closing done")
|
cclog.ComponentDebug(m.name, "Closing done")
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -40,8 +40,13 @@ type MemstatCollector struct {
|
|||||||
sendMemUsed bool
|
sendMemUsed bool
|
||||||
}
|
}
|
||||||
|
|
||||||
func getStats(filename string) map[string]float64 {
|
type MemstatStats struct {
|
||||||
stats := make(map[string]float64)
|
value float64
|
||||||
|
unit string
|
||||||
|
}
|
||||||
|
|
||||||
|
func getStats(filename string) map[string]MemstatStats {
|
||||||
|
stats := make(map[string]MemstatStats)
|
||||||
file, err := os.Open(filename)
|
file, err := os.Open(filename)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
cclog.Error(err.Error())
|
cclog.Error(err.Error())
|
||||||
@ -55,12 +60,18 @@ func getStats(filename string) map[string]float64 {
|
|||||||
if len(linefields) == 3 {
|
if len(linefields) == 3 {
|
||||||
v, err := strconv.ParseFloat(linefields[1], 64)
|
v, err := strconv.ParseFloat(linefields[1], 64)
|
||||||
if err == nil {
|
if err == nil {
|
||||||
stats[strings.Trim(linefields[0], ":")] = v
|
stats[strings.Trim(linefields[0], ":")] = MemstatStats{
|
||||||
|
value: v,
|
||||||
|
unit: linefields[2],
|
||||||
|
}
|
||||||
}
|
}
|
||||||
} else if len(linefields) == 5 {
|
} else if len(linefields) == 5 {
|
||||||
v, err := strconv.ParseFloat(linefields[3], 64)
|
v, err := strconv.ParseFloat(linefields[3], 64)
|
||||||
if err == nil {
|
if err == nil {
|
||||||
stats[strings.Trim(linefields[0], ":")] = v
|
stats[strings.Trim(linefields[0], ":")] = MemstatStats{
|
||||||
|
value: v,
|
||||||
|
unit: linefields[4],
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -78,7 +89,7 @@ func (m *MemstatCollector) Init(config json.RawMessage) error {
|
|||||||
return err
|
return err
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
m.meta = map[string]string{"source": m.name, "group": "Memory", "unit": "GByte"}
|
m.meta = map[string]string{"source": m.name, "group": "Memory"}
|
||||||
m.stats = make(map[string]int64)
|
m.stats = make(map[string]int64)
|
||||||
m.matches = make(map[string]string)
|
m.matches = make(map[string]string)
|
||||||
m.tags = map[string]string{"type": "node"}
|
m.tags = map[string]string{"type": "node"}
|
||||||
@ -151,30 +162,51 @@ func (m *MemstatCollector) Read(interval time.Duration, output chan lp.CCMetric)
|
|||||||
return
|
return
|
||||||
}
|
}
|
||||||
|
|
||||||
sendStats := func(stats map[string]float64, tags map[string]string) {
|
sendStats := func(stats map[string]MemstatStats, tags map[string]string) {
|
||||||
for match, name := range m.matches {
|
for match, name := range m.matches {
|
||||||
var value float64 = 0
|
var value float64 = 0
|
||||||
|
var unit string = ""
|
||||||
if v, ok := stats[match]; ok {
|
if v, ok := stats[match]; ok {
|
||||||
value = v
|
value = v.value
|
||||||
|
if len(v.unit) > 0 {
|
||||||
|
unit = v.unit
|
||||||
|
}
|
||||||
}
|
}
|
||||||
y, err := lp.New(name, tags, m.meta, map[string]interface{}{"value": value * 1e-6}, time.Now())
|
|
||||||
|
y, err := lp.New(name, tags, m.meta, map[string]interface{}{"value": value}, time.Now())
|
||||||
if err == nil {
|
if err == nil {
|
||||||
|
if len(unit) > 0 {
|
||||||
|
y.AddMeta("unit", unit)
|
||||||
|
}
|
||||||
output <- y
|
output <- y
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
if m.sendMemUsed {
|
if m.sendMemUsed {
|
||||||
memUsed := 0.0
|
memUsed := 0.0
|
||||||
|
unit := ""
|
||||||
if totalVal, total := stats["MemTotal"]; total {
|
if totalVal, total := stats["MemTotal"]; total {
|
||||||
if freeVal, free := stats["MemFree"]; free {
|
if freeVal, free := stats["MemFree"]; free {
|
||||||
if bufVal, buffers := stats["Buffers"]; buffers {
|
if bufVal, buffers := stats["Buffers"]; buffers {
|
||||||
if cacheVal, cached := stats["Cached"]; cached {
|
if cacheVal, cached := stats["Cached"]; cached {
|
||||||
memUsed = totalVal - (freeVal + bufVal + cacheVal)
|
memUsed = totalVal.value - (freeVal.value + bufVal.value + cacheVal.value)
|
||||||
|
if len(totalVal.unit) > 0 {
|
||||||
|
unit = totalVal.unit
|
||||||
|
} else if len(freeVal.unit) > 0 {
|
||||||
|
unit = freeVal.unit
|
||||||
|
} else if len(bufVal.unit) > 0 {
|
||||||
|
unit = bufVal.unit
|
||||||
|
} else if len(cacheVal.unit) > 0 {
|
||||||
|
unit = cacheVal.unit
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
y, err := lp.New("mem_used", tags, m.meta, map[string]interface{}{"value": memUsed * 1e-6}, time.Now())
|
y, err := lp.New("mem_used", tags, m.meta, map[string]interface{}{"value": memUsed}, time.Now())
|
||||||
if err == nil {
|
if err == nil {
|
||||||
|
if len(unit) > 0 {
|
||||||
|
y.AddMeta("unit", unit)
|
||||||
|
}
|
||||||
output <- y
|
output <- y
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -36,7 +36,7 @@ type nfsCollector struct {
|
|||||||
}
|
}
|
||||||
|
|
||||||
func (m *nfsCollector) initStats() error {
|
func (m *nfsCollector) initStats() error {
|
||||||
cmd := exec.Command(m.config.Nfsstats, `-l`)
|
cmd := exec.Command(m.config.Nfsstats, `-l`, `--all`)
|
||||||
cmd.Wait()
|
cmd.Wait()
|
||||||
buffer, err := cmd.Output()
|
buffer, err := cmd.Output()
|
||||||
if err == nil {
|
if err == nil {
|
||||||
@ -52,7 +52,7 @@ func (m *nfsCollector) initStats() error {
|
|||||||
if err == nil {
|
if err == nil {
|
||||||
x := m.data[name]
|
x := m.data[name]
|
||||||
x.current = value
|
x.current = value
|
||||||
x.last = 0
|
x.last = value
|
||||||
m.data[name] = x
|
m.data[name] = x
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -63,7 +63,7 @@ func (m *nfsCollector) initStats() error {
|
|||||||
}
|
}
|
||||||
|
|
||||||
func (m *nfsCollector) updateStats() error {
|
func (m *nfsCollector) updateStats() error {
|
||||||
cmd := exec.Command(m.config.Nfsstats, `-l`)
|
cmd := exec.Command(m.config.Nfsstats, `-l`, `--all`)
|
||||||
cmd.Wait()
|
cmd.Wait()
|
||||||
buffer, err := cmd.Output()
|
buffer, err := cmd.Output()
|
||||||
if err == nil {
|
if err == nil {
|
||||||
|
Loading…
Reference in New Issue
Block a user