mirror of
https://github.com/ClusterCockpit/cc-metric-collector.git
synced 2025-07-19 03:11:41 +02:00
Collector-specific configuration. LIKWID collector derives metrics itself, Run once CLI option
This commit is contained in:
@@ -15,15 +15,44 @@ import (
|
||||
"log"
|
||||
"strings"
|
||||
"time"
|
||||
"os"
|
||||
"unsafe"
|
||||
"math"
|
||||
"encoding/json"
|
||||
"gopkg.in/Knetic/govaluate.v2"
|
||||
"io/ioutil"
|
||||
"strconv"
|
||||
)
|
||||
|
||||
// LikwidCollectorMetricConfig describes one derived metric in the JSON
// configuration of the LIKWID collector.
type LikwidCollectorMetricConfig struct {
	// Name is the key under which the computed value is stored and the name
	// used when the metric is published.
	Name string `json:"name"`
	// Calc is the expression that Read compiles and evaluates to obtain the
	// metric value.
	Calc string `json:"calc"`
	// Socket_scope selects publication once per socket (tagged "socket")
	// instead of once per CPU (tagged "cpu").
	Socket_scope bool `json:"socket_scope"`
	// Publish must be true for the metric to be emitted; unpublished metrics
	// are still computed and stored.
	Publish bool `json:"publish"`
}
|
||||
|
||||
type LikwidCollectorEventsetConfig struct {
|
||||
Events map[string]string `json:"events"`
|
||||
Metrics []LikwidCollectorMetricConfig `json:"metrics"`
|
||||
}
|
||||
|
||||
type LikwidCollectorConfig struct {
|
||||
Eventsets []LikwidCollectorEventsetConfig `json:"eventsets"`
|
||||
Metrics []LikwidCollectorMetricConfig `json:"globalmetrics"`
|
||||
ExcludeMetrics []string `json:"exclude_metrics"`
|
||||
}
|
||||
|
||||
type LikwidCollector struct {
|
||||
MetricCollector
|
||||
cpulist []C.int
|
||||
sock2tid map[int]int
|
||||
metrics map[C.int]map[string]int
|
||||
groups map[string]C.int
|
||||
groups []C.int
|
||||
config LikwidCollectorConfig
|
||||
results map[int]map[int]map[string]interface{}
|
||||
mresults map[int]map[int]map[string]float64
|
||||
gmresults map[int]map[string]float64
|
||||
basefreq float64
|
||||
}
|
||||
|
||||
type LikwidMetric struct {
|
||||
@@ -33,7 +62,7 @@ type LikwidMetric struct {
|
||||
group_idx int
|
||||
}
|
||||
|
||||
// GROUPPATH is the directory that is handed to LIKWID as the location of its
// performance group files.
const GROUPPATH = `/apps/likwid/5.2.0/share/likwid/perfgroups`
|
||||
|
||||
var likwid_metrics = map[string][]LikwidMetric{
|
||||
"MEM_DP": {LikwidMetric{name: "mem_bw", search: "Memory bandwidth [MBytes/s]", socket_scope: true},
|
||||
@@ -57,6 +86,33 @@ func getMetricId(group C.int, search string) (int, error) {
|
||||
return -1, errors.New(fmt.Sprintf("Cannot find metric for search string '%s' in group %d", search, int(group)))
|
||||
}
|
||||
|
||||
// eventsToEventStr renders an event map as a comma-separated list of
// "<value>:<key>" pairs, the string form that is passed to
// perfmon_addEventSet.
//
// A nil or empty map yields the empty string. Go does not define the
// iteration order of maps, so the order of the pairs may differ between
// calls for maps with more than one entry.
func eventsToEventStr(events map[string]string) string {
	pairs := make([]string, 0, len(events))
	for key, value := range events {
		pairs = append(pairs, value+":"+key)
	}
	return strings.Join(pairs, ",")
}
|
||||
|
||||
func getBaseFreq() float64 {
|
||||
var freq float64 = math.NaN()
|
||||
C.power_init(0)
|
||||
info := C.get_powerInfo()
|
||||
if float64(info.baseFrequency) != 0 {
|
||||
freq = float64(info.baseFrequency)
|
||||
} else {
|
||||
buffer, err := ioutil.ReadFile("/sys/devices/system/cpu/cpu0/cpufreq/bios_limit")
|
||||
if err == nil {
|
||||
data := strings.Replace(string(buffer), "\n", "", -1)
|
||||
x, err := strconv.ParseInt(data, 0, 64)
|
||||
if err == nil {
|
||||
freq = float64(x)*1E3
|
||||
}
|
||||
}
|
||||
}
|
||||
return freq
|
||||
}
|
||||
|
||||
func getSocketCpus() map[C.int]int {
|
||||
slist := SocketList()
|
||||
var cpu C.int
|
||||
@@ -71,9 +127,15 @@ func getSocketCpus() map[C.int]int {
|
||||
return outmap
|
||||
}
|
||||
|
||||
func (m *LikwidCollector) Init() error {
|
||||
func (m *LikwidCollector) Init(config []byte) error {
|
||||
var ret C.int
|
||||
m.name = "LikwidCollector"
|
||||
if len(config) > 0 {
|
||||
err := json.Unmarshal(config, &m.config)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
}
|
||||
m.setup()
|
||||
cpulist := CpuList()
|
||||
m.cpulist = make([]C.int, len(cpulist))
|
||||
@@ -86,161 +148,183 @@ func (m *LikwidCollector) Init() error {
|
||||
m.sock2tid[sid] = i
|
||||
}
|
||||
}
|
||||
m.metrics = make(map[C.int]map[string]int)
|
||||
m.groups = make(map[string]C.int)
|
||||
m.results = make(map[int]map[int]map[string]interface{})
|
||||
m.mresults = make(map[int]map[int]map[string]float64)
|
||||
m.gmresults = make(map[int]map[string]float64)
|
||||
ret = C.topology_init()
|
||||
if ret != 0 {
|
||||
return errors.New("Failed to initialize LIKWID topology")
|
||||
}
|
||||
os.Setenv("LIKWID_FORCE", "1")
|
||||
ret = C.perfmon_init(C.int(len(m.cpulist)), &m.cpulist[0])
|
||||
if ret != 0 {
|
||||
C.topology_finalize()
|
||||
return errors.New("Failed to initialize LIKWID topology")
|
||||
}
|
||||
gpath := C.CString(GROUPPATH)
|
||||
C.config_setGroupPath(gpath)
|
||||
C.free(unsafe.Pointer(gpath))
|
||||
|
||||
for g, metrics := range likwid_metrics {
|
||||
cstr := C.CString(g)
|
||||
gid := C.perfmon_addEventSet(cstr)
|
||||
if gid >= 0 {
|
||||
gmetrics := 0
|
||||
for i, metric := range metrics {
|
||||
idx, err := getMetricId(gid, metric.search)
|
||||
if err != nil {
|
||||
log.Print(err)
|
||||
} else {
|
||||
likwid_metrics[g][i].group_idx = idx
|
||||
gmetrics++
|
||||
}
|
||||
}
|
||||
if gmetrics > 0 {
|
||||
m.groups[g] = gid
|
||||
}
|
||||
} else {
|
||||
log.Print("Failed to add events set ", g)
|
||||
}
|
||||
C.free(unsafe.Pointer(cstr))
|
||||
|
||||
for i, evset := range m.config.Eventsets {
|
||||
estr := eventsToEventStr(evset.Events)
|
||||
cstr := C.CString(estr)
|
||||
gid := C.perfmon_addEventSet(cstr)
|
||||
if gid >= 0 {
|
||||
m.groups = append(m.groups, gid)
|
||||
}
|
||||
C.free(unsafe.Pointer(cstr))
|
||||
m.results[i] = make(map[int]map[string]interface{})
|
||||
m.mresults[i] = make(map[int]map[string]float64)
|
||||
for tid, _ := range m.cpulist {
|
||||
m.results[i][tid] = make(map[string]interface{})
|
||||
m.mresults[i][tid] = make(map[string]float64)
|
||||
m.gmresults[tid] = make(map[string]float64)
|
||||
}
|
||||
}
|
||||
|
||||
if len(m.groups) == 0 {
|
||||
C.perfmon_finalize()
|
||||
C.topology_finalize()
|
||||
return errors.New("No LIKWID performance group initialized")
|
||||
}
|
||||
m.basefreq = getBaseFreq()
|
||||
log.Print(m.basefreq)
|
||||
m.init = true
|
||||
return nil
|
||||
}
|
||||
|
||||
func (m *LikwidCollector) Read(interval time.Duration, out *[]lp.MutableMetric) {
|
||||
if m.init {
|
||||
var ret C.int
|
||||
core_fp_any := make(map[int]float64, len(m.cpulist))
|
||||
for _, cpu := range m.cpulist {
|
||||
core_fp_any[int(cpu)] = 0.0
|
||||
if !m.init {
|
||||
return
|
||||
}
|
||||
var ret C.int
|
||||
|
||||
for i, gid := range m.groups {
|
||||
evset := m.config.Eventsets[i]
|
||||
ret = C.perfmon_setupCounters(gid)
|
||||
if ret != 0 {
|
||||
log.Print("Failed to setup performance group ", C.perfmon_getGroupName(gid))
|
||||
continue
|
||||
}
|
||||
for gname, gid := range m.groups {
|
||||
ret = C.perfmon_setupCounters(gid)
|
||||
if ret != 0 {
|
||||
log.Print("Failed to setup performance group ", gname)
|
||||
continue
|
||||
}
|
||||
ret = C.perfmon_startCounters()
|
||||
if ret != 0 {
|
||||
log.Print("Failed to start performance group ", gname)
|
||||
continue
|
||||
}
|
||||
time.Sleep(interval)
|
||||
ret = C.perfmon_stopCounters()
|
||||
if ret != 0 {
|
||||
log.Print("Failed to stop performance group ", gname)
|
||||
continue
|
||||
}
|
||||
|
||||
for _, lmetric := range likwid_metrics[gname] {
|
||||
if lmetric.name == "pwr1" || lmetric.name == "pwr2" {
|
||||
continue
|
||||
}
|
||||
mname := lmetric.name
|
||||
inverse := false
|
||||
if mname == "cpi" {
|
||||
mname = "ipc"
|
||||
inverse = true
|
||||
}
|
||||
if lmetric.socket_scope {
|
||||
for sid, tid := range m.sock2tid {
|
||||
res := C.perfmon_getLastMetric(gid, C.int(lmetric.group_idx), C.int(tid))
|
||||
y, err := lp.New(lmetric.name,
|
||||
map[string]string{"type": "socket", "type-id": fmt.Sprintf("%d", int(sid))},
|
||||
map[string]interface{}{"value": float64(res)},
|
||||
time.Now())
|
||||
if err == nil {
|
||||
*out = append(*out, y)
|
||||
}
|
||||
// log.Print("Metric '", lmetric.name,"' on Socket ",int(sid)," returns ", m.sockets[int(sid)][lmetric.name])
|
||||
}
|
||||
} else {
|
||||
for tid, cpu := range m.cpulist {
|
||||
res := C.perfmon_getLastMetric(gid, C.int(lmetric.group_idx), C.int(tid))
|
||||
value := float64(res)
|
||||
if inverse {
|
||||
value = 1.0 / value
|
||||
}
|
||||
y, err := lp.New(mname,
|
||||
map[string]string{"type": "cpu", "type-id": fmt.Sprintf("%d", int(cpu))},
|
||||
map[string]interface{}{"value": value},
|
||||
time.Now())
|
||||
if err == nil {
|
||||
*out = append(*out, y)
|
||||
}
|
||||
if lmetric.name == "flops_dp" {
|
||||
core_fp_any[int(cpu)] += 2 * float64(res)
|
||||
}
|
||||
if lmetric.name == "flops_sp" {
|
||||
core_fp_any[int(cpu)] += float64(res)
|
||||
}
|
||||
// log.Print("Metric '", lmetric.name,"' on CPU ",int(cpu)," returns ", m.cpus[int(cpu)][lmetric.name])
|
||||
}
|
||||
}
|
||||
}
|
||||
for sid, tid := range m.sock2tid {
|
||||
sum := 0.0
|
||||
valid := false
|
||||
for _, lmetric := range likwid_metrics[gname] {
|
||||
if lmetric.name == "pwr1" || lmetric.name == "pwr2" {
|
||||
res := C.perfmon_getLastMetric(gid, C.int(lmetric.group_idx), C.int(tid))
|
||||
sum += float64(res)
|
||||
valid = true
|
||||
}
|
||||
}
|
||||
if valid {
|
||||
y, err := lp.New("power",
|
||||
map[string]string{"type": "socket", "type-id": fmt.Sprintf("%d", int(sid))},
|
||||
map[string]interface{}{"value": float64(sum)},
|
||||
time.Now())
|
||||
if err == nil {
|
||||
*out = append(*out, y)
|
||||
}
|
||||
}
|
||||
}
|
||||
for cpu := range m.cpulist {
|
||||
y, err := lp.New("flops_any",
|
||||
map[string]string{"type": "cpu", "type-id": fmt.Sprintf("%d", int(cpu))},
|
||||
map[string]interface{}{"value": float64(core_fp_any[int(cpu)])},
|
||||
time.Now())
|
||||
if err == nil {
|
||||
*out = append(*out, y)
|
||||
}
|
||||
}
|
||||
ret = C.perfmon_startCounters()
|
||||
if ret != 0 {
|
||||
log.Print("Failed to start performance group ", C.perfmon_getGroupName(gid))
|
||||
continue
|
||||
}
|
||||
time.Sleep(interval)
|
||||
ret = C.perfmon_stopCounters()
|
||||
if ret != 0 {
|
||||
log.Print("Failed to stop performance group ", C.perfmon_getGroupName(gid))
|
||||
continue
|
||||
}
|
||||
var eidx C.int
|
||||
for tid, _ := range m.cpulist {
|
||||
for eidx = 0; int(eidx) < len(evset.Events); eidx++ {
|
||||
ctr := C.perfmon_getCounterName(gid, eidx)
|
||||
gctr := C.GoString(ctr)
|
||||
res := C.perfmon_getLastResult(gid, eidx, C.int(tid))
|
||||
m.results[i][tid][gctr] = float64(res)
|
||||
}
|
||||
m.results[i][tid]["time"] = float64(interval)
|
||||
m.results[i][tid]["inverseClock"] = float64(1.0/m.basefreq)
|
||||
for _, metric := range evset.Metrics {
|
||||
expression, err := govaluate.NewEvaluableExpression(metric.Calc)
|
||||
if err != nil {
|
||||
log.Print(err.Error())
|
||||
continue
|
||||
}
|
||||
result, err := expression.Evaluate(m.results[i][tid]);
|
||||
if err != nil {
|
||||
log.Print(err.Error())
|
||||
continue
|
||||
}
|
||||
m.mresults[i][tid][metric.Name] = float64(result.(float64))
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
for _, metric := range m.config.Metrics {
|
||||
for tid, _ := range m.cpulist {
|
||||
var params map[string]interface{}
|
||||
expression, err := govaluate.NewEvaluableExpression(metric.Calc)
|
||||
if err != nil {
|
||||
log.Print(err.Error())
|
||||
continue
|
||||
}
|
||||
params = make(map[string]interface{})
|
||||
for j, _ := range m.groups {
|
||||
for mname, mres := range m.mresults[j][tid] {
|
||||
params[mname] = mres
|
||||
}
|
||||
}
|
||||
result, err := expression.Evaluate(params);
|
||||
if err != nil {
|
||||
log.Print(err.Error())
|
||||
continue
|
||||
}
|
||||
m.gmresults[tid][metric.Name] = float64(result.(float64))
|
||||
}
|
||||
}
|
||||
for i, _ := range m.groups {
|
||||
evset := m.config.Eventsets[i]
|
||||
for _, metric := range evset.Metrics {
|
||||
_, skip := stringArrayContains(m.config.ExcludeMetrics, metric.Name)
|
||||
if metric.Publish && !skip {
|
||||
if metric.Socket_scope {
|
||||
for sid, tid := range m.sock2tid {
|
||||
y, err := lp.New(metric.Name,
|
||||
map[string]string{"type": "socket", "type-id": fmt.Sprintf("%d", int(sid))},
|
||||
map[string]interface{}{"value": m.mresults[i][tid][metric.Name]},
|
||||
time.Now())
|
||||
if err == nil {
|
||||
*out = append(*out, y)
|
||||
}
|
||||
}
|
||||
} else {
|
||||
for tid, cpu := range m.cpulist {
|
||||
y, err := lp.New(metric.Name,
|
||||
map[string]string{"type": "cpu", "type-id": fmt.Sprintf("%d", int(cpu))},
|
||||
map[string]interface{}{"value": m.mresults[i][tid][metric.Name]},
|
||||
time.Now())
|
||||
if err == nil {
|
||||
*out = append(*out, y)
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
for _, metric := range m.config.Metrics {
|
||||
_, skip := stringArrayContains(m.config.ExcludeMetrics, metric.Name)
|
||||
if metric.Publish && !skip {
|
||||
if metric.Socket_scope {
|
||||
for sid, tid := range m.sock2tid {
|
||||
y, err := lp.New(metric.Name,
|
||||
map[string]string{"type": "socket", "type-id": fmt.Sprintf("%d", int(sid))},
|
||||
map[string]interface{}{"value": m.gmresults[tid][metric.Name]},
|
||||
time.Now())
|
||||
if err == nil {
|
||||
*out = append(*out, y)
|
||||
}
|
||||
}
|
||||
} else {
|
||||
for tid, cpu := range m.cpulist {
|
||||
y, err := lp.New(metric.Name,
|
||||
map[string]string{"type": "cpu", "type-id": fmt.Sprintf("%d", int(cpu))},
|
||||
map[string]interface{}{"value": m.gmresults[tid][metric.Name]},
|
||||
time.Now())
|
||||
if err == nil {
|
||||
*out = append(*out, y)
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
func (m *LikwidCollector) Close() {
|
||||
if m.init {
|
||||
m.init = false
|
||||
C.perfmon_finalize()
|
||||
C.topology_finalize()
|
||||
m.init = false
|
||||
}
|
||||
return
|
||||
}
|
||||
|
Reference in New Issue
Block a user