mirror of
https://github.com/ClusterCockpit/cc-metric-collector.git
synced 2025-01-24 21:09:06 +01:00
200af84c54
* Use channels, add a metric router, split up configuration and use extended version of Influx line protocol internally * Use central timer for collectors and router. Add expressions to router * Add expression to router config * Update entry points * Start with README * Update README for CCMetric * Formatting * Update README.md * Add README for MultiChanTicker * Add README for MultiChanTicker * Update README.md * Add README to metric router * Update main README * Remove SinkEntity type * Update README for sinks * Update go files * Update README for receivers * Update collectors README * Update collectors README * Use seperate page per collector * Fix for tempstat page * Add docs for customcmd collector * Add docs for ipmistat collector * Add docs for topprocs collector * Update customCmdMetric.md * Use seconds when calculating LIKWID metrics * Add IB metrics ib_recv_pkts and ib_xmit_pkts * Drop domain part of host name * Updated to latest stable version of likwid * Define source code dependencies in Makefile * Add GPFS / IBM Spectrum Scale collector * Add vet and staticcheck make targets * Add vet and staticcheck make targets * Avoid go vet warning: struct field tag `json:"..., omitempty"` not compatible with reflect.StructTag.Get: suspicious space in struct tag value struct field tag `json:"...", omitempty` not compatible with reflect.StructTag.Get: key:"value" pairs not separated by spaces * Add sample collector to README.md * Add CPU frequency collector * Avoid staticcheck warning: redundant return statement * Avoid staticcheck warning: unnecessary assignment to the blank identifier * Simplified code * Add CPUFreqCollectorCpuinfo a metric collector to measure the current frequency of the CPUs as obtained from /proc/cpuinfo Only measure on the first hyperthread * Add collector for NFS clients * Move publication of metrics into Flush() for NatsSink * Update GitHub actions * Refactoring * Avoid vet warning: Println arg list ends with redundant newline * Avoid vet 
warning struct field commands has json tag but is not exported * Avoid vet warning: return copies lock value. * Corrected typo * Refactoring * Add go sources in internal/... * Bad separator in Makefile * Fix Infiniband collector Co-authored-by: Holger Obermaier <40787752+ho-ob@users.noreply.github.com>
330 lines
8.2 KiB
Go
330 lines
8.2 KiB
Go
package collectors
|
|
|
|
/*
|
|
#cgo CFLAGS: -I./likwid
|
|
#cgo LDFLAGS: -L./likwid -llikwid -llikwid-hwloc -lm
|
|
#include <stdlib.h>
|
|
#include <likwid.h>
|
|
*/
|
|
import "C"
|
|
|
|
import (
	"encoding/json"
	"errors"
	"fmt"
	"io/ioutil"
	"log"
	"math"
	"os"
	"sort"
	"strconv"
	"strings"
	"time"
	"unsafe"

	lp "github.com/ClusterCockpit/cc-metric-collector/internal/ccMetric"
	"gopkg.in/Knetic/govaluate.v2"
)
|
|
|
|
// MetricScope identifies the hardware level a LIKWID metric is reported for.
type MetricScope int

// Supported metric scopes. The constants are deliberately left untyped, as in
// the original declaration, so existing uses as plain integers keep compiling.
const (
	METRIC_SCOPE_HWTHREAD = iota
	METRIC_SCOPE_SOCKET
	METRIC_SCOPE_NUMA
	METRIC_SCOPE_NODE
)

// String returns the lower-case name of the scope: "hwthread", "socket",
// "numa" or "node". Values outside the declared constants yield "unknown"
// instead of panicking with an index-out-of-range error.
func (ms MetricScope) String() string {
	switch ms {
	case METRIC_SCOPE_HWTHREAD:
		return "hwthread"
	case METRIC_SCOPE_SOCKET:
		return "socket"
	case METRIC_SCOPE_NUMA:
		return "numa"
	case METRIC_SCOPE_NODE:
		return "node"
	default:
		return "unknown"
	}
}
|
|
|
|
type LikwidCollectorMetricConfig struct {
|
|
Name string `json:"name"`
|
|
Calc string `json:"calc"`
|
|
Scope MetricScope `json:"socket_scope"`
|
|
Publish bool `json:"publish"`
|
|
}
|
|
|
|
type LikwidCollectorEventsetConfig struct {
|
|
Events map[string]string `json:"events"`
|
|
Metrics []LikwidCollectorMetricConfig `json:"metrics"`
|
|
}
|
|
|
|
type LikwidCollectorConfig struct {
|
|
Eventsets []LikwidCollectorEventsetConfig `json:"eventsets"`
|
|
Metrics []LikwidCollectorMetricConfig `json:"globalmetrics"`
|
|
ExcludeMetrics []string `json:"exclude_metrics"`
|
|
ForceOverwrite bool `json:"force_overwrite"`
|
|
}
|
|
|
|
type LikwidCollector struct {
|
|
metricCollector
|
|
cpulist []C.int
|
|
sock2tid map[int]int
|
|
metrics map[C.int]map[string]int
|
|
groups []C.int
|
|
config LikwidCollectorConfig
|
|
results map[int]map[int]map[string]interface{}
|
|
mresults map[int]map[int]map[string]float64
|
|
gmresults map[int]map[string]float64
|
|
basefreq float64
|
|
}
|
|
|
|
// LikwidMetric bundles the description of a single LIKWID metric.
// NOTE(review): the type is not referenced by the code visible in this file
// section; the field meanings below are taken from the field names only.
type LikwidMetric struct {
	name         string // metric name
	search       string // search string used to locate the metric
	socket_scope bool   // presumably true for socket-level metrics
	group_idx    int    // presumably the index of the owning group
}
|
|
|
|
// eventsToEventStr converts an event map into a LIKWID event string: every
// entry becomes "<value>:<key>" and the entries are joined with commas.
//
// The entries are emitted in ascending key order. Go randomizes map
// iteration, so without sorting the same configuration would produce a
// different string on every call. An empty or nil map yields "".
func eventsToEventStr(events map[string]string) string {
	keys := make([]string, 0, len(events))
	for k := range events {
		keys = append(keys, k)
	}
	sort.Strings(keys)

	elist := make([]string, 0, len(keys))
	for _, k := range keys {
		elist = append(elist, events[k]+":"+k)
	}
	return strings.Join(elist, ",")
}
|
|
|
|
func getBaseFreq() float64 {
|
|
var freq float64 = math.NaN()
|
|
C.power_init(0)
|
|
info := C.get_powerInfo()
|
|
if float64(info.baseFrequency) != 0 {
|
|
freq = float64(info.baseFrequency)
|
|
} else {
|
|
buffer, err := ioutil.ReadFile("/sys/devices/system/cpu/cpu0/cpufreq/bios_limit")
|
|
if err == nil {
|
|
data := strings.Replace(string(buffer), "\n", "", -1)
|
|
x, err := strconv.ParseInt(data, 0, 64)
|
|
if err == nil {
|
|
freq = float64(x) * 1e3
|
|
}
|
|
}
|
|
}
|
|
return freq
|
|
}
|
|
|
|
func getSocketCpus() map[C.int]int {
|
|
slist := SocketList()
|
|
var cpu C.int
|
|
outmap := make(map[C.int]int)
|
|
for _, s := range slist {
|
|
t := C.CString(fmt.Sprintf("S%d", s))
|
|
clen := C.cpustr_to_cpulist(t, &cpu, 1)
|
|
if int(clen) == 1 {
|
|
outmap[cpu] = s
|
|
}
|
|
}
|
|
return outmap
|
|
}
|
|
|
|
func (m *LikwidCollector) Init(config json.RawMessage) error {
|
|
var ret C.int
|
|
m.name = "LikwidCollector"
|
|
if len(config) > 0 {
|
|
err := json.Unmarshal(config, &m.config)
|
|
if err != nil {
|
|
return err
|
|
}
|
|
}
|
|
m.setup()
|
|
m.meta = map[string]string{"source": m.name, "group": "PerfCounter"}
|
|
cpulist := CpuList()
|
|
m.cpulist = make([]C.int, len(cpulist))
|
|
slist := getSocketCpus()
|
|
|
|
m.sock2tid = make(map[int]int)
|
|
// m.numa2tid = make(map[int]int)
|
|
for i, c := range cpulist {
|
|
m.cpulist[i] = C.int(c)
|
|
if sid, found := slist[m.cpulist[i]]; found {
|
|
m.sock2tid[sid] = i
|
|
}
|
|
}
|
|
m.results = make(map[int]map[int]map[string]interface{})
|
|
m.mresults = make(map[int]map[int]map[string]float64)
|
|
m.gmresults = make(map[int]map[string]float64)
|
|
ret = C.topology_init()
|
|
if ret != 0 {
|
|
return errors.New("Failed to initialize LIKWID topology")
|
|
}
|
|
if m.config.ForceOverwrite {
|
|
os.Setenv("LIKWID_FORCE", "1")
|
|
}
|
|
ret = C.perfmon_init(C.int(len(m.cpulist)), &m.cpulist[0])
|
|
if ret != 0 {
|
|
C.topology_finalize()
|
|
return errors.New("Failed to initialize LIKWID topology")
|
|
}
|
|
|
|
for i, evset := range m.config.Eventsets {
|
|
estr := eventsToEventStr(evset.Events)
|
|
cstr := C.CString(estr)
|
|
gid := C.perfmon_addEventSet(cstr)
|
|
if gid >= 0 {
|
|
m.groups = append(m.groups, gid)
|
|
}
|
|
C.free(unsafe.Pointer(cstr))
|
|
m.results[i] = make(map[int]map[string]interface{})
|
|
m.mresults[i] = make(map[int]map[string]float64)
|
|
for tid := range m.cpulist {
|
|
m.results[i][tid] = make(map[string]interface{})
|
|
m.mresults[i][tid] = make(map[string]float64)
|
|
m.gmresults[tid] = make(map[string]float64)
|
|
}
|
|
}
|
|
|
|
if len(m.groups) == 0 {
|
|
C.perfmon_finalize()
|
|
C.topology_finalize()
|
|
return errors.New("No LIKWID performance group initialized")
|
|
}
|
|
m.basefreq = getBaseFreq()
|
|
m.init = true
|
|
return nil
|
|
}
|
|
|
|
func (m *LikwidCollector) Read(interval time.Duration, output chan lp.CCMetric) {
|
|
if !m.init {
|
|
return
|
|
}
|
|
var ret C.int
|
|
|
|
for i, gid := range m.groups {
|
|
evset := m.config.Eventsets[i]
|
|
ret = C.perfmon_setupCounters(gid)
|
|
if ret != 0 {
|
|
log.Print("Failed to setup performance group ", C.perfmon_getGroupName(gid))
|
|
continue
|
|
}
|
|
ret = C.perfmon_startCounters()
|
|
if ret != 0 {
|
|
log.Print("Failed to start performance group ", C.perfmon_getGroupName(gid))
|
|
continue
|
|
}
|
|
time.Sleep(interval)
|
|
ret = C.perfmon_stopCounters()
|
|
if ret != 0 {
|
|
log.Print("Failed to stop performance group ", C.perfmon_getGroupName(gid))
|
|
continue
|
|
}
|
|
var eidx C.int
|
|
for tid := range m.cpulist {
|
|
for eidx = 0; int(eidx) < len(evset.Events); eidx++ {
|
|
ctr := C.perfmon_getCounterName(gid, eidx)
|
|
gctr := C.GoString(ctr)
|
|
res := C.perfmon_getLastResult(gid, eidx, C.int(tid))
|
|
m.results[i][tid][gctr] = float64(res)
|
|
}
|
|
m.results[i][tid]["time"] = interval.Seconds()
|
|
m.results[i][tid]["inverseClock"] = float64(1.0 / m.basefreq)
|
|
for _, metric := range evset.Metrics {
|
|
expression, err := govaluate.NewEvaluableExpression(metric.Calc)
|
|
if err != nil {
|
|
log.Print(err.Error())
|
|
continue
|
|
}
|
|
result, err := expression.Evaluate(m.results[i][tid])
|
|
if err != nil {
|
|
log.Print(err.Error())
|
|
continue
|
|
}
|
|
m.mresults[i][tid][metric.Name] = float64(result.(float64))
|
|
}
|
|
}
|
|
}
|
|
|
|
for _, metric := range m.config.Metrics {
|
|
for tid := range m.cpulist {
|
|
var params map[string]interface{}
|
|
expression, err := govaluate.NewEvaluableExpression(metric.Calc)
|
|
if err != nil {
|
|
log.Print(err.Error())
|
|
continue
|
|
}
|
|
params = make(map[string]interface{})
|
|
for j := range m.groups {
|
|
for mname, mres := range m.mresults[j][tid] {
|
|
params[mname] = mres
|
|
}
|
|
}
|
|
result, err := expression.Evaluate(params)
|
|
if err != nil {
|
|
log.Print(err.Error())
|
|
continue
|
|
}
|
|
m.gmresults[tid][metric.Name] = float64(result.(float64))
|
|
}
|
|
}
|
|
for i := range m.groups {
|
|
evset := m.config.Eventsets[i]
|
|
for _, metric := range evset.Metrics {
|
|
_, skip := stringArrayContains(m.config.ExcludeMetrics, metric.Name)
|
|
if metric.Publish && !skip {
|
|
if metric.Scope.String() == "socket" {
|
|
for sid, tid := range m.sock2tid {
|
|
y, err := lp.New(metric.Name,
|
|
map[string]string{"type": "socket",
|
|
"type-id": fmt.Sprintf("%d", int(sid))},
|
|
m.meta,
|
|
map[string]interface{}{"value": m.mresults[i][tid][metric.Name]},
|
|
time.Now())
|
|
if err == nil {
|
|
output <- y
|
|
}
|
|
}
|
|
} else if metric.Scope.String() == "hwthread" {
|
|
for tid, cpu := range m.cpulist {
|
|
y, err := lp.New(metric.Name,
|
|
map[string]string{"type": "cpu",
|
|
"type-id": fmt.Sprintf("%d", int(cpu))},
|
|
m.meta,
|
|
map[string]interface{}{"value": m.mresults[i][tid][metric.Name]},
|
|
time.Now())
|
|
if err == nil {
|
|
output <- y
|
|
}
|
|
}
|
|
}
|
|
}
|
|
}
|
|
}
|
|
for _, metric := range m.config.Metrics {
|
|
_, skip := stringArrayContains(m.config.ExcludeMetrics, metric.Name)
|
|
if metric.Publish && !skip {
|
|
if metric.Scope.String() == "socket" {
|
|
for sid, tid := range m.sock2tid {
|
|
y, err := lp.New(metric.Name,
|
|
map[string]string{"type": "socket",
|
|
"type-id": fmt.Sprintf("%d", int(sid))},
|
|
m.meta,
|
|
map[string]interface{}{"value": m.gmresults[tid][metric.Name]},
|
|
time.Now())
|
|
if err == nil {
|
|
output <- y
|
|
}
|
|
}
|
|
} else {
|
|
for tid, cpu := range m.cpulist {
|
|
y, err := lp.New(metric.Name,
|
|
map[string]string{"type": "cpu",
|
|
"type-id": fmt.Sprintf("%d", int(cpu))},
|
|
m.meta,
|
|
map[string]interface{}{"value": m.gmresults[tid][metric.Name]},
|
|
time.Now())
|
|
if err == nil {
|
|
output <- y
|
|
}
|
|
}
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
func (m *LikwidCollector) Close() {
|
|
if m.init {
|
|
m.init = false
|
|
C.perfmon_finalize()
|
|
C.topology_finalize()
|
|
}
|
|
}
|