mirror of
https://github.com/ClusterCockpit/cc-metric-collector.git
synced 2024-11-14 05:57:25 +01:00
6ab45dd3ec
* Add cpu_used (all-cpu_idle) to CpustatCollector * Update to line-protocol/v2 * Update runonce.yml with Golang 1.20 * Update fsnotify in LIKWID Collector * Use not a pointer to line-protocol.Encoder * Simplify Makefile * Use only as many arguments as required * Allow sum function to handle non float types * Allow values to be a slice of type float64, float32, int, int64, int32, bool * Use generic function to simplify code * Add missing case for type []int32 * Use generic function to compute minimum * Use generic function to compute maximum * Use generic function to compute average * Add error value to sumAnyType * Use generic function to compute median * For older versions of go slices is not part of the installation * Remove old entries from go.sum * Use simpler sort function * Compute metrics ib_total and ib_total_pkts * Add aggregated metrics. Add missing units * Update likwidMetric.go Fixes a potential bug when `fsnotify.NewWatcher()` fails with an error * Completly avoid memory allocations in infinibandMetric read() * Fixed initialization: Initalization and measurements should run in the same thread * Add safe.directory to Release action * Fix path after installation to /usr/bin after installation * ioutil.ReadFile is deprecated: As of Go 1.16, this function simply calls os.ReadFile * Switch to package slices from the golang 1.21 default library * Read file line by line * Read file line by line * Read file line by line * Use CamelCase * Use CamelCase * Fix function getNumaDomain, it always returned 0 * Avoid type conversion by using Atoi Avoid copying structs by using pointer access Increase readability with CamelCase variable names * Add caching * Cache CpuData * Cleanup * Use init function to initalize cache structure to avoid multi threading problems * Reuse information from /proc/cpuinfo * Avoid slice cloning. 
Directly use the cache * Add DieList * Add NumaDomainList and SMTList * Cleanup * Add comment * Lookup core ID from /sys/devices/system/cpu, /proc/cpuinfo is not portable * Lookup all information from /sys/devices/system/cpu, /proc/cpuinfo is not portable * Correctly handle lists from /sys * Add Simultaneous Multithreading siblings * Replace deprecated thread_siblings_list by core_cpus_list * Reduce number of required slices * Allow to send total values per core, socket and node * Send all metrics with same time stamp calcEventsetMetrics does only computiation, counter measurement is done before * Input parameters should be float64 when evaluating to float64 * Send all metrics with same time stamp calcGlobalMetrics does only computiation, counter measurement is done before * Remove unused variable gmresults * Add comments * Updated go packages * Add build with golang 1.21 * Switch to checkout action version 4 * Switch to setup-go action version 4 * Add workflow_dispatch to allow manual run of workflow * Add workflow_dispatch to allow manual run of workflow * Add release build jobs to runonce.yml * Switch to golang 1.20 for RHEL based distributions * Use dnf to download golang * Remove golang versions before 1.20 * Upgrade Ubuntu focal -> jammy * Pipe golang tar package directly to tar * Update golang version * Fix Ubuntu version number * Add links to ipmi and redfish receivers * Fix http server addr format * github.com/influxdata/line-protocol -> github.com/influxdata/line-protocol/v2/lineprotocol * Corrected spelling * Add some comments * github.com/influxdata/line-protocol -> github.com/influxdata/line-protocol/v2/lineprotocol * Allow other fields not only field "value" * Add some basic debugging documentation * Add some basic debugging documentation * Use a lock for the flush timer * Add tags in lexical order as required by AddTag() * Only access meta data, when it gets used as tag * Use slice to store lexialicly orderd key value pairs * Increase golang version 
requirement to 1.20. * Avoid package cmp to allow builds with golang v1.20 * Fix: Error NVML library not found did crash cc-metric-collector with "SIGSEGV: segmentation violation" * Add config option idle_timeout * Add basic authentication support * Add basic authentication support * Avoid unneccessary memory allocations * Add documentation for send_*_total values * Use generic package maps to clone maps * Reuse flush timer * Add Influx client options * Reuse ccTopology functionality * Do not store unused topology information * Add batch_size config * Cleanup * Use stype and stype-id for the NIC in NetstatCollector * Wait for concurrent flush operations to finish * Be more verbose in error messages * Reverted previous changes. Made the code to complex without much advantages * Use line protocol encoder * Go pkg update * Stop flush timer, when immediatelly flushing * Fix: Corrected unlock access to batch slice * Add config option to specify whether to use GZip compression in influx write requests * Add asynchron send of encoder metrics * Use DefaultServeMux instead of github.com/gorilla/mux * Add config option for HTTP keep-alives * Be more strict, when parsing json * Add config option for HTTP request timeout and Retry interval * Allow more then one background send operation * Fix %sysusers_create_package args (#108) %sysusers_create_package requires two arguments. See: https://github.com/systemd/systemd/blob/main/src/rpm/macros.systemd.in#L165 * Add nfsiostat to list of collectors --------- Co-authored-by: Holger Obermaier <40787752+ho-ob@users.noreply.github.com> Co-authored-by: Holger Obermaier <holgerob@gmx.de> Co-authored-by: Obihörnchen <obihoernchende@gmail.com>
816 lines
21 KiB
Go
816 lines
21 KiB
Go
package collectors
|
|
|
|
/*
|
|
#cgo CFLAGS: -I./likwid
|
|
#cgo LDFLAGS: -Wl,--unresolved-symbols=ignore-in-object-files
|
|
#include <stdlib.h>
|
|
#include <likwid.h>
|
|
*/
|
|
import "C"
|
|
|
|
import (
|
|
"encoding/json"
|
|
"errors"
|
|
"fmt"
|
|
"math"
|
|
"os"
|
|
"os/signal"
|
|
"os/user"
|
|
"sort"
|
|
"strconv"
|
|
"strings"
|
|
"sync"
|
|
"syscall"
|
|
"time"
|
|
"unsafe"
|
|
|
|
agg "github.com/ClusterCockpit/cc-metric-collector/internal/metricAggregator"
|
|
cclog "github.com/ClusterCockpit/cc-metric-collector/pkg/ccLogger"
|
|
lp "github.com/ClusterCockpit/cc-metric-collector/pkg/ccMetric"
|
|
topo "github.com/ClusterCockpit/cc-metric-collector/pkg/ccTopology"
|
|
"github.com/NVIDIA/go-nvml/pkg/dl"
|
|
"github.com/fsnotify/fsnotify"
|
|
"golang.design/x/thread"
|
|
)
|
|
|
|
const (
|
|
LIKWID_LIB_NAME = "liblikwid.so"
|
|
LIKWID_LIB_DL_FLAGS = dl.RTLD_LAZY | dl.RTLD_GLOBAL
|
|
LIKWID_DEF_ACCESSMODE = "direct"
|
|
LIKWID_DEF_LOCKFILE = "/var/run/likwid.lock"
|
|
)
|
|
|
|
// LikwidCollectorMetricConfig is the JSON configuration of a single derived metric.
type LikwidCollectorMetricConfig struct {
	// Name of the metric
	Name string `json:"name"`
	// Formula used to calculate the metric value
	Calc string `json:"calc"`
	// Metric type (aka node, socket, hwthread, ...)
	Type string `json:"type"`
	// Send the metric value itself only when set
	Publish bool `json:"publish"`
	// Additionally send the per core sum of the metric values
	SendCoreTotalVal bool `json:"send_core_total_values,omitempty"`
	// Additionally send the per socket sum of the metric values
	SendSocketTotalVal bool `json:"send_socket_total_values,omitempty"`
	// Additionally send the node wide sum of the metric values
	SendNodeTotalVal bool `json:"send_node_total_values,omitempty"`
	// Unit of metric if any
	Unit string `json:"unit"`
}
|
|
|
|
type LikwidCollectorEventsetConfig struct {
|
|
Events map[string]string `json:"events"`
|
|
Metrics []LikwidCollectorMetricConfig `json:"metrics"`
|
|
}
|
|
|
|
type LikwidEventsetConfig struct {
|
|
internal int
|
|
gid C.int
|
|
eorder []*C.char
|
|
estr *C.char
|
|
go_estr string
|
|
results map[int]map[string]float64
|
|
metrics map[int]map[string]float64
|
|
}
|
|
|
|
type LikwidCollectorConfig struct {
|
|
Eventsets []LikwidCollectorEventsetConfig `json:"eventsets"`
|
|
Metrics []LikwidCollectorMetricConfig `json:"globalmetrics,omitempty"`
|
|
ForceOverwrite bool `json:"force_overwrite,omitempty"`
|
|
InvalidToZero bool `json:"invalid_to_zero,omitempty"`
|
|
AccessMode string `json:"access_mode,omitempty"`
|
|
DaemonPath string `json:"accessdaemon_path,omitempty"`
|
|
LibraryPath string `json:"liblikwid_path,omitempty"`
|
|
LockfilePath string `json:"lockfile_path,omitempty"`
|
|
}
|
|
|
|
type LikwidCollector struct {
|
|
metricCollector
|
|
cpulist []C.int
|
|
cpu2tid map[int]int
|
|
sock2tid map[int]int
|
|
tid2core map[int]int
|
|
tid2socket map[int]int
|
|
metrics map[C.int]map[string]int
|
|
groups []C.int
|
|
config LikwidCollectorConfig
|
|
basefreq float64
|
|
running bool
|
|
initialized bool
|
|
needs_reinit bool
|
|
likwidGroups map[C.int]LikwidEventsetConfig
|
|
lock sync.Mutex
|
|
measureThread thread.Thread
|
|
}
|
|
|
|
// LikwidMetric describes a single metric.
// NOTE(review): this type is not referenced by the code visible in this file,
// the field meanings below are derived from the field names only — confirm before relying on them.
type LikwidMetric struct {
	name      string // presumably the metric name
	search    string // presumably a search string used to look up the metric
	scope     string // presumably the metric scope (node, socket, ...)
	group_idx int    // presumably the index of the event group the metric belongs to
}
|
|
|
|
// checkMetricType reports whether t is one of the metric types accepted by
// the collector: node, socket, hwthread, core or memoryDomain.
func checkMetricType(t string) bool {
	switch t {
	case "node", "socket", "hwthread", "core", "memoryDomain":
		return true
	default:
		return false
	}
}
|
|
|
|
// eventsToEventStr converts a counter -> event map into the event string
// "<event>:<counter>,<event>:<counter>,...".
//
// The entries are ordered by counter name, so the same configuration always
// yields the same string (map iteration order is random in Go).
// An empty or nil map results in an empty string.
func eventsToEventStr(events map[string]string) string {
	counters := make([]string, 0, len(events))
	for counter := range events {
		counters = append(counters, counter)
	}
	sort.Strings(counters)

	elist := make([]string, 0, len(counters))
	for _, counter := range counters {
		elist = append(elist, events[counter]+":"+counter)
	}
	return strings.Join(elist, ",")
}
|
|
|
|
func genLikwidEventSet(input LikwidCollectorEventsetConfig) LikwidEventsetConfig {
|
|
tmplist := make([]string, 0)
|
|
clist := make([]string, 0)
|
|
for k := range input.Events {
|
|
clist = append(clist, k)
|
|
}
|
|
sort.Strings(clist)
|
|
elist := make([]*C.char, 0)
|
|
for _, k := range clist {
|
|
v := input.Events[k]
|
|
tmplist = append(tmplist, fmt.Sprintf("%s:%s", v, k))
|
|
c_counter := C.CString(k)
|
|
elist = append(elist, c_counter)
|
|
}
|
|
estr := strings.Join(tmplist, ",")
|
|
res := make(map[int]map[string]float64)
|
|
met := make(map[int]map[string]float64)
|
|
for _, i := range topo.CpuList() {
|
|
res[i] = make(map[string]float64)
|
|
for k := range input.Events {
|
|
res[i][k] = 0.0
|
|
}
|
|
met[i] = make(map[string]float64)
|
|
for _, v := range input.Metrics {
|
|
res[i][v.Name] = 0.0
|
|
}
|
|
}
|
|
return LikwidEventsetConfig{
|
|
gid: -1,
|
|
eorder: elist,
|
|
estr: C.CString(estr),
|
|
go_estr: estr,
|
|
results: res,
|
|
metrics: met,
|
|
}
|
|
}
|
|
|
|
func testLikwidMetricFormula(formula string, params []string) bool {
|
|
myparams := make(map[string]float64)
|
|
for _, p := range params {
|
|
myparams[p] = float64(1.0)
|
|
}
|
|
_, err := agg.EvalFloat64Condition(formula, myparams)
|
|
return err == nil
|
|
}
|
|
|
|
func getBaseFreq() float64 {
|
|
files := []string{
|
|
"/sys/devices/system/cpu/cpu0/cpufreq/bios_limit",
|
|
"/sys/devices/system/cpu/cpu0/cpufreq/base_frequency",
|
|
}
|
|
var freq float64 = math.NaN()
|
|
for _, f := range files {
|
|
buffer, err := os.ReadFile(f)
|
|
if err == nil {
|
|
data := strings.Replace(string(buffer), "\n", "", -1)
|
|
x, err := strconv.ParseInt(data, 0, 64)
|
|
if err == nil {
|
|
freq = float64(x)
|
|
break
|
|
}
|
|
}
|
|
}
|
|
|
|
if math.IsNaN(freq) {
|
|
C.power_init(0)
|
|
info := C.get_powerInfo()
|
|
if float64(info.baseFrequency) != 0 {
|
|
freq = float64(info.baseFrequency)
|
|
}
|
|
C.power_finalize()
|
|
}
|
|
return freq * 1e3
|
|
}
|
|
|
|
func (m *LikwidCollector) Init(config json.RawMessage) error {
|
|
m.name = "LikwidCollector"
|
|
m.parallel = false
|
|
m.initialized = false
|
|
m.needs_reinit = true
|
|
m.running = false
|
|
m.config.AccessMode = LIKWID_DEF_ACCESSMODE
|
|
m.config.LibraryPath = LIKWID_LIB_NAME
|
|
m.config.LockfilePath = LIKWID_DEF_LOCKFILE
|
|
if len(config) > 0 {
|
|
err := json.Unmarshal(config, &m.config)
|
|
if err != nil {
|
|
return err
|
|
}
|
|
}
|
|
lib := dl.New(m.config.LibraryPath, LIKWID_LIB_DL_FLAGS)
|
|
if lib == nil {
|
|
return fmt.Errorf("error instantiating DynamicLibrary for %s", m.config.LibraryPath)
|
|
}
|
|
err := lib.Open()
|
|
if err != nil {
|
|
return fmt.Errorf("error opening %s: %v", m.config.LibraryPath, err)
|
|
}
|
|
|
|
if m.config.ForceOverwrite {
|
|
cclog.ComponentDebug(m.name, "Set LIKWID_FORCE=1")
|
|
os.Setenv("LIKWID_FORCE", "1")
|
|
}
|
|
m.setup()
|
|
|
|
m.meta = map[string]string{"group": "PerfCounter"}
|
|
cclog.ComponentDebug(m.name, "Get cpulist and init maps and lists")
|
|
cpulist := topo.HwthreadList()
|
|
m.cpulist = make([]C.int, len(cpulist))
|
|
m.cpu2tid = make(map[int]int)
|
|
for i, c := range cpulist {
|
|
m.cpulist[i] = C.int(c)
|
|
m.cpu2tid[c] = i
|
|
}
|
|
|
|
m.likwidGroups = make(map[C.int]LikwidEventsetConfig)
|
|
|
|
// This is for the global metrics computation test
|
|
totalMetrics := 0
|
|
// Generate parameter list for the metric computing test
|
|
params := make([]string, 0)
|
|
params = append(params, "time", "inverseClock")
|
|
// Generate parameter list for the global metric computing test
|
|
globalParams := make([]string, 0)
|
|
globalParams = append(globalParams, "time", "inverseClock")
|
|
// We test the eventset metrics whether they can be computed at all
|
|
for _, evset := range m.config.Eventsets {
|
|
if len(evset.Events) > 0 {
|
|
params = params[:2]
|
|
for counter := range evset.Events {
|
|
params = append(params, counter)
|
|
}
|
|
for _, metric := range evset.Metrics {
|
|
// Try to evaluate the metric
|
|
cclog.ComponentDebug(m.name, "Checking", metric.Name)
|
|
if !checkMetricType(metric.Type) {
|
|
cclog.ComponentError(m.name, "Metric", metric.Name, "uses invalid type", metric.Type)
|
|
metric.Calc = ""
|
|
} else if !testLikwidMetricFormula(metric.Calc, params) {
|
|
cclog.ComponentError(m.name, "Metric", metric.Name, "cannot be calculated with given counters")
|
|
metric.Calc = ""
|
|
} else {
|
|
globalParams = append(globalParams, metric.Name)
|
|
totalMetrics++
|
|
}
|
|
}
|
|
} else {
|
|
cclog.ComponentError(m.name, "Invalid Likwid eventset config, no events given")
|
|
continue
|
|
}
|
|
}
|
|
for _, metric := range m.config.Metrics {
|
|
// Try to evaluate the global metric
|
|
if !checkMetricType(metric.Type) {
|
|
cclog.ComponentError(m.name, "Metric", metric.Name, "uses invalid type", metric.Type)
|
|
metric.Calc = ""
|
|
} else if !testLikwidMetricFormula(metric.Calc, globalParams) {
|
|
cclog.ComponentError(m.name, "Metric", metric.Name, "cannot be calculated with given counters")
|
|
metric.Calc = ""
|
|
} else if !checkMetricType(metric.Type) {
|
|
cclog.ComponentError(m.name, "Metric", metric.Name, "has invalid type")
|
|
metric.Calc = ""
|
|
} else {
|
|
totalMetrics++
|
|
}
|
|
}
|
|
|
|
// If no event set could be added, shut down LikwidCollector
|
|
if totalMetrics == 0 {
|
|
err := errors.New("no LIKWID eventset or metric usable")
|
|
cclog.ComponentError(m.name, err.Error())
|
|
return err
|
|
}
|
|
|
|
ret := C.topology_init()
|
|
if ret != 0 {
|
|
err := errors.New("failed to initialize topology module")
|
|
cclog.ComponentError(m.name, err.Error())
|
|
return err
|
|
}
|
|
m.measureThread = thread.New()
|
|
switch m.config.AccessMode {
|
|
case "direct":
|
|
C.HPMmode(0)
|
|
case "accessdaemon":
|
|
if len(m.config.DaemonPath) > 0 {
|
|
p := os.Getenv("PATH")
|
|
os.Setenv("PATH", m.config.DaemonPath+":"+p)
|
|
}
|
|
C.HPMmode(1)
|
|
retCode := C.HPMinit()
|
|
if retCode != 0 {
|
|
err := fmt.Errorf("C.HPMinit() failed with return code %v", retCode)
|
|
cclog.ComponentError(m.name, err.Error())
|
|
}
|
|
for _, c := range m.cpulist {
|
|
m.measureThread.Call(
|
|
func() {
|
|
retCode := C.HPMaddThread(c)
|
|
if retCode != 0 {
|
|
err := fmt.Errorf("C.HPMaddThread(%v) failed with return code %v", c, retCode)
|
|
cclog.ComponentError(m.name, err.Error())
|
|
}
|
|
})
|
|
}
|
|
}
|
|
m.sock2tid = make(map[int]int)
|
|
tmp := make([]C.int, 1)
|
|
for _, sid := range topo.SocketList() {
|
|
cstr := C.CString(fmt.Sprintf("S%d:0", sid))
|
|
ret = C.cpustr_to_cpulist(cstr, &tmp[0], 1)
|
|
if ret > 0 {
|
|
m.sock2tid[sid] = m.cpu2tid[int(tmp[0])]
|
|
}
|
|
C.free(unsafe.Pointer(cstr))
|
|
}
|
|
|
|
cpuData := topo.CpuData()
|
|
m.tid2core = make(map[int]int, len(cpuData))
|
|
m.tid2socket = make(map[int]int, len(cpuData))
|
|
for i := range cpuData {
|
|
c := &cpuData[i]
|
|
// Hardware thread ID to core ID mapping
|
|
if len(c.CoreCPUsList) > 0 {
|
|
m.tid2core[c.CpuID] = c.CoreCPUsList[0]
|
|
} else {
|
|
m.tid2core[c.CpuID] = c.CpuID
|
|
}
|
|
// Hardware thead ID to socket ID mapping
|
|
m.tid2socket[c.CpuID] = c.Socket
|
|
}
|
|
|
|
m.basefreq = getBaseFreq()
|
|
m.init = true
|
|
return nil
|
|
}
|
|
|
|
// take a measurement for 'interval' seconds of event set index 'group'
|
|
func (m *LikwidCollector) takeMeasurement(evidx int, evset LikwidEventsetConfig, interval time.Duration) (bool, error) {
|
|
var ret C.int
|
|
var gid C.int = -1
|
|
sigchan := make(chan os.Signal, 1)
|
|
|
|
// Watch changes for the lock file ()
|
|
watcher, err := fsnotify.NewWatcher()
|
|
if err != nil {
|
|
cclog.ComponentError(m.name, err.Error())
|
|
return true, err
|
|
}
|
|
defer watcher.Close()
|
|
if len(m.config.LockfilePath) > 0 {
|
|
info, err := os.Stat(m.config.LockfilePath)
|
|
if err != nil {
|
|
return true, err
|
|
}
|
|
uid := info.Sys().(*syscall.Stat_t).Uid
|
|
if uid != uint32(os.Getuid()) {
|
|
usr, err := user.LookupId(fmt.Sprint(uid))
|
|
if err == nil {
|
|
return true, fmt.Errorf("Access to performance counters locked by %s", usr.Username)
|
|
} else {
|
|
return true, fmt.Errorf("Access to performance counters locked by %d", uid)
|
|
}
|
|
}
|
|
err = watcher.Add(m.config.LockfilePath)
|
|
if err != nil {
|
|
cclog.ComponentError(m.name, err.Error())
|
|
}
|
|
}
|
|
m.lock.Lock()
|
|
defer m.lock.Unlock()
|
|
|
|
// Initialize the performance monitoring feature by creating basic data structures
|
|
select {
|
|
case e := <-watcher.Events:
|
|
ret = -1
|
|
if e.Op != fsnotify.Chmod {
|
|
ret = C.perfmon_init(C.int(len(m.cpulist)), &m.cpulist[0])
|
|
}
|
|
default:
|
|
ret = C.perfmon_init(C.int(len(m.cpulist)), &m.cpulist[0])
|
|
}
|
|
if ret != 0 {
|
|
return true, fmt.Errorf("failed to initialize library, error %d", ret)
|
|
}
|
|
signal.Notify(sigchan, os.Interrupt)
|
|
signal.Notify(sigchan, syscall.SIGCHLD)
|
|
|
|
// Add an event string to LIKWID
|
|
select {
|
|
case <-sigchan:
|
|
gid = -1
|
|
case e := <-watcher.Events:
|
|
gid = -1
|
|
if e.Op != fsnotify.Chmod {
|
|
gid = C.perfmon_addEventSet(evset.estr)
|
|
}
|
|
default:
|
|
gid = C.perfmon_addEventSet(evset.estr)
|
|
}
|
|
if gid < 0 {
|
|
return true, fmt.Errorf("failed to add events %s, error %d", evset.go_estr, gid)
|
|
} else {
|
|
evset.gid = gid
|
|
}
|
|
|
|
// Setup all performance monitoring counters of an eventSet
|
|
select {
|
|
case <-sigchan:
|
|
ret = -1
|
|
case e := <-watcher.Events:
|
|
if e.Op != fsnotify.Chmod {
|
|
ret = C.perfmon_setupCounters(gid)
|
|
}
|
|
default:
|
|
ret = C.perfmon_setupCounters(gid)
|
|
}
|
|
if ret != 0 {
|
|
return true, fmt.Errorf("failed to setup events '%s', error %d", evset.go_estr, ret)
|
|
}
|
|
|
|
// Start counters
|
|
select {
|
|
case <-sigchan:
|
|
ret = -1
|
|
case e := <-watcher.Events:
|
|
if e.Op != fsnotify.Chmod {
|
|
ret = C.perfmon_startCounters()
|
|
}
|
|
default:
|
|
ret = C.perfmon_startCounters()
|
|
}
|
|
if ret != 0 {
|
|
return true, fmt.Errorf("failed to start events '%s', error %d", evset.go_estr, ret)
|
|
}
|
|
select {
|
|
case <-sigchan:
|
|
ret = -1
|
|
case e := <-watcher.Events:
|
|
if e.Op != fsnotify.Chmod {
|
|
ret = C.perfmon_readCounters()
|
|
}
|
|
default:
|
|
ret = C.perfmon_readCounters()
|
|
}
|
|
if ret != 0 {
|
|
return true, fmt.Errorf("failed to read events '%s', error %d", evset.go_estr, ret)
|
|
}
|
|
|
|
// Wait
|
|
time.Sleep(interval)
|
|
|
|
// Read counters
|
|
select {
|
|
case <-sigchan:
|
|
ret = -1
|
|
case e := <-watcher.Events:
|
|
if e.Op != fsnotify.Chmod {
|
|
ret = C.perfmon_readCounters()
|
|
}
|
|
default:
|
|
ret = C.perfmon_readCounters()
|
|
}
|
|
if ret != 0 {
|
|
return true, fmt.Errorf("failed to read events '%s', error %d", evset.go_estr, ret)
|
|
}
|
|
|
|
// Store counters
|
|
for eidx, counter := range evset.eorder {
|
|
gctr := C.GoString(counter)
|
|
for _, tid := range m.cpu2tid {
|
|
res := C.perfmon_getLastResult(gid, C.int(eidx), C.int(tid))
|
|
fres := float64(res)
|
|
if m.config.InvalidToZero && (math.IsNaN(fres) || math.IsInf(fres, 0)) {
|
|
fres = 0.0
|
|
}
|
|
evset.results[tid][gctr] = fres
|
|
}
|
|
}
|
|
|
|
// Store time in seconds the event group was measured the last time
|
|
for _, tid := range m.cpu2tid {
|
|
evset.results[tid]["time"] = float64(C.perfmon_getLastTimeOfGroup(gid))
|
|
}
|
|
|
|
// Stop counters
|
|
select {
|
|
case <-sigchan:
|
|
ret = -1
|
|
case e := <-watcher.Events:
|
|
if e.Op != fsnotify.Chmod {
|
|
ret = C.perfmon_stopCounters()
|
|
}
|
|
default:
|
|
ret = C.perfmon_stopCounters()
|
|
}
|
|
if ret != 0 {
|
|
return true, fmt.Errorf("failed to stop events '%s', error %d", evset.go_estr, ret)
|
|
}
|
|
|
|
// Deallocates all internal data that is used during performance monitoring
|
|
signal.Stop(sigchan)
|
|
select {
|
|
case e := <-watcher.Events:
|
|
if e.Op != fsnotify.Chmod {
|
|
C.perfmon_finalize()
|
|
}
|
|
default:
|
|
C.perfmon_finalize()
|
|
}
|
|
return false, nil
|
|
}
|
|
|
|
// Get all measurement results for an event set, derive the metric values out of the measurement results and send it
|
|
func (m *LikwidCollector) calcEventsetMetrics(evset LikwidEventsetConfig, interval time.Duration, output chan lp.CCMetric) error {
|
|
invClock := float64(1.0 / m.basefreq)
|
|
|
|
for _, tid := range m.cpu2tid {
|
|
evset.results[tid]["inverseClock"] = invClock
|
|
}
|
|
|
|
// Go over the event set metrics, derive the value out of the event:counter values and send it
|
|
for _, metric := range m.config.Eventsets[evset.internal].Metrics {
|
|
// The metric scope is determined in the Init() function
|
|
// Get the map scope-id -> tids
|
|
scopemap := m.cpu2tid
|
|
if metric.Type == "socket" {
|
|
scopemap = m.sock2tid
|
|
}
|
|
// Send all metrics with same time stamp
|
|
// This function does only computiation, counter measurement is done before
|
|
now := time.Now()
|
|
for domain, tid := range scopemap {
|
|
if tid >= 0 && len(metric.Calc) > 0 {
|
|
value, err := agg.EvalFloat64Condition(metric.Calc, evset.results[tid])
|
|
if err != nil {
|
|
cclog.ComponentError(m.name, "Calculation for metric", metric.Name, "failed:", err.Error())
|
|
value = 0.0
|
|
}
|
|
if m.config.InvalidToZero && (math.IsNaN(value) || math.IsInf(value, 0)) {
|
|
value = 0.0
|
|
}
|
|
evset.metrics[tid][metric.Name] = value
|
|
// Now we have the result, send it with the proper tags
|
|
if !math.IsNaN(value) && metric.Publish {
|
|
fields := map[string]interface{}{"value": value}
|
|
y, err :=
|
|
lp.New(
|
|
metric.Name,
|
|
map[string]string{
|
|
"type": metric.Type,
|
|
},
|
|
m.meta,
|
|
fields,
|
|
now,
|
|
)
|
|
if err == nil {
|
|
if metric.Type != "node" {
|
|
y.AddTag("type-id", fmt.Sprintf("%d", domain))
|
|
}
|
|
if len(metric.Unit) > 0 {
|
|
y.AddMeta("unit", metric.Unit)
|
|
}
|
|
output <- y
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
// Send per core aggregated values
|
|
if metric.SendCoreTotalVal {
|
|
totalCoreValues := make(map[int]float64)
|
|
for _, tid := range scopemap {
|
|
if tid >= 0 && len(metric.Calc) > 0 {
|
|
coreID := m.tid2core[tid]
|
|
value := evset.metrics[tid][metric.Name]
|
|
if !math.IsNaN(value) && metric.Publish {
|
|
totalCoreValues[coreID] += value
|
|
}
|
|
}
|
|
}
|
|
|
|
for coreID, value := range totalCoreValues {
|
|
y, err :=
|
|
lp.New(
|
|
metric.Name,
|
|
map[string]string{
|
|
"type": "core",
|
|
"type-id": fmt.Sprintf("%d", coreID),
|
|
},
|
|
m.meta,
|
|
map[string]interface{}{
|
|
"value": value,
|
|
},
|
|
now,
|
|
)
|
|
if err != nil {
|
|
continue
|
|
}
|
|
if len(metric.Unit) > 0 {
|
|
y.AddMeta("unit", metric.Unit)
|
|
}
|
|
output <- y
|
|
}
|
|
}
|
|
|
|
// Send per socket aggregated values
|
|
if metric.SendSocketTotalVal {
|
|
totalSocketValues := make(map[int]float64)
|
|
for _, tid := range scopemap {
|
|
if tid >= 0 && len(metric.Calc) > 0 {
|
|
socketID := m.tid2socket[tid]
|
|
value := evset.metrics[tid][metric.Name]
|
|
if !math.IsNaN(value) && metric.Publish {
|
|
totalSocketValues[socketID] += value
|
|
}
|
|
}
|
|
}
|
|
|
|
for socketID, value := range totalSocketValues {
|
|
y, err :=
|
|
lp.New(
|
|
metric.Name,
|
|
map[string]string{
|
|
"type": "socket",
|
|
"type-id": fmt.Sprintf("%d", socketID),
|
|
},
|
|
m.meta,
|
|
map[string]interface{}{
|
|
"value": value,
|
|
},
|
|
now,
|
|
)
|
|
if err != nil {
|
|
continue
|
|
}
|
|
if len(metric.Unit) > 0 {
|
|
y.AddMeta("unit", metric.Unit)
|
|
}
|
|
output <- y
|
|
}
|
|
}
|
|
|
|
// Send per node aggregated value
|
|
if metric.SendNodeTotalVal {
|
|
var totalNodeValue float64 = 0.0
|
|
for _, tid := range scopemap {
|
|
if tid >= 0 && len(metric.Calc) > 0 {
|
|
value := evset.metrics[tid][metric.Name]
|
|
if !math.IsNaN(value) && metric.Publish {
|
|
totalNodeValue += value
|
|
}
|
|
}
|
|
}
|
|
|
|
y, err :=
|
|
lp.New(
|
|
metric.Name,
|
|
map[string]string{
|
|
"type": "node",
|
|
},
|
|
m.meta,
|
|
map[string]interface{}{
|
|
"value": totalNodeValue,
|
|
},
|
|
now,
|
|
)
|
|
if err != nil {
|
|
continue
|
|
}
|
|
if len(metric.Unit) > 0 {
|
|
y.AddMeta("unit", metric.Unit)
|
|
}
|
|
output <- y
|
|
}
|
|
}
|
|
|
|
return nil
|
|
}
|
|
|
|
// Go over the global metrics, derive the value out of the event sets' metric values and send it
|
|
func (m *LikwidCollector) calcGlobalMetrics(groups []LikwidEventsetConfig, interval time.Duration, output chan lp.CCMetric) error {
|
|
// Send all metrics with same time stamp
|
|
// This function does only computiation, counter measurement is done before
|
|
now := time.Now()
|
|
|
|
for _, metric := range m.config.Metrics {
|
|
// The metric scope is determined in the Init() function
|
|
// Get the map scope-id -> tids
|
|
scopemap := m.cpu2tid
|
|
if metric.Type == "socket" {
|
|
scopemap = m.sock2tid
|
|
}
|
|
for domain, tid := range scopemap {
|
|
if tid >= 0 {
|
|
// Here we generate parameter list
|
|
params := make(map[string]float64)
|
|
for _, evset := range groups {
|
|
for mname, mres := range evset.metrics[tid] {
|
|
params[mname] = mres
|
|
}
|
|
}
|
|
// Evaluate the metric
|
|
value, err := agg.EvalFloat64Condition(metric.Calc, params)
|
|
if err != nil {
|
|
cclog.ComponentError(m.name, "Calculation for metric", metric.Name, "failed:", err.Error())
|
|
value = 0.0
|
|
}
|
|
if m.config.InvalidToZero && (math.IsNaN(value) || math.IsInf(value, 0)) {
|
|
value = 0.0
|
|
}
|
|
// Now we have the result, send it with the proper tags
|
|
if !math.IsNaN(value) {
|
|
if metric.Publish {
|
|
y, err :=
|
|
lp.New(
|
|
metric.Name,
|
|
map[string]string{
|
|
"type": metric.Type,
|
|
},
|
|
m.meta,
|
|
map[string]interface{}{
|
|
"value": value,
|
|
},
|
|
now,
|
|
)
|
|
if err == nil {
|
|
if metric.Type != "node" {
|
|
y.AddTag("type-id", fmt.Sprintf("%d", domain))
|
|
}
|
|
if len(metric.Unit) > 0 {
|
|
y.AddMeta("unit", metric.Unit)
|
|
}
|
|
output <- y
|
|
}
|
|
}
|
|
}
|
|
}
|
|
}
|
|
}
|
|
return nil
|
|
}
|
|
|
|
func (m *LikwidCollector) ReadThread(interval time.Duration, output chan lp.CCMetric) {
|
|
var err error = nil
|
|
groups := make([]LikwidEventsetConfig, 0)
|
|
|
|
for evidx, evset := range m.config.Eventsets {
|
|
e := genLikwidEventSet(evset)
|
|
e.internal = evidx
|
|
skip := false
|
|
if !skip {
|
|
// measure event set 'i' for 'interval' seconds
|
|
skip, err = m.takeMeasurement(evidx, e, interval)
|
|
if err != nil {
|
|
cclog.ComponentError(m.name, err.Error())
|
|
return
|
|
}
|
|
}
|
|
|
|
if !skip {
|
|
// read measurements and derive event set metrics
|
|
m.calcEventsetMetrics(e, interval, output)
|
|
}
|
|
groups = append(groups, e)
|
|
}
|
|
// calculate global metrics
|
|
m.calcGlobalMetrics(groups, interval, output)
|
|
}
|
|
|
|
// main read function taking multiple measurement rounds, each 'interval' seconds long
|
|
func (m *LikwidCollector) Read(interval time.Duration, output chan lp.CCMetric) {
|
|
if !m.init {
|
|
return
|
|
}
|
|
m.measureThread.Call(func() {
|
|
m.ReadThread(interval, output)
|
|
})
|
|
}
|
|
|
|
func (m *LikwidCollector) Close() {
|
|
if m.init {
|
|
m.init = false
|
|
m.lock.Lock()
|
|
m.measureThread.Terminate()
|
|
m.initialized = false
|
|
m.lock.Unlock()
|
|
C.topology_finalize()
|
|
}
|
|
}
|