mirror of
https://github.com/ClusterCockpit/cc-metric-collector.git
synced 2025-07-31 08:56:06 +02:00
Merge develop branch into main (#123)
* Add cpu_used (all-cpu_idle) to CpustatCollector * Update cc-metric-collector.init * Allow selection of timestamp precision in HttpSink * Add comment about precision requirement for cc-metric-store * Fix for API changes in gofish@v0.15.0 * Update requirements to latest version * Read sensors through redfish * Update golang toolchain to 1.21 * Remove stray error check * Update main config in configuration.md * Update Release action to use golang 1.22 stable release, no golang RPMs anymore * Update runonce action to use golang 1.22 stable release, no golang RPMs anymore * Update README.md Use right JSON type in configuration * Update sink's README * Test whether ipmitool or ipmi-sensors can be executed without errors * Little fixes to the prometheus sink (#115) * Add uint64 to float64 cast option * Add prometheus sink to the list of available sinks * Add aggregated counters by gpu for nvlink errors --------- Co-authored-by: Michael Schwarz <schwarz@uni-paderborn.de> * Ccmessage migration (#119) * Add cpu_used (all-cpu_idle) to CpustatCollector * Update cc-metric-collector.init * Allow selection of timestamp precision in HttpSink * Add comment about precision requirement for cc-metric-store * Fix for API changes in gofish@v0.15.0 * Update requirements to latest version * Read sensors through redfish * Update golang toolchain to 1.21 * Remove stray error check * Update main config in configuration.md * Update Release action to use golang 1.22 stable release, no golang RPMs anymore * Update runonce action to use golang 1.22 stable release, no golang RPMs anymore * Switch to CCMessage for all files. 
--------- Co-authored-by: Holger Obermaier <Holger.Obermaier@kit.edu> Co-authored-by: Holger Obermaier <40787752+ho-ob@users.noreply.github.com> * Switch to ccmessage also for latest additions in nvidiaMetric * New Message processor (#118) * Add cpu_used (all-cpu_idle) to CpustatCollector * Update cc-metric-collector.init * Allow selection of timestamp precision in HttpSink * Add comment about precision requirement for cc-metric-store * Fix for API changes in gofish@v0.15.0 * Update requirements to latest version * Read sensors through redfish * Update golang toolchain to 1.21 * Remove stray error check * Update main config in configuration.md * Update Release action to use golang 1.22 stable release, no golang RPMs anymore * Update runonce action to use golang 1.22 stable release, no golang RPMs anymore * New message processor to check whether a message should be dropped or manipulate it in flight * Create a copy of message before manipulation --------- Co-authored-by: Holger Obermaier <Holger.Obermaier@kit.edu> Co-authored-by: Holger Obermaier <40787752+ho-ob@users.noreply.github.com> * Update collector's Makefile and go.mod/sum files * Use message processor in router, all sinks and all receivers * Add support for credential file (NKEY) to NATS sink and receiver * Fix JSON keys in message processor configuration * Update docs for message processor, router and the default router config file * Add link to expr syntax and fix regex matching docs * Update sample collectors * Minor style change in collector manager * Some helpers for ccTopology * LIKWID collector: write log owner change only once * Fix for metrics without units and reduce debugging messages for messageProcessor * Use shorted hostname for hostname added by router * Define default port for NATS * CPUstat collector: only add unit for applicable metrics * Add precision option to all sinks using Influx's encoder * Add message processor to all sink documentation * Add units to documentation of cpustat 
collector --------- Co-authored-by: Holger Obermaier <Holger.Obermaier@kit.edu> Co-authored-by: Holger Obermaier <40787752+ho-ob@users.noreply.github.com> Co-authored-by: oscarminus <me@oscarminus.de> Co-authored-by: Michael Schwarz <schwarz@uni-paderborn.de>
This commit is contained in:
@@ -24,9 +24,9 @@ import (
|
||||
"time"
|
||||
"unsafe"
|
||||
|
||||
lp "github.com/ClusterCockpit/cc-energy-manager/pkg/cc-message"
|
||||
agg "github.com/ClusterCockpit/cc-metric-collector/internal/metricAggregator"
|
||||
cclog "github.com/ClusterCockpit/cc-metric-collector/pkg/ccLogger"
|
||||
lp "github.com/ClusterCockpit/cc-metric-collector/pkg/ccMetric"
|
||||
topo "github.com/ClusterCockpit/cc-metric-collector/pkg/ccTopology"
|
||||
"github.com/NVIDIA/go-nvml/pkg/dl"
|
||||
"github.com/fsnotify/fsnotify"
|
||||
@@ -43,7 +43,7 @@ const (
|
||||
type LikwidCollectorMetricConfig struct {
|
||||
Name string `json:"name"` // Name of the metric
|
||||
Calc string `json:"calc"` // Calculation for the metric using
|
||||
Type string `json:"type"` // Metric type (aka node, socket, cpu, ...)
|
||||
Type string `json:"type"` // Metric type (aka node, socket, hwthread, ...)
|
||||
Publish bool `json:"publish"`
|
||||
SendCoreTotalVal bool `json:"send_core_total_values,omitempty"`
|
||||
SendSocketTotalVal bool `json:"send_socket_total_values,omitempty"`
|
||||
@@ -91,6 +91,8 @@ type LikwidCollector struct {
|
||||
running bool
|
||||
initialized bool
|
||||
needs_reinit bool
|
||||
myuid int
|
||||
lock_err_once bool
|
||||
likwidGroups map[C.int]LikwidEventsetConfig
|
||||
lock sync.Mutex
|
||||
measureThread thread.Thread
|
||||
@@ -204,6 +206,7 @@ func (m *LikwidCollector) Init(config json.RawMessage) error {
|
||||
m.initialized = false
|
||||
m.needs_reinit = true
|
||||
m.running = false
|
||||
m.myuid = os.Getuid()
|
||||
m.config.AccessMode = LIKWID_DEF_ACCESSMODE
|
||||
m.config.LibraryPath = LIKWID_LIB_NAME
|
||||
m.config.LockfilePath = LIKWID_DEF_LOCKFILE
|
||||
@@ -390,14 +393,24 @@ func (m *LikwidCollector) takeMeasurement(evidx int, evset LikwidEventsetConfig,
|
||||
}
|
||||
// Check file ownership
|
||||
uid := info.Sys().(*syscall.Stat_t).Uid
|
||||
if uid != uint32(os.Getuid()) {
|
||||
if uid != uint32(m.myuid) {
|
||||
usr, err := user.LookupId(fmt.Sprint(uid))
|
||||
if err == nil {
|
||||
return true, fmt.Errorf("Access to performance counters locked by %s", usr.Username)
|
||||
err = fmt.Errorf("access to performance counters locked by %s", usr.Username)
|
||||
} else {
|
||||
return true, fmt.Errorf("Access to performance counters locked by %d", uid)
|
||||
err = fmt.Errorf("access to performance counters locked by %d", uid)
|
||||
}
|
||||
// delete error if we already returned the error once.
|
||||
if !m.lock_err_once {
|
||||
m.lock_err_once = true
|
||||
} else {
|
||||
err = nil
|
||||
}
|
||||
return true, err
|
||||
}
|
||||
// reset lock_err_once
|
||||
m.lock_err_once = false
|
||||
|
||||
// Add the lock file to the watcher
|
||||
err = watcher.Add(m.config.LockfilePath)
|
||||
if err != nil {
|
||||
@@ -436,9 +449,7 @@ func (m *LikwidCollector) takeMeasurement(evidx int, evset LikwidEventsetConfig,
|
||||
gid = C.perfmon_addEventSet(evset.estr)
|
||||
}
|
||||
if gid < 0 {
|
||||
return true, fmt.Errorf("failed to add events %s, error %d", evset.go_estr, gid)
|
||||
} else {
|
||||
evset.gid = gid
|
||||
return true, fmt.Errorf("failed to add events %s, id %d, error %d", evset.go_estr, evidx, gid)
|
||||
}
|
||||
|
||||
// Setup all performance monitoring counters of an eventSet
|
||||
@@ -549,11 +560,12 @@ func (m *LikwidCollector) takeMeasurement(evidx int, evset LikwidEventsetConfig,
|
||||
}
|
||||
|
||||
// Get all measurement results for an event set, derive the metric values out of the measurement results and send it
|
||||
func (m *LikwidCollector) calcEventsetMetrics(evset LikwidEventsetConfig, interval time.Duration, output chan lp.CCMetric) error {
|
||||
func (m *LikwidCollector) calcEventsetMetrics(evset LikwidEventsetConfig, interval time.Duration, output chan lp.CCMessage) error {
|
||||
invClock := float64(1.0 / m.basefreq)
|
||||
|
||||
for _, tid := range m.cpu2tid {
|
||||
evset.results[tid]["inverseClock"] = invClock
|
||||
evset.results[tid]["gotime"] = interval.Seconds()
|
||||
}
|
||||
|
||||
// Go over the event set metrics, derive the value out of the event:counter values and send it
|
||||
@@ -582,7 +594,7 @@ func (m *LikwidCollector) calcEventsetMetrics(evset LikwidEventsetConfig, interv
|
||||
if !math.IsNaN(value) && metric.Publish {
|
||||
fields := map[string]interface{}{"value": value}
|
||||
y, err :=
|
||||
lp.New(
|
||||
lp.NewMessage(
|
||||
metric.Name,
|
||||
map[string]string{
|
||||
"type": metric.Type,
|
||||
@@ -619,7 +631,7 @@ func (m *LikwidCollector) calcEventsetMetrics(evset LikwidEventsetConfig, interv
|
||||
|
||||
for coreID, value := range totalCoreValues {
|
||||
y, err :=
|
||||
lp.New(
|
||||
lp.NewMessage(
|
||||
metric.Name,
|
||||
map[string]string{
|
||||
"type": "core",
|
||||
@@ -656,7 +668,7 @@ func (m *LikwidCollector) calcEventsetMetrics(evset LikwidEventsetConfig, interv
|
||||
|
||||
for socketID, value := range totalSocketValues {
|
||||
y, err :=
|
||||
lp.New(
|
||||
lp.NewMessage(
|
||||
metric.Name,
|
||||
map[string]string{
|
||||
"type": "socket",
|
||||
@@ -691,7 +703,7 @@ func (m *LikwidCollector) calcEventsetMetrics(evset LikwidEventsetConfig, interv
|
||||
}
|
||||
|
||||
y, err :=
|
||||
lp.New(
|
||||
lp.NewMessage(
|
||||
metric.Name,
|
||||
map[string]string{
|
||||
"type": "node",
|
||||
@@ -716,7 +728,7 @@ func (m *LikwidCollector) calcEventsetMetrics(evset LikwidEventsetConfig, interv
|
||||
}
|
||||
|
||||
// Go over the global metrics, derive the value out of the event sets' metric values and send it
|
||||
func (m *LikwidCollector) calcGlobalMetrics(groups []LikwidEventsetConfig, interval time.Duration, output chan lp.CCMetric) error {
|
||||
func (m *LikwidCollector) calcGlobalMetrics(groups []LikwidEventsetConfig, interval time.Duration, output chan lp.CCMessage) error {
|
||||
// Send all metrics with same time stamp
|
||||
// This function only does computation; counter measurement is done before
|
||||
now := time.Now()
|
||||
@@ -737,6 +749,7 @@ func (m *LikwidCollector) calcGlobalMetrics(groups []LikwidEventsetConfig, inter
|
||||
params[mname] = mres
|
||||
}
|
||||
}
|
||||
params["gotime"] = interval.Seconds()
|
||||
// Evaluate the metric
|
||||
value, err := agg.EvalFloat64Condition(metric.Calc, params)
|
||||
if err != nil {
|
||||
@@ -750,7 +763,7 @@ func (m *LikwidCollector) calcGlobalMetrics(groups []LikwidEventsetConfig, inter
|
||||
if !math.IsNaN(value) {
|
||||
if metric.Publish {
|
||||
y, err :=
|
||||
lp.New(
|
||||
lp.NewMessage(
|
||||
metric.Name,
|
||||
map[string]string{
|
||||
"type": metric.Type,
|
||||
@@ -778,7 +791,7 @@ func (m *LikwidCollector) calcGlobalMetrics(groups []LikwidEventsetConfig, inter
|
||||
return nil
|
||||
}
|
||||
|
||||
func (m *LikwidCollector) ReadThread(interval time.Duration, output chan lp.CCMetric) {
|
||||
func (m *LikwidCollector) ReadThread(interval time.Duration, output chan lp.CCMessage) {
|
||||
var err error = nil
|
||||
groups := make([]LikwidEventsetConfig, 0)
|
||||
|
||||
@@ -798,15 +811,17 @@ func (m *LikwidCollector) ReadThread(interval time.Duration, output chan lp.CCMe
|
||||
if !skip {
|
||||
// read measurements and derive event set metrics
|
||||
m.calcEventsetMetrics(e, interval, output)
|
||||
groups = append(groups, e)
|
||||
}
|
||||
groups = append(groups, e)
|
||||
}
|
||||
// calculate global metrics
|
||||
m.calcGlobalMetrics(groups, interval, output)
|
||||
if len(groups) > 0 {
|
||||
// calculate global metrics
|
||||
m.calcGlobalMetrics(groups, interval, output)
|
||||
}
|
||||
}
|
||||
|
||||
// main read function taking multiple measurement rounds, each 'interval' seconds long
|
||||
func (m *LikwidCollector) Read(interval time.Duration, output chan lp.CCMetric) {
|
||||
func (m *LikwidCollector) Read(interval time.Duration, output chan lp.CCMessage) {
|
||||
if !m.init {
|
||||
return
|
||||
}
|
||||
|
Reference in New Issue
Block a user