mirror of
https://github.com/ClusterCockpit/cc-metric-collector.git
synced 2025-01-15 00:29:09 +01:00
7840de7b82
* Add cpu_used (all-cpu_idle) to CpustatCollector * Update cc-metric-collector.init * Allow selection of timestamp precision in HttpSink * Add comment about precision requirement for cc-metric-store * Fix for API changes in gofish@v0.15.0 * Update requirements to latest version * Read sensors through redfish * Update golang toolchain to 1.21 * Remove stray error check * Update main config in configuration.md * Update Release action to use golang 1.22 stable release, no golang RPMs anymore * Update runonce action to use golang 1.22 stable release, no golang RPMs anymore * Update README.md Use right JSON type in configuration * Update sink's README * Test whether ipmitool or ipmi-sensors can be executed without errors * Little fixes to the prometheus sink (#115) * Add uint64 to float64 cast option * Add prometheus sink to the list of available sinks * Add aggregated counters by gpu for nvlink errors --------- Co-authored-by: Michael Schwarz <schwarz@uni-paderborn.de> * Ccmessage migration (#119) * Add cpu_used (all-cpu_idle) to CpustatCollector * Update cc-metric-collector.init * Allow selection of timestamp precision in HttpSink * Add comment about precision requirement for cc-metric-store * Fix for API changes in gofish@v0.15.0 * Update requirements to latest version * Read sensors through redfish * Update golang toolchain to 1.21 * Remove stray error check * Update main config in configuration.md * Update Release action to use golang 1.22 stable release, no golang RPMs anymore * Update runonce action to use golang 1.22 stable release, no golang RPMs anymore * Switch to CCMessage for all files. 
--------- Co-authored-by: Holger Obermaier <Holger.Obermaier@kit.edu> Co-authored-by: Holger Obermaier <40787752+ho-ob@users.noreply.github.com> * Switch to ccmessage also for latest additions in nvidiaMetric * New Message processor (#118) * Add cpu_used (all-cpu_idle) to CpustatCollector * Update cc-metric-collector.init * Allow selection of timestamp precision in HttpSink * Add comment about precision requirement for cc-metric-store * Fix for API changes in gofish@v0.15.0 * Update requirements to latest version * Read sensors through redfish * Update golang toolchain to 1.21 * Remove stray error check * Update main config in configuration.md * Update Release action to use golang 1.22 stable release, no golang RPMs anymore * Update runonce action to use golang 1.22 stable release, no golang RPMs anymore * New message processor to check whether a message should be dropped or manipulate it in flight * Create a copy of message before manipulation --------- Co-authored-by: Holger Obermaier <Holger.Obermaier@kit.edu> Co-authored-by: Holger Obermaier <40787752+ho-ob@users.noreply.github.com> * Update collector's Makefile and go.mod/sum files * Use message processor in router, all sinks and all receivers * Add support for credential file (NKEY) to NATS sink and receiver * Fix JSON keys in message processor configuration * Update docs for message processor, router and the default router config file * Add link to expr syntax and fix regex matching docs * Update sample collectors * Minor style change in collector manager * Some helpers for ccTopology * LIKWID collector: write log owner change only once * Fix for metrics without units and reduce debugging messages for messageProcessor * Use shorted hostname for hostname added by router * Define default port for NATS * CPUstat collector: only add unit for applicable metrics * Add precision option to all sinks using Influx's encoder * Add message processor to all sink documentation * Add units to documentation of cpustat 
collector --------- Co-authored-by: Holger Obermaier <Holger.Obermaier@kit.edu> Co-authored-by: Holger Obermaier <40787752+ho-ob@users.noreply.github.com> Co-authored-by: oscarminus <me@oscarminus.de> Co-authored-by: Michael Schwarz <schwarz@uni-paderborn.de>
155 lines
5.0 KiB
Go
155 lines
5.0 KiB
Go
package collectors
|
|
|
|
import (
|
|
"bufio"
|
|
"encoding/json"
|
|
"fmt"
|
|
"math"
|
|
"os"
|
|
"strconv"
|
|
"strings"
|
|
"time"
|
|
|
|
cclog "github.com/ClusterCockpit/cc-metric-collector/pkg/ccLogger"
|
|
lp "github.com/ClusterCockpit/cc-energy-manager/pkg/cc-message"
|
|
)
|
|
|
|
// SCHEDSTATFILE is the path of the file this collector opens and scans
// in both Init and Read.
const SCHEDSTATFILE = "/proc/schedstat"
|
|
|
|
// SchedstatCollectorConfig holds the options decoded from the collector's
// JSON configuration.
type SchedstatCollectorConfig struct {
	// ExcludeMetrics is filled from the "exclude_metrics" JSON key.
	// NOTE(review): the visible collector code decodes this list but never
	// consults it — confirm whether filtering is expected to happen here.
	ExcludeMetrics []string `json:"exclude_metrics,omitempty"`
}
|
|
|
|
// This contains all variables we need during execution and the variables
|
|
// defined by metricCollector (name, init, ...)
|
|
type SchedstatCollector struct {
|
|
metricCollector
|
|
config SchedstatCollectorConfig // the configuration structure
|
|
lastTimestamp time.Time // Store time stamp of last tick to derive values
|
|
meta map[string]string // default meta information
|
|
cputags map[string]map[string]string // default tags
|
|
olddata map[string]map[string]int64 // default tags
|
|
}
|
|
|
|
// Functions to implement MetricCollector interface
|
|
// Init(...), Read(...), Close()
|
|
// See: metricCollector.go
|
|
|
|
// Init initializes the sample collector
|
|
// Called once by the collector manager
|
|
// All tags, meta data tags and metrics that do not change over the runtime should be set here
|
|
func (m *SchedstatCollector) Init(config json.RawMessage) error {
|
|
var err error = nil
|
|
// Always set the name early in Init() to use it in cclog.Component* functions
|
|
m.name = "SchedstatCollector"
|
|
// This is for later use, also call it early
|
|
m.setup()
|
|
// Tell whether the collector should be run in parallel with others (reading files, ...)
|
|
// or it should be run serially, mostly for collectors acutally doing measurements
|
|
// because they should not measure the execution of the other collectors
|
|
m.parallel = true
|
|
// Define meta information sent with each metric
|
|
// (Can also be dynamic or this is the basic set with extension through AddMeta())
|
|
m.meta = map[string]string{"source": m.name, "group": "SCHEDSTAT"}
|
|
|
|
// Read in the JSON configuration
|
|
if len(config) > 0 {
|
|
err = json.Unmarshal(config, &m.config)
|
|
if err != nil {
|
|
cclog.ComponentError(m.name, "Error reading config:", err.Error())
|
|
return err
|
|
}
|
|
}
|
|
|
|
// Check input file
|
|
file, err := os.Open(string(SCHEDSTATFILE))
|
|
if err != nil {
|
|
cclog.ComponentError(m.name, err.Error())
|
|
}
|
|
defer file.Close()
|
|
|
|
// Pre-generate tags for all CPUs
|
|
num_cpus := 0
|
|
m.cputags = make(map[string]map[string]string)
|
|
m.olddata = make(map[string]map[string]int64)
|
|
scanner := bufio.NewScanner(file)
|
|
for scanner.Scan() {
|
|
line := scanner.Text()
|
|
linefields := strings.Fields(line)
|
|
if strings.HasPrefix(linefields[0], "cpu") && strings.Compare(linefields[0], "cpu") != 0 {
|
|
cpustr := strings.TrimLeft(linefields[0], "cpu")
|
|
cpu, _ := strconv.Atoi(cpustr)
|
|
running, _ := strconv.ParseInt(linefields[7], 10, 64)
|
|
waiting, _ := strconv.ParseInt(linefields[8], 10, 64)
|
|
m.cputags[linefields[0]] = map[string]string{"type": "hwthread", "type-id": fmt.Sprintf("%d", cpu)}
|
|
m.olddata[linefields[0]] = map[string]int64{"running": running, "waiting": waiting}
|
|
num_cpus++
|
|
}
|
|
}
|
|
|
|
// Save current timestamp
|
|
m.lastTimestamp = time.Now()
|
|
|
|
// Set this flag only if everything is initialized properly, all required files exist, ...
|
|
m.init = true
|
|
return err
|
|
}
|
|
|
|
func (m *SchedstatCollector) ParseProcLine(linefields []string, tags map[string]string, output chan lp.CCMessage, now time.Time, tsdelta time.Duration) {
|
|
running, _ := strconv.ParseInt(linefields[7], 10, 64)
|
|
waiting, _ := strconv.ParseInt(linefields[8], 10, 64)
|
|
diff_running := running - m.olddata[linefields[0]]["running"]
|
|
diff_waiting := waiting - m.olddata[linefields[0]]["waiting"]
|
|
|
|
var l_running float64 = float64(diff_running) / tsdelta.Seconds() / (math.Pow(1000, 3))
|
|
var l_waiting float64 = float64(diff_waiting) / tsdelta.Seconds() / (math.Pow(1000, 3))
|
|
|
|
m.olddata[linefields[0]]["running"] = running
|
|
m.olddata[linefields[0]]["waiting"] = waiting
|
|
value := l_running + l_waiting
|
|
|
|
y, err := lp.NewMessage("cpu_load_core", tags, m.meta, map[string]interface{}{"value": value}, now)
|
|
if err == nil {
|
|
// Send it to output channel
|
|
output <- y
|
|
}
|
|
}
|
|
|
|
// Read collects all metrics belonging to the sample collector
|
|
// and sends them through the output channel to the collector manager
|
|
func (m *SchedstatCollector) Read(interval time.Duration, output chan lp.CCMessage) {
|
|
if !m.init {
|
|
return
|
|
}
|
|
|
|
//timestamps
|
|
now := time.Now()
|
|
tsdelta := now.Sub(m.lastTimestamp)
|
|
|
|
file, err := os.Open(string(SCHEDSTATFILE))
|
|
if err != nil {
|
|
cclog.ComponentError(m.name, err.Error())
|
|
}
|
|
defer file.Close()
|
|
|
|
scanner := bufio.NewScanner(file)
|
|
for scanner.Scan() {
|
|
line := scanner.Text()
|
|
linefields := strings.Fields(line)
|
|
if strings.HasPrefix(linefields[0], "cpu") {
|
|
m.ParseProcLine(linefields, m.cputags[linefields[0]], output, now, tsdelta)
|
|
}
|
|
}
|
|
|
|
m.lastTimestamp = now
|
|
|
|
}
|
|
|
|
// Close metric collector: close network connection, close files, close libraries, ...
|
|
// Called once by the collector manager
|
|
func (m *SchedstatCollector) Close() {
|
|
// Unset flag
|
|
m.init = false
|
|
}
|