mirror of
https://github.com/ClusterCockpit/cc-metric-collector.git
synced 2025-01-15 00:29:09 +01:00
7840de7b82
* Add cpu_used (all-cpu_idle) to CpustatCollector * Update cc-metric-collector.init * Allow selection of timestamp precision in HttpSink * Add comment about precision requirement for cc-metric-store * Fix for API changes in gofish@v0.15.0 * Update requirements to latest version * Read sensors through redfish * Update golang toolchain to 1.21 * Remove stray error check * Update main config in configuration.md * Update Release action to use golang 1.22 stable release, no golang RPMs anymore * Update runonce action to use golang 1.22 stable release, no golang RPMs anymore * Update README.md Use right JSON type in configuration * Update sink's README * Test whether ipmitool or ipmi-sensors can be executed without errors * Little fixes to the prometheus sink (#115) * Add uint64 to float64 cast option * Add prometheus sink to the list of available sinks * Add aggregated counters by gpu for nvlink errors --------- Co-authored-by: Michael Schwarz <schwarz@uni-paderborn.de> * Ccmessage migration (#119) * Add cpu_used (all-cpu_idle) to CpustatCollector * Update cc-metric-collector.init * Allow selection of timestamp precision in HttpSink * Add comment about precision requirement for cc-metric-store * Fix for API changes in gofish@v0.15.0 * Update requirements to latest version * Read sensors through redfish * Update golang toolchain to 1.21 * Remove stray error check * Update main config in configuration.md * Update Release action to use golang 1.22 stable release, no golang RPMs anymore * Update runonce action to use golang 1.22 stable release, no golang RPMs anymore * Switch to CCMessage for all files. --------- Co-authored-by: Holger Obermaier <Holger.Obermaier@kit.edu> Co-authored-by: Holger Obermaier <40787752+ho-ob@users.noreply.github.com> * Switch to ccmessage also for latest additions in nvidiaMetric * New Message processor (#118) * Add cpu_used (all-cpu_idle) to CpustatCollector * Update cc-metric-collector.init * Allow selection of timestamp precision in HttpSink * Add comment about precision requirement for cc-metric-store * Fix for API changes in gofish@v0.15.0 * Update requirements to latest version * Read sensors through redfish * Update golang toolchain to 1.21 * Remove stray error check * Update main config in configuration.md * Update Release action to use golang 1.22 stable release, no golang RPMs anymore * Update runonce action to use golang 1.22 stable release, no golang RPMs anymore * New message processor to check whether a message should be dropped or manipulate it in flight * Create a copy of message before manipulation --------- Co-authored-by: Holger Obermaier <Holger.Obermaier@kit.edu> Co-authored-by: Holger Obermaier <40787752+ho-ob@users.noreply.github.com> * Update collector's Makefile and go.mod/sum files * Use message processor in router, all sinks and all receivers * Add support for credential file (NKEY) to NATS sink and receiver * Fix JSON keys in message processor configuration * Update docs for message processor, router and the default router config file * Add link to expr syntax and fix regex matching docs * Update sample collectors * Minor style change in collector manager * Some helpers for ccTopology * LIKWID collector: write log owner change only once * Fix for metrics without units and reduce debugging messages for messageProcessor * Use shorted hostname for hostname added by router * Define default port for NATS * CPUstat collector: only add unit for applicable metrics * Add precision option to all sinks using Influx's encoder * Add message processor to all sink documentation * Add units to documentation of cpustat collector --------- Co-authored-by: Holger Obermaier <Holger.Obermaier@kit.edu> Co-authored-by: Holger Obermaier <40787752+ho-ob@users.noreply.github.com> Co-authored-by: oscarminus <me@oscarminus.de> Co-authored-by: Michael Schwarz <schwarz@uni-paderborn.de>
238 lines
6.0 KiB
Go
238 lines
6.0 KiB
Go
package collectors
|
|
|
|
import (
|
|
"encoding/json"
|
|
"fmt"
|
|
"os"
|
|
"path/filepath"
|
|
"strconv"
|
|
"strings"
|
|
"time"
|
|
|
|
cclog "github.com/ClusterCockpit/cc-metric-collector/pkg/ccLogger"
|
|
lp "github.com/ClusterCockpit/cc-energy-manager/pkg/cc-message"
|
|
)
|
|
|
|
// See: https://www.kernel.org/doc/html/latest/hwmon/sysfs-interface.html
|
|
// /sys/class/hwmon/hwmon*/name -> coretemp
|
|
// /sys/class/hwmon/hwmon*/temp*_label -> Core 0
|
|
// /sys/class/hwmon/hwmon*/temp*_input -> 27800 = 27.8°C
|
|
// /sys/class/hwmon/hwmon*/temp*_max -> 86000 = 86.0°C
|
|
// /sys/class/hwmon/hwmon*/temp*_crit -> 100000 = 100.0°C
|
|
|
|
type TempCollectorSensor struct {
|
|
name string
|
|
label string
|
|
metricName string // Default: name_label
|
|
file string
|
|
maxTempName string
|
|
maxTemp int64
|
|
critTempName string
|
|
critTemp int64
|
|
tags map[string]string
|
|
}
|
|
|
|
type TempCollector struct {
|
|
metricCollector
|
|
config struct {
|
|
ExcludeMetrics []string `json:"exclude_metrics"`
|
|
TagOverride map[string]map[string]string `json:"tag_override"`
|
|
ReportMaxTemp bool `json:"report_max_temperature"`
|
|
ReportCriticalTemp bool `json:"report_critical_temperature"`
|
|
}
|
|
sensors []*TempCollectorSensor
|
|
}
|
|
|
|
func (m *TempCollector) Init(config json.RawMessage) error {
|
|
// Check if already initialized
|
|
if m.init {
|
|
return nil
|
|
}
|
|
|
|
m.name = "TempCollector"
|
|
m.parallel = true
|
|
m.setup()
|
|
if len(config) > 0 {
|
|
err := json.Unmarshal(config, &m.config)
|
|
if err != nil {
|
|
return err
|
|
}
|
|
}
|
|
|
|
m.meta = map[string]string{
|
|
"source": m.name,
|
|
"group": "IPMI",
|
|
"unit": "degC",
|
|
}
|
|
|
|
m.sensors = make([]*TempCollectorSensor, 0)
|
|
|
|
// Find all temperature sensor files
|
|
globPattern := filepath.Join("/sys/class/hwmon", "*", "temp*_input")
|
|
inputFiles, err := filepath.Glob(globPattern)
|
|
if err != nil {
|
|
return fmt.Errorf("unable to glob files with pattern '%s': %v", globPattern, err)
|
|
}
|
|
if inputFiles == nil {
|
|
return fmt.Errorf("unable to find any files with pattern '%s'", globPattern)
|
|
}
|
|
|
|
// Get sensor name for each temperature sensor file
|
|
for _, file := range inputFiles {
|
|
sensor := new(TempCollectorSensor)
|
|
|
|
// sensor name
|
|
nameFile := filepath.Join(filepath.Dir(file), "name")
|
|
name, err := os.ReadFile(nameFile)
|
|
if err == nil {
|
|
sensor.name = strings.TrimSpace(string(name))
|
|
}
|
|
|
|
// sensor label
|
|
labelFile := strings.TrimSuffix(file, "_input") + "_label"
|
|
label, err := os.ReadFile(labelFile)
|
|
if err == nil {
|
|
sensor.label = strings.TrimSpace(string(label))
|
|
}
|
|
|
|
// sensor metric name
|
|
switch {
|
|
case len(sensor.name) == 0 && len(sensor.label) == 0:
|
|
continue
|
|
case sensor.name == "coretemp" && strings.HasPrefix(sensor.label, "Core ") ||
|
|
sensor.name == "coretemp" && strings.HasPrefix(sensor.label, "Package id "):
|
|
sensor.metricName = "temp_" + sensor.label
|
|
case len(sensor.name) != 0 && len(sensor.label) != 0:
|
|
sensor.metricName = sensor.name + "_" + sensor.label
|
|
case len(sensor.name) != 0:
|
|
sensor.metricName = sensor.name
|
|
case len(sensor.label) != 0:
|
|
sensor.metricName = sensor.label
|
|
}
|
|
sensor.metricName = strings.ToLower(sensor.metricName)
|
|
sensor.metricName = strings.Replace(sensor.metricName, " ", "_", -1)
|
|
// Add temperature prefix, if required
|
|
if !strings.Contains(sensor.metricName, "temp") {
|
|
sensor.metricName = "temp_" + sensor.metricName
|
|
}
|
|
|
|
// Sensor file
|
|
_, err = os.ReadFile(file)
|
|
if err != nil {
|
|
continue
|
|
}
|
|
sensor.file = file
|
|
|
|
// Sensor tags
|
|
sensor.tags = map[string]string{
|
|
"type": "node",
|
|
}
|
|
|
|
// Apply tag override configuration
|
|
for key, newtags := range m.config.TagOverride {
|
|
if strings.Contains(sensor.file, key) {
|
|
sensor.tags = newtags
|
|
break
|
|
}
|
|
}
|
|
|
|
// max temperature
|
|
if m.config.ReportMaxTemp {
|
|
maxTempFile := strings.TrimSuffix(file, "_input") + "_max"
|
|
if buffer, err := os.ReadFile(maxTempFile); err == nil {
|
|
if x, err := strconv.ParseInt(strings.TrimSpace(string(buffer)), 10, 64); err == nil {
|
|
sensor.maxTempName = strings.Replace(sensor.metricName, "temp", "max_temp", 1)
|
|
sensor.maxTemp = x / 1000
|
|
}
|
|
}
|
|
}
|
|
|
|
// critical temperature
|
|
if m.config.ReportCriticalTemp {
|
|
criticalTempFile := strings.TrimSuffix(file, "_input") + "_crit"
|
|
if buffer, err := os.ReadFile(criticalTempFile); err == nil {
|
|
if x, err := strconv.ParseInt(strings.TrimSpace(string(buffer)), 10, 64); err == nil {
|
|
sensor.critTempName = strings.Replace(sensor.metricName, "temp", "crit_temp", 1)
|
|
sensor.critTemp = x / 1000
|
|
}
|
|
}
|
|
}
|
|
|
|
m.sensors = append(m.sensors, sensor)
|
|
}
|
|
|
|
// Empty sensors map
|
|
if len(m.sensors) == 0 {
|
|
return fmt.Errorf("no temperature sensors found")
|
|
}
|
|
|
|
// Finished initialization
|
|
m.init = true
|
|
return nil
|
|
}
|
|
|
|
func (m *TempCollector) Read(interval time.Duration, output chan lp.CCMessage) {
|
|
|
|
for _, sensor := range m.sensors {
|
|
// Read sensor file
|
|
buffer, err := os.ReadFile(sensor.file)
|
|
if err != nil {
|
|
cclog.ComponentError(
|
|
m.name,
|
|
fmt.Sprintf("Read(): Failed to read file '%s': %v", sensor.file, err))
|
|
continue
|
|
}
|
|
x, err := strconv.ParseInt(strings.TrimSpace(string(buffer)), 10, 64)
|
|
if err != nil {
|
|
cclog.ComponentError(
|
|
m.name,
|
|
fmt.Sprintf("Read(): Failed to convert temperature '%s' to int64: %v", buffer, err))
|
|
continue
|
|
}
|
|
x /= 1000
|
|
y, err := lp.NewMessage(
|
|
sensor.metricName,
|
|
sensor.tags,
|
|
m.meta,
|
|
map[string]interface{}{"value": x},
|
|
time.Now(),
|
|
)
|
|
if err == nil {
|
|
output <- y
|
|
}
|
|
|
|
// max temperature
|
|
if m.config.ReportMaxTemp && sensor.maxTemp != 0 {
|
|
y, err := lp.NewMessage(
|
|
sensor.maxTempName,
|
|
sensor.tags,
|
|
m.meta,
|
|
map[string]interface{}{"value": sensor.maxTemp},
|
|
time.Now(),
|
|
)
|
|
if err == nil {
|
|
output <- y
|
|
}
|
|
}
|
|
|
|
// critical temperature
|
|
if m.config.ReportCriticalTemp && sensor.critTemp != 0 {
|
|
y, err := lp.NewMessage(
|
|
sensor.critTempName,
|
|
sensor.tags,
|
|
m.meta,
|
|
map[string]interface{}{"value": sensor.critTemp},
|
|
time.Now(),
|
|
)
|
|
if err == nil {
|
|
output <- y
|
|
}
|
|
}
|
|
}
|
|
|
|
}
|
|
|
|
func (m *TempCollector) Close() {
|
|
m.init = false
|
|
}
|