// Source: cc-metric-collector/collectors/tempMetric.go

package collectors
import (
"encoding/json"
"fmt"
"os"
"path/filepath"
"strconv"
"strings"
"time"
cclog "github.com/ClusterCockpit/cc-metric-collector/pkg/ccLogger"
lp "github.com/ClusterCockpit/cc-metric-collector/pkg/ccMetric"
)

// See: https://www.kernel.org/doc/html/latest/hwmon/sysfs-interface.html
// /sys/class/hwmon/hwmon*/name -> coretemp
// /sys/class/hwmon/hwmon*/temp*_label -> Core 0
// /sys/class/hwmon/hwmon*/temp*_input -> 27800 = 27.8°C
// /sys/class/hwmon/hwmon*/temp*_max -> 86000 = 86.0°C
// /sys/class/hwmon/hwmon*/temp*_crit -> 100000 = 100.0°C

// TempCollectorSensor holds everything the collector knows about one hwmon
// temperature sensor (one temp*_input file).
type TempCollectorSensor struct {
	// name is the trimmed content of the "name" file in the sensor's hwmon
	// directory (e.g. "coretemp"); empty if that file could not be read.
	name string
	// label is the trimmed content of the matching temp*_label file
	// (e.g. "Core 0"); empty if that file could not be read.
	label string
	// metricName is the name under which the reading is published.
	// Default: name_label
	metricName string
	// file is the path of the temp*_input file read on every collection.
	file string
	// maxTempName is the metric name used for the max threshold; it is only
	// set when the temp*_max file was read and parsed successfully.
	maxTempName string
	// maxTemp is the temp*_max value divided by 1000 (integer division).
	maxTemp int64
	// critTempName is the metric name used for the critical threshold; it is
	// only set when the temp*_crit file was read and parsed successfully.
	critTempName string
	// critTemp is the temp*_crit value divided by 1000 (integer division).
	critTemp int64
	// tags are attached to every metric of this sensor.
	tags map[string]string
}
type TempCollector struct {
Modularize the whole thing (#16) * Use channels, add a metric router, split up configuration and use extended version of Influx line protocol internally * Use central timer for collectors and router. Add expressions to router * Add expression to router config * Update entry points * Start with README * Update README for CCMetric * Formatting * Update README.md * Add README for MultiChanTicker * Add README for MultiChanTicker * Update README.md * Add README to metric router * Update main README * Remove SinkEntity type * Update README for sinks * Update go files * Update README for receivers * Update collectors README * Update collectors README * Use seperate page per collector * Fix for tempstat page * Add docs for customcmd collector * Add docs for ipmistat collector * Add docs for topprocs collector * Update customCmdMetric.md * Use seconds when calculating LIKWID metrics * Add IB metrics ib_recv_pkts and ib_xmit_pkts * Drop domain part of host name * Updated to latest stable version of likwid * Define source code dependencies in Makefile * Add GPFS / IBM Spectrum Scale collector * Add vet and staticcheck make targets * Add vet and staticcheck make targets * Avoid go vet warning: struct field tag `json:"..., omitempty"` not compatible with reflect.StructTag.Get: suspicious space in struct tag value struct field tag `json:"...", omitempty` not compatible with reflect.StructTag.Get: key:"value" pairs not separated by spaces * Add sample collector to README.md * Add CPU frequency collector * Avoid staticcheck warning: redundant return statement * Avoid staticcheck warning: unnecessary assignment to the blank identifier * Simplified code * Add CPUFreqCollectorCpuinfo a metric collector to measure the current frequency of the CPUs as obtained from /proc/cpuinfo Only measure on the first hyperthread * Add collector for NFS clients * Move publication of metrics into Flush() for NatsSink * Update GitHub actions * Refactoring * Avoid vet warning: Println arg list ends 
with redundant newline * Avoid vet warning struct field commands has json tag but is not exported * Avoid vet warning: return copies lock value. * Corrected typo * Refactoring * Add go sources in internal/... * Bad separator in Makefile * Fix Infiniband collector Co-authored-by: Holger Obermaier <40787752+ho-ob@users.noreply.github.com>
2022-01-25 15:37:43 +01:00
metricCollector
2022-02-14 10:46:05 +01:00
config struct {
ExcludeMetrics []string `json:"exclude_metrics"`
TagOverride map[string]map[string]string `json:"tag_override"`
ReportMaxTemp bool `json:"report_max_temperature"`
ReportCriticalTemp bool `json:"report_critical_temperature"`
2022-02-14 10:46:05 +01:00
}
sensors []*TempCollectorSensor
}
func (m *TempCollector) Init(config json.RawMessage) error {
2022-02-14 10:46:05 +01:00
// Check if already initialized
if m.init {
return nil
}
m.name = "TempCollector"
m.parallel = true
m.setup()
if len(config) > 0 {
err := json.Unmarshal(config, &m.config)
if err != nil {
return err
}
}
2022-02-14 10:46:05 +01:00
m.meta = map[string]string{
"source": m.name,
"group": "IPMI",
"unit": "degC",
}
m.sensors = make([]*TempCollectorSensor, 0)
2022-02-11 17:17:10 +01:00
// Find all temperature sensor files
2022-02-14 10:46:05 +01:00
globPattern := filepath.Join("/sys/class/hwmon", "*", "temp*_input")
2022-02-11 17:17:10 +01:00
inputFiles, err := filepath.Glob(globPattern)
if err != nil {
2022-03-15 16:38:20 +01:00
return fmt.Errorf("unable to glob files with pattern '%s': %v", globPattern, err)
2022-02-11 17:17:10 +01:00
}
if inputFiles == nil {
2022-03-15 16:38:20 +01:00
return fmt.Errorf("unable to find any files with pattern '%s'", globPattern)
}
2022-02-11 17:17:10 +01:00
// Get sensor name for each temperature sensor file
for _, file := range inputFiles {
2022-02-14 10:46:05 +01:00
sensor := new(TempCollectorSensor)
// sensor name
nameFile := filepath.Join(filepath.Dir(file), "name")
2022-10-09 17:03:38 +02:00
name, err := os.ReadFile(nameFile)
if err == nil {
2022-02-14 10:46:05 +01:00
sensor.name = strings.TrimSpace(string(name))
}
2022-02-14 10:46:05 +01:00
// sensor label
labelFile := strings.TrimSuffix(file, "_input") + "_label"
2022-10-09 17:03:38 +02:00
label, err := os.ReadFile(labelFile)
if err == nil {
2022-02-14 10:46:05 +01:00
sensor.label = strings.TrimSpace(string(label))
}
2022-02-14 10:46:05 +01:00
// sensor metric name
switch {
2022-02-14 10:46:05 +01:00
case len(sensor.name) == 0 && len(sensor.label) == 0:
continue
case sensor.name == "coretemp" && strings.HasPrefix(sensor.label, "Core ") ||
sensor.name == "coretemp" && strings.HasPrefix(sensor.label, "Package id "):
sensor.metricName = "temp_" + sensor.label
2022-02-14 10:46:05 +01:00
case len(sensor.name) != 0 && len(sensor.label) != 0:
sensor.metricName = sensor.name + "_" + sensor.label
case len(sensor.name) != 0:
sensor.metricName = sensor.name
case len(sensor.label) != 0:
sensor.metricName = sensor.label
}
sensor.metricName = strings.ToLower(sensor.metricName)
2022-02-14 10:46:05 +01:00
sensor.metricName = strings.Replace(sensor.metricName, " ", "_", -1)
// Add temperature prefix, if required
if !strings.Contains(sensor.metricName, "temp") {
sensor.metricName = "temp_" + sensor.metricName
}
// Sensor file
2022-10-09 17:03:38 +02:00
_, err = os.ReadFile(file)
if err != nil {
continue
}
2022-02-14 10:46:05 +01:00
sensor.file = file
// Sensor tags
sensor.tags = map[string]string{
"type": "node",
}
2022-02-14 10:46:05 +01:00
// Apply tag override configuration
for key, newtags := range m.config.TagOverride {
if strings.Contains(sensor.file, key) {
sensor.tags = newtags
break
}
2022-02-11 17:17:10 +01:00
}
2022-02-14 10:46:05 +01:00
// max temperature
if m.config.ReportMaxTemp {
maxTempFile := strings.TrimSuffix(file, "_input") + "_max"
2022-10-09 17:03:38 +02:00
if buffer, err := os.ReadFile(maxTempFile); err == nil {
if x, err := strconv.ParseInt(strings.TrimSpace(string(buffer)), 10, 64); err == nil {
sensor.maxTempName = strings.Replace(sensor.metricName, "temp", "max_temp", 1)
sensor.maxTemp = x / 1000
}
}
}
// critical temperature
if m.config.ReportCriticalTemp {
criticalTempFile := strings.TrimSuffix(file, "_input") + "_crit"
2022-10-09 17:03:38 +02:00
if buffer, err := os.ReadFile(criticalTempFile); err == nil {
if x, err := strconv.ParseInt(strings.TrimSpace(string(buffer)), 10, 64); err == nil {
sensor.critTempName = strings.Replace(sensor.metricName, "temp", "crit_temp", 1)
sensor.critTemp = x / 1000
}
}
}
2022-02-14 10:46:05 +01:00
m.sensors = append(m.sensors, sensor)
}
2022-02-11 17:17:10 +01:00
// Empty sensors map
if len(m.sensors) == 0 {
2022-03-15 16:38:20 +01:00
return fmt.Errorf("no temperature sensors found")
}
// Finished initialization
2022-02-11 17:17:10 +01:00
m.init = true
return nil
}
func (m *TempCollector) Read(interval time.Duration, output chan lp.CCMetric) {
2022-02-14 10:46:05 +01:00
for _, sensor := range m.sensors {
// Read sensor file
2022-10-09 17:03:38 +02:00
buffer, err := os.ReadFile(sensor.file)
2022-02-11 17:17:10 +01:00
if err != nil {
2022-02-14 10:46:05 +01:00
cclog.ComponentError(
m.name,
fmt.Sprintf("Read(): Failed to read file '%s': %v", sensor.file, err))
2022-02-11 17:17:10 +01:00
continue
}
x, err := strconv.ParseInt(strings.TrimSpace(string(buffer)), 10, 64)
2022-02-14 10:46:05 +01:00
if err != nil {
cclog.ComponentError(
m.name,
fmt.Sprintf("Read(): Failed to convert temperature '%s' to int64: %v", buffer, err))
continue
}
x /= 1000
y, err := lp.New(
sensor.metricName,
sensor.tags,
m.meta,
map[string]interface{}{"value": x},
time.Now(),
)
2022-02-11 17:17:10 +01:00
if err == nil {
2022-02-14 10:46:05 +01:00
output <- y
}
// max temperature
if m.config.ReportMaxTemp && sensor.maxTemp != 0 {
y, err := lp.New(
sensor.maxTempName,
sensor.tags,
m.meta,
map[string]interface{}{"value": sensor.maxTemp},
time.Now(),
)
if err == nil {
output <- y
}
}
// critical temperature
if m.config.ReportCriticalTemp && sensor.critTemp != 0 {
y, err := lp.New(
sensor.critTempName,
sensor.tags,
m.meta,
map[string]interface{}{"value": sensor.critTemp},
time.Now(),
)
if err == nil {
output <- y
}
}
}
2022-02-11 17:17:10 +01:00
}
// Close resets the initialization flag so that a subsequent Init call runs
// the full setup again. The discovered sensors are left untouched.
func (m *TempCollector) Close() {
	m.init = false
}