add only_metrics. add max and critical temperatures

This commit is contained in:
brinkcoder 2025-03-05 01:38:36 +01:00
parent 44ff56d0fe
commit c53c5673a4
2 changed files with 85 additions and 80 deletions

View File

@ -9,8 +9,8 @@ import (
"strings" "strings"
"time" "time"
lp "github.com/ClusterCockpit/cc-lib/ccMessage"
cclog "github.com/ClusterCockpit/cc-metric-collector/pkg/ccLogger" cclog "github.com/ClusterCockpit/cc-metric-collector/pkg/ccLogger"
lp "github.com/ClusterCockpit/cc-energy-manager/pkg/cc-message"
) )
// See: https://www.kernel.org/doc/html/latest/hwmon/sysfs-interface.html // See: https://www.kernel.org/doc/html/latest/hwmon/sysfs-interface.html
@ -23,7 +23,7 @@ import (
type TempCollectorSensor struct { type TempCollectorSensor struct {
name string name string
label string label string
metricName string // Default: name_label metricName string // Default: name_label, in lowercase with underscores
file string file string
maxTempName string maxTempName string
maxTemp int64 maxTemp int64
@ -32,19 +32,41 @@ type TempCollectorSensor struct {
tags map[string]string tags map[string]string
} }
// TempCollectorConfig is the JSON configuration of the temperature collector.
// It is filled by json.Unmarshal in Init when a non-empty config is passed.
type TempCollectorConfig struct {
// ExcludeMetrics lists metric names that must not be sent. It is only
// consulted when OnlyMetrics is empty (see shouldOutput).
ExcludeMetrics []string `json:"exclude_metrics,omitempty"`
// OnlyMetrics, when non-empty, is an allow-list: only metrics whose name
// is listed here are sent, and ExcludeMetrics is ignored.
OnlyMetrics []string `json:"only_metrics,omitempty"`
// TagOverride maps a key to a replacement tag set. In Init, a sensor whose
// input file path contains the key gets its tags replaced by that set.
TagOverride map[string]map[string]string `json:"tag_override,omitempty"`
// ReportMaxTemp enables reading the sensor's "_max" file in Init and
// sending the value (when non-zero) as an additional metric in Read.
ReportMaxTemp bool `json:"report_max_temperature"`
// ReportCriticalTemp enables reading the sensor's "_crit" file in Init and
// sending the value (when non-zero) as an additional metric in Read.
ReportCriticalTemp bool `json:"report_critical_temperature"`
}
type TempCollector struct { type TempCollector struct {
metricCollector metricCollector
config struct { config TempCollectorConfig
ExcludeMetrics []string `json:"exclude_metrics"`
TagOverride map[string]map[string]string `json:"tag_override"`
ReportMaxTemp bool `json:"report_max_temperature"`
ReportCriticalTemp bool `json:"report_critical_temperature"`
}
sensors []*TempCollectorSensor sensors []*TempCollectorSensor
} }
// shouldOutput reports whether the metric called metricName may be sent.
//
// A non-empty OnlyMetrics list acts as an allow-list: the metric is sent only
// if it is listed there, and ExcludeMetrics is not consulted at all.
// With an empty OnlyMetrics list, the metric is sent unless it appears in
// ExcludeMetrics. Names are compared by exact string equality.
func (m *TempCollector) shouldOutput(metricName string) bool {
	// listed reports whether metricName occurs in names.
	listed := func(names []string) bool {
		for i := range names {
			if names[i] == metricName {
				return true
			}
		}
		return false
	}

	if allow := m.config.OnlyMetrics; len(allow) != 0 {
		return listed(allow)
	}
	return !listed(m.config.ExcludeMetrics)
}
func (m *TempCollector) Init(config json.RawMessage) error { func (m *TempCollector) Init(config json.RawMessage) error {
// Check if already initialized
if m.init { if m.init {
return nil return nil
} }
@ -53,8 +75,7 @@ func (m *TempCollector) Init(config json.RawMessage) error {
m.parallel = true m.parallel = true
m.setup() m.setup()
if len(config) > 0 { if len(config) > 0 {
err := json.Unmarshal(config, &m.config) if err := json.Unmarshal(config, &m.config); err != nil {
if err != nil {
return err return err
} }
} }
@ -81,26 +102,23 @@ func (m *TempCollector) Init(config json.RawMessage) error {
for _, file := range inputFiles { for _, file := range inputFiles {
sensor := new(TempCollectorSensor) sensor := new(TempCollectorSensor)
// sensor name // Read sensor name from the "name" file
nameFile := filepath.Join(filepath.Dir(file), "name") nameFile := filepath.Join(filepath.Dir(file), "name")
name, err := os.ReadFile(nameFile) if data, err := os.ReadFile(nameFile); err == nil {
if err == nil { sensor.name = strings.TrimSpace(string(data))
sensor.name = strings.TrimSpace(string(name))
} }
// sensor label // Read sensor label from the corresponding "_label" file
labelFile := strings.TrimSuffix(file, "_input") + "_label" labelFile := strings.TrimSuffix(file, "_input") + "_label"
label, err := os.ReadFile(labelFile) if data, err := os.ReadFile(labelFile); err == nil {
if err == nil { sensor.label = strings.TrimSpace(string(data))
sensor.label = strings.TrimSpace(string(label))
} }
// sensor metric name // Determine sensor metric name
switch { switch {
case len(sensor.name) == 0 && len(sensor.label) == 0: case len(sensor.name) == 0 && len(sensor.label) == 0:
continue continue
case sensor.name == "coretemp" && strings.HasPrefix(sensor.label, "Core ") || case sensor.name == "coretemp" && (strings.HasPrefix(sensor.label, "Core ") || strings.HasPrefix(sensor.label, "Package id ")):
sensor.name == "coretemp" && strings.HasPrefix(sensor.label, "Package id "):
sensor.metricName = "temp_" + sensor.label sensor.metricName = "temp_" + sensor.label
case len(sensor.name) != 0 && len(sensor.label) != 0: case len(sensor.name) != 0 && len(sensor.label) != 0:
sensor.metricName = sensor.name + "_" + sensor.label sensor.metricName = sensor.name + "_" + sensor.label
@ -111,24 +129,21 @@ func (m *TempCollector) Init(config json.RawMessage) error {
} }
sensor.metricName = strings.ToLower(sensor.metricName) sensor.metricName = strings.ToLower(sensor.metricName)
sensor.metricName = strings.Replace(sensor.metricName, " ", "_", -1) sensor.metricName = strings.Replace(sensor.metricName, " ", "_", -1)
// Add temperature prefix, if required
if !strings.Contains(sensor.metricName, "temp") { if !strings.Contains(sensor.metricName, "temp") {
sensor.metricName = "temp_" + sensor.metricName sensor.metricName = "temp_" + sensor.metricName
} }
// Sensor file // Verify sensor file exists
_, err = os.ReadFile(file) if _, err := os.ReadFile(file); err != nil {
if err != nil {
continue continue
} }
sensor.file = file sensor.file = file
// Sensor tags // Set default sensor tags
sensor.tags = map[string]string{ sensor.tags = map[string]string{
"type": "node", "type": "node",
} }
// Apply tag override configuration if applicable
// Apply tag override configuration
for key, newtags := range m.config.TagOverride { for key, newtags := range m.config.TagOverride {
if strings.Contains(sensor.file, key) { if strings.Contains(sensor.file, key) {
sensor.tags = newtags sensor.tags = newtags
@ -136,7 +151,7 @@ func (m *TempCollector) Init(config json.RawMessage) error {
} }
} }
// max temperature // Read max temperature if enabled
if m.config.ReportMaxTemp { if m.config.ReportMaxTemp {
maxTempFile := strings.TrimSuffix(file, "_input") + "_max" maxTempFile := strings.TrimSuffix(file, "_input") + "_max"
if buffer, err := os.ReadFile(maxTempFile); err == nil { if buffer, err := os.ReadFile(maxTempFile); err == nil {
@ -147,7 +162,7 @@ func (m *TempCollector) Init(config json.RawMessage) error {
} }
} }
// critical temperature // Read critical temperature if enabled
if m.config.ReportCriticalTemp { if m.config.ReportCriticalTemp {
criticalTempFile := strings.TrimSuffix(file, "_input") + "_crit" criticalTempFile := strings.TrimSuffix(file, "_input") + "_crit"
if buffer, err := os.ReadFile(criticalTempFile); err == nil { if buffer, err := os.ReadFile(criticalTempFile); err == nil {
@ -161,75 +176,52 @@ func (m *TempCollector) Init(config json.RawMessage) error {
m.sensors = append(m.sensors, sensor) m.sensors = append(m.sensors, sensor)
} }
// Empty sensors map
if len(m.sensors) == 0 { if len(m.sensors) == 0 {
return fmt.Errorf("no temperature sensors found") return fmt.Errorf("no temperature sensors found")
} }
// Finished initialization
m.init = true m.init = true
return nil return nil
} }
func (m *TempCollector) Read(interval time.Duration, output chan lp.CCMessage) { func (m *TempCollector) Read(interval time.Duration, output chan lp.CCMessage) {
// For each sensor, read temperature and send metric if allowed.
for _, sensor := range m.sensors { for _, sensor := range m.sensors {
// Read sensor file // Read sensor file
buffer, err := os.ReadFile(sensor.file) buffer, err := os.ReadFile(sensor.file)
if err != nil { if err != nil {
cclog.ComponentError( cclog.ComponentError(m.name, fmt.Sprintf("Read(): Failed to read file '%s': %v", sensor.file, err))
m.name,
fmt.Sprintf("Read(): Failed to read file '%s': %v", sensor.file, err))
continue continue
} }
x, err := strconv.ParseInt(strings.TrimSpace(string(buffer)), 10, 64) x, err := strconv.ParseInt(strings.TrimSpace(string(buffer)), 10, 64)
if err != nil { if err != nil {
cclog.ComponentError( cclog.ComponentError(m.name, fmt.Sprintf("Read(): Failed to convert temperature '%s' to int64: %v", buffer, err))
m.name,
fmt.Sprintf("Read(): Failed to convert temperature '%s' to int64: %v", buffer, err))
continue continue
} }
x /= 1000 x /= 1000
y, err := lp.NewMessage( if m.shouldOutput(sensor.metricName) {
sensor.metricName, y, err := lp.NewMessage(sensor.metricName, sensor.tags, m.meta, map[string]interface{}{"value": x}, time.Now())
sensor.tags,
m.meta,
map[string]interface{}{"value": x},
time.Now(),
)
if err == nil {
output <- y
}
// max temperature
if m.config.ReportMaxTemp && sensor.maxTemp != 0 {
y, err := lp.NewMessage(
sensor.maxTempName,
sensor.tags,
m.meta,
map[string]interface{}{"value": sensor.maxTemp},
time.Now(),
)
if err == nil { if err == nil {
output <- y output <- y
} }
} }
// critical temperature // Send max temperature if enabled and available
if m.config.ReportCriticalTemp && sensor.critTemp != 0 { if m.config.ReportMaxTemp && sensor.maxTemp != 0 && m.shouldOutput(sensor.maxTempName) {
y, err := lp.NewMessage( y, err := lp.NewMessage(sensor.maxTempName, sensor.tags, m.meta, map[string]interface{}{"value": sensor.maxTemp}, time.Now())
sensor.critTempName, if err == nil {
sensor.tags, output <- y
m.meta, }
map[string]interface{}{"value": sensor.critTemp}, }
time.Now(),
) // Send critical temperature if enabled and available
if m.config.ReportCriticalTemp && sensor.critTemp != 0 && m.shouldOutput(sensor.critTempName) {
y, err := lp.NewMessage(sensor.critTempName, sensor.tags, m.meta, map[string]interface{}{"value": sensor.critTemp}, time.Now())
if err == nil { if err == nil {
output <- y output <- y
} }
} }
} }
} }
func (m *TempCollector) Close() { func (m *TempCollector) Close() {

View File

@ -1,22 +1,35 @@
## tempstat collector
## `tempstat` collector ```json{
```json
"tempstat": { "tempstat": {
"tag_override" : { "tag_override": {
"<device like hwmon1>" : { "<device identifier>": {
"type" : "socket", "type": "socket",
"type-id" : "0" "type-id": "0"
} }
}, },
"exclude_metrics": [ "exclude_metrics": [
"metric1", "metric1",
"metric2" "metric2"
] ],
"only_metrics": [
"temp_core_0",
"temp_core_1"
],
"report_max_temperature": true,
"report_critical_temperature": true
} }
``` ```
The `tempstat` collector reads the data from `/sys/class/hwmon/<device>/tempX_{input,label}` The `tempstat` collector reads the data from `/sys/class/hwmon/<device>/tempX_{input,label}`.
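Two filtering mechanisms are supported. Both compare against the final metric name (lowercase, spaces replaced by underscores) using an exact match:
- `exclude_metrics`: Metrics listed here are not sent.
- `only_metrics`: If provided and non-empty, only the listed metrics are sent. This takes precedence over `exclude_metrics`, which is then ignored.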
Metrics: Metrics:
* `temp_*`: The metric name is taken from the `label` files. - `temp_*`: The metric name is taken from the label files.
Optional additional metrics (sent only when the value read is non-zero, and subject to the same `exclude_metrics`/`only_metrics` filtering as all other metrics — when `only_metrics` is set, their names must be listed there as well):
- **Max Temperature:** If `report_max_temperature` is enabled, the collector also reads the maximum temperature from the corresponding `_max` file. The metric name is derived by replacing "temp" with "max_temp" in the sensor's metric name.
- **Critical Temperature:** If `report_critical_temperature` is enabled, the collector also reads the critical temperature from the corresponding `_crit` file. The metric name is derived by replacing "temp" with "crit_temp" in the sensor's metric name.