mirror of
https://github.com/ClusterCockpit/cc-metric-collector.git
synced 2025-04-05 21:25:55 +02:00
add only_metrics. add max and critical temperatures
This commit is contained in:
parent
44ff56d0fe
commit
c53c5673a4
@ -9,8 +9,8 @@ import (
|
|||||||
"strings"
|
"strings"
|
||||||
"time"
|
"time"
|
||||||
|
|
||||||
|
lp "github.com/ClusterCockpit/cc-lib/ccMessage"
|
||||||
cclog "github.com/ClusterCockpit/cc-metric-collector/pkg/ccLogger"
|
cclog "github.com/ClusterCockpit/cc-metric-collector/pkg/ccLogger"
|
||||||
lp "github.com/ClusterCockpit/cc-energy-manager/pkg/cc-message"
|
|
||||||
)
|
)
|
||||||
|
|
||||||
// See: https://www.kernel.org/doc/html/latest/hwmon/sysfs-interface.html
|
// See: https://www.kernel.org/doc/html/latest/hwmon/sysfs-interface.html
|
||||||
@ -23,7 +23,7 @@ import (
|
|||||||
type TempCollectorSensor struct {
|
type TempCollectorSensor struct {
|
||||||
name string
|
name string
|
||||||
label string
|
label string
|
||||||
metricName string // Default: name_label
|
metricName string // Default: name_label, in lowercase with underscores
|
||||||
file string
|
file string
|
||||||
maxTempName string
|
maxTempName string
|
||||||
maxTemp int64
|
maxTemp int64
|
||||||
@ -32,19 +32,41 @@ type TempCollectorSensor struct {
|
|||||||
tags map[string]string
|
tags map[string]string
|
||||||
}
|
}
|
||||||
|
|
||||||
|
type TempCollectorConfig struct {
|
||||||
|
ExcludeMetrics []string `json:"exclude_metrics,omitempty"`
|
||||||
|
OnlyMetrics []string `json:"only_metrics,omitempty"`
|
||||||
|
TagOverride map[string]map[string]string `json:"tag_override,omitempty"`
|
||||||
|
ReportMaxTemp bool `json:"report_max_temperature"`
|
||||||
|
ReportCriticalTemp bool `json:"report_critical_temperature"`
|
||||||
|
}
|
||||||
|
|
||||||
type TempCollector struct {
|
type TempCollector struct {
|
||||||
metricCollector
|
metricCollector
|
||||||
config struct {
|
config TempCollectorConfig
|
||||||
ExcludeMetrics []string `json:"exclude_metrics"`
|
|
||||||
TagOverride map[string]map[string]string `json:"tag_override"`
|
|
||||||
ReportMaxTemp bool `json:"report_max_temperature"`
|
|
||||||
ReportCriticalTemp bool `json:"report_critical_temperature"`
|
|
||||||
}
|
|
||||||
sensors []*TempCollectorSensor
|
sensors []*TempCollectorSensor
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// shouldOutput returns true if the metric should be sent.
|
||||||
|
// If OnlyMetrics is set, only metrics in that list are output.
|
||||||
|
// Otherwise, metrics in ExcludeMetrics are skipped.
|
||||||
|
func (m *TempCollector) shouldOutput(metricName string) bool {
|
||||||
|
if len(m.config.OnlyMetrics) > 0 {
|
||||||
|
for _, name := range m.config.OnlyMetrics {
|
||||||
|
if name == metricName {
|
||||||
|
return true
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return false
|
||||||
|
}
|
||||||
|
for _, name := range m.config.ExcludeMetrics {
|
||||||
|
if name == metricName {
|
||||||
|
return false
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return true
|
||||||
|
}
|
||||||
|
|
||||||
func (m *TempCollector) Init(config json.RawMessage) error {
|
func (m *TempCollector) Init(config json.RawMessage) error {
|
||||||
// Check if already initialized
|
|
||||||
if m.init {
|
if m.init {
|
||||||
return nil
|
return nil
|
||||||
}
|
}
|
||||||
@ -53,8 +75,7 @@ func (m *TempCollector) Init(config json.RawMessage) error {
|
|||||||
m.parallel = true
|
m.parallel = true
|
||||||
m.setup()
|
m.setup()
|
||||||
if len(config) > 0 {
|
if len(config) > 0 {
|
||||||
err := json.Unmarshal(config, &m.config)
|
if err := json.Unmarshal(config, &m.config); err != nil {
|
||||||
if err != nil {
|
|
||||||
return err
|
return err
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -81,26 +102,23 @@ func (m *TempCollector) Init(config json.RawMessage) error {
|
|||||||
for _, file := range inputFiles {
|
for _, file := range inputFiles {
|
||||||
sensor := new(TempCollectorSensor)
|
sensor := new(TempCollectorSensor)
|
||||||
|
|
||||||
// sensor name
|
// Read sensor name from the "name" file
|
||||||
nameFile := filepath.Join(filepath.Dir(file), "name")
|
nameFile := filepath.Join(filepath.Dir(file), "name")
|
||||||
name, err := os.ReadFile(nameFile)
|
if data, err := os.ReadFile(nameFile); err == nil {
|
||||||
if err == nil {
|
sensor.name = strings.TrimSpace(string(data))
|
||||||
sensor.name = strings.TrimSpace(string(name))
|
|
||||||
}
|
}
|
||||||
|
|
||||||
// sensor label
|
// Read sensor label from the corresponding "_label" file
|
||||||
labelFile := strings.TrimSuffix(file, "_input") + "_label"
|
labelFile := strings.TrimSuffix(file, "_input") + "_label"
|
||||||
label, err := os.ReadFile(labelFile)
|
if data, err := os.ReadFile(labelFile); err == nil {
|
||||||
if err == nil {
|
sensor.label = strings.TrimSpace(string(data))
|
||||||
sensor.label = strings.TrimSpace(string(label))
|
|
||||||
}
|
}
|
||||||
|
|
||||||
// sensor metric name
|
// Determine sensor metric name
|
||||||
switch {
|
switch {
|
||||||
case len(sensor.name) == 0 && len(sensor.label) == 0:
|
case len(sensor.name) == 0 && len(sensor.label) == 0:
|
||||||
continue
|
continue
|
||||||
case sensor.name == "coretemp" && strings.HasPrefix(sensor.label, "Core ") ||
|
case sensor.name == "coretemp" && (strings.HasPrefix(sensor.label, "Core ") || strings.HasPrefix(sensor.label, "Package id ")):
|
||||||
sensor.name == "coretemp" && strings.HasPrefix(sensor.label, "Package id "):
|
|
||||||
sensor.metricName = "temp_" + sensor.label
|
sensor.metricName = "temp_" + sensor.label
|
||||||
case len(sensor.name) != 0 && len(sensor.label) != 0:
|
case len(sensor.name) != 0 && len(sensor.label) != 0:
|
||||||
sensor.metricName = sensor.name + "_" + sensor.label
|
sensor.metricName = sensor.name + "_" + sensor.label
|
||||||
@ -111,24 +129,21 @@ func (m *TempCollector) Init(config json.RawMessage) error {
|
|||||||
}
|
}
|
||||||
sensor.metricName = strings.ToLower(sensor.metricName)
|
sensor.metricName = strings.ToLower(sensor.metricName)
|
||||||
sensor.metricName = strings.Replace(sensor.metricName, " ", "_", -1)
|
sensor.metricName = strings.Replace(sensor.metricName, " ", "_", -1)
|
||||||
// Add temperature prefix, if required
|
|
||||||
if !strings.Contains(sensor.metricName, "temp") {
|
if !strings.Contains(sensor.metricName, "temp") {
|
||||||
sensor.metricName = "temp_" + sensor.metricName
|
sensor.metricName = "temp_" + sensor.metricName
|
||||||
}
|
}
|
||||||
|
|
||||||
// Sensor file
|
// Verify sensor file exists
|
||||||
_, err = os.ReadFile(file)
|
if _, err := os.ReadFile(file); err != nil {
|
||||||
if err != nil {
|
|
||||||
continue
|
continue
|
||||||
}
|
}
|
||||||
sensor.file = file
|
sensor.file = file
|
||||||
|
|
||||||
// Sensor tags
|
// Set default sensor tags
|
||||||
sensor.tags = map[string]string{
|
sensor.tags = map[string]string{
|
||||||
"type": "node",
|
"type": "node",
|
||||||
}
|
}
|
||||||
|
// Apply tag override configuration if applicable
|
||||||
// Apply tag override configuration
|
|
||||||
for key, newtags := range m.config.TagOverride {
|
for key, newtags := range m.config.TagOverride {
|
||||||
if strings.Contains(sensor.file, key) {
|
if strings.Contains(sensor.file, key) {
|
||||||
sensor.tags = newtags
|
sensor.tags = newtags
|
||||||
@ -136,7 +151,7 @@ func (m *TempCollector) Init(config json.RawMessage) error {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// max temperature
|
// Read max temperature if enabled
|
||||||
if m.config.ReportMaxTemp {
|
if m.config.ReportMaxTemp {
|
||||||
maxTempFile := strings.TrimSuffix(file, "_input") + "_max"
|
maxTempFile := strings.TrimSuffix(file, "_input") + "_max"
|
||||||
if buffer, err := os.ReadFile(maxTempFile); err == nil {
|
if buffer, err := os.ReadFile(maxTempFile); err == nil {
|
||||||
@ -147,7 +162,7 @@ func (m *TempCollector) Init(config json.RawMessage) error {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// critical temperature
|
// Read critical temperature if enabled
|
||||||
if m.config.ReportCriticalTemp {
|
if m.config.ReportCriticalTemp {
|
||||||
criticalTempFile := strings.TrimSuffix(file, "_input") + "_crit"
|
criticalTempFile := strings.TrimSuffix(file, "_input") + "_crit"
|
||||||
if buffer, err := os.ReadFile(criticalTempFile); err == nil {
|
if buffer, err := os.ReadFile(criticalTempFile); err == nil {
|
||||||
@ -161,75 +176,52 @@ func (m *TempCollector) Init(config json.RawMessage) error {
|
|||||||
m.sensors = append(m.sensors, sensor)
|
m.sensors = append(m.sensors, sensor)
|
||||||
}
|
}
|
||||||
|
|
||||||
// Empty sensors map
|
|
||||||
if len(m.sensors) == 0 {
|
if len(m.sensors) == 0 {
|
||||||
return fmt.Errorf("no temperature sensors found")
|
return fmt.Errorf("no temperature sensors found")
|
||||||
}
|
}
|
||||||
|
|
||||||
// Finished initialization
|
|
||||||
m.init = true
|
m.init = true
|
||||||
return nil
|
return nil
|
||||||
}
|
}
|
||||||
|
|
||||||
func (m *TempCollector) Read(interval time.Duration, output chan lp.CCMessage) {
|
func (m *TempCollector) Read(interval time.Duration, output chan lp.CCMessage) {
|
||||||
|
// For each sensor, read temperature and send metric if allowed.
|
||||||
for _, sensor := range m.sensors {
|
for _, sensor := range m.sensors {
|
||||||
// Read sensor file
|
// Read sensor file
|
||||||
buffer, err := os.ReadFile(sensor.file)
|
buffer, err := os.ReadFile(sensor.file)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
cclog.ComponentError(
|
cclog.ComponentError(m.name, fmt.Sprintf("Read(): Failed to read file '%s': %v", sensor.file, err))
|
||||||
m.name,
|
|
||||||
fmt.Sprintf("Read(): Failed to read file '%s': %v", sensor.file, err))
|
|
||||||
continue
|
continue
|
||||||
}
|
}
|
||||||
x, err := strconv.ParseInt(strings.TrimSpace(string(buffer)), 10, 64)
|
x, err := strconv.ParseInt(strings.TrimSpace(string(buffer)), 10, 64)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
cclog.ComponentError(
|
cclog.ComponentError(m.name, fmt.Sprintf("Read(): Failed to convert temperature '%s' to int64: %v", buffer, err))
|
||||||
m.name,
|
|
||||||
fmt.Sprintf("Read(): Failed to convert temperature '%s' to int64: %v", buffer, err))
|
|
||||||
continue
|
continue
|
||||||
}
|
}
|
||||||
x /= 1000
|
x /= 1000
|
||||||
y, err := lp.NewMessage(
|
if m.shouldOutput(sensor.metricName) {
|
||||||
sensor.metricName,
|
y, err := lp.NewMessage(sensor.metricName, sensor.tags, m.meta, map[string]interface{}{"value": x}, time.Now())
|
||||||
sensor.tags,
|
|
||||||
m.meta,
|
|
||||||
map[string]interface{}{"value": x},
|
|
||||||
time.Now(),
|
|
||||||
)
|
|
||||||
if err == nil {
|
|
||||||
output <- y
|
|
||||||
}
|
|
||||||
|
|
||||||
// max temperature
|
|
||||||
if m.config.ReportMaxTemp && sensor.maxTemp != 0 {
|
|
||||||
y, err := lp.NewMessage(
|
|
||||||
sensor.maxTempName,
|
|
||||||
sensor.tags,
|
|
||||||
m.meta,
|
|
||||||
map[string]interface{}{"value": sensor.maxTemp},
|
|
||||||
time.Now(),
|
|
||||||
)
|
|
||||||
if err == nil {
|
if err == nil {
|
||||||
output <- y
|
output <- y
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// critical temperature
|
// Send max temperature if enabled and available
|
||||||
if m.config.ReportCriticalTemp && sensor.critTemp != 0 {
|
if m.config.ReportMaxTemp && sensor.maxTemp != 0 && m.shouldOutput(sensor.maxTempName) {
|
||||||
y, err := lp.NewMessage(
|
y, err := lp.NewMessage(sensor.maxTempName, sensor.tags, m.meta, map[string]interface{}{"value": sensor.maxTemp}, time.Now())
|
||||||
sensor.critTempName,
|
if err == nil {
|
||||||
sensor.tags,
|
output <- y
|
||||||
m.meta,
|
}
|
||||||
map[string]interface{}{"value": sensor.critTemp},
|
}
|
||||||
time.Now(),
|
|
||||||
)
|
// Send critical temperature if enabled and available
|
||||||
|
if m.config.ReportCriticalTemp && sensor.critTemp != 0 && m.shouldOutput(sensor.critTempName) {
|
||||||
|
y, err := lp.NewMessage(sensor.critTempName, sensor.tags, m.meta, map[string]interface{}{"value": sensor.critTemp}, time.Now())
|
||||||
if err == nil {
|
if err == nil {
|
||||||
output <- y
|
output <- y
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
func (m *TempCollector) Close() {
|
func (m *TempCollector) Close() {
|
||||||
|
@ -1,22 +1,35 @@
|
|||||||
|
## tempstat collector
|
||||||
|
|
||||||
## `tempstat` collector
|
```json
|
||||||
|
|
||||||
```json
|
|
||||||
"tempstat": {
|
"tempstat": {
|
||||||
"tag_override" : {
|
"tag_override": {
|
||||||
"<device like hwmon1>" : {
|
"<device identifier>": {
|
||||||
"type" : "socket",
|
"type": "socket",
|
||||||
"type-id" : "0"
|
"type-id": "0"
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
"exclude_metrics": [
|
"exclude_metrics": [
|
||||||
"metric1",
|
"metric1",
|
||||||
"metric2"
|
"metric2"
|
||||||
]
|
],
|
||||||
|
"only_metrics": [
|
||||||
|
"temp_core_0",
|
||||||
|
"temp_core_1"
|
||||||
|
],
|
||||||
|
"report_max_temperature": true,
|
||||||
|
"report_critical_temperature": true
|
||||||
}
|
}
|
||||||
```
|
```
|
||||||
|
|
||||||
The `tempstat` collector reads the data from `/sys/class/hwmon/<device>/tempX_{input,label}`
|
The `tempstat` collector reads the data from `/sys/class/hwmon/<device>/tempX_{input,label}`.
|
||||||
|
|
||||||
|
Both filtering mechanisms are supported:
|
||||||
|
- `exclude_metrics`: Excludes the specified metrics.
|
||||||
|
- `only_metrics`: If provided, only the listed metrics are collected. This takes precedence over `exclude_metrics`.
|
||||||
|
|
||||||
Metrics:
|
Metrics:
|
||||||
* `temp_*`: The metric name is taken from the `label` files.
|
- `temp_*`: The metric name is taken from the label files.
|
||||||
|
|
||||||
|
Optional additional metrics:
|
||||||
|
- **Max Temperature:** If `report_max_temperature` is enabled, the collector also reads the maximum temperature from the corresponding `_max` file. The metric name is derived by replacing "temp" with "max_temp" in the sensor's metric name.
|
||||||
|
- **Critical Temperature:** If `report_critical_temperature` is enabled, the collector also reads the critical temperature from the corresponding `_crit` file. The metric name is derived by replacing "temp" with "crit_temp" in the sensor's metric name.
|
||||||
|
Loading…
x
Reference in New Issue
Block a user