mirror of
				https://github.com/ClusterCockpit/cc-metric-collector.git
				synced 2025-10-30 08:35:06 +01:00 
			
		
		
		
	add only_metrics. add max and critical temperatures
This commit is contained in:
		| @@ -9,8 +9,8 @@ import ( | |||||||
| 	"strings" | 	"strings" | ||||||
| 	"time" | 	"time" | ||||||
|  |  | ||||||
|  | 	lp "github.com/ClusterCockpit/cc-lib/ccMessage" | ||||||
| 	cclog "github.com/ClusterCockpit/cc-metric-collector/pkg/ccLogger" | 	cclog "github.com/ClusterCockpit/cc-metric-collector/pkg/ccLogger" | ||||||
| 	lp "github.com/ClusterCockpit/cc-energy-manager/pkg/cc-message" |  | ||||||
| ) | ) | ||||||
|  |  | ||||||
| // See: https://www.kernel.org/doc/html/latest/hwmon/sysfs-interface.html | // See: https://www.kernel.org/doc/html/latest/hwmon/sysfs-interface.html | ||||||
| @@ -23,7 +23,7 @@ import ( | |||||||
| type TempCollectorSensor struct { | type TempCollectorSensor struct { | ||||||
| 	name         string | 	name         string | ||||||
| 	label        string | 	label        string | ||||||
| 	metricName   string // Default: name_label | 	metricName   string // Default: name_label, in lowercase with underscores | ||||||
| 	file         string | 	file         string | ||||||
| 	maxTempName  string | 	maxTempName  string | ||||||
| 	maxTemp      int64 | 	maxTemp      int64 | ||||||
| @@ -32,19 +32,41 @@ type TempCollectorSensor struct { | |||||||
| 	tags         map[string]string | 	tags         map[string]string | ||||||
| } | } | ||||||
|  |  | ||||||
|  | type TempCollectorConfig struct { | ||||||
|  | 	ExcludeMetrics     []string                     `json:"exclude_metrics,omitempty"` | ||||||
|  | 	OnlyMetrics        []string                     `json:"only_metrics,omitempty"` | ||||||
|  | 	TagOverride        map[string]map[string]string `json:"tag_override,omitempty"` | ||||||
|  | 	ReportMaxTemp      bool                         `json:"report_max_temperature"` | ||||||
|  | 	ReportCriticalTemp bool                         `json:"report_critical_temperature"` | ||||||
|  | } | ||||||
|  |  | ||||||
| type TempCollector struct { | type TempCollector struct { | ||||||
| 	metricCollector | 	metricCollector | ||||||
| 	config struct { | 	config  TempCollectorConfig | ||||||
| 		ExcludeMetrics     []string                     `json:"exclude_metrics"` |  | ||||||
| 		TagOverride        map[string]map[string]string `json:"tag_override"` |  | ||||||
| 		ReportMaxTemp      bool                         `json:"report_max_temperature"` |  | ||||||
| 		ReportCriticalTemp bool                         `json:"report_critical_temperature"` |  | ||||||
| 	} |  | ||||||
| 	sensors []*TempCollectorSensor | 	sensors []*TempCollectorSensor | ||||||
| } | } | ||||||
|  |  | ||||||
|  | // shouldOutput returns true if the metric should be sent. | ||||||
|  | // If OnlyMetrics is set, only metrics in that list are output. | ||||||
|  | // Otherwise, metrics in ExcludeMetrics are skipped. | ||||||
|  | func (m *TempCollector) shouldOutput(metricName string) bool { | ||||||
|  | 	if len(m.config.OnlyMetrics) > 0 { | ||||||
|  | 		for _, name := range m.config.OnlyMetrics { | ||||||
|  | 			if name == metricName { | ||||||
|  | 				return true | ||||||
|  | 			} | ||||||
|  | 		} | ||||||
|  | 		return false | ||||||
|  | 	} | ||||||
|  | 	for _, name := range m.config.ExcludeMetrics { | ||||||
|  | 		if name == metricName { | ||||||
|  | 			return false | ||||||
|  | 		} | ||||||
|  | 	} | ||||||
|  | 	return true | ||||||
|  | } | ||||||
|  |  | ||||||
| func (m *TempCollector) Init(config json.RawMessage) error { | func (m *TempCollector) Init(config json.RawMessage) error { | ||||||
| 	// Check if already initialized |  | ||||||
| 	if m.init { | 	if m.init { | ||||||
| 		return nil | 		return nil | ||||||
| 	} | 	} | ||||||
| @@ -53,8 +75,7 @@ func (m *TempCollector) Init(config json.RawMessage) error { | |||||||
| 	m.parallel = true | 	m.parallel = true | ||||||
| 	m.setup() | 	m.setup() | ||||||
| 	if len(config) > 0 { | 	if len(config) > 0 { | ||||||
| 		err := json.Unmarshal(config, &m.config) | 		if err := json.Unmarshal(config, &m.config); err != nil { | ||||||
| 		if err != nil { |  | ||||||
| 			return err | 			return err | ||||||
| 		} | 		} | ||||||
| 	} | 	} | ||||||
| @@ -81,26 +102,23 @@ func (m *TempCollector) Init(config json.RawMessage) error { | |||||||
| 	for _, file := range inputFiles { | 	for _, file := range inputFiles { | ||||||
| 		sensor := new(TempCollectorSensor) | 		sensor := new(TempCollectorSensor) | ||||||
|  |  | ||||||
| 		// sensor name | 		// Read sensor name from the "name" file | ||||||
| 		nameFile := filepath.Join(filepath.Dir(file), "name") | 		nameFile := filepath.Join(filepath.Dir(file), "name") | ||||||
| 		name, err := os.ReadFile(nameFile) | 		if data, err := os.ReadFile(nameFile); err == nil { | ||||||
| 		if err == nil { | 			sensor.name = strings.TrimSpace(string(data)) | ||||||
| 			sensor.name = strings.TrimSpace(string(name)) |  | ||||||
| 		} | 		} | ||||||
|  |  | ||||||
| 		// sensor label | 		// Read sensor label from the corresponding "_label" file | ||||||
| 		labelFile := strings.TrimSuffix(file, "_input") + "_label" | 		labelFile := strings.TrimSuffix(file, "_input") + "_label" | ||||||
| 		label, err := os.ReadFile(labelFile) | 		if data, err := os.ReadFile(labelFile); err == nil { | ||||||
| 		if err == nil { | 			sensor.label = strings.TrimSpace(string(data)) | ||||||
| 			sensor.label = strings.TrimSpace(string(label)) |  | ||||||
| 		} | 		} | ||||||
|  |  | ||||||
| 		// sensor metric name | 		// Determine sensor metric name | ||||||
| 		switch { | 		switch { | ||||||
| 		case len(sensor.name) == 0 && len(sensor.label) == 0: | 		case len(sensor.name) == 0 && len(sensor.label) == 0: | ||||||
| 			continue | 			continue | ||||||
| 		case sensor.name == "coretemp" && strings.HasPrefix(sensor.label, "Core ") || | 		case sensor.name == "coretemp" && (strings.HasPrefix(sensor.label, "Core ") || strings.HasPrefix(sensor.label, "Package id ")): | ||||||
| 			sensor.name == "coretemp" && strings.HasPrefix(sensor.label, "Package id "): |  | ||||||
| 			sensor.metricName = "temp_" + sensor.label | 			sensor.metricName = "temp_" + sensor.label | ||||||
| 		case len(sensor.name) != 0 && len(sensor.label) != 0: | 		case len(sensor.name) != 0 && len(sensor.label) != 0: | ||||||
| 			sensor.metricName = sensor.name + "_" + sensor.label | 			sensor.metricName = sensor.name + "_" + sensor.label | ||||||
| @@ -111,24 +129,21 @@ func (m *TempCollector) Init(config json.RawMessage) error { | |||||||
| 		} | 		} | ||||||
| 		sensor.metricName = strings.ToLower(sensor.metricName) | 		sensor.metricName = strings.ToLower(sensor.metricName) | ||||||
| 		sensor.metricName = strings.Replace(sensor.metricName, " ", "_", -1) | 		sensor.metricName = strings.Replace(sensor.metricName, " ", "_", -1) | ||||||
| 		// Add temperature prefix, if required |  | ||||||
| 		if !strings.Contains(sensor.metricName, "temp") { | 		if !strings.Contains(sensor.metricName, "temp") { | ||||||
| 			sensor.metricName = "temp_" + sensor.metricName | 			sensor.metricName = "temp_" + sensor.metricName | ||||||
| 		} | 		} | ||||||
|  |  | ||||||
| 		// Sensor file | 		// Verify sensor file exists | ||||||
| 		_, err = os.ReadFile(file) | 		if _, err := os.ReadFile(file); err != nil { | ||||||
| 		if err != nil { |  | ||||||
| 			continue | 			continue | ||||||
| 		} | 		} | ||||||
| 		sensor.file = file | 		sensor.file = file | ||||||
|  |  | ||||||
| 		// Sensor tags | 		// Set default sensor tags | ||||||
| 		sensor.tags = map[string]string{ | 		sensor.tags = map[string]string{ | ||||||
| 			"type": "node", | 			"type": "node", | ||||||
| 		} | 		} | ||||||
|  | 		// Apply tag override configuration if applicable | ||||||
| 		// Apply tag override configuration |  | ||||||
| 		for key, newtags := range m.config.TagOverride { | 		for key, newtags := range m.config.TagOverride { | ||||||
| 			if strings.Contains(sensor.file, key) { | 			if strings.Contains(sensor.file, key) { | ||||||
| 				sensor.tags = newtags | 				sensor.tags = newtags | ||||||
| @@ -136,7 +151,7 @@ func (m *TempCollector) Init(config json.RawMessage) error { | |||||||
| 			} | 			} | ||||||
| 		} | 		} | ||||||
|  |  | ||||||
| 		// max temperature | 		// Read max temperature if enabled | ||||||
| 		if m.config.ReportMaxTemp { | 		if m.config.ReportMaxTemp { | ||||||
| 			maxTempFile := strings.TrimSuffix(file, "_input") + "_max" | 			maxTempFile := strings.TrimSuffix(file, "_input") + "_max" | ||||||
| 			if buffer, err := os.ReadFile(maxTempFile); err == nil { | 			if buffer, err := os.ReadFile(maxTempFile); err == nil { | ||||||
| @@ -147,7 +162,7 @@ func (m *TempCollector) Init(config json.RawMessage) error { | |||||||
| 			} | 			} | ||||||
| 		} | 		} | ||||||
|  |  | ||||||
| 		// critical temperature | 		// Read critical temperature if enabled | ||||||
| 		if m.config.ReportCriticalTemp { | 		if m.config.ReportCriticalTemp { | ||||||
| 			criticalTempFile := strings.TrimSuffix(file, "_input") + "_crit" | 			criticalTempFile := strings.TrimSuffix(file, "_input") + "_crit" | ||||||
| 			if buffer, err := os.ReadFile(criticalTempFile); err == nil { | 			if buffer, err := os.ReadFile(criticalTempFile); err == nil { | ||||||
| @@ -161,75 +176,52 @@ func (m *TempCollector) Init(config json.RawMessage) error { | |||||||
| 		m.sensors = append(m.sensors, sensor) | 		m.sensors = append(m.sensors, sensor) | ||||||
| 	} | 	} | ||||||
|  |  | ||||||
| 	// Empty sensors map |  | ||||||
| 	if len(m.sensors) == 0 { | 	if len(m.sensors) == 0 { | ||||||
| 		return fmt.Errorf("no temperature sensors found") | 		return fmt.Errorf("no temperature sensors found") | ||||||
| 	} | 	} | ||||||
|  |  | ||||||
| 	// Finished initialization |  | ||||||
| 	m.init = true | 	m.init = true | ||||||
| 	return nil | 	return nil | ||||||
| } | } | ||||||
|  |  | ||||||
| func (m *TempCollector) Read(interval time.Duration, output chan lp.CCMessage) { | func (m *TempCollector) Read(interval time.Duration, output chan lp.CCMessage) { | ||||||
|  | 	// For each sensor, read temperature and send metric if allowed. | ||||||
| 	for _, sensor := range m.sensors { | 	for _, sensor := range m.sensors { | ||||||
| 		// Read sensor file | 		// Read sensor file | ||||||
| 		buffer, err := os.ReadFile(sensor.file) | 		buffer, err := os.ReadFile(sensor.file) | ||||||
| 		if err != nil { | 		if err != nil { | ||||||
| 			cclog.ComponentError( | 			cclog.ComponentError(m.name, fmt.Sprintf("Read(): Failed to read file '%s': %v", sensor.file, err)) | ||||||
| 				m.name, |  | ||||||
| 				fmt.Sprintf("Read(): Failed to read file '%s': %v", sensor.file, err)) |  | ||||||
| 			continue | 			continue | ||||||
| 		} | 		} | ||||||
| 		x, err := strconv.ParseInt(strings.TrimSpace(string(buffer)), 10, 64) | 		x, err := strconv.ParseInt(strings.TrimSpace(string(buffer)), 10, 64) | ||||||
| 		if err != nil { | 		if err != nil { | ||||||
| 			cclog.ComponentError( | 			cclog.ComponentError(m.name, fmt.Sprintf("Read(): Failed to convert temperature '%s' to int64: %v", buffer, err)) | ||||||
| 				m.name, |  | ||||||
| 				fmt.Sprintf("Read(): Failed to convert temperature '%s' to int64: %v", buffer, err)) |  | ||||||
| 			continue | 			continue | ||||||
| 		} | 		} | ||||||
| 		x /= 1000 | 		x /= 1000 | ||||||
| 		y, err := lp.NewMessage( | 		if m.shouldOutput(sensor.metricName) { | ||||||
| 			sensor.metricName, | 			y, err := lp.NewMessage(sensor.metricName, sensor.tags, m.meta, map[string]interface{}{"value": x}, time.Now()) | ||||||
| 			sensor.tags, |  | ||||||
| 			m.meta, |  | ||||||
| 			map[string]interface{}{"value": x}, |  | ||||||
| 			time.Now(), |  | ||||||
| 		) |  | ||||||
| 		if err == nil { |  | ||||||
| 			output <- y |  | ||||||
| 		} |  | ||||||
|  |  | ||||||
| 		// max temperature |  | ||||||
| 		if m.config.ReportMaxTemp && sensor.maxTemp != 0 { |  | ||||||
| 			y, err := lp.NewMessage( |  | ||||||
| 				sensor.maxTempName, |  | ||||||
| 				sensor.tags, |  | ||||||
| 				m.meta, |  | ||||||
| 				map[string]interface{}{"value": sensor.maxTemp}, |  | ||||||
| 				time.Now(), |  | ||||||
| 			) |  | ||||||
| 			if err == nil { | 			if err == nil { | ||||||
| 				output <- y | 				output <- y | ||||||
| 			} | 			} | ||||||
| 		} | 		} | ||||||
|  |  | ||||||
| 		// critical temperature | 		// Send max temperature if enabled and available | ||||||
| 		if m.config.ReportCriticalTemp && sensor.critTemp != 0 { | 		if m.config.ReportMaxTemp && sensor.maxTemp != 0 && m.shouldOutput(sensor.maxTempName) { | ||||||
| 			y, err := lp.NewMessage( | 			y, err := lp.NewMessage(sensor.maxTempName, sensor.tags, m.meta, map[string]interface{}{"value": sensor.maxTemp}, time.Now()) | ||||||
| 				sensor.critTempName, | 			if err == nil { | ||||||
| 				sensor.tags, | 				output <- y | ||||||
| 				m.meta, | 			} | ||||||
| 				map[string]interface{}{"value": sensor.critTemp}, | 		} | ||||||
| 				time.Now(), |  | ||||||
| 			) | 		// Send critical temperature if enabled and available | ||||||
|  | 		if m.config.ReportCriticalTemp && sensor.critTemp != 0 && m.shouldOutput(sensor.critTempName) { | ||||||
|  | 			y, err := lp.NewMessage(sensor.critTempName, sensor.tags, m.meta, map[string]interface{}{"value": sensor.critTemp}, time.Now()) | ||||||
| 			if err == nil { | 			if err == nil { | ||||||
| 				output <- y | 				output <- y | ||||||
| 			} | 			} | ||||||
| 		} | 		} | ||||||
| 	} | 	} | ||||||
|  |  | ||||||
| } | } | ||||||
|  |  | ||||||
| func (m *TempCollector) Close() { | func (m *TempCollector) Close() { | ||||||
|   | |||||||
| @@ -1,22 +1,35 @@ | |||||||
|  | ## tempstat collector | ||||||
|  |  | ||||||
| ## `tempstat` collector | ```json | ||||||
|  |  | ||||||
| ```json |  | ||||||
|   "tempstat": { |   "tempstat": { | ||||||
|     "tag_override" : { |     "tag_override": { | ||||||
|         "<device like hwmon1>" : { |         "<device identifier>": { | ||||||
|             "type" : "socket", |             "type": "socket", | ||||||
|             "type-id" : "0" |             "type-id": "0" | ||||||
|         } |         } | ||||||
|     }, |     }, | ||||||
|     "exclude_metrics": [ |     "exclude_metrics": [ | ||||||
|       "metric1", |       "metric1", | ||||||
|       "metric2" |       "metric2" | ||||||
|     ] |     ], | ||||||
|  |     "only_metrics": [ | ||||||
|  |       "temp_core_0", | ||||||
|  |       "temp_core_1" | ||||||
|  |     ], | ||||||
|  |     "report_max_temperature": true, | ||||||
|  |     "report_critical_temperature": true | ||||||
|   } |   } | ||||||
| ``` | ``` | ||||||
|  |  | ||||||
| The `tempstat` collector reads the data from `/sys/class/hwmon/<device>/tempX_{input,label}` | The `tempstat` collector reads the data from `/sys/class/hwmon/<device>/tempX_{input,label}`. | ||||||
|  |  | ||||||
|  | Both filtering mechanisms are supported: | ||||||
|  | - `exclude_metrics`: Excludes the specified metrics. | ||||||
|  | - `only_metrics`: If provided, only the listed metrics are collected. This takes precedence over `exclude_metrics`. | ||||||
|  |  | ||||||
| Metrics: | Metrics: | ||||||
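| * `temp_*`: The metric name is taken from the `label` files. | - `temp_*`: The metric name is derived from the sensor's `name` and/or `label` files, converted to lowercase with spaces replaced by underscores; `temp_` is prepended if the name does not already contain `temp`. | ||||||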
|  |  | ||||||
|  | Optional additional metrics: | ||||||
|  | - **Max Temperature:** If `report_max_temperature` is enabled, the collector also reads the maximum temperature from the corresponding `_max` file. The metric name is derived by replacing "temp" with "max_temp" in the sensor's metric name. | ||||||
|  | - **Critical Temperature:** If `report_critical_temperature` is enabled, the collector also reads the critical temperature from the corresponding `_crit` file. The metric name is derived by replacing "temp" with "crit_temp" in the sensor's metric name. | ||||||
|   | |||||||
		Reference in New Issue
	
	Block a user