Temp collector from sysfs hwmon (#8)

* Add collector for hwmon temperature

* Comment out local tag overrides

* Add temperature collector to README

* Update temperature collector with own config parser
This commit is contained in:
Thomas Gruber 2021-11-25 18:19:09 +01:00 committed by GitHub
parent 1e7a75598e
commit 3997984714
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
4 changed files with 125 additions and 103 deletions

View File

@ -40,6 +40,7 @@ The base class/configuration is located in `metricCollector.go`.
* `cpustatMetric.go`: Read CPU specific values from `/proc/stat` * `cpustatMetric.go`: Read CPU specific values from `/proc/stat`
* `topprocsMetric.go`: Reads the Top5 processes by their CPU usage * `topprocsMetric.go`: Reads the Top5 processes by their CPU usage
* `nvidiaMetric.go`: Read data about Nvidia GPUs using the NVML library * `nvidiaMetric.go`: Read data about Nvidia GPUs using the NVML library
* `tempMetric.go`: Read temperature data from `/sys/class/hwmon/hwmon*`
* `ipmiMetric.go`: Collect data from `ipmitool` or as fallback `ipmi-sensors` * `ipmiMetric.go`: Collect data from `ipmitool` or as fallback `ipmi-sensors`
If any of the collectors cannot be initialized, it is excluded from all further reads. Like if the Lustre stat file is not a valid path, no Lustre specific metrics will be recorded. If any of the collectors cannot be initialized, it is excluded from all further reads. Like if the Lustre stat file is not a valid path, no Lustre specific metrics will be recorded.
@ -50,6 +51,9 @@ The InfiniBand collector requires the LID file to read the data. It has to be co
# Lustre collector # Lustre collector
The Lustre collector requires the path to the Lustre stats file. It has to be configured in the collector itself (`LUSTREFILE` in `lustreMetric.go`) The Lustre collector requires the path to the Lustre stats file. It has to be configured in the collector itself (`LUSTREFILE` in `lustreMetric.go`)
# Temperature collector
The temperature collector has an optional configuration. On multi-socket systems there are multiple hwmon devices, one for each CPU socket, but there is no field that identifies which hwmon device corresponds to which CPU socket. Once you have determined the mapping, you can add it to the `tag_override` map in the collector's configuration.
# LIKWID collector # LIKWID collector
The `likwidMetric.go` requires preparation steps. For this, the `Makefile` can be used. The `likwidMetric.go` requires preparation steps. For this, the `Makefile` can be used.

110
collectors/tempMetric.go Normal file
View File

@ -0,0 +1,110 @@
package collectors
import (
"fmt"
lp "github.com/influxdata/line-protocol"
"io/ioutil"
"os"
"path/filepath"
"strconv"
"strings"
"time"
"encoding/json"
)
// HWMON_PATH is the sysfs directory that get_hwmon_sensors walks to find
// hwmon devices and their temperature input files.
const HWMON_PATH = `/sys/class/hwmon`
// TempCollectorConfig is the JSON configuration accepted by TempCollector.Init.
type TempCollectorConfig struct {
// ExcludeMetrics is filled from the "exclude_metrics" key.
// NOTE(review): nothing in this file reads this field yet; confirm whether
// metric exclusion is still to be implemented.
ExcludeMetrics []string `json:"exclude_metrics"`
// TagOverride maps a substring of a sensor input path (for example "hwmon0")
// to the tag set that replaces the default tags of every sensor whose path
// contains that substring.
TagOverride map[string]map[string]string `json:"tag_override"`
}
// TempCollector reads the temperature sensors found below HWMON_PATH and
// appends one metric per sensor on every Read.
type TempCollector struct {
// MetricCollector is the embedded base collector; it supplies the name and
// init fields and the setup method used by this type's methods.
MetricCollector
// config holds the settings parsed from JSON in Init.
config TempCollectorConfig
}
// Init prepares the collector and parses its optional JSON configuration.
//
// config may be empty, in which case the collector keeps its current
// (normally zero-value) configuration. A non-empty config is decoded into a
// TempCollectorConfig.
//
// The collector is marked as initialized only after the configuration has
// been accepted. If decoding fails, the error from json.Unmarshal is returned,
// the previous configuration is left untouched, and the collector stays
// uninitialized.
func (m *TempCollector) Init(config []byte) error {
	m.name = "TempCollector"
	m.setup()
	// Stay uninitialized until the configuration has been validated, even
	// when Init is called again on an already initialized collector.
	m.init = false
	if len(config) > 0 {
		// Decode into a local value so that a failed parse cannot leave
		// m.config half-populated.
		var parsed TempCollectorConfig
		if err := json.Unmarshal(config, &parsed); err != nil {
			return err
		}
		m.config = parsed
	}
	m.init = true
	return nil
}
// get_hwmon_sensors discovers the labelled temperature inputs below
// HWMON_PATH.
//
// The result is keyed by the path of each non-directory entry found while
// walking HWMON_PATH. Each value maps a sensor label (the content of the
// tempN_label file with newlines removed) to the path of the matching
// tempN_input file. Inputs whose label file cannot be read are skipped.
//
// An error is returned only when HWMON_PATH itself cannot be walked; entries
// below it that cannot be accessed are silently skipped.
func get_hwmon_sensors() (map[string]map[string]string, error) {
	sensors := make(map[string]map[string]string)
	var folders []string

	err := filepath.Walk(HWMON_PATH, func(p string, info os.FileInfo, err error) error {
		if err != nil {
			// info may be nil when err is set, so it must not be touched.
			// A failure on the root is fatal, any other entry is skipped.
			if p == HWMON_PATH {
				return err
			}
			return nil
		}
		if !info.IsDir() {
			folders = append(folders, p)
		}
		return nil
	})
	if err != nil {
		return sensors, err
	}

	for _, f := range folders {
		inputs := make(map[string]string)
		sensors[f] = inputs
		// The trailing slash is kept on purpose (filepath.Join would strip
		// it) — presumably so that a symlinked hwmon entry is resolved and
		// the walk descends into the device directory; TODO confirm.
		// The callback never returns an error, so Walk cannot fail here.
		_ = filepath.Walk(f+"/", func(path string, info os.FileInfo, err error) error {
			if err != nil {
				// Unreadable entry: skip it and keep walking.
				return nil
			}
			fname := filepath.Base(path)
			if !strings.HasPrefix(fname, "temp") || !strings.HasSuffix(fname, "_input") {
				return nil
			}
			labelFile := filepath.Join(filepath.Dir(path), strings.TrimSuffix(fname, "_input")+"_label")
			label, rerr := ioutil.ReadFile(labelFile)
			if rerr != nil {
				// No readable label: the sensor has no name to report under.
				return nil
			}
			inputs[strings.Replace(string(label), "\n", "", -1)] = path
			return nil
		})
	}
	return sensors, nil
}
// Read collects one metric per labelled temperature sensor and appends it to
// out.
//
// The sensor list is rediscovered on every call via get_hwmon_sensors. Each
// metric is named after the lower-cased sensor label and carries a single
// "value" field holding the raw reading divided by 1000. Sensors that cannot
// be read or parsed are skipped silently, and nothing is appended when sensor
// discovery fails. The interval argument is not used.
//
// NOTE(review): config.ExcludeMetrics is not consulted here — confirm whether
// exclusion should apply.
func (m *TempCollector) Read(interval time.Duration, out *[]lp.MutableMetric) {
	sensors, err := get_hwmon_sensors()
	if err != nil {
		return
	}
	for _, inputs := range sensors {
		for label, inputPath := range inputs {
			// Default tags, replaced wholesale by the first override whose
			// key occurs anywhere in the input path.
			// NOTE(review): this is a substring match over a map, so a key
			// such as "hwmon1" also matches ".../hwmon10/..." and the
			// winner depends on map iteration order.
			tags := map[string]string{"type": "node"}
			for pattern, override := range m.config.TagOverride {
				if strings.Contains(inputPath, pattern) {
					tags = override
					break
				}
			}

			raw, err := ioutil.ReadFile(string(inputPath))
			if err != nil {
				continue
			}
			text := strings.Replace(string(raw), "\n", "", -1)
			reading, err := strconv.ParseInt(text, 0, 64)
			if err != nil {
				continue
			}

			// Presumably the file holds millidegrees, hence the division by
			// 1000 — TODO confirm against the hwmon sysfs documentation.
			fields := map[string]interface{}{"value": float64(reading) / 1000}
			metric, err := lp.New(strings.ToLower(label), tags, fields, time.Now())
			if err != nil {
				continue
			}
			*out = append(*out, metric)
		}
	}
}
// Close marks the collector as no longer initialized. It holds no other
// resources that need releasing.
func (m *TempCollector) Close() {
	m.init = false
}

View File

@ -11,16 +11,7 @@
"interval": 3, "interval": 3,
"duration": 1, "duration": 1,
"collectors": [ "collectors": [
"loadavg", "tempstat"
"likwid",
"memstat",
"netstat",
"ibstat",
"lustrestat",
"cpustat",
"topprocs",
"nvidia",
"diskstat"
], ],
"default_tags": { "default_tags": {
"cluster": "testcluster" "cluster": "testcluster"
@ -29,101 +20,17 @@
"type": "none" "type": "none"
}, },
"collect_config": { "collect_config": {
"netstat": { "tempstat": {
"exclude_devices": [ "tag_override": {
"enp195s0f1", "hwmon0" : {
"lo" "type" : "socket",
] "type-id" : "0"
}, },
"ibstat": { "hwmon1" : {
"exclude_devices": [ "type" : "socket",
"mlx5_0", "type-id" : "1"
"mlx5_1" }
] }
},
"likwid": {
"accessmode" : "accessdaemon",
"daemon_path" : "/apps/likwid/5.2.0/sbin",
"eventsets": [
{
"events": {
"FIXC1": "ACTUAL_CPU_CLOCK",
"FIXC2": "MAX_CPU_CLOCK",
"PMC0": "RETIRED_INSTRUCTIONS",
"PMC1": "CPU_CLOCKS_UNHALTED",
"PMC2": "RETIRED_SSE_AVX_FLOPS_ALL",
"PMC3": "MERGE",
"DFC0": "DRAM_CHANNEL_0",
"DFC1": "DRAM_CHANNEL_1",
"DFC2": "DRAM_CHANNEL_2",
"DFC3": "DRAM_CHANNEL_3"
},
"metrics": [
{
"name": "ipc",
"calc": "PMC0/PMC1",
"socket_scope": false,
"publish": true
},
{
"name": "flops_any",
"calc": "0.000001*PMC2/time",
"socket_scope": false,
"publish": true
},
{
"name": "clock_mhz",
"calc": "0.000001*(FIXC1/FIXC2)/inverseClock",
"socket_scope": false,
"publish": true
},
{
"name": "mem1",
"calc": "0.000001*(DFC0+DFC1+DFC2+DFC3)*64.0/time",
"socket_scope": true,
"publish": false
}
]
},
{
"events": {
"DFC0": "DRAM_CHANNEL_4",
"DFC1": "DRAM_CHANNEL_5",
"DFC2": "DRAM_CHANNEL_6",
"DFC3": "DRAM_CHANNEL_7",
"PWR0": "RAPL_CORE_ENERGY",
"PWR1": "RAPL_PKG_ENERGY"
},
"metrics": [
{
"name": "pwr_core",
"calc": "PWR0/time",
"socket_scope": false,
"publish": true
},
{
"name": "pwr_pkg",
"calc": "PWR1/time",
"socket_scope": true,
"publish": true
},
{
"name": "mem2",
"calc": "0.000001*(DFC0+DFC1+DFC2+DFC3)*64.0/time",
"socket_scope": true,
"publish": false
}
]
}
],
"globalmetrics": [
{
"name": "mem_bw",
"calc": "mem1+mem2",
"socket_scope": true,
"publish": true
}
]
} }
} }
} }

View File

@ -30,6 +30,7 @@ var Collectors = map[string]collectors.MetricGetter{
"nvidia": &collectors.NvidiaCollector{}, "nvidia": &collectors.NvidiaCollector{},
"customcmd": &collectors.CustomCmdCollector{}, "customcmd": &collectors.CustomCmdCollector{},
"diskstat": &collectors.DiskstatCollector{}, "diskstat": &collectors.DiskstatCollector{},
"tempstat": &collectors.TempCollector{},
"ipmistat" : &collectors.IpmiCollector{}, "ipmistat" : &collectors.IpmiCollector{},
} }