mirror of
https://github.com/ClusterCockpit/cc-metric-collector.git
synced 2025-01-24 21:09:06 +01:00
Temp collector from sysfs hwmon (#8)
* Add collector for hwmon temperature * Comment out local tag overrides * Add temperature collector to README * Update temperature collector with own config parser
This commit is contained in:
parent
1e7a75598e
commit
3997984714
@ -40,6 +40,7 @@ The base class/configuration is located in `metricCollector.go`.
|
||||
* `cpustatMetric.go`: Read CPU specific values from `/proc/stat`
|
||||
* `topprocsMetric.go`: Reads the Top5 processes by their CPU usage
|
||||
* `nvidiaMetric.go`: Read data about Nvidia GPUs using the NVML library
|
||||
* `tempMetric.go`: Read temperature data from `/sys/class/hwmon/hwmon*`
|
||||
* `ipmiMetric.go`: Collect data from `ipmitool` or as fallback `ipmi-sensors`
|
||||
|
||||
If a collector cannot be initialized, it is excluded from all further reads. For example, if the Lustre stats file is not a valid path, no Lustre-specific metrics will be recorded.
|
||||
@ -50,6 +51,9 @@ The InfiniBand collector requires the LID file to read the data. It has to be co
|
||||
# Lustre collector
|
||||
The Lustre collector requires the path to the Lustre stats file. It has to be configured in the collector itself (`LUSTREFILE` in `lustreMetric.go`)
|
||||
|
||||
# Temperature collector
|
||||
The temperature collector has one optional configuration setting. On multi-socket systems there are multiple hwmon devices, one for each CPU socket, but there is no field that tells which hwmon device corresponds to which CPU socket. After you have determined the mapping, you can add it to the `tag_override` map in the collector.
|
||||
|
||||
# LIKWID collector
|
||||
The `likwidMetric.go` requires preparation steps. For this, the `Makefile` can be used.
|
||||
|
||||
|
110
collectors/tempMetric.go
Normal file
110
collectors/tempMetric.go
Normal file
@ -0,0 +1,110 @@
|
||||
package collectors
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
lp "github.com/influxdata/line-protocol"
|
||||
"io/ioutil"
|
||||
"os"
|
||||
"path/filepath"
|
||||
"strconv"
|
||||
"strings"
|
||||
"time"
|
||||
"encoding/json"
|
||||
)
|
||||
|
||||
// HWMON_PATH is the sysfs directory that get_hwmon_sensors walks to discover
// hwmon devices and their temperature sensor files.
const HWMON_PATH = `/sys/class/hwmon`
|
||||
|
||||
|
||||
// TempCollectorConfig is the JSON configuration accepted by TempCollector.Init.
type TempCollectorConfig struct {
	// ExcludeMetrics is read from the "exclude_metrics" key; it is meant to
	// name metrics that should not be reported.
	ExcludeMetrics []string `json:"exclude_metrics"`
	// TagOverride is read from the "tag_override" key. Each key is matched
	// against the path of a sensor file (e.g. "hwmon0"); the associated map is
	// the tag set used for matching sensors instead of the default tags.
	TagOverride map[string]map[string]string `json:"tag_override"`
}
|
||||
|
||||
type TempCollector struct {
|
||||
MetricCollector
|
||||
config TempCollectorConfig
|
||||
}
|
||||
|
||||
func (m *TempCollector) Init(config []byte) error {
|
||||
m.name = "TempCollector"
|
||||
m.setup()
|
||||
m.init = true
|
||||
if len(config) > 0 {
|
||||
err := json.Unmarshal(config, &m.config)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
func get_hwmon_sensors() (map[string]map[string]string, error) {
|
||||
var folders []string
|
||||
var sensors map[string]map[string]string
|
||||
sensors = make(map[string]map[string]string)
|
||||
err := filepath.Walk(HWMON_PATH, func(p string, info os.FileInfo, err error) error {
|
||||
if info.IsDir() {
|
||||
return nil
|
||||
}
|
||||
folders = append(folders, p)
|
||||
return nil
|
||||
})
|
||||
if err != nil {
|
||||
return sensors, err
|
||||
}
|
||||
|
||||
for _, f := range folders {
|
||||
sensors[f] = make(map[string]string)
|
||||
myp := fmt.Sprintf("%s/", f)
|
||||
err := filepath.Walk(myp, func(path string, info os.FileInfo, err error) error {
|
||||
dir, fname := filepath.Split(path)
|
||||
if strings.Contains(fname, "temp") && strings.Contains(fname, "_input") {
|
||||
namefile := fmt.Sprintf("%s/%s", dir, strings.Replace(fname, "_input", "_label", -1))
|
||||
name, ierr := ioutil.ReadFile(namefile)
|
||||
if ierr == nil {
|
||||
sensors[f][strings.Replace(string(name), "\n", "", -1)] = path
|
||||
}
|
||||
}
|
||||
return nil
|
||||
})
|
||||
if err != nil {
|
||||
continue
|
||||
}
|
||||
}
|
||||
return sensors, nil
|
||||
}
|
||||
|
||||
func (m *TempCollector) Read(interval time.Duration, out *[]lp.MutableMetric) {
|
||||
|
||||
sensors, err := get_hwmon_sensors()
|
||||
if err != nil {
|
||||
return
|
||||
}
|
||||
for _, files := range sensors {
|
||||
for name, file := range files {
|
||||
tags := map[string]string{"type": "node"}
|
||||
for key, newtags := range m.config.TagOverride {
|
||||
if strings.Contains(file, key) {
|
||||
tags = newtags
|
||||
break
|
||||
}
|
||||
}
|
||||
buffer, err := ioutil.ReadFile(string(file))
|
||||
if err != nil {
|
||||
continue
|
||||
}
|
||||
x, err := strconv.ParseInt(strings.Replace(string(buffer), "\n", "", -1), 0, 64)
|
||||
if err == nil {
|
||||
y, err := lp.New(strings.ToLower(name), tags, map[string]interface{}{"value": float64(x) / 1000}, time.Now())
|
||||
if err == nil {
|
||||
*out = append(*out, y)
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func (m *TempCollector) Close() {
|
||||
m.init = false
|
||||
return
|
||||
}
|
113
config.json
113
config.json
@ -11,16 +11,7 @@
|
||||
"interval": 3,
|
||||
"duration": 1,
|
||||
"collectors": [
|
||||
"loadavg",
|
||||
"likwid",
|
||||
"memstat",
|
||||
"netstat",
|
||||
"ibstat",
|
||||
"lustrestat",
|
||||
"cpustat",
|
||||
"topprocs",
|
||||
"nvidia",
|
||||
"diskstat"
|
||||
"tempstat"
|
||||
],
|
||||
"default_tags": {
|
||||
"cluster": "testcluster"
|
||||
@ -29,101 +20,17 @@
|
||||
"type": "none"
|
||||
},
|
||||
"collect_config": {
|
||||
"netstat": {
|
||||
"exclude_devices": [
|
||||
"enp195s0f1",
|
||||
"lo"
|
||||
]
|
||||
},
|
||||
"ibstat": {
|
||||
"exclude_devices": [
|
||||
"mlx5_0",
|
||||
"mlx5_1"
|
||||
]
|
||||
},
|
||||
"likwid": {
|
||||
"accessmode" : "accessdaemon",
|
||||
"daemon_path" : "/apps/likwid/5.2.0/sbin",
|
||||
"eventsets": [
|
||||
{
|
||||
"events": {
|
||||
"FIXC1": "ACTUAL_CPU_CLOCK",
|
||||
"FIXC2": "MAX_CPU_CLOCK",
|
||||
"PMC0": "RETIRED_INSTRUCTIONS",
|
||||
"PMC1": "CPU_CLOCKS_UNHALTED",
|
||||
"PMC2": "RETIRED_SSE_AVX_FLOPS_ALL",
|
||||
"PMC3": "MERGE",
|
||||
"DFC0": "DRAM_CHANNEL_0",
|
||||
"DFC1": "DRAM_CHANNEL_1",
|
||||
"DFC2": "DRAM_CHANNEL_2",
|
||||
"DFC3": "DRAM_CHANNEL_3"
|
||||
},
|
||||
"metrics": [
|
||||
{
|
||||
"name": "ipc",
|
||||
"calc": "PMC0/PMC1",
|
||||
"socket_scope": false,
|
||||
"publish": true
|
||||
},
|
||||
{
|
||||
"name": "flops_any",
|
||||
"calc": "0.000001*PMC2/time",
|
||||
"socket_scope": false,
|
||||
"publish": true
|
||||
},
|
||||
{
|
||||
"name": "clock_mhz",
|
||||
"calc": "0.000001*(FIXC1/FIXC2)/inverseClock",
|
||||
"socket_scope": false,
|
||||
"publish": true
|
||||
},
|
||||
{
|
||||
"name": "mem1",
|
||||
"calc": "0.000001*(DFC0+DFC1+DFC2+DFC3)*64.0/time",
|
||||
"socket_scope": true,
|
||||
"publish": false
|
||||
}
|
||||
]
|
||||
"tempstat": {
|
||||
"tag_override": {
|
||||
"hwmon0" : {
|
||||
"type" : "socket",
|
||||
"type-id" : "0"
|
||||
},
|
||||
{
|
||||
"events": {
|
||||
"DFC0": "DRAM_CHANNEL_4",
|
||||
"DFC1": "DRAM_CHANNEL_5",
|
||||
"DFC2": "DRAM_CHANNEL_6",
|
||||
"DFC3": "DRAM_CHANNEL_7",
|
||||
"PWR0": "RAPL_CORE_ENERGY",
|
||||
"PWR1": "RAPL_PKG_ENERGY"
|
||||
},
|
||||
"metrics": [
|
||||
{
|
||||
"name": "pwr_core",
|
||||
"calc": "PWR0/time",
|
||||
"socket_scope": false,
|
||||
"publish": true
|
||||
},
|
||||
{
|
||||
"name": "pwr_pkg",
|
||||
"calc": "PWR1/time",
|
||||
"socket_scope": true,
|
||||
"publish": true
|
||||
},
|
||||
{
|
||||
"name": "mem2",
|
||||
"calc": "0.000001*(DFC0+DFC1+DFC2+DFC3)*64.0/time",
|
||||
"socket_scope": true,
|
||||
"publish": false
|
||||
}
|
||||
]
|
||||
"hwmon1" : {
|
||||
"type" : "socket",
|
||||
"type-id" : "1"
|
||||
}
|
||||
],
|
||||
"globalmetrics": [
|
||||
{
|
||||
"name": "mem_bw",
|
||||
"calc": "mem1+mem2",
|
||||
"socket_scope": true,
|
||||
"publish": true
|
||||
}
|
||||
]
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@ -30,6 +30,7 @@ var Collectors = map[string]collectors.MetricGetter{
|
||||
"nvidia": &collectors.NvidiaCollector{},
|
||||
"customcmd": &collectors.CustomCmdCollector{},
|
||||
"diskstat": &collectors.DiskstatCollector{},
|
||||
"tempstat": &collectors.TempCollector{},
|
||||
"ipmistat" : &collectors.IpmiCollector{},
|
||||
}
|
||||
|
||||
|
Loading…
Reference in New Issue
Block a user