diff --git a/collectors/README.md b/collectors/README.md index b2f1837..d43d67d 100644 --- a/collectors/README.md +++ b/collectors/README.md @@ -40,6 +40,7 @@ The base class/configuration is located in `metricCollector.go`. * `cpustatMetric.go`: Read CPU specific values from `/proc/stat` * `topprocsMetric.go`: Reads the Top5 processes by their CPU usage * `nvidiaMetric.go`: Read data about Nvidia GPUs using the NVML library +* `tempMetric.go`: Read temperature data from `/sys/class/hwmon/hwmon*` * `ipmiMetric.go`: Collect data from `ipmitool` or as fallback `ipmi-sensors` If any of the collectors cannot be initialized, it is excluded from all further reads. Like if the Lustre stat file is not a valid path, no Lustre specific metrics will be recorded. @@ -50,6 +51,9 @@ The InfiniBand collector requires the LID file to read the data. It has to be co # Lustre collector The Lustre collector requires the path to the Lustre stats file. It has to be configured in the collector itself (`LUSTREFILE` in `lustreMetric.go`) +# Temperature collector +Configuring the temperature collector is optional. On multi-socket systems there are multiple hwmon devices, one for each CPU socket, but there is no field that identifies which hwmon device corresponds to which CPU socket. After you have determined the mapping, you can add it to the `tag_override` map in the collector's configuration. + # LIKWID collector The `likwidMetric.go` requires preparation steps. For this, the `Makefile` can be used.
diff --git a/collectors/tempMetric.go b/collectors/tempMetric.go
new file mode 100644
index 0000000..adf0e8c
--- /dev/null
+++ b/collectors/tempMetric.go
@@ -0,0 +1,138 @@
+package collectors
+
+import (
+	"encoding/json"
+	"fmt"
+	"io/ioutil"
+	"path/filepath"
+	"strconv"
+	"strings"
+	"time"
+
+	lp "github.com/influxdata/line-protocol"
+)
+
+// HWMON_PATH is the sysfs directory that lists the hardware monitoring devices.
+const HWMON_PATH = `/sys/class/hwmon`
+
+// TempCollectorConfig is the JSON configuration of the TempCollector.
+type TempCollectorConfig struct {
+	// ExcludeMetrics lists metric names (lower-cased sensor labels) that are
+	// not emitted.
+	ExcludeMetrics []string `json:"exclude_metrics"`
+	// TagOverride maps a hwmon device name (e.g. "hwmon0") to the tags that
+	// replace the default tags for all sensors of that device.
+	TagOverride map[string]map[string]string `json:"tag_override"`
+}
+
+// TempCollector reads the labelled temperature sensors below HWMON_PATH.
+type TempCollector struct {
+	MetricCollector
+	config TempCollectorConfig
+}
+
+// Init sets up the collector and parses the optional JSON configuration. The
+// collector is only marked as initialized once the configuration is valid.
+func (m *TempCollector) Init(config []byte) error {
+	m.name = "TempCollector"
+	m.setup()
+	if len(config) > 0 {
+		if err := json.Unmarshal(config, &m.config); err != nil {
+			return fmt.Errorf("parsing %s configuration: %w", m.name, err)
+		}
+	}
+	m.init = true
+	return nil
+}
+
+// get_hwmon_sensors discovers the labelled temperature sensors of all hwmon
+// devices. The result maps each device directory below HWMON_PATH to a map
+// from sensor label to the temp*_input file that holds the sensor value.
+// Sensors without a readable, non-empty temp*_label file are skipped.
+func get_hwmon_sensors() (map[string]map[string]string, error) {
+	// List the directory instead of walking it: a missing or unreadable
+	// HWMON_PATH is reported as an error rather than as a nil FileInfo.
+	entries, err := ioutil.ReadDir(HWMON_PATH)
+	if err != nil {
+		return nil, fmt.Errorf("listing %s: %w", HWMON_PATH, err)
+	}
+
+	sensors := make(map[string]map[string]string, len(entries))
+	for _, entry := range entries {
+		dev := filepath.Join(HWMON_PATH, entry.Name())
+		// Glob resolves symlinked device directories and only fails for a
+		// malformed pattern.
+		inputs, err := filepath.Glob(filepath.Join(dev, "temp*_input"))
+		if err != nil {
+			continue
+		}
+		sensors[dev] = make(map[string]string, len(inputs))
+		for _, input := range inputs {
+			label, err := ioutil.ReadFile(strings.TrimSuffix(input, "_input") + "_label")
+			if err != nil {
+				continue
+			}
+			if name := strings.TrimSpace(string(label)); name != "" {
+				sensors[dev][name] = input
+			}
+		}
+	}
+	return sensors, nil
+}
+
+// isExcluded reports whether the metric name is listed in exclude_metrics.
+func (m *TempCollector) isExcluded(name string) bool {
+	for _, excluded := range m.config.ExcludeMetrics {
+		if excluded == name {
+			return true
+		}
+	}
+	return false
+}
+
+// Read appends one metric per labelled temperature sensor to out. The metric
+// name is the lower-cased sensor label and the value is the sensor reading
+// divided by 1000. Sensors that cannot be read or parsed are skipped.
+func (m *TempCollector) Read(interval time.Duration, out *[]lp.MutableMetric) {
+	if !m.init {
+		return
+	}
+	sensors, err := get_hwmon_sensors()
+	if err != nil {
+		return
+	}
+
+	now := time.Now()
+	for dev, files := range sensors {
+		tags := map[string]string{"type": "node"}
+		// Match the device name exactly: a substring match would apply the
+		// "hwmon1" override to hwmon10, hwmon11, ... as well.
+		if override, ok := m.config.TagOverride[filepath.Base(dev)]; ok {
+			tags = override
+		}
+		for label, file := range files {
+			name := strings.ToLower(label)
+			if m.isExcluded(name) {
+				continue
+			}
+			buffer, err := ioutil.ReadFile(file)
+			if err != nil {
+				continue
+			}
+			// hwmon reports temperatures in millidegrees Celsius.
+			x, err := strconv.ParseInt(strings.TrimSpace(string(buffer)), 10, 64)
+			if err != nil {
+				continue
+			}
+			y, err := lp.New(name, tags, map[string]interface{}{"value": float64(x) / 1000}, now)
+			if err == nil {
+				*out = append(*out, y)
+			}
+		}
+	}
+}
+
+// Close marks the collector as uninitialized.
+func (m *TempCollector) Close() {
+	m.init = false
+}
diff --git a/config.json b/config.json index 17285c2..4a7fd89 100644 --- a/config.json +++ b/config.json @@ -11,16 +11,7 @@ "interval": 3, "duration": 1, "collectors": [ - "loadavg", - "likwid", - "memstat", - "netstat", - "ibstat", - "lustrestat", - "cpustat", - "topprocs", - "nvidia", - "diskstat" + "tempstat" ], "default_tags": { "cluster": "testcluster" @@ -29,101 +20,17 @@ "type": "none" }, "collect_config": { - "netstat": { - "exclude_devices": [ - "enp195s0f1", - "lo" - ] - }, - "ibstat": { - "exclude_devices": [ - "mlx5_0", - "mlx5_1" - ] - }, - "likwid": { - "accessmode" : "accessdaemon", - "daemon_path" : "/apps/likwid/5.2.0/sbin", - "eventsets": [ - { - "events": { - "FIXC1": "ACTUAL_CPU_CLOCK", - "FIXC2": "MAX_CPU_CLOCK", - "PMC0": "RETIRED_INSTRUCTIONS", - "PMC1": "CPU_CLOCKS_UNHALTED", - "PMC2": "RETIRED_SSE_AVX_FLOPS_ALL", - "PMC3": "MERGE", - "DFC0": "DRAM_CHANNEL_0", - "DFC1": "DRAM_CHANNEL_1", - "DFC2": "DRAM_CHANNEL_2", - "DFC3": "DRAM_CHANNEL_3" - }, - "metrics": [ - { - "name": "ipc", - "calc": "PMC0/PMC1", - "socket_scope": false, - "publish": true - }, - { - "name": "flops_any", - "calc": "0.000001*PMC2/time", - "socket_scope": false, - "publish": true - }, - { - "name": "clock_mhz", - "calc": "0.000001*(FIXC1/FIXC2)/inverseClock", - "socket_scope": false, - "publish": true - }, - { - "name":
"mem1", - "calc": "0.000001*(DFC0+DFC1+DFC2+DFC3)*64.0/time", - "socket_scope": true, - "publish": false - } - ] + "tempstat": { + "tag_override": { + "hwmon0" : { + "type" : "socket", + "type-id" : "0" }, - { - "events": { - "DFC0": "DRAM_CHANNEL_4", - "DFC1": "DRAM_CHANNEL_5", - "DFC2": "DRAM_CHANNEL_6", - "DFC3": "DRAM_CHANNEL_7", - "PWR0": "RAPL_CORE_ENERGY", - "PWR1": "RAPL_PKG_ENERGY" - }, - "metrics": [ - { - "name": "pwr_core", - "calc": "PWR0/time", - "socket_scope": false, - "publish": true - }, - { - "name": "pwr_pkg", - "calc": "PWR1/time", - "socket_scope": true, - "publish": true - }, - { - "name": "mem2", - "calc": "0.000001*(DFC0+DFC1+DFC2+DFC3)*64.0/time", - "socket_scope": true, - "publish": false - } - ] + "hwmon1" : { + "type" : "socket", + "type-id" : "1" } - ], - "globalmetrics": [ - { - "name": "mem_bw", - "calc": "mem1+mem2", - "socket_scope": true, - "publish": true - } - ] + } } } } diff --git a/metric-collector.go b/metric-collector.go index b8449b7..bd6faeb 100644 --- a/metric-collector.go +++ b/metric-collector.go @@ -30,6 +30,7 @@ var Collectors = map[string]collectors.MetricGetter{ "nvidia": &collectors.NvidiaCollector{}, "customcmd": &collectors.CustomCmdCollector{}, "diskstat": &collectors.DiskstatCollector{}, + "tempstat": &collectors.TempCollector{}, "ipmistat" : &collectors.IpmiCollector{}, }