Compare commits

..

13 Commits

Author SHA1 Message Date
Thomas Gruber
6f746de084 Merge branch 'develop' into smartmon_collector 2022-10-10 12:11:20 +02:00
Thomas Gruber
3d5b28e5aa Update smartmonMetric.go 2022-10-10 12:10:22 +02:00
Thomas Gruber
4bd71224df move maybe-usable-by-other-cc-components to pkg. Fix all files to use the new paths (#88) 2022-10-10 11:53:11 +02:00
Thomas Gruber
e2438f8cec Merge branch 'develop' into smartmon_collector 2022-10-09 17:35:47 +02:00
Thomas Roehl
6bf3bfd10a Use lower case for error strings in RocmSmiCollector 2022-10-09 17:05:49 +02:00
Thomas Gruber
0fbff00996 Replace ioutils with os and io (#87) 2022-10-09 17:03:38 +02:00
Thomas Roehl
8849824ba9 Remove useless prints from MemstatCollector 2022-10-09 02:56:15 +02:00
Thomas Roehl
ed511b7c09 Fix memstat collector with numa_stats option 2022-09-28 15:09:36 +02:00
Thomas Roehl
a0acf01dc3 Build DEB package for Ubuntu 20.04 for releases 2022-09-28 12:19:36 +02:00
Thomas Roehl
58461f1f72 Fix clock frequency coming from LikwidCollector and update docs 2022-09-09 20:01:21 +02:00
Thomas Röhl
c09d8fb118 InfiniBandCollector: Scale raw readings from octets to bytes 2022-09-09 19:27:20 +02:00
Thomas Roehl
d3e9f91ad2 Add SmartMonCollector to CollectorManager 2022-07-27 14:02:14 +02:00
Thomas Roehl
e58eff2fac Add Collector for S.M.A.R.T disk data 2022-07-27 13:59:06 +02:00
7 changed files with 250 additions and 189 deletions

View File

@@ -112,9 +112,7 @@ DEB: scripts/cc-metric-collector.deb.control $(APP)
#@mkdir --parents --verbose $$DEBIANDIR
@CONTROLFILE="$${BASEDIR}/scripts/cc-metric-collector.deb.control"
@COMMITISH="HEAD"
@git describe --tags --abbrev=0 $${COMMITISH}
@VERS=$$(git describe --tags --abbrev=0 $${COMMITISH})
@if [ -z "$$VERS" ]; then VERS=${GITHUB_REF_NAME}; fi
@VERS=$${VERS#v}
@VERS=$$(echo $$VERS | sed -e s+'-'+'_'+g)
@ARCH=$$(uname -m)
@@ -123,14 +121,8 @@ DEB: scripts/cc-metric-collector.deb.control $(APP)
@SIZE_BYTES=$$(du -bcs --exclude=.dpkgbuild "$$WORKSPACE"/ | awk '{print $$1}' | head -1 | sed -e 's/^0\+//')
@SIZE="$$(awk -v size="$$SIZE_BYTES" 'BEGIN {print (size/1024)+1}' | awk '{print int($$0)}')"
#@sed -e s+"{VERSION}"+"$$VERS"+g -e s+"{INSTALLED_SIZE}"+"$$SIZE"+g -e s+"{ARCH}"+"$$ARCH"+g $$CONTROLFILE > $${DEBIANDIR}/control
@echo "Version: $$VERS"
@echo "Size: $$SIZE"
@echo "Arch: $$ARCH"
@sed -e s+"{VERSION}"+"$$VERS"+g -e s+"{INSTALLED_SIZE}"+"$$SIZE"+g -e s+"{ARCH}"+"$$ARCH"+g $$CONTROLFILE > $${DEBIANBINDIR}/control
@make PREFIX=$${WORKSPACE} install
@DEB_FILE="cc-metric-collector_$${VERS}_$${ARCH}.deb"
@dpkg-deb -b $${WORKSPACE} "$$DEB_FILE"
@if [ "$${GITHUB_ACTIONS}" = "true" ]; then
@ echo "::set-output name=DEB::$${DEB_FILE}"
@fi
@rm -r "$${WORKSPACE}"

View File

@@ -37,7 +37,7 @@ var AvailableCollectors = map[string]MetricCollector{
"beegfs_meta": new(BeegfsMetaCollector),
"beegfs_storage": new(BeegfsStorageCollector),
"rocm_smi": new(RocmSmiCollector),
"self": new(SelfCollector),
"smartmon": new(SmartMonCollector),
"schedstat": new(SchedstatCollector),
}

View File

@@ -1,144 +0,0 @@
package collectors
import (
"encoding/json"
"runtime"
"syscall"
"time"
cclog "github.com/ClusterCockpit/cc-metric-collector/pkg/ccLogger"
lp "github.com/ClusterCockpit/cc-metric-collector/pkg/ccMetric"
)
type SelfCollectorConfig struct {
MemStats bool `json:"read_mem_stats"`
GoRoutines bool `json:"read_goroutines"`
CgoCalls bool `json:"read_cgo_calls"`
Rusage bool `json:"read_rusage"`
}
type SelfCollector struct {
metricCollector
config SelfCollectorConfig // the configuration structure
meta map[string]string // default meta information
tags map[string]string // default tags
}
func (m *SelfCollector) Init(config json.RawMessage) error {
var err error = nil
m.name = "SelfCollector"
m.setup()
m.parallel = true
m.meta = map[string]string{"source": m.name, "group": "Self"}
m.tags = map[string]string{"type": "node"}
if len(config) > 0 {
err = json.Unmarshal(config, &m.config)
if err != nil {
cclog.ComponentError(m.name, "Error reading config:", err.Error())
return err
}
}
m.init = true
return err
}
func (m *SelfCollector) Read(interval time.Duration, output chan lp.CCMetric) {
timestamp := time.Now()
if m.config.MemStats {
var memstats runtime.MemStats
runtime.ReadMemStats(&memstats)
y, err := lp.New("total_alloc", m.tags, m.meta, map[string]interface{}{"value": memstats.TotalAlloc}, timestamp)
if err == nil {
y.AddMeta("unit", "Bytes")
output <- y
}
y, err = lp.New("heap_alloc", m.tags, m.meta, map[string]interface{}{"value": memstats.HeapAlloc}, timestamp)
if err == nil {
y.AddMeta("unit", "Bytes")
output <- y
}
y, err = lp.New("heap_sys", m.tags, m.meta, map[string]interface{}{"value": memstats.HeapSys}, timestamp)
if err == nil {
y.AddMeta("unit", "Bytes")
output <- y
}
y, err = lp.New("heap_idle", m.tags, m.meta, map[string]interface{}{"value": memstats.HeapIdle}, timestamp)
if err == nil {
y.AddMeta("unit", "Bytes")
output <- y
}
y, err = lp.New("heap_inuse", m.tags, m.meta, map[string]interface{}{"value": memstats.HeapInuse}, timestamp)
if err == nil {
y.AddMeta("unit", "Bytes")
output <- y
}
y, err = lp.New("heap_released", m.tags, m.meta, map[string]interface{}{"value": memstats.HeapReleased}, timestamp)
if err == nil {
y.AddMeta("unit", "Bytes")
output <- y
}
y, err = lp.New("heap_objects", m.tags, m.meta, map[string]interface{}{"value": memstats.HeapObjects}, timestamp)
if err == nil {
output <- y
}
}
if m.config.GoRoutines {
y, err := lp.New("num_goroutines", m.tags, m.meta, map[string]interface{}{"value": runtime.NumGoroutine()}, timestamp)
if err == nil {
output <- y
}
}
if m.config.CgoCalls {
y, err := lp.New("num_cgo_calls", m.tags, m.meta, map[string]interface{}{"value": runtime.NumCgoCall()}, timestamp)
if err == nil {
output <- y
}
}
if m.config.Rusage {
var rusage syscall.Rusage
err := syscall.Getrusage(syscall.RUSAGE_SELF, &rusage)
if err == nil {
sec, nsec := rusage.Utime.Unix()
t := float64(sec) + (float64(nsec) * 1e-9)
y, err := lp.New("rusage_user_time", m.tags, m.meta, map[string]interface{}{"value": t}, timestamp)
if err == nil {
y.AddMeta("unit", "seconds")
output <- y
}
sec, nsec = rusage.Stime.Unix()
t = float64(sec) + (float64(nsec) * 1e-9)
y, err = lp.New("rusage_system_time", m.tags, m.meta, map[string]interface{}{"value": t}, timestamp)
if err == nil {
y.AddMeta("unit", "seconds")
output <- y
}
y, err = lp.New("rusage_vol_ctx_switch", m.tags, m.meta, map[string]interface{}{"value": rusage.Nvcsw}, timestamp)
if err == nil {
output <- y
}
y, err = lp.New("rusage_invol_ctx_switch", m.tags, m.meta, map[string]interface{}{"value": rusage.Nivcsw}, timestamp)
if err == nil {
output <- y
}
y, err = lp.New("rusage_signals", m.tags, m.meta, map[string]interface{}{"value": rusage.Nsignals}, timestamp)
if err == nil {
output <- y
}
y, err = lp.New("rusage_major_pgfaults", m.tags, m.meta, map[string]interface{}{"value": rusage.Majflt}, timestamp)
if err == nil {
output <- y
}
y, err = lp.New("rusage_minor_pgfaults", m.tags, m.meta, map[string]interface{}{"value": rusage.Minflt}, timestamp)
if err == nil {
output <- y
}
}
}
}
func (m *SelfCollector) Close() {
m.init = false
}

View File

@@ -1,34 +0,0 @@
## `self` collector
```json
"self": {
"read_mem_stats" : true,
"read_goroutines" : true,
"read_cgo_calls" : true,
"read_rusage" : true
}
```
The `self` collector reads the data from the `runtime` and `syscall` packages, so monitors the execution of the cc-metric-collector itself.
Metrics:
* If `read_mem_stats == true`:
* `total_alloc`: The metric reports cumulative bytes allocated for heap objects.
* `heap_alloc`: The metric reports bytes of allocated heap objects.
* `heap_sys`: The metric reports bytes of heap memory obtained from the OS.
* `heap_idle`: The metric reports bytes in idle (unused) spans.
* `heap_inuse`: The metric reports bytes in in-use spans.
* `heap_released`: The metric reports bytes of physical memory returned to the OS.
* `heap_objects`: The metric reports the number of allocated heap objects.
* If `read_goroutines == true`:
* `num_goroutines`: The metric reports the number of goroutines that currently exist.
* If `read_cgo_calls == true`:
* `num_cgo_calls`: The metric reports the number of cgo calls made by the current process.
* If `read_rusage == true`:
* `rusage_user_time`: The metric reports the amount of time that this process has been scheduled in user mode.
* `rusage_system_time`: The metric reports the amount of time that this process has been scheduled in kernel mode.
* `rusage_vol_ctx_switch`: The metric reports the amount of voluntary context switches.
* `rusage_invol_ctx_switch`: The metric reports the amount of involuntary context switches.
* `rusage_signals`: The metric reports the number of signals received.
* `rusage_major_pgfaults`: The metric reports the number of major faults the process has made which have required loading a memory page from disk.
* `rusage_minor_pgfaults`: The metric reports the number of minor faults the process has made which have not required loading a memory page from disk.

View File

@@ -0,0 +1,220 @@
package collectors
import (
"encoding/json"
"os/exec"
"time"
cclog "github.com/ClusterCockpit/cc-metric-collector/pkg/ccLogger"
lp "github.com/ClusterCockpit/cc-metric-collector/pkg/ccMetric"
)
type SmartMonCollectorConfig struct {
UseSudo bool `json:"use_sudo"`
ExcludeDevices []string `json:"exclude_devices"`
}
type SmartMonCollector struct {
metricCollector
config SmartMonCollectorConfig // the configuration structure
meta map[string]string // default meta information
tags map[string]string // default tags
devices []string // smartmon devices
sudoCmd string // Full path to 'sudo' command
smartCtlCmd string // Full path to 'smartctl' command
}
func (m *SmartMonCollector) getSmartmonDevices() error {
var command *exec.Cmd
var scan struct {
Devices []struct {
Name string `json:"name"`
Type string `json:"type"`
} `json:"devices"`
}
m.devices = make([]string, 0)
if m.config.UseSudo {
command = exec.Command(m.sudoCmd, m.smartCtlCmd, "--scan", "-j")
} else {
command = exec.Command(m.smartCtlCmd, "--scan", "-j")
}
command.Wait()
stdout, err := command.Output()
if err != nil {
return err
}
err = json.Unmarshal(stdout, &scan)
if err != nil {
return err
}
for _, d := range scan.Devices {
if len(d.Name) > 0 {
m.devices = append(m.devices, d.Name)
}
}
return nil
}
func (m *SmartMonCollector) Init(config json.RawMessage) error {
var err error = nil
m.name = "SmartMonCollector"
m.setup()
m.parallel = true
m.meta = map[string]string{"source": m.name, "group": "Disk"}
m.tags = map[string]string{"type": "node", "stype": "disk"}
// Read in the JSON configuration
if len(config) > 0 {
err = json.Unmarshal(config, &m.config)
if err != nil {
cclog.ComponentError(m.name, "Error reading config:", err.Error())
return err
}
}
if m.config.UseSudo {
p, err := exec.LookPath("sudo")
if err != nil {
return err
}
m.sudoCmd = p
}
p, err := exec.LookPath("smartctl")
if err != nil {
return err
}
m.smartCtlCmd = p
err = m.getSmartmonDevices()
if err != nil {
return err
}
m.init = true
return err
}
type SmartMonData struct {
SerialNumber string `json:"serial_number"`
UserCapacity struct {
Blocks int `json:"blocks"`
Bytes int `json:"bytes"`
} `json:"user_capacity"`
HealthLog struct {
Temperature int `json:"temperature"`
PercentageUsed int `json:"percentage_used"`
AvailableSpare int `json:"available_spare"`
DataUnitsRead int `json:"data_units_read"`
DataUnitsWrite int `json:"data_units_written"`
HostReads int `json:"host_reads"`
HostWrites int `json:"host_writes"`
PowerCycles int `json:"power_cycles"`
PowerOnHours int `json:"power_on_hours"`
UnsafeShutdowns int `json:"unsafe_shutdowns"`
MediaErrors int `json:"media_errors"`
NumErrorLogEntries int `json:"num_err_log_entries"`
WarnTempTime int `json:"warning_temp_time"`
CriticalTempTime int `json:"critical_comp_time"`
} `json:"nvme_smart_health_information_log"`
}
func (m *SmartMonCollector) Read(interval time.Duration, output chan lp.CCMetric) {
timestamp := time.Now()
for _, d := range m.devices {
var command *exec.Cmd
var data SmartMonData
if m.config.UseSudo {
command = exec.Command(m.sudoCmd, m.smartCtlCmd, "-j", "-a", d)
} else {
command = exec.Command(m.smartCtlCmd, "-j", "-a", d)
}
command.Wait()
stdout, err := command.Output()
if err != nil {
cclog.ComponentError(m.name, "cannot read data for device", d)
continue
}
err = json.Unmarshal(stdout, &data)
if err != nil {
cclog.ComponentError(m.name, "cannot unmarshal data for device", d)
continue
}
y, err := lp.New("smartmon_temp", m.tags, m.meta, map[string]interface{}{"value": data.HealthLog.Temperature}, timestamp)
if err == nil {
y.AddTag("stype-id", d)
y.AddMeta("unit", "degC")
output <- y
}
y, err = lp.New("smartmon_percent_used", m.tags, m.meta, map[string]interface{}{"value": data.HealthLog.PercentageUsed}, timestamp)
if err == nil {
y.AddTag("stype-id", d)
y.AddMeta("unit", "percent")
output <- y
}
y, err = lp.New("smartmon_avail_spare", m.tags, m.meta, map[string]interface{}{"value": data.HealthLog.AvailableSpare}, timestamp)
if err == nil {
y.AddTag("stype-id", d)
y.AddMeta("unit", "percent")
output <- y
}
y, err = lp.New("smartmon_data_units_read", m.tags, m.meta, map[string]interface{}{"value": data.HealthLog.DataUnitsRead}, timestamp)
if err == nil {
y.AddTag("stype-id", d)
output <- y
}
y, err = lp.New("smartmon_data_units_write", m.tags, m.meta, map[string]interface{}{"value": data.HealthLog.DataUnitsWrite}, timestamp)
if err == nil {
y.AddTag("stype-id", d)
output <- y
}
y, err = lp.New("smartmon_host_reads", m.tags, m.meta, map[string]interface{}{"value": data.HealthLog.HostReads}, timestamp)
if err == nil {
y.AddTag("stype-id", d)
output <- y
}
y, err = lp.New("smartmon_host_writes", m.tags, m.meta, map[string]interface{}{"value": data.HealthLog.HostWrites}, timestamp)
if err == nil {
y.AddTag("stype-id", d)
output <- y
}
y, err = lp.New("smartmon_power_cycles", m.tags, m.meta, map[string]interface{}{"value": data.HealthLog.PowerCycles}, timestamp)
if err == nil {
y.AddTag("stype-id", d)
output <- y
}
y, err = lp.New("smartmon_power_on", m.tags, m.meta, map[string]interface{}{"value": int64(data.HealthLog.PowerOnHours) * 3600}, timestamp)
if err == nil {
y.AddTag("stype-id", d)
y.AddMeta("unit", "seconds")
output <- y
}
y, err = lp.New("smartmon_unsafe_shutdowns", m.tags, m.meta, map[string]interface{}{"value": data.HealthLog.UnsafeShutdowns}, timestamp)
if err == nil {
y.AddTag("stype-id", d)
output <- y
}
y, err = lp.New("smartmon_media_errors", m.tags, m.meta, map[string]interface{}{"value": data.HealthLog.MediaErrors}, timestamp)
if err == nil {
y.AddTag("stype-id", d)
output <- y
}
y, err = lp.New("smartmon_errlog_entries", m.tags, m.meta, map[string]interface{}{"value": data.HealthLog.NumErrorLogEntries}, timestamp)
if err == nil {
y.AddTag("stype-id", d)
output <- y
}
y, err = lp.New("smartmon_warn_temp_time", m.tags, m.meta, map[string]interface{}{"value": data.HealthLog.WarnTempTime}, timestamp)
if err == nil {
y.AddTag("stype-id", d)
output <- y
}
y, err = lp.New("smartmon_crit_temp_time", m.tags, m.meta, map[string]interface{}{"value": data.HealthLog.CriticalTempTime}, timestamp)
if err == nil {
y.AddTag("stype-id", d)
output <- y
}
}
}
func (m *SmartMonCollector) Close() {
m.init = false
}

View File

@@ -0,0 +1,29 @@
## `smartmon` collector
```json
"smartmon": {
"use_sudo" : true,
"exclude_devices": [
"/dev/sda",
]
}
```
The `smartmon` collector reads the data from the command `smartctl`. It retrieves S.M.A.R.T data from disks
Metrics:
* `smartmon_temp`: Temperature of the device (`unit=degC`)
* `smartmon_avail_spare`: Amount of spare left (`unit=percent`)
* `smartmon_percent_used`: Percentage of the device is used (`unit=percent`)
* `smartmon_data_units_read`: Read data units
* `smartmon_data_units_write`: Written data units
* `smartmon_host_reads`: Read operations
* `smartmon_host_writes`: Write operations
* `smartmon_power_cycles`: Number of power cycles
* `smartmon_power_on`: Seconds the device is powered on (`unit=seconds`)
* `smartmon_unsafe_shutdowns`: Count of unsafe shutdowns
* `smartmon_media_errors`: Media errors of the device
* `smartmon_errlog_entries`: Error log entries
* `smartmon_warn_temp_time`: Time above the warning temperature threshold
* `smartmon_crit_temp_time`: Time above the critical temperature threshold

2
go.mod
View File

@@ -8,7 +8,6 @@ require (
github.com/NVIDIA/go-nvml v0.11.6-0
github.com/PaesslerAG/gval v1.2.0
github.com/gorilla/mux v1.8.0
github.com/influxdata/influxdb-client-go/v2 v2.9.0
github.com/influxdata/influxdb-client-go/v2 v2.9.1
github.com/influxdata/line-protocol v0.0.0-20210922203350-b1ad95c89adf
github.com/nats-io/nats.go v1.16.0
@@ -18,7 +17,6 @@ require (
golang.org/x/sys v0.0.0-20220712014510-0a85c31ab51e
)
require (
github.com/beorn7/perks v1.0.1 // indirect
github.com/cespare/xxhash/v2 v2.1.2 // indirect