cc-metric-collector/collectors/smartmonMetric.go
2022-10-10 12:10:22 +02:00

221 lines
6.6 KiB
Go

package collectors
import (
"encoding/json"
"os/exec"
"time"
cclog "github.com/ClusterCockpit/cc-metric-collector/pkg/ccLogger"
lp "github.com/ClusterCockpit/cc-metric-collector/pkg/ccMetric"
)
// SmartMonCollectorConfig holds the options decoded from the JSON
// configuration passed to SmartMonCollector.Init.
type SmartMonCollectorConfig struct {
	// UseSudo makes the collector invoke smartctl through sudo.
	UseSudo bool `json:"use_sudo"`
	// ExcludeDevices is a list of device names read from "exclude_devices".
	// NOTE(review): presumably devices that should not be monitored; confirm
	// where (and whether) this list is applied.
	ExcludeDevices []string `json:"exclude_devices"`
}
// SmartMonCollector reports disk health metrics obtained by running smartctl
// for every device found during initialization.
type SmartMonCollector struct {
	// metricCollector is the embedded base type; it supplies the name, init
	// and parallel fields and the setup method used by this collector.
	metricCollector
	// config is the configuration structure decoded in Init.
	config SmartMonCollectorConfig
	// meta is the default meta information passed to every created metric.
	meta map[string]string
	// tags are the default tags passed to every created metric.
	tags map[string]string
	// devices lists the smartmon device names returned by the smartctl scan.
	devices []string
	// sudoCmd is the full path to the 'sudo' command (set only when UseSudo is enabled).
	sudoCmd string
	// smartCtlCmd is the full path to the 'smartctl' command.
	smartCtlCmd string
}
// getSmartmonDevices asks smartctl for the devices it can monitor and stores
// their names in m.devices, replacing any previously stored list.
//
// The scan runs "smartctl --scan -j", prefixed with sudo when use_sudo is
// configured. Scan entries without a name are skipped, as are devices whose
// name exactly matches an entry of the exclude_devices configuration option.
//
// It returns the error of the command or of the JSON decoding; in both cases
// m.devices is left empty.
func (m *SmartMonCollector) getSmartmonDevices() error {
	var scan struct {
		Devices []struct {
			Name string `json:"name"`
			Type string `json:"type"`
		} `json:"devices"`
	}

	m.devices = make([]string, 0)

	var command *exec.Cmd
	if m.config.UseSudo {
		command = exec.Command(m.sudoCmd, m.smartCtlCmd, "--scan", "-j")
	} else {
		command = exec.Command(m.smartCtlCmd, "--scan", "-j")
	}

	// Output starts the command and waits for it to finish, so no separate
	// Wait call is needed (calling Wait before the command was started only
	// yields a "not started" error).
	stdout, err := command.Output()
	if err != nil {
		return err
	}
	if err := json.Unmarshal(stdout, &scan); err != nil {
		return err
	}

	// Build a set of excluded names once so the filter below is a plain lookup.
	excluded := make(map[string]struct{}, len(m.config.ExcludeDevices))
	for _, name := range m.config.ExcludeDevices {
		excluded[name] = struct{}{}
	}

	for _, d := range scan.Devices {
		if len(d.Name) == 0 {
			continue
		}
		if _, skip := excluded[d.Name]; skip {
			continue
		}
		m.devices = append(m.devices, d.Name)
	}
	return nil
}
// Init prepares the collector for use.
//
// It sets the collector name, the default meta information and tags, decodes
// the optional JSON configuration, resolves the full paths of sudo (only when
// use_sudo is enabled) and smartctl via the PATH, and finally scans for
// devices. The first error encountered is returned; the collector is marked
// as initialized only when every step succeeded.
func (m *SmartMonCollector) Init(config json.RawMessage) error {
	m.name = "SmartMonCollector"
	m.setup()
	m.parallel = true
	m.meta = map[string]string{"source": m.name, "group": "Disk"}
	m.tags = map[string]string{"type": "node", "stype": "disk"}

	// An empty configuration is accepted; the options keep their zero values.
	if len(config) > 0 {
		if jsonErr := json.Unmarshal(config, &m.config); jsonErr != nil {
			cclog.ComponentError(m.name, "Error reading config:", jsonErr.Error())
			return jsonErr
		}
	}

	// sudo is only looked up when it will actually be used.
	if m.config.UseSudo {
		sudoPath, lookErr := exec.LookPath("sudo")
		if lookErr != nil {
			return lookErr
		}
		m.sudoCmd = sudoPath
	}

	smartctlPath, lookErr := exec.LookPath("smartctl")
	if lookErr != nil {
		return lookErr
	}
	m.smartCtlCmd = smartctlPath

	if scanErr := m.getSmartmonDevices(); scanErr != nil {
		return scanErr
	}

	m.init = true
	return nil
}
// SmartMonData describes the parts of smartctl's JSON output that are decoded
// by the collector. Only the HealthLog section is turned into metrics by
// Read; SerialNumber and UserCapacity are decoded but not used there.
type SmartMonData struct {
	// SerialNumber is read from the "serial_number" key.
	SerialNumber string `json:"serial_number"`
	// UserCapacity is read from the "user_capacity" object.
	UserCapacity struct {
		Blocks int `json:"blocks"`
		Bytes  int `json:"bytes"`
	} `json:"user_capacity"`
	// HealthLog is read from the "nvme_smart_health_information_log" object.
	// Read reports Temperature in degC, PercentageUsed and AvailableSpare in
	// percent, and PowerOnHours converted from hours to seconds; the other
	// values are reported without a unit.
	// NOTE(review): the key "critical_comp_time" looks like a typo for
	// "critical_temp_time" but presumably mirrors smartctl's own field name;
	// confirm against smartctl output before changing it.
	HealthLog struct {
		Temperature        int `json:"temperature"`
		PercentageUsed     int `json:"percentage_used"`
		AvailableSpare     int `json:"available_spare"`
		DataUnitsRead      int `json:"data_units_read"`
		DataUnitsWrite     int `json:"data_units_written"`
		HostReads          int `json:"host_reads"`
		HostWrites         int `json:"host_writes"`
		PowerCycles        int `json:"power_cycles"`
		PowerOnHours       int `json:"power_on_hours"`
		UnsafeShutdowns    int `json:"unsafe_shutdowns"`
		MediaErrors        int `json:"media_errors"`
		NumErrorLogEntries int `json:"num_err_log_entries"`
		WarnTempTime       int `json:"warning_temp_time"`
		CriticalTempTime   int `json:"critical_comp_time"`
	} `json:"nvme_smart_health_information_log"`
}
// Read runs "smartctl -j -a <device>" (through sudo when use_sudo is enabled)
// for every device found by Init and sends one metric per health value of the
// device's nvme_smart_health_information_log section to output.
//
// Every metric carries the collector's default tags and meta information plus
// the tag "stype-id" set to the device name; metrics with a known unit also
// get a "unit" meta entry. All metrics of one call share the same timestamp.
// The interval argument is not used.
//
// Devices whose data cannot be read or decoded are logged and skipped; the
// remaining devices are still processed. Read does nothing when the collector
// is not initialized.
func (m *SmartMonCollector) Read(interval time.Duration, output chan lp.CCMetric) {
	// smartctl's exit status is a bitmask. Bits 0-2 signal that the command
	// line, the device access or a SMART command failed; bits 3-7 only report
	// the health state of the disk while the output itself is still valid.
	const smartctlFatalMask = 0x07

	// Nothing to do before a successful Init or after Close.
	if !m.init {
		return
	}

	timestamp := time.Now()

	for _, d := range m.devices {
		var command *exec.Cmd
		if m.config.UseSudo {
			command = exec.Command(m.sudoCmd, m.smartCtlCmd, "-j", "-a", d)
		} else {
			command = exec.Command(m.smartCtlCmd, "-j", "-a", d)
		}

		// Output starts the command and waits for it; it also returns the
		// captured stdout when the command exits with a non-zero status.
		stdout, err := command.Output()
		if err != nil {
			// Keep the data of unhealthy disks: only give up when the
			// command could not run, reported a fatal status bit, or
			// produced no output at all.
			exitErr, isExit := err.(*exec.ExitError)
			if !isExit || exitErr.ExitCode()&smartctlFatalMask != 0 || len(stdout) == 0 {
				cclog.ComponentError(m.name, "cannot read data for device", d, err.Error())
				continue
			}
		}

		var data SmartMonData
		if err := json.Unmarshal(stdout, &data); err != nil {
			cclog.ComponentError(m.name, "cannot unmarshal data for device", d, err.Error())
			continue
		}

		health := data.HealthLog
		// One entry per emitted metric, in emission order. An empty unit
		// means that no "unit" meta entry is attached.
		metrics := []struct {
			name  string
			unit  string
			value interface{}
		}{
			{"smartmon_temp", "degC", health.Temperature},
			{"smartmon_percent_used", "percent", health.PercentageUsed},
			{"smartmon_avail_spare", "percent", health.AvailableSpare},
			{"smartmon_data_units_read", "", health.DataUnitsRead},
			{"smartmon_data_units_write", "", health.DataUnitsWrite},
			{"smartmon_host_reads", "", health.HostReads},
			{"smartmon_host_writes", "", health.HostWrites},
			{"smartmon_power_cycles", "", health.PowerCycles},
			// Reported in seconds; smartctl delivers hours.
			{"smartmon_power_on", "seconds", int64(health.PowerOnHours) * 3600},
			{"smartmon_unsafe_shutdowns", "", health.UnsafeShutdowns},
			{"smartmon_media_errors", "", health.MediaErrors},
			{"smartmon_errlog_entries", "", health.NumErrorLogEntries},
			{"smartmon_warn_temp_time", "", health.WarnTempTime},
			{"smartmon_crit_temp_time", "", health.CriticalTempTime},
		}

		for _, metric := range metrics {
			y, err := lp.New(metric.name, m.tags, m.meta, map[string]interface{}{"value": metric.value}, timestamp)
			if err != nil {
				cclog.ComponentError(m.name, "cannot create metric", metric.name, "for device", d, err.Error())
				continue
			}
			y.AddTag("stype-id", d)
			if metric.unit != "" {
				y.AddMeta("unit", metric.unit)
			}
			output <- y
		}
	}
}
// Close marks the collector as uninitialized by clearing its init flag.
// The stored device list and command paths are left untouched.
func (m *SmartMonCollector) Close() {
	m.init = false
}