// Mirror of https://github.com/ClusterCockpit/cc-metric-collector.git
// (synced 2026-03-10 10:37:29 +01:00; 247 lines, 6.8 KiB, Go).

package collectors
|
|
|
|
import (
|
|
"encoding/json"
|
|
"fmt"
|
|
"os/exec"
|
|
"time"
|
|
|
|
cclog "github.com/ClusterCockpit/cc-lib/v2/ccLogger"
|
|
lp "github.com/ClusterCockpit/cc-lib/v2/ccMessage"
|
|
)
|
|
|
|
// SmartMonCollectorConfig is the JSON configuration of the SmartMonCollector.
// Both keys are optional.
type SmartMonCollectorConfig struct {
	// UseSudo selects whether smartctl is invoked through sudo.
	UseSudo bool `json:"use_sudo,omitempty"`
	// ExcludeDevices is read from the "exclude_devices" key.
	// NOTE(review): presumably device names that should not be monitored —
	// confirm against the device scan code.
	ExcludeDevices []string `json:"exclude_devices,omitempty"`
}
|
|
|
|
type SmartMonCollector struct {
|
|
metricCollector
|
|
config SmartMonCollectorConfig // the configuration structure
|
|
meta map[string]string // default meta information
|
|
tags map[string]string // default tags
|
|
devices []string // smartmon devices
|
|
sudoCmd string // Full path to 'sudo' command
|
|
smartCtlCmd string // Full path to 'smartctl' command
|
|
}
|
|
|
|
func (m *SmartMonCollector) getSmartmonDevices() error {
|
|
var scan struct {
|
|
Devices []struct {
|
|
Name string `json:"name"`
|
|
Type string `json:"type"`
|
|
} `json:"devices"`
|
|
}
|
|
m.devices = make([]string, 0)
|
|
|
|
var command *exec.Cmd
|
|
if m.config.UseSudo {
|
|
command = exec.Command(m.sudoCmd, m.smartCtlCmd, "--scan", "--json=c")
|
|
} else {
|
|
command = exec.Command(m.smartCtlCmd, "--scan", "--json=c")
|
|
}
|
|
|
|
stdout, err := command.Output()
|
|
if err != nil {
|
|
return fmt.Errorf(
|
|
"%s getSmartmonDevices(): Failed to execute device scan command %s: %w",
|
|
m.name, command.String(), err)
|
|
}
|
|
err = json.Unmarshal(stdout, &scan)
|
|
if err != nil {
|
|
return fmt.Errorf("%s getSmartmonDevices(): Failed to parse JSON output from device scan command: %w",
|
|
m.name, err)
|
|
}
|
|
for _, d := range scan.Devices {
|
|
if len(d.Name) > 0 {
|
|
m.devices = append(m.devices, d.Name)
|
|
}
|
|
}
|
|
|
|
return nil
|
|
}
|
|
|
|
func (m *SmartMonCollector) Init(config json.RawMessage) error {
|
|
m.name = "SmartMonCollector"
|
|
if err := m.setup(); err != nil {
|
|
return fmt.Errorf("%s Init(): setup() call failed: %w", m.name, err)
|
|
}
|
|
m.parallel = true
|
|
m.meta = map[string]string{
|
|
"source": m.name,
|
|
"group": "Disk",
|
|
}
|
|
m.tags = map[string]string{
|
|
"type": "node",
|
|
"stype": "disk",
|
|
}
|
|
|
|
// Read in the JSON configuration
|
|
if len(config) > 0 {
|
|
if err := json.Unmarshal(config, &m.config); err != nil {
|
|
return fmt.Errorf("%s Init(): Error reading config: %w", m.name, err)
|
|
}
|
|
}
|
|
|
|
// Check if sudo and smartctl are in search path
|
|
if m.config.UseSudo {
|
|
p, err := exec.LookPath("sudo")
|
|
if err != nil {
|
|
return fmt.Errorf("%s Init(): No sudo command found in search path: %w", m.name, err)
|
|
}
|
|
m.sudoCmd = p
|
|
}
|
|
p, err := exec.LookPath("smartctl")
|
|
if err != nil {
|
|
return fmt.Errorf("%s Init(): No smartctl command found in search path: %w", m.name, err)
|
|
}
|
|
m.smartCtlCmd = p
|
|
|
|
if err = m.getSmartmonDevices(); err != nil {
|
|
return err
|
|
}
|
|
|
|
m.init = true
|
|
return err
|
|
}
|
|
|
|
// SmartMonData describes the subset of a JSON document that is decoded for
// one device: the serial number, the user capacity and the NVMe SMART health
// information log. Keys not listed here are ignored when decoding.
type SmartMonData struct {
	SerialNumber string `json:"serial_number"`
	// UserCapacity is decoded from the "user_capacity" object.
	UserCapacity struct {
		Blocks int `json:"blocks"`
		Bytes  int `json:"bytes"`
	} `json:"user_capacity"`
	// HealthLog is decoded from the "nvme_smart_health_information_log"
	// object.
	HealthLog struct {
		Temperature        int `json:"temperature"`
		PercentageUsed     int `json:"percentage_used"`
		AvailableSpare     int `json:"available_spare"`
		DataUnitsRead      int `json:"data_units_read"`
		DataUnitsWrite     int `json:"data_units_written"`
		HostReads          int `json:"host_reads"`
		HostWrites         int `json:"host_writes"`
		PowerCycles        int `json:"power_cycles"`
		PowerOnHours       int `json:"power_on_hours"`
		UnsafeShutdowns    int `json:"unsafe_shutdowns"`
		MediaErrors        int `json:"media_errors"`
		NumErrorLogEntries int `json:"num_err_log_entries"`
		WarnTempTime       int `json:"warning_temp_time"`
		// The JSON key really is "critical_comp_time"; the field name
		// differs on purpose.
		CriticalTempTime int `json:"critical_comp_time"`
	} `json:"nvme_smart_health_information_log"`
}
|
|
|
|
func (m *SmartMonCollector) Read(interval time.Duration, output chan lp.CCMessage) {
|
|
timestamp := time.Now()
|
|
for _, d := range m.devices {
|
|
var command *exec.Cmd
|
|
var data SmartMonData
|
|
if m.config.UseSudo {
|
|
command = exec.Command(m.sudoCmd, m.smartCtlCmd, "-j", "-a", d)
|
|
} else {
|
|
command = exec.Command(m.smartCtlCmd, "-j", "-a", d)
|
|
}
|
|
|
|
stdout, err := command.Output()
|
|
if err != nil {
|
|
cclog.ComponentError(m.name, "cannot read data for device", d)
|
|
continue
|
|
}
|
|
err = json.Unmarshal(stdout, &data)
|
|
if err != nil {
|
|
cclog.ComponentError(m.name, "cannot unmarshal data for device", d)
|
|
continue
|
|
}
|
|
y, err := lp.NewMetric(
|
|
"smartmon_temp", m.tags, m.meta, data.HealthLog.Temperature, timestamp)
|
|
if err == nil {
|
|
y.AddTag("stype-id", d)
|
|
y.AddMeta("unit", "degC")
|
|
output <- y
|
|
}
|
|
y, err = lp.NewMetric(
|
|
"smartmon_percent_used", m.tags, m.meta, data.HealthLog.PercentageUsed, timestamp)
|
|
if err == nil {
|
|
y.AddTag("stype-id", d)
|
|
y.AddMeta("unit", "percent")
|
|
output <- y
|
|
}
|
|
y, err = lp.NewMetric(
|
|
"smartmon_avail_spare", m.tags, m.meta, data.HealthLog.AvailableSpare, timestamp)
|
|
if err == nil {
|
|
y.AddTag("stype-id", d)
|
|
y.AddMeta("unit", "percent")
|
|
output <- y
|
|
}
|
|
y, err = lp.NewMetric(
|
|
"smartmon_data_units_read", m.tags, m.meta, data.HealthLog.DataUnitsRead, timestamp)
|
|
if err == nil {
|
|
y.AddTag("stype-id", d)
|
|
output <- y
|
|
}
|
|
y, err = lp.NewMetric(
|
|
"smartmon_data_units_write", m.tags, m.meta, data.HealthLog.DataUnitsWrite, timestamp)
|
|
if err == nil {
|
|
y.AddTag("stype-id", d)
|
|
output <- y
|
|
}
|
|
y, err = lp.NewMetric(
|
|
"smartmon_host_reads", m.tags, m.meta, data.HealthLog.HostReads, timestamp)
|
|
if err == nil {
|
|
y.AddTag("stype-id", d)
|
|
output <- y
|
|
}
|
|
y, err = lp.NewMetric(
|
|
"smartmon_host_writes", m.tags, m.meta, data.HealthLog.HostWrites, timestamp)
|
|
if err == nil {
|
|
y.AddTag("stype-id", d)
|
|
output <- y
|
|
}
|
|
y, err = lp.NewMetric(
|
|
"smartmon_power_cycles", m.tags, m.meta, data.HealthLog.PowerCycles, timestamp)
|
|
if err == nil {
|
|
y.AddTag("stype-id", d)
|
|
output <- y
|
|
}
|
|
y, err = lp.NewMetric(
|
|
"smartmon_power_on", m.tags, m.meta, int64(data.HealthLog.PowerOnHours)*3600, timestamp)
|
|
if err == nil {
|
|
y.AddTag("stype-id", d)
|
|
y.AddMeta("unit", "sec")
|
|
output <- y
|
|
}
|
|
y, err = lp.NewMetric(
|
|
"smartmon_unsafe_shutdowns", m.tags, m.meta, data.HealthLog.UnsafeShutdowns, timestamp)
|
|
if err == nil {
|
|
y.AddTag("stype-id", d)
|
|
output <- y
|
|
}
|
|
y, err = lp.NewMetric(
|
|
"smartmon_media_errors", m.tags, m.meta, data.HealthLog.MediaErrors, timestamp)
|
|
if err == nil {
|
|
y.AddTag("stype-id", d)
|
|
output <- y
|
|
}
|
|
y, err = lp.NewMetric(
|
|
"smartmon_errlog_entries", m.tags, m.meta, data.HealthLog.NumErrorLogEntries, timestamp)
|
|
if err == nil {
|
|
y.AddTag("stype-id", d)
|
|
output <- y
|
|
}
|
|
y, err = lp.NewMetric(
|
|
"smartmon_warn_temp_time", m.tags, m.meta, data.HealthLog.WarnTempTime, timestamp)
|
|
if err == nil {
|
|
y.AddTag("stype-id", d)
|
|
output <- y
|
|
}
|
|
y, err = lp.NewMetric(
|
|
"smartmon_crit_temp_time", m.tags, m.meta, data.HealthLog.CriticalTempTime, timestamp)
|
|
if err == nil {
|
|
y.AddTag("stype-id", d)
|
|
output <- y
|
|
}
|
|
}
|
|
}
|
|
|
|
func (m *SmartMonCollector) Close() {
|
|
m.init = false
|
|
}
|