Files
cc-metric-collector/collectors/smartmonMetric.go
2026-03-09 14:00:32 +01:00

358 lines
9.9 KiB
Go

package collectors
import (
"encoding/json"
"fmt"
"os/exec"
"slices"
"time"
cclog "github.com/ClusterCockpit/cc-lib/v2/ccLogger"
lp "github.com/ClusterCockpit/cc-lib/v2/ccMessage"
)
type SmartMonCollectorConfig struct {
UseSudo bool `json:"use_sudo,omitempty"`
ExcludeDevices []string `json:"exclude_devices,omitempty"`
ExcludeMetrics []string `json:"excludeMetrics,omitempty"`
Devices []struct {
Name string `json:"name"`
Type string `json:"type"`
} `json:"devices,omitempty"`
}
type deviceT struct {
Name string `json:"name"`
Type string `json:"type"`
queryCommand []string
}
type SmartMonCollector struct {
metricCollector
config SmartMonCollectorConfig // the configuration structure
meta map[string]string // default meta information
tags map[string]string // default tags
devices []deviceT // smartmon devices
sudoCmd string // Full path to 'sudo' command
smartCtlCmd string // Full path to 'smartctl' command
excludeMetric struct {
temp,
percentUsed,
availSpare,
dataUnitsRead,
dataUnitsWrite,
hostReads,
hostWrites,
powerCycles,
powerOn,
UnsafeShutdowns,
mediaErrors,
errlogEntries,
warnTempTime,
critCompTime bool
}
}
func (m *SmartMonCollector) getSmartmonDevices() error {
// Use configured devices
if len(m.config.Devices) > 0 {
for _, configDevice := range m.config.Devices {
if !slices.Contains(m.config.ExcludeDevices, configDevice.Name) {
d := deviceT{
Name: configDevice.Name,
Type: configDevice.Type,
}
if m.config.UseSudo {
d.queryCommand = append(d.queryCommand, m.sudoCmd)
}
d.queryCommand = append(d.queryCommand, m.smartCtlCmd, "--json=c", "--device="+d.Type, "--all", d.Name)
m.devices = append(m.devices, d)
}
}
return nil
}
// Use scan command
var scanCmd []string
if m.config.UseSudo {
scanCmd = append(scanCmd, m.sudoCmd)
}
scanCmd = append(scanCmd, m.smartCtlCmd, "--scan", "--json=c")
command := exec.Command(scanCmd[0], scanCmd[1:]...)
stdout, err := command.Output()
if err != nil {
return fmt.Errorf(
"%s getSmartmonDevices(): Failed to execute device scan command %s: %w",
m.name, command.String(), err)
}
var scanOutput struct {
Devices []deviceT `json:"devices"`
}
err = json.Unmarshal(stdout, &scanOutput)
if err != nil {
return fmt.Errorf("%s getSmartmonDevices(): Failed to parse JSON output from device scan command: %w",
m.name, err)
}
m.devices = make([]deviceT, 0)
for _, d := range scanOutput.Devices {
if !slices.Contains(m.config.ExcludeDevices, d.Name) {
if m.config.UseSudo {
d.queryCommand = append(d.queryCommand, m.sudoCmd)
}
d.queryCommand = append(d.queryCommand, m.smartCtlCmd, "--json=c", "--device="+d.Type, "--all", d.Name)
m.devices = append(m.devices, d)
}
}
return nil
}
func (m *SmartMonCollector) Init(config json.RawMessage) error {
m.name = "SmartMonCollector"
if err := m.setup(); err != nil {
return fmt.Errorf("%s Init(): setup() call failed: %w", m.name, err)
}
m.parallel = true
m.meta = map[string]string{
"source": m.name,
"group": "Disk",
}
m.tags = map[string]string{
"type": "node",
"stype": "disk",
}
// Read in the JSON configuration
if len(config) > 0 {
if err := json.Unmarshal(config, &m.config); err != nil {
return fmt.Errorf("%s Init(): Error reading config: %w", m.name, err)
}
}
for _, excludeMetric := range m.config.ExcludeMetrics {
switch excludeMetric {
case "smartmon_temp":
m.excludeMetric.temp = true
case "smartmon_percent_used":
m.excludeMetric.percentUsed = true
case "smartmon_avail_spare":
m.excludeMetric.availSpare = true
case "smartmon_data_units_read":
m.excludeMetric.dataUnitsRead = true
case "smartmon_data_units_write":
m.excludeMetric.dataUnitsWrite = true
case "smartmon_host_reads":
m.excludeMetric.hostReads = true
case "smartmon_host_writes":
m.excludeMetric.hostWrites = true
case "smartmon_power_cycles":
m.excludeMetric.powerCycles = true
case "smartmon_power_on":
m.excludeMetric.powerOn = true
case "smartmon_unsafe_shutdowns":
m.excludeMetric.UnsafeShutdowns = true
case "smartmon_media_errors":
m.excludeMetric.mediaErrors = true
case "smartmon_errlog_entries":
m.excludeMetric.errlogEntries = true
case "smartmon_warn_temp_time":
m.excludeMetric.warnTempTime = true
case "smartmon_crit_comp_time":
m.excludeMetric.critCompTime = true
default:
return fmt.Errorf("%s Init(): Unknown excluded metric: %s", m.name, excludeMetric)
}
}
// Check if sudo and smartctl are in search path
if m.config.UseSudo {
p, err := exec.LookPath("sudo")
if err != nil {
return fmt.Errorf("%s Init(): No sudo command found in search path: %w", m.name, err)
}
m.sudoCmd = p
}
p, err := exec.LookPath("smartctl")
if err != nil {
return fmt.Errorf("%s Init(): No smartctl command found in search path: %w", m.name, err)
}
m.smartCtlCmd = p
if err = m.getSmartmonDevices(); err != nil {
return err
}
m.init = true
return err
}
type SmartMonData struct {
SerialNumber string `json:"serial_number"`
UserCapacity struct {
Blocks int `json:"blocks"`
Bytes int `json:"bytes"`
} `json:"user_capacity"`
HealthLog struct {
// Available SMART health information:
// sudo smartctl -a --json=c /dev/nvme0 | jq --color-output | less --RAW-CONTROL-CHARS
Temperature int `json:"temperature"`
PercentageUsed int `json:"percentage_used"`
AvailableSpare int `json:"available_spare"`
DataUnitsRead int `json:"data_units_read"`
DataUnitsWrite int `json:"data_units_written"`
HostReads int `json:"host_reads"`
HostWrites int `json:"host_writes"`
PowerCycles int `json:"power_cycles"`
PowerOnHours int `json:"power_on_hours"`
UnsafeShutdowns int `json:"unsafe_shutdowns"`
MediaErrors int `json:"media_errors"`
NumErrorLogEntries int `json:"num_err_log_entries"`
WarnTempTime int `json:"warning_temp_time"`
CriticalCompTime int `json:"critical_comp_time"`
} `json:"nvme_smart_health_information_log"`
}
func (m *SmartMonCollector) Read(interval time.Duration, output chan lp.CCMessage) {
timestamp := time.Now()
for _, d := range m.devices {
var data SmartMonData
command := exec.Command(d.queryCommand[0], d.queryCommand[1:]...)
stdout, err := command.Output()
if err != nil {
cclog.ComponentError(m.name, "cannot read data for device", d.Name)
continue
}
err = json.Unmarshal(stdout, &data)
if err != nil {
cclog.ComponentError(m.name, "cannot unmarshal data for device", d.Name)
continue
}
if !m.excludeMetric.temp {
y, err := lp.NewMetric(
"smartmon_temp", m.tags, m.meta, data.HealthLog.Temperature, timestamp)
if err == nil {
y.AddTag("stype-id", d.Name)
y.AddMeta("unit", "degC")
output <- y
}
}
if !m.excludeMetric.percentUsed {
y, err := lp.NewMetric(
"smartmon_percent_used", m.tags, m.meta, data.HealthLog.PercentageUsed, timestamp)
if err == nil {
y.AddTag("stype-id", d.Name)
y.AddMeta("unit", "percent")
output <- y
}
}
if !m.excludeMetric.availSpare {
y, err := lp.NewMetric(
"smartmon_avail_spare", m.tags, m.meta, data.HealthLog.AvailableSpare, timestamp)
if err == nil {
y.AddTag("stype-id", d.Name)
y.AddMeta("unit", "percent")
output <- y
}
}
if !m.excludeMetric.dataUnitsRead {
y, err := lp.NewMetric(
"smartmon_data_units_read", m.tags, m.meta, data.HealthLog.DataUnitsRead, timestamp)
if err == nil {
y.AddTag("stype-id", d.Name)
output <- y
}
}
if !m.excludeMetric.dataUnitsWrite {
y, err := lp.NewMetric(
"smartmon_data_units_write", m.tags, m.meta, data.HealthLog.DataUnitsWrite, timestamp)
if err == nil {
y.AddTag("stype-id", d.Name)
output <- y
}
}
if !m.excludeMetric.hostReads {
y, err := lp.NewMetric(
"smartmon_host_reads", m.tags, m.meta, data.HealthLog.HostReads, timestamp)
if err == nil {
y.AddTag("stype-id", d.Name)
output <- y
}
}
if !m.excludeMetric.hostWrites {
y, err := lp.NewMetric(
"smartmon_host_writes", m.tags, m.meta, data.HealthLog.HostWrites, timestamp)
if err == nil {
y.AddTag("stype-id", d.Name)
output <- y
}
}
if !m.excludeMetric.powerCycles {
y, err := lp.NewMetric(
"smartmon_power_cycles", m.tags, m.meta, data.HealthLog.PowerCycles, timestamp)
if err == nil {
y.AddTag("stype-id", d.Name)
output <- y
}
}
if !m.excludeMetric.powerOn {
y, err := lp.NewMetric(
"smartmon_power_on", m.tags, m.meta, int64(data.HealthLog.PowerOnHours)*3600, timestamp)
if err == nil {
y.AddTag("stype-id", d.Name)
y.AddMeta("unit", "sec")
output <- y
}
}
if !m.excludeMetric.UnsafeShutdowns {
y, err := lp.NewMetric(
"smartmon_unsafe_shutdowns", m.tags, m.meta, data.HealthLog.UnsafeShutdowns, timestamp)
if err == nil {
y.AddTag("stype-id", d.Name)
output <- y
}
}
if !m.excludeMetric.mediaErrors {
y, err := lp.NewMetric(
"smartmon_media_errors", m.tags, m.meta, data.HealthLog.MediaErrors, timestamp)
if err == nil {
y.AddTag("stype-id", d.Name)
output <- y
}
}
if !m.excludeMetric.errlogEntries {
y, err := lp.NewMetric(
"smartmon_errlog_entries", m.tags, m.meta, data.HealthLog.NumErrorLogEntries, timestamp)
if err == nil {
y.AddTag("stype-id", d.Name)
output <- y
}
}
if !m.excludeMetric.warnTempTime {
y, err := lp.NewMetric(
"smartmon_warn_temp_time", m.tags, m.meta, data.HealthLog.WarnTempTime, timestamp)
if err == nil {
y.AddTag("stype-id", d.Name)
output <- y
}
}
if !m.excludeMetric.critCompTime {
y, err := lp.NewMetric(
"smartmon_crit_comp_time", m.tags, m.meta, data.HealthLog.CriticalCompTime, timestamp)
if err == nil {
y.AddTag("stype-id", d.Name)
output <- y
}
}
}
}
func (m *SmartMonCollector) Close() {
m.init = false
}