Add config option to exclude metrics

This commit is contained in:
Holger Obermaier
2026-03-09 14:00:32 +01:00
parent 162b8cb6d4
commit a4332554f4
2 changed files with 183 additions and 84 deletions

View File

@@ -14,6 +14,7 @@ import (
type SmartMonCollectorConfig struct { type SmartMonCollectorConfig struct {
UseSudo bool `json:"use_sudo,omitempty"` UseSudo bool `json:"use_sudo,omitempty"`
ExcludeDevices []string `json:"exclude_devices,omitempty"` ExcludeDevices []string `json:"exclude_devices,omitempty"`
ExcludeMetrics []string `json:"excludeMetrics,omitempty"`
Devices []struct { Devices []struct {
Name string `json:"name"` Name string `json:"name"`
Type string `json:"type"` Type string `json:"type"`
@@ -34,6 +35,22 @@ type SmartMonCollector struct {
devices []deviceT // smartmon devices devices []deviceT // smartmon devices
sudoCmd string // Full path to 'sudo' command sudoCmd string // Full path to 'sudo' command
smartCtlCmd string // Full path to 'smartctl' command smartCtlCmd string // Full path to 'smartctl' command
// excludeMetric holds one boolean flag per metric emitted by this collector.
// A flag is set to true in Init() when the matching metric name is listed in
// the ExcludeMetrics config option; Read() skips every metric whose flag is set.
// NOTE(review): UnsafeShutdowns is capitalized unlike its sibling fields —
// consider renaming to unsafeShutdowns for consistency.
excludeMetric struct {
temp,
percentUsed,
availSpare,
dataUnitsRead,
dataUnitsWrite,
hostReads,
hostWrites,
powerCycles,
powerOn,
UnsafeShutdowns,
mediaErrors,
errlogEntries,
warnTempTime,
critCompTime bool
}
} }
func (m *SmartMonCollector) getSmartmonDevices() error { func (m *SmartMonCollector) getSmartmonDevices() error {
@@ -116,6 +133,41 @@ func (m *SmartMonCollector) Init(config json.RawMessage) error {
return fmt.Errorf("%s Init(): Error reading config: %w", m.name, err) return fmt.Errorf("%s Init(): Error reading config: %w", m.name, err)
} }
} }
// Translate the metric names listed in the ExcludeMetrics config option into
// the corresponding exclusion flags of m.excludeMetric.
for _, excludeMetric := range m.config.ExcludeMetrics {
switch excludeMetric {
case "smartmon_temp":
m.excludeMetric.temp = true
case "smartmon_percent_used":
m.excludeMetric.percentUsed = true
case "smartmon_avail_spare":
m.excludeMetric.availSpare = true
case "smartmon_data_units_read":
m.excludeMetric.dataUnitsRead = true
case "smartmon_data_units_write":
m.excludeMetric.dataUnitsWrite = true
case "smartmon_host_reads":
m.excludeMetric.hostReads = true
case "smartmon_host_writes":
m.excludeMetric.hostWrites = true
case "smartmon_power_cycles":
m.excludeMetric.powerCycles = true
case "smartmon_power_on":
m.excludeMetric.powerOn = true
case "smartmon_unsafe_shutdowns":
m.excludeMetric.UnsafeShutdowns = true
case "smartmon_media_errors":
m.excludeMetric.mediaErrors = true
case "smartmon_errlog_entries":
m.excludeMetric.errlogEntries = true
case "smartmon_warn_temp_time":
m.excludeMetric.warnTempTime = true
case "smartmon_crit_comp_time":
m.excludeMetric.critCompTime = true
default:
// An unrecognized metric name aborts Init() with an error
// instead of being silently ignored.
return fmt.Errorf("%s Init(): Unknown excluded metric: %s", m.name, excludeMetric)
}
}
// Check if sudo and smartctl are in search path // Check if sudo and smartctl are in search path
if m.config.UseSudo { if m.config.UseSudo {
@@ -146,6 +198,8 @@ type SmartMonData struct {
Bytes int `json:"bytes"` Bytes int `json:"bytes"`
} `json:"user_capacity"` } `json:"user_capacity"`
HealthLog struct { HealthLog struct {
// Available SMART health information:
// sudo smartctl -a --json=c /dev/nvme0 | jq --color-output | less --RAW-CONTROL-CHARS
Temperature int `json:"temperature"` Temperature int `json:"temperature"`
PercentageUsed int `json:"percentage_used"` PercentageUsed int `json:"percentage_used"`
AvailableSpare int `json:"available_spare"` AvailableSpare int `json:"available_spare"`
@@ -159,7 +213,7 @@ type SmartMonData struct {
MediaErrors int `json:"media_errors"` MediaErrors int `json:"media_errors"`
NumErrorLogEntries int `json:"num_err_log_entries"` NumErrorLogEntries int `json:"num_err_log_entries"`
WarnTempTime int `json:"warning_temp_time"` WarnTempTime int `json:"warning_temp_time"`
CriticalTempTime int `json:"critical_comp_time"` CriticalCompTime int `json:"critical_comp_time"`
} `json:"nvme_smart_health_information_log"` } `json:"nvme_smart_health_information_log"`
} }
@@ -179,6 +233,7 @@ func (m *SmartMonCollector) Read(interval time.Duration, output chan lp.CCMessag
cclog.ComponentError(m.name, "cannot unmarshal data for device", d.Name) cclog.ComponentError(m.name, "cannot unmarshal data for device", d.Name)
continue continue
} }
if !m.excludeMetric.temp {
y, err := lp.NewMetric( y, err := lp.NewMetric(
"smartmon_temp", m.tags, m.meta, data.HealthLog.Temperature, timestamp) "smartmon_temp", m.tags, m.meta, data.HealthLog.Temperature, timestamp)
if err == nil { if err == nil {
@@ -186,89 +241,116 @@ func (m *SmartMonCollector) Read(interval time.Duration, output chan lp.CCMessag
y.AddMeta("unit", "degC") y.AddMeta("unit", "degC")
output <- y output <- y
} }
y, err = lp.NewMetric( }
if !m.excludeMetric.percentUsed {
y, err := lp.NewMetric(
"smartmon_percent_used", m.tags, m.meta, data.HealthLog.PercentageUsed, timestamp) "smartmon_percent_used", m.tags, m.meta, data.HealthLog.PercentageUsed, timestamp)
if err == nil { if err == nil {
y.AddTag("stype-id", d.Name) y.AddTag("stype-id", d.Name)
y.AddMeta("unit", "percent") y.AddMeta("unit", "percent")
output <- y output <- y
} }
y, err = lp.NewMetric( }
if !m.excludeMetric.availSpare {
y, err := lp.NewMetric(
"smartmon_avail_spare", m.tags, m.meta, data.HealthLog.AvailableSpare, timestamp) "smartmon_avail_spare", m.tags, m.meta, data.HealthLog.AvailableSpare, timestamp)
if err == nil { if err == nil {
y.AddTag("stype-id", d.Name) y.AddTag("stype-id", d.Name)
y.AddMeta("unit", "percent") y.AddMeta("unit", "percent")
output <- y output <- y
} }
y, err = lp.NewMetric( }
if !m.excludeMetric.dataUnitsRead {
y, err := lp.NewMetric(
"smartmon_data_units_read", m.tags, m.meta, data.HealthLog.DataUnitsRead, timestamp) "smartmon_data_units_read", m.tags, m.meta, data.HealthLog.DataUnitsRead, timestamp)
if err == nil { if err == nil {
y.AddTag("stype-id", d.Name) y.AddTag("stype-id", d.Name)
output <- y output <- y
} }
y, err = lp.NewMetric( }
if !m.excludeMetric.dataUnitsWrite {
y, err := lp.NewMetric(
"smartmon_data_units_write", m.tags, m.meta, data.HealthLog.DataUnitsWrite, timestamp) "smartmon_data_units_write", m.tags, m.meta, data.HealthLog.DataUnitsWrite, timestamp)
if err == nil { if err == nil {
y.AddTag("stype-id", d.Name) y.AddTag("stype-id", d.Name)
output <- y output <- y
} }
y, err = lp.NewMetric( }
if !m.excludeMetric.hostReads {
y, err := lp.NewMetric(
"smartmon_host_reads", m.tags, m.meta, data.HealthLog.HostReads, timestamp) "smartmon_host_reads", m.tags, m.meta, data.HealthLog.HostReads, timestamp)
if err == nil { if err == nil {
y.AddTag("stype-id", d.Name) y.AddTag("stype-id", d.Name)
output <- y output <- y
} }
y, err = lp.NewMetric( }
if !m.excludeMetric.hostWrites {
y, err := lp.NewMetric(
"smartmon_host_writes", m.tags, m.meta, data.HealthLog.HostWrites, timestamp) "smartmon_host_writes", m.tags, m.meta, data.HealthLog.HostWrites, timestamp)
if err == nil { if err == nil {
y.AddTag("stype-id", d.Name) y.AddTag("stype-id", d.Name)
output <- y output <- y
} }
y, err = lp.NewMetric( }
if !m.excludeMetric.powerCycles {
y, err := lp.NewMetric(
"smartmon_power_cycles", m.tags, m.meta, data.HealthLog.PowerCycles, timestamp) "smartmon_power_cycles", m.tags, m.meta, data.HealthLog.PowerCycles, timestamp)
if err == nil { if err == nil {
y.AddTag("stype-id", d.Name) y.AddTag("stype-id", d.Name)
output <- y output <- y
} }
y, err = lp.NewMetric( }
if !m.excludeMetric.powerOn {
y, err := lp.NewMetric(
"smartmon_power_on", m.tags, m.meta, int64(data.HealthLog.PowerOnHours)*3600, timestamp) "smartmon_power_on", m.tags, m.meta, int64(data.HealthLog.PowerOnHours)*3600, timestamp)
if err == nil { if err == nil {
y.AddTag("stype-id", d.Name) y.AddTag("stype-id", d.Name)
y.AddMeta("unit", "sec") y.AddMeta("unit", "sec")
output <- y output <- y
} }
y, err = lp.NewMetric( }
if !m.excludeMetric.UnsafeShutdowns {
y, err := lp.NewMetric(
"smartmon_unsafe_shutdowns", m.tags, m.meta, data.HealthLog.UnsafeShutdowns, timestamp) "smartmon_unsafe_shutdowns", m.tags, m.meta, data.HealthLog.UnsafeShutdowns, timestamp)
if err == nil { if err == nil {
y.AddTag("stype-id", d.Name) y.AddTag("stype-id", d.Name)
output <- y output <- y
} }
y, err = lp.NewMetric( }
if !m.excludeMetric.mediaErrors {
y, err := lp.NewMetric(
"smartmon_media_errors", m.tags, m.meta, data.HealthLog.MediaErrors, timestamp) "smartmon_media_errors", m.tags, m.meta, data.HealthLog.MediaErrors, timestamp)
if err == nil { if err == nil {
y.AddTag("stype-id", d.Name) y.AddTag("stype-id", d.Name)
output <- y output <- y
} }
y, err = lp.NewMetric( }
if !m.excludeMetric.errlogEntries {
y, err := lp.NewMetric(
"smartmon_errlog_entries", m.tags, m.meta, data.HealthLog.NumErrorLogEntries, timestamp) "smartmon_errlog_entries", m.tags, m.meta, data.HealthLog.NumErrorLogEntries, timestamp)
if err == nil { if err == nil {
y.AddTag("stype-id", d.Name) y.AddTag("stype-id", d.Name)
output <- y output <- y
} }
y, err = lp.NewMetric( }
if !m.excludeMetric.warnTempTime {
y, err := lp.NewMetric(
"smartmon_warn_temp_time", m.tags, m.meta, data.HealthLog.WarnTempTime, timestamp) "smartmon_warn_temp_time", m.tags, m.meta, data.HealthLog.WarnTempTime, timestamp)
if err == nil { if err == nil {
y.AddTag("stype-id", d.Name) y.AddTag("stype-id", d.Name)
output <- y output <- y
} }
y, err = lp.NewMetric( }
"smartmon_crit_temp_time", m.tags, m.meta, data.HealthLog.CriticalTempTime, timestamp) if !m.excludeMetric.critCompTime {
y, err := lp.NewMetric(
"smartmon_crit_comp_time", m.tags, m.meta, data.HealthLog.CriticalCompTime, timestamp)
if err == nil { if err == nil {
y.AddTag("stype-id", d.Name) y.AddTag("stype-id", d.Name)
output <- y output <- y
} }
} }
} }
}
func (m *SmartMonCollector) Close() { func (m *SmartMonCollector) Close() {
m.init = false m.init = false

View File

@@ -1,3 +1,14 @@
<!--
---
title: smartmon metric collector
description: Collect S.M.A.R.T data from NVMEs
categories: [cc-metric-collector]
tags: ['Admin']
weight: 2
hugo_path: docs/reference/cc-metric-collector/collectors/smartmonMetric.md
---
-->
## `smartmon` collector ## `smartmon` collector
```json ```json
@@ -6,6 +17,10 @@
"exclude_devices": [ "exclude_devices": [
"/dev/sda" "/dev/sda"
], ],
"excludeMetrics": [
"smartmon_warn_temp_time",
"smartmon_crit_comp_time"
],
"devices": [ "devices": [
"name": "/dev/nvme0" "name": "/dev/nvme0"
"type": "nvme" "type": "nvme"
@@ -13,9 +28,12 @@
} }
``` ```
The `smartmon` collector reads the data from the command `smartctl`. It retrieves S.M.A.R.T data from disks The `smartmon` collector retrieves S.M.A.R.T data from NVMEs via command `smartctl`.
Available NVMEs can be detected automatically by a device scan or added manually with the `devices` config option.
Metrics: Metrics:
* `smartmon_temp`: Temperature of the device (`unit=degC`) * `smartmon_temp`: Temperature of the device (`unit=degC`)
* `smartmon_avail_spare`: Amount of spare left (`unit=percent`) * `smartmon_avail_spare`: Amount of spare left (`unit=percent`)
* `smartmon_percent_used`: Percentage of the device that is used (`unit=percent`) * `smartmon_percent_used`: Percentage of the device that is used (`unit=percent`)
@@ -29,5 +47,4 @@ Metrics:
* `smartmon_media_errors`: Media errors of the device * `smartmon_media_errors`: Media errors of the device
* `smartmon_errlog_entries`: Error log entries * `smartmon_errlog_entries`: Error log entries
* `smartmon_warn_temp_time`: Time above the warning temperature threshold * `smartmon_warn_temp_time`: Time above the warning temperature threshold
* `smartmon_crit_temp_time`: Time above the critical temperature threshold * `smartmon_crit_comp_time`: Time above the critical composite temperature threshold