diff --git a/collectors/collectorManager.go b/collectors/collectorManager.go index 3b11e78..2cac282 100644 --- a/collectors/collectorManager.go +++ b/collectors/collectorManager.go @@ -37,6 +37,7 @@ var AvailableCollectors = map[string]MetricCollector{ "beegfs_meta": new(BeegfsMetaCollector), "beegfs_storage": new(BeegfsStorageCollector), "rocm_smi": new(RocmSmiCollector), + "smartmon": new(SmartMonCollector), "schedstat": new(SchedstatCollector), } diff --git a/collectors/smartmonMetric.go b/collectors/smartmonMetric.go new file mode 100644 index 0000000..10f358f --- /dev/null +++ b/collectors/smartmonMetric.go @@ -0,0 +1,220 @@ +package collectors + +import ( + "encoding/json" + "os/exec" + "time" + + cclog "github.com/ClusterCockpit/cc-metric-collector/pkg/ccLogger" + lp "github.com/ClusterCockpit/cc-metric-collector/pkg/ccMetric" +) + +type SmartMonCollectorConfig struct { + UseSudo bool `json:"use_sudo"` + ExcludeDevices []string `json:"exclude_devices"` +} + +type SmartMonCollector struct { + metricCollector + config SmartMonCollectorConfig // the configuration structure + meta map[string]string // default meta information + tags map[string]string // default tags + devices []string // smartmon devices + sudoCmd string // Full path to 'sudo' command + smartCtlCmd string // Full path to 'smartctl' command +} + +func (m *SmartMonCollector) getSmartmonDevices() error { + var command *exec.Cmd + var scan struct { + Devices []struct { + Name string `json:"name"` + Type string `json:"type"` + } `json:"devices"` + } + m.devices = make([]string, 0) + if m.config.UseSudo { + command = exec.Command(m.sudoCmd, m.smartCtlCmd, "--scan", "-j") + } else { + command = exec.Command(m.smartCtlCmd, "--scan", "-j") + } + command.Wait() + stdout, err := command.Output() + if err != nil { + return err + } + err = json.Unmarshal(stdout, &scan) + if err != nil { + return err + } + for _, d := range scan.Devices { + if len(d.Name) > 0 { + m.devices = append(m.devices, d.Name) + } + } + + return nil +} + +func (m *SmartMonCollector) Init(config json.RawMessage) error { + var err error = nil + m.name = "SmartMonCollector" + m.setup() + m.parallel = true + m.meta = map[string]string{"source": m.name, "group": "Disk"} + m.tags = map[string]string{"type": "node", "stype": "disk"} + // Read in the JSON configuration + if len(config) > 0 { + err = json.Unmarshal(config, &m.config) + if err != nil { + cclog.ComponentError(m.name, "Error reading config:", err.Error()) + return err + } + } + if m.config.UseSudo { + p, err := exec.LookPath("sudo") + if err != nil { + return err + } + m.sudoCmd = p + } + p, err := exec.LookPath("smartctl") + if err != nil { + return err + } + m.smartCtlCmd = p + err = m.getSmartmonDevices() + if err != nil { + return err + } + + m.init = true + return err +} + +type SmartMonData struct { + SerialNumber string `json:"serial_number"` + UserCapacity struct { + Blocks int `json:"blocks"` + Bytes int `json:"bytes"` + } `json:"user_capacity"` + HealthLog struct { + Temperature int `json:"temperature"` + PercentageUsed int `json:"percentage_used"` + AvailableSpare int `json:"available_spare"` + DataUnitsRead int `json:"data_units_read"` + DataUnitsWrite int `json:"data_units_written"` + HostReads int `json:"host_reads"` + HostWrites int `json:"host_writes"` + PowerCycles int `json:"power_cycles"` + PowerOnHours int `json:"power_on_hours"` + UnsafeShutdowns int `json:"unsafe_shutdowns"` + MediaErrors int `json:"media_errors"` + NumErrorLogEntries int `json:"num_err_log_entries"` + WarnTempTime int `json:"warning_temp_time"` + CriticalTempTime int `json:"critical_comp_time"` + } `json:"nvme_smart_health_information_log"` +} + +func (m *SmartMonCollector) Read(interval time.Duration, output chan lp.CCMetric) { + timestamp := time.Now() + for _, d := range m.devices { + var command *exec.Cmd + var data SmartMonData + if m.config.UseSudo { + command = exec.Command(m.sudoCmd, m.smartCtlCmd, "-j", "-a", d) + } else { + command = exec.Command(m.smartCtlCmd, "-j", "-a", d) + } + command.Wait() + stdout, err := command.Output() + if err != nil { + cclog.ComponentError(m.name, "cannot read data for device", d) + continue + } + err = json.Unmarshal(stdout, &data) + if err != nil { + cclog.ComponentError(m.name, "cannot unmarshal data for device", d) + continue + } + y, err := lp.New("smartmon_temp", m.tags, m.meta, map[string]interface{}{"value": data.HealthLog.Temperature}, timestamp) + if err == nil { + y.AddTag("stype-id", d) + y.AddMeta("unit", "degC") + output <- y + } + y, err = lp.New("smartmon_percent_used", m.tags, m.meta, map[string]interface{}{"value": data.HealthLog.PercentageUsed}, timestamp) + if err == nil { + y.AddTag("stype-id", d) + y.AddMeta("unit", "percent") + output <- y + } + y, err = lp.New("smartmon_avail_spare", m.tags, m.meta, map[string]interface{}{"value": data.HealthLog.AvailableSpare}, timestamp) + if err == nil { + y.AddTag("stype-id", d) + y.AddMeta("unit", "percent") + output <- y + } + y, err = lp.New("smartmon_data_units_read", m.tags, m.meta, map[string]interface{}{"value": data.HealthLog.DataUnitsRead}, timestamp) + if err == nil { + y.AddTag("stype-id", d) + output <- y + } + y, err = lp.New("smartmon_data_units_write", m.tags, m.meta, map[string]interface{}{"value": data.HealthLog.DataUnitsWrite}, timestamp) + if err == nil { + y.AddTag("stype-id", d) + output <- y + } + y, err = lp.New("smartmon_host_reads", m.tags, m.meta, map[string]interface{}{"value": data.HealthLog.HostReads}, timestamp) + if err == nil { + y.AddTag("stype-id", d) + output <- y + } + y, err = lp.New("smartmon_host_writes", m.tags, m.meta, map[string]interface{}{"value": data.HealthLog.HostWrites}, timestamp) + if err == nil { + y.AddTag("stype-id", d) + output <- y + } + y, err = lp.New("smartmon_power_cycles", m.tags, m.meta, map[string]interface{}{"value": data.HealthLog.PowerCycles}, timestamp) + if err == nil { + y.AddTag("stype-id", d) + output <- y + } + y, err = lp.New("smartmon_power_on", m.tags, m.meta, map[string]interface{}{"value": int64(data.HealthLog.PowerOnHours) * 3600}, timestamp) + if err == nil { + y.AddTag("stype-id", d) + y.AddMeta("unit", "seconds") + output <- y + } + y, err = lp.New("smartmon_unsafe_shutdowns", m.tags, m.meta, map[string]interface{}{"value": data.HealthLog.UnsafeShutdowns}, timestamp) + if err == nil { + y.AddTag("stype-id", d) + output <- y + } + y, err = lp.New("smartmon_media_errors", m.tags, m.meta, map[string]interface{}{"value": data.HealthLog.MediaErrors}, timestamp) + if err == nil { + y.AddTag("stype-id", d) + output <- y + } + y, err = lp.New("smartmon_errlog_entries", m.tags, m.meta, map[string]interface{}{"value": data.HealthLog.NumErrorLogEntries}, timestamp) + if err == nil { + y.AddTag("stype-id", d) + output <- y + } + y, err = lp.New("smartmon_warn_temp_time", m.tags, m.meta, map[string]interface{}{"value": data.HealthLog.WarnTempTime}, timestamp) + if err == nil { + y.AddTag("stype-id", d) + output <- y + } + y, err = lp.New("smartmon_crit_temp_time", m.tags, m.meta, map[string]interface{}{"value": data.HealthLog.CriticalTempTime}, timestamp) + if err == nil { + y.AddTag("stype-id", d) + output <- y + } + } + +} + +func (m *SmartMonCollector) Close() { + m.init = false +} diff --git a/collectors/smartmonMetric.md b/collectors/smartmonMetric.md new file mode 100644 index 0000000..a69aacd --- /dev/null +++ b/collectors/smartmonMetric.md @@ -0,0 +1,29 @@ +## `smartmon` collector + +```json + "smartmon": { + "use_sudo" : true, + "exclude_devices": [ + "/dev/sda", + ] + } +``` + +The `smartmon` collector reads the data from the command `smartctl`. It retrieves S.M.A.R.T data from disks + +Metrics: +* `smartmon_temp`: Temperature of the device (`unit=degC`) +* `smartmon_avail_spare`: Amount of spare left (`unit=percent`) +* `smartmon_percent_used`: Percentage of the device is used (`unit=percent`) +* `smartmon_data_units_read`: Read data units +* `smartmon_data_units_write`: Written data units +* `smartmon_host_reads`: Read operations +* `smartmon_host_writes`: Write operations +* `smartmon_power_cycles`: Number of power cycles +* `smartmon_power_on`: Seconds the device is powered on (`unit=seconds`) +* `smartmon_unsafe_shutdowns`: Count of unsafe shutdowns +* `smartmon_media_errors`: Media errors of the device +* `smartmon_errlog_entries`: Error log entries +* `smartmon_warn_temp_time`: Time above the warning temperature threshold +* `smartmon_crit_temp_time`: Time above the critical temperature threshold +