cc-metric-collector/receivers/redfishReceiver.go

1019 lines
30 KiB
Go
Raw Permalink Normal View History

2022-04-19 12:05:03 +02:00
package receivers
import (
"bytes"
"crypto/tls"
2022-04-19 12:05:03 +02:00
"encoding/json"
"fmt"
"io"
2024-03-06 14:59:47 +01:00
"maps"
"net/http"
2022-04-19 12:05:03 +02:00
"strconv"
"strings"
2022-04-19 12:05:03 +02:00
"sync"
"time"
cclog "github.com/ClusterCockpit/cc-metric-collector/pkg/ccLogger"
2024-07-13 02:23:58 +02:00
lp "github.com/ClusterCockpit/cc-energy-manager/pkg/cc-message"
"github.com/ClusterCockpit/cc-metric-collector/pkg/hostlist"
2022-04-19 12:05:03 +02:00
// See: https://pkg.go.dev/github.com/stmcginnis/gofish
"github.com/stmcginnis/gofish"
2022-08-10 10:30:59 +02:00
"github.com/stmcginnis/gofish/common"
"github.com/stmcginnis/gofish/redfish"
2022-04-19 12:05:03 +02:00
)
2022-08-16 15:14:20 +02:00
type RedfishReceiverClientConfig struct {
2022-08-16 15:14:20 +02:00
// Hostname the redfish service belongs to
Hostname string
2022-08-10 10:30:59 +02:00
2022-08-16 15:14:20 +02:00
// is metric excluded globally or per client
isExcluded map[string](bool)
2022-08-10 16:24:21 +02:00
2022-08-16 15:14:20 +02:00
doPowerMetric bool
doProcessorMetrics bool
2024-03-06 14:59:47 +01:00
doSensors bool
2022-08-16 15:14:20 +02:00
doThermalMetrics bool
2022-08-10 16:24:21 +02:00
skipProcessorMetricsURL map[string]bool
2024-03-06 14:59:47 +01:00
// readSensorURLs stores for each chassis ID a list of sensor URLs to read
readSensorURLs map[string][]string
2022-08-16 15:14:20 +02:00
gofish gofish.ClientConfig
}
2022-08-10 16:24:21 +02:00
2022-08-16 15:14:20 +02:00
// RedfishReceiver configuration:
type RedfishReceiver struct {
receiver
2022-08-10 16:24:21 +02:00
2022-08-16 15:14:20 +02:00
config struct {
fanout int
Interval time.Duration
HttpTimeout time.Duration
2022-08-10 16:24:21 +02:00
2022-08-16 15:14:20 +02:00
// Client config for each redfish service
ClientConfigs []RedfishReceiverClientConfig
2022-04-19 12:05:03 +02:00
}
done chan bool // channel to finish / stop redfish receiver
wg sync.WaitGroup // wait group for redfish receiver
}
2024-03-06 14:59:47 +01:00
// deleteEmptyTags drops every entry of tags (tags or meta data tags) whose
// value is the empty string. The map is modified in place.
func deleteEmptyTags(tags map[string]string) {
	hasEmptyValue := func(_ string, v string) bool {
		return len(v) == 0
	}
	maps.DeleteFunc(tags, hasEmptyValue)
}
// setMetricValue returns a new fields map holding value under the key "value"
func setMetricValue(value any) map[string]interface{} {
	fields := make(map[string]interface{}, 1)
	fields["value"] = value
	return fields
}
// sendMetric sends the metric through the sink channel
func (r *RedfishReceiver) sendMetric(name string, tags map[string]string, meta map[string]string, value any, timestamp time.Time) {
deleteEmptyTags(tags)
deleteEmptyTags(meta)
2024-07-13 02:23:58 +02:00
y, err := lp.NewMessage(name, tags, meta, setMetricValue(value), timestamp)
2024-03-06 14:59:47 +01:00
if err == nil {
r.sink <- y
}
}
// readSensors reads sensors from a redfish device
// See: https://redfish.dmtf.org/schemas/v1/Sensor.json
// Redfish URI: /redfish/v1/Chassis/{ChassisId}/Sensors/{SensorId}
//
// On the first successful call for a chassis all its sensors are listed, the
// usable ones are sent as metrics and their URLs are remembered in
// clientConfig.readSensorURLs. Later calls only read the remembered URLs.
//
// An error is returned when listing the sensors fails. In that case nothing is
// remembered for the chassis, so the listing is tried again on the next call.
func (r *RedfishReceiver) readSensors(
	clientConfig *RedfishReceiverClientConfig,
	chassis *redfish.Chassis,
) error {

	// newTags returns the tags of a sensor metric. prefix names the metric
	// specific physical context and name tags
	newTags := func(sensor *redfish.Sensor, prefix string) map[string]string {
		return map[string]string{
			"hostname": clientConfig.Hostname,
			"type":     "node",
			// ChassisType shall indicate the physical form factor for the type of chassis
			"chassis_typ": string(chassis.ChassisType),
			// Chassis name
			"chassis_name": chassis.Name,
			// ID uniquely identifies the resource
			"sensor_id": sensor.ID,
			// The area or device to which this sensor measurement applies
			prefix + "_physical_context": string(sensor.PhysicalContext),
			// Name
			prefix + "_name": sensor.Name,
		}
	}

	writeTemperatureSensor := func(sensor *redfish.Sensor) {
		meta := map[string]string{
			"source": r.name,
			"group":  "Temperature",
			"unit":   "degC",
		}
		r.sendMetric("temperature", newTags(sensor, "temperature"), meta, sensor.Reading, time.Now())
	}

	writeFanSpeedSensor := func(sensor *redfish.Sensor) {
		meta := map[string]string{
			"source": r.name,
			"group":  "FanSpeed",
			"unit":   string(sensor.ReadingUnits),
		}
		r.sendMetric("fan_speed", newTags(sensor, "fan"), meta, sensor.Reading, time.Now())
	}

	writePowerSensor := func(sensor *redfish.Sensor) {
		meta := map[string]string{
			"source": r.name,
			"group":  "Energy",
			"unit":   "watts",
		}
		r.sendMetric("power", newTags(sensor, "power"), meta, sensor.Reading, time.Now())
	}

	// selectWriter returns the metric name and the write function matching
	// the reading type and units of the sensor. The write function is nil
	// for sensors which are not handled
	selectWriter := func(sensor *redfish.Sensor) (string, func(*redfish.Sensor)) {
		switch {
		// Power readings
		case sensor.ReadingUnits == "Watts" &&
			(sensor.ReadingType == redfish.PowerReadingType ||
				sensor.ReadingType == redfish.CurrentReadingType):
			return "power", writePowerSensor
		// Fan speed readings
		case sensor.ReadingType == redfish.AirFlowReadingType &&
			(sensor.ReadingUnits == "RPM" || sensor.ReadingUnits == "Percent"):
			return "fan_speed", writeFanSpeedSensor
		// Temperature readings
		case sensor.ReadingType == redfish.TemperatureReadingType &&
			sensor.ReadingUnits == "C":
			return "temperature", writeTemperatureSensor
		}
		return "", nil
	}

	urls, isKnown := clientConfig.readSensorURLs[chassis.ID]
	if !isKnown {
		// First time run of read sensors for this chassis:
		// Get sensor information for this chassis
		sensors, err := chassis.Sensors()
		if err != nil {
			// The URL list is not stored, so the next call tries again
			return fmt.Errorf("readSensors: chassis.Sensors() failed: %v", err)
		}

		urls = make([]string, 0, len(sensors))
		for _, sensor := range sensors {
			// Skip all sensors which are not in enabled state or which are unhealthy
			if sensor.Status.State != common.EnabledState || sensor.Status.Health != common.OKHealth {
				continue
			}
			// Skip sensors with missing readings units or type
			if sensor.ReadingUnits == "" || sensor.ReadingType == "" {
				continue
			}
			// Skip unhandled sensors and excluded metrics
			name, write := selectWriter(sensor)
			if write == nil || clientConfig.isExcluded[name] {
				continue
			}
			urls = append(urls, sensor.ODataID)
			write(sensor)
		}

		// Remember the sensor URLs (possibly none) for the following calls
		clientConfig.readSensorURLs[chassis.ID] = urls
		return nil
	}

	// Read only the sensors selected in the first run
	common.CollectCollection(
		func(uri string) {
			sensor, err := redfish.GetSensor(chassis.GetClient(), uri)
			if err != nil {
				cclog.ComponentError(r.name, "redfish.GetSensor() for uri '", uri, "' failed:", err)
				// Without this return the sensor would be used although
				// reading it failed
				return
			}
			if _, write := selectWriter(sensor); write != nil {
				write(sensor)
			}
		},
		urls)

	return nil
}
2022-08-16 15:14:20 +02:00
// readThermalMetrics reads thermal metrics from a redfish device
2024-03-06 14:59:47 +01:00
// See: https://redfish.dmtf.org/schemas/v1/Thermal.json
// Redfish URI: /redfish/v1/Chassis/{ChassisId}/Thermal
// -> deprecated in favor of the ThermalSubsystem schema
// -> on Lenovo servers /redfish/v1/Chassis/{ChassisId}/ThermalSubsystem/ThermalMetrics links to /redfish/v1/Chassis/{ChassisId}/Sensors/{SensorId}
2022-08-16 15:14:20 +02:00
func (r *RedfishReceiver) readThermalMetrics(
clientConfig *RedfishReceiverClientConfig,
chassis *redfish.Chassis) error {
2022-08-10 16:24:21 +02:00
2022-08-16 15:14:20 +02:00
// Get thermal information for each chassis
thermal, err := chassis.Thermal()
if err != nil {
return fmt.Errorf("readMetrics: chassis.Thermal() failed: %v", err)
}
2022-08-10 10:30:59 +02:00
2022-08-16 15:14:20 +02:00
// Skip empty thermal information
if thermal == nil {
return nil
}
2022-08-10 10:30:59 +02:00
2022-08-16 15:14:20 +02:00
timestamp := time.Now()
2022-08-10 10:30:59 +02:00
2022-08-16 15:14:20 +02:00
for _, temperature := range thermal.Temperatures {
2022-08-10 10:30:59 +02:00
2022-08-16 15:14:20 +02:00
// Skip, when temperature metric is excluded
if clientConfig.isExcluded["temperature"] {
break
2022-08-10 10:30:59 +02:00
}
2022-08-16 15:14:20 +02:00
// Skip all temperatures which are not in enabled state
if temperature.Status.State != "" && temperature.Status.State != common.EnabledState {
2022-08-16 15:14:20 +02:00
continue
}
2022-08-10 16:24:21 +02:00
2022-08-16 15:14:20 +02:00
tags := map[string]string{
"hostname": clientConfig.Hostname,
"type": "node",
// ChassisType shall indicate the physical form factor for the type of chassis
"chassis_typ": string(chassis.ChassisType),
// Chassis name
"chassis_name": chassis.Name,
// ID uniquely identifies the resource
"temperature_id": temperature.ID,
// MemberID shall uniquely identify the member within the collection. For
// services supporting Redfish v1.6 or higher, this value shall be the
// zero-based array index.
"temperature_member_id": temperature.MemberID,
// PhysicalContext shall be a description of the affected device or region
// within the chassis to which this temperature measurement applies
"temperature_physical_context": string(temperature.PhysicalContext),
// Name
"temperature_name": temperature.Name,
}
2022-08-10 16:24:21 +02:00
2022-08-16 15:14:20 +02:00
// Set meta data tags
meta := map[string]string{
"source": r.name,
"group": "Temperature",
"unit": "degC",
}
2022-08-10 16:24:21 +02:00
2022-08-16 15:14:20 +02:00
// ReadingCelsius shall be the current value of the temperature sensor's reading.
value := temperature.ReadingCelsius
2022-08-10 16:24:21 +02:00
2024-03-06 14:59:47 +01:00
r.sendMetric("temperature", tags, meta, value, timestamp)
2022-08-10 10:30:59 +02:00
}
2022-08-16 15:14:20 +02:00
for _, fan := range thermal.Fans {
// Skip, when fan_speed metric is excluded
if clientConfig.isExcluded["fan_speed"] {
break
2022-08-10 10:30:59 +02:00
}
2022-08-16 15:14:20 +02:00
// Skip all fans which are not in enabled state
if fan.Status.State != common.EnabledState {
continue
2022-08-10 10:30:59 +02:00
}
2022-08-16 15:14:20 +02:00
tags := map[string]string{
"hostname": clientConfig.Hostname,
"type": "node",
// ChassisType shall indicate the physical form factor for the type of chassis
"chassis_typ": string(chassis.ChassisType),
// Chassis name
"chassis_name": chassis.Name,
// ID uniquely identifies the resource
"fan_id": fan.ID,
// MemberID shall uniquely identify the member within the collection. For
// services supporting Redfish v1.6 or higher, this value shall be the
// zero-based array index.
"fan_member_id": fan.MemberID,
// PhysicalContext shall be a description of the affected device or region
// within the chassis to which this fan is associated
"fan_physical_context": string(fan.PhysicalContext),
// Name
"fan_name": fan.Name,
2022-08-10 10:30:59 +02:00
}
2022-08-16 15:14:20 +02:00
// Set meta data tags
meta := map[string]string{
"source": r.name,
"group": "FanSpeed",
"unit": string(fan.ReadingUnits),
}
2022-08-10 10:30:59 +02:00
2024-03-06 14:59:47 +01:00
r.sendMetric("fan_speed", tags, meta, fan.Reading, timestamp)
2022-08-16 15:14:20 +02:00
}
2022-08-10 10:30:59 +02:00
2022-08-16 15:14:20 +02:00
return nil
}
2022-08-10 10:30:59 +02:00
2022-08-16 15:14:20 +02:00
// readPowerMetrics reads power metrics from a redfish device
2024-03-06 14:59:47 +01:00
// See: https://redfish.dmtf.org/schemas/v1/Power.json
// Redfish URI: /redfish/v1/Chassis/{ChassisId}/Power
// -> deprecated in favor of the PowerSubsystem schema
2022-08-16 15:14:20 +02:00
func (r *RedfishReceiver) readPowerMetrics(
clientConfig *RedfishReceiverClientConfig,
chassis *redfish.Chassis) error {
2022-08-10 10:30:59 +02:00
2022-08-16 15:14:20 +02:00
// Get power information for each chassis
power, err := chassis.Power()
if err != nil {
return fmt.Errorf("readMetrics: chassis.Power() failed: %v", err)
}
2022-08-10 10:30:59 +02:00
2022-08-16 15:14:20 +02:00
// Skip empty power information
if power == nil {
2022-08-10 10:30:59 +02:00
return nil
}
2022-08-16 15:14:20 +02:00
timestamp := time.Now()
2022-08-16 15:14:20 +02:00
// Read min, max and average consumed watts for each power control
for _, pc := range power.PowerControl {
// Skip all power controls which are not in enabled state
if pc.Status.State != "" && pc.Status.State != common.EnabledState {
2022-08-16 15:14:20 +02:00
continue
}
2022-08-16 15:14:20 +02:00
// Map of collected metrics
metrics := make(map[string]float32)
2022-08-16 15:14:20 +02:00
// PowerConsumedWatts shall represent the actual power being consumed (in
// Watts) by the chassis
if !clientConfig.isExcluded["consumed_watts"] {
metrics["consumed_watts"] = pc.PowerConsumedWatts
}
2022-08-16 15:14:20 +02:00
// AverageConsumedWatts shall represent the
// average power level that occurred averaged over the last IntervalInMin
// minutes.
if !clientConfig.isExcluded["average_consumed_watts"] {
metrics["average_consumed_watts"] = pc.PowerMetrics.AverageConsumedWatts
}
2022-08-16 15:14:20 +02:00
// MinConsumedWatts shall represent the
// minimum power level in watts that occurred within the last
// IntervalInMin minutes.
if !clientConfig.isExcluded["min_consumed_watts"] {
metrics["min_consumed_watts"] = pc.PowerMetrics.MinConsumedWatts
}
2022-08-16 15:14:20 +02:00
// MaxConsumedWatts shall represent the
// maximum power level in watts that occurred within the last
// IntervalInMin minutes
if !clientConfig.isExcluded["max_consumed_watts"] {
metrics["max_consumed_watts"] = pc.PowerMetrics.MaxConsumedWatts
}
// IntervalInMin shall represent the time interval (or window), in minutes,
// in which the PowerMetrics properties are measured over.
// Should be an integer, but some Dell implementations return as a float
intervalInMin :=
strconv.FormatFloat(
float64(pc.PowerMetrics.IntervalInMin), 'f', -1, 32)
// Set tags
tags := map[string]string{
2022-08-16 15:14:20 +02:00
"hostname": clientConfig.Hostname,
"type": "node",
// ChassisType shall indicate the physical form factor for the type of chassis
"chassis_typ": string(chassis.ChassisType),
// Chassis name
"chassis_name": chassis.Name,
// ID uniquely identifies the resource
2022-08-16 15:14:20 +02:00
"power_control_id": pc.ID,
// MemberID shall uniquely identify the member within the collection. For
// services supporting Redfish v1.6 or higher, this value shall be the
// zero-based array index.
"power_control_member_id": pc.MemberID,
// PhysicalContext shall be a description of the affected device(s) or region
// within the chassis to which this power control applies.
"power_control_physical_context": string(pc.PhysicalContext),
// Name
"power_control_name": pc.Name,
}
// Set meta data tags
2022-08-16 15:14:20 +02:00
meta := map[string]string{
"source": r.name,
"group": "Energy",
"interval_in_minutes": intervalInMin,
"unit": "watts",
}
2022-08-16 15:14:20 +02:00
for name, value := range metrics {
2024-03-06 14:59:47 +01:00
r.sendMetric(name, tags, meta, value, timestamp)
}
2022-08-16 15:14:20 +02:00
}
return nil
}
// readProcessorMetrics reads processor metrics from a redfish device
// See: https://redfish.dmtf.org/schemas/v1/ProcessorMetrics.json
2024-03-06 14:59:47 +01:00
// Redfish URI: /redfish/v1/Systems/{ComputerSystemId}/Processors/{ProcessorId}/ProcessorMetrics
2022-08-16 15:14:20 +02:00
func (r *RedfishReceiver) readProcessorMetrics(
clientConfig *RedfishReceiverClientConfig,
processor *redfish.Processor) error {
timestamp := time.Now()
// URL to processor metrics
2022-08-16 15:14:20 +02:00
URL := processor.ODataID + "/ProcessorMetrics"
// Skip previously detected non existing URLs
if clientConfig.skipProcessorMetricsURL[URL] {
return nil
}
2024-01-22 15:46:18 +01:00
resp, err := processor.GetClient().Get(URL)
2022-08-16 15:14:20 +02:00
if err != nil {
// Skip non existing URLs
if statusCode := err.(*common.Error).HTTPReturnedStatusCode; statusCode == http.StatusNotFound {
clientConfig.skipProcessorMetricsURL[URL] = true
return nil
}
2024-01-22 15:46:18 +01:00
return fmt.Errorf("processor.GetClient().Get(%v) failed: %+w", URL, err)
}
2022-08-16 15:14:20 +02:00
var processorMetrics struct {
common.Entity
ODataType string `json:"@odata.type"`
ODataEtag string `json:"@odata.etag"`
Description string `json:"Description"`
// This property shall contain the power, in watts, that the processor has consumed.
ConsumedPowerWatt float32 `json:"ConsumedPowerWatt"`
// This property shall contain the temperature, in Celsius, of the processor.
TemperatureCelsius float32 `json:"TemperatureCelsius"`
}
body, err := io.ReadAll(resp.Body)
2022-08-16 15:14:20 +02:00
if err != nil {
2024-01-22 15:46:18 +01:00
return fmt.Errorf("unable to read response body for processor metrics: %+w", err)
}
err = json.Unmarshal(body, &processorMetrics)
if err != nil {
return fmt.Errorf(
"unable to unmarshal JSON='%s' for processor metrics: %+w",
string(body),
err,
)
2022-08-16 15:14:20 +02:00
}
// Set tags
tags := map[string]string{
"hostname": clientConfig.Hostname,
"type": "socket",
// ProcessorType shall contain the string which identifies the type of processor contained in this Socket
"processor_typ": string(processor.ProcessorType),
// Processor name
"processor_name": processor.Name,
// ID uniquely identifies the resource
"processor_id": processor.ID,
}
// Set meta data tags
metaPower := map[string]string{
"source": r.name,
"group": "Energy",
"unit": "watts",
}
namePower := "consumed_power"
if !clientConfig.isExcluded[namePower] &&
// Some servers return "ConsumedPowerWatt":65535 instead of "ConsumedPowerWatt":null
processorMetrics.ConsumedPowerWatt != 65535 {
2024-03-06 14:59:47 +01:00
r.sendMetric(namePower, tags, metaPower, processorMetrics.ConsumedPowerWatt, timestamp)
2022-08-16 15:14:20 +02:00
}
// Set meta data tags
metaThermal := map[string]string{
"source": r.name,
"group": "Temperature",
"unit": "degC",
}
2022-04-19 12:05:03 +02:00
2022-08-16 15:14:20 +02:00
nameThermal := "temperature"
if !clientConfig.isExcluded[nameThermal] {
2024-03-06 14:59:47 +01:00
r.sendMetric(nameThermal, tags, metaThermal, processorMetrics.TemperatureCelsius, timestamp)
2022-08-16 15:14:20 +02:00
}
return nil
}
// readMetrics reads redfish thermal, power and processor metrics from the redfish device
// configured in clientConfig
func (r *RedfishReceiver) readMetrics(clientConfig *RedfishReceiverClientConfig) error {
// Connect to redfish service
c, err := gofish.Connect(clientConfig.gofish)
if err != nil {
return fmt.Errorf(
"readMetrics: gofish.Connect({Username: %v, Endpoint: %v, BasicAuth: %v, HttpTimeout: %v, HttpInsecure: %v}) failed: %v",
clientConfig.gofish.Username,
clientConfig.gofish.Endpoint,
clientConfig.gofish.BasicAuth,
clientConfig.gofish.HTTPClient.Timeout,
clientConfig.gofish.HTTPClient.Transport.(*http.Transport).TLSClientConfig.InsecureSkipVerify,
err)
}
defer c.Logout()
2022-04-19 12:05:03 +02:00
2022-08-16 15:14:20 +02:00
// Create a session, when required
if _, err = c.GetSession(); err != nil {
c, err = c.CloneWithSession()
2022-04-19 12:05:03 +02:00
if err != nil {
2022-08-16 15:14:20 +02:00
return fmt.Errorf("readMetrics: Failed to create a session: %+w", err)
2022-08-10 16:24:21 +02:00
}
2022-08-16 15:14:20 +02:00
}
2022-08-10 16:24:21 +02:00
2022-08-16 15:14:20 +02:00
// Get all chassis managed by this service
isChassisListRequired :=
2024-03-06 14:59:47 +01:00
clientConfig.doSensors ||
clientConfig.doThermalMetrics ||
2022-08-16 15:14:20 +02:00
clientConfig.doPowerMetric
var chassisList []*redfish.Chassis
if isChassisListRequired {
chassisList, err = c.Service.Chassis()
2022-04-19 12:05:03 +02:00
if err != nil {
2022-08-10 10:30:59 +02:00
return fmt.Errorf("readMetrics: c.Service.Chassis() failed: %v", err)
2022-04-19 12:05:03 +02:00
}
2022-08-16 15:14:20 +02:00
}
2022-04-19 12:05:03 +02:00
2022-08-16 15:14:20 +02:00
// Get all computer systems managed by this service
isComputerSystemListRequired := clientConfig.doProcessorMetrics
var computerSystemList []*redfish.ComputerSystem
if isComputerSystemListRequired {
computerSystemList, err = c.Service.Systems()
if err != nil {
return fmt.Errorf("readMetrics: c.Service.Systems() failed: %v", err)
}
}
2022-04-19 12:05:03 +02:00
2024-03-06 14:59:47 +01:00
// Read sensors
if clientConfig.doSensors {
for _, chassis := range chassisList {
err := r.readSensors(clientConfig, chassis)
if err != nil {
return err
}
}
}
2022-08-16 15:14:20 +02:00
// read thermal metrics
if clientConfig.doThermalMetrics {
for _, chassis := range chassisList {
err := r.readThermalMetrics(clientConfig, chassis)
2022-04-19 12:05:03 +02:00
if err != nil {
2022-08-10 10:30:59 +02:00
return err
2022-04-19 12:05:03 +02:00
}
2022-08-16 15:14:20 +02:00
}
}
2022-04-20 14:39:26 +02:00
2022-08-16 15:14:20 +02:00
// read power metrics
if clientConfig.doPowerMetric {
for _, chassis := range chassisList {
err = r.readPowerMetrics(clientConfig, chassis)
2022-08-10 10:30:59 +02:00
if err != nil {
return err
2022-04-19 12:05:03 +02:00
}
}
2022-08-16 15:14:20 +02:00
}
2022-04-19 12:05:03 +02:00
2022-08-16 15:14:20 +02:00
// read processor metrics
if clientConfig.doProcessorMetrics {
// loop for all computer systems
2022-08-16 15:14:20 +02:00
for _, system := range computerSystemList {
// loop for all processors
processors, err := system.Processors()
if err != nil {
return fmt.Errorf("readMetrics: system.Processors() failed: %v", err)
}
for _, processor := range processors {
2022-08-16 15:14:20 +02:00
err := r.readProcessorMetrics(clientConfig, processor)
if err != nil {
return err
}
}
}
2022-04-19 12:05:03 +02:00
}
2022-08-16 15:14:20 +02:00
return nil
}
2022-04-19 12:05:03 +02:00
2022-08-16 15:14:20 +02:00
// doReadMetrics reads metrics from all configure redfish devices.
// To compensate latencies of the Redfish devices a fanout is used.
func (r *RedfishReceiver) doReadMetric() {
// Create wait group and input channel for workers
var workerWaitGroup sync.WaitGroup
workerInput := make(chan *RedfishReceiverClientConfig, r.config.fanout)
// Create worker go routines
for i := 0; i < r.config.fanout; i++ {
// Increment worker wait group counter
workerWaitGroup.Add(1)
go func() {
// Decrement worker wait group counter
defer workerWaitGroup.Done()
// Read power metrics for each client config
for clientConfig := range workerInput {
err := r.readMetrics(clientConfig)
if err != nil {
cclog.ComponentError(r.name, err)
2022-04-19 12:05:03 +02:00
}
2022-08-16 15:14:20 +02:00
}
}()
}
2022-04-19 12:05:03 +02:00
2022-08-16 15:14:20 +02:00
// Distribute client configs to workers
for i := range r.config.ClientConfigs {
// Check done channel status
select {
case workerInput <- &r.config.ClientConfigs[i]:
case <-r.done:
// process done event
// Stop workers, clear channel and wait for all workers to finish
close(workerInput)
for range workerInput {
}
2022-08-16 15:14:20 +02:00
workerWaitGroup.Wait()
return
2022-04-19 12:05:03 +02:00
}
}
2022-08-16 15:14:20 +02:00
// Stop workers and wait for all workers to finish
close(workerInput)
workerWaitGroup.Wait()
}
// Start starts the redfish receiver
func (r *RedfishReceiver) Start() {
cclog.ComponentDebug(r.name, "START")
2022-04-19 12:05:03 +02:00
// Start redfish receiver
r.wg.Add(1)
go func() {
defer r.wg.Done()
// Create ticker
ticker := time.NewTicker(r.config.Interval)
2022-04-19 12:05:03 +02:00
defer ticker.Stop()
for {
2022-08-16 15:14:20 +02:00
r.doReadMetric()
2022-04-19 12:05:03 +02:00
select {
2022-08-16 15:14:20 +02:00
case tickerTime := <-ticker.C:
// Check if we missed the ticker event
if since := time.Since(tickerTime); since > 5*time.Second {
cclog.ComponentInfo(r.name, "Missed ticker event for more then", since)
}
2022-04-19 12:05:03 +02:00
// process ticker event -> continue
continue
case <-r.done:
// process done event
return
}
}
}()
cclog.ComponentDebug(r.name, "STARTED")
}
2022-08-16 15:14:20 +02:00
// Close closes the redfish receiver
2022-04-19 12:05:03 +02:00
func (r *RedfishReceiver) Close() {
cclog.ComponentDebug(r.name, "CLOSE")
// Send the signal and wait
2022-04-19 14:01:23 +02:00
close(r.done)
2022-04-19 12:05:03 +02:00
r.wg.Wait()
cclog.ComponentDebug(r.name, "DONE")
}
2022-08-16 15:14:20 +02:00
// NewRedfishReceiver creates a new instance of the redfish receiver
2022-04-19 12:05:03 +02:00
// Initialize the receiver by giving it a name and reading in the config JSON
func NewRedfishReceiver(name string, config json.RawMessage) (Receiver, error) {
r := new(RedfishReceiver)
2022-08-16 15:14:20 +02:00
// Config options from config file
configJSON := struct {
Type string `json:"type"`
// Maximum number of simultaneous redfish connections (default: 64)
Fanout int `json:"fanout,omitempty"`
// How often the redfish power metrics should be read and send to the sink (default: 30 s)
IntervalString string `json:"interval,omitempty"`
// Control whether a client verifies the server's certificate
// (default: true == do not verify server's certificate)
HttpInsecure bool `json:"http_insecure,omitempty"`
// Time limit for requests made by this HTTP client (default: 10 s)
HttpTimeoutString string `json:"http_timeout,omitempty"`
// Default client username, password and endpoint
Username *string `json:"username"` // User name to authenticate with
Password *string `json:"password"` // Password to use for authentication
Endpoint *string `json:"endpoint"` // URL of the redfish service
2022-08-16 15:14:20 +02:00
// Globally disable collection of power, processor or thermal metrics
DisablePowerMetrics bool `json:"disable_power_metrics"`
DisableProcessorMetrics bool `json:"disable_processor_metrics"`
2024-03-06 14:59:47 +01:00
DisableSensors bool `json:"disable_sensors"`
2022-08-16 15:14:20 +02:00
DisableThermalMetrics bool `json:"disable_thermal_metrics"`
// Globally excluded metrics
ExcludeMetrics []string `json:"exclude_metrics,omitempty"`
ClientConfigs []struct {
HostList string `json:"host_list"` // List of hosts with the same client configuration
Username *string `json:"username"` // User name to authenticate with
Password *string `json:"password"` // Password to use for authentication
Endpoint *string `json:"endpoint"` // URL of the redfish service
2022-08-16 15:14:20 +02:00
// Per client disable collection of power,processor or thermal metrics
DisablePowerMetrics bool `json:"disable_power_metrics"`
DisableProcessorMetrics bool `json:"disable_processor_metrics"`
2024-03-06 14:59:47 +01:00
DisableSensors bool `json:"disable_sensors"`
2022-08-16 15:14:20 +02:00
DisableThermalMetrics bool `json:"disable_thermal_metrics"`
// Per client excluded metrics
ExcludeMetrics []string `json:"exclude_metrics,omitempty"`
} `json:"client_config"`
}{
// Set defaults values
// Allow overwriting these defaults by reading config JSON
Fanout: 64,
IntervalString: "30s",
HttpTimeoutString: "10s",
HttpInsecure: true,
}
2022-04-19 12:05:03 +02:00
// Set name
r.name = fmt.Sprintf("RedfishReceiver(%s)", name)
// Create done channel
r.done = make(chan bool)
// Read the redfish receiver specific JSON config
if len(config) > 0 {
d := json.NewDecoder(bytes.NewReader(config))
d.DisallowUnknownFields()
if err := d.Decode(&configJSON); err != nil {
2022-04-19 12:05:03 +02:00
cclog.ComponentError(r.name, "Error reading config:", err.Error())
return nil, err
}
}
// Convert interval string representation to duration
var err error
2022-08-16 15:14:20 +02:00
r.config.Interval, err = time.ParseDuration(configJSON.IntervalString)
if err != nil {
err := fmt.Errorf(
"Failed to parse duration string interval='%s': %w",
2022-08-16 15:14:20 +02:00
configJSON.IntervalString,
err,
)
cclog.Error(r.name, err)
return nil, err
}
// HTTP timeout duration
2022-08-16 15:14:20 +02:00
r.config.HttpTimeout, err = time.ParseDuration(configJSON.HttpTimeoutString)
if err != nil {
err := fmt.Errorf(
"Failed to parse duration string http_timeout='%s': %w",
2022-08-16 15:14:20 +02:00
configJSON.HttpTimeoutString,
err,
)
cclog.Error(r.name, err)
return nil, err
}
// Create new http client
customTransport := http.DefaultTransport.(*http.Transport).Clone()
customTransport.TLSClientConfig = &tls.Config{
2022-08-16 15:14:20 +02:00
InsecureSkipVerify: configJSON.HttpInsecure,
}
httpClient := &http.Client{
Timeout: r.config.HttpTimeout,
Transport: customTransport,
}
// Initialize client configurations
r.config.ClientConfigs = make([]RedfishReceiverClientConfig, 0)
2022-08-16 15:14:20 +02:00
// Create client config from JSON config
for i := range configJSON.ClientConfigs {
2022-08-16 15:14:20 +02:00
clientConfigJSON := &configJSON.ClientConfigs[i]
2022-04-19 12:05:03 +02:00
2024-01-22 15:46:18 +01:00
// Redfish endpoint
var endpoint_pattern string
if clientConfigJSON.Endpoint != nil {
endpoint_pattern = *clientConfigJSON.Endpoint
} else if configJSON.Endpoint != nil {
endpoint_pattern = *configJSON.Endpoint
} else {
2022-04-19 12:05:03 +02:00
err := fmt.Errorf("client config number %v requires endpoint", i)
cclog.ComponentError(r.name, err)
return nil, err
}
2024-01-22 15:46:18 +01:00
// Redfish username
var username string
if clientConfigJSON.Username != nil {
username = *clientConfigJSON.Username
} else if configJSON.Username != nil {
username = *configJSON.Username
} else {
2022-04-19 12:05:03 +02:00
err := fmt.Errorf("client config number %v requires username", i)
cclog.ComponentError(r.name, err)
return nil, err
}
2024-01-22 15:46:18 +01:00
// Redfish password
var password string
if clientConfigJSON.Password != nil {
password = *clientConfigJSON.Password
} else if configJSON.Password != nil {
password = *configJSON.Password
} else {
2022-04-19 12:05:03 +02:00
err := fmt.Errorf("client config number %v requires password", i)
cclog.ComponentError(r.name, err)
return nil, err
}
2022-08-16 15:14:20 +02:00
// Which metrics should be collected
doPowerMetric :=
2022-08-16 15:14:20 +02:00
!(configJSON.DisablePowerMetrics ||
clientConfigJSON.DisablePowerMetrics)
doProcessorMetrics :=
2022-08-16 15:14:20 +02:00
!(configJSON.DisableProcessorMetrics ||
clientConfigJSON.DisableProcessorMetrics)
2024-03-06 14:59:47 +01:00
doSensors :=
!(configJSON.DisableSensors ||
clientConfigJSON.DisableSensors)
doThermalMetrics :=
2022-08-16 15:14:20 +02:00
!(configJSON.DisableThermalMetrics ||
clientConfigJSON.DisableThermalMetrics)
2022-08-10 16:24:21 +02:00
// Is metrics excluded globally or per client
isExcluded := make(map[string]bool)
2022-08-16 15:14:20 +02:00
for _, key := range clientConfigJSON.ExcludeMetrics {
isExcluded[key] = true
2022-08-10 16:24:21 +02:00
}
2022-08-16 15:14:20 +02:00
for _, key := range configJSON.ExcludeMetrics {
isExcluded[key] = true
}
hostList, err := hostlist.Expand(clientConfigJSON.HostList)
if err != nil {
err := fmt.Errorf("client config number %d failed to parse host list %s: %v",
i, clientConfigJSON.HostList, err)
cclog.ComponentError(r.name, err)
return nil, err
}
for _, host := range hostList {
// Endpoint of the redfish service
endpoint := strings.Replace(endpoint_pattern, "%h", host, -1)
r.config.ClientConfigs = append(
r.config.ClientConfigs,
RedfishReceiverClientConfig{
Hostname: host,
isExcluded: isExcluded,
doPowerMetric: doPowerMetric,
doProcessorMetrics: doProcessorMetrics,
2024-03-06 14:59:47 +01:00
doSensors: doSensors,
doThermalMetrics: doThermalMetrics,
skipProcessorMetricsURL: make(map[string]bool),
2024-03-06 14:59:47 +01:00
readSensorURLs: map[string][]string{},
gofish: gofish.ClientConfig{
Username: username,
Password: password,
Endpoint: endpoint,
HTTPClient: httpClient,
},
})
}
}
// Compute parallel fanout to use
numClients := len(r.config.ClientConfigs)
r.config.fanout = configJSON.Fanout
if numClients < r.config.fanout {
r.config.fanout = numClients
}
2024-01-22 15:46:18 +01:00
// Check that at least on client config exists
if numClients == 0 {
err := fmt.Errorf("at least one client config is required")
cclog.ComponentError(r.name, err)
return nil, err
}
// Check for duplicate client configurations
isDuplicate := make(map[string]bool)
for i := range r.config.ClientConfigs {
host := r.config.ClientConfigs[i].Hostname
if isDuplicate[host] {
err := fmt.Errorf("Found duplicate client config for host %s", host)
cclog.ComponentError(r.name, err)
return nil, err
2022-08-10 16:24:21 +02:00
}
isDuplicate[host] = true
2022-04-19 12:05:03 +02:00
}
// Give some basic info about redfish receiver status
cclog.ComponentInfo(r.name, "Monitoring", numClients, "clients")
cclog.ComponentInfo(r.name, "Monitoring interval:", r.config.Interval)
cclog.ComponentInfo(r.name, "Monitoring parallel fanout:", r.config.fanout)
2022-04-19 12:05:03 +02:00
return r, nil
}