From 88b3fe1e413570336de9cffc915273b79ecb3579 Mon Sep 17 00:00:00 2001 From: Thomas Roehl Date: Wed, 27 Jul 2022 17:38:51 +0200 Subject: [PATCH 01/31] Add some documentation about building --- docs/building.md | 51 ++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 51 insertions(+) create mode 100644 docs/building.md diff --git a/docs/building.md b/docs/building.md new file mode 100644 index 0000000..2be87b0 --- /dev/null +++ b/docs/building.md @@ -0,0 +1,51 @@ +# Building the cc-metric-collector + +In most cases, a simple `make` in the main folder is enough to get a `cc-metric-collector` binary. It is basically a `go build` but some collectors require additional tasks. There is currently no Golang interface to LIKWID, so it uses `cgo` to create bindings but `cgo` requires the LIKWID header files. Therefore, it checks whether LIKWID is installed and if not it downloads LIKWID and copies the headers. + +## System integration + +The main configuration settings for system integration are pre-defined in `scripts/cc-metric-collector.config`. The file contains the UNIX user and group used for execution, the PID file location and other settings. Adjust it accordingly and copy it to `/etc/default/cc-metric-collector` + +``` +$ install -m 644 scripts/cc-metric-collector.config /etc/default/cc-metric-collector +$ edit /etc/default/cc-metric-collector +``` + +### SysVinit and similar + +If you are using a init system based in `/etc/init.d` daemons, you can use the sample `scripts/cc-metric-collector.init`. It reads the basic configuration from `/etc/default/cc-metric-collector` + +``` +$ install -m 755 scripts/cc-metric-collector.init /etc/init.d/cc-metric-collector +``` + +### Systemd + +If you are using `systemd` as init system, you can use the sample systemd service file `scripts/cc-metric-collector.service`, the configuration file `scripts/cc-metric-collector.config`. + +``` +$ install -m 644 scripts/cc-metric-collector.service /etc/systemd/system/cc-metric-collector.service +$ systemctl enable cc-metric-collector +``` + +## RPM + +In order to get a RPM packages for cc-metric-collector, just use: + +``` +$ make RPM +``` + +It uses the RPM SPEC file `scripts/cc-metric-collector.spec` and requires the RPM tools (`rpm` and `rpmspec`) and `git`. + +## DEB + +In order to get very simple Debian packages for cc-metric-collector, just use: + +``` +$ make DEB +``` + +It uses the DEB control file `scripts/cc-metric-collector.control` and requires `dpkg-deb`, `awk`, `sed` and `git`. It creates only a binary deb package. + +_This option is not well tested and therefore experimental_ \ No newline at end of file From edd33d58106e65768ae591a7eb4a73b5913ab469 Mon Sep 17 00:00:00 2001 From: Thomas Roehl Date: Wed, 27 Jul 2022 17:45:13 +0200 Subject: [PATCH 02/31] Add docs to README --- README.md | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 530989e..e1f9316 100644 --- a/README.md +++ b/README.md @@ -1,6 +1,6 @@ # cc-metric-collector -A node agent for measuring, processing and forwarding node level metrics. It is part of the ClusterCockpit ecosystem. +A node agent for measuring, processing and forwarding node level metrics. It is part of the [ClusterCockpit ecosystem](./docs/introduction.md). 
The metric collector sends (and receives) metrics in the [InfluxDB line protocol](https://docs.influxdata.com/influxdb/cloud/reference/syntax/line-protocol/) as it provides flexibility while maintaining a separation between tags (like index columns in relational databases) and fields (like data columns).
@@ -26,7 +26,7 @@ There is a main configuration file with basic settings that point to the other c } ```
-The `interval` defines how often the metrics should be read and sent to the sink. The `duration` tells collectors how long one measurement has to take. This is important for some collectors, like the `likwid` collector.
+The `interval` defines how often the metrics should be read and sent to the sink. The `duration` tells collectors how long one measurement has to take. This is important for some collectors, like the `likwid` collector. For more information, see [here](./docs/configuration.md).
See the component READMEs for their configuration:
@@ -44,6 +44,8 @@ $ go get (requires at least golang 1.16) $ make ```
+For more information, see [here](./docs/building.md).
+
# Running
```
From 251ae8e8793b1d86dece1fc72c55224e48b0dae4 Mon Sep 17 00:00:00 2001 From: Thomas Roehl Date: Wed, 27 Jul 2022 17:46:27 +0200 Subject: [PATCH 03/31] Update link to cc-specifications repo with line protocol --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index e1f9316..65bde1d 100644 --- a/README.md +++ b/README.md @@ -11,7 +11,7 @@ The receiver runs as a go routine side-by-side with the timer loop and asynchron # Configuration Configuration is implemented using a single json document that is distributed over network and may be persisted as file.
-Supported metrics are documented [here](https://github.com/ClusterCockpit/cc-specifications/blob/master/metrics/lineprotocol_alternative.md).
+Supported metrics are documented [here](https://github.com/ClusterCockpit/cc-specifications/blob/master/interfaces/lineprotocol/README.md).
There is a main configuration file with basic settings that point to the other configuration files for the different components.
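To illustrate the separation between tags and fields mentioned in the README text above, a single metric in the InfluxDB line protocol looks roughly like the following sketch (metric name, tag values, field value and timestamp are made-up examples, not taken from a real deployment):

```
cpu_load,hostname=node001,type=node value=1.23 1660000000000000000
```

The comma-separated `key=value` pairs after the metric name (`hostname`, `type`) are tags, `value=1.23` is a field, and the trailing number is the timestamp in nanoseconds.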
From ea33d45d8e5571908e89b065c49ac165e7a8598c Mon Sep 17 00:00:00 2001 From: Thomas Roehl Date: Wed, 27 Jul 2022 17:50:15 +0200 Subject: [PATCH 04/31] Fix link to docs of NumastatsCollector --- collectors/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/collectors/README.md b/collectors/README.md index 10e5105..8002ed2 100644 --- a/collectors/README.md +++ b/collectors/README.md @@ -35,7 +35,7 @@ In contrast to the configuration files for sinks and receivers, the collectors c * [`nfs4stat`](./nfs4Metric.md) * [`cpufreq`](./cpufreqMetric.md) * [`cpufreq_cpuinfo`](./cpufreqCpuinfoMetric.md) -* [`numastat`](./numastatMetric.md) +* [`numastats`](./numastatsMetric.md) * [`gpfs`](./gpfsMetric.md) * [`beegfs_meta`](./beegfsmetaMetric.md) * [`beegfs_storage`](./beegfsstorageMetric.md) From f5ad45e49fbeb202ccc14540c9b61878b3203df4 Mon Sep 17 00:00:00 2001 From: Thomas Roehl Date: Wed, 27 Jul 2022 17:52:36 +0200 Subject: [PATCH 05/31] Fix old entries in sample scripts --- scripts/cc-metric-collector.config | 2 +- scripts/cc-metric-collector.init | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/scripts/cc-metric-collector.config b/scripts/cc-metric-collector.config index 988b0ff..0f5e1ba 100644 --- a/scripts/cc-metric-collector.config +++ b/scripts/cc-metric-collector.config @@ -6,7 +6,7 @@ CC_HOME=/tmp LOG_DIR=/var/log -DATA_DIR=/var/lib/grafana +DATA_DIR=/var/lib/cc-metric-collector MAX_OPEN_FILES=10000 diff --git a/scripts/cc-metric-collector.init b/scripts/cc-metric-collector.init index acb82eb..1a7993b 100755 --- a/scripts/cc-metric-collector.init +++ b/scripts/cc-metric-collector.init @@ -19,7 +19,7 @@ PATH=/bin:/usr/bin:/sbin:/usr/sbin NAME=cc-metric-collector DESC="ClusterCockpit metric collector" -DEFAULT=/etc/default/${NAME}.json +DEFAULT=/etc/default/${NAME} CC_USER=clustercockpit CC_GROUP=clustercockpit From 32bb9c5fc0a526e821a986c466b58a3139ec110f Mon Sep 17 00:00:00 2001 From: Thomas Roehl Date: Wed, 27 Jul 2022 18:06:41 +0200 Subject: [PATCH 06/31] Update ccMetric README and FromMetric copy --- internal/ccMetric/README.md | 53 ++++++++++++++++++++++++++--------- internal/ccMetric/ccMetric.go | 18 +++++++----- 2 files changed, 50 insertions(+), 21 deletions(-) diff --git a/internal/ccMetric/README.md b/internal/ccMetric/README.md index 1787ff0..71a3a6e 100644 --- a/internal/ccMetric/README.md +++ b/internal/ccMetric/README.md @@ -6,27 +6,52 @@ It is basically a copy of the [InfluxDB line protocol](https://github.com/influx ```golang type ccMetric struct { - name string // same as - tags []*influx.Tag // original - fields []*influx.Field // Influx - tm time.Time // line-protocol - meta []*influx.Tag + name string // Measurement name + meta map[string]string // map of meta data tags + tags map[string]string // map of of tags + fields map[string]interface{} // map of of fields + tm time.Time // timestamp } type CCMetric interface { - influx.MutableMetric // the same functions as defined by influx.MutableMetric - RemoveTag(key string) // this is not published by the original influx.MutableMetric - Meta() map[string]string - MetaList() []*inlux.Tag - AddMeta(key, value string) - HasMeta(key string) bool - GetMeta(key string) (string, bool) - RemoveMeta(key string) + ToPoint(metaAsTags map[string]bool) *write.Point // Generate influxDB point for data type ccMetric + ToLineProtocol(metaAsTags map[string]bool) string // Generate influxDB line protocol for data type ccMetric + String() string // Return line-protocol like string + + Name() string // 
Get metric name + SetName(name string) // Set metric name + + Time() time.Time // Get timestamp + SetTime(t time.Time) // Set timestamp + + Tags() map[string]string // Map of tags + AddTag(key, value string) // Add a tag + GetTag(key string) (value string, ok bool) // Get a tag by its key + HasTag(key string) (ok bool) // Check if a tag key is present + RemoveTag(key string) // Remove a tag by its key + + Meta() map[string]string // Map of meta data tags + AddMeta(key, value string) // Add a meta data tag + GetMeta(key string) (value string, ok bool) // Get a meta data tab addressed by its key + HasMeta(key string) (ok bool) // Check if a meta data key is present + RemoveMeta(key string) // Remove a meta data tag by its key + + Fields() map[string]interface{} // Map of fields + AddField(key string, value interface{}) // Add a field + GetField(key string) (value interface{}, ok bool) // Get a field addressed by its key + HasField(key string) (ok bool) // Check if a field key is present + RemoveField(key string) // Remove a field addressed by its key } + +func New(name string, tags map[string]string, meta map[string]string, fields map[string]interface{}, tm time.Time) (CCMetric, error) +func FromMetric(other CCMetric) CCMetric +func FromInfluxMetric(other lp.Metric) CCMetric ``` -The `CCMetric` interface provides the same functions as the `MutableMetric` like `{Add, Remove, Has}{Tag, Field}` and additionally provides `{Add, Remove, Has}Meta`. +The `CCMetric` interface provides the same functions as the `MutableMetric` like `{Add, Get, Remove, Has}{Tag, Field}` and additionally provides `{Add, Get, Remove, Has}Meta`. The InfluxDB protocol creates a new metric with `influx.New(name, tags, fields, time)` while CCMetric uses `ccMetric.New(name, tags, meta, fields, time)` where `tags` and `meta` are both of type `map[string]string`. You can copy a CCMetric with `FromMetric(other CCMetric) CCMetric`. If you get an `influx.Metric` from a function, like the line protocol parser, you can use `FromInfluxMetric(other influx.Metric) CCMetric` to get a CCMetric out of it (see `NatsReceiver` for an example). + +Although the [cc-specifications](https://github.com/ClusterCockpit/cc-specifications/blob/master/interfaces/lineprotocol/README.md) defines that there is only a `value` field for the metric value, the CCMetric still can have multiple values similar to the InfluxDB line protocol. 
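For orientation, below is a minimal usage sketch of the interface listed above. It is illustrative only: the import path is assumed from the repository layout (the package lives under `internal/`, so it is only importable from within this module), and all tag, meta-data and field values are invented for the example.

```golang
package main

import (
	"fmt"
	"time"

	// Assumed import path; the package is internal to this repository.
	lp "github.com/ClusterCockpit/cc-metric-collector/internal/ccMetric"
)

func main() {
	// Create a metric with tags, meta data tags and a single "value" field
	m, err := lp.New(
		"temperature",
		map[string]string{"hostname": "node001", "type": "node"},
		map[string]string{"source": "example", "unit": "degC"},
		map[string]interface{}{"value": 42.0},
		time.Now(),
	)
	if err != nil {
		fmt.Println(err)
		return
	}

	// Tags and meta data tags can be inspected and modified after creation
	m.AddTag("cluster", "testcluster")
	if unit, ok := m.GetMeta("unit"); ok {
		fmt.Println("unit:", unit)
	}

	// Fields hold the measured values
	if v, ok := m.GetField("value"); ok {
		fmt.Println("value:", v)
	}

	// FromMetric performs a deep copy of tags, meta data tags and fields
	c := lp.FromMetric(m)
	fmt.Println(c.String())
}
```

The sketch only uses functions from the interface listed above (`AddTag`, `GetMeta`, `GetField`, `String`) plus the constructors `New` and `FromMetric`.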
diff --git a/internal/ccMetric/ccMetric.go b/internal/ccMetric/ccMetric.go index 661b9a4..8ad18cc 100644 --- a/internal/ccMetric/ccMetric.go +++ b/internal/ccMetric/ccMetric.go @@ -50,6 +50,7 @@ type CCMetric interface { GetField(key string) (value interface{}, ok bool) // Get a field addressed by its key HasField(key string) (ok bool) // Check if a field key is present RemoveField(key string) // Remove a field addressed by its key + String() string // Return line-protocol like string } // String implements the stringer interface for data type ccMetric @@ -217,23 +218,26 @@ func New( } // FromMetric copies the metric -func FromMetric(other ccMetric) CCMetric { +func FromMetric(other CCMetric) CCMetric { + otags := other.Tags() + ometa := other.Meta() + ofields := other.Fields() m := &ccMetric{ name: other.Name(), - tags: make(map[string]string, len(other.tags)), - meta: make(map[string]string, len(other.meta)), - fields: make(map[string]interface{}, len(other.fields)), + tags: make(map[string]string, len(otags)), + meta: make(map[string]string, len(ometa)), + fields: make(map[string]interface{}, len(ofields)), tm: other.Time(), } // deep copy tags, meta data tags and fields - for key, value := range other.tags { + for key, value := range otags { m.tags[key] = value } - for key, value := range other.meta { + for key, value := range ometa { m.meta[key] = value } - for key, value := range other.fields { + for key, value := range ofields { m.fields[key] = value } return m From 7438b9d245d8be5f444d03273b47757f0fafdb4a Mon Sep 17 00:00:00 2001 From: Thomas Roehl Date: Wed, 27 Jul 2022 18:08:15 +0200 Subject: [PATCH 07/31] Add rules files for DEB package --- scripts/cc-metric-collector.deb.rules | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) create mode 100644 scripts/cc-metric-collector.deb.rules diff --git a/scripts/cc-metric-collector.deb.rules b/scripts/cc-metric-collector.deb.rules new file mode 100644 index 0000000..18e2ea8 --- /dev/null +++ b/scripts/cc-metric-collector.deb.rules @@ -0,0 +1,16 @@ + +#!/usr/bin/make -f +# You must remove unused comment lines for the released package. +#export DH_VERBOSE = 1 +#export DEB_BUILD_MAINT_OPTIONS = hardening=+all +#export DEB_CFLAGS_MAINT_APPEND = -Wall -pedantic +#export DEB_LDFLAGS_MAINT_APPEND = -Wl,--as-needed + +%: + dh $@ + +override_dh_auto_build: + make + +override_dh_auto_install: + make PREFIX=/usr install \ No newline at end of file From c312093d2b7791ead429de6c641f6aefd4188e6e Mon Sep 17 00:00:00 2001 From: Thomas Roehl Date: Thu, 28 Jul 2022 16:22:39 +0200 Subject: [PATCH 08/31] Add --owner and --group to install lines --- docs/building.md | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/docs/building.md b/docs/building.md index 2be87b0..1e89964 100644 --- a/docs/building.md +++ b/docs/building.md @@ -7,7 +7,7 @@ In most cases, a simple `make` in the main folder is enough to get a `cc-metric- The main configuration settings for system integration are pre-defined in `scripts/cc-metric-collector.config`. The file contains the UNIX user and group used for execution, the PID file location and other settings. 
Adjust it accordingly and copy it to `/etc/default/cc-metric-collector` ``` -$ install -m 644 scripts/cc-metric-collector.config /etc/default/cc-metric-collector +$ install --mode 644 --owner $CC_USER --group $CC_GROUP scripts/cc-metric-collector.config /etc/default/cc-metric-collector $ edit /etc/default/cc-metric-collector ``` @@ -16,7 +16,7 @@ $ edit /etc/default/cc-metric-collector If you are using a init system based in `/etc/init.d` daemons, you can use the sample `scripts/cc-metric-collector.init`. It reads the basic configuration from `/etc/default/cc-metric-collector` ``` -$ install -m 755 scripts/cc-metric-collector.init /etc/init.d/cc-metric-collector +$ install --mode 755 --owner $CC_USER --group $CC_GROUP scripts/cc-metric-collector.init /etc/init.d/cc-metric-collector ``` ### Systemd @@ -24,7 +24,7 @@ $ install -m 755 scripts/cc-metric-collector.init /etc/init.d/cc-metric-collecto If you are using `systemd` as init system, you can use the sample systemd service file `scripts/cc-metric-collector.service`, the configuration file `scripts/cc-metric-collector.config`. ``` -$ install -m 644 scripts/cc-metric-collector.service /etc/systemd/system/cc-metric-collector.service +$ install --mode 644 --owner $CC_USER --group $CC_GROUP scripts/cc-metric-collector.service /etc/systemd/system/cc-metric-collector.service $ systemctl enable cc-metric-collector ``` From c7d692e27fab03a7d1151fa31cb8d187171ec0a6 Mon Sep 17 00:00:00 2001 From: Thomas Roehl Date: Thu, 28 Jul 2022 16:24:21 +0200 Subject: [PATCH 09/31] Use newlines in install lines for readability --- docs/building.md | 15 ++++++++++++--- 1 file changed, 12 insertions(+), 3 deletions(-) diff --git a/docs/building.md b/docs/building.md index 1e89964..9d9c713 100644 --- a/docs/building.md +++ b/docs/building.md @@ -7,7 +7,10 @@ In most cases, a simple `make` in the main folder is enough to get a `cc-metric- The main configuration settings for system integration are pre-defined in `scripts/cc-metric-collector.config`. The file contains the UNIX user and group used for execution, the PID file location and other settings. Adjust it accordingly and copy it to `/etc/default/cc-metric-collector` ``` -$ install --mode 644 --owner $CC_USER --group $CC_GROUP scripts/cc-metric-collector.config /etc/default/cc-metric-collector +$ install --mode 644 \ + --owner $CC_USER \ + --group $CC_GROUP \ + scripts/cc-metric-collector.config /etc/default/cc-metric-collector $ edit /etc/default/cc-metric-collector ``` @@ -16,7 +19,10 @@ $ edit /etc/default/cc-metric-collector If you are using a init system based in `/etc/init.d` daemons, you can use the sample `scripts/cc-metric-collector.init`. It reads the basic configuration from `/etc/default/cc-metric-collector` ``` -$ install --mode 755 --owner $CC_USER --group $CC_GROUP scripts/cc-metric-collector.init /etc/init.d/cc-metric-collector +$ install --mode 755 \ + --owner $CC_USER \ + --group $CC_GROUP \ + scripts/cc-metric-collector.init /etc/init.d/cc-metric-collector ``` ### Systemd @@ -24,7 +30,10 @@ $ install --mode 755 --owner $CC_USER --group $CC_GROUP scripts/cc-metric-collec If you are using `systemd` as init system, you can use the sample systemd service file `scripts/cc-metric-collector.service`, the configuration file `scripts/cc-metric-collector.config`. 
``` -$ install --mode 644 --owner $CC_USER --group $CC_GROUP scripts/cc-metric-collector.service /etc/systemd/system/cc-metric-collector.service +$ install --mode 644 \ + --owner $CC_USER \ + --group $CC_GROUP \ + scripts/cc-metric-collector.service /etc/systemd/system/cc-metric-collector.service $ systemctl enable cc-metric-collector ``` From cfcde9b23b68535c89d59dd7f4ca90e8e256d22a Mon Sep 17 00:00:00 2001 From: Thomas Roehl Date: Thu, 28 Jul 2022 16:25:32 +0200 Subject: [PATCH 10/31] Mark code parts as bash --- docs/building.md | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/docs/building.md b/docs/building.md index 9d9c713..968c454 100644 --- a/docs/building.md +++ b/docs/building.md @@ -6,7 +6,7 @@ In most cases, a simple `make` in the main folder is enough to get a `cc-metric- The main configuration settings for system integration are pre-defined in `scripts/cc-metric-collector.config`. The file contains the UNIX user and group used for execution, the PID file location and other settings. Adjust it accordingly and copy it to `/etc/default/cc-metric-collector` -``` +```bash $ install --mode 644 \ --owner $CC_USER \ --group $CC_GROUP \ @@ -18,7 +18,7 @@ $ edit /etc/default/cc-metric-collector If you are using a init system based in `/etc/init.d` daemons, you can use the sample `scripts/cc-metric-collector.init`. It reads the basic configuration from `/etc/default/cc-metric-collector` -``` +```bash $ install --mode 755 \ --owner $CC_USER \ --group $CC_GROUP \ @@ -29,7 +29,7 @@ $ install --mode 755 \ If you are using `systemd` as init system, you can use the sample systemd service file `scripts/cc-metric-collector.service`, the configuration file `scripts/cc-metric-collector.config`. -``` +```bash $ install --mode 644 \ --owner $CC_USER \ --group $CC_GROUP \ @@ -41,7 +41,7 @@ $ systemctl enable cc-metric-collector In order to get a RPM packages for cc-metric-collector, just use: -``` +```bash $ make RPM ``` @@ -51,7 +51,7 @@ It uses the RPM SPEC file `scripts/cc-metric-collector.spec` and requires the RP In order to get very simple Debian packages for cc-metric-collector, just use: -``` +```bash $ make DEB ``` From a2f0bc37d4174b73b5f66396d9b14097baa2d88f Mon Sep 17 00:00:00 2001 From: Thomas Roehl Date: Wed, 3 Aug 2022 17:06:28 +0200 Subject: [PATCH 11/31] Add runonce job for Golang 1.19 --- .github/workflows/runonce.yml | 28 +++++++++++++++++++++++++++- 1 file changed, 27 insertions(+), 1 deletion(-) diff --git a/.github/workflows/runonce.yml b/.github/workflows/runonce.yml index 2036179..5218145 100644 --- a/.github/workflows/runonce.yml +++ b/.github/workflows/runonce.yml @@ -31,4 +31,30 @@ jobs: run: make - name: Run MetricCollector once - run: ./cc-metric-collector --once --config .github/ci-config.json \ No newline at end of file + run: ./cc-metric-collector --once --config .github/ci-config.json + + # + # Job build-1-19 + # Build on latest Ubuntu using golang version 1.19 + # + build-1-19: + runs-on: ubuntu-latest + steps: + # See: https://github.com/marketplace/actions/checkout + # Checkout git repository and submodules + - name: Checkout + uses: actions/checkout@v2 + with: + submodules: recursive + + # See: https://github.com/marketplace/actions/setup-go-environment + - name: Setup Golang + uses: actions/setup-go@v3 + with: + go-version: '1.19' + + - name: Build MetricCollector + run: make + + - name: Run MetricCollector once + run: ./cc-metric-collector --once --config .github/ci-config.json From 2ca03597442445688a12539c7f4cf12cacebf392 Mon Sep 17 
00:00:00 2001 From: Holger Obermaier <40787752+ho-ob@users.noreply.github.com> Date: Wed, 10 Aug 2022 10:30:59 +0200 Subject: [PATCH 12/31] Add support to read thermal metrics --- receivers/redfishReceiver.go | 320 ++++++++++++++++++++++++----------- 1 file changed, 217 insertions(+), 103 deletions(-) diff --git a/receivers/redfishReceiver.go b/receivers/redfishReceiver.go index d048fa8..23d89f8 100644 --- a/receivers/redfishReceiver.go +++ b/receivers/redfishReceiver.go @@ -14,6 +14,8 @@ import ( // See: https://pkg.go.dev/github.com/stmcginnis/gofish "github.com/stmcginnis/gofish" + "github.com/stmcginnis/gofish/common" + "github.com/stmcginnis/gofish/redfish" ) // RedfishReceiver configuration: @@ -34,14 +36,21 @@ type RedfishReceiver struct { HttpTimeoutString string `json:"http_timeout,omitempty"` HttpTimeout time.Duration + // Globally disable collection of power or thermal metrics + DisablePowerMetrics bool `json:"disable_power_metrics"` + DisableThermalMetrics bool `json:"disable_thermal_metrics"` + // Client config for each redfish service ClientConfigs []struct { - Hostname *string `json:"hostname"` - Username *string `json:"username"` - Password *string `json:"password"` - Endpoint *string `json:"endpoint"` - ExcludeMetrics []string `json:"exclude_metrics,omitempty"` - gofish gofish.ClientConfig + Hostname *string `json:"hostname"` + Username *string `json:"username"` + Password *string `json:"password"` + Endpoint *string `json:"endpoint"` + // Per client disable collection of power or thermal metrics + DisablePowerMetrics bool `json:"disable_power_metrics"` + DisableThermalMetrics bool `json:"disable_thermal_metrics"` + ExcludeMetrics []string `json:"exclude_metrics,omitempty"` + gofish gofish.ClientConfig } `json:"client_config"` } @@ -53,16 +62,205 @@ type RedfishReceiver struct { func (r *RedfishReceiver) Start() { cclog.ComponentDebug(r.name, "START") - // readPowerMetric reads redfish power metric from the endpoint configured in conf - readPowerMetric := func(clientConfigIndex int) error { + // Read redfish thermal metrics + readThermalMetrics := func(clientConfigIndex int, chassis *redfish.Chassis) error { + clientConfig := &r.config.ClientConfigs[clientConfigIndex] + // Skip collection off thermal metrics when disabled by config + if r.config.DisableThermalMetrics || clientConfig.DisableThermalMetrics { + return nil + } + + // Get thermal information for each chassis + thermal, err := chassis.Thermal() + if err != nil { + return fmt.Errorf("readMetrics: chassis.Thermal() failed: %v", err) + } + + // Skip empty thermal information + if thermal == nil { + return nil + } + + timestamp := time.Now() + + for _, temperature := range thermal.Temperatures { + + // Skip all temperatures which ar not in enabled state + if temperature.Status.State != common.EnabledState { + continue + } + + tags := map[string]string{ + "hostname": *clientConfig.Hostname, + "type": "node", + // ChassisType shall indicate the physical form factor for the type of chassis + "chassis_typ": string(chassis.ChassisType), + // Chassis name + "chassis_name": chassis.Name, + // ID uniquely identifies the resource + "temperature_id": temperature.ID, + // MemberID shall uniquely identify the member within the collection. For + // services supporting Redfish v1.6 or higher, this value shall be the + // zero-based array index. 
+ "temperature_member_id": temperature.MemberID, + // PhysicalContext shall be a description of the affected device(s) or region + // within the chassis to which this power control applies. + "temperature_physical_context": string(temperature.PhysicalContext), + // Name + "temperature_name": temperature.Name, + } + + // Delete empty tags + for key, value := range tags { + if value == "" { + delete(tags, key) + } + } + + // Set meta data tags + meta := map[string]string{ + "source": r.name, + "group": "Temperature", + "unit": "degC", + } + + // ReadingCelsius shall be the current value of the temperature sensor's reading. + value := temperature.ReadingCelsius + + y, err := lp.New("temperature", tags, meta, + map[string]interface{}{ + "value": value, + }, + timestamp) + if err == nil { + r.sink <- y + } + } + + return nil + } + + // Read redfish power metrics + readPowerMetrics := func(clientConfigIndex int, chassis *redfish.Chassis) error { + clientConfig := &r.config.ClientConfigs[clientConfigIndex] + + // Skip collection off thermal metrics when disabled by config + if r.config.DisablePowerMetrics || clientConfig.DisablePowerMetrics { + return nil + } + + // Get power information for each chassis + power, err := chassis.Power() + if err != nil { + return fmt.Errorf("readMetrics: chassis.Power() failed: %v", err) + } + + // Skip empty power information + if power == nil { + return nil + } + + timestamp := time.Now() + + // Read min, max and average consumed watts for each power control + for _, pc := range power.PowerControl { + + // Map of collected metrics + metrics := map[string]float32{ + // PowerConsumedWatts shall represent the actual power being consumed (in + // Watts) by the chassis + "consumed_watts": pc.PowerConsumedWatts, + // AverageConsumedWatts shall represent the + // average power level that occurred averaged over the last IntervalInMin + // minutes. + "average_consumed_watts": pc.PowerMetrics.AverageConsumedWatts, + // MinConsumedWatts shall represent the + // minimum power level in watts that occurred within the last + // IntervalInMin minutes. + "min_consumed_watts": pc.PowerMetrics.MinConsumedWatts, + // MaxConsumedWatts shall represent the + // maximum power level in watts that occurred within the last + // IntervalInMin minutes + "max_consumed_watts": pc.PowerMetrics.MaxConsumedWatts, + } + intervalInMin := strconv.FormatFloat(float64(pc.PowerMetrics.IntervalInMin), 'f', -1, 32) + + // Metrics to exclude + for _, key := range clientConfig.ExcludeMetrics { + delete(metrics, key) + } + + // Set tags + tags := map[string]string{ + "hostname": *clientConfig.Hostname, + "type": "node", + // ChassisType shall indicate the physical form factor for the type of chassis + "chassis_typ": string(chassis.ChassisType), + // Chassis name + "chassis_name": chassis.Name, + // ID uniquely identifies the resource + "power_control_id": pc.ID, + // MemberID shall uniquely identify the member within the collection. For + // services supporting Redfish v1.6 or higher, this value shall be the + // zero-based array index. + "power_control_member_id": pc.MemberID, + // PhysicalContext shall be a description of the affected device(s) or region + // within the chassis to which this power control applies. 
+ "power_control_physical_context": string(pc.PhysicalContext), + // Name + "power_control_name": pc.Name, + } + + // Delete empty tags + for key, value := range tags { + if value == "" { + delete(tags, key) + } + } + + // Set meta data tags + meta := map[string]string{ + "source": r.name, + "group": "Energy", + "interval_in_minutes": intervalInMin, + "unit": "watts", + } + + // Delete empty meta data tags + for key, value := range meta { + if value == "" { + delete(meta, key) + } + } + + for name, value := range metrics { + + y, err := lp.New(name, tags, meta, + map[string]interface{}{ + "value": value, + }, + timestamp) + if err == nil { + r.sink <- y + } + } + } + + return nil + } + + // readMetrics reads redfish temperature and power metrics from the endpoint configured in conf + readMetrics := func(clientConfigIndex int) error { + + // access client config clientConfig := &r.config.ClientConfigs[clientConfigIndex] // Connect to redfish service c, err := gofish.Connect(clientConfig.gofish) if err != nil { return fmt.Errorf( - "readPowerMetric: gofish.Connect({Username: %v, Endpoint: %v, BasicAuth: %v, HttpTimeout: %v, HttpInsecure: %v}) failed: %v", + "readMetrics: gofish.Connect({Username: %v, Endpoint: %v, BasicAuth: %v, HttpTimeout: %v, HttpInsecure: %v}) failed: %v", clientConfig.gofish.Username, clientConfig.gofish.Endpoint, clientConfig.gofish.BasicAuth, @@ -75,112 +273,28 @@ func (r *RedfishReceiver) Start() { // Get all chassis managed by this service chassis_list, err := c.Service.Chassis() if err != nil { - return fmt.Errorf("readPowerMetric: c.Service.Chassis() failed: %v", err) + return fmt.Errorf("readMetrics: c.Service.Chassis() failed: %v", err) } for _, chassis := range chassis_list { - timestamp := time.Now() - // Get power information for each chassis - power, err := chassis.Power() + err := readThermalMetrics(clientConfigIndex, chassis) if err != nil { - return fmt.Errorf("readPowerMetric: chassis.Power() failed: %v", err) - } - if power == nil { - continue + return err } - // Read min, max and average consumed watts for each power control - for _, pc := range power.PowerControl { - - // Map of collected metrics - metrics := map[string]float32{ - // PowerConsumedWatts shall represent the actual power being consumed (in - // Watts) by the chassis - "consumed_watts": pc.PowerConsumedWatts, - // AverageConsumedWatts shall represent the - // average power level that occurred averaged over the last IntervalInMin - // minutes. - "average_consumed_watts": pc.PowerMetrics.AverageConsumedWatts, - // MinConsumedWatts shall represent the - // minimum power level in watts that occurred within the last - // IntervalInMin minutes. 
- "min_consumed_watts": pc.PowerMetrics.MinConsumedWatts, - // MaxConsumedWatts shall represent the - // maximum power level in watts that occurred within the last - // IntervalInMin minutes - "max_consumed_watts": pc.PowerMetrics.MaxConsumedWatts, - } - intervalInMin := strconv.FormatFloat(float64(pc.PowerMetrics.IntervalInMin), 'f', -1, 32) - - // Metrics to exclude - for _, key := range clientConfig.ExcludeMetrics { - delete(metrics, key) - } - - // Set tags - tags := map[string]string{ - "hostname": *clientConfig.Hostname, - "type": "node", - // ChassisType shall indicate the physical form factor for the type of chassis - "chassis_typ": string(chassis.ChassisType), - // Chassis name - "chassis_name": chassis.Name, - // ID uniquely identifies the resource - "power_control_id": pc.ID, - // MemberID shall uniquely identify the member within the collection. For - // services supporting Redfish v1.6 or higher, this value shall be the - // zero-based array index. - "power_control_member_id": pc.MemberID, - // PhysicalContext shall be a description of the affected device(s) or region - // within the chassis to which this power control applies. - "power_control_physical_context": string(pc.PhysicalContext), - // Name - "power_control_name": pc.Name, - } - - // Delete empty tags - for key, value := range tags { - if value == "" { - delete(tags, key) - } - } - - // Set meta data tags - meta := map[string]string{ - "source": r.name, - "group": "Energy", - "interval_in_minutes": intervalInMin, - "unit": "watts", - } - - // Delete empty meta data tags - for key, value := range meta { - if value == "" { - delete(meta, key) - } - } - - for name, value := range metrics { - - y, err := lp.New(name, tags, meta, - map[string]interface{}{ - "value": value, - }, - timestamp) - if err == nil { - r.sink <- y - } - } + err = readPowerMetrics(clientConfigIndex, chassis) + if err != nil { + return err } } return nil } - // doReadPowerMetric read power metrics for all configure redfish services. + // doReadMetrics read power and temperature metrics for all configure redfish services. // To compensate latencies of the Redfish services a fanout is used. 
- doReadPowerMetric := func() { + doReadMetric := func() { // Compute fanout to use realFanout := r.config.Fanout @@ -202,7 +316,7 @@ func (r *RedfishReceiver) Start() { // Read power metrics for each client config for clientConfigIndex := range workerInput { - err := readPowerMetric(clientConfigIndex) + err := readMetrics(clientConfigIndex) if err != nil { cclog.ComponentError(r.name, err) } @@ -241,7 +355,7 @@ func (r *RedfishReceiver) Start() { defer ticker.Stop() for { - doReadPowerMetric() + doReadMetric() select { case <-ticker.C: From 8ba33568a6451b5ab4d3df5dbc479c29f939b31c Mon Sep 17 00:00:00 2001 From: Holger Obermaier <40787752+ho-ob@users.noreply.github.com> Date: Wed, 10 Aug 2022 16:24:21 +0200 Subject: [PATCH 13/31] Add reading of fan speeds --- receivers/redfishReceiver.go | 149 +++++++++++++++++++++++++++++------ 1 file changed, 124 insertions(+), 25 deletions(-) diff --git a/receivers/redfishReceiver.go b/receivers/redfishReceiver.go index 23d89f8..cf64744 100644 --- a/receivers/redfishReceiver.go +++ b/receivers/redfishReceiver.go @@ -40,17 +40,27 @@ type RedfishReceiver struct { DisablePowerMetrics bool `json:"disable_power_metrics"` DisableThermalMetrics bool `json:"disable_thermal_metrics"` + // Globally excluded metrics + ExcludeMetrics []string `json:"exclude_metrics,omitempty"` + // Client config for each redfish service ClientConfigs []struct { Hostname *string `json:"hostname"` Username *string `json:"username"` Password *string `json:"password"` Endpoint *string `json:"endpoint"` + // Per client disable collection of power or thermal metrics - DisablePowerMetrics bool `json:"disable_power_metrics"` - DisableThermalMetrics bool `json:"disable_thermal_metrics"` - ExcludeMetrics []string `json:"exclude_metrics,omitempty"` - gofish gofish.ClientConfig + DisablePowerMetrics bool `json:"disable_power_metrics"` + DisableThermalMetrics bool `json:"disable_thermal_metrics"` + + // Per client excluded metrics + ExcludeMetrics []string `json:"exclude_metrics,omitempty"` + + // is metric excluded globally or per client + isExcluded map[string](bool) + + gofish gofish.ClientConfig } `json:"client_config"` } @@ -86,6 +96,11 @@ func (r *RedfishReceiver) Start() { for _, temperature := range thermal.Temperatures { + // Skip, when temperature metric is excluded + if clientConfig.isExcluded["temperature"] { + break + } + // Skip all temperatures which ar not in enabled state if temperature.Status.State != common.EnabledState { continue @@ -138,6 +153,64 @@ func (r *RedfishReceiver) Start() { } } + for _, fan := range thermal.Fans { + // Skip, when fan_speed metric is excluded + if clientConfig.isExcluded["fan_speed"] { + break + } + + // Skip all fans which ar not in enabled state + if fan.Status.State != common.EnabledState { + continue + } + + tags := map[string]string{ + "hostname": *clientConfig.Hostname, + "type": "node", + // ChassisType shall indicate the physical form factor for the type of chassis + "chassis_typ": string(chassis.ChassisType), + // Chassis name + "chassis_name": chassis.Name, + // ID uniquely identifies the resource + "fan_id": fan.ID, + // MemberID shall uniquely identify the member within the collection. For + // services supporting Redfish v1.6 or higher, this value shall be the + // zero-based array index. + "fan_member_id": fan.MemberID, + // PhysicalContext shall be a description of the affected device(s) or region + // within the chassis to which this power control applies. 
+ "fan_physical_context": string(fan.PhysicalContext), + // Name + "fan_name": fan.Name, + } + + // Delete empty tags + for key, value := range tags { + if value == "" { + delete(tags, key) + } + } + + // Set meta data tags + meta := map[string]string{ + "source": r.name, + "group": "FanSpeed", + "unit": string(fan.ReadingUnits), + } + + // ReadingCelsius shall be the current value of the temperature sensor's reading. + value := fan.Reading + + y, err := lp.New("fan_speed", tags, meta, + map[string]interface{}{ + "value": value, + }, + timestamp) + if err == nil { + r.sink <- y + } + } + return nil } @@ -167,29 +240,37 @@ func (r *RedfishReceiver) Start() { for _, pc := range power.PowerControl { // Map of collected metrics - metrics := map[string]float32{ - // PowerConsumedWatts shall represent the actual power being consumed (in - // Watts) by the chassis - "consumed_watts": pc.PowerConsumedWatts, - // AverageConsumedWatts shall represent the - // average power level that occurred averaged over the last IntervalInMin - // minutes. - "average_consumed_watts": pc.PowerMetrics.AverageConsumedWatts, - // MinConsumedWatts shall represent the - // minimum power level in watts that occurred within the last - // IntervalInMin minutes. - "min_consumed_watts": pc.PowerMetrics.MinConsumedWatts, - // MaxConsumedWatts shall represent the - // maximum power level in watts that occurred within the last - // IntervalInMin minutes - "max_consumed_watts": pc.PowerMetrics.MaxConsumedWatts, - } - intervalInMin := strconv.FormatFloat(float64(pc.PowerMetrics.IntervalInMin), 'f', -1, 32) + metrics := make(map[string]float32) - // Metrics to exclude - for _, key := range clientConfig.ExcludeMetrics { - delete(metrics, key) + // PowerConsumedWatts shall represent the actual power being consumed (in + // Watts) by the chassis + if !clientConfig.isExcluded["consumed_watts"] { + metrics["consumed_watts"] = pc.PowerConsumedWatts } + // AverageConsumedWatts shall represent the + // average power level that occurred averaged over the last IntervalInMin + // minutes. + if !clientConfig.isExcluded["average_consumed_watts"] { + metrics["average_consumed_watts"] = pc.PowerMetrics.AverageConsumedWatts + } + // MinConsumedWatts shall represent the + // minimum power level in watts that occurred within the last + // IntervalInMin minutes. + if !clientConfig.isExcluded["min_consumed_watts"] { + metrics["min_consumed_watts"] = pc.PowerMetrics.MinConsumedWatts + } + // MaxConsumedWatts shall represent the + // maximum power level in watts that occurred within the last + // IntervalInMin minutes + if !clientConfig.isExcluded["max_consumed_watts"] { + metrics["max_consumed_watts"] = pc.PowerMetrics.MaxConsumedWatts + } + // IntervalInMin shall represent the time interval (or window), in minutes, + // in which the PowerMetrics properties are measured over. 
+ // Should be an integer, but some Dell implementations return as a float + intervalInMin := + strconv.FormatFloat( + float64(pc.PowerMetrics.IntervalInMin), 'f', -1, 32) // Set tags tags := map[string]string{ @@ -270,6 +351,14 @@ func (r *RedfishReceiver) Start() { } defer c.Logout() + // Create a session, when required + if _, err = c.GetSession(); err != nil { + c, err = c.CloneWithSession() + if err != nil { + return fmt.Errorf("readMetrics: Failed to create a session: %+w", err) + } + } + // Get all chassis managed by this service chassis_list, err := c.Service.Chassis() if err != nil { @@ -476,7 +565,17 @@ func NewRedfishReceiver(name string, config json.RawMessage) (Receiver, error) { } gofishConfig.Password = *clientConfig.Password + // Reuse existing http client gofishConfig.HTTPClient = httpClient + + // Is metrics excluded globally or per client + clientConfig.isExcluded = make(map[string]bool) + for _, key := range clientConfig.ExcludeMetrics { + clientConfig.isExcluded[key] = true + } + for _, key := range r.config.ExcludeMetrics { + clientConfig.isExcluded[key] = true + } } return r, nil From 6eb8e3a1f5eadf61c9bbab808883e5d7ff9fea39 Mon Sep 17 00:00:00 2001 From: Holger Obermaier <40787752+ho-ob@users.noreply.github.com> Date: Wed, 10 Aug 2022 17:00:47 +0200 Subject: [PATCH 14/31] Corrected comments. Added additional check --- receivers/redfishReceiver.go | 19 ++++++++++++------- 1 file changed, 12 insertions(+), 7 deletions(-) diff --git a/receivers/redfishReceiver.go b/receivers/redfishReceiver.go index cf64744..49a9a07 100644 --- a/receivers/redfishReceiver.go +++ b/receivers/redfishReceiver.go @@ -101,7 +101,7 @@ func (r *RedfishReceiver) Start() { break } - // Skip all temperatures which ar not in enabled state + // Skip all temperatures which are not in enabled state if temperature.Status.State != common.EnabledState { continue } @@ -119,8 +119,8 @@ func (r *RedfishReceiver) Start() { // services supporting Redfish v1.6 or higher, this value shall be the // zero-based array index. "temperature_member_id": temperature.MemberID, - // PhysicalContext shall be a description of the affected device(s) or region - // within the chassis to which this power control applies. + // PhysicalContext shall be a description of the affected device or region + // within the chassis to which this temperature measurement applies "temperature_physical_context": string(temperature.PhysicalContext), // Name "temperature_name": temperature.Name, @@ -159,7 +159,7 @@ func (r *RedfishReceiver) Start() { break } - // Skip all fans which ar not in enabled state + // Skip all fans which are not in enabled state if fan.Status.State != common.EnabledState { continue } @@ -177,8 +177,8 @@ func (r *RedfishReceiver) Start() { // services supporting Redfish v1.6 or higher, this value shall be the // zero-based array index. "fan_member_id": fan.MemberID, - // PhysicalContext shall be a description of the affected device(s) or region - // within the chassis to which this power control applies. + // PhysicalContext shall be a description of the affected device or region + // within the chassis to which this fan is associated "fan_physical_context": string(fan.PhysicalContext), // Name "fan_name": fan.Name, @@ -198,7 +198,7 @@ func (r *RedfishReceiver) Start() { "unit": string(fan.ReadingUnits), } - // ReadingCelsius shall be the current value of the temperature sensor's reading. 
+ // Reading shall be the current value of the fan sensor's reading value := fan.Reading y, err := lp.New("fan_speed", tags, meta, @@ -239,6 +239,11 @@ func (r *RedfishReceiver) Start() { // Read min, max and average consumed watts for each power control for _, pc := range power.PowerControl { + // Skip all power controls which are not in enabled state + if pc.Status.State != common.EnabledState { + continue + } + // Map of collected metrics metrics := make(map[string]float32) From acd092a977bc498ba287b4401d84ba26dc438e1c Mon Sep 17 00:00:00 2001 From: Holger Obermaier <40787752+ho-ob@users.noreply.github.com> Date: Thu, 11 Aug 2022 15:36:18 +0200 Subject: [PATCH 15/31] Add redfish receiver documentation --- receivers/redfishReceiver.go | 11 +++---- receivers/redfishReceiver.md | 56 ++++++++++++++++++++++++++++++++++++ 2 files changed, 62 insertions(+), 5 deletions(-) create mode 100644 receivers/redfishReceiver.md diff --git a/receivers/redfishReceiver.go b/receivers/redfishReceiver.go index 49a9a07..2c3b6a9 100644 --- a/receivers/redfishReceiver.go +++ b/receivers/redfishReceiver.go @@ -30,7 +30,8 @@ type RedfishReceiver struct { IntervalString string `json:"interval,omitempty"` Interval time.Duration - // Control whether a client verifies the server's certificate (default: true) + // Control whether a client verifies the server's certificate + // (default: true == do not verify server's certificate) HttpInsecure bool `json:"http_insecure,omitempty"` // Time limit for requests made by this HTTP client (default: 10 s) HttpTimeoutString string `json:"http_timeout,omitempty"` @@ -45,10 +46,10 @@ type RedfishReceiver struct { // Client config for each redfish service ClientConfigs []struct { - Hostname *string `json:"hostname"` - Username *string `json:"username"` - Password *string `json:"password"` - Endpoint *string `json:"endpoint"` + Hostname *string `json:"hostname"` // Hostname the redfish service belongs to + Username *string `json:"username"` // User name to authenticate with + Password *string `json:"password"` // Password to use for authentication + Endpoint *string `json:"endpoint"` // URL of the redfish service // Per client disable collection of power or thermal metrics DisablePowerMetrics bool `json:"disable_power_metrics"` diff --git a/receivers/redfishReceiver.md b/receivers/redfishReceiver.md new file mode 100644 index 0000000..524c330 --- /dev/null +++ b/receivers/redfishReceiver.md @@ -0,0 +1,56 @@ +## Redfish receiver + +The Redfish receiver uses the [Redfish (specification)](https://www.dmtf.org/standards/redfish) to query thermal and power metrics. Thermal metrics may include various fan speeds and temperatures. Power metrics may include the current power consumption of various hardware components. It may also include the minimum, maximum and average power consumption of these components in a given time interval. The receiver will poll each configured redfish device once in a given interval. Multiple devices can be accessed in parallel to increase throughput. 
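To make the output more tangible: expressed in InfluxDB line-protocol notation, metrics produced by this receiver look roughly like the sketch below. Hostnames, sensor names, values and the timestamp are invented for illustration; only the metric names and tag keys correspond to what the receiver sets.

```
temperature,hostname=node001,type=node,temperature_name=CPU1_Temp value=38 1660000000000000000
consumed_watts,hostname=node001,type=node,power_control_name=System_Power_Control value=287 1660000000000000000
fan_speed,hostname=node001,type=node,fan_name=Fan1A value=4800 1660000000000000000
```

Further tags (chassis name and type, member IDs, physical context) are attached in the same way whenever the redfish service reports them; empty tags are dropped.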
+ +### Configuration structure + +```json +{ + "": { + "type": "redfish", + "exclude_metrics": [ "min_consumed_watts" ], + "client_config": [ + { + "hostname": "", + "username": "", + "password": "", + "endpoint": "https://" + }, + { + "hostname": "", + "username": "", + "password": "", + "endpoint": "https://", + "disable_power_metrics": true + }, + { + "hostname": "", + "username": "", + "password": "", + "endpoint": "https://", + "disable_thermal_metrics": true + } + ] + } +} +``` + +Global settings: + +- `fanout`: Maximum number of simultaneous redfish connections (default: 64) +- `interval`: How often the redfish power metrics should be read and send to the sink (default: 30 s) +- `http_insecure`: Control whether a client verifies the server's certificate (default: true == do not verify server's certificate) +- `http_timeout`: Time limit for requests made by this HTTP client (default: 10 s) + +Global and per redfish device settings: + +- `disable_power_metrics`: disable collection of power metrics +- `disable_thermal_metrics`: disable collection of thermal metrics +- `exclude_metrics`: list of excluded metrics + +Per redfish device settings: + +- `hostname`: hostname the redfish service belongs to +- `username`: User name to authenticate with +- `password`: Password to use for authentication +- `endpoint`: URL of the redfish service From 62f6e4151a375592ce5b5fb53c3089e84f8fb08f Mon Sep 17 00:00:00 2001 From: Holger Obermaier <40787752+ho-ob@users.noreply.github.com> Date: Mon, 15 Aug 2022 15:11:29 +0200 Subject: [PATCH 16/31] Added readProcessorMetrics to read read thermal an power metrics per CPU / GPU --- receivers/redfishReceiver.go | 130 +++++++++++++++++++++++++++++++++-- 1 file changed, 124 insertions(+), 6 deletions(-) diff --git a/receivers/redfishReceiver.go b/receivers/redfishReceiver.go index 2c3b6a9..8ab41f4 100644 --- a/receivers/redfishReceiver.go +++ b/receivers/redfishReceiver.go @@ -5,6 +5,7 @@ import ( "encoding/json" "fmt" "net/http" + "net/url" "strconv" "sync" "time" @@ -37,9 +38,10 @@ type RedfishReceiver struct { HttpTimeoutString string `json:"http_timeout,omitempty"` HttpTimeout time.Duration - // Globally disable collection of power or thermal metrics - DisablePowerMetrics bool `json:"disable_power_metrics"` - DisableThermalMetrics bool `json:"disable_thermal_metrics"` + // Globally disable collection of power, processor or thermal metrics + DisablePowerMetrics bool `json:"disable_power_metrics"` + DisableProcessorMetrics bool `json:"disable_processor_metrics"` + DisableThermalMetrics bool `json:"disable_thermal_metrics"` // Globally excluded metrics ExcludeMetrics []string `json:"exclude_metrics,omitempty"` @@ -51,9 +53,10 @@ type RedfishReceiver struct { Password *string `json:"password"` // Password to use for authentication Endpoint *string `json:"endpoint"` // URL of the redfish service - // Per client disable collection of power or thermal metrics - DisablePowerMetrics bool `json:"disable_power_metrics"` - DisableThermalMetrics bool `json:"disable_thermal_metrics"` + // Per client disable collection of power,processor or thermal metrics + DisablePowerMetrics bool `json:"disable_power_metrics"` + DisableProcessorMetrics bool `json:"disable_processor_metrics"` + DisableThermalMetrics bool `json:"disable_thermal_metrics"` // Per client excluded metrics ExcludeMetrics []string `json:"exclude_metrics,omitempty"` @@ -337,6 +340,101 @@ func (r *RedfishReceiver) Start() { return nil } + // Read redfish processor metrics + // See: 
https://redfish.dmtf.org/schemas/v1/ProcessorMetrics.json + readProcessorMetrics := func(clientConfigIndex int, processor *redfish.Processor) error { + clientConfig := &r.config.ClientConfigs[clientConfigIndex] + + // Skip collection off processor metrics when disabled by config + if r.config.DisableProcessorMetrics || clientConfig.DisableProcessorMetrics { + return nil + } + + timestamp := time.Now() + + URL, _ := url.JoinPath(processor.ODataID, "ProcessorMetrics") + resp, err := processor.Client.Get(URL) + if err != nil { + // Skip non existing URLs + return nil + } + + var processorMetrics struct { + common.Entity + ODataType string `json:"@odata.type"` + ODataEtag string `json:"@odata.etag"` + Description string `json:"Description"` + // This property shall contain the power, in watts, that the processor has consumed. + ConsumedPowerWatt float32 `json:"ConsumedPowerWatt"` + // This property shall contain the temperature, in Celsius, of the processor. + TemperatureCelsius float32 `json:"TemperatureCelsius"` + } + err = json.NewDecoder(resp.Body).Decode(&processorMetrics) + if err != nil { + return fmt.Errorf("unable to decode JSON for processor metrics: %+w", err) + } + processorMetrics.SetClient(processor.Client) + + // Set tags + tags := map[string]string{ + "hostname": *clientConfig.Hostname, + "type": "socket", + // ProcessorType shall contain the string which identifies the type of processor contained in this Socket + "processor_typ": string(processor.ProcessorType), + // Processor name + "processor_name": processor.Name, + // ID uniquely identifies the resource + "processor_id": processor.ID, + } + + // Delete empty tags + for key, value := range tags { + if value == "" { + delete(tags, key) + } + } + + // Set meta data tags + metaPower := map[string]string{ + "source": r.name, + "group": "Energy", + "unit": "watts", + } + + namePower := "consumed_power" + + if !clientConfig.isExcluded[namePower] { + y, err := lp.New(namePower, tags, metaPower, + map[string]interface{}{ + "value": processorMetrics.ConsumedPowerWatt, + }, + timestamp) + if err == nil { + r.sink <- y + } + } + // Set meta data tags + metaThermal := map[string]string{ + "source": r.name, + "group": "Temperature", + "unit": "degC", + } + + nameThermal := "temperature" + + if !clientConfig.isExcluded[nameThermal] { + y, err := lp.New(nameThermal, tags, metaThermal, + map[string]interface{}{ + "value": processorMetrics.TemperatureCelsius, + }, + timestamp) + if err == nil { + r.sink <- y + } + } + return nil + } + // readMetrics reads redfish temperature and power metrics from the endpoint configured in conf readMetrics := func(clientConfigIndex int) error { @@ -384,6 +482,26 @@ func (r *RedfishReceiver) Start() { } } + // loop for all computer systems + systems, err := c.Service.Systems() + if err != nil { + return fmt.Errorf("readMetrics: c.Service.Systems() failed: %v", err) + } + for _, system := range systems { + + // loop for all processors + processors, err := system.Processors() + if err != nil { + return fmt.Errorf("readMetrics: system.Processors() failed: %v", err) + } + for _, processor := range processors { + err := readProcessorMetrics(clientConfigIndex, processor) + if err != nil { + return err + } + } + } + return nil } From eaf8b1941d52f16c2ad2d705778d1b9c434cdf28 Mon Sep 17 00:00:00 2001 From: Holger Obermaier <40787752+ho-ob@users.noreply.github.com> Date: Mon, 15 Aug 2022 15:25:20 +0200 Subject: [PATCH 17/31] ioutils is depreceated --- receivers/httpReceiver.go | 4 ++-- 1 file changed, 2 
insertions(+), 2 deletions(-) diff --git a/receivers/httpReceiver.go b/receivers/httpReceiver.go index e66ad5e..974bcd6 100644 --- a/receivers/httpReceiver.go +++ b/receivers/httpReceiver.go @@ -5,7 +5,7 @@ import ( "encoding/json" "errors" "fmt" - "io/ioutil" + "io" "net/http" "strings" "sync" @@ -84,7 +84,7 @@ func (r *HttpReceiver) ServerHttp(w http.ResponseWriter, req *http.Request) { return } - body, err := ioutil.ReadAll(req.Body) + body, err := io.ReadAll(req.Body) if err != nil { http.Error(w, err.Error(), http.StatusInternalServerError) return From f7b39d027b80070639eba7c7d7c35b3bf01fbf6c Mon Sep 17 00:00:00 2001 From: Holger Obermaier <40787752+ho-ob@users.noreply.github.com> Date: Mon, 15 Aug 2022 15:25:59 +0200 Subject: [PATCH 18/31] url.JoinPath requires go 1.19. For now stay compatible with go 1.18 --- receivers/redfishReceiver.go | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/receivers/redfishReceiver.go b/receivers/redfishReceiver.go index 8ab41f4..6632336 100644 --- a/receivers/redfishReceiver.go +++ b/receivers/redfishReceiver.go @@ -5,7 +5,6 @@ import ( "encoding/json" "fmt" "net/http" - "net/url" "strconv" "sync" "time" @@ -352,7 +351,8 @@ func (r *RedfishReceiver) Start() { timestamp := time.Now() - URL, _ := url.JoinPath(processor.ODataID, "ProcessorMetrics") + // Golang 1.19: URL, _ := url.JoinPath(processor.ODataID, "ProcessorMetrics") + URL := processor.ODataID + "/ProcessorMetrics" resp, err := processor.Client.Get(URL) if err != nil { // Skip non existing URLs From 0dd430e7e9830513d775b6426cca980bfe343dab Mon Sep 17 00:00:00 2001 From: Holger Obermaier <40787752+ho-ob@users.noreply.github.com> Date: Tue, 16 Aug 2022 15:14:20 +0200 Subject: [PATCH 19/31] Refactor redfishReceiver. --- receivers/redfishReceiver.go | 1177 ++++++++++++++++++---------------- 1 file changed, 613 insertions(+), 564 deletions(-) diff --git a/receivers/redfishReceiver.go b/receivers/redfishReceiver.go index 6632336..1e4c94e 100644 --- a/receivers/redfishReceiver.go +++ b/receivers/redfishReceiver.go @@ -18,24 +18,589 @@ import ( "github.com/stmcginnis/gofish/redfish" ) +type RedfishReceiverClientConfig struct { + + // Hostname the redfish service belongs to + Hostname string + + // is metric excluded globally or per client + isExcluded map[string](bool) + + doPowerMetric bool + doProcessorMetrics bool + doThermalMetrics bool + + gofish gofish.ClientConfig +} + // RedfishReceiver configuration: type RedfishReceiver struct { receiver + config struct { + fanout int + Interval time.Duration + HttpTimeout time.Duration + + // Client config for each redfish service + ClientConfigs []RedfishReceiverClientConfig + } + + done chan bool // channel to finish / stop redfish receiver + wg sync.WaitGroup // wait group for redfish receiver +} + +// readThermalMetrics reads thermal metrics from a redfish device +func (r *RedfishReceiver) readThermalMetrics( + clientConfig *RedfishReceiverClientConfig, + chassis *redfish.Chassis) error { + + // Get thermal information for each chassis + thermal, err := chassis.Thermal() + if err != nil { + return fmt.Errorf("readMetrics: chassis.Thermal() failed: %v", err) + } + + // Skip empty thermal information + if thermal == nil { + return nil + } + + timestamp := time.Now() + + for _, temperature := range thermal.Temperatures { + + // Skip, when temperature metric is excluded + if clientConfig.isExcluded["temperature"] { + break + } + + // Skip all temperatures which are not in enabled state + if temperature.Status.State != common.EnabledState 
{ + continue + } + + tags := map[string]string{ + "hostname": clientConfig.Hostname, + "type": "node", + // ChassisType shall indicate the physical form factor for the type of chassis + "chassis_typ": string(chassis.ChassisType), + // Chassis name + "chassis_name": chassis.Name, + // ID uniquely identifies the resource + "temperature_id": temperature.ID, + // MemberID shall uniquely identify the member within the collection. For + // services supporting Redfish v1.6 or higher, this value shall be the + // zero-based array index. + "temperature_member_id": temperature.MemberID, + // PhysicalContext shall be a description of the affected device or region + // within the chassis to which this temperature measurement applies + "temperature_physical_context": string(temperature.PhysicalContext), + // Name + "temperature_name": temperature.Name, + } + + // Delete empty tags + for key, value := range tags { + if value == "" { + delete(tags, key) + } + } + + // Set meta data tags + meta := map[string]string{ + "source": r.name, + "group": "Temperature", + "unit": "degC", + } + + // ReadingCelsius shall be the current value of the temperature sensor's reading. + value := temperature.ReadingCelsius + + y, err := lp.New("temperature", tags, meta, + map[string]interface{}{ + "value": value, + }, + timestamp) + if err == nil { + r.sink <- y + } + } + + for _, fan := range thermal.Fans { + // Skip, when fan_speed metric is excluded + if clientConfig.isExcluded["fan_speed"] { + break + } + + // Skip all fans which are not in enabled state + if fan.Status.State != common.EnabledState { + continue + } + + tags := map[string]string{ + "hostname": clientConfig.Hostname, + "type": "node", + // ChassisType shall indicate the physical form factor for the type of chassis + "chassis_typ": string(chassis.ChassisType), + // Chassis name + "chassis_name": chassis.Name, + // ID uniquely identifies the resource + "fan_id": fan.ID, + // MemberID shall uniquely identify the member within the collection. For + // services supporting Redfish v1.6 or higher, this value shall be the + // zero-based array index. 
+ "fan_member_id": fan.MemberID, + // PhysicalContext shall be a description of the affected device or region + // within the chassis to which this fan is associated + "fan_physical_context": string(fan.PhysicalContext), + // Name + "fan_name": fan.Name, + } + + // Delete empty tags + for key, value := range tags { + if value == "" { + delete(tags, key) + } + } + + // Set meta data tags + meta := map[string]string{ + "source": r.name, + "group": "FanSpeed", + "unit": string(fan.ReadingUnits), + } + + // Reading shall be the current value of the fan sensor's reading + value := fan.Reading + + y, err := lp.New("fan_speed", tags, meta, + map[string]interface{}{ + "value": value, + }, + timestamp) + if err == nil { + r.sink <- y + } + } + + return nil +} + +// readPowerMetrics reads power metrics from a redfish device +func (r *RedfishReceiver) readPowerMetrics( + clientConfig *RedfishReceiverClientConfig, + chassis *redfish.Chassis) error { + + // Get power information for each chassis + power, err := chassis.Power() + if err != nil { + return fmt.Errorf("readMetrics: chassis.Power() failed: %v", err) + } + + // Skip empty power information + if power == nil { + return nil + } + + timestamp := time.Now() + + // Read min, max and average consumed watts for each power control + for _, pc := range power.PowerControl { + + // Skip all power controls which are not in enabled state + if pc.Status.State != common.EnabledState { + continue + } + + // Map of collected metrics + metrics := make(map[string]float32) + + // PowerConsumedWatts shall represent the actual power being consumed (in + // Watts) by the chassis + if !clientConfig.isExcluded["consumed_watts"] { + metrics["consumed_watts"] = pc.PowerConsumedWatts + } + // AverageConsumedWatts shall represent the + // average power level that occurred averaged over the last IntervalInMin + // minutes. + if !clientConfig.isExcluded["average_consumed_watts"] { + metrics["average_consumed_watts"] = pc.PowerMetrics.AverageConsumedWatts + } + // MinConsumedWatts shall represent the + // minimum power level in watts that occurred within the last + // IntervalInMin minutes. + if !clientConfig.isExcluded["min_consumed_watts"] { + metrics["min_consumed_watts"] = pc.PowerMetrics.MinConsumedWatts + } + // MaxConsumedWatts shall represent the + // maximum power level in watts that occurred within the last + // IntervalInMin minutes + if !clientConfig.isExcluded["max_consumed_watts"] { + metrics["max_consumed_watts"] = pc.PowerMetrics.MaxConsumedWatts + } + // IntervalInMin shall represent the time interval (or window), in minutes, + // in which the PowerMetrics properties are measured over. + // Should be an integer, but some Dell implementations return as a float + intervalInMin := + strconv.FormatFloat( + float64(pc.PowerMetrics.IntervalInMin), 'f', -1, 32) + + // Set tags + tags := map[string]string{ + "hostname": clientConfig.Hostname, + "type": "node", + // ChassisType shall indicate the physical form factor for the type of chassis + "chassis_typ": string(chassis.ChassisType), + // Chassis name + "chassis_name": chassis.Name, + // ID uniquely identifies the resource + "power_control_id": pc.ID, + // MemberID shall uniquely identify the member within the collection. For + // services supporting Redfish v1.6 or higher, this value shall be the + // zero-based array index. + "power_control_member_id": pc.MemberID, + // PhysicalContext shall be a description of the affected device(s) or region + // within the chassis to which this power control applies. 
+ "power_control_physical_context": string(pc.PhysicalContext), + // Name + "power_control_name": pc.Name, + } + + // Delete empty tags + for key, value := range tags { + if value == "" { + delete(tags, key) + } + } + + // Set meta data tags + meta := map[string]string{ + "source": r.name, + "group": "Energy", + "interval_in_minutes": intervalInMin, + "unit": "watts", + } + + // Delete empty meta data tags + for key, value := range meta { + if value == "" { + delete(meta, key) + } + } + + for name, value := range metrics { + + y, err := lp.New(name, tags, meta, + map[string]interface{}{ + "value": value, + }, + timestamp) + if err == nil { + r.sink <- y + } + } + } + + return nil +} + +// readProcessorMetrics reads processor metrics from a redfish device +// See: https://redfish.dmtf.org/schemas/v1/ProcessorMetrics.json +func (r *RedfishReceiver) readProcessorMetrics( + clientConfig *RedfishReceiverClientConfig, + processor *redfish.Processor) error { + + timestamp := time.Now() + + // Golang 1.19: URL, _ := url.JoinPath(processor.ODataID, "ProcessorMetrics") + URL := processor.ODataID + "/ProcessorMetrics" + resp, err := processor.Client.Get(URL) + if err != nil { + // Skip non existing URLs + return nil + } + + var processorMetrics struct { + common.Entity + ODataType string `json:"@odata.type"` + ODataEtag string `json:"@odata.etag"` + Description string `json:"Description"` + // This property shall contain the power, in watts, that the processor has consumed. + ConsumedPowerWatt float32 `json:"ConsumedPowerWatt"` + // This property shall contain the temperature, in Celsius, of the processor. + TemperatureCelsius float32 `json:"TemperatureCelsius"` + } + err = json.NewDecoder(resp.Body).Decode(&processorMetrics) + if err != nil { + return fmt.Errorf("unable to decode JSON for processor metrics: %+w", err) + } + processorMetrics.SetClient(processor.Client) + + // Set tags + tags := map[string]string{ + "hostname": clientConfig.Hostname, + "type": "socket", + // ProcessorType shall contain the string which identifies the type of processor contained in this Socket + "processor_typ": string(processor.ProcessorType), + // Processor name + "processor_name": processor.Name, + // ID uniquely identifies the resource + "processor_id": processor.ID, + } + + // Delete empty tags + for key, value := range tags { + if value == "" { + delete(tags, key) + } + } + + // Set meta data tags + metaPower := map[string]string{ + "source": r.name, + "group": "Energy", + "unit": "watts", + } + + namePower := "consumed_power" + + if !clientConfig.isExcluded[namePower] { + y, err := lp.New(namePower, tags, metaPower, + map[string]interface{}{ + "value": processorMetrics.ConsumedPowerWatt, + }, + timestamp) + if err == nil { + r.sink <- y + } + } + // Set meta data tags + metaThermal := map[string]string{ + "source": r.name, + "group": "Temperature", + "unit": "degC", + } + + nameThermal := "temperature" + + if !clientConfig.isExcluded[nameThermal] { + y, err := lp.New(nameThermal, tags, metaThermal, + map[string]interface{}{ + "value": processorMetrics.TemperatureCelsius, + }, + timestamp) + if err == nil { + r.sink <- y + } + } + return nil +} + +// readMetrics reads redfish thermal, power and processor metrics from the redfish device +// configured in clientConfig +func (r *RedfishReceiver) readMetrics(clientConfig *RedfishReceiverClientConfig) error { + + // Connect to redfish service + c, err := gofish.Connect(clientConfig.gofish) + if err != nil { + return fmt.Errorf( + "readMetrics: 
gofish.Connect({Username: %v, Endpoint: %v, BasicAuth: %v, HttpTimeout: %v, HttpInsecure: %v}) failed: %v", + clientConfig.gofish.Username, + clientConfig.gofish.Endpoint, + clientConfig.gofish.BasicAuth, + clientConfig.gofish.HTTPClient.Timeout, + clientConfig.gofish.HTTPClient.Transport.(*http.Transport).TLSClientConfig.InsecureSkipVerify, + err) + } + defer c.Logout() + + // Create a session, when required + if _, err = c.GetSession(); err != nil { + c, err = c.CloneWithSession() + if err != nil { + return fmt.Errorf("readMetrics: Failed to create a session: %+w", err) + } + } + + // Get all chassis managed by this service + isChassisListRequired := + clientConfig.doThermalMetrics || + clientConfig.doPowerMetric + var chassisList []*redfish.Chassis + if isChassisListRequired { + chassisList, err = c.Service.Chassis() + if err != nil { + return fmt.Errorf("readMetrics: c.Service.Chassis() failed: %v", err) + } + } + + // Get all computer systems managed by this service + isComputerSystemListRequired := clientConfig.doProcessorMetrics + var computerSystemList []*redfish.ComputerSystem + if isComputerSystemListRequired { + computerSystemList, err = c.Service.Systems() + if err != nil { + return fmt.Errorf("readMetrics: c.Service.Systems() failed: %v", err) + } + } + + // read thermal metrics + if clientConfig.doThermalMetrics { + for _, chassis := range chassisList { + err := r.readThermalMetrics(clientConfig, chassis) + if err != nil { + return err + } + } + } + + // read power metrics + if clientConfig.doPowerMetric { + for _, chassis := range chassisList { + err = r.readPowerMetrics(clientConfig, chassis) + if err != nil { + return err + } + } + } + + // read processor metrics + if clientConfig.doProcessorMetrics { + // loop for all computer systems + for _, system := range computerSystemList { + + // loop for all processors + processors, err := system.Processors() + if err != nil { + return fmt.Errorf("readMetrics: system.Processors() failed: %v", err) + } + for _, processor := range processors { + err := r.readProcessorMetrics(clientConfig, processor) + if err != nil { + return err + } + } + } + } + + return nil +} + +// doReadMetrics reads metrics from all configure redfish devices. +// To compensate latencies of the Redfish devices a fanout is used. 
+func (r *RedfishReceiver) doReadMetric() { + + // Create wait group and input channel for workers + var workerWaitGroup sync.WaitGroup + workerInput := make(chan *RedfishReceiverClientConfig, r.config.fanout) + + // Create worker go routines + for i := 0; i < r.config.fanout; i++ { + // Increment worker wait group counter + workerWaitGroup.Add(1) + go func() { + // Decrement worker wait group counter + defer workerWaitGroup.Done() + + // Read power metrics for each client config + for clientConfig := range workerInput { + err := r.readMetrics(clientConfig) + if err != nil { + cclog.ComponentError(r.name, err) + } + } + }() + } + + // Distribute client configs to workers + for i := range r.config.ClientConfigs { + + // Check done channel status + select { + case workerInput <- &r.config.ClientConfigs[i]: + case <-r.done: + // process done event + // Stop workers, clear channel and wait for all workers to finish + close(workerInput) + for range workerInput { + } + workerWaitGroup.Wait() + return + } + } + + // Stop workers and wait for all workers to finish + close(workerInput) + workerWaitGroup.Wait() +} + +// Start starts the redfish receiver +func (r *RedfishReceiver) Start() { + cclog.ComponentDebug(r.name, "START") + + // Start redfish receiver + r.wg.Add(1) + go func() { + defer r.wg.Done() + + // Create ticker + ticker := time.NewTicker(r.config.Interval) + defer ticker.Stop() + + for { + r.doReadMetric() + + select { + case tickerTime := <-ticker.C: + // Check if we missed the ticker event + if since := time.Since(tickerTime); since > 5*time.Second { + cclog.ComponentInfo(r.name, "Missed ticker event for more then", since) + } + + // process ticker event -> continue + continue + case <-r.done: + // process done event + return + } + } + }() + + cclog.ComponentDebug(r.name, "STARTED") +} + +// Close closes the redfish receiver +func (r *RedfishReceiver) Close() { + cclog.ComponentDebug(r.name, "CLOSE") + + // Send the signal and wait + close(r.done) + r.wg.Wait() + + cclog.ComponentDebug(r.name, "DONE") +} + +// NewRedfishReceiver creates a new instance of the redfish receiver +// Initialize the receiver by giving it a name and reading in the config JSON +func NewRedfishReceiver(name string, config json.RawMessage) (Receiver, error) { + r := new(RedfishReceiver) + + // Config options from config file + configJSON := struct { Type string `json:"type"` // Maximum number of simultaneous redfish connections (default: 64) Fanout int `json:"fanout,omitempty"` // How often the redfish power metrics should be read and send to the sink (default: 30 s) IntervalString string `json:"interval,omitempty"` - Interval time.Duration // Control whether a client verifies the server's certificate // (default: true == do not verify server's certificate) HttpInsecure bool `json:"http_insecure,omitempty"` // Time limit for requests made by this HTTP client (default: 10 s) HttpTimeoutString string `json:"http_timeout,omitempty"` - HttpTimeout time.Duration // Globally disable collection of power, processor or thermal metrics DisablePowerMetrics bool `json:"disable_power_metrics"` @@ -45,7 +610,6 @@ type RedfishReceiver struct { // Globally excluded metrics ExcludeMetrics []string `json:"exclude_metrics,omitempty"` - // Client config for each redfish service ClientConfigs []struct { Hostname *string `json:"hostname"` // Hostname the redfish service belongs to Username *string `json:"username"` // User name to authenticate with @@ -59,563 +623,25 @@ type RedfishReceiver struct { // Per client excluded metrics 
ExcludeMetrics []string `json:"exclude_metrics,omitempty"` - - // is metric excluded globally or per client - isExcluded map[string](bool) - - gofish gofish.ClientConfig } `json:"client_config"` + }{ + // Set defaults values + // Allow overwriting these defaults by reading config JSON + Fanout: 64, + IntervalString: "30s", + HttpTimeoutString: "10s", + HttpInsecure: true, } - done chan bool // channel to finish / stop redfish receiver - wg sync.WaitGroup // wait group for redfish receiver -} - -// Start starts the redfish receiver -func (r *RedfishReceiver) Start() { - cclog.ComponentDebug(r.name, "START") - - // Read redfish thermal metrics - readThermalMetrics := func(clientConfigIndex int, chassis *redfish.Chassis) error { - clientConfig := &r.config.ClientConfigs[clientConfigIndex] - - // Skip collection off thermal metrics when disabled by config - if r.config.DisableThermalMetrics || clientConfig.DisableThermalMetrics { - return nil - } - - // Get thermal information for each chassis - thermal, err := chassis.Thermal() - if err != nil { - return fmt.Errorf("readMetrics: chassis.Thermal() failed: %v", err) - } - - // Skip empty thermal information - if thermal == nil { - return nil - } - - timestamp := time.Now() - - for _, temperature := range thermal.Temperatures { - - // Skip, when temperature metric is excluded - if clientConfig.isExcluded["temperature"] { - break - } - - // Skip all temperatures which are not in enabled state - if temperature.Status.State != common.EnabledState { - continue - } - - tags := map[string]string{ - "hostname": *clientConfig.Hostname, - "type": "node", - // ChassisType shall indicate the physical form factor for the type of chassis - "chassis_typ": string(chassis.ChassisType), - // Chassis name - "chassis_name": chassis.Name, - // ID uniquely identifies the resource - "temperature_id": temperature.ID, - // MemberID shall uniquely identify the member within the collection. For - // services supporting Redfish v1.6 or higher, this value shall be the - // zero-based array index. - "temperature_member_id": temperature.MemberID, - // PhysicalContext shall be a description of the affected device or region - // within the chassis to which this temperature measurement applies - "temperature_physical_context": string(temperature.PhysicalContext), - // Name - "temperature_name": temperature.Name, - } - - // Delete empty tags - for key, value := range tags { - if value == "" { - delete(tags, key) - } - } - - // Set meta data tags - meta := map[string]string{ - "source": r.name, - "group": "Temperature", - "unit": "degC", - } - - // ReadingCelsius shall be the current value of the temperature sensor's reading. - value := temperature.ReadingCelsius - - y, err := lp.New("temperature", tags, meta, - map[string]interface{}{ - "value": value, - }, - timestamp) - if err == nil { - r.sink <- y - } - } - - for _, fan := range thermal.Fans { - // Skip, when fan_speed metric is excluded - if clientConfig.isExcluded["fan_speed"] { - break - } - - // Skip all fans which are not in enabled state - if fan.Status.State != common.EnabledState { - continue - } - - tags := map[string]string{ - "hostname": *clientConfig.Hostname, - "type": "node", - // ChassisType shall indicate the physical form factor for the type of chassis - "chassis_typ": string(chassis.ChassisType), - // Chassis name - "chassis_name": chassis.Name, - // ID uniquely identifies the resource - "fan_id": fan.ID, - // MemberID shall uniquely identify the member within the collection. 
For - // services supporting Redfish v1.6 or higher, this value shall be the - // zero-based array index. - "fan_member_id": fan.MemberID, - // PhysicalContext shall be a description of the affected device or region - // within the chassis to which this fan is associated - "fan_physical_context": string(fan.PhysicalContext), - // Name - "fan_name": fan.Name, - } - - // Delete empty tags - for key, value := range tags { - if value == "" { - delete(tags, key) - } - } - - // Set meta data tags - meta := map[string]string{ - "source": r.name, - "group": "FanSpeed", - "unit": string(fan.ReadingUnits), - } - - // Reading shall be the current value of the fan sensor's reading - value := fan.Reading - - y, err := lp.New("fan_speed", tags, meta, - map[string]interface{}{ - "value": value, - }, - timestamp) - if err == nil { - r.sink <- y - } - } - - return nil - } - - // Read redfish power metrics - readPowerMetrics := func(clientConfigIndex int, chassis *redfish.Chassis) error { - clientConfig := &r.config.ClientConfigs[clientConfigIndex] - - // Skip collection off thermal metrics when disabled by config - if r.config.DisablePowerMetrics || clientConfig.DisablePowerMetrics { - return nil - } - - // Get power information for each chassis - power, err := chassis.Power() - if err != nil { - return fmt.Errorf("readMetrics: chassis.Power() failed: %v", err) - } - - // Skip empty power information - if power == nil { - return nil - } - - timestamp := time.Now() - - // Read min, max and average consumed watts for each power control - for _, pc := range power.PowerControl { - - // Skip all power controls which are not in enabled state - if pc.Status.State != common.EnabledState { - continue - } - - // Map of collected metrics - metrics := make(map[string]float32) - - // PowerConsumedWatts shall represent the actual power being consumed (in - // Watts) by the chassis - if !clientConfig.isExcluded["consumed_watts"] { - metrics["consumed_watts"] = pc.PowerConsumedWatts - } - // AverageConsumedWatts shall represent the - // average power level that occurred averaged over the last IntervalInMin - // minutes. - if !clientConfig.isExcluded["average_consumed_watts"] { - metrics["average_consumed_watts"] = pc.PowerMetrics.AverageConsumedWatts - } - // MinConsumedWatts shall represent the - // minimum power level in watts that occurred within the last - // IntervalInMin minutes. - if !clientConfig.isExcluded["min_consumed_watts"] { - metrics["min_consumed_watts"] = pc.PowerMetrics.MinConsumedWatts - } - // MaxConsumedWatts shall represent the - // maximum power level in watts that occurred within the last - // IntervalInMin minutes - if !clientConfig.isExcluded["max_consumed_watts"] { - metrics["max_consumed_watts"] = pc.PowerMetrics.MaxConsumedWatts - } - // IntervalInMin shall represent the time interval (or window), in minutes, - // in which the PowerMetrics properties are measured over. - // Should be an integer, but some Dell implementations return as a float - intervalInMin := - strconv.FormatFloat( - float64(pc.PowerMetrics.IntervalInMin), 'f', -1, 32) - - // Set tags - tags := map[string]string{ - "hostname": *clientConfig.Hostname, - "type": "node", - // ChassisType shall indicate the physical form factor for the type of chassis - "chassis_typ": string(chassis.ChassisType), - // Chassis name - "chassis_name": chassis.Name, - // ID uniquely identifies the resource - "power_control_id": pc.ID, - // MemberID shall uniquely identify the member within the collection. 
For - // services supporting Redfish v1.6 or higher, this value shall be the - // zero-based array index. - "power_control_member_id": pc.MemberID, - // PhysicalContext shall be a description of the affected device(s) or region - // within the chassis to which this power control applies. - "power_control_physical_context": string(pc.PhysicalContext), - // Name - "power_control_name": pc.Name, - } - - // Delete empty tags - for key, value := range tags { - if value == "" { - delete(tags, key) - } - } - - // Set meta data tags - meta := map[string]string{ - "source": r.name, - "group": "Energy", - "interval_in_minutes": intervalInMin, - "unit": "watts", - } - - // Delete empty meta data tags - for key, value := range meta { - if value == "" { - delete(meta, key) - } - } - - for name, value := range metrics { - - y, err := lp.New(name, tags, meta, - map[string]interface{}{ - "value": value, - }, - timestamp) - if err == nil { - r.sink <- y - } - } - } - - return nil - } - - // Read redfish processor metrics - // See: https://redfish.dmtf.org/schemas/v1/ProcessorMetrics.json - readProcessorMetrics := func(clientConfigIndex int, processor *redfish.Processor) error { - clientConfig := &r.config.ClientConfigs[clientConfigIndex] - - // Skip collection off processor metrics when disabled by config - if r.config.DisableProcessorMetrics || clientConfig.DisableProcessorMetrics { - return nil - } - - timestamp := time.Now() - - // Golang 1.19: URL, _ := url.JoinPath(processor.ODataID, "ProcessorMetrics") - URL := processor.ODataID + "/ProcessorMetrics" - resp, err := processor.Client.Get(URL) - if err != nil { - // Skip non existing URLs - return nil - } - - var processorMetrics struct { - common.Entity - ODataType string `json:"@odata.type"` - ODataEtag string `json:"@odata.etag"` - Description string `json:"Description"` - // This property shall contain the power, in watts, that the processor has consumed. - ConsumedPowerWatt float32 `json:"ConsumedPowerWatt"` - // This property shall contain the temperature, in Celsius, of the processor. 
- TemperatureCelsius float32 `json:"TemperatureCelsius"` - } - err = json.NewDecoder(resp.Body).Decode(&processorMetrics) - if err != nil { - return fmt.Errorf("unable to decode JSON for processor metrics: %+w", err) - } - processorMetrics.SetClient(processor.Client) - - // Set tags - tags := map[string]string{ - "hostname": *clientConfig.Hostname, - "type": "socket", - // ProcessorType shall contain the string which identifies the type of processor contained in this Socket - "processor_typ": string(processor.ProcessorType), - // Processor name - "processor_name": processor.Name, - // ID uniquely identifies the resource - "processor_id": processor.ID, - } - - // Delete empty tags - for key, value := range tags { - if value == "" { - delete(tags, key) - } - } - - // Set meta data tags - metaPower := map[string]string{ - "source": r.name, - "group": "Energy", - "unit": "watts", - } - - namePower := "consumed_power" - - if !clientConfig.isExcluded[namePower] { - y, err := lp.New(namePower, tags, metaPower, - map[string]interface{}{ - "value": processorMetrics.ConsumedPowerWatt, - }, - timestamp) - if err == nil { - r.sink <- y - } - } - // Set meta data tags - metaThermal := map[string]string{ - "source": r.name, - "group": "Temperature", - "unit": "degC", - } - - nameThermal := "temperature" - - if !clientConfig.isExcluded[nameThermal] { - y, err := lp.New(nameThermal, tags, metaThermal, - map[string]interface{}{ - "value": processorMetrics.TemperatureCelsius, - }, - timestamp) - if err == nil { - r.sink <- y - } - } - return nil - } - - // readMetrics reads redfish temperature and power metrics from the endpoint configured in conf - readMetrics := func(clientConfigIndex int) error { - - // access client config - clientConfig := &r.config.ClientConfigs[clientConfigIndex] - - // Connect to redfish service - c, err := gofish.Connect(clientConfig.gofish) - if err != nil { - return fmt.Errorf( - "readMetrics: gofish.Connect({Username: %v, Endpoint: %v, BasicAuth: %v, HttpTimeout: %v, HttpInsecure: %v}) failed: %v", - clientConfig.gofish.Username, - clientConfig.gofish.Endpoint, - clientConfig.gofish.BasicAuth, - clientConfig.gofish.HTTPClient.Timeout, - clientConfig.gofish.HTTPClient.Transport.(*http.Transport).TLSClientConfig.InsecureSkipVerify, - err) - } - defer c.Logout() - - // Create a session, when required - if _, err = c.GetSession(); err != nil { - c, err = c.CloneWithSession() - if err != nil { - return fmt.Errorf("readMetrics: Failed to create a session: %+w", err) - } - } - - // Get all chassis managed by this service - chassis_list, err := c.Service.Chassis() - if err != nil { - return fmt.Errorf("readMetrics: c.Service.Chassis() failed: %v", err) - } - - for _, chassis := range chassis_list { - - err := readThermalMetrics(clientConfigIndex, chassis) - if err != nil { - return err - } - - err = readPowerMetrics(clientConfigIndex, chassis) - if err != nil { - return err - } - } - - // loop for all computer systems - systems, err := c.Service.Systems() - if err != nil { - return fmt.Errorf("readMetrics: c.Service.Systems() failed: %v", err) - } - for _, system := range systems { - - // loop for all processors - processors, err := system.Processors() - if err != nil { - return fmt.Errorf("readMetrics: system.Processors() failed: %v", err) - } - for _, processor := range processors { - err := readProcessorMetrics(clientConfigIndex, processor) - if err != nil { - return err - } - } - } - - return nil - } - - // doReadMetrics read power and temperature metrics for all configure redfish 
services. - // To compensate latencies of the Redfish services a fanout is used. - doReadMetric := func() { - - // Compute fanout to use - realFanout := r.config.Fanout - if len(r.config.ClientConfigs) < realFanout { - realFanout = len(r.config.ClientConfigs) - } - - // Create wait group and input channel for workers - var workerWaitGroup sync.WaitGroup - workerInput := make(chan int, realFanout) - - // Create worker go routines - for i := 0; i < realFanout; i++ { - // Increment worker wait group counter - workerWaitGroup.Add(1) - go func() { - // Decrement worker wait group counter - defer workerWaitGroup.Done() - - // Read power metrics for each client config - for clientConfigIndex := range workerInput { - err := readMetrics(clientConfigIndex) - if err != nil { - cclog.ComponentError(r.name, err) - } - } - }() - } - - // Distribute client configs to workers - for i := range r.config.ClientConfigs { - // Check done channel status - select { - case workerInput <- i: - case <-r.done: - // process done event - // Stop workers, clear channel and wait for all workers to finish - close(workerInput) - for range workerInput { - } - workerWaitGroup.Wait() - return - } - } - - // Stop workers and wait for all workers to finish - close(workerInput) - workerWaitGroup.Wait() - } - - // Start redfish receiver - r.wg.Add(1) - go func() { - defer r.wg.Done() - - // Create ticker - ticker := time.NewTicker(r.config.Interval) - defer ticker.Stop() - - for { - doReadMetric() - - select { - case <-ticker.C: - // process ticker event -> continue - continue - case <-r.done: - // process done event - return - } - } - }() - - cclog.ComponentDebug(r.name, "STARTED") -} - -// Close redfish receiver -func (r *RedfishReceiver) Close() { - cclog.ComponentDebug(r.name, "CLOSE") - - // Send the signal and wait - close(r.done) - r.wg.Wait() - - cclog.ComponentDebug(r.name, "DONE") -} - -// New function to create a new instance of the receiver -// Initialize the receiver by giving it a name and reading in the config JSON -func NewRedfishReceiver(name string, config json.RawMessage) (Receiver, error) { - r := new(RedfishReceiver) - // Set name r.name = fmt.Sprintf("RedfishReceiver(%s)", name) // Create done channel r.done = make(chan bool) - // Set defaults in r.config - // Allow overwriting these defaults by reading config JSON - r.config.Fanout = 64 - r.config.IntervalString = "30s" - r.config.HttpTimeoutString = "10s" - r.config.HttpInsecure = true - // Read the redfish receiver specific JSON config if len(config) > 0 { - err := json.Unmarshal(config, &r.config) + err := json.Unmarshal(config, &configJSON) if err != nil { cclog.ComponentError(r.name, "Error reading config:", err.Error()) return nil, err @@ -624,11 +650,11 @@ func NewRedfishReceiver(name string, config json.RawMessage) (Receiver, error) { // interval duration var err error - r.config.Interval, err = time.ParseDuration(r.config.IntervalString) + r.config.Interval, err = time.ParseDuration(configJSON.IntervalString) if err != nil { err := fmt.Errorf( "Failed to parse duration string interval='%s': %w", - r.config.IntervalString, + configJSON.IntervalString, err, ) cclog.Error(r.name, err) @@ -636,11 +662,11 @@ func NewRedfishReceiver(name string, config json.RawMessage) (Receiver, error) { } // HTTP timeout duration - r.config.HttpTimeout, err = time.ParseDuration(r.config.HttpTimeoutString) + r.config.HttpTimeout, err = time.ParseDuration(configJSON.HttpTimeoutString) if err != nil { err := fmt.Errorf( "Failed to parse duration string http_timeout='%s': 
%w", - r.config.HttpTimeoutString, + configJSON.HttpTimeoutString, err, ) cclog.Error(r.name, err) @@ -650,54 +676,77 @@ func NewRedfishReceiver(name string, config json.RawMessage) (Receiver, error) { // Create new http client customTransport := http.DefaultTransport.(*http.Transport).Clone() customTransport.TLSClientConfig = &tls.Config{ - InsecureSkipVerify: r.config.HttpInsecure, + InsecureSkipVerify: configJSON.HttpInsecure, } httpClient := &http.Client{ Timeout: r.config.HttpTimeout, Transport: customTransport, } + // Compute fanout to use + numClients := len(configJSON.ClientConfigs) + r.config.fanout = configJSON.Fanout + if numClients < r.config.fanout { + r.config.fanout = numClients + } + + // Initialize derived configuration + r.config.ClientConfigs = make([]RedfishReceiverClientConfig, numClients) + // Create gofish client config - for i := range r.config.ClientConfigs { + for i := 0; i < numClients; i++ { clientConfig := &r.config.ClientConfigs[i] + clientConfigJSON := &configJSON.ClientConfigs[i] gofishConfig := &clientConfig.gofish - if clientConfig.Hostname == nil { + if clientConfigJSON.Hostname == nil { err := fmt.Errorf("client config number %v requires hostname", i) cclog.ComponentError(r.name, err) return nil, err } + clientConfig.Hostname = *clientConfigJSON.Hostname - if clientConfig.Endpoint == nil { + if clientConfigJSON.Endpoint == nil { err := fmt.Errorf("client config number %v requires endpoint", i) cclog.ComponentError(r.name, err) return nil, err } - gofishConfig.Endpoint = *clientConfig.Endpoint + gofishConfig.Endpoint = *clientConfigJSON.Endpoint - if clientConfig.Username == nil { + if clientConfigJSON.Username == nil { err := fmt.Errorf("client config number %v requires username", i) cclog.ComponentError(r.name, err) return nil, err } - gofishConfig.Username = *clientConfig.Username + gofishConfig.Username = *clientConfigJSON.Username - if clientConfig.Password == nil { + if clientConfigJSON.Password == nil { err := fmt.Errorf("client config number %v requires password", i) cclog.ComponentError(r.name, err) return nil, err } - gofishConfig.Password = *clientConfig.Password + gofishConfig.Password = *clientConfigJSON.Password // Reuse existing http client gofishConfig.HTTPClient = httpClient + // Which metrics should be collected + clientConfig.doPowerMetric = + !(configJSON.DisablePowerMetrics || + clientConfigJSON.DisablePowerMetrics) + clientConfig.doProcessorMetrics = + !(configJSON.DisableProcessorMetrics || + clientConfigJSON.DisableProcessorMetrics) + clientConfig.doThermalMetrics = + !(configJSON.DisableThermalMetrics || + clientConfigJSON.DisableThermalMetrics) + // Is metrics excluded globally or per client clientConfig.isExcluded = make(map[string]bool) - for _, key := range clientConfig.ExcludeMetrics { + for _, key := range clientConfigJSON.ExcludeMetrics { clientConfig.isExcluded[key] = true } - for _, key := range r.config.ExcludeMetrics { + for _, key := range configJSON.ExcludeMetrics { clientConfig.isExcluded[key] = true } } From a8beec29cc880ddeea5e8b483b370367bd9c9a4b Mon Sep 17 00:00:00 2001 From: Holger Obermaier <40787752+ho-ob@users.noreply.github.com> Date: Wed, 17 Aug 2022 15:11:21 +0200 Subject: [PATCH 20/31] Skip non existing processor metrics URLs --- receivers/redfishReceiver.go | 19 +++++++++++++++++-- 1 file changed, 17 insertions(+), 2 deletions(-) diff --git a/receivers/redfishReceiver.go b/receivers/redfishReceiver.go index 1e4c94e..20fa0c7 100644 --- a/receivers/redfishReceiver.go +++ b/receivers/redfishReceiver.go @@ 
-30,6 +30,8 @@ type RedfishReceiverClientConfig struct { doProcessorMetrics bool doThermalMetrics bool + skipProcessorMetricsURL map[string]bool + gofish gofish.ClientConfig } @@ -314,12 +316,23 @@ func (r *RedfishReceiver) readProcessorMetrics( timestamp := time.Now() - // Golang 1.19: URL, _ := url.JoinPath(processor.ODataID, "ProcessorMetrics") + // URL to processor metrics URL := processor.ODataID + "/ProcessorMetrics" + + // Skip previously detected non existing URLs + if clientConfig.skipProcessorMetricsURL[URL] { + return nil + } + resp, err := processor.Client.Get(URL) if err != nil { // Skip non existing URLs - return nil + if statusCode := err.(*common.Error).HTTPReturnedStatusCode; statusCode == http.StatusNotFound { + clientConfig.skipProcessorMetricsURL[URL] = true + return nil + } + + return fmt.Errorf("processor.Client.Get(%v) failed: %+w", URL, err) } var processorMetrics struct { @@ -741,6 +754,8 @@ func NewRedfishReceiver(name string, config json.RawMessage) (Receiver, error) { !(configJSON.DisableThermalMetrics || clientConfigJSON.DisableThermalMetrics) + clientConfig.skipProcessorMetricsURL = make(map[string]bool) + // Is metrics excluded globally or per client clientConfig.isExcluded = make(map[string]bool) for _, key := range clientConfigJSON.ExcludeMetrics { From 60ef0ed1167f5d41adbe205468279085fe4d1735 Mon Sep 17 00:00:00 2001 From: Holger Obermaier <40787752+ho-ob@users.noreply.github.com> Date: Wed, 17 Aug 2022 17:37:24 +0200 Subject: [PATCH 21/31] Fix for servers, which do not set status.state for thermals or powercontrols --- receivers/redfishReceiver.go | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/receivers/redfishReceiver.go b/receivers/redfishReceiver.go index 20fa0c7..71fb93d 100644 --- a/receivers/redfishReceiver.go +++ b/receivers/redfishReceiver.go @@ -78,7 +78,7 @@ func (r *RedfishReceiver) readThermalMetrics( } // Skip all temperatures which are not in enabled state - if temperature.Status.State != common.EnabledState { + if temperature.Status.State != "" && temperature.Status.State != common.EnabledState { continue } @@ -212,7 +212,7 @@ func (r *RedfishReceiver) readPowerMetrics( for _, pc := range power.PowerControl { // Skip all power controls which are not in enabled state - if pc.Status.State != common.EnabledState { + if pc.Status.State != "" && pc.Status.State != common.EnabledState { continue } From 7ccbf1ebe207b183c9ab09c30ac6cec6566f1ecc Mon Sep 17 00:00:00 2001 From: Holger Obermaier <40787752+ho-ob@users.noreply.github.com> Date: Thu, 25 Aug 2022 16:47:44 +0200 Subject: [PATCH 22/31] Allow global configuration for redfish devices username, password and endpoint. 
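With global defaults the per-device entries only need to name what differs: a value set in `client_config` takes precedence, otherwise the receiver falls back to the global `username`, `password` or `endpoint`, and a `%h` placeholder in the endpoint is expanded to the device hostname. A minimal sketch of that fallback and expansion, assuming a hypothetical `pick` helper and an example hostname that are not part of the receiver code:

```go
// Illustrative sketch only: per-device values win over global defaults,
// and "%h" in the endpoint pattern is replaced by the device hostname.
package main

import (
	"fmt"
	"strings"
)

// pick is a hypothetical helper: it returns the per-device value if set,
// otherwise the global default, and errors out if neither is configured.
func pick(perDevice, global *string) (string, error) {
	if perDevice != nil {
		return *perDevice, nil
	}
	if global != nil {
		return *global, nil
	}
	return "", fmt.Errorf("value must be set either globally or per device")
}

func main() {
	globalEndpoint := "https://%h-bmc"

	// No per-device endpoint configured, so the global pattern is used.
	pattern, err := pick(nil, &globalEndpoint)
	if err != nil {
		fmt.Println(err)
		return
	}
	endpoint := strings.Replace(pattern, "%h", "node01", -1)
	fmt.Println(endpoint) // prints: https://node01-bmc
}
```

The receiver itself follows the same pattern in NewRedfishReceiver: it prefers the client config value, falls back to the global one, and returns an error only if neither is set.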
--- receivers/redfishReceiver.go | 29 +++++++++++++++++++++++------ receivers/redfishReceiver.md | 25 +++++++++++-------------- 2 files changed, 34 insertions(+), 20 deletions(-) diff --git a/receivers/redfishReceiver.go b/receivers/redfishReceiver.go index 71fb93d..dc4bcea 100644 --- a/receivers/redfishReceiver.go +++ b/receivers/redfishReceiver.go @@ -6,6 +6,7 @@ import ( "fmt" "net/http" "strconv" + "strings" "sync" "time" @@ -615,6 +616,11 @@ func NewRedfishReceiver(name string, config json.RawMessage) (Receiver, error) { // Time limit for requests made by this HTTP client (default: 10 s) HttpTimeoutString string `json:"http_timeout,omitempty"` + // Default client username, password and endpoint + Username *string `json:"username"` // User name to authenticate with + Password *string `json:"password"` // Password to use for authentication + Endpoint *string `json:"endpoint"` // URL of the redfish service + // Globally disable collection of power, processor or thermal metrics DisablePowerMetrics bool `json:"disable_power_metrics"` DisableProcessorMetrics bool `json:"disable_processor_metrics"` @@ -719,26 +725,37 @@ func NewRedfishReceiver(name string, config json.RawMessage) (Receiver, error) { } clientConfig.Hostname = *clientConfigJSON.Hostname - if clientConfigJSON.Endpoint == nil { + var endpoint string + if clientConfigJSON.Endpoint != nil { + endpoint = *clientConfigJSON.Endpoint + } else if configJSON.Endpoint != nil { + endpoint = *configJSON.Endpoint + } else { err := fmt.Errorf("client config number %v requires endpoint", i) cclog.ComponentError(r.name, err) return nil, err } - gofishConfig.Endpoint = *clientConfigJSON.Endpoint + gofishConfig.Endpoint = strings.Replace(endpoint, "%h", clientConfig.Hostname, -1) - if clientConfigJSON.Username == nil { + if clientConfigJSON.Username != nil { + gofishConfig.Username = *clientConfigJSON.Username + } else if configJSON.Username != nil { + gofishConfig.Username = *configJSON.Username + } else { err := fmt.Errorf("client config number %v requires username", i) cclog.ComponentError(r.name, err) return nil, err } - gofishConfig.Username = *clientConfigJSON.Username - if clientConfigJSON.Password == nil { + if clientConfigJSON.Password != nil { + gofishConfig.Password = *clientConfigJSON.Password + } else if configJSON.Password != nil { + gofishConfig.Password = *configJSON.Password + } else { err := fmt.Errorf("client config number %v requires password", i) cclog.ComponentError(r.name, err) return nil, err } - gofishConfig.Password = *clientConfigJSON.Password // Reuse existing http client gofishConfig.HTTPClient = httpClient diff --git a/receivers/redfishReceiver.md b/receivers/redfishReceiver.md index 524c330..3ccb015 100644 --- a/receivers/redfishReceiver.md +++ b/receivers/redfishReceiver.md @@ -8,26 +8,23 @@ The Redfish receiver uses the [Redfish (specification)](https://www.dmtf.org/sta { "": { "type": "redfish", + "username": "", + "password": "", + "endpoint": "https://%h-bmc", "exclude_metrics": [ "min_consumed_watts" ], "client_config": [ { - "hostname": "", - "username": "", - "password": "", - "endpoint": "https://" + "hostname": "" }, { "hostname": "", - "username": "", - "password": "", - "endpoint": "https://", "disable_power_metrics": true }, { "hostname": "", - "username": "", - "password": "", - "endpoint": "https://", + "username": "", + "password": "", + "endpoint": "https://%h-BMC", "disable_thermal_metrics": true } ] @@ -42,15 +39,15 @@ Global settings: - `http_insecure`: Control whether a client verifies the 
server's certificate (default: true == do not verify server's certificate) - `http_timeout`: Time limit for requests made by this HTTP client (default: 10 s) -Global and per redfish device settings: +Global and per redfish device settings (per redfish device settings overwrite the global settings): - `disable_power_metrics`: disable collection of power metrics - `disable_thermal_metrics`: disable collection of thermal metrics - `exclude_metrics`: list of excluded metrics +- `username`: User name to authenticate with +- `password`: Password to use for authentication +- `endpoint`: URL of the redfish service (placeholder `%h` gets replaced by the hostname) Per redfish device settings: - `hostname`: hostname the redfish service belongs to -- `username`: User name to authenticate with -- `password`: Password to use for authentication -- `endpoint`: URL of the redfish service From 503705d442ce65d97bcea4e70b9ae5396672ab8f Mon Sep 17 00:00:00 2001 From: Holger Obermaier <40787752+ho-ob@users.noreply.github.com> Date: Fri, 26 Aug 2022 11:55:53 +0200 Subject: [PATCH 23/31] Allow multiple hosts to share the same client configuration --- receivers/redfishReceiver.go | 121 ++++++++++++++++++++++------------- receivers/redfishReceiver.md | 9 +-- 2 files changed, 83 insertions(+), 47 deletions(-) diff --git a/receivers/redfishReceiver.go b/receivers/redfishReceiver.go index dc4bcea..18c3ee8 100644 --- a/receivers/redfishReceiver.go +++ b/receivers/redfishReceiver.go @@ -630,10 +630,10 @@ func NewRedfishReceiver(name string, config json.RawMessage) (Receiver, error) { ExcludeMetrics []string `json:"exclude_metrics,omitempty"` ClientConfigs []struct { - Hostname *string `json:"hostname"` // Hostname the redfish service belongs to - Username *string `json:"username"` // User name to authenticate with - Password *string `json:"password"` // Password to use for authentication - Endpoint *string `json:"endpoint"` // URL of the redfish service + HostList []string `json:"host_list"` // List of hosts with the same client configuration + Username *string `json:"username"` // User name to authenticate with + Password *string `json:"password"` // Password to use for authentication + Endpoint *string `json:"endpoint"` // URL of the redfish service // Per client disable collection of power,processor or thermal metrics DisablePowerMetrics bool `json:"disable_power_metrics"` @@ -667,7 +667,7 @@ func NewRedfishReceiver(name string, config json.RawMessage) (Receiver, error) { } } - // interval duration + // Convert interval string representation to duration var err error r.config.Interval, err = time.ParseDuration(configJSON.IntervalString) if err != nil { @@ -702,86 +702,121 @@ func NewRedfishReceiver(name string, config json.RawMessage) (Receiver, error) { Transport: customTransport, } - // Compute fanout to use - numClients := len(configJSON.ClientConfigs) - r.config.fanout = configJSON.Fanout - if numClients < r.config.fanout { - r.config.fanout = numClients - } + // Initialize client configurations + r.config.ClientConfigs = make([]RedfishReceiverClientConfig, 0) - // Initialize derived configuration - r.config.ClientConfigs = make([]RedfishReceiverClientConfig, numClients) + // Create client config from JSON config + for i := range configJSON.ClientConfigs { - // Create gofish client config - for i := 0; i < numClients; i++ { - clientConfig := &r.config.ClientConfigs[i] clientConfigJSON := &configJSON.ClientConfigs[i] - gofishConfig := &clientConfig.gofish - if clientConfigJSON.Hostname == nil { - err := 
fmt.Errorf("client config number %v requires hostname", i) - cclog.ComponentError(r.name, err) - return nil, err - } - clientConfig.Hostname = *clientConfigJSON.Hostname - - var endpoint string + var endpoint_pattern string if clientConfigJSON.Endpoint != nil { - endpoint = *clientConfigJSON.Endpoint + endpoint_pattern = *clientConfigJSON.Endpoint } else if configJSON.Endpoint != nil { - endpoint = *configJSON.Endpoint + endpoint_pattern = *configJSON.Endpoint } else { err := fmt.Errorf("client config number %v requires endpoint", i) cclog.ComponentError(r.name, err) return nil, err } - gofishConfig.Endpoint = strings.Replace(endpoint, "%h", clientConfig.Hostname, -1) + var username string if clientConfigJSON.Username != nil { - gofishConfig.Username = *clientConfigJSON.Username + username = *clientConfigJSON.Username } else if configJSON.Username != nil { - gofishConfig.Username = *configJSON.Username + username = *configJSON.Username } else { err := fmt.Errorf("client config number %v requires username", i) cclog.ComponentError(r.name, err) return nil, err } + var password string if clientConfigJSON.Password != nil { - gofishConfig.Password = *clientConfigJSON.Password + password = *clientConfigJSON.Password } else if configJSON.Password != nil { - gofishConfig.Password = *configJSON.Password + password = *configJSON.Password } else { err := fmt.Errorf("client config number %v requires password", i) cclog.ComponentError(r.name, err) return nil, err } - // Reuse existing http client - gofishConfig.HTTPClient = httpClient - // Which metrics should be collected - clientConfig.doPowerMetric = + doPowerMetric := !(configJSON.DisablePowerMetrics || clientConfigJSON.DisablePowerMetrics) - clientConfig.doProcessorMetrics = + doProcessorMetrics := !(configJSON.DisableProcessorMetrics || clientConfigJSON.DisableProcessorMetrics) - clientConfig.doThermalMetrics = + doThermalMetrics := !(configJSON.DisableThermalMetrics || clientConfigJSON.DisableThermalMetrics) - clientConfig.skipProcessorMetricsURL = make(map[string]bool) - // Is metrics excluded globally or per client - clientConfig.isExcluded = make(map[string]bool) + isExcluded := make(map[string]bool) for _, key := range clientConfigJSON.ExcludeMetrics { - clientConfig.isExcluded[key] = true + isExcluded[key] = true } for _, key := range configJSON.ExcludeMetrics { - clientConfig.isExcluded[key] = true + isExcluded[key] = true } + + for _, host := range clientConfigJSON.HostList { + + // Endpoint of the redfish service + endpoint := strings.Replace(endpoint_pattern, "%h", host, -1) + + r.config.ClientConfigs = append( + r.config.ClientConfigs, + RedfishReceiverClientConfig{ + Hostname: host, + isExcluded: isExcluded, + doPowerMetric: doPowerMetric, + doProcessorMetrics: doProcessorMetrics, + doThermalMetrics: doThermalMetrics, + skipProcessorMetricsURL: make(map[string]bool), + gofish: gofish.ClientConfig{ + Username: username, + Password: password, + Endpoint: endpoint, + HTTPClient: httpClient, + }, + }) + } + } + // Compute parallel fanout to use + numClients := len(r.config.ClientConfigs) + r.config.fanout = configJSON.Fanout + if numClients < r.config.fanout { + r.config.fanout = numClients + } + + if numClients == 0 { + err := fmt.Errorf("at least one client config is required") + cclog.ComponentError(r.name, err) + return nil, err + } + + // Check for duplicate client configurations + isDuplicate := make(map[string]bool) + for i := range r.config.ClientConfigs { + host := r.config.ClientConfigs[i].Hostname + if isDuplicate[host] { + err 
:= fmt.Errorf("Found duplicate client config for host %s", host) + cclog.ComponentError(r.name, err) + return nil, err + } + isDuplicate[host] = true + } + + // Give some basic info about redfish receiver status + cclog.ComponentInfo(r.name, "Monitoring", numClients, "clients") + cclog.ComponentInfo(r.name, "Monitoring interval:", r.config.Interval) + cclog.ComponentInfo(r.name, "Monitoring parallel fanout:", r.config.fanout) + return r, nil } diff --git a/receivers/redfishReceiver.md b/receivers/redfishReceiver.md index 3ccb015..1bc3ed8 100644 --- a/receivers/redfishReceiver.md +++ b/receivers/redfishReceiver.md @@ -14,14 +14,14 @@ The Redfish receiver uses the [Redfish (specification)](https://www.dmtf.org/sta "exclude_metrics": [ "min_consumed_watts" ], "client_config": [ { - "hostname": "" + "host_list": [ "", "" ] }, { - "hostname": "", + "host_list": [ "", "" ], "disable_power_metrics": true }, { - "hostname": "", + "host_list": [ "" ], "username": "", "password": "", "endpoint": "https://%h-BMC", "disable_thermal_metrics": true } ] @@ -42,6 +42,7 @@ Global settings: - `disable_power_metrics`: disable collection of power metrics +- `disable_processor_metrics`: disable collection of processor metrics - `disable_thermal_metrics`: disable collection of thermal metrics - `exclude_metrics`: list of excluded metrics - `username`: User name to authenticate with @@ -50,4 +51,4 @@ Global and per redfish device settings (per redfish device settings overwrite th Per redfish device settings: -- `hostname`: hostname the redfish service belongs to +- `host_list`: List of hosts with the same client configuration From 8a3446a596758015cc2712b5405082b94b404e56 Mon Sep 17 00:00:00 2001 From: oscarminus Date: Wed, 7 Sep 2022 14:09:29 +0200 Subject: [PATCH 24/31] cpustatMetric.go: Use derived values instead of absolute values (#83) * cpustatMetric.go: Use derived values instead of absolute values The values in /proc/stat are absolute counters accumulated since system boot. To obtain a CPU utilization, the change of the counters over time has to be computed. Using only the absolute values means that changes in utilization, especially once the counters have grown large, hardly become visible (a short sketch of the computation follows below). * Add new collector for /proc/schedstat The `schedstat` collector reads data from /proc/schedstat and calculates a load value, separated by hwthread. This might be useful to detect bad CPU pinning on shared nodes etc.
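A minimal sketch of the new computation, assuming a clock tick rate of 100 Hz and made-up counter values (illustration only, not the collector code; the collector obtains the tick rate via sysconf(SC_CLK_TCK) and reads the counters from /proc/stat):

```go
// Sketch: derive a utilization from two samples of one /proc/stat counter.
// The counter values, the 10 s interval and clkTck = 100 are assumptions
// made for this illustration.
package main

import "fmt"

func main() {
	const clkTck = 100.0 // jiffies per second, normally sysconf(SC_CLK_TCK)

	oldUser := int64(123400) // "user" jiffies at the previous tick
	newUser := int64(123700) // "user" jiffies at the current tick
	elapsed := 10.0          // seconds between the two reads

	// Fraction of one hwthread spent in user mode during the interval.
	utilization := float64(newUser-oldUser) / elapsed / clkTck
	fmt.Printf("cpu_user: %.1f %%\n", utilization*100) // prints: cpu_user: 30.0 %
}
```

This mirrors the updated parseStatLine below, which divides each counter difference by the elapsed time and the clock tick rate before scaling the result to percent.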
Co-authored-by: Michael Schwarz --- collectors/collectorManager.go | 1 + collectors/cpustatMetric.go | 40 ++++++--- collectors/schedstatMetric.go | 155 +++++++++++++++++++++++++++++++++ collectors/schedstatMetric.md | 11 +++ 4 files changed, 197 insertions(+), 10 deletions(-) create mode 100644 collectors/schedstatMetric.go create mode 100644 collectors/schedstatMetric.md diff --git a/collectors/collectorManager.go b/collectors/collectorManager.go index 49a9db8..63d0cb4 100644 --- a/collectors/collectorManager.go +++ b/collectors/collectorManager.go @@ -37,6 +37,7 @@ var AvailableCollectors = map[string]MetricCollector{ "beegfs_meta": new(BeegfsMetaCollector), "beegfs_storage": new(BeegfsStorageCollector), "rocm_smi": new(RocmSmiCollector), + "schedstat": new(SchedstatCollector), } // Metric collector manager data structure diff --git a/collectors/cpustatMetric.go b/collectors/cpustatMetric.go index c0dcf13..3c09b83 100644 --- a/collectors/cpustatMetric.go +++ b/collectors/cpustatMetric.go @@ -11,6 +11,7 @@ import ( cclog "github.com/ClusterCockpit/cc-metric-collector/internal/ccLogger" lp "github.com/ClusterCockpit/cc-metric-collector/internal/ccMetric" + sysconf "github.com/tklauser/go-sysconf" ) const CPUSTATFILE = `/proc/stat` @@ -22,9 +23,11 @@ type CpustatCollectorConfig struct { type CpustatCollector struct { metricCollector config CpustatCollectorConfig + lastTimestamp time.Time // Store time stamp of last tick to derive values matches map[string]int cputags map[string]map[string]string nodetags map[string]string + olddata map[string]map[string]int64 } func (m *CpustatCollector) Init(config json.RawMessage) error { @@ -76,36 +79,48 @@ func (m *CpustatCollector) Init(config json.RawMessage) error { // Pre-generate tags for all CPUs num_cpus := 0 m.cputags = make(map[string]map[string]string) + m.olddata = make(map[string]map[string]int64) scanner := bufio.NewScanner(file) for scanner.Scan() { line := scanner.Text() linefields := strings.Fields(line) - if strings.HasPrefix(linefields[0], "cpu") && strings.Compare(linefields[0], "cpu") != 0 { + if strings.Compare(linefields[0], "cpu") == 0 { + m.olddata["cpu"] = make(map[string]int64) + for k, v := range m.matches { + m.olddata["cpu"][k], _ = strconv.ParseInt(linefields[v], 0, 64) + } + } else if strings.HasPrefix(linefields[0], "cpu") && strings.Compare(linefields[0], "cpu") != 0 { cpustr := strings.TrimLeft(linefields[0], "cpu") cpu, _ := strconv.Atoi(cpustr) m.cputags[linefields[0]] = map[string]string{"type": "hwthread", "type-id": fmt.Sprintf("%d", cpu)} + m.olddata[linefields[0]] = make(map[string]int64) + for k, v := range m.matches { + m.olddata[linefields[0]][k], _ = strconv.ParseInt(linefields[v], 0, 64) + } num_cpus++ } } + m.lastTimestamp = time.Now() m.init = true return nil } -func (m *CpustatCollector) parseStatLine(linefields []string, tags map[string]string, output chan lp.CCMetric) { +func (m *CpustatCollector) parseStatLine(linefields []string, tags map[string]string, output chan lp.CCMetric, now time.Time, tsdelta time.Duration) { values := make(map[string]float64) - total := 0.0 + clktck, _ := sysconf.Sysconf(sysconf.SC_CLK_TCK) for match, index := range m.matches { if len(match) > 0 { x, err := strconv.ParseInt(linefields[index], 0, 64) if err == nil { - values[match] = float64(x) - total += values[match] + vdiff := x - m.olddata[linefields[0]][match] + m.olddata[linefields[0]][match] = x // Store new value for next run + values[match] = float64(vdiff) / float64(tsdelta.Seconds()) / float64(clktck) } } } - t := 
time.Now() + for name, value := range values { - y, err := lp.New(name, tags, m.meta, map[string]interface{}{"value": (value * 100.0) / total}, t) + y, err := lp.New(name, tags, m.meta, map[string]interface{}{"value": value * 100}, now) if err == nil { output <- y } @@ -117,6 +132,9 @@ func (m *CpustatCollector) Read(interval time.Duration, output chan lp.CCMetric) return } num_cpus := 0 + now := time.Now() + tsdelta := now.Sub(m.lastTimestamp) + file, err := os.Open(string(CPUSTATFILE)) if err != nil { cclog.ComponentError(m.name, err.Error()) @@ -128,9 +146,9 @@ func (m *CpustatCollector) Read(interval time.Duration, output chan lp.CCMetric) line := scanner.Text() linefields := strings.Fields(line) if strings.Compare(linefields[0], "cpu") == 0 { - m.parseStatLine(linefields, m.nodetags, output) + m.parseStatLine(linefields, m.nodetags, output, now, tsdelta) } else if strings.HasPrefix(linefields[0], "cpu") { - m.parseStatLine(linefields, m.cputags[linefields[0]], output) + m.parseStatLine(linefields, m.cputags[linefields[0]], output, now, tsdelta) num_cpus++ } } @@ -139,11 +157,13 @@ func (m *CpustatCollector) Read(interval time.Duration, output chan lp.CCMetric) m.nodetags, m.meta, map[string]interface{}{"value": int(num_cpus)}, - time.Now(), + now, ) if err == nil { output <- num_cpus_metric } + + m.lastTimestamp = now } func (m *CpustatCollector) Close() { diff --git a/collectors/schedstatMetric.go b/collectors/schedstatMetric.go new file mode 100644 index 0000000..e3041ae --- /dev/null +++ b/collectors/schedstatMetric.go @@ -0,0 +1,155 @@ +package collectors + +import ( + "encoding/json" + "fmt" + "bufio" + "time" + "os" + "strings" + "strconv" + "math" + + cclog "github.com/ClusterCockpit/cc-metric-collector/internal/ccLogger" + lp "github.com/ClusterCockpit/cc-metric-collector/internal/ccMetric" +) + +const SCHEDSTATFILE = `/proc/schedstat` + +// These are the fields we read from the JSON configuration +type SchedstatCollectorConfig struct { + ExcludeMetrics []string `json:"exclude_metrics,omitempty"` +} + +// This contains all variables we need during execution and the variables +// defined by metricCollector (name, init, ...) +type SchedstatCollector struct { + metricCollector + config SchedstatCollectorConfig // the configuration structure + lastTimestamp time.Time // Store time stamp of last tick to derive values + meta map[string]string // default meta information + cputags map[string]map[string]string // default tags + olddata map[string]map[string]int64 // default tags +} + +// Functions to implement MetricCollector interface +// Init(...), Read(...), Close() +// See: metricCollector.go + +// Init initializes the sample collector +// Called once by the collector manager +// All tags, meta data tags and metrics that do not change over the runtime should be set here +func (m *SchedstatCollector) Init(config json.RawMessage) error { + var err error = nil + // Always set the name early in Init() to use it in cclog.Component* functions + m.name = "SchedstatCollector" + // This is for later use, also call it early + m.setup() + // Tell whether the collector should be run in parallel with others (reading files, ...) 
+ // or it should be run serially, mostly for collectors acutally doing measurements + // because they should not measure the execution of the other collectors + m.parallel = true + // Define meta information sent with each metric + // (Can also be dynamic or this is the basic set with extension through AddMeta()) + m.meta = map[string]string{"source": m.name, "group": "SCHEDSTAT"} + + // Read in the JSON configuration + if len(config) > 0 { + err = json.Unmarshal(config, &m.config) + if err != nil { + cclog.ComponentError(m.name, "Error reading config:", err.Error()) + return err + } + } + + // Check input file + file, err := os.Open(string(SCHEDSTATFILE)) + if err != nil { + cclog.ComponentError(m.name, err.Error()) + } + defer file.Close() + + // Pre-generate tags for all CPUs + num_cpus := 0 + m.cputags = make(map[string]map[string]string) + m.olddata = make(map[string]map[string]int64) + scanner := bufio.NewScanner(file) + for scanner.Scan() { + line := scanner.Text() + linefields := strings.Fields(line) + if strings.HasPrefix(linefields[0], "cpu") && strings.Compare(linefields[0], "cpu") != 0 { + cpustr := strings.TrimLeft(linefields[0], "cpu") + cpu, _ := strconv.Atoi(cpustr) + running, _ := strconv.ParseInt(linefields[7], 10, 64) + waiting, _ := strconv.ParseInt(linefields[8], 10, 64) + m.cputags[linefields[0]] = map[string]string{"type": "hwthread", "type-id": fmt.Sprintf("%d", cpu)} + m.olddata[linefields[0]] = map[string]int64{"running" : running, "waiting" : waiting} + num_cpus++ + } + } + + + // Save current timestamp + m.lastTimestamp = time.Now() + + // Set this flag only if everything is initialized properly, all required files exist, ... + m.init = true + return err +} + +func (m *SchedstatCollector) ParseProcLine(linefields []string, tags map[string]string, output chan lp.CCMetric, now time.Time, tsdelta time.Duration) { + running, _ := strconv.ParseInt(linefields[7], 10, 64) + waiting, _ := strconv.ParseInt(linefields[8], 10, 64) + diff_running := running - m.olddata[linefields[0]]["running"] + diff_waiting := waiting - m.olddata[linefields[0]]["waiting"] + + var l_running float64 = float64(diff_running) / tsdelta.Seconds() / (math.Pow(1000, 3)) + var l_waiting float64 = float64(diff_waiting) / tsdelta.Seconds() / (math.Pow(1000, 3)) + + m.olddata[linefields[0]]["running"] = running + m.olddata[linefields[0]]["waiting"] = waiting + value := l_running + l_waiting + + y, err := lp.New("cpu_load_core", tags, m.meta, map[string]interface{}{"value": value}, now) + if err == nil { + // Send it to output channel + output <- y + } +} + +// Read collects all metrics belonging to the sample collector +// and sends them through the output channel to the collector manager +func (m *SchedstatCollector) Read(interval time.Duration, output chan lp.CCMetric) { + if !m.init { + return + } + + //timestamps + now := time.Now() + tsdelta := now.Sub(m.lastTimestamp) + + file, err := os.Open(string(SCHEDSTATFILE)) + if err != nil { + cclog.ComponentError(m.name, err.Error()) + } + defer file.Close() + + scanner := bufio.NewScanner(file) + for scanner.Scan() { + line := scanner.Text() + linefields := strings.Fields(line) + if strings.HasPrefix(linefields[0], "cpu") { + m.ParseProcLine(linefields, m.cputags[linefields[0]], output, now, tsdelta) + } + } + + m.lastTimestamp = now + +} + +// Close metric collector: close network connection, close files, close libraries, ... 
+// Called once by the collector manager +func (m *SchedstatCollector) Close() { + // Unset flag + m.init = false +} diff --git a/collectors/schedstatMetric.md b/collectors/schedstatMetric.md new file mode 100644 index 0000000..6369eca --- /dev/null +++ b/collectors/schedstatMetric.md @@ -0,0 +1,11 @@ + +## `schedstat` collector +```json + "schedstat": { + } +``` + +The `schedstat` collector reads data from /proc/schedstat and calculates a load value, separated by hwthread. This might be useful to detect bad cpu pinning on shared nodes etc. + +Metric: +* `cpu_load_core` \ No newline at end of file From c09d8fb11841d1aefeaf9206fcc98f075df0047f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20R=C3=B6hl?= Date: Fri, 9 Sep 2022 19:27:20 +0200 Subject: [PATCH 25/31] InfiniBandCollector: Scale raw readings from octets to bytes --- collectors/infinibandMetric.go | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) diff --git a/collectors/infinibandMetric.go b/collectors/infinibandMetric.go index 92ea911..d6613c5 100644 --- a/collectors/infinibandMetric.go +++ b/collectors/infinibandMetric.go @@ -19,8 +19,9 @@ import ( const IB_BASEPATH = "/sys/class/infiniband/" type InfinibandCollectorMetric struct { - path string - unit string + path string + unit string + scale int64 } type InfinibandCollectorInfo struct { @@ -113,10 +114,10 @@ func (m *InfinibandCollector) Init(config json.RawMessage) error { // Check access to counter files countersDir := filepath.Join(path, "counters") portCounterFiles := map[string]InfinibandCollectorMetric{ - "ib_recv": {path: filepath.Join(countersDir, "port_rcv_data"), unit: "bytes"}, - "ib_xmit": {path: filepath.Join(countersDir, "port_xmit_data"), unit: "bytes"}, - "ib_recv_pkts": {path: filepath.Join(countersDir, "port_rcv_packets"), unit: "packets"}, - "ib_xmit_pkts": {path: filepath.Join(countersDir, "port_xmit_packets"), unit: "packets"}, + "ib_recv": {path: filepath.Join(countersDir, "port_rcv_data"), unit: "bytes", scale: 4}, + "ib_xmit": {path: filepath.Join(countersDir, "port_xmit_data"), unit: "bytes", scale: 4}, + "ib_recv_pkts": {path: filepath.Join(countersDir, "port_rcv_packets"), unit: "packets", scale: 1}, + "ib_xmit_pkts": {path: filepath.Join(countersDir, "port_xmit_packets"), unit: "packets", scale: 1}, } for _, counter := range portCounterFiles { err := unix.Access(counter.path, unix.R_OK) @@ -191,6 +192,8 @@ func (m *InfinibandCollector) Read(interval time.Duration, output chan lp.CCMetr fmt.Sprintf("Read(): Failed to convert Infininiband metrice %s='%s' to int64: %v", counterName, data, err)) continue } + // Scale raw value + v *= counterDef.scale // Send absolut values if m.config.SendAbsoluteValues { From 58461f1f72209c465149ae89ed83c14640fb99c7 Mon Sep 17 00:00:00 2001 From: Thomas Roehl Date: Fri, 9 Sep 2022 20:01:21 +0200 Subject: [PATCH 26/31] Fix clock frequency coming from LikwidCollector and update docs --- collectors/likwidMetric.go | 7 ++++--- collectors/likwidMetric.md | 3 +++ 2 files changed, 7 insertions(+), 3 deletions(-) diff --git a/collectors/likwidMetric.go b/collectors/likwidMetric.go index c036415..f22d486 100644 --- a/collectors/likwidMetric.go +++ b/collectors/likwidMetric.go @@ -159,7 +159,8 @@ func getBaseFreq() float64 { data := strings.Replace(string(buffer), "\n", "", -1) x, err := strconv.ParseInt(data, 0, 64) if err == nil { - freq = float64(x) * 1e6 + freq = float64(x) + break } } } @@ -168,11 +169,11 @@ func getBaseFreq() float64 { C.power_init(0) info := C.get_powerInfo() if 
float64(info.baseFrequency) != 0 { - freq = float64(info.baseFrequency) * 1e6 + freq = float64(info.baseFrequency) } C.power_finalize() } - return freq + return freq * 1e3 } func (m *LikwidCollector) Init(config json.RawMessage) error { diff --git a/collectors/likwidMetric.md b/collectors/likwidMetric.md index 1bb211f..54640dc 100644 --- a/collectors/likwidMetric.md +++ b/collectors/likwidMetric.md @@ -7,6 +7,9 @@ The `likwid` collector is probably the most complicated collector. The LIKWID li "likwid": { "force_overwrite" : false, "invalid_to_zero" : false, + "liblikwid_path" : "/path/to/liblikwid.so", + "accessdaemon_path" : "/folder/that/contains/likwid-accessD", + "access_mode" : "direct or accessdaemon or perf_event", "eventsets": [ { "events" : { From a0acf01dc31aba502d8f4b463854b7eefdd82f7f Mon Sep 17 00:00:00 2001 From: Thomas Roehl Date: Wed, 28 Sep 2022 12:19:36 +0200 Subject: [PATCH 27/31] Build DEB package for Ubuntu 20.04 for releases --- .github/workflows/Release.yml | 61 ++++++++++++++++++++++++++++++++++- 1 file changed, 60 insertions(+), 1 deletion(-) diff --git a/.github/workflows/Release.yml b/.github/workflows/Release.yml index f7143cb..1d1906b 100644 --- a/.github/workflows/Release.yml +++ b/.github/workflows/Release.yml @@ -133,13 +133,63 @@ jobs: name: cc-metric-collector SRPM for UBI 8 path: ${{ steps.rpmbuild.outputs.SRPM }} + # + # Build on Ubuntu 20.04 using official go package + # + Ubuntu-focal-build: + runs-on: ubuntu-latest + container: ubuntu:20.04 + # The job outputs link to the outputs of the 'debrename' step + # Only job outputs can be used in child jobs + outputs: + deb : ${{steps.debrename.outputs.DEB}} + steps: + # Use apt to install development packages + - name: Install development packages + run: | + apt update && apt --assume-yes upgrade + apt --assume-yes install build-essential sed git wget bash + # Checkout git repository and submodules + # fetch-depth must be 0 to use git describe + # See: https://github.com/marketplace/actions/checkout + - name: Checkout + uses: actions/checkout@v2 + with: + submodules: recursive + fetch-depth: 0 + # Use official golang package + - name: Install Golang + run: | + wget -q https://go.dev/dl/go1.19.1.linux-amd64.tar.gz + tar -C /usr/local -xzf go1.19.1.linux-amd64.tar.gz + export PATH=/usr/local/go/bin:/usr/local/go/pkg/tool/linux_amd64:$PATH + go version + - name: DEB build MetricCollector + id: dpkg-build + run: | + export PATH=/usr/local/go/bin:/usr/local/go/pkg/tool/linux_amd64:$PATH + make DEB + - name: Rename DEB (add '_ubuntu20.04') + id: debrename + run: | + OLD_DEB_NAME=$(echo "${{steps.dpkg-build.outputs.DEB}}" | rev | cut -d '.' 
-f 2- | rev) + NEW_DEB_FILE="${OLD_DEB_NAME}_ubuntu20.04.deb" + mv "${{steps.dpkg-build.outputs.DEB}}" "${NEW_DEB_FILE}" + echo "::set-output name=DEB::${NEW_DEB_FILE}" + # See: https://github.com/actions/upload-artifact + - name: Save DEB as artifact + uses: actions/upload-artifact@v2 + with: + name: cc-metric-collector DEB for Ubuntu 20.04 + path: ${{ steps.debrename.outputs.DEB }} + # # Create release with fresh RPMs # Release: runs-on: ubuntu-latest # We need the RPMs, so add dependency - needs: [AlmaLinux-RPM-build, UBI-8-RPM-build] + needs: [AlmaLinux-RPM-build, UBI-8-RPM-build, Ubuntu-focal-build] steps: # See: https://github.com/actions/download-artifact @@ -161,6 +211,11 @@ jobs: with: name: cc-metric-collector SRPM for UBI 8 + - name: Download Ubuntu 20.04 DEB + uses: actions/download-artifact@v2 + with: + name: cc-metric-collector DEB for Ubuntu 20.04 + # The download actions do not publish the name of the downloaded file, # so we re-use the job outputs of the parent jobs. The files are all # downloaded to the current folder. @@ -174,14 +229,17 @@ jobs: ALMA_85_SRPM=$(basename "${{ needs.AlmaLinux-RPM-build.outputs.srpm}}") UBI_8_RPM=$(basename "${{ needs.UBI-8-RPM-build.outputs.rpm}}") UBI_8_SRPM=$(basename "${{ needs.UBI-8-RPM-build.outputs.srpm}}") + U_2004_DEB=$(basename "${{ needs.Ubuntu-focal-build.outputs.deb}}") echo "ALMA_85_RPM::${ALMA_85_RPM}" echo "ALMA_85_SRPM::${ALMA_85_SRPM}" echo "UBI_8_RPM::${UBI_8_RPM}" echo "UBI_8_SRPM::${UBI_8_SRPM}" + echo "U_2004_DEB::${U_2004_DEB}" echo "::set-output name=ALMA_85_RPM::${ALMA_85_RPM}" echo "::set-output name=ALMA_85_SRPM::${ALMA_85_SRPM}" echo "::set-output name=UBI_8_RPM::${UBI_8_RPM}" echo "::set-output name=UBI_8_SRPM::${UBI_8_SRPM}" + echo "::set-output name=U_2004_DEB::${U_2004_DEB}" # See: https://github.com/softprops/action-gh-release - name: Release @@ -194,3 +252,4 @@ jobs: ${{ steps.files.outputs.ALMA_85_SRPM }} ${{ steps.files.outputs.UBI_8_RPM }} ${{ steps.files.outputs.UBI_8_SRPM }} + ${{ steps.files.outputs.U_2004_DEB }} From ed511b7c09199904a04ecb80d7f69a1596505342 Mon Sep 17 00:00:00 2001 From: Thomas Roehl Date: Wed, 28 Sep 2022 15:09:36 +0200 Subject: [PATCH 28/31] Fix memstat collector with numa_stats option --- collectors/memstatMetric.go | 25 ++++++++++++++++--------- 1 file changed, 16 insertions(+), 9 deletions(-) diff --git a/collectors/memstatMetric.go b/collectors/memstatMetric.go index 9841a01..ed80de7 100644 --- a/collectors/memstatMetric.go +++ b/collectors/memstatMetric.go @@ -68,7 +68,8 @@ func getStats(filename string) map[string]MemstatStats { } else if len(linefields) == 5 { v, err := strconv.ParseFloat(linefields[3], 64) if err == nil { - stats[strings.Trim(linefields[0], ":")] = MemstatStats{ + cclog.ComponentDebug("getStats", strings.Trim(linefields[2], ":"), v, linefields[4]) + stats[strings.Trim(linefields[2], ":")] = MemstatStats{ value: v, unit: linefields[4], } @@ -188,16 +189,20 @@ func (m *MemstatCollector) Read(interval time.Duration, output chan lp.CCMetric) unit := "" if totalVal, total := stats["MemTotal"]; total { if freeVal, free := stats["MemFree"]; free { + memUsed = totalVal.value - freeVal.value + if len(totalVal.unit) > 0 { + unit = totalVal.unit + } else if len(freeVal.unit) > 0 { + unit = freeVal.unit + } if bufVal, buffers := stats["Buffers"]; buffers { + memUsed -= bufVal.value + if len(bufVal.unit) > 0 && len(unit) == 0 { + unit = bufVal.unit + } if cacheVal, cached := stats["Cached"]; cached { - memUsed = totalVal.value - (freeVal.value + bufVal.value + 
cacheVal.value) - if len(totalVal.unit) > 0 { - unit = totalVal.unit - } else if len(freeVal.unit) > 0 { - unit = freeVal.unit - } else if len(bufVal.unit) > 0 { - unit = bufVal.unit - } else if len(cacheVal.unit) > 0 { + memUsed -= cacheVal.value + if len(cacheVal.unit) > 0 && len(unit) == 0 { unit = cacheVal.unit } } @@ -215,12 +220,14 @@ func (m *MemstatCollector) Read(interval time.Duration, output chan lp.CCMetric) } if m.config.NodeStats { + cclog.ComponentInfo(m.name, MEMSTATFILE) nodestats := getStats(MEMSTATFILE) sendStats(nodestats, m.tags) } if m.config.NumaStats { for _, nodeConf := range m.nodefiles { + cclog.ComponentInfo(m.name, nodeConf.file) stats := getStats(nodeConf.file) sendStats(stats, nodeConf.tags) } From 8849824ba96b2dcf0d4a7a065bbe9e7fd35986f7 Mon Sep 17 00:00:00 2001 From: Thomas Roehl Date: Sun, 9 Oct 2022 02:56:15 +0200 Subject: [PATCH 29/31] Remove useless prints from MemstatCollector --- collectors/memstatMetric.go | 3 --- 1 file changed, 3 deletions(-) diff --git a/collectors/memstatMetric.go b/collectors/memstatMetric.go index ed80de7..366607c 100644 --- a/collectors/memstatMetric.go +++ b/collectors/memstatMetric.go @@ -161,7 +161,6 @@ func (m *MemstatCollector) Init(config json.RawMessage) error { func (m *MemstatCollector) Read(interval time.Duration, output chan lp.CCMetric) { if !m.init { - cclog.ComponentPrint(m.name, "Here") return } @@ -220,14 +219,12 @@ func (m *MemstatCollector) Read(interval time.Duration, output chan lp.CCMetric) } if m.config.NodeStats { - cclog.ComponentInfo(m.name, MEMSTATFILE) nodestats := getStats(MEMSTATFILE) sendStats(nodestats, m.tags) } if m.config.NumaStats { for _, nodeConf := range m.nodefiles { - cclog.ComponentInfo(m.name, nodeConf.file) stats := getStats(nodeConf.file) sendStats(stats, nodeConf.tags) } From 0fbff00996f17bf95d084d0557fd8ecb6982ccc1 Mon Sep 17 00:00:00 2001 From: Thomas Gruber Date: Sun, 9 Oct 2022 17:03:38 +0200 Subject: [PATCH 30/31] Replace ioutils with os and io (#87) --- collectors/beegfsmetaMetric.go | 8 ++++---- collectors/beegfsstorageMetric.go | 8 ++++---- collectors/cpufreqMetric.go | 8 ++++---- collectors/customCmdMetric.go | 6 +++--- collectors/gpfsMetric.go | 6 +++--- collectors/infinibandMetric.go | 5 ++--- collectors/likwidMetric.go | 3 +-- collectors/loadavgMetric.go | 4 ++-- collectors/tempMetric.go | 14 +++++++------- 9 files changed, 30 insertions(+), 32 deletions(-) diff --git a/collectors/beegfsmetaMetric.go b/collectors/beegfsmetaMetric.go index a27faf2..0aaea39 100644 --- a/collectors/beegfsmetaMetric.go +++ b/collectors/beegfsmetaMetric.go @@ -5,7 +5,7 @@ import ( "bytes" "encoding/json" "fmt" - "io/ioutil" + "io" "os" "os/exec" "os/user" @@ -115,7 +115,7 @@ func (m *BeegfsMetaCollector) Read(interval time.Duration, output chan lp.CCMetr return } //get mounpoint - buffer, _ := ioutil.ReadFile(string("/proc/mounts")) + buffer, _ := os.ReadFile(string("/proc/mounts")) mounts := strings.Split(string(buffer), "\n") var mountpoints []string for _, line := range mounts { @@ -157,9 +157,9 @@ func (m *BeegfsMetaCollector) Read(interval time.Duration, output chan lp.CCMetr if err != nil { fmt.Fprintf(os.Stderr, "BeegfsMetaCollector.Read(): Failed to execute command \"%s\": %s\n", cmd.String(), err.Error()) fmt.Fprintf(os.Stderr, "BeegfsMetaCollector.Read(): command exit code: \"%d\"\n", cmd.ProcessState.ExitCode()) - data, _ := ioutil.ReadAll(cmdStderr) + data, _ := io.ReadAll(cmdStderr) fmt.Fprintf(os.Stderr, "BeegfsMetaCollector.Read(): command stderr: \"%s\"\n", string(data)) - 
data, _ = ioutil.ReadAll(cmdStdout) + data, _ = io.ReadAll(cmdStdout) fmt.Fprintf(os.Stderr, "BeegfsMetaCollector.Read(): command stdout: \"%s\"\n", string(data)) return } diff --git a/collectors/beegfsstorageMetric.go b/collectors/beegfsstorageMetric.go index 1160664..bc5b370 100644 --- a/collectors/beegfsstorageMetric.go +++ b/collectors/beegfsstorageMetric.go @@ -5,7 +5,7 @@ import ( "bytes" "encoding/json" "fmt" - "io/ioutil" + "io" "os" "os/exec" "os/user" @@ -108,7 +108,7 @@ func (m *BeegfsStorageCollector) Read(interval time.Duration, output chan lp.CCM return } //get mounpoint - buffer, _ := ioutil.ReadFile(string("/proc/mounts")) + buffer, _ := os.ReadFile(string("/proc/mounts")) mounts := strings.Split(string(buffer), "\n") var mountpoints []string for _, line := range mounts { @@ -149,9 +149,9 @@ func (m *BeegfsStorageCollector) Read(interval time.Duration, output chan lp.CCM if err != nil { fmt.Fprintf(os.Stderr, "BeegfsStorageCollector.Read(): Failed to execute command \"%s\": %s\n", cmd.String(), err.Error()) fmt.Fprintf(os.Stderr, "BeegfsStorageCollector.Read(): command exit code: \"%d\"\n", cmd.ProcessState.ExitCode()) - data, _ := ioutil.ReadAll(cmdStderr) + data, _ := io.ReadAll(cmdStderr) fmt.Fprintf(os.Stderr, "BeegfsStorageCollector.Read(): command stderr: \"%s\"\n", string(data)) - data, _ = ioutil.ReadAll(cmdStdout) + data, _ = io.ReadAll(cmdStdout) fmt.Fprintf(os.Stderr, "BeegfsStorageCollector.Read(): command stdout: \"%s\"\n", string(data)) return } diff --git a/collectors/cpufreqMetric.go b/collectors/cpufreqMetric.go index e6a0081..c79be65 100644 --- a/collectors/cpufreqMetric.go +++ b/collectors/cpufreqMetric.go @@ -3,7 +3,7 @@ package collectors import ( "encoding/json" "fmt" - "io/ioutil" + "os" "path/filepath" "strconv" "strings" @@ -88,7 +88,7 @@ func (m *CPUFreqCollector) Init(config json.RawMessage) error { // Read package ID physicalPackageIDFile := filepath.Join(cpuDir, "topology", "physical_package_id") - line, err := ioutil.ReadFile(physicalPackageIDFile) + line, err := os.ReadFile(physicalPackageIDFile) if err != nil { return fmt.Errorf("unable to read physical package ID from file '%s': %v", physicalPackageIDFile, err) } @@ -100,7 +100,7 @@ func (m *CPUFreqCollector) Init(config json.RawMessage) error { // Read core ID coreIDFile := filepath.Join(cpuDir, "topology", "core_id") - line, err = ioutil.ReadFile(coreIDFile) + line, err = os.ReadFile(coreIDFile) if err != nil { return fmt.Errorf("unable to read core ID from file '%s': %v", coreIDFile, err) } @@ -188,7 +188,7 @@ func (m *CPUFreqCollector) Read(interval time.Duration, output chan lp.CCMetric) } // Read current frequency - line, err := ioutil.ReadFile(t.scalingCurFreqFile) + line, err := os.ReadFile(t.scalingCurFreqFile) if err != nil { cclog.ComponentError( m.name, diff --git a/collectors/customCmdMetric.go b/collectors/customCmdMetric.go index 492dd48..a669cca 100644 --- a/collectors/customCmdMetric.go +++ b/collectors/customCmdMetric.go @@ -3,8 +3,8 @@ package collectors import ( "encoding/json" "errors" - "io/ioutil" "log" + "os" "os/exec" "strings" "time" @@ -53,7 +53,7 @@ func (m *CustomCmdCollector) Init(config json.RawMessage) error { } } for _, f := range m.config.Files { - _, err = ioutil.ReadFile(f) + _, err = os.ReadFile(f) if err == nil { m.files = append(m.files, f) } else { @@ -106,7 +106,7 @@ func (m *CustomCmdCollector) Read(interval time.Duration, output chan lp.CCMetri } } for _, file := range m.files { - buffer, err := ioutil.ReadFile(file) + buffer, err := 
os.ReadFile(file) if err != nil { log.Print(err) return diff --git a/collectors/gpfsMetric.go b/collectors/gpfsMetric.go index ca9affe..dac6bc2 100644 --- a/collectors/gpfsMetric.go +++ b/collectors/gpfsMetric.go @@ -5,7 +5,7 @@ import ( "bytes" "encoding/json" "fmt" - "io/ioutil" + "io" "log" "os/exec" "os/user" @@ -118,8 +118,8 @@ func (m *GpfsCollector) Read(interval time.Duration, output chan lp.CCMetric) { cmd.Stderr = cmdStderr err := cmd.Run() if err != nil { - dataStdErr, _ := ioutil.ReadAll(cmdStderr) - dataStdOut, _ := ioutil.ReadAll(cmdStdout) + dataStdErr, _ := io.ReadAll(cmdStderr) + dataStdOut, _ := io.ReadAll(cmdStdout) cclog.ComponentError( m.name, fmt.Sprintf("Read(): Failed to execute command \"%s\": %v\n", cmd.String(), err), diff --git a/collectors/infinibandMetric.go b/collectors/infinibandMetric.go index d6613c5..a4de367 100644 --- a/collectors/infinibandMetric.go +++ b/collectors/infinibandMetric.go @@ -2,7 +2,6 @@ package collectors import ( "fmt" - "io/ioutil" "os" cclog "github.com/ClusterCockpit/cc-metric-collector/internal/ccLogger" @@ -85,7 +84,7 @@ func (m *InfinibandCollector) Init(config json.RawMessage) error { for _, path := range ibDirs { // Skip, when no LID is assigned - line, err := ioutil.ReadFile(filepath.Join(path, "lid")) + line, err := os.ReadFile(filepath.Join(path, "lid")) if err != nil { continue } @@ -175,7 +174,7 @@ func (m *InfinibandCollector) Read(interval time.Duration, output chan lp.CCMetr for counterName, counterDef := range info.portCounterFiles { // Read counter file - line, err := ioutil.ReadFile(counterDef.path) + line, err := os.ReadFile(counterDef.path) if err != nil { cclog.ComponentError( m.name, diff --git a/collectors/likwidMetric.go b/collectors/likwidMetric.go index f22d486..2819963 100644 --- a/collectors/likwidMetric.go +++ b/collectors/likwidMetric.go @@ -12,7 +12,6 @@ import ( "encoding/json" "errors" "fmt" - "io/ioutil" "math" "os" "os/signal" @@ -154,7 +153,7 @@ func getBaseFreq() float64 { } var freq float64 = math.NaN() for _, f := range files { - buffer, err := ioutil.ReadFile(f) + buffer, err := os.ReadFile(f) if err == nil { data := strings.Replace(string(buffer), "\n", "", -1) x, err := strconv.ParseInt(data, 0, 64) diff --git a/collectors/loadavgMetric.go b/collectors/loadavgMetric.go index 58fb102..287ad5d 100644 --- a/collectors/loadavgMetric.go +++ b/collectors/loadavgMetric.go @@ -3,7 +3,7 @@ package collectors import ( "encoding/json" "fmt" - "io/ioutil" + "os" "strconv" "strings" "time" @@ -72,7 +72,7 @@ func (m *LoadavgCollector) Read(interval time.Duration, output chan lp.CCMetric) if !m.init { return } - buffer, err := ioutil.ReadFile(LOADAVGFILE) + buffer, err := os.ReadFile(LOADAVGFILE) if err != nil { if err != nil { cclog.ComponentError( diff --git a/collectors/tempMetric.go b/collectors/tempMetric.go index af9d7fd..c9f4a16 100644 --- a/collectors/tempMetric.go +++ b/collectors/tempMetric.go @@ -3,7 +3,7 @@ package collectors import ( "encoding/json" "fmt" - "io/ioutil" + "os" "path/filepath" "strconv" "strings" @@ -83,14 +83,14 @@ func (m *TempCollector) Init(config json.RawMessage) error { // sensor name nameFile := filepath.Join(filepath.Dir(file), "name") - name, err := ioutil.ReadFile(nameFile) + name, err := os.ReadFile(nameFile) if err == nil { sensor.name = strings.TrimSpace(string(name)) } // sensor label labelFile := strings.TrimSuffix(file, "_input") + "_label" - label, err := ioutil.ReadFile(labelFile) + label, err := os.ReadFile(labelFile) if err == nil { sensor.label = 
strings.TrimSpace(string(label)) } @@ -117,7 +117,7 @@ func (m *TempCollector) Init(config json.RawMessage) error { } // Sensor file - _, err = ioutil.ReadFile(file) + _, err = os.ReadFile(file) if err != nil { continue } @@ -139,7 +139,7 @@ func (m *TempCollector) Init(config json.RawMessage) error { // max temperature if m.config.ReportMaxTemp { maxTempFile := strings.TrimSuffix(file, "_input") + "_max" - if buffer, err := ioutil.ReadFile(maxTempFile); err == nil { + if buffer, err := os.ReadFile(maxTempFile); err == nil { if x, err := strconv.ParseInt(strings.TrimSpace(string(buffer)), 10, 64); err == nil { sensor.maxTempName = strings.Replace(sensor.metricName, "temp", "max_temp", 1) sensor.maxTemp = x / 1000 @@ -150,7 +150,7 @@ func (m *TempCollector) Init(config json.RawMessage) error { // critical temperature if m.config.ReportCriticalTemp { criticalTempFile := strings.TrimSuffix(file, "_input") + "_crit" - if buffer, err := ioutil.ReadFile(criticalTempFile); err == nil { + if buffer, err := os.ReadFile(criticalTempFile); err == nil { if x, err := strconv.ParseInt(strings.TrimSpace(string(buffer)), 10, 64); err == nil { sensor.critTempName = strings.Replace(sensor.metricName, "temp", "crit_temp", 1) sensor.critTemp = x / 1000 @@ -175,7 +175,7 @@ func (m *TempCollector) Read(interval time.Duration, output chan lp.CCMetric) { for _, sensor := range m.sensors { // Read sensor file - buffer, err := ioutil.ReadFile(sensor.file) + buffer, err := os.ReadFile(sensor.file) if err != nil { cclog.ComponentError( m.name, From 6bf3bfd10a3fcfd8da53937989f4486f3962e8fc Mon Sep 17 00:00:00 2001 From: Thomas Roehl Date: Sun, 9 Oct 2022 17:05:49 +0200 Subject: [PATCH 31/31] Use lower case for error strings in RocmSmiCollector --- collectors/rocmsmiMetric.go | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/collectors/rocmsmiMetric.go b/collectors/rocmsmiMetric.go index c717a5d..d9c635a 100644 --- a/collectors/rocmsmiMetric.go +++ b/collectors/rocmsmiMetric.go @@ -66,14 +66,14 @@ func (m *RocmSmiCollector) Init(config json.RawMessage) error { ret := rocm_smi.Init() if ret != rocm_smi.STATUS_SUCCESS { - err = errors.New("Failed to initialize ROCm SMI library") + err = errors.New("failed to initialize ROCm SMI library") cclog.ComponentError(m.name, err.Error()) return err } numDevs, ret := rocm_smi.NumMonitorDevices() if ret != rocm_smi.STATUS_SUCCESS { - err = errors.New("Failed to get number of GPUs from ROCm SMI library") + err = errors.New("failed to get number of GPUs from ROCm SMI library") cclog.ComponentError(m.name, err.Error()) return err } @@ -98,14 +98,14 @@ func (m *RocmSmiCollector) Init(config json.RawMessage) error { } device, ret := rocm_smi.DeviceGetHandleByIndex(i) if ret != rocm_smi.STATUS_SUCCESS { - err = fmt.Errorf("Failed to get handle for GPU %d", i) + err = fmt.Errorf("failed to get handle for GPU %d", i) cclog.ComponentError(m.name, err.Error()) return err } pciInfo, ret := rocm_smi.DeviceGetPciInfo(device) if ret != rocm_smi.STATUS_SUCCESS { - err = fmt.Errorf("Failed to get PCI information for GPU %d", i) + err = fmt.Errorf("failed to get PCI information for GPU %d", i) cclog.ComponentError(m.name, err.Error()) return err }
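
The schedstat patches above derive a per-hwthread load from the cumulative run/wait counters on each `cpuN` line of `/proc/schedstat`. As a reading aid, here is a minimal, standalone Go sketch of that derivation. It is not part of the patch set; the `schedstatSample` type and the `main()` driver are illustrative assumptions, while the collector itself parses the counters from the proc file and keeps the previous sample per CPU.

```go
// Standalone sketch (not from the patches): how a schedstat-based
// per-hwthread load can be derived from two samples and the elapsed time.
package main

import (
	"fmt"
	"time"
)

// schedstatSample holds the cumulative run/wait times (nanoseconds)
// reported on a "cpuN" line of /proc/schedstat.
type schedstatSample struct {
	runningNs int64
	waitingNs int64
}

// loadFromSchedstat divides the nanoseconds spent running+waiting between
// two reads by the nanoseconds of wall-clock time that passed in between,
// yielding a unitless load value.
func loadFromSchedstat(prev, cur schedstatSample, elapsed time.Duration) float64 {
	deltaNs := float64(cur.runningNs-prev.runningNs) + float64(cur.waitingNs-prev.waitingNs)
	return deltaNs / (elapsed.Seconds() * 1e9)
}

func main() {
	prev := schedstatSample{runningNs: 0, waitingNs: 0}
	cur := schedstatSample{runningNs: 4_500_000_000, waitingNs: 500_000_000}
	// A hardware thread that was busy (or had a runnable task waiting)
	// for the whole 5 s interval yields a load close to 1.0.
	fmt.Printf("cpu_load_core = %.2f\n", loadFromSchedstat(prev, cur, 5*time.Second))
}
```

Because both counters are reported in nanoseconds, a value of 1.0 means the hardware thread was running or had work queued for the entire measurement interval.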
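
Patch 28 restructures the `mem_used` computation so that `Buffers` and `Cached` are only subtracted when the stat file actually provides them — `/proc/meminfo` does, while the per-NUMA-node meminfo files typically do not — and keys the 5-field `Node N Key: value kB` lines on the third field. The following is a minimal standalone Go sketch of that logic; the `memStats` type and the `main()` driver are assumptions made for the example, not code from the collector.

```go
// Standalone sketch (not from the patches): robust mem_used derivation
// that tolerates missing Buffers/Cached entries.
package main

import "fmt"

// memStats maps a meminfo key to its value (kByte).
type memStats map[string]float64

// memUsed returns MemTotal - MemFree, additionally subtracting Buffers and
// Cached only if those keys are present in the parsed stats.
func memUsed(stats memStats) (float64, bool) {
	total, haveTotal := stats["MemTotal"]
	free, haveFree := stats["MemFree"]
	if !haveTotal || !haveFree {
		return 0, false // cannot derive mem_used without both values
	}
	used := total - free
	if buffers, ok := stats["Buffers"]; ok {
		used -= buffers
	}
	if cached, ok := stats["Cached"]; ok {
		used -= cached
	}
	return used, true
}

func main() {
	// Example per-NUMA-node stats: no Buffers/Cached keys available.
	node := memStats{"MemTotal": 65_536_000, "MemFree": 12_000_000}
	if used, ok := memUsed(node); ok {
		fmt.Printf("mem_used = %.0f kByte\n", used)
	}
}
```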