mirror of
https://github.com/ClusterCockpit/cc-metric-collector.git
synced 2024-12-26 15:29:04 +01:00
Merge branch 'develop' into smartmon_collector
This commit is contained in:
commit
e2438f8cec
61
.github/workflows/Release.yml
vendored
61
.github/workflows/Release.yml
vendored
@ -133,13 +133,63 @@ jobs:
|
||||
name: cc-metric-collector SRPM for UBI 8
|
||||
path: ${{ steps.rpmbuild.outputs.SRPM }}
|
||||
|
||||
#
|
||||
# Build on Ubuntu 20.04 using official go package
|
||||
#
|
||||
Ubuntu-focal-build:
  runs-on: ubuntu-latest
  container: ubuntu:20.04
  # Only job outputs can be consumed by dependent jobs, so the output of the
  # 'debrename' step is re-exported here as the job output 'deb'.
  outputs:
    deb: ${{ steps.debrename.outputs.DEB }}
  steps:
    # Use apt to install development packages.
    # git has to be present before the checkout step runs.
    - name: Install development packages
      run: |
        apt update && apt --assume-yes upgrade
        apt --assume-yes install build-essential sed git wget bash

    # Checkout git repository and submodules
    # fetch-depth must be 0 to use git describe
    # See: https://github.com/marketplace/actions/checkout
    - name: Checkout
      uses: actions/checkout@v2
      with:
        submodules: recursive
        fetch-depth: 0

    # Use official golang package
    - name: Install Golang
      run: |
        wget -q https://go.dev/dl/go1.19.1.linux-amd64.tar.gz
        tar -C /usr/local -xzf go1.19.1.linux-amd64.tar.gz
        export PATH=/usr/local/go/bin:/usr/local/go/pkg/tool/linux_amd64:$PATH
        go version

    # The PATH export of the previous step is not visible here,
    # so it is repeated before calling make.
    # NOTE(review): the 'debrename' step reads steps.dpkg-build.outputs.DEB,
    # which this step does not set itself; presumably 'make DEB' emits it - confirm.
    - name: DEB build MetricCollector
      id: dpkg-build
      run: |
        export PATH=/usr/local/go/bin:/usr/local/go/pkg/tool/linux_amd64:$PATH
        make DEB

    # Strip the file extension, append the distribution suffix and publish the
    # new file name as step output 'DEB'. The output is written to
    # $GITHUB_OUTPUT because the '::set-output' workflow command is deprecated.
    - name: Rename DEB (add '_ubuntu20.04')
      id: debrename
      run: |
        OLD_DEB_NAME=$(echo "${{steps.dpkg-build.outputs.DEB}}" | rev | cut -d '.' -f 2- | rev)
        NEW_DEB_FILE="${OLD_DEB_NAME}_ubuntu20.04.deb"
        mv "${{steps.dpkg-build.outputs.DEB}}" "${NEW_DEB_FILE}"
        echo "DEB=${NEW_DEB_FILE}" >> "$GITHUB_OUTPUT"

    # See: https://github.com/actions/upload-artifact
    - name: Save DEB as artifact
      uses: actions/upload-artifact@v2
      with:
        name: cc-metric-collector DEB for Ubuntu 20.04
        path: ${{ steps.debrename.outputs.DEB }}
|
||||
|
||||
#
|
||||
# Create release with fresh RPMs
|
||||
#
|
||||
Release:
|
||||
runs-on: ubuntu-latest
|
||||
# We need the RPMs, so add dependency
|
||||
needs: [AlmaLinux-RPM-build, UBI-8-RPM-build]
|
||||
needs: [AlmaLinux-RPM-build, UBI-8-RPM-build, Ubuntu-focal-build]
|
||||
|
||||
steps:
|
||||
# See: https://github.com/actions/download-artifact
|
||||
@ -161,6 +211,11 @@ jobs:
|
||||
with:
|
||||
name: cc-metric-collector SRPM for UBI 8
|
||||
|
||||
- name: Download Ubuntu 20.04 DEB
|
||||
uses: actions/download-artifact@v2
|
||||
with:
|
||||
name: cc-metric-collector DEB for Ubuntu 20.04
|
||||
|
||||
# The download actions do not publish the name of the downloaded file,
|
||||
# so we re-use the job outputs of the parent jobs. The files are all
|
||||
# downloaded to the current folder.
|
||||
@ -174,14 +229,17 @@ jobs:
|
||||
ALMA_85_SRPM=$(basename "${{ needs.AlmaLinux-RPM-build.outputs.srpm}}")
|
||||
UBI_8_RPM=$(basename "${{ needs.UBI-8-RPM-build.outputs.rpm}}")
|
||||
UBI_8_SRPM=$(basename "${{ needs.UBI-8-RPM-build.outputs.srpm}}")
|
||||
U_2004_DEB=$(basename "${{ needs.Ubuntu-focal-build.outputs.deb}}")
|
||||
echo "ALMA_85_RPM::${ALMA_85_RPM}"
|
||||
echo "ALMA_85_SRPM::${ALMA_85_SRPM}"
|
||||
echo "UBI_8_RPM::${UBI_8_RPM}"
|
||||
echo "UBI_8_SRPM::${UBI_8_SRPM}"
|
||||
echo "U_2004_DEB::${U_2004_DEB}"
|
||||
echo "::set-output name=ALMA_85_RPM::${ALMA_85_RPM}"
|
||||
echo "::set-output name=ALMA_85_SRPM::${ALMA_85_SRPM}"
|
||||
echo "::set-output name=UBI_8_RPM::${UBI_8_RPM}"
|
||||
echo "::set-output name=UBI_8_SRPM::${UBI_8_SRPM}"
|
||||
echo "::set-output name=U_2004_DEB::${U_2004_DEB}"
|
||||
|
||||
# See: https://github.com/softprops/action-gh-release
|
||||
- name: Release
|
||||
@ -194,3 +252,4 @@ jobs:
|
||||
${{ steps.files.outputs.ALMA_85_SRPM }}
|
||||
${{ steps.files.outputs.UBI_8_RPM }}
|
||||
${{ steps.files.outputs.UBI_8_SRPM }}
|
||||
${{ steps.files.outputs.U_2004_DEB }}
|
||||
|
26
.github/workflows/runonce.yml
vendored
26
.github/workflows/runonce.yml
vendored
@ -32,3 +32,29 @@ jobs:
|
||||
|
||||
- name: Run MetricCollector once
|
||||
run: ./cc-metric-collector --once --config .github/ci-config.json
|
||||
|
||||
#
|
||||
# Job build-1-19
|
||||
# Build on latest Ubuntu using golang version 1.19
|
||||
#
|
||||
build-1-19:
  runs-on: ubuntu-latest
  steps:
    # Fetch the repository together with all of its submodules
    # See: https://github.com/marketplace/actions/checkout
    - name: Checkout
      uses: actions/checkout@v2
      with:
        submodules: recursive

    # Install the Go 1.19 toolchain
    # See: https://github.com/marketplace/actions/setup-go-environment
    - name: Setup Golang
      uses: actions/setup-go@v3
      with:
        go-version: '1.19'

    # Compile the collector with the default make target
    - name: Build MetricCollector
      run: make

    # Smoke test: a single collection run with the CI configuration
    - name: Run MetricCollector once
      run: ./cc-metric-collector --once --config .github/ci-config.json
|
||||
|
@ -1,6 +1,6 @@
|
||||
# cc-metric-collector
|
||||
|
||||
A node agent for measuring, processing and forwarding node level metrics. It is part of the ClusterCockpit ecosystem.
|
||||
A node agent for measuring, processing and forwarding node level metrics. It is part of the [ClusterCockpit ecosystem](./docs/introduction.md).
|
||||
|
||||
The metric collector sends (and receives) metric in the [InfluxDB line protocol](https://docs.influxdata.com/influxdb/cloud/reference/syntax/line-protocol/) as it provides flexibility while providing a separation between tags (like index columns in relational databases) and fields (like data columns).
|
||||
|
||||
@ -11,7 +11,7 @@ The receiver runs as a go routine side-by-side with the timer loop and asynchron
|
||||
# Configuration
|
||||
|
||||
Configuration is implemented using a single json document that is distributed over network and may be persisted as file.
|
||||
Supported metrics are documented [here](https://github.com/ClusterCockpit/cc-specifications/blob/master/metrics/lineprotocol_alternative.md).
|
||||
Supported metrics are documented [here](https://github.com/ClusterCockpit/cc-specifications/blob/master/interfaces/lineprotocol/README.md).
|
||||
|
||||
There is a main configuration file with basic settings that point to the other configuration files for the different components.
|
||||
|
||||
@ -26,7 +26,7 @@ There is a main configuration file with basic settings that point to the other c
|
||||
}
|
||||
```
|
||||
|
||||
The `interval` defines how often the metrics should be read and send to the sink. The `duration` tells collectors how long one measurement has to take. This is important for some collectors, like the `likwid` collector.
|
||||
The `interval` defines how often the metrics should be read and sent to the sink. The `duration` tells collectors how long one measurement has to take. This is important for some collectors, like the `likwid` collector. For more information, see [here](./docs/configuration.md).
|
||||
|
||||
See the component READMEs for their configuration:
|
||||
|
||||
@ -44,6 +44,8 @@ $ go get (requires at least golang 1.16)
|
||||
$ make
|
||||
```
|
||||
|
||||
For more information, see [here](./docs/building.md).
|
||||
|
||||
# Running
|
||||
|
||||
```
|
||||
|
@ -35,7 +35,7 @@ In contrast to the configuration files for sinks and receivers, the collectors c
|
||||
* [`nfs4stat`](./nfs4Metric.md)
|
||||
* [`cpufreq`](./cpufreqMetric.md)
|
||||
* [`cpufreq_cpuinfo`](./cpufreqCpuinfoMetric.md)
|
||||
* [`numastat`](./numastatMetric.md)
|
||||
* [`numastats`](./numastatsMetric.md)
|
||||
* [`gpfs`](./gpfsMetric.md)
|
||||
* [`beegfs_meta`](./beegfsmetaMetric.md)
|
||||
* [`beegfs_storage`](./beegfsstorageMetric.md)
|
||||
|
@ -5,7 +5,7 @@ import (
|
||||
"bytes"
|
||||
"encoding/json"
|
||||
"fmt"
|
||||
"io/ioutil"
|
||||
"io"
|
||||
"os"
|
||||
"os/exec"
|
||||
"os/user"
|
||||
@ -115,7 +115,7 @@ func (m *BeegfsMetaCollector) Read(interval time.Duration, output chan lp.CCMetr
|
||||
return
|
||||
}
|
||||
//get mounpoint
|
||||
buffer, _ := ioutil.ReadFile(string("/proc/mounts"))
|
||||
buffer, _ := os.ReadFile(string("/proc/mounts"))
|
||||
mounts := strings.Split(string(buffer), "\n")
|
||||
var mountpoints []string
|
||||
for _, line := range mounts {
|
||||
@ -157,9 +157,9 @@ func (m *BeegfsMetaCollector) Read(interval time.Duration, output chan lp.CCMetr
|
||||
if err != nil {
|
||||
fmt.Fprintf(os.Stderr, "BeegfsMetaCollector.Read(): Failed to execute command \"%s\": %s\n", cmd.String(), err.Error())
|
||||
fmt.Fprintf(os.Stderr, "BeegfsMetaCollector.Read(): command exit code: \"%d\"\n", cmd.ProcessState.ExitCode())
|
||||
data, _ := ioutil.ReadAll(cmdStderr)
|
||||
data, _ := io.ReadAll(cmdStderr)
|
||||
fmt.Fprintf(os.Stderr, "BeegfsMetaCollector.Read(): command stderr: \"%s\"\n", string(data))
|
||||
data, _ = ioutil.ReadAll(cmdStdout)
|
||||
data, _ = io.ReadAll(cmdStdout)
|
||||
fmt.Fprintf(os.Stderr, "BeegfsMetaCollector.Read(): command stdout: \"%s\"\n", string(data))
|
||||
return
|
||||
}
|
||||
|
@ -5,7 +5,7 @@ import (
|
||||
"bytes"
|
||||
"encoding/json"
|
||||
"fmt"
|
||||
"io/ioutil"
|
||||
"io"
|
||||
"os"
|
||||
"os/exec"
|
||||
"os/user"
|
||||
@ -108,7 +108,7 @@ func (m *BeegfsStorageCollector) Read(interval time.Duration, output chan lp.CCM
|
||||
return
|
||||
}
|
||||
//get mounpoint
|
||||
buffer, _ := ioutil.ReadFile(string("/proc/mounts"))
|
||||
buffer, _ := os.ReadFile(string("/proc/mounts"))
|
||||
mounts := strings.Split(string(buffer), "\n")
|
||||
var mountpoints []string
|
||||
for _, line := range mounts {
|
||||
@ -149,9 +149,9 @@ func (m *BeegfsStorageCollector) Read(interval time.Duration, output chan lp.CCM
|
||||
if err != nil {
|
||||
fmt.Fprintf(os.Stderr, "BeegfsStorageCollector.Read(): Failed to execute command \"%s\": %s\n", cmd.String(), err.Error())
|
||||
fmt.Fprintf(os.Stderr, "BeegfsStorageCollector.Read(): command exit code: \"%d\"\n", cmd.ProcessState.ExitCode())
|
||||
data, _ := ioutil.ReadAll(cmdStderr)
|
||||
data, _ := io.ReadAll(cmdStderr)
|
||||
fmt.Fprintf(os.Stderr, "BeegfsStorageCollector.Read(): command stderr: \"%s\"\n", string(data))
|
||||
data, _ = ioutil.ReadAll(cmdStdout)
|
||||
data, _ = io.ReadAll(cmdStdout)
|
||||
fmt.Fprintf(os.Stderr, "BeegfsStorageCollector.Read(): command stdout: \"%s\"\n", string(data))
|
||||
return
|
||||
}
|
||||
|
@ -38,6 +38,7 @@ var AvailableCollectors = map[string]MetricCollector{
|
||||
"beegfs_storage": new(BeegfsStorageCollector),
|
||||
"rocm_smi": new(RocmSmiCollector),
|
||||
"smartmon": new(SmartMonCollector),
|
||||
"schedstat": new(SchedstatCollector),
|
||||
}
|
||||
|
||||
// Metric collector manager data structure
|
||||
|
@ -3,7 +3,7 @@ package collectors
|
||||
import (
|
||||
"encoding/json"
|
||||
"fmt"
|
||||
"io/ioutil"
|
||||
"os"
|
||||
"path/filepath"
|
||||
"strconv"
|
||||
"strings"
|
||||
@ -88,7 +88,7 @@ func (m *CPUFreqCollector) Init(config json.RawMessage) error {
|
||||
|
||||
// Read package ID
|
||||
physicalPackageIDFile := filepath.Join(cpuDir, "topology", "physical_package_id")
|
||||
line, err := ioutil.ReadFile(physicalPackageIDFile)
|
||||
line, err := os.ReadFile(physicalPackageIDFile)
|
||||
if err != nil {
|
||||
return fmt.Errorf("unable to read physical package ID from file '%s': %v", physicalPackageIDFile, err)
|
||||
}
|
||||
@ -100,7 +100,7 @@ func (m *CPUFreqCollector) Init(config json.RawMessage) error {
|
||||
|
||||
// Read core ID
|
||||
coreIDFile := filepath.Join(cpuDir, "topology", "core_id")
|
||||
line, err = ioutil.ReadFile(coreIDFile)
|
||||
line, err = os.ReadFile(coreIDFile)
|
||||
if err != nil {
|
||||
return fmt.Errorf("unable to read core ID from file '%s': %v", coreIDFile, err)
|
||||
}
|
||||
@ -188,7 +188,7 @@ func (m *CPUFreqCollector) Read(interval time.Duration, output chan lp.CCMetric)
|
||||
}
|
||||
|
||||
// Read current frequency
|
||||
line, err := ioutil.ReadFile(t.scalingCurFreqFile)
|
||||
line, err := os.ReadFile(t.scalingCurFreqFile)
|
||||
if err != nil {
|
||||
cclog.ComponentError(
|
||||
m.name,
|
||||
|
@ -11,6 +11,7 @@ import (
|
||||
|
||||
cclog "github.com/ClusterCockpit/cc-metric-collector/internal/ccLogger"
|
||||
lp "github.com/ClusterCockpit/cc-metric-collector/internal/ccMetric"
|
||||
sysconf "github.com/tklauser/go-sysconf"
|
||||
)
|
||||
|
||||
const CPUSTATFILE = `/proc/stat`
|
||||
@ -22,9 +23,11 @@ type CpustatCollectorConfig struct {
|
||||
type CpustatCollector struct {
|
||||
metricCollector
|
||||
config CpustatCollectorConfig
|
||||
lastTimestamp time.Time // Store time stamp of last tick to derive values
|
||||
matches map[string]int
|
||||
cputags map[string]map[string]string
|
||||
nodetags map[string]string
|
||||
olddata map[string]map[string]int64
|
||||
}
|
||||
|
||||
func (m *CpustatCollector) Init(config json.RawMessage) error {
|
||||
@ -76,36 +79,48 @@ func (m *CpustatCollector) Init(config json.RawMessage) error {
|
||||
// Pre-generate tags for all CPUs
|
||||
num_cpus := 0
|
||||
m.cputags = make(map[string]map[string]string)
|
||||
m.olddata = make(map[string]map[string]int64)
|
||||
scanner := bufio.NewScanner(file)
|
||||
for scanner.Scan() {
|
||||
line := scanner.Text()
|
||||
linefields := strings.Fields(line)
|
||||
if strings.HasPrefix(linefields[0], "cpu") && strings.Compare(linefields[0], "cpu") != 0 {
|
||||
if strings.Compare(linefields[0], "cpu") == 0 {
|
||||
m.olddata["cpu"] = make(map[string]int64)
|
||||
for k, v := range m.matches {
|
||||
m.olddata["cpu"][k], _ = strconv.ParseInt(linefields[v], 0, 64)
|
||||
}
|
||||
} else if strings.HasPrefix(linefields[0], "cpu") && strings.Compare(linefields[0], "cpu") != 0 {
|
||||
cpustr := strings.TrimLeft(linefields[0], "cpu")
|
||||
cpu, _ := strconv.Atoi(cpustr)
|
||||
m.cputags[linefields[0]] = map[string]string{"type": "hwthread", "type-id": fmt.Sprintf("%d", cpu)}
|
||||
m.olddata[linefields[0]] = make(map[string]int64)
|
||||
for k, v := range m.matches {
|
||||
m.olddata[linefields[0]][k], _ = strconv.ParseInt(linefields[v], 0, 64)
|
||||
}
|
||||
num_cpus++
|
||||
}
|
||||
}
|
||||
m.lastTimestamp = time.Now()
|
||||
m.init = true
|
||||
return nil
|
||||
}
|
||||
|
||||
func (m *CpustatCollector) parseStatLine(linefields []string, tags map[string]string, output chan lp.CCMetric) {
|
||||
func (m *CpustatCollector) parseStatLine(linefields []string, tags map[string]string, output chan lp.CCMetric, now time.Time, tsdelta time.Duration) {
|
||||
values := make(map[string]float64)
|
||||
total := 0.0
|
||||
clktck, _ := sysconf.Sysconf(sysconf.SC_CLK_TCK)
|
||||
for match, index := range m.matches {
|
||||
if len(match) > 0 {
|
||||
x, err := strconv.ParseInt(linefields[index], 0, 64)
|
||||
if err == nil {
|
||||
values[match] = float64(x)
|
||||
total += values[match]
|
||||
vdiff := x - m.olddata[linefields[0]][match]
|
||||
m.olddata[linefields[0]][match] = x // Store new value for next run
|
||||
values[match] = float64(vdiff) / float64(tsdelta.Seconds()) / float64(clktck)
|
||||
}
|
||||
}
|
||||
}
|
||||
t := time.Now()
|
||||
|
||||
for name, value := range values {
|
||||
y, err := lp.New(name, tags, m.meta, map[string]interface{}{"value": (value * 100.0) / total}, t)
|
||||
y, err := lp.New(name, tags, m.meta, map[string]interface{}{"value": value * 100}, now)
|
||||
if err == nil {
|
||||
output <- y
|
||||
}
|
||||
@ -117,6 +132,9 @@ func (m *CpustatCollector) Read(interval time.Duration, output chan lp.CCMetric)
|
||||
return
|
||||
}
|
||||
num_cpus := 0
|
||||
now := time.Now()
|
||||
tsdelta := now.Sub(m.lastTimestamp)
|
||||
|
||||
file, err := os.Open(string(CPUSTATFILE))
|
||||
if err != nil {
|
||||
cclog.ComponentError(m.name, err.Error())
|
||||
@ -128,9 +146,9 @@ func (m *CpustatCollector) Read(interval time.Duration, output chan lp.CCMetric)
|
||||
line := scanner.Text()
|
||||
linefields := strings.Fields(line)
|
||||
if strings.Compare(linefields[0], "cpu") == 0 {
|
||||
m.parseStatLine(linefields, m.nodetags, output)
|
||||
m.parseStatLine(linefields, m.nodetags, output, now, tsdelta)
|
||||
} else if strings.HasPrefix(linefields[0], "cpu") {
|
||||
m.parseStatLine(linefields, m.cputags[linefields[0]], output)
|
||||
m.parseStatLine(linefields, m.cputags[linefields[0]], output, now, tsdelta)
|
||||
num_cpus++
|
||||
}
|
||||
}
|
||||
@ -139,11 +157,13 @@ func (m *CpustatCollector) Read(interval time.Duration, output chan lp.CCMetric)
|
||||
m.nodetags,
|
||||
m.meta,
|
||||
map[string]interface{}{"value": int(num_cpus)},
|
||||
time.Now(),
|
||||
now,
|
||||
)
|
||||
if err == nil {
|
||||
output <- num_cpus_metric
|
||||
}
|
||||
|
||||
m.lastTimestamp = now
|
||||
}
|
||||
|
||||
func (m *CpustatCollector) Close() {
|
||||
|
@ -3,8 +3,8 @@ package collectors
|
||||
import (
|
||||
"encoding/json"
|
||||
"errors"
|
||||
"io/ioutil"
|
||||
"log"
|
||||
"os"
|
||||
"os/exec"
|
||||
"strings"
|
||||
"time"
|
||||
@ -53,7 +53,7 @@ func (m *CustomCmdCollector) Init(config json.RawMessage) error {
|
||||
}
|
||||
}
|
||||
for _, f := range m.config.Files {
|
||||
_, err = ioutil.ReadFile(f)
|
||||
_, err = os.ReadFile(f)
|
||||
if err == nil {
|
||||
m.files = append(m.files, f)
|
||||
} else {
|
||||
@ -106,7 +106,7 @@ func (m *CustomCmdCollector) Read(interval time.Duration, output chan lp.CCMetri
|
||||
}
|
||||
}
|
||||
for _, file := range m.files {
|
||||
buffer, err := ioutil.ReadFile(file)
|
||||
buffer, err := os.ReadFile(file)
|
||||
if err != nil {
|
||||
log.Print(err)
|
||||
return
|
||||
|
@ -5,7 +5,7 @@ import (
|
||||
"bytes"
|
||||
"encoding/json"
|
||||
"fmt"
|
||||
"io/ioutil"
|
||||
"io"
|
||||
"log"
|
||||
"os/exec"
|
||||
"os/user"
|
||||
@ -118,8 +118,8 @@ func (m *GpfsCollector) Read(interval time.Duration, output chan lp.CCMetric) {
|
||||
cmd.Stderr = cmdStderr
|
||||
err := cmd.Run()
|
||||
if err != nil {
|
||||
dataStdErr, _ := ioutil.ReadAll(cmdStderr)
|
||||
dataStdOut, _ := ioutil.ReadAll(cmdStdout)
|
||||
dataStdErr, _ := io.ReadAll(cmdStderr)
|
||||
dataStdOut, _ := io.ReadAll(cmdStdout)
|
||||
cclog.ComponentError(
|
||||
m.name,
|
||||
fmt.Sprintf("Read(): Failed to execute command \"%s\": %v\n", cmd.String(), err),
|
||||
|
@ -2,7 +2,6 @@ package collectors
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"io/ioutil"
|
||||
"os"
|
||||
|
||||
cclog "github.com/ClusterCockpit/cc-metric-collector/internal/ccLogger"
|
||||
@ -21,6 +20,7 @@ const IB_BASEPATH = "/sys/class/infiniband/"
|
||||
type InfinibandCollectorMetric struct {
|
||||
path string
|
||||
unit string
|
||||
scale int64
|
||||
}
|
||||
|
||||
type InfinibandCollectorInfo struct {
|
||||
@ -84,7 +84,7 @@ func (m *InfinibandCollector) Init(config json.RawMessage) error {
|
||||
for _, path := range ibDirs {
|
||||
|
||||
// Skip, when no LID is assigned
|
||||
line, err := ioutil.ReadFile(filepath.Join(path, "lid"))
|
||||
line, err := os.ReadFile(filepath.Join(path, "lid"))
|
||||
if err != nil {
|
||||
continue
|
||||
}
|
||||
@ -113,10 +113,10 @@ func (m *InfinibandCollector) Init(config json.RawMessage) error {
|
||||
// Check access to counter files
|
||||
countersDir := filepath.Join(path, "counters")
|
||||
portCounterFiles := map[string]InfinibandCollectorMetric{
|
||||
"ib_recv": {path: filepath.Join(countersDir, "port_rcv_data"), unit: "bytes"},
|
||||
"ib_xmit": {path: filepath.Join(countersDir, "port_xmit_data"), unit: "bytes"},
|
||||
"ib_recv_pkts": {path: filepath.Join(countersDir, "port_rcv_packets"), unit: "packets"},
|
||||
"ib_xmit_pkts": {path: filepath.Join(countersDir, "port_xmit_packets"), unit: "packets"},
|
||||
"ib_recv": {path: filepath.Join(countersDir, "port_rcv_data"), unit: "bytes", scale: 4},
|
||||
"ib_xmit": {path: filepath.Join(countersDir, "port_xmit_data"), unit: "bytes", scale: 4},
|
||||
"ib_recv_pkts": {path: filepath.Join(countersDir, "port_rcv_packets"), unit: "packets", scale: 1},
|
||||
"ib_xmit_pkts": {path: filepath.Join(countersDir, "port_xmit_packets"), unit: "packets", scale: 1},
|
||||
}
|
||||
for _, counter := range portCounterFiles {
|
||||
err := unix.Access(counter.path, unix.R_OK)
|
||||
@ -174,7 +174,7 @@ func (m *InfinibandCollector) Read(interval time.Duration, output chan lp.CCMetr
|
||||
for counterName, counterDef := range info.portCounterFiles {
|
||||
|
||||
// Read counter file
|
||||
line, err := ioutil.ReadFile(counterDef.path)
|
||||
line, err := os.ReadFile(counterDef.path)
|
||||
if err != nil {
|
||||
cclog.ComponentError(
|
||||
m.name,
|
||||
@ -191,6 +191,8 @@ func (m *InfinibandCollector) Read(interval time.Duration, output chan lp.CCMetr
|
||||
fmt.Sprintf("Read(): Failed to convert Infininiband metrice %s='%s' to int64: %v", counterName, data, err))
|
||||
continue
|
||||
}
|
||||
// Scale raw value
|
||||
v *= counterDef.scale
|
||||
|
||||
// Send absolut values
|
||||
if m.config.SendAbsoluteValues {
|
||||
|
@ -12,7 +12,6 @@ import (
|
||||
"encoding/json"
|
||||
"errors"
|
||||
"fmt"
|
||||
"io/ioutil"
|
||||
"math"
|
||||
"os"
|
||||
"os/signal"
|
||||
@ -154,12 +153,13 @@ func getBaseFreq() float64 {
|
||||
}
|
||||
var freq float64 = math.NaN()
|
||||
for _, f := range files {
|
||||
buffer, err := ioutil.ReadFile(f)
|
||||
buffer, err := os.ReadFile(f)
|
||||
if err == nil {
|
||||
data := strings.Replace(string(buffer), "\n", "", -1)
|
||||
x, err := strconv.ParseInt(data, 0, 64)
|
||||
if err == nil {
|
||||
freq = float64(x) * 1e6
|
||||
freq = float64(x)
|
||||
break
|
||||
}
|
||||
}
|
||||
}
|
||||
@ -168,11 +168,11 @@ func getBaseFreq() float64 {
|
||||
C.power_init(0)
|
||||
info := C.get_powerInfo()
|
||||
if float64(info.baseFrequency) != 0 {
|
||||
freq = float64(info.baseFrequency) * 1e6
|
||||
freq = float64(info.baseFrequency)
|
||||
}
|
||||
C.power_finalize()
|
||||
}
|
||||
return freq
|
||||
return freq * 1e3
|
||||
}
|
||||
|
||||
func (m *LikwidCollector) Init(config json.RawMessage) error {
|
||||
|
@ -7,6 +7,9 @@ The `likwid` collector is probably the most complicated collector. The LIKWID li
|
||||
"likwid": {
|
||||
"force_overwrite" : false,
|
||||
"invalid_to_zero" : false,
|
||||
"liblikwid_path" : "/path/to/liblikwid.so",
|
||||
"accessdaemon_path" : "/folder/that/contains/likwid-accessD",
|
||||
"access_mode" : "direct or accessdaemon or perf_event",
|
||||
"eventsets": [
|
||||
{
|
||||
"events" : {
|
||||
|
@ -3,7 +3,7 @@ package collectors
|
||||
import (
|
||||
"encoding/json"
|
||||
"fmt"
|
||||
"io/ioutil"
|
||||
"os"
|
||||
"strconv"
|
||||
"strings"
|
||||
"time"
|
||||
@ -72,7 +72,7 @@ func (m *LoadavgCollector) Read(interval time.Duration, output chan lp.CCMetric)
|
||||
if !m.init {
|
||||
return
|
||||
}
|
||||
buffer, err := ioutil.ReadFile(LOADAVGFILE)
|
||||
buffer, err := os.ReadFile(LOADAVGFILE)
|
||||
if err != nil {
|
||||
if err != nil {
|
||||
cclog.ComponentError(
|
||||
|
@ -68,7 +68,8 @@ func getStats(filename string) map[string]MemstatStats {
|
||||
} else if len(linefields) == 5 {
|
||||
v, err := strconv.ParseFloat(linefields[3], 64)
|
||||
if err == nil {
|
||||
stats[strings.Trim(linefields[0], ":")] = MemstatStats{
|
||||
cclog.ComponentDebug("getStats", strings.Trim(linefields[2], ":"), v, linefields[4])
|
||||
stats[strings.Trim(linefields[2], ":")] = MemstatStats{
|
||||
value: v,
|
||||
unit: linefields[4],
|
||||
}
|
||||
@ -160,7 +161,6 @@ func (m *MemstatCollector) Init(config json.RawMessage) error {
|
||||
|
||||
func (m *MemstatCollector) Read(interval time.Duration, output chan lp.CCMetric) {
|
||||
if !m.init {
|
||||
cclog.ComponentPrint(m.name, "Here")
|
||||
return
|
||||
}
|
||||
|
||||
@ -188,16 +188,20 @@ func (m *MemstatCollector) Read(interval time.Duration, output chan lp.CCMetric)
|
||||
unit := ""
|
||||
if totalVal, total := stats["MemTotal"]; total {
|
||||
if freeVal, free := stats["MemFree"]; free {
|
||||
if bufVal, buffers := stats["Buffers"]; buffers {
|
||||
if cacheVal, cached := stats["Cached"]; cached {
|
||||
memUsed = totalVal.value - (freeVal.value + bufVal.value + cacheVal.value)
|
||||
memUsed = totalVal.value - freeVal.value
|
||||
if len(totalVal.unit) > 0 {
|
||||
unit = totalVal.unit
|
||||
} else if len(freeVal.unit) > 0 {
|
||||
unit = freeVal.unit
|
||||
} else if len(bufVal.unit) > 0 {
|
||||
}
|
||||
if bufVal, buffers := stats["Buffers"]; buffers {
|
||||
memUsed -= bufVal.value
|
||||
if len(bufVal.unit) > 0 && len(unit) == 0 {
|
||||
unit = bufVal.unit
|
||||
} else if len(cacheVal.unit) > 0 {
|
||||
}
|
||||
if cacheVal, cached := stats["Cached"]; cached {
|
||||
memUsed -= cacheVal.value
|
||||
if len(cacheVal.unit) > 0 && len(unit) == 0 {
|
||||
unit = cacheVal.unit
|
||||
}
|
||||
}
|
||||
|
@ -66,14 +66,14 @@ func (m *RocmSmiCollector) Init(config json.RawMessage) error {
|
||||
|
||||
ret := rocm_smi.Init()
|
||||
if ret != rocm_smi.STATUS_SUCCESS {
|
||||
err = errors.New("Failed to initialize ROCm SMI library")
|
||||
err = errors.New("failed to initialize ROCm SMI library")
|
||||
cclog.ComponentError(m.name, err.Error())
|
||||
return err
|
||||
}
|
||||
|
||||
numDevs, ret := rocm_smi.NumMonitorDevices()
|
||||
if ret != rocm_smi.STATUS_SUCCESS {
|
||||
err = errors.New("Failed to get number of GPUs from ROCm SMI library")
|
||||
err = errors.New("failed to get number of GPUs from ROCm SMI library")
|
||||
cclog.ComponentError(m.name, err.Error())
|
||||
return err
|
||||
}
|
||||
@ -98,14 +98,14 @@ func (m *RocmSmiCollector) Init(config json.RawMessage) error {
|
||||
}
|
||||
device, ret := rocm_smi.DeviceGetHandleByIndex(i)
|
||||
if ret != rocm_smi.STATUS_SUCCESS {
|
||||
err = fmt.Errorf("Failed to get handle for GPU %d", i)
|
||||
err = fmt.Errorf("failed to get handle for GPU %d", i)
|
||||
cclog.ComponentError(m.name, err.Error())
|
||||
return err
|
||||
}
|
||||
|
||||
pciInfo, ret := rocm_smi.DeviceGetPciInfo(device)
|
||||
if ret != rocm_smi.STATUS_SUCCESS {
|
||||
err = fmt.Errorf("Failed to get PCI information for GPU %d", i)
|
||||
err = fmt.Errorf("failed to get PCI information for GPU %d", i)
|
||||
cclog.ComponentError(m.name, err.Error())
|
||||
return err
|
||||
}
|
||||
|
155
collectors/schedstatMetric.go
Normal file
155
collectors/schedstatMetric.go
Normal file
@ -0,0 +1,155 @@
|
||||
package collectors
|
||||
|
||||
import (
|
||||
"encoding/json"
|
||||
"fmt"
|
||||
"bufio"
|
||||
"time"
|
||||
"os"
|
||||
"strings"
|
||||
"strconv"
|
||||
"math"
|
||||
|
||||
cclog "github.com/ClusterCockpit/cc-metric-collector/internal/ccLogger"
|
||||
lp "github.com/ClusterCockpit/cc-metric-collector/internal/ccMetric"
|
||||
)
|
||||
|
||||
const SCHEDSTATFILE = `/proc/schedstat`
|
||||
|
||||
// These are the fields we read from the JSON configuration
|
||||
type SchedstatCollectorConfig struct {
|
||||
ExcludeMetrics []string `json:"exclude_metrics,omitempty"`
|
||||
}
|
||||
|
||||
// This contains all variables we need during execution and the variables
|
||||
// defined by metricCollector (name, init, ...)
|
||||
type SchedstatCollector struct {
|
||||
metricCollector
|
||||
config SchedstatCollectorConfig // the configuration structure
|
||||
lastTimestamp time.Time // Store time stamp of last tick to derive values
|
||||
meta map[string]string // default meta information
|
||||
cputags map[string]map[string]string // default tags
|
||||
olddata map[string]map[string]int64 // default tags
|
||||
}
|
||||
|
||||
// Functions to implement MetricCollector interface
|
||||
// Init(...), Read(...), Close()
|
||||
// See: metricCollector.go
|
||||
|
||||
// Init initializes the sample collector
|
||||
// Called once by the collector manager
|
||||
// All tags, meta data tags and metrics that do not change over the runtime should be set here
|
||||
func (m *SchedstatCollector) Init(config json.RawMessage) error {
|
||||
var err error = nil
|
||||
// Always set the name early in Init() to use it in cclog.Component* functions
|
||||
m.name = "SchedstatCollector"
|
||||
// This is for later use, also call it early
|
||||
m.setup()
|
||||
// Tell whether the collector should be run in parallel with others (reading files, ...)
|
||||
// or it should be run serially, mostly for collectors acutally doing measurements
|
||||
// because they should not measure the execution of the other collectors
|
||||
m.parallel = true
|
||||
// Define meta information sent with each metric
|
||||
// (Can also be dynamic or this is the basic set with extension through AddMeta())
|
||||
m.meta = map[string]string{"source": m.name, "group": "SCHEDSTAT"}
|
||||
|
||||
// Read in the JSON configuration
|
||||
if len(config) > 0 {
|
||||
err = json.Unmarshal(config, &m.config)
|
||||
if err != nil {
|
||||
cclog.ComponentError(m.name, "Error reading config:", err.Error())
|
||||
return err
|
||||
}
|
||||
}
|
||||
|
||||
// Check input file
|
||||
file, err := os.Open(string(SCHEDSTATFILE))
|
||||
if err != nil {
|
||||
cclog.ComponentError(m.name, err.Error())
|
||||
}
|
||||
defer file.Close()
|
||||
|
||||
// Pre-generate tags for all CPUs
|
||||
num_cpus := 0
|
||||
m.cputags = make(map[string]map[string]string)
|
||||
m.olddata = make(map[string]map[string]int64)
|
||||
scanner := bufio.NewScanner(file)
|
||||
for scanner.Scan() {
|
||||
line := scanner.Text()
|
||||
linefields := strings.Fields(line)
|
||||
if strings.HasPrefix(linefields[0], "cpu") && strings.Compare(linefields[0], "cpu") != 0 {
|
||||
cpustr := strings.TrimLeft(linefields[0], "cpu")
|
||||
cpu, _ := strconv.Atoi(cpustr)
|
||||
running, _ := strconv.ParseInt(linefields[7], 10, 64)
|
||||
waiting, _ := strconv.ParseInt(linefields[8], 10, 64)
|
||||
m.cputags[linefields[0]] = map[string]string{"type": "hwthread", "type-id": fmt.Sprintf("%d", cpu)}
|
||||
m.olddata[linefields[0]] = map[string]int64{"running" : running, "waiting" : waiting}
|
||||
num_cpus++
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
// Save current timestamp
|
||||
m.lastTimestamp = time.Now()
|
||||
|
||||
// Set this flag only if everything is initialized properly, all required files exist, ...
|
||||
m.init = true
|
||||
return err
|
||||
}
|
||||
|
||||
func (m *SchedstatCollector) ParseProcLine(linefields []string, tags map[string]string, output chan lp.CCMetric, now time.Time, tsdelta time.Duration) {
|
||||
running, _ := strconv.ParseInt(linefields[7], 10, 64)
|
||||
waiting, _ := strconv.ParseInt(linefields[8], 10, 64)
|
||||
diff_running := running - m.olddata[linefields[0]]["running"]
|
||||
diff_waiting := waiting - m.olddata[linefields[0]]["waiting"]
|
||||
|
||||
var l_running float64 = float64(diff_running) / tsdelta.Seconds() / (math.Pow(1000, 3))
|
||||
var l_waiting float64 = float64(diff_waiting) / tsdelta.Seconds() / (math.Pow(1000, 3))
|
||||
|
||||
m.olddata[linefields[0]]["running"] = running
|
||||
m.olddata[linefields[0]]["waiting"] = waiting
|
||||
value := l_running + l_waiting
|
||||
|
||||
y, err := lp.New("cpu_load_core", tags, m.meta, map[string]interface{}{"value": value}, now)
|
||||
if err == nil {
|
||||
// Send it to output channel
|
||||
output <- y
|
||||
}
|
||||
}
|
||||
|
||||
// Read collects all metrics belonging to the sample collector
|
||||
// and sends them through the output channel to the collector manager
|
||||
func (m *SchedstatCollector) Read(interval time.Duration, output chan lp.CCMetric) {
|
||||
if !m.init {
|
||||
return
|
||||
}
|
||||
|
||||
//timestamps
|
||||
now := time.Now()
|
||||
tsdelta := now.Sub(m.lastTimestamp)
|
||||
|
||||
file, err := os.Open(string(SCHEDSTATFILE))
|
||||
if err != nil {
|
||||
cclog.ComponentError(m.name, err.Error())
|
||||
}
|
||||
defer file.Close()
|
||||
|
||||
scanner := bufio.NewScanner(file)
|
||||
for scanner.Scan() {
|
||||
line := scanner.Text()
|
||||
linefields := strings.Fields(line)
|
||||
if strings.HasPrefix(linefields[0], "cpu") {
|
||||
m.ParseProcLine(linefields, m.cputags[linefields[0]], output, now, tsdelta)
|
||||
}
|
||||
}
|
||||
|
||||
m.lastTimestamp = now
|
||||
|
||||
}
|
||||
|
||||
// Close metric collector: close network connection, close files, close libraries, ...
|
||||
// Called once by the collector manager
|
||||
func (m *SchedstatCollector) Close() {
|
||||
// Unset flag
|
||||
m.init = false
|
||||
}
|
11
collectors/schedstatMetric.md
Normal file
11
collectors/schedstatMetric.md
Normal file
@ -0,0 +1,11 @@
|
||||
|
||||
## `schedstat` collector
|
||||
```json
|
||||
"schedstat": {
|
||||
}
|
||||
```
|
||||
|
||||
The `schedstat` collector reads data from /proc/schedstat and calculates a load value, separated by hwthread. This might be useful to detect bad cpu pinning on shared nodes etc.
|
||||
|
||||
Metric:
|
||||
* `cpu_load_core`
|
@ -3,7 +3,7 @@ package collectors
|
||||
import (
|
||||
"encoding/json"
|
||||
"fmt"
|
||||
"io/ioutil"
|
||||
"os"
|
||||
"path/filepath"
|
||||
"strconv"
|
||||
"strings"
|
||||
@ -83,14 +83,14 @@ func (m *TempCollector) Init(config json.RawMessage) error {
|
||||
|
||||
// sensor name
|
||||
nameFile := filepath.Join(filepath.Dir(file), "name")
|
||||
name, err := ioutil.ReadFile(nameFile)
|
||||
name, err := os.ReadFile(nameFile)
|
||||
if err == nil {
|
||||
sensor.name = strings.TrimSpace(string(name))
|
||||
}
|
||||
|
||||
// sensor label
|
||||
labelFile := strings.TrimSuffix(file, "_input") + "_label"
|
||||
label, err := ioutil.ReadFile(labelFile)
|
||||
label, err := os.ReadFile(labelFile)
|
||||
if err == nil {
|
||||
sensor.label = strings.TrimSpace(string(label))
|
||||
}
|
||||
@ -117,7 +117,7 @@ func (m *TempCollector) Init(config json.RawMessage) error {
|
||||
}
|
||||
|
||||
// Sensor file
|
||||
_, err = ioutil.ReadFile(file)
|
||||
_, err = os.ReadFile(file)
|
||||
if err != nil {
|
||||
continue
|
||||
}
|
||||
@ -139,7 +139,7 @@ func (m *TempCollector) Init(config json.RawMessage) error {
|
||||
// max temperature
|
||||
if m.config.ReportMaxTemp {
|
||||
maxTempFile := strings.TrimSuffix(file, "_input") + "_max"
|
||||
if buffer, err := ioutil.ReadFile(maxTempFile); err == nil {
|
||||
if buffer, err := os.ReadFile(maxTempFile); err == nil {
|
||||
if x, err := strconv.ParseInt(strings.TrimSpace(string(buffer)), 10, 64); err == nil {
|
||||
sensor.maxTempName = strings.Replace(sensor.metricName, "temp", "max_temp", 1)
|
||||
sensor.maxTemp = x / 1000
|
||||
@ -150,7 +150,7 @@ func (m *TempCollector) Init(config json.RawMessage) error {
|
||||
// critical temperature
|
||||
if m.config.ReportCriticalTemp {
|
||||
criticalTempFile := strings.TrimSuffix(file, "_input") + "_crit"
|
||||
if buffer, err := ioutil.ReadFile(criticalTempFile); err == nil {
|
||||
if buffer, err := os.ReadFile(criticalTempFile); err == nil {
|
||||
if x, err := strconv.ParseInt(strings.TrimSpace(string(buffer)), 10, 64); err == nil {
|
||||
sensor.critTempName = strings.Replace(sensor.metricName, "temp", "crit_temp", 1)
|
||||
sensor.critTemp = x / 1000
|
||||
@ -175,7 +175,7 @@ func (m *TempCollector) Read(interval time.Duration, output chan lp.CCMetric) {
|
||||
|
||||
for _, sensor := range m.sensors {
|
||||
// Read sensor file
|
||||
buffer, err := ioutil.ReadFile(sensor.file)
|
||||
buffer, err := os.ReadFile(sensor.file)
|
||||
if err != nil {
|
||||
cclog.ComponentError(
|
||||
m.name,
|
||||
|
60
docs/building.md
Normal file
60
docs/building.md
Normal file
@ -0,0 +1,60 @@
|
||||
# Building the cc-metric-collector
|
||||
|
||||
In most cases, a simple `make` in the main folder is enough to get a `cc-metric-collector` binary. It is basically a `go build` but some collectors require additional tasks. There is currently no Golang interface to LIKWID, so it uses `cgo` to create bindings but `cgo` requires the LIKWID header files. Therefore, it checks whether LIKWID is installed and if not it downloads LIKWID and copies the headers.
|
||||
|
||||
## System integration
|
||||
|
||||
The main configuration settings for system integration are pre-defined in `scripts/cc-metric-collector.config`. The file contains the UNIX user and group used for execution, the PID file location and other settings. Adjust it accordingly and copy it to `/etc/default/cc-metric-collector`
|
||||
|
||||
```bash
|
||||
$ install --mode 644 \
|
||||
--owner $CC_USER \
|
||||
--group $CC_GROUP \
|
||||
scripts/cc-metric-collector.config /etc/default/cc-metric-collector
|
||||
$ edit /etc/default/cc-metric-collector
|
||||
```
|
||||
|
||||
### SysVinit and similar
|
||||
|
||||
If you are using an init system based on `/etc/init.d` daemons, you can use the sample `scripts/cc-metric-collector.init`. It reads the basic configuration from `/etc/default/cc-metric-collector`.
|
||||
|
||||
```bash
|
||||
$ install --mode 755 \
|
||||
--owner $CC_USER \
|
||||
--group $CC_GROUP \
|
||||
scripts/cc-metric-collector.init /etc/init.d/cc-metric-collector
|
||||
```
|
||||
|
||||
### Systemd
|
||||
|
||||
If you are using `systemd` as init system, you can use the sample systemd service file `scripts/cc-metric-collector.service` and the configuration file `scripts/cc-metric-collector.config`.
|
||||
|
||||
```bash
|
||||
$ install --mode 644 \
|
||||
--owner $CC_USER \
|
||||
--group $CC_GROUP \
|
||||
scripts/cc-metric-collector.service /etc/systemd/system/cc-metric-collector.service
|
||||
$ systemctl enable cc-metric-collector
|
||||
```
|
||||
|
||||
## RPM
|
||||
|
||||
In order to get RPM packages for cc-metric-collector, just use:
|
||||
|
||||
```bash
|
||||
$ make RPM
|
||||
```
|
||||
|
||||
It uses the RPM SPEC file `scripts/cc-metric-collector.spec` and requires the RPM tools (`rpm` and `rpmspec`) and `git`.
|
||||
|
||||
## DEB
|
||||
|
||||
In order to get very simple Debian packages for cc-metric-collector, just use:
|
||||
|
||||
```bash
|
||||
$ make DEB
|
||||
```
|
||||
|
||||
It uses the DEB control file `scripts/cc-metric-collector.control` and requires `dpkg-deb`, `awk`, `sed` and `git`. It creates only a binary deb package.
|
||||
|
||||
_This option is not well tested and therefore experimental_
|
@ -6,27 +6,52 @@ It is basically a copy of the [InfluxDB line protocol](https://github.com/influx
|
||||
|
||||
```golang
|
||||
type ccMetric struct {
|
||||
name string // same as
|
||||
tags []*influx.Tag // original
|
||||
fields []*influx.Field // Influx
|
||||
tm time.Time // line-protocol
|
||||
meta []*influx.Tag
|
||||
name string // Measurement name
|
||||
meta map[string]string // map of meta data tags
|
||||
tags map[string]string // map of tags
|
||||
fields map[string]interface{} // map of fields
|
||||
tm time.Time // timestamp
|
||||
}
|
||||
|
||||
type CCMetric interface {
|
||||
influx.MutableMetric // the same functions as defined by influx.MutableMetric
|
||||
RemoveTag(key string) // this is not published by the original influx.MutableMetric
|
||||
Meta() map[string]string
|
||||
MetaList() []*inlux.Tag
|
||||
AddMeta(key, value string)
|
||||
HasMeta(key string) bool
|
||||
GetMeta(key string) (string, bool)
|
||||
RemoveMeta(key string)
|
||||
ToPoint(metaAsTags map[string]bool) *write.Point // Generate influxDB point for data type ccMetric
|
||||
ToLineProtocol(metaAsTags map[string]bool) string // Generate influxDB line protocol for data type ccMetric
|
||||
String() string // Return line-protocol like string
|
||||
|
||||
Name() string // Get metric name
|
||||
SetName(name string) // Set metric name
|
||||
|
||||
Time() time.Time // Get timestamp
|
||||
SetTime(t time.Time) // Set timestamp
|
||||
|
||||
Tags() map[string]string // Map of tags
|
||||
AddTag(key, value string) // Add a tag
|
||||
GetTag(key string) (value string, ok bool) // Get a tag by its key
|
||||
HasTag(key string) (ok bool) // Check if a tag key is present
|
||||
RemoveTag(key string) // Remove a tag by its key
|
||||
|
||||
Meta() map[string]string // Map of meta data tags
|
||||
AddMeta(key, value string) // Add a meta data tag
|
||||
GetMeta(key string) (value string, ok bool) // Get a meta data tag addressed by its key
|
||||
HasMeta(key string) (ok bool) // Check if a meta data key is present
|
||||
RemoveMeta(key string) // Remove a meta data tag by its key
|
||||
|
||||
Fields() map[string]interface{} // Map of fields
|
||||
AddField(key string, value interface{}) // Add a field
|
||||
GetField(key string) (value interface{}, ok bool) // Get a field addressed by its key
|
||||
HasField(key string) (ok bool) // Check if a field key is present
|
||||
RemoveField(key string) // Remove a field addressed by its key
|
||||
}
|
||||
|
||||
func New(name string, tags map[string]string, meta map[string]string, fields map[string]interface{}, tm time.Time) (CCMetric, error)
|
||||
func FromMetric(other CCMetric) CCMetric
|
||||
func FromInfluxMetric(other lp.Metric) CCMetric
|
||||
```
|
||||
|
||||
The `CCMetric` interface provides the same functions as the `MutableMetric` like `{Add, Remove, Has}{Tag, Field}` and additionally provides `{Add, Remove, Has}Meta`.
|
||||
The `CCMetric` interface provides the same functions as the `MutableMetric` like `{Add, Get, Remove, Has}{Tag, Field}` and additionally provides `{Add, Get, Remove, Has}Meta`.
|
||||
|
||||
The InfluxDB protocol creates a new metric with `influx.New(name, tags, fields, time)` while CCMetric uses `ccMetric.New(name, tags, meta, fields, time)` where `tags` and `meta` are both of type `map[string]string`.
|
||||
|
||||
You can copy a CCMetric with `FromMetric(other CCMetric) CCMetric`. If you get an `influx.Metric` from a function, like the line protocol parser, you can use `FromInfluxMetric(other influx.Metric) CCMetric` to get a CCMetric out of it (see `NatsReceiver` for an example).
|
||||
|
||||
Although the [cc-specifications](https://github.com/ClusterCockpit/cc-specifications/blob/master/interfaces/lineprotocol/README.md) defines that there is only a `value` field for the metric value, the CCMetric still can have multiple values similar to the InfluxDB line protocol.
|
||||
|
@ -50,6 +50,7 @@ type CCMetric interface {
|
||||
GetField(key string) (value interface{}, ok bool) // Get a field addressed by its key
|
||||
HasField(key string) (ok bool) // Check if a field key is present
|
||||
RemoveField(key string) // Remove a field addressed by its key
|
||||
String() string // Return line-protocol like string
|
||||
}
|
||||
|
||||
// String implements the stringer interface for data type ccMetric
|
||||
@ -217,23 +218,26 @@ func New(
|
||||
}
|
||||
|
||||
// FromMetric copies the metric <other>
|
||||
func FromMetric(other ccMetric) CCMetric {
|
||||
func FromMetric(other CCMetric) CCMetric {
|
||||
otags := other.Tags()
|
||||
ometa := other.Meta()
|
||||
ofields := other.Fields()
|
||||
m := &ccMetric{
|
||||
name: other.Name(),
|
||||
tags: make(map[string]string, len(other.tags)),
|
||||
meta: make(map[string]string, len(other.meta)),
|
||||
fields: make(map[string]interface{}, len(other.fields)),
|
||||
tags: make(map[string]string, len(otags)),
|
||||
meta: make(map[string]string, len(ometa)),
|
||||
fields: make(map[string]interface{}, len(ofields)),
|
||||
tm: other.Time(),
|
||||
}
|
||||
|
||||
// deep copy tags, meta data tags and fields
|
||||
for key, value := range other.tags {
|
||||
for key, value := range otags {
|
||||
m.tags[key] = value
|
||||
}
|
||||
for key, value := range other.meta {
|
||||
for key, value := range ometa {
|
||||
m.meta[key] = value
|
||||
}
|
||||
for key, value := range other.fields {
|
||||
for key, value := range ofields {
|
||||
m.fields[key] = value
|
||||
}
|
||||
return m
|
||||
|
@ -5,7 +5,7 @@ import (
|
||||
"encoding/json"
|
||||
"errors"
|
||||
"fmt"
|
||||
"io/ioutil"
|
||||
"io"
|
||||
"net/http"
|
||||
"strings"
|
||||
"sync"
|
||||
@ -84,7 +84,7 @@ func (r *HttpReceiver) ServerHttp(w http.ResponseWriter, req *http.Request) {
|
||||
return
|
||||
}
|
||||
|
||||
body, err := ioutil.ReadAll(req.Body)
|
||||
body, err := io.ReadAll(req.Body)
|
||||
if err != nil {
|
||||
http.Error(w, err.Error(), http.StatusInternalServerError)
|
||||
return
|
||||
|
@ -6,6 +6,7 @@ import (
|
||||
"fmt"
|
||||
"net/http"
|
||||
"strconv"
|
||||
"strings"
|
||||
"sync"
|
||||
"time"
|
||||
|
||||
@ -14,113 +15,244 @@ import (
|
||||
|
||||
// See: https://pkg.go.dev/github.com/stmcginnis/gofish
|
||||
"github.com/stmcginnis/gofish"
|
||||
"github.com/stmcginnis/gofish/common"
|
||||
"github.com/stmcginnis/gofish/redfish"
|
||||
)
|
||||
|
||||
type RedfishReceiverClientConfig struct {
|
||||
|
||||
// Hostname the redfish service belongs to
|
||||
Hostname string
|
||||
|
||||
// is metric excluded globally or per client
|
||||
isExcluded map[string](bool)
|
||||
|
||||
doPowerMetric bool
|
||||
doProcessorMetrics bool
|
||||
doThermalMetrics bool
|
||||
|
||||
skipProcessorMetricsURL map[string]bool
|
||||
|
||||
gofish gofish.ClientConfig
|
||||
}
|
||||
|
||||
// RedfishReceiver configuration:
|
||||
type RedfishReceiver struct {
|
||||
receiver
|
||||
|
||||
config struct {
|
||||
Type string `json:"type"`
|
||||
|
||||
// Maximum number of simultaneous redfish connections (default: 64)
|
||||
Fanout int `json:"fanout,omitempty"`
|
||||
// How often the redfish power metrics should be read and send to the sink (default: 30 s)
|
||||
IntervalString string `json:"interval,omitempty"`
|
||||
fanout int
|
||||
Interval time.Duration
|
||||
|
||||
// Control whether a client verifies the server's certificate (default: true)
|
||||
HttpInsecure bool `json:"http_insecure,omitempty"`
|
||||
// Time limit for requests made by this HTTP client (default: 10 s)
|
||||
HttpTimeoutString string `json:"http_timeout,omitempty"`
|
||||
HttpTimeout time.Duration
|
||||
|
||||
// Client config for each redfish service
|
||||
ClientConfigs []struct {
|
||||
Hostname *string `json:"hostname"`
|
||||
Username *string `json:"username"`
|
||||
Password *string `json:"password"`
|
||||
Endpoint *string `json:"endpoint"`
|
||||
ExcludeMetrics []string `json:"exclude_metrics,omitempty"`
|
||||
gofish gofish.ClientConfig
|
||||
} `json:"client_config"`
|
||||
ClientConfigs []RedfishReceiverClientConfig
|
||||
}
|
||||
|
||||
done chan bool // channel to finish / stop redfish receiver
|
||||
wg sync.WaitGroup // wait group for redfish receiver
|
||||
}
|
||||
|
||||
// Start starts the redfish receiver
|
||||
func (r *RedfishReceiver) Start() {
|
||||
cclog.ComponentDebug(r.name, "START")
|
||||
// readThermalMetrics reads thermal metrics from a redfish device
|
||||
func (r *RedfishReceiver) readThermalMetrics(
|
||||
clientConfig *RedfishReceiverClientConfig,
|
||||
chassis *redfish.Chassis) error {
|
||||
|
||||
// readPowerMetric reads redfish power metric from the endpoint configured in conf
|
||||
readPowerMetric := func(clientConfigIndex int) error {
|
||||
|
||||
clientConfig := &r.config.ClientConfigs[clientConfigIndex]
|
||||
|
||||
// Connect to redfish service
|
||||
c, err := gofish.Connect(clientConfig.gofish)
|
||||
// Get thermal information for each chassis
|
||||
thermal, err := chassis.Thermal()
|
||||
if err != nil {
|
||||
return fmt.Errorf(
|
||||
"readPowerMetric: gofish.Connect({Username: %v, Endpoint: %v, BasicAuth: %v, HttpTimeout: %v, HttpInsecure: %v}) failed: %v",
|
||||
clientConfig.gofish.Username,
|
||||
clientConfig.gofish.Endpoint,
|
||||
clientConfig.gofish.BasicAuth,
|
||||
clientConfig.gofish.HTTPClient.Timeout,
|
||||
clientConfig.gofish.HTTPClient.Transport.(*http.Transport).TLSClientConfig.InsecureSkipVerify,
|
||||
err)
|
||||
}
|
||||
defer c.Logout()
|
||||
|
||||
// Get all chassis managed by this service
|
||||
chassis_list, err := c.Service.Chassis()
|
||||
if err != nil {
|
||||
return fmt.Errorf("readPowerMetric: c.Service.Chassis() failed: %v", err)
|
||||
return fmt.Errorf("readMetrics: chassis.Thermal() failed: %v", err)
|
||||
}
|
||||
|
||||
// Skip empty thermal information
|
||||
if thermal == nil {
|
||||
return nil
|
||||
}
|
||||
|
||||
for _, chassis := range chassis_list {
|
||||
timestamp := time.Now()
|
||||
|
||||
for _, temperature := range thermal.Temperatures {
|
||||
|
||||
// Skip, when temperature metric is excluded
|
||||
if clientConfig.isExcluded["temperature"] {
|
||||
break
|
||||
}
|
||||
|
||||
// Skip all temperatures which are not in enabled state
|
||||
if temperature.Status.State != "" && temperature.Status.State != common.EnabledState {
|
||||
continue
|
||||
}
|
||||
|
||||
tags := map[string]string{
|
||||
"hostname": clientConfig.Hostname,
|
||||
"type": "node",
|
||||
// ChassisType shall indicate the physical form factor for the type of chassis
|
||||
"chassis_typ": string(chassis.ChassisType),
|
||||
// Chassis name
|
||||
"chassis_name": chassis.Name,
|
||||
// ID uniquely identifies the resource
|
||||
"temperature_id": temperature.ID,
|
||||
// MemberID shall uniquely identify the member within the collection. For
|
||||
// services supporting Redfish v1.6 or higher, this value shall be the
|
||||
// zero-based array index.
|
||||
"temperature_member_id": temperature.MemberID,
|
||||
// PhysicalContext shall be a description of the affected device or region
|
||||
// within the chassis to which this temperature measurement applies
|
||||
"temperature_physical_context": string(temperature.PhysicalContext),
|
||||
// Name
|
||||
"temperature_name": temperature.Name,
|
||||
}
|
||||
|
||||
// Delete empty tags
|
||||
for key, value := range tags {
|
||||
if value == "" {
|
||||
delete(tags, key)
|
||||
}
|
||||
}
|
||||
|
||||
// Set meta data tags
|
||||
meta := map[string]string{
|
||||
"source": r.name,
|
||||
"group": "Temperature",
|
||||
"unit": "degC",
|
||||
}
|
||||
|
||||
// ReadingCelsius shall be the current value of the temperature sensor's reading.
|
||||
value := temperature.ReadingCelsius
|
||||
|
||||
y, err := lp.New("temperature", tags, meta,
|
||||
map[string]interface{}{
|
||||
"value": value,
|
||||
},
|
||||
timestamp)
|
||||
if err == nil {
|
||||
r.sink <- y
|
||||
}
|
||||
}
|
||||
|
||||
for _, fan := range thermal.Fans {
|
||||
// Skip, when fan_speed metric is excluded
|
||||
if clientConfig.isExcluded["fan_speed"] {
|
||||
break
|
||||
}
|
||||
|
||||
// Skip all fans which are not in enabled state
|
||||
if fan.Status.State != common.EnabledState {
|
||||
continue
|
||||
}
|
||||
|
||||
tags := map[string]string{
|
||||
"hostname": clientConfig.Hostname,
|
||||
"type": "node",
|
||||
// ChassisType shall indicate the physical form factor for the type of chassis
|
||||
"chassis_typ": string(chassis.ChassisType),
|
||||
// Chassis name
|
||||
"chassis_name": chassis.Name,
|
||||
// ID uniquely identifies the resource
|
||||
"fan_id": fan.ID,
|
||||
// MemberID shall uniquely identify the member within the collection. For
|
||||
// services supporting Redfish v1.6 or higher, this value shall be the
|
||||
// zero-based array index.
|
||||
"fan_member_id": fan.MemberID,
|
||||
// PhysicalContext shall be a description of the affected device or region
|
||||
// within the chassis to which this fan is associated
|
||||
"fan_physical_context": string(fan.PhysicalContext),
|
||||
// Name
|
||||
"fan_name": fan.Name,
|
||||
}
|
||||
|
||||
// Delete empty tags
|
||||
for key, value := range tags {
|
||||
if value == "" {
|
||||
delete(tags, key)
|
||||
}
|
||||
}
|
||||
|
||||
// Set meta data tags
|
||||
meta := map[string]string{
|
||||
"source": r.name,
|
||||
"group": "FanSpeed",
|
||||
"unit": string(fan.ReadingUnits),
|
||||
}
|
||||
|
||||
// Reading shall be the current value of the fan sensor's reading
|
||||
value := fan.Reading
|
||||
|
||||
y, err := lp.New("fan_speed", tags, meta,
|
||||
map[string]interface{}{
|
||||
"value": value,
|
||||
},
|
||||
timestamp)
|
||||
if err == nil {
|
||||
r.sink <- y
|
||||
}
|
||||
}
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
// readPowerMetrics reads power metrics from a redfish device
|
||||
func (r *RedfishReceiver) readPowerMetrics(
|
||||
clientConfig *RedfishReceiverClientConfig,
|
||||
chassis *redfish.Chassis) error {
|
||||
|
||||
// Get power information for each chassis
|
||||
power, err := chassis.Power()
|
||||
if err != nil {
|
||||
return fmt.Errorf("readPowerMetric: chassis.Power() failed: %v", err)
|
||||
return fmt.Errorf("readMetrics: chassis.Power() failed: %v", err)
|
||||
}
|
||||
|
||||
// Skip empty power information
|
||||
if power == nil {
|
||||
continue
|
||||
return nil
|
||||
}
|
||||
|
||||
timestamp := time.Now()
|
||||
|
||||
// Read min, max and average consumed watts for each power control
|
||||
for _, pc := range power.PowerControl {
|
||||
|
||||
// Skip all power controls which are not in enabled state
|
||||
if pc.Status.State != "" && pc.Status.State != common.EnabledState {
|
||||
continue
|
||||
}
|
||||
|
||||
// Map of collected metrics
|
||||
metrics := map[string]float32{
|
||||
metrics := make(map[string]float32)
|
||||
|
||||
// PowerConsumedWatts shall represent the actual power being consumed (in
|
||||
// Watts) by the chassis
|
||||
"consumed_watts": pc.PowerConsumedWatts,
|
||||
if !clientConfig.isExcluded["consumed_watts"] {
|
||||
metrics["consumed_watts"] = pc.PowerConsumedWatts
|
||||
}
|
||||
// AverageConsumedWatts shall represent the
|
||||
// average power level that occurred averaged over the last IntervalInMin
|
||||
// minutes.
|
||||
"average_consumed_watts": pc.PowerMetrics.AverageConsumedWatts,
|
||||
if !clientConfig.isExcluded["average_consumed_watts"] {
|
||||
metrics["average_consumed_watts"] = pc.PowerMetrics.AverageConsumedWatts
|
||||
}
|
||||
// MinConsumedWatts shall represent the
|
||||
// minimum power level in watts that occurred within the last
|
||||
// IntervalInMin minutes.
|
||||
"min_consumed_watts": pc.PowerMetrics.MinConsumedWatts,
|
||||
if !clientConfig.isExcluded["min_consumed_watts"] {
|
||||
metrics["min_consumed_watts"] = pc.PowerMetrics.MinConsumedWatts
|
||||
}
|
||||
// MaxConsumedWatts shall represent the
|
||||
// maximum power level in watts that occurred within the last
|
||||
// IntervalInMin minutes
|
||||
"max_consumed_watts": pc.PowerMetrics.MaxConsumedWatts,
|
||||
}
|
||||
intervalInMin := strconv.FormatFloat(float64(pc.PowerMetrics.IntervalInMin), 'f', -1, 32)
|
||||
|
||||
// Metrics to exclude
|
||||
for _, key := range clientConfig.ExcludeMetrics {
|
||||
delete(metrics, key)
|
||||
if !clientConfig.isExcluded["max_consumed_watts"] {
|
||||
metrics["max_consumed_watts"] = pc.PowerMetrics.MaxConsumedWatts
|
||||
}
|
||||
// IntervalInMin shall represent the time interval (or window), in minutes,
|
||||
// in which the PowerMetrics properties are measured over.
|
||||
// Should be an integer, but some Dell implementations return as a float
|
||||
intervalInMin :=
|
||||
strconv.FormatFloat(
|
||||
float64(pc.PowerMetrics.IntervalInMin), 'f', -1, 32)
|
||||
|
||||
// Set tags
|
||||
tags := map[string]string{
|
||||
"hostname": *clientConfig.Hostname,
|
||||
"hostname": clientConfig.Hostname,
|
||||
"type": "node",
|
||||
// ChassisType shall indicate the physical form factor for the type of chassis
|
||||
"chassis_typ": string(chassis.ChassisType),
|
||||
@ -173,27 +305,213 @@ func (r *RedfishReceiver) Start() {
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
// readProcessorMetrics reads processor metrics from a redfish device
|
||||
// See: https://redfish.dmtf.org/schemas/v1/ProcessorMetrics.json
|
||||
func (r *RedfishReceiver) readProcessorMetrics(
|
||||
clientConfig *RedfishReceiverClientConfig,
|
||||
processor *redfish.Processor) error {
|
||||
|
||||
timestamp := time.Now()
|
||||
|
||||
// URL to processor metrics
|
||||
URL := processor.ODataID + "/ProcessorMetrics"
|
||||
|
||||
// Skip previously detected non existing URLs
|
||||
if clientConfig.skipProcessorMetricsURL[URL] {
|
||||
return nil
|
||||
}
|
||||
|
||||
// doReadPowerMetric read power metrics for all configure redfish services.
|
||||
// To compensate latencies of the Redfish services a fanout is used.
|
||||
doReadPowerMetric := func() {
|
||||
|
||||
// Compute fanout to use
|
||||
realFanout := r.config.Fanout
|
||||
if len(r.config.ClientConfigs) < realFanout {
|
||||
realFanout = len(r.config.ClientConfigs)
|
||||
resp, err := processor.Client.Get(URL)
|
||||
if err != nil {
|
||||
// Skip non existing URLs
|
||||
if statusCode := err.(*common.Error).HTTPReturnedStatusCode; statusCode == http.StatusNotFound {
|
||||
clientConfig.skipProcessorMetricsURL[URL] = true
|
||||
return nil
|
||||
}
|
||||
|
||||
return fmt.Errorf("processor.Client.Get(%v) failed: %+w", URL, err)
|
||||
}
|
||||
|
||||
var processorMetrics struct {
|
||||
common.Entity
|
||||
ODataType string `json:"@odata.type"`
|
||||
ODataEtag string `json:"@odata.etag"`
|
||||
Description string `json:"Description"`
|
||||
// This property shall contain the power, in watts, that the processor has consumed.
|
||||
ConsumedPowerWatt float32 `json:"ConsumedPowerWatt"`
|
||||
// This property shall contain the temperature, in Celsius, of the processor.
|
||||
TemperatureCelsius float32 `json:"TemperatureCelsius"`
|
||||
}
|
||||
err = json.NewDecoder(resp.Body).Decode(&processorMetrics)
|
||||
if err != nil {
|
||||
return fmt.Errorf("unable to decode JSON for processor metrics: %+w", err)
|
||||
}
|
||||
processorMetrics.SetClient(processor.Client)
|
||||
|
||||
// Set tags
|
||||
tags := map[string]string{
|
||||
"hostname": clientConfig.Hostname,
|
||||
"type": "socket",
|
||||
// ProcessorType shall contain the string which identifies the type of processor contained in this Socket
|
||||
"processor_typ": string(processor.ProcessorType),
|
||||
// Processor name
|
||||
"processor_name": processor.Name,
|
||||
// ID uniquely identifies the resource
|
||||
"processor_id": processor.ID,
|
||||
}
|
||||
|
||||
// Delete empty tags
|
||||
for key, value := range tags {
|
||||
if value == "" {
|
||||
delete(tags, key)
|
||||
}
|
||||
}
|
||||
|
||||
// Set meta data tags
|
||||
metaPower := map[string]string{
|
||||
"source": r.name,
|
||||
"group": "Energy",
|
||||
"unit": "watts",
|
||||
}
|
||||
|
||||
namePower := "consumed_power"
|
||||
|
||||
if !clientConfig.isExcluded[namePower] {
|
||||
y, err := lp.New(namePower, tags, metaPower,
|
||||
map[string]interface{}{
|
||||
"value": processorMetrics.ConsumedPowerWatt,
|
||||
},
|
||||
timestamp)
|
||||
if err == nil {
|
||||
r.sink <- y
|
||||
}
|
||||
}
|
||||
// Set meta data tags
|
||||
metaThermal := map[string]string{
|
||||
"source": r.name,
|
||||
"group": "Temperature",
|
||||
"unit": "degC",
|
||||
}
|
||||
|
||||
nameThermal := "temperature"
|
||||
|
||||
if !clientConfig.isExcluded[nameThermal] {
|
||||
y, err := lp.New(nameThermal, tags, metaThermal,
|
||||
map[string]interface{}{
|
||||
"value": processorMetrics.TemperatureCelsius,
|
||||
},
|
||||
timestamp)
|
||||
if err == nil {
|
||||
r.sink <- y
|
||||
}
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
// readMetrics reads redfish thermal, power and processor metrics from the redfish device
|
||||
// configured in clientConfig
|
||||
func (r *RedfishReceiver) readMetrics(clientConfig *RedfishReceiverClientConfig) error {
|
||||
|
||||
// Connect to redfish service
|
||||
c, err := gofish.Connect(clientConfig.gofish)
|
||||
if err != nil {
|
||||
return fmt.Errorf(
|
||||
"readMetrics: gofish.Connect({Username: %v, Endpoint: %v, BasicAuth: %v, HttpTimeout: %v, HttpInsecure: %v}) failed: %v",
|
||||
clientConfig.gofish.Username,
|
||||
clientConfig.gofish.Endpoint,
|
||||
clientConfig.gofish.BasicAuth,
|
||||
clientConfig.gofish.HTTPClient.Timeout,
|
||||
clientConfig.gofish.HTTPClient.Transport.(*http.Transport).TLSClientConfig.InsecureSkipVerify,
|
||||
err)
|
||||
}
|
||||
defer c.Logout()
|
||||
|
||||
// Create a session, when required
|
||||
if _, err = c.GetSession(); err != nil {
|
||||
c, err = c.CloneWithSession()
|
||||
if err != nil {
|
||||
return fmt.Errorf("readMetrics: Failed to create a session: %+w", err)
|
||||
}
|
||||
}
|
||||
|
||||
// Get all chassis managed by this service
|
||||
isChassisListRequired :=
|
||||
clientConfig.doThermalMetrics ||
|
||||
clientConfig.doPowerMetric
|
||||
var chassisList []*redfish.Chassis
|
||||
if isChassisListRequired {
|
||||
chassisList, err = c.Service.Chassis()
|
||||
if err != nil {
|
||||
return fmt.Errorf("readMetrics: c.Service.Chassis() failed: %v", err)
|
||||
}
|
||||
}
|
||||
|
||||
// Get all computer systems managed by this service
|
||||
isComputerSystemListRequired := clientConfig.doProcessorMetrics
|
||||
var computerSystemList []*redfish.ComputerSystem
|
||||
if isComputerSystemListRequired {
|
||||
computerSystemList, err = c.Service.Systems()
|
||||
if err != nil {
|
||||
return fmt.Errorf("readMetrics: c.Service.Systems() failed: %v", err)
|
||||
}
|
||||
}
|
||||
|
||||
// read thermal metrics
|
||||
if clientConfig.doThermalMetrics {
|
||||
for _, chassis := range chassisList {
|
||||
err := r.readThermalMetrics(clientConfig, chassis)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// read power metrics
|
||||
if clientConfig.doPowerMetric {
|
||||
for _, chassis := range chassisList {
|
||||
err = r.readPowerMetrics(clientConfig, chassis)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// read processor metrics
|
||||
if clientConfig.doProcessorMetrics {
|
||||
// loop for all computer systems
|
||||
for _, system := range computerSystemList {
|
||||
|
||||
// loop for all processors
|
||||
processors, err := system.Processors()
|
||||
if err != nil {
|
||||
return fmt.Errorf("readMetrics: system.Processors() failed: %v", err)
|
||||
}
|
||||
for _, processor := range processors {
|
||||
err := r.readProcessorMetrics(clientConfig, processor)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
// doReadMetric reads metrics from all configured redfish devices.
|
||||
// To compensate latencies of the Redfish devices a fanout is used.
|
||||
func (r *RedfishReceiver) doReadMetric() {
|
||||
|
||||
// Create wait group and input channel for workers
|
||||
var workerWaitGroup sync.WaitGroup
|
||||
workerInput := make(chan int, realFanout)
|
||||
workerInput := make(chan *RedfishReceiverClientConfig, r.config.fanout)
|
||||
|
||||
// Create worker go routines
|
||||
for i := 0; i < realFanout; i++ {
|
||||
for i := 0; i < r.config.fanout; i++ {
|
||||
// Increment worker wait group counter
|
||||
workerWaitGroup.Add(1)
|
||||
go func() {
|
||||
@ -201,8 +519,8 @@ func (r *RedfishReceiver) Start() {
|
||||
defer workerWaitGroup.Done()
|
||||
|
||||
// Read power metrics for each client config
|
||||
for clientConfigIndex := range workerInput {
|
||||
err := readPowerMetric(clientConfigIndex)
|
||||
for clientConfig := range workerInput {
|
||||
err := r.readMetrics(clientConfig)
|
||||
if err != nil {
|
||||
cclog.ComponentError(r.name, err)
|
||||
}
|
||||
@ -212,9 +530,10 @@ func (r *RedfishReceiver) Start() {
|
||||
|
||||
// Distribute client configs to workers
|
||||
for i := range r.config.ClientConfigs {
|
||||
|
||||
// Check done channel status
|
||||
select {
|
||||
case workerInput <- i:
|
||||
case workerInput <- &r.config.ClientConfigs[i]:
|
||||
case <-r.done:
|
||||
// process done event
|
||||
// Stop workers, clear channel and wait for all workers to finish
|
||||
@ -229,7 +548,11 @@ func (r *RedfishReceiver) Start() {
|
||||
// Stop workers and wait for all workers to finish
|
||||
close(workerInput)
|
||||
workerWaitGroup.Wait()
|
||||
}
|
||||
}
|
||||
|
||||
// Start starts the redfish receiver
|
||||
func (r *RedfishReceiver) Start() {
|
||||
cclog.ComponentDebug(r.name, "START")
|
||||
|
||||
// Start redfish receiver
|
||||
r.wg.Add(1)
|
||||
@ -241,10 +564,15 @@ func (r *RedfishReceiver) Start() {
|
||||
defer ticker.Stop()
|
||||
|
||||
for {
|
||||
doReadPowerMetric()
|
||||
r.doReadMetric()
|
||||
|
||||
select {
|
||||
case <-ticker.C:
|
||||
case tickerTime := <-ticker.C:
|
||||
// Check if we missed the ticker event
|
||||
if since := time.Since(tickerTime); since > 5*time.Second {
|
||||
cclog.ComponentInfo(r.name, "Missed ticker event for more then", since)
|
||||
}
|
||||
|
||||
// process ticker event -> continue
|
||||
continue
|
||||
case <-r.done:
|
||||
@ -257,7 +585,7 @@ func (r *RedfishReceiver) Start() {
|
||||
cclog.ComponentDebug(r.name, "STARTED")
|
||||
}
|
||||
|
||||
// Close redfish receiver
|
||||
// Close closes the redfish receiver
|
||||
func (r *RedfishReceiver) Close() {
|
||||
cclog.ComponentDebug(r.name, "CLOSE")
|
||||
|
||||
@ -268,40 +596,84 @@ func (r *RedfishReceiver) Close() {
|
||||
cclog.ComponentDebug(r.name, "DONE")
|
||||
}
|
||||
|
||||
// New function to create a new instance of the receiver
|
||||
// NewRedfishReceiver creates a new instance of the redfish receiver
|
||||
// Initialize the receiver by giving it a name and reading in the config JSON
|
||||
func NewRedfishReceiver(name string, config json.RawMessage) (Receiver, error) {
|
||||
r := new(RedfishReceiver)
|
||||
|
||||
// Config options from config file
|
||||
configJSON := struct {
|
||||
Type string `json:"type"`
|
||||
|
||||
// Maximum number of simultaneous redfish connections (default: 64)
|
||||
Fanout int `json:"fanout,omitempty"`
|
||||
// How often the redfish power metrics should be read and send to the sink (default: 30 s)
|
||||
IntervalString string `json:"interval,omitempty"`
|
||||
|
||||
// Control whether a client verifies the server's certificate
|
||||
// (default: true == do not verify server's certificate)
|
||||
HttpInsecure bool `json:"http_insecure,omitempty"`
|
||||
// Time limit for requests made by this HTTP client (default: 10 s)
|
||||
HttpTimeoutString string `json:"http_timeout,omitempty"`
|
||||
|
||||
// Default client username, password and endpoint
|
||||
Username *string `json:"username"` // User name to authenticate with
|
||||
Password *string `json:"password"` // Password to use for authentication
|
||||
Endpoint *string `json:"endpoint"` // URL of the redfish service
|
||||
|
||||
// Globally disable collection of power, processor or thermal metrics
|
||||
DisablePowerMetrics bool `json:"disable_power_metrics"`
|
||||
DisableProcessorMetrics bool `json:"disable_processor_metrics"`
|
||||
DisableThermalMetrics bool `json:"disable_thermal_metrics"`
|
||||
|
||||
// Globally excluded metrics
|
||||
ExcludeMetrics []string `json:"exclude_metrics,omitempty"`
|
||||
|
||||
ClientConfigs []struct {
|
||||
HostList []string `json:"host_list"` // List of hosts with the same client configuration
|
||||
Username *string `json:"username"` // User name to authenticate with
|
||||
Password *string `json:"password"` // Password to use for authentication
|
||||
Endpoint *string `json:"endpoint"` // URL of the redfish service
|
||||
|
||||
// Per client disable collection of power,processor or thermal metrics
|
||||
DisablePowerMetrics bool `json:"disable_power_metrics"`
|
||||
DisableProcessorMetrics bool `json:"disable_processor_metrics"`
|
||||
DisableThermalMetrics bool `json:"disable_thermal_metrics"`
|
||||
|
||||
// Per client excluded metrics
|
||||
ExcludeMetrics []string `json:"exclude_metrics,omitempty"`
|
||||
} `json:"client_config"`
|
||||
}{
|
||||
// Set defaults values
|
||||
// Allow overwriting these defaults by reading config JSON
|
||||
Fanout: 64,
|
||||
IntervalString: "30s",
|
||||
HttpTimeoutString: "10s",
|
||||
HttpInsecure: true,
|
||||
}
|
||||
|
||||
// Set name
|
||||
r.name = fmt.Sprintf("RedfishReceiver(%s)", name)
|
||||
|
||||
// Create done channel
|
||||
r.done = make(chan bool)
|
||||
|
||||
// Set defaults in r.config
|
||||
// Allow overwriting these defaults by reading config JSON
|
||||
r.config.Fanout = 64
|
||||
r.config.IntervalString = "30s"
|
||||
r.config.HttpTimeoutString = "10s"
|
||||
r.config.HttpInsecure = true
|
||||
|
||||
// Read the redfish receiver specific JSON config
|
||||
if len(config) > 0 {
|
||||
err := json.Unmarshal(config, &r.config)
|
||||
err := json.Unmarshal(config, &configJSON)
|
||||
if err != nil {
|
||||
cclog.ComponentError(r.name, "Error reading config:", err.Error())
|
||||
return nil, err
|
||||
}
|
||||
}
|
||||
|
||||
// interval duration
|
||||
// Convert interval string representation to duration
|
||||
var err error
|
||||
r.config.Interval, err = time.ParseDuration(r.config.IntervalString)
|
||||
r.config.Interval, err = time.ParseDuration(configJSON.IntervalString)
|
||||
if err != nil {
|
||||
err := fmt.Errorf(
|
||||
"Failed to parse duration string interval='%s': %w",
|
||||
r.config.IntervalString,
|
||||
configJSON.IntervalString,
|
||||
err,
|
||||
)
|
||||
cclog.Error(r.name, err)
|
||||
@ -309,11 +681,11 @@ func NewRedfishReceiver(name string, config json.RawMessage) (Receiver, error) {
|
||||
}
|
||||
|
||||
// HTTP timeout duration
|
||||
r.config.HttpTimeout, err = time.ParseDuration(r.config.HttpTimeoutString)
|
||||
r.config.HttpTimeout, err = time.ParseDuration(configJSON.HttpTimeoutString)
|
||||
if err != nil {
|
||||
err := fmt.Errorf(
|
||||
"Failed to parse duration string http_timeout='%s': %w",
|
||||
r.config.HttpTimeoutString,
|
||||
configJSON.HttpTimeoutString,
|
||||
err,
|
||||
)
|
||||
cclog.Error(r.name, err)
|
||||
@ -323,47 +695,128 @@ func NewRedfishReceiver(name string, config json.RawMessage) (Receiver, error) {
|
||||
// Create new http client
|
||||
customTransport := http.DefaultTransport.(*http.Transport).Clone()
|
||||
customTransport.TLSClientConfig = &tls.Config{
|
||||
InsecureSkipVerify: r.config.HttpInsecure,
|
||||
InsecureSkipVerify: configJSON.HttpInsecure,
|
||||
}
|
||||
httpClient := &http.Client{
|
||||
Timeout: r.config.HttpTimeout,
|
||||
Transport: customTransport,
|
||||
}
|
||||
|
||||
// Create gofish client config
|
||||
for i := range r.config.ClientConfigs {
|
||||
clientConfig := &r.config.ClientConfigs[i]
|
||||
gofishConfig := &clientConfig.gofish
|
||||
// Initialize client configurations
|
||||
r.config.ClientConfigs = make([]RedfishReceiverClientConfig, 0)
|
||||
|
||||
if clientConfig.Hostname == nil {
|
||||
err := fmt.Errorf("client config number %v requires hostname", i)
|
||||
cclog.ComponentError(r.name, err)
|
||||
return nil, err
|
||||
}
|
||||
// Create client config from JSON config
|
||||
for i := range configJSON.ClientConfigs {
|
||||
|
||||
if clientConfig.Endpoint == nil {
|
||||
clientConfigJSON := &configJSON.ClientConfigs[i]
|
||||
|
||||
var endpoint_pattern string
|
||||
if clientConfigJSON.Endpoint != nil {
|
||||
endpoint_pattern = *clientConfigJSON.Endpoint
|
||||
} else if configJSON.Endpoint != nil {
|
||||
endpoint_pattern = *configJSON.Endpoint
|
||||
} else {
|
||||
err := fmt.Errorf("client config number %v requires endpoint", i)
|
||||
cclog.ComponentError(r.name, err)
|
||||
return nil, err
|
||||
}
|
||||
gofishConfig.Endpoint = *clientConfig.Endpoint
|
||||
|
||||
if clientConfig.Username == nil {
|
||||
var username string
|
||||
if clientConfigJSON.Username != nil {
|
||||
username = *clientConfigJSON.Username
|
||||
} else if configJSON.Username != nil {
|
||||
username = *configJSON.Username
|
||||
} else {
|
||||
err := fmt.Errorf("client config number %v requires username", i)
|
||||
cclog.ComponentError(r.name, err)
|
||||
return nil, err
|
||||
}
|
||||
gofishConfig.Username = *clientConfig.Username
|
||||
|
||||
if clientConfig.Password == nil {
|
||||
var password string
|
||||
if clientConfigJSON.Password != nil {
|
||||
password = *clientConfigJSON.Password
|
||||
} else if configJSON.Password != nil {
|
||||
password = *configJSON.Password
|
||||
} else {
|
||||
err := fmt.Errorf("client config number %v requires password", i)
|
||||
cclog.ComponentError(r.name, err)
|
||||
return nil, err
|
||||
}
|
||||
gofishConfig.Password = *clientConfig.Password
|
||||
|
||||
gofishConfig.HTTPClient = httpClient
|
||||
// Which metrics should be collected
|
||||
doPowerMetric :=
|
||||
!(configJSON.DisablePowerMetrics ||
|
||||
clientConfigJSON.DisablePowerMetrics)
|
||||
doProcessorMetrics :=
|
||||
!(configJSON.DisableProcessorMetrics ||
|
||||
clientConfigJSON.DisableProcessorMetrics)
|
||||
doThermalMetrics :=
|
||||
!(configJSON.DisableThermalMetrics ||
|
||||
clientConfigJSON.DisableThermalMetrics)
|
||||
|
||||
// Is metrics excluded globally or per client
|
||||
isExcluded := make(map[string]bool)
|
||||
for _, key := range clientConfigJSON.ExcludeMetrics {
|
||||
isExcluded[key] = true
|
||||
}
|
||||
for _, key := range configJSON.ExcludeMetrics {
|
||||
isExcluded[key] = true
|
||||
}
|
||||
|
||||
for _, host := range clientConfigJSON.HostList {
|
||||
|
||||
// Endpoint of the redfish service
|
||||
endpoint := strings.Replace(endpoint_pattern, "%h", host, -1)
|
||||
|
||||
r.config.ClientConfigs = append(
|
||||
r.config.ClientConfigs,
|
||||
RedfishReceiverClientConfig{
|
||||
Hostname: host,
|
||||
isExcluded: isExcluded,
|
||||
doPowerMetric: doPowerMetric,
|
||||
doProcessorMetrics: doProcessorMetrics,
|
||||
doThermalMetrics: doThermalMetrics,
|
||||
skipProcessorMetricsURL: make(map[string]bool),
|
||||
gofish: gofish.ClientConfig{
|
||||
Username: username,
|
||||
Password: password,
|
||||
Endpoint: endpoint,
|
||||
HTTPClient: httpClient,
|
||||
},
|
||||
})
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
// Compute parallel fanout to use
|
||||
numClients := len(r.config.ClientConfigs)
|
||||
r.config.fanout = configJSON.Fanout
|
||||
if numClients < r.config.fanout {
|
||||
r.config.fanout = numClients
|
||||
}
|
||||
|
||||
if numClients == 0 {
|
||||
err := fmt.Errorf("at least one client config is required")
|
||||
cclog.ComponentError(r.name, err)
|
||||
return nil, err
|
||||
}
|
||||
|
||||
// Check for duplicate client configurations
|
||||
isDuplicate := make(map[string]bool)
|
||||
for i := range r.config.ClientConfigs {
|
||||
host := r.config.ClientConfigs[i].Hostname
|
||||
if isDuplicate[host] {
|
||||
err := fmt.Errorf("Found duplicate client config for host %s", host)
|
||||
cclog.ComponentError(r.name, err)
|
||||
return nil, err
|
||||
}
|
||||
isDuplicate[host] = true
|
||||
}
|
||||
|
||||
// Give some basic info about redfish receiver status
|
||||
cclog.ComponentInfo(r.name, "Monitoring", numClients, "clients")
|
||||
cclog.ComponentInfo(r.name, "Monitoring interval:", r.config.Interval)
|
||||
cclog.ComponentInfo(r.name, "Monitoring parallel fanout:", r.config.fanout)
|
||||
|
||||
return r, nil
|
||||
}
|
||||
|
54
receivers/redfishReceiver.md
Normal file
54
receivers/redfishReceiver.md
Normal file
@ -0,0 +1,54 @@
|
||||
## Redfish receiver
|
||||
|
||||
The Redfish receiver uses the [Redfish (specification)](https://www.dmtf.org/standards/redfish) to query thermal, power and processor metrics. Thermal metrics may include various fan speeds and temperatures. Power metrics may include the current power consumption of various hardware components. They may also include the minimum, maximum and average power consumption of these components in a given time interval. The receiver will poll each configured redfish device once in a given interval. Multiple devices can be accessed in parallel to increase throughput.
|
||||
|
||||
### Configuration structure
|
||||
|
||||
```json
|
||||
{
|
||||
"<redfish receiver name>": {
|
||||
"type": "redfish",
|
||||
"username": "<user A>",
|
||||
"password": "<password A>",
|
||||
"endpoint": "https://%h-bmc",
|
||||
"exclude_metrics": [ "min_consumed_watts" ],
|
||||
"client_config": [
|
||||
{
|
||||
"host_list": [ "<host 1>", "<host 2>" ]
|
||||
},
|
||||
{
|
||||
"host_list": [ "<host 3>", "<host 4>" ],
|
||||
"disable_power_metrics": true
|
||||
},
|
||||
{
|
||||
"host_list": [ "<host 5>" ],
|
||||
"username": "<user B>",
|
||||
"password": "<password B>",
|
||||
"endpoint": "https://%h-BMC",
|
||||
"disable_thermal_metrics": true
|
||||
}
|
||||
]
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
Global settings:
|
||||
|
||||
- `fanout`: Maximum number of simultaneous redfish connections (default: 64)
|
||||
- `interval`: How often the redfish metrics should be read and sent to the sink (default: 30 s)
|
||||
- `http_insecure`: Control whether a client verifies the server's certificate (default: true == do not verify server's certificate)
|
||||
- `http_timeout`: Time limit for requests made by this HTTP client (default: 10 s)
|
||||
|
||||
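Global and per redfish device settings (for `username`, `password` and `endpoint` the per redfish device setting overrides the global setting; the `disable_*_metrics` flags and the `exclude_metrics` lists are combined, so a metric disabled or excluded globally cannot be re-enabled per device):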
|
||||
|
||||
- `disable_power_metrics`: disable collection of power metrics
|
||||
- `disable_processor_metrics`: disable collection of processor metrics
|
||||
- `disable_thermal_metrics`: disable collection of thermal metrics
|
||||
- `exclude_metrics`: list of excluded metrics
|
||||
- `username`: User name to authenticate with
|
||||
- `password`: Password to use for authentication
|
||||
- `endpoint`: URL of the redfish service (placeholder `%h` gets replaced by the hostname)
|
||||
|
||||
Per redfish device settings:
|
||||
|
||||
- `host_list`: List of hosts with the same client configuration
|
@ -6,7 +6,7 @@ CC_HOME=/tmp
|
||||
|
||||
LOG_DIR=/var/log
|
||||
|
||||
DATA_DIR=/var/lib/grafana
|
||||
DATA_DIR=/var/lib/cc-metric-collector
|
||||
|
||||
MAX_OPEN_FILES=10000
|
||||
|
||||
|
16
scripts/cc-metric-collector.deb.rules
Normal file
16
scripts/cc-metric-collector.deb.rules
Normal file
@ -0,0 +1,16 @@
|
||||
|
||||
#!/usr/bin/make -f
|
||||
# You must remove unused comment lines for the released package.
|
||||
#export DH_VERBOSE = 1
|
||||
#export DEB_BUILD_MAINT_OPTIONS = hardening=+all
|
||||
#export DEB_CFLAGS_MAINT_APPEND = -Wall -pedantic
|
||||
#export DEB_LDFLAGS_MAINT_APPEND = -Wl,--as-needed
|
||||
|
||||
%:
|
||||
dh $@
|
||||
|
||||
override_dh_auto_build:
|
||||
make
|
||||
|
||||
override_dh_auto_install:
|
||||
make PREFIX=/usr install
|
@ -19,7 +19,7 @@
|
||||
PATH=/bin:/usr/bin:/sbin:/usr/sbin
|
||||
NAME=cc-metric-collector
|
||||
DESC="ClusterCockpit metric collector"
|
||||
DEFAULT=/etc/default/${NAME}.json
|
||||
DEFAULT=/etc/default/${NAME}
|
||||
|
||||
CC_USER=clustercockpit
|
||||
CC_GROUP=clustercockpit
|
||||
|
Loading…
Reference in New Issue
Block a user