Compare commits

...

31 Commits

Author SHA1 Message Date
Thomas Roehl
1ce40aea16 Add AppMetricReceiver 2022-11-29 13:44:20 +01:00
Holger Obermaier
5918f96fd8 Minimal formating changes 2022-11-24 09:48:44 +01:00
Holger Obermaier
8cb87a2165 Add IPMI receiver example configuration to receivers.json 2022-11-23 10:37:31 +01:00
Holger Obermaier
3e91a37dee remove prefix enumeration like 01-... 2022-11-22 17:02:29 +01:00
Holger Obermaier
ed68baeada Add IPMI metric: current 2022-11-22 15:32:41 +01:00
Holger Obermaier
888db31dbf Try to use common metric tags across hardware vendors 2022-11-22 15:09:56 +01:00
Holger Obermaier
c938d32629 Add go.mod to App dependency 2022-11-22 09:45:29 +01:00
Holger Obermaier
d5daf54d4f Update to latest version of included go modules 2022-11-22 09:42:04 +01:00
Holger Obermaier
18bffd7c14 Add documentaion for IPMI receiver 2022-11-21 13:58:30 +01:00
Holger Obermaier
bd0105b370 Added config option to add ipmi-sensors command line options 2022-11-21 13:02:46 +01:00
Holger Obermaier
b1a8674c4c Add receiver to gather remote IPMI sensor metrics 2022-11-18 16:55:11 +01:00
Holger Obermaier
234ad3c54e Fix kernel panic for receiver config with missing receiver type 2022-11-17 11:33:13 +01:00
Holger Obermaier
7bb80780e0 Fixed computing number of physical packages for non continous physical package IDs (e.g. on Ampere Altra Q80-30) 2022-11-16 14:58:11 +01:00
Holger Obermaier
e66d52bb32 * Corrected json config in numastatsMetric.md
* Added some debug output to numastatsMetric.go
2022-11-16 14:10:25 +01:00
Holger Obermaier
9840d0193d Do not mess up with the orignal configuration 2022-11-16 09:37:40 +01:00
Holger Obermaier
ce7eef8d30 Add running average power limit (RAPL) metric collector 2022-11-15 17:15:27 +01:00
Holger Obermaier
92e45ca62c Add running average power limit (RAPL) metric collector 2022-11-15 17:09:26 +01:00
Holger Obermaier
fd10a279fc Corrected some typos 2022-11-14 09:35:02 +01:00
Holger Obermaier
9e63d0ea59 Run ipmitool asynchron. Improved error handling. 2022-11-11 16:16:14 +01:00
Thomas Gruber
76bb033a88 Update README.md 2022-11-04 14:52:09 +01:00
Holger Obermaier
deb1bcfa2f Correct type: /proc/stats -> /proc/stat 2022-10-13 15:01:39 +02:00
Holger Obermaier
7a67d5e25f Check if at least one CPU with frequency information was detected 2022-10-13 14:53:55 +02:00
Thomas Gruber
9ae0806aa9 Add collector for monitoring the execution of cc-metric-collector itself (#81)
* Add collector to monitor execution of cc-metric-collector itself

* Register SelfCollector

* Fix import paths for moved packages
2022-10-10 12:18:52 +02:00
Thomas Gruber
4bd71224df move maybe-usable-by-other-cc-components to pkg. Fix all files to use the new paths (#88) 2022-10-10 11:53:11 +02:00
Thomas Roehl
6bf3bfd10a Use lower case for error strings in RocmSmiCollector 2022-10-09 17:05:49 +02:00
Thomas Gruber
0fbff00996 Replace ioutils with os and io (#87) 2022-10-09 17:03:38 +02:00
Thomas Roehl
8849824ba9 Remove useless prints from MemstatCollector 2022-10-09 02:56:15 +02:00
Thomas Roehl
ed511b7c09 Fix memstat collector with numa_stats option 2022-09-28 15:09:36 +02:00
Thomas Roehl
a0acf01dc3 Build DEB package for Ubuntu 20.04 for releases 2022-09-28 12:19:36 +02:00
Thomas Roehl
58461f1f72 Fix clock frequency coming from LikwidCollector and update docs 2022-09-09 20:01:21 +02:00
Thomas Röhl
c09d8fb118 InfiniBandCollector: Scale raw readings from octets to bytes 2022-09-09 19:27:20 +02:00
77 changed files with 1659 additions and 284 deletions

View File

@@ -133,13 +133,63 @@ jobs:
name: cc-metric-collector SRPM for UBI 8
path: ${{ steps.rpmbuild.outputs.SRPM }}
#
# Build on Ubuntu 20.04 using official go package
#
Ubuntu-focal-build:
runs-on: ubuntu-latest
container: ubuntu:20.04
# The job outputs link to the outputs of the 'debrename' step
# Only job outputs can be used in child jobs
outputs:
deb : ${{steps.debrename.outputs.DEB}}
steps:
# Use apt to install development packages
- name: Install development packages
run: |
apt update && apt --assume-yes upgrade
apt --assume-yes install build-essential sed git wget bash
# Checkout git repository and submodules
# fetch-depth must be 0 to use git describe
# See: https://github.com/marketplace/actions/checkout
- name: Checkout
uses: actions/checkout@v2
with:
submodules: recursive
fetch-depth: 0
# Use official golang package
- name: Install Golang
run: |
wget -q https://go.dev/dl/go1.19.1.linux-amd64.tar.gz
tar -C /usr/local -xzf go1.19.1.linux-amd64.tar.gz
export PATH=/usr/local/go/bin:/usr/local/go/pkg/tool/linux_amd64:$PATH
go version
- name: DEB build MetricCollector
id: dpkg-build
run: |
export PATH=/usr/local/go/bin:/usr/local/go/pkg/tool/linux_amd64:$PATH
make DEB
- name: Rename DEB (add '_ubuntu20.04')
id: debrename
run: |
OLD_DEB_NAME=$(echo "${{steps.dpkg-build.outputs.DEB}}" | rev | cut -d '.' -f 2- | rev)
NEW_DEB_FILE="${OLD_DEB_NAME}_ubuntu20.04.deb"
mv "${{steps.dpkg-build.outputs.DEB}}" "${NEW_DEB_FILE}"
echo "::set-output name=DEB::${NEW_DEB_FILE}"
# See: https://github.com/actions/upload-artifact
- name: Save DEB as artifact
uses: actions/upload-artifact@v2
with:
name: cc-metric-collector DEB for Ubuntu 20.04
path: ${{ steps.debrename.outputs.DEB }}
#
# Create release with fresh RPMs
#
Release:
runs-on: ubuntu-latest
# We need the RPMs, so add dependency
needs: [AlmaLinux-RPM-build, UBI-8-RPM-build]
needs: [AlmaLinux-RPM-build, UBI-8-RPM-build, Ubuntu-focal-build]
steps:
# See: https://github.com/actions/download-artifact
@@ -161,6 +211,11 @@ jobs:
with:
name: cc-metric-collector SRPM for UBI 8
- name: Download Ubuntu 20.04 DEB
uses: actions/download-artifact@v2
with:
name: cc-metric-collector DEB for Ubuntu 20.04
# The download actions do not publish the name of the downloaded file,
# so we re-use the job outputs of the parent jobs. The files are all
# downloaded to the current folder.
@@ -174,14 +229,17 @@ jobs:
ALMA_85_SRPM=$(basename "${{ needs.AlmaLinux-RPM-build.outputs.srpm}}")
UBI_8_RPM=$(basename "${{ needs.UBI-8-RPM-build.outputs.rpm}}")
UBI_8_SRPM=$(basename "${{ needs.UBI-8-RPM-build.outputs.srpm}}")
U_2004_DEB=$(basename "${{ needs.Ubuntu-focal-build.outputs.deb}}")
echo "ALMA_85_RPM::${ALMA_85_RPM}"
echo "ALMA_85_SRPM::${ALMA_85_SRPM}"
echo "UBI_8_RPM::${UBI_8_RPM}"
echo "UBI_8_SRPM::${UBI_8_SRPM}"
echo "U_2004_DEB::${U_2004_DEB}"
echo "::set-output name=ALMA_85_RPM::${ALMA_85_RPM}"
echo "::set-output name=ALMA_85_SRPM::${ALMA_85_SRPM}"
echo "::set-output name=UBI_8_RPM::${UBI_8_RPM}"
echo "::set-output name=UBI_8_SRPM::${UBI_8_SRPM}"
echo "::set-output name=U_2004_DEB::${U_2004_DEB}"
# See: https://github.com/softprops/action-gh-release
- name: Release
@@ -194,3 +252,4 @@ jobs:
${{ steps.files.outputs.ALMA_85_SRPM }}
${{ steps.files.outputs.UBI_8_RPM }}
${{ steps.files.outputs.UBI_8_SRPM }}
${{ steps.files.outputs.U_2004_DEB }}

View File

@@ -22,7 +22,7 @@ GOBIN = $(shell which go)
.PHONY: all
all: $(APP)
$(APP): $(GOSRC)
$(APP): $(GOSRC) go.mod
make -C collectors
$(GOBIN) get
$(GOBIN) build -o $(APP) $(GOSRC_APP)

View File

@@ -15,10 +15,10 @@ import (
"sync"
"time"
cclog "github.com/ClusterCockpit/cc-metric-collector/internal/ccLogger"
lp "github.com/ClusterCockpit/cc-metric-collector/internal/ccMetric"
mr "github.com/ClusterCockpit/cc-metric-collector/internal/metricRouter"
mct "github.com/ClusterCockpit/cc-metric-collector/internal/multiChanTicker"
cclog "github.com/ClusterCockpit/cc-metric-collector/pkg/ccLogger"
lp "github.com/ClusterCockpit/cc-metric-collector/pkg/ccMetric"
mct "github.com/ClusterCockpit/cc-metric-collector/pkg/multiChanTicker"
)
type CentralConfigFile struct {

View File

@@ -5,7 +5,7 @@ import (
"bytes"
"encoding/json"
"fmt"
"io/ioutil"
"io"
"os"
"os/exec"
"os/user"
@@ -14,8 +14,8 @@ import (
"strings"
"time"
cclog "github.com/ClusterCockpit/cc-metric-collector/internal/ccLogger"
lp "github.com/ClusterCockpit/cc-metric-collector/internal/ccMetric"
cclog "github.com/ClusterCockpit/cc-metric-collector/pkg/ccLogger"
lp "github.com/ClusterCockpit/cc-metric-collector/pkg/ccMetric"
)
const DEFAULT_BEEGFS_CMD = "beegfs-ctl"
@@ -115,7 +115,7 @@ func (m *BeegfsMetaCollector) Read(interval time.Duration, output chan lp.CCMetr
return
}
//get mounpoint
buffer, _ := ioutil.ReadFile(string("/proc/mounts"))
buffer, _ := os.ReadFile(string("/proc/mounts"))
mounts := strings.Split(string(buffer), "\n")
var mountpoints []string
for _, line := range mounts {
@@ -157,9 +157,9 @@ func (m *BeegfsMetaCollector) Read(interval time.Duration, output chan lp.CCMetr
if err != nil {
fmt.Fprintf(os.Stderr, "BeegfsMetaCollector.Read(): Failed to execute command \"%s\": %s\n", cmd.String(), err.Error())
fmt.Fprintf(os.Stderr, "BeegfsMetaCollector.Read(): command exit code: \"%d\"\n", cmd.ProcessState.ExitCode())
data, _ := ioutil.ReadAll(cmdStderr)
data, _ := io.ReadAll(cmdStderr)
fmt.Fprintf(os.Stderr, "BeegfsMetaCollector.Read(): command stderr: \"%s\"\n", string(data))
data, _ = ioutil.ReadAll(cmdStdout)
data, _ = io.ReadAll(cmdStdout)
fmt.Fprintf(os.Stderr, "BeegfsMetaCollector.Read(): command stdout: \"%s\"\n", string(data))
return
}

View File

@@ -5,7 +5,7 @@ import (
"bytes"
"encoding/json"
"fmt"
"io/ioutil"
"io"
"os"
"os/exec"
"os/user"
@@ -14,8 +14,8 @@ import (
"strings"
"time"
cclog "github.com/ClusterCockpit/cc-metric-collector/internal/ccLogger"
lp "github.com/ClusterCockpit/cc-metric-collector/internal/ccMetric"
cclog "github.com/ClusterCockpit/cc-metric-collector/pkg/ccLogger"
lp "github.com/ClusterCockpit/cc-metric-collector/pkg/ccMetric"
)
// Struct for the collector-specific JSON config
@@ -108,7 +108,7 @@ func (m *BeegfsStorageCollector) Read(interval time.Duration, output chan lp.CCM
return
}
//get mounpoint
buffer, _ := ioutil.ReadFile(string("/proc/mounts"))
buffer, _ := os.ReadFile(string("/proc/mounts"))
mounts := strings.Split(string(buffer), "\n")
var mountpoints []string
for _, line := range mounts {
@@ -149,9 +149,9 @@ func (m *BeegfsStorageCollector) Read(interval time.Duration, output chan lp.CCM
if err != nil {
fmt.Fprintf(os.Stderr, "BeegfsStorageCollector.Read(): Failed to execute command \"%s\": %s\n", cmd.String(), err.Error())
fmt.Fprintf(os.Stderr, "BeegfsStorageCollector.Read(): command exit code: \"%d\"\n", cmd.ProcessState.ExitCode())
data, _ := ioutil.ReadAll(cmdStderr)
data, _ := io.ReadAll(cmdStderr)
fmt.Fprintf(os.Stderr, "BeegfsStorageCollector.Read(): command stderr: \"%s\"\n", string(data))
data, _ = ioutil.ReadAll(cmdStdout)
data, _ = io.ReadAll(cmdStdout)
fmt.Fprintf(os.Stderr, "BeegfsStorageCollector.Read(): command stdout: \"%s\"\n", string(data))
return
}

View File

@@ -6,9 +6,9 @@ import (
"sync"
"time"
cclog "github.com/ClusterCockpit/cc-metric-collector/internal/ccLogger"
lp "github.com/ClusterCockpit/cc-metric-collector/internal/ccMetric"
mct "github.com/ClusterCockpit/cc-metric-collector/internal/multiChanTicker"
cclog "github.com/ClusterCockpit/cc-metric-collector/pkg/ccLogger"
lp "github.com/ClusterCockpit/cc-metric-collector/pkg/ccMetric"
mct "github.com/ClusterCockpit/cc-metric-collector/pkg/multiChanTicker"
)
// Map of all available metric collectors
@@ -36,7 +36,9 @@ var AvailableCollectors = map[string]MetricCollector{
"numastats": new(NUMAStatsCollector),
"beegfs_meta": new(BeegfsMetaCollector),
"beegfs_storage": new(BeegfsStorageCollector),
"rapl": new(RAPLCollector),
"rocm_smi": new(RocmSmiCollector),
"self": new(SelfCollector),
"schedstat": new(SchedstatCollector),
}

View File

@@ -10,8 +10,8 @@ import (
"strings"
"time"
cclog "github.com/ClusterCockpit/cc-metric-collector/internal/ccLogger"
lp "github.com/ClusterCockpit/cc-metric-collector/internal/ccMetric"
cclog "github.com/ClusterCockpit/cc-metric-collector/pkg/ccLogger"
lp "github.com/ClusterCockpit/cc-metric-collector/pkg/ccMetric"
)
//
@@ -142,6 +142,11 @@ func (m *CPUFreqCpuInfoCollector) Init(config json.RawMessage) error {
}
}
// Check if at least one CPU with frequency information was detected
if len(m.topology) == 0 {
return fmt.Errorf("No CPU frequency info found in %s", cpuInfoFile)
}
numPhysicalPackageID_int := maxPhysicalPackageID + 1
numPhysicalPackageID := fmt.Sprint(numPhysicalPackageID_int)
numNonHT := fmt.Sprint(numNonHT_int)

View File

@@ -3,14 +3,14 @@ package collectors
import (
"encoding/json"
"fmt"
"io/ioutil"
"os"
"path/filepath"
"strconv"
"strings"
"time"
cclog "github.com/ClusterCockpit/cc-metric-collector/internal/ccLogger"
lp "github.com/ClusterCockpit/cc-metric-collector/internal/ccMetric"
cclog "github.com/ClusterCockpit/cc-metric-collector/pkg/ccLogger"
lp "github.com/ClusterCockpit/cc-metric-collector/pkg/ccMetric"
"golang.org/x/sys/unix"
)
@@ -23,20 +23,18 @@ type CPUFreqCollectorTopology struct {
numPhysicalPackages string // number of sockets / packages
numPhysicalPackages_int int64
isHT bool
numNonHT string // number of non hyperthreading processors
numNonHT string // number of non hyper-threading processors
numNonHT_int int64
scalingCurFreqFile string
tagSet map[string]string
}
//
// CPUFreqCollector
// a metric collector to measure the current frequency of the CPUs
// as obtained from the hardware (in KHz)
// Only measure on the first hyper thread
// Only measure on the first hyper-thread
//
// See: https://www.kernel.org/doc/html/latest/admin-guide/pm/cpufreq.html
//
type CPUFreqCollector struct {
metricCollector
topology []CPUFreqCollectorTopology
@@ -88,7 +86,7 @@ func (m *CPUFreqCollector) Init(config json.RawMessage) error {
// Read package ID
physicalPackageIDFile := filepath.Join(cpuDir, "topology", "physical_package_id")
line, err := ioutil.ReadFile(physicalPackageIDFile)
line, err := os.ReadFile(physicalPackageIDFile)
if err != nil {
return fmt.Errorf("unable to read physical package ID from file '%s': %v", physicalPackageIDFile, err)
}
@@ -100,7 +98,7 @@ func (m *CPUFreqCollector) Init(config json.RawMessage) error {
// Read core ID
coreIDFile := filepath.Join(cpuDir, "topology", "core_id")
line, err = ioutil.ReadFile(coreIDFile)
line, err = os.ReadFile(coreIDFile)
if err != nil {
return fmt.Errorf("unable to read core ID from file '%s': %v", coreIDFile, err)
}
@@ -126,7 +124,7 @@ func (m *CPUFreqCollector) Init(config json.RawMessage) error {
t.scalingCurFreqFile = scalingCurFreqFile
}
// is processor a hyperthread?
// is processor a hyper-thread?
coreSeenBefore := make(map[string]bool)
for i := range m.topology {
t := &m.topology[i]
@@ -136,23 +134,20 @@ func (m *CPUFreqCollector) Init(config json.RawMessage) error {
coreSeenBefore[globalID] = true
}
// number of non hyper thread cores and packages / sockets
// number of non hyper-thread cores and packages / sockets
var numNonHT_int int64 = 0
var maxPhysicalPackageID int64 = 0
PhysicalPackageIDs := make(map[int64]struct{})
for i := range m.topology {
t := &m.topology[i]
// Update maxPackageID
if t.physicalPackageID_int > maxPhysicalPackageID {
maxPhysicalPackageID = t.physicalPackageID_int
}
if !t.isHT {
numNonHT_int++
}
PhysicalPackageIDs[t.physicalPackageID_int] = struct{}{}
}
numPhysicalPackageID_int := maxPhysicalPackageID + 1
numPhysicalPackageID_int := int64(len(PhysicalPackageIDs))
numPhysicalPackageID := fmt.Sprint(numPhysicalPackageID_int)
numNonHT := fmt.Sprint(numNonHT_int)
for i := range m.topology {
@@ -168,6 +163,13 @@ func (m *CPUFreqCollector) Init(config json.RawMessage) error {
}
}
// Initialized
cclog.ComponentDebug(
m.name,
"initialized",
numPhysicalPackageID_int, "physical packages,",
len(cpuDirs), "CPUs,",
numNonHT, "non-hyper-threading CPUs")
m.init = true
return nil
}
@@ -182,13 +184,13 @@ func (m *CPUFreqCollector) Read(interval time.Duration, output chan lp.CCMetric)
for i := range m.topology {
t := &m.topology[i]
// skip hyperthreads
// skip hyper-threads
if t.isHT {
continue
}
// Read current frequency
line, err := ioutil.ReadFile(t.scalingCurFreqFile)
line, err := os.ReadFile(t.scalingCurFreqFile)
if err != nil {
cclog.ComponentError(
m.name,

View File

@@ -1,4 +1,5 @@
## `cpufreq_cpuinfo` collector
```json
"cpufreq": {
"exclude_metrics": []
@@ -8,4 +9,5 @@
The `cpufreq` collector reads the clock frequency from `/sys/devices/system/cpu/cpu*/cpufreq` and outputs a handful **hwthread** metrics.
Metrics:
* `cpufreq`

View File

@@ -9,8 +9,8 @@ import (
"strings"
"time"
cclog "github.com/ClusterCockpit/cc-metric-collector/internal/ccLogger"
lp "github.com/ClusterCockpit/cc-metric-collector/internal/ccMetric"
cclog "github.com/ClusterCockpit/cc-metric-collector/pkg/ccLogger"
lp "github.com/ClusterCockpit/cc-metric-collector/pkg/ccMetric"
sysconf "github.com/tklauser/go-sysconf"
)

View File

@@ -1,5 +1,6 @@
## `cpustat` collector
```json
"cpustat": {
"exclude_metrics": [
@@ -8,9 +9,10 @@
}
```
The `cpustat` collector reads data from `/proc/stats` and outputs a handful **node** and **hwthread** metrics. If a metric is not required, it can be excluded from forwarding it to the sink.
The `cpustat` collector reads data from `/proc/stat` and outputs a handful **node** and **hwthread** metrics. If a metric is not required, it can be excluded from forwarding it to the sink.
Metrics:
* `cpu_user`
* `cpu_nice`
* `cpu_system`

View File

@@ -3,13 +3,13 @@ package collectors
import (
"encoding/json"
"errors"
"io/ioutil"
"log"
"os"
"os/exec"
"strings"
"time"
lp "github.com/ClusterCockpit/cc-metric-collector/internal/ccMetric"
lp "github.com/ClusterCockpit/cc-metric-collector/pkg/ccMetric"
influx "github.com/influxdata/line-protocol"
)
@@ -53,7 +53,7 @@ func (m *CustomCmdCollector) Init(config json.RawMessage) error {
}
}
for _, f := range m.config.Files {
_, err = ioutil.ReadFile(f)
_, err = os.ReadFile(f)
if err == nil {
m.files = append(m.files, f)
} else {
@@ -106,7 +106,7 @@ func (m *CustomCmdCollector) Read(interval time.Duration, output chan lp.CCMetri
}
}
for _, file := range m.files {
buffer, err := ioutil.ReadFile(file)
buffer, err := os.ReadFile(file)
if err != nil {
log.Print(err)
return

View File

@@ -8,8 +8,8 @@ import (
"syscall"
"time"
cclog "github.com/ClusterCockpit/cc-metric-collector/internal/ccLogger"
lp "github.com/ClusterCockpit/cc-metric-collector/internal/ccMetric"
cclog "github.com/ClusterCockpit/cc-metric-collector/pkg/ccLogger"
lp "github.com/ClusterCockpit/cc-metric-collector/pkg/ccMetric"
)
// "log"

View File

@@ -5,7 +5,7 @@ import (
"bytes"
"encoding/json"
"fmt"
"io/ioutil"
"io"
"log"
"os/exec"
"os/user"
@@ -13,8 +13,8 @@ import (
"strings"
"time"
cclog "github.com/ClusterCockpit/cc-metric-collector/internal/ccLogger"
lp "github.com/ClusterCockpit/cc-metric-collector/internal/ccMetric"
cclog "github.com/ClusterCockpit/cc-metric-collector/pkg/ccLogger"
lp "github.com/ClusterCockpit/cc-metric-collector/pkg/ccMetric"
)
const DEFAULT_GPFS_CMD = "mmpmon"
@@ -118,8 +118,8 @@ func (m *GpfsCollector) Read(interval time.Duration, output chan lp.CCMetric) {
cmd.Stderr = cmdStderr
err := cmd.Run()
if err != nil {
dataStdErr, _ := ioutil.ReadAll(cmdStderr)
dataStdOut, _ := ioutil.ReadAll(cmdStdout)
dataStdErr, _ := io.ReadAll(cmdStderr)
dataStdOut, _ := io.ReadAll(cmdStdout)
cclog.ComponentError(
m.name,
fmt.Sprintf("Read(): Failed to execute command \"%s\": %v\n", cmd.String(), err),

View File

@@ -2,11 +2,10 @@ package collectors
import (
"fmt"
"io/ioutil"
"os"
cclog "github.com/ClusterCockpit/cc-metric-collector/internal/ccLogger"
lp "github.com/ClusterCockpit/cc-metric-collector/internal/ccMetric"
cclog "github.com/ClusterCockpit/cc-metric-collector/pkg/ccLogger"
lp "github.com/ClusterCockpit/cc-metric-collector/pkg/ccMetric"
"golang.org/x/sys/unix"
"encoding/json"
@@ -21,6 +20,7 @@ const IB_BASEPATH = "/sys/class/infiniband/"
type InfinibandCollectorMetric struct {
path string
unit string
scale int64
}
type InfinibandCollectorInfo struct {
@@ -84,7 +84,7 @@ func (m *InfinibandCollector) Init(config json.RawMessage) error {
for _, path := range ibDirs {
// Skip, when no LID is assigned
line, err := ioutil.ReadFile(filepath.Join(path, "lid"))
line, err := os.ReadFile(filepath.Join(path, "lid"))
if err != nil {
continue
}
@@ -113,10 +113,10 @@ func (m *InfinibandCollector) Init(config json.RawMessage) error {
// Check access to counter files
countersDir := filepath.Join(path, "counters")
portCounterFiles := map[string]InfinibandCollectorMetric{
"ib_recv": {path: filepath.Join(countersDir, "port_rcv_data"), unit: "bytes"},
"ib_xmit": {path: filepath.Join(countersDir, "port_xmit_data"), unit: "bytes"},
"ib_recv_pkts": {path: filepath.Join(countersDir, "port_rcv_packets"), unit: "packets"},
"ib_xmit_pkts": {path: filepath.Join(countersDir, "port_xmit_packets"), unit: "packets"},
"ib_recv": {path: filepath.Join(countersDir, "port_rcv_data"), unit: "bytes", scale: 4},
"ib_xmit": {path: filepath.Join(countersDir, "port_xmit_data"), unit: "bytes", scale: 4},
"ib_recv_pkts": {path: filepath.Join(countersDir, "port_rcv_packets"), unit: "packets", scale: 1},
"ib_xmit_pkts": {path: filepath.Join(countersDir, "port_xmit_packets"), unit: "packets", scale: 1},
}
for _, counter := range portCounterFiles {
err := unix.Access(counter.path, unix.R_OK)
@@ -174,7 +174,7 @@ func (m *InfinibandCollector) Read(interval time.Duration, output chan lp.CCMetr
for counterName, counterDef := range info.portCounterFiles {
// Read counter file
line, err := ioutil.ReadFile(counterDef.path)
line, err := os.ReadFile(counterDef.path)
if err != nil {
cclog.ComponentError(
m.name,
@@ -191,6 +191,8 @@ func (m *InfinibandCollector) Read(interval time.Duration, output chan lp.CCMetr
fmt.Sprintf("Read(): Failed to convert Infininiband metrice %s='%s' to int64: %v", counterName, data, err))
continue
}
// Scale raw value
v *= counterDef.scale
// Send absolut values
if m.config.SendAbsoluteValues {

View File

@@ -4,8 +4,8 @@ import (
"bufio"
"os"
cclog "github.com/ClusterCockpit/cc-metric-collector/internal/ccLogger"
lp "github.com/ClusterCockpit/cc-metric-collector/internal/ccMetric"
cclog "github.com/ClusterCockpit/cc-metric-collector/pkg/ccLogger"
lp "github.com/ClusterCockpit/cc-metric-collector/pkg/ccMetric"
// "log"
"encoding/json"

View File

@@ -1,51 +1,58 @@
package collectors
import (
"bufio"
"bytes"
"encoding/json"
"errors"
"fmt"
"io"
"log"
"os"
"os/exec"
"strconv"
"strings"
"time"
lp "github.com/ClusterCockpit/cc-metric-collector/internal/ccMetric"
cclog "github.com/ClusterCockpit/cc-metric-collector/pkg/ccLogger"
lp "github.com/ClusterCockpit/cc-metric-collector/pkg/ccMetric"
)
const IPMITOOL_PATH = `ipmitool`
const IPMISENSORS_PATH = `ipmi-sensors`
type IpmiCollectorConfig struct {
ExcludeDevices []string `json:"exclude_devices"`
IpmitoolPath string `json:"ipmitool_path"`
IpmisensorsPath string `json:"ipmisensors_path"`
}
type IpmiCollector struct {
metricCollector
//tags map[string]string
//matches map[string]string
config IpmiCollectorConfig
config struct {
ExcludeDevices []string `json:"exclude_devices"`
IpmitoolPath string `json:"ipmitool_path"`
IpmisensorsPath string `json:"ipmisensors_path"`
}
ipmitool string
ipmisensors string
}
func (m *IpmiCollector) Init(config json.RawMessage) error {
// Check if already initialized
if m.init {
return nil
}
m.name = "IpmiCollector"
m.setup()
m.parallel = true
m.meta = map[string]string{"source": m.name, "group": "IPMI"}
m.config.IpmitoolPath = string(IPMITOOL_PATH)
m.config.IpmisensorsPath = string(IPMISENSORS_PATH)
m.ipmitool = ""
m.ipmisensors = ""
m.meta = map[string]string{
"source": m.name,
"group": "IPMI",
}
// default path to IPMI tools
m.config.IpmitoolPath = "ipmitool"
m.config.IpmisensorsPath = "ipmi-sensors"
if len(config) > 0 {
err := json.Unmarshal(config, &m.config)
if err != nil {
return err
}
}
// Check if executables ipmitool or ipmisensors are found
p, err := exec.LookPath(m.config.IpmitoolPath)
if err == nil {
m.ipmitool = p
@@ -62,25 +69,33 @@ func (m *IpmiCollector) Init(config json.RawMessage) error {
}
func (m *IpmiCollector) readIpmiTool(cmd string, output chan lp.CCMetric) {
// Setup ipmitool command
command := exec.Command(cmd, "sensor")
command.Wait()
stdout, err := command.Output()
if err != nil {
log.Print(err)
stdout, _ := command.StdoutPipe()
errBuf := new(bytes.Buffer)
command.Stderr = errBuf
// start command
if err := command.Start(); err != nil {
cclog.ComponentError(
m.name,
fmt.Sprintf("readIpmiTool(): Failed to start command \"%s\": %v", command.String(), err),
)
return
}
ll := strings.Split(string(stdout), "\n")
for _, line := range ll {
lv := strings.Split(line, "|")
// Read command output
scanner := bufio.NewScanner(stdout)
for scanner.Scan() {
lv := strings.Split(scanner.Text(), "|")
if len(lv) < 3 {
continue
}
v, err := strconv.ParseFloat(strings.Trim(lv[1], " "), 64)
v, err := strconv.ParseFloat(strings.TrimSpace(lv[1]), 64)
if err == nil {
name := strings.ToLower(strings.Replace(strings.Trim(lv[0], " "), " ", "_", -1))
unit := strings.Trim(lv[2], " ")
name := strings.ToLower(strings.Replace(strings.TrimSpace(lv[0]), " ", "_", -1))
unit := strings.TrimSpace(lv[2])
if unit == "Volts" {
unit = "Volts"
} else if unit == "degrees C" {
@@ -98,6 +113,17 @@ func (m *IpmiCollector) readIpmiTool(cmd string, output chan lp.CCMetric) {
}
}
}
// Wait for command end
if err := command.Wait(); err != nil {
errMsg, _ := io.ReadAll(errBuf)
cclog.ComponentError(
m.name,
fmt.Sprintf("readIpmiTool(): Failed to wait for the end of command \"%s\": %v\n", command.String(), err),
fmt.Sprintf("readIpmiTool(): command stderr: \"%s\"\n", string(errMsg)),
)
return
}
}
func (m *IpmiCollector) readIpmiSensors(cmd string, output chan lp.CCMetric) {
@@ -131,17 +157,17 @@ func (m *IpmiCollector) readIpmiSensors(cmd string, output chan lp.CCMetric) {
}
func (m *IpmiCollector) Read(interval time.Duration, output chan lp.CCMetric) {
// Check if already initialized
if !m.init {
return
}
if len(m.config.IpmitoolPath) > 0 {
_, err := os.Stat(m.config.IpmitoolPath)
if err == nil {
m.readIpmiTool(m.config.IpmitoolPath, output)
}
} else if len(m.config.IpmisensorsPath) > 0 {
_, err := os.Stat(m.config.IpmisensorsPath)
if err == nil {
m.readIpmiSensors(m.config.IpmisensorsPath, output)
}
}
}
func (m *IpmiCollector) Close() {

View File

@@ -11,6 +11,3 @@
The `ipmistat` collector reads data from `ipmitool` (`ipmitool sensor`) or `ipmi-sensors` (`ipmi-sensors --sdr-cache-recreate --comma-separated-output`).
The metrics depend on the output of the underlying tools but contain temperature, power and energy metrics.

View File

@@ -12,7 +12,6 @@ import (
"encoding/json"
"errors"
"fmt"
"io/ioutil"
"math"
"os"
"os/signal"
@@ -24,10 +23,10 @@ import (
"time"
"unsafe"
cclog "github.com/ClusterCockpit/cc-metric-collector/internal/ccLogger"
lp "github.com/ClusterCockpit/cc-metric-collector/internal/ccMetric"
topo "github.com/ClusterCockpit/cc-metric-collector/internal/ccTopology"
agg "github.com/ClusterCockpit/cc-metric-collector/internal/metricAggregator"
cclog "github.com/ClusterCockpit/cc-metric-collector/pkg/ccLogger"
lp "github.com/ClusterCockpit/cc-metric-collector/pkg/ccMetric"
topo "github.com/ClusterCockpit/cc-metric-collector/pkg/ccTopology"
"github.com/NVIDIA/go-nvml/pkg/dl"
)
@@ -154,12 +153,13 @@ func getBaseFreq() float64 {
}
var freq float64 = math.NaN()
for _, f := range files {
buffer, err := ioutil.ReadFile(f)
buffer, err := os.ReadFile(f)
if err == nil {
data := strings.Replace(string(buffer), "\n", "", -1)
x, err := strconv.ParseInt(data, 0, 64)
if err == nil {
freq = float64(x) * 1e6
freq = float64(x)
break
}
}
}
@@ -168,11 +168,11 @@ func getBaseFreq() float64 {
C.power_init(0)
info := C.get_powerInfo()
if float64(info.baseFrequency) != 0 {
freq = float64(info.baseFrequency) * 1e6
freq = float64(info.baseFrequency)
}
C.power_finalize()
}
return freq
return freq * 1e3
}
func (m *LikwidCollector) Init(config json.RawMessage) error {

View File

@@ -7,6 +7,9 @@ The `likwid` collector is probably the most complicated collector. The LIKWID li
"likwid": {
"force_overwrite" : false,
"invalid_to_zero" : false,
"liblikwid_path" : "/path/to/liblikwid.so",
"accessdaemon_path" : "/folder/that/contains/likwid-accessD",
"access_mode" : "direct or accessdaemon or perf_event",
"eventsets": [
{
"events" : {

View File

@@ -3,13 +3,13 @@ package collectors
import (
"encoding/json"
"fmt"
"io/ioutil"
"os"
"strconv"
"strings"
"time"
cclog "github.com/ClusterCockpit/cc-metric-collector/internal/ccLogger"
lp "github.com/ClusterCockpit/cc-metric-collector/internal/ccMetric"
cclog "github.com/ClusterCockpit/cc-metric-collector/pkg/ccLogger"
lp "github.com/ClusterCockpit/cc-metric-collector/pkg/ccMetric"
)
//
@@ -72,7 +72,7 @@ func (m *LoadavgCollector) Read(interval time.Duration, output chan lp.CCMetric)
if !m.init {
return
}
buffer, err := ioutil.ReadFile(LOADAVGFILE)
buffer, err := os.ReadFile(LOADAVGFILE)
if err != nil {
if err != nil {
cclog.ComponentError(

View File

@@ -10,8 +10,8 @@ import (
"strings"
"time"
cclog "github.com/ClusterCockpit/cc-metric-collector/internal/ccLogger"
lp "github.com/ClusterCockpit/cc-metric-collector/internal/ccMetric"
cclog "github.com/ClusterCockpit/cc-metric-collector/pkg/ccLogger"
lp "github.com/ClusterCockpit/cc-metric-collector/pkg/ccMetric"
)
const LUSTRE_SYSFS = `/sys/fs/lustre`

View File

@@ -12,8 +12,8 @@ import (
"strings"
"time"
cclog "github.com/ClusterCockpit/cc-metric-collector/internal/ccLogger"
lp "github.com/ClusterCockpit/cc-metric-collector/internal/ccMetric"
cclog "github.com/ClusterCockpit/cc-metric-collector/pkg/ccLogger"
lp "github.com/ClusterCockpit/cc-metric-collector/pkg/ccMetric"
)
const MEMSTATFILE = "/proc/meminfo"
@@ -68,7 +68,8 @@ func getStats(filename string) map[string]MemstatStats {
} else if len(linefields) == 5 {
v, err := strconv.ParseFloat(linefields[3], 64)
if err == nil {
stats[strings.Trim(linefields[0], ":")] = MemstatStats{
cclog.ComponentDebug("getStats", strings.Trim(linefields[2], ":"), v, linefields[4])
stats[strings.Trim(linefields[2], ":")] = MemstatStats{
value: v,
unit: linefields[4],
}
@@ -160,7 +161,6 @@ func (m *MemstatCollector) Init(config json.RawMessage) error {
func (m *MemstatCollector) Read(interval time.Duration, output chan lp.CCMetric) {
if !m.init {
cclog.ComponentPrint(m.name, "Here")
return
}
@@ -188,16 +188,20 @@ func (m *MemstatCollector) Read(interval time.Duration, output chan lp.CCMetric)
unit := ""
if totalVal, total := stats["MemTotal"]; total {
if freeVal, free := stats["MemFree"]; free {
if bufVal, buffers := stats["Buffers"]; buffers {
if cacheVal, cached := stats["Cached"]; cached {
memUsed = totalVal.value - (freeVal.value + bufVal.value + cacheVal.value)
memUsed = totalVal.value - freeVal.value
if len(totalVal.unit) > 0 {
unit = totalVal.unit
} else if len(freeVal.unit) > 0 {
unit = freeVal.unit
} else if len(bufVal.unit) > 0 {
}
if bufVal, buffers := stats["Buffers"]; buffers {
memUsed -= bufVal.value
if len(bufVal.unit) > 0 && len(unit) == 0 {
unit = bufVal.unit
} else if len(cacheVal.unit) > 0 {
}
if cacheVal, cached := stats["Cached"]; cached {
memUsed -= cacheVal.value
if len(cacheVal.unit) > 0 && len(unit) == 0 {
unit = cacheVal.unit
}
}

View File

@@ -5,7 +5,7 @@ import (
"fmt"
"time"
lp "github.com/ClusterCockpit/cc-metric-collector/internal/ccMetric"
lp "github.com/ClusterCockpit/cc-metric-collector/pkg/ccMetric"
)
type MetricCollector interface {

View File

@@ -9,8 +9,8 @@ import (
"strings"
"time"
cclog "github.com/ClusterCockpit/cc-metric-collector/internal/ccLogger"
lp "github.com/ClusterCockpit/cc-metric-collector/internal/ccMetric"
cclog "github.com/ClusterCockpit/cc-metric-collector/pkg/ccLogger"
lp "github.com/ClusterCockpit/cc-metric-collector/pkg/ccMetric"
)
const NETSTATFILE = "/proc/net/dev"

View File

@@ -11,7 +11,7 @@ import (
"strings"
"time"
lp "github.com/ClusterCockpit/cc-metric-collector/internal/ccMetric"
lp "github.com/ClusterCockpit/cc-metric-collector/pkg/ccMetric"
)
// First part contains the code for the general NfsCollector.

View File

@@ -10,33 +10,42 @@ import (
"strings"
"time"
cclog "github.com/ClusterCockpit/cc-metric-collector/internal/ccLogger"
lp "github.com/ClusterCockpit/cc-metric-collector/internal/ccMetric"
cclog "github.com/ClusterCockpit/cc-metric-collector/pkg/ccLogger"
lp "github.com/ClusterCockpit/cc-metric-collector/pkg/ccMetric"
)
//
// Numa policy hit/miss statistics
// Non-Uniform Memory Access (NUMA) policy hit/miss statistics
//
// numa_hit:
//
// A process wanted to allocate memory from this node, and succeeded.
//
// numa_miss:
//
// A process wanted to allocate memory from another node,
// but ended up with memory from this node.
//
// numa_foreign:
//
// A process wanted to allocate on this node,
// but ended up with memory from another node.
//
// local_node:
//
// A process ran on this node's CPU,
// and got memory from this node.
//
// other_node:
//
// A process ran on a different node's CPU
// and got memory from this node.
//
// interleave_hit:
//
// Interleaving wanted to allocate from this node
// and succeeded.
//
// See: https://www.kernel.org/doc/html/latest/admin-guide/numastat.html
//
type NUMAStatsCollectorTopolgy struct {
file string
tagSet map[string]string
@@ -82,6 +91,8 @@ func (m *NUMAStatsCollector) Init(config json.RawMessage) error {
})
}
// Initialized
cclog.ComponentDebug(m.name, "initialized", len(m.topology), "NUMA domains")
m.init = true
return nil
}

View File

@@ -1,12 +1,14 @@
## `numastat` collector
```json
"numastat": {}
"numastats": {}
```
The `numastat` collector reads data from `/sys/devices/system/node/node*/numastat` and outputs a handful **memoryDomain** metrics. See: https://www.kernel.org/doc/html/latest/admin-guide/numastat.html
The `numastat` collector reads data from `/sys/devices/system/node/node*/numastat` and outputs a handful **memoryDomain** metrics. See: <https://www.kernel.org/doc/html/latest/admin-guide/numastat.html>
Metrics:
* `numastats_numa_hit`: A process wanted to allocate memory from this node, and succeeded.
* `numastats_numa_miss`: A process wanted to allocate memory from another node, but ended up with memory from this node.
* `numastats_numa_foreign`: A process wanted to allocate on this node, but ended up with memory from another node.

View File

@@ -8,8 +8,8 @@ import (
"strings"
"time"
cclog "github.com/ClusterCockpit/cc-metric-collector/internal/ccLogger"
lp "github.com/ClusterCockpit/cc-metric-collector/internal/ccMetric"
cclog "github.com/ClusterCockpit/cc-metric-collector/pkg/ccLogger"
lp "github.com/ClusterCockpit/cc-metric-collector/pkg/ccMetric"
"github.com/NVIDIA/go-nvml/pkg/nvml"
)

262
collectors/raplMetric.go Normal file
View File

@@ -0,0 +1,262 @@
package collectors
import (
"encoding/json"
"fmt"
"os"
"path/filepath"
"strconv"
"strings"
"time"
cclog "github.com/ClusterCockpit/cc-metric-collector/pkg/ccLogger"
lp "github.com/ClusterCockpit/cc-metric-collector/pkg/ccMetric"
)
// running average power limit (RAPL) monitoring attributes for a zone
type RAPLZoneInfo struct {
// tags describing the RAPL zone:
// * zone_name, subzone_name: e.g. psys, dram, core, uncore, package-0
// * zone_id: e.g. 0:1 (zone 0 sub zone 1)
tags map[string]string
energyFilepath string // path to a file containing the zones current energy counter in micro joules
energy int64 // current reading of the energy counter in micro joules
energyTimestamp time.Time // timestamp when energy counter was read
maxEnergyRange int64 // Range of the above energy counter in micro-joules
}
type RAPLCollector struct {
metricCollector
config struct {
// Exclude IDs for RAPL zones, e.g.
// * 0 for zone 0
// * 0:1 for zone 0 subzone 1
ExcludeByID []string `json:"exclude_device_by_id,omitempty"`
// Exclude names for RAPL zones, e.g. psys, dram, core, uncore, package-0
ExcludeByName []string `json:"exclude_device_by_name,omitempty"`
}
RAPLZoneInfo []RAPLZoneInfo
meta map[string]string // default meta information
}
// Init initializes the running average power limit (RAPL) collector
func (m *RAPLCollector) Init(config json.RawMessage) error {
// Check if already initialized
if m.init {
return nil
}
var err error = nil
m.name = "RAPLCollector"
m.setup()
m.parallel = true
m.meta = map[string]string{
"source": m.name,
"group": "energy",
"unit": "Watt",
}
// Read in the JSON configuration
if len(config) > 0 {
err = json.Unmarshal(config, &m.config)
if err != nil {
cclog.ComponentError(m.name, "Error reading config:", err.Error())
return err
}
}
// Configure excluded RAPL zones
isIDExcluded := make(map[string]bool)
if m.config.ExcludeByID != nil {
for _, ID := range m.config.ExcludeByID {
isIDExcluded[ID] = true
}
}
isNameExcluded := make(map[string]bool)
if m.config.ExcludeByName != nil {
for _, name := range m.config.ExcludeByName {
isNameExcluded[name] = true
}
}
// readZoneInfo reads RAPL monitoring attributes for a zone given by zonePath
// See: https://www.kernel.org/doc/html/latest/power/powercap/powercap.html#monitoring-attributes
readZoneInfo := func(zonePath string) (z struct {
name string // zones name e.g. psys, dram, core, uncore, package-0
energyFilepath string // path to a file containing the zones current energy counter in micro joules
energy int64 // current reading of the energy counter in micro joules
energyTimestamp time.Time // timestamp when energy counter was read
maxEnergyRange int64 // Range of the above energy counter in micro-joules
ok bool // Are all information available?
}) {
// zones name e.g. psys, dram, core, uncore, package-0
foundName := false
if v, err :=
os.ReadFile(
filepath.Join(zonePath, "name")); err == nil {
foundName = true
z.name = strings.TrimSpace(string(v))
}
// path to a file containing the zones current energy counter in micro joules
z.energyFilepath = filepath.Join(zonePath, "energy_uj")
// current reading of the energy counter in micro joules
foundEnergy := false
if v, err := os.ReadFile(z.energyFilepath); err == nil {
// timestamp when energy counter was read
z.energyTimestamp = time.Now()
if i, err := strconv.ParseInt(strings.TrimSpace(string(v)), 10, 64); err == nil {
foundEnergy = true
z.energy = i
}
}
// Range of the above energy counter in micro-joules
foundMaxEnergyRange := false
if v, err :=
os.ReadFile(
filepath.Join(zonePath, "max_energy_range_uj")); err == nil {
if i, err := strconv.ParseInt(strings.TrimSpace(string(v)), 10, 64); err == nil {
foundMaxEnergyRange = true
z.maxEnergyRange = i
}
}
// Are all information available?
z.ok = foundName && foundEnergy && foundMaxEnergyRange
return
}
powerCapPrefix := "/sys/devices/virtual/powercap"
controlType := "intel-rapl"
controlTypePath := filepath.Join(powerCapPrefix, controlType)
// Find all RAPL zones
zonePrefix := filepath.Join(controlTypePath, controlType+":")
zonesPath, err := filepath.Glob(zonePrefix + "*")
if err != nil || zonesPath == nil {
return fmt.Errorf("unable to find any zones under %s", controlTypePath)
}
for _, zonePath := range zonesPath {
zoneID := strings.TrimPrefix(zonePath, zonePrefix)
z := readZoneInfo(zonePath)
if z.ok &&
!isIDExcluded[zoneID] &&
!isNameExcluded[z.name] {
// Add RAPL monitoring attributes for a zone
m.RAPLZoneInfo =
append(
m.RAPLZoneInfo,
RAPLZoneInfo{
tags: map[string]string{
"id": zoneID,
"zone_name": z.name,
},
energyFilepath: z.energyFilepath,
energy: z.energy,
energyTimestamp: z.energyTimestamp,
maxEnergyRange: z.maxEnergyRange,
})
}
// find all sub zones for the given zone
subZonePrefix := filepath.Join(zonePath, controlType+":"+zoneID+":")
subZonesPath, err := filepath.Glob(subZonePrefix + "*")
if err != nil || subZonesPath == nil {
continue
}
for _, subZonePath := range subZonesPath {
subZoneID := strings.TrimPrefix(subZonePath, subZonePrefix)
sz := readZoneInfo(subZonePath)
if len(zoneID) > 0 && len(z.name) > 0 &&
sz.ok &&
!isIDExcluded[zoneID+":"+subZoneID] &&
!isNameExcluded[sz.name] {
m.RAPLZoneInfo =
append(
m.RAPLZoneInfo,
RAPLZoneInfo{
tags: map[string]string{
"id": zoneID + ":" + subZoneID,
"zone_name": z.name,
"sub_zone_name": sz.name,
},
energyFilepath: sz.energyFilepath,
energy: sz.energy,
energyTimestamp: sz.energyTimestamp,
maxEnergyRange: sz.maxEnergyRange,
})
}
}
}
if m.RAPLZoneInfo == nil {
return fmt.Errorf("no running average power limit (RAPL) device found in %s", controlTypePath)
}
// Initialized
cclog.ComponentDebug(
m.name,
"initialized",
len(m.RAPLZoneInfo),
"zones with running average power limit (RAPL) monitoring attributes")
m.init = true
return err
}
// Read reads running average power limit (RAPL) monitoring attributes for all initialized zones
// See: https://www.kernel.org/doc/html/latest/power/powercap/powercap.html#monitoring-attributes
func (m *RAPLCollector) Read(interval time.Duration, output chan lp.CCMetric) {
for i := range m.RAPLZoneInfo {
p := &m.RAPLZoneInfo[i]
// Read current value of the energy counter in micro joules
if v, err := os.ReadFile(p.energyFilepath); err == nil {
energyTimestamp := time.Now()
if i, err := strconv.ParseInt(strings.TrimSpace(string(v)), 10, 64); err == nil {
energy := i
// Compute average power (Δ energy / Δ time)
energyDiff := energy - p.energy
if energyDiff < 0 {
// Handle overflow:
// ( p.maxEnergyRange - p.energy ) + energy
// = p.maxEnergyRange + ( energy - p.energy )
// = p.maxEnergyRange + diffEnergy
energyDiff += p.maxEnergyRange
}
timeDiff := energyTimestamp.Sub(p.energyTimestamp)
averagePower := float64(energyDiff) / float64(timeDiff.Microseconds())
y, err := lp.New(
"rapl_average_power",
p.tags,
m.meta,
map[string]interface{}{"value": averagePower},
energyTimestamp)
if err == nil {
output <- y
}
// Save current energy counter state
p.energy = energy
p.energyTimestamp = energyTimestamp
}
}
}
}
// Close closes running average power limit (RAPL) metric collector
func (m *RAPLCollector) Close() {
// Unset flag
m.init = false
}

18
collectors/raplMetric.md Normal file
View File

@@ -0,0 +1,18 @@
# Running average power limit (RAPL) metric collector
This collector reads running average power limit (RAPL) monitoring attributes to compute average power consumption metrics. See <https://www.kernel.org/doc/html/latest/power/powercap/powercap.html#monitoring-attributes>.
The Likwid metric collector provides similar functionality.
## Configuration
```json
"rapl": {
"exclude_device_by_id": ["0:1", "0:2"],
"exclude_device_by_name": ["psys"]
}
```
## Metrics
* `rapl_average_power`: average power consumption in Watt. The average is computed over the entire runtime from the last measurement to the current measurement

View File

@@ -6,8 +6,8 @@ import (
"fmt"
"time"
cclog "github.com/ClusterCockpit/cc-metric-collector/internal/ccLogger"
lp "github.com/ClusterCockpit/cc-metric-collector/internal/ccMetric"
cclog "github.com/ClusterCockpit/cc-metric-collector/pkg/ccLogger"
lp "github.com/ClusterCockpit/cc-metric-collector/pkg/ccMetric"
"github.com/ClusterCockpit/go-rocm-smi/pkg/rocm_smi"
)
@@ -66,14 +66,14 @@ func (m *RocmSmiCollector) Init(config json.RawMessage) error {
ret := rocm_smi.Init()
if ret != rocm_smi.STATUS_SUCCESS {
err = errors.New("Failed to initialize ROCm SMI library")
err = errors.New("failed to initialize ROCm SMI library")
cclog.ComponentError(m.name, err.Error())
return err
}
numDevs, ret := rocm_smi.NumMonitorDevices()
if ret != rocm_smi.STATUS_SUCCESS {
err = errors.New("Failed to get number of GPUs from ROCm SMI library")
err = errors.New("failed to get number of GPUs from ROCm SMI library")
cclog.ComponentError(m.name, err.Error())
return err
}
@@ -98,14 +98,14 @@ func (m *RocmSmiCollector) Init(config json.RawMessage) error {
}
device, ret := rocm_smi.DeviceGetHandleByIndex(i)
if ret != rocm_smi.STATUS_SUCCESS {
err = fmt.Errorf("Failed to get handle for GPU %d", i)
err = fmt.Errorf("failed to get handle for GPU %d", i)
cclog.ComponentError(m.name, err.Error())
return err
}
pciInfo, ret := rocm_smi.DeviceGetPciInfo(device)
if ret != rocm_smi.STATUS_SUCCESS {
err = fmt.Errorf("Failed to get PCI information for GPU %d", i)
err = fmt.Errorf("failed to get PCI information for GPU %d", i)
cclog.ComponentError(m.name, err.Error())
return err
}

View File

@@ -4,8 +4,8 @@ import (
"encoding/json"
"time"
cclog "github.com/ClusterCockpit/cc-metric-collector/internal/ccLogger"
lp "github.com/ClusterCockpit/cc-metric-collector/internal/ccMetric"
cclog "github.com/ClusterCockpit/cc-metric-collector/pkg/ccLogger"
lp "github.com/ClusterCockpit/cc-metric-collector/pkg/ccMetric"
)
// These are the fields we read from the JSON configuration
@@ -17,7 +17,7 @@ type SampleCollectorConfig struct {
// defined by metricCollector (name, init, ...)
type SampleCollector struct {
metricCollector
config SampleTimerCollectorConfig // the configuration structure
config SampleCollectorConfig // the configuration structure
meta map[string]string // default meta information
tags map[string]string // default tags
}
@@ -36,14 +36,14 @@ func (m *SampleCollector) Init(config json.RawMessage) error {
// This is for later use, also call it early
m.setup()
// Tell whether the collector should be run in parallel with others (reading files, ...)
// or it should be run serially, mostly for collectors acutally doing measurements
// or it should be run serially, mostly for collectors actually doing measurements
// because they should not measure the execution of the other collectors
m.parallel = true
// Define meta information sent with each metric
// (Can also be dynamic or this is the basic set with extension through AddMeta())
m.meta = map[string]string{"source": m.name, "group": "SAMPLE"}
// Define tags sent with each metric
// The 'type' tag is always needed, it defines the granulatity of the metric
// The 'type' tag is always needed, it defines the granularity of the metric
// node -> whole system
// socket -> CPU socket (requires socket ID as 'type-id' tag)
// die -> CPU die (requires CPU die ID as 'type-id' tag)

View File

@@ -5,8 +5,8 @@ import (
"sync"
"time"
cclog "github.com/ClusterCockpit/cc-metric-collector/internal/ccLogger"
lp "github.com/ClusterCockpit/cc-metric-collector/internal/ccMetric"
cclog "github.com/ClusterCockpit/cc-metric-collector/pkg/ccLogger"
lp "github.com/ClusterCockpit/cc-metric-collector/pkg/ccMetric"
)
// These are the fields we read from the JSON configuration
@@ -38,7 +38,7 @@ func (m *SampleTimerCollector) Init(name string, config json.RawMessage) error {
// (Can also be dynamic or this is the basic set with extension through AddMeta())
m.meta = map[string]string{"source": m.name, "group": "SAMPLE"}
// Define tags sent with each metric
// The 'type' tag is always needed, it defines the granulatity of the metric
// The 'type' tag is always needed, it defines the granularity of the metric
// node -> whole system
// socket -> CPU socket (requires socket ID as 'type-id' tag)
// cpu -> single CPU hardware thread (requires cpu ID as 'type-id' tag)
@@ -60,7 +60,7 @@ func (m *SampleTimerCollector) Init(name string, config json.RawMessage) error {
// Storage for output channel
m.output = nil
// Mangement channel for the timer function.
// Management channel for the timer function.
m.done = make(chan bool)
// Create the own ticker
m.ticker = time.NewTicker(m.interval)
@@ -94,7 +94,7 @@ func (m *SampleTimerCollector) ReadMetrics(timestamp time.Time) {
value := 1.0
// If you want to measure something for a specific amout of time, use interval
// If you want to measure something for a specific amount of time, use interval
// start := readState()
// time.Sleep(interval)
// stop := readState()

View File

@@ -1,17 +1,17 @@
package collectors
import (
"bufio"
"encoding/json"
"fmt"
"bufio"
"time"
"os"
"strings"
"strconv"
"math"
"os"
"strconv"
"strings"
"time"
cclog "github.com/ClusterCockpit/cc-metric-collector/internal/ccLogger"
lp "github.com/ClusterCockpit/cc-metric-collector/internal/ccMetric"
cclog "github.com/ClusterCockpit/cc-metric-collector/pkg/ccLogger"
lp "github.com/ClusterCockpit/cc-metric-collector/pkg/ccMetric"
)
const SCHEDSTATFILE = `/proc/schedstat`
@@ -83,12 +83,11 @@ func (m *SchedstatCollector) Init(config json.RawMessage) error {
running, _ := strconv.ParseInt(linefields[7], 10, 64)
waiting, _ := strconv.ParseInt(linefields[8], 10, 64)
m.cputags[linefields[0]] = map[string]string{"type": "hwthread", "type-id": fmt.Sprintf("%d", cpu)}
m.olddata[linefields[0]] = map[string]int64{"running" : running, "waiting" : waiting}
m.olddata[linefields[0]] = map[string]int64{"running": running, "waiting": waiting}
num_cpus++
}
}
// Save current timestamp
m.lastTimestamp = time.Now()

144
collectors/selfMetric.go Normal file
View File

@@ -0,0 +1,144 @@
package collectors
import (
"encoding/json"
"runtime"
"syscall"
"time"
cclog "github.com/ClusterCockpit/cc-metric-collector/pkg/ccLogger"
lp "github.com/ClusterCockpit/cc-metric-collector/pkg/ccMetric"
)
type SelfCollectorConfig struct {
MemStats bool `json:"read_mem_stats"`
GoRoutines bool `json:"read_goroutines"`
CgoCalls bool `json:"read_cgo_calls"`
Rusage bool `json:"read_rusage"`
}
type SelfCollector struct {
metricCollector
config SelfCollectorConfig // the configuration structure
meta map[string]string // default meta information
tags map[string]string // default tags
}
func (m *SelfCollector) Init(config json.RawMessage) error {
var err error = nil
m.name = "SelfCollector"
m.setup()
m.parallel = true
m.meta = map[string]string{"source": m.name, "group": "Self"}
m.tags = map[string]string{"type": "node"}
if len(config) > 0 {
err = json.Unmarshal(config, &m.config)
if err != nil {
cclog.ComponentError(m.name, "Error reading config:", err.Error())
return err
}
}
m.init = true
return err
}
func (m *SelfCollector) Read(interval time.Duration, output chan lp.CCMetric) {
timestamp := time.Now()
if m.config.MemStats {
var memstats runtime.MemStats
runtime.ReadMemStats(&memstats)
y, err := lp.New("total_alloc", m.tags, m.meta, map[string]interface{}{"value": memstats.TotalAlloc}, timestamp)
if err == nil {
y.AddMeta("unit", "Bytes")
output <- y
}
y, err = lp.New("heap_alloc", m.tags, m.meta, map[string]interface{}{"value": memstats.HeapAlloc}, timestamp)
if err == nil {
y.AddMeta("unit", "Bytes")
output <- y
}
y, err = lp.New("heap_sys", m.tags, m.meta, map[string]interface{}{"value": memstats.HeapSys}, timestamp)
if err == nil {
y.AddMeta("unit", "Bytes")
output <- y
}
y, err = lp.New("heap_idle", m.tags, m.meta, map[string]interface{}{"value": memstats.HeapIdle}, timestamp)
if err == nil {
y.AddMeta("unit", "Bytes")
output <- y
}
y, err = lp.New("heap_inuse", m.tags, m.meta, map[string]interface{}{"value": memstats.HeapInuse}, timestamp)
if err == nil {
y.AddMeta("unit", "Bytes")
output <- y
}
y, err = lp.New("heap_released", m.tags, m.meta, map[string]interface{}{"value": memstats.HeapReleased}, timestamp)
if err == nil {
y.AddMeta("unit", "Bytes")
output <- y
}
y, err = lp.New("heap_objects", m.tags, m.meta, map[string]interface{}{"value": memstats.HeapObjects}, timestamp)
if err == nil {
output <- y
}
}
if m.config.GoRoutines {
y, err := lp.New("num_goroutines", m.tags, m.meta, map[string]interface{}{"value": runtime.NumGoroutine()}, timestamp)
if err == nil {
output <- y
}
}
if m.config.CgoCalls {
y, err := lp.New("num_cgo_calls", m.tags, m.meta, map[string]interface{}{"value": runtime.NumCgoCall()}, timestamp)
if err == nil {
output <- y
}
}
if m.config.Rusage {
var rusage syscall.Rusage
err := syscall.Getrusage(syscall.RUSAGE_SELF, &rusage)
if err == nil {
sec, nsec := rusage.Utime.Unix()
t := float64(sec) + (float64(nsec) * 1e-9)
y, err := lp.New("rusage_user_time", m.tags, m.meta, map[string]interface{}{"value": t}, timestamp)
if err == nil {
y.AddMeta("unit", "seconds")
output <- y
}
sec, nsec = rusage.Stime.Unix()
t = float64(sec) + (float64(nsec) * 1e-9)
y, err = lp.New("rusage_system_time", m.tags, m.meta, map[string]interface{}{"value": t}, timestamp)
if err == nil {
y.AddMeta("unit", "seconds")
output <- y
}
y, err = lp.New("rusage_vol_ctx_switch", m.tags, m.meta, map[string]interface{}{"value": rusage.Nvcsw}, timestamp)
if err == nil {
output <- y
}
y, err = lp.New("rusage_invol_ctx_switch", m.tags, m.meta, map[string]interface{}{"value": rusage.Nivcsw}, timestamp)
if err == nil {
output <- y
}
y, err = lp.New("rusage_signals", m.tags, m.meta, map[string]interface{}{"value": rusage.Nsignals}, timestamp)
if err == nil {
output <- y
}
y, err = lp.New("rusage_major_pgfaults", m.tags, m.meta, map[string]interface{}{"value": rusage.Majflt}, timestamp)
if err == nil {
output <- y
}
y, err = lp.New("rusage_minor_pgfaults", m.tags, m.meta, map[string]interface{}{"value": rusage.Minflt}, timestamp)
if err == nil {
output <- y
}
}
}
}
func (m *SelfCollector) Close() {
m.init = false
}

34
collectors/selfMetric.md Normal file
View File

@@ -0,0 +1,34 @@
## `self` collector
```json
"self": {
"read_mem_stats" : true,
"read_goroutines" : true,
"read_cgo_calls" : true,
"read_rusage" : true
}
```
The `self` collector reads the data from the `runtime` and `syscall` packages, so monitors the execution of the cc-metric-collector itself.
Metrics:
* If `read_mem_stats == true`:
* `total_alloc`: The metric reports cumulative bytes allocated for heap objects.
* `heap_alloc`: The metric reports bytes of allocated heap objects.
* `heap_sys`: The metric reports bytes of heap memory obtained from the OS.
* `heap_idle`: The metric reports bytes in idle (unused) spans.
* `heap_inuse`: The metric reports bytes in in-use spans.
* `heap_released`: The metric reports bytes of physical memory returned to the OS.
* `heap_objects`: The metric reports the number of allocated heap objects.
* If `read_goroutines == true`:
* `num_goroutines`: The metric reports the number of goroutines that currently exist.
* If `read_cgo_calls == true`:
* `num_cgo_calls`: The metric reports the number of cgo calls made by the current process.
* If `read_rusage == true`:
* `rusage_user_time`: The metric reports the amount of time that this process has been scheduled in user mode.
* `rusage_system_time`: The metric reports the amount of time that this process has been scheduled in kernel mode.
* `rusage_vol_ctx_switch`: The metric reports the amount of voluntary context switches.
* `rusage_invol_ctx_switch`: The metric reports the amount of involuntary context switches.
* `rusage_signals`: The metric reports the number of signals received.
* `rusage_major_pgfaults`: The metric reports the number of major faults the process has made which have required loading a memory page from disk.
* `rusage_minor_pgfaults`: The metric reports the number of minor faults the process has made which have not required loading a memory page from disk.

View File

@@ -3,14 +3,14 @@ package collectors
import (
"encoding/json"
"fmt"
"io/ioutil"
"os"
"path/filepath"
"strconv"
"strings"
"time"
cclog "github.com/ClusterCockpit/cc-metric-collector/internal/ccLogger"
lp "github.com/ClusterCockpit/cc-metric-collector/internal/ccMetric"
cclog "github.com/ClusterCockpit/cc-metric-collector/pkg/ccLogger"
lp "github.com/ClusterCockpit/cc-metric-collector/pkg/ccMetric"
)
// See: https://www.kernel.org/doc/html/latest/hwmon/sysfs-interface.html
@@ -83,14 +83,14 @@ func (m *TempCollector) Init(config json.RawMessage) error {
// sensor name
nameFile := filepath.Join(filepath.Dir(file), "name")
name, err := ioutil.ReadFile(nameFile)
name, err := os.ReadFile(nameFile)
if err == nil {
sensor.name = strings.TrimSpace(string(name))
}
// sensor label
labelFile := strings.TrimSuffix(file, "_input") + "_label"
label, err := ioutil.ReadFile(labelFile)
label, err := os.ReadFile(labelFile)
if err == nil {
sensor.label = strings.TrimSpace(string(label))
}
@@ -117,7 +117,7 @@ func (m *TempCollector) Init(config json.RawMessage) error {
}
// Sensor file
_, err = ioutil.ReadFile(file)
_, err = os.ReadFile(file)
if err != nil {
continue
}
@@ -139,7 +139,7 @@ func (m *TempCollector) Init(config json.RawMessage) error {
// max temperature
if m.config.ReportMaxTemp {
maxTempFile := strings.TrimSuffix(file, "_input") + "_max"
if buffer, err := ioutil.ReadFile(maxTempFile); err == nil {
if buffer, err := os.ReadFile(maxTempFile); err == nil {
if x, err := strconv.ParseInt(strings.TrimSpace(string(buffer)), 10, 64); err == nil {
sensor.maxTempName = strings.Replace(sensor.metricName, "temp", "max_temp", 1)
sensor.maxTemp = x / 1000
@@ -150,7 +150,7 @@ func (m *TempCollector) Init(config json.RawMessage) error {
// critical temperature
if m.config.ReportCriticalTemp {
criticalTempFile := strings.TrimSuffix(file, "_input") + "_crit"
if buffer, err := ioutil.ReadFile(criticalTempFile); err == nil {
if buffer, err := os.ReadFile(criticalTempFile); err == nil {
if x, err := strconv.ParseInt(strings.TrimSpace(string(buffer)), 10, 64); err == nil {
sensor.critTempName = strings.Replace(sensor.metricName, "temp", "crit_temp", 1)
sensor.critTemp = x / 1000
@@ -175,7 +175,7 @@ func (m *TempCollector) Read(interval time.Duration, output chan lp.CCMetric) {
for _, sensor := range m.sensors {
// Read sensor file
buffer, err := ioutil.ReadFile(sensor.file)
buffer, err := os.ReadFile(sensor.file)
if err != nil {
cclog.ComponentError(
m.name,

View File

@@ -9,7 +9,7 @@ import (
"strings"
"time"
lp "github.com/ClusterCockpit/cc-metric-collector/internal/ccMetric"
lp "github.com/ClusterCockpit/cc-metric-collector/pkg/ccMetric"
)
const MAX_NUM_PROCS = 10

28
go.mod
View File

@@ -6,33 +6,35 @@ require (
github.com/ClusterCockpit/cc-units v0.3.0
github.com/ClusterCockpit/go-rocm-smi v0.3.0
github.com/NVIDIA/go-nvml v0.11.6-0
github.com/PaesslerAG/gval v1.2.0
github.com/PaesslerAG/gval v1.2.1
github.com/gorilla/mux v1.8.0
github.com/influxdata/influxdb-client-go/v2 v2.9.1
github.com/influxdata/influxdb-client-go/v2 v2.12.0
github.com/influxdata/line-protocol v0.0.0-20210922203350-b1ad95c89adf
github.com/nats-io/nats.go v1.16.0
github.com/prometheus/client_golang v1.12.2
github.com/nats-io/nats.go v1.20.0
github.com/prometheus/client_golang v1.14.0
github.com/stmcginnis/gofish v0.13.0
golang.org/x/sys v0.0.0-20220712014510-0a85c31ab51e
github.com/tklauser/go-sysconf v0.3.11
golang.org/x/sys v0.2.0
)
require (
github.com/apapsch/go-jsonmerge/v2 v2.0.0 // indirect
github.com/beorn7/perks v1.0.1 // indirect
github.com/cespare/xxhash/v2 v2.1.2 // indirect
github.com/deepmap/oapi-codegen v1.11.0 // indirect
github.com/deepmap/oapi-codegen v1.12.3 // indirect
github.com/golang/protobuf v1.5.2 // indirect
github.com/google/uuid v1.3.0 // indirect
github.com/matttproud/golang_protobuf_extensions v1.0.1 // indirect
github.com/matttproud/golang_protobuf_extensions v1.0.4 // indirect
github.com/nats-io/nats-server/v2 v2.8.4 // indirect
github.com/nats-io/nkeys v0.3.0 // indirect
github.com/nats-io/nuid v1.0.1 // indirect
github.com/pkg/errors v0.9.1 // indirect
github.com/prometheus/client_model v0.2.0 // indirect
github.com/prometheus/client_model v0.3.0 // indirect
github.com/prometheus/common v0.37.0 // indirect
github.com/prometheus/procfs v0.7.3 // indirect
github.com/prometheus/procfs v0.8.0 // indirect
github.com/shopspring/decimal v1.3.1 // indirect
golang.org/x/crypto v0.0.0-20220622213112-05595931fe9d // indirect
golang.org/x/net v0.0.0-20220708220712-1185a9018129 // indirect
google.golang.org/protobuf v1.28.0 // indirect
gopkg.in/yaml.v2 v2.4.0 // indirect
github.com/tklauser/numcpus v0.6.0 // indirect
golang.org/x/crypto v0.3.0 // indirect
golang.org/x/net v0.2.0 // indirect
google.golang.org/protobuf v1.28.1 // indirect
)

5
go.sum
View File

@@ -287,6 +287,10 @@ github.com/stretchr/testify v1.6.1/go.mod h1:6Fq8oRcR53rry900zMqJjRRixrwX3KX962/
github.com/stretchr/testify v1.7.0/go.mod h1:6Fq8oRcR53rry900zMqJjRRixrwX3KX962/h/Wwjteg=
github.com/stretchr/testify v1.7.1 h1:5TQK59W5E3v0r2duFAb7P95B6hEeOyEnHRa8MjYSMTY=
github.com/stretchr/testify v1.7.1/go.mod h1:6Fq8oRcR53rry900zMqJjRRixrwX3KX962/h/Wwjteg=
github.com/tklauser/go-sysconf v0.3.10 h1:IJ1AZGZRWbY8T5Vfk04D9WOA5WSejdflXxP03OUqALw=
github.com/tklauser/go-sysconf v0.3.10/go.mod h1:C8XykCvCb+Gn0oNCWPIlcb0RuglQTYaQ2hGm7jmxEFk=
github.com/tklauser/numcpus v0.4.0 h1:E53Dm1HjH1/R2/aoCtXtPgzmElmn51aOkhCFSuZq//o=
github.com/tklauser/numcpus v0.4.0/go.mod h1:1+UI3pD8NW14VMwdgJNJ1ESk2UnwhAnz5hMwiKKqXCQ=
github.com/ugorji/go v1.1.7/go.mod h1:kZn38zHttfInRq0xu/PH0az30d+z6vm202qpg1oXVMw=
github.com/ugorji/go v1.2.7/go.mod h1:nF9osbDWLy6bDVv/Rtoh6QgnvNDpmCalQV5urGCCS6M=
github.com/ugorji/go/codec v1.1.7/go.mod h1:Ax+UKWsSmolVDwsd+7N3ZtXu+yMGCf907BLYF3GoBXY=
@@ -445,6 +449,7 @@ golang.org/x/sys v0.0.0-20211019181941-9d821ace8654/go.mod h1:oPkhp1MJrh7nUepCBc
golang.org/x/sys v0.0.0-20211103235746-7861aae1554b/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
golang.org/x/sys v0.0.0-20211216021012-1d35b9e2eb4e/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
golang.org/x/sys v0.0.0-20220114195835-da31bd327af9/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
golang.org/x/sys v0.0.0-20220128215802-99c3d69c2c27/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
golang.org/x/sys v0.0.0-20220513210249-45d2b4557a2a/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
golang.org/x/sys v0.0.0-20220712014510-0a85c31ab51e h1:NHvCuwuS43lGnYhten69ZWqi2QOj/CiDNcKbVqwVoew=
golang.org/x/sys v0.0.0-20220712014510-0a85c31ab51e/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=

View File

@@ -9,10 +9,10 @@ import (
"sync"
"time"
cclog "github.com/ClusterCockpit/cc-metric-collector/internal/ccLogger"
cclog "github.com/ClusterCockpit/cc-metric-collector/pkg/ccLogger"
lp "github.com/ClusterCockpit/cc-metric-collector/internal/ccMetric"
topo "github.com/ClusterCockpit/cc-metric-collector/internal/ccTopology"
lp "github.com/ClusterCockpit/cc-metric-collector/pkg/ccMetric"
topo "github.com/ClusterCockpit/cc-metric-collector/pkg/ccTopology"
"github.com/PaesslerAG/gval"
)

View File

@@ -8,8 +8,8 @@ import (
"sort"
"strings"
cclog "github.com/ClusterCockpit/cc-metric-collector/internal/ccLogger"
topo "github.com/ClusterCockpit/cc-metric-collector/internal/ccTopology"
cclog "github.com/ClusterCockpit/cc-metric-collector/pkg/ccLogger"
topo "github.com/ClusterCockpit/cc-metric-collector/pkg/ccTopology"
)
/*

View File

@@ -4,11 +4,11 @@ import (
"sync"
"time"
cclog "github.com/ClusterCockpit/cc-metric-collector/internal/ccLogger"
cclog "github.com/ClusterCockpit/cc-metric-collector/pkg/ccLogger"
lp "github.com/ClusterCockpit/cc-metric-collector/internal/ccMetric"
agg "github.com/ClusterCockpit/cc-metric-collector/internal/metricAggregator"
mct "github.com/ClusterCockpit/cc-metric-collector/internal/multiChanTicker"
lp "github.com/ClusterCockpit/cc-metric-collector/pkg/ccMetric"
mct "github.com/ClusterCockpit/cc-metric-collector/pkg/multiChanTicker"
)
type metricCachePeriod struct {

View File

@@ -7,11 +7,11 @@ import (
"sync"
"time"
cclog "github.com/ClusterCockpit/cc-metric-collector/internal/ccLogger"
cclog "github.com/ClusterCockpit/cc-metric-collector/pkg/ccLogger"
lp "github.com/ClusterCockpit/cc-metric-collector/internal/ccMetric"
agg "github.com/ClusterCockpit/cc-metric-collector/internal/metricAggregator"
mct "github.com/ClusterCockpit/cc-metric-collector/internal/multiChanTicker"
lp "github.com/ClusterCockpit/cc-metric-collector/pkg/ccMetric"
mct "github.com/ClusterCockpit/cc-metric-collector/pkg/multiChanTicker"
units "github.com/ClusterCockpit/cc-units"
)

View File

@@ -10,7 +10,7 @@ import (
"strconv"
"strings"
cclogger "github.com/ClusterCockpit/cc-metric-collector/internal/ccLogger"
cclogger "github.com/ClusterCockpit/cc-metric-collector/pkg/ccLogger"
)
const SYSFS_NUMABASE = `/sys/devices/system/node`

View File

@@ -3,7 +3,7 @@ package multiChanTicker
import (
"time"
cclog "github.com/ClusterCockpit/cc-metric-collector/internal/ccLogger"
cclog "github.com/ClusterCockpit/cc-metric-collector/pkg/ccLogger"
)
type multiChanTicker struct {

View File

@@ -1,8 +1,8 @@
{
"natsrecv" : {
"natsrecv": {
"type": "nats",
"address": "nats://my-url",
"port" : "4222",
"port": "4222",
"database": "testcluster"
},
"redfish_recv": {
@@ -21,5 +21,30 @@
"endpoint": "https://my-endpoint-2"
}
]
},
"ipmi_recv": {
"type": "ipmi",
"exclude_metrics": [
"fan_speed",
"voltage"
],
"client_config": [
{
"username": "username-1",
"password": "password-1",
"endpoint": "ipmi-sensors://my-endpoint-1",
"host_list": [
"my-host-1"
]
},
{
"username": "username-2",
"password": "password-2",
"endpoint": "ipmi-sensors://my-endpoint-2",
"host_list": [
"my-host-2"
]
}
]
}
}

View File

@@ -0,0 +1,164 @@
package receivers
import (
"bufio"
"encoding/json"
"fmt"
"net"
"os"
"sync"
cclog "github.com/ClusterCockpit/cc-metric-collector/pkg/ccLogger"
lp "github.com/ClusterCockpit/cc-metric-collector/pkg/ccMetric"
influx "github.com/influxdata/line-protocol"
)
// SampleReceiver configuration: receiver type, listen address, port
type AppMetricReceiverConfig struct {
Type string `json:"type"`
SocketFile string `json:"socket_file"`
}
type AppMetricReceiver struct {
receiver
config AppMetricReceiverConfig
// Storage for static information
meta map[string]string
// Use in case of own go routine
done chan bool
wg sync.WaitGroup
// Influx stuff
handler *influx.MetricHandler
parser *influx.Parser
// WaitGroup for individual connections
connWg sync.WaitGroup
listener net.Listener
}
func (r *AppMetricReceiver) newConnection(conn net.Conn) {
//defer conn.Close()
//defer wg.Done()
buffer, err := bufio.NewReader(conn).ReadBytes('\n')
if err != nil {
conn.Close()
return
}
metrics, err := r.parser.Parse(buffer)
if err != nil {
cclog.ComponentError(r.name, "failed to parse received metrics")
return
}
for _, m := range metrics {
y := lp.FromInfluxMetric(m)
for k, v := range r.meta {
y.AddMeta(k, v)
}
if r.sink != nil {
r.sink <- y
}
}
r.newConnection(conn)
}
func (r *AppMetricReceiver) newAccepter(listenSocket net.Listener) {
accept_loop:
for {
select {
case <-r.done:
break accept_loop
default:
conn, err := listenSocket.Accept()
if err == nil {
r.connWg.Add(1)
go func() {
r.newConnection(conn)
r.connWg.Done()
}()
}
}
}
r.wg.Done()
}
// Implement functions required for Receiver interface
// Start(), Close()
// See: metricReceiver.go
func (r *AppMetricReceiver) Start() {
var err error = nil
cclog.ComponentDebug(r.name, "START")
r.listener, err = net.Listen("unix", r.config.SocketFile)
if err != nil {
cclog.ComponentError(r.name, "failed to listen at socket", r.config.SocketFile)
}
if _, err := os.Stat(r.config.SocketFile); err != nil {
cclog.ComponentError(r.name, "failed to create socket", r.config.SocketFile)
}
r.done = make(chan bool)
r.wg.Add(1)
go r.newAccepter(r.listener)
}
// Close receiver: close network connection, close files, close libraries, ...
func (r *AppMetricReceiver) Close() {
cclog.ComponentDebug(r.name, "CLOSE")
if _, err := os.Stat(r.config.SocketFile); err == nil {
if err := os.RemoveAll(r.config.SocketFile); err != nil {
cclog.ComponentError(r.name, "Failed to remove UNIX socket", r.config.SocketFile)
}
}
// in case of own go routine, send the signal and wait
r.listener.Close()
r.done <- true
close(r.done)
r.connWg.Wait()
r.wg.Wait()
}
// New function to create a new instance of the receiver
// Initialize the receiver by giving it a name and reading in the config JSON
func NewAppMetricReceiver(name string, config json.RawMessage) (Receiver, error) {
r := new(AppMetricReceiver)
// Set name of SampleReceiver
// The name should be chosen in such a way that different instances of SampleReceiver can be distinguished
r.name = fmt.Sprintf("AppMetricReceiver(%s)", name)
// Set static information
r.meta = map[string]string{"source": r.name}
// Set defaults in r.config
// Allow overwriting these defaults by reading config JSON
r.config.SocketFile = "/tmp/cc.sock"
// Read the sample receiver specific JSON config
if len(config) > 0 {
err := json.Unmarshal(config, &r.config)
if err != nil {
cclog.ComponentError(r.name, "Error reading config:", err.Error())
return nil, err
}
}
if len(r.config.SocketFile) == 0 {
cclog.ComponentError(r.name, "Invalid socket_file setting:", r.config.SocketFile)
return nil, fmt.Errorf("invalid socket_file setting: %s", r.config.SocketFile)
}
// Check that all required fields in the configuration are set
// Use 'if len(r.config.Option) > 0' for strings
r.handler = influx.NewMetricHandler()
r.parser = influx.NewParser(r.handler)
r.parser.SetTimeFunc(DefaultTime)
return r, nil
}

View File

@@ -0,0 +1,23 @@
## `appmetrics` receiver
The `appmetrics` receiver can be used to submit metrics from an application into the monitoring system. It listens for incoming connections on a UNIX socket.
### Configuration structure
```json
{
"<name>": {
"type": "appmetrics",
"socket_file" : "/tmp/cc.sock",
}
}
```
- `type`: makes the receiver a `appmetrics` receiver
- `socket_file`: Listen UNIX socket
### Inputs from applications
Applcations can connect to the `appmetrics` socket and provide metric in the [InfluxDB line protocol](https://github.com/influxdata/line-protocol). It is currently not possible to submit meta information as the Influx line protocol does not know them.

View File

@@ -10,8 +10,8 @@ import (
"strings"
"sync"
cclog "github.com/ClusterCockpit/cc-metric-collector/internal/ccLogger"
lp "github.com/ClusterCockpit/cc-metric-collector/internal/ccMetric"
cclog "github.com/ClusterCockpit/cc-metric-collector/pkg/ccLogger"
lp "github.com/ClusterCockpit/cc-metric-collector/pkg/ccMetric"
"github.com/gorilla/mux"
influx "github.com/influxdata/line-protocol"
)

525
receivers/ipmiReceiver.go Normal file
View File

@@ -0,0 +1,525 @@
package receivers
import (
"bufio"
"bytes"
"encoding/json"
"fmt"
"io"
"os/exec"
"regexp"
"strconv"
"strings"
"sync"
"time"
cclog "github.com/ClusterCockpit/cc-metric-collector/pkg/ccLogger"
lp "github.com/ClusterCockpit/cc-metric-collector/pkg/ccMetric"
)
type IPMIReceiverClientConfig struct {
// Hostname the IPMI service belongs to
Protocol string // Protocol / tool to use for IPMI sensor reading
DriverType string // Out of band IPMI driver
Fanout int // Maximum number of simultaneous IPMI connections
NumHosts int // Number of remote IPMI devices with the same configuration
IPMIHosts string // List of remote IPMI devices to communicate with
IPMI2HostMapping map[string]string // Mapping between IPMI device name and host name
Username string // User name to authenticate with
Password string // Password to use for authentication
CLIOptions []string // Additional command line options for ipmi-sensors
isExcluded map[string]bool // is metric excluded
}
type IPMIReceiver struct {
receiver
config struct {
Interval time.Duration
// Client config for each IPMI hosts
ClientConfigs []IPMIReceiverClientConfig
}
// Storage for static information
meta map[string]string
done chan bool // channel to finish / stop IPMI receiver
wg sync.WaitGroup // wait group for IPMI receiver
}
// doReadMetrics reads metrics from all configure IPMI hosts.
func (r *IPMIReceiver) doReadMetric() {
for i := range r.config.ClientConfigs {
clientConfig := &r.config.ClientConfigs[i]
var cmd_options []string
if clientConfig.Protocol == "ipmi-sensors" {
cmd_options = append(cmd_options,
"--always-prefix",
"--sdr-cache-recreate",
// Attempt to interpret OEM data, such as event data, sensor readings, or general extra info
"--interpret-oem-data",
// Ignore not-available (i.e. N/A) sensors in output
"--ignore-not-available-sensors",
// Ignore unrecognized sensor events
"--ignore-unrecognized-events",
// Output fields in comma separated format
"--comma-separated-output",
// Do not output column headers
"--no-header-output",
// Output non-abbreviated units (e.g. 'Amps' instead of 'A').
// May aid in disambiguation of units (e.g. 'C' for Celsius or Coulombs).
"--non-abbreviated-units",
"--fanout", fmt.Sprint(clientConfig.Fanout),
"--driver-type", clientConfig.DriverType,
"--hostname", clientConfig.IPMIHosts,
"--username", clientConfig.Username,
"--password", clientConfig.Password,
)
cmd_options := append(cmd_options, clientConfig.CLIOptions...)
command := exec.Command("ipmi-sensors", cmd_options...)
stdout, _ := command.StdoutPipe()
errBuf := new(bytes.Buffer)
command.Stderr = errBuf
// start command
if err := command.Start(); err != nil {
cclog.ComponentError(
r.name,
fmt.Sprintf("doReadMetric(): Failed to start command \"%s\": %v", command.String(), err),
)
continue
}
// Read command output
const (
idxID = iota
idxName
idxType
idxReading
idxUnits
idxEvent
)
numPrefixRegex := regexp.MustCompile("^[[:digit:]][[:digit:]]-(.*)$")
scanner := bufio.NewScanner(stdout)
for scanner.Scan() {
// Read host
v1 := strings.Split(scanner.Text(), ": ")
if len(v1) != 2 {
continue
}
host, ok := clientConfig.IPMI2HostMapping[v1[0]]
if !ok {
continue
}
// Read sensors
v2 := strings.Split(v1[1], ",")
if len(v2) != 6 {
continue
}
// Skip sensors with non available sensor readings
if v2[idxReading] == "N/A" {
continue
}
metric := strings.ToLower(v2[idxType])
name := strings.ToLower(
strings.Replace(
strings.TrimSpace(
v2[idxName]), " ", "_", -1))
// remove prefix enumeration like 01-...
if v := numPrefixRegex.FindStringSubmatch(name); v != nil {
name = v[1]
}
unit := v2[idxUnits]
if unit == "Watts" {
// Power
metric = "power"
name = strings.TrimSuffix(name, "_power")
name = strings.TrimSuffix(name, "_pwr")
name = strings.TrimPrefix(name, "pwr_")
} else if metric == "voltage" &&
unit == "Volts" {
// Voltage
name = strings.TrimPrefix(name, "volt_")
} else if metric == "current" &&
unit == "Amps" {
// Current
unit = "Ampere"
} else if metric == "temperature" &&
unit == "degrees C" {
// Temperature
name = strings.TrimSuffix(name, "_temp")
unit = "degC"
} else if metric == "temperature" &&
unit == "degrees F" {
// Temperature
name = strings.TrimSuffix(name, "_temp")
unit = "degF"
} else if metric == "fan" && unit == "RPM" {
// Fan speed
metric = "fan_speed"
name = strings.TrimSuffix(name, "_tach")
name = strings.TrimPrefix(name, "spd_")
} else if (metric == "cooling device" ||
metric == "other units based sensor") &&
name == "system_air_flow" &&
unit == "CFM" {
// Air flow
metric = "air_flow"
name = strings.TrimSuffix(name, "_air_flow")
unit = "CubicFeetPerMinute"
} else if (metric == "processor" ||
metric == "other units based sensor") &&
(name == "cpu_utilization" ||
name == "io_utilization" ||
name == "mem_utilization" ||
name == "sys_utilization") &&
(unit == "unspecified" ||
unit == "%") {
// Utilization
metric = "utilization"
name = strings.TrimSuffix(name, "_utilization")
unit = "percent"
} else {
if false {
// Debug output for unprocessed metrics
fmt.Printf(
"host: '%s', metric: '%s', name: '%s', unit: '%s'\n",
host, metric, name, unit)
}
continue
}
// Skip excluded metrics
if clientConfig.isExcluded[metric] {
continue
}
// Parse sensor value
value, err := strconv.ParseFloat(v2[idxReading], 64)
if err != nil {
continue
}
y, err := lp.New(
metric,
map[string]string{
"hostname": host,
"type": "node",
"name": name,
},
map[string]string{
"source": r.name,
"group": "IPMI",
"unit": unit,
},
map[string]interface{}{
"value": value,
},
time.Now())
if err == nil {
r.sink <- y
}
}
// Wait for command end
if err := command.Wait(); err != nil {
errMsg, _ := io.ReadAll(errBuf)
cclog.ComponentError(
r.name,
fmt.Sprintf("doReadMetric(): Failed to wait for the end of command \"%s\": %v\n",
strings.Replace(command.String(), clientConfig.Password, "<PW>", -1), err),
fmt.Sprintf("doReadMetric(): command stderr: \"%s\"\n", string(errMsg)),
)
}
}
}
}
func (r *IPMIReceiver) Start() {
cclog.ComponentDebug(r.name, "START")
// Start IPMI receiver
r.wg.Add(1)
go func() {
defer r.wg.Done()
// Create ticker
ticker := time.NewTicker(r.config.Interval)
defer ticker.Stop()
for {
r.doReadMetric()
select {
case tickerTime := <-ticker.C:
// Check if we missed the ticker event
if since := time.Since(tickerTime); since > 5*time.Second {
cclog.ComponentInfo(r.name, "Missed ticker event for more then", since)
}
// process ticker event -> continue
continue
case <-r.done:
// process done event
return
}
}
}()
cclog.ComponentDebug(r.name, "STARTED")
}
// Close receiver: close network connection, close files, close libraries, ...
func (r *IPMIReceiver) Close() {
cclog.ComponentDebug(r.name, "CLOSE")
// Send the signal and wait
close(r.done)
r.wg.Wait()
cclog.ComponentDebug(r.name, "DONE")
}
// NewIPMIReceiver creates a new instance of the redfish receiver
// Initialize the receiver by giving it a name and reading in the config JSON
func NewIPMIReceiver(name string, config json.RawMessage) (Receiver, error) {
r := new(IPMIReceiver)
// Config options from config file
configJSON := struct {
Type string `json:"type"`
// How often the IPMI sensor metrics should be read and send to the sink (default: 30 s)
IntervalString string `json:"interval,omitempty"`
// Maximum number of simultaneous IPMI connections (default: 64)
Fanout int `json:"fanout,omitempty"`
// Out of band IPMI driver (default: LAN_2_0)
DriverType string `json:"driver_type,omitempty"`
// Default client username, password and endpoint
Username *string `json:"username"` // User name to authenticate with
Password *string `json:"password"` // Password to use for authentication
Endpoint *string `json:"endpoint"` // URL of the IPMI device
// Globally excluded metrics
ExcludeMetrics []string `json:"exclude_metrics,omitempty"`
ClientConfigs []struct {
Fanout int `json:"fanout,omitempty"` // Maximum number of simultaneous IPMI connections (default: 64)
DriverType string `json:"driver_type,omitempty"` // Out of band IPMI driver (default: LAN_2_0)
HostList []string `json:"host_list"` // List of hosts with the same client configuration
Username *string `json:"username"` // User name to authenticate with
Password *string `json:"password"` // Password to use for authentication
Endpoint *string `json:"endpoint"` // URL of the IPMI service
// Per client excluded metrics
ExcludeMetrics []string `json:"exclude_metrics,omitempty"`
// Additional command line options for ipmi-sensors
CLIOptions []string `json:"cli_options,omitempty"`
} `json:"client_config"`
}{
// Set defaults values
// Allow overwriting these defaults by reading config JSON
Fanout: 64,
DriverType: "LAN_2_0",
IntervalString: "30s",
}
// Set name of IPMIReceiver
r.name = fmt.Sprintf("IPMIReceiver(%s)", name)
// Create done channel
r.done = make(chan bool)
// Set static information
r.meta = map[string]string{"source": r.name}
// Read the IPMI receiver specific JSON config
if len(config) > 0 {
err := json.Unmarshal(config, &configJSON)
if err != nil {
cclog.ComponentError(r.name, "Error reading config:", err.Error())
return nil, err
}
}
// Convert interval string representation to duration
var err error
r.config.Interval, err = time.ParseDuration(configJSON.IntervalString)
if err != nil {
err := fmt.Errorf(
"Failed to parse duration string interval='%s': %w",
configJSON.IntervalString,
err,
)
cclog.Error(r.name, err)
return nil, err
}
// Create client config from JSON config
totalNumHosts := 0
for i := range configJSON.ClientConfigs {
clientConfigJSON := &configJSON.ClientConfigs[i]
var endpoint string
if clientConfigJSON.Endpoint != nil {
endpoint = *clientConfigJSON.Endpoint
} else if configJSON.Endpoint != nil {
endpoint = *configJSON.Endpoint
} else {
err := fmt.Errorf("client config number %v requires endpoint", i)
cclog.ComponentError(r.name, err)
return nil, err
}
fanout := configJSON.Fanout
if clientConfigJSON.Fanout != 0 {
fanout = clientConfigJSON.Fanout
}
driverType := configJSON.DriverType
if clientConfigJSON.DriverType != "" {
driverType = clientConfigJSON.DriverType
}
if driverType != "LAN" && driverType != "LAN_2_0" {
err := fmt.Errorf("client config number %v has invalid driver type %s", i, driverType)
cclog.ComponentError(r.name, err)
return nil, err
}
var protocol string
var host_pattern string
if e := strings.Split(endpoint, "://"); len(e) == 2 {
protocol = e[0]
host_pattern = e[1]
} else {
err := fmt.Errorf("client config number %v has invalid endpoint %s", i, endpoint)
cclog.ComponentError(r.name, err)
return nil, err
}
var username string
if clientConfigJSON.Username != nil {
username = *clientConfigJSON.Username
} else if configJSON.Username != nil {
username = *configJSON.Username
} else {
err := fmt.Errorf("client config number %v requires username", i)
cclog.ComponentError(r.name, err)
return nil, err
}
var password string
if clientConfigJSON.Password != nil {
password = *clientConfigJSON.Password
} else if configJSON.Password != nil {
password = *configJSON.Password
} else {
err := fmt.Errorf("client config number %v requires password", i)
cclog.ComponentError(r.name, err)
return nil, err
}
// Create mapping between ipmi hostname and node hostname
// This also guaranties that all ipmi hostnames are uniqu
ipmi2HostMapping := make(map[string]string)
for _, host := range clientConfigJSON.HostList {
ipmiHost := strings.Replace(host_pattern, "%h", host, -1)
ipmi2HostMapping[ipmiHost] = host
}
numHosts := len(ipmi2HostMapping)
totalNumHosts += numHosts
ipmiHostList := make([]string, 0, numHosts)
for ipmiHost := range ipmi2HostMapping {
ipmiHostList = append(ipmiHostList, ipmiHost)
}
// Additional command line options
for _, v := range clientConfigJSON.CLIOptions {
switch {
case v == "-u" || strings.HasPrefix(v, "--username"):
err := fmt.Errorf("client config number %v: do not set username in cli_options. Use json config username instead", i)
cclog.ComponentError(r.name, err)
return nil, err
case v == "-p" || strings.HasPrefix(v, "--password"):
err := fmt.Errorf("client config number %v: do not set password in cli_options. Use json config password instead", i)
cclog.ComponentError(r.name, err)
return nil, err
case v == "-h" || strings.HasPrefix(v, "--hostname"):
err := fmt.Errorf("client config number %v: do not set hostname in cli_options. Use json config host_list instead", i)
cclog.ComponentError(r.name, err)
return nil, err
case v == "-D" || strings.HasPrefix(v, "--driver-type"):
err := fmt.Errorf("client config number %v: do not set driver type in cli_options. Use json config driver_type instead", i)
cclog.ComponentError(r.name, err)
return nil, err
case v == "-F" || strings.HasPrefix(v, " --fanout"):
err := fmt.Errorf("client config number %v: do not set fanout in cli_options. Use json config fanout instead", i)
cclog.ComponentError(r.name, err)
return nil, err
case v == "--always-prefix" ||
v == "--sdr-cache-recreate" ||
v == "--interpret-oem-data" ||
v == "--ignore-not-available-sensors" ||
v == "--ignore-unrecognized-events" ||
v == "--comma-separated-output" ||
v == "--no-header-output" ||
v == "--non-abbreviated-units":
err := fmt.Errorf("client config number %v: Do not use option %s in cli_options, it is used internally", i, v)
cclog.ComponentError(r.name, err)
return nil, err
}
}
cliOptions := make([]string, 0)
cliOptions = append(cliOptions, clientConfigJSON.CLIOptions...)
// Is metrics excluded globally or per client
isExcluded := make(map[string]bool)
for _, key := range clientConfigJSON.ExcludeMetrics {
isExcluded[key] = true
}
for _, key := range configJSON.ExcludeMetrics {
isExcluded[key] = true
}
r.config.ClientConfigs = append(
r.config.ClientConfigs,
IPMIReceiverClientConfig{
Protocol: protocol,
Fanout: fanout,
DriverType: driverType,
NumHosts: numHosts,
IPMIHosts: strings.Join(ipmiHostList, ","),
IPMI2HostMapping: ipmi2HostMapping,
Username: username,
Password: password,
CLIOptions: cliOptions,
isExcluded: isExcluded,
})
}
if totalNumHosts == 0 {
err := fmt.Errorf("at least one IPMI host config is required")
cclog.ComponentError(r.name, err)
return nil, err
}
cclog.ComponentInfo(r.name, "monitoring", totalNumHosts, "IPMI hosts")
return r, nil
}

48
receivers/ipmiReceiver.md Normal file
View File

@@ -0,0 +1,48 @@
## IPMI Receiver
The IPMI Receiver uses `ipmi-sensors` from the [FreeIPMI](https://www.gnu.org/software/freeipmi/) project to read IPMI sensor readings and sensor data repository (SDR) information. The available metrics depend on the sensors provided by the hardware vendor but typically contain temperature, fan speed, voltage and power metrics.
### Configuration structure
```json
{
"<IPMI receiver name>": {
"type": "ipmi",
"interval": "30s",
"fanout": 256,
"username": "<Username>",
"password": "<Password>",
"endpoint": "ipmi-sensors://%h-p",
"exclude_metrics": [ "fan_speed", "voltage" ],
"client_config": [
{
"host_list": ["n1", "n2", "n3", "n4" ]
},
{
"host_list": [ "n5", "n6" ],
"driver_type": "LAN",
"cli_options": [ "--workaround-flags=..." ],
"password": "<Password 2>"
}
]
}
}
```
Global settings:
- `interval`: How often the IPMI sensor metrics should be read and send to the sink (default: 30 s)
Global and per IPMI device settings (per IPMI device settings overwrite the global settings):
- `exclude_metrics`: list of excluded metrics e.g. fan_speed, power, temperature, utilization, voltage
- `fanout`: Maximum number of simultaneous IPMI connections (default: 64)
- `driver_type`: Out of band IPMI driver (default: LAN_2_0)
- `username`: User name to authenticate with
- `password`: Password to use for authentication
- `endpoint`: URL of the IPMI device (placeholder `%h` gets replaced by the hostname)
Per IPMI device settings:
- `host_list`: List of hosts with the same client configuration
- `cli_options`: Additional command line options for ipmi-sensors

View File

@@ -1,7 +1,7 @@
package receivers
import (
lp "github.com/ClusterCockpit/cc-metric-collector/internal/ccMetric"
lp "github.com/ClusterCockpit/cc-metric-collector/pkg/ccMetric"
)
type defaultReceiverConfig struct {

View File

@@ -6,8 +6,8 @@ import (
"fmt"
"time"
cclog "github.com/ClusterCockpit/cc-metric-collector/internal/ccLogger"
lp "github.com/ClusterCockpit/cc-metric-collector/internal/ccMetric"
cclog "github.com/ClusterCockpit/cc-metric-collector/pkg/ccLogger"
lp "github.com/ClusterCockpit/cc-metric-collector/pkg/ccMetric"
influx "github.com/influxdata/line-protocol"
nats "github.com/nats-io/nats.go"
)

View File

@@ -12,8 +12,8 @@ import (
"sync"
"time"
cclog "github.com/ClusterCockpit/cc-metric-collector/internal/ccLogger"
lp "github.com/ClusterCockpit/cc-metric-collector/internal/ccMetric"
cclog "github.com/ClusterCockpit/cc-metric-collector/pkg/ccLogger"
lp "github.com/ClusterCockpit/cc-metric-collector/pkg/ccMetric"
)
type PrometheusReceiverConfig struct {

View File

@@ -2,16 +2,19 @@ package receivers
import (
"encoding/json"
"fmt"
"os"
"sync"
cclog "github.com/ClusterCockpit/cc-metric-collector/internal/ccLogger"
lp "github.com/ClusterCockpit/cc-metric-collector/internal/ccMetric"
cclog "github.com/ClusterCockpit/cc-metric-collector/pkg/ccLogger"
lp "github.com/ClusterCockpit/cc-metric-collector/pkg/ccMetric"
)
var AvailableReceivers = map[string]func(name string, config json.RawMessage) (Receiver, error){
"ipmi": NewIPMIReceiver,
"nats": NewNatsReceiver,
"redfish": NewRedfishReceiver,
"appmetrics": NewAppMetricReceiver,
}
type receiveManager struct {
@@ -71,9 +74,13 @@ func (rm *receiveManager) AddInput(name string, rawConfig json.RawMessage) error
cclog.ComponentError("ReceiveManager", "SKIP", config.Type, "JSON config error:", err.Error())
return err
}
if config.Type == "" {
cclog.ComponentError("ReceiveManager", "SKIP", "JSON config for receiver", name, "does not contain a receiver type")
return fmt.Errorf("JSON config for receiver %s does not contain a receiver type", name)
}
if _, found := AvailableReceivers[config.Type]; !found {
cclog.ComponentError("ReceiveManager", "SKIP", config.Type, "unknown receiver:", err.Error())
return err
cclog.ComponentError("ReceiveManager", "SKIP", "unknown receiver type:", config.Type)
return fmt.Errorf("unknown receiver type: %s", config.Type)
}
r, err := AvailableReceivers[config.Type](name, rawConfig)
if err != nil {

View File

@@ -10,8 +10,8 @@ import (
"sync"
"time"
cclog "github.com/ClusterCockpit/cc-metric-collector/internal/ccLogger"
lp "github.com/ClusterCockpit/cc-metric-collector/internal/ccMetric"
cclog "github.com/ClusterCockpit/cc-metric-collector/pkg/ccLogger"
lp "github.com/ClusterCockpit/cc-metric-collector/pkg/ccMetric"
// See: https://pkg.go.dev/github.com/stmcginnis/gofish
"github.com/stmcginnis/gofish"

View File

@@ -4,7 +4,7 @@ import (
"encoding/json"
"fmt"
cclog "github.com/ClusterCockpit/cc-metric-collector/internal/ccLogger"
cclog "github.com/ClusterCockpit/cc-metric-collector/pkg/ccLogger"
)
// SampleReceiver configuration: receiver type, listen address, port

View File

@@ -20,7 +20,9 @@ The configuration file for the sinks is a list of configurations. The `type` fie
[
"mystdout" : {
"type" : "stdout",
"meta_as_tags" : false
"meta_as_tags" : [
"unit"
]
},
"metricstore" : {
"type" : "http",

View File

@@ -4,7 +4,7 @@ import (
"fmt"
"strings"
lp "github.com/ClusterCockpit/cc-metric-collector/internal/ccMetric"
lp "github.com/ClusterCockpit/cc-metric-collector/pkg/ccMetric"
)
func GangliaMetricName(point lp.CCMetric) string {

View File

@@ -9,8 +9,8 @@ import (
// "time"
"os/exec"
cclog "github.com/ClusterCockpit/cc-metric-collector/internal/ccLogger"
lp "github.com/ClusterCockpit/cc-metric-collector/internal/ccMetric"
cclog "github.com/ClusterCockpit/cc-metric-collector/pkg/ccLogger"
lp "github.com/ClusterCockpit/cc-metric-collector/pkg/ccMetric"
)
const GMETRIC_EXEC = `gmetric`

View File

@@ -9,8 +9,8 @@ import (
"sync"
"time"
cclog "github.com/ClusterCockpit/cc-metric-collector/internal/ccLogger"
lp "github.com/ClusterCockpit/cc-metric-collector/internal/ccMetric"
cclog "github.com/ClusterCockpit/cc-metric-collector/pkg/ccLogger"
lp "github.com/ClusterCockpit/cc-metric-collector/pkg/ccMetric"
influx "github.com/influxdata/line-protocol"
)

View File

@@ -9,8 +9,8 @@ import (
"strings"
"time"
cclog "github.com/ClusterCockpit/cc-metric-collector/internal/ccLogger"
lp "github.com/ClusterCockpit/cc-metric-collector/internal/ccMetric"
cclog "github.com/ClusterCockpit/cc-metric-collector/pkg/ccLogger"
lp "github.com/ClusterCockpit/cc-metric-collector/pkg/ccMetric"
influxdb2 "github.com/influxdata/influxdb-client-go/v2"
influxdb2Api "github.com/influxdata/influxdb-client-go/v2/api"
influxdb2ApiHttp "github.com/influxdata/influxdb-client-go/v2/api/http"

View File

@@ -9,8 +9,8 @@ import (
"sync"
"time"
cclog "github.com/ClusterCockpit/cc-metric-collector/internal/ccLogger"
lp "github.com/ClusterCockpit/cc-metric-collector/internal/ccMetric"
cclog "github.com/ClusterCockpit/cc-metric-collector/pkg/ccLogger"
lp "github.com/ClusterCockpit/cc-metric-collector/pkg/ccMetric"
influxdb2 "github.com/influxdata/influxdb-client-go/v2"
influxdb2Api "github.com/influxdata/influxdb-client-go/v2/api"
"github.com/influxdata/influxdb-client-go/v2/api/write"

View File

@@ -71,8 +71,8 @@ import (
"fmt"
"unsafe"
cclog "github.com/ClusterCockpit/cc-metric-collector/internal/ccLogger"
lp "github.com/ClusterCockpit/cc-metric-collector/internal/ccMetric"
cclog "github.com/ClusterCockpit/cc-metric-collector/pkg/ccLogger"
lp "github.com/ClusterCockpit/cc-metric-collector/pkg/ccMetric"
"github.com/NVIDIA/go-nvml/pkg/dl"
)

View File

@@ -1,7 +1,7 @@
package sinks
import (
lp "github.com/ClusterCockpit/cc-metric-collector/internal/ccMetric"
lp "github.com/ClusterCockpit/cc-metric-collector/pkg/ccMetric"
)
type defaultSinkConfig struct {

View File

@@ -8,8 +8,8 @@ import (
"sync"
"time"
cclog "github.com/ClusterCockpit/cc-metric-collector/internal/ccLogger"
lp "github.com/ClusterCockpit/cc-metric-collector/internal/ccMetric"
cclog "github.com/ClusterCockpit/cc-metric-collector/pkg/ccLogger"
lp "github.com/ClusterCockpit/cc-metric-collector/pkg/ccMetric"
influx "github.com/influxdata/line-protocol"
nats "github.com/nats-io/nats.go"
)

View File

@@ -9,8 +9,8 @@ import (
"strings"
"sync"
cclog "github.com/ClusterCockpit/cc-metric-collector/internal/ccLogger"
lp "github.com/ClusterCockpit/cc-metric-collector/internal/ccMetric"
cclog "github.com/ClusterCockpit/cc-metric-collector/pkg/ccLogger"
lp "github.com/ClusterCockpit/cc-metric-collector/pkg/ccMetric"
"github.com/gorilla/mux"
"github.com/prometheus/client_golang/prometheus"
"github.com/prometheus/client_golang/prometheus/promhttp"

View File

@@ -5,8 +5,8 @@ import (
"fmt"
"log"
cclog "github.com/ClusterCockpit/cc-metric-collector/internal/ccLogger"
lp "github.com/ClusterCockpit/cc-metric-collector/internal/ccMetric"
cclog "github.com/ClusterCockpit/cc-metric-collector/pkg/ccLogger"
lp "github.com/ClusterCockpit/cc-metric-collector/pkg/ccMetric"
)
type SampleSinkConfig struct {

View File

@@ -6,8 +6,8 @@ import (
"os"
"sync"
cclog "github.com/ClusterCockpit/cc-metric-collector/internal/ccLogger"
lp "github.com/ClusterCockpit/cc-metric-collector/internal/ccMetric"
cclog "github.com/ClusterCockpit/cc-metric-collector/pkg/ccLogger"
lp "github.com/ClusterCockpit/cc-metric-collector/pkg/ccMetric"
)
const SINK_MAX_FORWARD = 50

View File

@@ -7,7 +7,7 @@ import (
"strings"
// "time"
lp "github.com/ClusterCockpit/cc-metric-collector/internal/ccMetric"
lp "github.com/ClusterCockpit/cc-metric-collector/pkg/ccMetric"
)
type StdoutSink struct {