Compare commits

..

20 Commits

Author SHA1 Message Date
Thomas Roehl
b77d9a6cf0 Add include configuration to diskstat collector 2025-12-19 17:32:54 +01:00
Thomas Roehl
6b10797556 Update README and build docu 2025-12-19 17:32:01 +01:00
Thomas Roehl
f964a9d065 likwid collector: explicitly cast input to HPMaddthread helper function to int. See #164 2025-08-06 18:56:47 +02:00
Thomas Roehl
8c5a3cc07b Fix build errors in likwid collector due to likwid API changes. Fixes #164 2025-08-06 18:53:47 +02:00
Thomas Roehl
4804576ad6 remove fixed LDPATH from likwid collector 2025-08-06 15:05:37 +02:00
Thomas Roehl
623391d271 Revert 6af85fe 2025-08-06 15:01:48 +02:00
Thomas Roehl
6af85fe52f Cast thread ID in likwid collector. Fixes #164 2025-08-06 14:38:44 +02:00
oscarminus
dd62929b42 Add meta operations and total values as value per second (#151)
Co-authored-by: Michael Schwarz <schwarz@uni-paderborn.de>
2025-07-03 14:58:48 +02:00
dependabot[bot]
88588ca80b Bump github.com/ClusterCockpit/cc-lib from 0.2.0 to 0.5.0 (#154)
* Fix Golang RPM URLs in Release Action

* Update cc-lib to 0.2.0

* Add missing 'Section' and 'Priority' to .deb.control

* Read written bytes instead of read bytes

* Add dependabot config

* Bump github.com/ClusterCockpit/cc-lib from 0.2.0 to 0.5.0

Bumps [github.com/ClusterCockpit/cc-lib](https://github.com/ClusterCockpit/cc-lib) from 0.2.0 to 0.5.0.
- [Release notes](https://github.com/ClusterCockpit/cc-lib/releases)
- [Commits](https://github.com/ClusterCockpit/cc-lib/compare/v0.2.0...v0.5.0)

---
updated-dependencies:
- dependency-name: github.com/ClusterCockpit/cc-lib
  dependency-version: 0.5.0
  dependency-type: direct:production
  update-type: version-update:semver-minor
...

Signed-off-by: dependabot[bot] <support@github.com>

---------

Signed-off-by: dependabot[bot] <support@github.com>
Co-authored-by: Thomas Gruber <Thomas.Roehl@googlemail.com>
Co-authored-by: Thomas Roehl <thomas.roehl@fau.de>
Co-authored-by: Michael Panzlaff <michael.panzlaff@fau.de>
Co-authored-by: Michael Schwarz <schwarz@uni-paderborn.de>
Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com>
2025-07-03 14:57:11 +02:00
dependabot[bot]
0cc01c93c1 Bump github.com/NVIDIA/go-nvml from 0.12.4-1 to 0.12.9-0 (#156)
* Fix Golang RPM URLs in Release Action

* Update cc-lib to 0.2.0

* Add missing 'Section' and 'Priority' to .deb.control

* Read written bytes instead of read bytes

* Add dependabot config

* Update dependabot.yml

* Bump github.com/NVIDIA/go-nvml from 0.12.4-1 to 0.12.9-0

Bumps [github.com/NVIDIA/go-nvml](https://github.com/NVIDIA/go-nvml) from 0.12.4-1 to 0.12.9-0.
- [Release notes](https://github.com/NVIDIA/go-nvml/releases)
- [Commits](https://github.com/NVIDIA/go-nvml/compare/v0.12.4-1...v0.12.9-0)

---
updated-dependencies:
- dependency-name: github.com/NVIDIA/go-nvml
  dependency-version: 0.12.9-0
  dependency-type: direct:production
  update-type: version-update:semver-patch
...

Signed-off-by: dependabot[bot] <support@github.com>

---------

Signed-off-by: dependabot[bot] <support@github.com>
Co-authored-by: Thomas Gruber <Thomas.Roehl@googlemail.com>
Co-authored-by: Thomas Roehl <thomas.roehl@fau.de>
Co-authored-by: Michael Panzlaff <michael.panzlaff@fau.de>
Co-authored-by: Michael Schwarz <schwarz@uni-paderborn.de>
Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com>
2025-07-03 14:54:44 +02:00
dependabot[bot]
412ae24b20 Bump golang.org/x/sys from 0.32.0 to 0.33.0 (#155)
* Fix Golang RPM URLs in Release Action

* Update cc-lib to 0.2.0

* Add missing 'Section' and 'Priority' to .deb.control

* Read written bytes instead of read bytes

* Add dependabot config

* Bump golang.org/x/sys from 0.32.0 to 0.33.0

Bumps [golang.org/x/sys](https://github.com/golang/sys) from 0.32.0 to 0.33.0.
- [Commits](https://github.com/golang/sys/compare/v0.32.0...v0.33.0)

---
updated-dependencies:
- dependency-name: golang.org/x/sys
  dependency-version: 0.33.0
  dependency-type: direct:production
  update-type: version-update:semver-minor
...

Signed-off-by: dependabot[bot] <support@github.com>

---------

Signed-off-by: dependabot[bot] <support@github.com>
Co-authored-by: Thomas Gruber <Thomas.Roehl@googlemail.com>
Co-authored-by: Thomas Roehl <thomas.roehl@fau.de>
Co-authored-by: Michael Panzlaff <michael.panzlaff@fau.de>
Co-authored-by: Michael Schwarz <schwarz@uni-paderborn.de>
Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com>
2025-07-03 14:52:44 +02:00
dependabot[bot]
40fd936f48 Bump github.com/PaesslerAG/gval from 1.2.2 to 1.2.4 (#153)
* Fix Golang RPM URLs in Release Action

* Update cc-lib to 0.2.0

* Add missing 'Section' and 'Priority' to .deb.control

* Read written bytes instead of read bytes

* Add dependabot config

* Bump github.com/PaesslerAG/gval from 1.2.2 to 1.2.4

Bumps [github.com/PaesslerAG/gval](https://github.com/PaesslerAG/gval) from 1.2.2 to 1.2.4.
- [Release notes](https://github.com/PaesslerAG/gval/releases)
- [Commits](https://github.com/PaesslerAG/gval/compare/v1.2.2...v1.2.4)

---
updated-dependencies:
- dependency-name: github.com/PaesslerAG/gval
  dependency-version: 1.2.4
  dependency-type: direct:production
  update-type: version-update:semver-patch
...

Signed-off-by: dependabot[bot] <support@github.com>

---------

Signed-off-by: dependabot[bot] <support@github.com>
Co-authored-by: Thomas Gruber <Thomas.Roehl@googlemail.com>
Co-authored-by: Thomas Roehl <thomas.roehl@fau.de>
Co-authored-by: Michael Panzlaff <michael.panzlaff@fau.de>
Co-authored-by: Michael Schwarz <schwarz@uni-paderborn.de>
Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com>
2025-07-03 14:50:59 +02:00
dependabot[bot]
a9b301b7d4 Bump github.com/fsnotify/fsnotify from 1.7.0 to 1.9.0 (#152)
* Fix Golang RPM URLs in Release Action

* Update cc-lib to 0.2.0

* Add missing 'Section' and 'Priority' to .deb.control

* Read written bytes instead of read bytes

* Add dependabot config

* Bump github.com/fsnotify/fsnotify from 1.7.0 to 1.9.0

Bumps [github.com/fsnotify/fsnotify](https://github.com/fsnotify/fsnotify) from 1.7.0 to 1.9.0.
- [Release notes](https://github.com/fsnotify/fsnotify/releases)
- [Changelog](https://github.com/fsnotify/fsnotify/blob/main/CHANGELOG.md)
- [Commits](https://github.com/fsnotify/fsnotify/compare/v1.7.0...v1.9.0)

---
updated-dependencies:
- dependency-name: github.com/fsnotify/fsnotify
  dependency-version: 1.9.0
  dependency-type: direct:production
  update-type: version-update:semver-minor
...

Signed-off-by: dependabot[bot] <support@github.com>

---------

Signed-off-by: dependabot[bot] <support@github.com>
Co-authored-by: Thomas Gruber <Thomas.Roehl@googlemail.com>
Co-authored-by: Thomas Roehl <thomas.roehl@fau.de>
Co-authored-by: Michael Panzlaff <michael.panzlaff@fau.de>
Co-authored-by: Michael Schwarz <schwarz@uni-paderborn.de>
Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com>
2025-07-03 14:50:01 +02:00
Michael Schwarz
a5a0474573 Read written bytes instead of read bytes 2025-07-02 18:13:55 +02:00
Thomas Roehl
c85c4eeb21 Likwid collector: deal with vanishing and occurring msr devices 2025-07-01 11:45:10 +02:00
Thomas Roehl
e03e21021d Merge branch 'develop' of github.com:ClusterCockpit/cc-metric-collector into develop 2025-07-01 11:35:07 +02:00
Thomas Roehl
0e730c9720 Bump to cc-lib 0.3.0 2025-06-30 14:25:01 +02:00
Thomas Roehl
eb452770a5 Update cc-lib to 0.2.0 2025-06-18 12:22:25 +02:00
Thomas Gruber
f060a1bb17 Fix Golang RPM URLs in Release Action 2025-06-17 12:10:25 +02:00
Thomas Roehl
f8b2ac0d2c Fix URL to new location of cc-units 2025-04-22 12:48:15 +02:00
57 changed files with 1319 additions and 1931 deletions

View File

@@ -5,10 +5,10 @@ name: Release
# Run on tag push
on:
push:
tags:
- '**'
workflow_dispatch:
push:
tags:
- '**'
workflow_dispatch:
jobs:
@@ -36,14 +36,22 @@ jobs:
# fetch-depth must be 0 to use git describe
# See: https://github.com/marketplace/actions/checkout
- name: Checkout
uses: actions/checkout@v6
uses: actions/checkout@v4
with:
submodules: recursive
fetch-depth: 0
# - name: Setup Golang
# uses: actions/setup-go@v5
# with:
# go-version: 'stable'
- name: Setup Golang
run: |
dnf --assumeyes --disableplugin=subscription-manager --enablerepo appstream install go-toolset
dnf --assumeyes --disableplugin=subscription-manager install \
https://repo.almalinux.org/almalinux/8/AppStream/x86_64/os/Packages/go-toolset-1.23.9-1.module_el8.10.0+4000+1ad1b2cc.x86_64.rpm \
https://repo.almalinux.org/almalinux/8/AppStream/x86_64/os/Packages/golang-1.23.9-1.module_el8.10.0+4000+1ad1b2cc.x86_64.rpm \
https://repo.almalinux.org/almalinux/8/AppStream/x86_64/os/Packages/golang-bin-1.23.9-1.module_el8.10.0+4000+1ad1b2cc.x86_64.rpm \
https://repo.almalinux.org/almalinux/8/AppStream/x86_64/os/Packages/golang-src-1.23.9-1.module_el8.10.0+4000+1ad1b2cc.noarch.rpm
- name: RPM build MetricCollector
id: rpmbuild
@@ -70,13 +78,13 @@ jobs:
# See: https://github.com/actions/upload-artifact
- name: Save RPM as artifact
uses: actions/upload-artifact@v6
uses: actions/upload-artifact@v4
with:
name: cc-metric-collector RPM for AlmaLinux 8
path: ${{ steps.rpmrename.outputs.RPM }}
overwrite: true
- name: Save SRPM as artifact
uses: actions/upload-artifact@v6
uses: actions/upload-artifact@v4
with:
name: cc-metric-collector SRPM for AlmaLinux 8
path: ${{ steps.rpmrename.outputs.SRPM }}
@@ -106,14 +114,23 @@ jobs:
# fetch-depth must be 0 to use git describe
# See: https://github.com/marketplace/actions/checkout
- name: Checkout
uses: actions/checkout@v6
uses: actions/checkout@v4
with:
submodules: recursive
fetch-depth: 0
# - name: Setup Golang
# uses: actions/setup-go@v5
# with:
# go-version: 'stable'
- name: Setup Golang
run: |
dnf --assumeyes --disableplugin=subscription-manager --enablerepo appstream install go-toolset
dnf --assumeyes --disableplugin=subscription-manager install \
https://repo.almalinux.org/almalinux/9/AppStream/x86_64/os/Packages/go-toolset-1.23.9-1.el9_6.x86_64.rpm \
https://repo.almalinux.org/almalinux/9/AppStream/x86_64/os/Packages/golang-1.23.9-1.el9_6.x86_64.rpm \
https://repo.almalinux.org/almalinux/9/AppStream/x86_64/os/Packages/golang-bin-1.23.9-1.el9_6.x86_64.rpm \
https://repo.almalinux.org/almalinux/9/AppStream/x86_64/os/Packages/golang-src-1.23.9-1.el9_6.noarch.rpm \
https://repo.almalinux.org/almalinux/9/AppStream/x86_64/os/Packages/golang-race-1.23.9-1.el9_6.x86_64.rpm
- name: RPM build MetricCollector
id: rpmbuild
@@ -140,26 +157,25 @@ jobs:
# See: https://github.com/actions/upload-artifact
- name: Save RPM as artifact
uses: actions/upload-artifact@v6
uses: actions/upload-artifact@v4
with:
name: cc-metric-collector RPM for AlmaLinux 9
path: ${{ steps.rpmrename.outputs.RPM }}
overwrite: true
- name: Save SRPM as artifact
uses: actions/upload-artifact@v6
uses: actions/upload-artifact@v4
with:
name: cc-metric-collector SRPM for AlmaLinux 9
path: ${{ steps.rpmrename.outputs.SRPM }}
overwrite: true
#
# Build on Red Hat Universal Base Image (UBI 8) using go-toolset
# Build on UBI 8 using go-toolset
#
UBI-8-RPM-build:
runs-on: ubuntu-latest
# See: https://catalog.redhat.com/en/search?searchType=Containers&q=Red+Hat+Universal+Base+Image+8
# https://hub.docker.com/r/redhat/ubi8
container: redhat/ubi8
# See: https://catalog.redhat.com/software/containers/ubi8/ubi/5c35984d70cc534b3a3784e?container-tabs=gti
container: registry.access.redhat.com/ubi8/ubi:8.8-1032.1692772289
# The job outputs link to the outputs of the 'rpmbuild' step
outputs:
rpm : ${{steps.rpmbuild.outputs.RPM}}
@@ -174,14 +190,22 @@ jobs:
# fetch-depth must be 0 to use git describe
# See: https://github.com/marketplace/actions/checkout
- name: Checkout
uses: actions/checkout@v6
uses: actions/checkout@v4
with:
submodules: recursive
fetch-depth: 0
# - name: Setup Golang
# uses: actions/setup-go@v5
# with:
# go-version: 'stable'
- name: Setup Golang
run: |
dnf --assumeyes --disableplugin=subscription-manager --enablerepo ubi-8-appstream-rpms install go-toolset
dnf --assumeyes --disableplugin=subscription-manager install \
https://repo.almalinux.org/almalinux/8/AppStream/x86_64/os/Packages/go-toolset-1.23.9-1.module_el8.10.0+4000+1ad1b2cc.x86_64.rpm \
https://repo.almalinux.org/almalinux/8/AppStream/x86_64/os/Packages/golang-1.23.9-1.module_el8.10.0+4000+1ad1b2cc.x86_64.rpm \
https://repo.almalinux.org/almalinux/8/AppStream/x86_64/os/Packages/golang-bin-1.23.9-1.module_el8.10.0+4000+1ad1b2cc.x86_64.rpm \
https://repo.almalinux.org/almalinux/8/AppStream/x86_64/os/Packages/golang-src-1.23.9-1.module_el8.10.0+4000+1ad1b2cc.noarch.rpm
- name: RPM build MetricCollector
id: rpmbuild
@@ -191,25 +215,24 @@ jobs:
# See: https://github.com/actions/upload-artifact
- name: Save RPM as artifact
uses: actions/upload-artifact@v6
uses: actions/upload-artifact@v4
with:
name: cc-metric-collector RPM for UBI 8
path: ${{ steps.rpmbuild.outputs.RPM }}
overwrite: true
- name: Save SRPM as artifact
uses: actions/upload-artifact@v6
uses: actions/upload-artifact@v4
with:
name: cc-metric-collector SRPM for UBI 8
path: ${{ steps.rpmbuild.outputs.SRPM }}
overwrite: true
#
# Build on Red Hat Universal Base Image (UBI 9) using go-toolset
# Build on UBI 9 using go-toolset
#
UBI-9-RPM-build:
runs-on: ubuntu-latest
# See: https://catalog.redhat.com/en/search?searchType=Containers&q=Red+Hat+Universal+Base+Image+9
# https://hub.docker.com/r/redhat/ubi9
# See: https://catalog.redhat.com/software/containers/ubi8/ubi/5c359854d70cc534b3a3784e?container-tabs=gti
container: redhat/ubi9
# The job outputs link to the outputs of the 'rpmbuild' step
# The job outputs link to the outputs of the 'rpmbuild' step
@@ -220,20 +243,30 @@ jobs:
# Use dnf to install development packages
- name: Install development packages
run: dnf --assumeyes --disableplugin=subscription-manager install rpm-build go-srpm-macros gcc make python39 git wget openssl-devel diffutils delve
run: dnf --assumeyes --disableplugin=subscription-manager install rpm-build go-srpm-macros gcc make python39 git wget openssl-devel diffutils delve
# Checkout git repository and submodules
# fetch-depth must be 0 to use git describe
# See: https://github.com/marketplace/actions/checkout
- name: Checkout
uses: actions/checkout@v6
uses: actions/checkout@v4
with:
submodules: recursive
fetch-depth: 0
# See: https://github.com/marketplace/actions/setup-go-environment
# - name: Setup Golang
# uses: actions/setup-go@v5
# with:
# go-version: 'stable'
- name: Setup Golang
run: |
dnf --assumeyes --disableplugin=subscription-manager --enablerepo ubi-9-appstream-rpms install go-toolset
dnf --assumeyes --disableplugin=subscription-manager install \
https://repo.almalinux.org/almalinux/9/AppStream/x86_64/os/Packages/go-toolset-1.23.9-1.el9_6.x86_64.rpm \
https://repo.almalinux.org/almalinux/9/AppStream/x86_64/os/Packages/golang-1.23.9-1.el9_6.x86_64.rpm \
https://repo.almalinux.org/almalinux/9/AppStream/x86_64/os/Packages/golang-bin-1.23.9-1.el9_6.x86_64.rpm \
https://repo.almalinux.org/almalinux/9/AppStream/x86_64/os/Packages/golang-src-1.23.9-1.el9_6.noarch.rpm \
https://repo.almalinux.org/almalinux/9/AppStream/x86_64/os/Packages/golang-race-1.23.9-1.el9_6.x86_64.rpm
- name: RPM build MetricCollector
id: rpmbuild
@@ -243,13 +276,13 @@ jobs:
# See: https://github.com/actions/upload-artifact
- name: Save RPM as artifact
uses: actions/upload-artifact@v6
uses: actions/upload-artifact@v4
with:
name: cc-metric-collector RPM for UBI 9
path: ${{ steps.rpmbuild.outputs.RPM }}
overwrite: true
- name: Save SRPM as artifact
uses: actions/upload-artifact@v6
uses: actions/upload-artifact@v4
with:
name: cc-metric-collector SRPM for UBI 9
path: ${{ steps.rpmbuild.outputs.SRPM }}
@@ -275,14 +308,13 @@ jobs:
# fetch-depth must be 0 to use git describe
# See: https://github.com/marketplace/actions/checkout
- name: Checkout
uses: actions/checkout@v6
uses: actions/checkout@v4
with:
submodules: recursive
fetch-depth: 0
# Use official golang package
# See: https://github.com/marketplace/actions/setup-go-environment
- name: Setup Golang
uses: actions/setup-go@v6
uses: actions/setup-go@v5
with:
go-version: 'stable'
@@ -300,13 +332,13 @@ jobs:
echo "DEB=${NEW_DEB_FILE}" >> $GITHUB_OUTPUT
# See: https://github.com/actions/upload-artifact
- name: Save DEB as artifact
uses: actions/upload-artifact@v6
uses: actions/upload-artifact@v4
with:
name: cc-metric-collector DEB for Ubuntu 22.04
path: ${{ steps.debrename.outputs.DEB }}
overwrite: true
#
#
# Build on Ubuntu 24.04 using official go package
#
Ubuntu-noblenumbat-build:
@@ -326,14 +358,13 @@ jobs:
# fetch-depth must be 0 to use git describe
# See: https://github.com/marketplace/actions/checkout
- name: Checkout
uses: actions/checkout@v6
uses: actions/checkout@v4
with:
submodules: recursive
fetch-depth: 0
# Use official golang package
# See: https://github.com/marketplace/actions/setup-go-environment
- name: Setup Golang
uses: actions/setup-go@v6
uses: actions/setup-go@v5
with:
go-version: 'stable'
@@ -351,7 +382,7 @@ jobs:
echo "DEB=${NEW_DEB_FILE}" >> $GITHUB_OUTPUT
# See: https://github.com/actions/upload-artifact
- name: Save DEB as artifact
uses: actions/upload-artifact@v6
uses: actions/upload-artifact@v4
with:
name: cc-metric-collector DEB for Ubuntu 24.04
path: ${{ steps.debrename.outputs.DEB }}
@@ -369,48 +400,48 @@ jobs:
steps:
# See: https://github.com/actions/download-artifact
- name: Download AlmaLinux 8 RPM
uses: actions/download-artifact@v7
uses: actions/download-artifact@v4
with:
name: cc-metric-collector RPM for AlmaLinux 8
- name: Download AlmaLinux 8 SRPM
uses: actions/download-artifact@v7
uses: actions/download-artifact@v4
with:
name: cc-metric-collector SRPM for AlmaLinux 8
- name: Download AlmaLinux 9 RPM
uses: actions/download-artifact@v7
uses: actions/download-artifact@v4
with:
name: cc-metric-collector RPM for AlmaLinux 9
- name: Download AlmaLinux 9 SRPM
uses: actions/download-artifact@v7
uses: actions/download-artifact@v4
with:
name: cc-metric-collector SRPM for AlmaLinux 9
- name: Download UBI 8 RPM
uses: actions/download-artifact@v7
uses: actions/download-artifact@v4
with:
name: cc-metric-collector RPM for UBI 8
- name: Download UBI 8 SRPM
uses: actions/download-artifact@v7
uses: actions/download-artifact@v4
with:
name: cc-metric-collector SRPM for UBI 8
- name: Download UBI 9 RPM
uses: actions/download-artifact@v7
uses: actions/download-artifact@v4
with:
name: cc-metric-collector RPM for UBI 9
- name: Download UBI 9 SRPM
uses: actions/download-artifact@v7
uses: actions/download-artifact@v4
with:
name: cc-metric-collector SRPM for UBI 9
- name: Download Ubuntu 22.04 DEB
uses: actions/download-artifact@v7
uses: actions/download-artifact@v4
with:
name: cc-metric-collector DEB for Ubuntu 22.04
- name: Download Ubuntu 24.04 DEB
uses: actions/download-artifact@v7
uses: actions/download-artifact@v4
with:
name: cc-metric-collector DEB for Ubuntu 24.04

View File

@@ -20,41 +20,25 @@ jobs:
# See: https://github.com/marketplace/actions/checkout
# Checkout git repository and submodules
- name: Checkout
uses: actions/checkout@v6
uses: actions/checkout@v4
with:
submodules: recursive
# See: https://github.com/marketplace/actions/setup-go-environment
- name: Setup Golang
uses: actions/setup-go@v6
uses: actions/setup-go@v5
with:
go-version: 'stable'
go-version: '1.21'
check-latest: true
- name: Install reviewdog
run: |
go install github.com/reviewdog/reviewdog/cmd/reviewdog@latest
# See: https://golangci-lint.run
- name: Install GolangCI-Lint
run: |
go install github.com/golangci/golangci-lint/v2/cmd/golangci-lint@latest
- name: Build MetricCollector
run: make
- name: Run MetricCollector once
run: ./cc-metric-collector --once --config .github/ci-config.json
# Running the linter requires likwid.h, which gets downloaded in the build step
- name: Static Analysis with GolangCI-Lint and Upload Report with reviewdog
run: |
golangci-lint run --enable modernize,staticcheck,govet | reviewdog -f=golangci-lint -name "Check golangci-lint on build-latest" -reporter=github-check -filter-mode=nofilter -fail-level none
env:
REVIEWDOG_GITHUB_API_TOKEN: ${{ secrets.GITHUB_TOKEN }}
#
# Build on AlmaLinux 8 using go-toolset
# Build on AlmaLinux 8
#
AlmaLinux8-RPM-build:
runs-on: ubuntu-latest
@@ -74,14 +58,23 @@ jobs:
# fetch-depth must be 0 to use git describe
# See: https://github.com/marketplace/actions/checkout
- name: Checkout
uses: actions/checkout@v6
uses: actions/checkout@v4
with:
submodules: recursive
fetch-depth: 0
# See: https://github.com/marketplace/actions/setup-go-environment
# - name: Setup Golang
# uses: actions/setup-go@v5
# with:
# go-version: 'stable'
- name: Setup Golang
run: |
dnf --assumeyes --disableplugin=subscription-manager --enablerepo appstream install go-toolset
dnf --assumeyes --disableplugin=subscription-manager install \
https://repo.almalinux.org/almalinux/8/AppStream/x86_64/os/Packages/go-toolset-1.23.9-1.module_el8.10.0+4000+1ad1b2cc.x86_64.rpm \
https://repo.almalinux.org/almalinux/8/AppStream/x86_64/os/Packages/golang-1.23.9-1.module_el8.10.0+4000+1ad1b2cc.x86_64.rpm \
https://repo.almalinux.org/almalinux/8/AppStream/x86_64/os/Packages/golang-bin-1.23.9-1.module_el8.10.0+4000+1ad1b2cc.x86_64.rpm \
https://repo.almalinux.org/almalinux/8/AppStream/x86_64/os/Packages/golang-src-1.23.9-1.module_el8.10.0+4000+1ad1b2cc.noarch.rpm
- name: RPM build MetricCollector
id: rpmbuild
@@ -90,7 +83,7 @@ jobs:
make RPM
#
# Build on AlmaLinux 9 using go-toolset
# Build on AlmaLinux 9
#
AlmaLinux9-RPM-build:
runs-on: ubuntu-latest
@@ -110,14 +103,24 @@ jobs:
# fetch-depth must be 0 to use git describe
# See: https://github.com/marketplace/actions/checkout
- name: Checkout
uses: actions/checkout@v6
uses: actions/checkout@v4
with:
submodules: recursive
fetch-depth: 0
# See: https://github.com/marketplace/actions/setup-go-environment
# - name: Setup Golang
# uses: actions/setup-go@v5
# with:
# go-version: 'stable'
- name: Setup Golang
run: |
dnf --assumeyes --disableplugin=subscription-manager --enablerepo appstream install go-toolset
dnf --assumeyes --disableplugin=subscription-manager install \
https://repo.almalinux.org/almalinux/9/AppStream/x86_64/os/Packages/go-toolset-1.23.9-1.el9_6.x86_64.rpm \
https://repo.almalinux.org/almalinux/9/AppStream/x86_64/os/Packages/golang-1.23.9-1.el9_6.x86_64.rpm \
https://repo.almalinux.org/almalinux/9/AppStream/x86_64/os/Packages/golang-bin-1.23.9-1.el9_6.x86_64.rpm \
https://repo.almalinux.org/almalinux/9/AppStream/x86_64/os/Packages/golang-src-1.23.9-1.el9_6.noarch.rpm \
https://repo.almalinux.org/almalinux/9/AppStream/x86_64/os/Packages/golang-race-1.23.9-1.el9_6.x86_64.rpm
- name: RPM build MetricCollector
id: rpmbuild
@@ -125,49 +128,13 @@ jobs:
git config --global --add safe.directory /__w/cc-metric-collector/cc-metric-collector
make RPM
#
# Build on AlmaLinux 10 using go-toolset
#
AlmaLinux10-RPM-build:
runs-on: ubuntu-latest
# See: https://hub.docker.com/_/almalinux
container: almalinux:10
# The job outputs link to the outputs of the 'rpmrename' step
# Only job outputs can be used in child jobs
steps:
# Use dnf to install development packages
- name: Install development packages
run: |
dnf --assumeyes group install "Development Tools" "RPM Development Tools"
dnf --assumeyes install wget openssl-devel diffutils delve which
# Checkout git repository and submodules
# fetch-depth must be 0 to use git describe
# See: https://github.com/marketplace/actions/checkout
- name: Checkout
uses: actions/checkout@v6
with:
submodules: recursive
fetch-depth: 0
- name: Setup Golang
run: |
dnf --assumeyes --disableplugin=subscription-manager --enablerepo appstream install go-toolset
- name: RPM build MetricCollector
id: rpmbuild
run: |
git config --global --add safe.directory /__w/cc-metric-collector/cc-metric-collector
make RPM
#
# Build on Red Hat Universal Base Image (UBI 8) using go-toolset
# Build on UBI 8 using go-toolset
#
UBI-8-RPM-build:
runs-on: ubuntu-latest
# See: https://catalog.redhat.com/en/search?searchType=Containers&q=Red+Hat+Universal+Base+Image+8
# https://hub.docker.com/r/redhat/ubi8
# See: https://catalog.redhat.com/software/containers/ubi8/ubi/5c359854d70cc534b3a3784e?container-tabs=gti
container: redhat/ubi8
# The job outputs link to the outputs of the 'rpmbuild' step
steps:
@@ -180,14 +147,23 @@ jobs:
# fetch-depth must be 0 to use git describe
# See: https://github.com/marketplace/actions/checkout
- name: Checkout
uses: actions/checkout@v6
uses: actions/checkout@v4
with:
submodules: recursive
fetch-depth: 0
# See: https://github.com/marketplace/actions/setup-go-environment
# - name: Setup Golang
# uses: actions/setup-go@v5
# with:
# go-version: 'stable'
- name: Setup Golang
run: |
dnf --assumeyes --disableplugin=subscription-manager --enablerepo ubi-8-appstream-rpms install go-toolset
dnf --assumeyes --disableplugin=subscription-manager install \
https://repo.almalinux.org/almalinux/8/AppStream/x86_64/os/Packages/go-toolset-1.23.9-1.module_el8.10.0+4000+1ad1b2cc.x86_64.rpm \
https://repo.almalinux.org/almalinux/8/AppStream/x86_64/os/Packages/golang-1.23.9-1.module_el8.10.0+4000+1ad1b2cc.x86_64.rpm \
https://repo.almalinux.org/almalinux/8/AppStream/x86_64/os/Packages/golang-bin-1.23.9-1.module_el8.10.0+4000+1ad1b2cc.x86_64.rpm \
https://repo.almalinux.org/almalinux/8/AppStream/x86_64/os/Packages/golang-src-1.23.9-1.module_el8.10.0+4000+1ad1b2cc.noarch.rpm
- name: RPM build MetricCollector
id: rpmbuild
@@ -196,12 +172,11 @@ jobs:
make RPM
#
# Build on Red Hat Universal Base Image (UBI 9) using go-toolset
# Build on UBI 9 using go-toolset
#
UBI-9-RPM-build:
runs-on: ubuntu-latest
# See: https://catalog.redhat.com/en/search?searchType=Containers&q=Red+Hat+Universal+Base+Image+9
# https://hub.docker.com/r/redhat/ubi9
# See: https://catalog.redhat.com/software/containers/ubi8/ubi/5c359854d70cc534b3a3784e?container-tabs=gti
container: redhat/ubi9
# The job outputs link to the outputs of the 'rpmbuild' step
steps:
@@ -214,48 +189,24 @@ jobs:
# fetch-depth must be 0 to use git describe
# See: https://github.com/marketplace/actions/checkout
- name: Checkout
uses: actions/checkout@v6
uses: actions/checkout@v4
with:
submodules: recursive
fetch-depth: 0
# See: https://github.com/marketplace/actions/setup-go-environment
# - name: Setup Golang
# uses: actions/setup-go@v5
# with:
# go-version: 'stable'
- name: Setup Golang
run: |
dnf --assumeyes --disableplugin=subscription-manager --enablerepo ubi-9-appstream-rpms install go-toolset
- name: RPM build MetricCollector
id: rpmbuild
run: |
git config --global --add safe.directory /__w/cc-metric-collector/cc-metric-collector
make RPM
#
# Build on Red Hat Universal Base Image (UBI 10) using go-toolset
#
UBI-10-RPM-build:
runs-on: ubuntu-latest
# See: https://catalog.redhat.com/en/search?searchType=Containers&q=Red+Hat+Universal+Base+Image+10
# https://hub.docker.com/r/redhat/ubi10
container: redhat/ubi10
# The job outputs link to the outputs of the 'rpmbuild' step
steps:
# Use dnf to install development packages
- name: Install development packages
run: dnf --assumeyes --disableplugin=subscription-manager install rpm-build go-srpm-macros gcc make python3 git wget openssl-devel diffutils delve
# Checkout git repository and submodules
# fetch-depth must be 0 to use git describe
# See: https://github.com/marketplace/actions/checkout
- name: Checkout
uses: actions/checkout@v6
with:
submodules: recursive
fetch-depth: 0
- name: Setup Golang
run: |
dnf --assumeyes --disableplugin=subscription-manager --enablerepo ubi-10-for-x86_64-appstream-rpms install go-toolset
dnf --assumeyes --disableplugin=subscription-manager install \
https://repo.almalinux.org/almalinux/9/AppStream/x86_64/os/Packages/go-toolset-1.23.9-1.el9_6.x86_64.rpm \
https://repo.almalinux.org/almalinux/9/AppStream/x86_64/os/Packages/golang-1.23.9-1.el9_6.x86_64.rpm \
https://repo.almalinux.org/almalinux/9/AppStream/x86_64/os/Packages/golang-bin-1.23.9-1.el9_6.x86_64.rpm \
https://repo.almalinux.org/almalinux/9/AppStream/x86_64/os/Packages/golang-src-1.23.9-1.el9_6.noarch.rpm \
https://repo.almalinux.org/almalinux/9/AppStream/x86_64/os/Packages/golang-race-1.23.9-1.el9_6.x86_64.rpm
- name: RPM build MetricCollector
id: rpmbuild
@@ -280,14 +231,14 @@ jobs:
# fetch-depth must be 0 to use git describe
# See: https://github.com/marketplace/actions/checkout
- name: Checkout
uses: actions/checkout@v6
uses: actions/checkout@v4
with:
submodules: recursive
fetch-depth: 0
# Use official golang package
# See: https://github.com/marketplace/actions/setup-go-environment
- name: Setup Golang
uses: actions/setup-go@v6
uses: actions/setup-go@v5
with:
go-version: 'stable'
@@ -314,14 +265,14 @@ jobs:
# fetch-depth must be 0 to use git describe
# See: https://github.com/marketplace/actions/checkout
- name: Checkout
uses: actions/checkout@v6
uses: actions/checkout@v4
with:
submodules: recursive
fetch-depth: 0
# Use official golang package
# See: https://github.com/marketplace/actions/setup-go-environment
- name: Setup Golang
uses: actions/setup-go@v6
uses: actions/setup-go@v5
with:
go-version: 'stable'

View File

@@ -72,11 +72,6 @@ staticcheck:
$(GOBIN) install honnef.co/go/tools/cmd/staticcheck@latest
$$($(GOBIN) env GOPATH)/bin/staticcheck ./...
.PHONY: golangci-lint
golangci-lint:
$(GOBIN) install github.com/golangci/golangci-lint/v2/cmd/golangci-lint@latest
$$($(GOBIN) env GOPATH)/bin/golangci-lint run
.ONESHELL:
.PHONY: RPM
RPM: scripts/cc-metric-collector.spec

View File

@@ -54,11 +54,14 @@ See the component READMEs for their configuration:
# Installation
Dependencies:
- golang
- hwloc
```
$ git clone git@github.com:ClusterCockpit/cc-metric-collector.git
$ export CGO_LDFLAGS="-L/path/to/hwloc/lib/dir"
$ make (downloads LIKWID, builds it as static library with 'direct' accessmode and copies all required files for the collector)
$ go get (requires at least golang 1.16)
$ make
```
For more information, see [here](./docs/building.md).

View File

@@ -14,17 +14,17 @@ import (
"os/signal"
"syscall"
"github.com/ClusterCockpit/cc-lib/v2/receivers"
"github.com/ClusterCockpit/cc-lib/v2/sinks"
"github.com/ClusterCockpit/cc-lib/receivers"
"github.com/ClusterCockpit/cc-lib/sinks"
"github.com/ClusterCockpit/cc-metric-collector/collectors"
// "strings"
"sync"
"time"
ccconf "github.com/ClusterCockpit/cc-lib/v2/ccConfig"
cclog "github.com/ClusterCockpit/cc-lib/v2/ccLogger"
lp "github.com/ClusterCockpit/cc-lib/v2/ccMessage"
ccconf "github.com/ClusterCockpit/cc-lib/ccConfig"
cclog "github.com/ClusterCockpit/cc-lib/ccLogger"
lp "github.com/ClusterCockpit/cc-lib/ccMessage"
mr "github.com/ClusterCockpit/cc-metric-collector/internal/metricRouter"
mct "github.com/ClusterCockpit/cc-metric-collector/pkg/multiChanTicker"
)

View File

@@ -1,19 +1,6 @@
{
"cpufreq": {},
"cpufreq_cpuinfo": {},
"cpustat": {
"exclude_metrics": [
"cpu_idle"
]
},
"diskstat": {
"exclude_metrics": [
"disk_total"
],
"exclude_mounts": [
"slurm-tmpfs"
]
},
"gpfs": {
"exclude_filesystem": [
"test_fs"
@@ -34,8 +21,6 @@
},
"numastats": {},
"nvidia": {},
"schedstat": {
},
"tempstat": {
"report_max_temperature": true,
"report_critical_temperature": true,
@@ -53,4 +38,4 @@
"topprocs": {
"num_procs": 5
}
}
}

View File

@@ -52,7 +52,6 @@ In contrast to the configuration files for sinks and receivers, the collectors c
* [`beegfs_meta`](./beegfsmetaMetric.md)
* [`beegfs_storage`](./beegfsstorageMetric.md)
* [`rocm_smi`](./rocmsmiMetric.md)
* [`slurm_cgroup`](./slurmCgroupMetric.md)
## Todos
@@ -67,7 +66,7 @@ A collector reads data from any source, parses it to metrics and submits these m
* `Read(duration time.Duration, output chan ccMessage.CCMessage)`: Read, parse and submit data to the `output` channel as [`CCMessage`](https://github.com/ClusterCockpit/cc-lib/blob/main/ccMessage/README.md). If the collector has to measure anything for some duration, use the provided function argument `duration`.
* `Close()`: Closes down the collector.
It is recommended to call `setup()` in the `Init()` function.
It is recommanded to call `setup()` in the `Init()` function.
Finally, the collector needs to be registered in the `collectorManager.go`. There is a list of collectors called `AvailableCollectors` which is a map (`collector_type_string` -> `pointer to MetricCollector interface`). Add a new entry with a descriptive name and the new collector.
@@ -100,12 +99,11 @@ func (m *SampleCollector) Init(config json.RawMessage) error {
}
m.name = "SampleCollector"
if err := m.setup(); err != nil {
return fmt.Errorf("%s Init(): setup() call failed: %w", m.name, err)
}
m.setup()
if len(config) > 0 {
if err := json.Unmarshal(config, &m.config); err != nil {
return fmt.Errorf("%s Init(): json.Unmarshal() call failed: %w", m.name, err)
err := json.Unmarshal(config, &m.config)
if err != nil {
return err
}
}
m.meta = map[string]string{"source": m.name, "group": "Sample"}

View File

@@ -17,13 +17,12 @@ import (
"os/exec"
"os/user"
"regexp"
"slices"
"strconv"
"strings"
"time"
cclog "github.com/ClusterCockpit/cc-lib/v2/ccLogger"
lp "github.com/ClusterCockpit/cc-lib/v2/ccMessage"
cclog "github.com/ClusterCockpit/cc-lib/ccLogger"
lp "github.com/ClusterCockpit/cc-lib/ccMessage"
)
const DEFAULT_BEEGFS_CMD = "beegfs-ctl"
@@ -62,9 +61,7 @@ func (m *BeegfsMetaCollector) Init(config json.RawMessage) error {
"rmXA", "setXA", "mirror"}
m.name = "BeegfsMetaCollector"
if err := m.setup(); err != nil {
return fmt.Errorf("%s Init(): setup() call failed: %w", m.name, err)
}
m.setup()
m.parallel = true
// Set default beegfs-ctl binary
@@ -81,7 +78,8 @@ func (m *BeegfsMetaCollector) Init(config json.RawMessage) error {
//create map with possible variables
m.matches = make(map[string]string)
for _, value := range nodeMdstat_array {
if slices.Contains(m.config.ExcludeMetrics, value) {
_, skip := stringArrayContains(m.config.ExcludeMetrics, value)
if skip {
m.matches["other"] = "0"
} else {
m.matches["beegfs_cmeta_"+value] = "0"
@@ -225,7 +223,7 @@ func (m *BeegfsMetaCollector) Read(interval time.Duration, output chan lp.CCMess
for key, data := range m.matches {
value, _ := strconv.ParseFloat(data, 32)
y, err := lp.NewMessage(key, m.tags, m.meta, map[string]any{"value": value}, time.Now())
y, err := lp.NewMessage(key, m.tags, m.meta, map[string]interface{}{"value": value}, time.Now())
if err == nil {
output <- y
}

View File

@@ -17,13 +17,12 @@ import (
"os/exec"
"os/user"
"regexp"
"slices"
"strconv"
"strings"
"time"
cclog "github.com/ClusterCockpit/cc-lib/v2/ccLogger"
lp "github.com/ClusterCockpit/cc-lib/v2/ccMessage"
cclog "github.com/ClusterCockpit/cc-lib/ccLogger"
lp "github.com/ClusterCockpit/cc-lib/ccMessage"
)
// Struct for the collector-specific JSON config
@@ -55,9 +54,7 @@ func (m *BeegfsStorageCollector) Init(config json.RawMessage) error {
"storInf", "unlnk"}
m.name = "BeegfsStorageCollector"
if err := m.setup(); err != nil {
return fmt.Errorf("%s Init(): setup() call failed: %w", m.name, err)
}
m.setup()
m.parallel = true
// Set default beegfs-ctl binary
@@ -74,7 +71,8 @@ func (m *BeegfsStorageCollector) Init(config json.RawMessage) error {
//create map with possible variables
m.matches = make(map[string]string)
for _, value := range storageStat_array {
if slices.Contains(m.config.ExcludeMetrics, value) {
_, skip := stringArrayContains(m.config.ExcludeMetrics, value)
if skip {
m.matches["other"] = "0"
} else {
m.matches["beegfs_cstorage_"+value] = "0"
@@ -217,7 +215,7 @@ func (m *BeegfsStorageCollector) Read(interval time.Duration, output chan lp.CCM
for key, data := range m.matches {
value, _ := strconv.ParseFloat(data, 32)
y, err := lp.NewMessage(key, m.tags, m.meta, map[string]any{"value": value}, time.Now())
y, err := lp.NewMessage(key, m.tags, m.meta, map[string]interface{}{"value": value}, time.Now())
if err == nil {
output <- y
}

View File

@@ -9,12 +9,11 @@ package collectors
import (
"encoding/json"
"fmt"
"sync"
"time"
cclog "github.com/ClusterCockpit/cc-lib/v2/ccLogger"
lp "github.com/ClusterCockpit/cc-lib/v2/ccMessage"
cclog "github.com/ClusterCockpit/cc-lib/ccLogger"
lp "github.com/ClusterCockpit/cc-lib/ccMessage"
mct "github.com/ClusterCockpit/cc-metric-collector/pkg/multiChanTicker"
)
@@ -48,7 +47,6 @@ var AvailableCollectors = map[string]MetricCollector{
"self": new(SelfCollector),
"schedstat": new(SchedstatCollector),
"nfsiostat": new(NfsIOStatCollector),
"slurm_cgroup": new(SlurmCgroupCollector),
}
// Metric collector manager data structure
@@ -105,7 +103,7 @@ func (cm *collectorManager) Init(ticker mct.MultiChanTicker, duration time.Durat
err = collector.Init(collectorCfg)
if err != nil {
cclog.ComponentError("CollectorManager", fmt.Sprintf("Collector %s initialization failed: %v", collectorName, err))
cclog.ComponentError("CollectorManager", "Collector", collectorName, "initialization failed:", err.Error())
continue
}
cclog.ComponentDebug("CollectorManager", "ADD COLLECTOR", collector.Name())

View File

@@ -17,8 +17,8 @@ import (
"strings"
"time"
cclog "github.com/ClusterCockpit/cc-lib/v2/ccLogger"
lp "github.com/ClusterCockpit/cc-lib/v2/ccMessage"
cclog "github.com/ClusterCockpit/cc-lib/ccLogger"
lp "github.com/ClusterCockpit/cc-lib/ccMessage"
)
// CPUFreqCollector
@@ -41,10 +41,9 @@ func (m *CPUFreqCpuInfoCollector) Init(config json.RawMessage) error {
return nil
}
m.setup()
m.name = "CPUFreqCpuInfoCollector"
if err := m.setup(); err != nil {
return fmt.Errorf("%s Init(): setup() call failed: %w", m.name, err)
}
m.parallel = true
m.meta = map[string]string{
"source": m.name,
@@ -57,6 +56,7 @@ func (m *CPUFreqCpuInfoCollector) Init(config json.RawMessage) error {
if err != nil {
return fmt.Errorf("failed to open file '%s': %v", cpuInfoFile, err)
}
defer file.Close()
// Collect topology information from file cpuinfo
foundFreq := false
@@ -86,10 +86,6 @@ func (m *CPUFreqCpuInfoCollector) Init(config json.RawMessage) error {
}
}
if err := file.Close(); err != nil {
return fmt.Errorf("%s Init(): Call to file.Close() failed: %w", m.name, err)
}
// were all topology information collected?
if foundFreq &&
len(processor) > 0 &&
@@ -144,13 +140,7 @@ func (m *CPUFreqCpuInfoCollector) Read(interval time.Duration, output chan lp.CC
fmt.Sprintf("Read(): Failed to open file '%s': %v", cpuInfoFile, err))
return
}
defer func() {
if err := file.Close(); err != nil {
cclog.ComponentError(
m.name,
fmt.Sprintf("Read(): Failed to close file '%s': %v", cpuInfoFile, err))
}
}()
defer file.Close()
processorCounter := 0
now := time.Now()
@@ -171,7 +161,7 @@ func (m *CPUFreqCpuInfoCollector) Read(interval time.Duration, output chan lp.CC
fmt.Sprintf("Read(): Failed to convert cpu MHz '%s' to float64: %v", lineSplit[1], err))
return
}
if y, err := lp.NewMessage("cpufreq", t.tagSet, m.meta, map[string]any{"value": value}, now); err == nil {
if y, err := lp.NewMessage("cpufreq", t.tagSet, m.meta, map[string]interface{}{"value": value}, now); err == nil {
output <- y
}
}

View File

@@ -16,8 +16,8 @@ import (
"strings"
"time"
cclog "github.com/ClusterCockpit/cc-lib/v2/ccLogger"
lp "github.com/ClusterCockpit/cc-lib/v2/ccMessage"
cclog "github.com/ClusterCockpit/cc-lib/ccLogger"
lp "github.com/ClusterCockpit/cc-lib/ccMessage"
"github.com/ClusterCockpit/cc-metric-collector/pkg/ccTopology"
"golang.org/x/sys/unix"
)
@@ -48,9 +48,7 @@ func (m *CPUFreqCollector) Init(config json.RawMessage) error {
}
m.name = "CPUFreqCollector"
if err := m.setup(); err != nil {
return fmt.Errorf("%s Init(): setup() call failed: %w", m.name, err)
}
m.setup()
m.parallel = true
if len(config) > 0 {
err := json.Unmarshal(config, &m.config)
@@ -126,7 +124,7 @@ func (m *CPUFreqCollector) Read(interval time.Duration, output chan lp.CCMessage
continue
}
if y, err := lp.NewMessage("cpufreq", t.tagSet, m.meta, map[string]any{"value": cpuFreq}, now); err == nil {
if y, err := lp.NewMessage("cpufreq", t.tagSet, m.meta, map[string]interface{}{"value": cpuFreq}, now); err == nil {
output <- y
}
}

View File

@@ -12,13 +12,12 @@ import (
"encoding/json"
"fmt"
"os"
"slices"
"strconv"
"strings"
"time"
cclog "github.com/ClusterCockpit/cc-lib/v2/ccLogger"
lp "github.com/ClusterCockpit/cc-lib/v2/ccMessage"
cclog "github.com/ClusterCockpit/cc-lib/ccLogger"
lp "github.com/ClusterCockpit/cc-lib/ccMessage"
sysconf "github.com/tklauser/go-sysconf"
)
@@ -40,17 +39,10 @@ type CpustatCollector struct {
func (m *CpustatCollector) Init(config json.RawMessage) error {
m.name = "CpustatCollector"
if err := m.setup(); err != nil {
return fmt.Errorf("%s Init(): setup() call failed: %w", m.name, err)
}
m.setup()
m.parallel = true
m.meta = map[string]string{
"source": m.name,
"group": "CPU",
}
m.nodetags = map[string]string{
"type": "node",
}
m.meta = map[string]string{"source": m.name, "group": "CPU"}
m.nodetags = map[string]string{"type": "node"}
if len(config) > 0 {
err := json.Unmarshal(config, &m.config)
if err != nil {
@@ -72,7 +64,14 @@ func (m *CpustatCollector) Init(config json.RawMessage) error {
m.matches = make(map[string]int)
for match, index := range matches {
if !slices.Contains(m.config.ExcludeMetrics, match) {
doExclude := false
for _, exclude := range m.config.ExcludeMetrics {
if match == exclude {
doExclude = true
break
}
}
if !doExclude {
m.matches[match] = index
}
}
@@ -80,17 +79,9 @@ func (m *CpustatCollector) Init(config json.RawMessage) error {
// Check input file
file, err := os.Open(string(CPUSTATFILE))
if err != nil {
cclog.ComponentError(
m.name,
fmt.Sprintf("Init(): Failed to open file '%s': %v", string(CPUSTATFILE), err))
cclog.ComponentError(m.name, err.Error())
}
defer func() {
if err := file.Close(); err != nil {
cclog.ComponentError(
m.name,
fmt.Sprintf("Init(): Failed to close file '%s': %v", string(CPUSTATFILE), err))
}
}()
defer file.Close()
// Pre-generate tags for all CPUs
num_cpus := 0
@@ -138,7 +129,7 @@ func (m *CpustatCollector) parseStatLine(linefields []string, tags map[string]st
sum := float64(0)
for name, value := range values {
sum += value
y, err := lp.NewMessage(name, tags, m.meta, map[string]any{"value": value * 100}, now)
y, err := lp.NewMessage(name, tags, m.meta, map[string]interface{}{"value": value * 100}, now)
if err == nil {
y.AddTag("unit", "Percent")
output <- y
@@ -146,7 +137,7 @@ func (m *CpustatCollector) parseStatLine(linefields []string, tags map[string]st
}
if v, ok := values["cpu_idle"]; ok {
sum -= v
y, err := lp.NewMessage("cpu_used", tags, m.meta, map[string]any{"value": sum * 100}, now)
y, err := lp.NewMessage("cpu_used", tags, m.meta, map[string]interface{}{"value": sum * 100}, now)
if err == nil {
y.AddTag("unit", "Percent")
output <- y
@@ -164,17 +155,9 @@ func (m *CpustatCollector) Read(interval time.Duration, output chan lp.CCMessage
file, err := os.Open(string(CPUSTATFILE))
if err != nil {
cclog.ComponentError(
m.name,
fmt.Sprintf("Read(): Failed to open file '%s': %v", string(CPUSTATFILE), err))
cclog.ComponentError(m.name, err.Error())
}
defer func() {
if err := file.Close(); err != nil {
cclog.ComponentError(
m.name,
fmt.Sprintf("Read(): Failed to close file '%s': %v", string(CPUSTATFILE), err))
}
}()
defer file.Close()
scanner := bufio.NewScanner(file)
for scanner.Scan() {
@@ -191,7 +174,7 @@ func (m *CpustatCollector) Read(interval time.Duration, output chan lp.CCMessage
num_cpus_metric, err := lp.NewMessage("num_cpus",
m.nodetags,
m.meta,
map[string]any{"value": int(num_cpus)},
map[string]interface{}{"value": int(num_cpus)},
now,
)
if err == nil {

View File

@@ -10,15 +10,13 @@ package collectors
import (
"encoding/json"
"errors"
"fmt"
"log"
"os"
"os/exec"
"slices"
"strings"
"time"
lp "github.com/ClusterCockpit/cc-lib/v2/ccMessage"
lp "github.com/ClusterCockpit/cc-lib/ccMessage"
influx "github.com/influxdata/line-protocol"
)
@@ -51,16 +49,11 @@ func (m *CustomCmdCollector) Init(config json.RawMessage) error {
return err
}
}
if err := m.setup(); err != nil {
return fmt.Errorf("%s Init(): setup() call failed: %w", m.name, err)
}
m.setup()
for _, c := range m.config.Commands {
cmdfields := strings.Fields(c)
command := exec.Command(cmdfields[0], cmdfields[1:]...)
if err := command.Wait(); err != nil {
log.Print(err)
continue
}
command := exec.Command(cmdfields[0], strings.Join(cmdfields[1:], " "))
command.Wait()
_, err = command.Output()
if err == nil {
m.commands = append(m.commands, c)
@@ -95,11 +88,8 @@ func (m *CustomCmdCollector) Read(interval time.Duration, output chan lp.CCMessa
}
for _, cmd := range m.commands {
cmdfields := strings.Fields(cmd)
command := exec.Command(cmdfields[0], cmdfields[1:]...)
if err := command.Wait(); err != nil {
log.Print(err)
continue
}
command := exec.Command(cmdfields[0], strings.Join(cmdfields[1:], " "))
command.Wait()
stdout, err := command.Output()
if err != nil {
log.Print(err)
@@ -111,7 +101,8 @@ func (m *CustomCmdCollector) Read(interval time.Duration, output chan lp.CCMessa
continue
}
for _, c := range cmdmetrics {
if slices.Contains(m.config.ExcludeMetrics, c.Name()) {
_, skip := stringArrayContains(m.config.ExcludeMetrics, c.Name())
if skip {
continue
}
@@ -130,7 +121,8 @@ func (m *CustomCmdCollector) Read(interval time.Duration, output chan lp.CCMessa
continue
}
for _, f := range fmetrics {
if slices.Contains(m.config.ExcludeMetrics, f.Name()) {
_, skip := stringArrayContains(m.config.ExcludeMetrics, f.Name())
if skip {
continue
}
output <- lp.FromInfluxMetric(f)

View File

@@ -10,36 +10,43 @@ package collectors
import (
"bufio"
"encoding/json"
"fmt"
"os"
"strings"
"syscall"
"time"
cclog "github.com/ClusterCockpit/cc-lib/v2/ccLogger"
lp "github.com/ClusterCockpit/cc-lib/v2/ccMessage"
cclog "github.com/ClusterCockpit/cc-lib/ccLogger"
lp "github.com/ClusterCockpit/cc-lib/ccMessage"
)
const MOUNTFILE = `/proc/self/mounts`
type DiskstatCollectorConfig struct {
ExcludeMetrics []string `json:"exclude_metrics,omitempty"`
ExcludeMounts []string `json:"exclude_mounts,omitempty"`
ExcludeMetrics []string `json:"exclude_metrics,omitempty"`
ExcludeDevices []string `json:"exclude_devices,omitempty"`
ExcludeMountpoints []string `json:"exclude_mountpoints,omitempty"`
IncludeDevices []string `json:"include_devices,omitempty"`
IncludeMountpoints []string `json:"include_mountpoints,omitempty"`
UseMountpoint bool `json:"mountpoint_as_stype,omitempty"`
UseIncludeConfig bool `json:"use_include_config,omitempty"`
}
type DiskstatCollector struct {
metricCollector
config DiskstatCollectorConfig
allowedMetrics map[string]bool
config DiskstatCollectorConfig
allowedMetrics map[string]bool
includeDevices map[string]bool
includeMountpoints map[string]bool
excludeDevices map[string]bool
excludeMountpoints map[string]bool
}
func (m *DiskstatCollector) Init(config json.RawMessage) error {
m.name = "DiskstatCollector"
m.parallel = true
m.meta = map[string]string{"source": m.name, "group": "Disk"}
if err := m.setup(); err != nil {
return fmt.Errorf("%s Init(): setup() call failed: %w", m.name, err)
}
m.setup()
m.config.UseIncludeConfig = false
if len(config) > 0 {
if err := json.Unmarshal(config, &m.config); err != nil {
return err
@@ -55,13 +62,50 @@ func (m *DiskstatCollector) Init(config json.RawMessage) error {
m.allowedMetrics[excl] = false
}
}
file, err := os.Open(MOUNTFILE)
if err != nil {
return fmt.Errorf("%s Init(): file open for file \"%s\" failed: %w", m.name, MOUNTFILE, err)
cclog.ComponentError(m.name, err.Error())
return err
}
if err := file.Close(); err != nil {
return fmt.Errorf("%s Init(): file close for file \"%s\" failed: %w", m.name, MOUNTFILE, err)
defer file.Close()
availDevices := make(map[string]struct{})
availMpoints := make(map[string]struct{})
scanner := bufio.NewScanner(file)
for scanner.Scan() {
line := scanner.Text()
if len(line) == 0 {
continue
}
linefields := strings.Fields(line)
availDevices[linefields[0]] = struct{}{}
availMpoints[linefields[1]] = struct{}{}
}
m.includeDevices = make(map[string]bool)
for _, incl := range m.config.IncludeDevices {
if _, ok := availDevices[incl]; ok {
m.includeDevices[incl] = true
} else {
cclog.ComponentWarn(m.name, "Included Mount device ", incl, " does not exist")
}
}
m.includeMountpoints = make(map[string]bool)
for _, incl := range m.config.IncludeMountpoints {
if _, ok := availMpoints[incl]; ok {
m.includeMountpoints[incl] = true
} else {
cclog.ComponentWarn(m.name, "Included Mount point ", incl, " does not exist")
}
}
m.excludeMountpoints = make(map[string]bool)
for _, excl := range m.config.ExcludeMountpoints {
m.excludeMountpoints[excl] = true
}
m.excludeDevices = make(map[string]bool)
for _, excl := range m.config.ExcludeDevices {
m.excludeDevices[excl] = true
}
m.init = true
return nil
}
@@ -73,20 +117,14 @@ func (m *DiskstatCollector) Read(interval time.Duration, output chan lp.CCMessag
file, err := os.Open(MOUNTFILE)
if err != nil {
cclog.ComponentError(
m.name,
fmt.Sprintf("Read(): Failed to open file '%s': %v", MOUNTFILE, err))
cclog.ComponentError(m.name, err.Error())
return
}
defer func() {
if err := file.Close(); err != nil {
cclog.ComponentError(
m.name,
fmt.Sprintf("Read(): Failed to close file '%s': %v", MOUNTFILE, err))
}
}()
defer file.Close()
part_max_used := uint64(0)
part_max_used_device := ""
part_max_used_mountpoint := ""
scanner := bufio.NewScanner(file)
mountLoop:
for scanner.Scan() {
@@ -94,9 +132,9 @@ mountLoop:
if len(line) == 0 {
continue
}
if !strings.HasPrefix(line, "/dev") {
continue
}
// if !strings.HasPrefix(line, "/dev") {
// continue
// }
linefields := strings.Fields(line)
if strings.Contains(linefields[0], "loop") {
continue
@@ -105,10 +143,18 @@ mountLoop:
continue
}
mountPath := strings.ReplaceAll(linefields[1], `\040`, " ")
mountPath := strings.Replace(linefields[1], `\040`, " ", -1)
for _, excl := range m.config.ExcludeMounts {
if strings.Contains(mountPath, excl) {
if m.config.UseIncludeConfig {
_, ok1 := m.includeDevices[linefields[0]]
_, ok2 := m.includeMountpoints[linefields[1]]
if !(ok1 || ok2) {
continue mountLoop
}
} else {
_, ok1 := m.excludeDevices[linefields[0]]
_, ok2 := m.excludeMountpoints[linefields[1]]
if ok1 || ok2 {
continue mountLoop
}
}
@@ -121,10 +167,13 @@ mountLoop:
if stat.Blocks == 0 || stat.Bsize == 0 {
continue
}
tags := map[string]string{"type": "node", "device": linefields[0]}
tags := map[string]string{"type": "node", "stype": "filesystem", "stype-id": linefields[0]}
if m.config.UseMountpoint {
tags["stype-id"] = linefields[1]
}
total := (stat.Blocks * uint64(stat.Bsize)) / uint64(1000000000)
if m.allowedMetrics["disk_total"] {
y, err := lp.NewMessage("disk_total", tags, m.meta, map[string]any{"value": total}, time.Now())
y, err := lp.NewMessage("disk_total", tags, m.meta, map[string]interface{}{"value": total}, time.Now())
if err == nil {
y.AddMeta("unit", "GBytes")
output <- y
@@ -132,7 +181,7 @@ mountLoop:
}
free := (stat.Bfree * uint64(stat.Bsize)) / uint64(1000000000)
if m.allowedMetrics["disk_free"] {
y, err := lp.NewMessage("disk_free", tags, m.meta, map[string]any{"value": free}, time.Now())
y, err := lp.NewMessage("disk_free", tags, m.meta, map[string]interface{}{"value": free}, time.Now())
if err == nil {
y.AddMeta("unit", "GBytes")
output <- y
@@ -142,11 +191,17 @@ mountLoop:
perc := (100 * (total - free)) / total
if perc > part_max_used {
part_max_used = perc
part_max_used_mountpoint = linefields[1]
part_max_used_device = linefields[0]
}
}
}
if m.allowedMetrics["part_max_used"] {
y, err := lp.NewMessage("part_max_used", map[string]string{"type": "node"}, m.meta, map[string]any{"value": int(part_max_used)}, time.Now())
if m.allowedMetrics["part_max_used"] && len(part_max_used_mountpoint) > 0 {
tags := map[string]string{"type": "node", "stype": "filesystem", "stype-id": part_max_used_device}
if m.config.UseMountpoint {
tags["stype-id"] = part_max_used_mountpoint
}
y, err := lp.NewMessage("part_max_used", tags, m.meta, map[string]interface{}{"value": int(part_max_used)}, time.Now())
if err == nil {
y.AddMeta("unit", "percent")
output <- y

View File

@@ -16,19 +16,38 @@ hugo_path: docs/reference/cc-metric-collector/collectors/diskstat.md
"exclude_metrics": [
"disk_total"
],
"exclude_mounts": [
"exclude_devices": [
"slurm-tmpfs"
],
"exclude_mountpoints": [
"/tmp"
],
"mountpoint_as_stype": true,
"use_include_config": false,
"include_devices": [
"/dev/sda3"
],
"include_mountpoints" : [
"/home"
]
}
```
The `diskstat` collector reads data from `/proc/self/mounts` and outputs a handful of **node** metrics. If a metric is not required, it can be excluded from being forwarded to the sink. Additionally, any mount point containing one of the strings specified in `exclude_mounts` will be skipped during metric collection.
The `diskstat` collector reads data from `/proc/self/mounts` and outputs a handful of **node** metrics with `stype=filesystem,stype-id=<mountdevice>`. If a metric is not required, it can be excluded from being forwarded to the sink.
Metrics per device (with `device` tag):
To send the mount point instead of the mount device as the `stype-id`, set `mountpoint_as_stype` to `true`.
There are two ways to specify the devices or mount points for which the collector generates metrics. The two modes are mutually exclusive (either ... or):
- Exclude devices and mount points using `exclude_devices` and `exclude_mountpoints`. All devices (*) that are not explicitly excluded will be read.
- Include devices and mount points by setting `"use_include_config": true` and using `include_devices` and `include_mountpoints`.
(*) File systems where the mount device (first column in `/proc/self/mounts`) contains `loop` are always excluded. Filesystems where the mount point (second column in `/proc/self/mounts`) contains `boot` are also always excluded.
Metrics per filesystem (with `stype=filesystem` tag and `stype-id` based on the configuration):
* `disk_total` (unit `GBytes`)
* `disk_free` (unit `GBytes`)
Global metrics:
Global metrics (with `stype=filesystem` tag and `stype-id` pointing to the max. used filesystem device or mount point based on the configuration):
* `part_max_used` (unit `percent`)

File diff suppressed because it is too large Load Diff

View File

@@ -14,18 +14,12 @@ hugo_path: docs/reference/cc-metric-collector/collectors/gpfs.md
```json
"gpfs": {
"mmpmon_path": "/path/to/mmpmon",
"use_sudo": "true",
"exclude_filesystem": [
"fs1"
],
"exclude_metrics": [
"gpfs_bytes_written"
],
"send_abs_values": true,
"send_diff_values": true,
"send_derived_values": true,
"send_bandwidths": true,
"send_total_values": true,
"send_bandwidths": true
"send_derived_values": true
}
```
@@ -34,50 +28,33 @@ GPFS / IBM Spectrum Scale filesystems.
The reported filesystems can be filtered with the `exclude_filesystem` option
in the configuration.
Individual metrics can be disabled for reporting using option `exclude_metrics`.
The path to the `mmpmon` command can be configured with the `mmpmon_path` option
in the configuration. If nothing is set, the collector searches in `$PATH` for `mmpmon`.
If cc-metric-collector is run as non-root, password-less `sudo` can be enabled with `use_sudo`.
Because `mmpmon` is by default only executable as root, the Go procedure to
search for it in `$PATH` will fail. If you use `sudo`, you must specify the
complete path for `mmpmon` using the parameter `mmpmon_path`.
Metrics:
* `gpfs_bytes_read` (if `send_abs_values == true`)
* `gpfs_bytes_written` (if `send_abs_values == true`)
* `gpfs_num_opens` (if `send_abs_values == true`)
* `gpfs_num_closes` (if `send_abs_values == true`)
* `gpfs_num_reads` (if `send_abs_values == true`)
* `gpfs_num_writes` (if `send_abs_values == true`)
* `gpfs_num_readdirs` (if `send_abs_values == true`)
* `gpfs_num_inode_updates` (if `send_abs_values == true`)
* `gpfs_bytes_read_diff` (if `send_diff_values == true`)
* `gpfs_bytes_written_diff` (if `send_diff_values == true`)
* `gpfs_num_opens_diff` (if `send_diff_values == true`)
* `gpfs_num_closes_diff` (if `send_diff_values == true`)
* `gpfs_num_reads_diff` (if `send_diff_values == true`)
* `gpfs_num_writes_diff` (if `send_diff_values == true`)
* `gpfs_num_readdirs_diff` (if `send_diff_values == true`)
* `gpfs_num_inode_updates_diff` (if `send_diff_values == true`)
* `gpfs_bw_read` (if `send_derived_values == true` or `send_bandwidths == true`)
* `gpfs_bw_write` (if `send_derived_values == true` or `send_bandwidths == true`)
* `gpfs_bytes_read`
* `gpfs_bytes_written`
* `gpfs_num_opens`
* `gpfs_num_closes`
* `gpfs_num_reads`
* `gpfs_num_writes`
* `gpfs_num_readdirs`
* `gpfs_num_inode_updates`
* `gpfs_opens_rate` (if `send_derived_values == true`)
* `gpfs_closes_rate` (if `send_derived_values == true`)
* `gpfs_reads_rate` (if `send_derived_values == true`)
* `gpfs_writes_rate` (if `send_derived_values == true`)
* `gpfs_readdirs_rate` (if `send_derived_values == true`)
* `gpfs_inode_updates_rate` (if `send_derived_values == true`)
* `gpfs_bytes_total = gpfs_bytes_read + gpfs_bytes_written` (if `send_total_values == true` and `send_abs_values == true`)
* `gpfs_bytes_total_diff` (if `send_total_values == true` and `send_diff_values == true`)
* `gpfs_bw_total` ((if `send_total_values == true` and `send_derived_values == true`) or `send_bandwidths == true`)
* `gpfs_iops = gpfs_num_reads + gpfs_num_writes` (if `send_total_values == true` and `send_abs_values == true`)
* `gpfs_iops_diff` (if `send_total_values == true` and `send_diff_values == true`)
* `gpfs_bytes_total = gpfs_bytes_read + gpfs_bytes_written` (if `send_total_values == true`)
* `gpfs_iops = gpfs_num_reads + gpfs_num_writes` (if `send_total_values == true`)
* `gpfs_iops_rate` (if `send_total_values == true` and `send_derived_values == true`)
* `gpfs_metaops = gpfs_num_inode_updates + gpfs_num_closes + gpfs_num_opens + gpfs_num_readdirs` (if `send_total_values == true` and `send_abs_values == true`)
* `gpfs_metaops_diff` (if `send_total_values == true` and `send_diff_values == true`)
* `gpfs_metaops = gpfs_num_inode_updates + gpfs_num_closes + gpfs_num_opens + gpfs_num_readdirs` (if `send_total_values == true`)
* `gpfs_metaops_rate` (if `send_total_values == true` and `send_derived_values == true`)
* `gpfs_bw_read` (if `send_bandwidths == true`)
* `gpfs_bw_write` (if `send_bandwidths == true`)
* `gpfs_bw_total` (if `send_bandwidths == true` and `send_total_values == true`)
The collector adds a `filesystem` tag to all metrics

View File

@@ -10,10 +10,9 @@ package collectors
import (
"fmt"
"os"
"slices"
cclog "github.com/ClusterCockpit/cc-lib/v2/ccLogger"
lp "github.com/ClusterCockpit/cc-lib/v2/ccMessage"
cclog "github.com/ClusterCockpit/cc-lib/ccLogger"
lp "github.com/ClusterCockpit/cc-lib/ccMessage"
"golang.org/x/sys/unix"
"encoding/json"
@@ -66,9 +65,7 @@ func (m *InfinibandCollector) Init(config json.RawMessage) error {
var err error
m.name = "InfinibandCollector"
if err := m.setup(); err != nil {
return fmt.Errorf("%s Init(): setup() call failed: %w", m.name, err)
}
m.setup()
m.parallel = true
m.meta = map[string]string{
"source": m.name,
@@ -114,7 +111,14 @@ func (m *InfinibandCollector) Init(config json.RawMessage) error {
port := pathSplit[6]
// Skip excluded devices
if slices.Contains(m.config.ExcludeDevices, device) {
skip := false
for _, excludedDevice := range m.config.ExcludeDevices {
if excludedDevice == device {
skip = true
break
}
}
if skip {
continue
}
@@ -237,7 +241,7 @@ func (m *InfinibandCollector) Read(interval time.Duration, output chan lp.CCMess
counterDef.name,
info.tagSet,
m.meta,
map[string]any{
map[string]interface{}{
"value": counterDef.currentState,
},
now); err == nil {
@@ -255,7 +259,7 @@ func (m *InfinibandCollector) Read(interval time.Duration, output chan lp.CCMess
counterDef.name+"_bw",
info.tagSet,
m.meta,
map[string]any{
map[string]interface{}{
"value": rate,
},
now); err == nil {
@@ -285,7 +289,7 @@ func (m *InfinibandCollector) Read(interval time.Duration, output chan lp.CCMess
"ib_total",
info.tagSet,
m.meta,
map[string]any{
map[string]interface{}{
"value": ib_total,
},
now); err == nil {
@@ -298,7 +302,7 @@ func (m *InfinibandCollector) Read(interval time.Duration, output chan lp.CCMess
"ib_total_pkts",
info.tagSet,
m.meta,
map[string]any{
map[string]interface{}{
"value": ib_total_pkts,
},
now); err == nil {

View File

@@ -11,28 +11,27 @@ import (
"bufio"
"encoding/json"
"errors"
"fmt"
"os"
"slices"
"strconv"
"strings"
"time"
cclog "github.com/ClusterCockpit/cc-lib/v2/ccLogger"
lp "github.com/ClusterCockpit/cc-lib/v2/ccMessage"
cclog "github.com/ClusterCockpit/cc-lib/ccLogger"
lp "github.com/ClusterCockpit/cc-lib/ccMessage"
)
// IOSTATFILE is the constant for the path to /proc/diskstats
const IOSTATFILE = `/proc/diskstats`
// IOstatCollectorConfig holds the collector-specific options that are
// decoded from the JSON configuration.
type IOstatCollectorConfig struct {
// ExcludeMetrics lists metric names that are left out of the set of
// metrics registered during Init().
ExcludeMetrics []string `json:"exclude_metrics,omitempty"`
// New field for excluding devices via the JSON configuration; devices
// listed here are skipped when the diskstats lines are processed.
ExcludeDevices []string `json:"exclude_devices,omitempty"`
}
type IOstatCollectorEntry struct {
currentValues map[string]int64
lastValues map[string]int64
tags map[string]string
lastValues map[string]int64
tags map[string]string
}
type IOstatCollector struct {
@@ -47,9 +46,7 @@ func (m *IOstatCollector) Init(config json.RawMessage) error {
m.name = "IOstatCollector"
m.parallel = true
m.meta = map[string]string{"source": m.name, "group": "Disk"}
if err := m.setup(); err != nil {
return fmt.Errorf("%s Init(): setup() call failed: %w", m.name, err)
}
m.setup()
if len(config) > 0 {
err = json.Unmarshal(config, &m.config)
if err != nil {
@@ -79,7 +76,7 @@ func (m *IOstatCollector) Init(config json.RawMessage) error {
m.devices = make(map[string]IOstatCollectorEntry)
m.matches = make(map[string]int)
for k, v := range matches {
if !slices.Contains(m.config.ExcludeMetrics, k) {
if _, skip := stringArrayContains(m.config.ExcludeMetrics, k); !skip {
m.matches[k] = v
}
}
@@ -88,8 +85,10 @@ func (m *IOstatCollector) Init(config json.RawMessage) error {
}
file, err := os.Open(IOSTATFILE)
if err != nil {
return fmt.Errorf("%s Init(): Failed to open file \"%s\": %w", m.name, IOSTATFILE, err)
cclog.ComponentError(m.name, err.Error())
return err
}
defer file.Close()
scanner := bufio.NewScanner(file)
for scanner.Scan() {
@@ -103,36 +102,21 @@ func (m *IOstatCollector) Init(config json.RawMessage) error {
if strings.Contains(device, "loop") {
continue
}
if slices.Contains(m.config.ExcludeDevices, device) {
if _, skip := stringArrayContains(m.config.ExcludeDevices, device); skip {
continue
}
currentValues := make(map[string]int64)
lastValues := make(map[string]int64)
values := make(map[string]int64)
for m := range m.matches {
currentValues[m] = 0
lastValues[m] = 0
}
for name, idx := range m.matches {
if idx < len(linefields) {
if value, err := strconv.ParseInt(linefields[idx], 0, 64); err == nil {
currentValues[name] = value
lastValues[name] = value // Set last to current for first read
}
}
values[m] = 0
}
m.devices[device] = IOstatCollectorEntry{
tags: map[string]string{
"device": device,
"type": "node",
},
currentValues: currentValues,
lastValues: lastValues,
lastValues: values,
}
}
if err := file.Close(); err != nil {
return fmt.Errorf("%s Init(): Failed to close file \"%s\": %w", m.name, IOSTATFILE, err)
}
m.init = true
return err
}
@@ -144,18 +128,10 @@ func (m *IOstatCollector) Read(interval time.Duration, output chan lp.CCMessage)
file, err := os.Open(IOSTATFILE)
if err != nil {
cclog.ComponentError(
m.name,
fmt.Sprintf("Read(): Failed to open file '%s': %v", IOSTATFILE, err))
cclog.ComponentError(m.name, err.Error())
return
}
defer func() {
if err := file.Close(); err != nil {
cclog.ComponentError(
m.name,
fmt.Sprintf("Read(): Failed to close file '%s': %v", IOSTATFILE, err))
}
}()
defer file.Close()
scanner := bufio.NewScanner(file)
for scanner.Scan() {
@@ -171,28 +147,24 @@ func (m *IOstatCollector) Read(interval time.Duration, output chan lp.CCMessage)
if strings.Contains(device, "loop") {
continue
}
if slices.Contains(m.config.ExcludeDevices, device) {
if _, skip := stringArrayContains(m.config.ExcludeDevices, device); skip {
continue
}
if _, ok := m.devices[device]; !ok {
continue
}
// Update current and last values
entry := m.devices[device]
for name, idx := range m.matches {
if idx < len(linefields) {
x, err := strconv.ParseInt(linefields[idx], 0, 64)
if err == nil {
// Calculate difference using previous current and new value
diff := x - entry.currentValues[name]
y, err := lp.NewMetric(name, entry.tags, m.meta, int(diff), time.Now())
diff := x - entry.lastValues[name]
y, err := lp.NewMessage(name, entry.tags, m.meta, map[string]interface{}{"value": int(diff)}, time.Now())
if err == nil {
output <- y
}
// Update last to previous current, and current to new value
entry.lastValues[name] = entry.currentValues[name]
entry.currentValues[name] = x
}
entry.lastValues[name] = x
}
}
m.devices[device] = entry

View File

@@ -14,13 +14,14 @@ import (
"errors"
"fmt"
"io"
"log"
"os/exec"
"strconv"
"strings"
"time"
cclog "github.com/ClusterCockpit/cc-lib/v2/ccLogger"
lp "github.com/ClusterCockpit/cc-lib/v2/ccMessage"
cclog "github.com/ClusterCockpit/cc-lib/ccLogger"
lp "github.com/ClusterCockpit/cc-lib/ccMessage"
)
const IPMISENSORS_PATH = `ipmi-sensors`
@@ -43,9 +44,7 @@ func (m *IpmiCollector) Init(config json.RawMessage) error {
}
m.name = "IpmiCollector"
if err := m.setup(); err != nil {
return fmt.Errorf("%s Init(): setup() call failed: %w", m.name, err)
}
m.setup()
m.parallel = true
m.meta = map[string]string{
"source": m.name,
@@ -117,20 +116,19 @@ func (m *IpmiCollector) readIpmiTool(cmd string, output chan lp.CCMessage) {
}
v, err := strconv.ParseFloat(strings.TrimSpace(lv[1]), 64)
if err == nil {
name := strings.ToLower(strings.ReplaceAll(strings.TrimSpace(lv[0]), " ", "_"))
name := strings.ToLower(strings.Replace(strings.TrimSpace(lv[0]), " ", "_", -1))
unit := strings.TrimSpace(lv[2])
switch unit {
case "Volts":
if unit == "Volts" {
unit = "Volts"
case "degrees C":
} else if unit == "degrees C" {
unit = "degC"
case "degrees F":
} else if unit == "degrees F" {
unit = "degF"
case "Watts":
} else if unit == "Watts" {
unit = "Watts"
}
y, err := lp.NewMessage(name, map[string]string{"type": "node"}, m.meta, map[string]any{"value": v}, time.Now())
y, err := lp.NewMessage(name, map[string]string{"type": "node"}, m.meta, map[string]interface{}{"value": v}, time.Now())
if err == nil {
y.AddMeta("unit", unit)
output <- y
@@ -152,30 +150,23 @@ func (m *IpmiCollector) readIpmiTool(cmd string, output chan lp.CCMessage) {
func (m *IpmiCollector) readIpmiSensors(cmd string, output chan lp.CCMessage) {
// Setup ipmisensors command
command := exec.Command(cmd, "--comma-separated-output", "--sdr-cache-recreate")
stdout, _ := command.StdoutPipe()
errBuf := new(bytes.Buffer)
command.Stderr = errBuf
// start command
if err := command.Start(); err != nil {
cclog.ComponentError(
m.name,
fmt.Sprintf("readIpmiSensors(): Failed to start command \"%s\": %v", command.String(), err),
)
command.Wait()
stdout, err := command.Output()
if err != nil {
log.Print(err)
return
}
// Read command output
scanner := bufio.NewScanner(stdout)
for scanner.Scan() {
lv := strings.Split(scanner.Text(), ",")
ll := strings.Split(string(stdout), "\n")
for _, line := range ll {
lv := strings.Split(line, ",")
if len(lv) > 3 {
v, err := strconv.ParseFloat(lv[3], 64)
if err == nil {
name := strings.ToLower(strings.ReplaceAll(lv[1], " ", "_"))
y, err := lp.NewMessage(name, map[string]string{"type": "node"}, m.meta, map[string]any{"value": v}, time.Now())
name := strings.ToLower(strings.Replace(lv[1], " ", "_", -1))
y, err := lp.NewMessage(name, map[string]string{"type": "node"}, m.meta, map[string]interface{}{"value": v}, time.Now())
if err == nil {
if len(lv) > 4 {
y.AddMeta("unit", lv[4])
@@ -185,18 +176,6 @@ func (m *IpmiCollector) readIpmiSensors(cmd string, output chan lp.CCMessage) {
}
}
}
// Wait for command end
if err := command.Wait(); err != nil {
errMsg, _ := io.ReadAll(errBuf)
cclog.ComponentError(
m.name,
fmt.Sprintf("readIpmiSensors(): Failed to wait for the end of command \"%s\": %v\n", command.String(), err),
)
cclog.ComponentError(m.name, fmt.Sprintf("readIpmiSensors(): command stderr: \"%s\"\n", strings.TrimSpace(string(errMsg))))
return
}
}
func (m *IpmiCollector) Read(interval time.Duration, output chan lp.CCMessage) {

View File

@@ -12,6 +12,12 @@ package collectors
#cgo LDFLAGS: -Wl,--unresolved-symbols=ignore-in-object-files
#include <stdlib.h>
#include <likwid.h>
// _HPMaddThread forwards cpuid unchanged to LIKWID's HPMaddThread() and
// returns that call's result.
// NOTE(review): presumably this wrapper exists so the Go side can call a
// function with a fixed `int` parameter regardless of how the installed
// likwid.h declares HPMaddThread -- confirm against the LIKWID headers.
int _HPMaddThread(int cpuid) {
return HPMaddThread(cpuid);
}
*/
import "C"
@@ -19,7 +25,6 @@ import (
"encoding/json"
"errors"
"fmt"
"maps"
"math"
"os"
"os/signal"
@@ -32,8 +37,8 @@ import (
"time"
"unsafe"
cclog "github.com/ClusterCockpit/cc-lib/v2/ccLogger"
lp "github.com/ClusterCockpit/cc-lib/v2/ccMessage"
cclog "github.com/ClusterCockpit/cc-lib/ccLogger"
lp "github.com/ClusterCockpit/cc-lib/ccMessage"
agg "github.com/ClusterCockpit/cc-metric-collector/internal/metricAggregator"
topo "github.com/ClusterCockpit/cc-metric-collector/pkg/ccTopology"
"github.com/NVIDIA/go-nvml/pkg/dl"
@@ -188,7 +193,7 @@ func getBaseFreq() float64 {
for _, f := range files {
buffer, err := os.ReadFile(f)
if err == nil {
data := strings.ReplaceAll(string(buffer), "\n", "")
data := strings.Replace(string(buffer), "\n", "", -1)
x, err := strconv.ParseInt(data, 0, 64)
if err == nil {
freq = float64(x)
@@ -231,13 +236,13 @@ func (m *LikwidCollector) Init(config json.RawMessage) error {
if m.config.ForceOverwrite {
cclog.ComponentDebug(m.name, "Set LIKWID_FORCE=1")
if err := os.Setenv("LIKWID_FORCE", "1"); err != nil {
return fmt.Errorf("error setting environment variable LIKWID_FORCE=1: %v", err)
}
}
if err := m.setup(); err != nil {
return fmt.Errorf("%s Init(): setup() call failed: %w", m.name, err)
os.Setenv("LIKWID_FORCE", "1")
}
m.setup()
major := C.likwid_getMajorVersion()
minor := C.likwid_getMinorVersion()
bugfix := C.likwid_getBugfixVersion()
cclog.ComponentDebug(m.name, fmt.Sprintf("Using LIKWID library %d.%d.%d at %s with %s access", major, minor, bugfix, m.config.LibraryPath, m.config.AccessMode))
m.meta = map[string]string{"group": "PerfCounter"}
cclog.ComponentDebug(m.name, "Get cpulist and init maps and lists")
@@ -321,14 +326,7 @@ func (m *LikwidCollector) Init(config json.RawMessage) error {
case "accessdaemon":
if len(m.config.DaemonPath) > 0 {
p := os.Getenv("PATH")
if len(p) > 0 {
p = m.config.DaemonPath + ":" + p
} else {
p = m.config.DaemonPath
}
if err := os.Setenv("PATH", p); err != nil {
return fmt.Errorf("error setting environment variable PATH=%s: %v", p, err)
}
os.Setenv("PATH", m.config.DaemonPath+":"+p)
}
C.HPMmode(1)
retCode := C.HPMinit()
@@ -339,7 +337,7 @@ func (m *LikwidCollector) Init(config json.RawMessage) error {
for _, c := range m.cpulist {
m.measureThread.Call(
func() {
retCode := C.HPMaddThread(c)
retCode := C._HPMaddThread(C.int(c))
if retCode != 0 {
err := fmt.Errorf("C.HPMaddThread(%v) failed with return code %v", c, retCode)
cclog.ComponentError(m.name, err.Error())
@@ -387,18 +385,10 @@ func (m *LikwidCollector) takeMeasurement(evidx int, evset LikwidEventsetConfig,
// Watch changes for the lock file ()
watcher, err := fsnotify.NewWatcher()
if err != nil {
cclog.ComponentError(
m.name,
fmt.Sprintf("takeMeasurement(): Failed to create a new fsnotify.Watcher: %v", err))
cclog.ComponentError(m.name, err.Error())
return true, err
}
defer func() {
if err := watcher.Close(); err != nil {
cclog.ComponentError(
m.name,
fmt.Sprintf("takeMeasurement(): Failed to close fsnotify.Watcher: %v", err))
}
}()
defer watcher.Close()
if len(m.config.LockfilePath) > 0 {
// Check if the lock file exists
info, err := os.Stat(m.config.LockfilePath)
@@ -408,9 +398,7 @@ func (m *LikwidCollector) takeMeasurement(evidx int, evset LikwidEventsetConfig,
if createErr != nil {
return true, fmt.Errorf("failed to create lock file: %v", createErr)
}
if err := file.Close(); err != nil {
return true, fmt.Errorf("failed to close lock file: %v", err)
}
file.Close()
info, err = os.Stat(m.config.LockfilePath) // Recheck the file after creation
}
if err != nil {
@@ -450,9 +438,13 @@ func (m *LikwidCollector) takeMeasurement(evidx int, evset LikwidEventsetConfig,
case e := <-watcher.Events:
ret = -1
if e.Op != fsnotify.Chmod {
C.HPMfinalize()
C.HPMinit()
ret = C.perfmon_init(C.int(len(m.cpulist)), &m.cpulist[0])
}
default:
C.HPMfinalize()
C.HPMinit()
ret = C.perfmon_init(C.int(len(m.cpulist)), &m.cpulist[0])
}
if ret != 0 {
@@ -617,6 +609,7 @@ func (m *LikwidCollector) calcEventsetMetrics(evset LikwidEventsetConfig, interv
evset.metrics[tid][metric.Name] = value
// Now we have the result, send it with the proper tags
if !math.IsNaN(value) && metric.Publish {
fields := map[string]interface{}{"value": value}
y, err :=
lp.NewMessage(
metric.Name,
@@ -624,9 +617,7 @@ func (m *LikwidCollector) calcEventsetMetrics(evset LikwidEventsetConfig, interv
"type": metric.Type,
},
m.meta,
map[string]any{
"value": value,
},
fields,
now,
)
if err == nil {
@@ -664,7 +655,7 @@ func (m *LikwidCollector) calcEventsetMetrics(evset LikwidEventsetConfig, interv
"type-id": fmt.Sprintf("%d", coreID),
},
m.meta,
map[string]any{
map[string]interface{}{
"value": value,
},
now,
@@ -701,7 +692,7 @@ func (m *LikwidCollector) calcEventsetMetrics(evset LikwidEventsetConfig, interv
"type-id": fmt.Sprintf("%d", socketID),
},
m.meta,
map[string]any{
map[string]interface{}{
"value": value,
},
now,
@@ -735,7 +726,7 @@ func (m *LikwidCollector) calcEventsetMetrics(evset LikwidEventsetConfig, interv
"type": "node",
},
m.meta,
map[string]any{
map[string]interface{}{
"value": totalNodeValue,
},
now,
@@ -771,7 +762,9 @@ func (m *LikwidCollector) calcGlobalMetrics(groups []LikwidEventsetConfig, inter
// Here we generate parameter list
params := make(map[string]float64)
for _, evset := range groups {
maps.Copy(params, evset.metrics[tid])
for mname, mres := range evset.metrics[tid] {
params[mname] = mres
}
}
params["gotime"] = interval.Seconds()
// Evaluate the metric
@@ -793,7 +786,7 @@ func (m *LikwidCollector) calcGlobalMetrics(groups []LikwidEventsetConfig, inter
"type": metric.Type,
},
m.meta,
map[string]any{
map[string]interface{}{
"value": value,
},
now,
@@ -834,21 +827,13 @@ func (m *LikwidCollector) ReadThread(interval time.Duration, output chan lp.CCMe
if !skip {
// read measurements and derive event set metrics
err = m.calcEventsetMetrics(e, interval, output)
if err != nil {
cclog.ComponentError(m.name, err.Error())
return
}
m.calcEventsetMetrics(e, interval, output)
groups = append(groups, e)
}
}
if len(groups) > 0 {
// calculate global metrics
err = m.calcGlobalMetrics(groups, interval, output)
if err != nil {
cclog.ComponentError(m.name, err.Error())
return
}
m.calcGlobalMetrics(groups, interval, output)
}
}

View File

@@ -11,13 +11,12 @@ import (
"encoding/json"
"fmt"
"os"
"slices"
"strconv"
"strings"
"time"
cclog "github.com/ClusterCockpit/cc-lib/v2/ccLogger"
lp "github.com/ClusterCockpit/cc-lib/v2/ccMessage"
cclog "github.com/ClusterCockpit/cc-lib/ccLogger"
lp "github.com/ClusterCockpit/cc-lib/ccMessage"
)
// LoadavgCollector collects:
@@ -43,9 +42,7 @@ type LoadavgCollector struct {
func (m *LoadavgCollector) Init(config json.RawMessage) error {
m.name = "LoadavgCollector"
m.parallel = true
if err := m.setup(); err != nil {
return fmt.Errorf("%s Init(): setup() call failed: %w", m.name, err)
}
m.setup()
if len(config) > 0 {
err := json.Unmarshal(config, &m.config)
if err != nil {
@@ -67,10 +64,10 @@ func (m *LoadavgCollector) Init(config json.RawMessage) error {
m.proc_skips = make([]bool, len(m.proc_matches))
for i, name := range m.load_matches {
m.load_skips[i] = slices.Contains(m.config.ExcludeMetrics, name)
_, m.load_skips[i] = stringArrayContains(m.config.ExcludeMetrics, name)
}
for i, name := range m.proc_matches {
m.proc_skips[i] = slices.Contains(m.config.ExcludeMetrics, name)
_, m.proc_skips[i] = stringArrayContains(m.config.ExcludeMetrics, name)
}
m.init = true
return nil
@@ -102,7 +99,7 @@ func (m *LoadavgCollector) Read(interval time.Duration, output chan lp.CCMessage
if m.load_skips[i] {
continue
}
y, err := lp.NewMessage(name, m.tags, m.meta, map[string]any{"value": x}, now)
y, err := lp.NewMessage(name, m.tags, m.meta, map[string]interface{}{"value": x}, now)
if err == nil {
output <- y
}
@@ -121,7 +118,7 @@ func (m *LoadavgCollector) Read(interval time.Duration, output chan lp.CCMessage
if m.proc_skips[i] {
continue
}
y, err := lp.NewMessage(name, m.tags, m.meta, map[string]any{"value": x}, now)
y, err := lp.NewMessage(name, m.tags, m.meta, map[string]interface{}{"value": x}, now)
if err == nil {
output <- y
}

View File

@@ -13,13 +13,12 @@ import (
"fmt"
"os/exec"
"os/user"
"slices"
"strconv"
"strings"
"time"
cclog "github.com/ClusterCockpit/cc-lib/v2/ccLogger"
lp "github.com/ClusterCockpit/cc-lib/v2/ccMessage"
cclog "github.com/ClusterCockpit/cc-lib/ccLogger"
lp "github.com/ClusterCockpit/cc-lib/ccMessage"
)
const LUSTRE_SYSFS = `/sys/fs/lustre`
@@ -62,6 +61,7 @@ func (m *LustreCollector) getDeviceDataCommand(device string) []string {
} else {
command = exec.Command(m.lctl, LCTL_OPTION, statsfile)
}
command.Wait()
stdout, _ := command.Output()
return strings.Split(string(stdout), "\n")
}
@@ -302,9 +302,7 @@ func (m *LustreCollector) Init(config json.RawMessage) error {
return err
}
}
if err := m.setup(); err != nil {
return fmt.Errorf("%s Init(): setup() call failed: %w", m.name, err)
}
m.setup()
m.tags = map[string]string{"type": "node"}
m.meta = map[string]string{"source": m.name, "group": "Lustre"}
@@ -341,21 +339,21 @@ func (m *LustreCollector) Init(config json.RawMessage) error {
m.definitions = []LustreMetricDefinition{}
if m.config.SendAbsoluteValues {
for _, def := range LustreAbsMetrics {
if !slices.Contains(m.config.ExcludeMetrics, def.name) {
if _, skip := stringArrayContains(m.config.ExcludeMetrics, def.name); !skip {
m.definitions = append(m.definitions, def)
}
}
}
if m.config.SendDiffValues {
for _, def := range LustreDiffMetrics {
if !slices.Contains(m.config.ExcludeMetrics, def.name) {
if _, skip := stringArrayContains(m.config.ExcludeMetrics, def.name); !skip {
m.definitions = append(m.definitions, def)
}
}
}
if m.config.SendDerivedValues {
for _, def := range LustreDeriveMetrics {
if !slices.Contains(m.config.ExcludeMetrics, def.name) {
if _, skip := stringArrayContains(m.config.ExcludeMetrics, def.name); !skip {
m.definitions = append(m.definitions, def)
}
}
@@ -404,23 +402,23 @@ func (m *LustreCollector) Read(interval time.Duration, output chan lp.CCMessage)
} else {
use_x = devData[def.name]
}
var value any
var value interface{}
switch def.calc {
case "none":
value = use_x
y, err = lp.NewMessage(def.name, m.tags, m.meta, map[string]any{"value": value}, time.Now())
y, err = lp.NewMessage(def.name, m.tags, m.meta, map[string]interface{}{"value": value}, time.Now())
case "difference":
value = use_x - devData[def.name]
if value.(int64) < 0 {
value = 0
}
y, err = lp.NewMessage(def.name, m.tags, m.meta, map[string]any{"value": value}, time.Now())
y, err = lp.NewMessage(def.name, m.tags, m.meta, map[string]interface{}{"value": value}, time.Now())
case "derivative":
value = float64(use_x-devData[def.name]) / tdiff.Seconds()
if value.(float64) < 0 {
value = 0
}
y, err = lp.NewMessage(def.name, m.tags, m.meta, map[string]any{"value": value}, time.Now())
y, err = lp.NewMessage(def.name, m.tags, m.meta, map[string]interface{}{"value": value}, time.Now())
}
if err == nil {
y.AddTag("device", device)

View File

@@ -15,13 +15,12 @@ import (
"os"
"path/filepath"
"regexp"
"slices"
"strconv"
"strings"
"time"
cclog "github.com/ClusterCockpit/cc-lib/v2/ccLogger"
lp "github.com/ClusterCockpit/cc-lib/v2/ccMessage"
cclog "github.com/ClusterCockpit/cc-lib/ccLogger"
lp "github.com/ClusterCockpit/cc-lib/ccMessage"
)
const MEMSTATFILE = "/proc/meminfo"
@@ -59,11 +58,7 @@ func getStats(filename string) map[string]MemstatStats {
if err != nil {
cclog.Error(err.Error())
}
defer func() {
if err := file.Close(); err != nil {
cclog.Error(err.Error())
}
}()
defer file.Close()
scanner := bufio.NewScanner(file)
for scanner.Scan() {
@@ -120,20 +115,19 @@ func (m *MemstatCollector) Init(config json.RawMessage) error {
"MemShared": "mem_shared",
}
for k, v := range matches {
if !slices.Contains(m.config.ExcludeMetrics, k) {
_, skip := stringArrayContains(m.config.ExcludeMetrics, k)
if !skip {
m.matches[k] = v
}
}
m.sendMemUsed = false
if !slices.Contains(m.config.ExcludeMetrics, "mem_used") {
if _, skip := stringArrayContains(m.config.ExcludeMetrics, "mem_used"); !skip {
m.sendMemUsed = true
}
if len(m.matches) == 0 {
return errors.New("no metrics to collect")
}
if err := m.setup(); err != nil {
return fmt.Errorf("%s Init(): setup() call failed: %w", m.name, err)
}
m.setup()
if m.config.NodeStats {
if stats := getStats(MEMSTATFILE); len(stats) == 0 {
@@ -180,7 +174,7 @@ func (m *MemstatCollector) Read(interval time.Duration, output chan lp.CCMessage
sendStats := func(stats map[string]MemstatStats, tags map[string]string) {
for match, name := range m.matches {
var value float64 = 0
unit := ""
var unit string = ""
if v, ok := stats[match]; ok {
value = v.value
if len(v.unit) > 0 {
@@ -188,7 +182,7 @@ func (m *MemstatCollector) Read(interval time.Duration, output chan lp.CCMessage
}
}
y, err := lp.NewMessage(name, tags, m.meta, map[string]any{"value": value}, time.Now())
y, err := lp.NewMessage(name, tags, m.meta, map[string]interface{}{"value": value}, time.Now())
if err == nil {
if len(unit) > 0 {
y.AddMeta("unit", unit)
@@ -221,7 +215,7 @@ func (m *MemstatCollector) Read(interval time.Duration, output chan lp.CCMessage
}
}
}
y, err := lp.NewMessage("mem_used", tags, m.meta, map[string]any{"value": memUsed}, time.Now())
y, err := lp.NewMessage("mem_used", tags, m.meta, map[string]interface{}{"value": memUsed}, time.Now())
if err == nil {
if len(unit) > 0 {
y.AddMeta("unit", unit)

View File

@@ -12,7 +12,7 @@ import (
"fmt"
"time"
lp "github.com/ClusterCockpit/cc-lib/v2/ccMessage"
lp "github.com/ClusterCockpit/cc-lib/ccMessage"
)
type MetricCollector interface {
@@ -51,6 +51,30 @@ func (c *metricCollector) Initialized() bool {
return c.init
}
// intArrayContains performs a linear search for the value str in array.
// When a matching element exists, the index of the first match is returned
// together with true. When there is no match (including a nil or empty
// array), the result is -1 and false.
func intArrayContains(array []int, str int) (int, bool) {
	for pos := 0; pos < len(array); pos++ {
		if array[pos] != str {
			continue
		}
		return pos, true
	}
	return -1, false
}
// stringArrayContains performs a linear search for the string str in array.
// When a matching element exists, the index of the first match is returned
// together with true. When there is no match (including a nil or empty
// array), the result is -1 and false. The comparison is exact and
// case-sensitive.
func stringArrayContains(array []string, str string) (int, bool) {
	for pos := 0; pos < len(array); pos++ {
		if array[pos] != str {
			continue
		}
		return pos, true
	}
	return -1, false
}
// RemoveFromStringList removes the string r from the array of strings s
// If r is not contained in the array an error is returned
func RemoveFromStringList(s []string, r string) ([]string, error) {

View File

@@ -10,15 +10,14 @@ package collectors
import (
"bufio"
"encoding/json"
"fmt"
"errors"
"os"
"slices"
"strconv"
"strings"
"time"
cclog "github.com/ClusterCockpit/cc-lib/v2/ccLogger"
lp "github.com/ClusterCockpit/cc-lib/v2/ccMessage"
cclog "github.com/ClusterCockpit/cc-lib/ccLogger"
lp "github.com/ClusterCockpit/cc-lib/ccMessage"
)
const NETSTATFILE = "/proc/net/dev"
@@ -66,9 +65,7 @@ func getCanonicalName(raw string, aliasToCanonical map[string]string) string {
func (m *NetstatCollector) Init(config json.RawMessage) error {
m.name = "NetstatCollector"
m.parallel = true
if err := m.setup(); err != nil {
return fmt.Errorf("%s Init(): setup() call failed: %w", m.name, err)
}
m.setup()
m.lastTimestamp = time.Now()
const (
@@ -110,8 +107,10 @@ func (m *NetstatCollector) Init(config json.RawMessage) error {
// Check access to net statistic file
file, err := os.Open(NETSTATFILE)
if err != nil {
return fmt.Errorf("%s Init(): failed to open netstat file \"%s\": %w", m.name, NETSTATFILE, err)
cclog.ComponentError(m.name, err.Error())
return err
}
defer file.Close()
scanner := bufio.NewScanner(file)
for scanner.Scan() {
@@ -130,7 +129,7 @@ func (m *NetstatCollector) Init(config json.RawMessage) error {
canonical := getCanonicalName(raw, m.aliasToCanonical)
// Check if device is a included device
if slices.Contains(m.config.IncludeDevices, canonical) {
if _, ok := stringArrayContains(m.config.IncludeDevices, canonical); ok {
// Tag will contain original device name (raw).
tags := map[string]string{"stype": "network", "stype-id": raw, "type": "node"}
meta_unit_byte := map[string]string{"source": m.name, "group": "Network", "unit": "bytes"}
@@ -175,13 +174,8 @@ func (m *NetstatCollector) Init(config json.RawMessage) error {
}
}
// Close netstat file
if err := file.Close(); err != nil {
return fmt.Errorf("%s Init(): failed to close netstat file \"%s\": %w", m.name, NETSTATFILE, err)
}
if len(m.matches) == 0 {
return fmt.Errorf("%s Init(): no devices to collect metrics found", m.name)
return errors.New("no devices to collector metrics found")
}
m.init = true
return nil
@@ -200,18 +194,10 @@ func (m *NetstatCollector) Read(interval time.Duration, output chan lp.CCMessage
file, err := os.Open(NETSTATFILE)
if err != nil {
cclog.ComponentError(
m.name,
fmt.Sprintf("Read(): Failed to open file '%s': %v", NETSTATFILE, err))
cclog.ComponentError(m.name, err.Error())
return
}
defer func() {
if err := file.Close(); err != nil {
cclog.ComponentError(
m.name,
fmt.Sprintf("Read(): Failed to close file '%s': %v", NETSTATFILE, err))
}
}()
defer file.Close()
scanner := bufio.NewScanner(file)
for scanner.Scan() {
@@ -240,14 +226,14 @@ func (m *NetstatCollector) Read(interval time.Duration, output chan lp.CCMessage
continue
}
if m.config.SendAbsoluteValues {
if y, err := lp.NewMessage(metric.name, metric.tags, metric.meta, map[string]any{"value": v}, now); err == nil {
if y, err := lp.NewMessage(metric.name, metric.tags, metric.meta, map[string]interface{}{"value": v}, now); err == nil {
output <- y
}
}
if m.config.SendDerivedValues {
if metric.lastValue >= 0 {
rate := float64(v-metric.lastValue) / timeDiff
if y, err := lp.NewMessage(metric.name+"_bw", metric.tags, metric.meta_rates, map[string]any{"value": rate}, now); err == nil {
if y, err := lp.NewMessage(metric.name+"_bw", metric.tags, metric.meta_rates, map[string]interface{}{"value": rate}, now); err == nil {
output <- y
}
}

View File

@@ -11,7 +11,6 @@ import (
"encoding/json"
"fmt"
"log"
"slices"
// "os"
"os/exec"
@@ -19,8 +18,7 @@ import (
"strings"
"time"
cclog "github.com/ClusterCockpit/cc-lib/v2/ccLogger"
lp "github.com/ClusterCockpit/cc-lib/v2/ccMessage"
lp "github.com/ClusterCockpit/cc-lib/ccMessage"
)
// First part contains the code for the general NfsCollector.
@@ -46,15 +44,10 @@ type nfsCollector struct {
func (m *nfsCollector) initStats() error {
cmd := exec.Command(m.config.Nfsstats, `-l`, `--all`)
// Wait for cmd end
if err := cmd.Wait(); err != nil {
return fmt.Errorf("initStats(): %w", err)
}
cmd.Wait()
buffer, err := cmd.Output()
if err == nil {
for line := range strings.Lines(string(buffer)) {
for _, line := range strings.Split(string(buffer), "\n") {
lf := strings.Fields(line)
if len(lf) != 5 {
continue
@@ -78,15 +71,10 @@ func (m *nfsCollector) initStats() error {
func (m *nfsCollector) updateStats() error {
cmd := exec.Command(m.config.Nfsstats, `-l`, `--all`)
// Wait for cmd end
if err := cmd.Wait(); err != nil {
return fmt.Errorf("updateStats(): %w", err)
}
cmd.Wait()
buffer, err := cmd.Output()
if err == nil {
for line := range strings.Lines(string(buffer)) {
for _, line := range strings.Split(string(buffer), "\n") {
lf := strings.Fields(line)
if len(lf) != 5 {
continue
@@ -131,9 +119,7 @@ func (m *nfsCollector) MainInit(config json.RawMessage) error {
return fmt.Errorf("NfsCollector.Init(): Failed to find nfsstat binary '%s': %v", m.config.Nfsstats, err)
}
m.data = make(map[string]NfsCollectorData)
if err := m.initStats(); err != nil {
return fmt.Errorf("NfsCollector.Init(): %w", err)
}
m.initStats()
m.init = true
m.parallel = true
return nil
@@ -145,13 +131,7 @@ func (m *nfsCollector) Read(interval time.Duration, output chan lp.CCMessage) {
}
timestamp := time.Now()
if err := m.updateStats(); err != nil {
cclog.ComponentError(
m.name,
fmt.Sprintf("Read(): updateStats() failed: %v", err),
)
return
}
m.updateStats()
prefix := ""
switch m.version {
case "v3":
@@ -163,11 +143,11 @@ func (m *nfsCollector) Read(interval time.Duration, output chan lp.CCMessage) {
}
for name, data := range m.data {
if slices.Contains(m.config.ExcludeMetrics, name) {
if _, skip := stringArrayContains(m.config.ExcludeMetrics, name); skip {
continue
}
value := data.current - data.last
y, err := lp.NewMessage(fmt.Sprintf("%s_%s", prefix, name), m.tags, m.meta, map[string]any{"value": value}, timestamp)
y, err := lp.NewMessage(fmt.Sprintf("%s_%s", prefix, name), m.tags, m.meta, map[string]interface{}{"value": value}, timestamp)
if err == nil {
y.AddMeta("version", m.version)
output <- y
@@ -190,17 +170,13 @@ type Nfs4Collector struct {
func (m *Nfs3Collector) Init(config json.RawMessage) error {
m.name = "Nfs3Collector"
m.version = `v3`
if err := m.setup(); err != nil {
return fmt.Errorf("%s Init(): setup() call failed: %w", m.name, err)
}
m.setup()
return m.MainInit(config)
}
func (m *Nfs4Collector) Init(config json.RawMessage) error {
m.name = "Nfs4Collector"
m.version = `v4`
if err := m.setup(); err != nil {
return fmt.Errorf("%s Init(): setup() call failed: %w", m.name, err)
}
m.setup()
return m.MainInit(config)
}

View File

@@ -12,13 +12,12 @@ import (
"fmt"
"os"
"regexp"
"slices"
"strconv"
"strings"
"time"
cclog "github.com/ClusterCockpit/cc-lib/v2/ccLogger"
lp "github.com/ClusterCockpit/cc-lib/v2/ccMessage"
cclog "github.com/ClusterCockpit/cc-lib/ccLogger"
lp "github.com/ClusterCockpit/cc-lib/ccMessage"
)
// These are the fields we read from the JSON configuration
@@ -72,7 +71,7 @@ func (m *NfsIOStatCollector) readNfsiostats() map[string]map[string]int64 {
// Is this a device line with mount point, remote target and NFS version?
dev := resolve_regex_fields(l, deviceRegex)
if len(dev) > 0 {
if !slices.Contains(m.config.ExcludeFilesystem, dev[m.key]) {
if _, ok := stringArrayContains(m.config.ExcludeFilesystem, dev[m.key]); !ok {
current = dev
if len(current["version"]) == 0 {
current["version"] = "3"
@@ -86,7 +85,7 @@ func (m *NfsIOStatCollector) readNfsiostats() map[string]map[string]int64 {
if len(bytes) > 0 {
data[current[m.key]] = make(map[string]int64)
for name, sval := range bytes {
if !slices.Contains(m.config.ExcludeMetrics, name) {
if _, ok := stringArrayContains(m.config.ExcludeMetrics, name); !ok {
val, err := strconv.ParseInt(sval, 10, 64)
if err == nil {
data[current[m.key]][name] = val
@@ -103,9 +102,7 @@ func (m *NfsIOStatCollector) readNfsiostats() map[string]map[string]int64 {
func (m *NfsIOStatCollector) Init(config json.RawMessage) error {
var err error = nil
m.name = "NfsIOStatCollector"
if err := m.setup(); err != nil {
return fmt.Errorf("%s Init(): setup() call failed: %w", m.name, err)
}
m.setup()
m.parallel = true
m.meta = map[string]string{"source": m.name, "group": "NFS", "unit": "bytes"}
m.tags = map[string]string{"type": "node"}
@@ -143,7 +140,7 @@ func (m *NfsIOStatCollector) Read(interval time.Duration, output chan lp.CCMessa
if old, ok := m.data[mntpoint]; ok {
for name, newVal := range values {
if m.config.SendAbsoluteValues {
msg, err := lp.NewMessage(fmt.Sprintf("nfsio_%s", name), m.tags, m.meta, map[string]any{"value": newVal}, now)
msg, err := lp.NewMessage(fmt.Sprintf("nfsio_%s", name), m.tags, m.meta, map[string]interface{}{"value": newVal}, now)
if err == nil {
msg.AddTag("stype", "filesystem")
msg.AddTag("stype-id", mntpoint)
@@ -152,7 +149,7 @@ func (m *NfsIOStatCollector) Read(interval time.Duration, output chan lp.CCMessa
}
if m.config.SendDerivedValues {
rate := float64(newVal-old[name]) / timeDiff
msg, err := lp.NewMessage(fmt.Sprintf("nfsio_%s_bw", name), m.tags, m.meta, map[string]any{"value": rate}, now)
msg, err := lp.NewMessage(fmt.Sprintf("nfsio_%s_bw", name), m.tags, m.meta, map[string]interface{}{"value": rate}, now)
if err == nil {
if strings.HasPrefix(name, "page") {
msg.AddMeta("unit", "4K_pages/s")

View File

@@ -10,8 +10,8 @@ import (
"strings"
"time"
cclog "github.com/ClusterCockpit/cc-lib/v2/ccLogger"
lp "github.com/ClusterCockpit/cc-lib/v2/ccMessage"
cclog "github.com/ClusterCockpit/cc-lib/ccLogger"
lp "github.com/ClusterCockpit/cc-lib/ccMessage"
)
type NUMAStatsCollectorConfig struct {
@@ -72,9 +72,7 @@ func (m *NUMAStatsCollector) Init(config json.RawMessage) error {
m.name = "NUMAStatsCollector"
m.parallel = true
if err := m.setup(); err != nil {
return fmt.Errorf("%s Init(): setup() call failed: %w", m.name, err)
}
m.setup()
m.meta = map[string]string{
"source": m.name,
"group": "NUMA",
@@ -104,11 +102,8 @@ func (m *NUMAStatsCollector) Init(config json.RawMessage) error {
file := filepath.Join(dir, "numastat")
m.topology = append(m.topology,
NUMAStatsCollectorTopolgy{
file: file,
tagSet: map[string]string{
"type": "memoryDomain",
"type-id": node,
},
file: file,
tagSet: map[string]string{"memoryDomain": node},
previousValues: make(map[string]int64),
})
}
@@ -188,11 +183,7 @@ func (m *NUMAStatsCollector) Read(interval time.Duration, output chan lp.CCMessa
t.previousValues[key] = value
}
}
if err := file.Close(); err != nil {
cclog.ComponentError(
m.name,
fmt.Sprintf("Read(): Failed to close file '%s': %v", t.file, err))
}
file.Close()
}
}

View File

@@ -12,13 +12,11 @@ import (
"errors"
"fmt"
"log"
"maps"
"slices"
"strings"
"time"
cclog "github.com/ClusterCockpit/cc-lib/v2/ccLogger"
lp "github.com/ClusterCockpit/cc-lib/v2/ccMessage"
cclog "github.com/ClusterCockpit/cc-lib/ccLogger"
lp "github.com/ClusterCockpit/cc-lib/ccMessage"
"github.com/NVIDIA/go-nvml/pkg/nvml"
)
@@ -66,9 +64,7 @@ func (m *NvidiaCollector) Init(config json.RawMessage) error {
m.config.ProcessMigDevices = false
m.config.UseUuidForMigDevices = false
m.config.UseSliceForMigDevices = false
if err := m.setup(); err != nil {
return fmt.Errorf("%s Init(): setup() call failed: %w", m.name, err)
}
m.setup()
if len(config) > 0 {
err = json.Unmarshal(config, &m.config)
if err != nil {
@@ -109,11 +105,11 @@ func (m *NvidiaCollector) Init(config json.RawMessage) error {
// For all GPUs
idx := 0
m.gpus = make([]NvidiaCollectorDevice, num_gpus)
for i := range num_gpus {
for i := 0; i < num_gpus; i++ {
// Skip excluded devices by ID
str_i := fmt.Sprintf("%d", i)
if slices.Contains(m.config.ExcludeDevices, str_i) {
if _, skip := stringArrayContains(m.config.ExcludeDevices, str_i); skip {
cclog.ComponentDebug(m.name, "Skipping excluded device", str_i)
continue
}
@@ -141,7 +137,7 @@ func (m *NvidiaCollector) Init(config json.RawMessage) error {
pciInfo.Device)
// Skip excluded devices specified by PCI ID
if slices.Contains(m.config.ExcludeDevices, pci_id) {
if _, skip := stringArrayContains(m.config.ExcludeDevices, pci_id); skip {
cclog.ComponentDebug(m.name, "Skipping excluded device", pci_id)
continue
}
@@ -226,20 +222,18 @@ func readMemoryInfo(device *NvidiaCollectorDevice, output chan lp.CCMessage) err
var total uint64
var used uint64
var reserved uint64 = 0
v2 := false
var v2 bool = false
meminfo, ret := nvml.DeviceGetMemoryInfo(device.device)
if ret != nvml.SUCCESS {
err := errors.New(nvml.ErrorString(ret))
return err
}
// Total physical device memory (in bytes)
total = meminfo.Total
// Sum of Reserved and Allocated device memory (in bytes)
used = meminfo.Used
if !device.excludeMetrics["nv_fb_mem_total"] {
t := float64(total) / (1024 * 1024)
y, err := lp.NewMessage("nv_fb_mem_total", device.tags, device.meta, map[string]any{"value": t}, time.Now())
y, err := lp.NewMessage("nv_fb_mem_total", device.tags, device.meta, map[string]interface{}{"value": t}, time.Now())
if err == nil {
y.AddMeta("unit", "MByte")
output <- y
@@ -248,7 +242,7 @@ func readMemoryInfo(device *NvidiaCollectorDevice, output chan lp.CCMessage) err
if !device.excludeMetrics["nv_fb_mem_used"] {
f := float64(used) / (1024 * 1024)
y, err := lp.NewMessage("nv_fb_mem_used", device.tags, device.meta, map[string]any{"value": f}, time.Now())
y, err := lp.NewMessage("nv_fb_mem_used", device.tags, device.meta, map[string]interface{}{"value": f}, time.Now())
if err == nil {
y.AddMeta("unit", "MByte")
output <- y
@@ -257,7 +251,7 @@ func readMemoryInfo(device *NvidiaCollectorDevice, output chan lp.CCMessage) err
if v2 && !device.excludeMetrics["nv_fb_mem_reserved"] {
r := float64(reserved) / (1024 * 1024)
y, err := lp.NewMessage("nv_fb_mem_reserved", device.tags, device.meta, map[string]any{"value": r}, time.Now())
y, err := lp.NewMessage("nv_fb_mem_reserved", device.tags, device.meta, map[string]interface{}{"value": r}, time.Now())
if err == nil {
y.AddMeta("unit", "MByte")
output <- y
@@ -276,7 +270,7 @@ func readBarMemoryInfo(device *NvidiaCollectorDevice, output chan lp.CCMessage)
}
if !device.excludeMetrics["nv_bar1_mem_total"] {
t := float64(meminfo.Bar1Total) / (1024 * 1024)
y, err := lp.NewMessage("nv_bar1_mem_total", device.tags, device.meta, map[string]any{"value": t}, time.Now())
y, err := lp.NewMessage("nv_bar1_mem_total", device.tags, device.meta, map[string]interface{}{"value": t}, time.Now())
if err == nil {
y.AddMeta("unit", "MByte")
output <- y
@@ -284,7 +278,7 @@ func readBarMemoryInfo(device *NvidiaCollectorDevice, output chan lp.CCMessage)
}
if !device.excludeMetrics["nv_bar1_mem_used"] {
t := float64(meminfo.Bar1Used) / (1024 * 1024)
y, err := lp.NewMessage("nv_bar1_mem_used", device.tags, device.meta, map[string]any{"value": t}, time.Now())
y, err := lp.NewMessage("nv_bar1_mem_used", device.tags, device.meta, map[string]interface{}{"value": t}, time.Now())
if err == nil {
y.AddMeta("unit", "MByte")
output <- y
@@ -318,14 +312,14 @@ func readUtilization(device *NvidiaCollectorDevice, output chan lp.CCMessage) er
util, ret := nvml.DeviceGetUtilizationRates(device.device)
if ret == nvml.SUCCESS {
if !device.excludeMetrics["nv_util"] {
y, err := lp.NewMessage("nv_util", device.tags, device.meta, map[string]any{"value": float64(util.Gpu)}, time.Now())
y, err := lp.NewMessage("nv_util", device.tags, device.meta, map[string]interface{}{"value": float64(util.Gpu)}, time.Now())
if err == nil {
y.AddMeta("unit", "%")
output <- y
}
}
if !device.excludeMetrics["nv_mem_util"] {
y, err := lp.NewMessage("nv_mem_util", device.tags, device.meta, map[string]any{"value": float64(util.Memory)}, time.Now())
y, err := lp.NewMessage("nv_mem_util", device.tags, device.meta, map[string]interface{}{"value": float64(util.Memory)}, time.Now())
if err == nil {
y.AddMeta("unit", "%")
output <- y
@@ -345,7 +339,7 @@ func readTemp(device *NvidiaCollectorDevice, output chan lp.CCMessage) error {
// * NVML_TEMPERATURE_COUNT
temp, ret := nvml.DeviceGetTemperature(device.device, nvml.TEMPERATURE_GPU)
if ret == nvml.SUCCESS {
y, err := lp.NewMessage("nv_temp", device.tags, device.meta, map[string]any{"value": float64(temp)}, time.Now())
y, err := lp.NewMessage("nv_temp", device.tags, device.meta, map[string]interface{}{"value": float64(temp)}, time.Now())
if err == nil {
y.AddMeta("unit", "degC")
output <- y
@@ -368,7 +362,7 @@ func readFan(device *NvidiaCollectorDevice, output chan lp.CCMessage) error {
// This value may exceed 100% in certain cases.
fan, ret := nvml.DeviceGetFanSpeed(device.device)
if ret == nvml.SUCCESS {
y, err := lp.NewMessage("nv_fan", device.tags, device.meta, map[string]any{"value": float64(fan)}, time.Now())
y, err := lp.NewMessage("nv_fan", device.tags, device.meta, map[string]interface{}{"value": float64(fan)}, time.Now())
if err == nil {
y.AddMeta("unit", "%")
output <- y
@@ -409,23 +403,22 @@ func readEccMode(device *NvidiaCollectorDevice, output chan lp.CCMessage) error
// Changing ECC modes requires a reboot.
// The "pending" ECC mode refers to the target mode following the next reboot.
_, ecc_pend, ret := nvml.DeviceGetEccMode(device.device)
switch ret {
case nvml.SUCCESS:
if ret == nvml.SUCCESS {
var y lp.CCMessage
var err error
switch ecc_pend {
case nvml.FEATURE_DISABLED:
y, err = lp.NewMessage("nv_ecc_mode", device.tags, device.meta, map[string]any{"value": "OFF"}, time.Now())
y, err = lp.NewMessage("nv_ecc_mode", device.tags, device.meta, map[string]interface{}{"value": "OFF"}, time.Now())
case nvml.FEATURE_ENABLED:
y, err = lp.NewMessage("nv_ecc_mode", device.tags, device.meta, map[string]any{"value": "ON"}, time.Now())
y, err = lp.NewMessage("nv_ecc_mode", device.tags, device.meta, map[string]interface{}{"value": "ON"}, time.Now())
default:
y, err = lp.NewMessage("nv_ecc_mode", device.tags, device.meta, map[string]any{"value": "UNKNOWN"}, time.Now())
y, err = lp.NewMessage("nv_ecc_mode", device.tags, device.meta, map[string]interface{}{"value": "UNKNOWN"}, time.Now())
}
if err == nil {
output <- y
}
case nvml.ERROR_NOT_SUPPORTED:
y, err := lp.NewMessage("nv_ecc_mode", device.tags, device.meta, map[string]any{"value": "N/A"}, time.Now())
} else if ret == nvml.ERROR_NOT_SUPPORTED {
y, err := lp.NewMessage("nv_ecc_mode", device.tags, device.meta, map[string]interface{}{"value": "N/A"}, time.Now())
if err == nil {
output <- y
}
@@ -445,7 +438,7 @@ func readPerfState(device *NvidiaCollectorDevice, output chan lp.CCMessage) erro
// 32: Unknown performance state.
pState, ret := nvml.DeviceGetPerformanceState(device.device)
if ret == nvml.SUCCESS {
y, err := lp.NewMessage("nv_perf_state", device.tags, device.meta, map[string]any{"value": fmt.Sprintf("P%d", int(pState))}, time.Now())
y, err := lp.NewMessage("nv_perf_state", device.tags, device.meta, map[string]interface{}{"value": fmt.Sprintf("P%d", int(pState))}, time.Now())
if err == nil {
output <- y
}
@@ -471,7 +464,7 @@ func readPowerUsage(device *NvidiaCollectorDevice, output chan lp.CCMessage) err
if mode == nvml.FEATURE_ENABLED {
power, ret := nvml.DeviceGetPowerUsage(device.device)
if ret == nvml.SUCCESS {
y, err := lp.NewMessage("nv_power_usage", device.tags, device.meta, map[string]any{"value": float64(power) / 1000}, time.Now())
y, err := lp.NewMessage("nv_power_usage", device.tags, device.meta, map[string]interface{}{"value": float64(power) / 1000}, time.Now())
if err == nil {
y.AddMeta("unit", "watts")
output <- y
@@ -539,7 +532,7 @@ func readClocks(device *NvidiaCollectorDevice, output chan lp.CCMessage) error {
if !device.excludeMetrics["nv_graphics_clock"] {
graphicsClock, ret := nvml.DeviceGetClockInfo(device.device, nvml.CLOCK_GRAPHICS)
if ret == nvml.SUCCESS {
y, err := lp.NewMessage("nv_graphics_clock", device.tags, device.meta, map[string]any{"value": float64(graphicsClock)}, time.Now())
y, err := lp.NewMessage("nv_graphics_clock", device.tags, device.meta, map[string]interface{}{"value": float64(graphicsClock)}, time.Now())
if err == nil {
y.AddMeta("unit", "MHz")
output <- y
@@ -550,7 +543,7 @@ func readClocks(device *NvidiaCollectorDevice, output chan lp.CCMessage) error {
if !device.excludeMetrics["nv_sm_clock"] {
smCock, ret := nvml.DeviceGetClockInfo(device.device, nvml.CLOCK_SM)
if ret == nvml.SUCCESS {
y, err := lp.NewMessage("nv_sm_clock", device.tags, device.meta, map[string]any{"value": float64(smCock)}, time.Now())
y, err := lp.NewMessage("nv_sm_clock", device.tags, device.meta, map[string]interface{}{"value": float64(smCock)}, time.Now())
if err == nil {
y.AddMeta("unit", "MHz")
output <- y
@@ -561,7 +554,7 @@ func readClocks(device *NvidiaCollectorDevice, output chan lp.CCMessage) error {
if !device.excludeMetrics["nv_mem_clock"] {
memClock, ret := nvml.DeviceGetClockInfo(device.device, nvml.CLOCK_MEM)
if ret == nvml.SUCCESS {
y, err := lp.NewMessage("nv_mem_clock", device.tags, device.meta, map[string]any{"value": float64(memClock)}, time.Now())
y, err := lp.NewMessage("nv_mem_clock", device.tags, device.meta, map[string]interface{}{"value": float64(memClock)}, time.Now())
if err == nil {
y.AddMeta("unit", "MHz")
output <- y
@@ -571,7 +564,7 @@ func readClocks(device *NvidiaCollectorDevice, output chan lp.CCMessage) error {
if !device.excludeMetrics["nv_video_clock"] {
memClock, ret := nvml.DeviceGetClockInfo(device.device, nvml.CLOCK_VIDEO)
if ret == nvml.SUCCESS {
y, err := lp.NewMessage("nv_video_clock", device.tags, device.meta, map[string]any{"value": float64(memClock)}, time.Now())
y, err := lp.NewMessage("nv_video_clock", device.tags, device.meta, map[string]interface{}{"value": float64(memClock)}, time.Now())
if err == nil {
y.AddMeta("unit", "MHz")
output <- y
@@ -596,7 +589,7 @@ func readMaxClocks(device *NvidiaCollectorDevice, output chan lp.CCMessage) erro
if !device.excludeMetrics["nv_max_graphics_clock"] {
max_gclk, ret := nvml.DeviceGetMaxClockInfo(device.device, nvml.CLOCK_GRAPHICS)
if ret == nvml.SUCCESS {
y, err := lp.NewMetric("nv_max_graphics_clock", device.tags, device.meta, float64(max_gclk), time.Now())
y, err := lp.NewMessage("nv_max_graphics_clock", device.tags, device.meta, map[string]interface{}{"value": float64(max_gclk)}, time.Now())
if err == nil {
y.AddMeta("unit", "MHz")
output <- y
@@ -605,9 +598,9 @@ func readMaxClocks(device *NvidiaCollectorDevice, output chan lp.CCMessage) erro
}
if !device.excludeMetrics["nv_max_sm_clock"] {
maxSmClock, ret := nvml.DeviceGetMaxClockInfo(device.device, nvml.CLOCK_SM)
maxSmClock, ret := nvml.DeviceGetClockInfo(device.device, nvml.CLOCK_SM)
if ret == nvml.SUCCESS {
y, err := lp.NewMetric("nv_max_sm_clock", device.tags, device.meta, float64(maxSmClock), time.Now())
y, err := lp.NewMessage("nv_max_sm_clock", device.tags, device.meta, map[string]interface{}{"value": float64(maxSmClock)}, time.Now())
if err == nil {
y.AddMeta("unit", "MHz")
output <- y
@@ -616,9 +609,9 @@ func readMaxClocks(device *NvidiaCollectorDevice, output chan lp.CCMessage) erro
}
if !device.excludeMetrics["nv_max_mem_clock"] {
maxMemClock, ret := nvml.DeviceGetMaxClockInfo(device.device, nvml.CLOCK_MEM)
maxMemClock, ret := nvml.DeviceGetClockInfo(device.device, nvml.CLOCK_MEM)
if ret == nvml.SUCCESS {
y, err := lp.NewMetric("nv_max_mem_clock", device.tags, device.meta, float64(maxMemClock), time.Now())
y, err := lp.NewMessage("nv_max_mem_clock", device.tags, device.meta, map[string]interface{}{"value": float64(maxMemClock)}, time.Now())
if err == nil {
y.AddMeta("unit", "MHz")
output <- y
@@ -627,9 +620,9 @@ func readMaxClocks(device *NvidiaCollectorDevice, output chan lp.CCMessage) erro
}
if !device.excludeMetrics["nv_max_video_clock"] {
maxVideoClock, ret := nvml.DeviceGetMaxClockInfo(device.device, nvml.CLOCK_VIDEO)
maxMemClock, ret := nvml.DeviceGetClockInfo(device.device, nvml.CLOCK_VIDEO)
if ret == nvml.SUCCESS {
y, err := lp.NewMetric("nv_max_video_clock", device.tags, device.meta, float64(maxVideoClock), time.Now())
y, err := lp.NewMessage("nv_max_video_clock", device.tags, device.meta, map[string]interface{}{"value": float64(maxMemClock)}, time.Now())
if err == nil {
y.AddMeta("unit", "MHz")
output <- y
@@ -652,7 +645,7 @@ func readEccErrors(device *NvidiaCollectorDevice, output chan lp.CCMessage) erro
// i.e. the total set of errors across the entire device.
ecc_db, ret := nvml.DeviceGetTotalEccErrors(device.device, nvml.MEMORY_ERROR_TYPE_UNCORRECTED, nvml.AGGREGATE_ECC)
if ret == nvml.SUCCESS {
y, err := lp.NewMessage("nv_ecc_uncorrected_error", device.tags, device.meta, map[string]any{"value": float64(ecc_db)}, time.Now())
y, err := lp.NewMessage("nv_ecc_uncorrected_error", device.tags, device.meta, map[string]interface{}{"value": float64(ecc_db)}, time.Now())
if err == nil {
output <- y
}
@@ -661,7 +654,7 @@ func readEccErrors(device *NvidiaCollectorDevice, output chan lp.CCMessage) erro
if !device.excludeMetrics["nv_ecc_corrected_error"] {
ecc_sb, ret := nvml.DeviceGetTotalEccErrors(device.device, nvml.MEMORY_ERROR_TYPE_CORRECTED, nvml.AGGREGATE_ECC)
if ret == nvml.SUCCESS {
y, err := lp.NewMessage("nv_ecc_corrected_error", device.tags, device.meta, map[string]any{"value": float64(ecc_sb)}, time.Now())
y, err := lp.NewMessage("nv_ecc_corrected_error", device.tags, device.meta, map[string]interface{}{"value": float64(ecc_sb)}, time.Now())
if err == nil {
output <- y
}
@@ -680,7 +673,7 @@ func readPowerLimit(device *NvidiaCollectorDevice, output chan lp.CCMessage) err
// If the card's total power draw reaches this limit the power management algorithm kicks in.
pwr_limit, ret := nvml.DeviceGetPowerManagementLimit(device.device)
if ret == nvml.SUCCESS {
y, err := lp.NewMessage("nv_power_max_limit", device.tags, device.meta, map[string]any{"value": float64(pwr_limit) / 1000}, time.Now())
y, err := lp.NewMessage("nv_power_max_limit", device.tags, device.meta, map[string]interface{}{"value": float64(pwr_limit) / 1000}, time.Now())
if err == nil {
y.AddMeta("unit", "watts")
output <- y
@@ -707,7 +700,7 @@ func readEncUtilization(device *NvidiaCollectorDevice, output chan lp.CCMessage)
// Note: On MIG-enabled GPUs, querying encoder utilization is not currently supported.
enc_util, _, ret := nvml.DeviceGetEncoderUtilization(device.device)
if ret == nvml.SUCCESS {
y, err := lp.NewMessage("nv_encoder_util", device.tags, device.meta, map[string]any{"value": float64(enc_util)}, time.Now())
y, err := lp.NewMessage("nv_encoder_util", device.tags, device.meta, map[string]interface{}{"value": float64(enc_util)}, time.Now())
if err == nil {
y.AddMeta("unit", "%")
output <- y
@@ -734,7 +727,7 @@ func readDecUtilization(device *NvidiaCollectorDevice, output chan lp.CCMessage)
// Note: On MIG-enabled GPUs, querying decoder utilization is not currently supported.
dec_util, _, ret := nvml.DeviceGetDecoderUtilization(device.device)
if ret == nvml.SUCCESS {
y, err := lp.NewMessage("nv_decoder_util", device.tags, device.meta, map[string]any{"value": float64(dec_util)}, time.Now())
y, err := lp.NewMessage("nv_decoder_util", device.tags, device.meta, map[string]interface{}{"value": float64(dec_util)}, time.Now())
if err == nil {
y.AddMeta("unit", "%")
output <- y
@@ -761,33 +754,33 @@ func readRemappedRows(device *NvidiaCollectorDevice, output chan lp.CCMessage) e
corrected, uncorrected, pending, failure, ret := nvml.DeviceGetRemappedRows(device.device)
if ret == nvml.SUCCESS {
if !device.excludeMetrics["nv_remapped_rows_corrected"] {
y, err := lp.NewMessage("nv_remapped_rows_corrected", device.tags, device.meta, map[string]any{"value": float64(corrected)}, time.Now())
y, err := lp.NewMessage("nv_remapped_rows_corrected", device.tags, device.meta, map[string]interface{}{"value": float64(corrected)}, time.Now())
if err == nil {
output <- y
}
}
if !device.excludeMetrics["nv_remapped_rows_uncorrected"] {
y, err := lp.NewMessage("nv_remapped_rows_corrected", device.tags, device.meta, map[string]any{"value": float64(uncorrected)}, time.Now())
y, err := lp.NewMessage("nv_remapped_rows_corrected", device.tags, device.meta, map[string]interface{}{"value": float64(uncorrected)}, time.Now())
if err == nil {
output <- y
}
}
if !device.excludeMetrics["nv_remapped_rows_pending"] {
p := 0
var p int = 0
if pending {
p = 1
}
y, err := lp.NewMessage("nv_remapped_rows_pending", device.tags, device.meta, map[string]any{"value": p}, time.Now())
y, err := lp.NewMessage("nv_remapped_rows_pending", device.tags, device.meta, map[string]interface{}{"value": p}, time.Now())
if err == nil {
output <- y
}
}
if !device.excludeMetrics["nv_remapped_rows_failure"] {
f := 0
var f int = 0
if failure {
f = 1
}
y, err := lp.NewMessage("nv_remapped_rows_failure", device.tags, device.meta, map[string]any{"value": f}, time.Now())
y, err := lp.NewMessage("nv_remapped_rows_failure", device.tags, device.meta, map[string]interface{}{"value": f}, time.Now())
if err == nil {
output <- y
}
@@ -821,7 +814,7 @@ func readProcessCounts(device *NvidiaCollectorDevice, output chan lp.CCMessage)
// Querying per-instance information using MIG device handles is not supported if the device is in vGPU Host virtualization mode.
procList, ret := nvml.DeviceGetComputeRunningProcesses(device.device)
if ret == nvml.SUCCESS {
y, err := lp.NewMessage("nv_compute_processes", device.tags, device.meta, map[string]any{"value": len(procList)}, time.Now())
y, err := lp.NewMessage("nv_compute_processes", device.tags, device.meta, map[string]interface{}{"value": len(procList)}, time.Now())
if err == nil {
output <- y
}
@@ -850,7 +843,7 @@ func readProcessCounts(device *NvidiaCollectorDevice, output chan lp.CCMessage)
// Querying per-instance information using MIG device handles is not supported if the device is in vGPU Host virtualization mode.
procList, ret := nvml.DeviceGetGraphicsRunningProcesses(device.device)
if ret == nvml.SUCCESS {
y, err := lp.NewMessage("nv_graphics_processes", device.tags, device.meta, map[string]any{"value": len(procList)}, time.Now())
y, err := lp.NewMessage("nv_graphics_processes", device.tags, device.meta, map[string]interface{}{"value": len(procList)}, time.Now())
if err == nil {
output <- y
}
@@ -908,7 +901,7 @@ func readViolationStats(device *NvidiaCollectorDevice, output chan lp.CCMessage)
violTime, ret = nvml.DeviceGetViolationStatus(device.device, nvml.PERF_POLICY_POWER)
if ret == nvml.SUCCESS {
t := float64(violTime.ViolationTime) * 1e-9
y, err := lp.NewMessage("nv_violation_power", device.tags, device.meta, map[string]any{"value": t}, time.Now())
y, err := lp.NewMessage("nv_violation_power", device.tags, device.meta, map[string]interface{}{"value": t}, time.Now())
if err == nil {
y.AddMeta("unit", "sec")
output <- y
@@ -920,7 +913,7 @@ func readViolationStats(device *NvidiaCollectorDevice, output chan lp.CCMessage)
violTime, ret = nvml.DeviceGetViolationStatus(device.device, nvml.PERF_POLICY_THERMAL)
if ret == nvml.SUCCESS {
t := float64(violTime.ViolationTime) * 1e-9
y, err := lp.NewMessage("nv_violation_thermal", device.tags, device.meta, map[string]any{"value": t}, time.Now())
y, err := lp.NewMessage("nv_violation_thermal", device.tags, device.meta, map[string]interface{}{"value": t}, time.Now())
if err == nil {
y.AddMeta("unit", "sec")
output <- y
@@ -932,7 +925,7 @@ func readViolationStats(device *NvidiaCollectorDevice, output chan lp.CCMessage)
violTime, ret = nvml.DeviceGetViolationStatus(device.device, nvml.PERF_POLICY_SYNC_BOOST)
if ret == nvml.SUCCESS {
t := float64(violTime.ViolationTime) * 1e-9
y, err := lp.NewMessage("nv_violation_sync_boost", device.tags, device.meta, map[string]any{"value": t}, time.Now())
y, err := lp.NewMessage("nv_violation_sync_boost", device.tags, device.meta, map[string]interface{}{"value": t}, time.Now())
if err == nil {
y.AddMeta("unit", "sec")
output <- y
@@ -944,7 +937,7 @@ func readViolationStats(device *NvidiaCollectorDevice, output chan lp.CCMessage)
violTime, ret = nvml.DeviceGetViolationStatus(device.device, nvml.PERF_POLICY_BOARD_LIMIT)
if ret == nvml.SUCCESS {
t := float64(violTime.ViolationTime) * 1e-9
y, err := lp.NewMessage("nv_violation_board_limit", device.tags, device.meta, map[string]any{"value": t}, time.Now())
y, err := lp.NewMessage("nv_violation_board_limit", device.tags, device.meta, map[string]interface{}{"value": t}, time.Now())
if err == nil {
y.AddMeta("unit", "sec")
output <- y
@@ -956,7 +949,7 @@ func readViolationStats(device *NvidiaCollectorDevice, output chan lp.CCMessage)
violTime, ret = nvml.DeviceGetViolationStatus(device.device, nvml.PERF_POLICY_LOW_UTILIZATION)
if ret == nvml.SUCCESS {
t := float64(violTime.ViolationTime) * 1e-9
y, err := lp.NewMessage("nv_violation_low_util", device.tags, device.meta, map[string]any{"value": t}, time.Now())
y, err := lp.NewMessage("nv_violation_low_util", device.tags, device.meta, map[string]interface{}{"value": t}, time.Now())
if err == nil {
y.AddMeta("unit", "sec")
output <- y
@@ -968,7 +961,7 @@ func readViolationStats(device *NvidiaCollectorDevice, output chan lp.CCMessage)
violTime, ret = nvml.DeviceGetViolationStatus(device.device, nvml.PERF_POLICY_RELIABILITY)
if ret == nvml.SUCCESS {
t := float64(violTime.ViolationTime) * 1e-9
y, err := lp.NewMessage("nv_violation_reliability", device.tags, device.meta, map[string]any{"value": t}, time.Now())
y, err := lp.NewMessage("nv_violation_reliability", device.tags, device.meta, map[string]interface{}{"value": t}, time.Now())
if err == nil {
y.AddMeta("unit", "sec")
output <- y
@@ -980,7 +973,7 @@ func readViolationStats(device *NvidiaCollectorDevice, output chan lp.CCMessage)
violTime, ret = nvml.DeviceGetViolationStatus(device.device, nvml.PERF_POLICY_TOTAL_APP_CLOCKS)
if ret == nvml.SUCCESS {
t := float64(violTime.ViolationTime) * 1e-9
y, err := lp.NewMessage("nv_violation_below_app_clock", device.tags, device.meta, map[string]any{"value": t}, time.Now())
y, err := lp.NewMessage("nv_violation_below_app_clock", device.tags, device.meta, map[string]interface{}{"value": t}, time.Now())
if err == nil {
y.AddMeta("unit", "sec")
output <- y
@@ -992,7 +985,7 @@ func readViolationStats(device *NvidiaCollectorDevice, output chan lp.CCMessage)
violTime, ret = nvml.DeviceGetViolationStatus(device.device, nvml.PERF_POLICY_TOTAL_BASE_CLOCKS)
if ret == nvml.SUCCESS {
t := float64(violTime.ViolationTime) * 1e-9
y, err := lp.NewMessage("nv_violation_below_base_clock", device.tags, device.meta, map[string]any{"value": t}, time.Now())
y, err := lp.NewMessage("nv_violation_below_base_clock", device.tags, device.meta, map[string]interface{}{"value": t}, time.Now())
if err == nil {
y.AddMeta("unit", "sec")
output <- y
@@ -1015,7 +1008,7 @@ func readNVLinkStats(device *NvidiaCollectorDevice, output chan lp.CCMessage) er
var aggregate_recovery_errors uint64 = 0
var aggregate_crc_flit_errors uint64 = 0
for i := range nvml.NVLINK_MAX_LINKS {
for i := 0; i < nvml.NVLINK_MAX_LINKS; i++ {
state, ret := nvml.DeviceGetNvLinkState(device.device, i)
if ret == nvml.SUCCESS {
if state == nvml.FEATURE_ENABLED {
@@ -1024,7 +1017,7 @@ func readNVLinkStats(device *NvidiaCollectorDevice, output chan lp.CCMessage) er
count, ret := nvml.DeviceGetNvLinkErrorCounter(device.device, i, nvml.NVLINK_ERROR_DL_CRC_DATA)
aggregate_crc_errors = aggregate_crc_errors + count
if ret == nvml.SUCCESS {
y, err := lp.NewMessage("nv_nvlink_crc_errors", device.tags, device.meta, map[string]any{"value": count}, time.Now())
y, err := lp.NewMessage("nv_nvlink_crc_errors", device.tags, device.meta, map[string]interface{}{"value": count}, time.Now())
if err == nil {
y.AddTag("stype", "nvlink")
y.AddTag("stype-id", fmt.Sprintf("%d", i))
@@ -1037,7 +1030,7 @@ func readNVLinkStats(device *NvidiaCollectorDevice, output chan lp.CCMessage) er
count, ret := nvml.DeviceGetNvLinkErrorCounter(device.device, i, nvml.NVLINK_ERROR_DL_ECC_DATA)
aggregate_ecc_errors = aggregate_ecc_errors + count
if ret == nvml.SUCCESS {
y, err := lp.NewMessage("nv_nvlink_ecc_errors", device.tags, device.meta, map[string]any{"value": count}, time.Now())
y, err := lp.NewMessage("nv_nvlink_ecc_errors", device.tags, device.meta, map[string]interface{}{"value": count}, time.Now())
if err == nil {
y.AddTag("stype", "nvlink")
y.AddTag("stype-id", fmt.Sprintf("%d", i))
@@ -1050,7 +1043,7 @@ func readNVLinkStats(device *NvidiaCollectorDevice, output chan lp.CCMessage) er
count, ret := nvml.DeviceGetNvLinkErrorCounter(device.device, i, nvml.NVLINK_ERROR_DL_REPLAY)
aggregate_replay_errors = aggregate_replay_errors + count
if ret == nvml.SUCCESS {
y, err := lp.NewMessage("nv_nvlink_replay_errors", device.tags, device.meta, map[string]any{"value": count}, time.Now())
y, err := lp.NewMessage("nv_nvlink_replay_errors", device.tags, device.meta, map[string]interface{}{"value": count}, time.Now())
if err == nil {
y.AddTag("stype", "nvlink")
y.AddTag("stype-id", fmt.Sprintf("%d", i))
@@ -1063,7 +1056,7 @@ func readNVLinkStats(device *NvidiaCollectorDevice, output chan lp.CCMessage) er
count, ret := nvml.DeviceGetNvLinkErrorCounter(device.device, i, nvml.NVLINK_ERROR_DL_RECOVERY)
aggregate_recovery_errors = aggregate_recovery_errors + count
if ret == nvml.SUCCESS {
y, err := lp.NewMessage("nv_nvlink_recovery_errors", device.tags, device.meta, map[string]any{"value": count}, time.Now())
y, err := lp.NewMessage("nv_nvlink_recovery_errors", device.tags, device.meta, map[string]interface{}{"value": count}, time.Now())
if err == nil {
y.AddTag("stype", "nvlink")
y.AddTag("stype-id", fmt.Sprintf("%d", i))
@@ -1076,7 +1069,7 @@ func readNVLinkStats(device *NvidiaCollectorDevice, output chan lp.CCMessage) er
count, ret := nvml.DeviceGetNvLinkErrorCounter(device.device, i, nvml.NVLINK_ERROR_DL_CRC_FLIT)
aggregate_crc_flit_errors = aggregate_crc_flit_errors + count
if ret == nvml.SUCCESS {
y, err := lp.NewMessage("nv_nvlink_crc_flit_errors", device.tags, device.meta, map[string]any{"value": count}, time.Now())
y, err := lp.NewMessage("nv_nvlink_crc_flit_errors", device.tags, device.meta, map[string]interface{}{"value": count}, time.Now())
if err == nil {
y.AddTag("stype", "nvlink")
y.AddTag("stype-id", fmt.Sprintf("%d", i))
@@ -1091,7 +1084,7 @@ func readNVLinkStats(device *NvidiaCollectorDevice, output chan lp.CCMessage) er
// Export aggregated values
if !device.excludeMetrics["nv_nvlink_crc_errors"] {
// Data link receive data CRC error counter
y, err := lp.NewMessage("nv_nvlink_crc_errors_sum", device.tags, device.meta, map[string]any{"value": aggregate_crc_errors}, time.Now())
y, err := lp.NewMessage("nv_nvlink_crc_errors_sum", device.tags, device.meta, map[string]interface{}{"value": aggregate_crc_errors}, time.Now())
if err == nil {
y.AddTag("stype", "nvlink")
output <- y
@@ -1099,7 +1092,7 @@ func readNVLinkStats(device *NvidiaCollectorDevice, output chan lp.CCMessage) er
}
if !device.excludeMetrics["nv_nvlink_ecc_errors"] {
// Data link receive data ECC error counter
y, err := lp.NewMessage("nv_nvlink_ecc_errors_sum", device.tags, device.meta, map[string]any{"value": aggregate_ecc_errors}, time.Now())
y, err := lp.NewMessage("nv_nvlink_ecc_errors_sum", device.tags, device.meta, map[string]interface{}{"value": aggregate_ecc_errors}, time.Now())
if err == nil {
y.AddTag("stype", "nvlink")
output <- y
@@ -1107,7 +1100,7 @@ func readNVLinkStats(device *NvidiaCollectorDevice, output chan lp.CCMessage) er
}
if !device.excludeMetrics["nv_nvlink_replay_errors"] {
// Data link transmit replay error counter
y, err := lp.NewMessage("nv_nvlink_replay_errors_sum", device.tags, device.meta, map[string]any{"value": aggregate_replay_errors}, time.Now())
y, err := lp.NewMessage("nv_nvlink_replay_errors_sum", device.tags, device.meta, map[string]interface{}{"value": aggregate_replay_errors}, time.Now())
if err == nil {
y.AddTag("stype", "nvlink")
output <- y
@@ -1115,7 +1108,7 @@ func readNVLinkStats(device *NvidiaCollectorDevice, output chan lp.CCMessage) er
}
if !device.excludeMetrics["nv_nvlink_recovery_errors"] {
// Data link transmit recovery error counter
y, err := lp.NewMessage("nv_nvlink_recovery_errors_sum", device.tags, device.meta, map[string]any{"value": aggregate_recovery_errors}, time.Now())
y, err := lp.NewMessage("nv_nvlink_recovery_errors_sum", device.tags, device.meta, map[string]interface{}{"value": aggregate_recovery_errors}, time.Now())
if err == nil {
y.AddTag("stype", "nvlink")
output <- y
@@ -1123,7 +1116,7 @@ func readNVLinkStats(device *NvidiaCollectorDevice, output chan lp.CCMessage) er
}
if !device.excludeMetrics["nv_nvlink_crc_flit_errors"] {
// Data link receive flow control digit CRC error counter
y, err := lp.NewMessage("nv_nvlink_crc_flit_errors_sum", device.tags, device.meta, map[string]any{"value": aggregate_crc_flit_errors}, time.Now())
y, err := lp.NewMessage("nv_nvlink_crc_flit_errors_sum", device.tags, device.meta, map[string]interface{}{"value": aggregate_crc_flit_errors}, time.Now())
if err == nil {
y.AddTag("stype", "nvlink")
output <- y
@@ -1263,7 +1256,7 @@ func (m *NvidiaCollector) Read(interval time.Duration, output chan lp.CCMessage)
}
cclog.ComponentDebug(m.name, "Reading MIG devices for GPU", i)
for j := range maxMig {
for j := 0; j < maxMig; j++ {
mdev, ret := nvml.DeviceGetMigDeviceHandleByIndex(m.gpus[i].device, j)
if ret != nvml.SUCCESS {
continue
@@ -1280,7 +1273,9 @@ func (m *NvidiaCollector) Read(interval time.Duration, output chan lp.CCMessage)
meta: map[string]string{},
excludeMetrics: excludeMetrics,
}
maps.Copy(migDevice.tags, m.gpus[i].tags)
for k, v := range m.gpus[i].tags {
migDevice.tags[k] = v
}
migDevice.tags["stype"] = "mig"
if m.config.UseUuidForMigDevices {
uuid, ret := nvml.DeviceGetUUID(mdev)
@@ -1294,8 +1289,8 @@ func (m *NvidiaCollector) Read(interval time.Duration, output chan lp.CCMessage)
if ret == nvml.SUCCESS {
mname, ret := nvml.DeviceGetName(mdev)
if ret == nvml.SUCCESS {
x := strings.ReplaceAll(mname, name, "")
x = strings.ReplaceAll(x, "MIG", "")
x := strings.Replace(mname, name, "", -1)
x = strings.Replace(x, "MIG", "", -1)
x = strings.TrimSpace(x)
migDevice.tags["stype-id"] = x
}
@@ -1304,7 +1299,9 @@ func (m *NvidiaCollector) Read(interval time.Duration, output chan lp.CCMessage)
if _, ok := migDevice.tags["stype-id"]; !ok {
migDevice.tags["stype-id"] = fmt.Sprintf("%d", j)
}
maps.Copy(migDevice.meta, m.gpus[i].meta)
for k, v := range m.gpus[i].meta {
migDevice.meta[k] = v
}
if _, ok := migDevice.meta["uuid"]; ok && !m.config.UseUuidForMigDevices {
uuid, ret := nvml.DeviceGetUUID(mdev)
if ret == nvml.SUCCESS {
@@ -1320,9 +1317,7 @@ func (m *NvidiaCollector) Read(interval time.Duration, output chan lp.CCMessage)
// Close shuts down NVML when m.init is set and then clears m.init.
//
// nvml.Shutdown() is called exactly once per Close(); a return code other
// than nvml.SUCCESS is reported through the component logger instead of
// being discarded. When m.init is false the call is a no-op.
func (m *NvidiaCollector) Close() {
	if !m.init {
		return
	}
	// Single shutdown call: check its result rather than calling it a
	// second time with the return value ignored.
	if ret := nvml.Shutdown(); ret != nvml.SUCCESS {
		cclog.ComponentError(m.name, "nvml.Shutdown() not successful")
	}
	m.init = false
}

View File

@@ -16,8 +16,8 @@ import (
"strings"
"time"
cclog "github.com/ClusterCockpit/cc-lib/v2/ccLogger"
lp "github.com/ClusterCockpit/cc-lib/v2/ccMessage"
cclog "github.com/ClusterCockpit/cc-lib/ccLogger"
lp "github.com/ClusterCockpit/cc-lib/ccMessage"
)
// running average power limit (RAPL) monitoring attributes for a zone
@@ -54,10 +54,9 @@ func (m *RAPLCollector) Init(config json.RawMessage) error {
return nil
}
var err error = nil
m.name = "RAPLCollector"
if err := m.setup(); err != nil {
return fmt.Errorf("%s Init(): setup() call failed: %w", m.name, err)
}
m.setup()
m.parallel = true
m.meta = map[string]string{
"source": m.name,
@@ -67,7 +66,7 @@ func (m *RAPLCollector) Init(config json.RawMessage) error {
// Read in the JSON configuration
if len(config) > 0 {
err := json.Unmarshal(config, &m.config)
err = json.Unmarshal(config, &m.config)
if err != nil {
cclog.ComponentError(m.name, "Error reading config:", err.Error())
return err
@@ -249,7 +248,7 @@ func (m *RAPLCollector) Read(interval time.Duration, output chan lp.CCMessage) {
"rapl_average_power",
p.tags,
m.meta,
map[string]any{"value": averagePower},
map[string]interface{}{"value": averagePower},
energyTimestamp)
if err == nil {
output <- y

View File

@@ -11,11 +11,10 @@ import (
"encoding/json"
"errors"
"fmt"
"slices"
"time"
cclog "github.com/ClusterCockpit/cc-lib/v2/ccLogger"
lp "github.com/ClusterCockpit/cc-lib/v2/ccMessage"
cclog "github.com/ClusterCockpit/cc-lib/ccLogger"
lp "github.com/ClusterCockpit/cc-lib/ccMessage"
"github.com/ClusterCockpit/go-rocm-smi/pkg/rocm_smi"
)
@@ -53,9 +52,7 @@ func (m *RocmSmiCollector) Init(config json.RawMessage) error {
// Always set the name early in Init() to use it in cclog.Component* functions
m.name = "RocmSmiCollector"
// This is for later use, also call it early
if err := m.setup(); err != nil {
return fmt.Errorf("%s Init(): setup() call failed: %w", m.name, err)
}
m.setup()
// Define meta information sent with each metric
// (Can also be dynamic or this is the basic set with extension through AddMeta())
//m.meta = map[string]string{"source": m.name, "group": "AMD"}
@@ -88,11 +85,22 @@ func (m *RocmSmiCollector) Init(config json.RawMessage) error {
return err
}
// exclDev reports whether s is exactly equal to one of the entries in
// m.config.ExcludeDevices.
exclDev := func(s string) bool {
	for _, candidate := range m.config.ExcludeDevices {
		if candidate == s {
			return true
		}
	}
	return false
}
m.devices = make([]RocmSmiCollectorDevice, 0)
for i := range numDevs {
for i := 0; i < numDevs; i++ {
str_i := fmt.Sprintf("%d", i)
if slices.Contains(m.config.ExcludeDevices, str_i) {
if exclDev(str_i) {
continue
}
device, ret := rocm_smi.DeviceGetHandleByIndex(i)
@@ -116,7 +124,7 @@ func (m *RocmSmiCollector) Init(config json.RawMessage) error {
pciInfo.Device,
pciInfo.Function)
if slices.Contains(m.config.ExcludeDevices, pciId) {
if exclDev(pciId) {
continue
}
@@ -174,127 +182,127 @@ func (m *RocmSmiCollector) Read(interval time.Duration, output chan lp.CCMessage
if !dev.excludeMetrics["rocm_gfx_util"] {
value := metrics.Average_gfx_activity
y, err := lp.NewMessage("rocm_gfx_util", dev.tags, dev.meta, map[string]any{"value": value}, timestamp)
y, err := lp.NewMessage("rocm_gfx_util", dev.tags, dev.meta, map[string]interface{}{"value": value}, timestamp)
if err == nil {
output <- y
}
}
if !dev.excludeMetrics["rocm_umc_util"] {
value := metrics.Average_umc_activity
y, err := lp.NewMessage("rocm_umc_util", dev.tags, dev.meta, map[string]any{"value": value}, timestamp)
y, err := lp.NewMessage("rocm_umc_util", dev.tags, dev.meta, map[string]interface{}{"value": value}, timestamp)
if err == nil {
output <- y
}
}
if !dev.excludeMetrics["rocm_mm_util"] {
value := metrics.Average_mm_activity
y, err := lp.NewMessage("rocm_mm_util", dev.tags, dev.meta, map[string]any{"value": value}, timestamp)
y, err := lp.NewMessage("rocm_mm_util", dev.tags, dev.meta, map[string]interface{}{"value": value}, timestamp)
if err == nil {
output <- y
}
}
if !dev.excludeMetrics["rocm_avg_power"] {
value := metrics.Average_socket_power
y, err := lp.NewMessage("rocm_avg_power", dev.tags, dev.meta, map[string]any{"value": value}, timestamp)
y, err := lp.NewMessage("rocm_avg_power", dev.tags, dev.meta, map[string]interface{}{"value": value}, timestamp)
if err == nil {
output <- y
}
}
if !dev.excludeMetrics["rocm_temp_mem"] {
value := metrics.Temperature_mem
y, err := lp.NewMessage("rocm_temp_mem", dev.tags, dev.meta, map[string]any{"value": value}, timestamp)
y, err := lp.NewMessage("rocm_temp_mem", dev.tags, dev.meta, map[string]interface{}{"value": value}, timestamp)
if err == nil {
output <- y
}
}
if !dev.excludeMetrics["rocm_temp_hotspot"] {
value := metrics.Temperature_hotspot
y, err := lp.NewMessage("rocm_temp_hotspot", dev.tags, dev.meta, map[string]any{"value": value}, timestamp)
y, err := lp.NewMessage("rocm_temp_hotspot", dev.tags, dev.meta, map[string]interface{}{"value": value}, timestamp)
if err == nil {
output <- y
}
}
if !dev.excludeMetrics["rocm_temp_edge"] {
value := metrics.Temperature_edge
y, err := lp.NewMessage("rocm_temp_edge", dev.tags, dev.meta, map[string]any{"value": value}, timestamp)
y, err := lp.NewMessage("rocm_temp_edge", dev.tags, dev.meta, map[string]interface{}{"value": value}, timestamp)
if err == nil {
output <- y
}
}
if !dev.excludeMetrics["rocm_temp_vrgfx"] {
value := metrics.Temperature_vrgfx
y, err := lp.NewMessage("rocm_temp_vrgfx", dev.tags, dev.meta, map[string]any{"value": value}, timestamp)
y, err := lp.NewMessage("rocm_temp_vrgfx", dev.tags, dev.meta, map[string]interface{}{"value": value}, timestamp)
if err == nil {
output <- y
}
}
if !dev.excludeMetrics["rocm_temp_vrsoc"] {
value := metrics.Temperature_vrsoc
y, err := lp.NewMessage("rocm_temp_vrsoc", dev.tags, dev.meta, map[string]any{"value": value}, timestamp)
y, err := lp.NewMessage("rocm_temp_vrsoc", dev.tags, dev.meta, map[string]interface{}{"value": value}, timestamp)
if err == nil {
output <- y
}
}
if !dev.excludeMetrics["rocm_temp_vrmem"] {
value := metrics.Temperature_vrmem
y, err := lp.NewMessage("rocm_temp_vrmem", dev.tags, dev.meta, map[string]any{"value": value}, timestamp)
y, err := lp.NewMessage("rocm_temp_vrmem", dev.tags, dev.meta, map[string]interface{}{"value": value}, timestamp)
if err == nil {
output <- y
}
}
if !dev.excludeMetrics["rocm_gfx_clock"] {
value := metrics.Average_gfxclk_frequency
y, err := lp.NewMessage("rocm_gfx_clock", dev.tags, dev.meta, map[string]any{"value": value}, timestamp)
y, err := lp.NewMessage("rocm_gfx_clock", dev.tags, dev.meta, map[string]interface{}{"value": value}, timestamp)
if err == nil {
output <- y
}
}
if !dev.excludeMetrics["rocm_soc_clock"] {
value := metrics.Average_socclk_frequency
y, err := lp.NewMessage("rocm_soc_clock", dev.tags, dev.meta, map[string]any{"value": value}, timestamp)
y, err := lp.NewMessage("rocm_soc_clock", dev.tags, dev.meta, map[string]interface{}{"value": value}, timestamp)
if err == nil {
output <- y
}
}
if !dev.excludeMetrics["rocm_u_clock"] {
value := metrics.Average_uclk_frequency
y, err := lp.NewMessage("rocm_u_clock", dev.tags, dev.meta, map[string]any{"value": value}, timestamp)
y, err := lp.NewMessage("rocm_u_clock", dev.tags, dev.meta, map[string]interface{}{"value": value}, timestamp)
if err == nil {
output <- y
}
}
if !dev.excludeMetrics["rocm_v0_clock"] {
value := metrics.Average_vclk0_frequency
y, err := lp.NewMessage("rocm_v0_clock", dev.tags, dev.meta, map[string]any{"value": value}, timestamp)
y, err := lp.NewMessage("rocm_v0_clock", dev.tags, dev.meta, map[string]interface{}{"value": value}, timestamp)
if err == nil {
output <- y
}
}
if !dev.excludeMetrics["rocm_v1_clock"] {
value := metrics.Average_vclk1_frequency
y, err := lp.NewMessage("rocm_v1_clock", dev.tags, dev.meta, map[string]any{"value": value}, timestamp)
y, err := lp.NewMessage("rocm_v1_clock", dev.tags, dev.meta, map[string]interface{}{"value": value}, timestamp)
if err == nil {
output <- y
}
}
if !dev.excludeMetrics["rocm_d0_clock"] {
value := metrics.Average_dclk0_frequency
y, err := lp.NewMessage("rocm_d0_clock", dev.tags, dev.meta, map[string]any{"value": value}, timestamp)
y, err := lp.NewMessage("rocm_d0_clock", dev.tags, dev.meta, map[string]interface{}{"value": value}, timestamp)
if err == nil {
output <- y
}
}
if !dev.excludeMetrics["rocm_d1_clock"] {
value := metrics.Average_dclk1_frequency
y, err := lp.NewMessage("rocm_d1_clock", dev.tags, dev.meta, map[string]any{"value": value}, timestamp)
y, err := lp.NewMessage("rocm_d1_clock", dev.tags, dev.meta, map[string]interface{}{"value": value}, timestamp)
if err == nil {
output <- y
}
}
if !dev.excludeMetrics["rocm_temp_hbm"] {
for i := range rocm_smi.NUM_HBM_INSTANCES {
for i := 0; i < rocm_smi.NUM_HBM_INSTANCES; i++ {
value := metrics.Temperature_hbm[i]
y, err := lp.NewMessage("rocm_temp_hbm", dev.tags, dev.meta, map[string]any{"value": value}, timestamp)
y, err := lp.NewMessage("rocm_temp_hbm", dev.tags, dev.meta, map[string]interface{}{"value": value}, timestamp)
if err == nil {
y.AddTag("stype", "device")
y.AddTag("stype-id", fmt.Sprintf("%d", i))

View File

@@ -15,9 +15,7 @@ hugo_path: docs/reference/cc-metric-collector/collectors/rocmsmi.md
```json
"rocm_smi": {
"exclude_devices": [
"0",
"1",
"0000000:ff:01.0"
"0","1", "0000000:ff:01.0"
],
"exclude_metrics": [
"rocm_mm_util",
@@ -25,7 +23,7 @@ hugo_path: docs/reference/cc-metric-collector/collectors/rocmsmi.md
],
"use_pci_info_as_type_id": true,
"add_pci_info_tag": false,
"add_serial_meta": false
"add_serial_meta": false,
}
```

View File

@@ -9,11 +9,10 @@ package collectors
import (
"encoding/json"
"fmt"
"time"
cclog "github.com/ClusterCockpit/cc-lib/v2/ccLogger"
lp "github.com/ClusterCockpit/cc-lib/v2/ccMessage"
cclog "github.com/ClusterCockpit/cc-lib/ccLogger"
lp "github.com/ClusterCockpit/cc-lib/ccMessage"
)
// These are the fields we read from the JSON configuration
@@ -42,9 +41,7 @@ func (m *SampleCollector) Init(config json.RawMessage) error {
// Always set the name early in Init() to use it in cclog.Component* functions
m.name = "SampleCollector"
// This is for later use, also call it early
if err := m.setup(); err != nil {
return fmt.Errorf("%s Init(): setup() call failed: %w", m.name, err)
}
m.setup()
// Tell whether the collector should be run in parallel with others (reading files, ...)
// or it should be run serially, mostly for collectors actually doing measurements
// because they should not measure the execution of the other collectors
@@ -95,7 +92,7 @@ func (m *SampleCollector) Read(interval time.Duration, output chan lp.CCMessage)
// stop := readState()
// value = (stop - start) / interval.Seconds()
y, err := lp.NewMessage("sample_metric", m.tags, m.meta, map[string]any{"value": value}, timestamp)
y, err := lp.NewMessage("sample_metric", m.tags, m.meta, map[string]interface{}{"value": value}, timestamp)
if err == nil {
// Send it to output channel
output <- y

View File

@@ -9,12 +9,11 @@ package collectors
import (
"encoding/json"
"fmt"
"sync"
"time"
cclog "github.com/ClusterCockpit/cc-lib/v2/ccLogger"
lp "github.com/ClusterCockpit/cc-lib/v2/ccMessage"
cclog "github.com/ClusterCockpit/cc-lib/ccLogger"
lp "github.com/ClusterCockpit/cc-lib/ccMessage"
)
// These are the fields we read from the JSON configuration
@@ -41,9 +40,7 @@ func (m *SampleTimerCollector) Init(name string, config json.RawMessage) error {
// Always set the name early in Init() to use it in cclog.Component* functions
m.name = "SampleTimerCollector"
// This is for later use, also call it early
if err := m.setup(); err != nil {
return fmt.Errorf("%s Init(): setup() call failed: %w", m.name, err)
}
m.setup()
// Define meta information sent with each metric
// (Can also be dynamic or this is the basic set with extension through AddMeta())
m.meta = map[string]string{"source": m.name, "group": "SAMPLE"}
@@ -110,7 +107,7 @@ func (m *SampleTimerCollector) ReadMetrics(timestamp time.Time) {
// stop := readState()
// value = (stop - start) / interval.Seconds()
y, err := lp.NewMessage("sample_metric", m.tags, m.meta, map[string]any{"value": value}, timestamp)
y, err := lp.NewMessage("sample_metric", m.tags, m.meta, map[string]interface{}{"value": value}, timestamp)
if err == nil && m.output != nil {
// Send it to output channel if we have a valid channel
m.output <- y

View File

@@ -11,13 +11,14 @@ import (
"bufio"
"encoding/json"
"fmt"
"math"
"os"
"strconv"
"strings"
"time"
cclog "github.com/ClusterCockpit/cc-lib/v2/ccLogger"
lp "github.com/ClusterCockpit/cc-lib/v2/ccMessage"
cclog "github.com/ClusterCockpit/cc-lib/ccLogger"
lp "github.com/ClusterCockpit/cc-lib/ccMessage"
)
const SCHEDSTATFILE = `/proc/schedstat`
@@ -46,37 +47,37 @@ type SchedstatCollector struct {
// Called once by the collector manager
// All tags, meta data tags and metrics that do not change over the runtime should be set here
func (m *SchedstatCollector) Init(config json.RawMessage) error {
var err error = nil
// Always set the name early in Init() to use it in cclog.Component* functions
m.name = "SchedstatCollector"
// This is for later use, also call it early
if err := m.setup(); err != nil {
return fmt.Errorf("%s Init(): setup() call failed: %w", m.name, err)
}
m.setup()
// Tell whether the collector should be run in parallel with others (reading files, ...)
// or it should be run serially, mostly for collectors actually doing measurements
// because they should not measure the execution of the other collectors
m.parallel = true
// Define meta information sent with each metric
// (Can also be dynamic or this is the basic set with extension through AddMeta())
m.meta = map[string]string{
"source": m.name,
"group": "SCHEDSTAT",
}
m.meta = map[string]string{"source": m.name, "group": "SCHEDSTAT"}
// Read in the JSON configuration
if len(config) > 0 {
if err := json.Unmarshal(config, &m.config); err != nil {
return fmt.Errorf("%s Init(): Error reading config: %w", m.name, err)
err = json.Unmarshal(config, &m.config)
if err != nil {
cclog.ComponentError(m.name, "Error reading config:", err.Error())
return err
}
}
// Check input file
file, err := os.Open(SCHEDSTATFILE)
file, err := os.Open(string(SCHEDSTATFILE))
if err != nil {
return fmt.Errorf("%s Init(): Failed opening scheduler statistics file \"%s\": %w", m.name, SCHEDSTATFILE, err)
cclog.ComponentError(m.name, err.Error())
}
defer file.Close()
// Pre-generate tags for all CPUs
num_cpus := 0
m.cputags = make(map[string]map[string]string)
m.olddata = make(map[string]map[string]int64)
scanner := bufio.NewScanner(file)
@@ -88,19 +89,11 @@ func (m *SchedstatCollector) Init(config json.RawMessage) error {
cpu, _ := strconv.Atoi(cpustr)
running, _ := strconv.ParseInt(linefields[7], 10, 64)
waiting, _ := strconv.ParseInt(linefields[8], 10, 64)
m.cputags[linefields[0]] = map[string]string{
"type": "hwthread",
"type-id": fmt.Sprintf("%d", cpu),
}
m.olddata[linefields[0]] = map[string]int64{
"running": running,
"waiting": waiting,
}
m.cputags[linefields[0]] = map[string]string{"type": "hwthread", "type-id": fmt.Sprintf("%d", cpu)}
m.olddata[linefields[0]] = map[string]int64{"running": running, "waiting": waiting}
num_cpus++
}
}
if err := file.Close(); err != nil {
return fmt.Errorf("%s Init(): Failed closing scheduler statistics file \"%s\": %w", m.name, SCHEDSTATFILE, err)
}
// Save current timestamp
m.lastTimestamp = time.Now()
@@ -116,14 +109,14 @@ func (m *SchedstatCollector) ParseProcLine(linefields []string, tags map[string]
diff_running := running - m.olddata[linefields[0]]["running"]
diff_waiting := waiting - m.olddata[linefields[0]]["waiting"]
l_running := float64(diff_running) / tsdelta.Seconds() / 1000_000_000
l_waiting := float64(diff_waiting) / tsdelta.Seconds() / 1000_000_000
var l_running float64 = float64(diff_running) / tsdelta.Seconds() / (math.Pow(1000, 3))
var l_waiting float64 = float64(diff_waiting) / tsdelta.Seconds() / (math.Pow(1000, 3))
m.olddata[linefields[0]]["running"] = running
m.olddata[linefields[0]]["waiting"] = waiting
value := l_running + l_waiting
y, err := lp.NewMessage("cpu_load_core", tags, m.meta, map[string]any{"value": value}, now)
y, err := lp.NewMessage("cpu_load_core", tags, m.meta, map[string]interface{}{"value": value}, now)
if err == nil {
// Send it to output channel
output <- y
@@ -141,19 +134,11 @@ func (m *SchedstatCollector) Read(interval time.Duration, output chan lp.CCMessa
now := time.Now()
tsdelta := now.Sub(m.lastTimestamp)
file, err := os.Open(SCHEDSTATFILE)
file, err := os.Open(string(SCHEDSTATFILE))
if err != nil {
cclog.ComponentError(
m.name,
fmt.Sprintf("Read(): Failed to open file '%s': %v", SCHEDSTATFILE, err))
cclog.ComponentError(m.name, err.Error())
}
defer func() {
if err := file.Close(); err != nil {
cclog.ComponentError(
m.name,
fmt.Sprintf("Read(): Failed to close file '%s': %v", SCHEDSTATFILE, err))
}
}()
defer file.Close()
scanner := bufio.NewScanner(file)
for scanner.Scan() {

View File

@@ -9,13 +9,12 @@ package collectors
import (
"encoding/json"
"fmt"
"runtime"
"syscall"
"time"
cclog "github.com/ClusterCockpit/cc-lib/v2/ccLogger"
lp "github.com/ClusterCockpit/cc-lib/v2/ccMessage"
cclog "github.com/ClusterCockpit/cc-lib/ccLogger"
lp "github.com/ClusterCockpit/cc-lib/ccMessage"
)
type SelfCollectorConfig struct {
@@ -35,9 +34,7 @@ type SelfCollector struct {
func (m *SelfCollector) Init(config json.RawMessage) error {
var err error = nil
m.name = "SelfCollector"
if err := m.setup(); err != nil {
return fmt.Errorf("%s Init(): setup() call failed: %w", m.name, err)
}
m.setup()
m.parallel = true
m.meta = map[string]string{"source": m.name, "group": "Self"}
m.tags = map[string]string{"type": "node"}
@@ -59,49 +56,49 @@ func (m *SelfCollector) Read(interval time.Duration, output chan lp.CCMessage) {
var memstats runtime.MemStats
runtime.ReadMemStats(&memstats)
y, err := lp.NewMessage("total_alloc", m.tags, m.meta, map[string]any{"value": memstats.TotalAlloc}, timestamp)
y, err := lp.NewMessage("total_alloc", m.tags, m.meta, map[string]interface{}{"value": memstats.TotalAlloc}, timestamp)
if err == nil {
y.AddMeta("unit", "Bytes")
output <- y
}
y, err = lp.NewMessage("heap_alloc", m.tags, m.meta, map[string]any{"value": memstats.HeapAlloc}, timestamp)
y, err = lp.NewMessage("heap_alloc", m.tags, m.meta, map[string]interface{}{"value": memstats.HeapAlloc}, timestamp)
if err == nil {
y.AddMeta("unit", "Bytes")
output <- y
}
y, err = lp.NewMessage("heap_sys", m.tags, m.meta, map[string]any{"value": memstats.HeapSys}, timestamp)
y, err = lp.NewMessage("heap_sys", m.tags, m.meta, map[string]interface{}{"value": memstats.HeapSys}, timestamp)
if err == nil {
y.AddMeta("unit", "Bytes")
output <- y
}
y, err = lp.NewMessage("heap_idle", m.tags, m.meta, map[string]any{"value": memstats.HeapIdle}, timestamp)
y, err = lp.NewMessage("heap_idle", m.tags, m.meta, map[string]interface{}{"value": memstats.HeapIdle}, timestamp)
if err == nil {
y.AddMeta("unit", "Bytes")
output <- y
}
y, err = lp.NewMessage("heap_inuse", m.tags, m.meta, map[string]any{"value": memstats.HeapInuse}, timestamp)
y, err = lp.NewMessage("heap_inuse", m.tags, m.meta, map[string]interface{}{"value": memstats.HeapInuse}, timestamp)
if err == nil {
y.AddMeta("unit", "Bytes")
output <- y
}
y, err = lp.NewMessage("heap_released", m.tags, m.meta, map[string]any{"value": memstats.HeapReleased}, timestamp)
y, err = lp.NewMessage("heap_released", m.tags, m.meta, map[string]interface{}{"value": memstats.HeapReleased}, timestamp)
if err == nil {
y.AddMeta("unit", "Bytes")
output <- y
}
y, err = lp.NewMessage("heap_objects", m.tags, m.meta, map[string]any{"value": memstats.HeapObjects}, timestamp)
y, err = lp.NewMessage("heap_objects", m.tags, m.meta, map[string]interface{}{"value": memstats.HeapObjects}, timestamp)
if err == nil {
output <- y
}
}
if m.config.GoRoutines {
y, err := lp.NewMessage("num_goroutines", m.tags, m.meta, map[string]any{"value": runtime.NumGoroutine()}, timestamp)
y, err := lp.NewMessage("num_goroutines", m.tags, m.meta, map[string]interface{}{"value": runtime.NumGoroutine()}, timestamp)
if err == nil {
output <- y
}
}
if m.config.CgoCalls {
y, err := lp.NewMessage("num_cgo_calls", m.tags, m.meta, map[string]any{"value": runtime.NumCgoCall()}, timestamp)
y, err := lp.NewMessage("num_cgo_calls", m.tags, m.meta, map[string]interface{}{"value": runtime.NumCgoCall()}, timestamp)
if err == nil {
output <- y
}
@@ -112,35 +109,35 @@ func (m *SelfCollector) Read(interval time.Duration, output chan lp.CCMessage) {
if err == nil {
sec, nsec := rusage.Utime.Unix()
t := float64(sec) + (float64(nsec) * 1e-9)
y, err := lp.NewMessage("rusage_user_time", m.tags, m.meta, map[string]any{"value": t}, timestamp)
y, err := lp.NewMessage("rusage_user_time", m.tags, m.meta, map[string]interface{}{"value": t}, timestamp)
if err == nil {
y.AddMeta("unit", "seconds")
output <- y
}
sec, nsec = rusage.Stime.Unix()
t = float64(sec) + (float64(nsec) * 1e-9)
y, err = lp.NewMessage("rusage_system_time", m.tags, m.meta, map[string]any{"value": t}, timestamp)
y, err = lp.NewMessage("rusage_system_time", m.tags, m.meta, map[string]interface{}{"value": t}, timestamp)
if err == nil {
y.AddMeta("unit", "seconds")
output <- y
}
y, err = lp.NewMessage("rusage_vol_ctx_switch", m.tags, m.meta, map[string]any{"value": rusage.Nvcsw}, timestamp)
y, err = lp.NewMessage("rusage_vol_ctx_switch", m.tags, m.meta, map[string]interface{}{"value": rusage.Nvcsw}, timestamp)
if err == nil {
output <- y
}
y, err = lp.NewMessage("rusage_invol_ctx_switch", m.tags, m.meta, map[string]any{"value": rusage.Nivcsw}, timestamp)
y, err = lp.NewMessage("rusage_invol_ctx_switch", m.tags, m.meta, map[string]interface{}{"value": rusage.Nivcsw}, timestamp)
if err == nil {
output <- y
}
y, err = lp.NewMessage("rusage_signals", m.tags, m.meta, map[string]any{"value": rusage.Nsignals}, timestamp)
y, err = lp.NewMessage("rusage_signals", m.tags, m.meta, map[string]interface{}{"value": rusage.Nsignals}, timestamp)
if err == nil {
output <- y
}
y, err = lp.NewMessage("rusage_major_pgfaults", m.tags, m.meta, map[string]any{"value": rusage.Majflt}, timestamp)
y, err = lp.NewMessage("rusage_major_pgfaults", m.tags, m.meta, map[string]interface{}{"value": rusage.Majflt}, timestamp)
if err == nil {
output <- y
}
y, err = lp.NewMessage("rusage_minor_pgfaults", m.tags, m.meta, map[string]any{"value": rusage.Minflt}, timestamp)
y, err = lp.NewMessage("rusage_minor_pgfaults", m.tags, m.meta, map[string]interface{}{"value": rusage.Minflt}, timestamp)
if err == nil {
output <- y
}

View File

@@ -1,350 +0,0 @@
package collectors
import (
"encoding/json"
"fmt"
"os"
"os/exec"
"os/user"
"path/filepath"
"strconv"
"strings"
"time"
cclog "github.com/ClusterCockpit/cc-lib/v2/ccLogger"
lp "github.com/ClusterCockpit/cc-lib/v2/ccMessage"
)
// SlurmJobData holds the values ReadJobData gathers for one job cgroup.
// A field keeps its zero value when the corresponding cgroup file cannot be
// read or parsed.
type SlurmJobData struct {
// MemoryUsage is the number read from the job's memory.current file.
MemoryUsage float64
// MaxMemoryUsage is the number read from the job's memory.peak file.
MaxMemoryUsage float64
// LimitMemoryUsage is the number read from the job's memory.max file.
LimitMemoryUsage float64
// CpuUsageUser is user_usec * 100 / usage_usec taken from cpu.stat.
CpuUsageUser float64
// CpuUsageSys is system_usec * 100 / usage_usec taken from cpu.stat.
CpuUsageSys float64
// CpuSet lists the CPU IDs parsed from the job's cpuset.cpus file.
CpuSet []int
}
// SlurmCgroupsConfig contains the options read from the collector's JSON
// configuration.
type SlurmCgroupsConfig struct {
// CgroupBase overrides the default cgroup base directory when non-empty.
CgroupBase string `json:"cgroup_base"`
// ExcludeMetrics lists metric names that are not sent to the output.
ExcludeMetrics []string `json:"exclude_metrics,omitempty"`
// UseSudo makes the collector read cgroup files through "sudo cat"
// instead of opening them directly.
UseSudo bool `json:"use_sudo,omitempty"`
}
// SlurmCgroupCollector reads per-job cgroup files below cgroupBase and emits
// memory and CPU metrics tagged per hardware thread.
type SlurmCgroupCollector struct {
metricCollector
// config is the parsed JSON configuration.
config SlurmCgroupsConfig
// meta is attached to every message created by this collector.
meta map[string]string
// tags is set in Init; Read builds its own per-hwthread tag maps.
tags map[string]string
// allCPUs is the list of online CPUs determined once in Init.
allCPUs []int
// cpuUsed marks the CPUs found in a job's cpuset during the current Read;
// it is emptied at the start of every Read.
cpuUsed map[int]bool
// cgroupBase is the directory that contains the job_* cgroup directories.
cgroupBase string
// excludeMetrics is the set of metric names that must not be emitted.
excludeMetrics map[string]struct{}
// useSudo mirrors config.UseSudo and selects how readFile reads files.
useSudo bool
}
// defaultCgroupBase is the cgroup directory searched for job_* entries when
// the configuration does not set cgroup_base.
const defaultCgroupBase = "/sys/fs/cgroup/system.slice/slurmstepd.scope"
// ParseCPUs expands a comma-separated CPU list with optional ranges (for
// example "0-3,8,10-11"), as found in /sys/devices/system/cpu/online or a
// cgroup's cpuset.cpus file, into the individual CPU IDs in input order.
//
// An empty string yields an empty result and a nil error. Whitespace around
// single IDs and range bounds is ignored. An error is returned for entries
// that are not integers, for ranges that do not consist of exactly two
// bounds, and for ranges whose start is greater than their end (previously
// such a range was silently expanded to nothing).
func ParseCPUs(cpuset string) ([]int, error) {
	var result []int
	if cpuset == "" {
		return result, nil
	}

	for _, entry := range strings.Split(cpuset, ",") {
		// Plain CPU ID without a range separator.
		if !strings.Contains(entry, "-") {
			cpu, err := strconv.Atoi(strings.TrimSpace(entry))
			if err != nil {
				return nil, fmt.Errorf("invalid CPU ID: %s", entry)
			}
			result = append(result, cpu)
			continue
		}

		parts := strings.Split(entry, "-")
		if len(parts) != 2 {
			return nil, fmt.Errorf("invalid CPU range: %s", entry)
		}
		start, err := strconv.Atoi(strings.TrimSpace(parts[0]))
		if err != nil {
			return nil, fmt.Errorf("invalid CPU range start: %s", parts[0])
		}
		end, err := strconv.Atoi(strings.TrimSpace(parts[1]))
		if err != nil {
			return nil, fmt.Errorf("invalid CPU range end: %s", parts[1])
		}
		// A reversed range is malformed input; report it instead of
		// quietly contributing no CPUs.
		if start > end {
			return nil, fmt.Errorf("invalid CPU range: %s", entry)
		}
		for cpu := start; cpu <= end; cpu++ {
			result = append(result, cpu)
		}
	}
	return result, nil
}
// GetAllCPUs returns the CPU IDs listed in /sys/devices/system/cpu/online,
// expanded with ParseCPUs.
//
// A failure to read the file is returned wrapped (with %w), so callers can
// inspect the underlying cause with errors.Is or errors.As. Parse errors from
// ParseCPUs are returned unchanged.
func GetAllCPUs() ([]int, error) {
	data, err := os.ReadFile("/sys/devices/system/cpu/online")
	if err != nil {
		return nil, fmt.Errorf("failed to read /sys/devices/system/cpu/online: %w", err)
	}
	return ParseCPUs(strings.TrimSpace(string(data)))
}
// isExcluded reports whether metric is part of the collector's set of
// excluded metric names.
func (m *SlurmCgroupCollector) isExcluded(metric string) bool {
	if _, excluded := m.excludeMetrics[metric]; excluded {
		return true
	}
	return false
}
// readFile returns the content of path. With useSudo disabled the file is
// read directly; otherwise the standard output of "sudo cat <path>" is
// returned.
func (m *SlurmCgroupCollector) readFile(path string) ([]byte, error) {
	if !m.useSudo {
		return os.ReadFile(path)
	}
	return exec.Command("sudo", "cat", path).Output()
}
// Init prepares the collector: it sets name, meta data and tags, applies the
// optional JSON configuration, verifies that cgroup files can be read (either
// running as UID 0 or with use_sudo enabled) and records the online CPUs.
//
// It returns an error if setup() fails, the configuration cannot be parsed,
// the current user cannot be determined, the process is not root while
// use_sudo is disabled, or the online CPU list cannot be read.
func (m *SlurmCgroupCollector) Init(config json.RawMessage) error {
	m.name = "SlurmCgroupCollector"
	if setupErr := m.setup(); setupErr != nil {
		return fmt.Errorf("%s Init(): setup() call failed: %w", m.name, setupErr)
	}

	m.parallel = true
	m.meta = map[string]string{
		"source": m.name,
		"group":  "SLURM",
	}
	m.tags = map[string]string{
		"type": "hwthread",
	}
	m.cpuUsed = make(map[int]bool)
	m.cgroupBase = defaultCgroupBase

	// Apply the user configuration, if one was given.
	if len(config) > 0 {
		if err := json.Unmarshal(config, &m.config); err != nil {
			cclog.ComponentError(m.name, "Error reading config:", err.Error())
			return err
		}
		m.excludeMetrics = make(map[string]struct{})
		for i := range m.config.ExcludeMetrics {
			m.excludeMetrics[m.config.ExcludeMetrics[i]] = struct{}{}
		}
		if base := m.config.CgroupBase; base != "" {
			m.cgroupBase = base
		}
	}

	// Without sudo the cgroup files are read directly, which is only
	// accepted here when running as UID 0.
	m.useSudo = m.config.UseSudo
	if !m.useSudo {
		current, err := user.Current()
		if err != nil {
			cclog.ComponentError(m.name, "Failed to get current user:", err.Error())
			return err
		}
		if current.Uid != "0" {
			cclog.ComponentError(m.name, "Reading cgroup files requires root privileges (or enable use_sudo in config)")
			return fmt.Errorf("not root")
		}
	}

	cpus, err := GetAllCPUs()
	m.allCPUs = cpus
	if err != nil {
		cclog.ComponentError(m.name, "Error reading online CPUs:", err.Error())
		return err
	}

	m.init = true
	return nil
}
// ReadJobData collects memory, CPU time and cpuset information for the job
// cgroup directory jobdir (relative to the collector's cgroup base).
//
// Every file is read on a best-effort basis: if a file cannot be read or its
// content cannot be parsed, the corresponding field keeps its zero value
// (an empty slice for CpuSet). The returned error is always nil.
func (m *SlurmCgroupCollector) ReadJobData(jobdir string) (SlurmJobData, error) {
	var jobdata SlurmJobData
	jobdata.CpuSet = []int{}

	// readTrimmed returns the whitespace-trimmed content of a file inside
	// the job's cgroup directory and whether reading succeeded.
	readTrimmed := func(name string) (string, bool) {
		raw, err := m.readFile(filepath.Join(m.cgroupBase, jobdir, name))
		if err != nil {
			return "", false
		}
		return strings.TrimSpace(string(raw)), true
	}

	// readFloat stores the file's numeric content in dst; dst is left
	// untouched when the file is unreadable or not a number.
	readFloat := func(name string, dst *float64) {
		text, ok := readTrimmed(name)
		if !ok {
			return
		}
		if parsed, err := strconv.ParseFloat(text, 64); err == nil {
			*dst = parsed
		}
	}

	readFloat("memory.current", &jobdata.MemoryUsage)
	readFloat("memory.peak", &jobdata.MaxMemoryUsage)
	readFloat("memory.max", &jobdata.LimitMemoryUsage)

	if text, ok := readTrimmed("cpu.stat"); ok {
		// Gather the "<key> <value>" pairs; a later occurrence of a key
		// replaces an earlier one.
		usec := make(map[string]float64)
		for _, line := range strings.Split(text, "\n") {
			fields := strings.Fields(line)
			if len(fields) < 2 {
				continue
			}
			if parsed, err := strconv.ParseFloat(fields[1], 64); err == nil {
				usec[fields[0]] = parsed
			}
		}
		// Express user and system time as a percentage of the total CPU
		// time; skipped while no CPU time has been accounted.
		if total := usec["usage_usec"]; total > 0 {
			jobdata.CpuUsageUser = (usec["user_usec"] * 100 / total)
			jobdata.CpuUsageSys = (usec["system_usec"] * 100 / total)
		}
	}

	if text, ok := readTrimmed("cpuset.cpus"); ok {
		if cpus, err := ParseCPUs(text); err == nil {
			jobdata.CpuSet = cpus
		}
	}

	return jobdata, nil
}
// Read emits the job metrics for every hardware thread.
//
// For each job_* directory below the cgroup base, the job's values are
// divided by the number of CPUs in its cpuset and sent once per CPU of that
// cpuset. Afterwards every online CPU that was not part of any job's cpuset
// gets the same metrics with value 0. Metrics listed in exclude_metrics are
// not sent. The interval argument is not used.
func (m *SlurmCgroupCollector) Read(interval time.Duration, output chan lp.CCMessage) {
	timestamp := time.Now()

	// Forget which CPUs were in use during the previous call.
	clear(m.cpuUsed)

	// send creates one message and forwards it, unless the metric is
	// excluded or the message cannot be created.
	send := func(name, unit string, tags map[string]string, value any) {
		if m.isExcluded(name) {
			return
		}
		msg, err := lp.NewMessage(name, tags, m.meta, map[string]any{"value": value}, timestamp)
		if err != nil {
			return
		}
		msg.AddMeta("unit", unit)
		output <- msg
	}

	// hwthreadTags builds the tag set identifying a single hardware thread.
	hwthreadTags := func(cpu int) map[string]string {
		return map[string]string{
			"type":    "hwthread",
			"type-id": fmt.Sprintf("%d", cpu),
		}
	}

	jobDirs, err := filepath.Glob(filepath.Join(m.cgroupBase, "job_*"))
	if err != nil {
		cclog.ComponentError(m.name, "Error globbing job directories:", err.Error())
		return
	}

	for _, jdir := range jobDirs {
		jKey := filepath.Base(jdir)
		jobdata, err := m.ReadJobData(jKey)
		if err != nil {
			cclog.ComponentError(m.name, "Error reading job data for", jKey, ":", err.Error())
			continue
		}

		// The loop body only runs for a non-empty cpuset, so the divisor
		// is never zero.
		coreCount := float64(len(jobdata.CpuSet))
		for _, cpu := range jobdata.CpuSet {
			coreTags := hwthreadTags(cpu)
			send("job_mem_used", "Bytes", coreTags, jobdata.MemoryUsage/coreCount)
			send("job_max_mem_used", "Bytes", coreTags, jobdata.MaxMemoryUsage/coreCount)
			send("job_mem_limit", "Bytes", coreTags, jobdata.LimitMemoryUsage/coreCount)
			send("job_user_cpu", "%", coreTags, jobdata.CpuUsageUser/coreCount)
			send("job_sys_cpu", "%", coreTags, jobdata.CpuUsageSys/coreCount)
			m.cpuUsed[cpu] = true
		}
	}

	// CPUs that belong to no job still report every metric, with value 0.
	for _, cpu := range m.allCPUs {
		if m.cpuUsed[cpu] {
			continue
		}
		coreTags := hwthreadTags(cpu)
		send("job_mem_used", "Bytes", coreTags, 0)
		send("job_max_mem_used", "Bytes", coreTags, 0)
		send("job_mem_limit", "Bytes", coreTags, 0)
		send("job_user_cpu", "%", coreTags, 0)
		send("job_sys_cpu", "%", coreTags, 0)
	}
}
// Close marks the collector as no longer initialized by resetting m.init.
func (m *SlurmCgroupCollector) Close() {
m.init = false
}

View File

@@ -1,50 +0,0 @@
<!--
---
title: Slurm cgroup metric collector
description: Collect per-core memory and CPU usage for SLURM jobs from cgroup v2
categories: [cc-metric-collector]
tags: ['Admin']
weight: 3
hugo_path: docs/reference/cc-metric-collector/collectors/slurm_cgroup.md
---
-->
## `slurm_cgroup` collector
The `slurm_cgroup` collector reads job-specific resource metrics from the cgroup v2 filesystem and provides **hwthread** metrics for memory and CPU usage of running SLURM jobs.
### Example configuration
```json
"slurm_cgroup": {
"cgroup_base": "/sys/fs/cgroup/system.slice/slurmstepd.scope",
"exclude_metrics": [
"job_sys_cpu",
"job_mem_limit"
],
"use_sudo": false
}
```
* The `cgroup_base` parameter (optional) can be set to specify the root path to SLURM job cgroups. The default is `/sys/fs/cgroup/system.slice/slurmstepd.scope`.
* The `exclude_metrics` array can be used to suppress individual metrics from being sent to the sink.
* The cgroup files can only be read by root. If the collector does not run as root, set `use_sudo` to `true` so that the files are read through `sudo cat`; this requires password-less sudo for the user running the collector.
### Reported metrics
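All metrics are reported **per hardware thread**. Each job-level value is divided evenly among the hardware threads in the job's cpuset, and hardware threads that are not assigned to any job report `0` for every metric: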
* `job_mem_used` (`unit=Bytes`): Current memory usage of the job
* `job_max_mem_used` (`unit=Bytes`): Peak memory usage
* `job_mem_limit` (`unit=Bytes`): Cgroup memory limit
* `job_user_cpu` (`unit=%`): User CPU utilization percentage
* `job_sys_cpu` (`unit=%`): System CPU utilization percentage
Each metric has tags:
* `type=hwthread`
* `type-id=<hwthread_id>`
### Limitations
* **cgroups v2 required:** This collector only supports systems running with cgroups v2 (unified hierarchy).

View File

@@ -16,8 +16,8 @@ import (
"strings"
"time"
cclog "github.com/ClusterCockpit/cc-lib/v2/ccLogger"
lp "github.com/ClusterCockpit/cc-lib/v2/ccMessage"
cclog "github.com/ClusterCockpit/cc-lib/ccLogger"
lp "github.com/ClusterCockpit/cc-lib/ccMessage"
)
// See: https://www.kernel.org/doc/html/latest/hwmon/sysfs-interface.html
@@ -58,9 +58,7 @@ func (m *TempCollector) Init(config json.RawMessage) error {
m.name = "TempCollector"
m.parallel = true
if err := m.setup(); err != nil {
return fmt.Errorf("%s Init(): setup() call failed: %w", m.name, err)
}
m.setup()
if len(config) > 0 {
err := json.Unmarshal(config, &m.config)
if err != nil {
@@ -119,7 +117,7 @@ func (m *TempCollector) Init(config json.RawMessage) error {
sensor.metricName = sensor.label
}
sensor.metricName = strings.ToLower(sensor.metricName)
sensor.metricName = strings.ReplaceAll(sensor.metricName, " ", "_")
sensor.metricName = strings.Replace(sensor.metricName, " ", "_", -1)
// Add temperature prefix, if required
if !strings.Contains(sensor.metricName, "temp") {
sensor.metricName = "temp_" + sensor.metricName
@@ -203,7 +201,7 @@ func (m *TempCollector) Read(interval time.Duration, output chan lp.CCMessage) {
sensor.metricName,
sensor.tags,
m.meta,
map[string]any{"value": x},
map[string]interface{}{"value": x},
time.Now(),
)
if err == nil {
@@ -216,7 +214,7 @@ func (m *TempCollector) Read(interval time.Duration, output chan lp.CCMessage) {
sensor.maxTempName,
sensor.tags,
m.meta,
map[string]any{"value": sensor.maxTemp},
map[string]interface{}{"value": sensor.maxTemp},
time.Now(),
)
if err == nil {
@@ -230,7 +228,7 @@ func (m *TempCollector) Read(interval time.Duration, output chan lp.CCMessage) {
sensor.critTempName,
sensor.tags,
m.meta,
map[string]any{"value": sensor.critTemp},
map[string]interface{}{"value": sensor.critTemp},
time.Now(),
)
if err == nil {

View File

@@ -9,13 +9,14 @@ package collectors
import (
"encoding/json"
"errors"
"fmt"
"log"
"os/exec"
"strings"
"time"
cclog "github.com/ClusterCockpit/cc-lib/v2/ccLogger"
lp "github.com/ClusterCockpit/cc-lib/v2/ccMessage"
lp "github.com/ClusterCockpit/cc-lib/ccMessage"
)
const MAX_NUM_PROCS = 10
@@ -35,17 +36,12 @@ func (m *TopProcsCollector) Init(config json.RawMessage) error {
var err error
m.name = "TopProcsCollector"
m.parallel = true
m.tags = map[string]string{
"type": "node",
}
m.meta = map[string]string{
"source": m.name,
"group": "TopProcs",
}
m.tags = map[string]string{"type": "node"}
m.meta = map[string]string{"source": m.name, "group": "TopProcs"}
if len(config) > 0 {
err = json.Unmarshal(config, &m.config)
if err != nil {
return fmt.Errorf("%s Init(): json.Unmarshal() failed: %w", m.name, err)
return err
}
} else {
m.config.Num_procs = int(DEFAULT_NUM_PROCS)
@@ -53,13 +49,12 @@ func (m *TopProcsCollector) Init(config json.RawMessage) error {
if m.config.Num_procs <= 0 || m.config.Num_procs > MAX_NUM_PROCS {
return fmt.Errorf("num_procs option must be set in 'topprocs' config (range: 1-%d)", MAX_NUM_PROCS)
}
if err := m.setup(); err != nil {
return fmt.Errorf("%s Init(): setup() call failed: %w", m.name, err)
}
m.setup()
command := exec.Command("ps", "-Ao", "comm", "--sort=-pcpu")
command.Wait()
_, err = command.Output()
if err != nil {
return fmt.Errorf("%s Init(): failed to get output from command: %w", m.name, err)
return errors.New("failed to execute command")
}
m.init = true
return nil
@@ -70,18 +65,17 @@ func (m *TopProcsCollector) Read(interval time.Duration, output chan lp.CCMessag
return
}
command := exec.Command("ps", "-Ao", "comm", "--sort=-pcpu")
command.Wait()
stdout, err := command.Output()
if err != nil {
cclog.ComponentError(
m.name,
fmt.Sprintf("Read(): Failed to read output from command \"%s\": %v", command.String(), err))
log.Print(m.name, err)
return
}
lines := strings.Split(string(stdout), "\n")
for i := 1; i < m.config.Num_procs+1; i++ {
name := fmt.Sprintf("topproc%d", i)
y, err := lp.NewMessage(name, m.tags, m.meta, map[string]any{"value": string(lines[i])}, time.Now())
y, err := lp.NewMessage(name, m.tags, m.meta, map[string]interface{}{"value": string(lines[i])}, time.Now())
if err == nil {
output <- y
}

View File

@@ -1,6 +1,15 @@
# Building the cc-metric-collector
In most cases, a simple `make` in the main folder is enough to get a `cc-metric-collector` binary. It is basically a `go build` but some collectors require additional tasks. There is currently no Golang interface to LIKWID, so it uses `cgo` to create bindings but `cgo` requires the LIKWID header files. Therefore, it checks whether LIKWID is installed and if not it downloads LIKWID and copies the headers.
Dependencies:
- golang
- hwloc
```
$ export CGO_LDFLAGS="-L/path/to/hwloc/lib/dir"
$ make
```
In most cases, a simple `make` in the main folder is enough to get a `cc-metric-collector` binary as long as hwloc is in default locations. It is basically a `go build` but some collectors require additional tasks. There is currently no Golang interface to LIKWID, so it uses `cgo` to create bindings but `cgo` requires the LIKWID header files. Therefore, it checks whether LIKWID is installed and if not it downloads LIKWID and copies the headers.
## System integration

View File

@@ -4,7 +4,7 @@ The configuration of the CC metric collector consists of five configuration file
## Global configuration
The global file contains the paths to the other four files and some global options. You can find examples in `example_configs`.
The global file contains the paths to the other four files and some global options.
```json
{

43
go.mod
View File

@@ -1,45 +1,46 @@
module github.com/ClusterCockpit/cc-metric-collector
go 1.24.0
go 1.23.4
toolchain go1.23.7
require (
github.com/ClusterCockpit/cc-lib/v2 v2.1.0
github.com/ClusterCockpit/cc-lib v0.5.0
github.com/ClusterCockpit/go-rocm-smi v0.3.0
github.com/NVIDIA/go-nvml v0.13.0-1
github.com/PaesslerAG/gval v1.2.4
github.com/NVIDIA/go-nvml v0.12.9-0
github.com/PaesslerAG/gval v1.2.2
github.com/fsnotify/fsnotify v1.9.0
github.com/influxdata/line-protocol v0.0.0-20210922203350-b1ad95c89adf
github.com/tklauser/go-sysconf v0.3.16
github.com/tklauser/go-sysconf v0.3.13
golang.design/x/thread v0.0.0-20210122121316-335e9adffdf1
golang.org/x/sys v0.40.0
golang.org/x/exp v0.0.0-20250620022241-b7579e27df2b
golang.org/x/sys v0.33.0
)
require (
github.com/apapsch/go-jsonmerge/v2 v2.0.0 // indirect
github.com/beorn7/perks v1.0.1 // indirect
github.com/cespare/xxhash/v2 v2.3.0 // indirect
github.com/expr-lang/expr v1.17.7 // indirect
github.com/expr-lang/expr v1.17.5 // indirect
github.com/google/uuid v1.6.0 // indirect
github.com/gorilla/mux v1.8.1 // indirect
github.com/influxdata/influxdb-client-go/v2 v2.14.0 // indirect
github.com/influxdata/line-protocol/v2 v2.2.1 // indirect
github.com/klauspost/compress v1.18.2 // indirect
github.com/klauspost/compress v1.18.0 // indirect
github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822 // indirect
github.com/nats-io/nats.go v1.48.0 // indirect
github.com/nats-io/nkeys v0.4.12 // indirect
github.com/nats-io/nats.go v1.43.0 // indirect
github.com/nats-io/nkeys v0.4.11 // indirect
github.com/nats-io/nuid v1.0.1 // indirect
github.com/oapi-codegen/runtime v1.1.2 // indirect
github.com/prometheus/client_golang v1.23.2 // indirect
github.com/oapi-codegen/runtime v1.1.1 // indirect
github.com/prometheus/client_golang v1.22.0 // indirect
github.com/prometheus/client_model v0.6.2 // indirect
github.com/prometheus/common v0.67.5 // indirect
github.com/prometheus/procfs v0.19.2 // indirect
github.com/prometheus/common v0.65.0 // indirect
github.com/prometheus/procfs v0.16.1 // indirect
github.com/santhosh-tekuri/jsonschema/v5 v5.3.1 // indirect
github.com/shopspring/decimal v1.4.0 // indirect
github.com/shopspring/decimal v1.3.1 // indirect
github.com/stmcginnis/gofish v0.20.0 // indirect
github.com/tklauser/numcpus v0.11.0 // indirect
go.yaml.in/yaml/v2 v2.4.3 // indirect
golang.org/x/crypto v0.47.0 // indirect
golang.org/x/exp v0.0.0-20260112195511-716be5621a96 // indirect
golang.org/x/net v0.49.0 // indirect
google.golang.org/protobuf v1.36.11 // indirect
github.com/tklauser/numcpus v0.7.0 // indirect
golang.org/x/crypto v0.39.0 // indirect
golang.org/x/net v0.41.0 // indirect
google.golang.org/protobuf v1.36.6 // indirect
)

93
go.sum
View File

@@ -1,17 +1,15 @@
github.com/ClusterCockpit/cc-lib/v2 v2.1.0 h1:B6l6h0IjfEuY9DU6aVM3fSsj24lQ1eudXK9QTKmJjqg=
github.com/ClusterCockpit/cc-lib/v2 v2.1.0/go.mod h1:JuxMAuEOaLLNEnnL9U3ejha8kMvsSatLdKPZEgJw6iw=
github.com/ClusterCockpit/cc-lib v0.5.0 h1:DSKAD1TxjVWyd1x3GWvxFeEkANF9o13T97nirj3CbRU=
github.com/ClusterCockpit/cc-lib v0.5.0/go.mod h1:0zLbJprwOWLA+OSNQ+OlUKLscZszwf9J2j8Ly5ztplk=
github.com/ClusterCockpit/go-rocm-smi v0.3.0 h1:1qZnSpG7/NyLtc7AjqnUL9Jb8xtqG1nMVgp69rJfaR8=
github.com/ClusterCockpit/go-rocm-smi v0.3.0/go.mod h1:+I3UMeX3OlizXDf1WpGD43W4KGZZGVSGmny6rTeOnWA=
github.com/NVIDIA/go-nvml v0.11.6-0/go.mod h1:hy7HYeQy335x6nEss0Ne3PYqleRa6Ct+VKD9RQ4nyFs=
github.com/NVIDIA/go-nvml v0.13.0-1 h1:OLX8Jq3dONuPOQPC7rndB6+iDmDakw0XTYgzMxObkEw=
github.com/NVIDIA/go-nvml v0.13.0-1/go.mod h1:+KNA7c7gIBH7SKSJ1ntlwkfN80zdx8ovl4hrK3LmPt4=
github.com/NVIDIA/go-nvml v0.12.9-0 h1:e344UK8ZkeMeeLkdQtRhmXRxNf+u532LDZPGMtkdus0=
github.com/NVIDIA/go-nvml v0.12.9-0/go.mod h1:+KNA7c7gIBH7SKSJ1ntlwkfN80zdx8ovl4hrK3LmPt4=
github.com/PaesslerAG/gval v1.2.4 h1:rhX7MpjJlcxYwL2eTTYIOBUyEKZ+A96T9vQySWkVUiU=
github.com/PaesslerAG/gval v1.2.4/go.mod h1:XRFLwvmkTEdYziLdaCeCa5ImcGVrfQbeNUbVR+C6xac=
github.com/PaesslerAG/jsonpath v0.1.0 h1:gADYeifvlqK3R3i2cR5B4DGgxLXIPb3TRTH1mGi0jPI=
github.com/PaesslerAG/jsonpath v0.1.0/go.mod h1:4BzmtoM/PI8fPO4aQGIusjGxGir2BzcV0grWtFzq1Y8=
github.com/RaveNoX/go-jsoncommentstrip v1.0.0/go.mod h1:78ihd09MekBnJnxpICcwzCMzGrKSKYe4AqU6PDYYpjk=
github.com/antithesishq/antithesis-sdk-go v0.5.0-default-no-op h1:Ucf+QxEKMbPogRO5guBNe5cgd9uZgfoJLOYs8WWhtjM=
github.com/antithesishq/antithesis-sdk-go v0.5.0-default-no-op/go.mod h1:IUpT2DPAKh6i/YhSbt6Gl3v2yvUZjmKncl7U91fup7E=
github.com/apapsch/go-jsonmerge/v2 v2.0.0 h1:axGnT1gRIfimI7gJifB699GoE/oq+F2MU7Dml6nw9rQ=
github.com/apapsch/go-jsonmerge/v2 v2.0.0/go.mod h1:lvDnEdqiQrp0O42VQGgmlKpxL1AP2+08jFMw88y4klk=
github.com/beorn7/perks v1.0.1 h1:VlbKKnNfV8bJzeqoa4cOKqO6bYr3WgKZxO8Z16+hsOM=
@@ -23,8 +21,8 @@ github.com/creack/pty v1.1.9/go.mod h1:oKZEueFk5CKHvIhNR5MUki03XCEU+Q6VDXinZuGJ3
github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c=
github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
github.com/expr-lang/expr v1.17.7 h1:Q0xY/e/2aCIp8g9s/LGvMDCC5PxYlvHgDZRQ4y16JX8=
github.com/expr-lang/expr v1.17.7/go.mod h1:8/vRC7+7HBzESEqt5kKpYXxrxkr31SaO8r40VO/1IT4=
github.com/expr-lang/expr v1.17.5 h1:i1WrMvcdLF249nSNlpQZN1S6NXuW9WaOfF5tPi3aw3k=
github.com/expr-lang/expr v1.17.5/go.mod h1:8/vRC7+7HBzESEqt5kKpYXxrxkr31SaO8r40VO/1IT4=
github.com/frankban/quicktest v1.11.0/go.mod h1:K+q6oSqb0W0Ininfk863uOk1lMy69l/P6txr3mVT54s=
github.com/frankban/quicktest v1.11.2/go.mod h1:K+q6oSqb0W0Ininfk863uOk1lMy69l/P6txr3mVT54s=
github.com/frankban/quicktest v1.13.0 h1:yNZif1OkDfNoDfb9zZa9aXIpejNR4F23Wely0c+Qdqk=
@@ -35,8 +33,6 @@ github.com/google/go-cmp v0.5.2/go.mod h1:v8dTdLbMG2kIc/vJvl+f65V22dbkXbowE6jgT/
github.com/google/go-cmp v0.5.5/go.mod h1:v8dTdLbMG2kIc/vJvl+f65V22dbkXbowE6jgT/gNBxE=
github.com/google/go-cmp v0.7.0 h1:wk8382ETsv4JYUZwIsn6YpYiWiBsYLSJiTsyBybVuN8=
github.com/google/go-cmp v0.7.0/go.mod h1:pXiqmnSA92OHEEa9HXL2W4E7lf9JzCmGVUdgjX3N/iU=
github.com/google/go-tpm v0.9.7 h1:u89J4tUUeDTlH8xxC3CTW7OHZjbjKoHdQ9W7gCUhtxA=
github.com/google/go-tpm v0.9.7/go.mod h1:h9jEsEECg7gtLis0upRBQU+GhYVH6jMjrFxI8u6bVUY=
github.com/google/uuid v1.6.0 h1:NIvaJDMOsjHA8n1jAhLSgzrAzy1Hgr+hNrb57e+94F0=
github.com/google/uuid v1.6.0/go.mod h1:TIyPZe4MgqvfeYDBFedMoGGpEw/LqOeaOT+nhxU+yHo=
github.com/gorilla/mux v1.8.1 h1:TuBL49tXwgrFYWhqrNgrUNEY92u81SPhu7sTdzQEiWY=
@@ -53,8 +49,8 @@ github.com/influxdata/line-protocol/v2 v2.1.0/go.mod h1:QKw43hdUBg3GTk2iC3iyCxks
github.com/influxdata/line-protocol/v2 v2.2.1 h1:EAPkqJ9Km4uAxtMRgUubJyqAr6zgWM0dznKMLRauQRE=
github.com/influxdata/line-protocol/v2 v2.2.1/go.mod h1:DmB3Cnh+3oxmG6LOBIxce4oaL4CPj3OmMPgvauXh+tM=
github.com/juju/gnuflag v0.0.0-20171113085948-2ce1bb71843d/go.mod h1:2PavIy+JPciBPrBUjwbNvtwB6RQlve+hkpll6QSNmOE=
github.com/klauspost/compress v1.18.2 h1:iiPHWW0YrcFgpBYhsA6D1+fqHssJscY/Tm/y2Uqnapk=
github.com/klauspost/compress v1.18.2/go.mod h1:R0h/fSBs8DE4ENlcrlib3PsXS61voFxhIs2DeRhCvJ4=
github.com/klauspost/compress v1.18.0 h1:c/Cqfb0r+Yi+JtIEq73FWXVkRonBlf0CRNYc8Zttxdo=
github.com/klauspost/compress v1.18.0/go.mod h1:2Pp+KzxcywXVXMr50+X0Q/Lsb43OQHYWRCY2AiWywWQ=
github.com/kr/pretty v0.2.1/go.mod h1:ipq/a2n7PKx3OHsz4KJII5eveXtPO4qwEXGdVfWzfnI=
github.com/kr/pretty v0.3.1 h1:flRD4NNwYAUpkphVc1HcthR4KEIFJ65n8Mw5qdRn3LE=
github.com/kr/pretty v0.3.1/go.mod h1:hoEshYVHaxMs3cyo3Yncou5ZscifuDolrwPKZanG3xk=
@@ -64,75 +60,60 @@ github.com/kr/text v0.2.0 h1:5Nx0Ya0ZqY2ygV366QzturHI13Jq95ApcVaJBhpS+AY=
github.com/kr/text v0.2.0/go.mod h1:eLer722TekiGuMkidMxC/pM04lWEeraHUUmBw8l2grE=
github.com/kylelemons/godebug v1.1.0 h1:RPNrshWIDI6G2gRW9EHilWtl7Z6Sb1BR0xunSBf0SNc=
github.com/kylelemons/godebug v1.1.0/go.mod h1:9/0rRGxNHcop5bhtWyNeEfOS8JIWk580+fNqagV/RAw=
github.com/minio/highwayhash v1.0.4-0.20251030100505-070ab1a87a76 h1:KGuD/pM2JpL9FAYvBrnBBeENKZNh6eNtjqytV6TYjnk=
github.com/minio/highwayhash v1.0.4-0.20251030100505-070ab1a87a76/go.mod h1:GGYsuwP/fPD6Y9hMiXuapVvlIUEhFhMTh0rxU3ik1LQ=
github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822 h1:C3w9PqII01/Oq1c1nUAm88MOHcQC9l5mIlSMApZMrHA=
github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822/go.mod h1:+n7T8mK8HuQTcFwEeznm/DIxMOiR9yIdICNftLE1DvQ=
github.com/nats-io/jwt/v2 v2.8.0 h1:K7uzyz50+yGZDO5o772eRE7atlcSEENpL7P+b74JV1g=
github.com/nats-io/jwt/v2 v2.8.0/go.mod h1:me11pOkwObtcBNR8AiMrUbtVOUGkqYjMQZ6jnSdVUIA=
github.com/nats-io/nats-server/v2 v2.12.3 h1:KRv+1n7lddMVgkJPQer+pt36TcO0ENxjilBmeWdjcHs=
github.com/nats-io/nats-server/v2 v2.12.3/go.mod h1:MQXjG9WjyXKz9koWzUc3jYUMKD8x3CLmTNy91IQQz3Y=
github.com/nats-io/nats.go v1.48.0 h1:pSFyXApG+yWU/TgbKCjmm5K4wrHu86231/w84qRVR+U=
github.com/nats-io/nats.go v1.48.0/go.mod h1:iRWIPokVIFbVijxuMQq4y9ttaBTMe0SFdlZfMDd+33g=
github.com/nats-io/nkeys v0.4.12 h1:nssm7JKOG9/x4J8II47VWCL1Ds29avyiQDRn0ckMvDc=
github.com/nats-io/nkeys v0.4.12/go.mod h1:MT59A1HYcjIcyQDJStTfaOY6vhy9XTUjOFo+SVsvpBg=
github.com/nats-io/nats.go v1.43.0 h1:uRFZ2FEoRvP64+UUhaTokyS18XBCR/xM2vQZKO4i8ug=
github.com/nats-io/nats.go v1.43.0/go.mod h1:iRWIPokVIFbVijxuMQq4y9ttaBTMe0SFdlZfMDd+33g=
github.com/nats-io/nkeys v0.4.11 h1:q44qGV008kYd9W1b1nEBkNzvnWxtRSQ7A8BoqRrcfa0=
github.com/nats-io/nkeys v0.4.11/go.mod h1:szDimtgmfOi9n25JpfIdGw12tZFYXqhGxjhVxsatHVE=
github.com/nats-io/nuid v1.0.1 h1:5iA8DT8V7q8WK2EScv2padNa/rTESc1KdnPw4TC2paw=
github.com/nats-io/nuid v1.0.1/go.mod h1:19wcPz3Ph3q0Jbyiqsd0kePYG7A95tJPxeL+1OSON2c=
github.com/niemeyer/pretty v0.0.0-20200227124842-a10e7caefd8e/go.mod h1:zD1mROLANZcx1PVRCS0qkT7pwLkGfwJo4zjcN/Tysno=
github.com/oapi-codegen/runtime v1.1.2 h1:P2+CubHq8fO4Q6fV1tqDBZHCwpVpvPg7oKiYzQgXIyI=
github.com/oapi-codegen/runtime v1.1.2/go.mod h1:SK9X900oXmPWilYR5/WKPzt3Kqxn/uS/+lbpREv+eCg=
github.com/oapi-codegen/runtime v1.1.1 h1:EXLHh0DXIJnWhdRPN2w4MXAzFyE4CskzhNLUmtpMYro=
github.com/oapi-codegen/runtime v1.1.1/go.mod h1:SK9X900oXmPWilYR5/WKPzt3Kqxn/uS/+lbpREv+eCg=
github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM=
github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4=
github.com/prometheus/client_golang v1.23.2 h1:Je96obch5RDVy3FDMndoUsjAhG5Edi49h0RJWRi/o0o=
github.com/prometheus/client_golang v1.23.2/go.mod h1:Tb1a6LWHB3/SPIzCoaDXI4I8UHKeFTEQ1YCr+0Gyqmg=
github.com/prometheus/client_golang v1.22.0 h1:rb93p9lokFEsctTys46VnV1kLCDpVZ0a/Y92Vm0Zc6Q=
github.com/prometheus/client_golang v1.22.0/go.mod h1:R7ljNsLXhuQXYZYtw6GAE9AZg8Y7vEW5scdCXrWRXC0=
github.com/prometheus/client_model v0.6.2 h1:oBsgwpGs7iVziMvrGhE53c/GrLUsZdHnqNwqPLxwZyk=
github.com/prometheus/client_model v0.6.2/go.mod h1:y3m2F6Gdpfy6Ut/GBsUqTWZqCUvMVzSfMLjcu6wAwpE=
github.com/prometheus/common v0.67.5 h1:pIgK94WWlQt1WLwAC5j2ynLaBRDiinoAb86HZHTUGI4=
github.com/prometheus/common v0.67.5/go.mod h1:SjE/0MzDEEAyrdr5Gqc6G+sXI67maCxzaT3A2+HqjUw=
github.com/prometheus/procfs v0.19.2 h1:zUMhqEW66Ex7OXIiDkll3tl9a1ZdilUOd/F6ZXw4Vws=
github.com/prometheus/procfs v0.19.2/go.mod h1:M0aotyiemPhBCM0z5w87kL22CxfcH05ZpYlu+b4J7mw=
github.com/prometheus/common v0.65.0 h1:QDwzd+G1twt//Kwj/Ww6E9FQq1iVMmODnILtW1t2VzE=
github.com/prometheus/common v0.65.0/go.mod h1:0gZns+BLRQ3V6NdaerOhMbwwRbNh9hkGINtQAsP5GS8=
github.com/prometheus/procfs v0.16.1 h1:hZ15bTNuirocR6u0JZ6BAHHmwS1p8B4P6MRqxtzMyRg=
github.com/prometheus/procfs v0.16.1/go.mod h1:teAbpZRB1iIAJYREa1LsoWUXykVXA1KlTmWl8x/U+Is=
github.com/rogpeppe/go-internal v1.10.0 h1:TMyTOH3F/DB16zRVcYyreMH6GnZZrwQVAoYjRBZyWFQ=
github.com/rogpeppe/go-internal v1.10.0/go.mod h1:UQnix2H7Ngw/k4C5ijL5+65zddjncjaFoBhdsK/akog=
github.com/santhosh-tekuri/jsonschema/v5 v5.3.1 h1:lZUw3E0/J3roVtGQ+SCrUrg3ON6NgVqpn3+iol9aGu4=
github.com/santhosh-tekuri/jsonschema/v5 v5.3.1/go.mod h1:uToXkOrWAZ6/Oc07xWQrPOhJotwFIyu2bBVN41fcDUY=
github.com/shopspring/decimal v1.3.1 h1:2Usl1nmF/WZucqkFZhnfFYxxxu8LG21F6nPQBE5gKV8=
github.com/shopspring/decimal v1.3.1/go.mod h1:DKyhrW/HYNuLGql+MJL6WCR6knT2jwCFRcu2hWCYk4o=
github.com/shopspring/decimal v1.4.0 h1:bxl37RwXBklmTi0C79JfXCEBD1cqqHt0bbgBAGFp81k=
github.com/shopspring/decimal v1.4.0/go.mod h1:gawqmDU56v4yIKSwfBSFip1HdCCXN8/+DMd9qYNcwME=
github.com/spkg/bom v0.0.0-20160624110644-59b7046e48ad/go.mod h1:qLr4V1qq6nMqFKkMo8ZTx3f+BZEkzsRUY10Xsm2mwU0=
github.com/stmcginnis/gofish v0.20.0 h1:hH2V2Qe898F2wWT1loApnkDUrXXiLKqbSlMaH3Y1n08=
github.com/stmcginnis/gofish v0.20.0/go.mod h1:PzF5i8ecRG9A2ol8XT64npKUunyraJ+7t0kYMpQAtqU=
github.com/stretchr/objx v0.1.0/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME=
github.com/stretchr/testify v1.3.0/go.mod h1:M5WIy9Dh21IEIfnGCwXGc5bZfKNJtfHm1UVUgZn+9EI=
github.com/stretchr/testify v1.11.1 h1:7s2iGBzp5EwR7/aIZr8ao5+dra3wiQyKjjFuvgVKu7U=
github.com/stretchr/testify v1.11.1/go.mod h1:wZwfW3scLgRK+23gO65QZefKpKQRnfz6sD981Nm4B6U=
github.com/tklauser/go-sysconf v0.3.16 h1:frioLaCQSsF5Cy1jgRBrzr6t502KIIwQ0MArYICU0nA=
github.com/tklauser/go-sysconf v0.3.16/go.mod h1:/qNL9xxDhc7tx3HSRsLWNnuzbVfh3e7gh/BmM179nYI=
github.com/tklauser/numcpus v0.11.0 h1:nSTwhKH5e1dMNsCdVBukSZrURJRoHbSEQjdEbY+9RXw=
github.com/tklauser/numcpus v0.11.0/go.mod h1:z+LwcLq54uWZTX0u/bGobaV34u6V7KNlTZejzM6/3MQ=
go.uber.org/goleak v1.3.0 h1:2K3zAYmnTNqV73imy9J1T3WC+gmCePx2hEGkimedGto=
go.uber.org/goleak v1.3.0/go.mod h1:CoHD4mav9JJNrW/WLlf7HGZPjdw8EucARQHekz1X6bE=
go.yaml.in/yaml/v2 v2.4.3 h1:6gvOSjQoTB3vt1l+CU+tSyi/HOjfOjRLJ4YwYZGwRO0=
go.yaml.in/yaml/v2 v2.4.3/go.mod h1:zSxWcmIDjOzPXpjlTTbAsKokqkDNAVtZO0WOMiT90s8=
github.com/stretchr/testify v1.10.0 h1:Xv5erBjTwe/5IxqUQTdXv5kgmIvbHo3QQyRwhJsOfJA=
github.com/stretchr/testify v1.10.0/go.mod h1:r2ic/lqez/lEtzL7wO/rwa5dbSLXVDPFyf8C91i36aY=
github.com/tklauser/go-sysconf v0.3.13 h1:GBUpcahXSpR2xN01jhkNAbTLRk2Yzgggk8IM08lq3r4=
github.com/tklauser/go-sysconf v0.3.13/go.mod h1:zwleP4Q4OehZHGn4CYZDipCgg9usW5IJePewFCGVEa0=
github.com/tklauser/numcpus v0.7.0 h1:yjuerZP127QG9m5Zh/mSO4wqurYil27tHrqwRoRjpr4=
github.com/tklauser/numcpus v0.7.0/go.mod h1:bb6dMVcj8A42tSE7i32fsIUCbQNllK5iDguyOZRUzAY=
golang.design/x/thread v0.0.0-20210122121316-335e9adffdf1 h1:P7S/GeHBAFEZIYp0ePPs2kHXoazz8q2KsyxHyQVGCJg=
golang.design/x/thread v0.0.0-20210122121316-335e9adffdf1/go.mod h1:9CWpnTUmlQkfdpdutA1nNf4iE5lAVt3QZOu0Z6hahBE=
golang.org/x/crypto v0.47.0 h1:V6e3FRj+n4dbpw86FJ8Fv7XVOql7TEwpHapKoMJ/GO8=
golang.org/x/crypto v0.47.0/go.mod h1:ff3Y9VzzKbwSSEzWqJsJVBnWmRwRSHt/6Op5n9bQc4A=
golang.org/x/exp v0.0.0-20260112195511-716be5621a96 h1:Z/6YuSHTLOHfNFdb8zVZomZr7cqNgTJvA8+Qz75D8gU=
golang.org/x/exp v0.0.0-20260112195511-716be5621a96/go.mod h1:nzimsREAkjBCIEFtHiYkrJyT+2uy9YZJB7H1k68CXZU=
golang.org/x/net v0.49.0 h1:eeHFmOGUTtaaPSGNmjBKpbng9MulQsJURQUAfUwY++o=
golang.org/x/net v0.49.0/go.mod h1:/ysNB2EvaqvesRkuLAyjI1ycPZlQHM3q01F02UY/MV8=
golang.org/x/crypto v0.39.0 h1:SHs+kF4LP+f+p14esP5jAoDpHU8Gu/v9lFRK6IT5imM=
golang.org/x/crypto v0.39.0/go.mod h1:L+Xg3Wf6HoL4Bn4238Z6ft6KfEpN0tJGo53AAPC632U=
golang.org/x/exp v0.0.0-20250620022241-b7579e27df2b h1:M2rDM6z3Fhozi9O7NWsxAkg/yqS/lQJ6PmkyIV3YP+o=
golang.org/x/exp v0.0.0-20250620022241-b7579e27df2b/go.mod h1:3//PLf8L/X+8b4vuAfHzxeRUl04Adcb341+IGKfnqS8=
golang.org/x/net v0.41.0 h1:vBTly1HeNPEn3wtREYfy4GZ/NECgw2Cnl+nK6Nz3uvw=
golang.org/x/net v0.41.0/go.mod h1:B/K4NNqkfmg07DQYrbwvSluqCJOOXwUjeb/5lOisjbA=
golang.org/x/sys v0.0.0-20210122093101-04d7465088b8/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
golang.org/x/sys v0.40.0 h1:DBZZqJ2Rkml6QMQsZywtnjnnGvHza6BTfYFWY9kjEWQ=
golang.org/x/sys v0.40.0/go.mod h1:OgkHotnGiDImocRcuBABYBEXf8A9a87e/uXjp9XT3ks=
golang.org/x/time v0.14.0 h1:MRx4UaLrDotUKUdCIqzPC48t1Y9hANFKIRpNx+Te8PI=
golang.org/x/time v0.14.0/go.mod h1:eL/Oa2bBBK0TkX57Fyni+NgnyQQN4LitPmob2Hjnqw4=
golang.org/x/sys v0.33.0 h1:q3i8TbbEz+JRD9ywIRlyRAQbM0qF7hu24q3teo2hbuw=
golang.org/x/sys v0.33.0/go.mod h1:BJP2sWEmIv4KK5OTEluFJCKSidICx8ciO85XgH3Ak8k=
golang.org/x/xerrors v0.0.0-20191204190536-9bdfabe68543/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0=
google.golang.org/protobuf v1.36.11 h1:fV6ZwhNocDyBLK0dj+fg8ektcVegBBuEolpbTQyBNVE=
google.golang.org/protobuf v1.36.11/go.mod h1:HTf+CrKn2C3g5S8VImy6tdcUvCska2kB7j23XfzDpco=
google.golang.org/protobuf v1.36.6 h1:z1NpPI8ku2WgiWnf+t9wTPsn6eP1L7ksHUlkfLvd9xY=
google.golang.org/protobuf v1.36.6/go.mod h1:jduwjTPXsFjZGTmRluh+L6NjiWu7pchiJ2/5YcXBHnY=
gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0=
gopkg.in/check.v1 v1.0.0-20200227125254-8fa46927fb4f/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0=
gopkg.in/check.v1 v1.0.0-20201130134442-10cb98267c6c h1:Hei/4ADfdWqJk1ZMxUNpqntNwaWcugrBjAiHlqqRiVk=
gopkg.in/check.v1 v1.0.0-20201130134442-10cb98267c6c/go.mod h1:JHkPIbrfpd72SG/EVd6muEfDQjcINNoR0C8j2r3qZ4Q=
gopkg.in/yaml.v3 v3.0.0-20200615113413-eeeca48fe776/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM=
gopkg.in/yaml.v3 v3.0.1 h1:fxVm/GzAzEWqLHuvctI91KS9hhNmmWOoWu0XTYJS7CA=
gopkg.in/yaml.v3 v3.0.1/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM=

View File

@@ -10,17 +10,15 @@ package metricAggregator
import (
"context"
"fmt"
"maps"
"math"
"os"
"slices"
"strings"
"sync"
"time"
cclog "github.com/ClusterCockpit/cc-lib/v2/ccLogger"
cclog "github.com/ClusterCockpit/cc-lib/ccLogger"
lp "github.com/ClusterCockpit/cc-lib/v2/ccMessage"
lp "github.com/ClusterCockpit/cc-lib/ccMessage"
topo "github.com/ClusterCockpit/cc-metric-collector/pkg/ccTopology"
"github.com/PaesslerAG/gval"
@@ -38,7 +36,7 @@ type MetricAggregatorIntervalConfig struct {
type metricAggregator struct {
functions []*MetricAggregatorIntervalConfig
constants map[string]any
constants map[string]interface{}
language gval.Language
output chan lp.CCMessage
}
@@ -86,7 +84,7 @@ var evaluables = struct {
func (c *metricAggregator) Init(output chan lp.CCMessage) error {
c.output = output
c.functions = make([]*MetricAggregatorIntervalConfig, 0)
c.constants = make(map[string]any)
c.constants = make(map[string]interface{})
// add constants like hostname, numSockets, ... to constants list
// Set hostname
@@ -122,8 +120,10 @@ func (c *metricAggregator) Init(output chan lp.CCMessage) error {
}
func (c *metricAggregator) Eval(starttime time.Time, endtime time.Time, metrics []lp.CCMessage) {
vars := make(map[string]any)
maps.Copy(vars, c.constants)
vars := make(map[string]interface{})
for k, v := range c.constants {
vars[k] = v
}
vars["starttime"] = starttime
vars["endtime"] = endtime
for _, f := range c.functions {
@@ -263,15 +263,15 @@ func (c *metricAggregator) Eval(starttime time.Time, endtime time.Time, metrics
var m lp.CCMessage
switch t := value.(type) {
case float64:
m, err = lp.NewMessage(f.Name, tags, meta, map[string]any{"value": t}, starttime)
m, err = lp.NewMessage(f.Name, tags, meta, map[string]interface{}{"value": t}, starttime)
case float32:
m, err = lp.NewMessage(f.Name, tags, meta, map[string]any{"value": t}, starttime)
m, err = lp.NewMessage(f.Name, tags, meta, map[string]interface{}{"value": t}, starttime)
case int:
m, err = lp.NewMessage(f.Name, tags, meta, map[string]any{"value": t}, starttime)
m, err = lp.NewMessage(f.Name, tags, meta, map[string]interface{}{"value": t}, starttime)
case int64:
m, err = lp.NewMessage(f.Name, tags, meta, map[string]any{"value": t}, starttime)
m, err = lp.NewMessage(f.Name, tags, meta, map[string]interface{}{"value": t}, starttime)
case string:
m, err = lp.NewMessage(f.Name, tags, meta, map[string]any{"value": t}, starttime)
m, err = lp.NewMessage(f.Name, tags, meta, map[string]interface{}{"value": t}, starttime)
default:
cclog.ComponentError("MetricCache", "Gval returned invalid type", t, "skipping metric", f.Name)
}
@@ -329,19 +329,18 @@ func (c *metricAggregator) AddAggregation(name, function, condition string, tags
}
func (c *metricAggregator) DeleteAggregation(name string) error {
i := slices.IndexFunc(
c.functions,
func(agg *MetricAggregatorIntervalConfig) bool {
return agg.Name == name
})
if i == -1 {
return fmt.Errorf("no aggregation for metric name %s", name)
for i, agg := range c.functions {
if agg.Name == name {
copy(c.functions[i:], c.functions[i+1:])
c.functions[len(c.functions)-1] = nil
c.functions = c.functions[:len(c.functions)-1]
return nil
}
}
c.functions = slices.Delete(c.functions, i, i)
return nil
return fmt.Errorf("no aggregation for metric name %s", name)
}
func (c *metricAggregator) AddConstant(name string, value any) {
func (c *metricAggregator) AddConstant(name string, value interface{}) {
c.constants[name] = value
}
@@ -349,11 +348,11 @@ func (c *metricAggregator) DelConstant(name string) {
delete(c.constants, name)
}
func (c *metricAggregator) AddFunction(name string, function func(args ...any) (any, error)) {
func (c *metricAggregator) AddFunction(name string, function func(args ...interface{}) (interface{}, error)) {
c.language = gval.NewLanguage(c.language, gval.Function(name, function))
}
func EvalBoolCondition(condition string, params map[string]any) (bool, error) {
func EvalBoolCondition(condition string, params map[string]interface{}) (bool, error) {
evaluables.mutex.Lock()
evaluable, ok := evaluables.mapping[condition]
evaluables.mutex.Unlock()

View File

@@ -11,9 +11,10 @@ import (
"errors"
"fmt"
"regexp"
"slices"
"strings"
"golang.org/x/exp/slices"
topo "github.com/ClusterCockpit/cc-metric-collector/pkg/ccTopology"
)
@@ -33,7 +34,7 @@ func sumAnyType[T float64 | float32 | int | int32 | int64](values []T) (T, error
}
// Sum up values
func sumfunc(args any) (any, error) {
func sumfunc(args interface{}) (interface{}, error) {
var err error
switch values := args.(type) {
@@ -62,7 +63,7 @@ func minAnyType[T float64 | float32 | int | int32 | int64](values []T) (T, error
}
// Get the minimum value
func minfunc(args any) (any, error) {
func minfunc(args interface{}) (interface{}, error) {
switch values := args.(type) {
case []float64:
return minAnyType(values)
@@ -83,12 +84,12 @@ func avgAnyType[T float64 | float32 | int | int32 | int64](values []T) (float64,
if len(values) == 0 {
return 0.0, errors.New("average function requires at least one argument")
}
sum, err := sumAnyType(values)
sum, err := sumAnyType[T](values)
return float64(sum) / float64(len(values)), err
}
// Get the average or mean value
func avgfunc(args any) (any, error) {
func avgfunc(args interface{}) (interface{}, error) {
switch values := args.(type) {
case []float64:
return avgAnyType(values)
@@ -113,7 +114,7 @@ func maxAnyType[T float64 | float32 | int | int32 | int64](values []T) (T, error
}
// Get the maximum value
func maxfunc(args any) (any, error) {
func maxfunc(args interface{}) (interface{}, error) {
switch values := args.(type) {
case []float64:
return maxAnyType(values)
@@ -145,7 +146,7 @@ func medianAnyType[T float64 | float32 | int | int32 | int64](values []T) (T, er
}
// Get the median value
func medianfunc(args any) (any, error) {
func medianfunc(args interface{}) (interface{}, error) {
switch values := args.(type) {
case []float64:
return medianAnyType(values)
@@ -166,9 +167,9 @@ func medianfunc(args any) (any, error) {
* Get the number of values in a list. Always returns an int
*/
func lenfunc(args any) (any, error) {
func lenfunc(args interface{}) (interface{}, error) {
var err error = nil
length := 0
var length int = 0
switch values := args.(type) {
case []float64:
length = len(values)
@@ -180,7 +181,13 @@ func lenfunc(args any) (any, error) {
length = len(values)
case []int32:
length = len(values)
case float64, float32, int, int64:
case float64:
err = errors.New("function 'len' can only be applied on arrays and strings")
case float32:
err = errors.New("function 'len' can only be applied on arrays and strings")
case int:
err = errors.New("function 'len' can only be applied on arrays and strings")
case int64:
err = errors.New("function 'len' can only be applied on arrays and strings")
case string:
length = len(values)
@@ -190,13 +197,13 @@ func lenfunc(args any) (any, error) {
/*
* Check if a value is in a list
* In contrast to most of the other functions, this one is an infix operator for
* In constrast to most of the other functions, this one is an infix operator for
* - substring matching: `"abc" in "abcdef"` -> true
* - substring matching with int casting: `3 in "abd3"` -> true
* - search for an int in an int list: `3 in getCpuList()` -> true (if you have more than 4 CPU hardware threads)
*/
func infunc(a any, b any) (any, error) {
func infunc(a interface{}, b interface{}) (interface{}, error) {
switch match := a.(type) {
case string:
switch total := b.(type) {
@@ -206,7 +213,11 @@ func infunc(a any, b any) (any, error) {
case int:
switch total := b.(type) {
case []int:
return slices.Contains(total, match), nil
for _, x := range total {
if x == match {
return true, nil
}
}
case string:
smatch := fmt.Sprintf("%d", match)
return strings.Contains(total, smatch), nil
@@ -222,12 +233,12 @@ func infunc(a any, b any) (any, error) {
* format keys \d = %d, \w = %d, ... Not sure how to fix this
*/
func matchfunc(args ...any) (any, error) {
func matchfunc(args ...interface{}) (interface{}, error) {
switch match := args[0].(type) {
case string:
switch total := args[1].(type) {
case string:
smatch := strings.ReplaceAll(match, "%", "\\")
smatch := strings.Replace(match, "%", "\\", -1)
regex, err := regexp.Compile(smatch)
if err != nil {
return false, err
@@ -244,7 +255,7 @@ func matchfunc(args ...any) (any, error) {
*/
// for a given cpuid, it returns the core id
func getCpuCoreFunc(args any) (any, error) {
func getCpuCoreFunc(args interface{}) (interface{}, error) {
switch cpuid := args.(type) {
case int:
return topo.GetHwthreadCore(cpuid), nil
@@ -253,7 +264,7 @@ func getCpuCoreFunc(args any) (any, error) {
}
// for a given cpuid, it returns the socket id
func getCpuSocketFunc(args any) (any, error) {
func getCpuSocketFunc(args interface{}) (interface{}, error) {
switch cpuid := args.(type) {
case int:
return topo.GetHwthreadSocket(cpuid), nil
@@ -262,7 +273,7 @@ func getCpuSocketFunc(args any) (any, error) {
}
// for a given cpuid, it returns the id of the NUMA node
func getCpuNumaDomainFunc(args any) (any, error) {
func getCpuNumaDomainFunc(args interface{}) (interface{}, error) {
switch cpuid := args.(type) {
case int:
return topo.GetHwthreadNumaDomain(cpuid), nil
@@ -271,7 +282,7 @@ func getCpuNumaDomainFunc(args any) (any, error) {
}
// for a given cpuid, it returns the id of the CPU die
func getCpuDieFunc(args any) (any, error) {
func getCpuDieFunc(args interface{}) (interface{}, error) {
switch cpuid := args.(type) {
case int:
return topo.GetHwthreadDie(cpuid), nil
@@ -280,7 +291,7 @@ func getCpuDieFunc(args any) (any, error) {
}
// for a given core id, it returns the list of cpuids
func getCpuListOfCoreFunc(args any) (any, error) {
func getCpuListOfCoreFunc(args interface{}) (interface{}, error) {
cpulist := make([]int, 0)
switch in := args.(type) {
case int:
@@ -294,7 +305,7 @@ func getCpuListOfCoreFunc(args any) (any, error) {
}
// for a given socket id, it returns the list of cpuids
func getCpuListOfSocketFunc(args any) (any, error) {
func getCpuListOfSocketFunc(args interface{}) (interface{}, error) {
cpulist := make([]int, 0)
switch in := args.(type) {
case int:
@@ -308,7 +319,7 @@ func getCpuListOfSocketFunc(args any) (any, error) {
}
// for a given id of a NUMA domain, it returns the list of cpuids
func getCpuListOfNumaDomainFunc(args any) (any, error) {
func getCpuListOfNumaDomainFunc(args interface{}) (interface{}, error) {
cpulist := make([]int, 0)
switch in := args.(type) {
case int:
@@ -322,7 +333,7 @@ func getCpuListOfNumaDomainFunc(args any) (any, error) {
}
// for a given CPU die id, it returns the list of cpuids
func getCpuListOfDieFunc(args any) (any, error) {
func getCpuListOfDieFunc(args interface{}) (interface{}, error) {
cpulist := make([]int, 0)
switch in := args.(type) {
case int:
@@ -336,14 +347,14 @@ func getCpuListOfDieFunc(args any) (any, error) {
}
// wrapper function to get a list of all cpuids of the node
func getCpuListOfNode() (any, error) {
func getCpuListOfNode() (interface{}, error) {
return topo.HwthreadList(), nil
}
// helper function to get the cpuid list for a CCMetric type tag set (type and type-id)
// since there is no access to the metric data in the function, it should be called like
// `getCpuListOfType()`
func getCpuListOfType(args ...any) (any, error) {
func getCpuListOfType(args ...interface{}) (interface{}, error) {
cpulist := make([]int, 0)
switch typ := args[0].(type) {
case string:

View File

@@ -11,9 +11,9 @@ import (
"sync"
"time"
cclog "github.com/ClusterCockpit/cc-lib/v2/ccLogger"
cclog "github.com/ClusterCockpit/cc-lib/ccLogger"
lp "github.com/ClusterCockpit/cc-lib/v2/ccMessage"
lp "github.com/ClusterCockpit/cc-lib/ccMessage"
agg "github.com/ClusterCockpit/cc-metric-collector/internal/metricAggregator"
mct "github.com/ClusterCockpit/cc-metric-collector/pkg/multiChanTicker"
)
@@ -51,7 +51,7 @@ type MetricCache interface {
}
func (c *metricCache) Init(output chan lp.CCMessage, ticker mct.MultiChanTicker, wg *sync.WaitGroup, numPeriods int) error {
var err error
var err error = nil
c.done = make(chan bool)
c.wg = wg
c.ticker = ticker
@@ -161,8 +161,8 @@ func (c *metricCache) DeleteAggregation(name string) error {
// is the current one, index=1 the last interval and so on. Returns an empty array if a wrong index
// is given (negative index, index larger than configured number of total intervals, ...)
func (c *metricCache) GetPeriod(index int) (time.Time, time.Time, []lp.CCMessage) {
start := time.Now()
stop := time.Now()
var start time.Time = time.Now()
var stop time.Time = time.Now()
var metrics []lp.CCMessage
if index >= 0 && index < c.numPeriods {
pindex := c.curPeriod - index

View File

@@ -10,16 +10,15 @@ package metricRouter
import (
"encoding/json"
"fmt"
"maps"
"os"
"strings"
"sync"
"time"
cclog "github.com/ClusterCockpit/cc-lib/v2/ccLogger"
cclog "github.com/ClusterCockpit/cc-lib/ccLogger"
lp "github.com/ClusterCockpit/cc-lib/v2/ccMessage"
mp "github.com/ClusterCockpit/cc-lib/v2/messageProcessor"
lp "github.com/ClusterCockpit/cc-lib/ccMessage"
mp "github.com/ClusterCockpit/cc-lib/messageProcessor"
agg "github.com/ClusterCockpit/cc-metric-collector/internal/metricAggregator"
mct "github.com/ClusterCockpit/cc-metric-collector/pkg/multiChanTicker"
)
@@ -108,8 +107,10 @@ func (r *metricRouter) Init(ticker mct.MultiChanTicker, wg *sync.WaitGroup, rout
cclog.ComponentError("MetricRouter", err.Error())
return err
}
r.maxForward = max(1, r.config.MaxForward)
r.maxForward = 1
if r.config.MaxForward > r.maxForward {
r.maxForward = r.config.MaxForward
}
if r.config.NumCacheIntervals > 0 {
r.cache, err = NewCache(r.cache_input, r.ticker, &r.cachewg, r.config.NumCacheIntervals)
if err != nil {
@@ -117,74 +118,50 @@ func (r *metricRouter) Init(ticker mct.MultiChanTicker, wg *sync.WaitGroup, rout
return err
}
for _, agg := range r.config.IntervalAgg {
err = r.cache.AddAggregation(agg.Name, agg.Function, agg.Condition, agg.Tags, agg.Meta)
if err != nil {
return fmt.Errorf("MetricCache AddAggregation() failed: %w", err)
}
r.cache.AddAggregation(agg.Name, agg.Function, agg.Condition, agg.Tags, agg.Meta)
}
}
p, err := mp.NewMessageProcessor()
if err != nil {
return fmt.Errorf("MessageProcessor NewMessageProcessor() failed: %w", err)
return fmt.Errorf("initialization of message processor failed: %v", err.Error())
}
r.mp = p
if len(r.config.MessageProcessor) > 0 {
err = r.mp.FromConfigJSON(r.config.MessageProcessor)
if err != nil {
return fmt.Errorf("MessageProcessor FromConfigJSON() failed: %w", err)
return fmt.Errorf("failed parsing JSON for message processor: %v", err.Error())
}
}
for _, mname := range r.config.DropMetrics {
err = r.mp.AddDropMessagesByName(mname)
if err != nil {
return fmt.Errorf("MessageProcessor AddDropMessagesByName() failed: %w", err)
}
r.mp.AddDropMessagesByName(mname)
}
for _, cond := range r.config.DropMetricsIf {
err = r.mp.AddDropMessagesByCondition(cond)
if err != nil {
return fmt.Errorf("MessageProcessor AddDropMessagesByCondition() failed: %w", err)
}
r.mp.AddDropMessagesByCondition(cond)
}
for _, data := range r.config.AddTags {
cond := data.Condition
if cond == "*" {
cond = "true"
}
err = r.mp.AddAddTagsByCondition(cond, data.Key, data.Value)
if err != nil {
return fmt.Errorf("MessageProcessor AddAddTagsByCondition() failed: %w", err)
}
r.mp.AddAddTagsByCondition(cond, data.Key, data.Value)
}
for _, data := range r.config.DelTags {
cond := data.Condition
if cond == "*" {
cond = "true"
}
err = r.mp.AddDeleteTagsByCondition(cond, data.Key, data.Value)
if err != nil {
return fmt.Errorf("MessageProcessor AddDeleteTagsByCondition() failed: %w", err)
}
r.mp.AddDeleteTagsByCondition(cond, data.Key, data.Value)
}
for oldname, newname := range r.config.RenameMetrics {
err = r.mp.AddRenameMetricByName(oldname, newname)
if err != nil {
return fmt.Errorf("MessageProcessor AddRenameMetricByName() failed: %w", err)
}
r.mp.AddRenameMetricByName(oldname, newname)
}
for metricName, prefix := range r.config.ChangeUnitPrefix {
err = r.mp.AddChangeUnitPrefix(fmt.Sprintf("name == '%s'", metricName), prefix)
if err != nil {
return fmt.Errorf("MessageProcessor AddChangeUnitPrefix() failed: %w", err)
}
r.mp.AddChangeUnitPrefix(fmt.Sprintf("name == '%s'", metricName), prefix)
}
r.mp.SetNormalizeUnits(r.config.NormalizeUnits)
err = r.mp.AddAddTagsByCondition("true", r.config.HostnameTagName, r.hostname)
if err != nil {
return fmt.Errorf("MessageProcessor AddAddTagsByCondition() failed: %w", err)
}
r.mp.AddAddTagsByCondition("true", r.config.HostnameTagName, r.hostname)
// r.config.dropMetrics = make(map[string]bool)
// for _, mname := range r.config.DropMetrics {
@@ -193,8 +170,8 @@ func (r *metricRouter) Init(ticker mct.MultiChanTicker, wg *sync.WaitGroup, rout
return nil
}
func getParamMap(point lp.CCMessage) map[string]any {
params := make(map[string]any)
func getParamMap(point lp.CCMessage) map[string]interface{} {
params := make(map[string]interface{})
params["metric"] = point
params["name"] = point.Name()
for key, value := range point.Tags() {
@@ -203,7 +180,9 @@ func getParamMap(point lp.CCMessage) map[string]any {
for key, value := range point.Meta() {
params[key] = value
}
maps.Copy(params, point.Fields())
for key, value := range point.Fields() {
params[key] = value
}
params["timestamp"] = point.Time()
return params
}

View File

@@ -13,11 +13,11 @@ import (
"os"
"path/filepath"
"regexp"
"slices"
"strconv"
"strings"
cclogger "github.com/ClusterCockpit/cc-lib/v2/ccLogger"
cclogger "github.com/ClusterCockpit/cc-lib/ccLogger"
"golang.org/x/exp/slices"
)
const SYSFS_CPUBASE = `/sys/devices/system/cpu`
@@ -80,7 +80,7 @@ func fileToList(path string) []int {
// Create list
list := make([]int, 0)
stringBuffer := strings.TrimSpace(string(buffer))
for valueRangeString := range strings.SplitSeq(stringBuffer, ",") {
for _, valueRangeString := range strings.Split(stringBuffer, ",") {
valueRange := strings.Split(valueRangeString, "-")
switch len(valueRange) {
case 1:

View File

@@ -10,7 +10,7 @@ package multiChanTicker
import (
"time"
cclog "github.com/ClusterCockpit/cc-lib/v2/ccLogger"
cclog "github.com/ClusterCockpit/cc-lib/ccLogger"
)
type multiChanTicker struct {

View File

@@ -44,8 +44,6 @@ def group_to_json(groupfile):
scope = "socket"
if "PWR" in calc:
scope = "socket"
if "UMC" in calc:
scope = "socket"
m = {"name" : metric, "calc": calc, "type" : scope, "publish" : True}
metrics.append(m)