Compare commits

..

1 Commits

Author SHA1 Message Date
Thomas Roehl
b3eb0679b9 Fix max clock metrics 2025-10-20 16:58:39 +02:00
63 changed files with 2135 additions and 2756 deletions

View File

@@ -36,14 +36,22 @@ jobs:
# fetch-depth must be 0 to use git describe
# See: https://github.com/marketplace/actions/checkout
- name: Checkout
uses: actions/checkout@v6
uses: actions/checkout@v4
with:
submodules: recursive
fetch-depth: 0
# - name: Setup Golang
# uses: actions/setup-go@v5
# with:
# go-version: 'stable'
- name: Setup Golang
run: |
dnf --assumeyes --disableplugin=subscription-manager --enablerepo appstream install go-toolset
dnf --assumeyes --disableplugin=subscription-manager install \
https://repo.almalinux.org/almalinux/8/AppStream/x86_64/os/Packages/go-toolset-1.23.9-1.module_el8.10.0+4000+1ad1b2cc.x86_64.rpm \
https://repo.almalinux.org/almalinux/8/AppStream/x86_64/os/Packages/golang-1.23.9-1.module_el8.10.0+4000+1ad1b2cc.x86_64.rpm \
https://repo.almalinux.org/almalinux/8/AppStream/x86_64/os/Packages/golang-bin-1.23.9-1.module_el8.10.0+4000+1ad1b2cc.x86_64.rpm \
https://repo.almalinux.org/almalinux/8/AppStream/x86_64/os/Packages/golang-src-1.23.9-1.module_el8.10.0+4000+1ad1b2cc.noarch.rpm
- name: RPM build MetricCollector
id: rpmbuild
@@ -70,13 +78,13 @@ jobs:
# See: https://github.com/actions/upload-artifact
- name: Save RPM as artifact
uses: actions/upload-artifact@v6
uses: actions/upload-artifact@v4
with:
name: cc-metric-collector RPM for AlmaLinux 8
path: ${{ steps.rpmrename.outputs.RPM }}
overwrite: true
- name: Save SRPM as artifact
uses: actions/upload-artifact@v6
uses: actions/upload-artifact@v4
with:
name: cc-metric-collector SRPM for AlmaLinux 8
path: ${{ steps.rpmrename.outputs.SRPM }}
@@ -106,14 +114,23 @@ jobs:
# fetch-depth must be 0 to use git describe
# See: https://github.com/marketplace/actions/checkout
- name: Checkout
uses: actions/checkout@v6
uses: actions/checkout@v4
with:
submodules: recursive
fetch-depth: 0
# - name: Setup Golang
# uses: actions/setup-go@v5
# with:
# go-version: 'stable'
- name: Setup Golang
run: |
dnf --assumeyes --disableplugin=subscription-manager --enablerepo appstream install go-toolset
dnf --assumeyes --disableplugin=subscription-manager install \
https://repo.almalinux.org/almalinux/9/AppStream/x86_64/os/Packages/go-toolset-1.23.9-1.el9_6.x86_64.rpm \
https://repo.almalinux.org/almalinux/9/AppStream/x86_64/os/Packages/golang-1.23.9-1.el9_6.x86_64.rpm \
https://repo.almalinux.org/almalinux/9/AppStream/x86_64/os/Packages/golang-bin-1.23.9-1.el9_6.x86_64.rpm \
https://repo.almalinux.org/almalinux/9/AppStream/x86_64/os/Packages/golang-src-1.23.9-1.el9_6.noarch.rpm \
https://repo.almalinux.org/almalinux/9/AppStream/x86_64/os/Packages/golang-race-1.23.9-1.el9_6.x86_64.rpm
- name: RPM build MetricCollector
id: rpmbuild
@@ -140,26 +157,25 @@ jobs:
# See: https://github.com/actions/upload-artifact
- name: Save RPM as artifact
uses: actions/upload-artifact@v6
uses: actions/upload-artifact@v4
with:
name: cc-metric-collector RPM for AlmaLinux 9
path: ${{ steps.rpmrename.outputs.RPM }}
overwrite: true
- name: Save SRPM as artifact
uses: actions/upload-artifact@v6
uses: actions/upload-artifact@v4
with:
name: cc-metric-collector SRPM for AlmaLinux 9
path: ${{ steps.rpmrename.outputs.SRPM }}
overwrite: true
#
# Build on Red Hat Universal Base Image (UBI 8) using go-toolset
# Build on UBI 8 using go-toolset
#
UBI-8-RPM-build:
runs-on: ubuntu-latest
# See: https://catalog.redhat.com/en/search?searchType=Containers&q=Red+Hat+Universal+Base+Image+8
# https://hub.docker.com/r/redhat/ubi8
container: redhat/ubi8
# See: https://catalog.redhat.com/software/containers/ubi8/ubi/5c35984d70cc534b3a3784e?container-tabs=gti
container: registry.access.redhat.com/ubi8/ubi:8.8-1032.1692772289
# The job outputs link to the outputs of the 'rpmbuild' step
outputs:
rpm : ${{steps.rpmbuild.outputs.RPM}}
@@ -174,14 +190,22 @@ jobs:
# fetch-depth must be 0 to use git describe
# See: https://github.com/marketplace/actions/checkout
- name: Checkout
uses: actions/checkout@v6
uses: actions/checkout@v4
with:
submodules: recursive
fetch-depth: 0
# - name: Setup Golang
# uses: actions/setup-go@v5
# with:
# go-version: 'stable'
- name: Setup Golang
run: |
dnf --assumeyes --disableplugin=subscription-manager --enablerepo ubi-8-appstream-rpms install go-toolset
dnf --assumeyes --disableplugin=subscription-manager install \
https://repo.almalinux.org/almalinux/8/AppStream/x86_64/os/Packages/go-toolset-1.23.9-1.module_el8.10.0+4000+1ad1b2cc.x86_64.rpm \
https://repo.almalinux.org/almalinux/8/AppStream/x86_64/os/Packages/golang-1.23.9-1.module_el8.10.0+4000+1ad1b2cc.x86_64.rpm \
https://repo.almalinux.org/almalinux/8/AppStream/x86_64/os/Packages/golang-bin-1.23.9-1.module_el8.10.0+4000+1ad1b2cc.x86_64.rpm \
https://repo.almalinux.org/almalinux/8/AppStream/x86_64/os/Packages/golang-src-1.23.9-1.module_el8.10.0+4000+1ad1b2cc.noarch.rpm
- name: RPM build MetricCollector
id: rpmbuild
@@ -191,25 +215,24 @@ jobs:
# See: https://github.com/actions/upload-artifact
- name: Save RPM as artifact
uses: actions/upload-artifact@v6
uses: actions/upload-artifact@v4
with:
name: cc-metric-collector RPM for UBI 8
path: ${{ steps.rpmbuild.outputs.RPM }}
overwrite: true
- name: Save SRPM as artifact
uses: actions/upload-artifact@v6
uses: actions/upload-artifact@v4
with:
name: cc-metric-collector SRPM for UBI 8
path: ${{ steps.rpmbuild.outputs.SRPM }}
overwrite: true
#
# Build on Red Hat Universal Base Image (UBI 9) using go-toolset
# Build on UBI 9 using go-toolset
#
UBI-9-RPM-build:
runs-on: ubuntu-latest
# See: https://catalog.redhat.com/en/search?searchType=Containers&q=Red+Hat+Universal+Base+Image+9
# https://hub.docker.com/r/redhat/ubi9
# See: https://catalog.redhat.com/software/containers/ubi8/ubi/5c359854d70cc534b3a3784e?container-tabs=gti
container: redhat/ubi9
# The job outputs link to the outputs of the 'rpmbuild' step
# The job outputs link to the outputs of the 'rpmbuild' step
@@ -226,14 +249,24 @@ jobs:
# fetch-depth must be 0 to use git describe
# See: https://github.com/marketplace/actions/checkout
- name: Checkout
uses: actions/checkout@v6
uses: actions/checkout@v4
with:
submodules: recursive
fetch-depth: 0
# See: https://github.com/marketplace/actions/setup-go-environment
# - name: Setup Golang
# uses: actions/setup-go@v5
# with:
# go-version: 'stable'
- name: Setup Golang
run: |
dnf --assumeyes --disableplugin=subscription-manager --enablerepo ubi-9-appstream-rpms install go-toolset
dnf --assumeyes --disableplugin=subscription-manager install \
https://repo.almalinux.org/almalinux/9/AppStream/x86_64/os/Packages/go-toolset-1.23.9-1.el9_6.x86_64.rpm \
https://repo.almalinux.org/almalinux/9/AppStream/x86_64/os/Packages/golang-1.23.9-1.el9_6.x86_64.rpm \
https://repo.almalinux.org/almalinux/9/AppStream/x86_64/os/Packages/golang-bin-1.23.9-1.el9_6.x86_64.rpm \
https://repo.almalinux.org/almalinux/9/AppStream/x86_64/os/Packages/golang-src-1.23.9-1.el9_6.noarch.rpm \
https://repo.almalinux.org/almalinux/9/AppStream/x86_64/os/Packages/golang-race-1.23.9-1.el9_6.x86_64.rpm
- name: RPM build MetricCollector
id: rpmbuild
@@ -243,13 +276,13 @@ jobs:
# See: https://github.com/actions/upload-artifact
- name: Save RPM as artifact
uses: actions/upload-artifact@v6
uses: actions/upload-artifact@v4
with:
name: cc-metric-collector RPM for UBI 9
path: ${{ steps.rpmbuild.outputs.RPM }}
overwrite: true
- name: Save SRPM as artifact
uses: actions/upload-artifact@v6
uses: actions/upload-artifact@v4
with:
name: cc-metric-collector SRPM for UBI 9
path: ${{ steps.rpmbuild.outputs.SRPM }}
@@ -275,14 +308,13 @@ jobs:
# fetch-depth must be 0 to use git describe
# See: https://github.com/marketplace/actions/checkout
- name: Checkout
uses: actions/checkout@v6
uses: actions/checkout@v4
with:
submodules: recursive
fetch-depth: 0
# Use official golang package
# See: https://github.com/marketplace/actions/setup-go-environment
- name: Setup Golang
uses: actions/setup-go@v6
uses: actions/setup-go@v5
with:
go-version: 'stable'
@@ -300,7 +332,7 @@ jobs:
echo "DEB=${NEW_DEB_FILE}" >> $GITHUB_OUTPUT
# See: https://github.com/actions/upload-artifact
- name: Save DEB as artifact
uses: actions/upload-artifact@v6
uses: actions/upload-artifact@v4
with:
name: cc-metric-collector DEB for Ubuntu 22.04
path: ${{ steps.debrename.outputs.DEB }}
@@ -326,14 +358,13 @@ jobs:
# fetch-depth must be 0 to use git describe
# See: https://github.com/marketplace/actions/checkout
- name: Checkout
uses: actions/checkout@v6
uses: actions/checkout@v4
with:
submodules: recursive
fetch-depth: 0
# Use official golang package
# See: https://github.com/marketplace/actions/setup-go-environment
- name: Setup Golang
uses: actions/setup-go@v6
uses: actions/setup-go@v5
with:
go-version: 'stable'
@@ -351,7 +382,7 @@ jobs:
echo "DEB=${NEW_DEB_FILE}" >> $GITHUB_OUTPUT
# See: https://github.com/actions/upload-artifact
- name: Save DEB as artifact
uses: actions/upload-artifact@v6
uses: actions/upload-artifact@v4
with:
name: cc-metric-collector DEB for Ubuntu 24.04
path: ${{ steps.debrename.outputs.DEB }}
@@ -369,48 +400,48 @@ jobs:
steps:
# See: https://github.com/actions/download-artifact
- name: Download AlmaLinux 8 RPM
uses: actions/download-artifact@v7
uses: actions/download-artifact@v4
with:
name: cc-metric-collector RPM for AlmaLinux 8
- name: Download AlmaLinux 8 SRPM
uses: actions/download-artifact@v7
uses: actions/download-artifact@v4
with:
name: cc-metric-collector SRPM for AlmaLinux 8
- name: Download AlmaLinux 9 RPM
uses: actions/download-artifact@v7
uses: actions/download-artifact@v4
with:
name: cc-metric-collector RPM for AlmaLinux 9
- name: Download AlmaLinux 9 SRPM
uses: actions/download-artifact@v7
uses: actions/download-artifact@v4
with:
name: cc-metric-collector SRPM for AlmaLinux 9
- name: Download UBI 8 RPM
uses: actions/download-artifact@v7
uses: actions/download-artifact@v4
with:
name: cc-metric-collector RPM for UBI 8
- name: Download UBI 8 SRPM
uses: actions/download-artifact@v7
uses: actions/download-artifact@v4
with:
name: cc-metric-collector SRPM for UBI 8
- name: Download UBI 9 RPM
uses: actions/download-artifact@v7
uses: actions/download-artifact@v4
with:
name: cc-metric-collector RPM for UBI 9
- name: Download UBI 9 SRPM
uses: actions/download-artifact@v7
uses: actions/download-artifact@v4
with:
name: cc-metric-collector SRPM for UBI 9
- name: Download Ubuntu 22.04 DEB
uses: actions/download-artifact@v7
uses: actions/download-artifact@v4
with:
name: cc-metric-collector DEB for Ubuntu 22.04
- name: Download Ubuntu 24.04 DEB
uses: actions/download-artifact@v7
uses: actions/download-artifact@v4
with:
name: cc-metric-collector DEB for Ubuntu 24.04

View File

@@ -20,41 +20,25 @@ jobs:
# See: https://github.com/marketplace/actions/checkout
# Checkout git repository and submodules
- name: Checkout
uses: actions/checkout@v6
uses: actions/checkout@v4
with:
submodules: recursive
# See: https://github.com/marketplace/actions/setup-go-environment
- name: Setup Golang
uses: actions/setup-go@v6
uses: actions/setup-go@v5
with:
go-version: 'stable'
go-version: '1.21'
check-latest: true
- name: Install reviewdog
run: |
go install github.com/reviewdog/reviewdog/cmd/reviewdog@latest
# See: https://golangci-lint.run
- name: Install GolangCI-Lint
run: |
go install github.com/golangci/golangci-lint/v2/cmd/golangci-lint@latest
- name: Build MetricCollector
run: make
- name: Run MetricCollector once
run: ./cc-metric-collector --once --config .github/ci-config.json
# Running the linter requires likwid.h, which gets downloaded in the build step
- name: Static Analysis with GolangCI-Lint and Upload Report with reviewdog
run: |
golangci-lint run --enable errorlint,govet,misspell,modernize,prealloc,staticcheck,unconvert,wastedassign | reviewdog -f=golangci-lint -name "Check golangci-lint on build-latest" -reporter=github-check -filter-mode=nofilter -fail-level none
env:
REVIEWDOG_GITHUB_API_TOKEN: ${{ secrets.GITHUB_TOKEN }}
#
# Build on AlmaLinux 8 using go-toolset
# Build on AlmaLinux 8
#
AlmaLinux8-RPM-build:
runs-on: ubuntu-latest
@@ -74,14 +58,23 @@ jobs:
# fetch-depth must be 0 to use git describe
# See: https://github.com/marketplace/actions/checkout
- name: Checkout
uses: actions/checkout@v6
uses: actions/checkout@v4
with:
submodules: recursive
fetch-depth: 0
# See: https://github.com/marketplace/actions/setup-go-environment
# - name: Setup Golang
# uses: actions/setup-go@v5
# with:
# go-version: 'stable'
- name: Setup Golang
run: |
dnf --assumeyes --disableplugin=subscription-manager --enablerepo appstream install go-toolset
dnf --assumeyes --disableplugin=subscription-manager install \
https://repo.almalinux.org/almalinux/8/AppStream/x86_64/os/Packages/go-toolset-1.23.9-1.module_el8.10.0+4000+1ad1b2cc.x86_64.rpm \
https://repo.almalinux.org/almalinux/8/AppStream/x86_64/os/Packages/golang-1.23.9-1.module_el8.10.0+4000+1ad1b2cc.x86_64.rpm \
https://repo.almalinux.org/almalinux/8/AppStream/x86_64/os/Packages/golang-bin-1.23.9-1.module_el8.10.0+4000+1ad1b2cc.x86_64.rpm \
https://repo.almalinux.org/almalinux/8/AppStream/x86_64/os/Packages/golang-src-1.23.9-1.module_el8.10.0+4000+1ad1b2cc.noarch.rpm
- name: RPM build MetricCollector
id: rpmbuild
@@ -90,7 +83,7 @@ jobs:
make RPM
#
# Build on AlmaLinux 9 using go-toolset
# Build on AlmaLinux 9
#
AlmaLinux9-RPM-build:
runs-on: ubuntu-latest
@@ -110,14 +103,24 @@ jobs:
# fetch-depth must be 0 to use git describe
# See: https://github.com/marketplace/actions/checkout
- name: Checkout
uses: actions/checkout@v6
uses: actions/checkout@v4
with:
submodules: recursive
fetch-depth: 0
# See: https://github.com/marketplace/actions/setup-go-environment
# - name: Setup Golang
# uses: actions/setup-go@v5
# with:
# go-version: 'stable'
- name: Setup Golang
run: |
dnf --assumeyes --disableplugin=subscription-manager --enablerepo appstream install go-toolset
dnf --assumeyes --disableplugin=subscription-manager install \
https://repo.almalinux.org/almalinux/9/AppStream/x86_64/os/Packages/go-toolset-1.23.9-1.el9_6.x86_64.rpm \
https://repo.almalinux.org/almalinux/9/AppStream/x86_64/os/Packages/golang-1.23.9-1.el9_6.x86_64.rpm \
https://repo.almalinux.org/almalinux/9/AppStream/x86_64/os/Packages/golang-bin-1.23.9-1.el9_6.x86_64.rpm \
https://repo.almalinux.org/almalinux/9/AppStream/x86_64/os/Packages/golang-src-1.23.9-1.el9_6.noarch.rpm \
https://repo.almalinux.org/almalinux/9/AppStream/x86_64/os/Packages/golang-race-1.23.9-1.el9_6.x86_64.rpm
- name: RPM build MetricCollector
id: rpmbuild
@@ -125,49 +128,13 @@ jobs:
git config --global --add safe.directory /__w/cc-metric-collector/cc-metric-collector
make RPM
#
# Build on AlmaLinux 10 using go-toolset
#
AlmaLinux10-RPM-build:
runs-on: ubuntu-latest
# See: https://hub.docker.com/_/almalinux
container: almalinux:10
# The job outputs link to the outputs of the 'rpmrename' step
# Only job outputs can be used in child jobs
steps:
# Use dnf to install development packages
- name: Install development packages
run: |
dnf --assumeyes group install "Development Tools" "RPM Development Tools"
dnf --assumeyes install wget openssl-devel diffutils delve which
# Checkout git repository and submodules
# fetch-depth must be 0 to use git describe
# See: https://github.com/marketplace/actions/checkout
- name: Checkout
uses: actions/checkout@v6
with:
submodules: recursive
fetch-depth: 0
- name: Setup Golang
run: |
dnf --assumeyes --disableplugin=subscription-manager --enablerepo appstream install go-toolset
- name: RPM build MetricCollector
id: rpmbuild
run: |
git config --global --add safe.directory /__w/cc-metric-collector/cc-metric-collector
make RPM
#
# Build on Red Hat Universal Base Image (UBI 8) using go-toolset
# Build on UBI 8 using go-toolset
#
UBI-8-RPM-build:
runs-on: ubuntu-latest
# See: https://catalog.redhat.com/en/search?searchType=Containers&q=Red+Hat+Universal+Base+Image+8
# https://hub.docker.com/r/redhat/ubi8
# See: https://catalog.redhat.com/software/containers/ubi8/ubi/5c359854d70cc534b3a3784e?container-tabs=gti
container: redhat/ubi8
# The job outputs link to the outputs of the 'rpmbuild' step
steps:
@@ -180,14 +147,23 @@ jobs:
# fetch-depth must be 0 to use git describe
# See: https://github.com/marketplace/actions/checkout
- name: Checkout
uses: actions/checkout@v6
uses: actions/checkout@v4
with:
submodules: recursive
fetch-depth: 0
# See: https://github.com/marketplace/actions/setup-go-environment
# - name: Setup Golang
# uses: actions/setup-go@v5
# with:
# go-version: 'stable'
- name: Setup Golang
run: |
dnf --assumeyes --disableplugin=subscription-manager --enablerepo ubi-8-appstream-rpms install go-toolset
dnf --assumeyes --disableplugin=subscription-manager install \
https://repo.almalinux.org/almalinux/8/AppStream/x86_64/os/Packages/go-toolset-1.23.9-1.module_el8.10.0+4000+1ad1b2cc.x86_64.rpm \
https://repo.almalinux.org/almalinux/8/AppStream/x86_64/os/Packages/golang-1.23.9-1.module_el8.10.0+4000+1ad1b2cc.x86_64.rpm \
https://repo.almalinux.org/almalinux/8/AppStream/x86_64/os/Packages/golang-bin-1.23.9-1.module_el8.10.0+4000+1ad1b2cc.x86_64.rpm \
https://repo.almalinux.org/almalinux/8/AppStream/x86_64/os/Packages/golang-src-1.23.9-1.module_el8.10.0+4000+1ad1b2cc.noarch.rpm
- name: RPM build MetricCollector
id: rpmbuild
@@ -196,12 +172,11 @@ jobs:
make RPM
#
# Build on Red Hat Universal Base Image (UBI 9) using go-toolset
# Build on UBI 9 using go-toolset
#
UBI-9-RPM-build:
runs-on: ubuntu-latest
# See: https://catalog.redhat.com/en/search?searchType=Containers&q=Red+Hat+Universal+Base+Image+9
# https://hub.docker.com/r/redhat/ubi9
# See: https://catalog.redhat.com/software/containers/ubi8/ubi/5c359854d70cc534b3a3784e?container-tabs=gti
container: redhat/ubi9
# The job outputs link to the outputs of the 'rpmbuild' step
steps:
@@ -214,48 +189,24 @@ jobs:
# fetch-depth must be 0 to use git describe
# See: https://github.com/marketplace/actions/checkout
- name: Checkout
uses: actions/checkout@v6
uses: actions/checkout@v4
with:
submodules: recursive
fetch-depth: 0
# See: https://github.com/marketplace/actions/setup-go-environment
# - name: Setup Golang
# uses: actions/setup-go@v5
# with:
# go-version: 'stable'
- name: Setup Golang
run: |
dnf --assumeyes --disableplugin=subscription-manager --enablerepo ubi-9-appstream-rpms install go-toolset
- name: RPM build MetricCollector
id: rpmbuild
run: |
git config --global --add safe.directory /__w/cc-metric-collector/cc-metric-collector
make RPM
#
# Build on Red Hat Universal Base Image (UBI 10) using go-toolset
#
UBI-10-RPM-build:
runs-on: ubuntu-latest
# See: https://catalog.redhat.com/en/search?searchType=Containers&q=Red+Hat+Universal+Base+Image+10
# https://hub.docker.com/r/redhat/ubi10
container: redhat/ubi10
# The job outputs link to the outputs of the 'rpmbuild' step
steps:
# Use dnf to install development packages
- name: Install development packages
run: dnf --assumeyes --disableplugin=subscription-manager install rpm-build go-srpm-macros gcc make python3 git wget openssl-devel diffutils delve
# Checkout git repository and submodules
# fetch-depth must be 0 to use git describe
# See: https://github.com/marketplace/actions/checkout
- name: Checkout
uses: actions/checkout@v6
with:
submodules: recursive
fetch-depth: 0
- name: Setup Golang
run: |
dnf --assumeyes --disableplugin=subscription-manager --enablerepo ubi-10-for-x86_64-appstream-rpms install go-toolset
dnf --assumeyes --disableplugin=subscription-manager install \
https://repo.almalinux.org/almalinux/9/AppStream/x86_64/os/Packages/go-toolset-1.23.9-1.el9_6.x86_64.rpm \
https://repo.almalinux.org/almalinux/9/AppStream/x86_64/os/Packages/golang-1.23.9-1.el9_6.x86_64.rpm \
https://repo.almalinux.org/almalinux/9/AppStream/x86_64/os/Packages/golang-bin-1.23.9-1.el9_6.x86_64.rpm \
https://repo.almalinux.org/almalinux/9/AppStream/x86_64/os/Packages/golang-src-1.23.9-1.el9_6.noarch.rpm \
https://repo.almalinux.org/almalinux/9/AppStream/x86_64/os/Packages/golang-race-1.23.9-1.el9_6.x86_64.rpm
- name: RPM build MetricCollector
id: rpmbuild
@@ -280,14 +231,14 @@ jobs:
# fetch-depth must be 0 to use git describe
# See: https://github.com/marketplace/actions/checkout
- name: Checkout
uses: actions/checkout@v6
uses: actions/checkout@v4
with:
submodules: recursive
fetch-depth: 0
# Use official golang package
# See: https://github.com/marketplace/actions/setup-go-environment
- name: Setup Golang
uses: actions/setup-go@v6
uses: actions/setup-go@v5
with:
go-version: 'stable'
@@ -314,14 +265,14 @@ jobs:
# fetch-depth must be 0 to use git describe
# See: https://github.com/marketplace/actions/checkout
- name: Checkout
uses: actions/checkout@v6
uses: actions/checkout@v4
with:
submodules: recursive
fetch-depth: 0
# Use official golang package
# See: https://github.com/marketplace/actions/setup-go-environment
- name: Setup Golang
uses: actions/setup-go@v6
uses: actions/setup-go@v5
with:
go-version: 'stable'

4
.gitignore vendored
View File

@@ -1,5 +1,4 @@
# Binaries for programs and plugins
/cc-metric-collector
*.exe
*.exe~
*.dll
@@ -14,6 +13,3 @@
# Dependency directories (remove the comment below to include it)
# vendor/
# Local copy of LIKWID headers
/collectors/likwid

View File

@@ -27,17 +27,6 @@ $(APP): $(GOSRC) go.mod
$(GOBIN) get
$(GOBIN) build -o $(APP) $(GOSRC_APP)
# -ldflags:
# -s : drops the OS symbol table
# -w : drops DWARF
# -> Panic stack traces still show function names and file:line
.PHONY: build-stripped
build-stripped:
make -C collectors
$(GOBIN) get
$(GOBIN) build -ldflags "-s -w" -trimpath -o $(APP) $(GOSRC_APP)
.PHONY: install
install: $(APP)
@WORKSPACE=$(PREFIX)
@if [ -z "$${WORKSPACE}" ]; then exit 1; fi
@@ -69,26 +58,12 @@ fmt:
$(GOBIN) fmt $(GOSRC_APP)
@for F in $(GOSRC_INTERNAL); do $(GOBIN) fmt $$F; done
# gofumpt <https://github.com/mvdan/gofumpt>:
# Enforce a stricter format than gofmt
.PHONY: gofumpt
gofumpt:
$(GOBIN) install mvdan.cc/gofumpt@latest
gofumpt -w $(GOSRC_COLLECTORS)
gofumpt -w $(GOSRC_SINKS)
gofumpt -w $(GOSRC_RECEIVERS)
gofumpt -w $(GOSRC_APP)
@for F in $(GOSRC_INTERNAL); do gofumpt -w $$F; done
# Examine Go source code and reports suspicious constructs
.PHONY: vet
vet:
$(GOBIN) vet ./...
.PHONY: modernize
modernize:
$(GOBIN) run golang.org/x/tools/go/analysis/passes/modernize/cmd/modernize@latest ./...
# Run linter for the Go programming language.
# Using static analysis, it finds bugs and performance issues, offers simplifications, and enforces style rules
@@ -97,11 +72,6 @@ staticcheck:
$(GOBIN) install honnef.co/go/tools/cmd/staticcheck@latest
$$($(GOBIN) env GOPATH)/bin/staticcheck ./...
.PHONY: golangci-lint
golangci-lint:
$(GOBIN) install github.com/golangci/golangci-lint/v2/cmd/golangci-lint@latest
$$($(GOBIN) env GOPATH)/bin/golangci-lint run --enable errorlint,govet,misspell,modernize,prealloc,staticcheck,unconvert,wastedassign
.ONESHELL:
.PHONY: RPM
RPM: scripts/cc-metric-collector.spec

View File

@@ -8,22 +8,23 @@
package main
import (
"bytes"
"encoding/json"
"flag"
"os"
"os/signal"
"sync"
"syscall"
"time"
"github.com/ClusterCockpit/cc-lib/v2/receivers"
"github.com/ClusterCockpit/cc-lib/v2/sinks"
"github.com/ClusterCockpit/cc-lib/receivers"
"github.com/ClusterCockpit/cc-lib/sinks"
"github.com/ClusterCockpit/cc-metric-collector/collectors"
ccconf "github.com/ClusterCockpit/cc-lib/v2/ccConfig"
cclog "github.com/ClusterCockpit/cc-lib/v2/ccLogger"
lp "github.com/ClusterCockpit/cc-lib/v2/ccMessage"
// "strings"
"sync"
"time"
ccconf "github.com/ClusterCockpit/cc-lib/ccConfig"
cclog "github.com/ClusterCockpit/cc-lib/ccLogger"
lp "github.com/ClusterCockpit/cc-lib/ccMessage"
mr "github.com/ClusterCockpit/cc-metric-collector/internal/metricRouter"
mct "github.com/ClusterCockpit/cc-metric-collector/pkg/multiChanTicker"
)
@@ -49,25 +50,65 @@ type RuntimeConfig struct {
Sync sync.WaitGroup
}
// ReadCli reads the command line arguments
//// Structure of the configuration file
//type GlobalConfig struct {
// Sink sinks.SinkConfig `json:"sink"`
// Interval int `json:"interval"`
// Duration int `json:"duration"`
// Collectors []string `json:"collectors"`
// Receiver receivers.ReceiverConfig `json:"receiver"`
// DefTags map[string]string `json:"default_tags"`
// CollectConfigs map[string]json.RawMessage `json:"collect_config"`
//}
//// Load JSON configuration file
//func LoadConfiguration(file string, config *GlobalConfig) error {
// configFile, err := os.Open(file)
// defer configFile.Close()
// if err != nil {
// fmt.Println(err.Error())
// return err
// }
// jsonParser := json.NewDecoder(configFile)
// err = jsonParser.Decode(config)
// return err
//}
func ReadCli() map[string]string {
var m map[string]string
cfg := flag.String("config", "./config.json", "Path to configuration file")
logfile := flag.String("log", "stderr", "Path for logfile")
once := flag.Bool("once", false, "Run all collectors only once")
loglevel := flag.String("loglevel", "info", "Set log level")
flag.Parse()
m := map[string]string{
"configfile": *cfg,
"logfile": *logfile,
"once": "false",
"loglevel": *loglevel,
}
m = make(map[string]string)
m["configfile"] = *cfg
m["logfile"] = *logfile
if *once {
m["once"] = "true"
} else {
m["once"] = "false"
}
m["loglevel"] = *loglevel
return m
}
//func SetLogging(logfile string) error {
// var file *os.File
// var err error
// if logfile != "stderr" {
// file, err = os.OpenFile(logfile, os.O_APPEND|os.O_CREATE|os.O_WRONLY, 0600)
// if err != nil {
// log.Fatal(err)
// return err
// }
// } else {
// file = os.Stderr
// }
// log.SetOutput(file)
// return nil
//}
// General shutdownHandler function that gets executed in case of interrupt or graceful shutdownHandler
func shutdownHandler(config *RuntimeConfig, shutdownSignal chan os.Signal) {
defer config.Sync.Done()
@@ -121,10 +162,9 @@ func mainFunc() int {
// Load and check configuration
main := ccconf.GetPackageConfig("main")
d := json.NewDecoder(bytes.NewReader(main))
d.DisallowUnknownFields()
if err := d.Decode(&rcfg.ConfigFile); err != nil {
cclog.Errorf("Error reading configuration file %s: %v", rcfg.CliArgs["configfile"], err)
err = json.Unmarshal(main, &rcfg.ConfigFile)
if err != nil {
cclog.Error("Error reading configuration file ", rcfg.CliArgs["configfile"], ": ", err.Error())
return 1
}
@@ -176,6 +216,11 @@ func mainFunc() int {
return 1
}
// Set log file
// if logfile := rcfg.CliArgs["logfile"]; logfile != "stderr" {
// cclog.SetOutput(logfile)
// }
// Creat new multi channel ticker
rcfg.MultiChanTicker = mct.NewTicker(rcfg.Interval)

View File

@@ -1,19 +1,6 @@
{
"cpufreq": {},
"cpufreq_cpuinfo": {},
"cpustat": {
"exclude_metrics": [
"cpu_idle"
]
},
"diskstat": {
"exclude_metrics": [
"disk_total"
],
"exclude_mounts": [
"slurm-tmpfs"
]
},
"gpfs": {
"exclude_filesystem": [
"test_fs"
@@ -34,8 +21,6 @@
},
"numastats": {},
"nvidia": {},
"schedstat": {},
"smartmon": {},
"tempstat": {
"report_max_temperature": true,
"report_critical_temperature": true,

View File

@@ -1,5 +1,5 @@
# LIKWID version
LIKWID_VERSION := 5.5.1
LIKWID_VERSION := 5.4.1
LIKWID_INSTALLED_FOLDER := $(shell dirname $$(which likwid-topology 2>/dev/null) 2>/dev/null)
LIKWID_FOLDER := $(CURDIR)/likwid

View File

@@ -59,7 +59,6 @@ In contrast to the configuration files for sinks and receivers, the collectors c
* [ ] Aggreate metrics to higher topology entity (sum hwthread metrics to socket metric, ...). Needs to be configurable
# Contributing own collectors
A collector reads data from any source, parses it to metrics and submits these metrics to the `metric-collector`. A collector provides three function:
* `Name() string`: Return the name of the collector
@@ -68,7 +67,7 @@ A collector reads data from any source, parses it to metrics and submits these m
* `Read(duration time.Duration, output chan ccMessage.CCMessage)`: Read, parse and submit data to the `output` channel as [`CCMessage`](https://github.com/ClusterCockpit/cc-lib/blob/main/ccMessage/README.md). If the collector has to measure anything for some duration, use the provided function argument `duration`.
* `Close()`: Closes down the collector.
It is recommended to call `setup()` in the `Init()` function.
It is recommanded to call `setup()` in the `Init()` function.
Finally, the collector needs to be registered in the `collectorManager.go`. There is a list of collectors called `AvailableCollectors` which is a map (`collector_type_string` -> `pointer to MetricCollector interface`). Add a new entry with a descriptive name and the new collector.
@@ -101,14 +100,11 @@ func (m *SampleCollector) Init(config json.RawMessage) error {
}
m.name = "SampleCollector"
if err := m.setup(); err != nil {
return fmt.Errorf("%s Init(): setup() call failed: %w", m.name, err)
}
m.setup()
if len(config) > 0 {
d := json.NewDecoder(bytes.NewReader(config))
d.DisallowUnknownFields()
if err := d.Decode(&m.config); err != nil {
return fmt.Errorf("%s Init(): Error decoding JSON config: %w", m.name, err)
err := json.Unmarshal(config, &m.config)
if err != nil {
return err
}
}
m.meta = map[string]string{"source": m.name, "group": "Sample"}

View File

@@ -17,13 +17,12 @@ import (
"os/exec"
"os/user"
"regexp"
"slices"
"strconv"
"strings"
"time"
cclog "github.com/ClusterCockpit/cc-lib/v2/ccLogger"
lp "github.com/ClusterCockpit/cc-lib/v2/ccMessage"
cclog "github.com/ClusterCockpit/cc-lib/ccLogger"
lp "github.com/ClusterCockpit/cc-lib/ccMessage"
)
const DEFAULT_BEEGFS_CMD = "beegfs-ctl"
@@ -32,12 +31,11 @@ const DEFAULT_BEEGFS_CMD = "beegfs-ctl"
type BeegfsMetaCollectorConfig struct {
Beegfs string `json:"beegfs_path"`
ExcludeMetrics []string `json:"exclude_metrics,omitempty"`
ExcludeFilesystems []string `json:"exclude_filesystem"`
ExcludeFilesystem []string `json:"exclude_filesystem"`
}
type BeegfsMetaCollector struct {
metricCollector
tags map[string]string
matches map[string]string
config BeegfsMetaCollectorConfig
@@ -50,7 +48,7 @@ func (m *BeegfsMetaCollector) Init(config json.RawMessage) error {
return nil
}
// Metrics
nodeMdstat_array := [39]string{
var nodeMdstat_array = [39]string{
"sum", "ack", "close", "entInf",
"fndOwn", "mkdir", "create", "rddir",
"refrEn", "mdsInf", "rmdir", "rmLnk",
@@ -60,13 +58,10 @@ func (m *BeegfsMetaCollector) Init(config json.RawMessage) error {
"lookLI", "statLI", "revalLI", "openLI",
"createLI", "hardlnk", "flckAp", "flckEn",
"flckRg", "dirparent", "listXA", "getXA",
"rmXA", "setXA", "mirror",
}
"rmXA", "setXA", "mirror"}
m.name = "BeegfsMetaCollector"
if err := m.setup(); err != nil {
return fmt.Errorf("%s Init(): setup() call failed: %w", m.name, err)
}
m.setup()
m.parallel = true
// Set default beegfs-ctl binary
@@ -74,17 +69,17 @@ func (m *BeegfsMetaCollector) Init(config json.RawMessage) error {
// Read JSON configuration
if len(config) > 0 {
d := json.NewDecoder(bytes.NewReader(config))
d.DisallowUnknownFields()
if err := d.Decode(&m.config); err != nil {
return fmt.Errorf("%s Init(): Failed to decode JSON config: %w", m.name, err)
err := json.Unmarshal(config, &m.config)
if err != nil {
return err
}
}
// Create map with possible variables
//create map with possible variables
m.matches = make(map[string]string)
for _, value := range nodeMdstat_array {
if slices.Contains(m.config.ExcludeMetrics, value) {
_, skip := stringArrayContains(m.config.ExcludeMetrics, value)
if skip {
m.matches["other"] = "0"
} else {
m.matches["beegfs_cmeta_"+value] = "0"
@@ -100,23 +95,23 @@ func (m *BeegfsMetaCollector) Init(config json.RawMessage) error {
"filesystem": "",
}
m.skipFS = make(map[string]struct{})
for _, fs := range m.config.ExcludeFilesystems {
for _, fs := range m.config.ExcludeFilesystem {
m.skipFS[fs] = struct{}{}
}
// Beegfs file system statistics can only be queried by user root
user, err := user.Current()
if err != nil {
return fmt.Errorf("%s Init(): Failed to get current user: %w", m.name, err)
return fmt.Errorf("BeegfsMetaCollector.Init(): Failed to get current user: %v", err)
}
if user.Uid != "0" {
return fmt.Errorf("%s Init(): BeeGFS file system statistics can only be queried by user root", m.name)
return fmt.Errorf("BeegfsMetaCollector.Init(): BeeGFS file system statistics can only be queried by user root")
}
// Check if beegfs-ctl is in executable search path
_, err = exec.LookPath(m.config.Beegfs)
if err != nil {
return fmt.Errorf("%s Init(): Failed to find beegfs-ctl binary '%s': %w", m.name, m.config.Beegfs, err)
return fmt.Errorf("BeegfsMetaCollector.Init(): Failed to find beegfs-ctl binary '%s': %v", m.config.Beegfs, err)
}
m.init = true
return nil
@@ -126,7 +121,7 @@ func (m *BeegfsMetaCollector) Read(interval time.Duration, output chan lp.CCMess
if !m.init {
return
}
// Get mounpoint
//get mounpoint
buffer, _ := os.ReadFile(string("/proc/mounts"))
mounts := strings.Split(string(buffer), "\n")
var mountpoints []string
@@ -156,6 +151,7 @@ func (m *BeegfsMetaCollector) Read(interval time.Duration, output chan lp.CCMess
// --nodetype=meta: The node type to query (meta, storage).
// --interval:
// --mount=/mnt/beeond/: Which mount point
//cmd := exec.Command(m.config.Beegfs, "/root/mc/test.txt")
mountoption := "--mount=" + mountpoint
cmd := exec.Command(m.config.Beegfs, "--clientstats",
"--nodetype=meta", mountoption, "--allstats")
@@ -166,27 +162,26 @@ func (m *BeegfsMetaCollector) Read(interval time.Duration, output chan lp.CCMess
cmd.Stderr = cmdStderr
err := cmd.Run()
if err != nil {
dataStdErr, _ := io.ReadAll(cmdStderr)
dataStdOut, _ := io.ReadAll(cmdStdout)
cclog.ComponentError(
m.name,
fmt.Sprintf("Read(): Failed to execute command \"%s\": %v\n", cmd.String(), err),
fmt.Sprintf("Read(): command exit code: \"%d\"\n", cmd.ProcessState.ExitCode()),
fmt.Sprintf("Read(): command stderr: \"%s\"\n", string(dataStdErr)),
fmt.Sprintf("Read(): command stdout: \"%s\"\n", string(dataStdOut)),
)
fmt.Fprintf(os.Stderr, "BeegfsMetaCollector.Read(): Failed to execute command \"%s\": %s\n", cmd.String(), err.Error())
fmt.Fprintf(os.Stderr, "BeegfsMetaCollector.Read(): command exit code: \"%d\"\n", cmd.ProcessState.ExitCode())
data, _ := io.ReadAll(cmdStderr)
fmt.Fprintf(os.Stderr, "BeegfsMetaCollector.Read(): command stderr: \"%s\"\n", string(data))
data, _ = io.ReadAll(cmdStdout)
fmt.Fprintf(os.Stderr, "BeegfsMetaCollector.Read(): command stdout: \"%s\"\n", string(data))
return
}
// Read I/O statistics
scanner := bufio.NewScanner(cmdStdout)
sumLine := regexp.MustCompile(`^Sum:\s+\d+\s+\[[a-zA-Z]+\]+`)
//Line := regexp.MustCompile(`^(.*)\s+(\d)+\s+\[([a-zA-Z]+)\]+`)
statsLine := regexp.MustCompile(`^(.*?)\s+?(\d.*?)$`)
singleSpacePattern := regexp.MustCompile(`\s+`)
removePattern := regexp.MustCompile(`[\[|\]]`)
for scanner.Scan() {
readLine := scanner.Text()
//fmt.Println(readLine)
// Jump few lines, we only want the I/O stats from nodes
if !sumLine.MatchString(readLine) {
continue
@@ -195,7 +190,7 @@ func (m *BeegfsMetaCollector) Read(interval time.Duration, output chan lp.CCMess
match := statsLine.FindStringSubmatch(readLine)
// nodeName = "Sum:" or would be nodes
// nodeName := match[1]
// Remove multiple whitespaces
//Remove multiple whitespaces
dummy := removePattern.ReplaceAllString(match[2], " ")
metaStats := strings.TrimSpace(singleSpacePattern.ReplaceAllString(dummy, " "))
split := strings.Split(metaStats, " ")
@@ -221,13 +216,14 @@ func (m *BeegfsMetaCollector) Read(interval time.Duration, output chan lp.CCMess
fmt.Sprintf("Metric (other): Failed to convert str written '%s' to float: %v", m.matches["other"], err))
continue
}
//mdStat["other"] = fmt.Sprintf("%f", f1+f2)
m.matches["beegfs_cstorage_other"] = fmt.Sprintf("%f", f1+f2)
}
}
for key, data := range m.matches {
value, _ := strconv.ParseFloat(data, 32)
y, err := lp.NewMessage(key, m.tags, m.meta, map[string]any{"value": value}, time.Now())
y, err := lp.NewMessage(key, m.tags, m.meta, map[string]interface{}{"value": value}, time.Now())
if err == nil {
output <- y
}

View File

@@ -17,25 +17,23 @@ import (
"os/exec"
"os/user"
"regexp"
"slices"
"strconv"
"strings"
"time"
cclog "github.com/ClusterCockpit/cc-lib/v2/ccLogger"
lp "github.com/ClusterCockpit/cc-lib/v2/ccMessage"
cclog "github.com/ClusterCockpit/cc-lib/ccLogger"
lp "github.com/ClusterCockpit/cc-lib/ccMessage"
)
// Struct for the collector-specific JSON config
type BeegfsStorageCollectorConfig struct {
Beegfs string `json:"beegfs_path"`
ExcludeMetrics []string `json:"exclude_metrics,omitempty"`
ExcludeFilesystems []string `json:"exclude_filesystem"`
ExcludeFilesystem []string `json:"exclude_filesystem"`
}
type BeegfsStorageCollector struct {
metricCollector
tags map[string]string
matches map[string]string
config BeegfsStorageCollectorConfig
@@ -48,18 +46,15 @@ func (m *BeegfsStorageCollector) Init(config json.RawMessage) error {
return nil
}
// Metrics
storageStat_array := [18]string{
var storageStat_array = [18]string{
"sum", "ack", "sChDrct", "getFSize",
"sAttr", "statfs", "trunc", "close",
"fsync", "ops-rd", "MiB-rd/s", "ops-wr",
"MiB-wr/s", "gendbg", "hrtbeat", "remNode",
"storInf", "unlnk",
}
"storInf", "unlnk"}
m.name = "BeegfsStorageCollector"
if err := m.setup(); err != nil {
return fmt.Errorf("%s Init(): setup() call failed: %w", m.name, err)
}
m.setup()
m.parallel = true
// Set default beegfs-ctl binary
@@ -67,17 +62,17 @@ func (m *BeegfsStorageCollector) Init(config json.RawMessage) error {
// Read JSON configuration
if len(config) > 0 {
d := json.NewDecoder(bytes.NewReader(config))
d.DisallowUnknownFields()
if err := d.Decode(&m.config); err != nil {
return fmt.Errorf("%s Init(): failed to decode JSON config: %w", m.name, err)
err := json.Unmarshal(config, &m.config)
if err != nil {
return err
}
}
// Create map with possible variables
println(m.config.Beegfs)
//create map with possible variables
m.matches = make(map[string]string)
for _, value := range storageStat_array {
if slices.Contains(m.config.ExcludeMetrics, value) {
_, skip := stringArrayContains(m.config.ExcludeMetrics, value)
if skip {
m.matches["other"] = "0"
} else {
m.matches["beegfs_cstorage_"+value] = "0"
@@ -93,23 +88,23 @@ func (m *BeegfsStorageCollector) Init(config json.RawMessage) error {
"filesystem": "",
}
m.skipFS = make(map[string]struct{})
for _, fs := range m.config.ExcludeFilesystems {
for _, fs := range m.config.ExcludeFilesystem {
m.skipFS[fs] = struct{}{}
}
// Beegfs file system statistics can only be queried by user root
user, err := user.Current()
if err != nil {
return fmt.Errorf("%s Init(): Failed to get current user: %w", m.name, err)
return fmt.Errorf("BeegfsStorageCollector.Init(): Failed to get current user: %v", err)
}
if user.Uid != "0" {
return fmt.Errorf("%s Init(): BeeGFS file system statistics can only be queried by user root", m.name)
return fmt.Errorf("BeegfsStorageCollector.Init(): BeeGFS file system statistics can only be queried by user root")
}
// Check if beegfs-ctl is in executable search path
_, err = exec.LookPath(m.config.Beegfs)
if err != nil {
return fmt.Errorf("%s Init(): Failed to find beegfs-ctl binary '%s': %w", m.name, m.config.Beegfs, err)
return fmt.Errorf("BeegfsStorageCollector.Init(): Failed to find beegfs-ctl binary '%s': %v", m.config.Beegfs, err)
}
m.init = true
return nil
@@ -119,10 +114,11 @@ func (m *BeegfsStorageCollector) Read(interval time.Duration, output chan lp.CCM
if !m.init {
return
}
// Get mounpoint
buffer, _ := os.ReadFile("/proc/mounts")
//get mounpoint
buffer, _ := os.ReadFile(string("/proc/mounts"))
mounts := strings.Split(string(buffer), "\n")
var mountpoints []string
for line := range strings.Lines(string(buffer)) {
for _, line := range mounts {
if len(line) == 0 {
continue
}
@@ -147,6 +143,7 @@ func (m *BeegfsStorageCollector) Read(interval time.Duration, output chan lp.CCM
// --nodetype=meta: The node type to query (meta, storage).
// --interval:
// --mount=/mnt/beeond/: Which mount point
//cmd := exec.Command(m.config.Beegfs, "/root/mc/test.txt")
mountoption := "--mount=" + mountpoint
cmd := exec.Command(m.config.Beegfs, "--clientstats",
"--nodetype=storage", mountoption, "--allstats")
@@ -157,27 +154,26 @@ func (m *BeegfsStorageCollector) Read(interval time.Duration, output chan lp.CCM
cmd.Stderr = cmdStderr
err := cmd.Run()
if err != nil {
dataStdErr, _ := io.ReadAll(cmdStderr)
dataStdOut, _ := io.ReadAll(cmdStdout)
cclog.ComponentError(
m.name,
fmt.Sprintf("Read(): Failed to execute command \"%s\": %v\n", cmd.String(), err),
fmt.Sprintf("Read(): command exit code: \"%d\"\n", cmd.ProcessState.ExitCode()),
fmt.Sprintf("Read(): command stderr: \"%s\"\n", string(dataStdErr)),
fmt.Sprintf("Read(): command stdout: \"%s\"\n", string(dataStdOut)),
)
fmt.Fprintf(os.Stderr, "BeegfsStorageCollector.Read(): Failed to execute command \"%s\": %s\n", cmd.String(), err.Error())
fmt.Fprintf(os.Stderr, "BeegfsStorageCollector.Read(): command exit code: \"%d\"\n", cmd.ProcessState.ExitCode())
data, _ := io.ReadAll(cmdStderr)
fmt.Fprintf(os.Stderr, "BeegfsStorageCollector.Read(): command stderr: \"%s\"\n", string(data))
data, _ = io.ReadAll(cmdStdout)
fmt.Fprintf(os.Stderr, "BeegfsStorageCollector.Read(): command stdout: \"%s\"\n", string(data))
return
}
// Read I/O statistics
scanner := bufio.NewScanner(cmdStdout)
sumLine := regexp.MustCompile(`^Sum:\s+\d+\s+\[[a-zA-Z]+\]+`)
//Line := regexp.MustCompile(`^(.*)\s+(\d)+\s+\[([a-zA-Z]+)\]+`)
statsLine := regexp.MustCompile(`^(.*?)\s+?(\d.*?)$`)
singleSpacePattern := regexp.MustCompile(`\s+`)
removePattern := regexp.MustCompile(`[\[|\]]`)
for scanner.Scan() {
readLine := scanner.Text()
//fmt.Println(readLine)
// Jump few lines, we only want the I/O stats from nodes
if !sumLine.MatchString(readLine) {
continue
@@ -186,7 +182,7 @@ func (m *BeegfsStorageCollector) Read(interval time.Duration, output chan lp.CCM
match := statsLine.FindStringSubmatch(readLine)
// nodeName = "Sum:" or would be nodes
// nodeName := match[1]
// Remove multiple whitespaces
//Remove multiple whitespaces
dummy := removePattern.ReplaceAllString(match[2], " ")
metaStats := strings.TrimSpace(singleSpacePattern.ReplaceAllString(dummy, " "))
split := strings.Split(metaStats, " ")
@@ -197,6 +193,7 @@ func (m *BeegfsStorageCollector) Read(interval time.Duration, output chan lp.CCM
for i := 0; i <= len(split)-1; i += 2 {
if _, ok := m.matches[split[i+1]]; ok {
m.matches["beegfs_cstorage_"+split[i+1]] = split[i]
//m.matches[split[i+1]] = split[i]
} else {
f1, err := strconv.ParseFloat(m.matches["other"], 32)
if err != nil {
@@ -218,7 +215,7 @@ func (m *BeegfsStorageCollector) Read(interval time.Duration, output chan lp.CCM
for key, data := range m.matches {
value, _ := strconv.ParseFloat(data, 32)
y, err := lp.NewMessage(key, m.tags, m.meta, map[string]any{"value": value}, time.Now())
y, err := lp.NewMessage(key, m.tags, m.meta, map[string]interface{}{"value": value}, time.Now())
if err == nil {
output <- y
}

View File

@@ -8,19 +8,18 @@
package collectors
import (
"bytes"
"encoding/json"
"fmt"
"sync"
"time"
cclog "github.com/ClusterCockpit/cc-lib/v2/ccLogger"
lp "github.com/ClusterCockpit/cc-lib/v2/ccMessage"
cclog "github.com/ClusterCockpit/cc-lib/ccLogger"
lp "github.com/ClusterCockpit/cc-lib/ccMessage"
mct "github.com/ClusterCockpit/cc-metric-collector/pkg/multiChanTicker"
)
// Map of all available metric collectors
var AvailableCollectors = map[string]MetricCollector{
"likwid": new(LikwidCollector),
"loadavg": new(LoadavgCollector),
"memstat": new(MemstatCollector),
@@ -49,7 +48,6 @@ var AvailableCollectors = map[string]MetricCollector{
"schedstat": new(SchedstatCollector),
"nfsiostat": new(NfsIOStatCollector),
"slurm_cgroup": new(SlurmCgroupCollector),
"smartmon": new(SmartMonCollector),
}
// Metric collector manager data structure
@@ -90,10 +88,10 @@ func (cm *collectorManager) Init(ticker mct.MultiChanTicker, duration time.Durat
cm.ticker = ticker
cm.duration = duration
d := json.NewDecoder(bytes.NewReader(collectConfig))
d.DisallowUnknownFields()
if err := d.Decode(&cm.config); err != nil {
return fmt.Errorf("%s Init(): Error decoding collector manager config: %w", "CollectorManager", err)
err := json.Unmarshal(collectConfig, &cm.config)
if err != nil {
cclog.Error(err.Error())
return err
}
// Initialize configured collectors
@@ -104,9 +102,9 @@ func (cm *collectorManager) Init(ticker mct.MultiChanTicker, duration time.Durat
}
collector := AvailableCollectors[collectorName]
err := collector.Init(collectorCfg)
err = collector.Init(collectorCfg)
if err != nil {
cclog.ComponentError("CollectorManager", fmt.Sprintf("Collector %s initialization failed: %v", collectorName, err))
cclog.ComponentError("CollectorManager", "Collector", collectorName, "initialization failed:", err.Error())
continue
}
cclog.ComponentDebug("CollectorManager", "ADD COLLECTOR", collector.Name())
@@ -124,7 +122,9 @@ func (cm *collectorManager) Start() {
tick := make(chan time.Time)
cm.ticker.AddChannel(tick)
cm.wg.Go(func() {
cm.wg.Add(1)
go func() {
defer cm.wg.Done()
// Collector manager is done
done := func() {
// close all metric collectors
@@ -179,7 +179,7 @@ func (cm *collectorManager) Start() {
}
}
}
})
}()
// Collector manager is started
cclog.ComponentDebug("CollectorManager", "STARTED")

View File

@@ -10,14 +10,15 @@ package collectors
import (
"bufio"
"encoding/json"
"fmt"
"os"
"strconv"
"strings"
"time"
cclog "github.com/ClusterCockpit/cc-lib/v2/ccLogger"
lp "github.com/ClusterCockpit/cc-lib/v2/ccMessage"
cclog "github.com/ClusterCockpit/cc-lib/ccLogger"
lp "github.com/ClusterCockpit/cc-lib/ccMessage"
)
// CPUFreqCollector
@@ -31,20 +32,18 @@ type CPUFreqCpuInfoCollectorTopology struct {
type CPUFreqCpuInfoCollector struct {
metricCollector
topology []CPUFreqCpuInfoCollectorTopology
}
func (m *CPUFreqCpuInfoCollector) Init(_ json.RawMessage) error {
func (m *CPUFreqCpuInfoCollector) Init(config json.RawMessage) error {
// Check if already initialized
if m.init {
return nil
}
m.setup()
m.name = "CPUFreqCpuInfoCollector"
if err := m.setup(); err != nil {
return fmt.Errorf("%s Init(): setup() call failed: %w", m.name, err)
}
m.parallel = true
m.meta = map[string]string{
"source": m.name,
@@ -55,8 +54,9 @@ func (m *CPUFreqCpuInfoCollector) Init(_ json.RawMessage) error {
const cpuInfoFile = "/proc/cpuinfo"
file, err := os.Open(cpuInfoFile)
if err != nil {
return fmt.Errorf("%s Init(): failed to open file '%s': %w", m.name, cpuInfoFile, err)
return fmt.Errorf("failed to open file '%s': %v", cpuInfoFile, err)
}
defer file.Close()
// Collect topology information from file cpuinfo
foundFreq := false
@@ -117,13 +117,9 @@ func (m *CPUFreqCpuInfoCollector) Init(_ json.RawMessage) error {
}
}
if err := file.Close(); err != nil {
return fmt.Errorf("%s Init(): Call to file.Close() failed: %w", m.name, err)
}
// Check if at least one CPU with frequency information was detected
if len(m.topology) == 0 {
return fmt.Errorf("%s Init(): no CPU frequency info found in %s", m.name, cpuInfoFile)
return fmt.Errorf("no CPU frequency info found in %s", cpuInfoFile)
}
m.init = true
@@ -144,13 +140,7 @@ func (m *CPUFreqCpuInfoCollector) Read(interval time.Duration, output chan lp.CC
fmt.Sprintf("Read(): Failed to open file '%s': %v", cpuInfoFile, err))
return
}
defer func() {
if err := file.Close(); err != nil {
cclog.ComponentError(
m.name,
fmt.Sprintf("Read(): Failed to close file '%s': %v", cpuInfoFile, err))
}
}()
defer file.Close()
processorCounter := 0
now := time.Now()
@@ -171,7 +161,7 @@ func (m *CPUFreqCpuInfoCollector) Read(interval time.Duration, output chan lp.CC
fmt.Sprintf("Read(): Failed to convert cpu MHz '%s' to float64: %v", lineSplit[1], err))
return
}
if y, err := lp.NewMessage("cpufreq", t.tagSet, m.meta, map[string]any{"value": value}, now); err == nil {
if y, err := lp.NewMessage("cpufreq", t.tagSet, m.meta, map[string]interface{}{"value": value}, now); err == nil {
output <- y
}
}

View File

@@ -12,9 +12,7 @@ hugo_path: docs/reference/cc-metric-collector/collectors/cpufreq_cpuinfo.md
## `cpufreq_cpuinfo` collector
```json
"cpufreq_cpuinfo": {
"exclude_metrics": []
}
"cpufreq_cpuinfo": {}
```
The `cpufreq_cpuinfo` collector reads the clock frequency from `/proc/cpuinfo` and outputs a handful **hwthread** metrics.

View File

@@ -8,7 +8,6 @@
package collectors
import (
"bytes"
"encoding/json"
"fmt"
"os"
@@ -17,8 +16,8 @@ import (
"strings"
"time"
cclog "github.com/ClusterCockpit/cc-lib/v2/ccLogger"
lp "github.com/ClusterCockpit/cc-lib/v2/ccMessage"
cclog "github.com/ClusterCockpit/cc-lib/ccLogger"
lp "github.com/ClusterCockpit/cc-lib/ccMessage"
"github.com/ClusterCockpit/cc-metric-collector/pkg/ccTopology"
"golang.org/x/sys/unix"
)
@@ -36,7 +35,6 @@ type CPUFreqCollectorTopology struct {
// See: https://www.kernel.org/doc/html/latest/admin-guide/pm/cpufreq.html
type CPUFreqCollector struct {
metricCollector
topology []CPUFreqCollectorTopology
config struct {
ExcludeMetrics []string `json:"exclude_metrics,omitempty"`
@@ -50,15 +48,12 @@ func (m *CPUFreqCollector) Init(config json.RawMessage) error {
}
m.name = "CPUFreqCollector"
if err := m.setup(); err != nil {
return fmt.Errorf("%s Init(): setup() call failed: %w", m.name, err)
}
m.setup()
m.parallel = true
if len(config) > 0 {
d := json.NewDecoder(bytes.NewReader(config))
d.DisallowUnknownFields()
if err := d.Decode(&m.config); err != nil {
return fmt.Errorf("%s Init(): failed to decode JSON config: %w", m.name, err)
err := json.Unmarshal(config, &m.config)
if err != nil {
return err
}
}
m.meta = map[string]string{
@@ -79,15 +74,15 @@ func (m *CPUFreqCollector) Init(config json.RawMessage) error {
scalingCurFreqFile := filepath.Join("/sys/devices/system/cpu", fmt.Sprintf("cpu%d", c.CpuID), "cpufreq/scaling_cur_freq")
err := unix.Access(scalingCurFreqFile, unix.R_OK)
if err != nil {
return fmt.Errorf("%s Init(): unable to access file '%s': %w", m.name, scalingCurFreqFile, err)
return fmt.Errorf("unable to access file '%s': %v", scalingCurFreqFile, err)
}
m.topology = append(m.topology,
CPUFreqCollectorTopology{
tagSet: map[string]string{
"type": "hwthread",
"type-id": strconv.Itoa(c.CpuID),
"package_id": strconv.Itoa(c.Socket),
"type-id": fmt.Sprint(c.CpuID),
"package_id": fmt.Sprint(c.Socket),
},
scalingCurFreqFile: scalingCurFreqFile,
},
@@ -129,7 +124,7 @@ func (m *CPUFreqCollector) Read(interval time.Duration, output chan lp.CCMessage
continue
}
if y, err := lp.NewMessage("cpufreq", t.tagSet, m.meta, map[string]any{"value": cpuFreq}, now); err == nil {
if y, err := lp.NewMessage("cpufreq", t.tagSet, m.meta, map[string]interface{}{"value": cpuFreq}, now); err == nil {
output <- y
}
}

View File

@@ -9,17 +9,15 @@ package collectors
import (
"bufio"
"bytes"
"encoding/json"
"fmt"
"os"
"slices"
"strconv"
"strings"
"time"
cclog "github.com/ClusterCockpit/cc-lib/v2/ccLogger"
lp "github.com/ClusterCockpit/cc-lib/v2/ccMessage"
cclog "github.com/ClusterCockpit/cc-lib/ccLogger"
lp "github.com/ClusterCockpit/cc-lib/ccMessage"
sysconf "github.com/tklauser/go-sysconf"
)
@@ -31,7 +29,6 @@ type CpustatCollectorConfig struct {
type CpustatCollector struct {
metricCollector
config CpustatCollectorConfig
lastTimestamp time.Time // Store time stamp of last tick to derive values
matches map[string]int
@@ -42,22 +39,14 @@ type CpustatCollector struct {
func (m *CpustatCollector) Init(config json.RawMessage) error {
m.name = "CpustatCollector"
if err := m.setup(); err != nil {
return fmt.Errorf("%s Init(): setup() call failed: %w", m.name, err)
}
m.setup()
m.parallel = true
m.meta = map[string]string{
"source": m.name,
"group": "CPU",
}
m.nodetags = map[string]string{
"type": "node",
}
m.meta = map[string]string{"source": m.name, "group": "CPU"}
m.nodetags = map[string]string{"type": "node"}
if len(config) > 0 {
d := json.NewDecoder(bytes.NewReader(config))
d.DisallowUnknownFields()
if err := d.Decode(&m.config); err != nil {
return fmt.Errorf("%s Init(): Error decoding JSON config: %w", m.name, err)
err := json.Unmarshal(config, &m.config)
if err != nil {
return err
}
}
matches := map[string]int{
@@ -75,16 +64,24 @@ func (m *CpustatCollector) Init(config json.RawMessage) error {
m.matches = make(map[string]int)
for match, index := range matches {
if !slices.Contains(m.config.ExcludeMetrics, match) {
doExclude := false
for _, exclude := range m.config.ExcludeMetrics {
if match == exclude {
doExclude = true
break
}
}
if !doExclude {
m.matches[match] = index
}
}
// Check input file
file, err := os.Open(CPUSTATFILE)
file, err := os.Open(string(CPUSTATFILE))
if err != nil {
return fmt.Errorf("%s Init(): Failed to open file '%s': %w", m.name, CPUSTATFILE, err)
cclog.ComponentError(m.name, err.Error())
}
defer file.Close()
// Pre-generate tags for all CPUs
num_cpus := 0
@@ -102,10 +99,7 @@ func (m *CpustatCollector) Init(config json.RawMessage) error {
} else if strings.HasPrefix(linefields[0], "cpu") && strings.Compare(linefields[0], "cpu") != 0 {
cpustr := strings.TrimLeft(linefields[0], "cpu")
cpu, _ := strconv.Atoi(cpustr)
m.cputags[linefields[0]] = map[string]string{
"type": "hwthread",
"type-id": strconv.Itoa(cpu),
}
m.cputags[linefields[0]] = map[string]string{"type": "hwthread", "type-id": fmt.Sprintf("%d", cpu)}
m.olddata[linefields[0]] = make(map[string]int64)
for k, v := range m.matches {
m.olddata[linefields[0]][k], _ = strconv.ParseInt(linefields[v], 0, 64)
@@ -113,12 +107,6 @@ func (m *CpustatCollector) Init(config json.RawMessage) error {
num_cpus++
}
}
// Close file
if err := file.Close(); err != nil {
return fmt.Errorf("%s Init(): Failed to close file '%s': %w", m.name, CPUSTATFILE, err)
}
m.lastTimestamp = time.Now()
m.init = true
return nil
@@ -141,7 +129,7 @@ func (m *CpustatCollector) parseStatLine(linefields []string, tags map[string]st
sum := float64(0)
for name, value := range values {
sum += value
y, err := lp.NewMessage(name, tags, m.meta, map[string]any{"value": value * 100}, now)
y, err := lp.NewMessage(name, tags, m.meta, map[string]interface{}{"value": value * 100}, now)
if err == nil {
y.AddTag("unit", "Percent")
output <- y
@@ -149,7 +137,7 @@ func (m *CpustatCollector) parseStatLine(linefields []string, tags map[string]st
}
if v, ok := values["cpu_idle"]; ok {
sum -= v
y, err := lp.NewMessage("cpu_used", tags, m.meta, map[string]any{"value": sum * 100}, now)
y, err := lp.NewMessage("cpu_used", tags, m.meta, map[string]interface{}{"value": sum * 100}, now)
if err == nil {
y.AddTag("unit", "Percent")
output <- y
@@ -165,19 +153,11 @@ func (m *CpustatCollector) Read(interval time.Duration, output chan lp.CCMessage
now := time.Now()
tsdelta := now.Sub(m.lastTimestamp)
file, err := os.Open(CPUSTATFILE)
file, err := os.Open(string(CPUSTATFILE))
if err != nil {
cclog.ComponentError(
m.name,
fmt.Sprintf("Read(): Failed to open file '%s': %v", CPUSTATFILE, err))
cclog.ComponentError(m.name, err.Error())
}
defer func() {
if err := file.Close(); err != nil {
cclog.ComponentError(
m.name,
fmt.Sprintf("Read(): Failed to close file '%s': %v", string(CPUSTATFILE), err))
}
}()
defer file.Close()
scanner := bufio.NewScanner(file)
for scanner.Scan() {
@@ -194,7 +174,7 @@ func (m *CpustatCollector) Read(interval time.Duration, output chan lp.CCMessage
num_cpus_metric, err := lp.NewMessage("num_cpus",
m.nodetags,
m.meta,
map[string]any{"value": num_cpus},
map[string]interface{}{"value": int(num_cpus)},
now,
)
if err == nil {

View File

@@ -8,17 +8,16 @@
package collectors
import (
"bytes"
"encoding/json"
"fmt"
"errors"
"log"
"os"
"os/exec"
"slices"
"strings"
"time"
cclog "github.com/ClusterCockpit/cc-lib/v2/ccLogger"
lp "github.com/ClusterCockpit/cc-lib/v2/ccMessage"
lp "github.com/ClusterCockpit/cc-lib/ccMessage"
influx "github.com/influxdata/line-protocol"
)
const CUSTOMCMDPATH = `/home/unrz139/Work/cc-metric-collector/collectors/custom`
@@ -31,124 +30,102 @@ type CustomCmdCollectorConfig struct {
type CustomCmdCollector struct {
metricCollector
handler *influx.MetricHandler
parser *influx.Parser
config CustomCmdCollectorConfig
cmdFieldsSlice [][]string
commands []string
files []string
}
func (m *CustomCmdCollector) Init(config json.RawMessage) error {
var err error
m.name = "CustomCmdCollector"
m.parallel = true
m.meta = map[string]string{
"source": m.name,
"group": "Custom",
}
// Read configuration
m.meta = map[string]string{"source": m.name, "group": "Custom"}
if len(config) > 0 {
d := json.NewDecoder(bytes.NewReader(config))
d.DisallowUnknownFields()
if err := d.Decode(&m.config); err != nil {
return fmt.Errorf("%s Init(): Error decoding JSON config: %w", m.name, err)
err = json.Unmarshal(config, &m.config)
if err != nil {
log.Print(err.Error())
return err
}
}
// Setup
if err := m.setup(); err != nil {
return fmt.Errorf("%s Init(): setup() call failed: %w", m.name, err)
}
// Check if command can be executed
m.setup()
for _, c := range m.config.Commands {
cmdFields := strings.Fields(c)
command := exec.Command(cmdFields[0], cmdFields[1:]...)
if _, err := command.Output(); err != nil {
cclog.ComponentWarn(
m.name,
fmt.Sprintf("%s Init(): Execution of command \"%s\" failed: %v", m.name, command.String(), err))
cmdfields := strings.Fields(c)
command := exec.Command(cmdfields[0], strings.Join(cmdfields[1:], " "))
command.Wait()
_, err = command.Output()
if err == nil {
m.commands = append(m.commands, c)
}
}
for _, f := range m.config.Files {
_, err = os.ReadFile(f)
if err == nil {
m.files = append(m.files, f)
} else {
log.Print(err.Error())
continue
}
m.cmdFieldsSlice = append(m.cmdFieldsSlice, cmdFields)
}
// Check if file can be read
for _, fileName := range m.config.Files {
if _, err := os.ReadFile(fileName); err != nil {
cclog.ComponentWarn(
m.name,
fmt.Sprintf("%s Init(): Reading of file \"%s\" failed: %v", m.name, fileName, err))
continue
}
m.files = append(m.files, fileName)
}
if len(m.files) == 0 && len(m.cmdFieldsSlice) == 0 {
return fmt.Errorf("%s Init(): no metrics to collect", m.name)
if len(m.files) == 0 && len(m.commands) == 0 {
return errors.New("no metrics to collect")
}
m.handler = influx.NewMetricHandler()
m.parser = influx.NewParser(m.handler)
m.parser.SetTimeFunc(DefaultTime)
m.init = true
return nil
}
var DefaultTime = func() time.Time {
return time.Unix(42, 0)
}
func (m *CustomCmdCollector) Read(interval time.Duration, output chan lp.CCMessage) {
if !m.init {
return
}
// Execute configured commands
for _, cmdFields := range m.cmdFieldsSlice {
command := exec.Command(cmdFields[0], cmdFields[1:]...)
for _, cmd := range m.commands {
cmdfields := strings.Fields(cmd)
command := exec.Command(cmdfields[0], strings.Join(cmdfields[1:], " "))
command.Wait()
stdout, err := command.Output()
if err != nil {
cclog.ComponentError(
m.name,
fmt.Sprintf("Read(): Failed to read command output for command \"%s\": %v", command.String(), err),
)
log.Print(err)
continue
}
cmdmetrics, err := m.parser.Parse(stdout)
if err != nil {
log.Print(err)
continue
}
for _, c := range cmdmetrics {
_, skip := stringArrayContains(m.config.ExcludeMetrics, c.Name())
if skip {
continue
}
// Read and decode influxDB line-protocol from command output
metrics, err := lp.FromBytes(stdout)
output <- lp.FromInfluxMetric(c)
}
}
for _, file := range m.files {
buffer, err := os.ReadFile(file)
if err != nil {
cclog.ComponentError(
m.name,
fmt.Sprintf("Read(): Failed to decode influx Message: %v", err),
)
continue
log.Print(err)
return
}
for _, metric := range metrics {
if slices.Contains(m.config.ExcludeMetrics, metric.Name()) {
continue
}
output <- metric
}
}
// Read configured files
for _, filename := range m.files {
input, err := os.ReadFile(filename)
fmetrics, err := m.parser.Parse(buffer)
if err != nil {
cclog.ComponentError(
m.name,
fmt.Sprintf("Read(): Failed to read file \"%s\": %v\n", filename, err),
)
log.Print(err)
continue
}
// Read and decode influxDB line-protocol from file
metrics, err := lp.FromBytes(input)
if err != nil {
cclog.ComponentError(
m.name,
fmt.Sprintf("Read(): Failed to decode influx Message: %v", err),
)
for _, f := range fmetrics {
_, skip := stringArrayContains(m.config.ExcludeMetrics, f.Name())
if skip {
continue
}
for _, metric := range metrics {
if slices.Contains(m.config.ExcludeMetrics, metric.Name()) {
continue
}
output <- metric
output <- lp.FromInfluxMetric(f)
}
}
}

View File

@@ -9,16 +9,14 @@ package collectors
import (
"bufio"
"bytes"
"encoding/json"
"fmt"
"os"
"strings"
"syscall"
"time"
cclog "github.com/ClusterCockpit/cc-lib/v2/ccLogger"
lp "github.com/ClusterCockpit/cc-lib/v2/ccMessage"
cclog "github.com/ClusterCockpit/cc-lib/ccLogger"
lp "github.com/ClusterCockpit/cc-lib/ccMessage"
)
const MOUNTFILE = `/proc/self/mounts`
@@ -30,7 +28,6 @@ type DiskstatCollectorConfig struct {
type DiskstatCollector struct {
metricCollector
config DiskstatCollectorConfig
allowedMetrics map[string]bool
}
@@ -39,14 +36,10 @@ func (m *DiskstatCollector) Init(config json.RawMessage) error {
m.name = "DiskstatCollector"
m.parallel = true
m.meta = map[string]string{"source": m.name, "group": "Disk"}
if err := m.setup(); err != nil {
return fmt.Errorf("%s Init(): setup() call failed: %w", m.name, err)
}
m.setup()
if len(config) > 0 {
d := json.NewDecoder(bytes.NewReader(config))
d.DisallowUnknownFields()
if err := d.Decode(&m.config); err != nil {
return fmt.Errorf("%s Init(): Error decoding JSON config: %w", m.name, err)
if err := json.Unmarshal(config, &m.config); err != nil {
return err
}
}
m.allowedMetrics = map[string]bool{
@@ -61,11 +54,10 @@ func (m *DiskstatCollector) Init(config json.RawMessage) error {
}
file, err := os.Open(MOUNTFILE)
if err != nil {
return fmt.Errorf("%s Init(): file open for file \"%s\" failed: %w", m.name, MOUNTFILE, err)
}
if err := file.Close(); err != nil {
return fmt.Errorf("%s Init(): file close for file \"%s\" failed: %w", m.name, MOUNTFILE, err)
cclog.ComponentError(m.name, err.Error())
return err
}
defer file.Close()
m.init = true
return nil
}
@@ -77,18 +69,10 @@ func (m *DiskstatCollector) Read(interval time.Duration, output chan lp.CCMessag
file, err := os.Open(MOUNTFILE)
if err != nil {
cclog.ComponentError(
m.name,
fmt.Sprintf("Read(): Failed to open file '%s': %v", MOUNTFILE, err))
cclog.ComponentError(m.name, err.Error())
return
}
defer func() {
if err := file.Close(); err != nil {
cclog.ComponentError(
m.name,
fmt.Sprintf("Read(): Failed to close file '%s': %v", MOUNTFILE, err))
}
}()
defer file.Close()
part_max_used := uint64(0)
scanner := bufio.NewScanner(file)
@@ -109,7 +93,7 @@ mountLoop:
continue
}
mountPath := strings.ReplaceAll(linefields[1], `\040`, " ")
mountPath := strings.Replace(linefields[1], `\040`, " ", -1)
for _, excl := range m.config.ExcludeMounts {
if strings.Contains(mountPath, excl) {
@@ -126,31 +110,17 @@ mountLoop:
continue
}
tags := map[string]string{"type": "node", "device": linefields[0]}
total := (stat.Blocks * uint64(stat.Bsize)) / uint64(1000_000_000)
total := (stat.Blocks * uint64(stat.Bsize)) / uint64(1000000000)
if m.allowedMetrics["disk_total"] {
y, err := lp.NewMessage(
"disk_total",
tags,
m.meta,
map[string]any{
"value": total,
},
time.Now())
y, err := lp.NewMessage("disk_total", tags, m.meta, map[string]interface{}{"value": total}, time.Now())
if err == nil {
y.AddMeta("unit", "GBytes")
output <- y
}
}
free := (stat.Bfree * uint64(stat.Bsize)) / uint64(1000_000_000)
free := (stat.Bfree * uint64(stat.Bsize)) / uint64(1000000000)
if m.allowedMetrics["disk_free"] {
y, err := lp.NewMessage(
"disk_free",
tags,
m.meta,
map[string]any{
"value": free,
},
time.Now())
y, err := lp.NewMessage("disk_free", tags, m.meta, map[string]interface{}{"value": free}, time.Now())
if err == nil {
y.AddMeta("unit", "GBytes")
output <- y
@@ -164,16 +134,7 @@ mountLoop:
}
}
if m.allowedMetrics["part_max_used"] {
y, err := lp.NewMessage(
"part_max_used",
map[string]string{
"type": "node",
},
m.meta,
map[string]any{
"value": int(part_max_used),
},
time.Now())
y, err := lp.NewMessage("part_max_used", map[string]string{"type": "node"}, m.meta, map[string]interface{}{"value": int(part_max_used)}, time.Now())
if err == nil {
y.AddMeta("unit", "percent")
output <- y

File diff suppressed because it is too large Load Diff

View File

@@ -14,18 +14,13 @@ hugo_path: docs/reference/cc-metric-collector/collectors/gpfs.md
```json
"gpfs": {
"mmpmon_path": "/path/to/mmpmon",
"use_sudo": true,
"use_sudo": "true",
"exclude_filesystem": [
"fs1"
],
"exclude_metrics": [
"gpfs_bytes_written"
],
"send_abs_values": true,
"send_diff_values": true,
"send_derived_values": true,
"send_bandwidths": true,
"send_total_values": true,
"send_bandwidths": true
"send_derived_values": true
}
```
@@ -34,50 +29,38 @@ GPFS / IBM Spectrum Scale filesystems.
The reported filesystems can be filtered with the `exclude_filesystem` option
in the configuration.
Individual metrics can be disabled for reporting using option `exclude_metrics`.
The path to the `mmpmon` command can be configured with the `mmpmon_path` option
in the configuration. If nothing is set, the collector searches in `$PATH` for `mmpmon`.
If cc-metric-collector is run as non-root, password-less `sudo` can be enabled with `use_sudo`.
If cc-metric-collector is run as non-root, `sudo` can be enabled with `use_sudo`.
Because `mmpmon` is by default only executable as root, the Go procedure to
search for it in `$PATH` will fail. If you use `sudo`, you must specify the
complete path for `mmpmon` using the parameter `mmpmon_path`.
Metrics:
* `gpfs_bytes_read` (if `send_abs_values == true`)
* `gpfs_bytes_written` (if `send_abs_values == true`)
* `gpfs_num_opens` (if `send_abs_values == true`)
* `gpfs_num_closes` (if `send_abs_values == true`)
* `gpfs_num_reads` (if `send_abs_values == true`)
* `gpfs_num_writes` (if `send_abs_values == true`)
* `gpfs_num_readdirs` (if `send_abs_values == true`)
* `gpfs_num_inode_updates` (if `send_abs_values == true`)
* `gpfs_bytes_read_diff` (if `send_diff_values == true`)
* `gpfs_bytes_written_diff` (if `send_diff_values == true`)
* `gpfs_num_opens_diff` (if `send_diff_values == true`)
* `gpfs_num_closes_diff` (if `send_diff_values == true`)
* `gpfs_num_reads_diff` (if `send_diff_values == true`)
* `gpfs_num_writes_diff` (if `send_diff_values == true`)
* `gpfs_num_readdirs_diff` (if `send_diff_values == true`)
* `gpfs_num_inode_updates_diff` (if `send_diff_values == true`)
* `gpfs_bw_read` (if `send_derived_values == true` or `send_bandwidths == true`)
* `gpfs_bw_write` (if `send_derived_values == true` or `send_bandwidths == true`)
* `gpfs_bytes_read`
* `gpfs_bytes_written`
* `gpfs_num_opens`
* `gpfs_num_closes`
* `gpfs_num_reads`
* `gpfs_num_writes`
* `gpfs_num_readdirs`
* `gpfs_num_inode_updates`
* `gpfs_opens_rate` (if `send_derived_values == true`)
* `gpfs_closes_rate` (if `send_derived_values == true`)
* `gpfs_reads_rate` (if `send_derived_values == true`)
* `gpfs_writes_rate` (if `send_derived_values == true`)
* `gpfs_readdirs_rate` (if `send_derived_values == true`)
* `gpfs_inode_updates_rate` (if `send_derived_values == true`)
* `gpfs_bytes_total = gpfs_bytes_read + gpfs_bytes_written` (if `send_total_values == true` and `send_abs_values == true`)
* `gpfs_bytes_total_diff` (if `send_total_values == true` and `send_diff_values == true`)
* `gpfs_bw_total` ((if `send_total_values == true` and `send_derived_values == true`) or `send_bandwidths == true`)
* `gpfs_iops = gpfs_num_reads + gpfs_num_writes` (if `send_total_values == true` and `send_abs_values == true`)
* `gpfs_iops_diff` (if `send_total_values == true` and `send_diff_values == true`)
* `gpfs_bytes_total = gpfs_bytes_read + gpfs_bytes_written` (if `send_total_values == true`)
* `gpfs_iops = gpfs_num_reads + gpfs_num_writes` (if `send_total_values == true`)
* `gpfs_iops_rate` (if `send_total_values == true` and `send_derived_values == true`)
* `gpfs_metaops = gpfs_num_inode_updates + gpfs_num_closes + gpfs_num_opens + gpfs_num_readdirs` (if `send_total_values == true` and `send_abs_values == true`)
* `gpfs_metaops_diff` (if `send_total_values == true` and `send_diff_values == true`)
* `gpfs_metaops = gpfs_num_inode_updates + gpfs_num_closes + gpfs_num_opens + gpfs_num_readdirs` (if `send_total_values == true`)
* `gpfs_metaops_rate` (if `send_total_values == true` and `send_derived_values == true`)
* `gpfs_bw_read` (if `send_bandwidths == true`)
* `gpfs_bw_write` (if `send_bandwidths == true`)
* `gpfs_bw_total` (if `send_bandwidths == true` and `send_total_values == true`)
The collector adds a `filesystem` tag to all metrics

View File

@@ -8,19 +8,18 @@
package collectors
import (
"bytes"
"encoding/json"
"fmt"
"os"
cclog "github.com/ClusterCockpit/cc-lib/ccLogger"
lp "github.com/ClusterCockpit/cc-lib/ccMessage"
"golang.org/x/sys/unix"
"encoding/json"
"path/filepath"
"slices"
"strconv"
"strings"
"time"
cclog "github.com/ClusterCockpit/cc-lib/v2/ccLogger"
lp "github.com/ClusterCockpit/cc-lib/v2/ccMessage"
"golang.org/x/sys/unix"
)
const IB_BASEPATH = "/sys/class/infiniband/"
@@ -46,7 +45,6 @@ type InfinibandCollectorInfo struct {
type InfinibandCollector struct {
metricCollector
config struct {
ExcludeDevices []string `json:"exclude_devices,omitempty"` // IB device to exclude e.g. mlx5_0
SendAbsoluteValues bool `json:"send_abs_values"` // Send absolut values as read from sys filesystem
@@ -59,6 +57,7 @@ type InfinibandCollector struct {
// Init initializes the Infiniband collector by walking through files below IB_BASEPATH
func (m *InfinibandCollector) Init(config json.RawMessage) error {
// Check if already initialized
if m.init {
return nil
@@ -66,9 +65,7 @@ func (m *InfinibandCollector) Init(config json.RawMessage) error {
var err error
m.name = "InfinibandCollector"
if err := m.setup(); err != nil {
return fmt.Errorf("%s Init(): setup() call failed: %w", m.name, err)
}
m.setup()
m.parallel = true
m.meta = map[string]string{
"source": m.name,
@@ -80,10 +77,9 @@ func (m *InfinibandCollector) Init(config json.RawMessage) error {
m.config.SendDerivedValues = false
// Read configuration file, allow overwriting default config
if len(config) > 0 {
d := json.NewDecoder(bytes.NewReader(config))
d.DisallowUnknownFields()
if err := d.Decode(&m.config); err != nil {
return fmt.Errorf("%s Init(): failed to decode JSON config: %w", m.name, err)
err = json.Unmarshal(config, &m.config)
if err != nil {
return err
}
}
@@ -91,10 +87,10 @@ func (m *InfinibandCollector) Init(config json.RawMessage) error {
globPattern := filepath.Join(IB_BASEPATH, "*", "ports", "*")
ibDirs, err := filepath.Glob(globPattern)
if err != nil {
return fmt.Errorf("%s Init(): unable to glob files with pattern %s: %w", m.name, globPattern, err)
return fmt.Errorf("unable to glob files with pattern %s: %v", globPattern, err)
}
if ibDirs == nil {
return fmt.Errorf("%s Init(): unable to find any directories with pattern %s", m.name, globPattern)
return fmt.Errorf("unable to find any directories with pattern %s", globPattern)
}
for _, path := range ibDirs {
@@ -115,7 +111,14 @@ func (m *InfinibandCollector) Init(config json.RawMessage) error {
port := pathSplit[6]
// Skip excluded devices
if slices.Contains(m.config.ExcludeDevices, device) {
skip := false
for _, excludedDevice := range m.config.ExcludeDevices {
if excludedDevice == device {
skip = true
break
}
}
if skip {
continue
}
@@ -158,7 +161,7 @@ func (m *InfinibandCollector) Init(config json.RawMessage) error {
for _, counter := range portCounterFiles {
err := unix.Access(counter.path, unix.R_OK)
if err != nil {
return fmt.Errorf("%s Init(): unable to access %s: %w", m.name, counter.path, err)
return fmt.Errorf("unable to access %s: %v", counter.path, err)
}
}
@@ -178,7 +181,7 @@ func (m *InfinibandCollector) Init(config json.RawMessage) error {
}
if len(m.info) == 0 {
return fmt.Errorf("%s Init(): found no IB devices", m.name)
return fmt.Errorf("found no IB devices")
}
m.init = true
@@ -187,6 +190,7 @@ func (m *InfinibandCollector) Init(config json.RawMessage) error {
// Read reads Infiniband counter files below IB_BASEPATH
func (m *InfinibandCollector) Read(interval time.Duration, output chan lp.CCMessage) {
// Check if already initialized
if !m.init {
return
@@ -232,11 +236,12 @@ func (m *InfinibandCollector) Read(interval time.Duration, output chan lp.CCMess
// Send absolut values
if m.config.SendAbsoluteValues {
if y, err := lp.NewMessage(
if y, err :=
lp.NewMessage(
counterDef.name,
info.tagSet,
m.meta,
map[string]any{
map[string]interface{}{
"value": counterDef.currentState,
},
now); err == nil {
@@ -249,11 +254,12 @@ func (m *InfinibandCollector) Read(interval time.Duration, output chan lp.CCMess
if m.config.SendDerivedValues {
if counterDef.lastState >= 0 {
rate := float64((counterDef.currentState - counterDef.lastState)) / timeDiff
if y, err := lp.NewMessage(
if y, err :=
lp.NewMessage(
counterDef.name+"_bw",
info.tagSet,
m.meta,
map[string]any{
map[string]interface{}{
"value": rate,
},
now); err == nil {
@@ -278,11 +284,12 @@ func (m *InfinibandCollector) Read(interval time.Duration, output chan lp.CCMess
// Send total values
if m.config.SendTotalValues {
if y, err := lp.NewMessage(
if y, err :=
lp.NewMessage(
"ib_total",
info.tagSet,
m.meta,
map[string]any{
map[string]interface{}{
"value": ib_total,
},
now); err == nil {
@@ -290,11 +297,12 @@ func (m *InfinibandCollector) Read(interval time.Duration, output chan lp.CCMess
output <- y
}
if y, err := lp.NewMessage(
if y, err :=
lp.NewMessage(
"ib_total_pkts",
info.tagSet,
m.meta,
map[string]any{
map[string]interface{}{
"value": ib_total_pkts,
},
now); err == nil {

View File

@@ -9,52 +9,48 @@ package collectors
import (
"bufio"
"bytes"
"encoding/json"
"fmt"
"errors"
"os"
"slices"
"strconv"
"strings"
"time"
cclog "github.com/ClusterCockpit/cc-lib/v2/ccLogger"
lp "github.com/ClusterCockpit/cc-lib/v2/ccMessage"
cclog "github.com/ClusterCockpit/cc-lib/ccLogger"
lp "github.com/ClusterCockpit/cc-lib/ccMessage"
)
// Konstante für den Pfad zu /proc/diskstats
const IOSTATFILE = `/proc/diskstats`
type IOstatCollectorConfig struct {
ExcludeMetrics []string `json:"exclude_metrics,omitempty"`
// Neues Feld zum Ausschließen von Devices per JSON-Konfiguration
ExcludeDevices []string `json:"exclude_devices,omitempty"`
}
type IOstatCollectorEntry struct {
currentValues map[string]int64
lastValues map[string]int64
tags map[string]string
}
type IOstatCollector struct {
metricCollector
matches map[string]int
config IOstatCollectorConfig
devices map[string]IOstatCollectorEntry
}
func (m *IOstatCollector) Init(config json.RawMessage) error {
var err error
m.name = "IOstatCollector"
m.parallel = true
m.meta = map[string]string{"source": m.name, "group": "Disk"}
if err := m.setup(); err != nil {
return fmt.Errorf("%s Init(): setup() call failed: %w", m.name, err)
}
m.setup()
if len(config) > 0 {
d := json.NewDecoder(bytes.NewReader(config))
d.DisallowUnknownFields()
if err := d.Decode(&m.config); err != nil {
return fmt.Errorf("%s Init(): Error decoding JSON config: %w", m.name, err)
err = json.Unmarshal(config, &m.config)
if err != nil {
return err
}
}
// https://www.kernel.org/doc/html/latest/admin-guide/iostats.html
@@ -80,17 +76,19 @@ func (m *IOstatCollector) Init(config json.RawMessage) error {
m.devices = make(map[string]IOstatCollectorEntry)
m.matches = make(map[string]int)
for k, v := range matches {
if !slices.Contains(m.config.ExcludeMetrics, k) {
if _, skip := stringArrayContains(m.config.ExcludeMetrics, k); !skip {
m.matches[k] = v
}
}
if len(m.matches) == 0 {
return fmt.Errorf("%s Init(): no metrics to collect", m.name)
return errors.New("no metrics to collect")
}
file, err := os.Open(IOSTATFILE)
if err != nil {
return fmt.Errorf("%s Init(): Failed to open file \"%s\": %w", m.name, IOSTATFILE, err)
cclog.ComponentError(m.name, err.Error())
return err
}
defer file.Close()
scanner := bufio.NewScanner(file)
for scanner.Scan() {
@@ -104,38 +102,23 @@ func (m *IOstatCollector) Init(config json.RawMessage) error {
if strings.Contains(device, "loop") {
continue
}
if slices.Contains(m.config.ExcludeDevices, device) {
if _, skip := stringArrayContains(m.config.ExcludeDevices, device); skip {
continue
}
currentValues := make(map[string]int64)
lastValues := make(map[string]int64)
values := make(map[string]int64)
for m := range m.matches {
currentValues[m] = 0
lastValues[m] = 0
}
for name, idx := range m.matches {
if idx < len(linefields) {
if value, err := strconv.ParseInt(linefields[idx], 0, 64); err == nil {
currentValues[name] = value
lastValues[name] = value // Set last to current for first read
}
}
values[m] = 0
}
m.devices[device] = IOstatCollectorEntry{
tags: map[string]string{
"device": device,
"type": "node",
},
currentValues: currentValues,
lastValues: lastValues,
lastValues: values,
}
}
if err := file.Close(); err != nil {
return fmt.Errorf("%s Init(): Failed to close file \"%s\": %w", m.name, IOSTATFILE, err)
}
m.init = true
return nil
return err
}
func (m *IOstatCollector) Read(interval time.Duration, output chan lp.CCMessage) {
@@ -145,18 +128,10 @@ func (m *IOstatCollector) Read(interval time.Duration, output chan lp.CCMessage)
file, err := os.Open(IOSTATFILE)
if err != nil {
cclog.ComponentError(
m.name,
fmt.Sprintf("Read(): Failed to open file '%s': %v", IOSTATFILE, err))
cclog.ComponentError(m.name, err.Error())
return
}
defer func() {
if err := file.Close(); err != nil {
cclog.ComponentError(
m.name,
fmt.Sprintf("Read(): Failed to close file '%s': %v", IOSTATFILE, err))
}
}()
defer file.Close()
scanner := bufio.NewScanner(file)
for scanner.Scan() {
@@ -172,28 +147,24 @@ func (m *IOstatCollector) Read(interval time.Duration, output chan lp.CCMessage)
if strings.Contains(device, "loop") {
continue
}
if slices.Contains(m.config.ExcludeDevices, device) {
if _, skip := stringArrayContains(m.config.ExcludeDevices, device); skip {
continue
}
if _, ok := m.devices[device]; !ok {
continue
}
// Update current and last values
entry := m.devices[device]
for name, idx := range m.matches {
if idx < len(linefields) {
x, err := strconv.ParseInt(linefields[idx], 0, 64)
if err == nil {
// Calculate difference using previous current and new value
diff := x - entry.currentValues[name]
y, err := lp.NewMetric(name, entry.tags, m.meta, int(diff), time.Now())
diff := x - entry.lastValues[name]
y, err := lp.NewMessage(name, entry.tags, m.meta, map[string]interface{}{"value": int(diff)}, time.Now())
if err == nil {
output <- y
}
// Update last to previous current, and current to new value
entry.lastValues[name] = entry.currentValues[name]
entry.currentValues[name] = x
}
entry.lastValues[name] = x
}
}
m.devices[device] = entry

View File

@@ -11,22 +11,23 @@ import (
"bufio"
"bytes"
"encoding/json"
"errors"
"fmt"
"io"
"log"
"os/exec"
"strconv"
"strings"
"time"
cclog "github.com/ClusterCockpit/cc-lib/v2/ccLogger"
lp "github.com/ClusterCockpit/cc-lib/v2/ccMessage"
cclog "github.com/ClusterCockpit/cc-lib/ccLogger"
lp "github.com/ClusterCockpit/cc-lib/ccMessage"
)
const IPMISENSORS_PATH = `ipmi-sensors`
type IpmiCollector struct {
metricCollector
config struct {
ExcludeDevices []string `json:"exclude_devices"`
IpmitoolPath string `json:"ipmitool_path"`
@@ -43,9 +44,7 @@ func (m *IpmiCollector) Init(config json.RawMessage) error {
}
m.name = "IpmiCollector"
if err := m.setup(); err != nil {
return fmt.Errorf("%s Init(): setup() call failed: %w", m.name, err)
}
m.setup()
m.parallel = true
m.meta = map[string]string{
"source": m.name,
@@ -55,10 +54,9 @@ func (m *IpmiCollector) Init(config json.RawMessage) error {
m.config.IpmitoolPath = "ipmitool"
m.config.IpmisensorsPath = "ipmi-sensors"
if len(config) > 0 {
d := json.NewDecoder(bytes.NewReader(config))
d.DisallowUnknownFields()
if err := d.Decode(&m.config); err != nil {
return fmt.Errorf("%s Init(): Error decoding JSON config: %w", m.name, err)
err := json.Unmarshal(config, &m.config)
if err != nil {
return err
}
}
// Check if executables ipmitool or ipmisensors are found
@@ -67,7 +65,7 @@ func (m *IpmiCollector) Init(config json.RawMessage) error {
command := exec.Command(p)
err := command.Run()
if err != nil {
cclog.ComponentError(m.name, fmt.Sprintf("Failed to execute %s: %s", p, err.Error()))
cclog.ComponentError(m.name, fmt.Sprintf("Failed to execute %s: %v", p, err.Error()))
m.ipmitool = ""
} else {
m.ipmitool = p
@@ -78,14 +76,14 @@ func (m *IpmiCollector) Init(config json.RawMessage) error {
command := exec.Command(p)
err := command.Run()
if err != nil {
cclog.ComponentError(m.name, fmt.Sprintf("Failed to execute %s: %s", p, err.Error()))
cclog.ComponentError(m.name, fmt.Sprintf("Failed to execute %s: %v", p, err.Error()))
m.ipmisensors = ""
} else {
m.ipmisensors = p
}
}
if len(m.ipmitool) == 0 && len(m.ipmisensors) == 0 {
return fmt.Errorf("%s Init(): no usable IPMI reader found", m.name)
return errors.New("no usable IPMI reader found")
}
m.init = true
@@ -93,6 +91,7 @@ func (m *IpmiCollector) Init(config json.RawMessage) error {
}
func (m *IpmiCollector) readIpmiTool(cmd string, output chan lp.CCMessage) {
// Setup ipmitool command
command := exec.Command(cmd, "sensor")
stdout, _ := command.StdoutPipe()
@@ -117,20 +116,19 @@ func (m *IpmiCollector) readIpmiTool(cmd string, output chan lp.CCMessage) {
}
v, err := strconv.ParseFloat(strings.TrimSpace(lv[1]), 64)
if err == nil {
name := strings.ToLower(strings.ReplaceAll(strings.TrimSpace(lv[0]), " ", "_"))
name := strings.ToLower(strings.Replace(strings.TrimSpace(lv[0]), " ", "_", -1))
unit := strings.TrimSpace(lv[2])
switch unit {
case "Volts":
if unit == "Volts" {
unit = "Volts"
case "degrees C":
} else if unit == "degrees C" {
unit = "degC"
case "degrees F":
} else if unit == "degrees F" {
unit = "degF"
case "Watts":
} else if unit == "Watts" {
unit = "Watts"
}
y, err := lp.NewMessage(name, map[string]string{"type": "node"}, m.meta, map[string]any{"value": v}, time.Now())
y, err := lp.NewMessage(name, map[string]string{"type": "node"}, m.meta, map[string]interface{}{"value": v}, time.Now())
if err == nil {
y.AddMeta("unit", unit)
output <- y
@@ -151,30 +149,24 @@ func (m *IpmiCollector) readIpmiTool(cmd string, output chan lp.CCMessage) {
}
func (m *IpmiCollector) readIpmiSensors(cmd string, output chan lp.CCMessage) {
// Setup ipmisensors command
command := exec.Command(cmd, "--comma-separated-output", "--sdr-cache-recreate")
stdout, _ := command.StdoutPipe()
errBuf := new(bytes.Buffer)
command.Stderr = errBuf
// start command
if err := command.Start(); err != nil {
cclog.ComponentError(
m.name,
fmt.Sprintf("readIpmiSensors(): Failed to start command \"%s\": %v", command.String(), err),
)
command := exec.Command(cmd, "--comma-separated-output", "--sdr-cache-recreate")
command.Wait()
stdout, err := command.Output()
if err != nil {
log.Print(err)
return
}
// Read command output
scanner := bufio.NewScanner(stdout)
for scanner.Scan() {
lv := strings.Split(scanner.Text(), ",")
ll := strings.Split(string(stdout), "\n")
for _, line := range ll {
lv := strings.Split(line, ",")
if len(lv) > 3 {
v, err := strconv.ParseFloat(lv[3], 64)
if err == nil {
name := strings.ToLower(strings.ReplaceAll(lv[1], " ", "_"))
y, err := lp.NewMessage(name, map[string]string{"type": "node"}, m.meta, map[string]any{"value": v}, time.Now())
name := strings.ToLower(strings.Replace(lv[1], " ", "_", -1))
y, err := lp.NewMessage(name, map[string]string{"type": "node"}, m.meta, map[string]interface{}{"value": v}, time.Now())
if err == nil {
if len(lv) > 4 {
y.AddMeta("unit", lv[4])
@@ -184,20 +176,10 @@ func (m *IpmiCollector) readIpmiSensors(cmd string, output chan lp.CCMessage) {
}
}
}
// Wait for command end
if err := command.Wait(); err != nil {
errMsg, _ := io.ReadAll(errBuf)
cclog.ComponentError(
m.name,
fmt.Sprintf("readIpmiSensors(): Failed to wait for the end of command \"%s\": %v\n", command.String(), err),
)
cclog.ComponentError(m.name, fmt.Sprintf("readIpmiSensors(): command stderr: \"%s\"\n", strings.TrimSpace(string(errMsg))))
return
}
}
func (m *IpmiCollector) Read(interval time.Duration, output chan lp.CCMessage) {
// Check if already initialized
if !m.init {
return

View File

@@ -14,7 +14,7 @@ hugo_path: docs/reference/cc-metric-collector/collectors/ipmi.md
```json
"ipmistat": {
"ipmitool_path": "/path/to/ipmitool",
"ipmisensors_path": "/path/to/ipmi-sensors"
"ipmisensors_path": "/path/to/ipmi-sensors",
}
```

View File

@@ -16,15 +16,14 @@ package collectors
import "C"
import (
"bytes"
"encoding/json"
"errors"
"fmt"
"maps"
"math"
"os"
"os/signal"
"os/user"
"slices"
"sort"
"strconv"
"strings"
"sync"
@@ -32,8 +31,8 @@ import (
"time"
"unsafe"
cclog "github.com/ClusterCockpit/cc-lib/v2/ccLogger"
lp "github.com/ClusterCockpit/cc-lib/v2/ccMessage"
cclog "github.com/ClusterCockpit/cc-lib/ccLogger"
lp "github.com/ClusterCockpit/cc-lib/ccMessage"
agg "github.com/ClusterCockpit/cc-metric-collector/internal/metricAggregator"
topo "github.com/ClusterCockpit/cc-metric-collector/pkg/ccTopology"
"github.com/NVIDIA/go-nvml/pkg/dl"
@@ -125,14 +124,22 @@ func checkMetricType(t string) bool {
return ok
}
func eventsToEventStr(events map[string]string) string {
elist := make([]string, 0)
for k, v := range events {
elist = append(elist, fmt.Sprintf("%s:%s", v, k))
}
return strings.Join(elist, ",")
}
func genLikwidEventSet(input LikwidCollectorEventsetConfig) LikwidEventsetConfig {
clist := make([]string, 0, len(input.Events))
tmplist := make([]string, 0)
clist := make([]string, 0)
for k := range input.Events {
clist = append(clist, k)
}
slices.Sort(clist)
tmplist := make([]string, 0, len(clist))
elist := make([]*C.char, 0, len(clist))
sort.Strings(clist)
elist := make([]*C.char, 0)
for _, k := range clist {
v := input.Events[k]
tmplist = append(tmplist, fmt.Sprintf("%s:%s", v, k))
@@ -142,7 +149,7 @@ func genLikwidEventSet(input LikwidCollectorEventsetConfig) LikwidEventsetConfig
estr := strings.Join(tmplist, ",")
res := make(map[int]map[string]float64)
met := make(map[int]map[string]float64)
for _, i := range topo.HwthreadList() {
for _, i := range topo.CpuList() {
res[i] = make(map[string]float64)
for k := range input.Events {
res[i][k] = 0.0
@@ -180,7 +187,7 @@ func getBaseFreq() float64 {
for _, f := range files {
buffer, err := os.ReadFile(f)
if err == nil {
data := strings.ReplaceAll(string(buffer), "\n", "")
data := strings.Replace(string(buffer), "\n", "", -1)
x, err := strconv.ParseInt(data, 0, 64)
if err == nil {
freq = float64(x)
@@ -207,30 +214,25 @@ func (m *LikwidCollector) Init(config json.RawMessage) error {
m.config.LibraryPath = LIKWID_LIB_NAME
m.config.LockfilePath = LIKWID_DEF_LOCKFILE
if len(config) > 0 {
d := json.NewDecoder(bytes.NewReader(config))
d.DisallowUnknownFields()
if err := d.Decode(&m.config); err != nil {
return fmt.Errorf("%s Init(): Error decoding JSON config: %w", m.name, err)
err := json.Unmarshal(config, &m.config)
if err != nil {
return err
}
}
lib := dl.New(m.config.LibraryPath, LIKWID_LIB_DL_FLAGS)
if lib == nil {
return fmt.Errorf("%s Init(): error instantiating DynamicLibrary for %s", m.name, m.config.LibraryPath)
return fmt.Errorf("error instantiating DynamicLibrary for %s", m.config.LibraryPath)
}
err := lib.Open()
if err != nil {
return fmt.Errorf("%s Init(): error opening %s: %w", m.name, m.config.LibraryPath, err)
return fmt.Errorf("error opening %s: %v", m.config.LibraryPath, err)
}
if m.config.ForceOverwrite {
cclog.ComponentDebug(m.name, "Set LIKWID_FORCE=1")
if err := os.Setenv("LIKWID_FORCE", "1"); err != nil {
return fmt.Errorf("%s Init(): error setting environment variable LIKWID_FORCE=1: %w", m.name, err)
}
}
if err := m.setup(); err != nil {
return fmt.Errorf("%s Init(): setup() call failed: %w", m.name, err)
os.Setenv("LIKWID_FORCE", "1")
}
m.setup()
m.meta = map[string]string{"group": "PerfCounter"}
cclog.ComponentDebug(m.name, "Get cpulist and init maps and lists")
@@ -296,12 +298,16 @@ func (m *LikwidCollector) Init(config json.RawMessage) error {
// If no event set could be added, shut down LikwidCollector
if totalMetrics == 0 {
return fmt.Errorf("%s Init(): no LIKWID eventset or metric usable", m.name)
err := errors.New("no LIKWID eventset or metric usable")
cclog.ComponentError(m.name, err.Error())
return err
}
ret := C.topology_init()
if ret != 0 {
return fmt.Errorf("%s Init(): failed to initialize topology module", m.name)
err := errors.New("failed to initialize topology module")
cclog.ComponentError(m.name, err.Error())
return err
}
m.measureThread = thread.New()
switch m.config.AccessMode {
@@ -310,14 +316,7 @@ func (m *LikwidCollector) Init(config json.RawMessage) error {
case "accessdaemon":
if len(m.config.DaemonPath) > 0 {
p := os.Getenv("PATH")
if len(p) > 0 {
p = m.config.DaemonPath + ":" + p
} else {
p = m.config.DaemonPath
}
if err := os.Setenv("PATH", p); err != nil {
return fmt.Errorf("%s Init(): error setting environment variable PATH=%s: %w", m.name, p, err)
}
os.Setenv("PATH", m.config.DaemonPath+":"+p)
}
C.HPMmode(1)
retCode := C.HPMinit()
@@ -328,7 +327,7 @@ func (m *LikwidCollector) Init(config json.RawMessage) error {
for _, c := range m.cpulist {
m.measureThread.Call(
func() {
retCode := C.HPMaddThread(C.uint32_t(c))
retCode := C.HPMaddThread(c)
if retCode != 0 {
err := fmt.Errorf("C.HPMaddThread(%v) failed with return code %v", c, retCode)
cclog.ComponentError(m.name, err.Error())
@@ -370,23 +369,16 @@ func (m *LikwidCollector) Init(config json.RawMessage) error {
// take a measurement for 'interval' seconds of event set index 'group'
func (m *LikwidCollector) takeMeasurement(evidx int, evset LikwidEventsetConfig, interval time.Duration) (bool, error) {
var ret C.int
var gid C.int = -1
sigchan := make(chan os.Signal, 1)
// Watch changes for the lock file ()
watcher, err := fsnotify.NewWatcher()
if err != nil {
cclog.ComponentError(
m.name,
fmt.Sprintf("takeMeasurement(): Failed to create a new fsnotify.Watcher: %v", err))
cclog.ComponentError(m.name, err.Error())
return true, err
}
defer func() {
if err := watcher.Close(); err != nil {
cclog.ComponentError(
m.name,
fmt.Sprintf("takeMeasurement(): Failed to close fsnotify.Watcher: %v", err))
}
}()
defer watcher.Close()
if len(m.config.LockfilePath) > 0 {
// Check if the lock file exists
info, err := os.Stat(m.config.LockfilePath)
@@ -394,11 +386,9 @@ func (m *LikwidCollector) takeMeasurement(evidx int, evset LikwidEventsetConfig,
// Create the lock file if it does not exist
file, createErr := os.Create(m.config.LockfilePath)
if createErr != nil {
return true, fmt.Errorf("failed to create lock file: %w", createErr)
}
if err := file.Close(); err != nil {
return true, fmt.Errorf("failed to close lock file: %w", err)
return true, fmt.Errorf("failed to create lock file: %v", createErr)
}
file.Close()
info, err = os.Stat(m.config.LockfilePath) // Recheck the file after creation
}
if err != nil {
@@ -450,7 +440,6 @@ func (m *LikwidCollector) takeMeasurement(evidx int, evset LikwidEventsetConfig,
signal.Notify(sigchan, syscall.SIGCHLD)
// Add an event string to LIKWID
var gid C.int
select {
case <-sigchan:
gid = -1
@@ -606,20 +595,20 @@ func (m *LikwidCollector) calcEventsetMetrics(evset LikwidEventsetConfig, interv
evset.metrics[tid][metric.Name] = value
// Now we have the result, send it with the proper tags
if !math.IsNaN(value) && metric.Publish {
y, err := lp.NewMessage(
fields := map[string]interface{}{"value": value}
y, err :=
lp.NewMessage(
metric.Name,
map[string]string{
"type": metric.Type,
},
m.meta,
map[string]any{
"value": value,
},
fields,
now,
)
if err == nil {
if metric.Type != "node" {
y.AddTag("type-id", strconv.Itoa(domain))
y.AddTag("type-id", fmt.Sprintf("%d", domain))
}
if len(metric.Unit) > 0 {
y.AddMeta("unit", metric.Unit)
@@ -644,14 +633,15 @@ func (m *LikwidCollector) calcEventsetMetrics(evset LikwidEventsetConfig, interv
}
for coreID, value := range totalCoreValues {
y, err := lp.NewMessage(
y, err :=
lp.NewMessage(
metric.Name,
map[string]string{
"type": "core",
"type-id": strconv.Itoa(coreID),
"type-id": fmt.Sprintf("%d", coreID),
},
m.meta,
map[string]any{
map[string]interface{}{
"value": value,
},
now,
@@ -680,14 +670,15 @@ func (m *LikwidCollector) calcEventsetMetrics(evset LikwidEventsetConfig, interv
}
for socketID, value := range totalSocketValues {
y, err := lp.NewMessage(
y, err :=
lp.NewMessage(
metric.Name,
map[string]string{
"type": "socket",
"type-id": strconv.Itoa(socketID),
"type-id": fmt.Sprintf("%d", socketID),
},
m.meta,
map[string]any{
map[string]interface{}{
"value": value,
},
now,
@@ -714,13 +705,14 @@ func (m *LikwidCollector) calcEventsetMetrics(evset LikwidEventsetConfig, interv
}
}
y, err := lp.NewMessage(
y, err :=
lp.NewMessage(
metric.Name,
map[string]string{
"type": "node",
},
m.meta,
map[string]any{
map[string]interface{}{
"value": totalNodeValue,
},
now,
@@ -756,7 +748,9 @@ func (m *LikwidCollector) calcGlobalMetrics(groups []LikwidEventsetConfig, inter
// Here we generate parameter list
params := make(map[string]float64)
for _, evset := range groups {
maps.Copy(params, evset.metrics[tid])
for mname, mres := range evset.metrics[tid] {
params[mname] = mres
}
}
params["gotime"] = interval.Seconds()
// Evaluate the metric
@@ -771,20 +765,21 @@ func (m *LikwidCollector) calcGlobalMetrics(groups []LikwidEventsetConfig, inter
// Now we have the result, send it with the proper tags
if !math.IsNaN(value) {
if metric.Publish {
y, err := lp.NewMessage(
y, err :=
lp.NewMessage(
metric.Name,
map[string]string{
"type": metric.Type,
},
m.meta,
map[string]any{
map[string]interface{}{
"value": value,
},
now,
)
if err == nil {
if metric.Type != "node" {
y.AddTag("type-id", strconv.Itoa(domain))
y.AddTag("type-id", fmt.Sprintf("%d", domain))
}
if len(metric.Unit) > 0 {
y.AddMeta("unit", metric.Unit)
@@ -800,7 +795,7 @@ func (m *LikwidCollector) calcGlobalMetrics(groups []LikwidEventsetConfig, inter
}
func (m *LikwidCollector) ReadThread(interval time.Duration, output chan lp.CCMessage) {
var err error
var err error = nil
groups := make([]LikwidEventsetConfig, 0)
for evidx, evset := range m.config.Eventsets {
@@ -818,21 +813,13 @@ func (m *LikwidCollector) ReadThread(interval time.Duration, output chan lp.CCMe
if !skip {
// read measurements and derive event set metrics
err = m.calcEventsetMetrics(e, interval, output)
if err != nil {
cclog.ComponentError(m.name, err.Error())
return
}
m.calcEventsetMetrics(e, interval, output)
groups = append(groups, e)
}
}
if len(groups) > 0 {
// calculate global metrics
err = m.calcGlobalMetrics(groups, interval, output)
if err != nil {
cclog.ComponentError(m.name, err.Error())
return
}
m.calcGlobalMetrics(groups, interval, output)
}
}

View File

@@ -8,17 +8,15 @@
package collectors
import (
"bytes"
"encoding/json"
"fmt"
"os"
"slices"
"strconv"
"strings"
"time"
cclog "github.com/ClusterCockpit/cc-lib/v2/ccLogger"
lp "github.com/ClusterCockpit/cc-lib/v2/ccMessage"
cclog "github.com/ClusterCockpit/cc-lib/ccLogger"
lp "github.com/ClusterCockpit/cc-lib/ccMessage"
)
// LoadavgCollector collects:
@@ -31,7 +29,6 @@ const LOADAVGFILE = "/proc/loadavg"
type LoadavgCollector struct {
metricCollector
tags map[string]string
load_matches []string
load_skips []bool
@@ -45,39 +42,32 @@ type LoadavgCollector struct {
func (m *LoadavgCollector) Init(config json.RawMessage) error {
m.name = "LoadavgCollector"
m.parallel = true
if err := m.setup(); err != nil {
return fmt.Errorf("%s Init(): setup() call failed: %w", m.name, err)
}
m.setup()
if len(config) > 0 {
d := json.NewDecoder(bytes.NewReader(config))
d.DisallowUnknownFields()
if err := d.Decode(&m.config); err != nil {
return fmt.Errorf("%s Init(): Error decoding JSON config: %w", m.name, err)
err := json.Unmarshal(config, &m.config)
if err != nil {
return err
}
}
m.meta = map[string]string{
"source": m.name,
"group": "LOAD",
}
"group": "LOAD"}
m.tags = map[string]string{"type": "node"}
m.load_matches = []string{
"load_one",
"load_five",
"load_fifteen",
}
"load_fifteen"}
m.load_skips = make([]bool, len(m.load_matches))
m.proc_matches = []string{
"proc_run",
"proc_total",
}
m.load_skips = make([]bool, len(m.load_matches))
for i, name := range m.load_matches {
m.load_skips[i] = slices.Contains(m.config.ExcludeMetrics, name)
}
"proc_total"}
m.proc_skips = make([]bool, len(m.proc_matches))
for i, name := range m.load_matches {
_, m.load_skips[i] = stringArrayContains(m.config.ExcludeMetrics, name)
}
for i, name := range m.proc_matches {
m.proc_skips[i] = slices.Contains(m.config.ExcludeMetrics, name)
_, m.proc_skips[i] = stringArrayContains(m.config.ExcludeMetrics, name)
}
m.init = true
return nil
@@ -109,7 +99,7 @@ func (m *LoadavgCollector) Read(interval time.Duration, output chan lp.CCMessage
if m.load_skips[i] {
continue
}
y, err := lp.NewMessage(name, m.tags, m.meta, map[string]any{"value": x}, now)
y, err := lp.NewMessage(name, m.tags, m.meta, map[string]interface{}{"value": x}, now)
if err == nil {
output <- y
}
@@ -128,7 +118,7 @@ func (m *LoadavgCollector) Read(interval time.Duration, output chan lp.CCMessage
if m.proc_skips[i] {
continue
}
y, err := lp.NewMessage(name, m.tags, m.meta, map[string]any{"value": x}, now)
y, err := lp.NewMessage(name, m.tags, m.meta, map[string]interface{}{"value": x}, now)
if err == nil {
output <- y
}

View File

@@ -8,25 +8,22 @@
package collectors
import (
"bytes"
"encoding/json"
"errors"
"fmt"
"os/exec"
"os/user"
"slices"
"strconv"
"strings"
"time"
lp "github.com/ClusterCockpit/cc-lib/v2/ccMessage"
cclog "github.com/ClusterCockpit/cc-lib/ccLogger"
lp "github.com/ClusterCockpit/cc-lib/ccMessage"
)
const (
LUSTRE_SYSFS = `/sys/fs/lustre`
LCTL_CMD = `lctl`
LCTL_OPTION = `get_param`
)
const LUSTRE_SYSFS = `/sys/fs/lustre`
const LCTL_CMD = `lctl`
const LCTL_OPTION = `get_param`
type LustreCollectorConfig struct {
LCtlCommand string `json:"lctl_command,omitempty"`
@@ -47,7 +44,6 @@ type LustreMetricDefinition struct {
type LustreCollector struct {
metricCollector
tags map[string]string
config LustreCollectorConfig
lctl string
@@ -65,6 +61,7 @@ func (m *LustreCollector) getDeviceDataCommand(device string) []string {
} else {
command = exec.Command(m.lctl, LCTL_OPTION, statsfile)
}
command.Wait()
stdout, _ := command.Output()
return strings.Split(string(stdout), "\n")
}
@@ -300,15 +297,12 @@ func (m *LustreCollector) Init(config json.RawMessage) error {
m.name = "LustreCollector"
m.parallel = true
if len(config) > 0 {
d := json.NewDecoder(bytes.NewReader(config))
d.DisallowUnknownFields()
if err := d.Decode(&m.config); err != nil {
return fmt.Errorf("%s Init(): Error decoding JSON config: %w", m.name, err)
err = json.Unmarshal(config, &m.config)
if err != nil {
return err
}
}
if err := m.setup(); err != nil {
return fmt.Errorf("%s Init(): setup() call failed: %w", m.name, err)
}
m.setup()
m.tags = map[string]string{"type": "node"}
m.meta = map[string]string{"source": m.name, "group": "Lustre"}
@@ -317,15 +311,18 @@ func (m *LustreCollector) Init(config json.RawMessage) error {
if !m.config.Sudo {
user, err := user.Current()
if err != nil {
return fmt.Errorf("%s Init(): Failed to get current user: %w", m.name, err)
cclog.ComponentError(m.name, "Failed to get current user:", err.Error())
return err
}
if user.Uid != "0" {
return fmt.Errorf("%s Init(): Lustre file system statistics can only be queried by user root", m.name)
cclog.ComponentError(m.name, "Lustre file system statistics can only be queried by user root")
return err
}
} else {
p, err := exec.LookPath("sudo")
if err != nil {
return fmt.Errorf("%s Init(): Cannot find 'sudo': %w", m.name, err)
cclog.ComponentError(m.name, "Cannot find 'sudo'")
return err
}
m.sudoCmd = p
}
@@ -334,7 +331,7 @@ func (m *LustreCollector) Init(config json.RawMessage) error {
if err != nil {
p, err = exec.LookPath(LCTL_CMD)
if err != nil {
return fmt.Errorf("%s Init(): Cannot find %s command: %w", m.name, LCTL_CMD, err)
return err
}
}
m.lctl = p
@@ -342,32 +339,32 @@ func (m *LustreCollector) Init(config json.RawMessage) error {
m.definitions = []LustreMetricDefinition{}
if m.config.SendAbsoluteValues {
for _, def := range LustreAbsMetrics {
if !slices.Contains(m.config.ExcludeMetrics, def.name) {
if _, skip := stringArrayContains(m.config.ExcludeMetrics, def.name); !skip {
m.definitions = append(m.definitions, def)
}
}
}
if m.config.SendDiffValues {
for _, def := range LustreDiffMetrics {
if !slices.Contains(m.config.ExcludeMetrics, def.name) {
if _, skip := stringArrayContains(m.config.ExcludeMetrics, def.name); !skip {
m.definitions = append(m.definitions, def)
}
}
}
if m.config.SendDerivedValues {
for _, def := range LustreDeriveMetrics {
if !slices.Contains(m.config.ExcludeMetrics, def.name) {
if _, skip := stringArrayContains(m.config.ExcludeMetrics, def.name); !skip {
m.definitions = append(m.definitions, def)
}
}
}
if len(m.definitions) == 0 {
return fmt.Errorf("%s Init(): no metrics to collect", m.name)
return errors.New("no metrics to collect")
}
devices := m.getDevices()
if len(devices) == 0 {
return fmt.Errorf("%s Init(): no Lustre devices found", m.name)
return errors.New("no Lustre devices found")
}
m.stats = make(map[string]map[string]int64)
for _, d := range devices {
@@ -405,23 +402,23 @@ func (m *LustreCollector) Read(interval time.Duration, output chan lp.CCMessage)
} else {
use_x = devData[def.name]
}
var value any
var value interface{}
switch def.calc {
case "none":
value = use_x
y, err = lp.NewMessage(def.name, m.tags, m.meta, map[string]any{"value": value}, time.Now())
y, err = lp.NewMessage(def.name, m.tags, m.meta, map[string]interface{}{"value": value}, time.Now())
case "difference":
value = use_x - devData[def.name]
if value.(int64) < 0 {
value = 0
}
y, err = lp.NewMessage(def.name, m.tags, m.meta, map[string]any{"value": value}, time.Now())
y, err = lp.NewMessage(def.name, m.tags, m.meta, map[string]interface{}{"value": value}, time.Now())
case "derivative":
value = float64(use_x-devData[def.name]) / tdiff.Seconds()
if value.(float64) < 0 {
value = 0
}
y, err = lp.NewMessage(def.name, m.tags, m.meta, map[string]any{"value": value}, time.Now())
y, err = lp.NewMessage(def.name, m.tags, m.meta, map[string]interface{}{"value": value}, time.Now())
}
if err == nil {
y.AddTag("device", device)

View File

@@ -9,25 +9,22 @@ package collectors
import (
"bufio"
"bytes"
"encoding/json"
"errors"
"fmt"
"os"
"path/filepath"
"regexp"
"slices"
"strconv"
"strings"
"time"
cclog "github.com/ClusterCockpit/cc-lib/v2/ccLogger"
lp "github.com/ClusterCockpit/cc-lib/v2/ccMessage"
cclog "github.com/ClusterCockpit/cc-lib/ccLogger"
lp "github.com/ClusterCockpit/cc-lib/ccMessage"
)
const (
MEMSTATFILE = "/proc/meminfo"
NUMA_MEMSTAT_BASE = "/sys/devices/system/node"
)
const MEMSTATFILE = "/proc/meminfo"
const NUMA_MEMSTAT_BASE = "/sys/devices/system/node"
type MemstatCollectorConfig struct {
ExcludeMetrics []string `json:"exclude_metrics"`
@@ -42,7 +39,6 @@ type MemstatCollectorNode struct {
type MemstatCollector struct {
metricCollector
stats map[string]int64
tags map[string]string
matches map[string]string
@@ -62,11 +58,7 @@ func getStats(filename string) map[string]MemstatStats {
if err != nil {
cclog.Error(err.Error())
}
defer func() {
if err := file.Close(); err != nil {
cclog.Error(err.Error())
}
}()
defer file.Close()
scanner := bufio.NewScanner(file)
for scanner.Scan() {
@@ -95,15 +87,15 @@ func getStats(filename string) map[string]MemstatStats {
}
func (m *MemstatCollector) Init(config json.RawMessage) error {
var err error
m.name = "MemstatCollector"
m.parallel = true
m.config.NodeStats = true
m.config.NumaStats = false
if len(config) > 0 {
d := json.NewDecoder(bytes.NewReader(config))
d.DisallowUnknownFields()
if err := d.Decode(&m.config); err != nil {
return fmt.Errorf("%s Init(): Error decoding JSON config: %w", m.name, err)
err = json.Unmarshal(config, &m.config)
if err != nil {
return err
}
}
m.meta = map[string]string{"source": m.name, "group": "Memory"}
@@ -123,24 +115,23 @@ func (m *MemstatCollector) Init(config json.RawMessage) error {
"MemShared": "mem_shared",
}
for k, v := range matches {
if !slices.Contains(m.config.ExcludeMetrics, k) {
_, skip := stringArrayContains(m.config.ExcludeMetrics, k)
if !skip {
m.matches[k] = v
}
}
m.sendMemUsed = false
if !slices.Contains(m.config.ExcludeMetrics, "mem_used") {
if _, skip := stringArrayContains(m.config.ExcludeMetrics, "mem_used"); !skip {
m.sendMemUsed = true
}
if len(m.matches) == 0 {
return fmt.Errorf("%s Init(): no metrics to collect", m.name)
}
if err := m.setup(); err != nil {
return fmt.Errorf("%s Init(): setup() call failed: %w", m.name, err)
return errors.New("no metrics to collect")
}
m.setup()
if m.config.NodeStats {
if stats := getStats(MEMSTATFILE); len(stats) == 0 {
return fmt.Errorf("%s Init(): cannot read data from file %s", m.name, MEMSTATFILE)
return fmt.Errorf("cannot read data from file %s", MEMSTATFILE)
}
}
@@ -152,7 +143,7 @@ func (m *MemstatCollector) Init(config json.RawMessage) error {
m.nodefiles = make(map[int]MemstatCollectorNode)
for _, f := range files {
if stats := getStats(f); len(stats) == 0 {
return fmt.Errorf("%s Init(): cannot read data from file %s", m.name, f)
return fmt.Errorf("cannot read data from file %s", f)
}
rematch := regex.FindStringSubmatch(f)
if len(rematch) == 2 {
@@ -162,7 +153,7 @@ func (m *MemstatCollector) Init(config json.RawMessage) error {
file: f,
tags: map[string]string{
"type": "memoryDomain",
"type-id": strconv.Itoa(id),
"type-id": fmt.Sprintf("%d", id),
},
}
m.nodefiles[id] = f
@@ -172,7 +163,7 @@ func (m *MemstatCollector) Init(config json.RawMessage) error {
}
}
m.init = true
return nil
return err
}
func (m *MemstatCollector) Read(interval time.Duration, output chan lp.CCMessage) {
@@ -183,7 +174,7 @@ func (m *MemstatCollector) Read(interval time.Duration, output chan lp.CCMessage
sendStats := func(stats map[string]MemstatStats, tags map[string]string) {
for match, name := range m.matches {
var value float64 = 0
unit := ""
var unit string = ""
if v, ok := stats[match]; ok {
value = v.value
if len(v.unit) > 0 {
@@ -191,7 +182,7 @@ func (m *MemstatCollector) Read(interval time.Duration, output chan lp.CCMessage
}
}
y, err := lp.NewMessage(name, tags, m.meta, map[string]any{"value": value}, time.Now())
y, err := lp.NewMessage(name, tags, m.meta, map[string]interface{}{"value": value}, time.Now())
if err == nil {
if len(unit) > 0 {
y.AddMeta("unit", unit)
@@ -224,7 +215,7 @@ func (m *MemstatCollector) Read(interval time.Duration, output chan lp.CCMessage
}
}
}
y, err := lp.NewMessage("mem_used", tags, m.meta, map[string]any{"value": memUsed}, time.Now())
y, err := lp.NewMessage("mem_used", tags, m.meta, map[string]interface{}{"value": memUsed}, time.Now())
if err == nil {
if len(unit) > 0 {
y.AddMeta("unit", unit)

View File

@@ -12,7 +12,7 @@ import (
"fmt"
"time"
lp "github.com/ClusterCockpit/cc-lib/v2/ccMessage"
lp "github.com/ClusterCockpit/cc-lib/ccMessage"
)
type MetricCollector interface {
@@ -51,6 +51,30 @@ func (c *metricCollector) Initialized() bool {
return c.init
}
// intArrayContains scans an array of ints if the value str is present in the array
// If the specified value is found, the corresponding array index is returned.
// The bool value is used to signal success or failure
func intArrayContains(array []int, str int) (int, bool) {
for i, a := range array {
if a == str {
return i, true
}
}
return -1, false
}
// stringArrayContains scans an array of strings if the value str is present in the array
// If the specified value is found, the corresponding array index is returned.
// The bool value is used to signal success or failure
func stringArrayContains(array []string, str string) (int, bool) {
for i, a := range array {
if a == str {
return i, true
}
}
return -1, false
}
// RemoveFromStringList removes the string r from the array of strings s
// If r is not contained in the array an error is returned
func RemoveFromStringList(s []string, r string) ([]string, error) {

View File

@@ -9,17 +9,15 @@ package collectors
import (
"bufio"
"bytes"
"encoding/json"
"fmt"
"errors"
"os"
"slices"
"strconv"
"strings"
"time"
cclog "github.com/ClusterCockpit/cc-lib/v2/ccLogger"
lp "github.com/ClusterCockpit/cc-lib/v2/ccMessage"
cclog "github.com/ClusterCockpit/cc-lib/ccLogger"
lp "github.com/ClusterCockpit/cc-lib/ccMessage"
)
const NETSTATFILE = "/proc/net/dev"
@@ -42,7 +40,6 @@ type NetstatCollectorMetric struct {
type NetstatCollector struct {
metricCollector
config NetstatCollectorConfig
aliasToCanonical map[string]string
matches map[string][]NetstatCollectorMetric
@@ -68,9 +65,7 @@ func getCanonicalName(raw string, aliasToCanonical map[string]string) string {
func (m *NetstatCollector) Init(config json.RawMessage) error {
m.name = "NetstatCollector"
m.parallel = true
if err := m.setup(); err != nil {
return fmt.Errorf("%s Init(): setup() call failed: %w", m.name, err)
}
m.setup()
m.lastTimestamp = time.Now()
const (
@@ -100,10 +95,10 @@ func (m *NetstatCollector) Init(config json.RawMessage) error {
m.config.SendDerivedValues = false
// Read configuration file, allow overwriting default config
if len(config) > 0 {
d := json.NewDecoder(bytes.NewReader(config))
d.DisallowUnknownFields()
if err := d.Decode(&m.config); err != nil {
return fmt.Errorf("%s Init(): failed to decode JSON config: %w", m.name, err)
err := json.Unmarshal(config, &m.config)
if err != nil {
cclog.ComponentError(m.name, "Error reading config:", err.Error())
return err
}
}
@@ -112,8 +107,10 @@ func (m *NetstatCollector) Init(config json.RawMessage) error {
// Check access to net statistic file
file, err := os.Open(NETSTATFILE)
if err != nil {
return fmt.Errorf("%s Init(): failed to open netstat file \"%s\": %w", m.name, NETSTATFILE, err)
cclog.ComponentError(m.name, err.Error())
return err
}
defer file.Close()
scanner := bufio.NewScanner(file)
for scanner.Scan() {
@@ -132,33 +129,13 @@ func (m *NetstatCollector) Init(config json.RawMessage) error {
canonical := getCanonicalName(raw, m.aliasToCanonical)
// Check if device is a included device
if slices.Contains(m.config.IncludeDevices, canonical) {
if _, ok := stringArrayContains(m.config.IncludeDevices, canonical); ok {
// Tag will contain original device name (raw).
tags := map[string]string{
"stype": "network",
"stype-id": raw,
"type": "node",
}
meta_unit_byte := map[string]string{
"source": m.name,
"group": "Network",
"unit": "bytes",
}
meta_unit_byte_per_sec := map[string]string{
"source": m.name,
"group": "Network",
"unit": "bytes/sec",
}
meta_unit_pkts := map[string]string{
"source": m.name,
"group": "Network",
"unit": "packets",
}
meta_unit_pkts_per_sec := map[string]string{
"source": m.name,
"group": "Network",
"unit": "packets/sec",
}
tags := map[string]string{"stype": "network", "stype-id": raw, "type": "node"}
meta_unit_byte := map[string]string{"source": m.name, "group": "Network", "unit": "bytes"}
meta_unit_byte_per_sec := map[string]string{"source": m.name, "group": "Network", "unit": "bytes/sec"}
meta_unit_pkts := map[string]string{"source": m.name, "group": "Network", "unit": "packets"}
meta_unit_pkts_per_sec := map[string]string{"source": m.name, "group": "Network", "unit": "packets/sec"}
m.matches[canonical] = []NetstatCollectorMetric{
{
@@ -197,13 +174,8 @@ func (m *NetstatCollector) Init(config json.RawMessage) error {
}
}
// Close netstat file
if err := file.Close(); err != nil {
return fmt.Errorf("%s Init(): failed to close netstat file \"%s\": %w", m.name, NETSTATFILE, err)
}
if len(m.matches) == 0 {
return fmt.Errorf("%s Init(): no devices to collect metrics found", m.name)
return errors.New("no devices to collector metrics found")
}
m.init = true
return nil
@@ -222,18 +194,10 @@ func (m *NetstatCollector) Read(interval time.Duration, output chan lp.CCMessage
file, err := os.Open(NETSTATFILE)
if err != nil {
cclog.ComponentError(
m.name,
fmt.Sprintf("Read(): Failed to open file '%s': %v", NETSTATFILE, err))
cclog.ComponentError(m.name, err.Error())
return
}
defer func() {
if err := file.Close(); err != nil {
cclog.ComponentError(
m.name,
fmt.Sprintf("Read(): Failed to close file '%s': %v", NETSTATFILE, err))
}
}()
defer file.Close()
scanner := bufio.NewScanner(file)
for scanner.Scan() {
@@ -262,14 +226,14 @@ func (m *NetstatCollector) Read(interval time.Duration, output chan lp.CCMessage
continue
}
if m.config.SendAbsoluteValues {
if y, err := lp.NewMessage(metric.name, metric.tags, metric.meta, map[string]any{"value": v}, now); err == nil {
if y, err := lp.NewMessage(metric.name, metric.tags, metric.meta, map[string]interface{}{"value": v}, now); err == nil {
output <- y
}
}
if m.config.SendDerivedValues {
if metric.lastValue >= 0 {
rate := float64(v-metric.lastValue) / timeDiff
if y, err := lp.NewMessage(metric.name+"_bw", metric.tags, metric.meta_rates, map[string]any{"value": rate}, now); err == nil {
if y, err := lp.NewMessage(metric.name+"_bw", metric.tags, metric.meta_rates, map[string]interface{}{"value": rate}, now); err == nil {
output <- y
}
}

View File

@@ -8,10 +8,9 @@
package collectors
import (
"bytes"
"encoding/json"
"fmt"
"slices"
"log"
// "os"
"os/exec"
@@ -19,8 +18,7 @@ import (
"strings"
"time"
cclog "github.com/ClusterCockpit/cc-lib/v2/ccLogger"
lp "github.com/ClusterCockpit/cc-lib/v2/ccMessage"
lp "github.com/ClusterCockpit/cc-lib/ccMessage"
)
// First part contains the code for the general NfsCollector.
@@ -35,7 +33,6 @@ type NfsCollectorData struct {
type nfsCollector struct {
metricCollector
tags map[string]string
version string
config struct {
@@ -45,56 +42,68 @@ type nfsCollector struct {
data map[string]NfsCollectorData
}
func (m *nfsCollector) updateStats() error {
cmd := exec.Command(m.config.Nfsstats, "-l", "--all")
func (m *nfsCollector) initStats() error {
cmd := exec.Command(m.config.Nfsstats, `-l`, `--all`)
cmd.Wait()
buffer, err := cmd.Output()
if err != nil {
return err
}
for name, data := range m.data {
m.data[name] = NfsCollectorData{
last: data.current,
current: -1,
}
}
for line := range strings.Lines(string(buffer)) {
if err == nil {
for _, line := range strings.Split(string(buffer), "\n") {
lf := strings.Fields(line)
if len(lf) != 5 {
continue
}
if lf[1] != m.version {
if lf[1] == m.version {
name := strings.Trim(lf[3], ":")
if _, exist := m.data[name]; !exist {
value, err := strconv.ParseInt(lf[4], 0, 64)
if err == nil {
x := m.data[name]
x.current = value
x.last = value
m.data[name] = x
}
}
}
}
}
return err
}
func (m *nfsCollector) updateStats() error {
cmd := exec.Command(m.config.Nfsstats, `-l`, `--all`)
cmd.Wait()
buffer, err := cmd.Output()
if err == nil {
for _, line := range strings.Split(string(buffer), "\n") {
lf := strings.Fields(line)
if len(lf) != 5 {
continue
}
if lf[1] == m.version {
name := strings.Trim(lf[3], ":")
if _, exist := m.data[name]; exist {
value, err := strconv.ParseInt(lf[4], 0, 64)
if err != nil {
if err == nil {
x := m.data[name]
x.last = x.current
x.current = value
m.data[name] = x
}
}
}
}
}
return err
}
collectorData, exist := m.data[name]
collectorData.current = value
if !exist {
collectorData.last = -1
}
m.data[name] = collectorData
}
return nil
}
func (m *nfsCollector) MainInit(config json.RawMessage) error {
m.config.Nfsstats = string(NFSSTAT_EXEC)
// Read JSON configuration
if len(config) > 0 {
d := json.NewDecoder(bytes.NewReader(config))
d.DisallowUnknownFields()
if err := d.Decode(&m.config); err != nil {
return fmt.Errorf("%s Init(): failed to decode JSON config: %w", m.name, err)
err := json.Unmarshal(config, &m.config)
if err != nil {
log.Print(err.Error())
return err
}
}
m.meta = map[string]string{
@@ -107,12 +116,10 @@ func (m *nfsCollector) MainInit(config json.RawMessage) error {
// Check if nfsstat is in executable search path
_, err := exec.LookPath(m.config.Nfsstats)
if err != nil {
return fmt.Errorf("%s Init(): Failed to find nfsstat binary '%s': %w", m.name, m.config.Nfsstats, err)
return fmt.Errorf("NfsCollector.Init(): Failed to find nfsstat binary '%s': %v", m.config.Nfsstats, err)
}
m.data = make(map[string]NfsCollectorData)
if err := m.updateStats(); err != nil {
return fmt.Errorf("%s Init(): %w", m.name, err)
}
m.initStats()
m.init = true
m.parallel = true
return nil
@@ -124,14 +131,8 @@ func (m *nfsCollector) Read(interval time.Duration, output chan lp.CCMessage) {
}
timestamp := time.Now()
if err := m.updateStats(); err != nil {
cclog.ComponentError(
m.name,
fmt.Sprintf("Read(): updateStats() failed: %v", err),
)
return
}
var prefix string
m.updateStats()
prefix := ""
switch m.version {
case "v3":
prefix = "nfs3"
@@ -142,15 +143,11 @@ func (m *nfsCollector) Read(interval time.Duration, output chan lp.CCMessage) {
}
for name, data := range m.data {
if slices.Contains(m.config.ExcludeMetrics, name) {
if _, skip := stringArrayContains(m.config.ExcludeMetrics, name); skip {
continue
}
valueMap := make(map[string]any)
if data.current >= 0 && data.last >= 0 {
valueMap["value"] = data.current - data.last
}
y, err := lp.NewMessage(fmt.Sprintf("%s_%s", prefix, name), m.tags, m.meta, valueMap, timestamp)
value := data.current - data.last
y, err := lp.NewMessage(fmt.Sprintf("%s_%s", prefix, name), m.tags, m.meta, map[string]interface{}{"value": value}, timestamp)
if err == nil {
y.AddMeta("version", m.version)
output <- y
@@ -173,17 +170,13 @@ type Nfs4Collector struct {
func (m *Nfs3Collector) Init(config json.RawMessage) error {
m.name = "Nfs3Collector"
m.version = `v3`
if err := m.setup(); err != nil {
return fmt.Errorf("%s Init(): setup() call failed: %w", m.name, err)
}
m.setup()
return m.MainInit(config)
}
func (m *Nfs4Collector) Init(config json.RawMessage) error {
m.name = "Nfs4Collector"
m.version = `v4`
if err := m.setup(); err != nil {
return fmt.Errorf("%s Init(): setup() call failed: %w", m.name, err)
}
m.setup()
return m.MainInit(config)
}

View File

@@ -8,23 +8,22 @@
package collectors
import (
"bytes"
"encoding/json"
"fmt"
"os"
"regexp"
"slices"
"strconv"
"strings"
"time"
lp "github.com/ClusterCockpit/cc-lib/v2/ccMessage"
cclog "github.com/ClusterCockpit/cc-lib/ccLogger"
lp "github.com/ClusterCockpit/cc-lib/ccMessage"
)
// These are the fields we read from the JSON configuration
type NfsIOStatCollectorConfig struct {
ExcludeMetrics []string `json:"exclude_metrics,omitempty"`
ExcludeFilesystems []string `json:"exclude_filesystem,omitempty"`
ExcludeFilesystem []string `json:"exclude_filesystem,omitempty"`
UseServerAddressAsSType bool `json:"use_server_as_stype,omitempty"`
SendAbsoluteValues bool `json:"send_abs_values"`
SendDerivedValues bool `json:"send_derived_values"`
@@ -34,7 +33,6 @@ type NfsIOStatCollectorConfig struct {
// defined by metricCollector (name, init, ...)
type NfsIOStatCollector struct {
metricCollector
config NfsIOStatCollectorConfig // the configuration structure
meta map[string]string // default meta information
tags map[string]string // default tags
@@ -43,10 +41,8 @@ type NfsIOStatCollector struct {
lastTimestamp time.Time
}
var (
deviceRegex = regexp.MustCompile(`device (?P<server>[^ ]+) mounted on (?P<mntpoint>[^ ]+) with fstype nfs(?P<version>\d*) statvers=[\d\.]+`)
bytesRegex = regexp.MustCompile(`\s+bytes:\s+(?P<nread>[^ ]+) (?P<nwrite>[^ ]+) (?P<dread>[^ ]+) (?P<dwrite>[^ ]+) (?P<nfsread>[^ ]+) (?P<nfswrite>[^ ]+) (?P<pageread>[^ ]+) (?P<pagewrite>[^ ]+)`)
)
var deviceRegex = regexp.MustCompile(`device (?P<server>[^ ]+) mounted on (?P<mntpoint>[^ ]+) with fstype nfs(?P<version>\d*) statvers=[\d\.]+`)
var bytesRegex = regexp.MustCompile(`\s+bytes:\s+(?P<nread>[^ ]+) (?P<nwrite>[^ ]+) (?P<dread>[^ ]+) (?P<dwrite>[^ ]+) (?P<nfsread>[^ ]+) (?P<nfswrite>[^ ]+) (?P<pageread>[^ ]+) (?P<pagewrite>[^ ]+)`)
func resolve_regex_fields(s string, regex *regexp.Regexp) map[string]string {
fields := make(map[string]string)
@@ -75,7 +71,7 @@ func (m *NfsIOStatCollector) readNfsiostats() map[string]map[string]int64 {
// Is this a device line with mount point, remote target and NFS version?
dev := resolve_regex_fields(l, deviceRegex)
if len(dev) > 0 {
if !slices.Contains(m.config.ExcludeFilesystems, dev[m.key]) {
if _, ok := stringArrayContains(m.config.ExcludeFilesystem, dev[m.key]); !ok {
current = dev
if len(current["version"]) == 0 {
current["version"] = "3"
@@ -89,7 +85,7 @@ func (m *NfsIOStatCollector) readNfsiostats() map[string]map[string]int64 {
if len(bytes) > 0 {
data[current[m.key]] = make(map[string]int64)
for name, sval := range bytes {
if !slices.Contains(m.config.ExcludeMetrics, name) {
if _, ok := stringArrayContains(m.config.ExcludeMetrics, name); !ok {
val, err := strconv.ParseInt(sval, 10, 64)
if err == nil {
data[current[m.key]][name] = val
@@ -104,10 +100,9 @@ func (m *NfsIOStatCollector) readNfsiostats() map[string]map[string]int64 {
}
func (m *NfsIOStatCollector) Init(config json.RawMessage) error {
var err error = nil
m.name = "NfsIOStatCollector"
if err := m.setup(); err != nil {
return fmt.Errorf("%s Init(): setup() call failed: %w", m.name, err)
}
m.setup()
m.parallel = true
m.meta = map[string]string{"source": m.name, "group": "NFS", "unit": "bytes"}
m.tags = map[string]string{"type": "node"}
@@ -116,10 +111,10 @@ func (m *NfsIOStatCollector) Init(config json.RawMessage) error {
m.config.SendAbsoluteValues = true
m.config.SendDerivedValues = false
if len(config) > 0 {
d := json.NewDecoder(bytes.NewReader(config))
d.DisallowUnknownFields()
if err := d.Decode(&m.config); err != nil {
return fmt.Errorf("%s Init(): failed to decode JSON config: %w", m.name, err)
err = json.Unmarshal(config, &m.config)
if err != nil {
cclog.ComponentError(m.name, "Error reading config:", err.Error())
return err
}
}
m.key = "mntpoint"
@@ -129,7 +124,7 @@ func (m *NfsIOStatCollector) Init(config json.RawMessage) error {
m.data = m.readNfsiostats()
m.lastTimestamp = time.Now()
m.init = true
return nil
return err
}
func (m *NfsIOStatCollector) Read(interval time.Duration, output chan lp.CCMessage) {
@@ -145,14 +140,7 @@ func (m *NfsIOStatCollector) Read(interval time.Duration, output chan lp.CCMessa
if old, ok := m.data[mntpoint]; ok {
for name, newVal := range values {
if m.config.SendAbsoluteValues {
msg, err := lp.NewMessage(
"nfsio_"+name,
m.tags,
m.meta,
map[string]any{
"value": newVal,
},
now)
msg, err := lp.NewMessage(fmt.Sprintf("nfsio_%s", name), m.tags, m.meta, map[string]interface{}{"value": newVal}, now)
if err == nil {
msg.AddTag("stype", "filesystem")
msg.AddTag("stype-id", mntpoint)
@@ -161,7 +149,7 @@ func (m *NfsIOStatCollector) Read(interval time.Duration, output chan lp.CCMessa
}
if m.config.SendDerivedValues {
rate := float64(newVal-old[name]) / timeDiff
msg, err := lp.NewMessage(fmt.Sprintf("nfsio_%s_bw", name), m.tags, m.meta, map[string]any{"value": rate}, now)
msg, err := lp.NewMessage(fmt.Sprintf("nfsio_%s_bw", name), m.tags, m.meta, map[string]interface{}{"value": rate}, now)
if err == nil {
if strings.HasPrefix(name, "page") {
msg.AddMeta("unit", "4K_pages/s")

View File

@@ -16,7 +16,7 @@ hugo_path: docs/reference/cc-metric-collector/collectors/nfsio.md
"exclude_metrics": [
"oread", "pageread"
],
"exclude_filesystem": [
"exclude_filesystems": [
"/mnt"
],
"use_server_as_stype": false,

View File

@@ -2,7 +2,6 @@ package collectors
import (
"bufio"
"bytes"
"encoding/json"
"fmt"
"os"
@@ -11,8 +10,8 @@ import (
"strings"
"time"
cclog "github.com/ClusterCockpit/cc-lib/v2/ccLogger"
lp "github.com/ClusterCockpit/cc-lib/v2/ccMessage"
cclog "github.com/ClusterCockpit/cc-lib/ccLogger"
lp "github.com/ClusterCockpit/cc-lib/ccMessage"
)
type NUMAStatsCollectorConfig struct {
@@ -60,7 +59,6 @@ type NUMAStatsCollectorTopolgy struct {
type NUMAStatsCollector struct {
metricCollector
topology []NUMAStatsCollectorTopolgy
config NUMAStatsCollectorConfig
lastTimestamp time.Time
@@ -74,9 +72,7 @@ func (m *NUMAStatsCollector) Init(config json.RawMessage) error {
m.name = "NUMAStatsCollector"
m.parallel = true
if err := m.setup(); err != nil {
return fmt.Errorf("%s Init(): setup() call failed: %w", m.name, err)
}
m.setup()
m.meta = map[string]string{
"source": m.name,
"group": "NUMA",
@@ -84,10 +80,9 @@ func (m *NUMAStatsCollector) Init(config json.RawMessage) error {
m.config.SendAbsoluteValues = true
if len(config) > 0 {
d := json.NewDecoder(bytes.NewReader(config))
d.DisallowUnknownFields()
if err := d.Decode(&m.config); err != nil {
return fmt.Errorf("%s Init(): Error decoding JSON config: %w", m.name, err)
err := json.Unmarshal(config, &m.config)
if err != nil {
return fmt.Errorf("unable to unmarshal numastat configuration: %s", err.Error())
}
}
@@ -96,10 +91,10 @@ func (m *NUMAStatsCollector) Init(config json.RawMessage) error {
globPattern := base + "[0-9]*"
dirs, err := filepath.Glob(globPattern)
if err != nil {
return fmt.Errorf("%s Init(): unable to glob files with pattern '%s'", m.name, globPattern)
return fmt.Errorf("unable to glob files with pattern '%s'", globPattern)
}
if dirs == nil {
return fmt.Errorf("%s Init(): unable to find any files with pattern '%s'", m.name, globPattern)
return fmt.Errorf("unable to find any files with pattern '%s'", globPattern)
}
m.topology = make([]NUMAStatsCollectorTopolgy, 0, len(dirs))
for _, dir := range dirs {
@@ -191,11 +186,7 @@ func (m *NUMAStatsCollector) Read(interval time.Duration, output chan lp.CCMessa
t.previousValues[key] = value
}
}
if err := file.Close(); err != nil {
cclog.ComponentError(
m.name,
fmt.Sprintf("Read(): Failed to close file '%s': %v", t.file, err))
}
file.Close()
}
}

View File

@@ -15,7 +15,7 @@ hugo_path: docs/reference/cc-metric-collector/collectors/numastat.md
"numastats": {
"send_abs_values" : true,
"send_derived_values" : true
}
}
```
The `numastat` collector reads data from `/sys/devices/system/node/node*/numastat` and outputs a handful **memoryDomain** metrics. See: <https://www.kernel.org/doc/html/latest/admin-guide/numastat.html>

View File

@@ -12,14 +12,11 @@ import (
"errors"
"fmt"
"log"
"maps"
"slices"
"strconv"
"strings"
"time"
cclog "github.com/ClusterCockpit/cc-lib/v2/ccLogger"
lp "github.com/ClusterCockpit/cc-lib/v2/ccMessage"
cclog "github.com/ClusterCockpit/cc-lib/ccLogger"
lp "github.com/ClusterCockpit/cc-lib/ccMessage"
"github.com/NVIDIA/go-nvml/pkg/nvml"
)
@@ -47,7 +44,6 @@ type NvidiaCollectorDevice struct {
type NvidiaCollector struct {
metricCollector
config NvidiaCollectorConfig
gpus []NvidiaCollectorDevice
num_gpus int
@@ -68,14 +64,11 @@ func (m *NvidiaCollector) Init(config json.RawMessage) error {
m.config.ProcessMigDevices = false
m.config.UseUuidForMigDevices = false
m.config.UseSliceForMigDevices = false
if err := m.setup(); err != nil {
return fmt.Errorf("%s Init(): setup() call failed: %w", m.name, err)
}
m.setup()
if len(config) > 0 {
d := json.NewDecoder(strings.NewReader(string(config)))
d.DisallowUnknownFields()
if err = d.Decode(&m.config); err != nil {
return fmt.Errorf("%s Init(): Error decoding JSON config: %w", m.name, err)
err = json.Unmarshal(config, &m.config)
if err != nil {
return err
}
}
m.meta = map[string]string{
@@ -91,28 +84,32 @@ func (m *NvidiaCollector) Init(config json.RawMessage) error {
// Error: NVML library not found
// (nvml.ErrorString can not be used in this case)
if ret == nvml.ERROR_LIBRARY_NOT_FOUND {
return fmt.Errorf("%s Init(): NVML library not found", m.name)
err = fmt.Errorf("NVML library not found")
cclog.ComponentError(m.name, err.Error())
return err
}
if ret != nvml.SUCCESS {
err = errors.New(nvml.ErrorString(ret))
return fmt.Errorf("%s Init(): Unable to initialize NVML: %w", m.name, err)
cclog.ComponentError(m.name, "Unable to initialize NVML", err.Error())
return err
}
// Number of NVIDIA GPUs
num_gpus, ret := nvml.DeviceGetCount()
if ret != nvml.SUCCESS {
err = errors.New(nvml.ErrorString(ret))
return fmt.Errorf("%s Init(): Unable to get device count: %w", m.name, err)
cclog.ComponentError(m.name, "Unable to get device count", err.Error())
return err
}
// For all GPUs
idx := 0
m.gpus = make([]NvidiaCollectorDevice, num_gpus)
for i := range num_gpus {
for i := 0; i < num_gpus; i++ {
// Skip excluded devices by ID
str_i := strconv.Itoa(i)
if slices.Contains(m.config.ExcludeDevices, str_i) {
str_i := fmt.Sprintf("%d", i)
if _, skip := stringArrayContains(m.config.ExcludeDevices, str_i); skip {
cclog.ComponentDebug(m.name, "Skipping excluded device", str_i)
continue
}
@@ -140,7 +137,7 @@ func (m *NvidiaCollector) Init(config json.RawMessage) error {
pciInfo.Device)
// Skip excluded devices specified by PCI ID
if slices.Contains(m.config.ExcludeDevices, pci_id) {
if _, skip := stringArrayContains(m.config.ExcludeDevices, pci_id); skip {
cclog.ComponentDebug(m.name, "Skipping excluded device", pci_id)
continue
}
@@ -225,20 +222,18 @@ func readMemoryInfo(device *NvidiaCollectorDevice, output chan lp.CCMessage) err
var total uint64
var used uint64
var reserved uint64 = 0
v2 := false
var v2 bool = false
meminfo, ret := nvml.DeviceGetMemoryInfo(device.device)
if ret != nvml.SUCCESS {
err := errors.New(nvml.ErrorString(ret))
return err
}
// Total physical device memory (in bytes)
total = meminfo.Total
// Sum of Reserved and Allocated device memory (in bytes)
used = meminfo.Used
if !device.excludeMetrics["nv_fb_mem_total"] {
t := float64(total) / (1024 * 1024)
y, err := lp.NewMetric("nv_fb_mem_total", device.tags, device.meta, t, time.Now())
y, err := lp.NewMessage("nv_fb_mem_total", device.tags, device.meta, map[string]interface{}{"value": t}, time.Now())
if err == nil {
y.AddMeta("unit", "MByte")
output <- y
@@ -247,7 +242,7 @@ func readMemoryInfo(device *NvidiaCollectorDevice, output chan lp.CCMessage) err
if !device.excludeMetrics["nv_fb_mem_used"] {
f := float64(used) / (1024 * 1024)
y, err := lp.NewMetric("nv_fb_mem_used", device.tags, device.meta, f, time.Now())
y, err := lp.NewMessage("nv_fb_mem_used", device.tags, device.meta, map[string]interface{}{"value": f}, time.Now())
if err == nil {
y.AddMeta("unit", "MByte")
output <- y
@@ -256,7 +251,7 @@ func readMemoryInfo(device *NvidiaCollectorDevice, output chan lp.CCMessage) err
if v2 && !device.excludeMetrics["nv_fb_mem_reserved"] {
r := float64(reserved) / (1024 * 1024)
y, err := lp.NewMetric("nv_fb_mem_reserved", device.tags, device.meta, r, time.Now())
y, err := lp.NewMessage("nv_fb_mem_reserved", device.tags, device.meta, map[string]interface{}{"value": r}, time.Now())
if err == nil {
y.AddMeta("unit", "MByte")
output <- y
@@ -275,7 +270,7 @@ func readBarMemoryInfo(device *NvidiaCollectorDevice, output chan lp.CCMessage)
}
if !device.excludeMetrics["nv_bar1_mem_total"] {
t := float64(meminfo.Bar1Total) / (1024 * 1024)
y, err := lp.NewMetric("nv_bar1_mem_total", device.tags, device.meta, t, time.Now())
y, err := lp.NewMessage("nv_bar1_mem_total", device.tags, device.meta, map[string]interface{}{"value": t}, time.Now())
if err == nil {
y.AddMeta("unit", "MByte")
output <- y
@@ -283,7 +278,7 @@ func readBarMemoryInfo(device *NvidiaCollectorDevice, output chan lp.CCMessage)
}
if !device.excludeMetrics["nv_bar1_mem_used"] {
t := float64(meminfo.Bar1Used) / (1024 * 1024)
y, err := lp.NewMetric("nv_bar1_mem_used", device.tags, device.meta, t, time.Now())
y, err := lp.NewMessage("nv_bar1_mem_used", device.tags, device.meta, map[string]interface{}{"value": t}, time.Now())
if err == nil {
y.AddMeta("unit", "MByte")
output <- y
@@ -317,14 +312,14 @@ func readUtilization(device *NvidiaCollectorDevice, output chan lp.CCMessage) er
util, ret := nvml.DeviceGetUtilizationRates(device.device)
if ret == nvml.SUCCESS {
if !device.excludeMetrics["nv_util"] {
y, err := lp.NewMetric("nv_util", device.tags, device.meta, float64(util.Gpu), time.Now())
y, err := lp.NewMessage("nv_util", device.tags, device.meta, map[string]interface{}{"value": float64(util.Gpu)}, time.Now())
if err == nil {
y.AddMeta("unit", "%")
output <- y
}
}
if !device.excludeMetrics["nv_mem_util"] {
y, err := lp.NewMetric("nv_mem_util", device.tags, device.meta, float64(util.Memory), time.Now())
y, err := lp.NewMessage("nv_mem_util", device.tags, device.meta, map[string]interface{}{"value": float64(util.Memory)}, time.Now())
if err == nil {
y.AddMeta("unit", "%")
output <- y
@@ -344,7 +339,7 @@ func readTemp(device *NvidiaCollectorDevice, output chan lp.CCMessage) error {
// * NVML_TEMPERATURE_COUNT
temp, ret := nvml.DeviceGetTemperature(device.device, nvml.TEMPERATURE_GPU)
if ret == nvml.SUCCESS {
y, err := lp.NewMetric("nv_temp", device.tags, device.meta, float64(temp), time.Now())
y, err := lp.NewMessage("nv_temp", device.tags, device.meta, map[string]interface{}{"value": float64(temp)}, time.Now())
if err == nil {
y.AddMeta("unit", "degC")
output <- y
@@ -367,7 +362,7 @@ func readFan(device *NvidiaCollectorDevice, output chan lp.CCMessage) error {
// This value may exceed 100% in certain cases.
fan, ret := nvml.DeviceGetFanSpeed(device.device)
if ret == nvml.SUCCESS {
y, err := lp.NewMetric("nv_fan", device.tags, device.meta, float64(fan), time.Now())
y, err := lp.NewMessage("nv_fan", device.tags, device.meta, map[string]interface{}{"value": float64(fan)}, time.Now())
if err == nil {
y.AddMeta("unit", "%")
output <- y
@@ -377,6 +372,27 @@ func readFan(device *NvidiaCollectorDevice, output chan lp.CCMessage) error {
return nil
}
// func readFans(device *NvidiaCollectorDevice, output chan lp.CCMessage) error {
// if !device.excludeMetrics["nv_fan"] {
// numFans, ret := nvml.DeviceGetNumFans(device.device)
// if ret == nvml.SUCCESS {
// for i := 0; i < numFans; i++ {
// fan, ret := nvml.DeviceGetFanSpeed_v2(device.device, i)
// if ret == nvml.SUCCESS {
// y, err := lp.NewMessage("nv_fan", device.tags, device.meta, map[string]interface{}{"value": float64(fan)}, time.Now())
// if err == nil {
// y.AddMeta("unit", "%")
// y.AddTag("stype", "fan")
// y.AddTag("stype-id", fmt.Sprintf("%d", i))
// output <- y
// }
// }
// }
// }
// }
// return nil
// }
func readEccMode(device *NvidiaCollectorDevice, output chan lp.CCMessage) error {
if !device.excludeMetrics["nv_ecc_mode"] {
// Retrieves the current and pending ECC modes for the device.
@@ -387,23 +403,22 @@ func readEccMode(device *NvidiaCollectorDevice, output chan lp.CCMessage) error
// Changing ECC modes requires a reboot.
// The "pending" ECC mode refers to the target mode following the next reboot.
_, ecc_pend, ret := nvml.DeviceGetEccMode(device.device)
switch ret {
case nvml.SUCCESS:
if ret == nvml.SUCCESS {
var y lp.CCMessage
var err error
switch ecc_pend {
case nvml.FEATURE_DISABLED:
y, err = lp.NewMetric("nv_ecc_mode", device.tags, device.meta, "OFF", time.Now())
y, err = lp.NewMessage("nv_ecc_mode", device.tags, device.meta, map[string]interface{}{"value": "OFF"}, time.Now())
case nvml.FEATURE_ENABLED:
y, err = lp.NewMetric("nv_ecc_mode", device.tags, device.meta, "ON", time.Now())
y, err = lp.NewMessage("nv_ecc_mode", device.tags, device.meta, map[string]interface{}{"value": "ON"}, time.Now())
default:
y, err = lp.NewMetric("nv_ecc_mode", device.tags, device.meta, "UNKNOWN", time.Now())
y, err = lp.NewMessage("nv_ecc_mode", device.tags, device.meta, map[string]interface{}{"value": "UNKNOWN"}, time.Now())
}
if err == nil {
output <- y
}
case nvml.ERROR_NOT_SUPPORTED:
y, err := lp.NewMetric("nv_ecc_mode", device.tags, device.meta, "N/A", time.Now())
} else if ret == nvml.ERROR_NOT_SUPPORTED {
y, err := lp.NewMessage("nv_ecc_mode", device.tags, device.meta, map[string]interface{}{"value": "N/A"}, time.Now())
if err == nil {
output <- y
}
@@ -423,7 +438,7 @@ func readPerfState(device *NvidiaCollectorDevice, output chan lp.CCMessage) erro
// 32: Unknown performance state.
pState, ret := nvml.DeviceGetPerformanceState(device.device)
if ret == nvml.SUCCESS {
y, err := lp.NewMetric("nv_perf_state", device.tags, device.meta, fmt.Sprintf("P%d", int(pState)), time.Now())
y, err := lp.NewMessage("nv_perf_state", device.tags, device.meta, map[string]interface{}{"value": fmt.Sprintf("P%d", int(pState))}, time.Now())
if err == nil {
output <- y
}
@@ -449,7 +464,7 @@ func readPowerUsage(device *NvidiaCollectorDevice, output chan lp.CCMessage) err
if mode == nvml.FEATURE_ENABLED {
power, ret := nvml.DeviceGetPowerUsage(device.device)
if ret == nvml.SUCCESS {
y, err := lp.NewMetric("nv_power_usage", device.tags, device.meta, float64(power)/1000, time.Now())
y, err := lp.NewMessage("nv_power_usage", device.tags, device.meta, map[string]interface{}{"value": float64(power) / 1000}, time.Now())
if err == nil {
y.AddMeta("unit", "watts")
output <- y
@@ -475,12 +490,7 @@ func readEnergyConsumption(device *NvidiaCollectorDevice, output chan lp.CCMessa
if ret == nvml.SUCCESS {
if device.lastEnergyReading != 0 {
if !device.excludeMetrics["nv_energy"] {
y, err := lp.NewMetric(
"nv_energy",
device.tags,
device.meta,
(energy-device.lastEnergyReading)/1000,
now)
y, err := lp.NewMetric("nv_energy", device.tags, device.meta, (energy-device.lastEnergyReading)/1000, now)
if err == nil {
y.AddMeta("unit", "Joules")
output <- y
@@ -522,7 +532,7 @@ func readClocks(device *NvidiaCollectorDevice, output chan lp.CCMessage) error {
if !device.excludeMetrics["nv_graphics_clock"] {
graphicsClock, ret := nvml.DeviceGetClockInfo(device.device, nvml.CLOCK_GRAPHICS)
if ret == nvml.SUCCESS {
y, err := lp.NewMetric("nv_graphics_clock", device.tags, device.meta, float64(graphicsClock), time.Now())
y, err := lp.NewMessage("nv_graphics_clock", device.tags, device.meta, map[string]interface{}{"value": float64(graphicsClock)}, time.Now())
if err == nil {
y.AddMeta("unit", "MHz")
output <- y
@@ -533,7 +543,7 @@ func readClocks(device *NvidiaCollectorDevice, output chan lp.CCMessage) error {
if !device.excludeMetrics["nv_sm_clock"] {
smCock, ret := nvml.DeviceGetClockInfo(device.device, nvml.CLOCK_SM)
if ret == nvml.SUCCESS {
y, err := lp.NewMetric("nv_sm_clock", device.tags, device.meta, float64(smCock), time.Now())
y, err := lp.NewMessage("nv_sm_clock", device.tags, device.meta, map[string]interface{}{"value": float64(smCock)}, time.Now())
if err == nil {
y.AddMeta("unit", "MHz")
output <- y
@@ -544,7 +554,7 @@ func readClocks(device *NvidiaCollectorDevice, output chan lp.CCMessage) error {
if !device.excludeMetrics["nv_mem_clock"] {
memClock, ret := nvml.DeviceGetClockInfo(device.device, nvml.CLOCK_MEM)
if ret == nvml.SUCCESS {
y, err := lp.NewMetric("nv_mem_clock", device.tags, device.meta, float64(memClock), time.Now())
y, err := lp.NewMessage("nv_mem_clock", device.tags, device.meta, map[string]interface{}{"value": float64(memClock)}, time.Now())
if err == nil {
y.AddMeta("unit", "MHz")
output <- y
@@ -554,7 +564,7 @@ func readClocks(device *NvidiaCollectorDevice, output chan lp.CCMessage) error {
if !device.excludeMetrics["nv_video_clock"] {
memClock, ret := nvml.DeviceGetClockInfo(device.device, nvml.CLOCK_VIDEO)
if ret == nvml.SUCCESS {
y, err := lp.NewMetric("nv_video_clock", device.tags, device.meta, float64(memClock), time.Now())
y, err := lp.NewMessage("nv_video_clock", device.tags, device.meta, map[string]interface{}{"value": float64(memClock)}, time.Now())
if err == nil {
y.AddMeta("unit", "MHz")
output <- y
@@ -635,7 +645,7 @@ func readEccErrors(device *NvidiaCollectorDevice, output chan lp.CCMessage) erro
// i.e. the total set of errors across the entire device.
ecc_db, ret := nvml.DeviceGetTotalEccErrors(device.device, nvml.MEMORY_ERROR_TYPE_UNCORRECTED, nvml.AGGREGATE_ECC)
if ret == nvml.SUCCESS {
y, err := lp.NewMetric("nv_ecc_uncorrected_error", device.tags, device.meta, float64(ecc_db), time.Now())
y, err := lp.NewMessage("nv_ecc_uncorrected_error", device.tags, device.meta, map[string]interface{}{"value": float64(ecc_db)}, time.Now())
if err == nil {
output <- y
}
@@ -644,7 +654,7 @@ func readEccErrors(device *NvidiaCollectorDevice, output chan lp.CCMessage) erro
if !device.excludeMetrics["nv_ecc_corrected_error"] {
ecc_sb, ret := nvml.DeviceGetTotalEccErrors(device.device, nvml.MEMORY_ERROR_TYPE_CORRECTED, nvml.AGGREGATE_ECC)
if ret == nvml.SUCCESS {
y, err := lp.NewMetric("nv_ecc_corrected_error", device.tags, device.meta, float64(ecc_sb), time.Now())
y, err := lp.NewMessage("nv_ecc_corrected_error", device.tags, device.meta, map[string]interface{}{"value": float64(ecc_sb)}, time.Now())
if err == nil {
output <- y
}
@@ -663,7 +673,7 @@ func readPowerLimit(device *NvidiaCollectorDevice, output chan lp.CCMessage) err
// If the card's total power draw reaches this limit the power management algorithm kicks in.
pwr_limit, ret := nvml.DeviceGetPowerManagementLimit(device.device)
if ret == nvml.SUCCESS {
y, err := lp.NewMetric("nv_power_max_limit", device.tags, device.meta, float64(pwr_limit)/1000, time.Now())
y, err := lp.NewMessage("nv_power_max_limit", device.tags, device.meta, map[string]interface{}{"value": float64(pwr_limit) / 1000}, time.Now())
if err == nil {
y.AddMeta("unit", "watts")
output <- y
@@ -690,7 +700,7 @@ func readEncUtilization(device *NvidiaCollectorDevice, output chan lp.CCMessage)
// Note: On MIG-enabled GPUs, querying encoder utilization is not currently supported.
enc_util, _, ret := nvml.DeviceGetEncoderUtilization(device.device)
if ret == nvml.SUCCESS {
y, err := lp.NewMetric("nv_encoder_util", device.tags, device.meta, float64(enc_util), time.Now())
y, err := lp.NewMessage("nv_encoder_util", device.tags, device.meta, map[string]interface{}{"value": float64(enc_util)}, time.Now())
if err == nil {
y.AddMeta("unit", "%")
output <- y
@@ -717,7 +727,7 @@ func readDecUtilization(device *NvidiaCollectorDevice, output chan lp.CCMessage)
// Note: On MIG-enabled GPUs, querying encoder utilization is not currently supported.
dec_util, _, ret := nvml.DeviceGetDecoderUtilization(device.device)
if ret == nvml.SUCCESS {
y, err := lp.NewMetric("nv_decoder_util", device.tags, device.meta, float64(dec_util), time.Now())
y, err := lp.NewMessage("nv_decoder_util", device.tags, device.meta, map[string]interface{}{"value": float64(dec_util)}, time.Now())
if err == nil {
y.AddMeta("unit", "%")
output <- y
@@ -744,33 +754,33 @@ func readRemappedRows(device *NvidiaCollectorDevice, output chan lp.CCMessage) e
corrected, uncorrected, pending, failure, ret := nvml.DeviceGetRemappedRows(device.device)
if ret == nvml.SUCCESS {
if !device.excludeMetrics["nv_remapped_rows_corrected"] {
y, err := lp.NewMetric("nv_remapped_rows_corrected", device.tags, device.meta, float64(corrected), time.Now())
y, err := lp.NewMessage("nv_remapped_rows_corrected", device.tags, device.meta, map[string]interface{}{"value": float64(corrected)}, time.Now())
if err == nil {
output <- y
}
}
if !device.excludeMetrics["nv_remapped_rows_uncorrected"] {
y, err := lp.NewMetric("nv_remapped_rows_corrected", device.tags, device.meta, float64(uncorrected), time.Now())
y, err := lp.NewMessage("nv_remapped_rows_corrected", device.tags, device.meta, map[string]interface{}{"value": float64(uncorrected)}, time.Now())
if err == nil {
output <- y
}
}
if !device.excludeMetrics["nv_remapped_rows_pending"] {
p := 0
var p int = 0
if pending {
p = 1
}
y, err := lp.NewMetric("nv_remapped_rows_pending", device.tags, device.meta, p, time.Now())
y, err := lp.NewMessage("nv_remapped_rows_pending", device.tags, device.meta, map[string]interface{}{"value": p}, time.Now())
if err == nil {
output <- y
}
}
if !device.excludeMetrics["nv_remapped_rows_failure"] {
f := 0
var f int = 0
if failure {
f = 1
}
y, err := lp.NewMetric("nv_remapped_rows_failure", device.tags, device.meta, f, time.Now())
y, err := lp.NewMessage("nv_remapped_rows_failure", device.tags, device.meta, map[string]interface{}{"value": f}, time.Now())
if err == nil {
output <- y
}
@@ -804,7 +814,7 @@ func readProcessCounts(device *NvidiaCollectorDevice, output chan lp.CCMessage)
// Querying per-instance information using MIG device handles is not supported if the device is in vGPU Host virtualization mode.
procList, ret := nvml.DeviceGetComputeRunningProcesses(device.device)
if ret == nvml.SUCCESS {
y, err := lp.NewMetric("nv_compute_processes", device.tags, device.meta, len(procList), time.Now())
y, err := lp.NewMessage("nv_compute_processes", device.tags, device.meta, map[string]interface{}{"value": len(procList)}, time.Now())
if err == nil {
output <- y
}
@@ -833,7 +843,7 @@ func readProcessCounts(device *NvidiaCollectorDevice, output chan lp.CCMessage)
// Querying per-instance information using MIG device handles is not supported if the device is in vGPU Host virtualization mode.
procList, ret := nvml.DeviceGetGraphicsRunningProcesses(device.device)
if ret == nvml.SUCCESS {
y, err := lp.NewMetric("nv_graphics_processes", device.tags, device.meta, len(procList), time.Now())
y, err := lp.NewMessage("nv_graphics_processes", device.tags, device.meta, map[string]interface{}{"value": len(procList)}, time.Now())
if err == nil {
output <- y
}
@@ -863,7 +873,7 @@ func readProcessCounts(device *NvidiaCollectorDevice, output chan lp.CCMessage)
// // Querying per-instance information using MIG device handles is not supported if the device is in vGPU Host virtualization mode.
// procList, ret := nvml.DeviceGetMPSComputeRunningProcesses(device.device)
// if ret == nvml.SUCCESS {
// y, err := lp.NewMetric("nv_mps_compute_processes", device.tags, device.meta, len(procList), time.Now())
// y, err := lp.NewMessage("nv_mps_compute_processes", device.tags, device.meta, map[string]interface{}{"value": len(procList)}, time.Now())
// if err == nil {
// output <- y
// }
@@ -891,7 +901,7 @@ func readViolationStats(device *NvidiaCollectorDevice, output chan lp.CCMessage)
violTime, ret = nvml.DeviceGetViolationStatus(device.device, nvml.PERF_POLICY_POWER)
if ret == nvml.SUCCESS {
t := float64(violTime.ViolationTime) * 1e-9
y, err := lp.NewMetric("nv_violation_power", device.tags, device.meta, t, time.Now())
y, err := lp.NewMessage("nv_violation_power", device.tags, device.meta, map[string]interface{}{"value": t}, time.Now())
if err == nil {
y.AddMeta("unit", "sec")
output <- y
@@ -903,7 +913,7 @@ func readViolationStats(device *NvidiaCollectorDevice, output chan lp.CCMessage)
violTime, ret = nvml.DeviceGetViolationStatus(device.device, nvml.PERF_POLICY_THERMAL)
if ret == nvml.SUCCESS {
t := float64(violTime.ViolationTime) * 1e-9
y, err := lp.NewMetric("nv_violation_thermal", device.tags, device.meta, t, time.Now())
y, err := lp.NewMessage("nv_violation_thermal", device.tags, device.meta, map[string]interface{}{"value": t}, time.Now())
if err == nil {
y.AddMeta("unit", "sec")
output <- y
@@ -915,7 +925,7 @@ func readViolationStats(device *NvidiaCollectorDevice, output chan lp.CCMessage)
violTime, ret = nvml.DeviceGetViolationStatus(device.device, nvml.PERF_POLICY_SYNC_BOOST)
if ret == nvml.SUCCESS {
t := float64(violTime.ViolationTime) * 1e-9
y, err := lp.NewMetric("nv_violation_sync_boost", device.tags, device.meta, t, time.Now())
y, err := lp.NewMessage("nv_violation_sync_boost", device.tags, device.meta, map[string]interface{}{"value": t}, time.Now())
if err == nil {
y.AddMeta("unit", "sec")
output <- y
@@ -927,7 +937,7 @@ func readViolationStats(device *NvidiaCollectorDevice, output chan lp.CCMessage)
violTime, ret = nvml.DeviceGetViolationStatus(device.device, nvml.PERF_POLICY_BOARD_LIMIT)
if ret == nvml.SUCCESS {
t := float64(violTime.ViolationTime) * 1e-9
y, err := lp.NewMetric("nv_violation_board_limit", device.tags, device.meta, t, time.Now())
y, err := lp.NewMessage("nv_violation_board_limit", device.tags, device.meta, map[string]interface{}{"value": t}, time.Now())
if err == nil {
y.AddMeta("unit", "sec")
output <- y
@@ -939,7 +949,7 @@ func readViolationStats(device *NvidiaCollectorDevice, output chan lp.CCMessage)
violTime, ret = nvml.DeviceGetViolationStatus(device.device, nvml.PERF_POLICY_LOW_UTILIZATION)
if ret == nvml.SUCCESS {
t := float64(violTime.ViolationTime) * 1e-9
y, err := lp.NewMetric("nv_violation_low_util", device.tags, device.meta, t, time.Now())
y, err := lp.NewMessage("nv_violation_low_util", device.tags, device.meta, map[string]interface{}{"value": t}, time.Now())
if err == nil {
y.AddMeta("unit", "sec")
output <- y
@@ -951,7 +961,7 @@ func readViolationStats(device *NvidiaCollectorDevice, output chan lp.CCMessage)
violTime, ret = nvml.DeviceGetViolationStatus(device.device, nvml.PERF_POLICY_RELIABILITY)
if ret == nvml.SUCCESS {
t := float64(violTime.ViolationTime) * 1e-9
y, err := lp.NewMetric("nv_violation_reliability", device.tags, device.meta, t, time.Now())
y, err := lp.NewMessage("nv_violation_reliability", device.tags, device.meta, map[string]interface{}{"value": t}, time.Now())
if err == nil {
y.AddMeta("unit", "sec")
output <- y
@@ -963,7 +973,7 @@ func readViolationStats(device *NvidiaCollectorDevice, output chan lp.CCMessage)
violTime, ret = nvml.DeviceGetViolationStatus(device.device, nvml.PERF_POLICY_TOTAL_APP_CLOCKS)
if ret == nvml.SUCCESS {
t := float64(violTime.ViolationTime) * 1e-9
y, err := lp.NewMetric("nv_violation_below_app_clock", device.tags, device.meta, t, time.Now())
y, err := lp.NewMessage("nv_violation_below_app_clock", device.tags, device.meta, map[string]interface{}{"value": t}, time.Now())
if err == nil {
y.AddMeta("unit", "sec")
output <- y
@@ -975,7 +985,7 @@ func readViolationStats(device *NvidiaCollectorDevice, output chan lp.CCMessage)
violTime, ret = nvml.DeviceGetViolationStatus(device.device, nvml.PERF_POLICY_TOTAL_BASE_CLOCKS)
if ret == nvml.SUCCESS {
t := float64(violTime.ViolationTime) * 1e-9
y, err := lp.NewMetric("nv_violation_below_base_clock", device.tags, device.meta, t, time.Now())
y, err := lp.NewMessage("nv_violation_below_base_clock", device.tags, device.meta, map[string]interface{}{"value": t}, time.Now())
if err == nil {
y.AddMeta("unit", "sec")
output <- y
@@ -998,19 +1008,19 @@ func readNVLinkStats(device *NvidiaCollectorDevice, output chan lp.CCMessage) er
var aggregate_recovery_errors uint64 = 0
var aggregate_crc_flit_errors uint64 = 0
for i := range nvml.NVLINK_MAX_LINKS {
for i := 0; i < nvml.NVLINK_MAX_LINKS; i++ {
state, ret := nvml.DeviceGetNvLinkState(device.device, i)
if ret == nvml.SUCCESS {
if state == nvml.FEATURE_ENABLED {
if !device.excludeMetrics["nv_nvlink_crc_errors"] {
// Data link receive data CRC error counter
count, ret := nvml.DeviceGetNvLinkErrorCounter(device.device, i, nvml.NVLINK_ERROR_DL_CRC_DATA)
aggregate_crc_errors += count
aggregate_crc_errors = aggregate_crc_errors + count
if ret == nvml.SUCCESS {
y, err := lp.NewMetric("nv_nvlink_crc_errors", device.tags, device.meta, count, time.Now())
y, err := lp.NewMessage("nv_nvlink_crc_errors", device.tags, device.meta, map[string]interface{}{"value": count}, time.Now())
if err == nil {
y.AddTag("stype", "nvlink")
y.AddTag("stype-id", strconv.Itoa(i))
y.AddTag("stype-id", fmt.Sprintf("%d", i))
output <- y
}
}
@@ -1018,12 +1028,12 @@ func readNVLinkStats(device *NvidiaCollectorDevice, output chan lp.CCMessage) er
if !device.excludeMetrics["nv_nvlink_ecc_errors"] {
// Data link receive data ECC error counter
count, ret := nvml.DeviceGetNvLinkErrorCounter(device.device, i, nvml.NVLINK_ERROR_DL_ECC_DATA)
aggregate_ecc_errors += count
aggregate_ecc_errors = aggregate_ecc_errors + count
if ret == nvml.SUCCESS {
y, err := lp.NewMetric("nv_nvlink_ecc_errors", device.tags, device.meta, count, time.Now())
y, err := lp.NewMessage("nv_nvlink_ecc_errors", device.tags, device.meta, map[string]interface{}{"value": count}, time.Now())
if err == nil {
y.AddTag("stype", "nvlink")
y.AddTag("stype-id", strconv.Itoa(i))
y.AddTag("stype-id", fmt.Sprintf("%d", i))
output <- y
}
}
@@ -1031,12 +1041,12 @@ func readNVLinkStats(device *NvidiaCollectorDevice, output chan lp.CCMessage) er
if !device.excludeMetrics["nv_nvlink_replay_errors"] {
// Data link transmit replay error counter
count, ret := nvml.DeviceGetNvLinkErrorCounter(device.device, i, nvml.NVLINK_ERROR_DL_REPLAY)
aggregate_replay_errors += count
aggregate_replay_errors = aggregate_replay_errors + count
if ret == nvml.SUCCESS {
y, err := lp.NewMetric("nv_nvlink_replay_errors", device.tags, device.meta, count, time.Now())
y, err := lp.NewMessage("nv_nvlink_replay_errors", device.tags, device.meta, map[string]interface{}{"value": count}, time.Now())
if err == nil {
y.AddTag("stype", "nvlink")
y.AddTag("stype-id", strconv.Itoa(i))
y.AddTag("stype-id", fmt.Sprintf("%d", i))
output <- y
}
}
@@ -1044,12 +1054,12 @@ func readNVLinkStats(device *NvidiaCollectorDevice, output chan lp.CCMessage) er
if !device.excludeMetrics["nv_nvlink_recovery_errors"] {
// Data link transmit recovery error counter
count, ret := nvml.DeviceGetNvLinkErrorCounter(device.device, i, nvml.NVLINK_ERROR_DL_RECOVERY)
aggregate_recovery_errors += count
aggregate_recovery_errors = aggregate_recovery_errors + count
if ret == nvml.SUCCESS {
y, err := lp.NewMetric("nv_nvlink_recovery_errors", device.tags, device.meta, count, time.Now())
y, err := lp.NewMessage("nv_nvlink_recovery_errors", device.tags, device.meta, map[string]interface{}{"value": count}, time.Now())
if err == nil {
y.AddTag("stype", "nvlink")
y.AddTag("stype-id", strconv.Itoa(i))
y.AddTag("stype-id", fmt.Sprintf("%d", i))
output <- y
}
}
@@ -1057,12 +1067,12 @@ func readNVLinkStats(device *NvidiaCollectorDevice, output chan lp.CCMessage) er
if !device.excludeMetrics["nv_nvlink_crc_flit_errors"] {
// Data link receive flow control digit CRC error counter
count, ret := nvml.DeviceGetNvLinkErrorCounter(device.device, i, nvml.NVLINK_ERROR_DL_CRC_FLIT)
aggregate_crc_flit_errors += count
aggregate_crc_flit_errors = aggregate_crc_flit_errors + count
if ret == nvml.SUCCESS {
y, err := lp.NewMetric("nv_nvlink_crc_flit_errors", device.tags, device.meta, count, time.Now())
y, err := lp.NewMessage("nv_nvlink_crc_flit_errors", device.tags, device.meta, map[string]interface{}{"value": count}, time.Now())
if err == nil {
y.AddTag("stype", "nvlink")
y.AddTag("stype-id", strconv.Itoa(i))
y.AddTag("stype-id", fmt.Sprintf("%d", i))
output <- y
}
}
@@ -1074,7 +1084,7 @@ func readNVLinkStats(device *NvidiaCollectorDevice, output chan lp.CCMessage) er
// Export aggegated values
if !device.excludeMetrics["nv_nvlink_crc_errors"] {
// Data link receive data CRC error counter
y, err := lp.NewMetric("nv_nvlink_crc_errors_sum", device.tags, device.meta, aggregate_crc_errors, time.Now())
y, err := lp.NewMessage("nv_nvlink_crc_errors_sum", device.tags, device.meta, map[string]interface{}{"value": aggregate_crc_errors}, time.Now())
if err == nil {
y.AddTag("stype", "nvlink")
output <- y
@@ -1082,7 +1092,7 @@ func readNVLinkStats(device *NvidiaCollectorDevice, output chan lp.CCMessage) er
}
if !device.excludeMetrics["nv_nvlink_ecc_errors"] {
// Data link receive data ECC error counter
y, err := lp.NewMetric("nv_nvlink_ecc_errors_sum", device.tags, device.meta, aggregate_ecc_errors, time.Now())
y, err := lp.NewMessage("nv_nvlink_ecc_errors_sum", device.tags, device.meta, map[string]interface{}{"value": aggregate_ecc_errors}, time.Now())
if err == nil {
y.AddTag("stype", "nvlink")
output <- y
@@ -1090,7 +1100,7 @@ func readNVLinkStats(device *NvidiaCollectorDevice, output chan lp.CCMessage) er
}
if !device.excludeMetrics["nv_nvlink_replay_errors"] {
// Data link transmit replay error counter
y, err := lp.NewMetric("nv_nvlink_replay_errors_sum", device.tags, device.meta, aggregate_replay_errors, time.Now())
y, err := lp.NewMessage("nv_nvlink_replay_errors_sum", device.tags, device.meta, map[string]interface{}{"value": aggregate_replay_errors}, time.Now())
if err == nil {
y.AddTag("stype", "nvlink")
output <- y
@@ -1098,7 +1108,7 @@ func readNVLinkStats(device *NvidiaCollectorDevice, output chan lp.CCMessage) er
}
if !device.excludeMetrics["nv_nvlink_recovery_errors"] {
// Data link transmit recovery error counter
y, err := lp.NewMetric("nv_nvlink_recovery_errors_sum", device.tags, device.meta, aggregate_recovery_errors, time.Now())
y, err := lp.NewMessage("nv_nvlink_recovery_errors_sum", device.tags, device.meta, map[string]interface{}{"value": aggregate_recovery_errors}, time.Now())
if err == nil {
y.AddTag("stype", "nvlink")
output <- y
@@ -1106,7 +1116,7 @@ func readNVLinkStats(device *NvidiaCollectorDevice, output chan lp.CCMessage) er
}
if !device.excludeMetrics["nv_nvlink_crc_flit_errors"] {
// Data link receive flow control digit CRC error counter
y, err := lp.NewMetric("nv_nvlink_crc_flit_errors_sum", device.tags, device.meta, aggregate_crc_flit_errors, time.Now())
y, err := lp.NewMessage("nv_nvlink_crc_flit_errors_sum", device.tags, device.meta, map[string]interface{}{"value": aggregate_crc_flit_errors}, time.Now())
if err == nil {
y.AddTag("stype", "nvlink")
output <- y
@@ -1223,7 +1233,7 @@ func (m *NvidiaCollector) Read(interval time.Duration, output chan lp.CCMessage)
}
// Actual read loop over all attached Nvidia GPUs
for i := range m.num_gpus {
for i := 0; i < m.num_gpus; i++ {
readAll(&m.gpus[i], output)
@@ -1246,7 +1256,7 @@ func (m *NvidiaCollector) Read(interval time.Duration, output chan lp.CCMessage)
}
cclog.ComponentDebug(m.name, "Reading MIG devices for GPU", i)
for j := range maxMig {
for j := 0; j < maxMig; j++ {
mdev, ret := nvml.DeviceGetMigDeviceHandleByIndex(m.gpus[i].device, j)
if ret != nvml.SUCCESS {
continue
@@ -1263,7 +1273,9 @@ func (m *NvidiaCollector) Read(interval time.Duration, output chan lp.CCMessage)
meta: map[string]string{},
excludeMetrics: excludeMetrics,
}
maps.Copy(migDevice.tags, m.gpus[i].tags)
for k, v := range m.gpus[i].tags {
migDevice.tags[k] = v
}
migDevice.tags["stype"] = "mig"
if m.config.UseUuidForMigDevices {
uuid, ret := nvml.DeviceGetUUID(mdev)
@@ -1277,17 +1289,19 @@ func (m *NvidiaCollector) Read(interval time.Duration, output chan lp.CCMessage)
if ret == nvml.SUCCESS {
mname, ret := nvml.DeviceGetName(mdev)
if ret == nvml.SUCCESS {
x := strings.ReplaceAll(mname, name, "")
x = strings.ReplaceAll(x, "MIG", "")
x := strings.Replace(mname, name, "", -1)
x = strings.Replace(x, "MIG", "", -1)
x = strings.TrimSpace(x)
migDevice.tags["stype-id"] = x
}
}
}
if _, ok := migDevice.tags["stype-id"]; !ok {
migDevice.tags["stype-id"] = strconv.Itoa(j)
migDevice.tags["stype-id"] = fmt.Sprintf("%d", j)
}
for k, v := range m.gpus[i].meta {
migDevice.meta[k] = v
}
maps.Copy(migDevice.meta, m.gpus[i].meta)
if _, ok := migDevice.meta["uuid"]; ok && !m.config.UseUuidForMigDevices {
uuid, ret := nvml.DeviceGetUUID(mdev)
if ret == nvml.SUCCESS {
@@ -1303,9 +1317,7 @@ func (m *NvidiaCollector) Read(interval time.Duration, output chan lp.CCMessage)
func (m *NvidiaCollector) Close() {
if m.init {
if ret := nvml.Shutdown(); ret != nvml.SUCCESS {
cclog.ComponentError(m.name, "nvml.Shutdown() not successful")
}
nvml.Shutdown()
m.init = false
}
}

View File

@@ -8,7 +8,6 @@
package collectors
import (
"bytes"
"encoding/json"
"fmt"
"os"
@@ -17,8 +16,8 @@ import (
"strings"
"time"
cclog "github.com/ClusterCockpit/cc-lib/v2/ccLogger"
lp "github.com/ClusterCockpit/cc-lib/v2/ccMessage"
cclog "github.com/ClusterCockpit/cc-lib/ccLogger"
lp "github.com/ClusterCockpit/cc-lib/ccMessage"
)
// running average power limit (RAPL) monitoring attributes for a zone
@@ -35,7 +34,6 @@ type RAPLZoneInfo struct {
type RAPLCollector struct {
metricCollector
config struct {
// Exclude IDs for RAPL zones, e.g.
// * 0 for zone 0
@@ -50,15 +48,15 @@ type RAPLCollector struct {
// Init initializes the running average power limit (RAPL) collector
func (m *RAPLCollector) Init(config json.RawMessage) error {
// Check if already initialized
if m.init {
return nil
}
var err error = nil
m.name = "RAPLCollector"
if err := m.setup(); err != nil {
return fmt.Errorf("%s Init(): setup() call failed: %w", m.name, err)
}
m.setup()
m.parallel = true
m.meta = map[string]string{
"source": m.name,
@@ -68,10 +66,10 @@ func (m *RAPLCollector) Init(config json.RawMessage) error {
// Read in the JSON configuration
if len(config) > 0 {
d := json.NewDecoder(bytes.NewReader(config))
d.DisallowUnknownFields()
if err := d.Decode(&m.config); err != nil {
return fmt.Errorf("%s Init(): Error decoding JSON config: %w", m.name, err)
err = json.Unmarshal(config, &m.config)
if err != nil {
cclog.ComponentError(m.name, "Error reading config:", err.Error())
return err
}
}
@@ -91,19 +89,18 @@ func (m *RAPLCollector) Init(config json.RawMessage) error {
// readZoneInfo reads RAPL monitoring attributes for a zone given by zonePath
// See: https://www.kernel.org/doc/html/latest/power/powercap/powercap.html#monitoring-attributes
readZoneInfo := func(zonePath string) (
z struct {
readZoneInfo := func(zonePath string) (z struct {
name string // zones name e.g. psys, dram, core, uncore, package-0
energyFilepath string // path to a file containing the zones current energy counter in micro joules
energy int64 // current reading of the energy counter in micro joules
energyTimestamp time.Time // timestamp when energy counter was read
maxEnergyRange int64 // Range of the above energy counter in micro-joules
ok bool // Are all information available?
},
) {
}) {
// zones name e.g. psys, dram, core, uncore, package-0
foundName := false
if v, err := os.ReadFile(
if v, err :=
os.ReadFile(
filepath.Join(zonePath, "name")); err == nil {
foundName = true
z.name = strings.TrimSpace(string(v))
@@ -125,7 +122,8 @@ func (m *RAPLCollector) Init(config json.RawMessage) error {
// Range of the above energy counter in micro-joules
foundMaxEnergyRange := false
if v, err := os.ReadFile(
if v, err :=
os.ReadFile(
filepath.Join(zonePath, "max_energy_range_uj")); err == nil {
if i, err := strconv.ParseInt(strings.TrimSpace(string(v)), 10, 64); err == nil {
foundMaxEnergyRange = true
@@ -158,7 +156,8 @@ func (m *RAPLCollector) Init(config json.RawMessage) error {
!isNameExcluded[z.name] {
// Add RAPL monitoring attributes for a zone
m.RAPLZoneInfo = append(
m.RAPLZoneInfo =
append(
m.RAPLZoneInfo,
RAPLZoneInfo{
tags: map[string]string{
@@ -186,7 +185,8 @@ func (m *RAPLCollector) Init(config json.RawMessage) error {
sz.ok &&
!isIDExcluded[zoneID+":"+subZoneID] &&
!isNameExcluded[sz.name] {
m.RAPLZoneInfo = append(
m.RAPLZoneInfo =
append(
m.RAPLZoneInfo,
RAPLZoneInfo{
tags: map[string]string{
@@ -205,6 +205,7 @@ func (m *RAPLCollector) Init(config json.RawMessage) error {
if m.RAPLZoneInfo == nil {
return fmt.Errorf("no running average power limit (RAPL) device found in %s", controlTypePath)
}
// Initialized
@@ -221,6 +222,7 @@ func (m *RAPLCollector) Init(config json.RawMessage) error {
// Read reads running average power limit (RAPL) monitoring attributes for all initialized zones
// See: https://www.kernel.org/doc/html/latest/power/powercap/powercap.html#monitoring-attributes
func (m *RAPLCollector) Read(interval time.Duration, output chan lp.CCMessage) {
for i := range m.RAPLZoneInfo {
p := &m.RAPLZoneInfo[i]
@@ -246,7 +248,7 @@ func (m *RAPLCollector) Read(interval time.Duration, output chan lp.CCMessage) {
"rapl_average_power",
p.tags,
m.meta,
map[string]any{"value": averagePower},
map[string]interface{}{"value": averagePower},
energyTimestamp)
if err == nil {
output <- y

View File

@@ -8,15 +8,13 @@
package collectors
import (
"bytes"
"encoding/json"
"errors"
"fmt"
"slices"
"strconv"
"time"
cclog "github.com/ClusterCockpit/cc-lib/v2/ccLogger"
lp "github.com/ClusterCockpit/cc-lib/v2/ccMessage"
cclog "github.com/ClusterCockpit/cc-lib/ccLogger"
lp "github.com/ClusterCockpit/cc-lib/ccMessage"
"github.com/ClusterCockpit/go-rocm-smi/pkg/rocm_smi"
)
@@ -38,7 +36,6 @@ type RocmSmiCollectorDevice struct {
type RocmSmiCollector struct {
metricCollector
config RocmSmiCollectorConfig // the configuration structure
devices []RocmSmiCollectorDevice
}
@@ -51,46 +48,73 @@ type RocmSmiCollector struct {
// Called once by the collector manager
// All tags, meta data tags and metrics that do not change over the runtime should be set here
func (m *RocmSmiCollector) Init(config json.RawMessage) error {
var err error = nil
// Always set the name early in Init() to use it in cclog.Component* functions
m.name = "RocmSmiCollector"
// This is for later use, also call it early
if err := m.setup(); err != nil {
return fmt.Errorf("%s Init(): setup() call failed: %w", m.name, err)
}
m.setup()
// Define meta information sent with each metric
// (Can also be dynamic or this is the basic set with extension through AddMeta())
//m.meta = map[string]string{"source": m.name, "group": "AMD"}
// Define tags sent with each metric
// The 'type' tag is always needed, it defines the granulatity of the metric
// node -> whole system
// socket -> CPU socket (requires socket ID as 'type-id' tag)
// cpu -> single CPU hardware thread (requires cpu ID as 'type-id' tag)
//m.tags = map[string]string{"type": "node"}
// Read in the JSON configuration
if len(config) > 0 {
d := json.NewDecoder(bytes.NewReader(config))
d.DisallowUnknownFields()
if err := d.Decode(&m.config); err != nil {
return fmt.Errorf("%s Init(): Error decoding JSON config: %w", m.name, err)
err = json.Unmarshal(config, &m.config)
if err != nil {
cclog.ComponentError(m.name, "Error reading config:", err.Error())
return err
}
}
ret := rocm_smi.Init()
if ret != rocm_smi.STATUS_SUCCESS {
return fmt.Errorf("%s Init(): failed to initialize ROCm SMI library", m.name)
err = errors.New("failed to initialize ROCm SMI library")
cclog.ComponentError(m.name, err.Error())
return err
}
numDevs, ret := rocm_smi.NumMonitorDevices()
if ret != rocm_smi.STATUS_SUCCESS {
return fmt.Errorf("%s Init(): failed to get number of GPUs from ROCm SMI library", m.name)
err = errors.New("failed to get number of GPUs from ROCm SMI library")
cclog.ComponentError(m.name, err.Error())
return err
}
exclDev := func(s string) bool {
skip_device := false
for _, excl := range m.config.ExcludeDevices {
if excl == s {
skip_device = true
break
}
}
return skip_device
}
m.devices = make([]RocmSmiCollectorDevice, 0)
for i := range numDevs {
str_i := strconv.Itoa(i)
if slices.Contains(m.config.ExcludeDevices, str_i) {
for i := 0; i < numDevs; i++ {
str_i := fmt.Sprintf("%d", i)
if exclDev(str_i) {
continue
}
device, ret := rocm_smi.DeviceGetHandleByIndex(i)
if ret != rocm_smi.STATUS_SUCCESS {
return fmt.Errorf("%s Init(): failed to get get handle for GPU %d", m.name, i)
err = fmt.Errorf("failed to get handle for GPU %d", i)
cclog.ComponentError(m.name, err.Error())
return err
}
pciInfo, ret := rocm_smi.DeviceGetPciInfo(device)
if ret != rocm_smi.STATUS_SUCCESS {
return fmt.Errorf("%s Init(): failed to get PCI information for GPU %d", m.name, i)
err = fmt.Errorf("failed to get PCI information for GPU %d", i)
cclog.ComponentError(m.name, err.Error())
return err
}
pciId := fmt.Sprintf(
@@ -100,7 +124,7 @@ func (m *RocmSmiCollector) Init(config json.RawMessage) error {
pciInfo.Device,
pciInfo.Function)
if slices.Contains(m.config.ExcludeDevices, pciId) {
if exclDev(pciId) {
continue
}
@@ -140,7 +164,7 @@ func (m *RocmSmiCollector) Init(config json.RawMessage) error {
// Set this flag only if everything is initialized properly, all required files exist, ...
m.init = true
return nil
return err
}
// Read collects all metrics belonging to the sample collector
@@ -158,135 +182,136 @@ func (m *RocmSmiCollector) Read(interval time.Duration, output chan lp.CCMessage
if !dev.excludeMetrics["rocm_gfx_util"] {
value := metrics.Average_gfx_activity
y, err := lp.NewMessage("rocm_gfx_util", dev.tags, dev.meta, map[string]any{"value": value}, timestamp)
y, err := lp.NewMessage("rocm_gfx_util", dev.tags, dev.meta, map[string]interface{}{"value": value}, timestamp)
if err == nil {
output <- y
}
}
if !dev.excludeMetrics["rocm_umc_util"] {
value := metrics.Average_umc_activity
y, err := lp.NewMessage("rocm_umc_util", dev.tags, dev.meta, map[string]any{"value": value}, timestamp)
y, err := lp.NewMessage("rocm_umc_util", dev.tags, dev.meta, map[string]interface{}{"value": value}, timestamp)
if err == nil {
output <- y
}
}
if !dev.excludeMetrics["rocm_mm_util"] {
value := metrics.Average_mm_activity
y, err := lp.NewMessage("rocm_mm_util", dev.tags, dev.meta, map[string]any{"value": value}, timestamp)
y, err := lp.NewMessage("rocm_mm_util", dev.tags, dev.meta, map[string]interface{}{"value": value}, timestamp)
if err == nil {
output <- y
}
}
if !dev.excludeMetrics["rocm_avg_power"] {
value := metrics.Average_socket_power
y, err := lp.NewMessage("rocm_avg_power", dev.tags, dev.meta, map[string]any{"value": value}, timestamp)
y, err := lp.NewMessage("rocm_avg_power", dev.tags, dev.meta, map[string]interface{}{"value": value}, timestamp)
if err == nil {
output <- y
}
}
if !dev.excludeMetrics["rocm_temp_mem"] {
value := metrics.Temperature_mem
y, err := lp.NewMessage("rocm_temp_mem", dev.tags, dev.meta, map[string]any{"value": value}, timestamp)
y, err := lp.NewMessage("rocm_temp_mem", dev.tags, dev.meta, map[string]interface{}{"value": value}, timestamp)
if err == nil {
output <- y
}
}
if !dev.excludeMetrics["rocm_temp_hotspot"] {
value := metrics.Temperature_hotspot
y, err := lp.NewMessage("rocm_temp_hotspot", dev.tags, dev.meta, map[string]any{"value": value}, timestamp)
y, err := lp.NewMessage("rocm_temp_hotspot", dev.tags, dev.meta, map[string]interface{}{"value": value}, timestamp)
if err == nil {
output <- y
}
}
if !dev.excludeMetrics["rocm_temp_edge"] {
value := metrics.Temperature_edge
y, err := lp.NewMessage("rocm_temp_edge", dev.tags, dev.meta, map[string]any{"value": value}, timestamp)
y, err := lp.NewMessage("rocm_temp_edge", dev.tags, dev.meta, map[string]interface{}{"value": value}, timestamp)
if err == nil {
output <- y
}
}
if !dev.excludeMetrics["rocm_temp_vrgfx"] {
value := metrics.Temperature_vrgfx
y, err := lp.NewMessage("rocm_temp_vrgfx", dev.tags, dev.meta, map[string]any{"value": value}, timestamp)
y, err := lp.NewMessage("rocm_temp_vrgfx", dev.tags, dev.meta, map[string]interface{}{"value": value}, timestamp)
if err == nil {
output <- y
}
}
if !dev.excludeMetrics["rocm_temp_vrsoc"] {
value := metrics.Temperature_vrsoc
y, err := lp.NewMessage("rocm_temp_vrsoc", dev.tags, dev.meta, map[string]any{"value": value}, timestamp)
y, err := lp.NewMessage("rocm_temp_vrsoc", dev.tags, dev.meta, map[string]interface{}{"value": value}, timestamp)
if err == nil {
output <- y
}
}
if !dev.excludeMetrics["rocm_temp_vrmem"] {
value := metrics.Temperature_vrmem
y, err := lp.NewMessage("rocm_temp_vrmem", dev.tags, dev.meta, map[string]any{"value": value}, timestamp)
y, err := lp.NewMessage("rocm_temp_vrmem", dev.tags, dev.meta, map[string]interface{}{"value": value}, timestamp)
if err == nil {
output <- y
}
}
if !dev.excludeMetrics["rocm_gfx_clock"] {
value := metrics.Average_gfxclk_frequency
y, err := lp.NewMessage("rocm_gfx_clock", dev.tags, dev.meta, map[string]any{"value": value}, timestamp)
y, err := lp.NewMessage("rocm_gfx_clock", dev.tags, dev.meta, map[string]interface{}{"value": value}, timestamp)
if err == nil {
output <- y
}
}
if !dev.excludeMetrics["rocm_soc_clock"] {
value := metrics.Average_socclk_frequency
y, err := lp.NewMessage("rocm_soc_clock", dev.tags, dev.meta, map[string]any{"value": value}, timestamp)
y, err := lp.NewMessage("rocm_soc_clock", dev.tags, dev.meta, map[string]interface{}{"value": value}, timestamp)
if err == nil {
output <- y
}
}
if !dev.excludeMetrics["rocm_u_clock"] {
value := metrics.Average_uclk_frequency
y, err := lp.NewMessage("rocm_u_clock", dev.tags, dev.meta, map[string]any{"value": value}, timestamp)
y, err := lp.NewMessage("rocm_u_clock", dev.tags, dev.meta, map[string]interface{}{"value": value}, timestamp)
if err == nil {
output <- y
}
}
if !dev.excludeMetrics["rocm_v0_clock"] {
value := metrics.Average_vclk0_frequency
y, err := lp.NewMessage("rocm_v0_clock", dev.tags, dev.meta, map[string]any{"value": value}, timestamp)
y, err := lp.NewMessage("rocm_v0_clock", dev.tags, dev.meta, map[string]interface{}{"value": value}, timestamp)
if err == nil {
output <- y
}
}
if !dev.excludeMetrics["rocm_v1_clock"] {
value := metrics.Average_vclk1_frequency
y, err := lp.NewMessage("rocm_v1_clock", dev.tags, dev.meta, map[string]any{"value": value}, timestamp)
y, err := lp.NewMessage("rocm_v1_clock", dev.tags, dev.meta, map[string]interface{}{"value": value}, timestamp)
if err == nil {
output <- y
}
}
if !dev.excludeMetrics["rocm_d0_clock"] {
value := metrics.Average_dclk0_frequency
y, err := lp.NewMessage("rocm_d0_clock", dev.tags, dev.meta, map[string]any{"value": value}, timestamp)
y, err := lp.NewMessage("rocm_d0_clock", dev.tags, dev.meta, map[string]interface{}{"value": value}, timestamp)
if err == nil {
output <- y
}
}
if !dev.excludeMetrics["rocm_d1_clock"] {
value := metrics.Average_dclk1_frequency
y, err := lp.NewMessage("rocm_d1_clock", dev.tags, dev.meta, map[string]any{"value": value}, timestamp)
y, err := lp.NewMessage("rocm_d1_clock", dev.tags, dev.meta, map[string]interface{}{"value": value}, timestamp)
if err == nil {
output <- y
}
}
if !dev.excludeMetrics["rocm_temp_hbm"] {
for i := range rocm_smi.NUM_HBM_INSTANCES {
for i := 0; i < rocm_smi.NUM_HBM_INSTANCES; i++ {
value := metrics.Temperature_hbm[i]
y, err := lp.NewMessage("rocm_temp_hbm", dev.tags, dev.meta, map[string]any{"value": value}, timestamp)
y, err := lp.NewMessage("rocm_temp_hbm", dev.tags, dev.meta, map[string]interface{}{"value": value}, timestamp)
if err == nil {
y.AddTag("stype", "device")
y.AddTag("stype-id", strconv.Itoa(i))
y.AddTag("stype-id", fmt.Sprintf("%d", i))
output <- y
}
}
}
}
}
// Close metric collector: close network connection, close files, close libraries, ...

View File

@@ -15,9 +15,7 @@ hugo_path: docs/reference/cc-metric-collector/collectors/rocmsmi.md
```json
"rocm_smi": {
"exclude_devices": [
"0",
"1",
"0000000:ff:01.0"
"0","1", "0000000:ff:01.0"
],
"exclude_metrics": [
"rocm_mm_util",
@@ -25,7 +23,7 @@ hugo_path: docs/reference/cc-metric-collector/collectors/rocmsmi.md
],
"use_pci_info_as_type_id": true,
"add_pci_info_tag": false,
"add_serial_meta": false
"add_serial_meta": false,
}
```

View File

@@ -8,12 +8,11 @@
package collectors
import (
"bytes"
"encoding/json"
"fmt"
"time"
lp "github.com/ClusterCockpit/cc-lib/v2/ccMessage"
cclog "github.com/ClusterCockpit/cc-lib/ccLogger"
lp "github.com/ClusterCockpit/cc-lib/ccMessage"
)
// These are the fields we read from the JSON configuration
@@ -25,7 +24,6 @@ type SampleCollectorConfig struct {
// defined by metricCollector (name, init, ...)
type SampleCollector struct {
metricCollector
config SampleCollectorConfig // the configuration structure
meta map[string]string // default meta information
tags map[string]string // default tags
@@ -43,19 +41,14 @@ func (m *SampleCollector) Init(config json.RawMessage) error {
// Always set the name early in Init() to use it in cclog.Component* functions
m.name = "SampleCollector"
// This is for later use, also call it early
if err := m.setup(); err != nil {
return fmt.Errorf("%s Init(): setup() call failed: %w", m.name, err)
}
m.setup()
// Tell whether the collector should be run in parallel with others (reading files, ...)
// or it should be run serially, mostly for collectors actually doing measurements
// because they should not measure the execution of the other collectors
m.parallel = true
// Define meta information sent with each metric
// (Can also be dynamic or this is the basic set with extension through AddMeta())
m.meta = map[string]string{
"source": m.name,
"group": "SAMPLE",
}
m.meta = map[string]string{"source": m.name, "group": "SAMPLE"}
// Define tags sent with each metric
// The 'type' tag is always needed, it defines the granularity of the metric
// node -> whole system
@@ -66,15 +59,13 @@ func (m *SampleCollector) Init(config json.RawMessage) error {
// core -> single CPU core that may consist of multiple hardware threads (SMT) (requires core ID as 'type-id' tag)
// hwthtread -> single CPU hardware thread (requires hardware thread ID as 'type-id' tag)
// accelerator -> A accelerator device like GPU or FPGA (requires an accelerator ID as 'type-id' tag)
m.tags = map[string]string{
"type": "node",
}
m.tags = map[string]string{"type": "node"}
// Read in the JSON configuration
if len(config) > 0 {
d := json.NewDecoder(bytes.NewReader(config))
d.DisallowUnknownFields()
if err := d.Decode(&m.config); err != nil {
return fmt.Errorf("%s Init(): Error decoding JSON config: %w", m.name, err)
err = json.Unmarshal(config, &m.config)
if err != nil {
cclog.ComponentError(m.name, "Error reading config:", err.Error())
return err
}
}
@@ -101,11 +92,12 @@ func (m *SampleCollector) Read(interval time.Duration, output chan lp.CCMessage)
// stop := readState()
// value = (stop - start) / interval.Seconds()
y, err := lp.NewMetric("sample_metric", m.tags, m.meta, value, timestamp)
y, err := lp.NewMessage("sample_metric", m.tags, m.meta, map[string]interface{}{"value": value}, timestamp)
if err == nil {
// Send it to output channel
output <- y
}
}
// Close metric collector: close network connection, close files, close libraries, ...

View File

@@ -8,14 +8,12 @@
package collectors
import (
"bytes"
"encoding/json"
"fmt"
"sync"
"time"
cclog "github.com/ClusterCockpit/cc-lib/v2/ccLogger"
lp "github.com/ClusterCockpit/cc-lib/v2/ccMessage"
cclog "github.com/ClusterCockpit/cc-lib/ccLogger"
lp "github.com/ClusterCockpit/cc-lib/ccMessage"
)
// These are the fields we read from the JSON configuration
@@ -27,7 +25,6 @@ type SampleTimerCollectorConfig struct {
// defined by metricCollector (name, init, ...)
type SampleTimerCollector struct {
metricCollector
wg sync.WaitGroup // sync group for management
done chan bool // channel for management
meta map[string]string // default meta information
@@ -39,39 +36,33 @@ type SampleTimerCollector struct {
}
func (m *SampleTimerCollector) Init(name string, config json.RawMessage) error {
var err error
var err error = nil
// Always set the name early in Init() to use it in cclog.Component* functions
m.name = "SampleTimerCollector"
// This is for later use, also call it early
if err = m.setup(); err != nil {
return fmt.Errorf("%s Init(): setup() call failed: %w", m.name, err)
}
m.setup()
// Define meta information sent with each metric
// (Can also be dynamic or this is the basic set with extension through AddMeta())
m.meta = map[string]string{
"source": m.name,
"group": "SAMPLE",
}
m.meta = map[string]string{"source": m.name, "group": "SAMPLE"}
// Define tags sent with each metric
// The 'type' tag is always needed, it defines the granularity of the metric
// node -> whole system
// socket -> CPU socket (requires socket ID as 'type-id' tag)
// cpu -> single CPU hardware thread (requires cpu ID as 'type-id' tag)
m.tags = map[string]string{
"type": "node",
}
m.tags = map[string]string{"type": "node"}
// Read in the JSON configuration
if len(config) > 0 {
d := json.NewDecoder(bytes.NewReader(config))
d.DisallowUnknownFields()
if err := d.Decode(&m.config); err != nil {
return fmt.Errorf("%s Init(): error decoding JSON config: %w", m.name, err)
err = json.Unmarshal(config, &m.config)
if err != nil {
cclog.ComponentError(m.name, "Error reading config:", err.Error())
return err
}
}
// Parse the read interval duration
m.interval, err = time.ParseDuration(m.config.Interval)
if err != nil {
return fmt.Errorf("%s Init(): error parsing interval: %w", m.name, err)
cclog.ComponentError(m.name, "Error parsing interval:", err.Error())
return err
}
// Storage for output channel
@@ -82,11 +73,13 @@ func (m *SampleTimerCollector) Init(name string, config json.RawMessage) error {
m.ticker = time.NewTicker(m.interval)
// Start the timer loop with return functionality by sending 'true' to the done channel
m.wg.Go(func() {
m.wg.Add(1)
go func() {
select {
case <-m.done:
// Exit the timer loop
cclog.ComponentDebug(m.name, "Closing...")
m.wg.Done()
return
case timestamp := <-m.ticker.C:
// This is executed every timer tick but we have to wait until the first
@@ -95,7 +88,7 @@ func (m *SampleTimerCollector) Init(name string, config json.RawMessage) error {
m.ReadMetrics(timestamp)
}
}
})
}()
// Set this flag only if everything is initialized properly, all required files exist, ...
m.init = true
@@ -114,7 +107,7 @@ func (m *SampleTimerCollector) ReadMetrics(timestamp time.Time) {
// stop := readState()
// value = (stop - start) / interval.Seconds()
y, err := lp.NewMetric("sample_metric", m.tags, m.meta, value, timestamp)
y, err := lp.NewMessage("sample_metric", m.tags, m.meta, map[string]interface{}{"value": value}, timestamp)
if err == nil && m.output != nil {
// Send it to output channel if we have a valid channel
m.output <- y

View File

@@ -9,16 +9,16 @@ package collectors
import (
"bufio"
"bytes"
"encoding/json"
"fmt"
"math"
"os"
"strconv"
"strings"
"time"
cclog "github.com/ClusterCockpit/cc-lib/v2/ccLogger"
lp "github.com/ClusterCockpit/cc-lib/v2/ccMessage"
cclog "github.com/ClusterCockpit/cc-lib/ccLogger"
lp "github.com/ClusterCockpit/cc-lib/ccMessage"
)
const SCHEDSTATFILE = `/proc/schedstat`
@@ -32,7 +32,6 @@ type SchedstatCollectorConfig struct {
// defined by metricCollector (name, init, ...)
type SchedstatCollector struct {
metricCollector
config SchedstatCollectorConfig // the configuration structure
lastTimestamp time.Time // Store time stamp of last tick to derive values
meta map[string]string // default meta information
@@ -48,39 +47,37 @@ type SchedstatCollector struct {
// Called once by the collector manager
// All tags, meta data tags and metrics that do not change over the runtime should be set here
func (m *SchedstatCollector) Init(config json.RawMessage) error {
var err error = nil
// Always set the name early in Init() to use it in cclog.Component* functions
m.name = "SchedstatCollector"
// This is for later use, also call it early
if err := m.setup(); err != nil {
return fmt.Errorf("%s Init(): setup() call failed: %w", m.name, err)
}
m.setup()
// Tell whether the collector should be run in parallel with others (reading files, ...)
// or it should be run serially, mostly for collectors actually doing measurements
// or it should be run serially, mostly for collectors acutally doing measurements
// because they should not measure the execution of the other collectors
m.parallel = true
// Define meta information sent with each metric
// (Can also be dynamic or this is the basic set with extension through AddMeta())
m.meta = map[string]string{
"source": m.name,
"group": "SCHEDSTAT",
}
m.meta = map[string]string{"source": m.name, "group": "SCHEDSTAT"}
// Read in the JSON configuration
if len(config) > 0 {
d := json.NewDecoder(bytes.NewReader(config))
d.DisallowUnknownFields()
if err := d.Decode(&m.config); err != nil {
return fmt.Errorf("%s Init(): failed to decode JSON config: %w", m.name, err)
err = json.Unmarshal(config, &m.config)
if err != nil {
cclog.ComponentError(m.name, "Error reading config:", err.Error())
return err
}
}
// Check input file
file, err := os.Open(SCHEDSTATFILE)
file, err := os.Open(string(SCHEDSTATFILE))
if err != nil {
return fmt.Errorf("%s Init(): Failed opening scheduler statistics file \"%s\": %w", m.name, SCHEDSTATFILE, err)
cclog.ComponentError(m.name, err.Error())
}
defer file.Close()
// Pre-generate tags for all CPUs
num_cpus := 0
m.cputags = make(map[string]map[string]string)
m.olddata = make(map[string]map[string]int64)
scanner := bufio.NewScanner(file)
@@ -92,18 +89,10 @@ func (m *SchedstatCollector) Init(config json.RawMessage) error {
cpu, _ := strconv.Atoi(cpustr)
running, _ := strconv.ParseInt(linefields[7], 10, 64)
waiting, _ := strconv.ParseInt(linefields[8], 10, 64)
m.cputags[linefields[0]] = map[string]string{
"type": "hwthread",
"type-id": strconv.Itoa(cpu),
m.cputags[linefields[0]] = map[string]string{"type": "hwthread", "type-id": fmt.Sprintf("%d", cpu)}
m.olddata[linefields[0]] = map[string]int64{"running": running, "waiting": waiting}
num_cpus++
}
m.olddata[linefields[0]] = map[string]int64{
"running": running,
"waiting": waiting,
}
}
}
if err := file.Close(); err != nil {
return fmt.Errorf("%s Init(): Failed closing scheduler statistics file \"%s\": %w", m.name, SCHEDSTATFILE, err)
}
// Save current timestamp
@@ -120,14 +109,14 @@ func (m *SchedstatCollector) ParseProcLine(linefields []string, tags map[string]
diff_running := running - m.olddata[linefields[0]]["running"]
diff_waiting := waiting - m.olddata[linefields[0]]["waiting"]
l_running := float64(diff_running) / tsdelta.Seconds() / 1000_000_000
l_waiting := float64(diff_waiting) / tsdelta.Seconds() / 1000_000_000
var l_running float64 = float64(diff_running) / tsdelta.Seconds() / (math.Pow(1000, 3))
var l_waiting float64 = float64(diff_waiting) / tsdelta.Seconds() / (math.Pow(1000, 3))
m.olddata[linefields[0]]["running"] = running
m.olddata[linefields[0]]["waiting"] = waiting
value := l_running + l_waiting
y, err := lp.NewMetric("cpu_load_core", tags, m.meta, value, now)
y, err := lp.NewMessage("cpu_load_core", tags, m.meta, map[string]interface{}{"value": value}, now)
if err == nil {
// Send it to output channel
output <- y
@@ -141,23 +130,15 @@ func (m *SchedstatCollector) Read(interval time.Duration, output chan lp.CCMessa
return
}
// timestamps
//timestamps
now := time.Now()
tsdelta := now.Sub(m.lastTimestamp)
file, err := os.Open(SCHEDSTATFILE)
file, err := os.Open(string(SCHEDSTATFILE))
if err != nil {
cclog.ComponentError(
m.name,
fmt.Sprintf("Read(): Failed to open file '%s': %v", SCHEDSTATFILE, err))
cclog.ComponentError(m.name, err.Error())
}
defer func() {
if err := file.Close(); err != nil {
cclog.ComponentError(
m.name,
fmt.Sprintf("Read(): Failed to close file '%s': %v", SCHEDSTATFILE, err))
}
}()
defer file.Close()
scanner := bufio.NewScanner(file)
for scanner.Scan() {
@@ -169,6 +150,7 @@ func (m *SchedstatCollector) Read(interval time.Duration, output chan lp.CCMessa
}
m.lastTimestamp = now
}
// Close metric collector: close network connection, close files, close libraries, ...

View File

@@ -8,14 +8,13 @@
package collectors
import (
"bytes"
"encoding/json"
"fmt"
"runtime"
"syscall"
"time"
lp "github.com/ClusterCockpit/cc-lib/v2/ccMessage"
cclog "github.com/ClusterCockpit/cc-lib/ccLogger"
lp "github.com/ClusterCockpit/cc-lib/ccMessage"
)
type SelfCollectorConfig struct {
@@ -27,7 +26,6 @@ type SelfCollectorConfig struct {
type SelfCollector struct {
metricCollector
config SelfCollectorConfig // the configuration structure
meta map[string]string // default meta information
tags map[string]string // default tags
@@ -36,22 +34,15 @@ type SelfCollector struct {
func (m *SelfCollector) Init(config json.RawMessage) error {
var err error = nil
m.name = "SelfCollector"
if err := m.setup(); err != nil {
return fmt.Errorf("%s Init(): setup() call failed: %w", m.name, err)
}
m.setup()
m.parallel = true
m.meta = map[string]string{
"source": m.name,
"group": "Self",
}
m.tags = map[string]string{
"type": "node",
}
m.meta = map[string]string{"source": m.name, "group": "Self"}
m.tags = map[string]string{"type": "node"}
if len(config) > 0 {
d := json.NewDecoder(bytes.NewReader(config))
d.DisallowUnknownFields()
if err := d.Decode(&m.config); err != nil {
return fmt.Errorf("%s Init(): Error decoding JSON config: %w", m.name, err)
err = json.Unmarshal(config, &m.config)
if err != nil {
cclog.ComponentError(m.name, "Error reading config:", err.Error())
return err
}
}
m.init = true
@@ -65,49 +56,49 @@ func (m *SelfCollector) Read(interval time.Duration, output chan lp.CCMessage) {
var memstats runtime.MemStats
runtime.ReadMemStats(&memstats)
y, err := lp.NewMetric("total_alloc", m.tags, m.meta, memstats.TotalAlloc, timestamp)
y, err := lp.NewMessage("total_alloc", m.tags, m.meta, map[string]interface{}{"value": memstats.TotalAlloc}, timestamp)
if err == nil {
y.AddMeta("unit", "Bytes")
output <- y
}
y, err = lp.NewMetric("heap_alloc", m.tags, m.meta, memstats.HeapAlloc, timestamp)
y, err = lp.NewMessage("heap_alloc", m.tags, m.meta, map[string]interface{}{"value": memstats.HeapAlloc}, timestamp)
if err == nil {
y.AddMeta("unit", "Bytes")
output <- y
}
y, err = lp.NewMetric("heap_sys", m.tags, m.meta, memstats.HeapSys, timestamp)
y, err = lp.NewMessage("heap_sys", m.tags, m.meta, map[string]interface{}{"value": memstats.HeapSys}, timestamp)
if err == nil {
y.AddMeta("unit", "Bytes")
output <- y
}
y, err = lp.NewMetric("heap_idle", m.tags, m.meta, memstats.HeapIdle, timestamp)
y, err = lp.NewMessage("heap_idle", m.tags, m.meta, map[string]interface{}{"value": memstats.HeapIdle}, timestamp)
if err == nil {
y.AddMeta("unit", "Bytes")
output <- y
}
y, err = lp.NewMetric("heap_inuse", m.tags, m.meta, memstats.HeapInuse, timestamp)
y, err = lp.NewMessage("heap_inuse", m.tags, m.meta, map[string]interface{}{"value": memstats.HeapInuse}, timestamp)
if err == nil {
y.AddMeta("unit", "Bytes")
output <- y
}
y, err = lp.NewMetric("heap_released", m.tags, m.meta, memstats.HeapReleased, timestamp)
y, err = lp.NewMessage("heap_released", m.tags, m.meta, map[string]interface{}{"value": memstats.HeapReleased}, timestamp)
if err == nil {
y.AddMeta("unit", "Bytes")
output <- y
}
y, err = lp.NewMetric("heap_objects", m.tags, m.meta, memstats.HeapObjects, timestamp)
y, err = lp.NewMessage("heap_objects", m.tags, m.meta, map[string]interface{}{"value": memstats.HeapObjects}, timestamp)
if err == nil {
output <- y
}
}
if m.config.GoRoutines {
y, err := lp.NewMetric("num_goroutines", m.tags, m.meta, runtime.NumGoroutine(), timestamp)
y, err := lp.NewMessage("num_goroutines", m.tags, m.meta, map[string]interface{}{"value": runtime.NumGoroutine()}, timestamp)
if err == nil {
output <- y
}
}
if m.config.CgoCalls {
y, err := lp.NewMetric("num_cgo_calls", m.tags, m.meta, runtime.NumCgoCall(), timestamp)
y, err := lp.NewMessage("num_cgo_calls", m.tags, m.meta, map[string]interface{}{"value": runtime.NumCgoCall()}, timestamp)
if err == nil {
output <- y
}
@@ -118,35 +109,35 @@ func (m *SelfCollector) Read(interval time.Duration, output chan lp.CCMessage) {
if err == nil {
sec, nsec := rusage.Utime.Unix()
t := float64(sec) + (float64(nsec) * 1e-9)
y, err := lp.NewMetric("rusage_user_time", m.tags, m.meta, t, timestamp)
y, err := lp.NewMessage("rusage_user_time", m.tags, m.meta, map[string]interface{}{"value": t}, timestamp)
if err == nil {
y.AddMeta("unit", "seconds")
output <- y
}
sec, nsec = rusage.Stime.Unix()
t = float64(sec) + (float64(nsec) * 1e-9)
y, err = lp.NewMetric("rusage_system_time", m.tags, m.meta, t, timestamp)
y, err = lp.NewMessage("rusage_system_time", m.tags, m.meta, map[string]interface{}{"value": t}, timestamp)
if err == nil {
y.AddMeta("unit", "seconds")
output <- y
}
y, err = lp.NewMetric("rusage_vol_ctx_switch", m.tags, m.meta, rusage.Nvcsw, timestamp)
y, err = lp.NewMessage("rusage_vol_ctx_switch", m.tags, m.meta, map[string]interface{}{"value": rusage.Nvcsw}, timestamp)
if err == nil {
output <- y
}
y, err = lp.NewMetric("rusage_invol_ctx_switch", m.tags, m.meta, rusage.Nivcsw, timestamp)
y, err = lp.NewMessage("rusage_invol_ctx_switch", m.tags, m.meta, map[string]interface{}{"value": rusage.Nivcsw}, timestamp)
if err == nil {
output <- y
}
y, err = lp.NewMetric("rusage_signals", m.tags, m.meta, rusage.Nsignals, timestamp)
y, err = lp.NewMessage("rusage_signals", m.tags, m.meta, map[string]interface{}{"value": rusage.Nsignals}, timestamp)
if err == nil {
output <- y
}
y, err = lp.NewMetric("rusage_major_pgfaults", m.tags, m.meta, rusage.Majflt, timestamp)
y, err = lp.NewMessage("rusage_major_pgfaults", m.tags, m.meta, map[string]interface{}{"value": rusage.Majflt}, timestamp)
if err == nil {
output <- y
}
y, err = lp.NewMetric("rusage_minor_pgfaults", m.tags, m.meta, rusage.Minflt, timestamp)
y, err = lp.NewMessage("rusage_minor_pgfaults", m.tags, m.meta, map[string]interface{}{"value": rusage.Minflt}, timestamp)
if err == nil {
output <- y
}

View File

@@ -11,8 +11,8 @@ import (
"strings"
"time"
cclog "github.com/ClusterCockpit/cc-lib/v2/ccLogger"
lp "github.com/ClusterCockpit/cc-lib/v2/ccMessage"
cclog "github.com/ClusterCockpit/cc-lib/ccLogger"
lp "github.com/ClusterCockpit/cc-lib/ccMessage"
)
type SlurmJobData struct {
@@ -32,7 +32,6 @@ type SlurmCgroupsConfig struct {
type SlurmCgroupCollector struct {
metricCollector
config SlurmCgroupsConfig
meta map[string]string
tags map[string]string
@@ -51,7 +50,8 @@ func ParseCPUs(cpuset string) ([]int, error) {
return result, nil
}
for r := range strings.SplitSeq(cpuset, ",") {
ranges := strings.Split(cpuset, ",")
for _, r := range ranges {
if strings.Contains(r, "-") {
parts := strings.Split(r, "-")
if len(parts) != 2 {
@@ -80,10 +80,9 @@ func ParseCPUs(cpuset string) ([]int, error) {
}
func GetAllCPUs() ([]int, error) {
cpuOnline := "/sys/devices/system/cpu/online"
data, err := os.ReadFile(cpuOnline)
data, err := os.ReadFile("/sys/devices/system/cpu/online")
if err != nil {
return nil, fmt.Errorf("failed to read file \"%s\": %w", cpuOnline, err)
return nil, fmt.Errorf("failed to read /sys/devices/system/cpu/online: %v", err)
}
return ParseCPUs(strings.TrimSpace(string(data)))
}
@@ -104,25 +103,18 @@ func (m *SlurmCgroupCollector) readFile(path string) ([]byte, error) {
func (m *SlurmCgroupCollector) Init(config json.RawMessage) error {
var err error
m.name = "SlurmCgroupCollector"
if err := m.setup(); err != nil {
return fmt.Errorf("%s Init(): setup() call failed: %w", m.name, err)
}
m.setup()
m.parallel = true
m.meta = map[string]string{
"source": m.name,
"group": "SLURM",
}
m.tags = map[string]string{
"type": "hwthread",
}
m.meta = map[string]string{"source": m.name, "group": "SLURM"}
m.tags = map[string]string{"type": "hwthread"}
m.cpuUsed = make(map[int]bool)
m.cgroupBase = defaultCgroupBase
if len(config) > 0 {
d := json.NewDecoder(strings.NewReader(string(config)))
d.DisallowUnknownFields()
if err = d.Decode(&m.config); err != nil {
return fmt.Errorf("%s Init(): Error reading JSON config: %w", m.name, err)
err = json.Unmarshal(config, &m.config)
if err != nil {
cclog.ComponentError(m.name, "Error reading config:", err.Error())
return err
}
m.excludeMetrics = make(map[string]struct{})
for _, metric := range m.config.ExcludeMetrics {
@@ -137,16 +129,19 @@ func (m *SlurmCgroupCollector) Init(config json.RawMessage) error {
if !m.useSudo {
user, err := user.Current()
if err != nil {
return fmt.Errorf("%s Init(): Failed to get current user: %w", m.name, err)
cclog.ComponentError(m.name, "Failed to get current user:", err.Error())
return err
}
if user.Uid != "0" {
return fmt.Errorf("%s Init(): Reading cgroup files requires root privileges (or enable use_sudo in config)", m.name)
cclog.ComponentError(m.name, "Reading cgroup files requires root privileges (or enable use_sudo in config)")
return fmt.Errorf("not root")
}
}
m.allCPUs, err = GetAllCPUs()
if err != nil {
return fmt.Errorf("%s Init(): Error reading online CPUs: %w", m.name, err)
cclog.ComponentError(m.name, "Error reading online CPUs:", err.Error())
return err
}
m.init = true
@@ -163,9 +158,7 @@ func (m *SlurmCgroupCollector) ReadJobData(jobdir string) (SlurmJobData, error)
CpuSet: []int{},
}
cg := func(f string) string {
return filepath.Join(m.cgroupBase, jobdir, f)
}
cg := func(f string) string { return filepath.Join(m.cgroupBase, jobdir, f) }
memUsage, err := m.readFile(cg("memory.current"))
if err == nil {
@@ -214,8 +207,8 @@ func (m *SlurmCgroupCollector) ReadJobData(jobdir string) (SlurmJobData, error)
}
}
if usageUsec > 0 {
jobdata.CpuUsageUser = (userUsec * 100.0 / usageUsec)
jobdata.CpuUsageSys = (systemUsec * 100.0 / usageUsec)
jobdata.CpuUsageUser = (userUsec * 100 / usageUsec)
jobdata.CpuUsageSys = (systemUsec * 100 / usageUsec)
}
}
@@ -258,19 +251,12 @@ func (m *SlurmCgroupCollector) Read(interval time.Duration, output chan lp.CCMes
for _, cpu := range jobdata.CpuSet {
coreTags := map[string]string{
"type": "hwthread",
"type-id": strconv.Itoa(cpu),
"type-id": fmt.Sprintf("%d", cpu),
}
if coreCount > 0 && !m.isExcluded("job_mem_used") {
memPerCore := jobdata.MemoryUsage / coreCount
if y, err := lp.NewMessage(
"job_mem_used",
coreTags,
m.meta,
map[string]any{
"value": memPerCore,
},
timestamp); err == nil {
if y, err := lp.NewMessage("job_mem_used", coreTags, m.meta, map[string]interface{}{"value": memPerCore}, timestamp); err == nil {
y.AddMeta("unit", "Bytes")
output <- y
}
@@ -278,14 +264,7 @@ func (m *SlurmCgroupCollector) Read(interval time.Duration, output chan lp.CCMes
if coreCount > 0 && !m.isExcluded("job_max_mem_used") {
maxMemPerCore := jobdata.MaxMemoryUsage / coreCount
if y, err := lp.NewMessage(
"job_max_mem_used",
coreTags,
m.meta,
map[string]any{
"value": maxMemPerCore,
},
timestamp); err == nil {
if y, err := lp.NewMessage("job_max_mem_used", coreTags, m.meta, map[string]interface{}{"value": maxMemPerCore}, timestamp); err == nil {
y.AddMeta("unit", "Bytes")
output <- y
}
@@ -293,14 +272,7 @@ func (m *SlurmCgroupCollector) Read(interval time.Duration, output chan lp.CCMes
if coreCount > 0 && !m.isExcluded("job_mem_limit") {
limitPerCore := jobdata.LimitMemoryUsage / coreCount
if y, err := lp.NewMessage(
"job_mem_limit",
coreTags,
m.meta,
map[string]any{
"value": limitPerCore,
},
timestamp); err == nil {
if y, err := lp.NewMessage("job_mem_limit", coreTags, m.meta, map[string]interface{}{"value": limitPerCore}, timestamp); err == nil {
y.AddMeta("unit", "Bytes")
output <- y
}
@@ -308,14 +280,7 @@ func (m *SlurmCgroupCollector) Read(interval time.Duration, output chan lp.CCMes
if coreCount > 0 && !m.isExcluded("job_user_cpu") {
cpuUserPerCore := jobdata.CpuUsageUser / coreCount
if y, err := lp.NewMessage(
"job_user_cpu",
coreTags,
m.meta,
map[string]any{
"value": cpuUserPerCore,
},
timestamp); err == nil {
if y, err := lp.NewMessage("job_user_cpu", coreTags, m.meta, map[string]interface{}{"value": cpuUserPerCore}, timestamp); err == nil {
y.AddMeta("unit", "%")
output <- y
}
@@ -323,14 +288,7 @@ func (m *SlurmCgroupCollector) Read(interval time.Duration, output chan lp.CCMes
if coreCount > 0 && !m.isExcluded("job_sys_cpu") {
cpuSysPerCore := jobdata.CpuUsageSys / coreCount
if y, err := lp.NewMessage(
"job_sys_cpu",
coreTags,
m.meta,
map[string]any{
"value": cpuSysPerCore,
},
timestamp); err == nil {
if y, err := lp.NewMessage("job_sys_cpu", coreTags, m.meta, map[string]interface{}{"value": cpuSysPerCore}, timestamp); err == nil {
y.AddMeta("unit", "%")
output <- y
}
@@ -345,60 +303,39 @@ func (m *SlurmCgroupCollector) Read(interval time.Duration, output chan lp.CCMes
if !m.cpuUsed[cpu] {
coreTags := map[string]string{
"type": "hwthread",
"type-id": strconv.Itoa(cpu),
"type-id": fmt.Sprintf("%d", cpu),
}
if !m.isExcluded("job_mem_used") {
if y, err := lp.NewMessage(
"job_mem_used",
coreTags,
m.meta,
map[string]any{
"value": 0,
},
timestamp); err == nil {
if y, err := lp.NewMessage("job_mem_used", coreTags, m.meta, map[string]interface{}{"value": 0}, timestamp); err == nil {
y.AddMeta("unit", "Bytes")
output <- y
}
}
if !m.isExcluded("job_max_mem_used") {
if y, err := lp.NewMessage(
"job_max_mem_used",
coreTags,
m.meta,
map[string]any{
"value": 0,
},
timestamp); err == nil {
if y, err := lp.NewMessage("job_max_mem_used", coreTags, m.meta, map[string]interface{}{"value": 0}, timestamp); err == nil {
y.AddMeta("unit", "Bytes")
output <- y
}
}
if !m.isExcluded("job_mem_limit") {
if y, err := lp.NewMessage(
"job_mem_limit",
coreTags,
m.meta,
map[string]any{
"value": 0,
},
timestamp); err == nil {
if y, err := lp.NewMessage("job_mem_limit", coreTags, m.meta, map[string]interface{}{"value": 0}, timestamp); err == nil {
y.AddMeta("unit", "Bytes")
output <- y
}
}
if !m.isExcluded("job_user_cpu") {
if y, err := lp.NewMessage("job_user_cpu", coreTags, m.meta, map[string]any{"value": 0}, timestamp); err == nil {
if y, err := lp.NewMessage("job_user_cpu", coreTags, m.meta, map[string]interface{}{"value": 0}, timestamp); err == nil {
y.AddMeta("unit", "%")
output <- y
}
}
if !m.isExcluded("job_sys_cpu") {
if y, err := lp.NewMessage("job_sys_cpu", coreTags, m.meta, map[string]any{"value": 0}, timestamp); err == nil {
if y, err := lp.NewMessage("job_sys_cpu", coreTags, m.meta, map[string]interface{}{"value": 0}, timestamp); err == nil {
y.AddMeta("unit", "%")
output <- y
}

View File

@@ -1,360 +0,0 @@
package collectors
import (
"bytes"
"encoding/json"
"fmt"
"os/exec"
"slices"
"time"
cclog "github.com/ClusterCockpit/cc-lib/v2/ccLogger"
lp "github.com/ClusterCockpit/cc-lib/v2/ccMessage"
)
type SmartMonCollectorConfig struct {
UseSudo bool `json:"use_sudo,omitempty"`
ExcludeDevices []string `json:"exclude_devices,omitempty"`
ExcludeMetrics []string `json:"excludeMetrics,omitempty"`
Devices []struct {
Name string `json:"name"`
Type string `json:"type"`
} `json:"devices,omitempty"`
}
type deviceT struct {
Name string `json:"name"`
Type string `json:"type"`
queryCommand []string
}
type SmartMonCollector struct {
metricCollector
config SmartMonCollectorConfig // the configuration structure
meta map[string]string // default meta information
tags map[string]string // default tags
devices []deviceT // smartmon devices
sudoCmd string // Full path to 'sudo' command
smartCtlCmd string // Full path to 'smartctl' command
excludeMetric struct {
temp,
percentUsed,
availSpare,
dataUnitsRead,
dataUnitsWrite,
hostReads,
hostWrites,
powerCycles,
powerOn,
UnsafeShutdowns,
mediaErrors,
errlogEntries,
warnTempTime,
critCompTime bool
}
}
func (m *SmartMonCollector) getSmartmonDevices() error {
// Use configured devices
if len(m.config.Devices) > 0 {
for _, configDevice := range m.config.Devices {
if !slices.Contains(m.config.ExcludeDevices, configDevice.Name) {
d := deviceT{
Name: configDevice.Name,
Type: configDevice.Type,
}
if m.config.UseSudo {
d.queryCommand = append(d.queryCommand, m.sudoCmd)
}
d.queryCommand = append(d.queryCommand, m.smartCtlCmd, "--json=c", "--device="+d.Type, "--all", d.Name)
m.devices = append(m.devices, d)
}
}
return nil
}
// Use scan command
var scanCmd []string
if m.config.UseSudo {
scanCmd = append(scanCmd, m.sudoCmd)
}
scanCmd = append(scanCmd, m.smartCtlCmd, "--scan", "--json=c")
command := exec.Command(scanCmd[0], scanCmd[1:]...)
stdout, err := command.Output()
if err != nil {
return fmt.Errorf(
"%s getSmartmonDevices(): Failed to execute device scan command %s: %w",
m.name, command.String(), err)
}
var scanOutput struct {
Devices []deviceT `json:"devices"`
}
err = json.Unmarshal(stdout, &scanOutput)
if err != nil {
return fmt.Errorf("%s getSmartmonDevices(): Failed to parse JSON output from device scan command: %w",
m.name, err)
}
m.devices = make([]deviceT, 0)
for _, d := range scanOutput.Devices {
if !slices.Contains(m.config.ExcludeDevices, d.Name) {
if m.config.UseSudo {
d.queryCommand = append(d.queryCommand, m.sudoCmd)
}
d.queryCommand = append(d.queryCommand, m.smartCtlCmd, "--json=c", "--device="+d.Type, "--all", d.Name)
m.devices = append(m.devices, d)
}
}
return nil
}
func (m *SmartMonCollector) Init(config json.RawMessage) error {
m.name = "SmartMonCollector"
if err := m.setup(); err != nil {
return fmt.Errorf("%s Init(): setup() call failed: %w", m.name, err)
}
m.parallel = true
m.meta = map[string]string{
"source": m.name,
"group": "Disk",
}
m.tags = map[string]string{
"type": "node",
"stype": "disk",
}
// Read in the JSON configuration
if len(config) > 0 {
d := json.NewDecoder(bytes.NewReader(config))
d.DisallowUnknownFields()
if err := d.Decode(&m.config); err != nil {
return fmt.Errorf("%s Init(): Error reading config: %w", m.name, err)
}
}
for _, excludeMetric := range m.config.ExcludeMetrics {
switch excludeMetric {
case "smartmon_temp":
m.excludeMetric.temp = true
case "smartmon_percent_used":
m.excludeMetric.percentUsed = true
case "smartmon_avail_spare":
m.excludeMetric.availSpare = true
case "smartmon_data_units_read":
m.excludeMetric.dataUnitsRead = true
case "smartmon_data_units_write":
m.excludeMetric.dataUnitsWrite = true
case "smartmon_host_reads":
m.excludeMetric.hostReads = true
case "smartmon_host_writes":
m.excludeMetric.hostWrites = true
case "smartmon_power_cycles":
m.excludeMetric.powerCycles = true
case "smartmon_power_on":
m.excludeMetric.powerOn = true
case "smartmon_unsafe_shutdowns":
m.excludeMetric.UnsafeShutdowns = true
case "smartmon_media_errors":
m.excludeMetric.mediaErrors = true
case "smartmon_errlog_entries":
m.excludeMetric.errlogEntries = true
case "smartmon_warn_temp_time":
m.excludeMetric.warnTempTime = true
case "smartmon_crit_comp_time":
m.excludeMetric.critCompTime = true
default:
return fmt.Errorf("%s Init(): Unknown excluded metric: %s", m.name, excludeMetric)
}
}
// Check if sudo and smartctl are in search path
if m.config.UseSudo {
p, err := exec.LookPath("sudo")
if err != nil {
return fmt.Errorf("%s Init(): No sudo command found in search path: %w", m.name, err)
}
m.sudoCmd = p
}
p, err := exec.LookPath("smartctl")
if err != nil {
return fmt.Errorf("%s Init(): No smartctl command found in search path: %w", m.name, err)
}
m.smartCtlCmd = p
if err = m.getSmartmonDevices(); err != nil {
return err
}
m.init = true
return err
}
type SmartMonData struct {
SerialNumber string `json:"serial_number"`
UserCapacity struct {
Blocks int `json:"blocks"`
Bytes int `json:"bytes"`
} `json:"user_capacity"`
HealthLog struct {
// Available SMART health information:
// sudo smartctl -a --json=c /dev/nvme0 | jq --color-output | less --RAW-CONTROL-CHARS
Temperature int `json:"temperature"`
PercentageUsed int `json:"percentage_used"`
AvailableSpare int `json:"available_spare"`
DataUnitsRead int `json:"data_units_read"`
DataUnitsWrite int `json:"data_units_written"`
HostReads int `json:"host_reads"`
HostWrites int `json:"host_writes"`
PowerCycles int `json:"power_cycles"`
PowerOnHours int `json:"power_on_hours"`
UnsafeShutdowns int `json:"unsafe_shutdowns"`
MediaErrors int `json:"media_errors"`
NumErrorLogEntries int `json:"num_err_log_entries"`
WarnTempTime int `json:"warning_temp_time"`
CriticalCompTime int `json:"critical_comp_time"`
} `json:"nvme_smart_health_information_log"`
}
func (m *SmartMonCollector) Read(interval time.Duration, output chan lp.CCMessage) {
timestamp := time.Now()
for _, d := range m.devices {
var data SmartMonData
command := exec.Command(d.queryCommand[0], d.queryCommand[1:]...)
stdout, err := command.Output()
if err != nil {
cclog.ComponentError(m.name, "cannot read data for device", d.Name)
continue
}
err = json.Unmarshal(stdout, &data)
if err != nil {
cclog.ComponentError(m.name, "cannot unmarshal data for device", d.Name)
continue
}
if !m.excludeMetric.temp {
y, err := lp.NewMetric(
"smartmon_temp", m.tags, m.meta, data.HealthLog.Temperature, timestamp)
if err == nil {
y.AddTag("stype-id", d.Name)
y.AddMeta("unit", "degC")
output <- y
}
}
if !m.excludeMetric.percentUsed {
y, err := lp.NewMetric(
"smartmon_percent_used", m.tags, m.meta, data.HealthLog.PercentageUsed, timestamp)
if err == nil {
y.AddTag("stype-id", d.Name)
y.AddMeta("unit", "percent")
output <- y
}
}
if !m.excludeMetric.availSpare {
y, err := lp.NewMetric(
"smartmon_avail_spare", m.tags, m.meta, data.HealthLog.AvailableSpare, timestamp)
if err == nil {
y.AddTag("stype-id", d.Name)
y.AddMeta("unit", "percent")
output <- y
}
}
if !m.excludeMetric.dataUnitsRead {
y, err := lp.NewMetric(
"smartmon_data_units_read", m.tags, m.meta, data.HealthLog.DataUnitsRead, timestamp)
if err == nil {
y.AddTag("stype-id", d.Name)
output <- y
}
}
if !m.excludeMetric.dataUnitsWrite {
y, err := lp.NewMetric(
"smartmon_data_units_write", m.tags, m.meta, data.HealthLog.DataUnitsWrite, timestamp)
if err == nil {
y.AddTag("stype-id", d.Name)
output <- y
}
}
if !m.excludeMetric.hostReads {
y, err := lp.NewMetric(
"smartmon_host_reads", m.tags, m.meta, data.HealthLog.HostReads, timestamp)
if err == nil {
y.AddTag("stype-id", d.Name)
output <- y
}
}
if !m.excludeMetric.hostWrites {
y, err := lp.NewMetric(
"smartmon_host_writes", m.tags, m.meta, data.HealthLog.HostWrites, timestamp)
if err == nil {
y.AddTag("stype-id", d.Name)
output <- y
}
}
if !m.excludeMetric.powerCycles {
y, err := lp.NewMetric(
"smartmon_power_cycles", m.tags, m.meta, data.HealthLog.PowerCycles, timestamp)
if err == nil {
y.AddTag("stype-id", d.Name)
output <- y
}
}
if !m.excludeMetric.powerOn {
y, err := lp.NewMetric(
"smartmon_power_on", m.tags, m.meta, int64(data.HealthLog.PowerOnHours)*3600, timestamp)
if err == nil {
y.AddTag("stype-id", d.Name)
y.AddMeta("unit", "sec")
output <- y
}
}
if !m.excludeMetric.UnsafeShutdowns {
y, err := lp.NewMetric(
"smartmon_unsafe_shutdowns", m.tags, m.meta, data.HealthLog.UnsafeShutdowns, timestamp)
if err == nil {
y.AddTag("stype-id", d.Name)
output <- y
}
}
if !m.excludeMetric.mediaErrors {
y, err := lp.NewMetric(
"smartmon_media_errors", m.tags, m.meta, data.HealthLog.MediaErrors, timestamp)
if err == nil {
y.AddTag("stype-id", d.Name)
output <- y
}
}
if !m.excludeMetric.errlogEntries {
y, err := lp.NewMetric(
"smartmon_errlog_entries", m.tags, m.meta, data.HealthLog.NumErrorLogEntries, timestamp)
if err == nil {
y.AddTag("stype-id", d.Name)
output <- y
}
}
if !m.excludeMetric.warnTempTime {
y, err := lp.NewMetric(
"smartmon_warn_temp_time", m.tags, m.meta, data.HealthLog.WarnTempTime, timestamp)
if err == nil {
y.AddTag("stype-id", d.Name)
output <- y
}
}
if !m.excludeMetric.critCompTime {
y, err := lp.NewMetric(
"smartmon_crit_comp_time", m.tags, m.meta, data.HealthLog.CriticalCompTime, timestamp)
if err == nil {
y.AddTag("stype-id", d.Name)
output <- y
}
}
}
}
func (m *SmartMonCollector) Close() {
m.init = false
}

View File

@@ -1,52 +0,0 @@
<!--
---
title: smartmon metric collector
description: Collect S.M.A.R.T data from NVMEs
categories: [cc-metric-collector]
tags: ['Admin']
weight: 2
hugo_path: docs/reference/cc-metric-collector/collectors/smartmonMetric.md
---
-->
## `smartmon` collector
```json
"smartmon": {
"use_sudo": true,
"exclude_devices": [
"/dev/sda"
],
"excludeMetrics": [
"smartmon_warn_temp_time",
"smartmon_crit_comp_time"
],
"devices": [
{
"name": "/dev/nvme0",
"type": "nvme"
}
]
}
```
The `smartmon` collector retrieves S.M.A.R.T data from NVMEs via command `smartctl`.
Available NVMEs can be either automatically detected by a device scan or manually added with the "devices" config option.
Metrics:
* `smartmon_temp`: Temperature of the device (`unit=degC`)
* `smartmon_avail_spare`: Amount of spare left (`unit=percent`)
* `smartmon_percent_used`: Percentage of the device is used (`unit=percent`)
* `smartmon_data_units_read`: Read data units
* `smartmon_data_units_write`: Written data units
* `smartmon_host_reads`: Read operations
* `smartmon_host_writes`: Write operations
* `smartmon_power_cycles`: Number of power cycles
* `smartmon_power_on`: Seconds the device is powered on (`unit=seconds`)
* `smartmon_unsafe_shutdowns`: Count of unsafe shutdowns
* `smartmon_media_errors`: Media errors of the device
* `smartmon_errlog_entries`: Error log entries
* `smartmon_warn_temp_time`: Time above the warning temperature threshold
* `smartmon_crit_comp_time`: Time above the critical composite temperature threshold

View File

@@ -8,7 +8,6 @@
package collectors
import (
"bytes"
"encoding/json"
"fmt"
"os"
@@ -17,8 +16,8 @@ import (
"strings"
"time"
cclog "github.com/ClusterCockpit/cc-lib/v2/ccLogger"
lp "github.com/ClusterCockpit/cc-lib/v2/ccMessage"
cclog "github.com/ClusterCockpit/cc-lib/ccLogger"
lp "github.com/ClusterCockpit/cc-lib/ccMessage"
)
// See: https://www.kernel.org/doc/html/latest/hwmon/sysfs-interface.html
@@ -42,7 +41,6 @@ type TempCollectorSensor struct {
type TempCollector struct {
metricCollector
config struct {
ExcludeMetrics []string `json:"exclude_metrics"`
TagOverride map[string]map[string]string `json:"tag_override"`
@@ -60,14 +58,11 @@ func (m *TempCollector) Init(config json.RawMessage) error {
m.name = "TempCollector"
m.parallel = true
if err := m.setup(); err != nil {
return fmt.Errorf("%s Init(): setup() call failed: %w", m.name, err)
}
m.setup()
if len(config) > 0 {
d := json.NewDecoder(bytes.NewReader(config))
d.DisallowUnknownFields()
if err := d.Decode(&m.config); err != nil {
return fmt.Errorf("%s Init(): Error decoding JSON config: %w", m.name, err)
err := json.Unmarshal(config, &m.config)
if err != nil {
return err
}
}
@@ -83,10 +78,10 @@ func (m *TempCollector) Init(config json.RawMessage) error {
globPattern := filepath.Join("/sys/class/hwmon", "*", "temp*_input")
inputFiles, err := filepath.Glob(globPattern)
if err != nil {
return fmt.Errorf("%s Init(): unable to glob files with pattern '%s': %w", m.name, globPattern, err)
return fmt.Errorf("unable to glob files with pattern '%s': %v", globPattern, err)
}
if inputFiles == nil {
return fmt.Errorf("%s Init(): unable to find any files with pattern '%s'", m.name, globPattern)
return fmt.Errorf("unable to find any files with pattern '%s'", globPattern)
}
// Get sensor name for each temperature sensor file
@@ -122,7 +117,7 @@ func (m *TempCollector) Init(config json.RawMessage) error {
sensor.metricName = sensor.label
}
sensor.metricName = strings.ToLower(sensor.metricName)
sensor.metricName = strings.ReplaceAll(sensor.metricName, " ", "_")
sensor.metricName = strings.Replace(sensor.metricName, " ", "_", -1)
// Add temperature prefix, if required
if !strings.Contains(sensor.metricName, "temp") {
sensor.metricName = "temp_" + sensor.metricName
@@ -175,7 +170,7 @@ func (m *TempCollector) Init(config json.RawMessage) error {
// Empty sensors map
if len(m.sensors) == 0 {
return fmt.Errorf("%s Init(): no temperature sensors found", m.name)
return fmt.Errorf("no temperature sensors found")
}
// Finished initialization
@@ -184,6 +179,7 @@ func (m *TempCollector) Init(config json.RawMessage) error {
}
func (m *TempCollector) Read(interval time.Duration, output chan lp.CCMessage) {
for _, sensor := range m.sensors {
// Read sensor file
buffer, err := os.ReadFile(sensor.file)
@@ -205,7 +201,7 @@ func (m *TempCollector) Read(interval time.Duration, output chan lp.CCMessage) {
sensor.metricName,
sensor.tags,
m.meta,
map[string]any{"value": x},
map[string]interface{}{"value": x},
time.Now(),
)
if err == nil {
@@ -218,7 +214,7 @@ func (m *TempCollector) Read(interval time.Duration, output chan lp.CCMessage) {
sensor.maxTempName,
sensor.tags,
m.meta,
map[string]any{"value": sensor.maxTemp},
map[string]interface{}{"value": sensor.maxTemp},
time.Now(),
)
if err == nil {
@@ -232,7 +228,7 @@ func (m *TempCollector) Read(interval time.Duration, output chan lp.CCMessage) {
sensor.critTempName,
sensor.tags,
m.meta,
map[string]any{"value": sensor.critTemp},
map[string]interface{}{"value": sensor.critTemp},
time.Now(),
)
if err == nil {
@@ -240,6 +236,7 @@ func (m *TempCollector) Read(interval time.Duration, output chan lp.CCMessage) {
}
}
}
}
func (m *TempCollector) Close() {

View File

@@ -14,10 +14,10 @@ hugo_path: docs/reference/cc-metric-collector/collectors/temp.md
```json
"tempstat": {
"tag_override": {
"<device like hwmon1>": {
"type": "socket",
"type-id": "0"
"tag_override" : {
"<device like hwmon1>" : {
"type" : "socket",
"type-id" : "0"
}
},
"exclude_metrics": [

View File

@@ -8,21 +8,19 @@
package collectors
import (
"bytes"
"encoding/json"
"errors"
"fmt"
"log"
"os/exec"
"strings"
"time"
cclog "github.com/ClusterCockpit/cc-lib/v2/ccLogger"
lp "github.com/ClusterCockpit/cc-lib/v2/ccMessage"
lp "github.com/ClusterCockpit/cc-lib/ccMessage"
)
const (
MAX_NUM_PROCS = 10
DEFAULT_NUM_PROCS = 2
)
const MAX_NUM_PROCS = 10
const DEFAULT_NUM_PROCS = 2
type TopProcsCollectorConfig struct {
Num_procs int `json:"num_procs"`
@@ -30,7 +28,6 @@ type TopProcsCollectorConfig struct {
type TopProcsCollector struct {
metricCollector
tags map[string]string
config TopProcsCollectorConfig
}
@@ -39,18 +36,12 @@ func (m *TopProcsCollector) Init(config json.RawMessage) error {
var err error
m.name = "TopProcsCollector"
m.parallel = true
m.tags = map[string]string{
"type": "node",
}
m.meta = map[string]string{
"source": m.name,
"group": "TopProcs",
}
m.tags = map[string]string{"type": "node"}
m.meta = map[string]string{"source": m.name, "group": "TopProcs"}
if len(config) > 0 {
d := json.NewDecoder(bytes.NewReader(config))
d.DisallowUnknownFields()
if err := d.Decode(&m.config); err != nil {
return fmt.Errorf("%s Init(): Error decoding JSON config: %w", m.name, err)
err = json.Unmarshal(config, &m.config)
if err != nil {
return err
}
} else {
m.config.Num_procs = int(DEFAULT_NUM_PROCS)
@@ -58,13 +49,12 @@ func (m *TopProcsCollector) Init(config json.RawMessage) error {
if m.config.Num_procs <= 0 || m.config.Num_procs > MAX_NUM_PROCS {
return fmt.Errorf("num_procs option must be set in 'topprocs' config (range: 1-%d)", MAX_NUM_PROCS)
}
if err := m.setup(); err != nil {
return fmt.Errorf("%s Init(): setup() call failed: %w", m.name, err)
}
m.setup()
command := exec.Command("ps", "-Ao", "comm", "--sort=-pcpu")
command.Wait()
_, err = command.Output()
if err != nil {
return fmt.Errorf("%s Init(): failed to get output from command: %w", m.name, err)
return errors.New("failed to execute command")
}
m.init = true
return nil
@@ -75,25 +65,17 @@ func (m *TopProcsCollector) Read(interval time.Duration, output chan lp.CCMessag
return
}
command := exec.Command("ps", "-Ao", "comm", "--sort=-pcpu")
command.Wait()
stdout, err := command.Output()
if err != nil {
cclog.ComponentError(
m.name,
fmt.Sprintf("Read(): Failed to read output from command \"%s\": %v", command.String(), err))
log.Print(m.name, err)
return
}
lines := strings.Split(string(stdout), "\n")
for i := 1; i < m.config.Num_procs+1; i++ {
name := fmt.Sprintf("topproc%d", i)
y, err := lp.NewMessage(
name,
m.tags,
m.meta,
map[string]any{
"value": lines[i],
},
time.Now())
y, err := lp.NewMessage(name, m.tags, m.meta, map[string]interface{}{"value": string(lines[i])}, time.Now())
if err == nil {
output <- y
}

View File

@@ -4,7 +4,7 @@ The configuration of the CC metric collector consists of five configuration file
## Global configuration
The global file contains the paths to the other four files and some global options. You can find examples in `example_configs`.
The global file contains the paths to the other four files and some global options.
```json
{

41
go.mod
View File

@@ -1,44 +1,45 @@
module github.com/ClusterCockpit/cc-metric-collector
go 1.25.0
go 1.24.0
require (
github.com/ClusterCockpit/cc-lib/v2 v2.8.2
github.com/ClusterCockpit/cc-lib v0.10.1
github.com/ClusterCockpit/go-rocm-smi v0.3.0
github.com/NVIDIA/go-nvml v0.13.0-1
github.com/PaesslerAG/gval v1.2.4
github.com/fsnotify/fsnotify v1.9.0
github.com/tklauser/go-sysconf v0.3.16
github.com/influxdata/line-protocol v0.0.0-20210922203350-b1ad95c89adf
github.com/tklauser/go-sysconf v0.3.15
golang.design/x/thread v0.0.0-20210122121316-335e9adffdf1
golang.org/x/sys v0.42.0
golang.org/x/exp v0.0.0-20250620022241-b7579e27df2b
golang.org/x/sys v0.37.0
)
require (
github.com/ClusterCockpit/cc-line-protocol/v2 v2.4.0 // indirect
github.com/apapsch/go-jsonmerge/v2 v2.0.0 // indirect
github.com/beorn7/perks v1.0.1 // indirect
github.com/cespare/xxhash/v2 v2.3.0 // indirect
github.com/expr-lang/expr v1.17.8 // indirect
github.com/expr-lang/expr v1.17.6 // indirect
github.com/google/uuid v1.6.0 // indirect
github.com/gorilla/mux v1.8.1 // indirect
github.com/influxdata/influxdb-client-go/v2 v2.14.0 // indirect
github.com/influxdata/line-protocol v0.0.0-20210922203350-b1ad95c89adf // indirect
github.com/klauspost/compress v1.18.4 // indirect
github.com/influxdata/line-protocol/v2 v2.2.1 // indirect
github.com/klauspost/compress v1.18.0 // indirect
github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822 // indirect
github.com/nats-io/nats.go v1.49.0 // indirect
github.com/nats-io/nkeys v0.4.15 // indirect
github.com/nats-io/nats.go v1.46.1 // indirect
github.com/nats-io/nkeys v0.4.11 // indirect
github.com/nats-io/nuid v1.0.1 // indirect
github.com/oapi-codegen/runtime v1.2.0 // indirect
github.com/oapi-codegen/runtime v1.1.1 // indirect
github.com/prometheus/client_golang v1.23.2 // indirect
github.com/prometheus/client_model v0.6.2 // indirect
github.com/prometheus/common v0.67.5 // indirect
github.com/prometheus/procfs v0.20.0 // indirect
github.com/prometheus/common v0.66.1 // indirect
github.com/prometheus/procfs v0.16.1 // indirect
github.com/santhosh-tekuri/jsonschema/v5 v5.3.1 // indirect
github.com/shopspring/decimal v1.4.0 // indirect
github.com/stmcginnis/gofish v0.21.3 // indirect
github.com/tklauser/numcpus v0.11.0 // indirect
go.yaml.in/yaml/v2 v2.4.3 // indirect
golang.org/x/crypto v0.48.0 // indirect
golang.org/x/net v0.51.0 // indirect
google.golang.org/protobuf v1.36.11 // indirect
github.com/shopspring/decimal v1.3.1 // indirect
github.com/stmcginnis/gofish v0.20.0 // indirect
github.com/tklauser/numcpus v0.10.0 // indirect
go.yaml.in/yaml/v2 v2.4.2 // indirect
golang.org/x/crypto v0.42.0 // indirect
golang.org/x/net v0.43.0 // indirect
google.golang.org/protobuf v1.36.8 // indirect
)

110
go.sum
View File

@@ -1,9 +1,5 @@
github.com/ClusterCockpit/cc-lib/v2 v2.8.0 h1:ROduRzRuusi+6kLB991AAu3Pp2AHOasQJFJc7JU/n/E=
github.com/ClusterCockpit/cc-lib/v2 v2.8.0/go.mod h1:FwD8vnTIbBM3ngeLNKmCvp9FoSjQZm7xnuaVxEKR23o=
github.com/ClusterCockpit/cc-lib/v2 v2.8.2 h1:rCLZk8wz8yq8xBnBEdVKigvA2ngR8dPmHbEFwxxb3jw=
github.com/ClusterCockpit/cc-lib/v2 v2.8.2/go.mod h1:FwD8vnTIbBM3ngeLNKmCvp9FoSjQZm7xnuaVxEKR23o=
github.com/ClusterCockpit/cc-line-protocol/v2 v2.4.0 h1:hIzxgTBWcmCIHtoDKDkSCsKCOCOwUC34sFsbD2wcW0Q=
github.com/ClusterCockpit/cc-line-protocol/v2 v2.4.0/go.mod h1:y42qUu+YFmu5fdNuUAS4VbbIKxVjxCvbVqFdpdh8ahY=
github.com/ClusterCockpit/cc-lib v0.10.1 h1:tjGEH8mFGgznYxO8BKLiiar0eZR1Oytk8x5iIQHZR5s=
github.com/ClusterCockpit/cc-lib v0.10.1/go.mod h1:nvTZuxFCTwlos8I1rL5O1RPab7vRtkU8E/PGiaF6pQA=
github.com/ClusterCockpit/go-rocm-smi v0.3.0 h1:1qZnSpG7/NyLtc7AjqnUL9Jb8xtqG1nMVgp69rJfaR8=
github.com/ClusterCockpit/go-rocm-smi v0.3.0/go.mod h1:+I3UMeX3OlizXDf1WpGD43W4KGZZGVSGmny6rTeOnWA=
github.com/NVIDIA/go-nvml v0.11.6-0/go.mod h1:hy7HYeQy335x6nEss0Ne3PYqleRa6Ct+VKD9RQ4nyFs=
@@ -14,8 +10,8 @@ github.com/PaesslerAG/gval v1.2.4/go.mod h1:XRFLwvmkTEdYziLdaCeCa5ImcGVrfQbeNUbV
github.com/PaesslerAG/jsonpath v0.1.0 h1:gADYeifvlqK3R3i2cR5B4DGgxLXIPb3TRTH1mGi0jPI=
github.com/PaesslerAG/jsonpath v0.1.0/go.mod h1:4BzmtoM/PI8fPO4aQGIusjGxGir2BzcV0grWtFzq1Y8=
github.com/RaveNoX/go-jsoncommentstrip v1.0.0/go.mod h1:78ihd09MekBnJnxpICcwzCMzGrKSKYe4AqU6PDYYpjk=
github.com/antithesishq/antithesis-sdk-go v0.5.0-default-no-op h1:Ucf+QxEKMbPogRO5guBNe5cgd9uZgfoJLOYs8WWhtjM=
github.com/antithesishq/antithesis-sdk-go v0.5.0-default-no-op/go.mod h1:IUpT2DPAKh6i/YhSbt6Gl3v2yvUZjmKncl7U91fup7E=
github.com/antithesishq/antithesis-sdk-go v0.4.3-default-no-op h1:+OSa/t11TFhqfrX0EOSqQBDJ0YlpmK0rDSiB19dg9M0=
github.com/antithesishq/antithesis-sdk-go v0.4.3-default-no-op/go.mod h1:IUpT2DPAKh6i/YhSbt6Gl3v2yvUZjmKncl7U91fup7E=
github.com/apapsch/go-jsonmerge/v2 v2.0.0 h1:axGnT1gRIfimI7gJifB699GoE/oq+F2MU7Dml6nw9rQ=
github.com/apapsch/go-jsonmerge/v2 v2.0.0/go.mod h1:lvDnEdqiQrp0O42VQGgmlKpxL1AP2+08jFMw88y4klk=
github.com/beorn7/perks v1.0.1 h1:VlbKKnNfV8bJzeqoa4cOKqO6bYr3WgKZxO8Z16+hsOM=
@@ -23,19 +19,24 @@ github.com/beorn7/perks v1.0.1/go.mod h1:G2ZrVWU2WbWT9wwq4/hrbKbnv/1ERSJQ0ibhJ6r
github.com/bmatcuk/doublestar v1.1.1/go.mod h1:UD6OnuiIn0yFxxA2le/rnRU1G4RaI4UvFv1sNto9p6w=
github.com/cespare/xxhash/v2 v2.3.0 h1:UL815xU9SqsFlibzuggzjXhog7bL6oX9BbNZnL2UFvs=
github.com/cespare/xxhash/v2 v2.3.0/go.mod h1:VGX0DQ3Q6kWi7AoAeZDth3/j3BFtOZR5XLFGgcrjCOs=
github.com/creack/pty v1.1.9/go.mod h1:oKZEueFk5CKHvIhNR5MUki03XCEU+Q6VDXinZuGJ33E=
github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c=
github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
github.com/expr-lang/expr v1.17.8 h1:W1loDTT+0PQf5YteHSTpju2qfUfNoBt4yw9+wOEU9VM=
github.com/expr-lang/expr v1.17.8/go.mod h1:8/vRC7+7HBzESEqt5kKpYXxrxkr31SaO8r40VO/1IT4=
github.com/expr-lang/expr v1.17.6 h1:1h6i8ONk9cexhDmowO/A64VPxHScu7qfSl2k8OlINec=
github.com/expr-lang/expr v1.17.6/go.mod h1:8/vRC7+7HBzESEqt5kKpYXxrxkr31SaO8r40VO/1IT4=
github.com/frankban/quicktest v1.11.0/go.mod h1:K+q6oSqb0W0Ininfk863uOk1lMy69l/P6txr3mVT54s=
github.com/frankban/quicktest v1.11.2/go.mod h1:K+q6oSqb0W0Ininfk863uOk1lMy69l/P6txr3mVT54s=
github.com/frankban/quicktest v1.13.0 h1:yNZif1OkDfNoDfb9zZa9aXIpejNR4F23Wely0c+Qdqk=
github.com/frankban/quicktest v1.13.0/go.mod h1:qLE0fzW0VuyUAJgPU19zByoIr0HtCHN/r/VLSOOIySU=
github.com/fsnotify/fsnotify v1.9.0 h1:2Ml+OJNzbYCTzsxtv8vKSFD9PbJjmhYF14k/jKC7S9k=
github.com/fsnotify/fsnotify v1.9.0/go.mod h1:8jBTzvmWwFyi3Pb8djgCCO5IBqzKJ/Jwo8TRcHyHii0=
github.com/google/go-cmp v0.5.2/go.mod h1:v8dTdLbMG2kIc/vJvl+f65V22dbkXbowE6jgT/gNBxE=
github.com/google/go-cmp v0.5.5/go.mod h1:v8dTdLbMG2kIc/vJvl+f65V22dbkXbowE6jgT/gNBxE=
github.com/google/go-cmp v0.7.0 h1:wk8382ETsv4JYUZwIsn6YpYiWiBsYLSJiTsyBybVuN8=
github.com/google/go-cmp v0.7.0/go.mod h1:pXiqmnSA92OHEEa9HXL2W4E7lf9JzCmGVUdgjX3N/iU=
github.com/google/go-tpm v0.9.7 h1:u89J4tUUeDTlH8xxC3CTW7OHZjbjKoHdQ9W7gCUhtxA=
github.com/google/go-tpm v0.9.7/go.mod h1:h9jEsEECg7gtLis0upRBQU+GhYVH6jMjrFxI8u6bVUY=
github.com/google/go-tpm v0.9.5 h1:ocUmnDebX54dnW+MQWGQRbdaAcJELsa6PqZhJ48KwVU=
github.com/google/go-tpm v0.9.5/go.mod h1:h9jEsEECg7gtLis0upRBQU+GhYVH6jMjrFxI8u6bVUY=
github.com/google/uuid v1.6.0 h1:NIvaJDMOsjHA8n1jAhLSgzrAzy1Hgr+hNrb57e+94F0=
github.com/google/uuid v1.6.0/go.mod h1:TIyPZe4MgqvfeYDBFedMoGGpEw/LqOeaOT+nhxU+yHo=
github.com/gorilla/mux v1.8.1 h1:TuBL49tXwgrFYWhqrNgrUNEY92u81SPhu7sTdzQEiWY=
@@ -44,80 +45,93 @@ github.com/influxdata/influxdb-client-go/v2 v2.14.0 h1:AjbBfJuq+QoaXNcrova8smSjw
github.com/influxdata/influxdb-client-go/v2 v2.14.0/go.mod h1:Ahpm3QXKMJslpXl3IftVLVezreAUtBOTZssDrjZEFHI=
github.com/influxdata/line-protocol v0.0.0-20210922203350-b1ad95c89adf h1:7JTmneyiNEwVBOHSjoMxiWAqB992atOeepeFYegn5RU=
github.com/influxdata/line-protocol v0.0.0-20210922203350-b1ad95c89adf/go.mod h1:xaLFMmpvUxqXtVkUJfg9QmT88cDaCJ3ZKgdZ78oO8Qo=
github.com/influxdata/line-protocol-corpus v0.0.0-20210519164801-ca6fa5da0184/go.mod h1:03nmhxzZ7Xk2pdG+lmMd7mHDfeVOYFyhOgwO61qWU98=
github.com/influxdata/line-protocol-corpus v0.0.0-20210922080147-aa28ccfb8937 h1:MHJNQ+p99hFATQm6ORoLmpUCF7ovjwEFshs/NHzAbig=
github.com/influxdata/line-protocol-corpus v0.0.0-20210922080147-aa28ccfb8937/go.mod h1:BKR9c0uHSmRgM/se9JhFHtTT7JTO67X23MtKMHtZcpo=
github.com/influxdata/line-protocol/v2 v2.0.0-20210312151457-c52fdecb625a/go.mod h1:6+9Xt5Sq1rWx+glMgxhcg2c0DUaehK+5TDcPZ76GypY=
github.com/influxdata/line-protocol/v2 v2.1.0/go.mod h1:QKw43hdUBg3GTk2iC3iyCxksNj7PX9aUSeYOYE/ceHY=
github.com/influxdata/line-protocol/v2 v2.2.1 h1:EAPkqJ9Km4uAxtMRgUubJyqAr6zgWM0dznKMLRauQRE=
github.com/influxdata/line-protocol/v2 v2.2.1/go.mod h1:DmB3Cnh+3oxmG6LOBIxce4oaL4CPj3OmMPgvauXh+tM=
github.com/juju/gnuflag v0.0.0-20171113085948-2ce1bb71843d/go.mod h1:2PavIy+JPciBPrBUjwbNvtwB6RQlve+hkpll6QSNmOE=
github.com/klauspost/compress v1.18.4 h1:RPhnKRAQ4Fh8zU2FY/6ZFDwTVTxgJ/EMydqSTzE9a2c=
github.com/klauspost/compress v1.18.4/go.mod h1:R0h/fSBs8DE4ENlcrlib3PsXS61voFxhIs2DeRhCvJ4=
github.com/klauspost/compress v1.18.0 h1:c/Cqfb0r+Yi+JtIEq73FWXVkRonBlf0CRNYc8Zttxdo=
github.com/klauspost/compress v1.18.0/go.mod h1:2Pp+KzxcywXVXMr50+X0Q/Lsb43OQHYWRCY2AiWywWQ=
github.com/kr/pretty v0.2.1/go.mod h1:ipq/a2n7PKx3OHsz4KJII5eveXtPO4qwEXGdVfWzfnI=
github.com/kr/pretty v0.3.1 h1:flRD4NNwYAUpkphVc1HcthR4KEIFJ65n8Mw5qdRn3LE=
github.com/kr/pretty v0.3.1/go.mod h1:hoEshYVHaxMs3cyo3Yncou5ZscifuDolrwPKZanG3xk=
github.com/kr/pty v1.1.1/go.mod h1:pFQYn66WHrOpPYNljwOMqo10TkYh1fy3cYio2l3bCsQ=
github.com/kr/text v0.1.0/go.mod h1:4Jbv+DJW3UT/LiOwJeYQe1efqtUx/iVham/4vfdArNI=
github.com/kr/text v0.2.0 h1:5Nx0Ya0ZqY2ygV366QzturHI13Jq95ApcVaJBhpS+AY=
github.com/kr/text v0.2.0/go.mod h1:eLer722TekiGuMkidMxC/pM04lWEeraHUUmBw8l2grE=
github.com/kylelemons/godebug v1.1.0 h1:RPNrshWIDI6G2gRW9EHilWtl7Z6Sb1BR0xunSBf0SNc=
github.com/kylelemons/godebug v1.1.0/go.mod h1:9/0rRGxNHcop5bhtWyNeEfOS8JIWk580+fNqagV/RAw=
github.com/minio/highwayhash v1.0.4-0.20251030100505-070ab1a87a76 h1:KGuD/pM2JpL9FAYvBrnBBeENKZNh6eNtjqytV6TYjnk=
github.com/minio/highwayhash v1.0.4-0.20251030100505-070ab1a87a76/go.mod h1:GGYsuwP/fPD6Y9hMiXuapVvlIUEhFhMTh0rxU3ik1LQ=
github.com/minio/highwayhash v1.0.3 h1:kbnuUMoHYyVl7szWjSxJnxw11k2U709jqFPPmIUyD6Q=
github.com/minio/highwayhash v1.0.3/go.mod h1:GGYsuwP/fPD6Y9hMiXuapVvlIUEhFhMTh0rxU3ik1LQ=
github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822 h1:C3w9PqII01/Oq1c1nUAm88MOHcQC9l5mIlSMApZMrHA=
github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822/go.mod h1:+n7T8mK8HuQTcFwEeznm/DIxMOiR9yIdICNftLE1DvQ=
github.com/nats-io/jwt/v2 v2.8.0 h1:K7uzyz50+yGZDO5o772eRE7atlcSEENpL7P+b74JV1g=
github.com/nats-io/jwt/v2 v2.8.0/go.mod h1:me11pOkwObtcBNR8AiMrUbtVOUGkqYjMQZ6jnSdVUIA=
github.com/nats-io/nats-server/v2 v2.12.3 h1:KRv+1n7lddMVgkJPQer+pt36TcO0ENxjilBmeWdjcHs=
github.com/nats-io/nats-server/v2 v2.12.3/go.mod h1:MQXjG9WjyXKz9koWzUc3jYUMKD8x3CLmTNy91IQQz3Y=
github.com/nats-io/nats.go v1.49.0 h1:yh/WvY59gXqYpgl33ZI+XoVPKyut/IcEaqtsiuTJpoE=
github.com/nats-io/nats.go v1.49.0/go.mod h1:fDCn3mN5cY8HooHwE2ukiLb4p4G4ImmzvXyJt+tGwdw=
github.com/nats-io/nkeys v0.4.15 h1:JACV5jRVO9V856KOapQ7x+EY8Jo3qw1vJt/9Jpwzkk4=
github.com/nats-io/nkeys v0.4.15/go.mod h1:CpMchTXC9fxA5zrMo4KpySxNjiDVvr8ANOSZdiNfUrs=
github.com/nats-io/nats-server/v2 v2.12.0 h1:OIwe8jZUqJFrh+hhiyKu8snNib66qsx806OslqJuo74=
github.com/nats-io/nats-server/v2 v2.12.0/go.mod h1:nr8dhzqkP5E/lDwmn+A2CvQPMd1yDKXQI7iGg3lAvww=
github.com/nats-io/nats.go v1.46.1 h1:bqQ2ZcxVd2lpYI97xYASeRTY3I5boe/IVmuUDPitHfo=
github.com/nats-io/nats.go v1.46.1/go.mod h1:iRWIPokVIFbVijxuMQq4y9ttaBTMe0SFdlZfMDd+33g=
github.com/nats-io/nkeys v0.4.11 h1:q44qGV008kYd9W1b1nEBkNzvnWxtRSQ7A8BoqRrcfa0=
github.com/nats-io/nkeys v0.4.11/go.mod h1:szDimtgmfOi9n25JpfIdGw12tZFYXqhGxjhVxsatHVE=
github.com/nats-io/nuid v1.0.1 h1:5iA8DT8V7q8WK2EScv2padNa/rTESc1KdnPw4TC2paw=
github.com/nats-io/nuid v1.0.1/go.mod h1:19wcPz3Ph3q0Jbyiqsd0kePYG7A95tJPxeL+1OSON2c=
github.com/oapi-codegen/runtime v1.2.0 h1:RvKc1CVS1QeKSNzO97FBQbSMZyQ8s6rZd+LpmzwHMP4=
github.com/oapi-codegen/runtime v1.2.0/go.mod h1:Y7ZhmmlE8ikZOmuHRRndiIm7nf3xcVv+YMweKgG1DT0=
github.com/niemeyer/pretty v0.0.0-20200227124842-a10e7caefd8e/go.mod h1:zD1mROLANZcx1PVRCS0qkT7pwLkGfwJo4zjcN/Tysno=
github.com/oapi-codegen/runtime v1.1.1 h1:EXLHh0DXIJnWhdRPN2w4MXAzFyE4CskzhNLUmtpMYro=
github.com/oapi-codegen/runtime v1.1.1/go.mod h1:SK9X900oXmPWilYR5/WKPzt3Kqxn/uS/+lbpREv+eCg=
github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM=
github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4=
github.com/prometheus/client_golang v1.23.2 h1:Je96obch5RDVy3FDMndoUsjAhG5Edi49h0RJWRi/o0o=
github.com/prometheus/client_golang v1.23.2/go.mod h1:Tb1a6LWHB3/SPIzCoaDXI4I8UHKeFTEQ1YCr+0Gyqmg=
github.com/prometheus/client_model v0.6.2 h1:oBsgwpGs7iVziMvrGhE53c/GrLUsZdHnqNwqPLxwZyk=
github.com/prometheus/client_model v0.6.2/go.mod h1:y3m2F6Gdpfy6Ut/GBsUqTWZqCUvMVzSfMLjcu6wAwpE=
github.com/prometheus/common v0.67.5 h1:pIgK94WWlQt1WLwAC5j2ynLaBRDiinoAb86HZHTUGI4=
github.com/prometheus/common v0.67.5/go.mod h1:SjE/0MzDEEAyrdr5Gqc6G+sXI67maCxzaT3A2+HqjUw=
github.com/prometheus/procfs v0.20.0 h1:AA7aCvjxwAquZAlonN7888f2u4IN8WVeFgBi4k82M4Q=
github.com/prometheus/procfs v0.20.0/go.mod h1:o9EMBZGRyvDrSPH1RqdxhojkuXstoe4UlK79eF5TGGo=
github.com/prometheus/common v0.66.1 h1:h5E0h5/Y8niHc5DlaLlWLArTQI7tMrsfQjHV+d9ZoGs=
github.com/prometheus/common v0.66.1/go.mod h1:gcaUsgf3KfRSwHY4dIMXLPV0K/Wg1oZ8+SbZk/HH/dA=
github.com/prometheus/procfs v0.16.1 h1:hZ15bTNuirocR6u0JZ6BAHHmwS1p8B4P6MRqxtzMyRg=
github.com/prometheus/procfs v0.16.1/go.mod h1:teAbpZRB1iIAJYREa1LsoWUXykVXA1KlTmWl8x/U+Is=
github.com/rogpeppe/go-internal v1.10.0 h1:TMyTOH3F/DB16zRVcYyreMH6GnZZrwQVAoYjRBZyWFQ=
github.com/rogpeppe/go-internal v1.10.0/go.mod h1:UQnix2H7Ngw/k4C5ijL5+65zddjncjaFoBhdsK/akog=
github.com/santhosh-tekuri/jsonschema/v5 v5.3.1 h1:lZUw3E0/J3roVtGQ+SCrUrg3ON6NgVqpn3+iol9aGu4=
github.com/santhosh-tekuri/jsonschema/v5 v5.3.1/go.mod h1:uToXkOrWAZ6/Oc07xWQrPOhJotwFIyu2bBVN41fcDUY=
github.com/shopspring/decimal v1.3.1 h1:2Usl1nmF/WZucqkFZhnfFYxxxu8LG21F6nPQBE5gKV8=
github.com/shopspring/decimal v1.3.1/go.mod h1:DKyhrW/HYNuLGql+MJL6WCR6knT2jwCFRcu2hWCYk4o=
github.com/shopspring/decimal v1.4.0 h1:bxl37RwXBklmTi0C79JfXCEBD1cqqHt0bbgBAGFp81k=
github.com/shopspring/decimal v1.4.0/go.mod h1:gawqmDU56v4yIKSwfBSFip1HdCCXN8/+DMd9qYNcwME=
github.com/spkg/bom v0.0.0-20160624110644-59b7046e48ad/go.mod h1:qLr4V1qq6nMqFKkMo8ZTx3f+BZEkzsRUY10Xsm2mwU0=
github.com/stmcginnis/gofish v0.21.3 h1:EBLCHfORnbx7MPw7lplOOVe9QAD1T3XRVz6+a1Z4z5Q=
github.com/stmcginnis/gofish v0.21.3/go.mod h1:PzF5i8ecRG9A2ol8XT64npKUunyraJ+7t0kYMpQAtqU=
github.com/stmcginnis/gofish v0.20.0 h1:hH2V2Qe898F2wWT1loApnkDUrXXiLKqbSlMaH3Y1n08=
github.com/stmcginnis/gofish v0.20.0/go.mod h1:PzF5i8ecRG9A2ol8XT64npKUunyraJ+7t0kYMpQAtqU=
github.com/stretchr/objx v0.1.0/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME=
github.com/stretchr/testify v1.3.0/go.mod h1:M5WIy9Dh21IEIfnGCwXGc5bZfKNJtfHm1UVUgZn+9EI=
github.com/stretchr/testify v1.11.1 h1:7s2iGBzp5EwR7/aIZr8ao5+dra3wiQyKjjFuvgVKu7U=
github.com/stretchr/testify v1.11.1/go.mod h1:wZwfW3scLgRK+23gO65QZefKpKQRnfz6sD981Nm4B6U=
github.com/tklauser/go-sysconf v0.3.16 h1:frioLaCQSsF5Cy1jgRBrzr6t502KIIwQ0MArYICU0nA=
github.com/tklauser/go-sysconf v0.3.16/go.mod h1:/qNL9xxDhc7tx3HSRsLWNnuzbVfh3e7gh/BmM179nYI=
github.com/tklauser/numcpus v0.11.0 h1:nSTwhKH5e1dMNsCdVBukSZrURJRoHbSEQjdEbY+9RXw=
github.com/tklauser/numcpus v0.11.0/go.mod h1:z+LwcLq54uWZTX0u/bGobaV34u6V7KNlTZejzM6/3MQ=
github.com/tklauser/go-sysconf v0.3.15 h1:VE89k0criAymJ/Os65CSn1IXaol+1wrsFHEB8Ol49K4=
github.com/tklauser/go-sysconf v0.3.15/go.mod h1:Dmjwr6tYFIseJw7a3dRLJfsHAMXZ3nEnL/aZY+0IuI4=
github.com/tklauser/numcpus v0.10.0 h1:18njr6LDBk1zuna922MgdjQuJFjrdppsZG60sHGfjso=
github.com/tklauser/numcpus v0.10.0/go.mod h1:BiTKazU708GQTYF4mB+cmlpT2Is1gLk7XVuEeem8LsQ=
go.uber.org/goleak v1.3.0 h1:2K3zAYmnTNqV73imy9J1T3WC+gmCePx2hEGkimedGto=
go.uber.org/goleak v1.3.0/go.mod h1:CoHD4mav9JJNrW/WLlf7HGZPjdw8EucARQHekz1X6bE=
go.yaml.in/yaml/v2 v2.4.3 h1:6gvOSjQoTB3vt1l+CU+tSyi/HOjfOjRLJ4YwYZGwRO0=
go.yaml.in/yaml/v2 v2.4.3/go.mod h1:zSxWcmIDjOzPXpjlTTbAsKokqkDNAVtZO0WOMiT90s8=
go.yaml.in/yaml/v2 v2.4.2 h1:DzmwEr2rDGHl7lsFgAHxmNz/1NlQ7xLIrlN2h5d1eGI=
go.yaml.in/yaml/v2 v2.4.2/go.mod h1:081UH+NErpNdqlCXm3TtEran0rJZGxAYx9hb/ELlsPU=
golang.design/x/thread v0.0.0-20210122121316-335e9adffdf1 h1:P7S/GeHBAFEZIYp0ePPs2kHXoazz8q2KsyxHyQVGCJg=
golang.design/x/thread v0.0.0-20210122121316-335e9adffdf1/go.mod h1:9CWpnTUmlQkfdpdutA1nNf4iE5lAVt3QZOu0Z6hahBE=
golang.org/x/crypto v0.48.0 h1:/VRzVqiRSggnhY7gNRxPauEQ5Drw9haKdM0jqfcCFts=
golang.org/x/crypto v0.48.0/go.mod h1:r0kV5h3qnFPlQnBSrULhlsRfryS2pmewsg+XfMgkVos=
golang.org/x/net v0.51.0 h1:94R/GTO7mt3/4wIKpcR5gkGmRLOuE/2hNGeWq/GBIFo=
golang.org/x/net v0.51.0/go.mod h1:aamm+2QF5ogm02fjy5Bb7CQ0WMt1/WVM7FtyaTLlA9Y=
golang.org/x/crypto v0.42.0 h1:chiH31gIWm57EkTXpwnqf8qeuMUi0yekh6mT2AvFlqI=
golang.org/x/crypto v0.42.0/go.mod h1:4+rDnOTJhQCx2q7/j6rAN5XDw8kPjeaXEUR2eL94ix8=
golang.org/x/exp v0.0.0-20250620022241-b7579e27df2b h1:M2rDM6z3Fhozi9O7NWsxAkg/yqS/lQJ6PmkyIV3YP+o=
golang.org/x/exp v0.0.0-20250620022241-b7579e27df2b/go.mod h1:3//PLf8L/X+8b4vuAfHzxeRUl04Adcb341+IGKfnqS8=
golang.org/x/net v0.43.0 h1:lat02VYK2j4aLzMzecihNvTlJNQUq316m2Mr9rnM6YE=
golang.org/x/net v0.43.0/go.mod h1:vhO1fvI4dGsIjh73sWfUVjj3N7CA9WkKJNQm2svM6Jg=
golang.org/x/sys v0.0.0-20210122093101-04d7465088b8/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
golang.org/x/sys v0.42.0 h1:omrd2nAlyT5ESRdCLYdm3+fMfNFE/+Rf4bDIQImRJeo=
golang.org/x/sys v0.42.0/go.mod h1:4GL1E5IUh+htKOUEOaiffhrAeqysfVGipDYzABqnCmw=
golang.org/x/time v0.14.0 h1:MRx4UaLrDotUKUdCIqzPC48t1Y9hANFKIRpNx+Te8PI=
golang.org/x/time v0.14.0/go.mod h1:eL/Oa2bBBK0TkX57Fyni+NgnyQQN4LitPmob2Hjnqw4=
google.golang.org/protobuf v1.36.11 h1:fV6ZwhNocDyBLK0dj+fg8ektcVegBBuEolpbTQyBNVE=
google.golang.org/protobuf v1.36.11/go.mod h1:HTf+CrKn2C3g5S8VImy6tdcUvCska2kB7j23XfzDpco=
golang.org/x/sys v0.37.0 h1:fdNQudmxPjkdUTPnLn5mdQv7Zwvbvpaxqs831goi9kQ=
golang.org/x/sys v0.37.0/go.mod h1:OgkHotnGiDImocRcuBABYBEXf8A9a87e/uXjp9XT3ks=
golang.org/x/time v0.13.0 h1:eUlYslOIt32DgYD6utsuUeHs4d7AsEYLuIAdg7FlYgI=
golang.org/x/time v0.13.0/go.mod h1:eL/Oa2bBBK0TkX57Fyni+NgnyQQN4LitPmob2Hjnqw4=
golang.org/x/xerrors v0.0.0-20191204190536-9bdfabe68543/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0=
google.golang.org/protobuf v1.36.8 h1:xHScyCOEuuwZEc6UtSOvPbAT4zRh0xcNRYekJwfqyMc=
google.golang.org/protobuf v1.36.8/go.mod h1:fuxRtAxBytpl4zzqUh6/eyUujkJdNiuEkXntxiD/uRU=
gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0=
gopkg.in/check.v1 v1.0.0-20200227125254-8fa46927fb4f/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0=
gopkg.in/check.v1 v1.0.0-20201130134442-10cb98267c6c h1:Hei/4ADfdWqJk1ZMxUNpqntNwaWcugrBjAiHlqqRiVk=
gopkg.in/check.v1 v1.0.0-20201130134442-10cb98267c6c/go.mod h1:JHkPIbrfpd72SG/EVd6muEfDQjcINNoR0C8j2r3qZ4Q=
gopkg.in/yaml.v3 v3.0.0-20200615113413-eeeca48fe776/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM=
gopkg.in/yaml.v3 v3.0.1 h1:fxVm/GzAzEWqLHuvctI91KS9hhNmmWOoWu0XTYJS7CA=
gopkg.in/yaml.v3 v3.0.1/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM=

View File

@@ -10,17 +10,15 @@ package metricAggregator
import (
"context"
"fmt"
"maps"
"math"
"os"
"slices"
"strings"
"sync"
"time"
cclog "github.com/ClusterCockpit/cc-lib/v2/ccLogger"
cclog "github.com/ClusterCockpit/cc-lib/ccLogger"
lp "github.com/ClusterCockpit/cc-lib/v2/ccMessage"
lp "github.com/ClusterCockpit/cc-lib/ccMessage"
topo "github.com/ClusterCockpit/cc-metric-collector/pkg/ccTopology"
"github.com/PaesslerAG/gval"
@@ -38,7 +36,7 @@ type MetricAggregatorIntervalConfig struct {
type metricAggregator struct {
functions []*MetricAggregatorIntervalConfig
constants map[string]any
constants map[string]interface{}
language gval.Language
output chan lp.CCMessage
}
@@ -72,12 +70,10 @@ var metricCacheLanguage = gval.NewLanguage(
gval.Function("getCpuList", getCpuListOfNode),
gval.Function("getCpuListOfType", getCpuListOfType),
)
var language gval.Language = gval.NewLanguage(
gval.Full(),
metricCacheLanguage,
)
var evaluables = struct {
mapping map[string]gval.Evaluable
mutex sync.Mutex
@@ -88,13 +84,14 @@ var evaluables = struct {
func (c *metricAggregator) Init(output chan lp.CCMessage) error {
c.output = output
c.functions = make([]*MetricAggregatorIntervalConfig, 0)
c.constants = make(map[string]any)
c.constants = make(map[string]interface{})
// add constants like hostname, numSockets, ... to constants list
// Set hostname
hostname, err := os.Hostname()
if err != nil {
return fmt.Errorf("metricAggregator: failed to get hostname: %w", err)
cclog.Error(err.Error())
return err
}
// Drop domain part of host name
c.constants["hostname"] = strings.SplitN(hostname, `.`, 2)[0]
@@ -123,8 +120,10 @@ func (c *metricAggregator) Init(output chan lp.CCMessage) error {
}
func (c *metricAggregator) Eval(starttime time.Time, endtime time.Time, metrics []lp.CCMessage) {
vars := make(map[string]any)
maps.Copy(vars, c.constants)
vars := make(map[string]interface{})
for k, v := range c.constants {
vars[k] = v
}
vars["starttime"] = starttime
vars["endtime"] = endtime
for _, f := range c.functions {
@@ -138,6 +137,7 @@ func (c *metricAggregator) Eval(starttime time.Time, endtime time.Time, metrics
matches := make([]lp.CCMessage, 0)
for _, m := range metrics {
vars["metric"] = m
//value, err := gval.Evaluate(f.Condition, vars, c.language)
value, err := f.gvalCond.EvalBool(context.Background(), vars)
if err != nil {
cclog.ComponentError("MetricCache", "COLLECT", f.Name, "COND", f.Condition, ":", err.Error())
@@ -171,22 +171,22 @@ func (c *metricAggregator) Eval(starttime time.Time, endtime time.Time, metrics
// Check, that only values of one type were collected
countValueTypes := 0
if len(valuesFloat64) > 0 {
countValueTypes++
countValueTypes += 1
}
if len(valuesFloat32) > 0 {
countValueTypes++
countValueTypes += 1
}
if len(valuesInt) > 0 {
countValueTypes++
countValueTypes += 1
}
if len(valuesInt32) > 0 {
countValueTypes++
countValueTypes += 1
}
if len(valuesInt64) > 0 {
countValueTypes++
countValueTypes += 1
}
if len(valuesBool) > 0 {
countValueTypes++
countValueTypes += 1
}
if countValueTypes > 1 {
cclog.ComponentError("MetricCache", "Collected values of different types")
@@ -263,15 +263,15 @@ func (c *metricAggregator) Eval(starttime time.Time, endtime time.Time, metrics
var m lp.CCMessage
switch t := value.(type) {
case float64:
m, err = lp.NewMessage(f.Name, tags, meta, map[string]any{"value": t}, starttime)
m, err = lp.NewMessage(f.Name, tags, meta, map[string]interface{}{"value": t}, starttime)
case float32:
m, err = lp.NewMessage(f.Name, tags, meta, map[string]any{"value": t}, starttime)
m, err = lp.NewMessage(f.Name, tags, meta, map[string]interface{}{"value": t}, starttime)
case int:
m, err = lp.NewMessage(f.Name, tags, meta, map[string]any{"value": t}, starttime)
m, err = lp.NewMessage(f.Name, tags, meta, map[string]interface{}{"value": t}, starttime)
case int64:
m, err = lp.NewMessage(f.Name, tags, meta, map[string]any{"value": t}, starttime)
m, err = lp.NewMessage(f.Name, tags, meta, map[string]interface{}{"value": t}, starttime)
case string:
m, err = lp.NewMessage(f.Name, tags, meta, map[string]any{"value": t}, starttime)
m, err = lp.NewMessage(f.Name, tags, meta, map[string]interface{}{"value": t}, starttime)
default:
cclog.ComponentError("MetricCache", "Gval returned invalid type", t, "skipping metric", f.Name)
}
@@ -329,21 +329,18 @@ func (c *metricAggregator) AddAggregation(name, function, condition string, tags
}
func (c *metricAggregator) DeleteAggregation(name string) error {
i := slices.IndexFunc(
c.functions,
func(agg *MetricAggregatorIntervalConfig) bool {
return agg.Name == name
})
if i == -1 {
return fmt.Errorf("no aggregation for metric name %s", name)
}
for i, agg := range c.functions {
if agg.Name == name {
copy(c.functions[i:], c.functions[i+1:])
c.functions[len(c.functions)-1] = nil
c.functions = c.functions[:len(c.functions)-1]
return nil
}
}
return fmt.Errorf("no aggregation for metric name %s", name)
}
func (c *metricAggregator) AddConstant(name string, value any) {
func (c *metricAggregator) AddConstant(name string, value interface{}) {
c.constants[name] = value
}
@@ -351,16 +348,17 @@ func (c *metricAggregator) DelConstant(name string) {
delete(c.constants, name)
}
func (c *metricAggregator) AddFunction(name string, function func(args ...any) (any, error)) {
func (c *metricAggregator) AddFunction(name string, function func(args ...interface{}) (interface{}, error)) {
c.language = gval.NewLanguage(c.language, gval.Function(name, function))
}
func EvalBoolCondition(condition string, params map[string]any) (bool, error) {
func EvalBoolCondition(condition string, params map[string]interface{}) (bool, error) {
evaluables.mutex.Lock()
evaluable, ok := evaluables.mapping[condition]
evaluables.mutex.Unlock()
if !ok {
newcond := strings.ReplaceAll(
newcond :=
strings.ReplaceAll(
strings.ReplaceAll(
condition, "'", "\""), "%", "\\")
var err error
@@ -381,7 +379,8 @@ func EvalFloat64Condition(condition string, params map[string]float64) (float64,
evaluable, ok := evaluables.mapping[condition]
evaluables.mutex.Unlock()
if !ok {
newcond := strings.ReplaceAll(
newcond :=
strings.ReplaceAll(
strings.ReplaceAll(
condition, "'", "\""), "%", "\\")
var err error

View File

@@ -11,10 +11,10 @@ import (
"errors"
"fmt"
"regexp"
"slices"
"strconv"
"strings"
"golang.org/x/exp/slices"
topo "github.com/ClusterCockpit/cc-metric-collector/pkg/ccTopology"
)
@@ -34,7 +34,8 @@ func sumAnyType[T float64 | float32 | int | int32 | int64](values []T) (T, error
}
// Sum up values
func sumfunc(args any) (any, error) {
func sumfunc(args interface{}) (interface{}, error) {
var err error
switch values := args.(type) {
case []float64:
@@ -62,7 +63,7 @@ func minAnyType[T float64 | float32 | int | int32 | int64](values []T) (T, error
}
// Get the minimum value
func minfunc(args any) (any, error) {
func minfunc(args interface{}) (interface{}, error) {
switch values := args.(type) {
case []float64:
return minAnyType(values)
@@ -83,12 +84,12 @@ func avgAnyType[T float64 | float32 | int | int32 | int64](values []T) (float64,
if len(values) == 0 {
return 0.0, errors.New("average function requires at least one argument")
}
sum, err := sumAnyType(values)
sum, err := sumAnyType[T](values)
return float64(sum) / float64(len(values)), err
}
// Get the average or mean value
func avgfunc(args any) (any, error) {
func avgfunc(args interface{}) (interface{}, error) {
switch values := args.(type) {
case []float64:
return avgAnyType(values)
@@ -113,7 +114,7 @@ func maxAnyType[T float64 | float32 | int | int32 | int64](values []T) (T, error
}
// Get the maximum value
func maxfunc(args any) (any, error) {
func maxfunc(args interface{}) (interface{}, error) {
switch values := args.(type) {
case []float64:
return maxAnyType(values)
@@ -145,7 +146,7 @@ func medianAnyType[T float64 | float32 | int | int32 | int64](values []T) (T, er
}
// Get the median value
func medianfunc(args any) (any, error) {
func medianfunc(args interface{}) (interface{}, error) {
switch values := args.(type) {
case []float64:
return medianAnyType(values)
@@ -166,9 +167,9 @@ func medianfunc(args any) (any, error) {
* Get number of values in list. Returns always an int
*/
func lenfunc(args any) (any, error) {
var err error
length := 0
func lenfunc(args interface{}) (interface{}, error) {
var err error = nil
var length int = 0
switch values := args.(type) {
case []float64:
length = len(values)
@@ -180,7 +181,13 @@ func lenfunc(args any) (any, error) {
length = len(values)
case []int32:
length = len(values)
case float64, float32, int, int64:
case float64:
err = errors.New("function 'len' can only be applied on arrays and strings")
case float32:
err = errors.New("function 'len' can only be applied on arrays and strings")
case int:
err = errors.New("function 'len' can only be applied on arrays and strings")
case int64:
err = errors.New("function 'len' can only be applied on arrays and strings")
case string:
length = len(values)
@@ -190,13 +197,13 @@ func lenfunc(args any) (any, error) {
/*
* Check if a values is in a list
* In contrast to most of the other functions, this one is an infix operator for
* In constrast to most of the other functions, this one is an infix operator for
* - substring matching: `"abc" in "abcdef"` -> true
* - substring matching with int casting: `3 in "abd3"` -> true
* - search for an int in an int list: `3 in getCpuList()` -> true (if you have more than 4 CPU hardware threads)
*/
func infunc(a any, b any) (any, error) {
func infunc(a interface{}, b interface{}) (interface{}, error) {
switch match := a.(type) {
case string:
switch total := b.(type) {
@@ -206,9 +213,13 @@ func infunc(a any, b any) (any, error) {
case int:
switch total := b.(type) {
case []int:
return slices.Contains(total, match), nil
for _, x := range total {
if x == match {
return true, nil
}
}
case string:
smatch := strconv.Itoa(match)
smatch := fmt.Sprintf("%d", match)
return strings.Contains(total, smatch), nil
}
@@ -222,12 +233,12 @@ func infunc(a any, b any) (any, error) {
* format keys \d = %d, \w = %d, ... Not sure how to fix this
*/
func matchfunc(args ...any) (any, error) {
func matchfunc(args ...interface{}) (interface{}, error) {
switch match := args[0].(type) {
case string:
switch total := args[1].(type) {
case string:
smatch := strings.ReplaceAll(match, "%", "\\")
smatch := strings.Replace(match, "%", "\\", -1)
regex, err := regexp.Compile(smatch)
if err != nil {
return false, err
@@ -244,7 +255,7 @@ func matchfunc(args ...any) (any, error) {
*/
// for a given cpuid, it returns the core id
func getCpuCoreFunc(args any) (any, error) {
func getCpuCoreFunc(args interface{}) (interface{}, error) {
switch cpuid := args.(type) {
case int:
return topo.GetHwthreadCore(cpuid), nil
@@ -253,7 +264,7 @@ func getCpuCoreFunc(args any) (any, error) {
}
// for a given cpuid, it returns the socket id
func getCpuSocketFunc(args any) (any, error) {
func getCpuSocketFunc(args interface{}) (interface{}, error) {
switch cpuid := args.(type) {
case int:
return topo.GetHwthreadSocket(cpuid), nil
@@ -262,7 +273,7 @@ func getCpuSocketFunc(args any) (any, error) {
}
// for a given cpuid, it returns the id of the NUMA node
func getCpuNumaDomainFunc(args any) (any, error) {
func getCpuNumaDomainFunc(args interface{}) (interface{}, error) {
switch cpuid := args.(type) {
case int:
return topo.GetHwthreadNumaDomain(cpuid), nil
@@ -271,7 +282,7 @@ func getCpuNumaDomainFunc(args any) (any, error) {
}
// for a given cpuid, it returns the id of the CPU die
func getCpuDieFunc(args any) (any, error) {
func getCpuDieFunc(args interface{}) (interface{}, error) {
switch cpuid := args.(type) {
case int:
return topo.GetHwthreadDie(cpuid), nil
@@ -280,7 +291,7 @@ func getCpuDieFunc(args any) (any, error) {
}
// for a given core id, it returns the list of cpuids
func getCpuListOfCoreFunc(args any) (any, error) {
func getCpuListOfCoreFunc(args interface{}) (interface{}, error) {
cpulist := make([]int, 0)
switch in := args.(type) {
case int:
@@ -294,7 +305,7 @@ func getCpuListOfCoreFunc(args any) (any, error) {
}
// for a given socket id, it returns the list of cpuids
func getCpuListOfSocketFunc(args any) (any, error) {
func getCpuListOfSocketFunc(args interface{}) (interface{}, error) {
cpulist := make([]int, 0)
switch in := args.(type) {
case int:
@@ -308,7 +319,7 @@ func getCpuListOfSocketFunc(args any) (any, error) {
}
// for a given id of a NUMA domain, it returns the list of cpuids
func getCpuListOfNumaDomainFunc(args any) (any, error) {
func getCpuListOfNumaDomainFunc(args interface{}) (interface{}, error) {
cpulist := make([]int, 0)
switch in := args.(type) {
case int:
@@ -322,7 +333,7 @@ func getCpuListOfNumaDomainFunc(args any) (any, error) {
}
// for a given CPU die id, it returns the list of cpuids
func getCpuListOfDieFunc(args any) (any, error) {
func getCpuListOfDieFunc(args interface{}) (interface{}, error) {
cpulist := make([]int, 0)
switch in := args.(type) {
case int:
@@ -336,14 +347,14 @@ func getCpuListOfDieFunc(args any) (any, error) {
}
// wrapper function to get a list of all cpuids of the node
func getCpuListOfNode() (any, error) {
func getCpuListOfNode() (interface{}, error) {
return topo.HwthreadList(), nil
}
// helper function to get the cpuid list for a CCMetric type tag set (type and type-id)
// since there is no access to the metric data in the function, is should be called like
// `getCpuListOfType()`
func getCpuListOfType(args ...any) (any, error) {
func getCpuListOfType(args ...interface{}) (interface{}, error) {
cpulist := make([]int, 0)
switch typ := args[0].(type) {
case string:

View File

@@ -8,13 +8,12 @@
package metricRouter
import (
"fmt"
"sync"
"time"
cclog "github.com/ClusterCockpit/cc-lib/v2/ccLogger"
cclog "github.com/ClusterCockpit/cc-lib/ccLogger"
lp "github.com/ClusterCockpit/cc-lib/v2/ccMessage"
lp "github.com/ClusterCockpit/cc-lib/ccMessage"
agg "github.com/ClusterCockpit/cc-metric-collector/internal/metricAggregator"
mct "github.com/ClusterCockpit/cc-metric-collector/pkg/multiChanTicker"
)
@@ -52,7 +51,7 @@ type MetricCache interface {
}
func (c *metricCache) Init(output chan lp.CCMessage, ticker mct.MultiChanTicker, wg *sync.WaitGroup, numPeriods int) error {
var err error
var err error = nil
c.done = make(chan bool)
c.wg = wg
c.ticker = ticker
@@ -71,7 +70,8 @@ func (c *metricCache) Init(output chan lp.CCMessage, ticker mct.MultiChanTicker,
// The code is executed by the MetricCache goroutine
c.aggEngine, err = agg.NewAggregator(c.output)
if err != nil {
return fmt.Errorf("MetricCache: failed to create aggregator: %w", err)
cclog.ComponentError("MetricCache", "Cannot create aggregator")
return err
}
return nil
@@ -79,6 +79,7 @@ func (c *metricCache) Init(output chan lp.CCMessage, ticker mct.MultiChanTicker,
// Start starts the metric cache
func (c *metricCache) Start() {
c.tickchan = make(chan time.Time)
c.ticker.AddChannel(c.tickchan)
// Router cache is done
@@ -101,7 +102,9 @@ func (c *metricCache) Start() {
return oldPeriod
}
c.wg.Go(func() {
c.wg.Add(1)
go func() {
defer c.wg.Done()
for {
select {
case <-c.done:
@@ -121,7 +124,7 @@ func (c *metricCache) Start() {
}
}
}
})
}()
cclog.ComponentDebug("MetricCache", "START")
}
@@ -134,12 +137,12 @@ func (c *metricCache) Add(metric lp.CCMessage) {
p := c.intervals[c.curPeriod]
if p.numMetrics < p.sizeMetrics {
p.metrics[p.numMetrics] = metric
p.numMetrics++
p.numMetrics = p.numMetrics + 1
p.stopstamp = metric.Time()
} else {
p.metrics = append(p.metrics, metric)
p.numMetrics++
p.sizeMetrics++
p.numMetrics = p.numMetrics + 1
p.sizeMetrics = p.sizeMetrics + 1
p.stopstamp = metric.Time()
}
c.lock.Unlock()
@@ -158,8 +161,8 @@ func (c *metricCache) DeleteAggregation(name string) error {
// is the current one, index=1 the last interval and so on. Returns and empty array if a wrong index
// is given (negative index, index larger than configured number of total intervals, ...)
func (c *metricCache) GetPeriod(index int) (time.Time, time.Time, []lp.CCMessage) {
start := time.Now()
stop := time.Now()
var start time.Time = time.Now()
var stop time.Time = time.Now()
var metrics []lp.CCMessage
if index >= 0 && index < c.numPeriods {
pindex := c.curPeriod - index
@@ -170,6 +173,7 @@ func (c *metricCache) GetPeriod(index int) (time.Time, time.Time, []lp.CCMessage
start = c.intervals[pindex].startstamp
stop = c.intervals[pindex].stopstamp
metrics = c.intervals[pindex].metrics
//return c.intervals[pindex].startstamp, c.intervals[pindex].stopstamp, c.intervals[pindex].metrics
} else {
metrics = make([]lp.CCMessage, 0)
}

View File

@@ -8,18 +8,17 @@
package metricRouter
import (
"bytes"
"encoding/json"
"fmt"
"maps"
"os"
"strings"
"sync"
"time"
cclog "github.com/ClusterCockpit/cc-lib/v2/ccLogger"
lp "github.com/ClusterCockpit/cc-lib/v2/ccMessage"
mp "github.com/ClusterCockpit/cc-lib/v2/messageProcessor"
cclog "github.com/ClusterCockpit/cc-lib/ccLogger"
lp "github.com/ClusterCockpit/cc-lib/ccMessage"
mp "github.com/ClusterCockpit/cc-lib/messageProcessor"
agg "github.com/ClusterCockpit/cc-metric-collector/internal/metricAggregator"
mct "github.com/ClusterCockpit/cc-metric-collector/pkg/multiChanTicker"
)
@@ -47,6 +46,7 @@ type metricRouterConfig struct {
MaxForward int `json:"max_forward"` // Number of maximal forwarded metrics at one select
NormalizeUnits bool `json:"normalize_units"` // Check unit meta flag and normalize it using cc-units
ChangeUnitPrefix map[string]string `json:"change_unit_prefix"` // Add prefix that should be applied to the metrics
// dropMetrics map[string]bool // Internal map for O(1) lookup
MessageProcessor json.RawMessage `json:"process_messages,omitempty"`
}
@@ -102,93 +102,76 @@ func (r *metricRouter) Init(ticker mct.MultiChanTicker, wg *sync.WaitGroup, rout
// Drop domain part of host name
r.hostname = strings.SplitN(hostname, `.`, 2)[0]
d := json.NewDecoder(bytes.NewReader(routerConfig))
d.DisallowUnknownFields()
if err := d.Decode(&r.config); err != nil {
return fmt.Errorf("failed to decode metric router config: %w", err)
err = json.Unmarshal(routerConfig, &r.config)
if err != nil {
cclog.ComponentError("MetricRouter", err.Error())
return err
}
r.maxForward = 1
if r.config.MaxForward > r.maxForward {
r.maxForward = r.config.MaxForward
}
r.maxForward = max(1, r.config.MaxForward)
if r.config.NumCacheIntervals > 0 {
r.cache, err = NewCache(r.cache_input, r.ticker, &r.cachewg, r.config.NumCacheIntervals)
if err != nil {
return fmt.Errorf("MetricRouter: failed to initialize MetricCache: %w", err)
cclog.ComponentError("MetricRouter", "MetricCache initialization failed:", err.Error())
return err
}
for _, agg := range r.config.IntervalAgg {
err = r.cache.AddAggregation(agg.Name, agg.Function, agg.Condition, agg.Tags, agg.Meta)
if err != nil {
return fmt.Errorf("MetricCache AddAggregation() failed: %w", err)
}
r.cache.AddAggregation(agg.Name, agg.Function, agg.Condition, agg.Tags, agg.Meta)
}
}
p, err := mp.NewMessageProcessor()
if err != nil {
return fmt.Errorf("MessageProcessor NewMessageProcessor() failed: %w", err)
return fmt.Errorf("initialization of message processor failed: %v", err.Error())
}
r.mp = p
if len(r.config.MessageProcessor) > 0 {
err = r.mp.FromConfigJSON(r.config.MessageProcessor)
if err != nil {
return fmt.Errorf("MessageProcessor FromConfigJSON() failed: %w", err)
return fmt.Errorf("failed parsing JSON for message processor: %v", err.Error())
}
}
for _, mname := range r.config.DropMetrics {
err = r.mp.AddDropMessagesByName(mname)
if err != nil {
return fmt.Errorf("MessageProcessor AddDropMessagesByName() failed: %w", err)
}
r.mp.AddDropMessagesByName(mname)
}
for _, cond := range r.config.DropMetricsIf {
err = r.mp.AddDropMessagesByCondition(cond)
if err != nil {
return fmt.Errorf("MessageProcessor AddDropMessagesByCondition() failed: %w", err)
}
r.mp.AddDropMessagesByCondition(cond)
}
for _, data := range r.config.AddTags {
cond := data.Condition
if cond == "*" {
cond = "true"
}
err = r.mp.AddAddTagsByCondition(cond, data.Key, data.Value)
if err != nil {
return fmt.Errorf("MessageProcessor AddAddTagsByCondition() failed: %w", err)
}
r.mp.AddAddTagsByCondition(cond, data.Key, data.Value)
}
for _, data := range r.config.DelTags {
cond := data.Condition
if cond == "*" {
cond = "true"
}
err = r.mp.AddDeleteTagsByCondition(cond, data.Key, data.Value)
if err != nil {
return fmt.Errorf("MessageProcessor AddDeleteTagsByCondition() failed: %w", err)
}
r.mp.AddDeleteTagsByCondition(cond, data.Key, data.Value)
}
for oldname, newname := range r.config.RenameMetrics {
err = r.mp.AddRenameMetricByName(oldname, newname)
if err != nil {
return fmt.Errorf("MessageProcessor AddRenameMetricByName() failed: %w", err)
}
r.mp.AddRenameMetricByName(oldname, newname)
}
for metricName, prefix := range r.config.ChangeUnitPrefix {
err = r.mp.AddChangeUnitPrefix(fmt.Sprintf("name == '%s'", metricName), prefix)
if err != nil {
return fmt.Errorf("MessageProcessor AddChangeUnitPrefix() failed: %w", err)
}
r.mp.AddChangeUnitPrefix(fmt.Sprintf("name == '%s'", metricName), prefix)
}
r.mp.SetNormalizeUnits(r.config.NormalizeUnits)
err = r.mp.AddAddTagsByCondition("!msg.HasTag('"+r.config.HostnameTagName+"')", r.config.HostnameTagName, r.hostname)
if err != nil {
return fmt.Errorf("MessageProcessor AddAddTagsByCondition() failed: %w", err)
}
r.mp.AddAddTagsByCondition("true", r.config.HostnameTagName, r.hostname)
// r.config.dropMetrics = make(map[string]bool)
// for _, mname := range r.config.DropMetrics {
// r.config.dropMetrics[mname] = true
// }
return nil
}
func getParamMap(point lp.CCMessage) map[string]any {
params := make(map[string]any)
func getParamMap(point lp.CCMessage) map[string]interface{} {
params := make(map[string]interface{})
params["metric"] = point
params["name"] = point.Name()
for key, value := range point.Tags() {
@@ -197,12 +180,14 @@ func getParamMap(point lp.CCMessage) map[string]any {
for key, value := range point.Meta() {
params[key] = value
}
maps.Copy(params, point.Fields())
for key, value := range point.Fields() {
params[key] = value
}
params["timestamp"] = point.Time()
return params
}
// DoAddTags adds a tag when condition is fulfilled
// DoAddTags adds a tag when condition is fullfiled
func (r *metricRouter) DoAddTags(point lp.CCMessage) {
var conditionMatches bool
for _, m := range r.config.AddTags {
@@ -224,6 +209,83 @@ func (r *metricRouter) DoAddTags(point lp.CCMessage) {
}
}
// DoDelTags removes a tag when condition is fullfiled
// func (r *metricRouter) DoDelTags(point lp.CCMessage) {
// var conditionMatches bool
// for _, m := range r.config.DelTags {
// if m.Condition == "*" {
// // Condition is always matched
// conditionMatches = true
// } else {
// // Evaluate condition
// var err error
// conditionMatches, err = agg.EvalBoolCondition(m.Condition, getParamMap(point))
// if err != nil {
// cclog.ComponentError("MetricRouter", err.Error())
// conditionMatches = false
// }
// }
// if conditionMatches {
// point.RemoveTag(m.Key)
// }
// }
// }
// Conditional test whether a metric should be dropped
// func (r *metricRouter) dropMetric(point lp.CCMessage) bool {
// // Simple drop check
// if conditionMatches, ok := r.config.dropMetrics[point.Name()]; ok {
// return conditionMatches
// }
// // Checking the dropping conditions
// for _, m := range r.config.DropMetricsIf {
// conditionMatches, err := agg.EvalBoolCondition(m, getParamMap(point))
// if err != nil {
// cclog.ComponentError("MetricRouter", err.Error())
// conditionMatches = false
// }
// if conditionMatches {
// return conditionMatches
// }
// }
// // No dropping condition met
// return false
// }
// func (r *metricRouter) prepareUnit(point lp.CCMessage) bool {
// if r.config.NormalizeUnits {
// if in_unit, ok := point.GetMeta("unit"); ok {
// u := units.NewUnit(in_unit)
// if u.Valid() {
// point.AddMeta("unit", u.Short())
// }
// }
// }
// if newP, ok := r.config.ChangeUnitPrefix[point.Name()]; ok {
// newPrefix := units.NewPrefix(newP)
// if in_unit, ok := point.GetMeta("unit"); ok && newPrefix != units.InvalidPrefix {
// u := units.NewUnit(in_unit)
// if u.Valid() {
// cclog.ComponentDebug("MetricRouter", "Change prefix to", newP, "for metric", point.Name())
// conv, out_unit := units.GetUnitPrefixFactor(u, newPrefix)
// if conv != nil && out_unit.Valid() {
// if val, ok := point.GetField("value"); ok {
// point.AddField("value", conv(val))
// point.AddMeta("unit", out_unit.Short())
// }
// }
// }
// }
// }
// return true
// }
// Start starts the metric router
func (r *metricRouter) Start() {
// start timer if configured
@@ -239,9 +301,31 @@ func (r *metricRouter) Start() {
cclog.ComponentDebug("MetricRouter", "DONE")
}
// Forward message received from collector channel
// Forward takes a received metric, adds or deletes tags
// and forwards it to the output channels
// forward := func(point lp.CCMessage) {
// cclog.ComponentDebug("MetricRouter", "FORWARD", point)
// r.DoAddTags(point)
// r.DoDelTags(point)
// name := point.Name()
// if new, ok := r.config.RenameMetrics[name]; ok {
// point.SetName(new)
// point.AddMeta("oldname", name)
// r.DoAddTags(point)
// r.DoDelTags(point)
// }
// r.prepareUnit(point)
// for _, o := range r.outputs {
// o <- point
// }
// }
// Foward message received from collector channel
coll_forward := func(p lp.CCMessage) {
// receive from metric collector
//p.AddTag(r.config.HostnameTagName, r.hostname)
if r.config.IntervalStamp {
p.SetTime(r.timestamp)
}
@@ -251,6 +335,11 @@ func (r *metricRouter) Start() {
o <- m
}
}
// if !r.dropMetric(p) {
// for _, o := range r.outputs {
// o <- point
// }
// }
// even if the metric is dropped, it is stored in the cache for
// aggregations
if r.config.NumCacheIntervals > 0 {
@@ -270,6 +359,9 @@ func (r *metricRouter) Start() {
o <- m
}
}
// if !r.dropMetric(p) {
// forward(p)
// }
}
// Forward message received from cache channel
@@ -288,7 +380,10 @@ func (r *metricRouter) Start() {
r.cache.Start()
}
r.wg.Go(func() {
r.wg.Add(1)
go func() {
defer r.wg.Done()
for {
select {
case <-r.done:
@@ -318,7 +413,7 @@ func (r *metricRouter) Start() {
}
}
}
})
}()
cclog.ComponentDebug("MetricRouter", "STARTED")
}

View File

@@ -13,11 +13,11 @@ import (
"os"
"path/filepath"
"regexp"
"slices"
"strconv"
"strings"
cclogger "github.com/ClusterCockpit/cc-lib/v2/ccLogger"
cclogger "github.com/ClusterCockpit/cc-lib/ccLogger"
"golang.org/x/exp/slices"
)
const SYSFS_CPUBASE = `/sys/devices/system/cpu`
@@ -51,13 +51,14 @@ var cache struct {
func fileToInt(path string) int {
buffer, err := os.ReadFile(path)
if err != nil {
cclogger.ComponentError("ccTopology", fmt.Sprintf("fileToInt(): Reading \"%s\": %v", path, err))
log.Print(err)
cclogger.ComponentError("ccTopology", "fileToInt", "Reading", path, ":", err.Error())
return -1
}
stringBuffer := strings.TrimSpace(string(buffer))
id, err := strconv.Atoi(stringBuffer)
if err != nil {
cclogger.ComponentError("ccTopology", fmt.Sprintf("fileToInt(): Parsing \"%s\": %v", stringBuffer, err))
cclogger.ComponentError("ccTopology", "fileToInt", "Parsing", path, ":", stringBuffer, err.Error())
return -1
}
return id
@@ -79,7 +80,7 @@ func fileToList(path string) []int {
// Create list
list := make([]int, 0)
stringBuffer := strings.TrimSpace(string(buffer))
for valueRangeString := range strings.SplitSeq(stringBuffer, ",") {
for _, valueRangeString := range strings.Split(stringBuffer, ",") {
valueRange := strings.Split(valueRangeString, "-")
switch len(valueRange) {
case 1:
@@ -111,7 +112,9 @@ func fileToList(path string) []int {
// init initializes the cache structure
func init() {
getHWThreads := func() []int {
getHWThreads :=
func() []int {
globPath := filepath.Join(SYSFS_CPUBASE, "cpu[0-9]*")
regexPath := filepath.Join(SYSFS_CPUBASE, "cpu([[:digit:]]+)")
regex := regexp.MustCompile(regexPath)
@@ -147,7 +150,8 @@ func init() {
return hwThreadIDs
}
getNumaDomain := func(basePath string) int {
getNumaDomain :=
func(basePath string) int {
globPath := filepath.Join(basePath, "node*")
regexPath := filepath.Join(basePath, "node([[:digit:]]+)")
regex := regexp.MustCompile(regexPath)
@@ -215,7 +219,8 @@ func init() {
// Lookup NUMA domain id
cache.NumaDomainList[i] = getNumaDomain(cpuBase)
cache.CpuData[i] = HwthreadEntry{
cache.CpuData[i] =
HwthreadEntry{
CpuID: cache.HwthreadList[i],
SMT: cache.SMTList[i],
CoreCPUsList: coreCPUsList,
@@ -255,6 +260,12 @@ func HwthreadList() []int {
return slices.Clone(cache.HwthreadList)
}
// Get list of hardware thread IDs in the order of listing in /proc/cpuinfo
// Deprecated! Use HwthreadList()
func CpuList() []int {
return HwthreadList()
}
// CoreList gets the list of CPU core IDs in the order of listing in /proc/cpuinfo
func CoreList() []int {
return slices.Clone(cache.CoreList)
@@ -293,19 +304,20 @@ func GetTypeList(topology_type string) []int {
}
func GetTypeId(hwt HwthreadEntry, topology_type string) (int, error) {
var err error = nil
switch topology_type {
case "node":
return 0, nil
return 0, err
case "socket":
return hwt.Socket, nil
return hwt.Socket, err
case "die":
return hwt.Die, nil
return hwt.Die, err
case "memoryDomain":
return hwt.NumaDomain, nil
return hwt.NumaDomain, err
case "core":
return hwt.Core, nil
return hwt.Core, err
case "hwthread":
return hwt.CpuID, nil
return hwt.CpuID, err
}
return -1, fmt.Errorf("unknown topology type '%s'", topology_type)
}

View File

@@ -10,7 +10,7 @@ package multiChanTicker
import (
"time"
cclog "github.com/ClusterCockpit/cc-lib/v2/ccLogger"
cclog "github.com/ClusterCockpit/cc-lib/ccLogger"
)
type multiChanTicker struct {
@@ -21,7 +21,7 @@ type multiChanTicker struct {
type MultiChanTicker interface {
Init(duration time.Duration)
AddChannel(channel chan time.Time)
AddChannel(chan time.Time)
Close()
}

View File

@@ -1,6 +1,6 @@
{
"process_messages" : {
"add_tags_if": [
"add_tag_if": [
{
"key" : "cluster",
"value" : "testcluster",
@@ -12,7 +12,7 @@
"if" : "name == 'temp_package_id_0'"
}
],
"delete_meta_if": [
"delete_tag_if": [
{
"key" : "unit",
"if" : "true"

View File

@@ -30,11 +30,11 @@ make
%install
install -Dpm 0750 %{name} %{buildroot}%{_bindir}/%{name}
install -Dpm 0600 example-configs/config.json %{buildroot}%{_sysconfdir}/%{name}/%{name}.json
install -Dpm 0600 example-configs/collectors.json %{buildroot}%{_sysconfdir}/%{name}/collectors.json
install -Dpm 0600 example-configs/sinks.json %{buildroot}%{_sysconfdir}/%{name}/sinks.json
install -Dpm 0600 example-configs/receivers.json %{buildroot}%{_sysconfdir}/%{name}/receivers.json
install -Dpm 0600 example-configs/router.json %{buildroot}%{_sysconfdir}/%{name}/router.json
install -Dpm 0600 config.json %{buildroot}%{_sysconfdir}/%{name}/%{name}.json
install -Dpm 0600 collectors.json %{buildroot}%{_sysconfdir}/%{name}/collectors.json
install -Dpm 0600 sinks.json %{buildroot}%{_sysconfdir}/%{name}/sinks.json
install -Dpm 0600 receivers.json %{buildroot}%{_sysconfdir}/%{name}/receivers.json
install -Dpm 0600 router.json %{buildroot}%{_sysconfdir}/%{name}/router.json
install -Dpm 0644 scripts/%{name}.service %{buildroot}%{_unitdir}/%{name}.service
install -Dpm 0600 scripts/%{name}.config %{buildroot}%{_sysconfdir}/default/%{name}
install -Dpm 0644 scripts/%{name}.sysusers %{buildroot}%{_sysusersdir}/%{name}.conf