Compare commits

..

6 Commits

Author SHA1 Message Date
Thomas Roehl
5fade3189f Do startup only if configured 2025-12-18 18:12:03 +01:00
Thomas Roehl
02dc524568 Set CGO_LDFLAGS 2025-12-18 17:12:27 +01:00
Thomas Roehl
e79a27556b Use ubuntu 24.04 container 2025-12-18 17:02:26 +01:00
Thomas Roehl
c950513f09 Reduce to build only at every push 2025-12-18 17:00:53 +01:00
Thomas Roehl
923f1e9b73 Update go files 2025-12-18 16:54:34 +01:00
Thomas Roehl
d3f19c059b Add ccStartup 2025-12-18 16:36:32 +01:00
48 changed files with 1406 additions and 1538 deletions

View File

@@ -36,14 +36,22 @@ jobs:
# fetch-depth must be 0 to use git describe # fetch-depth must be 0 to use git describe
# See: https://github.com/marketplace/actions/checkout # See: https://github.com/marketplace/actions/checkout
- name: Checkout - name: Checkout
uses: actions/checkout@v6 uses: actions/checkout@v4
with: with:
submodules: recursive submodules: recursive
fetch-depth: 0 fetch-depth: 0
# - name: Setup Golang
# uses: actions/setup-go@v5
# with:
# go-version: 'stable'
- name: Setup Golang - name: Setup Golang
run: | run: |
dnf --assumeyes --disableplugin=subscription-manager --enablerepo appstream install go-toolset dnf --assumeyes --disableplugin=subscription-manager install \
https://repo.almalinux.org/almalinux/8/AppStream/x86_64/os/Packages/go-toolset-1.23.9-1.module_el8.10.0+4000+1ad1b2cc.x86_64.rpm \
https://repo.almalinux.org/almalinux/8/AppStream/x86_64/os/Packages/golang-1.23.9-1.module_el8.10.0+4000+1ad1b2cc.x86_64.rpm \
https://repo.almalinux.org/almalinux/8/AppStream/x86_64/os/Packages/golang-bin-1.23.9-1.module_el8.10.0+4000+1ad1b2cc.x86_64.rpm \
https://repo.almalinux.org/almalinux/8/AppStream/x86_64/os/Packages/golang-src-1.23.9-1.module_el8.10.0+4000+1ad1b2cc.noarch.rpm
- name: RPM build MetricCollector - name: RPM build MetricCollector
id: rpmbuild id: rpmbuild
@@ -70,13 +78,13 @@ jobs:
# See: https://github.com/actions/upload-artifact # See: https://github.com/actions/upload-artifact
- name: Save RPM as artifact - name: Save RPM as artifact
uses: actions/upload-artifact@v6 uses: actions/upload-artifact@v4
with: with:
name: cc-metric-collector RPM for AlmaLinux 8 name: cc-metric-collector RPM for AlmaLinux 8
path: ${{ steps.rpmrename.outputs.RPM }} path: ${{ steps.rpmrename.outputs.RPM }}
overwrite: true overwrite: true
- name: Save SRPM as artifact - name: Save SRPM as artifact
uses: actions/upload-artifact@v6 uses: actions/upload-artifact@v4
with: with:
name: cc-metric-collector SRPM for AlmaLinux 8 name: cc-metric-collector SRPM for AlmaLinux 8
path: ${{ steps.rpmrename.outputs.SRPM }} path: ${{ steps.rpmrename.outputs.SRPM }}
@@ -106,14 +114,23 @@ jobs:
# fetch-depth must be 0 to use git describe # fetch-depth must be 0 to use git describe
# See: https://github.com/marketplace/actions/checkout # See: https://github.com/marketplace/actions/checkout
- name: Checkout - name: Checkout
uses: actions/checkout@v6 uses: actions/checkout@v4
with: with:
submodules: recursive submodules: recursive
fetch-depth: 0 fetch-depth: 0
# - name: Setup Golang
# uses: actions/setup-go@v5
# with:
# go-version: 'stable'
- name: Setup Golang - name: Setup Golang
run: | run: |
dnf --assumeyes --disableplugin=subscription-manager --enablerepo appstream install go-toolset dnf --assumeyes --disableplugin=subscription-manager install \
https://repo.almalinux.org/almalinux/9/AppStream/x86_64/os/Packages/go-toolset-1.25.3-1.el9_7.x86_64.rpm \
https://repo.almalinux.org/almalinux/9/AppStream/x86_64/os/Packages/golang-1.25.3-1.el9_7.x86_64.rpm \
https://repo.almalinux.org/almalinux/9/AppStream/x86_64/os/Packages/golang-bin-1.25.3-1.el9_7.x86_64.rpm \
https://repo.almalinux.org/almalinux/9/AppStream/x86_64/os/Packages/golang-src-1.25.3-1.el9_7.noarch.rpm \
https://repo.almalinux.org/almalinux/9/AppStream/x86_64/os/Packages/golang-race-1.25.3-1.el9_7.x86_64.rpm
- name: RPM build MetricCollector - name: RPM build MetricCollector
id: rpmbuild id: rpmbuild
@@ -140,26 +157,25 @@ jobs:
# See: https://github.com/actions/upload-artifact # See: https://github.com/actions/upload-artifact
- name: Save RPM as artifact - name: Save RPM as artifact
uses: actions/upload-artifact@v6 uses: actions/upload-artifact@v4
with: with:
name: cc-metric-collector RPM for AlmaLinux 9 name: cc-metric-collector RPM for AlmaLinux 9
path: ${{ steps.rpmrename.outputs.RPM }} path: ${{ steps.rpmrename.outputs.RPM }}
overwrite: true overwrite: true
- name: Save SRPM as artifact - name: Save SRPM as artifact
uses: actions/upload-artifact@v6 uses: actions/upload-artifact@v4
with: with:
name: cc-metric-collector SRPM for AlmaLinux 9 name: cc-metric-collector SRPM for AlmaLinux 9
path: ${{ steps.rpmrename.outputs.SRPM }} path: ${{ steps.rpmrename.outputs.SRPM }}
overwrite: true overwrite: true
# #
# Build on Red Hat Universal Base Image (UBI 8) using go-toolset # Build on UBI 8 using go-toolset
# #
UBI-8-RPM-build: UBI-8-RPM-build:
runs-on: ubuntu-latest runs-on: ubuntu-latest
# See: https://catalog.redhat.com/en/search?searchType=Containers&q=Red+Hat+Universal+Base+Image+8 # See: https://catalog.redhat.com/software/containers/ubi8/ubi/5c35984d70cc534b3a3784e?container-tabs=gti
# https://hub.docker.com/r/redhat/ubi8 container: registry.access.redhat.com/ubi8/ubi:8.8-1032.1692772289
container: redhat/ubi8
# The job outputs link to the outputs of the 'rpmbuild' step # The job outputs link to the outputs of the 'rpmbuild' step
outputs: outputs:
rpm : ${{steps.rpmbuild.outputs.RPM}} rpm : ${{steps.rpmbuild.outputs.RPM}}
@@ -174,14 +190,22 @@ jobs:
# fetch-depth must be 0 to use git describe # fetch-depth must be 0 to use git describe
# See: https://github.com/marketplace/actions/checkout # See: https://github.com/marketplace/actions/checkout
- name: Checkout - name: Checkout
uses: actions/checkout@v6 uses: actions/checkout@v4
with: with:
submodules: recursive submodules: recursive
fetch-depth: 0 fetch-depth: 0
# - name: Setup Golang
# uses: actions/setup-go@v5
# with:
# go-version: 'stable'
- name: Setup Golang - name: Setup Golang
run: | run: |
dnf --assumeyes --disableplugin=subscription-manager --enablerepo ubi-8-appstream-rpms install go-toolset dnf --assumeyes --disableplugin=subscription-manager install \
https://repo.almalinux.org/almalinux/8/AppStream/x86_64/os/Packages/go-toolset-1.23.9-1.module_el8.10.0+4000+1ad1b2cc.x86_64.rpm \
https://repo.almalinux.org/almalinux/8/AppStream/x86_64/os/Packages/golang-1.23.9-1.module_el8.10.0+4000+1ad1b2cc.x86_64.rpm \
https://repo.almalinux.org/almalinux/8/AppStream/x86_64/os/Packages/golang-bin-1.23.9-1.module_el8.10.0+4000+1ad1b2cc.x86_64.rpm \
https://repo.almalinux.org/almalinux/8/AppStream/x86_64/os/Packages/golang-src-1.23.9-1.module_el8.10.0+4000+1ad1b2cc.noarch.rpm
- name: RPM build MetricCollector - name: RPM build MetricCollector
id: rpmbuild id: rpmbuild
@@ -191,25 +215,24 @@ jobs:
# See: https://github.com/actions/upload-artifact # See: https://github.com/actions/upload-artifact
- name: Save RPM as artifact - name: Save RPM as artifact
uses: actions/upload-artifact@v6 uses: actions/upload-artifact@v4
with: with:
name: cc-metric-collector RPM for UBI 8 name: cc-metric-collector RPM for UBI 8
path: ${{ steps.rpmbuild.outputs.RPM }} path: ${{ steps.rpmbuild.outputs.RPM }}
overwrite: true overwrite: true
- name: Save SRPM as artifact - name: Save SRPM as artifact
uses: actions/upload-artifact@v6 uses: actions/upload-artifact@v4
with: with:
name: cc-metric-collector SRPM for UBI 8 name: cc-metric-collector SRPM for UBI 8
path: ${{ steps.rpmbuild.outputs.SRPM }} path: ${{ steps.rpmbuild.outputs.SRPM }}
overwrite: true overwrite: true
# #
# Build on Red Hat Universal Base Image (UBI 9) using go-toolset # Build on UBI 9 using go-toolset
# #
UBI-9-RPM-build: UBI-9-RPM-build:
runs-on: ubuntu-latest runs-on: ubuntu-latest
# See: https://catalog.redhat.com/en/search?searchType=Containers&q=Red+Hat+Universal+Base+Image+9 # See: https://catalog.redhat.com/software/containers/ubi8/ubi/5c359854d70cc534b3a3784e?container-tabs=gti
# https://hub.docker.com/r/redhat/ubi9
container: redhat/ubi9 container: redhat/ubi9
# The job outputs link to the outputs of the 'rpmbuild' step # The job outputs link to the outputs of the 'rpmbuild' step
# The job outputs link to the outputs of the 'rpmbuild' step # The job outputs link to the outputs of the 'rpmbuild' step
@@ -226,14 +249,24 @@ jobs:
# fetch-depth must be 0 to use git describe # fetch-depth must be 0 to use git describe
# See: https://github.com/marketplace/actions/checkout # See: https://github.com/marketplace/actions/checkout
- name: Checkout - name: Checkout
uses: actions/checkout@v6 uses: actions/checkout@v4
with: with:
submodules: recursive submodules: recursive
fetch-depth: 0 fetch-depth: 0
# See: https://github.com/marketplace/actions/setup-go-environment
# - name: Setup Golang
# uses: actions/setup-go@v5
# with:
# go-version: 'stable'
- name: Setup Golang - name: Setup Golang
run: | run: |
dnf --assumeyes --disableplugin=subscription-manager --enablerepo ubi-9-appstream-rpms install go-toolset dnf --assumeyes --disableplugin=subscription-manager install \
https://repo.almalinux.org/almalinux/9/AppStream/x86_64/os/Packages/go-toolset-1.25.3-1.el9_7.x86_64.rpm \
https://repo.almalinux.org/almalinux/9/AppStream/x86_64/os/Packages/golang-1.25.3-1.el9_7.x86_64.rpm \
https://repo.almalinux.org/almalinux/9/AppStream/x86_64/os/Packages/golang-bin-1.25.3-1.el9_7.x86_64.rpm \
https://repo.almalinux.org/almalinux/9/AppStream/x86_64/os/Packages/golang-src-1.25.3-1.el9_7.noarch.rpm \
https://repo.almalinux.org/almalinux/9/AppStream/x86_64/os/Packages/golang-race-1.25.3-1.el9_7.x86_64.rpm
- name: RPM build MetricCollector - name: RPM build MetricCollector
id: rpmbuild id: rpmbuild
@@ -243,13 +276,13 @@ jobs:
# See: https://github.com/actions/upload-artifact # See: https://github.com/actions/upload-artifact
- name: Save RPM as artifact - name: Save RPM as artifact
uses: actions/upload-artifact@v6 uses: actions/upload-artifact@v4
with: with:
name: cc-metric-collector RPM for UBI 9 name: cc-metric-collector RPM for UBI 9
path: ${{ steps.rpmbuild.outputs.RPM }} path: ${{ steps.rpmbuild.outputs.RPM }}
overwrite: true overwrite: true
- name: Save SRPM as artifact - name: Save SRPM as artifact
uses: actions/upload-artifact@v6 uses: actions/upload-artifact@v4
with: with:
name: cc-metric-collector SRPM for UBI 9 name: cc-metric-collector SRPM for UBI 9
path: ${{ steps.rpmbuild.outputs.SRPM }} path: ${{ steps.rpmbuild.outputs.SRPM }}
@@ -275,14 +308,13 @@ jobs:
# fetch-depth must be 0 to use git describe # fetch-depth must be 0 to use git describe
# See: https://github.com/marketplace/actions/checkout # See: https://github.com/marketplace/actions/checkout
- name: Checkout - name: Checkout
uses: actions/checkout@v6 uses: actions/checkout@v4
with: with:
submodules: recursive submodules: recursive
fetch-depth: 0 fetch-depth: 0
# Use official golang package
# See: https://github.com/marketplace/actions/setup-go-environment
- name: Setup Golang - name: Setup Golang
uses: actions/setup-go@v6 uses: actions/setup-go@v5
with: with:
go-version: 'stable' go-version: 'stable'
@@ -300,7 +332,7 @@ jobs:
echo "DEB=${NEW_DEB_FILE}" >> $GITHUB_OUTPUT echo "DEB=${NEW_DEB_FILE}" >> $GITHUB_OUTPUT
# See: https://github.com/actions/upload-artifact # See: https://github.com/actions/upload-artifact
- name: Save DEB as artifact - name: Save DEB as artifact
uses: actions/upload-artifact@v6 uses: actions/upload-artifact@v4
with: with:
name: cc-metric-collector DEB for Ubuntu 22.04 name: cc-metric-collector DEB for Ubuntu 22.04
path: ${{ steps.debrename.outputs.DEB }} path: ${{ steps.debrename.outputs.DEB }}
@@ -326,14 +358,13 @@ jobs:
# fetch-depth must be 0 to use git describe # fetch-depth must be 0 to use git describe
# See: https://github.com/marketplace/actions/checkout # See: https://github.com/marketplace/actions/checkout
- name: Checkout - name: Checkout
uses: actions/checkout@v6 uses: actions/checkout@v4
with: with:
submodules: recursive submodules: recursive
fetch-depth: 0 fetch-depth: 0
# Use official golang package
# See: https://github.com/marketplace/actions/setup-go-environment
- name: Setup Golang - name: Setup Golang
uses: actions/setup-go@v6 uses: actions/setup-go@v5
with: with:
go-version: 'stable' go-version: 'stable'
@@ -351,7 +382,7 @@ jobs:
echo "DEB=${NEW_DEB_FILE}" >> $GITHUB_OUTPUT echo "DEB=${NEW_DEB_FILE}" >> $GITHUB_OUTPUT
# See: https://github.com/actions/upload-artifact # See: https://github.com/actions/upload-artifact
- name: Save DEB as artifact - name: Save DEB as artifact
uses: actions/upload-artifact@v6 uses: actions/upload-artifact@v4
with: with:
name: cc-metric-collector DEB for Ubuntu 24.04 name: cc-metric-collector DEB for Ubuntu 24.04
path: ${{ steps.debrename.outputs.DEB }} path: ${{ steps.debrename.outputs.DEB }}
@@ -369,48 +400,48 @@ jobs:
steps: steps:
# See: https://github.com/actions/download-artifact # See: https://github.com/actions/download-artifact
- name: Download AlmaLinux 8 RPM - name: Download AlmaLinux 8 RPM
uses: actions/download-artifact@v7 uses: actions/download-artifact@v4
with: with:
name: cc-metric-collector RPM for AlmaLinux 8 name: cc-metric-collector RPM for AlmaLinux 8
- name: Download AlmaLinux 8 SRPM - name: Download AlmaLinux 8 SRPM
uses: actions/download-artifact@v7 uses: actions/download-artifact@v4
with: with:
name: cc-metric-collector SRPM for AlmaLinux 8 name: cc-metric-collector SRPM for AlmaLinux 8
- name: Download AlmaLinux 9 RPM - name: Download AlmaLinux 9 RPM
uses: actions/download-artifact@v7 uses: actions/download-artifact@v4
with: with:
name: cc-metric-collector RPM for AlmaLinux 9 name: cc-metric-collector RPM for AlmaLinux 9
- name: Download AlmaLinux 9 SRPM - name: Download AlmaLinux 9 SRPM
uses: actions/download-artifact@v7 uses: actions/download-artifact@v4
with: with:
name: cc-metric-collector SRPM for AlmaLinux 9 name: cc-metric-collector SRPM for AlmaLinux 9
- name: Download UBI 8 RPM - name: Download UBI 8 RPM
uses: actions/download-artifact@v7 uses: actions/download-artifact@v4
with: with:
name: cc-metric-collector RPM for UBI 8 name: cc-metric-collector RPM for UBI 8
- name: Download UBI 8 SRPM - name: Download UBI 8 SRPM
uses: actions/download-artifact@v7 uses: actions/download-artifact@v4
with: with:
name: cc-metric-collector SRPM for UBI 8 name: cc-metric-collector SRPM for UBI 8
- name: Download UBI 9 RPM - name: Download UBI 9 RPM
uses: actions/download-artifact@v7 uses: actions/download-artifact@v4
with: with:
name: cc-metric-collector RPM for UBI 9 name: cc-metric-collector RPM for UBI 9
- name: Download UBI 9 SRPM - name: Download UBI 9 SRPM
uses: actions/download-artifact@v7 uses: actions/download-artifact@v4
with: with:
name: cc-metric-collector SRPM for UBI 9 name: cc-metric-collector SRPM for UBI 9
- name: Download Ubuntu 22.04 DEB - name: Download Ubuntu 22.04 DEB
uses: actions/download-artifact@v7 uses: actions/download-artifact@v4
with: with:
name: cc-metric-collector DEB for Ubuntu 22.04 name: cc-metric-collector DEB for Ubuntu 22.04
- name: Download Ubuntu 24.04 DEB - name: Download Ubuntu 24.04 DEB
uses: actions/download-artifact@v7 uses: actions/download-artifact@v4
with: with:
name: cc-metric-collector DEB for Ubuntu 24.04 name: cc-metric-collector DEB for Ubuntu 24.04

View File

@@ -16,7 +16,15 @@ jobs:
# #
build-latest: build-latest:
runs-on: ubuntu-latest runs-on: ubuntu-latest
container: ubuntu:24.04
env:
CGO_LDFLAGS : "-L/usr/lib"
steps: steps:
# Use apt to install development packages
- name: Install development packages
run: |
apt -qq update && apt -qq --assume-yes upgrade
apt --assume-yes -qq install build-essential sed git wget bash hwloc libhwloc-dev
# See: https://github.com/marketplace/actions/checkout # See: https://github.com/marketplace/actions/checkout
# Checkout git repository and submodules # Checkout git repository and submodules
- name: Checkout - name: Checkout
@@ -28,17 +36,7 @@ jobs:
- name: Setup Golang - name: Setup Golang
uses: actions/setup-go@v6 uses: actions/setup-go@v6
with: with:
go-version: 'stable' go-version-file: 'go.mod'
check-latest: true
- name: Install reviewdog
run: |
go install github.com/reviewdog/reviewdog/cmd/reviewdog@latest
# See: https://golangci-lint.run
- name: Install GolangCI-Lint
run: |
go install github.com/golangci/golangci-lint/v2/cmd/golangci-lint@latest
- name: Build MetricCollector - name: Build MetricCollector
run: make run: make
@@ -46,287 +44,247 @@ jobs:
- name: Run MetricCollector once - name: Run MetricCollector once
run: ./cc-metric-collector --once --config .github/ci-config.json run: ./cc-metric-collector --once --config .github/ci-config.json
# Running the linter requires likwid.h, which gets downloaded in the build step # #
- name: Static Analysis with GolangCI-Lint and Upload Report with reviewdog # # Build on AlmaLinux 8
run: | # #
golangci-lint run --enable errorlint,govet,misspell,modernize,prealloc,staticcheck,unconvert,wastedassign | reviewdog -f=golangci-lint -name "Check golangci-lint on build-latest" -reporter=github-check -filter-mode=nofilter -fail-level none # AlmaLinux8-RPM-build:
env: # runs-on: ubuntu-latest
REVIEWDOG_GITHUB_API_TOKEN: ${{ secrets.GITHUB_TOKEN }} # # See: https://hub.docker.com/_/almalinux
# container: almalinux:8
# # The job outputs link to the outputs of the 'rpmrename' step
# # Only job outputs can be used in child jobs
# steps:
# # # Use dnf to install development packages
# Build on AlmaLinux 8 using go-toolset # - name: Install development packages
# # run: |
AlmaLinux8-RPM-build: # dnf --assumeyes group install "Development Tools" "RPM Development Tools"
runs-on: ubuntu-latest # dnf --assumeyes install wget openssl-devel diffutils delve which
# See: https://hub.docker.com/_/almalinux
container: almalinux:8
# The job outputs link to the outputs of the 'rpmrename' step
# Only job outputs can be used in child jobs
steps:
# Use dnf to install development packages # # Checkout git repository and submodules
- name: Install development packages # # fetch-depth must be 0 to use git describe
run: | # # See: https://github.com/marketplace/actions/checkout
dnf --assumeyes group install "Development Tools" "RPM Development Tools" # - name: Checkout
dnf --assumeyes install wget openssl-devel diffutils delve which # uses: actions/checkout@v4
# with:
# submodules: recursive
# fetch-depth: 0
# Checkout git repository and submodules # # See: https://github.com/marketplace/actions/setup-go-environment
# fetch-depth must be 0 to use git describe # # - name: Setup Golang
# See: https://github.com/marketplace/actions/checkout # # uses: actions/setup-go@v5
- name: Checkout # # with:
uses: actions/checkout@v6 # # go-version: 'stable'
with: # - name: Setup Golang
submodules: recursive # run: |
fetch-depth: 0 # dnf --assumeyes --disableplugin=subscription-manager install \
# https://repo.almalinux.org/almalinux/8/AppStream/x86_64/os/Packages/go-toolset-1.23.9-1.module_el8.10.0+4000+1ad1b2cc.x86_64.rpm \
# https://repo.almalinux.org/almalinux/8/AppStream/x86_64/os/Packages/golang-1.23.9-1.module_el8.10.0+4000+1ad1b2cc.x86_64.rpm \
# https://repo.almalinux.org/almalinux/8/AppStream/x86_64/os/Packages/golang-bin-1.23.9-1.module_el8.10.0+4000+1ad1b2cc.x86_64.rpm \
# https://repo.almalinux.org/almalinux/8/AppStream/x86_64/os/Packages/golang-src-1.23.9-1.module_el8.10.0+4000+1ad1b2cc.noarch.rpm
- name: Setup Golang # - name: RPM build MetricCollector
run: | # id: rpmbuild
dnf --assumeyes --disableplugin=subscription-manager --enablerepo appstream install go-toolset # run: |
# git config --global --add safe.directory /__w/cc-metric-collector/cc-metric-collector
# make RPM
- name: RPM build MetricCollector # #
id: rpmbuild # # Build on AlmaLinux 9
run: | # #
git config --global --add safe.directory /__w/cc-metric-collector/cc-metric-collector # AlmaLinux9-RPM-build:
make RPM # runs-on: ubuntu-latest
# # See: https://hub.docker.com/_/almalinux
# container: almalinux:9
# # The job outputs link to the outputs of the 'rpmrename' step
# # Only job outputs can be used in child jobs
# steps:
# # # Use dnf to install development packages
# Build on AlmaLinux 9 using go-toolset # - name: Install development packages
# # run: |
AlmaLinux9-RPM-build: # dnf --assumeyes group install "Development Tools" "RPM Development Tools"
runs-on: ubuntu-latest # dnf --assumeyes install wget openssl-devel diffutils delve which
# See: https://hub.docker.com/_/almalinux
container: almalinux:9
# The job outputs link to the outputs of the 'rpmrename' step
# Only job outputs can be used in child jobs
steps:
# Use dnf to install development packages # # Checkout git repository and submodules
- name: Install development packages # # fetch-depth must be 0 to use git describe
run: | # # See: https://github.com/marketplace/actions/checkout
dnf --assumeyes group install "Development Tools" "RPM Development Tools" # - name: Checkout
dnf --assumeyes install wget openssl-devel diffutils delve which # uses: actions/checkout@v4
# with:
# submodules: recursive
# fetch-depth: 0
# Checkout git repository and submodules # # See: https://github.com/marketplace/actions/setup-go-environment
# fetch-depth must be 0 to use git describe # # - name: Setup Golang
# See: https://github.com/marketplace/actions/checkout # # uses: actions/setup-go@v5
- name: Checkout # # with:
uses: actions/checkout@v6 # # go-version: 'stable'
with: # - name: Setup Golang
submodules: recursive # run: |
fetch-depth: 0 # dnf --assumeyes --disableplugin=subscription-manager install \
# https://repo.almalinux.org/almalinux/9/AppStream/x86_64/os/Packages/go-toolset-1.25.3-1.el9_7.x86_64.rpm \
# https://repo.almalinux.org/almalinux/9/AppStream/x86_64/os/Packages/golang-1.25.3-1.el9_7.x86_64.rpm \
# https://repo.almalinux.org/almalinux/9/AppStream/x86_64/os/Packages/golang-bin-1.25.3-1.el9_7.x86_64.rpm \
# https://repo.almalinux.org/almalinux/9/AppStream/x86_64/os/Packages/golang-src-1.25.3-1.el9_7.noarch.rpm \
# https://repo.almalinux.org/almalinux/9/AppStream/x86_64/os/Packages/golang-race-1.25.3-1.el9_7.x86_64.rpm
- name: Setup Golang # - name: RPM build MetricCollector
run: | # id: rpmbuild
dnf --assumeyes --disableplugin=subscription-manager --enablerepo appstream install go-toolset # run: |
# git config --global --add safe.directory /__w/cc-metric-collector/cc-metric-collector
# make RPM
- name: RPM build MetricCollector
id: rpmbuild
run: |
git config --global --add safe.directory /__w/cc-metric-collector/cc-metric-collector
make RPM
# # #
# Build on AlmaLinux 10 using go-toolset # # Build on UBI 8 using go-toolset
# # #
AlmaLinux10-RPM-build: # UBI-8-RPM-build:
runs-on: ubuntu-latest # runs-on: ubuntu-latest
# See: https://hub.docker.com/_/almalinux # # See: https://catalog.redhat.com/software/containers/ubi8/ubi/5c359854d70cc534b3a3784e?container-tabs=gti
container: almalinux:10 # container: redhat/ubi8
# The job outputs link to the outputs of the 'rpmrename' step # # The job outputs link to the outputs of the 'rpmbuild' step
# Only job outputs can be used in child jobs # steps:
steps:
# Use dnf to install development packages # # Use dnf to install development packages
- name: Install development packages # - name: Install development packages
run: | # run: dnf --assumeyes --disableplugin=subscription-manager install rpm-build go-srpm-macros rpm-build-libs rpm-libs gcc make python38 git wget openssl-devel diffutils delve which
dnf --assumeyes group install "Development Tools" "RPM Development Tools"
dnf --assumeyes install wget openssl-devel diffutils delve which
# Checkout git repository and submodules # # Checkout git repository and submodules
# fetch-depth must be 0 to use git describe # # fetch-depth must be 0 to use git describe
# See: https://github.com/marketplace/actions/checkout # # See: https://github.com/marketplace/actions/checkout
- name: Checkout # - name: Checkout
uses: actions/checkout@v6 # uses: actions/checkout@v4
with: # with:
submodules: recursive # submodules: recursive
fetch-depth: 0 # fetch-depth: 0
- name: Setup Golang # # See: https://github.com/marketplace/actions/setup-go-environment
run: | # # - name: Setup Golang
dnf --assumeyes --disableplugin=subscription-manager --enablerepo appstream install go-toolset # # uses: actions/setup-go@v5
# # with:
# # go-version: 'stable'
# - name: Setup Golang
# run: |
# dnf --assumeyes --disableplugin=subscription-manager install \
# https://repo.almalinux.org/almalinux/8/AppStream/x86_64/os/Packages/go-toolset-1.23.9-1.module_el8.10.0+4000+1ad1b2cc.x86_64.rpm \
# https://repo.almalinux.org/almalinux/8/AppStream/x86_64/os/Packages/golang-1.23.9-1.module_el8.10.0+4000+1ad1b2cc.x86_64.rpm \
# https://repo.almalinux.org/almalinux/8/AppStream/x86_64/os/Packages/golang-bin-1.23.9-1.module_el8.10.0+4000+1ad1b2cc.x86_64.rpm \
# https://repo.almalinux.org/almalinux/8/AppStream/x86_64/os/Packages/golang-src-1.23.9-1.module_el8.10.0+4000+1ad1b2cc.noarch.rpm
- name: RPM build MetricCollector # - name: RPM build MetricCollector
id: rpmbuild # id: rpmbuild
run: | # run: |
git config --global --add safe.directory /__w/cc-metric-collector/cc-metric-collector # git config --global --add safe.directory /__w/cc-metric-collector/cc-metric-collector
make RPM # make RPM
# # #
# Build on Red Hat Universal Base Image (UBI 8) using go-toolset # # Build on UBI 9 using go-toolset
# # #
UBI-8-RPM-build: # UBI-9-RPM-build:
runs-on: ubuntu-latest # runs-on: ubuntu-latest
# See: https://catalog.redhat.com/en/search?searchType=Containers&q=Red+Hat+Universal+Base+Image+8 # # See: https://catalog.redhat.com/software/containers/ubi8/ubi/5c359854d70cc534b3a3784e?container-tabs=gti
# https://hub.docker.com/r/redhat/ubi8 # container: redhat/ubi9
container: redhat/ubi8 # # The job outputs link to the outputs of the 'rpmbuild' step
# The job outputs link to the outputs of the 'rpmbuild' step # steps:
steps:
# Use dnf to install development packages # # Use dnf to install development packages
- name: Install development packages # - name: Install development packages
run: dnf --assumeyes --disableplugin=subscription-manager install rpm-build go-srpm-macros rpm-build-libs rpm-libs gcc make python38 git wget openssl-devel diffutils delve which # run: dnf --assumeyes --disableplugin=subscription-manager install rpm-build go-srpm-macros gcc make python39 git wget openssl-devel diffutils delve
# Checkout git repository and submodules # # Checkout git repository and submodules
# fetch-depth must be 0 to use git describe # # fetch-depth must be 0 to use git describe
# See: https://github.com/marketplace/actions/checkout # # See: https://github.com/marketplace/actions/checkout
- name: Checkout # - name: Checkout
uses: actions/checkout@v6 # uses: actions/checkout@v4
with: # with:
submodules: recursive # submodules: recursive
fetch-depth: 0 # fetch-depth: 0
- name: Setup Golang # # See: https://github.com/marketplace/actions/setup-go-environment
run: | # # - name: Setup Golang
dnf --assumeyes --disableplugin=subscription-manager --enablerepo ubi-8-appstream-rpms install go-toolset # # uses: actions/setup-go@v5
# # with:
# # go-version: 'stable'
# - name: Setup Golang
# run: |
# dnf --assumeyes --disableplugin=subscription-manager install \
# https://repo.almalinux.org/almalinux/9/AppStream/x86_64/os/Packages/go-toolset-1.25.3-1.el9_7.x86_64.rpm \
# https://repo.almalinux.org/almalinux/9/AppStream/x86_64/os/Packages/golang-1.25.3-1.el9_7.x86_64.rpm \
# https://repo.almalinux.org/almalinux/9/AppStream/x86_64/os/Packages/golang-bin-1.25.3-1.el9_7.x86_64.rpm \
# https://repo.almalinux.org/almalinux/9/AppStream/x86_64/os/Packages/golang-src-1.25.3-1.el9_7.noarch.rpm \
# https://repo.almalinux.org/almalinux/9/AppStream/x86_64/os/Packages/golang-race-1.25.3-1.el9_7.x86_64.rpm
- name: RPM build MetricCollector # - name: RPM build MetricCollector
id: rpmbuild # id: rpmbuild
run: | # run: |
git config --global --add safe.directory /__w/cc-metric-collector/cc-metric-collector # git config --global --add safe.directory /__w/cc-metric-collector/cc-metric-collector
make RPM # make RPM
# # #
# Build on Red Hat Universal Base Image (UBI 9) using go-toolset # # Build on Ubuntu 22.04 using official go package
# # #
UBI-9-RPM-build: # Ubuntu-jammy-build:
runs-on: ubuntu-latest # runs-on: ubuntu-latest
# See: https://catalog.redhat.com/en/search?searchType=Containers&q=Red+Hat+Universal+Base+Image+9 # container: ubuntu:22.04
# https://hub.docker.com/r/redhat/ubi9
container: redhat/ubi9
# The job outputs link to the outputs of the 'rpmbuild' step
steps:
# Use dnf to install development packages # steps:
- name: Install development packages # # Use apt to install development packages
run: dnf --assumeyes --disableplugin=subscription-manager install rpm-build go-srpm-macros gcc make python39 git wget openssl-devel diffutils delve # - name: Install development packages
# run: |
# apt update && apt --assume-yes upgrade
# apt --assume-yes install build-essential sed git wget bash
# # Checkout git repository and submodules
# # fetch-depth must be 0 to use git describe
# # See: https://github.com/marketplace/actions/checkout
# - name: Checkout
# uses: actions/checkout@v4
# with:
# submodules: recursive
# fetch-depth: 0
# # Use official golang package
# # See: https://github.com/marketplace/actions/setup-go-environment
# - name: Setup Golang
# uses: actions/setup-go@v5
# with:
# go-version: 'stable'
# Checkout git repository and submodules # - name: DEB build MetricCollector
# fetch-depth must be 0 to use git describe # id: dpkg-build
# See: https://github.com/marketplace/actions/checkout # run: |
- name: Checkout # export PATH=/usr/local/go/bin:/usr/local/go/pkg/tool/linux_amd64:$PATH
uses: actions/checkout@v6 # make DEB
with:
submodules: recursive
fetch-depth: 0
- name: Setup Golang # #
run: | # # Build on Ubuntu 24.04 using official go package
dnf --assumeyes --disableplugin=subscription-manager --enablerepo ubi-9-appstream-rpms install go-toolset # #
# Ubuntu-noblenumbat-build:
# runs-on: ubuntu-latest
# container: ubuntu:24.04
- name: RPM build MetricCollector # steps:
id: rpmbuild # # Use apt to install development packages
run: | # - name: Install development packages
git config --global --add safe.directory /__w/cc-metric-collector/cc-metric-collector # run: |
make RPM # apt update && apt --assume-yes upgrade
# apt --assume-yes install build-essential sed git wget bash
# # Checkout git repository and submodules
# # fetch-depth must be 0 to use git describe
# # See: https://github.com/marketplace/actions/checkout
# - name: Checkout
# uses: actions/checkout@v4
# with:
# submodules: recursive
# fetch-depth: 0
# # Use official golang package
# # See: https://github.com/marketplace/actions/setup-go-environment
# - name: Setup Golang
# uses: actions/setup-go@v5
# with:
# go-version: 'stable'
# # - name: DEB build MetricCollector
# Build on Red Hat Universal Base Image (UBI 10) using go-toolset # id: dpkg-build
# # run: |
UBI-10-RPM-build: # export PATH=/usr/local/go/bin:/usr/local/go/pkg/tool/linux_amd64:$PATH
runs-on: ubuntu-latest # make DEB
# See: https://catalog.redhat.com/en/search?searchType=Containers&q=Red+Hat+Universal+Base+Image+10
# https://hub.docker.com/r/redhat/ubi10
container: redhat/ubi10
# The job outputs link to the outputs of the 'rpmbuild' step
steps:
# Use dnf to install development packages
- name: Install development packages
run: dnf --assumeyes --disableplugin=subscription-manager install rpm-build go-srpm-macros gcc make python3 git wget openssl-devel diffutils delve
# Checkout git repository and submodules
# fetch-depth must be 0 to use git describe
# See: https://github.com/marketplace/actions/checkout
- name: Checkout
uses: actions/checkout@v6
with:
submodules: recursive
fetch-depth: 0
- name: Setup Golang
run: |
dnf --assumeyes --disableplugin=subscription-manager --enablerepo ubi-10-for-x86_64-appstream-rpms install go-toolset
- name: RPM build MetricCollector
id: rpmbuild
run: |
git config --global --add safe.directory /__w/cc-metric-collector/cc-metric-collector
make RPM
#
# Build on Ubuntu 22.04 using official go package
#
Ubuntu-jammy-build:
runs-on: ubuntu-latest
container: ubuntu:22.04
steps:
# Use apt to install development packages
- name: Install development packages
run: |
apt update && apt --assume-yes upgrade
apt --assume-yes install build-essential sed git wget bash
# Checkout git repository and submodules
# fetch-depth must be 0 to use git describe
# See: https://github.com/marketplace/actions/checkout
- name: Checkout
uses: actions/checkout@v6
with:
submodules: recursive
fetch-depth: 0
# Use official golang package
# See: https://github.com/marketplace/actions/setup-go-environment
- name: Setup Golang
uses: actions/setup-go@v6
with:
go-version: 'stable'
- name: DEB build MetricCollector
id: dpkg-build
run: |
export PATH=/usr/local/go/bin:/usr/local/go/pkg/tool/linux_amd64:$PATH
make DEB
#
# Build on Ubuntu 24.04 using official go package
#
Ubuntu-noblenumbat-build:
runs-on: ubuntu-latest
container: ubuntu:24.04
steps:
# Use apt to install development packages
- name: Install development packages
run: |
apt update && apt --assume-yes upgrade
apt --assume-yes install build-essential sed git wget bash
# Checkout git repository and submodules
# fetch-depth must be 0 to use git describe
# See: https://github.com/marketplace/actions/checkout
- name: Checkout
uses: actions/checkout@v6
with:
submodules: recursive
fetch-depth: 0
# Use official golang package
# See: https://github.com/marketplace/actions/setup-go-environment
- name: Setup Golang
uses: actions/setup-go@v6
with:
go-version: 'stable'
- name: DEB build MetricCollector
id: dpkg-build
run: |
export PATH=/usr/local/go/bin:/usr/local/go/pkg/tool/linux_amd64:$PATH
make DEB

View File

@@ -72,11 +72,6 @@ staticcheck:
$(GOBIN) install honnef.co/go/tools/cmd/staticcheck@latest $(GOBIN) install honnef.co/go/tools/cmd/staticcheck@latest
$$($(GOBIN) env GOPATH)/bin/staticcheck ./... $$($(GOBIN) env GOPATH)/bin/staticcheck ./...
.PHONY: golangci-lint
golangci-lint:
$(GOBIN) install github.com/golangci/golangci-lint/v2/cmd/golangci-lint@latest
$$($(GOBIN) env GOPATH)/bin/golangci-lint run
.ONESHELL: .ONESHELL:
.PHONY: RPM .PHONY: RPM
RPM: scripts/cc-metric-collector.spec RPM: scripts/cc-metric-collector.spec

View File

@@ -36,10 +36,9 @@ There is a main configuration file with basic settings that point to the other c
"collectors-file" : "collectors.json", "collectors-file" : "collectors.json",
"receivers-file" : "receivers.json", "receivers-file" : "receivers.json",
"router-file" : "router.json", "router-file" : "router.json",
"main": { "startup-file": "startup.json",
"interval": "10s", "interval": "10s",
"duration": "1s" "duration": "1s"
}
} }
``` ```
@@ -51,6 +50,7 @@ See the component READMEs for their configuration:
* [`sinks`](https://github.com/ClusterCockpit/cc-lib/blob/main/sinks/README.md) * [`sinks`](https://github.com/ClusterCockpit/cc-lib/blob/main/sinks/README.md)
* [`receivers`](https://github.com/ClusterCockpit/cc-lib/blob/main/receivers/README.md) * [`receivers`](https://github.com/ClusterCockpit/cc-lib/blob/main/receivers/README.md)
* [`router`](./internal/metricRouter/README.md) * [`router`](./internal/metricRouter/README.md)
* [`startup`](https://github.com/ClusterCockpit/cc-lib/blob/main/ccStartup/README.md)
# Installation # Installation

View File

@@ -14,17 +14,18 @@ import (
"os/signal" "os/signal"
"syscall" "syscall"
"github.com/ClusterCockpit/cc-lib/v2/receivers" "github.com/ClusterCockpit/cc-lib/receivers"
"github.com/ClusterCockpit/cc-lib/v2/sinks" "github.com/ClusterCockpit/cc-lib/sinks"
"github.com/ClusterCockpit/cc-metric-collector/collectors" "github.com/ClusterCockpit/cc-metric-collector/collectors"
// "strings" // "strings"
"sync" "sync"
"time" "time"
ccconf "github.com/ClusterCockpit/cc-lib/v2/ccConfig" ccconf "github.com/ClusterCockpit/cc-lib/ccConfig"
cclog "github.com/ClusterCockpit/cc-lib/v2/ccLogger" cclog "github.com/ClusterCockpit/cc-lib/ccLogger"
lp "github.com/ClusterCockpit/cc-lib/v2/ccMessage" lp "github.com/ClusterCockpit/cc-lib/ccMessage"
start "github.com/ClusterCockpit/cc-lib/ccStartup"
mr "github.com/ClusterCockpit/cc-metric-collector/internal/metricRouter" mr "github.com/ClusterCockpit/cc-metric-collector/internal/metricRouter"
mct "github.com/ClusterCockpit/cc-metric-collector/pkg/multiChanTicker" mct "github.com/ClusterCockpit/cc-metric-collector/pkg/multiChanTicker"
) )
@@ -50,6 +51,30 @@ type RuntimeConfig struct {
Sync sync.WaitGroup Sync sync.WaitGroup
} }
//// Structure of the configuration file
//type GlobalConfig struct {
// Sink sinks.SinkConfig `json:"sink"`
// Interval int `json:"interval"`
// Duration int `json:"duration"`
// Collectors []string `json:"collectors"`
// Receiver receivers.ReceiverConfig `json:"receiver"`
// DefTags map[string]string `json:"default_tags"`
// CollectConfigs map[string]json.RawMessage `json:"collect_config"`
//}
//// Load JSON configuration file
//func LoadConfiguration(file string, config *GlobalConfig) error {
// configFile, err := os.Open(file)
// defer configFile.Close()
// if err != nil {
// fmt.Println(err.Error())
// return err
// }
// jsonParser := json.NewDecoder(configFile)
// err = jsonParser.Decode(config)
// return err
//}
func ReadCli() map[string]string { func ReadCli() map[string]string {
var m map[string]string var m map[string]string
cfg := flag.String("config", "./config.json", "Path to configuration file") cfg := flag.String("config", "./config.json", "Path to configuration file")
@@ -69,6 +94,22 @@ func ReadCli() map[string]string {
return m return m
} }
//func SetLogging(logfile string) error {
// var file *os.File
// var err error
// if logfile != "stderr" {
// file, err = os.OpenFile(logfile, os.O_APPEND|os.O_CREATE|os.O_WRONLY, 0600)
// if err != nil {
// log.Fatal(err)
// return err
// }
// } else {
// file = os.Stderr
// }
// log.SetOutput(file)
// return nil
//}
// General shutdownHandler function that gets executed in case of interrupt or graceful shutdownHandler // General shutdownHandler function that gets executed in case of interrupt or graceful shutdownHandler
func shutdownHandler(config *RuntimeConfig, shutdownSignal chan os.Signal) { func shutdownHandler(config *RuntimeConfig, shutdownSignal chan os.Signal) {
defer config.Sync.Done() defer config.Sync.Done()
@@ -176,6 +217,19 @@ func mainFunc() int {
return 1 return 1
} }
startupConf := ccconf.GetPackageConfig("startup")
if startupConf != nil && len(startupConf) > 0 {
err := start.CCStartup(startupConf)
if err != nil {
cclog.Errorf("Sending startup topology failed: %s", err.Error())
}
}
// Set log file
// if logfile := rcfg.CliArgs["logfile"]; logfile != "stderr" {
// cclog.SetOutput(logfile)
// }
// Creat new multi channel ticker // Creat new multi channel ticker
rcfg.MultiChanTicker = mct.NewTicker(rcfg.Interval) rcfg.MultiChanTicker = mct.NewTicker(rcfg.Interval)

View File

@@ -67,7 +67,7 @@ A collector reads data from any source, parses it to metrics and submits these m
* `Read(duration time.Duration, output chan ccMessage.CCMessage)`: Read, parse and submit data to the `output` channel as [`CCMessage`](https://github.com/ClusterCockpit/cc-lib/blob/main/ccMessage/README.md). If the collector has to measure anything for some duration, use the provided function argument `duration`. * `Read(duration time.Duration, output chan ccMessage.CCMessage)`: Read, parse and submit data to the `output` channel as [`CCMessage`](https://github.com/ClusterCockpit/cc-lib/blob/main/ccMessage/README.md). If the collector has to measure anything for some duration, use the provided function argument `duration`.
* `Close()`: Closes down the collector. * `Close()`: Closes down the collector.
It is recommended to call `setup()` in the `Init()` function. It is recommanded to call `setup()` in the `Init()` function.
Finally, the collector needs to be registered in the `collectorManager.go`. There is a list of collectors called `AvailableCollectors` which is a map (`collector_type_string` -> `pointer to MetricCollector interface`). Add a new entry with a descriptive name and the new collector. Finally, the collector needs to be registered in the `collectorManager.go`. There is a list of collectors called `AvailableCollectors` which is a map (`collector_type_string` -> `pointer to MetricCollector interface`). Add a new entry with a descriptive name and the new collector.
@@ -100,12 +100,11 @@ func (m *SampleCollector) Init(config json.RawMessage) error {
} }
m.name = "SampleCollector" m.name = "SampleCollector"
if err := m.setup(); err != nil { m.setup()
return fmt.Errorf("%s Init(): setup() call failed: %w", m.name, err)
}
if len(config) > 0 { if len(config) > 0 {
if err := json.Unmarshal(config, &m.config); err != nil { err := json.Unmarshal(config, &m.config)
return fmt.Errorf("%s Init(): json.Unmarshal() call failed: %w", m.name, err) if err != nil {
return err
} }
} }
m.meta = map[string]string{"source": m.name, "group": "Sample"} m.meta = map[string]string{"source": m.name, "group": "Sample"}

View File

@@ -17,13 +17,12 @@ import (
"os/exec" "os/exec"
"os/user" "os/user"
"regexp" "regexp"
"slices"
"strconv" "strconv"
"strings" "strings"
"time" "time"
cclog "github.com/ClusterCockpit/cc-lib/v2/ccLogger" cclog "github.com/ClusterCockpit/cc-lib/ccLogger"
lp "github.com/ClusterCockpit/cc-lib/v2/ccMessage" lp "github.com/ClusterCockpit/cc-lib/ccMessage"
) )
const DEFAULT_BEEGFS_CMD = "beegfs-ctl" const DEFAULT_BEEGFS_CMD = "beegfs-ctl"
@@ -62,9 +61,7 @@ func (m *BeegfsMetaCollector) Init(config json.RawMessage) error {
"rmXA", "setXA", "mirror"} "rmXA", "setXA", "mirror"}
m.name = "BeegfsMetaCollector" m.name = "BeegfsMetaCollector"
if err := m.setup(); err != nil { m.setup()
return fmt.Errorf("%s Init(): setup() call failed: %w", m.name, err)
}
m.parallel = true m.parallel = true
// Set default beegfs-ctl binary // Set default beegfs-ctl binary
@@ -78,10 +75,11 @@ func (m *BeegfsMetaCollector) Init(config json.RawMessage) error {
} }
} }
// Create map with possible variables //create map with possible variables
m.matches = make(map[string]string) m.matches = make(map[string]string)
for _, value := range nodeMdstat_array { for _, value := range nodeMdstat_array {
if slices.Contains(m.config.ExcludeMetrics, value) { _, skip := stringArrayContains(m.config.ExcludeMetrics, value)
if skip {
m.matches["other"] = "0" m.matches["other"] = "0"
} else { } else {
m.matches["beegfs_cmeta_"+value] = "0" m.matches["beegfs_cmeta_"+value] = "0"
@@ -104,7 +102,7 @@ func (m *BeegfsMetaCollector) Init(config json.RawMessage) error {
// Beegfs file system statistics can only be queried by user root // Beegfs file system statistics can only be queried by user root
user, err := user.Current() user, err := user.Current()
if err != nil { if err != nil {
return fmt.Errorf("BeegfsMetaCollector.Init(): Failed to get current user: %w", err) return fmt.Errorf("BeegfsMetaCollector.Init(): Failed to get current user: %v", err)
} }
if user.Uid != "0" { if user.Uid != "0" {
return fmt.Errorf("BeegfsMetaCollector.Init(): BeeGFS file system statistics can only be queried by user root") return fmt.Errorf("BeegfsMetaCollector.Init(): BeeGFS file system statistics can only be queried by user root")
@@ -113,7 +111,7 @@ func (m *BeegfsMetaCollector) Init(config json.RawMessage) error {
// Check if beegfs-ctl is in executable search path // Check if beegfs-ctl is in executable search path
_, err = exec.LookPath(m.config.Beegfs) _, err = exec.LookPath(m.config.Beegfs)
if err != nil { if err != nil {
return fmt.Errorf("BeegfsMetaCollector.Init(): Failed to find beegfs-ctl binary '%s': %w", m.config.Beegfs, err) return fmt.Errorf("BeegfsMetaCollector.Init(): Failed to find beegfs-ctl binary '%s': %v", m.config.Beegfs, err)
} }
m.init = true m.init = true
return nil return nil
@@ -123,7 +121,7 @@ func (m *BeegfsMetaCollector) Read(interval time.Duration, output chan lp.CCMess
if !m.init { if !m.init {
return return
} }
// Get mounpoint //get mounpoint
buffer, _ := os.ReadFile(string("/proc/mounts")) buffer, _ := os.ReadFile(string("/proc/mounts"))
mounts := strings.Split(string(buffer), "\n") mounts := strings.Split(string(buffer), "\n")
var mountpoints []string var mountpoints []string
@@ -164,15 +162,12 @@ func (m *BeegfsMetaCollector) Read(interval time.Duration, output chan lp.CCMess
cmd.Stderr = cmdStderr cmd.Stderr = cmdStderr
err := cmd.Run() err := cmd.Run()
if err != nil { if err != nil {
dataStdErr, _ := io.ReadAll(cmdStderr) fmt.Fprintf(os.Stderr, "BeegfsMetaCollector.Read(): Failed to execute command \"%s\": %s\n", cmd.String(), err.Error())
dataStdOut, _ := io.ReadAll(cmdStdout) fmt.Fprintf(os.Stderr, "BeegfsMetaCollector.Read(): command exit code: \"%d\"\n", cmd.ProcessState.ExitCode())
cclog.ComponentError( data, _ := io.ReadAll(cmdStderr)
m.name, fmt.Fprintf(os.Stderr, "BeegfsMetaCollector.Read(): command stderr: \"%s\"\n", string(data))
fmt.Sprintf("Read(): Failed to execute command \"%s\": %v\n", cmd.String(), err), data, _ = io.ReadAll(cmdStdout)
fmt.Sprintf("Read(): command exit code: \"%d\"\n", cmd.ProcessState.ExitCode()), fmt.Fprintf(os.Stderr, "BeegfsMetaCollector.Read(): command stdout: \"%s\"\n", string(data))
fmt.Sprintf("Read(): command stderr: \"%s\"\n", string(dataStdErr)),
fmt.Sprintf("Read(): command stdout: \"%s\"\n", string(dataStdOut)),
)
return return
} }
// Read I/O statistics // Read I/O statistics
@@ -228,7 +223,7 @@ func (m *BeegfsMetaCollector) Read(interval time.Duration, output chan lp.CCMess
for key, data := range m.matches { for key, data := range m.matches {
value, _ := strconv.ParseFloat(data, 32) value, _ := strconv.ParseFloat(data, 32)
y, err := lp.NewMessage(key, m.tags, m.meta, map[string]any{"value": value}, time.Now()) y, err := lp.NewMessage(key, m.tags, m.meta, map[string]interface{}{"value": value}, time.Now())
if err == nil { if err == nil {
output <- y output <- y
} }

View File

@@ -17,13 +17,12 @@ import (
"os/exec" "os/exec"
"os/user" "os/user"
"regexp" "regexp"
"slices"
"strconv" "strconv"
"strings" "strings"
"time" "time"
cclog "github.com/ClusterCockpit/cc-lib/v2/ccLogger" cclog "github.com/ClusterCockpit/cc-lib/ccLogger"
lp "github.com/ClusterCockpit/cc-lib/v2/ccMessage" lp "github.com/ClusterCockpit/cc-lib/ccMessage"
) )
// Struct for the collector-specific JSON config // Struct for the collector-specific JSON config
@@ -55,9 +54,7 @@ func (m *BeegfsStorageCollector) Init(config json.RawMessage) error {
"storInf", "unlnk"} "storInf", "unlnk"}
m.name = "BeegfsStorageCollector" m.name = "BeegfsStorageCollector"
if err := m.setup(); err != nil { m.setup()
return fmt.Errorf("%s Init(): setup() call failed: %w", m.name, err)
}
m.parallel = true m.parallel = true
// Set default beegfs-ctl binary // Set default beegfs-ctl binary
@@ -74,7 +71,8 @@ func (m *BeegfsStorageCollector) Init(config json.RawMessage) error {
//create map with possible variables //create map with possible variables
m.matches = make(map[string]string) m.matches = make(map[string]string)
for _, value := range storageStat_array { for _, value := range storageStat_array {
if slices.Contains(m.config.ExcludeMetrics, value) { _, skip := stringArrayContains(m.config.ExcludeMetrics, value)
if skip {
m.matches["other"] = "0" m.matches["other"] = "0"
} else { } else {
m.matches["beegfs_cstorage_"+value] = "0" m.matches["beegfs_cstorage_"+value] = "0"
@@ -97,7 +95,7 @@ func (m *BeegfsStorageCollector) Init(config json.RawMessage) error {
// Beegfs file system statistics can only be queried by user root // Beegfs file system statistics can only be queried by user root
user, err := user.Current() user, err := user.Current()
if err != nil { if err != nil {
return fmt.Errorf("BeegfsStorageCollector.Init(): Failed to get current user: %w", err) return fmt.Errorf("BeegfsStorageCollector.Init(): Failed to get current user: %v", err)
} }
if user.Uid != "0" { if user.Uid != "0" {
return fmt.Errorf("BeegfsStorageCollector.Init(): BeeGFS file system statistics can only be queried by user root") return fmt.Errorf("BeegfsStorageCollector.Init(): BeeGFS file system statistics can only be queried by user root")
@@ -106,7 +104,7 @@ func (m *BeegfsStorageCollector) Init(config json.RawMessage) error {
// Check if beegfs-ctl is in executable search path // Check if beegfs-ctl is in executable search path
_, err = exec.LookPath(m.config.Beegfs) _, err = exec.LookPath(m.config.Beegfs)
if err != nil { if err != nil {
return fmt.Errorf("BeegfsStorageCollector.Init(): Failed to find beegfs-ctl binary '%s': %w", m.config.Beegfs, err) return fmt.Errorf("BeegfsStorageCollector.Init(): Failed to find beegfs-ctl binary '%s': %v", m.config.Beegfs, err)
} }
m.init = true m.init = true
return nil return nil
@@ -156,15 +154,12 @@ func (m *BeegfsStorageCollector) Read(interval time.Duration, output chan lp.CCM
cmd.Stderr = cmdStderr cmd.Stderr = cmdStderr
err := cmd.Run() err := cmd.Run()
if err != nil { if err != nil {
dataStdErr, _ := io.ReadAll(cmdStderr) fmt.Fprintf(os.Stderr, "BeegfsStorageCollector.Read(): Failed to execute command \"%s\": %s\n", cmd.String(), err.Error())
dataStdOut, _ := io.ReadAll(cmdStdout) fmt.Fprintf(os.Stderr, "BeegfsStorageCollector.Read(): command exit code: \"%d\"\n", cmd.ProcessState.ExitCode())
cclog.ComponentError( data, _ := io.ReadAll(cmdStderr)
m.name, fmt.Fprintf(os.Stderr, "BeegfsStorageCollector.Read(): command stderr: \"%s\"\n", string(data))
fmt.Sprintf("Read(): Failed to execute command \"%s\": %v\n", cmd.String(), err), data, _ = io.ReadAll(cmdStdout)
fmt.Sprintf("Read(): command exit code: \"%d\"\n", cmd.ProcessState.ExitCode()), fmt.Fprintf(os.Stderr, "BeegfsStorageCollector.Read(): command stdout: \"%s\"\n", string(data))
fmt.Sprintf("Read(): command stderr: \"%s\"\n", string(dataStdErr)),
fmt.Sprintf("Read(): command stdout: \"%s\"\n", string(dataStdOut)),
)
return return
} }
// Read I/O statistics // Read I/O statistics
@@ -220,7 +215,7 @@ func (m *BeegfsStorageCollector) Read(interval time.Duration, output chan lp.CCM
for key, data := range m.matches { for key, data := range m.matches {
value, _ := strconv.ParseFloat(data, 32) value, _ := strconv.ParseFloat(data, 32)
y, err := lp.NewMessage(key, m.tags, m.meta, map[string]any{"value": value}, time.Now()) y, err := lp.NewMessage(key, m.tags, m.meta, map[string]interface{}{"value": value}, time.Now())
if err == nil { if err == nil {
output <- y output <- y
} }

View File

@@ -9,12 +9,11 @@ package collectors
import ( import (
"encoding/json" "encoding/json"
"fmt"
"sync" "sync"
"time" "time"
cclog "github.com/ClusterCockpit/cc-lib/v2/ccLogger" cclog "github.com/ClusterCockpit/cc-lib/ccLogger"
lp "github.com/ClusterCockpit/cc-lib/v2/ccMessage" lp "github.com/ClusterCockpit/cc-lib/ccMessage"
mct "github.com/ClusterCockpit/cc-metric-collector/pkg/multiChanTicker" mct "github.com/ClusterCockpit/cc-metric-collector/pkg/multiChanTicker"
) )
@@ -105,7 +104,7 @@ func (cm *collectorManager) Init(ticker mct.MultiChanTicker, duration time.Durat
err = collector.Init(collectorCfg) err = collector.Init(collectorCfg)
if err != nil { if err != nil {
cclog.ComponentError("CollectorManager", fmt.Sprintf("Collector %s initialization failed: %v", collectorName, err)) cclog.ComponentError("CollectorManager", "Collector", collectorName, "initialization failed:", err.Error())
continue continue
} }
cclog.ComponentDebug("CollectorManager", "ADD COLLECTOR", collector.Name()) cclog.ComponentDebug("CollectorManager", "ADD COLLECTOR", collector.Name())

View File

@@ -17,8 +17,8 @@ import (
"strings" "strings"
"time" "time"
cclog "github.com/ClusterCockpit/cc-lib/v2/ccLogger" cclog "github.com/ClusterCockpit/cc-lib/ccLogger"
lp "github.com/ClusterCockpit/cc-lib/v2/ccMessage" lp "github.com/ClusterCockpit/cc-lib/ccMessage"
) )
// CPUFreqCollector // CPUFreqCollector
@@ -35,16 +35,15 @@ type CPUFreqCpuInfoCollector struct {
topology []CPUFreqCpuInfoCollectorTopology topology []CPUFreqCpuInfoCollectorTopology
} }
func (m *CPUFreqCpuInfoCollector) Init(_ json.RawMessage) error { func (m *CPUFreqCpuInfoCollector) Init(config json.RawMessage) error {
// Check if already initialized // Check if already initialized
if m.init { if m.init {
return nil return nil
} }
m.setup()
m.name = "CPUFreqCpuInfoCollector" m.name = "CPUFreqCpuInfoCollector"
if err := m.setup(); err != nil {
return fmt.Errorf("%s Init(): setup() call failed: %w", m.name, err)
}
m.parallel = true m.parallel = true
m.meta = map[string]string{ m.meta = map[string]string{
"source": m.name, "source": m.name,
@@ -55,8 +54,9 @@ func (m *CPUFreqCpuInfoCollector) Init(_ json.RawMessage) error {
const cpuInfoFile = "/proc/cpuinfo" const cpuInfoFile = "/proc/cpuinfo"
file, err := os.Open(cpuInfoFile) file, err := os.Open(cpuInfoFile)
if err != nil { if err != nil {
return fmt.Errorf("%s Init(): failed to open file '%s': %w", m.name, cpuInfoFile, err) return fmt.Errorf("failed to open file '%s': %v", cpuInfoFile, err)
} }
defer file.Close()
// Collect topology information from file cpuinfo // Collect topology information from file cpuinfo
foundFreq := false foundFreq := false
@@ -86,10 +86,6 @@ func (m *CPUFreqCpuInfoCollector) Init(_ json.RawMessage) error {
} }
} }
if err := file.Close(); err != nil {
return fmt.Errorf("%s Init(): Call to file.Close() failed: %w", m.name, err)
}
// were all topology information collected? // were all topology information collected?
if foundFreq && if foundFreq &&
len(processor) > 0 && len(processor) > 0 &&
@@ -123,7 +119,7 @@ func (m *CPUFreqCpuInfoCollector) Init(_ json.RawMessage) error {
// Check if at least one CPU with frequency information was detected // Check if at least one CPU with frequency information was detected
if len(m.topology) == 0 { if len(m.topology) == 0 {
return fmt.Errorf("%s Init(): no CPU frequency info found in %s", m.name, cpuInfoFile) return fmt.Errorf("no CPU frequency info found in %s", cpuInfoFile)
} }
m.init = true m.init = true
@@ -144,13 +140,7 @@ func (m *CPUFreqCpuInfoCollector) Read(interval time.Duration, output chan lp.CC
fmt.Sprintf("Read(): Failed to open file '%s': %v", cpuInfoFile, err)) fmt.Sprintf("Read(): Failed to open file '%s': %v", cpuInfoFile, err))
return return
} }
defer func() { defer file.Close()
if err := file.Close(); err != nil {
cclog.ComponentError(
m.name,
fmt.Sprintf("Read(): Failed to close file '%s': %v", cpuInfoFile, err))
}
}()
processorCounter := 0 processorCounter := 0
now := time.Now() now := time.Now()
@@ -171,7 +161,7 @@ func (m *CPUFreqCpuInfoCollector) Read(interval time.Duration, output chan lp.CC
fmt.Sprintf("Read(): Failed to convert cpu MHz '%s' to float64: %v", lineSplit[1], err)) fmt.Sprintf("Read(): Failed to convert cpu MHz '%s' to float64: %v", lineSplit[1], err))
return return
} }
if y, err := lp.NewMessage("cpufreq", t.tagSet, m.meta, map[string]any{"value": value}, now); err == nil { if y, err := lp.NewMessage("cpufreq", t.tagSet, m.meta, map[string]interface{}{"value": value}, now); err == nil {
output <- y output <- y
} }
} }

View File

@@ -16,8 +16,8 @@ import (
"strings" "strings"
"time" "time"
cclog "github.com/ClusterCockpit/cc-lib/v2/ccLogger" cclog "github.com/ClusterCockpit/cc-lib/ccLogger"
lp "github.com/ClusterCockpit/cc-lib/v2/ccMessage" lp "github.com/ClusterCockpit/cc-lib/ccMessage"
"github.com/ClusterCockpit/cc-metric-collector/pkg/ccTopology" "github.com/ClusterCockpit/cc-metric-collector/pkg/ccTopology"
"golang.org/x/sys/unix" "golang.org/x/sys/unix"
) )
@@ -48,9 +48,7 @@ func (m *CPUFreqCollector) Init(config json.RawMessage) error {
} }
m.name = "CPUFreqCollector" m.name = "CPUFreqCollector"
if err := m.setup(); err != nil { m.setup()
return fmt.Errorf("%s Init(): setup() call failed: %w", m.name, err)
}
m.parallel = true m.parallel = true
if len(config) > 0 { if len(config) > 0 {
err := json.Unmarshal(config, &m.config) err := json.Unmarshal(config, &m.config)
@@ -76,15 +74,15 @@ func (m *CPUFreqCollector) Init(config json.RawMessage) error {
scalingCurFreqFile := filepath.Join("/sys/devices/system/cpu", fmt.Sprintf("cpu%d", c.CpuID), "cpufreq/scaling_cur_freq") scalingCurFreqFile := filepath.Join("/sys/devices/system/cpu", fmt.Sprintf("cpu%d", c.CpuID), "cpufreq/scaling_cur_freq")
err := unix.Access(scalingCurFreqFile, unix.R_OK) err := unix.Access(scalingCurFreqFile, unix.R_OK)
if err != nil { if err != nil {
return fmt.Errorf("unable to access file '%s': %w", scalingCurFreqFile, err) return fmt.Errorf("unable to access file '%s': %v", scalingCurFreqFile, err)
} }
m.topology = append(m.topology, m.topology = append(m.topology,
CPUFreqCollectorTopology{ CPUFreqCollectorTopology{
tagSet: map[string]string{ tagSet: map[string]string{
"type": "hwthread", "type": "hwthread",
"type-id": strconv.Itoa(c.CpuID), "type-id": fmt.Sprint(c.CpuID),
"package_id": strconv.Itoa(c.Socket), "package_id": fmt.Sprint(c.Socket),
}, },
scalingCurFreqFile: scalingCurFreqFile, scalingCurFreqFile: scalingCurFreqFile,
}, },
@@ -126,7 +124,7 @@ func (m *CPUFreqCollector) Read(interval time.Duration, output chan lp.CCMessage
continue continue
} }
if y, err := lp.NewMessage("cpufreq", t.tagSet, m.meta, map[string]any{"value": cpuFreq}, now); err == nil { if y, err := lp.NewMessage("cpufreq", t.tagSet, m.meta, map[string]interface{}{"value": cpuFreq}, now); err == nil {
output <- y output <- y
} }
} }

View File

@@ -12,13 +12,12 @@ import (
"encoding/json" "encoding/json"
"fmt" "fmt"
"os" "os"
"slices"
"strconv" "strconv"
"strings" "strings"
"time" "time"
cclog "github.com/ClusterCockpit/cc-lib/v2/ccLogger" cclog "github.com/ClusterCockpit/cc-lib/ccLogger"
lp "github.com/ClusterCockpit/cc-lib/v2/ccMessage" lp "github.com/ClusterCockpit/cc-lib/ccMessage"
sysconf "github.com/tklauser/go-sysconf" sysconf "github.com/tklauser/go-sysconf"
) )
@@ -40,17 +39,10 @@ type CpustatCollector struct {
func (m *CpustatCollector) Init(config json.RawMessage) error { func (m *CpustatCollector) Init(config json.RawMessage) error {
m.name = "CpustatCollector" m.name = "CpustatCollector"
if err := m.setup(); err != nil { m.setup()
return fmt.Errorf("%s Init(): setup() call failed: %w", m.name, err)
}
m.parallel = true m.parallel = true
m.meta = map[string]string{ m.meta = map[string]string{"source": m.name, "group": "CPU"}
"source": m.name, m.nodetags = map[string]string{"type": "node"}
"group": "CPU",
}
m.nodetags = map[string]string{
"type": "node",
}
if len(config) > 0 { if len(config) > 0 {
err := json.Unmarshal(config, &m.config) err := json.Unmarshal(config, &m.config)
if err != nil { if err != nil {
@@ -72,7 +64,14 @@ func (m *CpustatCollector) Init(config json.RawMessage) error {
m.matches = make(map[string]int) m.matches = make(map[string]int)
for match, index := range matches { for match, index := range matches {
if !slices.Contains(m.config.ExcludeMetrics, match) { doExclude := false
for _, exclude := range m.config.ExcludeMetrics {
if match == exclude {
doExclude = true
break
}
}
if !doExclude {
m.matches[match] = index m.matches[match] = index
} }
} }
@@ -80,17 +79,9 @@ func (m *CpustatCollector) Init(config json.RawMessage) error {
// Check input file // Check input file
file, err := os.Open(string(CPUSTATFILE)) file, err := os.Open(string(CPUSTATFILE))
if err != nil { if err != nil {
cclog.ComponentError( cclog.ComponentError(m.name, err.Error())
m.name,
fmt.Sprintf("Init(): Failed to open file '%s': %v", string(CPUSTATFILE), err))
} }
defer func() { defer file.Close()
if err := file.Close(); err != nil {
cclog.ComponentError(
m.name,
fmt.Sprintf("Init(): Failed to close file '%s': %v", string(CPUSTATFILE), err))
}
}()
// Pre-generate tags for all CPUs // Pre-generate tags for all CPUs
num_cpus := 0 num_cpus := 0
@@ -108,9 +99,7 @@ func (m *CpustatCollector) Init(config json.RawMessage) error {
} else if strings.HasPrefix(linefields[0], "cpu") && strings.Compare(linefields[0], "cpu") != 0 { } else if strings.HasPrefix(linefields[0], "cpu") && strings.Compare(linefields[0], "cpu") != 0 {
cpustr := strings.TrimLeft(linefields[0], "cpu") cpustr := strings.TrimLeft(linefields[0], "cpu")
cpu, _ := strconv.Atoi(cpustr) cpu, _ := strconv.Atoi(cpustr)
m.cputags[linefields[0]] = map[string]string{ m.cputags[linefields[0]] = map[string]string{"type": "hwthread", "type-id": fmt.Sprintf("%d", cpu)}
"type": "hwthread",
"type-id": strconv.Itoa(cpu)}
m.olddata[linefields[0]] = make(map[string]int64) m.olddata[linefields[0]] = make(map[string]int64)
for k, v := range m.matches { for k, v := range m.matches {
m.olddata[linefields[0]][k], _ = strconv.ParseInt(linefields[v], 0, 64) m.olddata[linefields[0]][k], _ = strconv.ParseInt(linefields[v], 0, 64)
@@ -140,7 +129,7 @@ func (m *CpustatCollector) parseStatLine(linefields []string, tags map[string]st
sum := float64(0) sum := float64(0)
for name, value := range values { for name, value := range values {
sum += value sum += value
y, err := lp.NewMessage(name, tags, m.meta, map[string]any{"value": value * 100}, now) y, err := lp.NewMessage(name, tags, m.meta, map[string]interface{}{"value": value * 100}, now)
if err == nil { if err == nil {
y.AddTag("unit", "Percent") y.AddTag("unit", "Percent")
output <- y output <- y
@@ -148,7 +137,7 @@ func (m *CpustatCollector) parseStatLine(linefields []string, tags map[string]st
} }
if v, ok := values["cpu_idle"]; ok { if v, ok := values["cpu_idle"]; ok {
sum -= v sum -= v
y, err := lp.NewMessage("cpu_used", tags, m.meta, map[string]any{"value": sum * 100}, now) y, err := lp.NewMessage("cpu_used", tags, m.meta, map[string]interface{}{"value": sum * 100}, now)
if err == nil { if err == nil {
y.AddTag("unit", "Percent") y.AddTag("unit", "Percent")
output <- y output <- y
@@ -166,17 +155,9 @@ func (m *CpustatCollector) Read(interval time.Duration, output chan lp.CCMessage
file, err := os.Open(string(CPUSTATFILE)) file, err := os.Open(string(CPUSTATFILE))
if err != nil { if err != nil {
cclog.ComponentError( cclog.ComponentError(m.name, err.Error())
m.name,
fmt.Sprintf("Read(): Failed to open file '%s': %v", string(CPUSTATFILE), err))
} }
defer func() { defer file.Close()
if err := file.Close(); err != nil {
cclog.ComponentError(
m.name,
fmt.Sprintf("Read(): Failed to close file '%s': %v", string(CPUSTATFILE), err))
}
}()
scanner := bufio.NewScanner(file) scanner := bufio.NewScanner(file)
for scanner.Scan() { for scanner.Scan() {
@@ -193,7 +174,7 @@ func (m *CpustatCollector) Read(interval time.Duration, output chan lp.CCMessage
num_cpus_metric, err := lp.NewMessage("num_cpus", num_cpus_metric, err := lp.NewMessage("num_cpus",
m.nodetags, m.nodetags,
m.meta, m.meta,
map[string]any{"value": num_cpus}, map[string]interface{}{"value": int(num_cpus)},
now, now,
) )
if err == nil { if err == nil {

View File

@@ -10,16 +10,13 @@ package collectors
import ( import (
"encoding/json" "encoding/json"
"errors" "errors"
"fmt"
"log" "log"
"os" "os"
"os/exec" "os/exec"
"slices"
"strings" "strings"
"time" "time"
cclog "github.com/ClusterCockpit/cc-lib/v2/ccLogger" lp "github.com/ClusterCockpit/cc-lib/ccMessage"
lp "github.com/ClusterCockpit/cc-lib/v2/ccMessage"
influx "github.com/influxdata/line-protocol" influx "github.com/influxdata/line-protocol"
) )
@@ -44,31 +41,22 @@ func (m *CustomCmdCollector) Init(config json.RawMessage) error {
var err error var err error
m.name = "CustomCmdCollector" m.name = "CustomCmdCollector"
m.parallel = true m.parallel = true
m.meta = map[string]string{ m.meta = map[string]string{"source": m.name, "group": "Custom"}
"source": m.name,
"group": "Custom",
}
if len(config) > 0 { if len(config) > 0 {
err = json.Unmarshal(config, &m.config) err = json.Unmarshal(config, &m.config)
if err != nil { if err != nil {
return fmt.Errorf("%s Init(): json.Unmarshal() call failed: %w", m.name, err) log.Print(err.Error())
return err
} }
} }
if err := m.setup(); err != nil { m.setup()
return fmt.Errorf("%s Init(): setup() call failed: %w", m.name, err)
}
for _, c := range m.config.Commands { for _, c := range m.config.Commands {
cmdfields := strings.Fields(c) cmdfields := strings.Fields(c)
command := exec.Command(cmdfields[0], cmdfields[1:]...) command := exec.Command(cmdfields[0], strings.Join(cmdfields[1:], " "))
command.Wait()
_, err = command.Output() _, err = command.Output()
if err == nil { if err == nil {
m.commands = append(m.commands, c) m.commands = append(m.commands, c)
} else {
cclog.ComponentWarn(
m.name,
fmt.Sprintf("%s Init(): Execution of command \"%s\" failed: %v", m.name, command.String(), err),
)
continue
} }
} }
for _, f := range m.config.Files { for _, f := range m.config.Files {
@@ -76,10 +64,7 @@ func (m *CustomCmdCollector) Init(config json.RawMessage) error {
if err == nil { if err == nil {
m.files = append(m.files, f) m.files = append(m.files, f)
} else { } else {
cclog.ComponentWarn( log.Print(err.Error())
m.name,
fmt.Sprintf("%s Init(): Reading of file \"%s\" failed: %v", m.name, f, err),
)
continue continue
} }
} }
@@ -103,11 +88,8 @@ func (m *CustomCmdCollector) Read(interval time.Duration, output chan lp.CCMessa
} }
for _, cmd := range m.commands { for _, cmd := range m.commands {
cmdfields := strings.Fields(cmd) cmdfields := strings.Fields(cmd)
command := exec.Command(cmdfields[0], cmdfields[1:]...) command := exec.Command(cmdfields[0], strings.Join(cmdfields[1:], " "))
if err := command.Wait(); err != nil { command.Wait()
log.Print(err)
continue
}
stdout, err := command.Output() stdout, err := command.Output()
if err != nil { if err != nil {
log.Print(err) log.Print(err)
@@ -119,7 +101,8 @@ func (m *CustomCmdCollector) Read(interval time.Duration, output chan lp.CCMessa
continue continue
} }
for _, c := range cmdmetrics { for _, c := range cmdmetrics {
if slices.Contains(m.config.ExcludeMetrics, c.Name()) { _, skip := stringArrayContains(m.config.ExcludeMetrics, c.Name())
if skip {
continue continue
} }
@@ -138,7 +121,8 @@ func (m *CustomCmdCollector) Read(interval time.Duration, output chan lp.CCMessa
continue continue
} }
for _, f := range fmetrics { for _, f := range fmetrics {
if slices.Contains(m.config.ExcludeMetrics, f.Name()) { _, skip := stringArrayContains(m.config.ExcludeMetrics, f.Name())
if skip {
continue continue
} }
output <- lp.FromInfluxMetric(f) output <- lp.FromInfluxMetric(f)

View File

@@ -10,14 +10,13 @@ package collectors
import ( import (
"bufio" "bufio"
"encoding/json" "encoding/json"
"fmt"
"os" "os"
"strings" "strings"
"syscall" "syscall"
"time" "time"
cclog "github.com/ClusterCockpit/cc-lib/v2/ccLogger" cclog "github.com/ClusterCockpit/cc-lib/ccLogger"
lp "github.com/ClusterCockpit/cc-lib/v2/ccMessage" lp "github.com/ClusterCockpit/cc-lib/ccMessage"
) )
const MOUNTFILE = `/proc/self/mounts` const MOUNTFILE = `/proc/self/mounts`
@@ -37,9 +36,7 @@ func (m *DiskstatCollector) Init(config json.RawMessage) error {
m.name = "DiskstatCollector" m.name = "DiskstatCollector"
m.parallel = true m.parallel = true
m.meta = map[string]string{"source": m.name, "group": "Disk"} m.meta = map[string]string{"source": m.name, "group": "Disk"}
if err := m.setup(); err != nil { m.setup()
return fmt.Errorf("%s Init(): setup() call failed: %w", m.name, err)
}
if len(config) > 0 { if len(config) > 0 {
if err := json.Unmarshal(config, &m.config); err != nil { if err := json.Unmarshal(config, &m.config); err != nil {
return err return err
@@ -57,11 +54,10 @@ func (m *DiskstatCollector) Init(config json.RawMessage) error {
} }
file, err := os.Open(MOUNTFILE) file, err := os.Open(MOUNTFILE)
if err != nil { if err != nil {
return fmt.Errorf("%s Init(): file open for file \"%s\" failed: %w", m.name, MOUNTFILE, err) cclog.ComponentError(m.name, err.Error())
} return err
if err := file.Close(); err != nil {
return fmt.Errorf("%s Init(): file close for file \"%s\" failed: %w", m.name, MOUNTFILE, err)
} }
defer file.Close()
m.init = true m.init = true
return nil return nil
} }
@@ -73,18 +69,10 @@ func (m *DiskstatCollector) Read(interval time.Duration, output chan lp.CCMessag
file, err := os.Open(MOUNTFILE) file, err := os.Open(MOUNTFILE)
if err != nil { if err != nil {
cclog.ComponentError( cclog.ComponentError(m.name, err.Error())
m.name,
fmt.Sprintf("Read(): Failed to open file '%s': %v", MOUNTFILE, err))
return return
} }
defer func() { defer file.Close()
if err := file.Close(); err != nil {
cclog.ComponentError(
m.name,
fmt.Sprintf("Read(): Failed to close file '%s': %v", MOUNTFILE, err))
}
}()
part_max_used := uint64(0) part_max_used := uint64(0)
scanner := bufio.NewScanner(file) scanner := bufio.NewScanner(file)
@@ -105,7 +93,7 @@ mountLoop:
continue continue
} }
mountPath := strings.ReplaceAll(linefields[1], `\040`, " ") mountPath := strings.Replace(linefields[1], `\040`, " ", -1)
for _, excl := range m.config.ExcludeMounts { for _, excl := range m.config.ExcludeMounts {
if strings.Contains(mountPath, excl) { if strings.Contains(mountPath, excl) {
@@ -124,13 +112,7 @@ mountLoop:
tags := map[string]string{"type": "node", "device": linefields[0]} tags := map[string]string{"type": "node", "device": linefields[0]}
total := (stat.Blocks * uint64(stat.Bsize)) / uint64(1000000000) total := (stat.Blocks * uint64(stat.Bsize)) / uint64(1000000000)
if m.allowedMetrics["disk_total"] { if m.allowedMetrics["disk_total"] {
y, err := lp.NewMessage( y, err := lp.NewMessage("disk_total", tags, m.meta, map[string]interface{}{"value": total}, time.Now())
"disk_total",
tags,
m.meta,
map[string]any{
"value": total},
time.Now())
if err == nil { if err == nil {
y.AddMeta("unit", "GBytes") y.AddMeta("unit", "GBytes")
output <- y output <- y
@@ -138,13 +120,7 @@ mountLoop:
} }
free := (stat.Bfree * uint64(stat.Bsize)) / uint64(1000000000) free := (stat.Bfree * uint64(stat.Bsize)) / uint64(1000000000)
if m.allowedMetrics["disk_free"] { if m.allowedMetrics["disk_free"] {
y, err := lp.NewMessage( y, err := lp.NewMessage("disk_free", tags, m.meta, map[string]interface{}{"value": free}, time.Now())
"disk_free",
tags,
m.meta,
map[string]any{
"value": free},
time.Now())
if err == nil { if err == nil {
y.AddMeta("unit", "GBytes") y.AddMeta("unit", "GBytes")
output <- y output <- y
@@ -158,14 +134,7 @@ mountLoop:
} }
} }
if m.allowedMetrics["part_max_used"] { if m.allowedMetrics["part_max_used"] {
y, err := lp.NewMessage( y, err := lp.NewMessage("part_max_used", map[string]string{"type": "node"}, m.meta, map[string]interface{}{"value": int(part_max_used)}, time.Now())
"part_max_used",
map[string]string{
"type": "node"},
m.meta,
map[string]any{
"value": int(part_max_used)},
time.Now())
if err == nil { if err == nil {
y.AddMeta("unit", "percent") y.AddMeta("unit", "percent")
output <- y output <- y

View File

@@ -14,16 +14,16 @@ import (
"errors" "errors"
"fmt" "fmt"
"io" "io"
"log"
"os/exec" "os/exec"
"os/user" "os/user"
"slices"
"strconv" "strconv"
"strings" "strings"
"syscall" "syscall"
"time" "time"
cclog "github.com/ClusterCockpit/cc-lib/v2/ccLogger" cclog "github.com/ClusterCockpit/cc-lib/ccLogger"
lp "github.com/ClusterCockpit/cc-lib/v2/ccMessage" lp "github.com/ClusterCockpit/cc-lib/ccMessage"
) )
const DEFAULT_GPFS_CMD = "mmpmon" const DEFAULT_GPFS_CMD = "mmpmon"
@@ -310,10 +310,9 @@ func (m *GpfsCollector) Init(config json.RawMessage) error {
return nil return nil
} }
var err error
m.name = "GpfsCollector" m.name = "GpfsCollector"
if err := m.setup(); err != nil { m.setup()
return fmt.Errorf("%s Init(): setup() call failed: %w", m.name, err)
}
m.parallel = true m.parallel = true
// Set default mmpmon binary // Set default mmpmon binary
@@ -321,9 +320,10 @@ func (m *GpfsCollector) Init(config json.RawMessage) error {
// Read JSON configuration // Read JSON configuration
if len(config) > 0 { if len(config) > 0 {
err := json.Unmarshal(config, &m.config) err = json.Unmarshal(config, &m.config)
if err != nil { if err != nil {
return fmt.Errorf("%s Init(): failed to unmarshal JSON config: %w", m.name, err) log.Print(err.Error())
return err
} }
} }
m.meta = map[string]string{ m.meta = map[string]string{
@@ -364,7 +364,7 @@ func (m *GpfsCollector) Init(config json.RawMessage) error {
// when using sudo, the full path of mmpmon must be specified because // when using sudo, the full path of mmpmon must be specified because
// exec.LookPath will not work as mmpmon is not executable as user // exec.LookPath will not work as mmpmon is not executable as user
if m.config.Sudo && !strings.HasPrefix(m.config.Mmpmon, "/") { if m.config.Sudo && !strings.HasPrefix(m.config.Mmpmon, "/") {
return fmt.Errorf("%s Init(): when using sudo, mmpmon_path must be provided and an absolute path: %s", m.name, m.config.Mmpmon) return fmt.Errorf("when using sudo, mmpmon_path must be provided and an absolute path: %s", m.config.Mmpmon)
} }
// Check if mmpmon is in executable search path // Check if mmpmon is in executable search path
@@ -377,7 +377,7 @@ func (m *GpfsCollector) Init(config json.RawMessage) error {
p = m.config.Mmpmon p = m.config.Mmpmon
} else { } else {
cclog.ComponentError(m.name, fmt.Sprintf("failed to find mmpmon binary '%s': %v", m.config.Mmpmon, err)) cclog.ComponentError(m.name, fmt.Sprintf("failed to find mmpmon binary '%s': %v", m.config.Mmpmon, err))
return fmt.Errorf("%s Init(): failed to find mmpmon binary '%s': %w", m.name, m.config.Mmpmon, err) return fmt.Errorf("failed to find mmpmon binary '%s': %v", m.config.Mmpmon, err)
} }
} }
m.config.Mmpmon = p m.config.Mmpmon = p
@@ -385,28 +385,28 @@ func (m *GpfsCollector) Init(config json.RawMessage) error {
m.definitions = []GpfsMetricDefinition{} m.definitions = []GpfsMetricDefinition{}
if m.config.SendAbsoluteValues { if m.config.SendAbsoluteValues {
for _, def := range GpfsAbsMetrics { for _, def := range GpfsAbsMetrics {
if !slices.Contains(m.config.ExcludeMetrics, def.name) { if _, skip := stringArrayContains(m.config.ExcludeMetrics, def.name); !skip {
m.definitions = append(m.definitions, def) m.definitions = append(m.definitions, def)
} }
} }
} }
if m.config.SendDiffValues { if m.config.SendDiffValues {
for _, def := range GpfsDiffMetrics { for _, def := range GpfsDiffMetrics {
if !slices.Contains(m.config.ExcludeMetrics, def.name) { if _, skip := stringArrayContains(m.config.ExcludeMetrics, def.name); !skip {
m.definitions = append(m.definitions, def) m.definitions = append(m.definitions, def)
} }
} }
} }
if m.config.SendDerivedValues { if m.config.SendDerivedValues {
for _, def := range GpfsDeriveMetrics { for _, def := range GpfsDeriveMetrics {
if !slices.Contains(m.config.ExcludeMetrics, def.name) { if _, skip := stringArrayContains(m.config.ExcludeMetrics, def.name); !skip {
m.definitions = append(m.definitions, def) m.definitions = append(m.definitions, def)
} }
} }
} else if m.config.SendBandwidths { } else if m.config.SendBandwidths {
for _, def := range GpfsDeriveMetrics { for _, def := range GpfsDeriveMetrics {
if def.unit == "bytes/sec" { if def.unit == "bytes/sec" {
if !slices.Contains(m.config.ExcludeMetrics, def.name) { if _, skip := stringArrayContains(m.config.ExcludeMetrics, def.name); !skip {
m.definitions = append(m.definitions, def) m.definitions = append(m.definitions, def)
} }
} }
@@ -414,11 +414,11 @@ func (m *GpfsCollector) Init(config json.RawMessage) error {
} }
if m.config.SendTotalValues { if m.config.SendTotalValues {
for _, def := range GpfsTotalMetrics { for _, def := range GpfsTotalMetrics {
if !slices.Contains(m.config.ExcludeMetrics, def.name) { if _, skip := stringArrayContains(m.config.ExcludeMetrics, def.name); !skip {
// only send total metrics of the types requested // only send total metrics of the types requested
if (def.calc == "none" && m.config.SendAbsoluteValues) || if ( def.calc == "none" && m.config.SendAbsoluteValues ) ||
(def.calc == "difference" && m.config.SendDiffValues) || ( def.calc == "difference" && m.config.SendDiffValues ) ||
(def.calc == "derivative" && m.config.SendDerivedValues) { ( def.calc == "derivative" && m.config.SendDerivedValues ) {
m.definitions = append(m.definitions, def) m.definitions = append(m.definitions, def)
} }
} }
@@ -426,7 +426,7 @@ func (m *GpfsCollector) Init(config json.RawMessage) error {
} else if m.config.SendBandwidths { } else if m.config.SendBandwidths {
for _, def := range GpfsTotalMetrics { for _, def := range GpfsTotalMetrics {
if def.unit == "bytes/sec" { if def.unit == "bytes/sec" {
if !slices.Contains(m.config.ExcludeMetrics, def.name) { if _, skip := stringArrayContains(m.config.ExcludeMetrics, def.name); !skip {
m.definitions = append(m.definitions, def) m.definitions = append(m.definitions, def)
} }
} }
@@ -562,36 +562,36 @@ func (m *GpfsCollector) Read(interval time.Duration, output chan lp.CCMessage) {
// compute total metrics (map[...] will return 0 if key not found) // compute total metrics (map[...] will return 0 if key not found)
// bytes read and written // bytes read and written
if br, br_ok := newstate["_br_"]; br_ok { if br, br_ok := newstate["_br_"]; br_ok {
newstate["bytesTotal"] += br newstate["bytesTotal"] = newstate["bytesTotal"] + br
} }
if bw, bw_ok := newstate["_bw_"]; bw_ok { if bw, bw_ok := newstate["_bw_"]; bw_ok {
newstate["bytesTotal"] += bw newstate["bytesTotal"] = newstate["bytesTotal"] + bw
} }
// read and write count // read and write count
if rdc, rdc_ok := newstate["_rdc_"]; rdc_ok { if rdc, rdc_ok := newstate["_rdc_"]; rdc_ok {
newstate["iops"] += rdc newstate["iops"] = newstate["iops"] + rdc
} }
if wc, wc_ok := newstate["_wc_"]; wc_ok { if wc, wc_ok := newstate["_wc_"]; wc_ok {
newstate["iops"] += wc newstate["iops"] = newstate["iops"] + wc
} }
// meta operations // meta operations
if oc, oc_ok := newstate["_oc_"]; oc_ok { if oc, oc_ok := newstate["_oc_"]; oc_ok {
newstate["metaops"] += oc newstate["metaops"] = newstate["metaops"] + oc
} }
if cc, cc_ok := newstate["_cc_"]; cc_ok { if cc, cc_ok := newstate["_cc_"]; cc_ok {
newstate["metaops"] += cc newstate["metaops"] = newstate["metaops"] + cc
} }
if dir, dir_ok := newstate["_dir_"]; dir_ok { if dir, dir_ok := newstate["_dir_"]; dir_ok {
newstate["metaops"] += dir newstate["metaops"] = newstate["metaops"] + dir
} }
if iu, iu_ok := newstate["_iu_"]; iu_ok { if iu, iu_ok := newstate["_iu_"]; iu_ok {
newstate["metaops"] += iu newstate["metaops"] = newstate["metaops"] + iu
} }
// send desired metrics for this filesystem // send desired metrics for this filesystem
for _, metric := range m.definitions { for _, metric := range m.definitions {
vold, vold_ok := m.lastState[filesystem][metric.prefix] vold, vold_ok := m.lastState[filesystem][metric.prefix]
vnew, vnew_ok := newstate[metric.prefix] vnew, vnew_ok := newstate[metric.prefix]
var value any var value interface{}
value_ok := false value_ok := false
switch metric.calc { switch metric.calc {
case "none": case "none":
@@ -617,14 +617,14 @@ func (m *GpfsCollector) Read(interval time.Duration, output chan lp.CCMessage) {
} }
case "derivative": case "derivative":
if vnew_ok && vold_ok && timeDiff > 0 { if vnew_ok && vold_ok && timeDiff > 0 {
value = float64(vnew-vold) / timeDiff value = float64(vnew - vold) / timeDiff
if value.(float64) < 0.0 { if value.(float64) < 0 {
value = 0.0 value = 0
} }
value_ok = true value_ok = true
} else if vold_ok { } else if vold_ok {
// if the difference is not computable, return 0 // if the difference is not computable, return 0
value = 0.0 value = 0
value_ok = true value_ok = true
} }
} }

View File

@@ -10,10 +10,9 @@ package collectors
import ( import (
"fmt" "fmt"
"os" "os"
"slices"
cclog "github.com/ClusterCockpit/cc-lib/v2/ccLogger" cclog "github.com/ClusterCockpit/cc-lib/ccLogger"
lp "github.com/ClusterCockpit/cc-lib/v2/ccMessage" lp "github.com/ClusterCockpit/cc-lib/ccMessage"
"golang.org/x/sys/unix" "golang.org/x/sys/unix"
"encoding/json" "encoding/json"
@@ -66,9 +65,7 @@ func (m *InfinibandCollector) Init(config json.RawMessage) error {
var err error var err error
m.name = "InfinibandCollector" m.name = "InfinibandCollector"
if err := m.setup(); err != nil { m.setup()
return fmt.Errorf("%s Init(): setup() call failed: %w", m.name, err)
}
m.parallel = true m.parallel = true
m.meta = map[string]string{ m.meta = map[string]string{
"source": m.name, "source": m.name,
@@ -90,10 +87,10 @@ func (m *InfinibandCollector) Init(config json.RawMessage) error {
globPattern := filepath.Join(IB_BASEPATH, "*", "ports", "*") globPattern := filepath.Join(IB_BASEPATH, "*", "ports", "*")
ibDirs, err := filepath.Glob(globPattern) ibDirs, err := filepath.Glob(globPattern)
if err != nil { if err != nil {
return fmt.Errorf("%s Init(): unable to glob files with pattern %s: %w", m.name, globPattern, err) return fmt.Errorf("unable to glob files with pattern %s: %v", globPattern, err)
} }
if ibDirs == nil { if ibDirs == nil {
return fmt.Errorf("%s Init(): unable to find any directories with pattern %s", m.name, globPattern) return fmt.Errorf("unable to find any directories with pattern %s", globPattern)
} }
for _, path := range ibDirs { for _, path := range ibDirs {
@@ -114,7 +111,14 @@ func (m *InfinibandCollector) Init(config json.RawMessage) error {
port := pathSplit[6] port := pathSplit[6]
// Skip excluded devices // Skip excluded devices
if slices.Contains(m.config.ExcludeDevices, device) { skip := false
for _, excludedDevice := range m.config.ExcludeDevices {
if excludedDevice == device {
skip = true
break
}
}
if skip {
continue continue
} }
@@ -157,7 +161,7 @@ func (m *InfinibandCollector) Init(config json.RawMessage) error {
for _, counter := range portCounterFiles { for _, counter := range portCounterFiles {
err := unix.Access(counter.path, unix.R_OK) err := unix.Access(counter.path, unix.R_OK)
if err != nil { if err != nil {
return fmt.Errorf("%s Init(): unable to access %s: %w", m.name, counter.path, err) return fmt.Errorf("unable to access %s: %v", counter.path, err)
} }
} }
@@ -177,7 +181,7 @@ func (m *InfinibandCollector) Init(config json.RawMessage) error {
} }
if len(m.info) == 0 { if len(m.info) == 0 {
return fmt.Errorf("%s Init(): found no IB devices", m.name) return fmt.Errorf("found no IB devices")
} }
m.init = true m.init = true
@@ -237,7 +241,7 @@ func (m *InfinibandCollector) Read(interval time.Duration, output chan lp.CCMess
counterDef.name, counterDef.name,
info.tagSet, info.tagSet,
m.meta, m.meta,
map[string]any{ map[string]interface{}{
"value": counterDef.currentState, "value": counterDef.currentState,
}, },
now); err == nil { now); err == nil {
@@ -255,7 +259,7 @@ func (m *InfinibandCollector) Read(interval time.Duration, output chan lp.CCMess
counterDef.name+"_bw", counterDef.name+"_bw",
info.tagSet, info.tagSet,
m.meta, m.meta,
map[string]any{ map[string]interface{}{
"value": rate, "value": rate,
}, },
now); err == nil { now); err == nil {
@@ -285,7 +289,7 @@ func (m *InfinibandCollector) Read(interval time.Duration, output chan lp.CCMess
"ib_total", "ib_total",
info.tagSet, info.tagSet,
m.meta, m.meta,
map[string]any{ map[string]interface{}{
"value": ib_total, "value": ib_total,
}, },
now); err == nil { now); err == nil {
@@ -298,7 +302,7 @@ func (m *InfinibandCollector) Read(interval time.Duration, output chan lp.CCMess
"ib_total_pkts", "ib_total_pkts",
info.tagSet, info.tagSet,
m.meta, m.meta,
map[string]any{ map[string]interface{}{
"value": ib_total_pkts, "value": ib_total_pkts,
}, },
now); err == nil { now); err == nil {

View File

@@ -11,15 +11,13 @@ import (
"bufio" "bufio"
"encoding/json" "encoding/json"
"errors" "errors"
"fmt"
"os" "os"
"slices"
"strconv" "strconv"
"strings" "strings"
"time" "time"
cclog "github.com/ClusterCockpit/cc-lib/v2/ccLogger" cclog "github.com/ClusterCockpit/cc-lib/ccLogger"
lp "github.com/ClusterCockpit/cc-lib/v2/ccMessage" lp "github.com/ClusterCockpit/cc-lib/ccMessage"
) )
const IOSTATFILE = `/proc/diskstats` const IOSTATFILE = `/proc/diskstats`
@@ -47,9 +45,7 @@ func (m *IOstatCollector) Init(config json.RawMessage) error {
m.name = "IOstatCollector" m.name = "IOstatCollector"
m.parallel = true m.parallel = true
m.meta = map[string]string{"source": m.name, "group": "Disk"} m.meta = map[string]string{"source": m.name, "group": "Disk"}
if err := m.setup(); err != nil { m.setup()
return fmt.Errorf("%s Init(): setup() call failed: %w", m.name, err)
}
if len(config) > 0 { if len(config) > 0 {
err = json.Unmarshal(config, &m.config) err = json.Unmarshal(config, &m.config)
if err != nil { if err != nil {
@@ -79,7 +75,7 @@ func (m *IOstatCollector) Init(config json.RawMessage) error {
m.devices = make(map[string]IOstatCollectorEntry) m.devices = make(map[string]IOstatCollectorEntry)
m.matches = make(map[string]int) m.matches = make(map[string]int)
for k, v := range matches { for k, v := range matches {
if !slices.Contains(m.config.ExcludeMetrics, k) { if _, skip := stringArrayContains(m.config.ExcludeMetrics, k); !skip {
m.matches[k] = v m.matches[k] = v
} }
} }
@@ -88,8 +84,10 @@ func (m *IOstatCollector) Init(config json.RawMessage) error {
} }
file, err := os.Open(IOSTATFILE) file, err := os.Open(IOSTATFILE)
if err != nil { if err != nil {
return fmt.Errorf("%s Init(): Failed to open file \"%s\": %w", m.name, IOSTATFILE, err) cclog.ComponentError(m.name, err.Error())
return err
} }
defer file.Close()
scanner := bufio.NewScanner(file) scanner := bufio.NewScanner(file)
for scanner.Scan() { for scanner.Scan() {
@@ -103,7 +101,7 @@ func (m *IOstatCollector) Init(config json.RawMessage) error {
if strings.Contains(device, "loop") { if strings.Contains(device, "loop") {
continue continue
} }
if slices.Contains(m.config.ExcludeDevices, device) { if _, skip := stringArrayContains(m.config.ExcludeDevices, device); skip {
continue continue
} }
currentValues := make(map[string]int64) currentValues := make(map[string]int64)
@@ -129,10 +127,6 @@ func (m *IOstatCollector) Init(config json.RawMessage) error {
lastValues: lastValues, lastValues: lastValues,
} }
} }
if err := file.Close(); err != nil {
return fmt.Errorf("%s Init(): Failed to close file \"%s\": %w", m.name, IOSTATFILE, err)
}
m.init = true m.init = true
return err return err
} }
@@ -144,18 +138,10 @@ func (m *IOstatCollector) Read(interval time.Duration, output chan lp.CCMessage)
file, err := os.Open(IOSTATFILE) file, err := os.Open(IOSTATFILE)
if err != nil { if err != nil {
cclog.ComponentError( cclog.ComponentError(m.name, err.Error())
m.name,
fmt.Sprintf("Read(): Failed to open file '%s': %v", IOSTATFILE, err))
return return
} }
defer func() { defer file.Close()
if err := file.Close(); err != nil {
cclog.ComponentError(
m.name,
fmt.Sprintf("Read(): Failed to close file '%s': %v", IOSTATFILE, err))
}
}()
scanner := bufio.NewScanner(file) scanner := bufio.NewScanner(file)
for scanner.Scan() { for scanner.Scan() {
@@ -171,7 +157,7 @@ func (m *IOstatCollector) Read(interval time.Duration, output chan lp.CCMessage)
if strings.Contains(device, "loop") { if strings.Contains(device, "loop") {
continue continue
} }
if slices.Contains(m.config.ExcludeDevices, device) { if _, skip := stringArrayContains(m.config.ExcludeDevices, device); skip {
continue continue
} }
if _, ok := m.devices[device]; !ok { if _, ok := m.devices[device]; !ok {

View File

@@ -14,13 +14,14 @@ import (
"errors" "errors"
"fmt" "fmt"
"io" "io"
"log"
"os/exec" "os/exec"
"strconv" "strconv"
"strings" "strings"
"time" "time"
cclog "github.com/ClusterCockpit/cc-lib/v2/ccLogger" cclog "github.com/ClusterCockpit/cc-lib/ccLogger"
lp "github.com/ClusterCockpit/cc-lib/v2/ccMessage" lp "github.com/ClusterCockpit/cc-lib/ccMessage"
) )
const IPMISENSORS_PATH = `ipmi-sensors` const IPMISENSORS_PATH = `ipmi-sensors`
@@ -43,9 +44,7 @@ func (m *IpmiCollector) Init(config json.RawMessage) error {
} }
m.name = "IpmiCollector" m.name = "IpmiCollector"
if err := m.setup(); err != nil { m.setup()
return fmt.Errorf("%s Init(): setup() call failed: %w", m.name, err)
}
m.parallel = true m.parallel = true
m.meta = map[string]string{ m.meta = map[string]string{
"source": m.name, "source": m.name,
@@ -117,20 +116,19 @@ func (m *IpmiCollector) readIpmiTool(cmd string, output chan lp.CCMessage) {
} }
v, err := strconv.ParseFloat(strings.TrimSpace(lv[1]), 64) v, err := strconv.ParseFloat(strings.TrimSpace(lv[1]), 64)
if err == nil { if err == nil {
name := strings.ToLower(strings.ReplaceAll(strings.TrimSpace(lv[0]), " ", "_")) name := strings.ToLower(strings.Replace(strings.TrimSpace(lv[0]), " ", "_", -1))
unit := strings.TrimSpace(lv[2]) unit := strings.TrimSpace(lv[2])
switch unit { if unit == "Volts" {
case "Volts":
unit = "Volts" unit = "Volts"
case "degrees C": } else if unit == "degrees C" {
unit = "degC" unit = "degC"
case "degrees F": } else if unit == "degrees F" {
unit = "degF" unit = "degF"
case "Watts": } else if unit == "Watts" {
unit = "Watts" unit = "Watts"
} }
y, err := lp.NewMessage(name, map[string]string{"type": "node"}, m.meta, map[string]any{"value": v}, time.Now()) y, err := lp.NewMessage(name, map[string]string{"type": "node"}, m.meta, map[string]interface{}{"value": v}, time.Now())
if err == nil { if err == nil {
y.AddMeta("unit", unit) y.AddMeta("unit", unit)
output <- y output <- y
@@ -152,30 +150,23 @@ func (m *IpmiCollector) readIpmiTool(cmd string, output chan lp.CCMessage) {
func (m *IpmiCollector) readIpmiSensors(cmd string, output chan lp.CCMessage) { func (m *IpmiCollector) readIpmiSensors(cmd string, output chan lp.CCMessage) {
// Setup ipmisensors command
command := exec.Command(cmd, "--comma-separated-output", "--sdr-cache-recreate") command := exec.Command(cmd, "--comma-separated-output", "--sdr-cache-recreate")
stdout, _ := command.StdoutPipe() command.Wait()
errBuf := new(bytes.Buffer) stdout, err := command.Output()
command.Stderr = errBuf if err != nil {
log.Print(err)
// start command
if err := command.Start(); err != nil {
cclog.ComponentError(
m.name,
fmt.Sprintf("readIpmiSensors(): Failed to start command \"%s\": %v", command.String(), err),
)
return return
} }
// Read command output ll := strings.Split(string(stdout), "\n")
scanner := bufio.NewScanner(stdout)
for scanner.Scan() { for _, line := range ll {
lv := strings.Split(scanner.Text(), ",") lv := strings.Split(line, ",")
if len(lv) > 3 { if len(lv) > 3 {
v, err := strconv.ParseFloat(lv[3], 64) v, err := strconv.ParseFloat(lv[3], 64)
if err == nil { if err == nil {
name := strings.ToLower(strings.ReplaceAll(lv[1], " ", "_")) name := strings.ToLower(strings.Replace(lv[1], " ", "_", -1))
y, err := lp.NewMessage(name, map[string]string{"type": "node"}, m.meta, map[string]any{"value": v}, time.Now()) y, err := lp.NewMessage(name, map[string]string{"type": "node"}, m.meta, map[string]interface{}{"value": v}, time.Now())
if err == nil { if err == nil {
if len(lv) > 4 { if len(lv) > 4 {
y.AddMeta("unit", lv[4]) y.AddMeta("unit", lv[4])
@@ -185,18 +176,6 @@ func (m *IpmiCollector) readIpmiSensors(cmd string, output chan lp.CCMessage) {
} }
} }
} }
// Wait for command end
if err := command.Wait(); err != nil {
errMsg, _ := io.ReadAll(errBuf)
cclog.ComponentError(
m.name,
fmt.Sprintf("readIpmiSensors(): Failed to wait for the end of command \"%s\": %v\n", command.String(), err),
)
cclog.ComponentError(m.name, fmt.Sprintf("readIpmiSensors(): command stderr: \"%s\"\n", strings.TrimSpace(string(errMsg))))
return
}
} }
func (m *IpmiCollector) Read(interval time.Duration, output chan lp.CCMessage) { func (m *IpmiCollector) Read(interval time.Duration, output chan lp.CCMessage) {

View File

@@ -19,12 +19,11 @@ import (
"encoding/json" "encoding/json"
"errors" "errors"
"fmt" "fmt"
"maps"
"math" "math"
"os" "os"
"os/signal" "os/signal"
"os/user" "os/user"
"slices" "sort"
"strconv" "strconv"
"strings" "strings"
"sync" "sync"
@@ -32,8 +31,8 @@ import (
"time" "time"
"unsafe" "unsafe"
cclog "github.com/ClusterCockpit/cc-lib/v2/ccLogger" cclog "github.com/ClusterCockpit/cc-lib/ccLogger"
lp "github.com/ClusterCockpit/cc-lib/v2/ccMessage" lp "github.com/ClusterCockpit/cc-lib/ccMessage"
agg "github.com/ClusterCockpit/cc-metric-collector/internal/metricAggregator" agg "github.com/ClusterCockpit/cc-metric-collector/internal/metricAggregator"
topo "github.com/ClusterCockpit/cc-metric-collector/pkg/ccTopology" topo "github.com/ClusterCockpit/cc-metric-collector/pkg/ccTopology"
"github.com/NVIDIA/go-nvml/pkg/dl" "github.com/NVIDIA/go-nvml/pkg/dl"
@@ -125,14 +124,22 @@ func checkMetricType(t string) bool {
return ok return ok
} }
func eventsToEventStr(events map[string]string) string {
elist := make([]string, 0)
for k, v := range events {
elist = append(elist, fmt.Sprintf("%s:%s", v, k))
}
return strings.Join(elist, ",")
}
func genLikwidEventSet(input LikwidCollectorEventsetConfig) LikwidEventsetConfig { func genLikwidEventSet(input LikwidCollectorEventsetConfig) LikwidEventsetConfig {
clist := make([]string, 0, len(input.Events)) tmplist := make([]string, 0)
clist := make([]string, 0)
for k := range input.Events { for k := range input.Events {
clist = append(clist, k) clist = append(clist, k)
} }
slices.Sort(clist) sort.Strings(clist)
tmplist := make([]string, 0, len(clist)) elist := make([]*C.char, 0)
elist := make([]*C.char, 0, len(clist))
for _, k := range clist { for _, k := range clist {
v := input.Events[k] v := input.Events[k]
tmplist = append(tmplist, fmt.Sprintf("%s:%s", v, k)) tmplist = append(tmplist, fmt.Sprintf("%s:%s", v, k))
@@ -180,7 +187,7 @@ func getBaseFreq() float64 {
for _, f := range files { for _, f := range files {
buffer, err := os.ReadFile(f) buffer, err := os.ReadFile(f)
if err == nil { if err == nil {
data := strings.ReplaceAll(string(buffer), "\n", "") data := strings.Replace(string(buffer), "\n", "", -1)
x, err := strconv.ParseInt(data, 0, 64) x, err := strconv.ParseInt(data, 0, 64)
if err == nil { if err == nil {
freq = float64(x) freq = float64(x)
@@ -209,7 +216,7 @@ func (m *LikwidCollector) Init(config json.RawMessage) error {
if len(config) > 0 { if len(config) > 0 {
err := json.Unmarshal(config, &m.config) err := json.Unmarshal(config, &m.config)
if err != nil { if err != nil {
return fmt.Errorf("%s Init(): failed to unmarshal JSON config: %w", m.name, err) return err
} }
} }
lib := dl.New(m.config.LibraryPath, LIKWID_LIB_DL_FLAGS) lib := dl.New(m.config.LibraryPath, LIKWID_LIB_DL_FLAGS)
@@ -218,18 +225,14 @@ func (m *LikwidCollector) Init(config json.RawMessage) error {
} }
err := lib.Open() err := lib.Open()
if err != nil { if err != nil {
return fmt.Errorf("error opening %s: %w", m.config.LibraryPath, err) return fmt.Errorf("error opening %s: %v", m.config.LibraryPath, err)
} }
if m.config.ForceOverwrite { if m.config.ForceOverwrite {
cclog.ComponentDebug(m.name, "Set LIKWID_FORCE=1") cclog.ComponentDebug(m.name, "Set LIKWID_FORCE=1")
if err := os.Setenv("LIKWID_FORCE", "1"); err != nil { os.Setenv("LIKWID_FORCE", "1")
return fmt.Errorf("error setting environment variable LIKWID_FORCE=1: %w", err)
}
}
if err := m.setup(); err != nil {
return fmt.Errorf("%s Init(): setup() call failed: %w", m.name, err)
} }
m.setup()
m.meta = map[string]string{"group": "PerfCounter"} m.meta = map[string]string{"group": "PerfCounter"}
cclog.ComponentDebug(m.name, "Get cpulist and init maps and lists") cclog.ComponentDebug(m.name, "Get cpulist and init maps and lists")
@@ -313,14 +316,7 @@ func (m *LikwidCollector) Init(config json.RawMessage) error {
case "accessdaemon": case "accessdaemon":
if len(m.config.DaemonPath) > 0 { if len(m.config.DaemonPath) > 0 {
p := os.Getenv("PATH") p := os.Getenv("PATH")
if len(p) > 0 { os.Setenv("PATH", m.config.DaemonPath+":"+p)
p = m.config.DaemonPath + ":" + p
} else {
p = m.config.DaemonPath
}
if err := os.Setenv("PATH", p); err != nil {
return fmt.Errorf("error setting environment variable PATH=%s: %w", p, err)
}
} }
C.HPMmode(1) C.HPMmode(1)
retCode := C.HPMinit() retCode := C.HPMinit()
@@ -373,23 +369,16 @@ func (m *LikwidCollector) Init(config json.RawMessage) error {
// take a measurement for 'interval' seconds of event set index 'group' // take a measurement for 'interval' seconds of event set index 'group'
func (m *LikwidCollector) takeMeasurement(evidx int, evset LikwidEventsetConfig, interval time.Duration) (bool, error) { func (m *LikwidCollector) takeMeasurement(evidx int, evset LikwidEventsetConfig, interval time.Duration) (bool, error) {
var ret C.int var ret C.int
var gid C.int = -1
sigchan := make(chan os.Signal, 1) sigchan := make(chan os.Signal, 1)
// Watch changes for the lock file () // Watch changes for the lock file ()
watcher, err := fsnotify.NewWatcher() watcher, err := fsnotify.NewWatcher()
if err != nil { if err != nil {
cclog.ComponentError( cclog.ComponentError(m.name, err.Error())
m.name,
fmt.Sprintf("takeMeasurement(): Failed to create a new fsnotify.Watcher: %v", err))
return true, err return true, err
} }
defer func() { defer watcher.Close()
if err := watcher.Close(); err != nil {
cclog.ComponentError(
m.name,
fmt.Sprintf("takeMeasurement(): Failed to close fsnotify.Watcher: %v", err))
}
}()
if len(m.config.LockfilePath) > 0 { if len(m.config.LockfilePath) > 0 {
// Check if the lock file exists // Check if the lock file exists
info, err := os.Stat(m.config.LockfilePath) info, err := os.Stat(m.config.LockfilePath)
@@ -397,11 +386,9 @@ func (m *LikwidCollector) takeMeasurement(evidx int, evset LikwidEventsetConfig,
// Create the lock file if it does not exist // Create the lock file if it does not exist
file, createErr := os.Create(m.config.LockfilePath) file, createErr := os.Create(m.config.LockfilePath)
if createErr != nil { if createErr != nil {
return true, fmt.Errorf("failed to create lock file: %w", createErr) return true, fmt.Errorf("failed to create lock file: %v", createErr)
}
if err := file.Close(); err != nil {
return true, fmt.Errorf("failed to close lock file: %w", err)
} }
file.Close()
info, err = os.Stat(m.config.LockfilePath) // Recheck the file after creation info, err = os.Stat(m.config.LockfilePath) // Recheck the file after creation
} }
if err != nil { if err != nil {
@@ -453,7 +440,6 @@ func (m *LikwidCollector) takeMeasurement(evidx int, evset LikwidEventsetConfig,
signal.Notify(sigchan, syscall.SIGCHLD) signal.Notify(sigchan, syscall.SIGCHLD)
// Add an event string to LIKWID // Add an event string to LIKWID
var gid C.int
select { select {
case <-sigchan: case <-sigchan:
gid = -1 gid = -1
@@ -609,6 +595,7 @@ func (m *LikwidCollector) calcEventsetMetrics(evset LikwidEventsetConfig, interv
evset.metrics[tid][metric.Name] = value evset.metrics[tid][metric.Name] = value
// Now we have the result, send it with the proper tags // Now we have the result, send it with the proper tags
if !math.IsNaN(value) && metric.Publish { if !math.IsNaN(value) && metric.Publish {
fields := map[string]interface{}{"value": value}
y, err := y, err :=
lp.NewMessage( lp.NewMessage(
metric.Name, metric.Name,
@@ -616,14 +603,12 @@ func (m *LikwidCollector) calcEventsetMetrics(evset LikwidEventsetConfig, interv
"type": metric.Type, "type": metric.Type,
}, },
m.meta, m.meta,
map[string]any{ fields,
"value": value,
},
now, now,
) )
if err == nil { if err == nil {
if metric.Type != "node" { if metric.Type != "node" {
y.AddTag("type-id", strconv.Itoa(domain)) y.AddTag("type-id", fmt.Sprintf("%d", domain))
} }
if len(metric.Unit) > 0 { if len(metric.Unit) > 0 {
y.AddMeta("unit", metric.Unit) y.AddMeta("unit", metric.Unit)
@@ -653,10 +638,10 @@ func (m *LikwidCollector) calcEventsetMetrics(evset LikwidEventsetConfig, interv
metric.Name, metric.Name,
map[string]string{ map[string]string{
"type": "core", "type": "core",
"type-id": strconv.Itoa(coreID), "type-id": fmt.Sprintf("%d", coreID),
}, },
m.meta, m.meta,
map[string]any{ map[string]interface{}{
"value": value, "value": value,
}, },
now, now,
@@ -690,10 +675,10 @@ func (m *LikwidCollector) calcEventsetMetrics(evset LikwidEventsetConfig, interv
metric.Name, metric.Name,
map[string]string{ map[string]string{
"type": "socket", "type": "socket",
"type-id": strconv.Itoa(socketID), "type-id": fmt.Sprintf("%d", socketID),
}, },
m.meta, m.meta,
map[string]any{ map[string]interface{}{
"value": value, "value": value,
}, },
now, now,
@@ -727,7 +712,7 @@ func (m *LikwidCollector) calcEventsetMetrics(evset LikwidEventsetConfig, interv
"type": "node", "type": "node",
}, },
m.meta, m.meta,
map[string]any{ map[string]interface{}{
"value": totalNodeValue, "value": totalNodeValue,
}, },
now, now,
@@ -763,7 +748,9 @@ func (m *LikwidCollector) calcGlobalMetrics(groups []LikwidEventsetConfig, inter
// Here we generate parameter list // Here we generate parameter list
params := make(map[string]float64) params := make(map[string]float64)
for _, evset := range groups { for _, evset := range groups {
maps.Copy(params, evset.metrics[tid]) for mname, mres := range evset.metrics[tid] {
params[mname] = mres
}
} }
params["gotime"] = interval.Seconds() params["gotime"] = interval.Seconds()
// Evaluate the metric // Evaluate the metric
@@ -785,14 +772,14 @@ func (m *LikwidCollector) calcGlobalMetrics(groups []LikwidEventsetConfig, inter
"type": metric.Type, "type": metric.Type,
}, },
m.meta, m.meta,
map[string]any{ map[string]interface{}{
"value": value, "value": value,
}, },
now, now,
) )
if err == nil { if err == nil {
if metric.Type != "node" { if metric.Type != "node" {
y.AddTag("type-id", strconv.Itoa(domain)) y.AddTag("type-id", fmt.Sprintf("%d", domain))
} }
if len(metric.Unit) > 0 { if len(metric.Unit) > 0 {
y.AddMeta("unit", metric.Unit) y.AddMeta("unit", metric.Unit)
@@ -808,7 +795,7 @@ func (m *LikwidCollector) calcGlobalMetrics(groups []LikwidEventsetConfig, inter
} }
func (m *LikwidCollector) ReadThread(interval time.Duration, output chan lp.CCMessage) { func (m *LikwidCollector) ReadThread(interval time.Duration, output chan lp.CCMessage) {
var err error var err error = nil
groups := make([]LikwidEventsetConfig, 0) groups := make([]LikwidEventsetConfig, 0)
for evidx, evset := range m.config.Eventsets { for evidx, evset := range m.config.Eventsets {
@@ -826,21 +813,13 @@ func (m *LikwidCollector) ReadThread(interval time.Duration, output chan lp.CCMe
if !skip { if !skip {
// read measurements and derive event set metrics // read measurements and derive event set metrics
err = m.calcEventsetMetrics(e, interval, output) m.calcEventsetMetrics(e, interval, output)
if err != nil {
cclog.ComponentError(m.name, err.Error())
return
}
groups = append(groups, e) groups = append(groups, e)
} }
} }
if len(groups) > 0 { if len(groups) > 0 {
// calculate global metrics // calculate global metrics
err = m.calcGlobalMetrics(groups, interval, output) m.calcGlobalMetrics(groups, interval, output)
if err != nil {
cclog.ComponentError(m.name, err.Error())
return
}
} }
} }

View File

@@ -11,13 +11,12 @@ import (
"encoding/json" "encoding/json"
"fmt" "fmt"
"os" "os"
"slices"
"strconv" "strconv"
"strings" "strings"
"time" "time"
cclog "github.com/ClusterCockpit/cc-lib/v2/ccLogger" cclog "github.com/ClusterCockpit/cc-lib/ccLogger"
lp "github.com/ClusterCockpit/cc-lib/v2/ccMessage" lp "github.com/ClusterCockpit/cc-lib/ccMessage"
) )
// LoadavgCollector collects: // LoadavgCollector collects:
@@ -43,9 +42,7 @@ type LoadavgCollector struct {
func (m *LoadavgCollector) Init(config json.RawMessage) error { func (m *LoadavgCollector) Init(config json.RawMessage) error {
m.name = "LoadavgCollector" m.name = "LoadavgCollector"
m.parallel = true m.parallel = true
if err := m.setup(); err != nil { m.setup()
return fmt.Errorf("%s Init(): setup() call failed: %w", m.name, err)
}
if len(config) > 0 { if len(config) > 0 {
err := json.Unmarshal(config, &m.config) err := json.Unmarshal(config, &m.config)
if err != nil { if err != nil {
@@ -67,10 +64,10 @@ func (m *LoadavgCollector) Init(config json.RawMessage) error {
m.proc_skips = make([]bool, len(m.proc_matches)) m.proc_skips = make([]bool, len(m.proc_matches))
for i, name := range m.load_matches { for i, name := range m.load_matches {
m.load_skips[i] = slices.Contains(m.config.ExcludeMetrics, name) _, m.load_skips[i] = stringArrayContains(m.config.ExcludeMetrics, name)
} }
for i, name := range m.proc_matches { for i, name := range m.proc_matches {
m.proc_skips[i] = slices.Contains(m.config.ExcludeMetrics, name) _, m.proc_skips[i] = stringArrayContains(m.config.ExcludeMetrics, name)
} }
m.init = true m.init = true
return nil return nil
@@ -102,7 +99,7 @@ func (m *LoadavgCollector) Read(interval time.Duration, output chan lp.CCMessage
if m.load_skips[i] { if m.load_skips[i] {
continue continue
} }
y, err := lp.NewMessage(name, m.tags, m.meta, map[string]any{"value": x}, now) y, err := lp.NewMessage(name, m.tags, m.meta, map[string]interface{}{"value": x}, now)
if err == nil { if err == nil {
output <- y output <- y
} }
@@ -121,7 +118,7 @@ func (m *LoadavgCollector) Read(interval time.Duration, output chan lp.CCMessage
if m.proc_skips[i] { if m.proc_skips[i] {
continue continue
} }
y, err := lp.NewMessage(name, m.tags, m.meta, map[string]any{"value": x}, now) y, err := lp.NewMessage(name, m.tags, m.meta, map[string]interface{}{"value": x}, now)
if err == nil { if err == nil {
output <- y output <- y
} }

View File

@@ -13,13 +13,12 @@ import (
"fmt" "fmt"
"os/exec" "os/exec"
"os/user" "os/user"
"slices"
"strconv" "strconv"
"strings" "strings"
"time" "time"
cclog "github.com/ClusterCockpit/cc-lib/v2/ccLogger" cclog "github.com/ClusterCockpit/cc-lib/ccLogger"
lp "github.com/ClusterCockpit/cc-lib/v2/ccMessage" lp "github.com/ClusterCockpit/cc-lib/ccMessage"
) )
const LUSTRE_SYSFS = `/sys/fs/lustre` const LUSTRE_SYSFS = `/sys/fs/lustre`
@@ -62,6 +61,7 @@ func (m *LustreCollector) getDeviceDataCommand(device string) []string {
} else { } else {
command = exec.Command(m.lctl, LCTL_OPTION, statsfile) command = exec.Command(m.lctl, LCTL_OPTION, statsfile)
} }
command.Wait()
stdout, _ := command.Output() stdout, _ := command.Output()
return strings.Split(string(stdout), "\n") return strings.Split(string(stdout), "\n")
} }
@@ -302,9 +302,7 @@ func (m *LustreCollector) Init(config json.RawMessage) error {
return err return err
} }
} }
if err := m.setup(); err != nil { m.setup()
return fmt.Errorf("%s Init(): setup() call failed: %w", m.name, err)
}
m.tags = map[string]string{"type": "node"} m.tags = map[string]string{"type": "node"}
m.meta = map[string]string{"source": m.name, "group": "Lustre"} m.meta = map[string]string{"source": m.name, "group": "Lustre"}
@@ -341,21 +339,21 @@ func (m *LustreCollector) Init(config json.RawMessage) error {
m.definitions = []LustreMetricDefinition{} m.definitions = []LustreMetricDefinition{}
if m.config.SendAbsoluteValues { if m.config.SendAbsoluteValues {
for _, def := range LustreAbsMetrics { for _, def := range LustreAbsMetrics {
if !slices.Contains(m.config.ExcludeMetrics, def.name) { if _, skip := stringArrayContains(m.config.ExcludeMetrics, def.name); !skip {
m.definitions = append(m.definitions, def) m.definitions = append(m.definitions, def)
} }
} }
} }
if m.config.SendDiffValues { if m.config.SendDiffValues {
for _, def := range LustreDiffMetrics { for _, def := range LustreDiffMetrics {
if !slices.Contains(m.config.ExcludeMetrics, def.name) { if _, skip := stringArrayContains(m.config.ExcludeMetrics, def.name); !skip {
m.definitions = append(m.definitions, def) m.definitions = append(m.definitions, def)
} }
} }
} }
if m.config.SendDerivedValues { if m.config.SendDerivedValues {
for _, def := range LustreDeriveMetrics { for _, def := range LustreDeriveMetrics {
if !slices.Contains(m.config.ExcludeMetrics, def.name) { if _, skip := stringArrayContains(m.config.ExcludeMetrics, def.name); !skip {
m.definitions = append(m.definitions, def) m.definitions = append(m.definitions, def)
} }
} }
@@ -404,23 +402,23 @@ func (m *LustreCollector) Read(interval time.Duration, output chan lp.CCMessage)
} else { } else {
use_x = devData[def.name] use_x = devData[def.name]
} }
var value any var value interface{}
switch def.calc { switch def.calc {
case "none": case "none":
value = use_x value = use_x
y, err = lp.NewMessage(def.name, m.tags, m.meta, map[string]any{"value": value}, time.Now()) y, err = lp.NewMessage(def.name, m.tags, m.meta, map[string]interface{}{"value": value}, time.Now())
case "difference": case "difference":
value = use_x - devData[def.name] value = use_x - devData[def.name]
if value.(int64) < 0 { if value.(int64) < 0 {
value = 0 value = 0
} }
y, err = lp.NewMessage(def.name, m.tags, m.meta, map[string]any{"value": value}, time.Now()) y, err = lp.NewMessage(def.name, m.tags, m.meta, map[string]interface{}{"value": value}, time.Now())
case "derivative": case "derivative":
value = float64(use_x-devData[def.name]) / tdiff.Seconds() value = float64(use_x-devData[def.name]) / tdiff.Seconds()
if value.(float64) < 0 { if value.(float64) < 0 {
value = 0 value = 0
} }
y, err = lp.NewMessage(def.name, m.tags, m.meta, map[string]any{"value": value}, time.Now()) y, err = lp.NewMessage(def.name, m.tags, m.meta, map[string]interface{}{"value": value}, time.Now())
} }
if err == nil { if err == nil {
y.AddTag("device", device) y.AddTag("device", device)

View File

@@ -15,13 +15,12 @@ import (
"os" "os"
"path/filepath" "path/filepath"
"regexp" "regexp"
"slices"
"strconv" "strconv"
"strings" "strings"
"time" "time"
cclog "github.com/ClusterCockpit/cc-lib/v2/ccLogger" cclog "github.com/ClusterCockpit/cc-lib/ccLogger"
lp "github.com/ClusterCockpit/cc-lib/v2/ccMessage" lp "github.com/ClusterCockpit/cc-lib/ccMessage"
) )
const MEMSTATFILE = "/proc/meminfo" const MEMSTATFILE = "/proc/meminfo"
@@ -59,11 +58,7 @@ func getStats(filename string) map[string]MemstatStats {
if err != nil { if err != nil {
cclog.Error(err.Error()) cclog.Error(err.Error())
} }
defer func() { defer file.Close()
if err := file.Close(); err != nil {
cclog.Error(err.Error())
}
}()
scanner := bufio.NewScanner(file) scanner := bufio.NewScanner(file)
for scanner.Scan() { for scanner.Scan() {
@@ -120,20 +115,19 @@ func (m *MemstatCollector) Init(config json.RawMessage) error {
"MemShared": "mem_shared", "MemShared": "mem_shared",
} }
for k, v := range matches { for k, v := range matches {
if !slices.Contains(m.config.ExcludeMetrics, k) { _, skip := stringArrayContains(m.config.ExcludeMetrics, k)
if !skip {
m.matches[k] = v m.matches[k] = v
} }
} }
m.sendMemUsed = false m.sendMemUsed = false
if !slices.Contains(m.config.ExcludeMetrics, "mem_used") { if _, skip := stringArrayContains(m.config.ExcludeMetrics, "mem_used"); !skip {
m.sendMemUsed = true m.sendMemUsed = true
} }
if len(m.matches) == 0 { if len(m.matches) == 0 {
return errors.New("no metrics to collect") return errors.New("no metrics to collect")
} }
if err := m.setup(); err != nil { m.setup()
return fmt.Errorf("%s Init(): setup() call failed: %w", m.name, err)
}
if m.config.NodeStats { if m.config.NodeStats {
if stats := getStats(MEMSTATFILE); len(stats) == 0 { if stats := getStats(MEMSTATFILE); len(stats) == 0 {
@@ -159,7 +153,7 @@ func (m *MemstatCollector) Init(config json.RawMessage) error {
file: f, file: f,
tags: map[string]string{ tags: map[string]string{
"type": "memoryDomain", "type": "memoryDomain",
"type-id": strconv.Itoa(id), "type-id": fmt.Sprintf("%d", id),
}, },
} }
m.nodefiles[id] = f m.nodefiles[id] = f
@@ -180,7 +174,7 @@ func (m *MemstatCollector) Read(interval time.Duration, output chan lp.CCMessage
sendStats := func(stats map[string]MemstatStats, tags map[string]string) { sendStats := func(stats map[string]MemstatStats, tags map[string]string) {
for match, name := range m.matches { for match, name := range m.matches {
var value float64 = 0 var value float64 = 0
unit := "" var unit string = ""
if v, ok := stats[match]; ok { if v, ok := stats[match]; ok {
value = v.value value = v.value
if len(v.unit) > 0 { if len(v.unit) > 0 {
@@ -188,7 +182,7 @@ func (m *MemstatCollector) Read(interval time.Duration, output chan lp.CCMessage
} }
} }
y, err := lp.NewMessage(name, tags, m.meta, map[string]any{"value": value}, time.Now()) y, err := lp.NewMessage(name, tags, m.meta, map[string]interface{}{"value": value}, time.Now())
if err == nil { if err == nil {
if len(unit) > 0 { if len(unit) > 0 {
y.AddMeta("unit", unit) y.AddMeta("unit", unit)
@@ -221,7 +215,7 @@ func (m *MemstatCollector) Read(interval time.Duration, output chan lp.CCMessage
} }
} }
} }
y, err := lp.NewMessage("mem_used", tags, m.meta, map[string]any{"value": memUsed}, time.Now()) y, err := lp.NewMessage("mem_used", tags, m.meta, map[string]interface{}{"value": memUsed}, time.Now())
if err == nil { if err == nil {
if len(unit) > 0 { if len(unit) > 0 {
y.AddMeta("unit", unit) y.AddMeta("unit", unit)

View File

@@ -12,7 +12,7 @@ import (
"fmt" "fmt"
"time" "time"
lp "github.com/ClusterCockpit/cc-lib/v2/ccMessage" lp "github.com/ClusterCockpit/cc-lib/ccMessage"
) )
type MetricCollector interface { type MetricCollector interface {
@@ -51,6 +51,30 @@ func (c *metricCollector) Initialized() bool {
return c.init return c.init
} }
// intArrayContains scans an array of ints if the value str is present in the array
// If the specified value is found, the corresponding array index is returned.
// The bool value is used to signal success or failure
func intArrayContains(array []int, str int) (int, bool) {
for i, a := range array {
if a == str {
return i, true
}
}
return -1, false
}
// stringArrayContains scans an array of strings if the value str is present in the array
// If the specified value is found, the corresponding array index is returned.
// The bool value is used to signal success or failure
func stringArrayContains(array []string, str string) (int, bool) {
for i, a := range array {
if a == str {
return i, true
}
}
return -1, false
}
// RemoveFromStringList removes the string r from the array of strings s // RemoveFromStringList removes the string r from the array of strings s
// If r is not contained in the array an error is returned // If r is not contained in the array an error is returned
func RemoveFromStringList(s []string, r string) ([]string, error) { func RemoveFromStringList(s []string, r string) ([]string, error) {

View File

@@ -10,15 +10,14 @@ package collectors
import ( import (
"bufio" "bufio"
"encoding/json" "encoding/json"
"fmt" "errors"
"os" "os"
"slices"
"strconv" "strconv"
"strings" "strings"
"time" "time"
cclog "github.com/ClusterCockpit/cc-lib/v2/ccLogger" cclog "github.com/ClusterCockpit/cc-lib/ccLogger"
lp "github.com/ClusterCockpit/cc-lib/v2/ccMessage" lp "github.com/ClusterCockpit/cc-lib/ccMessage"
) )
const NETSTATFILE = "/proc/net/dev" const NETSTATFILE = "/proc/net/dev"
@@ -66,9 +65,7 @@ func getCanonicalName(raw string, aliasToCanonical map[string]string) string {
func (m *NetstatCollector) Init(config json.RawMessage) error { func (m *NetstatCollector) Init(config json.RawMessage) error {
m.name = "NetstatCollector" m.name = "NetstatCollector"
m.parallel = true m.parallel = true
if err := m.setup(); err != nil { m.setup()
return fmt.Errorf("%s Init(): setup() call failed: %w", m.name, err)
}
m.lastTimestamp = time.Now() m.lastTimestamp = time.Now()
const ( const (
@@ -110,8 +107,10 @@ func (m *NetstatCollector) Init(config json.RawMessage) error {
// Check access to net statistic file // Check access to net statistic file
file, err := os.Open(NETSTATFILE) file, err := os.Open(NETSTATFILE)
if err != nil { if err != nil {
return fmt.Errorf("%s Init(): failed to open netstat file \"%s\": %w", m.name, NETSTATFILE, err) cclog.ComponentError(m.name, err.Error())
return err
} }
defer file.Close()
scanner := bufio.NewScanner(file) scanner := bufio.NewScanner(file)
for scanner.Scan() { for scanner.Scan() {
@@ -130,7 +129,7 @@ func (m *NetstatCollector) Init(config json.RawMessage) error {
canonical := getCanonicalName(raw, m.aliasToCanonical) canonical := getCanonicalName(raw, m.aliasToCanonical)
// Check if device is a included device // Check if device is a included device
if slices.Contains(m.config.IncludeDevices, canonical) { if _, ok := stringArrayContains(m.config.IncludeDevices, canonical); ok {
// Tag will contain original device name (raw). // Tag will contain original device name (raw).
tags := map[string]string{"stype": "network", "stype-id": raw, "type": "node"} tags := map[string]string{"stype": "network", "stype-id": raw, "type": "node"}
meta_unit_byte := map[string]string{"source": m.name, "group": "Network", "unit": "bytes"} meta_unit_byte := map[string]string{"source": m.name, "group": "Network", "unit": "bytes"}
@@ -175,13 +174,8 @@ func (m *NetstatCollector) Init(config json.RawMessage) error {
} }
} }
// Close netstat file
if err := file.Close(); err != nil {
return fmt.Errorf("%s Init(): failed to close netstat file \"%s\": %w", m.name, NETSTATFILE, err)
}
if len(m.matches) == 0 { if len(m.matches) == 0 {
return fmt.Errorf("%s Init(): no devices to collect metrics found", m.name) return errors.New("no devices to collector metrics found")
} }
m.init = true m.init = true
return nil return nil
@@ -200,18 +194,10 @@ func (m *NetstatCollector) Read(interval time.Duration, output chan lp.CCMessage
file, err := os.Open(NETSTATFILE) file, err := os.Open(NETSTATFILE)
if err != nil { if err != nil {
cclog.ComponentError( cclog.ComponentError(m.name, err.Error())
m.name,
fmt.Sprintf("Read(): Failed to open file '%s': %v", NETSTATFILE, err))
return return
} }
defer func() { defer file.Close()
if err := file.Close(); err != nil {
cclog.ComponentError(
m.name,
fmt.Sprintf("Read(): Failed to close file '%s': %v", NETSTATFILE, err))
}
}()
scanner := bufio.NewScanner(file) scanner := bufio.NewScanner(file)
for scanner.Scan() { for scanner.Scan() {
@@ -240,14 +226,14 @@ func (m *NetstatCollector) Read(interval time.Duration, output chan lp.CCMessage
continue continue
} }
if m.config.SendAbsoluteValues { if m.config.SendAbsoluteValues {
if y, err := lp.NewMessage(metric.name, metric.tags, metric.meta, map[string]any{"value": v}, now); err == nil { if y, err := lp.NewMessage(metric.name, metric.tags, metric.meta, map[string]interface{}{"value": v}, now); err == nil {
output <- y output <- y
} }
} }
if m.config.SendDerivedValues { if m.config.SendDerivedValues {
if metric.lastValue >= 0 { if metric.lastValue >= 0 {
rate := float64(v-metric.lastValue) / timeDiff rate := float64(v-metric.lastValue) / timeDiff
if y, err := lp.NewMessage(metric.name+"_bw", metric.tags, metric.meta_rates, map[string]any{"value": rate}, now); err == nil { if y, err := lp.NewMessage(metric.name+"_bw", metric.tags, metric.meta_rates, map[string]interface{}{"value": rate}, now); err == nil {
output <- y output <- y
} }
} }

View File

@@ -10,7 +10,7 @@ package collectors
import ( import (
"encoding/json" "encoding/json"
"fmt" "fmt"
"slices" "log"
// "os" // "os"
"os/exec" "os/exec"
@@ -18,8 +18,7 @@ import (
"strings" "strings"
"time" "time"
cclog "github.com/ClusterCockpit/cc-lib/v2/ccLogger" lp "github.com/ClusterCockpit/cc-lib/ccMessage"
lp "github.com/ClusterCockpit/cc-lib/v2/ccMessage"
) )
// First part contains the code for the general NfsCollector. // First part contains the code for the general NfsCollector.
@@ -45,15 +44,10 @@ type nfsCollector struct {
func (m *nfsCollector) initStats() error { func (m *nfsCollector) initStats() error {
cmd := exec.Command(m.config.Nfsstats, `-l`, `--all`) cmd := exec.Command(m.config.Nfsstats, `-l`, `--all`)
cmd.Wait()
// Wait for cmd end
if err := cmd.Wait(); err != nil {
return fmt.Errorf("%s initStats(): %w", m.name, err)
}
buffer, err := cmd.Output() buffer, err := cmd.Output()
if err == nil { if err == nil {
for line := range strings.Lines(string(buffer)) { for _, line := range strings.Split(string(buffer), "\n") {
lf := strings.Fields(line) lf := strings.Fields(line)
if len(lf) != 5 { if len(lf) != 5 {
continue continue
@@ -77,15 +71,10 @@ func (m *nfsCollector) initStats() error {
func (m *nfsCollector) updateStats() error { func (m *nfsCollector) updateStats() error {
cmd := exec.Command(m.config.Nfsstats, `-l`, `--all`) cmd := exec.Command(m.config.Nfsstats, `-l`, `--all`)
cmd.Wait()
// Wait for cmd end
if err := cmd.Wait(); err != nil {
return fmt.Errorf("%s updateStats(): %w", m.name, err)
}
buffer, err := cmd.Output() buffer, err := cmd.Output()
if err == nil { if err == nil {
for line := range strings.Lines(string(buffer)) { for _, line := range strings.Split(string(buffer), "\n") {
lf := strings.Fields(line) lf := strings.Fields(line)
if len(lf) != 5 { if len(lf) != 5 {
continue continue
@@ -113,7 +102,8 @@ func (m *nfsCollector) MainInit(config json.RawMessage) error {
if len(config) > 0 { if len(config) > 0 {
err := json.Unmarshal(config, &m.config) err := json.Unmarshal(config, &m.config)
if err != nil { if err != nil {
return fmt.Errorf("%s Init(): failed to unmarshal JSON config: %w", m.name, err) log.Print(err.Error())
return err
} }
} }
m.meta = map[string]string{ m.meta = map[string]string{
@@ -126,12 +116,10 @@ func (m *nfsCollector) MainInit(config json.RawMessage) error {
// Check if nfsstat is in executable search path // Check if nfsstat is in executable search path
_, err := exec.LookPath(m.config.Nfsstats) _, err := exec.LookPath(m.config.Nfsstats)
if err != nil { if err != nil {
return fmt.Errorf("%s Init(): Failed to find nfsstat binary '%s': %w", m.name, m.config.Nfsstats, err) return fmt.Errorf("NfsCollector.Init(): Failed to find nfsstat binary '%s': %v", m.config.Nfsstats, err)
} }
m.data = make(map[string]NfsCollectorData) m.data = make(map[string]NfsCollectorData)
if err := m.initStats(); err != nil { m.initStats()
return fmt.Errorf("%s Init(): %w", m.name, err)
}
m.init = true m.init = true
m.parallel = true m.parallel = true
return nil return nil
@@ -143,14 +131,8 @@ func (m *nfsCollector) Read(interval time.Duration, output chan lp.CCMessage) {
} }
timestamp := time.Now() timestamp := time.Now()
if err := m.updateStats(); err != nil { m.updateStats()
cclog.ComponentError( prefix := ""
m.name,
fmt.Sprintf("Read(): updateStats() failed: %v", err),
)
return
}
var prefix string
switch m.version { switch m.version {
case "v3": case "v3":
prefix = "nfs3" prefix = "nfs3"
@@ -161,11 +143,11 @@ func (m *nfsCollector) Read(interval time.Duration, output chan lp.CCMessage) {
} }
for name, data := range m.data { for name, data := range m.data {
if slices.Contains(m.config.ExcludeMetrics, name) { if _, skip := stringArrayContains(m.config.ExcludeMetrics, name); skip {
continue continue
} }
value := data.current - data.last value := data.current - data.last
y, err := lp.NewMessage(fmt.Sprintf("%s_%s", prefix, name), m.tags, m.meta, map[string]any{"value": value}, timestamp) y, err := lp.NewMessage(fmt.Sprintf("%s_%s", prefix, name), m.tags, m.meta, map[string]interface{}{"value": value}, timestamp)
if err == nil { if err == nil {
y.AddMeta("version", m.version) y.AddMeta("version", m.version)
output <- y output <- y
@@ -188,17 +170,13 @@ type Nfs4Collector struct {
func (m *Nfs3Collector) Init(config json.RawMessage) error { func (m *Nfs3Collector) Init(config json.RawMessage) error {
m.name = "Nfs3Collector" m.name = "Nfs3Collector"
m.version = `v3` m.version = `v3`
if err := m.setup(); err != nil { m.setup()
return fmt.Errorf("%s Init(): setup() call failed: %w", m.name, err)
}
return m.MainInit(config) return m.MainInit(config)
} }
func (m *Nfs4Collector) Init(config json.RawMessage) error { func (m *Nfs4Collector) Init(config json.RawMessage) error {
m.name = "Nfs4Collector" m.name = "Nfs4Collector"
m.version = `v4` m.version = `v4`
if err := m.setup(); err != nil { m.setup()
return fmt.Errorf("%s Init(): setup() call failed: %w", m.name, err)
}
return m.MainInit(config) return m.MainInit(config)
} }

View File

@@ -12,13 +12,12 @@ import (
"fmt" "fmt"
"os" "os"
"regexp" "regexp"
"slices"
"strconv" "strconv"
"strings" "strings"
"time" "time"
cclog "github.com/ClusterCockpit/cc-lib/v2/ccLogger" cclog "github.com/ClusterCockpit/cc-lib/ccLogger"
lp "github.com/ClusterCockpit/cc-lib/v2/ccMessage" lp "github.com/ClusterCockpit/cc-lib/ccMessage"
) )
// These are the fields we read from the JSON configuration // These are the fields we read from the JSON configuration
@@ -72,7 +71,7 @@ func (m *NfsIOStatCollector) readNfsiostats() map[string]map[string]int64 {
// Is this a device line with mount point, remote target and NFS version? // Is this a device line with mount point, remote target and NFS version?
dev := resolve_regex_fields(l, deviceRegex) dev := resolve_regex_fields(l, deviceRegex)
if len(dev) > 0 { if len(dev) > 0 {
if !slices.Contains(m.config.ExcludeFilesystem, dev[m.key]) { if _, ok := stringArrayContains(m.config.ExcludeFilesystem, dev[m.key]); !ok {
current = dev current = dev
if len(current["version"]) == 0 { if len(current["version"]) == 0 {
current["version"] = "3" current["version"] = "3"
@@ -86,7 +85,7 @@ func (m *NfsIOStatCollector) readNfsiostats() map[string]map[string]int64 {
if len(bytes) > 0 { if len(bytes) > 0 {
data[current[m.key]] = make(map[string]int64) data[current[m.key]] = make(map[string]int64)
for name, sval := range bytes { for name, sval := range bytes {
if !slices.Contains(m.config.ExcludeMetrics, name) { if _, ok := stringArrayContains(m.config.ExcludeMetrics, name); !ok {
val, err := strconv.ParseInt(sval, 10, 64) val, err := strconv.ParseInt(sval, 10, 64)
if err == nil { if err == nil {
data[current[m.key]][name] = val data[current[m.key]][name] = val
@@ -103,9 +102,7 @@ func (m *NfsIOStatCollector) readNfsiostats() map[string]map[string]int64 {
func (m *NfsIOStatCollector) Init(config json.RawMessage) error { func (m *NfsIOStatCollector) Init(config json.RawMessage) error {
var err error = nil var err error = nil
m.name = "NfsIOStatCollector" m.name = "NfsIOStatCollector"
if err := m.setup(); err != nil { m.setup()
return fmt.Errorf("%s Init(): setup() call failed: %w", m.name, err)
}
m.parallel = true m.parallel = true
m.meta = map[string]string{"source": m.name, "group": "NFS", "unit": "bytes"} m.meta = map[string]string{"source": m.name, "group": "NFS", "unit": "bytes"}
m.tags = map[string]string{"type": "node"} m.tags = map[string]string{"type": "node"}
@@ -143,13 +140,7 @@ func (m *NfsIOStatCollector) Read(interval time.Duration, output chan lp.CCMessa
if old, ok := m.data[mntpoint]; ok { if old, ok := m.data[mntpoint]; ok {
for name, newVal := range values { for name, newVal := range values {
if m.config.SendAbsoluteValues { if m.config.SendAbsoluteValues {
msg, err := lp.NewMessage( msg, err := lp.NewMessage(fmt.Sprintf("nfsio_%s", name), m.tags, m.meta, map[string]interface{}{"value": newVal}, now)
"nfsio_"+name,
m.tags,
m.meta,
map[string]any{
"value": newVal},
now)
if err == nil { if err == nil {
msg.AddTag("stype", "filesystem") msg.AddTag("stype", "filesystem")
msg.AddTag("stype-id", mntpoint) msg.AddTag("stype-id", mntpoint)
@@ -158,7 +149,7 @@ func (m *NfsIOStatCollector) Read(interval time.Duration, output chan lp.CCMessa
} }
if m.config.SendDerivedValues { if m.config.SendDerivedValues {
rate := float64(newVal-old[name]) / timeDiff rate := float64(newVal-old[name]) / timeDiff
msg, err := lp.NewMessage(fmt.Sprintf("nfsio_%s_bw", name), m.tags, m.meta, map[string]any{"value": rate}, now) msg, err := lp.NewMessage(fmt.Sprintf("nfsio_%s_bw", name), m.tags, m.meta, map[string]interface{}{"value": rate}, now)
if err == nil { if err == nil {
if strings.HasPrefix(name, "page") { if strings.HasPrefix(name, "page") {
msg.AddMeta("unit", "4K_pages/s") msg.AddMeta("unit", "4K_pages/s")

View File

@@ -10,8 +10,8 @@ import (
"strings" "strings"
"time" "time"
cclog "github.com/ClusterCockpit/cc-lib/v2/ccLogger" cclog "github.com/ClusterCockpit/cc-lib/ccLogger"
lp "github.com/ClusterCockpit/cc-lib/v2/ccMessage" lp "github.com/ClusterCockpit/cc-lib/ccMessage"
) )
type NUMAStatsCollectorConfig struct { type NUMAStatsCollectorConfig struct {
@@ -72,9 +72,7 @@ func (m *NUMAStatsCollector) Init(config json.RawMessage) error {
m.name = "NUMAStatsCollector" m.name = "NUMAStatsCollector"
m.parallel = true m.parallel = true
if err := m.setup(); err != nil { m.setup()
return fmt.Errorf("%s Init(): setup() call failed: %w", m.name, err)
}
m.meta = map[string]string{ m.meta = map[string]string{
"source": m.name, "source": m.name,
"group": "NUMA", "group": "NUMA",
@@ -84,7 +82,7 @@ func (m *NUMAStatsCollector) Init(config json.RawMessage) error {
if len(config) > 0 { if len(config) > 0 {
err := json.Unmarshal(config, &m.config) err := json.Unmarshal(config, &m.config)
if err != nil { if err != nil {
return fmt.Errorf("%s Init(): unable to unmarshal numastat configuration: %w", m.name, err) return fmt.Errorf("unable to unmarshal numastat configuration: %s", err.Error())
} }
} }
@@ -93,10 +91,10 @@ func (m *NUMAStatsCollector) Init(config json.RawMessage) error {
globPattern := base + "[0-9]*" globPattern := base + "[0-9]*"
dirs, err := filepath.Glob(globPattern) dirs, err := filepath.Glob(globPattern)
if err != nil { if err != nil {
return fmt.Errorf("%s Init(): unable to glob files with pattern '%s'", m.name, globPattern) return fmt.Errorf("unable to glob files with pattern '%s'", globPattern)
} }
if dirs == nil { if dirs == nil {
return fmt.Errorf("%s Init(): unable to find any files with pattern '%s'", m.name, globPattern) return fmt.Errorf("unable to find any files with pattern '%s'", globPattern)
} }
m.topology = make([]NUMAStatsCollectorTopolgy, 0, len(dirs)) m.topology = make([]NUMAStatsCollectorTopolgy, 0, len(dirs))
for _, dir := range dirs { for _, dir := range dirs {
@@ -188,11 +186,7 @@ func (m *NUMAStatsCollector) Read(interval time.Duration, output chan lp.CCMessa
t.previousValues[key] = value t.previousValues[key] = value
} }
} }
if err := file.Close(); err != nil { file.Close()
cclog.ComponentError(
m.name,
fmt.Sprintf("Read(): Failed to close file '%s': %v", t.file, err))
}
} }
} }

View File

@@ -12,14 +12,11 @@ import (
"errors" "errors"
"fmt" "fmt"
"log" "log"
"maps"
"slices"
"strconv"
"strings" "strings"
"time" "time"
cclog "github.com/ClusterCockpit/cc-lib/v2/ccLogger" cclog "github.com/ClusterCockpit/cc-lib/ccLogger"
lp "github.com/ClusterCockpit/cc-lib/v2/ccMessage" lp "github.com/ClusterCockpit/cc-lib/ccMessage"
"github.com/NVIDIA/go-nvml/pkg/nvml" "github.com/NVIDIA/go-nvml/pkg/nvml"
) )
@@ -67,9 +64,7 @@ func (m *NvidiaCollector) Init(config json.RawMessage) error {
m.config.ProcessMigDevices = false m.config.ProcessMigDevices = false
m.config.UseUuidForMigDevices = false m.config.UseUuidForMigDevices = false
m.config.UseSliceForMigDevices = false m.config.UseSliceForMigDevices = false
if err := m.setup(); err != nil { m.setup()
return fmt.Errorf("%s Init(): setup() call failed: %w", m.name, err)
}
if len(config) > 0 { if len(config) > 0 {
err = json.Unmarshal(config, &m.config) err = json.Unmarshal(config, &m.config)
if err != nil { if err != nil {
@@ -110,11 +105,11 @@ func (m *NvidiaCollector) Init(config json.RawMessage) error {
// For all GPUs // For all GPUs
idx := 0 idx := 0
m.gpus = make([]NvidiaCollectorDevice, num_gpus) m.gpus = make([]NvidiaCollectorDevice, num_gpus)
for i := range num_gpus { for i := 0; i < num_gpus; i++ {
// Skip excluded devices by ID // Skip excluded devices by ID
str_i := strconv.Itoa(i) str_i := fmt.Sprintf("%d", i)
if slices.Contains(m.config.ExcludeDevices, str_i) { if _, skip := stringArrayContains(m.config.ExcludeDevices, str_i); skip {
cclog.ComponentDebug(m.name, "Skipping excluded device", str_i) cclog.ComponentDebug(m.name, "Skipping excluded device", str_i)
continue continue
} }
@@ -142,7 +137,7 @@ func (m *NvidiaCollector) Init(config json.RawMessage) error {
pciInfo.Device) pciInfo.Device)
// Skip excluded devices specified by PCI ID // Skip excluded devices specified by PCI ID
if slices.Contains(m.config.ExcludeDevices, pci_id) { if _, skip := stringArrayContains(m.config.ExcludeDevices, pci_id); skip {
cclog.ComponentDebug(m.name, "Skipping excluded device", pci_id) cclog.ComponentDebug(m.name, "Skipping excluded device", pci_id)
continue continue
} }
@@ -227,20 +222,18 @@ func readMemoryInfo(device *NvidiaCollectorDevice, output chan lp.CCMessage) err
var total uint64 var total uint64
var used uint64 var used uint64
var reserved uint64 = 0 var reserved uint64 = 0
v2 := false var v2 bool = false
meminfo, ret := nvml.DeviceGetMemoryInfo(device.device) meminfo, ret := nvml.DeviceGetMemoryInfo(device.device)
if ret != nvml.SUCCESS { if ret != nvml.SUCCESS {
err := errors.New(nvml.ErrorString(ret)) err := errors.New(nvml.ErrorString(ret))
return err return err
} }
// Total physical device memory (in bytes)
total = meminfo.Total total = meminfo.Total
// Sum of Reserved and Allocated device memory (in bytes)
used = meminfo.Used used = meminfo.Used
if !device.excludeMetrics["nv_fb_mem_total"] { if !device.excludeMetrics["nv_fb_mem_total"] {
t := float64(total) / (1024 * 1024) t := float64(total) / (1024 * 1024)
y, err := lp.NewMetric("nv_fb_mem_total", device.tags, device.meta, t, time.Now()) y, err := lp.NewMessage("nv_fb_mem_total", device.tags, device.meta, map[string]interface{}{"value": t}, time.Now())
if err == nil { if err == nil {
y.AddMeta("unit", "MByte") y.AddMeta("unit", "MByte")
output <- y output <- y
@@ -249,7 +242,7 @@ func readMemoryInfo(device *NvidiaCollectorDevice, output chan lp.CCMessage) err
if !device.excludeMetrics["nv_fb_mem_used"] { if !device.excludeMetrics["nv_fb_mem_used"] {
f := float64(used) / (1024 * 1024) f := float64(used) / (1024 * 1024)
y, err := lp.NewMetric("nv_fb_mem_used", device.tags, device.meta, f, time.Now()) y, err := lp.NewMessage("nv_fb_mem_used", device.tags, device.meta, map[string]interface{}{"value": f}, time.Now())
if err == nil { if err == nil {
y.AddMeta("unit", "MByte") y.AddMeta("unit", "MByte")
output <- y output <- y
@@ -258,7 +251,7 @@ func readMemoryInfo(device *NvidiaCollectorDevice, output chan lp.CCMessage) err
if v2 && !device.excludeMetrics["nv_fb_mem_reserved"] { if v2 && !device.excludeMetrics["nv_fb_mem_reserved"] {
r := float64(reserved) / (1024 * 1024) r := float64(reserved) / (1024 * 1024)
y, err := lp.NewMetric("nv_fb_mem_reserved", device.tags, device.meta, r, time.Now()) y, err := lp.NewMessage("nv_fb_mem_reserved", device.tags, device.meta, map[string]interface{}{"value": r}, time.Now())
if err == nil { if err == nil {
y.AddMeta("unit", "MByte") y.AddMeta("unit", "MByte")
output <- y output <- y
@@ -277,7 +270,7 @@ func readBarMemoryInfo(device *NvidiaCollectorDevice, output chan lp.CCMessage)
} }
if !device.excludeMetrics["nv_bar1_mem_total"] { if !device.excludeMetrics["nv_bar1_mem_total"] {
t := float64(meminfo.Bar1Total) / (1024 * 1024) t := float64(meminfo.Bar1Total) / (1024 * 1024)
y, err := lp.NewMetric("nv_bar1_mem_total", device.tags, device.meta, t, time.Now()) y, err := lp.NewMessage("nv_bar1_mem_total", device.tags, device.meta, map[string]interface{}{"value": t}, time.Now())
if err == nil { if err == nil {
y.AddMeta("unit", "MByte") y.AddMeta("unit", "MByte")
output <- y output <- y
@@ -285,7 +278,7 @@ func readBarMemoryInfo(device *NvidiaCollectorDevice, output chan lp.CCMessage)
} }
if !device.excludeMetrics["nv_bar1_mem_used"] { if !device.excludeMetrics["nv_bar1_mem_used"] {
t := float64(meminfo.Bar1Used) / (1024 * 1024) t := float64(meminfo.Bar1Used) / (1024 * 1024)
y, err := lp.NewMetric("nv_bar1_mem_used", device.tags, device.meta, t, time.Now()) y, err := lp.NewMessage("nv_bar1_mem_used", device.tags, device.meta, map[string]interface{}{"value": t}, time.Now())
if err == nil { if err == nil {
y.AddMeta("unit", "MByte") y.AddMeta("unit", "MByte")
output <- y output <- y
@@ -319,14 +312,14 @@ func readUtilization(device *NvidiaCollectorDevice, output chan lp.CCMessage) er
util, ret := nvml.DeviceGetUtilizationRates(device.device) util, ret := nvml.DeviceGetUtilizationRates(device.device)
if ret == nvml.SUCCESS { if ret == nvml.SUCCESS {
if !device.excludeMetrics["nv_util"] { if !device.excludeMetrics["nv_util"] {
y, err := lp.NewMetric("nv_util", device.tags, device.meta, float64(util.Gpu), time.Now()) y, err := lp.NewMessage("nv_util", device.tags, device.meta, map[string]interface{}{"value": float64(util.Gpu)}, time.Now())
if err == nil { if err == nil {
y.AddMeta("unit", "%") y.AddMeta("unit", "%")
output <- y output <- y
} }
} }
if !device.excludeMetrics["nv_mem_util"] { if !device.excludeMetrics["nv_mem_util"] {
y, err := lp.NewMetric("nv_mem_util", device.tags, device.meta, float64(util.Memory), time.Now()) y, err := lp.NewMessage("nv_mem_util", device.tags, device.meta, map[string]interface{}{"value": float64(util.Memory)}, time.Now())
if err == nil { if err == nil {
y.AddMeta("unit", "%") y.AddMeta("unit", "%")
output <- y output <- y
@@ -346,7 +339,7 @@ func readTemp(device *NvidiaCollectorDevice, output chan lp.CCMessage) error {
// * NVML_TEMPERATURE_COUNT // * NVML_TEMPERATURE_COUNT
temp, ret := nvml.DeviceGetTemperature(device.device, nvml.TEMPERATURE_GPU) temp, ret := nvml.DeviceGetTemperature(device.device, nvml.TEMPERATURE_GPU)
if ret == nvml.SUCCESS { if ret == nvml.SUCCESS {
y, err := lp.NewMetric("nv_temp", device.tags, device.meta, float64(temp), time.Now()) y, err := lp.NewMessage("nv_temp", device.tags, device.meta, map[string]interface{}{"value": float64(temp)}, time.Now())
if err == nil { if err == nil {
y.AddMeta("unit", "degC") y.AddMeta("unit", "degC")
output <- y output <- y
@@ -369,7 +362,7 @@ func readFan(device *NvidiaCollectorDevice, output chan lp.CCMessage) error {
// This value may exceed 100% in certain cases. // This value may exceed 100% in certain cases.
fan, ret := nvml.DeviceGetFanSpeed(device.device) fan, ret := nvml.DeviceGetFanSpeed(device.device)
if ret == nvml.SUCCESS { if ret == nvml.SUCCESS {
y, err := lp.NewMetric("nv_fan", device.tags, device.meta, float64(fan), time.Now()) y, err := lp.NewMessage("nv_fan", device.tags, device.meta, map[string]interface{}{"value": float64(fan)}, time.Now())
if err == nil { if err == nil {
y.AddMeta("unit", "%") y.AddMeta("unit", "%")
output <- y output <- y
@@ -379,6 +372,27 @@ func readFan(device *NvidiaCollectorDevice, output chan lp.CCMessage) error {
return nil return nil
} }
// func readFans(device *NvidiaCollectorDevice, output chan lp.CCMessage) error {
// if !device.excludeMetrics["nv_fan"] {
// numFans, ret := nvml.DeviceGetNumFans(device.device)
// if ret == nvml.SUCCESS {
// for i := 0; i < numFans; i++ {
// fan, ret := nvml.DeviceGetFanSpeed_v2(device.device, i)
// if ret == nvml.SUCCESS {
// y, err := lp.NewMessage("nv_fan", device.tags, device.meta, map[string]interface{}{"value": float64(fan)}, time.Now())
// if err == nil {
// y.AddMeta("unit", "%")
// y.AddTag("stype", "fan")
// y.AddTag("stype-id", fmt.Sprintf("%d", i))
// output <- y
// }
// }
// }
// }
// }
// return nil
// }
func readEccMode(device *NvidiaCollectorDevice, output chan lp.CCMessage) error { func readEccMode(device *NvidiaCollectorDevice, output chan lp.CCMessage) error {
if !device.excludeMetrics["nv_ecc_mode"] { if !device.excludeMetrics["nv_ecc_mode"] {
// Retrieves the current and pending ECC modes for the device. // Retrieves the current and pending ECC modes for the device.
@@ -389,23 +403,22 @@ func readEccMode(device *NvidiaCollectorDevice, output chan lp.CCMessage) error
// Changing ECC modes requires a reboot. // Changing ECC modes requires a reboot.
// The "pending" ECC mode refers to the target mode following the next reboot. // The "pending" ECC mode refers to the target mode following the next reboot.
_, ecc_pend, ret := nvml.DeviceGetEccMode(device.device) _, ecc_pend, ret := nvml.DeviceGetEccMode(device.device)
switch ret { if ret == nvml.SUCCESS {
case nvml.SUCCESS:
var y lp.CCMessage var y lp.CCMessage
var err error var err error
switch ecc_pend { switch ecc_pend {
case nvml.FEATURE_DISABLED: case nvml.FEATURE_DISABLED:
y, err = lp.NewMetric("nv_ecc_mode", device.tags, device.meta, "OFF", time.Now()) y, err = lp.NewMessage("nv_ecc_mode", device.tags, device.meta, map[string]interface{}{"value": "OFF"}, time.Now())
case nvml.FEATURE_ENABLED: case nvml.FEATURE_ENABLED:
y, err = lp.NewMetric("nv_ecc_mode", device.tags, device.meta, "ON", time.Now()) y, err = lp.NewMessage("nv_ecc_mode", device.tags, device.meta, map[string]interface{}{"value": "ON"}, time.Now())
default: default:
y, err = lp.NewMetric("nv_ecc_mode", device.tags, device.meta, "UNKNOWN", time.Now()) y, err = lp.NewMessage("nv_ecc_mode", device.tags, device.meta, map[string]interface{}{"value": "UNKNOWN"}, time.Now())
} }
if err == nil { if err == nil {
output <- y output <- y
} }
case nvml.ERROR_NOT_SUPPORTED: } else if ret == nvml.ERROR_NOT_SUPPORTED {
y, err := lp.NewMetric("nv_ecc_mode", device.tags, device.meta, "N/A", time.Now()) y, err := lp.NewMessage("nv_ecc_mode", device.tags, device.meta, map[string]interface{}{"value": "N/A"}, time.Now())
if err == nil { if err == nil {
output <- y output <- y
} }
@@ -425,7 +438,7 @@ func readPerfState(device *NvidiaCollectorDevice, output chan lp.CCMessage) erro
// 32: Unknown performance state. // 32: Unknown performance state.
pState, ret := nvml.DeviceGetPerformanceState(device.device) pState, ret := nvml.DeviceGetPerformanceState(device.device)
if ret == nvml.SUCCESS { if ret == nvml.SUCCESS {
y, err := lp.NewMetric("nv_perf_state", device.tags, device.meta, fmt.Sprintf("P%d", int(pState)), time.Now()) y, err := lp.NewMessage("nv_perf_state", device.tags, device.meta, map[string]interface{}{"value": fmt.Sprintf("P%d", int(pState))}, time.Now())
if err == nil { if err == nil {
output <- y output <- y
} }
@@ -451,7 +464,7 @@ func readPowerUsage(device *NvidiaCollectorDevice, output chan lp.CCMessage) err
if mode == nvml.FEATURE_ENABLED { if mode == nvml.FEATURE_ENABLED {
power, ret := nvml.DeviceGetPowerUsage(device.device) power, ret := nvml.DeviceGetPowerUsage(device.device)
if ret == nvml.SUCCESS { if ret == nvml.SUCCESS {
y, err := lp.NewMetric("nv_power_usage", device.tags, device.meta, float64(power)/1000, time.Now()) y, err := lp.NewMessage("nv_power_usage", device.tags, device.meta, map[string]interface{}{"value": float64(power) / 1000}, time.Now())
if err == nil { if err == nil {
y.AddMeta("unit", "watts") y.AddMeta("unit", "watts")
output <- y output <- y
@@ -477,12 +490,7 @@ func readEnergyConsumption(device *NvidiaCollectorDevice, output chan lp.CCMessa
if ret == nvml.SUCCESS { if ret == nvml.SUCCESS {
if device.lastEnergyReading != 0 { if device.lastEnergyReading != 0 {
if !device.excludeMetrics["nv_energy"] { if !device.excludeMetrics["nv_energy"] {
y, err := lp.NewMetric( y, err := lp.NewMetric("nv_energy", device.tags, device.meta, (energy-device.lastEnergyReading)/1000, now)
"nv_energy",
device.tags,
device.meta,
(energy-device.lastEnergyReading)/1000,
now)
if err == nil { if err == nil {
y.AddMeta("unit", "Joules") y.AddMeta("unit", "Joules")
output <- y output <- y
@@ -524,7 +532,7 @@ func readClocks(device *NvidiaCollectorDevice, output chan lp.CCMessage) error {
if !device.excludeMetrics["nv_graphics_clock"] { if !device.excludeMetrics["nv_graphics_clock"] {
graphicsClock, ret := nvml.DeviceGetClockInfo(device.device, nvml.CLOCK_GRAPHICS) graphicsClock, ret := nvml.DeviceGetClockInfo(device.device, nvml.CLOCK_GRAPHICS)
if ret == nvml.SUCCESS { if ret == nvml.SUCCESS {
y, err := lp.NewMetric("nv_graphics_clock", device.tags, device.meta, float64(graphicsClock), time.Now()) y, err := lp.NewMessage("nv_graphics_clock", device.tags, device.meta, map[string]interface{}{"value": float64(graphicsClock)}, time.Now())
if err == nil { if err == nil {
y.AddMeta("unit", "MHz") y.AddMeta("unit", "MHz")
output <- y output <- y
@@ -535,7 +543,7 @@ func readClocks(device *NvidiaCollectorDevice, output chan lp.CCMessage) error {
if !device.excludeMetrics["nv_sm_clock"] { if !device.excludeMetrics["nv_sm_clock"] {
smCock, ret := nvml.DeviceGetClockInfo(device.device, nvml.CLOCK_SM) smCock, ret := nvml.DeviceGetClockInfo(device.device, nvml.CLOCK_SM)
if ret == nvml.SUCCESS { if ret == nvml.SUCCESS {
y, err := lp.NewMetric("nv_sm_clock", device.tags, device.meta, float64(smCock), time.Now()) y, err := lp.NewMessage("nv_sm_clock", device.tags, device.meta, map[string]interface{}{"value": float64(smCock)}, time.Now())
if err == nil { if err == nil {
y.AddMeta("unit", "MHz") y.AddMeta("unit", "MHz")
output <- y output <- y
@@ -546,7 +554,7 @@ func readClocks(device *NvidiaCollectorDevice, output chan lp.CCMessage) error {
if !device.excludeMetrics["nv_mem_clock"] { if !device.excludeMetrics["nv_mem_clock"] {
memClock, ret := nvml.DeviceGetClockInfo(device.device, nvml.CLOCK_MEM) memClock, ret := nvml.DeviceGetClockInfo(device.device, nvml.CLOCK_MEM)
if ret == nvml.SUCCESS { if ret == nvml.SUCCESS {
y, err := lp.NewMetric("nv_mem_clock", device.tags, device.meta, float64(memClock), time.Now()) y, err := lp.NewMessage("nv_mem_clock", device.tags, device.meta, map[string]interface{}{"value": float64(memClock)}, time.Now())
if err == nil { if err == nil {
y.AddMeta("unit", "MHz") y.AddMeta("unit", "MHz")
output <- y output <- y
@@ -556,7 +564,7 @@ func readClocks(device *NvidiaCollectorDevice, output chan lp.CCMessage) error {
if !device.excludeMetrics["nv_video_clock"] { if !device.excludeMetrics["nv_video_clock"] {
memClock, ret := nvml.DeviceGetClockInfo(device.device, nvml.CLOCK_VIDEO) memClock, ret := nvml.DeviceGetClockInfo(device.device, nvml.CLOCK_VIDEO)
if ret == nvml.SUCCESS { if ret == nvml.SUCCESS {
y, err := lp.NewMetric("nv_video_clock", device.tags, device.meta, float64(memClock), time.Now()) y, err := lp.NewMessage("nv_video_clock", device.tags, device.meta, map[string]interface{}{"value": float64(memClock)}, time.Now())
if err == nil { if err == nil {
y.AddMeta("unit", "MHz") y.AddMeta("unit", "MHz")
output <- y output <- y
@@ -637,7 +645,7 @@ func readEccErrors(device *NvidiaCollectorDevice, output chan lp.CCMessage) erro
// i.e. the total set of errors across the entire device. // i.e. the total set of errors across the entire device.
ecc_db, ret := nvml.DeviceGetTotalEccErrors(device.device, nvml.MEMORY_ERROR_TYPE_UNCORRECTED, nvml.AGGREGATE_ECC) ecc_db, ret := nvml.DeviceGetTotalEccErrors(device.device, nvml.MEMORY_ERROR_TYPE_UNCORRECTED, nvml.AGGREGATE_ECC)
if ret == nvml.SUCCESS { if ret == nvml.SUCCESS {
y, err := lp.NewMetric("nv_ecc_uncorrected_error", device.tags, device.meta, float64(ecc_db), time.Now()) y, err := lp.NewMessage("nv_ecc_uncorrected_error", device.tags, device.meta, map[string]interface{}{"value": float64(ecc_db)}, time.Now())
if err == nil { if err == nil {
output <- y output <- y
} }
@@ -646,7 +654,7 @@ func readEccErrors(device *NvidiaCollectorDevice, output chan lp.CCMessage) erro
if !device.excludeMetrics["nv_ecc_corrected_error"] { if !device.excludeMetrics["nv_ecc_corrected_error"] {
ecc_sb, ret := nvml.DeviceGetTotalEccErrors(device.device, nvml.MEMORY_ERROR_TYPE_CORRECTED, nvml.AGGREGATE_ECC) ecc_sb, ret := nvml.DeviceGetTotalEccErrors(device.device, nvml.MEMORY_ERROR_TYPE_CORRECTED, nvml.AGGREGATE_ECC)
if ret == nvml.SUCCESS { if ret == nvml.SUCCESS {
y, err := lp.NewMetric("nv_ecc_corrected_error", device.tags, device.meta, float64(ecc_sb), time.Now()) y, err := lp.NewMessage("nv_ecc_corrected_error", device.tags, device.meta, map[string]interface{}{"value": float64(ecc_sb)}, time.Now())
if err == nil { if err == nil {
output <- y output <- y
} }
@@ -665,7 +673,7 @@ func readPowerLimit(device *NvidiaCollectorDevice, output chan lp.CCMessage) err
// If the card's total power draw reaches this limit the power management algorithm kicks in. // If the card's total power draw reaches this limit the power management algorithm kicks in.
pwr_limit, ret := nvml.DeviceGetPowerManagementLimit(device.device) pwr_limit, ret := nvml.DeviceGetPowerManagementLimit(device.device)
if ret == nvml.SUCCESS { if ret == nvml.SUCCESS {
y, err := lp.NewMetric("nv_power_max_limit", device.tags, device.meta, float64(pwr_limit)/1000, time.Now()) y, err := lp.NewMessage("nv_power_max_limit", device.tags, device.meta, map[string]interface{}{"value": float64(pwr_limit) / 1000}, time.Now())
if err == nil { if err == nil {
y.AddMeta("unit", "watts") y.AddMeta("unit", "watts")
output <- y output <- y
@@ -692,7 +700,7 @@ func readEncUtilization(device *NvidiaCollectorDevice, output chan lp.CCMessage)
// Note: On MIG-enabled GPUs, querying encoder utilization is not currently supported. // Note: On MIG-enabled GPUs, querying encoder utilization is not currently supported.
enc_util, _, ret := nvml.DeviceGetEncoderUtilization(device.device) enc_util, _, ret := nvml.DeviceGetEncoderUtilization(device.device)
if ret == nvml.SUCCESS { if ret == nvml.SUCCESS {
y, err := lp.NewMetric("nv_encoder_util", device.tags, device.meta, float64(enc_util), time.Now()) y, err := lp.NewMessage("nv_encoder_util", device.tags, device.meta, map[string]interface{}{"value": float64(enc_util)}, time.Now())
if err == nil { if err == nil {
y.AddMeta("unit", "%") y.AddMeta("unit", "%")
output <- y output <- y
@@ -719,7 +727,7 @@ func readDecUtilization(device *NvidiaCollectorDevice, output chan lp.CCMessage)
// Note: On MIG-enabled GPUs, querying encoder utilization is not currently supported. // Note: On MIG-enabled GPUs, querying encoder utilization is not currently supported.
dec_util, _, ret := nvml.DeviceGetDecoderUtilization(device.device) dec_util, _, ret := nvml.DeviceGetDecoderUtilization(device.device)
if ret == nvml.SUCCESS { if ret == nvml.SUCCESS {
y, err := lp.NewMetric("nv_decoder_util", device.tags, device.meta, float64(dec_util), time.Now()) y, err := lp.NewMessage("nv_decoder_util", device.tags, device.meta, map[string]interface{}{"value": float64(dec_util)}, time.Now())
if err == nil { if err == nil {
y.AddMeta("unit", "%") y.AddMeta("unit", "%")
output <- y output <- y
@@ -746,33 +754,33 @@ func readRemappedRows(device *NvidiaCollectorDevice, output chan lp.CCMessage) e
corrected, uncorrected, pending, failure, ret := nvml.DeviceGetRemappedRows(device.device) corrected, uncorrected, pending, failure, ret := nvml.DeviceGetRemappedRows(device.device)
if ret == nvml.SUCCESS { if ret == nvml.SUCCESS {
if !device.excludeMetrics["nv_remapped_rows_corrected"] { if !device.excludeMetrics["nv_remapped_rows_corrected"] {
y, err := lp.NewMetric("nv_remapped_rows_corrected", device.tags, device.meta, float64(corrected), time.Now()) y, err := lp.NewMessage("nv_remapped_rows_corrected", device.tags, device.meta, map[string]interface{}{"value": float64(corrected)}, time.Now())
if err == nil { if err == nil {
output <- y output <- y
} }
} }
if !device.excludeMetrics["nv_remapped_rows_uncorrected"] { if !device.excludeMetrics["nv_remapped_rows_uncorrected"] {
y, err := lp.NewMetric("nv_remapped_rows_corrected", device.tags, device.meta, float64(uncorrected), time.Now()) y, err := lp.NewMessage("nv_remapped_rows_corrected", device.tags, device.meta, map[string]interface{}{"value": float64(uncorrected)}, time.Now())
if err == nil { if err == nil {
output <- y output <- y
} }
} }
if !device.excludeMetrics["nv_remapped_rows_pending"] { if !device.excludeMetrics["nv_remapped_rows_pending"] {
p := 0 var p int = 0
if pending { if pending {
p = 1 p = 1
} }
y, err := lp.NewMetric("nv_remapped_rows_pending", device.tags, device.meta, p, time.Now()) y, err := lp.NewMessage("nv_remapped_rows_pending", device.tags, device.meta, map[string]interface{}{"value": p}, time.Now())
if err == nil { if err == nil {
output <- y output <- y
} }
} }
if !device.excludeMetrics["nv_remapped_rows_failure"] { if !device.excludeMetrics["nv_remapped_rows_failure"] {
f := 0 var f int = 0
if failure { if failure {
f = 1 f = 1
} }
y, err := lp.NewMetric("nv_remapped_rows_failure", device.tags, device.meta, f, time.Now()) y, err := lp.NewMessage("nv_remapped_rows_failure", device.tags, device.meta, map[string]interface{}{"value": f}, time.Now())
if err == nil { if err == nil {
output <- y output <- y
} }
@@ -806,7 +814,7 @@ func readProcessCounts(device *NvidiaCollectorDevice, output chan lp.CCMessage)
// Querying per-instance information using MIG device handles is not supported if the device is in vGPU Host virtualization mode. // Querying per-instance information using MIG device handles is not supported if the device is in vGPU Host virtualization mode.
procList, ret := nvml.DeviceGetComputeRunningProcesses(device.device) procList, ret := nvml.DeviceGetComputeRunningProcesses(device.device)
if ret == nvml.SUCCESS { if ret == nvml.SUCCESS {
y, err := lp.NewMetric("nv_compute_processes", device.tags, device.meta, len(procList), time.Now()) y, err := lp.NewMessage("nv_compute_processes", device.tags, device.meta, map[string]interface{}{"value": len(procList)}, time.Now())
if err == nil { if err == nil {
output <- y output <- y
} }
@@ -835,7 +843,7 @@ func readProcessCounts(device *NvidiaCollectorDevice, output chan lp.CCMessage)
// Querying per-instance information using MIG device handles is not supported if the device is in vGPU Host virtualization mode. // Querying per-instance information using MIG device handles is not supported if the device is in vGPU Host virtualization mode.
procList, ret := nvml.DeviceGetGraphicsRunningProcesses(device.device) procList, ret := nvml.DeviceGetGraphicsRunningProcesses(device.device)
if ret == nvml.SUCCESS { if ret == nvml.SUCCESS {
y, err := lp.NewMetric("nv_graphics_processes", device.tags, device.meta, len(procList), time.Now()) y, err := lp.NewMessage("nv_graphics_processes", device.tags, device.meta, map[string]interface{}{"value": len(procList)}, time.Now())
if err == nil { if err == nil {
output <- y output <- y
} }
@@ -865,7 +873,7 @@ func readProcessCounts(device *NvidiaCollectorDevice, output chan lp.CCMessage)
// // Querying per-instance information using MIG device handles is not supported if the device is in vGPU Host virtualization mode. // // Querying per-instance information using MIG device handles is not supported if the device is in vGPU Host virtualization mode.
// procList, ret := nvml.DeviceGetMPSComputeRunningProcesses(device.device) // procList, ret := nvml.DeviceGetMPSComputeRunningProcesses(device.device)
// if ret == nvml.SUCCESS { // if ret == nvml.SUCCESS {
// y, err := lp.NewMetric("nv_mps_compute_processes", device.tags, device.meta, len(procList), time.Now()) // y, err := lp.NewMessage("nv_mps_compute_processes", device.tags, device.meta, map[string]interface{}{"value": len(procList)}, time.Now())
// if err == nil { // if err == nil {
// output <- y // output <- y
// } // }
@@ -893,7 +901,7 @@ func readViolationStats(device *NvidiaCollectorDevice, output chan lp.CCMessage)
violTime, ret = nvml.DeviceGetViolationStatus(device.device, nvml.PERF_POLICY_POWER) violTime, ret = nvml.DeviceGetViolationStatus(device.device, nvml.PERF_POLICY_POWER)
if ret == nvml.SUCCESS { if ret == nvml.SUCCESS {
t := float64(violTime.ViolationTime) * 1e-9 t := float64(violTime.ViolationTime) * 1e-9
y, err := lp.NewMetric("nv_violation_power", device.tags, device.meta, t, time.Now()) y, err := lp.NewMessage("nv_violation_power", device.tags, device.meta, map[string]interface{}{"value": t}, time.Now())
if err == nil { if err == nil {
y.AddMeta("unit", "sec") y.AddMeta("unit", "sec")
output <- y output <- y
@@ -905,7 +913,7 @@ func readViolationStats(device *NvidiaCollectorDevice, output chan lp.CCMessage)
violTime, ret = nvml.DeviceGetViolationStatus(device.device, nvml.PERF_POLICY_THERMAL) violTime, ret = nvml.DeviceGetViolationStatus(device.device, nvml.PERF_POLICY_THERMAL)
if ret == nvml.SUCCESS { if ret == nvml.SUCCESS {
t := float64(violTime.ViolationTime) * 1e-9 t := float64(violTime.ViolationTime) * 1e-9
y, err := lp.NewMetric("nv_violation_thermal", device.tags, device.meta, t, time.Now()) y, err := lp.NewMessage("nv_violation_thermal", device.tags, device.meta, map[string]interface{}{"value": t}, time.Now())
if err == nil { if err == nil {
y.AddMeta("unit", "sec") y.AddMeta("unit", "sec")
output <- y output <- y
@@ -917,7 +925,7 @@ func readViolationStats(device *NvidiaCollectorDevice, output chan lp.CCMessage)
violTime, ret = nvml.DeviceGetViolationStatus(device.device, nvml.PERF_POLICY_SYNC_BOOST) violTime, ret = nvml.DeviceGetViolationStatus(device.device, nvml.PERF_POLICY_SYNC_BOOST)
if ret == nvml.SUCCESS { if ret == nvml.SUCCESS {
t := float64(violTime.ViolationTime) * 1e-9 t := float64(violTime.ViolationTime) * 1e-9
y, err := lp.NewMetric("nv_violation_sync_boost", device.tags, device.meta, t, time.Now()) y, err := lp.NewMessage("nv_violation_sync_boost", device.tags, device.meta, map[string]interface{}{"value": t}, time.Now())
if err == nil { if err == nil {
y.AddMeta("unit", "sec") y.AddMeta("unit", "sec")
output <- y output <- y
@@ -929,7 +937,7 @@ func readViolationStats(device *NvidiaCollectorDevice, output chan lp.CCMessage)
violTime, ret = nvml.DeviceGetViolationStatus(device.device, nvml.PERF_POLICY_BOARD_LIMIT) violTime, ret = nvml.DeviceGetViolationStatus(device.device, nvml.PERF_POLICY_BOARD_LIMIT)
if ret == nvml.SUCCESS { if ret == nvml.SUCCESS {
t := float64(violTime.ViolationTime) * 1e-9 t := float64(violTime.ViolationTime) * 1e-9
y, err := lp.NewMetric("nv_violation_board_limit", device.tags, device.meta, t, time.Now()) y, err := lp.NewMessage("nv_violation_board_limit", device.tags, device.meta, map[string]interface{}{"value": t}, time.Now())
if err == nil { if err == nil {
y.AddMeta("unit", "sec") y.AddMeta("unit", "sec")
output <- y output <- y
@@ -941,7 +949,7 @@ func readViolationStats(device *NvidiaCollectorDevice, output chan lp.CCMessage)
violTime, ret = nvml.DeviceGetViolationStatus(device.device, nvml.PERF_POLICY_LOW_UTILIZATION) violTime, ret = nvml.DeviceGetViolationStatus(device.device, nvml.PERF_POLICY_LOW_UTILIZATION)
if ret == nvml.SUCCESS { if ret == nvml.SUCCESS {
t := float64(violTime.ViolationTime) * 1e-9 t := float64(violTime.ViolationTime) * 1e-9
y, err := lp.NewMetric("nv_violation_low_util", device.tags, device.meta, t, time.Now()) y, err := lp.NewMessage("nv_violation_low_util", device.tags, device.meta, map[string]interface{}{"value": t}, time.Now())
if err == nil { if err == nil {
y.AddMeta("unit", "sec") y.AddMeta("unit", "sec")
output <- y output <- y
@@ -953,7 +961,7 @@ func readViolationStats(device *NvidiaCollectorDevice, output chan lp.CCMessage)
violTime, ret = nvml.DeviceGetViolationStatus(device.device, nvml.PERF_POLICY_RELIABILITY) violTime, ret = nvml.DeviceGetViolationStatus(device.device, nvml.PERF_POLICY_RELIABILITY)
if ret == nvml.SUCCESS { if ret == nvml.SUCCESS {
t := float64(violTime.ViolationTime) * 1e-9 t := float64(violTime.ViolationTime) * 1e-9
y, err := lp.NewMetric("nv_violation_reliability", device.tags, device.meta, t, time.Now()) y, err := lp.NewMessage("nv_violation_reliability", device.tags, device.meta, map[string]interface{}{"value": t}, time.Now())
if err == nil { if err == nil {
y.AddMeta("unit", "sec") y.AddMeta("unit", "sec")
output <- y output <- y
@@ -965,7 +973,7 @@ func readViolationStats(device *NvidiaCollectorDevice, output chan lp.CCMessage)
violTime, ret = nvml.DeviceGetViolationStatus(device.device, nvml.PERF_POLICY_TOTAL_APP_CLOCKS) violTime, ret = nvml.DeviceGetViolationStatus(device.device, nvml.PERF_POLICY_TOTAL_APP_CLOCKS)
if ret == nvml.SUCCESS { if ret == nvml.SUCCESS {
t := float64(violTime.ViolationTime) * 1e-9 t := float64(violTime.ViolationTime) * 1e-9
y, err := lp.NewMetric("nv_violation_below_app_clock", device.tags, device.meta, t, time.Now()) y, err := lp.NewMessage("nv_violation_below_app_clock", device.tags, device.meta, map[string]interface{}{"value": t}, time.Now())
if err == nil { if err == nil {
y.AddMeta("unit", "sec") y.AddMeta("unit", "sec")
output <- y output <- y
@@ -977,7 +985,7 @@ func readViolationStats(device *NvidiaCollectorDevice, output chan lp.CCMessage)
violTime, ret = nvml.DeviceGetViolationStatus(device.device, nvml.PERF_POLICY_TOTAL_BASE_CLOCKS) violTime, ret = nvml.DeviceGetViolationStatus(device.device, nvml.PERF_POLICY_TOTAL_BASE_CLOCKS)
if ret == nvml.SUCCESS { if ret == nvml.SUCCESS {
t := float64(violTime.ViolationTime) * 1e-9 t := float64(violTime.ViolationTime) * 1e-9
y, err := lp.NewMetric("nv_violation_below_base_clock", device.tags, device.meta, t, time.Now()) y, err := lp.NewMessage("nv_violation_below_base_clock", device.tags, device.meta, map[string]interface{}{"value": t}, time.Now())
if err == nil { if err == nil {
y.AddMeta("unit", "sec") y.AddMeta("unit", "sec")
output <- y output <- y
@@ -1000,19 +1008,19 @@ func readNVLinkStats(device *NvidiaCollectorDevice, output chan lp.CCMessage) er
var aggregate_recovery_errors uint64 = 0 var aggregate_recovery_errors uint64 = 0
var aggregate_crc_flit_errors uint64 = 0 var aggregate_crc_flit_errors uint64 = 0
for i := range nvml.NVLINK_MAX_LINKS { for i := 0; i < nvml.NVLINK_MAX_LINKS; i++ {
state, ret := nvml.DeviceGetNvLinkState(device.device, i) state, ret := nvml.DeviceGetNvLinkState(device.device, i)
if ret == nvml.SUCCESS { if ret == nvml.SUCCESS {
if state == nvml.FEATURE_ENABLED { if state == nvml.FEATURE_ENABLED {
if !device.excludeMetrics["nv_nvlink_crc_errors"] { if !device.excludeMetrics["nv_nvlink_crc_errors"] {
// Data link receive data CRC error counter // Data link receive data CRC error counter
count, ret := nvml.DeviceGetNvLinkErrorCounter(device.device, i, nvml.NVLINK_ERROR_DL_CRC_DATA) count, ret := nvml.DeviceGetNvLinkErrorCounter(device.device, i, nvml.NVLINK_ERROR_DL_CRC_DATA)
aggregate_crc_errors += count aggregate_crc_errors = aggregate_crc_errors + count
if ret == nvml.SUCCESS { if ret == nvml.SUCCESS {
y, err := lp.NewMetric("nv_nvlink_crc_errors", device.tags, device.meta, count, time.Now()) y, err := lp.NewMessage("nv_nvlink_crc_errors", device.tags, device.meta, map[string]interface{}{"value": count}, time.Now())
if err == nil { if err == nil {
y.AddTag("stype", "nvlink") y.AddTag("stype", "nvlink")
y.AddTag("stype-id", strconv.Itoa(i)) y.AddTag("stype-id", fmt.Sprintf("%d", i))
output <- y output <- y
} }
} }
@@ -1020,12 +1028,12 @@ func readNVLinkStats(device *NvidiaCollectorDevice, output chan lp.CCMessage) er
if !device.excludeMetrics["nv_nvlink_ecc_errors"] { if !device.excludeMetrics["nv_nvlink_ecc_errors"] {
// Data link receive data ECC error counter // Data link receive data ECC error counter
count, ret := nvml.DeviceGetNvLinkErrorCounter(device.device, i, nvml.NVLINK_ERROR_DL_ECC_DATA) count, ret := nvml.DeviceGetNvLinkErrorCounter(device.device, i, nvml.NVLINK_ERROR_DL_ECC_DATA)
aggregate_ecc_errors += count aggregate_ecc_errors = aggregate_ecc_errors + count
if ret == nvml.SUCCESS { if ret == nvml.SUCCESS {
y, err := lp.NewMetric("nv_nvlink_ecc_errors", device.tags, device.meta, count, time.Now()) y, err := lp.NewMessage("nv_nvlink_ecc_errors", device.tags, device.meta, map[string]interface{}{"value": count}, time.Now())
if err == nil { if err == nil {
y.AddTag("stype", "nvlink") y.AddTag("stype", "nvlink")
y.AddTag("stype-id", strconv.Itoa(i)) y.AddTag("stype-id", fmt.Sprintf("%d", i))
output <- y output <- y
} }
} }
@@ -1033,12 +1041,12 @@ func readNVLinkStats(device *NvidiaCollectorDevice, output chan lp.CCMessage) er
if !device.excludeMetrics["nv_nvlink_replay_errors"] { if !device.excludeMetrics["nv_nvlink_replay_errors"] {
// Data link transmit replay error counter // Data link transmit replay error counter
count, ret := nvml.DeviceGetNvLinkErrorCounter(device.device, i, nvml.NVLINK_ERROR_DL_REPLAY) count, ret := nvml.DeviceGetNvLinkErrorCounter(device.device, i, nvml.NVLINK_ERROR_DL_REPLAY)
aggregate_replay_errors += count aggregate_replay_errors = aggregate_replay_errors + count
if ret == nvml.SUCCESS { if ret == nvml.SUCCESS {
y, err := lp.NewMetric("nv_nvlink_replay_errors", device.tags, device.meta, count, time.Now()) y, err := lp.NewMessage("nv_nvlink_replay_errors", device.tags, device.meta, map[string]interface{}{"value": count}, time.Now())
if err == nil { if err == nil {
y.AddTag("stype", "nvlink") y.AddTag("stype", "nvlink")
y.AddTag("stype-id", strconv.Itoa(i)) y.AddTag("stype-id", fmt.Sprintf("%d", i))
output <- y output <- y
} }
} }
@@ -1046,12 +1054,12 @@ func readNVLinkStats(device *NvidiaCollectorDevice, output chan lp.CCMessage) er
if !device.excludeMetrics["nv_nvlink_recovery_errors"] { if !device.excludeMetrics["nv_nvlink_recovery_errors"] {
// Data link transmit recovery error counter // Data link transmit recovery error counter
count, ret := nvml.DeviceGetNvLinkErrorCounter(device.device, i, nvml.NVLINK_ERROR_DL_RECOVERY) count, ret := nvml.DeviceGetNvLinkErrorCounter(device.device, i, nvml.NVLINK_ERROR_DL_RECOVERY)
aggregate_recovery_errors += count aggregate_recovery_errors = aggregate_recovery_errors + count
if ret == nvml.SUCCESS { if ret == nvml.SUCCESS {
y, err := lp.NewMetric("nv_nvlink_recovery_errors", device.tags, device.meta, count, time.Now()) y, err := lp.NewMessage("nv_nvlink_recovery_errors", device.tags, device.meta, map[string]interface{}{"value": count}, time.Now())
if err == nil { if err == nil {
y.AddTag("stype", "nvlink") y.AddTag("stype", "nvlink")
y.AddTag("stype-id", strconv.Itoa(i)) y.AddTag("stype-id", fmt.Sprintf("%d", i))
output <- y output <- y
} }
} }
@@ -1059,12 +1067,12 @@ func readNVLinkStats(device *NvidiaCollectorDevice, output chan lp.CCMessage) er
if !device.excludeMetrics["nv_nvlink_crc_flit_errors"] { if !device.excludeMetrics["nv_nvlink_crc_flit_errors"] {
// Data link receive flow control digit CRC error counter // Data link receive flow control digit CRC error counter
count, ret := nvml.DeviceGetNvLinkErrorCounter(device.device, i, nvml.NVLINK_ERROR_DL_CRC_FLIT) count, ret := nvml.DeviceGetNvLinkErrorCounter(device.device, i, nvml.NVLINK_ERROR_DL_CRC_FLIT)
aggregate_crc_flit_errors += count aggregate_crc_flit_errors = aggregate_crc_flit_errors + count
if ret == nvml.SUCCESS { if ret == nvml.SUCCESS {
y, err := lp.NewMetric("nv_nvlink_crc_flit_errors", device.tags, device.meta, count, time.Now()) y, err := lp.NewMessage("nv_nvlink_crc_flit_errors", device.tags, device.meta, map[string]interface{}{"value": count}, time.Now())
if err == nil { if err == nil {
y.AddTag("stype", "nvlink") y.AddTag("stype", "nvlink")
y.AddTag("stype-id", strconv.Itoa(i)) y.AddTag("stype-id", fmt.Sprintf("%d", i))
output <- y output <- y
} }
} }
@@ -1076,7 +1084,7 @@ func readNVLinkStats(device *NvidiaCollectorDevice, output chan lp.CCMessage) er
// Export aggegated values // Export aggegated values
if !device.excludeMetrics["nv_nvlink_crc_errors"] { if !device.excludeMetrics["nv_nvlink_crc_errors"] {
// Data link receive data CRC error counter // Data link receive data CRC error counter
y, err := lp.NewMetric("nv_nvlink_crc_errors_sum", device.tags, device.meta, aggregate_crc_errors, time.Now()) y, err := lp.NewMessage("nv_nvlink_crc_errors_sum", device.tags, device.meta, map[string]interface{}{"value": aggregate_crc_errors}, time.Now())
if err == nil { if err == nil {
y.AddTag("stype", "nvlink") y.AddTag("stype", "nvlink")
output <- y output <- y
@@ -1084,7 +1092,7 @@ func readNVLinkStats(device *NvidiaCollectorDevice, output chan lp.CCMessage) er
} }
if !device.excludeMetrics["nv_nvlink_ecc_errors"] { if !device.excludeMetrics["nv_nvlink_ecc_errors"] {
// Data link receive data ECC error counter // Data link receive data ECC error counter
y, err := lp.NewMetric("nv_nvlink_ecc_errors_sum", device.tags, device.meta, aggregate_ecc_errors, time.Now()) y, err := lp.NewMessage("nv_nvlink_ecc_errors_sum", device.tags, device.meta, map[string]interface{}{"value": aggregate_ecc_errors}, time.Now())
if err == nil { if err == nil {
y.AddTag("stype", "nvlink") y.AddTag("stype", "nvlink")
output <- y output <- y
@@ -1092,7 +1100,7 @@ func readNVLinkStats(device *NvidiaCollectorDevice, output chan lp.CCMessage) er
} }
if !device.excludeMetrics["nv_nvlink_replay_errors"] { if !device.excludeMetrics["nv_nvlink_replay_errors"] {
// Data link transmit replay error counter // Data link transmit replay error counter
y, err := lp.NewMetric("nv_nvlink_replay_errors_sum", device.tags, device.meta, aggregate_replay_errors, time.Now()) y, err := lp.NewMessage("nv_nvlink_replay_errors_sum", device.tags, device.meta, map[string]interface{}{"value": aggregate_replay_errors}, time.Now())
if err == nil { if err == nil {
y.AddTag("stype", "nvlink") y.AddTag("stype", "nvlink")
output <- y output <- y
@@ -1100,7 +1108,7 @@ func readNVLinkStats(device *NvidiaCollectorDevice, output chan lp.CCMessage) er
} }
if !device.excludeMetrics["nv_nvlink_recovery_errors"] { if !device.excludeMetrics["nv_nvlink_recovery_errors"] {
// Data link transmit recovery error counter // Data link transmit recovery error counter
y, err := lp.NewMetric("nv_nvlink_recovery_errors_sum", device.tags, device.meta, aggregate_recovery_errors, time.Now()) y, err := lp.NewMessage("nv_nvlink_recovery_errors_sum", device.tags, device.meta, map[string]interface{}{"value": aggregate_recovery_errors}, time.Now())
if err == nil { if err == nil {
y.AddTag("stype", "nvlink") y.AddTag("stype", "nvlink")
output <- y output <- y
@@ -1108,7 +1116,7 @@ func readNVLinkStats(device *NvidiaCollectorDevice, output chan lp.CCMessage) er
} }
if !device.excludeMetrics["nv_nvlink_crc_flit_errors"] { if !device.excludeMetrics["nv_nvlink_crc_flit_errors"] {
// Data link receive flow control digit CRC error counter // Data link receive flow control digit CRC error counter
y, err := lp.NewMetric("nv_nvlink_crc_flit_errors_sum", device.tags, device.meta, aggregate_crc_flit_errors, time.Now()) y, err := lp.NewMessage("nv_nvlink_crc_flit_errors_sum", device.tags, device.meta, map[string]interface{}{"value": aggregate_crc_flit_errors}, time.Now())
if err == nil { if err == nil {
y.AddTag("stype", "nvlink") y.AddTag("stype", "nvlink")
output <- y output <- y
@@ -1248,7 +1256,7 @@ func (m *NvidiaCollector) Read(interval time.Duration, output chan lp.CCMessage)
} }
cclog.ComponentDebug(m.name, "Reading MIG devices for GPU", i) cclog.ComponentDebug(m.name, "Reading MIG devices for GPU", i)
for j := range maxMig { for j := 0; j < maxMig; j++ {
mdev, ret := nvml.DeviceGetMigDeviceHandleByIndex(m.gpus[i].device, j) mdev, ret := nvml.DeviceGetMigDeviceHandleByIndex(m.gpus[i].device, j)
if ret != nvml.SUCCESS { if ret != nvml.SUCCESS {
continue continue
@@ -1265,7 +1273,9 @@ func (m *NvidiaCollector) Read(interval time.Duration, output chan lp.CCMessage)
meta: map[string]string{}, meta: map[string]string{},
excludeMetrics: excludeMetrics, excludeMetrics: excludeMetrics,
} }
maps.Copy(migDevice.tags, m.gpus[i].tags) for k, v := range m.gpus[i].tags {
migDevice.tags[k] = v
}
migDevice.tags["stype"] = "mig" migDevice.tags["stype"] = "mig"
if m.config.UseUuidForMigDevices { if m.config.UseUuidForMigDevices {
uuid, ret := nvml.DeviceGetUUID(mdev) uuid, ret := nvml.DeviceGetUUID(mdev)
@@ -1279,17 +1289,19 @@ func (m *NvidiaCollector) Read(interval time.Duration, output chan lp.CCMessage)
if ret == nvml.SUCCESS { if ret == nvml.SUCCESS {
mname, ret := nvml.DeviceGetName(mdev) mname, ret := nvml.DeviceGetName(mdev)
if ret == nvml.SUCCESS { if ret == nvml.SUCCESS {
x := strings.ReplaceAll(mname, name, "") x := strings.Replace(mname, name, "", -1)
x = strings.ReplaceAll(x, "MIG", "") x = strings.Replace(x, "MIG", "", -1)
x = strings.TrimSpace(x) x = strings.TrimSpace(x)
migDevice.tags["stype-id"] = x migDevice.tags["stype-id"] = x
} }
} }
} }
if _, ok := migDevice.tags["stype-id"]; !ok { if _, ok := migDevice.tags["stype-id"]; !ok {
migDevice.tags["stype-id"] = strconv.Itoa(j) migDevice.tags["stype-id"] = fmt.Sprintf("%d", j)
}
for k, v := range m.gpus[i].meta {
migDevice.meta[k] = v
} }
maps.Copy(migDevice.meta, m.gpus[i].meta)
if _, ok := migDevice.meta["uuid"]; ok && !m.config.UseUuidForMigDevices { if _, ok := migDevice.meta["uuid"]; ok && !m.config.UseUuidForMigDevices {
uuid, ret := nvml.DeviceGetUUID(mdev) uuid, ret := nvml.DeviceGetUUID(mdev)
if ret == nvml.SUCCESS { if ret == nvml.SUCCESS {
@@ -1305,9 +1317,7 @@ func (m *NvidiaCollector) Read(interval time.Duration, output chan lp.CCMessage)
func (m *NvidiaCollector) Close() { func (m *NvidiaCollector) Close() {
if m.init { if m.init {
if ret := nvml.Shutdown(); ret != nvml.SUCCESS { nvml.Shutdown()
cclog.ComponentError(m.name, "nvml.Shutdown() not successful")
}
m.init = false m.init = false
} }
} }

View File

@@ -16,8 +16,8 @@ import (
"strings" "strings"
"time" "time"
cclog "github.com/ClusterCockpit/cc-lib/v2/ccLogger" cclog "github.com/ClusterCockpit/cc-lib/ccLogger"
lp "github.com/ClusterCockpit/cc-lib/v2/ccMessage" lp "github.com/ClusterCockpit/cc-lib/ccMessage"
) )
// running average power limit (RAPL) monitoring attributes for a zone // running average power limit (RAPL) monitoring attributes for a zone
@@ -54,10 +54,9 @@ func (m *RAPLCollector) Init(config json.RawMessage) error {
return nil return nil
} }
var err error = nil
m.name = "RAPLCollector" m.name = "RAPLCollector"
if err := m.setup(); err != nil { m.setup()
return fmt.Errorf("%s Init(): setup() call failed: %w", m.name, err)
}
m.parallel = true m.parallel = true
m.meta = map[string]string{ m.meta = map[string]string{
"source": m.name, "source": m.name,
@@ -67,7 +66,7 @@ func (m *RAPLCollector) Init(config json.RawMessage) error {
// Read in the JSON configuration // Read in the JSON configuration
if len(config) > 0 { if len(config) > 0 {
err := json.Unmarshal(config, &m.config) err = json.Unmarshal(config, &m.config)
if err != nil { if err != nil {
cclog.ComponentError(m.name, "Error reading config:", err.Error()) cclog.ComponentError(m.name, "Error reading config:", err.Error())
return err return err
@@ -249,7 +248,7 @@ func (m *RAPLCollector) Read(interval time.Duration, output chan lp.CCMessage) {
"rapl_average_power", "rapl_average_power",
p.tags, p.tags,
m.meta, m.meta,
map[string]any{"value": averagePower}, map[string]interface{}{"value": averagePower},
energyTimestamp) energyTimestamp)
if err == nil { if err == nil {
output <- y output <- y

View File

@@ -11,12 +11,10 @@ import (
"encoding/json" "encoding/json"
"errors" "errors"
"fmt" "fmt"
"slices"
"strconv"
"time" "time"
cclog "github.com/ClusterCockpit/cc-lib/v2/ccLogger" cclog "github.com/ClusterCockpit/cc-lib/ccLogger"
lp "github.com/ClusterCockpit/cc-lib/v2/ccMessage" lp "github.com/ClusterCockpit/cc-lib/ccMessage"
"github.com/ClusterCockpit/go-rocm-smi/pkg/rocm_smi" "github.com/ClusterCockpit/go-rocm-smi/pkg/rocm_smi"
) )
@@ -54,9 +52,7 @@ func (m *RocmSmiCollector) Init(config json.RawMessage) error {
// Always set the name early in Init() to use it in cclog.Component* functions // Always set the name early in Init() to use it in cclog.Component* functions
m.name = "RocmSmiCollector" m.name = "RocmSmiCollector"
// This is for later use, also call it early // This is for later use, also call it early
if err := m.setup(); err != nil { m.setup()
return fmt.Errorf("%s Init(): setup() call failed: %w", m.name, err)
}
// Define meta information sent with each metric // Define meta information sent with each metric
// (Can also be dynamic or this is the basic set with extension through AddMeta()) // (Can also be dynamic or this is the basic set with extension through AddMeta())
//m.meta = map[string]string{"source": m.name, "group": "AMD"} //m.meta = map[string]string{"source": m.name, "group": "AMD"}
@@ -89,11 +85,22 @@ func (m *RocmSmiCollector) Init(config json.RawMessage) error {
return err return err
} }
exclDev := func(s string) bool {
skip_device := false
for _, excl := range m.config.ExcludeDevices {
if excl == s {
skip_device = true
break
}
}
return skip_device
}
m.devices = make([]RocmSmiCollectorDevice, 0) m.devices = make([]RocmSmiCollectorDevice, 0)
for i := range numDevs { for i := 0; i < numDevs; i++ {
str_i := strconv.Itoa(i) str_i := fmt.Sprintf("%d", i)
if slices.Contains(m.config.ExcludeDevices, str_i) { if exclDev(str_i) {
continue continue
} }
device, ret := rocm_smi.DeviceGetHandleByIndex(i) device, ret := rocm_smi.DeviceGetHandleByIndex(i)
@@ -117,7 +124,7 @@ func (m *RocmSmiCollector) Init(config json.RawMessage) error {
pciInfo.Device, pciInfo.Device,
pciInfo.Function) pciInfo.Function)
if slices.Contains(m.config.ExcludeDevices, pciId) { if exclDev(pciId) {
continue continue
} }
@@ -175,130 +182,130 @@ func (m *RocmSmiCollector) Read(interval time.Duration, output chan lp.CCMessage
if !dev.excludeMetrics["rocm_gfx_util"] { if !dev.excludeMetrics["rocm_gfx_util"] {
value := metrics.Average_gfx_activity value := metrics.Average_gfx_activity
y, err := lp.NewMessage("rocm_gfx_util", dev.tags, dev.meta, map[string]any{"value": value}, timestamp) y, err := lp.NewMessage("rocm_gfx_util", dev.tags, dev.meta, map[string]interface{}{"value": value}, timestamp)
if err == nil { if err == nil {
output <- y output <- y
} }
} }
if !dev.excludeMetrics["rocm_umc_util"] { if !dev.excludeMetrics["rocm_umc_util"] {
value := metrics.Average_umc_activity value := metrics.Average_umc_activity
y, err := lp.NewMessage("rocm_umc_util", dev.tags, dev.meta, map[string]any{"value": value}, timestamp) y, err := lp.NewMessage("rocm_umc_util", dev.tags, dev.meta, map[string]interface{}{"value": value}, timestamp)
if err == nil { if err == nil {
output <- y output <- y
} }
} }
if !dev.excludeMetrics["rocm_mm_util"] { if !dev.excludeMetrics["rocm_mm_util"] {
value := metrics.Average_mm_activity value := metrics.Average_mm_activity
y, err := lp.NewMessage("rocm_mm_util", dev.tags, dev.meta, map[string]any{"value": value}, timestamp) y, err := lp.NewMessage("rocm_mm_util", dev.tags, dev.meta, map[string]interface{}{"value": value}, timestamp)
if err == nil { if err == nil {
output <- y output <- y
} }
} }
if !dev.excludeMetrics["rocm_avg_power"] { if !dev.excludeMetrics["rocm_avg_power"] {
value := metrics.Average_socket_power value := metrics.Average_socket_power
y, err := lp.NewMessage("rocm_avg_power", dev.tags, dev.meta, map[string]any{"value": value}, timestamp) y, err := lp.NewMessage("rocm_avg_power", dev.tags, dev.meta, map[string]interface{}{"value": value}, timestamp)
if err == nil { if err == nil {
output <- y output <- y
} }
} }
if !dev.excludeMetrics["rocm_temp_mem"] { if !dev.excludeMetrics["rocm_temp_mem"] {
value := metrics.Temperature_mem value := metrics.Temperature_mem
y, err := lp.NewMessage("rocm_temp_mem", dev.tags, dev.meta, map[string]any{"value": value}, timestamp) y, err := lp.NewMessage("rocm_temp_mem", dev.tags, dev.meta, map[string]interface{}{"value": value}, timestamp)
if err == nil { if err == nil {
output <- y output <- y
} }
} }
if !dev.excludeMetrics["rocm_temp_hotspot"] { if !dev.excludeMetrics["rocm_temp_hotspot"] {
value := metrics.Temperature_hotspot value := metrics.Temperature_hotspot
y, err := lp.NewMessage("rocm_temp_hotspot", dev.tags, dev.meta, map[string]any{"value": value}, timestamp) y, err := lp.NewMessage("rocm_temp_hotspot", dev.tags, dev.meta, map[string]interface{}{"value": value}, timestamp)
if err == nil { if err == nil {
output <- y output <- y
} }
} }
if !dev.excludeMetrics["rocm_temp_edge"] { if !dev.excludeMetrics["rocm_temp_edge"] {
value := metrics.Temperature_edge value := metrics.Temperature_edge
y, err := lp.NewMessage("rocm_temp_edge", dev.tags, dev.meta, map[string]any{"value": value}, timestamp) y, err := lp.NewMessage("rocm_temp_edge", dev.tags, dev.meta, map[string]interface{}{"value": value}, timestamp)
if err == nil { if err == nil {
output <- y output <- y
} }
} }
if !dev.excludeMetrics["rocm_temp_vrgfx"] { if !dev.excludeMetrics["rocm_temp_vrgfx"] {
value := metrics.Temperature_vrgfx value := metrics.Temperature_vrgfx
y, err := lp.NewMessage("rocm_temp_vrgfx", dev.tags, dev.meta, map[string]any{"value": value}, timestamp) y, err := lp.NewMessage("rocm_temp_vrgfx", dev.tags, dev.meta, map[string]interface{}{"value": value}, timestamp)
if err == nil { if err == nil {
output <- y output <- y
} }
} }
if !dev.excludeMetrics["rocm_temp_vrsoc"] { if !dev.excludeMetrics["rocm_temp_vrsoc"] {
value := metrics.Temperature_vrsoc value := metrics.Temperature_vrsoc
y, err := lp.NewMessage("rocm_temp_vrsoc", dev.tags, dev.meta, map[string]any{"value": value}, timestamp) y, err := lp.NewMessage("rocm_temp_vrsoc", dev.tags, dev.meta, map[string]interface{}{"value": value}, timestamp)
if err == nil { if err == nil {
output <- y output <- y
} }
} }
if !dev.excludeMetrics["rocm_temp_vrmem"] { if !dev.excludeMetrics["rocm_temp_vrmem"] {
value := metrics.Temperature_vrmem value := metrics.Temperature_vrmem
y, err := lp.NewMessage("rocm_temp_vrmem", dev.tags, dev.meta, map[string]any{"value": value}, timestamp) y, err := lp.NewMessage("rocm_temp_vrmem", dev.tags, dev.meta, map[string]interface{}{"value": value}, timestamp)
if err == nil { if err == nil {
output <- y output <- y
} }
} }
if !dev.excludeMetrics["rocm_gfx_clock"] { if !dev.excludeMetrics["rocm_gfx_clock"] {
value := metrics.Average_gfxclk_frequency value := metrics.Average_gfxclk_frequency
y, err := lp.NewMessage("rocm_gfx_clock", dev.tags, dev.meta, map[string]any{"value": value}, timestamp) y, err := lp.NewMessage("rocm_gfx_clock", dev.tags, dev.meta, map[string]interface{}{"value": value}, timestamp)
if err == nil { if err == nil {
output <- y output <- y
} }
} }
if !dev.excludeMetrics["rocm_soc_clock"] { if !dev.excludeMetrics["rocm_soc_clock"] {
value := metrics.Average_socclk_frequency value := metrics.Average_socclk_frequency
y, err := lp.NewMessage("rocm_soc_clock", dev.tags, dev.meta, map[string]any{"value": value}, timestamp) y, err := lp.NewMessage("rocm_soc_clock", dev.tags, dev.meta, map[string]interface{}{"value": value}, timestamp)
if err == nil { if err == nil {
output <- y output <- y
} }
} }
if !dev.excludeMetrics["rocm_u_clock"] { if !dev.excludeMetrics["rocm_u_clock"] {
value := metrics.Average_uclk_frequency value := metrics.Average_uclk_frequency
y, err := lp.NewMessage("rocm_u_clock", dev.tags, dev.meta, map[string]any{"value": value}, timestamp) y, err := lp.NewMessage("rocm_u_clock", dev.tags, dev.meta, map[string]interface{}{"value": value}, timestamp)
if err == nil { if err == nil {
output <- y output <- y
} }
} }
if !dev.excludeMetrics["rocm_v0_clock"] { if !dev.excludeMetrics["rocm_v0_clock"] {
value := metrics.Average_vclk0_frequency value := metrics.Average_vclk0_frequency
y, err := lp.NewMessage("rocm_v0_clock", dev.tags, dev.meta, map[string]any{"value": value}, timestamp) y, err := lp.NewMessage("rocm_v0_clock", dev.tags, dev.meta, map[string]interface{}{"value": value}, timestamp)
if err == nil { if err == nil {
output <- y output <- y
} }
} }
if !dev.excludeMetrics["rocm_v1_clock"] { if !dev.excludeMetrics["rocm_v1_clock"] {
value := metrics.Average_vclk1_frequency value := metrics.Average_vclk1_frequency
y, err := lp.NewMessage("rocm_v1_clock", dev.tags, dev.meta, map[string]any{"value": value}, timestamp) y, err := lp.NewMessage("rocm_v1_clock", dev.tags, dev.meta, map[string]interface{}{"value": value}, timestamp)
if err == nil { if err == nil {
output <- y output <- y
} }
} }
if !dev.excludeMetrics["rocm_d0_clock"] { if !dev.excludeMetrics["rocm_d0_clock"] {
value := metrics.Average_dclk0_frequency value := metrics.Average_dclk0_frequency
y, err := lp.NewMessage("rocm_d0_clock", dev.tags, dev.meta, map[string]any{"value": value}, timestamp) y, err := lp.NewMessage("rocm_d0_clock", dev.tags, dev.meta, map[string]interface{}{"value": value}, timestamp)
if err == nil { if err == nil {
output <- y output <- y
} }
} }
if !dev.excludeMetrics["rocm_d1_clock"] { if !dev.excludeMetrics["rocm_d1_clock"] {
value := metrics.Average_dclk1_frequency value := metrics.Average_dclk1_frequency
y, err := lp.NewMessage("rocm_d1_clock", dev.tags, dev.meta, map[string]any{"value": value}, timestamp) y, err := lp.NewMessage("rocm_d1_clock", dev.tags, dev.meta, map[string]interface{}{"value": value}, timestamp)
if err == nil { if err == nil {
output <- y output <- y
} }
} }
if !dev.excludeMetrics["rocm_temp_hbm"] { if !dev.excludeMetrics["rocm_temp_hbm"] {
for i := range rocm_smi.NUM_HBM_INSTANCES { for i := 0; i < rocm_smi.NUM_HBM_INSTANCES; i++ {
value := metrics.Temperature_hbm[i] value := metrics.Temperature_hbm[i]
y, err := lp.NewMessage("rocm_temp_hbm", dev.tags, dev.meta, map[string]any{"value": value}, timestamp) y, err := lp.NewMessage("rocm_temp_hbm", dev.tags, dev.meta, map[string]interface{}{"value": value}, timestamp)
if err == nil { if err == nil {
y.AddTag("stype", "device") y.AddTag("stype", "device")
y.AddTag("stype-id", strconv.Itoa(i)) y.AddTag("stype-id", fmt.Sprintf("%d", i))
output <- y output <- y
} }
} }

View File

@@ -15,9 +15,7 @@ hugo_path: docs/reference/cc-metric-collector/collectors/rocmsmi.md
```json ```json
"rocm_smi": { "rocm_smi": {
"exclude_devices": [ "exclude_devices": [
"0", "0","1", "0000000:ff:01.0"
"1",
"0000000:ff:01.0"
], ],
"exclude_metrics": [ "exclude_metrics": [
"rocm_mm_util", "rocm_mm_util",
@@ -25,7 +23,7 @@ hugo_path: docs/reference/cc-metric-collector/collectors/rocmsmi.md
], ],
"use_pci_info_as_type_id": true, "use_pci_info_as_type_id": true,
"add_pci_info_tag": false, "add_pci_info_tag": false,
"add_serial_meta": false "add_serial_meta": false,
} }
``` ```

View File

@@ -9,11 +9,10 @@ package collectors
import ( import (
"encoding/json" "encoding/json"
"fmt"
"time" "time"
cclog "github.com/ClusterCockpit/cc-lib/v2/ccLogger" cclog "github.com/ClusterCockpit/cc-lib/ccLogger"
lp "github.com/ClusterCockpit/cc-lib/v2/ccMessage" lp "github.com/ClusterCockpit/cc-lib/ccMessage"
) )
// These are the fields we read from the JSON configuration // These are the fields we read from the JSON configuration
@@ -42,9 +41,7 @@ func (m *SampleCollector) Init(config json.RawMessage) error {
// Always set the name early in Init() to use it in cclog.Component* functions // Always set the name early in Init() to use it in cclog.Component* functions
m.name = "SampleCollector" m.name = "SampleCollector"
// This is for later use, also call it early // This is for later use, also call it early
if err := m.setup(); err != nil { m.setup()
return fmt.Errorf("%s Init(): setup() call failed: %w", m.name, err)
}
// Tell whether the collector should be run in parallel with others (reading files, ...) // Tell whether the collector should be run in parallel with others (reading files, ...)
// or it should be run serially, mostly for collectors actually doing measurements // or it should be run serially, mostly for collectors actually doing measurements
// because they should not measure the execution of the other collectors // because they should not measure the execution of the other collectors
@@ -95,7 +92,7 @@ func (m *SampleCollector) Read(interval time.Duration, output chan lp.CCMessage)
// stop := readState() // stop := readState()
// value = (stop - start) / interval.Seconds() // value = (stop - start) / interval.Seconds()
y, err := lp.NewMessage("sample_metric", m.tags, m.meta, map[string]any{"value": value}, timestamp) y, err := lp.NewMessage("sample_metric", m.tags, m.meta, map[string]interface{}{"value": value}, timestamp)
if err == nil { if err == nil {
// Send it to output channel // Send it to output channel
output <- y output <- y

View File

@@ -9,12 +9,11 @@ package collectors
import ( import (
"encoding/json" "encoding/json"
"fmt"
"sync" "sync"
"time" "time"
cclog "github.com/ClusterCockpit/cc-lib/v2/ccLogger" cclog "github.com/ClusterCockpit/cc-lib/ccLogger"
lp "github.com/ClusterCockpit/cc-lib/v2/ccMessage" lp "github.com/ClusterCockpit/cc-lib/ccMessage"
) )
// These are the fields we read from the JSON configuration // These are the fields we read from the JSON configuration
@@ -37,13 +36,11 @@ type SampleTimerCollector struct {
} }
func (m *SampleTimerCollector) Init(name string, config json.RawMessage) error { func (m *SampleTimerCollector) Init(name string, config json.RawMessage) error {
var err error var err error = nil
// Always set the name early in Init() to use it in cclog.Component* functions // Always set the name early in Init() to use it in cclog.Component* functions
m.name = "SampleTimerCollector" m.name = "SampleTimerCollector"
// This is for later use, also call it early // This is for later use, also call it early
if err = m.setup(); err != nil { m.setup()
return fmt.Errorf("%s Init(): setup() call failed: %w", m.name, err)
}
// Define meta information sent with each metric // Define meta information sent with each metric
// (Can also be dynamic or this is the basic set with extension through AddMeta()) // (Can also be dynamic or this is the basic set with extension through AddMeta())
m.meta = map[string]string{"source": m.name, "group": "SAMPLE"} m.meta = map[string]string{"source": m.name, "group": "SAMPLE"}
@@ -110,7 +107,7 @@ func (m *SampleTimerCollector) ReadMetrics(timestamp time.Time) {
// stop := readState() // stop := readState()
// value = (stop - start) / interval.Seconds() // value = (stop - start) / interval.Seconds()
y, err := lp.NewMessage("sample_metric", m.tags, m.meta, map[string]any{"value": value}, timestamp) y, err := lp.NewMessage("sample_metric", m.tags, m.meta, map[string]interface{}{"value": value}, timestamp)
if err == nil && m.output != nil { if err == nil && m.output != nil {
// Send it to output channel if we have a valid channel // Send it to output channel if we have a valid channel
m.output <- y m.output <- y

View File

@@ -11,13 +11,14 @@ import (
"bufio" "bufio"
"encoding/json" "encoding/json"
"fmt" "fmt"
"math"
"os" "os"
"strconv" "strconv"
"strings" "strings"
"time" "time"
cclog "github.com/ClusterCockpit/cc-lib/v2/ccLogger" cclog "github.com/ClusterCockpit/cc-lib/ccLogger"
lp "github.com/ClusterCockpit/cc-lib/v2/ccMessage" lp "github.com/ClusterCockpit/cc-lib/ccMessage"
) )
const SCHEDSTATFILE = `/proc/schedstat` const SCHEDSTATFILE = `/proc/schedstat`
@@ -46,37 +47,37 @@ type SchedstatCollector struct {
// Called once by the collector manager // Called once by the collector manager
// All tags, meta data tags and metrics that do not change over the runtime should be set here // All tags, meta data tags and metrics that do not change over the runtime should be set here
func (m *SchedstatCollector) Init(config json.RawMessage) error { func (m *SchedstatCollector) Init(config json.RawMessage) error {
var err error = nil
// Always set the name early in Init() to use it in cclog.Component* functions // Always set the name early in Init() to use it in cclog.Component* functions
m.name = "SchedstatCollector" m.name = "SchedstatCollector"
// This is for later use, also call it early // This is for later use, also call it early
if err := m.setup(); err != nil { m.setup()
return fmt.Errorf("%s Init(): setup() call failed: %w", m.name, err)
}
// Tell whether the collector should be run in parallel with others (reading files, ...) // Tell whether the collector should be run in parallel with others (reading files, ...)
// or it should be run serially, mostly for collectors actually doing measurements // or it should be run serially, mostly for collectors acutally doing measurements
// because they should not measure the execution of the other collectors // because they should not measure the execution of the other collectors
m.parallel = true m.parallel = true
// Define meta information sent with each metric // Define meta information sent with each metric
// (Can also be dynamic or this is the basic set with extension through AddMeta()) // (Can also be dynamic or this is the basic set with extension through AddMeta())
m.meta = map[string]string{ m.meta = map[string]string{"source": m.name, "group": "SCHEDSTAT"}
"source": m.name,
"group": "SCHEDSTAT",
}
// Read in the JSON configuration // Read in the JSON configuration
if len(config) > 0 { if len(config) > 0 {
if err := json.Unmarshal(config, &m.config); err != nil { err = json.Unmarshal(config, &m.config)
return fmt.Errorf("%s Init(): Error reading config: %w", m.name, err) if err != nil {
cclog.ComponentError(m.name, "Error reading config:", err.Error())
return err
} }
} }
// Check input file // Check input file
file, err := os.Open(SCHEDSTATFILE) file, err := os.Open(string(SCHEDSTATFILE))
if err != nil { if err != nil {
return fmt.Errorf("%s Init(): Failed opening scheduler statistics file \"%s\": %w", m.name, SCHEDSTATFILE, err) cclog.ComponentError(m.name, err.Error())
} }
defer file.Close()
// Pre-generate tags for all CPUs // Pre-generate tags for all CPUs
num_cpus := 0
m.cputags = make(map[string]map[string]string) m.cputags = make(map[string]map[string]string)
m.olddata = make(map[string]map[string]int64) m.olddata = make(map[string]map[string]int64)
scanner := bufio.NewScanner(file) scanner := bufio.NewScanner(file)
@@ -88,18 +89,10 @@ func (m *SchedstatCollector) Init(config json.RawMessage) error {
cpu, _ := strconv.Atoi(cpustr) cpu, _ := strconv.Atoi(cpustr)
running, _ := strconv.ParseInt(linefields[7], 10, 64) running, _ := strconv.ParseInt(linefields[7], 10, 64)
waiting, _ := strconv.ParseInt(linefields[8], 10, 64) waiting, _ := strconv.ParseInt(linefields[8], 10, 64)
m.cputags[linefields[0]] = map[string]string{ m.cputags[linefields[0]] = map[string]string{"type": "hwthread", "type-id": fmt.Sprintf("%d", cpu)}
"type": "hwthread", m.olddata[linefields[0]] = map[string]int64{"running": running, "waiting": waiting}
"type-id": strconv.Itoa(cpu), num_cpus++
} }
m.olddata[linefields[0]] = map[string]int64{
"running": running,
"waiting": waiting,
}
}
}
if err := file.Close(); err != nil {
return fmt.Errorf("%s Init(): Failed closing scheduler statistics file \"%s\": %w", m.name, SCHEDSTATFILE, err)
} }
// Save current timestamp // Save current timestamp
@@ -116,14 +109,14 @@ func (m *SchedstatCollector) ParseProcLine(linefields []string, tags map[string]
diff_running := running - m.olddata[linefields[0]]["running"] diff_running := running - m.olddata[linefields[0]]["running"]
diff_waiting := waiting - m.olddata[linefields[0]]["waiting"] diff_waiting := waiting - m.olddata[linefields[0]]["waiting"]
l_running := float64(diff_running) / tsdelta.Seconds() / 1000_000_000 var l_running float64 = float64(diff_running) / tsdelta.Seconds() / (math.Pow(1000, 3))
l_waiting := float64(diff_waiting) / tsdelta.Seconds() / 1000_000_000 var l_waiting float64 = float64(diff_waiting) / tsdelta.Seconds() / (math.Pow(1000, 3))
m.olddata[linefields[0]]["running"] = running m.olddata[linefields[0]]["running"] = running
m.olddata[linefields[0]]["waiting"] = waiting m.olddata[linefields[0]]["waiting"] = waiting
value := l_running + l_waiting value := l_running + l_waiting
y, err := lp.NewMessage("cpu_load_core", tags, m.meta, map[string]any{"value": value}, now) y, err := lp.NewMessage("cpu_load_core", tags, m.meta, map[string]interface{}{"value": value}, now)
if err == nil { if err == nil {
// Send it to output channel // Send it to output channel
output <- y output <- y
@@ -141,19 +134,11 @@ func (m *SchedstatCollector) Read(interval time.Duration, output chan lp.CCMessa
now := time.Now() now := time.Now()
tsdelta := now.Sub(m.lastTimestamp) tsdelta := now.Sub(m.lastTimestamp)
file, err := os.Open(SCHEDSTATFILE) file, err := os.Open(string(SCHEDSTATFILE))
if err != nil { if err != nil {
cclog.ComponentError( cclog.ComponentError(m.name, err.Error())
m.name,
fmt.Sprintf("Read(): Failed to open file '%s': %v", SCHEDSTATFILE, err))
} }
defer func() { defer file.Close()
if err := file.Close(); err != nil {
cclog.ComponentError(
m.name,
fmt.Sprintf("Read(): Failed to close file '%s': %v", SCHEDSTATFILE, err))
}
}()
scanner := bufio.NewScanner(file) scanner := bufio.NewScanner(file)
for scanner.Scan() { for scanner.Scan() {

View File

@@ -9,13 +9,12 @@ package collectors
import ( import (
"encoding/json" "encoding/json"
"fmt"
"runtime" "runtime"
"syscall" "syscall"
"time" "time"
cclog "github.com/ClusterCockpit/cc-lib/v2/ccLogger" cclog "github.com/ClusterCockpit/cc-lib/ccLogger"
lp "github.com/ClusterCockpit/cc-lib/v2/ccMessage" lp "github.com/ClusterCockpit/cc-lib/ccMessage"
) )
type SelfCollectorConfig struct { type SelfCollectorConfig struct {
@@ -35,9 +34,7 @@ type SelfCollector struct {
func (m *SelfCollector) Init(config json.RawMessage) error { func (m *SelfCollector) Init(config json.RawMessage) error {
var err error = nil var err error = nil
m.name = "SelfCollector" m.name = "SelfCollector"
if err := m.setup(); err != nil { m.setup()
return fmt.Errorf("%s Init(): setup() call failed: %w", m.name, err)
}
m.parallel = true m.parallel = true
m.meta = map[string]string{"source": m.name, "group": "Self"} m.meta = map[string]string{"source": m.name, "group": "Self"}
m.tags = map[string]string{"type": "node"} m.tags = map[string]string{"type": "node"}
@@ -59,49 +56,49 @@ func (m *SelfCollector) Read(interval time.Duration, output chan lp.CCMessage) {
var memstats runtime.MemStats var memstats runtime.MemStats
runtime.ReadMemStats(&memstats) runtime.ReadMemStats(&memstats)
y, err := lp.NewMessage("total_alloc", m.tags, m.meta, map[string]any{"value": memstats.TotalAlloc}, timestamp) y, err := lp.NewMessage("total_alloc", m.tags, m.meta, map[string]interface{}{"value": memstats.TotalAlloc}, timestamp)
if err == nil { if err == nil {
y.AddMeta("unit", "Bytes") y.AddMeta("unit", "Bytes")
output <- y output <- y
} }
y, err = lp.NewMessage("heap_alloc", m.tags, m.meta, map[string]any{"value": memstats.HeapAlloc}, timestamp) y, err = lp.NewMessage("heap_alloc", m.tags, m.meta, map[string]interface{}{"value": memstats.HeapAlloc}, timestamp)
if err == nil { if err == nil {
y.AddMeta("unit", "Bytes") y.AddMeta("unit", "Bytes")
output <- y output <- y
} }
y, err = lp.NewMessage("heap_sys", m.tags, m.meta, map[string]any{"value": memstats.HeapSys}, timestamp) y, err = lp.NewMessage("heap_sys", m.tags, m.meta, map[string]interface{}{"value": memstats.HeapSys}, timestamp)
if err == nil { if err == nil {
y.AddMeta("unit", "Bytes") y.AddMeta("unit", "Bytes")
output <- y output <- y
} }
y, err = lp.NewMessage("heap_idle", m.tags, m.meta, map[string]any{"value": memstats.HeapIdle}, timestamp) y, err = lp.NewMessage("heap_idle", m.tags, m.meta, map[string]interface{}{"value": memstats.HeapIdle}, timestamp)
if err == nil { if err == nil {
y.AddMeta("unit", "Bytes") y.AddMeta("unit", "Bytes")
output <- y output <- y
} }
y, err = lp.NewMessage("heap_inuse", m.tags, m.meta, map[string]any{"value": memstats.HeapInuse}, timestamp) y, err = lp.NewMessage("heap_inuse", m.tags, m.meta, map[string]interface{}{"value": memstats.HeapInuse}, timestamp)
if err == nil { if err == nil {
y.AddMeta("unit", "Bytes") y.AddMeta("unit", "Bytes")
output <- y output <- y
} }
y, err = lp.NewMessage("heap_released", m.tags, m.meta, map[string]any{"value": memstats.HeapReleased}, timestamp) y, err = lp.NewMessage("heap_released", m.tags, m.meta, map[string]interface{}{"value": memstats.HeapReleased}, timestamp)
if err == nil { if err == nil {
y.AddMeta("unit", "Bytes") y.AddMeta("unit", "Bytes")
output <- y output <- y
} }
y, err = lp.NewMessage("heap_objects", m.tags, m.meta, map[string]any{"value": memstats.HeapObjects}, timestamp) y, err = lp.NewMessage("heap_objects", m.tags, m.meta, map[string]interface{}{"value": memstats.HeapObjects}, timestamp)
if err == nil { if err == nil {
output <- y output <- y
} }
} }
if m.config.GoRoutines { if m.config.GoRoutines {
y, err := lp.NewMessage("num_goroutines", m.tags, m.meta, map[string]any{"value": runtime.NumGoroutine()}, timestamp) y, err := lp.NewMessage("num_goroutines", m.tags, m.meta, map[string]interface{}{"value": runtime.NumGoroutine()}, timestamp)
if err == nil { if err == nil {
output <- y output <- y
} }
} }
if m.config.CgoCalls { if m.config.CgoCalls {
y, err := lp.NewMessage("num_cgo_calls", m.tags, m.meta, map[string]any{"value": runtime.NumCgoCall()}, timestamp) y, err := lp.NewMessage("num_cgo_calls", m.tags, m.meta, map[string]interface{}{"value": runtime.NumCgoCall()}, timestamp)
if err == nil { if err == nil {
output <- y output <- y
} }
@@ -112,35 +109,35 @@ func (m *SelfCollector) Read(interval time.Duration, output chan lp.CCMessage) {
if err == nil { if err == nil {
sec, nsec := rusage.Utime.Unix() sec, nsec := rusage.Utime.Unix()
t := float64(sec) + (float64(nsec) * 1e-9) t := float64(sec) + (float64(nsec) * 1e-9)
y, err := lp.NewMessage("rusage_user_time", m.tags, m.meta, map[string]any{"value": t}, timestamp) y, err := lp.NewMessage("rusage_user_time", m.tags, m.meta, map[string]interface{}{"value": t}, timestamp)
if err == nil { if err == nil {
y.AddMeta("unit", "seconds") y.AddMeta("unit", "seconds")
output <- y output <- y
} }
sec, nsec = rusage.Stime.Unix() sec, nsec = rusage.Stime.Unix()
t = float64(sec) + (float64(nsec) * 1e-9) t = float64(sec) + (float64(nsec) * 1e-9)
y, err = lp.NewMessage("rusage_system_time", m.tags, m.meta, map[string]any{"value": t}, timestamp) y, err = lp.NewMessage("rusage_system_time", m.tags, m.meta, map[string]interface{}{"value": t}, timestamp)
if err == nil { if err == nil {
y.AddMeta("unit", "seconds") y.AddMeta("unit", "seconds")
output <- y output <- y
} }
y, err = lp.NewMessage("rusage_vol_ctx_switch", m.tags, m.meta, map[string]any{"value": rusage.Nvcsw}, timestamp) y, err = lp.NewMessage("rusage_vol_ctx_switch", m.tags, m.meta, map[string]interface{}{"value": rusage.Nvcsw}, timestamp)
if err == nil { if err == nil {
output <- y output <- y
} }
y, err = lp.NewMessage("rusage_invol_ctx_switch", m.tags, m.meta, map[string]any{"value": rusage.Nivcsw}, timestamp) y, err = lp.NewMessage("rusage_invol_ctx_switch", m.tags, m.meta, map[string]interface{}{"value": rusage.Nivcsw}, timestamp)
if err == nil { if err == nil {
output <- y output <- y
} }
y, err = lp.NewMessage("rusage_signals", m.tags, m.meta, map[string]any{"value": rusage.Nsignals}, timestamp) y, err = lp.NewMessage("rusage_signals", m.tags, m.meta, map[string]interface{}{"value": rusage.Nsignals}, timestamp)
if err == nil { if err == nil {
output <- y output <- y
} }
y, err = lp.NewMessage("rusage_major_pgfaults", m.tags, m.meta, map[string]any{"value": rusage.Majflt}, timestamp) y, err = lp.NewMessage("rusage_major_pgfaults", m.tags, m.meta, map[string]interface{}{"value": rusage.Majflt}, timestamp)
if err == nil { if err == nil {
output <- y output <- y
} }
y, err = lp.NewMessage("rusage_minor_pgfaults", m.tags, m.meta, map[string]any{"value": rusage.Minflt}, timestamp) y, err = lp.NewMessage("rusage_minor_pgfaults", m.tags, m.meta, map[string]interface{}{"value": rusage.Minflt}, timestamp)
if err == nil { if err == nil {
output <- y output <- y
} }

View File

@@ -11,8 +11,8 @@ import (
"strings" "strings"
"time" "time"
cclog "github.com/ClusterCockpit/cc-lib/v2/ccLogger" cclog "github.com/ClusterCockpit/cc-lib/ccLogger"
lp "github.com/ClusterCockpit/cc-lib/v2/ccMessage" lp "github.com/ClusterCockpit/cc-lib/ccMessage"
) )
type SlurmJobData struct { type SlurmJobData struct {
@@ -50,7 +50,8 @@ func ParseCPUs(cpuset string) ([]int, error) {
return result, nil return result, nil
} }
for r := range strings.SplitSeq(cpuset, ",") { ranges := strings.Split(cpuset, ",")
for _, r := range ranges {
if strings.Contains(r, "-") { if strings.Contains(r, "-") {
parts := strings.Split(r, "-") parts := strings.Split(r, "-")
if len(parts) != 2 { if len(parts) != 2 {
@@ -79,10 +80,9 @@ func ParseCPUs(cpuset string) ([]int, error) {
} }
func GetAllCPUs() ([]int, error) { func GetAllCPUs() ([]int, error) {
cpuOnline := "/sys/devices/system/cpu/online" data, err := os.ReadFile("/sys/devices/system/cpu/online")
data, err := os.ReadFile(cpuOnline)
if err != nil { if err != nil {
return nil, fmt.Errorf("failed to read file \"%s\": %w", cpuOnline, err) return nil, fmt.Errorf("failed to read /sys/devices/system/cpu/online: %v", err)
} }
return ParseCPUs(strings.TrimSpace(string(data))) return ParseCPUs(strings.TrimSpace(string(data)))
} }
@@ -103,22 +103,18 @@ func (m *SlurmCgroupCollector) readFile(path string) ([]byte, error) {
func (m *SlurmCgroupCollector) Init(config json.RawMessage) error { func (m *SlurmCgroupCollector) Init(config json.RawMessage) error {
var err error var err error
m.name = "SlurmCgroupCollector" m.name = "SlurmCgroupCollector"
if err := m.setup(); err != nil { m.setup()
return fmt.Errorf("%s Init(): setup() call failed: %w", m.name, err)
}
m.parallel = true m.parallel = true
m.meta = map[string]string{ m.meta = map[string]string{"source": m.name, "group": "SLURM"}
"source": m.name, m.tags = map[string]string{"type": "hwthread"}
"group": "SLURM"}
m.tags = map[string]string{
"type": "hwthread"}
m.cpuUsed = make(map[int]bool) m.cpuUsed = make(map[int]bool)
m.cgroupBase = defaultCgroupBase m.cgroupBase = defaultCgroupBase
if len(config) > 0 { if len(config) > 0 {
err = json.Unmarshal(config, &m.config) err = json.Unmarshal(config, &m.config)
if err != nil { if err != nil {
return fmt.Errorf("%s Init(): Error reading JSON config: %w", m.name, err) cclog.ComponentError(m.name, "Error reading config:", err.Error())
return err
} }
m.excludeMetrics = make(map[string]struct{}) m.excludeMetrics = make(map[string]struct{})
for _, metric := range m.config.ExcludeMetrics { for _, metric := range m.config.ExcludeMetrics {
@@ -133,16 +129,19 @@ func (m *SlurmCgroupCollector) Init(config json.RawMessage) error {
if !m.useSudo { if !m.useSudo {
user, err := user.Current() user, err := user.Current()
if err != nil { if err != nil {
return fmt.Errorf("%s Init(): Failed to get current user: %w", m.name, err) cclog.ComponentError(m.name, "Failed to get current user:", err.Error())
return err
} }
if user.Uid != "0" { if user.Uid != "0" {
return fmt.Errorf("%s Init(): Reading cgroup files requires root privileges (or enable use_sudo in config)", m.name) cclog.ComponentError(m.name, "Reading cgroup files requires root privileges (or enable use_sudo in config)")
return fmt.Errorf("not root")
} }
} }
m.allCPUs, err = GetAllCPUs() m.allCPUs, err = GetAllCPUs()
if err != nil { if err != nil {
return fmt.Errorf("%s Init(): Error reading online CPUs: %w", m.name, err) cclog.ComponentError(m.name, "Error reading online CPUs:", err.Error())
return err
} }
m.init = true m.init = true
@@ -159,9 +158,7 @@ func (m *SlurmCgroupCollector) ReadJobData(jobdir string) (SlurmJobData, error)
CpuSet: []int{}, CpuSet: []int{},
} }
cg := func(f string) string { cg := func(f string) string { return filepath.Join(m.cgroupBase, jobdir, f) }
return filepath.Join(m.cgroupBase, jobdir, f)
}
memUsage, err := m.readFile(cg("memory.current")) memUsage, err := m.readFile(cg("memory.current"))
if err == nil { if err == nil {
@@ -210,8 +207,8 @@ func (m *SlurmCgroupCollector) ReadJobData(jobdir string) (SlurmJobData, error)
} }
} }
if usageUsec > 0 { if usageUsec > 0 {
jobdata.CpuUsageUser = (userUsec * 100.0 / usageUsec) jobdata.CpuUsageUser = (userUsec * 100 / usageUsec)
jobdata.CpuUsageSys = (systemUsec * 100.0 / usageUsec) jobdata.CpuUsageSys = (systemUsec * 100 / usageUsec)
} }
} }
@@ -254,18 +251,12 @@ func (m *SlurmCgroupCollector) Read(interval time.Duration, output chan lp.CCMes
for _, cpu := range jobdata.CpuSet { for _, cpu := range jobdata.CpuSet {
coreTags := map[string]string{ coreTags := map[string]string{
"type": "hwthread", "type": "hwthread",
"type-id": strconv.Itoa(cpu), "type-id": fmt.Sprintf("%d", cpu),
} }
if coreCount > 0 && !m.isExcluded("job_mem_used") { if coreCount > 0 && !m.isExcluded("job_mem_used") {
memPerCore := jobdata.MemoryUsage / coreCount memPerCore := jobdata.MemoryUsage / coreCount
if y, err := lp.NewMessage( if y, err := lp.NewMessage("job_mem_used", coreTags, m.meta, map[string]interface{}{"value": memPerCore}, timestamp); err == nil {
"job_mem_used",
coreTags,
m.meta,
map[string]any{
"value": memPerCore},
timestamp); err == nil {
y.AddMeta("unit", "Bytes") y.AddMeta("unit", "Bytes")
output <- y output <- y
} }
@@ -273,13 +264,7 @@ func (m *SlurmCgroupCollector) Read(interval time.Duration, output chan lp.CCMes
if coreCount > 0 && !m.isExcluded("job_max_mem_used") { if coreCount > 0 && !m.isExcluded("job_max_mem_used") {
maxMemPerCore := jobdata.MaxMemoryUsage / coreCount maxMemPerCore := jobdata.MaxMemoryUsage / coreCount
if y, err := lp.NewMessage( if y, err := lp.NewMessage("job_max_mem_used", coreTags, m.meta, map[string]interface{}{"value": maxMemPerCore}, timestamp); err == nil {
"job_max_mem_used",
coreTags,
m.meta,
map[string]any{
"value": maxMemPerCore},
timestamp); err == nil {
y.AddMeta("unit", "Bytes") y.AddMeta("unit", "Bytes")
output <- y output <- y
} }
@@ -287,13 +272,7 @@ func (m *SlurmCgroupCollector) Read(interval time.Duration, output chan lp.CCMes
if coreCount > 0 && !m.isExcluded("job_mem_limit") { if coreCount > 0 && !m.isExcluded("job_mem_limit") {
limitPerCore := jobdata.LimitMemoryUsage / coreCount limitPerCore := jobdata.LimitMemoryUsage / coreCount
if y, err := lp.NewMessage( if y, err := lp.NewMessage("job_mem_limit", coreTags, m.meta, map[string]interface{}{"value": limitPerCore}, timestamp); err == nil {
"job_mem_limit",
coreTags,
m.meta,
map[string]any{
"value": limitPerCore},
timestamp); err == nil {
y.AddMeta("unit", "Bytes") y.AddMeta("unit", "Bytes")
output <- y output <- y
} }
@@ -301,13 +280,7 @@ func (m *SlurmCgroupCollector) Read(interval time.Duration, output chan lp.CCMes
if coreCount > 0 && !m.isExcluded("job_user_cpu") { if coreCount > 0 && !m.isExcluded("job_user_cpu") {
cpuUserPerCore := jobdata.CpuUsageUser / coreCount cpuUserPerCore := jobdata.CpuUsageUser / coreCount
if y, err := lp.NewMessage( if y, err := lp.NewMessage("job_user_cpu", coreTags, m.meta, map[string]interface{}{"value": cpuUserPerCore}, timestamp); err == nil {
"job_user_cpu",
coreTags,
m.meta,
map[string]any{
"value": cpuUserPerCore},
timestamp); err == nil {
y.AddMeta("unit", "%") y.AddMeta("unit", "%")
output <- y output <- y
} }
@@ -315,13 +288,7 @@ func (m *SlurmCgroupCollector) Read(interval time.Duration, output chan lp.CCMes
if coreCount > 0 && !m.isExcluded("job_sys_cpu") { if coreCount > 0 && !m.isExcluded("job_sys_cpu") {
cpuSysPerCore := jobdata.CpuUsageSys / coreCount cpuSysPerCore := jobdata.CpuUsageSys / coreCount
if y, err := lp.NewMessage( if y, err := lp.NewMessage("job_sys_cpu", coreTags, m.meta, map[string]interface{}{"value": cpuSysPerCore}, timestamp); err == nil {
"job_sys_cpu",
coreTags,
m.meta,
map[string]any{
"value": cpuSysPerCore},
timestamp); err == nil {
y.AddMeta("unit", "%") y.AddMeta("unit", "%")
output <- y output <- y
} }
@@ -336,57 +303,39 @@ func (m *SlurmCgroupCollector) Read(interval time.Duration, output chan lp.CCMes
if !m.cpuUsed[cpu] { if !m.cpuUsed[cpu] {
coreTags := map[string]string{ coreTags := map[string]string{
"type": "hwthread", "type": "hwthread",
"type-id": strconv.Itoa(cpu), "type-id": fmt.Sprintf("%d", cpu),
} }
if !m.isExcluded("job_mem_used") { if !m.isExcluded("job_mem_used") {
if y, err := lp.NewMessage( if y, err := lp.NewMessage("job_mem_used", coreTags, m.meta, map[string]interface{}{"value": 0}, timestamp); err == nil {
"job_mem_used",
coreTags,
m.meta,
map[string]any{
"value": 0},
timestamp); err == nil {
y.AddMeta("unit", "Bytes") y.AddMeta("unit", "Bytes")
output <- y output <- y
} }
} }
if !m.isExcluded("job_max_mem_used") { if !m.isExcluded("job_max_mem_used") {
if y, err := lp.NewMessage( if y, err := lp.NewMessage("job_max_mem_used", coreTags, m.meta, map[string]interface{}{"value": 0}, timestamp); err == nil {
"job_max_mem_used",
coreTags,
m.meta,
map[string]any{
"value": 0},
timestamp); err == nil {
y.AddMeta("unit", "Bytes") y.AddMeta("unit", "Bytes")
output <- y output <- y
} }
} }
if !m.isExcluded("job_mem_limit") { if !m.isExcluded("job_mem_limit") {
if y, err := lp.NewMessage( if y, err := lp.NewMessage("job_mem_limit", coreTags, m.meta, map[string]interface{}{"value": 0}, timestamp); err == nil {
"job_mem_limit",
coreTags,
m.meta,
map[string]any{
"value": 0},
timestamp); err == nil {
y.AddMeta("unit", "Bytes") y.AddMeta("unit", "Bytes")
output <- y output <- y
} }
} }
if !m.isExcluded("job_user_cpu") { if !m.isExcluded("job_user_cpu") {
if y, err := lp.NewMessage("job_user_cpu", coreTags, m.meta, map[string]any{"value": 0}, timestamp); err == nil { if y, err := lp.NewMessage("job_user_cpu", coreTags, m.meta, map[string]interface{}{"value": 0}, timestamp); err == nil {
y.AddMeta("unit", "%") y.AddMeta("unit", "%")
output <- y output <- y
} }
} }
if !m.isExcluded("job_sys_cpu") { if !m.isExcluded("job_sys_cpu") {
if y, err := lp.NewMessage("job_sys_cpu", coreTags, m.meta, map[string]any{"value": 0}, timestamp); err == nil { if y, err := lp.NewMessage("job_sys_cpu", coreTags, m.meta, map[string]interface{}{"value": 0}, timestamp); err == nil {
y.AddMeta("unit", "%") y.AddMeta("unit", "%")
output <- y output <- y
} }

View File

@@ -16,8 +16,8 @@ import (
"strings" "strings"
"time" "time"
cclog "github.com/ClusterCockpit/cc-lib/v2/ccLogger" cclog "github.com/ClusterCockpit/cc-lib/ccLogger"
lp "github.com/ClusterCockpit/cc-lib/v2/ccMessage" lp "github.com/ClusterCockpit/cc-lib/ccMessage"
) )
// See: https://www.kernel.org/doc/html/latest/hwmon/sysfs-interface.html // See: https://www.kernel.org/doc/html/latest/hwmon/sysfs-interface.html
@@ -58,13 +58,11 @@ func (m *TempCollector) Init(config json.RawMessage) error {
m.name = "TempCollector" m.name = "TempCollector"
m.parallel = true m.parallel = true
if err := m.setup(); err != nil { m.setup()
return fmt.Errorf("%s Init(): setup() call failed: %w", m.name, err)
}
if len(config) > 0 { if len(config) > 0 {
err := json.Unmarshal(config, &m.config) err := json.Unmarshal(config, &m.config)
if err != nil { if err != nil {
return fmt.Errorf("%s Init(): failed to unmarshal JSON config: %w", m.name, err) return err
} }
} }
@@ -80,10 +78,10 @@ func (m *TempCollector) Init(config json.RawMessage) error {
globPattern := filepath.Join("/sys/class/hwmon", "*", "temp*_input") globPattern := filepath.Join("/sys/class/hwmon", "*", "temp*_input")
inputFiles, err := filepath.Glob(globPattern) inputFiles, err := filepath.Glob(globPattern)
if err != nil { if err != nil {
return fmt.Errorf("%s Init(): unable to glob files with pattern '%s': %w", m.name, globPattern, err) return fmt.Errorf("unable to glob files with pattern '%s': %v", globPattern, err)
} }
if inputFiles == nil { if inputFiles == nil {
return fmt.Errorf("%s Init(): unable to find any files with pattern '%s'", m.name, globPattern) return fmt.Errorf("unable to find any files with pattern '%s'", globPattern)
} }
// Get sensor name for each temperature sensor file // Get sensor name for each temperature sensor file
@@ -119,7 +117,7 @@ func (m *TempCollector) Init(config json.RawMessage) error {
sensor.metricName = sensor.label sensor.metricName = sensor.label
} }
sensor.metricName = strings.ToLower(sensor.metricName) sensor.metricName = strings.ToLower(sensor.metricName)
sensor.metricName = strings.ReplaceAll(sensor.metricName, " ", "_") sensor.metricName = strings.Replace(sensor.metricName, " ", "_", -1)
// Add temperature prefix, if required // Add temperature prefix, if required
if !strings.Contains(sensor.metricName, "temp") { if !strings.Contains(sensor.metricName, "temp") {
sensor.metricName = "temp_" + sensor.metricName sensor.metricName = "temp_" + sensor.metricName
@@ -172,7 +170,7 @@ func (m *TempCollector) Init(config json.RawMessage) error {
// Empty sensors map // Empty sensors map
if len(m.sensors) == 0 { if len(m.sensors) == 0 {
return fmt.Errorf("%s Init(): no temperature sensors found", m.name) return fmt.Errorf("no temperature sensors found")
} }
// Finished initialization // Finished initialization
@@ -203,7 +201,7 @@ func (m *TempCollector) Read(interval time.Duration, output chan lp.CCMessage) {
sensor.metricName, sensor.metricName,
sensor.tags, sensor.tags,
m.meta, m.meta,
map[string]any{"value": x}, map[string]interface{}{"value": x},
time.Now(), time.Now(),
) )
if err == nil { if err == nil {
@@ -216,7 +214,7 @@ func (m *TempCollector) Read(interval time.Duration, output chan lp.CCMessage) {
sensor.maxTempName, sensor.maxTempName,
sensor.tags, sensor.tags,
m.meta, m.meta,
map[string]any{"value": sensor.maxTemp}, map[string]interface{}{"value": sensor.maxTemp},
time.Now(), time.Now(),
) )
if err == nil { if err == nil {
@@ -230,7 +228,7 @@ func (m *TempCollector) Read(interval time.Duration, output chan lp.CCMessage) {
sensor.critTempName, sensor.critTempName,
sensor.tags, sensor.tags,
m.meta, m.meta,
map[string]any{"value": sensor.critTemp}, map[string]interface{}{"value": sensor.critTemp},
time.Now(), time.Now(),
) )
if err == nil { if err == nil {

View File

@@ -9,13 +9,14 @@ package collectors
import ( import (
"encoding/json" "encoding/json"
"errors"
"fmt" "fmt"
"log"
"os/exec" "os/exec"
"strings" "strings"
"time" "time"
cclog "github.com/ClusterCockpit/cc-lib/v2/ccLogger" lp "github.com/ClusterCockpit/cc-lib/ccMessage"
lp "github.com/ClusterCockpit/cc-lib/v2/ccMessage"
) )
const MAX_NUM_PROCS = 10 const MAX_NUM_PROCS = 10
@@ -35,17 +36,12 @@ func (m *TopProcsCollector) Init(config json.RawMessage) error {
var err error var err error
m.name = "TopProcsCollector" m.name = "TopProcsCollector"
m.parallel = true m.parallel = true
m.tags = map[string]string{ m.tags = map[string]string{"type": "node"}
"type": "node", m.meta = map[string]string{"source": m.name, "group": "TopProcs"}
}
m.meta = map[string]string{
"source": m.name,
"group": "TopProcs",
}
if len(config) > 0 { if len(config) > 0 {
err = json.Unmarshal(config, &m.config) err = json.Unmarshal(config, &m.config)
if err != nil { if err != nil {
return fmt.Errorf("%s Init(): json.Unmarshal() failed: %w", m.name, err) return err
} }
} else { } else {
m.config.Num_procs = int(DEFAULT_NUM_PROCS) m.config.Num_procs = int(DEFAULT_NUM_PROCS)
@@ -53,13 +49,12 @@ func (m *TopProcsCollector) Init(config json.RawMessage) error {
if m.config.Num_procs <= 0 || m.config.Num_procs > MAX_NUM_PROCS { if m.config.Num_procs <= 0 || m.config.Num_procs > MAX_NUM_PROCS {
return fmt.Errorf("num_procs option must be set in 'topprocs' config (range: 1-%d)", MAX_NUM_PROCS) return fmt.Errorf("num_procs option must be set in 'topprocs' config (range: 1-%d)", MAX_NUM_PROCS)
} }
if err := m.setup(); err != nil { m.setup()
return fmt.Errorf("%s Init(): setup() call failed: %w", m.name, err)
}
command := exec.Command("ps", "-Ao", "comm", "--sort=-pcpu") command := exec.Command("ps", "-Ao", "comm", "--sort=-pcpu")
command.Wait()
_, err = command.Output() _, err = command.Output()
if err != nil { if err != nil {
return fmt.Errorf("%s Init(): failed to get output from command: %w", m.name, err) return errors.New("failed to execute command")
} }
m.init = true m.init = true
return nil return nil
@@ -70,24 +65,17 @@ func (m *TopProcsCollector) Read(interval time.Duration, output chan lp.CCMessag
return return
} }
command := exec.Command("ps", "-Ao", "comm", "--sort=-pcpu") command := exec.Command("ps", "-Ao", "comm", "--sort=-pcpu")
command.Wait()
stdout, err := command.Output() stdout, err := command.Output()
if err != nil { if err != nil {
cclog.ComponentError( log.Print(m.name, err)
m.name,
fmt.Sprintf("Read(): Failed to read output from command \"%s\": %v", command.String(), err))
return return
} }
lines := strings.Split(string(stdout), "\n") lines := strings.Split(string(stdout), "\n")
for i := 1; i < m.config.Num_procs+1; i++ { for i := 1; i < m.config.Num_procs+1; i++ {
name := fmt.Sprintf("topproc%d", i) name := fmt.Sprintf("topproc%d", i)
y, err := lp.NewMessage( y, err := lp.NewMessage(name, m.tags, m.meta, map[string]interface{}{"value": string(lines[i])}, time.Now())
name,
m.tags,
m.meta,
map[string]any{
"value": lines[i]},
time.Now())
if err == nil { if err == nil {
output <- y output <- y
} }

View File

@@ -1,19 +1,6 @@
{ {
"cpufreq": {}, "cpufreq": {},
"cpufreq_cpuinfo": {}, "cpufreq_cpuinfo": {},
"cpustat": {
"exclude_metrics": [
"cpu_idle"
]
},
"diskstat": {
"exclude_metrics": [
"disk_total"
],
"exclude_mounts": [
"slurm-tmpfs"
]
},
"gpfs": { "gpfs": {
"exclude_filesystem": [ "exclude_filesystem": [
"test_fs" "test_fs"
@@ -34,8 +21,6 @@
}, },
"numastats": {}, "numastats": {},
"nvidia": {}, "nvidia": {},
"schedstat": {
},
"tempstat": { "tempstat": {
"report_max_temperature": true, "report_max_temperature": true,
"report_critical_temperature": true, "report_critical_temperature": true,

28
go.mod
View File

@@ -3,7 +3,7 @@ module github.com/ClusterCockpit/cc-metric-collector
go 1.24.0 go 1.24.0
require ( require (
github.com/ClusterCockpit/cc-lib/v2 v2.2.1 github.com/ClusterCockpit/cc-lib v0.11.0
github.com/ClusterCockpit/go-rocm-smi v0.3.0 github.com/ClusterCockpit/go-rocm-smi v0.3.0
github.com/NVIDIA/go-nvml v0.13.0-1 github.com/NVIDIA/go-nvml v0.13.0-1
github.com/PaesslerAG/gval v1.2.4 github.com/PaesslerAG/gval v1.2.4
@@ -11,35 +11,35 @@ require (
github.com/influxdata/line-protocol v0.0.0-20210922203350-b1ad95c89adf github.com/influxdata/line-protocol v0.0.0-20210922203350-b1ad95c89adf
github.com/tklauser/go-sysconf v0.3.16 github.com/tklauser/go-sysconf v0.3.16
golang.design/x/thread v0.0.0-20210122121316-335e9adffdf1 golang.design/x/thread v0.0.0-20210122121316-335e9adffdf1
golang.org/x/sys v0.41.0 golang.org/x/exp v0.0.0-20250620022241-b7579e27df2b
golang.org/x/sys v0.38.0
) )
require ( require (
github.com/apapsch/go-jsonmerge/v2 v2.0.0 // indirect github.com/apapsch/go-jsonmerge/v2 v2.0.0 // indirect
github.com/beorn7/perks v1.0.1 // indirect github.com/beorn7/perks v1.0.1 // indirect
github.com/cespare/xxhash/v2 v2.3.0 // indirect github.com/cespare/xxhash/v2 v2.3.0 // indirect
github.com/expr-lang/expr v1.17.7 // indirect github.com/expr-lang/expr v1.17.6 // indirect
github.com/google/uuid v1.6.0 // indirect github.com/google/uuid v1.6.0 // indirect
github.com/gorilla/mux v1.8.1 // indirect github.com/gorilla/mux v1.8.1 // indirect
github.com/influxdata/influxdb-client-go/v2 v2.14.0 // indirect github.com/influxdata/influxdb-client-go/v2 v2.14.0 // indirect
github.com/influxdata/line-protocol/v2 v2.2.1 // indirect github.com/influxdata/line-protocol/v2 v2.2.1 // indirect
github.com/klauspost/compress v1.18.2 // indirect github.com/klauspost/compress v1.18.2 // indirect
github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822 // indirect github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822 // indirect
github.com/nats-io/nats.go v1.48.0 // indirect github.com/nats-io/nats.go v1.47.0 // indirect
github.com/nats-io/nkeys v0.4.12 // indirect github.com/nats-io/nkeys v0.4.11 // indirect
github.com/nats-io/nuid v1.0.1 // indirect github.com/nats-io/nuid v1.0.1 // indirect
github.com/oapi-codegen/runtime v1.1.2 // indirect github.com/oapi-codegen/runtime v1.1.1 // indirect
github.com/prometheus/client_golang v1.23.2 // indirect github.com/prometheus/client_golang v1.23.2 // indirect
github.com/prometheus/client_model v0.6.2 // indirect github.com/prometheus/client_model v0.6.2 // indirect
github.com/prometheus/common v0.67.5 // indirect github.com/prometheus/common v0.66.1 // indirect
github.com/prometheus/procfs v0.19.2 // indirect github.com/prometheus/procfs v0.16.1 // indirect
github.com/santhosh-tekuri/jsonschema/v5 v5.3.1 // indirect github.com/santhosh-tekuri/jsonschema/v5 v5.3.1 // indirect
github.com/shopspring/decimal v1.4.0 // indirect github.com/shopspring/decimal v1.3.1 // indirect
github.com/stmcginnis/gofish v0.20.0 // indirect github.com/stmcginnis/gofish v0.20.0 // indirect
github.com/tklauser/numcpus v0.11.0 // indirect github.com/tklauser/numcpus v0.11.0 // indirect
go.yaml.in/yaml/v2 v2.4.3 // indirect go.yaml.in/yaml/v2 v2.4.2 // indirect
golang.org/x/crypto v0.47.0 // indirect golang.org/x/crypto v0.43.0 // indirect
golang.org/x/exp v0.0.0-20260112195511-716be5621a96 // indirect golang.org/x/net v0.45.0 // indirect
golang.org/x/net v0.49.0 // indirect google.golang.org/protobuf v1.36.8 // indirect
google.golang.org/protobuf v1.36.11 // indirect
) )

69
go.sum
View File

@@ -1,5 +1,5 @@
github.com/ClusterCockpit/cc-lib/v2 v2.2.1 h1:iCVas+Jc61zFH5S2VG3H1sc7tsn+U4lOJwUYjYZEims= github.com/ClusterCockpit/cc-lib v0.11.0 h1:66YkTOxWUak7nB3r7dJEm2q+B0uPRPGj0mwXZHXpOuA=
github.com/ClusterCockpit/cc-lib/v2 v2.2.1/go.mod h1:JuxMAuEOaLLNEnnL9U3ejha8kMvsSatLdKPZEgJw6iw= github.com/ClusterCockpit/cc-lib v0.11.0/go.mod h1:0LKjDJs813/NMmaSJXJc11A9rxiFDPV/QdWQbZUp0XY=
github.com/ClusterCockpit/go-rocm-smi v0.3.0 h1:1qZnSpG7/NyLtc7AjqnUL9Jb8xtqG1nMVgp69rJfaR8= github.com/ClusterCockpit/go-rocm-smi v0.3.0 h1:1qZnSpG7/NyLtc7AjqnUL9Jb8xtqG1nMVgp69rJfaR8=
github.com/ClusterCockpit/go-rocm-smi v0.3.0/go.mod h1:+I3UMeX3OlizXDf1WpGD43W4KGZZGVSGmny6rTeOnWA= github.com/ClusterCockpit/go-rocm-smi v0.3.0/go.mod h1:+I3UMeX3OlizXDf1WpGD43W4KGZZGVSGmny6rTeOnWA=
github.com/NVIDIA/go-nvml v0.11.6-0/go.mod h1:hy7HYeQy335x6nEss0Ne3PYqleRa6Ct+VKD9RQ4nyFs= github.com/NVIDIA/go-nvml v0.11.6-0/go.mod h1:hy7HYeQy335x6nEss0Ne3PYqleRa6Ct+VKD9RQ4nyFs=
@@ -10,8 +10,8 @@ github.com/PaesslerAG/gval v1.2.4/go.mod h1:XRFLwvmkTEdYziLdaCeCa5ImcGVrfQbeNUbV
github.com/PaesslerAG/jsonpath v0.1.0 h1:gADYeifvlqK3R3i2cR5B4DGgxLXIPb3TRTH1mGi0jPI= github.com/PaesslerAG/jsonpath v0.1.0 h1:gADYeifvlqK3R3i2cR5B4DGgxLXIPb3TRTH1mGi0jPI=
github.com/PaesslerAG/jsonpath v0.1.0/go.mod h1:4BzmtoM/PI8fPO4aQGIusjGxGir2BzcV0grWtFzq1Y8= github.com/PaesslerAG/jsonpath v0.1.0/go.mod h1:4BzmtoM/PI8fPO4aQGIusjGxGir2BzcV0grWtFzq1Y8=
github.com/RaveNoX/go-jsoncommentstrip v1.0.0/go.mod h1:78ihd09MekBnJnxpICcwzCMzGrKSKYe4AqU6PDYYpjk= github.com/RaveNoX/go-jsoncommentstrip v1.0.0/go.mod h1:78ihd09MekBnJnxpICcwzCMzGrKSKYe4AqU6PDYYpjk=
github.com/antithesishq/antithesis-sdk-go v0.5.0-default-no-op h1:Ucf+QxEKMbPogRO5guBNe5cgd9uZgfoJLOYs8WWhtjM= github.com/antithesishq/antithesis-sdk-go v0.4.3-default-no-op h1:+OSa/t11TFhqfrX0EOSqQBDJ0YlpmK0rDSiB19dg9M0=
github.com/antithesishq/antithesis-sdk-go v0.5.0-default-no-op/go.mod h1:IUpT2DPAKh6i/YhSbt6Gl3v2yvUZjmKncl7U91fup7E= github.com/antithesishq/antithesis-sdk-go v0.4.3-default-no-op/go.mod h1:IUpT2DPAKh6i/YhSbt6Gl3v2yvUZjmKncl7U91fup7E=
github.com/apapsch/go-jsonmerge/v2 v2.0.0 h1:axGnT1gRIfimI7gJifB699GoE/oq+F2MU7Dml6nw9rQ= github.com/apapsch/go-jsonmerge/v2 v2.0.0 h1:axGnT1gRIfimI7gJifB699GoE/oq+F2MU7Dml6nw9rQ=
github.com/apapsch/go-jsonmerge/v2 v2.0.0/go.mod h1:lvDnEdqiQrp0O42VQGgmlKpxL1AP2+08jFMw88y4klk= github.com/apapsch/go-jsonmerge/v2 v2.0.0/go.mod h1:lvDnEdqiQrp0O42VQGgmlKpxL1AP2+08jFMw88y4klk=
github.com/beorn7/perks v1.0.1 h1:VlbKKnNfV8bJzeqoa4cOKqO6bYr3WgKZxO8Z16+hsOM= github.com/beorn7/perks v1.0.1 h1:VlbKKnNfV8bJzeqoa4cOKqO6bYr3WgKZxO8Z16+hsOM=
@@ -23,8 +23,8 @@ github.com/creack/pty v1.1.9/go.mod h1:oKZEueFk5CKHvIhNR5MUki03XCEU+Q6VDXinZuGJ3
github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c= github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c=
github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
github.com/expr-lang/expr v1.17.7 h1:Q0xY/e/2aCIp8g9s/LGvMDCC5PxYlvHgDZRQ4y16JX8= github.com/expr-lang/expr v1.17.6 h1:1h6i8ONk9cexhDmowO/A64VPxHScu7qfSl2k8OlINec=
github.com/expr-lang/expr v1.17.7/go.mod h1:8/vRC7+7HBzESEqt5kKpYXxrxkr31SaO8r40VO/1IT4= github.com/expr-lang/expr v1.17.6/go.mod h1:8/vRC7+7HBzESEqt5kKpYXxrxkr31SaO8r40VO/1IT4=
github.com/frankban/quicktest v1.11.0/go.mod h1:K+q6oSqb0W0Ininfk863uOk1lMy69l/P6txr3mVT54s= github.com/frankban/quicktest v1.11.0/go.mod h1:K+q6oSqb0W0Ininfk863uOk1lMy69l/P6txr3mVT54s=
github.com/frankban/quicktest v1.11.2/go.mod h1:K+q6oSqb0W0Ininfk863uOk1lMy69l/P6txr3mVT54s= github.com/frankban/quicktest v1.11.2/go.mod h1:K+q6oSqb0W0Ininfk863uOk1lMy69l/P6txr3mVT54s=
github.com/frankban/quicktest v1.13.0 h1:yNZif1OkDfNoDfb9zZa9aXIpejNR4F23Wely0c+Qdqk= github.com/frankban/quicktest v1.13.0 h1:yNZif1OkDfNoDfb9zZa9aXIpejNR4F23Wely0c+Qdqk=
@@ -35,8 +35,8 @@ github.com/google/go-cmp v0.5.2/go.mod h1:v8dTdLbMG2kIc/vJvl+f65V22dbkXbowE6jgT/
github.com/google/go-cmp v0.5.5/go.mod h1:v8dTdLbMG2kIc/vJvl+f65V22dbkXbowE6jgT/gNBxE= github.com/google/go-cmp v0.5.5/go.mod h1:v8dTdLbMG2kIc/vJvl+f65V22dbkXbowE6jgT/gNBxE=
github.com/google/go-cmp v0.7.0 h1:wk8382ETsv4JYUZwIsn6YpYiWiBsYLSJiTsyBybVuN8= github.com/google/go-cmp v0.7.0 h1:wk8382ETsv4JYUZwIsn6YpYiWiBsYLSJiTsyBybVuN8=
github.com/google/go-cmp v0.7.0/go.mod h1:pXiqmnSA92OHEEa9HXL2W4E7lf9JzCmGVUdgjX3N/iU= github.com/google/go-cmp v0.7.0/go.mod h1:pXiqmnSA92OHEEa9HXL2W4E7lf9JzCmGVUdgjX3N/iU=
github.com/google/go-tpm v0.9.7 h1:u89J4tUUeDTlH8xxC3CTW7OHZjbjKoHdQ9W7gCUhtxA= github.com/google/go-tpm v0.9.6 h1:Ku42PT4LmjDu1H5C5ISWLlpI1mj+Zq7sPGKoRw2XROA=
github.com/google/go-tpm v0.9.7/go.mod h1:h9jEsEECg7gtLis0upRBQU+GhYVH6jMjrFxI8u6bVUY= github.com/google/go-tpm v0.9.6/go.mod h1:h9jEsEECg7gtLis0upRBQU+GhYVH6jMjrFxI8u6bVUY=
github.com/google/uuid v1.6.0 h1:NIvaJDMOsjHA8n1jAhLSgzrAzy1Hgr+hNrb57e+94F0= github.com/google/uuid v1.6.0 h1:NIvaJDMOsjHA8n1jAhLSgzrAzy1Hgr+hNrb57e+94F0=
github.com/google/uuid v1.6.0/go.mod h1:TIyPZe4MgqvfeYDBFedMoGGpEw/LqOeaOT+nhxU+yHo= github.com/google/uuid v1.6.0/go.mod h1:TIyPZe4MgqvfeYDBFedMoGGpEw/LqOeaOT+nhxU+yHo=
github.com/gorilla/mux v1.8.1 h1:TuBL49tXwgrFYWhqrNgrUNEY92u81SPhu7sTdzQEiWY= github.com/gorilla/mux v1.8.1 h1:TuBL49tXwgrFYWhqrNgrUNEY92u81SPhu7sTdzQEiWY=
@@ -53,6 +53,8 @@ github.com/influxdata/line-protocol/v2 v2.1.0/go.mod h1:QKw43hdUBg3GTk2iC3iyCxks
github.com/influxdata/line-protocol/v2 v2.2.1 h1:EAPkqJ9Km4uAxtMRgUubJyqAr6zgWM0dznKMLRauQRE= github.com/influxdata/line-protocol/v2 v2.2.1 h1:EAPkqJ9Km4uAxtMRgUubJyqAr6zgWM0dznKMLRauQRE=
github.com/influxdata/line-protocol/v2 v2.2.1/go.mod h1:DmB3Cnh+3oxmG6LOBIxce4oaL4CPj3OmMPgvauXh+tM= github.com/influxdata/line-protocol/v2 v2.2.1/go.mod h1:DmB3Cnh+3oxmG6LOBIxce4oaL4CPj3OmMPgvauXh+tM=
github.com/juju/gnuflag v0.0.0-20171113085948-2ce1bb71843d/go.mod h1:2PavIy+JPciBPrBUjwbNvtwB6RQlve+hkpll6QSNmOE= github.com/juju/gnuflag v0.0.0-20171113085948-2ce1bb71843d/go.mod h1:2PavIy+JPciBPrBUjwbNvtwB6RQlve+hkpll6QSNmOE=
github.com/klauspost/compress v1.18.1 h1:bcSGx7UbpBqMChDtsF28Lw6v/G94LPrrbMbdC3JH2co=
github.com/klauspost/compress v1.18.1/go.mod h1:ZQFFVG+MdnR0P+l6wpXgIL4NTtwiKIdBnrBd8Nrxr+0=
github.com/klauspost/compress v1.18.2 h1:iiPHWW0YrcFgpBYhsA6D1+fqHssJscY/Tm/y2Uqnapk= github.com/klauspost/compress v1.18.2 h1:iiPHWW0YrcFgpBYhsA6D1+fqHssJscY/Tm/y2Uqnapk=
github.com/klauspost/compress v1.18.2/go.mod h1:R0h/fSBs8DE4ENlcrlib3PsXS61voFxhIs2DeRhCvJ4= github.com/klauspost/compress v1.18.2/go.mod h1:R0h/fSBs8DE4ENlcrlib3PsXS61voFxhIs2DeRhCvJ4=
github.com/kr/pretty v0.2.1/go.mod h1:ipq/a2n7PKx3OHsz4KJII5eveXtPO4qwEXGdVfWzfnI= github.com/kr/pretty v0.2.1/go.mod h1:ipq/a2n7PKx3OHsz4KJII5eveXtPO4qwEXGdVfWzfnI=
@@ -70,34 +72,33 @@ github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822 h1:C3w9PqII01/Oq
github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822/go.mod h1:+n7T8mK8HuQTcFwEeznm/DIxMOiR9yIdICNftLE1DvQ= github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822/go.mod h1:+n7T8mK8HuQTcFwEeznm/DIxMOiR9yIdICNftLE1DvQ=
github.com/nats-io/jwt/v2 v2.8.0 h1:K7uzyz50+yGZDO5o772eRE7atlcSEENpL7P+b74JV1g= github.com/nats-io/jwt/v2 v2.8.0 h1:K7uzyz50+yGZDO5o772eRE7atlcSEENpL7P+b74JV1g=
github.com/nats-io/jwt/v2 v2.8.0/go.mod h1:me11pOkwObtcBNR8AiMrUbtVOUGkqYjMQZ6jnSdVUIA= github.com/nats-io/jwt/v2 v2.8.0/go.mod h1:me11pOkwObtcBNR8AiMrUbtVOUGkqYjMQZ6jnSdVUIA=
github.com/nats-io/nats-server/v2 v2.12.3 h1:KRv+1n7lddMVgkJPQer+pt36TcO0ENxjilBmeWdjcHs= github.com/nats-io/nats-server/v2 v2.12.2 h1:4TEQd0Y4zvcW0IsVxjlXnRso1hBkQl3TS0BI+SxgPhE=
github.com/nats-io/nats-server/v2 v2.12.3/go.mod h1:MQXjG9WjyXKz9koWzUc3jYUMKD8x3CLmTNy91IQQz3Y= github.com/nats-io/nats-server/v2 v2.12.2/go.mod h1:j1AAttYeu7WnvD8HLJ+WWKNMSyxsqmZ160pNtCQRMyE=
github.com/nats-io/nats.go v1.48.0 h1:pSFyXApG+yWU/TgbKCjmm5K4wrHu86231/w84qRVR+U= github.com/nats-io/nats.go v1.47.0 h1:YQdADw6J/UfGUd2Oy6tn4Hq6YHxCaJrVKayxxFqYrgM=
github.com/nats-io/nats.go v1.48.0/go.mod h1:iRWIPokVIFbVijxuMQq4y9ttaBTMe0SFdlZfMDd+33g= github.com/nats-io/nats.go v1.47.0/go.mod h1:iRWIPokVIFbVijxuMQq4y9ttaBTMe0SFdlZfMDd+33g=
github.com/nats-io/nkeys v0.4.12 h1:nssm7JKOG9/x4J8II47VWCL1Ds29avyiQDRn0ckMvDc= github.com/nats-io/nkeys v0.4.11 h1:q44qGV008kYd9W1b1nEBkNzvnWxtRSQ7A8BoqRrcfa0=
github.com/nats-io/nkeys v0.4.12/go.mod h1:MT59A1HYcjIcyQDJStTfaOY6vhy9XTUjOFo+SVsvpBg= github.com/nats-io/nkeys v0.4.11/go.mod h1:szDimtgmfOi9n25JpfIdGw12tZFYXqhGxjhVxsatHVE=
github.com/nats-io/nuid v1.0.1 h1:5iA8DT8V7q8WK2EScv2padNa/rTESc1KdnPw4TC2paw= github.com/nats-io/nuid v1.0.1 h1:5iA8DT8V7q8WK2EScv2padNa/rTESc1KdnPw4TC2paw=
github.com/nats-io/nuid v1.0.1/go.mod h1:19wcPz3Ph3q0Jbyiqsd0kePYG7A95tJPxeL+1OSON2c= github.com/nats-io/nuid v1.0.1/go.mod h1:19wcPz3Ph3q0Jbyiqsd0kePYG7A95tJPxeL+1OSON2c=
github.com/niemeyer/pretty v0.0.0-20200227124842-a10e7caefd8e/go.mod h1:zD1mROLANZcx1PVRCS0qkT7pwLkGfwJo4zjcN/Tysno= github.com/niemeyer/pretty v0.0.0-20200227124842-a10e7caefd8e/go.mod h1:zD1mROLANZcx1PVRCS0qkT7pwLkGfwJo4zjcN/Tysno=
github.com/oapi-codegen/runtime v1.1.2 h1:P2+CubHq8fO4Q6fV1tqDBZHCwpVpvPg7oKiYzQgXIyI= github.com/oapi-codegen/runtime v1.1.1 h1:EXLHh0DXIJnWhdRPN2w4MXAzFyE4CskzhNLUmtpMYro=
github.com/oapi-codegen/runtime v1.1.2/go.mod h1:SK9X900oXmPWilYR5/WKPzt3Kqxn/uS/+lbpREv+eCg= github.com/oapi-codegen/runtime v1.1.1/go.mod h1:SK9X900oXmPWilYR5/WKPzt3Kqxn/uS/+lbpREv+eCg=
github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM= github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM=
github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4= github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4=
github.com/prometheus/client_golang v1.23.2 h1:Je96obch5RDVy3FDMndoUsjAhG5Edi49h0RJWRi/o0o= github.com/prometheus/client_golang v1.23.2 h1:Je96obch5RDVy3FDMndoUsjAhG5Edi49h0RJWRi/o0o=
github.com/prometheus/client_golang v1.23.2/go.mod h1:Tb1a6LWHB3/SPIzCoaDXI4I8UHKeFTEQ1YCr+0Gyqmg= github.com/prometheus/client_golang v1.23.2/go.mod h1:Tb1a6LWHB3/SPIzCoaDXI4I8UHKeFTEQ1YCr+0Gyqmg=
github.com/prometheus/client_model v0.6.2 h1:oBsgwpGs7iVziMvrGhE53c/GrLUsZdHnqNwqPLxwZyk= github.com/prometheus/client_model v0.6.2 h1:oBsgwpGs7iVziMvrGhE53c/GrLUsZdHnqNwqPLxwZyk=
github.com/prometheus/client_model v0.6.2/go.mod h1:y3m2F6Gdpfy6Ut/GBsUqTWZqCUvMVzSfMLjcu6wAwpE= github.com/prometheus/client_model v0.6.2/go.mod h1:y3m2F6Gdpfy6Ut/GBsUqTWZqCUvMVzSfMLjcu6wAwpE=
github.com/prometheus/common v0.67.5 h1:pIgK94WWlQt1WLwAC5j2ynLaBRDiinoAb86HZHTUGI4= github.com/prometheus/common v0.66.1 h1:h5E0h5/Y8niHc5DlaLlWLArTQI7tMrsfQjHV+d9ZoGs=
github.com/prometheus/common v0.67.5/go.mod h1:SjE/0MzDEEAyrdr5Gqc6G+sXI67maCxzaT3A2+HqjUw= github.com/prometheus/common v0.66.1/go.mod h1:gcaUsgf3KfRSwHY4dIMXLPV0K/Wg1oZ8+SbZk/HH/dA=
github.com/prometheus/procfs v0.19.2 h1:zUMhqEW66Ex7OXIiDkll3tl9a1ZdilUOd/F6ZXw4Vws= github.com/prometheus/procfs v0.16.1 h1:hZ15bTNuirocR6u0JZ6BAHHmwS1p8B4P6MRqxtzMyRg=
github.com/prometheus/procfs v0.19.2/go.mod h1:M0aotyiemPhBCM0z5w87kL22CxfcH05ZpYlu+b4J7mw= github.com/prometheus/procfs v0.16.1/go.mod h1:teAbpZRB1iIAJYREa1LsoWUXykVXA1KlTmWl8x/U+Is=
github.com/rogpeppe/go-internal v1.10.0 h1:TMyTOH3F/DB16zRVcYyreMH6GnZZrwQVAoYjRBZyWFQ= github.com/rogpeppe/go-internal v1.10.0 h1:TMyTOH3F/DB16zRVcYyreMH6GnZZrwQVAoYjRBZyWFQ=
github.com/rogpeppe/go-internal v1.10.0/go.mod h1:UQnix2H7Ngw/k4C5ijL5+65zddjncjaFoBhdsK/akog= github.com/rogpeppe/go-internal v1.10.0/go.mod h1:UQnix2H7Ngw/k4C5ijL5+65zddjncjaFoBhdsK/akog=
github.com/santhosh-tekuri/jsonschema/v5 v5.3.1 h1:lZUw3E0/J3roVtGQ+SCrUrg3ON6NgVqpn3+iol9aGu4= github.com/santhosh-tekuri/jsonschema/v5 v5.3.1 h1:lZUw3E0/J3roVtGQ+SCrUrg3ON6NgVqpn3+iol9aGu4=
github.com/santhosh-tekuri/jsonschema/v5 v5.3.1/go.mod h1:uToXkOrWAZ6/Oc07xWQrPOhJotwFIyu2bBVN41fcDUY= github.com/santhosh-tekuri/jsonschema/v5 v5.3.1/go.mod h1:uToXkOrWAZ6/Oc07xWQrPOhJotwFIyu2bBVN41fcDUY=
github.com/shopspring/decimal v1.3.1 h1:2Usl1nmF/WZucqkFZhnfFYxxxu8LG21F6nPQBE5gKV8=
github.com/shopspring/decimal v1.3.1/go.mod h1:DKyhrW/HYNuLGql+MJL6WCR6knT2jwCFRcu2hWCYk4o= github.com/shopspring/decimal v1.3.1/go.mod h1:DKyhrW/HYNuLGql+MJL6WCR6knT2jwCFRcu2hWCYk4o=
github.com/shopspring/decimal v1.4.0 h1:bxl37RwXBklmTi0C79JfXCEBD1cqqHt0bbgBAGFp81k=
github.com/shopspring/decimal v1.4.0/go.mod h1:gawqmDU56v4yIKSwfBSFip1HdCCXN8/+DMd9qYNcwME=
github.com/spkg/bom v0.0.0-20160624110644-59b7046e48ad/go.mod h1:qLr4V1qq6nMqFKkMo8ZTx3f+BZEkzsRUY10Xsm2mwU0= github.com/spkg/bom v0.0.0-20160624110644-59b7046e48ad/go.mod h1:qLr4V1qq6nMqFKkMo8ZTx3f+BZEkzsRUY10Xsm2mwU0=
github.com/stmcginnis/gofish v0.20.0 h1:hH2V2Qe898F2wWT1loApnkDUrXXiLKqbSlMaH3Y1n08= github.com/stmcginnis/gofish v0.20.0 h1:hH2V2Qe898F2wWT1loApnkDUrXXiLKqbSlMaH3Y1n08=
github.com/stmcginnis/gofish v0.20.0/go.mod h1:PzF5i8ecRG9A2ol8XT64npKUunyraJ+7t0kYMpQAtqU= github.com/stmcginnis/gofish v0.20.0/go.mod h1:PzF5i8ecRG9A2ol8XT64npKUunyraJ+7t0kYMpQAtqU=
@@ -111,24 +112,24 @@ github.com/tklauser/numcpus v0.11.0 h1:nSTwhKH5e1dMNsCdVBukSZrURJRoHbSEQjdEbY+9R
github.com/tklauser/numcpus v0.11.0/go.mod h1:z+LwcLq54uWZTX0u/bGobaV34u6V7KNlTZejzM6/3MQ= github.com/tklauser/numcpus v0.11.0/go.mod h1:z+LwcLq54uWZTX0u/bGobaV34u6V7KNlTZejzM6/3MQ=
go.uber.org/goleak v1.3.0 h1:2K3zAYmnTNqV73imy9J1T3WC+gmCePx2hEGkimedGto= go.uber.org/goleak v1.3.0 h1:2K3zAYmnTNqV73imy9J1T3WC+gmCePx2hEGkimedGto=
go.uber.org/goleak v1.3.0/go.mod h1:CoHD4mav9JJNrW/WLlf7HGZPjdw8EucARQHekz1X6bE= go.uber.org/goleak v1.3.0/go.mod h1:CoHD4mav9JJNrW/WLlf7HGZPjdw8EucARQHekz1X6bE=
go.yaml.in/yaml/v2 v2.4.3 h1:6gvOSjQoTB3vt1l+CU+tSyi/HOjfOjRLJ4YwYZGwRO0= go.yaml.in/yaml/v2 v2.4.2 h1:DzmwEr2rDGHl7lsFgAHxmNz/1NlQ7xLIrlN2h5d1eGI=
go.yaml.in/yaml/v2 v2.4.3/go.mod h1:zSxWcmIDjOzPXpjlTTbAsKokqkDNAVtZO0WOMiT90s8= go.yaml.in/yaml/v2 v2.4.2/go.mod h1:081UH+NErpNdqlCXm3TtEran0rJZGxAYx9hb/ELlsPU=
golang.design/x/thread v0.0.0-20210122121316-335e9adffdf1 h1:P7S/GeHBAFEZIYp0ePPs2kHXoazz8q2KsyxHyQVGCJg= golang.design/x/thread v0.0.0-20210122121316-335e9adffdf1 h1:P7S/GeHBAFEZIYp0ePPs2kHXoazz8q2KsyxHyQVGCJg=
golang.design/x/thread v0.0.0-20210122121316-335e9adffdf1/go.mod h1:9CWpnTUmlQkfdpdutA1nNf4iE5lAVt3QZOu0Z6hahBE= golang.design/x/thread v0.0.0-20210122121316-335e9adffdf1/go.mod h1:9CWpnTUmlQkfdpdutA1nNf4iE5lAVt3QZOu0Z6hahBE=
golang.org/x/crypto v0.47.0 h1:V6e3FRj+n4dbpw86FJ8Fv7XVOql7TEwpHapKoMJ/GO8= golang.org/x/crypto v0.43.0 h1:dduJYIi3A3KOfdGOHX8AVZ/jGiyPa3IbBozJ5kNuE04=
golang.org/x/crypto v0.47.0/go.mod h1:ff3Y9VzzKbwSSEzWqJsJVBnWmRwRSHt/6Op5n9bQc4A= golang.org/x/crypto v0.43.0/go.mod h1:BFbav4mRNlXJL4wNeejLpWxB7wMbc79PdRGhWKncxR0=
golang.org/x/exp v0.0.0-20260112195511-716be5621a96 h1:Z/6YuSHTLOHfNFdb8zVZomZr7cqNgTJvA8+Qz75D8gU= golang.org/x/exp v0.0.0-20250620022241-b7579e27df2b h1:M2rDM6z3Fhozi9O7NWsxAkg/yqS/lQJ6PmkyIV3YP+o=
golang.org/x/exp v0.0.0-20260112195511-716be5621a96/go.mod h1:nzimsREAkjBCIEFtHiYkrJyT+2uy9YZJB7H1k68CXZU= golang.org/x/exp v0.0.0-20250620022241-b7579e27df2b/go.mod h1:3//PLf8L/X+8b4vuAfHzxeRUl04Adcb341+IGKfnqS8=
golang.org/x/net v0.49.0 h1:eeHFmOGUTtaaPSGNmjBKpbng9MulQsJURQUAfUwY++o= golang.org/x/net v0.45.0 h1:RLBg5JKixCy82FtLJpeNlVM0nrSqpCRYzVU1n8kj0tM=
golang.org/x/net v0.49.0/go.mod h1:/ysNB2EvaqvesRkuLAyjI1ycPZlQHM3q01F02UY/MV8= golang.org/x/net v0.45.0/go.mod h1:ECOoLqd5U3Lhyeyo/QDCEVQ4sNgYsqvCZ722XogGieY=
golang.org/x/sys v0.0.0-20210122093101-04d7465088b8/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= golang.org/x/sys v0.0.0-20210122093101-04d7465088b8/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
golang.org/x/sys v0.41.0 h1:Ivj+2Cp/ylzLiEU89QhWblYnOE9zerudt9Ftecq2C6k= golang.org/x/sys v0.38.0 h1:3yZWxaJjBmCWXqhN1qh02AkOnCQ1poK6oF+a7xWL6Gc=
golang.org/x/sys v0.41.0/go.mod h1:OgkHotnGiDImocRcuBABYBEXf8A9a87e/uXjp9XT3ks= golang.org/x/sys v0.38.0/go.mod h1:OgkHotnGiDImocRcuBABYBEXf8A9a87e/uXjp9XT3ks=
golang.org/x/time v0.14.0 h1:MRx4UaLrDotUKUdCIqzPC48t1Y9hANFKIRpNx+Te8PI= golang.org/x/time v0.14.0 h1:MRx4UaLrDotUKUdCIqzPC48t1Y9hANFKIRpNx+Te8PI=
golang.org/x/time v0.14.0/go.mod h1:eL/Oa2bBBK0TkX57Fyni+NgnyQQN4LitPmob2Hjnqw4= golang.org/x/time v0.14.0/go.mod h1:eL/Oa2bBBK0TkX57Fyni+NgnyQQN4LitPmob2Hjnqw4=
golang.org/x/xerrors v0.0.0-20191204190536-9bdfabe68543/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= golang.org/x/xerrors v0.0.0-20191204190536-9bdfabe68543/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0=
google.golang.org/protobuf v1.36.11 h1:fV6ZwhNocDyBLK0dj+fg8ektcVegBBuEolpbTQyBNVE= google.golang.org/protobuf v1.36.8 h1:xHScyCOEuuwZEc6UtSOvPbAT4zRh0xcNRYekJwfqyMc=
google.golang.org/protobuf v1.36.11/go.mod h1:HTf+CrKn2C3g5S8VImy6tdcUvCska2kB7j23XfzDpco= google.golang.org/protobuf v1.36.8/go.mod h1:fuxRtAxBytpl4zzqUh6/eyUujkJdNiuEkXntxiD/uRU=
gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0= gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0=
gopkg.in/check.v1 v1.0.0-20200227125254-8fa46927fb4f/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0= gopkg.in/check.v1 v1.0.0-20200227125254-8fa46927fb4f/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0=
gopkg.in/check.v1 v1.0.0-20201130134442-10cb98267c6c h1:Hei/4ADfdWqJk1ZMxUNpqntNwaWcugrBjAiHlqqRiVk= gopkg.in/check.v1 v1.0.0-20201130134442-10cb98267c6c h1:Hei/4ADfdWqJk1ZMxUNpqntNwaWcugrBjAiHlqqRiVk=

View File

@@ -10,17 +10,15 @@ package metricAggregator
import ( import (
"context" "context"
"fmt" "fmt"
"maps"
"math" "math"
"os" "os"
"slices"
"strings" "strings"
"sync" "sync"
"time" "time"
cclog "github.com/ClusterCockpit/cc-lib/v2/ccLogger" cclog "github.com/ClusterCockpit/cc-lib/ccLogger"
lp "github.com/ClusterCockpit/cc-lib/v2/ccMessage" lp "github.com/ClusterCockpit/cc-lib/ccMessage"
topo "github.com/ClusterCockpit/cc-metric-collector/pkg/ccTopology" topo "github.com/ClusterCockpit/cc-metric-collector/pkg/ccTopology"
"github.com/PaesslerAG/gval" "github.com/PaesslerAG/gval"
@@ -38,7 +36,7 @@ type MetricAggregatorIntervalConfig struct {
type metricAggregator struct { type metricAggregator struct {
functions []*MetricAggregatorIntervalConfig functions []*MetricAggregatorIntervalConfig
constants map[string]any constants map[string]interface{}
language gval.Language language gval.Language
output chan lp.CCMessage output chan lp.CCMessage
} }
@@ -86,7 +84,7 @@ var evaluables = struct {
func (c *metricAggregator) Init(output chan lp.CCMessage) error { func (c *metricAggregator) Init(output chan lp.CCMessage) error {
c.output = output c.output = output
c.functions = make([]*MetricAggregatorIntervalConfig, 0) c.functions = make([]*MetricAggregatorIntervalConfig, 0)
c.constants = make(map[string]any) c.constants = make(map[string]interface{})
// add constants like hostname, numSockets, ... to constants list // add constants like hostname, numSockets, ... to constants list
// Set hostname // Set hostname
@@ -122,8 +120,10 @@ func (c *metricAggregator) Init(output chan lp.CCMessage) error {
} }
func (c *metricAggregator) Eval(starttime time.Time, endtime time.Time, metrics []lp.CCMessage) { func (c *metricAggregator) Eval(starttime time.Time, endtime time.Time, metrics []lp.CCMessage) {
vars := make(map[string]any) vars := make(map[string]interface{})
maps.Copy(vars, c.constants) for k, v := range c.constants {
vars[k] = v
}
vars["starttime"] = starttime vars["starttime"] = starttime
vars["endtime"] = endtime vars["endtime"] = endtime
for _, f := range c.functions { for _, f := range c.functions {
@@ -137,6 +137,7 @@ func (c *metricAggregator) Eval(starttime time.Time, endtime time.Time, metrics
matches := make([]lp.CCMessage, 0) matches := make([]lp.CCMessage, 0)
for _, m := range metrics { for _, m := range metrics {
vars["metric"] = m vars["metric"] = m
//value, err := gval.Evaluate(f.Condition, vars, c.language)
value, err := f.gvalCond.EvalBool(context.Background(), vars) value, err := f.gvalCond.EvalBool(context.Background(), vars)
if err != nil { if err != nil {
cclog.ComponentError("MetricCache", "COLLECT", f.Name, "COND", f.Condition, ":", err.Error()) cclog.ComponentError("MetricCache", "COLLECT", f.Name, "COND", f.Condition, ":", err.Error())
@@ -170,22 +171,22 @@ func (c *metricAggregator) Eval(starttime time.Time, endtime time.Time, metrics
// Check, that only values of one type were collected // Check, that only values of one type were collected
countValueTypes := 0 countValueTypes := 0
if len(valuesFloat64) > 0 { if len(valuesFloat64) > 0 {
countValueTypes++ countValueTypes += 1
} }
if len(valuesFloat32) > 0 { if len(valuesFloat32) > 0 {
countValueTypes++ countValueTypes += 1
} }
if len(valuesInt) > 0 { if len(valuesInt) > 0 {
countValueTypes++ countValueTypes += 1
} }
if len(valuesInt32) > 0 { if len(valuesInt32) > 0 {
countValueTypes++ countValueTypes += 1
} }
if len(valuesInt64) > 0 { if len(valuesInt64) > 0 {
countValueTypes++ countValueTypes += 1
} }
if len(valuesBool) > 0 { if len(valuesBool) > 0 {
countValueTypes++ countValueTypes += 1
} }
if countValueTypes > 1 { if countValueTypes > 1 {
cclog.ComponentError("MetricCache", "Collected values of different types") cclog.ComponentError("MetricCache", "Collected values of different types")
@@ -262,15 +263,15 @@ func (c *metricAggregator) Eval(starttime time.Time, endtime time.Time, metrics
var m lp.CCMessage var m lp.CCMessage
switch t := value.(type) { switch t := value.(type) {
case float64: case float64:
m, err = lp.NewMessage(f.Name, tags, meta, map[string]any{"value": t}, starttime) m, err = lp.NewMessage(f.Name, tags, meta, map[string]interface{}{"value": t}, starttime)
case float32: case float32:
m, err = lp.NewMessage(f.Name, tags, meta, map[string]any{"value": t}, starttime) m, err = lp.NewMessage(f.Name, tags, meta, map[string]interface{}{"value": t}, starttime)
case int: case int:
m, err = lp.NewMessage(f.Name, tags, meta, map[string]any{"value": t}, starttime) m, err = lp.NewMessage(f.Name, tags, meta, map[string]interface{}{"value": t}, starttime)
case int64: case int64:
m, err = lp.NewMessage(f.Name, tags, meta, map[string]any{"value": t}, starttime) m, err = lp.NewMessage(f.Name, tags, meta, map[string]interface{}{"value": t}, starttime)
case string: case string:
m, err = lp.NewMessage(f.Name, tags, meta, map[string]any{"value": t}, starttime) m, err = lp.NewMessage(f.Name, tags, meta, map[string]interface{}{"value": t}, starttime)
default: default:
cclog.ComponentError("MetricCache", "Gval returned invalid type", t, "skipping metric", f.Name) cclog.ComponentError("MetricCache", "Gval returned invalid type", t, "skipping metric", f.Name)
} }
@@ -328,21 +329,18 @@ func (c *metricAggregator) AddAggregation(name, function, condition string, tags
} }
func (c *metricAggregator) DeleteAggregation(name string) error { func (c *metricAggregator) DeleteAggregation(name string) error {
i := slices.IndexFunc( for i, agg := range c.functions {
c.functions, if agg.Name == name {
func(agg *MetricAggregatorIntervalConfig) bool {
return agg.Name == name
})
if i == -1 {
return fmt.Errorf("no aggregation for metric name %s", name)
}
copy(c.functions[i:], c.functions[i+1:]) copy(c.functions[i:], c.functions[i+1:])
c.functions[len(c.functions)-1] = nil c.functions[len(c.functions)-1] = nil
c.functions = c.functions[:len(c.functions)-1] c.functions = c.functions[:len(c.functions)-1]
return nil return nil
}
}
return fmt.Errorf("no aggregation for metric name %s", name)
} }
func (c *metricAggregator) AddConstant(name string, value any) { func (c *metricAggregator) AddConstant(name string, value interface{}) {
c.constants[name] = value c.constants[name] = value
} }
@@ -350,11 +348,11 @@ func (c *metricAggregator) DelConstant(name string) {
delete(c.constants, name) delete(c.constants, name)
} }
func (c *metricAggregator) AddFunction(name string, function func(args ...any) (any, error)) { func (c *metricAggregator) AddFunction(name string, function func(args ...interface{}) (interface{}, error)) {
c.language = gval.NewLanguage(c.language, gval.Function(name, function)) c.language = gval.NewLanguage(c.language, gval.Function(name, function))
} }
func EvalBoolCondition(condition string, params map[string]any) (bool, error) { func EvalBoolCondition(condition string, params map[string]interface{}) (bool, error) {
evaluables.mutex.Lock() evaluables.mutex.Lock()
evaluable, ok := evaluables.mapping[condition] evaluable, ok := evaluables.mapping[condition]
evaluables.mutex.Unlock() evaluables.mutex.Unlock()

View File

@@ -11,10 +11,10 @@ import (
"errors" "errors"
"fmt" "fmt"
"regexp" "regexp"
"slices"
"strconv"
"strings" "strings"
"golang.org/x/exp/slices"
topo "github.com/ClusterCockpit/cc-metric-collector/pkg/ccTopology" topo "github.com/ClusterCockpit/cc-metric-collector/pkg/ccTopology"
) )
@@ -34,7 +34,7 @@ func sumAnyType[T float64 | float32 | int | int32 | int64](values []T) (T, error
} }
// Sum up values // Sum up values
func sumfunc(args any) (any, error) { func sumfunc(args interface{}) (interface{}, error) {
var err error var err error
switch values := args.(type) { switch values := args.(type) {
@@ -63,7 +63,7 @@ func minAnyType[T float64 | float32 | int | int32 | int64](values []T) (T, error
} }
// Get the minimum value // Get the minimum value
func minfunc(args any) (any, error) { func minfunc(args interface{}) (interface{}, error) {
switch values := args.(type) { switch values := args.(type) {
case []float64: case []float64:
return minAnyType(values) return minAnyType(values)
@@ -84,12 +84,12 @@ func avgAnyType[T float64 | float32 | int | int32 | int64](values []T) (float64,
if len(values) == 0 { if len(values) == 0 {
return 0.0, errors.New("average function requires at least one argument") return 0.0, errors.New("average function requires at least one argument")
} }
sum, err := sumAnyType(values) sum, err := sumAnyType[T](values)
return float64(sum) / float64(len(values)), err return float64(sum) / float64(len(values)), err
} }
// Get the average or mean value // Get the average or mean value
func avgfunc(args any) (any, error) { func avgfunc(args interface{}) (interface{}, error) {
switch values := args.(type) { switch values := args.(type) {
case []float64: case []float64:
return avgAnyType(values) return avgAnyType(values)
@@ -114,7 +114,7 @@ func maxAnyType[T float64 | float32 | int | int32 | int64](values []T) (T, error
} }
// Get the maximum value // Get the maximum value
func maxfunc(args any) (any, error) { func maxfunc(args interface{}) (interface{}, error) {
switch values := args.(type) { switch values := args.(type) {
case []float64: case []float64:
return maxAnyType(values) return maxAnyType(values)
@@ -146,7 +146,7 @@ func medianAnyType[T float64 | float32 | int | int32 | int64](values []T) (T, er
} }
// Get the median value // Get the median value
func medianfunc(args any) (any, error) { func medianfunc(args interface{}) (interface{}, error) {
switch values := args.(type) { switch values := args.(type) {
case []float64: case []float64:
return medianAnyType(values) return medianAnyType(values)
@@ -167,9 +167,9 @@ func medianfunc(args any) (any, error) {
* Get number of values in list. Returns always an int * Get number of values in list. Returns always an int
*/ */
func lenfunc(args any) (any, error) { func lenfunc(args interface{}) (interface{}, error) {
var err error = nil var err error = nil
length := 0 var length int = 0
switch values := args.(type) { switch values := args.(type) {
case []float64: case []float64:
length = len(values) length = len(values)
@@ -181,7 +181,13 @@ func lenfunc(args any) (any, error) {
length = len(values) length = len(values)
case []int32: case []int32:
length = len(values) length = len(values)
case float64, float32, int, int64: case float64:
err = errors.New("function 'len' can only be applied on arrays and strings")
case float32:
err = errors.New("function 'len' can only be applied on arrays and strings")
case int:
err = errors.New("function 'len' can only be applied on arrays and strings")
case int64:
err = errors.New("function 'len' can only be applied on arrays and strings") err = errors.New("function 'len' can only be applied on arrays and strings")
case string: case string:
length = len(values) length = len(values)
@@ -191,13 +197,13 @@ func lenfunc(args any) (any, error) {
/* /*
* Check if a values is in a list * Check if a values is in a list
* In contrast to most of the other functions, this one is an infix operator for * In constrast to most of the other functions, this one is an infix operator for
* - substring matching: `"abc" in "abcdef"` -> true * - substring matching: `"abc" in "abcdef"` -> true
* - substring matching with int casting: `3 in "abd3"` -> true * - substring matching with int casting: `3 in "abd3"` -> true
* - search for an int in an int list: `3 in getCpuList()` -> true (if you have more than 4 CPU hardware threads) * - search for an int in an int list: `3 in getCpuList()` -> true (if you have more than 4 CPU hardware threads)
*/ */
func infunc(a any, b any) (any, error) { func infunc(a interface{}, b interface{}) (interface{}, error) {
switch match := a.(type) { switch match := a.(type) {
case string: case string:
switch total := b.(type) { switch total := b.(type) {
@@ -207,9 +213,13 @@ func infunc(a any, b any) (any, error) {
case int: case int:
switch total := b.(type) { switch total := b.(type) {
case []int: case []int:
return slices.Contains(total, match), nil for _, x := range total {
if x == match {
return true, nil
}
}
case string: case string:
smatch := strconv.Itoa(match) smatch := fmt.Sprintf("%d", match)
return strings.Contains(total, smatch), nil return strings.Contains(total, smatch), nil
} }
@@ -223,12 +233,12 @@ func infunc(a any, b any) (any, error) {
* format keys \d = %d, \w = %d, ... Not sure how to fix this * format keys \d = %d, \w = %d, ... Not sure how to fix this
*/ */
func matchfunc(args ...any) (any, error) { func matchfunc(args ...interface{}) (interface{}, error) {
switch match := args[0].(type) { switch match := args[0].(type) {
case string: case string:
switch total := args[1].(type) { switch total := args[1].(type) {
case string: case string:
smatch := strings.ReplaceAll(match, "%", "\\") smatch := strings.Replace(match, "%", "\\", -1)
regex, err := regexp.Compile(smatch) regex, err := regexp.Compile(smatch)
if err != nil { if err != nil {
return false, err return false, err
@@ -245,7 +255,7 @@ func matchfunc(args ...any) (any, error) {
*/ */
// for a given cpuid, it returns the core id // for a given cpuid, it returns the core id
func getCpuCoreFunc(args any) (any, error) { func getCpuCoreFunc(args interface{}) (interface{}, error) {
switch cpuid := args.(type) { switch cpuid := args.(type) {
case int: case int:
return topo.GetHwthreadCore(cpuid), nil return topo.GetHwthreadCore(cpuid), nil
@@ -254,7 +264,7 @@ func getCpuCoreFunc(args any) (any, error) {
} }
// for a given cpuid, it returns the socket id // for a given cpuid, it returns the socket id
func getCpuSocketFunc(args any) (any, error) { func getCpuSocketFunc(args interface{}) (interface{}, error) {
switch cpuid := args.(type) { switch cpuid := args.(type) {
case int: case int:
return topo.GetHwthreadSocket(cpuid), nil return topo.GetHwthreadSocket(cpuid), nil
@@ -263,7 +273,7 @@ func getCpuSocketFunc(args any) (any, error) {
} }
// for a given cpuid, it returns the id of the NUMA node // for a given cpuid, it returns the id of the NUMA node
func getCpuNumaDomainFunc(args any) (any, error) { func getCpuNumaDomainFunc(args interface{}) (interface{}, error) {
switch cpuid := args.(type) { switch cpuid := args.(type) {
case int: case int:
return topo.GetHwthreadNumaDomain(cpuid), nil return topo.GetHwthreadNumaDomain(cpuid), nil
@@ -272,7 +282,7 @@ func getCpuNumaDomainFunc(args any) (any, error) {
} }
// for a given cpuid, it returns the id of the CPU die // for a given cpuid, it returns the id of the CPU die
func getCpuDieFunc(args any) (any, error) { func getCpuDieFunc(args interface{}) (interface{}, error) {
switch cpuid := args.(type) { switch cpuid := args.(type) {
case int: case int:
return topo.GetHwthreadDie(cpuid), nil return topo.GetHwthreadDie(cpuid), nil
@@ -281,7 +291,7 @@ func getCpuDieFunc(args any) (any, error) {
} }
// for a given core id, it returns the list of cpuids // for a given core id, it returns the list of cpuids
func getCpuListOfCoreFunc(args any) (any, error) { func getCpuListOfCoreFunc(args interface{}) (interface{}, error) {
cpulist := make([]int, 0) cpulist := make([]int, 0)
switch in := args.(type) { switch in := args.(type) {
case int: case int:
@@ -295,7 +305,7 @@ func getCpuListOfCoreFunc(args any) (any, error) {
} }
// for a given socket id, it returns the list of cpuids // for a given socket id, it returns the list of cpuids
func getCpuListOfSocketFunc(args any) (any, error) { func getCpuListOfSocketFunc(args interface{}) (interface{}, error) {
cpulist := make([]int, 0) cpulist := make([]int, 0)
switch in := args.(type) { switch in := args.(type) {
case int: case int:
@@ -309,7 +319,7 @@ func getCpuListOfSocketFunc(args any) (any, error) {
} }
// for a given id of a NUMA domain, it returns the list of cpuids // for a given id of a NUMA domain, it returns the list of cpuids
func getCpuListOfNumaDomainFunc(args any) (any, error) { func getCpuListOfNumaDomainFunc(args interface{}) (interface{}, error) {
cpulist := make([]int, 0) cpulist := make([]int, 0)
switch in := args.(type) { switch in := args.(type) {
case int: case int:
@@ -323,7 +333,7 @@ func getCpuListOfNumaDomainFunc(args any) (any, error) {
} }
// for a given CPU die id, it returns the list of cpuids // for a given CPU die id, it returns the list of cpuids
func getCpuListOfDieFunc(args any) (any, error) { func getCpuListOfDieFunc(args interface{}) (interface{}, error) {
cpulist := make([]int, 0) cpulist := make([]int, 0)
switch in := args.(type) { switch in := args.(type) {
case int: case int:
@@ -337,14 +347,14 @@ func getCpuListOfDieFunc(args any) (any, error) {
} }
// wrapper function to get a list of all cpuids of the node // wrapper function to get a list of all cpuids of the node
func getCpuListOfNode() (any, error) { func getCpuListOfNode() (interface{}, error) {
return topo.HwthreadList(), nil return topo.HwthreadList(), nil
} }
// helper function to get the cpuid list for a CCMetric type tag set (type and type-id) // helper function to get the cpuid list for a CCMetric type tag set (type and type-id)
// since there is no access to the metric data in the function, is should be called like // since there is no access to the metric data in the function, is should be called like
// `getCpuListOfType()` // `getCpuListOfType()`
func getCpuListOfType(args ...any) (any, error) { func getCpuListOfType(args ...interface{}) (interface{}, error) {
cpulist := make([]int, 0) cpulist := make([]int, 0)
switch typ := args[0].(type) { switch typ := args[0].(type) {
case string: case string:

View File

@@ -11,9 +11,9 @@ import (
"sync" "sync"
"time" "time"
cclog "github.com/ClusterCockpit/cc-lib/v2/ccLogger" cclog "github.com/ClusterCockpit/cc-lib/ccLogger"
lp "github.com/ClusterCockpit/cc-lib/v2/ccMessage" lp "github.com/ClusterCockpit/cc-lib/ccMessage"
agg "github.com/ClusterCockpit/cc-metric-collector/internal/metricAggregator" agg "github.com/ClusterCockpit/cc-metric-collector/internal/metricAggregator"
mct "github.com/ClusterCockpit/cc-metric-collector/pkg/multiChanTicker" mct "github.com/ClusterCockpit/cc-metric-collector/pkg/multiChanTicker"
) )
@@ -51,7 +51,7 @@ type MetricCache interface {
} }
func (c *metricCache) Init(output chan lp.CCMessage, ticker mct.MultiChanTicker, wg *sync.WaitGroup, numPeriods int) error { func (c *metricCache) Init(output chan lp.CCMessage, ticker mct.MultiChanTicker, wg *sync.WaitGroup, numPeriods int) error {
var err error var err error = nil
c.done = make(chan bool) c.done = make(chan bool)
c.wg = wg c.wg = wg
c.ticker = ticker c.ticker = ticker
@@ -137,12 +137,12 @@ func (c *metricCache) Add(metric lp.CCMessage) {
p := c.intervals[c.curPeriod] p := c.intervals[c.curPeriod]
if p.numMetrics < p.sizeMetrics { if p.numMetrics < p.sizeMetrics {
p.metrics[p.numMetrics] = metric p.metrics[p.numMetrics] = metric
p.numMetrics++ p.numMetrics = p.numMetrics + 1
p.stopstamp = metric.Time() p.stopstamp = metric.Time()
} else { } else {
p.metrics = append(p.metrics, metric) p.metrics = append(p.metrics, metric)
p.numMetrics++ p.numMetrics = p.numMetrics + 1
p.sizeMetrics++ p.sizeMetrics = p.sizeMetrics + 1
p.stopstamp = metric.Time() p.stopstamp = metric.Time()
} }
c.lock.Unlock() c.lock.Unlock()
@@ -161,8 +161,8 @@ func (c *metricCache) DeleteAggregation(name string) error {
// is the current one, index=1 the last interval and so on. Returns and empty array if a wrong index // is the current one, index=1 the last interval and so on. Returns and empty array if a wrong index
// is given (negative index, index larger than configured number of total intervals, ...) // is given (negative index, index larger than configured number of total intervals, ...)
func (c *metricCache) GetPeriod(index int) (time.Time, time.Time, []lp.CCMessage) { func (c *metricCache) GetPeriod(index int) (time.Time, time.Time, []lp.CCMessage) {
start := time.Now() var start time.Time = time.Now()
stop := time.Now() var stop time.Time = time.Now()
var metrics []lp.CCMessage var metrics []lp.CCMessage
if index >= 0 && index < c.numPeriods { if index >= 0 && index < c.numPeriods {
pindex := c.curPeriod - index pindex := c.curPeriod - index

View File

@@ -10,16 +10,15 @@ package metricRouter
import ( import (
"encoding/json" "encoding/json"
"fmt" "fmt"
"maps"
"os" "os"
"strings" "strings"
"sync" "sync"
"time" "time"
cclog "github.com/ClusterCockpit/cc-lib/v2/ccLogger" cclog "github.com/ClusterCockpit/cc-lib/ccLogger"
lp "github.com/ClusterCockpit/cc-lib/v2/ccMessage" lp "github.com/ClusterCockpit/cc-lib/ccMessage"
mp "github.com/ClusterCockpit/cc-lib/v2/messageProcessor" mp "github.com/ClusterCockpit/cc-lib/messageProcessor"
agg "github.com/ClusterCockpit/cc-metric-collector/internal/metricAggregator" agg "github.com/ClusterCockpit/cc-metric-collector/internal/metricAggregator"
mct "github.com/ClusterCockpit/cc-metric-collector/pkg/multiChanTicker" mct "github.com/ClusterCockpit/cc-metric-collector/pkg/multiChanTicker"
) )
@@ -108,8 +107,10 @@ func (r *metricRouter) Init(ticker mct.MultiChanTicker, wg *sync.WaitGroup, rout
cclog.ComponentError("MetricRouter", err.Error()) cclog.ComponentError("MetricRouter", err.Error())
return err return err
} }
r.maxForward = max(1, r.config.MaxForward) r.maxForward = 1
if r.config.MaxForward > r.maxForward {
r.maxForward = r.config.MaxForward
}
if r.config.NumCacheIntervals > 0 { if r.config.NumCacheIntervals > 0 {
r.cache, err = NewCache(r.cache_input, r.ticker, &r.cachewg, r.config.NumCacheIntervals) r.cache, err = NewCache(r.cache_input, r.ticker, &r.cachewg, r.config.NumCacheIntervals)
if err != nil { if err != nil {
@@ -117,80 +118,60 @@ func (r *metricRouter) Init(ticker mct.MultiChanTicker, wg *sync.WaitGroup, rout
return err return err
} }
for _, agg := range r.config.IntervalAgg { for _, agg := range r.config.IntervalAgg {
err = r.cache.AddAggregation(agg.Name, agg.Function, agg.Condition, agg.Tags, agg.Meta) r.cache.AddAggregation(agg.Name, agg.Function, agg.Condition, agg.Tags, agg.Meta)
if err != nil {
return fmt.Errorf("MetricCache AddAggregation() failed: %w", err)
}
} }
} }
p, err := mp.NewMessageProcessor() p, err := mp.NewMessageProcessor()
if err != nil { if err != nil {
return fmt.Errorf("MessageProcessor NewMessageProcessor() failed: %w", err) return fmt.Errorf("initialization of message processor failed: %v", err.Error())
} }
r.mp = p r.mp = p
if len(r.config.MessageProcessor) > 0 { if len(r.config.MessageProcessor) > 0 {
err = r.mp.FromConfigJSON(r.config.MessageProcessor) err = r.mp.FromConfigJSON(r.config.MessageProcessor)
if err != nil { if err != nil {
return fmt.Errorf("MessageProcessor FromConfigJSON() failed: %w", err) return fmt.Errorf("failed parsing JSON for message processor: %v", err.Error())
} }
} }
for _, mname := range r.config.DropMetrics { for _, mname := range r.config.DropMetrics {
err = r.mp.AddDropMessagesByName(mname) r.mp.AddDropMessagesByName(mname)
if err != nil {
return fmt.Errorf("MessageProcessor AddDropMessagesByName() failed: %w", err)
}
} }
for _, cond := range r.config.DropMetricsIf { for _, cond := range r.config.DropMetricsIf {
err = r.mp.AddDropMessagesByCondition(cond) r.mp.AddDropMessagesByCondition(cond)
if err != nil {
return fmt.Errorf("MessageProcessor AddDropMessagesByCondition() failed: %w", err)
}
} }
for _, data := range r.config.AddTags { for _, data := range r.config.AddTags {
cond := data.Condition cond := data.Condition
if cond == "*" { if cond == "*" {
cond = "true" cond = "true"
} }
err = r.mp.AddAddTagsByCondition(cond, data.Key, data.Value) r.mp.AddAddTagsByCondition(cond, data.Key, data.Value)
if err != nil {
return fmt.Errorf("MessageProcessor AddAddTagsByCondition() failed: %w", err)
}
} }
for _, data := range r.config.DelTags { for _, data := range r.config.DelTags {
cond := data.Condition cond := data.Condition
if cond == "*" { if cond == "*" {
cond = "true" cond = "true"
} }
err = r.mp.AddDeleteTagsByCondition(cond, data.Key, data.Value) r.mp.AddDeleteTagsByCondition(cond, data.Key, data.Value)
if err != nil {
return fmt.Errorf("MessageProcessor AddDeleteTagsByCondition() failed: %w", err)
}
} }
for oldname, newname := range r.config.RenameMetrics { for oldname, newname := range r.config.RenameMetrics {
err = r.mp.AddRenameMetricByName(oldname, newname) r.mp.AddRenameMetricByName(oldname, newname)
if err != nil {
return fmt.Errorf("MessageProcessor AddRenameMetricByName() failed: %w", err)
}
} }
for metricName, prefix := range r.config.ChangeUnitPrefix { for metricName, prefix := range r.config.ChangeUnitPrefix {
err = r.mp.AddChangeUnitPrefix(fmt.Sprintf("name == '%s'", metricName), prefix) r.mp.AddChangeUnitPrefix(fmt.Sprintf("name == '%s'", metricName), prefix)
if err != nil {
return fmt.Errorf("MessageProcessor AddChangeUnitPrefix() failed: %w", err)
}
} }
r.mp.SetNormalizeUnits(r.config.NormalizeUnits) r.mp.SetNormalizeUnits(r.config.NormalizeUnits)
err = r.mp.AddAddTagsByCondition("true", r.config.HostnameTagName, r.hostname) r.mp.AddAddTagsByCondition("true", r.config.HostnameTagName, r.hostname)
if err != nil {
return fmt.Errorf("MessageProcessor AddAddTagsByCondition() failed: %w", err)
}
// r.config.dropMetrics = make(map[string]bool)
// for _, mname := range r.config.DropMetrics {
// r.config.dropMetrics[mname] = true
// }
return nil return nil
} }
func getParamMap(point lp.CCMessage) map[string]any { func getParamMap(point lp.CCMessage) map[string]interface{} {
params := make(map[string]any) params := make(map[string]interface{})
params["metric"] = point params["metric"] = point
params["name"] = point.Name() params["name"] = point.Name()
for key, value := range point.Tags() { for key, value := range point.Tags() {
@@ -199,12 +180,14 @@ func getParamMap(point lp.CCMessage) map[string]any {
for key, value := range point.Meta() { for key, value := range point.Meta() {
params[key] = value params[key] = value
} }
maps.Copy(params, point.Fields()) for key, value := range point.Fields() {
params[key] = value
}
params["timestamp"] = point.Time() params["timestamp"] = point.Time()
return params return params
} }
// DoAddTags adds a tag when condition is fulfilled // DoAddTags adds a tag when condition is fullfiled
func (r *metricRouter) DoAddTags(point lp.CCMessage) { func (r *metricRouter) DoAddTags(point lp.CCMessage) {
var conditionMatches bool var conditionMatches bool
for _, m := range r.config.AddTags { for _, m := range r.config.AddTags {
@@ -226,6 +209,83 @@ func (r *metricRouter) DoAddTags(point lp.CCMessage) {
} }
} }
// DoDelTags removes a tag when condition is fullfiled
// func (r *metricRouter) DoDelTags(point lp.CCMessage) {
// var conditionMatches bool
// for _, m := range r.config.DelTags {
// if m.Condition == "*" {
// // Condition is always matched
// conditionMatches = true
// } else {
// // Evaluate condition
// var err error
// conditionMatches, err = agg.EvalBoolCondition(m.Condition, getParamMap(point))
// if err != nil {
// cclog.ComponentError("MetricRouter", err.Error())
// conditionMatches = false
// }
// }
// if conditionMatches {
// point.RemoveTag(m.Key)
// }
// }
// }
// Conditional test whether a metric should be dropped
// func (r *metricRouter) dropMetric(point lp.CCMessage) bool {
// // Simple drop check
// if conditionMatches, ok := r.config.dropMetrics[point.Name()]; ok {
// return conditionMatches
// }
// // Checking the dropping conditions
// for _, m := range r.config.DropMetricsIf {
// conditionMatches, err := agg.EvalBoolCondition(m, getParamMap(point))
// if err != nil {
// cclog.ComponentError("MetricRouter", err.Error())
// conditionMatches = false
// }
// if conditionMatches {
// return conditionMatches
// }
// }
// // No dropping condition met
// return false
// }
// func (r *metricRouter) prepareUnit(point lp.CCMessage) bool {
// if r.config.NormalizeUnits {
// if in_unit, ok := point.GetMeta("unit"); ok {
// u := units.NewUnit(in_unit)
// if u.Valid() {
// point.AddMeta("unit", u.Short())
// }
// }
// }
// if newP, ok := r.config.ChangeUnitPrefix[point.Name()]; ok {
// newPrefix := units.NewPrefix(newP)
// if in_unit, ok := point.GetMeta("unit"); ok && newPrefix != units.InvalidPrefix {
// u := units.NewUnit(in_unit)
// if u.Valid() {
// cclog.ComponentDebug("MetricRouter", "Change prefix to", newP, "for metric", point.Name())
// conv, out_unit := units.GetUnitPrefixFactor(u, newPrefix)
// if conv != nil && out_unit.Valid() {
// if val, ok := point.GetField("value"); ok {
// point.AddField("value", conv(val))
// point.AddMeta("unit", out_unit.Short())
// }
// }
// }
// }
// }
// return true
// }
// Start starts the metric router // Start starts the metric router
func (r *metricRouter) Start() { func (r *metricRouter) Start() {
// start timer if configured // start timer if configured
@@ -241,7 +301,28 @@ func (r *metricRouter) Start() {
cclog.ComponentDebug("MetricRouter", "DONE") cclog.ComponentDebug("MetricRouter", "DONE")
} }
// Forward message received from collector channel // Forward takes a received metric, adds or deletes tags
// and forwards it to the output channels
// forward := func(point lp.CCMessage) {
// cclog.ComponentDebug("MetricRouter", "FORWARD", point)
// r.DoAddTags(point)
// r.DoDelTags(point)
// name := point.Name()
// if new, ok := r.config.RenameMetrics[name]; ok {
// point.SetName(new)
// point.AddMeta("oldname", name)
// r.DoAddTags(point)
// r.DoDelTags(point)
// }
// r.prepareUnit(point)
// for _, o := range r.outputs {
// o <- point
// }
// }
// Foward message received from collector channel
coll_forward := func(p lp.CCMessage) { coll_forward := func(p lp.CCMessage) {
// receive from metric collector // receive from metric collector
//p.AddTag(r.config.HostnameTagName, r.hostname) //p.AddTag(r.config.HostnameTagName, r.hostname)
@@ -254,6 +335,11 @@ func (r *metricRouter) Start() {
o <- m o <- m
} }
} }
// if !r.dropMetric(p) {
// for _, o := range r.outputs {
// o <- point
// }
// }
// even if the metric is dropped, it is stored in the cache for // even if the metric is dropped, it is stored in the cache for
// aggregations // aggregations
if r.config.NumCacheIntervals > 0 { if r.config.NumCacheIntervals > 0 {
@@ -273,6 +359,9 @@ func (r *metricRouter) Start() {
o <- m o <- m
} }
} }
// if !r.dropMetric(p) {
// forward(p)
// }
} }
// Forward message received from cache channel // Forward message received from cache channel

View File

@@ -13,11 +13,11 @@ import (
"os" "os"
"path/filepath" "path/filepath"
"regexp" "regexp"
"slices"
"strconv" "strconv"
"strings" "strings"
cclogger "github.com/ClusterCockpit/cc-lib/v2/ccLogger" cclogger "github.com/ClusterCockpit/cc-lib/ccLogger"
"golang.org/x/exp/slices"
) )
const SYSFS_CPUBASE = `/sys/devices/system/cpu` const SYSFS_CPUBASE = `/sys/devices/system/cpu`
@@ -51,13 +51,14 @@ var cache struct {
func fileToInt(path string) int { func fileToInt(path string) int {
buffer, err := os.ReadFile(path) buffer, err := os.ReadFile(path)
if err != nil { if err != nil {
cclogger.ComponentError("ccTopology", fmt.Sprintf("fileToInt(): Reading \"%s\": %v", path, err)) log.Print(err)
cclogger.ComponentError("ccTopology", "fileToInt", "Reading", path, ":", err.Error())
return -1 return -1
} }
stringBuffer := strings.TrimSpace(string(buffer)) stringBuffer := strings.TrimSpace(string(buffer))
id, err := strconv.Atoi(stringBuffer) id, err := strconv.Atoi(stringBuffer)
if err != nil { if err != nil {
cclogger.ComponentError("ccTopology", fmt.Sprintf("fileToInt(): Parsing \"%s\": %v", stringBuffer, err)) cclogger.ComponentError("ccTopology", "fileToInt", "Parsing", path, ":", stringBuffer, err.Error())
return -1 return -1
} }
return id return id
@@ -79,7 +80,7 @@ func fileToList(path string) []int {
// Create list // Create list
list := make([]int, 0) list := make([]int, 0)
stringBuffer := strings.TrimSpace(string(buffer)) stringBuffer := strings.TrimSpace(string(buffer))
for valueRangeString := range strings.SplitSeq(stringBuffer, ",") { for _, valueRangeString := range strings.Split(stringBuffer, ",") {
valueRange := strings.Split(valueRangeString, "-") valueRange := strings.Split(valueRangeString, "-")
switch len(valueRange) { switch len(valueRange) {
case 1: case 1:
@@ -303,19 +304,20 @@ func GetTypeList(topology_type string) []int {
} }
func GetTypeId(hwt HwthreadEntry, topology_type string) (int, error) { func GetTypeId(hwt HwthreadEntry, topology_type string) (int, error) {
var err error = nil
switch topology_type { switch topology_type {
case "node": case "node":
return 0, nil return 0, err
case "socket": case "socket":
return hwt.Socket, nil return hwt.Socket, err
case "die": case "die":
return hwt.Die, nil return hwt.Die, err
case "memoryDomain": case "memoryDomain":
return hwt.NumaDomain, nil return hwt.NumaDomain, err
case "core": case "core":
return hwt.Core, nil return hwt.Core, err
case "hwthread": case "hwthread":
return hwt.CpuID, nil return hwt.CpuID, err
} }
return -1, fmt.Errorf("unknown topology type '%s'", topology_type) return -1, fmt.Errorf("unknown topology type '%s'", topology_type)
} }

View File

@@ -10,7 +10,7 @@ package multiChanTicker
import ( import (
"time" "time"
cclog "github.com/ClusterCockpit/cc-lib/v2/ccLogger" cclog "github.com/ClusterCockpit/cc-lib/ccLogger"
) )
type multiChanTicker struct { type multiChanTicker struct {
@@ -21,7 +21,7 @@ type multiChanTicker struct {
type MultiChanTicker interface { type MultiChanTicker interface {
Init(duration time.Duration) Init(duration time.Duration)
AddChannel(channel chan time.Time) AddChannel(chan time.Time)
Close() Close()
} }

View File

@@ -30,11 +30,11 @@ make
%install %install
install -Dpm 0750 %{name} %{buildroot}%{_bindir}/%{name} install -Dpm 0750 %{name} %{buildroot}%{_bindir}/%{name}
install -Dpm 0600 example-configs/config.json %{buildroot}%{_sysconfdir}/%{name}/%{name}.json install -Dpm 0600 config.json %{buildroot}%{_sysconfdir}/%{name}/%{name}.json
install -Dpm 0600 example-configs/collectors.json %{buildroot}%{_sysconfdir}/%{name}/collectors.json install -Dpm 0600 collectors.json %{buildroot}%{_sysconfdir}/%{name}/collectors.json
install -Dpm 0600 example-configs/sinks.json %{buildroot}%{_sysconfdir}/%{name}/sinks.json install -Dpm 0600 sinks.json %{buildroot}%{_sysconfdir}/%{name}/sinks.json
install -Dpm 0600 example-configs/receivers.json %{buildroot}%{_sysconfdir}/%{name}/receivers.json install -Dpm 0600 receivers.json %{buildroot}%{_sysconfdir}/%{name}/receivers.json
install -Dpm 0600 example-configs/router.json %{buildroot}%{_sysconfdir}/%{name}/router.json install -Dpm 0600 router.json %{buildroot}%{_sysconfdir}/%{name}/router.json
install -Dpm 0644 scripts/%{name}.service %{buildroot}%{_unitdir}/%{name}.service install -Dpm 0644 scripts/%{name}.service %{buildroot}%{_unitdir}/%{name}.service
install -Dpm 0600 scripts/%{name}.config %{buildroot}%{_sysconfdir}/default/%{name} install -Dpm 0600 scripts/%{name}.config %{buildroot}%{_sysconfdir}/default/%{name}
install -Dpm 0644 scripts/%{name}.sysusers %{buildroot}%{_sysusersdir}/%{name}.conf install -Dpm 0644 scripts/%{name}.sysusers %{buildroot}%{_sysusersdir}/%{name}.conf