mirror of
https://github.com/ClusterCockpit/cc-metric-collector.git
synced 2026-04-18 13:57:31 +02:00
Compare commits
26 Commits
main
...
nfsio_remo
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
4c05557941 | ||
|
|
02623f8c9d | ||
|
|
e080a82b55 | ||
|
|
b5520efc25 | ||
|
|
d2b1bad1b8 | ||
|
|
5762afa40b | ||
|
|
0e57c8db1c | ||
|
|
f2f38c81af | ||
|
|
f9acc51a50 | ||
|
|
87346e2eae | ||
|
|
0f92f10b66 | ||
|
|
6901b06e44 | ||
|
|
7b343d0bab | ||
|
|
7d3180b526 | ||
|
|
70a6afc549 | ||
|
|
e02a018327 | ||
|
|
bcecdd033b | ||
|
|
2645ffeff3 | ||
|
|
e968aa1991 | ||
|
|
d2a38e3844 | ||
|
|
1f35f6d3ca | ||
|
|
7e6870c7b3 | ||
|
|
d881093524 | ||
|
|
c01096c157 | ||
|
|
3d70c8afc9 | ||
|
|
7ee85a07dc |
11
.github/dependabot.yml
vendored
11
.github/dependabot.yml
vendored
@@ -1,11 +0,0 @@
|
|||||||
# To get started with Dependabot version updates, you'll need to specify which
|
|
||||||
# package ecosystems to update and where the package manifests are located.
|
|
||||||
# Please see the documentation for all configuration options:
|
|
||||||
# https://docs.github.com/code-security/dependabot/dependabot-version-updates/configuration-options-for-the-dependabot.yml-file
|
|
||||||
|
|
||||||
version: 2
|
|
||||||
updates:
|
|
||||||
- package-ecosystem: "gomod"
|
|
||||||
directory: "/"
|
|
||||||
schedule:
|
|
||||||
interval: "weekly"
|
|
||||||
127
.github/workflows/Release.yml
vendored
127
.github/workflows/Release.yml
vendored
@@ -5,10 +5,10 @@ name: Release
|
|||||||
|
|
||||||
# Run on tag push
|
# Run on tag push
|
||||||
on:
|
on:
|
||||||
push:
|
push:
|
||||||
tags:
|
tags:
|
||||||
- '**'
|
- '**'
|
||||||
workflow_dispatch:
|
workflow_dispatch:
|
||||||
|
|
||||||
jobs:
|
jobs:
|
||||||
|
|
||||||
@@ -36,14 +36,22 @@ jobs:
|
|||||||
# fetch-depth must be 0 to use git describe
|
# fetch-depth must be 0 to use git describe
|
||||||
# See: https://github.com/marketplace/actions/checkout
|
# See: https://github.com/marketplace/actions/checkout
|
||||||
- name: Checkout
|
- name: Checkout
|
||||||
uses: actions/checkout@v6
|
uses: actions/checkout@v4
|
||||||
with:
|
with:
|
||||||
submodules: recursive
|
submodules: recursive
|
||||||
fetch-depth: 0
|
fetch-depth: 0
|
||||||
|
|
||||||
|
# - name: Setup Golang
|
||||||
|
# uses: actions/setup-go@v5
|
||||||
|
# with:
|
||||||
|
# go-version: 'stable'
|
||||||
- name: Setup Golang
|
- name: Setup Golang
|
||||||
run: |
|
run: |
|
||||||
dnf --assumeyes --disableplugin=subscription-manager --enablerepo appstream install go-toolset
|
dnf --assumeyes --disableplugin=subscription-manager install \
|
||||||
|
https://repo.almalinux.org/almalinux/8/AppStream/x86_64/os/Packages/go-toolset-1.22.9-1.module_el8.10.0+3938+8c723e16.x86_64.rpm \
|
||||||
|
https://repo.almalinux.org/almalinux/8/AppStream/x86_64/os/Packages/golang-1.22.9-1.module_el8.10.0+3938+8c723e16.x86_64.rpm \
|
||||||
|
https://repo.almalinux.org/almalinux/8/AppStream/x86_64/os/Packages/golang-bin-1.22.9-1.module_el8.10.0+3938+8c723e16.x86_64.rpm \
|
||||||
|
https://repo.almalinux.org/almalinux/8/AppStream/x86_64/os/Packages/golang-src-1.22.9-1.module_el8.10.0+3938+8c723e16.noarch.rpm
|
||||||
|
|
||||||
- name: RPM build MetricCollector
|
- name: RPM build MetricCollector
|
||||||
id: rpmbuild
|
id: rpmbuild
|
||||||
@@ -70,13 +78,13 @@ jobs:
|
|||||||
|
|
||||||
# See: https://github.com/actions/upload-artifact
|
# See: https://github.com/actions/upload-artifact
|
||||||
- name: Save RPM as artifact
|
- name: Save RPM as artifact
|
||||||
uses: actions/upload-artifact@v6
|
uses: actions/upload-artifact@v4
|
||||||
with:
|
with:
|
||||||
name: cc-metric-collector RPM for AlmaLinux 8
|
name: cc-metric-collector RPM for AlmaLinux 8
|
||||||
path: ${{ steps.rpmrename.outputs.RPM }}
|
path: ${{ steps.rpmrename.outputs.RPM }}
|
||||||
overwrite: true
|
overwrite: true
|
||||||
- name: Save SRPM as artifact
|
- name: Save SRPM as artifact
|
||||||
uses: actions/upload-artifact@v6
|
uses: actions/upload-artifact@v4
|
||||||
with:
|
with:
|
||||||
name: cc-metric-collector SRPM for AlmaLinux 8
|
name: cc-metric-collector SRPM for AlmaLinux 8
|
||||||
path: ${{ steps.rpmrename.outputs.SRPM }}
|
path: ${{ steps.rpmrename.outputs.SRPM }}
|
||||||
@@ -106,14 +114,23 @@ jobs:
|
|||||||
# fetch-depth must be 0 to use git describe
|
# fetch-depth must be 0 to use git describe
|
||||||
# See: https://github.com/marketplace/actions/checkout
|
# See: https://github.com/marketplace/actions/checkout
|
||||||
- name: Checkout
|
- name: Checkout
|
||||||
uses: actions/checkout@v6
|
uses: actions/checkout@v4
|
||||||
with:
|
with:
|
||||||
submodules: recursive
|
submodules: recursive
|
||||||
fetch-depth: 0
|
fetch-depth: 0
|
||||||
|
|
||||||
|
# - name: Setup Golang
|
||||||
|
# uses: actions/setup-go@v5
|
||||||
|
# with:
|
||||||
|
# go-version: 'stable'
|
||||||
- name: Setup Golang
|
- name: Setup Golang
|
||||||
run: |
|
run: |
|
||||||
dnf --assumeyes --disableplugin=subscription-manager --enablerepo appstream install go-toolset
|
dnf --assumeyes --disableplugin=subscription-manager install \
|
||||||
|
https://repo.almalinux.org/almalinux/9/AppStream/x86_64/os/Packages/go-toolset-1.22.7-2.el9_5.x86_64.rpm \
|
||||||
|
https://repo.almalinux.org/almalinux/9/AppStream/x86_64/os/Packages/golang-1.22.7-2.el9_5.x86_64.rpm \
|
||||||
|
https://repo.almalinux.org/almalinux/9/AppStream/x86_64/os/Packages/golang-bin-1.22.7-2.el9_5.x86_64.rpm \
|
||||||
|
https://repo.almalinux.org/almalinux/9/AppStream/x86_64/os/Packages/golang-src-1.22.7-2.el9_5.noarch.rpm \
|
||||||
|
https://repo.almalinux.org/almalinux/9/AppStream/x86_64/os/Packages/golang-race-1.22.7-2.el9_5.x86_64.rpm
|
||||||
|
|
||||||
- name: RPM build MetricCollector
|
- name: RPM build MetricCollector
|
||||||
id: rpmbuild
|
id: rpmbuild
|
||||||
@@ -140,26 +157,25 @@ jobs:
|
|||||||
|
|
||||||
# See: https://github.com/actions/upload-artifact
|
# See: https://github.com/actions/upload-artifact
|
||||||
- name: Save RPM as artifact
|
- name: Save RPM as artifact
|
||||||
uses: actions/upload-artifact@v6
|
uses: actions/upload-artifact@v4
|
||||||
with:
|
with:
|
||||||
name: cc-metric-collector RPM for AlmaLinux 9
|
name: cc-metric-collector RPM for AlmaLinux 9
|
||||||
path: ${{ steps.rpmrename.outputs.RPM }}
|
path: ${{ steps.rpmrename.outputs.RPM }}
|
||||||
overwrite: true
|
overwrite: true
|
||||||
- name: Save SRPM as artifact
|
- name: Save SRPM as artifact
|
||||||
uses: actions/upload-artifact@v6
|
uses: actions/upload-artifact@v4
|
||||||
with:
|
with:
|
||||||
name: cc-metric-collector SRPM for AlmaLinux 9
|
name: cc-metric-collector SRPM for AlmaLinux 9
|
||||||
path: ${{ steps.rpmrename.outputs.SRPM }}
|
path: ${{ steps.rpmrename.outputs.SRPM }}
|
||||||
overwrite: true
|
overwrite: true
|
||||||
|
|
||||||
#
|
#
|
||||||
# Build on Red Hat Universal Base Image (UBI 8) using go-toolset
|
# Build on UBI 8 using go-toolset
|
||||||
#
|
#
|
||||||
UBI-8-RPM-build:
|
UBI-8-RPM-build:
|
||||||
runs-on: ubuntu-latest
|
runs-on: ubuntu-latest
|
||||||
# See: https://catalog.redhat.com/en/search?searchType=Containers&q=Red+Hat+Universal+Base+Image+8
|
# See: https://catalog.redhat.com/software/containers/ubi8/ubi/5c35984d70cc534b3a3784e?container-tabs=gti
|
||||||
# https://hub.docker.com/r/redhat/ubi8
|
container: registry.access.redhat.com/ubi8/ubi:8.8-1032.1692772289
|
||||||
container: redhat/ubi8
|
|
||||||
# The job outputs link to the outputs of the 'rpmbuild' step
|
# The job outputs link to the outputs of the 'rpmbuild' step
|
||||||
outputs:
|
outputs:
|
||||||
rpm : ${{steps.rpmbuild.outputs.RPM}}
|
rpm : ${{steps.rpmbuild.outputs.RPM}}
|
||||||
@@ -174,14 +190,22 @@ jobs:
|
|||||||
# fetch-depth must be 0 to use git describe
|
# fetch-depth must be 0 to use git describe
|
||||||
# See: https://github.com/marketplace/actions/checkout
|
# See: https://github.com/marketplace/actions/checkout
|
||||||
- name: Checkout
|
- name: Checkout
|
||||||
uses: actions/checkout@v6
|
uses: actions/checkout@v4
|
||||||
with:
|
with:
|
||||||
submodules: recursive
|
submodules: recursive
|
||||||
fetch-depth: 0
|
fetch-depth: 0
|
||||||
|
|
||||||
|
# - name: Setup Golang
|
||||||
|
# uses: actions/setup-go@v5
|
||||||
|
# with:
|
||||||
|
# go-version: 'stable'
|
||||||
- name: Setup Golang
|
- name: Setup Golang
|
||||||
run: |
|
run: |
|
||||||
dnf --assumeyes --disableplugin=subscription-manager --enablerepo ubi-8-appstream-rpms install go-toolset
|
dnf --assumeyes --disableplugin=subscription-manager install \
|
||||||
|
https://repo.almalinux.org/almalinux/8/AppStream/x86_64/os/Packages/go-toolset-1.22.9-1.module_el8.10.0+3938+8c723e16.x86_64.rpm \
|
||||||
|
https://repo.almalinux.org/almalinux/8/AppStream/x86_64/os/Packages/golang-1.22.9-1.module_el8.10.0+3938+8c723e16.x86_64.rpm \
|
||||||
|
https://repo.almalinux.org/almalinux/8/AppStream/x86_64/os/Packages/golang-bin-1.22.9-1.module_el8.10.0+3938+8c723e16.x86_64.rpm \
|
||||||
|
https://repo.almalinux.org/almalinux/8/AppStream/x86_64/os/Packages/golang-src-1.22.9-1.module_el8.10.0+3938+8c723e16.noarch.rpm
|
||||||
|
|
||||||
- name: RPM build MetricCollector
|
- name: RPM build MetricCollector
|
||||||
id: rpmbuild
|
id: rpmbuild
|
||||||
@@ -191,25 +215,24 @@ jobs:
|
|||||||
|
|
||||||
# See: https://github.com/actions/upload-artifact
|
# See: https://github.com/actions/upload-artifact
|
||||||
- name: Save RPM as artifact
|
- name: Save RPM as artifact
|
||||||
uses: actions/upload-artifact@v6
|
uses: actions/upload-artifact@v4
|
||||||
with:
|
with:
|
||||||
name: cc-metric-collector RPM for UBI 8
|
name: cc-metric-collector RPM for UBI 8
|
||||||
path: ${{ steps.rpmbuild.outputs.RPM }}
|
path: ${{ steps.rpmbuild.outputs.RPM }}
|
||||||
overwrite: true
|
overwrite: true
|
||||||
- name: Save SRPM as artifact
|
- name: Save SRPM as artifact
|
||||||
uses: actions/upload-artifact@v6
|
uses: actions/upload-artifact@v4
|
||||||
with:
|
with:
|
||||||
name: cc-metric-collector SRPM for UBI 8
|
name: cc-metric-collector SRPM for UBI 8
|
||||||
path: ${{ steps.rpmbuild.outputs.SRPM }}
|
path: ${{ steps.rpmbuild.outputs.SRPM }}
|
||||||
overwrite: true
|
overwrite: true
|
||||||
|
|
||||||
#
|
#
|
||||||
# Build on Red Hat Universal Base Image (UBI 9) using go-toolset
|
# Build on UBI 9 using go-toolset
|
||||||
#
|
#
|
||||||
UBI-9-RPM-build:
|
UBI-9-RPM-build:
|
||||||
runs-on: ubuntu-latest
|
runs-on: ubuntu-latest
|
||||||
# See: https://catalog.redhat.com/en/search?searchType=Containers&q=Red+Hat+Universal+Base+Image+9
|
# See: https://catalog.redhat.com/software/containers/ubi8/ubi/5c359854d70cc534b3a3784e?container-tabs=gti
|
||||||
# https://hub.docker.com/r/redhat/ubi9
|
|
||||||
container: redhat/ubi9
|
container: redhat/ubi9
|
||||||
# The job outputs link to the outputs of the 'rpmbuild' step
|
# The job outputs link to the outputs of the 'rpmbuild' step
|
||||||
# The job outputs link to the outputs of the 'rpmbuild' step
|
# The job outputs link to the outputs of the 'rpmbuild' step
|
||||||
@@ -226,14 +249,24 @@ jobs:
|
|||||||
# fetch-depth must be 0 to use git describe
|
# fetch-depth must be 0 to use git describe
|
||||||
# See: https://github.com/marketplace/actions/checkout
|
# See: https://github.com/marketplace/actions/checkout
|
||||||
- name: Checkout
|
- name: Checkout
|
||||||
uses: actions/checkout@v6
|
uses: actions/checkout@v4
|
||||||
with:
|
with:
|
||||||
submodules: recursive
|
submodules: recursive
|
||||||
fetch-depth: 0
|
fetch-depth: 0
|
||||||
|
|
||||||
|
# See: https://github.com/marketplace/actions/setup-go-environment
|
||||||
|
# - name: Setup Golang
|
||||||
|
# uses: actions/setup-go@v5
|
||||||
|
# with:
|
||||||
|
# go-version: 'stable'
|
||||||
- name: Setup Golang
|
- name: Setup Golang
|
||||||
run: |
|
run: |
|
||||||
dnf --assumeyes --disableplugin=subscription-manager --enablerepo ubi-9-appstream-rpms install go-toolset
|
dnf --assumeyes --disableplugin=subscription-manager install \
|
||||||
|
https://repo.almalinux.org/almalinux/9/AppStream/x86_64/os/Packages/go-toolset-1.22.7-2.el9_5.x86_64.rpm \
|
||||||
|
https://repo.almalinux.org/almalinux/9/AppStream/x86_64/os/Packages/golang-1.22.7-2.el9_5.x86_64.rpm \
|
||||||
|
https://repo.almalinux.org/almalinux/9/AppStream/x86_64/os/Packages/golang-bin-1.22.7-2.el9_5.x86_64.rpm \
|
||||||
|
https://repo.almalinux.org/almalinux/9/AppStream/x86_64/os/Packages/golang-src-1.22.7-2.el9_5.noarch.rpm \
|
||||||
|
https://repo.almalinux.org/almalinux/9/AppStream/x86_64/os/Packages/golang-race-1.22.7-2.el9_5.x86_64.rpm
|
||||||
|
|
||||||
- name: RPM build MetricCollector
|
- name: RPM build MetricCollector
|
||||||
id: rpmbuild
|
id: rpmbuild
|
||||||
@@ -243,13 +276,13 @@ jobs:
|
|||||||
|
|
||||||
# See: https://github.com/actions/upload-artifact
|
# See: https://github.com/actions/upload-artifact
|
||||||
- name: Save RPM as artifact
|
- name: Save RPM as artifact
|
||||||
uses: actions/upload-artifact@v6
|
uses: actions/upload-artifact@v4
|
||||||
with:
|
with:
|
||||||
name: cc-metric-collector RPM for UBI 9
|
name: cc-metric-collector RPM for UBI 9
|
||||||
path: ${{ steps.rpmbuild.outputs.RPM }}
|
path: ${{ steps.rpmbuild.outputs.RPM }}
|
||||||
overwrite: true
|
overwrite: true
|
||||||
- name: Save SRPM as artifact
|
- name: Save SRPM as artifact
|
||||||
uses: actions/upload-artifact@v6
|
uses: actions/upload-artifact@v4
|
||||||
with:
|
with:
|
||||||
name: cc-metric-collector SRPM for UBI 9
|
name: cc-metric-collector SRPM for UBI 9
|
||||||
path: ${{ steps.rpmbuild.outputs.SRPM }}
|
path: ${{ steps.rpmbuild.outputs.SRPM }}
|
||||||
@@ -275,14 +308,13 @@ jobs:
|
|||||||
# fetch-depth must be 0 to use git describe
|
# fetch-depth must be 0 to use git describe
|
||||||
# See: https://github.com/marketplace/actions/checkout
|
# See: https://github.com/marketplace/actions/checkout
|
||||||
- name: Checkout
|
- name: Checkout
|
||||||
uses: actions/checkout@v6
|
uses: actions/checkout@v4
|
||||||
with:
|
with:
|
||||||
submodules: recursive
|
submodules: recursive
|
||||||
fetch-depth: 0
|
fetch-depth: 0
|
||||||
# Use official golang package
|
|
||||||
# See: https://github.com/marketplace/actions/setup-go-environment
|
|
||||||
- name: Setup Golang
|
- name: Setup Golang
|
||||||
uses: actions/setup-go@v6
|
uses: actions/setup-go@v5
|
||||||
with:
|
with:
|
||||||
go-version: 'stable'
|
go-version: 'stable'
|
||||||
|
|
||||||
@@ -300,13 +332,13 @@ jobs:
|
|||||||
echo "DEB=${NEW_DEB_FILE}" >> $GITHUB_OUTPUT
|
echo "DEB=${NEW_DEB_FILE}" >> $GITHUB_OUTPUT
|
||||||
# See: https://github.com/actions/upload-artifact
|
# See: https://github.com/actions/upload-artifact
|
||||||
- name: Save DEB as artifact
|
- name: Save DEB as artifact
|
||||||
uses: actions/upload-artifact@v6
|
uses: actions/upload-artifact@v4
|
||||||
with:
|
with:
|
||||||
name: cc-metric-collector DEB for Ubuntu 22.04
|
name: cc-metric-collector DEB for Ubuntu 22.04
|
||||||
path: ${{ steps.debrename.outputs.DEB }}
|
path: ${{ steps.debrename.outputs.DEB }}
|
||||||
overwrite: true
|
overwrite: true
|
||||||
|
|
||||||
#
|
#
|
||||||
# Build on Ubuntu 24.04 using official go package
|
# Build on Ubuntu 24.04 using official go package
|
||||||
#
|
#
|
||||||
Ubuntu-noblenumbat-build:
|
Ubuntu-noblenumbat-build:
|
||||||
@@ -326,14 +358,13 @@ jobs:
|
|||||||
# fetch-depth must be 0 to use git describe
|
# fetch-depth must be 0 to use git describe
|
||||||
# See: https://github.com/marketplace/actions/checkout
|
# See: https://github.com/marketplace/actions/checkout
|
||||||
- name: Checkout
|
- name: Checkout
|
||||||
uses: actions/checkout@v6
|
uses: actions/checkout@v4
|
||||||
with:
|
with:
|
||||||
submodules: recursive
|
submodules: recursive
|
||||||
fetch-depth: 0
|
fetch-depth: 0
|
||||||
# Use official golang package
|
|
||||||
# See: https://github.com/marketplace/actions/setup-go-environment
|
|
||||||
- name: Setup Golang
|
- name: Setup Golang
|
||||||
uses: actions/setup-go@v6
|
uses: actions/setup-go@v5
|
||||||
with:
|
with:
|
||||||
go-version: 'stable'
|
go-version: 'stable'
|
||||||
|
|
||||||
@@ -351,7 +382,7 @@ jobs:
|
|||||||
echo "DEB=${NEW_DEB_FILE}" >> $GITHUB_OUTPUT
|
echo "DEB=${NEW_DEB_FILE}" >> $GITHUB_OUTPUT
|
||||||
# See: https://github.com/actions/upload-artifact
|
# See: https://github.com/actions/upload-artifact
|
||||||
- name: Save DEB as artifact
|
- name: Save DEB as artifact
|
||||||
uses: actions/upload-artifact@v6
|
uses: actions/upload-artifact@v4
|
||||||
with:
|
with:
|
||||||
name: cc-metric-collector DEB for Ubuntu 24.04
|
name: cc-metric-collector DEB for Ubuntu 24.04
|
||||||
path: ${{ steps.debrename.outputs.DEB }}
|
path: ${{ steps.debrename.outputs.DEB }}
|
||||||
@@ -369,48 +400,48 @@ jobs:
|
|||||||
steps:
|
steps:
|
||||||
# See: https://github.com/actions/download-artifact
|
# See: https://github.com/actions/download-artifact
|
||||||
- name: Download AlmaLinux 8 RPM
|
- name: Download AlmaLinux 8 RPM
|
||||||
uses: actions/download-artifact@v7
|
uses: actions/download-artifact@v4
|
||||||
with:
|
with:
|
||||||
name: cc-metric-collector RPM for AlmaLinux 8
|
name: cc-metric-collector RPM for AlmaLinux 8
|
||||||
- name: Download AlmaLinux 8 SRPM
|
- name: Download AlmaLinux 8 SRPM
|
||||||
uses: actions/download-artifact@v7
|
uses: actions/download-artifact@v4
|
||||||
with:
|
with:
|
||||||
name: cc-metric-collector SRPM for AlmaLinux 8
|
name: cc-metric-collector SRPM for AlmaLinux 8
|
||||||
|
|
||||||
- name: Download AlmaLinux 9 RPM
|
- name: Download AlmaLinux 9 RPM
|
||||||
uses: actions/download-artifact@v7
|
uses: actions/download-artifact@v4
|
||||||
with:
|
with:
|
||||||
name: cc-metric-collector RPM for AlmaLinux 9
|
name: cc-metric-collector RPM for AlmaLinux 9
|
||||||
- name: Download AlmaLinux 9 SRPM
|
- name: Download AlmaLinux 9 SRPM
|
||||||
uses: actions/download-artifact@v7
|
uses: actions/download-artifact@v4
|
||||||
with:
|
with:
|
||||||
name: cc-metric-collector SRPM for AlmaLinux 9
|
name: cc-metric-collector SRPM for AlmaLinux 9
|
||||||
|
|
||||||
- name: Download UBI 8 RPM
|
- name: Download UBI 8 RPM
|
||||||
uses: actions/download-artifact@v7
|
uses: actions/download-artifact@v4
|
||||||
with:
|
with:
|
||||||
name: cc-metric-collector RPM for UBI 8
|
name: cc-metric-collector RPM for UBI 8
|
||||||
- name: Download UBI 8 SRPM
|
- name: Download UBI 8 SRPM
|
||||||
uses: actions/download-artifact@v7
|
uses: actions/download-artifact@v4
|
||||||
with:
|
with:
|
||||||
name: cc-metric-collector SRPM for UBI 8
|
name: cc-metric-collector SRPM for UBI 8
|
||||||
|
|
||||||
- name: Download UBI 9 RPM
|
- name: Download UBI 9 RPM
|
||||||
uses: actions/download-artifact@v7
|
uses: actions/download-artifact@v4
|
||||||
with:
|
with:
|
||||||
name: cc-metric-collector RPM for UBI 9
|
name: cc-metric-collector RPM for UBI 9
|
||||||
- name: Download UBI 9 SRPM
|
- name: Download UBI 9 SRPM
|
||||||
uses: actions/download-artifact@v7
|
uses: actions/download-artifact@v4
|
||||||
with:
|
with:
|
||||||
name: cc-metric-collector SRPM for UBI 9
|
name: cc-metric-collector SRPM for UBI 9
|
||||||
|
|
||||||
- name: Download Ubuntu 22.04 DEB
|
- name: Download Ubuntu 22.04 DEB
|
||||||
uses: actions/download-artifact@v7
|
uses: actions/download-artifact@v4
|
||||||
with:
|
with:
|
||||||
name: cc-metric-collector DEB for Ubuntu 22.04
|
name: cc-metric-collector DEB for Ubuntu 22.04
|
||||||
|
|
||||||
- name: Download Ubuntu 24.04 DEB
|
- name: Download Ubuntu 24.04 DEB
|
||||||
uses: actions/download-artifact@v7
|
uses: actions/download-artifact@v4
|
||||||
with:
|
with:
|
||||||
name: cc-metric-collector DEB for Ubuntu 24.04
|
name: cc-metric-collector DEB for Ubuntu 24.04
|
||||||
|
|
||||||
|
|||||||
167
.github/workflows/runonce.yml
vendored
167
.github/workflows/runonce.yml
vendored
@@ -20,41 +20,25 @@ jobs:
|
|||||||
# See: https://github.com/marketplace/actions/checkout
|
# See: https://github.com/marketplace/actions/checkout
|
||||||
# Checkout git repository and submodules
|
# Checkout git repository and submodules
|
||||||
- name: Checkout
|
- name: Checkout
|
||||||
uses: actions/checkout@v6
|
uses: actions/checkout@v4
|
||||||
with:
|
with:
|
||||||
submodules: recursive
|
submodules: recursive
|
||||||
|
|
||||||
# See: https://github.com/marketplace/actions/setup-go-environment
|
# See: https://github.com/marketplace/actions/setup-go-environment
|
||||||
- name: Setup Golang
|
- name: Setup Golang
|
||||||
uses: actions/setup-go@v6
|
uses: actions/setup-go@v5
|
||||||
with:
|
with:
|
||||||
go-version: 'stable'
|
go-version: '1.21'
|
||||||
check-latest: true
|
check-latest: true
|
||||||
|
|
||||||
- name: Install reviewdog
|
|
||||||
run: |
|
|
||||||
go install github.com/reviewdog/reviewdog/cmd/reviewdog@latest
|
|
||||||
|
|
||||||
# See: https://golangci-lint.run
|
|
||||||
- name: Install GolangCI-Lint
|
|
||||||
run: |
|
|
||||||
go install github.com/golangci/golangci-lint/v2/cmd/golangci-lint@latest
|
|
||||||
|
|
||||||
- name: Build MetricCollector
|
- name: Build MetricCollector
|
||||||
run: make
|
run: make
|
||||||
|
|
||||||
- name: Run MetricCollector once
|
- name: Run MetricCollector once
|
||||||
run: ./cc-metric-collector --once --config .github/ci-config.json
|
run: ./cc-metric-collector --once --config .github/ci-config.json
|
||||||
|
|
||||||
# Running the linter requires likwid.h, which gets downloaded in the build step
|
|
||||||
- name: Static Analysis with GolangCI-Lint and Upload Report with reviewdog
|
|
||||||
run: |
|
|
||||||
golangci-lint run --enable errorlint,govet,misspell,modernize,prealloc,staticcheck,unconvert,wastedassign | reviewdog -f=golangci-lint -name "Check golangci-lint on build-latest" -reporter=github-check -filter-mode=nofilter -fail-level none
|
|
||||||
env:
|
|
||||||
REVIEWDOG_GITHUB_API_TOKEN: ${{ secrets.GITHUB_TOKEN }}
|
|
||||||
|
|
||||||
#
|
#
|
||||||
# Build on AlmaLinux 8 using go-toolset
|
# Build on AlmaLinux 8
|
||||||
#
|
#
|
||||||
AlmaLinux8-RPM-build:
|
AlmaLinux8-RPM-build:
|
||||||
runs-on: ubuntu-latest
|
runs-on: ubuntu-latest
|
||||||
@@ -74,14 +58,23 @@ jobs:
|
|||||||
# fetch-depth must be 0 to use git describe
|
# fetch-depth must be 0 to use git describe
|
||||||
# See: https://github.com/marketplace/actions/checkout
|
# See: https://github.com/marketplace/actions/checkout
|
||||||
- name: Checkout
|
- name: Checkout
|
||||||
uses: actions/checkout@v6
|
uses: actions/checkout@v4
|
||||||
with:
|
with:
|
||||||
submodules: recursive
|
submodules: recursive
|
||||||
fetch-depth: 0
|
fetch-depth: 0
|
||||||
|
|
||||||
|
# See: https://github.com/marketplace/actions/setup-go-environment
|
||||||
|
# - name: Setup Golang
|
||||||
|
# uses: actions/setup-go@v5
|
||||||
|
# with:
|
||||||
|
# go-version: 'stable'
|
||||||
- name: Setup Golang
|
- name: Setup Golang
|
||||||
run: |
|
run: |
|
||||||
dnf --assumeyes --disableplugin=subscription-manager --enablerepo appstream install go-toolset
|
dnf --assumeyes --disableplugin=subscription-manager install \
|
||||||
|
https://repo.almalinux.org/almalinux/8/AppStream/x86_64/os/Packages/go-toolset-1.22.9-1.module_el8.10.0+3938+8c723e16.x86_64.rpm \
|
||||||
|
https://repo.almalinux.org/almalinux/8/AppStream/x86_64/os/Packages/golang-1.22.9-1.module_el8.10.0+3938+8c723e16.x86_64.rpm \
|
||||||
|
https://repo.almalinux.org/almalinux/8/AppStream/x86_64/os/Packages/golang-bin-1.22.9-1.module_el8.10.0+3938+8c723e16.x86_64.rpm \
|
||||||
|
https://repo.almalinux.org/almalinux/8/AppStream/x86_64/os/Packages/golang-src-1.22.9-1.module_el8.10.0+3938+8c723e16.noarch.rpm
|
||||||
|
|
||||||
- name: RPM build MetricCollector
|
- name: RPM build MetricCollector
|
||||||
id: rpmbuild
|
id: rpmbuild
|
||||||
@@ -90,7 +83,7 @@ jobs:
|
|||||||
make RPM
|
make RPM
|
||||||
|
|
||||||
#
|
#
|
||||||
# Build on AlmaLinux 9 using go-toolset
|
# Build on AlmaLinux 9
|
||||||
#
|
#
|
||||||
AlmaLinux9-RPM-build:
|
AlmaLinux9-RPM-build:
|
||||||
runs-on: ubuntu-latest
|
runs-on: ubuntu-latest
|
||||||
@@ -110,14 +103,24 @@ jobs:
|
|||||||
# fetch-depth must be 0 to use git describe
|
# fetch-depth must be 0 to use git describe
|
||||||
# See: https://github.com/marketplace/actions/checkout
|
# See: https://github.com/marketplace/actions/checkout
|
||||||
- name: Checkout
|
- name: Checkout
|
||||||
uses: actions/checkout@v6
|
uses: actions/checkout@v4
|
||||||
with:
|
with:
|
||||||
submodules: recursive
|
submodules: recursive
|
||||||
fetch-depth: 0
|
fetch-depth: 0
|
||||||
|
|
||||||
|
# See: https://github.com/marketplace/actions/setup-go-environment
|
||||||
|
# - name: Setup Golang
|
||||||
|
# uses: actions/setup-go@v5
|
||||||
|
# with:
|
||||||
|
# go-version: 'stable'
|
||||||
- name: Setup Golang
|
- name: Setup Golang
|
||||||
run: |
|
run: |
|
||||||
dnf --assumeyes --disableplugin=subscription-manager --enablerepo appstream install go-toolset
|
dnf --assumeyes --disableplugin=subscription-manager install \
|
||||||
|
https://repo.almalinux.org/almalinux/9/AppStream/x86_64/os/Packages/go-toolset-1.22.7-2.el9_5.x86_64.rpm \
|
||||||
|
https://repo.almalinux.org/almalinux/9/AppStream/x86_64/os/Packages/golang-1.22.7-2.el9_5.x86_64.rpm \
|
||||||
|
https://repo.almalinux.org/almalinux/9/AppStream/x86_64/os/Packages/golang-bin-1.22.7-2.el9_5.x86_64.rpm \
|
||||||
|
https://repo.almalinux.org/almalinux/9/AppStream/x86_64/os/Packages/golang-src-1.22.7-2.el9_5.noarch.rpm \
|
||||||
|
https://repo.almalinux.org/almalinux/9/AppStream/x86_64/os/Packages/golang-race-1.22.7-2.el9_5.x86_64.rpm
|
||||||
|
|
||||||
- name: RPM build MetricCollector
|
- name: RPM build MetricCollector
|
||||||
id: rpmbuild
|
id: rpmbuild
|
||||||
@@ -125,49 +128,13 @@ jobs:
|
|||||||
git config --global --add safe.directory /__w/cc-metric-collector/cc-metric-collector
|
git config --global --add safe.directory /__w/cc-metric-collector/cc-metric-collector
|
||||||
make RPM
|
make RPM
|
||||||
|
|
||||||
#
|
|
||||||
# Build on AlmaLinux 10 using go-toolset
|
|
||||||
#
|
|
||||||
AlmaLinux10-RPM-build:
|
|
||||||
runs-on: ubuntu-latest
|
|
||||||
# See: https://hub.docker.com/_/almalinux
|
|
||||||
container: almalinux:10
|
|
||||||
# The job outputs link to the outputs of the 'rpmrename' step
|
|
||||||
# Only job outputs can be used in child jobs
|
|
||||||
steps:
|
|
||||||
|
|
||||||
# Use dnf to install development packages
|
|
||||||
- name: Install development packages
|
|
||||||
run: |
|
|
||||||
dnf --assumeyes group install "Development Tools" "RPM Development Tools"
|
|
||||||
dnf --assumeyes install wget openssl-devel diffutils delve which
|
|
||||||
|
|
||||||
# Checkout git repository and submodules
|
|
||||||
# fetch-depth must be 0 to use git describe
|
|
||||||
# See: https://github.com/marketplace/actions/checkout
|
|
||||||
- name: Checkout
|
|
||||||
uses: actions/checkout@v6
|
|
||||||
with:
|
|
||||||
submodules: recursive
|
|
||||||
fetch-depth: 0
|
|
||||||
|
|
||||||
- name: Setup Golang
|
|
||||||
run: |
|
|
||||||
dnf --assumeyes --disableplugin=subscription-manager --enablerepo appstream install go-toolset
|
|
||||||
|
|
||||||
- name: RPM build MetricCollector
|
|
||||||
id: rpmbuild
|
|
||||||
run: |
|
|
||||||
git config --global --add safe.directory /__w/cc-metric-collector/cc-metric-collector
|
|
||||||
make RPM
|
|
||||||
|
|
||||||
#
|
#
|
||||||
# Build on Red Hat Universal Base Image (UBI 8) using go-toolset
|
# Build on UBI 8 using go-toolset
|
||||||
#
|
#
|
||||||
UBI-8-RPM-build:
|
UBI-8-RPM-build:
|
||||||
runs-on: ubuntu-latest
|
runs-on: ubuntu-latest
|
||||||
# See: https://catalog.redhat.com/en/search?searchType=Containers&q=Red+Hat+Universal+Base+Image+8
|
# See: https://catalog.redhat.com/software/containers/ubi8/ubi/5c359854d70cc534b3a3784e?container-tabs=gti
|
||||||
# https://hub.docker.com/r/redhat/ubi8
|
|
||||||
container: redhat/ubi8
|
container: redhat/ubi8
|
||||||
# The job outputs link to the outputs of the 'rpmbuild' step
|
# The job outputs link to the outputs of the 'rpmbuild' step
|
||||||
steps:
|
steps:
|
||||||
@@ -180,14 +147,23 @@ jobs:
|
|||||||
# fetch-depth must be 0 to use git describe
|
# fetch-depth must be 0 to use git describe
|
||||||
# See: https://github.com/marketplace/actions/checkout
|
# See: https://github.com/marketplace/actions/checkout
|
||||||
- name: Checkout
|
- name: Checkout
|
||||||
uses: actions/checkout@v6
|
uses: actions/checkout@v4
|
||||||
with:
|
with:
|
||||||
submodules: recursive
|
submodules: recursive
|
||||||
fetch-depth: 0
|
fetch-depth: 0
|
||||||
|
|
||||||
|
# See: https://github.com/marketplace/actions/setup-go-environment
|
||||||
|
# - name: Setup Golang
|
||||||
|
# uses: actions/setup-go@v5
|
||||||
|
# with:
|
||||||
|
# go-version: 'stable'
|
||||||
- name: Setup Golang
|
- name: Setup Golang
|
||||||
run: |
|
run: |
|
||||||
dnf --assumeyes --disableplugin=subscription-manager --enablerepo ubi-8-appstream-rpms install go-toolset
|
dnf --assumeyes --disableplugin=subscription-manager install \
|
||||||
|
https://repo.almalinux.org/almalinux/8/AppStream/x86_64/os/Packages/go-toolset-1.22.9-1.module_el8.10.0+3938+8c723e16.x86_64.rpm \
|
||||||
|
https://repo.almalinux.org/almalinux/8/AppStream/x86_64/os/Packages/golang-1.22.9-1.module_el8.10.0+3938+8c723e16.x86_64.rpm \
|
||||||
|
https://repo.almalinux.org/almalinux/8/AppStream/x86_64/os/Packages/golang-bin-1.22.9-1.module_el8.10.0+3938+8c723e16.x86_64.rpm \
|
||||||
|
https://repo.almalinux.org/almalinux/8/AppStream/x86_64/os/Packages/golang-src-1.22.9-1.module_el8.10.0+3938+8c723e16.noarch.rpm
|
||||||
|
|
||||||
- name: RPM build MetricCollector
|
- name: RPM build MetricCollector
|
||||||
id: rpmbuild
|
id: rpmbuild
|
||||||
@@ -196,12 +172,11 @@ jobs:
|
|||||||
make RPM
|
make RPM
|
||||||
|
|
||||||
#
|
#
|
||||||
# Build on Red Hat Universal Base Image (UBI 9) using go-toolset
|
# Build on UBI 9 using go-toolset
|
||||||
#
|
#
|
||||||
UBI-9-RPM-build:
|
UBI-9-RPM-build:
|
||||||
runs-on: ubuntu-latest
|
runs-on: ubuntu-latest
|
||||||
# See: https://catalog.redhat.com/en/search?searchType=Containers&q=Red+Hat+Universal+Base+Image+9
|
# See: https://catalog.redhat.com/software/containers/ubi8/ubi/5c359854d70cc534b3a3784e?container-tabs=gti
|
||||||
# https://hub.docker.com/r/redhat/ubi9
|
|
||||||
container: redhat/ubi9
|
container: redhat/ubi9
|
||||||
# The job outputs link to the outputs of the 'rpmbuild' step
|
# The job outputs link to the outputs of the 'rpmbuild' step
|
||||||
steps:
|
steps:
|
||||||
@@ -214,48 +189,24 @@ jobs:
|
|||||||
# fetch-depth must be 0 to use git describe
|
# fetch-depth must be 0 to use git describe
|
||||||
# See: https://github.com/marketplace/actions/checkout
|
# See: https://github.com/marketplace/actions/checkout
|
||||||
- name: Checkout
|
- name: Checkout
|
||||||
uses: actions/checkout@v6
|
uses: actions/checkout@v4
|
||||||
with:
|
with:
|
||||||
submodules: recursive
|
submodules: recursive
|
||||||
fetch-depth: 0
|
fetch-depth: 0
|
||||||
|
|
||||||
|
# See: https://github.com/marketplace/actions/setup-go-environment
|
||||||
|
# - name: Setup Golang
|
||||||
|
# uses: actions/setup-go@v5
|
||||||
|
# with:
|
||||||
|
# go-version: 'stable'
|
||||||
- name: Setup Golang
|
- name: Setup Golang
|
||||||
run: |
|
run: |
|
||||||
dnf --assumeyes --disableplugin=subscription-manager --enablerepo ubi-9-appstream-rpms install go-toolset
|
dnf --assumeyes --disableplugin=subscription-manager install \
|
||||||
|
https://repo.almalinux.org/almalinux/9/AppStream/x86_64/os/Packages/go-toolset-1.22.7-2.el9_5.x86_64.rpm \
|
||||||
- name: RPM build MetricCollector
|
https://repo.almalinux.org/almalinux/9/AppStream/x86_64/os/Packages/golang-1.22.7-2.el9_5.x86_64.rpm \
|
||||||
id: rpmbuild
|
https://repo.almalinux.org/almalinux/9/AppStream/x86_64/os/Packages/golang-bin-1.22.7-2.el9_5.x86_64.rpm \
|
||||||
run: |
|
https://repo.almalinux.org/almalinux/9/AppStream/x86_64/os/Packages/golang-src-1.22.7-2.el9_5.noarch.rpm \
|
||||||
git config --global --add safe.directory /__w/cc-metric-collector/cc-metric-collector
|
https://repo.almalinux.org/almalinux/9/AppStream/x86_64/os/Packages/golang-race-1.22.7-2.el9_5.x86_64.rpm
|
||||||
make RPM
|
|
||||||
|
|
||||||
#
|
|
||||||
# Build on Red Hat Universal Base Image (UBI 10) using go-toolset
|
|
||||||
#
|
|
||||||
UBI-10-RPM-build:
|
|
||||||
runs-on: ubuntu-latest
|
|
||||||
# See: https://catalog.redhat.com/en/search?searchType=Containers&q=Red+Hat+Universal+Base+Image+10
|
|
||||||
# https://hub.docker.com/r/redhat/ubi10
|
|
||||||
container: redhat/ubi10
|
|
||||||
# The job outputs link to the outputs of the 'rpmbuild' step
|
|
||||||
steps:
|
|
||||||
|
|
||||||
# Use dnf to install development packages
|
|
||||||
- name: Install development packages
|
|
||||||
run: dnf --assumeyes --disableplugin=subscription-manager install rpm-build go-srpm-macros gcc make python3 git wget openssl-devel diffutils delve
|
|
||||||
|
|
||||||
# Checkout git repository and submodules
|
|
||||||
# fetch-depth must be 0 to use git describe
|
|
||||||
# See: https://github.com/marketplace/actions/checkout
|
|
||||||
- name: Checkout
|
|
||||||
uses: actions/checkout@v6
|
|
||||||
with:
|
|
||||||
submodules: recursive
|
|
||||||
fetch-depth: 0
|
|
||||||
|
|
||||||
- name: Setup Golang
|
|
||||||
run: |
|
|
||||||
dnf --assumeyes --disableplugin=subscription-manager --enablerepo ubi-10-for-x86_64-appstream-rpms install go-toolset
|
|
||||||
|
|
||||||
- name: RPM build MetricCollector
|
- name: RPM build MetricCollector
|
||||||
id: rpmbuild
|
id: rpmbuild
|
||||||
@@ -280,14 +231,14 @@ jobs:
|
|||||||
# fetch-depth must be 0 to use git describe
|
# fetch-depth must be 0 to use git describe
|
||||||
# See: https://github.com/marketplace/actions/checkout
|
# See: https://github.com/marketplace/actions/checkout
|
||||||
- name: Checkout
|
- name: Checkout
|
||||||
uses: actions/checkout@v6
|
uses: actions/checkout@v4
|
||||||
with:
|
with:
|
||||||
submodules: recursive
|
submodules: recursive
|
||||||
fetch-depth: 0
|
fetch-depth: 0
|
||||||
# Use official golang package
|
# Use official golang package
|
||||||
# See: https://github.com/marketplace/actions/setup-go-environment
|
# See: https://github.com/marketplace/actions/setup-go-environment
|
||||||
- name: Setup Golang
|
- name: Setup Golang
|
||||||
uses: actions/setup-go@v6
|
uses: actions/setup-go@v5
|
||||||
with:
|
with:
|
||||||
go-version: 'stable'
|
go-version: 'stable'
|
||||||
|
|
||||||
@@ -314,14 +265,14 @@ jobs:
|
|||||||
# fetch-depth must be 0 to use git describe
|
# fetch-depth must be 0 to use git describe
|
||||||
# See: https://github.com/marketplace/actions/checkout
|
# See: https://github.com/marketplace/actions/checkout
|
||||||
- name: Checkout
|
- name: Checkout
|
||||||
uses: actions/checkout@v6
|
uses: actions/checkout@v4
|
||||||
with:
|
with:
|
||||||
submodules: recursive
|
submodules: recursive
|
||||||
fetch-depth: 0
|
fetch-depth: 0
|
||||||
# Use official golang package
|
# Use official golang package
|
||||||
# See: https://github.com/marketplace/actions/setup-go-environment
|
# See: https://github.com/marketplace/actions/setup-go-environment
|
||||||
- name: Setup Golang
|
- name: Setup Golang
|
||||||
uses: actions/setup-go@v6
|
uses: actions/setup-go@v5
|
||||||
with:
|
with:
|
||||||
go-version: 'stable'
|
go-version: 'stable'
|
||||||
|
|
||||||
|
|||||||
4
.gitignore
vendored
4
.gitignore
vendored
@@ -1,5 +1,4 @@
|
|||||||
# Binaries for programs and plugins
|
# Binaries for programs and plugins
|
||||||
/cc-metric-collector
|
|
||||||
*.exe
|
*.exe
|
||||||
*.exe~
|
*.exe~
|
||||||
*.dll
|
*.dll
|
||||||
@@ -14,6 +13,3 @@
|
|||||||
|
|
||||||
# Dependency directories (remove the comment below to include it)
|
# Dependency directories (remove the comment below to include it)
|
||||||
# vendor/
|
# vendor/
|
||||||
|
|
||||||
# Local copy of LIKWID headers
|
|
||||||
/collectors/likwid
|
|
||||||
|
|||||||
30
Makefile
30
Makefile
@@ -27,17 +27,6 @@ $(APP): $(GOSRC) go.mod
|
|||||||
$(GOBIN) get
|
$(GOBIN) get
|
||||||
$(GOBIN) build -o $(APP) $(GOSRC_APP)
|
$(GOBIN) build -o $(APP) $(GOSRC_APP)
|
||||||
|
|
||||||
# -ldflags:
|
|
||||||
# -s : drops the OS symbol table
|
|
||||||
# -w : drops DWARF
|
|
||||||
# -> Panic stack traces still show function names and file:line
|
|
||||||
.PHONY: build-stripped
|
|
||||||
build-stripped:
|
|
||||||
make -C collectors
|
|
||||||
$(GOBIN) get
|
|
||||||
$(GOBIN) build -ldflags "-s -w" -trimpath -o $(APP) $(GOSRC_APP)
|
|
||||||
|
|
||||||
.PHONY: install
|
|
||||||
install: $(APP)
|
install: $(APP)
|
||||||
@WORKSPACE=$(PREFIX)
|
@WORKSPACE=$(PREFIX)
|
||||||
@if [ -z "$${WORKSPACE}" ]; then exit 1; fi
|
@if [ -z "$${WORKSPACE}" ]; then exit 1; fi
|
||||||
@@ -69,26 +58,12 @@ fmt:
|
|||||||
$(GOBIN) fmt $(GOSRC_APP)
|
$(GOBIN) fmt $(GOSRC_APP)
|
||||||
@for F in $(GOSRC_INTERNAL); do $(GOBIN) fmt $$F; done
|
@for F in $(GOSRC_INTERNAL); do $(GOBIN) fmt $$F; done
|
||||||
|
|
||||||
# gofumpt <https://github.com/mvdan/gofumpt>:
|
|
||||||
# Enforce a stricter format than gofmt
|
|
||||||
.PHONY: gofumpt
|
|
||||||
gofumpt:
|
|
||||||
$(GOBIN) install mvdan.cc/gofumpt@latest
|
|
||||||
gofumpt -w $(GOSRC_COLLECTORS)
|
|
||||||
gofumpt -w $(GOSRC_SINKS)
|
|
||||||
gofumpt -w $(GOSRC_RECEIVERS)
|
|
||||||
gofumpt -w $(GOSRC_APP)
|
|
||||||
@for F in $(GOSRC_INTERNAL); do gofumpt -w $$F; done
|
|
||||||
|
|
||||||
|
|
||||||
# Examine Go source code and reports suspicious constructs
|
# Examine Go source code and reports suspicious constructs
|
||||||
.PHONY: vet
|
.PHONY: vet
|
||||||
vet:
|
vet:
|
||||||
$(GOBIN) vet ./...
|
$(GOBIN) vet ./...
|
||||||
|
|
||||||
.PHONY: modernize
|
|
||||||
modernize:
|
|
||||||
$(GOBIN) run golang.org/x/tools/go/analysis/passes/modernize/cmd/modernize@latest ./...
|
|
||||||
|
|
||||||
# Run linter for the Go programming language.
|
# Run linter for the Go programming language.
|
||||||
# Using static analysis, it finds bugs and performance issues, offers simplifications, and enforces style rules
|
# Using static analysis, it finds bugs and performance issues, offers simplifications, and enforces style rules
|
||||||
@@ -97,11 +72,6 @@ staticcheck:
|
|||||||
$(GOBIN) install honnef.co/go/tools/cmd/staticcheck@latest
|
$(GOBIN) install honnef.co/go/tools/cmd/staticcheck@latest
|
||||||
$$($(GOBIN) env GOPATH)/bin/staticcheck ./...
|
$$($(GOBIN) env GOPATH)/bin/staticcheck ./...
|
||||||
|
|
||||||
.PHONY: golangci-lint
|
|
||||||
golangci-lint:
|
|
||||||
$(GOBIN) install github.com/golangci/golangci-lint/v2/cmd/golangci-lint@latest
|
|
||||||
$$($(GOBIN) env GOPATH)/bin/golangci-lint run --enable errorlint,govet,misspell,modernize,prealloc,staticcheck,unconvert,wastedassign
|
|
||||||
|
|
||||||
.ONESHELL:
|
.ONESHELL:
|
||||||
.PHONY: RPM
|
.PHONY: RPM
|
||||||
RPM: scripts/cc-metric-collector.spec
|
RPM: scripts/cc-metric-collector.spec
|
||||||
|
|||||||
47
README.md
47
README.md
@@ -1,19 +1,12 @@
|
|||||||
<!--
|
|
||||||
---
|
|
||||||
title: cc-metric-collector
|
|
||||||
description: Metric collecting node agent
|
|
||||||
categories: [cc-metric-collector]
|
|
||||||
tags: ['Admin']
|
|
||||||
weight: 2
|
|
||||||
hugo_path: docs/reference/cc-metric-collector/_index.md
|
|
||||||
---
|
|
||||||
-->
|
|
||||||
|
|
||||||
# cc-metric-collector
|
# cc-metric-collector
|
||||||
|
|
||||||
A node agent for measuring, processing and forwarding node level metrics. It is part of the [ClusterCockpit ecosystem](https://clustercockpit.org/docs/overview/).
|
A node agent for measuring, processing and forwarding node level metrics. It is part of the [ClusterCockpit ecosystem](./docs/introduction.md).
|
||||||
|
|
||||||
The `cc-metric-collector` sends (and maybe receives) metrics in the [InfluxDB line protocol](https://docs.influxdata.com/influxdb/cloud/reference/syntax/line-protocol/) as it provides flexibility while providing a separation between tags (like index columns in relational databases) and fields (like data columns). The `cc-metric-collector` consists of 4 components: collectors, router, sinks and receivers. The collectors read data from the current system and submit metrics to the router. The router can be configured to manipulate the metrics before forwarding them to the sinks. The receivers are also attached to the router like the collectors but they receive data from external source like other `cc-metric-collector` instances.
|
The metric collector sends (and receives) metric in the [InfluxDB line protocol](https://docs.influxdata.com/influxdb/cloud/reference/syntax/line-protocol/) as it provides flexibility while providing a separation between tags (like index columns in relational databases) and fields (like data columns).
|
||||||
|
|
||||||
|
There is a single timer loop that triggers all collectors serially, collects the collectors' data and sends the metrics to the sink. This is done as all data is submitted with a single time stamp. The sinks currently use mostly blocking APIs.
|
||||||
|
|
||||||
|
The receiver runs as a go routine side-by-side with the timer loop and asynchronously forwards received metrics to the sink.
|
||||||
|
|
||||||
|
|
||||||
[](https://doi.org/10.5281/zenodo.7438287)
|
[](https://doi.org/10.5281/zenodo.7438287)
|
||||||
@@ -28,24 +21,22 @@ There is a main configuration file with basic settings that point to the other c
|
|||||||
|
|
||||||
``` json
|
``` json
|
||||||
{
|
{
|
||||||
"sinks-file": "sinks.json",
|
"sinks": "sinks.json",
|
||||||
"collectors-file" : "collectors.json",
|
"collectors" : "collectors.json",
|
||||||
"receivers-file" : "receivers.json",
|
"receivers" : "receivers.json",
|
||||||
"router-file" : "router.json",
|
"router" : "router.json",
|
||||||
"main": {
|
"interval": "10s",
|
||||||
"interval": "10s",
|
"duration": "1s"
|
||||||
"duration": "1s"
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
```
|
```
|
||||||
|
|
||||||
The `interval` defines how often the metrics should be read and send to the sink(s). The `duration` tells the collectors how long one measurement has to take. This is important for some collectors, like the `likwid` collector. For more information, see [here](./docs/configuration.md).
|
The `interval` defines how often the metrics should be read and send to the sink. The `duration` tells collectors how long one measurement has to take. This is important for some collectors, like the `likwid` collector. For more information, see [here](./docs/configuration.md).
|
||||||
|
|
||||||
See the component READMEs for their configuration:
|
See the component READMEs for their configuration:
|
||||||
|
|
||||||
* [`collectors`](./collectors/README.md)
|
* [`collectors`](./collectors/README.md)
|
||||||
* [`sinks`](https://github.com/ClusterCockpit/cc-lib/blob/main/sinks/README.md)
|
* [`sinks`](./sinks/README.md)
|
||||||
* [`receivers`](https://github.com/ClusterCockpit/cc-lib/blob/main/receivers/README.md)
|
* [`receivers`](./receivers/README.md)
|
||||||
* [`router`](./internal/metricRouter/README.md)
|
* [`router`](./internal/metricRouter/README.md)
|
||||||
|
|
||||||
# Installation
|
# Installation
|
||||||
@@ -53,7 +44,7 @@ See the component READMEs for their configuration:
|
|||||||
```
|
```
|
||||||
$ git clone git@github.com:ClusterCockpit/cc-metric-collector.git
|
$ git clone git@github.com:ClusterCockpit/cc-metric-collector.git
|
||||||
$ make (downloads LIKWID, builds it as static library with 'direct' accessmode and copies all required files for the collector)
|
$ make (downloads LIKWID, builds it as static library with 'direct' accessmode and copies all required files for the collector)
|
||||||
$ go get
|
$ go get (requires at least golang 1.16)
|
||||||
$ make
|
$ make
|
||||||
```
|
```
|
||||||
|
|
||||||
@@ -63,13 +54,11 @@ For more information, see [here](./docs/building.md).
|
|||||||
|
|
||||||
```
|
```
|
||||||
$ ./cc-metric-collector --help
|
$ ./cc-metric-collector --help
|
||||||
Usage of ./cc-metric-collector:
|
Usage of metric-collector:
|
||||||
-config string
|
-config string
|
||||||
Path to configuration file (default "./config.json")
|
Path to configuration file (default "./config.json")
|
||||||
-log string
|
-log string
|
||||||
Path for logfile (default "stderr")
|
Path for logfile (default "stderr")
|
||||||
-loglevel string
|
|
||||||
Set log level (default "info")
|
|
||||||
-once
|
-once
|
||||||
Run all collectors only once
|
Run all collectors only once
|
||||||
```
|
```
|
||||||
@@ -112,7 +101,7 @@ flowchart TD
|
|||||||
|
|
||||||
# Contributing
|
# Contributing
|
||||||
|
|
||||||
The ClusterCockpit ecosystem is designed to be used by different HPC computing centers. Since configurations and setups differ between the centers, the centers likely have to put some work into `cc-metric-collector` to gather all desired metrics.
|
The ClusterCockpit ecosystem is designed to be used by different HPC computing centers. Since configurations and setups differ between the centers, the centers likely have to put some work into the cc-metric-collector to gather all desired metrics.
|
||||||
|
|
||||||
You are free to open an issue to request a collector but we would also be happy about PRs.
|
You are free to open an issue to request a collector but we would also be happy about PRs.
|
||||||
|
|
||||||
|
|||||||
@@ -1,29 +1,23 @@
|
|||||||
// Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
|
|
||||||
// All rights reserved. This file is part of cc-lib.
|
|
||||||
// Use of this source code is governed by a MIT-style
|
|
||||||
// license that can be found in the LICENSE file.
|
|
||||||
// additional authors:
|
|
||||||
// Holger Obermaier (NHR@KIT)
|
|
||||||
|
|
||||||
package main
|
package main
|
||||||
|
|
||||||
import (
|
import (
|
||||||
"bytes"
|
|
||||||
"encoding/json"
|
"encoding/json"
|
||||||
"flag"
|
"flag"
|
||||||
"os"
|
"os"
|
||||||
"os/signal"
|
"os/signal"
|
||||||
"sync"
|
|
||||||
"syscall"
|
"syscall"
|
||||||
"time"
|
|
||||||
|
|
||||||
"github.com/ClusterCockpit/cc-lib/v2/receivers"
|
"github.com/ClusterCockpit/cc-lib/receivers"
|
||||||
"github.com/ClusterCockpit/cc-lib/v2/sinks"
|
"github.com/ClusterCockpit/cc-lib/sinks"
|
||||||
"github.com/ClusterCockpit/cc-metric-collector/collectors"
|
"github.com/ClusterCockpit/cc-metric-collector/collectors"
|
||||||
|
|
||||||
ccconf "github.com/ClusterCockpit/cc-lib/v2/ccConfig"
|
// "strings"
|
||||||
cclog "github.com/ClusterCockpit/cc-lib/v2/ccLogger"
|
"sync"
|
||||||
lp "github.com/ClusterCockpit/cc-lib/v2/ccMessage"
|
"time"
|
||||||
|
|
||||||
|
ccconf "github.com/ClusterCockpit/cc-lib/ccConfig"
|
||||||
|
cclog "github.com/ClusterCockpit/cc-lib/ccLogger"
|
||||||
|
lp "github.com/ClusterCockpit/cc-lib/ccMessage"
|
||||||
mr "github.com/ClusterCockpit/cc-metric-collector/internal/metricRouter"
|
mr "github.com/ClusterCockpit/cc-metric-collector/internal/metricRouter"
|
||||||
mct "github.com/ClusterCockpit/cc-metric-collector/pkg/multiChanTicker"
|
mct "github.com/ClusterCockpit/cc-metric-collector/pkg/multiChanTicker"
|
||||||
)
|
)
|
||||||
@@ -49,25 +43,65 @@ type RuntimeConfig struct {
|
|||||||
Sync sync.WaitGroup
|
Sync sync.WaitGroup
|
||||||
}
|
}
|
||||||
|
|
||||||
// ReadCli reads the command line arguments
|
//// Structure of the configuration file
|
||||||
|
//type GlobalConfig struct {
|
||||||
|
// Sink sinks.SinkConfig `json:"sink"`
|
||||||
|
// Interval int `json:"interval"`
|
||||||
|
// Duration int `json:"duration"`
|
||||||
|
// Collectors []string `json:"collectors"`
|
||||||
|
// Receiver receivers.ReceiverConfig `json:"receiver"`
|
||||||
|
// DefTags map[string]string `json:"default_tags"`
|
||||||
|
// CollectConfigs map[string]json.RawMessage `json:"collect_config"`
|
||||||
|
//}
|
||||||
|
|
||||||
|
//// Load JSON configuration file
|
||||||
|
//func LoadConfiguration(file string, config *GlobalConfig) error {
|
||||||
|
// configFile, err := os.Open(file)
|
||||||
|
// defer configFile.Close()
|
||||||
|
// if err != nil {
|
||||||
|
// fmt.Println(err.Error())
|
||||||
|
// return err
|
||||||
|
// }
|
||||||
|
// jsonParser := json.NewDecoder(configFile)
|
||||||
|
// err = jsonParser.Decode(config)
|
||||||
|
// return err
|
||||||
|
//}
|
||||||
|
|
||||||
func ReadCli() map[string]string {
|
func ReadCli() map[string]string {
|
||||||
|
var m map[string]string
|
||||||
cfg := flag.String("config", "./config.json", "Path to configuration file")
|
cfg := flag.String("config", "./config.json", "Path to configuration file")
|
||||||
logfile := flag.String("log", "stderr", "Path for logfile")
|
logfile := flag.String("log", "stderr", "Path for logfile")
|
||||||
once := flag.Bool("once", false, "Run all collectors only once")
|
once := flag.Bool("once", false, "Run all collectors only once")
|
||||||
loglevel := flag.String("loglevel", "info", "Set log level")
|
loglevel := flag.String("loglevel", "info", "Set log level")
|
||||||
flag.Parse()
|
flag.Parse()
|
||||||
m := map[string]string{
|
m = make(map[string]string)
|
||||||
"configfile": *cfg,
|
m["configfile"] = *cfg
|
||||||
"logfile": *logfile,
|
m["logfile"] = *logfile
|
||||||
"once": "false",
|
|
||||||
"loglevel": *loglevel,
|
|
||||||
}
|
|
||||||
if *once {
|
if *once {
|
||||||
m["once"] = "true"
|
m["once"] = "true"
|
||||||
|
} else {
|
||||||
|
m["once"] = "false"
|
||||||
}
|
}
|
||||||
|
m["loglevel"] = *loglevel
|
||||||
return m
|
return m
|
||||||
}
|
}
|
||||||
|
|
||||||
|
//func SetLogging(logfile string) error {
|
||||||
|
// var file *os.File
|
||||||
|
// var err error
|
||||||
|
// if logfile != "stderr" {
|
||||||
|
// file, err = os.OpenFile(logfile, os.O_APPEND|os.O_CREATE|os.O_WRONLY, 0600)
|
||||||
|
// if err != nil {
|
||||||
|
// log.Fatal(err)
|
||||||
|
// return err
|
||||||
|
// }
|
||||||
|
// } else {
|
||||||
|
// file = os.Stderr
|
||||||
|
// }
|
||||||
|
// log.SetOutput(file)
|
||||||
|
// return nil
|
||||||
|
//}
|
||||||
|
|
||||||
// General shutdownHandler function that gets executed in case of interrupt or graceful shutdownHandler
|
// General shutdownHandler function that gets executed in case of interrupt or graceful shutdownHandler
|
||||||
func shutdownHandler(config *RuntimeConfig, shutdownSignal chan os.Signal) {
|
func shutdownHandler(config *RuntimeConfig, shutdownSignal chan os.Signal) {
|
||||||
defer config.Sync.Done()
|
defer config.Sync.Done()
|
||||||
@@ -121,10 +155,9 @@ func mainFunc() int {
|
|||||||
|
|
||||||
// Load and check configuration
|
// Load and check configuration
|
||||||
main := ccconf.GetPackageConfig("main")
|
main := ccconf.GetPackageConfig("main")
|
||||||
d := json.NewDecoder(bytes.NewReader(main))
|
err = json.Unmarshal(main, &rcfg.ConfigFile)
|
||||||
d.DisallowUnknownFields()
|
if err != nil {
|
||||||
if err := d.Decode(&rcfg.ConfigFile); err != nil {
|
cclog.Error("Error reading configuration file ", rcfg.CliArgs["configfile"], ": ", err.Error())
|
||||||
cclog.Errorf("Error reading configuration file %s: %v", rcfg.CliArgs["configfile"], err)
|
|
||||||
return 1
|
return 1
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -176,6 +209,11 @@ func mainFunc() int {
|
|||||||
return 1
|
return 1
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Set log file
|
||||||
|
// if logfile := rcfg.CliArgs["logfile"]; logfile != "stderr" {
|
||||||
|
// cclog.SetOutput(logfile)
|
||||||
|
// }
|
||||||
|
|
||||||
// Creat new multi channel ticker
|
// Creat new multi channel ticker
|
||||||
rcfg.MultiChanTicker = mct.NewTicker(rcfg.Interval)
|
rcfg.MultiChanTicker = mct.NewTicker(rcfg.Interval)
|
||||||
|
|
||||||
|
|||||||
@@ -1,19 +1,6 @@
|
|||||||
{
|
{
|
||||||
"cpufreq": {},
|
"cpufreq": {},
|
||||||
"cpufreq_cpuinfo": {},
|
"cpufreq_cpuinfo": {},
|
||||||
"cpustat": {
|
|
||||||
"exclude_metrics": [
|
|
||||||
"cpu_idle"
|
|
||||||
]
|
|
||||||
},
|
|
||||||
"diskstat": {
|
|
||||||
"exclude_metrics": [
|
|
||||||
"disk_total"
|
|
||||||
],
|
|
||||||
"exclude_mounts": [
|
|
||||||
"slurm-tmpfs"
|
|
||||||
]
|
|
||||||
},
|
|
||||||
"gpfs": {
|
"gpfs": {
|
||||||
"exclude_filesystem": [
|
"exclude_filesystem": [
|
||||||
"test_fs"
|
"test_fs"
|
||||||
@@ -34,8 +21,6 @@
|
|||||||
},
|
},
|
||||||
"numastats": {},
|
"numastats": {},
|
||||||
"nvidia": {},
|
"nvidia": {},
|
||||||
"schedstat": {},
|
|
||||||
"smartmon": {},
|
|
||||||
"tempstat": {
|
"tempstat": {
|
||||||
"report_max_temperature": true,
|
"report_max_temperature": true,
|
||||||
"report_critical_temperature": true,
|
"report_critical_temperature": true,
|
||||||
@@ -1,5 +1,5 @@
|
|||||||
# LIKWID version
|
# LIKWID version
|
||||||
LIKWID_VERSION := 5.5.1
|
LIKWID_VERSION := 5.4.1
|
||||||
LIKWID_INSTALLED_FOLDER := $(shell dirname $$(which likwid-topology 2>/dev/null) 2>/dev/null)
|
LIKWID_INSTALLED_FOLDER := $(shell dirname $$(which likwid-topology 2>/dev/null) 2>/dev/null)
|
||||||
|
|
||||||
LIKWID_FOLDER := $(CURDIR)/likwid
|
LIKWID_FOLDER := $(CURDIR)/likwid
|
||||||
|
|||||||
@@ -1,14 +1,3 @@
|
|||||||
<!--
|
|
||||||
---
|
|
||||||
title: Metric Collectors
|
|
||||||
description: Metric collectors for cc-metric-collector
|
|
||||||
categories: [cc-metric-collector]
|
|
||||||
tags: ['Admin']
|
|
||||||
weight: 2
|
|
||||||
hugo_path: docs/reference/cc-metric-collector/collectors/_index.md
|
|
||||||
---
|
|
||||||
-->
|
|
||||||
|
|
||||||
# CCMetric collectors
|
# CCMetric collectors
|
||||||
|
|
||||||
This folder contains the collectors for the cc-metric-collector.
|
This folder contains the collectors for the cc-metric-collector.
|
||||||
@@ -34,6 +23,7 @@ In contrast to the configuration files for sinks and receivers, the collectors c
|
|||||||
* [`loadavg`](./loadavgMetric.md)
|
* [`loadavg`](./loadavgMetric.md)
|
||||||
* [`netstat`](./netstatMetric.md)
|
* [`netstat`](./netstatMetric.md)
|
||||||
* [`ibstat`](./infinibandMetric.md)
|
* [`ibstat`](./infinibandMetric.md)
|
||||||
|
* [`ibstat_perfquery`](./infinibandPerfQueryMetric.md)
|
||||||
* [`tempstat`](./tempMetric.md)
|
* [`tempstat`](./tempMetric.md)
|
||||||
* [`lustrestat`](./lustreMetric.md)
|
* [`lustrestat`](./lustreMetric.md)
|
||||||
* [`likwid`](./likwidMetric.md)
|
* [`likwid`](./likwidMetric.md)
|
||||||
@@ -43,32 +33,28 @@ In contrast to the configuration files for sinks and receivers, the collectors c
|
|||||||
* [`topprocs`](./topprocsMetric.md)
|
* [`topprocs`](./topprocsMetric.md)
|
||||||
* [`nfs3stat`](./nfs3Metric.md)
|
* [`nfs3stat`](./nfs3Metric.md)
|
||||||
* [`nfs4stat`](./nfs4Metric.md)
|
* [`nfs4stat`](./nfs4Metric.md)
|
||||||
* [`nfsiostat`](./nfsiostatMetric.md)
|
|
||||||
* [`cpufreq`](./cpufreqMetric.md)
|
* [`cpufreq`](./cpufreqMetric.md)
|
||||||
* [`cpufreq_cpuinfo`](./cpufreqCpuinfoMetric.md)
|
* [`cpufreq_cpuinfo`](./cpufreqCpuinfoMetric.md)
|
||||||
* [`schedstat`](./schedstatMetric.md)
|
|
||||||
* [`numastats`](./numastatsMetric.md)
|
* [`numastats`](./numastatsMetric.md)
|
||||||
* [`gpfs`](./gpfsMetric.md)
|
* [`gpfs`](./gpfsMetric.md)
|
||||||
* [`beegfs_meta`](./beegfsmetaMetric.md)
|
* [`beegfs_meta`](./beegfsmetaMetric.md)
|
||||||
* [`beegfs_storage`](./beegfsstorageMetric.md)
|
* [`beegfs_storage`](./beegfsstorageMetric.md)
|
||||||
* [`rocm_smi`](./rocmsmiMetric.md)
|
* [`rocm_smi`](./rocmsmiMetric.md)
|
||||||
* [`slurm_cgroup`](./slurmCgroupMetric.md)
|
|
||||||
|
|
||||||
## Todos
|
## Todos
|
||||||
|
|
||||||
* [ ] Aggreate metrics to higher topology entity (sum hwthread metrics to socket metric, ...). Needs to be configurable
|
* [ ] Aggreate metrics to higher topology entity (sum hwthread metrics to socket metric, ...). Needs to be configurable
|
||||||
|
|
||||||
# Contributing own collectors
|
# Contributing own collectors
|
||||||
|
|
||||||
A collector reads data from any source, parses it to metrics and submits these metrics to the `metric-collector`. A collector provides three function:
|
A collector reads data from any source, parses it to metrics and submits these metrics to the `metric-collector`. A collector provides three function:
|
||||||
|
|
||||||
* `Name() string`: Return the name of the collector
|
* `Name() string`: Return the name of the collector
|
||||||
* `Init(config json.RawMessage) error`: Initializes the collector using the given collector-specific config in JSON. Check if needed files/commands exists, ...
|
* `Init(config json.RawMessage) error`: Initializes the collector using the given collector-specific config in JSON. Check if needed files/commands exists, ...
|
||||||
* `Initialized() bool`: Check if a collector is successfully initialized
|
* `Initialized() bool`: Check if a collector is successfully initialized
|
||||||
* `Read(duration time.Duration, output chan ccMessage.CCMessage)`: Read, parse and submit data to the `output` channel as [`CCMessage`](https://github.com/ClusterCockpit/cc-lib/blob/main/ccMessage/README.md). If the collector has to measure anything for some duration, use the provided function argument `duration`.
|
* `Read(duration time.Duration, output chan ccMetric.CCMetric)`: Read, parse and submit data to the `output` channel as [`CCMetric`](../internal/ccMetric/README.md). If the collector has to measure anything for some duration, use the provided function argument `duration`.
|
||||||
* `Close()`: Closes down the collector.
|
* `Close()`: Closes down the collector.
|
||||||
|
|
||||||
It is recommended to call `setup()` in the `Init()` function.
|
It is recommanded to call `setup()` in the `Init()` function.
|
||||||
|
|
||||||
Finally, the collector needs to be registered in the `collectorManager.go`. There is a list of collectors called `AvailableCollectors` which is a map (`collector_type_string` -> `pointer to MetricCollector interface`). Add a new entry with a descriptive name and the new collector.
|
Finally, the collector needs to be registered in the `collectorManager.go`. There is a list of collectors called `AvailableCollectors` which is a map (`collector_type_string` -> `pointer to MetricCollector interface`). Add a new entry with a descriptive name and the new collector.
|
||||||
|
|
||||||
@@ -101,14 +87,11 @@ func (m *SampleCollector) Init(config json.RawMessage) error {
|
|||||||
}
|
}
|
||||||
|
|
||||||
m.name = "SampleCollector"
|
m.name = "SampleCollector"
|
||||||
if err := m.setup(); err != nil {
|
m.setup()
|
||||||
return fmt.Errorf("%s Init(): setup() call failed: %w", m.name, err)
|
|
||||||
}
|
|
||||||
if len(config) > 0 {
|
if len(config) > 0 {
|
||||||
d := json.NewDecoder(bytes.NewReader(config))
|
err := json.Unmarshal(config, &m.config)
|
||||||
d.DisallowUnknownFields()
|
if err != nil {
|
||||||
if err := d.Decode(&m.config); err != nil {
|
return err
|
||||||
return fmt.Errorf("%s Init(): Error decoding JSON config: %w", m.name, err)
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
m.meta = map[string]string{"source": m.name, "group": "Sample"}
|
m.meta = map[string]string{"source": m.name, "group": "Sample"}
|
||||||
|
|||||||
@@ -1,10 +1,3 @@
|
|||||||
// Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
|
|
||||||
// All rights reserved. This file is part of cc-lib.
|
|
||||||
// Use of this source code is governed by a MIT-style
|
|
||||||
// license that can be found in the LICENSE file.
|
|
||||||
// additional authors:
|
|
||||||
// Holger Obermaier (NHR@KIT)
|
|
||||||
|
|
||||||
package collectors
|
package collectors
|
||||||
|
|
||||||
import (
|
import (
|
||||||
@@ -17,27 +10,25 @@ import (
|
|||||||
"os/exec"
|
"os/exec"
|
||||||
"os/user"
|
"os/user"
|
||||||
"regexp"
|
"regexp"
|
||||||
"slices"
|
|
||||||
"strconv"
|
"strconv"
|
||||||
"strings"
|
"strings"
|
||||||
"time"
|
"time"
|
||||||
|
|
||||||
cclog "github.com/ClusterCockpit/cc-lib/v2/ccLogger"
|
cclog "github.com/ClusterCockpit/cc-lib/ccLogger"
|
||||||
lp "github.com/ClusterCockpit/cc-lib/v2/ccMessage"
|
lp "github.com/ClusterCockpit/cc-lib/ccMessage"
|
||||||
)
|
)
|
||||||
|
|
||||||
const DEFAULT_BEEGFS_CMD = "beegfs-ctl"
|
const DEFAULT_BEEGFS_CMD = "beegfs-ctl"
|
||||||
|
|
||||||
// Struct for the collector-specific JSON config
|
// Struct for the collector-specific JSON config
|
||||||
type BeegfsMetaCollectorConfig struct {
|
type BeegfsMetaCollectorConfig struct {
|
||||||
Beegfs string `json:"beegfs_path"`
|
Beegfs string `json:"beegfs_path"`
|
||||||
ExcludeMetrics []string `json:"exclude_metrics,omitempty"`
|
ExcludeMetrics []string `json:"exclude_metrics,omitempty"`
|
||||||
ExcludeFilesystems []string `json:"exclude_filesystem"`
|
ExcludeFilesystem []string `json:"exclude_filesystem"`
|
||||||
}
|
}
|
||||||
|
|
||||||
type BeegfsMetaCollector struct {
|
type BeegfsMetaCollector struct {
|
||||||
metricCollector
|
metricCollector
|
||||||
|
|
||||||
tags map[string]string
|
tags map[string]string
|
||||||
matches map[string]string
|
matches map[string]string
|
||||||
config BeegfsMetaCollectorConfig
|
config BeegfsMetaCollectorConfig
|
||||||
@@ -50,7 +41,7 @@ func (m *BeegfsMetaCollector) Init(config json.RawMessage) error {
|
|||||||
return nil
|
return nil
|
||||||
}
|
}
|
||||||
// Metrics
|
// Metrics
|
||||||
nodeMdstat_array := [39]string{
|
var nodeMdstat_array = [39]string{
|
||||||
"sum", "ack", "close", "entInf",
|
"sum", "ack", "close", "entInf",
|
||||||
"fndOwn", "mkdir", "create", "rddir",
|
"fndOwn", "mkdir", "create", "rddir",
|
||||||
"refrEn", "mdsInf", "rmdir", "rmLnk",
|
"refrEn", "mdsInf", "rmdir", "rmLnk",
|
||||||
@@ -60,13 +51,10 @@ func (m *BeegfsMetaCollector) Init(config json.RawMessage) error {
|
|||||||
"lookLI", "statLI", "revalLI", "openLI",
|
"lookLI", "statLI", "revalLI", "openLI",
|
||||||
"createLI", "hardlnk", "flckAp", "flckEn",
|
"createLI", "hardlnk", "flckAp", "flckEn",
|
||||||
"flckRg", "dirparent", "listXA", "getXA",
|
"flckRg", "dirparent", "listXA", "getXA",
|
||||||
"rmXA", "setXA", "mirror",
|
"rmXA", "setXA", "mirror"}
|
||||||
}
|
|
||||||
|
|
||||||
m.name = "BeegfsMetaCollector"
|
m.name = "BeegfsMetaCollector"
|
||||||
if err := m.setup(); err != nil {
|
m.setup()
|
||||||
return fmt.Errorf("%s Init(): setup() call failed: %w", m.name, err)
|
|
||||||
}
|
|
||||||
m.parallel = true
|
m.parallel = true
|
||||||
// Set default beegfs-ctl binary
|
// Set default beegfs-ctl binary
|
||||||
|
|
||||||
@@ -74,17 +62,17 @@ func (m *BeegfsMetaCollector) Init(config json.RawMessage) error {
|
|||||||
|
|
||||||
// Read JSON configuration
|
// Read JSON configuration
|
||||||
if len(config) > 0 {
|
if len(config) > 0 {
|
||||||
d := json.NewDecoder(bytes.NewReader(config))
|
err := json.Unmarshal(config, &m.config)
|
||||||
d.DisallowUnknownFields()
|
if err != nil {
|
||||||
if err := d.Decode(&m.config); err != nil {
|
return err
|
||||||
return fmt.Errorf("%s Init(): Failed to decode JSON config: %w", m.name, err)
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// Create map with possible variables
|
//create map with possible variables
|
||||||
m.matches = make(map[string]string)
|
m.matches = make(map[string]string)
|
||||||
for _, value := range nodeMdstat_array {
|
for _, value := range nodeMdstat_array {
|
||||||
if slices.Contains(m.config.ExcludeMetrics, value) {
|
_, skip := stringArrayContains(m.config.ExcludeMetrics, value)
|
||||||
|
if skip {
|
||||||
m.matches["other"] = "0"
|
m.matches["other"] = "0"
|
||||||
} else {
|
} else {
|
||||||
m.matches["beegfs_cmeta_"+value] = "0"
|
m.matches["beegfs_cmeta_"+value] = "0"
|
||||||
@@ -100,23 +88,23 @@ func (m *BeegfsMetaCollector) Init(config json.RawMessage) error {
|
|||||||
"filesystem": "",
|
"filesystem": "",
|
||||||
}
|
}
|
||||||
m.skipFS = make(map[string]struct{})
|
m.skipFS = make(map[string]struct{})
|
||||||
for _, fs := range m.config.ExcludeFilesystems {
|
for _, fs := range m.config.ExcludeFilesystem {
|
||||||
m.skipFS[fs] = struct{}{}
|
m.skipFS[fs] = struct{}{}
|
||||||
}
|
}
|
||||||
|
|
||||||
// Beegfs file system statistics can only be queried by user root
|
// Beegfs file system statistics can only be queried by user root
|
||||||
user, err := user.Current()
|
user, err := user.Current()
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return fmt.Errorf("%s Init(): Failed to get current user: %w", m.name, err)
|
return fmt.Errorf("BeegfsMetaCollector.Init(): Failed to get current user: %v", err)
|
||||||
}
|
}
|
||||||
if user.Uid != "0" {
|
if user.Uid != "0" {
|
||||||
return fmt.Errorf("%s Init(): BeeGFS file system statistics can only be queried by user root", m.name)
|
return fmt.Errorf("BeegfsMetaCollector.Init(): BeeGFS file system statistics can only be queried by user root")
|
||||||
}
|
}
|
||||||
|
|
||||||
// Check if beegfs-ctl is in executable search path
|
// Check if beegfs-ctl is in executable search path
|
||||||
_, err = exec.LookPath(m.config.Beegfs)
|
_, err = exec.LookPath(m.config.Beegfs)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return fmt.Errorf("%s Init(): Failed to find beegfs-ctl binary '%s': %w", m.name, m.config.Beegfs, err)
|
return fmt.Errorf("BeegfsMetaCollector.Init(): Failed to find beegfs-ctl binary '%s': %v", m.config.Beegfs, err)
|
||||||
}
|
}
|
||||||
m.init = true
|
m.init = true
|
||||||
return nil
|
return nil
|
||||||
@@ -126,7 +114,7 @@ func (m *BeegfsMetaCollector) Read(interval time.Duration, output chan lp.CCMess
|
|||||||
if !m.init {
|
if !m.init {
|
||||||
return
|
return
|
||||||
}
|
}
|
||||||
// Get mounpoint
|
//get mounpoint
|
||||||
buffer, _ := os.ReadFile(string("/proc/mounts"))
|
buffer, _ := os.ReadFile(string("/proc/mounts"))
|
||||||
mounts := strings.Split(string(buffer), "\n")
|
mounts := strings.Split(string(buffer), "\n")
|
||||||
var mountpoints []string
|
var mountpoints []string
|
||||||
@@ -156,6 +144,7 @@ func (m *BeegfsMetaCollector) Read(interval time.Duration, output chan lp.CCMess
|
|||||||
// --nodetype=meta: The node type to query (meta, storage).
|
// --nodetype=meta: The node type to query (meta, storage).
|
||||||
// --interval:
|
// --interval:
|
||||||
// --mount=/mnt/beeond/: Which mount point
|
// --mount=/mnt/beeond/: Which mount point
|
||||||
|
//cmd := exec.Command(m.config.Beegfs, "/root/mc/test.txt")
|
||||||
mountoption := "--mount=" + mountpoint
|
mountoption := "--mount=" + mountpoint
|
||||||
cmd := exec.Command(m.config.Beegfs, "--clientstats",
|
cmd := exec.Command(m.config.Beegfs, "--clientstats",
|
||||||
"--nodetype=meta", mountoption, "--allstats")
|
"--nodetype=meta", mountoption, "--allstats")
|
||||||
@@ -166,27 +155,26 @@ func (m *BeegfsMetaCollector) Read(interval time.Duration, output chan lp.CCMess
|
|||||||
cmd.Stderr = cmdStderr
|
cmd.Stderr = cmdStderr
|
||||||
err := cmd.Run()
|
err := cmd.Run()
|
||||||
if err != nil {
|
if err != nil {
|
||||||
dataStdErr, _ := io.ReadAll(cmdStderr)
|
fmt.Fprintf(os.Stderr, "BeegfsMetaCollector.Read(): Failed to execute command \"%s\": %s\n", cmd.String(), err.Error())
|
||||||
dataStdOut, _ := io.ReadAll(cmdStdout)
|
fmt.Fprintf(os.Stderr, "BeegfsMetaCollector.Read(): command exit code: \"%d\"\n", cmd.ProcessState.ExitCode())
|
||||||
cclog.ComponentError(
|
data, _ := io.ReadAll(cmdStderr)
|
||||||
m.name,
|
fmt.Fprintf(os.Stderr, "BeegfsMetaCollector.Read(): command stderr: \"%s\"\n", string(data))
|
||||||
fmt.Sprintf("Read(): Failed to execute command \"%s\": %v\n", cmd.String(), err),
|
data, _ = io.ReadAll(cmdStdout)
|
||||||
fmt.Sprintf("Read(): command exit code: \"%d\"\n", cmd.ProcessState.ExitCode()),
|
fmt.Fprintf(os.Stderr, "BeegfsMetaCollector.Read(): command stdout: \"%s\"\n", string(data))
|
||||||
fmt.Sprintf("Read(): command stderr: \"%s\"\n", string(dataStdErr)),
|
|
||||||
fmt.Sprintf("Read(): command stdout: \"%s\"\n", string(dataStdOut)),
|
|
||||||
)
|
|
||||||
return
|
return
|
||||||
}
|
}
|
||||||
// Read I/O statistics
|
// Read I/O statistics
|
||||||
scanner := bufio.NewScanner(cmdStdout)
|
scanner := bufio.NewScanner(cmdStdout)
|
||||||
|
|
||||||
sumLine := regexp.MustCompile(`^Sum:\s+\d+\s+\[[a-zA-Z]+\]+`)
|
sumLine := regexp.MustCompile(`^Sum:\s+\d+\s+\[[a-zA-Z]+\]+`)
|
||||||
|
//Line := regexp.MustCompile(`^(.*)\s+(\d)+\s+\[([a-zA-Z]+)\]+`)
|
||||||
statsLine := regexp.MustCompile(`^(.*?)\s+?(\d.*?)$`)
|
statsLine := regexp.MustCompile(`^(.*?)\s+?(\d.*?)$`)
|
||||||
singleSpacePattern := regexp.MustCompile(`\s+`)
|
singleSpacePattern := regexp.MustCompile(`\s+`)
|
||||||
removePattern := regexp.MustCompile(`[\[|\]]`)
|
removePattern := regexp.MustCompile(`[\[|\]]`)
|
||||||
|
|
||||||
for scanner.Scan() {
|
for scanner.Scan() {
|
||||||
readLine := scanner.Text()
|
readLine := scanner.Text()
|
||||||
|
//fmt.Println(readLine)
|
||||||
// Jump few lines, we only want the I/O stats from nodes
|
// Jump few lines, we only want the I/O stats from nodes
|
||||||
if !sumLine.MatchString(readLine) {
|
if !sumLine.MatchString(readLine) {
|
||||||
continue
|
continue
|
||||||
@@ -195,7 +183,7 @@ func (m *BeegfsMetaCollector) Read(interval time.Duration, output chan lp.CCMess
|
|||||||
match := statsLine.FindStringSubmatch(readLine)
|
match := statsLine.FindStringSubmatch(readLine)
|
||||||
// nodeName = "Sum:" or would be nodes
|
// nodeName = "Sum:" or would be nodes
|
||||||
// nodeName := match[1]
|
// nodeName := match[1]
|
||||||
// Remove multiple whitespaces
|
//Remove multiple whitespaces
|
||||||
dummy := removePattern.ReplaceAllString(match[2], " ")
|
dummy := removePattern.ReplaceAllString(match[2], " ")
|
||||||
metaStats := strings.TrimSpace(singleSpacePattern.ReplaceAllString(dummy, " "))
|
metaStats := strings.TrimSpace(singleSpacePattern.ReplaceAllString(dummy, " "))
|
||||||
split := strings.Split(metaStats, " ")
|
split := strings.Split(metaStats, " ")
|
||||||
@@ -221,13 +209,14 @@ func (m *BeegfsMetaCollector) Read(interval time.Duration, output chan lp.CCMess
|
|||||||
fmt.Sprintf("Metric (other): Failed to convert str written '%s' to float: %v", m.matches["other"], err))
|
fmt.Sprintf("Metric (other): Failed to convert str written '%s' to float: %v", m.matches["other"], err))
|
||||||
continue
|
continue
|
||||||
}
|
}
|
||||||
|
//mdStat["other"] = fmt.Sprintf("%f", f1+f2)
|
||||||
m.matches["beegfs_cstorage_other"] = fmt.Sprintf("%f", f1+f2)
|
m.matches["beegfs_cstorage_other"] = fmt.Sprintf("%f", f1+f2)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
for key, data := range m.matches {
|
for key, data := range m.matches {
|
||||||
value, _ := strconv.ParseFloat(data, 32)
|
value, _ := strconv.ParseFloat(data, 32)
|
||||||
y, err := lp.NewMessage(key, m.tags, m.meta, map[string]any{"value": value}, time.Now())
|
y, err := lp.NewMessage(key, m.tags, m.meta, map[string]interface{}{"value": value}, time.Now())
|
||||||
if err == nil {
|
if err == nil {
|
||||||
output <- y
|
output <- y
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -1,17 +1,5 @@
|
|||||||
<!--
|
|
||||||
---
|
|
||||||
title: BeeGFS metadata metric collector
|
|
||||||
description: Collect metadata clientstats for `BeeGFS on Demand`
|
|
||||||
categories: [cc-metric-collector]
|
|
||||||
tags: ['Admin']
|
|
||||||
weight: 2
|
|
||||||
hugo_path: docs/reference/cc-metric-collector/collectors/beegfsmeta.md
|
|
||||||
---
|
|
||||||
-->
|
|
||||||
|
|
||||||
|
|
||||||
## `BeeGFS on Demand` collector
|
## `BeeGFS on Demand` collector
|
||||||
This Collector is to collect `BeeGFS on Demand` (BeeOND) metadata clientstats.
|
This Collector is to collect BeeGFS on Demand (BeeOND) metadata clientstats.
|
||||||
|
|
||||||
```json
|
```json
|
||||||
"beegfs_meta": {
|
"beegfs_meta": {
|
||||||
|
|||||||
@@ -1,10 +1,3 @@
|
|||||||
// Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
|
|
||||||
// All rights reserved. This file is part of cc-lib.
|
|
||||||
// Use of this source code is governed by a MIT-style
|
|
||||||
// license that can be found in the LICENSE file.
|
|
||||||
// additional authors:
|
|
||||||
// Holger Obermaier (NHR@KIT)
|
|
||||||
|
|
||||||
package collectors
|
package collectors
|
||||||
|
|
||||||
import (
|
import (
|
||||||
@@ -17,25 +10,23 @@ import (
|
|||||||
"os/exec"
|
"os/exec"
|
||||||
"os/user"
|
"os/user"
|
||||||
"regexp"
|
"regexp"
|
||||||
"slices"
|
|
||||||
"strconv"
|
"strconv"
|
||||||
"strings"
|
"strings"
|
||||||
"time"
|
"time"
|
||||||
|
|
||||||
cclog "github.com/ClusterCockpit/cc-lib/v2/ccLogger"
|
cclog "github.com/ClusterCockpit/cc-lib/ccLogger"
|
||||||
lp "github.com/ClusterCockpit/cc-lib/v2/ccMessage"
|
lp "github.com/ClusterCockpit/cc-lib/ccMessage"
|
||||||
)
|
)
|
||||||
|
|
||||||
// Struct for the collector-specific JSON config
|
// Struct for the collector-specific JSON config
|
||||||
type BeegfsStorageCollectorConfig struct {
|
type BeegfsStorageCollectorConfig struct {
|
||||||
Beegfs string `json:"beegfs_path"`
|
Beegfs string `json:"beegfs_path"`
|
||||||
ExcludeMetrics []string `json:"exclude_metrics,omitempty"`
|
ExcludeMetrics []string `json:"exclude_metrics,omitempty"`
|
||||||
ExcludeFilesystems []string `json:"exclude_filesystem"`
|
ExcludeFilesystem []string `json:"exclude_filesystem"`
|
||||||
}
|
}
|
||||||
|
|
||||||
type BeegfsStorageCollector struct {
|
type BeegfsStorageCollector struct {
|
||||||
metricCollector
|
metricCollector
|
||||||
|
|
||||||
tags map[string]string
|
tags map[string]string
|
||||||
matches map[string]string
|
matches map[string]string
|
||||||
config BeegfsStorageCollectorConfig
|
config BeegfsStorageCollectorConfig
|
||||||
@@ -48,18 +39,15 @@ func (m *BeegfsStorageCollector) Init(config json.RawMessage) error {
|
|||||||
return nil
|
return nil
|
||||||
}
|
}
|
||||||
// Metrics
|
// Metrics
|
||||||
storageStat_array := [18]string{
|
var storageStat_array = [18]string{
|
||||||
"sum", "ack", "sChDrct", "getFSize",
|
"sum", "ack", "sChDrct", "getFSize",
|
||||||
"sAttr", "statfs", "trunc", "close",
|
"sAttr", "statfs", "trunc", "close",
|
||||||
"fsync", "ops-rd", "MiB-rd/s", "ops-wr",
|
"fsync", "ops-rd", "MiB-rd/s", "ops-wr",
|
||||||
"MiB-wr/s", "gendbg", "hrtbeat", "remNode",
|
"MiB-wr/s", "gendbg", "hrtbeat", "remNode",
|
||||||
"storInf", "unlnk",
|
"storInf", "unlnk"}
|
||||||
}
|
|
||||||
|
|
||||||
m.name = "BeegfsStorageCollector"
|
m.name = "BeegfsStorageCollector"
|
||||||
if err := m.setup(); err != nil {
|
m.setup()
|
||||||
return fmt.Errorf("%s Init(): setup() call failed: %w", m.name, err)
|
|
||||||
}
|
|
||||||
m.parallel = true
|
m.parallel = true
|
||||||
// Set default beegfs-ctl binary
|
// Set default beegfs-ctl binary
|
||||||
|
|
||||||
@@ -67,17 +55,17 @@ func (m *BeegfsStorageCollector) Init(config json.RawMessage) error {
|
|||||||
|
|
||||||
// Read JSON configuration
|
// Read JSON configuration
|
||||||
if len(config) > 0 {
|
if len(config) > 0 {
|
||||||
d := json.NewDecoder(bytes.NewReader(config))
|
err := json.Unmarshal(config, &m.config)
|
||||||
d.DisallowUnknownFields()
|
if err != nil {
|
||||||
if err := d.Decode(&m.config); err != nil {
|
return err
|
||||||
return fmt.Errorf("%s Init(): failed to decode JSON config: %w", m.name, err)
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
println(m.config.Beegfs)
|
||||||
// Create map with possible variables
|
//create map with possible variables
|
||||||
m.matches = make(map[string]string)
|
m.matches = make(map[string]string)
|
||||||
for _, value := range storageStat_array {
|
for _, value := range storageStat_array {
|
||||||
if slices.Contains(m.config.ExcludeMetrics, value) {
|
_, skip := stringArrayContains(m.config.ExcludeMetrics, value)
|
||||||
|
if skip {
|
||||||
m.matches["other"] = "0"
|
m.matches["other"] = "0"
|
||||||
} else {
|
} else {
|
||||||
m.matches["beegfs_cstorage_"+value] = "0"
|
m.matches["beegfs_cstorage_"+value] = "0"
|
||||||
@@ -93,23 +81,23 @@ func (m *BeegfsStorageCollector) Init(config json.RawMessage) error {
|
|||||||
"filesystem": "",
|
"filesystem": "",
|
||||||
}
|
}
|
||||||
m.skipFS = make(map[string]struct{})
|
m.skipFS = make(map[string]struct{})
|
||||||
for _, fs := range m.config.ExcludeFilesystems {
|
for _, fs := range m.config.ExcludeFilesystem {
|
||||||
m.skipFS[fs] = struct{}{}
|
m.skipFS[fs] = struct{}{}
|
||||||
}
|
}
|
||||||
|
|
||||||
// Beegfs file system statistics can only be queried by user root
|
// Beegfs file system statistics can only be queried by user root
|
||||||
user, err := user.Current()
|
user, err := user.Current()
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return fmt.Errorf("%s Init(): Failed to get current user: %w", m.name, err)
|
return fmt.Errorf("BeegfsStorageCollector.Init(): Failed to get current user: %v", err)
|
||||||
}
|
}
|
||||||
if user.Uid != "0" {
|
if user.Uid != "0" {
|
||||||
return fmt.Errorf("%s Init(): BeeGFS file system statistics can only be queried by user root", m.name)
|
return fmt.Errorf("BeegfsStorageCollector.Init(): BeeGFS file system statistics can only be queried by user root")
|
||||||
}
|
}
|
||||||
|
|
||||||
// Check if beegfs-ctl is in executable search path
|
// Check if beegfs-ctl is in executable search path
|
||||||
_, err = exec.LookPath(m.config.Beegfs)
|
_, err = exec.LookPath(m.config.Beegfs)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return fmt.Errorf("%s Init(): Failed to find beegfs-ctl binary '%s': %w", m.name, m.config.Beegfs, err)
|
return fmt.Errorf("BeegfsStorageCollector.Init(): Failed to find beegfs-ctl binary '%s': %v", m.config.Beegfs, err)
|
||||||
}
|
}
|
||||||
m.init = true
|
m.init = true
|
||||||
return nil
|
return nil
|
||||||
@@ -119,10 +107,11 @@ func (m *BeegfsStorageCollector) Read(interval time.Duration, output chan lp.CCM
|
|||||||
if !m.init {
|
if !m.init {
|
||||||
return
|
return
|
||||||
}
|
}
|
||||||
// Get mounpoint
|
//get mounpoint
|
||||||
buffer, _ := os.ReadFile("/proc/mounts")
|
buffer, _ := os.ReadFile(string("/proc/mounts"))
|
||||||
|
mounts := strings.Split(string(buffer), "\n")
|
||||||
var mountpoints []string
|
var mountpoints []string
|
||||||
for line := range strings.Lines(string(buffer)) {
|
for _, line := range mounts {
|
||||||
if len(line) == 0 {
|
if len(line) == 0 {
|
||||||
continue
|
continue
|
||||||
}
|
}
|
||||||
@@ -147,6 +136,7 @@ func (m *BeegfsStorageCollector) Read(interval time.Duration, output chan lp.CCM
|
|||||||
// --nodetype=meta: The node type to query (meta, storage).
|
// --nodetype=meta: The node type to query (meta, storage).
|
||||||
// --interval:
|
// --interval:
|
||||||
// --mount=/mnt/beeond/: Which mount point
|
// --mount=/mnt/beeond/: Which mount point
|
||||||
|
//cmd := exec.Command(m.config.Beegfs, "/root/mc/test.txt")
|
||||||
mountoption := "--mount=" + mountpoint
|
mountoption := "--mount=" + mountpoint
|
||||||
cmd := exec.Command(m.config.Beegfs, "--clientstats",
|
cmd := exec.Command(m.config.Beegfs, "--clientstats",
|
||||||
"--nodetype=storage", mountoption, "--allstats")
|
"--nodetype=storage", mountoption, "--allstats")
|
||||||
@@ -157,27 +147,26 @@ func (m *BeegfsStorageCollector) Read(interval time.Duration, output chan lp.CCM
|
|||||||
cmd.Stderr = cmdStderr
|
cmd.Stderr = cmdStderr
|
||||||
err := cmd.Run()
|
err := cmd.Run()
|
||||||
if err != nil {
|
if err != nil {
|
||||||
dataStdErr, _ := io.ReadAll(cmdStderr)
|
fmt.Fprintf(os.Stderr, "BeegfsStorageCollector.Read(): Failed to execute command \"%s\": %s\n", cmd.String(), err.Error())
|
||||||
dataStdOut, _ := io.ReadAll(cmdStdout)
|
fmt.Fprintf(os.Stderr, "BeegfsStorageCollector.Read(): command exit code: \"%d\"\n", cmd.ProcessState.ExitCode())
|
||||||
cclog.ComponentError(
|
data, _ := io.ReadAll(cmdStderr)
|
||||||
m.name,
|
fmt.Fprintf(os.Stderr, "BeegfsStorageCollector.Read(): command stderr: \"%s\"\n", string(data))
|
||||||
fmt.Sprintf("Read(): Failed to execute command \"%s\": %v\n", cmd.String(), err),
|
data, _ = io.ReadAll(cmdStdout)
|
||||||
fmt.Sprintf("Read(): command exit code: \"%d\"\n", cmd.ProcessState.ExitCode()),
|
fmt.Fprintf(os.Stderr, "BeegfsStorageCollector.Read(): command stdout: \"%s\"\n", string(data))
|
||||||
fmt.Sprintf("Read(): command stderr: \"%s\"\n", string(dataStdErr)),
|
|
||||||
fmt.Sprintf("Read(): command stdout: \"%s\"\n", string(dataStdOut)),
|
|
||||||
)
|
|
||||||
return
|
return
|
||||||
}
|
}
|
||||||
// Read I/O statistics
|
// Read I/O statistics
|
||||||
scanner := bufio.NewScanner(cmdStdout)
|
scanner := bufio.NewScanner(cmdStdout)
|
||||||
|
|
||||||
sumLine := regexp.MustCompile(`^Sum:\s+\d+\s+\[[a-zA-Z]+\]+`)
|
sumLine := regexp.MustCompile(`^Sum:\s+\d+\s+\[[a-zA-Z]+\]+`)
|
||||||
|
//Line := regexp.MustCompile(`^(.*)\s+(\d)+\s+\[([a-zA-Z]+)\]+`)
|
||||||
statsLine := regexp.MustCompile(`^(.*?)\s+?(\d.*?)$`)
|
statsLine := regexp.MustCompile(`^(.*?)\s+?(\d.*?)$`)
|
||||||
singleSpacePattern := regexp.MustCompile(`\s+`)
|
singleSpacePattern := regexp.MustCompile(`\s+`)
|
||||||
removePattern := regexp.MustCompile(`[\[|\]]`)
|
removePattern := regexp.MustCompile(`[\[|\]]`)
|
||||||
|
|
||||||
for scanner.Scan() {
|
for scanner.Scan() {
|
||||||
readLine := scanner.Text()
|
readLine := scanner.Text()
|
||||||
|
//fmt.Println(readLine)
|
||||||
// Jump few lines, we only want the I/O stats from nodes
|
// Jump few lines, we only want the I/O stats from nodes
|
||||||
if !sumLine.MatchString(readLine) {
|
if !sumLine.MatchString(readLine) {
|
||||||
continue
|
continue
|
||||||
@@ -186,7 +175,7 @@ func (m *BeegfsStorageCollector) Read(interval time.Duration, output chan lp.CCM
|
|||||||
match := statsLine.FindStringSubmatch(readLine)
|
match := statsLine.FindStringSubmatch(readLine)
|
||||||
// nodeName = "Sum:" or would be nodes
|
// nodeName = "Sum:" or would be nodes
|
||||||
// nodeName := match[1]
|
// nodeName := match[1]
|
||||||
// Remove multiple whitespaces
|
//Remove multiple whitespaces
|
||||||
dummy := removePattern.ReplaceAllString(match[2], " ")
|
dummy := removePattern.ReplaceAllString(match[2], " ")
|
||||||
metaStats := strings.TrimSpace(singleSpacePattern.ReplaceAllString(dummy, " "))
|
metaStats := strings.TrimSpace(singleSpacePattern.ReplaceAllString(dummy, " "))
|
||||||
split := strings.Split(metaStats, " ")
|
split := strings.Split(metaStats, " ")
|
||||||
@@ -197,6 +186,7 @@ func (m *BeegfsStorageCollector) Read(interval time.Duration, output chan lp.CCM
|
|||||||
for i := 0; i <= len(split)-1; i += 2 {
|
for i := 0; i <= len(split)-1; i += 2 {
|
||||||
if _, ok := m.matches[split[i+1]]; ok {
|
if _, ok := m.matches[split[i+1]]; ok {
|
||||||
m.matches["beegfs_cstorage_"+split[i+1]] = split[i]
|
m.matches["beegfs_cstorage_"+split[i+1]] = split[i]
|
||||||
|
//m.matches[split[i+1]] = split[i]
|
||||||
} else {
|
} else {
|
||||||
f1, err := strconv.ParseFloat(m.matches["other"], 32)
|
f1, err := strconv.ParseFloat(m.matches["other"], 32)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
@@ -218,7 +208,7 @@ func (m *BeegfsStorageCollector) Read(interval time.Duration, output chan lp.CCM
|
|||||||
|
|
||||||
for key, data := range m.matches {
|
for key, data := range m.matches {
|
||||||
value, _ := strconv.ParseFloat(data, 32)
|
value, _ := strconv.ParseFloat(data, 32)
|
||||||
y, err := lp.NewMessage(key, m.tags, m.meta, map[string]any{"value": value}, time.Now())
|
y, err := lp.NewMessage(key, m.tags, m.meta, map[string]interface{}{"value": value}, time.Now())
|
||||||
if err == nil {
|
if err == nil {
|
||||||
output <- y
|
output <- y
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -1,27 +1,16 @@
|
|||||||
<!--
|
|
||||||
---
|
|
||||||
title: "BeeGFS on Demand metric collector"
|
|
||||||
description: Collect performance metrics for BeeGFS filesystems
|
|
||||||
categories: [cc-metric-collector]
|
|
||||||
tags: ['Admin']
|
|
||||||
weight: 2
|
|
||||||
hugo_path: docs/reference/cc-metric-collector/collectors/beegfsstorage.md
|
|
||||||
---
|
|
||||||
-->
|
|
||||||
|
|
||||||
## `BeeGFS on Demand` collector
|
## `BeeGFS on Demand` collector
|
||||||
This Collector is to collect BeeGFS on Demand (BeeOND) storage stats.
|
This Collector is to collect BeeGFS on Demand (BeeOND) storage stats.
|
||||||
|
|
||||||
```json
|
```json
|
||||||
"beegfs_storage": {
|
"beegfs_storage": {
|
||||||
"beegfs_path": "/usr/bin/beegfs-ctl",
|
"beegfs_path": "/usr/bin/beegfs-ctl",
|
||||||
"exclude_filesystem": [
|
"exclude_filesystem": [
|
||||||
"/mnt/ignore_me"
|
"/mnt/ignore_me"
|
||||||
],
|
],
|
||||||
"exclude_metrics": [
|
"exclude_metrics": [
|
||||||
"ack",
|
"ack",
|
||||||
"storInf",
|
"storInf",
|
||||||
"unlnk"
|
"unlnk"
|
||||||
]
|
]
|
||||||
}
|
}
|
||||||
```
|
```
|
||||||
|
|||||||
@@ -1,26 +1,18 @@
|
|||||||
// Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
|
|
||||||
// All rights reserved. This file is part of cc-lib.
|
|
||||||
// Use of this source code is governed by a MIT-style
|
|
||||||
// license that can be found in the LICENSE file.
|
|
||||||
// additional authors:
|
|
||||||
// Holger Obermaier (NHR@KIT)
|
|
||||||
|
|
||||||
package collectors
|
package collectors
|
||||||
|
|
||||||
import (
|
import (
|
||||||
"bytes"
|
|
||||||
"encoding/json"
|
"encoding/json"
|
||||||
"fmt"
|
|
||||||
"sync"
|
"sync"
|
||||||
"time"
|
"time"
|
||||||
|
|
||||||
cclog "github.com/ClusterCockpit/cc-lib/v2/ccLogger"
|
cclog "github.com/ClusterCockpit/cc-lib/ccLogger"
|
||||||
lp "github.com/ClusterCockpit/cc-lib/v2/ccMessage"
|
lp "github.com/ClusterCockpit/cc-lib/ccMessage"
|
||||||
mct "github.com/ClusterCockpit/cc-metric-collector/pkg/multiChanTicker"
|
mct "github.com/ClusterCockpit/cc-metric-collector/pkg/multiChanTicker"
|
||||||
)
|
)
|
||||||
|
|
||||||
// Map of all available metric collectors
|
// Map of all available metric collectors
|
||||||
var AvailableCollectors = map[string]MetricCollector{
|
var AvailableCollectors = map[string]MetricCollector{
|
||||||
|
|
||||||
"likwid": new(LikwidCollector),
|
"likwid": new(LikwidCollector),
|
||||||
"loadavg": new(LoadavgCollector),
|
"loadavg": new(LoadavgCollector),
|
||||||
"memstat": new(MemstatCollector),
|
"memstat": new(MemstatCollector),
|
||||||
@@ -48,8 +40,6 @@ var AvailableCollectors = map[string]MetricCollector{
|
|||||||
"self": new(SelfCollector),
|
"self": new(SelfCollector),
|
||||||
"schedstat": new(SchedstatCollector),
|
"schedstat": new(SchedstatCollector),
|
||||||
"nfsiostat": new(NfsIOStatCollector),
|
"nfsiostat": new(NfsIOStatCollector),
|
||||||
"slurm_cgroup": new(SlurmCgroupCollector),
|
|
||||||
"smartmon": new(SmartMonCollector),
|
|
||||||
}
|
}
|
||||||
|
|
||||||
// Metric collector manager data structure
|
// Metric collector manager data structure
|
||||||
@@ -90,10 +80,10 @@ func (cm *collectorManager) Init(ticker mct.MultiChanTicker, duration time.Durat
|
|||||||
cm.ticker = ticker
|
cm.ticker = ticker
|
||||||
cm.duration = duration
|
cm.duration = duration
|
||||||
|
|
||||||
d := json.NewDecoder(bytes.NewReader(collectConfig))
|
err := json.Unmarshal(collectConfig, &cm.config)
|
||||||
d.DisallowUnknownFields()
|
if err != nil {
|
||||||
if err := d.Decode(&cm.config); err != nil {
|
cclog.Error(err.Error())
|
||||||
return fmt.Errorf("%s Init(): Error decoding collector manager config: %w", "CollectorManager", err)
|
return err
|
||||||
}
|
}
|
||||||
|
|
||||||
// Initialize configured collectors
|
// Initialize configured collectors
|
||||||
@@ -104,9 +94,9 @@ func (cm *collectorManager) Init(ticker mct.MultiChanTicker, duration time.Durat
|
|||||||
}
|
}
|
||||||
collector := AvailableCollectors[collectorName]
|
collector := AvailableCollectors[collectorName]
|
||||||
|
|
||||||
err := collector.Init(collectorCfg)
|
err = collector.Init(collectorCfg)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
cclog.ComponentError("CollectorManager", fmt.Sprintf("Collector %s initialization failed: %v", collectorName, err))
|
cclog.ComponentError("CollectorManager", "Collector", collectorName, "initialization failed:", err.Error())
|
||||||
continue
|
continue
|
||||||
}
|
}
|
||||||
cclog.ComponentDebug("CollectorManager", "ADD COLLECTOR", collector.Name())
|
cclog.ComponentDebug("CollectorManager", "ADD COLLECTOR", collector.Name())
|
||||||
@@ -124,7 +114,9 @@ func (cm *collectorManager) Start() {
|
|||||||
tick := make(chan time.Time)
|
tick := make(chan time.Time)
|
||||||
cm.ticker.AddChannel(tick)
|
cm.ticker.AddChannel(tick)
|
||||||
|
|
||||||
cm.wg.Go(func() {
|
cm.wg.Add(1)
|
||||||
|
go func() {
|
||||||
|
defer cm.wg.Done()
|
||||||
// Collector manager is done
|
// Collector manager is done
|
||||||
done := func() {
|
done := func() {
|
||||||
// close all metric collectors
|
// close all metric collectors
|
||||||
@@ -179,7 +171,7 @@ func (cm *collectorManager) Start() {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
})
|
}()
|
||||||
|
|
||||||
// Collector manager is started
|
// Collector manager is started
|
||||||
cclog.ComponentDebug("CollectorManager", "STARTED")
|
cclog.ComponentDebug("CollectorManager", "STARTED")
|
||||||
|
|||||||
@@ -1,23 +1,17 @@
|
|||||||
// Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
|
|
||||||
// All rights reserved. This file is part of cc-lib.
|
|
||||||
// Use of this source code is governed by a MIT-style
|
|
||||||
// license that can be found in the LICENSE file.
|
|
||||||
// additional authors:
|
|
||||||
// Holger Obermaier (NHR@KIT)
|
|
||||||
|
|
||||||
package collectors
|
package collectors
|
||||||
|
|
||||||
import (
|
import (
|
||||||
"bufio"
|
"bufio"
|
||||||
"encoding/json"
|
"encoding/json"
|
||||||
|
|
||||||
"fmt"
|
"fmt"
|
||||||
"os"
|
"os"
|
||||||
"strconv"
|
"strconv"
|
||||||
"strings"
|
"strings"
|
||||||
"time"
|
"time"
|
||||||
|
|
||||||
cclog "github.com/ClusterCockpit/cc-lib/v2/ccLogger"
|
cclog "github.com/ClusterCockpit/cc-lib/ccLogger"
|
||||||
lp "github.com/ClusterCockpit/cc-lib/v2/ccMessage"
|
lp "github.com/ClusterCockpit/cc-lib/ccMessage"
|
||||||
)
|
)
|
||||||
|
|
||||||
// CPUFreqCollector
|
// CPUFreqCollector
|
||||||
@@ -31,20 +25,18 @@ type CPUFreqCpuInfoCollectorTopology struct {
|
|||||||
|
|
||||||
type CPUFreqCpuInfoCollector struct {
|
type CPUFreqCpuInfoCollector struct {
|
||||||
metricCollector
|
metricCollector
|
||||||
|
|
||||||
topology []CPUFreqCpuInfoCollectorTopology
|
topology []CPUFreqCpuInfoCollectorTopology
|
||||||
}
|
}
|
||||||
|
|
||||||
func (m *CPUFreqCpuInfoCollector) Init(_ json.RawMessage) error {
|
func (m *CPUFreqCpuInfoCollector) Init(config json.RawMessage) error {
|
||||||
// Check if already initialized
|
// Check if already initialized
|
||||||
if m.init {
|
if m.init {
|
||||||
return nil
|
return nil
|
||||||
}
|
}
|
||||||
|
|
||||||
|
m.setup()
|
||||||
|
|
||||||
m.name = "CPUFreqCpuInfoCollector"
|
m.name = "CPUFreqCpuInfoCollector"
|
||||||
if err := m.setup(); err != nil {
|
|
||||||
return fmt.Errorf("%s Init(): setup() call failed: %w", m.name, err)
|
|
||||||
}
|
|
||||||
m.parallel = true
|
m.parallel = true
|
||||||
m.meta = map[string]string{
|
m.meta = map[string]string{
|
||||||
"source": m.name,
|
"source": m.name,
|
||||||
@@ -55,8 +47,9 @@ func (m *CPUFreqCpuInfoCollector) Init(_ json.RawMessage) error {
|
|||||||
const cpuInfoFile = "/proc/cpuinfo"
|
const cpuInfoFile = "/proc/cpuinfo"
|
||||||
file, err := os.Open(cpuInfoFile)
|
file, err := os.Open(cpuInfoFile)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return fmt.Errorf("%s Init(): failed to open file '%s': %w", m.name, cpuInfoFile, err)
|
return fmt.Errorf("failed to open file '%s': %v", cpuInfoFile, err)
|
||||||
}
|
}
|
||||||
|
defer file.Close()
|
||||||
|
|
||||||
// Collect topology information from file cpuinfo
|
// Collect topology information from file cpuinfo
|
||||||
foundFreq := false
|
foundFreq := false
|
||||||
@@ -117,13 +110,9 @@ func (m *CPUFreqCpuInfoCollector) Init(_ json.RawMessage) error {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
if err := file.Close(); err != nil {
|
|
||||||
return fmt.Errorf("%s Init(): Call to file.Close() failed: %w", m.name, err)
|
|
||||||
}
|
|
||||||
|
|
||||||
// Check if at least one CPU with frequency information was detected
|
// Check if at least one CPU with frequency information was detected
|
||||||
if len(m.topology) == 0 {
|
if len(m.topology) == 0 {
|
||||||
return fmt.Errorf("%s Init(): no CPU frequency info found in %s", m.name, cpuInfoFile)
|
return fmt.Errorf("no CPU frequency info found in %s", cpuInfoFile)
|
||||||
}
|
}
|
||||||
|
|
||||||
m.init = true
|
m.init = true
|
||||||
@@ -144,13 +133,7 @@ func (m *CPUFreqCpuInfoCollector) Read(interval time.Duration, output chan lp.CC
|
|||||||
fmt.Sprintf("Read(): Failed to open file '%s': %v", cpuInfoFile, err))
|
fmt.Sprintf("Read(): Failed to open file '%s': %v", cpuInfoFile, err))
|
||||||
return
|
return
|
||||||
}
|
}
|
||||||
defer func() {
|
defer file.Close()
|
||||||
if err := file.Close(); err != nil {
|
|
||||||
cclog.ComponentError(
|
|
||||||
m.name,
|
|
||||||
fmt.Sprintf("Read(): Failed to close file '%s': %v", cpuInfoFile, err))
|
|
||||||
}
|
|
||||||
}()
|
|
||||||
|
|
||||||
processorCounter := 0
|
processorCounter := 0
|
||||||
now := time.Now()
|
now := time.Now()
|
||||||
@@ -171,7 +154,7 @@ func (m *CPUFreqCpuInfoCollector) Read(interval time.Duration, output chan lp.CC
|
|||||||
fmt.Sprintf("Read(): Failed to convert cpu MHz '%s' to float64: %v", lineSplit[1], err))
|
fmt.Sprintf("Read(): Failed to convert cpu MHz '%s' to float64: %v", lineSplit[1], err))
|
||||||
return
|
return
|
||||||
}
|
}
|
||||||
if y, err := lp.NewMessage("cpufreq", t.tagSet, m.meta, map[string]any{"value": value}, now); err == nil {
|
if y, err := lp.NewMessage("cpufreq", t.tagSet, m.meta, map[string]interface{}{"value": value}, now); err == nil {
|
||||||
output <- y
|
output <- y
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -1,20 +1,7 @@
|
|||||||
<!--
|
|
||||||
---
|
|
||||||
title: CPU frequency metric collector through cpuinfo
|
|
||||||
description: Collect the CPU frequency from `/proc/cpuinfo`
|
|
||||||
categories: [cc-metric-collector]
|
|
||||||
tags: ['Admin']
|
|
||||||
weight: 2
|
|
||||||
hugo_path: docs/reference/cc-metric-collector/collectors/cpufreq_cpuinfo.md
|
|
||||||
---
|
|
||||||
-->
|
|
||||||
|
|
||||||
## `cpufreq_cpuinfo` collector
|
## `cpufreq_cpuinfo` collector
|
||||||
|
|
||||||
```json
|
```json
|
||||||
"cpufreq_cpuinfo": {
|
"cpufreq_cpuinfo": {}
|
||||||
"exclude_metrics": []
|
|
||||||
}
|
|
||||||
```
|
```
|
||||||
|
|
||||||
The `cpufreq_cpuinfo` collector reads the clock frequency from `/proc/cpuinfo` and outputs a handful **hwthread** metrics.
|
The `cpufreq_cpuinfo` collector reads the clock frequency from `/proc/cpuinfo` and outputs a handful **hwthread** metrics.
|
||||||
|
|||||||
@@ -1,14 +1,6 @@
|
|||||||
// Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
|
|
||||||
// All rights reserved. This file is part of cc-lib.
|
|
||||||
// Use of this source code is governed by a MIT-style
|
|
||||||
// license that can be found in the LICENSE file.
|
|
||||||
// additional authors:
|
|
||||||
// Holger Obermaier (NHR@KIT)
|
|
||||||
|
|
||||||
package collectors
|
package collectors
|
||||||
|
|
||||||
import (
|
import (
|
||||||
"bytes"
|
|
||||||
"encoding/json"
|
"encoding/json"
|
||||||
"fmt"
|
"fmt"
|
||||||
"os"
|
"os"
|
||||||
@@ -17,8 +9,8 @@ import (
|
|||||||
"strings"
|
"strings"
|
||||||
"time"
|
"time"
|
||||||
|
|
||||||
cclog "github.com/ClusterCockpit/cc-lib/v2/ccLogger"
|
cclog "github.com/ClusterCockpit/cc-lib/ccLogger"
|
||||||
lp "github.com/ClusterCockpit/cc-lib/v2/ccMessage"
|
lp "github.com/ClusterCockpit/cc-lib/ccMessage"
|
||||||
"github.com/ClusterCockpit/cc-metric-collector/pkg/ccTopology"
|
"github.com/ClusterCockpit/cc-metric-collector/pkg/ccTopology"
|
||||||
"golang.org/x/sys/unix"
|
"golang.org/x/sys/unix"
|
||||||
)
|
)
|
||||||
@@ -36,7 +28,6 @@ type CPUFreqCollectorTopology struct {
|
|||||||
// See: https://www.kernel.org/doc/html/latest/admin-guide/pm/cpufreq.html
|
// See: https://www.kernel.org/doc/html/latest/admin-guide/pm/cpufreq.html
|
||||||
type CPUFreqCollector struct {
|
type CPUFreqCollector struct {
|
||||||
metricCollector
|
metricCollector
|
||||||
|
|
||||||
topology []CPUFreqCollectorTopology
|
topology []CPUFreqCollectorTopology
|
||||||
config struct {
|
config struct {
|
||||||
ExcludeMetrics []string `json:"exclude_metrics,omitempty"`
|
ExcludeMetrics []string `json:"exclude_metrics,omitempty"`
|
||||||
@@ -50,15 +41,12 @@ func (m *CPUFreqCollector) Init(config json.RawMessage) error {
|
|||||||
}
|
}
|
||||||
|
|
||||||
m.name = "CPUFreqCollector"
|
m.name = "CPUFreqCollector"
|
||||||
if err := m.setup(); err != nil {
|
m.setup()
|
||||||
return fmt.Errorf("%s Init(): setup() call failed: %w", m.name, err)
|
|
||||||
}
|
|
||||||
m.parallel = true
|
m.parallel = true
|
||||||
if len(config) > 0 {
|
if len(config) > 0 {
|
||||||
d := json.NewDecoder(bytes.NewReader(config))
|
err := json.Unmarshal(config, &m.config)
|
||||||
d.DisallowUnknownFields()
|
if err != nil {
|
||||||
if err := d.Decode(&m.config); err != nil {
|
return err
|
||||||
return fmt.Errorf("%s Init(): failed to decode JSON config: %w", m.name, err)
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
m.meta = map[string]string{
|
m.meta = map[string]string{
|
||||||
@@ -79,15 +67,15 @@ func (m *CPUFreqCollector) Init(config json.RawMessage) error {
|
|||||||
scalingCurFreqFile := filepath.Join("/sys/devices/system/cpu", fmt.Sprintf("cpu%d", c.CpuID), "cpufreq/scaling_cur_freq")
|
scalingCurFreqFile := filepath.Join("/sys/devices/system/cpu", fmt.Sprintf("cpu%d", c.CpuID), "cpufreq/scaling_cur_freq")
|
||||||
err := unix.Access(scalingCurFreqFile, unix.R_OK)
|
err := unix.Access(scalingCurFreqFile, unix.R_OK)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return fmt.Errorf("%s Init(): unable to access file '%s': %w", m.name, scalingCurFreqFile, err)
|
return fmt.Errorf("unable to access file '%s': %v", scalingCurFreqFile, err)
|
||||||
}
|
}
|
||||||
|
|
||||||
m.topology = append(m.topology,
|
m.topology = append(m.topology,
|
||||||
CPUFreqCollectorTopology{
|
CPUFreqCollectorTopology{
|
||||||
tagSet: map[string]string{
|
tagSet: map[string]string{
|
||||||
"type": "hwthread",
|
"type": "hwthread",
|
||||||
"type-id": strconv.Itoa(c.CpuID),
|
"type-id": fmt.Sprint(c.CpuID),
|
||||||
"package_id": strconv.Itoa(c.Socket),
|
"package_id": fmt.Sprint(c.Socket),
|
||||||
},
|
},
|
||||||
scalingCurFreqFile: scalingCurFreqFile,
|
scalingCurFreqFile: scalingCurFreqFile,
|
||||||
},
|
},
|
||||||
@@ -129,7 +117,7 @@ func (m *CPUFreqCollector) Read(interval time.Duration, output chan lp.CCMessage
|
|||||||
continue
|
continue
|
||||||
}
|
}
|
||||||
|
|
||||||
if y, err := lp.NewMessage("cpufreq", t.tagSet, m.meta, map[string]any{"value": cpuFreq}, now); err == nil {
|
if y, err := lp.NewMessage("cpufreq", t.tagSet, m.meta, map[string]interface{}{"value": cpuFreq}, now); err == nil {
|
||||||
output <- y
|
output <- y
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -1,14 +1,3 @@
|
|||||||
<!--
|
|
||||||
---
|
|
||||||
title: CPU frequency metric collector through sysfs
|
|
||||||
description: Collect the CPU frequency metrics from `/sys/.../cpu/.../cpufreq`
|
|
||||||
categories: [cc-metric-collector]
|
|
||||||
tags: ['Admin']
|
|
||||||
weight: 2
|
|
||||||
hugo_path: docs/reference/cc-metric-collector/collectors/cpufreq.md
|
|
||||||
---
|
|
||||||
-->
|
|
||||||
|
|
||||||
## `cpufreq_cpuinfo` collector
|
## `cpufreq_cpuinfo` collector
|
||||||
|
|
||||||
```json
|
```json
|
||||||
|
|||||||
@@ -1,25 +1,16 @@
|
|||||||
// Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
|
|
||||||
// All rights reserved. This file is part of cc-lib.
|
|
||||||
// Use of this source code is governed by a MIT-style
|
|
||||||
// license that can be found in the LICENSE file.
|
|
||||||
// additional authors:
|
|
||||||
// Holger Obermaier (NHR@KIT)
|
|
||||||
|
|
||||||
package collectors
|
package collectors
|
||||||
|
|
||||||
import (
|
import (
|
||||||
"bufio"
|
"bufio"
|
||||||
"bytes"
|
|
||||||
"encoding/json"
|
"encoding/json"
|
||||||
"fmt"
|
"fmt"
|
||||||
"os"
|
"os"
|
||||||
"slices"
|
|
||||||
"strconv"
|
"strconv"
|
||||||
"strings"
|
"strings"
|
||||||
"time"
|
"time"
|
||||||
|
|
||||||
cclog "github.com/ClusterCockpit/cc-lib/v2/ccLogger"
|
cclog "github.com/ClusterCockpit/cc-lib/ccLogger"
|
||||||
lp "github.com/ClusterCockpit/cc-lib/v2/ccMessage"
|
lp "github.com/ClusterCockpit/cc-lib/ccMessage"
|
||||||
sysconf "github.com/tklauser/go-sysconf"
|
sysconf "github.com/tklauser/go-sysconf"
|
||||||
)
|
)
|
||||||
|
|
||||||
@@ -31,7 +22,6 @@ type CpustatCollectorConfig struct {
|
|||||||
|
|
||||||
type CpustatCollector struct {
|
type CpustatCollector struct {
|
||||||
metricCollector
|
metricCollector
|
||||||
|
|
||||||
config CpustatCollectorConfig
|
config CpustatCollectorConfig
|
||||||
lastTimestamp time.Time // Store time stamp of last tick to derive values
|
lastTimestamp time.Time // Store time stamp of last tick to derive values
|
||||||
matches map[string]int
|
matches map[string]int
|
||||||
@@ -42,22 +32,14 @@ type CpustatCollector struct {
|
|||||||
|
|
||||||
func (m *CpustatCollector) Init(config json.RawMessage) error {
|
func (m *CpustatCollector) Init(config json.RawMessage) error {
|
||||||
m.name = "CpustatCollector"
|
m.name = "CpustatCollector"
|
||||||
if err := m.setup(); err != nil {
|
m.setup()
|
||||||
return fmt.Errorf("%s Init(): setup() call failed: %w", m.name, err)
|
|
||||||
}
|
|
||||||
m.parallel = true
|
m.parallel = true
|
||||||
m.meta = map[string]string{
|
m.meta = map[string]string{"source": m.name, "group": "CPU"}
|
||||||
"source": m.name,
|
m.nodetags = map[string]string{"type": "node"}
|
||||||
"group": "CPU",
|
|
||||||
}
|
|
||||||
m.nodetags = map[string]string{
|
|
||||||
"type": "node",
|
|
||||||
}
|
|
||||||
if len(config) > 0 {
|
if len(config) > 0 {
|
||||||
d := json.NewDecoder(bytes.NewReader(config))
|
err := json.Unmarshal(config, &m.config)
|
||||||
d.DisallowUnknownFields()
|
if err != nil {
|
||||||
if err := d.Decode(&m.config); err != nil {
|
return err
|
||||||
return fmt.Errorf("%s Init(): Error decoding JSON config: %w", m.name, err)
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
matches := map[string]int{
|
matches := map[string]int{
|
||||||
@@ -75,16 +57,24 @@ func (m *CpustatCollector) Init(config json.RawMessage) error {
|
|||||||
|
|
||||||
m.matches = make(map[string]int)
|
m.matches = make(map[string]int)
|
||||||
for match, index := range matches {
|
for match, index := range matches {
|
||||||
if !slices.Contains(m.config.ExcludeMetrics, match) {
|
doExclude := false
|
||||||
|
for _, exclude := range m.config.ExcludeMetrics {
|
||||||
|
if match == exclude {
|
||||||
|
doExclude = true
|
||||||
|
break
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if !doExclude {
|
||||||
m.matches[match] = index
|
m.matches[match] = index
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// Check input file
|
// Check input file
|
||||||
file, err := os.Open(CPUSTATFILE)
|
file, err := os.Open(string(CPUSTATFILE))
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return fmt.Errorf("%s Init(): Failed to open file '%s': %w", m.name, CPUSTATFILE, err)
|
cclog.ComponentError(m.name, err.Error())
|
||||||
}
|
}
|
||||||
|
defer file.Close()
|
||||||
|
|
||||||
// Pre-generate tags for all CPUs
|
// Pre-generate tags for all CPUs
|
||||||
num_cpus := 0
|
num_cpus := 0
|
||||||
@@ -102,10 +92,7 @@ func (m *CpustatCollector) Init(config json.RawMessage) error {
|
|||||||
} else if strings.HasPrefix(linefields[0], "cpu") && strings.Compare(linefields[0], "cpu") != 0 {
|
} else if strings.HasPrefix(linefields[0], "cpu") && strings.Compare(linefields[0], "cpu") != 0 {
|
||||||
cpustr := strings.TrimLeft(linefields[0], "cpu")
|
cpustr := strings.TrimLeft(linefields[0], "cpu")
|
||||||
cpu, _ := strconv.Atoi(cpustr)
|
cpu, _ := strconv.Atoi(cpustr)
|
||||||
m.cputags[linefields[0]] = map[string]string{
|
m.cputags[linefields[0]] = map[string]string{"type": "hwthread", "type-id": fmt.Sprintf("%d", cpu)}
|
||||||
"type": "hwthread",
|
|
||||||
"type-id": strconv.Itoa(cpu),
|
|
||||||
}
|
|
||||||
m.olddata[linefields[0]] = make(map[string]int64)
|
m.olddata[linefields[0]] = make(map[string]int64)
|
||||||
for k, v := range m.matches {
|
for k, v := range m.matches {
|
||||||
m.olddata[linefields[0]][k], _ = strconv.ParseInt(linefields[v], 0, 64)
|
m.olddata[linefields[0]][k], _ = strconv.ParseInt(linefields[v], 0, 64)
|
||||||
@@ -113,12 +100,6 @@ func (m *CpustatCollector) Init(config json.RawMessage) error {
|
|||||||
num_cpus++
|
num_cpus++
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// Close file
|
|
||||||
if err := file.Close(); err != nil {
|
|
||||||
return fmt.Errorf("%s Init(): Failed to close file '%s': %w", m.name, CPUSTATFILE, err)
|
|
||||||
}
|
|
||||||
|
|
||||||
m.lastTimestamp = time.Now()
|
m.lastTimestamp = time.Now()
|
||||||
m.init = true
|
m.init = true
|
||||||
return nil
|
return nil
|
||||||
@@ -141,7 +122,7 @@ func (m *CpustatCollector) parseStatLine(linefields []string, tags map[string]st
|
|||||||
sum := float64(0)
|
sum := float64(0)
|
||||||
for name, value := range values {
|
for name, value := range values {
|
||||||
sum += value
|
sum += value
|
||||||
y, err := lp.NewMessage(name, tags, m.meta, map[string]any{"value": value * 100}, now)
|
y, err := lp.NewMessage(name, tags, m.meta, map[string]interface{}{"value": value * 100}, now)
|
||||||
if err == nil {
|
if err == nil {
|
||||||
y.AddTag("unit", "Percent")
|
y.AddTag("unit", "Percent")
|
||||||
output <- y
|
output <- y
|
||||||
@@ -149,7 +130,7 @@ func (m *CpustatCollector) parseStatLine(linefields []string, tags map[string]st
|
|||||||
}
|
}
|
||||||
if v, ok := values["cpu_idle"]; ok {
|
if v, ok := values["cpu_idle"]; ok {
|
||||||
sum -= v
|
sum -= v
|
||||||
y, err := lp.NewMessage("cpu_used", tags, m.meta, map[string]any{"value": sum * 100}, now)
|
y, err := lp.NewMessage("cpu_used", tags, m.meta, map[string]interface{}{"value": sum * 100}, now)
|
||||||
if err == nil {
|
if err == nil {
|
||||||
y.AddTag("unit", "Percent")
|
y.AddTag("unit", "Percent")
|
||||||
output <- y
|
output <- y
|
||||||
@@ -165,19 +146,11 @@ func (m *CpustatCollector) Read(interval time.Duration, output chan lp.CCMessage
|
|||||||
now := time.Now()
|
now := time.Now()
|
||||||
tsdelta := now.Sub(m.lastTimestamp)
|
tsdelta := now.Sub(m.lastTimestamp)
|
||||||
|
|
||||||
file, err := os.Open(CPUSTATFILE)
|
file, err := os.Open(string(CPUSTATFILE))
|
||||||
if err != nil {
|
if err != nil {
|
||||||
cclog.ComponentError(
|
cclog.ComponentError(m.name, err.Error())
|
||||||
m.name,
|
|
||||||
fmt.Sprintf("Read(): Failed to open file '%s': %v", CPUSTATFILE, err))
|
|
||||||
}
|
}
|
||||||
defer func() {
|
defer file.Close()
|
||||||
if err := file.Close(); err != nil {
|
|
||||||
cclog.ComponentError(
|
|
||||||
m.name,
|
|
||||||
fmt.Sprintf("Read(): Failed to close file '%s': %v", string(CPUSTATFILE), err))
|
|
||||||
}
|
|
||||||
}()
|
|
||||||
|
|
||||||
scanner := bufio.NewScanner(file)
|
scanner := bufio.NewScanner(file)
|
||||||
for scanner.Scan() {
|
for scanner.Scan() {
|
||||||
@@ -194,7 +167,7 @@ func (m *CpustatCollector) Read(interval time.Duration, output chan lp.CCMessage
|
|||||||
num_cpus_metric, err := lp.NewMessage("num_cpus",
|
num_cpus_metric, err := lp.NewMessage("num_cpus",
|
||||||
m.nodetags,
|
m.nodetags,
|
||||||
m.meta,
|
m.meta,
|
||||||
map[string]any{"value": num_cpus},
|
map[string]interface{}{"value": int(num_cpus)},
|
||||||
now,
|
now,
|
||||||
)
|
)
|
||||||
if err == nil {
|
if err == nil {
|
||||||
|
|||||||
@@ -1,14 +1,3 @@
|
|||||||
<!--
|
|
||||||
---
|
|
||||||
title: CPU usage metric collector
|
|
||||||
description: Collect CPU metrics from `/proc/stat`
|
|
||||||
categories: [cc-metric-collector]
|
|
||||||
tags: ['Admin']
|
|
||||||
weight: 2
|
|
||||||
hugo_path: docs/reference/cc-metric-collector/collectors/cpustat.md
|
|
||||||
---
|
|
||||||
-->
|
|
||||||
|
|
||||||
|
|
||||||
## `cpustat` collector
|
## `cpustat` collector
|
||||||
|
|
||||||
|
|||||||
@@ -1,24 +1,16 @@
|
|||||||
// Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
|
|
||||||
// All rights reserved. This file is part of cc-lib.
|
|
||||||
// Use of this source code is governed by a MIT-style
|
|
||||||
// license that can be found in the LICENSE file.
|
|
||||||
// additional authors:
|
|
||||||
// Holger Obermaier (NHR@KIT)
|
|
||||||
|
|
||||||
package collectors
|
package collectors
|
||||||
|
|
||||||
import (
|
import (
|
||||||
"bytes"
|
|
||||||
"encoding/json"
|
"encoding/json"
|
||||||
"fmt"
|
"errors"
|
||||||
|
"log"
|
||||||
"os"
|
"os"
|
||||||
"os/exec"
|
"os/exec"
|
||||||
"slices"
|
|
||||||
"strings"
|
"strings"
|
||||||
"time"
|
"time"
|
||||||
|
|
||||||
cclog "github.com/ClusterCockpit/cc-lib/v2/ccLogger"
|
lp "github.com/ClusterCockpit/cc-lib/ccMessage"
|
||||||
lp "github.com/ClusterCockpit/cc-lib/v2/ccMessage"
|
influx "github.com/influxdata/line-protocol"
|
||||||
)
|
)
|
||||||
|
|
||||||
const CUSTOMCMDPATH = `/home/unrz139/Work/cc-metric-collector/collectors/custom`
|
const CUSTOMCMDPATH = `/home/unrz139/Work/cc-metric-collector/collectors/custom`
|
||||||
@@ -31,124 +23,102 @@ type CustomCmdCollectorConfig struct {
|
|||||||
|
|
||||||
type CustomCmdCollector struct {
|
type CustomCmdCollector struct {
|
||||||
metricCollector
|
metricCollector
|
||||||
|
handler *influx.MetricHandler
|
||||||
config CustomCmdCollectorConfig
|
parser *influx.Parser
|
||||||
cmdFieldsSlice [][]string
|
config CustomCmdCollectorConfig
|
||||||
files []string
|
commands []string
|
||||||
|
files []string
|
||||||
}
|
}
|
||||||
|
|
||||||
func (m *CustomCmdCollector) Init(config json.RawMessage) error {
|
func (m *CustomCmdCollector) Init(config json.RawMessage) error {
|
||||||
|
var err error
|
||||||
m.name = "CustomCmdCollector"
|
m.name = "CustomCmdCollector"
|
||||||
m.parallel = true
|
m.parallel = true
|
||||||
m.meta = map[string]string{
|
m.meta = map[string]string{"source": m.name, "group": "Custom"}
|
||||||
"source": m.name,
|
|
||||||
"group": "Custom",
|
|
||||||
}
|
|
||||||
|
|
||||||
// Read configuration
|
|
||||||
if len(config) > 0 {
|
if len(config) > 0 {
|
||||||
d := json.NewDecoder(bytes.NewReader(config))
|
err = json.Unmarshal(config, &m.config)
|
||||||
d.DisallowUnknownFields()
|
if err != nil {
|
||||||
if err := d.Decode(&m.config); err != nil {
|
log.Print(err.Error())
|
||||||
return fmt.Errorf("%s Init(): Error decoding JSON config: %w", m.name, err)
|
return err
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
m.setup()
|
||||||
// Setup
|
|
||||||
if err := m.setup(); err != nil {
|
|
||||||
return fmt.Errorf("%s Init(): setup() call failed: %w", m.name, err)
|
|
||||||
}
|
|
||||||
|
|
||||||
// Check if command can be executed
|
|
||||||
for _, c := range m.config.Commands {
|
for _, c := range m.config.Commands {
|
||||||
cmdFields := strings.Fields(c)
|
cmdfields := strings.Fields(c)
|
||||||
command := exec.Command(cmdFields[0], cmdFields[1:]...)
|
command := exec.Command(cmdfields[0], strings.Join(cmdfields[1:], " "))
|
||||||
if _, err := command.Output(); err != nil {
|
command.Wait()
|
||||||
cclog.ComponentWarn(
|
_, err = command.Output()
|
||||||
m.name,
|
if err == nil {
|
||||||
fmt.Sprintf("%s Init(): Execution of command \"%s\" failed: %v", m.name, command.String(), err))
|
m.commands = append(m.commands, c)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
for _, f := range m.config.Files {
|
||||||
|
_, err = os.ReadFile(f)
|
||||||
|
if err == nil {
|
||||||
|
m.files = append(m.files, f)
|
||||||
|
} else {
|
||||||
|
log.Print(err.Error())
|
||||||
continue
|
continue
|
||||||
}
|
}
|
||||||
m.cmdFieldsSlice = append(m.cmdFieldsSlice, cmdFields)
|
|
||||||
}
|
}
|
||||||
|
if len(m.files) == 0 && len(m.commands) == 0 {
|
||||||
// Check if file can be read
|
return errors.New("no metrics to collect")
|
||||||
for _, fileName := range m.config.Files {
|
|
||||||
if _, err := os.ReadFile(fileName); err != nil {
|
|
||||||
cclog.ComponentWarn(
|
|
||||||
m.name,
|
|
||||||
fmt.Sprintf("%s Init(): Reading of file \"%s\" failed: %v", m.name, fileName, err))
|
|
||||||
continue
|
|
||||||
}
|
|
||||||
m.files = append(m.files, fileName)
|
|
||||||
}
|
|
||||||
|
|
||||||
if len(m.files) == 0 && len(m.cmdFieldsSlice) == 0 {
|
|
||||||
return fmt.Errorf("%s Init(): no metrics to collect", m.name)
|
|
||||||
}
|
}
|
||||||
|
m.handler = influx.NewMetricHandler()
|
||||||
|
m.parser = influx.NewParser(m.handler)
|
||||||
|
m.parser.SetTimeFunc(DefaultTime)
|
||||||
m.init = true
|
m.init = true
|
||||||
return nil
|
return nil
|
||||||
}
|
}
|
||||||
|
|
||||||
|
var DefaultTime = func() time.Time {
|
||||||
|
return time.Unix(42, 0)
|
||||||
|
}
|
||||||
|
|
||||||
func (m *CustomCmdCollector) Read(interval time.Duration, output chan lp.CCMessage) {
|
func (m *CustomCmdCollector) Read(interval time.Duration, output chan lp.CCMessage) {
|
||||||
if !m.init {
|
if !m.init {
|
||||||
return
|
return
|
||||||
}
|
}
|
||||||
|
for _, cmd := range m.commands {
|
||||||
// Execute configured commands
|
cmdfields := strings.Fields(cmd)
|
||||||
for _, cmdFields := range m.cmdFieldsSlice {
|
command := exec.Command(cmdfields[0], strings.Join(cmdfields[1:], " "))
|
||||||
command := exec.Command(cmdFields[0], cmdFields[1:]...)
|
command.Wait()
|
||||||
stdout, err := command.Output()
|
stdout, err := command.Output()
|
||||||
if err != nil {
|
if err != nil {
|
||||||
cclog.ComponentError(
|
log.Print(err)
|
||||||
m.name,
|
|
||||||
fmt.Sprintf("Read(): Failed to read command output for command \"%s\": %v", command.String(), err),
|
|
||||||
)
|
|
||||||
continue
|
continue
|
||||||
}
|
}
|
||||||
|
cmdmetrics, err := m.parser.Parse(stdout)
|
||||||
// Read and decode influxDB line-protocol from command output
|
|
||||||
metrics, err := lp.FromBytes(stdout)
|
|
||||||
if err != nil {
|
if err != nil {
|
||||||
cclog.ComponentError(
|
log.Print(err)
|
||||||
m.name,
|
|
||||||
fmt.Sprintf("Read(): Failed to decode influx Message: %v", err),
|
|
||||||
)
|
|
||||||
continue
|
continue
|
||||||
}
|
}
|
||||||
for _, metric := range metrics {
|
for _, c := range cmdmetrics {
|
||||||
if slices.Contains(m.config.ExcludeMetrics, metric.Name()) {
|
_, skip := stringArrayContains(m.config.ExcludeMetrics, c.Name())
|
||||||
|
if skip {
|
||||||
continue
|
continue
|
||||||
}
|
}
|
||||||
output <- metric
|
|
||||||
|
output <- lp.FromInfluxMetric(c)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
for _, file := range m.files {
|
||||||
// Read configured files
|
buffer, err := os.ReadFile(file)
|
||||||
for _, filename := range m.files {
|
|
||||||
input, err := os.ReadFile(filename)
|
|
||||||
if err != nil {
|
if err != nil {
|
||||||
cclog.ComponentError(
|
log.Print(err)
|
||||||
m.name,
|
return
|
||||||
fmt.Sprintf("Read(): Failed to read file \"%s\": %v\n", filename, err),
|
}
|
||||||
)
|
fmetrics, err := m.parser.Parse(buffer)
|
||||||
|
if err != nil {
|
||||||
|
log.Print(err)
|
||||||
continue
|
continue
|
||||||
}
|
}
|
||||||
|
for _, f := range fmetrics {
|
||||||
// Read and decode influxDB line-protocol from file
|
_, skip := stringArrayContains(m.config.ExcludeMetrics, f.Name())
|
||||||
metrics, err := lp.FromBytes(input)
|
if skip {
|
||||||
if err != nil {
|
|
||||||
cclog.ComponentError(
|
|
||||||
m.name,
|
|
||||||
fmt.Sprintf("Read(): Failed to decode influx Message: %v", err),
|
|
||||||
)
|
|
||||||
continue
|
|
||||||
}
|
|
||||||
for _, metric := range metrics {
|
|
||||||
if slices.Contains(m.config.ExcludeMetrics, metric.Name()) {
|
|
||||||
continue
|
continue
|
||||||
}
|
}
|
||||||
output <- metric
|
output <- lp.FromInfluxMetric(f)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -1,13 +1,3 @@
|
|||||||
<!--
|
|
||||||
---
|
|
||||||
title: CustomCommand metric collector
|
|
||||||
description: Collect messages from custom command or files
|
|
||||||
categories: [cc-metric-collector]
|
|
||||||
tags: ['Admin']
|
|
||||||
weight: 2
|
|
||||||
hugo_path: docs/reference/cc-metric-collector/collectors/customcmd.md
|
|
||||||
---
|
|
||||||
-->
|
|
||||||
|
|
||||||
## `customcmd` collector
|
## `customcmd` collector
|
||||||
|
|
||||||
|
|||||||
@@ -1,24 +1,15 @@
|
|||||||
// Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
|
|
||||||
// All rights reserved. This file is part of cc-lib.
|
|
||||||
// Use of this source code is governed by a MIT-style
|
|
||||||
// license that can be found in the LICENSE file.
|
|
||||||
// additional authors:
|
|
||||||
// Holger Obermaier (NHR@KIT)
|
|
||||||
|
|
||||||
package collectors
|
package collectors
|
||||||
|
|
||||||
import (
|
import (
|
||||||
"bufio"
|
"bufio"
|
||||||
"bytes"
|
|
||||||
"encoding/json"
|
"encoding/json"
|
||||||
"fmt"
|
|
||||||
"os"
|
"os"
|
||||||
"strings"
|
"strings"
|
||||||
"syscall"
|
"syscall"
|
||||||
"time"
|
"time"
|
||||||
|
|
||||||
cclog "github.com/ClusterCockpit/cc-lib/v2/ccLogger"
|
cclog "github.com/ClusterCockpit/cc-lib/ccLogger"
|
||||||
lp "github.com/ClusterCockpit/cc-lib/v2/ccMessage"
|
lp "github.com/ClusterCockpit/cc-lib/ccMessage"
|
||||||
)
|
)
|
||||||
|
|
||||||
const MOUNTFILE = `/proc/self/mounts`
|
const MOUNTFILE = `/proc/self/mounts`
|
||||||
@@ -30,7 +21,6 @@ type DiskstatCollectorConfig struct {
|
|||||||
|
|
||||||
type DiskstatCollector struct {
|
type DiskstatCollector struct {
|
||||||
metricCollector
|
metricCollector
|
||||||
|
|
||||||
config DiskstatCollectorConfig
|
config DiskstatCollectorConfig
|
||||||
allowedMetrics map[string]bool
|
allowedMetrics map[string]bool
|
||||||
}
|
}
|
||||||
@@ -39,14 +29,10 @@ func (m *DiskstatCollector) Init(config json.RawMessage) error {
|
|||||||
m.name = "DiskstatCollector"
|
m.name = "DiskstatCollector"
|
||||||
m.parallel = true
|
m.parallel = true
|
||||||
m.meta = map[string]string{"source": m.name, "group": "Disk"}
|
m.meta = map[string]string{"source": m.name, "group": "Disk"}
|
||||||
if err := m.setup(); err != nil {
|
m.setup()
|
||||||
return fmt.Errorf("%s Init(): setup() call failed: %w", m.name, err)
|
|
||||||
}
|
|
||||||
if len(config) > 0 {
|
if len(config) > 0 {
|
||||||
d := json.NewDecoder(bytes.NewReader(config))
|
if err := json.Unmarshal(config, &m.config); err != nil {
|
||||||
d.DisallowUnknownFields()
|
return err
|
||||||
if err := d.Decode(&m.config); err != nil {
|
|
||||||
return fmt.Errorf("%s Init(): Error decoding JSON config: %w", m.name, err)
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
m.allowedMetrics = map[string]bool{
|
m.allowedMetrics = map[string]bool{
|
||||||
@@ -61,11 +47,10 @@ func (m *DiskstatCollector) Init(config json.RawMessage) error {
|
|||||||
}
|
}
|
||||||
file, err := os.Open(MOUNTFILE)
|
file, err := os.Open(MOUNTFILE)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return fmt.Errorf("%s Init(): file open for file \"%s\" failed: %w", m.name, MOUNTFILE, err)
|
cclog.ComponentError(m.name, err.Error())
|
||||||
}
|
return err
|
||||||
if err := file.Close(); err != nil {
|
|
||||||
return fmt.Errorf("%s Init(): file close for file \"%s\" failed: %w", m.name, MOUNTFILE, err)
|
|
||||||
}
|
}
|
||||||
|
defer file.Close()
|
||||||
m.init = true
|
m.init = true
|
||||||
return nil
|
return nil
|
||||||
}
|
}
|
||||||
@@ -77,18 +62,10 @@ func (m *DiskstatCollector) Read(interval time.Duration, output chan lp.CCMessag
|
|||||||
|
|
||||||
file, err := os.Open(MOUNTFILE)
|
file, err := os.Open(MOUNTFILE)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
cclog.ComponentError(
|
cclog.ComponentError(m.name, err.Error())
|
||||||
m.name,
|
|
||||||
fmt.Sprintf("Read(): Failed to open file '%s': %v", MOUNTFILE, err))
|
|
||||||
return
|
return
|
||||||
}
|
}
|
||||||
defer func() {
|
defer file.Close()
|
||||||
if err := file.Close(); err != nil {
|
|
||||||
cclog.ComponentError(
|
|
||||||
m.name,
|
|
||||||
fmt.Sprintf("Read(): Failed to close file '%s': %v", MOUNTFILE, err))
|
|
||||||
}
|
|
||||||
}()
|
|
||||||
|
|
||||||
part_max_used := uint64(0)
|
part_max_used := uint64(0)
|
||||||
scanner := bufio.NewScanner(file)
|
scanner := bufio.NewScanner(file)
|
||||||
@@ -109,7 +86,7 @@ mountLoop:
|
|||||||
continue
|
continue
|
||||||
}
|
}
|
||||||
|
|
||||||
mountPath := strings.ReplaceAll(linefields[1], `\040`, " ")
|
mountPath := strings.Replace(linefields[1], `\040`, " ", -1)
|
||||||
|
|
||||||
for _, excl := range m.config.ExcludeMounts {
|
for _, excl := range m.config.ExcludeMounts {
|
||||||
if strings.Contains(mountPath, excl) {
|
if strings.Contains(mountPath, excl) {
|
||||||
@@ -126,31 +103,17 @@ mountLoop:
|
|||||||
continue
|
continue
|
||||||
}
|
}
|
||||||
tags := map[string]string{"type": "node", "device": linefields[0]}
|
tags := map[string]string{"type": "node", "device": linefields[0]}
|
||||||
total := (stat.Blocks * uint64(stat.Bsize)) / uint64(1000_000_000)
|
total := (stat.Blocks * uint64(stat.Bsize)) / uint64(1000000000)
|
||||||
if m.allowedMetrics["disk_total"] {
|
if m.allowedMetrics["disk_total"] {
|
||||||
y, err := lp.NewMessage(
|
y, err := lp.NewMessage("disk_total", tags, m.meta, map[string]interface{}{"value": total}, time.Now())
|
||||||
"disk_total",
|
|
||||||
tags,
|
|
||||||
m.meta,
|
|
||||||
map[string]any{
|
|
||||||
"value": total,
|
|
||||||
},
|
|
||||||
time.Now())
|
|
||||||
if err == nil {
|
if err == nil {
|
||||||
y.AddMeta("unit", "GBytes")
|
y.AddMeta("unit", "GBytes")
|
||||||
output <- y
|
output <- y
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
free := (stat.Bfree * uint64(stat.Bsize)) / uint64(1000_000_000)
|
free := (stat.Bfree * uint64(stat.Bsize)) / uint64(1000000000)
|
||||||
if m.allowedMetrics["disk_free"] {
|
if m.allowedMetrics["disk_free"] {
|
||||||
y, err := lp.NewMessage(
|
y, err := lp.NewMessage("disk_free", tags, m.meta, map[string]interface{}{"value": free}, time.Now())
|
||||||
"disk_free",
|
|
||||||
tags,
|
|
||||||
m.meta,
|
|
||||||
map[string]any{
|
|
||||||
"value": free,
|
|
||||||
},
|
|
||||||
time.Now())
|
|
||||||
if err == nil {
|
if err == nil {
|
||||||
y.AddMeta("unit", "GBytes")
|
y.AddMeta("unit", "GBytes")
|
||||||
output <- y
|
output <- y
|
||||||
@@ -164,16 +127,7 @@ mountLoop:
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
if m.allowedMetrics["part_max_used"] {
|
if m.allowedMetrics["part_max_used"] {
|
||||||
y, err := lp.NewMessage(
|
y, err := lp.NewMessage("part_max_used", map[string]string{"type": "node"}, m.meta, map[string]interface{}{"value": int(part_max_used)}, time.Now())
|
||||||
"part_max_used",
|
|
||||||
map[string]string{
|
|
||||||
"type": "node",
|
|
||||||
},
|
|
||||||
m.meta,
|
|
||||||
map[string]any{
|
|
||||||
"value": int(part_max_used),
|
|
||||||
},
|
|
||||||
time.Now())
|
|
||||||
if err == nil {
|
if err == nil {
|
||||||
y.AddMeta("unit", "percent")
|
y.AddMeta("unit", "percent")
|
||||||
output <- y
|
output <- y
|
||||||
|
|||||||
@@ -1,13 +1,3 @@
|
|||||||
<!--
|
|
||||||
---
|
|
||||||
title: Disk usage statistics metric collector
|
|
||||||
description: Collect metrics for various filesystems from `/proc/self/mounts`
|
|
||||||
categories: [cc-metric-collector]
|
|
||||||
tags: ['Admin']
|
|
||||||
weight: 2
|
|
||||||
hugo_path: docs/reference/cc-metric-collector/collectors/diskstat.md
|
|
||||||
---
|
|
||||||
-->
|
|
||||||
|
|
||||||
## `diskstat` collector
|
## `diskstat` collector
|
||||||
|
|
||||||
|
|||||||
@@ -1,308 +1,41 @@
|
|||||||
// Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
|
|
||||||
// All rights reserved. This file is part of cc-lib.
|
|
||||||
// Use of this source code is governed by a MIT-style
|
|
||||||
// license that can be found in the LICENSE file.
|
|
||||||
// additional authors:
|
|
||||||
// Holger Obermaier (NHR@KIT)
|
|
||||||
|
|
||||||
package collectors
|
package collectors
|
||||||
|
|
||||||
import (
|
import (
|
||||||
"bufio"
|
"bufio"
|
||||||
"bytes"
|
"bytes"
|
||||||
"encoding/json"
|
"encoding/json"
|
||||||
"errors"
|
|
||||||
"fmt"
|
"fmt"
|
||||||
"io"
|
"io"
|
||||||
|
"log"
|
||||||
"os/exec"
|
"os/exec"
|
||||||
"os/user"
|
"os/user"
|
||||||
"slices"
|
|
||||||
"strconv"
|
"strconv"
|
||||||
"strings"
|
"strings"
|
||||||
"syscall"
|
|
||||||
"time"
|
"time"
|
||||||
|
|
||||||
cclog "github.com/ClusterCockpit/cc-lib/v2/ccLogger"
|
cclog "github.com/ClusterCockpit/cc-lib/ccLogger"
|
||||||
lp "github.com/ClusterCockpit/cc-lib/v2/ccMessage"
|
lp "github.com/ClusterCockpit/cc-lib/ccMessage"
|
||||||
)
|
)
|
||||||
|
|
||||||
const DEFAULT_GPFS_CMD = "mmpmon"
|
const DEFAULT_GPFS_CMD = "mmpmon"
|
||||||
|
|
||||||
type GpfsCollectorState map[string]int64
|
type GpfsCollectorLastState struct {
|
||||||
|
bytesRead int64
|
||||||
type GpfsCollectorConfig struct {
|
bytesWritten int64
|
||||||
Mmpmon string `json:"mmpmon_path,omitempty"`
|
|
||||||
ExcludeFilesystems []string `json:"exclude_filesystem,omitempty"`
|
|
||||||
ExcludeMetrics []string `json:"exclude_metrics,omitempty"`
|
|
||||||
Sudo bool `json:"use_sudo,omitempty"`
|
|
||||||
SendAbsoluteValues bool `json:"send_abs_values,omitempty"`
|
|
||||||
SendDiffValues bool `json:"send_diff_values,omitempty"`
|
|
||||||
SendDerivedValues bool `json:"send_derived_values,omitempty"`
|
|
||||||
SendTotalValues bool `json:"send_total_values,omitempty"`
|
|
||||||
SendBandwidths bool `json:"send_bandwidths,omitempty"`
|
|
||||||
}
|
|
||||||
|
|
||||||
type GpfsMetricDefinition struct {
|
|
||||||
name string
|
|
||||||
desc string
|
|
||||||
prefix string
|
|
||||||
unit string
|
|
||||||
calc string
|
|
||||||
}
|
}
|
||||||
|
|
||||||
type GpfsCollector struct {
|
type GpfsCollector struct {
|
||||||
metricCollector
|
metricCollector
|
||||||
|
tags map[string]string
|
||||||
tags map[string]string
|
config struct {
|
||||||
config GpfsCollectorConfig
|
Mmpmon string `json:"mmpmon_path,omitempty"`
|
||||||
sudoCmd string
|
ExcludeFilesystem []string `json:"exclude_filesystem,omitempty"`
|
||||||
|
SendBandwidths bool `json:"send_bandwidths"`
|
||||||
|
SendTotalValues bool `json:"send_total_values"`
|
||||||
|
}
|
||||||
skipFS map[string]struct{}
|
skipFS map[string]struct{}
|
||||||
lastTimestamp map[string]time.Time // Store timestamp of lastState per filesystem to derive bandwidths
|
lastTimestamp time.Time // Store time stamp of last tick to derive bandwidths
|
||||||
definitions []GpfsMetricDefinition // all metrics to report
|
lastState map[string]GpfsCollectorLastState
|
||||||
lastState map[string]GpfsCollectorState // one GpfsCollectorState per filesystem
|
|
||||||
}
|
|
||||||
|
|
||||||
var GpfsAbsMetrics = []GpfsMetricDefinition{
|
|
||||||
{
|
|
||||||
name: "gpfs_num_opens",
|
|
||||||
desc: "number of opens",
|
|
||||||
prefix: "_oc_",
|
|
||||||
unit: "requests",
|
|
||||||
calc: "none",
|
|
||||||
},
|
|
||||||
{
|
|
||||||
name: "gpfs_num_closes",
|
|
||||||
desc: "number of closes",
|
|
||||||
prefix: "_cc_",
|
|
||||||
unit: "requests",
|
|
||||||
calc: "none",
|
|
||||||
},
|
|
||||||
{
|
|
||||||
name: "gpfs_num_reads",
|
|
||||||
desc: "number of reads",
|
|
||||||
prefix: "_rdc_",
|
|
||||||
unit: "requests",
|
|
||||||
calc: "none",
|
|
||||||
},
|
|
||||||
{
|
|
||||||
name: "gpfs_num_writes",
|
|
||||||
desc: "number of writes",
|
|
||||||
prefix: "_wc_",
|
|
||||||
unit: "requests",
|
|
||||||
calc: "none",
|
|
||||||
},
|
|
||||||
{
|
|
||||||
name: "gpfs_num_readdirs",
|
|
||||||
desc: "number of readdirs",
|
|
||||||
prefix: "_dir_",
|
|
||||||
unit: "requests",
|
|
||||||
calc: "none",
|
|
||||||
},
|
|
||||||
{
|
|
||||||
name: "gpfs_num_inode_updates",
|
|
||||||
desc: "number of Inode Updates",
|
|
||||||
prefix: "_iu_",
|
|
||||||
unit: "requests",
|
|
||||||
calc: "none",
|
|
||||||
},
|
|
||||||
{
|
|
||||||
name: "gpfs_bytes_read",
|
|
||||||
desc: "bytes read",
|
|
||||||
prefix: "_br_",
|
|
||||||
unit: "bytes",
|
|
||||||
calc: "none",
|
|
||||||
},
|
|
||||||
{
|
|
||||||
name: "gpfs_bytes_written",
|
|
||||||
desc: "bytes written",
|
|
||||||
prefix: "_bw_",
|
|
||||||
unit: "bytes",
|
|
||||||
calc: "none",
|
|
||||||
},
|
|
||||||
}
|
|
||||||
|
|
||||||
var GpfsDiffMetrics = []GpfsMetricDefinition{
|
|
||||||
{
|
|
||||||
name: "gpfs_num_opens_diff",
|
|
||||||
desc: "number of opens (diff)",
|
|
||||||
prefix: "_oc_",
|
|
||||||
unit: "requests",
|
|
||||||
calc: "difference",
|
|
||||||
},
|
|
||||||
{
|
|
||||||
name: "gpfs_num_closes_diff",
|
|
||||||
desc: "number of closes (diff)",
|
|
||||||
prefix: "_cc_",
|
|
||||||
unit: "requests",
|
|
||||||
calc: "difference",
|
|
||||||
},
|
|
||||||
{
|
|
||||||
name: "gpfs_num_reads_diff",
|
|
||||||
desc: "number of reads (diff)",
|
|
||||||
prefix: "_rdc_",
|
|
||||||
unit: "requests",
|
|
||||||
calc: "difference",
|
|
||||||
},
|
|
||||||
{
|
|
||||||
name: "gpfs_num_writes_diff",
|
|
||||||
desc: "number of writes (diff)",
|
|
||||||
prefix: "_wc_",
|
|
||||||
unit: "requests",
|
|
||||||
calc: "difference",
|
|
||||||
},
|
|
||||||
{
|
|
||||||
name: "gpfs_num_readdirs_diff",
|
|
||||||
desc: "number of readdirs (diff)",
|
|
||||||
prefix: "_dir_",
|
|
||||||
unit: "requests",
|
|
||||||
calc: "difference",
|
|
||||||
},
|
|
||||||
{
|
|
||||||
name: "gpfs_num_inode_updates_diff",
|
|
||||||
desc: "number of Inode Updates (diff)",
|
|
||||||
prefix: "_iu_",
|
|
||||||
unit: "requests",
|
|
||||||
calc: "difference",
|
|
||||||
},
|
|
||||||
{
|
|
||||||
name: "gpfs_bytes_read_diff",
|
|
||||||
desc: "bytes read (diff)",
|
|
||||||
prefix: "_br_",
|
|
||||||
unit: "bytes",
|
|
||||||
calc: "difference",
|
|
||||||
},
|
|
||||||
{
|
|
||||||
name: "gpfs_bytes_written_diff",
|
|
||||||
desc: "bytes written (diff)",
|
|
||||||
prefix: "_bw_",
|
|
||||||
unit: "bytes",
|
|
||||||
calc: "difference",
|
|
||||||
},
|
|
||||||
}
|
|
||||||
|
|
||||||
var GpfsDeriveMetrics = []GpfsMetricDefinition{
|
|
||||||
{
|
|
||||||
name: "gpfs_opens_rate",
|
|
||||||
desc: "number of opens (rate)",
|
|
||||||
prefix: "_oc_",
|
|
||||||
unit: "requests/sec",
|
|
||||||
calc: "derivative",
|
|
||||||
},
|
|
||||||
{
|
|
||||||
name: "gpfs_closes_rate",
|
|
||||||
desc: "number of closes (rate)",
|
|
||||||
prefix: "_oc_",
|
|
||||||
unit: "requests/sec",
|
|
||||||
calc: "derivative",
|
|
||||||
},
|
|
||||||
{
|
|
||||||
name: "gpfs_reads_rate",
|
|
||||||
desc: "number of reads (rate)",
|
|
||||||
prefix: "_rdc_",
|
|
||||||
unit: "requests/sec",
|
|
||||||
calc: "derivative",
|
|
||||||
},
|
|
||||||
{
|
|
||||||
name: "gpfs_writes_rate",
|
|
||||||
desc: "number of writes (rate)",
|
|
||||||
prefix: "_wc_",
|
|
||||||
unit: "requests/sec",
|
|
||||||
calc: "derivative",
|
|
||||||
},
|
|
||||||
{
|
|
||||||
name: "gpfs_readdirs_rate",
|
|
||||||
desc: "number of readdirs (rate)",
|
|
||||||
prefix: "_dir_",
|
|
||||||
unit: "requests/sec",
|
|
||||||
calc: "derivative",
|
|
||||||
},
|
|
||||||
{
|
|
||||||
name: "gpfs_inode_updates_rate",
|
|
||||||
desc: "number of Inode Updates (rate)",
|
|
||||||
prefix: "_iu_",
|
|
||||||
unit: "requests/sec",
|
|
||||||
calc: "derivative",
|
|
||||||
},
|
|
||||||
{
|
|
||||||
name: "gpfs_bw_read",
|
|
||||||
desc: "bytes read (rate)",
|
|
||||||
prefix: "_br_",
|
|
||||||
unit: "bytes/sec",
|
|
||||||
calc: "derivative",
|
|
||||||
},
|
|
||||||
{
|
|
||||||
name: "gpfs_bw_write",
|
|
||||||
desc: "bytes written (rate)",
|
|
||||||
prefix: "_bw_",
|
|
||||||
unit: "bytes/sec",
|
|
||||||
calc: "derivative",
|
|
||||||
},
|
|
||||||
}
|
|
||||||
|
|
||||||
var GpfsTotalMetrics = []GpfsMetricDefinition{
|
|
||||||
{
|
|
||||||
name: "gpfs_bytes_total",
|
|
||||||
desc: "bytes total",
|
|
||||||
prefix: "bytesTotal",
|
|
||||||
unit: "bytes",
|
|
||||||
calc: "none",
|
|
||||||
},
|
|
||||||
{
|
|
||||||
name: "gpfs_bytes_total_diff",
|
|
||||||
desc: "bytes total (diff)",
|
|
||||||
prefix: "bytesTotal",
|
|
||||||
unit: "bytes",
|
|
||||||
calc: "difference",
|
|
||||||
},
|
|
||||||
{
|
|
||||||
name: "gpfs_bw_total",
|
|
||||||
desc: "bytes total (rate)",
|
|
||||||
prefix: "bytesTotal",
|
|
||||||
unit: "bytes/sec",
|
|
||||||
calc: "derivative",
|
|
||||||
},
|
|
||||||
{
|
|
||||||
name: "gpfs_iops",
|
|
||||||
desc: "iops",
|
|
||||||
prefix: "iops",
|
|
||||||
unit: "requests",
|
|
||||||
calc: "none",
|
|
||||||
},
|
|
||||||
{
|
|
||||||
name: "gpfs_iops_diff",
|
|
||||||
desc: "iops (diff)",
|
|
||||||
prefix: "iops",
|
|
||||||
unit: "requests",
|
|
||||||
calc: "difference",
|
|
||||||
},
|
|
||||||
{
|
|
||||||
name: "gpfs_iops_rate",
|
|
||||||
desc: "iops (rate)",
|
|
||||||
prefix: "iops",
|
|
||||||
unit: "requests/sec",
|
|
||||||
calc: "derivative",
|
|
||||||
},
|
|
||||||
{
|
|
||||||
name: "gpfs_metaops",
|
|
||||||
desc: "metaops",
|
|
||||||
prefix: "metaops",
|
|
||||||
unit: "requests",
|
|
||||||
calc: "none",
|
|
||||||
},
|
|
||||||
{
|
|
||||||
name: "gpfs_metaops_diff",
|
|
||||||
desc: "metaops (diff)",
|
|
||||||
prefix: "metaops",
|
|
||||||
unit: "requests",
|
|
||||||
calc: "difference",
|
|
||||||
},
|
|
||||||
{
|
|
||||||
name: "gpfs_metaops_rate",
|
|
||||||
desc: "metaops (rate)",
|
|
||||||
prefix: "metaops",
|
|
||||||
unit: "requests/sec",
|
|
||||||
calc: "derivative",
|
|
||||||
},
|
|
||||||
}
|
}
|
||||||
|
|
||||||
func (m *GpfsCollector) Init(config json.RawMessage) error {
|
func (m *GpfsCollector) Init(config json.RawMessage) error {
|
||||||
@@ -311,10 +44,9 @@ func (m *GpfsCollector) Init(config json.RawMessage) error {
|
|||||||
return nil
|
return nil
|
||||||
}
|
}
|
||||||
|
|
||||||
|
var err error
|
||||||
m.name = "GpfsCollector"
|
m.name = "GpfsCollector"
|
||||||
if err := m.setup(); err != nil {
|
m.setup()
|
||||||
return fmt.Errorf("%s Init(): setup() call failed: %w", m.name, err)
|
|
||||||
}
|
|
||||||
m.parallel = true
|
m.parallel = true
|
||||||
|
|
||||||
// Set default mmpmon binary
|
// Set default mmpmon binary
|
||||||
@@ -322,10 +54,10 @@ func (m *GpfsCollector) Init(config json.RawMessage) error {
|
|||||||
|
|
||||||
// Read JSON configuration
|
// Read JSON configuration
|
||||||
if len(config) > 0 {
|
if len(config) > 0 {
|
||||||
d := json.NewDecoder(bytes.NewReader(config))
|
err = json.Unmarshal(config, &m.config)
|
||||||
d.DisallowUnknownFields()
|
if err != nil {
|
||||||
if err := d.Decode(&m.config); err != nil {
|
log.Print(err.Error())
|
||||||
return fmt.Errorf("%s Init(): failed to decode JSON config: %w", m.name, err)
|
return err
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
m.meta = map[string]string{
|
m.meta = map[string]string{
|
||||||
@@ -337,103 +69,27 @@ func (m *GpfsCollector) Init(config json.RawMessage) error {
|
|||||||
"filesystem": "",
|
"filesystem": "",
|
||||||
}
|
}
|
||||||
m.skipFS = make(map[string]struct{})
|
m.skipFS = make(map[string]struct{})
|
||||||
for _, fs := range m.config.ExcludeFilesystems {
|
for _, fs := range m.config.ExcludeFilesystem {
|
||||||
m.skipFS[fs] = struct{}{}
|
m.skipFS[fs] = struct{}{}
|
||||||
}
|
}
|
||||||
m.lastState = make(map[string]GpfsCollectorState)
|
m.lastState = make(map[string]GpfsCollectorLastState)
|
||||||
m.lastTimestamp = make(map[string]time.Time)
|
|
||||||
|
|
||||||
// GPFS / IBM Spectrum Scale file system statistics can only be queried by user root
|
// GPFS / IBM Spectrum Scale file system statistics can only be queried by user root
|
||||||
if !m.config.Sudo {
|
user, err := user.Current()
|
||||||
user, err := user.Current()
|
if err != nil {
|
||||||
if err != nil {
|
return fmt.Errorf("failed to get current user: %v", err)
|
||||||
return fmt.Errorf("%s Init(): failed to get current user: %w", m.name, err)
|
|
||||||
}
|
|
||||||
if user.Uid != "0" {
|
|
||||||
return fmt.Errorf("%s Init(): GPFS file system statistics can only be queried by user root", m.name)
|
|
||||||
}
|
|
||||||
} else {
|
|
||||||
p, err := exec.LookPath("sudo")
|
|
||||||
if err != nil {
|
|
||||||
return fmt.Errorf("%s Init(): cannot find 'sudo': %w", m.name, err)
|
|
||||||
}
|
|
||||||
m.sudoCmd = p
|
|
||||||
}
|
}
|
||||||
|
if user.Uid != "0" {
|
||||||
// when using sudo, the full path of mmpmon must be specified because
|
return fmt.Errorf("GPFS file system statistics can only be queried by user root")
|
||||||
// exec.LookPath will not work as mmpmon is not executable as user
|
|
||||||
if m.config.Sudo && !strings.HasPrefix(m.config.Mmpmon, "/") {
|
|
||||||
return fmt.Errorf("%s Init(): when using sudo, mmpmon_path must be provided and an absolute path: %s", m.name, m.config.Mmpmon)
|
|
||||||
}
|
}
|
||||||
|
|
||||||
// Check if mmpmon is in executable search path
|
// Check if mmpmon is in executable search path
|
||||||
p, err := exec.LookPath(m.config.Mmpmon)
|
p, err := exec.LookPath(m.config.Mmpmon)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
// if using sudo, exec.lookPath will return EACCES (file mode r-x------), this can be ignored
|
return fmt.Errorf("failed to find mmpmon binary '%s': %v", m.config.Mmpmon, err)
|
||||||
if m.config.Sudo && errors.Is(err, syscall.EACCES) {
|
|
||||||
cclog.ComponentWarn(m.name, fmt.Sprintf("got error looking for mmpmon binary '%s': %v . This is expected when using sudo, continuing.", m.config.Mmpmon, err))
|
|
||||||
// the file was given in the config, use it
|
|
||||||
p = m.config.Mmpmon
|
|
||||||
} else {
|
|
||||||
return fmt.Errorf("%s Init(): failed to find mmpmon binary '%s': %w", m.name, m.config.Mmpmon, err)
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
m.config.Mmpmon = p
|
m.config.Mmpmon = p
|
||||||
|
|
||||||
m.definitions = []GpfsMetricDefinition{}
|
|
||||||
if m.config.SendAbsoluteValues {
|
|
||||||
for _, def := range GpfsAbsMetrics {
|
|
||||||
if !slices.Contains(m.config.ExcludeMetrics, def.name) {
|
|
||||||
m.definitions = append(m.definitions, def)
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
if m.config.SendDiffValues {
|
|
||||||
for _, def := range GpfsDiffMetrics {
|
|
||||||
if !slices.Contains(m.config.ExcludeMetrics, def.name) {
|
|
||||||
m.definitions = append(m.definitions, def)
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
if m.config.SendDerivedValues {
|
|
||||||
for _, def := range GpfsDeriveMetrics {
|
|
||||||
if !slices.Contains(m.config.ExcludeMetrics, def.name) {
|
|
||||||
m.definitions = append(m.definitions, def)
|
|
||||||
}
|
|
||||||
}
|
|
||||||
} else if m.config.SendBandwidths {
|
|
||||||
for _, def := range GpfsDeriveMetrics {
|
|
||||||
if def.unit == "bytes/sec" {
|
|
||||||
if !slices.Contains(m.config.ExcludeMetrics, def.name) {
|
|
||||||
m.definitions = append(m.definitions, def)
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
if m.config.SendTotalValues {
|
|
||||||
for _, def := range GpfsTotalMetrics {
|
|
||||||
if !slices.Contains(m.config.ExcludeMetrics, def.name) {
|
|
||||||
// only send total metrics of the types requested
|
|
||||||
if (def.calc == "none" && m.config.SendAbsoluteValues) ||
|
|
||||||
(def.calc == "difference" && m.config.SendDiffValues) ||
|
|
||||||
(def.calc == "derivative" && m.config.SendDerivedValues) {
|
|
||||||
m.definitions = append(m.definitions, def)
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
} else if m.config.SendBandwidths {
|
|
||||||
for _, def := range GpfsTotalMetrics {
|
|
||||||
if def.unit == "bytes/sec" {
|
|
||||||
if !slices.Contains(m.config.ExcludeMetrics, def.name) {
|
|
||||||
m.definitions = append(m.definitions, def)
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
if len(m.definitions) == 0 {
|
|
||||||
return fmt.Errorf("%s Init(): no metrics to collect", m.name)
|
|
||||||
}
|
|
||||||
|
|
||||||
m.init = true
|
m.init = true
|
||||||
return nil
|
return nil
|
||||||
}
|
}
|
||||||
@@ -444,17 +100,18 @@ func (m *GpfsCollector) Read(interval time.Duration, output chan lp.CCMessage) {
|
|||||||
return
|
return
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Current time stamp
|
||||||
|
now := time.Now()
|
||||||
|
// time difference to last time stamp
|
||||||
|
timeDiff := now.Sub(m.lastTimestamp).Seconds()
|
||||||
|
// Save current timestamp
|
||||||
|
m.lastTimestamp = now
|
||||||
|
|
||||||
// mmpmon:
|
// mmpmon:
|
||||||
// -p: generate output that can be parsed
|
// -p: generate output that can be parsed
|
||||||
// -s: suppress the prompt on input
|
// -s: suppress the prompt on input
|
||||||
// fs_io_s: Displays I/O statistics per mounted file system
|
// fs_io_s: Displays I/O statistics per mounted file system
|
||||||
var cmd *exec.Cmd
|
cmd := exec.Command(m.config.Mmpmon, "-p", "-s")
|
||||||
if m.config.Sudo {
|
|
||||||
cmd = exec.Command(m.sudoCmd, m.config.Mmpmon, "-p", "-s")
|
|
||||||
} else {
|
|
||||||
cmd = exec.Command(m.config.Mmpmon, "-p", "-s")
|
|
||||||
}
|
|
||||||
|
|
||||||
cmd.Stdin = strings.NewReader("once fs_io_s\n")
|
cmd.Stdin = strings.NewReader("once fs_io_s\n")
|
||||||
cmdStdout := new(bytes.Buffer)
|
cmdStdout := new(bytes.Buffer)
|
||||||
cmdStderr := new(bytes.Buffer)
|
cmdStderr := new(bytes.Buffer)
|
||||||
@@ -497,7 +154,9 @@ func (m *GpfsCollector) Read(interval time.Duration, output chan lp.CCMessage) {
|
|||||||
|
|
||||||
filesystem, ok := key_value["_fs_"]
|
filesystem, ok := key_value["_fs_"]
|
||||||
if !ok {
|
if !ok {
|
||||||
cclog.ComponentError(m.name, "Read(): Failed to get filesystem name.")
|
cclog.ComponentError(
|
||||||
|
m.name,
|
||||||
|
"Read(): Failed to get filesystem name.")
|
||||||
continue
|
continue
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -509,141 +168,245 @@ func (m *GpfsCollector) Read(interval time.Duration, output chan lp.CCMessage) {
|
|||||||
// Add filesystem tag
|
// Add filesystem tag
|
||||||
m.tags["filesystem"] = filesystem
|
m.tags["filesystem"] = filesystem
|
||||||
|
|
||||||
if _, ok := m.lastState[filesystem]; !ok {
|
// Create initial last state
|
||||||
m.lastState[filesystem] = make(GpfsCollectorState)
|
if m.config.SendBandwidths {
|
||||||
|
if _, ok := m.lastState[filesystem]; !ok {
|
||||||
|
m.lastState[filesystem] = GpfsCollectorLastState{
|
||||||
|
bytesRead: -1,
|
||||||
|
bytesWritten: -1,
|
||||||
|
}
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// read the new values from mmpmon
|
|
||||||
// return code
|
// return code
|
||||||
rc, err := strconv.Atoi(key_value["_rc_"])
|
rc, err := strconv.Atoi(key_value["_rc_"])
|
||||||
if err != nil {
|
if err != nil {
|
||||||
cclog.ComponentError(m.name, fmt.Sprintf("Read(): Failed to convert return code '%s' to int: %v", key_value["_rc_"], err))
|
cclog.ComponentError(
|
||||||
|
m.name,
|
||||||
|
fmt.Sprintf("Read(): Failed to convert return code '%s' to int: %v", key_value["_rc_"], err))
|
||||||
continue
|
continue
|
||||||
}
|
}
|
||||||
if rc != 0 {
|
if rc != 0 {
|
||||||
cclog.ComponentError(m.name, fmt.Sprintf("Read(): Filesystem '%s' is not ok.", filesystem))
|
cclog.ComponentError(
|
||||||
|
m.name,
|
||||||
|
fmt.Sprintf("Read(): Filesystem '%s' is not ok.", filesystem))
|
||||||
continue
|
continue
|
||||||
}
|
}
|
||||||
|
|
||||||
// timestamp
|
|
||||||
sec, err := strconv.ParseInt(key_value["_t_"], 10, 64)
|
sec, err := strconv.ParseInt(key_value["_t_"], 10, 64)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
cclog.ComponentError(m.name, fmt.Sprintf("Read(): Failed to convert seconds '%s' to int64: %v", key_value["_t_"], err))
|
cclog.ComponentError(
|
||||||
|
m.name,
|
||||||
|
fmt.Sprintf("Read(): Failed to convert seconds '%s' to int64: %v", key_value["_t_"], err))
|
||||||
continue
|
continue
|
||||||
}
|
}
|
||||||
msec, err := strconv.ParseInt(key_value["_tu_"], 10, 64)
|
msec, err := strconv.ParseInt(key_value["_tu_"], 10, 64)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
cclog.ComponentError(m.name, fmt.Sprintf("Read(): Failed to convert micro seconds '%s' to int64: %v", key_value["_tu_"], err))
|
cclog.ComponentError(
|
||||||
|
m.name,
|
||||||
|
fmt.Sprintf("Read(): Failed to convert micro seconds '%s' to int64: %v", key_value["_tu_"], err))
|
||||||
continue
|
continue
|
||||||
}
|
}
|
||||||
timestamp := time.Unix(sec, msec*1000)
|
timestamp := time.Unix(sec, msec*1000)
|
||||||
|
|
||||||
// time difference to last time stamp
|
// bytes read
|
||||||
var timeDiff float64 = 0
|
bytesRead, err := strconv.ParseInt(key_value["_br_"], 10, 64)
|
||||||
if lastTime, ok := m.lastTimestamp[filesystem]; !ok {
|
if err != nil {
|
||||||
m.lastTimestamp[filesystem] = time.Time{}
|
cclog.ComponentError(
|
||||||
} else {
|
m.name,
|
||||||
timeDiff = timestamp.Sub(lastTime).Seconds()
|
fmt.Sprintf("Read(): Failed to convert bytes read '%s' to int64: %v", key_value["_br_"], err))
|
||||||
|
continue
|
||||||
}
|
}
|
||||||
|
if y, err :=
|
||||||
// get values of all abs metrics
|
lp.NewMessage(
|
||||||
newstate := make(GpfsCollectorState)
|
"gpfs_bytes_read",
|
||||||
for _, metric := range GpfsAbsMetrics {
|
m.tags,
|
||||||
value, err := strconv.ParseInt(key_value[metric.prefix], 10, 64)
|
m.meta,
|
||||||
if err != nil {
|
map[string]interface{}{
|
||||||
cclog.ComponentError(m.name, fmt.Sprintf("Read(): Failed to convert %s '%s' to int64: %v", metric.desc, key_value[metric.prefix], err))
|
"value": bytesRead,
|
||||||
continue
|
},
|
||||||
}
|
timestamp,
|
||||||
newstate[metric.prefix] = value
|
); err == nil {
|
||||||
|
y.AddMeta("unit", "bytes")
|
||||||
|
output <- y
|
||||||
}
|
}
|
||||||
|
if m.config.SendBandwidths {
|
||||||
// compute total metrics (map[...] will return 0 if key not found)
|
if lastBytesRead := m.lastState[filesystem].bytesRead; lastBytesRead >= 0 {
|
||||||
// bytes read and written
|
bwRead := float64(bytesRead-lastBytesRead) / timeDiff
|
||||||
if br, br_ok := newstate["_br_"]; br_ok {
|
if y, err :=
|
||||||
newstate["bytesTotal"] += br
|
lp.NewMessage(
|
||||||
}
|
"gpfs_bw_read",
|
||||||
if bw, bw_ok := newstate["_bw_"]; bw_ok {
|
m.tags,
|
||||||
newstate["bytesTotal"] += bw
|
m.meta,
|
||||||
}
|
map[string]interface{}{
|
||||||
// read and write count
|
"value": bwRead,
|
||||||
if rdc, rdc_ok := newstate["_rdc_"]; rdc_ok {
|
},
|
||||||
newstate["iops"] += rdc
|
timestamp,
|
||||||
}
|
); err == nil {
|
||||||
if wc, wc_ok := newstate["_wc_"]; wc_ok {
|
y.AddMeta("unit", "bytes/sec")
|
||||||
newstate["iops"] += wc
|
|
||||||
}
|
|
||||||
// meta operations
|
|
||||||
if oc, oc_ok := newstate["_oc_"]; oc_ok {
|
|
||||||
newstate["metaops"] += oc
|
|
||||||
}
|
|
||||||
if cc, cc_ok := newstate["_cc_"]; cc_ok {
|
|
||||||
newstate["metaops"] += cc
|
|
||||||
}
|
|
||||||
if dir, dir_ok := newstate["_dir_"]; dir_ok {
|
|
||||||
newstate["metaops"] += dir
|
|
||||||
}
|
|
||||||
if iu, iu_ok := newstate["_iu_"]; iu_ok {
|
|
||||||
newstate["metaops"] += iu
|
|
||||||
}
|
|
||||||
// send desired metrics for this filesystem
|
|
||||||
for _, metric := range m.definitions {
|
|
||||||
vold, vold_ok := m.lastState[filesystem][metric.prefix]
|
|
||||||
vnew, vnew_ok := newstate[metric.prefix]
|
|
||||||
var value any
|
|
||||||
value_ok := false
|
|
||||||
switch metric.calc {
|
|
||||||
case "none":
|
|
||||||
if vnew_ok {
|
|
||||||
value = vnew
|
|
||||||
value_ok = true
|
|
||||||
} else if vold_ok {
|
|
||||||
// for absolute values, if the new value is not available, report no change
|
|
||||||
value = vold
|
|
||||||
value_ok = true
|
|
||||||
}
|
|
||||||
case "difference":
|
|
||||||
if vnew_ok && vold_ok {
|
|
||||||
value = vnew - vold
|
|
||||||
if value.(int64) < 0 {
|
|
||||||
value = 0
|
|
||||||
}
|
|
||||||
value_ok = true
|
|
||||||
} else if vold_ok {
|
|
||||||
// if the difference is not computable, return 0
|
|
||||||
value = 0
|
|
||||||
value_ok = true
|
|
||||||
}
|
|
||||||
case "derivative":
|
|
||||||
if vnew_ok && vold_ok && timeDiff > 0 {
|
|
||||||
value = float64(vnew-vold) / timeDiff
|
|
||||||
if value.(float64) < 0.0 {
|
|
||||||
value = 0.0
|
|
||||||
}
|
|
||||||
value_ok = true
|
|
||||||
} else if vold_ok {
|
|
||||||
// if the difference is not computable, return 0
|
|
||||||
value = 0.0
|
|
||||||
value_ok = true
|
|
||||||
}
|
|
||||||
}
|
|
||||||
if value_ok {
|
|
||||||
y, err := lp.NewMetric(metric.name, m.tags, m.meta, value, timestamp)
|
|
||||||
if err == nil {
|
|
||||||
if len(metric.unit) > 0 {
|
|
||||||
y.AddMeta("unit", metric.unit)
|
|
||||||
}
|
|
||||||
output <- y
|
output <- y
|
||||||
}
|
}
|
||||||
} else {
|
|
||||||
// the value could not be computed correctly
|
|
||||||
cclog.ComponentWarn(m.name, fmt.Sprintf("Read(): Could not compute value for filesystem %s of metric %s: vold_ok = %t, vnew_ok = %t", filesystem, metric.name, vold_ok, vnew_ok))
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// Save new state, if it contains proper values
|
// bytes written
|
||||||
if len(newstate) > 0 {
|
bytesWritten, err := strconv.ParseInt(key_value["_bw_"], 10, 64)
|
||||||
m.lastState[filesystem] = newstate
|
if err != nil {
|
||||||
m.lastTimestamp[filesystem] = timestamp
|
cclog.ComponentError(
|
||||||
|
m.name,
|
||||||
|
fmt.Sprintf("Read(): Failed to convert bytes written '%s' to int64: %v", key_value["_bw_"], err))
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
if y, err :=
|
||||||
|
lp.NewMessage(
|
||||||
|
"gpfs_bytes_written",
|
||||||
|
m.tags,
|
||||||
|
m.meta,
|
||||||
|
map[string]interface{}{
|
||||||
|
"value": bytesWritten,
|
||||||
|
},
|
||||||
|
timestamp,
|
||||||
|
); err == nil {
|
||||||
|
y.AddMeta("unit", "bytes")
|
||||||
|
output <- y
|
||||||
|
}
|
||||||
|
if m.config.SendBandwidths {
|
||||||
|
if lastBytesWritten := m.lastState[filesystem].bytesRead; lastBytesWritten >= 0 {
|
||||||
|
bwWrite := float64(bytesWritten-lastBytesWritten) / timeDiff
|
||||||
|
if y, err :=
|
||||||
|
lp.NewMessage(
|
||||||
|
"gpfs_bw_write",
|
||||||
|
m.tags,
|
||||||
|
m.meta,
|
||||||
|
map[string]interface{}{
|
||||||
|
"value": bwWrite,
|
||||||
|
},
|
||||||
|
timestamp,
|
||||||
|
); err == nil {
|
||||||
|
y.AddMeta("unit", "bytes/sec")
|
||||||
|
output <- y
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if m.config.SendBandwidths {
|
||||||
|
m.lastState[filesystem] = GpfsCollectorLastState{
|
||||||
|
bytesRead: bytesRead,
|
||||||
|
bytesWritten: bytesWritten,
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// number of opens
|
||||||
|
numOpens, err := strconv.ParseInt(key_value["_oc_"], 10, 64)
|
||||||
|
if err != nil {
|
||||||
|
cclog.ComponentError(
|
||||||
|
m.name,
|
||||||
|
fmt.Sprintf("Read(): Failed to convert number of opens '%s' to int64: %v", key_value["_oc_"], err))
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
if y, err := lp.NewMessage("gpfs_num_opens", m.tags, m.meta, map[string]interface{}{"value": numOpens}, timestamp); err == nil {
|
||||||
|
output <- y
|
||||||
|
}
|
||||||
|
|
||||||
|
// number of closes
|
||||||
|
numCloses, err := strconv.ParseInt(key_value["_cc_"], 10, 64)
|
||||||
|
if err != nil {
|
||||||
|
cclog.ComponentError(
|
||||||
|
m.name,
|
||||||
|
fmt.Sprintf("Read(): Failed to convert number of closes: '%s' to int64: %v", key_value["_cc_"], err))
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
if y, err := lp.NewMessage("gpfs_num_closes", m.tags, m.meta, map[string]interface{}{"value": numCloses}, timestamp); err == nil {
|
||||||
|
output <- y
|
||||||
|
}
|
||||||
|
|
||||||
|
// number of reads
|
||||||
|
numReads, err := strconv.ParseInt(key_value["_rdc_"], 10, 64)
|
||||||
|
if err != nil {
|
||||||
|
cclog.ComponentError(
|
||||||
|
m.name,
|
||||||
|
fmt.Sprintf("Read(): Failed to convert number of reads: '%s' to int64: %v", key_value["_rdc_"], err))
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
if y, err := lp.NewMessage("gpfs_num_reads", m.tags, m.meta, map[string]interface{}{"value": numReads}, timestamp); err == nil {
|
||||||
|
output <- y
|
||||||
|
}
|
||||||
|
|
||||||
|
// number of writes
|
||||||
|
numWrites, err := strconv.ParseInt(key_value["_wc_"], 10, 64)
|
||||||
|
if err != nil {
|
||||||
|
cclog.ComponentError(
|
||||||
|
m.name,
|
||||||
|
fmt.Sprintf("Read(): Failed to convert number of writes: '%s' to int64: %v", key_value["_wc_"], err))
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
if y, err := lp.NewMessage("gpfs_num_writes", m.tags, m.meta, map[string]interface{}{"value": numWrites}, timestamp); err == nil {
|
||||||
|
output <- y
|
||||||
|
}
|
||||||
|
|
||||||
|
// number of read directories
|
||||||
|
numReaddirs, err := strconv.ParseInt(key_value["_dir_"], 10, 64)
|
||||||
|
if err != nil {
|
||||||
|
cclog.ComponentError(
|
||||||
|
m.name,
|
||||||
|
fmt.Sprintf("Read(): Failed to convert number of read directories: '%s' to int64: %v", key_value["_dir_"], err))
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
if y, err := lp.NewMessage("gpfs_num_readdirs", m.tags, m.meta, map[string]interface{}{"value": numReaddirs}, timestamp); err == nil {
|
||||||
|
output <- y
|
||||||
|
}
|
||||||
|
|
||||||
|
// Number of inode updates
|
||||||
|
numInodeUpdates, err := strconv.ParseInt(key_value["_iu_"], 10, 64)
|
||||||
|
if err != nil {
|
||||||
|
cclog.ComponentError(
|
||||||
|
m.name,
|
||||||
|
fmt.Sprintf("Read(): Failed to convert number of inode updates: '%s' to int: %v", key_value["_iu_"], err))
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
if y, err := lp.NewMessage("gpfs_num_inode_updates", m.tags, m.meta, map[string]interface{}{"value": numInodeUpdates}, timestamp); err == nil {
|
||||||
|
output <- y
|
||||||
|
}
|
||||||
|
|
||||||
|
// Total values
|
||||||
|
if m.config.SendTotalValues {
|
||||||
|
bytesTotal := bytesRead + bytesWritten
|
||||||
|
if y, err :=
|
||||||
|
lp.NewMessage("gpfs_bytes_total",
|
||||||
|
m.tags,
|
||||||
|
m.meta,
|
||||||
|
map[string]interface{}{
|
||||||
|
"value": bytesTotal,
|
||||||
|
},
|
||||||
|
timestamp,
|
||||||
|
); err == nil {
|
||||||
|
y.AddMeta("unit", "bytes")
|
||||||
|
output <- y
|
||||||
|
}
|
||||||
|
iops := numReads + numWrites
|
||||||
|
if y, err :=
|
||||||
|
lp.NewMessage("gpfs_iops",
|
||||||
|
m.tags,
|
||||||
|
m.meta,
|
||||||
|
map[string]interface{}{
|
||||||
|
"value": iops,
|
||||||
|
},
|
||||||
|
timestamp,
|
||||||
|
); err == nil {
|
||||||
|
output <- y
|
||||||
|
}
|
||||||
|
metaops := numInodeUpdates + numCloses + numOpens + numReaddirs
|
||||||
|
if y, err :=
|
||||||
|
lp.NewMessage("gpfs_metaops",
|
||||||
|
m.tags,
|
||||||
|
m.meta,
|
||||||
|
map[string]interface{}{
|
||||||
|
"value": metaops,
|
||||||
|
},
|
||||||
|
timestamp,
|
||||||
|
); err == nil {
|
||||||
|
output <- y
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -1,31 +1,13 @@
|
|||||||
<!--
|
|
||||||
---
|
|
||||||
title: GPFS collector
|
|
||||||
description: Collect infos about GPFS filesystems
|
|
||||||
categories: [cc-metric-collector]
|
|
||||||
tags: ['Admin']
|
|
||||||
weight: 2
|
|
||||||
hugo_path: docs/reference/cc-metric-collector/collectors/gpfs.md
|
|
||||||
---
|
|
||||||
-->
|
|
||||||
|
|
||||||
## `gpfs` collector
|
## `gpfs` collector
|
||||||
|
|
||||||
```json
|
```json
|
||||||
"gpfs": {
|
"ibstat": {
|
||||||
"mmpmon_path": "/path/to/mmpmon",
|
"mmpmon_path": "/path/to/mmpmon",
|
||||||
"use_sudo": true,
|
|
||||||
"exclude_filesystem": [
|
"exclude_filesystem": [
|
||||||
"fs1"
|
"fs1"
|
||||||
],
|
],
|
||||||
"exclude_metrics": [
|
"send_bandwidths": true,
|
||||||
"gpfs_bytes_written"
|
"send_total_values": true
|
||||||
],
|
|
||||||
"send_abs_values": true,
|
|
||||||
"send_diff_values": true,
|
|
||||||
"send_derived_values": true,
|
|
||||||
"send_total_values": true,
|
|
||||||
"send_bandwidths": true
|
|
||||||
}
|
}
|
||||||
```
|
```
|
||||||
|
|
||||||
@@ -34,63 +16,24 @@ GPFS / IBM Spectrum Scale filesystems.
|
|||||||
|
|
||||||
The reported filesystems can be filtered with the `exclude_filesystem` option
|
The reported filesystems can be filtered with the `exclude_filesystem` option
|
||||||
in the configuration.
|
in the configuration.
|
||||||
Individual metrics can be disabled for reporting using option `exclude_metrics`.
|
|
||||||
|
|
||||||
The path to the `mmpmon` command can be configured with the `mmpmon_path` option
|
The path to the `mmpmon` command can be configured with the `mmpmon_path` option
|
||||||
in the configuration. If nothing is set, the collector searches in `$PATH` for `mmpmon`.
|
in the configuration. If nothing is set, the collector searches in `$PATH` for `mmpmon`.
|
||||||
|
|
||||||
If cc-metric-collector is run as non-root, password-less `sudo` can be enabled with `use_sudo`.
|
|
||||||
Because `mmpmon` is by default only executable as root, the Go procedure to
|
|
||||||
search for it in `$PATH` will fail. If you use `sudo`, you must specify the
|
|
||||||
complete path for `mmpmon` using the parameter `mmpmon_path`.
|
|
||||||
|
|
||||||
|
|
||||||
Metrics:
|
Metrics:
|
||||||
* `gpfs_bytes_read` (if `send_abs_values == true`)
|
* `gpfs_bytes_read`
|
||||||
* `gpfs_bytes_written` (if `send_abs_values == true`)
|
* `gpfs_bytes_written`
|
||||||
* `gpfs_num_opens` (if `send_abs_values == true`)
|
* `gpfs_num_opens`
|
||||||
* `gpfs_num_closes` (if `send_abs_values == true`)
|
* `gpfs_num_closes`
|
||||||
* `gpfs_num_reads` (if `send_abs_values == true`)
|
* `gpfs_num_reads`
|
||||||
* `gpfs_num_writes` (if `send_abs_values == true`)
|
* `gpfs_num_writes`
|
||||||
* `gpfs_num_readdirs` (if `send_abs_values == true`)
|
* `gpfs_num_readdirs`
|
||||||
* `gpfs_num_inode_updates` (if `send_abs_values == true`)
|
* `gpfs_num_inode_updates`
|
||||||
* `gpfs_bytes_read_diff` (if `send_diff_values == true`)
|
* `gpfs_bytes_total = gpfs_bytes_read + gpfs_bytes_written` (if `send_total_values == true`)
|
||||||
* `gpfs_bytes_written_diff` (if `send_diff_values == true`)
|
* `gpfs_iops = gpfs_num_reads + gpfs_num_writes` (if `send_total_values == true`)
|
||||||
* `gpfs_num_opens_diff` (if `send_diff_values == true`)
|
* `gpfs_metaops = gpfs_num_inode_updates + gpfs_num_closes + gpfs_num_opens + gpfs_num_readdirs` (if `send_total_values == true`)
|
||||||
* `gpfs_num_closes_diff` (if `send_diff_values == true`)
|
* `gpfs_bw_read` (if `send_bandwidths == true`)
|
||||||
* `gpfs_num_reads_diff` (if `send_diff_values == true`)
|
* `gpfs_bw_write` (if `send_bandwidths == true`)
|
||||||
* `gpfs_num_writes_diff` (if `send_diff_values == true`)
|
|
||||||
* `gpfs_num_readdirs_diff` (if `send_diff_values == true`)
|
|
||||||
* `gpfs_num_inode_updates_diff` (if `send_diff_values == true`)
|
|
||||||
* `gpfs_bw_read` (if `send_derived_values == true` or `send_bandwidths == true`)
|
|
||||||
* `gpfs_bw_write` (if `send_derived_values == true` or `send_bandwidths == true`)
|
|
||||||
* `gpfs_opens_rate` (if `send_derived_values == true`)
|
|
||||||
* `gpfs_closes_rate` (if `send_derived_values == true`)
|
|
||||||
* `gpfs_reads_rate` (if `send_derived_values == true`)
|
|
||||||
* `gpfs_writes_rate` (if `send_derived_values == true`)
|
|
||||||
* `gpfs_readdirs_rate` (if `send_derived_values == true`)
|
|
||||||
* `gpfs_inode_updates_rate` (if `send_derived_values == true`)
|
|
||||||
* `gpfs_bytes_total = gpfs_bytes_read + gpfs_bytes_written` (if `send_total_values == true` and `send_abs_values == true`)
|
|
||||||
* `gpfs_bytes_total_diff` (if `send_total_values == true` and `send_diff_values == true`)
|
|
||||||
* `gpfs_bw_total` ((if `send_total_values == true` and `send_derived_values == true`) or `send_bandwidths == true`)
|
|
||||||
* `gpfs_iops = gpfs_num_reads + gpfs_num_writes` (if `send_total_values == true` and `send_abs_values == true`)
|
|
||||||
* `gpfs_iops_diff` (if `send_total_values == true` and `send_diff_values == true`)
|
|
||||||
* `gpfs_iops_rate` (if `send_total_values == true` and `send_derived_values == true`)
|
|
||||||
* `gpfs_metaops = gpfs_num_inode_updates + gpfs_num_closes + gpfs_num_opens + gpfs_num_readdirs` (if `send_total_values == true` and `send_abs_values == true`)
|
|
||||||
* `gpfs_metaops_diff` (if `send_total_values == true` and `send_diff_values == true`)
|
|
||||||
* `gpfs_metaops_rate` (if `send_total_values == true` and `send_derived_values == true`)
|
|
||||||
|
|
||||||
The collector adds a `filesystem` tag to all metrics
|
The collector adds a `filesystem` tag to all metrics
|
||||||
|
|
||||||
`mmpmon` typically requires root to run.
|
|
||||||
In order to run `cc-metric-collector` without root privileges, you can enable `use_sudo`.
|
|
||||||
Add a file like this in `/etc/sudoers.d/` to allow `cc-metric-collector` to run the required command:
|
|
||||||
|
|
||||||
```
|
|
||||||
# Do not log the following sudo commands from monitoring, since this causes a lot of log spam.
|
|
||||||
# However keep log_denied enabled, to detect failures
|
|
||||||
Defaults: monitoring !log_allowed, !pam_session
|
|
||||||
|
|
||||||
# Allow to use mmpmon
|
|
||||||
monitoring ALL = (root) NOPASSWD:/absolute/path/to/mmpmon -p -s
|
|
||||||
```
|
|
||||||
@@ -1,26 +1,18 @@
|
|||||||
// Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
|
|
||||||
// All rights reserved. This file is part of cc-lib.
|
|
||||||
// Use of this source code is governed by a MIT-style
|
|
||||||
// license that can be found in the LICENSE file.
|
|
||||||
// additional authors:
|
|
||||||
// Holger Obermaier (NHR@KIT)
|
|
||||||
|
|
||||||
package collectors
|
package collectors
|
||||||
|
|
||||||
import (
|
import (
|
||||||
"bytes"
|
|
||||||
"encoding/json"
|
|
||||||
"fmt"
|
"fmt"
|
||||||
"os"
|
"os"
|
||||||
|
|
||||||
|
cclog "github.com/ClusterCockpit/cc-lib/ccLogger"
|
||||||
|
lp "github.com/ClusterCockpit/cc-lib/ccMessage"
|
||||||
|
"golang.org/x/sys/unix"
|
||||||
|
|
||||||
|
"encoding/json"
|
||||||
"path/filepath"
|
"path/filepath"
|
||||||
"slices"
|
|
||||||
"strconv"
|
"strconv"
|
||||||
"strings"
|
"strings"
|
||||||
"time"
|
"time"
|
||||||
|
|
||||||
cclog "github.com/ClusterCockpit/cc-lib/v2/ccLogger"
|
|
||||||
lp "github.com/ClusterCockpit/cc-lib/v2/ccMessage"
|
|
||||||
"golang.org/x/sys/unix"
|
|
||||||
)
|
)
|
||||||
|
|
||||||
const IB_BASEPATH = "/sys/class/infiniband/"
|
const IB_BASEPATH = "/sys/class/infiniband/"
|
||||||
@@ -46,7 +38,6 @@ type InfinibandCollectorInfo struct {
|
|||||||
|
|
||||||
type InfinibandCollector struct {
|
type InfinibandCollector struct {
|
||||||
metricCollector
|
metricCollector
|
||||||
|
|
||||||
config struct {
|
config struct {
|
||||||
ExcludeDevices []string `json:"exclude_devices,omitempty"` // IB device to exclude e.g. mlx5_0
|
ExcludeDevices []string `json:"exclude_devices,omitempty"` // IB device to exclude e.g. mlx5_0
|
||||||
SendAbsoluteValues bool `json:"send_abs_values"` // Send absolut values as read from sys filesystem
|
SendAbsoluteValues bool `json:"send_abs_values"` // Send absolut values as read from sys filesystem
|
||||||
@@ -59,6 +50,7 @@ type InfinibandCollector struct {
|
|||||||
|
|
||||||
// Init initializes the Infiniband collector by walking through files below IB_BASEPATH
|
// Init initializes the Infiniband collector by walking through files below IB_BASEPATH
|
||||||
func (m *InfinibandCollector) Init(config json.RawMessage) error {
|
func (m *InfinibandCollector) Init(config json.RawMessage) error {
|
||||||
|
|
||||||
// Check if already initialized
|
// Check if already initialized
|
||||||
if m.init {
|
if m.init {
|
||||||
return nil
|
return nil
|
||||||
@@ -66,9 +58,7 @@ func (m *InfinibandCollector) Init(config json.RawMessage) error {
|
|||||||
|
|
||||||
var err error
|
var err error
|
||||||
m.name = "InfinibandCollector"
|
m.name = "InfinibandCollector"
|
||||||
if err := m.setup(); err != nil {
|
m.setup()
|
||||||
return fmt.Errorf("%s Init(): setup() call failed: %w", m.name, err)
|
|
||||||
}
|
|
||||||
m.parallel = true
|
m.parallel = true
|
||||||
m.meta = map[string]string{
|
m.meta = map[string]string{
|
||||||
"source": m.name,
|
"source": m.name,
|
||||||
@@ -80,10 +70,9 @@ func (m *InfinibandCollector) Init(config json.RawMessage) error {
|
|||||||
m.config.SendDerivedValues = false
|
m.config.SendDerivedValues = false
|
||||||
// Read configuration file, allow overwriting default config
|
// Read configuration file, allow overwriting default config
|
||||||
if len(config) > 0 {
|
if len(config) > 0 {
|
||||||
d := json.NewDecoder(bytes.NewReader(config))
|
err = json.Unmarshal(config, &m.config)
|
||||||
d.DisallowUnknownFields()
|
if err != nil {
|
||||||
if err := d.Decode(&m.config); err != nil {
|
return err
|
||||||
return fmt.Errorf("%s Init(): failed to decode JSON config: %w", m.name, err)
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -91,10 +80,10 @@ func (m *InfinibandCollector) Init(config json.RawMessage) error {
|
|||||||
globPattern := filepath.Join(IB_BASEPATH, "*", "ports", "*")
|
globPattern := filepath.Join(IB_BASEPATH, "*", "ports", "*")
|
||||||
ibDirs, err := filepath.Glob(globPattern)
|
ibDirs, err := filepath.Glob(globPattern)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return fmt.Errorf("%s Init(): unable to glob files with pattern %s: %w", m.name, globPattern, err)
|
return fmt.Errorf("unable to glob files with pattern %s: %v", globPattern, err)
|
||||||
}
|
}
|
||||||
if ibDirs == nil {
|
if ibDirs == nil {
|
||||||
return fmt.Errorf("%s Init(): unable to find any directories with pattern %s", m.name, globPattern)
|
return fmt.Errorf("unable to find any directories with pattern %s", globPattern)
|
||||||
}
|
}
|
||||||
|
|
||||||
for _, path := range ibDirs {
|
for _, path := range ibDirs {
|
||||||
@@ -115,7 +104,14 @@ func (m *InfinibandCollector) Init(config json.RawMessage) error {
|
|||||||
port := pathSplit[6]
|
port := pathSplit[6]
|
||||||
|
|
||||||
// Skip excluded devices
|
// Skip excluded devices
|
||||||
if slices.Contains(m.config.ExcludeDevices, device) {
|
skip := false
|
||||||
|
for _, excludedDevice := range m.config.ExcludeDevices {
|
||||||
|
if excludedDevice == device {
|
||||||
|
skip = true
|
||||||
|
break
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if skip {
|
||||||
continue
|
continue
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -158,7 +154,7 @@ func (m *InfinibandCollector) Init(config json.RawMessage) error {
|
|||||||
for _, counter := range portCounterFiles {
|
for _, counter := range portCounterFiles {
|
||||||
err := unix.Access(counter.path, unix.R_OK)
|
err := unix.Access(counter.path, unix.R_OK)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return fmt.Errorf("%s Init(): unable to access %s: %w", m.name, counter.path, err)
|
return fmt.Errorf("unable to access %s: %v", counter.path, err)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -178,7 +174,7 @@ func (m *InfinibandCollector) Init(config json.RawMessage) error {
|
|||||||
}
|
}
|
||||||
|
|
||||||
if len(m.info) == 0 {
|
if len(m.info) == 0 {
|
||||||
return fmt.Errorf("%s Init(): found no IB devices", m.name)
|
return fmt.Errorf("found no IB devices")
|
||||||
}
|
}
|
||||||
|
|
||||||
m.init = true
|
m.init = true
|
||||||
@@ -187,6 +183,7 @@ func (m *InfinibandCollector) Init(config json.RawMessage) error {
|
|||||||
|
|
||||||
// Read reads Infiniband counter files below IB_BASEPATH
|
// Read reads Infiniband counter files below IB_BASEPATH
|
||||||
func (m *InfinibandCollector) Read(interval time.Duration, output chan lp.CCMessage) {
|
func (m *InfinibandCollector) Read(interval time.Duration, output chan lp.CCMessage) {
|
||||||
|
|
||||||
// Check if already initialized
|
// Check if already initialized
|
||||||
if !m.init {
|
if !m.init {
|
||||||
return
|
return
|
||||||
@@ -232,14 +229,15 @@ func (m *InfinibandCollector) Read(interval time.Duration, output chan lp.CCMess
|
|||||||
|
|
||||||
// Send absolute values
|
// Send absolute values
|
||||||
if m.config.SendAbsoluteValues {
|
if m.config.SendAbsoluteValues {
|
||||||
if y, err := lp.NewMessage(
|
if y, err :=
|
||||||
counterDef.name,
|
lp.NewMessage(
|
||||||
info.tagSet,
|
counterDef.name,
|
||||||
m.meta,
|
info.tagSet,
|
||||||
map[string]any{
|
m.meta,
|
||||||
"value": counterDef.currentState,
|
map[string]interface{}{
|
||||||
},
|
"value": counterDef.currentState,
|
||||||
now); err == nil {
|
},
|
||||||
|
now); err == nil {
|
||||||
y.AddMeta("unit", counterDef.unit)
|
y.AddMeta("unit", counterDef.unit)
|
||||||
output <- y
|
output <- y
|
||||||
}
|
}
|
||||||
@@ -249,14 +247,15 @@ func (m *InfinibandCollector) Read(interval time.Duration, output chan lp.CCMess
|
|||||||
if m.config.SendDerivedValues {
|
if m.config.SendDerivedValues {
|
||||||
if counterDef.lastState >= 0 {
|
if counterDef.lastState >= 0 {
|
||||||
rate := float64((counterDef.currentState - counterDef.lastState)) / timeDiff
|
rate := float64((counterDef.currentState - counterDef.lastState)) / timeDiff
|
||||||
if y, err := lp.NewMessage(
|
if y, err :=
|
||||||
counterDef.name+"_bw",
|
lp.NewMessage(
|
||||||
info.tagSet,
|
counterDef.name+"_bw",
|
||||||
m.meta,
|
info.tagSet,
|
||||||
map[string]any{
|
m.meta,
|
||||||
"value": rate,
|
map[string]interface{}{
|
||||||
},
|
"value": rate,
|
||||||
now); err == nil {
|
},
|
||||||
|
now); err == nil {
|
||||||
y.AddMeta("unit", counterDef.unit+"/sec")
|
y.AddMeta("unit", counterDef.unit+"/sec")
|
||||||
output <- y
|
output <- y
|
||||||
|
|
||||||
@@ -278,26 +277,28 @@ func (m *InfinibandCollector) Read(interval time.Duration, output chan lp.CCMess
|
|||||||
|
|
||||||
// Send total values
|
// Send total values
|
||||||
if m.config.SendTotalValues {
|
if m.config.SendTotalValues {
|
||||||
if y, err := lp.NewMessage(
|
if y, err :=
|
||||||
"ib_total",
|
lp.NewMessage(
|
||||||
info.tagSet,
|
"ib_total",
|
||||||
m.meta,
|
info.tagSet,
|
||||||
map[string]any{
|
m.meta,
|
||||||
"value": ib_total,
|
map[string]interface{}{
|
||||||
},
|
"value": ib_total,
|
||||||
now); err == nil {
|
},
|
||||||
|
now); err == nil {
|
||||||
y.AddMeta("unit", "bytes")
|
y.AddMeta("unit", "bytes")
|
||||||
output <- y
|
output <- y
|
||||||
}
|
}
|
||||||
|
|
||||||
if y, err := lp.NewMessage(
|
if y, err :=
|
||||||
"ib_total_pkts",
|
lp.NewMessage(
|
||||||
info.tagSet,
|
"ib_total_pkts",
|
||||||
m.meta,
|
info.tagSet,
|
||||||
map[string]any{
|
m.meta,
|
||||||
"value": ib_total_pkts,
|
map[string]interface{}{
|
||||||
},
|
"value": ib_total_pkts,
|
||||||
now); err == nil {
|
},
|
||||||
|
now); err == nil {
|
||||||
y.AddMeta("unit", "packets")
|
y.AddMeta("unit", "packets")
|
||||||
output <- y
|
output <- y
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -1,13 +1,3 @@
|
|||||||
<!--
|
|
||||||
---
|
|
||||||
title: InfiniBand Metric collector
|
|
||||||
description: Collect metrics for InfiniBand devices
|
|
||||||
categories: [cc-metric-collector]
|
|
||||||
tags: ['Admin']
|
|
||||||
weight: 2
|
|
||||||
hugo_path: docs/reference/cc-metric-collector/collectors/infiniband.md
|
|
||||||
---
|
|
||||||
-->
|
|
||||||
|
|
||||||
## `ibstat` collector
|
## `ibstat` collector
|
||||||
|
|
||||||
|
|||||||
@@ -1,60 +1,49 @@
|
|||||||
// Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
|
|
||||||
// All rights reserved. This file is part of cc-lib.
|
|
||||||
// Use of this source code is governed by a MIT-style
|
|
||||||
// license that can be found in the LICENSE file.
|
|
||||||
// additional authors:
|
|
||||||
// Holger Obermaier (NHR@KIT)
|
|
||||||
|
|
||||||
package collectors
|
package collectors
|
||||||
|
|
||||||
import (
|
import (
|
||||||
"bufio"
|
"bufio"
|
||||||
"bytes"
|
|
||||||
"encoding/json"
|
"encoding/json"
|
||||||
"fmt"
|
"errors"
|
||||||
"os"
|
"os"
|
||||||
"slices"
|
|
||||||
"strconv"
|
"strconv"
|
||||||
"strings"
|
"strings"
|
||||||
"time"
|
"time"
|
||||||
|
|
||||||
cclog "github.com/ClusterCockpit/cc-lib/v2/ccLogger"
|
cclog "github.com/ClusterCockpit/cc-lib/ccLogger"
|
||||||
lp "github.com/ClusterCockpit/cc-lib/v2/ccMessage"
|
lp "github.com/ClusterCockpit/cc-lib/ccMessage"
|
||||||
)
|
)
|
||||||
|
|
||||||
|
// Constant for the path to /proc/diskstats
|
||||||
const IOSTATFILE = `/proc/diskstats`
|
const IOSTATFILE = `/proc/diskstats`
|
||||||
|
|
||||||
type IOstatCollectorConfig struct {
|
type IOstatCollectorConfig struct {
|
||||||
ExcludeMetrics []string `json:"exclude_metrics,omitempty"`
|
ExcludeMetrics []string `json:"exclude_metrics,omitempty"`
|
||||||
|
// New field for excluding devices via the JSON configuration
|
||||||
ExcludeDevices []string `json:"exclude_devices,omitempty"`
|
ExcludeDevices []string `json:"exclude_devices,omitempty"`
|
||||||
}
|
}
|
||||||
|
|
||||||
type IOstatCollectorEntry struct {
|
type IOstatCollectorEntry struct {
|
||||||
currentValues map[string]int64
|
lastValues map[string]int64
|
||||||
lastValues map[string]int64
|
tags map[string]string
|
||||||
tags map[string]string
|
|
||||||
}
|
}
|
||||||
|
|
||||||
type IOstatCollector struct {
|
type IOstatCollector struct {
|
||||||
metricCollector
|
metricCollector
|
||||||
|
|
||||||
matches map[string]int
|
matches map[string]int
|
||||||
config IOstatCollectorConfig
|
config IOstatCollectorConfig
|
||||||
devices map[string]IOstatCollectorEntry
|
devices map[string]IOstatCollectorEntry
|
||||||
}
|
}
|
||||||
|
|
||||||
func (m *IOstatCollector) Init(config json.RawMessage) error {
|
func (m *IOstatCollector) Init(config json.RawMessage) error {
|
||||||
|
var err error
|
||||||
m.name = "IOstatCollector"
|
m.name = "IOstatCollector"
|
||||||
m.parallel = true
|
m.parallel = true
|
||||||
m.meta = map[string]string{"source": m.name, "group": "Disk"}
|
m.meta = map[string]string{"source": m.name, "group": "Disk"}
|
||||||
if err := m.setup(); err != nil {
|
m.setup()
|
||||||
return fmt.Errorf("%s Init(): setup() call failed: %w", m.name, err)
|
|
||||||
}
|
|
||||||
if len(config) > 0 {
|
if len(config) > 0 {
|
||||||
d := json.NewDecoder(bytes.NewReader(config))
|
err = json.Unmarshal(config, &m.config)
|
||||||
d.DisallowUnknownFields()
|
if err != nil {
|
||||||
if err := d.Decode(&m.config); err != nil {
|
return err
|
||||||
return fmt.Errorf("%s Init(): Error decoding JSON config: %w", m.name, err)
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
// https://www.kernel.org/doc/html/latest/admin-guide/iostats.html
|
// https://www.kernel.org/doc/html/latest/admin-guide/iostats.html
|
||||||
@@ -80,17 +69,19 @@ func (m *IOstatCollector) Init(config json.RawMessage) error {
|
|||||||
m.devices = make(map[string]IOstatCollectorEntry)
|
m.devices = make(map[string]IOstatCollectorEntry)
|
||||||
m.matches = make(map[string]int)
|
m.matches = make(map[string]int)
|
||||||
for k, v := range matches {
|
for k, v := range matches {
|
||||||
if !slices.Contains(m.config.ExcludeMetrics, k) {
|
if _, skip := stringArrayContains(m.config.ExcludeMetrics, k); !skip {
|
||||||
m.matches[k] = v
|
m.matches[k] = v
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
if len(m.matches) == 0 {
|
if len(m.matches) == 0 {
|
||||||
return fmt.Errorf("%s Init(): no metrics to collect", m.name)
|
return errors.New("no metrics to collect")
|
||||||
}
|
}
|
||||||
file, err := os.Open(IOSTATFILE)
|
file, err := os.Open(IOSTATFILE)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return fmt.Errorf("%s Init(): Failed to open file \"%s\": %w", m.name, IOSTATFILE, err)
|
cclog.ComponentError(m.name, err.Error())
|
||||||
|
return err
|
||||||
}
|
}
|
||||||
|
defer file.Close()
|
||||||
|
|
||||||
scanner := bufio.NewScanner(file)
|
scanner := bufio.NewScanner(file)
|
||||||
for scanner.Scan() {
|
for scanner.Scan() {
|
||||||
@@ -104,38 +95,23 @@ func (m *IOstatCollector) Init(config json.RawMessage) error {
|
|||||||
if strings.Contains(device, "loop") {
|
if strings.Contains(device, "loop") {
|
||||||
continue
|
continue
|
||||||
}
|
}
|
||||||
if slices.Contains(m.config.ExcludeDevices, device) {
|
if _, skip := stringArrayContains(m.config.ExcludeDevices, device); skip {
|
||||||
continue
|
continue
|
||||||
}
|
}
|
||||||
currentValues := make(map[string]int64)
|
values := make(map[string]int64)
|
||||||
lastValues := make(map[string]int64)
|
|
||||||
for m := range m.matches {
|
for m := range m.matches {
|
||||||
currentValues[m] = 0
|
values[m] = 0
|
||||||
lastValues[m] = 0
|
|
||||||
}
|
|
||||||
for name, idx := range m.matches {
|
|
||||||
if idx < len(linefields) {
|
|
||||||
if value, err := strconv.ParseInt(linefields[idx], 0, 64); err == nil {
|
|
||||||
currentValues[name] = value
|
|
||||||
lastValues[name] = value // Set last to current for first read
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
m.devices[device] = IOstatCollectorEntry{
|
m.devices[device] = IOstatCollectorEntry{
|
||||||
tags: map[string]string{
|
tags: map[string]string{
|
||||||
"device": device,
|
"device": device,
|
||||||
"type": "node",
|
"type": "node",
|
||||||
},
|
},
|
||||||
currentValues: currentValues,
|
lastValues: values,
|
||||||
lastValues: lastValues,
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
if err := file.Close(); err != nil {
|
|
||||||
return fmt.Errorf("%s Init(): Failed to close file \"%s\": %w", m.name, IOSTATFILE, err)
|
|
||||||
}
|
|
||||||
|
|
||||||
m.init = true
|
m.init = true
|
||||||
return nil
|
return err
|
||||||
}
|
}
|
||||||
|
|
||||||
func (m *IOstatCollector) Read(interval time.Duration, output chan lp.CCMessage) {
|
func (m *IOstatCollector) Read(interval time.Duration, output chan lp.CCMessage) {
|
||||||
@@ -145,18 +121,10 @@ func (m *IOstatCollector) Read(interval time.Duration, output chan lp.CCMessage)
|
|||||||
|
|
||||||
file, err := os.Open(IOSTATFILE)
|
file, err := os.Open(IOSTATFILE)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
cclog.ComponentError(
|
cclog.ComponentError(m.name, err.Error())
|
||||||
m.name,
|
|
||||||
fmt.Sprintf("Read(): Failed to open file '%s': %v", IOSTATFILE, err))
|
|
||||||
return
|
return
|
||||||
}
|
}
|
||||||
defer func() {
|
defer file.Close()
|
||||||
if err := file.Close(); err != nil {
|
|
||||||
cclog.ComponentError(
|
|
||||||
m.name,
|
|
||||||
fmt.Sprintf("Read(): Failed to close file '%s': %v", IOSTATFILE, err))
|
|
||||||
}
|
|
||||||
}()
|
|
||||||
|
|
||||||
scanner := bufio.NewScanner(file)
|
scanner := bufio.NewScanner(file)
|
||||||
for scanner.Scan() {
|
for scanner.Scan() {
|
||||||
@@ -172,28 +140,24 @@ func (m *IOstatCollector) Read(interval time.Duration, output chan lp.CCMessage)
|
|||||||
if strings.Contains(device, "loop") {
|
if strings.Contains(device, "loop") {
|
||||||
continue
|
continue
|
||||||
}
|
}
|
||||||
if slices.Contains(m.config.ExcludeDevices, device) {
|
if _, skip := stringArrayContains(m.config.ExcludeDevices, device); skip {
|
||||||
continue
|
continue
|
||||||
}
|
}
|
||||||
if _, ok := m.devices[device]; !ok {
|
if _, ok := m.devices[device]; !ok {
|
||||||
continue
|
continue
|
||||||
}
|
}
|
||||||
// Update current and last values
|
|
||||||
entry := m.devices[device]
|
entry := m.devices[device]
|
||||||
for name, idx := range m.matches {
|
for name, idx := range m.matches {
|
||||||
if idx < len(linefields) {
|
if idx < len(linefields) {
|
||||||
x, err := strconv.ParseInt(linefields[idx], 0, 64)
|
x, err := strconv.ParseInt(linefields[idx], 0, 64)
|
||||||
if err == nil {
|
if err == nil {
|
||||||
// Calculate difference using previous current and new value
|
diff := x - entry.lastValues[name]
|
||||||
diff := x - entry.currentValues[name]
|
y, err := lp.NewMessage(name, entry.tags, m.meta, map[string]interface{}{"value": int(diff)}, time.Now())
|
||||||
y, err := lp.NewMetric(name, entry.tags, m.meta, int(diff), time.Now())
|
|
||||||
if err == nil {
|
if err == nil {
|
||||||
output <- y
|
output <- y
|
||||||
}
|
}
|
||||||
// Update last to previous current, and current to new value
|
|
||||||
entry.lastValues[name] = entry.currentValues[name]
|
|
||||||
entry.currentValues[name] = x
|
|
||||||
}
|
}
|
||||||
|
entry.lastValues[name] = x
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
m.devices[device] = entry
|
m.devices[device] = entry
|
||||||
|
|||||||
@@ -1,13 +1,3 @@
|
|||||||
<!--
|
|
||||||
---
|
|
||||||
title: IOStat Metric collector
|
|
||||||
description: Collect metrics from `/proc/diskstats`
|
|
||||||
categories: [cc-metric-collector]
|
|
||||||
tags: ['Admin']
|
|
||||||
weight: 2
|
|
||||||
hugo_path: docs/reference/cc-metric-collector/collectors/iostat.md
|
|
||||||
---
|
|
||||||
-->
|
|
||||||
|
|
||||||
## `iostat` collector
|
## `iostat` collector
|
||||||
|
|
||||||
|
|||||||
@@ -1,38 +1,31 @@
|
|||||||
// Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
|
|
||||||
// All rights reserved. This file is part of cc-lib.
|
|
||||||
// Use of this source code is governed by a MIT-style
|
|
||||||
// license that can be found in the LICENSE file.
|
|
||||||
// additional authors:
|
|
||||||
// Holger Obermaier (NHR@KIT)
|
|
||||||
|
|
||||||
package collectors
|
package collectors
|
||||||
|
|
||||||
import (
|
import (
|
||||||
"bufio"
|
"bufio"
|
||||||
"bytes"
|
"bytes"
|
||||||
"encoding/json"
|
"encoding/json"
|
||||||
|
"errors"
|
||||||
"fmt"
|
"fmt"
|
||||||
"io"
|
"io"
|
||||||
|
"log"
|
||||||
"os/exec"
|
"os/exec"
|
||||||
"strconv"
|
"strconv"
|
||||||
"strings"
|
"strings"
|
||||||
"time"
|
"time"
|
||||||
|
|
||||||
cclog "github.com/ClusterCockpit/cc-lib/v2/ccLogger"
|
cclog "github.com/ClusterCockpit/cc-lib/ccLogger"
|
||||||
lp "github.com/ClusterCockpit/cc-lib/v2/ccMessage"
|
lp "github.com/ClusterCockpit/cc-lib/ccMessage"
|
||||||
)
|
)
|
||||||
|
|
||||||
const IPMISENSORS_PATH = `ipmi-sensors`
|
const IPMISENSORS_PATH = `ipmi-sensors`
|
||||||
|
|
||||||
type IpmiCollector struct {
|
type IpmiCollector struct {
|
||||||
metricCollector
|
metricCollector
|
||||||
|
|
||||||
config struct {
|
config struct {
|
||||||
|
ExcludeDevices []string `json:"exclude_devices"`
|
||||||
IpmitoolPath string `json:"ipmitool_path"`
|
IpmitoolPath string `json:"ipmitool_path"`
|
||||||
IpmisensorsPath string `json:"ipmisensors_path"`
|
IpmisensorsPath string `json:"ipmisensors_path"`
|
||||||
Sudo bool `json:"use_sudo"`
|
|
||||||
}
|
}
|
||||||
|
|
||||||
ipmitool string
|
ipmitool string
|
||||||
ipmisensors string
|
ipmisensors string
|
||||||
}
|
}
|
||||||
@@ -44,9 +37,7 @@ func (m *IpmiCollector) Init(config json.RawMessage) error {
|
|||||||
}
|
}
|
||||||
|
|
||||||
m.name = "IpmiCollector"
|
m.name = "IpmiCollector"
|
||||||
if err := m.setup(); err != nil {
|
m.setup()
|
||||||
return fmt.Errorf("%s Init(): setup() call failed: %w", m.name, err)
|
|
||||||
}
|
|
||||||
m.parallel = true
|
m.parallel = true
|
||||||
m.meta = map[string]string{
|
m.meta = map[string]string{
|
||||||
"source": m.name,
|
"source": m.name,
|
||||||
@@ -55,75 +46,58 @@ func (m *IpmiCollector) Init(config json.RawMessage) error {
|
|||||||
// default path to IPMI tools
|
// default path to IPMI tools
|
||||||
m.config.IpmitoolPath = "ipmitool"
|
m.config.IpmitoolPath = "ipmitool"
|
||||||
m.config.IpmisensorsPath = "ipmi-sensors"
|
m.config.IpmisensorsPath = "ipmi-sensors"
|
||||||
|
|
||||||
if len(config) > 0 {
|
if len(config) > 0 {
|
||||||
d := json.NewDecoder(bytes.NewReader(config))
|
err := json.Unmarshal(config, &m.config)
|
||||||
d.DisallowUnknownFields()
|
if err != nil {
|
||||||
if err := d.Decode(&m.config); err != nil {
|
return err
|
||||||
return fmt.Errorf("%s Init(): Error decoding JSON config: %w", m.name, err)
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
// Check if executables ipmitool or ipmisensors are found
|
||||||
m.ipmitool = m.config.IpmitoolPath
|
p, err := exec.LookPath(m.config.IpmitoolPath)
|
||||||
m.ipmisensors = m.config.IpmisensorsPath
|
if err == nil {
|
||||||
|
command := exec.Command(p)
|
||||||
// Test if any of the supported backends work
|
err := command.Run()
|
||||||
var dummyChan chan lp.CCMessage
|
if err != nil {
|
||||||
dummyConsumer := func() {
|
cclog.ComponentError(m.name, fmt.Sprintf("Failed to execute %s: %v", p, err.Error()))
|
||||||
for range dummyChan {
|
m.ipmitool = ""
|
||||||
|
} else {
|
||||||
|
m.ipmitool = p
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
p, err = exec.LookPath(m.config.IpmisensorsPath)
|
||||||
// Test if ipmi-sensors works (preferred over ipmitool, because it's faster)
|
if err == nil {
|
||||||
var ipmiSensorsErr error
|
command := exec.Command(p)
|
||||||
if _, ipmiSensorsErr = exec.LookPath(m.ipmisensors); ipmiSensorsErr == nil {
|
err := command.Run()
|
||||||
dummyChan = make(chan lp.CCMessage)
|
if err != nil {
|
||||||
go dummyConsumer()
|
cclog.ComponentError(m.name, fmt.Sprintf("Failed to execute %s: %v", p, err.Error()))
|
||||||
ipmiSensorsErr = m.readIpmiSensors(dummyChan)
|
m.ipmisensors = ""
|
||||||
close(dummyChan)
|
} else {
|
||||||
if ipmiSensorsErr == nil {
|
m.ipmisensors = p
|
||||||
cclog.ComponentDebugf(m.name, "Using ipmi-sensors for ipmistat collector")
|
|
||||||
m.init = true
|
|
||||||
return nil
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
cclog.ComponentDebugf(m.name, "Unable to use ipmi-sensors for ipmistat collector: %v", ipmiSensorsErr)
|
if len(m.ipmitool) == 0 && len(m.ipmisensors) == 0 {
|
||||||
m.ipmisensors = ""
|
return errors.New("no usable IPMI reader found")
|
||||||
|
|
||||||
// Test if ipmitool works (may be very slow)
|
|
||||||
var ipmiToolErr error
|
|
||||||
if _, ipmiToolErr = exec.LookPath(m.ipmitool); ipmiToolErr == nil {
|
|
||||||
dummyChan = make(chan lp.CCMessage)
|
|
||||||
go dummyConsumer()
|
|
||||||
ipmiToolErr = m.readIpmiTool(dummyChan)
|
|
||||||
close(dummyChan)
|
|
||||||
if ipmiToolErr == nil {
|
|
||||||
cclog.ComponentDebugf(m.name, "Using ipmitool for ipmistat collector")
|
|
||||||
m.init = true
|
|
||||||
return nil
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
m.ipmitool = ""
|
|
||||||
cclog.ComponentDebugf(m.name, "Unable to use ipmitool for ipmistat collector: %v", ipmiToolErr)
|
|
||||||
|
|
||||||
return fmt.Errorf("unable to init neither ipmitool (%w) nor ipmi-sensors (%w)", ipmiToolErr, ipmiSensorsErr)
|
m.init = true
|
||||||
|
return nil
|
||||||
}
|
}
|
||||||
|
|
||||||
func (m *IpmiCollector) readIpmiTool(output chan lp.CCMessage) error {
|
func (m *IpmiCollector) readIpmiTool(cmd string, output chan lp.CCMessage) {
|
||||||
|
|
||||||
// Setup ipmitool command
|
// Setup ipmitool command
|
||||||
argv := make([]string, 0)
|
command := exec.Command(cmd, "sensor")
|
||||||
if m.config.Sudo {
|
|
||||||
argv = append(argv, "sudo", "-n")
|
|
||||||
}
|
|
||||||
argv = append(argv, m.ipmitool, "sensor")
|
|
||||||
command := exec.Command(argv[0], argv[1:]...)
|
|
||||||
stdout, _ := command.StdoutPipe()
|
stdout, _ := command.StdoutPipe()
|
||||||
errBuf := new(bytes.Buffer)
|
errBuf := new(bytes.Buffer)
|
||||||
command.Stderr = errBuf
|
command.Stderr = errBuf
|
||||||
|
|
||||||
// start command
|
// start command
|
||||||
if err := command.Start(); err != nil {
|
if err := command.Start(); err != nil {
|
||||||
return fmt.Errorf("failed to start command '%s': %w", command.String(), err)
|
cclog.ComponentError(
|
||||||
|
m.name,
|
||||||
|
fmt.Sprintf("readIpmiTool(): Failed to start command \"%s\": %v", command.String(), err),
|
||||||
|
)
|
||||||
|
return
|
||||||
}
|
}
|
||||||
|
|
||||||
// Read command output
|
// Read command output
|
||||||
@@ -133,118 +107,81 @@ func (m *IpmiCollector) readIpmiTool(output chan lp.CCMessage) error {
|
|||||||
if len(lv) < 3 {
|
if len(lv) < 3 {
|
||||||
continue
|
continue
|
||||||
}
|
}
|
||||||
|
|
||||||
if strings.TrimSpace(lv[1]) == "0x0" || strings.TrimSpace(lv[1]) == "na" {
|
|
||||||
// Ignore known non-float values
|
|
||||||
continue
|
|
||||||
}
|
|
||||||
|
|
||||||
v, err := strconv.ParseFloat(strings.TrimSpace(lv[1]), 64)
|
v, err := strconv.ParseFloat(strings.TrimSpace(lv[1]), 64)
|
||||||
if err != nil {
|
if err == nil {
|
||||||
cclog.ComponentErrorf(m.name, "Failed to parse float '%s': %v", lv[1], err)
|
name := strings.ToLower(strings.Replace(strings.TrimSpace(lv[0]), " ", "_", -1))
|
||||||
continue
|
unit := strings.TrimSpace(lv[2])
|
||||||
}
|
if unit == "Volts" {
|
||||||
name := strings.ToLower(strings.ReplaceAll(strings.TrimSpace(lv[0]), " ", "_"))
|
unit = "Volts"
|
||||||
unit := strings.TrimSpace(lv[2])
|
} else if unit == "degrees C" {
|
||||||
switch unit {
|
unit = "degC"
|
||||||
case "Volts":
|
} else if unit == "degrees F" {
|
||||||
unit = "Volts"
|
unit = "degF"
|
||||||
case "degrees C":
|
} else if unit == "Watts" {
|
||||||
unit = "degC"
|
unit = "Watts"
|
||||||
case "degrees F":
|
}
|
||||||
unit = "degF"
|
|
||||||
case "Watts":
|
|
||||||
unit = "Watts"
|
|
||||||
}
|
|
||||||
|
|
||||||
y, err := lp.NewMessage(name, map[string]string{"type": "node"}, m.meta, map[string]any{"value": v}, time.Now())
|
y, err := lp.NewMessage(name, map[string]string{"type": "node"}, m.meta, map[string]interface{}{"value": v}, time.Now())
|
||||||
if err != nil {
|
if err == nil {
|
||||||
cclog.ComponentErrorf(m.name, "Failed to create message: %v", err)
|
y.AddMeta("unit", unit)
|
||||||
continue
|
output <- y
|
||||||
|
}
|
||||||
}
|
}
|
||||||
y.AddMeta("unit", unit)
|
|
||||||
output <- y
|
|
||||||
}
|
}
|
||||||
|
|
||||||
// Wait for command end
|
// Wait for command end
|
||||||
if err := command.Wait(); err != nil {
|
if err := command.Wait(); err != nil {
|
||||||
errMsg, _ := io.ReadAll(errBuf)
|
errMsg, _ := io.ReadAll(errBuf)
|
||||||
return fmt.Errorf("failed to complete command '%s': %w (stderr: %s)", command.String(), err, strings.TrimSpace(string(errMsg)))
|
cclog.ComponentError(
|
||||||
|
m.name,
|
||||||
|
fmt.Sprintf("readIpmiTool(): Failed to wait for the end of command \"%s\": %v\n", command.String(), err),
|
||||||
|
)
|
||||||
|
cclog.ComponentError(m.name, fmt.Sprintf("readIpmiTool(): command stderr: \"%s\"\n", strings.TrimSpace(string(errMsg))))
|
||||||
|
return
|
||||||
}
|
}
|
||||||
|
|
||||||
return nil
|
|
||||||
}
|
}
|
||||||
|
|
||||||
func (m *IpmiCollector) readIpmiSensors(output chan lp.CCMessage) error {
|
func (m *IpmiCollector) readIpmiSensors(cmd string, output chan lp.CCMessage) {
|
||||||
// Setup ipmisensors command
|
|
||||||
argv := make([]string, 0)
|
|
||||||
if m.config.Sudo {
|
|
||||||
argv = append(argv, "sudo", "-n")
|
|
||||||
}
|
|
||||||
argv = append(argv, m.ipmisensors, "--comma-separated-output", "--sdr-cache-recreate")
|
|
||||||
command := exec.Command(argv[0], argv[1:]...)
|
|
||||||
stdout, _ := command.StdoutPipe()
|
|
||||||
errBuf := new(bytes.Buffer)
|
|
||||||
command.Stderr = errBuf
|
|
||||||
|
|
||||||
// start command
|
command := exec.Command(cmd, "--comma-separated-output", "--sdr-cache-recreate")
|
||||||
if err := command.Start(); err != nil {
|
command.Wait()
|
||||||
return fmt.Errorf("failed to start command '%s': %w", command.String(), err)
|
stdout, err := command.Output()
|
||||||
|
if err != nil {
|
||||||
|
log.Print(err)
|
||||||
|
return
|
||||||
}
|
}
|
||||||
|
|
||||||
// Read command output
|
ll := strings.Split(string(stdout), "\n")
|
||||||
scanner := bufio.NewScanner(stdout)
|
|
||||||
for scanner.Scan() {
|
|
||||||
lv := strings.Split(scanner.Text(), ",")
|
|
||||||
if len(lv) <= 3 {
|
|
||||||
continue
|
|
||||||
}
|
|
||||||
if lv[3] == "N/A" || lv[3] == "Reading" {
|
|
||||||
// Ignore known non-float values
|
|
||||||
continue
|
|
||||||
}
|
|
||||||
v, err := strconv.ParseFloat(strings.TrimSpace(lv[3]), 64)
|
|
||||||
if err != nil {
|
|
||||||
cclog.ComponentErrorf(m.name, "Failed to parse float '%s': %v", lv[3], err)
|
|
||||||
continue
|
|
||||||
}
|
|
||||||
name := strings.ToLower(strings.ReplaceAll(lv[1], " ", "_"))
|
|
||||||
y, err := lp.NewMessage(name, map[string]string{"type": "node"}, m.meta, map[string]any{"value": v}, time.Now())
|
|
||||||
if err != nil {
|
|
||||||
cclog.ComponentErrorf(m.name, "Failed to create message: %v", err)
|
|
||||||
continue
|
|
||||||
}
|
|
||||||
if len(lv) > 4 {
|
|
||||||
y.AddMeta("unit", lv[4])
|
|
||||||
}
|
|
||||||
output <- y
|
|
||||||
}
|
|
||||||
|
|
||||||
// Wait for command end
|
for _, line := range ll {
|
||||||
if err := command.Wait(); err != nil {
|
lv := strings.Split(line, ",")
|
||||||
errMsg, _ := io.ReadAll(errBuf)
|
if len(lv) > 3 {
|
||||||
return fmt.Errorf("failed to complete command '%s': %w (stderr: %s)", command.String(), err, strings.TrimSpace(string(errMsg)))
|
v, err := strconv.ParseFloat(lv[3], 64)
|
||||||
|
if err == nil {
|
||||||
|
name := strings.ToLower(strings.Replace(lv[1], " ", "_", -1))
|
||||||
|
y, err := lp.NewMessage(name, map[string]string{"type": "node"}, m.meta, map[string]interface{}{"value": v}, time.Now())
|
||||||
|
if err == nil {
|
||||||
|
if len(lv) > 4 {
|
||||||
|
y.AddMeta("unit", lv[4])
|
||||||
|
}
|
||||||
|
output <- y
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
return nil
|
|
||||||
}
|
}
|
||||||
|
|
||||||
func (m *IpmiCollector) Read(interval time.Duration, output chan lp.CCMessage) {
|
func (m *IpmiCollector) Read(interval time.Duration, output chan lp.CCMessage) {
|
||||||
|
|
||||||
// Check if already initialized
|
// Check if already initialized
|
||||||
if !m.init {
|
if !m.init {
|
||||||
return
|
return
|
||||||
}
|
}
|
||||||
|
|
||||||
if len(m.ipmisensors) > 0 {
|
if len(m.config.IpmitoolPath) > 0 {
|
||||||
err := m.readIpmiSensors(output)
|
m.readIpmiTool(m.config.IpmitoolPath, output)
|
||||||
if err != nil {
|
} else if len(m.config.IpmisensorsPath) > 0 {
|
||||||
cclog.ComponentErrorf(m.name, "readIpmiSensors() failed: %v", err)
|
m.readIpmiSensors(m.config.IpmisensorsPath, output)
|
||||||
}
|
|
||||||
} else if len(m.ipmitool) > 0 {
|
|
||||||
err := m.readIpmiTool(output)
|
|
||||||
if err != nil {
|
|
||||||
cclog.ComponentErrorf(m.name, "readIpmiTool() failed: %v", err)
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
@@ -1,13 +1,3 @@
|
|||||||
<!--
|
|
||||||
---
|
|
||||||
title: IPMI Metric collector
|
|
||||||
description: Collect metrics using ipmitool or ipmi-sensors
|
|
||||||
categories: [cc-metric-collector]
|
|
||||||
tags: ['Admin']
|
|
||||||
weight: 2
|
|
||||||
hugo_path: docs/reference/cc-metric-collector/collectors/ipmi.md
|
|
||||||
---
|
|
||||||
-->
|
|
||||||
|
|
||||||
## `ipmistat` collector
|
## `ipmistat` collector
|
||||||
|
|
||||||
@@ -15,24 +5,9 @@ hugo_path: docs/reference/cc-metric-collector/collectors/ipmi.md
|
|||||||
"ipmistat": {
|
"ipmistat": {
|
||||||
"ipmitool_path": "/path/to/ipmitool",
|
"ipmitool_path": "/path/to/ipmitool",
|
||||||
"ipmisensors_path": "/path/to/ipmi-sensors",
|
"ipmisensors_path": "/path/to/ipmi-sensors",
|
||||||
"use_sudo": true
|
|
||||||
}
|
}
|
||||||
```
|
```
|
||||||
|
|
||||||
The `ipmistat` collector reads data from `ipmitool` (`ipmitool sensor`) or `ipmi-sensors` (`ipmi-sensors --sdr-cache-recreate --comma-separated-output`).
|
The `ipmistat` collector reads data from `ipmitool` (`ipmitool sensor`) or `ipmi-sensors` (`ipmi-sensors --sdr-cache-recreate --comma-separated-output`).
|
||||||
|
|
||||||
The metrics depend on the output of the underlying tools but contain temperature, power and energy metrics.
|
The metrics depend on the output of the underlying tools but contain temperature, power and energy metrics.
|
||||||
|
|
||||||
`ipmitool` and `ipmi-sensors` typically require root to run.
|
|
||||||
In order to run `cc-metric-collector` without root priviliges, you can enable `use_sudo`.
|
|
||||||
Add a file like this in `/etc/sudoers.d/` to allow `cc-metric-collector` to run the required commands:
|
|
||||||
|
|
||||||
```
|
|
||||||
# Do not log the following sudo commands from monitoring, since this causes a lot of log spam.
|
|
||||||
# However keep log_denied enabled, to detect failures
|
|
||||||
Defaults: monitoring !log_allowed, !pam_session
|
|
||||||
|
|
||||||
# Allow to use ipmitool and ipmi-sensors
|
|
||||||
monitoring ALL = (root) NOPASSWD:/usr/bin/ipmitool sensor
|
|
||||||
monitoring ALL = (root) NOPASSWD:/usr/sbin/ipmi-sensors --comma-separated-output --sdr-cache-recreate
|
|
||||||
```
|
|
||||||
|
|||||||
@@ -1,10 +1,3 @@
|
|||||||
// Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
|
|
||||||
// All rights reserved. This file is part of cc-lib.
|
|
||||||
// Use of this source code is governed by a MIT-style
|
|
||||||
// license that can be found in the LICENSE file.
|
|
||||||
// additional authors:
|
|
||||||
// Holger Obermaier (NHR@KIT)
|
|
||||||
|
|
||||||
package collectors
|
package collectors
|
||||||
|
|
||||||
/*
|
/*
|
||||||
@@ -16,15 +9,14 @@ package collectors
|
|||||||
import "C"
|
import "C"
|
||||||
|
|
||||||
import (
|
import (
|
||||||
"bytes"
|
|
||||||
"encoding/json"
|
"encoding/json"
|
||||||
|
"errors"
|
||||||
"fmt"
|
"fmt"
|
||||||
"maps"
|
|
||||||
"math"
|
"math"
|
||||||
"os"
|
"os"
|
||||||
"os/signal"
|
"os/signal"
|
||||||
"os/user"
|
"os/user"
|
||||||
"slices"
|
"sort"
|
||||||
"strconv"
|
"strconv"
|
||||||
"strings"
|
"strings"
|
||||||
"sync"
|
"sync"
|
||||||
@@ -32,8 +24,8 @@ import (
|
|||||||
"time"
|
"time"
|
||||||
"unsafe"
|
"unsafe"
|
||||||
|
|
||||||
cclog "github.com/ClusterCockpit/cc-lib/v2/ccLogger"
|
cclog "github.com/ClusterCockpit/cc-lib/ccLogger"
|
||||||
lp "github.com/ClusterCockpit/cc-lib/v2/ccMessage"
|
lp "github.com/ClusterCockpit/cc-lib/ccMessage"
|
||||||
agg "github.com/ClusterCockpit/cc-metric-collector/internal/metricAggregator"
|
agg "github.com/ClusterCockpit/cc-metric-collector/internal/metricAggregator"
|
||||||
topo "github.com/ClusterCockpit/cc-metric-collector/pkg/ccTopology"
|
topo "github.com/ClusterCockpit/cc-metric-collector/pkg/ccTopology"
|
||||||
"github.com/NVIDIA/go-nvml/pkg/dl"
|
"github.com/NVIDIA/go-nvml/pkg/dl"
|
||||||
@@ -125,14 +117,22 @@ func checkMetricType(t string) bool {
|
|||||||
return ok
|
return ok
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func eventsToEventStr(events map[string]string) string {
|
||||||
|
elist := make([]string, 0)
|
||||||
|
for k, v := range events {
|
||||||
|
elist = append(elist, fmt.Sprintf("%s:%s", v, k))
|
||||||
|
}
|
||||||
|
return strings.Join(elist, ",")
|
||||||
|
}
|
||||||
|
|
||||||
func genLikwidEventSet(input LikwidCollectorEventsetConfig) LikwidEventsetConfig {
|
func genLikwidEventSet(input LikwidCollectorEventsetConfig) LikwidEventsetConfig {
|
||||||
clist := make([]string, 0, len(input.Events))
|
tmplist := make([]string, 0)
|
||||||
|
clist := make([]string, 0)
|
||||||
for k := range input.Events {
|
for k := range input.Events {
|
||||||
clist = append(clist, k)
|
clist = append(clist, k)
|
||||||
}
|
}
|
||||||
slices.Sort(clist)
|
sort.Strings(clist)
|
||||||
tmplist := make([]string, 0, len(clist))
|
elist := make([]*C.char, 0)
|
||||||
elist := make([]*C.char, 0, len(clist))
|
|
||||||
for _, k := range clist {
|
for _, k := range clist {
|
||||||
v := input.Events[k]
|
v := input.Events[k]
|
||||||
tmplist = append(tmplist, fmt.Sprintf("%s:%s", v, k))
|
tmplist = append(tmplist, fmt.Sprintf("%s:%s", v, k))
|
||||||
@@ -142,7 +142,7 @@ func genLikwidEventSet(input LikwidCollectorEventsetConfig) LikwidEventsetConfig
|
|||||||
estr := strings.Join(tmplist, ",")
|
estr := strings.Join(tmplist, ",")
|
||||||
res := make(map[int]map[string]float64)
|
res := make(map[int]map[string]float64)
|
||||||
met := make(map[int]map[string]float64)
|
met := make(map[int]map[string]float64)
|
||||||
for _, i := range topo.HwthreadList() {
|
for _, i := range topo.CpuList() {
|
||||||
res[i] = make(map[string]float64)
|
res[i] = make(map[string]float64)
|
||||||
for k := range input.Events {
|
for k := range input.Events {
|
||||||
res[i][k] = 0.0
|
res[i][k] = 0.0
|
||||||
@@ -180,7 +180,7 @@ func getBaseFreq() float64 {
|
|||||||
for _, f := range files {
|
for _, f := range files {
|
||||||
buffer, err := os.ReadFile(f)
|
buffer, err := os.ReadFile(f)
|
||||||
if err == nil {
|
if err == nil {
|
||||||
data := strings.ReplaceAll(string(buffer), "\n", "")
|
data := strings.Replace(string(buffer), "\n", "", -1)
|
||||||
x, err := strconv.ParseInt(data, 0, 64)
|
x, err := strconv.ParseInt(data, 0, 64)
|
||||||
if err == nil {
|
if err == nil {
|
||||||
freq = float64(x)
|
freq = float64(x)
|
||||||
@@ -190,8 +190,12 @@ func getBaseFreq() float64 {
|
|||||||
}
|
}
|
||||||
|
|
||||||
if math.IsNaN(freq) {
|
if math.IsNaN(freq) {
|
||||||
C.timer_init()
|
C.power_init(0)
|
||||||
freq = float64(C.timer_getCycleClock()) / 1e3
|
info := C.get_powerInfo()
|
||||||
|
if float64(info.baseFrequency) != 0 {
|
||||||
|
freq = float64(info.baseFrequency)
|
||||||
|
}
|
||||||
|
C.power_finalize()
|
||||||
}
|
}
|
||||||
return freq * 1e3
|
return freq * 1e3
|
||||||
}
|
}
|
||||||
@@ -207,30 +211,25 @@ func (m *LikwidCollector) Init(config json.RawMessage) error {
|
|||||||
m.config.LibraryPath = LIKWID_LIB_NAME
|
m.config.LibraryPath = LIKWID_LIB_NAME
|
||||||
m.config.LockfilePath = LIKWID_DEF_LOCKFILE
|
m.config.LockfilePath = LIKWID_DEF_LOCKFILE
|
||||||
if len(config) > 0 {
|
if len(config) > 0 {
|
||||||
d := json.NewDecoder(bytes.NewReader(config))
|
err := json.Unmarshal(config, &m.config)
|
||||||
d.DisallowUnknownFields()
|
if err != nil {
|
||||||
if err := d.Decode(&m.config); err != nil {
|
return err
|
||||||
return fmt.Errorf("%s Init(): Error decoding JSON config: %w", m.name, err)
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
lib := dl.New(m.config.LibraryPath, LIKWID_LIB_DL_FLAGS)
|
lib := dl.New(m.config.LibraryPath, LIKWID_LIB_DL_FLAGS)
|
||||||
if lib == nil {
|
if lib == nil {
|
||||||
return fmt.Errorf("%s Init(): error instantiating DynamicLibrary for %s", m.name, m.config.LibraryPath)
|
return fmt.Errorf("error instantiating DynamicLibrary for %s", m.config.LibraryPath)
|
||||||
}
|
}
|
||||||
err := lib.Open()
|
err := lib.Open()
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return fmt.Errorf("%s Init(): error opening %s: %w", m.name, m.config.LibraryPath, err)
|
return fmt.Errorf("error opening %s: %v", m.config.LibraryPath, err)
|
||||||
}
|
}
|
||||||
|
|
||||||
if m.config.ForceOverwrite {
|
if m.config.ForceOverwrite {
|
||||||
cclog.ComponentDebug(m.name, "Set LIKWID_FORCE=1")
|
cclog.ComponentDebug(m.name, "Set LIKWID_FORCE=1")
|
||||||
if err := os.Setenv("LIKWID_FORCE", "1"); err != nil {
|
os.Setenv("LIKWID_FORCE", "1")
|
||||||
return fmt.Errorf("%s Init(): error setting environment variable LIKWID_FORCE=1: %w", m.name, err)
|
|
||||||
}
|
|
||||||
}
|
|
||||||
if err := m.setup(); err != nil {
|
|
||||||
return fmt.Errorf("%s Init(): setup() call failed: %w", m.name, err)
|
|
||||||
}
|
}
|
||||||
|
m.setup()
|
||||||
|
|
||||||
m.meta = map[string]string{"group": "PerfCounter"}
|
m.meta = map[string]string{"group": "PerfCounter"}
|
||||||
cclog.ComponentDebug(m.name, "Get cpulist and init maps and lists")
|
cclog.ComponentDebug(m.name, "Get cpulist and init maps and lists")
|
||||||
@@ -296,12 +295,16 @@ func (m *LikwidCollector) Init(config json.RawMessage) error {
|
|||||||
|
|
||||||
// If no event set could be added, shut down LikwidCollector
|
// If no event set could be added, shut down LikwidCollector
|
||||||
if totalMetrics == 0 {
|
if totalMetrics == 0 {
|
||||||
return fmt.Errorf("%s Init(): no LIKWID eventset or metric usable", m.name)
|
err := errors.New("no LIKWID eventset or metric usable")
|
||||||
|
cclog.ComponentError(m.name, err.Error())
|
||||||
|
return err
|
||||||
}
|
}
|
||||||
|
|
||||||
ret := C.topology_init()
|
ret := C.topology_init()
|
||||||
if ret != 0 {
|
if ret != 0 {
|
||||||
return fmt.Errorf("%s Init(): failed to initialize topology module", m.name)
|
err := errors.New("failed to initialize topology module")
|
||||||
|
cclog.ComponentError(m.name, err.Error())
|
||||||
|
return err
|
||||||
}
|
}
|
||||||
m.measureThread = thread.New()
|
m.measureThread = thread.New()
|
||||||
switch m.config.AccessMode {
|
switch m.config.AccessMode {
|
||||||
@@ -310,14 +313,7 @@ func (m *LikwidCollector) Init(config json.RawMessage) error {
|
|||||||
case "accessdaemon":
|
case "accessdaemon":
|
||||||
if len(m.config.DaemonPath) > 0 {
|
if len(m.config.DaemonPath) > 0 {
|
||||||
p := os.Getenv("PATH")
|
p := os.Getenv("PATH")
|
||||||
if len(p) > 0 {
|
os.Setenv("PATH", m.config.DaemonPath+":"+p)
|
||||||
p = m.config.DaemonPath + ":" + p
|
|
||||||
} else {
|
|
||||||
p = m.config.DaemonPath
|
|
||||||
}
|
|
||||||
if err := os.Setenv("PATH", p); err != nil {
|
|
||||||
return fmt.Errorf("%s Init(): error setting environment variable PATH=%s: %w", m.name, p, err)
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
C.HPMmode(1)
|
C.HPMmode(1)
|
||||||
retCode := C.HPMinit()
|
retCode := C.HPMinit()
|
||||||
@@ -328,7 +324,7 @@ func (m *LikwidCollector) Init(config json.RawMessage) error {
|
|||||||
for _, c := range m.cpulist {
|
for _, c := range m.cpulist {
|
||||||
m.measureThread.Call(
|
m.measureThread.Call(
|
||||||
func() {
|
func() {
|
||||||
retCode := C.HPMaddThread(C.uint32_t(c))
|
retCode := C.HPMaddThread(c)
|
||||||
if retCode != 0 {
|
if retCode != 0 {
|
||||||
err := fmt.Errorf("C.HPMaddThread(%v) failed with return code %v", c, retCode)
|
err := fmt.Errorf("C.HPMaddThread(%v) failed with return code %v", c, retCode)
|
||||||
cclog.ComponentError(m.name, err.Error())
|
cclog.ComponentError(m.name, err.Error())
|
||||||
@@ -370,23 +366,16 @@ func (m *LikwidCollector) Init(config json.RawMessage) error {
|
|||||||
// take a measurement for 'interval' seconds of event set index 'group'
|
// take a measurement for 'interval' seconds of event set index 'group'
|
||||||
func (m *LikwidCollector) takeMeasurement(evidx int, evset LikwidEventsetConfig, interval time.Duration) (bool, error) {
|
func (m *LikwidCollector) takeMeasurement(evidx int, evset LikwidEventsetConfig, interval time.Duration) (bool, error) {
|
||||||
var ret C.int
|
var ret C.int
|
||||||
|
var gid C.int = -1
|
||||||
sigchan := make(chan os.Signal, 1)
|
sigchan := make(chan os.Signal, 1)
|
||||||
|
|
||||||
// Watch changes for the lock file ()
|
// Watch changes for the lock file ()
|
||||||
watcher, err := fsnotify.NewWatcher()
|
watcher, err := fsnotify.NewWatcher()
|
||||||
if err != nil {
|
if err != nil {
|
||||||
cclog.ComponentError(
|
cclog.ComponentError(m.name, err.Error())
|
||||||
m.name,
|
|
||||||
fmt.Sprintf("takeMeasurement(): Failed to create a new fsnotify.Watcher: %v", err))
|
|
||||||
return true, err
|
return true, err
|
||||||
}
|
}
|
||||||
defer func() {
|
defer watcher.Close()
|
||||||
if err := watcher.Close(); err != nil {
|
|
||||||
cclog.ComponentError(
|
|
||||||
m.name,
|
|
||||||
fmt.Sprintf("takeMeasurement(): Failed to close fsnotify.Watcher: %v", err))
|
|
||||||
}
|
|
||||||
}()
|
|
||||||
if len(m.config.LockfilePath) > 0 {
|
if len(m.config.LockfilePath) > 0 {
|
||||||
// Check if the lock file exists
|
// Check if the lock file exists
|
||||||
info, err := os.Stat(m.config.LockfilePath)
|
info, err := os.Stat(m.config.LockfilePath)
|
||||||
@@ -394,11 +383,9 @@ func (m *LikwidCollector) takeMeasurement(evidx int, evset LikwidEventsetConfig,
|
|||||||
// Create the lock file if it does not exist
|
// Create the lock file if it does not exist
|
||||||
file, createErr := os.Create(m.config.LockfilePath)
|
file, createErr := os.Create(m.config.LockfilePath)
|
||||||
if createErr != nil {
|
if createErr != nil {
|
||||||
return true, fmt.Errorf("failed to create lock file: %w", createErr)
|
return true, fmt.Errorf("failed to create lock file: %v", createErr)
|
||||||
}
|
|
||||||
if err := file.Close(); err != nil {
|
|
||||||
return true, fmt.Errorf("failed to close lock file: %w", err)
|
|
||||||
}
|
}
|
||||||
|
file.Close()
|
||||||
info, err = os.Stat(m.config.LockfilePath) // Recheck the file after creation
|
info, err = os.Stat(m.config.LockfilePath) // Recheck the file after creation
|
||||||
}
|
}
|
||||||
if err != nil {
|
if err != nil {
|
||||||
@@ -450,7 +437,6 @@ func (m *LikwidCollector) takeMeasurement(evidx int, evset LikwidEventsetConfig,
|
|||||||
signal.Notify(sigchan, syscall.SIGCHLD)
|
signal.Notify(sigchan, syscall.SIGCHLD)
|
||||||
|
|
||||||
// Add an event string to LIKWID
|
// Add an event string to LIKWID
|
||||||
var gid C.int
|
|
||||||
select {
|
select {
|
||||||
case <-sigchan:
|
case <-sigchan:
|
||||||
gid = -1
|
gid = -1
|
||||||
@@ -606,20 +592,20 @@ func (m *LikwidCollector) calcEventsetMetrics(evset LikwidEventsetConfig, interv
|
|||||||
evset.metrics[tid][metric.Name] = value
|
evset.metrics[tid][metric.Name] = value
|
||||||
// Now we have the result, send it with the proper tags
|
// Now we have the result, send it with the proper tags
|
||||||
if !math.IsNaN(value) && metric.Publish {
|
if !math.IsNaN(value) && metric.Publish {
|
||||||
y, err := lp.NewMessage(
|
fields := map[string]interface{}{"value": value}
|
||||||
metric.Name,
|
y, err :=
|
||||||
map[string]string{
|
lp.NewMessage(
|
||||||
"type": metric.Type,
|
metric.Name,
|
||||||
},
|
map[string]string{
|
||||||
m.meta,
|
"type": metric.Type,
|
||||||
map[string]any{
|
},
|
||||||
"value": value,
|
m.meta,
|
||||||
},
|
fields,
|
||||||
now,
|
now,
|
||||||
)
|
)
|
||||||
if err == nil {
|
if err == nil {
|
||||||
if metric.Type != "node" {
|
if metric.Type != "node" {
|
||||||
y.AddTag("type-id", strconv.Itoa(domain))
|
y.AddTag("type-id", fmt.Sprintf("%d", domain))
|
||||||
}
|
}
|
||||||
if len(metric.Unit) > 0 {
|
if len(metric.Unit) > 0 {
|
||||||
y.AddMeta("unit", metric.Unit)
|
y.AddMeta("unit", metric.Unit)
|
||||||
@@ -644,18 +630,19 @@ func (m *LikwidCollector) calcEventsetMetrics(evset LikwidEventsetConfig, interv
|
|||||||
}
|
}
|
||||||
|
|
||||||
for coreID, value := range totalCoreValues {
|
for coreID, value := range totalCoreValues {
|
||||||
y, err := lp.NewMessage(
|
y, err :=
|
||||||
metric.Name,
|
lp.NewMessage(
|
||||||
map[string]string{
|
metric.Name,
|
||||||
"type": "core",
|
map[string]string{
|
||||||
"type-id": strconv.Itoa(coreID),
|
"type": "core",
|
||||||
},
|
"type-id": fmt.Sprintf("%d", coreID),
|
||||||
m.meta,
|
},
|
||||||
map[string]any{
|
m.meta,
|
||||||
"value": value,
|
map[string]interface{}{
|
||||||
},
|
"value": value,
|
||||||
now,
|
},
|
||||||
)
|
now,
|
||||||
|
)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
continue
|
continue
|
||||||
}
|
}
|
||||||
@@ -680,18 +667,19 @@ func (m *LikwidCollector) calcEventsetMetrics(evset LikwidEventsetConfig, interv
|
|||||||
}
|
}
|
||||||
|
|
||||||
for socketID, value := range totalSocketValues {
|
for socketID, value := range totalSocketValues {
|
||||||
y, err := lp.NewMessage(
|
y, err :=
|
||||||
metric.Name,
|
lp.NewMessage(
|
||||||
map[string]string{
|
metric.Name,
|
||||||
"type": "socket",
|
map[string]string{
|
||||||
"type-id": strconv.Itoa(socketID),
|
"type": "socket",
|
||||||
},
|
"type-id": fmt.Sprintf("%d", socketID),
|
||||||
m.meta,
|
},
|
||||||
map[string]any{
|
m.meta,
|
||||||
"value": value,
|
map[string]interface{}{
|
||||||
},
|
"value": value,
|
||||||
now,
|
},
|
||||||
)
|
now,
|
||||||
|
)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
continue
|
continue
|
||||||
}
|
}
|
||||||
@@ -714,17 +702,18 @@ func (m *LikwidCollector) calcEventsetMetrics(evset LikwidEventsetConfig, interv
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
y, err := lp.NewMessage(
|
y, err :=
|
||||||
metric.Name,
|
lp.NewMessage(
|
||||||
map[string]string{
|
metric.Name,
|
||||||
"type": "node",
|
map[string]string{
|
||||||
},
|
"type": "node",
|
||||||
m.meta,
|
},
|
||||||
map[string]any{
|
m.meta,
|
||||||
"value": totalNodeValue,
|
map[string]interface{}{
|
||||||
},
|
"value": totalNodeValue,
|
||||||
now,
|
},
|
||||||
)
|
now,
|
||||||
|
)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
continue
|
continue
|
||||||
}
|
}
|
||||||
@@ -756,7 +745,9 @@ func (m *LikwidCollector) calcGlobalMetrics(groups []LikwidEventsetConfig, inter
|
|||||||
// Here we generate parameter list
|
// Here we generate parameter list
|
||||||
params := make(map[string]float64)
|
params := make(map[string]float64)
|
||||||
for _, evset := range groups {
|
for _, evset := range groups {
|
||||||
maps.Copy(params, evset.metrics[tid])
|
for mname, mres := range evset.metrics[tid] {
|
||||||
|
params[mname] = mres
|
||||||
|
}
|
||||||
}
|
}
|
||||||
params["gotime"] = interval.Seconds()
|
params["gotime"] = interval.Seconds()
|
||||||
// Evaluate the metric
|
// Evaluate the metric
|
||||||
@@ -771,20 +762,21 @@ func (m *LikwidCollector) calcGlobalMetrics(groups []LikwidEventsetConfig, inter
|
|||||||
// Now we have the result, send it with the proper tags
|
// Now we have the result, send it with the proper tags
|
||||||
if !math.IsNaN(value) {
|
if !math.IsNaN(value) {
|
||||||
if metric.Publish {
|
if metric.Publish {
|
||||||
y, err := lp.NewMessage(
|
y, err :=
|
||||||
metric.Name,
|
lp.NewMessage(
|
||||||
map[string]string{
|
metric.Name,
|
||||||
"type": metric.Type,
|
map[string]string{
|
||||||
},
|
"type": metric.Type,
|
||||||
m.meta,
|
},
|
||||||
map[string]any{
|
m.meta,
|
||||||
"value": value,
|
map[string]interface{}{
|
||||||
},
|
"value": value,
|
||||||
now,
|
},
|
||||||
)
|
now,
|
||||||
|
)
|
||||||
if err == nil {
|
if err == nil {
|
||||||
if metric.Type != "node" {
|
if metric.Type != "node" {
|
||||||
y.AddTag("type-id", strconv.Itoa(domain))
|
y.AddTag("type-id", fmt.Sprintf("%d", domain))
|
||||||
}
|
}
|
||||||
if len(metric.Unit) > 0 {
|
if len(metric.Unit) > 0 {
|
||||||
y.AddMeta("unit", metric.Unit)
|
y.AddMeta("unit", metric.Unit)
|
||||||
@@ -800,7 +792,7 @@ func (m *LikwidCollector) calcGlobalMetrics(groups []LikwidEventsetConfig, inter
|
|||||||
}
|
}
|
||||||
|
|
||||||
func (m *LikwidCollector) ReadThread(interval time.Duration, output chan lp.CCMessage) {
|
func (m *LikwidCollector) ReadThread(interval time.Duration, output chan lp.CCMessage) {
|
||||||
var err error
|
var err error = nil
|
||||||
groups := make([]LikwidEventsetConfig, 0)
|
groups := make([]LikwidEventsetConfig, 0)
|
||||||
|
|
||||||
for evidx, evset := range m.config.Eventsets {
|
for evidx, evset := range m.config.Eventsets {
|
||||||
@@ -818,21 +810,13 @@ func (m *LikwidCollector) ReadThread(interval time.Duration, output chan lp.CCMe
|
|||||||
|
|
||||||
if !skip {
|
if !skip {
|
||||||
// read measurements and derive event set metrics
|
// read measurements and derive event set metrics
|
||||||
err = m.calcEventsetMetrics(e, interval, output)
|
m.calcEventsetMetrics(e, interval, output)
|
||||||
if err != nil {
|
|
||||||
cclog.ComponentError(m.name, err.Error())
|
|
||||||
return
|
|
||||||
}
|
|
||||||
groups = append(groups, e)
|
groups = append(groups, e)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
if len(groups) > 0 {
|
if len(groups) > 0 {
|
||||||
// calculate global metrics
|
// calculate global metrics
|
||||||
err = m.calcGlobalMetrics(groups, interval, output)
|
m.calcGlobalMetrics(groups, interval, output)
|
||||||
if err != nil {
|
|
||||||
cclog.ComponentError(m.name, err.Error())
|
|
||||||
return
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
@@ -1,13 +1,3 @@
|
|||||||
<!--
|
|
||||||
---
|
|
||||||
title: LIKWID collector
|
|
||||||
description: Collect hardware performance events and metrics using LIKWID
|
|
||||||
categories: [cc-metric-collector]
|
|
||||||
tags: ['Admin']
|
|
||||||
weight: 2
|
|
||||||
hugo_path: docs/reference/cc-metric-collector/collectors/likwid.md
|
|
||||||
---
|
|
||||||
-->
|
|
||||||
|
|
||||||
## `likwid` collector
|
## `likwid` collector
|
||||||
|
|
||||||
|
|||||||
@@ -1,24 +1,15 @@
|
|||||||
// Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
|
|
||||||
// All rights reserved. This file is part of cc-lib.
|
|
||||||
// Use of this source code is governed by a MIT-style
|
|
||||||
// license that can be found in the LICENSE file.
|
|
||||||
// additional authors:
|
|
||||||
// Holger Obermaier (NHR@KIT)
|
|
||||||
|
|
||||||
package collectors
|
package collectors
|
||||||
|
|
||||||
import (
|
import (
|
||||||
"bytes"
|
|
||||||
"encoding/json"
|
"encoding/json"
|
||||||
"fmt"
|
"fmt"
|
||||||
"os"
|
"os"
|
||||||
"slices"
|
|
||||||
"strconv"
|
"strconv"
|
||||||
"strings"
|
"strings"
|
||||||
"time"
|
"time"
|
||||||
|
|
||||||
cclog "github.com/ClusterCockpit/cc-lib/v2/ccLogger"
|
cclog "github.com/ClusterCockpit/cc-lib/ccLogger"
|
||||||
lp "github.com/ClusterCockpit/cc-lib/v2/ccMessage"
|
lp "github.com/ClusterCockpit/cc-lib/ccMessage"
|
||||||
)
|
)
|
||||||
|
|
||||||
// LoadavgCollector collects:
|
// LoadavgCollector collects:
|
||||||
@@ -31,7 +22,6 @@ const LOADAVGFILE = "/proc/loadavg"
|
|||||||
|
|
||||||
type LoadavgCollector struct {
|
type LoadavgCollector struct {
|
||||||
metricCollector
|
metricCollector
|
||||||
|
|
||||||
tags map[string]string
|
tags map[string]string
|
||||||
load_matches []string
|
load_matches []string
|
||||||
load_skips []bool
|
load_skips []bool
|
||||||
@@ -45,39 +35,32 @@ type LoadavgCollector struct {
|
|||||||
func (m *LoadavgCollector) Init(config json.RawMessage) error {
|
func (m *LoadavgCollector) Init(config json.RawMessage) error {
|
||||||
m.name = "LoadavgCollector"
|
m.name = "LoadavgCollector"
|
||||||
m.parallel = true
|
m.parallel = true
|
||||||
if err := m.setup(); err != nil {
|
m.setup()
|
||||||
return fmt.Errorf("%s Init(): setup() call failed: %w", m.name, err)
|
|
||||||
}
|
|
||||||
if len(config) > 0 {
|
if len(config) > 0 {
|
||||||
d := json.NewDecoder(bytes.NewReader(config))
|
err := json.Unmarshal(config, &m.config)
|
||||||
d.DisallowUnknownFields()
|
if err != nil {
|
||||||
if err := d.Decode(&m.config); err != nil {
|
return err
|
||||||
return fmt.Errorf("%s Init(): Error decoding JSON config: %w", m.name, err)
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
m.meta = map[string]string{
|
m.meta = map[string]string{
|
||||||
"source": m.name,
|
"source": m.name,
|
||||||
"group": "LOAD",
|
"group": "LOAD"}
|
||||||
}
|
|
||||||
m.tags = map[string]string{"type": "node"}
|
m.tags = map[string]string{"type": "node"}
|
||||||
m.load_matches = []string{
|
m.load_matches = []string{
|
||||||
"load_one",
|
"load_one",
|
||||||
"load_five",
|
"load_five",
|
||||||
"load_fifteen",
|
"load_fifteen"}
|
||||||
}
|
m.load_skips = make([]bool, len(m.load_matches))
|
||||||
m.proc_matches = []string{
|
m.proc_matches = []string{
|
||||||
"proc_run",
|
"proc_run",
|
||||||
"proc_total",
|
"proc_total"}
|
||||||
}
|
|
||||||
|
|
||||||
m.load_skips = make([]bool, len(m.load_matches))
|
|
||||||
for i, name := range m.load_matches {
|
|
||||||
m.load_skips[i] = slices.Contains(m.config.ExcludeMetrics, name)
|
|
||||||
}
|
|
||||||
|
|
||||||
m.proc_skips = make([]bool, len(m.proc_matches))
|
m.proc_skips = make([]bool, len(m.proc_matches))
|
||||||
|
|
||||||
|
for i, name := range m.load_matches {
|
||||||
|
_, m.load_skips[i] = stringArrayContains(m.config.ExcludeMetrics, name)
|
||||||
|
}
|
||||||
for i, name := range m.proc_matches {
|
for i, name := range m.proc_matches {
|
||||||
m.proc_skips[i] = slices.Contains(m.config.ExcludeMetrics, name)
|
_, m.proc_skips[i] = stringArrayContains(m.config.ExcludeMetrics, name)
|
||||||
}
|
}
|
||||||
m.init = true
|
m.init = true
|
||||||
return nil
|
return nil
|
||||||
@@ -109,7 +92,7 @@ func (m *LoadavgCollector) Read(interval time.Duration, output chan lp.CCMessage
|
|||||||
if m.load_skips[i] {
|
if m.load_skips[i] {
|
||||||
continue
|
continue
|
||||||
}
|
}
|
||||||
y, err := lp.NewMessage(name, m.tags, m.meta, map[string]any{"value": x}, now)
|
y, err := lp.NewMessage(name, m.tags, m.meta, map[string]interface{}{"value": x}, now)
|
||||||
if err == nil {
|
if err == nil {
|
||||||
output <- y
|
output <- y
|
||||||
}
|
}
|
||||||
@@ -128,7 +111,7 @@ func (m *LoadavgCollector) Read(interval time.Duration, output chan lp.CCMessage
|
|||||||
if m.proc_skips[i] {
|
if m.proc_skips[i] {
|
||||||
continue
|
continue
|
||||||
}
|
}
|
||||||
y, err := lp.NewMessage(name, m.tags, m.meta, map[string]any{"value": x}, now)
|
y, err := lp.NewMessage(name, m.tags, m.meta, map[string]interface{}{"value": x}, now)
|
||||||
if err == nil {
|
if err == nil {
|
||||||
output <- y
|
output <- y
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -1,14 +1,3 @@
|
|||||||
<!--
|
|
||||||
---
|
|
||||||
title: Load average metric collector
|
|
||||||
description: Collect metrics from `/proc/loadavg`
|
|
||||||
categories: [cc-metric-collector]
|
|
||||||
tags: ['Admin']
|
|
||||||
weight: 2
|
|
||||||
hugo_path: docs/reference/cc-metric-collector/collectors/loadavg.md
|
|
||||||
---
|
|
||||||
-->
|
|
||||||
|
|
||||||
|
|
||||||
## `loadavg` collector
|
## `loadavg` collector
|
||||||
|
|
||||||
|
|||||||
@@ -1,32 +1,22 @@
|
|||||||
// Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
|
|
||||||
// All rights reserved. This file is part of cc-lib.
|
|
||||||
// Use of this source code is governed by a MIT-style
|
|
||||||
// license that can be found in the LICENSE file.
|
|
||||||
// additional authors:
|
|
||||||
// Holger Obermaier (NHR@KIT)
|
|
||||||
|
|
||||||
package collectors
|
package collectors
|
||||||
|
|
||||||
import (
|
import (
|
||||||
"bytes"
|
|
||||||
"encoding/json"
|
"encoding/json"
|
||||||
"errors"
|
"errors"
|
||||||
"fmt"
|
"fmt"
|
||||||
"os/exec"
|
"os/exec"
|
||||||
"os/user"
|
"os/user"
|
||||||
"slices"
|
|
||||||
"strconv"
|
"strconv"
|
||||||
"strings"
|
"strings"
|
||||||
"time"
|
"time"
|
||||||
|
|
||||||
lp "github.com/ClusterCockpit/cc-lib/v2/ccMessage"
|
cclog "github.com/ClusterCockpit/cc-lib/ccLogger"
|
||||||
|
lp "github.com/ClusterCockpit/cc-lib/ccMessage"
|
||||||
)
|
)
|
||||||
|
|
||||||
const (
|
const LUSTRE_SYSFS = `/sys/fs/lustre`
|
||||||
LUSTRE_SYSFS = `/sys/fs/lustre`
|
const LCTL_CMD = `lctl`
|
||||||
LCTL_CMD = `lctl`
|
const LCTL_OPTION = `get_param`
|
||||||
LCTL_OPTION = `get_param`
|
|
||||||
)
|
|
||||||
|
|
||||||
type LustreCollectorConfig struct {
|
type LustreCollectorConfig struct {
|
||||||
LCtlCommand string `json:"lctl_command,omitempty"`
|
LCtlCommand string `json:"lctl_command,omitempty"`
|
||||||
@@ -47,7 +37,6 @@ type LustreMetricDefinition struct {
|
|||||||
|
|
||||||
type LustreCollector struct {
|
type LustreCollector struct {
|
||||||
metricCollector
|
metricCollector
|
||||||
|
|
||||||
tags map[string]string
|
tags map[string]string
|
||||||
config LustreCollectorConfig
|
config LustreCollectorConfig
|
||||||
lctl string
|
lctl string
|
||||||
@@ -65,6 +54,7 @@ func (m *LustreCollector) getDeviceDataCommand(device string) []string {
|
|||||||
} else {
|
} else {
|
||||||
command = exec.Command(m.lctl, LCTL_OPTION, statsfile)
|
command = exec.Command(m.lctl, LCTL_OPTION, statsfile)
|
||||||
}
|
}
|
||||||
|
command.Wait()
|
||||||
stdout, _ := command.Output()
|
stdout, _ := command.Output()
|
||||||
return strings.Split(string(stdout), "\n")
|
return strings.Split(string(stdout), "\n")
|
||||||
}
|
}
|
||||||
@@ -300,15 +290,12 @@ func (m *LustreCollector) Init(config json.RawMessage) error {
|
|||||||
m.name = "LustreCollector"
|
m.name = "LustreCollector"
|
||||||
m.parallel = true
|
m.parallel = true
|
||||||
if len(config) > 0 {
|
if len(config) > 0 {
|
||||||
d := json.NewDecoder(bytes.NewReader(config))
|
err = json.Unmarshal(config, &m.config)
|
||||||
d.DisallowUnknownFields()
|
if err != nil {
|
||||||
if err := d.Decode(&m.config); err != nil {
|
return err
|
||||||
return fmt.Errorf("%s Init(): Error decoding JSON config: %w", m.name, err)
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
if err := m.setup(); err != nil {
|
m.setup()
|
||||||
return fmt.Errorf("%s Init(): setup() call failed: %w", m.name, err)
|
|
||||||
}
|
|
||||||
m.tags = map[string]string{"type": "node"}
|
m.tags = map[string]string{"type": "node"}
|
||||||
m.meta = map[string]string{"source": m.name, "group": "Lustre"}
|
m.meta = map[string]string{"source": m.name, "group": "Lustre"}
|
||||||
|
|
||||||
@@ -317,15 +304,18 @@ func (m *LustreCollector) Init(config json.RawMessage) error {
|
|||||||
if !m.config.Sudo {
|
if !m.config.Sudo {
|
||||||
user, err := user.Current()
|
user, err := user.Current()
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return fmt.Errorf("%s Init(): Failed to get current user: %w", m.name, err)
|
cclog.ComponentError(m.name, "Failed to get current user:", err.Error())
|
||||||
|
return err
|
||||||
}
|
}
|
||||||
if user.Uid != "0" {
|
if user.Uid != "0" {
|
||||||
return fmt.Errorf("%s Init(): Lustre file system statistics can only be queried by user root", m.name)
|
cclog.ComponentError(m.name, "Lustre file system statistics can only be queried by user root")
|
||||||
|
return err
|
||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
p, err := exec.LookPath("sudo")
|
p, err := exec.LookPath("sudo")
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return fmt.Errorf("%s Init(): Cannot find 'sudo': %w", m.name, err)
|
cclog.ComponentError(m.name, "Cannot find 'sudo'")
|
||||||
|
return err
|
||||||
}
|
}
|
||||||
m.sudoCmd = p
|
m.sudoCmd = p
|
||||||
}
|
}
|
||||||
@@ -334,7 +324,7 @@ func (m *LustreCollector) Init(config json.RawMessage) error {
|
|||||||
if err != nil {
|
if err != nil {
|
||||||
p, err = exec.LookPath(LCTL_CMD)
|
p, err = exec.LookPath(LCTL_CMD)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return fmt.Errorf("%s Init(): Cannot find %s command: %w", m.name, LCTL_CMD, err)
|
return err
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
m.lctl = p
|
m.lctl = p
|
||||||
@@ -342,32 +332,32 @@ func (m *LustreCollector) Init(config json.RawMessage) error {
|
|||||||
m.definitions = []LustreMetricDefinition{}
|
m.definitions = []LustreMetricDefinition{}
|
||||||
if m.config.SendAbsoluteValues {
|
if m.config.SendAbsoluteValues {
|
||||||
for _, def := range LustreAbsMetrics {
|
for _, def := range LustreAbsMetrics {
|
||||||
if !slices.Contains(m.config.ExcludeMetrics, def.name) {
|
if _, skip := stringArrayContains(m.config.ExcludeMetrics, def.name); !skip {
|
||||||
m.definitions = append(m.definitions, def)
|
m.definitions = append(m.definitions, def)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
if m.config.SendDiffValues {
|
if m.config.SendDiffValues {
|
||||||
for _, def := range LustreDiffMetrics {
|
for _, def := range LustreDiffMetrics {
|
||||||
if !slices.Contains(m.config.ExcludeMetrics, def.name) {
|
if _, skip := stringArrayContains(m.config.ExcludeMetrics, def.name); !skip {
|
||||||
m.definitions = append(m.definitions, def)
|
m.definitions = append(m.definitions, def)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
if m.config.SendDerivedValues {
|
if m.config.SendDerivedValues {
|
||||||
for _, def := range LustreDeriveMetrics {
|
for _, def := range LustreDeriveMetrics {
|
||||||
if !slices.Contains(m.config.ExcludeMetrics, def.name) {
|
if _, skip := stringArrayContains(m.config.ExcludeMetrics, def.name); !skip {
|
||||||
m.definitions = append(m.definitions, def)
|
m.definitions = append(m.definitions, def)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
if len(m.definitions) == 0 {
|
if len(m.definitions) == 0 {
|
||||||
return fmt.Errorf("%s Init(): no metrics to collect", m.name)
|
return errors.New("no metrics to collect")
|
||||||
}
|
}
|
||||||
|
|
||||||
devices := m.getDevices()
|
devices := m.getDevices()
|
||||||
if len(devices) == 0 {
|
if len(devices) == 0 {
|
||||||
return fmt.Errorf("%s Init(): no Lustre devices found", m.name)
|
return errors.New("no Lustre devices found")
|
||||||
}
|
}
|
||||||
m.stats = make(map[string]map[string]int64)
|
m.stats = make(map[string]map[string]int64)
|
||||||
for _, d := range devices {
|
for _, d := range devices {
|
||||||
@@ -405,23 +395,23 @@ func (m *LustreCollector) Read(interval time.Duration, output chan lp.CCMessage)
|
|||||||
} else {
|
} else {
|
||||||
use_x = devData[def.name]
|
use_x = devData[def.name]
|
||||||
}
|
}
|
||||||
var value any
|
var value interface{}
|
||||||
switch def.calc {
|
switch def.calc {
|
||||||
case "none":
|
case "none":
|
||||||
value = use_x
|
value = use_x
|
||||||
y, err = lp.NewMessage(def.name, m.tags, m.meta, map[string]any{"value": value}, time.Now())
|
y, err = lp.NewMessage(def.name, m.tags, m.meta, map[string]interface{}{"value": value}, time.Now())
|
||||||
case "difference":
|
case "difference":
|
||||||
value = use_x - devData[def.name]
|
value = use_x - devData[def.name]
|
||||||
if value.(int64) < 0 {
|
if value.(int64) < 0 {
|
||||||
value = 0
|
value = 0
|
||||||
}
|
}
|
||||||
y, err = lp.NewMessage(def.name, m.tags, m.meta, map[string]any{"value": value}, time.Now())
|
y, err = lp.NewMessage(def.name, m.tags, m.meta, map[string]interface{}{"value": value}, time.Now())
|
||||||
case "derivative":
|
case "derivative":
|
||||||
value = float64(use_x-devData[def.name]) / tdiff.Seconds()
|
value = float64(use_x-devData[def.name]) / tdiff.Seconds()
|
||||||
if value.(float64) < 0 {
|
if value.(float64) < 0 {
|
||||||
value = 0
|
value = 0
|
||||||
}
|
}
|
||||||
y, err = lp.NewMessage(def.name, m.tags, m.meta, map[string]any{"value": value}, time.Now())
|
y, err = lp.NewMessage(def.name, m.tags, m.meta, map[string]interface{}{"value": value}, time.Now())
|
||||||
}
|
}
|
||||||
if err == nil {
|
if err == nil {
|
||||||
y.AddTag("device", device)
|
y.AddTag("device", device)
|
||||||
|
|||||||
@@ -1,14 +1,3 @@
|
|||||||
<!--
|
|
||||||
---
|
|
||||||
title: Lustre filesystem metric collector
|
|
||||||
description: Collect metrics for Lustre filesystems
|
|
||||||
categories: [cc-metric-collector]
|
|
||||||
tags: ['Admin']
|
|
||||||
weight: 2
|
|
||||||
hugo_path: docs/reference/cc-metric-collector/collectors/lustre.md
|
|
||||||
---
|
|
||||||
-->
|
|
||||||
|
|
||||||
|
|
||||||
## `lustrestat` collector
|
## `lustrestat` collector
|
||||||
|
|
||||||
@@ -55,16 +44,3 @@ Metrics:
|
|||||||
* `lustre_inode_permission_diff` (if `send_diff_values == true`)
|
* `lustre_inode_permission_diff` (if `send_diff_values == true`)
|
||||||
|
|
||||||
This collector adds a `device` tag.
|
This collector adds a `device` tag.
|
||||||
|
|
||||||
`lctl` typically requires root to run.
|
|
||||||
In order to run `cc-metric-collector` without root privileges, you can enable `use_sudo`.
|
|
||||||
Add a file like this in `/etc/sudoers.d/` to allow `cc-metric-collector` to run the required command:
|
|
||||||
|
|
||||||
```
|
|
||||||
# Do not log the following sudo commands from monitoring, since this causes a lot of log spam.
|
|
||||||
# However keep log_denied enabled, to detect failures
|
|
||||||
Defaults: monitoring !log_allowed, !pam_session
|
|
||||||
|
|
||||||
# Allow to use lctl
|
|
||||||
monitoring ALL = (root) NOPASSWD:/absolute/path/to/lctl get_param llite.*.stats
|
|
||||||
```
|
|
||||||
@@ -1,33 +1,23 @@
|
|||||||
// Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
|
|
||||||
// All rights reserved. This file is part of cc-lib.
|
|
||||||
// Use of this source code is governed by a MIT-style
|
|
||||||
// license that can be found in the LICENSE file.
|
|
||||||
// additional authors:
|
|
||||||
// Holger Obermaier (NHR@KIT)
|
|
||||||
|
|
||||||
package collectors
|
package collectors
|
||||||
|
|
||||||
import (
|
import (
|
||||||
"bufio"
|
"bufio"
|
||||||
"bytes"
|
|
||||||
"encoding/json"
|
"encoding/json"
|
||||||
|
"errors"
|
||||||
"fmt"
|
"fmt"
|
||||||
"os"
|
"os"
|
||||||
"path/filepath"
|
"path/filepath"
|
||||||
"regexp"
|
"regexp"
|
||||||
"slices"
|
|
||||||
"strconv"
|
"strconv"
|
||||||
"strings"
|
"strings"
|
||||||
"time"
|
"time"
|
||||||
|
|
||||||
cclog "github.com/ClusterCockpit/cc-lib/v2/ccLogger"
|
cclog "github.com/ClusterCockpit/cc-lib/ccLogger"
|
||||||
lp "github.com/ClusterCockpit/cc-lib/v2/ccMessage"
|
lp "github.com/ClusterCockpit/cc-lib/ccMessage"
|
||||||
)
|
)
|
||||||
|
|
||||||
const (
|
const MEMSTATFILE = "/proc/meminfo"
|
||||||
MEMSTATFILE = "/proc/meminfo"
|
const NUMA_MEMSTAT_BASE = "/sys/devices/system/node"
|
||||||
NUMA_MEMSTAT_BASE = "/sys/devices/system/node"
|
|
||||||
)
|
|
||||||
|
|
||||||
type MemstatCollectorConfig struct {
|
type MemstatCollectorConfig struct {
|
||||||
ExcludeMetrics []string `json:"exclude_metrics"`
|
ExcludeMetrics []string `json:"exclude_metrics"`
|
||||||
@@ -42,7 +32,6 @@ type MemstatCollectorNode struct {
|
|||||||
|
|
||||||
type MemstatCollector struct {
|
type MemstatCollector struct {
|
||||||
metricCollector
|
metricCollector
|
||||||
|
|
||||||
stats map[string]int64
|
stats map[string]int64
|
||||||
tags map[string]string
|
tags map[string]string
|
||||||
matches map[string]string
|
matches map[string]string
|
||||||
@@ -62,11 +51,7 @@ func getStats(filename string) map[string]MemstatStats {
|
|||||||
if err != nil {
|
if err != nil {
|
||||||
cclog.Error(err.Error())
|
cclog.Error(err.Error())
|
||||||
}
|
}
|
||||||
defer func() {
|
defer file.Close()
|
||||||
if err := file.Close(); err != nil {
|
|
||||||
cclog.Error(err.Error())
|
|
||||||
}
|
|
||||||
}()
|
|
||||||
|
|
||||||
scanner := bufio.NewScanner(file)
|
scanner := bufio.NewScanner(file)
|
||||||
for scanner.Scan() {
|
for scanner.Scan() {
|
||||||
@@ -95,15 +80,15 @@ func getStats(filename string) map[string]MemstatStats {
|
|||||||
}
|
}
|
||||||
|
|
||||||
func (m *MemstatCollector) Init(config json.RawMessage) error {
|
func (m *MemstatCollector) Init(config json.RawMessage) error {
|
||||||
|
var err error
|
||||||
m.name = "MemstatCollector"
|
m.name = "MemstatCollector"
|
||||||
m.parallel = true
|
m.parallel = true
|
||||||
m.config.NodeStats = true
|
m.config.NodeStats = true
|
||||||
m.config.NumaStats = false
|
m.config.NumaStats = false
|
||||||
if len(config) > 0 {
|
if len(config) > 0 {
|
||||||
d := json.NewDecoder(bytes.NewReader(config))
|
err = json.Unmarshal(config, &m.config)
|
||||||
d.DisallowUnknownFields()
|
if err != nil {
|
||||||
if err := d.Decode(&m.config); err != nil {
|
return err
|
||||||
return fmt.Errorf("%s Init(): Error decoding JSON config: %w", m.name, err)
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
m.meta = map[string]string{"source": m.name, "group": "Memory"}
|
m.meta = map[string]string{"source": m.name, "group": "Memory"}
|
||||||
@@ -111,58 +96,35 @@ func (m *MemstatCollector) Init(config json.RawMessage) error {
|
|||||||
m.matches = make(map[string]string)
|
m.matches = make(map[string]string)
|
||||||
m.tags = map[string]string{"type": "node"}
|
m.tags = map[string]string{"type": "node"}
|
||||||
matches := map[string]string{
|
matches := map[string]string{
|
||||||
"MemTotal": "mem_total",
|
"MemTotal": "mem_total",
|
||||||
"SwapTotal": "swap_total",
|
"SwapTotal": "swap_total",
|
||||||
"SReclaimable": "mem_sreclaimable",
|
"SReclaimable": "mem_sreclaimable",
|
||||||
"Slab": "mem_slab",
|
"Slab": "mem_slab",
|
||||||
"MemFree": "mem_free",
|
"MemFree": "mem_free",
|
||||||
"Buffers": "mem_buffers",
|
"Buffers": "mem_buffers",
|
||||||
"Cached": "mem_cached",
|
"Cached": "mem_cached",
|
||||||
"MemAvailable": "mem_available",
|
"MemAvailable": "mem_available",
|
||||||
"SwapFree": "swap_free",
|
"SwapFree": "swap_free",
|
||||||
"Shmem": "mem_shared",
|
"MemShared": "mem_shared",
|
||||||
"Active": "mem_active",
|
|
||||||
"Inactive": "mem_inactive",
|
|
||||||
"Dirty": "mem_dirty",
|
|
||||||
"Writeback": "mem_writeback",
|
|
||||||
"AnonPages": "mem_anon_pages",
|
|
||||||
"Mapped": "mem_mapped",
|
|
||||||
"VmallocTotal": "mem_vmalloc_total",
|
|
||||||
"AnonHugePages": "mem_anon_hugepages",
|
|
||||||
"ShmemHugePages": "mem_shared_hugepages",
|
|
||||||
"ShmemPmdMapped": "mem_shared_pmd_mapped",
|
|
||||||
"HugePages_Total": "mem_hugepages_total",
|
|
||||||
"HugePages_Free": "mem_hugepages_free",
|
|
||||||
"HugePages_Rsvd": "mem_hugepages_reserved",
|
|
||||||
"HugePages_Surp": "mem_hugepages_surplus",
|
|
||||||
"Hugepagesize": "mem_hugepages_size",
|
|
||||||
"DirectMap4k": "mem_direct_mapped_4k",
|
|
||||||
"DirectMap4M": "mem_direct_mapped_4m",
|
|
||||||
"DirectMap2M": "mem_direct_mapped_2m",
|
|
||||||
"DirectMap1G": "mem_direct_mapped_1g",
|
|
||||||
"Mlocked": "mem_locked",
|
|
||||||
"PageTables": "mem_pagetables",
|
|
||||||
"KernelStack": "mem_kernelstack",
|
|
||||||
}
|
}
|
||||||
for k, v := range matches {
|
for k, v := range matches {
|
||||||
if !slices.Contains(m.config.ExcludeMetrics, k) {
|
_, skip := stringArrayContains(m.config.ExcludeMetrics, k)
|
||||||
|
if !skip {
|
||||||
m.matches[k] = v
|
m.matches[k] = v
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
m.sendMemUsed = false
|
m.sendMemUsed = false
|
||||||
if !slices.Contains(m.config.ExcludeMetrics, "mem_used") {
|
if _, skip := stringArrayContains(m.config.ExcludeMetrics, "mem_used"); !skip {
|
||||||
m.sendMemUsed = true
|
m.sendMemUsed = true
|
||||||
}
|
}
|
||||||
if len(m.matches) == 0 {
|
if len(m.matches) == 0 {
|
||||||
return fmt.Errorf("%s Init(): no metrics to collect", m.name)
|
return errors.New("no metrics to collect")
|
||||||
}
|
|
||||||
if err := m.setup(); err != nil {
|
|
||||||
return fmt.Errorf("%s Init(): setup() call failed: %w", m.name, err)
|
|
||||||
}
|
}
|
||||||
|
m.setup()
|
||||||
|
|
||||||
if m.config.NodeStats {
|
if m.config.NodeStats {
|
||||||
if stats := getStats(MEMSTATFILE); len(stats) == 0 {
|
if stats := getStats(MEMSTATFILE); len(stats) == 0 {
|
||||||
return fmt.Errorf("%s Init(): cannot read data from file %s", m.name, MEMSTATFILE)
|
return fmt.Errorf("cannot read data from file %s", MEMSTATFILE)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -174,7 +136,7 @@ func (m *MemstatCollector) Init(config json.RawMessage) error {
|
|||||||
m.nodefiles = make(map[int]MemstatCollectorNode)
|
m.nodefiles = make(map[int]MemstatCollectorNode)
|
||||||
for _, f := range files {
|
for _, f := range files {
|
||||||
if stats := getStats(f); len(stats) == 0 {
|
if stats := getStats(f); len(stats) == 0 {
|
||||||
return fmt.Errorf("%s Init(): cannot read data from file %s", m.name, f)
|
return fmt.Errorf("cannot read data from file %s", f)
|
||||||
}
|
}
|
||||||
rematch := regex.FindStringSubmatch(f)
|
rematch := regex.FindStringSubmatch(f)
|
||||||
if len(rematch) == 2 {
|
if len(rematch) == 2 {
|
||||||
@@ -184,7 +146,7 @@ func (m *MemstatCollector) Init(config json.RawMessage) error {
|
|||||||
file: f,
|
file: f,
|
||||||
tags: map[string]string{
|
tags: map[string]string{
|
||||||
"type": "memoryDomain",
|
"type": "memoryDomain",
|
||||||
"type-id": strconv.Itoa(id),
|
"type-id": fmt.Sprintf("%d", id),
|
||||||
},
|
},
|
||||||
}
|
}
|
||||||
m.nodefiles[id] = f
|
m.nodefiles[id] = f
|
||||||
@@ -194,7 +156,7 @@ func (m *MemstatCollector) Init(config json.RawMessage) error {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
m.init = true
|
m.init = true
|
||||||
return nil
|
return err
|
||||||
}
|
}
|
||||||
|
|
||||||
func (m *MemstatCollector) Read(interval time.Duration, output chan lp.CCMessage) {
|
func (m *MemstatCollector) Read(interval time.Duration, output chan lp.CCMessage) {
|
||||||
@@ -205,7 +167,7 @@ func (m *MemstatCollector) Read(interval time.Duration, output chan lp.CCMessage
|
|||||||
sendStats := func(stats map[string]MemstatStats, tags map[string]string) {
|
sendStats := func(stats map[string]MemstatStats, tags map[string]string) {
|
||||||
for match, name := range m.matches {
|
for match, name := range m.matches {
|
||||||
var value float64 = 0
|
var value float64 = 0
|
||||||
unit := ""
|
var unit string = ""
|
||||||
if v, ok := stats[match]; ok {
|
if v, ok := stats[match]; ok {
|
||||||
value = v.value
|
value = v.value
|
||||||
if len(v.unit) > 0 {
|
if len(v.unit) > 0 {
|
||||||
@@ -213,7 +175,7 @@ func (m *MemstatCollector) Read(interval time.Duration, output chan lp.CCMessage
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
y, err := lp.NewMessage(name, tags, m.meta, map[string]any{"value": value}, time.Now())
|
y, err := lp.NewMessage(name, tags, m.meta, map[string]interface{}{"value": value}, time.Now())
|
||||||
if err == nil {
|
if err == nil {
|
||||||
if len(unit) > 0 {
|
if len(unit) > 0 {
|
||||||
y.AddMeta("unit", unit)
|
y.AddMeta("unit", unit)
|
||||||
@@ -243,16 +205,10 @@ func (m *MemstatCollector) Read(interval time.Duration, output chan lp.CCMessage
|
|||||||
unit = cacheVal.unit
|
unit = cacheVal.unit
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
if shmemVal, shmem := stats["Shmem"]; shmem {
|
|
||||||
memUsed -= shmemVal.value
|
|
||||||
if len(shmemVal.unit) > 0 && len(unit) == 0 {
|
|
||||||
unit = shmemVal.unit
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
y, err := lp.NewMessage("mem_used", tags, m.meta, map[string]any{"value": memUsed}, time.Now())
|
y, err := lp.NewMessage("mem_used", tags, m.meta, map[string]interface{}{"value": memUsed}, time.Now())
|
||||||
if err == nil {
|
if err == nil {
|
||||||
if len(unit) > 0 {
|
if len(unit) > 0 {
|
||||||
y.AddMeta("unit", unit)
|
y.AddMeta("unit", unit)
|
||||||
|
|||||||
@@ -1,14 +1,3 @@
|
|||||||
<!--
|
|
||||||
---
|
|
||||||
title: Memory statistics metric collector
|
|
||||||
description: Collect metrics from `/proc/meminfo`
|
|
||||||
categories: [cc-metric-collector]
|
|
||||||
tags: ['Admin']
|
|
||||||
weight: 2
|
|
||||||
hugo_path: docs/reference/cc-metric-collector/collectors/memstat.md
|
|
||||||
---
|
|
||||||
-->
|
|
||||||
|
|
||||||
|
|
||||||
## `memstat` collector
|
## `memstat` collector
|
||||||
|
|
||||||
@@ -32,29 +21,7 @@ Metrics:
|
|||||||
* `mem_cached`
|
* `mem_cached`
|
||||||
* `mem_available`
|
* `mem_available`
|
||||||
* `mem_shared`
|
* `mem_shared`
|
||||||
* `mem_active`
|
|
||||||
* `mem_inactive`
|
|
||||||
* `mem_dirty`
|
|
||||||
* `mem_writeback`
|
|
||||||
* `mem_anon_pages`
|
|
||||||
* `mem_mapped`
|
|
||||||
* `mem_vmalloc_total`
|
|
||||||
* `mem_anon_hugepages`
|
|
||||||
* `mem_shared_hugepages`
|
|
||||||
* `mem_shared_pmd_mapped`
|
|
||||||
* `mem_hugepages_total`
|
|
||||||
* `mem_hugepages_free`
|
|
||||||
* `mem_hugepages_reserved`
|
|
||||||
* `mem_hugepages_surplus`
|
|
||||||
* `mem_hugepages_size`
|
|
||||||
* `mem_direct_mapped_4k`
|
|
||||||
* `mem_direct_mapped_2m`
|
|
||||||
* `mem_direct_mapped_4m`
|
|
||||||
* `mem_direct_mapped_1g`
|
|
||||||
* `mem_locked`
|
|
||||||
* `mem_pagetables`
|
|
||||||
* `mem_kernelstack`
|
|
||||||
* `swap_total`
|
* `swap_total`
|
||||||
* `swap_free`
|
* `swap_free`
|
||||||
* `mem_used` = `mem_total` - (`mem_free` + `mem_buffers` + `mem_cached` + `mem_shared`)
|
* `mem_used` = `mem_total` - (`mem_free` + `mem_buffers` + `mem_cached`)
|
||||||
|
|
||||||
|
|||||||
@@ -1,10 +1,3 @@
|
|||||||
// Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
|
|
||||||
// All rights reserved. This file is part of cc-lib.
|
|
||||||
// Use of this source code is governed by a MIT-style
|
|
||||||
// license that can be found in the LICENSE file.
|
|
||||||
// additional authors:
|
|
||||||
// Holger Obermaier (NHR@KIT)
|
|
||||||
|
|
||||||
package collectors
|
package collectors
|
||||||
|
|
||||||
import (
|
import (
|
||||||
@@ -12,7 +5,7 @@ import (
|
|||||||
"fmt"
|
"fmt"
|
||||||
"time"
|
"time"
|
||||||
|
|
||||||
lp "github.com/ClusterCockpit/cc-lib/v2/ccMessage"
|
lp "github.com/ClusterCockpit/cc-lib/ccMessage"
|
||||||
)
|
)
|
||||||
|
|
||||||
type MetricCollector interface {
|
type MetricCollector interface {
|
||||||
@@ -51,6 +44,30 @@ func (c *metricCollector) Initialized() bool {
|
|||||||
return c.init
|
return c.init
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// intArrayContains scans an array of ints to check whether the value str is present in the array
|
||||||
|
// If the specified value is found, the corresponding array index is returned.
|
||||||
|
// The bool value is used to signal success or failure
|
||||||
|
func intArrayContains(array []int, str int) (int, bool) {
|
||||||
|
for i, a := range array {
|
||||||
|
if a == str {
|
||||||
|
return i, true
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return -1, false
|
||||||
|
}
|
||||||
|
|
||||||
|
// stringArrayContains scans an array of strings to check whether the value str is present in the array
|
||||||
|
// If the specified value is found, the corresponding array index is returned.
|
||||||
|
// The bool value is used to signal success or failure
|
||||||
|
func stringArrayContains(array []string, str string) (int, bool) {
|
||||||
|
for i, a := range array {
|
||||||
|
if a == str {
|
||||||
|
return i, true
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return -1, false
|
||||||
|
}
|
||||||
|
|
||||||
// RemoveFromStringList removes the string r from the array of strings s
|
// RemoveFromStringList removes the string r from the array of strings s
|
||||||
// If r is not contained in the array an error is returned
|
// If r is not contained in the array an error is returned
|
||||||
func RemoveFromStringList(s []string, r string) ([]string, error) {
|
func RemoveFromStringList(s []string, r string) ([]string, error) {
|
||||||
|
|||||||
@@ -1,25 +1,16 @@
|
|||||||
// Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
|
|
||||||
// All rights reserved. This file is part of cc-lib.
|
|
||||||
// Use of this source code is governed by a MIT-style
|
|
||||||
// license that can be found in the LICENSE file.
|
|
||||||
// additional authors:
|
|
||||||
// Holger Obermaier (NHR@KIT)
|
|
||||||
|
|
||||||
package collectors
|
package collectors
|
||||||
|
|
||||||
import (
|
import (
|
||||||
"bufio"
|
"bufio"
|
||||||
"bytes"
|
|
||||||
"encoding/json"
|
"encoding/json"
|
||||||
"fmt"
|
"errors"
|
||||||
"os"
|
"os"
|
||||||
"slices"
|
|
||||||
"strconv"
|
"strconv"
|
||||||
"strings"
|
"strings"
|
||||||
"time"
|
"time"
|
||||||
|
|
||||||
cclog "github.com/ClusterCockpit/cc-lib/v2/ccLogger"
|
cclog "github.com/ClusterCockpit/cc-lib/ccLogger"
|
||||||
lp "github.com/ClusterCockpit/cc-lib/v2/ccMessage"
|
lp "github.com/ClusterCockpit/cc-lib/ccMessage"
|
||||||
)
|
)
|
||||||
|
|
||||||
const NETSTATFILE = "/proc/net/dev"
|
const NETSTATFILE = "/proc/net/dev"
|
||||||
@@ -42,7 +33,6 @@ type NetstatCollectorMetric struct {
|
|||||||
|
|
||||||
type NetstatCollector struct {
|
type NetstatCollector struct {
|
||||||
metricCollector
|
metricCollector
|
||||||
|
|
||||||
config NetstatCollectorConfig
|
config NetstatCollectorConfig
|
||||||
aliasToCanonical map[string]string
|
aliasToCanonical map[string]string
|
||||||
matches map[string][]NetstatCollectorMetric
|
matches map[string][]NetstatCollectorMetric
|
||||||
@@ -68,9 +58,7 @@ func getCanonicalName(raw string, aliasToCanonical map[string]string) string {
|
|||||||
func (m *NetstatCollector) Init(config json.RawMessage) error {
|
func (m *NetstatCollector) Init(config json.RawMessage) error {
|
||||||
m.name = "NetstatCollector"
|
m.name = "NetstatCollector"
|
||||||
m.parallel = true
|
m.parallel = true
|
||||||
if err := m.setup(); err != nil {
|
m.setup()
|
||||||
return fmt.Errorf("%s Init(): setup() call failed: %w", m.name, err)
|
|
||||||
}
|
|
||||||
m.lastTimestamp = time.Now()
|
m.lastTimestamp = time.Now()
|
||||||
|
|
||||||
const (
|
const (
|
||||||
@@ -100,10 +88,10 @@ func (m *NetstatCollector) Init(config json.RawMessage) error {
|
|||||||
m.config.SendDerivedValues = false
|
m.config.SendDerivedValues = false
|
||||||
// Read configuration file, allow overwriting default config
|
// Read configuration file, allow overwriting default config
|
||||||
if len(config) > 0 {
|
if len(config) > 0 {
|
||||||
d := json.NewDecoder(bytes.NewReader(config))
|
err := json.Unmarshal(config, &m.config)
|
||||||
d.DisallowUnknownFields()
|
if err != nil {
|
||||||
if err := d.Decode(&m.config); err != nil {
|
cclog.ComponentError(m.name, "Error reading config:", err.Error())
|
||||||
return fmt.Errorf("%s Init(): failed to decode JSON config: %w", m.name, err)
|
return err
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -112,8 +100,10 @@ func (m *NetstatCollector) Init(config json.RawMessage) error {
|
|||||||
// Check access to net statistic file
|
// Check access to net statistic file
|
||||||
file, err := os.Open(NETSTATFILE)
|
file, err := os.Open(NETSTATFILE)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return fmt.Errorf("%s Init(): failed to open netstat file \"%s\": %w", m.name, NETSTATFILE, err)
|
cclog.ComponentError(m.name, err.Error())
|
||||||
|
return err
|
||||||
}
|
}
|
||||||
|
defer file.Close()
|
||||||
|
|
||||||
scanner := bufio.NewScanner(file)
|
scanner := bufio.NewScanner(file)
|
||||||
for scanner.Scan() {
|
for scanner.Scan() {
|
||||||
@@ -132,33 +122,13 @@ func (m *NetstatCollector) Init(config json.RawMessage) error {
|
|||||||
canonical := getCanonicalName(raw, m.aliasToCanonical)
|
canonical := getCanonicalName(raw, m.aliasToCanonical)
|
||||||
|
|
||||||
// Check if device is a included device
|
// Check if device is a included device
|
||||||
if slices.Contains(m.config.IncludeDevices, canonical) {
|
if _, ok := stringArrayContains(m.config.IncludeDevices, canonical); ok {
|
||||||
// Tag will contain original device name (raw).
|
// Tag will contain original device name (raw).
|
||||||
tags := map[string]string{
|
tags := map[string]string{"stype": "network", "stype-id": raw, "type": "node"}
|
||||||
"stype": "network",
|
meta_unit_byte := map[string]string{"source": m.name, "group": "Network", "unit": "bytes"}
|
||||||
"stype-id": raw,
|
meta_unit_byte_per_sec := map[string]string{"source": m.name, "group": "Network", "unit": "bytes/sec"}
|
||||||
"type": "node",
|
meta_unit_pkts := map[string]string{"source": m.name, "group": "Network", "unit": "packets"}
|
||||||
}
|
meta_unit_pkts_per_sec := map[string]string{"source": m.name, "group": "Network", "unit": "packets/sec"}
|
||||||
meta_unit_byte := map[string]string{
|
|
||||||
"source": m.name,
|
|
||||||
"group": "Network",
|
|
||||||
"unit": "bytes",
|
|
||||||
}
|
|
||||||
meta_unit_byte_per_sec := map[string]string{
|
|
||||||
"source": m.name,
|
|
||||||
"group": "Network",
|
|
||||||
"unit": "bytes/sec",
|
|
||||||
}
|
|
||||||
meta_unit_pkts := map[string]string{
|
|
||||||
"source": m.name,
|
|
||||||
"group": "Network",
|
|
||||||
"unit": "packets",
|
|
||||||
}
|
|
||||||
meta_unit_pkts_per_sec := map[string]string{
|
|
||||||
"source": m.name,
|
|
||||||
"group": "Network",
|
|
||||||
"unit": "packets/sec",
|
|
||||||
}
|
|
||||||
|
|
||||||
m.matches[canonical] = []NetstatCollectorMetric{
|
m.matches[canonical] = []NetstatCollectorMetric{
|
||||||
{
|
{
|
||||||
@@ -197,13 +167,8 @@ func (m *NetstatCollector) Init(config json.RawMessage) error {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// Close netstat file
|
|
||||||
if err := file.Close(); err != nil {
|
|
||||||
return fmt.Errorf("%s Init(): failed to close netstat file \"%s\": %w", m.name, NETSTATFILE, err)
|
|
||||||
}
|
|
||||||
|
|
||||||
if len(m.matches) == 0 {
|
if len(m.matches) == 0 {
|
||||||
return fmt.Errorf("%s Init(): no devices to collect metrics found", m.name)
|
return errors.New("no devices to collector metrics found")
|
||||||
}
|
}
|
||||||
m.init = true
|
m.init = true
|
||||||
return nil
|
return nil
|
||||||
@@ -222,18 +187,10 @@ func (m *NetstatCollector) Read(interval time.Duration, output chan lp.CCMessage
|
|||||||
|
|
||||||
file, err := os.Open(NETSTATFILE)
|
file, err := os.Open(NETSTATFILE)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
cclog.ComponentError(
|
cclog.ComponentError(m.name, err.Error())
|
||||||
m.name,
|
|
||||||
fmt.Sprintf("Read(): Failed to open file '%s': %v", NETSTATFILE, err))
|
|
||||||
return
|
return
|
||||||
}
|
}
|
||||||
defer func() {
|
defer file.Close()
|
||||||
if err := file.Close(); err != nil {
|
|
||||||
cclog.ComponentError(
|
|
||||||
m.name,
|
|
||||||
fmt.Sprintf("Read(): Failed to close file '%s': %v", NETSTATFILE, err))
|
|
||||||
}
|
|
||||||
}()
|
|
||||||
|
|
||||||
scanner := bufio.NewScanner(file)
|
scanner := bufio.NewScanner(file)
|
||||||
for scanner.Scan() {
|
for scanner.Scan() {
|
||||||
@@ -262,14 +219,14 @@ func (m *NetstatCollector) Read(interval time.Duration, output chan lp.CCMessage
|
|||||||
continue
|
continue
|
||||||
}
|
}
|
||||||
if m.config.SendAbsoluteValues {
|
if m.config.SendAbsoluteValues {
|
||||||
if y, err := lp.NewMessage(metric.name, metric.tags, metric.meta, map[string]any{"value": v}, now); err == nil {
|
if y, err := lp.NewMessage(metric.name, metric.tags, metric.meta, map[string]interface{}{"value": v}, now); err == nil {
|
||||||
output <- y
|
output <- y
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
if m.config.SendDerivedValues {
|
if m.config.SendDerivedValues {
|
||||||
if metric.lastValue >= 0 {
|
if metric.lastValue >= 0 {
|
||||||
rate := float64(v-metric.lastValue) / timeDiff
|
rate := float64(v-metric.lastValue) / timeDiff
|
||||||
if y, err := lp.NewMessage(metric.name+"_bw", metric.tags, metric.meta_rates, map[string]any{"value": rate}, now); err == nil {
|
if y, err := lp.NewMessage(metric.name+"_bw", metric.tags, metric.meta_rates, map[string]interface{}{"value": rate}, now); err == nil {
|
||||||
output <- y
|
output <- y
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -1,13 +1,3 @@
|
|||||||
<!--
|
|
||||||
---
|
|
||||||
title: Network device metric collector
|
|
||||||
description: Collect metrics for network devices through procfs
|
|
||||||
categories: [cc-metric-collector]
|
|
||||||
tags: ['Admin']
|
|
||||||
weight: 2
|
|
||||||
hugo_path: docs/reference/cc-metric-collector/collectors/netstat.md
|
|
||||||
---
|
|
||||||
-->
|
|
||||||
|
|
||||||
## `netstat` collector
|
## `netstat` collector
|
||||||
|
|
||||||
|
|||||||
@@ -1,14 +1,3 @@
|
|||||||
<!--
|
|
||||||
---
|
|
||||||
title: NFS network filesystem (v3) metric collector
|
|
||||||
description: Collect metrics for NFS network filesystems in version 3
|
|
||||||
categories: [cc-metric-collector]
|
|
||||||
tags: ['Admin']
|
|
||||||
weight: 2
|
|
||||||
hugo_path: docs/reference/cc-metric-collector/collectors/nfs3.md
|
|
||||||
---
|
|
||||||
-->
|
|
||||||
|
|
||||||
|
|
||||||
## `nfs3stat` collector
|
## `nfs3stat` collector
|
||||||
|
|
||||||
|
|||||||
@@ -1,14 +1,3 @@
|
|||||||
<!--
|
|
||||||
---
|
|
||||||
title: NFS network filesystem (v4) metric collector
|
|
||||||
description: Collect metrics for NFS network filesystems in version 4
|
|
||||||
categories: [cc-metric-collector]
|
|
||||||
tags: ['Admin']
|
|
||||||
weight: 2
|
|
||||||
hugo_path: docs/reference/cc-metric-collector/collectors/nfs4.md
|
|
||||||
---
|
|
||||||
-->
|
|
||||||
|
|
||||||
|
|
||||||
## `nfs4stat` collector
|
## `nfs4stat` collector
|
||||||
|
|
||||||
|
|||||||
@@ -1,17 +1,9 @@
|
|||||||
// Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
|
|
||||||
// All rights reserved. This file is part of cc-lib.
|
|
||||||
// Use of this source code is governed by a MIT-style
|
|
||||||
// license that can be found in the LICENSE file.
|
|
||||||
// additional authors:
|
|
||||||
// Holger Obermaier (NHR@KIT)
|
|
||||||
|
|
||||||
package collectors
|
package collectors
|
||||||
|
|
||||||
import (
|
import (
|
||||||
"bytes"
|
|
||||||
"encoding/json"
|
"encoding/json"
|
||||||
"fmt"
|
"fmt"
|
||||||
"slices"
|
"log"
|
||||||
|
|
||||||
// "os"
|
// "os"
|
||||||
"os/exec"
|
"os/exec"
|
||||||
@@ -19,8 +11,7 @@ import (
|
|||||||
"strings"
|
"strings"
|
||||||
"time"
|
"time"
|
||||||
|
|
||||||
cclog "github.com/ClusterCockpit/cc-lib/v2/ccLogger"
|
lp "github.com/ClusterCockpit/cc-lib/ccMessage"
|
||||||
lp "github.com/ClusterCockpit/cc-lib/v2/ccMessage"
|
|
||||||
)
|
)
|
||||||
|
|
||||||
// First part contains the code for the general NfsCollector.
|
// First part contains the code for the general NfsCollector.
|
||||||
@@ -35,7 +26,6 @@ type NfsCollectorData struct {
|
|||||||
|
|
||||||
type nfsCollector struct {
|
type nfsCollector struct {
|
||||||
metricCollector
|
metricCollector
|
||||||
|
|
||||||
tags map[string]string
|
tags map[string]string
|
||||||
version string
|
version string
|
||||||
config struct {
|
config struct {
|
||||||
@@ -45,56 +35,68 @@ type nfsCollector struct {
|
|||||||
data map[string]NfsCollectorData
|
data map[string]NfsCollectorData
|
||||||
}
|
}
|
||||||
|
|
||||||
func (m *nfsCollector) updateStats() error {
|
func (m *nfsCollector) initStats() error {
|
||||||
cmd := exec.Command(m.config.Nfsstats, "-l", "--all")
|
cmd := exec.Command(m.config.Nfsstats, `-l`, `--all`)
|
||||||
|
cmd.Wait()
|
||||||
buffer, err := cmd.Output()
|
buffer, err := cmd.Output()
|
||||||
if err != nil {
|
if err == nil {
|
||||||
return err
|
for _, line := range strings.Split(string(buffer), "\n") {
|
||||||
}
|
lf := strings.Fields(line)
|
||||||
|
if len(lf) != 5 {
|
||||||
for name, data := range m.data {
|
continue
|
||||||
m.data[name] = NfsCollectorData{
|
}
|
||||||
last: data.current,
|
if lf[1] == m.version {
|
||||||
current: -1,
|
name := strings.Trim(lf[3], ":")
|
||||||
|
if _, exist := m.data[name]; !exist {
|
||||||
|
value, err := strconv.ParseInt(lf[4], 0, 64)
|
||||||
|
if err == nil {
|
||||||
|
x := m.data[name]
|
||||||
|
x.current = value
|
||||||
|
x.last = value
|
||||||
|
m.data[name] = x
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
|
||||||
for line := range strings.Lines(string(buffer)) {
|
func (m *nfsCollector) updateStats() error {
|
||||||
lf := strings.Fields(line)
|
cmd := exec.Command(m.config.Nfsstats, `-l`, `--all`)
|
||||||
if len(lf) != 5 {
|
cmd.Wait()
|
||||||
continue
|
buffer, err := cmd.Output()
|
||||||
|
if err == nil {
|
||||||
|
for _, line := range strings.Split(string(buffer), "\n") {
|
||||||
|
lf := strings.Fields(line)
|
||||||
|
if len(lf) != 5 {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
if lf[1] == m.version {
|
||||||
|
name := strings.Trim(lf[3], ":")
|
||||||
|
if _, exist := m.data[name]; exist {
|
||||||
|
value, err := strconv.ParseInt(lf[4], 0, 64)
|
||||||
|
if err == nil {
|
||||||
|
x := m.data[name]
|
||||||
|
x.last = x.current
|
||||||
|
x.current = value
|
||||||
|
m.data[name] = x
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
}
|
}
|
||||||
if lf[1] != m.version {
|
|
||||||
continue
|
|
||||||
}
|
|
||||||
name := strings.Trim(lf[3], ":")
|
|
||||||
value, err := strconv.ParseInt(lf[4], 0, 64)
|
|
||||||
if err != nil {
|
|
||||||
return err
|
|
||||||
}
|
|
||||||
|
|
||||||
collectorData, exist := m.data[name]
|
|
||||||
collectorData.current = value
|
|
||||||
|
|
||||||
if !exist {
|
|
||||||
collectorData.last = -1
|
|
||||||
}
|
|
||||||
|
|
||||||
m.data[name] = collectorData
|
|
||||||
}
|
}
|
||||||
|
return err
|
||||||
return nil
|
|
||||||
}
|
}
|
||||||
|
|
||||||
func (m *nfsCollector) MainInit(config json.RawMessage) error {
|
func (m *nfsCollector) MainInit(config json.RawMessage) error {
|
||||||
m.config.Nfsstats = string(NFSSTAT_EXEC)
|
m.config.Nfsstats = string(NFSSTAT_EXEC)
|
||||||
// Read JSON configuration
|
// Read JSON configuration
|
||||||
if len(config) > 0 {
|
if len(config) > 0 {
|
||||||
d := json.NewDecoder(bytes.NewReader(config))
|
err := json.Unmarshal(config, &m.config)
|
||||||
d.DisallowUnknownFields()
|
if err != nil {
|
||||||
if err := d.Decode(&m.config); err != nil {
|
log.Print(err.Error())
|
||||||
return fmt.Errorf("%s Init(): failed to decode JSON config: %w", m.name, err)
|
return err
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
m.meta = map[string]string{
|
m.meta = map[string]string{
|
||||||
@@ -107,12 +109,10 @@ func (m *nfsCollector) MainInit(config json.RawMessage) error {
|
|||||||
// Check if nfsstat is in executable search path
|
// Check if nfsstat is in executable search path
|
||||||
_, err := exec.LookPath(m.config.Nfsstats)
|
_, err := exec.LookPath(m.config.Nfsstats)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return fmt.Errorf("%s Init(): Failed to find nfsstat binary '%s': %w", m.name, m.config.Nfsstats, err)
|
return fmt.Errorf("NfsCollector.Init(): Failed to find nfsstat binary '%s': %v", m.config.Nfsstats, err)
|
||||||
}
|
}
|
||||||
m.data = make(map[string]NfsCollectorData)
|
m.data = make(map[string]NfsCollectorData)
|
||||||
if err := m.updateStats(); err != nil {
|
m.initStats()
|
||||||
return fmt.Errorf("%s Init(): %w", m.name, err)
|
|
||||||
}
|
|
||||||
m.init = true
|
m.init = true
|
||||||
m.parallel = true
|
m.parallel = true
|
||||||
return nil
|
return nil
|
||||||
@@ -124,14 +124,8 @@ func (m *nfsCollector) Read(interval time.Duration, output chan lp.CCMessage) {
|
|||||||
}
|
}
|
||||||
timestamp := time.Now()
|
timestamp := time.Now()
|
||||||
|
|
||||||
if err := m.updateStats(); err != nil {
|
m.updateStats()
|
||||||
cclog.ComponentError(
|
prefix := ""
|
||||||
m.name,
|
|
||||||
fmt.Sprintf("Read(): updateStats() failed: %v", err),
|
|
||||||
)
|
|
||||||
return
|
|
||||||
}
|
|
||||||
var prefix string
|
|
||||||
switch m.version {
|
switch m.version {
|
||||||
case "v3":
|
case "v3":
|
||||||
prefix = "nfs3"
|
prefix = "nfs3"
|
||||||
@@ -142,15 +136,11 @@ func (m *nfsCollector) Read(interval time.Duration, output chan lp.CCMessage) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
for name, data := range m.data {
|
for name, data := range m.data {
|
||||||
if slices.Contains(m.config.ExcludeMetrics, name) {
|
if _, skip := stringArrayContains(m.config.ExcludeMetrics, name); skip {
|
||||||
continue
|
continue
|
||||||
}
|
}
|
||||||
|
value := data.current - data.last
|
||||||
valueMap := make(map[string]any)
|
y, err := lp.NewMessage(fmt.Sprintf("%s_%s", prefix, name), m.tags, m.meta, map[string]interface{}{"value": value}, timestamp)
|
||||||
if data.current >= 0 && data.last >= 0 {
|
|
||||||
valueMap["value"] = data.current - data.last
|
|
||||||
}
|
|
||||||
y, err := lp.NewMessage(fmt.Sprintf("%s_%s", prefix, name), m.tags, m.meta, valueMap, timestamp)
|
|
||||||
if err == nil {
|
if err == nil {
|
||||||
y.AddMeta("version", m.version)
|
y.AddMeta("version", m.version)
|
||||||
output <- y
|
output <- y
|
||||||
@@ -173,17 +163,13 @@ type Nfs4Collector struct {
|
|||||||
func (m *Nfs3Collector) Init(config json.RawMessage) error {
|
func (m *Nfs3Collector) Init(config json.RawMessage) error {
|
||||||
m.name = "Nfs3Collector"
|
m.name = "Nfs3Collector"
|
||||||
m.version = `v3`
|
m.version = `v3`
|
||||||
if err := m.setup(); err != nil {
|
m.setup()
|
||||||
return fmt.Errorf("%s Init(): setup() call failed: %w", m.name, err)
|
|
||||||
}
|
|
||||||
return m.MainInit(config)
|
return m.MainInit(config)
|
||||||
}
|
}
|
||||||
|
|
||||||
func (m *Nfs4Collector) Init(config json.RawMessage) error {
|
func (m *Nfs4Collector) Init(config json.RawMessage) error {
|
||||||
m.name = "Nfs4Collector"
|
m.name = "Nfs4Collector"
|
||||||
m.version = `v4`
|
m.version = `v4`
|
||||||
if err := m.setup(); err != nil {
|
m.setup()
|
||||||
return fmt.Errorf("%s Init(): setup() call failed: %w", m.name, err)
|
|
||||||
}
|
|
||||||
return m.MainInit(config)
|
return m.MainInit(config)
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -1,30 +1,22 @@
|
|||||||
// Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
|
|
||||||
// All rights reserved. This file is part of cc-lib.
|
|
||||||
// Use of this source code is governed by a MIT-style
|
|
||||||
// license that can be found in the LICENSE file.
|
|
||||||
// additional authors:
|
|
||||||
// Holger Obermaier (NHR@KIT)
|
|
||||||
|
|
||||||
package collectors
|
package collectors
|
||||||
|
|
||||||
import (
|
import (
|
||||||
"bytes"
|
|
||||||
"encoding/json"
|
"encoding/json"
|
||||||
"fmt"
|
"fmt"
|
||||||
"os"
|
"os"
|
||||||
"regexp"
|
"regexp"
|
||||||
"slices"
|
|
||||||
"strconv"
|
"strconv"
|
||||||
"strings"
|
"strings"
|
||||||
"time"
|
"time"
|
||||||
|
|
||||||
lp "github.com/ClusterCockpit/cc-lib/v2/ccMessage"
|
cclog "github.com/ClusterCockpit/cc-lib/ccLogger"
|
||||||
|
lp "github.com/ClusterCockpit/cc-lib/ccMessage"
|
||||||
)
|
)
|
||||||
|
|
||||||
// These are the fields we read from the JSON configuration
|
// These are the fields we read from the JSON configuration
|
||||||
type NfsIOStatCollectorConfig struct {
|
type NfsIOStatCollectorConfig struct {
|
||||||
ExcludeMetrics []string `json:"exclude_metrics,omitempty"`
|
ExcludeMetrics []string `json:"exclude_metrics,omitempty"`
|
||||||
ExcludeFilesystems []string `json:"exclude_filesystem,omitempty"`
|
ExcludeFilesystem []string `json:"exclude_filesystem,omitempty"`
|
||||||
UseServerAddressAsSType bool `json:"use_server_as_stype,omitempty"`
|
UseServerAddressAsSType bool `json:"use_server_as_stype,omitempty"`
|
||||||
SendAbsoluteValues bool `json:"send_abs_values"`
|
SendAbsoluteValues bool `json:"send_abs_values"`
|
||||||
SendDerivedValues bool `json:"send_derived_values"`
|
SendDerivedValues bool `json:"send_derived_values"`
|
||||||
@@ -34,7 +26,6 @@ type NfsIOStatCollectorConfig struct {
|
|||||||
// defined by metricCollector (name, init, ...)
|
// defined by metricCollector (name, init, ...)
|
||||||
type NfsIOStatCollector struct {
|
type NfsIOStatCollector struct {
|
||||||
metricCollector
|
metricCollector
|
||||||
|
|
||||||
config NfsIOStatCollectorConfig // the configuration structure
|
config NfsIOStatCollectorConfig // the configuration structure
|
||||||
meta map[string]string // default meta information
|
meta map[string]string // default meta information
|
||||||
tags map[string]string // default tags
|
tags map[string]string // default tags
|
||||||
@@ -43,10 +34,8 @@ type NfsIOStatCollector struct {
|
|||||||
lastTimestamp time.Time
|
lastTimestamp time.Time
|
||||||
}
|
}
|
||||||
|
|
||||||
var (
|
var deviceRegex = regexp.MustCompile(`device (?P<server>[^ ]+) mounted on (?P<mntpoint>[^ ]+) with fstype nfs(?P<version>\d*) statvers=[\d\.]+`)
|
||||||
deviceRegex = regexp.MustCompile(`device (?P<server>[^ ]+) mounted on (?P<mntpoint>[^ ]+) with fstype nfs(?P<version>\d*) statvers=[\d\.]+`)
|
var bytesRegex = regexp.MustCompile(`\s+bytes:\s+(?P<nread>[^ ]+) (?P<nwrite>[^ ]+) (?P<dread>[^ ]+) (?P<dwrite>[^ ]+) (?P<nfsread>[^ ]+) (?P<nfswrite>[^ ]+) (?P<pageread>[^ ]+) (?P<pagewrite>[^ ]+)`)
|
||||||
bytesRegex = regexp.MustCompile(`\s+bytes:\s+(?P<nread>[^ ]+) (?P<nwrite>[^ ]+) (?P<dread>[^ ]+) (?P<dwrite>[^ ]+) (?P<nfsread>[^ ]+) (?P<nfswrite>[^ ]+) (?P<pageread>[^ ]+) (?P<pagewrite>[^ ]+)`)
|
|
||||||
)
|
|
||||||
|
|
||||||
func resolve_regex_fields(s string, regex *regexp.Regexp) map[string]string {
|
func resolve_regex_fields(s string, regex *regexp.Regexp) map[string]string {
|
||||||
fields := make(map[string]string)
|
fields := make(map[string]string)
|
||||||
@@ -75,7 +64,7 @@ func (m *NfsIOStatCollector) readNfsiostats() map[string]map[string]int64 {
|
|||||||
// Is this a device line with mount point, remote target and NFS version?
|
// Is this a device line with mount point, remote target and NFS version?
|
||||||
dev := resolve_regex_fields(l, deviceRegex)
|
dev := resolve_regex_fields(l, deviceRegex)
|
||||||
if len(dev) > 0 {
|
if len(dev) > 0 {
|
||||||
if !slices.Contains(m.config.ExcludeFilesystems, dev[m.key]) {
|
if _, ok := stringArrayContains(m.config.ExcludeFilesystem, dev[m.key]); !ok {
|
||||||
current = dev
|
current = dev
|
||||||
if len(current["version"]) == 0 {
|
if len(current["version"]) == 0 {
|
||||||
current["version"] = "3"
|
current["version"] = "3"
|
||||||
@@ -89,7 +78,7 @@ func (m *NfsIOStatCollector) readNfsiostats() map[string]map[string]int64 {
|
|||||||
if len(bytes) > 0 {
|
if len(bytes) > 0 {
|
||||||
data[current[m.key]] = make(map[string]int64)
|
data[current[m.key]] = make(map[string]int64)
|
||||||
for name, sval := range bytes {
|
for name, sval := range bytes {
|
||||||
if !slices.Contains(m.config.ExcludeMetrics, name) {
|
if _, ok := stringArrayContains(m.config.ExcludeMetrics, name); !ok {
|
||||||
val, err := strconv.ParseInt(sval, 10, 64)
|
val, err := strconv.ParseInt(sval, 10, 64)
|
||||||
if err == nil {
|
if err == nil {
|
||||||
data[current[m.key]][name] = val
|
data[current[m.key]][name] = val
|
||||||
@@ -104,10 +93,9 @@ func (m *NfsIOStatCollector) readNfsiostats() map[string]map[string]int64 {
|
|||||||
}
|
}
|
||||||
|
|
||||||
func (m *NfsIOStatCollector) Init(config json.RawMessage) error {
|
func (m *NfsIOStatCollector) Init(config json.RawMessage) error {
|
||||||
|
var err error = nil
|
||||||
m.name = "NfsIOStatCollector"
|
m.name = "NfsIOStatCollector"
|
||||||
if err := m.setup(); err != nil {
|
m.setup()
|
||||||
return fmt.Errorf("%s Init(): setup() call failed: %w", m.name, err)
|
|
||||||
}
|
|
||||||
m.parallel = true
|
m.parallel = true
|
||||||
m.meta = map[string]string{"source": m.name, "group": "NFS", "unit": "bytes"}
|
m.meta = map[string]string{"source": m.name, "group": "NFS", "unit": "bytes"}
|
||||||
m.tags = map[string]string{"type": "node"}
|
m.tags = map[string]string{"type": "node"}
|
||||||
@@ -116,10 +104,10 @@ func (m *NfsIOStatCollector) Init(config json.RawMessage) error {
|
|||||||
m.config.SendAbsoluteValues = true
|
m.config.SendAbsoluteValues = true
|
||||||
m.config.SendDerivedValues = false
|
m.config.SendDerivedValues = false
|
||||||
if len(config) > 0 {
|
if len(config) > 0 {
|
||||||
d := json.NewDecoder(bytes.NewReader(config))
|
err = json.Unmarshal(config, &m.config)
|
||||||
d.DisallowUnknownFields()
|
if err != nil {
|
||||||
if err := d.Decode(&m.config); err != nil {
|
cclog.ComponentError(m.name, "Error reading config:", err.Error())
|
||||||
return fmt.Errorf("%s Init(): failed to decode JSON config: %w", m.name, err)
|
return err
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
m.key = "mntpoint"
|
m.key = "mntpoint"
|
||||||
@@ -129,7 +117,7 @@ func (m *NfsIOStatCollector) Init(config json.RawMessage) error {
|
|||||||
m.data = m.readNfsiostats()
|
m.data = m.readNfsiostats()
|
||||||
m.lastTimestamp = time.Now()
|
m.lastTimestamp = time.Now()
|
||||||
m.init = true
|
m.init = true
|
||||||
return nil
|
return err
|
||||||
}
|
}
|
||||||
|
|
||||||
func (m *NfsIOStatCollector) Read(interval time.Duration, output chan lp.CCMessage) {
|
func (m *NfsIOStatCollector) Read(interval time.Duration, output chan lp.CCMessage) {
|
||||||
@@ -145,14 +133,7 @@ func (m *NfsIOStatCollector) Read(interval time.Duration, output chan lp.CCMessa
|
|||||||
if old, ok := m.data[mntpoint]; ok {
|
if old, ok := m.data[mntpoint]; ok {
|
||||||
for name, newVal := range values {
|
for name, newVal := range values {
|
||||||
if m.config.SendAbsoluteValues {
|
if m.config.SendAbsoluteValues {
|
||||||
msg, err := lp.NewMessage(
|
msg, err := lp.NewMessage(fmt.Sprintf("nfsio_%s", name), m.tags, m.meta, map[string]interface{}{"value": newVal}, now)
|
||||||
"nfsio_"+name,
|
|
||||||
m.tags,
|
|
||||||
m.meta,
|
|
||||||
map[string]any{
|
|
||||||
"value": newVal,
|
|
||||||
},
|
|
||||||
now)
|
|
||||||
if err == nil {
|
if err == nil {
|
||||||
msg.AddTag("stype", "filesystem")
|
msg.AddTag("stype", "filesystem")
|
||||||
msg.AddTag("stype-id", mntpoint)
|
msg.AddTag("stype-id", mntpoint)
|
||||||
@@ -161,7 +142,7 @@ func (m *NfsIOStatCollector) Read(interval time.Duration, output chan lp.CCMessa
|
|||||||
}
|
}
|
||||||
if m.config.SendDerivedValues {
|
if m.config.SendDerivedValues {
|
||||||
rate := float64(newVal-old[name]) / timeDiff
|
rate := float64(newVal-old[name]) / timeDiff
|
||||||
msg, err := lp.NewMessage(fmt.Sprintf("nfsio_%s_bw", name), m.tags, m.meta, map[string]any{"value": rate}, now)
|
msg, err := lp.NewMessage(fmt.Sprintf("nfsio_%s_bw", name), m.tags, m.meta, map[string]interface{}{"value": rate}, now)
|
||||||
if err == nil {
|
if err == nil {
|
||||||
if strings.HasPrefix(name, "page") {
|
if strings.HasPrefix(name, "page") {
|
||||||
msg.AddMeta("unit", "4K_pages/s")
|
msg.AddMeta("unit", "4K_pages/s")
|
||||||
|
|||||||
@@ -1,14 +1,3 @@
|
|||||||
<!--
|
|
||||||
---
|
|
||||||
title: NFS network filesystem metrics from procfs
|
|
||||||
description: Collect NFS network filesystem metrics for mounts from `/proc/self/mountstats`
|
|
||||||
categories: [cc-metric-collector]
|
|
||||||
tags: ['Admin']
|
|
||||||
weight: 2
|
|
||||||
hugo_path: docs/reference/cc-metric-collector/collectors/nfsio.md
|
|
||||||
---
|
|
||||||
-->
|
|
||||||
|
|
||||||
## `nfsiostat` collector
|
## `nfsiostat` collector
|
||||||
|
|
||||||
```json
|
```json
|
||||||
@@ -16,7 +5,7 @@ hugo_path: docs/reference/cc-metric-collector/collectors/nfsio.md
|
|||||||
"exclude_metrics": [
|
"exclude_metrics": [
|
||||||
"oread", "pageread"
|
"oread", "pageread"
|
||||||
],
|
],
|
||||||
"exclude_filesystem": [
|
"exclude_filesystems": [
|
||||||
"/mnt"
|
"/mnt"
|
||||||
],
|
],
|
||||||
"use_server_as_stype": false,
|
"use_server_as_stype": false,
|
||||||
|
|||||||
@@ -2,7 +2,6 @@ package collectors
|
|||||||
|
|
||||||
import (
|
import (
|
||||||
"bufio"
|
"bufio"
|
||||||
"bytes"
|
|
||||||
"encoding/json"
|
"encoding/json"
|
||||||
"fmt"
|
"fmt"
|
||||||
"os"
|
"os"
|
||||||
@@ -11,8 +10,8 @@ import (
|
|||||||
"strings"
|
"strings"
|
||||||
"time"
|
"time"
|
||||||
|
|
||||||
cclog "github.com/ClusterCockpit/cc-lib/v2/ccLogger"
|
cclog "github.com/ClusterCockpit/cc-lib/ccLogger"
|
||||||
lp "github.com/ClusterCockpit/cc-lib/v2/ccMessage"
|
lp "github.com/ClusterCockpit/cc-lib/ccMessage"
|
||||||
)
|
)
|
||||||
|
|
||||||
type NUMAStatsCollectorConfig struct {
|
type NUMAStatsCollectorConfig struct {
|
||||||
@@ -60,7 +59,6 @@ type NUMAStatsCollectorTopolgy struct {
|
|||||||
|
|
||||||
type NUMAStatsCollector struct {
|
type NUMAStatsCollector struct {
|
||||||
metricCollector
|
metricCollector
|
||||||
|
|
||||||
topology []NUMAStatsCollectorTopolgy
|
topology []NUMAStatsCollectorTopolgy
|
||||||
config NUMAStatsCollectorConfig
|
config NUMAStatsCollectorConfig
|
||||||
lastTimestamp time.Time
|
lastTimestamp time.Time
|
||||||
@@ -74,32 +72,21 @@ func (m *NUMAStatsCollector) Init(config json.RawMessage) error {
|
|||||||
|
|
||||||
m.name = "NUMAStatsCollector"
|
m.name = "NUMAStatsCollector"
|
||||||
m.parallel = true
|
m.parallel = true
|
||||||
if err := m.setup(); err != nil {
|
m.setup()
|
||||||
return fmt.Errorf("%s Init(): setup() call failed: %w", m.name, err)
|
|
||||||
}
|
|
||||||
m.meta = map[string]string{
|
m.meta = map[string]string{
|
||||||
"source": m.name,
|
"source": m.name,
|
||||||
"group": "NUMA",
|
"group": "NUMA",
|
||||||
}
|
}
|
||||||
|
|
||||||
m.config.SendAbsoluteValues = true
|
|
||||||
if len(config) > 0 {
|
|
||||||
d := json.NewDecoder(bytes.NewReader(config))
|
|
||||||
d.DisallowUnknownFields()
|
|
||||||
if err := d.Decode(&m.config); err != nil {
|
|
||||||
return fmt.Errorf("%s Init(): Error decoding JSON config: %w", m.name, err)
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
// Loop for all NUMA node directories
|
// Loop for all NUMA node directories
|
||||||
base := "/sys/devices/system/node/node"
|
base := "/sys/devices/system/node/node"
|
||||||
globPattern := base + "[0-9]*"
|
globPattern := base + "[0-9]*"
|
||||||
dirs, err := filepath.Glob(globPattern)
|
dirs, err := filepath.Glob(globPattern)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return fmt.Errorf("%s Init(): unable to glob files with pattern '%s'", m.name, globPattern)
|
return fmt.Errorf("unable to glob files with pattern '%s'", globPattern)
|
||||||
}
|
}
|
||||||
if dirs == nil {
|
if dirs == nil {
|
||||||
return fmt.Errorf("%s Init(): unable to find any files with pattern '%s'", m.name, globPattern)
|
return fmt.Errorf("unable to find any files with pattern '%s'", globPattern)
|
||||||
}
|
}
|
||||||
m.topology = make([]NUMAStatsCollectorTopolgy, 0, len(dirs))
|
m.topology = make([]NUMAStatsCollectorTopolgy, 0, len(dirs))
|
||||||
for _, dir := range dirs {
|
for _, dir := range dirs {
|
||||||
@@ -107,11 +94,8 @@ func (m *NUMAStatsCollector) Init(config json.RawMessage) error {
|
|||||||
file := filepath.Join(dir, "numastat")
|
file := filepath.Join(dir, "numastat")
|
||||||
m.topology = append(m.topology,
|
m.topology = append(m.topology,
|
||||||
NUMAStatsCollectorTopolgy{
|
NUMAStatsCollectorTopolgy{
|
||||||
file: file,
|
file: file,
|
||||||
tagSet: map[string]string{
|
tagSet: map[string]string{"memoryDomain": node},
|
||||||
"type": "memoryDomain",
|
|
||||||
"type-id": node,
|
|
||||||
},
|
|
||||||
previousValues: make(map[string]int64),
|
previousValues: make(map[string]int64),
|
||||||
})
|
})
|
||||||
}
|
}
|
||||||
@@ -161,11 +145,11 @@ func (m *NUMAStatsCollector) Read(interval time.Duration, output chan lp.CCMessa
|
|||||||
}
|
}
|
||||||
|
|
||||||
if m.config.SendAbsoluteValues {
|
if m.config.SendAbsoluteValues {
|
||||||
msg, err := lp.NewMetric(
|
msg, err := lp.NewMessage(
|
||||||
"numastats_"+key,
|
"numastats_"+key,
|
||||||
t.tagSet,
|
t.tagSet,
|
||||||
m.meta,
|
m.meta,
|
||||||
value,
|
map[string]interface{}{"value": value},
|
||||||
now,
|
now,
|
||||||
)
|
)
|
||||||
if err == nil {
|
if err == nil {
|
||||||
@@ -177,11 +161,11 @@ func (m *NUMAStatsCollector) Read(interval time.Duration, output chan lp.CCMessa
|
|||||||
prev, ok := t.previousValues[key]
|
prev, ok := t.previousValues[key]
|
||||||
if ok {
|
if ok {
|
||||||
rate := float64(value-prev) / timeDiff
|
rate := float64(value-prev) / timeDiff
|
||||||
msg, err := lp.NewMetric(
|
msg, err := lp.NewMessage(
|
||||||
"numastats_"+key+"_rate",
|
"numastats_"+key+"_rate",
|
||||||
t.tagSet,
|
t.tagSet,
|
||||||
m.meta,
|
m.meta,
|
||||||
rate,
|
map[string]interface{}{"value": rate},
|
||||||
now,
|
now,
|
||||||
)
|
)
|
||||||
if err == nil {
|
if err == nil {
|
||||||
@@ -191,11 +175,7 @@ func (m *NUMAStatsCollector) Read(interval time.Duration, output chan lp.CCMessa
|
|||||||
t.previousValues[key] = value
|
t.previousValues[key] = value
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
if err := file.Close(); err != nil {
|
file.Close()
|
||||||
cclog.ComponentError(
|
|
||||||
m.name,
|
|
||||||
fmt.Sprintf("Read(): Failed to close file '%s': %v", t.file, err))
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
@@ -1,13 +1,3 @@
|
|||||||
<!--
|
|
||||||
---
|
|
||||||
title: NUMAStat collector
|
|
||||||
description: Collect infos about NUMA domains
|
|
||||||
categories: [cc-metric-collector]
|
|
||||||
tags: ['Admin']
|
|
||||||
weight: 2
|
|
||||||
hugo_path: docs/reference/cc-metric-collector/collectors/numastat.md
|
|
||||||
---
|
|
||||||
-->
|
|
||||||
|
|
||||||
## `numastat` collector
|
## `numastat` collector
|
||||||
|
|
||||||
@@ -15,7 +5,7 @@ hugo_path: docs/reference/cc-metric-collector/collectors/numastat.md
|
|||||||
"numastats": {
|
"numastats": {
|
||||||
"send_abs_values" : true,
|
"send_abs_values" : true,
|
||||||
"send_derived_values" : true
|
"send_derived_values" : true
|
||||||
}
|
}
|
||||||
```
|
```
|
||||||
|
|
||||||
The `numastat` collector reads data from `/sys/devices/system/node/node*/numastat` and outputs a handful **memoryDomain** metrics. See: <https://www.kernel.org/doc/html/latest/admin-guide/numastat.html>
|
The `numastat` collector reads data from `/sys/devices/system/node/node*/numastat` and outputs a handful **memoryDomain** metrics. See: <https://www.kernel.org/doc/html/latest/admin-guide/numastat.html>
|
||||||
|
|||||||
@@ -1,10 +1,3 @@
|
|||||||
// Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
|
|
||||||
// All rights reserved. This file is part of cc-lib.
|
|
||||||
// Use of this source code is governed by a MIT-style
|
|
||||||
// license that can be found in the LICENSE file.
|
|
||||||
// additional authors:
|
|
||||||
// Holger Obermaier (NHR@KIT)
|
|
||||||
|
|
||||||
package collectors
|
package collectors
|
||||||
|
|
||||||
import (
|
import (
|
||||||
@@ -12,14 +5,11 @@ import (
|
|||||||
"errors"
|
"errors"
|
||||||
"fmt"
|
"fmt"
|
||||||
"log"
|
"log"
|
||||||
"maps"
|
|
||||||
"slices"
|
|
||||||
"strconv"
|
|
||||||
"strings"
|
"strings"
|
||||||
"time"
|
"time"
|
||||||
|
|
||||||
cclog "github.com/ClusterCockpit/cc-lib/v2/ccLogger"
|
cclog "github.com/ClusterCockpit/cc-lib/ccLogger"
|
||||||
lp "github.com/ClusterCockpit/cc-lib/v2/ccMessage"
|
lp "github.com/ClusterCockpit/cc-lib/ccMessage"
|
||||||
"github.com/NVIDIA/go-nvml/pkg/nvml"
|
"github.com/NVIDIA/go-nvml/pkg/nvml"
|
||||||
)
|
)
|
||||||
|
|
||||||
@@ -37,17 +27,14 @@ type NvidiaCollectorConfig struct {
|
|||||||
}
|
}
|
||||||
|
|
||||||
type NvidiaCollectorDevice struct {
|
type NvidiaCollectorDevice struct {
|
||||||
device nvml.Device
|
device nvml.Device
|
||||||
excludeMetrics map[string]bool
|
excludeMetrics map[string]bool
|
||||||
tags map[string]string
|
tags map[string]string
|
||||||
meta map[string]string
|
meta map[string]string
|
||||||
lastEnergyReading uint64
|
|
||||||
lastEnergyTimestamp time.Time
|
|
||||||
}
|
}
|
||||||
|
|
||||||
type NvidiaCollector struct {
|
type NvidiaCollector struct {
|
||||||
metricCollector
|
metricCollector
|
||||||
|
|
||||||
config NvidiaCollectorConfig
|
config NvidiaCollectorConfig
|
||||||
gpus []NvidiaCollectorDevice
|
gpus []NvidiaCollectorDevice
|
||||||
num_gpus int
|
num_gpus int
|
||||||
@@ -68,14 +55,11 @@ func (m *NvidiaCollector) Init(config json.RawMessage) error {
|
|||||||
m.config.ProcessMigDevices = false
|
m.config.ProcessMigDevices = false
|
||||||
m.config.UseUuidForMigDevices = false
|
m.config.UseUuidForMigDevices = false
|
||||||
m.config.UseSliceForMigDevices = false
|
m.config.UseSliceForMigDevices = false
|
||||||
if err := m.setup(); err != nil {
|
m.setup()
|
||||||
return fmt.Errorf("%s Init(): setup() call failed: %w", m.name, err)
|
|
||||||
}
|
|
||||||
if len(config) > 0 {
|
if len(config) > 0 {
|
||||||
d := json.NewDecoder(strings.NewReader(string(config)))
|
err = json.Unmarshal(config, &m.config)
|
||||||
d.DisallowUnknownFields()
|
if err != nil {
|
||||||
if err = d.Decode(&m.config); err != nil {
|
return err
|
||||||
return fmt.Errorf("%s Init(): Error decoding JSON config: %w", m.name, err)
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
m.meta = map[string]string{
|
m.meta = map[string]string{
|
||||||
@@ -91,28 +75,32 @@ func (m *NvidiaCollector) Init(config json.RawMessage) error {
|
|||||||
// Error: NVML library not found
|
// Error: NVML library not found
|
||||||
// (nvml.ErrorString can not be used in this case)
|
// (nvml.ErrorString can not be used in this case)
|
||||||
if ret == nvml.ERROR_LIBRARY_NOT_FOUND {
|
if ret == nvml.ERROR_LIBRARY_NOT_FOUND {
|
||||||
return fmt.Errorf("%s Init(): NVML library not found", m.name)
|
err = fmt.Errorf("NVML library not found")
|
||||||
|
cclog.ComponentError(m.name, err.Error())
|
||||||
|
return err
|
||||||
}
|
}
|
||||||
if ret != nvml.SUCCESS {
|
if ret != nvml.SUCCESS {
|
||||||
err = errors.New(nvml.ErrorString(ret))
|
err = errors.New(nvml.ErrorString(ret))
|
||||||
return fmt.Errorf("%s Init(): Unable to initialize NVML: %w", m.name, err)
|
cclog.ComponentError(m.name, "Unable to initialize NVML", err.Error())
|
||||||
|
return err
|
||||||
}
|
}
|
||||||
|
|
||||||
// Number of NVIDIA GPUs
|
// Number of NVIDIA GPUs
|
||||||
num_gpus, ret := nvml.DeviceGetCount()
|
num_gpus, ret := nvml.DeviceGetCount()
|
||||||
if ret != nvml.SUCCESS {
|
if ret != nvml.SUCCESS {
|
||||||
err = errors.New(nvml.ErrorString(ret))
|
err = errors.New(nvml.ErrorString(ret))
|
||||||
return fmt.Errorf("%s Init(): Unable to get device count: %w", m.name, err)
|
cclog.ComponentError(m.name, "Unable to get device count", err.Error())
|
||||||
|
return err
|
||||||
}
|
}
|
||||||
|
|
||||||
// For all GPUs
|
// For all GPUs
|
||||||
idx := 0
|
idx := 0
|
||||||
m.gpus = make([]NvidiaCollectorDevice, num_gpus)
|
m.gpus = make([]NvidiaCollectorDevice, num_gpus)
|
||||||
for i := range num_gpus {
|
for i := 0; i < num_gpus; i++ {
|
||||||
|
|
||||||
// Skip excluded devices by ID
|
// Skip excluded devices by ID
|
||||||
str_i := strconv.Itoa(i)
|
str_i := fmt.Sprintf("%d", i)
|
||||||
if slices.Contains(m.config.ExcludeDevices, str_i) {
|
if _, skip := stringArrayContains(m.config.ExcludeDevices, str_i); skip {
|
||||||
cclog.ComponentDebug(m.name, "Skipping excluded device", str_i)
|
cclog.ComponentDebug(m.name, "Skipping excluded device", str_i)
|
||||||
continue
|
continue
|
||||||
}
|
}
|
||||||
@@ -140,7 +128,7 @@ func (m *NvidiaCollector) Init(config json.RawMessage) error {
|
|||||||
pciInfo.Device)
|
pciInfo.Device)
|
||||||
|
|
||||||
// Skip excluded devices specified by PCI ID
|
// Skip excluded devices specified by PCI ID
|
||||||
if slices.Contains(m.config.ExcludeDevices, pci_id) {
|
if _, skip := stringArrayContains(m.config.ExcludeDevices, pci_id); skip {
|
||||||
cclog.ComponentDebug(m.name, "Skipping excluded device", pci_id)
|
cclog.ComponentDebug(m.name, "Skipping excluded device", pci_id)
|
||||||
continue
|
continue
|
||||||
}
|
}
|
||||||
@@ -161,8 +149,6 @@ func (m *NvidiaCollector) Init(config json.RawMessage) error {
|
|||||||
|
|
||||||
// Add device handle
|
// Add device handle
|
||||||
g.device = device
|
g.device = device
|
||||||
g.lastEnergyReading = 0
|
|
||||||
g.lastEnergyTimestamp = time.Now()
|
|
||||||
|
|
||||||
// Add tags
|
// Add tags
|
||||||
g.tags = map[string]string{
|
g.tags = map[string]string{
|
||||||
@@ -220,25 +206,23 @@ func (m *NvidiaCollector) Init(config json.RawMessage) error {
|
|||||||
return nil
|
return nil
|
||||||
}
|
}
|
||||||
|
|
||||||
func readMemoryInfo(device *NvidiaCollectorDevice, output chan lp.CCMessage) error {
|
func readMemoryInfo(device NvidiaCollectorDevice, output chan lp.CCMessage) error {
|
||||||
if !device.excludeMetrics["nv_fb_mem_total"] || !device.excludeMetrics["nv_fb_mem_used"] || !device.excludeMetrics["nv_fb_mem_reserved"] {
|
if !device.excludeMetrics["nv_fb_mem_total"] || !device.excludeMetrics["nv_fb_mem_used"] || !device.excludeMetrics["nv_fb_mem_reserved"] {
|
||||||
var total uint64
|
var total uint64
|
||||||
var used uint64
|
var used uint64
|
||||||
var reserved uint64 = 0
|
var reserved uint64 = 0
|
||||||
v2 := false
|
var v2 bool = false
|
||||||
meminfo, ret := nvml.DeviceGetMemoryInfo(device.device)
|
meminfo, ret := nvml.DeviceGetMemoryInfo(device.device)
|
||||||
if ret != nvml.SUCCESS {
|
if ret != nvml.SUCCESS {
|
||||||
err := errors.New(nvml.ErrorString(ret))
|
err := errors.New(nvml.ErrorString(ret))
|
||||||
return err
|
return err
|
||||||
}
|
}
|
||||||
// Total physical device memory (in bytes)
|
|
||||||
total = meminfo.Total
|
total = meminfo.Total
|
||||||
// Sum of Reserved and Allocated device memory (in bytes)
|
|
||||||
used = meminfo.Used
|
used = meminfo.Used
|
||||||
|
|
||||||
if !device.excludeMetrics["nv_fb_mem_total"] {
|
if !device.excludeMetrics["nv_fb_mem_total"] {
|
||||||
t := float64(total) / (1024 * 1024)
|
t := float64(total) / (1024 * 1024)
|
||||||
y, err := lp.NewMetric("nv_fb_mem_total", device.tags, device.meta, t, time.Now())
|
y, err := lp.NewMessage("nv_fb_mem_total", device.tags, device.meta, map[string]interface{}{"value": t}, time.Now())
|
||||||
if err == nil {
|
if err == nil {
|
||||||
y.AddMeta("unit", "MByte")
|
y.AddMeta("unit", "MByte")
|
||||||
output <- y
|
output <- y
|
||||||
@@ -247,7 +231,7 @@ func readMemoryInfo(device *NvidiaCollectorDevice, output chan lp.CCMessage) err
|
|||||||
|
|
||||||
if !device.excludeMetrics["nv_fb_mem_used"] {
|
if !device.excludeMetrics["nv_fb_mem_used"] {
|
||||||
f := float64(used) / (1024 * 1024)
|
f := float64(used) / (1024 * 1024)
|
||||||
y, err := lp.NewMetric("nv_fb_mem_used", device.tags, device.meta, f, time.Now())
|
y, err := lp.NewMessage("nv_fb_mem_used", device.tags, device.meta, map[string]interface{}{"value": f}, time.Now())
|
||||||
if err == nil {
|
if err == nil {
|
||||||
y.AddMeta("unit", "MByte")
|
y.AddMeta("unit", "MByte")
|
||||||
output <- y
|
output <- y
|
||||||
@@ -256,7 +240,7 @@ func readMemoryInfo(device *NvidiaCollectorDevice, output chan lp.CCMessage) err
|
|||||||
|
|
||||||
if v2 && !device.excludeMetrics["nv_fb_mem_reserved"] {
|
if v2 && !device.excludeMetrics["nv_fb_mem_reserved"] {
|
||||||
r := float64(reserved) / (1024 * 1024)
|
r := float64(reserved) / (1024 * 1024)
|
||||||
y, err := lp.NewMetric("nv_fb_mem_reserved", device.tags, device.meta, r, time.Now())
|
y, err := lp.NewMessage("nv_fb_mem_reserved", device.tags, device.meta, map[string]interface{}{"value": r}, time.Now())
|
||||||
if err == nil {
|
if err == nil {
|
||||||
y.AddMeta("unit", "MByte")
|
y.AddMeta("unit", "MByte")
|
||||||
output <- y
|
output <- y
|
||||||
@@ -266,7 +250,7 @@ func readMemoryInfo(device *NvidiaCollectorDevice, output chan lp.CCMessage) err
|
|||||||
return nil
|
return nil
|
||||||
}
|
}
|
||||||
|
|
||||||
func readBarMemoryInfo(device *NvidiaCollectorDevice, output chan lp.CCMessage) error {
|
func readBarMemoryInfo(device NvidiaCollectorDevice, output chan lp.CCMessage) error {
|
||||||
if !device.excludeMetrics["nv_bar1_mem_total"] || !device.excludeMetrics["nv_bar1_mem_used"] {
|
if !device.excludeMetrics["nv_bar1_mem_total"] || !device.excludeMetrics["nv_bar1_mem_used"] {
|
||||||
meminfo, ret := nvml.DeviceGetBAR1MemoryInfo(device.device)
|
meminfo, ret := nvml.DeviceGetBAR1MemoryInfo(device.device)
|
||||||
if ret != nvml.SUCCESS {
|
if ret != nvml.SUCCESS {
|
||||||
@@ -275,7 +259,7 @@ func readBarMemoryInfo(device *NvidiaCollectorDevice, output chan lp.CCMessage)
|
|||||||
}
|
}
|
||||||
if !device.excludeMetrics["nv_bar1_mem_total"] {
|
if !device.excludeMetrics["nv_bar1_mem_total"] {
|
||||||
t := float64(meminfo.Bar1Total) / (1024 * 1024)
|
t := float64(meminfo.Bar1Total) / (1024 * 1024)
|
||||||
y, err := lp.NewMetric("nv_bar1_mem_total", device.tags, device.meta, t, time.Now())
|
y, err := lp.NewMessage("nv_bar1_mem_total", device.tags, device.meta, map[string]interface{}{"value": t}, time.Now())
|
||||||
if err == nil {
|
if err == nil {
|
||||||
y.AddMeta("unit", "MByte")
|
y.AddMeta("unit", "MByte")
|
||||||
output <- y
|
output <- y
|
||||||
@@ -283,7 +267,7 @@ func readBarMemoryInfo(device *NvidiaCollectorDevice, output chan lp.CCMessage)
|
|||||||
}
|
}
|
||||||
if !device.excludeMetrics["nv_bar1_mem_used"] {
|
if !device.excludeMetrics["nv_bar1_mem_used"] {
|
||||||
t := float64(meminfo.Bar1Used) / (1024 * 1024)
|
t := float64(meminfo.Bar1Used) / (1024 * 1024)
|
||||||
y, err := lp.NewMetric("nv_bar1_mem_used", device.tags, device.meta, t, time.Now())
|
y, err := lp.NewMessage("nv_bar1_mem_used", device.tags, device.meta, map[string]interface{}{"value": t}, time.Now())
|
||||||
if err == nil {
|
if err == nil {
|
||||||
y.AddMeta("unit", "MByte")
|
y.AddMeta("unit", "MByte")
|
||||||
output <- y
|
output <- y
|
||||||
@@ -293,7 +277,7 @@ func readBarMemoryInfo(device *NvidiaCollectorDevice, output chan lp.CCMessage)
|
|||||||
return nil
|
return nil
|
||||||
}
|
}
|
||||||
|
|
||||||
func readUtilization(device *NvidiaCollectorDevice, output chan lp.CCMessage) error {
|
func readUtilization(device NvidiaCollectorDevice, output chan lp.CCMessage) error {
|
||||||
isMig, ret := nvml.DeviceIsMigDeviceHandle(device.device)
|
isMig, ret := nvml.DeviceIsMigDeviceHandle(device.device)
|
||||||
if ret != nvml.SUCCESS {
|
if ret != nvml.SUCCESS {
|
||||||
err := errors.New(nvml.ErrorString(ret))
|
err := errors.New(nvml.ErrorString(ret))
|
||||||
@@ -317,14 +301,14 @@ func readUtilization(device *NvidiaCollectorDevice, output chan lp.CCMessage) er
|
|||||||
util, ret := nvml.DeviceGetUtilizationRates(device.device)
|
util, ret := nvml.DeviceGetUtilizationRates(device.device)
|
||||||
if ret == nvml.SUCCESS {
|
if ret == nvml.SUCCESS {
|
||||||
if !device.excludeMetrics["nv_util"] {
|
if !device.excludeMetrics["nv_util"] {
|
||||||
y, err := lp.NewMetric("nv_util", device.tags, device.meta, float64(util.Gpu), time.Now())
|
y, err := lp.NewMessage("nv_util", device.tags, device.meta, map[string]interface{}{"value": float64(util.Gpu)}, time.Now())
|
||||||
if err == nil {
|
if err == nil {
|
||||||
y.AddMeta("unit", "%")
|
y.AddMeta("unit", "%")
|
||||||
output <- y
|
output <- y
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
if !device.excludeMetrics["nv_mem_util"] {
|
if !device.excludeMetrics["nv_mem_util"] {
|
||||||
y, err := lp.NewMetric("nv_mem_util", device.tags, device.meta, float64(util.Memory), time.Now())
|
y, err := lp.NewMessage("nv_mem_util", device.tags, device.meta, map[string]interface{}{"value": float64(util.Memory)}, time.Now())
|
||||||
if err == nil {
|
if err == nil {
|
||||||
y.AddMeta("unit", "%")
|
y.AddMeta("unit", "%")
|
||||||
output <- y
|
output <- y
|
||||||
@@ -335,7 +319,7 @@ func readUtilization(device *NvidiaCollectorDevice, output chan lp.CCMessage) er
|
|||||||
return nil
|
return nil
|
||||||
}
|
}
|
||||||
|
|
||||||
func readTemp(device *NvidiaCollectorDevice, output chan lp.CCMessage) error {
|
func readTemp(device NvidiaCollectorDevice, output chan lp.CCMessage) error {
|
||||||
if !device.excludeMetrics["nv_temp"] {
|
if !device.excludeMetrics["nv_temp"] {
|
||||||
// Retrieves the current temperature readings for the device, in degrees C.
|
// Retrieves the current temperature readings for the device, in degrees C.
|
||||||
//
|
//
|
||||||
@@ -344,7 +328,7 @@ func readTemp(device *NvidiaCollectorDevice, output chan lp.CCMessage) error {
|
|||||||
// * NVML_TEMPERATURE_COUNT
|
// * NVML_TEMPERATURE_COUNT
|
||||||
temp, ret := nvml.DeviceGetTemperature(device.device, nvml.TEMPERATURE_GPU)
|
temp, ret := nvml.DeviceGetTemperature(device.device, nvml.TEMPERATURE_GPU)
|
||||||
if ret == nvml.SUCCESS {
|
if ret == nvml.SUCCESS {
|
||||||
y, err := lp.NewMetric("nv_temp", device.tags, device.meta, float64(temp), time.Now())
|
y, err := lp.NewMessage("nv_temp", device.tags, device.meta, map[string]interface{}{"value": float64(temp)}, time.Now())
|
||||||
if err == nil {
|
if err == nil {
|
||||||
y.AddMeta("unit", "degC")
|
y.AddMeta("unit", "degC")
|
||||||
output <- y
|
output <- y
|
||||||
@@ -354,7 +338,7 @@ func readTemp(device *NvidiaCollectorDevice, output chan lp.CCMessage) error {
|
|||||||
return nil
|
return nil
|
||||||
}
|
}
|
||||||
|
|
||||||
func readFan(device *NvidiaCollectorDevice, output chan lp.CCMessage) error {
|
func readFan(device NvidiaCollectorDevice, output chan lp.CCMessage) error {
|
||||||
if !device.excludeMetrics["nv_fan"] {
|
if !device.excludeMetrics["nv_fan"] {
|
||||||
// Retrieves the intended operating speed of the device's fan.
|
// Retrieves the intended operating speed of the device's fan.
|
||||||
//
|
//
|
||||||
@@ -367,7 +351,7 @@ func readFan(device *NvidiaCollectorDevice, output chan lp.CCMessage) error {
|
|||||||
// This value may exceed 100% in certain cases.
|
// This value may exceed 100% in certain cases.
|
||||||
fan, ret := nvml.DeviceGetFanSpeed(device.device)
|
fan, ret := nvml.DeviceGetFanSpeed(device.device)
|
||||||
if ret == nvml.SUCCESS {
|
if ret == nvml.SUCCESS {
|
||||||
y, err := lp.NewMetric("nv_fan", device.tags, device.meta, float64(fan), time.Now())
|
y, err := lp.NewMessage("nv_fan", device.tags, device.meta, map[string]interface{}{"value": float64(fan)}, time.Now())
|
||||||
if err == nil {
|
if err == nil {
|
||||||
y.AddMeta("unit", "%")
|
y.AddMeta("unit", "%")
|
||||||
output <- y
|
output <- y
|
||||||
@@ -377,7 +361,28 @@ func readFan(device *NvidiaCollectorDevice, output chan lp.CCMessage) error {
|
|||||||
return nil
|
return nil
|
||||||
}
|
}
|
||||||
|
|
||||||
func readEccMode(device *NvidiaCollectorDevice, output chan lp.CCMessage) error {
|
// func readFans(device NvidiaCollectorDevice, output chan lp.CCMessage) error {
|
||||||
|
// if !device.excludeMetrics["nv_fan"] {
|
||||||
|
// numFans, ret := nvml.DeviceGetNumFans(device.device)
|
||||||
|
// if ret == nvml.SUCCESS {
|
||||||
|
// for i := 0; i < numFans; i++ {
|
||||||
|
// fan, ret := nvml.DeviceGetFanSpeed_v2(device.device, i)
|
||||||
|
// if ret == nvml.SUCCESS {
|
||||||
|
// y, err := lp.NewMessage("nv_fan", device.tags, device.meta, map[string]interface{}{"value": float64(fan)}, time.Now())
|
||||||
|
// if err == nil {
|
||||||
|
// y.AddMeta("unit", "%")
|
||||||
|
// y.AddTag("stype", "fan")
|
||||||
|
// y.AddTag("stype-id", fmt.Sprintf("%d", i))
|
||||||
|
// output <- y
|
||||||
|
// }
|
||||||
|
// }
|
||||||
|
// }
|
||||||
|
// }
|
||||||
|
// }
|
||||||
|
// return nil
|
||||||
|
// }
|
||||||
|
|
||||||
|
func readEccMode(device NvidiaCollectorDevice, output chan lp.CCMessage) error {
|
||||||
if !device.excludeMetrics["nv_ecc_mode"] {
|
if !device.excludeMetrics["nv_ecc_mode"] {
|
||||||
// Retrieves the current and pending ECC modes for the device.
|
// Retrieves the current and pending ECC modes for the device.
|
||||||
//
|
//
|
||||||
@@ -387,23 +392,22 @@ func readEccMode(device *NvidiaCollectorDevice, output chan lp.CCMessage) error
|
|||||||
// Changing ECC modes requires a reboot.
|
// Changing ECC modes requires a reboot.
|
||||||
// The "pending" ECC mode refers to the target mode following the next reboot.
|
// The "pending" ECC mode refers to the target mode following the next reboot.
|
||||||
_, ecc_pend, ret := nvml.DeviceGetEccMode(device.device)
|
_, ecc_pend, ret := nvml.DeviceGetEccMode(device.device)
|
||||||
switch ret {
|
if ret == nvml.SUCCESS {
|
||||||
case nvml.SUCCESS:
|
|
||||||
var y lp.CCMessage
|
var y lp.CCMessage
|
||||||
var err error
|
var err error
|
||||||
switch ecc_pend {
|
switch ecc_pend {
|
||||||
case nvml.FEATURE_DISABLED:
|
case nvml.FEATURE_DISABLED:
|
||||||
y, err = lp.NewMetric("nv_ecc_mode", device.tags, device.meta, "OFF", time.Now())
|
y, err = lp.NewMessage("nv_ecc_mode", device.tags, device.meta, map[string]interface{}{"value": "OFF"}, time.Now())
|
||||||
case nvml.FEATURE_ENABLED:
|
case nvml.FEATURE_ENABLED:
|
||||||
y, err = lp.NewMetric("nv_ecc_mode", device.tags, device.meta, "ON", time.Now())
|
y, err = lp.NewMessage("nv_ecc_mode", device.tags, device.meta, map[string]interface{}{"value": "ON"}, time.Now())
|
||||||
default:
|
default:
|
||||||
y, err = lp.NewMetric("nv_ecc_mode", device.tags, device.meta, "UNKNOWN", time.Now())
|
y, err = lp.NewMessage("nv_ecc_mode", device.tags, device.meta, map[string]interface{}{"value": "UNKNOWN"}, time.Now())
|
||||||
}
|
}
|
||||||
if err == nil {
|
if err == nil {
|
||||||
output <- y
|
output <- y
|
||||||
}
|
}
|
||||||
case nvml.ERROR_NOT_SUPPORTED:
|
} else if ret == nvml.ERROR_NOT_SUPPORTED {
|
||||||
y, err := lp.NewMetric("nv_ecc_mode", device.tags, device.meta, "N/A", time.Now())
|
y, err := lp.NewMessage("nv_ecc_mode", device.tags, device.meta, map[string]interface{}{"value": "N/A"}, time.Now())
|
||||||
if err == nil {
|
if err == nil {
|
||||||
output <- y
|
output <- y
|
||||||
}
|
}
|
||||||
@@ -412,7 +416,7 @@ func readEccMode(device *NvidiaCollectorDevice, output chan lp.CCMessage) error
|
|||||||
return nil
|
return nil
|
||||||
}
|
}
|
||||||
|
|
||||||
func readPerfState(device *NvidiaCollectorDevice, output chan lp.CCMessage) error {
|
func readPerfState(device NvidiaCollectorDevice, output chan lp.CCMessage) error {
|
||||||
if !device.excludeMetrics["nv_perf_state"] {
|
if !device.excludeMetrics["nv_perf_state"] {
|
||||||
// Retrieves the current performance state for the device.
|
// Retrieves the current performance state for the device.
|
||||||
//
|
//
|
||||||
@@ -423,7 +427,7 @@ func readPerfState(device *NvidiaCollectorDevice, output chan lp.CCMessage) erro
|
|||||||
// 32: Unknown performance state.
|
// 32: Unknown performance state.
|
||||||
pState, ret := nvml.DeviceGetPerformanceState(device.device)
|
pState, ret := nvml.DeviceGetPerformanceState(device.device)
|
||||||
if ret == nvml.SUCCESS {
|
if ret == nvml.SUCCESS {
|
||||||
y, err := lp.NewMetric("nv_perf_state", device.tags, device.meta, fmt.Sprintf("P%d", int(pState)), time.Now())
|
y, err := lp.NewMessage("nv_perf_state", device.tags, device.meta, map[string]interface{}{"value": fmt.Sprintf("P%d", int(pState))}, time.Now())
|
||||||
if err == nil {
|
if err == nil {
|
||||||
output <- y
|
output <- y
|
||||||
}
|
}
|
||||||
@@ -432,16 +436,13 @@ func readPerfState(device *NvidiaCollectorDevice, output chan lp.CCMessage) erro
|
|||||||
return nil
|
return nil
|
||||||
}
|
}
|
||||||
|
|
||||||
func readPowerUsage(device *NvidiaCollectorDevice, output chan lp.CCMessage) error {
|
func readPowerUsage(device NvidiaCollectorDevice, output chan lp.CCMessage) error {
|
||||||
if !device.excludeMetrics["nv_power_usage"] {
|
if !device.excludeMetrics["nv_power_usage"] {
|
||||||
// Retrieves power usage for this GPU in milliwatts and its associated circuitry (e.g. memory)
|
// Retrieves power usage for this GPU in milliwatts and its associated circuitry (e.g. memory)
|
||||||
//
|
//
|
||||||
// On Fermi and Kepler GPUs the reading is accurate to within +/- 5% of current power draw.
|
// On Fermi and Kepler GPUs the reading is accurate to within +/- 5% of current power draw.
|
||||||
// On Ampere (except GA100) or newer GPUs, the API returns power averaged over 1 sec interval.
|
|
||||||
// On GA100 and older architectures, instantaneous power is returned.
|
|
||||||
//
|
//
|
||||||
// It is only available if power management mode is supported.
|
// It is only available if power management mode is supported
|
||||||
|
|
||||||
mode, ret := nvml.DeviceGetPowerManagementMode(device.device)
|
mode, ret := nvml.DeviceGetPowerManagementMode(device.device)
|
||||||
if ret != nvml.SUCCESS {
|
if ret != nvml.SUCCESS {
|
||||||
return nil
|
return nil
|
||||||
@@ -449,7 +450,7 @@ func readPowerUsage(device *NvidiaCollectorDevice, output chan lp.CCMessage) err
|
|||||||
if mode == nvml.FEATURE_ENABLED {
|
if mode == nvml.FEATURE_ENABLED {
|
||||||
power, ret := nvml.DeviceGetPowerUsage(device.device)
|
power, ret := nvml.DeviceGetPowerUsage(device.device)
|
||||||
if ret == nvml.SUCCESS {
|
if ret == nvml.SUCCESS {
|
||||||
y, err := lp.NewMetric("nv_power_usage", device.tags, device.meta, float64(power)/1000, time.Now())
|
y, err := lp.NewMessage("nv_power_usage", device.tags, device.meta, map[string]interface{}{"value": float64(power) / 1000}, time.Now())
|
||||||
if err == nil {
|
if err == nil {
|
||||||
y.AddMeta("unit", "watts")
|
y.AddMeta("unit", "watts")
|
||||||
output <- y
|
output <- y
|
||||||
@@ -460,59 +461,7 @@ func readPowerUsage(device *NvidiaCollectorDevice, output chan lp.CCMessage) err
|
|||||||
return nil
|
return nil
|
||||||
}
|
}
|
||||||
|
|
||||||
func readEnergyConsumption(device *NvidiaCollectorDevice, output chan lp.CCMessage) error {
|
func readClocks(device NvidiaCollectorDevice, output chan lp.CCMessage) error {
|
||||||
// Retrieves total energy consumption for this GPU in millijoules (mJ) since the driver was last reloaded
|
|
||||||
|
|
||||||
// For Volta or newer fully supported devices.
|
|
||||||
if (!device.excludeMetrics["nv_energy"]) && (!device.excludeMetrics["nv_energy_abs"]) && (!device.excludeMetrics["nv_average_power"]) {
|
|
||||||
now := time.Now()
|
|
||||||
mode, ret := nvml.DeviceGetPowerManagementMode(device.device)
|
|
||||||
if ret != nvml.SUCCESS {
|
|
||||||
return nil
|
|
||||||
}
|
|
||||||
if mode == nvml.FEATURE_ENABLED {
|
|
||||||
energy, ret := nvml.DeviceGetTotalEnergyConsumption(device.device)
|
|
||||||
if ret == nvml.SUCCESS {
|
|
||||||
if device.lastEnergyReading != 0 {
|
|
||||||
if !device.excludeMetrics["nv_energy"] {
|
|
||||||
y, err := lp.NewMetric(
|
|
||||||
"nv_energy",
|
|
||||||
device.tags,
|
|
||||||
device.meta,
|
|
||||||
(energy-device.lastEnergyReading)/1000,
|
|
||||||
now)
|
|
||||||
if err == nil {
|
|
||||||
y.AddMeta("unit", "Joules")
|
|
||||||
output <- y
|
|
||||||
}
|
|
||||||
}
|
|
||||||
if !device.excludeMetrics["nv_average_power"] {
|
|
||||||
|
|
||||||
energyDiff := (energy - device.lastEnergyReading) / 1000
|
|
||||||
timeDiff := now.Sub(device.lastEnergyTimestamp)
|
|
||||||
y, err := lp.NewMetric("nv_average_power", device.tags, device.meta, energyDiff/uint64(timeDiff.Seconds()), now)
|
|
||||||
if err == nil {
|
|
||||||
y.AddMeta("unit", "watts")
|
|
||||||
output <- y
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
if !device.excludeMetrics["nv_energy_abs"] {
|
|
||||||
y, err := lp.NewMetric("nv_energy_abs", device.tags, device.meta, energy/1000, now)
|
|
||||||
if err == nil {
|
|
||||||
y.AddMeta("unit", "Joules")
|
|
||||||
output <- y
|
|
||||||
}
|
|
||||||
}
|
|
||||||
device.lastEnergyReading = energy
|
|
||||||
device.lastEnergyTimestamp = time.Now()
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
return nil
|
|
||||||
}
|
|
||||||
|
|
||||||
func readClocks(device *NvidiaCollectorDevice, output chan lp.CCMessage) error {
|
|
||||||
// Retrieves the current clock speeds for the device.
|
// Retrieves the current clock speeds for the device.
|
||||||
//
|
//
|
||||||
// Available clock information:
|
// Available clock information:
|
||||||
@@ -522,7 +471,7 @@ func readClocks(device *NvidiaCollectorDevice, output chan lp.CCMessage) error {
|
|||||||
if !device.excludeMetrics["nv_graphics_clock"] {
|
if !device.excludeMetrics["nv_graphics_clock"] {
|
||||||
graphicsClock, ret := nvml.DeviceGetClockInfo(device.device, nvml.CLOCK_GRAPHICS)
|
graphicsClock, ret := nvml.DeviceGetClockInfo(device.device, nvml.CLOCK_GRAPHICS)
|
||||||
if ret == nvml.SUCCESS {
|
if ret == nvml.SUCCESS {
|
||||||
y, err := lp.NewMetric("nv_graphics_clock", device.tags, device.meta, float64(graphicsClock), time.Now())
|
y, err := lp.NewMessage("nv_graphics_clock", device.tags, device.meta, map[string]interface{}{"value": float64(graphicsClock)}, time.Now())
|
||||||
if err == nil {
|
if err == nil {
|
||||||
y.AddMeta("unit", "MHz")
|
y.AddMeta("unit", "MHz")
|
||||||
output <- y
|
output <- y
|
||||||
@@ -533,7 +482,7 @@ func readClocks(device *NvidiaCollectorDevice, output chan lp.CCMessage) error {
|
|||||||
if !device.excludeMetrics["nv_sm_clock"] {
|
if !device.excludeMetrics["nv_sm_clock"] {
|
||||||
smCock, ret := nvml.DeviceGetClockInfo(device.device, nvml.CLOCK_SM)
|
smCock, ret := nvml.DeviceGetClockInfo(device.device, nvml.CLOCK_SM)
|
||||||
if ret == nvml.SUCCESS {
|
if ret == nvml.SUCCESS {
|
||||||
y, err := lp.NewMetric("nv_sm_clock", device.tags, device.meta, float64(smCock), time.Now())
|
y, err := lp.NewMessage("nv_sm_clock", device.tags, device.meta, map[string]interface{}{"value": float64(smCock)}, time.Now())
|
||||||
if err == nil {
|
if err == nil {
|
||||||
y.AddMeta("unit", "MHz")
|
y.AddMeta("unit", "MHz")
|
||||||
output <- y
|
output <- y
|
||||||
@@ -544,7 +493,7 @@ func readClocks(device *NvidiaCollectorDevice, output chan lp.CCMessage) error {
|
|||||||
if !device.excludeMetrics["nv_mem_clock"] {
|
if !device.excludeMetrics["nv_mem_clock"] {
|
||||||
memClock, ret := nvml.DeviceGetClockInfo(device.device, nvml.CLOCK_MEM)
|
memClock, ret := nvml.DeviceGetClockInfo(device.device, nvml.CLOCK_MEM)
|
||||||
if ret == nvml.SUCCESS {
|
if ret == nvml.SUCCESS {
|
||||||
y, err := lp.NewMetric("nv_mem_clock", device.tags, device.meta, float64(memClock), time.Now())
|
y, err := lp.NewMessage("nv_mem_clock", device.tags, device.meta, map[string]interface{}{"value": float64(memClock)}, time.Now())
|
||||||
if err == nil {
|
if err == nil {
|
||||||
y.AddMeta("unit", "MHz")
|
y.AddMeta("unit", "MHz")
|
||||||
output <- y
|
output <- y
|
||||||
@@ -554,7 +503,7 @@ func readClocks(device *NvidiaCollectorDevice, output chan lp.CCMessage) error {
|
|||||||
if !device.excludeMetrics["nv_video_clock"] {
|
if !device.excludeMetrics["nv_video_clock"] {
|
||||||
memClock, ret := nvml.DeviceGetClockInfo(device.device, nvml.CLOCK_VIDEO)
|
memClock, ret := nvml.DeviceGetClockInfo(device.device, nvml.CLOCK_VIDEO)
|
||||||
if ret == nvml.SUCCESS {
|
if ret == nvml.SUCCESS {
|
||||||
y, err := lp.NewMetric("nv_video_clock", device.tags, device.meta, float64(memClock), time.Now())
|
y, err := lp.NewMessage("nv_video_clock", device.tags, device.meta, map[string]interface{}{"value": float64(memClock)}, time.Now())
|
||||||
if err == nil {
|
if err == nil {
|
||||||
y.AddMeta("unit", "MHz")
|
y.AddMeta("unit", "MHz")
|
||||||
output <- y
|
output <- y
|
||||||
@@ -564,7 +513,7 @@ func readClocks(device *NvidiaCollectorDevice, output chan lp.CCMessage) error {
|
|||||||
return nil
|
return nil
|
||||||
}
|
}
|
||||||
|
|
||||||
func readMaxClocks(device *NvidiaCollectorDevice, output chan lp.CCMessage) error {
|
func readMaxClocks(device NvidiaCollectorDevice, output chan lp.CCMessage) error {
|
||||||
// Retrieves the maximum clock speeds for the device.
|
// Retrieves the maximum clock speeds for the device.
|
||||||
//
|
//
|
||||||
// Available clock information:
|
// Available clock information:
|
||||||
@@ -579,7 +528,7 @@ func readMaxClocks(device *NvidiaCollectorDevice, output chan lp.CCMessage) erro
|
|||||||
if !device.excludeMetrics["nv_max_graphics_clock"] {
|
if !device.excludeMetrics["nv_max_graphics_clock"] {
|
||||||
max_gclk, ret := nvml.DeviceGetMaxClockInfo(device.device, nvml.CLOCK_GRAPHICS)
|
max_gclk, ret := nvml.DeviceGetMaxClockInfo(device.device, nvml.CLOCK_GRAPHICS)
|
||||||
if ret == nvml.SUCCESS {
|
if ret == nvml.SUCCESS {
|
||||||
y, err := lp.NewMetric("nv_max_graphics_clock", device.tags, device.meta, float64(max_gclk), time.Now())
|
y, err := lp.NewMessage("nv_max_graphics_clock", device.tags, device.meta, map[string]interface{}{"value": float64(max_gclk)}, time.Now())
|
||||||
if err == nil {
|
if err == nil {
|
||||||
y.AddMeta("unit", "MHz")
|
y.AddMeta("unit", "MHz")
|
||||||
output <- y
|
output <- y
|
||||||
@@ -588,9 +537,9 @@ func readMaxClocks(device *NvidiaCollectorDevice, output chan lp.CCMessage) erro
|
|||||||
}
|
}
|
||||||
|
|
||||||
if !device.excludeMetrics["nv_max_sm_clock"] {
|
if !device.excludeMetrics["nv_max_sm_clock"] {
|
||||||
maxSmClock, ret := nvml.DeviceGetMaxClockInfo(device.device, nvml.CLOCK_SM)
|
maxSmClock, ret := nvml.DeviceGetClockInfo(device.device, nvml.CLOCK_SM)
|
||||||
if ret == nvml.SUCCESS {
|
if ret == nvml.SUCCESS {
|
||||||
y, err := lp.NewMetric("nv_max_sm_clock", device.tags, device.meta, float64(maxSmClock), time.Now())
|
y, err := lp.NewMessage("nv_max_sm_clock", device.tags, device.meta, map[string]interface{}{"value": float64(maxSmClock)}, time.Now())
|
||||||
if err == nil {
|
if err == nil {
|
||||||
y.AddMeta("unit", "MHz")
|
y.AddMeta("unit", "MHz")
|
||||||
output <- y
|
output <- y
|
||||||
@@ -599,9 +548,9 @@ func readMaxClocks(device *NvidiaCollectorDevice, output chan lp.CCMessage) erro
|
|||||||
}
|
}
|
||||||
|
|
||||||
if !device.excludeMetrics["nv_max_mem_clock"] {
|
if !device.excludeMetrics["nv_max_mem_clock"] {
|
||||||
maxMemClock, ret := nvml.DeviceGetMaxClockInfo(device.device, nvml.CLOCK_MEM)
|
maxMemClock, ret := nvml.DeviceGetClockInfo(device.device, nvml.CLOCK_MEM)
|
||||||
if ret == nvml.SUCCESS {
|
if ret == nvml.SUCCESS {
|
||||||
y, err := lp.NewMetric("nv_max_mem_clock", device.tags, device.meta, float64(maxMemClock), time.Now())
|
y, err := lp.NewMessage("nv_max_mem_clock", device.tags, device.meta, map[string]interface{}{"value": float64(maxMemClock)}, time.Now())
|
||||||
if err == nil {
|
if err == nil {
|
||||||
y.AddMeta("unit", "MHz")
|
y.AddMeta("unit", "MHz")
|
||||||
output <- y
|
output <- y
|
||||||
@@ -610,9 +559,9 @@ func readMaxClocks(device *NvidiaCollectorDevice, output chan lp.CCMessage) erro
|
|||||||
}
|
}
|
||||||
|
|
||||||
if !device.excludeMetrics["nv_max_video_clock"] {
|
if !device.excludeMetrics["nv_max_video_clock"] {
|
||||||
maxVideoClock, ret := nvml.DeviceGetMaxClockInfo(device.device, nvml.CLOCK_VIDEO)
|
maxMemClock, ret := nvml.DeviceGetClockInfo(device.device, nvml.CLOCK_VIDEO)
|
||||||
if ret == nvml.SUCCESS {
|
if ret == nvml.SUCCESS {
|
||||||
y, err := lp.NewMetric("nv_max_video_clock", device.tags, device.meta, float64(maxVideoClock), time.Now())
|
y, err := lp.NewMessage("nv_max_video_clock", device.tags, device.meta, map[string]interface{}{"value": float64(maxMemClock)}, time.Now())
|
||||||
if err == nil {
|
if err == nil {
|
||||||
y.AddMeta("unit", "MHz")
|
y.AddMeta("unit", "MHz")
|
||||||
output <- y
|
output <- y
|
||||||
@@ -622,7 +571,7 @@ func readMaxClocks(device *NvidiaCollectorDevice, output chan lp.CCMessage) erro
|
|||||||
return nil
|
return nil
|
||||||
}
|
}
|
||||||
|
|
||||||
func readEccErrors(device *NvidiaCollectorDevice, output chan lp.CCMessage) error {
|
func readEccErrors(device NvidiaCollectorDevice, output chan lp.CCMessage) error {
|
||||||
if !device.excludeMetrics["nv_ecc_uncorrected_error"] {
|
if !device.excludeMetrics["nv_ecc_uncorrected_error"] {
|
||||||
// Retrieves the total ECC error counts for the device.
|
// Retrieves the total ECC error counts for the device.
|
||||||
//
|
//
|
||||||
@@ -635,7 +584,7 @@ func readEccErrors(device *NvidiaCollectorDevice, output chan lp.CCMessage) erro
|
|||||||
// i.e. the total set of errors across the entire device.
|
// i.e. the total set of errors across the entire device.
|
||||||
ecc_db, ret := nvml.DeviceGetTotalEccErrors(device.device, nvml.MEMORY_ERROR_TYPE_UNCORRECTED, nvml.AGGREGATE_ECC)
|
ecc_db, ret := nvml.DeviceGetTotalEccErrors(device.device, nvml.MEMORY_ERROR_TYPE_UNCORRECTED, nvml.AGGREGATE_ECC)
|
||||||
if ret == nvml.SUCCESS {
|
if ret == nvml.SUCCESS {
|
||||||
y, err := lp.NewMetric("nv_ecc_uncorrected_error", device.tags, device.meta, float64(ecc_db), time.Now())
|
y, err := lp.NewMessage("nv_ecc_uncorrected_error", device.tags, device.meta, map[string]interface{}{"value": float64(ecc_db)}, time.Now())
|
||||||
if err == nil {
|
if err == nil {
|
||||||
output <- y
|
output <- y
|
||||||
}
|
}
|
||||||
@@ -644,7 +593,7 @@ func readEccErrors(device *NvidiaCollectorDevice, output chan lp.CCMessage) erro
|
|||||||
if !device.excludeMetrics["nv_ecc_corrected_error"] {
|
if !device.excludeMetrics["nv_ecc_corrected_error"] {
|
||||||
ecc_sb, ret := nvml.DeviceGetTotalEccErrors(device.device, nvml.MEMORY_ERROR_TYPE_CORRECTED, nvml.AGGREGATE_ECC)
|
ecc_sb, ret := nvml.DeviceGetTotalEccErrors(device.device, nvml.MEMORY_ERROR_TYPE_CORRECTED, nvml.AGGREGATE_ECC)
|
||||||
if ret == nvml.SUCCESS {
|
if ret == nvml.SUCCESS {
|
||||||
y, err := lp.NewMetric("nv_ecc_corrected_error", device.tags, device.meta, float64(ecc_sb), time.Now())
|
y, err := lp.NewMessage("nv_ecc_corrected_error", device.tags, device.meta, map[string]interface{}{"value": float64(ecc_sb)}, time.Now())
|
||||||
if err == nil {
|
if err == nil {
|
||||||
output <- y
|
output <- y
|
||||||
}
|
}
|
||||||
@@ -653,7 +602,7 @@ func readEccErrors(device *NvidiaCollectorDevice, output chan lp.CCMessage) erro
|
|||||||
return nil
|
return nil
|
||||||
}
|
}
|
||||||
|
|
||||||
func readPowerLimit(device *NvidiaCollectorDevice, output chan lp.CCMessage) error {
|
func readPowerLimit(device NvidiaCollectorDevice, output chan lp.CCMessage) error {
|
||||||
if !device.excludeMetrics["nv_power_max_limit"] {
|
if !device.excludeMetrics["nv_power_max_limit"] {
|
||||||
// Retrieves the power management limit associated with this device.
|
// Retrieves the power management limit associated with this device.
|
||||||
//
|
//
|
||||||
@@ -663,7 +612,7 @@ func readPowerLimit(device *NvidiaCollectorDevice, output chan lp.CCMessage) err
|
|||||||
// If the card's total power draw reaches this limit the power management algorithm kicks in.
|
// If the card's total power draw reaches this limit the power management algorithm kicks in.
|
||||||
pwr_limit, ret := nvml.DeviceGetPowerManagementLimit(device.device)
|
pwr_limit, ret := nvml.DeviceGetPowerManagementLimit(device.device)
|
||||||
if ret == nvml.SUCCESS {
|
if ret == nvml.SUCCESS {
|
||||||
y, err := lp.NewMetric("nv_power_max_limit", device.tags, device.meta, float64(pwr_limit)/1000, time.Now())
|
y, err := lp.NewMessage("nv_power_max_limit", device.tags, device.meta, map[string]interface{}{"value": float64(pwr_limit) / 1000}, time.Now())
|
||||||
if err == nil {
|
if err == nil {
|
||||||
y.AddMeta("unit", "watts")
|
y.AddMeta("unit", "watts")
|
||||||
output <- y
|
output <- y
|
||||||
@@ -673,7 +622,7 @@ func readPowerLimit(device *NvidiaCollectorDevice, output chan lp.CCMessage) err
|
|||||||
return nil
|
return nil
|
||||||
}
|
}
|
||||||
|
|
||||||
func readEncUtilization(device *NvidiaCollectorDevice, output chan lp.CCMessage) error {
|
func readEncUtilization(device NvidiaCollectorDevice, output chan lp.CCMessage) error {
|
||||||
isMig, ret := nvml.DeviceIsMigDeviceHandle(device.device)
|
isMig, ret := nvml.DeviceIsMigDeviceHandle(device.device)
|
||||||
if ret != nvml.SUCCESS {
|
if ret != nvml.SUCCESS {
|
||||||
err := errors.New(nvml.ErrorString(ret))
|
err := errors.New(nvml.ErrorString(ret))
|
||||||
@@ -690,7 +639,7 @@ func readEncUtilization(device *NvidiaCollectorDevice, output chan lp.CCMessage)
|
|||||||
// Note: On MIG-enabled GPUs, querying encoder utilization is not currently supported.
|
// Note: On MIG-enabled GPUs, querying encoder utilization is not currently supported.
|
||||||
enc_util, _, ret := nvml.DeviceGetEncoderUtilization(device.device)
|
enc_util, _, ret := nvml.DeviceGetEncoderUtilization(device.device)
|
||||||
if ret == nvml.SUCCESS {
|
if ret == nvml.SUCCESS {
|
||||||
y, err := lp.NewMetric("nv_encoder_util", device.tags, device.meta, float64(enc_util), time.Now())
|
y, err := lp.NewMessage("nv_encoder_util", device.tags, device.meta, map[string]interface{}{"value": float64(enc_util)}, time.Now())
|
||||||
if err == nil {
|
if err == nil {
|
||||||
y.AddMeta("unit", "%")
|
y.AddMeta("unit", "%")
|
||||||
output <- y
|
output <- y
|
||||||
@@ -700,7 +649,7 @@ func readEncUtilization(device *NvidiaCollectorDevice, output chan lp.CCMessage)
|
|||||||
return nil
|
return nil
|
||||||
}
|
}
|
||||||
|
|
||||||
func readDecUtilization(device *NvidiaCollectorDevice, output chan lp.CCMessage) error {
|
func readDecUtilization(device NvidiaCollectorDevice, output chan lp.CCMessage) error {
|
||||||
isMig, ret := nvml.DeviceIsMigDeviceHandle(device.device)
|
isMig, ret := nvml.DeviceIsMigDeviceHandle(device.device)
|
||||||
if ret != nvml.SUCCESS {
|
if ret != nvml.SUCCESS {
|
||||||
err := errors.New(nvml.ErrorString(ret))
|
err := errors.New(nvml.ErrorString(ret))
|
||||||
@@ -717,7 +666,7 @@ func readDecUtilization(device *NvidiaCollectorDevice, output chan lp.CCMessage)
|
|||||||
// Note: On MIG-enabled GPUs, querying encoder utilization is not currently supported.
|
// Note: On MIG-enabled GPUs, querying encoder utilization is not currently supported.
|
||||||
dec_util, _, ret := nvml.DeviceGetDecoderUtilization(device.device)
|
dec_util, _, ret := nvml.DeviceGetDecoderUtilization(device.device)
|
||||||
if ret == nvml.SUCCESS {
|
if ret == nvml.SUCCESS {
|
||||||
y, err := lp.NewMetric("nv_decoder_util", device.tags, device.meta, float64(dec_util), time.Now())
|
y, err := lp.NewMessage("nv_decoder_util", device.tags, device.meta, map[string]interface{}{"value": float64(dec_util)}, time.Now())
|
||||||
if err == nil {
|
if err == nil {
|
||||||
y.AddMeta("unit", "%")
|
y.AddMeta("unit", "%")
|
||||||
output <- y
|
output <- y
|
||||||
@@ -727,7 +676,7 @@ func readDecUtilization(device *NvidiaCollectorDevice, output chan lp.CCMessage)
|
|||||||
return nil
|
return nil
|
||||||
}
|
}
|
||||||
|
|
||||||
func readRemappedRows(device *NvidiaCollectorDevice, output chan lp.CCMessage) error {
|
func readRemappedRows(device NvidiaCollectorDevice, output chan lp.CCMessage) error {
|
||||||
if !device.excludeMetrics["nv_remapped_rows_corrected"] ||
|
if !device.excludeMetrics["nv_remapped_rows_corrected"] ||
|
||||||
!device.excludeMetrics["nv_remapped_rows_uncorrected"] ||
|
!device.excludeMetrics["nv_remapped_rows_uncorrected"] ||
|
||||||
!device.excludeMetrics["nv_remapped_rows_pending"] ||
|
!device.excludeMetrics["nv_remapped_rows_pending"] ||
|
||||||
@@ -744,33 +693,33 @@ func readRemappedRows(device *NvidiaCollectorDevice, output chan lp.CCMessage) e
|
|||||||
corrected, uncorrected, pending, failure, ret := nvml.DeviceGetRemappedRows(device.device)
|
corrected, uncorrected, pending, failure, ret := nvml.DeviceGetRemappedRows(device.device)
|
||||||
if ret == nvml.SUCCESS {
|
if ret == nvml.SUCCESS {
|
||||||
if !device.excludeMetrics["nv_remapped_rows_corrected"] {
|
if !device.excludeMetrics["nv_remapped_rows_corrected"] {
|
||||||
y, err := lp.NewMetric("nv_remapped_rows_corrected", device.tags, device.meta, float64(corrected), time.Now())
|
y, err := lp.NewMessage("nv_remapped_rows_corrected", device.tags, device.meta, map[string]interface{}{"value": float64(corrected)}, time.Now())
|
||||||
if err == nil {
|
if err == nil {
|
||||||
output <- y
|
output <- y
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
if !device.excludeMetrics["nv_remapped_rows_uncorrected"] {
|
if !device.excludeMetrics["nv_remapped_rows_uncorrected"] {
|
||||||
y, err := lp.NewMetric("nv_remapped_rows_corrected", device.tags, device.meta, float64(uncorrected), time.Now())
|
y, err := lp.NewMessage("nv_remapped_rows_corrected", device.tags, device.meta, map[string]interface{}{"value": float64(uncorrected)}, time.Now())
|
||||||
if err == nil {
|
if err == nil {
|
||||||
output <- y
|
output <- y
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
if !device.excludeMetrics["nv_remapped_rows_pending"] {
|
if !device.excludeMetrics["nv_remapped_rows_pending"] {
|
||||||
p := 0
|
var p int = 0
|
||||||
if pending {
|
if pending {
|
||||||
p = 1
|
p = 1
|
||||||
}
|
}
|
||||||
y, err := lp.NewMetric("nv_remapped_rows_pending", device.tags, device.meta, p, time.Now())
|
y, err := lp.NewMessage("nv_remapped_rows_pending", device.tags, device.meta, map[string]interface{}{"value": p}, time.Now())
|
||||||
if err == nil {
|
if err == nil {
|
||||||
output <- y
|
output <- y
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
if !device.excludeMetrics["nv_remapped_rows_failure"] {
|
if !device.excludeMetrics["nv_remapped_rows_failure"] {
|
||||||
f := 0
|
var f int = 0
|
||||||
if failure {
|
if failure {
|
||||||
f = 1
|
f = 1
|
||||||
}
|
}
|
||||||
y, err := lp.NewMetric("nv_remapped_rows_failure", device.tags, device.meta, f, time.Now())
|
y, err := lp.NewMessage("nv_remapped_rows_failure", device.tags, device.meta, map[string]interface{}{"value": f}, time.Now())
|
||||||
if err == nil {
|
if err == nil {
|
||||||
output <- y
|
output <- y
|
||||||
}
|
}
|
||||||
@@ -780,7 +729,7 @@ func readRemappedRows(device *NvidiaCollectorDevice, output chan lp.CCMessage) e
|
|||||||
return nil
|
return nil
|
||||||
}
|
}
|
||||||
|
|
||||||
func readProcessCounts(device *NvidiaCollectorDevice, output chan lp.CCMessage) error {
|
func readProcessCounts(device NvidiaCollectorDevice, output chan lp.CCMessage) error {
|
||||||
if !device.excludeMetrics["nv_compute_processes"] {
|
if !device.excludeMetrics["nv_compute_processes"] {
|
||||||
// Get information about processes with a compute context on a device
|
// Get information about processes with a compute context on a device
|
||||||
//
|
//
|
||||||
@@ -804,7 +753,7 @@ func readProcessCounts(device *NvidiaCollectorDevice, output chan lp.CCMessage)
|
|||||||
// Querying per-instance information using MIG device handles is not supported if the device is in vGPU Host virtualization mode.
|
// Querying per-instance information using MIG device handles is not supported if the device is in vGPU Host virtualization mode.
|
||||||
procList, ret := nvml.DeviceGetComputeRunningProcesses(device.device)
|
procList, ret := nvml.DeviceGetComputeRunningProcesses(device.device)
|
||||||
if ret == nvml.SUCCESS {
|
if ret == nvml.SUCCESS {
|
||||||
y, err := lp.NewMetric("nv_compute_processes", device.tags, device.meta, len(procList), time.Now())
|
y, err := lp.NewMessage("nv_compute_processes", device.tags, device.meta, map[string]interface{}{"value": len(procList)}, time.Now())
|
||||||
if err == nil {
|
if err == nil {
|
||||||
output <- y
|
output <- y
|
||||||
}
|
}
|
||||||
@@ -833,7 +782,7 @@ func readProcessCounts(device *NvidiaCollectorDevice, output chan lp.CCMessage)
|
|||||||
// Querying per-instance information using MIG device handles is not supported if the device is in vGPU Host virtualization mode.
|
// Querying per-instance information using MIG device handles is not supported if the device is in vGPU Host virtualization mode.
|
||||||
procList, ret := nvml.DeviceGetGraphicsRunningProcesses(device.device)
|
procList, ret := nvml.DeviceGetGraphicsRunningProcesses(device.device)
|
||||||
if ret == nvml.SUCCESS {
|
if ret == nvml.SUCCESS {
|
||||||
y, err := lp.NewMetric("nv_graphics_processes", device.tags, device.meta, len(procList), time.Now())
|
y, err := lp.NewMessage("nv_graphics_processes", device.tags, device.meta, map[string]interface{}{"value": len(procList)}, time.Now())
|
||||||
if err == nil {
|
if err == nil {
|
||||||
output <- y
|
output <- y
|
||||||
}
|
}
|
||||||
@@ -863,7 +812,7 @@ func readProcessCounts(device *NvidiaCollectorDevice, output chan lp.CCMessage)
|
|||||||
// // Querying per-instance information using MIG device handles is not supported if the device is in vGPU Host virtualization mode.
|
// // Querying per-instance information using MIG device handles is not supported if the device is in vGPU Host virtualization mode.
|
||||||
// procList, ret := nvml.DeviceGetMPSComputeRunningProcesses(device.device)
|
// procList, ret := nvml.DeviceGetMPSComputeRunningProcesses(device.device)
|
||||||
// if ret == nvml.SUCCESS {
|
// if ret == nvml.SUCCESS {
|
||||||
// y, err := lp.NewMetric("nv_mps_compute_processes", device.tags, device.meta, len(procList), time.Now())
|
// y, err := lp.NewMessage("nv_mps_compute_processes", device.tags, device.meta, map[string]interface{}{"value": len(procList)}, time.Now())
|
||||||
// if err == nil {
|
// if err == nil {
|
||||||
// output <- y
|
// output <- y
|
||||||
// }
|
// }
|
||||||
@@ -872,7 +821,7 @@ func readProcessCounts(device *NvidiaCollectorDevice, output chan lp.CCMessage)
|
|||||||
return nil
|
return nil
|
||||||
}
|
}
|
||||||
|
|
||||||
func readViolationStats(device *NvidiaCollectorDevice, output chan lp.CCMessage) error {
|
func readViolationStats(device NvidiaCollectorDevice, output chan lp.CCMessage) error {
|
||||||
var violTime nvml.ViolationTime
|
var violTime nvml.ViolationTime
|
||||||
var ret nvml.Return
|
var ret nvml.Return
|
||||||
|
|
||||||
@@ -891,7 +840,7 @@ func readViolationStats(device *NvidiaCollectorDevice, output chan lp.CCMessage)
|
|||||||
violTime, ret = nvml.DeviceGetViolationStatus(device.device, nvml.PERF_POLICY_POWER)
|
violTime, ret = nvml.DeviceGetViolationStatus(device.device, nvml.PERF_POLICY_POWER)
|
||||||
if ret == nvml.SUCCESS {
|
if ret == nvml.SUCCESS {
|
||||||
t := float64(violTime.ViolationTime) * 1e-9
|
t := float64(violTime.ViolationTime) * 1e-9
|
||||||
y, err := lp.NewMetric("nv_violation_power", device.tags, device.meta, t, time.Now())
|
y, err := lp.NewMessage("nv_violation_power", device.tags, device.meta, map[string]interface{}{"value": t}, time.Now())
|
||||||
if err == nil {
|
if err == nil {
|
||||||
y.AddMeta("unit", "sec")
|
y.AddMeta("unit", "sec")
|
||||||
output <- y
|
output <- y
|
||||||
@@ -903,7 +852,7 @@ func readViolationStats(device *NvidiaCollectorDevice, output chan lp.CCMessage)
|
|||||||
violTime, ret = nvml.DeviceGetViolationStatus(device.device, nvml.PERF_POLICY_THERMAL)
|
violTime, ret = nvml.DeviceGetViolationStatus(device.device, nvml.PERF_POLICY_THERMAL)
|
||||||
if ret == nvml.SUCCESS {
|
if ret == nvml.SUCCESS {
|
||||||
t := float64(violTime.ViolationTime) * 1e-9
|
t := float64(violTime.ViolationTime) * 1e-9
|
||||||
y, err := lp.NewMetric("nv_violation_thermal", device.tags, device.meta, t, time.Now())
|
y, err := lp.NewMessage("nv_violation_thermal", device.tags, device.meta, map[string]interface{}{"value": t}, time.Now())
|
||||||
if err == nil {
|
if err == nil {
|
||||||
y.AddMeta("unit", "sec")
|
y.AddMeta("unit", "sec")
|
||||||
output <- y
|
output <- y
|
||||||
@@ -915,7 +864,7 @@ func readViolationStats(device *NvidiaCollectorDevice, output chan lp.CCMessage)
|
|||||||
violTime, ret = nvml.DeviceGetViolationStatus(device.device, nvml.PERF_POLICY_SYNC_BOOST)
|
violTime, ret = nvml.DeviceGetViolationStatus(device.device, nvml.PERF_POLICY_SYNC_BOOST)
|
||||||
if ret == nvml.SUCCESS {
|
if ret == nvml.SUCCESS {
|
||||||
t := float64(violTime.ViolationTime) * 1e-9
|
t := float64(violTime.ViolationTime) * 1e-9
|
||||||
y, err := lp.NewMetric("nv_violation_sync_boost", device.tags, device.meta, t, time.Now())
|
y, err := lp.NewMessage("nv_violation_sync_boost", device.tags, device.meta, map[string]interface{}{"value": t}, time.Now())
|
||||||
if err == nil {
|
if err == nil {
|
||||||
y.AddMeta("unit", "sec")
|
y.AddMeta("unit", "sec")
|
||||||
output <- y
|
output <- y
|
||||||
@@ -927,7 +876,7 @@ func readViolationStats(device *NvidiaCollectorDevice, output chan lp.CCMessage)
|
|||||||
violTime, ret = nvml.DeviceGetViolationStatus(device.device, nvml.PERF_POLICY_BOARD_LIMIT)
|
violTime, ret = nvml.DeviceGetViolationStatus(device.device, nvml.PERF_POLICY_BOARD_LIMIT)
|
||||||
if ret == nvml.SUCCESS {
|
if ret == nvml.SUCCESS {
|
||||||
t := float64(violTime.ViolationTime) * 1e-9
|
t := float64(violTime.ViolationTime) * 1e-9
|
||||||
y, err := lp.NewMetric("nv_violation_board_limit", device.tags, device.meta, t, time.Now())
|
y, err := lp.NewMessage("nv_violation_board_limit", device.tags, device.meta, map[string]interface{}{"value": t}, time.Now())
|
||||||
if err == nil {
|
if err == nil {
|
||||||
y.AddMeta("unit", "sec")
|
y.AddMeta("unit", "sec")
|
||||||
output <- y
|
output <- y
|
||||||
@@ -939,7 +888,7 @@ func readViolationStats(device *NvidiaCollectorDevice, output chan lp.CCMessage)
|
|||||||
violTime, ret = nvml.DeviceGetViolationStatus(device.device, nvml.PERF_POLICY_LOW_UTILIZATION)
|
violTime, ret = nvml.DeviceGetViolationStatus(device.device, nvml.PERF_POLICY_LOW_UTILIZATION)
|
||||||
if ret == nvml.SUCCESS {
|
if ret == nvml.SUCCESS {
|
||||||
t := float64(violTime.ViolationTime) * 1e-9
|
t := float64(violTime.ViolationTime) * 1e-9
|
||||||
y, err := lp.NewMetric("nv_violation_low_util", device.tags, device.meta, t, time.Now())
|
y, err := lp.NewMessage("nv_violation_low_util", device.tags, device.meta, map[string]interface{}{"value": t}, time.Now())
|
||||||
if err == nil {
|
if err == nil {
|
||||||
y.AddMeta("unit", "sec")
|
y.AddMeta("unit", "sec")
|
||||||
output <- y
|
output <- y
|
||||||
@@ -951,7 +900,7 @@ func readViolationStats(device *NvidiaCollectorDevice, output chan lp.CCMessage)
|
|||||||
violTime, ret = nvml.DeviceGetViolationStatus(device.device, nvml.PERF_POLICY_RELIABILITY)
|
violTime, ret = nvml.DeviceGetViolationStatus(device.device, nvml.PERF_POLICY_RELIABILITY)
|
||||||
if ret == nvml.SUCCESS {
|
if ret == nvml.SUCCESS {
|
||||||
t := float64(violTime.ViolationTime) * 1e-9
|
t := float64(violTime.ViolationTime) * 1e-9
|
||||||
y, err := lp.NewMetric("nv_violation_reliability", device.tags, device.meta, t, time.Now())
|
y, err := lp.NewMessage("nv_violation_reliability", device.tags, device.meta, map[string]interface{}{"value": t}, time.Now())
|
||||||
if err == nil {
|
if err == nil {
|
||||||
y.AddMeta("unit", "sec")
|
y.AddMeta("unit", "sec")
|
||||||
output <- y
|
output <- y
|
||||||
@@ -963,7 +912,7 @@ func readViolationStats(device *NvidiaCollectorDevice, output chan lp.CCMessage)
|
|||||||
violTime, ret = nvml.DeviceGetViolationStatus(device.device, nvml.PERF_POLICY_TOTAL_APP_CLOCKS)
|
violTime, ret = nvml.DeviceGetViolationStatus(device.device, nvml.PERF_POLICY_TOTAL_APP_CLOCKS)
|
||||||
if ret == nvml.SUCCESS {
|
if ret == nvml.SUCCESS {
|
||||||
t := float64(violTime.ViolationTime) * 1e-9
|
t := float64(violTime.ViolationTime) * 1e-9
|
||||||
y, err := lp.NewMetric("nv_violation_below_app_clock", device.tags, device.meta, t, time.Now())
|
y, err := lp.NewMessage("nv_violation_below_app_clock", device.tags, device.meta, map[string]interface{}{"value": t}, time.Now())
|
||||||
if err == nil {
|
if err == nil {
|
||||||
y.AddMeta("unit", "sec")
|
y.AddMeta("unit", "sec")
|
||||||
output <- y
|
output <- y
|
||||||
@@ -975,7 +924,7 @@ func readViolationStats(device *NvidiaCollectorDevice, output chan lp.CCMessage)
|
|||||||
violTime, ret = nvml.DeviceGetViolationStatus(device.device, nvml.PERF_POLICY_TOTAL_BASE_CLOCKS)
|
violTime, ret = nvml.DeviceGetViolationStatus(device.device, nvml.PERF_POLICY_TOTAL_BASE_CLOCKS)
|
||||||
if ret == nvml.SUCCESS {
|
if ret == nvml.SUCCESS {
|
||||||
t := float64(violTime.ViolationTime) * 1e-9
|
t := float64(violTime.ViolationTime) * 1e-9
|
||||||
y, err := lp.NewMetric("nv_violation_below_base_clock", device.tags, device.meta, t, time.Now())
|
y, err := lp.NewMessage("nv_violation_below_base_clock", device.tags, device.meta, map[string]interface{}{"value": t}, time.Now())
|
||||||
if err == nil {
|
if err == nil {
|
||||||
y.AddMeta("unit", "sec")
|
y.AddMeta("unit", "sec")
|
||||||
output <- y
|
output <- y
|
||||||
@@ -986,7 +935,7 @@ func readViolationStats(device *NvidiaCollectorDevice, output chan lp.CCMessage)
|
|||||||
return nil
|
return nil
|
||||||
}
|
}
|
||||||
|
|
||||||
func readNVLinkStats(device *NvidiaCollectorDevice, output chan lp.CCMessage) error {
|
func readNVLinkStats(device NvidiaCollectorDevice, output chan lp.CCMessage) error {
|
||||||
// Retrieves the specified error counter value
|
// Retrieves the specified error counter value
|
||||||
// Please refer to \a nvmlNvLinkErrorCounter_t for error counters that are available
|
// Please refer to \a nvmlNvLinkErrorCounter_t for error counters that are available
|
||||||
//
|
//
|
||||||
@@ -998,19 +947,19 @@ func readNVLinkStats(device *NvidiaCollectorDevice, output chan lp.CCMessage) er
|
|||||||
var aggregate_recovery_errors uint64 = 0
|
var aggregate_recovery_errors uint64 = 0
|
||||||
var aggregate_crc_flit_errors uint64 = 0
|
var aggregate_crc_flit_errors uint64 = 0
|
||||||
|
|
||||||
for i := range nvml.NVLINK_MAX_LINKS {
|
for i := 0; i < nvml.NVLINK_MAX_LINKS; i++ {
|
||||||
state, ret := nvml.DeviceGetNvLinkState(device.device, i)
|
state, ret := nvml.DeviceGetNvLinkState(device.device, i)
|
||||||
if ret == nvml.SUCCESS {
|
if ret == nvml.SUCCESS {
|
||||||
if state == nvml.FEATURE_ENABLED {
|
if state == nvml.FEATURE_ENABLED {
|
||||||
if !device.excludeMetrics["nv_nvlink_crc_errors"] {
|
if !device.excludeMetrics["nv_nvlink_crc_errors"] {
|
||||||
// Data link receive data CRC error counter
|
// Data link receive data CRC error counter
|
||||||
count, ret := nvml.DeviceGetNvLinkErrorCounter(device.device, i, nvml.NVLINK_ERROR_DL_CRC_DATA)
|
count, ret := nvml.DeviceGetNvLinkErrorCounter(device.device, i, nvml.NVLINK_ERROR_DL_CRC_DATA)
|
||||||
aggregate_crc_errors += count
|
aggregate_crc_errors = aggregate_crc_errors + count
|
||||||
if ret == nvml.SUCCESS {
|
if ret == nvml.SUCCESS {
|
||||||
y, err := lp.NewMetric("nv_nvlink_crc_errors", device.tags, device.meta, count, time.Now())
|
y, err := lp.NewMessage("nv_nvlink_crc_errors", device.tags, device.meta, map[string]interface{}{"value": count}, time.Now())
|
||||||
if err == nil {
|
if err == nil {
|
||||||
y.AddTag("stype", "nvlink")
|
y.AddTag("stype", "nvlink")
|
||||||
y.AddTag("stype-id", strconv.Itoa(i))
|
y.AddTag("stype-id", fmt.Sprintf("%d", i))
|
||||||
output <- y
|
output <- y
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -1018,12 +967,12 @@ func readNVLinkStats(device *NvidiaCollectorDevice, output chan lp.CCMessage) er
|
|||||||
if !device.excludeMetrics["nv_nvlink_ecc_errors"] {
|
if !device.excludeMetrics["nv_nvlink_ecc_errors"] {
|
||||||
// Data link receive data ECC error counter
|
// Data link receive data ECC error counter
|
||||||
count, ret := nvml.DeviceGetNvLinkErrorCounter(device.device, i, nvml.NVLINK_ERROR_DL_ECC_DATA)
|
count, ret := nvml.DeviceGetNvLinkErrorCounter(device.device, i, nvml.NVLINK_ERROR_DL_ECC_DATA)
|
||||||
aggregate_ecc_errors += count
|
aggregate_ecc_errors = aggregate_ecc_errors + count
|
||||||
if ret == nvml.SUCCESS {
|
if ret == nvml.SUCCESS {
|
||||||
y, err := lp.NewMetric("nv_nvlink_ecc_errors", device.tags, device.meta, count, time.Now())
|
y, err := lp.NewMessage("nv_nvlink_ecc_errors", device.tags, device.meta, map[string]interface{}{"value": count}, time.Now())
|
||||||
if err == nil {
|
if err == nil {
|
||||||
y.AddTag("stype", "nvlink")
|
y.AddTag("stype", "nvlink")
|
||||||
y.AddTag("stype-id", strconv.Itoa(i))
|
y.AddTag("stype-id", fmt.Sprintf("%d", i))
|
||||||
output <- y
|
output <- y
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -1031,12 +980,12 @@ func readNVLinkStats(device *NvidiaCollectorDevice, output chan lp.CCMessage) er
|
|||||||
if !device.excludeMetrics["nv_nvlink_replay_errors"] {
|
if !device.excludeMetrics["nv_nvlink_replay_errors"] {
|
||||||
// Data link transmit replay error counter
|
// Data link transmit replay error counter
|
||||||
count, ret := nvml.DeviceGetNvLinkErrorCounter(device.device, i, nvml.NVLINK_ERROR_DL_REPLAY)
|
count, ret := nvml.DeviceGetNvLinkErrorCounter(device.device, i, nvml.NVLINK_ERROR_DL_REPLAY)
|
||||||
aggregate_replay_errors += count
|
aggregate_replay_errors = aggregate_replay_errors + count
|
||||||
if ret == nvml.SUCCESS {
|
if ret == nvml.SUCCESS {
|
||||||
y, err := lp.NewMetric("nv_nvlink_replay_errors", device.tags, device.meta, count, time.Now())
|
y, err := lp.NewMessage("nv_nvlink_replay_errors", device.tags, device.meta, map[string]interface{}{"value": count}, time.Now())
|
||||||
if err == nil {
|
if err == nil {
|
||||||
y.AddTag("stype", "nvlink")
|
y.AddTag("stype", "nvlink")
|
||||||
y.AddTag("stype-id", strconv.Itoa(i))
|
y.AddTag("stype-id", fmt.Sprintf("%d", i))
|
||||||
output <- y
|
output <- y
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -1044,12 +993,12 @@ func readNVLinkStats(device *NvidiaCollectorDevice, output chan lp.CCMessage) er
|
|||||||
if !device.excludeMetrics["nv_nvlink_recovery_errors"] {
|
if !device.excludeMetrics["nv_nvlink_recovery_errors"] {
|
||||||
// Data link transmit recovery error counter
|
// Data link transmit recovery error counter
|
||||||
count, ret := nvml.DeviceGetNvLinkErrorCounter(device.device, i, nvml.NVLINK_ERROR_DL_RECOVERY)
|
count, ret := nvml.DeviceGetNvLinkErrorCounter(device.device, i, nvml.NVLINK_ERROR_DL_RECOVERY)
|
||||||
aggregate_recovery_errors += count
|
aggregate_recovery_errors = aggregate_recovery_errors + count
|
||||||
if ret == nvml.SUCCESS {
|
if ret == nvml.SUCCESS {
|
||||||
y, err := lp.NewMetric("nv_nvlink_recovery_errors", device.tags, device.meta, count, time.Now())
|
y, err := lp.NewMessage("nv_nvlink_recovery_errors", device.tags, device.meta, map[string]interface{}{"value": count}, time.Now())
|
||||||
if err == nil {
|
if err == nil {
|
||||||
y.AddTag("stype", "nvlink")
|
y.AddTag("stype", "nvlink")
|
||||||
y.AddTag("stype-id", strconv.Itoa(i))
|
y.AddTag("stype-id", fmt.Sprintf("%d", i))
|
||||||
output <- y
|
output <- y
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -1057,12 +1006,12 @@ func readNVLinkStats(device *NvidiaCollectorDevice, output chan lp.CCMessage) er
|
|||||||
if !device.excludeMetrics["nv_nvlink_crc_flit_errors"] {
|
if !device.excludeMetrics["nv_nvlink_crc_flit_errors"] {
|
||||||
// Data link receive flow control digit CRC error counter
|
// Data link receive flow control digit CRC error counter
|
||||||
count, ret := nvml.DeviceGetNvLinkErrorCounter(device.device, i, nvml.NVLINK_ERROR_DL_CRC_FLIT)
|
count, ret := nvml.DeviceGetNvLinkErrorCounter(device.device, i, nvml.NVLINK_ERROR_DL_CRC_FLIT)
|
||||||
aggregate_crc_flit_errors += count
|
aggregate_crc_flit_errors = aggregate_crc_flit_errors + count
|
||||||
if ret == nvml.SUCCESS {
|
if ret == nvml.SUCCESS {
|
||||||
y, err := lp.NewMetric("nv_nvlink_crc_flit_errors", device.tags, device.meta, count, time.Now())
|
y, err := lp.NewMessage("nv_nvlink_crc_flit_errors", device.tags, device.meta, map[string]interface{}{"value": count}, time.Now())
|
||||||
if err == nil {
|
if err == nil {
|
||||||
y.AddTag("stype", "nvlink")
|
y.AddTag("stype", "nvlink")
|
||||||
y.AddTag("stype-id", strconv.Itoa(i))
|
y.AddTag("stype-id", fmt.Sprintf("%d", i))
|
||||||
output <- y
|
output <- y
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -1074,7 +1023,7 @@ func readNVLinkStats(device *NvidiaCollectorDevice, output chan lp.CCMessage) er
|
|||||||
// Export aggegated values
|
// Export aggegated values
|
||||||
if !device.excludeMetrics["nv_nvlink_crc_errors"] {
|
if !device.excludeMetrics["nv_nvlink_crc_errors"] {
|
||||||
// Data link receive data CRC error counter
|
// Data link receive data CRC error counter
|
||||||
y, err := lp.NewMetric("nv_nvlink_crc_errors_sum", device.tags, device.meta, aggregate_crc_errors, time.Now())
|
y, err := lp.NewMessage("nv_nvlink_crc_errors_sum", device.tags, device.meta, map[string]interface{}{"value": aggregate_crc_errors}, time.Now())
|
||||||
if err == nil {
|
if err == nil {
|
||||||
y.AddTag("stype", "nvlink")
|
y.AddTag("stype", "nvlink")
|
||||||
output <- y
|
output <- y
|
||||||
@@ -1082,7 +1031,7 @@ func readNVLinkStats(device *NvidiaCollectorDevice, output chan lp.CCMessage) er
|
|||||||
}
|
}
|
||||||
if !device.excludeMetrics["nv_nvlink_ecc_errors"] {
|
if !device.excludeMetrics["nv_nvlink_ecc_errors"] {
|
||||||
// Data link receive data ECC error counter
|
// Data link receive data ECC error counter
|
||||||
y, err := lp.NewMetric("nv_nvlink_ecc_errors_sum", device.tags, device.meta, aggregate_ecc_errors, time.Now())
|
y, err := lp.NewMessage("nv_nvlink_ecc_errors_sum", device.tags, device.meta, map[string]interface{}{"value": aggregate_ecc_errors}, time.Now())
|
||||||
if err == nil {
|
if err == nil {
|
||||||
y.AddTag("stype", "nvlink")
|
y.AddTag("stype", "nvlink")
|
||||||
output <- y
|
output <- y
|
||||||
@@ -1090,7 +1039,7 @@ func readNVLinkStats(device *NvidiaCollectorDevice, output chan lp.CCMessage) er
|
|||||||
}
|
}
|
||||||
if !device.excludeMetrics["nv_nvlink_replay_errors"] {
|
if !device.excludeMetrics["nv_nvlink_replay_errors"] {
|
||||||
// Data link transmit replay error counter
|
// Data link transmit replay error counter
|
||||||
y, err := lp.NewMetric("nv_nvlink_replay_errors_sum", device.tags, device.meta, aggregate_replay_errors, time.Now())
|
y, err := lp.NewMessage("nv_nvlink_replay_errors_sum", device.tags, device.meta, map[string]interface{}{"value": aggregate_replay_errors}, time.Now())
|
||||||
if err == nil {
|
if err == nil {
|
||||||
y.AddTag("stype", "nvlink")
|
y.AddTag("stype", "nvlink")
|
||||||
output <- y
|
output <- y
|
||||||
@@ -1098,7 +1047,7 @@ func readNVLinkStats(device *NvidiaCollectorDevice, output chan lp.CCMessage) er
|
|||||||
}
|
}
|
||||||
if !device.excludeMetrics["nv_nvlink_recovery_errors"] {
|
if !device.excludeMetrics["nv_nvlink_recovery_errors"] {
|
||||||
// Data link transmit recovery error counter
|
// Data link transmit recovery error counter
|
||||||
y, err := lp.NewMetric("nv_nvlink_recovery_errors_sum", device.tags, device.meta, aggregate_recovery_errors, time.Now())
|
y, err := lp.NewMessage("nv_nvlink_recovery_errors_sum", device.tags, device.meta, map[string]interface{}{"value": aggregate_recovery_errors}, time.Now())
|
||||||
if err == nil {
|
if err == nil {
|
||||||
y.AddTag("stype", "nvlink")
|
y.AddTag("stype", "nvlink")
|
||||||
output <- y
|
output <- y
|
||||||
@@ -1106,7 +1055,7 @@ func readNVLinkStats(device *NvidiaCollectorDevice, output chan lp.CCMessage) er
|
|||||||
}
|
}
|
||||||
if !device.excludeMetrics["nv_nvlink_crc_flit_errors"] {
|
if !device.excludeMetrics["nv_nvlink_crc_flit_errors"] {
|
||||||
// Data link receive flow control digit CRC error counter
|
// Data link receive flow control digit CRC error counter
|
||||||
y, err := lp.NewMetric("nv_nvlink_crc_flit_errors_sum", device.tags, device.meta, aggregate_crc_flit_errors, time.Now())
|
y, err := lp.NewMessage("nv_nvlink_crc_flit_errors_sum", device.tags, device.meta, map[string]interface{}{"value": aggregate_crc_flit_errors}, time.Now())
|
||||||
if err == nil {
|
if err == nil {
|
||||||
y.AddTag("stype", "nvlink")
|
y.AddTag("stype", "nvlink")
|
||||||
output <- y
|
output <- y
|
||||||
@@ -1121,7 +1070,7 @@ func (m *NvidiaCollector) Read(interval time.Duration, output chan lp.CCMessage)
|
|||||||
return
|
return
|
||||||
}
|
}
|
||||||
|
|
||||||
readAll := func(device *NvidiaCollectorDevice, output chan lp.CCMessage) {
|
readAll := func(device NvidiaCollectorDevice, output chan lp.CCMessage) {
|
||||||
name, ret := nvml.DeviceGetName(device.device)
|
name, ret := nvml.DeviceGetName(device.device)
|
||||||
if ret != nvml.SUCCESS {
|
if ret != nvml.SUCCESS {
|
||||||
name = "NoName"
|
name = "NoName"
|
||||||
@@ -1161,11 +1110,6 @@ func (m *NvidiaCollector) Read(interval time.Duration, output chan lp.CCMessage)
|
|||||||
cclog.ComponentDebug(m.name, "readPowerUsage for device", name, "failed")
|
cclog.ComponentDebug(m.name, "readPowerUsage for device", name, "failed")
|
||||||
}
|
}
|
||||||
|
|
||||||
err = readEnergyConsumption(device, output)
|
|
||||||
if err != nil {
|
|
||||||
cclog.ComponentDebug(m.name, "readEnergyConsumption for device", name, "failed")
|
|
||||||
}
|
|
||||||
|
|
||||||
err = readClocks(device, output)
|
err = readClocks(device, output)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
cclog.ComponentDebug(m.name, "readClocks for device", name, "failed")
|
cclog.ComponentDebug(m.name, "readClocks for device", name, "failed")
|
||||||
@@ -1223,9 +1167,9 @@ func (m *NvidiaCollector) Read(interval time.Duration, output chan lp.CCMessage)
|
|||||||
}
|
}
|
||||||
|
|
||||||
// Actual read loop over all attached Nvidia GPUs
|
// Actual read loop over all attached Nvidia GPUs
|
||||||
for i := range m.num_gpus {
|
for i := 0; i < m.num_gpus; i++ {
|
||||||
|
|
||||||
readAll(&m.gpus[i], output)
|
readAll(m.gpus[i], output)
|
||||||
|
|
||||||
// Iterate over all MIG devices if any
|
// Iterate over all MIG devices if any
|
||||||
if m.config.ProcessMigDevices {
|
if m.config.ProcessMigDevices {
|
||||||
@@ -1246,7 +1190,7 @@ func (m *NvidiaCollector) Read(interval time.Duration, output chan lp.CCMessage)
|
|||||||
}
|
}
|
||||||
cclog.ComponentDebug(m.name, "Reading MIG devices for GPU", i)
|
cclog.ComponentDebug(m.name, "Reading MIG devices for GPU", i)
|
||||||
|
|
||||||
for j := range maxMig {
|
for j := 0; j < maxMig; j++ {
|
||||||
mdev, ret := nvml.DeviceGetMigDeviceHandleByIndex(m.gpus[i].device, j)
|
mdev, ret := nvml.DeviceGetMigDeviceHandleByIndex(m.gpus[i].device, j)
|
||||||
if ret != nvml.SUCCESS {
|
if ret != nvml.SUCCESS {
|
||||||
continue
|
continue
|
||||||
@@ -1263,7 +1207,9 @@ func (m *NvidiaCollector) Read(interval time.Duration, output chan lp.CCMessage)
|
|||||||
meta: map[string]string{},
|
meta: map[string]string{},
|
||||||
excludeMetrics: excludeMetrics,
|
excludeMetrics: excludeMetrics,
|
||||||
}
|
}
|
||||||
maps.Copy(migDevice.tags, m.gpus[i].tags)
|
for k, v := range m.gpus[i].tags {
|
||||||
|
migDevice.tags[k] = v
|
||||||
|
}
|
||||||
migDevice.tags["stype"] = "mig"
|
migDevice.tags["stype"] = "mig"
|
||||||
if m.config.UseUuidForMigDevices {
|
if m.config.UseUuidForMigDevices {
|
||||||
uuid, ret := nvml.DeviceGetUUID(mdev)
|
uuid, ret := nvml.DeviceGetUUID(mdev)
|
||||||
@@ -1277,17 +1223,19 @@ func (m *NvidiaCollector) Read(interval time.Duration, output chan lp.CCMessage)
|
|||||||
if ret == nvml.SUCCESS {
|
if ret == nvml.SUCCESS {
|
||||||
mname, ret := nvml.DeviceGetName(mdev)
|
mname, ret := nvml.DeviceGetName(mdev)
|
||||||
if ret == nvml.SUCCESS {
|
if ret == nvml.SUCCESS {
|
||||||
x := strings.ReplaceAll(mname, name, "")
|
x := strings.Replace(mname, name, "", -1)
|
||||||
x = strings.ReplaceAll(x, "MIG", "")
|
x = strings.Replace(x, "MIG", "", -1)
|
||||||
x = strings.TrimSpace(x)
|
x = strings.TrimSpace(x)
|
||||||
migDevice.tags["stype-id"] = x
|
migDevice.tags["stype-id"] = x
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
if _, ok := migDevice.tags["stype-id"]; !ok {
|
if _, ok := migDevice.tags["stype-id"]; !ok {
|
||||||
migDevice.tags["stype-id"] = strconv.Itoa(j)
|
migDevice.tags["stype-id"] = fmt.Sprintf("%d", j)
|
||||||
|
}
|
||||||
|
for k, v := range m.gpus[i].meta {
|
||||||
|
migDevice.meta[k] = v
|
||||||
}
|
}
|
||||||
maps.Copy(migDevice.meta, m.gpus[i].meta)
|
|
||||||
if _, ok := migDevice.meta["uuid"]; ok && !m.config.UseUuidForMigDevices {
|
if _, ok := migDevice.meta["uuid"]; ok && !m.config.UseUuidForMigDevices {
|
||||||
uuid, ret := nvml.DeviceGetUUID(mdev)
|
uuid, ret := nvml.DeviceGetUUID(mdev)
|
||||||
if ret == nvml.SUCCESS {
|
if ret == nvml.SUCCESS {
|
||||||
@@ -1295,7 +1243,7 @@ func (m *NvidiaCollector) Read(interval time.Duration, output chan lp.CCMessage)
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
readAll(&migDevice, output)
|
readAll(migDevice, output)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -1303,9 +1251,7 @@ func (m *NvidiaCollector) Read(interval time.Duration, output chan lp.CCMessage)
|
|||||||
|
|
||||||
func (m *NvidiaCollector) Close() {
|
func (m *NvidiaCollector) Close() {
|
||||||
if m.init {
|
if m.init {
|
||||||
if ret := nvml.Shutdown(); ret != nvml.SUCCESS {
|
nvml.Shutdown()
|
||||||
cclog.ComponentError(m.name, "nvml.Shutdown() not successful")
|
|
||||||
}
|
|
||||||
m.init = false
|
m.init = false
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -1,13 +1,3 @@
|
|||||||
<!--
|
|
||||||
---
|
|
||||||
title: "Nvidia NVML metric collector"
|
|
||||||
description: Collect metrics for Nvidia GPUs using the NVML
|
|
||||||
categories: [cc-metric-collector]
|
|
||||||
tags: ['Admin']
|
|
||||||
weight: 2
|
|
||||||
hugo_path: docs/reference/cc-metric-collector/collectors/nvidia.md
|
|
||||||
---
|
|
||||||
-->
|
|
||||||
|
|
||||||
## `nvidia` collector
|
## `nvidia` collector
|
||||||
|
|
||||||
@@ -82,8 +72,5 @@ Metrics:
|
|||||||
* `nv_nvlink_ecc_errors`
|
* `nv_nvlink_ecc_errors`
|
||||||
* `nv_nvlink_replay_errors`
|
* `nv_nvlink_replay_errors`
|
||||||
* `nv_nvlink_recovery_errors`
|
* `nv_nvlink_recovery_errors`
|
||||||
* `nv_energy`
|
|
||||||
* `nv_energy_abs`
|
|
||||||
* `nv_average_power`
|
|
||||||
|
|
||||||
Some metrics add the additional sub type tag (`stype`) like the `nv_nvlink_*` metrics set `stype=nvlink,stype-id=<link_number>`.
|
Some metrics add the additional sub type tag (`stype`) like the `nv_nvlink_*` metrics set `stype=nvlink,stype-id=<link_number>`.
|
||||||
@@ -1,14 +1,6 @@
|
|||||||
// Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
|
|
||||||
// All rights reserved. This file is part of cc-lib.
|
|
||||||
// Use of this source code is governed by a MIT-style
|
|
||||||
// license that can be found in the LICENSE file.
|
|
||||||
// additional authors:
|
|
||||||
// Holger Obermaier (NHR@KIT)
|
|
||||||
|
|
||||||
package collectors
|
package collectors
|
||||||
|
|
||||||
import (
|
import (
|
||||||
"bytes"
|
|
||||||
"encoding/json"
|
"encoding/json"
|
||||||
"fmt"
|
"fmt"
|
||||||
"os"
|
"os"
|
||||||
@@ -17,8 +9,8 @@ import (
|
|||||||
"strings"
|
"strings"
|
||||||
"time"
|
"time"
|
||||||
|
|
||||||
cclog "github.com/ClusterCockpit/cc-lib/v2/ccLogger"
|
cclog "github.com/ClusterCockpit/cc-lib/ccLogger"
|
||||||
lp "github.com/ClusterCockpit/cc-lib/v2/ccMessage"
|
lp "github.com/ClusterCockpit/cc-lib/ccMessage"
|
||||||
)
|
)
|
||||||
|
|
||||||
// running average power limit (RAPL) monitoring attributes for a zone
|
// running average power limit (RAPL) monitoring attributes for a zone
|
||||||
@@ -35,7 +27,6 @@ type RAPLZoneInfo struct {
|
|||||||
|
|
||||||
type RAPLCollector struct {
|
type RAPLCollector struct {
|
||||||
metricCollector
|
metricCollector
|
||||||
|
|
||||||
config struct {
|
config struct {
|
||||||
// Exclude IDs for RAPL zones, e.g.
|
// Exclude IDs for RAPL zones, e.g.
|
||||||
// * 0 for zone 0
|
// * 0 for zone 0
|
||||||
@@ -50,15 +41,15 @@ type RAPLCollector struct {
|
|||||||
|
|
||||||
// Init initializes the running average power limit (RAPL) collector
|
// Init initializes the running average power limit (RAPL) collector
|
||||||
func (m *RAPLCollector) Init(config json.RawMessage) error {
|
func (m *RAPLCollector) Init(config json.RawMessage) error {
|
||||||
|
|
||||||
// Check if already initialized
|
// Check if already initialized
|
||||||
if m.init {
|
if m.init {
|
||||||
return nil
|
return nil
|
||||||
}
|
}
|
||||||
|
|
||||||
|
var err error = nil
|
||||||
m.name = "RAPLCollector"
|
m.name = "RAPLCollector"
|
||||||
if err := m.setup(); err != nil {
|
m.setup()
|
||||||
return fmt.Errorf("%s Init(): setup() call failed: %w", m.name, err)
|
|
||||||
}
|
|
||||||
m.parallel = true
|
m.parallel = true
|
||||||
m.meta = map[string]string{
|
m.meta = map[string]string{
|
||||||
"source": m.name,
|
"source": m.name,
|
||||||
@@ -68,10 +59,10 @@ func (m *RAPLCollector) Init(config json.RawMessage) error {
|
|||||||
|
|
||||||
// Read in the JSON configuration
|
// Read in the JSON configuration
|
||||||
if len(config) > 0 {
|
if len(config) > 0 {
|
||||||
d := json.NewDecoder(bytes.NewReader(config))
|
err = json.Unmarshal(config, &m.config)
|
||||||
d.DisallowUnknownFields()
|
if err != nil {
|
||||||
if err := d.Decode(&m.config); err != nil {
|
cclog.ComponentError(m.name, "Error reading config:", err.Error())
|
||||||
return fmt.Errorf("%s Init(): Error decoding JSON config: %w", m.name, err)
|
return err
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -91,20 +82,19 @@ func (m *RAPLCollector) Init(config json.RawMessage) error {
|
|||||||
|
|
||||||
// readZoneInfo reads RAPL monitoring attributes for a zone given by zonePath
|
// readZoneInfo reads RAPL monitoring attributes for a zone given by zonePath
|
||||||
// See: https://www.kernel.org/doc/html/latest/power/powercap/powercap.html#monitoring-attributes
|
// See: https://www.kernel.org/doc/html/latest/power/powercap/powercap.html#monitoring-attributes
|
||||||
readZoneInfo := func(zonePath string) (
|
readZoneInfo := func(zonePath string) (z struct {
|
||||||
z struct {
|
name string // zones name e.g. psys, dram, core, uncore, package-0
|
||||||
name string // zones name e.g. psys, dram, core, uncore, package-0
|
energyFilepath string // path to a file containing the zones current energy counter in micro joules
|
||||||
energyFilepath string // path to a file containing the zones current energy counter in micro joules
|
energy int64 // current reading of the energy counter in micro joules
|
||||||
energy int64 // current reading of the energy counter in micro joules
|
energyTimestamp time.Time // timestamp when energy counter was read
|
||||||
energyTimestamp time.Time // timestamp when energy counter was read
|
maxEnergyRange int64 // Range of the above energy counter in micro-joules
|
||||||
maxEnergyRange int64 // Range of the above energy counter in micro-joules
|
ok bool // Are all information available?
|
||||||
ok bool // Are all information available?
|
}) {
|
||||||
},
|
|
||||||
) {
|
|
||||||
// zones name e.g. psys, dram, core, uncore, package-0
|
// zones name e.g. psys, dram, core, uncore, package-0
|
||||||
foundName := false
|
foundName := false
|
||||||
if v, err := os.ReadFile(
|
if v, err :=
|
||||||
filepath.Join(zonePath, "name")); err == nil {
|
os.ReadFile(
|
||||||
|
filepath.Join(zonePath, "name")); err == nil {
|
||||||
foundName = true
|
foundName = true
|
||||||
z.name = strings.TrimSpace(string(v))
|
z.name = strings.TrimSpace(string(v))
|
||||||
}
|
}
|
||||||
@@ -125,8 +115,9 @@ func (m *RAPLCollector) Init(config json.RawMessage) error {
|
|||||||
|
|
||||||
// Range of the above energy counter in micro-joules
|
// Range of the above energy counter in micro-joules
|
||||||
foundMaxEnergyRange := false
|
foundMaxEnergyRange := false
|
||||||
if v, err := os.ReadFile(
|
if v, err :=
|
||||||
filepath.Join(zonePath, "max_energy_range_uj")); err == nil {
|
os.ReadFile(
|
||||||
|
filepath.Join(zonePath, "max_energy_range_uj")); err == nil {
|
||||||
if i, err := strconv.ParseInt(strings.TrimSpace(string(v)), 10, 64); err == nil {
|
if i, err := strconv.ParseInt(strings.TrimSpace(string(v)), 10, 64); err == nil {
|
||||||
foundMaxEnergyRange = true
|
foundMaxEnergyRange = true
|
||||||
z.maxEnergyRange = i
|
z.maxEnergyRange = i
|
||||||
@@ -158,18 +149,19 @@ func (m *RAPLCollector) Init(config json.RawMessage) error {
|
|||||||
!isNameExcluded[z.name] {
|
!isNameExcluded[z.name] {
|
||||||
|
|
||||||
// Add RAPL monitoring attributes for a zone
|
// Add RAPL monitoring attributes for a zone
|
||||||
m.RAPLZoneInfo = append(
|
m.RAPLZoneInfo =
|
||||||
m.RAPLZoneInfo,
|
append(
|
||||||
RAPLZoneInfo{
|
m.RAPLZoneInfo,
|
||||||
tags: map[string]string{
|
RAPLZoneInfo{
|
||||||
"id": zoneID,
|
tags: map[string]string{
|
||||||
"zone_name": z.name,
|
"id": zoneID,
|
||||||
},
|
"zone_name": z.name,
|
||||||
energyFilepath: z.energyFilepath,
|
},
|
||||||
energy: z.energy,
|
energyFilepath: z.energyFilepath,
|
||||||
energyTimestamp: z.energyTimestamp,
|
energy: z.energy,
|
||||||
maxEnergyRange: z.maxEnergyRange,
|
energyTimestamp: z.energyTimestamp,
|
||||||
})
|
maxEnergyRange: z.maxEnergyRange,
|
||||||
|
})
|
||||||
}
|
}
|
||||||
|
|
||||||
// find all sub zones for the given zone
|
// find all sub zones for the given zone
|
||||||
@@ -186,25 +178,27 @@ func (m *RAPLCollector) Init(config json.RawMessage) error {
|
|||||||
sz.ok &&
|
sz.ok &&
|
||||||
!isIDExcluded[zoneID+":"+subZoneID] &&
|
!isIDExcluded[zoneID+":"+subZoneID] &&
|
||||||
!isNameExcluded[sz.name] {
|
!isNameExcluded[sz.name] {
|
||||||
m.RAPLZoneInfo = append(
|
m.RAPLZoneInfo =
|
||||||
m.RAPLZoneInfo,
|
append(
|
||||||
RAPLZoneInfo{
|
m.RAPLZoneInfo,
|
||||||
tags: map[string]string{
|
RAPLZoneInfo{
|
||||||
"id": zoneID + ":" + subZoneID,
|
tags: map[string]string{
|
||||||
"zone_name": z.name,
|
"id": zoneID + ":" + subZoneID,
|
||||||
"sub_zone_name": sz.name,
|
"zone_name": z.name,
|
||||||
},
|
"sub_zone_name": sz.name,
|
||||||
energyFilepath: sz.energyFilepath,
|
},
|
||||||
energy: sz.energy,
|
energyFilepath: sz.energyFilepath,
|
||||||
energyTimestamp: sz.energyTimestamp,
|
energy: sz.energy,
|
||||||
maxEnergyRange: sz.maxEnergyRange,
|
energyTimestamp: sz.energyTimestamp,
|
||||||
})
|
maxEnergyRange: sz.maxEnergyRange,
|
||||||
|
})
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
if m.RAPLZoneInfo == nil {
|
if m.RAPLZoneInfo == nil {
|
||||||
return fmt.Errorf("no running average power limit (RAPL) device found in %s", controlTypePath)
|
return fmt.Errorf("no running average power limit (RAPL) device found in %s", controlTypePath)
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
// Initialized
|
// Initialized
|
||||||
@@ -221,6 +215,7 @@ func (m *RAPLCollector) Init(config json.RawMessage) error {
|
|||||||
// Read reads running average power limit (RAPL) monitoring attributes for all initialized zones
|
// Read reads running average power limit (RAPL) monitoring attributes for all initialized zones
|
||||||
// See: https://www.kernel.org/doc/html/latest/power/powercap/powercap.html#monitoring-attributes
|
// See: https://www.kernel.org/doc/html/latest/power/powercap/powercap.html#monitoring-attributes
|
||||||
func (m *RAPLCollector) Read(interval time.Duration, output chan lp.CCMessage) {
|
func (m *RAPLCollector) Read(interval time.Duration, output chan lp.CCMessage) {
|
||||||
|
|
||||||
for i := range m.RAPLZoneInfo {
|
for i := range m.RAPLZoneInfo {
|
||||||
p := &m.RAPLZoneInfo[i]
|
p := &m.RAPLZoneInfo[i]
|
||||||
|
|
||||||
@@ -246,7 +241,7 @@ func (m *RAPLCollector) Read(interval time.Duration, output chan lp.CCMessage) {
|
|||||||
"rapl_average_power",
|
"rapl_average_power",
|
||||||
p.tags,
|
p.tags,
|
||||||
m.meta,
|
m.meta,
|
||||||
map[string]any{"value": averagePower},
|
map[string]interface{}{"value": averagePower},
|
||||||
energyTimestamp)
|
energyTimestamp)
|
||||||
if err == nil {
|
if err == nil {
|
||||||
output <- y
|
output <- y
|
||||||
|
|||||||
@@ -1,14 +1,3 @@
|
|||||||
<!--
|
|
||||||
---
|
|
||||||
title: RAPL metric collector
|
|
||||||
description: Collect energy data through the RAPL sysfs interface
|
|
||||||
categories: [cc-metric-collector]
|
|
||||||
tags: ['Admin']
|
|
||||||
weight: 2
|
|
||||||
hugo_path: docs/reference/cc-metric-collector/collectors/rapl.md
|
|
||||||
---
|
|
||||||
-->
|
|
||||||
|
|
||||||
## `rapl` collector
|
## `rapl` collector
|
||||||
|
|
||||||
This collector reads running average power limit (RAPL) monitoring attributes to compute average power consumption metrics. See <https://www.kernel.org/doc/html/latest/power/powercap/powercap.html#monitoring-attributes>.
|
This collector reads running average power limit (RAPL) monitoring attributes to compute average power consumption metrics. See <https://www.kernel.org/doc/html/latest/power/powercap/powercap.html#monitoring-attributes>.
|
||||||
|
|||||||
@@ -1,22 +1,13 @@
|
|||||||
// Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
|
|
||||||
// All rights reserved. This file is part of cc-lib.
|
|
||||||
// Use of this source code is governed by a MIT-style
|
|
||||||
// license that can be found in the LICENSE file.
|
|
||||||
// additional authors:
|
|
||||||
// Holger Obermaier (NHR@KIT)
|
|
||||||
|
|
||||||
package collectors
|
package collectors
|
||||||
|
|
||||||
import (
|
import (
|
||||||
"bytes"
|
|
||||||
"encoding/json"
|
"encoding/json"
|
||||||
|
"errors"
|
||||||
"fmt"
|
"fmt"
|
||||||
"slices"
|
|
||||||
"strconv"
|
|
||||||
"time"
|
"time"
|
||||||
|
|
||||||
cclog "github.com/ClusterCockpit/cc-lib/v2/ccLogger"
|
cclog "github.com/ClusterCockpit/cc-lib/ccLogger"
|
||||||
lp "github.com/ClusterCockpit/cc-lib/v2/ccMessage"
|
lp "github.com/ClusterCockpit/cc-lib/ccMessage"
|
||||||
"github.com/ClusterCockpit/go-rocm-smi/pkg/rocm_smi"
|
"github.com/ClusterCockpit/go-rocm-smi/pkg/rocm_smi"
|
||||||
)
|
)
|
||||||
|
|
||||||
@@ -38,7 +29,6 @@ type RocmSmiCollectorDevice struct {
|
|||||||
|
|
||||||
type RocmSmiCollector struct {
|
type RocmSmiCollector struct {
|
||||||
metricCollector
|
metricCollector
|
||||||
|
|
||||||
config RocmSmiCollectorConfig // the configuration structure
|
config RocmSmiCollectorConfig // the configuration structure
|
||||||
devices []RocmSmiCollectorDevice
|
devices []RocmSmiCollectorDevice
|
||||||
}
|
}
|
||||||
@@ -51,46 +41,73 @@ type RocmSmiCollector struct {
|
|||||||
// Called once by the collector manager
|
// Called once by the collector manager
|
||||||
// All tags, meta data tags and metrics that do not change over the runtime should be set here
|
// All tags, meta data tags and metrics that do not change over the runtime should be set here
|
||||||
func (m *RocmSmiCollector) Init(config json.RawMessage) error {
|
func (m *RocmSmiCollector) Init(config json.RawMessage) error {
|
||||||
|
var err error = nil
|
||||||
// Always set the name early in Init() to use it in cclog.Component* functions
|
// Always set the name early in Init() to use it in cclog.Component* functions
|
||||||
m.name = "RocmSmiCollector"
|
m.name = "RocmSmiCollector"
|
||||||
// This is for later use, also call it early
|
// This is for later use, also call it early
|
||||||
if err := m.setup(); err != nil {
|
m.setup()
|
||||||
return fmt.Errorf("%s Init(): setup() call failed: %w", m.name, err)
|
// Define meta information sent with each metric
|
||||||
}
|
// (Can also be dynamic or this is the basic set with extension through AddMeta())
|
||||||
|
//m.meta = map[string]string{"source": m.name, "group": "AMD"}
|
||||||
|
// Define tags sent with each metric
|
||||||
|
// The 'type' tag is always needed, it defines the granulatity of the metric
|
||||||
|
// node -> whole system
|
||||||
|
// socket -> CPU socket (requires socket ID as 'type-id' tag)
|
||||||
|
// cpu -> single CPU hardware thread (requires cpu ID as 'type-id' tag)
|
||||||
|
//m.tags = map[string]string{"type": "node"}
|
||||||
// Read in the JSON configuration
|
// Read in the JSON configuration
|
||||||
if len(config) > 0 {
|
if len(config) > 0 {
|
||||||
d := json.NewDecoder(bytes.NewReader(config))
|
err = json.Unmarshal(config, &m.config)
|
||||||
d.DisallowUnknownFields()
|
if err != nil {
|
||||||
if err := d.Decode(&m.config); err != nil {
|
cclog.ComponentError(m.name, "Error reading config:", err.Error())
|
||||||
return fmt.Errorf("%s Init(): Error decoding JSON config: %w", m.name, err)
|
return err
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
ret := rocm_smi.Init()
|
ret := rocm_smi.Init()
|
||||||
if ret != rocm_smi.STATUS_SUCCESS {
|
if ret != rocm_smi.STATUS_SUCCESS {
|
||||||
return fmt.Errorf("%s Init(): failed to initialize ROCm SMI library", m.name)
|
err = errors.New("failed to initialize ROCm SMI library")
|
||||||
|
cclog.ComponentError(m.name, err.Error())
|
||||||
|
return err
|
||||||
}
|
}
|
||||||
|
|
||||||
numDevs, ret := rocm_smi.NumMonitorDevices()
|
numDevs, ret := rocm_smi.NumMonitorDevices()
|
||||||
if ret != rocm_smi.STATUS_SUCCESS {
|
if ret != rocm_smi.STATUS_SUCCESS {
|
||||||
return fmt.Errorf("%s Init(): failed to get number of GPUs from ROCm SMI library", m.name)
|
err = errors.New("failed to get number of GPUs from ROCm SMI library")
|
||||||
|
cclog.ComponentError(m.name, err.Error())
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
|
||||||
|
exclDev := func(s string) bool {
|
||||||
|
skip_device := false
|
||||||
|
for _, excl := range m.config.ExcludeDevices {
|
||||||
|
if excl == s {
|
||||||
|
skip_device = true
|
||||||
|
break
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return skip_device
|
||||||
}
|
}
|
||||||
|
|
||||||
m.devices = make([]RocmSmiCollectorDevice, 0)
|
m.devices = make([]RocmSmiCollectorDevice, 0)
|
||||||
|
|
||||||
for i := range numDevs {
|
for i := 0; i < numDevs; i++ {
|
||||||
str_i := strconv.Itoa(i)
|
str_i := fmt.Sprintf("%d", i)
|
||||||
if slices.Contains(m.config.ExcludeDevices, str_i) {
|
if exclDev(str_i) {
|
||||||
continue
|
continue
|
||||||
}
|
}
|
||||||
device, ret := rocm_smi.DeviceGetHandleByIndex(i)
|
device, ret := rocm_smi.DeviceGetHandleByIndex(i)
|
||||||
if ret != rocm_smi.STATUS_SUCCESS {
|
if ret != rocm_smi.STATUS_SUCCESS {
|
||||||
return fmt.Errorf("%s Init(): failed to get get handle for GPU %d", m.name, i)
|
err = fmt.Errorf("failed to get handle for GPU %d", i)
|
||||||
|
cclog.ComponentError(m.name, err.Error())
|
||||||
|
return err
|
||||||
}
|
}
|
||||||
|
|
||||||
pciInfo, ret := rocm_smi.DeviceGetPciInfo(device)
|
pciInfo, ret := rocm_smi.DeviceGetPciInfo(device)
|
||||||
if ret != rocm_smi.STATUS_SUCCESS {
|
if ret != rocm_smi.STATUS_SUCCESS {
|
||||||
return fmt.Errorf("%s Init(): failed to get PCI information for GPU %d", m.name, i)
|
err = fmt.Errorf("failed to get PCI information for GPU %d", i)
|
||||||
|
cclog.ComponentError(m.name, err.Error())
|
||||||
|
return err
|
||||||
}
|
}
|
||||||
|
|
||||||
pciId := fmt.Sprintf(
|
pciId := fmt.Sprintf(
|
||||||
@@ -100,7 +117,7 @@ func (m *RocmSmiCollector) Init(config json.RawMessage) error {
|
|||||||
pciInfo.Device,
|
pciInfo.Device,
|
||||||
pciInfo.Function)
|
pciInfo.Function)
|
||||||
|
|
||||||
if slices.Contains(m.config.ExcludeDevices, pciId) {
|
if exclDev(pciId) {
|
||||||
continue
|
continue
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -140,7 +157,7 @@ func (m *RocmSmiCollector) Init(config json.RawMessage) error {
|
|||||||
|
|
||||||
// Set this flag only if everything is initialized properly, all required files exist, ...
|
// Set this flag only if everything is initialized properly, all required files exist, ...
|
||||||
m.init = true
|
m.init = true
|
||||||
return nil
|
return err
|
||||||
}
|
}
|
||||||
|
|
||||||
// Read collects all metrics belonging to the sample collector
|
// Read collects all metrics belonging to the sample collector
|
||||||
@@ -158,135 +175,136 @@ func (m *RocmSmiCollector) Read(interval time.Duration, output chan lp.CCMessage
|
|||||||
|
|
||||||
if !dev.excludeMetrics["rocm_gfx_util"] {
|
if !dev.excludeMetrics["rocm_gfx_util"] {
|
||||||
value := metrics.Average_gfx_activity
|
value := metrics.Average_gfx_activity
|
||||||
y, err := lp.NewMessage("rocm_gfx_util", dev.tags, dev.meta, map[string]any{"value": value}, timestamp)
|
y, err := lp.NewMessage("rocm_gfx_util", dev.tags, dev.meta, map[string]interface{}{"value": value}, timestamp)
|
||||||
if err == nil {
|
if err == nil {
|
||||||
output <- y
|
output <- y
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
if !dev.excludeMetrics["rocm_umc_util"] {
|
if !dev.excludeMetrics["rocm_umc_util"] {
|
||||||
value := metrics.Average_umc_activity
|
value := metrics.Average_umc_activity
|
||||||
y, err := lp.NewMessage("rocm_umc_util", dev.tags, dev.meta, map[string]any{"value": value}, timestamp)
|
y, err := lp.NewMessage("rocm_umc_util", dev.tags, dev.meta, map[string]interface{}{"value": value}, timestamp)
|
||||||
if err == nil {
|
if err == nil {
|
||||||
output <- y
|
output <- y
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
if !dev.excludeMetrics["rocm_mm_util"] {
|
if !dev.excludeMetrics["rocm_mm_util"] {
|
||||||
value := metrics.Average_mm_activity
|
value := metrics.Average_mm_activity
|
||||||
y, err := lp.NewMessage("rocm_mm_util", dev.tags, dev.meta, map[string]any{"value": value}, timestamp)
|
y, err := lp.NewMessage("rocm_mm_util", dev.tags, dev.meta, map[string]interface{}{"value": value}, timestamp)
|
||||||
if err == nil {
|
if err == nil {
|
||||||
output <- y
|
output <- y
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
if !dev.excludeMetrics["rocm_avg_power"] {
|
if !dev.excludeMetrics["rocm_avg_power"] {
|
||||||
value := metrics.Average_socket_power
|
value := metrics.Average_socket_power
|
||||||
y, err := lp.NewMessage("rocm_avg_power", dev.tags, dev.meta, map[string]any{"value": value}, timestamp)
|
y, err := lp.NewMessage("rocm_avg_power", dev.tags, dev.meta, map[string]interface{}{"value": value}, timestamp)
|
||||||
if err == nil {
|
if err == nil {
|
||||||
output <- y
|
output <- y
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
if !dev.excludeMetrics["rocm_temp_mem"] {
|
if !dev.excludeMetrics["rocm_temp_mem"] {
|
||||||
value := metrics.Temperature_mem
|
value := metrics.Temperature_mem
|
||||||
y, err := lp.NewMessage("rocm_temp_mem", dev.tags, dev.meta, map[string]any{"value": value}, timestamp)
|
y, err := lp.NewMessage("rocm_temp_mem", dev.tags, dev.meta, map[string]interface{}{"value": value}, timestamp)
|
||||||
if err == nil {
|
if err == nil {
|
||||||
output <- y
|
output <- y
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
if !dev.excludeMetrics["rocm_temp_hotspot"] {
|
if !dev.excludeMetrics["rocm_temp_hotspot"] {
|
||||||
value := metrics.Temperature_hotspot
|
value := metrics.Temperature_hotspot
|
||||||
y, err := lp.NewMessage("rocm_temp_hotspot", dev.tags, dev.meta, map[string]any{"value": value}, timestamp)
|
y, err := lp.NewMessage("rocm_temp_hotspot", dev.tags, dev.meta, map[string]interface{}{"value": value}, timestamp)
|
||||||
if err == nil {
|
if err == nil {
|
||||||
output <- y
|
output <- y
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
if !dev.excludeMetrics["rocm_temp_edge"] {
|
if !dev.excludeMetrics["rocm_temp_edge"] {
|
||||||
value := metrics.Temperature_edge
|
value := metrics.Temperature_edge
|
||||||
y, err := lp.NewMessage("rocm_temp_edge", dev.tags, dev.meta, map[string]any{"value": value}, timestamp)
|
y, err := lp.NewMessage("rocm_temp_edge", dev.tags, dev.meta, map[string]interface{}{"value": value}, timestamp)
|
||||||
if err == nil {
|
if err == nil {
|
||||||
output <- y
|
output <- y
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
if !dev.excludeMetrics["rocm_temp_vrgfx"] {
|
if !dev.excludeMetrics["rocm_temp_vrgfx"] {
|
||||||
value := metrics.Temperature_vrgfx
|
value := metrics.Temperature_vrgfx
|
||||||
y, err := lp.NewMessage("rocm_temp_vrgfx", dev.tags, dev.meta, map[string]any{"value": value}, timestamp)
|
y, err := lp.NewMessage("rocm_temp_vrgfx", dev.tags, dev.meta, map[string]interface{}{"value": value}, timestamp)
|
||||||
if err == nil {
|
if err == nil {
|
||||||
output <- y
|
output <- y
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
if !dev.excludeMetrics["rocm_temp_vrsoc"] {
|
if !dev.excludeMetrics["rocm_temp_vrsoc"] {
|
||||||
value := metrics.Temperature_vrsoc
|
value := metrics.Temperature_vrsoc
|
||||||
y, err := lp.NewMessage("rocm_temp_vrsoc", dev.tags, dev.meta, map[string]any{"value": value}, timestamp)
|
y, err := lp.NewMessage("rocm_temp_vrsoc", dev.tags, dev.meta, map[string]interface{}{"value": value}, timestamp)
|
||||||
if err == nil {
|
if err == nil {
|
||||||
output <- y
|
output <- y
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
if !dev.excludeMetrics["rocm_temp_vrmem"] {
|
if !dev.excludeMetrics["rocm_temp_vrmem"] {
|
||||||
value := metrics.Temperature_vrmem
|
value := metrics.Temperature_vrmem
|
||||||
y, err := lp.NewMessage("rocm_temp_vrmem", dev.tags, dev.meta, map[string]any{"value": value}, timestamp)
|
y, err := lp.NewMessage("rocm_temp_vrmem", dev.tags, dev.meta, map[string]interface{}{"value": value}, timestamp)
|
||||||
if err == nil {
|
if err == nil {
|
||||||
output <- y
|
output <- y
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
if !dev.excludeMetrics["rocm_gfx_clock"] {
|
if !dev.excludeMetrics["rocm_gfx_clock"] {
|
||||||
value := metrics.Average_gfxclk_frequency
|
value := metrics.Average_gfxclk_frequency
|
||||||
y, err := lp.NewMessage("rocm_gfx_clock", dev.tags, dev.meta, map[string]any{"value": value}, timestamp)
|
y, err := lp.NewMessage("rocm_gfx_clock", dev.tags, dev.meta, map[string]interface{}{"value": value}, timestamp)
|
||||||
if err == nil {
|
if err == nil {
|
||||||
output <- y
|
output <- y
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
if !dev.excludeMetrics["rocm_soc_clock"] {
|
if !dev.excludeMetrics["rocm_soc_clock"] {
|
||||||
value := metrics.Average_socclk_frequency
|
value := metrics.Average_socclk_frequency
|
||||||
y, err := lp.NewMessage("rocm_soc_clock", dev.tags, dev.meta, map[string]any{"value": value}, timestamp)
|
y, err := lp.NewMessage("rocm_soc_clock", dev.tags, dev.meta, map[string]interface{}{"value": value}, timestamp)
|
||||||
if err == nil {
|
if err == nil {
|
||||||
output <- y
|
output <- y
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
if !dev.excludeMetrics["rocm_u_clock"] {
|
if !dev.excludeMetrics["rocm_u_clock"] {
|
||||||
value := metrics.Average_uclk_frequency
|
value := metrics.Average_uclk_frequency
|
||||||
y, err := lp.NewMessage("rocm_u_clock", dev.tags, dev.meta, map[string]any{"value": value}, timestamp)
|
y, err := lp.NewMessage("rocm_u_clock", dev.tags, dev.meta, map[string]interface{}{"value": value}, timestamp)
|
||||||
if err == nil {
|
if err == nil {
|
||||||
output <- y
|
output <- y
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
if !dev.excludeMetrics["rocm_v0_clock"] {
|
if !dev.excludeMetrics["rocm_v0_clock"] {
|
||||||
value := metrics.Average_vclk0_frequency
|
value := metrics.Average_vclk0_frequency
|
||||||
y, err := lp.NewMessage("rocm_v0_clock", dev.tags, dev.meta, map[string]any{"value": value}, timestamp)
|
y, err := lp.NewMessage("rocm_v0_clock", dev.tags, dev.meta, map[string]interface{}{"value": value}, timestamp)
|
||||||
if err == nil {
|
if err == nil {
|
||||||
output <- y
|
output <- y
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
if !dev.excludeMetrics["rocm_v1_clock"] {
|
if !dev.excludeMetrics["rocm_v1_clock"] {
|
||||||
value := metrics.Average_vclk1_frequency
|
value := metrics.Average_vclk1_frequency
|
||||||
y, err := lp.NewMessage("rocm_v1_clock", dev.tags, dev.meta, map[string]any{"value": value}, timestamp)
|
y, err := lp.NewMessage("rocm_v1_clock", dev.tags, dev.meta, map[string]interface{}{"value": value}, timestamp)
|
||||||
if err == nil {
|
if err == nil {
|
||||||
output <- y
|
output <- y
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
if !dev.excludeMetrics["rocm_d0_clock"] {
|
if !dev.excludeMetrics["rocm_d0_clock"] {
|
||||||
value := metrics.Average_dclk0_frequency
|
value := metrics.Average_dclk0_frequency
|
||||||
y, err := lp.NewMessage("rocm_d0_clock", dev.tags, dev.meta, map[string]any{"value": value}, timestamp)
|
y, err := lp.NewMessage("rocm_d0_clock", dev.tags, dev.meta, map[string]interface{}{"value": value}, timestamp)
|
||||||
if err == nil {
|
if err == nil {
|
||||||
output <- y
|
output <- y
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
if !dev.excludeMetrics["rocm_d1_clock"] {
|
if !dev.excludeMetrics["rocm_d1_clock"] {
|
||||||
value := metrics.Average_dclk1_frequency
|
value := metrics.Average_dclk1_frequency
|
||||||
y, err := lp.NewMessage("rocm_d1_clock", dev.tags, dev.meta, map[string]any{"value": value}, timestamp)
|
y, err := lp.NewMessage("rocm_d1_clock", dev.tags, dev.meta, map[string]interface{}{"value": value}, timestamp)
|
||||||
if err == nil {
|
if err == nil {
|
||||||
output <- y
|
output <- y
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
if !dev.excludeMetrics["rocm_temp_hbm"] {
|
if !dev.excludeMetrics["rocm_temp_hbm"] {
|
||||||
for i := range rocm_smi.NUM_HBM_INSTANCES {
|
for i := 0; i < rocm_smi.NUM_HBM_INSTANCES; i++ {
|
||||||
value := metrics.Temperature_hbm[i]
|
value := metrics.Temperature_hbm[i]
|
||||||
y, err := lp.NewMessage("rocm_temp_hbm", dev.tags, dev.meta, map[string]any{"value": value}, timestamp)
|
y, err := lp.NewMessage("rocm_temp_hbm", dev.tags, dev.meta, map[string]interface{}{"value": value}, timestamp)
|
||||||
if err == nil {
|
if err == nil {
|
||||||
y.AddTag("stype", "device")
|
y.AddTag("stype", "device")
|
||||||
y.AddTag("stype-id", strconv.Itoa(i))
|
y.AddTag("stype-id", fmt.Sprintf("%d", i))
|
||||||
output <- y
|
output <- y
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
// Close metric collector: close network connection, close files, close libraries, ...
|
// Close metric collector: close network connection, close files, close libraries, ...
|
||||||
|
|||||||
@@ -1,23 +1,10 @@
|
|||||||
<!--
|
|
||||||
---
|
|
||||||
title: "ROCm SMI metric collector"
|
|
||||||
description: Collect metrics for AMD GPUs using the SMI library
|
|
||||||
categories: [cc-metric-collector]
|
|
||||||
tags: ['Admin']
|
|
||||||
weight: 2
|
|
||||||
hugo_path: docs/reference/cc-metric-collector/collectors/rocmsmi.md
|
|
||||||
---
|
|
||||||
-->
|
|
||||||
|
|
||||||
|
|
||||||
## `rocm_smi` collector
|
## `rocm_smi` collector
|
||||||
|
|
||||||
```json
|
```json
|
||||||
"rocm_smi": {
|
"rocm_smi": {
|
||||||
"exclude_devices": [
|
"exclude_devices": [
|
||||||
"0",
|
"0","1", "0000000:ff:01.0"
|
||||||
"1",
|
|
||||||
"0000000:ff:01.0"
|
|
||||||
],
|
],
|
||||||
"exclude_metrics": [
|
"exclude_metrics": [
|
||||||
"rocm_mm_util",
|
"rocm_mm_util",
|
||||||
@@ -25,7 +12,7 @@ hugo_path: docs/reference/cc-metric-collector/collectors/rocmsmi.md
|
|||||||
],
|
],
|
||||||
"use_pci_info_as_type_id": true,
|
"use_pci_info_as_type_id": true,
|
||||||
"add_pci_info_tag": false,
|
"add_pci_info_tag": false,
|
||||||
"add_serial_meta": false
|
"add_serial_meta": false,
|
||||||
}
|
}
|
||||||
```
|
```
|
||||||
|
|
||||||
|
|||||||
@@ -1,19 +1,11 @@
|
|||||||
// Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
|
|
||||||
// All rights reserved. This file is part of cc-lib.
|
|
||||||
// Use of this source code is governed by a MIT-style
|
|
||||||
// license that can be found in the LICENSE file.
|
|
||||||
// additional authors:
|
|
||||||
// Holger Obermaier (NHR@KIT)
|
|
||||||
|
|
||||||
package collectors
|
package collectors
|
||||||
|
|
||||||
import (
|
import (
|
||||||
"bytes"
|
|
||||||
"encoding/json"
|
"encoding/json"
|
||||||
"fmt"
|
|
||||||
"time"
|
"time"
|
||||||
|
|
||||||
lp "github.com/ClusterCockpit/cc-lib/v2/ccMessage"
|
cclog "github.com/ClusterCockpit/cc-lib/ccLogger"
|
||||||
|
lp "github.com/ClusterCockpit/cc-lib/ccMessage"
|
||||||
)
|
)
|
||||||
|
|
||||||
// These are the fields we read from the JSON configuration
|
// These are the fields we read from the JSON configuration
|
||||||
@@ -25,7 +17,6 @@ type SampleCollectorConfig struct {
|
|||||||
// defined by metricCollector (name, init, ...)
|
// defined by metricCollector (name, init, ...)
|
||||||
type SampleCollector struct {
|
type SampleCollector struct {
|
||||||
metricCollector
|
metricCollector
|
||||||
|
|
||||||
config SampleCollectorConfig // the configuration structure
|
config SampleCollectorConfig // the configuration structure
|
||||||
meta map[string]string // default meta information
|
meta map[string]string // default meta information
|
||||||
tags map[string]string // default tags
|
tags map[string]string // default tags
|
||||||
@@ -43,19 +34,14 @@ func (m *SampleCollector) Init(config json.RawMessage) error {
|
|||||||
// Always set the name early in Init() to use it in cclog.Component* functions
|
// Always set the name early in Init() to use it in cclog.Component* functions
|
||||||
m.name = "SampleCollector"
|
m.name = "SampleCollector"
|
||||||
// This is for later use, also call it early
|
// This is for later use, also call it early
|
||||||
if err := m.setup(); err != nil {
|
m.setup()
|
||||||
return fmt.Errorf("%s Init(): setup() call failed: %w", m.name, err)
|
|
||||||
}
|
|
||||||
// Tell whether the collector should be run in parallel with others (reading files, ...)
|
// Tell whether the collector should be run in parallel with others (reading files, ...)
|
||||||
// or it should be run serially, mostly for collectors actually doing measurements
|
// or it should be run serially, mostly for collectors actually doing measurements
|
||||||
// because they should not measure the execution of the other collectors
|
// because they should not measure the execution of the other collectors
|
||||||
m.parallel = true
|
m.parallel = true
|
||||||
// Define meta information sent with each metric
|
// Define meta information sent with each metric
|
||||||
// (Can also be dynamic or this is the basic set with extension through AddMeta())
|
// (Can also be dynamic or this is the basic set with extension through AddMeta())
|
||||||
m.meta = map[string]string{
|
m.meta = map[string]string{"source": m.name, "group": "SAMPLE"}
|
||||||
"source": m.name,
|
|
||||||
"group": "SAMPLE",
|
|
||||||
}
|
|
||||||
// Define tags sent with each metric
|
// Define tags sent with each metric
|
||||||
// The 'type' tag is always needed, it defines the granularity of the metric
|
// The 'type' tag is always needed, it defines the granularity of the metric
|
||||||
// node -> whole system
|
// node -> whole system
|
||||||
@@ -66,15 +52,13 @@ func (m *SampleCollector) Init(config json.RawMessage) error {
|
|||||||
// core -> single CPU core that may consist of multiple hardware threads (SMT) (requires core ID as 'type-id' tag)
|
// core -> single CPU core that may consist of multiple hardware threads (SMT) (requires core ID as 'type-id' tag)
|
||||||
// hwthtread -> single CPU hardware thread (requires hardware thread ID as 'type-id' tag)
|
// hwthtread -> single CPU hardware thread (requires hardware thread ID as 'type-id' tag)
|
||||||
// accelerator -> A accelerator device like GPU or FPGA (requires an accelerator ID as 'type-id' tag)
|
// accelerator -> A accelerator device like GPU or FPGA (requires an accelerator ID as 'type-id' tag)
|
||||||
m.tags = map[string]string{
|
m.tags = map[string]string{"type": "node"}
|
||||||
"type": "node",
|
|
||||||
}
|
|
||||||
// Read in the JSON configuration
|
// Read in the JSON configuration
|
||||||
if len(config) > 0 {
|
if len(config) > 0 {
|
||||||
d := json.NewDecoder(bytes.NewReader(config))
|
err = json.Unmarshal(config, &m.config)
|
||||||
d.DisallowUnknownFields()
|
if err != nil {
|
||||||
if err := d.Decode(&m.config); err != nil {
|
cclog.ComponentError(m.name, "Error reading config:", err.Error())
|
||||||
return fmt.Errorf("%s Init(): Error decoding JSON config: %w", m.name, err)
|
return err
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -101,11 +85,12 @@ func (m *SampleCollector) Read(interval time.Duration, output chan lp.CCMessage)
|
|||||||
// stop := readState()
|
// stop := readState()
|
||||||
// value = (stop - start) / interval.Seconds()
|
// value = (stop - start) / interval.Seconds()
|
||||||
|
|
||||||
y, err := lp.NewMetric("sample_metric", m.tags, m.meta, value, timestamp)
|
y, err := lp.NewMessage("sample_metric", m.tags, m.meta, map[string]interface{}{"value": value}, timestamp)
|
||||||
if err == nil {
|
if err == nil {
|
||||||
// Send it to output channel
|
// Send it to output channel
|
||||||
output <- y
|
output <- y
|
||||||
}
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
// Close metric collector: close network connection, close files, close libraries, ...
|
// Close metric collector: close network connection, close files, close libraries, ...
|
||||||
|
|||||||
@@ -1,21 +1,12 @@
|
|||||||
// Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
|
|
||||||
// All rights reserved. This file is part of cc-lib.
|
|
||||||
// Use of this source code is governed by a MIT-style
|
|
||||||
// license that can be found in the LICENSE file.
|
|
||||||
// additional authors:
|
|
||||||
// Holger Obermaier (NHR@KIT)
|
|
||||||
|
|
||||||
package collectors
|
package collectors
|
||||||
|
|
||||||
import (
|
import (
|
||||||
"bytes"
|
|
||||||
"encoding/json"
|
"encoding/json"
|
||||||
"fmt"
|
|
||||||
"sync"
|
"sync"
|
||||||
"time"
|
"time"
|
||||||
|
|
||||||
cclog "github.com/ClusterCockpit/cc-lib/v2/ccLogger"
|
cclog "github.com/ClusterCockpit/cc-lib/ccLogger"
|
||||||
lp "github.com/ClusterCockpit/cc-lib/v2/ccMessage"
|
lp "github.com/ClusterCockpit/cc-lib/ccMessage"
|
||||||
)
|
)
|
||||||
|
|
||||||
// These are the fields we read from the JSON configuration
|
// These are the fields we read from the JSON configuration
|
||||||
@@ -27,7 +18,6 @@ type SampleTimerCollectorConfig struct {
|
|||||||
// defined by metricCollector (name, init, ...)
|
// defined by metricCollector (name, init, ...)
|
||||||
type SampleTimerCollector struct {
|
type SampleTimerCollector struct {
|
||||||
metricCollector
|
metricCollector
|
||||||
|
|
||||||
wg sync.WaitGroup // sync group for management
|
wg sync.WaitGroup // sync group for management
|
||||||
done chan bool // channel for management
|
done chan bool // channel for management
|
||||||
meta map[string]string // default meta information
|
meta map[string]string // default meta information
|
||||||
@@ -39,39 +29,33 @@ type SampleTimerCollector struct {
|
|||||||
}
|
}
|
||||||
|
|
||||||
func (m *SampleTimerCollector) Init(name string, config json.RawMessage) error {
|
func (m *SampleTimerCollector) Init(name string, config json.RawMessage) error {
|
||||||
var err error
|
var err error = nil
|
||||||
// Always set the name early in Init() to use it in cclog.Component* functions
|
// Always set the name early in Init() to use it in cclog.Component* functions
|
||||||
m.name = "SampleTimerCollector"
|
m.name = "SampleTimerCollector"
|
||||||
// This is for later use, also call it early
|
// This is for later use, also call it early
|
||||||
if err = m.setup(); err != nil {
|
m.setup()
|
||||||
return fmt.Errorf("%s Init(): setup() call failed: %w", m.name, err)
|
|
||||||
}
|
|
||||||
// Define meta information sent with each metric
|
// Define meta information sent with each metric
|
||||||
// (Can also be dynamic or this is the basic set with extension through AddMeta())
|
// (Can also be dynamic or this is the basic set with extension through AddMeta())
|
||||||
m.meta = map[string]string{
|
m.meta = map[string]string{"source": m.name, "group": "SAMPLE"}
|
||||||
"source": m.name,
|
|
||||||
"group": "SAMPLE",
|
|
||||||
}
|
|
||||||
// Define tags sent with each metric
|
// Define tags sent with each metric
|
||||||
// The 'type' tag is always needed, it defines the granularity of the metric
|
// The 'type' tag is always needed, it defines the granularity of the metric
|
||||||
// node -> whole system
|
// node -> whole system
|
||||||
// socket -> CPU socket (requires socket ID as 'type-id' tag)
|
// socket -> CPU socket (requires socket ID as 'type-id' tag)
|
||||||
// cpu -> single CPU hardware thread (requires cpu ID as 'type-id' tag)
|
// cpu -> single CPU hardware thread (requires cpu ID as 'type-id' tag)
|
||||||
m.tags = map[string]string{
|
m.tags = map[string]string{"type": "node"}
|
||||||
"type": "node",
|
|
||||||
}
|
|
||||||
// Read in the JSON configuration
|
// Read in the JSON configuration
|
||||||
if len(config) > 0 {
|
if len(config) > 0 {
|
||||||
d := json.NewDecoder(bytes.NewReader(config))
|
err = json.Unmarshal(config, &m.config)
|
||||||
d.DisallowUnknownFields()
|
if err != nil {
|
||||||
if err := d.Decode(&m.config); err != nil {
|
cclog.ComponentError(m.name, "Error reading config:", err.Error())
|
||||||
return fmt.Errorf("%s Init(): error decoding JSON config: %w", m.name, err)
|
return err
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
// Parse the read interval duration
|
// Parse the read interval duration
|
||||||
m.interval, err = time.ParseDuration(m.config.Interval)
|
m.interval, err = time.ParseDuration(m.config.Interval)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return fmt.Errorf("%s Init(): error parsing interval: %w", m.name, err)
|
cclog.ComponentError(m.name, "Error parsing interval:", err.Error())
|
||||||
|
return err
|
||||||
}
|
}
|
||||||
|
|
||||||
// Storage for output channel
|
// Storage for output channel
|
||||||
@@ -82,11 +66,13 @@ func (m *SampleTimerCollector) Init(name string, config json.RawMessage) error {
|
|||||||
m.ticker = time.NewTicker(m.interval)
|
m.ticker = time.NewTicker(m.interval)
|
||||||
|
|
||||||
// Start the timer loop with return functionality by sending 'true' to the done channel
|
// Start the timer loop with return functionality by sending 'true' to the done channel
|
||||||
m.wg.Go(func() {
|
m.wg.Add(1)
|
||||||
|
go func() {
|
||||||
select {
|
select {
|
||||||
case <-m.done:
|
case <-m.done:
|
||||||
// Exit the timer loop
|
// Exit the timer loop
|
||||||
cclog.ComponentDebug(m.name, "Closing...")
|
cclog.ComponentDebug(m.name, "Closing...")
|
||||||
|
m.wg.Done()
|
||||||
return
|
return
|
||||||
case timestamp := <-m.ticker.C:
|
case timestamp := <-m.ticker.C:
|
||||||
// This is executed every timer tick but we have to wait until the first
|
// This is executed every timer tick but we have to wait until the first
|
||||||
@@ -95,7 +81,7 @@ func (m *SampleTimerCollector) Init(name string, config json.RawMessage) error {
|
|||||||
m.ReadMetrics(timestamp)
|
m.ReadMetrics(timestamp)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
})
|
}()
|
||||||
|
|
||||||
// Set this flag only if everything is initialized properly, all required files exist, ...
|
// Set this flag only if everything is initialized properly, all required files exist, ...
|
||||||
m.init = true
|
m.init = true
|
||||||
@@ -114,7 +100,7 @@ func (m *SampleTimerCollector) ReadMetrics(timestamp time.Time) {
|
|||||||
// stop := readState()
|
// stop := readState()
|
||||||
// value = (stop - start) / interval.Seconds()
|
// value = (stop - start) / interval.Seconds()
|
||||||
|
|
||||||
y, err := lp.NewMetric("sample_metric", m.tags, m.meta, value, timestamp)
|
y, err := lp.NewMessage("sample_metric", m.tags, m.meta, map[string]interface{}{"value": value}, timestamp)
|
||||||
if err == nil && m.output != nil {
|
if err == nil && m.output != nil {
|
||||||
// Send it to output channel if we have a valid channel
|
// Send it to output channel if we have a valid channel
|
||||||
m.output <- y
|
m.output <- y
|
||||||
|
|||||||
@@ -1,24 +1,17 @@
|
|||||||
// Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
|
|
||||||
// All rights reserved. This file is part of cc-lib.
|
|
||||||
// Use of this source code is governed by a MIT-style
|
|
||||||
// license that can be found in the LICENSE file.
|
|
||||||
// additional authors:
|
|
||||||
// Holger Obermaier (NHR@KIT)
|
|
||||||
|
|
||||||
package collectors
|
package collectors
|
||||||
|
|
||||||
import (
|
import (
|
||||||
"bufio"
|
"bufio"
|
||||||
"bytes"
|
|
||||||
"encoding/json"
|
"encoding/json"
|
||||||
"fmt"
|
"fmt"
|
||||||
|
"math"
|
||||||
"os"
|
"os"
|
||||||
"strconv"
|
"strconv"
|
||||||
"strings"
|
"strings"
|
||||||
"time"
|
"time"
|
||||||
|
|
||||||
cclog "github.com/ClusterCockpit/cc-lib/v2/ccLogger"
|
cclog "github.com/ClusterCockpit/cc-lib/ccLogger"
|
||||||
lp "github.com/ClusterCockpit/cc-lib/v2/ccMessage"
|
lp "github.com/ClusterCockpit/cc-lib/ccMessage"
|
||||||
)
|
)
|
||||||
|
|
||||||
const SCHEDSTATFILE = `/proc/schedstat`
|
const SCHEDSTATFILE = `/proc/schedstat`
|
||||||
@@ -32,7 +25,6 @@ type SchedstatCollectorConfig struct {
|
|||||||
// defined by metricCollector (name, init, ...)
|
// defined by metricCollector (name, init, ...)
|
||||||
type SchedstatCollector struct {
|
type SchedstatCollector struct {
|
||||||
metricCollector
|
metricCollector
|
||||||
|
|
||||||
config SchedstatCollectorConfig // the configuration structure
|
config SchedstatCollectorConfig // the configuration structure
|
||||||
lastTimestamp time.Time // Store time stamp of last tick to derive values
|
lastTimestamp time.Time // Store time stamp of last tick to derive values
|
||||||
meta map[string]string // default meta information
|
meta map[string]string // default meta information
|
||||||
@@ -48,39 +40,37 @@ type SchedstatCollector struct {
|
|||||||
// Called once by the collector manager
|
// Called once by the collector manager
|
||||||
// All tags, meta data tags and metrics that do not change over the runtime should be set here
|
// All tags, meta data tags and metrics that do not change over the runtime should be set here
|
||||||
func (m *SchedstatCollector) Init(config json.RawMessage) error {
|
func (m *SchedstatCollector) Init(config json.RawMessage) error {
|
||||||
|
var err error = nil
|
||||||
// Always set the name early in Init() to use it in cclog.Component* functions
|
// Always set the name early in Init() to use it in cclog.Component* functions
|
||||||
m.name = "SchedstatCollector"
|
m.name = "SchedstatCollector"
|
||||||
// This is for later use, also call it early
|
// This is for later use, also call it early
|
||||||
if err := m.setup(); err != nil {
|
m.setup()
|
||||||
return fmt.Errorf("%s Init(): setup() call failed: %w", m.name, err)
|
|
||||||
}
|
|
||||||
// Tell whether the collector should be run in parallel with others (reading files, ...)
|
// Tell whether the collector should be run in parallel with others (reading files, ...)
|
||||||
// or it should be run serially, mostly for collectors actually doing measurements
|
// or it should be run serially, mostly for collectors acutally doing measurements
|
||||||
// because they should not measure the execution of the other collectors
|
// because they should not measure the execution of the other collectors
|
||||||
m.parallel = true
|
m.parallel = true
|
||||||
// Define meta information sent with each metric
|
// Define meta information sent with each metric
|
||||||
// (Can also be dynamic or this is the basic set with extension through AddMeta())
|
// (Can also be dynamic or this is the basic set with extension through AddMeta())
|
||||||
m.meta = map[string]string{
|
m.meta = map[string]string{"source": m.name, "group": "SCHEDSTAT"}
|
||||||
"source": m.name,
|
|
||||||
"group": "SCHEDSTAT",
|
|
||||||
}
|
|
||||||
|
|
||||||
// Read in the JSON configuration
|
// Read in the JSON configuration
|
||||||
if len(config) > 0 {
|
if len(config) > 0 {
|
||||||
d := json.NewDecoder(bytes.NewReader(config))
|
err = json.Unmarshal(config, &m.config)
|
||||||
d.DisallowUnknownFields()
|
if err != nil {
|
||||||
if err := d.Decode(&m.config); err != nil {
|
cclog.ComponentError(m.name, "Error reading config:", err.Error())
|
||||||
return fmt.Errorf("%s Init(): failed to decode JSON config: %w", m.name, err)
|
return err
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// Check input file
|
// Check input file
|
||||||
file, err := os.Open(SCHEDSTATFILE)
|
file, err := os.Open(string(SCHEDSTATFILE))
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return fmt.Errorf("%s Init(): Failed opening scheduler statistics file \"%s\": %w", m.name, SCHEDSTATFILE, err)
|
cclog.ComponentError(m.name, err.Error())
|
||||||
}
|
}
|
||||||
|
defer file.Close()
|
||||||
|
|
||||||
// Pre-generate tags for all CPUs
|
// Pre-generate tags for all CPUs
|
||||||
|
num_cpus := 0
|
||||||
m.cputags = make(map[string]map[string]string)
|
m.cputags = make(map[string]map[string]string)
|
||||||
m.olddata = make(map[string]map[string]int64)
|
m.olddata = make(map[string]map[string]int64)
|
||||||
scanner := bufio.NewScanner(file)
|
scanner := bufio.NewScanner(file)
|
||||||
@@ -92,19 +82,11 @@ func (m *SchedstatCollector) Init(config json.RawMessage) error {
|
|||||||
cpu, _ := strconv.Atoi(cpustr)
|
cpu, _ := strconv.Atoi(cpustr)
|
||||||
running, _ := strconv.ParseInt(linefields[7], 10, 64)
|
running, _ := strconv.ParseInt(linefields[7], 10, 64)
|
||||||
waiting, _ := strconv.ParseInt(linefields[8], 10, 64)
|
waiting, _ := strconv.ParseInt(linefields[8], 10, 64)
|
||||||
m.cputags[linefields[0]] = map[string]string{
|
m.cputags[linefields[0]] = map[string]string{"type": "hwthread", "type-id": fmt.Sprintf("%d", cpu)}
|
||||||
"type": "hwthread",
|
m.olddata[linefields[0]] = map[string]int64{"running": running, "waiting": waiting}
|
||||||
"type-id": strconv.Itoa(cpu),
|
num_cpus++
|
||||||
}
|
|
||||||
m.olddata[linefields[0]] = map[string]int64{
|
|
||||||
"running": running,
|
|
||||||
"waiting": waiting,
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
if err := file.Close(); err != nil {
|
|
||||||
return fmt.Errorf("%s Init(): Failed closing scheduler statistics file \"%s\": %w", m.name, SCHEDSTATFILE, err)
|
|
||||||
}
|
|
||||||
|
|
||||||
// Save current timestamp
|
// Save current timestamp
|
||||||
m.lastTimestamp = time.Now()
|
m.lastTimestamp = time.Now()
|
||||||
@@ -120,14 +102,14 @@ func (m *SchedstatCollector) ParseProcLine(linefields []string, tags map[string]
|
|||||||
diff_running := running - m.olddata[linefields[0]]["running"]
|
diff_running := running - m.olddata[linefields[0]]["running"]
|
||||||
diff_waiting := waiting - m.olddata[linefields[0]]["waiting"]
|
diff_waiting := waiting - m.olddata[linefields[0]]["waiting"]
|
||||||
|
|
||||||
l_running := float64(diff_running) / tsdelta.Seconds() / 1000_000_000
|
var l_running float64 = float64(diff_running) / tsdelta.Seconds() / (math.Pow(1000, 3))
|
||||||
l_waiting := float64(diff_waiting) / tsdelta.Seconds() / 1000_000_000
|
var l_waiting float64 = float64(diff_waiting) / tsdelta.Seconds() / (math.Pow(1000, 3))
|
||||||
|
|
||||||
m.olddata[linefields[0]]["running"] = running
|
m.olddata[linefields[0]]["running"] = running
|
||||||
m.olddata[linefields[0]]["waiting"] = waiting
|
m.olddata[linefields[0]]["waiting"] = waiting
|
||||||
value := l_running + l_waiting
|
value := l_running + l_waiting
|
||||||
|
|
||||||
y, err := lp.NewMetric("cpu_load_core", tags, m.meta, value, now)
|
y, err := lp.NewMessage("cpu_load_core", tags, m.meta, map[string]interface{}{"value": value}, now)
|
||||||
if err == nil {
|
if err == nil {
|
||||||
// Send it to output channel
|
// Send it to output channel
|
||||||
output <- y
|
output <- y
|
||||||
@@ -141,23 +123,15 @@ func (m *SchedstatCollector) Read(interval time.Duration, output chan lp.CCMessa
|
|||||||
return
|
return
|
||||||
}
|
}
|
||||||
|
|
||||||
// timestamps
|
//timestamps
|
||||||
now := time.Now()
|
now := time.Now()
|
||||||
tsdelta := now.Sub(m.lastTimestamp)
|
tsdelta := now.Sub(m.lastTimestamp)
|
||||||
|
|
||||||
file, err := os.Open(SCHEDSTATFILE)
|
file, err := os.Open(string(SCHEDSTATFILE))
|
||||||
if err != nil {
|
if err != nil {
|
||||||
cclog.ComponentError(
|
cclog.ComponentError(m.name, err.Error())
|
||||||
m.name,
|
|
||||||
fmt.Sprintf("Read(): Failed to open file '%s': %v", SCHEDSTATFILE, err))
|
|
||||||
}
|
}
|
||||||
defer func() {
|
defer file.Close()
|
||||||
if err := file.Close(); err != nil {
|
|
||||||
cclog.ComponentError(
|
|
||||||
m.name,
|
|
||||||
fmt.Sprintf("Read(): Failed to close file '%s': %v", SCHEDSTATFILE, err))
|
|
||||||
}
|
|
||||||
}()
|
|
||||||
|
|
||||||
scanner := bufio.NewScanner(file)
|
scanner := bufio.NewScanner(file)
|
||||||
for scanner.Scan() {
|
for scanner.Scan() {
|
||||||
@@ -169,6 +143,7 @@ func (m *SchedstatCollector) Read(interval time.Duration, output chan lp.CCMessa
|
|||||||
}
|
}
|
||||||
|
|
||||||
m.lastTimestamp = now
|
m.lastTimestamp = now
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
// Close metric collector: close network connection, close files, close libraries, ...
|
// Close metric collector: close network connection, close files, close libraries, ...
|
||||||
|
|||||||
@@ -1,13 +1,3 @@
|
|||||||
<!--
|
|
||||||
---
|
|
||||||
title: SchedStat Metric collector
|
|
||||||
description: Collect metrics from `/proc/schedstat`
|
|
||||||
categories: [cc-metric-collector]
|
|
||||||
tags: ['Admin']
|
|
||||||
weight: 2
|
|
||||||
hugo_path: docs/reference/cc-metric-collector/collectors/schedstat.md
|
|
||||||
---
|
|
||||||
-->
|
|
||||||
|
|
||||||
## `schedstat` collector
|
## `schedstat` collector
|
||||||
```json
|
```json
|
||||||
|
|||||||
@@ -1,21 +1,13 @@
|
|||||||
// Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
|
|
||||||
// All rights reserved. This file is part of cc-lib.
|
|
||||||
// Use of this source code is governed by a MIT-style
|
|
||||||
// license that can be found in the LICENSE file.
|
|
||||||
// additional authors:
|
|
||||||
// Holger Obermaier (NHR@KIT)
|
|
||||||
|
|
||||||
package collectors
|
package collectors
|
||||||
|
|
||||||
import (
|
import (
|
||||||
"bytes"
|
|
||||||
"encoding/json"
|
"encoding/json"
|
||||||
"fmt"
|
|
||||||
"runtime"
|
"runtime"
|
||||||
"syscall"
|
"syscall"
|
||||||
"time"
|
"time"
|
||||||
|
|
||||||
lp "github.com/ClusterCockpit/cc-lib/v2/ccMessage"
|
cclog "github.com/ClusterCockpit/cc-lib/ccLogger"
|
||||||
|
lp "github.com/ClusterCockpit/cc-lib/ccMessage"
|
||||||
)
|
)
|
||||||
|
|
||||||
type SelfCollectorConfig struct {
|
type SelfCollectorConfig struct {
|
||||||
@@ -27,7 +19,6 @@ type SelfCollectorConfig struct {
|
|||||||
|
|
||||||
type SelfCollector struct {
|
type SelfCollector struct {
|
||||||
metricCollector
|
metricCollector
|
||||||
|
|
||||||
config SelfCollectorConfig // the configuration structure
|
config SelfCollectorConfig // the configuration structure
|
||||||
meta map[string]string // default meta information
|
meta map[string]string // default meta information
|
||||||
tags map[string]string // default tags
|
tags map[string]string // default tags
|
||||||
@@ -36,22 +27,15 @@ type SelfCollector struct {
|
|||||||
func (m *SelfCollector) Init(config json.RawMessage) error {
|
func (m *SelfCollector) Init(config json.RawMessage) error {
|
||||||
var err error = nil
|
var err error = nil
|
||||||
m.name = "SelfCollector"
|
m.name = "SelfCollector"
|
||||||
if err := m.setup(); err != nil {
|
m.setup()
|
||||||
return fmt.Errorf("%s Init(): setup() call failed: %w", m.name, err)
|
|
||||||
}
|
|
||||||
m.parallel = true
|
m.parallel = true
|
||||||
m.meta = map[string]string{
|
m.meta = map[string]string{"source": m.name, "group": "Self"}
|
||||||
"source": m.name,
|
m.tags = map[string]string{"type": "node"}
|
||||||
"group": "Self",
|
|
||||||
}
|
|
||||||
m.tags = map[string]string{
|
|
||||||
"type": "node",
|
|
||||||
}
|
|
||||||
if len(config) > 0 {
|
if len(config) > 0 {
|
||||||
d := json.NewDecoder(bytes.NewReader(config))
|
err = json.Unmarshal(config, &m.config)
|
||||||
d.DisallowUnknownFields()
|
if err != nil {
|
||||||
if err := d.Decode(&m.config); err != nil {
|
cclog.ComponentError(m.name, "Error reading config:", err.Error())
|
||||||
return fmt.Errorf("%s Init(): Error decoding JSON config: %w", m.name, err)
|
return err
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
m.init = true
|
m.init = true
|
||||||
@@ -65,49 +49,49 @@ func (m *SelfCollector) Read(interval time.Duration, output chan lp.CCMessage) {
|
|||||||
var memstats runtime.MemStats
|
var memstats runtime.MemStats
|
||||||
runtime.ReadMemStats(&memstats)
|
runtime.ReadMemStats(&memstats)
|
||||||
|
|
||||||
y, err := lp.NewMetric("total_alloc", m.tags, m.meta, memstats.TotalAlloc, timestamp)
|
y, err := lp.NewMessage("total_alloc", m.tags, m.meta, map[string]interface{}{"value": memstats.TotalAlloc}, timestamp)
|
||||||
if err == nil {
|
if err == nil {
|
||||||
y.AddMeta("unit", "Bytes")
|
y.AddMeta("unit", "Bytes")
|
||||||
output <- y
|
output <- y
|
||||||
}
|
}
|
||||||
y, err = lp.NewMetric("heap_alloc", m.tags, m.meta, memstats.HeapAlloc, timestamp)
|
y, err = lp.NewMessage("heap_alloc", m.tags, m.meta, map[string]interface{}{"value": memstats.HeapAlloc}, timestamp)
|
||||||
if err == nil {
|
if err == nil {
|
||||||
y.AddMeta("unit", "Bytes")
|
y.AddMeta("unit", "Bytes")
|
||||||
output <- y
|
output <- y
|
||||||
}
|
}
|
||||||
y, err = lp.NewMetric("heap_sys", m.tags, m.meta, memstats.HeapSys, timestamp)
|
y, err = lp.NewMessage("heap_sys", m.tags, m.meta, map[string]interface{}{"value": memstats.HeapSys}, timestamp)
|
||||||
if err == nil {
|
if err == nil {
|
||||||
y.AddMeta("unit", "Bytes")
|
y.AddMeta("unit", "Bytes")
|
||||||
output <- y
|
output <- y
|
||||||
}
|
}
|
||||||
y, err = lp.NewMetric("heap_idle", m.tags, m.meta, memstats.HeapIdle, timestamp)
|
y, err = lp.NewMessage("heap_idle", m.tags, m.meta, map[string]interface{}{"value": memstats.HeapIdle}, timestamp)
|
||||||
if err == nil {
|
if err == nil {
|
||||||
y.AddMeta("unit", "Bytes")
|
y.AddMeta("unit", "Bytes")
|
||||||
output <- y
|
output <- y
|
||||||
}
|
}
|
||||||
y, err = lp.NewMetric("heap_inuse", m.tags, m.meta, memstats.HeapInuse, timestamp)
|
y, err = lp.NewMessage("heap_inuse", m.tags, m.meta, map[string]interface{}{"value": memstats.HeapInuse}, timestamp)
|
||||||
if err == nil {
|
if err == nil {
|
||||||
y.AddMeta("unit", "Bytes")
|
y.AddMeta("unit", "Bytes")
|
||||||
output <- y
|
output <- y
|
||||||
}
|
}
|
||||||
y, err = lp.NewMetric("heap_released", m.tags, m.meta, memstats.HeapReleased, timestamp)
|
y, err = lp.NewMessage("heap_released", m.tags, m.meta, map[string]interface{}{"value": memstats.HeapReleased}, timestamp)
|
||||||
if err == nil {
|
if err == nil {
|
||||||
y.AddMeta("unit", "Bytes")
|
y.AddMeta("unit", "Bytes")
|
||||||
output <- y
|
output <- y
|
||||||
}
|
}
|
||||||
y, err = lp.NewMetric("heap_objects", m.tags, m.meta, memstats.HeapObjects, timestamp)
|
y, err = lp.NewMessage("heap_objects", m.tags, m.meta, map[string]interface{}{"value": memstats.HeapObjects}, timestamp)
|
||||||
if err == nil {
|
if err == nil {
|
||||||
output <- y
|
output <- y
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
if m.config.GoRoutines {
|
if m.config.GoRoutines {
|
||||||
y, err := lp.NewMetric("num_goroutines", m.tags, m.meta, runtime.NumGoroutine(), timestamp)
|
y, err := lp.NewMessage("num_goroutines", m.tags, m.meta, map[string]interface{}{"value": runtime.NumGoroutine()}, timestamp)
|
||||||
if err == nil {
|
if err == nil {
|
||||||
output <- y
|
output <- y
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
if m.config.CgoCalls {
|
if m.config.CgoCalls {
|
||||||
y, err := lp.NewMetric("num_cgo_calls", m.tags, m.meta, runtime.NumCgoCall(), timestamp)
|
y, err := lp.NewMessage("num_cgo_calls", m.tags, m.meta, map[string]interface{}{"value": runtime.NumCgoCall()}, timestamp)
|
||||||
if err == nil {
|
if err == nil {
|
||||||
output <- y
|
output <- y
|
||||||
}
|
}
|
||||||
@@ -118,35 +102,35 @@ func (m *SelfCollector) Read(interval time.Duration, output chan lp.CCMessage) {
|
|||||||
if err == nil {
|
if err == nil {
|
||||||
sec, nsec := rusage.Utime.Unix()
|
sec, nsec := rusage.Utime.Unix()
|
||||||
t := float64(sec) + (float64(nsec) * 1e-9)
|
t := float64(sec) + (float64(nsec) * 1e-9)
|
||||||
y, err := lp.NewMetric("rusage_user_time", m.tags, m.meta, t, timestamp)
|
y, err := lp.NewMessage("rusage_user_time", m.tags, m.meta, map[string]interface{}{"value": t}, timestamp)
|
||||||
if err == nil {
|
if err == nil {
|
||||||
y.AddMeta("unit", "seconds")
|
y.AddMeta("unit", "seconds")
|
||||||
output <- y
|
output <- y
|
||||||
}
|
}
|
||||||
sec, nsec = rusage.Stime.Unix()
|
sec, nsec = rusage.Stime.Unix()
|
||||||
t = float64(sec) + (float64(nsec) * 1e-9)
|
t = float64(sec) + (float64(nsec) * 1e-9)
|
||||||
y, err = lp.NewMetric("rusage_system_time", m.tags, m.meta, t, timestamp)
|
y, err = lp.NewMessage("rusage_system_time", m.tags, m.meta, map[string]interface{}{"value": t}, timestamp)
|
||||||
if err == nil {
|
if err == nil {
|
||||||
y.AddMeta("unit", "seconds")
|
y.AddMeta("unit", "seconds")
|
||||||
output <- y
|
output <- y
|
||||||
}
|
}
|
||||||
y, err = lp.NewMetric("rusage_vol_ctx_switch", m.tags, m.meta, rusage.Nvcsw, timestamp)
|
y, err = lp.NewMessage("rusage_vol_ctx_switch", m.tags, m.meta, map[string]interface{}{"value": rusage.Nvcsw}, timestamp)
|
||||||
if err == nil {
|
if err == nil {
|
||||||
output <- y
|
output <- y
|
||||||
}
|
}
|
||||||
y, err = lp.NewMetric("rusage_invol_ctx_switch", m.tags, m.meta, rusage.Nivcsw, timestamp)
|
y, err = lp.NewMessage("rusage_invol_ctx_switch", m.tags, m.meta, map[string]interface{}{"value": rusage.Nivcsw}, timestamp)
|
||||||
if err == nil {
|
if err == nil {
|
||||||
output <- y
|
output <- y
|
||||||
}
|
}
|
||||||
y, err = lp.NewMetric("rusage_signals", m.tags, m.meta, rusage.Nsignals, timestamp)
|
y, err = lp.NewMessage("rusage_signals", m.tags, m.meta, map[string]interface{}{"value": rusage.Nsignals}, timestamp)
|
||||||
if err == nil {
|
if err == nil {
|
||||||
output <- y
|
output <- y
|
||||||
}
|
}
|
||||||
y, err = lp.NewMetric("rusage_major_pgfaults", m.tags, m.meta, rusage.Majflt, timestamp)
|
y, err = lp.NewMessage("rusage_major_pgfaults", m.tags, m.meta, map[string]interface{}{"value": rusage.Majflt}, timestamp)
|
||||||
if err == nil {
|
if err == nil {
|
||||||
output <- y
|
output <- y
|
||||||
}
|
}
|
||||||
y, err = lp.NewMetric("rusage_minor_pgfaults", m.tags, m.meta, rusage.Minflt, timestamp)
|
y, err = lp.NewMessage("rusage_minor_pgfaults", m.tags, m.meta, map[string]interface{}{"value": rusage.Minflt}, timestamp)
|
||||||
if err == nil {
|
if err == nil {
|
||||||
output <- y
|
output <- y
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -1,14 +1,3 @@
|
|||||||
<!--
|
|
||||||
---
|
|
||||||
title: Self-monitoring metric collector
|
|
||||||
description: Collect metrics from the execution of cc-metric-collector itself
|
|
||||||
categories: [cc-metric-collector]
|
|
||||||
tags: ['Admin']
|
|
||||||
weight: 2
|
|
||||||
hugo_path: docs/reference/cc-metric-collector/collectors/self.md
|
|
||||||
---
|
|
||||||
-->
|
|
||||||
|
|
||||||
## `self` collector
|
## `self` collector
|
||||||
|
|
||||||
```json
|
```json
|
||||||
|
|||||||
@@ -1,412 +0,0 @@
|
|||||||
package collectors
|
|
||||||
|
|
||||||
import (
|
|
||||||
"encoding/json"
|
|
||||||
"fmt"
|
|
||||||
"os"
|
|
||||||
"os/exec"
|
|
||||||
"os/user"
|
|
||||||
"path/filepath"
|
|
||||||
"strconv"
|
|
||||||
"strings"
|
|
||||||
"time"
|
|
||||||
|
|
||||||
cclog "github.com/ClusterCockpit/cc-lib/v2/ccLogger"
|
|
||||||
lp "github.com/ClusterCockpit/cc-lib/v2/ccMessage"
|
|
||||||
)
|
|
||||||
|
|
||||||
type SlurmJobData struct {
|
|
||||||
MemoryUsage float64
|
|
||||||
MaxMemoryUsage float64
|
|
||||||
LimitMemoryUsage float64
|
|
||||||
CpuUsageUser float64
|
|
||||||
CpuUsageSys float64
|
|
||||||
CpuSet []int
|
|
||||||
}
|
|
||||||
|
|
||||||
type SlurmCgroupsConfig struct {
|
|
||||||
CgroupBase string `json:"cgroup_base"`
|
|
||||||
ExcludeMetrics []string `json:"exclude_metrics,omitempty"`
|
|
||||||
UseSudo bool `json:"use_sudo,omitempty"`
|
|
||||||
}
|
|
||||||
|
|
||||||
type SlurmCgroupCollector struct {
|
|
||||||
metricCollector
|
|
||||||
|
|
||||||
config SlurmCgroupsConfig
|
|
||||||
meta map[string]string
|
|
||||||
tags map[string]string
|
|
||||||
allCPUs []int
|
|
||||||
cpuUsed map[int]bool
|
|
||||||
cgroupBase string
|
|
||||||
excludeMetrics map[string]struct{}
|
|
||||||
useSudo bool
|
|
||||||
}
|
|
||||||
|
|
||||||
const defaultCgroupBase = "/sys/fs/cgroup/system.slice/slurmstepd.scope"
|
|
||||||
|
|
||||||
func ParseCPUs(cpuset string) ([]int, error) {
|
|
||||||
var result []int
|
|
||||||
if cpuset == "" {
|
|
||||||
return result, nil
|
|
||||||
}
|
|
||||||
|
|
||||||
for r := range strings.SplitSeq(cpuset, ",") {
|
|
||||||
if strings.Contains(r, "-") {
|
|
||||||
parts := strings.Split(r, "-")
|
|
||||||
if len(parts) != 2 {
|
|
||||||
return nil, fmt.Errorf("invalid CPU range: %s", r)
|
|
||||||
}
|
|
||||||
start, err := strconv.Atoi(strings.TrimSpace(parts[0]))
|
|
||||||
if err != nil {
|
|
||||||
return nil, fmt.Errorf("invalid CPU range start: %s", parts[0])
|
|
||||||
}
|
|
||||||
end, err := strconv.Atoi(strings.TrimSpace(parts[1]))
|
|
||||||
if err != nil {
|
|
||||||
return nil, fmt.Errorf("invalid CPU range end: %s", parts[1])
|
|
||||||
}
|
|
||||||
for i := start; i <= end; i++ {
|
|
||||||
result = append(result, i)
|
|
||||||
}
|
|
||||||
} else {
|
|
||||||
cpu, err := strconv.Atoi(strings.TrimSpace(r))
|
|
||||||
if err != nil {
|
|
||||||
return nil, fmt.Errorf("invalid CPU ID: %s", r)
|
|
||||||
}
|
|
||||||
result = append(result, cpu)
|
|
||||||
}
|
|
||||||
}
|
|
||||||
return result, nil
|
|
||||||
}
|
|
||||||
|
|
||||||
func GetAllCPUs() ([]int, error) {
|
|
||||||
cpuOnline := "/sys/devices/system/cpu/online"
|
|
||||||
data, err := os.ReadFile(cpuOnline)
|
|
||||||
if err != nil {
|
|
||||||
return nil, fmt.Errorf("failed to read file \"%s\": %w", cpuOnline, err)
|
|
||||||
}
|
|
||||||
return ParseCPUs(strings.TrimSpace(string(data)))
|
|
||||||
}
|
|
||||||
|
|
||||||
func (m *SlurmCgroupCollector) isExcluded(metric string) bool {
|
|
||||||
_, found := m.excludeMetrics[metric]
|
|
||||||
return found
|
|
||||||
}
|
|
||||||
|
|
||||||
func (m *SlurmCgroupCollector) readFile(path string) ([]byte, error) {
|
|
||||||
if m.useSudo {
|
|
||||||
cmd := exec.Command("sudo", "cat", path)
|
|
||||||
return cmd.Output()
|
|
||||||
}
|
|
||||||
return os.ReadFile(path)
|
|
||||||
}
|
|
||||||
|
|
||||||
func (m *SlurmCgroupCollector) Init(config json.RawMessage) error {
|
|
||||||
var err error
|
|
||||||
m.name = "SlurmCgroupCollector"
|
|
||||||
if err := m.setup(); err != nil {
|
|
||||||
return fmt.Errorf("%s Init(): setup() call failed: %w", m.name, err)
|
|
||||||
}
|
|
||||||
m.parallel = true
|
|
||||||
m.meta = map[string]string{
|
|
||||||
"source": m.name,
|
|
||||||
"group": "SLURM",
|
|
||||||
}
|
|
||||||
m.tags = map[string]string{
|
|
||||||
"type": "hwthread",
|
|
||||||
}
|
|
||||||
m.cpuUsed = make(map[int]bool)
|
|
||||||
m.cgroupBase = defaultCgroupBase
|
|
||||||
|
|
||||||
if len(config) > 0 {
|
|
||||||
d := json.NewDecoder(strings.NewReader(string(config)))
|
|
||||||
d.DisallowUnknownFields()
|
|
||||||
if err = d.Decode(&m.config); err != nil {
|
|
||||||
return fmt.Errorf("%s Init(): Error reading JSON config: %w", m.name, err)
|
|
||||||
}
|
|
||||||
m.excludeMetrics = make(map[string]struct{})
|
|
||||||
for _, metric := range m.config.ExcludeMetrics {
|
|
||||||
m.excludeMetrics[metric] = struct{}{}
|
|
||||||
}
|
|
||||||
if m.config.CgroupBase != "" {
|
|
||||||
m.cgroupBase = m.config.CgroupBase
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
m.useSudo = m.config.UseSudo
|
|
||||||
if !m.useSudo {
|
|
||||||
user, err := user.Current()
|
|
||||||
if err != nil {
|
|
||||||
return fmt.Errorf("%s Init(): Failed to get current user: %w", m.name, err)
|
|
||||||
}
|
|
||||||
if user.Uid != "0" {
|
|
||||||
return fmt.Errorf("%s Init(): Reading cgroup files requires root privileges (or enable use_sudo in config)", m.name)
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
m.allCPUs, err = GetAllCPUs()
|
|
||||||
if err != nil {
|
|
||||||
return fmt.Errorf("%s Init(): Error reading online CPUs: %w", m.name, err)
|
|
||||||
}
|
|
||||||
|
|
||||||
m.init = true
|
|
||||||
return nil
|
|
||||||
}
|
|
||||||
|
|
||||||
func (m *SlurmCgroupCollector) ReadJobData(jobdir string) (SlurmJobData, error) {
|
|
||||||
jobdata := SlurmJobData{
|
|
||||||
MemoryUsage: 0,
|
|
||||||
MaxMemoryUsage: 0,
|
|
||||||
LimitMemoryUsage: 0,
|
|
||||||
CpuUsageUser: 0,
|
|
||||||
CpuUsageSys: 0,
|
|
||||||
CpuSet: []int{},
|
|
||||||
}
|
|
||||||
|
|
||||||
cg := func(f string) string {
|
|
||||||
return filepath.Join(m.cgroupBase, jobdir, f)
|
|
||||||
}
|
|
||||||
|
|
||||||
memUsage, err := m.readFile(cg("memory.current"))
|
|
||||||
if err == nil {
|
|
||||||
x, err := strconv.ParseFloat(strings.TrimSpace(string(memUsage)), 64)
|
|
||||||
if err == nil {
|
|
||||||
jobdata.MemoryUsage = x
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
maxMem, err := m.readFile(cg("memory.peak"))
|
|
||||||
if err == nil {
|
|
||||||
x, err := strconv.ParseFloat(strings.TrimSpace(string(maxMem)), 64)
|
|
||||||
if err == nil {
|
|
||||||
jobdata.MaxMemoryUsage = x
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
limitMem, err := m.readFile(cg("memory.max"))
|
|
||||||
if err == nil {
|
|
||||||
x, err := strconv.ParseFloat(strings.TrimSpace(string(limitMem)), 64)
|
|
||||||
if err == nil {
|
|
||||||
jobdata.LimitMemoryUsage = x
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
cpuStat, err := m.readFile(cg("cpu.stat"))
|
|
||||||
if err == nil {
|
|
||||||
lines := strings.Split(strings.TrimSpace(string(cpuStat)), "\n")
|
|
||||||
var usageUsec, userUsec, systemUsec float64
|
|
||||||
for _, line := range lines {
|
|
||||||
fields := strings.Fields(line)
|
|
||||||
if len(fields) < 2 {
|
|
||||||
continue
|
|
||||||
}
|
|
||||||
value, err := strconv.ParseFloat(fields[1], 64)
|
|
||||||
if err != nil {
|
|
||||||
continue
|
|
||||||
}
|
|
||||||
switch fields[0] {
|
|
||||||
case "usage_usec":
|
|
||||||
usageUsec = value
|
|
||||||
case "user_usec":
|
|
||||||
userUsec = value
|
|
||||||
case "system_usec":
|
|
||||||
systemUsec = value
|
|
||||||
}
|
|
||||||
}
|
|
||||||
if usageUsec > 0 {
|
|
||||||
jobdata.CpuUsageUser = (userUsec * 100.0 / usageUsec)
|
|
||||||
jobdata.CpuUsageSys = (systemUsec * 100.0 / usageUsec)
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
cpuSet, err := m.readFile(cg("cpuset.cpus"))
|
|
||||||
if err == nil {
|
|
||||||
cpus, err := ParseCPUs(strings.TrimSpace(string(cpuSet)))
|
|
||||||
if err == nil {
|
|
||||||
jobdata.CpuSet = cpus
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
return jobdata, nil
|
|
||||||
}
|
|
||||||
|
|
||||||
func (m *SlurmCgroupCollector) Read(interval time.Duration, output chan lp.CCMessage) {
|
|
||||||
timestamp := time.Now()
|
|
||||||
|
|
||||||
for k := range m.cpuUsed {
|
|
||||||
delete(m.cpuUsed, k)
|
|
||||||
}
|
|
||||||
|
|
||||||
globPattern := filepath.Join(m.cgroupBase, "job_*")
|
|
||||||
jobDirs, err := filepath.Glob(globPattern)
|
|
||||||
if err != nil {
|
|
||||||
cclog.ComponentError(m.name, "Error globbing job directories:", err.Error())
|
|
||||||
return
|
|
||||||
}
|
|
||||||
|
|
||||||
for _, jdir := range jobDirs {
|
|
||||||
jKey := filepath.Base(jdir)
|
|
||||||
|
|
||||||
jobdata, err := m.ReadJobData(jKey)
|
|
||||||
if err != nil {
|
|
||||||
cclog.ComponentError(m.name, "Error reading job data for", jKey, ":", err.Error())
|
|
||||||
continue
|
|
||||||
}
|
|
||||||
|
|
||||||
if len(jobdata.CpuSet) > 0 {
|
|
||||||
coreCount := float64(len(jobdata.CpuSet))
|
|
||||||
for _, cpu := range jobdata.CpuSet {
|
|
||||||
coreTags := map[string]string{
|
|
||||||
"type": "hwthread",
|
|
||||||
"type-id": strconv.Itoa(cpu),
|
|
||||||
}
|
|
||||||
|
|
||||||
if coreCount > 0 && !m.isExcluded("job_mem_used") {
|
|
||||||
memPerCore := jobdata.MemoryUsage / coreCount
|
|
||||||
if y, err := lp.NewMessage(
|
|
||||||
"job_mem_used",
|
|
||||||
coreTags,
|
|
||||||
m.meta,
|
|
||||||
map[string]any{
|
|
||||||
"value": memPerCore,
|
|
||||||
},
|
|
||||||
timestamp); err == nil {
|
|
||||||
y.AddMeta("unit", "Bytes")
|
|
||||||
output <- y
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
if coreCount > 0 && !m.isExcluded("job_max_mem_used") {
|
|
||||||
maxMemPerCore := jobdata.MaxMemoryUsage / coreCount
|
|
||||||
if y, err := lp.NewMessage(
|
|
||||||
"job_max_mem_used",
|
|
||||||
coreTags,
|
|
||||||
m.meta,
|
|
||||||
map[string]any{
|
|
||||||
"value": maxMemPerCore,
|
|
||||||
},
|
|
||||||
timestamp); err == nil {
|
|
||||||
y.AddMeta("unit", "Bytes")
|
|
||||||
output <- y
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
if coreCount > 0 && !m.isExcluded("job_mem_limit") {
|
|
||||||
limitPerCore := jobdata.LimitMemoryUsage / coreCount
|
|
||||||
if y, err := lp.NewMessage(
|
|
||||||
"job_mem_limit",
|
|
||||||
coreTags,
|
|
||||||
m.meta,
|
|
||||||
map[string]any{
|
|
||||||
"value": limitPerCore,
|
|
||||||
},
|
|
||||||
timestamp); err == nil {
|
|
||||||
y.AddMeta("unit", "Bytes")
|
|
||||||
output <- y
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
if coreCount > 0 && !m.isExcluded("job_user_cpu") {
|
|
||||||
cpuUserPerCore := jobdata.CpuUsageUser / coreCount
|
|
||||||
if y, err := lp.NewMessage(
|
|
||||||
"job_user_cpu",
|
|
||||||
coreTags,
|
|
||||||
m.meta,
|
|
||||||
map[string]any{
|
|
||||||
"value": cpuUserPerCore,
|
|
||||||
},
|
|
||||||
timestamp); err == nil {
|
|
||||||
y.AddMeta("unit", "%")
|
|
||||||
output <- y
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
if coreCount > 0 && !m.isExcluded("job_sys_cpu") {
|
|
||||||
cpuSysPerCore := jobdata.CpuUsageSys / coreCount
|
|
||||||
if y, err := lp.NewMessage(
|
|
||||||
"job_sys_cpu",
|
|
||||||
coreTags,
|
|
||||||
m.meta,
|
|
||||||
map[string]any{
|
|
||||||
"value": cpuSysPerCore,
|
|
||||||
},
|
|
||||||
timestamp); err == nil {
|
|
||||||
y.AddMeta("unit", "%")
|
|
||||||
output <- y
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
m.cpuUsed[cpu] = true
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
for _, cpu := range m.allCPUs {
|
|
||||||
if !m.cpuUsed[cpu] {
|
|
||||||
coreTags := map[string]string{
|
|
||||||
"type": "hwthread",
|
|
||||||
"type-id": strconv.Itoa(cpu),
|
|
||||||
}
|
|
||||||
|
|
||||||
if !m.isExcluded("job_mem_used") {
|
|
||||||
if y, err := lp.NewMessage(
|
|
||||||
"job_mem_used",
|
|
||||||
coreTags,
|
|
||||||
m.meta,
|
|
||||||
map[string]any{
|
|
||||||
"value": 0,
|
|
||||||
},
|
|
||||||
timestamp); err == nil {
|
|
||||||
y.AddMeta("unit", "Bytes")
|
|
||||||
output <- y
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
if !m.isExcluded("job_max_mem_used") {
|
|
||||||
if y, err := lp.NewMessage(
|
|
||||||
"job_max_mem_used",
|
|
||||||
coreTags,
|
|
||||||
m.meta,
|
|
||||||
map[string]any{
|
|
||||||
"value": 0,
|
|
||||||
},
|
|
||||||
timestamp); err == nil {
|
|
||||||
y.AddMeta("unit", "Bytes")
|
|
||||||
output <- y
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
if !m.isExcluded("job_mem_limit") {
|
|
||||||
if y, err := lp.NewMessage(
|
|
||||||
"job_mem_limit",
|
|
||||||
coreTags,
|
|
||||||
m.meta,
|
|
||||||
map[string]any{
|
|
||||||
"value": 0,
|
|
||||||
},
|
|
||||||
timestamp); err == nil {
|
|
||||||
y.AddMeta("unit", "Bytes")
|
|
||||||
output <- y
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
if !m.isExcluded("job_user_cpu") {
|
|
||||||
if y, err := lp.NewMessage("job_user_cpu", coreTags, m.meta, map[string]any{"value": 0}, timestamp); err == nil {
|
|
||||||
y.AddMeta("unit", "%")
|
|
||||||
output <- y
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
if !m.isExcluded("job_sys_cpu") {
|
|
||||||
if y, err := lp.NewMessage("job_sys_cpu", coreTags, m.meta, map[string]any{"value": 0}, timestamp); err == nil {
|
|
||||||
y.AddMeta("unit", "%")
|
|
||||||
output <- y
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
func (m *SlurmCgroupCollector) Close() {
|
|
||||||
m.init = false
|
|
||||||
}
|
|
||||||
@@ -1,50 +0,0 @@
|
|||||||
<!--
|
|
||||||
---
|
|
||||||
title: Slurm cgroup metric collector
|
|
||||||
description: Collect per-core memory and CPU usage for SLURM jobs from cgroup v2
|
|
||||||
categories: [cc-metric-collector]
|
|
||||||
tags: ['Admin']
|
|
||||||
weight: 3
|
|
||||||
hugo_path: docs/reference/cc-metric-collector/collectors/slurm_cgroup.md
|
|
||||||
---
|
|
||||||
-->
|
|
||||||
|
|
||||||
## `slurm_cgroup` collector
|
|
||||||
|
|
||||||
The `slurm_cgroup` collector reads job-specific resource metrics from the cgroup v2 filesystem and provides **hwthread** metrics for memory and CPU usage of running SLURM jobs.
|
|
||||||
|
|
||||||
### Example configuration
|
|
||||||
|
|
||||||
```json
|
|
||||||
"slurm_cgroup": {
|
|
||||||
"cgroup_base": "/sys/fs/cgroup/system.slice/slurmstepd.scope",
|
|
||||||
"exclude_metrics": [
|
|
||||||
"job_sys_cpu",
|
|
||||||
"job_mem_limit"
|
|
||||||
],
|
|
||||||
"use_sudo": false
|
|
||||||
}
|
|
||||||
```
|
|
||||||
|
|
||||||
* The `cgroup_base` parameter (optional) can be set to specify the root path to SLURM job cgroups. The default is `/sys/fs/cgroup/system.slice/slurmstepd.scope`.
|
|
||||||
* The `exclude_metrics` array can be used to suppress individual metrics from being sent to the sink.
|
|
||||||
* The cgroup files are only readable by root. If password-less sudo is configured for the user running the collector, set `use_sudo` to `true` to read them without root privileges.
|
|
||||||
|
|
||||||
### Reported metrics
|
|
||||||
|
|
||||||
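All metrics are available **per hardware thread**. Job-wide values are divided evenly by the number of hardware threads in the job's cpuset; hardware threads that are not assigned to any job report `0`: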
|
|
||||||
|
|
||||||
* `job_mem_used` (`unit=Bytes`): Current memory usage of the job
|
|
||||||
* `job_max_mem_used` (`unit=Bytes`): Peak memory usage
|
|
||||||
* `job_mem_limit` (`unit=Bytes`): Cgroup memory limit
|
|
||||||
* `job_user_cpu` (`unit=%`): User CPU utilization percentage
|
|
||||||
* `job_sys_cpu` (`unit=%`): System CPU utilization percentage
|
|
||||||
|
|
||||||
Each metric has tags:
|
|
||||||
|
|
||||||
* `type=hwthread`
|
|
||||||
* `type-id=<core_id>`
|
|
||||||
|
|
||||||
### Limitations
|
|
||||||
|
|
||||||
* **cgroups v2 required:** This collector only supports systems running with cgroups v2 (unified hierarchy).
|
|
||||||
@@ -1,360 +0,0 @@
|
|||||||
package collectors
|
|
||||||
|
|
||||||
import (
|
|
||||||
"bytes"
|
|
||||||
"encoding/json"
|
|
||||||
"fmt"
|
|
||||||
"os/exec"
|
|
||||||
"slices"
|
|
||||||
"time"
|
|
||||||
|
|
||||||
cclog "github.com/ClusterCockpit/cc-lib/v2/ccLogger"
|
|
||||||
lp "github.com/ClusterCockpit/cc-lib/v2/ccMessage"
|
|
||||||
)
|
|
||||||
|
|
||||||
type SmartMonCollectorConfig struct {
|
|
||||||
UseSudo bool `json:"use_sudo,omitempty"`
|
|
||||||
ExcludeDevices []string `json:"exclude_devices,omitempty"`
|
|
||||||
ExcludeMetrics []string `json:"excludeMetrics,omitempty"`
|
|
||||||
Devices []struct {
|
|
||||||
Name string `json:"name"`
|
|
||||||
Type string `json:"type"`
|
|
||||||
} `json:"devices,omitempty"`
|
|
||||||
}
|
|
||||||
|
|
||||||
type deviceT struct {
|
|
||||||
Name string `json:"name"`
|
|
||||||
Type string `json:"type"`
|
|
||||||
queryCommand []string
|
|
||||||
}
|
|
||||||
|
|
||||||
type SmartMonCollector struct {
|
|
||||||
metricCollector
|
|
||||||
config SmartMonCollectorConfig // the configuration structure
|
|
||||||
meta map[string]string // default meta information
|
|
||||||
tags map[string]string // default tags
|
|
||||||
devices []deviceT // smartmon devices
|
|
||||||
sudoCmd string // Full path to 'sudo' command
|
|
||||||
smartCtlCmd string // Full path to 'smartctl' command
|
|
||||||
excludeMetric struct {
|
|
||||||
temp,
|
|
||||||
percentUsed,
|
|
||||||
availSpare,
|
|
||||||
dataUnitsRead,
|
|
||||||
dataUnitsWrite,
|
|
||||||
hostReads,
|
|
||||||
hostWrites,
|
|
||||||
powerCycles,
|
|
||||||
powerOn,
|
|
||||||
UnsafeShutdowns,
|
|
||||||
mediaErrors,
|
|
||||||
errlogEntries,
|
|
||||||
warnTempTime,
|
|
||||||
critCompTime bool
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
func (m *SmartMonCollector) getSmartmonDevices() error {
|
|
||||||
// Use configured devices
|
|
||||||
if len(m.config.Devices) > 0 {
|
|
||||||
for _, configDevice := range m.config.Devices {
|
|
||||||
if !slices.Contains(m.config.ExcludeDevices, configDevice.Name) {
|
|
||||||
d := deviceT{
|
|
||||||
Name: configDevice.Name,
|
|
||||||
Type: configDevice.Type,
|
|
||||||
}
|
|
||||||
if m.config.UseSudo {
|
|
||||||
d.queryCommand = append(d.queryCommand, m.sudoCmd)
|
|
||||||
}
|
|
||||||
d.queryCommand = append(d.queryCommand, m.smartCtlCmd, "--json=c", "--device="+d.Type, "--all", d.Name)
|
|
||||||
|
|
||||||
m.devices = append(m.devices, d)
|
|
||||||
}
|
|
||||||
}
|
|
||||||
return nil
|
|
||||||
}
|
|
||||||
|
|
||||||
// Use scan command
|
|
||||||
var scanCmd []string
|
|
||||||
if m.config.UseSudo {
|
|
||||||
scanCmd = append(scanCmd, m.sudoCmd)
|
|
||||||
}
|
|
||||||
scanCmd = append(scanCmd, m.smartCtlCmd, "--scan", "--json=c")
|
|
||||||
command := exec.Command(scanCmd[0], scanCmd[1:]...)
|
|
||||||
|
|
||||||
stdout, err := command.Output()
|
|
||||||
if err != nil {
|
|
||||||
return fmt.Errorf(
|
|
||||||
"%s getSmartmonDevices(): Failed to execute device scan command %s: %w",
|
|
||||||
m.name, command.String(), err)
|
|
||||||
}
|
|
||||||
|
|
||||||
var scanOutput struct {
|
|
||||||
Devices []deviceT `json:"devices"`
|
|
||||||
}
|
|
||||||
err = json.Unmarshal(stdout, &scanOutput)
|
|
||||||
if err != nil {
|
|
||||||
return fmt.Errorf("%s getSmartmonDevices(): Failed to parse JSON output from device scan command: %w",
|
|
||||||
m.name, err)
|
|
||||||
}
|
|
||||||
|
|
||||||
m.devices = make([]deviceT, 0)
|
|
||||||
for _, d := range scanOutput.Devices {
|
|
||||||
if !slices.Contains(m.config.ExcludeDevices, d.Name) {
|
|
||||||
if m.config.UseSudo {
|
|
||||||
d.queryCommand = append(d.queryCommand, m.sudoCmd)
|
|
||||||
}
|
|
||||||
d.queryCommand = append(d.queryCommand, m.smartCtlCmd, "--json=c", "--device="+d.Type, "--all", d.Name)
|
|
||||||
|
|
||||||
m.devices = append(m.devices, d)
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
return nil
|
|
||||||
}
|
|
||||||
|
|
||||||
func (m *SmartMonCollector) Init(config json.RawMessage) error {
|
|
||||||
m.name = "SmartMonCollector"
|
|
||||||
if err := m.setup(); err != nil {
|
|
||||||
return fmt.Errorf("%s Init(): setup() call failed: %w", m.name, err)
|
|
||||||
}
|
|
||||||
m.parallel = true
|
|
||||||
m.meta = map[string]string{
|
|
||||||
"source": m.name,
|
|
||||||
"group": "Disk",
|
|
||||||
}
|
|
||||||
m.tags = map[string]string{
|
|
||||||
"type": "node",
|
|
||||||
"stype": "disk",
|
|
||||||
}
|
|
||||||
|
|
||||||
// Read in the JSON configuration
|
|
||||||
if len(config) > 0 {
|
|
||||||
d := json.NewDecoder(bytes.NewReader(config))
|
|
||||||
d.DisallowUnknownFields()
|
|
||||||
if err := d.Decode(&m.config); err != nil {
|
|
||||||
return fmt.Errorf("%s Init(): Error reading config: %w", m.name, err)
|
|
||||||
}
|
|
||||||
}
|
|
||||||
for _, excludeMetric := range m.config.ExcludeMetrics {
|
|
||||||
switch excludeMetric {
|
|
||||||
case "smartmon_temp":
|
|
||||||
m.excludeMetric.temp = true
|
|
||||||
case "smartmon_percent_used":
|
|
||||||
m.excludeMetric.percentUsed = true
|
|
||||||
case "smartmon_avail_spare":
|
|
||||||
m.excludeMetric.availSpare = true
|
|
||||||
case "smartmon_data_units_read":
|
|
||||||
m.excludeMetric.dataUnitsRead = true
|
|
||||||
case "smartmon_data_units_write":
|
|
||||||
m.excludeMetric.dataUnitsWrite = true
|
|
||||||
case "smartmon_host_reads":
|
|
||||||
m.excludeMetric.hostReads = true
|
|
||||||
case "smartmon_host_writes":
|
|
||||||
m.excludeMetric.hostWrites = true
|
|
||||||
case "smartmon_power_cycles":
|
|
||||||
m.excludeMetric.powerCycles = true
|
|
||||||
case "smartmon_power_on":
|
|
||||||
m.excludeMetric.powerOn = true
|
|
||||||
case "smartmon_unsafe_shutdowns":
|
|
||||||
m.excludeMetric.UnsafeShutdowns = true
|
|
||||||
case "smartmon_media_errors":
|
|
||||||
m.excludeMetric.mediaErrors = true
|
|
||||||
case "smartmon_errlog_entries":
|
|
||||||
m.excludeMetric.errlogEntries = true
|
|
||||||
case "smartmon_warn_temp_time":
|
|
||||||
m.excludeMetric.warnTempTime = true
|
|
||||||
case "smartmon_crit_comp_time":
|
|
||||||
m.excludeMetric.critCompTime = true
|
|
||||||
default:
|
|
||||||
return fmt.Errorf("%s Init(): Unknown excluded metric: %s", m.name, excludeMetric)
|
|
||||||
}
|
|
||||||
|
|
||||||
}
|
|
||||||
|
|
||||||
// Check if sudo and smartctl are in search path
|
|
||||||
if m.config.UseSudo {
|
|
||||||
p, err := exec.LookPath("sudo")
|
|
||||||
if err != nil {
|
|
||||||
return fmt.Errorf("%s Init(): No sudo command found in search path: %w", m.name, err)
|
|
||||||
}
|
|
||||||
m.sudoCmd = p
|
|
||||||
}
|
|
||||||
p, err := exec.LookPath("smartctl")
|
|
||||||
if err != nil {
|
|
||||||
return fmt.Errorf("%s Init(): No smartctl command found in search path: %w", m.name, err)
|
|
||||||
}
|
|
||||||
m.smartCtlCmd = p
|
|
||||||
|
|
||||||
if err = m.getSmartmonDevices(); err != nil {
|
|
||||||
return err
|
|
||||||
}
|
|
||||||
|
|
||||||
m.init = true
|
|
||||||
return err
|
|
||||||
}
|
|
||||||
|
|
||||||
type SmartMonData struct {
|
|
||||||
SerialNumber string `json:"serial_number"`
|
|
||||||
UserCapacity struct {
|
|
||||||
Blocks int `json:"blocks"`
|
|
||||||
Bytes int `json:"bytes"`
|
|
||||||
} `json:"user_capacity"`
|
|
||||||
HealthLog struct {
|
|
||||||
// Available SMART health information:
|
|
||||||
// sudo smartctl -a --json=c /dev/nvme0 | jq --color-output | less --RAW-CONTROL-CHARS
|
|
||||||
Temperature int `json:"temperature"`
|
|
||||||
PercentageUsed int `json:"percentage_used"`
|
|
||||||
AvailableSpare int `json:"available_spare"`
|
|
||||||
DataUnitsRead int `json:"data_units_read"`
|
|
||||||
DataUnitsWrite int `json:"data_units_written"`
|
|
||||||
HostReads int `json:"host_reads"`
|
|
||||||
HostWrites int `json:"host_writes"`
|
|
||||||
PowerCycles int `json:"power_cycles"`
|
|
||||||
PowerOnHours int `json:"power_on_hours"`
|
|
||||||
UnsafeShutdowns int `json:"unsafe_shutdowns"`
|
|
||||||
MediaErrors int `json:"media_errors"`
|
|
||||||
NumErrorLogEntries int `json:"num_err_log_entries"`
|
|
||||||
WarnTempTime int `json:"warning_temp_time"`
|
|
||||||
CriticalCompTime int `json:"critical_comp_time"`
|
|
||||||
} `json:"nvme_smart_health_information_log"`
|
|
||||||
}
|
|
||||||
|
|
||||||
func (m *SmartMonCollector) Read(interval time.Duration, output chan lp.CCMessage) {
|
|
||||||
timestamp := time.Now()
|
|
||||||
for _, d := range m.devices {
|
|
||||||
var data SmartMonData
|
|
||||||
command := exec.Command(d.queryCommand[0], d.queryCommand[1:]...)
|
|
||||||
|
|
||||||
stdout, err := command.Output()
|
|
||||||
if err != nil {
|
|
||||||
cclog.ComponentError(m.name, "cannot read data for device", d.Name)
|
|
||||||
continue
|
|
||||||
}
|
|
||||||
err = json.Unmarshal(stdout, &data)
|
|
||||||
if err != nil {
|
|
||||||
cclog.ComponentError(m.name, "cannot unmarshal data for device", d.Name)
|
|
||||||
continue
|
|
||||||
}
|
|
||||||
if !m.excludeMetric.temp {
|
|
||||||
y, err := lp.NewMetric(
|
|
||||||
"smartmon_temp", m.tags, m.meta, data.HealthLog.Temperature, timestamp)
|
|
||||||
if err == nil {
|
|
||||||
y.AddTag("stype-id", d.Name)
|
|
||||||
y.AddMeta("unit", "degC")
|
|
||||||
output <- y
|
|
||||||
}
|
|
||||||
}
|
|
||||||
if !m.excludeMetric.percentUsed {
|
|
||||||
y, err := lp.NewMetric(
|
|
||||||
"smartmon_percent_used", m.tags, m.meta, data.HealthLog.PercentageUsed, timestamp)
|
|
||||||
if err == nil {
|
|
||||||
y.AddTag("stype-id", d.Name)
|
|
||||||
y.AddMeta("unit", "percent")
|
|
||||||
output <- y
|
|
||||||
}
|
|
||||||
}
|
|
||||||
if !m.excludeMetric.availSpare {
|
|
||||||
y, err := lp.NewMetric(
|
|
||||||
"smartmon_avail_spare", m.tags, m.meta, data.HealthLog.AvailableSpare, timestamp)
|
|
||||||
if err == nil {
|
|
||||||
y.AddTag("stype-id", d.Name)
|
|
||||||
y.AddMeta("unit", "percent")
|
|
||||||
output <- y
|
|
||||||
}
|
|
||||||
}
|
|
||||||
if !m.excludeMetric.dataUnitsRead {
|
|
||||||
y, err := lp.NewMetric(
|
|
||||||
"smartmon_data_units_read", m.tags, m.meta, data.HealthLog.DataUnitsRead, timestamp)
|
|
||||||
if err == nil {
|
|
||||||
y.AddTag("stype-id", d.Name)
|
|
||||||
output <- y
|
|
||||||
}
|
|
||||||
}
|
|
||||||
if !m.excludeMetric.dataUnitsWrite {
|
|
||||||
y, err := lp.NewMetric(
|
|
||||||
"smartmon_data_units_write", m.tags, m.meta, data.HealthLog.DataUnitsWrite, timestamp)
|
|
||||||
if err == nil {
|
|
||||||
y.AddTag("stype-id", d.Name)
|
|
||||||
output <- y
|
|
||||||
}
|
|
||||||
}
|
|
||||||
if !m.excludeMetric.hostReads {
|
|
||||||
y, err := lp.NewMetric(
|
|
||||||
"smartmon_host_reads", m.tags, m.meta, data.HealthLog.HostReads, timestamp)
|
|
||||||
if err == nil {
|
|
||||||
y.AddTag("stype-id", d.Name)
|
|
||||||
output <- y
|
|
||||||
}
|
|
||||||
}
|
|
||||||
if !m.excludeMetric.hostWrites {
|
|
||||||
y, err := lp.NewMetric(
|
|
||||||
"smartmon_host_writes", m.tags, m.meta, data.HealthLog.HostWrites, timestamp)
|
|
||||||
if err == nil {
|
|
||||||
y.AddTag("stype-id", d.Name)
|
|
||||||
output <- y
|
|
||||||
}
|
|
||||||
}
|
|
||||||
if !m.excludeMetric.powerCycles {
|
|
||||||
y, err := lp.NewMetric(
|
|
||||||
"smartmon_power_cycles", m.tags, m.meta, data.HealthLog.PowerCycles, timestamp)
|
|
||||||
if err == nil {
|
|
||||||
y.AddTag("stype-id", d.Name)
|
|
||||||
output <- y
|
|
||||||
}
|
|
||||||
}
|
|
||||||
if !m.excludeMetric.powerOn {
|
|
||||||
y, err := lp.NewMetric(
|
|
||||||
"smartmon_power_on", m.tags, m.meta, int64(data.HealthLog.PowerOnHours)*3600, timestamp)
|
|
||||||
if err == nil {
|
|
||||||
y.AddTag("stype-id", d.Name)
|
|
||||||
y.AddMeta("unit", "sec")
|
|
||||||
output <- y
|
|
||||||
}
|
|
||||||
}
|
|
||||||
if !m.excludeMetric.UnsafeShutdowns {
|
|
||||||
y, err := lp.NewMetric(
|
|
||||||
"smartmon_unsafe_shutdowns", m.tags, m.meta, data.HealthLog.UnsafeShutdowns, timestamp)
|
|
||||||
if err == nil {
|
|
||||||
y.AddTag("stype-id", d.Name)
|
|
||||||
output <- y
|
|
||||||
}
|
|
||||||
}
|
|
||||||
if !m.excludeMetric.mediaErrors {
|
|
||||||
y, err := lp.NewMetric(
|
|
||||||
"smartmon_media_errors", m.tags, m.meta, data.HealthLog.MediaErrors, timestamp)
|
|
||||||
if err == nil {
|
|
||||||
y.AddTag("stype-id", d.Name)
|
|
||||||
output <- y
|
|
||||||
}
|
|
||||||
}
|
|
||||||
if !m.excludeMetric.errlogEntries {
|
|
||||||
y, err := lp.NewMetric(
|
|
||||||
"smartmon_errlog_entries", m.tags, m.meta, data.HealthLog.NumErrorLogEntries, timestamp)
|
|
||||||
if err == nil {
|
|
||||||
y.AddTag("stype-id", d.Name)
|
|
||||||
output <- y
|
|
||||||
}
|
|
||||||
}
|
|
||||||
if !m.excludeMetric.warnTempTime {
|
|
||||||
y, err := lp.NewMetric(
|
|
||||||
"smartmon_warn_temp_time", m.tags, m.meta, data.HealthLog.WarnTempTime, timestamp)
|
|
||||||
if err == nil {
|
|
||||||
y.AddTag("stype-id", d.Name)
|
|
||||||
output <- y
|
|
||||||
}
|
|
||||||
}
|
|
||||||
if !m.excludeMetric.critCompTime {
|
|
||||||
y, err := lp.NewMetric(
|
|
||||||
"smartmon_crit_comp_time", m.tags, m.meta, data.HealthLog.CriticalCompTime, timestamp)
|
|
||||||
if err == nil {
|
|
||||||
y.AddTag("stype-id", d.Name)
|
|
||||||
output <- y
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
func (m *SmartMonCollector) Close() {
|
|
||||||
m.init = false
|
|
||||||
}
|
|
||||||
@@ -1,67 +0,0 @@
|
|||||||
<!--
|
|
||||||
---
|
|
||||||
title: smartmon metric collector
|
|
||||||
description: Collect S.M.A.R.T data from NVMEs
|
|
||||||
categories: [cc-metric-collector]
|
|
||||||
tags: ['Admin']
|
|
||||||
weight: 2
|
|
||||||
hugo_path: docs/reference/cc-metric-collector/collectors/smartmonMetric.md
|
|
||||||
---
|
|
||||||
-->
|
|
||||||
|
|
||||||
## `smartmon` collector
|
|
||||||
|
|
||||||
```json
|
|
||||||
"smartmon": {
|
|
||||||
"use_sudo": true,
|
|
||||||
"exclude_devices": [
|
|
||||||
"/dev/sda"
|
|
||||||
],
|
|
||||||
"excludeMetrics": [
|
|
||||||
"smartmon_warn_temp_time",
|
|
||||||
"smartmon_crit_comp_time"
|
|
||||||
],
|
|
||||||
"devices": [
|
|
||||||
{
|
|
||||||
"name": "/dev/nvme0",
|
|
||||||
"type": "nvme"
|
|
||||||
}
|
|
||||||
]
|
|
||||||
}
|
|
||||||
```
|
|
||||||
|
|
||||||
The `smartmon` collector retrieves S.M.A.R.T data from NVMEs via command `smartctl`.
|
|
||||||
|
|
||||||
Available NVMEs can be either automatically detected by a device scan or manually added with the "devices" config option.
|
|
||||||
|
|
||||||
Metrics:
|
|
||||||
|
|
||||||
* `smartmon_temp`: Temperature of the device (`unit=degC`)
|
|
||||||
* `smartmon_avail_spare`: Amount of spare left (`unit=percent`)
|
|
||||||
* `smartmon_percent_used`: Percentage of the device's lifetime that has been used (`unit=percent`)
|
|
||||||
* `smartmon_data_units_read`: Read data units
|
|
||||||
* `smartmon_data_units_write`: Written data units
|
|
||||||
* `smartmon_host_reads`: Read operations
|
|
||||||
* `smartmon_host_writes`: Write operations
|
|
||||||
* `smartmon_power_cycles`: Number of power cycles
|
|
||||||
* `smartmon_power_on`: Seconds the device has been powered on (`unit=sec`)
|
|
||||||
* `smartmon_unsafe_shutdowns`: Count of unsafe shutdowns
|
|
||||||
* `smartmon_media_errors`: Media errors of the device
|
|
||||||
* `smartmon_errlog_entries`: Error log entries
|
|
||||||
* `smartmon_warn_temp_time`: Time above the warning temperature threshold
|
|
||||||
* `smartmon_crit_comp_time`: Time above the critical composite temperature threshold
|
|
||||||
|
|
||||||
`smartctl` typically requires root to run.
|
|
||||||
In order to run `cc-metric-collector` without root privileges, you can enable `use_sudo`.
|
|
||||||
Add a file like this in `/etc/sudoers.d/` to allow `cc-metric-collector` to run the required command:
|
|
||||||
|
|
||||||
```
|
|
||||||
# Do not log the following sudo commands from monitoring, since this causes a lot of log spam.
|
|
||||||
# However keep log_denied enabled, to detect failures
|
|
||||||
Defaults: monitoring !log_allowed, !pam_session
|
|
||||||
|
|
||||||
# Allow to use smartctl
|
|
||||||
monitoring ALL = (root) NOPASSWD:/absolute/path/to/smartctl --json=c --device=* "--all" *
|
|
||||||
# Or add individual rules for each device
|
|
||||||
# monitoring ALL = (root) NOPASSWD:/absolute/path/to/smartctl --json=c --device=<device_type> "--all" <device>
|
|
||||||
```
|
|
||||||
@@ -1,14 +1,6 @@
|
|||||||
// Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
|
|
||||||
// All rights reserved. This file is part of cc-lib.
|
|
||||||
// Use of this source code is governed by a MIT-style
|
|
||||||
// license that can be found in the LICENSE file.
|
|
||||||
// additional authors:
|
|
||||||
// Holger Obermaier (NHR@KIT)
|
|
||||||
|
|
||||||
package collectors
|
package collectors
|
||||||
|
|
||||||
import (
|
import (
|
||||||
"bytes"
|
|
||||||
"encoding/json"
|
"encoding/json"
|
||||||
"fmt"
|
"fmt"
|
||||||
"os"
|
"os"
|
||||||
@@ -17,8 +9,8 @@ import (
|
|||||||
"strings"
|
"strings"
|
||||||
"time"
|
"time"
|
||||||
|
|
||||||
cclog "github.com/ClusterCockpit/cc-lib/v2/ccLogger"
|
cclog "github.com/ClusterCockpit/cc-lib/ccLogger"
|
||||||
lp "github.com/ClusterCockpit/cc-lib/v2/ccMessage"
|
lp "github.com/ClusterCockpit/cc-lib/ccMessage"
|
||||||
)
|
)
|
||||||
|
|
||||||
// See: https://www.kernel.org/doc/html/latest/hwmon/sysfs-interface.html
|
// See: https://www.kernel.org/doc/html/latest/hwmon/sysfs-interface.html
|
||||||
@@ -42,7 +34,6 @@ type TempCollectorSensor struct {
|
|||||||
|
|
||||||
type TempCollector struct {
|
type TempCollector struct {
|
||||||
metricCollector
|
metricCollector
|
||||||
|
|
||||||
config struct {
|
config struct {
|
||||||
ExcludeMetrics []string `json:"exclude_metrics"`
|
ExcludeMetrics []string `json:"exclude_metrics"`
|
||||||
TagOverride map[string]map[string]string `json:"tag_override"`
|
TagOverride map[string]map[string]string `json:"tag_override"`
|
||||||
@@ -60,14 +51,11 @@ func (m *TempCollector) Init(config json.RawMessage) error {
|
|||||||
|
|
||||||
m.name = "TempCollector"
|
m.name = "TempCollector"
|
||||||
m.parallel = true
|
m.parallel = true
|
||||||
if err := m.setup(); err != nil {
|
m.setup()
|
||||||
return fmt.Errorf("%s Init(): setup() call failed: %w", m.name, err)
|
|
||||||
}
|
|
||||||
if len(config) > 0 {
|
if len(config) > 0 {
|
||||||
d := json.NewDecoder(bytes.NewReader(config))
|
err := json.Unmarshal(config, &m.config)
|
||||||
d.DisallowUnknownFields()
|
if err != nil {
|
||||||
if err := d.Decode(&m.config); err != nil {
|
return err
|
||||||
return fmt.Errorf("%s Init(): Error decoding JSON config: %w", m.name, err)
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -83,10 +71,10 @@ func (m *TempCollector) Init(config json.RawMessage) error {
|
|||||||
globPattern := filepath.Join("/sys/class/hwmon", "*", "temp*_input")
|
globPattern := filepath.Join("/sys/class/hwmon", "*", "temp*_input")
|
||||||
inputFiles, err := filepath.Glob(globPattern)
|
inputFiles, err := filepath.Glob(globPattern)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return fmt.Errorf("%s Init(): unable to glob files with pattern '%s': %w", m.name, globPattern, err)
|
return fmt.Errorf("unable to glob files with pattern '%s': %v", globPattern, err)
|
||||||
}
|
}
|
||||||
if inputFiles == nil {
|
if inputFiles == nil {
|
||||||
return fmt.Errorf("%s Init(): unable to find any files with pattern '%s'", m.name, globPattern)
|
return fmt.Errorf("unable to find any files with pattern '%s'", globPattern)
|
||||||
}
|
}
|
||||||
|
|
||||||
// Get sensor name for each temperature sensor file
|
// Get sensor name for each temperature sensor file
|
||||||
@@ -122,7 +110,7 @@ func (m *TempCollector) Init(config json.RawMessage) error {
|
|||||||
sensor.metricName = sensor.label
|
sensor.metricName = sensor.label
|
||||||
}
|
}
|
||||||
sensor.metricName = strings.ToLower(sensor.metricName)
|
sensor.metricName = strings.ToLower(sensor.metricName)
|
||||||
sensor.metricName = strings.ReplaceAll(sensor.metricName, " ", "_")
|
sensor.metricName = strings.Replace(sensor.metricName, " ", "_", -1)
|
||||||
// Add temperature prefix, if required
|
// Add temperature prefix, if required
|
||||||
if !strings.Contains(sensor.metricName, "temp") {
|
if !strings.Contains(sensor.metricName, "temp") {
|
||||||
sensor.metricName = "temp_" + sensor.metricName
|
sensor.metricName = "temp_" + sensor.metricName
|
||||||
@@ -175,7 +163,7 @@ func (m *TempCollector) Init(config json.RawMessage) error {
|
|||||||
|
|
||||||
// Empty sensors map
|
// Empty sensors map
|
||||||
if len(m.sensors) == 0 {
|
if len(m.sensors) == 0 {
|
||||||
return fmt.Errorf("%s Init(): no temperature sensors found", m.name)
|
return fmt.Errorf("no temperature sensors found")
|
||||||
}
|
}
|
||||||
|
|
||||||
// Finished initialization
|
// Finished initialization
|
||||||
@@ -184,6 +172,7 @@ func (m *TempCollector) Init(config json.RawMessage) error {
|
|||||||
}
|
}
|
||||||
|
|
||||||
func (m *TempCollector) Read(interval time.Duration, output chan lp.CCMessage) {
|
func (m *TempCollector) Read(interval time.Duration, output chan lp.CCMessage) {
|
||||||
|
|
||||||
for _, sensor := range m.sensors {
|
for _, sensor := range m.sensors {
|
||||||
// Read sensor file
|
// Read sensor file
|
||||||
buffer, err := os.ReadFile(sensor.file)
|
buffer, err := os.ReadFile(sensor.file)
|
||||||
@@ -205,7 +194,7 @@ func (m *TempCollector) Read(interval time.Duration, output chan lp.CCMessage) {
|
|||||||
sensor.metricName,
|
sensor.metricName,
|
||||||
sensor.tags,
|
sensor.tags,
|
||||||
m.meta,
|
m.meta,
|
||||||
map[string]any{"value": x},
|
map[string]interface{}{"value": x},
|
||||||
time.Now(),
|
time.Now(),
|
||||||
)
|
)
|
||||||
if err == nil {
|
if err == nil {
|
||||||
@@ -218,7 +207,7 @@ func (m *TempCollector) Read(interval time.Duration, output chan lp.CCMessage) {
|
|||||||
sensor.maxTempName,
|
sensor.maxTempName,
|
||||||
sensor.tags,
|
sensor.tags,
|
||||||
m.meta,
|
m.meta,
|
||||||
map[string]any{"value": sensor.maxTemp},
|
map[string]interface{}{"value": sensor.maxTemp},
|
||||||
time.Now(),
|
time.Now(),
|
||||||
)
|
)
|
||||||
if err == nil {
|
if err == nil {
|
||||||
@@ -232,7 +221,7 @@ func (m *TempCollector) Read(interval time.Duration, output chan lp.CCMessage) {
|
|||||||
sensor.critTempName,
|
sensor.critTempName,
|
||||||
sensor.tags,
|
sensor.tags,
|
||||||
m.meta,
|
m.meta,
|
||||||
map[string]any{"value": sensor.critTemp},
|
map[string]interface{}{"value": sensor.critTemp},
|
||||||
time.Now(),
|
time.Now(),
|
||||||
)
|
)
|
||||||
if err == nil {
|
if err == nil {
|
||||||
@@ -240,6 +229,7 @@ func (m *TempCollector) Read(interval time.Duration, output chan lp.CCMessage) {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
func (m *TempCollector) Close() {
|
func (m *TempCollector) Close() {
|
||||||
|
|||||||
@@ -1,23 +1,12 @@
|
|||||||
<!--
|
|
||||||
---
|
|
||||||
title: Temperature metric collector
|
|
||||||
description: Collect thermal metrics from `/sys/class/hwmon/*`
|
|
||||||
categories: [cc-metric-collector]
|
|
||||||
tags: ['Admin']
|
|
||||||
weight: 2
|
|
||||||
hugo_path: docs/reference/cc-metric-collector/collectors/temp.md
|
|
||||||
---
|
|
||||||
-->
|
|
||||||
|
|
||||||
|
|
||||||
## `tempstat` collector
|
## `tempstat` collector
|
||||||
|
|
||||||
```json
|
```json
|
||||||
"tempstat": {
|
"tempstat": {
|
||||||
"tag_override": {
|
"tag_override" : {
|
||||||
"<device like hwmon1>": {
|
"<device like hwmon1>" : {
|
||||||
"type": "socket",
|
"type" : "socket",
|
||||||
"type-id": "0"
|
"type-id" : "0"
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
"exclude_metrics": [
|
"exclude_metrics": [
|
||||||
|
|||||||
@@ -1,28 +1,19 @@
|
|||||||
// Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
|
|
||||||
// All rights reserved. This file is part of cc-lib.
|
|
||||||
// Use of this source code is governed by a MIT-style
|
|
||||||
// license that can be found in the LICENSE file.
|
|
||||||
// additional authors:
|
|
||||||
// Holger Obermaier (NHR@KIT)
|
|
||||||
|
|
||||||
package collectors
|
package collectors
|
||||||
|
|
||||||
import (
|
import (
|
||||||
"bytes"
|
|
||||||
"encoding/json"
|
"encoding/json"
|
||||||
|
"errors"
|
||||||
"fmt"
|
"fmt"
|
||||||
|
"log"
|
||||||
"os/exec"
|
"os/exec"
|
||||||
"strings"
|
"strings"
|
||||||
"time"
|
"time"
|
||||||
|
|
||||||
cclog "github.com/ClusterCockpit/cc-lib/v2/ccLogger"
|
lp "github.com/ClusterCockpit/cc-lib/ccMessage"
|
||||||
lp "github.com/ClusterCockpit/cc-lib/v2/ccMessage"
|
|
||||||
)
|
)
|
||||||
|
|
||||||
const (
|
const MAX_NUM_PROCS = 10
|
||||||
MAX_NUM_PROCS = 10
|
const DEFAULT_NUM_PROCS = 2
|
||||||
DEFAULT_NUM_PROCS = 2
|
|
||||||
)
|
|
||||||
|
|
||||||
type TopProcsCollectorConfig struct {
|
type TopProcsCollectorConfig struct {
|
||||||
Num_procs int `json:"num_procs"`
|
Num_procs int `json:"num_procs"`
|
||||||
@@ -30,7 +21,6 @@ type TopProcsCollectorConfig struct {
|
|||||||
|
|
||||||
type TopProcsCollector struct {
|
type TopProcsCollector struct {
|
||||||
metricCollector
|
metricCollector
|
||||||
|
|
||||||
tags map[string]string
|
tags map[string]string
|
||||||
config TopProcsCollectorConfig
|
config TopProcsCollectorConfig
|
||||||
}
|
}
|
||||||
@@ -39,18 +29,12 @@ func (m *TopProcsCollector) Init(config json.RawMessage) error {
|
|||||||
var err error
|
var err error
|
||||||
m.name = "TopProcsCollector"
|
m.name = "TopProcsCollector"
|
||||||
m.parallel = true
|
m.parallel = true
|
||||||
m.tags = map[string]string{
|
m.tags = map[string]string{"type": "node"}
|
||||||
"type": "node",
|
m.meta = map[string]string{"source": m.name, "group": "TopProcs"}
|
||||||
}
|
|
||||||
m.meta = map[string]string{
|
|
||||||
"source": m.name,
|
|
||||||
"group": "TopProcs",
|
|
||||||
}
|
|
||||||
if len(config) > 0 {
|
if len(config) > 0 {
|
||||||
d := json.NewDecoder(bytes.NewReader(config))
|
err = json.Unmarshal(config, &m.config)
|
||||||
d.DisallowUnknownFields()
|
if err != nil {
|
||||||
if err := d.Decode(&m.config); err != nil {
|
return err
|
||||||
return fmt.Errorf("%s Init(): Error decoding JSON config: %w", m.name, err)
|
|
||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
m.config.Num_procs = int(DEFAULT_NUM_PROCS)
|
m.config.Num_procs = int(DEFAULT_NUM_PROCS)
|
||||||
@@ -58,13 +42,12 @@ func (m *TopProcsCollector) Init(config json.RawMessage) error {
|
|||||||
if m.config.Num_procs <= 0 || m.config.Num_procs > MAX_NUM_PROCS {
|
if m.config.Num_procs <= 0 || m.config.Num_procs > MAX_NUM_PROCS {
|
||||||
return fmt.Errorf("num_procs option must be set in 'topprocs' config (range: 1-%d)", MAX_NUM_PROCS)
|
return fmt.Errorf("num_procs option must be set in 'topprocs' config (range: 1-%d)", MAX_NUM_PROCS)
|
||||||
}
|
}
|
||||||
if err := m.setup(); err != nil {
|
m.setup()
|
||||||
return fmt.Errorf("%s Init(): setup() call failed: %w", m.name, err)
|
|
||||||
}
|
|
||||||
command := exec.Command("ps", "-Ao", "comm", "--sort=-pcpu")
|
command := exec.Command("ps", "-Ao", "comm", "--sort=-pcpu")
|
||||||
|
command.Wait()
|
||||||
_, err = command.Output()
|
_, err = command.Output()
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return fmt.Errorf("%s Init(): failed to get output from command: %w", m.name, err)
|
return errors.New("failed to execute command")
|
||||||
}
|
}
|
||||||
m.init = true
|
m.init = true
|
||||||
return nil
|
return nil
|
||||||
@@ -75,25 +58,17 @@ func (m *TopProcsCollector) Read(interval time.Duration, output chan lp.CCMessag
|
|||||||
return
|
return
|
||||||
}
|
}
|
||||||
command := exec.Command("ps", "-Ao", "comm", "--sort=-pcpu")
|
command := exec.Command("ps", "-Ao", "comm", "--sort=-pcpu")
|
||||||
|
command.Wait()
|
||||||
stdout, err := command.Output()
|
stdout, err := command.Output()
|
||||||
if err != nil {
|
if err != nil {
|
||||||
cclog.ComponentError(
|
log.Print(m.name, err)
|
||||||
m.name,
|
|
||||||
fmt.Sprintf("Read(): Failed to read output from command \"%s\": %v", command.String(), err))
|
|
||||||
return
|
return
|
||||||
}
|
}
|
||||||
|
|
||||||
lines := strings.Split(string(stdout), "\n")
|
lines := strings.Split(string(stdout), "\n")
|
||||||
for i := 1; i < m.config.Num_procs+1; i++ {
|
for i := 1; i < m.config.Num_procs+1; i++ {
|
||||||
name := fmt.Sprintf("topproc%d", i)
|
name := fmt.Sprintf("topproc%d", i)
|
||||||
y, err := lp.NewMessage(
|
y, err := lp.NewMessage(name, m.tags, m.meta, map[string]interface{}{"value": string(lines[i])}, time.Now())
|
||||||
name,
|
|
||||||
m.tags,
|
|
||||||
m.meta,
|
|
||||||
map[string]any{
|
|
||||||
"value": lines[i],
|
|
||||||
},
|
|
||||||
time.Now())
|
|
||||||
if err == nil {
|
if err == nil {
|
||||||
output <- y
|
output <- y
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -1,15 +1,3 @@
|
|||||||
<!--
|
|
||||||
---
|
|
||||||
title: TopProcs collector
|
|
||||||
description: Collect infos about most CPU-consuming processes
|
|
||||||
categories: [cc-metric-collector]
|
|
||||||
tags: ['Admin']
|
|
||||||
weight: 2
|
|
||||||
hugo_path: docs/reference/cc-metric-collector/collectors/topprocs.md
|
|
||||||
---
|
|
||||||
-->
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
## `topprocs` collector
|
## `topprocs` collector
|
||||||
|
|
||||||
|
|||||||
@@ -4,7 +4,7 @@ The configuration of the CC metric collector consists of five configuration file
|
|||||||
|
|
||||||
## Global configuration
|
## Global configuration
|
||||||
|
|
||||||
The global file contains the paths to the other four files and some global options. You can find examples in `example_configs`.
|
The global file contains the paths to the other four files and some global options.
|
||||||
|
|
||||||
```json
|
```json
|
||||||
{
|
{
|
||||||
|
|||||||
60
go.mod
60
go.mod
@@ -1,44 +1,48 @@
|
|||||||
module github.com/ClusterCockpit/cc-metric-collector
|
module github.com/ClusterCockpit/cc-metric-collector
|
||||||
|
|
||||||
go 1.25.0
|
go 1.23.4
|
||||||
|
|
||||||
|
toolchain go1.23.7
|
||||||
|
|
||||||
require (
|
require (
|
||||||
github.com/ClusterCockpit/cc-lib/v2 v2.11.0
|
github.com/ClusterCockpit/cc-lib v0.1.1
|
||||||
github.com/ClusterCockpit/go-rocm-smi v0.4.0
|
github.com/ClusterCockpit/go-rocm-smi v0.3.0
|
||||||
github.com/NVIDIA/go-nvml v0.13.0-1
|
github.com/NVIDIA/go-nvml v0.12.0-2
|
||||||
github.com/PaesslerAG/gval v1.2.4
|
github.com/PaesslerAG/gval v1.2.2
|
||||||
github.com/fsnotify/fsnotify v1.9.0
|
github.com/fsnotify/fsnotify v1.7.0
|
||||||
github.com/tklauser/go-sysconf v0.3.16
|
github.com/gorilla/mux v1.8.1
|
||||||
|
github.com/influxdata/influxdb-client-go/v2 v2.14.0
|
||||||
|
github.com/influxdata/line-protocol v0.0.0-20210922203350-b1ad95c89adf
|
||||||
|
github.com/influxdata/line-protocol/v2 v2.2.1
|
||||||
|
github.com/nats-io/nats.go v1.39.0
|
||||||
|
github.com/prometheus/client_golang v1.20.5
|
||||||
|
github.com/stmcginnis/gofish v0.15.0
|
||||||
|
github.com/tklauser/go-sysconf v0.3.13
|
||||||
golang.design/x/thread v0.0.0-20210122121316-335e9adffdf1
|
golang.design/x/thread v0.0.0-20210122121316-335e9adffdf1
|
||||||
golang.org/x/sys v0.42.0
|
golang.org/x/exp v0.0.0-20250215185904-eff6e970281f
|
||||||
|
golang.org/x/sys v0.30.0
|
||||||
)
|
)
|
||||||
|
|
||||||
require (
|
require (
|
||||||
github.com/ClusterCockpit/cc-line-protocol/v2 v2.4.0 // indirect
|
github.com/ClusterCockpit/cc-backend v1.4.2 // indirect
|
||||||
|
github.com/ClusterCockpit/cc-units v0.4.0 // indirect
|
||||||
github.com/apapsch/go-jsonmerge/v2 v2.0.0 // indirect
|
github.com/apapsch/go-jsonmerge/v2 v2.0.0 // indirect
|
||||||
github.com/beorn7/perks v1.0.1 // indirect
|
github.com/beorn7/perks v1.0.1 // indirect
|
||||||
github.com/cespare/xxhash/v2 v2.3.0 // indirect
|
github.com/cespare/xxhash/v2 v2.3.0 // indirect
|
||||||
github.com/expr-lang/expr v1.17.8 // indirect
|
github.com/expr-lang/expr v1.17.0 // indirect
|
||||||
github.com/google/uuid v1.6.0 // indirect
|
github.com/google/uuid v1.6.0 // indirect
|
||||||
github.com/gorilla/mux v1.8.1 // indirect
|
github.com/klauspost/compress v1.17.9 // indirect
|
||||||
github.com/influxdata/influxdb-client-go/v2 v2.14.0 // indirect
|
|
||||||
github.com/influxdata/line-protocol v0.0.0-20210922203350-b1ad95c89adf // indirect
|
|
||||||
github.com/klauspost/compress v1.18.4 // indirect
|
|
||||||
github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822 // indirect
|
github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822 // indirect
|
||||||
github.com/nats-io/nats.go v1.49.0 // indirect
|
github.com/nats-io/nkeys v0.4.9 // indirect
|
||||||
github.com/nats-io/nkeys v0.4.15 // indirect
|
|
||||||
github.com/nats-io/nuid v1.0.1 // indirect
|
github.com/nats-io/nuid v1.0.1 // indirect
|
||||||
github.com/oapi-codegen/runtime v1.3.0 // indirect
|
github.com/oapi-codegen/runtime v1.1.1 // indirect
|
||||||
github.com/prometheus/client_golang v1.23.2 // indirect
|
github.com/prometheus/client_model v0.6.1 // indirect
|
||||||
github.com/prometheus/client_model v0.6.2 // indirect
|
github.com/prometheus/common v0.55.0 // indirect
|
||||||
github.com/prometheus/common v0.67.5 // indirect
|
github.com/prometheus/procfs v0.15.1 // indirect
|
||||||
github.com/prometheus/procfs v0.20.1 // indirect
|
|
||||||
github.com/santhosh-tekuri/jsonschema/v5 v5.3.1 // indirect
|
github.com/santhosh-tekuri/jsonschema/v5 v5.3.1 // indirect
|
||||||
github.com/shopspring/decimal v1.4.0 // indirect
|
github.com/shopspring/decimal v1.3.1 // indirect
|
||||||
github.com/stmcginnis/gofish v0.21.4 // indirect
|
github.com/tklauser/numcpus v0.7.0 // indirect
|
||||||
github.com/tklauser/numcpus v0.11.0 // indirect
|
golang.org/x/crypto v0.35.0 // indirect
|
||||||
go.yaml.in/yaml/v2 v2.4.4 // indirect
|
golang.org/x/net v0.36.0 // indirect
|
||||||
golang.org/x/crypto v0.49.0 // indirect
|
google.golang.org/protobuf v1.35.2 // indirect
|
||||||
golang.org/x/net v0.52.0 // indirect
|
|
||||||
google.golang.org/protobuf v1.36.11 // indirect
|
|
||||||
)
|
)
|
||||||
|
|||||||
149
go.sum
149
go.sum
@@ -1,18 +1,21 @@
|
|||||||
github.com/ClusterCockpit/cc-lib/v2 v2.11.0 h1:LaLs4J0b7FArIXT8byMUcIcUr55R5obATjVi7qI02r4=
|
github.com/ClusterCockpit/cc-backend v1.4.2 h1:kTOzqkh9N0564N9nqQThnSs7TAfg8RLgvSm00e5HtIc=
|
||||||
github.com/ClusterCockpit/cc-lib/v2 v2.11.0/go.mod h1:Oj+N2lpFqiBOBzjfrLIGJ2YSWT400TX4M0ii4lNl81A=
|
github.com/ClusterCockpit/cc-backend v1.4.2/go.mod h1:g8TNHXe4AXej26snu2//jO3mUF980elT93iV/k11O/c=
|
||||||
github.com/ClusterCockpit/cc-line-protocol/v2 v2.4.0 h1:hIzxgTBWcmCIHtoDKDkSCsKCOCOwUC34sFsbD2wcW0Q=
|
github.com/ClusterCockpit/cc-lib v0.1.0-beta.1 h1:dz9j0g2cod8+SMDjuoIY6ISpiHHeekhX6yQaeiwiwJw=
|
||||||
github.com/ClusterCockpit/cc-line-protocol/v2 v2.4.0/go.mod h1:y42qUu+YFmu5fdNuUAS4VbbIKxVjxCvbVqFdpdh8ahY=
|
github.com/ClusterCockpit/cc-lib v0.1.0-beta.1/go.mod h1:kXMskla1i5ZSfXW0vVRIHgGeXMU5zu2PzYOYnUaOr80=
|
||||||
github.com/ClusterCockpit/go-rocm-smi v0.4.0 h1:3+bEPrSkjEJcOtt+qBUX48ugDVlOFaKUnXHTef2Ve2Q=
|
github.com/ClusterCockpit/cc-lib v0.1.1 h1:AXZWYUzgTaE/WdxLNSWPR7FJoA5WlzvYZxw4gIw3gNw=
|
||||||
github.com/ClusterCockpit/go-rocm-smi v0.4.0/go.mod h1:c19u5vBCcgb7DjL4EWTGSGpo6c79d07r4rxD50z25ng=
|
github.com/ClusterCockpit/cc-lib v0.1.1/go.mod h1:SHKcWW/+kN+pcofAtHJFxvmx1FV0VIJuQv5PuT0HDcc=
|
||||||
github.com/NVIDIA/go-nvml v0.13.0-1 h1:OLX8Jq3dONuPOQPC7rndB6+iDmDakw0XTYgzMxObkEw=
|
github.com/ClusterCockpit/cc-units v0.4.0 h1:zP5DOu99GmErW0tCDf0gcLrlWt42RQ9dpoONEOh4cI0=
|
||||||
github.com/NVIDIA/go-nvml v0.13.0-1/go.mod h1:+KNA7c7gIBH7SKSJ1ntlwkfN80zdx8ovl4hrK3LmPt4=
|
github.com/ClusterCockpit/cc-units v0.4.0/go.mod h1:3S3PAhAayS3pbgcT4q9Vn9VJw22Op51X0YimtG77zBw=
|
||||||
github.com/PaesslerAG/gval v1.2.4 h1:rhX7MpjJlcxYwL2eTTYIOBUyEKZ+A96T9vQySWkVUiU=
|
github.com/ClusterCockpit/go-rocm-smi v0.3.0 h1:1qZnSpG7/NyLtc7AjqnUL9Jb8xtqG1nMVgp69rJfaR8=
|
||||||
github.com/PaesslerAG/gval v1.2.4/go.mod h1:XRFLwvmkTEdYziLdaCeCa5ImcGVrfQbeNUbVR+C6xac=
|
github.com/ClusterCockpit/go-rocm-smi v0.3.0/go.mod h1:+I3UMeX3OlizXDf1WpGD43W4KGZZGVSGmny6rTeOnWA=
|
||||||
|
github.com/NVIDIA/go-nvml v0.11.6-0/go.mod h1:hy7HYeQy335x6nEss0Ne3PYqleRa6Ct+VKD9RQ4nyFs=
|
||||||
|
github.com/NVIDIA/go-nvml v0.12.0-2 h1:Sg239yy7jmopu/cuvYauoMj9fOpcGMngxVxxS1EBXeY=
|
||||||
|
github.com/NVIDIA/go-nvml v0.12.0-2/go.mod h1:7ruy85eOM73muOc/I37euONSwEyFqZsv5ED9AogD4G0=
|
||||||
|
github.com/PaesslerAG/gval v1.2.2 h1:Y7iBzhgE09IGTt5QgGQ2IdaYYYOU134YGHBThD+wm9E=
|
||||||
|
github.com/PaesslerAG/gval v1.2.2/go.mod h1:XRFLwvmkTEdYziLdaCeCa5ImcGVrfQbeNUbVR+C6xac=
|
||||||
github.com/PaesslerAG/jsonpath v0.1.0 h1:gADYeifvlqK3R3i2cR5B4DGgxLXIPb3TRTH1mGi0jPI=
|
github.com/PaesslerAG/jsonpath v0.1.0 h1:gADYeifvlqK3R3i2cR5B4DGgxLXIPb3TRTH1mGi0jPI=
|
||||||
github.com/PaesslerAG/jsonpath v0.1.0/go.mod h1:4BzmtoM/PI8fPO4aQGIusjGxGir2BzcV0grWtFzq1Y8=
|
github.com/PaesslerAG/jsonpath v0.1.0/go.mod h1:4BzmtoM/PI8fPO4aQGIusjGxGir2BzcV0grWtFzq1Y8=
|
||||||
github.com/RaveNoX/go-jsoncommentstrip v1.0.0/go.mod h1:78ihd09MekBnJnxpICcwzCMzGrKSKYe4AqU6PDYYpjk=
|
github.com/RaveNoX/go-jsoncommentstrip v1.0.0/go.mod h1:78ihd09MekBnJnxpICcwzCMzGrKSKYe4AqU6PDYYpjk=
|
||||||
github.com/antithesishq/antithesis-sdk-go v0.5.0-default-no-op h1:Ucf+QxEKMbPogRO5guBNe5cgd9uZgfoJLOYs8WWhtjM=
|
|
||||||
github.com/antithesishq/antithesis-sdk-go v0.5.0-default-no-op/go.mod h1:IUpT2DPAKh6i/YhSbt6Gl3v2yvUZjmKncl7U91fup7E=
|
|
||||||
github.com/apapsch/go-jsonmerge/v2 v2.0.0 h1:axGnT1gRIfimI7gJifB699GoE/oq+F2MU7Dml6nw9rQ=
|
github.com/apapsch/go-jsonmerge/v2 v2.0.0 h1:axGnT1gRIfimI7gJifB699GoE/oq+F2MU7Dml6nw9rQ=
|
||||||
github.com/apapsch/go-jsonmerge/v2 v2.0.0/go.mod h1:lvDnEdqiQrp0O42VQGgmlKpxL1AP2+08jFMw88y4klk=
|
github.com/apapsch/go-jsonmerge/v2 v2.0.0/go.mod h1:lvDnEdqiQrp0O42VQGgmlKpxL1AP2+08jFMw88y4klk=
|
||||||
github.com/beorn7/perks v1.0.1 h1:VlbKKnNfV8bJzeqoa4cOKqO6bYr3WgKZxO8Z16+hsOM=
|
github.com/beorn7/perks v1.0.1 h1:VlbKKnNfV8bJzeqoa4cOKqO6bYr3WgKZxO8Z16+hsOM=
|
||||||
@@ -20,19 +23,24 @@ github.com/beorn7/perks v1.0.1/go.mod h1:G2ZrVWU2WbWT9wwq4/hrbKbnv/1ERSJQ0ibhJ6r
|
|||||||
github.com/bmatcuk/doublestar v1.1.1/go.mod h1:UD6OnuiIn0yFxxA2le/rnRU1G4RaI4UvFv1sNto9p6w=
|
github.com/bmatcuk/doublestar v1.1.1/go.mod h1:UD6OnuiIn0yFxxA2le/rnRU1G4RaI4UvFv1sNto9p6w=
|
||||||
github.com/cespare/xxhash/v2 v2.3.0 h1:UL815xU9SqsFlibzuggzjXhog7bL6oX9BbNZnL2UFvs=
|
github.com/cespare/xxhash/v2 v2.3.0 h1:UL815xU9SqsFlibzuggzjXhog7bL6oX9BbNZnL2UFvs=
|
||||||
github.com/cespare/xxhash/v2 v2.3.0/go.mod h1:VGX0DQ3Q6kWi7AoAeZDth3/j3BFtOZR5XLFGgcrjCOs=
|
github.com/cespare/xxhash/v2 v2.3.0/go.mod h1:VGX0DQ3Q6kWi7AoAeZDth3/j3BFtOZR5XLFGgcrjCOs=
|
||||||
|
github.com/creack/pty v1.1.9/go.mod h1:oKZEueFk5CKHvIhNR5MUki03XCEU+Q6VDXinZuGJ33E=
|
||||||
github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
|
github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
|
||||||
github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c=
|
github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c=
|
||||||
github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
|
github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
|
||||||
github.com/expr-lang/expr v1.17.8 h1:W1loDTT+0PQf5YteHSTpju2qfUfNoBt4yw9+wOEU9VM=
|
github.com/expr-lang/expr v1.16.9 h1:WUAzmR0JNI9JCiF0/ewwHB1gmcGw5wW7nWt8gc6PpCI=
|
||||||
github.com/expr-lang/expr v1.17.8/go.mod h1:8/vRC7+7HBzESEqt5kKpYXxrxkr31SaO8r40VO/1IT4=
|
github.com/expr-lang/expr v1.16.9/go.mod h1:8/vRC7+7HBzESEqt5kKpYXxrxkr31SaO8r40VO/1IT4=
|
||||||
|
github.com/expr-lang/expr v1.17.0 h1:+vpszOyzKLQXC9VF+wA8cVA0tlA984/Wabc/1hF9Whg=
|
||||||
|
github.com/expr-lang/expr v1.17.0/go.mod h1:8/vRC7+7HBzESEqt5kKpYXxrxkr31SaO8r40VO/1IT4=
|
||||||
|
github.com/frankban/quicktest v1.11.0/go.mod h1:K+q6oSqb0W0Ininfk863uOk1lMy69l/P6txr3mVT54s=
|
||||||
|
github.com/frankban/quicktest v1.11.2/go.mod h1:K+q6oSqb0W0Ininfk863uOk1lMy69l/P6txr3mVT54s=
|
||||||
github.com/frankban/quicktest v1.13.0 h1:yNZif1OkDfNoDfb9zZa9aXIpejNR4F23Wely0c+Qdqk=
|
github.com/frankban/quicktest v1.13.0 h1:yNZif1OkDfNoDfb9zZa9aXIpejNR4F23Wely0c+Qdqk=
|
||||||
github.com/frankban/quicktest v1.13.0/go.mod h1:qLE0fzW0VuyUAJgPU19zByoIr0HtCHN/r/VLSOOIySU=
|
github.com/frankban/quicktest v1.13.0/go.mod h1:qLE0fzW0VuyUAJgPU19zByoIr0HtCHN/r/VLSOOIySU=
|
||||||
github.com/fsnotify/fsnotify v1.9.0 h1:2Ml+OJNzbYCTzsxtv8vKSFD9PbJjmhYF14k/jKC7S9k=
|
github.com/fsnotify/fsnotify v1.7.0 h1:8JEhPFa5W2WU7YfeZzPNqzMP6Lwt7L2715Ggo0nosvA=
|
||||||
github.com/fsnotify/fsnotify v1.9.0/go.mod h1:8jBTzvmWwFyi3Pb8djgCCO5IBqzKJ/Jwo8TRcHyHii0=
|
github.com/fsnotify/fsnotify v1.7.0/go.mod h1:40Bi/Hjc2AVfZrqy+aj+yEI+/bRxZnMJyTJwOpGvigM=
|
||||||
github.com/google/go-cmp v0.7.0 h1:wk8382ETsv4JYUZwIsn6YpYiWiBsYLSJiTsyBybVuN8=
|
github.com/google/go-cmp v0.5.2/go.mod h1:v8dTdLbMG2kIc/vJvl+f65V22dbkXbowE6jgT/gNBxE=
|
||||||
github.com/google/go-cmp v0.7.0/go.mod h1:pXiqmnSA92OHEEa9HXL2W4E7lf9JzCmGVUdgjX3N/iU=
|
github.com/google/go-cmp v0.5.5/go.mod h1:v8dTdLbMG2kIc/vJvl+f65V22dbkXbowE6jgT/gNBxE=
|
||||||
github.com/google/go-tpm v0.9.7 h1:u89J4tUUeDTlH8xxC3CTW7OHZjbjKoHdQ9W7gCUhtxA=
|
github.com/google/go-cmp v0.6.0 h1:ofyhxvXcZhMsU5ulbFiLKl/XBFqE1GSq7atu8tAmTRI=
|
||||||
github.com/google/go-tpm v0.9.7/go.mod h1:h9jEsEECg7gtLis0upRBQU+GhYVH6jMjrFxI8u6bVUY=
|
github.com/google/go-cmp v0.6.0/go.mod h1:17dUlkBOakJ0+DkrSSNjCkIjxS6bF9zb3elmeNGIjoY=
|
||||||
github.com/google/uuid v1.6.0 h1:NIvaJDMOsjHA8n1jAhLSgzrAzy1Hgr+hNrb57e+94F0=
|
github.com/google/uuid v1.6.0 h1:NIvaJDMOsjHA8n1jAhLSgzrAzy1Hgr+hNrb57e+94F0=
|
||||||
github.com/google/uuid v1.6.0/go.mod h1:TIyPZe4MgqvfeYDBFedMoGGpEw/LqOeaOT+nhxU+yHo=
|
github.com/google/uuid v1.6.0/go.mod h1:TIyPZe4MgqvfeYDBFedMoGGpEw/LqOeaOT+nhxU+yHo=
|
||||||
github.com/gorilla/mux v1.8.1 h1:TuBL49tXwgrFYWhqrNgrUNEY92u81SPhu7sTdzQEiWY=
|
github.com/gorilla/mux v1.8.1 h1:TuBL49tXwgrFYWhqrNgrUNEY92u81SPhu7sTdzQEiWY=
|
||||||
@@ -41,86 +49,91 @@ github.com/influxdata/influxdb-client-go/v2 v2.14.0 h1:AjbBfJuq+QoaXNcrova8smSjw
|
|||||||
github.com/influxdata/influxdb-client-go/v2 v2.14.0/go.mod h1:Ahpm3QXKMJslpXl3IftVLVezreAUtBOTZssDrjZEFHI=
|
github.com/influxdata/influxdb-client-go/v2 v2.14.0/go.mod h1:Ahpm3QXKMJslpXl3IftVLVezreAUtBOTZssDrjZEFHI=
|
||||||
github.com/influxdata/line-protocol v0.0.0-20210922203350-b1ad95c89adf h1:7JTmneyiNEwVBOHSjoMxiWAqB992atOeepeFYegn5RU=
|
github.com/influxdata/line-protocol v0.0.0-20210922203350-b1ad95c89adf h1:7JTmneyiNEwVBOHSjoMxiWAqB992atOeepeFYegn5RU=
|
||||||
github.com/influxdata/line-protocol v0.0.0-20210922203350-b1ad95c89adf/go.mod h1:xaLFMmpvUxqXtVkUJfg9QmT88cDaCJ3ZKgdZ78oO8Qo=
|
github.com/influxdata/line-protocol v0.0.0-20210922203350-b1ad95c89adf/go.mod h1:xaLFMmpvUxqXtVkUJfg9QmT88cDaCJ3ZKgdZ78oO8Qo=
|
||||||
|
github.com/influxdata/line-protocol-corpus v0.0.0-20210519164801-ca6fa5da0184/go.mod h1:03nmhxzZ7Xk2pdG+lmMd7mHDfeVOYFyhOgwO61qWU98=
|
||||||
github.com/influxdata/line-protocol-corpus v0.0.0-20210922080147-aa28ccfb8937 h1:MHJNQ+p99hFATQm6ORoLmpUCF7ovjwEFshs/NHzAbig=
|
github.com/influxdata/line-protocol-corpus v0.0.0-20210922080147-aa28ccfb8937 h1:MHJNQ+p99hFATQm6ORoLmpUCF7ovjwEFshs/NHzAbig=
|
||||||
github.com/influxdata/line-protocol-corpus v0.0.0-20210922080147-aa28ccfb8937/go.mod h1:BKR9c0uHSmRgM/se9JhFHtTT7JTO67X23MtKMHtZcpo=
|
github.com/influxdata/line-protocol-corpus v0.0.0-20210922080147-aa28ccfb8937/go.mod h1:BKR9c0uHSmRgM/se9JhFHtTT7JTO67X23MtKMHtZcpo=
|
||||||
|
github.com/influxdata/line-protocol/v2 v2.0.0-20210312151457-c52fdecb625a/go.mod h1:6+9Xt5Sq1rWx+glMgxhcg2c0DUaehK+5TDcPZ76GypY=
|
||||||
|
github.com/influxdata/line-protocol/v2 v2.1.0/go.mod h1:QKw43hdUBg3GTk2iC3iyCxksNj7PX9aUSeYOYE/ceHY=
|
||||||
|
github.com/influxdata/line-protocol/v2 v2.2.1 h1:EAPkqJ9Km4uAxtMRgUubJyqAr6zgWM0dznKMLRauQRE=
|
||||||
|
github.com/influxdata/line-protocol/v2 v2.2.1/go.mod h1:DmB3Cnh+3oxmG6LOBIxce4oaL4CPj3OmMPgvauXh+tM=
|
||||||
github.com/juju/gnuflag v0.0.0-20171113085948-2ce1bb71843d/go.mod h1:2PavIy+JPciBPrBUjwbNvtwB6RQlve+hkpll6QSNmOE=
|
github.com/juju/gnuflag v0.0.0-20171113085948-2ce1bb71843d/go.mod h1:2PavIy+JPciBPrBUjwbNvtwB6RQlve+hkpll6QSNmOE=
|
||||||
github.com/klauspost/compress v1.18.4 h1:RPhnKRAQ4Fh8zU2FY/6ZFDwTVTxgJ/EMydqSTzE9a2c=
|
github.com/klauspost/compress v1.17.9 h1:6KIumPrER1LHsvBVuDa0r5xaG0Es51mhhB9BQB2qeMA=
|
||||||
github.com/klauspost/compress v1.18.4/go.mod h1:R0h/fSBs8DE4ENlcrlib3PsXS61voFxhIs2DeRhCvJ4=
|
github.com/klauspost/compress v1.17.9/go.mod h1:Di0epgTjJY877eYKx5yC51cX2A2Vl2ibi7bDH9ttBbw=
|
||||||
|
github.com/kr/pretty v0.2.1/go.mod h1:ipq/a2n7PKx3OHsz4KJII5eveXtPO4qwEXGdVfWzfnI=
|
||||||
github.com/kr/pretty v0.3.1 h1:flRD4NNwYAUpkphVc1HcthR4KEIFJ65n8Mw5qdRn3LE=
|
github.com/kr/pretty v0.3.1 h1:flRD4NNwYAUpkphVc1HcthR4KEIFJ65n8Mw5qdRn3LE=
|
||||||
github.com/kr/pretty v0.3.1/go.mod h1:hoEshYVHaxMs3cyo3Yncou5ZscifuDolrwPKZanG3xk=
|
github.com/kr/pretty v0.3.1/go.mod h1:hoEshYVHaxMs3cyo3Yncou5ZscifuDolrwPKZanG3xk=
|
||||||
|
github.com/kr/pty v1.1.1/go.mod h1:pFQYn66WHrOpPYNljwOMqo10TkYh1fy3cYio2l3bCsQ=
|
||||||
|
github.com/kr/text v0.1.0/go.mod h1:4Jbv+DJW3UT/LiOwJeYQe1efqtUx/iVham/4vfdArNI=
|
||||||
github.com/kr/text v0.2.0 h1:5Nx0Ya0ZqY2ygV366QzturHI13Jq95ApcVaJBhpS+AY=
|
github.com/kr/text v0.2.0 h1:5Nx0Ya0ZqY2ygV366QzturHI13Jq95ApcVaJBhpS+AY=
|
||||||
github.com/kr/text v0.2.0/go.mod h1:eLer722TekiGuMkidMxC/pM04lWEeraHUUmBw8l2grE=
|
github.com/kr/text v0.2.0/go.mod h1:eLer722TekiGuMkidMxC/pM04lWEeraHUUmBw8l2grE=
|
||||||
github.com/kylelemons/godebug v1.1.0 h1:RPNrshWIDI6G2gRW9EHilWtl7Z6Sb1BR0xunSBf0SNc=
|
github.com/kylelemons/godebug v1.1.0 h1:RPNrshWIDI6G2gRW9EHilWtl7Z6Sb1BR0xunSBf0SNc=
|
||||||
github.com/kylelemons/godebug v1.1.0/go.mod h1:9/0rRGxNHcop5bhtWyNeEfOS8JIWk580+fNqagV/RAw=
|
github.com/kylelemons/godebug v1.1.0/go.mod h1:9/0rRGxNHcop5bhtWyNeEfOS8JIWk580+fNqagV/RAw=
|
||||||
github.com/minio/highwayhash v1.0.4-0.20251030100505-070ab1a87a76 h1:KGuD/pM2JpL9FAYvBrnBBeENKZNh6eNtjqytV6TYjnk=
|
|
||||||
github.com/minio/highwayhash v1.0.4-0.20251030100505-070ab1a87a76/go.mod h1:GGYsuwP/fPD6Y9hMiXuapVvlIUEhFhMTh0rxU3ik1LQ=
|
|
||||||
github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822 h1:C3w9PqII01/Oq1c1nUAm88MOHcQC9l5mIlSMApZMrHA=
|
github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822 h1:C3w9PqII01/Oq1c1nUAm88MOHcQC9l5mIlSMApZMrHA=
|
||||||
github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822/go.mod h1:+n7T8mK8HuQTcFwEeznm/DIxMOiR9yIdICNftLE1DvQ=
|
github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822/go.mod h1:+n7T8mK8HuQTcFwEeznm/DIxMOiR9yIdICNftLE1DvQ=
|
||||||
github.com/nats-io/jwt/v2 v2.8.0 h1:K7uzyz50+yGZDO5o772eRE7atlcSEENpL7P+b74JV1g=
|
github.com/nats-io/nats.go v1.39.0 h1:2/yg2JQjiYYKLwDuBzV0FbB2sIV+eFNkEevlRi4n9lI=
|
||||||
github.com/nats-io/jwt/v2 v2.8.0/go.mod h1:me11pOkwObtcBNR8AiMrUbtVOUGkqYjMQZ6jnSdVUIA=
|
github.com/nats-io/nats.go v1.39.0/go.mod h1:MgRb8oOdigA6cYpEPhXJuRVH6UE/V4jblJ2jQ27IXYM=
|
||||||
github.com/nats-io/nats-server/v2 v2.12.3 h1:KRv+1n7lddMVgkJPQer+pt36TcO0ENxjilBmeWdjcHs=
|
github.com/nats-io/nkeys v0.4.9 h1:qe9Faq2Gxwi6RZnZMXfmGMZkg3afLLOtrU+gDZJ35b0=
|
||||||
github.com/nats-io/nats-server/v2 v2.12.3/go.mod h1:MQXjG9WjyXKz9koWzUc3jYUMKD8x3CLmTNy91IQQz3Y=
|
github.com/nats-io/nkeys v0.4.9/go.mod h1:jcMqs+FLG+W5YO36OX6wFIFcmpdAns+w1Wm6D3I/evE=
|
||||||
github.com/nats-io/nats.go v1.49.0 h1:yh/WvY59gXqYpgl33ZI+XoVPKyut/IcEaqtsiuTJpoE=
|
|
||||||
github.com/nats-io/nats.go v1.49.0/go.mod h1:fDCn3mN5cY8HooHwE2ukiLb4p4G4ImmzvXyJt+tGwdw=
|
|
||||||
github.com/nats-io/nkeys v0.4.15 h1:JACV5jRVO9V856KOapQ7x+EY8Jo3qw1vJt/9Jpwzkk4=
|
|
||||||
github.com/nats-io/nkeys v0.4.15/go.mod h1:CpMchTXC9fxA5zrMo4KpySxNjiDVvr8ANOSZdiNfUrs=
|
|
||||||
github.com/nats-io/nuid v1.0.1 h1:5iA8DT8V7q8WK2EScv2padNa/rTESc1KdnPw4TC2paw=
|
github.com/nats-io/nuid v1.0.1 h1:5iA8DT8V7q8WK2EScv2padNa/rTESc1KdnPw4TC2paw=
|
||||||
github.com/nats-io/nuid v1.0.1/go.mod h1:19wcPz3Ph3q0Jbyiqsd0kePYG7A95tJPxeL+1OSON2c=
|
github.com/nats-io/nuid v1.0.1/go.mod h1:19wcPz3Ph3q0Jbyiqsd0kePYG7A95tJPxeL+1OSON2c=
|
||||||
github.com/oapi-codegen/runtime v1.3.0 h1:vyK1zc0gDWWXgk2xoQa4+X4RNNc5SL2RbTpJS/4vMYA=
|
github.com/niemeyer/pretty v0.0.0-20200227124842-a10e7caefd8e/go.mod h1:zD1mROLANZcx1PVRCS0qkT7pwLkGfwJo4zjcN/Tysno=
|
||||||
github.com/oapi-codegen/runtime v1.3.0/go.mod h1:kOdeacKy7t40Rclb1je37ZLFboFxh+YLy0zaPCMibPY=
|
github.com/oapi-codegen/runtime v1.1.1 h1:EXLHh0DXIJnWhdRPN2w4MXAzFyE4CskzhNLUmtpMYro=
|
||||||
|
github.com/oapi-codegen/runtime v1.1.1/go.mod h1:SK9X900oXmPWilYR5/WKPzt3Kqxn/uS/+lbpREv+eCg=
|
||||||
github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM=
|
github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM=
|
||||||
github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4=
|
github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4=
|
||||||
github.com/prometheus/client_golang v1.23.2 h1:Je96obch5RDVy3FDMndoUsjAhG5Edi49h0RJWRi/o0o=
|
github.com/prometheus/client_golang v1.20.5 h1:cxppBPuYhUnsO6yo/aoRol4L7q7UFfdm+bR9r+8l63Y=
|
||||||
github.com/prometheus/client_golang v1.23.2/go.mod h1:Tb1a6LWHB3/SPIzCoaDXI4I8UHKeFTEQ1YCr+0Gyqmg=
|
github.com/prometheus/client_golang v1.20.5/go.mod h1:PIEt8X02hGcP8JWbeHyeZ53Y/jReSnHgO035n//V5WE=
|
||||||
github.com/prometheus/client_model v0.6.2 h1:oBsgwpGs7iVziMvrGhE53c/GrLUsZdHnqNwqPLxwZyk=
|
github.com/prometheus/client_model v0.6.1 h1:ZKSh/rekM+n3CeS952MLRAdFwIKqeY8b62p8ais2e9E=
|
||||||
github.com/prometheus/client_model v0.6.2/go.mod h1:y3m2F6Gdpfy6Ut/GBsUqTWZqCUvMVzSfMLjcu6wAwpE=
|
github.com/prometheus/client_model v0.6.1/go.mod h1:OrxVMOVHjw3lKMa8+x6HeMGkHMQyHDk9E3jmP2AmGiY=
|
||||||
github.com/prometheus/common v0.67.5 h1:pIgK94WWlQt1WLwAC5j2ynLaBRDiinoAb86HZHTUGI4=
|
github.com/prometheus/common v0.55.0 h1:KEi6DK7lXW/m7Ig5i47x0vRzuBsHuvJdi5ee6Y3G1dc=
|
||||||
github.com/prometheus/common v0.67.5/go.mod h1:SjE/0MzDEEAyrdr5Gqc6G+sXI67maCxzaT3A2+HqjUw=
|
github.com/prometheus/common v0.55.0/go.mod h1:2SECS4xJG1kd8XF9IcM1gMX6510RAEL65zxzNImwdc8=
|
||||||
github.com/prometheus/procfs v0.20.1 h1:XwbrGOIplXW/AU3YhIhLODXMJYyC1isLFfYCsTEycfc=
|
github.com/prometheus/procfs v0.15.1 h1:YagwOFzUgYfKKHX6Dr+sHT7km/hxC76UB0learggepc=
|
||||||
github.com/prometheus/procfs v0.20.1/go.mod h1:o9EMBZGRyvDrSPH1RqdxhojkuXstoe4UlK79eF5TGGo=
|
github.com/prometheus/procfs v0.15.1/go.mod h1:fB45yRUv8NstnjriLhBQLuOUt+WW4BsoGhij/e3PBqk=
|
||||||
github.com/rogpeppe/go-internal v1.10.0 h1:TMyTOH3F/DB16zRVcYyreMH6GnZZrwQVAoYjRBZyWFQ=
|
github.com/rogpeppe/go-internal v1.10.0 h1:TMyTOH3F/DB16zRVcYyreMH6GnZZrwQVAoYjRBZyWFQ=
|
||||||
github.com/rogpeppe/go-internal v1.10.0/go.mod h1:UQnix2H7Ngw/k4C5ijL5+65zddjncjaFoBhdsK/akog=
|
github.com/rogpeppe/go-internal v1.10.0/go.mod h1:UQnix2H7Ngw/k4C5ijL5+65zddjncjaFoBhdsK/akog=
|
||||||
github.com/santhosh-tekuri/jsonschema/v5 v5.3.1 h1:lZUw3E0/J3roVtGQ+SCrUrg3ON6NgVqpn3+iol9aGu4=
|
github.com/santhosh-tekuri/jsonschema/v5 v5.3.1 h1:lZUw3E0/J3roVtGQ+SCrUrg3ON6NgVqpn3+iol9aGu4=
|
||||||
github.com/santhosh-tekuri/jsonschema/v5 v5.3.1/go.mod h1:uToXkOrWAZ6/Oc07xWQrPOhJotwFIyu2bBVN41fcDUY=
|
github.com/santhosh-tekuri/jsonschema/v5 v5.3.1/go.mod h1:uToXkOrWAZ6/Oc07xWQrPOhJotwFIyu2bBVN41fcDUY=
|
||||||
|
github.com/shopspring/decimal v1.3.1 h1:2Usl1nmF/WZucqkFZhnfFYxxxu8LG21F6nPQBE5gKV8=
|
||||||
github.com/shopspring/decimal v1.3.1/go.mod h1:DKyhrW/HYNuLGql+MJL6WCR6knT2jwCFRcu2hWCYk4o=
|
github.com/shopspring/decimal v1.3.1/go.mod h1:DKyhrW/HYNuLGql+MJL6WCR6knT2jwCFRcu2hWCYk4o=
|
||||||
github.com/shopspring/decimal v1.4.0 h1:bxl37RwXBklmTi0C79JfXCEBD1cqqHt0bbgBAGFp81k=
|
|
||||||
github.com/shopspring/decimal v1.4.0/go.mod h1:gawqmDU56v4yIKSwfBSFip1HdCCXN8/+DMd9qYNcwME=
|
|
||||||
github.com/spkg/bom v0.0.0-20160624110644-59b7046e48ad/go.mod h1:qLr4V1qq6nMqFKkMo8ZTx3f+BZEkzsRUY10Xsm2mwU0=
|
github.com/spkg/bom v0.0.0-20160624110644-59b7046e48ad/go.mod h1:qLr4V1qq6nMqFKkMo8ZTx3f+BZEkzsRUY10Xsm2mwU0=
|
||||||
github.com/stmcginnis/gofish v0.21.4 h1:daexK8sh31CgeSMkPUNs21HWHHA9ecCPJPyLCTxukCg=
|
github.com/stmcginnis/gofish v0.15.0 h1:8TG41+lvJk/0Nf8CIIYErxbMlQUy80W0JFRZP3Ld82A=
|
||||||
github.com/stmcginnis/gofish v0.21.4/go.mod h1:PzF5i8ecRG9A2ol8XT64npKUunyraJ+7t0kYMpQAtqU=
|
github.com/stmcginnis/gofish v0.15.0/go.mod h1:BLDSFTp8pDlf/xDbLZa+F7f7eW0E/CHCboggsu8CznI=
|
||||||
github.com/stretchr/objx v0.1.0/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME=
|
github.com/stretchr/objx v0.1.0/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME=
|
||||||
github.com/stretchr/objx v0.4.0/go.mod h1:YvHI0jy2hoMjB+UWwv71VJQ9isScKT/TqJzVSSt89Yw=
|
github.com/stretchr/objx v0.4.0/go.mod h1:YvHI0jy2hoMjB+UWwv71VJQ9isScKT/TqJzVSSt89Yw=
|
||||||
github.com/stretchr/objx v0.5.0/go.mod h1:Yh+to48EsGEfYuaHDzXPcE3xhTkx73EhmCGUpEOglKo=
|
github.com/stretchr/objx v0.5.0/go.mod h1:Yh+to48EsGEfYuaHDzXPcE3xhTkx73EhmCGUpEOglKo=
|
||||||
github.com/stretchr/objx v0.5.2/go.mod h1:FRsXN1f5AsAjCGJKqEizvkpNtU+EGNCLh3NxZ/8L+MA=
|
|
||||||
github.com/stretchr/testify v1.3.0/go.mod h1:M5WIy9Dh21IEIfnGCwXGc5bZfKNJtfHm1UVUgZn+9EI=
|
github.com/stretchr/testify v1.3.0/go.mod h1:M5WIy9Dh21IEIfnGCwXGc5bZfKNJtfHm1UVUgZn+9EI=
|
||||||
github.com/stretchr/testify v1.7.1/go.mod h1:6Fq8oRcR53rry900zMqJjRRixrwX3KX962/h/Wwjteg=
|
github.com/stretchr/testify v1.7.1/go.mod h1:6Fq8oRcR53rry900zMqJjRRixrwX3KX962/h/Wwjteg=
|
||||||
github.com/stretchr/testify v1.8.0/go.mod h1:yNjHg4UonilssWZ8iaSj1OCr/vHnekPRkoO+kdMU+MU=
|
github.com/stretchr/testify v1.8.0/go.mod h1:yNjHg4UonilssWZ8iaSj1OCr/vHnekPRkoO+kdMU+MU=
|
||||||
github.com/stretchr/testify v1.8.4/go.mod h1:sz/lmYIOXD/1dqDmKjjqLyZ2RngseejIcXlSw2iwfAo=
|
github.com/stretchr/testify v1.8.4/go.mod h1:sz/lmYIOXD/1dqDmKjjqLyZ2RngseejIcXlSw2iwfAo=
|
||||||
github.com/stretchr/testify v1.10.0/go.mod h1:r2ic/lqez/lEtzL7wO/rwa5dbSLXVDPFyf8C91i36aY=
|
github.com/stretchr/testify v1.9.0 h1:HtqpIVDClZ4nwg75+f6Lvsy/wHu+3BoSGCbBAcpTsTg=
|
||||||
github.com/stretchr/testify v1.11.1 h1:7s2iGBzp5EwR7/aIZr8ao5+dra3wiQyKjjFuvgVKu7U=
|
github.com/stretchr/testify v1.9.0/go.mod h1:r2ic/lqez/lEtzL7wO/rwa5dbSLXVDPFyf8C91i36aY=
|
||||||
github.com/stretchr/testify v1.11.1/go.mod h1:wZwfW3scLgRK+23gO65QZefKpKQRnfz6sD981Nm4B6U=
|
github.com/tklauser/go-sysconf v0.3.13 h1:GBUpcahXSpR2xN01jhkNAbTLRk2Yzgggk8IM08lq3r4=
|
||||||
github.com/tklauser/go-sysconf v0.3.16 h1:frioLaCQSsF5Cy1jgRBrzr6t502KIIwQ0MArYICU0nA=
|
github.com/tklauser/go-sysconf v0.3.13/go.mod h1:zwleP4Q4OehZHGn4CYZDipCgg9usW5IJePewFCGVEa0=
|
||||||
github.com/tklauser/go-sysconf v0.3.16/go.mod h1:/qNL9xxDhc7tx3HSRsLWNnuzbVfh3e7gh/BmM179nYI=
|
github.com/tklauser/numcpus v0.7.0 h1:yjuerZP127QG9m5Zh/mSO4wqurYil27tHrqwRoRjpr4=
|
||||||
github.com/tklauser/numcpus v0.11.0 h1:nSTwhKH5e1dMNsCdVBukSZrURJRoHbSEQjdEbY+9RXw=
|
github.com/tklauser/numcpus v0.7.0/go.mod h1:bb6dMVcj8A42tSE7i32fsIUCbQNllK5iDguyOZRUzAY=
|
||||||
github.com/tklauser/numcpus v0.11.0/go.mod h1:z+LwcLq54uWZTX0u/bGobaV34u6V7KNlTZejzM6/3MQ=
|
|
||||||
go.uber.org/goleak v1.3.0 h1:2K3zAYmnTNqV73imy9J1T3WC+gmCePx2hEGkimedGto=
|
|
||||||
go.uber.org/goleak v1.3.0/go.mod h1:CoHD4mav9JJNrW/WLlf7HGZPjdw8EucARQHekz1X6bE=
|
|
||||||
go.yaml.in/yaml/v2 v2.4.4 h1:tuyd0P+2Ont/d6e2rl3be67goVK4R6deVxCUX5vyPaQ=
|
|
||||||
go.yaml.in/yaml/v2 v2.4.4/go.mod h1:gMZqIpDtDqOfM0uNfy0SkpRhvUryYH0Z6wdMYcacYXQ=
|
|
||||||
golang.design/x/thread v0.0.0-20210122121316-335e9adffdf1 h1:P7S/GeHBAFEZIYp0ePPs2kHXoazz8q2KsyxHyQVGCJg=
|
golang.design/x/thread v0.0.0-20210122121316-335e9adffdf1 h1:P7S/GeHBAFEZIYp0ePPs2kHXoazz8q2KsyxHyQVGCJg=
|
||||||
golang.design/x/thread v0.0.0-20210122121316-335e9adffdf1/go.mod h1:9CWpnTUmlQkfdpdutA1nNf4iE5lAVt3QZOu0Z6hahBE=
|
golang.design/x/thread v0.0.0-20210122121316-335e9adffdf1/go.mod h1:9CWpnTUmlQkfdpdutA1nNf4iE5lAVt3QZOu0Z6hahBE=
|
||||||
golang.org/x/crypto v0.49.0 h1:+Ng2ULVvLHnJ/ZFEq4KdcDd/cfjrrjjNSXNzxg0Y4U4=
|
golang.org/x/crypto v0.31.0 h1:ihbySMvVjLAeSH1IbfcRTkD/iNscyz8rGzjF/E5hV6U=
|
||||||
golang.org/x/crypto v0.49.0/go.mod h1:ErX4dUh2UM+CFYiXZRTcMpEcN8b/1gxEuv3nODoYtCA=
|
golang.org/x/crypto v0.31.0/go.mod h1:kDsLvtWBEx7MV9tJOj9bnXsPbxwJQ6csT/x4KIN4Ssk=
|
||||||
golang.org/x/net v0.52.0 h1:He/TN1l0e4mmR3QqHMT2Xab3Aj3L9qjbhRm78/6jrW0=
|
golang.org/x/crypto v0.35.0 h1:b15kiHdrGCHrP6LvwaQ3c03kgNhhiMgvlhxHQhmg2Xs=
|
||||||
golang.org/x/net v0.52.0/go.mod h1:R1MAz7uMZxVMualyPXb+VaqGSa3LIaUqk0eEt3w36Sw=
|
golang.org/x/crypto v0.35.0/go.mod h1:dy7dXNW32cAb/6/PRuTNsix8T+vJAqvuIy5Bli/x0YQ=
|
||||||
|
golang.org/x/exp v0.0.0-20250215185904-eff6e970281f h1:oFMYAjX0867ZD2jcNiLBrI9BdpmEkvPyi5YrBGXbamg=
|
||||||
|
golang.org/x/exp v0.0.0-20250215185904-eff6e970281f/go.mod h1:BHOTPb3L19zxehTsLoJXVaTktb06DFgmdW6Wb9s8jqk=
|
||||||
|
golang.org/x/net v0.31.0 h1:68CPQngjLL0r2AlUKiSxtQFKvzRVbnzLwMUn5SzcLHo=
|
||||||
|
golang.org/x/net v0.31.0/go.mod h1:P4fl1q7dY2hnZFxEk4pPSkDHF+QqjitcnDjUQyMM+pM=
|
||||||
|
golang.org/x/net v0.36.0 h1:vWF2fRbw4qslQsQzgFqZff+BItCvGFQqKzKIzx1rmoA=
|
||||||
|
golang.org/x/net v0.36.0/go.mod h1:bFmbeoIPfrw4sMHNhb4J9f6+tPziuGjq7Jk/38fxi1I=
|
||||||
golang.org/x/sys v0.0.0-20210122093101-04d7465088b8/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
|
golang.org/x/sys v0.0.0-20210122093101-04d7465088b8/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
|
||||||
golang.org/x/sys v0.42.0 h1:omrd2nAlyT5ESRdCLYdm3+fMfNFE/+Rf4bDIQImRJeo=
|
golang.org/x/sys v0.28.0 h1:Fksou7UEQUWlKvIdsqzJmUmCX3cZuD2+P3XyyzwMhlA=
|
||||||
golang.org/x/sys v0.42.0/go.mod h1:4GL1E5IUh+htKOUEOaiffhrAeqysfVGipDYzABqnCmw=
|
golang.org/x/sys v0.28.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA=
|
||||||
golang.org/x/time v0.14.0 h1:MRx4UaLrDotUKUdCIqzPC48t1Y9hANFKIRpNx+Te8PI=
|
golang.org/x/sys v0.30.0 h1:QjkSwP/36a20jFYWkSue1YwXzLmsV5Gfq7Eiy72C1uc=
|
||||||
golang.org/x/time v0.14.0/go.mod h1:eL/Oa2bBBK0TkX57Fyni+NgnyQQN4LitPmob2Hjnqw4=
|
golang.org/x/sys v0.30.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA=
|
||||||
google.golang.org/protobuf v1.36.11 h1:fV6ZwhNocDyBLK0dj+fg8ektcVegBBuEolpbTQyBNVE=
|
golang.org/x/xerrors v0.0.0-20191204190536-9bdfabe68543/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0=
|
||||||
google.golang.org/protobuf v1.36.11/go.mod h1:HTf+CrKn2C3g5S8VImy6tdcUvCska2kB7j23XfzDpco=
|
google.golang.org/protobuf v1.35.2 h1:8Ar7bF+apOIoThw1EdZl0p1oWvMqTHmpA2fRTyZO8io=
|
||||||
|
google.golang.org/protobuf v1.35.2/go.mod h1:9fA7Ob0pmnwhb644+1+CVWFRbNajQ6iRojtC/QF5bRE=
|
||||||
gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0=
|
gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0=
|
||||||
|
gopkg.in/check.v1 v1.0.0-20200227125254-8fa46927fb4f/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0=
|
||||||
gopkg.in/yaml.v3 v3.0.0-20200313102051-9f266ea9e77c/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM=
|
gopkg.in/yaml.v3 v3.0.0-20200313102051-9f266ea9e77c/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM=
|
||||||
|
gopkg.in/yaml.v3 v3.0.0-20200615113413-eeeca48fe776/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM=
|
||||||
gopkg.in/yaml.v3 v3.0.1 h1:fxVm/GzAzEWqLHuvctI91KS9hhNmmWOoWu0XTYJS7CA=
|
gopkg.in/yaml.v3 v3.0.1 h1:fxVm/GzAzEWqLHuvctI91KS9hhNmmWOoWu0XTYJS7CA=
|
||||||
gopkg.in/yaml.v3 v3.0.1/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM=
|
gopkg.in/yaml.v3 v3.0.1/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM=
|
||||||
|
|||||||
@@ -1,14 +1,3 @@
|
|||||||
<!--
|
|
||||||
---
|
|
||||||
title: Metric Aggregator
|
|
||||||
description: Subsystem for evaluating expressions on metrics (deprecated)
|
|
||||||
categories: [cc-metric-collector]
|
|
||||||
tags: ['Developer']
|
|
||||||
weight: 1
|
|
||||||
hugo_path: docs/reference/cc-metric-collector/internal/metricaggregator/_index.md
|
|
||||||
---
|
|
||||||
-->
|
|
||||||
|
|
||||||
# The MetricAggregator
|
# The MetricAggregator
|
||||||
|
|
||||||
In some cases, further combination of metrics or raw values is required. For that strings like `foo + 1` with runtime dependent `foo` need to be evaluated. The MetricAggregator relies on the [`gval`](https://github.com/PaesslerAG/gval) Golang package to perform all expression evaluation. The `gval` package provides the basic arithmetic operations but the MetricAggregator defines additional ones.
|
In some cases, further combination of metrics or raw values is required. For that strings like `foo + 1` with runtime dependent `foo` need to be evaluated. The MetricAggregator relies on the [`gval`](https://github.com/PaesslerAG/gval) Golang package to perform all expression evaluation. The `gval` package provides the basic arithmetic operations but the MetricAggregator defines additional ones.
|
||||||
|
|||||||
@@ -1,26 +1,17 @@
|
|||||||
// Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
|
|
||||||
// All rights reserved. This file is part of cc-lib.
|
|
||||||
// Use of this source code is governed by a MIT-style
|
|
||||||
// license that can be found in the LICENSE file.
|
|
||||||
// additional authors:
|
|
||||||
// Holger Obermaier (NHR@KIT)
|
|
||||||
|
|
||||||
package metricAggregator
|
package metricAggregator
|
||||||
|
|
||||||
import (
|
import (
|
||||||
"context"
|
"context"
|
||||||
"fmt"
|
"fmt"
|
||||||
"maps"
|
|
||||||
"math"
|
"math"
|
||||||
"os"
|
"os"
|
||||||
"slices"
|
|
||||||
"strings"
|
"strings"
|
||||||
"sync"
|
"sync"
|
||||||
"time"
|
"time"
|
||||||
|
|
||||||
cclog "github.com/ClusterCockpit/cc-lib/v2/ccLogger"
|
cclog "github.com/ClusterCockpit/cc-lib/ccLogger"
|
||||||
|
|
||||||
lp "github.com/ClusterCockpit/cc-lib/v2/ccMessage"
|
lp "github.com/ClusterCockpit/cc-lib/ccMessage"
|
||||||
topo "github.com/ClusterCockpit/cc-metric-collector/pkg/ccTopology"
|
topo "github.com/ClusterCockpit/cc-metric-collector/pkg/ccTopology"
|
||||||
|
|
||||||
"github.com/PaesslerAG/gval"
|
"github.com/PaesslerAG/gval"
|
||||||
@@ -38,7 +29,7 @@ type MetricAggregatorIntervalConfig struct {
|
|||||||
|
|
||||||
type metricAggregator struct {
|
type metricAggregator struct {
|
||||||
functions []*MetricAggregatorIntervalConfig
|
functions []*MetricAggregatorIntervalConfig
|
||||||
constants map[string]any
|
constants map[string]interface{}
|
||||||
language gval.Language
|
language gval.Language
|
||||||
output chan lp.CCMessage
|
output chan lp.CCMessage
|
||||||
}
|
}
|
||||||
@@ -72,12 +63,10 @@ var metricCacheLanguage = gval.NewLanguage(
|
|||||||
gval.Function("getCpuList", getCpuListOfNode),
|
gval.Function("getCpuList", getCpuListOfNode),
|
||||||
gval.Function("getCpuListOfType", getCpuListOfType),
|
gval.Function("getCpuListOfType", getCpuListOfType),
|
||||||
)
|
)
|
||||||
|
|
||||||
var language gval.Language = gval.NewLanguage(
|
var language gval.Language = gval.NewLanguage(
|
||||||
gval.Full(),
|
gval.Full(),
|
||||||
metricCacheLanguage,
|
metricCacheLanguage,
|
||||||
)
|
)
|
||||||
|
|
||||||
var evaluables = struct {
|
var evaluables = struct {
|
||||||
mapping map[string]gval.Evaluable
|
mapping map[string]gval.Evaluable
|
||||||
mutex sync.Mutex
|
mutex sync.Mutex
|
||||||
@@ -88,13 +77,14 @@ var evaluables = struct {
|
|||||||
func (c *metricAggregator) Init(output chan lp.CCMessage) error {
|
func (c *metricAggregator) Init(output chan lp.CCMessage) error {
|
||||||
c.output = output
|
c.output = output
|
||||||
c.functions = make([]*MetricAggregatorIntervalConfig, 0)
|
c.functions = make([]*MetricAggregatorIntervalConfig, 0)
|
||||||
c.constants = make(map[string]any)
|
c.constants = make(map[string]interface{})
|
||||||
|
|
||||||
// add constants like hostname, numSockets, ... to constants list
|
// add constants like hostname, numSockets, ... to constants list
|
||||||
// Set hostname
|
// Set hostname
|
||||||
hostname, err := os.Hostname()
|
hostname, err := os.Hostname()
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return fmt.Errorf("metricAggregator: failed to get hostname: %w", err)
|
cclog.Error(err.Error())
|
||||||
|
return err
|
||||||
}
|
}
|
||||||
// Drop domain part of host name
|
// Drop domain part of host name
|
||||||
c.constants["hostname"] = strings.SplitN(hostname, `.`, 2)[0]
|
c.constants["hostname"] = strings.SplitN(hostname, `.`, 2)[0]
|
||||||
@@ -123,8 +113,10 @@ func (c *metricAggregator) Init(output chan lp.CCMessage) error {
|
|||||||
}
|
}
|
||||||
|
|
||||||
func (c *metricAggregator) Eval(starttime time.Time, endtime time.Time, metrics []lp.CCMessage) {
|
func (c *metricAggregator) Eval(starttime time.Time, endtime time.Time, metrics []lp.CCMessage) {
|
||||||
vars := make(map[string]any)
|
vars := make(map[string]interface{})
|
||||||
maps.Copy(vars, c.constants)
|
for k, v := range c.constants {
|
||||||
|
vars[k] = v
|
||||||
|
}
|
||||||
vars["starttime"] = starttime
|
vars["starttime"] = starttime
|
||||||
vars["endtime"] = endtime
|
vars["endtime"] = endtime
|
||||||
for _, f := range c.functions {
|
for _, f := range c.functions {
|
||||||
@@ -138,6 +130,7 @@ func (c *metricAggregator) Eval(starttime time.Time, endtime time.Time, metrics
|
|||||||
matches := make([]lp.CCMessage, 0)
|
matches := make([]lp.CCMessage, 0)
|
||||||
for _, m := range metrics {
|
for _, m := range metrics {
|
||||||
vars["metric"] = m
|
vars["metric"] = m
|
||||||
|
//value, err := gval.Evaluate(f.Condition, vars, c.language)
|
||||||
value, err := f.gvalCond.EvalBool(context.Background(), vars)
|
value, err := f.gvalCond.EvalBool(context.Background(), vars)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
cclog.ComponentError("MetricCache", "COLLECT", f.Name, "COND", f.Condition, ":", err.Error())
|
cclog.ComponentError("MetricCache", "COLLECT", f.Name, "COND", f.Condition, ":", err.Error())
|
||||||
@@ -171,22 +164,22 @@ func (c *metricAggregator) Eval(starttime time.Time, endtime time.Time, metrics
|
|||||||
// Check, that only values of one type were collected
|
// Check, that only values of one type were collected
|
||||||
countValueTypes := 0
|
countValueTypes := 0
|
||||||
if len(valuesFloat64) > 0 {
|
if len(valuesFloat64) > 0 {
|
||||||
countValueTypes++
|
countValueTypes += 1
|
||||||
}
|
}
|
||||||
if len(valuesFloat32) > 0 {
|
if len(valuesFloat32) > 0 {
|
||||||
countValueTypes++
|
countValueTypes += 1
|
||||||
}
|
}
|
||||||
if len(valuesInt) > 0 {
|
if len(valuesInt) > 0 {
|
||||||
countValueTypes++
|
countValueTypes += 1
|
||||||
}
|
}
|
||||||
if len(valuesInt32) > 0 {
|
if len(valuesInt32) > 0 {
|
||||||
countValueTypes++
|
countValueTypes += 1
|
||||||
}
|
}
|
||||||
if len(valuesInt64) > 0 {
|
if len(valuesInt64) > 0 {
|
||||||
countValueTypes++
|
countValueTypes += 1
|
||||||
}
|
}
|
||||||
if len(valuesBool) > 0 {
|
if len(valuesBool) > 0 {
|
||||||
countValueTypes++
|
countValueTypes += 1
|
||||||
}
|
}
|
||||||
if countValueTypes > 1 {
|
if countValueTypes > 1 {
|
||||||
cclog.ComponentError("MetricCache", "Collected values of different types")
|
cclog.ComponentError("MetricCache", "Collected values of different types")
|
||||||
@@ -263,15 +256,15 @@ func (c *metricAggregator) Eval(starttime time.Time, endtime time.Time, metrics
|
|||||||
var m lp.CCMessage
|
var m lp.CCMessage
|
||||||
switch t := value.(type) {
|
switch t := value.(type) {
|
||||||
case float64:
|
case float64:
|
||||||
m, err = lp.NewMessage(f.Name, tags, meta, map[string]any{"value": t}, starttime)
|
m, err = lp.NewMessage(f.Name, tags, meta, map[string]interface{}{"value": t}, starttime)
|
||||||
case float32:
|
case float32:
|
||||||
m, err = lp.NewMessage(f.Name, tags, meta, map[string]any{"value": t}, starttime)
|
m, err = lp.NewMessage(f.Name, tags, meta, map[string]interface{}{"value": t}, starttime)
|
||||||
case int:
|
case int:
|
||||||
m, err = lp.NewMessage(f.Name, tags, meta, map[string]any{"value": t}, starttime)
|
m, err = lp.NewMessage(f.Name, tags, meta, map[string]interface{}{"value": t}, starttime)
|
||||||
case int64:
|
case int64:
|
||||||
m, err = lp.NewMessage(f.Name, tags, meta, map[string]any{"value": t}, starttime)
|
m, err = lp.NewMessage(f.Name, tags, meta, map[string]interface{}{"value": t}, starttime)
|
||||||
case string:
|
case string:
|
||||||
m, err = lp.NewMessage(f.Name, tags, meta, map[string]any{"value": t}, starttime)
|
m, err = lp.NewMessage(f.Name, tags, meta, map[string]interface{}{"value": t}, starttime)
|
||||||
default:
|
default:
|
||||||
cclog.ComponentError("MetricCache", "Gval returned invalid type", t, "skipping metric", f.Name)
|
cclog.ComponentError("MetricCache", "Gval returned invalid type", t, "skipping metric", f.Name)
|
||||||
}
|
}
|
||||||
@@ -329,21 +322,18 @@ func (c *metricAggregator) AddAggregation(name, function, condition string, tags
|
|||||||
}
|
}
|
||||||
|
|
||||||
func (c *metricAggregator) DeleteAggregation(name string) error {
|
func (c *metricAggregator) DeleteAggregation(name string) error {
|
||||||
i := slices.IndexFunc(
|
for i, agg := range c.functions {
|
||||||
c.functions,
|
if agg.Name == name {
|
||||||
func(agg *MetricAggregatorIntervalConfig) bool {
|
copy(c.functions[i:], c.functions[i+1:])
|
||||||
return agg.Name == name
|
c.functions[len(c.functions)-1] = nil
|
||||||
})
|
c.functions = c.functions[:len(c.functions)-1]
|
||||||
if i == -1 {
|
return nil
|
||||||
return fmt.Errorf("no aggregation for metric name %s", name)
|
}
|
||||||
}
|
}
|
||||||
copy(c.functions[i:], c.functions[i+1:])
|
return fmt.Errorf("no aggregation for metric name %s", name)
|
||||||
c.functions[len(c.functions)-1] = nil
|
|
||||||
c.functions = c.functions[:len(c.functions)-1]
|
|
||||||
return nil
|
|
||||||
}
|
}
|
||||||
|
|
||||||
func (c *metricAggregator) AddConstant(name string, value any) {
|
func (c *metricAggregator) AddConstant(name string, value interface{}) {
|
||||||
c.constants[name] = value
|
c.constants[name] = value
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -351,18 +341,19 @@ func (c *metricAggregator) DelConstant(name string) {
|
|||||||
delete(c.constants, name)
|
delete(c.constants, name)
|
||||||
}
|
}
|
||||||
|
|
||||||
func (c *metricAggregator) AddFunction(name string, function func(args ...any) (any, error)) {
|
func (c *metricAggregator) AddFunction(name string, function func(args ...interface{}) (interface{}, error)) {
|
||||||
c.language = gval.NewLanguage(c.language, gval.Function(name, function))
|
c.language = gval.NewLanguage(c.language, gval.Function(name, function))
|
||||||
}
|
}
|
||||||
|
|
||||||
func EvalBoolCondition(condition string, params map[string]any) (bool, error) {
|
func EvalBoolCondition(condition string, params map[string]interface{}) (bool, error) {
|
||||||
evaluables.mutex.Lock()
|
evaluables.mutex.Lock()
|
||||||
evaluable, ok := evaluables.mapping[condition]
|
evaluable, ok := evaluables.mapping[condition]
|
||||||
evaluables.mutex.Unlock()
|
evaluables.mutex.Unlock()
|
||||||
if !ok {
|
if !ok {
|
||||||
newcond := strings.ReplaceAll(
|
newcond :=
|
||||||
strings.ReplaceAll(
|
strings.ReplaceAll(
|
||||||
condition, "'", "\""), "%", "\\")
|
strings.ReplaceAll(
|
||||||
|
condition, "'", "\""), "%", "\\")
|
||||||
var err error
|
var err error
|
||||||
evaluable, err = language.NewEvaluable(newcond)
|
evaluable, err = language.NewEvaluable(newcond)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
@@ -381,9 +372,10 @@ func EvalFloat64Condition(condition string, params map[string]float64) (float64,
|
|||||||
evaluable, ok := evaluables.mapping[condition]
|
evaluable, ok := evaluables.mapping[condition]
|
||||||
evaluables.mutex.Unlock()
|
evaluables.mutex.Unlock()
|
||||||
if !ok {
|
if !ok {
|
||||||
newcond := strings.ReplaceAll(
|
newcond :=
|
||||||
strings.ReplaceAll(
|
strings.ReplaceAll(
|
||||||
condition, "'", "\""), "%", "\\")
|
strings.ReplaceAll(
|
||||||
|
condition, "'", "\""), "%", "\\")
|
||||||
var err error
|
var err error
|
||||||
evaluable, err = language.NewEvaluable(newcond)
|
evaluable, err = language.NewEvaluable(newcond)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
|
|||||||
@@ -1,20 +1,13 @@
|
|||||||
// Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
|
|
||||||
// All rights reserved. This file is part of cc-lib.
|
|
||||||
// Use of this source code is governed by a MIT-style
|
|
||||||
// license that can be found in the LICENSE file.
|
|
||||||
// additional authors:
|
|
||||||
// Holger Obermaier (NHR@KIT)
|
|
||||||
|
|
||||||
package metricAggregator
|
package metricAggregator
|
||||||
|
|
||||||
import (
|
import (
|
||||||
"errors"
|
"errors"
|
||||||
"fmt"
|
"fmt"
|
||||||
"regexp"
|
"regexp"
|
||||||
"slices"
|
|
||||||
"strconv"
|
|
||||||
"strings"
|
"strings"
|
||||||
|
|
||||||
|
"golang.org/x/exp/slices"
|
||||||
|
|
||||||
topo "github.com/ClusterCockpit/cc-metric-collector/pkg/ccTopology"
|
topo "github.com/ClusterCockpit/cc-metric-collector/pkg/ccTopology"
|
||||||
)
|
)
|
||||||
|
|
||||||
@@ -34,7 +27,8 @@ func sumAnyType[T float64 | float32 | int | int32 | int64](values []T) (T, error
|
|||||||
}
|
}
|
||||||
|
|
||||||
// Sum up values
|
// Sum up values
|
||||||
func sumfunc(args any) (any, error) {
|
func sumfunc(args interface{}) (interface{}, error) {
|
||||||
|
|
||||||
var err error
|
var err error
|
||||||
switch values := args.(type) {
|
switch values := args.(type) {
|
||||||
case []float64:
|
case []float64:
|
||||||
@@ -62,7 +56,7 @@ func minAnyType[T float64 | float32 | int | int32 | int64](values []T) (T, error
|
|||||||
}
|
}
|
||||||
|
|
||||||
// Get the minimum value
|
// Get the minimum value
|
||||||
func minfunc(args any) (any, error) {
|
func minfunc(args interface{}) (interface{}, error) {
|
||||||
switch values := args.(type) {
|
switch values := args.(type) {
|
||||||
case []float64:
|
case []float64:
|
||||||
return minAnyType(values)
|
return minAnyType(values)
|
||||||
@@ -83,12 +77,12 @@ func avgAnyType[T float64 | float32 | int | int32 | int64](values []T) (float64,
|
|||||||
if len(values) == 0 {
|
if len(values) == 0 {
|
||||||
return 0.0, errors.New("average function requires at least one argument")
|
return 0.0, errors.New("average function requires at least one argument")
|
||||||
}
|
}
|
||||||
sum, err := sumAnyType(values)
|
sum, err := sumAnyType[T](values)
|
||||||
return float64(sum) / float64(len(values)), err
|
return float64(sum) / float64(len(values)), err
|
||||||
}
|
}
|
||||||
|
|
||||||
// Get the average or mean value
|
// Get the average or mean value
|
||||||
func avgfunc(args any) (any, error) {
|
func avgfunc(args interface{}) (interface{}, error) {
|
||||||
switch values := args.(type) {
|
switch values := args.(type) {
|
||||||
case []float64:
|
case []float64:
|
||||||
return avgAnyType(values)
|
return avgAnyType(values)
|
||||||
@@ -113,7 +107,7 @@ func maxAnyType[T float64 | float32 | int | int32 | int64](values []T) (T, error
|
|||||||
}
|
}
|
||||||
|
|
||||||
// Get the maximum value
|
// Get the maximum value
|
||||||
func maxfunc(args any) (any, error) {
|
func maxfunc(args interface{}) (interface{}, error) {
|
||||||
switch values := args.(type) {
|
switch values := args.(type) {
|
||||||
case []float64:
|
case []float64:
|
||||||
return maxAnyType(values)
|
return maxAnyType(values)
|
||||||
@@ -145,7 +139,7 @@ func medianAnyType[T float64 | float32 | int | int32 | int64](values []T) (T, er
|
|||||||
}
|
}
|
||||||
|
|
||||||
// Get the median value
|
// Get the median value
|
||||||
func medianfunc(args any) (any, error) {
|
func medianfunc(args interface{}) (interface{}, error) {
|
||||||
switch values := args.(type) {
|
switch values := args.(type) {
|
||||||
case []float64:
|
case []float64:
|
||||||
return medianAnyType(values)
|
return medianAnyType(values)
|
||||||
@@ -166,9 +160,9 @@ func medianfunc(args any) (any, error) {
|
|||||||
* Get number of values in list. Returns always an int
|
* Get number of values in list. Returns always an int
|
||||||
*/
|
*/
|
||||||
|
|
||||||
func lenfunc(args any) (any, error) {
|
func lenfunc(args interface{}) (interface{}, error) {
|
||||||
var err error
|
var err error = nil
|
||||||
length := 0
|
var length int = 0
|
||||||
switch values := args.(type) {
|
switch values := args.(type) {
|
||||||
case []float64:
|
case []float64:
|
||||||
length = len(values)
|
length = len(values)
|
||||||
@@ -180,7 +174,13 @@ func lenfunc(args any) (any, error) {
|
|||||||
length = len(values)
|
length = len(values)
|
||||||
case []int32:
|
case []int32:
|
||||||
length = len(values)
|
length = len(values)
|
||||||
case float64, float32, int, int64:
|
case float64:
|
||||||
|
err = errors.New("function 'len' can only be applied on arrays and strings")
|
||||||
|
case float32:
|
||||||
|
err = errors.New("function 'len' can only be applied on arrays and strings")
|
||||||
|
case int:
|
||||||
|
err = errors.New("function 'len' can only be applied on arrays and strings")
|
||||||
|
case int64:
|
||||||
err = errors.New("function 'len' can only be applied on arrays and strings")
|
err = errors.New("function 'len' can only be applied on arrays and strings")
|
||||||
case string:
|
case string:
|
||||||
length = len(values)
|
length = len(values)
|
||||||
@@ -190,13 +190,13 @@ func lenfunc(args any) (any, error) {
|
|||||||
|
|
||||||
/*
|
/*
|
||||||
* Check if a values is in a list
|
* Check if a values is in a list
|
||||||
* In contrast to most of the other functions, this one is an infix operator for
|
* In constrast to most of the other functions, this one is an infix operator for
|
||||||
* - substring matching: `"abc" in "abcdef"` -> true
|
* - substring matching: `"abc" in "abcdef"` -> true
|
||||||
* - substring matching with int casting: `3 in "abd3"` -> true
|
* - substring matching with int casting: `3 in "abd3"` -> true
|
||||||
* - search for an int in an int list: `3 in getCpuList()` -> true (if you have more than 4 CPU hardware threads)
|
* - search for an int in an int list: `3 in getCpuList()` -> true (if you have more than 4 CPU hardware threads)
|
||||||
*/
|
*/
|
||||||
|
|
||||||
func infunc(a any, b any) (any, error) {
|
func infunc(a interface{}, b interface{}) (interface{}, error) {
|
||||||
switch match := a.(type) {
|
switch match := a.(type) {
|
||||||
case string:
|
case string:
|
||||||
switch total := b.(type) {
|
switch total := b.(type) {
|
||||||
@@ -206,9 +206,13 @@ func infunc(a any, b any) (any, error) {
|
|||||||
case int:
|
case int:
|
||||||
switch total := b.(type) {
|
switch total := b.(type) {
|
||||||
case []int:
|
case []int:
|
||||||
return slices.Contains(total, match), nil
|
for _, x := range total {
|
||||||
|
if x == match {
|
||||||
|
return true, nil
|
||||||
|
}
|
||||||
|
}
|
||||||
case string:
|
case string:
|
||||||
smatch := strconv.Itoa(match)
|
smatch := fmt.Sprintf("%d", match)
|
||||||
return strings.Contains(total, smatch), nil
|
return strings.Contains(total, smatch), nil
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -222,12 +226,12 @@ func infunc(a any, b any) (any, error) {
|
|||||||
* format keys \d = %d, \w = %d, ... Not sure how to fix this
|
* format keys \d = %d, \w = %d, ... Not sure how to fix this
|
||||||
*/
|
*/
|
||||||
|
|
||||||
func matchfunc(args ...any) (any, error) {
|
func matchfunc(args ...interface{}) (interface{}, error) {
|
||||||
switch match := args[0].(type) {
|
switch match := args[0].(type) {
|
||||||
case string:
|
case string:
|
||||||
switch total := args[1].(type) {
|
switch total := args[1].(type) {
|
||||||
case string:
|
case string:
|
||||||
smatch := strings.ReplaceAll(match, "%", "\\")
|
smatch := strings.Replace(match, "%", "\\", -1)
|
||||||
regex, err := regexp.Compile(smatch)
|
regex, err := regexp.Compile(smatch)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return false, err
|
return false, err
|
||||||
@@ -244,7 +248,7 @@ func matchfunc(args ...any) (any, error) {
|
|||||||
*/
|
*/
|
||||||
|
|
||||||
// for a given cpuid, it returns the core id
|
// for a given cpuid, it returns the core id
|
||||||
func getCpuCoreFunc(args any) (any, error) {
|
func getCpuCoreFunc(args interface{}) (interface{}, error) {
|
||||||
switch cpuid := args.(type) {
|
switch cpuid := args.(type) {
|
||||||
case int:
|
case int:
|
||||||
return topo.GetHwthreadCore(cpuid), nil
|
return topo.GetHwthreadCore(cpuid), nil
|
||||||
@@ -253,7 +257,7 @@ func getCpuCoreFunc(args any) (any, error) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
// for a given cpuid, it returns the socket id
|
// for a given cpuid, it returns the socket id
|
||||||
func getCpuSocketFunc(args any) (any, error) {
|
func getCpuSocketFunc(args interface{}) (interface{}, error) {
|
||||||
switch cpuid := args.(type) {
|
switch cpuid := args.(type) {
|
||||||
case int:
|
case int:
|
||||||
return topo.GetHwthreadSocket(cpuid), nil
|
return topo.GetHwthreadSocket(cpuid), nil
|
||||||
@@ -262,7 +266,7 @@ func getCpuSocketFunc(args any) (any, error) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
// for a given cpuid, it returns the id of the NUMA node
|
// for a given cpuid, it returns the id of the NUMA node
|
||||||
func getCpuNumaDomainFunc(args any) (any, error) {
|
func getCpuNumaDomainFunc(args interface{}) (interface{}, error) {
|
||||||
switch cpuid := args.(type) {
|
switch cpuid := args.(type) {
|
||||||
case int:
|
case int:
|
||||||
return topo.GetHwthreadNumaDomain(cpuid), nil
|
return topo.GetHwthreadNumaDomain(cpuid), nil
|
||||||
@@ -271,7 +275,7 @@ func getCpuNumaDomainFunc(args any) (any, error) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
// for a given cpuid, it returns the id of the CPU die
|
// for a given cpuid, it returns the id of the CPU die
|
||||||
func getCpuDieFunc(args any) (any, error) {
|
func getCpuDieFunc(args interface{}) (interface{}, error) {
|
||||||
switch cpuid := args.(type) {
|
switch cpuid := args.(type) {
|
||||||
case int:
|
case int:
|
||||||
return topo.GetHwthreadDie(cpuid), nil
|
return topo.GetHwthreadDie(cpuid), nil
|
||||||
@@ -280,7 +284,7 @@ func getCpuDieFunc(args any) (any, error) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
// for a given core id, it returns the list of cpuids
|
// for a given core id, it returns the list of cpuids
|
||||||
func getCpuListOfCoreFunc(args any) (any, error) {
|
func getCpuListOfCoreFunc(args interface{}) (interface{}, error) {
|
||||||
cpulist := make([]int, 0)
|
cpulist := make([]int, 0)
|
||||||
switch in := args.(type) {
|
switch in := args.(type) {
|
||||||
case int:
|
case int:
|
||||||
@@ -294,7 +298,7 @@ func getCpuListOfCoreFunc(args any) (any, error) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
// for a given socket id, it returns the list of cpuids
|
// for a given socket id, it returns the list of cpuids
|
||||||
func getCpuListOfSocketFunc(args any) (any, error) {
|
func getCpuListOfSocketFunc(args interface{}) (interface{}, error) {
|
||||||
cpulist := make([]int, 0)
|
cpulist := make([]int, 0)
|
||||||
switch in := args.(type) {
|
switch in := args.(type) {
|
||||||
case int:
|
case int:
|
||||||
@@ -308,7 +312,7 @@ func getCpuListOfSocketFunc(args any) (any, error) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
// for a given id of a NUMA domain, it returns the list of cpuids
|
// for a given id of a NUMA domain, it returns the list of cpuids
|
||||||
func getCpuListOfNumaDomainFunc(args any) (any, error) {
|
func getCpuListOfNumaDomainFunc(args interface{}) (interface{}, error) {
|
||||||
cpulist := make([]int, 0)
|
cpulist := make([]int, 0)
|
||||||
switch in := args.(type) {
|
switch in := args.(type) {
|
||||||
case int:
|
case int:
|
||||||
@@ -322,7 +326,7 @@ func getCpuListOfNumaDomainFunc(args any) (any, error) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
// for a given CPU die id, it returns the list of cpuids
|
// for a given CPU die id, it returns the list of cpuids
|
||||||
func getCpuListOfDieFunc(args any) (any, error) {
|
func getCpuListOfDieFunc(args interface{}) (interface{}, error) {
|
||||||
cpulist := make([]int, 0)
|
cpulist := make([]int, 0)
|
||||||
switch in := args.(type) {
|
switch in := args.(type) {
|
||||||
case int:
|
case int:
|
||||||
@@ -336,14 +340,14 @@ func getCpuListOfDieFunc(args any) (any, error) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
// wrapper function to get a list of all cpuids of the node
|
// wrapper function to get a list of all cpuids of the node
|
||||||
func getCpuListOfNode() (any, error) {
|
func getCpuListOfNode() (interface{}, error) {
|
||||||
return topo.HwthreadList(), nil
|
return topo.HwthreadList(), nil
|
||||||
}
|
}
|
||||||
|
|
||||||
// helper function to get the cpuid list for a CCMetric type tag set (type and type-id)
|
// helper function to get the cpuid list for a CCMetric type tag set (type and type-id)
|
||||||
// since there is no access to the metric data in the function, is should be called like
|
// since there is no access to the metric data in the function, is should be called like
|
||||||
// `getCpuListOfType()`
|
// `getCpuListOfType()`
|
||||||
func getCpuListOfType(args ...any) (any, error) {
|
func getCpuListOfType(args ...interface{}) (interface{}, error) {
|
||||||
cpulist := make([]int, 0)
|
cpulist := make([]int, 0)
|
||||||
switch typ := args[0].(type) {
|
switch typ := args[0].(type) {
|
||||||
case string:
|
case string:
|
||||||
|
|||||||
@@ -1,22 +1,11 @@
|
|||||||
<!--
|
|
||||||
---
|
|
||||||
title: Message Router
|
|
||||||
description: Routing component inside cc-metric-collector
|
|
||||||
categories: [cc-metric-collector]
|
|
||||||
tags: ['Developer']
|
|
||||||
weight: 1
|
|
||||||
hugo_path: docs/reference/cc-metric-collector/internal/metricrouter/_index.md
|
|
||||||
---
|
|
||||||
-->
|
|
||||||
|
|
||||||
# CC Metric Router
|
# CC Metric Router
|
||||||
|
|
||||||
The CCMetric router sits in between the collectors and the sinks and can be used to add and remove tags to/from traversing [CCMessages](https://pkg.go.dev/github.com/ClusterCockpit/cc-lib/ccMessage).
|
The CCMetric router sits in between the collectors and the sinks and can be used to add and remove tags to/from traversing [CCMessages](https://pkg.go.dev/github.com/ClusterCockpit/cc-energy-manager@v0.0.0-20240919152819-92a17f2da4f7/pkg/cc-message.
|
||||||
|
|
||||||
|
|
||||||
# Configuration
|
# Configuration
|
||||||
|
|
||||||
**Note**: Use the [message processor configuration](https://github.com/ClusterCockpit/cc-lib/blob/main/messageProcessor/README.md) with option `process_messages`.
|
**Note**: Use the [message processor configuration](../../pkg/messageProcessor/README.md) with option `process_messages`.
|
||||||
|
|
||||||
```json
|
```json
|
||||||
{
|
{
|
||||||
@@ -80,7 +69,7 @@ The CCMetric router sits in between the collectors and the sinks and can be used
|
|||||||
|
|
||||||
There are three main options `add_tags`, `delete_tags` and `interval_timestamp`. `add_tags` and `delete_tags` are lists consisting of dicts with `key`, `value` and `if`. The `value` can be omitted in the `delete_tags` part as it only uses the `key` for removal. The `interval_timestamp` setting means that a unique timestamp is applied to all metrics traversing the router during an interval.
|
There are three main options `add_tags`, `delete_tags` and `interval_timestamp`. `add_tags` and `delete_tags` are lists consisting of dicts with `key`, `value` and `if`. The `value` can be omitted in the `delete_tags` part as it only uses the `key` for removal. The `interval_timestamp` setting means that a unique timestamp is applied to all metrics traversing the router during an interval.
|
||||||
|
|
||||||
**Note**: Use the [message processor configuration](https://github.com/ClusterCockpit/cc-lib/blob/main/messageProcessor/README.md) (option `process_messages`) instead of `add_tags`, `delete_tags`, `drop_metrics`, `drop_metrics_if`, `rename_metrics`, `normalize_units` and `change_unit_prefix`. These options are deprecated and will be removed in future versions. Until then, they are added to the message processor.
|
**Note**: Use the [message processor configuration](../../pkg/messageProcessor/README.md) (option `process_messages`) instead of `add_tags`, `delete_tags`, `drop_metrics`, `drop_metrics_if`, `rename_metrics`, `normalize_units` and `change_unit_prefix`. These options are deprecated and will be removed in future versions. Until then, they are added to the message processor.
|
||||||
|
|
||||||
# Processing order in the router
|
# Processing order in the router
|
||||||
|
|
||||||
@@ -236,13 +225,13 @@ __deprecated__
|
|||||||
|
|
||||||
|
|
||||||
The cc-metric-collector tries to read the data from the system as it is reported. If available, it tries to read the metric unit from the system as well (e.g. from `/proc/meminfo`). The problem is that, depending on the source, the metric units are named differently. Just think about `byte`, `Byte`, `B`, `bytes`, ...
|
The cc-metric-collector tries to read the data from the system as it is reported. If available, it tries to read the metric unit from the system as well (e.g. from `/proc/meminfo`). The problem is that, depending on the source, the metric units are named differently. Just think about `byte`, `Byte`, `B`, `bytes`, ...
|
||||||
The [cc-units](https://github.com/ClusterCockpit/cc-lib/ccUnits) package provides us a normalization option to use the same metric unit name for all metrics. It this option is set to true, all `unit` meta tags are normalized.
|
The [cc-units](https://github.com/ClusterCockpit/cc-units) package provides us a normalization option to use the same metric unit name for all metrics. It this option is set to true, all `unit` meta tags are normalized.
|
||||||
|
|
||||||
## The `change_unit_prefix` section
|
## The `change_unit_prefix` section
|
||||||
|
|
||||||
__deprecated__
|
__deprecated__
|
||||||
|
|
||||||
It is often the case that metrics are reported by the system using a rather outdated unit prefix (like `/proc/meminfo` still uses kByte despite current memory sizes are in the GByte range). If you want to change the prefix of a unit, you can do that with the help of [cc-units](https://github.com/ClusterCockpit/cc-lib/ccUnits). The setting works on the metric name and requires the new prefix for the metric. The cc-units package determines the scaling factor.
|
It is often the case that metrics are reported by the system using a rather outdated unit prefix (like `/proc/meminfo` still uses kByte despite current memory sizes are in the GByte range). If you want to change the prefix of a unit, you can do that with the help of [cc-units](https://github.com/ClusterCockpit/cc-units). The setting works on the metric name and requires the new prefix for the metric. The cc-units package determines the scaling factor.
|
||||||
|
|
||||||
# Aggregate metric values of the current interval with the `interval_aggregates` option
|
# Aggregate metric values of the current interval with the `interval_aggregates` option
|
||||||
|
|
||||||
@@ -274,7 +263,7 @@ The above configuration, collects all metric values for metrics evaluating `if`
|
|||||||
If you are not interested in the input metrics `sub_metric_%d+` at all, you can add the same condition used here to the `drop_metrics_if` section to drop them.
|
If you are not interested in the input metrics `sub_metric_%d+` at all, you can add the same condition used here to the `drop_metrics_if` section to drop them.
|
||||||
|
|
||||||
Use cases for `interval_aggregates`:
|
Use cases for `interval_aggregates`:
|
||||||
- Combine multiple metrics of a collector into a new one, like the [MemstatCollector](../../collectors/memstatMetric.md) does for `mem_used`:
|
- Combine multiple metrics of a collector into a new one, like the [MemstatCollector](../../collectors/memstatMetric.md) does for `mem_used`:
|
||||||
```json
|
```json
|
||||||
{
|
{
|
||||||
"name" : "mem_used",
|
"name" : "mem_used",
|
||||||
|
|||||||
@@ -1,20 +1,12 @@
|
|||||||
// Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
|
|
||||||
// All rights reserved. This file is part of cc-lib.
|
|
||||||
// Use of this source code is governed by a MIT-style
|
|
||||||
// license that can be found in the LICENSE file.
|
|
||||||
// additional authors:
|
|
||||||
// Holger Obermaier (NHR@KIT)
|
|
||||||
|
|
||||||
package metricRouter
|
package metricRouter
|
||||||
|
|
||||||
import (
|
import (
|
||||||
"fmt"
|
|
||||||
"sync"
|
"sync"
|
||||||
"time"
|
"time"
|
||||||
|
|
||||||
cclog "github.com/ClusterCockpit/cc-lib/v2/ccLogger"
|
cclog "github.com/ClusterCockpit/cc-lib/ccLogger"
|
||||||
|
|
||||||
lp "github.com/ClusterCockpit/cc-lib/v2/ccMessage"
|
lp "github.com/ClusterCockpit/cc-lib/ccMessage"
|
||||||
agg "github.com/ClusterCockpit/cc-metric-collector/internal/metricAggregator"
|
agg "github.com/ClusterCockpit/cc-metric-collector/internal/metricAggregator"
|
||||||
mct "github.com/ClusterCockpit/cc-metric-collector/pkg/multiChanTicker"
|
mct "github.com/ClusterCockpit/cc-metric-collector/pkg/multiChanTicker"
|
||||||
)
|
)
|
||||||
@@ -52,7 +44,7 @@ type MetricCache interface {
|
|||||||
}
|
}
|
||||||
|
|
||||||
func (c *metricCache) Init(output chan lp.CCMessage, ticker mct.MultiChanTicker, wg *sync.WaitGroup, numPeriods int) error {
|
func (c *metricCache) Init(output chan lp.CCMessage, ticker mct.MultiChanTicker, wg *sync.WaitGroup, numPeriods int) error {
|
||||||
var err error
|
var err error = nil
|
||||||
c.done = make(chan bool)
|
c.done = make(chan bool)
|
||||||
c.wg = wg
|
c.wg = wg
|
||||||
c.ticker = ticker
|
c.ticker = ticker
|
||||||
@@ -71,7 +63,8 @@ func (c *metricCache) Init(output chan lp.CCMessage, ticker mct.MultiChanTicker,
|
|||||||
// The code is executed by the MetricCache goroutine
|
// The code is executed by the MetricCache goroutine
|
||||||
c.aggEngine, err = agg.NewAggregator(c.output)
|
c.aggEngine, err = agg.NewAggregator(c.output)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return fmt.Errorf("MetricCache: failed to create aggregator: %w", err)
|
cclog.ComponentError("MetricCache", "Cannot create aggregator")
|
||||||
|
return err
|
||||||
}
|
}
|
||||||
|
|
||||||
return nil
|
return nil
|
||||||
@@ -79,6 +72,7 @@ func (c *metricCache) Init(output chan lp.CCMessage, ticker mct.MultiChanTicker,
|
|||||||
|
|
||||||
// Start starts the metric cache
|
// Start starts the metric cache
|
||||||
func (c *metricCache) Start() {
|
func (c *metricCache) Start() {
|
||||||
|
|
||||||
c.tickchan = make(chan time.Time)
|
c.tickchan = make(chan time.Time)
|
||||||
c.ticker.AddChannel(c.tickchan)
|
c.ticker.AddChannel(c.tickchan)
|
||||||
// Router cache is done
|
// Router cache is done
|
||||||
@@ -101,7 +95,9 @@ func (c *metricCache) Start() {
|
|||||||
return oldPeriod
|
return oldPeriod
|
||||||
}
|
}
|
||||||
|
|
||||||
c.wg.Go(func() {
|
c.wg.Add(1)
|
||||||
|
go func() {
|
||||||
|
defer c.wg.Done()
|
||||||
for {
|
for {
|
||||||
select {
|
select {
|
||||||
case <-c.done:
|
case <-c.done:
|
||||||
@@ -121,7 +117,7 @@ func (c *metricCache) Start() {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
})
|
}()
|
||||||
cclog.ComponentDebug("MetricCache", "START")
|
cclog.ComponentDebug("MetricCache", "START")
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -134,12 +130,12 @@ func (c *metricCache) Add(metric lp.CCMessage) {
|
|||||||
p := c.intervals[c.curPeriod]
|
p := c.intervals[c.curPeriod]
|
||||||
if p.numMetrics < p.sizeMetrics {
|
if p.numMetrics < p.sizeMetrics {
|
||||||
p.metrics[p.numMetrics] = metric
|
p.metrics[p.numMetrics] = metric
|
||||||
p.numMetrics++
|
p.numMetrics = p.numMetrics + 1
|
||||||
p.stopstamp = metric.Time()
|
p.stopstamp = metric.Time()
|
||||||
} else {
|
} else {
|
||||||
p.metrics = append(p.metrics, metric)
|
p.metrics = append(p.metrics, metric)
|
||||||
p.numMetrics++
|
p.numMetrics = p.numMetrics + 1
|
||||||
p.sizeMetrics++
|
p.sizeMetrics = p.sizeMetrics + 1
|
||||||
p.stopstamp = metric.Time()
|
p.stopstamp = metric.Time()
|
||||||
}
|
}
|
||||||
c.lock.Unlock()
|
c.lock.Unlock()
|
||||||
@@ -158,8 +154,8 @@ func (c *metricCache) DeleteAggregation(name string) error {
|
|||||||
// is the current one, index=1 the last interval and so on. Returns an empty array if a wrong index
|
// is the current one, index=1 the last interval and so on. Returns an empty array if a wrong index
|
||||||
// is given (negative index, index larger than configured number of total intervals, ...)
|
// is given (negative index, index larger than configured number of total intervals, ...)
|
||||||
func (c *metricCache) GetPeriod(index int) (time.Time, time.Time, []lp.CCMessage) {
|
func (c *metricCache) GetPeriod(index int) (time.Time, time.Time, []lp.CCMessage) {
|
||||||
start := time.Now()
|
var start time.Time = time.Now()
|
||||||
stop := time.Now()
|
var stop time.Time = time.Now()
|
||||||
var metrics []lp.CCMessage
|
var metrics []lp.CCMessage
|
||||||
if index >= 0 && index < c.numPeriods {
|
if index >= 0 && index < c.numPeriods {
|
||||||
pindex := c.curPeriod - index
|
pindex := c.curPeriod - index
|
||||||
@@ -170,6 +166,7 @@ func (c *metricCache) GetPeriod(index int) (time.Time, time.Time, []lp.CCMessage
|
|||||||
start = c.intervals[pindex].startstamp
|
start = c.intervals[pindex].startstamp
|
||||||
stop = c.intervals[pindex].stopstamp
|
stop = c.intervals[pindex].stopstamp
|
||||||
metrics = c.intervals[pindex].metrics
|
metrics = c.intervals[pindex].metrics
|
||||||
|
//return c.intervals[pindex].startstamp, c.intervals[pindex].stopstamp, c.intervals[pindex].metrics
|
||||||
} else {
|
} else {
|
||||||
metrics = make([]lp.CCMessage, 0)
|
metrics = make([]lp.CCMessage, 0)
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -1,25 +1,17 @@
|
|||||||
// Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
|
|
||||||
// All rights reserved. This file is part of cc-lib.
|
|
||||||
// Use of this source code is governed by a MIT-style
|
|
||||||
// license that can be found in the LICENSE file.
|
|
||||||
// additional authors:
|
|
||||||
// Holger Obermaier (NHR@KIT)
|
|
||||||
|
|
||||||
package metricRouter
|
package metricRouter
|
||||||
|
|
||||||
import (
|
import (
|
||||||
"bytes"
|
|
||||||
"encoding/json"
|
"encoding/json"
|
||||||
"fmt"
|
"fmt"
|
||||||
"maps"
|
|
||||||
"os"
|
"os"
|
||||||
"strings"
|
"strings"
|
||||||
"sync"
|
"sync"
|
||||||
"time"
|
"time"
|
||||||
|
|
||||||
cclog "github.com/ClusterCockpit/cc-lib/v2/ccLogger"
|
cclog "github.com/ClusterCockpit/cc-lib/ccLogger"
|
||||||
lp "github.com/ClusterCockpit/cc-lib/v2/ccMessage"
|
|
||||||
mp "github.com/ClusterCockpit/cc-lib/v2/messageProcessor"
|
lp "github.com/ClusterCockpit/cc-lib/ccMessage"
|
||||||
|
mp "github.com/ClusterCockpit/cc-lib/messageProcessor"
|
||||||
agg "github.com/ClusterCockpit/cc-metric-collector/internal/metricAggregator"
|
agg "github.com/ClusterCockpit/cc-metric-collector/internal/metricAggregator"
|
||||||
mct "github.com/ClusterCockpit/cc-metric-collector/pkg/multiChanTicker"
|
mct "github.com/ClusterCockpit/cc-metric-collector/pkg/multiChanTicker"
|
||||||
)
|
)
|
||||||
@@ -47,7 +39,8 @@ type metricRouterConfig struct {
|
|||||||
MaxForward int `json:"max_forward"` // Number of maximal forwarded metrics at one select
|
MaxForward int `json:"max_forward"` // Number of maximal forwarded metrics at one select
|
||||||
NormalizeUnits bool `json:"normalize_units"` // Check unit meta flag and normalize it using cc-units
|
NormalizeUnits bool `json:"normalize_units"` // Check unit meta flag and normalize it using cc-units
|
||||||
ChangeUnitPrefix map[string]string `json:"change_unit_prefix"` // Add prefix that should be applied to the metrics
|
ChangeUnitPrefix map[string]string `json:"change_unit_prefix"` // Add prefix that should be applied to the metrics
|
||||||
MessageProcessor json.RawMessage `json:"process_messages,omitempty"`
|
// dropMetrics map[string]bool // Internal map for O(1) lookup
|
||||||
|
MessageProcessor json.RawMessage `json:"process_messages,omitempty"`
|
||||||
}
|
}
|
||||||
|
|
||||||
// Metric router data structure
|
// Metric router data structure
|
||||||
@@ -102,93 +95,76 @@ func (r *metricRouter) Init(ticker mct.MultiChanTicker, wg *sync.WaitGroup, rout
|
|||||||
// Drop domain part of host name
|
// Drop domain part of host name
|
||||||
r.hostname = strings.SplitN(hostname, `.`, 2)[0]
|
r.hostname = strings.SplitN(hostname, `.`, 2)[0]
|
||||||
|
|
||||||
d := json.NewDecoder(bytes.NewReader(routerConfig))
|
err = json.Unmarshal(routerConfig, &r.config)
|
||||||
d.DisallowUnknownFields()
|
if err != nil {
|
||||||
if err := d.Decode(&r.config); err != nil {
|
cclog.ComponentError("MetricRouter", err.Error())
|
||||||
return fmt.Errorf("failed to decode metric router config: %w", err)
|
return err
|
||||||
|
}
|
||||||
|
r.maxForward = 1
|
||||||
|
if r.config.MaxForward > r.maxForward {
|
||||||
|
r.maxForward = r.config.MaxForward
|
||||||
}
|
}
|
||||||
r.maxForward = max(1, r.config.MaxForward)
|
|
||||||
|
|
||||||
if r.config.NumCacheIntervals > 0 {
|
if r.config.NumCacheIntervals > 0 {
|
||||||
r.cache, err = NewCache(r.cache_input, r.ticker, &r.cachewg, r.config.NumCacheIntervals)
|
r.cache, err = NewCache(r.cache_input, r.ticker, &r.cachewg, r.config.NumCacheIntervals)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return fmt.Errorf("MetricRouter: failed to initialize MetricCache: %w", err)
|
cclog.ComponentError("MetricRouter", "MetricCache initialization failed:", err.Error())
|
||||||
|
return err
|
||||||
}
|
}
|
||||||
for _, agg := range r.config.IntervalAgg {
|
for _, agg := range r.config.IntervalAgg {
|
||||||
err = r.cache.AddAggregation(agg.Name, agg.Function, agg.Condition, agg.Tags, agg.Meta)
|
r.cache.AddAggregation(agg.Name, agg.Function, agg.Condition, agg.Tags, agg.Meta)
|
||||||
if err != nil {
|
|
||||||
return fmt.Errorf("MetricCache AddAggregation() failed: %w", err)
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
p, err := mp.NewMessageProcessor()
|
p, err := mp.NewMessageProcessor()
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return fmt.Errorf("MessageProcessor NewMessageProcessor() failed: %w", err)
|
return fmt.Errorf("initialization of message processor failed: %v", err.Error())
|
||||||
}
|
}
|
||||||
r.mp = p
|
r.mp = p
|
||||||
|
|
||||||
if len(r.config.MessageProcessor) > 0 {
|
if len(r.config.MessageProcessor) > 0 {
|
||||||
err = r.mp.FromConfigJSON(r.config.MessageProcessor)
|
err = r.mp.FromConfigJSON(r.config.MessageProcessor)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return fmt.Errorf("MessageProcessor FromConfigJSON() failed: %w", err)
|
return fmt.Errorf("failed parsing JSON for message processor: %v", err.Error())
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
for _, mname := range r.config.DropMetrics {
|
for _, mname := range r.config.DropMetrics {
|
||||||
err = r.mp.AddDropMessagesByName(mname)
|
r.mp.AddDropMessagesByName(mname)
|
||||||
if err != nil {
|
|
||||||
return fmt.Errorf("MessageProcessor AddDropMessagesByName() failed: %w", err)
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
for _, cond := range r.config.DropMetricsIf {
|
for _, cond := range r.config.DropMetricsIf {
|
||||||
err = r.mp.AddDropMessagesByCondition(cond)
|
r.mp.AddDropMessagesByCondition(cond)
|
||||||
if err != nil {
|
|
||||||
return fmt.Errorf("MessageProcessor AddDropMessagesByCondition() failed: %w", err)
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
for _, data := range r.config.AddTags {
|
for _, data := range r.config.AddTags {
|
||||||
cond := data.Condition
|
cond := data.Condition
|
||||||
if cond == "*" {
|
if cond == "*" {
|
||||||
cond = "true"
|
cond = "true"
|
||||||
}
|
}
|
||||||
err = r.mp.AddAddTagsByCondition(cond, data.Key, data.Value)
|
r.mp.AddAddTagsByCondition(cond, data.Key, data.Value)
|
||||||
if err != nil {
|
|
||||||
return fmt.Errorf("MessageProcessor AddAddTagsByCondition() failed: %w", err)
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
for _, data := range r.config.DelTags {
|
for _, data := range r.config.DelTags {
|
||||||
cond := data.Condition
|
cond := data.Condition
|
||||||
if cond == "*" {
|
if cond == "*" {
|
||||||
cond = "true"
|
cond = "true"
|
||||||
}
|
}
|
||||||
err = r.mp.AddDeleteTagsByCondition(cond, data.Key, data.Value)
|
r.mp.AddDeleteTagsByCondition(cond, data.Key, data.Value)
|
||||||
if err != nil {
|
|
||||||
return fmt.Errorf("MessageProcessor AddDeleteTagsByCondition() failed: %w", err)
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
for oldname, newname := range r.config.RenameMetrics {
|
for oldname, newname := range r.config.RenameMetrics {
|
||||||
err = r.mp.AddRenameMetricByName(oldname, newname)
|
r.mp.AddRenameMetricByName(oldname, newname)
|
||||||
if err != nil {
|
|
||||||
return fmt.Errorf("MessageProcessor AddRenameMetricByName() failed: %w", err)
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
for metricName, prefix := range r.config.ChangeUnitPrefix {
|
for metricName, prefix := range r.config.ChangeUnitPrefix {
|
||||||
err = r.mp.AddChangeUnitPrefix(fmt.Sprintf("name == '%s'", metricName), prefix)
|
r.mp.AddChangeUnitPrefix(fmt.Sprintf("name == '%s'", metricName), prefix)
|
||||||
if err != nil {
|
|
||||||
return fmt.Errorf("MessageProcessor AddChangeUnitPrefix() failed: %w", err)
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
r.mp.SetNormalizeUnits(r.config.NormalizeUnits)
|
r.mp.SetNormalizeUnits(r.config.NormalizeUnits)
|
||||||
|
|
||||||
err = r.mp.AddAddTagsByCondition("!msg.HasTag('"+r.config.HostnameTagName+"')", r.config.HostnameTagName, r.hostname)
|
r.mp.AddAddTagsByCondition("true", r.config.HostnameTagName, r.hostname)
|
||||||
if err != nil {
|
|
||||||
return fmt.Errorf("MessageProcessor AddAddTagsByCondition() failed: %w", err)
|
|
||||||
}
|
|
||||||
|
|
||||||
|
// r.config.dropMetrics = make(map[string]bool)
|
||||||
|
// for _, mname := range r.config.DropMetrics {
|
||||||
|
// r.config.dropMetrics[mname] = true
|
||||||
|
// }
|
||||||
return nil
|
return nil
|
||||||
}
|
}
|
||||||
|
|
||||||
func getParamMap(point lp.CCMessage) map[string]any {
|
func getParamMap(point lp.CCMessage) map[string]interface{} {
|
||||||
params := make(map[string]any)
|
params := make(map[string]interface{})
|
||||||
params["metric"] = point
|
params["metric"] = point
|
||||||
params["name"] = point.Name()
|
params["name"] = point.Name()
|
||||||
for key, value := range point.Tags() {
|
for key, value := range point.Tags() {
|
||||||
@@ -197,12 +173,14 @@ func getParamMap(point lp.CCMessage) map[string]any {
|
|||||||
for key, value := range point.Meta() {
|
for key, value := range point.Meta() {
|
||||||
params[key] = value
|
params[key] = value
|
||||||
}
|
}
|
||||||
maps.Copy(params, point.Fields())
|
for key, value := range point.Fields() {
|
||||||
|
params[key] = value
|
||||||
|
}
|
||||||
params["timestamp"] = point.Time()
|
params["timestamp"] = point.Time()
|
||||||
return params
|
return params
|
||||||
}
|
}
|
||||||
|
|
||||||
// DoAddTags adds a tag when condition is fulfilled
|
// DoAddTags adds a tag when condition is fullfiled
|
||||||
func (r *metricRouter) DoAddTags(point lp.CCMessage) {
|
func (r *metricRouter) DoAddTags(point lp.CCMessage) {
|
||||||
var conditionMatches bool
|
var conditionMatches bool
|
||||||
for _, m := range r.config.AddTags {
|
for _, m := range r.config.AddTags {
|
||||||
@@ -224,6 +202,83 @@ func (r *metricRouter) DoAddTags(point lp.CCMessage) {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// DoDelTags removes a tag when condition is fullfiled
|
||||||
|
// func (r *metricRouter) DoDelTags(point lp.CCMessage) {
|
||||||
|
// var conditionMatches bool
|
||||||
|
// for _, m := range r.config.DelTags {
|
||||||
|
// if m.Condition == "*" {
|
||||||
|
// // Condition is always matched
|
||||||
|
// conditionMatches = true
|
||||||
|
// } else {
|
||||||
|
// // Evaluate condition
|
||||||
|
// var err error
|
||||||
|
// conditionMatches, err = agg.EvalBoolCondition(m.Condition, getParamMap(point))
|
||||||
|
// if err != nil {
|
||||||
|
// cclog.ComponentError("MetricRouter", err.Error())
|
||||||
|
// conditionMatches = false
|
||||||
|
// }
|
||||||
|
// }
|
||||||
|
// if conditionMatches {
|
||||||
|
// point.RemoveTag(m.Key)
|
||||||
|
// }
|
||||||
|
// }
|
||||||
|
// }
|
||||||
|
|
||||||
|
// Conditional test whether a metric should be dropped
|
||||||
|
// func (r *metricRouter) dropMetric(point lp.CCMessage) bool {
|
||||||
|
// // Simple drop check
|
||||||
|
// if conditionMatches, ok := r.config.dropMetrics[point.Name()]; ok {
|
||||||
|
// return conditionMatches
|
||||||
|
// }
|
||||||
|
|
||||||
|
// // Checking the dropping conditions
|
||||||
|
// for _, m := range r.config.DropMetricsIf {
|
||||||
|
// conditionMatches, err := agg.EvalBoolCondition(m, getParamMap(point))
|
||||||
|
// if err != nil {
|
||||||
|
// cclog.ComponentError("MetricRouter", err.Error())
|
||||||
|
// conditionMatches = false
|
||||||
|
// }
|
||||||
|
// if conditionMatches {
|
||||||
|
// return conditionMatches
|
||||||
|
// }
|
||||||
|
// }
|
||||||
|
|
||||||
|
// // No dropping condition met
|
||||||
|
// return false
|
||||||
|
// }
|
||||||
|
|
||||||
|
// func (r *metricRouter) prepareUnit(point lp.CCMessage) bool {
|
||||||
|
// if r.config.NormalizeUnits {
|
||||||
|
// if in_unit, ok := point.GetMeta("unit"); ok {
|
||||||
|
// u := units.NewUnit(in_unit)
|
||||||
|
// if u.Valid() {
|
||||||
|
// point.AddMeta("unit", u.Short())
|
||||||
|
// }
|
||||||
|
// }
|
||||||
|
// }
|
||||||
|
// if newP, ok := r.config.ChangeUnitPrefix[point.Name()]; ok {
|
||||||
|
|
||||||
|
// newPrefix := units.NewPrefix(newP)
|
||||||
|
|
||||||
|
// if in_unit, ok := point.GetMeta("unit"); ok && newPrefix != units.InvalidPrefix {
|
||||||
|
// u := units.NewUnit(in_unit)
|
||||||
|
// if u.Valid() {
|
||||||
|
// cclog.ComponentDebug("MetricRouter", "Change prefix to", newP, "for metric", point.Name())
|
||||||
|
// conv, out_unit := units.GetUnitPrefixFactor(u, newPrefix)
|
||||||
|
// if conv != nil && out_unit.Valid() {
|
||||||
|
// if val, ok := point.GetField("value"); ok {
|
||||||
|
// point.AddField("value", conv(val))
|
||||||
|
// point.AddMeta("unit", out_unit.Short())
|
||||||
|
// }
|
||||||
|
// }
|
||||||
|
// }
|
||||||
|
|
||||||
|
// }
|
||||||
|
// }
|
||||||
|
|
||||||
|
// return true
|
||||||
|
// }
|
||||||
|
|
||||||
// Start starts the metric router
|
// Start starts the metric router
|
||||||
func (r *metricRouter) Start() {
|
func (r *metricRouter) Start() {
|
||||||
// start timer if configured
|
// start timer if configured
|
||||||
@@ -239,9 +294,31 @@ func (r *metricRouter) Start() {
|
|||||||
cclog.ComponentDebug("MetricRouter", "DONE")
|
cclog.ComponentDebug("MetricRouter", "DONE")
|
||||||
}
|
}
|
||||||
|
|
||||||
// Forward message received from collector channel
|
// Forward takes a received metric, adds or deletes tags
|
||||||
|
// and forwards it to the output channels
|
||||||
|
// forward := func(point lp.CCMessage) {
|
||||||
|
// cclog.ComponentDebug("MetricRouter", "FORWARD", point)
|
||||||
|
// r.DoAddTags(point)
|
||||||
|
// r.DoDelTags(point)
|
||||||
|
// name := point.Name()
|
||||||
|
// if new, ok := r.config.RenameMetrics[name]; ok {
|
||||||
|
// point.SetName(new)
|
||||||
|
// point.AddMeta("oldname", name)
|
||||||
|
// r.DoAddTags(point)
|
||||||
|
// r.DoDelTags(point)
|
||||||
|
// }
|
||||||
|
|
||||||
|
// r.prepareUnit(point)
|
||||||
|
|
||||||
|
// for _, o := range r.outputs {
|
||||||
|
// o <- point
|
||||||
|
// }
|
||||||
|
// }
|
||||||
|
|
||||||
|
// Foward message received from collector channel
|
||||||
coll_forward := func(p lp.CCMessage) {
|
coll_forward := func(p lp.CCMessage) {
|
||||||
// receive from metric collector
|
// receive from metric collector
|
||||||
|
//p.AddTag(r.config.HostnameTagName, r.hostname)
|
||||||
if r.config.IntervalStamp {
|
if r.config.IntervalStamp {
|
||||||
p.SetTime(r.timestamp)
|
p.SetTime(r.timestamp)
|
||||||
}
|
}
|
||||||
@@ -251,6 +328,11 @@ func (r *metricRouter) Start() {
|
|||||||
o <- m
|
o <- m
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
// if !r.dropMetric(p) {
|
||||||
|
// for _, o := range r.outputs {
|
||||||
|
// o <- point
|
||||||
|
// }
|
||||||
|
// }
|
||||||
// even if the metric is dropped, it is stored in the cache for
|
// even if the metric is dropped, it is stored in the cache for
|
||||||
// aggregations
|
// aggregations
|
||||||
if r.config.NumCacheIntervals > 0 {
|
if r.config.NumCacheIntervals > 0 {
|
||||||
@@ -270,6 +352,9 @@ func (r *metricRouter) Start() {
|
|||||||
o <- m
|
o <- m
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
// if !r.dropMetric(p) {
|
||||||
|
// forward(p)
|
||||||
|
// }
|
||||||
}
|
}
|
||||||
|
|
||||||
// Forward message received from cache channel
|
// Forward message received from cache channel
|
||||||
@@ -288,7 +373,10 @@ func (r *metricRouter) Start() {
|
|||||||
r.cache.Start()
|
r.cache.Start()
|
||||||
}
|
}
|
||||||
|
|
||||||
r.wg.Go(func() {
|
r.wg.Add(1)
|
||||||
|
go func() {
|
||||||
|
defer r.wg.Done()
|
||||||
|
|
||||||
for {
|
for {
|
||||||
select {
|
select {
|
||||||
case <-r.done:
|
case <-r.done:
|
||||||
@@ -318,7 +406,7 @@ func (r *metricRouter) Start() {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
})
|
}()
|
||||||
cclog.ComponentDebug("MetricRouter", "STARTED")
|
cclog.ComponentDebug("MetricRouter", "STARTED")
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
@@ -1,10 +1,3 @@
|
|||||||
// Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
|
|
||||||
// All rights reserved. This file is part of cc-lib.
|
|
||||||
// Use of this source code is governed by a MIT-style
|
|
||||||
// license that can be found in the LICENSE file.
|
|
||||||
// additional authors:
|
|
||||||
// Holger Obermaier (NHR@KIT)
|
|
||||||
|
|
||||||
package ccTopology
|
package ccTopology
|
||||||
|
|
||||||
import (
|
import (
|
||||||
@@ -13,11 +6,11 @@ import (
|
|||||||
"os"
|
"os"
|
||||||
"path/filepath"
|
"path/filepath"
|
||||||
"regexp"
|
"regexp"
|
||||||
"slices"
|
|
||||||
"strconv"
|
"strconv"
|
||||||
"strings"
|
"strings"
|
||||||
|
|
||||||
cclogger "github.com/ClusterCockpit/cc-lib/v2/ccLogger"
|
cclogger "github.com/ClusterCockpit/cc-lib/ccLogger"
|
||||||
|
"golang.org/x/exp/slices"
|
||||||
)
|
)
|
||||||
|
|
||||||
const SYSFS_CPUBASE = `/sys/devices/system/cpu`
|
const SYSFS_CPUBASE = `/sys/devices/system/cpu`
|
||||||
@@ -51,13 +44,14 @@ var cache struct {
|
|||||||
func fileToInt(path string) int {
|
func fileToInt(path string) int {
|
||||||
buffer, err := os.ReadFile(path)
|
buffer, err := os.ReadFile(path)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
cclogger.ComponentError("ccTopology", fmt.Sprintf("fileToInt(): Reading \"%s\": %v", path, err))
|
log.Print(err)
|
||||||
|
cclogger.ComponentError("ccTopology", "fileToInt", "Reading", path, ":", err.Error())
|
||||||
return -1
|
return -1
|
||||||
}
|
}
|
||||||
stringBuffer := strings.TrimSpace(string(buffer))
|
stringBuffer := strings.TrimSpace(string(buffer))
|
||||||
id, err := strconv.Atoi(stringBuffer)
|
id, err := strconv.Atoi(stringBuffer)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
cclogger.ComponentError("ccTopology", fmt.Sprintf("fileToInt(): Parsing \"%s\": %v", stringBuffer, err))
|
cclogger.ComponentError("ccTopology", "fileToInt", "Parsing", path, ":", stringBuffer, err.Error())
|
||||||
return -1
|
return -1
|
||||||
}
|
}
|
||||||
return id
|
return id
|
||||||
@@ -79,7 +73,7 @@ func fileToList(path string) []int {
|
|||||||
// Create list
|
// Create list
|
||||||
list := make([]int, 0)
|
list := make([]int, 0)
|
||||||
stringBuffer := strings.TrimSpace(string(buffer))
|
stringBuffer := strings.TrimSpace(string(buffer))
|
||||||
for valueRangeString := range strings.SplitSeq(stringBuffer, ",") {
|
for _, valueRangeString := range strings.Split(stringBuffer, ",") {
|
||||||
valueRange := strings.Split(valueRangeString, "-")
|
valueRange := strings.Split(valueRangeString, "-")
|
||||||
switch len(valueRange) {
|
switch len(valueRange) {
|
||||||
case 1:
|
case 1:
|
||||||
@@ -111,76 +105,79 @@ func fileToList(path string) []int {
|
|||||||
|
|
||||||
// init initializes the cache structure
|
// init initializes the cache structure
|
||||||
func init() {
|
func init() {
|
||||||
getHWThreads := func() []int {
|
|
||||||
globPath := filepath.Join(SYSFS_CPUBASE, "cpu[0-9]*")
|
|
||||||
regexPath := filepath.Join(SYSFS_CPUBASE, "cpu([[:digit:]]+)")
|
|
||||||
regex := regexp.MustCompile(regexPath)
|
|
||||||
|
|
||||||
// File globbing for hardware threads
|
getHWThreads :=
|
||||||
files, err := filepath.Glob(globPath)
|
func() []int {
|
||||||
if err != nil {
|
globPath := filepath.Join(SYSFS_CPUBASE, "cpu[0-9]*")
|
||||||
cclogger.ComponentError("CCTopology", "init:getHWThreads", err.Error())
|
regexPath := filepath.Join(SYSFS_CPUBASE, "cpu([[:digit:]]+)")
|
||||||
return nil
|
regex := regexp.MustCompile(regexPath)
|
||||||
}
|
|
||||||
|
|
||||||
hwThreadIDs := make([]int, len(files))
|
// File globbing for hardware threads
|
||||||
for i, file := range files {
|
files, err := filepath.Glob(globPath)
|
||||||
// Extract hardware thread ID
|
if err != nil {
|
||||||
matches := regex.FindStringSubmatch(file)
|
cclogger.ComponentError("CCTopology", "init:getHWThreads", err.Error())
|
||||||
if len(matches) != 2 {
|
|
||||||
cclogger.ComponentError("CCTopology", "init:getHWThreads: Failed to extract hardware thread ID from ", file)
|
|
||||||
return nil
|
return nil
|
||||||
}
|
}
|
||||||
|
|
||||||
// Convert hardware thread ID to int
|
hwThreadIDs := make([]int, len(files))
|
||||||
|
for i, file := range files {
|
||||||
|
// Extract hardware thread ID
|
||||||
|
matches := regex.FindStringSubmatch(file)
|
||||||
|
if len(matches) != 2 {
|
||||||
|
cclogger.ComponentError("CCTopology", "init:getHWThreads: Failed to extract hardware thread ID from ", file)
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
|
||||||
|
// Convert hardware thread ID to int
|
||||||
|
id, err := strconv.Atoi(matches[1])
|
||||||
|
if err != nil {
|
||||||
|
cclogger.ComponentError("CCTopology", "init:getHWThreads: Failed to convert to int hardware thread ID ", matches[1])
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
|
||||||
|
hwThreadIDs[i] = id
|
||||||
|
}
|
||||||
|
|
||||||
|
// Sort hardware thread IDs
|
||||||
|
slices.Sort(hwThreadIDs)
|
||||||
|
return hwThreadIDs
|
||||||
|
}
|
||||||
|
|
||||||
|
getNumaDomain :=
|
||||||
|
func(basePath string) int {
|
||||||
|
globPath := filepath.Join(basePath, "node*")
|
||||||
|
regexPath := filepath.Join(basePath, "node([[:digit:]]+)")
|
||||||
|
regex := regexp.MustCompile(regexPath)
|
||||||
|
|
||||||
|
// File globbing for NUMA node
|
||||||
|
files, err := filepath.Glob(globPath)
|
||||||
|
if err != nil {
|
||||||
|
cclogger.ComponentError("CCTopology", "init:getNumaDomain", err.Error())
|
||||||
|
return -1
|
||||||
|
}
|
||||||
|
|
||||||
|
// Check, that exactly one NUMA domain was found
|
||||||
|
if len(files) != 1 {
|
||||||
|
cclogger.ComponentError("CCTopology", "init:getNumaDomain", "Number of NUMA domains != 1: ", len(files))
|
||||||
|
return -1
|
||||||
|
}
|
||||||
|
|
||||||
|
// Extract NUMA node ID
|
||||||
|
matches := regex.FindStringSubmatch(files[0])
|
||||||
|
if len(matches) != 2 {
|
||||||
|
cclogger.ComponentError("CCTopology", "init:getNumaDomain", "Failed to extract NUMA node ID from: ", files[0])
|
||||||
|
return -1
|
||||||
|
}
|
||||||
|
|
||||||
id, err := strconv.Atoi(matches[1])
|
id, err := strconv.Atoi(matches[1])
|
||||||
if err != nil {
|
if err != nil {
|
||||||
cclogger.ComponentError("CCTopology", "init:getHWThreads: Failed to convert to int hardware thread ID ", matches[1])
|
cclogger.ComponentError("CCTopology", "init:getNumaDomain", "Failed to parse NUMA node ID from: ", matches[1])
|
||||||
return nil
|
return -1
|
||||||
}
|
}
|
||||||
|
|
||||||
hwThreadIDs[i] = id
|
return id
|
||||||
}
|
}
|
||||||
|
|
||||||
// Sort hardware thread IDs
|
|
||||||
slices.Sort(hwThreadIDs)
|
|
||||||
return hwThreadIDs
|
|
||||||
}
|
|
||||||
|
|
||||||
getNumaDomain := func(basePath string) int {
|
|
||||||
globPath := filepath.Join(basePath, "node*")
|
|
||||||
regexPath := filepath.Join(basePath, "node([[:digit:]]+)")
|
|
||||||
regex := regexp.MustCompile(regexPath)
|
|
||||||
|
|
||||||
// File globbing for NUMA node
|
|
||||||
files, err := filepath.Glob(globPath)
|
|
||||||
if err != nil {
|
|
||||||
cclogger.ComponentError("CCTopology", "init:getNumaDomain", err.Error())
|
|
||||||
return -1
|
|
||||||
}
|
|
||||||
|
|
||||||
// Check, that exactly one NUMA domain was found
|
|
||||||
if len(files) != 1 {
|
|
||||||
cclogger.ComponentError("CCTopology", "init:getNumaDomain", "Number of NUMA domains != 1: ", len(files))
|
|
||||||
return -1
|
|
||||||
}
|
|
||||||
|
|
||||||
// Extract NUMA node ID
|
|
||||||
matches := regex.FindStringSubmatch(files[0])
|
|
||||||
if len(matches) != 2 {
|
|
||||||
cclogger.ComponentError("CCTopology", "init:getNumaDomain", "Failed to extract NUMA node ID from: ", files[0])
|
|
||||||
return -1
|
|
||||||
}
|
|
||||||
|
|
||||||
id, err := strconv.Atoi(matches[1])
|
|
||||||
if err != nil {
|
|
||||||
cclogger.ComponentError("CCTopology", "init:getNumaDomain", "Failed to parse NUMA node ID from: ", matches[1])
|
|
||||||
return -1
|
|
||||||
}
|
|
||||||
|
|
||||||
return id
|
|
||||||
}
|
|
||||||
|
|
||||||
cache.HwthreadList = getHWThreads()
|
cache.HwthreadList = getHWThreads()
|
||||||
cache.CoreList = make([]int, len(cache.HwthreadList))
|
cache.CoreList = make([]int, len(cache.HwthreadList))
|
||||||
cache.SocketList = make([]int, len(cache.HwthreadList))
|
cache.SocketList = make([]int, len(cache.HwthreadList))
|
||||||
@@ -215,15 +212,16 @@ func init() {
|
|||||||
// Lookup NUMA domain id
|
// Lookup NUMA domain id
|
||||||
cache.NumaDomainList[i] = getNumaDomain(cpuBase)
|
cache.NumaDomainList[i] = getNumaDomain(cpuBase)
|
||||||
|
|
||||||
cache.CpuData[i] = HwthreadEntry{
|
cache.CpuData[i] =
|
||||||
CpuID: cache.HwthreadList[i],
|
HwthreadEntry{
|
||||||
SMT: cache.SMTList[i],
|
CpuID: cache.HwthreadList[i],
|
||||||
CoreCPUsList: coreCPUsList,
|
SMT: cache.SMTList[i],
|
||||||
Socket: cache.SocketList[i],
|
CoreCPUsList: coreCPUsList,
|
||||||
NumaDomain: cache.NumaDomainList[i],
|
Socket: cache.SocketList[i],
|
||||||
Die: cache.DieList[i],
|
NumaDomain: cache.NumaDomainList[i],
|
||||||
Core: cache.CoreList[i],
|
Die: cache.DieList[i],
|
||||||
}
|
Core: cache.CoreList[i],
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
slices.Sort(cache.HwthreadList)
|
slices.Sort(cache.HwthreadList)
|
||||||
@@ -255,6 +253,12 @@ func HwthreadList() []int {
|
|||||||
return slices.Clone(cache.HwthreadList)
|
return slices.Clone(cache.HwthreadList)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Get list of hardware thread IDs in the order of listing in /proc/cpuinfo
|
||||||
|
// Deprecated! Use HwthreadList()
|
||||||
|
func CpuList() []int {
|
||||||
|
return HwthreadList()
|
||||||
|
}
|
||||||
|
|
||||||
// CoreList gets the list of CPU core IDs in the order of listing in /proc/cpuinfo
|
// CoreList gets the list of CPU core IDs in the order of listing in /proc/cpuinfo
|
||||||
func CoreList() []int {
|
func CoreList() []int {
|
||||||
return slices.Clone(cache.CoreList)
|
return slices.Clone(cache.CoreList)
|
||||||
@@ -293,19 +297,20 @@ func GetTypeList(topology_type string) []int {
|
|||||||
}
|
}
|
||||||
|
|
||||||
func GetTypeId(hwt HwthreadEntry, topology_type string) (int, error) {
|
func GetTypeId(hwt HwthreadEntry, topology_type string) (int, error) {
|
||||||
|
var err error = nil
|
||||||
switch topology_type {
|
switch topology_type {
|
||||||
case "node":
|
case "node":
|
||||||
return 0, nil
|
return 0, err
|
||||||
case "socket":
|
case "socket":
|
||||||
return hwt.Socket, nil
|
return hwt.Socket, err
|
||||||
case "die":
|
case "die":
|
||||||
return hwt.Die, nil
|
return hwt.Die, err
|
||||||
case "memoryDomain":
|
case "memoryDomain":
|
||||||
return hwt.NumaDomain, nil
|
return hwt.NumaDomain, err
|
||||||
case "core":
|
case "core":
|
||||||
return hwt.Core, nil
|
return hwt.Core, err
|
||||||
case "hwthread":
|
case "hwthread":
|
||||||
return hwt.CpuID, nil
|
return hwt.CpuID, err
|
||||||
}
|
}
|
||||||
return -1, fmt.Errorf("unknown topology type '%s'", topology_type)
|
return -1, fmt.Errorf("unknown topology type '%s'", topology_type)
|
||||||
}
|
}
|
||||||
|
|||||||
125
pkg/hostlist/hostlist.go
Normal file
125
pkg/hostlist/hostlist.go
Normal file
@@ -0,0 +1,125 @@
|
|||||||
|
package hostlist
|
||||||
|
|
||||||
|
import (
|
||||||
|
"fmt"
|
||||||
|
"regexp"
|
||||||
|
"sort"
|
||||||
|
"strconv"
|
||||||
|
"strings"
|
||||||
|
)
|
||||||
|
|
||||||
|
// Regular expressions used by Expand. They are compiled once at package
// initialization instead of on every call.
var (
	// reHostlistEntry matches one host list entry at the start of the input:
	// a prefix of DNS characters, an optional bracketed range list and an
	// optional suffix of DNS characters.
	reHostlistEntry = regexp.MustCompile("^([a-zA-Z0-9-]+)([[][0-9,-]+[]])?([a-zA-Z0-9-]+)?")

	// reHostlistRanges validates a bracketed range list like "[1,3-5,07]":
	// comma separated elements, each either a number or "number-number".
	reHostlistRanges = regexp.MustCompile(
		"[[]([[:digit:]]+,|[[:digit:]]+-[[:digit:]]+,)*([[:digit:]]+|[[:digit:]]+-[[:digit:]]+)[]]")
)

// Expand expands a compact host list such as "n[01-03,07]-ib, m1" into the
// individual host names it describes.
//
// Entries are separated by commas and/or spaces. Each entry consists of a
// prefix of DNS characters (letters, digits, '-'), an optional bracketed list
// of numbers and ranges, and an optional suffix. A range "a-b" is expanded to
// all numbers from a to b inclusive; the numbers are zero padded if both limits
// are written with the same number of digits.
//
// The result is sorted and free of duplicates. An input without any entry
// yields a nil result and a nil error. An error is returned if the input is not
// a valid host list, if a range limit does not fit into an unsigned 64 bit
// integer, or if the start of a range is greater than its end.
func Expand(in string) (result []string, err error) {
	const delimiters = ", "

	// Skip delimiters in front of the first entry
	in = strings.TrimLeft(in, delimiters)

	for len(in) > 0 {
		v := reHostlistEntry.FindStringSubmatch(in)
		if v == nil {
			return nil, fmt.Errorf("not a hostlist: %s", in)
		}

		// matched prefix, range and suffix
		hlPrefix, hlRanges, hlSuffix := v[1], v[2], v[3]

		// Consume the matched entry and the delimiters following it.
		// The prefix is never empty, so every iteration makes progress.
		in = strings.TrimLeft(in[len(v[0]):], delimiters)

		// Single node without ranges
		if hlRanges == "" {
			result = append(result, hlPrefix)
			continue
		}

		// Node with ranges
		if !reHostlistRanges.MatchString(hlRanges) {
			return nil, fmt.Errorf("not a hostlist range: %s", hlRanges)
		}

		// Remove braces and split host ranges at ,
		for _, hlRange := range strings.Split(hlRanges[1:len(hlRanges)-1], ",") {
			result, err = appendHostRange(result, hlPrefix, hlRange, hlSuffix)
			if err != nil {
				return nil, err
			}
		}
	}

	if result != nil {
		// sort
		sort.Strings(result)

		// uniq: compact in place, keeping the first host of every run of
		// equal neighbours (the write position never overtakes the read position)
		unique := result[:1]
		for _, host := range result[1:] {
			if host != unique[len(unique)-1] {
				unique = append(unique, host)
			}
		}
		result = unique
	}

	return result, nil
}

// appendHostRange expands one element of a range list, either a single number
// ("07") or a range ("01-04"), and appends the resulting host names
// prefix+number+suffix to hosts.
func appendHostRange(hosts []string, prefix, hlRange, suffix string) ([]string, error) {
	// Split host range at -
	limits := strings.Split(hlRange, "-")

	// Range is only a single number: use it verbatim to keep its padding
	if len(limits) == 1 {
		return append(hosts, prefix+limits[0]+suffix), nil
	}

	// Range has a start and an end. The limits consist of decimal digits only
	// (checked by the caller), so parsing can only fail on overflow. Ignoring
	// that error would turn the limit into MaxUint64 and produce a huge or
	// even endless expansion.
	iStart, err := strconv.ParseUint(limits[0], 10, 64)
	if err != nil {
		return nil, fmt.Errorf("invalid range start in %s: %w", hlRange, err)
	}
	iEnd, err := strconv.ParseUint(limits[1], 10, 64)
	if err != nil {
		return nil, fmt.Errorf("invalid range end in %s: %w", hlRange, err)
	}
	if iStart > iEnd {
		return nil, fmt.Errorf("single range start is greater than end: %s", hlRange)
	}

	// Numbers are zero padded only if start and end have the same width
	width := len(limits[0])
	doPadding := width == len(limits[1])

	// Add nodes from this range. The loop terminates by comparing for
	// equality after the append, because "i <= iEnd" can never become false
	// when iEnd is the largest uint64 value.
	for i := iStart; ; i++ {
		var number string
		if doPadding {
			number = fmt.Sprintf("%0*d", width, i)
		} else {
			number = strconv.FormatUint(i, 10)
		}
		hosts = append(hosts, prefix+number+suffix)
		if i == iEnd {
			break
		}
	}
	return hosts, nil
}
|
||||||
126
pkg/hostlist/hostlist_test.go
Normal file
126
pkg/hostlist/hostlist_test.go
Normal file
@@ -0,0 +1,126 @@
|
|||||||
|
package hostlist
|
||||||
|
|
||||||
|
import (
|
||||||
|
"testing"
|
||||||
|
)
|
||||||
|
|
||||||
|
func TestExpand(t *testing.T) {
|
||||||
|
|
||||||
|
// Compare two slices of strings
|
||||||
|
equal := func(a, b []string) bool {
|
||||||
|
if len(a) != len(b) {
|
||||||
|
return false
|
||||||
|
}
|
||||||
|
for i, v := range a {
|
||||||
|
if v != b[i] {
|
||||||
|
return false
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return true
|
||||||
|
}
|
||||||
|
|
||||||
|
type testDefinition struct {
|
||||||
|
input string
|
||||||
|
resultExpected []string
|
||||||
|
errorExpected bool
|
||||||
|
}
|
||||||
|
|
||||||
|
expandTests := []testDefinition{
|
||||||
|
{
|
||||||
|
// Single node
|
||||||
|
input: "n1",
|
||||||
|
resultExpected: []string{"n1"},
|
||||||
|
errorExpected: false,
|
||||||
|
},
|
||||||
|
{
|
||||||
|
// Single node, duplicated
|
||||||
|
input: "n1,n1",
|
||||||
|
resultExpected: []string{"n1"},
|
||||||
|
errorExpected: false,
|
||||||
|
},
|
||||||
|
{
|
||||||
|
// Single node with padding
|
||||||
|
input: "n[01]",
|
||||||
|
resultExpected: []string{"n01"},
|
||||||
|
errorExpected: false,
|
||||||
|
},
|
||||||
|
{
|
||||||
|
// Single node with suffix
|
||||||
|
input: "n[01]-p",
|
||||||
|
resultExpected: []string{"n01-p"},
|
||||||
|
errorExpected: false,
|
||||||
|
},
|
||||||
|
{
|
||||||
|
// Multiple nodes with a single range
|
||||||
|
input: "n[1-2]",
|
||||||
|
resultExpected: []string{"n1", "n2"},
|
||||||
|
errorExpected: false,
|
||||||
|
},
|
||||||
|
{
|
||||||
|
// Multiple nodes with a single range and a single index
|
||||||
|
input: "n[1-2,3]",
|
||||||
|
resultExpected: []string{"n1", "n2", "n3"},
|
||||||
|
errorExpected: false,
|
||||||
|
},
|
||||||
|
{
|
||||||
|
// Multiple nodes with different prefixes
|
||||||
|
input: "n[1-2],m[1,2]",
|
||||||
|
resultExpected: []string{"m1", "m2", "n1", "n2"},
|
||||||
|
errorExpected: false,
|
||||||
|
},
|
||||||
|
{
|
||||||
|
// Multiple nodes with different suffixes
|
||||||
|
input: "n[1-2]-p,n[1,2]-q",
|
||||||
|
resultExpected: []string{"n1-p", "n1-q", "n2-p", "n2-q"},
|
||||||
|
errorExpected: false,
|
||||||
|
},
|
||||||
|
{
|
||||||
|
// Multiple nodes with and without node ranges
|
||||||
|
input: " n09, n[01-04,06-07,09] , , n10,n04",
|
||||||
|
resultExpected: []string{"n01", "n02", "n03", "n04", "n06", "n07", "n09", "n10"},
|
||||||
|
errorExpected: false,
|
||||||
|
},
|
||||||
|
{
|
||||||
|
// Forbidden DNS character
|
||||||
|
input: "n@",
|
||||||
|
resultExpected: []string{},
|
||||||
|
errorExpected: true,
|
||||||
|
},
|
||||||
|
{
|
||||||
|
// Forbidden range
|
||||||
|
input: "n[1-2-2,3]",
|
||||||
|
resultExpected: []string{},
|
||||||
|
errorExpected: true,
|
||||||
|
},
|
||||||
|
{
|
||||||
|
// Forbidden range limits
|
||||||
|
input: "n[2-1]",
|
||||||
|
resultExpected: []string{},
|
||||||
|
errorExpected: true,
|
||||||
|
},
|
||||||
|
}
|
||||||
|
|
||||||
|
for _, expandTest := range expandTests {
|
||||||
|
result, err := Expand(expandTest.input)
|
||||||
|
|
||||||
|
hasError := err != nil
|
||||||
|
if hasError != expandTest.errorExpected && hasError {
|
||||||
|
t.Errorf("Expand('%s') failed: unexpected error '%v'",
|
||||||
|
expandTest.input, err)
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
if hasError != expandTest.errorExpected && !hasError {
|
||||||
|
t.Errorf("Expand('%s') did not fail as expected: got result '%+v'",
|
||||||
|
expandTest.input, result)
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
if !hasError && !equal(result, expandTest.resultExpected) {
|
||||||
|
t.Errorf("Expand('%s') failed: got result '%+v', expected result '%v'",
|
||||||
|
expandTest.input, result, expandTest.resultExpected)
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
|
||||||
|
t.Logf("Checked hostlist.Expand('%s'): result = '%+v', err = '%v'",
|
||||||
|
expandTest.input, result, err)
|
||||||
|
}
|
||||||
|
}
|
||||||
@@ -1,14 +1,3 @@
|
|||||||
<!--
|
|
||||||
---
|
|
||||||
title: Multi-channel Ticker
|
|
||||||
description: Timer ticker that sends out the tick to multiple channels
|
|
||||||
categories: [cc-metric-collector]
|
|
||||||
tags: ['Developer']
|
|
||||||
weight: 1
|
|
||||||
hugo_path: docs/reference/cc-metric-collector/pkg/multichanticker/_index.md
|
|
||||||
---
|
|
||||||
-->
|
|
||||||
|
|
||||||
# MultiChanTicker
|
# MultiChanTicker
|
||||||
|
|
||||||
The idea of this ticker is to multiply the output channels. The original Golang `time.Ticker` provides only a single output channel, so each tick can only be received by a single consumer. This ticker allows adding multiple channels, all of which are notified about each time tick.
|
The idea of this ticker is to multiply the output channels. The original Golang `time.Ticker` provides only a single output channel, so each tick can only be received by a single consumer. This ticker allows adding multiple channels, all of which are notified about each time tick.
|
||||||
|
|||||||
@@ -1,16 +1,9 @@
|
|||||||
// Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
|
|
||||||
// All rights reserved. This file is part of cc-lib.
|
|
||||||
// Use of this source code is governed by a MIT-style
|
|
||||||
// license that can be found in the LICENSE file.
|
|
||||||
// additional authors:
|
|
||||||
// Holger Obermaier (NHR@KIT)
|
|
||||||
|
|
||||||
package multiChanTicker
|
package multiChanTicker
|
||||||
|
|
||||||
import (
|
import (
|
||||||
"time"
|
"time"
|
||||||
|
|
||||||
cclog "github.com/ClusterCockpit/cc-lib/v2/ccLogger"
|
cclog "github.com/ClusterCockpit/cc-lib/ccLogger"
|
||||||
)
|
)
|
||||||
|
|
||||||
type multiChanTicker struct {
|
type multiChanTicker struct {
|
||||||
@@ -21,7 +14,7 @@ type multiChanTicker struct {
|
|||||||
|
|
||||||
type MultiChanTicker interface {
|
type MultiChanTicker interface {
|
||||||
Init(duration time.Duration)
|
Init(duration time.Duration)
|
||||||
AddChannel(channel chan time.Time)
|
AddChannel(chan time.Time)
|
||||||
Close()
|
Close()
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
@@ -1,6 +1,6 @@
|
|||||||
{
|
{
|
||||||
"process_messages" : {
|
"process_messages" : {
|
||||||
"add_tags_if": [
|
"add_tag_if": [
|
||||||
{
|
{
|
||||||
"key" : "cluster",
|
"key" : "cluster",
|
||||||
"value" : "testcluster",
|
"value" : "testcluster",
|
||||||
@@ -12,7 +12,7 @@
|
|||||||
"if" : "name == 'temp_package_id_0'"
|
"if" : "name == 'temp_package_id_0'"
|
||||||
}
|
}
|
||||||
],
|
],
|
||||||
"delete_meta_if": [
|
"delete_tag_if": [
|
||||||
{
|
{
|
||||||
"key" : "unit",
|
"key" : "unit",
|
||||||
"if" : "true"
|
"if" : "true"
|
||||||
@@ -1,6 +1,4 @@
|
|||||||
Package: cc-metric-collector
|
Package: cc-metric-collector
|
||||||
Section: misc
|
|
||||||
Priority: optional
|
|
||||||
Version: {VERSION}
|
Version: {VERSION}
|
||||||
Installed-Size: {INSTALLED_SIZE}
|
Installed-Size: {INSTALLED_SIZE}
|
||||||
Architecture: {ARCH}
|
Architecture: {ARCH}
|
||||||
|
|||||||
@@ -29,12 +29,12 @@ make
|
|||||||
|
|
||||||
|
|
||||||
%install
|
%install
|
||||||
install -Dpm 0755 %{name} %{buildroot}%{_bindir}/%{name}
|
install -Dpm 0750 %{name} %{buildroot}%{_bindir}/%{name}
|
||||||
install -Dpm 0600 example-configs/config.json %{buildroot}%{_sysconfdir}/%{name}/%{name}.json
|
install -Dpm 0600 config.json %{buildroot}%{_sysconfdir}/%{name}/%{name}.json
|
||||||
install -Dpm 0600 example-configs/collectors.json %{buildroot}%{_sysconfdir}/%{name}/collectors.json
|
install -Dpm 0600 collectors.json %{buildroot}%{_sysconfdir}/%{name}/collectors.json
|
||||||
install -Dpm 0600 example-configs/sinks.json %{buildroot}%{_sysconfdir}/%{name}/sinks.json
|
install -Dpm 0600 sinks.json %{buildroot}%{_sysconfdir}/%{name}/sinks.json
|
||||||
install -Dpm 0600 example-configs/receivers.json %{buildroot}%{_sysconfdir}/%{name}/receivers.json
|
install -Dpm 0600 receivers.json %{buildroot}%{_sysconfdir}/%{name}/receivers.json
|
||||||
install -Dpm 0600 example-configs/router.json %{buildroot}%{_sysconfdir}/%{name}/router.json
|
install -Dpm 0600 router.json %{buildroot}%{_sysconfdir}/%{name}/router.json
|
||||||
install -Dpm 0644 scripts/%{name}.service %{buildroot}%{_unitdir}/%{name}.service
|
install -Dpm 0644 scripts/%{name}.service %{buildroot}%{_unitdir}/%{name}.service
|
||||||
install -Dpm 0600 scripts/%{name}.config %{buildroot}%{_sysconfdir}/default/%{name}
|
install -Dpm 0600 scripts/%{name}.config %{buildroot}%{_sysconfdir}/default/%{name}
|
||||||
install -Dpm 0644 scripts/%{name}.sysusers %{buildroot}%{_sysusersdir}/%{name}.conf
|
install -Dpm 0644 scripts/%{name}.sysusers %{buildroot}%{_sysusersdir}/%{name}.conf
|
||||||
@@ -54,7 +54,7 @@ install -Dpm 0644 scripts/%{name}.sysusers %{buildroot}%{_sysusersdir}/%{name}.c
|
|||||||
|
|
||||||
%files
|
%files
|
||||||
# Binary
|
# Binary
|
||||||
%attr(-,root,root) %{_bindir}/%{name}
|
%attr(-,clustercockpit,clustercockpit) %{_bindir}/%{name}
|
||||||
# Config
|
# Config
|
||||||
%dir %{_sysconfdir}/%{name}
|
%dir %{_sysconfdir}/%{name}
|
||||||
%attr(0600,clustercockpit,clustercockpit) %config(noreplace) %{_sysconfdir}/%{name}/%{name}.json
|
%attr(0600,clustercockpit,clustercockpit) %config(noreplace) %{_sysconfdir}/%{name}/%{name}.json
|
||||||
|
|||||||
@@ -44,8 +44,6 @@ def group_to_json(groupfile):
|
|||||||
scope = "socket"
|
scope = "socket"
|
||||||
if "PWR" in calc:
|
if "PWR" in calc:
|
||||||
scope = "socket"
|
scope = "socket"
|
||||||
if "UMC" in calc:
|
|
||||||
scope = "socket"
|
|
||||||
|
|
||||||
m = {"name" : metric, "calc": calc, "type" : scope, "publish" : True}
|
m = {"name" : metric, "calc": calc, "type" : scope, "publish" : True}
|
||||||
metrics.append(m)
|
metrics.append(m)
|
||||||
|
|||||||
Reference in New Issue
Block a user