mirror of
https://github.com/ClusterCockpit/cc-metric-collector.git
synced 2026-02-13 14:41:45 +01:00
Compare commits
69 Commits
nvidia_ene
...
golangci-m
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
a5012950ba | ||
|
|
9bc7cacc11 | ||
|
|
f1e5e3374a | ||
|
|
9868772cba | ||
|
|
2f76a2d113 | ||
|
|
135d3e599c | ||
|
|
22be59dc2a | ||
|
|
0990035d63 | ||
|
|
cca0d23efa | ||
|
|
7cff283001 | ||
|
|
fa45d0d973 | ||
|
|
e70fd658f0 | ||
|
|
c58790cd54 | ||
|
|
67ee09ffef | ||
|
|
7f575269eb | ||
|
|
c8cd11796c | ||
|
|
92f6c75d23 | ||
|
|
62d40cfe00 | ||
|
|
ece1a52082 | ||
|
|
398aa207a9 | ||
|
|
b51bf592d0 | ||
|
|
6243203880 | ||
|
|
c7c9f8c273 | ||
|
|
6a4ad067ac | ||
|
|
ed2378f794 | ||
|
|
99e066ff5f | ||
|
|
67cdbefb02 | ||
|
|
b522aca693 | ||
|
|
ea7c4f4ec7 | ||
|
|
09cf89a951 | ||
|
|
d6499935a4 | ||
|
|
3e19c47ae4 | ||
|
|
97e09f13f4 | ||
|
|
e08bd3d926 | ||
|
|
fc525b7430 | ||
|
|
69d4567ecf | ||
|
|
c5183feafc | ||
|
|
a45366646e | ||
|
|
a551616566 | ||
|
|
a9fa168117 | ||
|
|
39d37597ab | ||
|
|
aeaba0021b | ||
|
|
5ceffb44b4 | ||
|
|
e29942a4be | ||
|
|
0b9b9a6e68 | ||
|
|
b47cb3a0c4 | ||
|
|
b49ae7b612 | ||
|
|
1fc5cc8483 | ||
|
|
e81099af8d | ||
|
|
eaca327d73 | ||
|
|
2e48996d87 | ||
|
|
7cdbada522 | ||
|
|
babe1e020d | ||
|
|
776af72231 | ||
|
|
2d4894b8f7 | ||
|
|
35295b0b3a | ||
|
|
1e734baa35 | ||
|
|
aa6181a018 | ||
|
|
0a2a85f2ce | ||
|
|
48f5afe2be | ||
|
|
979192af4e | ||
|
|
c1032ff329 | ||
|
|
6b03d3aee8 | ||
|
|
b9665d0d68 | ||
|
|
4c7a0e064f | ||
|
|
d8f10384a1 | ||
|
|
f74d856e69 | ||
|
|
fabb37ea70 | ||
|
|
3a0f148728 |
11
.github/dependabot.yml
vendored
Normal file
11
.github/dependabot.yml
vendored
Normal file
@@ -0,0 +1,11 @@
|
|||||||
|
# To get started with Dependabot version updates, you'll need to specify which
|
||||||
|
# package ecosystems to update and where the package manifests are located.
|
||||||
|
# Please see the documentation for all configuration options:
|
||||||
|
# https://docs.github.com/code-security/dependabot/dependabot-version-updates/configuration-options-for-the-dependabot.yml-file
|
||||||
|
|
||||||
|
version: 2
|
||||||
|
updates:
|
||||||
|
- package-ecosystem: "gomod"
|
||||||
|
directory: "/"
|
||||||
|
schedule:
|
||||||
|
interval: "weekly"
|
||||||
117
.github/workflows/Release.yml
vendored
117
.github/workflows/Release.yml
vendored
@@ -36,22 +36,14 @@ jobs:
|
|||||||
# fetch-depth must be 0 to use git describe
|
# fetch-depth must be 0 to use git describe
|
||||||
# See: https://github.com/marketplace/actions/checkout
|
# See: https://github.com/marketplace/actions/checkout
|
||||||
- name: Checkout
|
- name: Checkout
|
||||||
uses: actions/checkout@v4
|
uses: actions/checkout@v6
|
||||||
with:
|
with:
|
||||||
submodules: recursive
|
submodules: recursive
|
||||||
fetch-depth: 0
|
fetch-depth: 0
|
||||||
|
|
||||||
# - name: Setup Golang
|
|
||||||
# uses: actions/setup-go@v5
|
|
||||||
# with:
|
|
||||||
# go-version: 'stable'
|
|
||||||
- name: Setup Golang
|
- name: Setup Golang
|
||||||
run: |
|
run: |
|
||||||
dnf --assumeyes --disableplugin=subscription-manager install \
|
dnf --assumeyes --disableplugin=subscription-manager --enablerepo appstream install go-toolset
|
||||||
https://repo.almalinux.org/almalinux/8/AppStream/x86_64/os/Packages/go-toolset-1.22.9-1.module_el8.10.0+3938+8c723e16.x86_64.rpm \
|
|
||||||
https://repo.almalinux.org/almalinux/8/AppStream/x86_64/os/Packages/golang-1.22.9-1.module_el8.10.0+3938+8c723e16.x86_64.rpm \
|
|
||||||
https://repo.almalinux.org/almalinux/8/AppStream/x86_64/os/Packages/golang-bin-1.22.9-1.module_el8.10.0+3938+8c723e16.x86_64.rpm \
|
|
||||||
https://repo.almalinux.org/almalinux/8/AppStream/x86_64/os/Packages/golang-src-1.22.9-1.module_el8.10.0+3938+8c723e16.noarch.rpm
|
|
||||||
|
|
||||||
- name: RPM build MetricCollector
|
- name: RPM build MetricCollector
|
||||||
id: rpmbuild
|
id: rpmbuild
|
||||||
@@ -78,13 +70,13 @@ jobs:
|
|||||||
|
|
||||||
# See: https://github.com/actions/upload-artifact
|
# See: https://github.com/actions/upload-artifact
|
||||||
- name: Save RPM as artifact
|
- name: Save RPM as artifact
|
||||||
uses: actions/upload-artifact@v4
|
uses: actions/upload-artifact@v6
|
||||||
with:
|
with:
|
||||||
name: cc-metric-collector RPM for AlmaLinux 8
|
name: cc-metric-collector RPM for AlmaLinux 8
|
||||||
path: ${{ steps.rpmrename.outputs.RPM }}
|
path: ${{ steps.rpmrename.outputs.RPM }}
|
||||||
overwrite: true
|
overwrite: true
|
||||||
- name: Save SRPM as artifact
|
- name: Save SRPM as artifact
|
||||||
uses: actions/upload-artifact@v4
|
uses: actions/upload-artifact@v6
|
||||||
with:
|
with:
|
||||||
name: cc-metric-collector SRPM for AlmaLinux 8
|
name: cc-metric-collector SRPM for AlmaLinux 8
|
||||||
path: ${{ steps.rpmrename.outputs.SRPM }}
|
path: ${{ steps.rpmrename.outputs.SRPM }}
|
||||||
@@ -114,23 +106,14 @@ jobs:
|
|||||||
# fetch-depth must be 0 to use git describe
|
# fetch-depth must be 0 to use git describe
|
||||||
# See: https://github.com/marketplace/actions/checkout
|
# See: https://github.com/marketplace/actions/checkout
|
||||||
- name: Checkout
|
- name: Checkout
|
||||||
uses: actions/checkout@v4
|
uses: actions/checkout@v6
|
||||||
with:
|
with:
|
||||||
submodules: recursive
|
submodules: recursive
|
||||||
fetch-depth: 0
|
fetch-depth: 0
|
||||||
|
|
||||||
# - name: Setup Golang
|
|
||||||
# uses: actions/setup-go@v5
|
|
||||||
# with:
|
|
||||||
# go-version: 'stable'
|
|
||||||
- name: Setup Golang
|
- name: Setup Golang
|
||||||
run: |
|
run: |
|
||||||
dnf --assumeyes --disableplugin=subscription-manager install \
|
dnf --assumeyes --disableplugin=subscription-manager --enablerepo appstream install go-toolset
|
||||||
https://repo.almalinux.org/almalinux/9/AppStream/x86_64/os/Packages/go-toolset-1.22.7-2.el9_5.x86_64.rpm \
|
|
||||||
https://repo.almalinux.org/almalinux/9/AppStream/x86_64/os/Packages/golang-1.22.7-2.el9_5.x86_64.rpm \
|
|
||||||
https://repo.almalinux.org/almalinux/9/AppStream/x86_64/os/Packages/golang-bin-1.22.7-2.el9_5.x86_64.rpm \
|
|
||||||
https://repo.almalinux.org/almalinux/9/AppStream/x86_64/os/Packages/golang-src-1.22.7-2.el9_5.noarch.rpm \
|
|
||||||
https://repo.almalinux.org/almalinux/9/AppStream/x86_64/os/Packages/golang-race-1.22.7-2.el9_5.x86_64.rpm
|
|
||||||
|
|
||||||
- name: RPM build MetricCollector
|
- name: RPM build MetricCollector
|
||||||
id: rpmbuild
|
id: rpmbuild
|
||||||
@@ -157,25 +140,26 @@ jobs:
|
|||||||
|
|
||||||
# See: https://github.com/actions/upload-artifact
|
# See: https://github.com/actions/upload-artifact
|
||||||
- name: Save RPM as artifact
|
- name: Save RPM as artifact
|
||||||
uses: actions/upload-artifact@v4
|
uses: actions/upload-artifact@v6
|
||||||
with:
|
with:
|
||||||
name: cc-metric-collector RPM for AlmaLinux 9
|
name: cc-metric-collector RPM for AlmaLinux 9
|
||||||
path: ${{ steps.rpmrename.outputs.RPM }}
|
path: ${{ steps.rpmrename.outputs.RPM }}
|
||||||
overwrite: true
|
overwrite: true
|
||||||
- name: Save SRPM as artifact
|
- name: Save SRPM as artifact
|
||||||
uses: actions/upload-artifact@v4
|
uses: actions/upload-artifact@v6
|
||||||
with:
|
with:
|
||||||
name: cc-metric-collector SRPM for AlmaLinux 9
|
name: cc-metric-collector SRPM for AlmaLinux 9
|
||||||
path: ${{ steps.rpmrename.outputs.SRPM }}
|
path: ${{ steps.rpmrename.outputs.SRPM }}
|
||||||
overwrite: true
|
overwrite: true
|
||||||
|
|
||||||
#
|
#
|
||||||
# Build on UBI 8 using go-toolset
|
# Build on Red Hat Universal Base Image (UBI 8) using go-toolset
|
||||||
#
|
#
|
||||||
UBI-8-RPM-build:
|
UBI-8-RPM-build:
|
||||||
runs-on: ubuntu-latest
|
runs-on: ubuntu-latest
|
||||||
# See: https://catalog.redhat.com/software/containers/ubi8/ubi/5c35984d70cc534b3a3784e?container-tabs=gti
|
# See: https://catalog.redhat.com/en/search?searchType=Containers&q=Red+Hat+Universal+Base+Image+8
|
||||||
container: registry.access.redhat.com/ubi8/ubi:8.8-1032.1692772289
|
# https://hub.docker.com/r/redhat/ubi8
|
||||||
|
container: redhat/ubi8
|
||||||
# The job outputs link to the outputs of the 'rpmbuild' step
|
# The job outputs link to the outputs of the 'rpmbuild' step
|
||||||
outputs:
|
outputs:
|
||||||
rpm : ${{steps.rpmbuild.outputs.RPM}}
|
rpm : ${{steps.rpmbuild.outputs.RPM}}
|
||||||
@@ -190,22 +174,14 @@ jobs:
|
|||||||
# fetch-depth must be 0 to use git describe
|
# fetch-depth must be 0 to use git describe
|
||||||
# See: https://github.com/marketplace/actions/checkout
|
# See: https://github.com/marketplace/actions/checkout
|
||||||
- name: Checkout
|
- name: Checkout
|
||||||
uses: actions/checkout@v4
|
uses: actions/checkout@v6
|
||||||
with:
|
with:
|
||||||
submodules: recursive
|
submodules: recursive
|
||||||
fetch-depth: 0
|
fetch-depth: 0
|
||||||
|
|
||||||
# - name: Setup Golang
|
|
||||||
# uses: actions/setup-go@v5
|
|
||||||
# with:
|
|
||||||
# go-version: 'stable'
|
|
||||||
- name: Setup Golang
|
- name: Setup Golang
|
||||||
run: |
|
run: |
|
||||||
dnf --assumeyes --disableplugin=subscription-manager install \
|
dnf --assumeyes --disableplugin=subscription-manager --enablerepo ubi-8-appstream-rpms install go-toolset
|
||||||
https://repo.almalinux.org/almalinux/8/AppStream/x86_64/os/Packages/go-toolset-1.22.9-1.module_el8.10.0+3938+8c723e16.x86_64.rpm \
|
|
||||||
https://repo.almalinux.org/almalinux/8/AppStream/x86_64/os/Packages/golang-1.22.9-1.module_el8.10.0+3938+8c723e16.x86_64.rpm \
|
|
||||||
https://repo.almalinux.org/almalinux/8/AppStream/x86_64/os/Packages/golang-bin-1.22.9-1.module_el8.10.0+3938+8c723e16.x86_64.rpm \
|
|
||||||
https://repo.almalinux.org/almalinux/8/AppStream/x86_64/os/Packages/golang-src-1.22.9-1.module_el8.10.0+3938+8c723e16.noarch.rpm
|
|
||||||
|
|
||||||
- name: RPM build MetricCollector
|
- name: RPM build MetricCollector
|
||||||
id: rpmbuild
|
id: rpmbuild
|
||||||
@@ -215,24 +191,25 @@ jobs:
|
|||||||
|
|
||||||
# See: https://github.com/actions/upload-artifact
|
# See: https://github.com/actions/upload-artifact
|
||||||
- name: Save RPM as artifact
|
- name: Save RPM as artifact
|
||||||
uses: actions/upload-artifact@v4
|
uses: actions/upload-artifact@v6
|
||||||
with:
|
with:
|
||||||
name: cc-metric-collector RPM for UBI 8
|
name: cc-metric-collector RPM for UBI 8
|
||||||
path: ${{ steps.rpmbuild.outputs.RPM }}
|
path: ${{ steps.rpmbuild.outputs.RPM }}
|
||||||
overwrite: true
|
overwrite: true
|
||||||
- name: Save SRPM as artifact
|
- name: Save SRPM as artifact
|
||||||
uses: actions/upload-artifact@v4
|
uses: actions/upload-artifact@v6
|
||||||
with:
|
with:
|
||||||
name: cc-metric-collector SRPM for UBI 8
|
name: cc-metric-collector SRPM for UBI 8
|
||||||
path: ${{ steps.rpmbuild.outputs.SRPM }}
|
path: ${{ steps.rpmbuild.outputs.SRPM }}
|
||||||
overwrite: true
|
overwrite: true
|
||||||
|
|
||||||
#
|
#
|
||||||
# Build on UBI 9 using go-toolset
|
# Build on Red Hat Universal Base Image (UBI 9) using go-toolset
|
||||||
#
|
#
|
||||||
UBI-9-RPM-build:
|
UBI-9-RPM-build:
|
||||||
runs-on: ubuntu-latest
|
runs-on: ubuntu-latest
|
||||||
# See: https://catalog.redhat.com/software/containers/ubi8/ubi/5c359854d70cc534b3a3784e?container-tabs=gti
|
# See: https://catalog.redhat.com/en/search?searchType=Containers&q=Red+Hat+Universal+Base+Image+9
|
||||||
|
# https://hub.docker.com/r/redhat/ubi9
|
||||||
container: redhat/ubi9
|
container: redhat/ubi9
|
||||||
# The job outputs link to the outputs of the 'rpmbuild' step
|
# The job outputs link to the outputs of the 'rpmbuild' step
|
||||||
# The job outputs link to the outputs of the 'rpmbuild' step
|
# The job outputs link to the outputs of the 'rpmbuild' step
|
||||||
@@ -249,24 +226,14 @@ jobs:
|
|||||||
# fetch-depth must be 0 to use git describe
|
# fetch-depth must be 0 to use git describe
|
||||||
# See: https://github.com/marketplace/actions/checkout
|
# See: https://github.com/marketplace/actions/checkout
|
||||||
- name: Checkout
|
- name: Checkout
|
||||||
uses: actions/checkout@v4
|
uses: actions/checkout@v6
|
||||||
with:
|
with:
|
||||||
submodules: recursive
|
submodules: recursive
|
||||||
fetch-depth: 0
|
fetch-depth: 0
|
||||||
|
|
||||||
# See: https://github.com/marketplace/actions/setup-go-environment
|
|
||||||
# - name: Setup Golang
|
|
||||||
# uses: actions/setup-go@v5
|
|
||||||
# with:
|
|
||||||
# go-version: 'stable'
|
|
||||||
- name: Setup Golang
|
- name: Setup Golang
|
||||||
run: |
|
run: |
|
||||||
dnf --assumeyes --disableplugin=subscription-manager install \
|
dnf --assumeyes --disableplugin=subscription-manager --enablerepo ubi-9-appstream-rpms install go-toolset
|
||||||
https://repo.almalinux.org/almalinux/9/AppStream/x86_64/os/Packages/go-toolset-1.22.7-2.el9_5.x86_64.rpm \
|
|
||||||
https://repo.almalinux.org/almalinux/9/AppStream/x86_64/os/Packages/golang-1.22.7-2.el9_5.x86_64.rpm \
|
|
||||||
https://repo.almalinux.org/almalinux/9/AppStream/x86_64/os/Packages/golang-bin-1.22.7-2.el9_5.x86_64.rpm \
|
|
||||||
https://repo.almalinux.org/almalinux/9/AppStream/x86_64/os/Packages/golang-src-1.22.7-2.el9_5.noarch.rpm \
|
|
||||||
https://repo.almalinux.org/almalinux/9/AppStream/x86_64/os/Packages/golang-race-1.22.7-2.el9_5.x86_64.rpm
|
|
||||||
|
|
||||||
- name: RPM build MetricCollector
|
- name: RPM build MetricCollector
|
||||||
id: rpmbuild
|
id: rpmbuild
|
||||||
@@ -276,13 +243,13 @@ jobs:
|
|||||||
|
|
||||||
# See: https://github.com/actions/upload-artifact
|
# See: https://github.com/actions/upload-artifact
|
||||||
- name: Save RPM as artifact
|
- name: Save RPM as artifact
|
||||||
uses: actions/upload-artifact@v4
|
uses: actions/upload-artifact@v6
|
||||||
with:
|
with:
|
||||||
name: cc-metric-collector RPM for UBI 9
|
name: cc-metric-collector RPM for UBI 9
|
||||||
path: ${{ steps.rpmbuild.outputs.RPM }}
|
path: ${{ steps.rpmbuild.outputs.RPM }}
|
||||||
overwrite: true
|
overwrite: true
|
||||||
- name: Save SRPM as artifact
|
- name: Save SRPM as artifact
|
||||||
uses: actions/upload-artifact@v4
|
uses: actions/upload-artifact@v6
|
||||||
with:
|
with:
|
||||||
name: cc-metric-collector SRPM for UBI 9
|
name: cc-metric-collector SRPM for UBI 9
|
||||||
path: ${{ steps.rpmbuild.outputs.SRPM }}
|
path: ${{ steps.rpmbuild.outputs.SRPM }}
|
||||||
@@ -308,13 +275,14 @@ jobs:
|
|||||||
# fetch-depth must be 0 to use git describe
|
# fetch-depth must be 0 to use git describe
|
||||||
# See: https://github.com/marketplace/actions/checkout
|
# See: https://github.com/marketplace/actions/checkout
|
||||||
- name: Checkout
|
- name: Checkout
|
||||||
uses: actions/checkout@v4
|
uses: actions/checkout@v6
|
||||||
with:
|
with:
|
||||||
submodules: recursive
|
submodules: recursive
|
||||||
fetch-depth: 0
|
fetch-depth: 0
|
||||||
|
# Use official golang package
|
||||||
|
# See: https://github.com/marketplace/actions/setup-go-environment
|
||||||
- name: Setup Golang
|
- name: Setup Golang
|
||||||
uses: actions/setup-go@v5
|
uses: actions/setup-go@v6
|
||||||
with:
|
with:
|
||||||
go-version: 'stable'
|
go-version: 'stable'
|
||||||
|
|
||||||
@@ -332,7 +300,7 @@ jobs:
|
|||||||
echo "DEB=${NEW_DEB_FILE}" >> $GITHUB_OUTPUT
|
echo "DEB=${NEW_DEB_FILE}" >> $GITHUB_OUTPUT
|
||||||
# See: https://github.com/actions/upload-artifact
|
# See: https://github.com/actions/upload-artifact
|
||||||
- name: Save DEB as artifact
|
- name: Save DEB as artifact
|
||||||
uses: actions/upload-artifact@v4
|
uses: actions/upload-artifact@v6
|
||||||
with:
|
with:
|
||||||
name: cc-metric-collector DEB for Ubuntu 22.04
|
name: cc-metric-collector DEB for Ubuntu 22.04
|
||||||
path: ${{ steps.debrename.outputs.DEB }}
|
path: ${{ steps.debrename.outputs.DEB }}
|
||||||
@@ -358,13 +326,14 @@ jobs:
|
|||||||
# fetch-depth must be 0 to use git describe
|
# fetch-depth must be 0 to use git describe
|
||||||
# See: https://github.com/marketplace/actions/checkout
|
# See: https://github.com/marketplace/actions/checkout
|
||||||
- name: Checkout
|
- name: Checkout
|
||||||
uses: actions/checkout@v4
|
uses: actions/checkout@v6
|
||||||
with:
|
with:
|
||||||
submodules: recursive
|
submodules: recursive
|
||||||
fetch-depth: 0
|
fetch-depth: 0
|
||||||
|
# Use official golang package
|
||||||
|
# See: https://github.com/marketplace/actions/setup-go-environment
|
||||||
- name: Setup Golang
|
- name: Setup Golang
|
||||||
uses: actions/setup-go@v5
|
uses: actions/setup-go@v6
|
||||||
with:
|
with:
|
||||||
go-version: 'stable'
|
go-version: 'stable'
|
||||||
|
|
||||||
@@ -382,7 +351,7 @@ jobs:
|
|||||||
echo "DEB=${NEW_DEB_FILE}" >> $GITHUB_OUTPUT
|
echo "DEB=${NEW_DEB_FILE}" >> $GITHUB_OUTPUT
|
||||||
# See: https://github.com/actions/upload-artifact
|
# See: https://github.com/actions/upload-artifact
|
||||||
- name: Save DEB as artifact
|
- name: Save DEB as artifact
|
||||||
uses: actions/upload-artifact@v4
|
uses: actions/upload-artifact@v6
|
||||||
with:
|
with:
|
||||||
name: cc-metric-collector DEB for Ubuntu 24.04
|
name: cc-metric-collector DEB for Ubuntu 24.04
|
||||||
path: ${{ steps.debrename.outputs.DEB }}
|
path: ${{ steps.debrename.outputs.DEB }}
|
||||||
@@ -400,48 +369,48 @@ jobs:
|
|||||||
steps:
|
steps:
|
||||||
# See: https://github.com/actions/download-artifact
|
# See: https://github.com/actions/download-artifact
|
||||||
- name: Download AlmaLinux 8 RPM
|
- name: Download AlmaLinux 8 RPM
|
||||||
uses: actions/download-artifact@v4
|
uses: actions/download-artifact@v7
|
||||||
with:
|
with:
|
||||||
name: cc-metric-collector RPM for AlmaLinux 8
|
name: cc-metric-collector RPM for AlmaLinux 8
|
||||||
- name: Download AlmaLinux 8 SRPM
|
- name: Download AlmaLinux 8 SRPM
|
||||||
uses: actions/download-artifact@v4
|
uses: actions/download-artifact@v7
|
||||||
with:
|
with:
|
||||||
name: cc-metric-collector SRPM for AlmaLinux 8
|
name: cc-metric-collector SRPM for AlmaLinux 8
|
||||||
|
|
||||||
- name: Download AlmaLinux 9 RPM
|
- name: Download AlmaLinux 9 RPM
|
||||||
uses: actions/download-artifact@v4
|
uses: actions/download-artifact@v7
|
||||||
with:
|
with:
|
||||||
name: cc-metric-collector RPM for AlmaLinux 9
|
name: cc-metric-collector RPM for AlmaLinux 9
|
||||||
- name: Download AlmaLinux 9 SRPM
|
- name: Download AlmaLinux 9 SRPM
|
||||||
uses: actions/download-artifact@v4
|
uses: actions/download-artifact@v7
|
||||||
with:
|
with:
|
||||||
name: cc-metric-collector SRPM for AlmaLinux 9
|
name: cc-metric-collector SRPM for AlmaLinux 9
|
||||||
|
|
||||||
- name: Download UBI 8 RPM
|
- name: Download UBI 8 RPM
|
||||||
uses: actions/download-artifact@v4
|
uses: actions/download-artifact@v7
|
||||||
with:
|
with:
|
||||||
name: cc-metric-collector RPM for UBI 8
|
name: cc-metric-collector RPM for UBI 8
|
||||||
- name: Download UBI 8 SRPM
|
- name: Download UBI 8 SRPM
|
||||||
uses: actions/download-artifact@v4
|
uses: actions/download-artifact@v7
|
||||||
with:
|
with:
|
||||||
name: cc-metric-collector SRPM for UBI 8
|
name: cc-metric-collector SRPM for UBI 8
|
||||||
|
|
||||||
- name: Download UBI 9 RPM
|
- name: Download UBI 9 RPM
|
||||||
uses: actions/download-artifact@v4
|
uses: actions/download-artifact@v7
|
||||||
with:
|
with:
|
||||||
name: cc-metric-collector RPM for UBI 9
|
name: cc-metric-collector RPM for UBI 9
|
||||||
- name: Download UBI 9 SRPM
|
- name: Download UBI 9 SRPM
|
||||||
uses: actions/download-artifact@v4
|
uses: actions/download-artifact@v7
|
||||||
with:
|
with:
|
||||||
name: cc-metric-collector SRPM for UBI 9
|
name: cc-metric-collector SRPM for UBI 9
|
||||||
|
|
||||||
- name: Download Ubuntu 22.04 DEB
|
- name: Download Ubuntu 22.04 DEB
|
||||||
uses: actions/download-artifact@v4
|
uses: actions/download-artifact@v7
|
||||||
with:
|
with:
|
||||||
name: cc-metric-collector DEB for Ubuntu 22.04
|
name: cc-metric-collector DEB for Ubuntu 22.04
|
||||||
|
|
||||||
- name: Download Ubuntu 24.04 DEB
|
- name: Download Ubuntu 24.04 DEB
|
||||||
uses: actions/download-artifact@v4
|
uses: actions/download-artifact@v7
|
||||||
with:
|
with:
|
||||||
name: cc-metric-collector DEB for Ubuntu 24.04
|
name: cc-metric-collector DEB for Ubuntu 24.04
|
||||||
|
|
||||||
|
|||||||
167
.github/workflows/runonce.yml
vendored
167
.github/workflows/runonce.yml
vendored
@@ -20,25 +20,41 @@ jobs:
|
|||||||
# See: https://github.com/marketplace/actions/checkout
|
# See: https://github.com/marketplace/actions/checkout
|
||||||
# Checkout git repository and submodules
|
# Checkout git repository and submodules
|
||||||
- name: Checkout
|
- name: Checkout
|
||||||
uses: actions/checkout@v4
|
uses: actions/checkout@v6
|
||||||
with:
|
with:
|
||||||
submodules: recursive
|
submodules: recursive
|
||||||
|
|
||||||
# See: https://github.com/marketplace/actions/setup-go-environment
|
# See: https://github.com/marketplace/actions/setup-go-environment
|
||||||
- name: Setup Golang
|
- name: Setup Golang
|
||||||
uses: actions/setup-go@v5
|
uses: actions/setup-go@v6
|
||||||
with:
|
with:
|
||||||
go-version: '1.21'
|
go-version: 'stable'
|
||||||
check-latest: true
|
check-latest: true
|
||||||
|
|
||||||
|
- name: Install reviewdog
|
||||||
|
run: |
|
||||||
|
go install github.com/reviewdog/reviewdog/cmd/reviewdog@latest
|
||||||
|
|
||||||
|
# See: https://golangci-lint.run
|
||||||
|
- name: Install GolangCI-Lint
|
||||||
|
run: |
|
||||||
|
go install github.com/golangci/golangci-lint/v2/cmd/golangci-lint@latest
|
||||||
|
|
||||||
- name: Build MetricCollector
|
- name: Build MetricCollector
|
||||||
run: make
|
run: make
|
||||||
|
|
||||||
- name: Run MetricCollector once
|
- name: Run MetricCollector once
|
||||||
run: ./cc-metric-collector --once --config .github/ci-config.json
|
run: ./cc-metric-collector --once --config .github/ci-config.json
|
||||||
|
|
||||||
|
# Running the linter requires likwid.h, which gets downloaded in the build step
|
||||||
|
- name: Static Analysis with GolangCI-Lint and Upload Report with reviewdog
|
||||||
|
run: |
|
||||||
|
golangci-lint run --enable modernize,staticcheck,govet | reviewdog -f=golangci-lint -name "Check golangci-lint on build-latest" -reporter=github-check -filter-mode=nofilter -fail-level none
|
||||||
|
env:
|
||||||
|
REVIEWDOG_GITHUB_API_TOKEN: ${{ secrets.GITHUB_TOKEN }}
|
||||||
|
|
||||||
#
|
#
|
||||||
# Build on AlmaLinux 8
|
# Build on AlmaLinux 8 using go-toolset
|
||||||
#
|
#
|
||||||
AlmaLinux8-RPM-build:
|
AlmaLinux8-RPM-build:
|
||||||
runs-on: ubuntu-latest
|
runs-on: ubuntu-latest
|
||||||
@@ -58,23 +74,14 @@ jobs:
|
|||||||
# fetch-depth must be 0 to use git describe
|
# fetch-depth must be 0 to use git describe
|
||||||
# See: https://github.com/marketplace/actions/checkout
|
# See: https://github.com/marketplace/actions/checkout
|
||||||
- name: Checkout
|
- name: Checkout
|
||||||
uses: actions/checkout@v4
|
uses: actions/checkout@v6
|
||||||
with:
|
with:
|
||||||
submodules: recursive
|
submodules: recursive
|
||||||
fetch-depth: 0
|
fetch-depth: 0
|
||||||
|
|
||||||
# See: https://github.com/marketplace/actions/setup-go-environment
|
|
||||||
# - name: Setup Golang
|
|
||||||
# uses: actions/setup-go@v5
|
|
||||||
# with:
|
|
||||||
# go-version: 'stable'
|
|
||||||
- name: Setup Golang
|
- name: Setup Golang
|
||||||
run: |
|
run: |
|
||||||
dnf --assumeyes --disableplugin=subscription-manager install \
|
dnf --assumeyes --disableplugin=subscription-manager --enablerepo appstream install go-toolset
|
||||||
https://repo.almalinux.org/almalinux/8/AppStream/x86_64/os/Packages/go-toolset-1.22.9-1.module_el8.10.0+3938+8c723e16.x86_64.rpm \
|
|
||||||
https://repo.almalinux.org/almalinux/8/AppStream/x86_64/os/Packages/golang-1.22.9-1.module_el8.10.0+3938+8c723e16.x86_64.rpm \
|
|
||||||
https://repo.almalinux.org/almalinux/8/AppStream/x86_64/os/Packages/golang-bin-1.22.9-1.module_el8.10.0+3938+8c723e16.x86_64.rpm \
|
|
||||||
https://repo.almalinux.org/almalinux/8/AppStream/x86_64/os/Packages/golang-src-1.22.9-1.module_el8.10.0+3938+8c723e16.noarch.rpm
|
|
||||||
|
|
||||||
- name: RPM build MetricCollector
|
- name: RPM build MetricCollector
|
||||||
id: rpmbuild
|
id: rpmbuild
|
||||||
@@ -83,7 +90,7 @@ jobs:
|
|||||||
make RPM
|
make RPM
|
||||||
|
|
||||||
#
|
#
|
||||||
# Build on AlmaLinux 9
|
# Build on AlmaLinux 9 using go-toolset
|
||||||
#
|
#
|
||||||
AlmaLinux9-RPM-build:
|
AlmaLinux9-RPM-build:
|
||||||
runs-on: ubuntu-latest
|
runs-on: ubuntu-latest
|
||||||
@@ -103,24 +110,14 @@ jobs:
|
|||||||
# fetch-depth must be 0 to use git describe
|
# fetch-depth must be 0 to use git describe
|
||||||
# See: https://github.com/marketplace/actions/checkout
|
# See: https://github.com/marketplace/actions/checkout
|
||||||
- name: Checkout
|
- name: Checkout
|
||||||
uses: actions/checkout@v4
|
uses: actions/checkout@v6
|
||||||
with:
|
with:
|
||||||
submodules: recursive
|
submodules: recursive
|
||||||
fetch-depth: 0
|
fetch-depth: 0
|
||||||
|
|
||||||
# See: https://github.com/marketplace/actions/setup-go-environment
|
|
||||||
# - name: Setup Golang
|
|
||||||
# uses: actions/setup-go@v5
|
|
||||||
# with:
|
|
||||||
# go-version: 'stable'
|
|
||||||
- name: Setup Golang
|
- name: Setup Golang
|
||||||
run: |
|
run: |
|
||||||
dnf --assumeyes --disableplugin=subscription-manager install \
|
dnf --assumeyes --disableplugin=subscription-manager --enablerepo appstream install go-toolset
|
||||||
https://repo.almalinux.org/almalinux/9/AppStream/x86_64/os/Packages/go-toolset-1.22.7-2.el9_5.x86_64.rpm \
|
|
||||||
https://repo.almalinux.org/almalinux/9/AppStream/x86_64/os/Packages/golang-1.22.7-2.el9_5.x86_64.rpm \
|
|
||||||
https://repo.almalinux.org/almalinux/9/AppStream/x86_64/os/Packages/golang-bin-1.22.7-2.el9_5.x86_64.rpm \
|
|
||||||
https://repo.almalinux.org/almalinux/9/AppStream/x86_64/os/Packages/golang-src-1.22.7-2.el9_5.noarch.rpm \
|
|
||||||
https://repo.almalinux.org/almalinux/9/AppStream/x86_64/os/Packages/golang-race-1.22.7-2.el9_5.x86_64.rpm
|
|
||||||
|
|
||||||
- name: RPM build MetricCollector
|
- name: RPM build MetricCollector
|
||||||
id: rpmbuild
|
id: rpmbuild
|
||||||
@@ -128,13 +125,49 @@ jobs:
|
|||||||
git config --global --add safe.directory /__w/cc-metric-collector/cc-metric-collector
|
git config --global --add safe.directory /__w/cc-metric-collector/cc-metric-collector
|
||||||
make RPM
|
make RPM
|
||||||
|
|
||||||
|
#
|
||||||
|
# Build on AlmaLinux 10 using go-toolset
|
||||||
|
#
|
||||||
|
AlmaLinux10-RPM-build:
|
||||||
|
runs-on: ubuntu-latest
|
||||||
|
# See: https://hub.docker.com/_/almalinux
|
||||||
|
container: almalinux:10
|
||||||
|
# The job outputs link to the outputs of the 'rpmrename' step
|
||||||
|
# Only job outputs can be used in child jobs
|
||||||
|
steps:
|
||||||
|
|
||||||
|
# Use dnf to install development packages
|
||||||
|
- name: Install development packages
|
||||||
|
run: |
|
||||||
|
dnf --assumeyes group install "Development Tools" "RPM Development Tools"
|
||||||
|
dnf --assumeyes install wget openssl-devel diffutils delve which
|
||||||
|
|
||||||
|
# Checkout git repository and submodules
|
||||||
|
# fetch-depth must be 0 to use git describe
|
||||||
|
# See: https://github.com/marketplace/actions/checkout
|
||||||
|
- name: Checkout
|
||||||
|
uses: actions/checkout@v6
|
||||||
|
with:
|
||||||
|
submodules: recursive
|
||||||
|
fetch-depth: 0
|
||||||
|
|
||||||
|
- name: Setup Golang
|
||||||
|
run: |
|
||||||
|
dnf --assumeyes --disableplugin=subscription-manager --enablerepo appstream install go-toolset
|
||||||
|
|
||||||
|
- name: RPM build MetricCollector
|
||||||
|
id: rpmbuild
|
||||||
|
run: |
|
||||||
|
git config --global --add safe.directory /__w/cc-metric-collector/cc-metric-collector
|
||||||
|
make RPM
|
||||||
|
|
||||||
#
|
#
|
||||||
# Build on UBI 8 using go-toolset
|
# Build on Red Hat Universal Base Image (UBI 8) using go-toolset
|
||||||
#
|
#
|
||||||
UBI-8-RPM-build:
|
UBI-8-RPM-build:
|
||||||
runs-on: ubuntu-latest
|
runs-on: ubuntu-latest
|
||||||
# See: https://catalog.redhat.com/software/containers/ubi8/ubi/5c359854d70cc534b3a3784e?container-tabs=gti
|
# See: https://catalog.redhat.com/en/search?searchType=Containers&q=Red+Hat+Universal+Base+Image+8
|
||||||
|
# https://hub.docker.com/r/redhat/ubi8
|
||||||
container: redhat/ubi8
|
container: redhat/ubi8
|
||||||
# The job outputs link to the outputs of the 'rpmbuild' step
|
# The job outputs link to the outputs of the 'rpmbuild' step
|
||||||
steps:
|
steps:
|
||||||
@@ -147,23 +180,14 @@ jobs:
|
|||||||
# fetch-depth must be 0 to use git describe
|
# fetch-depth must be 0 to use git describe
|
||||||
# See: https://github.com/marketplace/actions/checkout
|
# See: https://github.com/marketplace/actions/checkout
|
||||||
- name: Checkout
|
- name: Checkout
|
||||||
uses: actions/checkout@v4
|
uses: actions/checkout@v6
|
||||||
with:
|
with:
|
||||||
submodules: recursive
|
submodules: recursive
|
||||||
fetch-depth: 0
|
fetch-depth: 0
|
||||||
|
|
||||||
# See: https://github.com/marketplace/actions/setup-go-environment
|
|
||||||
# - name: Setup Golang
|
|
||||||
# uses: actions/setup-go@v5
|
|
||||||
# with:
|
|
||||||
# go-version: 'stable'
|
|
||||||
- name: Setup Golang
|
- name: Setup Golang
|
||||||
run: |
|
run: |
|
||||||
dnf --assumeyes --disableplugin=subscription-manager install \
|
dnf --assumeyes --disableplugin=subscription-manager --enablerepo ubi-8-appstream-rpms install go-toolset
|
||||||
https://repo.almalinux.org/almalinux/8/AppStream/x86_64/os/Packages/go-toolset-1.22.9-1.module_el8.10.0+3938+8c723e16.x86_64.rpm \
|
|
||||||
https://repo.almalinux.org/almalinux/8/AppStream/x86_64/os/Packages/golang-1.22.9-1.module_el8.10.0+3938+8c723e16.x86_64.rpm \
|
|
||||||
https://repo.almalinux.org/almalinux/8/AppStream/x86_64/os/Packages/golang-bin-1.22.9-1.module_el8.10.0+3938+8c723e16.x86_64.rpm \
|
|
||||||
https://repo.almalinux.org/almalinux/8/AppStream/x86_64/os/Packages/golang-src-1.22.9-1.module_el8.10.0+3938+8c723e16.noarch.rpm
|
|
||||||
|
|
||||||
- name: RPM build MetricCollector
|
- name: RPM build MetricCollector
|
||||||
id: rpmbuild
|
id: rpmbuild
|
||||||
@@ -172,11 +196,12 @@ jobs:
|
|||||||
make RPM
|
make RPM
|
||||||
|
|
||||||
#
|
#
|
||||||
# Build on UBI 9 using go-toolset
|
# Build on Red Hat Universal Base Image (UBI 9) using go-toolset
|
||||||
#
|
#
|
||||||
UBI-9-RPM-build:
|
UBI-9-RPM-build:
|
||||||
runs-on: ubuntu-latest
|
runs-on: ubuntu-latest
|
||||||
# See: https://catalog.redhat.com/software/containers/ubi8/ubi/5c359854d70cc534b3a3784e?container-tabs=gti
|
# See: https://catalog.redhat.com/en/search?searchType=Containers&q=Red+Hat+Universal+Base+Image+9
|
||||||
|
# https://hub.docker.com/r/redhat/ubi9
|
||||||
container: redhat/ubi9
|
container: redhat/ubi9
|
||||||
# The job outputs link to the outputs of the 'rpmbuild' step
|
# The job outputs link to the outputs of the 'rpmbuild' step
|
||||||
steps:
|
steps:
|
||||||
@@ -189,24 +214,48 @@ jobs:
|
|||||||
# fetch-depth must be 0 to use git describe
|
# fetch-depth must be 0 to use git describe
|
||||||
# See: https://github.com/marketplace/actions/checkout
|
# See: https://github.com/marketplace/actions/checkout
|
||||||
- name: Checkout
|
- name: Checkout
|
||||||
uses: actions/checkout@v4
|
uses: actions/checkout@v6
|
||||||
with:
|
with:
|
||||||
submodules: recursive
|
submodules: recursive
|
||||||
fetch-depth: 0
|
fetch-depth: 0
|
||||||
|
|
||||||
# See: https://github.com/marketplace/actions/setup-go-environment
|
|
||||||
# - name: Setup Golang
|
|
||||||
# uses: actions/setup-go@v5
|
|
||||||
# with:
|
|
||||||
# go-version: 'stable'
|
|
||||||
- name: Setup Golang
|
- name: Setup Golang
|
||||||
run: |
|
run: |
|
||||||
dnf --assumeyes --disableplugin=subscription-manager install \
|
dnf --assumeyes --disableplugin=subscription-manager --enablerepo ubi-9-appstream-rpms install go-toolset
|
||||||
https://repo.almalinux.org/almalinux/9/AppStream/x86_64/os/Packages/go-toolset-1.22.7-2.el9_5.x86_64.rpm \
|
|
||||||
https://repo.almalinux.org/almalinux/9/AppStream/x86_64/os/Packages/golang-1.22.7-2.el9_5.x86_64.rpm \
|
- name: RPM build MetricCollector
|
||||||
https://repo.almalinux.org/almalinux/9/AppStream/x86_64/os/Packages/golang-bin-1.22.7-2.el9_5.x86_64.rpm \
|
id: rpmbuild
|
||||||
https://repo.almalinux.org/almalinux/9/AppStream/x86_64/os/Packages/golang-src-1.22.7-2.el9_5.noarch.rpm \
|
run: |
|
||||||
https://repo.almalinux.org/almalinux/9/AppStream/x86_64/os/Packages/golang-race-1.22.7-2.el9_5.x86_64.rpm
|
git config --global --add safe.directory /__w/cc-metric-collector/cc-metric-collector
|
||||||
|
make RPM
|
||||||
|
|
||||||
|
#
|
||||||
|
# Build on Red Hat Universal Base Image (UBI 10) using go-toolset
|
||||||
|
#
|
||||||
|
UBI-10-RPM-build:
|
||||||
|
runs-on: ubuntu-latest
|
||||||
|
# See: https://catalog.redhat.com/en/search?searchType=Containers&q=Red+Hat+Universal+Base+Image+10
|
||||||
|
# https://hub.docker.com/r/redhat/ubi10
|
||||||
|
container: redhat/ubi10
|
||||||
|
# The job outputs link to the outputs of the 'rpmbuild' step
|
||||||
|
steps:
|
||||||
|
|
||||||
|
# Use dnf to install development packages
|
||||||
|
- name: Install development packages
|
||||||
|
run: dnf --assumeyes --disableplugin=subscription-manager install rpm-build go-srpm-macros gcc make python3 git wget openssl-devel diffutils delve
|
||||||
|
|
||||||
|
# Checkout git repository and submodules
|
||||||
|
# fetch-depth must be 0 to use git describe
|
||||||
|
# See: https://github.com/marketplace/actions/checkout
|
||||||
|
- name: Checkout
|
||||||
|
uses: actions/checkout@v6
|
||||||
|
with:
|
||||||
|
submodules: recursive
|
||||||
|
fetch-depth: 0
|
||||||
|
|
||||||
|
- name: Setup Golang
|
||||||
|
run: |
|
||||||
|
dnf --assumeyes --disableplugin=subscription-manager --enablerepo ubi-10-for-x86_64-appstream-rpms install go-toolset
|
||||||
|
|
||||||
- name: RPM build MetricCollector
|
- name: RPM build MetricCollector
|
||||||
id: rpmbuild
|
id: rpmbuild
|
||||||
@@ -231,14 +280,14 @@ jobs:
|
|||||||
# fetch-depth must be 0 to use git describe
|
# fetch-depth must be 0 to use git describe
|
||||||
# See: https://github.com/marketplace/actions/checkout
|
# See: https://github.com/marketplace/actions/checkout
|
||||||
- name: Checkout
|
- name: Checkout
|
||||||
uses: actions/checkout@v4
|
uses: actions/checkout@v6
|
||||||
with:
|
with:
|
||||||
submodules: recursive
|
submodules: recursive
|
||||||
fetch-depth: 0
|
fetch-depth: 0
|
||||||
# Use official golang package
|
# Use official golang package
|
||||||
# See: https://github.com/marketplace/actions/setup-go-environment
|
# See: https://github.com/marketplace/actions/setup-go-environment
|
||||||
- name: Setup Golang
|
- name: Setup Golang
|
||||||
uses: actions/setup-go@v5
|
uses: actions/setup-go@v6
|
||||||
with:
|
with:
|
||||||
go-version: 'stable'
|
go-version: 'stable'
|
||||||
|
|
||||||
@@ -265,14 +314,14 @@ jobs:
|
|||||||
# fetch-depth must be 0 to use git describe
|
# fetch-depth must be 0 to use git describe
|
||||||
# See: https://github.com/marketplace/actions/checkout
|
# See: https://github.com/marketplace/actions/checkout
|
||||||
- name: Checkout
|
- name: Checkout
|
||||||
uses: actions/checkout@v4
|
uses: actions/checkout@v6
|
||||||
with:
|
with:
|
||||||
submodules: recursive
|
submodules: recursive
|
||||||
fetch-depth: 0
|
fetch-depth: 0
|
||||||
# Use official golang package
|
# Use official golang package
|
||||||
# See: https://github.com/marketplace/actions/setup-go-environment
|
# See: https://github.com/marketplace/actions/setup-go-environment
|
||||||
- name: Setup Golang
|
- name: Setup Golang
|
||||||
uses: actions/setup-go@v5
|
uses: actions/setup-go@v6
|
||||||
with:
|
with:
|
||||||
go-version: 'stable'
|
go-version: 'stable'
|
||||||
|
|
||||||
|
|||||||
5
Makefile
5
Makefile
@@ -72,6 +72,11 @@ staticcheck:
|
|||||||
$(GOBIN) install honnef.co/go/tools/cmd/staticcheck@latest
|
$(GOBIN) install honnef.co/go/tools/cmd/staticcheck@latest
|
||||||
$$($(GOBIN) env GOPATH)/bin/staticcheck ./...
|
$$($(GOBIN) env GOPATH)/bin/staticcheck ./...
|
||||||
|
|
||||||
|
.PHONY: golangci-lint
|
||||||
|
golangci-lint:
|
||||||
|
$(GOBIN) install github.com/golangci/golangci-lint/v2/cmd/golangci-lint@latest
|
||||||
|
$$($(GOBIN) env GOPATH)/bin/golangci-lint run
|
||||||
|
|
||||||
.ONESHELL:
|
.ONESHELL:
|
||||||
.PHONY: RPM
|
.PHONY: RPM
|
||||||
RPM: scripts/cc-metric-collector.spec
|
RPM: scripts/cc-metric-collector.spec
|
||||||
|
|||||||
10
README.md
10
README.md
@@ -32,13 +32,15 @@ There is a main configuration file with basic settings that point to the other c
|
|||||||
|
|
||||||
``` json
|
``` json
|
||||||
{
|
{
|
||||||
"sinks": "sinks.json",
|
"sinks-file": "sinks.json",
|
||||||
"collectors" : "collectors.json",
|
"collectors-file" : "collectors.json",
|
||||||
"receivers" : "receivers.json",
|
"receivers-file" : "receivers.json",
|
||||||
"router" : "router.json",
|
"router-file" : "router.json",
|
||||||
|
"main": {
|
||||||
"interval": "10s",
|
"interval": "10s",
|
||||||
"duration": "1s"
|
"duration": "1s"
|
||||||
}
|
}
|
||||||
|
}
|
||||||
```
|
```
|
||||||
|
|
||||||
The `interval` defines how often the metrics should be read and sent to the sink. The `duration` tells collectors how long one measurement has to take. This is important for some collectors, like the `likwid` collector. For more information, see [here](./docs/configuration.md).
|
The `interval` defines how often the metrics should be read and sent to the sink. The `duration` tells collectors how long one measurement has to take. This is important for some collectors, like the `likwid` collector. For more information, see [here](./docs/configuration.md).
|
||||||
|
|||||||
@@ -1,3 +1,10 @@
|
|||||||
|
// Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
|
||||||
|
// All rights reserved. This file is part of cc-lib.
|
||||||
|
// Use of this source code is governed by a MIT-style
|
||||||
|
// license that can be found in the LICENSE file.
|
||||||
|
// additional authors:
|
||||||
|
// Holger Obermaier (NHR@KIT)
|
||||||
|
|
||||||
package main
|
package main
|
||||||
|
|
||||||
import (
|
import (
|
||||||
@@ -7,17 +14,17 @@ import (
|
|||||||
"os/signal"
|
"os/signal"
|
||||||
"syscall"
|
"syscall"
|
||||||
|
|
||||||
"github.com/ClusterCockpit/cc-lib/receivers"
|
"github.com/ClusterCockpit/cc-lib/v2/receivers"
|
||||||
"github.com/ClusterCockpit/cc-lib/sinks"
|
"github.com/ClusterCockpit/cc-lib/v2/sinks"
|
||||||
"github.com/ClusterCockpit/cc-metric-collector/collectors"
|
"github.com/ClusterCockpit/cc-metric-collector/collectors"
|
||||||
|
|
||||||
// "strings"
|
// "strings"
|
||||||
"sync"
|
"sync"
|
||||||
"time"
|
"time"
|
||||||
|
|
||||||
ccconf "github.com/ClusterCockpit/cc-lib/ccConfig"
|
ccconf "github.com/ClusterCockpit/cc-lib/v2/ccConfig"
|
||||||
cclog "github.com/ClusterCockpit/cc-lib/ccLogger"
|
cclog "github.com/ClusterCockpit/cc-lib/v2/ccLogger"
|
||||||
lp "github.com/ClusterCockpit/cc-lib/ccMessage"
|
lp "github.com/ClusterCockpit/cc-lib/v2/ccMessage"
|
||||||
mr "github.com/ClusterCockpit/cc-metric-collector/internal/metricRouter"
|
mr "github.com/ClusterCockpit/cc-metric-collector/internal/metricRouter"
|
||||||
mct "github.com/ClusterCockpit/cc-metric-collector/pkg/multiChanTicker"
|
mct "github.com/ClusterCockpit/cc-metric-collector/pkg/multiChanTicker"
|
||||||
)
|
)
|
||||||
|
|||||||
@@ -52,6 +52,7 @@ In contrast to the configuration files for sinks and receivers, the collectors c
|
|||||||
* [`beegfs_meta`](./beegfsmetaMetric.md)
|
* [`beegfs_meta`](./beegfsmetaMetric.md)
|
||||||
* [`beegfs_storage`](./beegfsstorageMetric.md)
|
* [`beegfs_storage`](./beegfsstorageMetric.md)
|
||||||
* [`rocm_smi`](./rocmsmiMetric.md)
|
* [`rocm_smi`](./rocmsmiMetric.md)
|
||||||
|
* [`slurm_cgroup`](./slurmCgroupMetric.md)
|
||||||
|
|
||||||
## Todos
|
## Todos
|
||||||
|
|
||||||
@@ -66,7 +67,7 @@ A collector reads data from any source, parses it to metrics and submits these m
|
|||||||
* `Read(duration time.Duration, output chan ccMessage.CCMessage)`: Read, parse and submit data to the `output` channel as [`CCMessage`](https://github.com/ClusterCockpit/cc-lib/blob/main/ccMessage/README.md). If the collector has to measure anything for some duration, use the provided function argument `duration`.
|
* `Read(duration time.Duration, output chan ccMessage.CCMessage)`: Read, parse and submit data to the `output` channel as [`CCMessage`](https://github.com/ClusterCockpit/cc-lib/blob/main/ccMessage/README.md). If the collector has to measure anything for some duration, use the provided function argument `duration`.
|
||||||
* `Close()`: Closes down the collector.
|
* `Close()`: Closes down the collector.
|
||||||
|
|
||||||
It is recommanded to call `setup()` in the `Init()` function.
|
It is recommended to call `setup()` in the `Init()` function.
|
||||||
|
|
||||||
Finally, the collector needs to be registered in the `collectorManager.go`. There is a list of collectors called `AvailableCollectors` which is a map (`collector_type_string` -> `pointer to MetricCollector interface`). Add a new entry with a descriptive name and the new collector.
|
Finally, the collector needs to be registered in the `collectorManager.go`. There is a list of collectors called `AvailableCollectors` which is a map (`collector_type_string` -> `pointer to MetricCollector interface`). Add a new entry with a descriptive name and the new collector.
|
||||||
|
|
||||||
@@ -99,11 +100,12 @@ func (m *SampleCollector) Init(config json.RawMessage) error {
|
|||||||
}
|
}
|
||||||
|
|
||||||
m.name = "SampleCollector"
|
m.name = "SampleCollector"
|
||||||
m.setup()
|
if err := m.setup(); err != nil {
|
||||||
|
return fmt.Errorf("%s Init(): setup() call failed: %w", m.name, err)
|
||||||
|
}
|
||||||
if len(config) > 0 {
|
if len(config) > 0 {
|
||||||
err := json.Unmarshal(config, &m.config)
|
if err := json.Unmarshal(config, &m.config); err != nil {
|
||||||
if err != nil {
|
return fmt.Errorf("%s Init(): json.Unmarshal() call failed: %w", m.name, err)
|
||||||
return err
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
m.meta = map[string]string{"source": m.name, "group": "Sample"}
|
m.meta = map[string]string{"source": m.name, "group": "Sample"}
|
||||||
|
|||||||
@@ -1,3 +1,10 @@
|
|||||||
|
// Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
|
||||||
|
// All rights reserved. This file is part of cc-lib.
|
||||||
|
// Use of this source code is governed by a MIT-style
|
||||||
|
// license that can be found in the LICENSE file.
|
||||||
|
// additional authors:
|
||||||
|
// Holger Obermaier (NHR@KIT)
|
||||||
|
|
||||||
package collectors
|
package collectors
|
||||||
|
|
||||||
import (
|
import (
|
||||||
@@ -10,12 +17,13 @@ import (
|
|||||||
"os/exec"
|
"os/exec"
|
||||||
"os/user"
|
"os/user"
|
||||||
"regexp"
|
"regexp"
|
||||||
|
"slices"
|
||||||
"strconv"
|
"strconv"
|
||||||
"strings"
|
"strings"
|
||||||
"time"
|
"time"
|
||||||
|
|
||||||
cclog "github.com/ClusterCockpit/cc-lib/ccLogger"
|
cclog "github.com/ClusterCockpit/cc-lib/v2/ccLogger"
|
||||||
lp "github.com/ClusterCockpit/cc-lib/ccMessage"
|
lp "github.com/ClusterCockpit/cc-lib/v2/ccMessage"
|
||||||
)
|
)
|
||||||
|
|
||||||
const DEFAULT_BEEGFS_CMD = "beegfs-ctl"
|
const DEFAULT_BEEGFS_CMD = "beegfs-ctl"
|
||||||
@@ -54,7 +62,9 @@ func (m *BeegfsMetaCollector) Init(config json.RawMessage) error {
|
|||||||
"rmXA", "setXA", "mirror"}
|
"rmXA", "setXA", "mirror"}
|
||||||
|
|
||||||
m.name = "BeegfsMetaCollector"
|
m.name = "BeegfsMetaCollector"
|
||||||
m.setup()
|
if err := m.setup(); err != nil {
|
||||||
|
return fmt.Errorf("%s Init(): setup() call failed: %w", m.name, err)
|
||||||
|
}
|
||||||
m.parallel = true
|
m.parallel = true
|
||||||
// Set default beegfs-ctl binary
|
// Set default beegfs-ctl binary
|
||||||
|
|
||||||
@@ -71,8 +81,7 @@ func (m *BeegfsMetaCollector) Init(config json.RawMessage) error {
|
|||||||
//create map with possible variables
|
//create map with possible variables
|
||||||
m.matches = make(map[string]string)
|
m.matches = make(map[string]string)
|
||||||
for _, value := range nodeMdstat_array {
|
for _, value := range nodeMdstat_array {
|
||||||
_, skip := stringArrayContains(m.config.ExcludeMetrics, value)
|
if slices.Contains(m.config.ExcludeMetrics, value) {
|
||||||
if skip {
|
|
||||||
m.matches["other"] = "0"
|
m.matches["other"] = "0"
|
||||||
} else {
|
} else {
|
||||||
m.matches["beegfs_cmeta_"+value] = "0"
|
m.matches["beegfs_cmeta_"+value] = "0"
|
||||||
@@ -216,7 +225,7 @@ func (m *BeegfsMetaCollector) Read(interval time.Duration, output chan lp.CCMess
|
|||||||
|
|
||||||
for key, data := range m.matches {
|
for key, data := range m.matches {
|
||||||
value, _ := strconv.ParseFloat(data, 32)
|
value, _ := strconv.ParseFloat(data, 32)
|
||||||
y, err := lp.NewMessage(key, m.tags, m.meta, map[string]interface{}{"value": value}, time.Now())
|
y, err := lp.NewMessage(key, m.tags, m.meta, map[string]any{"value": value}, time.Now())
|
||||||
if err == nil {
|
if err == nil {
|
||||||
output <- y
|
output <- y
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -1,3 +1,10 @@
|
|||||||
|
// Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
|
||||||
|
// All rights reserved. This file is part of cc-lib.
|
||||||
|
// Use of this source code is governed by a MIT-style
|
||||||
|
// license that can be found in the LICENSE file.
|
||||||
|
// additional authors:
|
||||||
|
// Holger Obermaier (NHR@KIT)
|
||||||
|
|
||||||
package collectors
|
package collectors
|
||||||
|
|
||||||
import (
|
import (
|
||||||
@@ -10,12 +17,13 @@ import (
|
|||||||
"os/exec"
|
"os/exec"
|
||||||
"os/user"
|
"os/user"
|
||||||
"regexp"
|
"regexp"
|
||||||
|
"slices"
|
||||||
"strconv"
|
"strconv"
|
||||||
"strings"
|
"strings"
|
||||||
"time"
|
"time"
|
||||||
|
|
||||||
cclog "github.com/ClusterCockpit/cc-lib/ccLogger"
|
cclog "github.com/ClusterCockpit/cc-lib/v2/ccLogger"
|
||||||
lp "github.com/ClusterCockpit/cc-lib/ccMessage"
|
lp "github.com/ClusterCockpit/cc-lib/v2/ccMessage"
|
||||||
)
|
)
|
||||||
|
|
||||||
// Struct for the collector-specific JSON config
|
// Struct for the collector-specific JSON config
|
||||||
@@ -47,7 +55,9 @@ func (m *BeegfsStorageCollector) Init(config json.RawMessage) error {
|
|||||||
"storInf", "unlnk"}
|
"storInf", "unlnk"}
|
||||||
|
|
||||||
m.name = "BeegfsStorageCollector"
|
m.name = "BeegfsStorageCollector"
|
||||||
m.setup()
|
if err := m.setup(); err != nil {
|
||||||
|
return fmt.Errorf("%s Init(): setup() call failed: %w", m.name, err)
|
||||||
|
}
|
||||||
m.parallel = true
|
m.parallel = true
|
||||||
// Set default beegfs-ctl binary
|
// Set default beegfs-ctl binary
|
||||||
|
|
||||||
@@ -64,8 +74,7 @@ func (m *BeegfsStorageCollector) Init(config json.RawMessage) error {
|
|||||||
//create map with possible variables
|
//create map with possible variables
|
||||||
m.matches = make(map[string]string)
|
m.matches = make(map[string]string)
|
||||||
for _, value := range storageStat_array {
|
for _, value := range storageStat_array {
|
||||||
_, skip := stringArrayContains(m.config.ExcludeMetrics, value)
|
if slices.Contains(m.config.ExcludeMetrics, value) {
|
||||||
if skip {
|
|
||||||
m.matches["other"] = "0"
|
m.matches["other"] = "0"
|
||||||
} else {
|
} else {
|
||||||
m.matches["beegfs_cstorage_"+value] = "0"
|
m.matches["beegfs_cstorage_"+value] = "0"
|
||||||
@@ -208,7 +217,7 @@ func (m *BeegfsStorageCollector) Read(interval time.Duration, output chan lp.CCM
|
|||||||
|
|
||||||
for key, data := range m.matches {
|
for key, data := range m.matches {
|
||||||
value, _ := strconv.ParseFloat(data, 32)
|
value, _ := strconv.ParseFloat(data, 32)
|
||||||
y, err := lp.NewMessage(key, m.tags, m.meta, map[string]interface{}{"value": value}, time.Now())
|
y, err := lp.NewMessage(key, m.tags, m.meta, map[string]any{"value": value}, time.Now())
|
||||||
if err == nil {
|
if err == nil {
|
||||||
output <- y
|
output <- y
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -1,12 +1,20 @@
|
|||||||
|
// Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
|
||||||
|
// All rights reserved. This file is part of cc-lib.
|
||||||
|
// Use of this source code is governed by a MIT-style
|
||||||
|
// license that can be found in the LICENSE file.
|
||||||
|
// additional authors:
|
||||||
|
// Holger Obermaier (NHR@KIT)
|
||||||
|
|
||||||
package collectors
|
package collectors
|
||||||
|
|
||||||
import (
|
import (
|
||||||
"encoding/json"
|
"encoding/json"
|
||||||
|
"fmt"
|
||||||
"sync"
|
"sync"
|
||||||
"time"
|
"time"
|
||||||
|
|
||||||
cclog "github.com/ClusterCockpit/cc-lib/ccLogger"
|
cclog "github.com/ClusterCockpit/cc-lib/v2/ccLogger"
|
||||||
lp "github.com/ClusterCockpit/cc-lib/ccMessage"
|
lp "github.com/ClusterCockpit/cc-lib/v2/ccMessage"
|
||||||
mct "github.com/ClusterCockpit/cc-metric-collector/pkg/multiChanTicker"
|
mct "github.com/ClusterCockpit/cc-metric-collector/pkg/multiChanTicker"
|
||||||
)
|
)
|
||||||
|
|
||||||
@@ -40,6 +48,7 @@ var AvailableCollectors = map[string]MetricCollector{
|
|||||||
"self": new(SelfCollector),
|
"self": new(SelfCollector),
|
||||||
"schedstat": new(SchedstatCollector),
|
"schedstat": new(SchedstatCollector),
|
||||||
"nfsiostat": new(NfsIOStatCollector),
|
"nfsiostat": new(NfsIOStatCollector),
|
||||||
|
"slurm_cgroup": new(SlurmCgroupCollector),
|
||||||
}
|
}
|
||||||
|
|
||||||
// Metric collector manager data structure
|
// Metric collector manager data structure
|
||||||
@@ -96,7 +105,7 @@ func (cm *collectorManager) Init(ticker mct.MultiChanTicker, duration time.Durat
|
|||||||
|
|
||||||
err = collector.Init(collectorCfg)
|
err = collector.Init(collectorCfg)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
cclog.ComponentError("CollectorManager", "Collector", collectorName, "initialization failed:", err.Error())
|
cclog.ComponentError("CollectorManager", fmt.Sprintf("Collector %s initialization failed: %v", collectorName, err))
|
||||||
continue
|
continue
|
||||||
}
|
}
|
||||||
cclog.ComponentDebug("CollectorManager", "ADD COLLECTOR", collector.Name())
|
cclog.ComponentDebug("CollectorManager", "ADD COLLECTOR", collector.Name())
|
||||||
|
|||||||
@@ -1,3 +1,10 @@
|
|||||||
|
// Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
|
||||||
|
// All rights reserved. This file is part of cc-lib.
|
||||||
|
// Use of this source code is governed by a MIT-style
|
||||||
|
// license that can be found in the LICENSE file.
|
||||||
|
// additional authors:
|
||||||
|
// Holger Obermaier (NHR@KIT)
|
||||||
|
|
||||||
package collectors
|
package collectors
|
||||||
|
|
||||||
import (
|
import (
|
||||||
@@ -10,8 +17,8 @@ import (
|
|||||||
"strings"
|
"strings"
|
||||||
"time"
|
"time"
|
||||||
|
|
||||||
cclog "github.com/ClusterCockpit/cc-lib/ccLogger"
|
cclog "github.com/ClusterCockpit/cc-lib/v2/ccLogger"
|
||||||
lp "github.com/ClusterCockpit/cc-lib/ccMessage"
|
lp "github.com/ClusterCockpit/cc-lib/v2/ccMessage"
|
||||||
)
|
)
|
||||||
|
|
||||||
// CPUFreqCollector
|
// CPUFreqCollector
|
||||||
@@ -34,9 +41,10 @@ func (m *CPUFreqCpuInfoCollector) Init(config json.RawMessage) error {
|
|||||||
return nil
|
return nil
|
||||||
}
|
}
|
||||||
|
|
||||||
m.setup()
|
|
||||||
|
|
||||||
m.name = "CPUFreqCpuInfoCollector"
|
m.name = "CPUFreqCpuInfoCollector"
|
||||||
|
if err := m.setup(); err != nil {
|
||||||
|
return fmt.Errorf("%s Init(): setup() call failed: %w", m.name, err)
|
||||||
|
}
|
||||||
m.parallel = true
|
m.parallel = true
|
||||||
m.meta = map[string]string{
|
m.meta = map[string]string{
|
||||||
"source": m.name,
|
"source": m.name,
|
||||||
@@ -49,7 +57,6 @@ func (m *CPUFreqCpuInfoCollector) Init(config json.RawMessage) error {
|
|||||||
if err != nil {
|
if err != nil {
|
||||||
return fmt.Errorf("failed to open file '%s': %v", cpuInfoFile, err)
|
return fmt.Errorf("failed to open file '%s': %v", cpuInfoFile, err)
|
||||||
}
|
}
|
||||||
defer file.Close()
|
|
||||||
|
|
||||||
// Collect topology information from file cpuinfo
|
// Collect topology information from file cpuinfo
|
||||||
foundFreq := false
|
foundFreq := false
|
||||||
@@ -79,6 +86,10 @@ func (m *CPUFreqCpuInfoCollector) Init(config json.RawMessage) error {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if err := file.Close(); err != nil {
|
||||||
|
return fmt.Errorf("%s Init(): Call to file.Close() failed: %w", m.name, err)
|
||||||
|
}
|
||||||
|
|
||||||
// were all topology information collected?
|
// were all topology information collected?
|
||||||
if foundFreq &&
|
if foundFreq &&
|
||||||
len(processor) > 0 &&
|
len(processor) > 0 &&
|
||||||
@@ -133,7 +144,13 @@ func (m *CPUFreqCpuInfoCollector) Read(interval time.Duration, output chan lp.CC
|
|||||||
fmt.Sprintf("Read(): Failed to open file '%s': %v", cpuInfoFile, err))
|
fmt.Sprintf("Read(): Failed to open file '%s': %v", cpuInfoFile, err))
|
||||||
return
|
return
|
||||||
}
|
}
|
||||||
defer file.Close()
|
defer func() {
|
||||||
|
if err := file.Close(); err != nil {
|
||||||
|
cclog.ComponentError(
|
||||||
|
m.name,
|
||||||
|
fmt.Sprintf("Read(): Failed to close file '%s': %v", cpuInfoFile, err))
|
||||||
|
}
|
||||||
|
}()
|
||||||
|
|
||||||
processorCounter := 0
|
processorCounter := 0
|
||||||
now := time.Now()
|
now := time.Now()
|
||||||
@@ -154,7 +171,7 @@ func (m *CPUFreqCpuInfoCollector) Read(interval time.Duration, output chan lp.CC
|
|||||||
fmt.Sprintf("Read(): Failed to convert cpu MHz '%s' to float64: %v", lineSplit[1], err))
|
fmt.Sprintf("Read(): Failed to convert cpu MHz '%s' to float64: %v", lineSplit[1], err))
|
||||||
return
|
return
|
||||||
}
|
}
|
||||||
if y, err := lp.NewMessage("cpufreq", t.tagSet, m.meta, map[string]interface{}{"value": value}, now); err == nil {
|
if y, err := lp.NewMessage("cpufreq", t.tagSet, m.meta, map[string]any{"value": value}, now); err == nil {
|
||||||
output <- y
|
output <- y
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -1,3 +1,10 @@
|
|||||||
|
// Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
|
||||||
|
// All rights reserved. This file is part of cc-lib.
|
||||||
|
// Use of this source code is governed by a MIT-style
|
||||||
|
// license that can be found in the LICENSE file.
|
||||||
|
// additional authors:
|
||||||
|
// Holger Obermaier (NHR@KIT)
|
||||||
|
|
||||||
package collectors
|
package collectors
|
||||||
|
|
||||||
import (
|
import (
|
||||||
@@ -9,8 +16,8 @@ import (
|
|||||||
"strings"
|
"strings"
|
||||||
"time"
|
"time"
|
||||||
|
|
||||||
cclog "github.com/ClusterCockpit/cc-lib/ccLogger"
|
cclog "github.com/ClusterCockpit/cc-lib/v2/ccLogger"
|
||||||
lp "github.com/ClusterCockpit/cc-lib/ccMessage"
|
lp "github.com/ClusterCockpit/cc-lib/v2/ccMessage"
|
||||||
"github.com/ClusterCockpit/cc-metric-collector/pkg/ccTopology"
|
"github.com/ClusterCockpit/cc-metric-collector/pkg/ccTopology"
|
||||||
"golang.org/x/sys/unix"
|
"golang.org/x/sys/unix"
|
||||||
)
|
)
|
||||||
@@ -41,7 +48,9 @@ func (m *CPUFreqCollector) Init(config json.RawMessage) error {
|
|||||||
}
|
}
|
||||||
|
|
||||||
m.name = "CPUFreqCollector"
|
m.name = "CPUFreqCollector"
|
||||||
m.setup()
|
if err := m.setup(); err != nil {
|
||||||
|
return fmt.Errorf("%s Init(): setup() call failed: %w", m.name, err)
|
||||||
|
}
|
||||||
m.parallel = true
|
m.parallel = true
|
||||||
if len(config) > 0 {
|
if len(config) > 0 {
|
||||||
err := json.Unmarshal(config, &m.config)
|
err := json.Unmarshal(config, &m.config)
|
||||||
@@ -117,7 +126,7 @@ func (m *CPUFreqCollector) Read(interval time.Duration, output chan lp.CCMessage
|
|||||||
continue
|
continue
|
||||||
}
|
}
|
||||||
|
|
||||||
if y, err := lp.NewMessage("cpufreq", t.tagSet, m.meta, map[string]interface{}{"value": cpuFreq}, now); err == nil {
|
if y, err := lp.NewMessage("cpufreq", t.tagSet, m.meta, map[string]any{"value": cpuFreq}, now); err == nil {
|
||||||
output <- y
|
output <- y
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -1,3 +1,10 @@
|
|||||||
|
// Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
|
||||||
|
// All rights reserved. This file is part of cc-lib.
|
||||||
|
// Use of this source code is governed by a MIT-style
|
||||||
|
// license that can be found in the LICENSE file.
|
||||||
|
// additional authors:
|
||||||
|
// Holger Obermaier (NHR@KIT)
|
||||||
|
|
||||||
package collectors
|
package collectors
|
||||||
|
|
||||||
import (
|
import (
|
||||||
@@ -5,12 +12,13 @@ import (
|
|||||||
"encoding/json"
|
"encoding/json"
|
||||||
"fmt"
|
"fmt"
|
||||||
"os"
|
"os"
|
||||||
|
"slices"
|
||||||
"strconv"
|
"strconv"
|
||||||
"strings"
|
"strings"
|
||||||
"time"
|
"time"
|
||||||
|
|
||||||
cclog "github.com/ClusterCockpit/cc-lib/ccLogger"
|
cclog "github.com/ClusterCockpit/cc-lib/v2/ccLogger"
|
||||||
lp "github.com/ClusterCockpit/cc-lib/ccMessage"
|
lp "github.com/ClusterCockpit/cc-lib/v2/ccMessage"
|
||||||
sysconf "github.com/tklauser/go-sysconf"
|
sysconf "github.com/tklauser/go-sysconf"
|
||||||
)
|
)
|
||||||
|
|
||||||
@@ -32,10 +40,17 @@ type CpustatCollector struct {
|
|||||||
|
|
||||||
func (m *CpustatCollector) Init(config json.RawMessage) error {
|
func (m *CpustatCollector) Init(config json.RawMessage) error {
|
||||||
m.name = "CpustatCollector"
|
m.name = "CpustatCollector"
|
||||||
m.setup()
|
if err := m.setup(); err != nil {
|
||||||
|
return fmt.Errorf("%s Init(): setup() call failed: %w", m.name, err)
|
||||||
|
}
|
||||||
m.parallel = true
|
m.parallel = true
|
||||||
m.meta = map[string]string{"source": m.name, "group": "CPU"}
|
m.meta = map[string]string{
|
||||||
m.nodetags = map[string]string{"type": "node"}
|
"source": m.name,
|
||||||
|
"group": "CPU",
|
||||||
|
}
|
||||||
|
m.nodetags = map[string]string{
|
||||||
|
"type": "node",
|
||||||
|
}
|
||||||
if len(config) > 0 {
|
if len(config) > 0 {
|
||||||
err := json.Unmarshal(config, &m.config)
|
err := json.Unmarshal(config, &m.config)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
@@ -57,14 +72,7 @@ func (m *CpustatCollector) Init(config json.RawMessage) error {
|
|||||||
|
|
||||||
m.matches = make(map[string]int)
|
m.matches = make(map[string]int)
|
||||||
for match, index := range matches {
|
for match, index := range matches {
|
||||||
doExclude := false
|
if !slices.Contains(m.config.ExcludeMetrics, match) {
|
||||||
for _, exclude := range m.config.ExcludeMetrics {
|
|
||||||
if match == exclude {
|
|
||||||
doExclude = true
|
|
||||||
break
|
|
||||||
}
|
|
||||||
}
|
|
||||||
if !doExclude {
|
|
||||||
m.matches[match] = index
|
m.matches[match] = index
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -72,9 +80,17 @@ func (m *CpustatCollector) Init(config json.RawMessage) error {
|
|||||||
// Check input file
|
// Check input file
|
||||||
file, err := os.Open(string(CPUSTATFILE))
|
file, err := os.Open(string(CPUSTATFILE))
|
||||||
if err != nil {
|
if err != nil {
|
||||||
cclog.ComponentError(m.name, err.Error())
|
cclog.ComponentError(
|
||||||
|
m.name,
|
||||||
|
fmt.Sprintf("Init(): Failed to open file '%s': %v", string(CPUSTATFILE), err))
|
||||||
}
|
}
|
||||||
defer file.Close()
|
defer func() {
|
||||||
|
if err := file.Close(); err != nil {
|
||||||
|
cclog.ComponentError(
|
||||||
|
m.name,
|
||||||
|
fmt.Sprintf("Init(): Failed to close file '%s': %v", string(CPUSTATFILE), err))
|
||||||
|
}
|
||||||
|
}()
|
||||||
|
|
||||||
// Pre-generate tags for all CPUs
|
// Pre-generate tags for all CPUs
|
||||||
num_cpus := 0
|
num_cpus := 0
|
||||||
@@ -122,7 +138,7 @@ func (m *CpustatCollector) parseStatLine(linefields []string, tags map[string]st
|
|||||||
sum := float64(0)
|
sum := float64(0)
|
||||||
for name, value := range values {
|
for name, value := range values {
|
||||||
sum += value
|
sum += value
|
||||||
y, err := lp.NewMessage(name, tags, m.meta, map[string]interface{}{"value": value * 100}, now)
|
y, err := lp.NewMessage(name, tags, m.meta, map[string]any{"value": value * 100}, now)
|
||||||
if err == nil {
|
if err == nil {
|
||||||
y.AddTag("unit", "Percent")
|
y.AddTag("unit", "Percent")
|
||||||
output <- y
|
output <- y
|
||||||
@@ -130,7 +146,7 @@ func (m *CpustatCollector) parseStatLine(linefields []string, tags map[string]st
|
|||||||
}
|
}
|
||||||
if v, ok := values["cpu_idle"]; ok {
|
if v, ok := values["cpu_idle"]; ok {
|
||||||
sum -= v
|
sum -= v
|
||||||
y, err := lp.NewMessage("cpu_used", tags, m.meta, map[string]interface{}{"value": sum * 100}, now)
|
y, err := lp.NewMessage("cpu_used", tags, m.meta, map[string]any{"value": sum * 100}, now)
|
||||||
if err == nil {
|
if err == nil {
|
||||||
y.AddTag("unit", "Percent")
|
y.AddTag("unit", "Percent")
|
||||||
output <- y
|
output <- y
|
||||||
@@ -148,9 +164,17 @@ func (m *CpustatCollector) Read(interval time.Duration, output chan lp.CCMessage
|
|||||||
|
|
||||||
file, err := os.Open(string(CPUSTATFILE))
|
file, err := os.Open(string(CPUSTATFILE))
|
||||||
if err != nil {
|
if err != nil {
|
||||||
cclog.ComponentError(m.name, err.Error())
|
cclog.ComponentError(
|
||||||
|
m.name,
|
||||||
|
fmt.Sprintf("Read(): Failed to open file '%s': %v", string(CPUSTATFILE), err))
|
||||||
}
|
}
|
||||||
defer file.Close()
|
defer func() {
|
||||||
|
if err := file.Close(); err != nil {
|
||||||
|
cclog.ComponentError(
|
||||||
|
m.name,
|
||||||
|
fmt.Sprintf("Read(): Failed to close file '%s': %v", string(CPUSTATFILE), err))
|
||||||
|
}
|
||||||
|
}()
|
||||||
|
|
||||||
scanner := bufio.NewScanner(file)
|
scanner := bufio.NewScanner(file)
|
||||||
for scanner.Scan() {
|
for scanner.Scan() {
|
||||||
@@ -167,7 +191,7 @@ func (m *CpustatCollector) Read(interval time.Duration, output chan lp.CCMessage
|
|||||||
num_cpus_metric, err := lp.NewMessage("num_cpus",
|
num_cpus_metric, err := lp.NewMessage("num_cpus",
|
||||||
m.nodetags,
|
m.nodetags,
|
||||||
m.meta,
|
m.meta,
|
||||||
map[string]interface{}{"value": int(num_cpus)},
|
map[string]any{"value": int(num_cpus)},
|
||||||
now,
|
now,
|
||||||
)
|
)
|
||||||
if err == nil {
|
if err == nil {
|
||||||
|
|||||||
@@ -1,15 +1,24 @@
|
|||||||
|
// Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
|
||||||
|
// All rights reserved. This file is part of cc-lib.
|
||||||
|
// Use of this source code is governed by a MIT-style
|
||||||
|
// license that can be found in the LICENSE file.
|
||||||
|
// additional authors:
|
||||||
|
// Holger Obermaier (NHR@KIT)
|
||||||
|
|
||||||
package collectors
|
package collectors
|
||||||
|
|
||||||
import (
|
import (
|
||||||
"encoding/json"
|
"encoding/json"
|
||||||
"errors"
|
"errors"
|
||||||
|
"fmt"
|
||||||
"log"
|
"log"
|
||||||
"os"
|
"os"
|
||||||
"os/exec"
|
"os/exec"
|
||||||
|
"slices"
|
||||||
"strings"
|
"strings"
|
||||||
"time"
|
"time"
|
||||||
|
|
||||||
lp "github.com/ClusterCockpit/cc-lib/ccMessage"
|
lp "github.com/ClusterCockpit/cc-lib/v2/ccMessage"
|
||||||
influx "github.com/influxdata/line-protocol"
|
influx "github.com/influxdata/line-protocol"
|
||||||
)
|
)
|
||||||
|
|
||||||
@@ -42,11 +51,16 @@ func (m *CustomCmdCollector) Init(config json.RawMessage) error {
|
|||||||
return err
|
return err
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
m.setup()
|
if err := m.setup(); err != nil {
|
||||||
|
return fmt.Errorf("%s Init(): setup() call failed: %w", m.name, err)
|
||||||
|
}
|
||||||
for _, c := range m.config.Commands {
|
for _, c := range m.config.Commands {
|
||||||
cmdfields := strings.Fields(c)
|
cmdfields := strings.Fields(c)
|
||||||
command := exec.Command(cmdfields[0], strings.Join(cmdfields[1:], " "))
|
command := exec.Command(cmdfields[0], cmdfields[1:]...)
|
||||||
command.Wait()
|
if err := command.Wait(); err != nil {
|
||||||
|
log.Print(err)
|
||||||
|
continue
|
||||||
|
}
|
||||||
_, err = command.Output()
|
_, err = command.Output()
|
||||||
if err == nil {
|
if err == nil {
|
||||||
m.commands = append(m.commands, c)
|
m.commands = append(m.commands, c)
|
||||||
@@ -81,8 +95,11 @@ func (m *CustomCmdCollector) Read(interval time.Duration, output chan lp.CCMessa
|
|||||||
}
|
}
|
||||||
for _, cmd := range m.commands {
|
for _, cmd := range m.commands {
|
||||||
cmdfields := strings.Fields(cmd)
|
cmdfields := strings.Fields(cmd)
|
||||||
command := exec.Command(cmdfields[0], strings.Join(cmdfields[1:], " "))
|
command := exec.Command(cmdfields[0], cmdfields[1:]...)
|
||||||
command.Wait()
|
if err := command.Wait(); err != nil {
|
||||||
|
log.Print(err)
|
||||||
|
continue
|
||||||
|
}
|
||||||
stdout, err := command.Output()
|
stdout, err := command.Output()
|
||||||
if err != nil {
|
if err != nil {
|
||||||
log.Print(err)
|
log.Print(err)
|
||||||
@@ -94,8 +111,7 @@ func (m *CustomCmdCollector) Read(interval time.Duration, output chan lp.CCMessa
|
|||||||
continue
|
continue
|
||||||
}
|
}
|
||||||
for _, c := range cmdmetrics {
|
for _, c := range cmdmetrics {
|
||||||
_, skip := stringArrayContains(m.config.ExcludeMetrics, c.Name())
|
if slices.Contains(m.config.ExcludeMetrics, c.Name()) {
|
||||||
if skip {
|
|
||||||
continue
|
continue
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -114,8 +130,7 @@ func (m *CustomCmdCollector) Read(interval time.Duration, output chan lp.CCMessa
|
|||||||
continue
|
continue
|
||||||
}
|
}
|
||||||
for _, f := range fmetrics {
|
for _, f := range fmetrics {
|
||||||
_, skip := stringArrayContains(m.config.ExcludeMetrics, f.Name())
|
if slices.Contains(m.config.ExcludeMetrics, f.Name()) {
|
||||||
if skip {
|
|
||||||
continue
|
continue
|
||||||
}
|
}
|
||||||
output <- lp.FromInfluxMetric(f)
|
output <- lp.FromInfluxMetric(f)
|
||||||
|
|||||||
@@ -1,15 +1,23 @@
|
|||||||
|
// Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
|
||||||
|
// All rights reserved. This file is part of cc-lib.
|
||||||
|
// Use of this source code is governed by a MIT-style
|
||||||
|
// license that can be found in the LICENSE file.
|
||||||
|
// additional authors:
|
||||||
|
// Holger Obermaier (NHR@KIT)
|
||||||
|
|
||||||
package collectors
|
package collectors
|
||||||
|
|
||||||
import (
|
import (
|
||||||
"bufio"
|
"bufio"
|
||||||
"encoding/json"
|
"encoding/json"
|
||||||
|
"fmt"
|
||||||
"os"
|
"os"
|
||||||
"strings"
|
"strings"
|
||||||
"syscall"
|
"syscall"
|
||||||
"time"
|
"time"
|
||||||
|
|
||||||
cclog "github.com/ClusterCockpit/cc-lib/ccLogger"
|
cclog "github.com/ClusterCockpit/cc-lib/v2/ccLogger"
|
||||||
lp "github.com/ClusterCockpit/cc-lib/ccMessage"
|
lp "github.com/ClusterCockpit/cc-lib/v2/ccMessage"
|
||||||
)
|
)
|
||||||
|
|
||||||
const MOUNTFILE = `/proc/self/mounts`
|
const MOUNTFILE = `/proc/self/mounts`
|
||||||
@@ -29,7 +37,9 @@ func (m *DiskstatCollector) Init(config json.RawMessage) error {
|
|||||||
m.name = "DiskstatCollector"
|
m.name = "DiskstatCollector"
|
||||||
m.parallel = true
|
m.parallel = true
|
||||||
m.meta = map[string]string{"source": m.name, "group": "Disk"}
|
m.meta = map[string]string{"source": m.name, "group": "Disk"}
|
||||||
m.setup()
|
if err := m.setup(); err != nil {
|
||||||
|
return fmt.Errorf("%s Init(): setup() call failed: %w", m.name, err)
|
||||||
|
}
|
||||||
if len(config) > 0 {
|
if len(config) > 0 {
|
||||||
if err := json.Unmarshal(config, &m.config); err != nil {
|
if err := json.Unmarshal(config, &m.config); err != nil {
|
||||||
return err
|
return err
|
||||||
@@ -47,10 +57,11 @@ func (m *DiskstatCollector) Init(config json.RawMessage) error {
|
|||||||
}
|
}
|
||||||
file, err := os.Open(MOUNTFILE)
|
file, err := os.Open(MOUNTFILE)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
cclog.ComponentError(m.name, err.Error())
|
return fmt.Errorf("%s Init(): file open for file \"%s\" failed: %w", m.name, MOUNTFILE, err)
|
||||||
return err
|
}
|
||||||
|
if err := file.Close(); err != nil {
|
||||||
|
return fmt.Errorf("%s Init(): file close for file \"%s\" failed: %w", m.name, MOUNTFILE, err)
|
||||||
}
|
}
|
||||||
defer file.Close()
|
|
||||||
m.init = true
|
m.init = true
|
||||||
return nil
|
return nil
|
||||||
}
|
}
|
||||||
@@ -62,10 +73,18 @@ func (m *DiskstatCollector) Read(interval time.Duration, output chan lp.CCMessag
|
|||||||
|
|
||||||
file, err := os.Open(MOUNTFILE)
|
file, err := os.Open(MOUNTFILE)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
cclog.ComponentError(m.name, err.Error())
|
cclog.ComponentError(
|
||||||
|
m.name,
|
||||||
|
fmt.Sprintf("Read(): Failed to open file '%s': %v", MOUNTFILE, err))
|
||||||
return
|
return
|
||||||
}
|
}
|
||||||
defer file.Close()
|
defer func() {
|
||||||
|
if err := file.Close(); err != nil {
|
||||||
|
cclog.ComponentError(
|
||||||
|
m.name,
|
||||||
|
fmt.Sprintf("Read(): Failed to close file '%s': %v", MOUNTFILE, err))
|
||||||
|
}
|
||||||
|
}()
|
||||||
|
|
||||||
part_max_used := uint64(0)
|
part_max_used := uint64(0)
|
||||||
scanner := bufio.NewScanner(file)
|
scanner := bufio.NewScanner(file)
|
||||||
@@ -86,7 +105,7 @@ mountLoop:
|
|||||||
continue
|
continue
|
||||||
}
|
}
|
||||||
|
|
||||||
mountPath := strings.Replace(linefields[1], `\040`, " ", -1)
|
mountPath := strings.ReplaceAll(linefields[1], `\040`, " ")
|
||||||
|
|
||||||
for _, excl := range m.config.ExcludeMounts {
|
for _, excl := range m.config.ExcludeMounts {
|
||||||
if strings.Contains(mountPath, excl) {
|
if strings.Contains(mountPath, excl) {
|
||||||
@@ -105,7 +124,7 @@ mountLoop:
|
|||||||
tags := map[string]string{"type": "node", "device": linefields[0]}
|
tags := map[string]string{"type": "node", "device": linefields[0]}
|
||||||
total := (stat.Blocks * uint64(stat.Bsize)) / uint64(1000000000)
|
total := (stat.Blocks * uint64(stat.Bsize)) / uint64(1000000000)
|
||||||
if m.allowedMetrics["disk_total"] {
|
if m.allowedMetrics["disk_total"] {
|
||||||
y, err := lp.NewMessage("disk_total", tags, m.meta, map[string]interface{}{"value": total}, time.Now())
|
y, err := lp.NewMessage("disk_total", tags, m.meta, map[string]any{"value": total}, time.Now())
|
||||||
if err == nil {
|
if err == nil {
|
||||||
y.AddMeta("unit", "GBytes")
|
y.AddMeta("unit", "GBytes")
|
||||||
output <- y
|
output <- y
|
||||||
@@ -113,7 +132,7 @@ mountLoop:
|
|||||||
}
|
}
|
||||||
free := (stat.Bfree * uint64(stat.Bsize)) / uint64(1000000000)
|
free := (stat.Bfree * uint64(stat.Bsize)) / uint64(1000000000)
|
||||||
if m.allowedMetrics["disk_free"] {
|
if m.allowedMetrics["disk_free"] {
|
||||||
y, err := lp.NewMessage("disk_free", tags, m.meta, map[string]interface{}{"value": free}, time.Now())
|
y, err := lp.NewMessage("disk_free", tags, m.meta, map[string]any{"value": free}, time.Now())
|
||||||
if err == nil {
|
if err == nil {
|
||||||
y.AddMeta("unit", "GBytes")
|
y.AddMeta("unit", "GBytes")
|
||||||
output <- y
|
output <- y
|
||||||
@@ -127,7 +146,7 @@ mountLoop:
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
if m.allowedMetrics["part_max_used"] {
|
if m.allowedMetrics["part_max_used"] {
|
||||||
y, err := lp.NewMessage("part_max_used", map[string]string{"type": "node"}, m.meta, map[string]interface{}{"value": int(part_max_used)}, time.Now())
|
y, err := lp.NewMessage("part_max_used", map[string]string{"type": "node"}, m.meta, map[string]any{"value": int(part_max_used)}, time.Now())
|
||||||
if err == nil {
|
if err == nil {
|
||||||
y.AddMeta("unit", "percent")
|
y.AddMeta("unit", "percent")
|
||||||
output <- y
|
output <- y
|
||||||
|
|||||||
@@ -1,41 +1,308 @@
|
|||||||
|
// Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
|
||||||
|
// All rights reserved. This file is part of cc-lib.
|
||||||
|
// Use of this source code is governed by a MIT-style
|
||||||
|
// license that can be found in the LICENSE file.
|
||||||
|
// additional authors:
|
||||||
|
// Holger Obermaier (NHR@KIT)
|
||||||
|
|
||||||
package collectors
|
package collectors
|
||||||
|
|
||||||
import (
|
import (
|
||||||
"bufio"
|
"bufio"
|
||||||
"bytes"
|
"bytes"
|
||||||
"encoding/json"
|
"encoding/json"
|
||||||
|
"errors"
|
||||||
"fmt"
|
"fmt"
|
||||||
"io"
|
"io"
|
||||||
"log"
|
"log"
|
||||||
"os/exec"
|
"os/exec"
|
||||||
"os/user"
|
"os/user"
|
||||||
|
"slices"
|
||||||
"strconv"
|
"strconv"
|
||||||
"strings"
|
"strings"
|
||||||
|
"syscall"
|
||||||
"time"
|
"time"
|
||||||
|
|
||||||
cclog "github.com/ClusterCockpit/cc-lib/ccLogger"
|
cclog "github.com/ClusterCockpit/cc-lib/v2/ccLogger"
|
||||||
lp "github.com/ClusterCockpit/cc-lib/ccMessage"
|
lp "github.com/ClusterCockpit/cc-lib/v2/ccMessage"
|
||||||
)
|
)
|
||||||
|
|
||||||
const DEFAULT_GPFS_CMD = "mmpmon"
|
const DEFAULT_GPFS_CMD = "mmpmon"
|
||||||
|
|
||||||
type GpfsCollectorLastState struct {
|
type GpfsCollectorState map[string]int64
|
||||||
bytesRead int64
|
|
||||||
bytesWritten int64
|
type GpfsCollectorConfig struct {
|
||||||
|
Mmpmon string `json:"mmpmon_path,omitempty"`
|
||||||
|
ExcludeFilesystem []string `json:"exclude_filesystem,omitempty"`
|
||||||
|
ExcludeMetrics []string `json:"exclude_metrics,omitempty"`
|
||||||
|
Sudo bool `json:"use_sudo,omitempty"`
|
||||||
|
SendAbsoluteValues bool `json:"send_abs_values,omitempty"`
|
||||||
|
SendDiffValues bool `json:"send_diff_values,omitempty"`
|
||||||
|
SendDerivedValues bool `json:"send_derived_values,omitempty"`
|
||||||
|
SendTotalValues bool `json:"send_total_values,omitempty"`
|
||||||
|
SendBandwidths bool `json:"send_bandwidths,omitempty"`
|
||||||
|
}
|
||||||
|
|
||||||
|
type GpfsMetricDefinition struct {
|
||||||
|
name string
|
||||||
|
desc string
|
||||||
|
prefix string
|
||||||
|
unit string
|
||||||
|
calc string
|
||||||
}
|
}
|
||||||
|
|
||||||
type GpfsCollector struct {
|
type GpfsCollector struct {
|
||||||
metricCollector
|
metricCollector
|
||||||
tags map[string]string
|
tags map[string]string
|
||||||
config struct {
|
config GpfsCollectorConfig
|
||||||
Mmpmon string `json:"mmpmon_path,omitempty"`
|
sudoCmd string
|
||||||
ExcludeFilesystem []string `json:"exclude_filesystem,omitempty"`
|
|
||||||
SendBandwidths bool `json:"send_bandwidths"`
|
|
||||||
SendTotalValues bool `json:"send_total_values"`
|
|
||||||
}
|
|
||||||
skipFS map[string]struct{}
|
skipFS map[string]struct{}
|
||||||
lastTimestamp time.Time // Store time stamp of last tick to derive bandwidths
|
lastTimestamp map[string]time.Time // Store timestamp of lastState per filesystem to derive bandwidths
|
||||||
lastState map[string]GpfsCollectorLastState
|
definitions []GpfsMetricDefinition // all metrics to report
|
||||||
|
lastState map[string]GpfsCollectorState // one GpfsCollectorState per filesystem
|
||||||
|
}
|
||||||
|
|
||||||
|
var GpfsAbsMetrics = []GpfsMetricDefinition{
|
||||||
|
{
|
||||||
|
name: "gpfs_num_opens",
|
||||||
|
desc: "number of opens",
|
||||||
|
prefix: "_oc_",
|
||||||
|
unit: "requests",
|
||||||
|
calc: "none",
|
||||||
|
},
|
||||||
|
{
|
||||||
|
name: "gpfs_num_closes",
|
||||||
|
desc: "number of closes",
|
||||||
|
prefix: "_cc_",
|
||||||
|
unit: "requests",
|
||||||
|
calc: "none",
|
||||||
|
},
|
||||||
|
{
|
||||||
|
name: "gpfs_num_reads",
|
||||||
|
desc: "number of reads",
|
||||||
|
prefix: "_rdc_",
|
||||||
|
unit: "requests",
|
||||||
|
calc: "none",
|
||||||
|
},
|
||||||
|
{
|
||||||
|
name: "gpfs_num_writes",
|
||||||
|
desc: "number of writes",
|
||||||
|
prefix: "_wc_",
|
||||||
|
unit: "requests",
|
||||||
|
calc: "none",
|
||||||
|
},
|
||||||
|
{
|
||||||
|
name: "gpfs_num_readdirs",
|
||||||
|
desc: "number of readdirs",
|
||||||
|
prefix: "_dir_",
|
||||||
|
unit: "requests",
|
||||||
|
calc: "none",
|
||||||
|
},
|
||||||
|
{
|
||||||
|
name: "gpfs_num_inode_updates",
|
||||||
|
desc: "number of Inode Updates",
|
||||||
|
prefix: "_iu_",
|
||||||
|
unit: "requests",
|
||||||
|
calc: "none",
|
||||||
|
},
|
||||||
|
{
|
||||||
|
name: "gpfs_bytes_read",
|
||||||
|
desc: "bytes read",
|
||||||
|
prefix: "_br_",
|
||||||
|
unit: "bytes",
|
||||||
|
calc: "none",
|
||||||
|
},
|
||||||
|
{
|
||||||
|
name: "gpfs_bytes_written",
|
||||||
|
desc: "bytes written",
|
||||||
|
prefix: "_bw_",
|
||||||
|
unit: "bytes",
|
||||||
|
calc: "none",
|
||||||
|
},
|
||||||
|
}
|
||||||
|
|
||||||
|
var GpfsDiffMetrics = []GpfsMetricDefinition{
|
||||||
|
{
|
||||||
|
name: "gpfs_num_opens_diff",
|
||||||
|
desc: "number of opens (diff)",
|
||||||
|
prefix: "_oc_",
|
||||||
|
unit: "requests",
|
||||||
|
calc: "difference",
|
||||||
|
},
|
||||||
|
{
|
||||||
|
name: "gpfs_num_closes_diff",
|
||||||
|
desc: "number of closes (diff)",
|
||||||
|
prefix: "_cc_",
|
||||||
|
unit: "requests",
|
||||||
|
calc: "difference",
|
||||||
|
},
|
||||||
|
{
|
||||||
|
name: "gpfs_num_reads_diff",
|
||||||
|
desc: "number of reads (diff)",
|
||||||
|
prefix: "_rdc_",
|
||||||
|
unit: "requests",
|
||||||
|
calc: "difference",
|
||||||
|
},
|
||||||
|
{
|
||||||
|
name: "gpfs_num_writes_diff",
|
||||||
|
desc: "number of writes (diff)",
|
||||||
|
prefix: "_wc_",
|
||||||
|
unit: "requests",
|
||||||
|
calc: "difference",
|
||||||
|
},
|
||||||
|
{
|
||||||
|
name: "gpfs_num_readdirs_diff",
|
||||||
|
desc: "number of readdirs (diff)",
|
||||||
|
prefix: "_dir_",
|
||||||
|
unit: "requests",
|
||||||
|
calc: "difference",
|
||||||
|
},
|
||||||
|
{
|
||||||
|
name: "gpfs_num_inode_updates_diff",
|
||||||
|
desc: "number of Inode Updates (diff)",
|
||||||
|
prefix: "_iu_",
|
||||||
|
unit: "requests",
|
||||||
|
calc: "difference",
|
||||||
|
},
|
||||||
|
{
|
||||||
|
name: "gpfs_bytes_read_diff",
|
||||||
|
desc: "bytes read (diff)",
|
||||||
|
prefix: "_br_",
|
||||||
|
unit: "bytes",
|
||||||
|
calc: "difference",
|
||||||
|
},
|
||||||
|
{
|
||||||
|
name: "gpfs_bytes_written_diff",
|
||||||
|
desc: "bytes written (diff)",
|
||||||
|
prefix: "_bw_",
|
||||||
|
unit: "bytes",
|
||||||
|
calc: "difference",
|
||||||
|
},
|
||||||
|
}
|
||||||
|
|
||||||
|
var GpfsDeriveMetrics = []GpfsMetricDefinition{
|
||||||
|
{
|
||||||
|
name: "gpfs_opens_rate",
|
||||||
|
desc: "number of opens (rate)",
|
||||||
|
prefix: "_oc_",
|
||||||
|
unit: "requests/sec",
|
||||||
|
calc: "derivative",
|
||||||
|
},
|
||||||
|
{
|
||||||
|
name: "gpfs_closes_rate",
|
||||||
|
desc: "number of closes (rate)",
|
||||||
|
prefix: "_oc_",
|
||||||
|
unit: "requests/sec",
|
||||||
|
calc: "derivative",
|
||||||
|
},
|
||||||
|
{
|
||||||
|
name: "gpfs_reads_rate",
|
||||||
|
desc: "number of reads (rate)",
|
||||||
|
prefix: "_rdc_",
|
||||||
|
unit: "requests/sec",
|
||||||
|
calc: "derivative",
|
||||||
|
},
|
||||||
|
{
|
||||||
|
name: "gpfs_writes_rate",
|
||||||
|
desc: "number of writes (rate)",
|
||||||
|
prefix: "_wc_",
|
||||||
|
unit: "requests/sec",
|
||||||
|
calc: "derivative",
|
||||||
|
},
|
||||||
|
{
|
||||||
|
name: "gpfs_readdirs_rate",
|
||||||
|
desc: "number of readdirs (rate)",
|
||||||
|
prefix: "_dir_",
|
||||||
|
unit: "requests/sec",
|
||||||
|
calc: "derivative",
|
||||||
|
},
|
||||||
|
{
|
||||||
|
name: "gpfs_inode_updates_rate",
|
||||||
|
desc: "number of Inode Updates (rate)",
|
||||||
|
prefix: "_iu_",
|
||||||
|
unit: "requests/sec",
|
||||||
|
calc: "derivative",
|
||||||
|
},
|
||||||
|
{
|
||||||
|
name: "gpfs_bw_read",
|
||||||
|
desc: "bytes read (rate)",
|
||||||
|
prefix: "_br_",
|
||||||
|
unit: "bytes/sec",
|
||||||
|
calc: "derivative",
|
||||||
|
},
|
||||||
|
{
|
||||||
|
name: "gpfs_bw_write",
|
||||||
|
desc: "bytes written (rate)",
|
||||||
|
prefix: "_bw_",
|
||||||
|
unit: "bytes/sec",
|
||||||
|
calc: "derivative",
|
||||||
|
},
|
||||||
|
}
|
||||||
|
|
||||||
|
var GpfsTotalMetrics = []GpfsMetricDefinition{
|
||||||
|
{
|
||||||
|
name: "gpfs_bytes_total",
|
||||||
|
desc: "bytes total",
|
||||||
|
prefix: "bytesTotal",
|
||||||
|
unit: "bytes",
|
||||||
|
calc: "none",
|
||||||
|
},
|
||||||
|
{
|
||||||
|
name: "gpfs_bytes_total_diff",
|
||||||
|
desc: "bytes total (diff)",
|
||||||
|
prefix: "bytesTotal",
|
||||||
|
unit: "bytes",
|
||||||
|
calc: "difference",
|
||||||
|
},
|
||||||
|
{
|
||||||
|
name: "gpfs_bw_total",
|
||||||
|
desc: "bytes total (rate)",
|
||||||
|
prefix: "bytesTotal",
|
||||||
|
unit: "bytes/sec",
|
||||||
|
calc: "derivative",
|
||||||
|
},
|
||||||
|
{
|
||||||
|
name: "gpfs_iops",
|
||||||
|
desc: "iops",
|
||||||
|
prefix: "iops",
|
||||||
|
unit: "requests",
|
||||||
|
calc: "none",
|
||||||
|
},
|
||||||
|
{
|
||||||
|
name: "gpfs_iops_diff",
|
||||||
|
desc: "iops (diff)",
|
||||||
|
prefix: "iops",
|
||||||
|
unit: "requests",
|
||||||
|
calc: "difference",
|
||||||
|
},
|
||||||
|
{
|
||||||
|
name: "gpfs_iops_rate",
|
||||||
|
desc: "iops (rate)",
|
||||||
|
prefix: "iops",
|
||||||
|
unit: "requests/sec",
|
||||||
|
calc: "derivative",
|
||||||
|
},
|
||||||
|
{
|
||||||
|
name: "gpfs_metaops",
|
||||||
|
desc: "metaops",
|
||||||
|
prefix: "metaops",
|
||||||
|
unit: "requests",
|
||||||
|
calc: "none",
|
||||||
|
},
|
||||||
|
{
|
||||||
|
name: "gpfs_metaops_diff",
|
||||||
|
desc: "metaops (diff)",
|
||||||
|
prefix: "metaops",
|
||||||
|
unit: "requests",
|
||||||
|
calc: "difference",
|
||||||
|
},
|
||||||
|
{
|
||||||
|
name: "gpfs_metaops_rate",
|
||||||
|
desc: "metaops (rate)",
|
||||||
|
prefix: "metaops",
|
||||||
|
unit: "requests/sec",
|
||||||
|
calc: "derivative",
|
||||||
|
},
|
||||||
}
|
}
|
||||||
|
|
||||||
func (m *GpfsCollector) Init(config json.RawMessage) error {
|
func (m *GpfsCollector) Init(config json.RawMessage) error {
|
||||||
@@ -44,9 +311,10 @@ func (m *GpfsCollector) Init(config json.RawMessage) error {
|
|||||||
return nil
|
return nil
|
||||||
}
|
}
|
||||||
|
|
||||||
var err error
|
|
||||||
m.name = "GpfsCollector"
|
m.name = "GpfsCollector"
|
||||||
m.setup()
|
if err := m.setup(); err != nil {
|
||||||
|
return fmt.Errorf("%s Init(): setup() call failed: %w", m.name, err)
|
||||||
|
}
|
||||||
m.parallel = true
|
m.parallel = true
|
||||||
|
|
||||||
// Set default mmpmon binary
|
// Set default mmpmon binary
|
||||||
@@ -54,7 +322,7 @@ func (m *GpfsCollector) Init(config json.RawMessage) error {
|
|||||||
|
|
||||||
// Read JSON configuration
|
// Read JSON configuration
|
||||||
if len(config) > 0 {
|
if len(config) > 0 {
|
||||||
err = json.Unmarshal(config, &m.config)
|
err := json.Unmarshal(config, &m.config)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
log.Print(err.Error())
|
log.Print(err.Error())
|
||||||
return err
|
return err
|
||||||
@@ -72,24 +340,104 @@ func (m *GpfsCollector) Init(config json.RawMessage) error {
|
|||||||
for _, fs := range m.config.ExcludeFilesystem {
|
for _, fs := range m.config.ExcludeFilesystem {
|
||||||
m.skipFS[fs] = struct{}{}
|
m.skipFS[fs] = struct{}{}
|
||||||
}
|
}
|
||||||
m.lastState = make(map[string]GpfsCollectorLastState)
|
m.lastState = make(map[string]GpfsCollectorState)
|
||||||
|
m.lastTimestamp = make(map[string]time.Time)
|
||||||
|
|
||||||
// GPFS / IBM Spectrum Scale file system statistics can only be queried by user root
|
// GPFS / IBM Spectrum Scale file system statistics can only be queried by user root
|
||||||
|
if !m.config.Sudo {
|
||||||
user, err := user.Current()
|
user, err := user.Current()
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return fmt.Errorf("failed to get current user: %v", err)
|
cclog.ComponentError(m.name, "Failed to get current user:", err.Error())
|
||||||
|
return err
|
||||||
}
|
}
|
||||||
if user.Uid != "0" {
|
if user.Uid != "0" {
|
||||||
return fmt.Errorf("GPFS file system statistics can only be queried by user root")
|
cclog.ComponentError(m.name, "GPFS file system statistics can only be queried by user root")
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
p, err := exec.LookPath("sudo")
|
||||||
|
if err != nil {
|
||||||
|
cclog.ComponentError(m.name, "Cannot find 'sudo'")
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
m.sudoCmd = p
|
||||||
|
}
|
||||||
|
|
||||||
|
// when using sudo, the full path of mmpmon must be specified because
|
||||||
|
// exec.LookPath will not work as mmpmon is not executable as user
|
||||||
|
if m.config.Sudo && !strings.HasPrefix(m.config.Mmpmon, "/") {
|
||||||
|
return fmt.Errorf("when using sudo, mmpmon_path must be provided and an absolute path: %s", m.config.Mmpmon)
|
||||||
}
|
}
|
||||||
|
|
||||||
// Check if mmpmon is in executable search path
|
// Check if mmpmon is in executable search path
|
||||||
p, err := exec.LookPath(m.config.Mmpmon)
|
p, err := exec.LookPath(m.config.Mmpmon)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
|
// if using sudo, exec.lookPath will return EACCES (file mode r-x------), this can be ignored
|
||||||
|
if m.config.Sudo && errors.Is(err, syscall.EACCES) {
|
||||||
|
cclog.ComponentWarn(m.name, fmt.Sprintf("got error looking for mmpmon binary '%s': %v . This is expected when using sudo, continuing.", m.config.Mmpmon, err))
|
||||||
|
// the file was given in the config, use it
|
||||||
|
p = m.config.Mmpmon
|
||||||
|
} else {
|
||||||
|
cclog.ComponentError(m.name, fmt.Sprintf("failed to find mmpmon binary '%s': %v", m.config.Mmpmon, err))
|
||||||
return fmt.Errorf("failed to find mmpmon binary '%s': %v", m.config.Mmpmon, err)
|
return fmt.Errorf("failed to find mmpmon binary '%s': %v", m.config.Mmpmon, err)
|
||||||
}
|
}
|
||||||
|
}
|
||||||
m.config.Mmpmon = p
|
m.config.Mmpmon = p
|
||||||
|
|
||||||
|
m.definitions = []GpfsMetricDefinition{}
|
||||||
|
if m.config.SendAbsoluteValues {
|
||||||
|
for _, def := range GpfsAbsMetrics {
|
||||||
|
if !slices.Contains(m.config.ExcludeMetrics, def.name) {
|
||||||
|
m.definitions = append(m.definitions, def)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if m.config.SendDiffValues {
|
||||||
|
for _, def := range GpfsDiffMetrics {
|
||||||
|
if !slices.Contains(m.config.ExcludeMetrics, def.name) {
|
||||||
|
m.definitions = append(m.definitions, def)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if m.config.SendDerivedValues {
|
||||||
|
for _, def := range GpfsDeriveMetrics {
|
||||||
|
if !slices.Contains(m.config.ExcludeMetrics, def.name) {
|
||||||
|
m.definitions = append(m.definitions, def)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
} else if m.config.SendBandwidths {
|
||||||
|
for _, def := range GpfsDeriveMetrics {
|
||||||
|
if def.unit == "bytes/sec" {
|
||||||
|
if !slices.Contains(m.config.ExcludeMetrics, def.name) {
|
||||||
|
m.definitions = append(m.definitions, def)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if m.config.SendTotalValues {
|
||||||
|
for _, def := range GpfsTotalMetrics {
|
||||||
|
if !slices.Contains(m.config.ExcludeMetrics, def.name) {
|
||||||
|
// only send total metrics of the types requested
|
||||||
|
if (def.calc == "none" && m.config.SendAbsoluteValues) ||
|
||||||
|
(def.calc == "difference" && m.config.SendDiffValues) ||
|
||||||
|
(def.calc == "derivative" && m.config.SendDerivedValues) {
|
||||||
|
m.definitions = append(m.definitions, def)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
} else if m.config.SendBandwidths {
|
||||||
|
for _, def := range GpfsTotalMetrics {
|
||||||
|
if def.unit == "bytes/sec" {
|
||||||
|
if !slices.Contains(m.config.ExcludeMetrics, def.name) {
|
||||||
|
m.definitions = append(m.definitions, def)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if len(m.definitions) == 0 {
|
||||||
|
return errors.New("no metrics to collect")
|
||||||
|
}
|
||||||
|
|
||||||
m.init = true
|
m.init = true
|
||||||
return nil
|
return nil
|
||||||
}
|
}
|
||||||
@@ -100,18 +448,17 @@ func (m *GpfsCollector) Read(interval time.Duration, output chan lp.CCMessage) {
|
|||||||
return
|
return
|
||||||
}
|
}
|
||||||
|
|
||||||
// Current time stamp
|
|
||||||
now := time.Now()
|
|
||||||
// time difference to last time stamp
|
|
||||||
timeDiff := now.Sub(m.lastTimestamp).Seconds()
|
|
||||||
// Save current timestamp
|
|
||||||
m.lastTimestamp = now
|
|
||||||
|
|
||||||
// mmpmon:
|
// mmpmon:
|
||||||
// -p: generate output that can be parsed
|
// -p: generate output that can be parsed
|
||||||
// -s: suppress the prompt on input
|
// -s: suppress the prompt on input
|
||||||
// fs_io_s: Displays I/O statistics per mounted file system
|
// fs_io_s: Displays I/O statistics per mounted file system
|
||||||
cmd := exec.Command(m.config.Mmpmon, "-p", "-s")
|
var cmd *exec.Cmd
|
||||||
|
if m.config.Sudo {
|
||||||
|
cmd = exec.Command(m.sudoCmd, m.config.Mmpmon, "-p", "-s")
|
||||||
|
} else {
|
||||||
|
cmd = exec.Command(m.config.Mmpmon, "-p", "-s")
|
||||||
|
}
|
||||||
|
|
||||||
cmd.Stdin = strings.NewReader("once fs_io_s\n")
|
cmd.Stdin = strings.NewReader("once fs_io_s\n")
|
||||||
cmdStdout := new(bytes.Buffer)
|
cmdStdout := new(bytes.Buffer)
|
||||||
cmdStderr := new(bytes.Buffer)
|
cmdStderr := new(bytes.Buffer)
|
||||||
@@ -154,9 +501,7 @@ func (m *GpfsCollector) Read(interval time.Duration, output chan lp.CCMessage) {
|
|||||||
|
|
||||||
filesystem, ok := key_value["_fs_"]
|
filesystem, ok := key_value["_fs_"]
|
||||||
if !ok {
|
if !ok {
|
||||||
cclog.ComponentError(
|
cclog.ComponentError(m.name, "Read(): Failed to get filesystem name.")
|
||||||
m.name,
|
|
||||||
"Read(): Failed to get filesystem name.")
|
|
||||||
continue
|
continue
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -168,245 +513,141 @@ func (m *GpfsCollector) Read(interval time.Duration, output chan lp.CCMessage) {
|
|||||||
// Add filesystem tag
|
// Add filesystem tag
|
||||||
m.tags["filesystem"] = filesystem
|
m.tags["filesystem"] = filesystem
|
||||||
|
|
||||||
// Create initial last state
|
|
||||||
if m.config.SendBandwidths {
|
|
||||||
if _, ok := m.lastState[filesystem]; !ok {
|
if _, ok := m.lastState[filesystem]; !ok {
|
||||||
m.lastState[filesystem] = GpfsCollectorLastState{
|
m.lastState[filesystem] = make(GpfsCollectorState)
|
||||||
bytesRead: -1,
|
|
||||||
bytesWritten: -1,
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// read the new values from mmpmon
|
||||||
// return code
|
// return code
|
||||||
rc, err := strconv.Atoi(key_value["_rc_"])
|
rc, err := strconv.Atoi(key_value["_rc_"])
|
||||||
if err != nil {
|
if err != nil {
|
||||||
cclog.ComponentError(
|
cclog.ComponentError(m.name, fmt.Sprintf("Read(): Failed to convert return code '%s' to int: %v", key_value["_rc_"], err))
|
||||||
m.name,
|
|
||||||
fmt.Sprintf("Read(): Failed to convert return code '%s' to int: %v", key_value["_rc_"], err))
|
|
||||||
continue
|
continue
|
||||||
}
|
}
|
||||||
if rc != 0 {
|
if rc != 0 {
|
||||||
cclog.ComponentError(
|
cclog.ComponentError(m.name, fmt.Sprintf("Read(): Filesystem '%s' is not ok.", filesystem))
|
||||||
m.name,
|
|
||||||
fmt.Sprintf("Read(): Filesystem '%s' is not ok.", filesystem))
|
|
||||||
continue
|
continue
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// timestamp
|
||||||
sec, err := strconv.ParseInt(key_value["_t_"], 10, 64)
|
sec, err := strconv.ParseInt(key_value["_t_"], 10, 64)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
cclog.ComponentError(
|
cclog.ComponentError(m.name, fmt.Sprintf("Read(): Failed to convert seconds '%s' to int64: %v", key_value["_t_"], err))
|
||||||
m.name,
|
|
||||||
fmt.Sprintf("Read(): Failed to convert seconds '%s' to int64: %v", key_value["_t_"], err))
|
|
||||||
continue
|
continue
|
||||||
}
|
}
|
||||||
msec, err := strconv.ParseInt(key_value["_tu_"], 10, 64)
|
msec, err := strconv.ParseInt(key_value["_tu_"], 10, 64)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
cclog.ComponentError(
|
cclog.ComponentError(m.name, fmt.Sprintf("Read(): Failed to convert micro seconds '%s' to int64: %v", key_value["_tu_"], err))
|
||||||
m.name,
|
|
||||||
fmt.Sprintf("Read(): Failed to convert micro seconds '%s' to int64: %v", key_value["_tu_"], err))
|
|
||||||
continue
|
continue
|
||||||
}
|
}
|
||||||
timestamp := time.Unix(sec, msec*1000)
|
timestamp := time.Unix(sec, msec*1000)
|
||||||
|
|
||||||
// bytes read
|
// time difference to last time stamp
|
||||||
bytesRead, err := strconv.ParseInt(key_value["_br_"], 10, 64)
|
var timeDiff float64 = 0
|
||||||
|
if lastTime, ok := m.lastTimestamp[filesystem]; !ok {
|
||||||
|
m.lastTimestamp[filesystem] = time.Time{}
|
||||||
|
} else {
|
||||||
|
timeDiff = timestamp.Sub(lastTime).Seconds()
|
||||||
|
}
|
||||||
|
|
||||||
|
// get values of all abs metrics
|
||||||
|
newstate := make(GpfsCollectorState)
|
||||||
|
for _, metric := range GpfsAbsMetrics {
|
||||||
|
value, err := strconv.ParseInt(key_value[metric.prefix], 10, 64)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
cclog.ComponentError(
|
cclog.ComponentError(m.name, fmt.Sprintf("Read(): Failed to convert %s '%s' to int64: %v", metric.desc, key_value[metric.prefix], err))
|
||||||
m.name,
|
|
||||||
fmt.Sprintf("Read(): Failed to convert bytes read '%s' to int64: %v", key_value["_br_"], err))
|
|
||||||
continue
|
continue
|
||||||
}
|
}
|
||||||
if y, err :=
|
newstate[metric.prefix] = value
|
||||||
lp.NewMessage(
|
}
|
||||||
"gpfs_bytes_read",
|
|
||||||
m.tags,
|
// compute total metrics (map[...] will return 0 if key not found)
|
||||||
m.meta,
|
// bytes read and written
|
||||||
map[string]interface{}{
|
if br, br_ok := newstate["_br_"]; br_ok {
|
||||||
"value": bytesRead,
|
newstate["bytesTotal"] = newstate["bytesTotal"] + br
|
||||||
},
|
}
|
||||||
timestamp,
|
if bw, bw_ok := newstate["_bw_"]; bw_ok {
|
||||||
); err == nil {
|
newstate["bytesTotal"] = newstate["bytesTotal"] + bw
|
||||||
y.AddMeta("unit", "bytes")
|
}
|
||||||
output <- y
|
// read and write count
|
||||||
}
|
if rdc, rdc_ok := newstate["_rdc_"]; rdc_ok {
|
||||||
if m.config.SendBandwidths {
|
newstate["iops"] = newstate["iops"] + rdc
|
||||||
if lastBytesRead := m.lastState[filesystem].bytesRead; lastBytesRead >= 0 {
|
}
|
||||||
bwRead := float64(bytesRead-lastBytesRead) / timeDiff
|
if wc, wc_ok := newstate["_wc_"]; wc_ok {
|
||||||
if y, err :=
|
newstate["iops"] = newstate["iops"] + wc
|
||||||
lp.NewMessage(
|
}
|
||||||
"gpfs_bw_read",
|
// meta operations
|
||||||
m.tags,
|
if oc, oc_ok := newstate["_oc_"]; oc_ok {
|
||||||
m.meta,
|
newstate["metaops"] = newstate["metaops"] + oc
|
||||||
map[string]interface{}{
|
}
|
||||||
"value": bwRead,
|
if cc, cc_ok := newstate["_cc_"]; cc_ok {
|
||||||
},
|
newstate["metaops"] = newstate["metaops"] + cc
|
||||||
timestamp,
|
}
|
||||||
); err == nil {
|
if dir, dir_ok := newstate["_dir_"]; dir_ok {
|
||||||
y.AddMeta("unit", "bytes/sec")
|
newstate["metaops"] = newstate["metaops"] + dir
|
||||||
|
}
|
||||||
|
if iu, iu_ok := newstate["_iu_"]; iu_ok {
|
||||||
|
newstate["metaops"] = newstate["metaops"] + iu
|
||||||
|
}
|
||||||
|
// send desired metrics for this filesystem
|
||||||
|
for _, metric := range m.definitions {
|
||||||
|
vold, vold_ok := m.lastState[filesystem][metric.prefix]
|
||||||
|
vnew, vnew_ok := newstate[metric.prefix]
|
||||||
|
var value any
|
||||||
|
value_ok := false
|
||||||
|
switch metric.calc {
|
||||||
|
case "none":
|
||||||
|
if vnew_ok {
|
||||||
|
value = vnew
|
||||||
|
value_ok = true
|
||||||
|
} else if vold_ok {
|
||||||
|
// for absolute values, if the new value is not available, report no change
|
||||||
|
value = vold
|
||||||
|
value_ok = true
|
||||||
|
}
|
||||||
|
case "difference":
|
||||||
|
if vnew_ok && vold_ok {
|
||||||
|
value = vnew - vold
|
||||||
|
if value.(int64) < 0 {
|
||||||
|
value = 0
|
||||||
|
}
|
||||||
|
value_ok = true
|
||||||
|
} else if vold_ok {
|
||||||
|
// if the difference is not computable, return 0
|
||||||
|
value = 0
|
||||||
|
value_ok = true
|
||||||
|
}
|
||||||
|
case "derivative":
|
||||||
|
if vnew_ok && vold_ok && timeDiff > 0 {
|
||||||
|
value = float64(vnew-vold) / timeDiff
|
||||||
|
if value.(float64) < 0 {
|
||||||
|
value = 0
|
||||||
|
}
|
||||||
|
value_ok = true
|
||||||
|
} else if vold_ok {
|
||||||
|
// if the difference is not computable, return 0
|
||||||
|
value = 0
|
||||||
|
value_ok = true
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if value_ok {
|
||||||
|
y, err := lp.NewMetric(metric.name, m.tags, m.meta, value, timestamp)
|
||||||
|
if err == nil {
|
||||||
|
if len(metric.unit) > 0 {
|
||||||
|
y.AddMeta("unit", metric.unit)
|
||||||
|
}
|
||||||
output <- y
|
output <- y
|
||||||
}
|
}
|
||||||
|
} else {
|
||||||
|
// the value could not be computed correctly
|
||||||
|
cclog.ComponentWarn(m.name, fmt.Sprintf("Read(): Could not compute value for filesystem %s of metric %s: vold_ok = %t, vnew_ok = %t", filesystem, metric.name, vold_ok, vnew_ok))
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// bytes written
|
// Save new state, if it contains proper values
|
||||||
bytesWritten, err := strconv.ParseInt(key_value["_bw_"], 10, 64)
|
if len(newstate) > 0 {
|
||||||
if err != nil {
|
m.lastState[filesystem] = newstate
|
||||||
cclog.ComponentError(
|
m.lastTimestamp[filesystem] = timestamp
|
||||||
m.name,
|
|
||||||
fmt.Sprintf("Read(): Failed to convert bytes written '%s' to int64: %v", key_value["_bw_"], err))
|
|
||||||
continue
|
|
||||||
}
|
|
||||||
if y, err :=
|
|
||||||
lp.NewMessage(
|
|
||||||
"gpfs_bytes_written",
|
|
||||||
m.tags,
|
|
||||||
m.meta,
|
|
||||||
map[string]interface{}{
|
|
||||||
"value": bytesWritten,
|
|
||||||
},
|
|
||||||
timestamp,
|
|
||||||
); err == nil {
|
|
||||||
y.AddMeta("unit", "bytes")
|
|
||||||
output <- y
|
|
||||||
}
|
|
||||||
if m.config.SendBandwidths {
|
|
||||||
if lastBytesWritten := m.lastState[filesystem].bytesRead; lastBytesWritten >= 0 {
|
|
||||||
bwWrite := float64(bytesWritten-lastBytesWritten) / timeDiff
|
|
||||||
if y, err :=
|
|
||||||
lp.NewMessage(
|
|
||||||
"gpfs_bw_write",
|
|
||||||
m.tags,
|
|
||||||
m.meta,
|
|
||||||
map[string]interface{}{
|
|
||||||
"value": bwWrite,
|
|
||||||
},
|
|
||||||
timestamp,
|
|
||||||
); err == nil {
|
|
||||||
y.AddMeta("unit", "bytes/sec")
|
|
||||||
output <- y
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
if m.config.SendBandwidths {
|
|
||||||
m.lastState[filesystem] = GpfsCollectorLastState{
|
|
||||||
bytesRead: bytesRead,
|
|
||||||
bytesWritten: bytesWritten,
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
// number of opens
|
|
||||||
numOpens, err := strconv.ParseInt(key_value["_oc_"], 10, 64)
|
|
||||||
if err != nil {
|
|
||||||
cclog.ComponentError(
|
|
||||||
m.name,
|
|
||||||
fmt.Sprintf("Read(): Failed to convert number of opens '%s' to int64: %v", key_value["_oc_"], err))
|
|
||||||
continue
|
|
||||||
}
|
|
||||||
if y, err := lp.NewMessage("gpfs_num_opens", m.tags, m.meta, map[string]interface{}{"value": numOpens}, timestamp); err == nil {
|
|
||||||
output <- y
|
|
||||||
}
|
|
||||||
|
|
||||||
// number of closes
|
|
||||||
numCloses, err := strconv.ParseInt(key_value["_cc_"], 10, 64)
|
|
||||||
if err != nil {
|
|
||||||
cclog.ComponentError(
|
|
||||||
m.name,
|
|
||||||
fmt.Sprintf("Read(): Failed to convert number of closes: '%s' to int64: %v", key_value["_cc_"], err))
|
|
||||||
continue
|
|
||||||
}
|
|
||||||
if y, err := lp.NewMessage("gpfs_num_closes", m.tags, m.meta, map[string]interface{}{"value": numCloses}, timestamp); err == nil {
|
|
||||||
output <- y
|
|
||||||
}
|
|
||||||
|
|
||||||
// number of reads
|
|
||||||
numReads, err := strconv.ParseInt(key_value["_rdc_"], 10, 64)
|
|
||||||
if err != nil {
|
|
||||||
cclog.ComponentError(
|
|
||||||
m.name,
|
|
||||||
fmt.Sprintf("Read(): Failed to convert number of reads: '%s' to int64: %v", key_value["_rdc_"], err))
|
|
||||||
continue
|
|
||||||
}
|
|
||||||
if y, err := lp.NewMessage("gpfs_num_reads", m.tags, m.meta, map[string]interface{}{"value": numReads}, timestamp); err == nil {
|
|
||||||
output <- y
|
|
||||||
}
|
|
||||||
|
|
||||||
// number of writes
|
|
||||||
numWrites, err := strconv.ParseInt(key_value["_wc_"], 10, 64)
|
|
||||||
if err != nil {
|
|
||||||
cclog.ComponentError(
|
|
||||||
m.name,
|
|
||||||
fmt.Sprintf("Read(): Failed to convert number of writes: '%s' to int64: %v", key_value["_wc_"], err))
|
|
||||||
continue
|
|
||||||
}
|
|
||||||
if y, err := lp.NewMessage("gpfs_num_writes", m.tags, m.meta, map[string]interface{}{"value": numWrites}, timestamp); err == nil {
|
|
||||||
output <- y
|
|
||||||
}
|
|
||||||
|
|
||||||
// number of read directories
|
|
||||||
numReaddirs, err := strconv.ParseInt(key_value["_dir_"], 10, 64)
|
|
||||||
if err != nil {
|
|
||||||
cclog.ComponentError(
|
|
||||||
m.name,
|
|
||||||
fmt.Sprintf("Read(): Failed to convert number of read directories: '%s' to int64: %v", key_value["_dir_"], err))
|
|
||||||
continue
|
|
||||||
}
|
|
||||||
if y, err := lp.NewMessage("gpfs_num_readdirs", m.tags, m.meta, map[string]interface{}{"value": numReaddirs}, timestamp); err == nil {
|
|
||||||
output <- y
|
|
||||||
}
|
|
||||||
|
|
||||||
// Number of inode updates
|
|
||||||
numInodeUpdates, err := strconv.ParseInt(key_value["_iu_"], 10, 64)
|
|
||||||
if err != nil {
|
|
||||||
cclog.ComponentError(
|
|
||||||
m.name,
|
|
||||||
fmt.Sprintf("Read(): Failed to convert number of inode updates: '%s' to int: %v", key_value["_iu_"], err))
|
|
||||||
continue
|
|
||||||
}
|
|
||||||
if y, err := lp.NewMessage("gpfs_num_inode_updates", m.tags, m.meta, map[string]interface{}{"value": numInodeUpdates}, timestamp); err == nil {
|
|
||||||
output <- y
|
|
||||||
}
|
|
||||||
|
|
||||||
// Total values
|
|
||||||
if m.config.SendTotalValues {
|
|
||||||
bytesTotal := bytesRead + bytesWritten
|
|
||||||
if y, err :=
|
|
||||||
lp.NewMessage("gpfs_bytes_total",
|
|
||||||
m.tags,
|
|
||||||
m.meta,
|
|
||||||
map[string]interface{}{
|
|
||||||
"value": bytesTotal,
|
|
||||||
},
|
|
||||||
timestamp,
|
|
||||||
); err == nil {
|
|
||||||
y.AddMeta("unit", "bytes")
|
|
||||||
output <- y
|
|
||||||
}
|
|
||||||
iops := numReads + numWrites
|
|
||||||
if y, err :=
|
|
||||||
lp.NewMessage("gpfs_iops",
|
|
||||||
m.tags,
|
|
||||||
m.meta,
|
|
||||||
map[string]interface{}{
|
|
||||||
"value": iops,
|
|
||||||
},
|
|
||||||
timestamp,
|
|
||||||
); err == nil {
|
|
||||||
output <- y
|
|
||||||
}
|
|
||||||
metaops := numInodeUpdates + numCloses + numOpens + numReaddirs
|
|
||||||
if y, err :=
|
|
||||||
lp.NewMessage("gpfs_metaops",
|
|
||||||
m.tags,
|
|
||||||
m.meta,
|
|
||||||
map[string]interface{}{
|
|
||||||
"value": metaops,
|
|
||||||
},
|
|
||||||
timestamp,
|
|
||||||
); err == nil {
|
|
||||||
output <- y
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -12,13 +12,20 @@ hugo_path: docs/reference/cc-metric-collector/collectors/gpfs.md
|
|||||||
## `gpfs` collector
|
## `gpfs` collector
|
||||||
|
|
||||||
```json
|
```json
|
||||||
"ibstat": {
|
"gpfs": {
|
||||||
"mmpmon_path": "/path/to/mmpmon",
|
"mmpmon_path": "/path/to/mmpmon",
|
||||||
|
"use_sudo": "true",
|
||||||
"exclude_filesystem": [
|
"exclude_filesystem": [
|
||||||
"fs1"
|
"fs1"
|
||||||
],
|
],
|
||||||
"send_bandwidths": true,
|
"exclude_metrics": [
|
||||||
"send_total_values": true
|
"gpfs_bytes_written"
|
||||||
|
],
|
||||||
|
"send_abs_values": true,
|
||||||
|
"send_diff_values": true,
|
||||||
|
"send_derived_values": true,
|
||||||
|
"send_total_values": true,
|
||||||
|
"send_bandwidths": true
|
||||||
}
|
}
|
||||||
```
|
```
|
||||||
|
|
||||||
@@ -27,24 +34,50 @@ GPFS / IBM Spectrum Scale filesystems.
|
|||||||
|
|
||||||
The reported filesystems can be filtered with the `exclude_filesystem` option
|
The reported filesystems can be filtered with the `exclude_filesystem` option
|
||||||
in the configuration.
|
in the configuration.
|
||||||
|
Individual metrics can be excluded from reporting with the `exclude_metrics` option.
|
||||||
|
|
||||||
The path to the `mmpmon` command can be configured with the `mmpmon_path` option
|
The path to the `mmpmon` command can be configured with the `mmpmon_path` option
|
||||||
in the configuration. If nothing is set, the collector searches in `$PATH` for `mmpmon`.
|
in the configuration. If nothing is set, the collector searches in `$PATH` for `mmpmon`.
|
||||||
|
|
||||||
|
If `cc-metric-collector` runs as a non-root user, `mmpmon` can be executed via password-less `sudo` by enabling the `use_sudo` option.
|
||||||
|
Because `mmpmon` is by default only executable as root, the Go procedure to
|
||||||
|
search for it in `$PATH` will fail. If you use `sudo`, you must specify the
|
||||||
|
complete path for `mmpmon` using the parameter `mmpmon_path`.
|
||||||
|
|
||||||
|
|
||||||
Metrics:
|
Metrics:
|
||||||
* `gpfs_bytes_read`
|
* `gpfs_bytes_read` (if `send_abs_values == true`)
|
||||||
* `gpfs_bytes_written`
|
* `gpfs_bytes_written` (if `send_abs_values == true`)
|
||||||
* `gpfs_num_opens`
|
* `gpfs_num_opens` (if `send_abs_values == true`)
|
||||||
* `gpfs_num_closes`
|
* `gpfs_num_closes` (if `send_abs_values == true`)
|
||||||
* `gpfs_num_reads`
|
* `gpfs_num_reads` (if `send_abs_values == true`)
|
||||||
* `gpfs_num_writes`
|
* `gpfs_num_writes` (if `send_abs_values == true`)
|
||||||
* `gpfs_num_readdirs`
|
* `gpfs_num_readdirs` (if `send_abs_values == true`)
|
||||||
* `gpfs_num_inode_updates`
|
* `gpfs_num_inode_updates` (if `send_abs_values == true`)
|
||||||
* `gpfs_bytes_total = gpfs_bytes_read + gpfs_bytes_written` (if `send_total_values == true`)
|
* `gpfs_bytes_read_diff` (if `send_diff_values == true`)
|
||||||
* `gpfs_iops = gpfs_num_reads + gpfs_num_writes` (if `send_total_values == true`)
|
* `gpfs_bytes_written_diff` (if `send_diff_values == true`)
|
||||||
* `gpfs_metaops = gpfs_num_inode_updates + gpfs_num_closes + gpfs_num_opens + gpfs_num_readdirs` (if `send_total_values == true`)
|
* `gpfs_num_opens_diff` (if `send_diff_values == true`)
|
||||||
* `gpfs_bw_read` (if `send_bandwidths == true`)
|
* `gpfs_num_closes_diff` (if `send_diff_values == true`)
|
||||||
* `gpfs_bw_write` (if `send_bandwidths == true`)
|
* `gpfs_num_reads_diff` (if `send_diff_values == true`)
|
||||||
|
* `gpfs_num_writes_diff` (if `send_diff_values == true`)
|
||||||
|
* `gpfs_num_readdirs_diff` (if `send_diff_values == true`)
|
||||||
|
* `gpfs_num_inode_updates_diff` (if `send_diff_values == true`)
|
||||||
|
* `gpfs_bw_read` (if `send_derived_values == true` or `send_bandwidths == true`)
|
||||||
|
* `gpfs_bw_write` (if `send_derived_values == true` or `send_bandwidths == true`)
|
||||||
|
* `gpfs_opens_rate` (if `send_derived_values == true`)
|
||||||
|
* `gpfs_closes_rate` (if `send_derived_values == true`)
|
||||||
|
* `gpfs_reads_rate` (if `send_derived_values == true`)
|
||||||
|
* `gpfs_writes_rate` (if `send_derived_values == true`)
|
||||||
|
* `gpfs_readdirs_rate` (if `send_derived_values == true`)
|
||||||
|
* `gpfs_inode_updates_rate` (if `send_derived_values == true`)
|
||||||
|
* `gpfs_bytes_total = gpfs_bytes_read + gpfs_bytes_written` (if `send_total_values == true` and `send_abs_values == true`)
|
||||||
|
* `gpfs_bytes_total_diff` (if `send_total_values == true` and `send_diff_values == true`)
|
||||||
|
* `gpfs_bw_total` (if (`send_total_values == true` and `send_derived_values == true`) or `send_bandwidths == true`)
|
||||||
|
* `gpfs_iops = gpfs_num_reads + gpfs_num_writes` (if `send_total_values == true` and `send_abs_values == true`)
|
||||||
|
* `gpfs_iops_diff` (if `send_total_values == true` and `send_diff_values == true`)
|
||||||
|
* `gpfs_iops_rate` (if `send_total_values == true` and `send_derived_values == true`)
|
||||||
|
* `gpfs_metaops = gpfs_num_inode_updates + gpfs_num_closes + gpfs_num_opens + gpfs_num_readdirs` (if `send_total_values == true` and `send_abs_values == true`)
|
||||||
|
* `gpfs_metaops_diff` (if `send_total_values == true` and `send_diff_values == true`)
|
||||||
|
* `gpfs_metaops_rate` (if `send_total_values == true` and `send_derived_values == true`)
|
||||||
|
|
||||||
The collector adds a `filesystem` tag to all metrics
|
The collector adds a `filesystem` tag to all metrics
|
||||||
|
|||||||
@@ -1,11 +1,19 @@
|
|||||||
|
// Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
|
||||||
|
// All rights reserved. This file is part of cc-lib.
|
||||||
|
// Use of this source code is governed by a MIT-style
|
||||||
|
// license that can be found in the LICENSE file.
|
||||||
|
// additional authors:
|
||||||
|
// Holger Obermaier (NHR@KIT)
|
||||||
|
|
||||||
package collectors
|
package collectors
|
||||||
|
|
||||||
import (
|
import (
|
||||||
"fmt"
|
"fmt"
|
||||||
"os"
|
"os"
|
||||||
|
"slices"
|
||||||
|
|
||||||
cclog "github.com/ClusterCockpit/cc-lib/ccLogger"
|
cclog "github.com/ClusterCockpit/cc-lib/v2/ccLogger"
|
||||||
lp "github.com/ClusterCockpit/cc-lib/ccMessage"
|
lp "github.com/ClusterCockpit/cc-lib/v2/ccMessage"
|
||||||
"golang.org/x/sys/unix"
|
"golang.org/x/sys/unix"
|
||||||
|
|
||||||
"encoding/json"
|
"encoding/json"
|
||||||
@@ -58,7 +66,9 @@ func (m *InfinibandCollector) Init(config json.RawMessage) error {
|
|||||||
|
|
||||||
var err error
|
var err error
|
||||||
m.name = "InfinibandCollector"
|
m.name = "InfinibandCollector"
|
||||||
m.setup()
|
if err := m.setup(); err != nil {
|
||||||
|
return fmt.Errorf("%s Init(): setup() call failed: %w", m.name, err)
|
||||||
|
}
|
||||||
m.parallel = true
|
m.parallel = true
|
||||||
m.meta = map[string]string{
|
m.meta = map[string]string{
|
||||||
"source": m.name,
|
"source": m.name,
|
||||||
@@ -104,14 +114,7 @@ func (m *InfinibandCollector) Init(config json.RawMessage) error {
|
|||||||
port := pathSplit[6]
|
port := pathSplit[6]
|
||||||
|
|
||||||
// Skip excluded devices
|
// Skip excluded devices
|
||||||
skip := false
|
if slices.Contains(m.config.ExcludeDevices, device) {
|
||||||
for _, excludedDevice := range m.config.ExcludeDevices {
|
|
||||||
if excludedDevice == device {
|
|
||||||
skip = true
|
|
||||||
break
|
|
||||||
}
|
|
||||||
}
|
|
||||||
if skip {
|
|
||||||
continue
|
continue
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -234,7 +237,7 @@ func (m *InfinibandCollector) Read(interval time.Duration, output chan lp.CCMess
|
|||||||
counterDef.name,
|
counterDef.name,
|
||||||
info.tagSet,
|
info.tagSet,
|
||||||
m.meta,
|
m.meta,
|
||||||
map[string]interface{}{
|
map[string]any{
|
||||||
"value": counterDef.currentState,
|
"value": counterDef.currentState,
|
||||||
},
|
},
|
||||||
now); err == nil {
|
now); err == nil {
|
||||||
@@ -252,7 +255,7 @@ func (m *InfinibandCollector) Read(interval time.Duration, output chan lp.CCMess
|
|||||||
counterDef.name+"_bw",
|
counterDef.name+"_bw",
|
||||||
info.tagSet,
|
info.tagSet,
|
||||||
m.meta,
|
m.meta,
|
||||||
map[string]interface{}{
|
map[string]any{
|
||||||
"value": rate,
|
"value": rate,
|
||||||
},
|
},
|
||||||
now); err == nil {
|
now); err == nil {
|
||||||
@@ -282,7 +285,7 @@ func (m *InfinibandCollector) Read(interval time.Duration, output chan lp.CCMess
|
|||||||
"ib_total",
|
"ib_total",
|
||||||
info.tagSet,
|
info.tagSet,
|
||||||
m.meta,
|
m.meta,
|
||||||
map[string]interface{}{
|
map[string]any{
|
||||||
"value": ib_total,
|
"value": ib_total,
|
||||||
},
|
},
|
||||||
now); err == nil {
|
now); err == nil {
|
||||||
@@ -295,7 +298,7 @@ func (m *InfinibandCollector) Read(interval time.Duration, output chan lp.CCMess
|
|||||||
"ib_total_pkts",
|
"ib_total_pkts",
|
||||||
info.tagSet,
|
info.tagSet,
|
||||||
m.meta,
|
m.meta,
|
||||||
map[string]interface{}{
|
map[string]any{
|
||||||
"value": ib_total_pkts,
|
"value": ib_total_pkts,
|
||||||
},
|
},
|
||||||
now); err == nil {
|
now); err == nil {
|
||||||
|
|||||||
@@ -1,28 +1,36 @@
|
|||||||
|
// Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
|
||||||
|
// All rights reserved. This file is part of cc-lib.
|
||||||
|
// Use of this source code is governed by a MIT-style
|
||||||
|
// license that can be found in the LICENSE file.
|
||||||
|
// additional authors:
|
||||||
|
// Holger Obermaier (NHR@KIT)
|
||||||
|
|
||||||
package collectors
|
package collectors
|
||||||
|
|
||||||
import (
|
import (
|
||||||
"bufio"
|
"bufio"
|
||||||
"encoding/json"
|
"encoding/json"
|
||||||
"errors"
|
"errors"
|
||||||
|
"fmt"
|
||||||
"os"
|
"os"
|
||||||
|
"slices"
|
||||||
"strconv"
|
"strconv"
|
||||||
"strings"
|
"strings"
|
||||||
"time"
|
"time"
|
||||||
|
|
||||||
cclog "github.com/ClusterCockpit/cc-lib/ccLogger"
|
cclog "github.com/ClusterCockpit/cc-lib/v2/ccLogger"
|
||||||
lp "github.com/ClusterCockpit/cc-lib/ccMessage"
|
lp "github.com/ClusterCockpit/cc-lib/v2/ccMessage"
|
||||||
)
|
)
|
||||||
|
|
||||||
// Constant for the path to /proc/diskstats
|
|
||||||
const IOSTATFILE = `/proc/diskstats`
|
const IOSTATFILE = `/proc/diskstats`
|
||||||
|
|
||||||
type IOstatCollectorConfig struct {
|
type IOstatCollectorConfig struct {
|
||||||
ExcludeMetrics []string `json:"exclude_metrics,omitempty"`
|
ExcludeMetrics []string `json:"exclude_metrics,omitempty"`
|
||||||
// New field for excluding devices via the JSON configuration
|
|
||||||
ExcludeDevices []string `json:"exclude_devices,omitempty"`
|
ExcludeDevices []string `json:"exclude_devices,omitempty"`
|
||||||
}
|
}
|
||||||
|
|
||||||
type IOstatCollectorEntry struct {
|
type IOstatCollectorEntry struct {
|
||||||
|
currentValues map[string]int64
|
||||||
lastValues map[string]int64
|
lastValues map[string]int64
|
||||||
tags map[string]string
|
tags map[string]string
|
||||||
}
|
}
|
||||||
@@ -39,7 +47,9 @@ func (m *IOstatCollector) Init(config json.RawMessage) error {
|
|||||||
m.name = "IOstatCollector"
|
m.name = "IOstatCollector"
|
||||||
m.parallel = true
|
m.parallel = true
|
||||||
m.meta = map[string]string{"source": m.name, "group": "Disk"}
|
m.meta = map[string]string{"source": m.name, "group": "Disk"}
|
||||||
m.setup()
|
if err := m.setup(); err != nil {
|
||||||
|
return fmt.Errorf("%s Init(): setup() call failed: %w", m.name, err)
|
||||||
|
}
|
||||||
if len(config) > 0 {
|
if len(config) > 0 {
|
||||||
err = json.Unmarshal(config, &m.config)
|
err = json.Unmarshal(config, &m.config)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
@@ -69,7 +79,7 @@ func (m *IOstatCollector) Init(config json.RawMessage) error {
|
|||||||
m.devices = make(map[string]IOstatCollectorEntry)
|
m.devices = make(map[string]IOstatCollectorEntry)
|
||||||
m.matches = make(map[string]int)
|
m.matches = make(map[string]int)
|
||||||
for k, v := range matches {
|
for k, v := range matches {
|
||||||
if _, skip := stringArrayContains(m.config.ExcludeMetrics, k); !skip {
|
if !slices.Contains(m.config.ExcludeMetrics, k) {
|
||||||
m.matches[k] = v
|
m.matches[k] = v
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -78,10 +88,8 @@ func (m *IOstatCollector) Init(config json.RawMessage) error {
|
|||||||
}
|
}
|
||||||
file, err := os.Open(IOSTATFILE)
|
file, err := os.Open(IOSTATFILE)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
cclog.ComponentError(m.name, err.Error())
|
return fmt.Errorf("%s Init(): Failed to open file \"%s\": %w", m.name, IOSTATFILE, err)
|
||||||
return err
|
|
||||||
}
|
}
|
||||||
defer file.Close()
|
|
||||||
|
|
||||||
scanner := bufio.NewScanner(file)
|
scanner := bufio.NewScanner(file)
|
||||||
for scanner.Scan() {
|
for scanner.Scan() {
|
||||||
@@ -95,21 +103,36 @@ func (m *IOstatCollector) Init(config json.RawMessage) error {
|
|||||||
if strings.Contains(device, "loop") {
|
if strings.Contains(device, "loop") {
|
||||||
continue
|
continue
|
||||||
}
|
}
|
||||||
if _, skip := stringArrayContains(m.config.ExcludeDevices, device); skip {
|
if slices.Contains(m.config.ExcludeDevices, device) {
|
||||||
continue
|
continue
|
||||||
}
|
}
|
||||||
values := make(map[string]int64)
|
currentValues := make(map[string]int64)
|
||||||
|
lastValues := make(map[string]int64)
|
||||||
for m := range m.matches {
|
for m := range m.matches {
|
||||||
values[m] = 0
|
currentValues[m] = 0
|
||||||
|
lastValues[m] = 0
|
||||||
|
}
|
||||||
|
for name, idx := range m.matches {
|
||||||
|
if idx < len(linefields) {
|
||||||
|
if value, err := strconv.ParseInt(linefields[idx], 0, 64); err == nil {
|
||||||
|
currentValues[name] = value
|
||||||
|
lastValues[name] = value // Set last to current for first read
|
||||||
|
}
|
||||||
|
}
|
||||||
}
|
}
|
||||||
m.devices[device] = IOstatCollectorEntry{
|
m.devices[device] = IOstatCollectorEntry{
|
||||||
tags: map[string]string{
|
tags: map[string]string{
|
||||||
"device": device,
|
"device": device,
|
||||||
"type": "node",
|
"type": "node",
|
||||||
},
|
},
|
||||||
lastValues: values,
|
currentValues: currentValues,
|
||||||
|
lastValues: lastValues,
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
if err := file.Close(); err != nil {
|
||||||
|
return fmt.Errorf("%s Init(): Failed to close file \"%s\": %w", m.name, IOSTATFILE, err)
|
||||||
|
}
|
||||||
|
|
||||||
m.init = true
|
m.init = true
|
||||||
return err
|
return err
|
||||||
}
|
}
|
||||||
@@ -121,10 +144,18 @@ func (m *IOstatCollector) Read(interval time.Duration, output chan lp.CCMessage)
|
|||||||
|
|
||||||
file, err := os.Open(IOSTATFILE)
|
file, err := os.Open(IOSTATFILE)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
cclog.ComponentError(m.name, err.Error())
|
cclog.ComponentError(
|
||||||
|
m.name,
|
||||||
|
fmt.Sprintf("Read(): Failed to open file '%s': %v", IOSTATFILE, err))
|
||||||
return
|
return
|
||||||
}
|
}
|
||||||
defer file.Close()
|
defer func() {
|
||||||
|
if err := file.Close(); err != nil {
|
||||||
|
cclog.ComponentError(
|
||||||
|
m.name,
|
||||||
|
fmt.Sprintf("Read(): Failed to close file '%s': %v", IOSTATFILE, err))
|
||||||
|
}
|
||||||
|
}()
|
||||||
|
|
||||||
scanner := bufio.NewScanner(file)
|
scanner := bufio.NewScanner(file)
|
||||||
for scanner.Scan() {
|
for scanner.Scan() {
|
||||||
@@ -140,24 +171,28 @@ func (m *IOstatCollector) Read(interval time.Duration, output chan lp.CCMessage)
|
|||||||
if strings.Contains(device, "loop") {
|
if strings.Contains(device, "loop") {
|
||||||
continue
|
continue
|
||||||
}
|
}
|
||||||
if _, skip := stringArrayContains(m.config.ExcludeDevices, device); skip {
|
if slices.Contains(m.config.ExcludeDevices, device) {
|
||||||
continue
|
continue
|
||||||
}
|
}
|
||||||
if _, ok := m.devices[device]; !ok {
|
if _, ok := m.devices[device]; !ok {
|
||||||
continue
|
continue
|
||||||
}
|
}
|
||||||
|
// Update current and last values
|
||||||
entry := m.devices[device]
|
entry := m.devices[device]
|
||||||
for name, idx := range m.matches {
|
for name, idx := range m.matches {
|
||||||
if idx < len(linefields) {
|
if idx < len(linefields) {
|
||||||
x, err := strconv.ParseInt(linefields[idx], 0, 64)
|
x, err := strconv.ParseInt(linefields[idx], 0, 64)
|
||||||
if err == nil {
|
if err == nil {
|
||||||
diff := x - entry.lastValues[name]
|
// Calculate the difference between the new value and the previously stored current value
|
||||||
y, err := lp.NewMessage(name, entry.tags, m.meta, map[string]interface{}{"value": int(diff)}, time.Now())
|
diff := x - entry.currentValues[name]
|
||||||
|
y, err := lp.NewMetric(name, entry.tags, m.meta, int(diff), time.Now())
|
||||||
if err == nil {
|
if err == nil {
|
||||||
output <- y
|
output <- y
|
||||||
}
|
}
|
||||||
|
// Update last to previous current, and current to new value
|
||||||
|
entry.lastValues[name] = entry.currentValues[name]
|
||||||
|
entry.currentValues[name] = x
|
||||||
}
|
}
|
||||||
entry.lastValues[name] = x
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
m.devices[device] = entry
|
m.devices[device] = entry
|
||||||
|
|||||||
@@ -1,3 +1,10 @@
|
|||||||
|
// Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
|
||||||
|
// All rights reserved. This file is part of cc-lib.
|
||||||
|
// Use of this source code is governed by a MIT-style
|
||||||
|
// license that can be found in the LICENSE file.
|
||||||
|
// additional authors:
|
||||||
|
// Holger Obermaier (NHR@KIT)
|
||||||
|
|
||||||
package collectors
|
package collectors
|
||||||
|
|
||||||
import (
|
import (
|
||||||
@@ -7,14 +14,13 @@ import (
|
|||||||
"errors"
|
"errors"
|
||||||
"fmt"
|
"fmt"
|
||||||
"io"
|
"io"
|
||||||
"log"
|
|
||||||
"os/exec"
|
"os/exec"
|
||||||
"strconv"
|
"strconv"
|
||||||
"strings"
|
"strings"
|
||||||
"time"
|
"time"
|
||||||
|
|
||||||
cclog "github.com/ClusterCockpit/cc-lib/ccLogger"
|
cclog "github.com/ClusterCockpit/cc-lib/v2/ccLogger"
|
||||||
lp "github.com/ClusterCockpit/cc-lib/ccMessage"
|
lp "github.com/ClusterCockpit/cc-lib/v2/ccMessage"
|
||||||
)
|
)
|
||||||
|
|
||||||
const IPMISENSORS_PATH = `ipmi-sensors`
|
const IPMISENSORS_PATH = `ipmi-sensors`
|
||||||
@@ -37,7 +43,9 @@ func (m *IpmiCollector) Init(config json.RawMessage) error {
|
|||||||
}
|
}
|
||||||
|
|
||||||
m.name = "IpmiCollector"
|
m.name = "IpmiCollector"
|
||||||
m.setup()
|
if err := m.setup(); err != nil {
|
||||||
|
return fmt.Errorf("%s Init(): setup() call failed: %w", m.name, err)
|
||||||
|
}
|
||||||
m.parallel = true
|
m.parallel = true
|
||||||
m.meta = map[string]string{
|
m.meta = map[string]string{
|
||||||
"source": m.name,
|
"source": m.name,
|
||||||
@@ -109,19 +117,20 @@ func (m *IpmiCollector) readIpmiTool(cmd string, output chan lp.CCMessage) {
|
|||||||
}
|
}
|
||||||
v, err := strconv.ParseFloat(strings.TrimSpace(lv[1]), 64)
|
v, err := strconv.ParseFloat(strings.TrimSpace(lv[1]), 64)
|
||||||
if err == nil {
|
if err == nil {
|
||||||
name := strings.ToLower(strings.Replace(strings.TrimSpace(lv[0]), " ", "_", -1))
|
name := strings.ToLower(strings.ReplaceAll(strings.TrimSpace(lv[0]), " ", "_"))
|
||||||
unit := strings.TrimSpace(lv[2])
|
unit := strings.TrimSpace(lv[2])
|
||||||
if unit == "Volts" {
|
switch unit {
|
||||||
|
case "Volts":
|
||||||
unit = "Volts"
|
unit = "Volts"
|
||||||
} else if unit == "degrees C" {
|
case "degrees C":
|
||||||
unit = "degC"
|
unit = "degC"
|
||||||
} else if unit == "degrees F" {
|
case "degrees F":
|
||||||
unit = "degF"
|
unit = "degF"
|
||||||
} else if unit == "Watts" {
|
case "Watts":
|
||||||
unit = "Watts"
|
unit = "Watts"
|
||||||
}
|
}
|
||||||
|
|
||||||
y, err := lp.NewMessage(name, map[string]string{"type": "node"}, m.meta, map[string]interface{}{"value": v}, time.Now())
|
y, err := lp.NewMessage(name, map[string]string{"type": "node"}, m.meta, map[string]any{"value": v}, time.Now())
|
||||||
if err == nil {
|
if err == nil {
|
||||||
y.AddMeta("unit", unit)
|
y.AddMeta("unit", unit)
|
||||||
output <- y
|
output <- y
|
||||||
@@ -143,23 +152,30 @@ func (m *IpmiCollector) readIpmiTool(cmd string, output chan lp.CCMessage) {
|
|||||||
|
|
||||||
func (m *IpmiCollector) readIpmiSensors(cmd string, output chan lp.CCMessage) {
|
func (m *IpmiCollector) readIpmiSensors(cmd string, output chan lp.CCMessage) {
|
||||||
|
|
||||||
|
// Setup ipmisensors command
|
||||||
command := exec.Command(cmd, "--comma-separated-output", "--sdr-cache-recreate")
|
command := exec.Command(cmd, "--comma-separated-output", "--sdr-cache-recreate")
|
||||||
command.Wait()
|
stdout, _ := command.StdoutPipe()
|
||||||
stdout, err := command.Output()
|
errBuf := new(bytes.Buffer)
|
||||||
if err != nil {
|
command.Stderr = errBuf
|
||||||
log.Print(err)
|
|
||||||
|
// start command
|
||||||
|
if err := command.Start(); err != nil {
|
||||||
|
cclog.ComponentError(
|
||||||
|
m.name,
|
||||||
|
fmt.Sprintf("readIpmiSensors(): Failed to start command \"%s\": %v", command.String(), err),
|
||||||
|
)
|
||||||
return
|
return
|
||||||
}
|
}
|
||||||
|
|
||||||
ll := strings.Split(string(stdout), "\n")
|
// Read command output
|
||||||
|
scanner := bufio.NewScanner(stdout)
|
||||||
for _, line := range ll {
|
for scanner.Scan() {
|
||||||
lv := strings.Split(line, ",")
|
lv := strings.Split(scanner.Text(), ",")
|
||||||
if len(lv) > 3 {
|
if len(lv) > 3 {
|
||||||
v, err := strconv.ParseFloat(lv[3], 64)
|
v, err := strconv.ParseFloat(lv[3], 64)
|
||||||
if err == nil {
|
if err == nil {
|
||||||
name := strings.ToLower(strings.Replace(lv[1], " ", "_", -1))
|
name := strings.ToLower(strings.ReplaceAll(lv[1], " ", "_"))
|
||||||
y, err := lp.NewMessage(name, map[string]string{"type": "node"}, m.meta, map[string]interface{}{"value": v}, time.Now())
|
y, err := lp.NewMessage(name, map[string]string{"type": "node"}, m.meta, map[string]any{"value": v}, time.Now())
|
||||||
if err == nil {
|
if err == nil {
|
||||||
if len(lv) > 4 {
|
if len(lv) > 4 {
|
||||||
y.AddMeta("unit", lv[4])
|
y.AddMeta("unit", lv[4])
|
||||||
@@ -169,6 +185,18 @@ func (m *IpmiCollector) readIpmiSensors(cmd string, output chan lp.CCMessage) {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Wait for command end
|
||||||
|
if err := command.Wait(); err != nil {
|
||||||
|
errMsg, _ := io.ReadAll(errBuf)
|
||||||
|
cclog.ComponentError(
|
||||||
|
m.name,
|
||||||
|
fmt.Sprintf("readIpmiSensors(): Failed to wait for the end of command \"%s\": %v\n", command.String(), err),
|
||||||
|
)
|
||||||
|
cclog.ComponentError(m.name, fmt.Sprintf("readIpmiSensors(): command stderr: \"%s\"\n", strings.TrimSpace(string(errMsg))))
|
||||||
|
return
|
||||||
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
func (m *IpmiCollector) Read(interval time.Duration, output chan lp.CCMessage) {
|
func (m *IpmiCollector) Read(interval time.Duration, output chan lp.CCMessage) {
|
||||||
|
|||||||
@@ -1,3 +1,10 @@
|
|||||||
|
// Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
|
||||||
|
// All rights reserved. This file is part of cc-lib.
|
||||||
|
// Use of this source code is governed by a MIT-style
|
||||||
|
// license that can be found in the LICENSE file.
|
||||||
|
// additional authors:
|
||||||
|
// Holger Obermaier (NHR@KIT)
|
||||||
|
|
||||||
package collectors
|
package collectors
|
||||||
|
|
||||||
/*
|
/*
|
||||||
@@ -12,6 +19,7 @@ import (
|
|||||||
"encoding/json"
|
"encoding/json"
|
||||||
"errors"
|
"errors"
|
||||||
"fmt"
|
"fmt"
|
||||||
|
"maps"
|
||||||
"math"
|
"math"
|
||||||
"os"
|
"os"
|
||||||
"os/signal"
|
"os/signal"
|
||||||
@@ -24,8 +32,8 @@ import (
|
|||||||
"time"
|
"time"
|
||||||
"unsafe"
|
"unsafe"
|
||||||
|
|
||||||
cclog "github.com/ClusterCockpit/cc-lib/ccLogger"
|
cclog "github.com/ClusterCockpit/cc-lib/v2/ccLogger"
|
||||||
lp "github.com/ClusterCockpit/cc-lib/ccMessage"
|
lp "github.com/ClusterCockpit/cc-lib/v2/ccMessage"
|
||||||
agg "github.com/ClusterCockpit/cc-metric-collector/internal/metricAggregator"
|
agg "github.com/ClusterCockpit/cc-metric-collector/internal/metricAggregator"
|
||||||
topo "github.com/ClusterCockpit/cc-metric-collector/pkg/ccTopology"
|
topo "github.com/ClusterCockpit/cc-metric-collector/pkg/ccTopology"
|
||||||
"github.com/NVIDIA/go-nvml/pkg/dl"
|
"github.com/NVIDIA/go-nvml/pkg/dl"
|
||||||
@@ -180,7 +188,7 @@ func getBaseFreq() float64 {
|
|||||||
for _, f := range files {
|
for _, f := range files {
|
||||||
buffer, err := os.ReadFile(f)
|
buffer, err := os.ReadFile(f)
|
||||||
if err == nil {
|
if err == nil {
|
||||||
data := strings.Replace(string(buffer), "\n", "", -1)
|
data := strings.ReplaceAll(string(buffer), "\n", "")
|
||||||
x, err := strconv.ParseInt(data, 0, 64)
|
x, err := strconv.ParseInt(data, 0, 64)
|
||||||
if err == nil {
|
if err == nil {
|
||||||
freq = float64(x)
|
freq = float64(x)
|
||||||
@@ -223,9 +231,13 @@ func (m *LikwidCollector) Init(config json.RawMessage) error {
|
|||||||
|
|
||||||
if m.config.ForceOverwrite {
|
if m.config.ForceOverwrite {
|
||||||
cclog.ComponentDebug(m.name, "Set LIKWID_FORCE=1")
|
cclog.ComponentDebug(m.name, "Set LIKWID_FORCE=1")
|
||||||
os.Setenv("LIKWID_FORCE", "1")
|
if err := os.Setenv("LIKWID_FORCE", "1"); err != nil {
|
||||||
|
return fmt.Errorf("error setting environment variable LIKWID_FORCE=1: %v", err)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if err := m.setup(); err != nil {
|
||||||
|
return fmt.Errorf("%s Init(): setup() call failed: %w", m.name, err)
|
||||||
}
|
}
|
||||||
m.setup()
|
|
||||||
|
|
||||||
m.meta = map[string]string{"group": "PerfCounter"}
|
m.meta = map[string]string{"group": "PerfCounter"}
|
||||||
cclog.ComponentDebug(m.name, "Get cpulist and init maps and lists")
|
cclog.ComponentDebug(m.name, "Get cpulist and init maps and lists")
|
||||||
@@ -309,7 +321,14 @@ func (m *LikwidCollector) Init(config json.RawMessage) error {
|
|||||||
case "accessdaemon":
|
case "accessdaemon":
|
||||||
if len(m.config.DaemonPath) > 0 {
|
if len(m.config.DaemonPath) > 0 {
|
||||||
p := os.Getenv("PATH")
|
p := os.Getenv("PATH")
|
||||||
os.Setenv("PATH", m.config.DaemonPath+":"+p)
|
if len(p) > 0 {
|
||||||
|
p = m.config.DaemonPath + ":" + p
|
||||||
|
} else {
|
||||||
|
p = m.config.DaemonPath
|
||||||
|
}
|
||||||
|
if err := os.Setenv("PATH", p); err != nil {
|
||||||
|
return fmt.Errorf("error setting environment variable PATH=%s: %v", p, err)
|
||||||
|
}
|
||||||
}
|
}
|
||||||
C.HPMmode(1)
|
C.HPMmode(1)
|
||||||
retCode := C.HPMinit()
|
retCode := C.HPMinit()
|
||||||
@@ -368,10 +387,18 @@ func (m *LikwidCollector) takeMeasurement(evidx int, evset LikwidEventsetConfig,
|
|||||||
// Watch changes for the lock file ()
|
// Watch changes for the lock file ()
|
||||||
watcher, err := fsnotify.NewWatcher()
|
watcher, err := fsnotify.NewWatcher()
|
||||||
if err != nil {
|
if err != nil {
|
||||||
cclog.ComponentError(m.name, err.Error())
|
cclog.ComponentError(
|
||||||
|
m.name,
|
||||||
|
fmt.Sprintf("takeMeasurement(): Failed to create a new fsnotify.Watcher: %v", err))
|
||||||
return true, err
|
return true, err
|
||||||
}
|
}
|
||||||
defer watcher.Close()
|
defer func() {
|
||||||
|
if err := watcher.Close(); err != nil {
|
||||||
|
cclog.ComponentError(
|
||||||
|
m.name,
|
||||||
|
fmt.Sprintf("takeMeasurement(): Failed to close fsnotify.Watcher: %v", err))
|
||||||
|
}
|
||||||
|
}()
|
||||||
if len(m.config.LockfilePath) > 0 {
|
if len(m.config.LockfilePath) > 0 {
|
||||||
// Check if the lock file exists
|
// Check if the lock file exists
|
||||||
info, err := os.Stat(m.config.LockfilePath)
|
info, err := os.Stat(m.config.LockfilePath)
|
||||||
@@ -381,7 +408,9 @@ func (m *LikwidCollector) takeMeasurement(evidx int, evset LikwidEventsetConfig,
|
|||||||
if createErr != nil {
|
if createErr != nil {
|
||||||
return true, fmt.Errorf("failed to create lock file: %v", createErr)
|
return true, fmt.Errorf("failed to create lock file: %v", createErr)
|
||||||
}
|
}
|
||||||
file.Close()
|
if err := file.Close(); err != nil {
|
||||||
|
return true, fmt.Errorf("failed to close lock file: %v", err)
|
||||||
|
}
|
||||||
info, err = os.Stat(m.config.LockfilePath) // Recheck the file after creation
|
info, err = os.Stat(m.config.LockfilePath) // Recheck the file after creation
|
||||||
}
|
}
|
||||||
if err != nil {
|
if err != nil {
|
||||||
@@ -588,7 +617,6 @@ func (m *LikwidCollector) calcEventsetMetrics(evset LikwidEventsetConfig, interv
|
|||||||
evset.metrics[tid][metric.Name] = value
|
evset.metrics[tid][metric.Name] = value
|
||||||
// Now we have the result, send it with the proper tags
|
// Now we have the result, send it with the proper tags
|
||||||
if !math.IsNaN(value) && metric.Publish {
|
if !math.IsNaN(value) && metric.Publish {
|
||||||
fields := map[string]interface{}{"value": value}
|
|
||||||
y, err :=
|
y, err :=
|
||||||
lp.NewMessage(
|
lp.NewMessage(
|
||||||
metric.Name,
|
metric.Name,
|
||||||
@@ -596,7 +624,9 @@ func (m *LikwidCollector) calcEventsetMetrics(evset LikwidEventsetConfig, interv
|
|||||||
"type": metric.Type,
|
"type": metric.Type,
|
||||||
},
|
},
|
||||||
m.meta,
|
m.meta,
|
||||||
fields,
|
map[string]any{
|
||||||
|
"value": value,
|
||||||
|
},
|
||||||
now,
|
now,
|
||||||
)
|
)
|
||||||
if err == nil {
|
if err == nil {
|
||||||
@@ -634,7 +664,7 @@ func (m *LikwidCollector) calcEventsetMetrics(evset LikwidEventsetConfig, interv
|
|||||||
"type-id": fmt.Sprintf("%d", coreID),
|
"type-id": fmt.Sprintf("%d", coreID),
|
||||||
},
|
},
|
||||||
m.meta,
|
m.meta,
|
||||||
map[string]interface{}{
|
map[string]any{
|
||||||
"value": value,
|
"value": value,
|
||||||
},
|
},
|
||||||
now,
|
now,
|
||||||
@@ -671,7 +701,7 @@ func (m *LikwidCollector) calcEventsetMetrics(evset LikwidEventsetConfig, interv
|
|||||||
"type-id": fmt.Sprintf("%d", socketID),
|
"type-id": fmt.Sprintf("%d", socketID),
|
||||||
},
|
},
|
||||||
m.meta,
|
m.meta,
|
||||||
map[string]interface{}{
|
map[string]any{
|
||||||
"value": value,
|
"value": value,
|
||||||
},
|
},
|
||||||
now,
|
now,
|
||||||
@@ -705,7 +735,7 @@ func (m *LikwidCollector) calcEventsetMetrics(evset LikwidEventsetConfig, interv
|
|||||||
"type": "node",
|
"type": "node",
|
||||||
},
|
},
|
||||||
m.meta,
|
m.meta,
|
||||||
map[string]interface{}{
|
map[string]any{
|
||||||
"value": totalNodeValue,
|
"value": totalNodeValue,
|
||||||
},
|
},
|
||||||
now,
|
now,
|
||||||
@@ -741,9 +771,7 @@ func (m *LikwidCollector) calcGlobalMetrics(groups []LikwidEventsetConfig, inter
|
|||||||
// Here we generate parameter list
|
// Here we generate parameter list
|
||||||
params := make(map[string]float64)
|
params := make(map[string]float64)
|
||||||
for _, evset := range groups {
|
for _, evset := range groups {
|
||||||
for mname, mres := range evset.metrics[tid] {
|
maps.Copy(params, evset.metrics[tid])
|
||||||
params[mname] = mres
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
params["gotime"] = interval.Seconds()
|
params["gotime"] = interval.Seconds()
|
||||||
// Evaluate the metric
|
// Evaluate the metric
|
||||||
@@ -765,7 +793,7 @@ func (m *LikwidCollector) calcGlobalMetrics(groups []LikwidEventsetConfig, inter
|
|||||||
"type": metric.Type,
|
"type": metric.Type,
|
||||||
},
|
},
|
||||||
m.meta,
|
m.meta,
|
||||||
map[string]interface{}{
|
map[string]any{
|
||||||
"value": value,
|
"value": value,
|
||||||
},
|
},
|
||||||
now,
|
now,
|
||||||
@@ -806,13 +834,21 @@ func (m *LikwidCollector) ReadThread(interval time.Duration, output chan lp.CCMe
|
|||||||
|
|
||||||
if !skip {
|
if !skip {
|
||||||
// read measurements and derive event set metrics
|
// read measurements and derive event set metrics
|
||||||
m.calcEventsetMetrics(e, interval, output)
|
err = m.calcEventsetMetrics(e, interval, output)
|
||||||
|
if err != nil {
|
||||||
|
cclog.ComponentError(m.name, err.Error())
|
||||||
|
return
|
||||||
|
}
|
||||||
groups = append(groups, e)
|
groups = append(groups, e)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
if len(groups) > 0 {
|
if len(groups) > 0 {
|
||||||
// calculate global metrics
|
// calculate global metrics
|
||||||
m.calcGlobalMetrics(groups, interval, output)
|
err = m.calcGlobalMetrics(groups, interval, output)
|
||||||
|
if err != nil {
|
||||||
|
cclog.ComponentError(m.name, err.Error())
|
||||||
|
return
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
@@ -1,15 +1,23 @@
|
|||||||
|
// Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
|
||||||
|
// All rights reserved. This file is part of cc-lib.
|
||||||
|
// Use of this source code is governed by a MIT-style
|
||||||
|
// license that can be found in the LICENSE file.
|
||||||
|
// additional authors:
|
||||||
|
// Holger Obermaier (NHR@KIT)
|
||||||
|
|
||||||
package collectors
|
package collectors
|
||||||
|
|
||||||
import (
|
import (
|
||||||
"encoding/json"
|
"encoding/json"
|
||||||
"fmt"
|
"fmt"
|
||||||
"os"
|
"os"
|
||||||
|
"slices"
|
||||||
"strconv"
|
"strconv"
|
||||||
"strings"
|
"strings"
|
||||||
"time"
|
"time"
|
||||||
|
|
||||||
cclog "github.com/ClusterCockpit/cc-lib/ccLogger"
|
cclog "github.com/ClusterCockpit/cc-lib/v2/ccLogger"
|
||||||
lp "github.com/ClusterCockpit/cc-lib/ccMessage"
|
lp "github.com/ClusterCockpit/cc-lib/v2/ccMessage"
|
||||||
)
|
)
|
||||||
|
|
||||||
// LoadavgCollector collects:
|
// LoadavgCollector collects:
|
||||||
@@ -35,7 +43,9 @@ type LoadavgCollector struct {
|
|||||||
func (m *LoadavgCollector) Init(config json.RawMessage) error {
|
func (m *LoadavgCollector) Init(config json.RawMessage) error {
|
||||||
m.name = "LoadavgCollector"
|
m.name = "LoadavgCollector"
|
||||||
m.parallel = true
|
m.parallel = true
|
||||||
m.setup()
|
if err := m.setup(); err != nil {
|
||||||
|
return fmt.Errorf("%s Init(): setup() call failed: %w", m.name, err)
|
||||||
|
}
|
||||||
if len(config) > 0 {
|
if len(config) > 0 {
|
||||||
err := json.Unmarshal(config, &m.config)
|
err := json.Unmarshal(config, &m.config)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
@@ -57,10 +67,10 @@ func (m *LoadavgCollector) Init(config json.RawMessage) error {
|
|||||||
m.proc_skips = make([]bool, len(m.proc_matches))
|
m.proc_skips = make([]bool, len(m.proc_matches))
|
||||||
|
|
||||||
for i, name := range m.load_matches {
|
for i, name := range m.load_matches {
|
||||||
_, m.load_skips[i] = stringArrayContains(m.config.ExcludeMetrics, name)
|
m.load_skips[i] = slices.Contains(m.config.ExcludeMetrics, name)
|
||||||
}
|
}
|
||||||
for i, name := range m.proc_matches {
|
for i, name := range m.proc_matches {
|
||||||
_, m.proc_skips[i] = stringArrayContains(m.config.ExcludeMetrics, name)
|
m.proc_skips[i] = slices.Contains(m.config.ExcludeMetrics, name)
|
||||||
}
|
}
|
||||||
m.init = true
|
m.init = true
|
||||||
return nil
|
return nil
|
||||||
@@ -92,7 +102,7 @@ func (m *LoadavgCollector) Read(interval time.Duration, output chan lp.CCMessage
|
|||||||
if m.load_skips[i] {
|
if m.load_skips[i] {
|
||||||
continue
|
continue
|
||||||
}
|
}
|
||||||
y, err := lp.NewMessage(name, m.tags, m.meta, map[string]interface{}{"value": x}, now)
|
y, err := lp.NewMessage(name, m.tags, m.meta, map[string]any{"value": x}, now)
|
||||||
if err == nil {
|
if err == nil {
|
||||||
output <- y
|
output <- y
|
||||||
}
|
}
|
||||||
@@ -111,7 +121,7 @@ func (m *LoadavgCollector) Read(interval time.Duration, output chan lp.CCMessage
|
|||||||
if m.proc_skips[i] {
|
if m.proc_skips[i] {
|
||||||
continue
|
continue
|
||||||
}
|
}
|
||||||
y, err := lp.NewMessage(name, m.tags, m.meta, map[string]interface{}{"value": x}, now)
|
y, err := lp.NewMessage(name, m.tags, m.meta, map[string]any{"value": x}, now)
|
||||||
if err == nil {
|
if err == nil {
|
||||||
output <- y
|
output <- y
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -1,3 +1,10 @@
|
|||||||
|
// Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
|
||||||
|
// All rights reserved. This file is part of cc-lib.
|
||||||
|
// Use of this source code is governed by a MIT-style
|
||||||
|
// license that can be found in the LICENSE file.
|
||||||
|
// additional authors:
|
||||||
|
// Holger Obermaier (NHR@KIT)
|
||||||
|
|
||||||
package collectors
|
package collectors
|
||||||
|
|
||||||
import (
|
import (
|
||||||
@@ -6,12 +13,13 @@ import (
|
|||||||
"fmt"
|
"fmt"
|
||||||
"os/exec"
|
"os/exec"
|
||||||
"os/user"
|
"os/user"
|
||||||
|
"slices"
|
||||||
"strconv"
|
"strconv"
|
||||||
"strings"
|
"strings"
|
||||||
"time"
|
"time"
|
||||||
|
|
||||||
cclog "github.com/ClusterCockpit/cc-lib/ccLogger"
|
cclog "github.com/ClusterCockpit/cc-lib/v2/ccLogger"
|
||||||
lp "github.com/ClusterCockpit/cc-lib/ccMessage"
|
lp "github.com/ClusterCockpit/cc-lib/v2/ccMessage"
|
||||||
)
|
)
|
||||||
|
|
||||||
const LUSTRE_SYSFS = `/sys/fs/lustre`
|
const LUSTRE_SYSFS = `/sys/fs/lustre`
|
||||||
@@ -54,7 +62,6 @@ func (m *LustreCollector) getDeviceDataCommand(device string) []string {
|
|||||||
} else {
|
} else {
|
||||||
command = exec.Command(m.lctl, LCTL_OPTION, statsfile)
|
command = exec.Command(m.lctl, LCTL_OPTION, statsfile)
|
||||||
}
|
}
|
||||||
command.Wait()
|
|
||||||
stdout, _ := command.Output()
|
stdout, _ := command.Output()
|
||||||
return strings.Split(string(stdout), "\n")
|
return strings.Split(string(stdout), "\n")
|
||||||
}
|
}
|
||||||
@@ -295,7 +302,9 @@ func (m *LustreCollector) Init(config json.RawMessage) error {
|
|||||||
return err
|
return err
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
m.setup()
|
if err := m.setup(); err != nil {
|
||||||
|
return fmt.Errorf("%s Init(): setup() call failed: %w", m.name, err)
|
||||||
|
}
|
||||||
m.tags = map[string]string{"type": "node"}
|
m.tags = map[string]string{"type": "node"}
|
||||||
m.meta = map[string]string{"source": m.name, "group": "Lustre"}
|
m.meta = map[string]string{"source": m.name, "group": "Lustre"}
|
||||||
|
|
||||||
@@ -332,21 +341,21 @@ func (m *LustreCollector) Init(config json.RawMessage) error {
|
|||||||
m.definitions = []LustreMetricDefinition{}
|
m.definitions = []LustreMetricDefinition{}
|
||||||
if m.config.SendAbsoluteValues {
|
if m.config.SendAbsoluteValues {
|
||||||
for _, def := range LustreAbsMetrics {
|
for _, def := range LustreAbsMetrics {
|
||||||
if _, skip := stringArrayContains(m.config.ExcludeMetrics, def.name); !skip {
|
if !slices.Contains(m.config.ExcludeMetrics, def.name) {
|
||||||
m.definitions = append(m.definitions, def)
|
m.definitions = append(m.definitions, def)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
if m.config.SendDiffValues {
|
if m.config.SendDiffValues {
|
||||||
for _, def := range LustreDiffMetrics {
|
for _, def := range LustreDiffMetrics {
|
||||||
if _, skip := stringArrayContains(m.config.ExcludeMetrics, def.name); !skip {
|
if !slices.Contains(m.config.ExcludeMetrics, def.name) {
|
||||||
m.definitions = append(m.definitions, def)
|
m.definitions = append(m.definitions, def)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
if m.config.SendDerivedValues {
|
if m.config.SendDerivedValues {
|
||||||
for _, def := range LustreDeriveMetrics {
|
for _, def := range LustreDeriveMetrics {
|
||||||
if _, skip := stringArrayContains(m.config.ExcludeMetrics, def.name); !skip {
|
if !slices.Contains(m.config.ExcludeMetrics, def.name) {
|
||||||
m.definitions = append(m.definitions, def)
|
m.definitions = append(m.definitions, def)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -395,23 +404,23 @@ func (m *LustreCollector) Read(interval time.Duration, output chan lp.CCMessage)
|
|||||||
} else {
|
} else {
|
||||||
use_x = devData[def.name]
|
use_x = devData[def.name]
|
||||||
}
|
}
|
||||||
var value interface{}
|
var value any
|
||||||
switch def.calc {
|
switch def.calc {
|
||||||
case "none":
|
case "none":
|
||||||
value = use_x
|
value = use_x
|
||||||
y, err = lp.NewMessage(def.name, m.tags, m.meta, map[string]interface{}{"value": value}, time.Now())
|
y, err = lp.NewMessage(def.name, m.tags, m.meta, map[string]any{"value": value}, time.Now())
|
||||||
case "difference":
|
case "difference":
|
||||||
value = use_x - devData[def.name]
|
value = use_x - devData[def.name]
|
||||||
if value.(int64) < 0 {
|
if value.(int64) < 0 {
|
||||||
value = 0
|
value = 0
|
||||||
}
|
}
|
||||||
y, err = lp.NewMessage(def.name, m.tags, m.meta, map[string]interface{}{"value": value}, time.Now())
|
y, err = lp.NewMessage(def.name, m.tags, m.meta, map[string]any{"value": value}, time.Now())
|
||||||
case "derivative":
|
case "derivative":
|
||||||
value = float64(use_x-devData[def.name]) / tdiff.Seconds()
|
value = float64(use_x-devData[def.name]) / tdiff.Seconds()
|
||||||
if value.(float64) < 0 {
|
if value.(float64) < 0 {
|
||||||
value = 0
|
value = 0
|
||||||
}
|
}
|
||||||
y, err = lp.NewMessage(def.name, m.tags, m.meta, map[string]interface{}{"value": value}, time.Now())
|
y, err = lp.NewMessage(def.name, m.tags, m.meta, map[string]any{"value": value}, time.Now())
|
||||||
}
|
}
|
||||||
if err == nil {
|
if err == nil {
|
||||||
y.AddTag("device", device)
|
y.AddTag("device", device)
|
||||||
|
|||||||
@@ -1,3 +1,10 @@
|
|||||||
|
// Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
|
||||||
|
// All rights reserved. This file is part of cc-lib.
|
||||||
|
// Use of this source code is governed by a MIT-style
|
||||||
|
// license that can be found in the LICENSE file.
|
||||||
|
// additional authors:
|
||||||
|
// Holger Obermaier (NHR@KIT)
|
||||||
|
|
||||||
package collectors
|
package collectors
|
||||||
|
|
||||||
import (
|
import (
|
||||||
@@ -8,12 +15,13 @@ import (
|
|||||||
"os"
|
"os"
|
||||||
"path/filepath"
|
"path/filepath"
|
||||||
"regexp"
|
"regexp"
|
||||||
|
"slices"
|
||||||
"strconv"
|
"strconv"
|
||||||
"strings"
|
"strings"
|
||||||
"time"
|
"time"
|
||||||
|
|
||||||
cclog "github.com/ClusterCockpit/cc-lib/ccLogger"
|
cclog "github.com/ClusterCockpit/cc-lib/v2/ccLogger"
|
||||||
lp "github.com/ClusterCockpit/cc-lib/ccMessage"
|
lp "github.com/ClusterCockpit/cc-lib/v2/ccMessage"
|
||||||
)
|
)
|
||||||
|
|
||||||
const MEMSTATFILE = "/proc/meminfo"
|
const MEMSTATFILE = "/proc/meminfo"
|
||||||
@@ -51,7 +59,11 @@ func getStats(filename string) map[string]MemstatStats {
|
|||||||
if err != nil {
|
if err != nil {
|
||||||
cclog.Error(err.Error())
|
cclog.Error(err.Error())
|
||||||
}
|
}
|
||||||
defer file.Close()
|
defer func() {
|
||||||
|
if err := file.Close(); err != nil {
|
||||||
|
cclog.Error(err.Error())
|
||||||
|
}
|
||||||
|
}()
|
||||||
|
|
||||||
scanner := bufio.NewScanner(file)
|
scanner := bufio.NewScanner(file)
|
||||||
for scanner.Scan() {
|
for scanner.Scan() {
|
||||||
@@ -108,19 +120,20 @@ func (m *MemstatCollector) Init(config json.RawMessage) error {
|
|||||||
"MemShared": "mem_shared",
|
"MemShared": "mem_shared",
|
||||||
}
|
}
|
||||||
for k, v := range matches {
|
for k, v := range matches {
|
||||||
_, skip := stringArrayContains(m.config.ExcludeMetrics, k)
|
if !slices.Contains(m.config.ExcludeMetrics, k) {
|
||||||
if !skip {
|
|
||||||
m.matches[k] = v
|
m.matches[k] = v
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
m.sendMemUsed = false
|
m.sendMemUsed = false
|
||||||
if _, skip := stringArrayContains(m.config.ExcludeMetrics, "mem_used"); !skip {
|
if !slices.Contains(m.config.ExcludeMetrics, "mem_used") {
|
||||||
m.sendMemUsed = true
|
m.sendMemUsed = true
|
||||||
}
|
}
|
||||||
if len(m.matches) == 0 {
|
if len(m.matches) == 0 {
|
||||||
return errors.New("no metrics to collect")
|
return errors.New("no metrics to collect")
|
||||||
}
|
}
|
||||||
m.setup()
|
if err := m.setup(); err != nil {
|
||||||
|
return fmt.Errorf("%s Init(): setup() call failed: %w", m.name, err)
|
||||||
|
}
|
||||||
|
|
||||||
if m.config.NodeStats {
|
if m.config.NodeStats {
|
||||||
if stats := getStats(MEMSTATFILE); len(stats) == 0 {
|
if stats := getStats(MEMSTATFILE); len(stats) == 0 {
|
||||||
@@ -167,7 +180,7 @@ func (m *MemstatCollector) Read(interval time.Duration, output chan lp.CCMessage
|
|||||||
sendStats := func(stats map[string]MemstatStats, tags map[string]string) {
|
sendStats := func(stats map[string]MemstatStats, tags map[string]string) {
|
||||||
for match, name := range m.matches {
|
for match, name := range m.matches {
|
||||||
var value float64 = 0
|
var value float64 = 0
|
||||||
var unit string = ""
|
unit := ""
|
||||||
if v, ok := stats[match]; ok {
|
if v, ok := stats[match]; ok {
|
||||||
value = v.value
|
value = v.value
|
||||||
if len(v.unit) > 0 {
|
if len(v.unit) > 0 {
|
||||||
@@ -175,7 +188,7 @@ func (m *MemstatCollector) Read(interval time.Duration, output chan lp.CCMessage
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
y, err := lp.NewMessage(name, tags, m.meta, map[string]interface{}{"value": value}, time.Now())
|
y, err := lp.NewMessage(name, tags, m.meta, map[string]any{"value": value}, time.Now())
|
||||||
if err == nil {
|
if err == nil {
|
||||||
if len(unit) > 0 {
|
if len(unit) > 0 {
|
||||||
y.AddMeta("unit", unit)
|
y.AddMeta("unit", unit)
|
||||||
@@ -208,7 +221,7 @@ func (m *MemstatCollector) Read(interval time.Duration, output chan lp.CCMessage
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
y, err := lp.NewMessage("mem_used", tags, m.meta, map[string]interface{}{"value": memUsed}, time.Now())
|
y, err := lp.NewMessage("mem_used", tags, m.meta, map[string]any{"value": memUsed}, time.Now())
|
||||||
if err == nil {
|
if err == nil {
|
||||||
if len(unit) > 0 {
|
if len(unit) > 0 {
|
||||||
y.AddMeta("unit", unit)
|
y.AddMeta("unit", unit)
|
||||||
|
|||||||
@@ -1,3 +1,10 @@
|
|||||||
|
// Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
|
||||||
|
// All rights reserved. This file is part of cc-lib.
|
||||||
|
// Use of this source code is governed by a MIT-style
|
||||||
|
// license that can be found in the LICENSE file.
|
||||||
|
// additional authors:
|
||||||
|
// Holger Obermaier (NHR@KIT)
|
||||||
|
|
||||||
package collectors
|
package collectors
|
||||||
|
|
||||||
import (
|
import (
|
||||||
@@ -5,7 +12,7 @@ import (
|
|||||||
"fmt"
|
"fmt"
|
||||||
"time"
|
"time"
|
||||||
|
|
||||||
lp "github.com/ClusterCockpit/cc-lib/ccMessage"
|
lp "github.com/ClusterCockpit/cc-lib/v2/ccMessage"
|
||||||
)
|
)
|
||||||
|
|
||||||
type MetricCollector interface {
|
type MetricCollector interface {
|
||||||
@@ -44,30 +51,6 @@ func (c *metricCollector) Initialized() bool {
|
|||||||
return c.init
|
return c.init
|
||||||
}
|
}
|
||||||
|
|
||||||
// intArrayContains scans an array of ints if the value str is present in the array
|
|
||||||
// If the specified value is found, the corresponding array index is returned.
|
|
||||||
// The bool value is used to signal success or failure
|
|
||||||
func intArrayContains(array []int, str int) (int, bool) {
|
|
||||||
for i, a := range array {
|
|
||||||
if a == str {
|
|
||||||
return i, true
|
|
||||||
}
|
|
||||||
}
|
|
||||||
return -1, false
|
|
||||||
}
|
|
||||||
|
|
||||||
// stringArrayContains scans an array of strings if the value str is present in the array
|
|
||||||
// If the specified value is found, the corresponding array index is returned.
|
|
||||||
// The bool value is used to signal success or failure
|
|
||||||
func stringArrayContains(array []string, str string) (int, bool) {
|
|
||||||
for i, a := range array {
|
|
||||||
if a == str {
|
|
||||||
return i, true
|
|
||||||
}
|
|
||||||
}
|
|
||||||
return -1, false
|
|
||||||
}
|
|
||||||
|
|
||||||
// RemoveFromStringList removes the string r from the array of strings s
|
// RemoveFromStringList removes the string r from the array of strings s
|
||||||
// If r is not contained in the array an error is returned
|
// If r is not contained in the array an error is returned
|
||||||
func RemoveFromStringList(s []string, r string) ([]string, error) {
|
func RemoveFromStringList(s []string, r string) ([]string, error) {
|
||||||
|
|||||||
@@ -1,16 +1,24 @@
|
|||||||
|
// Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
|
||||||
|
// All rights reserved. This file is part of cc-lib.
|
||||||
|
// Use of this source code is governed by a MIT-style
|
||||||
|
// license that can be found in the LICENSE file.
|
||||||
|
// additional authors:
|
||||||
|
// Holger Obermaier (NHR@KIT)
|
||||||
|
|
||||||
package collectors
|
package collectors
|
||||||
|
|
||||||
import (
|
import (
|
||||||
"bufio"
|
"bufio"
|
||||||
"encoding/json"
|
"encoding/json"
|
||||||
"errors"
|
"fmt"
|
||||||
"os"
|
"os"
|
||||||
|
"slices"
|
||||||
"strconv"
|
"strconv"
|
||||||
"strings"
|
"strings"
|
||||||
"time"
|
"time"
|
||||||
|
|
||||||
cclog "github.com/ClusterCockpit/cc-lib/ccLogger"
|
cclog "github.com/ClusterCockpit/cc-lib/v2/ccLogger"
|
||||||
lp "github.com/ClusterCockpit/cc-lib/ccMessage"
|
lp "github.com/ClusterCockpit/cc-lib/v2/ccMessage"
|
||||||
)
|
)
|
||||||
|
|
||||||
const NETSTATFILE = "/proc/net/dev"
|
const NETSTATFILE = "/proc/net/dev"
|
||||||
@@ -58,7 +66,9 @@ func getCanonicalName(raw string, aliasToCanonical map[string]string) string {
|
|||||||
func (m *NetstatCollector) Init(config json.RawMessage) error {
|
func (m *NetstatCollector) Init(config json.RawMessage) error {
|
||||||
m.name = "NetstatCollector"
|
m.name = "NetstatCollector"
|
||||||
m.parallel = true
|
m.parallel = true
|
||||||
m.setup()
|
if err := m.setup(); err != nil {
|
||||||
|
return fmt.Errorf("%s Init(): setup() call failed: %w", m.name, err)
|
||||||
|
}
|
||||||
m.lastTimestamp = time.Now()
|
m.lastTimestamp = time.Now()
|
||||||
|
|
||||||
const (
|
const (
|
||||||
@@ -100,10 +110,8 @@ func (m *NetstatCollector) Init(config json.RawMessage) error {
|
|||||||
// Check access to net statistic file
|
// Check access to net statistic file
|
||||||
file, err := os.Open(NETSTATFILE)
|
file, err := os.Open(NETSTATFILE)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
cclog.ComponentError(m.name, err.Error())
|
return fmt.Errorf("%s Init(): failed to open netstat file \"%s\": %w", m.name, NETSTATFILE, err)
|
||||||
return err
|
|
||||||
}
|
}
|
||||||
defer file.Close()
|
|
||||||
|
|
||||||
scanner := bufio.NewScanner(file)
|
scanner := bufio.NewScanner(file)
|
||||||
for scanner.Scan() {
|
for scanner.Scan() {
|
||||||
@@ -122,7 +130,7 @@ func (m *NetstatCollector) Init(config json.RawMessage) error {
|
|||||||
canonical := getCanonicalName(raw, m.aliasToCanonical)
|
canonical := getCanonicalName(raw, m.aliasToCanonical)
|
||||||
|
|
||||||
// Check if device is a included device
|
// Check if device is a included device
|
||||||
if _, ok := stringArrayContains(m.config.IncludeDevices, canonical); ok {
|
if slices.Contains(m.config.IncludeDevices, canonical) {
|
||||||
// Tag will contain original device name (raw).
|
// Tag will contain original device name (raw).
|
||||||
tags := map[string]string{"stype": "network", "stype-id": raw, "type": "node"}
|
tags := map[string]string{"stype": "network", "stype-id": raw, "type": "node"}
|
||||||
meta_unit_byte := map[string]string{"source": m.name, "group": "Network", "unit": "bytes"}
|
meta_unit_byte := map[string]string{"source": m.name, "group": "Network", "unit": "bytes"}
|
||||||
@@ -167,8 +175,13 @@ func (m *NetstatCollector) Init(config json.RawMessage) error {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Close netstat file
|
||||||
|
if err := file.Close(); err != nil {
|
||||||
|
return fmt.Errorf("%s Init(): failed to close netstat file \"%s\": %w", m.name, NETSTATFILE, err)
|
||||||
|
}
|
||||||
|
|
||||||
if len(m.matches) == 0 {
|
if len(m.matches) == 0 {
|
||||||
return errors.New("no devices to collector metrics found")
|
return fmt.Errorf("%s Init(): no devices to collect metrics found", m.name)
|
||||||
}
|
}
|
||||||
m.init = true
|
m.init = true
|
||||||
return nil
|
return nil
|
||||||
@@ -187,10 +200,18 @@ func (m *NetstatCollector) Read(interval time.Duration, output chan lp.CCMessage
|
|||||||
|
|
||||||
file, err := os.Open(NETSTATFILE)
|
file, err := os.Open(NETSTATFILE)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
cclog.ComponentError(m.name, err.Error())
|
cclog.ComponentError(
|
||||||
|
m.name,
|
||||||
|
fmt.Sprintf("Read(): Failed to open file '%s': %v", NETSTATFILE, err))
|
||||||
return
|
return
|
||||||
}
|
}
|
||||||
defer file.Close()
|
defer func() {
|
||||||
|
if err := file.Close(); err != nil {
|
||||||
|
cclog.ComponentError(
|
||||||
|
m.name,
|
||||||
|
fmt.Sprintf("Read(): Failed to close file '%s': %v", NETSTATFILE, err))
|
||||||
|
}
|
||||||
|
}()
|
||||||
|
|
||||||
scanner := bufio.NewScanner(file)
|
scanner := bufio.NewScanner(file)
|
||||||
for scanner.Scan() {
|
for scanner.Scan() {
|
||||||
@@ -219,14 +240,14 @@ func (m *NetstatCollector) Read(interval time.Duration, output chan lp.CCMessage
|
|||||||
continue
|
continue
|
||||||
}
|
}
|
||||||
if m.config.SendAbsoluteValues {
|
if m.config.SendAbsoluteValues {
|
||||||
if y, err := lp.NewMessage(metric.name, metric.tags, metric.meta, map[string]interface{}{"value": v}, now); err == nil {
|
if y, err := lp.NewMessage(metric.name, metric.tags, metric.meta, map[string]any{"value": v}, now); err == nil {
|
||||||
output <- y
|
output <- y
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
if m.config.SendDerivedValues {
|
if m.config.SendDerivedValues {
|
||||||
if metric.lastValue >= 0 {
|
if metric.lastValue >= 0 {
|
||||||
rate := float64(v-metric.lastValue) / timeDiff
|
rate := float64(v-metric.lastValue) / timeDiff
|
||||||
if y, err := lp.NewMessage(metric.name+"_bw", metric.tags, metric.meta_rates, map[string]interface{}{"value": rate}, now); err == nil {
|
if y, err := lp.NewMessage(metric.name+"_bw", metric.tags, metric.meta_rates, map[string]any{"value": rate}, now); err == nil {
|
||||||
output <- y
|
output <- y
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -1,9 +1,17 @@
|
|||||||
|
// Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
|
||||||
|
// All rights reserved. This file is part of cc-lib.
|
||||||
|
// Use of this source code is governed by a MIT-style
|
||||||
|
// license that can be found in the LICENSE file.
|
||||||
|
// additional authors:
|
||||||
|
// Holger Obermaier (NHR@KIT)
|
||||||
|
|
||||||
package collectors
|
package collectors
|
||||||
|
|
||||||
import (
|
import (
|
||||||
"encoding/json"
|
"encoding/json"
|
||||||
"fmt"
|
"fmt"
|
||||||
"log"
|
"log"
|
||||||
|
"slices"
|
||||||
|
|
||||||
// "os"
|
// "os"
|
||||||
"os/exec"
|
"os/exec"
|
||||||
@@ -11,7 +19,8 @@ import (
|
|||||||
"strings"
|
"strings"
|
||||||
"time"
|
"time"
|
||||||
|
|
||||||
lp "github.com/ClusterCockpit/cc-lib/ccMessage"
|
cclog "github.com/ClusterCockpit/cc-lib/v2/ccLogger"
|
||||||
|
lp "github.com/ClusterCockpit/cc-lib/v2/ccMessage"
|
||||||
)
|
)
|
||||||
|
|
||||||
// First part contains the code for the general NfsCollector.
|
// First part contains the code for the general NfsCollector.
|
||||||
@@ -37,10 +46,15 @@ type nfsCollector struct {
|
|||||||
|
|
||||||
func (m *nfsCollector) initStats() error {
|
func (m *nfsCollector) initStats() error {
|
||||||
cmd := exec.Command(m.config.Nfsstats, `-l`, `--all`)
|
cmd := exec.Command(m.config.Nfsstats, `-l`, `--all`)
|
||||||
cmd.Wait()
|
|
||||||
|
// No cmd.Wait() here: cmd.Output() below starts the command and waits for
|
||||||
|
// it to finish. Wait() on a Cmd that was never started always returns
|
||||||
|
// "exec: not started", so checking its error made initStats() fail on
|
||||||
|
// every call.
|
||||||
|
|
||||||
buffer, err := cmd.Output()
|
buffer, err := cmd.Output()
|
||||||
if err == nil {
|
if err == nil {
|
||||||
for _, line := range strings.Split(string(buffer), "\n") {
|
for line := range strings.Lines(string(buffer)) {
|
||||||
lf := strings.Fields(line)
|
lf := strings.Fields(line)
|
||||||
if len(lf) != 5 {
|
if len(lf) != 5 {
|
||||||
continue
|
continue
|
||||||
@@ -64,10 +78,15 @@ func (m *nfsCollector) initStats() error {
|
|||||||
|
|
||||||
func (m *nfsCollector) updateStats() error {
|
func (m *nfsCollector) updateStats() error {
|
||||||
cmd := exec.Command(m.config.Nfsstats, `-l`, `--all`)
|
cmd := exec.Command(m.config.Nfsstats, `-l`, `--all`)
|
||||||
cmd.Wait()
|
|
||||||
|
// No cmd.Wait() here: cmd.Output() below starts the command and waits for
|
||||||
|
// it to finish. Wait() on a Cmd that was never started always returns
|
||||||
|
// "exec: not started", so checking its error made updateStats() fail on
|
||||||
|
// every call.
|
||||||
|
|
||||||
buffer, err := cmd.Output()
|
buffer, err := cmd.Output()
|
||||||
if err == nil {
|
if err == nil {
|
||||||
for _, line := range strings.Split(string(buffer), "\n") {
|
for line := range strings.Lines(string(buffer)) {
|
||||||
lf := strings.Fields(line)
|
lf := strings.Fields(line)
|
||||||
if len(lf) != 5 {
|
if len(lf) != 5 {
|
||||||
continue
|
continue
|
||||||
@@ -112,7 +131,9 @@ func (m *nfsCollector) MainInit(config json.RawMessage) error {
|
|||||||
return fmt.Errorf("NfsCollector.Init(): Failed to find nfsstat binary '%s': %v", m.config.Nfsstats, err)
|
return fmt.Errorf("NfsCollector.Init(): Failed to find nfsstat binary '%s': %v", m.config.Nfsstats, err)
|
||||||
}
|
}
|
||||||
m.data = make(map[string]NfsCollectorData)
|
m.data = make(map[string]NfsCollectorData)
|
||||||
m.initStats()
|
if err := m.initStats(); err != nil {
|
||||||
|
return fmt.Errorf("NfsCollector.Init(): %w", err)
|
||||||
|
}
|
||||||
m.init = true
|
m.init = true
|
||||||
m.parallel = true
|
m.parallel = true
|
||||||
return nil
|
return nil
|
||||||
@@ -124,7 +145,13 @@ func (m *nfsCollector) Read(interval time.Duration, output chan lp.CCMessage) {
|
|||||||
}
|
}
|
||||||
timestamp := time.Now()
|
timestamp := time.Now()
|
||||||
|
|
||||||
m.updateStats()
|
if err := m.updateStats(); err != nil {
|
||||||
|
cclog.ComponentError(
|
||||||
|
m.name,
|
||||||
|
fmt.Sprintf("Read(): updateStats() failed: %v", err),
|
||||||
|
)
|
||||||
|
return
|
||||||
|
}
|
||||||
prefix := ""
|
prefix := ""
|
||||||
switch m.version {
|
switch m.version {
|
||||||
case "v3":
|
case "v3":
|
||||||
@@ -136,11 +163,11 @@ func (m *nfsCollector) Read(interval time.Duration, output chan lp.CCMessage) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
for name, data := range m.data {
|
for name, data := range m.data {
|
||||||
if _, skip := stringArrayContains(m.config.ExcludeMetrics, name); skip {
|
if slices.Contains(m.config.ExcludeMetrics, name) {
|
||||||
continue
|
continue
|
||||||
}
|
}
|
||||||
value := data.current - data.last
|
value := data.current - data.last
|
||||||
y, err := lp.NewMessage(fmt.Sprintf("%s_%s", prefix, name), m.tags, m.meta, map[string]interface{}{"value": value}, timestamp)
|
y, err := lp.NewMessage(fmt.Sprintf("%s_%s", prefix, name), m.tags, m.meta, map[string]any{"value": value}, timestamp)
|
||||||
if err == nil {
|
if err == nil {
|
||||||
y.AddMeta("version", m.version)
|
y.AddMeta("version", m.version)
|
||||||
output <- y
|
output <- y
|
||||||
@@ -163,13 +190,17 @@ type Nfs4Collector struct {
|
|||||||
func (m *Nfs3Collector) Init(config json.RawMessage) error {
|
func (m *Nfs3Collector) Init(config json.RawMessage) error {
|
||||||
m.name = "Nfs3Collector"
|
m.name = "Nfs3Collector"
|
||||||
m.version = `v3`
|
m.version = `v3`
|
||||||
m.setup()
|
if err := m.setup(); err != nil {
|
||||||
|
return fmt.Errorf("%s Init(): setup() call failed: %w", m.name, err)
|
||||||
|
}
|
||||||
return m.MainInit(config)
|
return m.MainInit(config)
|
||||||
}
|
}
|
||||||
|
|
||||||
func (m *Nfs4Collector) Init(config json.RawMessage) error {
|
func (m *Nfs4Collector) Init(config json.RawMessage) error {
|
||||||
m.name = "Nfs4Collector"
|
m.name = "Nfs4Collector"
|
||||||
m.version = `v4`
|
m.version = `v4`
|
||||||
m.setup()
|
if err := m.setup(); err != nil {
|
||||||
|
return fmt.Errorf("%s Init(): setup() call failed: %w", m.name, err)
|
||||||
|
}
|
||||||
return m.MainInit(config)
|
return m.MainInit(config)
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -1,3 +1,10 @@
|
|||||||
|
// Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
|
||||||
|
// All rights reserved. This file is part of cc-lib.
|
||||||
|
// Use of this source code is governed by a MIT-style
|
||||||
|
// license that can be found in the LICENSE file.
|
||||||
|
// additional authors:
|
||||||
|
// Holger Obermaier (NHR@KIT)
|
||||||
|
|
||||||
package collectors
|
package collectors
|
||||||
|
|
||||||
import (
|
import (
|
||||||
@@ -5,12 +12,13 @@ import (
|
|||||||
"fmt"
|
"fmt"
|
||||||
"os"
|
"os"
|
||||||
"regexp"
|
"regexp"
|
||||||
|
"slices"
|
||||||
"strconv"
|
"strconv"
|
||||||
"strings"
|
"strings"
|
||||||
"time"
|
"time"
|
||||||
|
|
||||||
cclog "github.com/ClusterCockpit/cc-lib/ccLogger"
|
cclog "github.com/ClusterCockpit/cc-lib/v2/ccLogger"
|
||||||
lp "github.com/ClusterCockpit/cc-lib/ccMessage"
|
lp "github.com/ClusterCockpit/cc-lib/v2/ccMessage"
|
||||||
)
|
)
|
||||||
|
|
||||||
// These are the fields we read from the JSON configuration
|
// These are the fields we read from the JSON configuration
|
||||||
@@ -64,7 +72,7 @@ func (m *NfsIOStatCollector) readNfsiostats() map[string]map[string]int64 {
|
|||||||
// Is this a device line with mount point, remote target and NFS version?
|
// Is this a device line with mount point, remote target and NFS version?
|
||||||
dev := resolve_regex_fields(l, deviceRegex)
|
dev := resolve_regex_fields(l, deviceRegex)
|
||||||
if len(dev) > 0 {
|
if len(dev) > 0 {
|
||||||
if _, ok := stringArrayContains(m.config.ExcludeFilesystem, dev[m.key]); !ok {
|
if !slices.Contains(m.config.ExcludeFilesystem, dev[m.key]) {
|
||||||
current = dev
|
current = dev
|
||||||
if len(current["version"]) == 0 {
|
if len(current["version"]) == 0 {
|
||||||
current["version"] = "3"
|
current["version"] = "3"
|
||||||
@@ -78,7 +86,7 @@ func (m *NfsIOStatCollector) readNfsiostats() map[string]map[string]int64 {
|
|||||||
if len(bytes) > 0 {
|
if len(bytes) > 0 {
|
||||||
data[current[m.key]] = make(map[string]int64)
|
data[current[m.key]] = make(map[string]int64)
|
||||||
for name, sval := range bytes {
|
for name, sval := range bytes {
|
||||||
if _, ok := stringArrayContains(m.config.ExcludeMetrics, name); !ok {
|
if !slices.Contains(m.config.ExcludeMetrics, name) {
|
||||||
val, err := strconv.ParseInt(sval, 10, 64)
|
val, err := strconv.ParseInt(sval, 10, 64)
|
||||||
if err == nil {
|
if err == nil {
|
||||||
data[current[m.key]][name] = val
|
data[current[m.key]][name] = val
|
||||||
@@ -95,7 +103,9 @@ func (m *NfsIOStatCollector) readNfsiostats() map[string]map[string]int64 {
|
|||||||
func (m *NfsIOStatCollector) Init(config json.RawMessage) error {
|
func (m *NfsIOStatCollector) Init(config json.RawMessage) error {
|
||||||
var err error = nil
|
var err error = nil
|
||||||
m.name = "NfsIOStatCollector"
|
m.name = "NfsIOStatCollector"
|
||||||
m.setup()
|
if err := m.setup(); err != nil {
|
||||||
|
return fmt.Errorf("%s Init(): setup() call failed: %w", m.name, err)
|
||||||
|
}
|
||||||
m.parallel = true
|
m.parallel = true
|
||||||
m.meta = map[string]string{"source": m.name, "group": "NFS", "unit": "bytes"}
|
m.meta = map[string]string{"source": m.name, "group": "NFS", "unit": "bytes"}
|
||||||
m.tags = map[string]string{"type": "node"}
|
m.tags = map[string]string{"type": "node"}
|
||||||
@@ -133,7 +143,7 @@ func (m *NfsIOStatCollector) Read(interval time.Duration, output chan lp.CCMessa
|
|||||||
if old, ok := m.data[mntpoint]; ok {
|
if old, ok := m.data[mntpoint]; ok {
|
||||||
for name, newVal := range values {
|
for name, newVal := range values {
|
||||||
if m.config.SendAbsoluteValues {
|
if m.config.SendAbsoluteValues {
|
||||||
msg, err := lp.NewMessage(fmt.Sprintf("nfsio_%s", name), m.tags, m.meta, map[string]interface{}{"value": newVal}, now)
|
msg, err := lp.NewMessage(fmt.Sprintf("nfsio_%s", name), m.tags, m.meta, map[string]any{"value": newVal}, now)
|
||||||
if err == nil {
|
if err == nil {
|
||||||
msg.AddTag("stype", "filesystem")
|
msg.AddTag("stype", "filesystem")
|
||||||
msg.AddTag("stype-id", mntpoint)
|
msg.AddTag("stype-id", mntpoint)
|
||||||
@@ -142,7 +152,7 @@ func (m *NfsIOStatCollector) Read(interval time.Duration, output chan lp.CCMessa
|
|||||||
}
|
}
|
||||||
if m.config.SendDerivedValues {
|
if m.config.SendDerivedValues {
|
||||||
rate := float64(newVal-old[name]) / timeDiff
|
rate := float64(newVal-old[name]) / timeDiff
|
||||||
msg, err := lp.NewMessage(fmt.Sprintf("nfsio_%s_bw", name), m.tags, m.meta, map[string]interface{}{"value": rate}, now)
|
msg, err := lp.NewMessage(fmt.Sprintf("nfsio_%s_bw", name), m.tags, m.meta, map[string]any{"value": rate}, now)
|
||||||
if err == nil {
|
if err == nil {
|
||||||
if strings.HasPrefix(name, "page") {
|
if strings.HasPrefix(name, "page") {
|
||||||
msg.AddMeta("unit", "4K_pages/s")
|
msg.AddMeta("unit", "4K_pages/s")
|
||||||
|
|||||||
@@ -10,8 +10,8 @@ import (
|
|||||||
"strings"
|
"strings"
|
||||||
"time"
|
"time"
|
||||||
|
|
||||||
cclog "github.com/ClusterCockpit/cc-lib/ccLogger"
|
cclog "github.com/ClusterCockpit/cc-lib/v2/ccLogger"
|
||||||
lp "github.com/ClusterCockpit/cc-lib/ccMessage"
|
lp "github.com/ClusterCockpit/cc-lib/v2/ccMessage"
|
||||||
)
|
)
|
||||||
|
|
||||||
type NUMAStatsCollectorConfig struct {
|
type NUMAStatsCollectorConfig struct {
|
||||||
@@ -72,12 +72,22 @@ func (m *NUMAStatsCollector) Init(config json.RawMessage) error {
|
|||||||
|
|
||||||
m.name = "NUMAStatsCollector"
|
m.name = "NUMAStatsCollector"
|
||||||
m.parallel = true
|
m.parallel = true
|
||||||
m.setup()
|
if err := m.setup(); err != nil {
|
||||||
|
return fmt.Errorf("%s Init(): setup() call failed: %w", m.name, err)
|
||||||
|
}
|
||||||
m.meta = map[string]string{
|
m.meta = map[string]string{
|
||||||
"source": m.name,
|
"source": m.name,
|
||||||
"group": "NUMA",
|
"group": "NUMA",
|
||||||
}
|
}
|
||||||
|
|
||||||
|
m.config.SendAbsoluteValues = true
|
||||||
|
if len(config) > 0 {
|
||||||
|
err := json.Unmarshal(config, &m.config)
|
||||||
|
if err != nil {
|
||||||
|
return fmt.Errorf("unable to unmarshal numastat configuration: %s", err.Error())
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
// Loop for all NUMA node directories
|
// Loop for all NUMA node directories
|
||||||
base := "/sys/devices/system/node/node"
|
base := "/sys/devices/system/node/node"
|
||||||
globPattern := base + "[0-9]*"
|
globPattern := base + "[0-9]*"
|
||||||
@@ -95,7 +105,10 @@ func (m *NUMAStatsCollector) Init(config json.RawMessage) error {
|
|||||||
m.topology = append(m.topology,
|
m.topology = append(m.topology,
|
||||||
NUMAStatsCollectorTopolgy{
|
NUMAStatsCollectorTopolgy{
|
||||||
file: file,
|
file: file,
|
||||||
tagSet: map[string]string{"memoryDomain": node},
|
tagSet: map[string]string{
|
||||||
|
"type": "memoryDomain",
|
||||||
|
"type-id": node,
|
||||||
|
},
|
||||||
previousValues: make(map[string]int64),
|
previousValues: make(map[string]int64),
|
||||||
})
|
})
|
||||||
}
|
}
|
||||||
@@ -145,11 +158,11 @@ func (m *NUMAStatsCollector) Read(interval time.Duration, output chan lp.CCMessa
|
|||||||
}
|
}
|
||||||
|
|
||||||
if m.config.SendAbsoluteValues {
|
if m.config.SendAbsoluteValues {
|
||||||
msg, err := lp.NewMessage(
|
msg, err := lp.NewMetric(
|
||||||
"numastats_"+key,
|
"numastats_"+key,
|
||||||
t.tagSet,
|
t.tagSet,
|
||||||
m.meta,
|
m.meta,
|
||||||
map[string]interface{}{"value": value},
|
value,
|
||||||
now,
|
now,
|
||||||
)
|
)
|
||||||
if err == nil {
|
if err == nil {
|
||||||
@@ -161,11 +174,11 @@ func (m *NUMAStatsCollector) Read(interval time.Duration, output chan lp.CCMessa
|
|||||||
prev, ok := t.previousValues[key]
|
prev, ok := t.previousValues[key]
|
||||||
if ok {
|
if ok {
|
||||||
rate := float64(value-prev) / timeDiff
|
rate := float64(value-prev) / timeDiff
|
||||||
msg, err := lp.NewMessage(
|
msg, err := lp.NewMetric(
|
||||||
"numastats_"+key+"_rate",
|
"numastats_"+key+"_rate",
|
||||||
t.tagSet,
|
t.tagSet,
|
||||||
m.meta,
|
m.meta,
|
||||||
map[string]interface{}{"value": rate},
|
rate,
|
||||||
now,
|
now,
|
||||||
)
|
)
|
||||||
if err == nil {
|
if err == nil {
|
||||||
@@ -175,7 +188,11 @@ func (m *NUMAStatsCollector) Read(interval time.Duration, output chan lp.CCMessa
|
|||||||
t.previousValues[key] = value
|
t.previousValues[key] = value
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
file.Close()
|
if err := file.Close(); err != nil {
|
||||||
|
cclog.ComponentError(
|
||||||
|
m.name,
|
||||||
|
fmt.Sprintf("Read(): Failed to close file '%s': %v", t.file, err))
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
@@ -1,3 +1,10 @@
|
|||||||
|
// Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
|
||||||
|
// All rights reserved. This file is part of cc-lib.
|
||||||
|
// Use of this source code is governed by a MIT-style
|
||||||
|
// license that can be found in the LICENSE file.
|
||||||
|
// additional authors:
|
||||||
|
// Holger Obermaier (NHR@KIT)
|
||||||
|
|
||||||
package collectors
|
package collectors
|
||||||
|
|
||||||
import (
|
import (
|
||||||
@@ -5,11 +12,13 @@ import (
|
|||||||
"errors"
|
"errors"
|
||||||
"fmt"
|
"fmt"
|
||||||
"log"
|
"log"
|
||||||
|
"maps"
|
||||||
|
"slices"
|
||||||
"strings"
|
"strings"
|
||||||
"time"
|
"time"
|
||||||
|
|
||||||
cclog "github.com/ClusterCockpit/cc-lib/ccLogger"
|
cclog "github.com/ClusterCockpit/cc-lib/v2/ccLogger"
|
||||||
lp "github.com/ClusterCockpit/cc-lib/ccMessage"
|
lp "github.com/ClusterCockpit/cc-lib/v2/ccMessage"
|
||||||
"github.com/NVIDIA/go-nvml/pkg/nvml"
|
"github.com/NVIDIA/go-nvml/pkg/nvml"
|
||||||
)
|
)
|
||||||
|
|
||||||
@@ -31,6 +40,8 @@ type NvidiaCollectorDevice struct {
|
|||||||
excludeMetrics map[string]bool
|
excludeMetrics map[string]bool
|
||||||
tags map[string]string
|
tags map[string]string
|
||||||
meta map[string]string
|
meta map[string]string
|
||||||
|
lastEnergyReading uint64
|
||||||
|
lastEnergyTimestamp time.Time
|
||||||
}
|
}
|
||||||
|
|
||||||
type NvidiaCollector struct {
|
type NvidiaCollector struct {
|
||||||
@@ -55,7 +66,9 @@ func (m *NvidiaCollector) Init(config json.RawMessage) error {
|
|||||||
m.config.ProcessMigDevices = false
|
m.config.ProcessMigDevices = false
|
||||||
m.config.UseUuidForMigDevices = false
|
m.config.UseUuidForMigDevices = false
|
||||||
m.config.UseSliceForMigDevices = false
|
m.config.UseSliceForMigDevices = false
|
||||||
m.setup()
|
if err := m.setup(); err != nil {
|
||||||
|
return fmt.Errorf("%s Init(): setup() call failed: %w", m.name, err)
|
||||||
|
}
|
||||||
if len(config) > 0 {
|
if len(config) > 0 {
|
||||||
err = json.Unmarshal(config, &m.config)
|
err = json.Unmarshal(config, &m.config)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
@@ -96,11 +109,11 @@ func (m *NvidiaCollector) Init(config json.RawMessage) error {
|
|||||||
// For all GPUs
|
// For all GPUs
|
||||||
idx := 0
|
idx := 0
|
||||||
m.gpus = make([]NvidiaCollectorDevice, num_gpus)
|
m.gpus = make([]NvidiaCollectorDevice, num_gpus)
|
||||||
for i := 0; i < num_gpus; i++ {
|
for i := range num_gpus {
|
||||||
|
|
||||||
// Skip excluded devices by ID
|
// Skip excluded devices by ID
|
||||||
str_i := fmt.Sprintf("%d", i)
|
str_i := fmt.Sprintf("%d", i)
|
||||||
if _, skip := stringArrayContains(m.config.ExcludeDevices, str_i); skip {
|
if slices.Contains(m.config.ExcludeDevices, str_i) {
|
||||||
cclog.ComponentDebug(m.name, "Skipping excluded device", str_i)
|
cclog.ComponentDebug(m.name, "Skipping excluded device", str_i)
|
||||||
continue
|
continue
|
||||||
}
|
}
|
||||||
@@ -128,7 +141,7 @@ func (m *NvidiaCollector) Init(config json.RawMessage) error {
|
|||||||
pciInfo.Device)
|
pciInfo.Device)
|
||||||
|
|
||||||
// Skip excluded devices specified by PCI ID
|
// Skip excluded devices specified by PCI ID
|
||||||
if _, skip := stringArrayContains(m.config.ExcludeDevices, pci_id); skip {
|
if slices.Contains(m.config.ExcludeDevices, pci_id) {
|
||||||
cclog.ComponentDebug(m.name, "Skipping excluded device", pci_id)
|
cclog.ComponentDebug(m.name, "Skipping excluded device", pci_id)
|
||||||
continue
|
continue
|
||||||
}
|
}
|
||||||
@@ -149,6 +162,8 @@ func (m *NvidiaCollector) Init(config json.RawMessage) error {
|
|||||||
|
|
||||||
// Add device handle
|
// Add device handle
|
||||||
g.device = device
|
g.device = device
|
||||||
|
g.lastEnergyReading = 0
|
||||||
|
g.lastEnergyTimestamp = time.Now()
|
||||||
|
|
||||||
// Add tags
|
// Add tags
|
||||||
g.tags = map[string]string{
|
g.tags = map[string]string{
|
||||||
@@ -206,23 +221,25 @@ func (m *NvidiaCollector) Init(config json.RawMessage) error {
|
|||||||
return nil
|
return nil
|
||||||
}
|
}
|
||||||
|
|
||||||
func readMemoryInfo(device NvidiaCollectorDevice, output chan lp.CCMessage) error {
|
func readMemoryInfo(device *NvidiaCollectorDevice, output chan lp.CCMessage) error {
|
||||||
if !device.excludeMetrics["nv_fb_mem_total"] || !device.excludeMetrics["nv_fb_mem_used"] || !device.excludeMetrics["nv_fb_mem_reserved"] {
|
if !device.excludeMetrics["nv_fb_mem_total"] || !device.excludeMetrics["nv_fb_mem_used"] || !device.excludeMetrics["nv_fb_mem_reserved"] {
|
||||||
var total uint64
|
var total uint64
|
||||||
var used uint64
|
var used uint64
|
||||||
var reserved uint64 = 0
|
var reserved uint64 = 0
|
||||||
var v2 bool = false
|
v2 := false
|
||||||
meminfo, ret := nvml.DeviceGetMemoryInfo(device.device)
|
meminfo, ret := nvml.DeviceGetMemoryInfo(device.device)
|
||||||
if ret != nvml.SUCCESS {
|
if ret != nvml.SUCCESS {
|
||||||
err := errors.New(nvml.ErrorString(ret))
|
err := errors.New(nvml.ErrorString(ret))
|
||||||
return err
|
return err
|
||||||
}
|
}
|
||||||
|
// Total physical device memory (in bytes)
|
||||||
total = meminfo.Total
|
total = meminfo.Total
|
||||||
|
// Sum of Reserved and Allocated device memory (in bytes)
|
||||||
used = meminfo.Used
|
used = meminfo.Used
|
||||||
|
|
||||||
if !device.excludeMetrics["nv_fb_mem_total"] {
|
if !device.excludeMetrics["nv_fb_mem_total"] {
|
||||||
t := float64(total) / (1024 * 1024)
|
t := float64(total) / (1024 * 1024)
|
||||||
y, err := lp.NewMessage("nv_fb_mem_total", device.tags, device.meta, map[string]interface{}{"value": t}, time.Now())
|
y, err := lp.NewMessage("nv_fb_mem_total", device.tags, device.meta, map[string]any{"value": t}, time.Now())
|
||||||
if err == nil {
|
if err == nil {
|
||||||
y.AddMeta("unit", "MByte")
|
y.AddMeta("unit", "MByte")
|
||||||
output <- y
|
output <- y
|
||||||
@@ -231,7 +248,7 @@ func readMemoryInfo(device NvidiaCollectorDevice, output chan lp.CCMessage) erro
|
|||||||
|
|
||||||
if !device.excludeMetrics["nv_fb_mem_used"] {
|
if !device.excludeMetrics["nv_fb_mem_used"] {
|
||||||
f := float64(used) / (1024 * 1024)
|
f := float64(used) / (1024 * 1024)
|
||||||
y, err := lp.NewMessage("nv_fb_mem_used", device.tags, device.meta, map[string]interface{}{"value": f}, time.Now())
|
y, err := lp.NewMessage("nv_fb_mem_used", device.tags, device.meta, map[string]any{"value": f}, time.Now())
|
||||||
if err == nil {
|
if err == nil {
|
||||||
y.AddMeta("unit", "MByte")
|
y.AddMeta("unit", "MByte")
|
||||||
output <- y
|
output <- y
|
||||||
@@ -240,7 +257,7 @@ func readMemoryInfo(device NvidiaCollectorDevice, output chan lp.CCMessage) erro
|
|||||||
|
|
||||||
if v2 && !device.excludeMetrics["nv_fb_mem_reserved"] {
|
if v2 && !device.excludeMetrics["nv_fb_mem_reserved"] {
|
||||||
r := float64(reserved) / (1024 * 1024)
|
r := float64(reserved) / (1024 * 1024)
|
||||||
y, err := lp.NewMessage("nv_fb_mem_reserved", device.tags, device.meta, map[string]interface{}{"value": r}, time.Now())
|
y, err := lp.NewMessage("nv_fb_mem_reserved", device.tags, device.meta, map[string]any{"value": r}, time.Now())
|
||||||
if err == nil {
|
if err == nil {
|
||||||
y.AddMeta("unit", "MByte")
|
y.AddMeta("unit", "MByte")
|
||||||
output <- y
|
output <- y
|
||||||
@@ -250,7 +267,7 @@ func readMemoryInfo(device NvidiaCollectorDevice, output chan lp.CCMessage) erro
|
|||||||
return nil
|
return nil
|
||||||
}
|
}
|
||||||
|
|
||||||
func readBarMemoryInfo(device NvidiaCollectorDevice, output chan lp.CCMessage) error {
|
func readBarMemoryInfo(device *NvidiaCollectorDevice, output chan lp.CCMessage) error {
|
||||||
if !device.excludeMetrics["nv_bar1_mem_total"] || !device.excludeMetrics["nv_bar1_mem_used"] {
|
if !device.excludeMetrics["nv_bar1_mem_total"] || !device.excludeMetrics["nv_bar1_mem_used"] {
|
||||||
meminfo, ret := nvml.DeviceGetBAR1MemoryInfo(device.device)
|
meminfo, ret := nvml.DeviceGetBAR1MemoryInfo(device.device)
|
||||||
if ret != nvml.SUCCESS {
|
if ret != nvml.SUCCESS {
|
||||||
@@ -259,7 +276,7 @@ func readBarMemoryInfo(device NvidiaCollectorDevice, output chan lp.CCMessage) e
|
|||||||
}
|
}
|
||||||
if !device.excludeMetrics["nv_bar1_mem_total"] {
|
if !device.excludeMetrics["nv_bar1_mem_total"] {
|
||||||
t := float64(meminfo.Bar1Total) / (1024 * 1024)
|
t := float64(meminfo.Bar1Total) / (1024 * 1024)
|
||||||
y, err := lp.NewMessage("nv_bar1_mem_total", device.tags, device.meta, map[string]interface{}{"value": t}, time.Now())
|
y, err := lp.NewMessage("nv_bar1_mem_total", device.tags, device.meta, map[string]any{"value": t}, time.Now())
|
||||||
if err == nil {
|
if err == nil {
|
||||||
y.AddMeta("unit", "MByte")
|
y.AddMeta("unit", "MByte")
|
||||||
output <- y
|
output <- y
|
||||||
@@ -267,7 +284,7 @@ func readBarMemoryInfo(device NvidiaCollectorDevice, output chan lp.CCMessage) e
|
|||||||
}
|
}
|
||||||
if !device.excludeMetrics["nv_bar1_mem_used"] {
|
if !device.excludeMetrics["nv_bar1_mem_used"] {
|
||||||
t := float64(meminfo.Bar1Used) / (1024 * 1024)
|
t := float64(meminfo.Bar1Used) / (1024 * 1024)
|
||||||
y, err := lp.NewMessage("nv_bar1_mem_used", device.tags, device.meta, map[string]interface{}{"value": t}, time.Now())
|
y, err := lp.NewMessage("nv_bar1_mem_used", device.tags, device.meta, map[string]any{"value": t}, time.Now())
|
||||||
if err == nil {
|
if err == nil {
|
||||||
y.AddMeta("unit", "MByte")
|
y.AddMeta("unit", "MByte")
|
||||||
output <- y
|
output <- y
|
||||||
@@ -277,7 +294,7 @@ func readBarMemoryInfo(device NvidiaCollectorDevice, output chan lp.CCMessage) e
|
|||||||
return nil
|
return nil
|
||||||
}
|
}
|
||||||
|
|
||||||
func readUtilization(device NvidiaCollectorDevice, output chan lp.CCMessage) error {
|
func readUtilization(device *NvidiaCollectorDevice, output chan lp.CCMessage) error {
|
||||||
isMig, ret := nvml.DeviceIsMigDeviceHandle(device.device)
|
isMig, ret := nvml.DeviceIsMigDeviceHandle(device.device)
|
||||||
if ret != nvml.SUCCESS {
|
if ret != nvml.SUCCESS {
|
||||||
err := errors.New(nvml.ErrorString(ret))
|
err := errors.New(nvml.ErrorString(ret))
|
||||||
@@ -301,14 +318,14 @@ func readUtilization(device NvidiaCollectorDevice, output chan lp.CCMessage) err
|
|||||||
util, ret := nvml.DeviceGetUtilizationRates(device.device)
|
util, ret := nvml.DeviceGetUtilizationRates(device.device)
|
||||||
if ret == nvml.SUCCESS {
|
if ret == nvml.SUCCESS {
|
||||||
if !device.excludeMetrics["nv_util"] {
|
if !device.excludeMetrics["nv_util"] {
|
||||||
y, err := lp.NewMessage("nv_util", device.tags, device.meta, map[string]interface{}{"value": float64(util.Gpu)}, time.Now())
|
y, err := lp.NewMessage("nv_util", device.tags, device.meta, map[string]any{"value": float64(util.Gpu)}, time.Now())
|
||||||
if err == nil {
|
if err == nil {
|
||||||
y.AddMeta("unit", "%")
|
y.AddMeta("unit", "%")
|
||||||
output <- y
|
output <- y
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
if !device.excludeMetrics["nv_mem_util"] {
|
if !device.excludeMetrics["nv_mem_util"] {
|
||||||
y, err := lp.NewMessage("nv_mem_util", device.tags, device.meta, map[string]interface{}{"value": float64(util.Memory)}, time.Now())
|
y, err := lp.NewMessage("nv_mem_util", device.tags, device.meta, map[string]any{"value": float64(util.Memory)}, time.Now())
|
||||||
if err == nil {
|
if err == nil {
|
||||||
y.AddMeta("unit", "%")
|
y.AddMeta("unit", "%")
|
||||||
output <- y
|
output <- y
|
||||||
@@ -319,7 +336,7 @@ func readUtilization(device NvidiaCollectorDevice, output chan lp.CCMessage) err
|
|||||||
return nil
|
return nil
|
||||||
}
|
}
|
||||||
|
|
||||||
func readTemp(device NvidiaCollectorDevice, output chan lp.CCMessage) error {
|
func readTemp(device *NvidiaCollectorDevice, output chan lp.CCMessage) error {
|
||||||
if !device.excludeMetrics["nv_temp"] {
|
if !device.excludeMetrics["nv_temp"] {
|
||||||
// Retrieves the current temperature readings for the device, in degrees C.
|
// Retrieves the current temperature readings for the device, in degrees C.
|
||||||
//
|
//
|
||||||
@@ -328,7 +345,7 @@ func readTemp(device NvidiaCollectorDevice, output chan lp.CCMessage) error {
|
|||||||
// * NVML_TEMPERATURE_COUNT
|
// * NVML_TEMPERATURE_COUNT
|
||||||
temp, ret := nvml.DeviceGetTemperature(device.device, nvml.TEMPERATURE_GPU)
|
temp, ret := nvml.DeviceGetTemperature(device.device, nvml.TEMPERATURE_GPU)
|
||||||
if ret == nvml.SUCCESS {
|
if ret == nvml.SUCCESS {
|
||||||
y, err := lp.NewMessage("nv_temp", device.tags, device.meta, map[string]interface{}{"value": float64(temp)}, time.Now())
|
y, err := lp.NewMessage("nv_temp", device.tags, device.meta, map[string]any{"value": float64(temp)}, time.Now())
|
||||||
if err == nil {
|
if err == nil {
|
||||||
y.AddMeta("unit", "degC")
|
y.AddMeta("unit", "degC")
|
||||||
output <- y
|
output <- y
|
||||||
@@ -338,7 +355,7 @@ func readTemp(device NvidiaCollectorDevice, output chan lp.CCMessage) error {
|
|||||||
return nil
|
return nil
|
||||||
}
|
}
|
||||||
|
|
||||||
func readFan(device NvidiaCollectorDevice, output chan lp.CCMessage) error {
|
func readFan(device *NvidiaCollectorDevice, output chan lp.CCMessage) error {
|
||||||
if !device.excludeMetrics["nv_fan"] {
|
if !device.excludeMetrics["nv_fan"] {
|
||||||
// Retrieves the intended operating speed of the device's fan.
|
// Retrieves the intended operating speed of the device's fan.
|
||||||
//
|
//
|
||||||
@@ -351,7 +368,7 @@ func readFan(device NvidiaCollectorDevice, output chan lp.CCMessage) error {
|
|||||||
// This value may exceed 100% in certain cases.
|
// This value may exceed 100% in certain cases.
|
||||||
fan, ret := nvml.DeviceGetFanSpeed(device.device)
|
fan, ret := nvml.DeviceGetFanSpeed(device.device)
|
||||||
if ret == nvml.SUCCESS {
|
if ret == nvml.SUCCESS {
|
||||||
y, err := lp.NewMessage("nv_fan", device.tags, device.meta, map[string]interface{}{"value": float64(fan)}, time.Now())
|
y, err := lp.NewMessage("nv_fan", device.tags, device.meta, map[string]any{"value": float64(fan)}, time.Now())
|
||||||
if err == nil {
|
if err == nil {
|
||||||
y.AddMeta("unit", "%")
|
y.AddMeta("unit", "%")
|
||||||
output <- y
|
output <- y
|
||||||
@@ -361,7 +378,7 @@ func readFan(device NvidiaCollectorDevice, output chan lp.CCMessage) error {
|
|||||||
return nil
|
return nil
|
||||||
}
|
}
|
||||||
|
|
||||||
// func readFans(device NvidiaCollectorDevice, output chan lp.CCMessage) error {
|
// func readFans(device *NvidiaCollectorDevice, output chan lp.CCMessage) error {
|
||||||
// if !device.excludeMetrics["nv_fan"] {
|
// if !device.excludeMetrics["nv_fan"] {
|
||||||
// numFans, ret := nvml.DeviceGetNumFans(device.device)
|
// numFans, ret := nvml.DeviceGetNumFans(device.device)
|
||||||
// if ret == nvml.SUCCESS {
|
// if ret == nvml.SUCCESS {
|
||||||
@@ -382,7 +399,7 @@ func readFan(device NvidiaCollectorDevice, output chan lp.CCMessage) error {
|
|||||||
// return nil
|
// return nil
|
||||||
// }
|
// }
|
||||||
|
|
||||||
func readEccMode(device NvidiaCollectorDevice, output chan lp.CCMessage) error {
|
func readEccMode(device *NvidiaCollectorDevice, output chan lp.CCMessage) error {
|
||||||
if !device.excludeMetrics["nv_ecc_mode"] {
|
if !device.excludeMetrics["nv_ecc_mode"] {
|
||||||
// Retrieves the current and pending ECC modes for the device.
|
// Retrieves the current and pending ECC modes for the device.
|
||||||
//
|
//
|
||||||
@@ -392,22 +409,23 @@ func readEccMode(device NvidiaCollectorDevice, output chan lp.CCMessage) error {
|
|||||||
// Changing ECC modes requires a reboot.
|
// Changing ECC modes requires a reboot.
|
||||||
// The "pending" ECC mode refers to the target mode following the next reboot.
|
// The "pending" ECC mode refers to the target mode following the next reboot.
|
||||||
_, ecc_pend, ret := nvml.DeviceGetEccMode(device.device)
|
_, ecc_pend, ret := nvml.DeviceGetEccMode(device.device)
|
||||||
if ret == nvml.SUCCESS {
|
switch ret {
|
||||||
|
case nvml.SUCCESS:
|
||||||
var y lp.CCMessage
|
var y lp.CCMessage
|
||||||
var err error
|
var err error
|
||||||
switch ecc_pend {
|
switch ecc_pend {
|
||||||
case nvml.FEATURE_DISABLED:
|
case nvml.FEATURE_DISABLED:
|
||||||
y, err = lp.NewMessage("nv_ecc_mode", device.tags, device.meta, map[string]interface{}{"value": "OFF"}, time.Now())
|
y, err = lp.NewMessage("nv_ecc_mode", device.tags, device.meta, map[string]any{"value": "OFF"}, time.Now())
|
||||||
case nvml.FEATURE_ENABLED:
|
case nvml.FEATURE_ENABLED:
|
||||||
y, err = lp.NewMessage("nv_ecc_mode", device.tags, device.meta, map[string]interface{}{"value": "ON"}, time.Now())
|
y, err = lp.NewMessage("nv_ecc_mode", device.tags, device.meta, map[string]any{"value": "ON"}, time.Now())
|
||||||
default:
|
default:
|
||||||
y, err = lp.NewMessage("nv_ecc_mode", device.tags, device.meta, map[string]interface{}{"value": "UNKNOWN"}, time.Now())
|
y, err = lp.NewMessage("nv_ecc_mode", device.tags, device.meta, map[string]any{"value": "UNKNOWN"}, time.Now())
|
||||||
}
|
}
|
||||||
if err == nil {
|
if err == nil {
|
||||||
output <- y
|
output <- y
|
||||||
}
|
}
|
||||||
} else if ret == nvml.ERROR_NOT_SUPPORTED {
|
case nvml.ERROR_NOT_SUPPORTED:
|
||||||
y, err := lp.NewMessage("nv_ecc_mode", device.tags, device.meta, map[string]interface{}{"value": "N/A"}, time.Now())
|
y, err := lp.NewMessage("nv_ecc_mode", device.tags, device.meta, map[string]any{"value": "N/A"}, time.Now())
|
||||||
if err == nil {
|
if err == nil {
|
||||||
output <- y
|
output <- y
|
||||||
}
|
}
|
||||||
@@ -416,7 +434,7 @@ func readEccMode(device NvidiaCollectorDevice, output chan lp.CCMessage) error {
|
|||||||
return nil
|
return nil
|
||||||
}
|
}
|
||||||
|
|
||||||
func readPerfState(device NvidiaCollectorDevice, output chan lp.CCMessage) error {
|
func readPerfState(device *NvidiaCollectorDevice, output chan lp.CCMessage) error {
|
||||||
if !device.excludeMetrics["nv_perf_state"] {
|
if !device.excludeMetrics["nv_perf_state"] {
|
||||||
// Retrieves the current performance state for the device.
|
// Retrieves the current performance state for the device.
|
||||||
//
|
//
|
||||||
@@ -427,7 +445,7 @@ func readPerfState(device NvidiaCollectorDevice, output chan lp.CCMessage) error
|
|||||||
// 32: Unknown performance state.
|
// 32: Unknown performance state.
|
||||||
pState, ret := nvml.DeviceGetPerformanceState(device.device)
|
pState, ret := nvml.DeviceGetPerformanceState(device.device)
|
||||||
if ret == nvml.SUCCESS {
|
if ret == nvml.SUCCESS {
|
||||||
y, err := lp.NewMessage("nv_perf_state", device.tags, device.meta, map[string]interface{}{"value": fmt.Sprintf("P%d", int(pState))}, time.Now())
|
y, err := lp.NewMessage("nv_perf_state", device.tags, device.meta, map[string]any{"value": fmt.Sprintf("P%d", int(pState))}, time.Now())
|
||||||
if err == nil {
|
if err == nil {
|
||||||
output <- y
|
output <- y
|
||||||
}
|
}
|
||||||
@@ -436,13 +454,16 @@ func readPerfState(device NvidiaCollectorDevice, output chan lp.CCMessage) error
|
|||||||
return nil
|
return nil
|
||||||
}
|
}
|
||||||
|
|
||||||
func readPowerUsage(device NvidiaCollectorDevice, output chan lp.CCMessage) error {
|
func readPowerUsage(device *NvidiaCollectorDevice, output chan lp.CCMessage) error {
|
||||||
if !device.excludeMetrics["nv_power_usage"] {
|
if !device.excludeMetrics["nv_power_usage"] {
|
||||||
// Retrieves power usage for this GPU in milliwatts and its associated circuitry (e.g. memory)
|
// Retrieves power usage for this GPU in milliwatts and its associated circuitry (e.g. memory)
|
||||||
//
|
//
|
||||||
// On Fermi and Kepler GPUs the reading is accurate to within +/- 5% of current power draw.
|
// On Fermi and Kepler GPUs the reading is accurate to within +/- 5% of current power draw.
|
||||||
|
// On Ampere (except GA100) or newer GPUs, the API returns power averaged over 1 sec interval.
|
||||||
|
// On GA100 and older architectures, instantaneous power is returned.
|
||||||
//
|
//
|
||||||
// It is only available if power management mode is supported
|
// It is only available if power management mode is supported.
|
||||||
|
|
||||||
mode, ret := nvml.DeviceGetPowerManagementMode(device.device)
|
mode, ret := nvml.DeviceGetPowerManagementMode(device.device)
|
||||||
if ret != nvml.SUCCESS {
|
if ret != nvml.SUCCESS {
|
||||||
return nil
|
return nil
|
||||||
@@ -450,7 +471,7 @@ func readPowerUsage(device NvidiaCollectorDevice, output chan lp.CCMessage) erro
|
|||||||
if mode == nvml.FEATURE_ENABLED {
|
if mode == nvml.FEATURE_ENABLED {
|
||||||
power, ret := nvml.DeviceGetPowerUsage(device.device)
|
power, ret := nvml.DeviceGetPowerUsage(device.device)
|
||||||
if ret == nvml.SUCCESS {
|
if ret == nvml.SUCCESS {
|
||||||
y, err := lp.NewMessage("nv_power_usage", device.tags, device.meta, map[string]interface{}{"value": float64(power) / 1000}, time.Now())
|
y, err := lp.NewMessage("nv_power_usage", device.tags, device.meta, map[string]any{"value": float64(power) / 1000}, time.Now())
|
||||||
if err == nil {
|
if err == nil {
|
||||||
y.AddMeta("unit", "watts")
|
y.AddMeta("unit", "watts")
|
||||||
output <- y
|
output <- y
|
||||||
@@ -461,7 +482,54 @@ func readPowerUsage(device NvidiaCollectorDevice, output chan lp.CCMessage) erro
|
|||||||
return nil
|
return nil
|
||||||
}
|
}
|
||||||
|
|
||||||
func readClocks(device NvidiaCollectorDevice, output chan lp.CCMessage) error {
|
func readEnergyConsumption(device *NvidiaCollectorDevice, output chan lp.CCMessage) error {
|
||||||
|
// Retrieves total energy consumption for this GPU in millijoules (mJ) since the driver was last reloaded
|
||||||
|
|
||||||
|
// For Volta or newer fully supported devices.
|
||||||
|
if (!device.excludeMetrics["nv_energy"]) && (!device.excludeMetrics["nv_energy_abs"]) && (!device.excludeMetrics["nv_average_power"]) {
|
||||||
|
now := time.Now()
|
||||||
|
mode, ret := nvml.DeviceGetPowerManagementMode(device.device)
|
||||||
|
if ret != nvml.SUCCESS {
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
if mode == nvml.FEATURE_ENABLED {
|
||||||
|
energy, ret := nvml.DeviceGetTotalEnergyConsumption(device.device)
|
||||||
|
if ret == nvml.SUCCESS {
|
||||||
|
if device.lastEnergyReading != 0 {
|
||||||
|
if !device.excludeMetrics["nv_energy"] {
|
||||||
|
y, err := lp.NewMetric("nv_energy", device.tags, device.meta, (energy-device.lastEnergyReading)/1000, now)
|
||||||
|
if err == nil {
|
||||||
|
y.AddMeta("unit", "Joules")
|
||||||
|
output <- y
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if !device.excludeMetrics["nv_average_power"] {
|
||||||
|
|
||||||
|
energyDiff := (energy - device.lastEnergyReading) / 1000
|
||||||
|
timeDiff := now.Sub(device.lastEnergyTimestamp)
|
||||||
|
y, err := lp.NewMetric("nv_average_power", device.tags, device.meta, energyDiff/uint64(timeDiff.Seconds()), now)
|
||||||
|
if err == nil {
|
||||||
|
y.AddMeta("unit", "watts")
|
||||||
|
output <- y
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if !device.excludeMetrics["nv_energy_abs"] {
|
||||||
|
y, err := lp.NewMetric("nv_energy_abs", device.tags, device.meta, energy/1000, now)
|
||||||
|
if err == nil {
|
||||||
|
y.AddMeta("unit", "Joules")
|
||||||
|
output <- y
|
||||||
|
}
|
||||||
|
}
|
||||||
|
device.lastEnergyReading = energy
|
||||||
|
device.lastEnergyTimestamp = time.Now()
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
|
||||||
|
func readClocks(device *NvidiaCollectorDevice, output chan lp.CCMessage) error {
|
||||||
// Retrieves the current clock speeds for the device.
|
// Retrieves the current clock speeds for the device.
|
||||||
//
|
//
|
||||||
// Available clock information:
|
// Available clock information:
|
||||||
@@ -471,7 +539,7 @@ func readClocks(device NvidiaCollectorDevice, output chan lp.CCMessage) error {
|
|||||||
if !device.excludeMetrics["nv_graphics_clock"] {
|
if !device.excludeMetrics["nv_graphics_clock"] {
|
||||||
graphicsClock, ret := nvml.DeviceGetClockInfo(device.device, nvml.CLOCK_GRAPHICS)
|
graphicsClock, ret := nvml.DeviceGetClockInfo(device.device, nvml.CLOCK_GRAPHICS)
|
||||||
if ret == nvml.SUCCESS {
|
if ret == nvml.SUCCESS {
|
||||||
y, err := lp.NewMessage("nv_graphics_clock", device.tags, device.meta, map[string]interface{}{"value": float64(graphicsClock)}, time.Now())
|
y, err := lp.NewMessage("nv_graphics_clock", device.tags, device.meta, map[string]any{"value": float64(graphicsClock)}, time.Now())
|
||||||
if err == nil {
|
if err == nil {
|
||||||
y.AddMeta("unit", "MHz")
|
y.AddMeta("unit", "MHz")
|
||||||
output <- y
|
output <- y
|
||||||
@@ -482,7 +550,7 @@ func readClocks(device NvidiaCollectorDevice, output chan lp.CCMessage) error {
|
|||||||
if !device.excludeMetrics["nv_sm_clock"] {
|
if !device.excludeMetrics["nv_sm_clock"] {
|
||||||
smCock, ret := nvml.DeviceGetClockInfo(device.device, nvml.CLOCK_SM)
|
smCock, ret := nvml.DeviceGetClockInfo(device.device, nvml.CLOCK_SM)
|
||||||
if ret == nvml.SUCCESS {
|
if ret == nvml.SUCCESS {
|
||||||
y, err := lp.NewMessage("nv_sm_clock", device.tags, device.meta, map[string]interface{}{"value": float64(smCock)}, time.Now())
|
y, err := lp.NewMessage("nv_sm_clock", device.tags, device.meta, map[string]any{"value": float64(smCock)}, time.Now())
|
||||||
if err == nil {
|
if err == nil {
|
||||||
y.AddMeta("unit", "MHz")
|
y.AddMeta("unit", "MHz")
|
||||||
output <- y
|
output <- y
|
||||||
@@ -493,7 +561,7 @@ func readClocks(device NvidiaCollectorDevice, output chan lp.CCMessage) error {
|
|||||||
if !device.excludeMetrics["nv_mem_clock"] {
|
if !device.excludeMetrics["nv_mem_clock"] {
|
||||||
memClock, ret := nvml.DeviceGetClockInfo(device.device, nvml.CLOCK_MEM)
|
memClock, ret := nvml.DeviceGetClockInfo(device.device, nvml.CLOCK_MEM)
|
||||||
if ret == nvml.SUCCESS {
|
if ret == nvml.SUCCESS {
|
||||||
y, err := lp.NewMessage("nv_mem_clock", device.tags, device.meta, map[string]interface{}{"value": float64(memClock)}, time.Now())
|
y, err := lp.NewMessage("nv_mem_clock", device.tags, device.meta, map[string]any{"value": float64(memClock)}, time.Now())
|
||||||
if err == nil {
|
if err == nil {
|
||||||
y.AddMeta("unit", "MHz")
|
y.AddMeta("unit", "MHz")
|
||||||
output <- y
|
output <- y
|
||||||
@@ -503,7 +571,7 @@ func readClocks(device NvidiaCollectorDevice, output chan lp.CCMessage) error {
|
|||||||
if !device.excludeMetrics["nv_video_clock"] {
|
if !device.excludeMetrics["nv_video_clock"] {
|
||||||
memClock, ret := nvml.DeviceGetClockInfo(device.device, nvml.CLOCK_VIDEO)
|
memClock, ret := nvml.DeviceGetClockInfo(device.device, nvml.CLOCK_VIDEO)
|
||||||
if ret == nvml.SUCCESS {
|
if ret == nvml.SUCCESS {
|
||||||
y, err := lp.NewMessage("nv_video_clock", device.tags, device.meta, map[string]interface{}{"value": float64(memClock)}, time.Now())
|
y, err := lp.NewMessage("nv_video_clock", device.tags, device.meta, map[string]any{"value": float64(memClock)}, time.Now())
|
||||||
if err == nil {
|
if err == nil {
|
||||||
y.AddMeta("unit", "MHz")
|
y.AddMeta("unit", "MHz")
|
||||||
output <- y
|
output <- y
|
||||||
@@ -513,7 +581,7 @@ func readClocks(device NvidiaCollectorDevice, output chan lp.CCMessage) error {
|
|||||||
return nil
|
return nil
|
||||||
}
|
}
|
||||||
|
|
||||||
func readMaxClocks(device NvidiaCollectorDevice, output chan lp.CCMessage) error {
|
func readMaxClocks(device *NvidiaCollectorDevice, output chan lp.CCMessage) error {
|
||||||
// Retrieves the maximum clock speeds for the device.
|
// Retrieves the maximum clock speeds for the device.
|
||||||
//
|
//
|
||||||
// Available clock information:
|
// Available clock information:
|
||||||
@@ -528,7 +596,7 @@ func readMaxClocks(device NvidiaCollectorDevice, output chan lp.CCMessage) error
|
|||||||
if !device.excludeMetrics["nv_max_graphics_clock"] {
|
if !device.excludeMetrics["nv_max_graphics_clock"] {
|
||||||
max_gclk, ret := nvml.DeviceGetMaxClockInfo(device.device, nvml.CLOCK_GRAPHICS)
|
max_gclk, ret := nvml.DeviceGetMaxClockInfo(device.device, nvml.CLOCK_GRAPHICS)
|
||||||
if ret == nvml.SUCCESS {
|
if ret == nvml.SUCCESS {
|
||||||
y, err := lp.NewMessage("nv_max_graphics_clock", device.tags, device.meta, map[string]interface{}{"value": float64(max_gclk)}, time.Now())
|
y, err := lp.NewMetric("nv_max_graphics_clock", device.tags, device.meta, float64(max_gclk), time.Now())
|
||||||
if err == nil {
|
if err == nil {
|
||||||
y.AddMeta("unit", "MHz")
|
y.AddMeta("unit", "MHz")
|
||||||
output <- y
|
output <- y
|
||||||
@@ -537,9 +605,9 @@ func readMaxClocks(device NvidiaCollectorDevice, output chan lp.CCMessage) error
|
|||||||
}
|
}
|
||||||
|
|
||||||
if !device.excludeMetrics["nv_max_sm_clock"] {
|
if !device.excludeMetrics["nv_max_sm_clock"] {
|
||||||
maxSmClock, ret := nvml.DeviceGetClockInfo(device.device, nvml.CLOCK_SM)
|
maxSmClock, ret := nvml.DeviceGetMaxClockInfo(device.device, nvml.CLOCK_SM)
|
||||||
if ret == nvml.SUCCESS {
|
if ret == nvml.SUCCESS {
|
||||||
y, err := lp.NewMessage("nv_max_sm_clock", device.tags, device.meta, map[string]interface{}{"value": float64(maxSmClock)}, time.Now())
|
y, err := lp.NewMetric("nv_max_sm_clock", device.tags, device.meta, float64(maxSmClock), time.Now())
|
||||||
if err == nil {
|
if err == nil {
|
||||||
y.AddMeta("unit", "MHz")
|
y.AddMeta("unit", "MHz")
|
||||||
output <- y
|
output <- y
|
||||||
@@ -548,9 +616,9 @@ func readMaxClocks(device NvidiaCollectorDevice, output chan lp.CCMessage) error
|
|||||||
}
|
}
|
||||||
|
|
||||||
if !device.excludeMetrics["nv_max_mem_clock"] {
|
if !device.excludeMetrics["nv_max_mem_clock"] {
|
||||||
maxMemClock, ret := nvml.DeviceGetClockInfo(device.device, nvml.CLOCK_MEM)
|
maxMemClock, ret := nvml.DeviceGetMaxClockInfo(device.device, nvml.CLOCK_MEM)
|
||||||
if ret == nvml.SUCCESS {
|
if ret == nvml.SUCCESS {
|
||||||
y, err := lp.NewMessage("nv_max_mem_clock", device.tags, device.meta, map[string]interface{}{"value": float64(maxMemClock)}, time.Now())
|
y, err := lp.NewMetric("nv_max_mem_clock", device.tags, device.meta, float64(maxMemClock), time.Now())
|
||||||
if err == nil {
|
if err == nil {
|
||||||
y.AddMeta("unit", "MHz")
|
y.AddMeta("unit", "MHz")
|
||||||
output <- y
|
output <- y
|
||||||
@@ -559,9 +627,9 @@ func readMaxClocks(device NvidiaCollectorDevice, output chan lp.CCMessage) error
|
|||||||
}
|
}
|
||||||
|
|
||||||
if !device.excludeMetrics["nv_max_video_clock"] {
|
if !device.excludeMetrics["nv_max_video_clock"] {
|
||||||
maxMemClock, ret := nvml.DeviceGetClockInfo(device.device, nvml.CLOCK_VIDEO)
|
maxVideoClock, ret := nvml.DeviceGetMaxClockInfo(device.device, nvml.CLOCK_VIDEO)
|
||||||
if ret == nvml.SUCCESS {
|
if ret == nvml.SUCCESS {
|
||||||
y, err := lp.NewMessage("nv_max_video_clock", device.tags, device.meta, map[string]interface{}{"value": float64(maxMemClock)}, time.Now())
|
y, err := lp.NewMetric("nv_max_video_clock", device.tags, device.meta, float64(maxVideoClock), time.Now())
|
||||||
if err == nil {
|
if err == nil {
|
||||||
y.AddMeta("unit", "MHz")
|
y.AddMeta("unit", "MHz")
|
||||||
output <- y
|
output <- y
|
||||||
@@ -571,7 +639,7 @@ func readMaxClocks(device NvidiaCollectorDevice, output chan lp.CCMessage) error
|
|||||||
return nil
|
return nil
|
||||||
}
|
}
|
||||||
|
|
||||||
func readEccErrors(device NvidiaCollectorDevice, output chan lp.CCMessage) error {
|
func readEccErrors(device *NvidiaCollectorDevice, output chan lp.CCMessage) error {
|
||||||
if !device.excludeMetrics["nv_ecc_uncorrected_error"] {
|
if !device.excludeMetrics["nv_ecc_uncorrected_error"] {
|
||||||
// Retrieves the total ECC error counts for the device.
|
// Retrieves the total ECC error counts for the device.
|
||||||
//
|
//
|
||||||
@@ -584,7 +652,7 @@ func readEccErrors(device NvidiaCollectorDevice, output chan lp.CCMessage) error
|
|||||||
// i.e. the total set of errors across the entire device.
|
// i.e. the total set of errors across the entire device.
|
||||||
ecc_db, ret := nvml.DeviceGetTotalEccErrors(device.device, nvml.MEMORY_ERROR_TYPE_UNCORRECTED, nvml.AGGREGATE_ECC)
|
ecc_db, ret := nvml.DeviceGetTotalEccErrors(device.device, nvml.MEMORY_ERROR_TYPE_UNCORRECTED, nvml.AGGREGATE_ECC)
|
||||||
if ret == nvml.SUCCESS {
|
if ret == nvml.SUCCESS {
|
||||||
y, err := lp.NewMessage("nv_ecc_uncorrected_error", device.tags, device.meta, map[string]interface{}{"value": float64(ecc_db)}, time.Now())
|
y, err := lp.NewMessage("nv_ecc_uncorrected_error", device.tags, device.meta, map[string]any{"value": float64(ecc_db)}, time.Now())
|
||||||
if err == nil {
|
if err == nil {
|
||||||
output <- y
|
output <- y
|
||||||
}
|
}
|
||||||
@@ -593,7 +661,7 @@ func readEccErrors(device NvidiaCollectorDevice, output chan lp.CCMessage) error
|
|||||||
if !device.excludeMetrics["nv_ecc_corrected_error"] {
|
if !device.excludeMetrics["nv_ecc_corrected_error"] {
|
||||||
ecc_sb, ret := nvml.DeviceGetTotalEccErrors(device.device, nvml.MEMORY_ERROR_TYPE_CORRECTED, nvml.AGGREGATE_ECC)
|
ecc_sb, ret := nvml.DeviceGetTotalEccErrors(device.device, nvml.MEMORY_ERROR_TYPE_CORRECTED, nvml.AGGREGATE_ECC)
|
||||||
if ret == nvml.SUCCESS {
|
if ret == nvml.SUCCESS {
|
||||||
y, err := lp.NewMessage("nv_ecc_corrected_error", device.tags, device.meta, map[string]interface{}{"value": float64(ecc_sb)}, time.Now())
|
y, err := lp.NewMessage("nv_ecc_corrected_error", device.tags, device.meta, map[string]any{"value": float64(ecc_sb)}, time.Now())
|
||||||
if err == nil {
|
if err == nil {
|
||||||
output <- y
|
output <- y
|
||||||
}
|
}
|
||||||
@@ -602,7 +670,7 @@ func readEccErrors(device NvidiaCollectorDevice, output chan lp.CCMessage) error
|
|||||||
return nil
|
return nil
|
||||||
}
|
}
|
||||||
|
|
||||||
func readPowerLimit(device NvidiaCollectorDevice, output chan lp.CCMessage) error {
|
func readPowerLimit(device *NvidiaCollectorDevice, output chan lp.CCMessage) error {
|
||||||
if !device.excludeMetrics["nv_power_max_limit"] {
|
if !device.excludeMetrics["nv_power_max_limit"] {
|
||||||
// Retrieves the power management limit associated with this device.
|
// Retrieves the power management limit associated with this device.
|
||||||
//
|
//
|
||||||
@@ -612,7 +680,7 @@ func readPowerLimit(device NvidiaCollectorDevice, output chan lp.CCMessage) erro
|
|||||||
// If the card's total power draw reaches this limit the power management algorithm kicks in.
|
// If the card's total power draw reaches this limit the power management algorithm kicks in.
|
||||||
pwr_limit, ret := nvml.DeviceGetPowerManagementLimit(device.device)
|
pwr_limit, ret := nvml.DeviceGetPowerManagementLimit(device.device)
|
||||||
if ret == nvml.SUCCESS {
|
if ret == nvml.SUCCESS {
|
||||||
y, err := lp.NewMessage("nv_power_max_limit", device.tags, device.meta, map[string]interface{}{"value": float64(pwr_limit) / 1000}, time.Now())
|
y, err := lp.NewMessage("nv_power_max_limit", device.tags, device.meta, map[string]any{"value": float64(pwr_limit) / 1000}, time.Now())
|
||||||
if err == nil {
|
if err == nil {
|
||||||
y.AddMeta("unit", "watts")
|
y.AddMeta("unit", "watts")
|
||||||
output <- y
|
output <- y
|
||||||
@@ -622,7 +690,7 @@ func readPowerLimit(device NvidiaCollectorDevice, output chan lp.CCMessage) erro
|
|||||||
return nil
|
return nil
|
||||||
}
|
}
|
||||||
|
|
||||||
func readEncUtilization(device NvidiaCollectorDevice, output chan lp.CCMessage) error {
|
func readEncUtilization(device *NvidiaCollectorDevice, output chan lp.CCMessage) error {
|
||||||
isMig, ret := nvml.DeviceIsMigDeviceHandle(device.device)
|
isMig, ret := nvml.DeviceIsMigDeviceHandle(device.device)
|
||||||
if ret != nvml.SUCCESS {
|
if ret != nvml.SUCCESS {
|
||||||
err := errors.New(nvml.ErrorString(ret))
|
err := errors.New(nvml.ErrorString(ret))
|
||||||
@@ -639,7 +707,7 @@ func readEncUtilization(device NvidiaCollectorDevice, output chan lp.CCMessage)
|
|||||||
// Note: On MIG-enabled GPUs, querying encoder utilization is not currently supported.
|
// Note: On MIG-enabled GPUs, querying encoder utilization is not currently supported.
|
||||||
enc_util, _, ret := nvml.DeviceGetEncoderUtilization(device.device)
|
enc_util, _, ret := nvml.DeviceGetEncoderUtilization(device.device)
|
||||||
if ret == nvml.SUCCESS {
|
if ret == nvml.SUCCESS {
|
||||||
y, err := lp.NewMessage("nv_encoder_util", device.tags, device.meta, map[string]interface{}{"value": float64(enc_util)}, time.Now())
|
y, err := lp.NewMessage("nv_encoder_util", device.tags, device.meta, map[string]any{"value": float64(enc_util)}, time.Now())
|
||||||
if err == nil {
|
if err == nil {
|
||||||
y.AddMeta("unit", "%")
|
y.AddMeta("unit", "%")
|
||||||
output <- y
|
output <- y
|
||||||
@@ -649,7 +717,7 @@ func readEncUtilization(device NvidiaCollectorDevice, output chan lp.CCMessage)
|
|||||||
return nil
|
return nil
|
||||||
}
|
}
|
||||||
|
|
||||||
func readDecUtilization(device NvidiaCollectorDevice, output chan lp.CCMessage) error {
|
func readDecUtilization(device *NvidiaCollectorDevice, output chan lp.CCMessage) error {
|
||||||
isMig, ret := nvml.DeviceIsMigDeviceHandle(device.device)
|
isMig, ret := nvml.DeviceIsMigDeviceHandle(device.device)
|
||||||
if ret != nvml.SUCCESS {
|
if ret != nvml.SUCCESS {
|
||||||
err := errors.New(nvml.ErrorString(ret))
|
err := errors.New(nvml.ErrorString(ret))
|
||||||
@@ -666,7 +734,7 @@ func readDecUtilization(device NvidiaCollectorDevice, output chan lp.CCMessage)
|
|||||||
// Note: On MIG-enabled GPUs, querying decoder utilization is not currently supported.
|
// Note: On MIG-enabled GPUs, querying decoder utilization is not currently supported.
|
||||||
dec_util, _, ret := nvml.DeviceGetDecoderUtilization(device.device)
|
dec_util, _, ret := nvml.DeviceGetDecoderUtilization(device.device)
|
||||||
if ret == nvml.SUCCESS {
|
if ret == nvml.SUCCESS {
|
||||||
y, err := lp.NewMessage("nv_decoder_util", device.tags, device.meta, map[string]interface{}{"value": float64(dec_util)}, time.Now())
|
y, err := lp.NewMessage("nv_decoder_util", device.tags, device.meta, map[string]any{"value": float64(dec_util)}, time.Now())
|
||||||
if err == nil {
|
if err == nil {
|
||||||
y.AddMeta("unit", "%")
|
y.AddMeta("unit", "%")
|
||||||
output <- y
|
output <- y
|
||||||
@@ -676,7 +744,7 @@ func readDecUtilization(device NvidiaCollectorDevice, output chan lp.CCMessage)
|
|||||||
return nil
|
return nil
|
||||||
}
|
}
|
||||||
|
|
||||||
func readRemappedRows(device NvidiaCollectorDevice, output chan lp.CCMessage) error {
|
func readRemappedRows(device *NvidiaCollectorDevice, output chan lp.CCMessage) error {
|
||||||
if !device.excludeMetrics["nv_remapped_rows_corrected"] ||
|
if !device.excludeMetrics["nv_remapped_rows_corrected"] ||
|
||||||
!device.excludeMetrics["nv_remapped_rows_uncorrected"] ||
|
!device.excludeMetrics["nv_remapped_rows_uncorrected"] ||
|
||||||
!device.excludeMetrics["nv_remapped_rows_pending"] ||
|
!device.excludeMetrics["nv_remapped_rows_pending"] ||
|
||||||
@@ -693,33 +761,33 @@ func readRemappedRows(device NvidiaCollectorDevice, output chan lp.CCMessage) er
|
|||||||
corrected, uncorrected, pending, failure, ret := nvml.DeviceGetRemappedRows(device.device)
|
corrected, uncorrected, pending, failure, ret := nvml.DeviceGetRemappedRows(device.device)
|
||||||
if ret == nvml.SUCCESS {
|
if ret == nvml.SUCCESS {
|
||||||
if !device.excludeMetrics["nv_remapped_rows_corrected"] {
|
if !device.excludeMetrics["nv_remapped_rows_corrected"] {
|
||||||
y, err := lp.NewMessage("nv_remapped_rows_corrected", device.tags, device.meta, map[string]interface{}{"value": float64(corrected)}, time.Now())
|
y, err := lp.NewMessage("nv_remapped_rows_corrected", device.tags, device.meta, map[string]any{"value": float64(corrected)}, time.Now())
|
||||||
if err == nil {
|
if err == nil {
|
||||||
output <- y
|
output <- y
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
if !device.excludeMetrics["nv_remapped_rows_uncorrected"] {
|
if !device.excludeMetrics["nv_remapped_rows_uncorrected"] {
|
||||||
y, err := lp.NewMessage("nv_remapped_rows_corrected", device.tags, device.meta, map[string]interface{}{"value": float64(uncorrected)}, time.Now())
|
y, err := lp.NewMessage("nv_remapped_rows_corrected", device.tags, device.meta, map[string]any{"value": float64(uncorrected)}, time.Now())
|
||||||
if err == nil {
|
if err == nil {
|
||||||
output <- y
|
output <- y
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
if !device.excludeMetrics["nv_remapped_rows_pending"] {
|
if !device.excludeMetrics["nv_remapped_rows_pending"] {
|
||||||
var p int = 0
|
p := 0
|
||||||
if pending {
|
if pending {
|
||||||
p = 1
|
p = 1
|
||||||
}
|
}
|
||||||
y, err := lp.NewMessage("nv_remapped_rows_pending", device.tags, device.meta, map[string]interface{}{"value": p}, time.Now())
|
y, err := lp.NewMessage("nv_remapped_rows_pending", device.tags, device.meta, map[string]any{"value": p}, time.Now())
|
||||||
if err == nil {
|
if err == nil {
|
||||||
output <- y
|
output <- y
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
if !device.excludeMetrics["nv_remapped_rows_failure"] {
|
if !device.excludeMetrics["nv_remapped_rows_failure"] {
|
||||||
var f int = 0
|
f := 0
|
||||||
if failure {
|
if failure {
|
||||||
f = 1
|
f = 1
|
||||||
}
|
}
|
||||||
y, err := lp.NewMessage("nv_remapped_rows_failure", device.tags, device.meta, map[string]interface{}{"value": f}, time.Now())
|
y, err := lp.NewMessage("nv_remapped_rows_failure", device.tags, device.meta, map[string]any{"value": f}, time.Now())
|
||||||
if err == nil {
|
if err == nil {
|
||||||
output <- y
|
output <- y
|
||||||
}
|
}
|
||||||
@@ -729,7 +797,7 @@ func readRemappedRows(device NvidiaCollectorDevice, output chan lp.CCMessage) er
|
|||||||
return nil
|
return nil
|
||||||
}
|
}
|
||||||
|
|
||||||
func readProcessCounts(device NvidiaCollectorDevice, output chan lp.CCMessage) error {
|
func readProcessCounts(device *NvidiaCollectorDevice, output chan lp.CCMessage) error {
|
||||||
if !device.excludeMetrics["nv_compute_processes"] {
|
if !device.excludeMetrics["nv_compute_processes"] {
|
||||||
// Get information about processes with a compute context on a device
|
// Get information about processes with a compute context on a device
|
||||||
//
|
//
|
||||||
@@ -753,7 +821,7 @@ func readProcessCounts(device NvidiaCollectorDevice, output chan lp.CCMessage) e
|
|||||||
// Querying per-instance information using MIG device handles is not supported if the device is in vGPU Host virtualization mode.
|
// Querying per-instance information using MIG device handles is not supported if the device is in vGPU Host virtualization mode.
|
||||||
procList, ret := nvml.DeviceGetComputeRunningProcesses(device.device)
|
procList, ret := nvml.DeviceGetComputeRunningProcesses(device.device)
|
||||||
if ret == nvml.SUCCESS {
|
if ret == nvml.SUCCESS {
|
||||||
y, err := lp.NewMessage("nv_compute_processes", device.tags, device.meta, map[string]interface{}{"value": len(procList)}, time.Now())
|
y, err := lp.NewMessage("nv_compute_processes", device.tags, device.meta, map[string]any{"value": len(procList)}, time.Now())
|
||||||
if err == nil {
|
if err == nil {
|
||||||
output <- y
|
output <- y
|
||||||
}
|
}
|
||||||
@@ -782,7 +850,7 @@ func readProcessCounts(device NvidiaCollectorDevice, output chan lp.CCMessage) e
|
|||||||
// Querying per-instance information using MIG device handles is not supported if the device is in vGPU Host virtualization mode.
|
// Querying per-instance information using MIG device handles is not supported if the device is in vGPU Host virtualization mode.
|
||||||
procList, ret := nvml.DeviceGetGraphicsRunningProcesses(device.device)
|
procList, ret := nvml.DeviceGetGraphicsRunningProcesses(device.device)
|
||||||
if ret == nvml.SUCCESS {
|
if ret == nvml.SUCCESS {
|
||||||
y, err := lp.NewMessage("nv_graphics_processes", device.tags, device.meta, map[string]interface{}{"value": len(procList)}, time.Now())
|
y, err := lp.NewMessage("nv_graphics_processes", device.tags, device.meta, map[string]any{"value": len(procList)}, time.Now())
|
||||||
if err == nil {
|
if err == nil {
|
||||||
output <- y
|
output <- y
|
||||||
}
|
}
|
||||||
@@ -821,7 +889,7 @@ func readProcessCounts(device NvidiaCollectorDevice, output chan lp.CCMessage) e
|
|||||||
return nil
|
return nil
|
||||||
}
|
}
|
||||||
|
|
||||||
func readViolationStats(device NvidiaCollectorDevice, output chan lp.CCMessage) error {
|
func readViolationStats(device *NvidiaCollectorDevice, output chan lp.CCMessage) error {
|
||||||
var violTime nvml.ViolationTime
|
var violTime nvml.ViolationTime
|
||||||
var ret nvml.Return
|
var ret nvml.Return
|
||||||
|
|
||||||
@@ -840,7 +908,7 @@ func readViolationStats(device NvidiaCollectorDevice, output chan lp.CCMessage)
|
|||||||
violTime, ret = nvml.DeviceGetViolationStatus(device.device, nvml.PERF_POLICY_POWER)
|
violTime, ret = nvml.DeviceGetViolationStatus(device.device, nvml.PERF_POLICY_POWER)
|
||||||
if ret == nvml.SUCCESS {
|
if ret == nvml.SUCCESS {
|
||||||
t := float64(violTime.ViolationTime) * 1e-9
|
t := float64(violTime.ViolationTime) * 1e-9
|
||||||
y, err := lp.NewMessage("nv_violation_power", device.tags, device.meta, map[string]interface{}{"value": t}, time.Now())
|
y, err := lp.NewMessage("nv_violation_power", device.tags, device.meta, map[string]any{"value": t}, time.Now())
|
||||||
if err == nil {
|
if err == nil {
|
||||||
y.AddMeta("unit", "sec")
|
y.AddMeta("unit", "sec")
|
||||||
output <- y
|
output <- y
|
||||||
@@ -852,7 +920,7 @@ func readViolationStats(device NvidiaCollectorDevice, output chan lp.CCMessage)
|
|||||||
violTime, ret = nvml.DeviceGetViolationStatus(device.device, nvml.PERF_POLICY_THERMAL)
|
violTime, ret = nvml.DeviceGetViolationStatus(device.device, nvml.PERF_POLICY_THERMAL)
|
||||||
if ret == nvml.SUCCESS {
|
if ret == nvml.SUCCESS {
|
||||||
t := float64(violTime.ViolationTime) * 1e-9
|
t := float64(violTime.ViolationTime) * 1e-9
|
||||||
y, err := lp.NewMessage("nv_violation_thermal", device.tags, device.meta, map[string]interface{}{"value": t}, time.Now())
|
y, err := lp.NewMessage("nv_violation_thermal", device.tags, device.meta, map[string]any{"value": t}, time.Now())
|
||||||
if err == nil {
|
if err == nil {
|
||||||
y.AddMeta("unit", "sec")
|
y.AddMeta("unit", "sec")
|
||||||
output <- y
|
output <- y
|
||||||
@@ -864,7 +932,7 @@ func readViolationStats(device NvidiaCollectorDevice, output chan lp.CCMessage)
|
|||||||
violTime, ret = nvml.DeviceGetViolationStatus(device.device, nvml.PERF_POLICY_SYNC_BOOST)
|
violTime, ret = nvml.DeviceGetViolationStatus(device.device, nvml.PERF_POLICY_SYNC_BOOST)
|
||||||
if ret == nvml.SUCCESS {
|
if ret == nvml.SUCCESS {
|
||||||
t := float64(violTime.ViolationTime) * 1e-9
|
t := float64(violTime.ViolationTime) * 1e-9
|
||||||
y, err := lp.NewMessage("nv_violation_sync_boost", device.tags, device.meta, map[string]interface{}{"value": t}, time.Now())
|
y, err := lp.NewMessage("nv_violation_sync_boost", device.tags, device.meta, map[string]any{"value": t}, time.Now())
|
||||||
if err == nil {
|
if err == nil {
|
||||||
y.AddMeta("unit", "sec")
|
y.AddMeta("unit", "sec")
|
||||||
output <- y
|
output <- y
|
||||||
@@ -876,7 +944,7 @@ func readViolationStats(device NvidiaCollectorDevice, output chan lp.CCMessage)
|
|||||||
violTime, ret = nvml.DeviceGetViolationStatus(device.device, nvml.PERF_POLICY_BOARD_LIMIT)
|
violTime, ret = nvml.DeviceGetViolationStatus(device.device, nvml.PERF_POLICY_BOARD_LIMIT)
|
||||||
if ret == nvml.SUCCESS {
|
if ret == nvml.SUCCESS {
|
||||||
t := float64(violTime.ViolationTime) * 1e-9
|
t := float64(violTime.ViolationTime) * 1e-9
|
||||||
y, err := lp.NewMessage("nv_violation_board_limit", device.tags, device.meta, map[string]interface{}{"value": t}, time.Now())
|
y, err := lp.NewMessage("nv_violation_board_limit", device.tags, device.meta, map[string]any{"value": t}, time.Now())
|
||||||
if err == nil {
|
if err == nil {
|
||||||
y.AddMeta("unit", "sec")
|
y.AddMeta("unit", "sec")
|
||||||
output <- y
|
output <- y
|
||||||
@@ -888,7 +956,7 @@ func readViolationStats(device NvidiaCollectorDevice, output chan lp.CCMessage)
|
|||||||
violTime, ret = nvml.DeviceGetViolationStatus(device.device, nvml.PERF_POLICY_LOW_UTILIZATION)
|
violTime, ret = nvml.DeviceGetViolationStatus(device.device, nvml.PERF_POLICY_LOW_UTILIZATION)
|
||||||
if ret == nvml.SUCCESS {
|
if ret == nvml.SUCCESS {
|
||||||
t := float64(violTime.ViolationTime) * 1e-9
|
t := float64(violTime.ViolationTime) * 1e-9
|
||||||
y, err := lp.NewMessage("nv_violation_low_util", device.tags, device.meta, map[string]interface{}{"value": t}, time.Now())
|
y, err := lp.NewMessage("nv_violation_low_util", device.tags, device.meta, map[string]any{"value": t}, time.Now())
|
||||||
if err == nil {
|
if err == nil {
|
||||||
y.AddMeta("unit", "sec")
|
y.AddMeta("unit", "sec")
|
||||||
output <- y
|
output <- y
|
||||||
@@ -900,7 +968,7 @@ func readViolationStats(device NvidiaCollectorDevice, output chan lp.CCMessage)
|
|||||||
violTime, ret = nvml.DeviceGetViolationStatus(device.device, nvml.PERF_POLICY_RELIABILITY)
|
violTime, ret = nvml.DeviceGetViolationStatus(device.device, nvml.PERF_POLICY_RELIABILITY)
|
||||||
if ret == nvml.SUCCESS {
|
if ret == nvml.SUCCESS {
|
||||||
t := float64(violTime.ViolationTime) * 1e-9
|
t := float64(violTime.ViolationTime) * 1e-9
|
||||||
y, err := lp.NewMessage("nv_violation_reliability", device.tags, device.meta, map[string]interface{}{"value": t}, time.Now())
|
y, err := lp.NewMessage("nv_violation_reliability", device.tags, device.meta, map[string]any{"value": t}, time.Now())
|
||||||
if err == nil {
|
if err == nil {
|
||||||
y.AddMeta("unit", "sec")
|
y.AddMeta("unit", "sec")
|
||||||
output <- y
|
output <- y
|
||||||
@@ -912,7 +980,7 @@ func readViolationStats(device NvidiaCollectorDevice, output chan lp.CCMessage)
|
|||||||
violTime, ret = nvml.DeviceGetViolationStatus(device.device, nvml.PERF_POLICY_TOTAL_APP_CLOCKS)
|
violTime, ret = nvml.DeviceGetViolationStatus(device.device, nvml.PERF_POLICY_TOTAL_APP_CLOCKS)
|
||||||
if ret == nvml.SUCCESS {
|
if ret == nvml.SUCCESS {
|
||||||
t := float64(violTime.ViolationTime) * 1e-9
|
t := float64(violTime.ViolationTime) * 1e-9
|
||||||
y, err := lp.NewMessage("nv_violation_below_app_clock", device.tags, device.meta, map[string]interface{}{"value": t}, time.Now())
|
y, err := lp.NewMessage("nv_violation_below_app_clock", device.tags, device.meta, map[string]any{"value": t}, time.Now())
|
||||||
if err == nil {
|
if err == nil {
|
||||||
y.AddMeta("unit", "sec")
|
y.AddMeta("unit", "sec")
|
||||||
output <- y
|
output <- y
|
||||||
@@ -924,7 +992,7 @@ func readViolationStats(device NvidiaCollectorDevice, output chan lp.CCMessage)
|
|||||||
violTime, ret = nvml.DeviceGetViolationStatus(device.device, nvml.PERF_POLICY_TOTAL_BASE_CLOCKS)
|
violTime, ret = nvml.DeviceGetViolationStatus(device.device, nvml.PERF_POLICY_TOTAL_BASE_CLOCKS)
|
||||||
if ret == nvml.SUCCESS {
|
if ret == nvml.SUCCESS {
|
||||||
t := float64(violTime.ViolationTime) * 1e-9
|
t := float64(violTime.ViolationTime) * 1e-9
|
||||||
y, err := lp.NewMessage("nv_violation_below_base_clock", device.tags, device.meta, map[string]interface{}{"value": t}, time.Now())
|
y, err := lp.NewMessage("nv_violation_below_base_clock", device.tags, device.meta, map[string]any{"value": t}, time.Now())
|
||||||
if err == nil {
|
if err == nil {
|
||||||
y.AddMeta("unit", "sec")
|
y.AddMeta("unit", "sec")
|
||||||
output <- y
|
output <- y
|
||||||
@@ -935,7 +1003,7 @@ func readViolationStats(device NvidiaCollectorDevice, output chan lp.CCMessage)
|
|||||||
return nil
|
return nil
|
||||||
}
|
}
|
||||||
|
|
||||||
func readNVLinkStats(device NvidiaCollectorDevice, output chan lp.CCMessage) error {
|
func readNVLinkStats(device *NvidiaCollectorDevice, output chan lp.CCMessage) error {
|
||||||
// Retrieves the specified error counter value
|
// Retrieves the specified error counter value
|
||||||
// Please refer to \a nvmlNvLinkErrorCounter_t for error counters that are available
|
// Please refer to \a nvmlNvLinkErrorCounter_t for error counters that are available
|
||||||
//
|
//
|
||||||
@@ -947,7 +1015,7 @@ func readNVLinkStats(device NvidiaCollectorDevice, output chan lp.CCMessage) err
|
|||||||
var aggregate_recovery_errors uint64 = 0
|
var aggregate_recovery_errors uint64 = 0
|
||||||
var aggregate_crc_flit_errors uint64 = 0
|
var aggregate_crc_flit_errors uint64 = 0
|
||||||
|
|
||||||
for i := 0; i < nvml.NVLINK_MAX_LINKS; i++ {
|
for i := range nvml.NVLINK_MAX_LINKS {
|
||||||
state, ret := nvml.DeviceGetNvLinkState(device.device, i)
|
state, ret := nvml.DeviceGetNvLinkState(device.device, i)
|
||||||
if ret == nvml.SUCCESS {
|
if ret == nvml.SUCCESS {
|
||||||
if state == nvml.FEATURE_ENABLED {
|
if state == nvml.FEATURE_ENABLED {
|
||||||
@@ -956,7 +1024,7 @@ func readNVLinkStats(device NvidiaCollectorDevice, output chan lp.CCMessage) err
|
|||||||
count, ret := nvml.DeviceGetNvLinkErrorCounter(device.device, i, nvml.NVLINK_ERROR_DL_CRC_DATA)
|
count, ret := nvml.DeviceGetNvLinkErrorCounter(device.device, i, nvml.NVLINK_ERROR_DL_CRC_DATA)
|
||||||
aggregate_crc_errors = aggregate_crc_errors + count
|
aggregate_crc_errors = aggregate_crc_errors + count
|
||||||
if ret == nvml.SUCCESS {
|
if ret == nvml.SUCCESS {
|
||||||
y, err := lp.NewMessage("nv_nvlink_crc_errors", device.tags, device.meta, map[string]interface{}{"value": count}, time.Now())
|
y, err := lp.NewMessage("nv_nvlink_crc_errors", device.tags, device.meta, map[string]any{"value": count}, time.Now())
|
||||||
if err == nil {
|
if err == nil {
|
||||||
y.AddTag("stype", "nvlink")
|
y.AddTag("stype", "nvlink")
|
||||||
y.AddTag("stype-id", fmt.Sprintf("%d", i))
|
y.AddTag("stype-id", fmt.Sprintf("%d", i))
|
||||||
@@ -969,7 +1037,7 @@ func readNVLinkStats(device NvidiaCollectorDevice, output chan lp.CCMessage) err
|
|||||||
count, ret := nvml.DeviceGetNvLinkErrorCounter(device.device, i, nvml.NVLINK_ERROR_DL_ECC_DATA)
|
count, ret := nvml.DeviceGetNvLinkErrorCounter(device.device, i, nvml.NVLINK_ERROR_DL_ECC_DATA)
|
||||||
aggregate_ecc_errors = aggregate_ecc_errors + count
|
aggregate_ecc_errors = aggregate_ecc_errors + count
|
||||||
if ret == nvml.SUCCESS {
|
if ret == nvml.SUCCESS {
|
||||||
y, err := lp.NewMessage("nv_nvlink_ecc_errors", device.tags, device.meta, map[string]interface{}{"value": count}, time.Now())
|
y, err := lp.NewMessage("nv_nvlink_ecc_errors", device.tags, device.meta, map[string]any{"value": count}, time.Now())
|
||||||
if err == nil {
|
if err == nil {
|
||||||
y.AddTag("stype", "nvlink")
|
y.AddTag("stype", "nvlink")
|
||||||
y.AddTag("stype-id", fmt.Sprintf("%d", i))
|
y.AddTag("stype-id", fmt.Sprintf("%d", i))
|
||||||
@@ -982,7 +1050,7 @@ func readNVLinkStats(device NvidiaCollectorDevice, output chan lp.CCMessage) err
|
|||||||
count, ret := nvml.DeviceGetNvLinkErrorCounter(device.device, i, nvml.NVLINK_ERROR_DL_REPLAY)
|
count, ret := nvml.DeviceGetNvLinkErrorCounter(device.device, i, nvml.NVLINK_ERROR_DL_REPLAY)
|
||||||
aggregate_replay_errors = aggregate_replay_errors + count
|
aggregate_replay_errors = aggregate_replay_errors + count
|
||||||
if ret == nvml.SUCCESS {
|
if ret == nvml.SUCCESS {
|
||||||
y, err := lp.NewMessage("nv_nvlink_replay_errors", device.tags, device.meta, map[string]interface{}{"value": count}, time.Now())
|
y, err := lp.NewMessage("nv_nvlink_replay_errors", device.tags, device.meta, map[string]any{"value": count}, time.Now())
|
||||||
if err == nil {
|
if err == nil {
|
||||||
y.AddTag("stype", "nvlink")
|
y.AddTag("stype", "nvlink")
|
||||||
y.AddTag("stype-id", fmt.Sprintf("%d", i))
|
y.AddTag("stype-id", fmt.Sprintf("%d", i))
|
||||||
@@ -995,7 +1063,7 @@ func readNVLinkStats(device NvidiaCollectorDevice, output chan lp.CCMessage) err
|
|||||||
count, ret := nvml.DeviceGetNvLinkErrorCounter(device.device, i, nvml.NVLINK_ERROR_DL_RECOVERY)
|
count, ret := nvml.DeviceGetNvLinkErrorCounter(device.device, i, nvml.NVLINK_ERROR_DL_RECOVERY)
|
||||||
aggregate_recovery_errors = aggregate_recovery_errors + count
|
aggregate_recovery_errors = aggregate_recovery_errors + count
|
||||||
if ret == nvml.SUCCESS {
|
if ret == nvml.SUCCESS {
|
||||||
y, err := lp.NewMessage("nv_nvlink_recovery_errors", device.tags, device.meta, map[string]interface{}{"value": count}, time.Now())
|
y, err := lp.NewMessage("nv_nvlink_recovery_errors", device.tags, device.meta, map[string]any{"value": count}, time.Now())
|
||||||
if err == nil {
|
if err == nil {
|
||||||
y.AddTag("stype", "nvlink")
|
y.AddTag("stype", "nvlink")
|
||||||
y.AddTag("stype-id", fmt.Sprintf("%d", i))
|
y.AddTag("stype-id", fmt.Sprintf("%d", i))
|
||||||
@@ -1008,7 +1076,7 @@ func readNVLinkStats(device NvidiaCollectorDevice, output chan lp.CCMessage) err
|
|||||||
count, ret := nvml.DeviceGetNvLinkErrorCounter(device.device, i, nvml.NVLINK_ERROR_DL_CRC_FLIT)
|
count, ret := nvml.DeviceGetNvLinkErrorCounter(device.device, i, nvml.NVLINK_ERROR_DL_CRC_FLIT)
|
||||||
aggregate_crc_flit_errors = aggregate_crc_flit_errors + count
|
aggregate_crc_flit_errors = aggregate_crc_flit_errors + count
|
||||||
if ret == nvml.SUCCESS {
|
if ret == nvml.SUCCESS {
|
||||||
y, err := lp.NewMessage("nv_nvlink_crc_flit_errors", device.tags, device.meta, map[string]interface{}{"value": count}, time.Now())
|
y, err := lp.NewMessage("nv_nvlink_crc_flit_errors", device.tags, device.meta, map[string]any{"value": count}, time.Now())
|
||||||
if err == nil {
|
if err == nil {
|
||||||
y.AddTag("stype", "nvlink")
|
y.AddTag("stype", "nvlink")
|
||||||
y.AddTag("stype-id", fmt.Sprintf("%d", i))
|
y.AddTag("stype-id", fmt.Sprintf("%d", i))
|
||||||
@@ -1023,7 +1091,7 @@ func readNVLinkStats(device NvidiaCollectorDevice, output chan lp.CCMessage) err
|
|||||||
// Export aggregated values
|
// Export aggregated values
|
||||||
if !device.excludeMetrics["nv_nvlink_crc_errors"] {
|
if !device.excludeMetrics["nv_nvlink_crc_errors"] {
|
||||||
// Data link receive data CRC error counter
|
// Data link receive data CRC error counter
|
||||||
y, err := lp.NewMessage("nv_nvlink_crc_errors_sum", device.tags, device.meta, map[string]interface{}{"value": aggregate_crc_errors}, time.Now())
|
y, err := lp.NewMessage("nv_nvlink_crc_errors_sum", device.tags, device.meta, map[string]any{"value": aggregate_crc_errors}, time.Now())
|
||||||
if err == nil {
|
if err == nil {
|
||||||
y.AddTag("stype", "nvlink")
|
y.AddTag("stype", "nvlink")
|
||||||
output <- y
|
output <- y
|
||||||
@@ -1031,7 +1099,7 @@ func readNVLinkStats(device NvidiaCollectorDevice, output chan lp.CCMessage) err
|
|||||||
}
|
}
|
||||||
if !device.excludeMetrics["nv_nvlink_ecc_errors"] {
|
if !device.excludeMetrics["nv_nvlink_ecc_errors"] {
|
||||||
// Data link receive data ECC error counter
|
// Data link receive data ECC error counter
|
||||||
y, err := lp.NewMessage("nv_nvlink_ecc_errors_sum", device.tags, device.meta, map[string]interface{}{"value": aggregate_ecc_errors}, time.Now())
|
y, err := lp.NewMessage("nv_nvlink_ecc_errors_sum", device.tags, device.meta, map[string]any{"value": aggregate_ecc_errors}, time.Now())
|
||||||
if err == nil {
|
if err == nil {
|
||||||
y.AddTag("stype", "nvlink")
|
y.AddTag("stype", "nvlink")
|
||||||
output <- y
|
output <- y
|
||||||
@@ -1039,7 +1107,7 @@ func readNVLinkStats(device NvidiaCollectorDevice, output chan lp.CCMessage) err
|
|||||||
}
|
}
|
||||||
if !device.excludeMetrics["nv_nvlink_replay_errors"] {
|
if !device.excludeMetrics["nv_nvlink_replay_errors"] {
|
||||||
// Data link transmit replay error counter
|
// Data link transmit replay error counter
|
||||||
y, err := lp.NewMessage("nv_nvlink_replay_errors_sum", device.tags, device.meta, map[string]interface{}{"value": aggregate_replay_errors}, time.Now())
|
y, err := lp.NewMessage("nv_nvlink_replay_errors_sum", device.tags, device.meta, map[string]any{"value": aggregate_replay_errors}, time.Now())
|
||||||
if err == nil {
|
if err == nil {
|
||||||
y.AddTag("stype", "nvlink")
|
y.AddTag("stype", "nvlink")
|
||||||
output <- y
|
output <- y
|
||||||
@@ -1047,7 +1115,7 @@ func readNVLinkStats(device NvidiaCollectorDevice, output chan lp.CCMessage) err
|
|||||||
}
|
}
|
||||||
if !device.excludeMetrics["nv_nvlink_recovery_errors"] {
|
if !device.excludeMetrics["nv_nvlink_recovery_errors"] {
|
||||||
// Data link transmit recovery error counter
|
// Data link transmit recovery error counter
|
||||||
y, err := lp.NewMessage("nv_nvlink_recovery_errors_sum", device.tags, device.meta, map[string]interface{}{"value": aggregate_recovery_errors}, time.Now())
|
y, err := lp.NewMessage("nv_nvlink_recovery_errors_sum", device.tags, device.meta, map[string]any{"value": aggregate_recovery_errors}, time.Now())
|
||||||
if err == nil {
|
if err == nil {
|
||||||
y.AddTag("stype", "nvlink")
|
y.AddTag("stype", "nvlink")
|
||||||
output <- y
|
output <- y
|
||||||
@@ -1055,7 +1123,7 @@ func readNVLinkStats(device NvidiaCollectorDevice, output chan lp.CCMessage) err
|
|||||||
}
|
}
|
||||||
if !device.excludeMetrics["nv_nvlink_crc_flit_errors"] {
|
if !device.excludeMetrics["nv_nvlink_crc_flit_errors"] {
|
||||||
// Data link receive flow control digit CRC error counter
|
// Data link receive flow control digit CRC error counter
|
||||||
y, err := lp.NewMessage("nv_nvlink_crc_flit_errors_sum", device.tags, device.meta, map[string]interface{}{"value": aggregate_crc_flit_errors}, time.Now())
|
y, err := lp.NewMessage("nv_nvlink_crc_flit_errors_sum", device.tags, device.meta, map[string]any{"value": aggregate_crc_flit_errors}, time.Now())
|
||||||
if err == nil {
|
if err == nil {
|
||||||
y.AddTag("stype", "nvlink")
|
y.AddTag("stype", "nvlink")
|
||||||
output <- y
|
output <- y
|
||||||
@@ -1070,7 +1138,7 @@ func (m *NvidiaCollector) Read(interval time.Duration, output chan lp.CCMessage)
|
|||||||
return
|
return
|
||||||
}
|
}
|
||||||
|
|
||||||
readAll := func(device NvidiaCollectorDevice, output chan lp.CCMessage) {
|
readAll := func(device *NvidiaCollectorDevice, output chan lp.CCMessage) {
|
||||||
name, ret := nvml.DeviceGetName(device.device)
|
name, ret := nvml.DeviceGetName(device.device)
|
||||||
if ret != nvml.SUCCESS {
|
if ret != nvml.SUCCESS {
|
||||||
name = "NoName"
|
name = "NoName"
|
||||||
@@ -1110,6 +1178,11 @@ func (m *NvidiaCollector) Read(interval time.Duration, output chan lp.CCMessage)
|
|||||||
cclog.ComponentDebug(m.name, "readPowerUsage for device", name, "failed")
|
cclog.ComponentDebug(m.name, "readPowerUsage for device", name, "failed")
|
||||||
}
|
}
|
||||||
|
|
||||||
|
err = readEnergyConsumption(device, output)
|
||||||
|
if err != nil {
|
||||||
|
cclog.ComponentDebug(m.name, "readEnergyConsumption for device", name, "failed")
|
||||||
|
}
|
||||||
|
|
||||||
err = readClocks(device, output)
|
err = readClocks(device, output)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
cclog.ComponentDebug(m.name, "readClocks for device", name, "failed")
|
cclog.ComponentDebug(m.name, "readClocks for device", name, "failed")
|
||||||
@@ -1169,7 +1242,7 @@ func (m *NvidiaCollector) Read(interval time.Duration, output chan lp.CCMessage)
|
|||||||
// Actual read loop over all attached Nvidia GPUs
|
// Actual read loop over all attached Nvidia GPUs
|
||||||
for i := 0; i < m.num_gpus; i++ {
|
for i := 0; i < m.num_gpus; i++ {
|
||||||
|
|
||||||
readAll(m.gpus[i], output)
|
readAll(&m.gpus[i], output)
|
||||||
|
|
||||||
// Iterate over all MIG devices if any
|
// Iterate over all MIG devices if any
|
||||||
if m.config.ProcessMigDevices {
|
if m.config.ProcessMigDevices {
|
||||||
@@ -1190,7 +1263,7 @@ func (m *NvidiaCollector) Read(interval time.Duration, output chan lp.CCMessage)
|
|||||||
}
|
}
|
||||||
cclog.ComponentDebug(m.name, "Reading MIG devices for GPU", i)
|
cclog.ComponentDebug(m.name, "Reading MIG devices for GPU", i)
|
||||||
|
|
||||||
for j := 0; j < maxMig; j++ {
|
for j := range maxMig {
|
||||||
mdev, ret := nvml.DeviceGetMigDeviceHandleByIndex(m.gpus[i].device, j)
|
mdev, ret := nvml.DeviceGetMigDeviceHandleByIndex(m.gpus[i].device, j)
|
||||||
if ret != nvml.SUCCESS {
|
if ret != nvml.SUCCESS {
|
||||||
continue
|
continue
|
||||||
@@ -1207,9 +1280,7 @@ func (m *NvidiaCollector) Read(interval time.Duration, output chan lp.CCMessage)
|
|||||||
meta: map[string]string{},
|
meta: map[string]string{},
|
||||||
excludeMetrics: excludeMetrics,
|
excludeMetrics: excludeMetrics,
|
||||||
}
|
}
|
||||||
for k, v := range m.gpus[i].tags {
|
maps.Copy(migDevice.tags, m.gpus[i].tags)
|
||||||
migDevice.tags[k] = v
|
|
||||||
}
|
|
||||||
migDevice.tags["stype"] = "mig"
|
migDevice.tags["stype"] = "mig"
|
||||||
if m.config.UseUuidForMigDevices {
|
if m.config.UseUuidForMigDevices {
|
||||||
uuid, ret := nvml.DeviceGetUUID(mdev)
|
uuid, ret := nvml.DeviceGetUUID(mdev)
|
||||||
@@ -1223,8 +1294,8 @@ func (m *NvidiaCollector) Read(interval time.Duration, output chan lp.CCMessage)
|
|||||||
if ret == nvml.SUCCESS {
|
if ret == nvml.SUCCESS {
|
||||||
mname, ret := nvml.DeviceGetName(mdev)
|
mname, ret := nvml.DeviceGetName(mdev)
|
||||||
if ret == nvml.SUCCESS {
|
if ret == nvml.SUCCESS {
|
||||||
x := strings.Replace(mname, name, "", -1)
|
x := strings.ReplaceAll(mname, name, "")
|
||||||
x = strings.Replace(x, "MIG", "", -1)
|
x = strings.ReplaceAll(x, "MIG", "")
|
||||||
x = strings.TrimSpace(x)
|
x = strings.TrimSpace(x)
|
||||||
migDevice.tags["stype-id"] = x
|
migDevice.tags["stype-id"] = x
|
||||||
}
|
}
|
||||||
@@ -1233,9 +1304,7 @@ func (m *NvidiaCollector) Read(interval time.Duration, output chan lp.CCMessage)
|
|||||||
if _, ok := migDevice.tags["stype-id"]; !ok {
|
if _, ok := migDevice.tags["stype-id"]; !ok {
|
||||||
migDevice.tags["stype-id"] = fmt.Sprintf("%d", j)
|
migDevice.tags["stype-id"] = fmt.Sprintf("%d", j)
|
||||||
}
|
}
|
||||||
for k, v := range m.gpus[i].meta {
|
maps.Copy(migDevice.meta, m.gpus[i].meta)
|
||||||
migDevice.meta[k] = v
|
|
||||||
}
|
|
||||||
if _, ok := migDevice.meta["uuid"]; ok && !m.config.UseUuidForMigDevices {
|
if _, ok := migDevice.meta["uuid"]; ok && !m.config.UseUuidForMigDevices {
|
||||||
uuid, ret := nvml.DeviceGetUUID(mdev)
|
uuid, ret := nvml.DeviceGetUUID(mdev)
|
||||||
if ret == nvml.SUCCESS {
|
if ret == nvml.SUCCESS {
|
||||||
@@ -1243,7 +1312,7 @@ func (m *NvidiaCollector) Read(interval time.Duration, output chan lp.CCMessage)
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
readAll(migDevice, output)
|
readAll(&migDevice, output)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -1251,7 +1320,9 @@ func (m *NvidiaCollector) Read(interval time.Duration, output chan lp.CCMessage)
|
|||||||
|
|
||||||
func (m *NvidiaCollector) Close() {
|
func (m *NvidiaCollector) Close() {
|
||||||
if m.init {
|
if m.init {
|
||||||
nvml.Shutdown()
|
if ret := nvml.Shutdown(); ret != nvml.SUCCESS {
|
||||||
|
cclog.ComponentError(m.name, "nvml.Shutdown() not successful")
|
||||||
|
}
|
||||||
m.init = false
|
m.init = false
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -82,5 +82,8 @@ Metrics:
|
|||||||
* `nv_nvlink_ecc_errors`
|
* `nv_nvlink_ecc_errors`
|
||||||
* `nv_nvlink_replay_errors`
|
* `nv_nvlink_replay_errors`
|
||||||
* `nv_nvlink_recovery_errors`
|
* `nv_nvlink_recovery_errors`
|
||||||
|
* `nv_energy`
|
||||||
|
* `nv_energy_abs`
|
||||||
|
* `nv_average_power`
|
||||||
|
|
||||||
Some metrics add an additional sub-type tag (`stype`); for example, the `nv_nvlink_*` metrics set `stype=nvlink,stype-id=<link_number>`.
|
Some metrics add an additional sub-type tag (`stype`); for example, the `nv_nvlink_*` metrics set `stype=nvlink,stype-id=<link_number>`.
|
||||||
|
|||||||
@@ -1,3 +1,10 @@
|
|||||||
|
// Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
|
||||||
|
// All rights reserved. This file is part of cc-lib.
|
||||||
|
// Use of this source code is governed by a MIT-style
|
||||||
|
// license that can be found in the LICENSE file.
|
||||||
|
// additional authors:
|
||||||
|
// Holger Obermaier (NHR@KIT)
|
||||||
|
|
||||||
package collectors
|
package collectors
|
||||||
|
|
||||||
import (
|
import (
|
||||||
@@ -9,8 +16,8 @@ import (
|
|||||||
"strings"
|
"strings"
|
||||||
"time"
|
"time"
|
||||||
|
|
||||||
cclog "github.com/ClusterCockpit/cc-lib/ccLogger"
|
cclog "github.com/ClusterCockpit/cc-lib/v2/ccLogger"
|
||||||
lp "github.com/ClusterCockpit/cc-lib/ccMessage"
|
lp "github.com/ClusterCockpit/cc-lib/v2/ccMessage"
|
||||||
)
|
)
|
||||||
|
|
||||||
// running average power limit (RAPL) monitoring attributes for a zone
|
// running average power limit (RAPL) monitoring attributes for a zone
|
||||||
@@ -47,9 +54,10 @@ func (m *RAPLCollector) Init(config json.RawMessage) error {
|
|||||||
return nil
|
return nil
|
||||||
}
|
}
|
||||||
|
|
||||||
var err error = nil
|
|
||||||
m.name = "RAPLCollector"
|
m.name = "RAPLCollector"
|
||||||
m.setup()
|
if err := m.setup(); err != nil {
|
||||||
|
return fmt.Errorf("%s Init(): setup() call failed: %w", m.name, err)
|
||||||
|
}
|
||||||
m.parallel = true
|
m.parallel = true
|
||||||
m.meta = map[string]string{
|
m.meta = map[string]string{
|
||||||
"source": m.name,
|
"source": m.name,
|
||||||
@@ -59,7 +67,7 @@ func (m *RAPLCollector) Init(config json.RawMessage) error {
|
|||||||
|
|
||||||
// Read in the JSON configuration
|
// Read in the JSON configuration
|
||||||
if len(config) > 0 {
|
if len(config) > 0 {
|
||||||
err = json.Unmarshal(config, &m.config)
|
err := json.Unmarshal(config, &m.config)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
cclog.ComponentError(m.name, "Error reading config:", err.Error())
|
cclog.ComponentError(m.name, "Error reading config:", err.Error())
|
||||||
return err
|
return err
|
||||||
@@ -241,7 +249,7 @@ func (m *RAPLCollector) Read(interval time.Duration, output chan lp.CCMessage) {
|
|||||||
"rapl_average_power",
|
"rapl_average_power",
|
||||||
p.tags,
|
p.tags,
|
||||||
m.meta,
|
m.meta,
|
||||||
map[string]interface{}{"value": averagePower},
|
map[string]any{"value": averagePower},
|
||||||
energyTimestamp)
|
energyTimestamp)
|
||||||
if err == nil {
|
if err == nil {
|
||||||
output <- y
|
output <- y
|
||||||
|
|||||||
@@ -1,13 +1,21 @@
|
|||||||
|
// Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
|
||||||
|
// All rights reserved. This file is part of cc-lib.
|
||||||
|
// Use of this source code is governed by a MIT-style
|
||||||
|
// license that can be found in the LICENSE file.
|
||||||
|
// additional authors:
|
||||||
|
// Holger Obermaier (NHR@KIT)
|
||||||
|
|
||||||
package collectors
|
package collectors
|
||||||
|
|
||||||
import (
|
import (
|
||||||
"encoding/json"
|
"encoding/json"
|
||||||
"errors"
|
"errors"
|
||||||
"fmt"
|
"fmt"
|
||||||
|
"slices"
|
||||||
"time"
|
"time"
|
||||||
|
|
||||||
cclog "github.com/ClusterCockpit/cc-lib/ccLogger"
|
cclog "github.com/ClusterCockpit/cc-lib/v2/ccLogger"
|
||||||
lp "github.com/ClusterCockpit/cc-lib/ccMessage"
|
lp "github.com/ClusterCockpit/cc-lib/v2/ccMessage"
|
||||||
"github.com/ClusterCockpit/go-rocm-smi/pkg/rocm_smi"
|
"github.com/ClusterCockpit/go-rocm-smi/pkg/rocm_smi"
|
||||||
)
|
)
|
||||||
|
|
||||||
@@ -45,7 +53,9 @@ func (m *RocmSmiCollector) Init(config json.RawMessage) error {
|
|||||||
// Always set the name early in Init() to use it in cclog.Component* functions
|
// Always set the name early in Init() to use it in cclog.Component* functions
|
||||||
m.name = "RocmSmiCollector"
|
m.name = "RocmSmiCollector"
|
||||||
// This is for later use, also call it early
|
// This is for later use, also call it early
|
||||||
m.setup()
|
if err := m.setup(); err != nil {
|
||||||
|
return fmt.Errorf("%s Init(): setup() call failed: %w", m.name, err)
|
||||||
|
}
|
||||||
// Define meta information sent with each metric
|
// Define meta information sent with each metric
|
||||||
// (Can also be dynamic or this is the basic set with extension through AddMeta())
|
// (Can also be dynamic or this is the basic set with extension through AddMeta())
|
||||||
//m.meta = map[string]string{"source": m.name, "group": "AMD"}
|
//m.meta = map[string]string{"source": m.name, "group": "AMD"}
|
||||||
@@ -78,22 +88,11 @@ func (m *RocmSmiCollector) Init(config json.RawMessage) error {
|
|||||||
return err
|
return err
|
||||||
}
|
}
|
||||||
|
|
||||||
exclDev := func(s string) bool {
|
|
||||||
skip_device := false
|
|
||||||
for _, excl := range m.config.ExcludeDevices {
|
|
||||||
if excl == s {
|
|
||||||
skip_device = true
|
|
||||||
break
|
|
||||||
}
|
|
||||||
}
|
|
||||||
return skip_device
|
|
||||||
}
|
|
||||||
|
|
||||||
m.devices = make([]RocmSmiCollectorDevice, 0)
|
m.devices = make([]RocmSmiCollectorDevice, 0)
|
||||||
|
|
||||||
for i := 0; i < numDevs; i++ {
|
for i := range numDevs {
|
||||||
str_i := fmt.Sprintf("%d", i)
|
str_i := fmt.Sprintf("%d", i)
|
||||||
if exclDev(str_i) {
|
if slices.Contains(m.config.ExcludeDevices, str_i) {
|
||||||
continue
|
continue
|
||||||
}
|
}
|
||||||
device, ret := rocm_smi.DeviceGetHandleByIndex(i)
|
device, ret := rocm_smi.DeviceGetHandleByIndex(i)
|
||||||
@@ -117,7 +116,7 @@ func (m *RocmSmiCollector) Init(config json.RawMessage) error {
|
|||||||
pciInfo.Device,
|
pciInfo.Device,
|
||||||
pciInfo.Function)
|
pciInfo.Function)
|
||||||
|
|
||||||
if exclDev(pciId) {
|
if slices.Contains(m.config.ExcludeDevices, pciId) {
|
||||||
continue
|
continue
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -175,127 +174,127 @@ func (m *RocmSmiCollector) Read(interval time.Duration, output chan lp.CCMessage
|
|||||||
|
|
||||||
if !dev.excludeMetrics["rocm_gfx_util"] {
|
if !dev.excludeMetrics["rocm_gfx_util"] {
|
||||||
value := metrics.Average_gfx_activity
|
value := metrics.Average_gfx_activity
|
||||||
y, err := lp.NewMessage("rocm_gfx_util", dev.tags, dev.meta, map[string]interface{}{"value": value}, timestamp)
|
y, err := lp.NewMessage("rocm_gfx_util", dev.tags, dev.meta, map[string]any{"value": value}, timestamp)
|
||||||
if err == nil {
|
if err == nil {
|
||||||
output <- y
|
output <- y
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
if !dev.excludeMetrics["rocm_umc_util"] {
|
if !dev.excludeMetrics["rocm_umc_util"] {
|
||||||
value := metrics.Average_umc_activity
|
value := metrics.Average_umc_activity
|
||||||
y, err := lp.NewMessage("rocm_umc_util", dev.tags, dev.meta, map[string]interface{}{"value": value}, timestamp)
|
y, err := lp.NewMessage("rocm_umc_util", dev.tags, dev.meta, map[string]any{"value": value}, timestamp)
|
||||||
if err == nil {
|
if err == nil {
|
||||||
output <- y
|
output <- y
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
if !dev.excludeMetrics["rocm_mm_util"] {
|
if !dev.excludeMetrics["rocm_mm_util"] {
|
||||||
value := metrics.Average_mm_activity
|
value := metrics.Average_mm_activity
|
||||||
y, err := lp.NewMessage("rocm_mm_util", dev.tags, dev.meta, map[string]interface{}{"value": value}, timestamp)
|
y, err := lp.NewMessage("rocm_mm_util", dev.tags, dev.meta, map[string]any{"value": value}, timestamp)
|
||||||
if err == nil {
|
if err == nil {
|
||||||
output <- y
|
output <- y
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
if !dev.excludeMetrics["rocm_avg_power"] {
|
if !dev.excludeMetrics["rocm_avg_power"] {
|
||||||
value := metrics.Average_socket_power
|
value := metrics.Average_socket_power
|
||||||
y, err := lp.NewMessage("rocm_avg_power", dev.tags, dev.meta, map[string]interface{}{"value": value}, timestamp)
|
y, err := lp.NewMessage("rocm_avg_power", dev.tags, dev.meta, map[string]any{"value": value}, timestamp)
|
||||||
if err == nil {
|
if err == nil {
|
||||||
output <- y
|
output <- y
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
if !dev.excludeMetrics["rocm_temp_mem"] {
|
if !dev.excludeMetrics["rocm_temp_mem"] {
|
||||||
value := metrics.Temperature_mem
|
value := metrics.Temperature_mem
|
||||||
y, err := lp.NewMessage("rocm_temp_mem", dev.tags, dev.meta, map[string]interface{}{"value": value}, timestamp)
|
y, err := lp.NewMessage("rocm_temp_mem", dev.tags, dev.meta, map[string]any{"value": value}, timestamp)
|
||||||
if err == nil {
|
if err == nil {
|
||||||
output <- y
|
output <- y
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
if !dev.excludeMetrics["rocm_temp_hotspot"] {
|
if !dev.excludeMetrics["rocm_temp_hotspot"] {
|
||||||
value := metrics.Temperature_hotspot
|
value := metrics.Temperature_hotspot
|
||||||
y, err := lp.NewMessage("rocm_temp_hotspot", dev.tags, dev.meta, map[string]interface{}{"value": value}, timestamp)
|
y, err := lp.NewMessage("rocm_temp_hotspot", dev.tags, dev.meta, map[string]any{"value": value}, timestamp)
|
||||||
if err == nil {
|
if err == nil {
|
||||||
output <- y
|
output <- y
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
if !dev.excludeMetrics["rocm_temp_edge"] {
|
if !dev.excludeMetrics["rocm_temp_edge"] {
|
||||||
value := metrics.Temperature_edge
|
value := metrics.Temperature_edge
|
||||||
y, err := lp.NewMessage("rocm_temp_edge", dev.tags, dev.meta, map[string]interface{}{"value": value}, timestamp)
|
y, err := lp.NewMessage("rocm_temp_edge", dev.tags, dev.meta, map[string]any{"value": value}, timestamp)
|
||||||
if err == nil {
|
if err == nil {
|
||||||
output <- y
|
output <- y
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
if !dev.excludeMetrics["rocm_temp_vrgfx"] {
|
if !dev.excludeMetrics["rocm_temp_vrgfx"] {
|
||||||
value := metrics.Temperature_vrgfx
|
value := metrics.Temperature_vrgfx
|
||||||
y, err := lp.NewMessage("rocm_temp_vrgfx", dev.tags, dev.meta, map[string]interface{}{"value": value}, timestamp)
|
y, err := lp.NewMessage("rocm_temp_vrgfx", dev.tags, dev.meta, map[string]any{"value": value}, timestamp)
|
||||||
if err == nil {
|
if err == nil {
|
||||||
output <- y
|
output <- y
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
if !dev.excludeMetrics["rocm_temp_vrsoc"] {
|
if !dev.excludeMetrics["rocm_temp_vrsoc"] {
|
||||||
value := metrics.Temperature_vrsoc
|
value := metrics.Temperature_vrsoc
|
||||||
y, err := lp.NewMessage("rocm_temp_vrsoc", dev.tags, dev.meta, map[string]interface{}{"value": value}, timestamp)
|
y, err := lp.NewMessage("rocm_temp_vrsoc", dev.tags, dev.meta, map[string]any{"value": value}, timestamp)
|
||||||
if err == nil {
|
if err == nil {
|
||||||
output <- y
|
output <- y
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
if !dev.excludeMetrics["rocm_temp_vrmem"] {
|
if !dev.excludeMetrics["rocm_temp_vrmem"] {
|
||||||
value := metrics.Temperature_vrmem
|
value := metrics.Temperature_vrmem
|
||||||
y, err := lp.NewMessage("rocm_temp_vrmem", dev.tags, dev.meta, map[string]interface{}{"value": value}, timestamp)
|
y, err := lp.NewMessage("rocm_temp_vrmem", dev.tags, dev.meta, map[string]any{"value": value}, timestamp)
|
||||||
if err == nil {
|
if err == nil {
|
||||||
output <- y
|
output <- y
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
if !dev.excludeMetrics["rocm_gfx_clock"] {
|
if !dev.excludeMetrics["rocm_gfx_clock"] {
|
||||||
value := metrics.Average_gfxclk_frequency
|
value := metrics.Average_gfxclk_frequency
|
||||||
y, err := lp.NewMessage("rocm_gfx_clock", dev.tags, dev.meta, map[string]interface{}{"value": value}, timestamp)
|
y, err := lp.NewMessage("rocm_gfx_clock", dev.tags, dev.meta, map[string]any{"value": value}, timestamp)
|
||||||
if err == nil {
|
if err == nil {
|
||||||
output <- y
|
output <- y
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
if !dev.excludeMetrics["rocm_soc_clock"] {
|
if !dev.excludeMetrics["rocm_soc_clock"] {
|
||||||
value := metrics.Average_socclk_frequency
|
value := metrics.Average_socclk_frequency
|
||||||
y, err := lp.NewMessage("rocm_soc_clock", dev.tags, dev.meta, map[string]interface{}{"value": value}, timestamp)
|
y, err := lp.NewMessage("rocm_soc_clock", dev.tags, dev.meta, map[string]any{"value": value}, timestamp)
|
||||||
if err == nil {
|
if err == nil {
|
||||||
output <- y
|
output <- y
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
if !dev.excludeMetrics["rocm_u_clock"] {
|
if !dev.excludeMetrics["rocm_u_clock"] {
|
||||||
value := metrics.Average_uclk_frequency
|
value := metrics.Average_uclk_frequency
|
||||||
y, err := lp.NewMessage("rocm_u_clock", dev.tags, dev.meta, map[string]interface{}{"value": value}, timestamp)
|
y, err := lp.NewMessage("rocm_u_clock", dev.tags, dev.meta, map[string]any{"value": value}, timestamp)
|
||||||
if err == nil {
|
if err == nil {
|
||||||
output <- y
|
output <- y
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
if !dev.excludeMetrics["rocm_v0_clock"] {
|
if !dev.excludeMetrics["rocm_v0_clock"] {
|
||||||
value := metrics.Average_vclk0_frequency
|
value := metrics.Average_vclk0_frequency
|
||||||
y, err := lp.NewMessage("rocm_v0_clock", dev.tags, dev.meta, map[string]interface{}{"value": value}, timestamp)
|
y, err := lp.NewMessage("rocm_v0_clock", dev.tags, dev.meta, map[string]any{"value": value}, timestamp)
|
||||||
if err == nil {
|
if err == nil {
|
||||||
output <- y
|
output <- y
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
if !dev.excludeMetrics["rocm_v1_clock"] {
|
if !dev.excludeMetrics["rocm_v1_clock"] {
|
||||||
value := metrics.Average_vclk1_frequency
|
value := metrics.Average_vclk1_frequency
|
||||||
y, err := lp.NewMessage("rocm_v1_clock", dev.tags, dev.meta, map[string]interface{}{"value": value}, timestamp)
|
y, err := lp.NewMessage("rocm_v1_clock", dev.tags, dev.meta, map[string]any{"value": value}, timestamp)
|
||||||
if err == nil {
|
if err == nil {
|
||||||
output <- y
|
output <- y
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
if !dev.excludeMetrics["rocm_d0_clock"] {
|
if !dev.excludeMetrics["rocm_d0_clock"] {
|
||||||
value := metrics.Average_dclk0_frequency
|
value := metrics.Average_dclk0_frequency
|
||||||
y, err := lp.NewMessage("rocm_d0_clock", dev.tags, dev.meta, map[string]interface{}{"value": value}, timestamp)
|
y, err := lp.NewMessage("rocm_d0_clock", dev.tags, dev.meta, map[string]any{"value": value}, timestamp)
|
||||||
if err == nil {
|
if err == nil {
|
||||||
output <- y
|
output <- y
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
if !dev.excludeMetrics["rocm_d1_clock"] {
|
if !dev.excludeMetrics["rocm_d1_clock"] {
|
||||||
value := metrics.Average_dclk1_frequency
|
value := metrics.Average_dclk1_frequency
|
||||||
y, err := lp.NewMessage("rocm_d1_clock", dev.tags, dev.meta, map[string]interface{}{"value": value}, timestamp)
|
y, err := lp.NewMessage("rocm_d1_clock", dev.tags, dev.meta, map[string]any{"value": value}, timestamp)
|
||||||
if err == nil {
|
if err == nil {
|
||||||
output <- y
|
output <- y
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
if !dev.excludeMetrics["rocm_temp_hbm"] {
|
if !dev.excludeMetrics["rocm_temp_hbm"] {
|
||||||
for i := 0; i < rocm_smi.NUM_HBM_INSTANCES; i++ {
|
for i := range rocm_smi.NUM_HBM_INSTANCES {
|
||||||
value := metrics.Temperature_hbm[i]
|
value := metrics.Temperature_hbm[i]
|
||||||
y, err := lp.NewMessage("rocm_temp_hbm", dev.tags, dev.meta, map[string]interface{}{"value": value}, timestamp)
|
y, err := lp.NewMessage("rocm_temp_hbm", dev.tags, dev.meta, map[string]any{"value": value}, timestamp)
|
||||||
if err == nil {
|
if err == nil {
|
||||||
y.AddTag("stype", "device")
|
y.AddTag("stype", "device")
|
||||||
y.AddTag("stype-id", fmt.Sprintf("%d", i))
|
y.AddTag("stype-id", fmt.Sprintf("%d", i))
|
||||||
|
|||||||
@@ -15,7 +15,9 @@ hugo_path: docs/reference/cc-metric-collector/collectors/rocmsmi.md
|
|||||||
```json
|
```json
|
||||||
"rocm_smi": {
|
"rocm_smi": {
|
||||||
"exclude_devices": [
|
"exclude_devices": [
|
||||||
"0","1", "0000000:ff:01.0"
|
"0",
|
||||||
|
"1",
|
||||||
|
"0000000:ff:01.0"
|
||||||
],
|
],
|
||||||
"exclude_metrics": [
|
"exclude_metrics": [
|
||||||
"rocm_mm_util",
|
"rocm_mm_util",
|
||||||
@@ -23,7 +25,7 @@ hugo_path: docs/reference/cc-metric-collector/collectors/rocmsmi.md
|
|||||||
],
|
],
|
||||||
"use_pci_info_as_type_id": true,
|
"use_pci_info_as_type_id": true,
|
||||||
"add_pci_info_tag": false,
|
"add_pci_info_tag": false,
|
||||||
"add_serial_meta": false,
|
"add_serial_meta": false
|
||||||
}
|
}
|
||||||
```
|
```
|
||||||
|
|
||||||
|
|||||||
@@ -1,11 +1,19 @@
|
|||||||
|
// Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
|
||||||
|
// All rights reserved. This file is part of cc-lib.
|
||||||
|
// Use of this source code is governed by a MIT-style
|
||||||
|
// license that can be found in the LICENSE file.
|
||||||
|
// additional authors:
|
||||||
|
// Holger Obermaier (NHR@KIT)
|
||||||
|
|
||||||
package collectors
|
package collectors
|
||||||
|
|
||||||
import (
|
import (
|
||||||
"encoding/json"
|
"encoding/json"
|
||||||
|
"fmt"
|
||||||
"time"
|
"time"
|
||||||
|
|
||||||
cclog "github.com/ClusterCockpit/cc-lib/ccLogger"
|
cclog "github.com/ClusterCockpit/cc-lib/v2/ccLogger"
|
||||||
lp "github.com/ClusterCockpit/cc-lib/ccMessage"
|
lp "github.com/ClusterCockpit/cc-lib/v2/ccMessage"
|
||||||
)
|
)
|
||||||
|
|
||||||
// These are the fields we read from the JSON configuration
|
// These are the fields we read from the JSON configuration
|
||||||
@@ -34,7 +42,9 @@ func (m *SampleCollector) Init(config json.RawMessage) error {
|
|||||||
// Always set the name early in Init() to use it in cclog.Component* functions
|
// Always set the name early in Init() to use it in cclog.Component* functions
|
||||||
m.name = "SampleCollector"
|
m.name = "SampleCollector"
|
||||||
// This is for later use, also call it early
|
// This is for later use, also call it early
|
||||||
m.setup()
|
if err := m.setup(); err != nil {
|
||||||
|
return fmt.Errorf("%s Init(): setup() call failed: %w", m.name, err)
|
||||||
|
}
|
||||||
// Tell whether the collector should be run in parallel with others (reading files, ...)
|
// Tell whether the collector should be run in parallel with others (reading files, ...)
|
||||||
// or it should be run serially, mostly for collectors actually doing measurements
|
// or it should be run serially, mostly for collectors actually doing measurements
|
||||||
// because they should not measure the execution of the other collectors
|
// because they should not measure the execution of the other collectors
|
||||||
@@ -85,7 +95,7 @@ func (m *SampleCollector) Read(interval time.Duration, output chan lp.CCMessage)
|
|||||||
// stop := readState()
|
// stop := readState()
|
||||||
// value = (stop - start) / interval.Seconds()
|
// value = (stop - start) / interval.Seconds()
|
||||||
|
|
||||||
y, err := lp.NewMessage("sample_metric", m.tags, m.meta, map[string]interface{}{"value": value}, timestamp)
|
y, err := lp.NewMessage("sample_metric", m.tags, m.meta, map[string]any{"value": value}, timestamp)
|
||||||
if err == nil {
|
if err == nil {
|
||||||
// Send it to output channel
|
// Send it to output channel
|
||||||
output <- y
|
output <- y
|
||||||
|
|||||||
@@ -1,12 +1,20 @@
|
|||||||
|
// Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
|
||||||
|
// All rights reserved. This file is part of cc-lib.
|
||||||
|
// Use of this source code is governed by a MIT-style
|
||||||
|
// license that can be found in the LICENSE file.
|
||||||
|
// additional authors:
|
||||||
|
// Holger Obermaier (NHR@KIT)
|
||||||
|
|
||||||
package collectors
|
package collectors
|
||||||
|
|
||||||
import (
|
import (
|
||||||
"encoding/json"
|
"encoding/json"
|
||||||
|
"fmt"
|
||||||
"sync"
|
"sync"
|
||||||
"time"
|
"time"
|
||||||
|
|
||||||
cclog "github.com/ClusterCockpit/cc-lib/ccLogger"
|
cclog "github.com/ClusterCockpit/cc-lib/v2/ccLogger"
|
||||||
lp "github.com/ClusterCockpit/cc-lib/ccMessage"
|
lp "github.com/ClusterCockpit/cc-lib/v2/ccMessage"
|
||||||
)
|
)
|
||||||
|
|
||||||
// These are the fields we read from the JSON configuration
|
// These are the fields we read from the JSON configuration
|
||||||
@@ -33,7 +41,9 @@ func (m *SampleTimerCollector) Init(name string, config json.RawMessage) error {
|
|||||||
// Always set the name early in Init() to use it in cclog.Component* functions
|
// Always set the name early in Init() to use it in cclog.Component* functions
|
||||||
m.name = "SampleTimerCollector"
|
m.name = "SampleTimerCollector"
|
||||||
// This is for later use, also call it early
|
// This is for later use, also call it early
|
||||||
m.setup()
|
if err := m.setup(); err != nil {
|
||||||
|
return fmt.Errorf("%s Init(): setup() call failed: %w", m.name, err)
|
||||||
|
}
|
||||||
// Define meta information sent with each metric
|
// Define meta information sent with each metric
|
||||||
// (Can also be dynamic or this is the basic set with extension through AddMeta())
|
// (Can also be dynamic or this is the basic set with extension through AddMeta())
|
||||||
m.meta = map[string]string{"source": m.name, "group": "SAMPLE"}
|
m.meta = map[string]string{"source": m.name, "group": "SAMPLE"}
|
||||||
@@ -100,7 +110,7 @@ func (m *SampleTimerCollector) ReadMetrics(timestamp time.Time) {
|
|||||||
// stop := readState()
|
// stop := readState()
|
||||||
// value = (stop - start) / interval.Seconds()
|
// value = (stop - start) / interval.Seconds()
|
||||||
|
|
||||||
y, err := lp.NewMessage("sample_metric", m.tags, m.meta, map[string]interface{}{"value": value}, timestamp)
|
y, err := lp.NewMessage("sample_metric", m.tags, m.meta, map[string]any{"value": value}, timestamp)
|
||||||
if err == nil && m.output != nil {
|
if err == nil && m.output != nil {
|
||||||
// Send it to output channel if we have a valid channel
|
// Send it to output channel if we have a valid channel
|
||||||
m.output <- y
|
m.output <- y
|
||||||
|
|||||||
@@ -1,17 +1,23 @@
|
|||||||
|
// Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
|
||||||
|
// All rights reserved. This file is part of cc-lib.
|
||||||
|
// Use of this source code is governed by a MIT-style
|
||||||
|
// license that can be found in the LICENSE file.
|
||||||
|
// additional authors:
|
||||||
|
// Holger Obermaier (NHR@KIT)
|
||||||
|
|
||||||
package collectors
|
package collectors
|
||||||
|
|
||||||
import (
|
import (
|
||||||
"bufio"
|
"bufio"
|
||||||
"encoding/json"
|
"encoding/json"
|
||||||
"fmt"
|
"fmt"
|
||||||
"math"
|
|
||||||
"os"
|
"os"
|
||||||
"strconv"
|
"strconv"
|
||||||
"strings"
|
"strings"
|
||||||
"time"
|
"time"
|
||||||
|
|
||||||
cclog "github.com/ClusterCockpit/cc-lib/ccLogger"
|
cclog "github.com/ClusterCockpit/cc-lib/v2/ccLogger"
|
||||||
lp "github.com/ClusterCockpit/cc-lib/ccMessage"
|
lp "github.com/ClusterCockpit/cc-lib/v2/ccMessage"
|
||||||
)
|
)
|
||||||
|
|
||||||
const SCHEDSTATFILE = `/proc/schedstat`
|
const SCHEDSTATFILE = `/proc/schedstat`
|
||||||
@@ -40,37 +46,37 @@ type SchedstatCollector struct {
|
|||||||
// Called once by the collector manager
|
// Called once by the collector manager
|
||||||
// All tags, meta data tags and metrics that do not change over the runtime should be set here
|
// All tags, meta data tags and metrics that do not change over the runtime should be set here
|
||||||
func (m *SchedstatCollector) Init(config json.RawMessage) error {
|
func (m *SchedstatCollector) Init(config json.RawMessage) error {
|
||||||
var err error = nil
|
|
||||||
// Always set the name early in Init() to use it in cclog.Component* functions
|
// Always set the name early in Init() to use it in cclog.Component* functions
|
||||||
m.name = "SchedstatCollector"
|
m.name = "SchedstatCollector"
|
||||||
// This is for later use, also call it early
|
// This is for later use, also call it early
|
||||||
m.setup()
|
if err := m.setup(); err != nil {
|
||||||
|
return fmt.Errorf("%s Init(): setup() call failed: %w", m.name, err)
|
||||||
|
}
|
||||||
// Tell whether the collector should be run in parallel with others (reading files, ...)
|
// Tell whether the collector should be run in parallel with others (reading files, ...)
|
||||||
// or it should be run serially, mostly for collectors actually doing measurements
|
// or it should be run serially, mostly for collectors actually doing measurements
|
||||||
// because they should not measure the execution of the other collectors
|
// because they should not measure the execution of the other collectors
|
||||||
m.parallel = true
|
m.parallel = true
|
||||||
// Define meta information sent with each metric
|
// Define meta information sent with each metric
|
||||||
// (Can also be dynamic or this is the basic set with extension through AddMeta())
|
// (Can also be dynamic or this is the basic set with extension through AddMeta())
|
||||||
m.meta = map[string]string{"source": m.name, "group": "SCHEDSTAT"}
|
m.meta = map[string]string{
|
||||||
|
"source": m.name,
|
||||||
|
"group": "SCHEDSTAT",
|
||||||
|
}
|
||||||
|
|
||||||
// Read in the JSON configuration
|
// Read in the JSON configuration
|
||||||
if len(config) > 0 {
|
if len(config) > 0 {
|
||||||
err = json.Unmarshal(config, &m.config)
|
if err := json.Unmarshal(config, &m.config); err != nil {
|
||||||
if err != nil {
|
return fmt.Errorf("%s Init(): Error reading config: %w", m.name, err)
|
||||||
cclog.ComponentError(m.name, "Error reading config:", err.Error())
|
|
||||||
return err
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// Check input file
|
// Check input file
|
||||||
file, err := os.Open(string(SCHEDSTATFILE))
|
file, err := os.Open(SCHEDSTATFILE)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
cclog.ComponentError(m.name, err.Error())
|
return fmt.Errorf("%s Init(): Failed opening scheduler statistics file \"%s\": %w", m.name, SCHEDSTATFILE, err)
|
||||||
}
|
}
|
||||||
defer file.Close()
|
|
||||||
|
|
||||||
// Pre-generate tags for all CPUs
|
// Pre-generate tags for all CPUs
|
||||||
num_cpus := 0
|
|
||||||
m.cputags = make(map[string]map[string]string)
|
m.cputags = make(map[string]map[string]string)
|
||||||
m.olddata = make(map[string]map[string]int64)
|
m.olddata = make(map[string]map[string]int64)
|
||||||
scanner := bufio.NewScanner(file)
|
scanner := bufio.NewScanner(file)
|
||||||
@@ -82,10 +88,18 @@ func (m *SchedstatCollector) Init(config json.RawMessage) error {
|
|||||||
cpu, _ := strconv.Atoi(cpustr)
|
cpu, _ := strconv.Atoi(cpustr)
|
||||||
running, _ := strconv.ParseInt(linefields[7], 10, 64)
|
running, _ := strconv.ParseInt(linefields[7], 10, 64)
|
||||||
waiting, _ := strconv.ParseInt(linefields[8], 10, 64)
|
waiting, _ := strconv.ParseInt(linefields[8], 10, 64)
|
||||||
m.cputags[linefields[0]] = map[string]string{"type": "hwthread", "type-id": fmt.Sprintf("%d", cpu)}
|
m.cputags[linefields[0]] = map[string]string{
|
||||||
m.olddata[linefields[0]] = map[string]int64{"running": running, "waiting": waiting}
|
"type": "hwthread",
|
||||||
num_cpus++
|
"type-id": fmt.Sprintf("%d", cpu),
|
||||||
}
|
}
|
||||||
|
m.olddata[linefields[0]] = map[string]int64{
|
||||||
|
"running": running,
|
||||||
|
"waiting": waiting,
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if err := file.Close(); err != nil {
|
||||||
|
return fmt.Errorf("%s Init(): Failed closing scheduler statistics file \"%s\": %w", m.name, SCHEDSTATFILE, err)
|
||||||
}
|
}
|
||||||
|
|
||||||
// Save current timestamp
|
// Save current timestamp
|
||||||
@@ -102,14 +116,14 @@ func (m *SchedstatCollector) ParseProcLine(linefields []string, tags map[string]
|
|||||||
diff_running := running - m.olddata[linefields[0]]["running"]
|
diff_running := running - m.olddata[linefields[0]]["running"]
|
||||||
diff_waiting := waiting - m.olddata[linefields[0]]["waiting"]
|
diff_waiting := waiting - m.olddata[linefields[0]]["waiting"]
|
||||||
|
|
||||||
var l_running float64 = float64(diff_running) / tsdelta.Seconds() / (math.Pow(1000, 3))
|
l_running := float64(diff_running) / tsdelta.Seconds() / 1000_000_000
|
||||||
var l_waiting float64 = float64(diff_waiting) / tsdelta.Seconds() / (math.Pow(1000, 3))
|
l_waiting := float64(diff_waiting) / tsdelta.Seconds() / 1000_000_000
|
||||||
|
|
||||||
m.olddata[linefields[0]]["running"] = running
|
m.olddata[linefields[0]]["running"] = running
|
||||||
m.olddata[linefields[0]]["waiting"] = waiting
|
m.olddata[linefields[0]]["waiting"] = waiting
|
||||||
value := l_running + l_waiting
|
value := l_running + l_waiting
|
||||||
|
|
||||||
y, err := lp.NewMessage("cpu_load_core", tags, m.meta, map[string]interface{}{"value": value}, now)
|
y, err := lp.NewMessage("cpu_load_core", tags, m.meta, map[string]any{"value": value}, now)
|
||||||
if err == nil {
|
if err == nil {
|
||||||
// Send it to output channel
|
// Send it to output channel
|
||||||
output <- y
|
output <- y
|
||||||
@@ -127,11 +141,19 @@ func (m *SchedstatCollector) Read(interval time.Duration, output chan lp.CCMessa
|
|||||||
now := time.Now()
|
now := time.Now()
|
||||||
tsdelta := now.Sub(m.lastTimestamp)
|
tsdelta := now.Sub(m.lastTimestamp)
|
||||||
|
|
||||||
file, err := os.Open(string(SCHEDSTATFILE))
|
file, err := os.Open(SCHEDSTATFILE)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
cclog.ComponentError(m.name, err.Error())
|
cclog.ComponentError(
|
||||||
|
m.name,
|
||||||
|
fmt.Sprintf("Read(): Failed to open file '%s': %v", SCHEDSTATFILE, err))
|
||||||
}
|
}
|
||||||
defer file.Close()
|
defer func() {
|
||||||
|
if err := file.Close(); err != nil {
|
||||||
|
cclog.ComponentError(
|
||||||
|
m.name,
|
||||||
|
fmt.Sprintf("Read(): Failed to close file '%s': %v", SCHEDSTATFILE, err))
|
||||||
|
}
|
||||||
|
}()
|
||||||
|
|
||||||
scanner := bufio.NewScanner(file)
|
scanner := bufio.NewScanner(file)
|
||||||
for scanner.Scan() {
|
for scanner.Scan() {
|
||||||
|
|||||||
@@ -1,13 +1,21 @@
|
|||||||
|
// Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
|
||||||
|
// All rights reserved. This file is part of cc-lib.
|
||||||
|
// Use of this source code is governed by a MIT-style
|
||||||
|
// license that can be found in the LICENSE file.
|
||||||
|
// additional authors:
|
||||||
|
// Holger Obermaier (NHR@KIT)
|
||||||
|
|
||||||
package collectors
|
package collectors
|
||||||
|
|
||||||
import (
|
import (
|
||||||
"encoding/json"
|
"encoding/json"
|
||||||
|
"fmt"
|
||||||
"runtime"
|
"runtime"
|
||||||
"syscall"
|
"syscall"
|
||||||
"time"
|
"time"
|
||||||
|
|
||||||
cclog "github.com/ClusterCockpit/cc-lib/ccLogger"
|
cclog "github.com/ClusterCockpit/cc-lib/v2/ccLogger"
|
||||||
lp "github.com/ClusterCockpit/cc-lib/ccMessage"
|
lp "github.com/ClusterCockpit/cc-lib/v2/ccMessage"
|
||||||
)
|
)
|
||||||
|
|
||||||
type SelfCollectorConfig struct {
|
type SelfCollectorConfig struct {
|
||||||
@@ -27,7 +35,9 @@ type SelfCollector struct {
|
|||||||
func (m *SelfCollector) Init(config json.RawMessage) error {
|
func (m *SelfCollector) Init(config json.RawMessage) error {
|
||||||
var err error = nil
|
var err error = nil
|
||||||
m.name = "SelfCollector"
|
m.name = "SelfCollector"
|
||||||
m.setup()
|
if err := m.setup(); err != nil {
|
||||||
|
return fmt.Errorf("%s Init(): setup() call failed: %w", m.name, err)
|
||||||
|
}
|
||||||
m.parallel = true
|
m.parallel = true
|
||||||
m.meta = map[string]string{"source": m.name, "group": "Self"}
|
m.meta = map[string]string{"source": m.name, "group": "Self"}
|
||||||
m.tags = map[string]string{"type": "node"}
|
m.tags = map[string]string{"type": "node"}
|
||||||
@@ -49,49 +59,49 @@ func (m *SelfCollector) Read(interval time.Duration, output chan lp.CCMessage) {
|
|||||||
var memstats runtime.MemStats
|
var memstats runtime.MemStats
|
||||||
runtime.ReadMemStats(&memstats)
|
runtime.ReadMemStats(&memstats)
|
||||||
|
|
||||||
y, err := lp.NewMessage("total_alloc", m.tags, m.meta, map[string]interface{}{"value": memstats.TotalAlloc}, timestamp)
|
y, err := lp.NewMessage("total_alloc", m.tags, m.meta, map[string]any{"value": memstats.TotalAlloc}, timestamp)
|
||||||
if err == nil {
|
if err == nil {
|
||||||
y.AddMeta("unit", "Bytes")
|
y.AddMeta("unit", "Bytes")
|
||||||
output <- y
|
output <- y
|
||||||
}
|
}
|
||||||
y, err = lp.NewMessage("heap_alloc", m.tags, m.meta, map[string]interface{}{"value": memstats.HeapAlloc}, timestamp)
|
y, err = lp.NewMessage("heap_alloc", m.tags, m.meta, map[string]any{"value": memstats.HeapAlloc}, timestamp)
|
||||||
if err == nil {
|
if err == nil {
|
||||||
y.AddMeta("unit", "Bytes")
|
y.AddMeta("unit", "Bytes")
|
||||||
output <- y
|
output <- y
|
||||||
}
|
}
|
||||||
y, err = lp.NewMessage("heap_sys", m.tags, m.meta, map[string]interface{}{"value": memstats.HeapSys}, timestamp)
|
y, err = lp.NewMessage("heap_sys", m.tags, m.meta, map[string]any{"value": memstats.HeapSys}, timestamp)
|
||||||
if err == nil {
|
if err == nil {
|
||||||
y.AddMeta("unit", "Bytes")
|
y.AddMeta("unit", "Bytes")
|
||||||
output <- y
|
output <- y
|
||||||
}
|
}
|
||||||
y, err = lp.NewMessage("heap_idle", m.tags, m.meta, map[string]interface{}{"value": memstats.HeapIdle}, timestamp)
|
y, err = lp.NewMessage("heap_idle", m.tags, m.meta, map[string]any{"value": memstats.HeapIdle}, timestamp)
|
||||||
if err == nil {
|
if err == nil {
|
||||||
y.AddMeta("unit", "Bytes")
|
y.AddMeta("unit", "Bytes")
|
||||||
output <- y
|
output <- y
|
||||||
}
|
}
|
||||||
y, err = lp.NewMessage("heap_inuse", m.tags, m.meta, map[string]interface{}{"value": memstats.HeapInuse}, timestamp)
|
y, err = lp.NewMessage("heap_inuse", m.tags, m.meta, map[string]any{"value": memstats.HeapInuse}, timestamp)
|
||||||
if err == nil {
|
if err == nil {
|
||||||
y.AddMeta("unit", "Bytes")
|
y.AddMeta("unit", "Bytes")
|
||||||
output <- y
|
output <- y
|
||||||
}
|
}
|
||||||
y, err = lp.NewMessage("heap_released", m.tags, m.meta, map[string]interface{}{"value": memstats.HeapReleased}, timestamp)
|
y, err = lp.NewMessage("heap_released", m.tags, m.meta, map[string]any{"value": memstats.HeapReleased}, timestamp)
|
||||||
if err == nil {
|
if err == nil {
|
||||||
y.AddMeta("unit", "Bytes")
|
y.AddMeta("unit", "Bytes")
|
||||||
output <- y
|
output <- y
|
||||||
}
|
}
|
||||||
y, err = lp.NewMessage("heap_objects", m.tags, m.meta, map[string]interface{}{"value": memstats.HeapObjects}, timestamp)
|
y, err = lp.NewMessage("heap_objects", m.tags, m.meta, map[string]any{"value": memstats.HeapObjects}, timestamp)
|
||||||
if err == nil {
|
if err == nil {
|
||||||
output <- y
|
output <- y
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
if m.config.GoRoutines {
|
if m.config.GoRoutines {
|
||||||
y, err := lp.NewMessage("num_goroutines", m.tags, m.meta, map[string]interface{}{"value": runtime.NumGoroutine()}, timestamp)
|
y, err := lp.NewMessage("num_goroutines", m.tags, m.meta, map[string]any{"value": runtime.NumGoroutine()}, timestamp)
|
||||||
if err == nil {
|
if err == nil {
|
||||||
output <- y
|
output <- y
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
if m.config.CgoCalls {
|
if m.config.CgoCalls {
|
||||||
y, err := lp.NewMessage("num_cgo_calls", m.tags, m.meta, map[string]interface{}{"value": runtime.NumCgoCall()}, timestamp)
|
y, err := lp.NewMessage("num_cgo_calls", m.tags, m.meta, map[string]any{"value": runtime.NumCgoCall()}, timestamp)
|
||||||
if err == nil {
|
if err == nil {
|
||||||
output <- y
|
output <- y
|
||||||
}
|
}
|
||||||
@@ -102,35 +112,35 @@ func (m *SelfCollector) Read(interval time.Duration, output chan lp.CCMessage) {
|
|||||||
if err == nil {
|
if err == nil {
|
||||||
sec, nsec := rusage.Utime.Unix()
|
sec, nsec := rusage.Utime.Unix()
|
||||||
t := float64(sec) + (float64(nsec) * 1e-9)
|
t := float64(sec) + (float64(nsec) * 1e-9)
|
||||||
y, err := lp.NewMessage("rusage_user_time", m.tags, m.meta, map[string]interface{}{"value": t}, timestamp)
|
y, err := lp.NewMessage("rusage_user_time", m.tags, m.meta, map[string]any{"value": t}, timestamp)
|
||||||
if err == nil {
|
if err == nil {
|
||||||
y.AddMeta("unit", "seconds")
|
y.AddMeta("unit", "seconds")
|
||||||
output <- y
|
output <- y
|
||||||
}
|
}
|
||||||
sec, nsec = rusage.Stime.Unix()
|
sec, nsec = rusage.Stime.Unix()
|
||||||
t = float64(sec) + (float64(nsec) * 1e-9)
|
t = float64(sec) + (float64(nsec) * 1e-9)
|
||||||
y, err = lp.NewMessage("rusage_system_time", m.tags, m.meta, map[string]interface{}{"value": t}, timestamp)
|
y, err = lp.NewMessage("rusage_system_time", m.tags, m.meta, map[string]any{"value": t}, timestamp)
|
||||||
if err == nil {
|
if err == nil {
|
||||||
y.AddMeta("unit", "seconds")
|
y.AddMeta("unit", "seconds")
|
||||||
output <- y
|
output <- y
|
||||||
}
|
}
|
||||||
y, err = lp.NewMessage("rusage_vol_ctx_switch", m.tags, m.meta, map[string]interface{}{"value": rusage.Nvcsw}, timestamp)
|
y, err = lp.NewMessage("rusage_vol_ctx_switch", m.tags, m.meta, map[string]any{"value": rusage.Nvcsw}, timestamp)
|
||||||
if err == nil {
|
if err == nil {
|
||||||
output <- y
|
output <- y
|
||||||
}
|
}
|
||||||
y, err = lp.NewMessage("rusage_invol_ctx_switch", m.tags, m.meta, map[string]interface{}{"value": rusage.Nivcsw}, timestamp)
|
y, err = lp.NewMessage("rusage_invol_ctx_switch", m.tags, m.meta, map[string]any{"value": rusage.Nivcsw}, timestamp)
|
||||||
if err == nil {
|
if err == nil {
|
||||||
output <- y
|
output <- y
|
||||||
}
|
}
|
||||||
y, err = lp.NewMessage("rusage_signals", m.tags, m.meta, map[string]interface{}{"value": rusage.Nsignals}, timestamp)
|
y, err = lp.NewMessage("rusage_signals", m.tags, m.meta, map[string]any{"value": rusage.Nsignals}, timestamp)
|
||||||
if err == nil {
|
if err == nil {
|
||||||
output <- y
|
output <- y
|
||||||
}
|
}
|
||||||
y, err = lp.NewMessage("rusage_major_pgfaults", m.tags, m.meta, map[string]interface{}{"value": rusage.Majflt}, timestamp)
|
y, err = lp.NewMessage("rusage_major_pgfaults", m.tags, m.meta, map[string]any{"value": rusage.Majflt}, timestamp)
|
||||||
if err == nil {
|
if err == nil {
|
||||||
output <- y
|
output <- y
|
||||||
}
|
}
|
||||||
y, err = lp.NewMessage("rusage_minor_pgfaults", m.tags, m.meta, map[string]interface{}{"value": rusage.Minflt}, timestamp)
|
y, err = lp.NewMessage("rusage_minor_pgfaults", m.tags, m.meta, map[string]any{"value": rusage.Minflt}, timestamp)
|
||||||
if err == nil {
|
if err == nil {
|
||||||
output <- y
|
output <- y
|
||||||
}
|
}
|
||||||
|
|||||||
350
collectors/slurmCgroupMetric.go
Normal file
350
collectors/slurmCgroupMetric.go
Normal file
@@ -0,0 +1,350 @@
|
|||||||
|
package collectors
|
||||||
|
|
||||||
|
import (
|
||||||
|
"encoding/json"
|
||||||
|
"fmt"
|
||||||
|
"os"
|
||||||
|
"os/exec"
|
||||||
|
"os/user"
|
||||||
|
"path/filepath"
|
||||||
|
"strconv"
|
||||||
|
"strings"
|
||||||
|
"time"
|
||||||
|
|
||||||
|
cclog "github.com/ClusterCockpit/cc-lib/v2/ccLogger"
|
||||||
|
lp "github.com/ClusterCockpit/cc-lib/v2/ccMessage"
|
||||||
|
)
|
||||||
|
|
||||||
|
// SlurmJobData holds the values extracted from the cgroup directory of a
// single SLURM job. A field keeps its zero value when the corresponding cgroup
// file could not be read or parsed.
type SlurmJobData struct {
	// MemoryUsage is the value read from memory.current.
	MemoryUsage float64
	// MaxMemoryUsage is the value read from memory.peak.
	MaxMemoryUsage float64
	// LimitMemoryUsage is the value read from memory.max.
	LimitMemoryUsage float64
	// CpuUsageUser is user_usec as a percentage of usage_usec (cpu.stat).
	CpuUsageUser float64
	// CpuUsageSys is system_usec as a percentage of usage_usec (cpu.stat).
	CpuUsageSys float64
	// CpuSet lists the CPU IDs parsed from cpuset.cpus.
	CpuSet []int
}
|
||||||
|
|
||||||
|
// SlurmCgroupsConfig is the JSON configuration of the SlurmCgroupCollector.
type SlurmCgroupsConfig struct {
	// CgroupBase overrides the directory that is searched for job_* cgroups.
	// When empty, the collector keeps its built-in default path.
	CgroupBase string `json:"cgroup_base"`
	// ExcludeMetrics lists the names of metrics that must not be sent.
	ExcludeMetrics []string `json:"exclude_metrics,omitempty"`
	// UseSudo makes the collector read cgroup files through sudo instead of
	// opening them directly; it also disables the root check in Init.
	UseSudo bool `json:"use_sudo,omitempty"`
}
|
||||||
|
|
||||||
|
type SlurmCgroupCollector struct {
|
||||||
|
metricCollector
|
||||||
|
config SlurmCgroupsConfig
|
||||||
|
meta map[string]string
|
||||||
|
tags map[string]string
|
||||||
|
allCPUs []int
|
||||||
|
cpuUsed map[int]bool
|
||||||
|
cgroupBase string
|
||||||
|
excludeMetrics map[string]struct{}
|
||||||
|
useSudo bool
|
||||||
|
}
|
||||||
|
|
||||||
|
// defaultCgroupBase is the cgroup directory searched for job_* entries when
// the configuration does not set cgroup_base.
const defaultCgroupBase = "/sys/fs/cgroup/system.slice/slurmstepd.scope"
|
||||||
|
|
||||||
|
func ParseCPUs(cpuset string) ([]int, error) {
|
||||||
|
var result []int
|
||||||
|
if cpuset == "" {
|
||||||
|
return result, nil
|
||||||
|
}
|
||||||
|
|
||||||
|
for r := range strings.SplitSeq(cpuset, ",") {
|
||||||
|
if strings.Contains(r, "-") {
|
||||||
|
parts := strings.Split(r, "-")
|
||||||
|
if len(parts) != 2 {
|
||||||
|
return nil, fmt.Errorf("invalid CPU range: %s", r)
|
||||||
|
}
|
||||||
|
start, err := strconv.Atoi(strings.TrimSpace(parts[0]))
|
||||||
|
if err != nil {
|
||||||
|
return nil, fmt.Errorf("invalid CPU range start: %s", parts[0])
|
||||||
|
}
|
||||||
|
end, err := strconv.Atoi(strings.TrimSpace(parts[1]))
|
||||||
|
if err != nil {
|
||||||
|
return nil, fmt.Errorf("invalid CPU range end: %s", parts[1])
|
||||||
|
}
|
||||||
|
for i := start; i <= end; i++ {
|
||||||
|
result = append(result, i)
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
cpu, err := strconv.Atoi(strings.TrimSpace(r))
|
||||||
|
if err != nil {
|
||||||
|
return nil, fmt.Errorf("invalid CPU ID: %s", r)
|
||||||
|
}
|
||||||
|
result = append(result, cpu)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return result, nil
|
||||||
|
}
|
||||||
|
|
||||||
|
func GetAllCPUs() ([]int, error) {
|
||||||
|
data, err := os.ReadFile("/sys/devices/system/cpu/online")
|
||||||
|
if err != nil {
|
||||||
|
return nil, fmt.Errorf("failed to read /sys/devices/system/cpu/online: %v", err)
|
||||||
|
}
|
||||||
|
return ParseCPUs(strings.TrimSpace(string(data)))
|
||||||
|
}
|
||||||
|
|
||||||
|
func (m *SlurmCgroupCollector) isExcluded(metric string) bool {
|
||||||
|
_, found := m.excludeMetrics[metric]
|
||||||
|
return found
|
||||||
|
}
|
||||||
|
|
||||||
|
func (m *SlurmCgroupCollector) readFile(path string) ([]byte, error) {
|
||||||
|
if m.useSudo {
|
||||||
|
cmd := exec.Command("sudo", "cat", path)
|
||||||
|
return cmd.Output()
|
||||||
|
}
|
||||||
|
return os.ReadFile(path)
|
||||||
|
}
|
||||||
|
|
||||||
|
func (m *SlurmCgroupCollector) Init(config json.RawMessage) error {
|
||||||
|
var err error
|
||||||
|
m.name = "SlurmCgroupCollector"
|
||||||
|
if err := m.setup(); err != nil {
|
||||||
|
return fmt.Errorf("%s Init(): setup() call failed: %w", m.name, err)
|
||||||
|
}
|
||||||
|
m.parallel = true
|
||||||
|
m.meta = map[string]string{"source": m.name, "group": "SLURM"}
|
||||||
|
m.tags = map[string]string{"type": "hwthread"}
|
||||||
|
m.cpuUsed = make(map[int]bool)
|
||||||
|
m.cgroupBase = defaultCgroupBase
|
||||||
|
|
||||||
|
if len(config) > 0 {
|
||||||
|
err = json.Unmarshal(config, &m.config)
|
||||||
|
if err != nil {
|
||||||
|
cclog.ComponentError(m.name, "Error reading config:", err.Error())
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
m.excludeMetrics = make(map[string]struct{})
|
||||||
|
for _, metric := range m.config.ExcludeMetrics {
|
||||||
|
m.excludeMetrics[metric] = struct{}{}
|
||||||
|
}
|
||||||
|
if m.config.CgroupBase != "" {
|
||||||
|
m.cgroupBase = m.config.CgroupBase
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
m.useSudo = m.config.UseSudo
|
||||||
|
if !m.useSudo {
|
||||||
|
user, err := user.Current()
|
||||||
|
if err != nil {
|
||||||
|
cclog.ComponentError(m.name, "Failed to get current user:", err.Error())
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
if user.Uid != "0" {
|
||||||
|
cclog.ComponentError(m.name, "Reading cgroup files requires root privileges (or enable use_sudo in config)")
|
||||||
|
return fmt.Errorf("not root")
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
m.allCPUs, err = GetAllCPUs()
|
||||||
|
if err != nil {
|
||||||
|
cclog.ComponentError(m.name, "Error reading online CPUs:", err.Error())
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
|
||||||
|
m.init = true
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
|
||||||
|
func (m *SlurmCgroupCollector) ReadJobData(jobdir string) (SlurmJobData, error) {
|
||||||
|
jobdata := SlurmJobData{
|
||||||
|
MemoryUsage: 0,
|
||||||
|
MaxMemoryUsage: 0,
|
||||||
|
LimitMemoryUsage: 0,
|
||||||
|
CpuUsageUser: 0,
|
||||||
|
CpuUsageSys: 0,
|
||||||
|
CpuSet: []int{},
|
||||||
|
}
|
||||||
|
|
||||||
|
cg := func(f string) string { return filepath.Join(m.cgroupBase, jobdir, f) }
|
||||||
|
|
||||||
|
memUsage, err := m.readFile(cg("memory.current"))
|
||||||
|
if err == nil {
|
||||||
|
x, err := strconv.ParseFloat(strings.TrimSpace(string(memUsage)), 64)
|
||||||
|
if err == nil {
|
||||||
|
jobdata.MemoryUsage = x
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
maxMem, err := m.readFile(cg("memory.peak"))
|
||||||
|
if err == nil {
|
||||||
|
x, err := strconv.ParseFloat(strings.TrimSpace(string(maxMem)), 64)
|
||||||
|
if err == nil {
|
||||||
|
jobdata.MaxMemoryUsage = x
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
limitMem, err := m.readFile(cg("memory.max"))
|
||||||
|
if err == nil {
|
||||||
|
x, err := strconv.ParseFloat(strings.TrimSpace(string(limitMem)), 64)
|
||||||
|
if err == nil {
|
||||||
|
jobdata.LimitMemoryUsage = x
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
cpuStat, err := m.readFile(cg("cpu.stat"))
|
||||||
|
if err == nil {
|
||||||
|
lines := strings.Split(strings.TrimSpace(string(cpuStat)), "\n")
|
||||||
|
var usageUsec, userUsec, systemUsec float64
|
||||||
|
for _, line := range lines {
|
||||||
|
fields := strings.Fields(line)
|
||||||
|
if len(fields) < 2 {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
value, err := strconv.ParseFloat(fields[1], 64)
|
||||||
|
if err != nil {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
switch fields[0] {
|
||||||
|
case "usage_usec":
|
||||||
|
usageUsec = value
|
||||||
|
case "user_usec":
|
||||||
|
userUsec = value
|
||||||
|
case "system_usec":
|
||||||
|
systemUsec = value
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if usageUsec > 0 {
|
||||||
|
jobdata.CpuUsageUser = (userUsec * 100 / usageUsec)
|
||||||
|
jobdata.CpuUsageSys = (systemUsec * 100 / usageUsec)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
cpuSet, err := m.readFile(cg("cpuset.cpus"))
|
||||||
|
if err == nil {
|
||||||
|
cpus, err := ParseCPUs(strings.TrimSpace(string(cpuSet)))
|
||||||
|
if err == nil {
|
||||||
|
jobdata.CpuSet = cpus
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
return jobdata, nil
|
||||||
|
}
|
||||||
|
|
||||||
|
func (m *SlurmCgroupCollector) Read(interval time.Duration, output chan lp.CCMessage) {
|
||||||
|
timestamp := time.Now()
|
||||||
|
|
||||||
|
for k := range m.cpuUsed {
|
||||||
|
delete(m.cpuUsed, k)
|
||||||
|
}
|
||||||
|
|
||||||
|
globPattern := filepath.Join(m.cgroupBase, "job_*")
|
||||||
|
jobDirs, err := filepath.Glob(globPattern)
|
||||||
|
if err != nil {
|
||||||
|
cclog.ComponentError(m.name, "Error globbing job directories:", err.Error())
|
||||||
|
return
|
||||||
|
}
|
||||||
|
|
||||||
|
for _, jdir := range jobDirs {
|
||||||
|
jKey := filepath.Base(jdir)
|
||||||
|
|
||||||
|
jobdata, err := m.ReadJobData(jKey)
|
||||||
|
if err != nil {
|
||||||
|
cclog.ComponentError(m.name, "Error reading job data for", jKey, ":", err.Error())
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
|
||||||
|
if len(jobdata.CpuSet) > 0 {
|
||||||
|
coreCount := float64(len(jobdata.CpuSet))
|
||||||
|
for _, cpu := range jobdata.CpuSet {
|
||||||
|
coreTags := map[string]string{
|
||||||
|
"type": "hwthread",
|
||||||
|
"type-id": fmt.Sprintf("%d", cpu),
|
||||||
|
}
|
||||||
|
|
||||||
|
if coreCount > 0 && !m.isExcluded("job_mem_used") {
|
||||||
|
memPerCore := jobdata.MemoryUsage / coreCount
|
||||||
|
if y, err := lp.NewMessage("job_mem_used", coreTags, m.meta, map[string]any{"value": memPerCore}, timestamp); err == nil {
|
||||||
|
y.AddMeta("unit", "Bytes")
|
||||||
|
output <- y
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if coreCount > 0 && !m.isExcluded("job_max_mem_used") {
|
||||||
|
maxMemPerCore := jobdata.MaxMemoryUsage / coreCount
|
||||||
|
if y, err := lp.NewMessage("job_max_mem_used", coreTags, m.meta, map[string]any{"value": maxMemPerCore}, timestamp); err == nil {
|
||||||
|
y.AddMeta("unit", "Bytes")
|
||||||
|
output <- y
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if coreCount > 0 && !m.isExcluded("job_mem_limit") {
|
||||||
|
limitPerCore := jobdata.LimitMemoryUsage / coreCount
|
||||||
|
if y, err := lp.NewMessage("job_mem_limit", coreTags, m.meta, map[string]any{"value": limitPerCore}, timestamp); err == nil {
|
||||||
|
y.AddMeta("unit", "Bytes")
|
||||||
|
output <- y
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if coreCount > 0 && !m.isExcluded("job_user_cpu") {
|
||||||
|
cpuUserPerCore := jobdata.CpuUsageUser / coreCount
|
||||||
|
if y, err := lp.NewMessage("job_user_cpu", coreTags, m.meta, map[string]any{"value": cpuUserPerCore}, timestamp); err == nil {
|
||||||
|
y.AddMeta("unit", "%")
|
||||||
|
output <- y
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if coreCount > 0 && !m.isExcluded("job_sys_cpu") {
|
||||||
|
cpuSysPerCore := jobdata.CpuUsageSys / coreCount
|
||||||
|
if y, err := lp.NewMessage("job_sys_cpu", coreTags, m.meta, map[string]any{"value": cpuSysPerCore}, timestamp); err == nil {
|
||||||
|
y.AddMeta("unit", "%")
|
||||||
|
output <- y
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
m.cpuUsed[cpu] = true
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
for _, cpu := range m.allCPUs {
|
||||||
|
if !m.cpuUsed[cpu] {
|
||||||
|
coreTags := map[string]string{
|
||||||
|
"type": "hwthread",
|
||||||
|
"type-id": fmt.Sprintf("%d", cpu),
|
||||||
|
}
|
||||||
|
|
||||||
|
if !m.isExcluded("job_mem_used") {
|
||||||
|
if y, err := lp.NewMessage("job_mem_used", coreTags, m.meta, map[string]any{"value": 0}, timestamp); err == nil {
|
||||||
|
y.AddMeta("unit", "Bytes")
|
||||||
|
output <- y
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if !m.isExcluded("job_max_mem_used") {
|
||||||
|
if y, err := lp.NewMessage("job_max_mem_used", coreTags, m.meta, map[string]any{"value": 0}, timestamp); err == nil {
|
||||||
|
y.AddMeta("unit", "Bytes")
|
||||||
|
output <- y
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if !m.isExcluded("job_mem_limit") {
|
||||||
|
if y, err := lp.NewMessage("job_mem_limit", coreTags, m.meta, map[string]any{"value": 0}, timestamp); err == nil {
|
||||||
|
y.AddMeta("unit", "Bytes")
|
||||||
|
output <- y
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if !m.isExcluded("job_user_cpu") {
|
||||||
|
if y, err := lp.NewMessage("job_user_cpu", coreTags, m.meta, map[string]any{"value": 0}, timestamp); err == nil {
|
||||||
|
y.AddMeta("unit", "%")
|
||||||
|
output <- y
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if !m.isExcluded("job_sys_cpu") {
|
||||||
|
if y, err := lp.NewMessage("job_sys_cpu", coreTags, m.meta, map[string]any{"value": 0}, timestamp); err == nil {
|
||||||
|
y.AddMeta("unit", "%")
|
||||||
|
output <- y
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Close marks the collector as no longer initialized by resetting m.init to false.
func (m *SlurmCgroupCollector) Close() {
|
||||||
|
m.init = false
|
||||||
|
}
|
||||||
50
collectors/slurmCgroupMetric.md
Normal file
50
collectors/slurmCgroupMetric.md
Normal file
@@ -0,0 +1,50 @@
|
|||||||
|
<!--
|
||||||
|
---
|
||||||
|
title: Slurm cgroup metric collector
|
||||||
|
description: Collect per-core memory and CPU usage for SLURM jobs from cgroup v2
|
||||||
|
categories: [cc-metric-collector]
|
||||||
|
tags: ['Admin']
|
||||||
|
weight: 3
|
||||||
|
hugo_path: docs/reference/cc-metric-collector/collectors/slurm_cgroup.md
|
||||||
|
---
|
||||||
|
-->
|
||||||
|
|
||||||
|
## `slurm_cgroup` collector
|
||||||
|
|
||||||
|
The `slurm_cgroup` collector reads job-specific resource metrics from the cgroup v2 filesystem and provides **hwthread** metrics for memory and CPU usage of running SLURM jobs.
|
||||||
|
|
||||||
|
### Example configuration
|
||||||
|
|
||||||
|
```json
|
||||||
|
"slurm_cgroup": {
|
||||||
|
"cgroup_base": "/sys/fs/cgroup/system.slice/slurmstepd.scope",
|
||||||
|
"exclude_metrics": [
|
||||||
|
"job_sys_cpu",
|
||||||
|
"job_mem_limit"
|
||||||
|
],
|
||||||
|
"use_sudo": false
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
* The `cgroup_base` parameter (optional) can be set to specify the root path to SLURM job cgroups. The default is `/sys/fs/cgroup/system.slice/slurmstepd.scope`.
|
||||||
|
* The `exclude_metrics` array can be used to suppress individual metrics from being sent to the sink.
|
||||||
|
* The cgroup metrics are only readable by the root user. If password-less sudo is configured, you can enable its use by setting the `use_sudo` option to `true`.
|
||||||
|
|
||||||
|
### Reported metrics
|
||||||
|
|
||||||
|
All metrics are available **per hardware thread**:
|
||||||
|
|
||||||
|
* `job_mem_used` (`unit=Bytes`): Current memory usage of the job
|
||||||
|
* `job_max_mem_used` (`unit=Bytes`): Peak memory usage
|
||||||
|
* `job_mem_limit` (`unit=Bytes`): Cgroup memory limit
|
||||||
|
* `job_user_cpu` (`unit=%`): User CPU utilization percentage
|
||||||
|
* `job_sys_cpu` (`unit=%`): System CPU utilization percentage
|
||||||
|
|
||||||
|
Each metric has tags:
|
||||||
|
|
||||||
|
* `type=hwthread`
|
||||||
|
* `type-id=<core_id>`
|
||||||
|
|
||||||
|
### Limitations
|
||||||
|
|
||||||
|
* **cgroups v2 required:** This collector only supports systems running with cgroups v2 (unified hierarchy).
|
||||||
@@ -1,3 +1,10 @@
|
|||||||
|
// Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
|
||||||
|
// All rights reserved. This file is part of cc-lib.
|
||||||
|
// Use of this source code is governed by a MIT-style
|
||||||
|
// license that can be found in the LICENSE file.
|
||||||
|
// additional authors:
|
||||||
|
// Holger Obermaier (NHR@KIT)
|
||||||
|
|
||||||
package collectors
|
package collectors
|
||||||
|
|
||||||
import (
|
import (
|
||||||
@@ -9,8 +16,8 @@ import (
|
|||||||
"strings"
|
"strings"
|
||||||
"time"
|
"time"
|
||||||
|
|
||||||
cclog "github.com/ClusterCockpit/cc-lib/ccLogger"
|
cclog "github.com/ClusterCockpit/cc-lib/v2/ccLogger"
|
||||||
lp "github.com/ClusterCockpit/cc-lib/ccMessage"
|
lp "github.com/ClusterCockpit/cc-lib/v2/ccMessage"
|
||||||
)
|
)
|
||||||
|
|
||||||
// See: https://www.kernel.org/doc/html/latest/hwmon/sysfs-interface.html
|
// See: https://www.kernel.org/doc/html/latest/hwmon/sysfs-interface.html
|
||||||
@@ -51,7 +58,9 @@ func (m *TempCollector) Init(config json.RawMessage) error {
|
|||||||
|
|
||||||
m.name = "TempCollector"
|
m.name = "TempCollector"
|
||||||
m.parallel = true
|
m.parallel = true
|
||||||
m.setup()
|
if err := m.setup(); err != nil {
|
||||||
|
return fmt.Errorf("%s Init(): setup() call failed: %w", m.name, err)
|
||||||
|
}
|
||||||
if len(config) > 0 {
|
if len(config) > 0 {
|
||||||
err := json.Unmarshal(config, &m.config)
|
err := json.Unmarshal(config, &m.config)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
@@ -110,7 +119,7 @@ func (m *TempCollector) Init(config json.RawMessage) error {
|
|||||||
sensor.metricName = sensor.label
|
sensor.metricName = sensor.label
|
||||||
}
|
}
|
||||||
sensor.metricName = strings.ToLower(sensor.metricName)
|
sensor.metricName = strings.ToLower(sensor.metricName)
|
||||||
sensor.metricName = strings.Replace(sensor.metricName, " ", "_", -1)
|
sensor.metricName = strings.ReplaceAll(sensor.metricName, " ", "_")
|
||||||
// Add temperature prefix, if required
|
// Add temperature prefix, if required
|
||||||
if !strings.Contains(sensor.metricName, "temp") {
|
if !strings.Contains(sensor.metricName, "temp") {
|
||||||
sensor.metricName = "temp_" + sensor.metricName
|
sensor.metricName = "temp_" + sensor.metricName
|
||||||
@@ -194,7 +203,7 @@ func (m *TempCollector) Read(interval time.Duration, output chan lp.CCMessage) {
|
|||||||
sensor.metricName,
|
sensor.metricName,
|
||||||
sensor.tags,
|
sensor.tags,
|
||||||
m.meta,
|
m.meta,
|
||||||
map[string]interface{}{"value": x},
|
map[string]any{"value": x},
|
||||||
time.Now(),
|
time.Now(),
|
||||||
)
|
)
|
||||||
if err == nil {
|
if err == nil {
|
||||||
@@ -207,7 +216,7 @@ func (m *TempCollector) Read(interval time.Duration, output chan lp.CCMessage) {
|
|||||||
sensor.maxTempName,
|
sensor.maxTempName,
|
||||||
sensor.tags,
|
sensor.tags,
|
||||||
m.meta,
|
m.meta,
|
||||||
map[string]interface{}{"value": sensor.maxTemp},
|
map[string]any{"value": sensor.maxTemp},
|
||||||
time.Now(),
|
time.Now(),
|
||||||
)
|
)
|
||||||
if err == nil {
|
if err == nil {
|
||||||
@@ -221,7 +230,7 @@ func (m *TempCollector) Read(interval time.Duration, output chan lp.CCMessage) {
|
|||||||
sensor.critTempName,
|
sensor.critTempName,
|
||||||
sensor.tags,
|
sensor.tags,
|
||||||
m.meta,
|
m.meta,
|
||||||
map[string]interface{}{"value": sensor.critTemp},
|
map[string]any{"value": sensor.critTemp},
|
||||||
time.Now(),
|
time.Now(),
|
||||||
)
|
)
|
||||||
if err == nil {
|
if err == nil {
|
||||||
|
|||||||
@@ -1,15 +1,21 @@
|
|||||||
|
// Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
|
||||||
|
// All rights reserved. This file is part of cc-lib.
|
||||||
|
// Use of this source code is governed by a MIT-style
|
||||||
|
// license that can be found in the LICENSE file.
|
||||||
|
// additional authors:
|
||||||
|
// Holger Obermaier (NHR@KIT)
|
||||||
|
|
||||||
package collectors
|
package collectors
|
||||||
|
|
||||||
import (
|
import (
|
||||||
"encoding/json"
|
"encoding/json"
|
||||||
"errors"
|
|
||||||
"fmt"
|
"fmt"
|
||||||
"log"
|
|
||||||
"os/exec"
|
"os/exec"
|
||||||
"strings"
|
"strings"
|
||||||
"time"
|
"time"
|
||||||
|
|
||||||
lp "github.com/ClusterCockpit/cc-lib/ccMessage"
|
cclog "github.com/ClusterCockpit/cc-lib/v2/ccLogger"
|
||||||
|
lp "github.com/ClusterCockpit/cc-lib/v2/ccMessage"
|
||||||
)
|
)
|
||||||
|
|
||||||
const MAX_NUM_PROCS = 10
|
const MAX_NUM_PROCS = 10
|
||||||
@@ -29,12 +35,17 @@ func (m *TopProcsCollector) Init(config json.RawMessage) error {
|
|||||||
var err error
|
var err error
|
||||||
m.name = "TopProcsCollector"
|
m.name = "TopProcsCollector"
|
||||||
m.parallel = true
|
m.parallel = true
|
||||||
m.tags = map[string]string{"type": "node"}
|
m.tags = map[string]string{
|
||||||
m.meta = map[string]string{"source": m.name, "group": "TopProcs"}
|
"type": "node",
|
||||||
|
}
|
||||||
|
m.meta = map[string]string{
|
||||||
|
"source": m.name,
|
||||||
|
"group": "TopProcs",
|
||||||
|
}
|
||||||
if len(config) > 0 {
|
if len(config) > 0 {
|
||||||
err = json.Unmarshal(config, &m.config)
|
err = json.Unmarshal(config, &m.config)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return err
|
return fmt.Errorf("%s Init(): json.Unmarshal() failed: %w", m.name, err)
|
||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
m.config.Num_procs = int(DEFAULT_NUM_PROCS)
|
m.config.Num_procs = int(DEFAULT_NUM_PROCS)
|
||||||
@@ -42,12 +53,13 @@ func (m *TopProcsCollector) Init(config json.RawMessage) error {
|
|||||||
if m.config.Num_procs <= 0 || m.config.Num_procs > MAX_NUM_PROCS {
|
if m.config.Num_procs <= 0 || m.config.Num_procs > MAX_NUM_PROCS {
|
||||||
return fmt.Errorf("num_procs option must be set in 'topprocs' config (range: 1-%d)", MAX_NUM_PROCS)
|
return fmt.Errorf("num_procs option must be set in 'topprocs' config (range: 1-%d)", MAX_NUM_PROCS)
|
||||||
}
|
}
|
||||||
m.setup()
|
if err := m.setup(); err != nil {
|
||||||
|
return fmt.Errorf("%s Init(): setup() call failed: %w", m.name, err)
|
||||||
|
}
|
||||||
command := exec.Command("ps", "-Ao", "comm", "--sort=-pcpu")
|
command := exec.Command("ps", "-Ao", "comm", "--sort=-pcpu")
|
||||||
command.Wait()
|
|
||||||
_, err = command.Output()
|
_, err = command.Output()
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return errors.New("failed to execute command")
|
return fmt.Errorf("%s Init(): failed to get output from command: %w", m.name, err)
|
||||||
}
|
}
|
||||||
m.init = true
|
m.init = true
|
||||||
return nil
|
return nil
|
||||||
@@ -58,17 +70,18 @@ func (m *TopProcsCollector) Read(interval time.Duration, output chan lp.CCMessag
|
|||||||
return
|
return
|
||||||
}
|
}
|
||||||
command := exec.Command("ps", "-Ao", "comm", "--sort=-pcpu")
|
command := exec.Command("ps", "-Ao", "comm", "--sort=-pcpu")
|
||||||
command.Wait()
|
|
||||||
stdout, err := command.Output()
|
stdout, err := command.Output()
|
||||||
if err != nil {
|
if err != nil {
|
||||||
log.Print(m.name, err)
|
cclog.ComponentError(
|
||||||
|
m.name,
|
||||||
|
fmt.Sprintf("Read(): Failed to read output from command \"%s\": %v", command.String(), err))
|
||||||
return
|
return
|
||||||
}
|
}
|
||||||
|
|
||||||
lines := strings.Split(string(stdout), "\n")
|
lines := strings.Split(string(stdout), "\n")
|
||||||
for i := 1; i < m.config.Num_procs+1; i++ {
|
for i := 1; i < m.config.Num_procs+1; i++ {
|
||||||
name := fmt.Sprintf("topproc%d", i)
|
name := fmt.Sprintf("topproc%d", i)
|
||||||
y, err := lp.NewMessage(name, m.tags, m.meta, map[string]interface{}{"value": string(lines[i])}, time.Now())
|
y, err := lp.NewMessage(name, m.tags, m.meta, map[string]any{"value": string(lines[i])}, time.Now())
|
||||||
if err == nil {
|
if err == nil {
|
||||||
output <- y
|
output <- y
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -4,7 +4,7 @@ The configuration of the CC metric collector consists of five configuration file
|
|||||||
|
|
||||||
## Global configuration
|
## Global configuration
|
||||||
|
|
||||||
The global file contains the paths to the other four files and some global options.
|
The global file contains the paths to the other four files and some global options. You can find examples in `example_configs`.
|
||||||
|
|
||||||
```json
|
```json
|
||||||
{
|
{
|
||||||
|
|||||||
@@ -1,6 +1,19 @@
|
|||||||
{
|
{
|
||||||
"cpufreq": {},
|
"cpufreq": {},
|
||||||
"cpufreq_cpuinfo": {},
|
"cpufreq_cpuinfo": {},
|
||||||
|
"cpustat": {
|
||||||
|
"exclude_metrics": [
|
||||||
|
"cpu_idle"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"diskstat": {
|
||||||
|
"exclude_metrics": [
|
||||||
|
"disk_total"
|
||||||
|
],
|
||||||
|
"exclude_mounts": [
|
||||||
|
"slurm-tmpfs"
|
||||||
|
]
|
||||||
|
},
|
||||||
"gpfs": {
|
"gpfs": {
|
||||||
"exclude_filesystem": [
|
"exclude_filesystem": [
|
||||||
"test_fs"
|
"test_fs"
|
||||||
@@ -21,6 +34,8 @@
|
|||||||
},
|
},
|
||||||
"numastats": {},
|
"numastats": {},
|
||||||
"nvidia": {},
|
"nvidia": {},
|
||||||
|
"schedstat": {
|
||||||
|
},
|
||||||
"tempstat": {
|
"tempstat": {
|
||||||
"report_max_temperature": true,
|
"report_max_temperature": true,
|
||||||
"report_critical_temperature": true,
|
"report_critical_temperature": true,
|
||||||
57
go.mod
57
go.mod
@@ -1,48 +1,45 @@
|
|||||||
module github.com/ClusterCockpit/cc-metric-collector
|
module github.com/ClusterCockpit/cc-metric-collector
|
||||||
|
|
||||||
go 1.23.4
|
go 1.24.0
|
||||||
|
|
||||||
toolchain go1.23.7
|
|
||||||
|
|
||||||
require (
|
require (
|
||||||
github.com/ClusterCockpit/cc-lib v0.1.1
|
github.com/ClusterCockpit/cc-lib/v2 v2.1.0
|
||||||
github.com/ClusterCockpit/go-rocm-smi v0.3.0
|
github.com/ClusterCockpit/go-rocm-smi v0.3.0
|
||||||
github.com/NVIDIA/go-nvml v0.12.0-2
|
github.com/NVIDIA/go-nvml v0.13.0-1
|
||||||
github.com/PaesslerAG/gval v1.2.2
|
github.com/PaesslerAG/gval v1.2.4
|
||||||
github.com/fsnotify/fsnotify v1.7.0
|
github.com/fsnotify/fsnotify v1.9.0
|
||||||
github.com/gorilla/mux v1.8.1
|
|
||||||
github.com/influxdata/influxdb-client-go/v2 v2.14.0
|
|
||||||
github.com/influxdata/line-protocol v0.0.0-20210922203350-b1ad95c89adf
|
github.com/influxdata/line-protocol v0.0.0-20210922203350-b1ad95c89adf
|
||||||
github.com/influxdata/line-protocol/v2 v2.2.1
|
github.com/tklauser/go-sysconf v0.3.16
|
||||||
github.com/nats-io/nats.go v1.39.0
|
|
||||||
github.com/prometheus/client_golang v1.20.5
|
|
||||||
github.com/stmcginnis/gofish v0.15.0
|
|
||||||
github.com/tklauser/go-sysconf v0.3.13
|
|
||||||
golang.design/x/thread v0.0.0-20210122121316-335e9adffdf1
|
golang.design/x/thread v0.0.0-20210122121316-335e9adffdf1
|
||||||
golang.org/x/exp v0.0.0-20250215185904-eff6e970281f
|
golang.org/x/sys v0.40.0
|
||||||
golang.org/x/sys v0.30.0
|
|
||||||
)
|
)
|
||||||
|
|
||||||
require (
|
require (
|
||||||
github.com/ClusterCockpit/cc-backend v1.4.2 // indirect
|
|
||||||
github.com/ClusterCockpit/cc-units v0.4.0 // indirect
|
|
||||||
github.com/apapsch/go-jsonmerge/v2 v2.0.0 // indirect
|
github.com/apapsch/go-jsonmerge/v2 v2.0.0 // indirect
|
||||||
github.com/beorn7/perks v1.0.1 // indirect
|
github.com/beorn7/perks v1.0.1 // indirect
|
||||||
github.com/cespare/xxhash/v2 v2.3.0 // indirect
|
github.com/cespare/xxhash/v2 v2.3.0 // indirect
|
||||||
github.com/expr-lang/expr v1.17.0 // indirect
|
github.com/expr-lang/expr v1.17.7 // indirect
|
||||||
github.com/google/uuid v1.6.0 // indirect
|
github.com/google/uuid v1.6.0 // indirect
|
||||||
github.com/klauspost/compress v1.17.9 // indirect
|
github.com/gorilla/mux v1.8.1 // indirect
|
||||||
|
github.com/influxdata/influxdb-client-go/v2 v2.14.0 // indirect
|
||||||
|
github.com/influxdata/line-protocol/v2 v2.2.1 // indirect
|
||||||
|
github.com/klauspost/compress v1.18.2 // indirect
|
||||||
github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822 // indirect
|
github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822 // indirect
|
||||||
github.com/nats-io/nkeys v0.4.9 // indirect
|
github.com/nats-io/nats.go v1.48.0 // indirect
|
||||||
|
github.com/nats-io/nkeys v0.4.12 // indirect
|
||||||
github.com/nats-io/nuid v1.0.1 // indirect
|
github.com/nats-io/nuid v1.0.1 // indirect
|
||||||
github.com/oapi-codegen/runtime v1.1.1 // indirect
|
github.com/oapi-codegen/runtime v1.1.2 // indirect
|
||||||
github.com/prometheus/client_model v0.6.1 // indirect
|
github.com/prometheus/client_golang v1.23.2 // indirect
|
||||||
github.com/prometheus/common v0.55.0 // indirect
|
github.com/prometheus/client_model v0.6.2 // indirect
|
||||||
github.com/prometheus/procfs v0.15.1 // indirect
|
github.com/prometheus/common v0.67.5 // indirect
|
||||||
|
github.com/prometheus/procfs v0.19.2 // indirect
|
||||||
github.com/santhosh-tekuri/jsonschema/v5 v5.3.1 // indirect
|
github.com/santhosh-tekuri/jsonschema/v5 v5.3.1 // indirect
|
||||||
github.com/shopspring/decimal v1.3.1 // indirect
|
github.com/shopspring/decimal v1.4.0 // indirect
|
||||||
github.com/tklauser/numcpus v0.7.0 // indirect
|
github.com/stmcginnis/gofish v0.20.0 // indirect
|
||||||
golang.org/x/crypto v0.35.0 // indirect
|
github.com/tklauser/numcpus v0.11.0 // indirect
|
||||||
golang.org/x/net v0.36.0 // indirect
|
go.yaml.in/yaml/v2 v2.4.3 // indirect
|
||||||
google.golang.org/protobuf v1.35.2 // indirect
|
golang.org/x/crypto v0.47.0 // indirect
|
||||||
|
golang.org/x/exp v0.0.0-20260112195511-716be5621a96 // indirect
|
||||||
|
golang.org/x/net v0.49.0 // indirect
|
||||||
|
google.golang.org/protobuf v1.36.11 // indirect
|
||||||
)
|
)
|
||||||
|
|||||||
133
go.sum
133
go.sum
@@ -1,21 +1,17 @@
|
|||||||
github.com/ClusterCockpit/cc-backend v1.4.2 h1:kTOzqkh9N0564N9nqQThnSs7TAfg8RLgvSm00e5HtIc=
|
github.com/ClusterCockpit/cc-lib/v2 v2.1.0 h1:B6l6h0IjfEuY9DU6aVM3fSsj24lQ1eudXK9QTKmJjqg=
|
||||||
github.com/ClusterCockpit/cc-backend v1.4.2/go.mod h1:g8TNHXe4AXej26snu2//jO3mUF980elT93iV/k11O/c=
|
github.com/ClusterCockpit/cc-lib/v2 v2.1.0/go.mod h1:JuxMAuEOaLLNEnnL9U3ejha8kMvsSatLdKPZEgJw6iw=
|
||||||
github.com/ClusterCockpit/cc-lib v0.1.0-beta.1 h1:dz9j0g2cod8+SMDjuoIY6ISpiHHeekhX6yQaeiwiwJw=
|
|
||||||
github.com/ClusterCockpit/cc-lib v0.1.0-beta.1/go.mod h1:kXMskla1i5ZSfXW0vVRIHgGeXMU5zu2PzYOYnUaOr80=
|
|
||||||
github.com/ClusterCockpit/cc-lib v0.1.1 h1:AXZWYUzgTaE/WdxLNSWPR7FJoA5WlzvYZxw4gIw3gNw=
|
|
||||||
github.com/ClusterCockpit/cc-lib v0.1.1/go.mod h1:SHKcWW/+kN+pcofAtHJFxvmx1FV0VIJuQv5PuT0HDcc=
|
|
||||||
github.com/ClusterCockpit/cc-units v0.4.0 h1:zP5DOu99GmErW0tCDf0gcLrlWt42RQ9dpoONEOh4cI0=
|
|
||||||
github.com/ClusterCockpit/cc-units v0.4.0/go.mod h1:3S3PAhAayS3pbgcT4q9Vn9VJw22Op51X0YimtG77zBw=
|
|
||||||
github.com/ClusterCockpit/go-rocm-smi v0.3.0 h1:1qZnSpG7/NyLtc7AjqnUL9Jb8xtqG1nMVgp69rJfaR8=
|
github.com/ClusterCockpit/go-rocm-smi v0.3.0 h1:1qZnSpG7/NyLtc7AjqnUL9Jb8xtqG1nMVgp69rJfaR8=
|
||||||
github.com/ClusterCockpit/go-rocm-smi v0.3.0/go.mod h1:+I3UMeX3OlizXDf1WpGD43W4KGZZGVSGmny6rTeOnWA=
|
github.com/ClusterCockpit/go-rocm-smi v0.3.0/go.mod h1:+I3UMeX3OlizXDf1WpGD43W4KGZZGVSGmny6rTeOnWA=
|
||||||
github.com/NVIDIA/go-nvml v0.11.6-0/go.mod h1:hy7HYeQy335x6nEss0Ne3PYqleRa6Ct+VKD9RQ4nyFs=
|
github.com/NVIDIA/go-nvml v0.11.6-0/go.mod h1:hy7HYeQy335x6nEss0Ne3PYqleRa6Ct+VKD9RQ4nyFs=
|
||||||
github.com/NVIDIA/go-nvml v0.12.0-2 h1:Sg239yy7jmopu/cuvYauoMj9fOpcGMngxVxxS1EBXeY=
|
github.com/NVIDIA/go-nvml v0.13.0-1 h1:OLX8Jq3dONuPOQPC7rndB6+iDmDakw0XTYgzMxObkEw=
|
||||||
github.com/NVIDIA/go-nvml v0.12.0-2/go.mod h1:7ruy85eOM73muOc/I37euONSwEyFqZsv5ED9AogD4G0=
|
github.com/NVIDIA/go-nvml v0.13.0-1/go.mod h1:+KNA7c7gIBH7SKSJ1ntlwkfN80zdx8ovl4hrK3LmPt4=
|
||||||
github.com/PaesslerAG/gval v1.2.2 h1:Y7iBzhgE09IGTt5QgGQ2IdaYYYOU134YGHBThD+wm9E=
|
github.com/PaesslerAG/gval v1.2.4 h1:rhX7MpjJlcxYwL2eTTYIOBUyEKZ+A96T9vQySWkVUiU=
|
||||||
github.com/PaesslerAG/gval v1.2.2/go.mod h1:XRFLwvmkTEdYziLdaCeCa5ImcGVrfQbeNUbVR+C6xac=
|
github.com/PaesslerAG/gval v1.2.4/go.mod h1:XRFLwvmkTEdYziLdaCeCa5ImcGVrfQbeNUbVR+C6xac=
|
||||||
github.com/PaesslerAG/jsonpath v0.1.0 h1:gADYeifvlqK3R3i2cR5B4DGgxLXIPb3TRTH1mGi0jPI=
|
github.com/PaesslerAG/jsonpath v0.1.0 h1:gADYeifvlqK3R3i2cR5B4DGgxLXIPb3TRTH1mGi0jPI=
|
||||||
github.com/PaesslerAG/jsonpath v0.1.0/go.mod h1:4BzmtoM/PI8fPO4aQGIusjGxGir2BzcV0grWtFzq1Y8=
|
github.com/PaesslerAG/jsonpath v0.1.0/go.mod h1:4BzmtoM/PI8fPO4aQGIusjGxGir2BzcV0grWtFzq1Y8=
|
||||||
github.com/RaveNoX/go-jsoncommentstrip v1.0.0/go.mod h1:78ihd09MekBnJnxpICcwzCMzGrKSKYe4AqU6PDYYpjk=
|
github.com/RaveNoX/go-jsoncommentstrip v1.0.0/go.mod h1:78ihd09MekBnJnxpICcwzCMzGrKSKYe4AqU6PDYYpjk=
|
||||||
|
github.com/antithesishq/antithesis-sdk-go v0.5.0-default-no-op h1:Ucf+QxEKMbPogRO5guBNe5cgd9uZgfoJLOYs8WWhtjM=
|
||||||
|
github.com/antithesishq/antithesis-sdk-go v0.5.0-default-no-op/go.mod h1:IUpT2DPAKh6i/YhSbt6Gl3v2yvUZjmKncl7U91fup7E=
|
||||||
github.com/apapsch/go-jsonmerge/v2 v2.0.0 h1:axGnT1gRIfimI7gJifB699GoE/oq+F2MU7Dml6nw9rQ=
|
github.com/apapsch/go-jsonmerge/v2 v2.0.0 h1:axGnT1gRIfimI7gJifB699GoE/oq+F2MU7Dml6nw9rQ=
|
||||||
github.com/apapsch/go-jsonmerge/v2 v2.0.0/go.mod h1:lvDnEdqiQrp0O42VQGgmlKpxL1AP2+08jFMw88y4klk=
|
github.com/apapsch/go-jsonmerge/v2 v2.0.0/go.mod h1:lvDnEdqiQrp0O42VQGgmlKpxL1AP2+08jFMw88y4klk=
|
||||||
github.com/beorn7/perks v1.0.1 h1:VlbKKnNfV8bJzeqoa4cOKqO6bYr3WgKZxO8Z16+hsOM=
|
github.com/beorn7/perks v1.0.1 h1:VlbKKnNfV8bJzeqoa4cOKqO6bYr3WgKZxO8Z16+hsOM=
|
||||||
@@ -27,20 +23,20 @@ github.com/creack/pty v1.1.9/go.mod h1:oKZEueFk5CKHvIhNR5MUki03XCEU+Q6VDXinZuGJ3
|
|||||||
github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
|
github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
|
||||||
github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c=
|
github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c=
|
||||||
github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
|
github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
|
||||||
github.com/expr-lang/expr v1.16.9 h1:WUAzmR0JNI9JCiF0/ewwHB1gmcGw5wW7nWt8gc6PpCI=
|
github.com/expr-lang/expr v1.17.7 h1:Q0xY/e/2aCIp8g9s/LGvMDCC5PxYlvHgDZRQ4y16JX8=
|
||||||
github.com/expr-lang/expr v1.16.9/go.mod h1:8/vRC7+7HBzESEqt5kKpYXxrxkr31SaO8r40VO/1IT4=
|
github.com/expr-lang/expr v1.17.7/go.mod h1:8/vRC7+7HBzESEqt5kKpYXxrxkr31SaO8r40VO/1IT4=
|
||||||
github.com/expr-lang/expr v1.17.0 h1:+vpszOyzKLQXC9VF+wA8cVA0tlA984/Wabc/1hF9Whg=
|
|
||||||
github.com/expr-lang/expr v1.17.0/go.mod h1:8/vRC7+7HBzESEqt5kKpYXxrxkr31SaO8r40VO/1IT4=
|
|
||||||
github.com/frankban/quicktest v1.11.0/go.mod h1:K+q6oSqb0W0Ininfk863uOk1lMy69l/P6txr3mVT54s=
|
github.com/frankban/quicktest v1.11.0/go.mod h1:K+q6oSqb0W0Ininfk863uOk1lMy69l/P6txr3mVT54s=
|
||||||
github.com/frankban/quicktest v1.11.2/go.mod h1:K+q6oSqb0W0Ininfk863uOk1lMy69l/P6txr3mVT54s=
|
github.com/frankban/quicktest v1.11.2/go.mod h1:K+q6oSqb0W0Ininfk863uOk1lMy69l/P6txr3mVT54s=
|
||||||
github.com/frankban/quicktest v1.13.0 h1:yNZif1OkDfNoDfb9zZa9aXIpejNR4F23Wely0c+Qdqk=
|
github.com/frankban/quicktest v1.13.0 h1:yNZif1OkDfNoDfb9zZa9aXIpejNR4F23Wely0c+Qdqk=
|
||||||
github.com/frankban/quicktest v1.13.0/go.mod h1:qLE0fzW0VuyUAJgPU19zByoIr0HtCHN/r/VLSOOIySU=
|
github.com/frankban/quicktest v1.13.0/go.mod h1:qLE0fzW0VuyUAJgPU19zByoIr0HtCHN/r/VLSOOIySU=
|
||||||
github.com/fsnotify/fsnotify v1.7.0 h1:8JEhPFa5W2WU7YfeZzPNqzMP6Lwt7L2715Ggo0nosvA=
|
github.com/fsnotify/fsnotify v1.9.0 h1:2Ml+OJNzbYCTzsxtv8vKSFD9PbJjmhYF14k/jKC7S9k=
|
||||||
github.com/fsnotify/fsnotify v1.7.0/go.mod h1:40Bi/Hjc2AVfZrqy+aj+yEI+/bRxZnMJyTJwOpGvigM=
|
github.com/fsnotify/fsnotify v1.9.0/go.mod h1:8jBTzvmWwFyi3Pb8djgCCO5IBqzKJ/Jwo8TRcHyHii0=
|
||||||
github.com/google/go-cmp v0.5.2/go.mod h1:v8dTdLbMG2kIc/vJvl+f65V22dbkXbowE6jgT/gNBxE=
|
github.com/google/go-cmp v0.5.2/go.mod h1:v8dTdLbMG2kIc/vJvl+f65V22dbkXbowE6jgT/gNBxE=
|
||||||
github.com/google/go-cmp v0.5.5/go.mod h1:v8dTdLbMG2kIc/vJvl+f65V22dbkXbowE6jgT/gNBxE=
|
github.com/google/go-cmp v0.5.5/go.mod h1:v8dTdLbMG2kIc/vJvl+f65V22dbkXbowE6jgT/gNBxE=
|
||||||
github.com/google/go-cmp v0.6.0 h1:ofyhxvXcZhMsU5ulbFiLKl/XBFqE1GSq7atu8tAmTRI=
|
github.com/google/go-cmp v0.7.0 h1:wk8382ETsv4JYUZwIsn6YpYiWiBsYLSJiTsyBybVuN8=
|
||||||
github.com/google/go-cmp v0.6.0/go.mod h1:17dUlkBOakJ0+DkrSSNjCkIjxS6bF9zb3elmeNGIjoY=
|
github.com/google/go-cmp v0.7.0/go.mod h1:pXiqmnSA92OHEEa9HXL2W4E7lf9JzCmGVUdgjX3N/iU=
|
||||||
|
github.com/google/go-tpm v0.9.7 h1:u89J4tUUeDTlH8xxC3CTW7OHZjbjKoHdQ9W7gCUhtxA=
|
||||||
|
github.com/google/go-tpm v0.9.7/go.mod h1:h9jEsEECg7gtLis0upRBQU+GhYVH6jMjrFxI8u6bVUY=
|
||||||
github.com/google/uuid v1.6.0 h1:NIvaJDMOsjHA8n1jAhLSgzrAzy1Hgr+hNrb57e+94F0=
|
github.com/google/uuid v1.6.0 h1:NIvaJDMOsjHA8n1jAhLSgzrAzy1Hgr+hNrb57e+94F0=
|
||||||
github.com/google/uuid v1.6.0/go.mod h1:TIyPZe4MgqvfeYDBFedMoGGpEw/LqOeaOT+nhxU+yHo=
|
github.com/google/uuid v1.6.0/go.mod h1:TIyPZe4MgqvfeYDBFedMoGGpEw/LqOeaOT+nhxU+yHo=
|
||||||
github.com/gorilla/mux v1.8.1 h1:TuBL49tXwgrFYWhqrNgrUNEY92u81SPhu7sTdzQEiWY=
|
github.com/gorilla/mux v1.8.1 h1:TuBL49tXwgrFYWhqrNgrUNEY92u81SPhu7sTdzQEiWY=
|
||||||
@@ -57,8 +53,8 @@ github.com/influxdata/line-protocol/v2 v2.1.0/go.mod h1:QKw43hdUBg3GTk2iC3iyCxks
|
|||||||
github.com/influxdata/line-protocol/v2 v2.2.1 h1:EAPkqJ9Km4uAxtMRgUubJyqAr6zgWM0dznKMLRauQRE=
|
github.com/influxdata/line-protocol/v2 v2.2.1 h1:EAPkqJ9Km4uAxtMRgUubJyqAr6zgWM0dznKMLRauQRE=
|
||||||
github.com/influxdata/line-protocol/v2 v2.2.1/go.mod h1:DmB3Cnh+3oxmG6LOBIxce4oaL4CPj3OmMPgvauXh+tM=
|
github.com/influxdata/line-protocol/v2 v2.2.1/go.mod h1:DmB3Cnh+3oxmG6LOBIxce4oaL4CPj3OmMPgvauXh+tM=
|
||||||
github.com/juju/gnuflag v0.0.0-20171113085948-2ce1bb71843d/go.mod h1:2PavIy+JPciBPrBUjwbNvtwB6RQlve+hkpll6QSNmOE=
|
github.com/juju/gnuflag v0.0.0-20171113085948-2ce1bb71843d/go.mod h1:2PavIy+JPciBPrBUjwbNvtwB6RQlve+hkpll6QSNmOE=
|
||||||
github.com/klauspost/compress v1.17.9 h1:6KIumPrER1LHsvBVuDa0r5xaG0Es51mhhB9BQB2qeMA=
|
github.com/klauspost/compress v1.18.2 h1:iiPHWW0YrcFgpBYhsA6D1+fqHssJscY/Tm/y2Uqnapk=
|
||||||
github.com/klauspost/compress v1.17.9/go.mod h1:Di0epgTjJY877eYKx5yC51cX2A2Vl2ibi7bDH9ttBbw=
|
github.com/klauspost/compress v1.18.2/go.mod h1:R0h/fSBs8DE4ENlcrlib3PsXS61voFxhIs2DeRhCvJ4=
|
||||||
github.com/kr/pretty v0.2.1/go.mod h1:ipq/a2n7PKx3OHsz4KJII5eveXtPO4qwEXGdVfWzfnI=
|
github.com/kr/pretty v0.2.1/go.mod h1:ipq/a2n7PKx3OHsz4KJII5eveXtPO4qwEXGdVfWzfnI=
|
||||||
github.com/kr/pretty v0.3.1 h1:flRD4NNwYAUpkphVc1HcthR4KEIFJ65n8Mw5qdRn3LE=
|
github.com/kr/pretty v0.3.1 h1:flRD4NNwYAUpkphVc1HcthR4KEIFJ65n8Mw5qdRn3LE=
|
||||||
github.com/kr/pretty v0.3.1/go.mod h1:hoEshYVHaxMs3cyo3Yncou5ZscifuDolrwPKZanG3xk=
|
github.com/kr/pretty v0.3.1/go.mod h1:hoEshYVHaxMs3cyo3Yncou5ZscifuDolrwPKZanG3xk=
|
||||||
@@ -68,72 +64,75 @@ github.com/kr/text v0.2.0 h1:5Nx0Ya0ZqY2ygV366QzturHI13Jq95ApcVaJBhpS+AY=
|
|||||||
github.com/kr/text v0.2.0/go.mod h1:eLer722TekiGuMkidMxC/pM04lWEeraHUUmBw8l2grE=
|
github.com/kr/text v0.2.0/go.mod h1:eLer722TekiGuMkidMxC/pM04lWEeraHUUmBw8l2grE=
|
||||||
github.com/kylelemons/godebug v1.1.0 h1:RPNrshWIDI6G2gRW9EHilWtl7Z6Sb1BR0xunSBf0SNc=
|
github.com/kylelemons/godebug v1.1.0 h1:RPNrshWIDI6G2gRW9EHilWtl7Z6Sb1BR0xunSBf0SNc=
|
||||||
github.com/kylelemons/godebug v1.1.0/go.mod h1:9/0rRGxNHcop5bhtWyNeEfOS8JIWk580+fNqagV/RAw=
|
github.com/kylelemons/godebug v1.1.0/go.mod h1:9/0rRGxNHcop5bhtWyNeEfOS8JIWk580+fNqagV/RAw=
|
||||||
|
github.com/minio/highwayhash v1.0.4-0.20251030100505-070ab1a87a76 h1:KGuD/pM2JpL9FAYvBrnBBeENKZNh6eNtjqytV6TYjnk=
|
||||||
|
github.com/minio/highwayhash v1.0.4-0.20251030100505-070ab1a87a76/go.mod h1:GGYsuwP/fPD6Y9hMiXuapVvlIUEhFhMTh0rxU3ik1LQ=
|
||||||
github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822 h1:C3w9PqII01/Oq1c1nUAm88MOHcQC9l5mIlSMApZMrHA=
|
github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822 h1:C3w9PqII01/Oq1c1nUAm88MOHcQC9l5mIlSMApZMrHA=
|
||||||
github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822/go.mod h1:+n7T8mK8HuQTcFwEeznm/DIxMOiR9yIdICNftLE1DvQ=
|
github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822/go.mod h1:+n7T8mK8HuQTcFwEeznm/DIxMOiR9yIdICNftLE1DvQ=
|
||||||
github.com/nats-io/nats.go v1.39.0 h1:2/yg2JQjiYYKLwDuBzV0FbB2sIV+eFNkEevlRi4n9lI=
|
github.com/nats-io/jwt/v2 v2.8.0 h1:K7uzyz50+yGZDO5o772eRE7atlcSEENpL7P+b74JV1g=
|
||||||
github.com/nats-io/nats.go v1.39.0/go.mod h1:MgRb8oOdigA6cYpEPhXJuRVH6UE/V4jblJ2jQ27IXYM=
|
github.com/nats-io/jwt/v2 v2.8.0/go.mod h1:me11pOkwObtcBNR8AiMrUbtVOUGkqYjMQZ6jnSdVUIA=
|
||||||
github.com/nats-io/nkeys v0.4.9 h1:qe9Faq2Gxwi6RZnZMXfmGMZkg3afLLOtrU+gDZJ35b0=
|
github.com/nats-io/nats-server/v2 v2.12.3 h1:KRv+1n7lddMVgkJPQer+pt36TcO0ENxjilBmeWdjcHs=
|
||||||
github.com/nats-io/nkeys v0.4.9/go.mod h1:jcMqs+FLG+W5YO36OX6wFIFcmpdAns+w1Wm6D3I/evE=
|
github.com/nats-io/nats-server/v2 v2.12.3/go.mod h1:MQXjG9WjyXKz9koWzUc3jYUMKD8x3CLmTNy91IQQz3Y=
|
||||||
|
github.com/nats-io/nats.go v1.48.0 h1:pSFyXApG+yWU/TgbKCjmm5K4wrHu86231/w84qRVR+U=
|
||||||
|
github.com/nats-io/nats.go v1.48.0/go.mod h1:iRWIPokVIFbVijxuMQq4y9ttaBTMe0SFdlZfMDd+33g=
|
||||||
|
github.com/nats-io/nkeys v0.4.12 h1:nssm7JKOG9/x4J8II47VWCL1Ds29avyiQDRn0ckMvDc=
|
||||||
|
github.com/nats-io/nkeys v0.4.12/go.mod h1:MT59A1HYcjIcyQDJStTfaOY6vhy9XTUjOFo+SVsvpBg=
|
||||||
github.com/nats-io/nuid v1.0.1 h1:5iA8DT8V7q8WK2EScv2padNa/rTESc1KdnPw4TC2paw=
|
github.com/nats-io/nuid v1.0.1 h1:5iA8DT8V7q8WK2EScv2padNa/rTESc1KdnPw4TC2paw=
|
||||||
github.com/nats-io/nuid v1.0.1/go.mod h1:19wcPz3Ph3q0Jbyiqsd0kePYG7A95tJPxeL+1OSON2c=
|
github.com/nats-io/nuid v1.0.1/go.mod h1:19wcPz3Ph3q0Jbyiqsd0kePYG7A95tJPxeL+1OSON2c=
|
||||||
github.com/niemeyer/pretty v0.0.0-20200227124842-a10e7caefd8e/go.mod h1:zD1mROLANZcx1PVRCS0qkT7pwLkGfwJo4zjcN/Tysno=
|
github.com/niemeyer/pretty v0.0.0-20200227124842-a10e7caefd8e/go.mod h1:zD1mROLANZcx1PVRCS0qkT7pwLkGfwJo4zjcN/Tysno=
|
||||||
github.com/oapi-codegen/runtime v1.1.1 h1:EXLHh0DXIJnWhdRPN2w4MXAzFyE4CskzhNLUmtpMYro=
|
github.com/oapi-codegen/runtime v1.1.2 h1:P2+CubHq8fO4Q6fV1tqDBZHCwpVpvPg7oKiYzQgXIyI=
|
||||||
github.com/oapi-codegen/runtime v1.1.1/go.mod h1:SK9X900oXmPWilYR5/WKPzt3Kqxn/uS/+lbpREv+eCg=
|
github.com/oapi-codegen/runtime v1.1.2/go.mod h1:SK9X900oXmPWilYR5/WKPzt3Kqxn/uS/+lbpREv+eCg=
|
||||||
github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM=
|
github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM=
|
||||||
github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4=
|
github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4=
|
||||||
github.com/prometheus/client_golang v1.20.5 h1:cxppBPuYhUnsO6yo/aoRol4L7q7UFfdm+bR9r+8l63Y=
|
github.com/prometheus/client_golang v1.23.2 h1:Je96obch5RDVy3FDMndoUsjAhG5Edi49h0RJWRi/o0o=
|
||||||
github.com/prometheus/client_golang v1.20.5/go.mod h1:PIEt8X02hGcP8JWbeHyeZ53Y/jReSnHgO035n//V5WE=
|
github.com/prometheus/client_golang v1.23.2/go.mod h1:Tb1a6LWHB3/SPIzCoaDXI4I8UHKeFTEQ1YCr+0Gyqmg=
|
||||||
github.com/prometheus/client_model v0.6.1 h1:ZKSh/rekM+n3CeS952MLRAdFwIKqeY8b62p8ais2e9E=
|
github.com/prometheus/client_model v0.6.2 h1:oBsgwpGs7iVziMvrGhE53c/GrLUsZdHnqNwqPLxwZyk=
|
||||||
github.com/prometheus/client_model v0.6.1/go.mod h1:OrxVMOVHjw3lKMa8+x6HeMGkHMQyHDk9E3jmP2AmGiY=
|
github.com/prometheus/client_model v0.6.2/go.mod h1:y3m2F6Gdpfy6Ut/GBsUqTWZqCUvMVzSfMLjcu6wAwpE=
|
||||||
github.com/prometheus/common v0.55.0 h1:KEi6DK7lXW/m7Ig5i47x0vRzuBsHuvJdi5ee6Y3G1dc=
|
github.com/prometheus/common v0.67.5 h1:pIgK94WWlQt1WLwAC5j2ynLaBRDiinoAb86HZHTUGI4=
|
||||||
github.com/prometheus/common v0.55.0/go.mod h1:2SECS4xJG1kd8XF9IcM1gMX6510RAEL65zxzNImwdc8=
|
github.com/prometheus/common v0.67.5/go.mod h1:SjE/0MzDEEAyrdr5Gqc6G+sXI67maCxzaT3A2+HqjUw=
|
||||||
github.com/prometheus/procfs v0.15.1 h1:YagwOFzUgYfKKHX6Dr+sHT7km/hxC76UB0learggepc=
|
github.com/prometheus/procfs v0.19.2 h1:zUMhqEW66Ex7OXIiDkll3tl9a1ZdilUOd/F6ZXw4Vws=
|
||||||
github.com/prometheus/procfs v0.15.1/go.mod h1:fB45yRUv8NstnjriLhBQLuOUt+WW4BsoGhij/e3PBqk=
|
github.com/prometheus/procfs v0.19.2/go.mod h1:M0aotyiemPhBCM0z5w87kL22CxfcH05ZpYlu+b4J7mw=
|
||||||
github.com/rogpeppe/go-internal v1.10.0 h1:TMyTOH3F/DB16zRVcYyreMH6GnZZrwQVAoYjRBZyWFQ=
|
github.com/rogpeppe/go-internal v1.10.0 h1:TMyTOH3F/DB16zRVcYyreMH6GnZZrwQVAoYjRBZyWFQ=
|
||||||
github.com/rogpeppe/go-internal v1.10.0/go.mod h1:UQnix2H7Ngw/k4C5ijL5+65zddjncjaFoBhdsK/akog=
|
github.com/rogpeppe/go-internal v1.10.0/go.mod h1:UQnix2H7Ngw/k4C5ijL5+65zddjncjaFoBhdsK/akog=
|
||||||
github.com/santhosh-tekuri/jsonschema/v5 v5.3.1 h1:lZUw3E0/J3roVtGQ+SCrUrg3ON6NgVqpn3+iol9aGu4=
|
github.com/santhosh-tekuri/jsonschema/v5 v5.3.1 h1:lZUw3E0/J3roVtGQ+SCrUrg3ON6NgVqpn3+iol9aGu4=
|
||||||
github.com/santhosh-tekuri/jsonschema/v5 v5.3.1/go.mod h1:uToXkOrWAZ6/Oc07xWQrPOhJotwFIyu2bBVN41fcDUY=
|
github.com/santhosh-tekuri/jsonschema/v5 v5.3.1/go.mod h1:uToXkOrWAZ6/Oc07xWQrPOhJotwFIyu2bBVN41fcDUY=
|
||||||
github.com/shopspring/decimal v1.3.1 h1:2Usl1nmF/WZucqkFZhnfFYxxxu8LG21F6nPQBE5gKV8=
|
|
||||||
github.com/shopspring/decimal v1.3.1/go.mod h1:DKyhrW/HYNuLGql+MJL6WCR6knT2jwCFRcu2hWCYk4o=
|
github.com/shopspring/decimal v1.3.1/go.mod h1:DKyhrW/HYNuLGql+MJL6WCR6knT2jwCFRcu2hWCYk4o=
|
||||||
|
github.com/shopspring/decimal v1.4.0 h1:bxl37RwXBklmTi0C79JfXCEBD1cqqHt0bbgBAGFp81k=
|
||||||
|
github.com/shopspring/decimal v1.4.0/go.mod h1:gawqmDU56v4yIKSwfBSFip1HdCCXN8/+DMd9qYNcwME=
|
||||||
github.com/spkg/bom v0.0.0-20160624110644-59b7046e48ad/go.mod h1:qLr4V1qq6nMqFKkMo8ZTx3f+BZEkzsRUY10Xsm2mwU0=
|
github.com/spkg/bom v0.0.0-20160624110644-59b7046e48ad/go.mod h1:qLr4V1qq6nMqFKkMo8ZTx3f+BZEkzsRUY10Xsm2mwU0=
|
||||||
github.com/stmcginnis/gofish v0.15.0 h1:8TG41+lvJk/0Nf8CIIYErxbMlQUy80W0JFRZP3Ld82A=
|
github.com/stmcginnis/gofish v0.20.0 h1:hH2V2Qe898F2wWT1loApnkDUrXXiLKqbSlMaH3Y1n08=
|
||||||
github.com/stmcginnis/gofish v0.15.0/go.mod h1:BLDSFTp8pDlf/xDbLZa+F7f7eW0E/CHCboggsu8CznI=
|
github.com/stmcginnis/gofish v0.20.0/go.mod h1:PzF5i8ecRG9A2ol8XT64npKUunyraJ+7t0kYMpQAtqU=
|
||||||
github.com/stretchr/objx v0.1.0/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME=
|
github.com/stretchr/objx v0.1.0/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME=
|
||||||
github.com/stretchr/objx v0.4.0/go.mod h1:YvHI0jy2hoMjB+UWwv71VJQ9isScKT/TqJzVSSt89Yw=
|
|
||||||
github.com/stretchr/objx v0.5.0/go.mod h1:Yh+to48EsGEfYuaHDzXPcE3xhTkx73EhmCGUpEOglKo=
|
|
||||||
github.com/stretchr/testify v1.3.0/go.mod h1:M5WIy9Dh21IEIfnGCwXGc5bZfKNJtfHm1UVUgZn+9EI=
|
github.com/stretchr/testify v1.3.0/go.mod h1:M5WIy9Dh21IEIfnGCwXGc5bZfKNJtfHm1UVUgZn+9EI=
|
||||||
github.com/stretchr/testify v1.7.1/go.mod h1:6Fq8oRcR53rry900zMqJjRRixrwX3KX962/h/Wwjteg=
|
github.com/stretchr/testify v1.11.1 h1:7s2iGBzp5EwR7/aIZr8ao5+dra3wiQyKjjFuvgVKu7U=
|
||||||
github.com/stretchr/testify v1.8.0/go.mod h1:yNjHg4UonilssWZ8iaSj1OCr/vHnekPRkoO+kdMU+MU=
|
github.com/stretchr/testify v1.11.1/go.mod h1:wZwfW3scLgRK+23gO65QZefKpKQRnfz6sD981Nm4B6U=
|
||||||
github.com/stretchr/testify v1.8.4/go.mod h1:sz/lmYIOXD/1dqDmKjjqLyZ2RngseejIcXlSw2iwfAo=
|
github.com/tklauser/go-sysconf v0.3.16 h1:frioLaCQSsF5Cy1jgRBrzr6t502KIIwQ0MArYICU0nA=
|
||||||
github.com/stretchr/testify v1.9.0 h1:HtqpIVDClZ4nwg75+f6Lvsy/wHu+3BoSGCbBAcpTsTg=
|
github.com/tklauser/go-sysconf v0.3.16/go.mod h1:/qNL9xxDhc7tx3HSRsLWNnuzbVfh3e7gh/BmM179nYI=
|
||||||
github.com/stretchr/testify v1.9.0/go.mod h1:r2ic/lqez/lEtzL7wO/rwa5dbSLXVDPFyf8C91i36aY=
|
github.com/tklauser/numcpus v0.11.0 h1:nSTwhKH5e1dMNsCdVBukSZrURJRoHbSEQjdEbY+9RXw=
|
||||||
github.com/tklauser/go-sysconf v0.3.13 h1:GBUpcahXSpR2xN01jhkNAbTLRk2Yzgggk8IM08lq3r4=
|
github.com/tklauser/numcpus v0.11.0/go.mod h1:z+LwcLq54uWZTX0u/bGobaV34u6V7KNlTZejzM6/3MQ=
|
||||||
github.com/tklauser/go-sysconf v0.3.13/go.mod h1:zwleP4Q4OehZHGn4CYZDipCgg9usW5IJePewFCGVEa0=
|
go.uber.org/goleak v1.3.0 h1:2K3zAYmnTNqV73imy9J1T3WC+gmCePx2hEGkimedGto=
|
||||||
github.com/tklauser/numcpus v0.7.0 h1:yjuerZP127QG9m5Zh/mSO4wqurYil27tHrqwRoRjpr4=
|
go.uber.org/goleak v1.3.0/go.mod h1:CoHD4mav9JJNrW/WLlf7HGZPjdw8EucARQHekz1X6bE=
|
||||||
github.com/tklauser/numcpus v0.7.0/go.mod h1:bb6dMVcj8A42tSE7i32fsIUCbQNllK5iDguyOZRUzAY=
|
go.yaml.in/yaml/v2 v2.4.3 h1:6gvOSjQoTB3vt1l+CU+tSyi/HOjfOjRLJ4YwYZGwRO0=
|
||||||
|
go.yaml.in/yaml/v2 v2.4.3/go.mod h1:zSxWcmIDjOzPXpjlTTbAsKokqkDNAVtZO0WOMiT90s8=
|
||||||
golang.design/x/thread v0.0.0-20210122121316-335e9adffdf1 h1:P7S/GeHBAFEZIYp0ePPs2kHXoazz8q2KsyxHyQVGCJg=
|
golang.design/x/thread v0.0.0-20210122121316-335e9adffdf1 h1:P7S/GeHBAFEZIYp0ePPs2kHXoazz8q2KsyxHyQVGCJg=
|
||||||
golang.design/x/thread v0.0.0-20210122121316-335e9adffdf1/go.mod h1:9CWpnTUmlQkfdpdutA1nNf4iE5lAVt3QZOu0Z6hahBE=
|
golang.design/x/thread v0.0.0-20210122121316-335e9adffdf1/go.mod h1:9CWpnTUmlQkfdpdutA1nNf4iE5lAVt3QZOu0Z6hahBE=
|
||||||
golang.org/x/crypto v0.31.0 h1:ihbySMvVjLAeSH1IbfcRTkD/iNscyz8rGzjF/E5hV6U=
|
golang.org/x/crypto v0.47.0 h1:V6e3FRj+n4dbpw86FJ8Fv7XVOql7TEwpHapKoMJ/GO8=
|
||||||
golang.org/x/crypto v0.31.0/go.mod h1:kDsLvtWBEx7MV9tJOj9bnXsPbxwJQ6csT/x4KIN4Ssk=
|
golang.org/x/crypto v0.47.0/go.mod h1:ff3Y9VzzKbwSSEzWqJsJVBnWmRwRSHt/6Op5n9bQc4A=
|
||||||
golang.org/x/crypto v0.35.0 h1:b15kiHdrGCHrP6LvwaQ3c03kgNhhiMgvlhxHQhmg2Xs=
|
golang.org/x/exp v0.0.0-20260112195511-716be5621a96 h1:Z/6YuSHTLOHfNFdb8zVZomZr7cqNgTJvA8+Qz75D8gU=
|
||||||
golang.org/x/crypto v0.35.0/go.mod h1:dy7dXNW32cAb/6/PRuTNsix8T+vJAqvuIy5Bli/x0YQ=
|
golang.org/x/exp v0.0.0-20260112195511-716be5621a96/go.mod h1:nzimsREAkjBCIEFtHiYkrJyT+2uy9YZJB7H1k68CXZU=
|
||||||
golang.org/x/exp v0.0.0-20250215185904-eff6e970281f h1:oFMYAjX0867ZD2jcNiLBrI9BdpmEkvPyi5YrBGXbamg=
|
golang.org/x/net v0.49.0 h1:eeHFmOGUTtaaPSGNmjBKpbng9MulQsJURQUAfUwY++o=
|
||||||
golang.org/x/exp v0.0.0-20250215185904-eff6e970281f/go.mod h1:BHOTPb3L19zxehTsLoJXVaTktb06DFgmdW6Wb9s8jqk=
|
golang.org/x/net v0.49.0/go.mod h1:/ysNB2EvaqvesRkuLAyjI1ycPZlQHM3q01F02UY/MV8=
|
||||||
golang.org/x/net v0.31.0 h1:68CPQngjLL0r2AlUKiSxtQFKvzRVbnzLwMUn5SzcLHo=
|
|
||||||
golang.org/x/net v0.31.0/go.mod h1:P4fl1q7dY2hnZFxEk4pPSkDHF+QqjitcnDjUQyMM+pM=
|
|
||||||
golang.org/x/net v0.36.0 h1:vWF2fRbw4qslQsQzgFqZff+BItCvGFQqKzKIzx1rmoA=
|
|
||||||
golang.org/x/net v0.36.0/go.mod h1:bFmbeoIPfrw4sMHNhb4J9f6+tPziuGjq7Jk/38fxi1I=
|
|
||||||
golang.org/x/sys v0.0.0-20210122093101-04d7465088b8/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
|
golang.org/x/sys v0.0.0-20210122093101-04d7465088b8/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
|
||||||
golang.org/x/sys v0.28.0 h1:Fksou7UEQUWlKvIdsqzJmUmCX3cZuD2+P3XyyzwMhlA=
|
golang.org/x/sys v0.40.0 h1:DBZZqJ2Rkml6QMQsZywtnjnnGvHza6BTfYFWY9kjEWQ=
|
||||||
golang.org/x/sys v0.28.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA=
|
golang.org/x/sys v0.40.0/go.mod h1:OgkHotnGiDImocRcuBABYBEXf8A9a87e/uXjp9XT3ks=
|
||||||
golang.org/x/sys v0.30.0 h1:QjkSwP/36a20jFYWkSue1YwXzLmsV5Gfq7Eiy72C1uc=
|
golang.org/x/time v0.14.0 h1:MRx4UaLrDotUKUdCIqzPC48t1Y9hANFKIRpNx+Te8PI=
|
||||||
golang.org/x/sys v0.30.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA=
|
golang.org/x/time v0.14.0/go.mod h1:eL/Oa2bBBK0TkX57Fyni+NgnyQQN4LitPmob2Hjnqw4=
|
||||||
golang.org/x/xerrors v0.0.0-20191204190536-9bdfabe68543/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0=
|
golang.org/x/xerrors v0.0.0-20191204190536-9bdfabe68543/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0=
|
||||||
google.golang.org/protobuf v1.35.2 h1:8Ar7bF+apOIoThw1EdZl0p1oWvMqTHmpA2fRTyZO8io=
|
google.golang.org/protobuf v1.36.11 h1:fV6ZwhNocDyBLK0dj+fg8ektcVegBBuEolpbTQyBNVE=
|
||||||
google.golang.org/protobuf v1.35.2/go.mod h1:9fA7Ob0pmnwhb644+1+CVWFRbNajQ6iRojtC/QF5bRE=
|
google.golang.org/protobuf v1.36.11/go.mod h1:HTf+CrKn2C3g5S8VImy6tdcUvCska2kB7j23XfzDpco=
|
||||||
gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0=
|
gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0=
|
||||||
gopkg.in/check.v1 v1.0.0-20200227125254-8fa46927fb4f/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0=
|
gopkg.in/check.v1 v1.0.0-20200227125254-8fa46927fb4f/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0=
|
||||||
gopkg.in/yaml.v3 v3.0.0-20200313102051-9f266ea9e77c/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM=
|
gopkg.in/check.v1 v1.0.0-20201130134442-10cb98267c6c h1:Hei/4ADfdWqJk1ZMxUNpqntNwaWcugrBjAiHlqqRiVk=
|
||||||
|
gopkg.in/check.v1 v1.0.0-20201130134442-10cb98267c6c/go.mod h1:JHkPIbrfpd72SG/EVd6muEfDQjcINNoR0C8j2r3qZ4Q=
|
||||||
gopkg.in/yaml.v3 v3.0.0-20200615113413-eeeca48fe776/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM=
|
gopkg.in/yaml.v3 v3.0.0-20200615113413-eeeca48fe776/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM=
|
||||||
gopkg.in/yaml.v3 v3.0.1 h1:fxVm/GzAzEWqLHuvctI91KS9hhNmmWOoWu0XTYJS7CA=
|
gopkg.in/yaml.v3 v3.0.1 h1:fxVm/GzAzEWqLHuvctI91KS9hhNmmWOoWu0XTYJS7CA=
|
||||||
gopkg.in/yaml.v3 v3.0.1/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM=
|
gopkg.in/yaml.v3 v3.0.1/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM=
|
||||||
|
|||||||
@@ -1,17 +1,26 @@
|
|||||||
|
// Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
|
||||||
|
// All rights reserved. This file is part of cc-lib.
|
||||||
|
// Use of this source code is governed by a MIT-style
|
||||||
|
// license that can be found in the LICENSE file.
|
||||||
|
// additional authors:
|
||||||
|
// Holger Obermaier (NHR@KIT)
|
||||||
|
|
||||||
package metricAggregator
|
package metricAggregator
|
||||||
|
|
||||||
import (
|
import (
|
||||||
"context"
|
"context"
|
||||||
"fmt"
|
"fmt"
|
||||||
|
"maps"
|
||||||
"math"
|
"math"
|
||||||
"os"
|
"os"
|
||||||
|
"slices"
|
||||||
"strings"
|
"strings"
|
||||||
"sync"
|
"sync"
|
||||||
"time"
|
"time"
|
||||||
|
|
||||||
cclog "github.com/ClusterCockpit/cc-lib/ccLogger"
|
cclog "github.com/ClusterCockpit/cc-lib/v2/ccLogger"
|
||||||
|
|
||||||
lp "github.com/ClusterCockpit/cc-lib/ccMessage"
|
lp "github.com/ClusterCockpit/cc-lib/v2/ccMessage"
|
||||||
topo "github.com/ClusterCockpit/cc-metric-collector/pkg/ccTopology"
|
topo "github.com/ClusterCockpit/cc-metric-collector/pkg/ccTopology"
|
||||||
|
|
||||||
"github.com/PaesslerAG/gval"
|
"github.com/PaesslerAG/gval"
|
||||||
@@ -29,7 +38,7 @@ type MetricAggregatorIntervalConfig struct {
|
|||||||
|
|
||||||
type metricAggregator struct {
|
type metricAggregator struct {
|
||||||
functions []*MetricAggregatorIntervalConfig
|
functions []*MetricAggregatorIntervalConfig
|
||||||
constants map[string]interface{}
|
constants map[string]any
|
||||||
language gval.Language
|
language gval.Language
|
||||||
output chan lp.CCMessage
|
output chan lp.CCMessage
|
||||||
}
|
}
|
||||||
@@ -77,7 +86,7 @@ var evaluables = struct {
|
|||||||
func (c *metricAggregator) Init(output chan lp.CCMessage) error {
|
func (c *metricAggregator) Init(output chan lp.CCMessage) error {
|
||||||
c.output = output
|
c.output = output
|
||||||
c.functions = make([]*MetricAggregatorIntervalConfig, 0)
|
c.functions = make([]*MetricAggregatorIntervalConfig, 0)
|
||||||
c.constants = make(map[string]interface{})
|
c.constants = make(map[string]any)
|
||||||
|
|
||||||
// add constants like hostname, numSockets, ... to constants list
|
// add constants like hostname, numSockets, ... to constants list
|
||||||
// Set hostname
|
// Set hostname
|
||||||
@@ -113,10 +122,8 @@ func (c *metricAggregator) Init(output chan lp.CCMessage) error {
|
|||||||
}
|
}
|
||||||
|
|
||||||
func (c *metricAggregator) Eval(starttime time.Time, endtime time.Time, metrics []lp.CCMessage) {
|
func (c *metricAggregator) Eval(starttime time.Time, endtime time.Time, metrics []lp.CCMessage) {
|
||||||
vars := make(map[string]interface{})
|
vars := make(map[string]any)
|
||||||
for k, v := range c.constants {
|
maps.Copy(vars, c.constants)
|
||||||
vars[k] = v
|
|
||||||
}
|
|
||||||
vars["starttime"] = starttime
|
vars["starttime"] = starttime
|
||||||
vars["endtime"] = endtime
|
vars["endtime"] = endtime
|
||||||
for _, f := range c.functions {
|
for _, f := range c.functions {
|
||||||
@@ -256,15 +263,15 @@ func (c *metricAggregator) Eval(starttime time.Time, endtime time.Time, metrics
|
|||||||
var m lp.CCMessage
|
var m lp.CCMessage
|
||||||
switch t := value.(type) {
|
switch t := value.(type) {
|
||||||
case float64:
|
case float64:
|
||||||
m, err = lp.NewMessage(f.Name, tags, meta, map[string]interface{}{"value": t}, starttime)
|
m, err = lp.NewMessage(f.Name, tags, meta, map[string]any{"value": t}, starttime)
|
||||||
case float32:
|
case float32:
|
||||||
m, err = lp.NewMessage(f.Name, tags, meta, map[string]interface{}{"value": t}, starttime)
|
m, err = lp.NewMessage(f.Name, tags, meta, map[string]any{"value": t}, starttime)
|
||||||
case int:
|
case int:
|
||||||
m, err = lp.NewMessage(f.Name, tags, meta, map[string]interface{}{"value": t}, starttime)
|
m, err = lp.NewMessage(f.Name, tags, meta, map[string]any{"value": t}, starttime)
|
||||||
case int64:
|
case int64:
|
||||||
m, err = lp.NewMessage(f.Name, tags, meta, map[string]interface{}{"value": t}, starttime)
|
m, err = lp.NewMessage(f.Name, tags, meta, map[string]any{"value": t}, starttime)
|
||||||
case string:
|
case string:
|
||||||
m, err = lp.NewMessage(f.Name, tags, meta, map[string]interface{}{"value": t}, starttime)
|
m, err = lp.NewMessage(f.Name, tags, meta, map[string]any{"value": t}, starttime)
|
||||||
default:
|
default:
|
||||||
cclog.ComponentError("MetricCache", "Gval returned invalid type", t, "skipping metric", f.Name)
|
cclog.ComponentError("MetricCache", "Gval returned invalid type", t, "skipping metric", f.Name)
|
||||||
}
|
}
|
||||||
@@ -322,18 +329,19 @@ func (c *metricAggregator) AddAggregation(name, function, condition string, tags
|
|||||||
}
|
}
|
||||||
|
|
||||||
func (c *metricAggregator) DeleteAggregation(name string) error {
|
func (c *metricAggregator) DeleteAggregation(name string) error {
|
||||||
for i, agg := range c.functions {
|
i := slices.IndexFunc(
|
||||||
if agg.Name == name {
|
c.functions,
|
||||||
copy(c.functions[i:], c.functions[i+1:])
|
func(agg *MetricAggregatorIntervalConfig) bool {
|
||||||
c.functions[len(c.functions)-1] = nil
|
return agg.Name == name
|
||||||
c.functions = c.functions[:len(c.functions)-1]
|
})
|
||||||
return nil
|
if i == -1 {
|
||||||
}
|
|
||||||
}
|
|
||||||
return fmt.Errorf("no aggregation for metric name %s", name)
|
return fmt.Errorf("no aggregation for metric name %s", name)
|
||||||
}
|
}
|
||||||
|
c.functions = slices.Delete(c.functions, i, i+1)
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
|
||||||
func (c *metricAggregator) AddConstant(name string, value interface{}) {
|
func (c *metricAggregator) AddConstant(name string, value any) {
|
||||||
c.constants[name] = value
|
c.constants[name] = value
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -341,11 +349,11 @@ func (c *metricAggregator) DelConstant(name string) {
|
|||||||
delete(c.constants, name)
|
delete(c.constants, name)
|
||||||
}
|
}
|
||||||
|
|
||||||
func (c *metricAggregator) AddFunction(name string, function func(args ...interface{}) (interface{}, error)) {
|
func (c *metricAggregator) AddFunction(name string, function func(args ...any) (any, error)) {
|
||||||
c.language = gval.NewLanguage(c.language, gval.Function(name, function))
|
c.language = gval.NewLanguage(c.language, gval.Function(name, function))
|
||||||
}
|
}
|
||||||
|
|
||||||
func EvalBoolCondition(condition string, params map[string]interface{}) (bool, error) {
|
func EvalBoolCondition(condition string, params map[string]any) (bool, error) {
|
||||||
evaluables.mutex.Lock()
|
evaluables.mutex.Lock()
|
||||||
evaluable, ok := evaluables.mapping[condition]
|
evaluable, ok := evaluables.mapping[condition]
|
||||||
evaluables.mutex.Unlock()
|
evaluables.mutex.Unlock()
|
||||||
|
|||||||
@@ -1,13 +1,19 @@
|
|||||||
|
// Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
|
||||||
|
// All rights reserved. This file is part of cc-lib.
|
||||||
|
// Use of this source code is governed by a MIT-style
|
||||||
|
// license that can be found in the LICENSE file.
|
||||||
|
// additional authors:
|
||||||
|
// Holger Obermaier (NHR@KIT)
|
||||||
|
|
||||||
package metricAggregator
|
package metricAggregator
|
||||||
|
|
||||||
import (
|
import (
|
||||||
"errors"
|
"errors"
|
||||||
"fmt"
|
"fmt"
|
||||||
"regexp"
|
"regexp"
|
||||||
|
"slices"
|
||||||
"strings"
|
"strings"
|
||||||
|
|
||||||
"golang.org/x/exp/slices"
|
|
||||||
|
|
||||||
topo "github.com/ClusterCockpit/cc-metric-collector/pkg/ccTopology"
|
topo "github.com/ClusterCockpit/cc-metric-collector/pkg/ccTopology"
|
||||||
)
|
)
|
||||||
|
|
||||||
@@ -27,7 +33,7 @@ func sumAnyType[T float64 | float32 | int | int32 | int64](values []T) (T, error
|
|||||||
}
|
}
|
||||||
|
|
||||||
// Sum up values
|
// Sum up values
|
||||||
func sumfunc(args interface{}) (interface{}, error) {
|
func sumfunc(args any) (any, error) {
|
||||||
|
|
||||||
var err error
|
var err error
|
||||||
switch values := args.(type) {
|
switch values := args.(type) {
|
||||||
@@ -56,7 +62,7 @@ func minAnyType[T float64 | float32 | int | int32 | int64](values []T) (T, error
|
|||||||
}
|
}
|
||||||
|
|
||||||
// Get the minimum value
|
// Get the minimum value
|
||||||
func minfunc(args interface{}) (interface{}, error) {
|
func minfunc(args any) (any, error) {
|
||||||
switch values := args.(type) {
|
switch values := args.(type) {
|
||||||
case []float64:
|
case []float64:
|
||||||
return minAnyType(values)
|
return minAnyType(values)
|
||||||
@@ -77,12 +83,12 @@ func avgAnyType[T float64 | float32 | int | int32 | int64](values []T) (float64,
|
|||||||
if len(values) == 0 {
|
if len(values) == 0 {
|
||||||
return 0.0, errors.New("average function requires at least one argument")
|
return 0.0, errors.New("average function requires at least one argument")
|
||||||
}
|
}
|
||||||
sum, err := sumAnyType[T](values)
|
sum, err := sumAnyType(values)
|
||||||
return float64(sum) / float64(len(values)), err
|
return float64(sum) / float64(len(values)), err
|
||||||
}
|
}
|
||||||
|
|
||||||
// Get the average or mean value
|
// Get the average or mean value
|
||||||
func avgfunc(args interface{}) (interface{}, error) {
|
func avgfunc(args any) (any, error) {
|
||||||
switch values := args.(type) {
|
switch values := args.(type) {
|
||||||
case []float64:
|
case []float64:
|
||||||
return avgAnyType(values)
|
return avgAnyType(values)
|
||||||
@@ -107,7 +113,7 @@ func maxAnyType[T float64 | float32 | int | int32 | int64](values []T) (T, error
|
|||||||
}
|
}
|
||||||
|
|
||||||
// Get the maximum value
|
// Get the maximum value
|
||||||
func maxfunc(args interface{}) (interface{}, error) {
|
func maxfunc(args any) (any, error) {
|
||||||
switch values := args.(type) {
|
switch values := args.(type) {
|
||||||
case []float64:
|
case []float64:
|
||||||
return maxAnyType(values)
|
return maxAnyType(values)
|
||||||
@@ -139,7 +145,7 @@ func medianAnyType[T float64 | float32 | int | int32 | int64](values []T) (T, er
|
|||||||
}
|
}
|
||||||
|
|
||||||
// Get the median value
|
// Get the median value
|
||||||
func medianfunc(args interface{}) (interface{}, error) {
|
func medianfunc(args any) (any, error) {
|
||||||
switch values := args.(type) {
|
switch values := args.(type) {
|
||||||
case []float64:
|
case []float64:
|
||||||
return medianAnyType(values)
|
return medianAnyType(values)
|
||||||
@@ -160,9 +166,9 @@ func medianfunc(args interface{}) (interface{}, error) {
|
|||||||
* Get number of values in list. Returns always an int
|
* Get number of values in list. Returns always an int
|
||||||
*/
|
*/
|
||||||
|
|
||||||
func lenfunc(args interface{}) (interface{}, error) {
|
func lenfunc(args any) (any, error) {
|
||||||
var err error = nil
|
var err error = nil
|
||||||
var length int = 0
|
length := 0
|
||||||
switch values := args.(type) {
|
switch values := args.(type) {
|
||||||
case []float64:
|
case []float64:
|
||||||
length = len(values)
|
length = len(values)
|
||||||
@@ -174,13 +180,7 @@ func lenfunc(args interface{}) (interface{}, error) {
|
|||||||
length = len(values)
|
length = len(values)
|
||||||
case []int32:
|
case []int32:
|
||||||
length = len(values)
|
length = len(values)
|
||||||
case float64:
|
case float64, float32, int, int64:
|
||||||
err = errors.New("function 'len' can only be applied on arrays and strings")
|
|
||||||
case float32:
|
|
||||||
err = errors.New("function 'len' can only be applied on arrays and strings")
|
|
||||||
case int:
|
|
||||||
err = errors.New("function 'len' can only be applied on arrays and strings")
|
|
||||||
case int64:
|
|
||||||
err = errors.New("function 'len' can only be applied on arrays and strings")
|
err = errors.New("function 'len' can only be applied on arrays and strings")
|
||||||
case string:
|
case string:
|
||||||
length = len(values)
|
length = len(values)
|
||||||
@@ -190,13 +190,13 @@ func lenfunc(args interface{}) (interface{}, error) {
|
|||||||
|
|
||||||
/*
|
/*
|
||||||
* Check if a values is in a list
|
* Check if a values is in a list
|
||||||
* In constrast to most of the other functions, this one is an infix operator for
|
* In contrast to most of the other functions, this one is an infix operator for
|
||||||
* - substring matching: `"abc" in "abcdef"` -> true
|
* - substring matching: `"abc" in "abcdef"` -> true
|
||||||
* - substring matching with int casting: `3 in "abd3"` -> true
|
* - substring matching with int casting: `3 in "abd3"` -> true
|
||||||
* - search for an int in an int list: `3 in getCpuList()` -> true (if you have more than 4 CPU hardware threads)
|
* - search for an int in an int list: `3 in getCpuList()` -> true (if you have more than 4 CPU hardware threads)
|
||||||
*/
|
*/
|
||||||
|
|
||||||
func infunc(a interface{}, b interface{}) (interface{}, error) {
|
func infunc(a any, b any) (any, error) {
|
||||||
switch match := a.(type) {
|
switch match := a.(type) {
|
||||||
case string:
|
case string:
|
||||||
switch total := b.(type) {
|
switch total := b.(type) {
|
||||||
@@ -206,11 +206,7 @@ func infunc(a interface{}, b interface{}) (interface{}, error) {
|
|||||||
case int:
|
case int:
|
||||||
switch total := b.(type) {
|
switch total := b.(type) {
|
||||||
case []int:
|
case []int:
|
||||||
for _, x := range total {
|
return slices.Contains(total, match), nil
|
||||||
if x == match {
|
|
||||||
return true, nil
|
|
||||||
}
|
|
||||||
}
|
|
||||||
case string:
|
case string:
|
||||||
smatch := fmt.Sprintf("%d", match)
|
smatch := fmt.Sprintf("%d", match)
|
||||||
return strings.Contains(total, smatch), nil
|
return strings.Contains(total, smatch), nil
|
||||||
@@ -226,12 +222,12 @@ func infunc(a interface{}, b interface{}) (interface{}, error) {
|
|||||||
* format keys \d = %d, \w = %d, ... Not sure how to fix this
|
* format keys \d = %d, \w = %d, ... Not sure how to fix this
|
||||||
*/
|
*/
|
||||||
|
|
||||||
func matchfunc(args ...interface{}) (interface{}, error) {
|
func matchfunc(args ...any) (any, error) {
|
||||||
switch match := args[0].(type) {
|
switch match := args[0].(type) {
|
||||||
case string:
|
case string:
|
||||||
switch total := args[1].(type) {
|
switch total := args[1].(type) {
|
||||||
case string:
|
case string:
|
||||||
smatch := strings.Replace(match, "%", "\\", -1)
|
smatch := strings.ReplaceAll(match, "%", "\\")
|
||||||
regex, err := regexp.Compile(smatch)
|
regex, err := regexp.Compile(smatch)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return false, err
|
return false, err
|
||||||
@@ -248,7 +244,7 @@ func matchfunc(args ...interface{}) (interface{}, error) {
|
|||||||
*/
|
*/
|
||||||
|
|
||||||
// for a given cpuid, it returns the core id
|
// for a given cpuid, it returns the core id
|
||||||
func getCpuCoreFunc(args interface{}) (interface{}, error) {
|
func getCpuCoreFunc(args any) (any, error) {
|
||||||
switch cpuid := args.(type) {
|
switch cpuid := args.(type) {
|
||||||
case int:
|
case int:
|
||||||
return topo.GetHwthreadCore(cpuid), nil
|
return topo.GetHwthreadCore(cpuid), nil
|
||||||
@@ -257,7 +253,7 @@ func getCpuCoreFunc(args interface{}) (interface{}, error) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
// for a given cpuid, it returns the socket id
|
// for a given cpuid, it returns the socket id
|
||||||
func getCpuSocketFunc(args interface{}) (interface{}, error) {
|
func getCpuSocketFunc(args any) (any, error) {
|
||||||
switch cpuid := args.(type) {
|
switch cpuid := args.(type) {
|
||||||
case int:
|
case int:
|
||||||
return topo.GetHwthreadSocket(cpuid), nil
|
return topo.GetHwthreadSocket(cpuid), nil
|
||||||
@@ -266,7 +262,7 @@ func getCpuSocketFunc(args interface{}) (interface{}, error) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
// for a given cpuid, it returns the id of the NUMA node
|
// for a given cpuid, it returns the id of the NUMA node
|
||||||
func getCpuNumaDomainFunc(args interface{}) (interface{}, error) {
|
func getCpuNumaDomainFunc(args any) (any, error) {
|
||||||
switch cpuid := args.(type) {
|
switch cpuid := args.(type) {
|
||||||
case int:
|
case int:
|
||||||
return topo.GetHwthreadNumaDomain(cpuid), nil
|
return topo.GetHwthreadNumaDomain(cpuid), nil
|
||||||
@@ -275,7 +271,7 @@ func getCpuNumaDomainFunc(args interface{}) (interface{}, error) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
// for a given cpuid, it returns the id of the CPU die
|
// for a given cpuid, it returns the id of the CPU die
|
||||||
func getCpuDieFunc(args interface{}) (interface{}, error) {
|
func getCpuDieFunc(args any) (any, error) {
|
||||||
switch cpuid := args.(type) {
|
switch cpuid := args.(type) {
|
||||||
case int:
|
case int:
|
||||||
return topo.GetHwthreadDie(cpuid), nil
|
return topo.GetHwthreadDie(cpuid), nil
|
||||||
@@ -284,7 +280,7 @@ func getCpuDieFunc(args interface{}) (interface{}, error) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
// for a given core id, it returns the list of cpuids
|
// for a given core id, it returns the list of cpuids
|
||||||
func getCpuListOfCoreFunc(args interface{}) (interface{}, error) {
|
func getCpuListOfCoreFunc(args any) (any, error) {
|
||||||
cpulist := make([]int, 0)
|
cpulist := make([]int, 0)
|
||||||
switch in := args.(type) {
|
switch in := args.(type) {
|
||||||
case int:
|
case int:
|
||||||
@@ -298,7 +294,7 @@ func getCpuListOfCoreFunc(args interface{}) (interface{}, error) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
// for a given socket id, it returns the list of cpuids
|
// for a given socket id, it returns the list of cpuids
|
||||||
func getCpuListOfSocketFunc(args interface{}) (interface{}, error) {
|
func getCpuListOfSocketFunc(args any) (any, error) {
|
||||||
cpulist := make([]int, 0)
|
cpulist := make([]int, 0)
|
||||||
switch in := args.(type) {
|
switch in := args.(type) {
|
||||||
case int:
|
case int:
|
||||||
@@ -312,7 +308,7 @@ func getCpuListOfSocketFunc(args interface{}) (interface{}, error) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
// for a given id of a NUMA domain, it returns the list of cpuids
|
// for a given id of a NUMA domain, it returns the list of cpuids
|
||||||
func getCpuListOfNumaDomainFunc(args interface{}) (interface{}, error) {
|
func getCpuListOfNumaDomainFunc(args any) (any, error) {
|
||||||
cpulist := make([]int, 0)
|
cpulist := make([]int, 0)
|
||||||
switch in := args.(type) {
|
switch in := args.(type) {
|
||||||
case int:
|
case int:
|
||||||
@@ -326,7 +322,7 @@ func getCpuListOfNumaDomainFunc(args interface{}) (interface{}, error) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
// for a given CPU die id, it returns the list of cpuids
|
// for a given CPU die id, it returns the list of cpuids
|
||||||
func getCpuListOfDieFunc(args interface{}) (interface{}, error) {
|
func getCpuListOfDieFunc(args any) (any, error) {
|
||||||
cpulist := make([]int, 0)
|
cpulist := make([]int, 0)
|
||||||
switch in := args.(type) {
|
switch in := args.(type) {
|
||||||
case int:
|
case int:
|
||||||
@@ -340,14 +336,14 @@ func getCpuListOfDieFunc(args interface{}) (interface{}, error) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
// wrapper function to get a list of all cpuids of the node
|
// wrapper function to get a list of all cpuids of the node
|
||||||
func getCpuListOfNode() (interface{}, error) {
|
func getCpuListOfNode() (any, error) {
|
||||||
return topo.HwthreadList(), nil
|
return topo.HwthreadList(), nil
|
||||||
}
|
}
|
||||||
|
|
||||||
// helper function to get the cpuid list for a CCMetric type tag set (type and type-id)
|
// helper function to get the cpuid list for a CCMetric type tag set (type and type-id)
|
||||||
// since there is no access to the metric data in the function, it should be called like
|
// since there is no access to the metric data in the function, it should be called like
|
||||||
// `getCpuListOfType()`
|
// `getCpuListOfType()`
|
||||||
func getCpuListOfType(args ...interface{}) (interface{}, error) {
|
func getCpuListOfType(args ...any) (any, error) {
|
||||||
cpulist := make([]int, 0)
|
cpulist := make([]int, 0)
|
||||||
switch typ := args[0].(type) {
|
switch typ := args[0].(type) {
|
||||||
case string:
|
case string:
|
||||||
|
|||||||
@@ -236,13 +236,13 @@ __deprecated__
|
|||||||
|
|
||||||
|
|
||||||
The cc-metric-collector tries to read the data from the system as it is reported. If available, it tries to read the metric unit from the system as well (e.g. from `/proc/meminfo`). The problem is that, depending on the source, the metric units are named differently. Just think about `byte`, `Byte`, `B`, `bytes`, ...
|
The cc-metric-collector tries to read the data from the system as it is reported. If available, it tries to read the metric unit from the system as well (e.g. from `/proc/meminfo`). The problem is that, depending on the source, the metric units are named differently. Just think about `byte`, `Byte`, `B`, `bytes`, ...
|
||||||
The [cc-units](https://github.com/ClusterCockpit/cc-units) package provides us a normalization option to use the same metric unit name for all metrics. If this option is set to true, all `unit` meta tags are normalized.
|
The [cc-units](https://github.com/ClusterCockpit/cc-lib/ccUnits) package provides us a normalization option to use the same metric unit name for all metrics. If this option is set to true, all `unit` meta tags are normalized.
|
||||||
|
|
||||||
## The `change_unit_prefix` section
|
## The `change_unit_prefix` section
|
||||||
|
|
||||||
__deprecated__
|
__deprecated__
|
||||||
|
|
||||||
It is often the case that metrics are reported by the system using a rather outdated unit prefix (like `/proc/meminfo` still uses kByte even though current memory sizes are in the GByte range). If you want to change the prefix of a unit, you can do that with the help of [cc-units](https://github.com/ClusterCockpit/cc-units). The setting works on the metric name and requires the new prefix for the metric. The cc-units package determines the scaling factor.
|
It is often the case that metrics are reported by the system using a rather outdated unit prefix (like `/proc/meminfo` still uses kByte even though current memory sizes are in the GByte range). If you want to change the prefix of a unit, you can do that with the help of [cc-units](https://github.com/ClusterCockpit/cc-lib/ccUnits). The setting works on the metric name and requires the new prefix for the metric. The cc-units package determines the scaling factor.
|
||||||
|
|
||||||
# Aggregate metric values of the current interval with the `interval_aggregates` option
|
# Aggregate metric values of the current interval with the `interval_aggregates` option
|
||||||
|
|
||||||
|
|||||||
@@ -1,12 +1,19 @@
|
|||||||
|
// Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
|
||||||
|
// All rights reserved. This file is part of cc-lib.
|
||||||
|
// Use of this source code is governed by a MIT-style
|
||||||
|
// license that can be found in the LICENSE file.
|
||||||
|
// additional authors:
|
||||||
|
// Holger Obermaier (NHR@KIT)
|
||||||
|
|
||||||
package metricRouter
|
package metricRouter
|
||||||
|
|
||||||
import (
|
import (
|
||||||
"sync"
|
"sync"
|
||||||
"time"
|
"time"
|
||||||
|
|
||||||
cclog "github.com/ClusterCockpit/cc-lib/ccLogger"
|
cclog "github.com/ClusterCockpit/cc-lib/v2/ccLogger"
|
||||||
|
|
||||||
lp "github.com/ClusterCockpit/cc-lib/ccMessage"
|
lp "github.com/ClusterCockpit/cc-lib/v2/ccMessage"
|
||||||
agg "github.com/ClusterCockpit/cc-metric-collector/internal/metricAggregator"
|
agg "github.com/ClusterCockpit/cc-metric-collector/internal/metricAggregator"
|
||||||
mct "github.com/ClusterCockpit/cc-metric-collector/pkg/multiChanTicker"
|
mct "github.com/ClusterCockpit/cc-metric-collector/pkg/multiChanTicker"
|
||||||
)
|
)
|
||||||
@@ -44,7 +51,7 @@ type MetricCache interface {
|
|||||||
}
|
}
|
||||||
|
|
||||||
func (c *metricCache) Init(output chan lp.CCMessage, ticker mct.MultiChanTicker, wg *sync.WaitGroup, numPeriods int) error {
|
func (c *metricCache) Init(output chan lp.CCMessage, ticker mct.MultiChanTicker, wg *sync.WaitGroup, numPeriods int) error {
|
||||||
var err error = nil
|
var err error
|
||||||
c.done = make(chan bool)
|
c.done = make(chan bool)
|
||||||
c.wg = wg
|
c.wg = wg
|
||||||
c.ticker = ticker
|
c.ticker = ticker
|
||||||
@@ -154,8 +161,8 @@ func (c *metricCache) DeleteAggregation(name string) error {
|
|||||||
// is the current one, index=1 the last interval and so on. Returns an empty array if a wrong index
|
// is the current one, index=1 the last interval and so on. Returns an empty array if a wrong index
|
||||||
// is given (negative index, index larger than configured number of total intervals, ...)
|
// is given (negative index, index larger than configured number of total intervals, ...)
|
||||||
func (c *metricCache) GetPeriod(index int) (time.Time, time.Time, []lp.CCMessage) {
|
func (c *metricCache) GetPeriod(index int) (time.Time, time.Time, []lp.CCMessage) {
|
||||||
var start time.Time = time.Now()
|
start := time.Now()
|
||||||
var stop time.Time = time.Now()
|
stop := time.Now()
|
||||||
var metrics []lp.CCMessage
|
var metrics []lp.CCMessage
|
||||||
if index >= 0 && index < c.numPeriods {
|
if index >= 0 && index < c.numPeriods {
|
||||||
pindex := c.curPeriod - index
|
pindex := c.curPeriod - index
|
||||||
|
|||||||
@@ -1,17 +1,25 @@
|
|||||||
|
// Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
|
||||||
|
// All rights reserved. This file is part of cc-lib.
|
||||||
|
// Use of this source code is governed by a MIT-style
|
||||||
|
// license that can be found in the LICENSE file.
|
||||||
|
// additional authors:
|
||||||
|
// Holger Obermaier (NHR@KIT)
|
||||||
|
|
||||||
package metricRouter
|
package metricRouter
|
||||||
|
|
||||||
import (
|
import (
|
||||||
"encoding/json"
|
"encoding/json"
|
||||||
"fmt"
|
"fmt"
|
||||||
|
"maps"
|
||||||
"os"
|
"os"
|
||||||
"strings"
|
"strings"
|
||||||
"sync"
|
"sync"
|
||||||
"time"
|
"time"
|
||||||
|
|
||||||
cclog "github.com/ClusterCockpit/cc-lib/ccLogger"
|
cclog "github.com/ClusterCockpit/cc-lib/v2/ccLogger"
|
||||||
|
|
||||||
lp "github.com/ClusterCockpit/cc-lib/ccMessage"
|
lp "github.com/ClusterCockpit/cc-lib/v2/ccMessage"
|
||||||
mp "github.com/ClusterCockpit/cc-lib/messageProcessor"
|
mp "github.com/ClusterCockpit/cc-lib/v2/messageProcessor"
|
||||||
agg "github.com/ClusterCockpit/cc-metric-collector/internal/metricAggregator"
|
agg "github.com/ClusterCockpit/cc-metric-collector/internal/metricAggregator"
|
||||||
mct "github.com/ClusterCockpit/cc-metric-collector/pkg/multiChanTicker"
|
mct "github.com/ClusterCockpit/cc-metric-collector/pkg/multiChanTicker"
|
||||||
)
|
)
|
||||||
@@ -100,10 +108,8 @@ func (r *metricRouter) Init(ticker mct.MultiChanTicker, wg *sync.WaitGroup, rout
|
|||||||
cclog.ComponentError("MetricRouter", err.Error())
|
cclog.ComponentError("MetricRouter", err.Error())
|
||||||
return err
|
return err
|
||||||
}
|
}
|
||||||
r.maxForward = 1
|
r.maxForward = max(1, r.config.MaxForward)
|
||||||
if r.config.MaxForward > r.maxForward {
|
|
||||||
r.maxForward = r.config.MaxForward
|
|
||||||
}
|
|
||||||
if r.config.NumCacheIntervals > 0 {
|
if r.config.NumCacheIntervals > 0 {
|
||||||
r.cache, err = NewCache(r.cache_input, r.ticker, &r.cachewg, r.config.NumCacheIntervals)
|
r.cache, err = NewCache(r.cache_input, r.ticker, &r.cachewg, r.config.NumCacheIntervals)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
@@ -111,50 +117,74 @@ func (r *metricRouter) Init(ticker mct.MultiChanTicker, wg *sync.WaitGroup, rout
|
|||||||
return err
|
return err
|
||||||
}
|
}
|
||||||
for _, agg := range r.config.IntervalAgg {
|
for _, agg := range r.config.IntervalAgg {
|
||||||
r.cache.AddAggregation(agg.Name, agg.Function, agg.Condition, agg.Tags, agg.Meta)
|
err = r.cache.AddAggregation(agg.Name, agg.Function, agg.Condition, agg.Tags, agg.Meta)
|
||||||
|
if err != nil {
|
||||||
|
return fmt.Errorf("MetricCache AddAggregation() failed: %w", err)
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
p, err := mp.NewMessageProcessor()
|
p, err := mp.NewMessageProcessor()
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return fmt.Errorf("initialization of message processor failed: %v", err.Error())
|
return fmt.Errorf("MessageProcessor NewMessageProcessor() failed: %w", err)
|
||||||
}
|
}
|
||||||
r.mp = p
|
r.mp = p
|
||||||
|
|
||||||
if len(r.config.MessageProcessor) > 0 {
|
if len(r.config.MessageProcessor) > 0 {
|
||||||
err = r.mp.FromConfigJSON(r.config.MessageProcessor)
|
err = r.mp.FromConfigJSON(r.config.MessageProcessor)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return fmt.Errorf("failed parsing JSON for message processor: %v", err.Error())
|
return fmt.Errorf("MessageProcessor FromConfigJSON() failed: %w", err)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
for _, mname := range r.config.DropMetrics {
|
for _, mname := range r.config.DropMetrics {
|
||||||
r.mp.AddDropMessagesByName(mname)
|
err = r.mp.AddDropMessagesByName(mname)
|
||||||
|
if err != nil {
|
||||||
|
return fmt.Errorf("MessageProcessor AddDropMessagesByName() failed: %w", err)
|
||||||
|
}
|
||||||
}
|
}
|
||||||
for _, cond := range r.config.DropMetricsIf {
|
for _, cond := range r.config.DropMetricsIf {
|
||||||
r.mp.AddDropMessagesByCondition(cond)
|
err = r.mp.AddDropMessagesByCondition(cond)
|
||||||
|
if err != nil {
|
||||||
|
return fmt.Errorf("MessageProcessor AddDropMessagesByCondition() failed: %w", err)
|
||||||
|
}
|
||||||
}
|
}
|
||||||
for _, data := range r.config.AddTags {
|
for _, data := range r.config.AddTags {
|
||||||
cond := data.Condition
|
cond := data.Condition
|
||||||
if cond == "*" {
|
if cond == "*" {
|
||||||
cond = "true"
|
cond = "true"
|
||||||
}
|
}
|
||||||
r.mp.AddAddTagsByCondition(cond, data.Key, data.Value)
|
err = r.mp.AddAddTagsByCondition(cond, data.Key, data.Value)
|
||||||
|
if err != nil {
|
||||||
|
return fmt.Errorf("MessageProcessor AddAddTagsByCondition() failed: %w", err)
|
||||||
|
}
|
||||||
}
|
}
|
||||||
for _, data := range r.config.DelTags {
|
for _, data := range r.config.DelTags {
|
||||||
cond := data.Condition
|
cond := data.Condition
|
||||||
if cond == "*" {
|
if cond == "*" {
|
||||||
cond = "true"
|
cond = "true"
|
||||||
}
|
}
|
||||||
r.mp.AddDeleteTagsByCondition(cond, data.Key, data.Value)
|
err = r.mp.AddDeleteTagsByCondition(cond, data.Key, data.Value)
|
||||||
|
if err != nil {
|
||||||
|
return fmt.Errorf("MessageProcessor AddDeleteTagsByCondition() failed: %w", err)
|
||||||
|
}
|
||||||
}
|
}
|
||||||
for oldname, newname := range r.config.RenameMetrics {
|
for oldname, newname := range r.config.RenameMetrics {
|
||||||
r.mp.AddRenameMetricByName(oldname, newname)
|
err = r.mp.AddRenameMetricByName(oldname, newname)
|
||||||
|
if err != nil {
|
||||||
|
return fmt.Errorf("MessageProcessor AddRenameMetricByName() failed: %w", err)
|
||||||
|
}
|
||||||
}
|
}
|
||||||
for metricName, prefix := range r.config.ChangeUnitPrefix {
|
for metricName, prefix := range r.config.ChangeUnitPrefix {
|
||||||
r.mp.AddChangeUnitPrefix(fmt.Sprintf("name == '%s'", metricName), prefix)
|
err = r.mp.AddChangeUnitPrefix(fmt.Sprintf("name == '%s'", metricName), prefix)
|
||||||
|
if err != nil {
|
||||||
|
return fmt.Errorf("MessageProcessor AddChangeUnitPrefix() failed: %w", err)
|
||||||
|
}
|
||||||
}
|
}
|
||||||
r.mp.SetNormalizeUnits(r.config.NormalizeUnits)
|
r.mp.SetNormalizeUnits(r.config.NormalizeUnits)
|
||||||
|
|
||||||
r.mp.AddAddTagsByCondition("true", r.config.HostnameTagName, r.hostname)
|
err = r.mp.AddAddTagsByCondition("true", r.config.HostnameTagName, r.hostname)
|
||||||
|
if err != nil {
|
||||||
|
return fmt.Errorf("MessageProcessor AddAddTagsByCondition() failed: %w", err)
|
||||||
|
}
|
||||||
|
|
||||||
// r.config.dropMetrics = make(map[string]bool)
|
// r.config.dropMetrics = make(map[string]bool)
|
||||||
// for _, mname := range r.config.DropMetrics {
|
// for _, mname := range r.config.DropMetrics {
|
||||||
@@ -163,8 +193,8 @@ func (r *metricRouter) Init(ticker mct.MultiChanTicker, wg *sync.WaitGroup, rout
|
|||||||
return nil
|
return nil
|
||||||
}
|
}
|
||||||
|
|
||||||
func getParamMap(point lp.CCMessage) map[string]interface{} {
|
func getParamMap(point lp.CCMessage) map[string]any {
|
||||||
params := make(map[string]interface{})
|
params := make(map[string]any)
|
||||||
params["metric"] = point
|
params["metric"] = point
|
||||||
params["name"] = point.Name()
|
params["name"] = point.Name()
|
||||||
for key, value := range point.Tags() {
|
for key, value := range point.Tags() {
|
||||||
@@ -173,9 +203,7 @@ func getParamMap(point lp.CCMessage) map[string]interface{} {
|
|||||||
for key, value := range point.Meta() {
|
for key, value := range point.Meta() {
|
||||||
params[key] = value
|
params[key] = value
|
||||||
}
|
}
|
||||||
for key, value := range point.Fields() {
|
maps.Copy(params, point.Fields())
|
||||||
params[key] = value
|
|
||||||
}
|
|
||||||
params["timestamp"] = point.Time()
|
params["timestamp"] = point.Time()
|
||||||
return params
|
return params
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -1,3 +1,10 @@
|
|||||||
|
// Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
|
||||||
|
// All rights reserved. This file is part of cc-lib.
|
||||||
|
// Use of this source code is governed by a MIT-style
|
||||||
|
// license that can be found in the LICENSE file.
|
||||||
|
// additional authors:
|
||||||
|
// Holger Obermaier (NHR@KIT)
|
||||||
|
|
||||||
package ccTopology
|
package ccTopology
|
||||||
|
|
||||||
import (
|
import (
|
||||||
@@ -6,11 +13,11 @@ import (
|
|||||||
"os"
|
"os"
|
||||||
"path/filepath"
|
"path/filepath"
|
||||||
"regexp"
|
"regexp"
|
||||||
|
"slices"
|
||||||
"strconv"
|
"strconv"
|
||||||
"strings"
|
"strings"
|
||||||
|
|
||||||
cclogger "github.com/ClusterCockpit/cc-lib/ccLogger"
|
cclogger "github.com/ClusterCockpit/cc-lib/v2/ccLogger"
|
||||||
"golang.org/x/exp/slices"
|
|
||||||
)
|
)
|
||||||
|
|
||||||
const SYSFS_CPUBASE = `/sys/devices/system/cpu`
|
const SYSFS_CPUBASE = `/sys/devices/system/cpu`
|
||||||
@@ -73,7 +80,7 @@ func fileToList(path string) []int {
|
|||||||
// Create list
|
// Create list
|
||||||
list := make([]int, 0)
|
list := make([]int, 0)
|
||||||
stringBuffer := strings.TrimSpace(string(buffer))
|
stringBuffer := strings.TrimSpace(string(buffer))
|
||||||
for _, valueRangeString := range strings.Split(stringBuffer, ",") {
|
for valueRangeString := range strings.SplitSeq(stringBuffer, ",") {
|
||||||
valueRange := strings.Split(valueRangeString, "-")
|
valueRange := strings.Split(valueRangeString, "-")
|
||||||
switch len(valueRange) {
|
switch len(valueRange) {
|
||||||
case 1:
|
case 1:
|
||||||
|
|||||||
@@ -1,125 +0,0 @@
|
|||||||
package hostlist
|
|
||||||
|
|
||||||
import (
|
|
||||||
"fmt"
|
|
||||||
"regexp"
|
|
||||||
"sort"
|
|
||||||
"strconv"
|
|
||||||
"strings"
|
|
||||||
)
|
|
||||||
|
|
||||||
func Expand(in string) (result []string, err error) {
|
|
||||||
|
|
||||||
// Create ranges regular expression
|
|
||||||
reStNumber := "[[:digit:]]+"
|
|
||||||
reStRange := reStNumber + "-" + reStNumber
|
|
||||||
reStOptionalNumberOrRange := "(" + reStNumber + ",|" + reStRange + ",)*"
|
|
||||||
reStNumberOrRange := "(" + reStNumber + "|" + reStRange + ")"
|
|
||||||
reStBraceLeft := "[[]"
|
|
||||||
reStBraceRight := "[]]"
|
|
||||||
reStRanges := reStBraceLeft +
|
|
||||||
reStOptionalNumberOrRange +
|
|
||||||
reStNumberOrRange +
|
|
||||||
reStBraceRight
|
|
||||||
reRanges := regexp.MustCompile(reStRanges)
|
|
||||||
|
|
||||||
// Create host list regular expression
|
|
||||||
reStDNSChars := "[a-zA-Z0-9-]+"
|
|
||||||
reStPrefix := "^(" + reStDNSChars + ")"
|
|
||||||
reStOptionalSuffix := "(" + reStDNSChars + ")?"
|
|
||||||
re := regexp.MustCompile(reStPrefix + "([[][0-9,-]+[]])?" + reStOptionalSuffix)
|
|
||||||
|
|
||||||
// Remove all delimiters from the input
|
|
||||||
in = strings.TrimLeft(in, ", ")
|
|
||||||
|
|
||||||
for len(in) > 0 {
|
|
||||||
if v := re.FindStringSubmatch(in); v != nil {
|
|
||||||
|
|
||||||
// Remove matched part from the input
|
|
||||||
lenPrefix := len(v[0])
|
|
||||||
in = in[lenPrefix:]
|
|
||||||
|
|
||||||
// Remove all delimiters from the input
|
|
||||||
in = strings.TrimLeft(in, ", ")
|
|
||||||
|
|
||||||
// matched prefix, range and suffix
|
|
||||||
hlPrefix := v[1]
|
|
||||||
hlRanges := v[2]
|
|
||||||
hlSuffix := v[3]
|
|
||||||
|
|
||||||
// Single node without ranges
|
|
||||||
if hlRanges == "" {
|
|
||||||
result = append(result, hlPrefix)
|
|
||||||
continue
|
|
||||||
}
|
|
||||||
|
|
||||||
// Node with ranges
|
|
||||||
if v := reRanges.FindStringSubmatch(hlRanges); v != nil {
|
|
||||||
|
|
||||||
// Remove braces
|
|
||||||
hlRanges = hlRanges[1 : len(hlRanges)-1]
|
|
||||||
|
|
||||||
// Split host ranges at ,
|
|
||||||
for _, hlRange := range strings.Split(hlRanges, ",") {
|
|
||||||
|
|
||||||
// Split host range at -
|
|
||||||
RangeStartEnd := strings.Split(hlRange, "-")
|
|
||||||
|
|
||||||
// Range is only a single number
|
|
||||||
if len(RangeStartEnd) == 1 {
|
|
||||||
result = append(result, hlPrefix+RangeStartEnd[0]+hlSuffix)
|
|
||||||
continue
|
|
||||||
}
|
|
||||||
|
|
||||||
// Range has a start and an end
|
|
||||||
widthRangeStart := len(RangeStartEnd[0])
|
|
||||||
widthRangeEnd := len(RangeStartEnd[1])
|
|
||||||
iStart, _ := strconv.ParseUint(RangeStartEnd[0], 10, 64)
|
|
||||||
iEnd, _ := strconv.ParseUint(RangeStartEnd[1], 10, 64)
|
|
||||||
if iStart > iEnd {
|
|
||||||
return nil, fmt.Errorf("single range start is greater than end: %s", hlRange)
|
|
||||||
}
|
|
||||||
|
|
||||||
// Create print format string for range numbers
|
|
||||||
doPadding := widthRangeStart == widthRangeEnd
|
|
||||||
widthPadding := widthRangeStart
|
|
||||||
var formatString string
|
|
||||||
if doPadding {
|
|
||||||
formatString = "%0" + fmt.Sprint(widthPadding) + "d"
|
|
||||||
} else {
|
|
||||||
formatString = "%d"
|
|
||||||
}
|
|
||||||
formatString = hlPrefix + formatString + hlSuffix
|
|
||||||
|
|
||||||
// Add nodes from this range
|
|
||||||
for i := iStart; i <= iEnd; i++ {
|
|
||||||
result = append(result, fmt.Sprintf(formatString, i))
|
|
||||||
}
|
|
||||||
}
|
|
||||||
} else {
|
|
||||||
return nil, fmt.Errorf("not at hostlist range: %s", hlRanges)
|
|
||||||
}
|
|
||||||
} else {
|
|
||||||
return nil, fmt.Errorf("not a hostlist: %s", in)
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
if result != nil {
|
|
||||||
// sort
|
|
||||||
sort.Strings(result)
|
|
||||||
|
|
||||||
// uniq
|
|
||||||
previous := 1
|
|
||||||
for current := 1; current < len(result); current++ {
|
|
||||||
if result[current-1] != result[current] {
|
|
||||||
if previous != current {
|
|
||||||
result[previous] = result[current]
|
|
||||||
}
|
|
||||||
previous++
|
|
||||||
}
|
|
||||||
}
|
|
||||||
result = result[:previous]
|
|
||||||
}
|
|
||||||
|
|
||||||
return
|
|
||||||
}
|
|
||||||
@@ -1,126 +0,0 @@
|
|||||||
package hostlist
|
|
||||||
|
|
||||||
import (
|
|
||||||
"testing"
|
|
||||||
)
|
|
||||||
|
|
||||||
func TestExpand(t *testing.T) {
|
|
||||||
|
|
||||||
// Compare two slices of strings
|
|
||||||
equal := func(a, b []string) bool {
|
|
||||||
if len(a) != len(b) {
|
|
||||||
return false
|
|
||||||
}
|
|
||||||
for i, v := range a {
|
|
||||||
if v != b[i] {
|
|
||||||
return false
|
|
||||||
}
|
|
||||||
}
|
|
||||||
return true
|
|
||||||
}
|
|
||||||
|
|
||||||
type testDefinition struct {
|
|
||||||
input string
|
|
||||||
resultExpected []string
|
|
||||||
errorExpected bool
|
|
||||||
}
|
|
||||||
|
|
||||||
expandTests := []testDefinition{
|
|
||||||
{
|
|
||||||
// Single node
|
|
||||||
input: "n1",
|
|
||||||
resultExpected: []string{"n1"},
|
|
||||||
errorExpected: false,
|
|
||||||
},
|
|
||||||
{
|
|
||||||
// Single node, duplicated
|
|
||||||
input: "n1,n1",
|
|
||||||
resultExpected: []string{"n1"},
|
|
||||||
errorExpected: false,
|
|
||||||
},
|
|
||||||
{
|
|
||||||
// Single node with padding
|
|
||||||
input: "n[01]",
|
|
||||||
resultExpected: []string{"n01"},
|
|
||||||
errorExpected: false,
|
|
||||||
},
|
|
||||||
{
|
|
||||||
// Single node with suffix
|
|
||||||
input: "n[01]-p",
|
|
||||||
resultExpected: []string{"n01-p"},
|
|
||||||
errorExpected: false,
|
|
||||||
},
|
|
||||||
{
|
|
||||||
// Multiple nodes with a single range
|
|
||||||
input: "n[1-2]",
|
|
||||||
resultExpected: []string{"n1", "n2"},
|
|
||||||
errorExpected: false,
|
|
||||||
},
|
|
||||||
{
|
|
||||||
// Multiple nodes with a single range and a single index
|
|
||||||
input: "n[1-2,3]",
|
|
||||||
resultExpected: []string{"n1", "n2", "n3"},
|
|
||||||
errorExpected: false,
|
|
||||||
},
|
|
||||||
{
|
|
||||||
// Multiple nodes with different prefixes
|
|
||||||
input: "n[1-2],m[1,2]",
|
|
||||||
resultExpected: []string{"m1", "m2", "n1", "n2"},
|
|
||||||
errorExpected: false,
|
|
||||||
},
|
|
||||||
{
|
|
||||||
// Multiple nodes with different suffixes
|
|
||||||
input: "n[1-2]-p,n[1,2]-q",
|
|
||||||
resultExpected: []string{"n1-p", "n1-q", "n2-p", "n2-q"},
|
|
||||||
errorExpected: false,
|
|
||||||
},
|
|
||||||
{
|
|
||||||
// Multiple nodes with and without node ranges
|
|
||||||
input: " n09, n[01-04,06-07,09] , , n10,n04",
|
|
||||||
resultExpected: []string{"n01", "n02", "n03", "n04", "n06", "n07", "n09", "n10"},
|
|
||||||
errorExpected: false,
|
|
||||||
},
|
|
||||||
{
|
|
||||||
// Forbidden DNS character
|
|
||||||
input: "n@",
|
|
||||||
resultExpected: []string{},
|
|
||||||
errorExpected: true,
|
|
||||||
},
|
|
||||||
{
|
|
||||||
// Forbidden range
|
|
||||||
input: "n[1-2-2,3]",
|
|
||||||
resultExpected: []string{},
|
|
||||||
errorExpected: true,
|
|
||||||
},
|
|
||||||
{
|
|
||||||
// Forbidden range limits
|
|
||||||
input: "n[2-1]",
|
|
||||||
resultExpected: []string{},
|
|
||||||
errorExpected: true,
|
|
||||||
},
|
|
||||||
}
|
|
||||||
|
|
||||||
for _, expandTest := range expandTests {
|
|
||||||
result, err := Expand(expandTest.input)
|
|
||||||
|
|
||||||
hasError := err != nil
|
|
||||||
if hasError != expandTest.errorExpected && hasError {
|
|
||||||
t.Errorf("Expand('%s') failed: unexpected error '%v'",
|
|
||||||
expandTest.input, err)
|
|
||||||
continue
|
|
||||||
}
|
|
||||||
if hasError != expandTest.errorExpected && !hasError {
|
|
||||||
t.Errorf("Expand('%s') did not fail as expected: got result '%+v'",
|
|
||||||
expandTest.input, result)
|
|
||||||
continue
|
|
||||||
}
|
|
||||||
if !hasError && !equal(result, expandTest.resultExpected) {
|
|
||||||
t.Errorf("Expand('%s') failed: got result '%+v', expected result '%v'",
|
|
||||||
expandTest.input, result, expandTest.resultExpected)
|
|
||||||
continue
|
|
||||||
}
|
|
||||||
|
|
||||||
t.Logf("Checked hostlist.Expand('%s'): result = '%+v', err = '%v'",
|
|
||||||
expandTest.input, result, err)
|
|
||||||
}
|
|
||||||
}
|
|
||||||
@@ -1,9 +1,16 @@
|
|||||||
|
// Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
|
||||||
|
// All rights reserved. This file is part of cc-lib.
|
||||||
|
// Use of this source code is governed by a MIT-style
|
||||||
|
// license that can be found in the LICENSE file.
|
||||||
|
// additional authors:
|
||||||
|
// Holger Obermaier (NHR@KIT)
|
||||||
|
|
||||||
package multiChanTicker
|
package multiChanTicker
|
||||||
|
|
||||||
import (
|
import (
|
||||||
"time"
|
"time"
|
||||||
|
|
||||||
cclog "github.com/ClusterCockpit/cc-lib/ccLogger"
|
cclog "github.com/ClusterCockpit/cc-lib/v2/ccLogger"
|
||||||
)
|
)
|
||||||
|
|
||||||
type multiChanTicker struct {
|
type multiChanTicker struct {
|
||||||
|
|||||||
@@ -1,4 +1,6 @@
|
|||||||
Package: cc-metric-collector
|
Package: cc-metric-collector
|
||||||
|
Section: misc
|
||||||
|
Priority: optional
|
||||||
Version: {VERSION}
|
Version: {VERSION}
|
||||||
Installed-Size: {INSTALLED_SIZE}
|
Installed-Size: {INSTALLED_SIZE}
|
||||||
Architecture: {ARCH}
|
Architecture: {ARCH}
|
||||||
|
|||||||
@@ -44,6 +44,8 @@ def group_to_json(groupfile):
|
|||||||
scope = "socket"
|
scope = "socket"
|
||||||
if "PWR" in calc:
|
if "PWR" in calc:
|
||||||
scope = "socket"
|
scope = "socket"
|
||||||
|
if "UMC" in calc:
|
||||||
|
scope = "socket"
|
||||||
|
|
||||||
m = {"name" : metric, "calc": calc, "type" : scope, "publish" : True}
|
m = {"name" : metric, "calc": calc, "type" : scope, "publish" : True}
|
||||||
metrics.append(m)
|
metrics.append(m)
|
||||||
|
|||||||
Reference in New Issue
Block a user