Compare commits

..

7 Commits

Author SHA1 Message Date
Thomas Roehl
0ca87ea0be Fix SqliteSink 2022-01-31 13:24:39 +01:00
Thomas Roehl
4195786242 Add all CCMetric functions to interface 2022-01-31 06:04:30 +01:00
Thomas Roehl
328d26bf3c Merge branch 'develop' into sqlite3_sink 2022-01-31 05:59:25 +01:00
Thomas Roehl
2b07798af2 Fix Write() arguments 2021-11-26 19:21:18 +01:00
Thomas Roehl
aa842a8a9c Add Flush method 2021-11-26 19:13:48 +01:00
Thomas Roehl
06ab58dc92 Merge branch 'main' into sqlite3_sink 2021-11-25 18:23:04 +01:00
Thomas Roehl
40855b1164 Sqlite3 sink 2021-05-18 15:53:20 +02:00
117 changed files with 4656 additions and 11094 deletions

View File

@@ -1,10 +1,8 @@
{
"sinks-file": ".github/ci-sinks.json",
"collectors-file" : ".github/ci-collectors.json",
"receivers-file" : ".github/ci-receivers.json",
"router-file" : ".github/ci-router.json",
"main" : {
"interval": "5s",
"duration": "1s"
}
"sinks": ".github/ci-sinks.json",
"collectors" : ".github/ci-collectors.json",
"receivers" : ".github/ci-receivers.json",
"router" : ".github/ci-router.json",
"interval": 5,
"duration": 1
}

View File

@@ -1 +1 @@
{}
[]

10
.github/ci-sinks.json vendored
View File

@@ -1,8 +1,6 @@
{
"testoutput" : {
[
{
"type" : "stdout",
"meta_as_tags" : [
"unit"
]
"meta_as_tags" : true
}
}
]

View File

@@ -1,11 +0,0 @@
# To get started with Dependabot version updates, you'll need to specify which
# package ecosystems to update and where the package manifests are located.
# Please see the documentation for all configuration options:
# https://docs.github.com/code-security/dependabot/dependabot-version-updates/configuration-options-for-the-dependabot.yml-file
version: 2
updates:
- package-ecosystem: "gomod"
directory: "/"
schedule:
interval: "weekly"

View File

@@ -1,504 +0,0 @@
# See: https://docs.github.com/en/actions/using-workflows/workflow-syntax-for-github-actions
# Workflow name
name: Release
# Run on tag push
on:
push:
tags:
- '**'
workflow_dispatch:
jobs:
#
# Build on AlmaLinux 8 using go-toolset
#
AlmaLinux8-RPM-build:
runs-on: ubuntu-latest
# See: https://hub.docker.com/_/almalinux
container: almalinux:8
# The job outputs link to the outputs of the 'rpmrename' step
# Only job outputs can be used in child jobs
outputs:
rpm : ${{steps.rpmrename.outputs.RPM}}
srpm : ${{steps.rpmrename.outputs.SRPM}}
steps:
# Use dnf to install development packages
- name: Install development packages
run: |
dnf --assumeyes group install "Development Tools" "RPM Development Tools"
dnf --assumeyes install wget openssl-devel diffutils delve which
# Checkout git repository and submodules
# fetch-depth must be 0 to use git describe
# See: https://github.com/marketplace/actions/checkout
- name: Checkout
uses: actions/checkout@v4
with:
submodules: recursive
fetch-depth: 0
# - name: Setup Golang
# uses: actions/setup-go@v5
# with:
# go-version: 'stable'
- name: Setup Golang
run: |
dnf --assumeyes --disableplugin=subscription-manager install \
https://repo.almalinux.org/almalinux/8/AppStream/x86_64/os/Packages/go-toolset-1.23.9-1.module_el8.10.0+4000+1ad1b2cc.x86_64.rpm \
https://repo.almalinux.org/almalinux/8/AppStream/x86_64/os/Packages/golang-1.23.9-1.module_el8.10.0+4000+1ad1b2cc.x86_64.rpm \
https://repo.almalinux.org/almalinux/8/AppStream/x86_64/os/Packages/golang-bin-1.23.9-1.module_el8.10.0+4000+1ad1b2cc.x86_64.rpm \
https://repo.almalinux.org/almalinux/8/AppStream/x86_64/os/Packages/golang-src-1.23.9-1.module_el8.10.0+4000+1ad1b2cc.noarch.rpm
- name: RPM build MetricCollector
id: rpmbuild
run: |
git config --global --add safe.directory /__w/cc-metric-collector/cc-metric-collector
make RPM
# AlmaLinux 8 is a derivate of RedHat Enterprise Linux 8 (UBI8),
# so the created RPM both contain the substring 'el8' in the RPM file names
# This step replaces the substring 'el8' to 'alma8'. It uses the move operation
# because it is unclear whether the default AlmaLinux 8 container contains the
# 'rename' command. This way we also get the new names for output.
- name: Rename RPMs (s/el8/alma8/)
id: rpmrename
run: |
OLD_RPM="${{steps.rpmbuild.outputs.RPM}}"
OLD_SRPM="${{steps.rpmbuild.outputs.SRPM}}"
NEW_RPM="${OLD_RPM/el8/alma8}"
NEW_SRPM=${OLD_SRPM/el8/alma8}
mv "${OLD_RPM}" "${NEW_RPM}"
mv "${OLD_SRPM}" "${NEW_SRPM}"
echo "SRPM=${NEW_SRPM}" >> $GITHUB_OUTPUT
echo "RPM=${NEW_RPM}" >> $GITHUB_OUTPUT
# See: https://github.com/actions/upload-artifact
- name: Save RPM as artifact
uses: actions/upload-artifact@v4
with:
name: cc-metric-collector RPM for AlmaLinux 8
path: ${{ steps.rpmrename.outputs.RPM }}
overwrite: true
- name: Save SRPM as artifact
uses: actions/upload-artifact@v4
with:
name: cc-metric-collector SRPM for AlmaLinux 8
path: ${{ steps.rpmrename.outputs.SRPM }}
overwrite: true
#
# Build on AlmaLinux 9 using go-toolset
#
AlmaLinux9-RPM-build:
runs-on: ubuntu-latest
# See: https://hub.docker.com/_/almalinux
container: almalinux:9
# The job outputs link to the outputs of the 'rpmrename' step
# Only job outputs can be used in child jobs
outputs:
rpm : ${{steps.rpmrename.outputs.RPM}}
srpm : ${{steps.rpmrename.outputs.SRPM}}
steps:
# Use dnf to install development packages
- name: Install development packages
run: |
dnf --assumeyes group install "Development Tools" "RPM Development Tools"
dnf --assumeyes install wget openssl-devel diffutils delve which
# Checkout git repository and submodules
# fetch-depth must be 0 to use git describe
# See: https://github.com/marketplace/actions/checkout
- name: Checkout
uses: actions/checkout@v4
with:
submodules: recursive
fetch-depth: 0
# - name: Setup Golang
# uses: actions/setup-go@v5
# with:
# go-version: 'stable'
- name: Setup Golang
run: |
dnf --assumeyes --disableplugin=subscription-manager install \
https://repo.almalinux.org/almalinux/9/AppStream/x86_64/os/Packages/go-toolset-1.23.9-1.el9_6.x86_64.rpm \
https://repo.almalinux.org/almalinux/9/AppStream/x86_64/os/Packages/golang-1.23.9-1.el9_6.x86_64.rpm \
https://repo.almalinux.org/almalinux/9/AppStream/x86_64/os/Packages/golang-bin-1.23.9-1.el9_6.x86_64.rpm \
https://repo.almalinux.org/almalinux/9/AppStream/x86_64/os/Packages/golang-src-1.23.9-1.el9_6.noarch.rpm \
https://repo.almalinux.org/almalinux/9/AppStream/x86_64/os/Packages/golang-race-1.23.9-1.el9_6.x86_64.rpm
- name: RPM build MetricCollector
id: rpmbuild
run: |
git config --global --add safe.directory /__w/cc-metric-collector/cc-metric-collector
make RPM
# AlmaLinux 9 is a derivate of RedHat Enterprise Linux 8 (UBI8),
# so the created RPM both contain the substring 'el9' in the RPM file names
# This step replaces the substring 'el8' to 'alma8'. It uses the move operation
# because it is unclear whether the default AlmaLinux 8 container contains the
# 'rename' command. This way we also get the new names for output.
- name: Rename RPMs (s/el9/alma9/)
id: rpmrename
run: |
OLD_RPM="${{steps.rpmbuild.outputs.RPM}}"
OLD_SRPM="${{steps.rpmbuild.outputs.SRPM}}"
NEW_RPM="${OLD_RPM/el9/alma9}"
NEW_SRPM=${OLD_SRPM/el9/alma9}
mv "${OLD_RPM}" "${NEW_RPM}"
mv "${OLD_SRPM}" "${NEW_SRPM}"
echo "SRPM=${NEW_SRPM}" >> $GITHUB_OUTPUT
echo "RPM=${NEW_RPM}" >> $GITHUB_OUTPUT
# See: https://github.com/actions/upload-artifact
- name: Save RPM as artifact
uses: actions/upload-artifact@v4
with:
name: cc-metric-collector RPM for AlmaLinux 9
path: ${{ steps.rpmrename.outputs.RPM }}
overwrite: true
- name: Save SRPM as artifact
uses: actions/upload-artifact@v4
with:
name: cc-metric-collector SRPM for AlmaLinux 9
path: ${{ steps.rpmrename.outputs.SRPM }}
overwrite: true
#
# Build on UBI 8 using go-toolset
#
UBI-8-RPM-build:
runs-on: ubuntu-latest
# See: https://catalog.redhat.com/software/containers/ubi8/ubi/5c35984d70cc534b3a3784e?container-tabs=gti
container: registry.access.redhat.com/ubi8/ubi:8.8-1032.1692772289
# The job outputs link to the outputs of the 'rpmbuild' step
outputs:
rpm : ${{steps.rpmbuild.outputs.RPM}}
srpm : ${{steps.rpmbuild.outputs.SRPM}}
steps:
# Use dnf to install development packages
- name: Install development packages
run: dnf --assumeyes --disableplugin=subscription-manager install rpm-build go-srpm-macros rpm-build-libs rpm-libs gcc make python38 git wget openssl-devel diffutils delve which
# Checkout git repository and submodules
# fetch-depth must be 0 to use git describe
# See: https://github.com/marketplace/actions/checkout
- name: Checkout
uses: actions/checkout@v4
with:
submodules: recursive
fetch-depth: 0
# - name: Setup Golang
# uses: actions/setup-go@v5
# with:
# go-version: 'stable'
- name: Setup Golang
run: |
dnf --assumeyes --disableplugin=subscription-manager install \
https://repo.almalinux.org/almalinux/8/AppStream/x86_64/os/Packages/go-toolset-1.23.9-1.module_el8.10.0+4000+1ad1b2cc.x86_64.rpm \
https://repo.almalinux.org/almalinux/8/AppStream/x86_64/os/Packages/golang-1.23.9-1.module_el8.10.0+4000+1ad1b2cc.x86_64.rpm \
https://repo.almalinux.org/almalinux/8/AppStream/x86_64/os/Packages/golang-bin-1.23.9-1.module_el8.10.0+4000+1ad1b2cc.x86_64.rpm \
https://repo.almalinux.org/almalinux/8/AppStream/x86_64/os/Packages/golang-src-1.23.9-1.module_el8.10.0+4000+1ad1b2cc.noarch.rpm
- name: RPM build MetricCollector
id: rpmbuild
run: |
git config --global --add safe.directory /__w/cc-metric-collector/cc-metric-collector
make RPM
# See: https://github.com/actions/upload-artifact
- name: Save RPM as artifact
uses: actions/upload-artifact@v4
with:
name: cc-metric-collector RPM for UBI 8
path: ${{ steps.rpmbuild.outputs.RPM }}
overwrite: true
- name: Save SRPM as artifact
uses: actions/upload-artifact@v4
with:
name: cc-metric-collector SRPM for UBI 8
path: ${{ steps.rpmbuild.outputs.SRPM }}
overwrite: true
#
# Build on UBI 9 using go-toolset
#
UBI-9-RPM-build:
runs-on: ubuntu-latest
# See: https://catalog.redhat.com/software/containers/ubi8/ubi/5c359854d70cc534b3a3784e?container-tabs=gti
container: redhat/ubi9
# The job outputs link to the outputs of the 'rpmbuild' step
# The job outputs link to the outputs of the 'rpmbuild' step
outputs:
rpm : ${{steps.rpmbuild.outputs.RPM}}
srpm : ${{steps.rpmbuild.outputs.SRPM}}
steps:
# Use dnf to install development packages
- name: Install development packages
run: dnf --assumeyes --disableplugin=subscription-manager install rpm-build go-srpm-macros gcc make python39 git wget openssl-devel diffutils delve
# Checkout git repository and submodules
# fetch-depth must be 0 to use git describe
# See: https://github.com/marketplace/actions/checkout
- name: Checkout
uses: actions/checkout@v4
with:
submodules: recursive
fetch-depth: 0
# See: https://github.com/marketplace/actions/setup-go-environment
# - name: Setup Golang
# uses: actions/setup-go@v5
# with:
# go-version: 'stable'
- name: Setup Golang
run: |
dnf --assumeyes --disableplugin=subscription-manager install \
https://repo.almalinux.org/almalinux/9/AppStream/x86_64/os/Packages/go-toolset-1.23.9-1.el9_6.x86_64.rpm \
https://repo.almalinux.org/almalinux/9/AppStream/x86_64/os/Packages/golang-1.23.9-1.el9_6.x86_64.rpm \
https://repo.almalinux.org/almalinux/9/AppStream/x86_64/os/Packages/golang-bin-1.23.9-1.el9_6.x86_64.rpm \
https://repo.almalinux.org/almalinux/9/AppStream/x86_64/os/Packages/golang-src-1.23.9-1.el9_6.noarch.rpm \
https://repo.almalinux.org/almalinux/9/AppStream/x86_64/os/Packages/golang-race-1.23.9-1.el9_6.x86_64.rpm
- name: RPM build MetricCollector
id: rpmbuild
run: |
git config --global --add safe.directory /__w/cc-metric-collector/cc-metric-collector
make RPM
# See: https://github.com/actions/upload-artifact
- name: Save RPM as artifact
uses: actions/upload-artifact@v4
with:
name: cc-metric-collector RPM for UBI 9
path: ${{ steps.rpmbuild.outputs.RPM }}
overwrite: true
- name: Save SRPM as artifact
uses: actions/upload-artifact@v4
with:
name: cc-metric-collector SRPM for UBI 9
path: ${{ steps.rpmbuild.outputs.SRPM }}
overwrite: true
#
# Build on Ubuntu 22.04 using official go package
#
Ubuntu-jammy-build:
runs-on: ubuntu-latest
container: ubuntu:22.04
# The job outputs link to the outputs of the 'debrename' step
# Only job outputs can be used in child jobs
outputs:
deb : ${{steps.debrename.outputs.DEB}}
steps:
# Use apt to install development packages
- name: Install development packages
run: |
apt update && apt --assume-yes upgrade
apt --assume-yes install build-essential sed git wget bash
# Checkout git repository and submodules
# fetch-depth must be 0 to use git describe
# See: https://github.com/marketplace/actions/checkout
- name: Checkout
uses: actions/checkout@v4
with:
submodules: recursive
fetch-depth: 0
- name: Setup Golang
uses: actions/setup-go@v5
with:
go-version: 'stable'
- name: DEB build MetricCollector
id: dpkg-build
run: |
git config --global --add safe.directory /__w/cc-metric-collector/cc-metric-collector
make DEB
- name: Rename DEB (add '_ubuntu22.04')
id: debrename
run: |
OLD_DEB_NAME=$(echo "${{steps.dpkg-build.outputs.DEB}}" | rev | cut -d '.' -f 2- | rev)
NEW_DEB_FILE="${OLD_DEB_NAME}_ubuntu22.04.deb"
mv "${{steps.dpkg-build.outputs.DEB}}" "${NEW_DEB_FILE}"
echo "DEB=${NEW_DEB_FILE}" >> $GITHUB_OUTPUT
# See: https://github.com/actions/upload-artifact
- name: Save DEB as artifact
uses: actions/upload-artifact@v4
with:
name: cc-metric-collector DEB for Ubuntu 22.04
path: ${{ steps.debrename.outputs.DEB }}
overwrite: true
#
# Build on Ubuntu 24.04 using official go package
#
Ubuntu-noblenumbat-build:
runs-on: ubuntu-latest
container: ubuntu:24.04
# The job outputs link to the outputs of the 'debrename' step
# Only job outputs can be used in child jobs
outputs:
deb : ${{steps.debrename.outputs.DEB}}
steps:
# Use apt to install development packages
- name: Install development packages
run: |
apt update && apt --assume-yes upgrade
apt --assume-yes install build-essential sed git wget bash
# Checkout git repository and submodules
# fetch-depth must be 0 to use git describe
# See: https://github.com/marketplace/actions/checkout
- name: Checkout
uses: actions/checkout@v4
with:
submodules: recursive
fetch-depth: 0
- name: Setup Golang
uses: actions/setup-go@v5
with:
go-version: 'stable'
- name: DEB build MetricCollector
id: dpkg-build
run: |
git config --global --add safe.directory /__w/cc-metric-collector/cc-metric-collector
make DEB
- name: Rename DEB (add '_ubuntu24.04')
id: debrename
run: |
OLD_DEB_NAME=$(echo "${{steps.dpkg-build.outputs.DEB}}" | rev | cut -d '.' -f 2- | rev)
NEW_DEB_FILE="${OLD_DEB_NAME}_ubuntu24.04.deb"
mv "${{steps.dpkg-build.outputs.DEB}}" "${NEW_DEB_FILE}"
echo "DEB=${NEW_DEB_FILE}" >> $GITHUB_OUTPUT
# See: https://github.com/actions/upload-artifact
- name: Save DEB as artifact
uses: actions/upload-artifact@v4
with:
name: cc-metric-collector DEB for Ubuntu 24.04
path: ${{ steps.debrename.outputs.DEB }}
overwrite: true
#
# Create release with fresh RPMs
#
Release:
runs-on: ubuntu-latest
# We need the RPMs, so add dependency
needs: [AlmaLinux8-RPM-build, AlmaLinux9-RPM-build, UBI-8-RPM-build, UBI-9-RPM-build, Ubuntu-jammy-build, Ubuntu-noblenumbat-build]
steps:
# See: https://github.com/actions/download-artifact
- name: Download AlmaLinux 8 RPM
uses: actions/download-artifact@v4
with:
name: cc-metric-collector RPM for AlmaLinux 8
- name: Download AlmaLinux 8 SRPM
uses: actions/download-artifact@v4
with:
name: cc-metric-collector SRPM for AlmaLinux 8
- name: Download AlmaLinux 9 RPM
uses: actions/download-artifact@v4
with:
name: cc-metric-collector RPM for AlmaLinux 9
- name: Download AlmaLinux 9 SRPM
uses: actions/download-artifact@v4
with:
name: cc-metric-collector SRPM for AlmaLinux 9
- name: Download UBI 8 RPM
uses: actions/download-artifact@v4
with:
name: cc-metric-collector RPM for UBI 8
- name: Download UBI 8 SRPM
uses: actions/download-artifact@v4
with:
name: cc-metric-collector SRPM for UBI 8
- name: Download UBI 9 RPM
uses: actions/download-artifact@v4
with:
name: cc-metric-collector RPM for UBI 9
- name: Download UBI 9 SRPM
uses: actions/download-artifact@v4
with:
name: cc-metric-collector SRPM for UBI 9
- name: Download Ubuntu 22.04 DEB
uses: actions/download-artifact@v4
with:
name: cc-metric-collector DEB for Ubuntu 22.04
- name: Download Ubuntu 24.04 DEB
uses: actions/download-artifact@v4
with:
name: cc-metric-collector DEB for Ubuntu 24.04
# The download actions do not publish the name of the downloaded file,
# so we re-use the job outputs of the parent jobs. The files are all
# downloaded to the current folder.
# The gh-release action afterwards does not accept file lists but all
# files have to be listed at 'files'. The step creates one output per
# RPM package (2 per distro)
- name: Set RPM variables
id: files
run: |
ALMA_8_RPM=$(basename "${{ needs.AlmaLinux8-RPM-build.outputs.rpm}}")
ALMA_8_SRPM=$(basename "${{ needs.AlmaLinux8-RPM-build.outputs.srpm}}")
ALMA_9_RPM=$(basename "${{ needs.AlmaLinux9-RPM-build.outputs.rpm}}")
ALMA_9_SRPM=$(basename "${{ needs.AlmaLinux9-RPM-build.outputs.srpm}}")
UBI_8_RPM=$(basename "${{ needs.UBI-8-RPM-build.outputs.rpm}}")
UBI_8_SRPM=$(basename "${{ needs.UBI-8-RPM-build.outputs.srpm}}")
UBI_9_RPM=$(basename "${{ needs.UBI-9-RPM-build.outputs.rpm}}")
UBI_9_SRPM=$(basename "${{ needs.UBI-9-RPM-build.outputs.srpm}}")
U_2204_DEB=$(basename "${{ needs.Ubuntu-jammy-build.outputs.deb}}")
U_2404_DEB=$(basename "${{ needs.Ubuntu-noblenumbat-build.outputs.deb}}")
echo "ALMA_8_RPM::${ALMA_8_RPM}"
echo "ALMA_8_SRPM::${ALMA_8_SRPM}"
echo "ALMA_9_RPM::${ALMA_9_RPM}"
echo "ALMA_9_SRPM::${ALMA_9_SRPM}"
echo "UBI_8_RPM::${UBI_8_RPM}"
echo "UBI_8_SRPM::${UBI_8_SRPM}"
echo "UBI_9_RPM::${UBI_9_RPM}"
echo "UBI_9_SRPM::${UBI_9_SRPM}"
echo "U_2204_DEB::${U_2204_DEB}"
echo "U_2404_DEB::${U_2404_DEB}"
echo "ALMA_8_RPM=${ALMA_8_RPM}" >> $GITHUB_OUTPUT
echo "ALMA_8_SRPM=${ALMA_8_SRPM}" >> $GITHUB_OUTPUT
echo "ALMA_9_RPM=${ALMA_9_RPM}" >> $GITHUB_OUTPUT
echo "ALMA_9_SRPM=${ALMA_9_SRPM}" >> $GITHUB_OUTPUT
echo "UBI_8_RPM=${UBI_8_RPM}" >> $GITHUB_OUTPUT
echo "UBI_8_SRPM=${UBI_8_SRPM}" >> $GITHUB_OUTPUT
echo "UBI_9_RPM=${UBI_9_RPM}" >> $GITHUB_OUTPUT
echo "UBI_9_SRPM=${UBI_9_SRPM}" >> $GITHUB_OUTPUT
echo "U_2204_DEB=${U_2204_DEB}" >> $GITHUB_OUTPUT
echo "U_2404_DEB=${U_2404_DEB}" >> $GITHUB_OUTPUT
# See: https://github.com/softprops/action-gh-release
- name: Release
uses: softprops/action-gh-release@v2
if: startsWith(github.ref, 'refs/tags/')
with:
name: cc-metric-collector-${{github.ref_name}}
files: |
${{ steps.files.outputs.ALMA_8_RPM }}
${{ steps.files.outputs.ALMA_8_SRPM }}
${{ steps.files.outputs.ALMA_9_RPM }}
${{ steps.files.outputs.ALMA_9_SRPM }}
${{ steps.files.outputs.UBI_8_RPM }}
${{ steps.files.outputs.UBI_8_SRPM }}
${{ steps.files.outputs.UBI_9_RPM }}
${{ steps.files.outputs.UBI_9_SRPM }}
${{ steps.files.outputs.U_2204_DEB }}
${{ steps.files.outputs.U_2404_DEB }}

61
.github/workflows/rpmbuild.yml vendored Normal file
View File

@@ -0,0 +1,61 @@
name: Run RPM Build
on: push
jobs:
build-centos8:
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v2
- uses: TomTheBear/rpmbuild@master
id: rpm
name: Build RPM package on CentOS8
with:
spec_file: "./scripts/cc-metric-collector.spec"
- name: Save RPM as artifact
uses: actions/upload-artifact@v1.0.0
with:
name: cc-metric-collector RPM CentOS8
path: ${{ steps.rpm.outputs.rpm_dir_path }}
- name: Save SRPM as artifact
uses: actions/upload-artifact@v1.0.0
with:
name: cc-metric-collector SRPM CentOS8
path: ${{ steps.rpm.outputs.source_rpm_path }}
build-centos-latest:
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v2
- uses: TomTheBear/rpmbuild@centos_latest
id: rpm
name: Build RPM package on CentOS 'Latest'
with:
spec_file: "./scripts/cc-metric-collector.spec"
- name: Save RPM as artifact
uses: actions/upload-artifact@v1.0.0
with:
name: cc-metric-collector RPM CentOS 'Latest'
path: ${{ steps.rpm.outputs.rpm_dir_path }}
- name: Save SRPM as artifact
uses: actions/upload-artifact@v1.0.0
with:
name: cc-metric-collector SRPM CentOS 'Latest'
path: ${{ steps.rpm.outputs.source_rpm_path }}
build-alma-8_5:
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v2
- uses: TomTheBear/rpmbuild@alma8.5
id: rpm
name: Build RPM package on AlmaLinux 8.5
with:
spec_file: "./scripts/cc-metric-collector.spec"
- name: Save RPM as artifact
uses: actions/upload-artifact@v1.0.0
with:
name: cc-metric-collector RPM AlmaLinux 8.5
path: ${{ steps.rpm.outputs.rpm_dir_path }}
- name: Save SRPM as artifact
uses: actions/upload-artifact@v1.0.0
with:
name: cc-metric-collector SRPM AlmaLinux 8.5
path: ${{ steps.rpm.outputs.source_rpm_path }}

View File

@@ -1,283 +1,20 @@
# See: https://docs.github.com/en/actions/using-workflows/workflow-syntax-for-github-actions
# Workflow name
name: Run Test
# Run on event push
on:
push:
workflow_dispatch:
on: push
jobs:
#
# Job build-latest
# Build on latest Ubuntu using latest golang version
#
build-latest:
build:
runs-on: ubuntu-latest
steps:
# See: https://github.com/marketplace/actions/checkout
# Checkout git repository and submodules
- name: Checkout
uses: actions/checkout@v4
with:
submodules: recursive
- uses: actions/checkout@v2
# See: https://github.com/marketplace/actions/setup-go-environment
- name: Setup Golang
uses: actions/setup-go@v5
uses: actions/setup-go@v2.1.5
with:
go-version: '1.21'
check-latest: true
go-version: '^1.17.6'
- name: Build MetricCollector
run: make
- name: Run MetricCollector once
- name: Run MetricCollector
run: ./cc-metric-collector --once --config .github/ci-config.json
#
# Build on AlmaLinux 8
#
AlmaLinux8-RPM-build:
runs-on: ubuntu-latest
# See: https://hub.docker.com/_/almalinux
container: almalinux:8
# The job outputs link to the outputs of the 'rpmrename' step
# Only job outputs can be used in child jobs
steps:
# Use dnf to install development packages
- name: Install development packages
run: |
dnf --assumeyes group install "Development Tools" "RPM Development Tools"
dnf --assumeyes install wget openssl-devel diffutils delve which
# Checkout git repository and submodules
# fetch-depth must be 0 to use git describe
# See: https://github.com/marketplace/actions/checkout
- name: Checkout
uses: actions/checkout@v4
with:
submodules: recursive
fetch-depth: 0
# See: https://github.com/marketplace/actions/setup-go-environment
# - name: Setup Golang
# uses: actions/setup-go@v5
# with:
# go-version: 'stable'
- name: Setup Golang
run: |
dnf --assumeyes --disableplugin=subscription-manager install \
https://repo.almalinux.org/almalinux/8/AppStream/x86_64/os/Packages/go-toolset-1.23.9-1.module_el8.10.0+4000+1ad1b2cc.x86_64.rpm \
https://repo.almalinux.org/almalinux/8/AppStream/x86_64/os/Packages/golang-1.23.9-1.module_el8.10.0+4000+1ad1b2cc.x86_64.rpm \
https://repo.almalinux.org/almalinux/8/AppStream/x86_64/os/Packages/golang-bin-1.23.9-1.module_el8.10.0+4000+1ad1b2cc.x86_64.rpm \
https://repo.almalinux.org/almalinux/8/AppStream/x86_64/os/Packages/golang-src-1.23.9-1.module_el8.10.0+4000+1ad1b2cc.noarch.rpm
- name: RPM build MetricCollector
id: rpmbuild
run: |
git config --global --add safe.directory /__w/cc-metric-collector/cc-metric-collector
make RPM
#
# Build on AlmaLinux 9
#
AlmaLinux9-RPM-build:
runs-on: ubuntu-latest
# See: https://hub.docker.com/_/almalinux
container: almalinux:9
# The job outputs link to the outputs of the 'rpmrename' step
# Only job outputs can be used in child jobs
steps:
# Use dnf to install development packages
- name: Install development packages
run: |
dnf --assumeyes group install "Development Tools" "RPM Development Tools"
dnf --assumeyes install wget openssl-devel diffutils delve which
# Checkout git repository and submodules
# fetch-depth must be 0 to use git describe
# See: https://github.com/marketplace/actions/checkout
- name: Checkout
uses: actions/checkout@v4
with:
submodules: recursive
fetch-depth: 0
# See: https://github.com/marketplace/actions/setup-go-environment
# - name: Setup Golang
# uses: actions/setup-go@v5
# with:
# go-version: 'stable'
- name: Setup Golang
run: |
dnf --assumeyes --disableplugin=subscription-manager install \
https://repo.almalinux.org/almalinux/9/AppStream/x86_64/os/Packages/go-toolset-1.23.9-1.el9_6.x86_64.rpm \
https://repo.almalinux.org/almalinux/9/AppStream/x86_64/os/Packages/golang-1.23.9-1.el9_6.x86_64.rpm \
https://repo.almalinux.org/almalinux/9/AppStream/x86_64/os/Packages/golang-bin-1.23.9-1.el9_6.x86_64.rpm \
https://repo.almalinux.org/almalinux/9/AppStream/x86_64/os/Packages/golang-src-1.23.9-1.el9_6.noarch.rpm \
https://repo.almalinux.org/almalinux/9/AppStream/x86_64/os/Packages/golang-race-1.23.9-1.el9_6.x86_64.rpm
- name: RPM build MetricCollector
id: rpmbuild
run: |
git config --global --add safe.directory /__w/cc-metric-collector/cc-metric-collector
make RPM
#
# Build on UBI 8 using go-toolset
#
UBI-8-RPM-build:
runs-on: ubuntu-latest
# See: https://catalog.redhat.com/software/containers/ubi8/ubi/5c359854d70cc534b3a3784e?container-tabs=gti
container: redhat/ubi8
# The job outputs link to the outputs of the 'rpmbuild' step
steps:
# Use dnf to install development packages
- name: Install development packages
run: dnf --assumeyes --disableplugin=subscription-manager install rpm-build go-srpm-macros rpm-build-libs rpm-libs gcc make python38 git wget openssl-devel diffutils delve which
# Checkout git repository and submodules
# fetch-depth must be 0 to use git describe
# See: https://github.com/marketplace/actions/checkout
- name: Checkout
uses: actions/checkout@v4
with:
submodules: recursive
fetch-depth: 0
# See: https://github.com/marketplace/actions/setup-go-environment
# - name: Setup Golang
# uses: actions/setup-go@v5
# with:
# go-version: 'stable'
- name: Setup Golang
run: |
dnf --assumeyes --disableplugin=subscription-manager install \
https://repo.almalinux.org/almalinux/8/AppStream/x86_64/os/Packages/go-toolset-1.23.9-1.module_el8.10.0+4000+1ad1b2cc.x86_64.rpm \
https://repo.almalinux.org/almalinux/8/AppStream/x86_64/os/Packages/golang-1.23.9-1.module_el8.10.0+4000+1ad1b2cc.x86_64.rpm \
https://repo.almalinux.org/almalinux/8/AppStream/x86_64/os/Packages/golang-bin-1.23.9-1.module_el8.10.0+4000+1ad1b2cc.x86_64.rpm \
https://repo.almalinux.org/almalinux/8/AppStream/x86_64/os/Packages/golang-src-1.23.9-1.module_el8.10.0+4000+1ad1b2cc.noarch.rpm
- name: RPM build MetricCollector
id: rpmbuild
run: |
git config --global --add safe.directory /__w/cc-metric-collector/cc-metric-collector
make RPM
#
# Build on UBI 9 using go-toolset
#
UBI-9-RPM-build:
runs-on: ubuntu-latest
# See: https://catalog.redhat.com/software/containers/ubi8/ubi/5c359854d70cc534b3a3784e?container-tabs=gti
container: redhat/ubi9
# The job outputs link to the outputs of the 'rpmbuild' step
steps:
# Use dnf to install development packages
- name: Install development packages
run: dnf --assumeyes --disableplugin=subscription-manager install rpm-build go-srpm-macros gcc make python39 git wget openssl-devel diffutils delve
# Checkout git repository and submodules
# fetch-depth must be 0 to use git describe
# See: https://github.com/marketplace/actions/checkout
- name: Checkout
uses: actions/checkout@v4
with:
submodules: recursive
fetch-depth: 0
# See: https://github.com/marketplace/actions/setup-go-environment
# - name: Setup Golang
# uses: actions/setup-go@v5
# with:
# go-version: 'stable'
- name: Setup Golang
run: |
dnf --assumeyes --disableplugin=subscription-manager install \
https://repo.almalinux.org/almalinux/9/AppStream/x86_64/os/Packages/go-toolset-1.23.9-1.el9_6.x86_64.rpm \
https://repo.almalinux.org/almalinux/9/AppStream/x86_64/os/Packages/golang-1.23.9-1.el9_6.x86_64.rpm \
https://repo.almalinux.org/almalinux/9/AppStream/x86_64/os/Packages/golang-bin-1.23.9-1.el9_6.x86_64.rpm \
https://repo.almalinux.org/almalinux/9/AppStream/x86_64/os/Packages/golang-src-1.23.9-1.el9_6.noarch.rpm \
https://repo.almalinux.org/almalinux/9/AppStream/x86_64/os/Packages/golang-race-1.23.9-1.el9_6.x86_64.rpm
- name: RPM build MetricCollector
id: rpmbuild
run: |
git config --global --add safe.directory /__w/cc-metric-collector/cc-metric-collector
make RPM
#
# Build on Ubuntu 22.04 using official go package
#
Ubuntu-jammy-build:
runs-on: ubuntu-latest
container: ubuntu:22.04
steps:
# Use apt to install development packages
- name: Install development packages
run: |
apt update && apt --assume-yes upgrade
apt --assume-yes install build-essential sed git wget bash
# Checkout git repository and submodules
# fetch-depth must be 0 to use git describe
# See: https://github.com/marketplace/actions/checkout
- name: Checkout
uses: actions/checkout@v4
with:
submodules: recursive
fetch-depth: 0
# Use official golang package
# See: https://github.com/marketplace/actions/setup-go-environment
- name: Setup Golang
uses: actions/setup-go@v5
with:
go-version: 'stable'
- name: DEB build MetricCollector
id: dpkg-build
run: |
export PATH=/usr/local/go/bin:/usr/local/go/pkg/tool/linux_amd64:$PATH
make DEB
#
# Build on Ubuntu 24.04 using official go package
#
Ubuntu-noblenumbat-build:
runs-on: ubuntu-latest
container: ubuntu:24.04
steps:
# Use apt to install development packages
- name: Install development packages
run: |
apt update && apt --assume-yes upgrade
apt --assume-yes install build-essential sed git wget bash
# Checkout git repository and submodules
# fetch-depth must be 0 to use git describe
# See: https://github.com/marketplace/actions/checkout
- name: Checkout
uses: actions/checkout@v4
with:
submodules: recursive
fetch-depth: 0
# Use official golang package
# See: https://github.com/marketplace/actions/setup-go-environment
- name: Setup Golang
uses: actions/setup-go@v5
with:
go-version: 'stable'
- name: DEB build MetricCollector
id: dpkg-build
run: |
export PATH=/usr/local/go/bin:/usr/local/go/pkg/tool/linux_amd64:$PATH
make DEB

4
.gitmodules vendored
View File

@@ -1,4 +0,0 @@
[submodule ".github/actions/rpmbuild-centos8-golang"]
path = .github/actions/rpmbuild-centos8-golang
url = https://github.com/naveenrajm7/rpmbuild.git
branch = centos8

View File

@@ -1,29 +0,0 @@
{
"title": "cc-metric-collector",
"description": "Monitoring agent for ClusterCockpit.",
"creators": [
{
"affiliation": "Regionales Rechenzentrum Erlangen, Friedrich-Alexander-Universität Erlangen-Nürnberg",
"name": "Thomas Gruber",
"orcid": "0000-0001-5560-6964"
},
{
"affiliation": "Steinbuch Centre for Computing, Karlsruher Institut für Technologie",
"name": "Holger Obermaier",
"orcid": "0000-0002-6830-6626"
}
],
"upload_type": "software",
"license": "MIT",
"access_right": "open",
"keywords": [
"performance-monitoring",
"cluster-monitoring",
"open-source"
],
"communities": [
{
"identifier": "clustercockpit"
}
]
}

112
Makefile
View File

@@ -1,129 +1,43 @@
APP = cc-metric-collector
GOSRC_APP := cc-metric-collector.go
GOSRC_APP := metric-collector.go
GOSRC_COLLECTORS := $(wildcard collectors/*.go)
GOSRC_SINKS := $(wildcard sinks/*.go)
GOSRC_RECEIVERS := $(wildcard receivers/*.go)
GOSRC_INTERNAL := $(wildcard internal/*/*.go)
GOSRC := $(GOSRC_APP) $(GOSRC_COLLECTORS) $(GOSRC_SINKS) $(GOSRC_RECEIVERS) $(GOSRC_INTERNAL)
COMPONENT_DIRS := collectors \
sinks \
receivers \
internal/metricRouter \
internal/ccMetric \
internal/metricAggregator \
internal/ccLogger \
internal/ccTopology \
internal/multiChanTicker
BINDIR = bin
GOBIN = $(shell which go)
.PHONY: all
all: $(APP)
$(APP): $(GOSRC) go.mod
$(APP): $(GOSRC)
make -C collectors
$(GOBIN) get
$(GOBIN) build -o $(APP) $(GOSRC_APP)
install: $(APP)
@WORKSPACE=$(PREFIX)
@if [ -z "$${WORKSPACE}" ]; then exit 1; fi
@mkdir --parents --verbose $${WORKSPACE}/usr/$(BINDIR)
@install -Dpm 755 $(APP) $${WORKSPACE}/usr/$(BINDIR)/$(APP)
@mkdir --parents --verbose $${WORKSPACE}/etc/cc-metric-collector $${WORKSPACE}/etc/default $${WORKSPACE}/etc/systemd/system $${WORKSPACE}/etc/init.d
@install -Dpm 600 config.json $${WORKSPACE}/etc/cc-metric-collector/cc-metric-collector.json
@sed -i -e s+"\"./"+"\"/etc/cc-metric-collector/"+g $${WORKSPACE}/etc/cc-metric-collector/cc-metric-collector.json
@install -Dpm 600 sinks.json $${WORKSPACE}/etc/cc-metric-collector/sinks.json
@install -Dpm 600 collectors.json $${WORKSPACE}/etc/cc-metric-collector/collectors.json
@install -Dpm 600 router.json $${WORKSPACE}/etc/cc-metric-collector/router.json
@install -Dpm 600 receivers.json $${WORKSPACE}/etc/cc-metric-collector/receivers.json
@install -Dpm 600 scripts/cc-metric-collector.config $${WORKSPACE}/etc/default/cc-metric-collector
@install -Dpm 644 scripts/cc-metric-collector.service $${WORKSPACE}/etc/systemd/system/cc-metric-collector.service
@install -Dpm 644 scripts/cc-metric-collector.init $${WORKSPACE}/etc/init.d/cc-metric-collector
go get
go build -o $(APP) $(GOSRC_APP)
.PHONY: clean
.ONESHELL:
clean:
@for COMP in $(COMPONENT_DIRS); do if [ -e $$COMP/Makefile ]; then make -C $$COMP clean; fi; done
make -C collectors clean
rm -f $(APP)
.PHONY: fmt
fmt:
$(GOBIN) fmt $(GOSRC_COLLECTORS)
$(GOBIN) fmt $(GOSRC_SINKS)
$(GOBIN) fmt $(GOSRC_RECEIVERS)
$(GOBIN) fmt $(GOSRC_APP)
@for F in $(GOSRC_INTERNAL); do $(GOBIN) fmt $$F; done
go fmt $(GOSRC_COLLECTORS)
go fmt $(GOSRC_SINKS)
go fmt $(GOSRC_RECEIVERS)
go fmt $(GOSRC_APP)
@for F in $(GOSRC_INTERNAL); do go fmt $$F; done
# Examine Go source code and reports suspicious constructs
.PHONY: vet
vet:
$(GOBIN) vet ./...
go vet ./...
# Run linter for the Go programming language.
# Using static analysis, it finds bugs and performance issues, offers simplifications, and enforces style rules
.PHONY: staticcheck
staticcheck:
$(GOBIN) install honnef.co/go/tools/cmd/staticcheck@latest
$$($(GOBIN) env GOPATH)/bin/staticcheck ./...
.ONESHELL:
.PHONY: RPM
RPM: scripts/cc-metric-collector.spec
@WORKSPACE="$${PWD}"
@SPECFILE="$${WORKSPACE}/scripts/cc-metric-collector.spec"
# Setup RPM build tree
@eval $$(rpm --eval "ARCH='%{_arch}' RPMDIR='%{_rpmdir}' SOURCEDIR='%{_sourcedir}' SPECDIR='%{_specdir}' SRPMDIR='%{_srcrpmdir}' BUILDDIR='%{_builddir}'")
@mkdir --parents --verbose "$${RPMDIR}" "$${SOURCEDIR}" "$${SPECDIR}" "$${SRPMDIR}" "$${BUILDDIR}"
# Create source tarball
@COMMITISH="HEAD"
@VERS=$$(git describe --tags $${COMMITISH})
@VERS=$${VERS#v}
@VERS=$$(echo $${VERS} | sed -e s+'-'+'_'+g)
@eval $$(rpmspec --query --queryformat "NAME='%{name}' VERSION='%{version}' RELEASE='%{release}' NVR='%{NVR}' NVRA='%{NVRA}'" --define="VERS $${VERS}" "$${SPECFILE}")
@PREFIX="$${NAME}-$${VERSION}"
@FORMAT="tar.gz"
@SRCFILE="$${SOURCEDIR}/$${PREFIX}.$${FORMAT}"
@git archive --verbose --format "$${FORMAT}" --prefix="$${PREFIX}/" --output="$${SRCFILE}" $${COMMITISH}
# Build RPM and SRPM
@rpmbuild -ba --define="VERS $${VERS}" --rmsource --clean "$${SPECFILE}"
# Report RPMs and SRPMs when in GitHub Workflow
@if [[ "$${GITHUB_ACTIONS}" == true ]]; then
@ RPMFILE="$${RPMDIR}/$${ARCH}/$${NVRA}.rpm"
@ SRPMFILE="$${SRPMDIR}/$${NVR}.src.rpm"
@ echo "SRPM=$${SRPMFILE}" >> $${GITHUB_OUTPUT}
@ echo "RPM=$${RPMFILE}" >> $${GITHUB_OUTPUT}
@fi
.PHONY: DEB
DEB: scripts/cc-metric-collector.deb.control $(APP)
@BASEDIR=$${PWD}
@WORKSPACE=$${PWD}/.dpkgbuild
@DEBIANDIR=$${WORKSPACE}/debian
@DEBIANBINDIR=$${WORKSPACE}/DEBIAN
@mkdir --parents --verbose $${WORKSPACE} $${DEBIANBINDIR}
#@mkdir --parents --verbose $$DEBIANDIR
@CONTROLFILE="$${BASEDIR}/scripts/cc-metric-collector.deb.control"
@COMMITISH="HEAD"
@VERS=$$(git describe --tags --abbrev=0 $${COMMITISH})
@if [ -z "$${VERS}" ]; then VERS=${GITHUB_REF_NAME}; fi
@VERS=$${VERS#v}
@ARCH=$$(uname -m)
@ARCH=$$(echo $${ARCH} | sed -e s+'_'+'-'+g)
@if [ "$${ARCH}" = "x86-64" ]; then ARCH=amd64; fi
@PREFIX="$${NAME}-$${VERSION}_$${ARCH}"
@SIZE_BYTES=$$(du -bcs --exclude=.dpkgbuild "$${WORKSPACE}"/ | awk '{print $$1}' | head -1 | sed -e 's/^0\+//')
@SIZE="$$(awk -v size="$${SIZE_BYTES}" 'BEGIN {print (size/1024)+1}' | awk '{print int($$0)}')"
@sed -e s+"{VERSION}"+"$${VERS}"+g -e s+"{INSTALLED_SIZE}"+"$${SIZE}"+g -e s+"{ARCH}"+"$${ARCH}"+g $${CONTROLFILE} > $${DEBIANBINDIR}/control
@make PREFIX=$${WORKSPACE} install
@DEB_FILE="cc-metric-collector_$${VERS}_$${ARCH}.deb"
@dpkg-deb -b $${WORKSPACE} "$${DEB_FILE}"
@if [ "$${GITHUB_ACTIONS}" = "true" ]; then
@ echo "DEB=$${DEB_FILE}" >> $${GITHUB_OUTPUT}
@fi
@rm -r "$${WORKSPACE}"
go install honnef.co/go/tools/cmd/staticcheck@latest
$$(go env GOPATH)/bin/staticcheck ./...

View File

@@ -1,17 +1,5 @@
<!--
---
title: cc-metric-collector
description: Metric collecting node agent
categories: [cc-metric-collector]
tags: ['Admin']
weight: 2
hugo_path: docs/reference/cc-metric-collector/_index.md
---
-->
# cc-metric-collector
A node agent for measuring, processing and forwarding node level metrics. It is part of the [ClusterCockpit ecosystem](https://clustercockpit.org/docs/overview/).
A node agent for measuring, processing and forwarding node level metrics. It is part of the ClusterCockpit ecosystem.
The metric collector sends (and receives) metric in the [InfluxDB line protocol](https://docs.influxdata.com/influxdb/cloud/reference/syntax/line-protocol/) as it provides flexibility while providing a separation between tags (like index columns in relational databases) and fields (like data columns).
@@ -19,14 +7,10 @@ There is a single timer loop that triggers all collectors serially, collects the
The receiver runs as a go routine side-by-side with the timer loop and asynchronously forwards received metrics to the sink.
[![DOI](https://zenodo.org/badge/DOI/10.5281/zenodo.7438287.svg)](https://doi.org/10.5281/zenodo.7438287)
# Configuration
Configuration is implemented using a single json document that is distributed over network and may be persisted as file.
Supported metrics are documented [here](https://github.com/ClusterCockpit/cc-specifications/blob/master/interfaces/lineprotocol/README.md).
Supported metrics are documented [here](https://github.com/ClusterCockpit/cc-specifications/blob/master/metrics/lineprotocol_alternative.md).
There is a main configuration file with basic settings that point to the other configuration files for the different components.
@@ -36,35 +20,33 @@ There is a main configuration file with basic settings that point to the other c
"collectors" : "collectors.json",
"receivers" : "receivers.json",
"router" : "router.json",
"interval": "10s",
"duration": "1s"
"interval": 10,
"duration": 1
}
```
The `interval` defines how often the metrics should be read and send to the sink. The `duration` tells collectors how long one measurement has to take. This is important for some collectors, like the `likwid` collector. For more information, see [here](./docs/configuration.md).
The `interval` defines how often the metrics should be read and send to the sink. The `duration` tells collectors how long one measurement has to take. This is important for some collectors, like the `likwid` collector.
See the component READMEs for their configuration:
* [`collectors`](./collectors/README.md)
* [`sinks`](https://github.com/ClusterCockpit/cc-lib/blob/main/sinks/README.md)
* [`receivers`](https://github.com/ClusterCockpit/cc-lib/blob/main/receivers/README.md)
* [`sinks`](./sinks/README.md)
* [`receivers`](./receivers/README.md)
* [`router`](./internal/metricRouter/README.md)
# Installation
```
$ git clone git@github.com:ClusterCockpit/cc-metric-collector.git
$ make (downloads LIKWID, builds it as static library with 'direct' accessmode and copies all required files for the collector)
$ go get (requires at least golang 1.16)
$ make
$ go get (requires at least golang 1.13)
$ go build metric-collector
```
For more information, see [here](./docs/building.md).
# Running
```
$ ./cc-metric-collector --help
$ ./metric-collector --help
Usage of metric-collector:
-config string
Path to configuration file (default "./config.json")
@@ -72,51 +54,17 @@ Usage of metric-collector:
Path for logfile (default "stderr")
-once
Run all collectors only once
-pidfile string
Path for PID file (default "/var/run/cc-metric-collector.pid")
```
# Scenarios
The metric collector was designed with flexibility in mind, so it can be used in many scenarios. Here are a few:
```mermaid
flowchart TD
subgraph a ["Cluster A"]
nodeA[NodeA with CC collector]
nodeB[NodeB with CC collector]
nodeC[NodeC with CC collector]
end
a --> db[(Database)]
db <--> ccweb("Webfrontend")
```
``` mermaid
flowchart TD
subgraph a [ClusterA]
direction LR
nodeA[NodeA with CC collector]
nodeB[NodeB with CC collector]
nodeC[NodeC with CC collector]
end
subgraph b [ClusterB]
direction LR
nodeD[NodeD with CC collector]
nodeE[NodeE with CC collector]
nodeF[NodeF with CC collector]
end
a --> ccrecv{"CC collector as receiver"}
b --> ccrecv
ccrecv --> db[("Database1")]
ccrecv -.-> db2[("Database2")]
db <-.-> ccweb("Webfrontend")
```
# Contributing
The ClusterCockpit ecosystem is designed to be used by different HPC computing centers. Since configurations and setups differ between the centers, the centers likely have to put some work into the cc-metric-collector to gather all desired metrics.
You are free to open an issue to request a collector but we would also be happy about PRs.
# Contact
# Contact
* [Matrix.org ClusterCockpit General chat](https://matrix.to/#/#clustercockpit-dev:matrix.org)
* [Matrix.org ClusterCockpit Development chat](https://matrix.to/#/#clustercockpit:matrix.org)

View File

@@ -1,41 +1,15 @@
{
"cpufreq": {},
"cpufreq_cpuinfo": {},
"gpfs": {
"exclude_filesystem": [
"test_fs"
]
},
"ibstat": {},
"loadavg": {
"exclude_metrics": [
"proc_total"
]
},
"memstat": {},
"netstat": {
"include_devices": [
"enp5s0"
],
"send_derived_values": true
},
"numastats": {},
"nvidia": {},
"tempstat": {
"report_max_temperature": true,
"report_critical_temperature": true,
"tag_override": {
"hwmon0": {
"type": "socket",
"type-id": "0"
},
"hwmon1": {
"type": "socket",
"type-id": "1"
}
"tag_override": {
"hwmon0" : {
"type" : "socket",
"type-id" : "0"
},
"hwmon1" : {
"type" : "socket",
"type-id" : "1"
}
},
"topprocs": {
"num_procs": 5
}
}
}
}

View File

@@ -1,33 +1,82 @@
# LIKWID version
LIKWID_VERSION := 5.4.1
LIKWID_INSTALLED_FOLDER := $(shell dirname $$(which likwid-topology 2>/dev/null) 2>/dev/null)
# Use central installation
CENTRAL_INSTALL = false
# How to access hardware performance counters through LIKWID.
# Recommended is 'direct' mode
ACCESSMODE = direct
LIKWID_FOLDER := $(CURDIR)/likwid
#######################################################################
# if CENTRAL_INSTALL == true
#######################################################################
# Path to central installation (if CENTRAL_INSTALL=true)
LIKWID_BASE=/apps/likwid/5.2.1
# LIKWID version (should be same major version as central installation, 5.2.x)
LIKWID_VERSION = 5.2.1
all: likwid
#######################################################################
# if CENTRAL_INSTALL == false and ACCESSMODE == accessdaemon
#######################################################################
# Where to install the accessdaemon
DAEMON_INSTALLDIR = /usr/local
# Which user to use for the accessdaemon
DAEMON_USER = root
# Which group to use for the accessdaemon
DAEMON_GROUP = root
.ONESHELL:
.PHONY: likwid
likwid:
if [ -n "$(LIKWID_INSTALLED_FOLDER)" ]; then
# Using likwid include files from system installation
INCLUDE_DIR="$(LIKWID_INSTALLED_FOLDER)/../include"
mkdir --parents --verbose "$(LIKWID_FOLDER)"
cp "$${INCLUDE_DIR}"/*.h "$(LIKWID_FOLDER)"
else
# Using likwid include files from downloaded tar archive
if [ -d "$(LIKWID_FOLDER)" ]; then
rm --recursive "$(LIKWID_FOLDER)"
fi
BUILD_FOLDER="$${PWD}/likwidbuild"
mkdir --parents --verbose "$${BUILD_FOLDER}"
wget --output-document=- http://ftp.rrze.uni-erlangen.de/mirrors/likwid/likwid-$(LIKWID_VERSION).tar.gz |
tar --directory="$${BUILD_FOLDER}" --extract --gz
install -D --verbose --preserve-timestamps --mode=0644 --target-directory="$(LIKWID_FOLDER)" "$${BUILD_FOLDER}/likwid-$(LIKWID_VERSION)/src/includes"/likwid*.h
rm --recursive "$${BUILD_FOLDER}"
fi
#################################################
# No need to change anything below this line
#################################################
INSTALL_FOLDER = ./likwid
BUILD_FOLDER = ./likwid/build
ifneq ($(strip $(CENTRAL_INSTALL)),true)
LIKWID_BASE := $(shell pwd)/$(INSTALL_FOLDER)
DAEMON_BASE := $(LIKWID_BASE)
GROUPS_BASE := $(LIKWID_BASE)/groups
all: $(INSTALL_FOLDER)/liblikwid.a cleanup
else
DAEMON_BASE= $(LIKWID_BASE)/sbin
all: $(INSTALL_FOLDER)/liblikwid.a cleanup
endif
$(BUILD_FOLDER)/likwid-$(LIKWID_VERSION).tar.gz: $(BUILD_FOLDER)
wget -P $(BUILD_FOLDER) ftp://ftp.rrze.uni-erlangen.de/mirrors/likwid/likwid-$(LIKWID_VERSION).tar.gz
$(BUILD_FOLDER):
mkdir -p $(BUILD_FOLDER)
$(INSTALL_FOLDER):
mkdir -p $(INSTALL_FOLDER)
$(BUILD_FOLDER)/likwid-$(LIKWID_VERSION): $(BUILD_FOLDER)/likwid-$(LIKWID_VERSION).tar.gz
tar -C $(BUILD_FOLDER) -xf $(BUILD_FOLDER)/likwid-$(LIKWID_VERSION).tar.gz
$(INSTALL_FOLDER)/liblikwid.a: $(BUILD_FOLDER)/likwid-$(LIKWID_VERSION) $(INSTALL_FOLDER)
sed -i -e s+"PREFIX ?= .*"+"PREFIX = $(LIKWID_BASE)"+g \
-e s+"SHARED_LIBRARY = .*"+"SHARED_LIBRARY = false"+g \
-e s+"ACCESSMODE = .*"+"ACCESSMODE = $(ACCESSMODE)"+g \
-e s+"INSTALLED_ACCESSDAEMON = .*"+"INSTALLED_ACCESSDAEMON = $(DAEMON_INSTALLDIR)/likwid-accessD"+g \
$(BUILD_FOLDER)/likwid-$(LIKWID_VERSION)/config.mk
cd $(BUILD_FOLDER)/likwid-$(LIKWID_VERSION) && make
cp $(BUILD_FOLDER)/likwid-$(LIKWID_VERSION)/liblikwid.a $(INSTALL_FOLDER)
cp $(BUILD_FOLDER)/likwid-$(LIKWID_VERSION)/ext/hwloc/liblikwid-hwloc.a $(INSTALL_FOLDER)
cp $(BUILD_FOLDER)/likwid-$(LIKWID_VERSION)/src/includes/likwid*.h $(INSTALL_FOLDER)
cp $(BUILD_FOLDER)/likwid-$(LIKWID_VERSION)/src/includes/bstrlib.h $(INSTALL_FOLDER)
$(DAEMON_INSTALLDIR)/likwid-accessD: $(BUILD_FOLDER)/likwid-$(LIKWID_VERSION)/likwid-accessD
sudo -u $(DAEMON_USER) -g $(DAEMON_GROUP) install -m 4775 $(BUILD_FOLDER)/likwid-$(LIKWID_VERSION)/likwid-accessD $(DAEMON_INSTALLDIR)/likwid-accessD
prepare_collector: likwidMetric.go
cp likwidMetric.go likwidMetric.go.orig
sed -i -e s+"const GROUPPATH =.*"+"const GROUPPATH = \`$(GROUPS_BASE)\`"+g likwidMetric.go
cleanup:
rm -rf $(BUILD_FOLDER)
clean: cleanup
rm -rf likwid
.PHONY: clean
clean:
rm -rf likwid

View File

@@ -1,14 +1,3 @@
<!--
---
title: Metric Collectors
description: Metric collectors for cc-metric-collector
categories: [cc-metric-collector]
tags: ['Admin']
weight: 2
hugo_path: docs/reference/cc-metric-collector/collectors/_index.md
---
-->
# CCMetric collectors
This folder contains the collectors for the cc-metric-collector.
@@ -29,32 +18,21 @@ In contrast to the configuration files for sinks and receivers, the collectors c
* [`cpustat`](./cpustatMetric.md)
* [`memstat`](./memstatMetric.md)
* [`iostat`](./iostatMetric.md)
* [`diskstat`](./diskstatMetric.md)
* [`loadavg`](./loadavgMetric.md)
* [`netstat`](./netstatMetric.md)
* [`ibstat`](./infinibandMetric.md)
* [`tempstat`](./tempMetric.md)
* [`lustrestat`](./lustreMetric.md)
* [`lustre`](./lustreMetric.md)
* [`likwid`](./likwidMetric.md)
* [`nvidia`](./nvidiaMetric.md)
* [`customcmd`](./customCmdMetric.md)
* [`ipmistat`](./ipmiMetric.md)
* [`topprocs`](./topprocsMetric.md)
* [`nfs3stat`](./nfs3Metric.md)
* [`nfs4stat`](./nfs4Metric.md)
* [`nfsiostat`](./nfsiostatMetric.md)
* [`cpufreq`](./cpufreqMetric.md)
* [`cpufreq_cpuinfo`](./cpufreqCpuinfoMetric.md)
* [`schedstat`](./schedstatMetric.md)
* [`numastats`](./numastatsMetric.md)
* [`gpfs`](./gpfsMetric.md)
* [`beegfs_meta`](./beegfsmetaMetric.md)
* [`beegfs_storage`](./beegfsstorageMetric.md)
* [`rocm_smi`](./rocmsmiMetric.md)
## Todos
* [ ] Exclude devices for `diskstat` collector
* [ ] Aggreate metrics to higher topology entity (sum hwthread metrics to socket metric, ...). Needs to be configurable
# Contributing own collectors
@@ -63,7 +41,7 @@ A collector reads data from any source, parses it to metrics and submits these m
* `Name() string`: Return the name of the collector
* `Init(config json.RawMessage) error`: Initializes the collector using the given collector-specific config in JSON. Check if needed files/commands exists, ...
* `Initialized() bool`: Check if a collector is successfully initialized
* `Read(duration time.Duration, output chan ccMessage.CCMessage)`: Read, parse and submit data to the `output` channel as [`CCMessage`](https://github.com/ClusterCockpit/cc-lib/blob/main/ccMessage/README.md). If the collector has to measure anything for some duration, use the provided function argument `duration`.
* `Read(duration time.Duration, output chan ccMetric.CCMetric)`: Read, parse and submit data to the `output` channel as [`CCMetric`](../internal/ccMetric/README.md). If the collector has to measure anything for some duration, use the provided function argument `duration`.
* `Close()`: Closes down the collector.
It is recommanded to call `setup()` in the `Init()` function.
@@ -93,11 +71,6 @@ type SampleCollector struct {
}
func (m *SampleCollector) Init(config json.RawMessage) error {
// Check if already initialized
if m.init {
return nil
}
m.name = "SampleCollector"
m.setup()
if len(config) > 0 {
@@ -118,15 +91,10 @@ func (m *SampleCollector) Read(interval time.Duration, output chan lp.CCMetric)
}
// tags for the metric, if type != node use proper type and type-id
tags := map[string]string{"type" : "node"}
x, err := GetMetric()
if err != nil {
cclog.ComponentError(m.name, fmt.Sprintf("Read(): %v", err))
}
// Each metric has exactly one field: value !
value := map[string]interface{}{"value": int64(x)}
if y, err := lp.New("sample_metric", tags, m.meta, value, time.Now()); err == nil {
value := map[string]interface{}{"value": int(x)}
y, err := lp.New("sample_metric", tags, m.meta, value, time.Now())
if err == nil {
output <- y
}
}

View File

@@ -1,237 +0,0 @@
// Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
// All rights reserved. This file is part of cc-lib.
// Use of this source code is governed by a MIT-style
// license that can be found in the LICENSE file.
// additional authors:
// Holger Obermaier (NHR@KIT)
package collectors
import (
"bufio"
"bytes"
"encoding/json"
"fmt"
"io"
"os"
"os/exec"
"os/user"
"regexp"
"strconv"
"strings"
"time"
cclog "github.com/ClusterCockpit/cc-lib/ccLogger"
lp "github.com/ClusterCockpit/cc-lib/ccMessage"
)
const DEFAULT_BEEGFS_CMD = "beegfs-ctl"
// Struct for the collector-specific JSON config
type BeegfsMetaCollectorConfig struct {
Beegfs string `json:"beegfs_path"`
ExcludeMetrics []string `json:"exclude_metrics,omitempty"`
ExcludeFilesystem []string `json:"exclude_filesystem"`
}
type BeegfsMetaCollector struct {
metricCollector
tags map[string]string
matches map[string]string
config BeegfsMetaCollectorConfig
skipFS map[string]struct{}
}
func (m *BeegfsMetaCollector) Init(config json.RawMessage) error {
// Check if already initialized
if m.init {
return nil
}
// Metrics
var nodeMdstat_array = [39]string{
"sum", "ack", "close", "entInf",
"fndOwn", "mkdir", "create", "rddir",
"refrEn", "mdsInf", "rmdir", "rmLnk",
"mvDirIns", "mvFiIns", "open", "ren",
"sChDrct", "sAttr", "sDirPat", "stat",
"statfs", "trunc", "symlnk", "unlnk",
"lookLI", "statLI", "revalLI", "openLI",
"createLI", "hardlnk", "flckAp", "flckEn",
"flckRg", "dirparent", "listXA", "getXA",
"rmXA", "setXA", "mirror"}
m.name = "BeegfsMetaCollector"
m.setup()
m.parallel = true
// Set default beegfs-ctl binary
m.config.Beegfs = DEFAULT_BEEGFS_CMD
// Read JSON configuration
if len(config) > 0 {
err := json.Unmarshal(config, &m.config)
if err != nil {
return err
}
}
//create map with possible variables
m.matches = make(map[string]string)
for _, value := range nodeMdstat_array {
_, skip := stringArrayContains(m.config.ExcludeMetrics, value)
if skip {
m.matches["other"] = "0"
} else {
m.matches["beegfs_cmeta_"+value] = "0"
}
}
m.meta = map[string]string{
"source": m.name,
"group": "BeegfsMeta",
}
m.tags = map[string]string{
"type": "node",
"filesystem": "",
}
m.skipFS = make(map[string]struct{})
for _, fs := range m.config.ExcludeFilesystem {
m.skipFS[fs] = struct{}{}
}
// Beegfs file system statistics can only be queried by user root
user, err := user.Current()
if err != nil {
return fmt.Errorf("BeegfsMetaCollector.Init(): Failed to get current user: %v", err)
}
if user.Uid != "0" {
return fmt.Errorf("BeegfsMetaCollector.Init(): BeeGFS file system statistics can only be queried by user root")
}
// Check if beegfs-ctl is in executable search path
_, err = exec.LookPath(m.config.Beegfs)
if err != nil {
return fmt.Errorf("BeegfsMetaCollector.Init(): Failed to find beegfs-ctl binary '%s': %v", m.config.Beegfs, err)
}
m.init = true
return nil
}
func (m *BeegfsMetaCollector) Read(interval time.Duration, output chan lp.CCMessage) {
if !m.init {
return
}
//get mounpoint
buffer, _ := os.ReadFile(string("/proc/mounts"))
mounts := strings.Split(string(buffer), "\n")
var mountpoints []string
for _, line := range mounts {
if len(line) == 0 {
continue
}
f := strings.Fields(line)
if strings.Contains(f[0], "beegfs_ondemand") {
// Skip excluded filesystems
if _, skip := m.skipFS[f[1]]; skip {
continue
}
mountpoints = append(mountpoints, f[1])
}
}
if len(mountpoints) == 0 {
return
}
for _, mountpoint := range mountpoints {
m.tags["filesystem"] = mountpoint
// bwwgfs-ctl:
// --clientstats: Show client IO statistics.
// --nodetype=meta: The node type to query (meta, storage).
// --interval:
// --mount=/mnt/beeond/: Which mount point
//cmd := exec.Command(m.config.Beegfs, "/root/mc/test.txt")
mountoption := "--mount=" + mountpoint
cmd := exec.Command(m.config.Beegfs, "--clientstats",
"--nodetype=meta", mountoption, "--allstats")
cmd.Stdin = strings.NewReader("\n")
cmdStdout := new(bytes.Buffer)
cmdStderr := new(bytes.Buffer)
cmd.Stdout = cmdStdout
cmd.Stderr = cmdStderr
err := cmd.Run()
if err != nil {
fmt.Fprintf(os.Stderr, "BeegfsMetaCollector.Read(): Failed to execute command \"%s\": %s\n", cmd.String(), err.Error())
fmt.Fprintf(os.Stderr, "BeegfsMetaCollector.Read(): command exit code: \"%d\"\n", cmd.ProcessState.ExitCode())
data, _ := io.ReadAll(cmdStderr)
fmt.Fprintf(os.Stderr, "BeegfsMetaCollector.Read(): command stderr: \"%s\"\n", string(data))
data, _ = io.ReadAll(cmdStdout)
fmt.Fprintf(os.Stderr, "BeegfsMetaCollector.Read(): command stdout: \"%s\"\n", string(data))
return
}
// Read I/O statistics
scanner := bufio.NewScanner(cmdStdout)
sumLine := regexp.MustCompile(`^Sum:\s+\d+\s+\[[a-zA-Z]+\]+`)
//Line := regexp.MustCompile(`^(.*)\s+(\d)+\s+\[([a-zA-Z]+)\]+`)
statsLine := regexp.MustCompile(`^(.*?)\s+?(\d.*?)$`)
singleSpacePattern := regexp.MustCompile(`\s+`)
removePattern := regexp.MustCompile(`[\[|\]]`)
for scanner.Scan() {
readLine := scanner.Text()
//fmt.Println(readLine)
// Jump few lines, we only want the I/O stats from nodes
if !sumLine.MatchString(readLine) {
continue
}
match := statsLine.FindStringSubmatch(readLine)
// nodeName = "Sum:" or would be nodes
// nodeName := match[1]
//Remove multiple whitespaces
dummy := removePattern.ReplaceAllString(match[2], " ")
metaStats := strings.TrimSpace(singleSpacePattern.ReplaceAllString(dummy, " "))
split := strings.Split(metaStats, " ")
// fill map with values
// split[i+1] = mdname
// split[i] = amount of md operations
for i := 0; i <= len(split)-1; i += 2 {
if _, ok := m.matches[split[i+1]]; ok {
m.matches["beegfs_cmeta_"+split[i+1]] = split[i]
} else {
f1, err := strconv.ParseFloat(m.matches["other"], 32)
if err != nil {
cclog.ComponentError(
m.name,
fmt.Sprintf("Metric (other): Failed to convert str written '%s' to float: %v", m.matches["other"], err))
continue
}
f2, err := strconv.ParseFloat(split[i], 32)
if err != nil {
cclog.ComponentError(
m.name,
fmt.Sprintf("Metric (other): Failed to convert str written '%s' to float: %v", m.matches["other"], err))
continue
}
//mdStat["other"] = fmt.Sprintf("%f", f1+f2)
m.matches["beegfs_cstorage_other"] = fmt.Sprintf("%f", f1+f2)
}
}
for key, data := range m.matches {
value, _ := strconv.ParseFloat(data, 32)
y, err := lp.NewMessage(key, m.tags, m.meta, map[string]interface{}{"value": value}, time.Now())
if err == nil {
output <- y
}
}
}
}
}
func (m *BeegfsMetaCollector) Close() {
m.init = false
}

View File

@@ -1,87 +0,0 @@
<!--
---
title: BeeGFS metadata metric collector
description: Collect metadata clientstats for `BeeGFS on Demand`
categories: [cc-metric-collector]
tags: ['Admin']
weight: 2
hugo_path: docs/reference/cc-metric-collector/collectors/beegfsmeta.md
---
-->
## `BeeGFS on Demand` collector
This Collector is to collect `BeeGFS on Demand` (BeeOND) metadata clientstats.
```json
"beegfs_meta": {
"beegfs_path": "/usr/bin/beegfs-ctl",
"exclude_filesystem": [
"/mnt/ignore_me"
],
"exclude_metrics": [
"ack",
"entInf",
"fndOwn"
]
}
```
The `BeeGFS On Demand (BeeOND)` collector uses the `beegfs-ctl` command to read performance metrics for
BeeGFS filesystems.
The reported filesystems can be filtered with the `exclude_filesystem` option
in the configuration.
The path to the `beegfs-ctl` command can be configured with the `beegfs_path` option
in the configuration.
When using the `exclude_metrics` option, the excluded metrics are summed as `other`.
Important: The metrics listed below, are similar to the naming of BeeGFS. The Collector prefixes these with `beegfs_cstorage`(beegfs client storage).
For example beegfs metric `open`-> `beegfs_cstorage_open`
Available Metrics:
* sum
* ack
* close
* entInf
* fndOwn
* mkdir
* create
* rddir
* refrEnt
* mdsInf
* rmdir
* rmLnk
* mvDirIns
* mvFiIns
* open
* ren
* sChDrct
* sAttr
* sDirPat
* stat
* statfs
* trunc
* symlnk
* unlnk
* lookLI
* statLI
* revalLI
* openLI
* createLI
* hardlnk
* flckAp
* flckEn
* flckRg
* dirparent
* listXA
* getXA
* rmXA
* setXA
* mirror
The collector adds a `filesystem` tag to all metrics

View File

@@ -1,229 +0,0 @@
// Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
// All rights reserved. This file is part of cc-lib.
// Use of this source code is governed by a MIT-style
// license that can be found in the LICENSE file.
// additional authors:
// Holger Obermaier (NHR@KIT)
package collectors
import (
"bufio"
"bytes"
"encoding/json"
"fmt"
"io"
"os"
"os/exec"
"os/user"
"regexp"
"strconv"
"strings"
"time"
cclog "github.com/ClusterCockpit/cc-lib/ccLogger"
lp "github.com/ClusterCockpit/cc-lib/ccMessage"
)
// Struct for the collector-specific JSON config
type BeegfsStorageCollectorConfig struct {
Beegfs string `json:"beegfs_path"`
ExcludeMetrics []string `json:"exclude_metrics,omitempty"`
ExcludeFilesystem []string `json:"exclude_filesystem"`
}
type BeegfsStorageCollector struct {
metricCollector
tags map[string]string
matches map[string]string
config BeegfsStorageCollectorConfig
skipFS map[string]struct{}
}
func (m *BeegfsStorageCollector) Init(config json.RawMessage) error {
// Check if already initialized
if m.init {
return nil
}
// Metrics
var storageStat_array = [18]string{
"sum", "ack", "sChDrct", "getFSize",
"sAttr", "statfs", "trunc", "close",
"fsync", "ops-rd", "MiB-rd/s", "ops-wr",
"MiB-wr/s", "gendbg", "hrtbeat", "remNode",
"storInf", "unlnk"}
m.name = "BeegfsStorageCollector"
m.setup()
m.parallel = true
// Set default beegfs-ctl binary
m.config.Beegfs = DEFAULT_BEEGFS_CMD
// Read JSON configuration
if len(config) > 0 {
err := json.Unmarshal(config, &m.config)
if err != nil {
return err
}
}
println(m.config.Beegfs)
//create map with possible variables
m.matches = make(map[string]string)
for _, value := range storageStat_array {
_, skip := stringArrayContains(m.config.ExcludeMetrics, value)
if skip {
m.matches["other"] = "0"
} else {
m.matches["beegfs_cstorage_"+value] = "0"
}
}
m.meta = map[string]string{
"source": m.name,
"group": "BeegfsStorage",
}
m.tags = map[string]string{
"type": "node",
"filesystem": "",
}
m.skipFS = make(map[string]struct{})
for _, fs := range m.config.ExcludeFilesystem {
m.skipFS[fs] = struct{}{}
}
// Beegfs file system statistics can only be queried by user root
user, err := user.Current()
if err != nil {
return fmt.Errorf("BeegfsStorageCollector.Init(): Failed to get current user: %v", err)
}
if user.Uid != "0" {
return fmt.Errorf("BeegfsStorageCollector.Init(): BeeGFS file system statistics can only be queried by user root")
}
// Check if beegfs-ctl is in executable search path
_, err = exec.LookPath(m.config.Beegfs)
if err != nil {
return fmt.Errorf("BeegfsStorageCollector.Init(): Failed to find beegfs-ctl binary '%s': %v", m.config.Beegfs, err)
}
m.init = true
return nil
}
func (m *BeegfsStorageCollector) Read(interval time.Duration, output chan lp.CCMessage) {
if !m.init {
return
}
//get mounpoint
buffer, _ := os.ReadFile(string("/proc/mounts"))
mounts := strings.Split(string(buffer), "\n")
var mountpoints []string
for _, line := range mounts {
if len(line) == 0 {
continue
}
f := strings.Fields(line)
if strings.Contains(f[0], "beegfs_ondemand") {
// Skip excluded filesystems
if _, skip := m.skipFS[f[1]]; skip {
continue
}
mountpoints = append(mountpoints, f[1])
}
}
if len(mountpoints) == 0 {
return
}
// collects stats for each BeeGFS on Demand FS
for _, mountpoint := range mountpoints {
m.tags["filesystem"] = mountpoint
// bwwgfs-ctl:
// --clientstats: Show client IO statistics.
// --nodetype=meta: The node type to query (meta, storage).
// --interval:
// --mount=/mnt/beeond/: Which mount point
//cmd := exec.Command(m.config.Beegfs, "/root/mc/test.txt")
mountoption := "--mount=" + mountpoint
cmd := exec.Command(m.config.Beegfs, "--clientstats",
"--nodetype=storage", mountoption, "--allstats")
cmd.Stdin = strings.NewReader("\n")
cmdStdout := new(bytes.Buffer)
cmdStderr := new(bytes.Buffer)
cmd.Stdout = cmdStdout
cmd.Stderr = cmdStderr
err := cmd.Run()
if err != nil {
fmt.Fprintf(os.Stderr, "BeegfsStorageCollector.Read(): Failed to execute command \"%s\": %s\n", cmd.String(), err.Error())
fmt.Fprintf(os.Stderr, "BeegfsStorageCollector.Read(): command exit code: \"%d\"\n", cmd.ProcessState.ExitCode())
data, _ := io.ReadAll(cmdStderr)
fmt.Fprintf(os.Stderr, "BeegfsStorageCollector.Read(): command stderr: \"%s\"\n", string(data))
data, _ = io.ReadAll(cmdStdout)
fmt.Fprintf(os.Stderr, "BeegfsStorageCollector.Read(): command stdout: \"%s\"\n", string(data))
return
}
// Read I/O statistics
scanner := bufio.NewScanner(cmdStdout)
sumLine := regexp.MustCompile(`^Sum:\s+\d+\s+\[[a-zA-Z]+\]+`)
//Line := regexp.MustCompile(`^(.*)\s+(\d)+\s+\[([a-zA-Z]+)\]+`)
statsLine := regexp.MustCompile(`^(.*?)\s+?(\d.*?)$`)
singleSpacePattern := regexp.MustCompile(`\s+`)
removePattern := regexp.MustCompile(`[\[|\]]`)
for scanner.Scan() {
readLine := scanner.Text()
//fmt.Println(readLine)
// Jump few lines, we only want the I/O stats from nodes
if !sumLine.MatchString(readLine) {
continue
}
match := statsLine.FindStringSubmatch(readLine)
// nodeName = "Sum:" or would be nodes
// nodeName := match[1]
//Remove multiple whitespaces
dummy := removePattern.ReplaceAllString(match[2], " ")
metaStats := strings.TrimSpace(singleSpacePattern.ReplaceAllString(dummy, " "))
split := strings.Split(metaStats, " ")
// fill map with values
// split[i+1] = mdname
// split[i] = amount of operations
for i := 0; i <= len(split)-1; i += 2 {
if _, ok := m.matches[split[i+1]]; ok {
m.matches["beegfs_cstorage_"+split[i+1]] = split[i]
//m.matches[split[i+1]] = split[i]
} else {
f1, err := strconv.ParseFloat(m.matches["other"], 32)
if err != nil {
cclog.ComponentError(
m.name,
fmt.Sprintf("Metric (other): Failed to convert str written '%s' to float: %v", m.matches["other"], err))
continue
}
f2, err := strconv.ParseFloat(split[i], 32)
if err != nil {
cclog.ComponentError(
m.name,
fmt.Sprintf("Metric (other): Failed to convert str written '%s' to float: %v", m.matches["other"], err))
continue
}
m.matches["beegfs_cstorage_other"] = fmt.Sprintf("%f", f1+f2)
}
}
for key, data := range m.matches {
value, _ := strconv.ParseFloat(data, 32)
y, err := lp.NewMessage(key, m.tags, m.meta, map[string]interface{}{"value": value}, time.Now())
if err == nil {
output <- y
}
}
}
}
}
func (m *BeegfsStorageCollector) Close() {
m.init = false
}

View File

@@ -1,66 +0,0 @@
<!--
---
title: "BeeGFS on Demand metric collector"
description: Collect performance metrics for BeeGFS filesystems
categories: [cc-metric-collector]
tags: ['Admin']
weight: 2
hugo_path: docs/reference/cc-metric-collector/collectors/beegfsstorage.md
---
-->
## `BeeGFS on Demand` collector
This Collector is to collect BeeGFS on Demand (BeeOND) storage stats.
```json
"beegfs_storage": {
"beegfs_path": "/usr/bin/beegfs-ctl",
"exclude_filesystem": [
"/mnt/ignore_me"
],
"exclude_metrics": [
"ack",
"storInf",
"unlnk"
]
}
```
The `BeeGFS On Demand (BeeOND)` collector uses the `beegfs-ctl` command to read performance metrics for BeeGFS filesystems.
The reported filesystems can be filtered with the `exclude_filesystem` option
in the configuration.
The path to the `beegfs-ctl` command can be configured with the `beegfs_path` option
in the configuration.
When using the `exclude_metrics` option, the excluded metrics are summed as `other`.
Important: The metrics listed below, are similar to the naming of BeeGFS. The Collector prefixes these with `beegfs_cstorage_`(beegfs client meta).
For example beegfs metric `open`-> `beegfs_cstorage_`
Note: BeeGFS FS offers many Metadata Information. Probably it makes sense to exlcude most of them. Nevertheless, these excluded metrics will be summed as `beegfs_cstorage_other`.
Available Metrics:
* "sum"
* "ack"
* "sChDrct"
* "getFSize"
* "sAttr"
* "statfs"
* "trunc"
* "close"
* "fsync"
* "ops-rd"
* "MiB-rd/s"
* "ops-wr"
* "MiB-wr/s"
* "endbg"
* "hrtbeat"
* "remNode"
* "storInf"
* "unlnk"
The collector adds a `filesystem` tag to all metrics

View File

@@ -1,72 +1,54 @@
// Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
// All rights reserved. This file is part of cc-lib.
// Use of this source code is governed by a MIT-style
// license that can be found in the LICENSE file.
// additional authors:
// Holger Obermaier (NHR@KIT)
package collectors
import (
"encoding/json"
"os"
"sync"
"time"
cclog "github.com/ClusterCockpit/cc-lib/ccLogger"
lp "github.com/ClusterCockpit/cc-lib/ccMessage"
mct "github.com/ClusterCockpit/cc-metric-collector/pkg/multiChanTicker"
cclog "github.com/ClusterCockpit/cc-metric-collector/internal/ccLogger"
lp "github.com/ClusterCockpit/cc-metric-collector/internal/ccMetric"
mct "github.com/ClusterCockpit/cc-metric-collector/internal/multiChanTicker"
)
// Map of all available metric collectors
var AvailableCollectors = map[string]MetricCollector{
"likwid": new(LikwidCollector),
"loadavg": new(LoadavgCollector),
"memstat": new(MemstatCollector),
"netstat": new(NetstatCollector),
"ibstat": new(InfinibandCollector),
"lustrestat": new(LustreCollector),
"cpustat": new(CpustatCollector),
"topprocs": new(TopProcsCollector),
"nvidia": new(NvidiaCollector),
"customcmd": new(CustomCmdCollector),
"iostat": new(IOstatCollector),
"diskstat": new(DiskstatCollector),
"tempstat": new(TempCollector),
"ipmistat": new(IpmiCollector),
"gpfs": new(GpfsCollector),
"cpufreq": new(CPUFreqCollector),
"cpufreq_cpuinfo": new(CPUFreqCpuInfoCollector),
"nfs3stat": new(Nfs3Collector),
"nfs4stat": new(Nfs4Collector),
"numastats": new(NUMAStatsCollector),
"beegfs_meta": new(BeegfsMetaCollector),
"beegfs_storage": new(BeegfsStorageCollector),
"rapl": new(RAPLCollector),
"rocm_smi": new(RocmSmiCollector),
"self": new(SelfCollector),
"schedstat": new(SchedstatCollector),
"nfsiostat": new(NfsIOStatCollector),
"likwid": new(LikwidCollector),
"loadavg": new(LoadavgCollector),
"memstat": new(MemstatCollector),
"netstat": new(NetstatCollector),
"ibstat": new(InfinibandCollector),
"ibstat_perfquery": new(InfinibandPerfQueryCollector),
"lustrestat": new(LustreCollector),
"cpustat": new(CpustatCollector),
"topprocs": new(TopProcsCollector),
"nvidia": new(NvidiaCollector),
"customcmd": new(CustomCmdCollector),
"diskstat": new(DiskstatCollector),
"tempstat": new(TempCollector),
"ipmistat": new(IpmiCollector),
"gpfs": new(GpfsCollector),
"cpufreq": new(CPUFreqCollector),
"cpufreq_cpuinfo": new(CPUFreqCpuInfoCollector),
"nfsstat": new(NfsCollector),
}
// Metric collector manager data structure
type collectorManager struct {
collectors []MetricCollector // List of metric collectors to read in parallel
serial []MetricCollector // List of metric collectors to read serially
output chan lp.CCMessage // Output channels
done chan bool // channel to finish / stop metric collector manager
ticker mct.MultiChanTicker // periodically ticking once each interval
duration time.Duration // duration (for metrics that measure over a given duration)
wg *sync.WaitGroup // wait group for all goroutines in cc-metric-collector
config map[string]json.RawMessage // json encoded config for collector manager
collector_wg sync.WaitGroup // internally used wait group for the parallel reading of collector
parallel_run bool // Flag whether the collectors are currently read in parallel
collectors []MetricCollector // List of metric collectors to use
output chan lp.CCMetric // Output channels
done chan bool // channel to finish / stop metric collector manager
ticker mct.MultiChanTicker // periodically ticking once each interval
duration time.Duration // duration (for metrics that measure over a given duration)
wg *sync.WaitGroup // wait group for all goroutines in cc-metric-collector
config map[string]json.RawMessage // json encoded config for collector manager
}
// Metric collector manager access functions
type CollectorManager interface {
Init(ticker mct.MultiChanTicker, duration time.Duration, wg *sync.WaitGroup, collectConfig json.RawMessage) error
AddOutput(output chan lp.CCMessage)
Init(ticker mct.MultiChanTicker, duration time.Duration, wg *sync.WaitGroup, collectConfigFile string) error
AddOutput(output chan lp.CCMetric)
Start()
Close()
}
@@ -78,16 +60,23 @@ type CollectorManager interface {
// * ticker (from variable ticker)
// * configuration (read from config file in variable collectConfigFile)
// Initialization is done for all configured collectors
func (cm *collectorManager) Init(ticker mct.MultiChanTicker, duration time.Duration, wg *sync.WaitGroup, collectConfig json.RawMessage) error {
func (cm *collectorManager) Init(ticker mct.MultiChanTicker, duration time.Duration, wg *sync.WaitGroup, collectConfigFile string) error {
cm.collectors = make([]MetricCollector, 0)
cm.serial = make([]MetricCollector, 0)
cm.output = nil
cm.done = make(chan bool)
cm.wg = wg
cm.ticker = ticker
cm.duration = duration
err := json.Unmarshal(collectConfig, &cm.config)
// Read collector config file
configFile, err := os.Open(collectConfigFile)
if err != nil {
cclog.Error(err.Error())
return err
}
defer configFile.Close()
jsonParser := json.NewDecoder(configFile)
err = jsonParser.Decode(&cm.config)
if err != nil {
cclog.Error(err.Error())
return err
@@ -107,11 +96,7 @@ func (cm *collectorManager) Init(ticker mct.MultiChanTicker, duration time.Durat
continue
}
cclog.ComponentDebug("CollectorManager", "ADD COLLECTOR", collector.Name())
if collector.Parallel() {
cm.collectors = append(cm.collectors, collector)
} else {
cm.serial = append(cm.serial, collector)
}
cm.collectors = append(cm.collectors, collector)
}
return nil
}
@@ -127,10 +112,6 @@ func (cm *collectorManager) Start() {
// Collector manager is done
done := func() {
// close all metric collectors
if cm.parallel_run {
cm.collector_wg.Wait()
cm.parallel_run = false
}
for _, c := range cm.collectors {
c.Close()
}
@@ -145,26 +126,7 @@ func (cm *collectorManager) Start() {
done()
return
case t := <-tick:
cm.parallel_run = true
for _, c := range cm.collectors {
// Wait for done signal or execute the collector
select {
case <-cm.done:
done()
return
default:
// Read metrics from collector c via goroutine
cclog.ComponentDebug("CollectorManager", c.Name(), t)
cm.collector_wg.Add(1)
go func(myc MetricCollector) {
myc.Read(cm.duration, cm.output)
cm.collector_wg.Done()
}(c)
}
}
cm.collector_wg.Wait()
cm.parallel_run = false
for _, c := range cm.serial {
// Wait for done signal or execute the collector
select {
case <-cm.done:
@@ -185,7 +147,7 @@ func (cm *collectorManager) Start() {
}
// AddOutput adds the output channel to the metric collector manager
func (cm *collectorManager) AddOutput(output chan lp.CCMessage) {
func (cm *collectorManager) AddOutput(output chan lp.CCMetric) {
cm.output = output
}
@@ -198,9 +160,9 @@ func (cm *collectorManager) Close() {
}
// New creates a new initialized metric collector manager
func New(ticker mct.MultiChanTicker, duration time.Duration, wg *sync.WaitGroup, collectConfig json.RawMessage) (CollectorManager, error) {
func New(ticker mct.MultiChanTicker, duration time.Duration, wg *sync.WaitGroup, collectConfigFile string) (CollectorManager, error) {
cm := new(collectorManager)
err := cm.Init(ticker, duration, wg, collectConfig)
err := cm.Init(ticker, duration, wg, collectConfigFile)
if err != nil {
return nil, err
}

View File

@@ -1,10 +1,3 @@
// Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
// All rights reserved. This file is part of cc-lib.
// Use of this source code is governed by a MIT-style
// license that can be found in the LICENSE file.
// additional authors:
// Holger Obermaier (NHR@KIT)
package collectors
import (
@@ -12,22 +5,33 @@ import (
"encoding/json"
"fmt"
"log"
"os"
"strconv"
"strings"
"time"
lp "github.com/ClusterCockpit/cc-metric-collector/internal/ccMetric"
cclog "github.com/ClusterCockpit/cc-lib/ccLogger"
lp "github.com/ClusterCockpit/cc-lib/ccMessage"
)
//
// CPUFreqCollector
// a metric collector to measure the current frequency of the CPUs
// as obtained from /proc/cpuinfo
// Only measure on the first hyperthread
//
type CPUFreqCpuInfoCollectorTopology struct {
isHT bool
tagSet map[string]string
processor string // logical processor number (continuous, starting at 0)
coreID string // socket local core ID
coreID_int int
physicalPackageID string // socket / package ID
physicalPackageID_int int
numPhysicalPackages string // number of sockets / packages
numPhysicalPackages_int int
isHT bool
numNonHT string // number of non hyperthreading processors
numNonHT_int int
tagSet map[string]string
}
type CPUFreqCpuInfoCollector struct {
@@ -36,37 +40,28 @@ type CPUFreqCpuInfoCollector struct {
}
func (m *CPUFreqCpuInfoCollector) Init(config json.RawMessage) error {
// Check if already initialized
if m.init {
return nil
}
m.setup()
m.name = "CPUFreqCpuInfoCollector"
m.parallel = true
m.meta = map[string]string{
"source": m.name,
"group": "CPU",
"unit": "MHz",
"group": "cpufreq",
}
const cpuInfoFile = "/proc/cpuinfo"
file, err := os.Open(cpuInfoFile)
if err != nil {
return fmt.Errorf("failed to open file '%s': %v", cpuInfoFile, err)
return fmt.Errorf("Failed to open '%s': %v", cpuInfoFile, err)
}
defer file.Close()
// Collect topology information from file cpuinfo
foundFreq := false
processor := ""
numNonHT_int := 0
coreID := ""
physicalPackageID := ""
maxPhysicalPackageID := 0
m.topology = make([]CPUFreqCpuInfoCollectorTopology, 0)
coreSeenBefore := make(map[string]bool)
// Read cpuinfo file, line by line
scanner := bufio.NewScanner(file)
for scanner.Scan() {
lineSplit := strings.Split(scanner.Text(), ":")
@@ -92,22 +87,39 @@ func (m *CPUFreqCpuInfoCollector) Init(config json.RawMessage) error {
len(coreID) > 0 &&
len(physicalPackageID) > 0 {
coreID_int, err := strconv.Atoi(coreID)
if err != nil {
return fmt.Errorf("Unable to convert coreID to int: %v", err)
}
physicalPackageID_int, err := strconv.Atoi(physicalPackageID)
if err != nil {
return fmt.Errorf("Unable to convert physicalPackageID to int: %v", err)
}
// increase maximun socket / package ID, when required
if physicalPackageID_int > maxPhysicalPackageID {
maxPhysicalPackageID = physicalPackageID_int
}
globalID := physicalPackageID + ":" + coreID
isHT := coreSeenBefore[globalID]
coreSeenBefore[globalID] = true
if !isHT {
// increase number on non hyper thread cores
numNonHT_int++
}
// store collected topology information
m.topology = append(m.topology,
m.topology = append(
m.topology,
CPUFreqCpuInfoCollectorTopology{
isHT: coreSeenBefore[globalID],
tagSet: map[string]string{
"type": "hwthread",
"type-id": processor,
"package_id": physicalPackageID,
},
},
)
// mark core as seen before
coreSeenBefore[globalID] = true
processor: processor,
coreID: coreID,
coreID_int: coreID_int,
physicalPackageID: physicalPackageID,
physicalPackageID_int: physicalPackageID_int,
isHT: isHT,
})
// reset topology information
foundFreq = false
@@ -117,27 +129,37 @@ func (m *CPUFreqCpuInfoCollector) Init(config json.RawMessage) error {
}
}
// Check if at least one CPU with frequency information was detected
if len(m.topology) == 0 {
return fmt.Errorf("no CPU frequency info found in %s", cpuInfoFile)
numPhysicalPackageID_int := maxPhysicalPackageID + 1
numPhysicalPackageID := fmt.Sprint(numPhysicalPackageID_int)
numNonHT := fmt.Sprint(numNonHT_int)
for i := range m.topology {
t := &m.topology[i]
t.numPhysicalPackages = numPhysicalPackageID
t.numPhysicalPackages_int = numPhysicalPackageID_int
t.numNonHT = numNonHT
t.numNonHT_int = numNonHT_int
t.tagSet = map[string]string{
"type": "cpu",
"type-id": t.processor,
"num_core": t.numNonHT,
"package_id": t.physicalPackageID,
"num_package": t.numPhysicalPackages,
}
}
m.init = true
return nil
}
func (m *CPUFreqCpuInfoCollector) Read(interval time.Duration, output chan lp.CCMessage) {
// Check if already initialized
func (m *CPUFreqCpuInfoCollector) Read(interval time.Duration, output chan lp.CCMetric) {
if !m.init {
return
}
const cpuInfoFile = "/proc/cpuinfo"
file, err := os.Open(cpuInfoFile)
if err != nil {
cclog.ComponentError(
m.name,
fmt.Sprintf("Read(): Failed to open file '%s': %v", cpuInfoFile, err))
log.Printf("Failed to open '%s': %v", cpuInfoFile, err)
return
}
defer file.Close()
@@ -152,16 +174,15 @@ func (m *CPUFreqCpuInfoCollector) Read(interval time.Duration, output chan lp.CC
// frequency
if key == "cpu MHz" {
t := m.topology[processorCounter]
t := &m.topology[processorCounter]
if !t.isHT {
value, err := strconv.ParseFloat(strings.TrimSpace(lineSplit[1]), 64)
if err != nil {
cclog.ComponentError(
m.name,
fmt.Sprintf("Read(): Failed to convert cpu MHz '%s' to float64: %v", lineSplit[1], err))
log.Printf("Failed to convert cpu MHz to float: %v", err)
return
}
if y, err := lp.NewMessage("cpufreq", t.tagSet, m.meta, map[string]interface{}{"value": value}, now); err == nil {
y, err := lp.New("cpufreq", t.tagSet, m.meta, map[string]interface{}{"value": value}, now)
if err == nil {
output <- y
}
}

View File

@@ -1,22 +0,0 @@
<!--
---
title: CPU frequency metric collector through cpuinfo
description: Collect the CPU frequency from `/proc/cpuinfo`
categories: [cc-metric-collector]
tags: ['Admin']
weight: 2
hugo_path: docs/reference/cc-metric-collector/collectors/cpufreq_cpuinfo.md
---
-->
## `cpufreq_cpuinfo` collector
```json
"cpufreq_cpuinfo": {}
```
The `cpufreq_cpuinfo` collector reads the clock frequency from `/proc/cpuinfo` and outputs a handful **hwthread** metrics.
Metrics:
* `cpufreq`

View File

@@ -1,38 +1,60 @@
// Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
// All rights reserved. This file is part of cc-lib.
// Use of this source code is governed by a MIT-style
// license that can be found in the LICENSE file.
// additional authors:
// Holger Obermaier (NHR@KIT)
package collectors
import (
"bufio"
"encoding/json"
"fmt"
"log"
"os"
"path/filepath"
"strconv"
"strings"
"time"
cclog "github.com/ClusterCockpit/cc-lib/ccLogger"
lp "github.com/ClusterCockpit/cc-lib/ccMessage"
"github.com/ClusterCockpit/cc-metric-collector/pkg/ccTopology"
lp "github.com/ClusterCockpit/cc-metric-collector/internal/ccMetric"
"golang.org/x/sys/unix"
)
type CPUFreqCollectorTopology struct {
scalingCurFreqFile string
tagSet map[string]string
//
// readOneLine reads one line from a file.
// It returns ok when file was successfully read.
// In this case text contains the first line of the files contents.
//
func readOneLine(filename string) (text string, ok bool) {
file, err := os.Open(filename)
if err != nil {
return
}
defer file.Close()
scanner := bufio.NewScanner(file)
ok = scanner.Scan()
text = scanner.Text()
return
}
type CPUFreqCollectorTopology struct {
processor string // logical processor number (continuous, starting at 0)
coreID string // socket local core ID
coreID_int int
physicalPackageID string // socket / package ID
physicalPackageID_int int
numPhysicalPackages string // number of sockets / packages
numPhysicalPackages_int int
isHT bool
numNonHT string // number of non hyperthreading processors
numNonHT_int int
scalingCurFreqFile string
tagSet map[string]string
}
//
// CPUFreqCollector
// a metric collector to measure the current frequency of the CPUs
// as obtained from the hardware (in KHz)
// Only measure on the first hyper-thread
// Only measure on the first hyper thread
//
// See: https://www.kernel.org/doc/html/latest/admin-guide/pm/cpufreq.html
//
type CPUFreqCollector struct {
metricCollector
topology []CPUFreqCollectorTopology
@@ -42,14 +64,8 @@ type CPUFreqCollector struct {
}
func (m *CPUFreqCollector) Init(config json.RawMessage) error {
// Check if already initialized
if m.init {
return nil
}
m.name = "CPUFreqCollector"
m.setup()
m.parallel = true
if len(config) > 0 {
err := json.Unmarshal(config, &m.config)
if err != nil {
@@ -58,48 +74,116 @@ func (m *CPUFreqCollector) Init(config json.RawMessage) error {
}
m.meta = map[string]string{
"source": m.name,
"group": "CPU",
"unit": "Hz",
"group": "CPU Frequency",
}
m.topology = make([]CPUFreqCollectorTopology, 0)
for _, c := range ccTopology.CpuData() {
// Loop for all CPU directories
baseDir := "/sys/devices/system/cpu"
globPattern := filepath.Join(baseDir, "cpu[0-9]*")
cpuDirs, err := filepath.Glob(globPattern)
if err != nil {
return fmt.Errorf("CPUFreqCollector.Init() unable to glob files with pattern %s: %v", globPattern, err)
}
if cpuDirs == nil {
return fmt.Errorf("CPUFreqCollector.Init() unable to find any files with pattern %s", globPattern)
}
// Skip hyper threading CPUs
if c.CpuID != c.CoreCPUsList[0] {
continue
// Initialize CPU topology
m.topology = make([]CPUFreqCollectorTopology, len(cpuDirs))
for _, cpuDir := range cpuDirs {
processor := strings.TrimPrefix(cpuDir, "/sys/devices/system/cpu/cpu")
processor_int, err := strconv.Atoi(processor)
if err != nil {
return fmt.Errorf("CPUFreqCollector.Init() unable to convert cpuID to int: %v", err)
}
// Read package ID
physicalPackageIDFile := filepath.Join(cpuDir, "topology", "physical_package_id")
physicalPackageID, ok := readOneLine(physicalPackageIDFile)
if !ok {
return fmt.Errorf("CPUFreqCollector.Init() unable to read physical package ID from %s", physicalPackageIDFile)
}
physicalPackageID_int, err := strconv.Atoi(physicalPackageID)
if err != nil {
return fmt.Errorf("CPUFreqCollector.Init() unable to convert packageID to int: %v", err)
}
// Read core ID
coreIDFile := filepath.Join(cpuDir, "topology", "core_id")
coreID, ok := readOneLine(coreIDFile)
if !ok {
return fmt.Errorf("CPUFreqCollector.Init() unable to read core ID from %s", coreIDFile)
}
coreID_int, err := strconv.Atoi(coreID)
if err != nil {
return fmt.Errorf("CPUFreqCollector.Init() unable to convert coreID to int: %v", err)
}
// Check access to current frequency file
scalingCurFreqFile := filepath.Join("/sys/devices/system/cpu", fmt.Sprintf("cpu%d", c.CpuID), "cpufreq/scaling_cur_freq")
err := unix.Access(scalingCurFreqFile, unix.R_OK)
scalingCurFreqFile := filepath.Join(cpuDir, "cpufreq", "scaling_cur_freq")
err = unix.Access(scalingCurFreqFile, unix.R_OK)
if err != nil {
return fmt.Errorf("unable to access file '%s': %v", scalingCurFreqFile, err)
return fmt.Errorf("CPUFreqCollector.Init() unable to access %s: %v", scalingCurFreqFile, err)
}
m.topology = append(m.topology,
CPUFreqCollectorTopology{
tagSet: map[string]string{
"type": "hwthread",
"type-id": fmt.Sprint(c.CpuID),
"package_id": fmt.Sprint(c.Socket),
},
scalingCurFreqFile: scalingCurFreqFile,
},
)
t := &m.topology[processor_int]
t.processor = processor
t.physicalPackageID = physicalPackageID
t.physicalPackageID_int = physicalPackageID_int
t.coreID = coreID
t.coreID_int = coreID_int
t.scalingCurFreqFile = scalingCurFreqFile
}
// is processor a hyperthread?
coreSeenBefore := make(map[string]bool)
for i := range m.topology {
t := &m.topology[i]
globalID := t.physicalPackageID + ":" + t.coreID
t.isHT = coreSeenBefore[globalID]
coreSeenBefore[globalID] = true
}
// number of non hyper thread cores and packages / sockets
numNonHT_int := 0
maxPhysicalPackageID := 0
for i := range m.topology {
t := &m.topology[i]
// Update maxPackageID
if t.physicalPackageID_int > maxPhysicalPackageID {
maxPhysicalPackageID = t.physicalPackageID_int
}
if !t.isHT {
numNonHT_int++
}
}
numPhysicalPackageID_int := maxPhysicalPackageID + 1
numPhysicalPackageID := fmt.Sprint(numPhysicalPackageID_int)
numNonHT := fmt.Sprint(numNonHT_int)
for i := range m.topology {
t := &m.topology[i]
t.numPhysicalPackages = numPhysicalPackageID
t.numPhysicalPackages_int = numPhysicalPackageID_int
t.numNonHT = numNonHT
t.numNonHT_int = numNonHT_int
t.tagSet = map[string]string{
"type": "cpu",
"type-id": t.processor,
"num_core": t.numNonHT,
"package_id": t.physicalPackageID,
"num_package": t.numPhysicalPackages,
}
}
// Initialized
cclog.ComponentDebug(
m.name,
"initialized",
len(m.topology), "non-hyper-threading CPUs")
m.init = true
return nil
}
func (m *CPUFreqCollector) Read(interval time.Duration, output chan lp.CCMessage) {
// Check if already initialized
func (m *CPUFreqCollector) Read(interval time.Duration, output chan lp.CCMetric) {
if !m.init {
return
}
@@ -108,23 +192,25 @@ func (m *CPUFreqCollector) Read(interval time.Duration, output chan lp.CCMessage
for i := range m.topology {
t := &m.topology[i]
// Read current frequency
line, err := os.ReadFile(t.scalingCurFreqFile)
if err != nil {
cclog.ComponentError(
m.name,
fmt.Sprintf("Read(): Failed to read file '%s': %v", t.scalingCurFreqFile, err))
continue
}
cpuFreq, err := strconv.ParseInt(strings.TrimSpace(string(line)), 10, 64)
if err != nil {
cclog.ComponentError(
m.name,
fmt.Sprintf("Read(): Failed to convert CPU frequency '%s' to int64: %v", line, err))
// skip hyperthreads
if t.isHT {
continue
}
if y, err := lp.NewMessage("cpufreq", t.tagSet, m.meta, map[string]interface{}{"value": cpuFreq}, now); err == nil {
// Read current frequency
line, ok := readOneLine(t.scalingCurFreqFile)
if !ok {
log.Printf("CPUFreqCollector.Read(): Failed to read one line from file '%s'", t.scalingCurFreqFile)
continue
}
cpuFreq, err := strconv.Atoi(line)
if err != nil {
log.Printf("CPUFreqCollector.Read(): Failed to convert CPU frequency '%s': %v", line, err)
continue
}
y, err := lp.New("cpufreq", t.tagSet, m.meta, map[string]interface{}{"value": cpuFreq}, now)
if err == nil {
output <- y
}
}

View File

@@ -1,24 +0,0 @@
<!--
---
title: CPU frequency metric collector through sysfs
description: Collect the CPU frequency metrics from `/sys/.../cpu/.../cpufreq`
categories: [cc-metric-collector]
tags: ['Admin']
weight: 2
hugo_path: docs/reference/cc-metric-collector/collectors/cpufreq.md
---
-->
## `cpufreq_cpuinfo` collector
```json
"cpufreq": {
"exclude_metrics": []
}
```
The `cpufreq` collector reads the clock frequency from `/sys/devices/system/cpu/cpu*/cpufreq` and outputs a handful **hwthread** metrics.
Metrics:
* `cpufreq`

View File

@@ -1,24 +1,13 @@
// Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
// All rights reserved. This file is part of cc-lib.
// Use of this source code is governed by a MIT-style
// license that can be found in the LICENSE file.
// additional authors:
// Holger Obermaier (NHR@KIT)
package collectors
import (
"bufio"
"encoding/json"
"fmt"
"os"
"io/ioutil"
"strconv"
"strings"
"time"
cclog "github.com/ClusterCockpit/cc-lib/ccLogger"
lp "github.com/ClusterCockpit/cc-lib/ccMessage"
sysconf "github.com/tklauser/go-sysconf"
lp "github.com/ClusterCockpit/cc-metric-collector/internal/ccMetric"
)
const CPUSTATFILE = `/proc/stat`
@@ -29,159 +18,73 @@ type CpustatCollectorConfig struct {
type CpustatCollector struct {
metricCollector
config CpustatCollectorConfig
lastTimestamp time.Time // Store time stamp of last tick to derive values
matches map[string]int
cputags map[string]map[string]string
nodetags map[string]string
olddata map[string]map[string]int64
config CpustatCollectorConfig
}
func (m *CpustatCollector) Init(config json.RawMessage) error {
m.name = "CpustatCollector"
m.setup()
m.parallel = true
m.meta = map[string]string{"source": m.name, "group": "CPU"}
m.nodetags = map[string]string{"type": "node"}
if len(config) > 0 {
err := json.Unmarshal(config, &m.config)
if err != nil {
return err
}
}
matches := map[string]int{
"cpu_user": 1,
"cpu_nice": 2,
"cpu_system": 3,
"cpu_idle": 4,
"cpu_iowait": 5,
"cpu_irq": 6,
"cpu_softirq": 7,
"cpu_steal": 8,
"cpu_guest": 9,
"cpu_guest_nice": 10,
}
m.matches = make(map[string]int)
for match, index := range matches {
doExclude := false
for _, exclude := range m.config.ExcludeMetrics {
if match == exclude {
doExclude = true
break
}
}
if !doExclude {
m.matches[match] = index
}
}
// Check input file
file, err := os.Open(string(CPUSTATFILE))
if err != nil {
cclog.ComponentError(m.name, err.Error())
}
defer file.Close()
// Pre-generate tags for all CPUs
num_cpus := 0
m.cputags = make(map[string]map[string]string)
m.olddata = make(map[string]map[string]int64)
scanner := bufio.NewScanner(file)
for scanner.Scan() {
line := scanner.Text()
linefields := strings.Fields(line)
if strings.Compare(linefields[0], "cpu") == 0 {
m.olddata["cpu"] = make(map[string]int64)
for k, v := range m.matches {
m.olddata["cpu"][k], _ = strconv.ParseInt(linefields[v], 0, 64)
}
} else if strings.HasPrefix(linefields[0], "cpu") && strings.Compare(linefields[0], "cpu") != 0 {
cpustr := strings.TrimLeft(linefields[0], "cpu")
cpu, _ := strconv.Atoi(cpustr)
m.cputags[linefields[0]] = map[string]string{"type": "hwthread", "type-id": fmt.Sprintf("%d", cpu)}
m.olddata[linefields[0]] = make(map[string]int64)
for k, v := range m.matches {
m.olddata[linefields[0]][k], _ = strconv.ParseInt(linefields[v], 0, 64)
}
num_cpus++
}
}
m.lastTimestamp = time.Now()
m.init = true
return nil
}
func (m *CpustatCollector) parseStatLine(linefields []string, tags map[string]string, output chan lp.CCMessage, now time.Time, tsdelta time.Duration) {
values := make(map[string]float64)
clktck, _ := sysconf.Sysconf(sysconf.SC_CLK_TCK)
for match, index := range m.matches {
if len(match) > 0 {
x, err := strconv.ParseInt(linefields[index], 0, 64)
if err == nil {
vdiff := x - m.olddata[linefields[0]][match]
m.olddata[linefields[0]][match] = x // Store new value for next run
values[match] = float64(vdiff) / float64(tsdelta.Seconds()) / float64(clktck)
}
}
func (c *CpustatCollector) parseStatLine(line string, cpu int, exclude []string, output chan lp.CCMetric) {
ls := strings.Fields(line)
matches := []string{"", "cpu_user", "cpu_nice", "cpu_system", "cpu_idle", "cpu_iowait", "cpu_irq", "cpu_softirq", "cpu_steal", "cpu_guest", "cpu_guest_nice"}
for _, ex := range exclude {
matches, _ = RemoveFromStringList(matches, ex)
}
sum := float64(0)
for name, value := range values {
sum += value
y, err := lp.NewMessage(name, tags, m.meta, map[string]interface{}{"value": value * 100}, now)
if err == nil {
y.AddTag("unit", "Percent")
output <- y
}
var tags map[string]string
if cpu < 0 {
tags = map[string]string{"type": "node"}
} else {
tags = map[string]string{"type": "cpu", "type-id": fmt.Sprintf("%d", cpu)}
}
if v, ok := values["cpu_idle"]; ok {
sum -= v
y, err := lp.NewMessage("cpu_used", tags, m.meta, map[string]interface{}{"value": sum * 100}, now)
if err == nil {
y.AddTag("unit", "Percent")
output <- y
for i, m := range matches {
if len(m) > 0 {
x, err := strconv.ParseInt(ls[i], 0, 64)
if err == nil {
y, err := lp.New(m, tags, c.meta, map[string]interface{}{"value": int(x)}, time.Now())
if err == nil {
output <- y
}
}
}
}
}
func (m *CpustatCollector) Read(interval time.Duration, output chan lp.CCMessage) {
func (m *CpustatCollector) Read(interval time.Duration, output chan lp.CCMetric) {
if !m.init {
return
}
num_cpus := 0
now := time.Now()
tsdelta := now.Sub(m.lastTimestamp)
buffer, err := ioutil.ReadFile(string(CPUSTATFILE))
file, err := os.Open(string(CPUSTATFILE))
if err != nil {
cclog.ComponentError(m.name, err.Error())
return
}
defer file.Close()
scanner := bufio.NewScanner(file)
for scanner.Scan() {
line := scanner.Text()
linefields := strings.Fields(line)
if strings.Compare(linefields[0], "cpu") == 0 {
m.parseStatLine(linefields, m.nodetags, output, now, tsdelta)
} else if strings.HasPrefix(linefields[0], "cpu") {
m.parseStatLine(linefields, m.cputags[linefields[0]], output, now, tsdelta)
num_cpus++
ll := strings.Split(string(buffer), "\n")
for _, line := range ll {
if len(line) == 0 {
continue
}
ls := strings.Fields(line)
if strings.Compare(ls[0], "cpu") == 0 {
m.parseStatLine(line, -1, m.config.ExcludeMetrics, output)
} else if strings.HasPrefix(ls[0], "cpu") {
cpustr := strings.TrimLeft(ls[0], "cpu")
cpu, _ := strconv.Atoi(cpustr)
m.parseStatLine(line, cpu, m.config.ExcludeMetrics, output)
}
}
num_cpus_metric, err := lp.NewMessage("num_cpus",
m.nodetags,
m.meta,
map[string]interface{}{"value": int(num_cpus)},
now,
)
if err == nil {
output <- num_cpus_metric
}
m.lastTimestamp = now
}
func (m *CpustatCollector) Close() {

View File

@@ -1,17 +1,5 @@
<!--
---
title: CPU usage metric collector
description: Collect CPU metrics from `/proc/stat`
categories: [cc-metric-collector]
tags: ['Admin']
weight: 2
hugo_path: docs/reference/cc-metric-collector/collectors/cpustat.md
---
-->
## `cpustat` collector
```json
"cpustat": {
"exclude_metrics": [
@@ -20,19 +8,16 @@ hugo_path: docs/reference/cc-metric-collector/collectors/cpustat.md
}
```
The `cpustat` collector reads data from `/proc/stat` and outputs a handful **node** and **hwthread** metrics. If a metric is not required, it can be excluded from forwarding it to the sink.
The `cpustat` collector reads data from `/proc/stats` and outputs a handful **node** and **hwthread** metrics. If a metric is not required, it can be excluded from forwarding it to the sink.
Metrics:
* `cpu_user` with `unit=Percent`
* `cpu_nice` with `unit=Percent`
* `cpu_system` with `unit=Percent`
* `cpu_idle` with `unit=Percent`
* `cpu_iowait` with `unit=Percent`
* `cpu_irq` with `unit=Percent`
* `cpu_softirq` with `unit=Percent`
* `cpu_steal` with `unit=Percent`
* `cpu_guest` with `unit=Percent`
* `cpu_guest_nice` with `unit=Percent`
* `cpu_used` = `cpu_* - cpu_idle` with `unit=Percent`
* `num_cpus`
* `cpu_user`
* `cpu_nice`
* `cpu_system`
* `cpu_idle`
* `cpu_iowait`
* `cpu_irq`
* `cpu_softirq`
* `cpu_steal`
* `cpu_guest`
* `cpu_guest_nice`

View File

@@ -1,22 +1,15 @@
// Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
// All rights reserved. This file is part of cc-lib.
// Use of this source code is governed by a MIT-style
// license that can be found in the LICENSE file.
// additional authors:
// Holger Obermaier (NHR@KIT)
package collectors
import (
"encoding/json"
"errors"
"io/ioutil"
"log"
"os"
"os/exec"
"strings"
"time"
lp "github.com/ClusterCockpit/cc-lib/ccMessage"
lp "github.com/ClusterCockpit/cc-metric-collector/internal/ccMetric"
influx "github.com/influxdata/line-protocol"
)
@@ -40,7 +33,6 @@ type CustomCmdCollector struct {
func (m *CustomCmdCollector) Init(config json.RawMessage) error {
var err error
m.name = "CustomCmdCollector"
m.parallel = true
m.meta = map[string]string{"source": m.name, "group": "Custom"}
if len(config) > 0 {
err = json.Unmarshal(config, &m.config)
@@ -55,12 +47,12 @@ func (m *CustomCmdCollector) Init(config json.RawMessage) error {
command := exec.Command(cmdfields[0], strings.Join(cmdfields[1:], " "))
command.Wait()
_, err = command.Output()
if err == nil {
if err != nil {
m.commands = append(m.commands, c)
}
}
for _, f := range m.config.Files {
_, err = os.ReadFile(f)
_, err = ioutil.ReadFile(f)
if err == nil {
m.files = append(m.files, f)
} else {
@@ -69,7 +61,7 @@ func (m *CustomCmdCollector) Init(config json.RawMessage) error {
}
}
if len(m.files) == 0 && len(m.commands) == 0 {
return errors.New("no metrics to collect")
return errors.New("No metrics to collect")
}
m.handler = influx.NewMetricHandler()
m.parser = influx.NewParser(m.handler)
@@ -82,7 +74,7 @@ var DefaultTime = func() time.Time {
return time.Unix(42, 0)
}
func (m *CustomCmdCollector) Read(interval time.Duration, output chan lp.CCMessage) {
func (m *CustomCmdCollector) Read(interval time.Duration, output chan lp.CCMetric) {
if !m.init {
return
}
@@ -105,12 +97,14 @@ func (m *CustomCmdCollector) Read(interval time.Duration, output chan lp.CCMessa
if skip {
continue
}
output <- lp.FromInfluxMetric(c)
y, err := lp.New(c.Name(), Tags2Map(c), m.meta, Fields2Map(c), c.Time())
if err == nil {
output <- y
}
}
}
for _, file := range m.files {
buffer, err := os.ReadFile(file)
buffer, err := ioutil.ReadFile(file)
if err != nil {
log.Print(err)
return
@@ -125,7 +119,10 @@ func (m *CustomCmdCollector) Read(interval time.Duration, output chan lp.CCMessa
if skip {
continue
}
output <- lp.FromInfluxMetric(f)
y, err := lp.New(f.Name(), Tags2Map(f), m.meta, Fields2Map(f), f.Time())
if err == nil {
output <- y
}
}
}
}

View File

@@ -1,13 +1,3 @@
<!--
---
title: CustomCommand metric collector
description: Collect messages from custom command or files
categories: [cc-metric-collector]
tags: ['Admin']
weight: 2
hugo_path: docs/reference/cc-metric-collector/collectors/customcmd.md
---
-->
## `customcmd` collector

View File

@@ -1,144 +1,112 @@
// Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
// All rights reserved. This file is part of cc-lib.
// Use of this source code is governed by a MIT-style
// license that can be found in the LICENSE file.
// additional authors:
// Holger Obermaier (NHR@KIT)
package collectors
import (
"bufio"
"io/ioutil"
lp "github.com/ClusterCockpit/cc-metric-collector/internal/ccMetric"
// "log"
"encoding/json"
"os"
"errors"
"strconv"
"strings"
"syscall"
"time"
cclog "github.com/ClusterCockpit/cc-lib/ccLogger"
lp "github.com/ClusterCockpit/cc-lib/ccMessage"
)
const MOUNTFILE = `/proc/self/mounts`
const DISKSTATFILE = `/proc/diskstats`
const DISKSTAT_SYSFSPATH = `/sys/block`
type DiskstatCollectorConfig struct {
ExcludeMetrics []string `json:"exclude_metrics,omitempty"`
ExcludeMounts []string `json:"exclude_mounts,omitempty"`
}
type DiskstatCollector struct {
metricCollector
config DiskstatCollectorConfig
allowedMetrics map[string]bool
matches map[int]string
config DiskstatCollectorConfig
}
func (m *DiskstatCollector) Init(config json.RawMessage) error {
var err error
m.name = "DiskstatCollector"
m.parallel = true
m.meta = map[string]string{"source": m.name, "group": "Disk"}
m.setup()
if len(config) > 0 {
if err := json.Unmarshal(config, &m.config); err != nil {
err = json.Unmarshal(config, &m.config)
if err != nil {
return err
}
}
m.allowedMetrics = map[string]bool{
"disk_total": true,
"disk_free": true,
"part_max_used": true,
// https://www.kernel.org/doc/html/latest/admin-guide/iostats.html
matches := map[int]string{
3: "reads",
4: "reads_merged",
5: "read_sectors",
6: "read_ms",
7: "writes",
8: "writes_merged",
9: "writes_sectors",
10: "writes_ms",
11: "ioops",
12: "ioops_ms",
13: "ioops_weighted_ms",
14: "discards",
15: "discards_merged",
16: "discards_sectors",
17: "discards_ms",
18: "flushes",
19: "flushes_ms",
}
for _, excl := range m.config.ExcludeMetrics {
if _, ok := m.allowedMetrics[excl]; ok {
m.allowedMetrics[excl] = false
m.matches = make(map[int]string)
for k, v := range matches {
_, skip := stringArrayContains(m.config.ExcludeMetrics, v)
if !skip {
m.matches[k] = v
}
}
file, err := os.Open(MOUNTFILE)
if err != nil {
cclog.ComponentError(m.name, err.Error())
return err
if len(m.matches) == 0 {
return errors.New("No metrics to collect")
}
defer file.Close()
m.init = true
return nil
_, err = ioutil.ReadFile(string(DISKSTATFILE))
if err == nil {
m.init = true
}
return err
}
func (m *DiskstatCollector) Read(interval time.Duration, output chan lp.CCMessage) {
func (m *DiskstatCollector) Read(interval time.Duration, output chan lp.CCMetric) {
var lines []string
if !m.init {
return
}
file, err := os.Open(MOUNTFILE)
buffer, err := ioutil.ReadFile(string(DISKSTATFILE))
if err != nil {
cclog.ComponentError(m.name, err.Error())
return
}
defer file.Close()
lines = strings.Split(string(buffer), "\n")
part_max_used := uint64(0)
scanner := bufio.NewScanner(file)
mountLoop:
for scanner.Scan() {
line := scanner.Text()
for _, line := range lines {
if len(line) == 0 {
continue
}
if !strings.HasPrefix(line, "/dev") {
f := strings.Fields(line)
if strings.Contains(f[2], "loop") {
continue
}
linefields := strings.Fields(line)
if strings.Contains(linefields[0], "loop") {
continue
tags := map[string]string{
"device": f[2],
"type": "node",
}
if strings.Contains(linefields[1], "boot") {
continue
}
mountPath := strings.Replace(linefields[1], `\040`, " ", -1)
for _, excl := range m.config.ExcludeMounts {
if strings.Contains(mountPath, excl) {
continue mountLoop
for idx, name := range m.matches {
if idx < len(f) {
x, err := strconv.ParseInt(f[idx], 0, 64)
if err == nil {
y, err := lp.New(name, tags, m.meta, map[string]interface{}{"value": int(x)}, time.Now())
if err == nil {
output <- y
}
}
}
}
stat := syscall.Statfs_t{}
err := syscall.Statfs(mountPath, &stat)
if err != nil {
continue
}
if stat.Blocks == 0 || stat.Bsize == 0 {
continue
}
tags := map[string]string{"type": "node", "device": linefields[0]}
total := (stat.Blocks * uint64(stat.Bsize)) / uint64(1000000000)
if m.allowedMetrics["disk_total"] {
y, err := lp.NewMessage("disk_total", tags, m.meta, map[string]interface{}{"value": total}, time.Now())
if err == nil {
y.AddMeta("unit", "GBytes")
output <- y
}
}
free := (stat.Bfree * uint64(stat.Bsize)) / uint64(1000000000)
if m.allowedMetrics["disk_free"] {
y, err := lp.NewMessage("disk_free", tags, m.meta, map[string]interface{}{"value": free}, time.Now())
if err == nil {
y.AddMeta("unit", "GBytes")
output <- y
}
}
if total > 0 {
perc := (100 * (total - free)) / total
if perc > part_max_used {
part_max_used = perc
}
}
}
if m.allowedMetrics["part_max_used"] {
y, err := lp.NewMessage("part_max_used", map[string]string{"type": "node"}, m.meta, map[string]interface{}{"value": int(part_max_used)}, time.Now())
if err == nil {
y.AddMeta("unit", "percent")
output <- y
}
}
}

View File

@@ -1,34 +1,34 @@
<!--
---
title: Disk usage statistics metric collector
description: Collect metrics for various filesystems from `/proc/self/mounts`
categories: [cc-metric-collector]
tags: ['Admin']
weight: 2
hugo_path: docs/reference/cc-metric-collector/collectors/diskstat.md
---
-->
## `diskstat` collector
```json
"diskstat": {
"exclude_metrics": [
"disk_total"
"read_ms"
],
"exclude_mounts": [
"slurm-tmpfs"
]
}
```
The `diskstat` collector reads data from `/proc/self/mounts` and outputs a handful **node** metrics. If a metric is not required, it can be excluded from forwarding it to the sink. Additionally, any mount point containing one of the strings specified in `exclude_mounts` will be skipped during metric collection.
The `netstat` collector reads data from `/proc/net/dev` and outputs a handful **node** metrics. If a metric is not required, it can be excluded from forwarding it to the sink.
Metrics per device (with `device` tag):
* `disk_total` (unit `GBytes`)
* `disk_free` (unit `GBytes`)
Global metrics:
* `part_max_used` (unit `percent`)
Metrics:
* `reads`
* `reads_merged`
* `read_sectors`
* `read_ms`
* `writes`
* `writes_merged`
* `writes_sectors`
* `writes_ms`
* `ioops`
* `ioops_ms`
* `ioops_weighted_ms`
* `discards`
* `discards_merged`
* `discards_sectors`
* `discards_ms`
* `flushes`
* `flushes_ms`
The device name is added as tag `device`.

View File

@@ -1,10 +1,3 @@
// Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
// All rights reserved. This file is part of cc-lib.
// Use of this source code is governed by a MIT-style
// license that can be found in the LICENSE file.
// additional authors:
// Holger Obermaier (NHR@KIT)
package collectors
import (
@@ -12,62 +5,33 @@ import (
"bytes"
"encoding/json"
"fmt"
"io"
"io/ioutil"
"log"
"os"
"os/exec"
"os/user"
"strconv"
"strings"
"time"
cclog "github.com/ClusterCockpit/cc-lib/ccLogger"
lp "github.com/ClusterCockpit/cc-lib/ccMessage"
lp "github.com/ClusterCockpit/cc-metric-collector/internal/ccMetric"
)
const DEFAULT_GPFS_CMD = "mmpmon"
type GpfsCollectorLastState struct {
numOpens int64
numCloses int64
numReads int64
numWrites int64
numReaddirs int64
numInodeUpdates int64
bytesRead int64
bytesWritten int64
bytesTotal int64
iops int64
metaops int64
}
type GpfsCollector struct {
metricCollector
tags map[string]string
config struct {
Mmpmon string `json:"mmpmon_path,omitempty"`
ExcludeFilesystem []string `json:"exclude_filesystem,omitempty"`
SendBandwidths bool `json:"send_bandwidths"`
SendTotalValues bool `json:"send_total_values"`
SendDerivedValues bool `json:"send_derived_values"`
Mmpmon string `json:"mmpmon"`
}
skipFS map[string]struct{}
lastTimestamp time.Time // Store time stamp of last tick to derive bandwidths
lastState map[string]GpfsCollectorLastState
}
func (m *GpfsCollector) Init(config json.RawMessage) error {
// Check if already initialized
if m.init {
return nil
}
var err error
m.name = "GpfsCollector"
m.setup()
m.parallel = true
// Set default mmpmon binary
m.config.Mmpmon = DEFAULT_GPFS_CMD
m.config.Mmpmon = "/usr/lpp/mmfs/bin/mmpmon"
// Read JSON configuration
if len(config) > 0 {
@@ -85,45 +49,31 @@ func (m *GpfsCollector) Init(config json.RawMessage) error {
"type": "node",
"filesystem": "",
}
m.skipFS = make(map[string]struct{})
for _, fs := range m.config.ExcludeFilesystem {
m.skipFS[fs] = struct{}{}
}
m.lastState = make(map[string]GpfsCollectorLastState)
// GPFS / IBM Spectrum Scale file system statistics can only be queried by user root
user, err := user.Current()
if err != nil {
return fmt.Errorf("failed to get current user: %v", err)
return fmt.Errorf("GpfsCollector.Init(): Failed to get current user: %v", err)
}
if user.Uid != "0" {
return fmt.Errorf("GPFS file system statistics can only be queried by user root")
return fmt.Errorf("GpfsCollector.Init(): GPFS file system statistics can only be queried by user root")
}
// Check if mmpmon is in executable search path
p, err := exec.LookPath(m.config.Mmpmon)
_, err = exec.LookPath(m.config.Mmpmon)
if err != nil {
return fmt.Errorf("failed to find mmpmon binary '%s': %v", m.config.Mmpmon, err)
return fmt.Errorf("GpfsCollector.Init(): Failed to find mmpmon binary '%s': %v", m.config.Mmpmon, err)
}
m.config.Mmpmon = p
m.init = true
return nil
}
func (m *GpfsCollector) Read(interval time.Duration, output chan lp.CCMessage) {
// Check if already initialized
func (m *GpfsCollector) Read(interval time.Duration, output chan lp.CCMetric) {
if !m.init {
return
}
// Current time stamp
now := time.Now()
// time difference to last time stamp
timeDiff := now.Sub(m.lastTimestamp).Seconds()
// Save current timestamp
m.lastTimestamp = now
// mmpmon:
// -p: generate output that can be parsed
// -s: suppress the prompt on input
@@ -136,15 +86,12 @@ func (m *GpfsCollector) Read(interval time.Duration, output chan lp.CCMessage) {
cmd.Stderr = cmdStderr
err := cmd.Run()
if err != nil {
dataStdErr, _ := io.ReadAll(cmdStderr)
dataStdOut, _ := io.ReadAll(cmdStdout)
cclog.ComponentError(
m.name,
fmt.Sprintf("Read(): Failed to execute command \"%s\": %v\n", cmd.String(), err),
fmt.Sprintf("Read(): command exit code: \"%d\"\n", cmd.ProcessState.ExitCode()),
fmt.Sprintf("Read(): command stderr: \"%s\"\n", string(dataStdErr)),
fmt.Sprintf("Read(): command stdout: \"%s\"\n", string(dataStdOut)),
)
fmt.Fprintf(os.Stderr, "GpfsCollector.Read(): Failed to execute command \"%s\": %s\n", cmd.String(), err.Error())
fmt.Fprintf(os.Stderr, "GpfsCollector.Read(): command exit code: \"%d\"\n", cmd.ProcessState.ExitCode())
data, _ := ioutil.ReadAll(cmdStderr)
fmt.Fprintf(os.Stderr, "GpfsCollector.Read(): command stderr: \"%s\"\n", string(data))
data, _ = ioutil.ReadAll(cmdStdout)
fmt.Fprintf(os.Stderr, "GpfsCollector.Read(): command stdout: \"%s\"\n", string(data))
return
}
@@ -152,471 +99,149 @@ func (m *GpfsCollector) Read(interval time.Duration, output chan lp.CCMessage) {
scanner := bufio.NewScanner(cmdStdout)
for scanner.Scan() {
lineSplit := strings.Fields(scanner.Text())
// Only process lines starting with _fs_io_s_
if lineSplit[0] != "_fs_io_s_" {
continue
}
key_value := make(map[string]string)
for i := 1; i < len(lineSplit); i += 2 {
key_value[lineSplit[i]] = lineSplit[i+1]
}
// Ignore keys:
// _n_: node IP address,
// _nn_: node name,
// _cl_: cluster name,
// _d_: number of disks
filesystem, ok := key_value["_fs_"]
if !ok {
cclog.ComponentError(
m.name,
"Read(): Failed to get filesystem name.")
continue
}
// Skip excluded filesystems
if _, skip := m.skipFS[filesystem]; skip {
continue
}
// Add filesystem tag
m.tags["filesystem"] = filesystem
// Create initial last state
if m.config.SendBandwidths {
if _, ok := m.lastState[filesystem]; !ok {
m.lastState[filesystem] = GpfsCollectorLastState{
bytesRead: -1,
bytesWritten: -1,
}
if lineSplit[0] == "_fs_io_s_" {
key_value := make(map[string]string)
for i := 1; i < len(lineSplit); i += 2 {
key_value[lineSplit[i]] = lineSplit[i+1]
}
}
if m.config.SendDerivedValues {
if _, ok := m.lastState[filesystem]; !ok {
m.lastState[filesystem] = GpfsCollectorLastState{
numReads: -1,
numWrites: -1,
numOpens: -1,
numCloses: -1,
numReaddirs: -1,
numInodeUpdates: -1,
bytesTotal: -1,
iops: -1,
metaops: -1,
}
}
}
// Ignore keys:
// _n_: node IP address,
// _nn_: node name,
// _cl_: cluster name,
// _d_: number of disks
// return code
rc, err := strconv.Atoi(key_value["_rc_"])
if err != nil {
cclog.ComponentError(
m.name,
fmt.Sprintf("Read(): Failed to convert return code '%s' to int: %v", key_value["_rc_"], err))
continue
}
if rc != 0 {
cclog.ComponentError(
m.name,
fmt.Sprintf("Read(): Filesystem '%s' is not ok.", filesystem))
continue
}
sec, err := strconv.ParseInt(key_value["_t_"], 10, 64)
if err != nil {
cclog.ComponentError(
m.name,
fmt.Sprintf("Read(): Failed to convert seconds '%s' to int64: %v", key_value["_t_"], err))
continue
}
msec, err := strconv.ParseInt(key_value["_tu_"], 10, 64)
if err != nil {
cclog.ComponentError(
m.name,
fmt.Sprintf("Read(): Failed to convert micro seconds '%s' to int64: %v", key_value["_tu_"], err))
continue
}
timestamp := time.Unix(sec, msec*1000)
// bytes read
bytesRead, err := strconv.ParseInt(key_value["_br_"], 10, 64)
if err != nil {
cclog.ComponentError(
m.name,
fmt.Sprintf("Read(): Failed to convert bytes read '%s' to int64: %v", key_value["_br_"], err))
continue
}
if y, err :=
lp.NewMessage(
"gpfs_bytes_read",
m.tags,
m.meta,
map[string]interface{}{
"value": bytesRead,
},
timestamp,
); err == nil {
y.AddMeta("unit", "bytes")
output <- y
}
if m.config.SendBandwidths {
if lastBytesRead := m.lastState[filesystem].bytesRead; lastBytesRead >= 0 {
bwRead := float64(bytesRead-lastBytesRead) / timeDiff
if y, err :=
lp.NewMessage(
"gpfs_bw_read",
m.tags,
m.meta,
map[string]interface{}{
"value": bwRead,
},
timestamp,
); err == nil {
y.AddMeta("unit", "bytes/sec")
output <- y
}
filesystem, ok := key_value["_fs_"]
if !ok {
fmt.Fprintf(os.Stderr, "GpfsCollector.Read(): Failed to get filesystem name.\n")
continue
}
}
// bytes written
bytesWritten, err := strconv.ParseInt(key_value["_bw_"], 10, 64)
if err != nil {
cclog.ComponentError(
m.name,
fmt.Sprintf("Read(): Failed to convert bytes written '%s' to int64: %v", key_value["_bw_"], err))
continue
}
if y, err :=
lp.NewMessage(
"gpfs_bytes_written",
m.tags,
m.meta,
map[string]interface{}{
"value": bytesWritten,
},
timestamp,
); err == nil {
y.AddMeta("unit", "bytes")
output <- y
}
if m.config.SendBandwidths {
if lastBytesWritten := m.lastState[filesystem].bytesWritten; lastBytesWritten >= 0 {
bwWrite := float64(bytesWritten-lastBytesWritten) / timeDiff
if y, err :=
lp.NewMessage(
"gpfs_bw_write",
m.tags,
m.meta,
map[string]interface{}{
"value": bwWrite,
},
timestamp,
); err == nil {
y.AddMeta("unit", "bytes/sec")
output <- y
}
m.tags["filesystem"] = filesystem
// return code
rc, err := strconv.Atoi(key_value["_rc_"])
if err != nil {
fmt.Fprintf(os.Stderr, "GpfsCollector.Read(): Failed to convert return code: %s\n", err.Error())
continue
}
}
// number of opens
numOpens, err := strconv.ParseInt(key_value["_oc_"], 10, 64)
if err != nil {
cclog.ComponentError(
m.name,
fmt.Sprintf("Read(): Failed to convert number of opens '%s' to int64: %v", key_value["_oc_"], err))
continue
}
if y, err := lp.NewMessage("gpfs_num_opens", m.tags, m.meta, map[string]interface{}{"value": numOpens}, timestamp); err == nil {
output <- y
}
if m.config.SendDerivedValues {
if lastNumOpens := m.lastState[filesystem].numOpens; lastNumOpens >= 0 {
opensRate := float64(numOpens-lastNumOpens) / timeDiff
if y, err :=
lp.NewMessage(
"gpfs_opens_rate",
m.tags,
m.meta,
map[string]interface{}{
"value": opensRate,
},
timestamp,
); err == nil {
y.AddMeta("unit", "requests/sec")
output <- y
}
if rc != 0 {
fmt.Fprintf(os.Stderr, "GpfsCollector.Read(): Filesystem %s not ok.", filesystem)
continue
}
}
// number of closes
numCloses, err := strconv.ParseInt(key_value["_cc_"], 10, 64)
if err != nil {
cclog.ComponentError(
m.name,
fmt.Sprintf("Read(): Failed to convert number of closes: '%s' to int64: %v", key_value["_cc_"], err))
continue
}
if y, err := lp.NewMessage("gpfs_num_closes", m.tags, m.meta, map[string]interface{}{"value": numCloses}, timestamp); err == nil {
output <- y
}
if m.config.SendDerivedValues {
if lastNumCloses := m.lastState[filesystem].numCloses; lastNumCloses >= 0 {
closesRate := float64(numCloses-lastNumCloses) / timeDiff
if y, err :=
lp.NewMessage(
"gpfs_closes_rate",
m.tags,
m.meta,
map[string]interface{}{
"value": closesRate,
},
timestamp,
); err == nil {
y.AddMeta("unit", "requests/sec")
output <- y
}
sec, err := strconv.ParseInt(key_value["_t_"], 10, 64)
if err != nil {
fmt.Fprintf(os.Stderr,
"GpfsCollector.Read(): Failed to convert seconds to int '%s': %v\n",
key_value["_t_"], err)
continue
}
}
// number of reads
numReads, err := strconv.ParseInt(key_value["_rdc_"], 10, 64)
if err != nil {
cclog.ComponentError(
m.name,
fmt.Sprintf("Read(): Failed to convert number of reads: '%s' to int64: %v", key_value["_rdc_"], err))
continue
}
if y, err := lp.NewMessage("gpfs_num_reads", m.tags, m.meta, map[string]interface{}{"value": numReads}, timestamp); err == nil {
output <- y
}
if m.config.SendDerivedValues {
if lastNumReads := m.lastState[filesystem].numReads; lastNumReads >= 0 {
readsRate := float64(numOpens-lastNumReads) / timeDiff
if y, err :=
lp.NewMessage(
"gpfs_reads_rate",
m.tags,
m.meta,
map[string]interface{}{
"value": readsRate,
},
timestamp,
); err == nil {
y.AddMeta("unit", "requests/sec")
output <- y
}
msec, err := strconv.ParseInt(key_value["_tu_"], 10, 64)
if err != nil {
fmt.Fprintf(os.Stderr,
"GpfsCollector.Read(): Failed to convert micro seconds to int '%s': %v\n",
key_value["_tu_"], err)
continue
}
}
timestamp := time.Unix(sec, msec*1000)
// number of writes
numWrites, err := strconv.ParseInt(key_value["_wc_"], 10, 64)
if err != nil {
cclog.ComponentError(
m.name,
fmt.Sprintf("Read(): Failed to convert number of writes: '%s' to int64: %v", key_value["_wc_"], err))
continue
}
if y, err := lp.NewMessage("gpfs_num_writes", m.tags, m.meta, map[string]interface{}{"value": numWrites}, timestamp); err == nil {
output <- y
}
if m.config.SendDerivedValues {
if lastNumWrites := m.lastState[filesystem].numWrites; lastNumWrites >= 0 {
writesRate := float64(numWrites-lastNumWrites) / timeDiff
if y, err :=
lp.NewMessage(
"gpfs_writes_rate",
m.tags,
m.meta,
map[string]interface{}{
"value": writesRate,
},
timestamp,
); err == nil {
y.AddMeta("unit", "requests/sec")
output <- y
}
// bytes read
bytesRead, err := strconv.ParseInt(key_value["_br_"], 10, 64)
if err != nil {
fmt.Fprintf(os.Stderr,
"GpfsCollector.Read(): Failed to convert bytes read '%s': %s\n",
key_value["_br_"], err.Error())
continue
}
}
// number of read directories
numReaddirs, err := strconv.ParseInt(key_value["_dir_"], 10, 64)
if err != nil {
cclog.ComponentError(
m.name,
fmt.Sprintf("Read(): Failed to convert number of read directories: '%s' to int64: %v", key_value["_dir_"], err))
continue
}
if y, err := lp.NewMessage("gpfs_num_readdirs", m.tags, m.meta, map[string]interface{}{"value": numReaddirs}, timestamp); err == nil {
output <- y
}
if m.config.SendDerivedValues {
if lastNumReaddirs := m.lastState[filesystem].numReaddirs; lastNumReaddirs >= 0 {
readdirsRate := float64(numReaddirs-lastNumReaddirs) / timeDiff
if y, err :=
lp.NewMessage(
"gpfs_readdirs_rate",
m.tags,
m.meta,
map[string]interface{}{
"value": readdirsRate,
},
timestamp,
); err == nil {
y.AddMeta("unit", "requests/sec")
output <- y
}
}
}
// Number of inode updates
numInodeUpdates, err := strconv.ParseInt(key_value["_iu_"], 10, 64)
if err != nil {
cclog.ComponentError(
m.name,
fmt.Sprintf("Read(): Failed to convert number of inode updates: '%s' to int: %v", key_value["_iu_"], err))
continue
}
if y, err := lp.NewMessage("gpfs_num_inode_updates", m.tags, m.meta, map[string]interface{}{"value": numInodeUpdates}, timestamp); err == nil {
output <- y
}
if m.config.SendDerivedValues {
if lastNumInodeUpdates := m.lastState[filesystem].numInodeUpdates; lastNumInodeUpdates >= 0 {
inodeUpdatesRate := float64(numInodeUpdates-lastNumInodeUpdates) / timeDiff
if y, err :=
lp.NewMessage(
"gpfs_inode_updates_rate",
m.tags,
m.meta,
map[string]interface{}{
"value": inodeUpdatesRate,
},
timestamp,
); err == nil {
y.AddMeta("unit", "requests/sec")
output <- y
}
}
}
// Total values
bytesTotal := int64(-1);
iops := int64(-1);
metaops := int64(-1);
if m.config.SendTotalValues {
bytesTotal = bytesRead + bytesWritten
if y, err :=
lp.NewMessage("gpfs_bytes_total",
m.tags,
m.meta,
map[string]interface{}{
"value": bytesTotal,
},
timestamp,
); err == nil {
y.AddMeta("unit", "bytes")
y, err := lp.New("gpfs_bytes_read", m.tags, m.meta, map[string]interface{}{"value": bytesRead}, timestamp)
if err == nil {
output <- y
}
if m.config.SendBandwidths {
if lastBytesTotal := m.lastState[filesystem].bytesTotal; lastBytesTotal >= 0 {
bwTotal := float64(bytesTotal-lastBytesTotal) / timeDiff
if y, err :=
lp.NewMessage(
"gpfs_bw_total",
m.tags,
m.meta,
map[string]interface{}{
"value": bwTotal,
},
timestamp,
); err == nil {
y.AddMeta("unit", "bytes/sec")
output <- y
}
}
// bytes written
bytesWritten, err := strconv.ParseInt(key_value["_bw_"], 10, 64)
if err != nil {
fmt.Fprintf(os.Stderr,
"GpfsCollector.Read(): Failed to convert bytes written '%s': %s\n",
key_value["_bw_"], err.Error())
continue
}
iops = numReads + numWrites
if y, err :=
lp.NewMessage("gpfs_iops",
m.tags,
m.meta,
map[string]interface{}{
"value": iops,
},
timestamp,
); err == nil {
y, err = lp.New("gpfs_bytes_written", m.tags, m.meta, map[string]interface{}{"value": bytesWritten}, timestamp)
if err == nil {
output <- y
}
if m.config.SendDerivedValues {
if lastIops := m.lastState[filesystem].iops; lastIops >= 0 {
iopsRate := float64(iops-lastIops) / timeDiff
if y, err :=
lp.NewMessage(
"gpfs_iops_rate",
m.tags,
m.meta,
map[string]interface{}{
"value": iopsRate,
},
timestamp,
); err == nil {
y.AddMeta("unit", "requests/sec")
output <- y
}
}
// number of opens
numOpens, err := strconv.ParseInt(key_value["_oc_"], 10, 64)
if err != nil {
fmt.Fprintf(os.Stderr,
"GpfsCollector.Read(): Failed to convert number of opens '%s': %s\n",
key_value["_oc_"], err.Error())
continue
}
metaops = numInodeUpdates + numCloses + numOpens + numReaddirs
if y, err :=
lp.NewMessage("gpfs_metaops",
m.tags,
m.meta,
map[string]interface{}{
"value": metaops,
},
timestamp,
); err == nil {
y, err = lp.New("gpfs_num_opens", m.tags, m.meta, map[string]interface{}{"value": numOpens}, timestamp)
if err == nil {
output <- y
}
if m.config.SendDerivedValues {
if lastMetaops := m.lastState[filesystem].metaops; lastMetaops >= 0 {
metaopsRate := float64(metaops-lastMetaops) / timeDiff
if y, err :=
lp.NewMessage(
"gpfs_metaops_rate",
m.tags,
m.meta,
map[string]interface{}{
"value": metaopsRate,
},
timestamp,
); err == nil {
y.AddMeta("unit", "requests/sec")
output <- y
}
}
// number of closes
numCloses, err := strconv.ParseInt(key_value["_cc_"], 10, 64)
if err != nil {
fmt.Fprintf(os.Stderr, "GpfsCollector.Read(): Failed to convert number of closes: %s\n", err.Error())
continue
}
y, err = lp.New("gpfs_num_closes", m.tags, m.meta, map[string]interface{}{"value": numCloses}, timestamp)
if err == nil {
output <- y
}
// number of reads
numReads, err := strconv.ParseInt(key_value["_rdc_"], 10, 64)
if err != nil {
fmt.Fprintf(os.Stderr, "GpfsCollector.Read(): Failed to convert number of reads: %s\n", err.Error())
continue
}
y, err = lp.New("gpfs_num_reads", m.tags, m.meta, map[string]interface{}{"value": numReads}, timestamp)
if err == nil {
output <- y
}
// number of writes
numWrites, err := strconv.ParseInt(key_value["_wc_"], 10, 64)
if err != nil {
fmt.Fprintf(os.Stderr, "GpfsCollector.Read(): Failed to convert number of writes: %s\n", err.Error())
continue
}
y, err = lp.New("gpfs_num_writes", m.tags, m.meta, map[string]interface{}{"value": numWrites}, timestamp)
if err == nil {
output <- y
}
// number of read directories
numReaddirs, err := strconv.ParseInt(key_value["_dir_"], 10, 64)
if err != nil {
fmt.Fprintf(os.Stderr, "GpfsCollector.Read(): Failed to convert number of read directories: %s\n", err.Error())
continue
}
y, err = lp.New("gpfs_num_readdirs", m.tags, m.meta, map[string]interface{}{"value": numReaddirs}, timestamp)
if err == nil {
output <- y
}
// Number of inode updates
numInodeUpdates, err := strconv.ParseInt(key_value["_iu_"], 10, 64)
if err != nil {
fmt.Fprintf(os.Stderr, "GpfsCollector.Read(): Failed to convert Number of inode updates: %s\n", err.Error())
continue
}
y, err = lp.New("gpfs_num_inode_updates", m.tags, m.meta, map[string]interface{}{"value": numInodeUpdates}, timestamp)
if err == nil {
output <- y
}
}
// Save last state
m.lastState[filesystem] = GpfsCollectorLastState{
bytesRead: bytesRead,
bytesWritten: bytesWritten,
numOpens: numOpens,
numCloses: numCloses,
numReads: numReads,
numWrites: numWrites,
numReaddirs: numReaddirs,
numInodeUpdates: numInodeUpdates,
bytesTotal: bytesTotal,
iops: iops,
metaops: metaops,
}
}
}

View File

@@ -1,60 +0,0 @@
<!--
---
title: GPFS collector
description: Collect infos about GPFS filesystems
categories: [cc-metric-collector]
tags: ['Admin']
weight: 2
hugo_path: docs/reference/cc-metric-collector/collectors/gpfs.md
---
-->
## `gpfs` collector
```json
"gpfs": {
"mmpmon_path": "/path/to/mmpmon",
"exclude_filesystem": [
"fs1"
],
"send_bandwidths": true,
"send_total_values": true,
"send_derived_values": true
}
```
The `gpfs` collector uses the `mmpmon` command to read performance metrics for
GPFS / IBM Spectrum Scale filesystems.
The reported filesystems can be filtered with the `exclude_filesystem` option
in the configuration.
The path to the `mmpmon` command can be configured with the `mmpmon_path` option
in the configuration. If nothing is set, the collector searches in `$PATH` for `mmpmon`.
Metrics:
* `gpfs_bytes_read`
* `gpfs_bytes_written`
* `gpfs_num_opens`
* `gpfs_num_closes`
* `gpfs_num_reads`
* `gpfs_num_writes`
* `gpfs_num_readdirs`
* `gpfs_num_inode_updates`
* `gpfs_opens_rate` (if `send_derived_values == true`)
* `gpfs_closes_rate` (if `send_derived_values == true`)
* `gpfs_reads_rate` (if `send_derived_values == true`)
* `gpfs_writes_rate` (if `send_derived_values == true`)
* `gpfs_readdirs_rate` (if `send_derived_values == true`)
* `gpfs_inode_updates_rate` (if `send_derived_values == true`)
* `gpfs_bytes_total = gpfs_bytes_read + gpfs_bytes_written` (if `send_total_values == true`)
* `gpfs_iops = gpfs_num_reads + gpfs_num_writes` (if `send_total_values == true`)
* `gpfs_iops_rate` (if `send_total_values == true` and `send_derived_values == true`)
* `gpfs_metaops = gpfs_num_inode_updates + gpfs_num_closes + gpfs_num_opens + gpfs_num_readdirs` (if `send_total_values == true`)
* `gpfs_metaops_rate` (if `send_total_values == true` and `send_derived_values == true`)
* `gpfs_bw_read` (if `send_bandwidths == true`)
* `gpfs_bw_write` (if `send_bandwidths == true`)
* `gpfs_bw_total` (if `send_bandwidths == true` and `send_total_values == true`)
The collector adds a `filesystem` tag to all metrics

View File

@@ -1,18 +1,10 @@
// Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
// All rights reserved. This file is part of cc-lib.
// Use of this source code is governed by a MIT-style
// license that can be found in the LICENSE file.
// additional authors:
// Holger Obermaier (NHR@KIT)
package collectors
import (
"fmt"
"os"
cclog "github.com/ClusterCockpit/cc-lib/ccLogger"
lp "github.com/ClusterCockpit/cc-lib/ccMessage"
lp "github.com/ClusterCockpit/cc-metric-collector/internal/ccMetric"
"golang.org/x/sys/unix"
"encoding/json"
@@ -22,60 +14,51 @@ import (
"time"
)
const IB_BASEPATH = "/sys/class/infiniband/"
type InfinibandCollectorMetric struct {
name string
path string
unit string
scale int64
addToIBTotal bool
addToIBTotalPkgs bool
currentState int64
lastState int64
}
const IB_BASEPATH = `/sys/class/infiniband/`
type InfinibandCollectorInfo struct {
LID string // IB local Identifier (LID)
device string // IB device
port string // IB device port
portCounterFiles []InfinibandCollectorMetric // mapping counter name -> InfinibandCollectorMetric
tagSet map[string]string // corresponding tag list
LID string // IB local Identifier (LID)
device string // IB device
port string // IB device port
portCounterFiles map[string]string // mapping counter name -> file
tagSet map[string]string // corresponding tag list
}
type InfinibandCollector struct {
metricCollector
config struct {
ExcludeDevices []string `json:"exclude_devices,omitempty"` // IB device to exclude e.g. mlx5_0
SendAbsoluteValues bool `json:"send_abs_values"` // Send absolut values as read from sys filesystem
SendTotalValues bool `json:"send_total_values"` // Send computed total values
SendDerivedValues bool `json:"send_derived_values"` // Send derived values e.g. rates
ExcludeDevices []string `json:"exclude_devices,omitempty"` // IB device to exclude e.g. mlx5_0
}
info []InfinibandCollectorInfo
lastTimestamp time.Time // Store time stamp of last tick to derive bandwidths
info []InfinibandCollectorInfo
}
func (m *InfinibandCollector) Help() {
fmt.Println("This collector includes all devices that can be found below ", IB_BASEPATH)
fmt.Println("and where any of the ports provides a 'lid' file (glob ", IB_BASEPATH, "/<dev>/ports/<port>/lid).")
fmt.Println("The devices can be filtered with the 'exclude_devices' option in the configuration.")
fmt.Println("For each found LIDs the collector calls the 'perfquery' command")
fmt.Println("")
fmt.Println("Full configuration object:")
fmt.Println("\"ibstat\" : {")
fmt.Println(" \"exclude_devices\" : [\"dev1\"]")
fmt.Println("}")
fmt.Println("")
fmt.Println("Metrics:")
fmt.Println("- ib_recv")
fmt.Println("- ib_xmit")
fmt.Println("- ib_recv_pkts")
fmt.Println("- ib_xmit_pkts")
}
// Init initializes the Infiniband collector by walking through files below IB_BASEPATH
func (m *InfinibandCollector) Init(config json.RawMessage) error {
// Check if already initialized
if m.init {
return nil
}
var err error
m.name = "InfinibandCollector"
m.setup()
m.parallel = true
m.meta = map[string]string{
"source": m.name,
"group": "Network",
}
// Set default configuration,
m.config.SendAbsoluteValues = true
m.config.SendDerivedValues = false
// Read configuration file, allow overwriting default config
if len(config) > 0 {
err = json.Unmarshal(config, &m.config)
if err != nil {
@@ -87,21 +70,17 @@ func (m *InfinibandCollector) Init(config json.RawMessage) error {
globPattern := filepath.Join(IB_BASEPATH, "*", "ports", "*")
ibDirs, err := filepath.Glob(globPattern)
if err != nil {
return fmt.Errorf("unable to glob files with pattern %s: %v", globPattern, err)
return fmt.Errorf("Unable to glob files with pattern %s: %v", globPattern, err)
}
if ibDirs == nil {
return fmt.Errorf("unable to find any directories with pattern %s", globPattern)
return fmt.Errorf("Unable to find any directories with pattern %s", globPattern)
}
for _, path := range ibDirs {
// Skip, when no LID is assigned
line, err := os.ReadFile(filepath.Join(path, "lid"))
if err != nil {
continue
}
LID := strings.TrimSpace(string(line))
if LID == "0x0" {
LID, ok := readOneLine(path + "/lid")
if !ok || LID == "0x0" {
continue
}
@@ -124,44 +103,16 @@ func (m *InfinibandCollector) Init(config json.RawMessage) error {
// Check access to counter files
countersDir := filepath.Join(path, "counters")
portCounterFiles := []InfinibandCollectorMetric{
{
name: "ib_recv",
path: filepath.Join(countersDir, "port_rcv_data"),
unit: "bytes",
scale: 4,
addToIBTotal: true,
lastState: -1,
},
{
name: "ib_xmit",
path: filepath.Join(countersDir, "port_xmit_data"),
unit: "bytes",
scale: 4,
addToIBTotal: true,
lastState: -1,
},
{
name: "ib_recv_pkts",
path: filepath.Join(countersDir, "port_rcv_packets"),
unit: "packets",
scale: 1,
addToIBTotalPkgs: true,
lastState: -1,
},
{
name: "ib_xmit_pkts",
path: filepath.Join(countersDir, "port_xmit_packets"),
unit: "packets",
scale: 1,
addToIBTotalPkgs: true,
lastState: -1,
},
portCounterFiles := map[string]string{
"ib_recv": filepath.Join(countersDir, "port_rcv_data"),
"ib_xmit": filepath.Join(countersDir, "port_xmit_data"),
"ib_recv_pkts": filepath.Join(countersDir, "port_rcv_packets"),
"ib_xmit_pkts": filepath.Join(countersDir, "port_xmit_packets"),
}
for _, counter := range portCounterFiles {
err := unix.Access(counter.path, unix.R_OK)
for _, counterFile := range portCounterFiles {
err := unix.Access(counterFile, unix.R_OK)
if err != nil {
return fmt.Errorf("unable to access %s: %v", counter.path, err)
return fmt.Errorf("Unable to access %s: %v", counterFile, err)
}
}
@@ -181,7 +132,7 @@ func (m *InfinibandCollector) Init(config json.RawMessage) error {
}
if len(m.info) == 0 {
return fmt.Errorf("found no IB devices")
return fmt.Errorf("Found no IB devices")
}
m.init = true
@@ -189,125 +140,25 @@ func (m *InfinibandCollector) Init(config json.RawMessage) error {
}
// Read reads Infiniband counter files below IB_BASEPATH
func (m *InfinibandCollector) Read(interval time.Duration, output chan lp.CCMessage) {
func (m *InfinibandCollector) Read(interval time.Duration, output chan lp.CCMetric) {
// Check if already initialized
if !m.init {
return
}
// Current time stamp
now := time.Now()
// time difference to last time stamp
timeDiff := now.Sub(m.lastTimestamp).Seconds()
// Save current timestamp
m.lastTimestamp = now
for i := range m.info {
// device info
info := &m.info[i]
var ib_total, ib_total_pkts int64
for i := range info.portCounterFiles {
counterDef := &info.portCounterFiles[i]
// Read counter file
line, err := os.ReadFile(counterDef.path)
if err != nil {
cclog.ComponentError(
m.name,
fmt.Sprintf("Read(): Failed to read from file '%s': %v", counterDef.path, err))
continue
}
data := strings.TrimSpace(string(line))
// convert counter to int64
v, err := strconv.ParseInt(data, 10, 64)
if err != nil {
cclog.ComponentError(
m.name,
fmt.Sprintf("Read(): Failed to convert Infininiband metrice %s='%s' to int64: %v", counterDef.name, data, err))
continue
}
// Scale raw value
v *= counterDef.scale
// Save current state
counterDef.currentState = v
// Send absolut values
if m.config.SendAbsoluteValues {
if y, err :=
lp.NewMessage(
counterDef.name,
info.tagSet,
m.meta,
map[string]interface{}{
"value": counterDef.currentState,
},
now); err == nil {
y.AddMeta("unit", counterDef.unit)
output <- y
}
}
// Send derived values
if m.config.SendDerivedValues {
if counterDef.lastState >= 0 {
rate := float64((counterDef.currentState - counterDef.lastState)) / timeDiff
if y, err :=
lp.NewMessage(
counterDef.name+"_bw",
info.tagSet,
m.meta,
map[string]interface{}{
"value": rate,
},
now); err == nil {
y.AddMeta("unit", counterDef.unit+"/sec")
for counterName, counterFile := range info.portCounterFiles {
if data, ok := readOneLine(counterFile); ok {
if v, err := strconv.ParseInt(data, 10, 64); err == nil {
if y, err := lp.New(counterName, info.tagSet, m.meta, map[string]interface{}{"value": v}, now); err == nil {
output <- y
}
}
counterDef.lastState = counterDef.currentState
}
// Sum up total values
if m.config.SendTotalValues {
switch {
case counterDef.addToIBTotal:
ib_total += counterDef.currentState
case counterDef.addToIBTotalPkgs:
ib_total_pkts += counterDef.currentState
}
}
}
// Send total values
if m.config.SendTotalValues {
if y, err :=
lp.NewMessage(
"ib_total",
info.tagSet,
m.meta,
map[string]interface{}{
"value": ib_total,
},
now); err == nil {
y.AddMeta("unit", "bytes")
output <- y
}
if y, err :=
lp.NewMessage(
"ib_total_pkts",
info.tagSet,
m.meta,
map[string]interface{}{
"value": ib_total_pkts,
},
now); err == nil {
y.AddMeta("unit", "packets")
output <- y
}
}
}

View File

@@ -1,45 +1,19 @@
<!--
---
title: InfiniBand Metric collector
description: Collect metrics for InfiniBand devices
categories: [cc-metric-collector]
tags: ['Admin']
weight: 2
hugo_path: docs/reference/cc-metric-collector/collectors/infiniband.md
---
-->
## `ibstat` collector
```json
"ibstat": {
"perfquery_path" : "<path to perfquery command>",
"exclude_devices": [
"mlx4"
],
"send_abs_values": true,
"send_derived_values": true
]
}
```
The `ibstat` collector includes all Infiniband devices that can be
found below `/sys/class/infiniband/` and where any of the ports provides a
LID file (`/sys/class/infiniband/<dev>/ports/<port>/lid`)
The devices can be filtered with the `exclude_devices` option in the configuration.
For each found LID the collector reads data through the sysfs files below `/sys/class/infiniband/<device>`. (See: <https://www.kernel.org/doc/Documentation/ABI/stable/sysfs-class-infiniband>)
The `ibstat` collector reads either data through the `perfquery` command or the sysfs files below `/sys/class/infiniband/<device>`.
Metrics:
* `ib_recv`
* `ib_xmit`
* `ib_recv_pkts`
* `ib_xmit_pkts`
* `ib_total = ib_recv + ib_xmit` (if `send_total_values == true`)
* `ib_total_pkts = ib_recv_pkts + ib_xmit_pkts` (if `send_total_values == true`)
* `ib_recv_bw` (if `send_derived_values == true`)
* `ib_xmit_bw` (if `send_derived_values == true`)
* `ib_recv_pkts_bw` (if `send_derived_values == true`)
* `ib_xmit_pkts_bw` (if `send_derived_values == true`)
The collector adds a `device` tag to all metrics

View File

@@ -0,0 +1,250 @@
package collectors
import (
"fmt"
"io/ioutil"
"log"
"os/exec"
lp "github.com/ClusterCockpit/cc-metric-collector/internal/ccMetric"
// "os"
"encoding/json"
"errors"
"path/filepath"
"strconv"
"strings"
"time"
)
const PERFQUERY = `/usr/sbin/perfquery`
type InfinibandPerfQueryCollector struct {
metricCollector
tags map[string]string
lids map[string]map[string]string
config struct {
ExcludeDevices []string `json:"exclude_devices,omitempty"`
PerfQueryPath string `json:"perfquery_path"`
}
}
func (m *InfinibandPerfQueryCollector) Help() {
fmt.Println("This collector includes all devices that can be found below ", IB_BASEPATH)
fmt.Println("and where any of the ports provides a 'lid' file (glob ", IB_BASEPATH, "/<dev>/ports/<port>/lid).")
fmt.Println("The devices can be filtered with the 'exclude_devices' option in the configuration.")
fmt.Println("For each found LIDs the collector calls the 'perfquery' command")
fmt.Println("The path to the 'perfquery' command can be configured with the 'perfquery_path' option")
fmt.Println("in the configuration")
fmt.Println("")
fmt.Println("Full configuration object:")
fmt.Println("\"ibstat\" : {")
fmt.Println(" \"perfquery_path\" : \"path/to/perfquery\" # if omitted, it searches in $PATH")
fmt.Println(" \"exclude_devices\" : [\"dev1\"]")
fmt.Println("}")
fmt.Println("")
fmt.Println("Metrics:")
fmt.Println("- ib_recv")
fmt.Println("- ib_xmit")
fmt.Println("- ib_recv_pkts")
fmt.Println("- ib_xmit_pkts")
}
func (m *InfinibandPerfQueryCollector) Init(config json.RawMessage) error {
var err error
m.name = "InfinibandCollectorPerfQuery"
m.setup()
m.meta = map[string]string{"source": m.name, "group": "Network"}
m.tags = map[string]string{"type": "node"}
if len(config) > 0 {
err = json.Unmarshal(config, &m.config)
if err != nil {
return err
}
}
if len(m.config.PerfQueryPath) == 0 {
path, err := exec.LookPath("perfquery")
if err == nil {
m.config.PerfQueryPath = path
}
}
m.lids = make(map[string]map[string]string)
p := fmt.Sprintf("%s/*/ports/*/lid", string(IB_BASEPATH))
files, err := filepath.Glob(p)
for _, f := range files {
lid, err := ioutil.ReadFile(f)
if err == nil {
plist := strings.Split(strings.Replace(f, string(IB_BASEPATH), "", -1), "/")
skip := false
for _, d := range m.config.ExcludeDevices {
if d == plist[0] {
skip = true
}
}
if !skip {
m.lids[plist[0]] = make(map[string]string)
m.lids[plist[0]][plist[2]] = string(lid)
}
}
}
for _, ports := range m.lids {
for port, lid := range ports {
args := fmt.Sprintf("-r %s %s 0xf000", lid, port)
command := exec.Command(m.config.PerfQueryPath, args)
command.Wait()
_, err := command.Output()
if err != nil {
return fmt.Errorf("Failed to execute %s: %v", m.config.PerfQueryPath, err)
}
}
}
if len(m.lids) == 0 {
return errors.New("No usable IB devices")
}
m.init = true
return nil
}
func (m *InfinibandPerfQueryCollector) doPerfQuery(cmd string, dev string, lid string, port string, tags map[string]string, output chan lp.CCMetric) error {
args := fmt.Sprintf("-r %s %s 0xf000", lid, port)
command := exec.Command(cmd, args)
command.Wait()
stdout, err := command.Output()
if err != nil {
log.Print(err)
return err
}
ll := strings.Split(string(stdout), "\n")
for _, line := range ll {
if strings.HasPrefix(line, "PortRcvData") || strings.HasPrefix(line, "RcvData") {
lv := strings.Fields(line)
v, err := strconv.ParseFloat(lv[1], 64)
if err == nil {
y, err := lp.New("ib_recv", tags, m.meta, map[string]interface{}{"value": float64(v)}, time.Now())
if err == nil {
output <- y
}
}
}
if strings.HasPrefix(line, "PortXmitData") || strings.HasPrefix(line, "XmtData") {
lv := strings.Fields(line)
v, err := strconv.ParseFloat(lv[1], 64)
if err == nil {
y, err := lp.New("ib_xmit", tags, m.meta, map[string]interface{}{"value": float64(v)}, time.Now())
if err == nil {
output <- y
}
}
}
if strings.HasPrefix(line, "PortRcvPkts") || strings.HasPrefix(line, "RcvPkts") {
lv := strings.Fields(line)
v, err := strconv.ParseFloat(lv[1], 64)
if err == nil {
y, err := lp.New("ib_recv_pkts", tags, m.meta, map[string]interface{}{"value": float64(v)}, time.Now())
if err == nil {
output <- y
}
}
}
if strings.HasPrefix(line, "PortXmitPkts") || strings.HasPrefix(line, "XmtPkts") {
lv := strings.Fields(line)
v, err := strconv.ParseFloat(lv[1], 64)
if err == nil {
y, err := lp.New("ib_xmit_pkts", tags, m.meta, map[string]interface{}{"value": float64(v)}, time.Now())
if err == nil {
output <- y
}
}
}
if strings.HasPrefix(line, "PortRcvPkts") || strings.HasPrefix(line, "RcvPkts") {
lv := strings.Fields(line)
v, err := strconv.ParseFloat(lv[1], 64)
if err == nil {
y, err := lp.New("ib_recv_pkts", tags, m.meta, map[string]interface{}{"value": float64(v)}, time.Now())
if err == nil {
output <- y
}
}
}
if strings.HasPrefix(line, "PortXmitPkts") || strings.HasPrefix(line, "XmtPkts") {
lv := strings.Fields(line)
v, err := strconv.ParseFloat(lv[1], 64)
if err == nil {
y, err := lp.New("ib_xmit_pkts", tags, m.meta, map[string]interface{}{"value": float64(v)}, time.Now())
if err == nil {
output <- y
}
}
}
}
return nil
}
func (m *InfinibandPerfQueryCollector) Read(interval time.Duration, output chan lp.CCMetric) {
if m.init {
for dev, ports := range m.lids {
for port, lid := range ports {
tags := map[string]string{
"type": "node",
"device": dev,
"port": port,
"lid": lid}
path := fmt.Sprintf("%s/%s/ports/%s/counters/", string(IB_BASEPATH), dev, port)
buffer, err := ioutil.ReadFile(fmt.Sprintf("%s/port_rcv_data", path))
if err == nil {
data := strings.Replace(string(buffer), "\n", "", -1)
v, err := strconv.ParseFloat(data, 64)
if err == nil {
y, err := lp.New("ib_recv", tags, m.meta, map[string]interface{}{"value": float64(v)}, time.Now())
if err == nil {
output <- y
}
}
}
buffer, err = ioutil.ReadFile(fmt.Sprintf("%s/port_xmit_data", path))
if err == nil {
data := strings.Replace(string(buffer), "\n", "", -1)
v, err := strconv.ParseFloat(data, 64)
if err == nil {
y, err := lp.New("ib_xmit", tags, m.meta, map[string]interface{}{"value": float64(v)}, time.Now())
if err == nil {
output <- y
}
}
}
buffer, err = ioutil.ReadFile(fmt.Sprintf("%s/port_rcv_packets", path))
if err == nil {
data := strings.Replace(string(buffer), "\n", "", -1)
v, err := strconv.ParseFloat(data, 64)
if err == nil {
y, err := lp.New("ib_recv_pkts", tags, m.meta, map[string]interface{}{"value": float64(v)}, time.Now())
if err == nil {
output <- y
}
}
}
buffer, err = ioutil.ReadFile(fmt.Sprintf("%s/port_xmit_packets", path))
if err == nil {
data := strings.Replace(string(buffer), "\n", "", -1)
v, err := strconv.ParseFloat(data, 64)
if err == nil {
y, err := lp.New("ib_xmit_pkts", tags, m.meta, map[string]interface{}{"value": float64(v)}, time.Now())
if err == nil {
output <- y
}
}
}
}
}
}
}
func (m *InfinibandPerfQueryCollector) Close() {
m.init = false
}

View File

@@ -1,176 +0,0 @@
// Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
// All rights reserved. This file is part of cc-lib.
// Use of this source code is governed by a MIT-style
// license that can be found in the LICENSE file.
// additional authors:
// Holger Obermaier (NHR@KIT)
package collectors
import (
"bufio"
"encoding/json"
"errors"
"os"
"strconv"
"strings"
"time"
cclog "github.com/ClusterCockpit/cc-lib/ccLogger"
lp "github.com/ClusterCockpit/cc-lib/ccMessage"
)
// Konstante für den Pfad zu /proc/diskstats
const IOSTATFILE = `/proc/diskstats`
type IOstatCollectorConfig struct {
ExcludeMetrics []string `json:"exclude_metrics,omitempty"`
// Neues Feld zum Ausschließen von Devices per JSON-Konfiguration
ExcludeDevices []string `json:"exclude_devices,omitempty"`
}
type IOstatCollectorEntry struct {
lastValues map[string]int64
tags map[string]string
}
type IOstatCollector struct {
metricCollector
matches map[string]int
config IOstatCollectorConfig
devices map[string]IOstatCollectorEntry
}
func (m *IOstatCollector) Init(config json.RawMessage) error {
var err error
m.name = "IOstatCollector"
m.parallel = true
m.meta = map[string]string{"source": m.name, "group": "Disk"}
m.setup()
if len(config) > 0 {
err = json.Unmarshal(config, &m.config)
if err != nil {
return err
}
}
// https://www.kernel.org/doc/html/latest/admin-guide/iostats.html
matches := map[string]int{
"io_reads": 3,
"io_reads_merged": 4,
"io_read_sectors": 5,
"io_read_ms": 6,
"io_writes": 7,
"io_writes_merged": 8,
"io_writes_sectors": 9,
"io_writes_ms": 10,
"io_ioops": 11,
"io_ioops_ms": 12,
"io_ioops_weighted_ms": 13,
"io_discards": 14,
"io_discards_merged": 15,
"io_discards_sectors": 16,
"io_discards_ms": 17,
"io_flushes": 18,
"io_flushes_ms": 19,
}
m.devices = make(map[string]IOstatCollectorEntry)
m.matches = make(map[string]int)
for k, v := range matches {
if _, skip := stringArrayContains(m.config.ExcludeMetrics, k); !skip {
m.matches[k] = v
}
}
if len(m.matches) == 0 {
return errors.New("no metrics to collect")
}
file, err := os.Open(IOSTATFILE)
if err != nil {
cclog.ComponentError(m.name, err.Error())
return err
}
defer file.Close()
scanner := bufio.NewScanner(file)
for scanner.Scan() {
line := scanner.Text()
linefields := strings.Fields(line)
if len(linefields) < 3 {
continue
}
device := linefields[2]
if strings.Contains(device, "loop") {
continue
}
if _, skip := stringArrayContains(m.config.ExcludeDevices, device); skip {
continue
}
values := make(map[string]int64)
for m := range m.matches {
values[m] = 0
}
m.devices[device] = IOstatCollectorEntry{
tags: map[string]string{
"device": device,
"type": "node",
},
lastValues: values,
}
}
m.init = true
return err
}
func (m *IOstatCollector) Read(interval time.Duration, output chan lp.CCMessage) {
if !m.init {
return
}
file, err := os.Open(IOSTATFILE)
if err != nil {
cclog.ComponentError(m.name, err.Error())
return
}
defer file.Close()
scanner := bufio.NewScanner(file)
for scanner.Scan() {
line := scanner.Text()
if len(line) == 0 {
continue
}
linefields := strings.Fields(line)
if len(linefields) < 3 {
continue
}
device := linefields[2]
if strings.Contains(device, "loop") {
continue
}
if _, skip := stringArrayContains(m.config.ExcludeDevices, device); skip {
continue
}
if _, ok := m.devices[device]; !ok {
continue
}
entry := m.devices[device]
for name, idx := range m.matches {
if idx < len(linefields) {
x, err := strconv.ParseInt(linefields[idx], 0, 64)
if err == nil {
diff := x - entry.lastValues[name]
y, err := lp.NewMessage(name, entry.tags, m.meta, map[string]interface{}{"value": int(diff)}, time.Now())
if err == nil {
output <- y
}
}
entry.lastValues[name] = x
}
}
m.devices[device] = entry
}
}
func (m *IOstatCollector) Close() {
m.init = false
}

View File

@@ -1,49 +0,0 @@
<!--
---
title: IOStat Metric collector
description: Collect metrics from `/proc/diskstats`
categories: [cc-metric-collector]
tags: ['Admin']
weight: 2
hugo_path: docs/reference/cc-metric-collector/collectors/iostat.md
---
-->
## `iostat` collector
```json
"iostat": {
"exclude_metrics": [
"io_read_ms"
],
"exclude_devices": [
"nvme0n1p1",
"nvme0n1p2",
"md127"
]
}
```
The `iostat` collector reads data from `/proc/diskstats` and outputs a handful **node** metrics. If a metric or device is not required, it can be excluded from forwarding it to the sink.
Metrics:
* `io_reads`
* `io_reads_merged`
* `io_read_sectors`
* `io_read_ms`
* `io_writes`
* `io_writes_merged`
* `io_writes_sectors`
* `io_writes_ms`
* `io_ioops`
* `io_ioops_ms`
* `io_ioops_weighted_ms`
* `io_discards`
* `io_discards_merged`
* `io_discards_sectors`
* `io_discards_ms`
* `io_flushes`
* `io_flushes_ms`
The device name is added as tag `device`. For more details, see https://www.kernel.org/doc/html/latest/admin-guide/iostats.html

View File

@@ -1,123 +1,78 @@
// Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
// All rights reserved. This file is part of cc-lib.
// Use of this source code is governed by a MIT-style
// license that can be found in the LICENSE file.
// additional authors:
// Holger Obermaier (NHR@KIT)
package collectors
import (
"bufio"
"bytes"
"encoding/json"
"errors"
"fmt"
"io"
"log"
"os"
"os/exec"
"strconv"
"strings"
"time"
cclog "github.com/ClusterCockpit/cc-lib/ccLogger"
lp "github.com/ClusterCockpit/cc-lib/ccMessage"
lp "github.com/ClusterCockpit/cc-metric-collector/internal/ccMetric"
)
const IPMISENSORS_PATH = `ipmi-sensors`
const IPMITOOL_PATH = `/usr/bin/ipmitool`
const IPMISENSORS_PATH = `/usr/sbin/ipmi-sensors`
type IpmiCollectorConfig struct {
ExcludeDevices []string `json:"exclude_devices"`
IpmitoolPath string `json:"ipmitool_path"`
IpmisensorsPath string `json:"ipmisensors_path"`
}
type IpmiCollector struct {
metricCollector
config struct {
ExcludeDevices []string `json:"exclude_devices"`
IpmitoolPath string `json:"ipmitool_path"`
IpmisensorsPath string `json:"ipmisensors_path"`
}
ipmitool string
ipmisensors string
tags map[string]string
matches map[string]string
config IpmiCollectorConfig
}
func (m *IpmiCollector) Init(config json.RawMessage) error {
// Check if already initialized
if m.init {
return nil
}
m.name = "IpmiCollector"
m.setup()
m.parallel = true
m.meta = map[string]string{
"source": m.name,
"group": "IPMI",
}
// default path to IPMI tools
m.config.IpmitoolPath = "ipmitool"
m.config.IpmisensorsPath = "ipmi-sensors"
m.meta = map[string]string{"source": m.name, "group": "IPMI"}
if len(config) > 0 {
err := json.Unmarshal(config, &m.config)
if err != nil {
return err
}
}
// Check if executables ipmitool or ipmisensors are found
p, err := exec.LookPath(m.config.IpmitoolPath)
if err == nil {
command := exec.Command(p)
err := command.Run()
if err != nil {
cclog.ComponentError(m.name, fmt.Sprintf("Failed to execute %s: %v", p, err.Error()))
m.ipmitool = ""
} else {
m.ipmitool = p
}
_, err1 := os.Stat(m.config.IpmitoolPath)
_, err2 := os.Stat(m.config.IpmisensorsPath)
if err1 != nil {
m.config.IpmitoolPath = ""
}
p, err = exec.LookPath(m.config.IpmisensorsPath)
if err == nil {
command := exec.Command(p)
err := command.Run()
if err != nil {
cclog.ComponentError(m.name, fmt.Sprintf("Failed to execute %s: %v", p, err.Error()))
m.ipmisensors = ""
} else {
m.ipmisensors = p
}
if err2 != nil {
m.config.IpmisensorsPath = ""
}
if len(m.ipmitool) == 0 && len(m.ipmisensors) == 0 {
return errors.New("no usable IPMI reader found")
if err1 != nil && err2 != nil {
return errors.New("No IPMI reader found")
}
m.init = true
return nil
}
func (m *IpmiCollector) readIpmiTool(cmd string, output chan lp.CCMessage) {
// Setup ipmitool command
func (m *IpmiCollector) readIpmiTool(cmd string, output chan lp.CCMetric) {
command := exec.Command(cmd, "sensor")
stdout, _ := command.StdoutPipe()
errBuf := new(bytes.Buffer)
command.Stderr = errBuf
// start command
if err := command.Start(); err != nil {
cclog.ComponentError(
m.name,
fmt.Sprintf("readIpmiTool(): Failed to start command \"%s\": %v", command.String(), err),
)
command.Wait()
stdout, err := command.Output()
if err != nil {
log.Print(err)
return
}
// Read command output
scanner := bufio.NewScanner(stdout)
for scanner.Scan() {
lv := strings.Split(scanner.Text(), "|")
ll := strings.Split(string(stdout), "\n")
for _, line := range ll {
lv := strings.Split(line, "|")
if len(lv) < 3 {
continue
}
v, err := strconv.ParseFloat(strings.TrimSpace(lv[1]), 64)
v, err := strconv.ParseFloat(strings.Trim(lv[1], " "), 64)
if err == nil {
name := strings.ToLower(strings.Replace(strings.TrimSpace(lv[0]), " ", "_", -1))
unit := strings.TrimSpace(lv[2])
name := strings.ToLower(strings.Replace(strings.Trim(lv[0], " "), " ", "_", -1))
unit := strings.Trim(lv[2], " ")
if unit == "Volts" {
unit = "Volts"
} else if unit == "degrees C" {
@@ -128,27 +83,16 @@ func (m *IpmiCollector) readIpmiTool(cmd string, output chan lp.CCMessage) {
unit = "Watts"
}
y, err := lp.NewMessage(name, map[string]string{"type": "node"}, m.meta, map[string]interface{}{"value": v}, time.Now())
y, err := lp.New(name, map[string]string{"type": "node"}, m.meta, map[string]interface{}{"value": v}, time.Now())
if err == nil {
y.AddMeta("unit", unit)
output <- y
}
}
}
// Wait for command end
if err := command.Wait(); err != nil {
errMsg, _ := io.ReadAll(errBuf)
cclog.ComponentError(
m.name,
fmt.Sprintf("readIpmiTool(): Failed to wait for the end of command \"%s\": %v\n", command.String(), err),
)
cclog.ComponentError(m.name, fmt.Sprintf("readIpmiTool(): command stderr: \"%s\"\n", strings.TrimSpace(string(errMsg))))
return
}
}
func (m *IpmiCollector) readIpmiSensors(cmd string, output chan lp.CCMessage) {
func (m *IpmiCollector) readIpmiSensors(cmd string, output chan lp.CCMetric) {
command := exec.Command(cmd, "--comma-separated-output", "--sdr-cache-recreate")
command.Wait()
@@ -166,7 +110,7 @@ func (m *IpmiCollector) readIpmiSensors(cmd string, output chan lp.CCMessage) {
v, err := strconv.ParseFloat(lv[3], 64)
if err == nil {
name := strings.ToLower(strings.Replace(lv[1], " ", "_", -1))
y, err := lp.NewMessage(name, map[string]string{"type": "node"}, m.meta, map[string]interface{}{"value": v}, time.Now())
y, err := lp.New(name, map[string]string{"type": "node"}, m.meta, map[string]interface{}{"value": v}, time.Now())
if err == nil {
if len(lv) > 4 {
y.AddMeta("unit", lv[4])
@@ -178,17 +122,17 @@ func (m *IpmiCollector) readIpmiSensors(cmd string, output chan lp.CCMessage) {
}
}
func (m *IpmiCollector) Read(interval time.Duration, output chan lp.CCMessage) {
// Check if already initialized
if !m.init {
return
}
func (m *IpmiCollector) Read(interval time.Duration, output chan lp.CCMetric) {
if len(m.config.IpmitoolPath) > 0 {
m.readIpmiTool(m.config.IpmitoolPath, output)
_, err := os.Stat(m.config.IpmitoolPath)
if err == nil {
m.readIpmiTool(m.config.IpmitoolPath, output)
}
} else if len(m.config.IpmisensorsPath) > 0 {
m.readIpmiSensors(m.config.IpmisensorsPath, output)
_, err := os.Stat(m.config.IpmisensorsPath)
if err == nil {
m.readIpmiSensors(m.config.IpmisensorsPath, output)
}
}
}

View File

@@ -1,13 +1,3 @@
<!--
---
title: IPMI Metric collector
description: Collect metrics using ipmitool or ipmi-sensors
categories: [cc-metric-collector]
tags: ['Admin']
weight: 2
hugo_path: docs/reference/cc-metric-collector/collectors/ipmi.md
---
-->
## `ipmistat` collector
@@ -18,6 +8,9 @@ hugo_path: docs/reference/cc-metric-collector/collectors/ipmi.md
}
```
The `ipmistat` collector reads data from `ipmitool` (`ipmitool sensor`) or `ipmi-sensors` (`ipmi-sensors --sdr-cache-recreate --comma-separated-output`).
The `ipmistat` collector reads data from `ipmitool` (`ipmitool sensor`) or `ipmi-sensors` (`ipmi-sensors --sdr-cache-recreate --comma-separated-output`).
The metrics depend on the output of the underlying tools but contain temperature, power and energy metrics.

File diff suppressed because it is too large Load Diff

View File

@@ -1,164 +1,7 @@
<!--
---
title: LIKWID collector
description: Collect hardware performance events and metrics using LIKWID
categories: [cc-metric-collector]
tags: ['Admin']
weight: 2
hugo_path: docs/reference/cc-metric-collector/collectors/likwid.md
---
-->
## `likwid` collector
The `likwid` collector is probably the most complicated collector. The LIKWID library is included as static library with *direct* access mode. The *direct* access mode is suitable if the daemon is executed by a root user. The static library does not contain the performance groups, so all information needs to be provided in the configuration.
```json
"likwid": {
"force_overwrite" : false,
"invalid_to_zero" : false,
"liblikwid_path" : "/path/to/liblikwid.so",
"accessdaemon_path" : "/folder/that/contains/likwid-accessD",
"access_mode" : "direct or accessdaemon or perf_event",
"lockfile_path" : "/var/run/likwid.lock",
"eventsets": [
{
"events" : {
"COUNTER0": "EVENT0",
"COUNTER1": "EVENT1"
},
"metrics" : [
{
"name": "sum_01",
"calc": "COUNTER0 + COUNTER1",
"publish": false,
"unit": "myunit",
"type": "hwthread"
}
]
}
],
"globalmetrics" : [
{
"name": "global_sum",
"calc": "sum_01",
"publish": true,
"unit": "myunit",
"type": "hwthread"
}
]
}
```
The `likwid` configuration consists of two parts, the `eventsets` and `globalmetrics`:
- An event set list itself has two parts, the `events` and a set of derivable `metrics`. Each of the `events` is a `counter:event` pair in LIKWID's syntax. The `metrics` are a list of formulas to derive the metric value from the measurements of the `events`' values. Each metric has a name, the formula, a type and a publish flag. There is an optional `unit` field. Counter names can be used like variables in the formulas, so `PMC0+PMC1` sums the measurements for the both events configured in the counters `PMC0` and `PMC1`. You can optionally use `time` for the measurement time and `inverseClock` for `1.0/baseCpuFrequency`. The type tells the LikwidCollector whether it is a metric for each hardware thread (`cpu`) or each CPU socket (`socket`). You may specify a unit for the metric with `unit`. The last one is the publishing flag. It tells the LikwidCollector whether a metric should be sent to the router or is only used internally to compute a global metric.
- The `globalmetrics` are metrics which require data from multiple event set measurements to be derived. The inputs are the metrics in the event sets. Similar to the metrics in the event sets, the global metrics are defined by a name, a formula, a type and a publish flag. See event set metrics for details. The only difference is that there is no access to the raw event measurements anymore but only to the metrics. Also `time` and `inverseClock` cannot be used anymore. So, the idea is to derive a metric in the `eventsets` section and reuse it in the `globalmetrics` part. If you need a metric only for deriving the global metrics, disable forwarding of the event set metrics (`"publish": false`). **Be aware** that the combination might be misleading because the "behavior" of a metric changes over time and the multiple measurements might count different computing phases. Similar to the metrics in the eventset, you can specify a metric unit with the `unit` field.
Additional options:
- `force_overwrite`: Same as setting `LIKWID_FORCE=1`. In case counters are already in-use, LIKWID overwrites their configuration to do its measurements
- `invalid_to_zero`: In some cases, the calculations result in `NaN` or `Inf`. With this option, all `NaN` and `Inf` values are replaces with `0.0`. See below in [seperate section](./likwidMetric.md#invalid_to_zero-option)
- `access_mode`: Specify LIKWID access mode: `direct` for direct register access as root user or `accessdaemon`. The access mode `perf_event` is current untested.
- `accessdaemon_path`: Folder of the accessDaemon `likwid-accessD` (like `/usr/local/sbin`)
- `liblikwid_path`: Location of `liblikwid.so` including file name like `/usr/local/lib/liblikwid.so`
- `lockfile_path`: Location of LIKWID's lock file if multiple tools should access the hardware counters. Default `/var/run/likwid.lock`
### Available metric types
Hardware performance counters are scattered all over the system nowadays. A counter coveres a specific part of the system. While there are hardware thread specific counter for CPU cycles, instructions and so on, some others are specific for a whole CPU socket/package. To address that, the LikwidCollector provides the specification of a `type` for each metric.
- `hwthread` : One metric per CPU hardware thread with the tags `"type" : "hwthread"` and `"type-id" : "$hwthread_id"`
- `socket` : One metric per CPU socket/package with the tags `"type" : "socket"` and `"type-id" : "$socket_id"`
**Note:** You cannot specify `socket` type for a metric that is measured at `hwthread` type, so some kind of expert knowledge or lookup work in the [Likwid Wiki](https://github.com/RRZE-HPC/likwid/wiki) is required. Get the type of each counter from the *Architecture* pages and as soon as one counter in a metric is socket-specific, the whole metric is socket-specific.
As a guideline:
- All counters `FIXCx`, `PMCy` and `TMAz` have the type `hwthread`
- All counters names containing `BOX` have the type `socket`
- All `PWRx` counters have type `socket`, except `"PWR1" : "RAPL_CORE_ENERGY"` has `hwthread` type
- All `DFCx` counters have type `socket`
### Help with the configuration
The configuration for the `likwid` collector is quite complicated. Most users don't use LIKWID with the event:counter notation but rely on the performance groups defined by the LIKWID team for each architecture. In order to help with the `likwid` collector configuration, we included a script `scripts/likwid_perfgroup_to_cc_config.py` that creates the configuration of an `eventset` from a performance group (using a LIKWID installation in `$PATH`):
```
$ likwid-perfctr -i
[...]
short name: ICX
[...]
$ likwid-perfctr -a
[...]
MEM_DP
MEM
FLOPS_SP
CLOCK
[...]
$ scripts/likwid_perfgroup_to_cc_config.py ICX MEM_DP
{
"events": {
"FIXC0": "INSTR_RETIRED_ANY",
"FIXC1": "CPU_CLK_UNHALTED_CORE",
"..." : "..."
},
"metrics" : [
{
"calc": "time",
"name": "Runtime (RDTSC) [s]",
"publish": true,
"unit": "seconds"
"type": "hwthread"
},
{
"..." : "..."
}
]
}
```
You can copy this JSON and add it to the `eventsets` list. If you specify multiple event sets, you can add globally derived metrics in the extra `global_metrics` section with the metric names as variables.
### Mixed usage between daemon and users
LIKWID checks the file `/var/run/likwid.lock` before performing any interfering operations. Who is allowed to access the counters is determined by the owner of the file. If it does not exist, it is created for the current user. So, if you want to temporarly allow counter access to a user (e.g. in a job):
Before (SLURM prolog, ...)
```bash
chown $JOBUSER /var/run/likwid.lock
```
After (SLURM epilog, ...)
```bash
chown $CCUSER /var/run/likwid.lock
```
### `invalid_to_zero` option
In some cases LIKWID returns `0.0` for some events that are further used in processing and maybe used as divisor in a calculation. After evaluation of a metric, the result might be `NaN` or `+-Inf`. These resulting metrics are commonly not created and forwarded to the router because the [InfluxDB line protocol](https://docs.influxdata.com/influxdb/cloud/reference/syntax/line-protocol/#float) does not support these special floating-point values. If you want to have them sent, this option forces these metric values to be `0.0` instead.
One might think this does not happen often but often used metrics in the world of performance engineering like Instructions-per-Cycle (IPC) or more frequently the actual CPU clock are derived with events like `CPU_CLK_UNHALTED_CORE` (Intel) which do not increment in halted state (as the name implies). In there are different power management systems in a chip which can cause a hardware thread to go in such a state. Moreover, if no cycles are executed by the core, also many other events are not incremented as well (like `INSTR_RETIRED_ANY` for retired instructions and part of IPC).
### `lockfile_path` option
LIKWID can be configured with a lock file with which the access to the performance monitoring registers can be disabled (only the owner of the lock file is allowed to access the registers). When the `lockfile_path` option is set, the collector subscribes to changes to this file to stop monitoring if the owner of the lock file changes. This feature is useful when users should be able to perform own hardware performance counter measurements through LIKWID or any other tool.
### `send_*_total values` option
- `send_core_total_values`: Metrics, which are usually collected on a per hardware thread basis, are additionally summed up per CPU core.
- `send_socket_total_values` Metrics, which are usually collected on a per hardware thread basis, are additionally summed up per CPU socket.
- `send_node_total_values` Metrics, which are usually collected on a per hardware thread basis, are additionally summed up per node.
### Example configuration
#### AMD Zen3
```json
"likwid": {
"force_overwrite" : false,
"invalid_to_zero" : false,
"eventsets": [
{
"events": {
@@ -177,28 +20,25 @@ LIKWID can be configured with a lock file with which the access to the performan
{
"name": "ipc",
"calc": "PMC0/PMC1",
"type": "hwthread",
"socket_scope": false,
"publish": true
},
{
"name": "flops_any",
"calc": "0.000001*PMC2/time",
"unit": "MFlops/s",
"type": "hwthread",
"socket_scope": false,
"publish": true
},
{
"name": "clock",
"name": "clock_mhz",
"calc": "0.000001*(FIXC1/FIXC2)/inverseClock",
"type": "hwthread",
"unit": "MHz",
"socket_scope": false,
"publish": true
},
{
"name": "mem1",
"calc": "0.000001*(DFC0+DFC1+DFC2+DFC3)*64.0/time",
"unit": "Mbyte/s",
"type": "socket",
"socket_scope": true,
"publish": false
}
]
@@ -216,22 +56,19 @@ LIKWID can be configured with a lock file with which the access to the performan
{
"name": "pwr_core",
"calc": "PWR0/time",
"unit": "Watt"
"type": "socket",
"socket_scope": false,
"publish": true
},
{
"name": "pwr_pkg",
"calc": "PWR1/time",
"type": "socket",
"unit": "Watt"
"socket_scope": true,
"publish": true
},
{
"name": "mem2",
"calc": "0.000001*(DFC0+DFC1+DFC2+DFC3)*64.0/time",
"unit": "Mbyte/s",
"type": "socket",
"socket_scope": true,
"publish": false
}
]
@@ -241,20 +78,18 @@ LIKWID can be configured with a lock file with which the access to the performan
{
"name": "mem_bw",
"calc": "mem1+mem2",
"type": "socket",
"unit": "Mbyte/s",
"socket_scope": true,
"publish": true
}
]
}
```
### How to get the eventsets and metrics from LIKWID
_Example config suitable for AMD Zen3_
The `likwid` collector reads hardware performance counters at a **hwthread** and **socket** level. The configuration looks quite complicated but it is basically copy&paste from [LIKWID's performance groups](https://github.com/RRZE-HPC/likwid/tree/master/groups). The collector made multiple iterations and tried to use the performance groups but it lacked flexibility. The current way of configuration provides most flexibility.
The logic is as following: There are multiple eventsets, each consisting of a list of counters+events and a list of metrics. If you compare a common performance group with the example setting above, there is not much difference:
```
EVENTSET -> "events": {
FIXC1 ACTUAL_CPU_CLOCK -> "FIXC1": "ACTUAL_CPU_CLOCK",
@@ -273,10 +108,12 @@ METRICS -> "metrics": [
IPC PMC0/PMC1 -> {
-> "name" : "IPC",
-> "calc" : "PMC0/PMC1",
-> "type": "hwthread",
-> "socket_scope": false,
-> "publish": true
-> }
-> ]
```
The script `scripts/likwid_perfgroup_to_cc_config.py` might help you.
The `socket_scope` option tells whether it is submitted per socket or per hwthread. If a metric is only used for internal calculations, you can set `publish = false`.
Since some metrics can only be gathered in multiple measurements (like the memory bandwidth on AMD Zen3 chips), configure multiple eventsets like in the example config and use the `globalmetrics` section to combine them. **Be aware** that the combination might be misleading because the "behavior" of a metric changes over time and the multiple measurements might count different computing phases.

View File

@@ -1,47 +1,30 @@
// Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
// All rights reserved. This file is part of cc-lib.
// Use of this source code is governed by a MIT-style
// license that can be found in the LICENSE file.
// additional authors:
// Holger Obermaier (NHR@KIT)
package collectors
import (
"encoding/json"
"fmt"
"os"
"io/ioutil"
"strconv"
"strings"
"time"
cclog "github.com/ClusterCockpit/cc-lib/ccLogger"
lp "github.com/ClusterCockpit/cc-lib/ccMessage"
lp "github.com/ClusterCockpit/cc-metric-collector/internal/ccMetric"
)
// LoadavgCollector collects:
// * load average of last 1, 5 & 15 minutes
// * number of processes currently runnable
// * total number of processes in system
//
// See: https://www.kernel.org/doc/html/latest/filesystems/proc.html
const LOADAVGFILE = "/proc/loadavg"
const LOADAVGFILE = `/proc/loadavg`
type LoadavgCollectorConfig struct {
ExcludeMetrics []string `json:"exclude_metrics,omitempty"`
}
type LoadavgCollector struct {
metricCollector
tags map[string]string
load_matches []string
load_skips []bool
proc_matches []string
proc_skips []bool
config struct {
ExcludeMetrics []string `json:"exclude_metrics,omitempty"`
}
config LoadavgCollectorConfig
}
func (m *LoadavgCollector) Init(config json.RawMessage) error {
m.name = "LoadavgCollector"
m.parallel = true
m.setup()
if len(config) > 0 {
err := json.Unmarshal(config, &m.config)
@@ -49,80 +32,46 @@ func (m *LoadavgCollector) Init(config json.RawMessage) error {
return err
}
}
m.meta = map[string]string{
"source": m.name,
"group": "LOAD"}
m.meta = map[string]string{"source": m.name, "group": "LOAD"}
m.tags = map[string]string{"type": "node"}
m.load_matches = []string{
"load_one",
"load_five",
"load_fifteen"}
m.load_skips = make([]bool, len(m.load_matches))
m.proc_matches = []string{
"proc_run",
"proc_total"}
m.proc_skips = make([]bool, len(m.proc_matches))
for i, name := range m.load_matches {
_, m.load_skips[i] = stringArrayContains(m.config.ExcludeMetrics, name)
}
for i, name := range m.proc_matches {
_, m.proc_skips[i] = stringArrayContains(m.config.ExcludeMetrics, name)
}
m.load_matches = []string{"load_one", "load_five", "load_fifteen"}
m.proc_matches = []string{"proc_run", "proc_total"}
m.init = true
return nil
}
func (m *LoadavgCollector) Read(interval time.Duration, output chan lp.CCMessage) {
func (m *LoadavgCollector) Read(interval time.Duration, output chan lp.CCMetric) {
var skip bool
if !m.init {
return
}
buffer, err := os.ReadFile(LOADAVGFILE)
buffer, err := ioutil.ReadFile(string(LOADAVGFILE))
if err != nil {
cclog.ComponentError(
m.name,
fmt.Sprintf("Read(): Failed to read file '%s': %v", LOADAVGFILE, err))
return
}
now := time.Now()
// Load metrics
ls := strings.Split(string(buffer), ` `)
for i, name := range m.load_matches {
x, err := strconv.ParseFloat(ls[i], 64)
if err != nil {
cclog.ComponentError(
m.name,
fmt.Sprintf("Read(): Failed to convert '%s' to float64: %v", ls[i], err))
continue
}
if m.load_skips[i] {
continue
}
y, err := lp.NewMessage(name, m.tags, m.meta, map[string]interface{}{"value": x}, now)
if err == nil {
output <- y
_, skip = stringArrayContains(m.config.ExcludeMetrics, name)
y, err := lp.New(name, m.tags, m.meta, map[string]interface{}{"value": float64(x)}, time.Now())
if err == nil && !skip {
output <- y
}
}
}
// Process metrics
lv := strings.Split(ls[3], `/`)
for i, name := range m.proc_matches {
x, err := strconv.ParseInt(lv[i], 10, 64)
if err != nil {
cclog.ComponentError(
m.name,
fmt.Sprintf("Read(): Failed to convert '%s' to float64: %v", lv[i], err))
continue
}
if m.proc_skips[i] {
continue
}
y, err := lp.NewMessage(name, m.tags, m.meta, map[string]interface{}{"value": x}, now)
x, err := strconv.ParseFloat(lv[i], 64)
if err == nil {
output <- y
_, skip = stringArrayContains(m.config.ExcludeMetrics, name)
y, err := lp.New(name, m.tags, m.meta, map[string]interface{}{"value": float64(x)}, time.Now())
if err == nil && !skip {
output <- y
}
}
}
}

View File

@@ -1,14 +1,3 @@
<!--
---
title: Load average metric collector
description: Collect metrics from `/proc/loadavg`
categories: [cc-metric-collector]
tags: ['Admin']
weight: 2
hugo_path: docs/reference/cc-metric-collector/collectors/loadavg.md
---
-->
## `loadavg` collector

View File

@@ -1,301 +1,34 @@
// Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
// All rights reserved. This file is part of cc-lib.
// Use of this source code is governed by a MIT-style
// license that can be found in the LICENSE file.
// additional authors:
// Holger Obermaier (NHR@KIT)
package collectors
import (
"encoding/json"
"errors"
"fmt"
"os/exec"
"os/user"
"io/ioutil"
"log"
"strconv"
"strings"
"time"
cclog "github.com/ClusterCockpit/cc-lib/ccLogger"
lp "github.com/ClusterCockpit/cc-lib/ccMessage"
lp "github.com/ClusterCockpit/cc-metric-collector/internal/ccMetric"
)
const LUSTRE_SYSFS = `/sys/fs/lustre`
const LCTL_CMD = `lctl`
const LCTL_OPTION = `get_param`
const LUSTREFILE = `/proc/fs/lustre/llite/lnec-XXXXXX/stats`
type LustreCollectorConfig struct {
LCtlCommand string `json:"lctl_command,omitempty"`
ExcludeMetrics []string `json:"exclude_metrics,omitempty"`
Sudo bool `json:"use_sudo,omitempty"`
SendAbsoluteValues bool `json:"send_abs_values,omitempty"`
SendDerivedValues bool `json:"send_derived_values,omitempty"`
SendDiffValues bool `json:"send_diff_values,omitempty"`
}
type LustreMetricDefinition struct {
name string
lineprefix string
lineoffset int
unit string
calc string
Procfiles []string `json:"procfiles"`
ExcludeMetrics []string `json:"exclude_metrics"`
}
type LustreCollector struct {
metricCollector
tags map[string]string
config LustreCollectorConfig
lctl string
sudoCmd string
lastTimestamp time.Time // Store time stamp of last tick to derive bandwidths
definitions []LustreMetricDefinition // Combined list without excluded metrics
stats map[string]map[string]int64 // Data for last value per device and metric
}
func (m *LustreCollector) getDeviceDataCommand(device string) []string {
var command *exec.Cmd
statsfile := fmt.Sprintf("llite.%s.stats", device)
if m.config.Sudo {
command = exec.Command(m.sudoCmd, m.lctl, LCTL_OPTION, statsfile)
} else {
command = exec.Command(m.lctl, LCTL_OPTION, statsfile)
}
command.Wait()
stdout, _ := command.Output()
return strings.Split(string(stdout), "\n")
}
func (m *LustreCollector) getDevices() []string {
devices := make([]string, 0)
// //Version reading devices from sysfs
// globPattern := filepath.Join(LUSTRE_SYSFS, "llite/*/stats")
// files, err := filepath.Glob(globPattern)
// if err != nil {
// return devices
// }
// for _, f := range files {
// pathlist := strings.Split(f, "/")
// devices = append(devices, pathlist[4])
// }
data := m.getDeviceDataCommand("*")
for _, line := range data {
if strings.HasPrefix(line, "llite") {
linefields := strings.Split(line, ".")
if len(linefields) > 2 {
devices = append(devices, linefields[1])
}
}
}
return devices
}
func getMetricData(lines []string, prefix string, offset int) (int64, error) {
for _, line := range lines {
if strings.HasPrefix(line, prefix) {
lf := strings.Fields(line)
return strconv.ParseInt(lf[offset], 0, 64)
}
}
return 0, errors.New("no such line in data")
}
// //Version reading the stats data of a device from sysfs
// func (m *LustreCollector) getDeviceDataSysfs(device string) []string {
// llitedir := filepath.Join(LUSTRE_SYSFS, "llite")
// devdir := filepath.Join(llitedir, device)
// statsfile := filepath.Join(devdir, "stats")
// buffer, err := os.ReadFile(statsfile)
// if err != nil {
// return make([]string, 0)
// }
// return strings.Split(string(buffer), "\n")
// }
var LustreAbsMetrics = []LustreMetricDefinition{
{
name: "lustre_read_requests",
lineprefix: "read_bytes",
lineoffset: 1,
unit: "requests",
calc: "none",
},
{
name: "lustre_write_requests",
lineprefix: "write_bytes",
lineoffset: 1,
unit: "requests",
calc: "none",
},
{
name: "lustre_read_bytes",
lineprefix: "read_bytes",
lineoffset: 6,
unit: "bytes",
calc: "none",
},
{
name: "lustre_write_bytes",
lineprefix: "write_bytes",
lineoffset: 6,
unit: "bytes",
calc: "none",
},
{
name: "lustre_open",
lineprefix: "open",
lineoffset: 1,
unit: "",
calc: "none",
},
{
name: "lustre_close",
lineprefix: "close",
lineoffset: 1,
unit: "",
calc: "none",
},
{
name: "lustre_setattr",
lineprefix: "setattr",
lineoffset: 1,
unit: "",
calc: "none",
},
{
name: "lustre_getattr",
lineprefix: "getattr",
lineoffset: 1,
unit: "",
calc: "none",
},
{
name: "lustre_statfs",
lineprefix: "statfs",
lineoffset: 1,
unit: "",
calc: "none",
},
{
name: "lustre_inode_permission",
lineprefix: "inode_permission",
lineoffset: 1,
unit: "",
calc: "none",
},
}
var LustreDiffMetrics = []LustreMetricDefinition{
{
name: "lustre_read_requests_diff",
lineprefix: "read_bytes",
lineoffset: 1,
unit: "requests",
calc: "difference",
},
{
name: "lustre_write_requests_diff",
lineprefix: "write_bytes",
lineoffset: 1,
unit: "requests",
calc: "difference",
},
{
name: "lustre_read_bytes_diff",
lineprefix: "read_bytes",
lineoffset: 6,
unit: "bytes",
calc: "difference",
},
{
name: "lustre_write_bytes_diff",
lineprefix: "write_bytes",
lineoffset: 6,
unit: "bytes",
calc: "difference",
},
{
name: "lustre_open_diff",
lineprefix: "open",
lineoffset: 1,
unit: "",
calc: "difference",
},
{
name: "lustre_close_diff",
lineprefix: "close",
lineoffset: 1,
unit: "",
calc: "difference",
},
{
name: "lustre_setattr_diff",
lineprefix: "setattr",
lineoffset: 1,
unit: "",
calc: "difference",
},
{
name: "lustre_getattr_diff",
lineprefix: "getattr",
lineoffset: 1,
unit: "",
calc: "difference",
},
{
name: "lustre_statfs_diff",
lineprefix: "statfs",
lineoffset: 1,
unit: "",
calc: "difference",
},
{
name: "lustre_inode_permission_diff",
lineprefix: "inode_permission",
lineoffset: 1,
unit: "",
calc: "difference",
},
}
var LustreDeriveMetrics = []LustreMetricDefinition{
{
name: "lustre_read_requests_rate",
lineprefix: "read_bytes",
lineoffset: 1,
unit: "requests/sec",
calc: "derivative",
},
{
name: "lustre_write_requests_rate",
lineprefix: "write_bytes",
lineoffset: 1,
unit: "requests/sec",
calc: "derivative",
},
{
name: "lustre_read_bw",
lineprefix: "read_bytes",
lineoffset: 6,
unit: "bytes/sec",
calc: "derivative",
},
{
name: "lustre_write_bw",
lineprefix: "write_bytes",
lineoffset: 6,
unit: "bytes/sec",
calc: "derivative",
},
tags map[string]string
matches map[string]map[string]int
devices []string
config LustreCollectorConfig
}
func (m *LustreCollector) Init(config json.RawMessage) error {
var err error
m.name = "LustreCollector"
m.parallel = true
if len(config) > 0 {
err = json.Unmarshal(config, &m.config)
if err != nil {
@@ -305,132 +38,70 @@ func (m *LustreCollector) Init(config json.RawMessage) error {
m.setup()
m.tags = map[string]string{"type": "node"}
m.meta = map[string]string{"source": m.name, "group": "Lustre"}
// Lustre file system statistics can only be queried by user root
// or with password-less sudo
if !m.config.Sudo {
user, err := user.Current()
if err != nil {
cclog.ComponentError(m.name, "Failed to get current user:", err.Error())
return err
m.matches = map[string]map[string]int{"read_bytes": {"read_bytes": 6, "read_requests": 1},
"write_bytes": {"write_bytes": 6, "write_requests": 1},
"open": {"open": 1},
"close": {"close": 1},
"setattr": {"setattr": 1},
"getattr": {"getattr": 1},
"statfs": {"statfs": 1},
"inode_permission": {"inode_permission": 1}}
m.devices = make([]string, 0)
for _, p := range m.config.Procfiles {
_, err := ioutil.ReadFile(p)
if err == nil {
m.devices = append(m.devices, p)
} else {
log.Print(err.Error())
continue
}
if user.Uid != "0" {
cclog.ComponentError(m.name, "Lustre file system statistics can only be queried by user root")
return err
}
} else {
p, err := exec.LookPath("sudo")
if err != nil {
cclog.ComponentError(m.name, "Cannot find 'sudo'")
return err
}
m.sudoCmd = p
}
p, err := exec.LookPath(m.config.LCtlCommand)
if err != nil {
p, err = exec.LookPath(LCTL_CMD)
if err != nil {
return err
}
if len(m.devices) == 0 {
return errors.New("No metrics to collect")
}
m.lctl = p
m.definitions = []LustreMetricDefinition{}
if m.config.SendAbsoluteValues {
for _, def := range LustreAbsMetrics {
if _, skip := stringArrayContains(m.config.ExcludeMetrics, def.name); !skip {
m.definitions = append(m.definitions, def)
}
}
}
if m.config.SendDiffValues {
for _, def := range LustreDiffMetrics {
if _, skip := stringArrayContains(m.config.ExcludeMetrics, def.name); !skip {
m.definitions = append(m.definitions, def)
}
}
}
if m.config.SendDerivedValues {
for _, def := range LustreDeriveMetrics {
if _, skip := stringArrayContains(m.config.ExcludeMetrics, def.name); !skip {
m.definitions = append(m.definitions, def)
}
}
}
if len(m.definitions) == 0 {
return errors.New("no metrics to collect")
}
devices := m.getDevices()
if len(devices) == 0 {
return errors.New("no Lustre devices found")
}
m.stats = make(map[string]map[string]int64)
for _, d := range devices {
m.stats[d] = make(map[string]int64)
data := m.getDeviceDataCommand(d)
for _, def := range m.definitions {
x, err := getMetricData(data, def.lineprefix, def.lineoffset)
if err == nil {
m.stats[d][def.name] = x
} else {
m.stats[d][def.name] = 0
}
}
}
m.lastTimestamp = time.Now()
m.init = true
return nil
}
func (m *LustreCollector) Read(interval time.Duration, output chan lp.CCMessage) {
func (m *LustreCollector) Read(interval time.Duration, output chan lp.CCMetric) {
if !m.init {
return
}
now := time.Now()
tdiff := now.Sub(m.lastTimestamp)
for device, devData := range m.stats {
data := m.getDeviceDataCommand(device)
for _, def := range m.definitions {
var use_x int64
var err error
var y lp.CCMessage
x, err := getMetricData(data, def.lineprefix, def.lineoffset)
if err == nil {
use_x = x
} else {
use_x = devData[def.name]
}
var value interface{}
switch def.calc {
case "none":
value = use_x
y, err = lp.NewMessage(def.name, m.tags, m.meta, map[string]interface{}{"value": value}, time.Now())
case "difference":
value = use_x - devData[def.name]
if value.(int64) < 0 {
value = 0
for _, p := range m.devices {
buffer, err := ioutil.ReadFile(p)
if err != nil {
log.Print(err)
return
}
for _, line := range strings.Split(string(buffer), "\n") {
lf := strings.Fields(line)
if len(lf) > 1 {
for match, fields := range m.matches {
if lf[0] == match {
for name, idx := range fields {
_, skip := stringArrayContains(m.config.ExcludeMetrics, name)
if skip {
continue
}
x, err := strconv.ParseInt(lf[idx], 0, 64)
if err == nil {
y, err := lp.New(name, m.tags, m.meta, map[string]interface{}{"value": x}, time.Now())
if err == nil {
if strings.Contains(name, "byte") {
y.AddMeta("unit", "Byte")
}
output <- y
}
}
}
}
}
y, err = lp.NewMessage(def.name, m.tags, m.meta, map[string]interface{}{"value": value}, time.Now())
case "derivative":
value = float64(use_x-devData[def.name]) / tdiff.Seconds()
if value.(float64) < 0 {
value = 0
}
y, err = lp.NewMessage(def.name, m.tags, m.meta, map[string]interface{}{"value": value}, time.Now())
}
if err == nil {
y.AddTag("device", device)
if len(def.unit) > 0 {
y.AddMeta("unit", def.unit)
}
output <- y
}
devData[def.name] = use_x
}
}
m.lastTimestamp = now
}
func (m *LustreCollector) Close() {

View File

@@ -1,57 +1,29 @@
<!--
---
title: Lustre filesystem metric collector
description: Collect metrics for Lustre filesystems
categories: [cc-metric-collector]
tags: ['Admin']
weight: 2
hugo_path: docs/reference/cc-metric-collector/collectors/lustre.md
---
-->
## `lustrestat` collector
```json
"lustrestat": {
"lctl_command": "/path/to/lctl",
"procfiles" : [
"/proc/fs/lustre/llite/lnec-XXXXXX/stats"
],
"exclude_metrics": [
"setattr",
"getattr"
],
"send_abs_values" : true,
"send_derived_values" : true,
"send_diff_values": true,
"use_sudo": false
]
}
```
The `lustrestat` collector uses the `lctl` application with the `get_param` option to get all `llite` metrics (Lustre client). The `llite` metrics are only available for root users. If password-less sudo is configured, you can enable `sudo` in the configuration.
The `lustrestat` collector reads from the procfs stat files for Lustre like `/proc/fs/lustre/llite/lnec-XXXXXX/stats`.
Metrics:
* `lustre_read_bytes` (unit `bytes`)
* `lustre_read_requests` (unit `requests`)
* `lustre_write_bytes` (unit `bytes`)
* `lustre_write_requests` (unit `requests`)
* `lustre_open`
* `lustre_close`
* `lustre_getattr`
* `lustre_setattr`
* `lustre_statfs`
* `lustre_inode_permission`
* `lustre_read_bw` (if `send_derived_values == true`, unit `bytes/sec`)
* `lustre_write_bw` (if `send_derived_values == true`, unit `bytes/sec`)
* `lustre_read_requests_rate` (if `send_derived_values == true`, unit `requests/sec`)
* `lustre_write_requests_rate` (if `send_derived_values == true`, unit `requests/sec`)
* `lustre_read_bytes_diff` (if `send_diff_values == true`, unit `bytes`)
* `lustre_read_requests_diff` (if `send_diff_values == true`, unit `requests`)
* `lustre_write_bytes_diff` (if `send_diff_values == true`, unit `bytes`)
* `lustre_write_requests_diff` (if `send_diff_values == true`, unit `requests`)
* `lustre_open_diff` (if `send_diff_values == true`)
* `lustre_close_diff` (if `send_diff_values == true`)
* `lustre_getattr_diff` (if `send_diff_values == true`)
* `lustre_setattr_diff` (if `send_diff_values == true`)
* `lustre_statfs_diff` (if `send_diff_values == true`)
* `lustre_inode_permission_diff` (if `send_diff_values == true`)
* `read_bytes`
* `read_requests`
* `write_bytes`
* `write_requests`
* `open`
* `close`
* `getattr`
* `setattr`
* `statfs`
* `inode_permission`
This collector adds an `device` tag.

View File

@@ -1,109 +1,46 @@
// Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
// All rights reserved. This file is part of cc-lib.
// Use of this source code is governed by a MIT-style
// license that can be found in the LICENSE file.
// additional authors:
// Holger Obermaier (NHR@KIT)
package collectors
import (
"bufio"
"encoding/json"
"errors"
"fmt"
"os"
"path/filepath"
"regexp"
"io/ioutil"
"log"
"strconv"
"strings"
"time"
cclog "github.com/ClusterCockpit/cc-lib/ccLogger"
lp "github.com/ClusterCockpit/cc-lib/ccMessage"
lp "github.com/ClusterCockpit/cc-metric-collector/internal/ccMetric"
)
const MEMSTATFILE = "/proc/meminfo"
const NUMA_MEMSTAT_BASE = "/sys/devices/system/node"
const MEMSTATFILE = `/proc/meminfo`
type MemstatCollectorConfig struct {
ExcludeMetrics []string `json:"exclude_metrics"`
NodeStats bool `json:"node_stats,omitempty"`
NumaStats bool `json:"numa_stats,omitempty"`
}
type MemstatCollectorNode struct {
file string
tags map[string]string
}
type MemstatCollector struct {
metricCollector
stats map[string]int64
tags map[string]string
matches map[string]string
config MemstatCollectorConfig
nodefiles map[int]MemstatCollectorNode
sendMemUsed bool
}
type MemstatStats struct {
value float64
unit string
}
func getStats(filename string) map[string]MemstatStats {
stats := make(map[string]MemstatStats)
file, err := os.Open(filename)
if err != nil {
cclog.Error(err.Error())
}
defer file.Close()
scanner := bufio.NewScanner(file)
for scanner.Scan() {
line := scanner.Text()
linefields := strings.Fields(line)
if len(linefields) == 3 {
v, err := strconv.ParseFloat(linefields[1], 64)
if err == nil {
stats[strings.Trim(linefields[0], ":")] = MemstatStats{
value: v,
unit: linefields[2],
}
}
} else if len(linefields) == 5 {
v, err := strconv.ParseFloat(linefields[3], 64)
if err == nil {
cclog.ComponentDebug("getStats", strings.Trim(linefields[2], ":"), v, linefields[4])
stats[strings.Trim(linefields[2], ":")] = MemstatStats{
value: v,
unit: linefields[4],
}
}
}
}
return stats
stats map[string]int64
tags map[string]string
matches map[string]string
config MemstatCollectorConfig
}
func (m *MemstatCollector) Init(config json.RawMessage) error {
var err error
m.name = "MemstatCollector"
m.parallel = true
m.config.NodeStats = true
m.config.NumaStats = false
if len(config) > 0 {
err = json.Unmarshal(config, &m.config)
if err != nil {
return err
}
}
m.meta = map[string]string{"source": m.name, "group": "Memory"}
m.meta = map[string]string{"source": m.name, "group": "Memory", "unit": "kByte"}
m.stats = make(map[string]int64)
m.matches = make(map[string]string)
m.tags = map[string]string{"type": "node"}
matches := map[string]string{
"MemTotal": "mem_total",
matches := map[string]string{`MemTotal`: "mem_total",
"SwapTotal": "swap_total",
"SReclaimable": "mem_sreclaimable",
"Slab": "mem_slab",
@@ -111,129 +48,79 @@ func (m *MemstatCollector) Init(config json.RawMessage) error {
"Buffers": "mem_buffers",
"Cached": "mem_cached",
"MemAvailable": "mem_available",
"SwapFree": "swap_free",
"MemShared": "mem_shared",
}
"SwapFree": "swap_free"}
for k, v := range matches {
_, skip := stringArrayContains(m.config.ExcludeMetrics, k)
if !skip {
m.matches[k] = v
}
}
m.sendMemUsed = false
if _, skip := stringArrayContains(m.config.ExcludeMetrics, "mem_used"); !skip {
m.sendMemUsed = true
}
if len(m.matches) == 0 {
return errors.New("no metrics to collect")
return errors.New("No metrics to collect")
}
m.setup()
if m.config.NodeStats {
if stats := getStats(MEMSTATFILE); len(stats) == 0 {
return fmt.Errorf("cannot read data from file %s", MEMSTATFILE)
}
_, err = ioutil.ReadFile(string(MEMSTATFILE))
if err == nil {
m.init = true
}
if m.config.NumaStats {
globPattern := filepath.Join(NUMA_MEMSTAT_BASE, "node[0-9]*", "meminfo")
regex := regexp.MustCompile(filepath.Join(NUMA_MEMSTAT_BASE, "node(\\d+)", "meminfo"))
files, err := filepath.Glob(globPattern)
if err == nil {
m.nodefiles = make(map[int]MemstatCollectorNode)
for _, f := range files {
if stats := getStats(f); len(stats) == 0 {
return fmt.Errorf("cannot read data from file %s", f)
}
rematch := regex.FindStringSubmatch(f)
if len(rematch) == 2 {
id, err := strconv.Atoi(rematch[1])
if err == nil {
f := MemstatCollectorNode{
file: f,
tags: map[string]string{
"type": "memoryDomain",
"type-id": fmt.Sprintf("%d", id),
},
}
m.nodefiles[id] = f
}
}
}
}
}
m.init = true
return err
}
func (m *MemstatCollector) Read(interval time.Duration, output chan lp.CCMessage) {
func (m *MemstatCollector) Read(interval time.Duration, output chan lp.CCMetric) {
if !m.init {
return
}
sendStats := func(stats map[string]MemstatStats, tags map[string]string) {
for match, name := range m.matches {
var value float64 = 0
var unit string = ""
if v, ok := stats[match]; ok {
value = v.value
if len(v.unit) > 0 {
unit = v.unit
}
}
buffer, err := ioutil.ReadFile(string(MEMSTATFILE))
if err != nil {
log.Print(err)
return
}
y, err := lp.NewMessage(name, tags, m.meta, map[string]interface{}{"value": value}, time.Now())
if err == nil {
if len(unit) > 0 {
y.AddMeta("unit", unit)
}
output <- y
}
}
if m.sendMemUsed {
memUsed := 0.0
unit := ""
if totalVal, total := stats["MemTotal"]; total {
if freeVal, free := stats["MemFree"]; free {
memUsed = totalVal.value - freeVal.value
if len(totalVal.unit) > 0 {
unit = totalVal.unit
} else if len(freeVal.unit) > 0 {
unit = freeVal.unit
}
if bufVal, buffers := stats["Buffers"]; buffers {
memUsed -= bufVal.value
if len(bufVal.unit) > 0 && len(unit) == 0 {
unit = bufVal.unit
}
if cacheVal, cached := stats["Cached"]; cached {
memUsed -= cacheVal.value
if len(cacheVal.unit) > 0 && len(unit) == 0 {
unit = cacheVal.unit
}
}
}
}
}
y, err := lp.NewMessage("mem_used", tags, m.meta, map[string]interface{}{"value": memUsed}, time.Now())
if err == nil {
if len(unit) > 0 {
y.AddMeta("unit", unit)
}
output <- y
}
ll := strings.Split(string(buffer), "\n")
for _, line := range ll {
ls := strings.Split(line, `:`)
if len(ls) > 1 {
lv := strings.Fields(ls[1])
m.stats[ls[0]], err = strconv.ParseInt(lv[0], 0, 64)
}
}
if m.config.NodeStats {
nodestats := getStats(MEMSTATFILE)
sendStats(nodestats, m.tags)
if _, exists := m.stats[`MemTotal`]; !exists {
err = errors.New("Parse error")
log.Print(err)
return
}
if m.config.NumaStats {
for _, nodeConf := range m.nodefiles {
stats := getStats(nodeConf.file)
sendStats(stats, nodeConf.tags)
for match, name := range m.matches {
if _, exists := m.stats[match]; !exists {
err = fmt.Errorf("Parse error for %s : %s", match, name)
log.Print(err)
continue
}
y, err := lp.New(name, m.tags, m.meta, map[string]interface{}{"value": int(float64(m.stats[match]) * 1.0e-3)}, time.Now())
if err == nil {
output <- y
}
}
if _, free := m.stats[`MemFree`]; free {
if _, buffers := m.stats[`Buffers`]; buffers {
if _, cached := m.stats[`Cached`]; cached {
memUsed := m.stats[`MemTotal`] - (m.stats[`MemFree`] + m.stats[`Buffers`] + m.stats[`Cached`])
_, skip := stringArrayContains(m.config.ExcludeMetrics, "mem_used")
y, err := lp.New("mem_used", m.tags, m.meta, map[string]interface{}{"value": int(float64(memUsed) * 1.0e-3)}, time.Now())
if err == nil && !skip {
output <- y
}
}
}
}
if _, found := m.stats[`MemShared`]; found {
_, skip := stringArrayContains(m.config.ExcludeMetrics, "mem_shared")
y, err := lp.New("mem_shared", m.tags, m.meta, map[string]interface{}{"value": int(float64(m.stats[`MemShared`]) * 1.0e-3)}, time.Now())
if err == nil && !skip {
output <- y
}
}
}

View File

@@ -1,14 +1,3 @@
<!--
---
title: Memory statistics metric collector
description: Collect metrics from `/proc/meminfo`
categories: [cc-metric-collector]
tags: ['Admin']
weight: 2
hugo_path: docs/reference/cc-metric-collector/collectors/memstat.md
---
-->
## `memstat` collector

View File

@@ -1,52 +1,42 @@
// Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
// All rights reserved. This file is part of cc-lib.
// Use of this source code is governed by a MIT-style
// license that can be found in the LICENSE file.
// additional authors:
// Holger Obermaier (NHR@KIT)
package collectors
import (
"encoding/json"
"fmt"
"io/ioutil"
"log"
"strconv"
"strings"
"time"
lp "github.com/ClusterCockpit/cc-lib/ccMessage"
lp "github.com/ClusterCockpit/cc-metric-collector/internal/ccMetric"
influx "github.com/influxdata/line-protocol"
)
type MetricCollector interface {
Name() string // Name of the metric collector
Init(config json.RawMessage) error // Initialize metric collector
Initialized() bool // Is metric collector initialized?
Parallel() bool
Read(duration time.Duration, output chan lp.CCMessage) // Read metrics from metric collector
Close() // Close / finish metric collector
Name() string
Init(config json.RawMessage) error
Initialized() bool
Read(duration time.Duration, output chan lp.CCMetric)
Close()
}
type metricCollector struct {
name string // name of the metric
init bool // is metric collector initialized?
parallel bool // can the metric collector be executed in parallel with others
meta map[string]string // static meta data tags
name string
init bool
meta map[string]string
}
// Name returns the name of the metric collector
// Name() returns the name of the metric collector
func (c *metricCollector) Name() string {
return c.name
}
// Name returns the name of the metric collector
func (c *metricCollector) Parallel() bool {
return c.parallel
}
// Setup is for future use
func (c *metricCollector) setup() error {
return nil
}
// Initialized indicates whether the metric collector has been initialized
// Initialized() indicates whether the metric collector has been initialized.
func (c *metricCollector) Initialized() bool {
return c.init
}
@@ -75,13 +65,81 @@ func stringArrayContains(array []string, str string) (int, bool) {
return -1, false
}
func SocketList() []int {
buffer, err := ioutil.ReadFile("/proc/cpuinfo")
if err != nil {
log.Print(err)
return nil
}
ll := strings.Split(string(buffer), "\n")
var packs []int
for _, line := range ll {
if strings.HasPrefix(line, "physical id") {
lv := strings.Fields(line)
id, err := strconv.ParseInt(lv[3], 10, 32)
if err != nil {
log.Print(err)
return packs
}
_, found := intArrayContains(packs, int(id))
if !found {
packs = append(packs, int(id))
}
}
}
return packs
}
func CpuList() []int {
buffer, err := ioutil.ReadFile("/proc/cpuinfo")
if err != nil {
log.Print(err)
return nil
}
ll := strings.Split(string(buffer), "\n")
var cpulist []int
for _, line := range ll {
if strings.HasPrefix(line, "processor") {
lv := strings.Fields(line)
id, err := strconv.ParseInt(lv[2], 10, 32)
if err != nil {
log.Print(err)
return cpulist
}
_, found := intArrayContains(cpulist, int(id))
if !found {
cpulist = append(cpulist, int(id))
}
}
}
return cpulist
}
// Tags2Map stores a InfluxDB list of tags in a map of key value pairs
func Tags2Map(metric influx.Metric) map[string]string {
tags := make(map[string]string)
for _, t := range metric.TagList() {
tags[t.Key] = t.Value
}
return tags
}
// Fields2Map stores a InfluxDB list of fields in a map of key value pairs
func Fields2Map(metric influx.Metric) map[string]interface{} {
fields := make(map[string]interface{})
for _, f := range metric.FieldList() {
fields[f.Key] = f.Value
}
return fields
}
// RemoveFromStringList removes the string r from the array of strings s
// If r is not contained in the array an error is returned
func RemoveFromStringList(s []string, r string) ([]string, error) {
for i := range s {
if r == s[i] {
for i, item := range s {
if r == item {
return append(s[:i], s[i+1:]...), nil
}
}
return s, fmt.Errorf("no such string in list")
return s, fmt.Errorf("No such string in list")
}

View File

@@ -1,247 +1,92 @@
// Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
// All rights reserved. This file is part of cc-lib.
// Use of this source code is governed by a MIT-style
// license that can be found in the LICENSE file.
// additional authors:
// Holger Obermaier (NHR@KIT)
package collectors
import (
"bufio"
"encoding/json"
"errors"
"os"
"io/ioutil"
"log"
"strconv"
"strings"
"time"
cclog "github.com/ClusterCockpit/cc-lib/ccLogger"
lp "github.com/ClusterCockpit/cc-lib/ccMessage"
lp "github.com/ClusterCockpit/cc-metric-collector/internal/ccMetric"
)
const NETSTATFILE = "/proc/net/dev"
const NETSTATFILE = `/proc/net/dev`
type NetstatCollectorConfig struct {
IncludeDevices []string `json:"include_devices"`
SendAbsoluteValues bool `json:"send_abs_values"`
SendDerivedValues bool `json:"send_derived_values"`
InterfaceAliases map[string][]string `json:"interface_aliases,omitempty"`
}
type NetstatCollectorMetric struct {
name string
index int
tags map[string]string
meta map[string]string
meta_rates map[string]string
lastValue int64
ExcludeDevices []string `json:"exclude_devices"`
}
type NetstatCollector struct {
metricCollector
config NetstatCollectorConfig
aliasToCanonical map[string]string
matches map[string][]NetstatCollectorMetric
lastTimestamp time.Time
}
func (m *NetstatCollector) buildAliasMapping() {
m.aliasToCanonical = make(map[string]string)
for canon, aliases := range m.config.InterfaceAliases {
for _, alias := range aliases {
m.aliasToCanonical[alias] = canon
}
}
}
func getCanonicalName(raw string, aliasToCanonical map[string]string) string {
if canon, ok := aliasToCanonical[raw]; ok {
return canon
}
return raw
config NetstatCollectorConfig
matches map[int]string
}
func (m *NetstatCollector) Init(config json.RawMessage) error {
m.name = "NetstatCollector"
m.parallel = true
m.setup()
m.lastTimestamp = time.Now()
const (
fieldInterface = iota
fieldReceiveBytes
fieldReceivePackets
fieldReceiveErrs
fieldReceiveDrop
fieldReceiveFifo
fieldReceiveFrame
fieldReceiveCompressed
fieldReceiveMulticast
fieldTransmitBytes
fieldTransmitPackets
fieldTransmitErrs
fieldTransmitDrop
fieldTransmitFifo
fieldTransmitColls
fieldTransmitCarrier
fieldTransmitCompressed
)
m.matches = make(map[string][]NetstatCollectorMetric)
// Set default configuration,
m.config.SendAbsoluteValues = true
m.config.SendDerivedValues = false
// Read configuration file, allow overwriting default config
m.meta = map[string]string{"source": m.name, "group": "Memory"}
m.matches = map[int]string{
1: "bytes_in",
9: "bytes_out",
2: "pkts_in",
10: "pkts_out",
}
if len(config) > 0 {
err := json.Unmarshal(config, &m.config)
if err != nil {
cclog.ComponentError(m.name, "Error reading config:", err.Error())
log.Print(err.Error())
return err
}
}
m.buildAliasMapping()
// Check access to net statistic file
file, err := os.Open(NETSTATFILE)
if err != nil {
cclog.ComponentError(m.name, err.Error())
return err
_, err := ioutil.ReadFile(string(NETSTATFILE))
if err == nil {
m.init = true
}
defer file.Close()
scanner := bufio.NewScanner(file)
for scanner.Scan() {
l := scanner.Text()
// Skip lines with no net device entry
if !strings.Contains(l, ":") {
continue
}
// Split line into fields
f := strings.Fields(l)
// Get raw and canonical names
raw := strings.Trim(f[0], ": ")
canonical := getCanonicalName(raw, m.aliasToCanonical)
// Check if device is a included device
if _, ok := stringArrayContains(m.config.IncludeDevices, canonical); ok {
// Tag will contain original device name (raw).
tags := map[string]string{"stype": "network", "stype-id": raw, "type": "node"}
meta_unit_byte := map[string]string{"source": m.name, "group": "Network", "unit": "bytes"}
meta_unit_byte_per_sec := map[string]string{"source": m.name, "group": "Network", "unit": "bytes/sec"}
meta_unit_pkts := map[string]string{"source": m.name, "group": "Network", "unit": "packets"}
meta_unit_pkts_per_sec := map[string]string{"source": m.name, "group": "Network", "unit": "packets/sec"}
m.matches[canonical] = []NetstatCollectorMetric{
{
name: "net_bytes_in",
index: fieldReceiveBytes,
lastValue: -1,
tags: tags,
meta: meta_unit_byte,
meta_rates: meta_unit_byte_per_sec,
},
{
name: "net_pkts_in",
index: fieldReceivePackets,
lastValue: -1,
tags: tags,
meta: meta_unit_pkts,
meta_rates: meta_unit_pkts_per_sec,
},
{
name: "net_bytes_out",
index: fieldTransmitBytes,
lastValue: -1,
tags: tags,
meta: meta_unit_byte,
meta_rates: meta_unit_byte_per_sec,
},
{
name: "net_pkts_out",
index: fieldTransmitPackets,
lastValue: -1,
tags: tags,
meta: meta_unit_pkts,
meta_rates: meta_unit_pkts_per_sec,
},
}
}
}
if len(m.matches) == 0 {
return errors.New("no devices to collector metrics found")
}
m.init = true
return nil
}
func (m *NetstatCollector) Read(interval time.Duration, output chan lp.CCMessage) {
if !m.init {
return
}
// Current time stamp
now := time.Now()
// time difference to last time stamp
timeDiff := now.Sub(m.lastTimestamp).Seconds()
// Save current timestamp
m.lastTimestamp = now
file, err := os.Open(NETSTATFILE)
func (m *NetstatCollector) Read(interval time.Duration, output chan lp.CCMetric) {
data, err := ioutil.ReadFile(string(NETSTATFILE))
if err != nil {
cclog.ComponentError(m.name, err.Error())
log.Print(err.Error())
return
}
defer file.Close()
scanner := bufio.NewScanner(file)
for scanner.Scan() {
l := scanner.Text()
// Skip lines with no net device entry
lines := strings.Split(string(data), "\n")
for _, l := range lines {
if !strings.Contains(l, ":") {
continue
}
// Split line into fields
f := strings.Fields(l)
// Get raw and canonical names
raw := strings.Trim(f[0], ":")
canonical := getCanonicalName(raw, m.aliasToCanonical)
// Check if device is a included device
if devmetrics, ok := m.matches[canonical]; ok {
for i := range devmetrics {
metric := &devmetrics[i]
// Read value
v, err := strconv.ParseInt(f[metric.index], 10, 64)
if err != nil {
continue
}
if m.config.SendAbsoluteValues {
if y, err := lp.NewMessage(metric.name, metric.tags, metric.meta, map[string]interface{}{"value": v}, now); err == nil {
output <- y
dev := f[0][0 : len(f[0])-1]
cont := false
for _, d := range m.config.ExcludeDevices {
if d == dev {
cont = true
}
}
if cont {
continue
}
tags := map[string]string{"device": dev, "type": "node"}
for i, name := range m.matches {
v, err := strconv.ParseInt(f[i], 10, 0)
if err == nil {
y, err := lp.New(name, tags, m.meta, map[string]interface{}{"value": int(float64(v) * 1.0e-3)}, time.Now())
if err == nil {
switch {
case strings.Contains(name, "byte"):
y.AddMeta("unit", "Byte")
case strings.Contains(name, "pkt"):
y.AddMeta("unit", "Packets")
}
}
if m.config.SendDerivedValues {
if metric.lastValue >= 0 {
rate := float64(v-metric.lastValue) / timeDiff
if y, err := lp.NewMessage(metric.name+"_bw", metric.tags, metric.meta_rates, map[string]interface{}{"value": rate}, now); err == nil {
output <- y
}
}
metric.lastValue = v
output <- y
}
}
}
}
}
func (m *NetstatCollector) Close() {

View File

@@ -1,41 +1,21 @@
<!--
---
title: Network device metric collector
description: Collect metrics for network devices through procfs
categories: [cc-metric-collector]
tags: ['Admin']
weight: 2
hugo_path: docs/reference/cc-metric-collector/collectors/netstat.md
---
-->
## `netstat` collector
```json
"netstat": {
"include_devices": [
"eth0",
"eno1"
],
"send_abs_values": true,
"send_derived_values": true,
"interface_aliases": {
"eno1": ["eno1np0", "eno1_alt"],
"eth0": ["eth0_alias"]
}
"exclude_devices": [
"lo"
]
}
```
The `netstat` collector reads data from `/proc/net/dev` and outputs a handful **node** metrics. With the `include_devices` list you can specify which network devices should be measured. **Note**: Most other collectors use an _exclude_ list instead of an include list. Optionally, you can define an interface_aliases mapping. For each canonical device (as listed in include_devices), you may provide an array of aliases that may be reported by the system. When an alias is detected, it is preferred for matching, while the output tag stype-id always shows the actual system-reported name.
The `netstat` collector reads data from `/proc/net/dev` and outputs a handful **node** metrics. If a device is not required, it can be excluded from forwarding it to the sink. Commonly the `lo` device should be excluded.
Metrics:
* `net_bytes_in` (`unit=bytes`)
* `net_bytes_out` (`unit=bytes`)
* `net_pkts_in` (`unit=packets`)
* `net_pkts_out` (`unit=packets`)
* `net_bytes_in_bw` (`unit=bytes/sec` if `send_derived_values == true`)
* `net_bytes_out_bw` (`unit=bytes/sec` if `send_derived_values == true`)
* `net_pkts_in_bw` (`unit=packets/sec` if `send_derived_values == true`)
* `net_pkts_out_bw` (`unit=packets/sec` if `send_derived_values == true`)
* `bytes_in`
* `bytes_out`
* `pkts_in`
* `pkts_out`
The device name is added as tag `device`.
The device name is added as tag `stype=network,stype-id=<device>`.

View File

@@ -1,50 +0,0 @@
<!--
---
title: NFS network filesystem (v3) metric collector
description: Collect metrics for NFS network filesystems in version 3
categories: [cc-metric-collector]
tags: ['Admin']
weight: 2
hugo_path: docs/reference/cc-metric-collector/collectors/nfs3.md
---
-->
## `nfs3stat` collector
```json
"nfs3stat": {
"nfsstat" : "/path/to/nfsstat",
"exclude_metrics": [
"nfs3_total"
]
}
```
The `nfs3stat` collector reads data from `nfsstat` command and outputs a handful **node** metrics. If a metric is not required, it can be excluded from forwarding it to the sink. There is currently no possibility to get the metrics per mount point.
Metrics:
* `nfs3_total`
* `nfs3_null`
* `nfs3_getattr`
* `nfs3_setattr`
* `nfs3_lookup`
* `nfs3_access`
* `nfs3_readlink`
* `nfs3_read`
* `nfs3_write`
* `nfs3_create`
* `nfs3_mkdir`
* `nfs3_symlink`
* `nfs3_remove`
* `nfs3_rmdir`
* `nfs3_rename`
* `nfs3_link`
* `nfs3_readdir`
* `nfs3_readdirplus`
* `nfs3_fsstat`
* `nfs3_fsinfo`
* `nfs3_pathconf`
* `nfs3_commit`

View File

@@ -1,73 +0,0 @@
<!--
---
title: NFS network filesystem (v4) metric collector
description: Collect metrics for NFS network filesystems in version 4
categories: [cc-metric-collector]
tags: ['Admin']
weight: 2
hugo_path: docs/reference/cc-metric-collector/collectors/nfs4.md
---
-->
## `nfs4stat` collector
```json
"nfs4stat": {
"nfsstat" : "/path/to/nfsstat",
"exclude_metrics": [
"nfs4_total"
]
}
```
The `nfs4stat` collector reads data from `nfsstat` command and outputs a handful **node** metrics. If a metric is not required, it can be excluded from forwarding it to the sink. There is currently no possibility to get the metrics per mount point.
Metrics:
* `nfs4_total`
* `nfs4_null`
* `nfs4_read`
* `nfs4_write`
* `nfs4_commit`
* `nfs4_open`
* `nfs4_open_conf`
* `nfs4_open_noat`
* `nfs4_open_dgrd`
* `nfs4_close`
* `nfs4_setattr`
* `nfs4_fsinfo`
* `nfs4_renew`
* `nfs4_setclntid`
* `nfs4_confirm`
* `nfs4_lock`
* `nfs4_lockt`
* `nfs4_locku`
* `nfs4_access`
* `nfs4_getattr`
* `nfs4_lookup`
* `nfs4_lookup_root`
* `nfs4_remove`
* `nfs4_rename`
* `nfs4_link`
* `nfs4_symlink`
* `nfs4_create`
* `nfs4_pathconf`
* `nfs4_statfs`
* `nfs4_readlink`
* `nfs4_readdir`
* `nfs4_server_caps`
* `nfs4_delegreturn`
* `nfs4_getacl`
* `nfs4_setacl`
* `nfs4_rel_lkowner`
* `nfs4_exchange_id`
* `nfs4_create_session`
* `nfs4_destroy_session`
* `nfs4_sequence`
* `nfs4_get_lease_time`
* `nfs4_reclaim_comp`
* `nfs4_secinfo_no`
* `nfs4_bind_conn_to_ses`

View File

@@ -1,10 +1,3 @@
// Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
// All rights reserved. This file is part of cc-lib.
// Use of this source code is governed by a MIT-style
// license that can be found in the LICENSE file.
// additional authors:
// Holger Obermaier (NHR@KIT)
package collectors
import (
@@ -18,32 +11,26 @@ import (
"strings"
"time"
lp "github.com/ClusterCockpit/cc-lib/ccMessage"
lp "github.com/ClusterCockpit/cc-metric-collector/internal/ccMetric"
)
// First part contains the code for the general NfsCollector.
// Later, the general NfsCollector is more limited to Nfs3- and Nfs4Collector.
const NFSSTAT_EXEC = `nfsstat`
type NfsCollectorData struct {
current int64
last int64
}
type nfsCollector struct {
type NfsCollector struct {
metricCollector
tags map[string]string
version string
config struct {
Nfsstats string `json:"nfsstat"`
tags map[string]string
config struct {
Nfsutils string `json:"nfsutils"`
ExcludeMetrics []string `json:"exclude_metrics,omitempty"`
}
data map[string]NfsCollectorData
data map[string]map[string]NfsCollectorData
}
func (m *nfsCollector) initStats() error {
cmd := exec.Command(m.config.Nfsstats, `-l`, `--all`)
func (m *NfsCollector) initStats() error {
cmd := exec.Command(m.config.Nfsutils, "-l")
cmd.Wait()
buffer, err := cmd.Output()
if err == nil {
@@ -52,16 +39,17 @@ func (m *nfsCollector) initStats() error {
if len(lf) != 5 {
continue
}
if lf[1] == m.version {
name := strings.Trim(lf[3], ":")
if _, exist := m.data[name]; !exist {
value, err := strconv.ParseInt(lf[4], 0, 64)
if err == nil {
x := m.data[name]
x.current = value
x.last = value
m.data[name] = x
}
if _, exist := m.data[lf[1]]; !exist {
m.data[lf[1]] = make(map[string]NfsCollectorData)
}
name := strings.Trim(lf[3], ":")
if _, exist := m.data[lf[1]][name]; !exist {
value, err := strconv.ParseInt(lf[4], 0, 64)
if err == nil {
x := m.data[lf[1]][name]
x.current = value
x.last = 0
m.data[lf[1]][name] = x
}
}
}
@@ -69,8 +57,8 @@ func (m *nfsCollector) initStats() error {
return err
}
func (m *nfsCollector) updateStats() error {
cmd := exec.Command(m.config.Nfsstats, `-l`, `--all`)
func (m *NfsCollector) updateStats() error {
cmd := exec.Command(m.config.Nfsutils, "-l")
cmd.Wait()
buffer, err := cmd.Output()
if err == nil {
@@ -79,16 +67,17 @@ func (m *nfsCollector) updateStats() error {
if len(lf) != 5 {
continue
}
if lf[1] == m.version {
name := strings.Trim(lf[3], ":")
if _, exist := m.data[name]; exist {
value, err := strconv.ParseInt(lf[4], 0, 64)
if err == nil {
x := m.data[name]
x.last = x.current
x.current = value
m.data[name] = x
}
if _, exist := m.data[lf[1]]; !exist {
m.data[lf[1]] = make(map[string]NfsCollectorData)
}
name := strings.Trim(lf[3], ":")
if _, exist := m.data[lf[1]][name]; exist {
value, err := strconv.ParseInt(lf[4], 0, 64)
if err == nil {
x := m.data[lf[1]][name]
x.last = x.current
x.current = value
m.data[lf[1]][name] = x
}
}
}
@@ -96,11 +85,17 @@ func (m *nfsCollector) updateStats() error {
return err
}
func (m *nfsCollector) MainInit(config json.RawMessage) error {
m.config.Nfsstats = string(NFSSTAT_EXEC)
func (m *NfsCollector) Init(config json.RawMessage) error {
var err error
m.name = "NfsCollector"
m.setup()
// Set default mmpmon binary
m.config.Nfsutils = "/usr/sbin/nfsstat"
// Read JSON configuration
if len(config) > 0 {
err := json.Unmarshal(config, &m.config)
err = json.Unmarshal(config, &m.config)
if err != nil {
log.Print(err.Error())
return err
@@ -113,70 +108,40 @@ func (m *nfsCollector) MainInit(config json.RawMessage) error {
m.tags = map[string]string{
"type": "node",
}
// Check if nfsstat is in executable search path
_, err := exec.LookPath(m.config.Nfsstats)
// Check if mmpmon is in executable search path
_, err = exec.LookPath(m.config.Nfsutils)
if err != nil {
return fmt.Errorf("NfsCollector.Init(): Failed to find nfsstat binary '%s': %v", m.config.Nfsstats, err)
return fmt.Errorf("NfsCollector.Init(): Failed to find nfsstat binary '%s': %v", m.config.Nfsutils, err)
}
m.data = make(map[string]NfsCollectorData)
m.data = make(map[string]map[string]NfsCollectorData)
m.initStats()
m.init = true
m.parallel = true
return nil
}
func (m *nfsCollector) Read(interval time.Duration, output chan lp.CCMessage) {
func (m *NfsCollector) Read(interval time.Duration, output chan lp.CCMetric) {
if !m.init {
return
}
timestamp := time.Now()
m.updateStats()
prefix := ""
switch m.version {
case "v3":
prefix = "nfs3"
case "v4":
prefix = "nfs4"
default:
prefix = "nfs"
}
for name, data := range m.data {
if _, skip := stringArrayContains(m.config.ExcludeMetrics, name); skip {
continue
}
value := data.current - data.last
y, err := lp.NewMessage(fmt.Sprintf("%s_%s", prefix, name), m.tags, m.meta, map[string]interface{}{"value": value}, timestamp)
if err == nil {
y.AddMeta("version", m.version)
output <- y
for version, metrics := range m.data {
for name, data := range metrics {
if _, skip := stringArrayContains(m.config.ExcludeMetrics, name); skip {
continue
}
value := data.current - data.last
y, err := lp.New(fmt.Sprintf("nfs_%s", name), m.tags, m.meta, map[string]interface{}{"value": value}, timestamp)
if err == nil {
y.AddMeta("version", version)
output <- y
}
}
}
}
func (m *nfsCollector) Close() {
func (m *NfsCollector) Close() {
m.init = false
}
type Nfs3Collector struct {
nfsCollector
}
type Nfs4Collector struct {
nfsCollector
}
func (m *Nfs3Collector) Init(config json.RawMessage) error {
m.name = "Nfs3Collector"
m.version = `v3`
m.setup()
return m.MainInit(config)
}
func (m *Nfs4Collector) Init(config json.RawMessage) error {
m.name = "Nfs4Collector"
m.version = `v4`
m.setup()
return m.MainInit(config)
}

View File

@@ -1,189 +0,0 @@
// Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
// All rights reserved. This file is part of cc-lib.
// Use of this source code is governed by a MIT-style
// license that can be found in the LICENSE file.
// additional authors:
// Holger Obermaier (NHR@KIT)
package collectors
import (
"encoding/json"
"fmt"
"os"
"regexp"
"strconv"
"strings"
"time"
cclog "github.com/ClusterCockpit/cc-lib/ccLogger"
lp "github.com/ClusterCockpit/cc-lib/ccMessage"
)
// These are the fields we read from the JSON configuration
type NfsIOStatCollectorConfig struct {
ExcludeMetrics []string `json:"exclude_metrics,omitempty"`
ExcludeFilesystem []string `json:"exclude_filesystem,omitempty"`
UseServerAddressAsSType bool `json:"use_server_as_stype,omitempty"`
SendAbsoluteValues bool `json:"send_abs_values"`
SendDerivedValues bool `json:"send_derived_values"`
}
// This contains all variables we need during execution and the variables
// defined by metricCollector (name, init, ...)
type NfsIOStatCollector struct {
metricCollector
config NfsIOStatCollectorConfig // the configuration structure
meta map[string]string // default meta information
tags map[string]string // default tags
data map[string]map[string]int64 // data storage for difference calculation
key string // which device info should be used as subtype ID? 'server' or 'mntpoint'
lastTimestamp time.Time
}
var deviceRegex = regexp.MustCompile(`device (?P<server>[^ ]+) mounted on (?P<mntpoint>[^ ]+) with fstype nfs(?P<version>\d*) statvers=[\d\.]+`)
var bytesRegex = regexp.MustCompile(`\s+bytes:\s+(?P<nread>[^ ]+) (?P<nwrite>[^ ]+) (?P<dread>[^ ]+) (?P<dwrite>[^ ]+) (?P<nfsread>[^ ]+) (?P<nfswrite>[^ ]+) (?P<pageread>[^ ]+) (?P<pagewrite>[^ ]+)`)
func resolve_regex_fields(s string, regex *regexp.Regexp) map[string]string {
fields := make(map[string]string)
groups := regex.SubexpNames()
for _, match := range regex.FindAllStringSubmatch(s, -1) {
for groupIdx, group := range match {
if len(groups[groupIdx]) > 0 {
fields[groups[groupIdx]] = group
}
}
}
return fields
}
func (m *NfsIOStatCollector) readNfsiostats() map[string]map[string]int64 {
data := make(map[string]map[string]int64)
filename := "/proc/self/mountstats"
stats, err := os.ReadFile(filename)
if err != nil {
return data
}
lines := strings.Split(string(stats), "\n")
var current map[string]string = nil
for _, l := range lines {
// Is this a device line with mount point, remote target and NFS version?
dev := resolve_regex_fields(l, deviceRegex)
if len(dev) > 0 {
if _, ok := stringArrayContains(m.config.ExcludeFilesystem, dev[m.key]); !ok {
current = dev
if len(current["version"]) == 0 {
current["version"] = "3"
}
}
}
if len(current) > 0 {
// Byte line parsing (if found the device for it)
bytes := resolve_regex_fields(l, bytesRegex)
if len(bytes) > 0 {
data[current[m.key]] = make(map[string]int64)
for name, sval := range bytes {
if _, ok := stringArrayContains(m.config.ExcludeMetrics, name); !ok {
val, err := strconv.ParseInt(sval, 10, 64)
if err == nil {
data[current[m.key]][name] = val
}
}
}
current = nil
}
}
}
return data
}
func (m *NfsIOStatCollector) Init(config json.RawMessage) error {
var err error = nil
m.name = "NfsIOStatCollector"
m.setup()
m.parallel = true
m.meta = map[string]string{"source": m.name, "group": "NFS", "unit": "bytes"}
m.tags = map[string]string{"type": "node"}
m.config.UseServerAddressAsSType = false
// Set default configuration
m.config.SendAbsoluteValues = true
m.config.SendDerivedValues = false
if len(config) > 0 {
err = json.Unmarshal(config, &m.config)
if err != nil {
cclog.ComponentError(m.name, "Error reading config:", err.Error())
return err
}
}
m.key = "mntpoint"
if m.config.UseServerAddressAsSType {
m.key = "server"
}
m.data = m.readNfsiostats()
m.lastTimestamp = time.Now()
m.init = true
return err
}
func (m *NfsIOStatCollector) Read(interval time.Duration, output chan lp.CCMessage) {
now := time.Now()
timeDiff := now.Sub(m.lastTimestamp).Seconds()
m.lastTimestamp = now
// Get the current values for all mountpoints
newdata := m.readNfsiostats()
for mntpoint, values := range newdata {
// Was the mount point already present in the last iteration
if old, ok := m.data[mntpoint]; ok {
for name, newVal := range values {
if m.config.SendAbsoluteValues {
msg, err := lp.NewMessage(fmt.Sprintf("nfsio_%s", name), m.tags, m.meta, map[string]interface{}{"value": newVal}, now)
if err == nil {
msg.AddTag("stype", "filesystem")
msg.AddTag("stype-id", mntpoint)
output <- msg
}
}
if m.config.SendDerivedValues {
rate := float64(newVal-old[name]) / timeDiff
msg, err := lp.NewMessage(fmt.Sprintf("nfsio_%s_bw", name), m.tags, m.meta, map[string]interface{}{"value": rate}, now)
if err == nil {
if strings.HasPrefix(name, "page") {
msg.AddMeta("unit", "4K_pages/s")
} else {
msg.AddMeta("unit", "bytes/sec")
}
msg.AddTag("stype", "filesystem")
msg.AddTag("stype-id", mntpoint)
output <- msg
}
}
old[name] = newVal
}
} else {
// First time we see this mount point, store all values
m.data[mntpoint] = values
}
}
// Reset entries that do not exist anymore
for mntpoint := range m.data {
found := false
for new := range newdata {
if new == mntpoint {
found = true
break
}
}
if !found {
delete(m.data, mntpoint)
}
}
}
func (m *NfsIOStatCollector) Close() {
// Unset flag
m.init = false
}

View File

@@ -1,45 +0,0 @@
<!--
---
title: NFS network filesystem metrics from procfs
description: Collect NFS network filesystem metrics for mounts from `/proc/self/mountstats`
categories: [cc-metric-collector]
tags: ['Admin']
weight: 2
hugo_path: docs/reference/cc-metric-collector/collectors/nfsio.md
---
-->
## `nfsiostat` collector
```json
"nfsiostat": {
"exclude_metrics": [
"oread", "pageread"
],
"exclude_filesystems": [
"/mnt"
],
"use_server_as_stype": false,
"send_abs_values": false,
"send_derived_values": true
}
```
The `nfsiostat` collector reads data from `/proc/self/mountstats` and outputs a handful **node** metrics for each NFS filesystem. If a metric or filesystem is not required, it can be excluded from forwarding it to the sink. **Note:** When excluding metrics, you must provide the base metric name (e.g. pageread) without the nfsio_ prefix. This exclusion applies to both absolute and derived values.
Metrics:
* `nfsio_nread`: Bytes transferred by normal `read()` calls
* `nfsio_nwrite`: Bytes transferred by normal `write()` calls
* `nfsio_oread`: Bytes transferred by `read()` calls with `O_DIRECT`
* `nfsio_owrite`: Bytes transferred by `write()` calls with `O_DIRECT`
* `nfsio_pageread`: Pages transferred by `read()` calls
* `nfsio_pagewrite`: Pages transferred by `write()` calls
* `nfsio_nfsread`: Bytes transferred for reading from the server
* `nfsio_nfswrite`: Pages transferred by writing to the server
For each of these, if derived values are enabled, an additional metric is sent with the `_bw` suffix, which represents the rate:
* For normal byte metrics: `unit=bytes/sec`
* For page metrics: `unit=4K_pages/s`
The `nfsiostat` collector adds the mountpoint to the tags as `stype=filesystem,stype-id=<mountpoint>`. If the server address should be used instead of the mountpoint, use the `use_server_as_stype` config setting.

View File

@@ -1,192 +0,0 @@
package collectors
import (
"bufio"
"encoding/json"
"fmt"
"os"
"path/filepath"
"strconv"
"strings"
"time"
cclog "github.com/ClusterCockpit/cc-lib/ccLogger"
lp "github.com/ClusterCockpit/cc-lib/ccMessage"
)
type NUMAStatsCollectorConfig struct {
SendAbsoluteValues bool `json:"send_abs_values"`
SendDerivedValues bool `json:"send_derived_values"`
}
// Non-Uniform Memory Access (NUMA) policy hit/miss statistics
//
// numa_hit:
//
// A process wanted to allocate memory from this node, and succeeded.
//
// numa_miss:
//
// A process wanted to allocate memory from another node,
// but ended up with memory from this node.
//
// numa_foreign:
//
// A process wanted to allocate on this node,
// but ended up with memory from another node.
//
// local_node:
//
// A process ran on this node's CPU,
// and got memory from this node.
//
// other_node:
//
// A process ran on a different node's CPU
// and got memory from this node.
//
// interleave_hit:
//
// Interleaving wanted to allocate from this node
// and succeeded.
//
// See: https://www.kernel.org/doc/html/latest/admin-guide/numastat.html
type NUMAStatsCollectorTopolgy struct {
file string
tagSet map[string]string
previousValues map[string]int64
}
type NUMAStatsCollector struct {
metricCollector
topology []NUMAStatsCollectorTopolgy
config NUMAStatsCollectorConfig
lastTimestamp time.Time
}
func (m *NUMAStatsCollector) Init(config json.RawMessage) error {
// Check if already initialized
if m.init {
return nil
}
m.name = "NUMAStatsCollector"
m.parallel = true
m.setup()
m.meta = map[string]string{
"source": m.name,
"group": "NUMA",
}
m.config.SendAbsoluteValues = true
if len(config) > 0 {
err := json.Unmarshal(config, &m.config)
if err != nil {
return fmt.Errorf("unable to unmarshal numastat configuration: %s", err.Error())
}
}
// Loop for all NUMA node directories
base := "/sys/devices/system/node/node"
globPattern := base + "[0-9]*"
dirs, err := filepath.Glob(globPattern)
if err != nil {
return fmt.Errorf("unable to glob files with pattern '%s'", globPattern)
}
if dirs == nil {
return fmt.Errorf("unable to find any files with pattern '%s'", globPattern)
}
m.topology = make([]NUMAStatsCollectorTopolgy, 0, len(dirs))
for _, dir := range dirs {
node := strings.TrimPrefix(dir, base)
file := filepath.Join(dir, "numastat")
m.topology = append(m.topology,
NUMAStatsCollectorTopolgy{
file: file,
tagSet: map[string]string{"memoryDomain": node},
previousValues: make(map[string]int64),
})
}
// Initialized
cclog.ComponentDebug(m.name, "initialized", len(m.topology), "NUMA domains")
m.init = true
return nil
}
func (m *NUMAStatsCollector) Read(interval time.Duration, output chan lp.CCMessage) {
if !m.init {
return
}
now := time.Now()
timeDiff := now.Sub(m.lastTimestamp).Seconds()
m.lastTimestamp = now
for i := range m.topology {
// Loop for all NUMA domains
t := &m.topology[i]
file, err := os.Open(t.file)
if err != nil {
cclog.ComponentError(
m.name,
fmt.Sprintf("Read(): Failed to open file '%s': %v", t.file, err))
continue
}
scanner := bufio.NewScanner(file)
// Read line by line
for scanner.Scan() {
line := scanner.Text()
split := strings.Fields(line)
if len(split) != 2 {
continue
}
key := split[0]
value, err := strconv.ParseInt(split[1], 10, 64)
if err != nil {
cclog.ComponentError(
m.name,
fmt.Sprintf("Read(): Failed to convert %s='%s' to int64: %v", key, split[1], err))
continue
}
if m.config.SendAbsoluteValues {
msg, err := lp.NewMetric(
"numastats_"+key,
t.tagSet,
m.meta,
value,
now,
)
if err == nil {
output <- msg
}
}
if m.config.SendDerivedValues {
prev, ok := t.previousValues[key]
if ok {
rate := float64(value-prev) / timeDiff
msg, err := lp.NewMetric(
"numastats_"+key+"_rate",
t.tagSet,
m.meta,
rate,
now,
)
if err == nil {
output <- msg
}
}
t.previousValues[key] = value
}
}
file.Close()
}
}
func (m *NUMAStatsCollector) Close() {
m.init = false
}

View File

@@ -1,36 +0,0 @@
<!--
---
title: NUMAStat collector
description: Collect infos about NUMA domains
categories: [cc-metric-collector]
tags: ['Admin']
weight: 2
hugo_path: docs/reference/cc-metric-collector/collectors/numastat.md
---
-->
## `numastat` collector
```json
"numastats": {
"send_abs_values" : true,
"send_derived_values" : true
}
```
The `numastat` collector reads data from `/sys/devices/system/node/node*/numastat` and outputs a handful **memoryDomain** metrics. See: <https://www.kernel.org/doc/html/latest/admin-guide/numastat.html>
Metrics:
* `numastats_numa_hit`: A process wanted to allocate memory from this node, and succeeded.
* `numastats_numa_miss`: A process wanted to allocate memory from another node, but ended up with memory from this node.
* `numastats_numa_foreign`: A process wanted to allocate on this node, but ended up with memory from another node.
* `numastats_local_node`: A process ran on this node's CPU, and got memory from this node.
* `numastats_other_node`: A process ran on a different node's CPU, and got memory from this node.
* `numastats_interleave_hit`: Interleaving wanted to allocate from this node and succeeded.
* `numastats_numa_hit_rate` (if `send_derived_values == true`): Derived rate value per second.
* `numastats_numa_miss_rate` (if `send_derived_values == true`): Derived rate value per second.
* `numastats_numa_foreign_rate` (if `send_derived_values == true`): Derived rate value per second.
* `numastats_local_node_rate` (if `send_derived_values == true`): Derived rate value per second.
* `numastats_other_node_rate` (if `send_derived_values == true`): Derived rate value per second.
* `numastats_interleave_hit_rate` (if `send_derived_values == true`): Derived rate value per second.

File diff suppressed because it is too large Load Diff

View File

@@ -1,89 +1,40 @@
<!--
---
title: "Nvidia NVML metric collector"
description: Collect metrics for Nvidia GPUs using the NVML
categories: [cc-metric-collector]
tags: ['Admin']
weight: 2
hugo_path: docs/reference/cc-metric-collector/collectors/nvidia.md
---
-->
## `nvidia` collector
```json
"nvidia": {
"exclude_devices": [
"0","1", "0000000:ff:01.0"
"exclude_devices" : [
"0","1"
],
"exclude_metrics": [
"nv_fb_mem_used",
"nv_fb_memory",
"nv_fan"
],
"process_mig_devices": false,
"use_pci_info_as_type_id": true,
"add_pci_info_tag": false,
"add_uuid_meta": false,
"add_board_number_meta": false,
"add_serial_meta": false,
"use_uuid_for_mig_device": false,
"use_slice_for_mig_device": false
]
}
```
The `nvidia` collector can be configured to leave out specific devices with the `exclude_devices` option. It takes IDs as supplied to the NVML with `nvmlDeviceGetHandleByIndex()` or the PCI address in NVML format (`%08X:%02X:%02X.0`). Metrics (listed below) that should not be sent to the MetricRouter can be excluded with the `exclude_metrics` option. Commonly only the physical GPUs are monitored. If MIG devices should be analyzed as well, set `process_mig_devices` (adds `stype=mig,stype-id=<mig_index>`). With the options `use_uuid_for_mig_device` and `use_slice_for_mig_device`, the `<mig_index>` can be replaced with the UUID (e.g. `MIG-6a9f7cc8-6d5b-5ce0-92de-750edc4d8849`) or the MIG slice name (e.g. `1g.5gb`).
The metrics sent by the `nvidia` collector use `accelerator` as `type` tag. For the `type-id`, it uses the device handle index by default. With the `use_pci_info_as_type_id` option, the PCI ID is used instead. If both values should be added as tags, activate the `add_pci_info_tag` option. It uses the device handle index as `type-id` and adds the PCI ID as separate `pci_identifier` tag.
Optionally, it is possible to add the UUID, the board part number and the serial to the meta informations. They are not sent to the sinks (if not configured otherwise).
Metrics:
* `nv_util`
* `nv_mem_util`
* `nv_fb_mem_total`
* `nv_fb_mem_used`
* `nv_bar1_mem_total`
* `nv_bar1_mem_used`
* `nv_mem_total`
* `nv_fb_memory`
* `nv_temp`
* `nv_fan`
* `nv_ecc_mode`
* `nv_perf_state`
* `nv_power_usage`
* `nv_graphics_clock`
* `nv_sm_clock`
* `nv_mem_clock`
* `nv_video_clock`
* `nv_power_usage_report`
* `nv_graphics_clock_report`
* `nv_sm_clock_report`
* `nv_mem_clock_report`
* `nv_max_graphics_clock`
* `nv_max_sm_clock`
* `nv_max_mem_clock`
* `nv_max_video_clock`
* `nv_ecc_uncorrected_error`
* `nv_ecc_corrected_error`
* `nv_power_max_limit`
* `nv_ecc_db_error`
* `nv_ecc_sb_error`
* `nv_power_man_limit`
* `nv_encoder_util`
* `nv_decoder_util`
* `nv_remapped_rows_corrected`
* `nv_remapped_rows_uncorrected`
* `nv_remapped_rows_pending`
* `nv_remapped_rows_failure`
* `nv_compute_processes`
* `nv_graphics_processes`
* `nv_violation_power`
* `nv_violation_thermal`
* `nv_violation_sync_boost`
* `nv_violation_board_limit`
* `nv_violation_low_util`
* `nv_violation_reliability`
* `nv_violation_below_app_clock`
* `nv_violation_below_base_clock`
* `nv_nvlink_crc_flit_errors`
* `nv_nvlink_crc_errors`
* `nv_nvlink_ecc_errors`
* `nv_nvlink_replay_errors`
* `nv_nvlink_recovery_errors`
* `nv_energy`
* `nv_energy_abs`
* `nv_average_power`
Some metrics add the additional sub type tag (`stype`) like the `nv_nvlink_*` metrics set `stype=nvlink,stype-id=<link_number>`.
It uses a separate `type` in the metrics. The output metric looks like this:
`<name>,type=accelerator,type-id=<nvidia-gpu-id> value=<metric value> <timestamp>`

View File

@@ -1,269 +0,0 @@
// Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
// All rights reserved. This file is part of cc-lib.
// Use of this source code is governed by a MIT-style
// license that can be found in the LICENSE file.
// additional authors:
// Holger Obermaier (NHR@KIT)
package collectors
import (
"encoding/json"
"fmt"
"os"
"path/filepath"
"strconv"
"strings"
"time"
cclog "github.com/ClusterCockpit/cc-lib/ccLogger"
lp "github.com/ClusterCockpit/cc-lib/ccMessage"
)
// running average power limit (RAPL) monitoring attributes for a zone
type RAPLZoneInfo struct {
// tags describing the RAPL zone:
// * zone_name, subzone_name: e.g. psys, dram, core, uncore, package-0
// * zone_id: e.g. 0:1 (zone 0 sub zone 1)
tags map[string]string
energyFilepath string // path to a file containing the zones current energy counter in micro joules
energy int64 // current reading of the energy counter in micro joules
energyTimestamp time.Time // timestamp when energy counter was read
maxEnergyRange int64 // Range of the above energy counter in micro-joules
}
type RAPLCollector struct {
metricCollector
config struct {
// Exclude IDs for RAPL zones, e.g.
// * 0 for zone 0
// * 0:1 for zone 0 subzone 1
ExcludeByID []string `json:"exclude_device_by_id,omitempty"`
// Exclude names for RAPL zones, e.g. psys, dram, core, uncore, package-0
ExcludeByName []string `json:"exclude_device_by_name,omitempty"`
}
RAPLZoneInfo []RAPLZoneInfo
meta map[string]string // default meta information
}
// Init initializes the running average power limit (RAPL) collector
func (m *RAPLCollector) Init(config json.RawMessage) error {
// Check if already initialized
if m.init {
return nil
}
var err error = nil
m.name = "RAPLCollector"
m.setup()
m.parallel = true
m.meta = map[string]string{
"source": m.name,
"group": "energy",
"unit": "Watt",
}
// Read in the JSON configuration
if len(config) > 0 {
err = json.Unmarshal(config, &m.config)
if err != nil {
cclog.ComponentError(m.name, "Error reading config:", err.Error())
return err
}
}
// Configure excluded RAPL zones
isIDExcluded := make(map[string]bool)
if m.config.ExcludeByID != nil {
for _, ID := range m.config.ExcludeByID {
isIDExcluded[ID] = true
}
}
isNameExcluded := make(map[string]bool)
if m.config.ExcludeByName != nil {
for _, name := range m.config.ExcludeByName {
isNameExcluded[name] = true
}
}
// readZoneInfo reads RAPL monitoring attributes for a zone given by zonePath
// See: https://www.kernel.org/doc/html/latest/power/powercap/powercap.html#monitoring-attributes
readZoneInfo := func(zonePath string) (z struct {
name string // zones name e.g. psys, dram, core, uncore, package-0
energyFilepath string // path to a file containing the zones current energy counter in micro joules
energy int64 // current reading of the energy counter in micro joules
energyTimestamp time.Time // timestamp when energy counter was read
maxEnergyRange int64 // Range of the above energy counter in micro-joules
ok bool // Are all information available?
}) {
// zones name e.g. psys, dram, core, uncore, package-0
foundName := false
if v, err :=
os.ReadFile(
filepath.Join(zonePath, "name")); err == nil {
foundName = true
z.name = strings.TrimSpace(string(v))
}
// path to a file containing the zones current energy counter in micro joules
z.energyFilepath = filepath.Join(zonePath, "energy_uj")
// current reading of the energy counter in micro joules
foundEnergy := false
if v, err := os.ReadFile(z.energyFilepath); err == nil {
// timestamp when energy counter was read
z.energyTimestamp = time.Now()
if i, err := strconv.ParseInt(strings.TrimSpace(string(v)), 10, 64); err == nil {
foundEnergy = true
z.energy = i
}
}
// Range of the above energy counter in micro-joules
foundMaxEnergyRange := false
if v, err :=
os.ReadFile(
filepath.Join(zonePath, "max_energy_range_uj")); err == nil {
if i, err := strconv.ParseInt(strings.TrimSpace(string(v)), 10, 64); err == nil {
foundMaxEnergyRange = true
z.maxEnergyRange = i
}
}
// Are all information available?
z.ok = foundName && foundEnergy && foundMaxEnergyRange
return
}
powerCapPrefix := "/sys/devices/virtual/powercap"
controlType := "intel-rapl"
controlTypePath := filepath.Join(powerCapPrefix, controlType)
// Find all RAPL zones
zonePrefix := filepath.Join(controlTypePath, controlType+":")
zonesPath, err := filepath.Glob(zonePrefix + "*")
if err != nil || zonesPath == nil {
return fmt.Errorf("unable to find any zones under %s", controlTypePath)
}
for _, zonePath := range zonesPath {
zoneID := strings.TrimPrefix(zonePath, zonePrefix)
z := readZoneInfo(zonePath)
if z.ok &&
!isIDExcluded[zoneID] &&
!isNameExcluded[z.name] {
// Add RAPL monitoring attributes for a zone
m.RAPLZoneInfo =
append(
m.RAPLZoneInfo,
RAPLZoneInfo{
tags: map[string]string{
"id": zoneID,
"zone_name": z.name,
},
energyFilepath: z.energyFilepath,
energy: z.energy,
energyTimestamp: z.energyTimestamp,
maxEnergyRange: z.maxEnergyRange,
})
}
// find all sub zones for the given zone
subZonePrefix := filepath.Join(zonePath, controlType+":"+zoneID+":")
subZonesPath, err := filepath.Glob(subZonePrefix + "*")
if err != nil || subZonesPath == nil {
continue
}
for _, subZonePath := range subZonesPath {
subZoneID := strings.TrimPrefix(subZonePath, subZonePrefix)
sz := readZoneInfo(subZonePath)
if len(zoneID) > 0 && len(z.name) > 0 &&
sz.ok &&
!isIDExcluded[zoneID+":"+subZoneID] &&
!isNameExcluded[sz.name] {
m.RAPLZoneInfo =
append(
m.RAPLZoneInfo,
RAPLZoneInfo{
tags: map[string]string{
"id": zoneID + ":" + subZoneID,
"zone_name": z.name,
"sub_zone_name": sz.name,
},
energyFilepath: sz.energyFilepath,
energy: sz.energy,
energyTimestamp: sz.energyTimestamp,
maxEnergyRange: sz.maxEnergyRange,
})
}
}
}
if m.RAPLZoneInfo == nil {
return fmt.Errorf("no running average power limit (RAPL) device found in %s", controlTypePath)
}
// Initialized
cclog.ComponentDebug(
m.name,
"initialized",
len(m.RAPLZoneInfo),
"zones with running average power limit (RAPL) monitoring attributes")
m.init = true
return err
}
// Read reads running average power limit (RAPL) monitoring attributes for all initialized zones
// See: https://www.kernel.org/doc/html/latest/power/powercap/powercap.html#monitoring-attributes
func (m *RAPLCollector) Read(interval time.Duration, output chan lp.CCMessage) {
for i := range m.RAPLZoneInfo {
p := &m.RAPLZoneInfo[i]
// Read current value of the energy counter in micro joules
if v, err := os.ReadFile(p.energyFilepath); err == nil {
energyTimestamp := time.Now()
if i, err := strconv.ParseInt(strings.TrimSpace(string(v)), 10, 64); err == nil {
energy := i
// Compute average power (Δ energy / Δ time)
energyDiff := energy - p.energy
if energyDiff < 0 {
// Handle overflow:
// ( p.maxEnergyRange - p.energy ) + energy
// = p.maxEnergyRange + ( energy - p.energy )
// = p.maxEnergyRange + diffEnergy
energyDiff += p.maxEnergyRange
}
timeDiff := energyTimestamp.Sub(p.energyTimestamp)
averagePower := float64(energyDiff) / float64(timeDiff.Microseconds())
y, err := lp.NewMessage(
"rapl_average_power",
p.tags,
m.meta,
map[string]interface{}{"value": averagePower},
energyTimestamp)
if err == nil {
output <- y
}
// Save current energy counter state
p.energy = energy
p.energyTimestamp = energyTimestamp
}
}
}
}
// Close closes running average power limit (RAPL) metric collector
func (m *RAPLCollector) Close() {
// Unset flag
m.init = false
}

View File

@@ -1,26 +0,0 @@
<!--
---
title: RAPL metric collector
description: Collect energy data through the RAPL sysfs interface
categories: [cc-metric-collector]
tags: ['Admin']
weight: 2
hugo_path: docs/reference/cc-metric-collector/collectors/rapl.md
---
-->
## `rapl` collector
This collector reads running average power limit (RAPL) monitoring attributes to compute average power consumption metrics. See <https://www.kernel.org/doc/html/latest/power/powercap/powercap.html#monitoring-attributes>.
The Likwid metric collector provides similar functionality.
```json
"rapl": {
"exclude_device_by_id": ["0:1", "0:2"],
"exclude_device_by_name": ["psys"]
}
```
Metrics:
* `rapl_average_power`: average power consumption in Watt. The average is computed over the entire runtime from the last measurement to the current measurement

View File

@@ -1,326 +0,0 @@
// Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
// All rights reserved. This file is part of cc-lib.
// Use of this source code is governed by a MIT-style
// license that can be found in the LICENSE file.
// additional authors:
// Holger Obermaier (NHR@KIT)
package collectors
import (
"encoding/json"
"errors"
"fmt"
"time"
cclog "github.com/ClusterCockpit/cc-lib/ccLogger"
lp "github.com/ClusterCockpit/cc-lib/ccMessage"
"github.com/ClusterCockpit/go-rocm-smi/pkg/rocm_smi"
)
type RocmSmiCollectorConfig struct {
ExcludeMetrics []string `json:"exclude_metrics,omitempty"`
ExcludeDevices []string `json:"exclude_devices,omitempty"`
AddPciInfoTag bool `json:"add_pci_info_tag,omitempty"`
UsePciInfoAsTypeId bool `json:"use_pci_info_as_type_id,omitempty"`
AddSerialMeta bool `json:"add_serial_meta,omitempty"`
}
type RocmSmiCollectorDevice struct {
device rocm_smi.DeviceHandle
index int
tags map[string]string // default tags
meta map[string]string // default meta information
excludeMetrics map[string]bool // copy of exclude metrics from config
}
type RocmSmiCollector struct {
metricCollector
config RocmSmiCollectorConfig // the configuration structure
devices []RocmSmiCollectorDevice
}
// Functions to implement MetricCollector interface
// Init(...), Read(...), Close()
// See: metricCollector.go
// Init initializes the sample collector
// Called once by the collector manager
// All tags, meta data tags and metrics that do not change over the runtime should be set here
func (m *RocmSmiCollector) Init(config json.RawMessage) error {
var err error = nil
// Always set the name early in Init() to use it in cclog.Component* functions
m.name = "RocmSmiCollector"
// This is for later use, also call it early
m.setup()
// Define meta information sent with each metric
// (Can also be dynamic or this is the basic set with extension through AddMeta())
//m.meta = map[string]string{"source": m.name, "group": "AMD"}
// Define tags sent with each metric
// The 'type' tag is always needed, it defines the granulatity of the metric
// node -> whole system
// socket -> CPU socket (requires socket ID as 'type-id' tag)
// cpu -> single CPU hardware thread (requires cpu ID as 'type-id' tag)
//m.tags = map[string]string{"type": "node"}
// Read in the JSON configuration
if len(config) > 0 {
err = json.Unmarshal(config, &m.config)
if err != nil {
cclog.ComponentError(m.name, "Error reading config:", err.Error())
return err
}
}
ret := rocm_smi.Init()
if ret != rocm_smi.STATUS_SUCCESS {
err = errors.New("failed to initialize ROCm SMI library")
cclog.ComponentError(m.name, err.Error())
return err
}
numDevs, ret := rocm_smi.NumMonitorDevices()
if ret != rocm_smi.STATUS_SUCCESS {
err = errors.New("failed to get number of GPUs from ROCm SMI library")
cclog.ComponentError(m.name, err.Error())
return err
}
exclDev := func(s string) bool {
skip_device := false
for _, excl := range m.config.ExcludeDevices {
if excl == s {
skip_device = true
break
}
}
return skip_device
}
m.devices = make([]RocmSmiCollectorDevice, 0)
for i := 0; i < numDevs; i++ {
str_i := fmt.Sprintf("%d", i)
if exclDev(str_i) {
continue
}
device, ret := rocm_smi.DeviceGetHandleByIndex(i)
if ret != rocm_smi.STATUS_SUCCESS {
err = fmt.Errorf("failed to get handle for GPU %d", i)
cclog.ComponentError(m.name, err.Error())
return err
}
pciInfo, ret := rocm_smi.DeviceGetPciInfo(device)
if ret != rocm_smi.STATUS_SUCCESS {
err = fmt.Errorf("failed to get PCI information for GPU %d", i)
cclog.ComponentError(m.name, err.Error())
return err
}
pciId := fmt.Sprintf(
"%08X:%02X:%02X.%X",
pciInfo.Domain,
pciInfo.Bus,
pciInfo.Device,
pciInfo.Function)
if exclDev(pciId) {
continue
}
dev := RocmSmiCollectorDevice{
device: device,
tags: map[string]string{
"type": "accelerator",
"type-id": str_i,
},
meta: map[string]string{
"source": m.name,
"group": "AMD",
},
}
if m.config.UsePciInfoAsTypeId {
dev.tags["type-id"] = pciId
} else if m.config.AddPciInfoTag {
dev.tags["pci_identifier"] = pciId
}
if m.config.AddSerialMeta {
serial, ret := rocm_smi.DeviceGetSerialNumber(device)
if ret != rocm_smi.STATUS_SUCCESS {
cclog.ComponentError(m.name, "Unable to get serial number for device at index", i, ":", rocm_smi.StatusStringNoError(ret))
} else {
dev.meta["serial"] = serial
}
}
// Add excluded metrics
dev.excludeMetrics = map[string]bool{}
for _, e := range m.config.ExcludeMetrics {
dev.excludeMetrics[e] = true
}
dev.index = i
m.devices = append(m.devices, dev)
}
// Set this flag only if everything is initialized properly, all required files exist, ...
m.init = true
return err
}
// Read collects all metrics belonging to the sample collector
// and sends them through the output channel to the collector manager
func (m *RocmSmiCollector) Read(interval time.Duration, output chan lp.CCMessage) {
// Create a sample metric
timestamp := time.Now()
for _, dev := range m.devices {
metrics, ret := rocm_smi.DeviceGetMetrics(dev.device)
if ret != rocm_smi.STATUS_SUCCESS {
cclog.ComponentError(m.name, "Unable to get metrics for device at index", dev.index, ":", rocm_smi.StatusStringNoError(ret))
continue
}
if !dev.excludeMetrics["rocm_gfx_util"] {
value := metrics.Average_gfx_activity
y, err := lp.NewMessage("rocm_gfx_util", dev.tags, dev.meta, map[string]interface{}{"value": value}, timestamp)
if err == nil {
output <- y
}
}
if !dev.excludeMetrics["rocm_umc_util"] {
value := metrics.Average_umc_activity
y, err := lp.NewMessage("rocm_umc_util", dev.tags, dev.meta, map[string]interface{}{"value": value}, timestamp)
if err == nil {
output <- y
}
}
if !dev.excludeMetrics["rocm_mm_util"] {
value := metrics.Average_mm_activity
y, err := lp.NewMessage("rocm_mm_util", dev.tags, dev.meta, map[string]interface{}{"value": value}, timestamp)
if err == nil {
output <- y
}
}
if !dev.excludeMetrics["rocm_avg_power"] {
value := metrics.Average_socket_power
y, err := lp.NewMessage("rocm_avg_power", dev.tags, dev.meta, map[string]interface{}{"value": value}, timestamp)
if err == nil {
output <- y
}
}
if !dev.excludeMetrics["rocm_temp_mem"] {
value := metrics.Temperature_mem
y, err := lp.NewMessage("rocm_temp_mem", dev.tags, dev.meta, map[string]interface{}{"value": value}, timestamp)
if err == nil {
output <- y
}
}
if !dev.excludeMetrics["rocm_temp_hotspot"] {
value := metrics.Temperature_hotspot
y, err := lp.NewMessage("rocm_temp_hotspot", dev.tags, dev.meta, map[string]interface{}{"value": value}, timestamp)
if err == nil {
output <- y
}
}
if !dev.excludeMetrics["rocm_temp_edge"] {
value := metrics.Temperature_edge
y, err := lp.NewMessage("rocm_temp_edge", dev.tags, dev.meta, map[string]interface{}{"value": value}, timestamp)
if err == nil {
output <- y
}
}
if !dev.excludeMetrics["rocm_temp_vrgfx"] {
value := metrics.Temperature_vrgfx
y, err := lp.NewMessage("rocm_temp_vrgfx", dev.tags, dev.meta, map[string]interface{}{"value": value}, timestamp)
if err == nil {
output <- y
}
}
if !dev.excludeMetrics["rocm_temp_vrsoc"] {
value := metrics.Temperature_vrsoc
y, err := lp.NewMessage("rocm_temp_vrsoc", dev.tags, dev.meta, map[string]interface{}{"value": value}, timestamp)
if err == nil {
output <- y
}
}
if !dev.excludeMetrics["rocm_temp_vrmem"] {
value := metrics.Temperature_vrmem
y, err := lp.NewMessage("rocm_temp_vrmem", dev.tags, dev.meta, map[string]interface{}{"value": value}, timestamp)
if err == nil {
output <- y
}
}
if !dev.excludeMetrics["rocm_gfx_clock"] {
value := metrics.Average_gfxclk_frequency
y, err := lp.NewMessage("rocm_gfx_clock", dev.tags, dev.meta, map[string]interface{}{"value": value}, timestamp)
if err == nil {
output <- y
}
}
if !dev.excludeMetrics["rocm_soc_clock"] {
value := metrics.Average_socclk_frequency
y, err := lp.NewMessage("rocm_soc_clock", dev.tags, dev.meta, map[string]interface{}{"value": value}, timestamp)
if err == nil {
output <- y
}
}
if !dev.excludeMetrics["rocm_u_clock"] {
value := metrics.Average_uclk_frequency
y, err := lp.NewMessage("rocm_u_clock", dev.tags, dev.meta, map[string]interface{}{"value": value}, timestamp)
if err == nil {
output <- y
}
}
if !dev.excludeMetrics["rocm_v0_clock"] {
value := metrics.Average_vclk0_frequency
y, err := lp.NewMessage("rocm_v0_clock", dev.tags, dev.meta, map[string]interface{}{"value": value}, timestamp)
if err == nil {
output <- y
}
}
if !dev.excludeMetrics["rocm_v1_clock"] {
value := metrics.Average_vclk1_frequency
y, err := lp.NewMessage("rocm_v1_clock", dev.tags, dev.meta, map[string]interface{}{"value": value}, timestamp)
if err == nil {
output <- y
}
}
if !dev.excludeMetrics["rocm_d0_clock"] {
value := metrics.Average_dclk0_frequency
y, err := lp.NewMessage("rocm_d0_clock", dev.tags, dev.meta, map[string]interface{}{"value": value}, timestamp)
if err == nil {
output <- y
}
}
if !dev.excludeMetrics["rocm_d1_clock"] {
value := metrics.Average_dclk1_frequency
y, err := lp.NewMessage("rocm_d1_clock", dev.tags, dev.meta, map[string]interface{}{"value": value}, timestamp)
if err == nil {
output <- y
}
}
if !dev.excludeMetrics["rocm_temp_hbm"] {
for i := 0; i < rocm_smi.NUM_HBM_INSTANCES; i++ {
value := metrics.Temperature_hbm[i]
y, err := lp.NewMessage("rocm_temp_hbm", dev.tags, dev.meta, map[string]interface{}{"value": value}, timestamp)
if err == nil {
y.AddTag("stype", "device")
y.AddTag("stype-id", fmt.Sprintf("%d", i))
output <- y
}
}
}
}
}
// Close metric collector: close network connection, close files, close libraries, ...
// Called once by the collector manager
func (m *RocmSmiCollector) Close() {
// Unset flag
ret := rocm_smi.Shutdown()
if ret != rocm_smi.STATUS_SUCCESS {
cclog.ComponentError(m.name, "Failed to shutdown ROCm SMI library")
}
m.init = false
}

View File

@@ -1,58 +0,0 @@
<!--
---
title: "ROCm SMI metric collector"
description: Collect metrics for AMD GPUs using the SMI library
categories: [cc-metric-collector]
tags: ['Admin']
weight: 2
hugo_path: docs/reference/cc-metric-collector/collectors/rocmsmi.md
---
-->
## `rocm_smi` collector
```json
"rocm_smi": {
"exclude_devices": [
"0","1", "0000000:ff:01.0"
],
"exclude_metrics": [
"rocm_mm_util",
"rocm_temp_vrsoc"
],
"use_pci_info_as_type_id": true,
"add_pci_info_tag": false,
"add_serial_meta": false,
}
```
The `rocm_smi` collector can be configured to leave out specific devices with the `exclude_devices` option. It takes logical IDs in the list of available devices or the PCI address similar to NVML format (`%08X:%02X:%02X.0`). Metrics (listed below) that should not be sent to the MetricRouter can be excluded with the `exclude_metrics` option.
The metrics sent by the `rocm_smi` collector use `accelerator` as `type` tag. For the `type-id`, it uses the device handle index by default. With the `use_pci_info_as_type_id` option, the PCI ID is used instead. If both values should be added as tags, activate the `add_pci_info_tag` option. It uses the device handle index as `type-id` and adds the PCI ID as separate `pci_identifier` tag.
Optionally, it is possible to add the serial to the meta informations. They are not sent to the sinks (if not configured otherwise).
Metrics:
* `rocm_gfx_util`
* `rocm_umc_util`
* `rocm_mm_util`
* `rocm_avg_power`
* `rocm_temp_mem`
* `rocm_temp_hotspot`
* `rocm_temp_edge`
* `rocm_temp_vrgfx`
* `rocm_temp_vrsoc`
* `rocm_temp_vrmem`
* `rocm_gfx_clock`
* `rocm_soc_clock`
* `rocm_u_clock`
* `rocm_v0_clock`
* `rocm_v1_clock`
* `rocm_d0_clock`
* `rocm_d1_clock`
* `rocm_temp_hbm`
Some metrics add the additional sub type tag (`stype`) like the `rocm_temp_hbm` metrics set `stype=device,stype-id=<HBM_slice_number>`.

View File

@@ -1,108 +0,0 @@
// Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
// All rights reserved. This file is part of cc-lib.
// Use of this source code is governed by a MIT-style
// license that can be found in the LICENSE file.
// additional authors:
// Holger Obermaier (NHR@KIT)
package collectors
import (
"encoding/json"
"time"
cclog "github.com/ClusterCockpit/cc-lib/ccLogger"
lp "github.com/ClusterCockpit/cc-lib/ccMessage"
)
// These are the fields we read from the JSON configuration
type SampleCollectorConfig struct {
Interval string `json:"interval"`
}
// This contains all variables we need during execution and the variables
// defined by metricCollector (name, init, ...)
type SampleCollector struct {
metricCollector
config SampleCollectorConfig // the configuration structure
meta map[string]string // default meta information
tags map[string]string // default tags
}
// Functions to implement MetricCollector interface
// Init(...), Read(...), Close()
// See: metricCollector.go
// Init initializes the sample collector
// Called once by the collector manager
// All tags, meta data tags and metrics that do not change over the runtime should be set here
func (m *SampleCollector) Init(config json.RawMessage) error {
var err error = nil
// Always set the name early in Init() to use it in cclog.Component* functions
m.name = "SampleCollector"
// This is for later use, also call it early
m.setup()
// Tell whether the collector should be run in parallel with others (reading files, ...)
// or it should be run serially, mostly for collectors actually doing measurements
// because they should not measure the execution of the other collectors
m.parallel = true
// Define meta information sent with each metric
// (Can also be dynamic or this is the basic set with extension through AddMeta())
m.meta = map[string]string{"source": m.name, "group": "SAMPLE"}
// Define tags sent with each metric
// The 'type' tag is always needed, it defines the granularity of the metric
// node -> whole system
// socket -> CPU socket (requires socket ID as 'type-id' tag)
// die -> CPU die (requires CPU die ID as 'type-id' tag)
// memoryDomain -> NUMA domain (requires NUMA domain ID as 'type-id' tag)
// llc -> Last level cache (requires last level cache ID as 'type-id' tag)
// core -> single CPU core that may consist of multiple hardware threads (SMT) (requires core ID as 'type-id' tag)
// hwthtread -> single CPU hardware thread (requires hardware thread ID as 'type-id' tag)
// accelerator -> A accelerator device like GPU or FPGA (requires an accelerator ID as 'type-id' tag)
m.tags = map[string]string{"type": "node"}
// Read in the JSON configuration
if len(config) > 0 {
err = json.Unmarshal(config, &m.config)
if err != nil {
cclog.ComponentError(m.name, "Error reading config:", err.Error())
return err
}
}
// Set up everything that the collector requires during the Read() execution
// Check files required, test execution of some commands, create data structure
// for all topological entities (sockets, NUMA domains, ...)
// Return some useful error message in case of any failures
// Set this flag only if everything is initialized properly, all required files exist, ...
m.init = true
return err
}
// Read collects all metrics belonging to the sample collector
// and sends them through the output channel to the collector manager
func (m *SampleCollector) Read(interval time.Duration, output chan lp.CCMessage) {
// Create a sample metric
timestamp := time.Now()
value := 1.0
// If you want to measure something for a specific amount of time, use interval
// start := readState()
// time.Sleep(interval)
// stop := readState()
// value = (stop - start) / interval.Seconds()
y, err := lp.NewMessage("sample_metric", m.tags, m.meta, map[string]interface{}{"value": value}, timestamp)
if err == nil {
// Send it to output channel
output <- y
}
}
// Close metric collector: close network connection, close files, close libraries, ...
// Called once by the collector manager
func (m *SampleCollector) Close() {
// Unset flag
m.init = false
}

View File

@@ -1,129 +0,0 @@
// Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
// All rights reserved. This file is part of cc-lib.
// Use of this source code is governed by a MIT-style
// license that can be found in the LICENSE file.
// additional authors:
// Holger Obermaier (NHR@KIT)
package collectors
import (
"encoding/json"
"sync"
"time"
cclog "github.com/ClusterCockpit/cc-lib/ccLogger"
lp "github.com/ClusterCockpit/cc-lib/ccMessage"
)
// These are the fields we read from the JSON configuration
type SampleTimerCollectorConfig struct {
Interval string `json:"interval"`
}
// This contains all variables we need during execution and the variables
// defined by metricCollector (name, init, ...)
type SampleTimerCollector struct {
metricCollector
wg sync.WaitGroup // sync group for management
done chan bool // channel for management
meta map[string]string // default meta information
tags map[string]string // default tags
config SampleTimerCollectorConfig // the configuration structure
interval time.Duration // the interval parsed from configuration
ticker *time.Ticker // own timer
output chan lp.CCMessage // own internal output channel
}
func (m *SampleTimerCollector) Init(name string, config json.RawMessage) error {
var err error = nil
// Always set the name early in Init() to use it in cclog.Component* functions
m.name = "SampleTimerCollector"
// This is for later use, also call it early
m.setup()
// Define meta information sent with each metric
// (Can also be dynamic or this is the basic set with extension through AddMeta())
m.meta = map[string]string{"source": m.name, "group": "SAMPLE"}
// Define tags sent with each metric
// The 'type' tag is always needed, it defines the granularity of the metric
// node -> whole system
// socket -> CPU socket (requires socket ID as 'type-id' tag)
// cpu -> single CPU hardware thread (requires cpu ID as 'type-id' tag)
m.tags = map[string]string{"type": "node"}
// Read in the JSON configuration
if len(config) > 0 {
err = json.Unmarshal(config, &m.config)
if err != nil {
cclog.ComponentError(m.name, "Error reading config:", err.Error())
return err
}
}
// Parse the read interval duration
m.interval, err = time.ParseDuration(m.config.Interval)
if err != nil {
cclog.ComponentError(m.name, "Error parsing interval:", err.Error())
return err
}
// Storage for output channel
m.output = nil
// Management channel for the timer function.
m.done = make(chan bool)
// Create the own ticker
m.ticker = time.NewTicker(m.interval)
// Start the timer loop with return functionality by sending 'true' to the done channel
m.wg.Add(1)
go func() {
select {
case <-m.done:
// Exit the timer loop
cclog.ComponentDebug(m.name, "Closing...")
m.wg.Done()
return
case timestamp := <-m.ticker.C:
// This is executed every timer tick but we have to wait until the first
// Read() to get the output channel
if m.output != nil {
m.ReadMetrics(timestamp)
}
}
}()
// Set this flag only if everything is initialized properly, all required files exist, ...
m.init = true
return err
}
// This function is called at each interval timer tick
func (m *SampleTimerCollector) ReadMetrics(timestamp time.Time) {
// Create a sample metric
value := 1.0
// If you want to measure something for a specific amount of time, use interval
// start := readState()
// time.Sleep(interval)
// stop := readState()
// value = (stop - start) / interval.Seconds()
y, err := lp.NewMessage("sample_metric", m.tags, m.meta, map[string]interface{}{"value": value}, timestamp)
if err == nil && m.output != nil {
// Send it to output channel if we have a valid channel
m.output <- y
}
}
func (m *SampleTimerCollector) Read(interval time.Duration, output chan lp.CCMessage) {
// Capture output channel
m.output = output
}
func (m *SampleTimerCollector) Close() {
// Send signal to the timer loop to stop it
m.done <- true
// Wait until the timer loop is done
m.wg.Wait()
// Unset flag
m.init = false
}

View File

@@ -1,161 +0,0 @@
// Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
// All rights reserved. This file is part of cc-lib.
// Use of this source code is governed by a MIT-style
// license that can be found in the LICENSE file.
// additional authors:
// Holger Obermaier (NHR@KIT)
package collectors
import (
"bufio"
"encoding/json"
"fmt"
"math"
"os"
"strconv"
"strings"
"time"
cclog "github.com/ClusterCockpit/cc-lib/ccLogger"
lp "github.com/ClusterCockpit/cc-lib/ccMessage"
)
const SCHEDSTATFILE = `/proc/schedstat`
// These are the fields we read from the JSON configuration
type SchedstatCollectorConfig struct {
ExcludeMetrics []string `json:"exclude_metrics,omitempty"`
}
// This contains all variables we need during execution and the variables
// defined by metricCollector (name, init, ...)
type SchedstatCollector struct {
metricCollector
config SchedstatCollectorConfig // the configuration structure
lastTimestamp time.Time // Store time stamp of last tick to derive values
meta map[string]string // default meta information
cputags map[string]map[string]string // default tags
olddata map[string]map[string]int64 // default tags
}
// Functions to implement MetricCollector interface
// Init(...), Read(...), Close()
// See: metricCollector.go
// Init initializes the sample collector
// Called once by the collector manager
// All tags, meta data tags and metrics that do not change over the runtime should be set here
func (m *SchedstatCollector) Init(config json.RawMessage) error {
var err error = nil
// Always set the name early in Init() to use it in cclog.Component* functions
m.name = "SchedstatCollector"
// This is for later use, also call it early
m.setup()
// Tell whether the collector should be run in parallel with others (reading files, ...)
// or it should be run serially, mostly for collectors acutally doing measurements
// because they should not measure the execution of the other collectors
m.parallel = true
// Define meta information sent with each metric
// (Can also be dynamic or this is the basic set with extension through AddMeta())
m.meta = map[string]string{"source": m.name, "group": "SCHEDSTAT"}
// Read in the JSON configuration
if len(config) > 0 {
err = json.Unmarshal(config, &m.config)
if err != nil {
cclog.ComponentError(m.name, "Error reading config:", err.Error())
return err
}
}
// Check input file
file, err := os.Open(string(SCHEDSTATFILE))
if err != nil {
cclog.ComponentError(m.name, err.Error())
}
defer file.Close()
// Pre-generate tags for all CPUs
num_cpus := 0
m.cputags = make(map[string]map[string]string)
m.olddata = make(map[string]map[string]int64)
scanner := bufio.NewScanner(file)
for scanner.Scan() {
line := scanner.Text()
linefields := strings.Fields(line)
if strings.HasPrefix(linefields[0], "cpu") && strings.Compare(linefields[0], "cpu") != 0 {
cpustr := strings.TrimLeft(linefields[0], "cpu")
cpu, _ := strconv.Atoi(cpustr)
running, _ := strconv.ParseInt(linefields[7], 10, 64)
waiting, _ := strconv.ParseInt(linefields[8], 10, 64)
m.cputags[linefields[0]] = map[string]string{"type": "hwthread", "type-id": fmt.Sprintf("%d", cpu)}
m.olddata[linefields[0]] = map[string]int64{"running": running, "waiting": waiting}
num_cpus++
}
}
// Save current timestamp
m.lastTimestamp = time.Now()
// Set this flag only if everything is initialized properly, all required files exist, ...
m.init = true
return err
}
func (m *SchedstatCollector) ParseProcLine(linefields []string, tags map[string]string, output chan lp.CCMessage, now time.Time, tsdelta time.Duration) {
running, _ := strconv.ParseInt(linefields[7], 10, 64)
waiting, _ := strconv.ParseInt(linefields[8], 10, 64)
diff_running := running - m.olddata[linefields[0]]["running"]
diff_waiting := waiting - m.olddata[linefields[0]]["waiting"]
var l_running float64 = float64(diff_running) / tsdelta.Seconds() / (math.Pow(1000, 3))
var l_waiting float64 = float64(diff_waiting) / tsdelta.Seconds() / (math.Pow(1000, 3))
m.olddata[linefields[0]]["running"] = running
m.olddata[linefields[0]]["waiting"] = waiting
value := l_running + l_waiting
y, err := lp.NewMessage("cpu_load_core", tags, m.meta, map[string]interface{}{"value": value}, now)
if err == nil {
// Send it to output channel
output <- y
}
}
// Read collects all metrics belonging to the sample collector
// and sends them through the output channel to the collector manager
func (m *SchedstatCollector) Read(interval time.Duration, output chan lp.CCMessage) {
if !m.init {
return
}
//timestamps
now := time.Now()
tsdelta := now.Sub(m.lastTimestamp)
file, err := os.Open(string(SCHEDSTATFILE))
if err != nil {
cclog.ComponentError(m.name, err.Error())
}
defer file.Close()
scanner := bufio.NewScanner(file)
for scanner.Scan() {
line := scanner.Text()
linefields := strings.Fields(line)
if strings.HasPrefix(linefields[0], "cpu") {
m.ParseProcLine(linefields, m.cputags[linefields[0]], output, now, tsdelta)
}
}
m.lastTimestamp = now
}
// Close metric collector: close network connection, close files, close libraries, ...
// Called once by the collector manager
func (m *SchedstatCollector) Close() {
// Unset flag
m.init = false
}

View File

@@ -1,21 +0,0 @@
<!--
---
title: SchedStat Metric collector
description: Collect metrics from `/proc/schedstat`
categories: [cc-metric-collector]
tags: ['Admin']
weight: 2
hugo_path: docs/reference/cc-metric-collector/collectors/schedstat.md
---
-->
## `schedstat` collector
```json
"schedstat": {
}
```
The `schedstat` collector reads data from /proc/schedstat and calculates a load value, separated by hwthread. This might be useful to detect bad cpu pinning on shared nodes etc.
Metric:
* `cpu_load_core`

View File

@@ -1,151 +0,0 @@
// Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
// All rights reserved. This file is part of cc-lib.
// Use of this source code is governed by a MIT-style
// license that can be found in the LICENSE file.
// additional authors:
// Holger Obermaier (NHR@KIT)
package collectors
import (
"encoding/json"
"runtime"
"syscall"
"time"
cclog "github.com/ClusterCockpit/cc-lib/ccLogger"
lp "github.com/ClusterCockpit/cc-lib/ccMessage"
)
type SelfCollectorConfig struct {
MemStats bool `json:"read_mem_stats"`
GoRoutines bool `json:"read_goroutines"`
CgoCalls bool `json:"read_cgo_calls"`
Rusage bool `json:"read_rusage"`
}
type SelfCollector struct {
metricCollector
config SelfCollectorConfig // the configuration structure
meta map[string]string // default meta information
tags map[string]string // default tags
}
func (m *SelfCollector) Init(config json.RawMessage) error {
var err error = nil
m.name = "SelfCollector"
m.setup()
m.parallel = true
m.meta = map[string]string{"source": m.name, "group": "Self"}
m.tags = map[string]string{"type": "node"}
if len(config) > 0 {
err = json.Unmarshal(config, &m.config)
if err != nil {
cclog.ComponentError(m.name, "Error reading config:", err.Error())
return err
}
}
m.init = true
return err
}
func (m *SelfCollector) Read(interval time.Duration, output chan lp.CCMessage) {
timestamp := time.Now()
if m.config.MemStats {
var memstats runtime.MemStats
runtime.ReadMemStats(&memstats)
y, err := lp.NewMessage("total_alloc", m.tags, m.meta, map[string]interface{}{"value": memstats.TotalAlloc}, timestamp)
if err == nil {
y.AddMeta("unit", "Bytes")
output <- y
}
y, err = lp.NewMessage("heap_alloc", m.tags, m.meta, map[string]interface{}{"value": memstats.HeapAlloc}, timestamp)
if err == nil {
y.AddMeta("unit", "Bytes")
output <- y
}
y, err = lp.NewMessage("heap_sys", m.tags, m.meta, map[string]interface{}{"value": memstats.HeapSys}, timestamp)
if err == nil {
y.AddMeta("unit", "Bytes")
output <- y
}
y, err = lp.NewMessage("heap_idle", m.tags, m.meta, map[string]interface{}{"value": memstats.HeapIdle}, timestamp)
if err == nil {
y.AddMeta("unit", "Bytes")
output <- y
}
y, err = lp.NewMessage("heap_inuse", m.tags, m.meta, map[string]interface{}{"value": memstats.HeapInuse}, timestamp)
if err == nil {
y.AddMeta("unit", "Bytes")
output <- y
}
y, err = lp.NewMessage("heap_released", m.tags, m.meta, map[string]interface{}{"value": memstats.HeapReleased}, timestamp)
if err == nil {
y.AddMeta("unit", "Bytes")
output <- y
}
y, err = lp.NewMessage("heap_objects", m.tags, m.meta, map[string]interface{}{"value": memstats.HeapObjects}, timestamp)
if err == nil {
output <- y
}
}
if m.config.GoRoutines {
y, err := lp.NewMessage("num_goroutines", m.tags, m.meta, map[string]interface{}{"value": runtime.NumGoroutine()}, timestamp)
if err == nil {
output <- y
}
}
if m.config.CgoCalls {
y, err := lp.NewMessage("num_cgo_calls", m.tags, m.meta, map[string]interface{}{"value": runtime.NumCgoCall()}, timestamp)
if err == nil {
output <- y
}
}
if m.config.Rusage {
var rusage syscall.Rusage
err := syscall.Getrusage(syscall.RUSAGE_SELF, &rusage)
if err == nil {
sec, nsec := rusage.Utime.Unix()
t := float64(sec) + (float64(nsec) * 1e-9)
y, err := lp.NewMessage("rusage_user_time", m.tags, m.meta, map[string]interface{}{"value": t}, timestamp)
if err == nil {
y.AddMeta("unit", "seconds")
output <- y
}
sec, nsec = rusage.Stime.Unix()
t = float64(sec) + (float64(nsec) * 1e-9)
y, err = lp.NewMessage("rusage_system_time", m.tags, m.meta, map[string]interface{}{"value": t}, timestamp)
if err == nil {
y.AddMeta("unit", "seconds")
output <- y
}
y, err = lp.NewMessage("rusage_vol_ctx_switch", m.tags, m.meta, map[string]interface{}{"value": rusage.Nvcsw}, timestamp)
if err == nil {
output <- y
}
y, err = lp.NewMessage("rusage_invol_ctx_switch", m.tags, m.meta, map[string]interface{}{"value": rusage.Nivcsw}, timestamp)
if err == nil {
output <- y
}
y, err = lp.NewMessage("rusage_signals", m.tags, m.meta, map[string]interface{}{"value": rusage.Nsignals}, timestamp)
if err == nil {
output <- y
}
y, err = lp.NewMessage("rusage_major_pgfaults", m.tags, m.meta, map[string]interface{}{"value": rusage.Majflt}, timestamp)
if err == nil {
output <- y
}
y, err = lp.NewMessage("rusage_minor_pgfaults", m.tags, m.meta, map[string]interface{}{"value": rusage.Minflt}, timestamp)
if err == nil {
output <- y
}
}
}
}
func (m *SelfCollector) Close() {
m.init = false
}

View File

@@ -1,45 +0,0 @@
<!--
---
title: Self-monitoring metric collector
description: Collect metrics from the execution of cc-metric-collector itself
categories: [cc-metric-collector]
tags: ['Admin']
weight: 2
hugo_path: docs/reference/cc-metric-collector/collectors/self.md
---
-->
## `self` collector
```json
"self": {
"read_mem_stats" : true,
"read_goroutines" : true,
"read_cgo_calls" : true,
"read_rusage" : true
}
```
The `self` collector reads the data from the `runtime` and `syscall` packages, so monitors the execution of the cc-metric-collector itself.
Metrics:
* If `read_mem_stats == true`:
* `total_alloc`: The metric reports cumulative bytes allocated for heap objects.
* `heap_alloc`: The metric reports bytes of allocated heap objects.
* `heap_sys`: The metric reports bytes of heap memory obtained from the OS.
* `heap_idle`: The metric reports bytes in idle (unused) spans.
* `heap_inuse`: The metric reports bytes in in-use spans.
* `heap_released`: The metric reports bytes of physical memory returned to the OS.
* `heap_objects`: The metric reports the number of allocated heap objects.
* If `read_goroutines == true`:
* `num_goroutines`: The metric reports the number of goroutines that currently exist.
* If `read_cgo_calls == true`:
* `num_cgo_calls`: The metric reports the number of cgo calls made by the current process.
* If `read_rusage == true`:
* `rusage_user_time`: The metric reports the amount of time that this process has been scheduled in user mode.
* `rusage_system_time`: The metric reports the amount of time that this process has been scheduled in kernel mode.
* `rusage_vol_ctx_switch`: The metric reports the amount of voluntary context switches.
* `rusage_invol_ctx_switch`: The metric reports the amount of involuntary context switches.
* `rusage_signals`: The metric reports the number of signals received.
* `rusage_major_pgfaults`: The metric reports the number of major faults the process has made which have required loading a memory page from disk.
* `rusage_minor_pgfaults`: The metric reports the number of minor faults the process has made which have not required loading a memory page from disk.

View File

@@ -1,242 +1,113 @@
// Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
// All rights reserved. This file is part of cc-lib.
// Use of this source code is governed by a MIT-style
// license that can be found in the LICENSE file.
// additional authors:
// Holger Obermaier (NHR@KIT)
package collectors
import (
"encoding/json"
"fmt"
"io/ioutil"
cclog "github.com/ClusterCockpit/cc-metric-collector/internal/ccLogger"
"os"
"path/filepath"
"strconv"
"strings"
"time"
cclog "github.com/ClusterCockpit/cc-lib/ccLogger"
lp "github.com/ClusterCockpit/cc-lib/ccMessage"
lp "github.com/ClusterCockpit/cc-metric-collector/internal/ccMetric"
)
// See: https://www.kernel.org/doc/html/latest/hwmon/sysfs-interface.html
// /sys/class/hwmon/hwmon*/name -> coretemp
// /sys/class/hwmon/hwmon*/temp*_label -> Core 0
// /sys/class/hwmon/hwmon*/temp*_input -> 27800 = 27.8°C
// /sys/class/hwmon/hwmon*/temp*_max -> 86000 = 86.0°C
// /sys/class/hwmon/hwmon*/temp*_crit -> 100000 = 100.0°C
const HWMON_PATH = `/sys/class/hwmon`
type TempCollectorSensor struct {
name string
label string
metricName string // Default: name_label
file string
maxTempName string
maxTemp int64
critTempName string
critTemp int64
tags map[string]string
type TempCollectorConfig struct {
ExcludeMetrics []string `json:"exclude_metrics"`
TagOverride map[string]map[string]string `json:"tag_override"`
}
type TempCollector struct {
metricCollector
config struct {
ExcludeMetrics []string `json:"exclude_metrics"`
TagOverride map[string]map[string]string `json:"tag_override"`
ReportMaxTemp bool `json:"report_max_temperature"`
ReportCriticalTemp bool `json:"report_critical_temperature"`
}
sensors []*TempCollectorSensor
config TempCollectorConfig
}
func (m *TempCollector) Init(config json.RawMessage) error {
// Check if already initialized
if m.init {
return nil
}
m.name = "TempCollector"
m.parallel = true
m.setup()
m.meta = map[string]string{"source": m.name, "group": "IPMI", "unit": "degC"}
if len(config) > 0 {
err := json.Unmarshal(config, &m.config)
if err != nil {
return err
}
}
m.meta = map[string]string{
"source": m.name,
"group": "IPMI",
"unit": "degC",
}
m.sensors = make([]*TempCollectorSensor, 0)
// Find all temperature sensor files
globPattern := filepath.Join("/sys/class/hwmon", "*", "temp*_input")
inputFiles, err := filepath.Glob(globPattern)
if err != nil {
return fmt.Errorf("unable to glob files with pattern '%s': %v", globPattern, err)
}
if inputFiles == nil {
return fmt.Errorf("unable to find any files with pattern '%s'", globPattern)
}
// Get sensor name for each temperature sensor file
for _, file := range inputFiles {
sensor := new(TempCollectorSensor)
// sensor name
nameFile := filepath.Join(filepath.Dir(file), "name")
name, err := os.ReadFile(nameFile)
if err == nil {
sensor.name = strings.TrimSpace(string(name))
}
// sensor label
labelFile := strings.TrimSuffix(file, "_input") + "_label"
label, err := os.ReadFile(labelFile)
if err == nil {
sensor.label = strings.TrimSpace(string(label))
}
// sensor metric name
switch {
case len(sensor.name) == 0 && len(sensor.label) == 0:
continue
case sensor.name == "coretemp" && strings.HasPrefix(sensor.label, "Core ") ||
sensor.name == "coretemp" && strings.HasPrefix(sensor.label, "Package id "):
sensor.metricName = "temp_" + sensor.label
case len(sensor.name) != 0 && len(sensor.label) != 0:
sensor.metricName = sensor.name + "_" + sensor.label
case len(sensor.name) != 0:
sensor.metricName = sensor.name
case len(sensor.label) != 0:
sensor.metricName = sensor.label
}
sensor.metricName = strings.ToLower(sensor.metricName)
sensor.metricName = strings.Replace(sensor.metricName, " ", "_", -1)
// Add temperature prefix, if required
if !strings.Contains(sensor.metricName, "temp") {
sensor.metricName = "temp_" + sensor.metricName
}
// Sensor file
_, err = os.ReadFile(file)
if err != nil {
continue
}
sensor.file = file
// Sensor tags
sensor.tags = map[string]string{
"type": "node",
}
// Apply tag override configuration
for key, newtags := range m.config.TagOverride {
if strings.Contains(sensor.file, key) {
sensor.tags = newtags
break
}
}
// max temperature
if m.config.ReportMaxTemp {
maxTempFile := strings.TrimSuffix(file, "_input") + "_max"
if buffer, err := os.ReadFile(maxTempFile); err == nil {
if x, err := strconv.ParseInt(strings.TrimSpace(string(buffer)), 10, 64); err == nil {
sensor.maxTempName = strings.Replace(sensor.metricName, "temp", "max_temp", 1)
sensor.maxTemp = x / 1000
}
}
}
// critical temperature
if m.config.ReportCriticalTemp {
criticalTempFile := strings.TrimSuffix(file, "_input") + "_crit"
if buffer, err := os.ReadFile(criticalTempFile); err == nil {
if x, err := strconv.ParseInt(strings.TrimSpace(string(buffer)), 10, 64); err == nil {
sensor.critTempName = strings.Replace(sensor.metricName, "temp", "crit_temp", 1)
sensor.critTemp = x / 1000
}
}
}
m.sensors = append(m.sensors, sensor)
}
// Empty sensors map
if len(m.sensors) == 0 {
return fmt.Errorf("no temperature sensors found")
}
// Finished initialization
m.init = true
return nil
}
func (m *TempCollector) Read(interval time.Duration, output chan lp.CCMessage) {
func get_hwmon_sensors() (map[string]map[string]string, error) {
var folders []string
var sensors map[string]map[string]string
sensors = make(map[string]map[string]string)
err := filepath.Walk(HWMON_PATH, func(p string, info os.FileInfo, err error) error {
if info.IsDir() {
return nil
}
folders = append(folders, p)
return nil
})
if err != nil {
return sensors, err
}
for _, sensor := range m.sensors {
// Read sensor file
buffer, err := os.ReadFile(sensor.file)
if err != nil {
cclog.ComponentError(
m.name,
fmt.Sprintf("Read(): Failed to read file '%s': %v", sensor.file, err))
continue
}
x, err := strconv.ParseInt(strings.TrimSpace(string(buffer)), 10, 64)
if err != nil {
cclog.ComponentError(
m.name,
fmt.Sprintf("Read(): Failed to convert temperature '%s' to int64: %v", buffer, err))
continue
}
x /= 1000
y, err := lp.NewMessage(
sensor.metricName,
sensor.tags,
m.meta,
map[string]interface{}{"value": x},
time.Now(),
)
if err == nil {
output <- y
}
// max temperature
if m.config.ReportMaxTemp && sensor.maxTemp != 0 {
y, err := lp.NewMessage(
sensor.maxTempName,
sensor.tags,
m.meta,
map[string]interface{}{"value": sensor.maxTemp},
time.Now(),
)
if err == nil {
output <- y
for _, f := range folders {
sensors[f] = make(map[string]string)
myp := fmt.Sprintf("%s/", f)
err := filepath.Walk(myp, func(path string, info os.FileInfo, err error) error {
dir, fname := filepath.Split(path)
if strings.Contains(fname, "temp") && strings.Contains(fname, "_input") {
namefile := fmt.Sprintf("%s/%s", dir, strings.Replace(fname, "_input", "_label", -1))
name, ierr := ioutil.ReadFile(namefile)
if ierr == nil {
sensors[f][strings.Replace(string(name), "\n", "", -1)] = path
}
}
return nil
})
if err != nil {
continue
}
}
return sensors, nil
}
// critical temperature
if m.config.ReportCriticalTemp && sensor.critTemp != 0 {
y, err := lp.NewMessage(
sensor.critTempName,
sensor.tags,
m.meta,
map[string]interface{}{"value": sensor.critTemp},
time.Now(),
)
func (m *TempCollector) Read(interval time.Duration, output chan lp.CCMetric) {
sensors, err := get_hwmon_sensors()
if err != nil {
return
}
for _, files := range sensors {
for name, file := range files {
tags := map[string]string{"type": "node"}
for key, newtags := range m.config.TagOverride {
if strings.Contains(file, key) {
tags = newtags
break
}
}
mname := strings.Replace(name, " ", "_", -1)
if !strings.Contains(mname, "temp") {
mname = fmt.Sprintf("temp_%s", mname)
}
buffer, err := ioutil.ReadFile(string(file))
if err != nil {
continue
}
x, err := strconv.ParseInt(strings.Replace(string(buffer), "\n", "", -1), 0, 64)
if err == nil {
output <- y
y, err := lp.New(strings.ToLower(mname), tags, m.meta, map[string]interface{}{"value": int(float64(x) / 1000)}, time.Now())
if err == nil {
cclog.ComponentDebug(m.name, y)
output <- y
}
}
}
}
}
func (m *TempCollector) Close() {

View File

@@ -1,14 +1,3 @@
<!--
---
title: Temperature metric collector
description: Collect thermal metrics from `/sys/class/hwmon/*`
categories: [cc-metric-collector]
tags: ['Admin']
weight: 2
hugo_path: docs/reference/cc-metric-collector/collectors/temp.md
---
-->
## `tempstat` collector

View File

@@ -1,10 +1,3 @@
// Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
// All rights reserved. This file is part of cc-lib.
// Use of this source code is governed by a MIT-style
// license that can be found in the LICENSE file.
// additional authors:
// Holger Obermaier (NHR@KIT)
package collectors
import (
@@ -15,8 +8,7 @@ import (
"os/exec"
"strings"
"time"
lp "github.com/ClusterCockpit/cc-lib/ccMessage"
lp "github.com/ClusterCockpit/cc-metric-collector/internal/ccMetric"
)
const MAX_NUM_PROCS = 10
@@ -35,7 +27,6 @@ type TopProcsCollector struct {
func (m *TopProcsCollector) Init(config json.RawMessage) error {
var err error
m.name = "TopProcsCollector"
m.parallel = true
m.tags = map[string]string{"type": "node"}
m.meta = map[string]string{"source": m.name, "group": "TopProcs"}
if len(config) > 0 {
@@ -47,20 +38,20 @@ func (m *TopProcsCollector) Init(config json.RawMessage) error {
m.config.Num_procs = int(DEFAULT_NUM_PROCS)
}
if m.config.Num_procs <= 0 || m.config.Num_procs > MAX_NUM_PROCS {
return fmt.Errorf("num_procs option must be set in 'topprocs' config (range: 1-%d)", MAX_NUM_PROCS)
return errors.New(fmt.Sprintf("num_procs option must be set in 'topprocs' config (range: 1-%d)", MAX_NUM_PROCS))
}
m.setup()
command := exec.Command("ps", "-Ao", "comm", "--sort=-pcpu")
command.Wait()
_, err = command.Output()
if err != nil {
return errors.New("failed to execute command")
return errors.New("Failed to execute command")
}
m.init = true
return nil
}
func (m *TopProcsCollector) Read(interval time.Duration, output chan lp.CCMessage) {
func (m *TopProcsCollector) Read(interval time.Duration, output chan lp.CCMetric) {
if !m.init {
return
}
@@ -75,7 +66,7 @@ func (m *TopProcsCollector) Read(interval time.Duration, output chan lp.CCMessag
lines := strings.Split(string(stdout), "\n")
for i := 1; i < m.config.Num_procs+1; i++ {
name := fmt.Sprintf("topproc%d", i)
y, err := lp.NewMessage(name, m.tags, m.meta, map[string]interface{}{"value": string(lines[i])}, time.Now())
y, err := lp.New(name, m.tags, m.meta, map[string]interface{}{"value": string(lines[i])}, time.Now())
if err == nil {
output <- y
}

View File

@@ -1,15 +1,3 @@
<!--
---
title: TopProcs collector
description: Collect infos about most CPU-consuming processes
categories: [cc-metric-collector]
tags: ['Admin']
weight: 2
hugo_path: docs/reference/cc-metric-collector/collectors/topprocs.md
---
-->
## `topprocs` collector

View File

@@ -1,10 +1,8 @@
{
"sinks-file": "./sinks.json",
"collectors-file" : "./collectors.json",
"receivers-file" : "./receivers.json",
"router-file" : "./router.json",
"main" : {
"interval": "10s",
"duration": "1s"
}
"sinks": "sinks.json",
"collectors" : "collectors.json",
"receivers" : "receivers.json",
"router" : "router.json",
"interval": 10,
"duration": 1
}

View File

@@ -1,74 +0,0 @@
# Building the cc-metric-collector
In most cases, a simple `make` in the main folder is enough to get a `cc-metric-collector` binary. It is basically a `go build` but some collectors require additional tasks. There is currently no Golang interface to LIKWID, so it uses `cgo` to create bindings but `cgo` requires the LIKWID header files. Therefore, it checks whether LIKWID is installed and if not it downloads LIKWID and copies the headers.
## System integration
The main configuration settings for system integration are pre-defined in `scripts/cc-metric-collector.config`. The file contains the UNIX user and group used for execution, the PID file location and other settings. Adjust it accordingly and copy it to `/etc/default/cc-metric-collector`
```bash
$ install --mode 644 \
--owner $CC_USER \
--group $CC_GROUP \
scripts/cc-metric-collector.config /etc/default/cc-metric-collector
$ edit /etc/default/cc-metric-collector
```
### SysVinit and similar
If you are using a init system based in `/etc/init.d` daemons, you can use the sample `scripts/cc-metric-collector.init`. It reads the basic configuration from `/etc/default/cc-metric-collector`
```bash
$ install --mode 755 \
--owner $CC_USER \
--group $CC_GROUP \
scripts/cc-metric-collector.init /etc/init.d/cc-metric-collector
```
### Systemd
If you are using `systemd` as init system, you can use the sample systemd service file `scripts/cc-metric-collector.service`, the configuration file `scripts/cc-metric-collector.config`.
```bash
$ install --mode 644 \
--owner $CC_USER \
--group $CC_GROUP \
scripts/cc-metric-collector.service /etc/systemd/system/cc-metric-collector.service
$ systemctl enable cc-metric-collector
```
## Packaging
### RPM
In order to get a RPM packages for cc-metric-collector, just use:
```bash
$ make RPM
```
It uses the RPM SPEC file `scripts/cc-metric-collector.spec` and requires the RPM tools (`rpm` and `rpmspec`) and `git`.
### DEB
In order to get very simple Debian packages for cc-metric-collector, just use:
```bash
$ make DEB
```
It uses the DEB control file `scripts/cc-metric-collector.control` and requires `dpkg-deb`, `awk`, `sed` and `git`. It creates only a binary deb package.
_This option is not well tested and therefore experimental_
### Customizing RPMs or DEB packages
If you want to customize the RPMs or DEB packages for your local system, use the following workflow.
- (if there is already a fork in the private account, delete it and wait until Github realizes the deletion)
- Fork the cc-metric-collector repository (if Github hasn't realized it, it creates a fork named cc-metric-collector2)
- Go to private cc-metric-collector repository and enable Github Actions
- Do changes to the scripts, code, ... Commit and push your changes.
- Tag the new commit with `v0.x.y-<myversion>` (`git tag v0.x.y-<myversion>`)
- Push tags to repository (`git push --tags`)
- Wait until the Release action finishes. It creates fresh RPMs and DEBs in your private repository on the Releases page.

View File

@@ -1,189 +0,0 @@
# Configuring the CC metric collector
The configuration of the CC metric collector consists of five configuration files: one global file and four component related files.
## Global configuration
The global file contains the paths to the other four files and some global options.
```json
{
"sinks": "sinks.json",
"collectors" : "collectors.json",
"receivers" : "receivers.json",
"router" : "router.json",
"interval": "10s",
"duration": "1s"
}
```
Be aware that the paths are relative to the execution folder of the cc-metric-collector binary, so it is recommended to use absolute paths.
## Component configuration
The others are mainly list of of subcomponents: the collectors, the receivers, the router and the sinks. Their role is best shown in a picture:
```mermaid
flowchart LR
subgraph col ["Collectors"]
direction TB
cpustat["cpustat"]
memstat["memstat"]
tempstat["tempstat"]
misc["..."]
end
subgraph Receivers ["Receivers"]
direction TB
nats["NATS"]
httprecv["HTTP"]
miscrecv[...]
end
subgraph calc["Aggregator"]
direction LR
cache["Cache"]
agg["Calculator"]
end
subgraph sinks ["Sinks"]
direction RL
influx["InfluxDB"]
ganglia["Ganglia"]
logger["Logfile"]
miscsink["..."]
end
cpustat --> CollectorManager["CollectorManager"]
memstat --> CollectorManager
tempstat --> CollectorManager
misc --> CollectorManager
nats --> ReceiverManager["ReceiverManager"]
httprecv --> ReceiverManager
miscrecv --> ReceiverManager
CollectorManager --> newrouter["Router"]
ReceiverManager -.-> newrouter
calc -.-> newrouter
newrouter --> SinkManager["SinkManager"]
newrouter -.-> calc
SinkManager --> influx
SinkManager --> ganglia
SinkManager --> logger
SinkManager --> miscsink
```
There are four parts:
- The collectors read data from files, execute commands and call dynamically loaded library function and send it to the router
- The router can process metrics by cacheing and evaluating functions and conditions on them
- The sinks send the metrics to storage backends
- The receivers can be used to receive metrics from other collectors and forward them to the router. They can be used to create a tree-like structure of collectors.
(A maybe better differentiation between collectors and receivers is that the collectors are called periodically while the receivers have their own logic and submit metrics at any time)
### Collectors configuration file
The collectors configuration file tells which metrics should be queried from the system. The metric gathering is logically grouped in so called 'Collectors'. So there are Collectors to read CPU, memory or filesystem statistics. The collectors configuration file is a list of these collectors with collector-specific configurations:
```json
{
"cpustat" : {},
"diskstat": {
"exclude_metrics": [
"disk_total"
]
}
}
```
The first one is the CPU statistics collector without any collector-specific setting. The second one enables disk mount statistics but excludes the metric `disk_total`.
All names and possible collector-specific configuration options can be found [here](../collectors/README.md).
Some collectors might dynamically load shared libraries. In order to enable these collectors, make sure that the shared library path is part of the `LD_LIBRARY_PATH` environment variable.
### Sinks configuration file
The sinks define the output/sending of metrics. The metrics can be forwarded to multiple sinks, even to sinks of the same type. The sinks configuration file is a list of these sinks, each with an individual name.
```json
{
"myinflux" : {
"type" : "influxasync",
"host": "localhost",
"port": "8086",
"organization" : "testorga",
"database" : "testbucket",
"password" : "<my secret JWT>"
},
"companyinflux" : {
"type" : "influxasync",
"host": "companyhost",
"port": "8086",
"organization" : "company",
"database" : "main",
"password" : "<company's secret JWT>"
}
}
```
The above example configuration file defines two sink, both ot type `influxasync`. They are differentiated internally by the names: `myinflux` and `companyinflux`.
All types and possible sink-specific configuration options can be found [here](../sinks/README.md).
Some sinks might dynamically load shared libraries. In order to enable these sinks, make sure that the shared library path is part of the `LD_LIBRARY_PATH` environment variable.
### Router configuration file
The collectors and the sinks are connected through the router. The router forwards the metrics to the sinks but enables some data processing. A common example is to tag all passing metrics like adding `cluster=mycluster`. But also aggregations like "take the average of all 'ipc' metrics" (ipc -> Instructions Per Cycle). Since the configurations of these aggregations can be quite complicated, we refer to the router's [README](../internal/metricRouter/README.md).
A simple router configuration file to start with looks like this:
```json
{
"add_tags" : [
{
"key" : "cluster",
"value" : "mycluster",
"if" : "*"
}
],
"interval_timestamp" : false,
"num_cache_intervals" : 0
}
```
With the `add_tags` section, we tell to attach the `cluster=mycluster` tag to each (`*` metric). The `interval_timestamp` tell the router to not touch the timestamp of metrics. It is possible to send all metrics within an interval with a common time stamp to avoid later alignment issues. The `num_cache_intervals` diables the cache completely. The cache is only required if you want to do complex metric aggregations.
All configuration options can be found [here](../internal/metricRouter/README.md).
### Receivers configuration file
The receivers are a special feature of the CC Metric Collector to enable simpler integration into exising setups. While collectors query data from the local system, the receivers commonly get data from other systems through some network technology like HTTP or NATS. The idea is keep the current setup but send it to a CC Metric Collector which forwards it to the the destination system (if a sink exists for it). For most setups, the receivers are not required and an the receiver config file should contain only an empty JSON map (`{}`).
```json
{
"nats_rack0": {
"type": "nats",
"address" : "nats-server.example.org",
"port" : "4222",
"subject" : "rack0",
},
"nats_rack1": {
"type": "nats",
"address" : "nats-server.example.org",
"port" : "4222",
"subject" : "rack1",
}
}
```
This example configuration creates two receivers with the names `nats_rack0` and `nats_rack1`. While one subscribes to metrics published with the `rack0` subject, the other one subscribes to the `rack0` subject. The NATS server is the same as it manages all subjects in a subnet. (As example, the router could add tags `rack=0` and `rack=1` respectively to the received metrics.)
All types and possible receiver-specific configuration options can be found [here](../receivers/README.md).

View File

@@ -1,23 +0,0 @@
# The ClusterCockpit Project
The ClusterCockpit project is a joined project of computing centers in Europe to set up a cluster monitoring stack for small to mid-sized computing centers under the lead of NHR@FAU.
# The ClusterCockpit Stack
In cluster environment, there are commonly a lot of systems dedicated for computation, backend servers for file systems and frontend servers for the user interaction and cluster control. The ClusterCockpit Stack is mainly used for monitoring the compute systems with some interaction to the frontend servers. It consists of multiple components:
- cc-metric-collector: Monitor resource usage on the compute systems
- cc-metric-store: In-memory database
- cc-backend & cc-frontend: The web-based visualizer
# CC Metric Collector
The CC Metric Collector project was started to provide a useful set of metrics for HPC and data science related compute systems. It runs as a system daemon and gathers system data periodically to forward the metrics to one or more databases. One of the provided backends can be used for the cc-metric-store but many others exist like InfluxDB time-series databases, the Ganglia Monitoring System or the Prometheus Monitoring System.
The data is gathered by so-called "Collectors", forwarded to an internal router for on-the-fly manipulation (tagging, aggregation, ...) which pushes the metrics to the different metric writers called "Sinks". There is a forth component, the "Receivers", which receive data through some networking system like a HTTP server at any time.
# CC Metric Store
The CC Metric Store is a data management system with short-term in-memory and long-term file-base metric storage.
# CC Backend and CC Frontend
The CC Backend and Frontend form together the web interface for ClusterCockpit.

48
go.mod
View File

@@ -1,46 +1,20 @@
module github.com/ClusterCockpit/cc-metric-collector
go 1.23.4
toolchain go1.23.7
go 1.16
require (
github.com/ClusterCockpit/cc-lib v0.5.0
github.com/ClusterCockpit/go-rocm-smi v0.3.0
github.com/NVIDIA/go-nvml v0.12.9-0
github.com/PaesslerAG/gval v1.2.2
github.com/fsnotify/fsnotify v1.9.0
github.com/NVIDIA/go-nvml v0.11.1-0
github.com/influxdata/influxdb-client-go/v2 v2.7.0
github.com/influxdata/line-protocol v0.0.0-20210922203350-b1ad95c89adf
github.com/tklauser/go-sysconf v0.3.13
golang.design/x/thread v0.0.0-20210122121316-335e9adffdf1
golang.org/x/exp v0.0.0-20250620022241-b7579e27df2b
golang.org/x/sys v0.33.0
github.com/nats-io/nats.go v1.13.1-0.20211122170419-d7c1d78a50fc
golang.org/x/sys v0.0.0-20220114195835-da31bd327af9
gopkg.in/Knetic/govaluate.v2 v2.3.0
)
require (
github.com/apapsch/go-jsonmerge/v2 v2.0.0 // indirect
github.com/beorn7/perks v1.0.1 // indirect
github.com/cespare/xxhash/v2 v2.3.0 // indirect
github.com/expr-lang/expr v1.17.5 // indirect
github.com/google/uuid v1.6.0 // indirect
github.com/gorilla/mux v1.8.1 // indirect
github.com/influxdata/influxdb-client-go/v2 v2.14.0 // indirect
github.com/influxdata/line-protocol/v2 v2.2.1 // indirect
github.com/klauspost/compress v1.18.0 // indirect
github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822 // indirect
github.com/nats-io/nats.go v1.43.0 // indirect
github.com/nats-io/nkeys v0.4.11 // indirect
github.com/nats-io/nuid v1.0.1 // indirect
github.com/oapi-codegen/runtime v1.1.1 // indirect
github.com/prometheus/client_golang v1.22.0 // indirect
github.com/prometheus/client_model v0.6.2 // indirect
github.com/prometheus/common v0.65.0 // indirect
github.com/prometheus/procfs v0.16.1 // indirect
github.com/santhosh-tekuri/jsonschema/v5 v5.3.1 // indirect
github.com/shopspring/decimal v1.3.1 // indirect
github.com/stmcginnis/gofish v0.20.0 // indirect
github.com/tklauser/numcpus v0.7.0 // indirect
golang.org/x/crypto v0.39.0 // indirect
golang.org/x/net v0.41.0 // indirect
google.golang.org/protobuf v1.36.6 // indirect
github.com/PaesslerAG/gval v1.1.2
github.com/golang/protobuf v1.5.2 // indirect
github.com/mattn/go-sqlite3 v1.14.11
github.com/nats-io/nats-server/v2 v2.7.0 // indirect
google.golang.org/protobuf v1.27.1 // indirect
)

232
go.sum
View File

@@ -1,119 +1,147 @@
github.com/ClusterCockpit/cc-lib v0.5.0 h1:DSKAD1TxjVWyd1x3GWvxFeEkANF9o13T97nirj3CbRU=
github.com/ClusterCockpit/cc-lib v0.5.0/go.mod h1:0zLbJprwOWLA+OSNQ+OlUKLscZszwf9J2j8Ly5ztplk=
github.com/ClusterCockpit/go-rocm-smi v0.3.0 h1:1qZnSpG7/NyLtc7AjqnUL9Jb8xtqG1nMVgp69rJfaR8=
github.com/ClusterCockpit/go-rocm-smi v0.3.0/go.mod h1:+I3UMeX3OlizXDf1WpGD43W4KGZZGVSGmny6rTeOnWA=
github.com/NVIDIA/go-nvml v0.11.6-0/go.mod h1:hy7HYeQy335x6nEss0Ne3PYqleRa6Ct+VKD9RQ4nyFs=
github.com/NVIDIA/go-nvml v0.12.9-0 h1:e344UK8ZkeMeeLkdQtRhmXRxNf+u532LDZPGMtkdus0=
github.com/NVIDIA/go-nvml v0.12.9-0/go.mod h1:+KNA7c7gIBH7SKSJ1ntlwkfN80zdx8ovl4hrK3LmPt4=
github.com/PaesslerAG/gval v1.2.4 h1:rhX7MpjJlcxYwL2eTTYIOBUyEKZ+A96T9vQySWkVUiU=
github.com/PaesslerAG/gval v1.2.4/go.mod h1:XRFLwvmkTEdYziLdaCeCa5ImcGVrfQbeNUbVR+C6xac=
github.com/PaesslerAG/jsonpath v0.1.0 h1:gADYeifvlqK3R3i2cR5B4DGgxLXIPb3TRTH1mGi0jPI=
github.com/NVIDIA/go-nvml v0.11.1-0 h1:XHSz3zZKC4NCP2ja1rI7++DXFhA+uDhdYa3MykCTGHY=
github.com/NVIDIA/go-nvml v0.11.1-0/go.mod h1:hy7HYeQy335x6nEss0Ne3PYqleRa6Ct+VKD9RQ4nyFs=
github.com/PaesslerAG/gval v1.1.2 h1:EROKxV4/fAKWb0Qoj7NOxmHZA7gcpjOV9XgiRZMRCUU=
github.com/PaesslerAG/gval v1.1.2/go.mod h1:Fa8gfkCmUsELXgayr8sfL/sw+VzCVoa03dcOcR/if2w=
github.com/PaesslerAG/jsonpath v0.1.0/go.mod h1:4BzmtoM/PI8fPO4aQGIusjGxGir2BzcV0grWtFzq1Y8=
github.com/RaveNoX/go-jsoncommentstrip v1.0.0/go.mod h1:78ihd09MekBnJnxpICcwzCMzGrKSKYe4AqU6PDYYpjk=
github.com/apapsch/go-jsonmerge/v2 v2.0.0 h1:axGnT1gRIfimI7gJifB699GoE/oq+F2MU7Dml6nw9rQ=
github.com/apapsch/go-jsonmerge/v2 v2.0.0/go.mod h1:lvDnEdqiQrp0O42VQGgmlKpxL1AP2+08jFMw88y4klk=
github.com/beorn7/perks v1.0.1 h1:VlbKKnNfV8bJzeqoa4cOKqO6bYr3WgKZxO8Z16+hsOM=
github.com/beorn7/perks v1.0.1/go.mod h1:G2ZrVWU2WbWT9wwq4/hrbKbnv/1ERSJQ0ibhJ6rlkpw=
github.com/bmatcuk/doublestar v1.1.1/go.mod h1:UD6OnuiIn0yFxxA2le/rnRU1G4RaI4UvFv1sNto9p6w=
github.com/cespare/xxhash/v2 v2.3.0 h1:UL815xU9SqsFlibzuggzjXhog7bL6oX9BbNZnL2UFvs=
github.com/cespare/xxhash/v2 v2.3.0/go.mod h1:VGX0DQ3Q6kWi7AoAeZDth3/j3BFtOZR5XLFGgcrjCOs=
github.com/creack/pty v1.1.9/go.mod h1:oKZEueFk5CKHvIhNR5MUki03XCEU+Q6VDXinZuGJ33E=
github.com/cyberdelia/templates v0.0.0-20141128023046-ca7fffd4298c/go.mod h1:GyV+0YP4qX0UQ7r2MoYZ+AvYDp12OF5yg4q8rGnyNh4=
github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c=
github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
github.com/expr-lang/expr v1.17.5 h1:i1WrMvcdLF249nSNlpQZN1S6NXuW9WaOfF5tPi3aw3k=
github.com/expr-lang/expr v1.17.5/go.mod h1:8/vRC7+7HBzESEqt5kKpYXxrxkr31SaO8r40VO/1IT4=
github.com/frankban/quicktest v1.11.0/go.mod h1:K+q6oSqb0W0Ininfk863uOk1lMy69l/P6txr3mVT54s=
github.com/frankban/quicktest v1.11.2/go.mod h1:K+q6oSqb0W0Ininfk863uOk1lMy69l/P6txr3mVT54s=
github.com/frankban/quicktest v1.13.0 h1:yNZif1OkDfNoDfb9zZa9aXIpejNR4F23Wely0c+Qdqk=
github.com/frankban/quicktest v1.13.0/go.mod h1:qLE0fzW0VuyUAJgPU19zByoIr0HtCHN/r/VLSOOIySU=
github.com/fsnotify/fsnotify v1.9.0 h1:2Ml+OJNzbYCTzsxtv8vKSFD9PbJjmhYF14k/jKC7S9k=
github.com/fsnotify/fsnotify v1.9.0/go.mod h1:8jBTzvmWwFyi3Pb8djgCCO5IBqzKJ/Jwo8TRcHyHii0=
github.com/google/go-cmp v0.5.2/go.mod h1:v8dTdLbMG2kIc/vJvl+f65V22dbkXbowE6jgT/gNBxE=
github.com/deepmap/oapi-codegen v1.8.2 h1:SegyeYGcdi0jLLrpbCMoJxnUUn8GBXHsvr4rbzjuhfU=
github.com/deepmap/oapi-codegen v1.8.2/go.mod h1:YLgSKSDv/bZQB7N4ws6luhozi3cEdRktEqrX88CvjIw=
github.com/dgrijalva/jwt-go v3.2.0+incompatible/go.mod h1:E3ru+11k8xSBh+hMPgOLZmtrrCbhqsmaPHjLKYnJCaQ=
github.com/getkin/kin-openapi v0.61.0/go.mod h1:7Yn5whZr5kJi6t+kShccXS8ae1APpYTW6yheSwk8Yi4=
github.com/ghodss/yaml v1.0.0/go.mod h1:4dBDuWmgqj2HViK6kFavaiC9ZROes6MMH2rRYeMEF04=
github.com/go-chi/chi/v5 v5.0.0/go.mod h1:BBug9lr0cqtdAhsu6R4AAdvufI0/XBzAQSsUqJpoZOs=
github.com/go-openapi/jsonpointer v0.19.5/go.mod h1:Pl9vOtqEWErmShwVjC8pYs9cog34VGT37dQOVbmoatg=
github.com/go-openapi/swag v0.19.5/go.mod h1:POnQmlKehdgb5mhVOsnJFsivZCEZ/vjK9gh66Z9tfKk=
github.com/golang/protobuf v1.4.0-rc.1/go.mod h1:ceaxUfeHdC40wWswd/P6IGgMaK3YpKi5j83Wpe3EHw8=
github.com/golang/protobuf v1.4.0-rc.1.0.20200221234624-67d41d38c208/go.mod h1:xKAWHe0F5eneWXFV3EuXVDTCmh+JuBKY0li0aMyXATA=
github.com/golang/protobuf v1.4.0-rc.2/go.mod h1:LlEzMj4AhA7rCAGe4KMBDvJI+AwstrUpVNzEA03Pprs=
github.com/golang/protobuf v1.4.0-rc.4.0.20200313231945-b860323f09d0/go.mod h1:WU3c8KckQ9AFe+yFwt9sWVRKCVIyN9cPHBJSNnbL67w=
github.com/golang/protobuf v1.4.0/go.mod h1:jodUvKwWbYaEsadDk5Fwe5c77LiNKVO9IDvqG2KuDX0=
github.com/golang/protobuf v1.4.2/go.mod h1:oDoupMAO8OvCJWAcko0GGGIgR6R6ocIYbsSw735rRwI=
github.com/golang/protobuf v1.5.0/go.mod h1:FsONVRAS9T7sI+LIUmWTfcYkHO4aIWwzhcaSAoJOfIk=
github.com/golang/protobuf v1.5.2 h1:ROPKBNFfQgOUMifHyP+KYbvpjbdoFNs+aK7DXlji0Tw=
github.com/golang/protobuf v1.5.2/go.mod h1:XVQd3VNwM+JqD3oG2Ue2ip4fOMUkwXdXDdiuN0vRsmY=
github.com/golang/snappy v0.0.3/go.mod h1:/XxbfmMg8lxefKM7IXC3fBNl/7bRcc72aCRzEWrmP2Q=
github.com/golangci/lint-1 v0.0.0-20181222135242-d2cdd8c08219/go.mod h1:/X8TswGSh1pIozq4ZwCfxS0WA5JGXguxk94ar/4c87Y=
github.com/google/go-cmp v0.3.0/go.mod h1:8QqcDgzrUqlUb/G2PQTWiueGozuR1884gddMywk6iLU=
github.com/google/go-cmp v0.3.1/go.mod h1:8QqcDgzrUqlUb/G2PQTWiueGozuR1884gddMywk6iLU=
github.com/google/go-cmp v0.4.0/go.mod h1:v8dTdLbMG2kIc/vJvl+f65V22dbkXbowE6jgT/gNBxE=
github.com/google/go-cmp v0.5.5/go.mod h1:v8dTdLbMG2kIc/vJvl+f65V22dbkXbowE6jgT/gNBxE=
github.com/google/go-cmp v0.7.0 h1:wk8382ETsv4JYUZwIsn6YpYiWiBsYLSJiTsyBybVuN8=
github.com/google/go-cmp v0.7.0/go.mod h1:pXiqmnSA92OHEEa9HXL2W4E7lf9JzCmGVUdgjX3N/iU=
github.com/google/uuid v1.6.0 h1:NIvaJDMOsjHA8n1jAhLSgzrAzy1Hgr+hNrb57e+94F0=
github.com/google/uuid v1.6.0/go.mod h1:TIyPZe4MgqvfeYDBFedMoGGpEw/LqOeaOT+nhxU+yHo=
github.com/gorilla/mux v1.8.1 h1:TuBL49tXwgrFYWhqrNgrUNEY92u81SPhu7sTdzQEiWY=
github.com/gorilla/mux v1.8.1/go.mod h1:AKf9I4AEqPTmMytcMc0KkNouC66V3BtZ4qD5fmWSiMQ=
github.com/influxdata/influxdb-client-go/v2 v2.14.0 h1:AjbBfJuq+QoaXNcrova8smSjwJdUHnwvfjMF71M1iI4=
github.com/influxdata/influxdb-client-go/v2 v2.14.0/go.mod h1:Ahpm3QXKMJslpXl3IftVLVezreAUtBOTZssDrjZEFHI=
github.com/gorilla/mux v1.8.0/go.mod h1:DVbg23sWSpFRCP0SfiEN6jmj59UnW/n46BH5rLB71So=
github.com/influxdata/influxdb-client-go/v2 v2.7.0 h1:QgP5mlBE9sGnzplpnf96pr+p7uqlIlL4W2GAP3n+XZg=
github.com/influxdata/influxdb-client-go/v2 v2.7.0/go.mod h1:Y/0W1+TZir7ypoQZYd2IrnVOKB3Tq6oegAQeSVN/+EU=
github.com/influxdata/line-protocol v0.0.0-20200327222509-2487e7298839/go.mod h1:xaLFMmpvUxqXtVkUJfg9QmT88cDaCJ3ZKgdZ78oO8Qo=
github.com/influxdata/line-protocol v0.0.0-20210922203350-b1ad95c89adf h1:7JTmneyiNEwVBOHSjoMxiWAqB992atOeepeFYegn5RU=
github.com/influxdata/line-protocol v0.0.0-20210922203350-b1ad95c89adf/go.mod h1:xaLFMmpvUxqXtVkUJfg9QmT88cDaCJ3ZKgdZ78oO8Qo=
github.com/influxdata/line-protocol-corpus v0.0.0-20210519164801-ca6fa5da0184/go.mod h1:03nmhxzZ7Xk2pdG+lmMd7mHDfeVOYFyhOgwO61qWU98=
github.com/influxdata/line-protocol-corpus v0.0.0-20210922080147-aa28ccfb8937 h1:MHJNQ+p99hFATQm6ORoLmpUCF7ovjwEFshs/NHzAbig=
github.com/influxdata/line-protocol-corpus v0.0.0-20210922080147-aa28ccfb8937/go.mod h1:BKR9c0uHSmRgM/se9JhFHtTT7JTO67X23MtKMHtZcpo=
github.com/influxdata/line-protocol/v2 v2.0.0-20210312151457-c52fdecb625a/go.mod h1:6+9Xt5Sq1rWx+glMgxhcg2c0DUaehK+5TDcPZ76GypY=
github.com/influxdata/line-protocol/v2 v2.1.0/go.mod h1:QKw43hdUBg3GTk2iC3iyCxksNj7PX9aUSeYOYE/ceHY=
github.com/influxdata/line-protocol/v2 v2.2.1 h1:EAPkqJ9Km4uAxtMRgUubJyqAr6zgWM0dznKMLRauQRE=
github.com/influxdata/line-protocol/v2 v2.2.1/go.mod h1:DmB3Cnh+3oxmG6LOBIxce4oaL4CPj3OmMPgvauXh+tM=
github.com/juju/gnuflag v0.0.0-20171113085948-2ce1bb71843d/go.mod h1:2PavIy+JPciBPrBUjwbNvtwB6RQlve+hkpll6QSNmOE=
github.com/klauspost/compress v1.18.0 h1:c/Cqfb0r+Yi+JtIEq73FWXVkRonBlf0CRNYc8Zttxdo=
github.com/klauspost/compress v1.18.0/go.mod h1:2Pp+KzxcywXVXMr50+X0Q/Lsb43OQHYWRCY2AiWywWQ=
github.com/kr/pretty v0.2.1/go.mod h1:ipq/a2n7PKx3OHsz4KJII5eveXtPO4qwEXGdVfWzfnI=
github.com/kr/pretty v0.3.1 h1:flRD4NNwYAUpkphVc1HcthR4KEIFJ65n8Mw5qdRn3LE=
github.com/kr/pretty v0.3.1/go.mod h1:hoEshYVHaxMs3cyo3Yncou5ZscifuDolrwPKZanG3xk=
github.com/klauspost/compress v1.13.4 h1:0zhec2I8zGnjWcKyLl6i3gPqKANCCn5e9xmviEEeX6s=
github.com/klauspost/compress v1.13.4/go.mod h1:8dP1Hq4DHOhN9w426knH3Rhby4rFm6D8eO+e+Dq5Gzg=
github.com/kr/pretty v0.1.0 h1:L/CwN0zerZDmRFUapSPitk6f+Q3+0za1rQkzVuMiMFI=
github.com/kr/pretty v0.1.0/go.mod h1:dAy3ld7l9f0ibDNOQOHHMYYIIbhfbHSm3C4ZsoJORNo=
github.com/kr/pty v1.1.1/go.mod h1:pFQYn66WHrOpPYNljwOMqo10TkYh1fy3cYio2l3bCsQ=
github.com/kr/text v0.1.0 h1:45sCR5RtlFHMR4UwH9sdQ5TC8v0qDQCHnXt+kaKSTVE=
github.com/kr/text v0.1.0/go.mod h1:4Jbv+DJW3UT/LiOwJeYQe1efqtUx/iVham/4vfdArNI=
github.com/kr/text v0.2.0 h1:5Nx0Ya0ZqY2ygV366QzturHI13Jq95ApcVaJBhpS+AY=
github.com/kr/text v0.2.0/go.mod h1:eLer722TekiGuMkidMxC/pM04lWEeraHUUmBw8l2grE=
github.com/kylelemons/godebug v1.1.0 h1:RPNrshWIDI6G2gRW9EHilWtl7Z6Sb1BR0xunSBf0SNc=
github.com/kylelemons/godebug v1.1.0/go.mod h1:9/0rRGxNHcop5bhtWyNeEfOS8JIWk580+fNqagV/RAw=
github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822 h1:C3w9PqII01/Oq1c1nUAm88MOHcQC9l5mIlSMApZMrHA=
github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822/go.mod h1:+n7T8mK8HuQTcFwEeznm/DIxMOiR9yIdICNftLE1DvQ=
github.com/nats-io/nats.go v1.43.0 h1:uRFZ2FEoRvP64+UUhaTokyS18XBCR/xM2vQZKO4i8ug=
github.com/nats-io/nats.go v1.43.0/go.mod h1:iRWIPokVIFbVijxuMQq4y9ttaBTMe0SFdlZfMDd+33g=
github.com/nats-io/nkeys v0.4.11 h1:q44qGV008kYd9W1b1nEBkNzvnWxtRSQ7A8BoqRrcfa0=
github.com/nats-io/nkeys v0.4.11/go.mod h1:szDimtgmfOi9n25JpfIdGw12tZFYXqhGxjhVxsatHVE=
github.com/labstack/echo/v4 v4.2.1/go.mod h1:AA49e0DZ8kk5jTOOCKNuPR6oTnBS0dYiM4FW1e6jwpg=
github.com/labstack/gommon v0.3.0/go.mod h1:MULnywXg0yavhxWKc+lOruYdAhDwPK9wf0OL7NoOu+k=
github.com/mailru/easyjson v0.0.0-20190614124828-94de47d64c63/go.mod h1:C1wdFJiN94OJF2b5HbByQZoLdCWB1Yqtg26g4irojpc=
github.com/mailru/easyjson v0.0.0-20190626092158-b2ccc519800e/go.mod h1:C1wdFJiN94OJF2b5HbByQZoLdCWB1Yqtg26g4irojpc=
github.com/matryer/moq v0.0.0-20190312154309-6cfb0558e1bd/go.mod h1:9ELz6aaclSIGnZBoaSLZ3NAl1VTufbOrXBPvtcy6WiQ=
github.com/mattn/go-colorable v0.1.2/go.mod h1:U0ppj6V5qS13XJ6of8GYAs25YV2eR4EVcfRqFIhoBtE=
github.com/mattn/go-colorable v0.1.7/go.mod h1:u6P/XSegPjTcexA+o6vUJrdnUu04hMope9wVRipJSqc=
github.com/mattn/go-colorable v0.1.8/go.mod h1:u6P/XSegPjTcexA+o6vUJrdnUu04hMope9wVRipJSqc=
github.com/mattn/go-isatty v0.0.8/go.mod h1:Iq45c/XA43vh69/j3iqttzPXn0bhXyGjM0Hdxcsrc5s=
github.com/mattn/go-isatty v0.0.9/go.mod h1:YNRxwqDuOph6SZLI9vUUz6OYw3QyUt7WiY2yME+cCiQ=
github.com/mattn/go-isatty v0.0.12/go.mod h1:cbi8OIDigv2wuxKPP5vlRcQ1OAZbq2CE4Kysco4FUpU=
github.com/mattn/go-sqlite3 v1.14.11 h1:gt+cp9c0XGqe9S/wAHTL3n/7MqY+siPWgWJgqdsFrzQ=
github.com/mattn/go-sqlite3 v1.14.11/go.mod h1:NyWgC/yNuGj7Q9rpYnZvas74GogHl5/Z4A/KQRfk6bU=
github.com/minio/highwayhash v1.0.1 h1:dZ6IIu8Z14VlC0VpfKofAhCy74wu/Qb5gcn52yWoz/0=
github.com/minio/highwayhash v1.0.1/go.mod h1:BQskDq+xkJ12lmlUUi7U0M5Swg3EWR+dLTk+kldvVxY=
github.com/nats-io/jwt/v2 v2.2.1-0.20220113022732-58e87895b296 h1:vU9tpM3apjYlLLeY23zRWJ9Zktr5jp+mloR942LEOpY=
github.com/nats-io/jwt/v2 v2.2.1-0.20220113022732-58e87895b296/go.mod h1:0tqz9Hlu6bCBFLWAASKhE5vUA4c24L9KPUUgvwumE/k=
github.com/nats-io/nats-server/v2 v2.7.0 h1:UpqcAM93FI7AHlCyI2FD5QcV3QuHNCauQF2LBVU0238=
github.com/nats-io/nats-server/v2 v2.7.0/go.mod h1:cjxtMhZsZovK1XS2iiapCduR8HuqB/YpFamL0qntIcw=
github.com/nats-io/nats.go v1.13.1-0.20211122170419-d7c1d78a50fc h1:SHr4MUUZJ/fAC0uSm2OzWOJYsHpapmR86mpw7q1qPXU=
github.com/nats-io/nats.go v1.13.1-0.20211122170419-d7c1d78a50fc/go.mod h1:BPko4oXsySz4aSWeFgOHLZs3G4Jq4ZAyE6/zMCxRT6w=
github.com/nats-io/nkeys v0.3.0 h1:cgM5tL53EvYRU+2YLXIK0G2mJtK12Ft9oeooSZMA2G8=
github.com/nats-io/nkeys v0.3.0/go.mod h1:gvUNGjVcM2IPr5rCsRsC6Wb3Hr2CQAm08dsxtV6A5y4=
github.com/nats-io/nuid v1.0.1 h1:5iA8DT8V7q8WK2EScv2padNa/rTESc1KdnPw4TC2paw=
github.com/nats-io/nuid v1.0.1/go.mod h1:19wcPz3Ph3q0Jbyiqsd0kePYG7A95tJPxeL+1OSON2c=
github.com/niemeyer/pretty v0.0.0-20200227124842-a10e7caefd8e/go.mod h1:zD1mROLANZcx1PVRCS0qkT7pwLkGfwJo4zjcN/Tysno=
github.com/oapi-codegen/runtime v1.1.1 h1:EXLHh0DXIJnWhdRPN2w4MXAzFyE4CskzhNLUmtpMYro=
github.com/oapi-codegen/runtime v1.1.1/go.mod h1:SK9X900oXmPWilYR5/WKPzt3Kqxn/uS/+lbpREv+eCg=
github.com/pkg/errors v0.8.1/go.mod h1:bwawxfHBFNV+L2hUp1rHADufV3IMtnDRdf1r5NINEl0=
github.com/pkg/errors v0.9.1 h1:FEBLx1zS214owpjy7qsBeixbURkuhQAwrK5UwLGTwt4=
github.com/pkg/errors v0.9.1/go.mod h1:bwawxfHBFNV+L2hUp1rHADufV3IMtnDRdf1r5NINEl0=
github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM=
github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4=
github.com/prometheus/client_golang v1.22.0 h1:rb93p9lokFEsctTys46VnV1kLCDpVZ0a/Y92Vm0Zc6Q=
github.com/prometheus/client_golang v1.22.0/go.mod h1:R7ljNsLXhuQXYZYtw6GAE9AZg8Y7vEW5scdCXrWRXC0=
github.com/prometheus/client_model v0.6.2 h1:oBsgwpGs7iVziMvrGhE53c/GrLUsZdHnqNwqPLxwZyk=
github.com/prometheus/client_model v0.6.2/go.mod h1:y3m2F6Gdpfy6Ut/GBsUqTWZqCUvMVzSfMLjcu6wAwpE=
github.com/prometheus/common v0.65.0 h1:QDwzd+G1twt//Kwj/Ww6E9FQq1iVMmODnILtW1t2VzE=
github.com/prometheus/common v0.65.0/go.mod h1:0gZns+BLRQ3V6NdaerOhMbwwRbNh9hkGINtQAsP5GS8=
github.com/prometheus/procfs v0.16.1 h1:hZ15bTNuirocR6u0JZ6BAHHmwS1p8B4P6MRqxtzMyRg=
github.com/prometheus/procfs v0.16.1/go.mod h1:teAbpZRB1iIAJYREa1LsoWUXykVXA1KlTmWl8x/U+Is=
github.com/rogpeppe/go-internal v1.10.0 h1:TMyTOH3F/DB16zRVcYyreMH6GnZZrwQVAoYjRBZyWFQ=
github.com/rogpeppe/go-internal v1.10.0/go.mod h1:UQnix2H7Ngw/k4C5ijL5+65zddjncjaFoBhdsK/akog=
github.com/santhosh-tekuri/jsonschema/v5 v5.3.1 h1:lZUw3E0/J3roVtGQ+SCrUrg3ON6NgVqpn3+iol9aGu4=
github.com/santhosh-tekuri/jsonschema/v5 v5.3.1/go.mod h1:uToXkOrWAZ6/Oc07xWQrPOhJotwFIyu2bBVN41fcDUY=
github.com/shopspring/decimal v1.3.1 h1:2Usl1nmF/WZucqkFZhnfFYxxxu8LG21F6nPQBE5gKV8=
github.com/shopspring/decimal v1.3.1/go.mod h1:DKyhrW/HYNuLGql+MJL6WCR6knT2jwCFRcu2hWCYk4o=
github.com/spkg/bom v0.0.0-20160624110644-59b7046e48ad/go.mod h1:qLr4V1qq6nMqFKkMo8ZTx3f+BZEkzsRUY10Xsm2mwU0=
github.com/stmcginnis/gofish v0.20.0 h1:hH2V2Qe898F2wWT1loApnkDUrXXiLKqbSlMaH3Y1n08=
github.com/stmcginnis/gofish v0.20.0/go.mod h1:PzF5i8ecRG9A2ol8XT64npKUunyraJ+7t0kYMpQAtqU=
github.com/stretchr/objx v0.1.0/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME=
github.com/stretchr/testify v1.3.0/go.mod h1:M5WIy9Dh21IEIfnGCwXGc5bZfKNJtfHm1UVUgZn+9EI=
github.com/stretchr/testify v1.10.0 h1:Xv5erBjTwe/5IxqUQTdXv5kgmIvbHo3QQyRwhJsOfJA=
github.com/stretchr/testify v1.10.0/go.mod h1:r2ic/lqez/lEtzL7wO/rwa5dbSLXVDPFyf8C91i36aY=
github.com/tklauser/go-sysconf v0.3.13 h1:GBUpcahXSpR2xN01jhkNAbTLRk2Yzgggk8IM08lq3r4=
github.com/tklauser/go-sysconf v0.3.13/go.mod h1:zwleP4Q4OehZHGn4CYZDipCgg9usW5IJePewFCGVEa0=
github.com/tklauser/numcpus v0.7.0 h1:yjuerZP127QG9m5Zh/mSO4wqurYil27tHrqwRoRjpr4=
github.com/tklauser/numcpus v0.7.0/go.mod h1:bb6dMVcj8A42tSE7i32fsIUCbQNllK5iDguyOZRUzAY=
golang.design/x/thread v0.0.0-20210122121316-335e9adffdf1 h1:P7S/GeHBAFEZIYp0ePPs2kHXoazz8q2KsyxHyQVGCJg=
golang.design/x/thread v0.0.0-20210122121316-335e9adffdf1/go.mod h1:9CWpnTUmlQkfdpdutA1nNf4iE5lAVt3QZOu0Z6hahBE=
golang.org/x/crypto v0.39.0 h1:SHs+kF4LP+f+p14esP5jAoDpHU8Gu/v9lFRK6IT5imM=
golang.org/x/crypto v0.39.0/go.mod h1:L+Xg3Wf6HoL4Bn4238Z6ft6KfEpN0tJGo53AAPC632U=
golang.org/x/exp v0.0.0-20250620022241-b7579e27df2b h1:M2rDM6z3Fhozi9O7NWsxAkg/yqS/lQJ6PmkyIV3YP+o=
golang.org/x/exp v0.0.0-20250620022241-b7579e27df2b/go.mod h1:3//PLf8L/X+8b4vuAfHzxeRUl04Adcb341+IGKfnqS8=
golang.org/x/net v0.41.0 h1:vBTly1HeNPEn3wtREYfy4GZ/NECgw2Cnl+nK6Nz3uvw=
golang.org/x/net v0.41.0/go.mod h1:B/K4NNqkfmg07DQYrbwvSluqCJOOXwUjeb/5lOisjbA=
golang.org/x/sys v0.0.0-20210122093101-04d7465088b8/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
golang.org/x/sys v0.33.0 h1:q3i8TbbEz+JRD9ywIRlyRAQbM0qF7hu24q3teo2hbuw=
golang.org/x/sys v0.33.0/go.mod h1:BJP2sWEmIv4KK5OTEluFJCKSidICx8ciO85XgH3Ak8k=
github.com/stretchr/testify v1.4.0/go.mod h1:j7eGeouHqKxXV5pUuKE4zz7dFj8WfuZ+81PSLYec5m4=
github.com/stretchr/testify v1.5.1 h1:nOGnQDM7FYENwehXlg/kFVnos3rEvtKTjRvOWSzb6H4=
github.com/stretchr/testify v1.5.1/go.mod h1:5W2xD1RspED5o8YsWQXVCued0rvSQ+mT+I5cxcmMvtA=
github.com/valyala/bytebufferpool v1.0.0/go.mod h1:6bBcMArwyJ5K/AmCkWv1jt77kVWyCJ6HpOuEn7z0Csc=
github.com/valyala/fasttemplate v1.0.1/go.mod h1:UQGH1tvbgY+Nz5t2n7tXsz52dQxojPUpymEIMZ47gx8=
github.com/valyala/fasttemplate v1.2.1/go.mod h1:KHLXt3tVN2HBp8eijSv/kGJopbvo7S+qRAEEKiv+SiQ=
golang.org/x/crypto v0.0.0-20190308221718-c2843e01d9a2/go.mod h1:djNgcEr1/C05ACkg1iLfiJU5Ep61QUkGW8qpdssI0+w=
golang.org/x/crypto v0.0.0-20200622213623-75b288015ac9/go.mod h1:LzIPMQfyMNhhGPhUkYOs5KpL4U8rLKemX1yGLhDgUto=
golang.org/x/crypto v0.0.0-20200820211705-5c72a883971a/go.mod h1:LzIPMQfyMNhhGPhUkYOs5KpL4U8rLKemX1yGLhDgUto=
golang.org/x/crypto v0.0.0-20201221181555-eec23a3978ad/go.mod h1:jdWPYTVW3xRLrWPugEBEK3UY2ZEsg3UU495nc5E+M+I=
golang.org/x/crypto v0.0.0-20210314154223-e6e6c4f2bb5b/go.mod h1:T9bdIzuCu7OtxOm1hfPfRQxPLYneinmdGuTeoZ9dtd4=
golang.org/x/crypto v0.0.0-20220112180741-5e0467b6c7ce h1:Roh6XWxHFKrPgC/EQhVubSAGQ6Ozk6IdxHSzt1mR0EI=
golang.org/x/crypto v0.0.0-20220112180741-5e0467b6c7ce/go.mod h1:IxCIyHEi3zRg3s0A5j5BB6A9Jmi73HwBIUl50j+osU4=
golang.org/x/net v0.0.0-20190404232315-eb5bcb51f2a3/go.mod h1:t9HGtf8HONx5eT2rtn7q6eTqICYqUVnKs3thJo3Qplg=
golang.org/x/net v0.0.0-20190620200207-3b0461eec859/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s=
golang.org/x/net v0.0.0-20200822124328-c89045814202/go.mod h1:/O7V0waA8r7cgGh81Ro3o1hOxt32SMVPicZroKQ2sZA=
golang.org/x/net v0.0.0-20210119194325-5f4716e94777/go.mod h1:m0MpNAwzfU5UDzcl9v0D8zg8gWTRqZa9RBIspLL5mdg=
golang.org/x/net v0.0.0-20210226172049-e18ecbb05110/go.mod h1:m0MpNAwzfU5UDzcl9v0D8zg8gWTRqZa9RBIspLL5mdg=
golang.org/x/net v0.0.0-20211112202133-69e39bad7dc2 h1:CIJ76btIcR3eFI5EgSo6k1qKw9KJexJuRLI9G7Hp5wE=
golang.org/x/net v0.0.0-20211112202133-69e39bad7dc2/go.mod h1:9nx3DQGgdP8bBQD5qxJ1jj9UTztislL4KSBs9R2vV5Y=
golang.org/x/sync v0.0.0-20190423024810-112230192c58/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
golang.org/x/sys v0.0.0-20190130150945-aca44879d564/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY=
golang.org/x/sys v0.0.0-20190215142949-d0b11bdaac8a/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY=
golang.org/x/sys v0.0.0-20190222072716-a9d3bda3a223/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY=
golang.org/x/sys v0.0.0-20190412213103-97732733099d/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
golang.org/x/sys v0.0.0-20190813064441-fde4db37ae7a/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
golang.org/x/sys v0.0.0-20191026070338-33540a1f6037/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
golang.org/x/sys v0.0.0-20200116001909-b77594299b42/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
golang.org/x/sys v0.0.0-20200223170610-d5e6a3e2c0ae/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
golang.org/x/sys v0.0.0-20200323222414-85ca7c5b95cd/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
golang.org/x/sys v0.0.0-20200826173525-f9321e4c35a6/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
golang.org/x/sys v0.0.0-20201119102817-f84b799fce68/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
golang.org/x/sys v0.0.0-20210124154548-22da62e12c0c/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
golang.org/x/sys v0.0.0-20210423082822-04245dca01da/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
golang.org/x/sys v0.0.0-20210615035016-665e8c7367d1/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
golang.org/x/sys v0.0.0-20220111092808-5a964db01320/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
golang.org/x/sys v0.0.0-20220114195835-da31bd327af9 h1:XfKQ4OlFl8okEOr5UvAqFRVj8pY/4yfcXrddB8qAbU0=
golang.org/x/sys v0.0.0-20220114195835-da31bd327af9/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
golang.org/x/term v0.0.0-20201117132131-f5c789dd3221/go.mod h1:Nr5EML6q2oocZ2LXRh80K7BxOlk5/8JxuGnuhpl+muw=
golang.org/x/term v0.0.0-20201126162022-7de9c90e9dd1/go.mod h1:bj7SfCRtBDWHUb9snDiAeCFNEtKQo2Wmx5Cou7ajbmo=
golang.org/x/text v0.3.0/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ=
golang.org/x/text v0.3.3/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ=
golang.org/x/text v0.3.5/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ=
golang.org/x/text v0.3.6/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ=
golang.org/x/time v0.0.0-20201208040808-7e3f01d25324/go.mod h1:tRJNPiyCQ0inRvYxbN9jk5I+vvW/OXSQhTDSoE431IQ=
golang.org/x/time v0.0.0-20210220033141-f8bda1e9f3ba/go.mod h1:tRJNPiyCQ0inRvYxbN9jk5I+vvW/OXSQhTDSoE431IQ=
golang.org/x/time v0.0.0-20211116232009-f0f3c7e86c11 h1:GZokNIeuVkl3aZHJchRrr13WCsols02MLUcz1U9is6M=
golang.org/x/time v0.0.0-20211116232009-f0f3c7e86c11/go.mod h1:tRJNPiyCQ0inRvYxbN9jk5I+vvW/OXSQhTDSoE431IQ=
golang.org/x/tools v0.0.0-20180917221912-90fa682c2a6e/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ=
golang.org/x/tools v0.0.0-20191125144606-a911d9008d1f/go.mod h1:b+2E5dAYhXwXZwtnZ6UAqBI28+e2cm9otk0dWdXHAEo=
golang.org/x/xerrors v0.0.0-20190717185122-a985d3407aa7/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0=
golang.org/x/xerrors v0.0.0-20191204190536-9bdfabe68543/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0=
google.golang.org/protobuf v1.36.6 h1:z1NpPI8ku2WgiWnf+t9wTPsn6eP1L7ksHUlkfLvd9xY=
google.golang.org/protobuf v1.36.6/go.mod h1:jduwjTPXsFjZGTmRluh+L6NjiWu7pchiJ2/5YcXBHnY=
google.golang.org/protobuf v0.0.0-20200109180630-ec00e32a8dfd/go.mod h1:DFci5gLYBciE7Vtevhsrf46CRTquxDuWsQurQQe4oz8=
google.golang.org/protobuf v0.0.0-20200221191635-4d8936d0db64/go.mod h1:kwYJMbMJ01Woi6D6+Kah6886xMZcty6N08ah7+eCXa0=
google.golang.org/protobuf v0.0.0-20200228230310-ab0ca4ff8a60/go.mod h1:cfTl7dwQJ+fmap5saPgwCLgHXTUD7jkjRqWcaiX5VyM=
google.golang.org/protobuf v1.20.1-0.20200309200217-e05f789c0967/go.mod h1:A+miEFZTKqfCUM6K7xSMQL9OKL/b6hQv+e19PK+JZNE=
google.golang.org/protobuf v1.21.0/go.mod h1:47Nbq4nVaFHyn7ilMalzfO3qCViNmqZ2kzikPIcrTAo=
google.golang.org/protobuf v1.23.0/go.mod h1:EGpADcykh3NcUnDUJcl1+ZksZNG86OlYog2l/sGQquU=
google.golang.org/protobuf v1.26.0-rc.1/go.mod h1:jlhhOSvTdKEhbULTjvd4ARK9grFBp09yW+WbY/TyQbw=
google.golang.org/protobuf v1.26.0/go.mod h1:9q0QmTI4eRPtz6boOQmLYwt+qCgq0jsYwAQnmE0givc=
google.golang.org/protobuf v1.27.1 h1:SnqbnDw1V7RiZcXPx5MEeqPv2s79L9i7BJUlG/+RurQ=
google.golang.org/protobuf v1.27.1/go.mod h1:9q0QmTI4eRPtz6boOQmLYwt+qCgq0jsYwAQnmE0givc=
gopkg.in/Knetic/govaluate.v2 v2.3.0 h1:naJVc9CZlWA8rC8f5mvECJD7jreTrn7FvGXjBthkHJQ=
gopkg.in/Knetic/govaluate.v2 v2.3.0/go.mod h1:NW0gr10J8s7aNghEg6uhdxiEaBvc0+8VgJjVViHUKp4=
gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0=
gopkg.in/check.v1 v1.0.0-20200227125254-8fa46927fb4f/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0=
gopkg.in/yaml.v3 v3.0.0-20200615113413-eeeca48fe776/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM=
gopkg.in/yaml.v3 v3.0.1 h1:fxVm/GzAzEWqLHuvctI91KS9hhNmmWOoWu0XTYJS7CA=
gopkg.in/yaml.v3 v3.0.1/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM=
gopkg.in/check.v1 v1.0.0-20180628173108-788fd7840127 h1:qIbj1fsPNlZgppZ+VLlY7N33q108Sa+fhmuc+sWQYwY=
gopkg.in/check.v1 v1.0.0-20180628173108-788fd7840127/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0=
gopkg.in/yaml.v2 v2.2.2/go.mod h1:hI93XBmqTisBFMUTm0b8Fm+jr3Dg1NNxqwp+5A1VGuI=
gopkg.in/yaml.v2 v2.3.0 h1:clyUAQHOM3G0M3f5vQj7LuJrETvjVot3Z5el9nffUtU=
gopkg.in/yaml.v2 v2.3.0/go.mod h1:hI93XBmqTisBFMUTm0b8Fm+jr3Dg1NNxqwp+5A1VGuI=

View File

@@ -0,0 +1,113 @@
package cclogger
import (
"fmt"
"log"
"os"
"runtime"
)
var (
globalDebug = false
stdout = os.Stdout
stderr = os.Stderr
debugLog *log.Logger = nil
infoLog *log.Logger = nil
errorLog *log.Logger = nil
warnLog *log.Logger = nil
defaultLog *log.Logger = nil
)
func initLogger() {
if debugLog == nil {
debugLog = log.New(stderr, "DEBUG ", log.LstdFlags)
}
if infoLog == nil {
infoLog = log.New(stdout, "INFO ", log.LstdFlags)
}
if errorLog == nil {
errorLog = log.New(stderr, "ERROR ", log.LstdFlags)
}
if warnLog == nil {
warnLog = log.New(stderr, "WARN ", log.LstdFlags)
}
if defaultLog == nil {
defaultLog = log.New(stdout, "", log.LstdFlags)
}
}
func Print(e ...interface{}) {
initLogger()
defaultLog.Print(e...)
}
func ComponentPrint(component string, e ...interface{}) {
initLogger()
defaultLog.Print(fmt.Sprintf("[%s] ", component), e)
}
func Info(e ...interface{}) {
initLogger()
infoLog.Print(e...)
}
func ComponentInfo(component string, e ...interface{}) {
initLogger()
infoLog.Print(fmt.Sprintf("[%s] ", component), e)
}
func Debug(e ...interface{}) {
initLogger()
if globalDebug {
debugLog.Print(e...)
}
}
func ComponentDebug(component string, e ...interface{}) {
initLogger()
if globalDebug && debugLog != nil {
//CCComponentPrint(debugLog, component, e)
debugLog.Print(fmt.Sprintf("[%s] ", component), e)
}
}
func Error(e ...interface{}) {
initLogger()
_, fn, line, _ := runtime.Caller(1)
errorLog.Print(fmt.Sprintf("[%s:%d] ", fn, line), e)
}
func ComponentError(component string, e ...interface{}) {
initLogger()
_, fn, line, _ := runtime.Caller(1)
errorLog.Print(fmt.Sprintf("[%s|%s:%d] ", component, fn, line), e)
}
func SetDebug() {
globalDebug = true
initLogger()
}
func SetOutput(filename string) {
if filename == "stderr" {
if stderr != os.Stderr && stderr != os.Stdout {
stderr.Close()
}
stderr = os.Stderr
} else if filename == "stdout" {
if stderr != os.Stderr && stderr != os.Stdout {
stderr.Close()
}
stderr = os.Stdout
} else {
file, err := os.OpenFile(filename, os.O_APPEND|os.O_CREATE|os.O_WRONLY, 0600)
if err == nil {
defer file.Close()
stderr = file
}
}
debugLog = nil
errorLog = nil
warnLog = nil
initLogger()
}

View File

@@ -0,0 +1,32 @@
# ClusterCockpit metrics
As described in the [ClusterCockpit specifications](https://github.com/ClusterCockpit/cc-specifications), the whole ClusterCockpit stack uses metrics in the InfluxDB line protocol format. This is also the input and output format for the ClusterCockpit Metric Collector but internally it uses an extended format while processing, named CCMetric.
It is basically a copy of the [InfluxDB line protocol](https://github.com/influxdata/line-protocol) `MutableMetric` interface with one extension. Besides the tags and fields, it contains a list of meta information (re-using the `Tag` structure of the original protocol):
```golang
type ccMetric struct {
name string // same as
tags []*influx.Tag // original
fields []*influx.Field // Influx
tm time.Time // line-protocol
meta []*influx.Tag
}
type CCMetric interface {
influx.MutableMetric // the same functions as defined by influx.MutableMetric
RemoveTag(key string) // this is not published by the original influx.MutableMetric
Meta() map[string]string
MetaList() []*inlux.Tag
AddMeta(key, value string)
HasMeta(key string) bool
GetMeta(key string) (string, bool)
RemoveMeta(key string)
}
```
The `CCMetric` interface provides the same functions as the `MutableMetric` like `{Add, Remove, Has}{Tag, Field}` and additionally provides `{Add, Remove, Has}Meta`.
The InfluxDB protocol creates a new metric with `influx.New(name, tags, fields, time)` while CCMetric uses `ccMetric.New(name, tags, meta, fields, time)` where `tags` and `meta` are both of type `map[string]string`.
You can copy a CCMetric with `FromMetric(other CCMetric) CCMetric`. If you get an `influx.Metric` from a function, like the line protocol parser, you can use `FromInfluxMetric(other influx.Metric) CCMetric` to get a CCMetric out of it (see `NatsReceiver` for an example).

View File

@@ -0,0 +1,422 @@
package ccmetric
import (
"fmt"
"sort"
"time"
lp "github.com/influxdata/line-protocol" // MIT license
)
// Most functions are derived from github.com/influxdata/line-protocol/metric.go
// The metric type is extended with an extra meta information list re-using the Tag
// type.
type ccMetric struct {
name string
tags []*lp.Tag
fields []*lp.Field
tm time.Time
meta []*lp.Tag
}
type CCMetric interface {
lp.MutableMetric
Name() string
AddTag(key, value string)
GetTag(key string) (string, bool)
HasTag(key string) bool
RemoveTag(key string)
Tags() map[string]string
TagList() []*lp.Tag
AddMeta(key, value string)
GetMeta(key string) (string, bool)
HasMeta(key string) bool
RemoveMeta(key string)
Meta() map[string]string
MetaList() []*lp.Tag
AddField(key string, value interface{})
GetField(key string) (interface{}, bool)
HasField(key string) bool
RemoveField(key string)
Fields() map[string]interface{}
FieldList() []*lp.Field
String() string
SetTime(t time.Time)
}
func (m *ccMetric) Meta() map[string]string {
meta := make(map[string]string, len(m.meta))
for _, m := range m.meta {
meta[m.Key] = m.Value
}
return meta
}
func (m *ccMetric) MetaList() []*lp.Tag {
return m.meta
}
func (m *ccMetric) String() string {
return fmt.Sprintf("%s %v %v %v %d", m.name, m.Tags(), m.Meta(), m.Fields(), m.tm.UnixNano())
}
func (m *ccMetric) Name() string {
return m.name
}
func (m *ccMetric) Tags() map[string]string {
tags := make(map[string]string, len(m.tags))
for _, tag := range m.tags {
tags[tag.Key] = tag.Value
}
return tags
}
func (m *ccMetric) TagList() []*lp.Tag {
return m.tags
}
func (m *ccMetric) Fields() map[string]interface{} {
fields := make(map[string]interface{}, len(m.fields))
for _, field := range m.fields {
fields[field.Key] = field.Value
}
return fields
}
func (m *ccMetric) FieldList() []*lp.Field {
return m.fields
}
func (m *ccMetric) Time() time.Time {
return m.tm
}
func (m *ccMetric) SetTime(t time.Time) {
m.tm = t
}
func (m *ccMetric) HasTag(key string) bool {
for _, tag := range m.tags {
if tag.Key == key {
return true
}
}
return false
}
func (m *ccMetric) GetTag(key string) (string, bool) {
for _, tag := range m.tags {
if tag.Key == key {
return tag.Value, true
}
}
return "", false
}
func (m *ccMetric) RemoveTag(key string) {
for i, tag := range m.tags {
if tag.Key == key {
copy(m.tags[i:], m.tags[i+1:])
m.tags[len(m.tags)-1] = nil
m.tags = m.tags[:len(m.tags)-1]
return
}
}
}
func (m *ccMetric) AddTag(key, value string) {
for i, tag := range m.tags {
if key > tag.Key {
continue
}
if key == tag.Key {
tag.Value = value
return
}
m.tags = append(m.tags, nil)
copy(m.tags[i+1:], m.tags[i:])
m.tags[i] = &lp.Tag{Key: key, Value: value}
return
}
m.tags = append(m.tags, &lp.Tag{Key: key, Value: value})
}
func (m *ccMetric) HasMeta(key string) bool {
for _, tag := range m.meta {
if tag.Key == key {
return true
}
}
return false
}
func (m *ccMetric) GetMeta(key string) (string, bool) {
for _, tag := range m.meta {
if tag.Key == key {
return tag.Value, true
}
}
return "", false
}
func (m *ccMetric) RemoveMeta(key string) {
for i, tag := range m.meta {
if tag.Key == key {
copy(m.meta[i:], m.meta[i+1:])
m.meta[len(m.meta)-1] = nil
m.meta = m.meta[:len(m.meta)-1]
return
}
}
}
func (m *ccMetric) AddMeta(key, value string) {
for i, tag := range m.meta {
if key > tag.Key {
continue
}
if key == tag.Key {
tag.Value = value
return
}
m.meta = append(m.meta, nil)
copy(m.meta[i+1:], m.meta[i:])
m.meta[i] = &lp.Tag{Key: key, Value: value}
return
}
m.meta = append(m.meta, &lp.Tag{Key: key, Value: value})
}
func (m *ccMetric) AddField(key string, value interface{}) {
for i, field := range m.fields {
if key == field.Key {
m.fields[i] = &lp.Field{Key: key, Value: convertField(value)}
return
}
}
m.fields = append(m.fields, &lp.Field{Key: key, Value: convertField(value)})
}
func (m *ccMetric) GetField(key string) (interface{}, bool) {
for _, field := range m.fields {
if field.Key == key {
return field.Value, true
}
}
return "", false
}
func (m *ccMetric) HasField(key string) bool {
for _, field := range m.fields {
if field.Key == key {
return true
}
}
return false
}
func (m *ccMetric) RemoveField(key string) {
for i, field := range m.fields {
if field.Key == key {
copy(m.fields[i:], m.fields[i+1:])
m.fields[len(m.fields)-1] = nil
m.fields = m.fields[:len(m.fields)-1]
return
}
}
}
func New(
name string,
tags map[string]string,
meta map[string]string,
fields map[string]interface{},
tm time.Time,
) (CCMetric, error) {
m := &ccMetric{
name: name,
tags: nil,
fields: nil,
tm: tm,
meta: nil,
}
if len(tags) > 0 {
m.tags = make([]*lp.Tag, 0, len(tags))
for k, v := range tags {
m.tags = append(m.tags,
&lp.Tag{Key: k, Value: v})
}
sort.Slice(m.tags, func(i, j int) bool { return m.tags[i].Key < m.tags[j].Key })
}
if len(meta) > 0 {
m.meta = make([]*lp.Tag, 0, len(meta))
for k, v := range meta {
m.meta = append(m.meta,
&lp.Tag{Key: k, Value: v})
}
sort.Slice(m.meta, func(i, j int) bool { return m.meta[i].Key < m.meta[j].Key })
}
if len(fields) > 0 {
m.fields = make([]*lp.Field, 0, len(fields))
for k, v := range fields {
v := convertField(v)
if v == nil {
continue
}
m.AddField(k, v)
}
}
return m, nil
}
func FromMetric(other CCMetric) CCMetric {
m := &ccMetric{
name: other.Name(),
tags: make([]*lp.Tag, len(other.TagList())),
fields: make([]*lp.Field, len(other.FieldList())),
meta: make([]*lp.Tag, len(other.MetaList())),
tm: other.Time(),
}
for i, tag := range other.TagList() {
m.tags[i] = &lp.Tag{Key: tag.Key, Value: tag.Value}
}
for i, s := range other.MetaList() {
m.meta[i] = &lp.Tag{Key: s.Key, Value: s.Value}
}
for i, field := range other.FieldList() {
m.fields[i] = &lp.Field{Key: field.Key, Value: field.Value}
}
return m
}
func FromInfluxMetric(other lp.Metric) CCMetric {
m := &ccMetric{
name: other.Name(),
tags: make([]*lp.Tag, len(other.TagList())),
fields: make([]*lp.Field, len(other.FieldList())),
meta: make([]*lp.Tag, 0),
tm: other.Time(),
}
for i, tag := range other.TagList() {
m.tags[i] = &lp.Tag{Key: tag.Key, Value: tag.Value}
}
for i, field := range other.FieldList() {
m.fields[i] = &lp.Field{Key: field.Key, Value: field.Value}
}
return m
}
func convertField(v interface{}) interface{} {
switch v := v.(type) {
case float64:
return v
case int64:
return v
case string:
return v
case bool:
return v
case int:
return int64(v)
case uint:
return uint64(v)
case uint64:
return uint64(v)
case []byte:
return string(v)
case int32:
return int64(v)
case int16:
return int64(v)
case int8:
return int64(v)
case uint32:
return uint64(v)
case uint16:
return uint64(v)
case uint8:
return uint64(v)
case float32:
return float64(v)
case *float64:
if v != nil {
return *v
}
case *int64:
if v != nil {
return *v
}
case *string:
if v != nil {
return *v
}
case *bool:
if v != nil {
return *v
}
case *int:
if v != nil {
return int64(*v)
}
case *uint:
if v != nil {
return uint64(*v)
}
case *uint64:
if v != nil {
return uint64(*v)
}
case *[]byte:
if v != nil {
return string(*v)
}
case *int32:
if v != nil {
return int64(*v)
}
case *int16:
if v != nil {
return int64(*v)
}
case *int8:
if v != nil {
return int64(*v)
}
case *uint32:
if v != nil {
return uint64(*v)
}
case *uint16:
if v != nil {
return uint64(*v)
}
case *uint8:
if v != nil {
return uint64(*v)
}
case *float32:
if v != nil {
return float64(*v)
}
default:
return nil
}
return nil
}

View File

@@ -0,0 +1,277 @@
package ccTopology
import (
"fmt"
"io/ioutil"
"log"
"os"
"path/filepath"
"strconv"
"strings"
cclogger "github.com/ClusterCockpit/cc-metric-collector/internal/ccLogger"
)
// intArrayContains scans an array of ints if the value str is present in the array
// If the specified value is found, the corresponding array index is returned.
// The bool value is used to signal success or failure
func intArrayContains(array []int, str int) (int, bool) {
for i, a := range array {
if a == str {
return i, true
}
}
return -1, false
}
// stringArrayContains scans an array of strings if the value str is present in the array
// If the specified value is found, the corresponding array index is returned.
// The bool value is used to signal success or failure
// func stringArrayContains(array []string, str string) (int, bool) {
// for i, a := range array {
// if a == str {
// return i, true
// }
// }
// return -1, false
// }
func SocketList() []int {
buffer, err := ioutil.ReadFile("/proc/cpuinfo")
if err != nil {
log.Print(err)
return nil
}
ll := strings.Split(string(buffer), "\n")
var packs []int
for _, line := range ll {
if strings.HasPrefix(line, "physical id") {
lv := strings.Fields(line)
id, err := strconv.ParseInt(lv[3], 10, 32)
if err != nil {
log.Print(err)
return packs
}
_, found := intArrayContains(packs, int(id))
if !found {
packs = append(packs, int(id))
}
}
}
return packs
}
func CpuList() []int {
buffer, err := ioutil.ReadFile("/proc/cpuinfo")
if err != nil {
log.Print(err)
return nil
}
ll := strings.Split(string(buffer), "\n")
var cpulist []int
for _, line := range ll {
if strings.HasPrefix(line, "processor") {
lv := strings.Fields(line)
id, err := strconv.ParseInt(lv[2], 10, 32)
if err != nil {
log.Print(err)
return cpulist
}
_, found := intArrayContains(cpulist, int(id))
if !found {
cpulist = append(cpulist, int(id))
}
}
}
return cpulist
}
type CpuEntry struct {
Cpuid int
SMT int
Core int
Socket int
Numadomain int
Die int
}
func CpuData() []CpuEntry {
fileToInt := func(path string) int {
buffer, err := ioutil.ReadFile(path)
if err != nil {
log.Print(err)
cclogger.ComponentError("ccTopology", "Reading", path, ":", err.Error())
return -1
}
sbuffer := strings.Replace(string(buffer), "\n", "", -1)
var id int64
//_, err = fmt.Scanf("%d", sbuffer, &id)
id, err = strconv.ParseInt(sbuffer, 10, 32)
if err != nil {
cclogger.ComponentError("ccTopology", "Parsing", path, ":", sbuffer, err.Error())
return -1
}
return int(id)
}
getCore := func(basepath string) int {
return fileToInt(fmt.Sprintf("%s/core_id", basepath))
}
getSocket := func(basepath string) int {
return fileToInt(fmt.Sprintf("%s/physical_package_id", basepath))
}
getDie := func(basepath string) int {
return fileToInt(fmt.Sprintf("%s/die_id", basepath))
}
getSMT := func(cpuid int, basepath string) int {
buffer, err := ioutil.ReadFile(fmt.Sprintf("%s/thread_siblings_list", basepath))
if err != nil {
log.Print(err)
}
threadlist := make([]int, 0)
sbuffer := strings.Replace(string(buffer), "\n", "", -1)
for _, x := range strings.Split(sbuffer, ",") {
id, err := strconv.ParseInt(x, 10, 32)
if err != nil {
log.Print(err)
}
threadlist = append(threadlist, int(id))
}
for i, x := range threadlist {
if x == cpuid {
return i
}
}
return 1
}
getNumaDomain := func(basepath string) int {
files, err := filepath.Glob(fmt.Sprintf("%s/node*", basepath))
if err != nil {
log.Print(err)
}
for _, f := range files {
finfo, err := os.Lstat(f)
if err == nil && (finfo.IsDir() || finfo.Mode()&os.ModeSymlink != 0) {
var id int
parts := strings.Split(f, "/")
_, err = fmt.Scanf("node%d", parts[len(parts)-1], &id)
if err == nil {
return id
}
}
}
return 0
}
clist := make([]CpuEntry, 0)
for _, c := range CpuList() {
clist = append(clist, CpuEntry{Cpuid: c})
}
for _, centry := range clist {
centry.Socket = -1
centry.Numadomain = -1
centry.Die = -1
centry.Core = -1
// Set base directory for topology lookup
base := fmt.Sprintf("/sys/devices/system/cpu/cpu%d/topology", centry.Cpuid)
// Lookup CPU core id
centry.Core = getCore(base)
// Lookup CPU socket id
centry.Socket = getSocket(base)
// Lookup CPU die id
centry.Die = getDie(base)
// Lookup SMT thread id
centry.SMT = getSMT(centry.Cpuid, base)
// Lookup NUMA domain id
centry.Numadomain = getNumaDomain(base)
}
return clist
}
type CpuInformation struct {
NumHWthreads int
SMTWidth int
NumSockets int
NumDies int
NumNumaDomains int
}
func CpuInfo() CpuInformation {
var c CpuInformation
smt := 0
numa := 0
die := 0
socket := 0
cdata := CpuData()
for _, d := range cdata {
if d.SMT > smt {
smt = d.SMT
}
if d.Numadomain > numa {
numa = d.Numadomain
}
if d.Die > die {
die = d.Die
}
if d.Socket > socket {
socket = d.Socket
}
}
c.NumNumaDomains = numa + 1
c.SMTWidth = smt + 1
c.NumDies = die + 1
c.NumSockets = socket + 1
c.NumHWthreads = len(cdata)
return c
}
func GetCpuSocket(cpuid int) int {
cdata := CpuData()
for _, d := range cdata {
if d.Cpuid == cpuid {
return d.Socket
}
}
return -1
}
func GetCpuNumaDomain(cpuid int) int {
cdata := CpuData()
for _, d := range cdata {
if d.Cpuid == cpuid {
return d.Numadomain
}
}
return -1
}
func GetCpuDie(cpuid int) int {
cdata := CpuData()
for _, d := range cdata {
if d.Cpuid == cpuid {
return d.Die
}
}
return -1
}
func GetCpuCore(cpuid int) int {
cdata := CpuData()
for _, d := range cdata {
if d.Cpuid == cpuid {
return d.Core
}
}
return -1
}

View File

@@ -1,49 +0,0 @@
<!--
---
title: Metric Aggregator
description: Subsystem for evaluating expressions on metrics (deprecated)
categories: [cc-metric-collector]
tags: ['Developer']
weight: 1
hugo_path: docs/reference/cc-metric-collector/internal/metricaggregator/_index.md
---
-->
# The MetricAggregator
In some cases, further combination of metrics or raw values is required. For that strings like `foo + 1` with runtime dependent `foo` need to be evaluated. The MetricAggregator relies on the [`gval`](https://github.com/PaesslerAG/gval) Golang package to perform all expression evaluation. The `gval` package provides the basic arithmetic operations but the MetricAggregator defines additional ones.
**Note**: To get an impression which expressions can be handled by `gval`, see its [README](https://github.com/PaesslerAG/gval/blob/master/README.md)
## Simple expression evaluation
For simple expression evaluation, the MetricAggregator provides two function for different use-cases:
- `EvalBoolCondition(expression string, params map[string]interface{}`: Used by the MetricRouter to match metrics like `metric.Name() == 'mymetric'`
- `EvalFloat64Condition(expression string, params map[string]interface{})`: Used by the MetricRouter and LikwidCollector to derive new values like `(PMC0+PMC1)/PMC3`
## MetricAggregator extensions for `gval`
The MetricAggregator provides these functions additional to the `Full` language in `gval`:
- `sum(array)`: Sum up values in an array like `sum(values)`
- `min(array)`: Get the minimum value in an array like `min(values)`
- `avg(array)`: Get the mean value in an array like `avg(values)`
- `mean(array)`: Get the mean value in an array like `mean(values)`
- `max(array)`: Get the maximum value in an array like `max(values)`
- `len(array)`: Get the length of an array like `len(values)`
- `median(array)`: Get the median value in an array like `mean(values)`
- `in`: Check existence in an array like `0 in getCpuList()` to check whether there is an entry `0`. Also substring matching works like `temp in metric.Name()`
- `match`: Regular-expression matching like `match('temp_cores_%d+', metric.Name())`. **Note** all `\` in an regex has to be replaced with `%`
- `getCpuCore(cpuid)`: For a CPU id, the the corresponding CPU core id like `getCpuCore(0)`
- `getCpuSocket(cpuid)`: For a CPU id, the the corresponding CPU socket id
- `getCpuNuma(cpuid)`: For a CPU id, the the corresponding NUMA domain id
- `getCpuDie(cpuid)`: For a CPU id, the the corresponding CPU die id
- `getSockCpuList(sockid)`: For a given CPU socket id, the list of CPU ids is returned like the CPUs on socket 1 `getSockCpuList(1)`
- `getNumaCpuList(numaid)`: For a given NUMA node id, the list of CPU ids is returned
- `getDieCpuList(dieid)`: For a given CPU die id, the list of CPU ids is returned
- `getCoreCpuList(coreid)`: For a given CPU core id, the list of CPU ids is returned
- `getCpuList`: Get the list of all CPUs
## Limitations
- Since the metrics are written in JSON files which do not allow `""` without proper escaping inside of JSON strings, you have to use `''` for strings.
- Since `\` is interpreted by JSON as escape character, it cannot be used in metrics. But it is required to write regular expressions. So instead of `/`, use `%` and the MetricAggregator replaces them after reading the JSON file.

View File

@@ -1,32 +1,11 @@
<!--
---
title: Message Router
description: Routing component inside cc-metric-collector
categories: [cc-metric-collector]
tags: ['Developer']
weight: 1
hugo_path: docs/reference/cc-metric-collector/internal/metricrouter/_index.md
---
-->
# CC Metric Router
The CCMetric router sits in between the collectors and the sinks and can be used to add and remove tags to/from traversing [CCMessages](https://pkg.go.dev/github.com/ClusterCockpit/cc-lib/ccMessage).
The CCMetric router sits in between the collectors and the sinks and can be used to add and remove tags to/from traversing [CCMetrics](../ccMetric/README.md).
# Configuration
**Note**: Use the [message processor configuration](https://github.com/ClusterCockpit/cc-lib/blob/main/messageProcessor/README.md) with option `process_messages`.
```json
{
"num_cache_intervals" : 1,
"interval_timestamp" : true,
"hostname_tag" : "hostname",
"max_forward" : 50,
"process_messages": {
"see": "pkg/messageProcessor/README.md"
},
"add_tags" : [
{
"key" : "cluster",
@@ -46,91 +25,16 @@ The CCMetric router sits in between the collectors and the sinks and can be used
"if" : "*"
}
],
"interval_aggregates" : [
{
"name" : "temp_cores_avg",
"if" : "match('temp_core_%d+', metric.Name())",
"function" : "avg(values)",
"tags" : {
"type" : "node"
},
"meta" : {
"group": "IPMI",
"unit": "degC",
"source": "TempCollector"
}
}
],
"drop_metrics" : [
"not_interesting_metric_at_all"
],
"drop_metrics_if" : [
"match('temp_core_%d+', metric.Name())"
],
"rename_metrics" : {
"metric_12345" : "mymetric"
},
"normalize_units" : true,
"change_unit_prefix" : {
"mem_used" : "G",
"mem_total" : "G"
}
"interval_timestamp" : true
}
```
There are three main options `add_tags`, `delete_tags` and `interval_timestamp`. `add_tags` and `delete_tags` are lists consisting of dicts with `key`, `value` and `if`. The `value` can be omitted in the `delete_tags` part as it only uses the `key` for removal. The `interval_timestamp` setting means that a unique timestamp is applied to all metrics traversing the router during an interval.
**Note**: Use the [message processor configuration](https://github.com/ClusterCockpit/cc-lib/blob/main/messageProcessor/README.md) (option `process_messages`) instead of `add_tags`, `delete_tags`, `drop_metrics`, `drop_metrics_if`, `rename_metrics`, `normalize_units` and `change_unit_prefix`. These options are deprecated and will be removed in future versions. Until then, they are added to the message processor.
# Conditional manipulation of tags
# Processing order in the router
The `if` setting allows conditional testing of a single metric like in the example:
- Add the `hostname_tag` tag (if sent by collectors or cache)
- If `interval_timestamp == true`, change time of metrics
- Check if metric should be dropped (`drop_metrics` and `drop_metrics_if`)
- Add tags from `add_tags`
- Delete tags from `del_tags`
- Rename metric based on `rename_metrics` and store old name as `oldname` in meta information
- Add tags from `add_tags` (if you used the new name in the `if` condition)
- Delete tags from `del_tags` (if you used the new name in the `if` condition)
- Send to sinks
- Move to cache (if `num_cache_intervals > 0`)
# The `interval_timestamp` option
The collectors' `Read()` functions are not called simultaneously and therefore the metrics gathered in an interval can have different timestamps. If you want to avoid that and have a common timestamp (the beginning of the interval), set this option to `true` and the MetricRouter sets the time.
# The `num_cache_intervals` option
If the MetricRouter should buffer metrics of intervals in a MetricCache, this option specifies the number of past intervals that should be kept. If `num_cache_intervals = 0`, the cache is disabled. With `num_cache_intervals = 1`, only the metrics of the last interval are buffered.
A `num_cache_intervals > 0` is required to use the `interval_aggregates` option.
# The `hostname_tag` option
By default, the router tags metrics with the hostname for all locally created metrics. The default tag name is `hostname`, but it can be changed if your organization wants anything else
# The `max_forward` option
Every time the router receives a metric through any of the channels, it tries to directly read up to `max_forward` metrics from the same channel. This was done as the router thread would go to sleep and wake up with every arriving metric. The default are `50` metrics at once and `max_forward` needs to greater than `1`.
# The `rename_metrics` option
__deprecated__
In the ClusterCockpit world we specified a set of standard metrics. Since some collectors determine the metric names based on files, execuables and libraries, they might change from system to system (or installation to installtion, OS to OS, ...). In order to get the common names, you can rename incoming metrics before sending them to the sink. If the metric name matches the `oldname`, it is changed to `newname`
```json
{
"oldname" : "newname",
"clock_mhz" : "clock"
}
```
# Conditional manipulation of tags (`add_tags` and `del_tags`)
__deprecated__
Common config format:
```json
{
"key" : "test",
@@ -139,173 +43,8 @@ Common config format:
}
```
## The `del_tags` option
If the CCMetric name is equal to 'temp_package_id_0', it adds an additional tag `test=testing` to the metric.
__deprecated__
The collectors are free to add whatever `key=value` pair to the metric tags (although the usage of tags should be minimized). If you want to delete a tag afterwards, you can do that. When the `if` condition matches on a metric, the `key` is removed from the metric's tags.
If you want to remove a tag for all metrics, use the condition wildcard `*`. The `value` field can be omitted in the `del_tags` case.
Never delete tags:
- `hostname`
- `type`
- `type-id`
## The `add_tags` option
__deprecated__
In some cases, metrics should be tagged or an existing tag changed based on some condition. This can be done in the `add_tags` section. When the `if` condition evaluates to `true`, the tag `key` is added or gets changed to the new `value`.
If the CCMetric name is equal to `temp_package_id_0`, it adds an additional tag `test=testing` to the metric.
For this metric, a more useful example would be:
```json
[
{
"key" : "type",
"value" : "socket",
"if" : "name == 'temp_package_id_0'"
},
{
"key" : "type-id",
"value" : "0",
"if" : "name == 'temp_package_id_0'"
},
]
```
The metric `temp_package_id_0` corresponds to the tempature of the first CPU socket (=package). With the above configuration, the tags would reflect that because commonly the [TempCollector](../../collectors/tempMetric.md) submits only `node` metrics.
In order to match all metrics, you can use `*`, so in order to add a flag per default. This is useful to attached system-specific tags like `cluster=testcluster`:
```json
{
"key" : "cluster",
"value" : "testcluster",
"if" : "*"
}
```
# Dropping metrics
In some cases, you want to drop a metric and don't get it forwarded to the sinks. There are two options based on the required specification:
- Based only on the metric name -> `drop_metrics` section
- An evaluable condition with more overhead -> `drop_metrics_if` section
## The `drop_metrics` section
__deprecated__
The argument is a list of metric names. No futher checks are performed, only a comparison of the metric name
```json
{
"drop_metrics" : [
"drop_metric_1",
"drop_metric_2"
]
}
```
The example drops all metrics with the name `drop_metric_1` and `drop_metric_2`.
## The `drop_metrics_if` section
__deprecated__
This option takes a list of evaluable conditions and performs them one after the other on **all** metrics incoming from the collectors and the metric cache (aka `interval_aggregates`).
```json
{
"drop_metrics_if" : [
"match('drop_metric_%d+', name)",
"match('cpu', type) && type-id == 0"
]
}
```
The first line is comparable with the example in `drop_metrics`, it drops all metrics starting with `drop_metric_` and ending with a number. The second line drops all metrics of the first hardware thread (**not** recommended)
# Manipulating the metric units
## The `normalize_units` option
__deprecated__
In order to match all metrics, you can use `*`, so in order to add a flag per default, like the `cluster=testcluster` tag in the example.
The cc-metric-collector tries to read the data from the system as it is reported. If available, it tries to read the metric unit from the system as well (e.g. from `/proc/meminfo`). The problem is that, depending on the source, the metric units are named differently. Just think about `byte`, `Byte`, `B`, `bytes`, ...
The [cc-units](https://github.com/ClusterCockpit/cc-lib/ccUnits) package provides us a normalization option to use the same metric unit name for all metrics. It this option is set to true, all `unit` meta tags are normalized.
## The `change_unit_prefix` section
__deprecated__
It is often the case that metrics are reported by the system using a rather outdated unit prefix (like `/proc/meminfo` still uses kByte despite current memory sizes are in the GByte range). If you want to change the prefix of a unit, you can do that with the help of [cc-units](https://github.com/ClusterCockpit/cc-lib/ccUnits). The setting works on the metric name and requires the new prefix for the metric. The cc-units package determines the scaling factor.
# Aggregate metric values of the current interval with the `interval_aggregates` option
**Note:** `interval_aggregates` works only if `num_cache_intervals` > 0 and is **experimental**
In some cases, you need to derive new metrics based on the metrics arriving during an interval. This can be done in the `interval_aggregates` section. The logic is similar to the other metric manipulation and filtering options. A cache stores all metrics that arrive during an interval. At the beginning of the *next* interval, the list of metrics is submitted to the MetricAggregator. It derives new metrics and submits them back to the MetricRouter, so they are sent in the next interval but have the timestamp of the previous interval beginning.
```json
"interval_aggregates" : [
{
"name" : "new_metric_name",
"if" : "match('sub_metric_%d+', metric.Name())",
"function" : "avg(values)",
"tags" : {
"key" : "value",
"type" : "node"
},
"meta" : {
"key" : "value",
"group": "IPMI",
"unit": "<copy>",
}
}
]
```
The above configuration, collects all metric values for metrics evaluating `if` to `true`. Afterwards it calculates the average `avg` of the `values` (list of all metrics' field `value`) and creates a new CCMetric with the name `new_metric_name` and adds the tags in `tags` and the meta information in `meta`. The special value `<copy>` searches the input metrics and copies the value of the first match of `key` to the new CCMetric.
If you are not interested in the input metrics `sub_metric_%d+` at all, you can add the same condition used here to the `drop_metrics_if` section to drop them.
Use cases for `interval_aggregates`:
- Combine multiple metrics of the a collector to a new one like the [MemstatCollector](../../collectors/memstatMetric.md) does it for `mem_used`:
```json
{
"name" : "mem_used",
"if" : "source == 'MemstatCollector'",
"function" : "sum(mem_total) - (sum(mem_free) + sum(mem_buffers) + sum(mem_cached))",
"tags" : {
"type" : "node"
},
"meta" : {
"group": "<copy>",
"unit": "<copy>",
"source": "<copy>"
}
}
```
# Order of operations
The router performs the above mentioned options in a specific order. In order to get the logic you want for a specific metric, it is crucial to know the processing order:
- Add the `hostname` tag (c)
- Manipulate the timestamp to the interval timestamp (c,r)
- Drop metrics based on `drop_metrics` and `drop_metrics_if` (c,r)
- Add tags based on `add_tags` (c,r)
- Delete tags based on `del_tags` (c,r)
- Rename metric based on `rename_metric` (c,r)
- Add tags based on `add_tags` to still work if the configuration uses the new name (c,r)
- Delete tags based on `del_tags` to still work if the configuration uses the new name (c,r)
- Normalize units when `normalize_units` is set (c,r)
- Convert unit prefix based on `change_unit_prefix` (c,r)
Legend:
- 'c' if metric is coming from a collector
- 'r' if metric is coming from a receiver

View File

@@ -1,30 +1,21 @@
// Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
// All rights reserved. This file is part of cc-lib.
// Use of this source code is governed by a MIT-style
// license that can be found in the LICENSE file.
// additional authors:
// Holger Obermaier (NHR@KIT)
package metricAggregator
package metricRouter
import (
"context"
"fmt"
"math"
"os"
"strings"
"sync"
"time"
cclog "github.com/ClusterCockpit/cc-lib/ccLogger"
cclog "github.com/ClusterCockpit/cc-metric-collector/internal/ccLogger"
lp "github.com/ClusterCockpit/cc-lib/ccMessage"
topo "github.com/ClusterCockpit/cc-metric-collector/pkg/ccTopology"
lp "github.com/ClusterCockpit/cc-metric-collector/internal/ccMetric"
topo "github.com/ClusterCockpit/cc-metric-collector/internal/ccTopology"
"github.com/PaesslerAG/gval"
)
type MetricAggregatorIntervalConfig struct {
type metricAggregatorIntervalConfig struct {
Name string `json:"name"` // Metric name for the new metric
Function string `json:"function"` // Function to apply on the metric
Condition string `json:"if"` // Condition for applying function
@@ -35,17 +26,17 @@ type MetricAggregatorIntervalConfig struct {
}
type metricAggregator struct {
functions []*MetricAggregatorIntervalConfig
functions []*metricAggregatorIntervalConfig
constants map[string]interface{}
language gval.Language
output chan lp.CCMessage
output chan lp.CCMetric
}
type MetricAggregator interface {
AddAggregation(name, function, condition string, tags, meta map[string]string) error
DeleteAggregation(name string) error
Init(output chan lp.CCMessage) error
Eval(starttime time.Time, endtime time.Time, metrics []lp.CCMessage)
Init(output chan lp.CCMetric) error
Eval(starttime time.Time, endtime time.Time, metrics []lp.CCMetric)
}
var metricCacheLanguage = gval.NewLanguage(
@@ -70,20 +61,10 @@ var metricCacheLanguage = gval.NewLanguage(
gval.Function("getCpuList", getCpuListOfNode),
gval.Function("getCpuListOfType", getCpuListOfType),
)
var language gval.Language = gval.NewLanguage(
gval.Full(),
metricCacheLanguage,
)
var evaluables = struct {
mapping map[string]gval.Evaluable
mutex sync.Mutex
}{
mapping: make(map[string]gval.Evaluable),
}
func (c *metricAggregator) Init(output chan lp.CCMessage) error {
func (c *metricAggregator) Init(output chan lp.CCMetric) error {
c.output = output
c.functions = make([]*MetricAggregatorIntervalConfig, 0)
c.functions = make([]*metricAggregatorIntervalConfig, 0)
c.constants = make(map[string]interface{})
// add constants like hostname, numSockets, ... to constants list
@@ -103,7 +84,7 @@ func (c *metricAggregator) Init(output chan lp.CCMessage) error {
c.constants["smtWidth"] = cinfo.SMTWidth
c.language = gval.NewLanguage(
gval.Full(),
gval.Base(),
metricCacheLanguage,
)
@@ -119,7 +100,7 @@ func (c *metricAggregator) Init(output chan lp.CCMessage) error {
return nil
}
func (c *metricAggregator) Eval(starttime time.Time, endtime time.Time, metrics []lp.CCMessage) {
func (c *metricAggregator) Eval(starttime time.Time, endtime time.Time, metrics []lp.CCMetric) {
vars := make(map[string]interface{})
for k, v := range c.constants {
vars[k] = v
@@ -128,13 +109,8 @@ func (c *metricAggregator) Eval(starttime time.Time, endtime time.Time, metrics
vars["endtime"] = endtime
for _, f := range c.functions {
cclog.ComponentDebug("MetricCache", "COLLECT", f.Name, "COND", f.Condition)
var valuesFloat64 []float64
var valuesFloat32 []float32
var valuesInt []int
var valuesInt32 []int32
var valuesInt64 []int64
var valuesBool []bool
matches := make([]lp.CCMessage, 0)
values := make([]float64, 0)
matches := make([]lp.CCMetric, 0)
for _, m := range metrics {
vars["metric"] = m
//value, err := gval.Evaluate(f.Condition, vars, c.language)
@@ -148,17 +124,17 @@ func (c *metricAggregator) Eval(starttime time.Time, endtime time.Time, metrics
if valid {
switch x := v.(type) {
case float64:
valuesFloat64 = append(valuesFloat64, x)
values = append(values, x)
case float32:
valuesFloat32 = append(valuesFloat32, x)
case int:
valuesInt = append(valuesInt, x)
case int32:
valuesInt32 = append(valuesInt32, x)
case int64:
valuesInt64 = append(valuesInt64, x)
values = append(values, float64(x))
case bool:
valuesBool = append(valuesBool, x)
if x {
values = append(values, float64(1.0))
} else {
values = append(values, float64(0.0))
}
default:
cclog.ComponentError("MetricCache", "COLLECT ADD VALUE", v, "FAILED")
}
@@ -167,63 +143,17 @@ func (c *metricAggregator) Eval(starttime time.Time, endtime time.Time, metrics
}
}
delete(vars, "metric")
// Check, that only values of one type were collected
countValueTypes := 0
if len(valuesFloat64) > 0 {
countValueTypes += 1
}
if len(valuesFloat32) > 0 {
countValueTypes += 1
}
if len(valuesInt) > 0 {
countValueTypes += 1
}
if len(valuesInt32) > 0 {
countValueTypes += 1
}
if len(valuesInt64) > 0 {
countValueTypes += 1
}
if len(valuesBool) > 0 {
countValueTypes += 1
}
if countValueTypes > 1 {
cclog.ComponentError("MetricCache", "Collected values of different types")
}
var len_values int
switch {
case len(valuesFloat64) > 0:
vars["values"] = valuesFloat64
len_values = len(valuesFloat64)
case len(valuesFloat32) > 0:
vars["values"] = valuesFloat32
len_values = len(valuesFloat32)
case len(valuesInt) > 0:
vars["values"] = valuesInt
len_values = len(valuesInt)
case len(valuesInt32) > 0:
vars["values"] = valuesInt32
len_values = len(valuesInt32)
case len(valuesInt64) > 0:
vars["values"] = valuesInt64
len_values = len(valuesInt64)
case len(valuesBool) > 0:
vars["values"] = valuesBool
len_values = len(valuesBool)
}
cclog.ComponentDebug("MetricCache", "EVALUATE", f.Name, "METRICS", len_values, "CALC", f.Function)
cclog.ComponentDebug("MetricCache", "EVALUATE", f.Name, "METRICS", len(values), "CALC", f.Function)
vars["values"] = values
vars["metrics"] = matches
if len_values > 0 {
if len(values) > 0 {
value, err := gval.Evaluate(f.Function, vars, c.language)
if err != nil {
cclog.ComponentError("MetricCache", "EVALUATE", f.Name, "METRICS", len_values, "CALC", f.Function, ":", err.Error())
cclog.ComponentError("MetricCache", "EVALUATE", f.Name, "METRICS", len(values), "CALC", f.Function, ":", err.Error())
break
}
copy_tags := func(tags map[string]string, metrics []lp.CCMessage) map[string]string {
copy_tags := func(tags map[string]string, metrics []lp.CCMetric) map[string]string {
out := make(map[string]string)
for key, value := range tags {
switch value {
@@ -240,7 +170,7 @@ func (c *metricAggregator) Eval(starttime time.Time, endtime time.Time, metrics
}
return out
}
copy_meta := func(meta map[string]string, metrics []lp.CCMessage) map[string]string {
copy_meta := func(meta map[string]string, metrics []lp.CCMetric) map[string]string {
out := make(map[string]string)
for key, value := range meta {
switch value {
@@ -260,18 +190,18 @@ func (c *metricAggregator) Eval(starttime time.Time, endtime time.Time, metrics
tags := copy_tags(f.Tags, matches)
meta := copy_meta(f.Meta, matches)
var m lp.CCMessage
var m lp.CCMetric
switch t := value.(type) {
case float64:
m, err = lp.NewMessage(f.Name, tags, meta, map[string]interface{}{"value": t}, starttime)
m, err = lp.New(f.Name, tags, meta, map[string]interface{}{"value": t}, starttime)
case float32:
m, err = lp.NewMessage(f.Name, tags, meta, map[string]interface{}{"value": t}, starttime)
m, err = lp.New(f.Name, tags, meta, map[string]interface{}{"value": t}, starttime)
case int:
m, err = lp.NewMessage(f.Name, tags, meta, map[string]interface{}{"value": t}, starttime)
m, err = lp.New(f.Name, tags, meta, map[string]interface{}{"value": t}, starttime)
case int64:
m, err = lp.NewMessage(f.Name, tags, meta, map[string]interface{}{"value": t}, starttime)
m, err = lp.New(f.Name, tags, meta, map[string]interface{}{"value": t}, starttime)
case string:
m, err = lp.NewMessage(f.Name, tags, meta, map[string]interface{}{"value": t}, starttime)
m, err = lp.New(f.Name, tags, meta, map[string]interface{}{"value": t}, starttime)
default:
cclog.ComponentError("MetricCache", "Gval returned invalid type", t, "skipping metric", f.Name)
}
@@ -315,16 +245,15 @@ func (c *metricAggregator) AddAggregation(name, function, condition string, tags
return nil
}
}
agg := &MetricAggregatorIntervalConfig{
Name: name,
Condition: newcond,
gvalCond: gvalCond,
Function: newfunc,
gvalFunc: gvalFunc,
Tags: tags,
Meta: meta,
}
c.functions = append(c.functions, agg)
var agg metricAggregatorIntervalConfig
agg.Name = name
agg.Condition = newcond
agg.gvalCond = gvalCond
agg.Function = newfunc
agg.gvalFunc = gvalFunc
agg.Tags = tags
agg.Meta = meta
c.functions = append(c.functions, &agg)
return nil
}
@@ -352,51 +281,7 @@ func (c *metricAggregator) AddFunction(name string, function func(args ...interf
c.language = gval.NewLanguage(c.language, gval.Function(name, function))
}
func EvalBoolCondition(condition string, params map[string]interface{}) (bool, error) {
evaluables.mutex.Lock()
evaluable, ok := evaluables.mapping[condition]
evaluables.mutex.Unlock()
if !ok {
newcond :=
strings.ReplaceAll(
strings.ReplaceAll(
condition, "'", "\""), "%", "\\")
var err error
evaluable, err = language.NewEvaluable(newcond)
if err != nil {
return false, err
}
evaluables.mutex.Lock()
evaluables.mapping[condition] = evaluable
evaluables.mutex.Unlock()
}
value, err := evaluable.EvalBool(context.Background(), params)
return value, err
}
func EvalFloat64Condition(condition string, params map[string]float64) (float64, error) {
evaluables.mutex.Lock()
evaluable, ok := evaluables.mapping[condition]
evaluables.mutex.Unlock()
if !ok {
newcond :=
strings.ReplaceAll(
strings.ReplaceAll(
condition, "'", "\""), "%", "\\")
var err error
evaluable, err = language.NewEvaluable(newcond)
if err != nil {
return math.NaN(), err
}
evaluables.mutex.Lock()
evaluables.mapping[condition] = evaluable
evaluables.mutex.Unlock()
}
value, err := evaluable.EvalFloat64(context.Background(), params)
return value, err
}
func NewAggregator(output chan lp.CCMessage) (MetricAggregator, error) {
func NewAggregator(output chan lp.CCMetric) (MetricAggregator, error) {
a := new(metricAggregator)
err := a.Init(output)
if err != nil {

View File

@@ -1,176 +1,164 @@
// Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
// All rights reserved. This file is part of cc-lib.
// Use of this source code is governed by a MIT-style
// license that can be found in the LICENSE file.
// additional authors:
// Holger Obermaier (NHR@KIT)
package metricAggregator
package metricRouter
import (
"errors"
"fmt"
"math"
"regexp"
"sort"
"strings"
"golang.org/x/exp/slices"
topo "github.com/ClusterCockpit/cc-metric-collector/pkg/ccTopology"
cclog "github.com/ClusterCockpit/cc-metric-collector/internal/ccLogger"
topo "github.com/ClusterCockpit/cc-metric-collector/internal/ccTopology"
)
/*
* Arithmetic functions on value arrays
*/
func sumAnyType[T float64 | float32 | int | int32 | int64](values []T) (T, error) {
if len(values) == 0 {
return 0.0, errors.New("sum function requires at least one argument")
// Sum up values
func sumfunc(args ...interface{}) (interface{}, error) {
s := 0.0
values, ok := args[0].([]float64)
if ok {
cclog.ComponentDebug("MetricCache", "SUM FUNC START")
for _, x := range values {
s += x
}
cclog.ComponentDebug("MetricCache", "SUM FUNC END", s)
} else {
cclog.ComponentDebug("MetricCache", "SUM FUNC CAST FAILED")
}
var sum T
for _, value := range values {
sum += value
}
return sum, nil
return s, nil
}
// Sum up values
func sumfunc(args interface{}) (interface{}, error) {
var err error
switch values := args.(type) {
// Get the minimum value
func minfunc(args ...interface{}) (interface{}, error) {
var err error = nil
switch values := args[0].(type) {
case []float64:
return sumAnyType(values)
var s float64 = math.MaxFloat64
for _, x := range values {
if x < s {
s = x
}
}
return s, nil
case []float32:
return sumAnyType(values)
var s float32 = math.MaxFloat32
for _, x := range values {
if x < s {
s = x
}
}
return s, nil
case []int:
return sumAnyType(values)
var s int = int(math.MaxInt32)
for _, x := range values {
if x < s {
s = x
}
}
return s, nil
case []int64:
return sumAnyType(values)
var s int64 = math.MaxInt64
for _, x := range values {
if x < s {
s = x
}
}
return s, nil
case []int32:
return sumAnyType(values)
var s int32 = math.MaxInt32
for _, x := range values {
if x < s {
s = x
}
}
return s, nil
default:
err = errors.New("function 'sum' only on list of values (float64, float32, int, int32, int64)")
err = errors.New("function 'min' only on list of values (float64, float32, int, int32, int64)")
}
return 0.0, err
}
func minAnyType[T float64 | float32 | int | int32 | int64](values []T) (T, error) {
if len(values) == 0 {
return 0.0, errors.New("min function requires at least one argument")
}
return slices.Min(values), nil
}
// Get the minimum value
func minfunc(args interface{}) (interface{}, error) {
switch values := args.(type) {
case []float64:
return minAnyType(values)
case []float32:
return minAnyType(values)
case []int:
return minAnyType(values)
case []int64:
return minAnyType(values)
case []int32:
return minAnyType(values)
default:
return 0.0, errors.New("function 'min' only on list of values (float64, float32, int, int32, int64)")
}
}
func avgAnyType[T float64 | float32 | int | int32 | int64](values []T) (float64, error) {
if len(values) == 0 {
return 0.0, errors.New("average function requires at least one argument")
}
sum, err := sumAnyType[T](values)
return float64(sum) / float64(len(values)), err
}
// Get the average or mean value
func avgfunc(args interface{}) (interface{}, error) {
switch values := args.(type) {
func avgfunc(args ...interface{}) (interface{}, error) {
switch values := args[0].(type) {
case []float64:
return avgAnyType(values)
var s float64 = 0
for _, x := range values {
s += x
}
return s / float64(len(values)), nil
case []float32:
return avgAnyType(values)
var s float32 = 0
for _, x := range values {
s += x
}
return s / float32(len(values)), nil
case []int:
return avgAnyType(values)
var s int = 0
for _, x := range values {
s += x
}
return s / len(values), nil
case []int64:
return avgAnyType(values)
case []int32:
return avgAnyType(values)
default:
return 0.0, errors.New("function 'average' only on list of values (float64, float32, int, int32, int64)")
var s int64 = 0
for _, x := range values {
s += x
}
return s / int64(len(values)), nil
}
}
func maxAnyType[T float64 | float32 | int | int32 | int64](values []T) (T, error) {
if len(values) == 0 {
return 0.0, errors.New("max function requires at least one argument")
}
return slices.Max(values), nil
return 0.0, nil
}
// Get the maximum value
func maxfunc(args interface{}) (interface{}, error) {
switch values := args.(type) {
case []float64:
return maxAnyType(values)
case []float32:
return maxAnyType(values)
case []int:
return maxAnyType(values)
case []int64:
return maxAnyType(values)
case []int32:
return maxAnyType(values)
default:
return 0.0, errors.New("function 'max' only on list of values (float64, float32, int, int32, int64)")
func maxfunc(args ...interface{}) (interface{}, error) {
s := 0.0
values, ok := args[0].([]float64)
if ok {
for _, x := range values {
if x > s {
s = x
}
}
}
}
func medianAnyType[T float64 | float32 | int | int32 | int64](values []T) (T, error) {
if len(values) == 0 {
return 0.0, errors.New("median function requires at least one argument")
}
slices.Sort(values)
var median T
if midPoint := len(values) % 2; midPoint == 0 {
median = (values[midPoint-1] + values[midPoint]) / 2
} else {
median = values[midPoint]
}
return median, nil
return s, nil
}
// Get the median value
func medianfunc(args interface{}) (interface{}, error) {
switch values := args.(type) {
func medianfunc(args ...interface{}) (interface{}, error) {
switch values := args[0].(type) {
case []float64:
return medianAnyType(values)
case []float32:
return medianAnyType(values)
sort.Float64s(values)
return values[len(values)/2], nil
// case []float32:
// sort.Float64s(values)
// return values[len(values)/2], nil
case []int:
return medianAnyType(values)
case []int64:
return medianAnyType(values)
case []int32:
return medianAnyType(values)
default:
return 0.0, errors.New("function 'median' only on list of values (float64, float32, int, int32, int64)")
sort.Ints(values)
return values[len(values)/2], nil
// case []int64:
// sort.Ints(values)
// return values[len(values)/2], nil
// case []int32:
// sort.Ints(values)
// return values[len(values)/2], nil
}
return 0.0, errors.New("function 'median()' only on lists of type float64 and int")
}
/*
* Get number of values in list. Returns always an int
*/
func lenfunc(args interface{}) (interface{}, error) {
func lenfunc(args ...interface{}) (interface{}, error) {
var err error = nil
var length int = 0
switch values := args.(type) {
switch values := args[0].(type) {
case []float64:
length = len(values)
case []float32:
@@ -255,49 +243,49 @@ func matchfunc(args ...interface{}) (interface{}, error) {
*/
// for a given cpuid, it returns the core id
func getCpuCoreFunc(args interface{}) (interface{}, error) {
switch cpuid := args.(type) {
func getCpuCoreFunc(args ...interface{}) (interface{}, error) {
switch cpuid := args[0].(type) {
case int:
return topo.GetHwthreadCore(cpuid), nil
return topo.GetCpuCore(cpuid), nil
}
return -1, errors.New("function 'getCpuCore' accepts only an 'int' cpuid")
}
// for a given cpuid, it returns the socket id
func getCpuSocketFunc(args interface{}) (interface{}, error) {
switch cpuid := args.(type) {
func getCpuSocketFunc(args ...interface{}) (interface{}, error) {
switch cpuid := args[0].(type) {
case int:
return topo.GetHwthreadSocket(cpuid), nil
return topo.GetCpuSocket(cpuid), nil
}
return -1, errors.New("function 'getCpuCore' accepts only an 'int' cpuid")
}
// for a given cpuid, it returns the id of the NUMA node
func getCpuNumaDomainFunc(args interface{}) (interface{}, error) {
switch cpuid := args.(type) {
func getCpuNumaDomainFunc(args ...interface{}) (interface{}, error) {
switch cpuid := args[0].(type) {
case int:
return topo.GetHwthreadNumaDomain(cpuid), nil
return topo.GetCpuNumaDomain(cpuid), nil
}
return -1, errors.New("function 'getCpuNuma' accepts only an 'int' cpuid")
}
// for a given cpuid, it returns the id of the CPU die
func getCpuDieFunc(args interface{}) (interface{}, error) {
switch cpuid := args.(type) {
func getCpuDieFunc(args ...interface{}) (interface{}, error) {
switch cpuid := args[0].(type) {
case int:
return topo.GetHwthreadDie(cpuid), nil
return topo.GetCpuDie(cpuid), nil
}
return -1, errors.New("function 'getCpuDie' accepts only an 'int' cpuid")
}
// for a given core id, it returns the list of cpuids
func getCpuListOfCoreFunc(args interface{}) (interface{}, error) {
func getCpuListOfCoreFunc(args ...interface{}) (interface{}, error) {
cpulist := make([]int, 0)
switch in := args.(type) {
switch in := args[0].(type) {
case int:
for _, c := range topo.CpuData() {
if c.Core == in {
cpulist = append(cpulist, c.CpuID)
cpulist = append(cpulist, c.Cpuid)
}
}
}
@@ -305,13 +293,13 @@ func getCpuListOfCoreFunc(args interface{}) (interface{}, error) {
}
// for a given socket id, it returns the list of cpuids
func getCpuListOfSocketFunc(args interface{}) (interface{}, error) {
func getCpuListOfSocketFunc(args ...interface{}) (interface{}, error) {
cpulist := make([]int, 0)
switch in := args.(type) {
switch in := args[0].(type) {
case int:
for _, c := range topo.CpuData() {
if c.Socket == in {
cpulist = append(cpulist, c.CpuID)
cpulist = append(cpulist, c.Cpuid)
}
}
}
@@ -319,13 +307,13 @@ func getCpuListOfSocketFunc(args interface{}) (interface{}, error) {
}
// for a given id of a NUMA domain, it returns the list of cpuids
func getCpuListOfNumaDomainFunc(args interface{}) (interface{}, error) {
func getCpuListOfNumaDomainFunc(args ...interface{}) (interface{}, error) {
cpulist := make([]int, 0)
switch in := args.(type) {
switch in := args[0].(type) {
case int:
for _, c := range topo.CpuData() {
if c.NumaDomain == in {
cpulist = append(cpulist, c.CpuID)
if c.Numadomain == in {
cpulist = append(cpulist, c.Cpuid)
}
}
}
@@ -333,13 +321,13 @@ func getCpuListOfNumaDomainFunc(args interface{}) (interface{}, error) {
}
// for a given CPU die id, it returns the list of cpuids
func getCpuListOfDieFunc(args interface{}) (interface{}, error) {
func getCpuListOfDieFunc(args ...interface{}) (interface{}, error) {
cpulist := make([]int, 0)
switch in := args.(type) {
switch in := args[0].(type) {
case int:
for _, c := range topo.CpuData() {
if c.Die == in {
cpulist = append(cpulist, c.CpuID)
cpulist = append(cpulist, c.Cpuid)
}
}
}
@@ -347,8 +335,8 @@ func getCpuListOfDieFunc(args interface{}) (interface{}, error) {
}
// wrapper function to get a list of all cpuids of the node
func getCpuListOfNode() (interface{}, error) {
return topo.HwthreadList(), nil
func getCpuListOfNode(args ...interface{}) (interface{}, error) {
return topo.CpuList(), nil
}
// helper function to get the cpuid list for a CCMetric type tag set (type and type-id)
@@ -360,14 +348,14 @@ func getCpuListOfType(args ...interface{}) (interface{}, error) {
case string:
switch typ {
case "node":
return topo.HwthreadList(), nil
return topo.CpuList(), nil
case "socket":
return getCpuListOfSocketFunc(args[1])
case "numadomain":
return getCpuListOfNumaDomainFunc(args[1])
case "core":
return getCpuListOfCoreFunc(args[1])
case "hwthread":
case "cpu":
var cpu int
switch id := args[1].(type) {

View File

@@ -1,21 +1,13 @@
// Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
// All rights reserved. This file is part of cc-lib.
// Use of this source code is governed by a MIT-style
// license that can be found in the LICENSE file.
// additional authors:
// Holger Obermaier (NHR@KIT)
package metricRouter
import (
"sync"
"time"
cclog "github.com/ClusterCockpit/cc-lib/ccLogger"
cclog "github.com/ClusterCockpit/cc-metric-collector/internal/ccLogger"
lp "github.com/ClusterCockpit/cc-lib/ccMessage"
agg "github.com/ClusterCockpit/cc-metric-collector/internal/metricAggregator"
mct "github.com/ClusterCockpit/cc-metric-collector/pkg/multiChanTicker"
lp "github.com/ClusterCockpit/cc-metric-collector/internal/ccMetric"
mct "github.com/ClusterCockpit/cc-metric-collector/internal/multiChanTicker"
)
type metricCachePeriod struct {
@@ -23,34 +15,33 @@ type metricCachePeriod struct {
stopstamp time.Time
numMetrics int
sizeMetrics int
metrics []lp.CCMessage
metrics []lp.CCMetric
}
// Metric cache data structure
type metricCache struct {
numPeriods int
curPeriod int
lock sync.Mutex
intervals []*metricCachePeriod
wg *sync.WaitGroup
ticker mct.MultiChanTicker
tickchan chan time.Time
done chan bool
output chan lp.CCMessage
aggEngine agg.MetricAggregator
output chan lp.CCMetric
aggEngine MetricAggregator
}
type MetricCache interface {
Init(output chan lp.CCMessage, ticker mct.MultiChanTicker, wg *sync.WaitGroup, numPeriods int) error
Init(output chan lp.CCMetric, ticker mct.MultiChanTicker, wg *sync.WaitGroup, numPeriods int) error
Start()
Add(metric lp.CCMessage)
GetPeriod(index int) (time.Time, time.Time, []lp.CCMessage)
Add(metric lp.CCMetric)
GetPeriod(index int) (time.Time, time.Time, []lp.CCMetric)
AddAggregation(name, function, condition string, tags, meta map[string]string) error
DeleteAggregation(name string) error
Close()
}
func (c *metricCache) Init(output chan lp.CCMessage, ticker mct.MultiChanTicker, wg *sync.WaitGroup, numPeriods int) error {
func (c *metricCache) Init(output chan lp.CCMetric, ticker mct.MultiChanTicker, wg *sync.WaitGroup, numPeriods int) error {
var err error = nil
c.done = make(chan bool)
c.wg = wg
@@ -62,13 +53,13 @@ func (c *metricCache) Init(output chan lp.CCMessage, ticker mct.MultiChanTicker,
p := new(metricCachePeriod)
p.numMetrics = 0
p.sizeMetrics = 0
p.metrics = make([]lp.CCMessage, 0)
p.metrics = make([]lp.CCMetric, 0)
c.intervals = append(c.intervals, p)
}
// Create a new aggregation engine. No separate goroutine at the moment
// The code is executed by the MetricCache goroutine
c.aggEngine, err = agg.NewAggregator(c.output)
c.aggEngine, err = NewAggregator(c.output)
if err != nil {
cclog.ComponentError("MetricCache", "Cannot create aggregator")
return err
@@ -111,11 +102,9 @@ func (c *metricCache) Start() {
done()
return
case tick := <-c.tickchan:
c.lock.Lock()
old := rotate(tick)
// Get the last period and evaluate aggregation metrics
starttime, endtime, metrics := c.GetPeriod(old)
c.lock.Unlock()
if len(metrics) > 0 {
c.aggEngine.Eval(starttime, endtime, metrics)
} else {
@@ -131,9 +120,8 @@ func (c *metricCache) Start() {
// Add a metric to the cache. The interval is defined by the global timer (rotate() in Start())
// The intervals list is used as round-robin buffer and the metric list grows dynamically and
// to avoid reallocations
func (c *metricCache) Add(metric lp.CCMessage) {
func (c *metricCache) Add(metric lp.CCMetric) {
if c.curPeriod >= 0 && c.curPeriod < c.numPeriods {
c.lock.Lock()
p := c.intervals[c.curPeriod]
if p.numMetrics < p.sizeMetrics {
p.metrics[p.numMetrics] = metric
@@ -145,7 +133,6 @@ func (c *metricCache) Add(metric lp.CCMessage) {
p.sizeMetrics = p.sizeMetrics + 1
p.stopstamp = metric.Time()
}
c.lock.Unlock()
}
}
@@ -160,27 +147,17 @@ func (c *metricCache) DeleteAggregation(name string) error {
// Get all metrics of a interval. The index is the difference to the current interval, so index=0
// is the current one, index=1 the last interval and so on. Returns and empty array if a wrong index
// is given (negative index, index larger than configured number of total intervals, ...)
func (c *metricCache) GetPeriod(index int) (time.Time, time.Time, []lp.CCMessage) {
var start time.Time = time.Now()
var stop time.Time = time.Now()
var metrics []lp.CCMessage
func (c *metricCache) GetPeriod(index int) (time.Time, time.Time, []lp.CCMetric) {
if index >= 0 && index < c.numPeriods {
pindex := c.curPeriod - index
if pindex < 0 {
pindex = c.numPeriods - pindex
}
if pindex >= 0 && pindex < c.numPeriods {
start = c.intervals[pindex].startstamp
stop = c.intervals[pindex].stopstamp
metrics = c.intervals[pindex].metrics
//return c.intervals[pindex].startstamp, c.intervals[pindex].stopstamp, c.intervals[pindex].metrics
} else {
metrics = make([]lp.CCMessage, 0)
return c.intervals[pindex].startstamp, c.intervals[pindex].stopstamp, c.intervals[pindex].metrics
}
} else {
metrics = make([]lp.CCMessage, 0)
}
return start, stop, metrics
return time.Now(), time.Now(), make([]lp.CCMetric, 0)
}
// Close finishes / stops the metric cache
@@ -189,7 +166,7 @@ func (c *metricCache) Close() {
c.done <- true
}
func NewCache(output chan lp.CCMessage, ticker mct.MultiChanTicker, wg *sync.WaitGroup, numPeriods int) (MetricCache, error) {
func NewCache(output chan lp.CCMetric, ticker mct.MultiChanTicker, wg *sync.WaitGroup, numPeriods int) (MetricCache, error) {
c := new(metricCache)
err := c.Init(output, ticker, wg, numPeriods)
if err != nil {

View File

@@ -1,30 +1,19 @@
// Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
// All rights reserved. This file is part of cc-lib.
// Use of this source code is governed by a MIT-style
// license that can be found in the LICENSE file.
// additional authors:
// Holger Obermaier (NHR@KIT)
package metricRouter
import (
"encoding/json"
"fmt"
"os"
"strings"
"sync"
"time"
cclog "github.com/ClusterCockpit/cc-lib/ccLogger"
cclog "github.com/ClusterCockpit/cc-metric-collector/internal/ccLogger"
lp "github.com/ClusterCockpit/cc-lib/ccMessage"
mp "github.com/ClusterCockpit/cc-lib/messageProcessor"
agg "github.com/ClusterCockpit/cc-metric-collector/internal/metricAggregator"
mct "github.com/ClusterCockpit/cc-metric-collector/pkg/multiChanTicker"
lp "github.com/ClusterCockpit/cc-metric-collector/internal/ccMetric"
mct "github.com/ClusterCockpit/cc-metric-collector/internal/multiChanTicker"
"gopkg.in/Knetic/govaluate.v2"
)
const ROUTER_MAX_FORWARD = 50
// Metric router tag configuration
type metricRouterTagConfig struct {
Key string `json:"key"` // Tag name
@@ -34,46 +23,36 @@ type metricRouterTagConfig struct {
// Metric router configuration
type metricRouterConfig struct {
HostnameTagName string `json:"hostname_tag"` // Key name used when adding the hostname to a metric (default 'hostname')
AddTags []metricRouterTagConfig `json:"add_tags"` // List of tags that are added when the condition is met
DelTags []metricRouterTagConfig `json:"delete_tags"` // List of tags that are removed when the condition is met
IntervalAgg []agg.MetricAggregatorIntervalConfig `json:"interval_aggregates"` // List of aggregation function processed at the end of an interval
DropMetrics []string `json:"drop_metrics"` // List of metric names to drop. For fine-grained dropping use drop_metrics_if
DropMetricsIf []string `json:"drop_metrics_if"` // List of evaluatable terms to drop metrics
RenameMetrics map[string]string `json:"rename_metrics"` // Map to rename metric name from key to value
IntervalStamp bool `json:"interval_timestamp"` // Update timestamp periodically by ticker each interval?
NumCacheIntervals int `json:"num_cache_intervals"` // Number of intervals of cached metrics for evaluation
MaxForward int `json:"max_forward"` // Number of maximal forwarded metrics at one select
NormalizeUnits bool `json:"normalize_units"` // Check unit meta flag and normalize it using cc-units
ChangeUnitPrefix map[string]string `json:"change_unit_prefix"` // Add prefix that should be applied to the metrics
// dropMetrics map[string]bool // Internal map for O(1) lookup
MessageProcessor json.RawMessage `json:"process_messages,omitempty"`
AddTags []metricRouterTagConfig `json:"add_tags"` // List of tags that are added when the condition is met
DelTags []metricRouterTagConfig `json:"delete_tags"` // List of tags that are removed when the condition is met
IntervalAgg []metricAggregatorIntervalConfig `json:"interval_aggregates"` // List of aggregation function processed at the end of an interval
IntervalStamp bool `json:"interval_timestamp"` // Update timestamp periodically by ticker each interval?
NumCacheIntervals int `json:"num_cache_intervals"` // Number of intervals of cached metrics for evaluation
}
// Metric router data structure
type metricRouter struct {
hostname string // Hostname used in tags
coll_input chan lp.CCMessage // Input channel from CollectorManager
recv_input chan lp.CCMessage // Input channel from ReceiveManager
cache_input chan lp.CCMessage // Input channel from MetricCache
outputs []chan lp.CCMessage // List of all output channels
coll_input chan lp.CCMetric // Input channel from CollectorManager
recv_input chan lp.CCMetric // Input channel from ReceiveManager
cache_input chan lp.CCMetric // Input channel from MetricCache
outputs []chan lp.CCMetric // List of all output channels
done chan bool // channel to finish / stop metric router
wg *sync.WaitGroup // wait group for all goroutines in cc-metric-collector
timestamp time.Time // timestamp periodically updated by ticker each interval
timerdone chan bool // channel to finish / stop timestamp updater
ticker mct.MultiChanTicker // periodically ticking once each interval
config metricRouterConfig // json encoded config for metric router
cache MetricCache // pointer to MetricCache
cachewg sync.WaitGroup // wait group for MetricCache
maxForward int // number of metrics to forward maximally in one iteration
mp mp.MessageProcessor
}
// MetricRouter access functions
type MetricRouter interface {
Init(ticker mct.MultiChanTicker, wg *sync.WaitGroup, routerConfig json.RawMessage) error
AddCollectorInput(input chan lp.CCMessage)
AddReceiverInput(input chan lp.CCMessage)
AddOutput(output chan lp.CCMessage)
Init(ticker mct.MultiChanTicker, wg *sync.WaitGroup, routerConfigFile string) error
AddCollectorInput(input chan lp.CCMetric)
AddReceiverInput(input chan lp.CCMetric)
AddOutput(output chan lp.CCMetric)
Start()
Close()
}
@@ -84,14 +63,12 @@ type MetricRouter interface {
// * wait group synchronization (from variable wg)
// * ticker (from variable ticker)
// * configuration (read from config file in variable routerConfigFile)
func (r *metricRouter) Init(ticker mct.MultiChanTicker, wg *sync.WaitGroup, routerConfig json.RawMessage) error {
r.outputs = make([]chan lp.CCMessage, 0)
func (r *metricRouter) Init(ticker mct.MultiChanTicker, wg *sync.WaitGroup, routerConfigFile string) error {
r.outputs = make([]chan lp.CCMetric, 0)
r.done = make(chan bool)
r.cache_input = make(chan lp.CCMessage)
r.cache_input = make(chan lp.CCMetric)
r.wg = wg
r.ticker = ticker
r.config.MaxForward = ROUTER_MAX_FORWARD
r.config.HostnameTagName = "hostname"
// Set hostname
hostname, err := os.Hostname()
@@ -102,102 +79,98 @@ func (r *metricRouter) Init(ticker mct.MultiChanTicker, wg *sync.WaitGroup, rout
// Drop domain part of host name
r.hostname = strings.SplitN(hostname, `.`, 2)[0]
err = json.Unmarshal(routerConfig, &r.config)
// Read metric router config file
configFile, err := os.Open(routerConfigFile)
if err != nil {
cclog.ComponentError("MetricRouter", err.Error())
return err
}
r.maxForward = 1
if r.config.MaxForward > r.maxForward {
r.maxForward = r.config.MaxForward
}
if r.config.NumCacheIntervals > 0 {
r.cache, err = NewCache(r.cache_input, r.ticker, &r.cachewg, r.config.NumCacheIntervals)
if err != nil {
cclog.ComponentError("MetricRouter", "MetricCache initialization failed:", err.Error())
return err
}
for _, agg := range r.config.IntervalAgg {
r.cache.AddAggregation(agg.Name, agg.Function, agg.Condition, agg.Tags, agg.Meta)
}
}
p, err := mp.NewMessageProcessor()
defer configFile.Close()
jsonParser := json.NewDecoder(configFile)
err = jsonParser.Decode(&r.config)
if err != nil {
return fmt.Errorf("initialization of message processor failed: %v", err.Error())
cclog.ComponentError("MetricRouter", err.Error())
return err
}
r.mp = p
if len(r.config.MessageProcessor) > 0 {
err = r.mp.FromConfigJSON(r.config.MessageProcessor)
if err != nil {
return fmt.Errorf("failed parsing JSON for message processor: %v", err.Error())
}
numIntervals := r.config.NumCacheIntervals
if numIntervals <= 0 {
numIntervals = 1
}
for _, mname := range r.config.DropMetrics {
r.mp.AddDropMessagesByName(mname)
r.cache, err = NewCache(r.cache_input, r.ticker, &r.cachewg, numIntervals)
if err != nil {
cclog.ComponentError("MetricRouter", "MetricCache initialization failed:", err.Error())
return err
}
for _, cond := range r.config.DropMetricsIf {
r.mp.AddDropMessagesByCondition(cond)
for _, agg := range r.config.IntervalAgg {
r.cache.AddAggregation(agg.Name, agg.Function, agg.Condition, agg.Tags, agg.Meta)
}
for _, data := range r.config.AddTags {
cond := data.Condition
if cond == "*" {
cond = "true"
}
r.mp.AddAddTagsByCondition(cond, data.Key, data.Value)
}
for _, data := range r.config.DelTags {
cond := data.Condition
if cond == "*" {
cond = "true"
}
r.mp.AddDeleteTagsByCondition(cond, data.Key, data.Value)
}
for oldname, newname := range r.config.RenameMetrics {
r.mp.AddRenameMetricByName(oldname, newname)
}
for metricName, prefix := range r.config.ChangeUnitPrefix {
r.mp.AddChangeUnitPrefix(fmt.Sprintf("name == '%s'", metricName), prefix)
}
r.mp.SetNormalizeUnits(r.config.NormalizeUnits)
r.mp.AddAddTagsByCondition("true", r.config.HostnameTagName, r.hostname)
// r.config.dropMetrics = make(map[string]bool)
// for _, mname := range r.config.DropMetrics {
// r.config.dropMetrics[mname] = true
// }
return nil
}
func getParamMap(point lp.CCMessage) map[string]interface{} {
// StartTimer starts a timer which updates timestamp periodically
func (r *metricRouter) StartTimer() {
m := make(chan time.Time)
r.ticker.AddChannel(m)
r.timerdone = make(chan bool)
r.wg.Add(1)
go func() {
defer r.wg.Done()
for {
select {
case <-r.timerdone:
close(r.timerdone)
cclog.ComponentDebug("MetricRouter", "TIMER DONE")
return
case t := <-m:
r.timestamp = t
}
}
}()
cclog.ComponentDebug("MetricRouter", "TIMER START")
}
// EvalCondition evaluates condition cond for metric data from point
func (r *metricRouter) EvalCondition(cond string, point lp.CCMetric) (bool, error) {
expression, err := govaluate.NewEvaluableExpression(cond)
if err != nil {
cclog.ComponentDebug("MetricRouter", cond, " = ", err.Error())
return false, err
}
// Add metric name, tags, meta data, fields and timestamp to the parameter list
params := make(map[string]interface{})
params["metric"] = point
params["name"] = point.Name()
for key, value := range point.Tags() {
params[key] = value
for _, t := range point.TagList() {
params[t.Key] = t.Value
}
for key, value := range point.Meta() {
params[key] = value
for _, m := range point.MetaList() {
params[m.Key] = m.Value
}
for key, value := range point.Fields() {
params[key] = value
for _, f := range point.FieldList() {
params[f.Key] = f.Value
}
params["timestamp"] = point.Time()
return params
// evaluate condition
result, err := expression.Evaluate(params)
if err != nil {
cclog.ComponentDebug("MetricRouter", cond, " = ", err.Error())
return false, err
}
return bool(result.(bool)), err
}
// DoAddTags adds a tag when condition is fullfiled
func (r *metricRouter) DoAddTags(point lp.CCMessage) {
var conditionMatches bool
func (r *metricRouter) DoAddTags(point lp.CCMetric) {
for _, m := range r.config.AddTags {
var conditionMatches bool
if m.Condition == "*" {
// Condition is always matched
conditionMatches = true
} else {
// Evaluate condition
var err error
conditionMatches, err = agg.EvalBoolCondition(m.Condition, getParamMap(point))
conditionMatches, err = r.EvalCondition(m.Condition, point)
if err != nil {
cclog.ComponentError("MetricRouter", err.Error())
conditionMatches = false
@@ -210,89 +183,33 @@ func (r *metricRouter) DoAddTags(point lp.CCMessage) {
}
// DoDelTags removes a tag when condition is fullfiled
// func (r *metricRouter) DoDelTags(point lp.CCMessage) {
// var conditionMatches bool
// for _, m := range r.config.DelTags {
// if m.Condition == "*" {
// // Condition is always matched
// conditionMatches = true
// } else {
// // Evaluate condition
// var err error
// conditionMatches, err = agg.EvalBoolCondition(m.Condition, getParamMap(point))
// if err != nil {
// cclog.ComponentError("MetricRouter", err.Error())
// conditionMatches = false
// }
// }
// if conditionMatches {
// point.RemoveTag(m.Key)
// }
// }
// }
func (r *metricRouter) DoDelTags(point lp.CCMetric) {
for _, m := range r.config.DelTags {
var conditionMatches bool
// Conditional test whether a metric should be dropped
// func (r *metricRouter) dropMetric(point lp.CCMessage) bool {
// // Simple drop check
// if conditionMatches, ok := r.config.dropMetrics[point.Name()]; ok {
// return conditionMatches
// }
// // Checking the dropping conditions
// for _, m := range r.config.DropMetricsIf {
// conditionMatches, err := agg.EvalBoolCondition(m, getParamMap(point))
// if err != nil {
// cclog.ComponentError("MetricRouter", err.Error())
// conditionMatches = false
// }
// if conditionMatches {
// return conditionMatches
// }
// }
// // No dropping condition met
// return false
// }
// func (r *metricRouter) prepareUnit(point lp.CCMessage) bool {
// if r.config.NormalizeUnits {
// if in_unit, ok := point.GetMeta("unit"); ok {
// u := units.NewUnit(in_unit)
// if u.Valid() {
// point.AddMeta("unit", u.Short())
// }
// }
// }
// if newP, ok := r.config.ChangeUnitPrefix[point.Name()]; ok {
// newPrefix := units.NewPrefix(newP)
// if in_unit, ok := point.GetMeta("unit"); ok && newPrefix != units.InvalidPrefix {
// u := units.NewUnit(in_unit)
// if u.Valid() {
// cclog.ComponentDebug("MetricRouter", "Change prefix to", newP, "for metric", point.Name())
// conv, out_unit := units.GetUnitPrefixFactor(u, newPrefix)
// if conv != nil && out_unit.Valid() {
// if val, ok := point.GetField("value"); ok {
// point.AddField("value", conv(val))
// point.AddMeta("unit", out_unit.Short())
// }
// }
// }
// }
// }
// return true
// }
if m.Condition == "*" {
conditionMatches = true
} else {
var err error
conditionMatches, err = r.EvalCondition(m.Condition, point)
if err != nil {
cclog.ComponentError("MetricRouter", err.Error())
conditionMatches = false
}
}
if conditionMatches {
point.RemoveTag(m.Key)
}
}
}
// Start starts the metric router
func (r *metricRouter) Start() {
// start timer if configured
r.timestamp = time.Now()
timeChan := make(chan time.Time)
if r.config.IntervalStamp {
r.ticker.AddChannel(timeChan)
r.StartTimer()
}
// Router manager is done
@@ -303,114 +220,47 @@ func (r *metricRouter) Start() {
// Forward takes a received metric, adds or deletes tags
// and forwards it to the output channels
// forward := func(point lp.CCMessage) {
// cclog.ComponentDebug("MetricRouter", "FORWARD", point)
// r.DoAddTags(point)
// r.DoDelTags(point)
// name := point.Name()
// if new, ok := r.config.RenameMetrics[name]; ok {
// point.SetName(new)
// point.AddMeta("oldname", name)
// r.DoAddTags(point)
// r.DoDelTags(point)
// }
// r.prepareUnit(point)
// for _, o := range r.outputs {
// o <- point
// }
// }
// Foward message received from collector channel
coll_forward := func(p lp.CCMessage) {
// receive from metric collector
//p.AddTag(r.config.HostnameTagName, r.hostname)
if r.config.IntervalStamp {
p.SetTime(r.timestamp)
}
m, err := r.mp.ProcessMessage(p)
if err == nil && m != nil {
for _, o := range r.outputs {
o <- m
}
}
// if !r.dropMetric(p) {
// for _, o := range r.outputs {
// o <- point
// }
// }
// even if the metric is dropped, it is stored in the cache for
// aggregations
if r.config.NumCacheIntervals > 0 {
r.cache.Add(m)
}
}
// Forward message received from receivers channel
recv_forward := func(p lp.CCMessage) {
// receive from receive manager
if r.config.IntervalStamp {
p.SetTime(r.timestamp)
}
m, err := r.mp.ProcessMessage(p)
if err == nil && m != nil {
for _, o := range r.outputs {
o <- m
}
}
// if !r.dropMetric(p) {
// forward(p)
// }
}
// Forward message received from cache channel
cache_forward := func(p lp.CCMessage) {
// receive from metric collector
m, err := r.mp.ProcessMessage(p)
if err == nil && m != nil {
for _, o := range r.outputs {
o <- m
}
forward := func(point lp.CCMetric) {
cclog.ComponentDebug("MetricRouter", "FORWARD", point)
r.DoAddTags(point)
r.DoDelTags(point)
for _, o := range r.outputs {
o <- point
}
}
// Start Metric Cache
if r.config.NumCacheIntervals > 0 {
r.cache.Start()
}
r.cache.Start()
r.wg.Add(1)
go func() {
defer r.wg.Done()
for {
select {
case <-r.done:
done()
return
case timestamp := <-timeChan:
r.timestamp = timestamp
cclog.ComponentDebug("MetricRouter", "Update timestamp", r.timestamp.UnixNano())
case p := <-r.coll_input:
coll_forward(p)
for i := 0; len(r.coll_input) > 0 && i < (r.maxForward-1); i++ {
coll_forward(<-r.coll_input)
// receive from metric collector
p.AddTag("hostname", r.hostname)
if r.config.IntervalStamp {
p.SetTime(r.timestamp)
}
forward(p)
r.cache.Add(p)
case p := <-r.recv_input:
recv_forward(p)
for i := 0; len(r.recv_input) > 0 && i < (r.maxForward-1); i++ {
recv_forward(<-r.recv_input)
// receive from receive manager
if r.config.IntervalStamp {
p.SetTime(r.timestamp)
}
forward(p)
case p := <-r.cache_input:
cache_forward(p)
for i := 0; len(r.cache_input) > 0 && i < (r.maxForward-1); i++ {
cache_forward(<-r.cache_input)
}
// receive from metric collector
p.AddTag("hostname", r.hostname)
forward(p)
}
}
}()
@@ -418,17 +268,17 @@ func (r *metricRouter) Start() {
}
// AddCollectorInput adds a channel between metric collector and metric router
func (r *metricRouter) AddCollectorInput(input chan lp.CCMessage) {
func (r *metricRouter) AddCollectorInput(input chan lp.CCMetric) {
r.coll_input = input
}
// AddReceiverInput adds a channel between metric receiver and metric router
func (r *metricRouter) AddReceiverInput(input chan lp.CCMessage) {
func (r *metricRouter) AddReceiverInput(input chan lp.CCMetric) {
r.recv_input = input
}
// AddOutput adds a output channel to the metric router
func (r *metricRouter) AddOutput(output chan lp.CCMessage) {
func (r *metricRouter) AddOutput(output chan lp.CCMetric) {
r.outputs = append(r.outputs, output)
}
@@ -438,19 +288,20 @@ func (r *metricRouter) Close() {
r.done <- true
// wait for close of channel r.done
<-r.done
// stop metric cache
if r.config.NumCacheIntervals > 0 {
cclog.ComponentDebug("MetricRouter", "CACHE CLOSE")
r.cache.Close()
r.cachewg.Wait()
if r.config.IntervalStamp {
cclog.ComponentDebug("MetricRouter", "TIMER CLOSE")
r.timerdone <- true
// wait for close of channel r.timerdone
<-r.timerdone
}
r.cache.Close()
r.cachewg.Wait()
}
// New creates a new initialized metric router
func New(ticker mct.MultiChanTicker, wg *sync.WaitGroup, routerConfig json.RawMessage) (MetricRouter, error) {
func New(ticker mct.MultiChanTicker, wg *sync.WaitGroup, routerConfigFile string) (MetricRouter, error) {
r := new(metricRouter)
err := r.Init(ticker, wg, routerConfig)
err := r.Init(ticker, wg, routerConfigFile)
if err != nil {
return nil, err
}

View File

@@ -1,14 +1,3 @@
<!--
---
title: Multi-channel Ticker
description: Timer ticker that sends out the tick to multiple channels
categories: [cc-metric-collector]
tags: ['Developer']
weight: 1
hugo_path: docs/reference/cc-metric-collector/pkg/multichanticker/_index.md
---
-->
# MultiChanTicker
The idea of this ticker is to multiply the output channels. The original Golang `time.Ticker` provides only a single output channel, so the signal can only be received by a single other class. This ticker allows to add multiple channels which get all notified about the time tick.

View File

@@ -1,16 +1,9 @@
// Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
// All rights reserved. This file is part of cc-lib.
// Use of this source code is governed by a MIT-style
// license that can be found in the LICENSE file.
// additional authors:
// Holger Obermaier (NHR@KIT)
package multiChanTicker
import (
"time"
cclog "github.com/ClusterCockpit/cc-lib/ccLogger"
cclog "github.com/ClusterCockpit/cc-metric-collector/internal/ccLogger"
)
type multiChanTicker struct {

View File

@@ -1,10 +1,3 @@
// Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
// All rights reserved. This file is part of cc-lib.
// Use of this source code is governed by a MIT-style
// license that can be found in the LICENSE file.
// additional authors:
// Holger Obermaier (NHR@KIT)
package main
import (
@@ -14,24 +7,40 @@ import (
"os/signal"
"syscall"
"github.com/ClusterCockpit/cc-lib/receivers"
"github.com/ClusterCockpit/cc-lib/sinks"
"github.com/ClusterCockpit/cc-metric-collector/collectors"
"github.com/ClusterCockpit/cc-metric-collector/receivers"
"github.com/ClusterCockpit/cc-metric-collector/sinks"
// "strings"
"sync"
"time"
ccconf "github.com/ClusterCockpit/cc-lib/ccConfig"
cclog "github.com/ClusterCockpit/cc-lib/ccLogger"
lp "github.com/ClusterCockpit/cc-lib/ccMessage"
cclog "github.com/ClusterCockpit/cc-metric-collector/internal/ccLogger"
lp "github.com/ClusterCockpit/cc-metric-collector/internal/ccMetric"
mr "github.com/ClusterCockpit/cc-metric-collector/internal/metricRouter"
mct "github.com/ClusterCockpit/cc-metric-collector/pkg/multiChanTicker"
mct "github.com/ClusterCockpit/cc-metric-collector/internal/multiChanTicker"
)
type CentralConfigFile struct {
Interval string `json:"interval"`
Duration string `json:"duration"`
Interval int `json:"interval"`
Duration int `json:"duration"`
Pidfile string `json:"pidfile,omitempty"`
CollectorConfigFile string `json:"collectors"`
RouterConfigFile string `json:"router"`
SinkConfigFile string `json:"sinks"`
ReceiverConfigFile string `json:"receivers,omitempty"`
}
func LoadCentralConfiguration(file string, config *CentralConfigFile) error {
configFile, err := os.Open(file)
if err != nil {
cclog.Error(err.Error())
return err
}
defer configFile.Close()
jsonParser := json.NewDecoder(configFile)
err = jsonParser.Decode(config)
return err
}
type RuntimeConfig struct {
@@ -46,7 +55,7 @@ type RuntimeConfig struct {
ReceiveManager receivers.ReceiveManager
MultiChanTicker mct.MultiChanTicker
Channels []chan lp.CCMessage
Channels []chan lp.CCMetric
Sync sync.WaitGroup
}
@@ -78,18 +87,25 @@ func ReadCli() map[string]string {
var m map[string]string
cfg := flag.String("config", "./config.json", "Path to configuration file")
logfile := flag.String("log", "stderr", "Path for logfile")
pidfile := flag.String("pidfile", "/var/run/cc-metric-collector.pid", "Path for PID file")
once := flag.Bool("once", false, "Run all collectors only once")
loglevel := flag.String("loglevel", "info", "Set log level")
debug := flag.Bool("debug", false, "Activate debug output")
flag.Parse()
m = make(map[string]string)
m["configfile"] = *cfg
m["logfile"] = *logfile
m["pidfile"] = *pidfile
if *once {
m["once"] = "true"
} else {
m["once"] = "false"
}
m["loglevel"] = *loglevel
if *debug {
m["debug"] = "true"
cclog.SetDebug()
} else {
m["debug"] = "false"
}
return m
}
@@ -109,6 +125,25 @@ func ReadCli() map[string]string {
// return nil
//}
//func CreatePidfile(pidfile string) error {
// file, err := os.OpenFile(pidfile, os.O_CREATE|os.O_RDWR, 0600)
// if err != nil {
// log.Print(err)
// return err
// }
// file.Write([]byte(fmt.Sprintf("%d", os.Getpid())))
// file.Close()
// return nil
//}
//func RemovePidfile(pidfile string) error {
// info, err := os.Stat(pidfile)
// if !os.IsNotExist(err) && !info.IsDir() {
// os.Remove(pidfile)
// }
// return nil
//}
// General shutdownHandler function that gets executed in case of interrupt or graceful shutdownHandler
func shutdownHandler(config *RuntimeConfig, shutdownSignal chan os.Signal) {
defer config.Sync.Done()
@@ -139,6 +174,11 @@ func shutdownHandler(config *RuntimeConfig, shutdownSignal chan os.Signal) {
cclog.Debug("Shutdown SinkManager...")
config.SinkManager.Close()
}
// pidfile := config.ConfigFile.Pidfile
// RemovePidfile(pidfile)
// pidfile = config.CliArgs["pidfile"]
// RemovePidfile(pidfile)
}
func mainFunc() int {
@@ -154,118 +194,89 @@ func mainFunc() int {
CliArgs: ReadCli(),
}
// Set loglevel based on command line input.
cclog.Init(rcfg.CliArgs["loglevel"], false)
// Init ccConfig with configuration file
ccconf.Init(rcfg.CliArgs["configfile"])
// Load and check configuration
main := ccconf.GetPackageConfig("main")
err = json.Unmarshal(main, &rcfg.ConfigFile)
err = LoadCentralConfiguration(rcfg.CliArgs["configfile"], &rcfg.ConfigFile)
if err != nil {
cclog.Error("Error reading configuration file ", rcfg.CliArgs["configfile"], ": ", err.Error())
return 1
}
// Properly use duration parser with inputs like '60s', '5m' or similar
if len(rcfg.ConfigFile.Interval) > 0 {
t, err := time.ParseDuration(rcfg.ConfigFile.Interval)
if err != nil {
cclog.Error("Configuration value 'interval' no valid duration")
}
rcfg.Interval = t
if rcfg.Interval == 0 {
cclog.Error("Configuration value 'interval' must be greater than zero")
return 1
}
}
// Properly use duration parser with inputs like '60s', '5m' or similar
if len(rcfg.ConfigFile.Duration) > 0 {
t, err := time.ParseDuration(rcfg.ConfigFile.Duration)
if err != nil {
cclog.Error("Configuration value 'duration' no valid duration")
}
rcfg.Duration = t
if rcfg.Duration == 0 {
cclog.Error("Configuration value 'duration' must be greater than zero")
return 1
}
}
if rcfg.Duration > rcfg.Interval {
cclog.Error("The interval should be greater than duration")
if rcfg.ConfigFile.Interval <= 0 || time.Duration(rcfg.ConfigFile.Interval)*time.Second <= 0 {
cclog.Error("Configuration value 'interval' must be greater than zero")
return 1
}
rcfg.Interval = time.Duration(rcfg.ConfigFile.Interval) * time.Second
if rcfg.ConfigFile.Duration <= 0 || time.Duration(rcfg.ConfigFile.Duration)*time.Second <= 0 {
cclog.Error("Configuration value 'duration' must be greater than zero")
return 1
}
rcfg.Duration = time.Duration(rcfg.ConfigFile.Duration) * time.Second
routerConf := ccconf.GetPackageConfig("router")
if len(routerConf) == 0 {
if len(rcfg.ConfigFile.RouterConfigFile) == 0 {
cclog.Error("Metric router configuration file must be set")
return 1
}
sinkConf := ccconf.GetPackageConfig("sinks")
if len(sinkConf) == 0 {
if len(rcfg.ConfigFile.SinkConfigFile) == 0 {
cclog.Error("Sink configuration file must be set")
return 1
}
collectorConf := ccconf.GetPackageConfig("collectors")
if len(collectorConf) == 0 {
if len(rcfg.ConfigFile.CollectorConfigFile) == 0 {
cclog.Error("Metric collector configuration file must be set")
return 1
}
// err = CreatePidfile(rcfg.CliArgs["pidfile"])
// Set log file
// if logfile := rcfg.CliArgs["logfile"]; logfile != "stderr" {
// cclog.SetOutput(logfile)
// }
if logfile := rcfg.CliArgs["logfile"]; logfile != "stderr" {
cclog.SetOutput(logfile)
}
// Creat new multi channel ticker
rcfg.MultiChanTicker = mct.NewTicker(rcfg.Interval)
// Create new metric router
rcfg.MetricRouter, err = mr.New(rcfg.MultiChanTicker, &rcfg.Sync, routerConf)
rcfg.MetricRouter, err = mr.New(rcfg.MultiChanTicker, &rcfg.Sync, rcfg.ConfigFile.RouterConfigFile)
if err != nil {
cclog.Error(err.Error())
return 1
}
// Create new sink
rcfg.SinkManager, err = sinks.New(&rcfg.Sync, sinkConf)
rcfg.SinkManager, err = sinks.New(&rcfg.Sync, rcfg.ConfigFile.SinkConfigFile)
if err != nil {
cclog.Error(err.Error())
return 1
}
// Connect metric router to sink manager
RouterToSinksChannel := make(chan lp.CCMessage, 200)
RouterToSinksChannel := make(chan lp.CCMetric, 200)
rcfg.SinkManager.AddInput(RouterToSinksChannel)
rcfg.MetricRouter.AddOutput(RouterToSinksChannel)
// Create new collector manager
rcfg.CollectManager, err = collectors.New(rcfg.MultiChanTicker, rcfg.Duration, &rcfg.Sync, collectorConf)
rcfg.CollectManager, err = collectors.New(rcfg.MultiChanTicker, rcfg.Duration, &rcfg.Sync, rcfg.ConfigFile.CollectorConfigFile)
if err != nil {
cclog.Error(err.Error())
return 1
}
// Connect collector manager to metric router
CollectToRouterChannel := make(chan lp.CCMessage, 200)
CollectToRouterChannel := make(chan lp.CCMetric, 200)
rcfg.CollectManager.AddOutput(CollectToRouterChannel)
rcfg.MetricRouter.AddCollectorInput(CollectToRouterChannel)
// Create new receive manager
receiveConf := ccconf.GetPackageConfig("receivers")
if len(receiveConf) > 0 {
rcfg.ReceiveManager, err = receivers.New(&rcfg.Sync, receiveConf)
if len(rcfg.ConfigFile.ReceiverConfigFile) > 0 {
rcfg.ReceiveManager, err = receivers.New(&rcfg.Sync, rcfg.ConfigFile.ReceiverConfigFile)
if err != nil {
cclog.Error(err.Error())
return 1
}
// Connect receive manager to metric router
ReceiveToRouterChannel := make(chan lp.CCMessage, 200)
ReceiveToRouterChannel := make(chan lp.CCMetric, 200)
rcfg.ReceiveManager.AddOutput(ReceiveToRouterChannel)
rcfg.MetricRouter.AddReceiverInput(ReceiveToRouterChannel)
use_recv = true
@@ -289,7 +300,7 @@ func mainFunc() int {
// Wait until one tick has passed. This is a workaround
if rcfg.CliArgs["once"] == "true" {
x := 1.2 * float64(rcfg.Interval.Seconds())
x := 1.2 * float64(rcfg.ConfigFile.Interval)
time.Sleep(time.Duration(int(x)) * time.Second)
shutdownSignal <- os.Interrupt
}

View File

@@ -1,470 +0,0 @@
// Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
// All rights reserved. This file is part of cc-lib.
// Use of this source code is governed by a MIT-style
// license that can be found in the LICENSE file.
// additional authors:
// Holger Obermaier (NHR@KIT)
package ccTopology
import (
"fmt"
"log"
"os"
"path/filepath"
"regexp"
"strconv"
"strings"
cclogger "github.com/ClusterCockpit/cc-lib/ccLogger"
"golang.org/x/exp/slices"
)
const SYSFS_CPUBASE = `/sys/devices/system/cpu`
// Structure holding all information about a hardware thread
// See https://www.kernel.org/doc/Documentation/ABI/stable/sysfs-devices-system-cpu
type HwthreadEntry struct {
// for each CPUx:
CpuID int // CPU / hardware thread ID
SMT int // Simultaneous Multithreading ID
CoreCPUsList []int // CPUs within the same core
Core int // Socket local core ID
Socket int // Sockets (physical) ID
Die int // Die ID
NumaDomain int // NUMA Domain
}
var cache struct {
HwthreadList []int // List of CPU hardware threads
SMTList []int // List of symmetric hyper threading IDs
CoreList []int // List of CPU core IDs
SocketList []int // List of CPU sockets (physical) IDs
DieList []int // List of CPU Die IDs
NumaDomainList []int // List of NUMA Domains
CpuData []HwthreadEntry
}
// fileToInt reads an integer value from a sysfs file
// In case of an error -1 is returned
func fileToInt(path string) int {
buffer, err := os.ReadFile(path)
if err != nil {
log.Print(err)
cclogger.ComponentError("ccTopology", "fileToInt", "Reading", path, ":", err.Error())
return -1
}
stringBuffer := strings.TrimSpace(string(buffer))
id, err := strconv.Atoi(stringBuffer)
if err != nil {
cclogger.ComponentError("ccTopology", "fileToInt", "Parsing", path, ":", stringBuffer, err.Error())
return -1
}
return id
}
// fileToList reads a list from a sysfs file
// A list consists of value ranges separated by colon
// A range can be a single value or a range of values given by a startValue-endValue
// In case of an error nil is returned
func fileToList(path string) []int {
// Read list
buffer, err := os.ReadFile(path)
if err != nil {
log.Print(err)
cclogger.ComponentError("ccTopology", "fileToList", "Reading", path, ":", err.Error())
return nil
}
// Create list
list := make([]int, 0)
stringBuffer := strings.TrimSpace(string(buffer))
for _, valueRangeString := range strings.Split(stringBuffer, ",") {
valueRange := strings.Split(valueRangeString, "-")
switch len(valueRange) {
case 1:
singleValue, err := strconv.Atoi(valueRange[0])
if err != nil {
cclogger.ComponentError("CCTopology", "fileToList", "Parsing", valueRange[0], ":", err.Error())
return nil
}
list = append(list, singleValue)
case 2:
startValue, err := strconv.Atoi(valueRange[0])
if err != nil {
cclogger.ComponentError("CCTopology", "fileToList", "Parsing", valueRange[0], ":", err.Error())
return nil
}
endValue, err := strconv.Atoi(valueRange[1])
if err != nil {
cclogger.ComponentError("CCTopology", "fileToList", "Parsing", valueRange[1], ":", err.Error())
return nil
}
for value := startValue; value <= endValue; value++ {
list = append(list, value)
}
}
}
return list
}
// init initializes the cache structure
func init() {
getHWThreads :=
func() []int {
globPath := filepath.Join(SYSFS_CPUBASE, "cpu[0-9]*")
regexPath := filepath.Join(SYSFS_CPUBASE, "cpu([[:digit:]]+)")
regex := regexp.MustCompile(regexPath)
// File globbing for hardware threads
files, err := filepath.Glob(globPath)
if err != nil {
cclogger.ComponentError("CCTopology", "init:getHWThreads", err.Error())
return nil
}
hwThreadIDs := make([]int, len(files))
for i, file := range files {
// Extract hardware thread ID
matches := regex.FindStringSubmatch(file)
if len(matches) != 2 {
cclogger.ComponentError("CCTopology", "init:getHWThreads: Failed to extract hardware thread ID from ", file)
return nil
}
// Convert hardware thread ID to int
id, err := strconv.Atoi(matches[1])
if err != nil {
cclogger.ComponentError("CCTopology", "init:getHWThreads: Failed to convert to int hardware thread ID ", matches[1])
return nil
}
hwThreadIDs[i] = id
}
// Sort hardware thread IDs
slices.Sort(hwThreadIDs)
return hwThreadIDs
}
getNumaDomain :=
func(basePath string) int {
globPath := filepath.Join(basePath, "node*")
regexPath := filepath.Join(basePath, "node([[:digit:]]+)")
regex := regexp.MustCompile(regexPath)
// File globbing for NUMA node
files, err := filepath.Glob(globPath)
if err != nil {
cclogger.ComponentError("CCTopology", "init:getNumaDomain", err.Error())
return -1
}
// Check, that exactly one NUMA domain was found
if len(files) != 1 {
cclogger.ComponentError("CCTopology", "init:getNumaDomain", "Number of NUMA domains != 1: ", len(files))
return -1
}
// Extract NUMA node ID
matches := regex.FindStringSubmatch(files[0])
if len(matches) != 2 {
cclogger.ComponentError("CCTopology", "init:getNumaDomain", "Failed to extract NUMA node ID from: ", files[0])
return -1
}
id, err := strconv.Atoi(matches[1])
if err != nil {
cclogger.ComponentError("CCTopology", "init:getNumaDomain", "Failed to parse NUMA node ID from: ", matches[1])
return -1
}
return id
}
cache.HwthreadList = getHWThreads()
cache.CoreList = make([]int, len(cache.HwthreadList))
cache.SocketList = make([]int, len(cache.HwthreadList))
cache.DieList = make([]int, len(cache.HwthreadList))
cache.SMTList = make([]int, len(cache.HwthreadList))
cache.NumaDomainList = make([]int, len(cache.HwthreadList))
cache.CpuData = make([]HwthreadEntry, len(cache.HwthreadList))
for i, c := range cache.HwthreadList {
// Set cpuBase directory for topology lookup
cpuBase := filepath.Join(SYSFS_CPUBASE, fmt.Sprintf("cpu%d", c))
topoBase := filepath.Join(cpuBase, "topology")
// Lookup Core ID
cache.CoreList[i] = fileToInt(filepath.Join(topoBase, "core_id"))
// Lookup socket / physical package ID
cache.SocketList[i] = fileToInt(filepath.Join(topoBase, "physical_package_id"))
// Lookup CPU die id
cache.DieList[i] = fileToInt(filepath.Join(topoBase, "die_id"))
if cache.DieList[i] < 0 {
cache.DieList[i] = cache.SocketList[i]
}
// Lookup List of CPUs within the same core
coreCPUsList := fileToList(filepath.Join(topoBase, "core_cpus_list"))
// Find index of CPU ID in List of CPUs within the same core
// if not found return -1
cache.SMTList[i] = slices.Index(coreCPUsList, c)
// Lookup NUMA domain id
cache.NumaDomainList[i] = getNumaDomain(cpuBase)
cache.CpuData[i] =
HwthreadEntry{
CpuID: cache.HwthreadList[i],
SMT: cache.SMTList[i],
CoreCPUsList: coreCPUsList,
Socket: cache.SocketList[i],
NumaDomain: cache.NumaDomainList[i],
Die: cache.DieList[i],
Core: cache.CoreList[i],
}
}
slices.Sort(cache.HwthreadList)
cache.HwthreadList = slices.Compact(cache.HwthreadList)
slices.Sort(cache.SMTList)
cache.SMTList = slices.Compact(cache.SMTList)
slices.Sort(cache.CoreList)
cache.CoreList = slices.Compact(cache.CoreList)
slices.Sort(cache.SocketList)
cache.SocketList = slices.Compact(cache.SocketList)
slices.Sort(cache.DieList)
cache.DieList = slices.Compact(cache.DieList)
slices.Sort(cache.NumaDomainList)
cache.NumaDomainList = slices.Compact(cache.NumaDomainList)
}
// SocketList gets the list of CPU socket IDs
func SocketList() []int {
return slices.Clone(cache.SocketList)
}
// HwthreadList gets the list of hardware thread IDs in the order of listing in /proc/cpuinfo
func HwthreadList() []int {
return slices.Clone(cache.HwthreadList)
}
// Get list of hardware thread IDs in the order of listing in /proc/cpuinfo
// Deprecated! Use HwthreadList()
func CpuList() []int {
return HwthreadList()
}
// CoreList gets the list of CPU core IDs in the order of listing in /proc/cpuinfo
func CoreList() []int {
return slices.Clone(cache.CoreList)
}
// Get list of NUMA node IDs
func NumaNodeList() []int {
return slices.Clone(cache.NumaDomainList)
}
// DieList gets the list of CPU die IDs
func DieList() []int {
if len(cache.DieList) > 0 {
return slices.Clone(cache.DieList)
}
return SocketList()
}
// GetTypeList gets the list of specified type using the naming format inside ClusterCockpit
func GetTypeList(topology_type string) []int {
switch topology_type {
case "node":
return []int{0}
case "socket":
return SocketList()
case "die":
return DieList()
case "memoryDomain":
return NumaNodeList()
case "core":
return CoreList()
case "hwthread":
return HwthreadList()
}
return []int{}
}
func GetTypeId(hwt HwthreadEntry, topology_type string) (int, error) {
var err error = nil
switch topology_type {
case "node":
return 0, err
case "socket":
return hwt.Socket, err
case "die":
return hwt.Die, err
case "memoryDomain":
return hwt.NumaDomain, err
case "core":
return hwt.Core, err
case "hwthread":
return hwt.CpuID, err
}
return -1, fmt.Errorf("unknown topology type '%s'", topology_type)
}
// CpuData returns CPU data for each hardware thread
func CpuData() []HwthreadEntry {
// return a deep copy to protect cache data
c := slices.Clone(cache.CpuData)
for i := range c {
c[i].CoreCPUsList = slices.Clone(cache.CpuData[i].CoreCPUsList)
}
return c
}
// Structure holding basic information about a CPU
type CpuInformation struct {
NumHWthreads int
SMTWidth int
NumSockets int
NumDies int
NumCores int
NumNumaDomains int
}
// CpuInformation reports basic information about the CPU
func CpuInfo() CpuInformation {
return CpuInformation{
NumNumaDomains: len(cache.NumaDomainList),
SMTWidth: len(cache.SMTList),
NumDies: len(cache.DieList),
NumCores: len(cache.CoreList),
NumSockets: len(cache.SocketList),
NumHWthreads: len(cache.HwthreadList),
}
}
// GetHwthreadSocket gets the CPU socket ID for a given hardware thread ID
// In case hardware thread ID is not found -1 is returned
func GetHwthreadSocket(cpuID int) int {
for i := range cache.CpuData {
d := &cache.CpuData[i]
if d.CpuID == cpuID {
return d.Socket
}
}
return -1
}
// GetHwthreadNumaDomain gets the NUMA domain ID for a given hardware thread ID
// In case hardware thread ID is not found -1 is returned
func GetHwthreadNumaDomain(cpuID int) int {
for i := range cache.CpuData {
d := &cache.CpuData[i]
if d.CpuID == cpuID {
return d.NumaDomain
}
}
return -1
}
// GetHwthreadDie gets the CPU die ID for a given hardware thread ID
// In case hardware thread ID is not found -1 is returned
func GetHwthreadDie(cpuID int) int {
for i := range cache.CpuData {
d := &cache.CpuData[i]
if d.CpuID == cpuID {
return d.Die
}
}
return -1
}
// GetHwthreadCore gets the CPU core ID for a given hardware thread ID
// In case hardware thread ID is not found -1 is returned
func GetHwthreadCore(cpuID int) int {
for i := range cache.CpuData {
d := &cache.CpuData[i]
if d.CpuID == cpuID {
return d.Core
}
}
return -1
}
// GetSocketHwthreads gets all hardware thread IDs associated with a CPU socket
func GetSocketHwthreads(socket int) []int {
cpuList := make([]int, 0)
for i := range cache.CpuData {
d := &cache.CpuData[i]
if d.Socket == socket {
cpuList = append(cpuList, d.CpuID)
}
}
return cpuList
}
// GetNumaDomainHwthreads gets the all hardware thread IDs associated with a NUMA domain
func GetNumaDomainHwthreads(numaDomain int) []int {
cpuList := make([]int, 0)
for i := range cache.CpuData {
d := &cache.CpuData[i]
if d.NumaDomain == numaDomain {
cpuList = append(cpuList, d.CpuID)
}
}
return cpuList
}
// GetDieHwthreads gets all hardware thread IDs associated with a CPU die
func GetDieHwthreads(die int) []int {
cpuList := make([]int, 0)
for i := range cache.CpuData {
d := &cache.CpuData[i]
if d.Die == die {
cpuList = append(cpuList, d.CpuID)
}
}
return cpuList
}
// GetCoreHwthreads get all hardware thread IDs associated with a CPU core
func GetCoreHwthreads(core int) []int {
cpuList := make([]int, 0)
for i := range cache.CpuData {
d := &cache.CpuData[i]
if d.Core == core {
cpuList = append(cpuList, d.CpuID)
}
}
return cpuList
}
// GetTypeList gets the list of specified type using the naming format inside ClusterCockpit
func GetTypeHwthreads(topology_type string, id int) []int {
switch topology_type {
case "node":
return HwthreadList()
case "socket":
return GetSocketHwthreads(id)
case "die":
return GetDieHwthreads(id)
case "memoryDomain":
return GetNumaDomainHwthreads(id)
case "core":
return GetCoreHwthreads(id)
case "hwthread":
return []int{id}
}
return []int{}
}

View File

@@ -1,44 +1,8 @@
{
"natsrecv": {
[
{
"type": "nats",
"address": "nats://my-url",
"port": "4222",
"port" : "4222",
"database": "testcluster"
},
"redfish_recv": {
"type": "redfish",
"endpoint": "https://%h-bmc",
"client_config": [
{
"host_list": "my-host-1-[1-2]",
"username": "username-1",
"password": "password-1"
},
{
"host_list": "my-host-2-[1,2]",
"username": "username-2",
"password": "password-2"
}
]
},
"ipmi_recv": {
"type": "ipmi",
"endpoint": "ipmi-sensors://%h-ipmi",
"exclude_metrics": [
"fan_speed",
"voltage"
],
"client_config": [
{
"username": "username-1",
"password": "password-1",
"host_list": "my-host-1-[1-2]"
},
{
"username": "username-2",
"password": "password-2",
"host_list": "my-host-2-[1,2]"
}
]
}
}
]

44
receivers/README.md Normal file
View File

@@ -0,0 +1,44 @@
# CCMetric receivers
This folder contains the ReceiveManager and receiver implementations for the cc-metric-collector.
# Configuration
The configuration file for the receivers is a list of configurations. The `type` field in each specifies which receiver to initialize.
```json
[
{
"type": "nats",
"address": "nats://my-url",
"port" : "4222",
"database": "testcluster"
}
]
```
## Type `nats`
```json
{
"type": "nats",
"address": "<nats-URI or hostname>",
"port" : "<portnumber>",
"database": "<subscribe topic>"
}
```
The `nats` receiver subscribes to the topic `database` and listens on `address` and `port` for metrics in the InfluxDB line protocol.
# Contributing own receivers
A receiver contains three functions and is derived from the type `Receiver` (in `metricReceiver.go`):
* `Init(config ReceiverConfig) error`
* `Start() error`
* `Close()`
* `Name() string`
* `SetSink(sink chan ccMetric.CCMetric)`
The data structures should be set up in `Init()` like opening a file or server connection. The `Start()` function should either start a go routine or issue some other asynchronous mechanism for receiving metrics. The `Close()` function should tear down anything created in `Init()`.
Finally, the receiver needs to be registered in the `receiveManager.go`. There is a list of receivers called `AvailableReceivers` which is a map (`receiver_type_string` -> `pointer to Receiver interface`). Add a new entry with a descriptive name and the new receiver.

View File

@@ -0,0 +1,56 @@
package receivers
import (
// "time"
lp "github.com/ClusterCockpit/cc-metric-collector/internal/ccMetric"
influx "github.com/influxdata/line-protocol"
)
type ReceiverConfig struct {
Addr string `json:"address"`
Port string `json:"port"`
Database string `json:"database"`
Organization string `json:"organization,omitempty"`
Type string `json:"type"`
}
type receiver struct {
name string
addr string
port string
database string
organization string
sink chan lp.CCMetric
}
type Receiver interface {
Init(config ReceiverConfig) error
Start()
Close()
Name() string
SetSink(sink chan lp.CCMetric)
}
func (r *receiver) Name() string {
return r.name
}
func (r *receiver) SetSink(sink chan lp.CCMetric) {
r.sink = sink
}
func Tags2Map(metric influx.Metric) map[string]string {
tags := make(map[string]string)
for _, t := range metric.TagList() {
tags[t.Key] = t.Value
}
return tags
}
func Fields2Map(metric influx.Metric) map[string]interface{} {
fields := make(map[string]interface{})
for _, f := range metric.FieldList() {
fields[f.Key] = f.Value
}
return fields
}

90
receivers/natsReceiver.go Normal file
View File

@@ -0,0 +1,90 @@
package receivers
import (
"errors"
"fmt"
lp "github.com/ClusterCockpit/cc-metric-collector/internal/ccMetric"
influx "github.com/influxdata/line-protocol"
nats "github.com/nats-io/nats.go"
cclog "github.com/ClusterCockpit/cc-metric-collector/internal/ccLogger"
"time"
)
type NatsReceiverConfig struct {
Addr string `json:"address"`
Port string `json:"port"`
Database string `json:"database"`
}
type NatsReceiver struct {
receiver
nc *nats.Conn
handler *influx.MetricHandler
parser *influx.Parser
meta map[string]string
config ReceiverConfig
}
var DefaultTime = func() time.Time {
return time.Unix(42, 0)
}
func (r *NatsReceiver) Init(config ReceiverConfig) error {
r.name = "NatsReceiver"
r.config = config
if len(r.config.Addr) == 0 ||
len(r.config.Port) == 0 ||
len(r.config.Database) == 0 {
return errors.New("Not all configuration variables set required by NatsReceiver")
}
r.meta = map[string]string{"source": r.name}
r.addr = r.config.Addr
if len(r.addr) == 0 {
r.addr = nats.DefaultURL
}
r.port = r.config.Port
if len(r.port) == 0 {
r.port = "4222"
}
uri := fmt.Sprintf("%s:%s", r.addr, r.port)
cclog.ComponentDebug("NatsReceiver", "INIT", uri)
nc, err := nats.Connect(uri)
if err == nil {
r.database = r.config.Database
r.nc = nc
} else {
r.nc = nil
return err
}
r.handler = influx.NewMetricHandler()
r.parser = influx.NewParser(r.handler)
r.parser.SetTimeFunc(DefaultTime)
return err
}
func (r *NatsReceiver) Start() {
cclog.ComponentDebug("NatsReceiver", "START")
r.nc.Subscribe(r.database, r._NatsReceive)
}
func (r *NatsReceiver) _NatsReceive(m *nats.Msg) {
metrics, err := r.parser.Parse(m.Data)
if err == nil {
for _, m := range metrics {
y := lp.FromInfluxMetric(m)
for k, v := range r.meta {
y.AddMeta(k, v)
}
if r.sink != nil {
r.sink <- y
}
}
}
}
func (r *NatsReceiver) Close() {
if r.nc != nil {
cclog.ComponentDebug("NatsReceiver", "CLOSE")
r.nc.Close()
}
}

113
receivers/receiveManager.go Normal file
View File

@@ -0,0 +1,113 @@
package receivers
import (
"encoding/json"
"os"
"sync"
cclog "github.com/ClusterCockpit/cc-metric-collector/internal/ccLogger"
lp "github.com/ClusterCockpit/cc-metric-collector/internal/ccMetric"
)
var AvailableReceivers = map[string]Receiver{
"nats": &NatsReceiver{},
}
type receiveManager struct {
inputs []Receiver
output chan lp.CCMetric
done chan bool
wg *sync.WaitGroup
config []ReceiverConfig
}
type ReceiveManager interface {
Init(wg *sync.WaitGroup, receiverConfigFile string) error
AddInput(rawConfig json.RawMessage) error
AddOutput(output chan lp.CCMetric)
Start()
Close()
}
func (rm *receiveManager) Init(wg *sync.WaitGroup, receiverConfigFile string) error {
rm.inputs = make([]Receiver, 0)
rm.output = nil
rm.done = make(chan bool)
rm.wg = wg
rm.config = make([]ReceiverConfig, 0)
configFile, err := os.Open(receiverConfigFile)
if err != nil {
cclog.ComponentError("ReceiveManager", err.Error())
return err
}
defer configFile.Close()
jsonParser := json.NewDecoder(configFile)
var rawConfigs []json.RawMessage
err = jsonParser.Decode(&rawConfigs)
if err != nil {
cclog.ComponentError("ReceiveManager", err.Error())
return err
}
for _, raw := range rawConfigs {
rm.AddInput(raw)
}
return nil
}
func (rm *receiveManager) Start() {
rm.wg.Add(1)
for _, r := range rm.inputs {
cclog.ComponentDebug("ReceiveManager", "START", r.Name())
r.Start()
}
cclog.ComponentDebug("ReceiveManager", "STARTED")
}
func (rm *receiveManager) AddInput(rawConfig json.RawMessage) error {
var config ReceiverConfig
err := json.Unmarshal(rawConfig, &config)
if err != nil {
cclog.ComponentError("ReceiveManager", "SKIP", config.Type, "JSON config error:", err.Error())
return err
}
if _, found := AvailableReceivers[config.Type]; !found {
cclog.ComponentError("ReceiveManager", "SKIP", config.Type, "unknown receiver:", err.Error())
return err
}
r := AvailableReceivers[config.Type]
err = r.Init(config)
if err != nil {
cclog.ComponentError("ReceiveManager", "SKIP", r.Name(), "initialization failed:", err.Error())
return err
}
rm.inputs = append(rm.inputs, r)
rm.config = append(rm.config, config)
cclog.ComponentDebug("ReceiveManager", "ADD RECEIVER", r.Name())
return nil
}
func (rm *receiveManager) AddOutput(output chan lp.CCMetric) {
rm.output = output
for _, r := range rm.inputs {
r.SetSink(rm.output)
}
}
func (rm *receiveManager) Close() {
for _, r := range rm.inputs {
cclog.ComponentDebug("ReceiveManager", "CLOSE", r.Name())
r.Close()
}
rm.wg.Done()
cclog.ComponentDebug("ReceiveManager", "CLOSE")
}
func New(wg *sync.WaitGroup, receiverConfigFile string) (ReceiveManager, error) {
r := &receiveManager{}
err := r.Init(wg, receiverConfigFile)
if err != nil {
return nil, err
}
return r, err
}

View File

@@ -1,23 +1,22 @@
{
"process_messages" : {
"add_tag_if": [
{
"key" : "cluster",
"value" : "testcluster",
"if" : "true"
},
{
"key" : "test",
"value" : "testing",
"if" : "name == 'temp_package_id_0'"
}
],
"delete_tag_if": [
{
"key" : "unit",
"if" : "true"
}
]
},
"add_tags" : [
{
"key" : "cluster",
"value" : "testcluster",
"if" : "*"
},
{
"key" : "test",
"value" : "testing",
"if" : "name == 'temp_package_id_0'"
}
],
"delete_tags" : [
{
"key" : "unit",
"value" : "*",
"if" : "*"
}
],
"interval_timestamp" : true
}

View File

@@ -6,7 +6,7 @@ CC_HOME=/tmp
LOG_DIR=/var/log
DATA_DIR=/var/lib/cc-metric-collector
DATA_DIR=/var/lib/grafana
MAX_OPEN_FILES=10000
@@ -16,8 +16,5 @@ CONF_FILE=/etc/cc-metric-collector/cc-metric-collector.json
RESTART_ON_UPGRADE=true
# Golang runtime debugging. (see: https://pkg.go.dev/runtime)
# GODEBUG=gctrace=1
# Golang garbage collection target percentage
# GOGC=100
# Only used on systemd systems
PID_FILE_DIR=/var/run

View File

@@ -1,14 +0,0 @@
Package: cc-metric-collector
Section: misc
Priority: optional
Version: {VERSION}
Installed-Size: {INSTALLED_SIZE}
Architecture: {ARCH}
Maintainer: thomas.gruber@fau.de
Depends: libc6 (>= 2.2.1)
Build-Depends: debhelper-compat (= 13), git, golang-go
Description: Metric collection daemon from the ClusterCockpit suite
Homepage: https://github.com/ClusterCockpit/cc-metric-collector
Source: cc-metric-collector
Rules-Requires-Root: no

Some files were not shown because too many files have changed in this diff Show More