mirror of
https://github.com/ClusterCockpit/cc-metric-collector.git
synced 2025-07-19 19:31:41 +02:00
Compare commits
2 Commits
nats_nkey_
...
likwid_col
Author | SHA1 | Date | |
---|---|---|---|
|
9ca73a9f50 | ||
|
0186dce521 |
22
.github/workflows/Release.yml
vendored
22
.github/workflows/Release.yml
vendored
@@ -44,16 +44,16 @@ jobs:
|
||||
# Use dnf to install build dependencies
|
||||
- name: Install build dependencies
|
||||
run: |
|
||||
wget -q https://go.dev/dl/go1.22.4.linux-amd64.tar.gz --output-document=- | \
|
||||
tar --directory=/usr/local --extract --gzip
|
||||
export PATH=/usr/local/go/bin:/usr/local/go/pkg/tool/linux_amd64:$PATH
|
||||
go version
|
||||
dnf --assumeyes install \
|
||||
http://mirror.centos.org/centos/8-stream/AppStream/x86_64/os/Packages/golang-1.21.7-1.module_el8+960+4060efbe.x86_64.rpm \
|
||||
http://mirror.centos.org/centos/8-stream/AppStream/x86_64/os/Packages/golang-bin-1.21.7-1.module_el8+960+4060efbe.x86_64.rpm \
|
||||
http://mirror.centos.org/centos/8-stream/AppStream/x86_64/os/Packages/golang-src-1.21.7-1.module_el8+960+4060efbe.noarch.rpm \
|
||||
http://mirror.centos.org/centos/8-stream/AppStream/x86_64/os/Packages/go-toolset-1.21.7-1.module_el8+960+4060efbe.x86_64.rpm
|
||||
|
||||
- name: RPM build MetricCollector
|
||||
id: rpmbuild
|
||||
run: |
|
||||
git config --global --add safe.directory /__w/cc-metric-collector/cc-metric-collector
|
||||
export PATH=/usr/local/go/bin:/usr/local/go/pkg/tool/linux_amd64:$PATH
|
||||
make RPM
|
||||
|
||||
# AlmaLinux 8 is a derivative of Red Hat Enterprise Linux 8 (UBI8),
|
||||
@@ -114,16 +114,16 @@ jobs:
|
||||
# Use dnf to install build dependencies
|
||||
- name: Install build dependencies
|
||||
run: |
|
||||
wget -q https://go.dev/dl/go1.22.4.linux-amd64.tar.gz --output-document=- | \
|
||||
tar --directory=/usr/local --extract --gzip
|
||||
export PATH=/usr/local/go/bin:/usr/local/go/pkg/tool/linux_amd64:$PATH
|
||||
go version
|
||||
dnf --assumeyes --disableplugin=subscription-manager install \
|
||||
http://mirror.centos.org/centos/8-stream/AppStream/x86_64/os/Packages/golang-1.21.7-1.module_el8+960+4060efbe.x86_64.rpm \
|
||||
http://mirror.centos.org/centos/8-stream/AppStream/x86_64/os/Packages/golang-bin-1.21.7-1.module_el8+960+4060efbe.x86_64.rpm \
|
||||
http://mirror.centos.org/centos/8-stream/AppStream/x86_64/os/Packages/golang-src-1.21.7-1.module_el8+960+4060efbe.noarch.rpm \
|
||||
http://mirror.centos.org/centos/8-stream/AppStream/x86_64/os/Packages/go-toolset-1.21.7-1.module_el8+960+4060efbe.x86_64.rpm
|
||||
|
||||
- name: RPM build MetricCollector
|
||||
id: rpmbuild
|
||||
run: |
|
||||
git config --global --add safe.directory /__w/cc-metric-collector/cc-metric-collector
|
||||
export PATH=/usr/local/go/bin:/usr/local/go/pkg/tool/linux_amd64:$PATH
|
||||
make RPM
|
||||
|
||||
# See: https://github.com/actions/upload-artifact
|
||||
@@ -165,7 +165,7 @@ jobs:
|
||||
# Use official golang package
|
||||
- name: Install Golang
|
||||
run: |
|
||||
wget -q https://go.dev/dl/go1.22.4.linux-amd64.tar.gz --output-document=- | \
|
||||
wget -q https://go.dev/dl/go1.21.1.linux-amd64.tar.gz --output-document=- | \
|
||||
tar --directory=/usr/local --extract --gzip
|
||||
export PATH=/usr/local/go/bin:/usr/local/go/pkg/tool/linux_amd64:$PATH
|
||||
go version
|
||||
|
22
.github/workflows/runonce.yml
vendored
22
.github/workflows/runonce.yml
vendored
@@ -91,16 +91,16 @@ jobs:
|
||||
# Use dnf to install build dependencies
|
||||
- name: Install build dependencies
|
||||
run: |
|
||||
wget -q https://go.dev/dl/go1.22.4.linux-amd64.tar.gz --output-document=- | \
|
||||
tar --directory=/usr/local --extract --gzip
|
||||
export PATH=/usr/local/go/bin:/usr/local/go/pkg/tool/linux_amd64:$PATH
|
||||
go version
|
||||
dnf --assumeyes install \
|
||||
http://mirror.centos.org/centos/8-stream/AppStream/x86_64/os/Packages/golang-1.21.7-1.module_el8+960+4060efbe.x86_64.rpm \
|
||||
http://mirror.centos.org/centos/8-stream/AppStream/x86_64/os/Packages/golang-bin-1.21.7-1.module_el8+960+4060efbe.x86_64.rpm \
|
||||
http://mirror.centos.org/centos/8-stream/AppStream/x86_64/os/Packages/golang-src-1.21.7-1.module_el8+960+4060efbe.noarch.rpm \
|
||||
http://mirror.centos.org/centos/8-stream/AppStream/x86_64/os/Packages/go-toolset-1.21.7-1.module_el8+960+4060efbe.x86_64.rpm
|
||||
|
||||
- name: RPM build MetricCollector
|
||||
id: rpmbuild
|
||||
run: |
|
||||
git config --global --add safe.directory /__w/cc-metric-collector/cc-metric-collector
|
||||
export PATH=/usr/local/go/bin:/usr/local/go/pkg/tool/linux_amd64:$PATH
|
||||
make RPM
|
||||
|
||||
#
|
||||
@@ -129,16 +129,16 @@ jobs:
|
||||
# Use dnf to install build dependencies
|
||||
- name: Install build dependencies
|
||||
run: |
|
||||
wget -q https://go.dev/dl/go1.22.4.linux-amd64.tar.gz --output-document=- | \
|
||||
tar --directory=/usr/local --extract --gzip
|
||||
export PATH=/usr/local/go/bin:/usr/local/go/pkg/tool/linux_amd64:$PATH
|
||||
go version
|
||||
dnf --assumeyes --disableplugin=subscription-manager install \
|
||||
http://mirror.centos.org/centos/8-stream/AppStream/x86_64/os/Packages/golang-1.21.7-1.module_el8+960+4060efbe.x86_64.rpm \
|
||||
http://mirror.centos.org/centos/8-stream/AppStream/x86_64/os/Packages/golang-bin-1.21.7-1.module_el8+960+4060efbe.x86_64.rpm \
|
||||
http://mirror.centos.org/centos/8-stream/AppStream/x86_64/os/Packages/golang-src-1.21.7-1.module_el8+960+4060efbe.noarch.rpm \
|
||||
http://mirror.centos.org/centos/8-stream/AppStream/x86_64/os/Packages/go-toolset-1.21.7-1.module_el8+960+4060efbe.x86_64.rpm
|
||||
|
||||
- name: RPM build MetricCollector
|
||||
id: rpmbuild
|
||||
run: |
|
||||
git config --global --add safe.directory /__w/cc-metric-collector/cc-metric-collector
|
||||
export PATH=/usr/local/go/bin:/usr/local/go/pkg/tool/linux_amd64:$PATH
|
||||
make RPM
|
||||
|
||||
#
|
||||
@@ -165,7 +165,7 @@ jobs:
|
||||
# Use official golang package
|
||||
- name: Install Golang
|
||||
run: |
|
||||
wget -q https://go.dev/dl/go1.22.4.linux-amd64.tar.gz --output-document=- | \
|
||||
wget -q https://go.dev/dl/go1.21.1.linux-amd64.tar.gz --output-document=- | \
|
||||
tar --directory=/usr/local --extract --gzip
|
||||
export PATH=/usr/local/go/bin:/usr/local/go/pkg/tool/linux_amd64:$PATH
|
||||
go version
|
||||
|
@@ -12,7 +12,6 @@ import (
|
||||
"strconv"
|
||||
"strings"
|
||||
"time"
|
||||
|
||||
cclog "github.com/ClusterCockpit/cc-metric-collector/pkg/ccLogger"
|
||||
lp "github.com/ClusterCockpit/cc-metric-collector/pkg/ccMetric"
|
||||
)
|
||||
@@ -55,30 +54,15 @@ func (m *IpmiCollector) Init(config json.RawMessage) error {
|
||||
// Check if executables ipmitool or ipmisensors are found
|
||||
p, err := exec.LookPath(m.config.IpmitoolPath)
|
||||
if err == nil {
|
||||
command := exec.Command(p)
|
||||
err := command.Run()
|
||||
if err != nil {
|
||||
cclog.ComponentError(m.name, fmt.Sprintf("Failed to execute %s: %v", p, err.Error()))
|
||||
m.ipmitool = ""
|
||||
} else {
|
||||
m.ipmitool = p
|
||||
}
|
||||
m.ipmitool = p
|
||||
}
|
||||
p, err = exec.LookPath(m.config.IpmisensorsPath)
|
||||
if err == nil {
|
||||
command := exec.Command(p)
|
||||
err := command.Run()
|
||||
if err != nil {
|
||||
cclog.ComponentError(m.name, fmt.Sprintf("Failed to execute %s: %v", p, err.Error()))
|
||||
m.ipmisensors = ""
|
||||
} else {
|
||||
m.ipmisensors = p
|
||||
}
|
||||
m.ipmisensors = p
|
||||
}
|
||||
if len(m.ipmitool) == 0 && len(m.ipmisensors) == 0 {
|
||||
return errors.New("no usable IPMI reader found")
|
||||
return errors.New("no IPMI reader found")
|
||||
}
|
||||
|
||||
m.init = true
|
||||
return nil
|
||||
}
|
||||
@@ -135,8 +119,8 @@ func (m *IpmiCollector) readIpmiTool(cmd string, output chan lp.CCMetric) {
|
||||
cclog.ComponentError(
|
||||
m.name,
|
||||
fmt.Sprintf("readIpmiTool(): Failed to wait for the end of command \"%s\": %v\n", command.String(), err),
|
||||
fmt.Sprintf("readIpmiTool(): command stderr: \"%s\"\n", string(errMsg)),
|
||||
)
|
||||
cclog.ComponentError(m.name, fmt.Sprintf("readIpmiTool(): command stderr: \"%s\"\n", strings.TrimSpace(string(errMsg))))
|
||||
return
|
||||
}
|
||||
}
|
||||
|
@@ -374,6 +374,14 @@ func (m *LikwidCollector) takeMeasurement(evidx int, evset LikwidEventsetConfig,
|
||||
}
|
||||
defer watcher.Close()
|
||||
if len(m.config.LockfilePath) > 0 {
|
||||
if _, err := os.Stat(m.config.LockfilePath); os.IsNotExist(err) {
|
||||
file, err := os.Create(m.config.LockfilePath)
|
||||
if err != nil {
|
||||
cclog.ComponentError(m.name, "Cannot create lockfile", m.config.LockfilePath, ":", err.Error())
|
||||
return true, err
|
||||
}
|
||||
file.Close()
|
||||
}
|
||||
info, err := os.Stat(m.config.LockfilePath)
|
||||
if err != nil {
|
||||
return true, err
|
||||
@@ -382,9 +390,9 @@ func (m *LikwidCollector) takeMeasurement(evidx int, evset LikwidEventsetConfig,
|
||||
if uid != uint32(os.Getuid()) {
|
||||
usr, err := user.LookupId(fmt.Sprint(uid))
|
||||
if err == nil {
|
||||
return true, fmt.Errorf("Access to performance counters locked by %s", usr.Username)
|
||||
return true, fmt.Errorf("access to performance counters locked by %s", usr.Username)
|
||||
} else {
|
||||
return true, fmt.Errorf("Access to performance counters locked by %d", uid)
|
||||
return true, fmt.Errorf("access to performance counters locked by %d", uid)
|
||||
}
|
||||
}
|
||||
err = watcher.Add(m.config.LockfilePath)
|
||||
|
@@ -267,3 +267,45 @@ IPC PMC0/PMC1 -> {
|
||||
```
|
||||
|
||||
The script `scripts/likwid_perfgroup_to_cc_config.py` might help you.
|
||||
|
||||
### Internal structure
|
||||
|
||||
This section describes the internal structure of the `likwid` collector.
|
||||
|
||||
#### At initialization
|
||||
|
||||
After setting the defaults, the configuration is read.
|
||||
|
||||
Based on the configuration, the library is searched using `dlopen` to see whether it makes sense to proceed.
|
||||
|
||||
Next, the user-given metrics are tested to ensure they can be evaluated. For this, it creates a list of all user-given events/counters with the value `1.0` which is provided to the metric evaluator. The same is done for the global metrics by using the metric names with value `1.0`. If the evaluator does not fail, the metric can be evaluated and the collector initialization can proceed.
|
||||
|
||||
A separate thread is started to do the measurement. This is not done using a common goroutine but a real application thread with full control. This is required because LIKWID's access system tracks the processes of the using application and the PID should not change between measurements because that would require teardown and reopening of the access system.
|
||||
|
||||
With the separate thread, the access system is initialized by setting the user-given access mode and adding all hardware threads.
|
||||
|
||||
In general, LIKWID measures per hardware thread, but some counters are only available per topological entity, e.g. per CPU socket (often memory traffic), and only some hardware threads read them. For this, the collector gets the system topology through LIKWID and creates different mappings like 'hwthread to list offset' and others. With this, the hardware threads responsible for a topological entity can be determined because those read the counters of the per CPU socket units. These mappings are later used in the measurement phase.
|
||||
|
||||
In the end, we read the base CPU frequency of the system. It may be used in the metric evaluation.
|
||||
|
||||
#### Measurements
|
||||
|
||||
The reading of events is done by the separate application thread.
|
||||
|
||||
It traverses all configured event sets, creates valid LIKWID eventstrings out of them and passes them on to take a measurement. This could be done only once but when the LIKWID lock changes, LIKWID has to be completely reopened to provide access again. With this reopening, the already added event sets are gone.
|
||||
|
||||
LIKWID has its own locking mechanism using a lock file. It is not the content of the file that is of interest but its owner. In order to track changes of the file, a `fsnotify` watcher is installed on the file. If the file does not exist, it is created and consequently is owned by the same user as `cc-metric-collector`. The LikwidCollector has to watch the file on its own because LIKWID does not provide proper error handling for this.
|
||||
|
||||
Each call to the LIKWID library for loading the event set, setting up the counting facilities as well as starting and stopping of the counters is wrapped into lockfile checks to ensure no state change happens. If the file owner changed, the LikwidCollector cannot access the counters anymore, so no further operation can be done and the measurement stops.
|
||||
|
||||
Although start/stop would be sufficient, the LikwidCollector performs start, read, wait, read, `getLastResult`, stop. The reason might be "historic" but is not 100% clear anymore. The author failed to document it ;)
|
||||
|
||||
#### Metric evaluation
|
||||
|
||||
After each measurement, the metrics of the event set are directly evaluated. It updates the counter->result mapping with the new measurements, calls the evaluator and generates the `CCMetric` with the user-given settings if it should be published. Each metric name to result calculation is stored for the global metric evaluation, which is done as a final step.
|
||||
|
||||
#### Shutdown
|
||||
|
||||
Since each measurement involves a complete initialize-to-finalize cycle of the LIKWID library, only the topology module needs to be closed.
|
||||
|
||||
Moreover, the separate application thread is stopped.
|
@@ -12,8 +12,8 @@ The global file contains the paths to the other four files and some global optio
|
||||
"collectors" : "collectors.json",
|
||||
"receivers" : "receivers.json",
|
||||
"router" : "router.json",
|
||||
"interval": "10s",
|
||||
"duration": "1s"
|
||||
"interval": 10,
|
||||
"duration": 1
|
||||
}
|
||||
```
|
||||
|
||||
|
@@ -4,7 +4,6 @@ import (
|
||||
"encoding/json"
|
||||
"errors"
|
||||
"fmt"
|
||||
"os"
|
||||
"time"
|
||||
|
||||
cclog "github.com/ClusterCockpit/cc-metric-collector/pkg/ccLogger"
|
||||
@@ -14,13 +13,10 @@ import (
|
||||
)
|
||||
|
||||
type NatsReceiverConfig struct {
|
||||
Type string `json:"type"`
|
||||
Addr string `json:"address"`
|
||||
Port string `json:"port"`
|
||||
Subject string `json:"subject"`
|
||||
User string `json:"user,omitempty"`
|
||||
Password string `json:"password,omitempty"`
|
||||
NkeyFile string `json:"nkey_file,omitempty"`
|
||||
Type string `json:"type"`
|
||||
Addr string `json:"address"`
|
||||
Port string `json:"port"`
|
||||
Subject string `json:"subject"`
|
||||
}
|
||||
|
||||
type NatsReceiver struct {
|
||||
@@ -113,7 +109,6 @@ func (r *NatsReceiver) Close() {
|
||||
|
||||
// NewNatsReceiver creates a new Receiver which subscribes to messages from a NATS server
|
||||
func NewNatsReceiver(name string, config json.RawMessage) (Receiver, error) {
|
||||
var uinfo nats.Option = nil
|
||||
r := new(NatsReceiver)
|
||||
r.name = fmt.Sprintf("NatsReceiver(%s)", name)
|
||||
|
||||
@@ -138,22 +133,10 @@ func NewNatsReceiver(name string, config json.RawMessage) (Receiver, error) {
|
||||
"source": r.name,
|
||||
}
|
||||
|
||||
if len(r.config.User) > 0 && len(r.config.Password) > 0 {
|
||||
uinfo = nats.UserInfo(r.config.User, r.config.Password)
|
||||
} else if len(r.config.NkeyFile) > 0 {
|
||||
_, err := os.Stat(r.config.NkeyFile)
|
||||
if err == nil {
|
||||
uinfo = nats.UserCredentials(r.config.NkeyFile)
|
||||
} else {
|
||||
cclog.ComponentError(r.name, "NKEY file", r.config.NkeyFile, "does not exist: %v", err.Error())
|
||||
return nil, err
|
||||
}
|
||||
}
|
||||
|
||||
// Connect to NATS server
|
||||
url := fmt.Sprintf("nats://%s:%s", r.config.Addr, r.config.Port)
|
||||
cclog.ComponentDebug(r.name, "NewNatsReceiver", url, "Subject", r.config.Subject)
|
||||
if nc, err := nats.Connect(url, uinfo); err == nil {
|
||||
if nc, err := nats.Connect(url); err == nil {
|
||||
r.nc = nc
|
||||
} else {
|
||||
r.nc = nil
|
||||
|
@@ -10,10 +10,7 @@ The `nats` receiver can be used receive metrics from the NATS network. The `nats
|
||||
"type": "nats",
|
||||
"address" : "nats-server.example.org",
|
||||
"port" : "4222",
|
||||
"subject" : "subject",
|
||||
"user": "natsuser",
|
||||
"password": "natssecret",
|
||||
"nkey_file": "/path/to/nkey_file"
|
||||
"subject" : "subject"
|
||||
}
|
||||
}
|
||||
```
|
||||
@@ -22,9 +19,6 @@ The `nats` receiver can be used receive metrics from the NATS network. The `nats
|
||||
- `address`: Address of the NATS control server
|
||||
- `port`: Port of the NATS control server
|
||||
- `subject`: Subscribes to this subject and receive metrics
|
||||
- `user`: Connect to nats using this user
|
||||
- `password`: Connect to nats using this password
|
||||
- `nkey_file`: Path to credentials file with NKEY
|
||||
|
||||
### Debugging
|
||||
|
||||
|
@@ -17,7 +17,7 @@ This folder contains the SinkManager and sink implementations for the cc-metric-
|
||||
The configuration file for the sinks is a list of configurations. The `type` field in each specifies which sink to initialize.
|
||||
|
||||
```json
|
||||
{
|
||||
[
|
||||
"mystdout" : {
|
||||
"type" : "stdout",
|
||||
"meta_as_tags" : [
|
||||
@@ -31,7 +31,7 @@ The configuration file for the sinks is a list of configurations. The `type` fie
|
||||
"database" : "ccmetric",
|
||||
"password" : "<jwt token>"
|
||||
}
|
||||
}
|
||||
]
|
||||
```
|
||||
|
||||
|
||||
|
@@ -5,7 +5,6 @@ import (
|
||||
"encoding/json"
|
||||
"errors"
|
||||
"fmt"
|
||||
"os"
|
||||
"sync"
|
||||
"time"
|
||||
|
||||
@@ -23,7 +22,6 @@ type NatsSinkConfig struct {
|
||||
User string `json:"user,omitempty"`
|
||||
Password string `json:"password,omitempty"`
|
||||
FlushDelay string `json:"flush_delay,omitempty"`
|
||||
NkeyFile string `json:"nkey_file,omitempty"`
|
||||
}
|
||||
|
||||
type NatsSink struct {
|
||||
@@ -44,13 +42,6 @@ func (s *NatsSink) connect() error {
|
||||
var nc *nats.Conn
|
||||
if len(s.config.User) > 0 && len(s.config.Password) > 0 {
|
||||
uinfo = nats.UserInfo(s.config.User, s.config.Password)
|
||||
} else if len(s.config.NkeyFile) > 0 {
|
||||
if _, err := os.Stat(s.config.NkeyFile); err == nil {
|
||||
uinfo = nats.UserCredentials(s.config.NkeyFile)
|
||||
} else {
|
||||
cclog.ComponentError(s.name, "NKEY file", s.config.NkeyFile, "does not exist: %v", err.Error())
|
||||
return err
|
||||
}
|
||||
}
|
||||
uri := fmt.Sprintf("nats://%s:%s", s.config.Host, s.config.Port)
|
||||
cclog.ComponentDebug(s.name, "Connect to", uri)
|
||||
|
@@ -13,7 +13,6 @@ The `nats` sink publishes all metrics into a NATS network. The publishing key is
|
||||
"port": "4222",
|
||||
"user": "exampleuser",
|
||||
"password" : "examplepw",
|
||||
"nkey_file": "/path/to/nkey_file",
|
||||
"meta_as_tags" : [],
|
||||
}
|
||||
}
|
||||
@@ -26,4 +25,3 @@ The `nats` sink publishes all metrics into a NATS network. The publishing key is
|
||||
- `user`: Username for basic authentication
|
||||
- `password`: Password for basic authentication
|
||||
- `meta_as_tags`: print all meta information as tags in the output (optional)
|
||||
- `nkey_file`: Path to credentials file with NKEY
|
||||
|
Reference in New Issue
Block a user