Compare commits

..

2 Commits

Author SHA1 Message Date
Thomas Röhl
9ca73a9f50 Add documentation of the internal structure of the LikwidCollector to its documentation 2024-04-25 17:26:18 +02:00
Thomas Röhl
0186dce521 Create lockfile if it does not exist 2024-04-25 17:25:53 +02:00
11 changed files with 88 additions and 88 deletions

View File

@@ -44,16 +44,16 @@ jobs:
# Use dnf to install build dependencies
- name: Install build dependencies
run: |
wget -q https://go.dev/dl/go1.22.4.linux-amd64.tar.gz --output-document=- | \
tar --directory=/usr/local --extract --gzip
export PATH=/usr/local/go/bin:/usr/local/go/pkg/tool/linux_amd64:$PATH
go version
dnf --assumeyes install \
http://mirror.centos.org/centos/8-stream/AppStream/x86_64/os/Packages/golang-1.21.7-1.module_el8+960+4060efbe.x86_64.rpm \
http://mirror.centos.org/centos/8-stream/AppStream/x86_64/os/Packages/golang-bin-1.21.7-1.module_el8+960+4060efbe.x86_64.rpm \
http://mirror.centos.org/centos/8-stream/AppStream/x86_64/os/Packages/golang-src-1.21.7-1.module_el8+960+4060efbe.noarch.rpm \
http://mirror.centos.org/centos/8-stream/AppStream/x86_64/os/Packages/go-toolset-1.21.7-1.module_el8+960+4060efbe.x86_64.rpm
- name: RPM build MetricCollector
id: rpmbuild
run: |
git config --global --add safe.directory /__w/cc-metric-collector/cc-metric-collector
export PATH=/usr/local/go/bin:/usr/local/go/pkg/tool/linux_amd64:$PATH
make RPM
# AlmaLinux 8 is a derivate of RedHat Enterprise Linux 8 (UBI8),
@@ -114,16 +114,16 @@ jobs:
# Use dnf to install build dependencies
- name: Install build dependencies
run: |
wget -q https://go.dev/dl/go1.22.4.linux-amd64.tar.gz --output-document=- | \
tar --directory=/usr/local --extract --gzip
export PATH=/usr/local/go/bin:/usr/local/go/pkg/tool/linux_amd64:$PATH
go version
dnf --assumeyes --disableplugin=subscription-manager install \
http://mirror.centos.org/centos/8-stream/AppStream/x86_64/os/Packages/golang-1.21.7-1.module_el8+960+4060efbe.x86_64.rpm \
http://mirror.centos.org/centos/8-stream/AppStream/x86_64/os/Packages/golang-bin-1.21.7-1.module_el8+960+4060efbe.x86_64.rpm \
http://mirror.centos.org/centos/8-stream/AppStream/x86_64/os/Packages/golang-src-1.21.7-1.module_el8+960+4060efbe.noarch.rpm \
http://mirror.centos.org/centos/8-stream/AppStream/x86_64/os/Packages/go-toolset-1.21.7-1.module_el8+960+4060efbe.x86_64.rpm
- name: RPM build MetricCollector
id: rpmbuild
run: |
git config --global --add safe.directory /__w/cc-metric-collector/cc-metric-collector
export PATH=/usr/local/go/bin:/usr/local/go/pkg/tool/linux_amd64:$PATH
make RPM
# See: https://github.com/actions/upload-artifact
@@ -165,7 +165,7 @@ jobs:
# Use official golang package
- name: Install Golang
run: |
wget -q https://go.dev/dl/go1.22.4.linux-amd64.tar.gz --output-document=- | \
wget -q https://go.dev/dl/go1.21.1.linux-amd64.tar.gz --output-document=- | \
tar --directory=/usr/local --extract --gzip
export PATH=/usr/local/go/bin:/usr/local/go/pkg/tool/linux_amd64:$PATH
go version

View File

@@ -91,16 +91,16 @@ jobs:
# Use dnf to install build dependencies
- name: Install build dependencies
run: |
wget -q https://go.dev/dl/go1.22.4.linux-amd64.tar.gz --output-document=- | \
tar --directory=/usr/local --extract --gzip
export PATH=/usr/local/go/bin:/usr/local/go/pkg/tool/linux_amd64:$PATH
go version
dnf --assumeyes install \
http://mirror.centos.org/centos/8-stream/AppStream/x86_64/os/Packages/golang-1.21.7-1.module_el8+960+4060efbe.x86_64.rpm \
http://mirror.centos.org/centos/8-stream/AppStream/x86_64/os/Packages/golang-bin-1.21.7-1.module_el8+960+4060efbe.x86_64.rpm \
http://mirror.centos.org/centos/8-stream/AppStream/x86_64/os/Packages/golang-src-1.21.7-1.module_el8+960+4060efbe.noarch.rpm \
http://mirror.centos.org/centos/8-stream/AppStream/x86_64/os/Packages/go-toolset-1.21.7-1.module_el8+960+4060efbe.x86_64.rpm
- name: RPM build MetricCollector
id: rpmbuild
run: |
git config --global --add safe.directory /__w/cc-metric-collector/cc-metric-collector
export PATH=/usr/local/go/bin:/usr/local/go/pkg/tool/linux_amd64:$PATH
make RPM
#
@@ -129,16 +129,16 @@ jobs:
# Use dnf to install build dependencies
- name: Install build dependencies
run: |
wget -q https://go.dev/dl/go1.22.4.linux-amd64.tar.gz --output-document=- | \
tar --directory=/usr/local --extract --gzip
export PATH=/usr/local/go/bin:/usr/local/go/pkg/tool/linux_amd64:$PATH
go version
dnf --assumeyes --disableplugin=subscription-manager install \
http://mirror.centos.org/centos/8-stream/AppStream/x86_64/os/Packages/golang-1.21.7-1.module_el8+960+4060efbe.x86_64.rpm \
http://mirror.centos.org/centos/8-stream/AppStream/x86_64/os/Packages/golang-bin-1.21.7-1.module_el8+960+4060efbe.x86_64.rpm \
http://mirror.centos.org/centos/8-stream/AppStream/x86_64/os/Packages/golang-src-1.21.7-1.module_el8+960+4060efbe.noarch.rpm \
http://mirror.centos.org/centos/8-stream/AppStream/x86_64/os/Packages/go-toolset-1.21.7-1.module_el8+960+4060efbe.x86_64.rpm
- name: RPM build MetricCollector
id: rpmbuild
run: |
git config --global --add safe.directory /__w/cc-metric-collector/cc-metric-collector
export PATH=/usr/local/go/bin:/usr/local/go/pkg/tool/linux_amd64:$PATH
make RPM
#
@@ -165,7 +165,7 @@ jobs:
# Use official golang package
- name: Install Golang
run: |
wget -q https://go.dev/dl/go1.22.4.linux-amd64.tar.gz --output-document=- | \
wget -q https://go.dev/dl/go1.21.1.linux-amd64.tar.gz --output-document=- | \
tar --directory=/usr/local --extract --gzip
export PATH=/usr/local/go/bin:/usr/local/go/pkg/tool/linux_amd64:$PATH
go version

View File

@@ -12,7 +12,6 @@ import (
"strconv"
"strings"
"time"
cclog "github.com/ClusterCockpit/cc-metric-collector/pkg/ccLogger"
lp "github.com/ClusterCockpit/cc-metric-collector/pkg/ccMetric"
)
@@ -55,30 +54,15 @@ func (m *IpmiCollector) Init(config json.RawMessage) error {
// Check if executables ipmitool or ipmisensors are found
p, err := exec.LookPath(m.config.IpmitoolPath)
if err == nil {
command := exec.Command(p)
err := command.Run()
if err != nil {
cclog.ComponentError(m.name, fmt.Sprintf("Failed to execute %s: %v", p, err.Error()))
m.ipmitool = ""
} else {
m.ipmitool = p
}
m.ipmitool = p
}
p, err = exec.LookPath(m.config.IpmisensorsPath)
if err == nil {
command := exec.Command(p)
err := command.Run()
if err != nil {
cclog.ComponentError(m.name, fmt.Sprintf("Failed to execute %s: %v", p, err.Error()))
m.ipmisensors = ""
} else {
m.ipmisensors = p
}
m.ipmisensors = p
}
if len(m.ipmitool) == 0 && len(m.ipmisensors) == 0 {
return errors.New("no usable IPMI reader found")
return errors.New("no IPMI reader found")
}
m.init = true
return nil
}
@@ -135,8 +119,8 @@ func (m *IpmiCollector) readIpmiTool(cmd string, output chan lp.CCMetric) {
cclog.ComponentError(
m.name,
fmt.Sprintf("readIpmiTool(): Failed to wait for the end of command \"%s\": %v\n", command.String(), err),
fmt.Sprintf("readIpmiTool(): command stderr: \"%s\"\n", string(errMsg)),
)
cclog.ComponentError(m.name, fmt.Sprintf("readIpmiTool(): command stderr: \"%s\"\n", strings.TrimSpace(string(errMsg))))
return
}
}

View File

@@ -374,6 +374,14 @@ func (m *LikwidCollector) takeMeasurement(evidx int, evset LikwidEventsetConfig,
}
defer watcher.Close()
if len(m.config.LockfilePath) > 0 {
if _, err := os.Stat(m.config.LockfilePath); os.IsNotExist(err) {
file, err := os.Create(m.config.LockfilePath)
if err != nil {
cclog.ComponentError(m.name, "Cannot create lockfile", m.config.LockfilePath, ":", err.Error())
return true, err
}
file.Close()
}
info, err := os.Stat(m.config.LockfilePath)
if err != nil {
return true, err
@@ -382,9 +390,9 @@ func (m *LikwidCollector) takeMeasurement(evidx int, evset LikwidEventsetConfig,
if uid != uint32(os.Getuid()) {
usr, err := user.LookupId(fmt.Sprint(uid))
if err == nil {
return true, fmt.Errorf("Access to performance counters locked by %s", usr.Username)
return true, fmt.Errorf("access to performance counters locked by %s", usr.Username)
} else {
return true, fmt.Errorf("Access to performance counters locked by %d", uid)
return true, fmt.Errorf("access to performance counters locked by %d", uid)
}
}
err = watcher.Add(m.config.LockfilePath)

View File

@@ -267,3 +267,45 @@ IPC PMC0/PMC1 -> {
```
The script `scripts/likwid_perfgroup_to_cc_config.py` might help you.
### Internal structure
This section describes the internal structure of the `likwid` collector.
#### At initialization
After setting the defaults, the configuration is read.
Based on the configuration, the library is searched using `dlopen` to see whether it makes sense to proceed.
Next, the user-given metrics are tested to ensure they can be evaluated. For this, it creates a list of all user-given events/counters with the value `1.0` which is provided to the metric evaluator. The same is done for the global metrics by using the metric names with value `1.0`. If the evaluator does not fail, the metric can be evaluated and the collector initialization can proceed.
A separate thread is started to do the measurement. This is not done using a common goroutine but a real application thread with full control. This is required because LIKWID's access system tracks the processes of the using application and the PID should not change between measurements because that would require teardown and reopening of the access system.
With the separate thread, the access system is initialized by setting the user-given access mode and adding all hardware threads.
LIKWID generally measures per hardware thread, but counters that are only available per topological entity, e.g. per CPU socket (often memory traffic), are read by only some of the hardware threads. For this, the collector gets the system topology through LIKWID and creates different mappings like 'hwthread to list offset' and others. With these mappings, the hardware threads responsible for a topological entity can be determined because those read the counters of the per CPU socket units. These mappings are later used in the measurement phase.
In the end, we read the base CPU frequency of the system. It may be used in the metric evaluation.
#### Measurements
The reading of events is done by the separate application thread.
It traverses all configured event sets, creates valid LIKWID eventstrings out of them and passes them on to take a measurement. This could be done only once, but when the LIKWID lock changes, LIKWID has to be completely reopened to provide access again. With this reopening, the already added event sets are gone.
LIKWID has its own locking mechanism using a lock file. It is not the content of the file that is of interest, but its owner. In order to track changes of the file, a `fsnotify` watcher is installed on the file. If the file does not exist, it is created and consequently is owned by the same user as `cc-metric-collector`. The LikwidCollector has to watch the file on its own because LIKWID does not provide proper error handling for this.
Each call to the LIKWID library for loading the event set, setting up the counting facilities as well as starting and stopping of the counters is wrapped into lockfile checks to ensure no state change happens. If the file owner changed, the LikwidCollector cannot access the counters anymore, so no further operation can be done and the measurement stops.
Although start/stop would be sufficient, the LikwidCollector performs start, read, wait, read, `getLastResult`, stop. The reason might be "historic" but is not 100% clear anymore. The author failed to document it ;)
#### Metric evaluation
After each measurement, the metrics of the event set are directly evaluated. It updates the counter->result mapping with the new measurements, calls the evaluator and generates the `CCMetric` with the user-given settings if it should be published. Each metric name to result calculation is stored for the global metric evaluation, which is done as a final step.
#### Shutdown
Since each measurement involves a complete initialize-to-finalize cycle of the LIKWID library, only the topology module needs to be closed.
Moreover, the separate application thread is stopped.

View File

@@ -12,8 +12,8 @@ The global file contains the paths to the other four files and some global optio
"collectors" : "collectors.json",
"receivers" : "receivers.json",
"router" : "router.json",
"interval": "10s",
"duration": "1s"
"interval": 10,
"duration": 1
}
```

View File

@@ -4,7 +4,6 @@ import (
"encoding/json"
"errors"
"fmt"
"os"
"time"
cclog "github.com/ClusterCockpit/cc-metric-collector/pkg/ccLogger"
@@ -14,13 +13,10 @@ import (
)
type NatsReceiverConfig struct {
Type string `json:"type"`
Addr string `json:"address"`
Port string `json:"port"`
Subject string `json:"subject"`
User string `json:"user,omitempty"`
Password string `json:"password,omitempty"`
NkeyFile string `json:"nkey_file,omitempty"`
Type string `json:"type"`
Addr string `json:"address"`
Port string `json:"port"`
Subject string `json:"subject"`
}
type NatsReceiver struct {
@@ -113,7 +109,6 @@ func (r *NatsReceiver) Close() {
// NewNatsReceiver creates a new Receiver which subscribes to messages from a NATS server
func NewNatsReceiver(name string, config json.RawMessage) (Receiver, error) {
var uinfo nats.Option = nil
r := new(NatsReceiver)
r.name = fmt.Sprintf("NatsReceiver(%s)", name)
@@ -138,22 +133,10 @@ func NewNatsReceiver(name string, config json.RawMessage) (Receiver, error) {
"source": r.name,
}
if len(r.config.User) > 0 && len(r.config.Password) > 0 {
uinfo = nats.UserInfo(r.config.User, r.config.Password)
} else if len(r.config.NkeyFile) > 0 {
_, err := os.Stat(r.config.NkeyFile)
if err == nil {
uinfo = nats.UserCredentials(r.config.NkeyFile)
} else {
cclog.ComponentError(r.name, "NKEY file", r.config.NkeyFile, "does not exist: %v", err.Error())
return nil, err
}
}
// Connect to NATS server
url := fmt.Sprintf("nats://%s:%s", r.config.Addr, r.config.Port)
cclog.ComponentDebug(r.name, "NewNatsReceiver", url, "Subject", r.config.Subject)
if nc, err := nats.Connect(url, uinfo); err == nil {
if nc, err := nats.Connect(url); err == nil {
r.nc = nc
} else {
r.nc = nil

View File

@@ -10,10 +10,7 @@ The `nats` receiver can be used receive metrics from the NATS network. The `nats
"type": "nats",
"address" : "nats-server.example.org",
"port" : "4222",
"subject" : "subject",
"user": "natsuser",
"password": "natssecret",
"nkey_file": "/path/to/nkey_file"
"subject" : "subject"
}
}
```
@@ -22,9 +19,6 @@ The `nats` receiver can be used receive metrics from the NATS network. The `nats
- `address`: Address of the NATS control server
- `port`: Port of the NATS control server
- `subject`: Subscribes to this subject and receive metrics
- `user`: Connect to nats using this user
- `password`: Connect to nats using this password
- `nkey_file`: Path to credentials file with NKEY
### Debugging

View File

@@ -17,7 +17,7 @@ This folder contains the SinkManager and sink implementations for the cc-metric-
The configuration file for the sinks is a list of configurations. The `type` field in each specifies which sink to initialize.
```json
{
[
"mystdout" : {
"type" : "stdout",
"meta_as_tags" : [
@@ -31,7 +31,7 @@ The configuration file for the sinks is a list of configurations. The `type` fie
"database" : "ccmetric",
"password" : "<jwt token>"
}
}
]
```

View File

@@ -5,7 +5,6 @@ import (
"encoding/json"
"errors"
"fmt"
"os"
"sync"
"time"
@@ -23,7 +22,6 @@ type NatsSinkConfig struct {
User string `json:"user,omitempty"`
Password string `json:"password,omitempty"`
FlushDelay string `json:"flush_delay,omitempty"`
NkeyFile string `json:"nkey_file,omitempty"`
}
type NatsSink struct {
@@ -44,13 +42,6 @@ func (s *NatsSink) connect() error {
var nc *nats.Conn
if len(s.config.User) > 0 && len(s.config.Password) > 0 {
uinfo = nats.UserInfo(s.config.User, s.config.Password)
} else if len(s.config.NkeyFile) > 0 {
if _, err := os.Stat(s.config.NkeyFile); err == nil {
uinfo = nats.UserCredentials(s.config.NkeyFile)
} else {
cclog.ComponentError(s.name, "NKEY file", s.config.NkeyFile, "does not exist: %v", err.Error())
return err
}
}
uri := fmt.Sprintf("nats://%s:%s", s.config.Host, s.config.Port)
cclog.ComponentDebug(s.name, "Connect to", uri)

View File

@@ -13,7 +13,6 @@ The `nats` sink publishes all metrics into a NATS network. The publishing key is
"port": "4222",
"user": "exampleuser",
"password" : "examplepw",
"nkey_file": "/path/to/nkey_file",
"meta_as_tags" : [],
}
}
@@ -26,4 +25,3 @@ The `nats` sink publishes all metrics into a NATS network. The publishing key is
- `user`: Username for basic authentication
- `password`: Password for basic authentication
- `meta_as_tags`: print all meta information as tags in the output (optional)
- `nkey_file`: Path to credentials file with NKEY