mirror of
https://github.com/ClusterCockpit/cc-metric-collector.git
synced 2024-11-13 21:47:25 +01:00
Merge branch 'develop' into main
This commit is contained in:
commit
fdbdb79527
7
.github/workflows/AlmaLinux.yml
vendored
7
.github/workflows/AlmaLinux.yml
vendored
@ -3,8 +3,11 @@
|
|||||||
# Workflow name
|
# Workflow name
|
||||||
name: AlmaLinux 8.5 RPM build
|
name: AlmaLinux 8.5 RPM build
|
||||||
|
|
||||||
# Run on event push
|
# Run on tag push
|
||||||
on: push
|
on:
|
||||||
|
push:
|
||||||
|
tags:
|
||||||
|
- '**'
|
||||||
|
|
||||||
jobs:
|
jobs:
|
||||||
|
|
||||||
|
@ -3,8 +3,11 @@
|
|||||||
# Workflow name
|
# Workflow name
|
||||||
name: Red Hat Universal Base Image 8 RPM build
|
name: Red Hat Universal Base Image 8 RPM build
|
||||||
|
|
||||||
# Run on event push
|
# Run on tag push
|
||||||
on: push
|
on:
|
||||||
|
push:
|
||||||
|
tags:
|
||||||
|
- '**'
|
||||||
|
|
||||||
jobs:
|
jobs:
|
||||||
|
|
||||||
|
2
Makefile
2
Makefile
@ -1,5 +1,5 @@
|
|||||||
APP = cc-metric-collector
|
APP = cc-metric-collector
|
||||||
GOSRC_APP := metric-collector.go
|
GOSRC_APP := cc-metric-collector.go
|
||||||
GOSRC_COLLECTORS := $(wildcard collectors/*.go)
|
GOSRC_COLLECTORS := $(wildcard collectors/*.go)
|
||||||
GOSRC_SINKS := $(wildcard sinks/*.go)
|
GOSRC_SINKS := $(wildcard sinks/*.go)
|
||||||
GOSRC_RECEIVERS := $(wildcard receivers/*.go)
|
GOSRC_RECEIVERS := $(wildcard receivers/*.go)
|
||||||
|
@ -2,7 +2,7 @@ package collectors
|
|||||||
|
|
||||||
/*
|
/*
|
||||||
#cgo CFLAGS: -I./likwid
|
#cgo CFLAGS: -I./likwid
|
||||||
#cgo LDFLAGS: -L./likwid -llikwid -llikwid-hwloc -lm -Wl,--unresolved-symbols=ignore-in-object-files
|
#cgo LDFLAGS: -Wl,--unresolved-symbols=ignore-in-object-files
|
||||||
#include <stdlib.h>
|
#include <stdlib.h>
|
||||||
#include <likwid.h>
|
#include <likwid.h>
|
||||||
*/
|
*/
|
||||||
@ -73,6 +73,7 @@ func GetAllMetricScopes() []MetricScope {
|
|||||||
const (
|
const (
|
||||||
LIKWID_LIB_NAME = "liblikwid.so"
|
LIKWID_LIB_NAME = "liblikwid.so"
|
||||||
LIKWID_LIB_DL_FLAGS = dl.RTLD_LAZY | dl.RTLD_GLOBAL
|
LIKWID_LIB_DL_FLAGS = dl.RTLD_LAZY | dl.RTLD_GLOBAL
|
||||||
|
LIKWID_DEF_ACCESSMODE = "direct"
|
||||||
)
|
)
|
||||||
|
|
||||||
type LikwidCollectorMetricConfig struct {
|
type LikwidCollectorMetricConfig struct {
|
||||||
@ -95,6 +96,8 @@ type LikwidCollectorConfig struct {
|
|||||||
Metrics []LikwidCollectorMetricConfig `json:"globalmetrics,omitempty"`
|
Metrics []LikwidCollectorMetricConfig `json:"globalmetrics,omitempty"`
|
||||||
ForceOverwrite bool `json:"force_overwrite,omitempty"`
|
ForceOverwrite bool `json:"force_overwrite,omitempty"`
|
||||||
InvalidToZero bool `json:"invalid_to_zero,omitempty"`
|
InvalidToZero bool `json:"invalid_to_zero,omitempty"`
|
||||||
|
AccessMode string `json:"access_mode,omitempty"`
|
||||||
|
DaemonPath string `json:"accessdaemon_path,omitempty"`
|
||||||
}
|
}
|
||||||
|
|
||||||
type LikwidCollector struct {
|
type LikwidCollector struct {
|
||||||
@ -260,6 +263,7 @@ func (m *LikwidCollector) getResponsiblities() map[MetricScope]map[int]int {
|
|||||||
func (m *LikwidCollector) Init(config json.RawMessage) error {
|
func (m *LikwidCollector) Init(config json.RawMessage) error {
|
||||||
var ret C.int
|
var ret C.int
|
||||||
m.name = "LikwidCollector"
|
m.name = "LikwidCollector"
|
||||||
|
m.config.AccessMode = LIKWID_DEF_ACCESSMODE
|
||||||
if len(config) > 0 {
|
if len(config) > 0 {
|
||||||
err := json.Unmarshal(config, &m.config)
|
err := json.Unmarshal(config, &m.config)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
@ -270,6 +274,11 @@ func (m *LikwidCollector) Init(config json.RawMessage) error {
|
|||||||
if lib == nil {
|
if lib == nil {
|
||||||
return fmt.Errorf("error instantiating DynamicLibrary for %s", LIKWID_LIB_NAME)
|
return fmt.Errorf("error instantiating DynamicLibrary for %s", LIKWID_LIB_NAME)
|
||||||
}
|
}
|
||||||
|
err := lib.Open()
|
||||||
|
if err != nil {
|
||||||
|
return fmt.Errorf("error opening %s: %v", LIKWID_LIB_NAME, err)
|
||||||
|
}
|
||||||
|
|
||||||
if m.config.ForceOverwrite {
|
if m.config.ForceOverwrite {
|
||||||
cclog.ComponentDebug(m.name, "Set LIKWID_FORCE=1")
|
cclog.ComponentDebug(m.name, "Set LIKWID_FORCE=1")
|
||||||
os.Setenv("LIKWID_FORCE", "1")
|
os.Setenv("LIKWID_FORCE", "1")
|
||||||
@ -301,6 +310,16 @@ func (m *LikwidCollector) Init(config json.RawMessage) error {
|
|||||||
m.initGranularity()
|
m.initGranularity()
|
||||||
// Generate map for MetricScope -> scope_id (like socket id) -> responsible id (offset in cpulist)
|
// Generate map for MetricScope -> scope_id (like socket id) -> responsible id (offset in cpulist)
|
||||||
m.scopeRespTids = m.getResponsiblities()
|
m.scopeRespTids = m.getResponsiblities()
|
||||||
|
switch m.config.AccessMode {
|
||||||
|
case "direct":
|
||||||
|
C.HPMmode(0)
|
||||||
|
case "accessdaemon":
|
||||||
|
if len(m.config.DaemonPath) > 0 {
|
||||||
|
p := os.Getenv("PATH")
|
||||||
|
os.Setenv("PATH", m.config.DaemonPath+":"+p)
|
||||||
|
}
|
||||||
|
C.HPMmode(1)
|
||||||
|
}
|
||||||
|
|
||||||
cclog.ComponentDebug(m.name, "initialize LIKWID perfmon module")
|
cclog.ComponentDebug(m.name, "initialize LIKWID perfmon module")
|
||||||
ret = C.perfmon_init(C.int(len(m.cpulist)), &m.cpulist[0])
|
ret = C.perfmon_init(C.int(len(m.cpulist)), &m.cpulist[0])
|
||||||
|
@ -8,6 +8,8 @@ The `likwid` configuration consists of two parts, the "eventsets" and "globalmet
|
|||||||
- The global metrics are metrics which require data from all event set measurements to be derived. The inputs are the metrics in the event sets. Similar to the metrics in the event sets, the global metrics are defined by a name, a formula, a scope and a publish flag. See event set metrics for details. The only difference is that there is no access to the raw event measurements anymore but only to the metrics. So, the idea is to derive a metric in the "eventsets" section and reuse it in the "globalmetrics" part. If you need a metric only for deriving the global metrics, disable forwarding of the event set metrics. **Be aware** that the combination might be misleading because the "behavior" of a metric changes over time and the multiple measurements might count different computing phases.
|
- The global metrics are metrics which require data from all event set measurements to be derived. The inputs are the metrics in the event sets. Similar to the metrics in the event sets, the global metrics are defined by a name, a formula, a scope and a publish flag. See event set metrics for details. The only difference is that there is no access to the raw event measurements anymore but only to the metrics. So, the idea is to derive a metric in the "eventsets" section and reuse it in the "globalmetrics" part. If you need a metric only for deriving the global metrics, disable forwarding of the event set metrics. **Be aware** that the combination might be misleading because the "behavior" of a metric changes over time and the multiple measurements might count different computing phases.
|
||||||
|
|
||||||
Additional options:
|
Additional options:
|
||||||
|
- `access_mode` : Method to use for hardware performance monitoring (`direct` access as root user, `accessdaemon` for the daemon mode)
|
||||||
|
- `accessdaemon_path`: Folder with the access daemon `likwid-accessD`, commonly `$LIKWID_INSTALL_LOC/sbin`
|
||||||
- `force_overwrite`: Same as setting `LIKWID_FORCE=1`. In case counters are already in-use, LIKWID overwrites their configuration to do its measurements
|
- `force_overwrite`: Same as setting `LIKWID_FORCE=1`. In case counters are already in-use, LIKWID overwrites their configuration to do its measurements
|
||||||
- `invalid_to_zero`: In some cases, the calculations result in `NaN` or `Inf`. With this option, all `NaN` and `Inf` values are replaced with `0.0`.
|
- `invalid_to_zero`: In some cases, the calculations result in `NaN` or `Inf`. With this option, all `NaN` and `Inf` values are replaced with `0.0`.
|
||||||
|
|
||||||
@ -63,6 +65,20 @@ $ scripts/likwid_perfgroup_to_cc_config.py ICX MEM_DP
|
|||||||
|
|
||||||
You can copy this JSON and add it to the `eventsets` list. If you specify multiple event sets, you can add globally derived metrics in the extra `globalmetrics` section with the metric names as variables.
|
You can copy this JSON and add it to the `eventsets` list. If you specify multiple event sets, you can add globally derived metrics in the extra `globalmetrics` section with the metric names as variables.
|
||||||
|
|
||||||
|
### Mixed usage between daemon and users
|
||||||
|
|
||||||
|
LIKWID checks the file `/var/run/likwid.lock` before performing any interfering operations. Who is allowed to access the counters is determined by the owner of the file. If it does not exist, it is created for the current user. So, if you want to temporarily allow counter access to a user (e.g. in a job):
|
||||||
|
|
||||||
|
Before (SLURM prolog, ...)
|
||||||
|
```
|
||||||
|
$ chown $JOBUSER /var/run/likwid.lock
|
||||||
|
```
|
||||||
|
|
||||||
|
After (SLURM epilog, ...)
|
||||||
|
```
|
||||||
|
$ chown $CCUSER /var/run/likwid.lock
|
||||||
|
```
|
||||||
|
|
||||||
### Example configuration
|
### Example configuration
|
||||||
|
|
||||||
|
|
||||||
|
@ -22,6 +22,7 @@ type LustreCollectorConfig struct {
|
|||||||
LCtlCommand string `json:"lctl_command"`
|
LCtlCommand string `json:"lctl_command"`
|
||||||
ExcludeMetrics []string `json:"exclude_metrics"`
|
ExcludeMetrics []string `json:"exclude_metrics"`
|
||||||
SendAllMetrics bool `json:"send_all_metrics"`
|
SendAllMetrics bool `json:"send_all_metrics"`
|
||||||
|
Sudo bool `json:"use_sudo"`
|
||||||
}
|
}
|
||||||
|
|
||||||
type LustreCollector struct {
|
type LustreCollector struct {
|
||||||
@ -31,11 +32,17 @@ type LustreCollector struct {
|
|||||||
stats map[string]map[string]int64
|
stats map[string]map[string]int64
|
||||||
config LustreCollectorConfig
|
config LustreCollectorConfig
|
||||||
lctl string
|
lctl string
|
||||||
|
sudoCmd string
|
||||||
}
|
}
|
||||||
|
|
||||||
func (m *LustreCollector) getDeviceDataCommand(device string) []string {
|
func (m *LustreCollector) getDeviceDataCommand(device string) []string {
|
||||||
|
var command *exec.Cmd
|
||||||
statsfile := fmt.Sprintf("llite.%s.stats", device)
|
statsfile := fmt.Sprintf("llite.%s.stats", device)
|
||||||
command := exec.Command(m.lctl, LCTL_OPTION, statsfile)
|
if m.config.Sudo {
|
||||||
|
command = exec.Command(m.sudoCmd, m.lctl, LCTL_OPTION, statsfile)
|
||||||
|
} else {
|
||||||
|
command = exec.Command(m.lctl, LCTL_OPTION, statsfile)
|
||||||
|
}
|
||||||
command.Wait()
|
command.Wait()
|
||||||
stdout, _ := command.Output()
|
stdout, _ := command.Output()
|
||||||
return strings.Split(string(stdout), "\n")
|
return strings.Split(string(stdout), "\n")
|
||||||
@ -136,6 +143,12 @@ func (m *LustreCollector) Init(config json.RawMessage) error {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
m.lctl = p
|
m.lctl = p
|
||||||
|
if m.config.Sudo {
|
||||||
|
p, err := exec.LookPath("sudo")
|
||||||
|
if err != nil {
|
||||||
|
m.sudoCmd = p
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
devices := m.getDevices()
|
devices := m.getDevices()
|
||||||
if len(devices) == 0 {
|
if len(devices) == 0 {
|
||||||
|
@ -7,6 +7,7 @@ License: MIT
|
|||||||
Source0: %{name}-%{version}.tar.gz
|
Source0: %{name}-%{version}.tar.gz
|
||||||
|
|
||||||
BuildRequires: go-toolset
|
BuildRequires: go-toolset
|
||||||
|
BuildRequires: systemd-rpm-macros
|
||||||
# for internal LIKWID installation
|
# for internal LIKWID installation
|
||||||
BuildRequires: wget perl-Data-Dumper
|
BuildRequires: wget perl-Data-Dumper
|
||||||
|
|
||||||
@ -34,11 +35,15 @@ install -Dpm 0600 receivers.json %{buildroot}%{_sysconfdir}/%{name}/receivers.js
|
|||||||
install -Dpm 0600 router.json %{buildroot}%{_sysconfdir}/%{name}/router.json
|
install -Dpm 0600 router.json %{buildroot}%{_sysconfdir}/%{name}/router.json
|
||||||
install -Dpm 0644 scripts/%{name}.service %{buildroot}%{_unitdir}/%{name}.service
|
install -Dpm 0644 scripts/%{name}.service %{buildroot}%{_unitdir}/%{name}.service
|
||||||
install -Dpm 0600 scripts/%{name}.config %{buildroot}%{_sysconfdir}/default/%{name}
|
install -Dpm 0600 scripts/%{name}.config %{buildroot}%{_sysconfdir}/default/%{name}
|
||||||
|
install -Dpm 0644 scripts/%{name}.sysusers %{buildroot}%{_sysusersdir}/%{name}.conf
|
||||||
|
|
||||||
|
|
||||||
%check
|
%check
|
||||||
# go test should be here... :)
|
# go test should be here... :)
|
||||||
|
|
||||||
|
%pre
|
||||||
|
%sysusers_create_compat scripts/%{name}.sysusers
|
||||||
|
|
||||||
%post
|
%post
|
||||||
%systemd_post %{name}.service
|
%systemd_post %{name}.service
|
||||||
|
|
||||||
@ -55,8 +60,11 @@ install -Dpm 0600 scripts/%{name}.config %{buildroot}%{_sysconfdir}/default/%{na
|
|||||||
%attr(0600,root,root) %config(noreplace) %{_sysconfdir}/%{name}/sinks.json
|
%attr(0600,root,root) %config(noreplace) %{_sysconfdir}/%{name}/sinks.json
|
||||||
%attr(0600,root,root) %config(noreplace) %{_sysconfdir}/%{name}/receivers.json
|
%attr(0600,root,root) %config(noreplace) %{_sysconfdir}/%{name}/receivers.json
|
||||||
%attr(0600,root,root) %config(noreplace) %{_sysconfdir}/%{name}/router.json
|
%attr(0600,root,root) %config(noreplace) %{_sysconfdir}/%{name}/router.json
|
||||||
|
%{_sysusersdir}/%{name}.conf
|
||||||
|
|
||||||
%changelog
|
%changelog
|
||||||
|
* Thu Mar 03 2022 Thomas Gruber - 0.3
|
||||||
|
- Add clustercockpit user installation
|
||||||
* Mon Feb 14 2022 Thomas Gruber - 0.2
|
* Mon Feb 14 2022 Thomas Gruber - 0.2
|
||||||
- Add component specific configuration files
|
- Add component specific configuration files
|
||||||
- Add %attr to config files
|
- Add %attr to config files
|
||||||
|
2
scripts/cc-metric-collector.sysusers
Normal file
2
scripts/cc-metric-collector.sysusers
Normal file
@ -0,0 +1,2 @@
|
|||||||
|
#Type Name ID GECOS Home directory Shell
|
||||||
|
s hpcop - "User for ClusterCockpit" /run/cc-metric-collector /sbin/nologin
|
Loading…
Reference in New Issue
Block a user