From f7e8b526673f955d2e3f099cc702a8f53f09c58b Mon Sep 17 00:00:00 2001 From: Thomas Roehl Date: Wed, 2 Mar 2022 13:21:54 +0100 Subject: [PATCH 1/8] Run RPM build actions only on tag push --- .github/workflows/AlmaLinux.yml | 7 +++++-- .github/workflows/RedHatUniversalBaseImage.yml | 7 +++++-- 2 files changed, 10 insertions(+), 4 deletions(-) diff --git a/.github/workflows/AlmaLinux.yml b/.github/workflows/AlmaLinux.yml index 442ad41..dd06dd2 100644 --- a/.github/workflows/AlmaLinux.yml +++ b/.github/workflows/AlmaLinux.yml @@ -3,8 +3,11 @@ # Workflow name name: AlmaLinux 8.5 RPM build -# Run on event push -on: push +# Run on tag push +on: + push: + tags: + - '**' jobs: diff --git a/.github/workflows/RedHatUniversalBaseImage.yml b/.github/workflows/RedHatUniversalBaseImage.yml index cfd8a1f..205a133 100644 --- a/.github/workflows/RedHatUniversalBaseImage.yml +++ b/.github/workflows/RedHatUniversalBaseImage.yml @@ -3,8 +3,11 @@ # Workflow name name: Red Hat Universal Base Image 8 RPM build -# Run on event push -on: push +# Run on tag push +on: + push: + tags: + - '**' jobs: From 092e7f6a71836562190e8eed9455a9f8b11c5bb3 Mon Sep 17 00:00:00 2001 From: Thomas Roehl Date: Wed, 2 Mar 2022 13:54:43 +0100 Subject: [PATCH 2/8] Add section how to temporarily disable LIKWID access to page --- collectors/likwidMetric.md | 16 +++++++++++++++- 1 file changed, 15 insertions(+), 1 deletion(-) diff --git a/collectors/likwidMetric.md b/collectors/likwidMetric.md index 3ef51f3..2d0b840 100644 --- a/collectors/likwidMetric.md +++ b/collectors/likwidMetric.md @@ -61,7 +61,21 @@ $ scripts/likwid_perfgroup_to_cc_config.py ICX MEM_DP } ``` -You can copy this JSON and add it to the `eventsets` list. 
If you specify multiple event sets, you can add globally derived metrics in the extra `global_metrics` section with the metric names as variables. + +### Mixed usage between daemon and users + +LIKWID checks the file `/var/run/likwid.lock` before performing any interfering operations. Who is allowed to access the counters is determined by the owner of the file. If it does not exist, it is created for the current user. So, if you want to temporarily allow counter access to a user (e.g. in a job): + +Before (SLURM prolog, ...) +``` +$ chwon $USER /var/run/likwid.lock +``` + +After (SLURM epilog, ...) +``` +$ chwon root /var/run/likwid.lock +``` ### Example configuration From 0753c811564348b24d44c197d8099cfbfaaa801e Mon Sep 17 00:00:00 2001 From: Thomas Roehl Date: Wed, 2 Mar 2022 15:10:14 +0100 Subject: [PATCH 3/8] Add/Remove clustercockpit user and group in RPM --- scripts/cc-metric-collector.spec | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/scripts/cc-metric-collector.spec b/scripts/cc-metric-collector.spec index 9a1ec3b..2a34263 100644 --- a/scripts/cc-metric-collector.spec +++ b/scripts/cc-metric-collector.spec @@ -39,12 +39,26 @@ install -Dpm 0600 scripts/%{name}.config %{buildroot}%{_sysconfdir}/default/%{na %check # go test should be here... 
:) +%pre +getent group clustercockpit >/dev/null || groupadd -r clustercockpit +getent passwd clustercockpit >/dev/null || \ + useradd -r -g clustercockpit -d /nonexistent -s /sbin/nologin \ + -c "Create system user and group for CC metric collector" clustercockpit +exit 0 + %post %systemd_post %{name}.service %preun %systemd_preun %{name}.service +%postun +if [ "$1" = "1" ]; then +getent passwd clustercockpit >/dev/null && userdel clustercockpit +getent group clustercockpit >/dev/null && groupdel clustercockpit +fi +exit 0 + %files %dir %{_sysconfdir}/%{name} %{_sbindir}/%{name} From 6023abd02877330ef472a53ea21adebe9f95cbac Mon Sep 17 00:00:00 2001 From: Thomas Roehl Date: Thu, 3 Mar 2022 11:02:37 +0100 Subject: [PATCH 4/8] Rename main file to match with executable name --- metric-collector.go => cc-metric-collector.go | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename metric-collector.go => cc-metric-collector.go (100%) diff --git a/metric-collector.go b/cc-metric-collector.go similarity index 100% rename from metric-collector.go rename to cc-metric-collector.go From c61b8d28774b28232e7018dfa1f31481c58ffdd6 Mon Sep 17 00:00:00 2001 From: Thomas Roehl Date: Thu, 3 Mar 2022 11:03:51 +0100 Subject: [PATCH 5/8] Set name change in Makefile --- Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile b/Makefile index e0ec475..d747899 100644 --- a/Makefile +++ b/Makefile @@ -1,5 +1,5 @@ APP = cc-metric-collector -GOSRC_APP := metric-collector.go +GOSRC_APP := cc-metric-collector.go GOSRC_COLLECTORS := $(wildcard collectors/*.go) GOSRC_SINKS := $(wildcard sinks/*.go) GOSRC_RECEIVERS := $(wildcard receivers/*.go) From 276c00442a18fc5685dff03be8149a89b87f4e41 Mon Sep 17 00:00:00 2001 From: Thomas Roehl Date: Thu, 3 Mar 2022 13:02:00 +0100 Subject: [PATCH 6/8] Add option to LustreCollector to call lctl with sudo --- collectors/lustreMetric.go | 15 ++++++++++++++- 1 file changed, 14 insertions(+), 1 deletion(-) diff --git 
a/collectors/lustreMetric.go b/collectors/lustreMetric.go index 6d6fe26..f98d746 100644 --- a/collectors/lustreMetric.go +++ b/collectors/lustreMetric.go @@ -22,6 +22,7 @@ type LustreCollectorConfig struct { LCtlCommand string `json:"lctl_command"` ExcludeMetrics []string `json:"exclude_metrics"` SendAllMetrics bool `json:"send_all_metrics"` + Sudo bool `json:"use_sudo"` } type LustreCollector struct { @@ -31,11 +32,17 @@ type LustreCollector struct { stats map[string]map[string]int64 config LustreCollectorConfig lctl string + sudoCmd string } func (m *LustreCollector) getDeviceDataCommand(device string) []string { + var command *exec.Cmd statsfile := fmt.Sprintf("llite.%s.stats", device) - command := exec.Command(m.lctl, LCTL_OPTION, statsfile) + if m.config.Sudo { + command = exec.Command(m.sudoCmd, m.lctl, LCTL_OPTION, statsfile) + } else { + command = exec.Command(m.lctl, LCTL_OPTION, statsfile) + } command.Wait() stdout, _ := command.Output() return strings.Split(string(stdout), "\n") @@ -136,6 +143,12 @@ func (m *LustreCollector) Init(config json.RawMessage) error { } } m.lctl = p + if m.config.Sudo { + p, err := exec.LookPath("sudo") + if err == nil { + m.sudoCmd = p + } + } devices := m.getDevices() if len(devices) == 0 { From 60de21c41edd9cbcfa07ed68da851b83a7306aba Mon Sep 17 00:00:00 2001 From: Thomas Roehl Date: Thu, 3 Mar 2022 13:03:58 +0100 Subject: [PATCH 7/8] Switch access mode of LikwidCollector in config file --- collectors/likwidMetric.go | 25 ++++++++++++++++++++++--- collectors/likwidMetric.md | 6 ++++-- 2 files changed, 26 insertions(+), 5 deletions(-) diff --git a/collectors/likwidMetric.go b/collectors/likwidMetric.go index 8626d7c..8ab42d5 100644 --- a/collectors/likwidMetric.go +++ b/collectors/likwidMetric.go @@ -2,7 +2,7 @@ package collectors /* #cgo CFLAGS: -I./likwid -#cgo LDFLAGS: -L./likwid -llikwid -llikwid-hwloc -lm -Wl,--unresolved-symbols=ignore-in-object-files +#cgo LDFLAGS: -Wl,--unresolved-symbols=ignore-in-object-files 
#include #include */ @@ -71,8 +71,9 @@ func GetAllMetricScopes() []MetricScope { } const ( - LIKWID_LIB_NAME = "liblikwid.so" - LIKWID_LIB_DL_FLAGS = dl.RTLD_LAZY | dl.RTLD_GLOBAL + LIKWID_LIB_NAME = "liblikwid.so" + LIKWID_LIB_DL_FLAGS = dl.RTLD_LAZY | dl.RTLD_GLOBAL + LIKWID_DEF_ACCESSMODE = "direct" ) type LikwidCollectorMetricConfig struct { @@ -95,6 +96,8 @@ type LikwidCollectorConfig struct { Metrics []LikwidCollectorMetricConfig `json:"globalmetrics,omitempty"` ForceOverwrite bool `json:"force_overwrite,omitempty"` InvalidToZero bool `json:"invalid_to_zero,omitempty"` + AccessMode string `json:"access_mode,omitempty"` + DaemonPath string `json:"accessdaemon_path,omitempty"` } type LikwidCollector struct { @@ -260,6 +263,7 @@ func (m *LikwidCollector) getResponsiblities() map[MetricScope]map[int]int { func (m *LikwidCollector) Init(config json.RawMessage) error { var ret C.int m.name = "LikwidCollector" + m.config.AccessMode = LIKWID_DEF_ACCESSMODE if len(config) > 0 { err := json.Unmarshal(config, &m.config) if err != nil { @@ -270,6 +274,11 @@ func (m *LikwidCollector) Init(config json.RawMessage) error { if lib == nil { return fmt.Errorf("error instantiating DynamicLibrary for %s", LIKWID_LIB_NAME) } + err := lib.Open() + if err != nil { + return fmt.Errorf("error opening %s: %v", LIKWID_LIB_NAME, err) + } + if m.config.ForceOverwrite { cclog.ComponentDebug(m.name, "Set LIKWID_FORCE=1") os.Setenv("LIKWID_FORCE", "1") @@ -301,6 +310,16 @@ func (m *LikwidCollector) Init(config json.RawMessage) error { m.initGranularity() // Generate map for MetricScope -> scope_id (like socket id) -> responsible id (offset in cpulist) m.scopeRespTids = m.getResponsiblities() + switch m.config.AccessMode { + case "direct": + C.HPMmode(0) + case "accessdaemon": + if len(m.config.DaemonPath) > 0 { + p := os.Getenv("PATH") + os.Setenv("PATH", m.config.DaemonPath+":"+p) + } + C.HPMmode(1) + } cclog.ComponentDebug(m.name, "initialize LIKWID perfmon module") ret = 
C.perfmon_init(C.int(len(m.cpulist)), &m.cpulist[0]) diff --git a/collectors/likwidMetric.md b/collectors/likwidMetric.md index 2d0b840..1aa4242 100644 --- a/collectors/likwidMetric.md +++ b/collectors/likwidMetric.md @@ -8,6 +8,8 @@ The `likwid` configuration consists of two parts, the "eventsets" and "globalmet - The global metrics are metrics which require data from all event set measurements to be derived. The inputs are the metrics in the event sets. Similar to the metrics in the event sets, the global metrics are defined by a name, a formula, a scope and a publish flag. See event set metrics for details. The only difference is that there is no access to the raw event measurements anymore but only to the metrics. So, the idea is to derive a metric in the "eventsets" section and reuse it in the "globalmetrics" part. If you need a metric only for deriving the global metrics, disable forwarding of the event set metrics. **Be aware** that the combination might be misleading because the "behavior" of a metric changes over time and the multiple measurements might count different computing phases. Additional options: +- `access_mode`: Method to use for hardware performance monitoring (`direct` access as root user, `accessdaemon` for the daemon mode) +- `accessdaemon_path`: Folder with the access daemon `likwid-accessD`, commonly `$LIKWID_INSTALL_LOC/sbin` - `force_overwrite`: Same as setting `LIKWID_FORCE=1`. In case counters are already in-use, LIKWID overwrites their configuration to do its measurements - `invalid_to_zero`: In some cases, the calculations result in `NaN` or `Inf`. With this option, all `NaN` and `Inf` values are replaces with `0.0`. @@ -69,12 +71,12 @@ LIKWID checks the file `/var/run/likwid.lock` before performing any interfering Before (SLURM prolog, ...) ``` -$ chwon $USER /var/run/likwid.lock +$ chown $JOBUSER /var/run/likwid.lock ``` After (SLURM epilog, ...) 
``` -$ chwon root /var/run/likwid.lock +$ chown $CCUSER /var/run/likwid.lock ``` ### Example configuration From 948c34d74d7c2ba0db3aa209043d7009ff66fed6 Mon Sep 17 00:00:00 2001 From: Thomas Roehl Date: Thu, 3 Mar 2022 13:43:43 +0100 Subject: [PATCH 8/8] Add user creation in RPM --- scripts/cc-metric-collector.spec | 18 ++++++------------ scripts/cc-metric-collector.sysusers | 2 ++ 2 files changed, 8 insertions(+), 12 deletions(-) create mode 100644 scripts/cc-metric-collector.sysusers diff --git a/scripts/cc-metric-collector.spec b/scripts/cc-metric-collector.spec index 2a34263..fef1bb8 100644 --- a/scripts/cc-metric-collector.spec +++ b/scripts/cc-metric-collector.spec @@ -7,6 +7,7 @@ License: MIT Source0: %{name}-%{version}.tar.gz BuildRequires: go-toolset +BuildRequires: systemd-rpm-macros # for internal LIKWID installation BuildRequires: wget perl-Data-Dumper @@ -34,17 +35,14 @@ install -Dpm 0600 receivers.json %{buildroot}%{_sysconfdir}/%{name}/receivers.js install -Dpm 0600 router.json %{buildroot}%{_sysconfdir}/%{name}/router.json install -Dpm 0644 scripts/%{name}.service %{buildroot}%{_unitdir}/%{name}.service install -Dpm 0600 scripts/%{name}.config %{buildroot}%{_sysconfdir}/default/%{name} +install -Dpm 0644 scripts/%{name}.sysusers %{buildroot}%{_sysusersdir}/%{name}.conf %check # go test should be here... 
:) %pre -getent group clustercockpit >/dev/null || groupadd -r clustercockpit -getent passwd clustercockpit >/dev/null || \ - useradd -r -g clustercockpit -d /nonexistent -s /sbin/nologin \ - -c "Create system user and group for CC metric collector" clustercockpit -exit 0 +%sysusers_create_compat scripts/%{name}.sysusers %post %systemd_post %{name}.service @@ -52,13 +50,6 @@ exit 0 %preun %systemd_preun %{name}.service -%postun -if [ "$1" = "1" ]; then -getent passwd clustercockpit >/dev/null && userdel clustercockpit -getent group clustercockpit >/dev/null && groupdel clustercockpit -fi -exit 0 - %files %dir %{_sysconfdir}/%{name} %{_sbindir}/%{name} @@ -69,8 +60,11 @@ exit 0 %attr(0600,root,root) %config(noreplace) %{_sysconfdir}/%{name}/sinks.json %attr(0600,root,root) %config(noreplace) %{_sysconfdir}/%{name}/receivers.json %attr(0600,root,root) %config(noreplace) %{_sysconfdir}/%{name}/router.json +%{_sysusersdir}/%{name}.conf %changelog +* Thu Mar 03 2022 Thomas Gruber - 0.3 +- Add clustercockpit user installation * Mon Feb 14 2022 Thomas Gruber - 0.2 - Add component specific configuration files - Add %attr to config files diff --git a/scripts/cc-metric-collector.sysusers b/scripts/cc-metric-collector.sysusers new file mode 100644 index 0000000..65646af --- /dev/null +++ b/scripts/cc-metric-collector.sysusers @@ -0,0 +1,2 @@ +#Type Name ID GECOS Home directory Shell +u hpcop - "User for ClusterCockpit" /run/cc-metric-collector /sbin/nologin