diff --git a/.github/workflows/Release.yml b/.github/workflows/Release.yml index 6ec5782..36b812e 100644 --- a/.github/workflows/Release.yml +++ b/.github/workflows/Release.yml @@ -195,7 +195,7 @@ jobs: Release: runs-on: ubuntu-latest # We need the RPMs, so add dependency - needs: [AlmaLinux-RPM-build, UBI-8-RPM-build, Ubuntu-focal-build] + needs: [AlmaLinux-RPM-build, UBI-8-RPM-build, Ubuntu-jammy-build] steps: # See: https://github.com/actions/download-artifact diff --git a/.github/workflows/runonce.yml b/.github/workflows/runonce.yml index 878c24f..ddc41b3 100644 --- a/.github/workflows/runonce.yml +++ b/.github/workflows/runonce.yml @@ -88,13 +88,11 @@ jobs: submodules: recursive fetch-depth: 0 - # Use dnf to install build dependencies - - name: Install build dependencies - run: | - wget -q https://go.dev/dl/go1.22.4.linux-amd64.tar.gz --output-document=- | \ - tar --directory=/usr/local --extract --gzip - export PATH=/usr/local/go/bin:/usr/local/go/pkg/tool/linux_amd64:$PATH - go version + # See: https://github.com/marketplace/actions/setup-go-environment + - name: Setup Golang + uses: actions/setup-go@v4 + with: + go-version: '1.21' - name: RPM build MetricCollector id: rpmbuild @@ -126,13 +124,11 @@ jobs: submodules: recursive fetch-depth: 0 - # Use dnf to install build dependencies - - name: Install build dependencies - run: | - wget -q https://go.dev/dl/go1.22.4.linux-amd64.tar.gz --output-document=- | \ - tar --directory=/usr/local --extract --gzip - export PATH=/usr/local/go/bin:/usr/local/go/pkg/tool/linux_amd64:$PATH - go version + # See: https://github.com/marketplace/actions/setup-go-environment + - name: Setup Golang + uses: actions/setup-go@v4 + with: + go-version: '1.21' - name: RPM build MetricCollector id: rpmbuild @@ -163,12 +159,13 @@ jobs: submodules: recursive fetch-depth: 0 # Use official golang package - - name: Install Golang - run: | - wget -q https://go.dev/dl/go1.22.4.linux-amd64.tar.gz --output-document=- | \ - tar 
--directory=/usr/local --extract --gzip - export PATH=/usr/local/go/bin:/usr/local/go/pkg/tool/linux_amd64:$PATH - go version + + # See: https://github.com/marketplace/actions/setup-go-environment + - name: Setup Golang + uses: actions/setup-go@v4 + with: + go-version: '1.21' + - name: DEB build MetricCollector id: dpkg-build run: | diff --git a/collectors/ipmiMetric.go b/collectors/ipmiMetric.go index d53d8af..a4b4b88 100644 --- a/collectors/ipmiMetric.go +++ b/collectors/ipmiMetric.go @@ -12,6 +12,7 @@ import ( "strconv" "strings" "time" + cclog "github.com/ClusterCockpit/cc-metric-collector/pkg/ccLogger" lp "github.com/ClusterCockpit/cc-metric-collector/pkg/ccMetric" ) @@ -54,15 +55,30 @@ func (m *IpmiCollector) Init(config json.RawMessage) error { // Check if executables ipmitool or ipmisensors are found p, err := exec.LookPath(m.config.IpmitoolPath) if err == nil { - m.ipmitool = p + command := exec.Command(p) + err := command.Run() + if err != nil { + cclog.ComponentError(m.name, fmt.Sprintf("Failed to execute %s: %v", p, err.Error())) + m.ipmitool = "" + } else { + m.ipmitool = p + } } p, err = exec.LookPath(m.config.IpmisensorsPath) if err == nil { - m.ipmisensors = p + command := exec.Command(p) + err := command.Run() + if err != nil { + cclog.ComponentError(m.name, fmt.Sprintf("Failed to execute %s: %v", p, err.Error())) + m.ipmisensors = "" + } else { + m.ipmisensors = p + } } if len(m.ipmitool) == 0 && len(m.ipmisensors) == 0 { - return errors.New("no IPMI reader found") + return errors.New("no usable IPMI reader found") } + m.init = true return nil } @@ -119,8 +135,8 @@ func (m *IpmiCollector) readIpmiTool(cmd string, output chan lp.CCMetric) { cclog.ComponentError( m.name, fmt.Sprintf("readIpmiTool(): Failed to wait for the end of command \"%s\": %v\n", command.String(), err), - fmt.Sprintf("readIpmiTool(): command stderr: \"%s\"\n", string(errMsg)), ) + cclog.ComponentError(m.name, fmt.Sprintf("readIpmiTool(): command stderr: \"%s\"\n", 
strings.TrimSpace(string(errMsg)))) return } } diff --git a/collectors/likwidMetric.go b/collectors/likwidMetric.go index 606f6a6..12757c3 100644 --- a/collectors/likwidMetric.go +++ b/collectors/likwidMetric.go @@ -374,10 +374,21 @@ func (m *LikwidCollector) takeMeasurement(evidx int, evset LikwidEventsetConfig, } defer watcher.Close() if len(m.config.LockfilePath) > 0 { + // Check if the lock file exists info, err := os.Stat(m.config.LockfilePath) + if os.IsNotExist(err) { + // Create the lock file if it does not exist + file, createErr := os.Create(m.config.LockfilePath) + if createErr != nil { + return true, fmt.Errorf("failed to create lock file: %v", createErr) + } + file.Close() + info, err = os.Stat(m.config.LockfilePath) // Recheck the file after creation + } if err != nil { return true, err } + // Check file ownership uid := info.Sys().(*syscall.Stat_t).Uid if uid != uint32(os.Getuid()) { usr, err := user.LookupId(fmt.Sprint(uid)) @@ -387,6 +398,7 @@ func (m *LikwidCollector) takeMeasurement(evidx int, evset LikwidEventsetConfig, return true, fmt.Errorf("Access to performance counters locked by %d", uid) } } + // Add the lock file to the watcher err = watcher.Add(m.config.LockfilePath) if err != nil { cclog.ComponentError(m.name, err.Error()) diff --git a/collectors/likwidMetric.md b/collectors/likwidMetric.md index 7740543..0bd5b2b 100644 --- a/collectors/likwidMetric.md +++ b/collectors/likwidMetric.md @@ -15,7 +15,7 @@ The `likwid` collector is probably the most complicated collector. The LIKWID li { "events" : { "COUNTER0": "EVENT0", - "COUNTER1": "EVENT1", + "COUNTER1": "EVENT1" }, "metrics" : [ { @@ -27,7 +27,7 @@ The `likwid` collector is probably the most complicated collector. 
The LIKWID li } ] } - ] + ], "globalmetrics" : [ { "name": "global_sum", @@ -132,6 +132,9 @@ In some cases LIKWID returns `0.0` for some events that are further used in proc One might think this does not happen often but often used metrics in the world of performance engineering like Instructions-per-Cycle (IPC) or more frequently the actual CPU clock are derived with events like `CPU_CLK_UNHALTED_CORE` (Intel) which do not increment in halted state (as the name implies). In there are different power management systems in a chip which can cause a hardware thread to go in such a state. Moreover, if no cycles are executed by the core, also many other events are not incremented as well (like `INSTR_RETIRED_ANY` for retired instructions and part of IPC). +### `lockfile_path` option +LIKWID can be configured with a lock file that restricts access to the performance monitoring registers: only the owner of the lock file is allowed to access them. When the `lockfile_path` option is set, the collector watches this file for changes and stops monitoring if the owner of the lock file changes. This feature is useful when users should be able to perform their own hardware performance counter measurements through LIKWID or any other tool. + ### `send_*_total values` option - `send_core_total_values`: Metrics, which are usually collected on a per hardware thread basis, are additionally summed up per CPU core. diff --git a/sinks/README.md b/sinks/README.md index d6c88b8..d28fba8 100644 --- a/sinks/README.md +++ b/sinks/README.md @@ -17,7 +17,7 @@ This folder contains the SinkManager and sink implementations for the cc-metric- The configuration file for the sinks is a list of configurations. The `type` field in each specifies which sink to initialize. ```json -[ +{ "mystdout" : { "type" : "stdout", "meta_as_tags" : [ @@ -31,7 +31,7 @@ The configuration file for the sinks is a list of configurations. 
The `type` fie "database" : "ccmetric", "password" : "" } -] +} ```