diff --git a/.github/workflows/Release.yml b/.github/workflows/Release.yml index 6ec5782..36b812e 100644 --- a/.github/workflows/Release.yml +++ b/.github/workflows/Release.yml @@ -195,7 +195,7 @@ jobs: Release: runs-on: ubuntu-latest # We need the RPMs, so add dependency - needs: [AlmaLinux-RPM-build, UBI-8-RPM-build, Ubuntu-focal-build] + needs: [AlmaLinux-RPM-build, UBI-8-RPM-build, Ubuntu-jammy-build] steps: # See: https://github.com/actions/download-artifact diff --git a/.github/workflows/runonce.yml b/.github/workflows/runonce.yml index 878c24f..182e043 100644 --- a/.github/workflows/runonce.yml +++ b/.github/workflows/runonce.yml @@ -88,13 +88,11 @@ jobs: submodules: recursive fetch-depth: 0 - # Use dnf to install build dependencies - - name: Install build dependencies - run: | - wget -q https://go.dev/dl/go1.22.4.linux-amd64.tar.gz --output-document=- | \ - tar --directory=/usr/local --extract --gzip - export PATH=/usr/local/go/bin:/usr/local/go/pkg/tool/linux_amd64:$PATH - go version + # See: https://github.com/marketplace/actions/setup-go-environment + - name: Setup Golang + uses: actions/setup-go@v4 + with: + go-version: '1.21' - name: RPM build MetricCollector id: rpmbuild @@ -126,13 +124,11 @@ jobs: submodules: recursive fetch-depth: 0 - # Use dnf to install build dependencies - - name: Install build dependencies - run: | - wget -q https://go.dev/dl/go1.22.4.linux-amd64.tar.gz --output-document=- | \ - tar --directory=/usr/local --extract --gzip - export PATH=/usr/local/go/bin:/usr/local/go/pkg/tool/linux_amd64:$PATH - go version + # See: https://github.com/marketplace/actions/setup-go-environment + - name: Setup Golang + uses: actions/setup-go@v4 + with: + go-version: '1.21' - name: RPM build MetricCollector id: rpmbuild @@ -163,12 +159,12 @@ jobs: submodules: recursive fetch-depth: 0 # Use official golang package - - name: Install Golang - run: | - wget -q https://go.dev/dl/go1.22.4.linux-amd64.tar.gz --output-document=- | \ - tar 
--directory=/usr/local --extract --gzip - export PATH=/usr/local/go/bin:/usr/local/go/pkg/tool/linux_amd64:$PATH - go version + # See: https://github.com/marketplace/actions/setup-go-environment + - name: Setup Golang + uses: actions/setup-go@v4 + with: + go-version: '1.21' + - name: DEB build MetricCollector id: dpkg-build run: | diff --git a/collectors/likwidMetric.go b/collectors/likwidMetric.go index 606f6a6..12757c3 100644 --- a/collectors/likwidMetric.go +++ b/collectors/likwidMetric.go @@ -374,10 +374,21 @@ func (m *LikwidCollector) takeMeasurement(evidx int, evset LikwidEventsetConfig, } defer watcher.Close() if len(m.config.LockfilePath) > 0 { + // Check if the lock file exists info, err := os.Stat(m.config.LockfilePath) + if os.IsNotExist(err) { + // Create the lock file if it does not exist + file, createErr := os.Create(m.config.LockfilePath) + if createErr != nil { + return true, fmt.Errorf("failed to create lock file: %v", createErr) + } + file.Close() + info, err = os.Stat(m.config.LockfilePath) // Recheck the file after creation + } if err != nil { return true, err } + // Check file ownership uid := info.Sys().(*syscall.Stat_t).Uid if uid != uint32(os.Getuid()) { usr, err := user.LookupId(fmt.Sprint(uid)) @@ -387,6 +398,7 @@ func (m *LikwidCollector) takeMeasurement(evidx int, evset LikwidEventsetConfig, return true, fmt.Errorf("Access to performance counters locked by %d", uid) } } + // Add the lock file to the watcher err = watcher.Add(m.config.LockfilePath) if err != nil { cclog.ComponentError(m.name, err.Error()) diff --git a/collectors/likwidMetric.md b/collectors/likwidMetric.md index 7740543..0bd5b2b 100644 --- a/collectors/likwidMetric.md +++ b/collectors/likwidMetric.md @@ -15,7 +15,7 @@ The `likwid` collector is probably the most complicated collector. 
The LIKWID li { "events" : { "COUNTER0": "EVENT0", - "COUNTER1": "EVENT1", + "COUNTER1": "EVENT1" }, "metrics" : [ { @@ -27,7 +27,7 @@ The `likwid` collector is probably the most complicated collector. The LIKWID li } ] } - ] + ], "globalmetrics" : [ { "name": "global_sum", @@ -132,6 +132,9 @@ In some cases LIKWID returns `0.0` for some events that are further used in proc One might think this does not happen often but often used metrics in the world of performance engineering like Instructions-per-Cycle (IPC) or more frequently the actual CPU clock are derived with events like `CPU_CLK_UNHALTED_CORE` (Intel) which do not increment in halted state (as the name implies). In there are different power management systems in a chip which can cause a hardware thread to go in such a state. Moreover, if no cycles are executed by the core, also many other events are not incremented as well (like `INSTR_RETIRED_ANY` for retired instructions and part of IPC). +### `lockfile_path` option +LIKWID can be configured with a lock file with which the access to the performance monitoring registers can be disabled (only the owner of the lock file is allowed to access the registers). When the `lockfile_path` option is set, the collector subscribes to changes to this file to stop monitoring if the owner of the lock file changes. This feature is useful when users should be able to perform their own hardware performance counter measurements through LIKWID or any other tool. + ### `send_*_total values` option - `send_core_total_values`: Metrics, which are usually collected on a per hardware thread basis, are additionally summed up per CPU core. diff --git a/collectors/nvidiaMetric.go b/collectors/nvidiaMetric.go index 369f12b..fbbcaed 100644 --- a/collectors/nvidiaMetric.go +++ b/collectors/nvidiaMetric.go @@ -941,6 +941,12 @@ func readNVLinkStats(device NvidiaCollectorDevice, output chan lp.CCMetric) erro // // For Pascal &tm; or newer fully supported devices. 
+ var aggregate_crc_errors uint64 = 0 + var aggregate_ecc_errors uint64 = 0 + var aggregate_replay_errors uint64 = 0 + var aggregate_recovery_errors uint64 = 0 + var aggregate_crc_flit_errors uint64 = 0 + for i := 0; i < nvml.NVLINK_MAX_LINKS; i++ { state, ret := nvml.DeviceGetNvLinkState(device.device, i) if ret == nvml.SUCCESS { @@ -948,6 +954,7 @@ func readNVLinkStats(device NvidiaCollectorDevice, output chan lp.CCMetric) erro if !device.excludeMetrics["nv_nvlink_crc_errors"] { // Data link receive data CRC error counter count, ret := nvml.DeviceGetNvLinkErrorCounter(device.device, i, nvml.NVLINK_ERROR_DL_CRC_DATA) + aggregate_crc_errors = aggregate_crc_errors + count if ret == nvml.SUCCESS { y, err := lp.New("nv_nvlink_crc_errors", device.tags, device.meta, map[string]interface{}{"value": count}, time.Now()) if err == nil { @@ -960,6 +967,7 @@ func readNVLinkStats(device NvidiaCollectorDevice, output chan lp.CCMetric) erro if !device.excludeMetrics["nv_nvlink_ecc_errors"] { // Data link receive data ECC error counter count, ret := nvml.DeviceGetNvLinkErrorCounter(device.device, i, nvml.NVLINK_ERROR_DL_ECC_DATA) + aggregate_ecc_errors = aggregate_ecc_errors + count if ret == nvml.SUCCESS { y, err := lp.New("nv_nvlink_ecc_errors", device.tags, device.meta, map[string]interface{}{"value": count}, time.Now()) if err == nil { @@ -972,6 +980,7 @@ func readNVLinkStats(device NvidiaCollectorDevice, output chan lp.CCMetric) erro if !device.excludeMetrics["nv_nvlink_replay_errors"] { // Data link transmit replay error counter count, ret := nvml.DeviceGetNvLinkErrorCounter(device.device, i, nvml.NVLINK_ERROR_DL_REPLAY) + aggregate_replay_errors = aggregate_replay_errors + count if ret == nvml.SUCCESS { y, err := lp.New("nv_nvlink_replay_errors", device.tags, device.meta, map[string]interface{}{"value": count}, time.Now()) if err == nil { @@ -984,6 +993,7 @@ func readNVLinkStats(device NvidiaCollectorDevice, output chan lp.CCMetric) erro if 
!device.excludeMetrics["nv_nvlink_recovery_errors"] { // Data link transmit recovery error counter count, ret := nvml.DeviceGetNvLinkErrorCounter(device.device, i, nvml.NVLINK_ERROR_DL_RECOVERY) + aggregate_recovery_errors = aggregate_recovery_errors + count if ret == nvml.SUCCESS { y, err := lp.New("nv_nvlink_recovery_errors", device.tags, device.meta, map[string]interface{}{"value": count}, time.Now()) if err == nil { @@ -996,6 +1006,7 @@ func readNVLinkStats(device NvidiaCollectorDevice, output chan lp.CCMetric) erro if !device.excludeMetrics["nv_nvlink_crc_flit_errors"] { // Data link receive flow control digit CRC error counter count, ret := nvml.DeviceGetNvLinkErrorCounter(device.device, i, nvml.NVLINK_ERROR_DL_CRC_FLIT) + aggregate_crc_flit_errors = aggregate_crc_flit_errors + count if ret == nvml.SUCCESS { y, err := lp.New("nv_nvlink_crc_flit_errors", device.tags, device.meta, map[string]interface{}{"value": count}, time.Now()) if err == nil { @@ -1008,6 +1019,48 @@ func readNVLinkStats(device NvidiaCollectorDevice, output chan lp.CCMetric) erro } } } + + // Export aggregated values + if !device.excludeMetrics["nv_nvlink_crc_errors"] { + // Data link receive data CRC error counter + y, err := lp.New("nv_nvlink_crc_errors_sum", device.tags, device.meta, map[string]interface{}{"value": aggregate_crc_errors}, time.Now()) + if err == nil { + y.AddTag("stype", "nvlink") + output <- y + } + } + if !device.excludeMetrics["nv_nvlink_ecc_errors"] { + // Data link receive data ECC error counter + y, err := lp.New("nv_nvlink_ecc_errors_sum", device.tags, device.meta, map[string]interface{}{"value": aggregate_ecc_errors}, time.Now()) + if err == nil { + y.AddTag("stype", "nvlink") + output <- y + } + } + if !device.excludeMetrics["nv_nvlink_replay_errors"] { + // Data link transmit replay error counter + y, err := lp.New("nv_nvlink_replay_errors_sum", device.tags, device.meta, map[string]interface{}{"value": aggregate_replay_errors}, time.Now()) + if err == nil { + 
y.AddTag("stype", "nvlink") + output <- y + } + } + if !device.excludeMetrics["nv_nvlink_recovery_errors"] { + // Data link transmit recovery error counter + y, err := lp.New("nv_nvlink_recovery_errors_sum", device.tags, device.meta, map[string]interface{}{"value": aggregate_recovery_errors}, time.Now()) + if err == nil { + y.AddTag("stype", "nvlink") + output <- y + } + } + if !device.excludeMetrics["nv_nvlink_crc_flit_errors"] { + // Data link receive flow control digit CRC error counter + y, err := lp.New("nv_nvlink_crc_flit_errors_sum", device.tags, device.meta, map[string]interface{}{"value": aggregate_crc_flit_errors}, time.Now()) + if err == nil { + y.AddTag("stype", "nvlink") + output <- y + } + } return nil } diff --git a/sinks/prometheusSink.go b/sinks/prometheusSink.go index 7d792cd..0a6974e 100644 --- a/sinks/prometheusSink.go +++ b/sinks/prometheusSink.go @@ -49,6 +49,8 @@ func intToFloat64(input interface{}) (float64, error) { return float64(value), nil case int64: return float64(value), nil + case uint64: + return float64(value), nil } return 0, errors.New("cannot cast value to float64") } diff --git a/sinks/sinkManager.go b/sinks/sinkManager.go index cd2680f..858aede 100644 --- a/sinks/sinkManager.go +++ b/sinks/sinkManager.go @@ -21,6 +21,7 @@ var AvailableSinks = map[string]func(name string, config json.RawMessage) (Sink, "influxdb": NewInfluxSink, "influxasync": NewInfluxAsyncSink, "http": NewHttpSink, + "prometheus": NewPrometheusSink, } // Metric collector manager data structure