mirror of
https://github.com/ClusterCockpit/cc-metric-collector.git
synced 2026-02-13 14:41:45 +01:00
Compare commits
8 Commits
update_ci
...
replace_de
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
7aa0e31562 | ||
|
|
9908d76aac | ||
|
|
b69281dae6 | ||
|
|
053eb27463 | ||
|
|
665db57a11 | ||
|
|
fc297854d2 | ||
|
|
cca0d23efa | ||
|
|
7cff283001 |
21
.github/workflows/runonce.yml
vendored
21
.github/workflows/runonce.yml
vendored
@@ -35,11 +35,6 @@ jobs:
|
||||
run: |
|
||||
go install github.com/reviewdog/reviewdog/cmd/reviewdog@latest
|
||||
|
||||
# See: https://staticcheck.io
|
||||
- name: Install staticcheck
|
||||
run: |
|
||||
go install honnef.co/go/tools/cmd/staticcheck@latest
|
||||
|
||||
# See: https://golangci-lint.run
|
||||
- name: Install GolangCI-Lint
|
||||
run: |
|
||||
@@ -54,21 +49,7 @@ jobs:
|
||||
# Running the linter requires likwid.h, which gets downloaded in the build step
|
||||
- name: Static Analysis with GolangCI-Lint and Upload Report with reviewdog
|
||||
run: |
|
||||
golangci-lint run | reviewdog -f=golangci-lint -name "Check golangci-lint on build-latest" -reporter=github-check -filter-mode=nofilter -fail-level none
|
||||
env:
|
||||
REVIEWDOG_GITHUB_API_TOKEN: ${{ secrets.GITHUB_TOKEN }}
|
||||
|
||||
# Running the linter requires likwid.h, which gets downloaded in the build step
|
||||
- name: Run Static Analysis with go vet and Upload Report with reviewdog
|
||||
run: |
|
||||
go vet ./... | reviewdog -f=govet -name "Check govet on build-latest" -reporter=github-check -filter-mode=nofilter -fail-level none
|
||||
env:
|
||||
REVIEWDOG_GITHUB_API_TOKEN: ${{ secrets.GITHUB_TOKEN }}
|
||||
|
||||
# Running the linter requires likwid.h, which gets downloaded in the build step
|
||||
- name: Run Static Analysis with staticcheck and Upload Report with reviewdog
|
||||
run: |
|
||||
staticcheck ./... | reviewdog -f=staticcheck -name "Check staticcheck on build-latest" -reporter=github-check -filter-mode=nofilter -fail-level none
|
||||
golangci-lint run --enable errorlint,govet,misspell,modernize,prealloc,staticcheck,unconvert,wastedassign | reviewdog -f=golangci-lint -name "Check golangci-lint on build-latest" -reporter=github-check -filter-mode=nofilter -fail-level none
|
||||
env:
|
||||
REVIEWDOG_GITHUB_API_TOKEN: ${{ secrets.GITHUB_TOKEN }}
|
||||
|
||||
|
||||
5
Makefile
5
Makefile
@@ -72,6 +72,11 @@ staticcheck:
|
||||
$(GOBIN) install honnef.co/go/tools/cmd/staticcheck@latest
|
||||
$$($(GOBIN) env GOPATH)/bin/staticcheck ./...
|
||||
|
||||
.PHONY: golangci-lint
|
||||
golangci-lint:
|
||||
$(GOBIN) install github.com/golangci/golangci-lint/v2/cmd/golangci-lint@latest
|
||||
$$($(GOBIN) env GOPATH)/bin/golangci-lint run
|
||||
|
||||
.ONESHELL:
|
||||
.PHONY: RPM
|
||||
RPM: scripts/cc-metric-collector.spec
|
||||
|
||||
@@ -50,30 +50,6 @@ type RuntimeConfig struct {
|
||||
Sync sync.WaitGroup
|
||||
}
|
||||
|
||||
//// Structure of the configuration file
|
||||
//type GlobalConfig struct {
|
||||
// Sink sinks.SinkConfig `json:"sink"`
|
||||
// Interval int `json:"interval"`
|
||||
// Duration int `json:"duration"`
|
||||
// Collectors []string `json:"collectors"`
|
||||
// Receiver receivers.ReceiverConfig `json:"receiver"`
|
||||
// DefTags map[string]string `json:"default_tags"`
|
||||
// CollectConfigs map[string]json.RawMessage `json:"collect_config"`
|
||||
//}
|
||||
|
||||
//// Load JSON configuration file
|
||||
//func LoadConfiguration(file string, config *GlobalConfig) error {
|
||||
// configFile, err := os.Open(file)
|
||||
// defer configFile.Close()
|
||||
// if err != nil {
|
||||
// fmt.Println(err.Error())
|
||||
// return err
|
||||
// }
|
||||
// jsonParser := json.NewDecoder(configFile)
|
||||
// err = jsonParser.Decode(config)
|
||||
// return err
|
||||
//}
|
||||
|
||||
func ReadCli() map[string]string {
|
||||
var m map[string]string
|
||||
cfg := flag.String("config", "./config.json", "Path to configuration file")
|
||||
@@ -93,22 +69,6 @@ func ReadCli() map[string]string {
|
||||
return m
|
||||
}
|
||||
|
||||
//func SetLogging(logfile string) error {
|
||||
// var file *os.File
|
||||
// var err error
|
||||
// if logfile != "stderr" {
|
||||
// file, err = os.OpenFile(logfile, os.O_APPEND|os.O_CREATE|os.O_WRONLY, 0600)
|
||||
// if err != nil {
|
||||
// log.Fatal(err)
|
||||
// return err
|
||||
// }
|
||||
// } else {
|
||||
// file = os.Stderr
|
||||
// }
|
||||
// log.SetOutput(file)
|
||||
// return nil
|
||||
//}
|
||||
|
||||
// General shutdownHandler function that gets executed in case of interrupt or graceful shutdownHandler
|
||||
func shutdownHandler(config *RuntimeConfig, shutdownSignal chan os.Signal) {
|
||||
defer config.Sync.Done()
|
||||
@@ -216,11 +176,6 @@ func mainFunc() int {
|
||||
return 1
|
||||
}
|
||||
|
||||
// Set log file
|
||||
// if logfile := rcfg.CliArgs["logfile"]; logfile != "stderr" {
|
||||
// cclog.SetOutput(logfile)
|
||||
// }
|
||||
|
||||
// Creat new multi channel ticker
|
||||
rcfg.MultiChanTicker = mct.NewTicker(rcfg.Interval)
|
||||
|
||||
|
||||
@@ -67,7 +67,7 @@ A collector reads data from any source, parses it to metrics and submits these m
|
||||
* `Read(duration time.Duration, output chan ccMessage.CCMessage)`: Read, parse and submit data to the `output` channel as [`CCMessage`](https://github.com/ClusterCockpit/cc-lib/blob/main/ccMessage/README.md). If the collector has to measure anything for some duration, use the provided function argument `duration`.
|
||||
* `Close()`: Closes down the collector.
|
||||
|
||||
It is recommanded to call `setup()` in the `Init()` function.
|
||||
It is recommended to call `setup()` in the `Init()` function.
|
||||
|
||||
Finally, the collector needs to be registered in the `collectorManager.go`. There is a list of collectors called `AvailableCollectors` which is a map (`collector_type_string` -> `pointer to MetricCollector interface`). Add a new entry with a descriptive name and the new collector.
|
||||
|
||||
@@ -100,11 +100,12 @@ func (m *SampleCollector) Init(config json.RawMessage) error {
|
||||
}
|
||||
|
||||
m.name = "SampleCollector"
|
||||
m.setup()
|
||||
if err := m.setup(); err != nil {
|
||||
return fmt.Errorf("%s Init(): setup() call failed: %w", m.name, err)
|
||||
}
|
||||
if len(config) > 0 {
|
||||
err := json.Unmarshal(config, &m.config)
|
||||
if err != nil {
|
||||
return err
|
||||
if err := json.Unmarshal(config, &m.config); err != nil {
|
||||
return fmt.Errorf("%s Init(): json.Unmarshal() call failed: %w", m.name, err)
|
||||
}
|
||||
}
|
||||
m.meta = map[string]string{"source": m.name, "group": "Sample"}
|
||||
|
||||
@@ -17,6 +17,7 @@ import (
|
||||
"os/exec"
|
||||
"os/user"
|
||||
"regexp"
|
||||
"slices"
|
||||
"strconv"
|
||||
"strings"
|
||||
"time"
|
||||
@@ -61,7 +62,9 @@ func (m *BeegfsMetaCollector) Init(config json.RawMessage) error {
|
||||
"rmXA", "setXA", "mirror"}
|
||||
|
||||
m.name = "BeegfsMetaCollector"
|
||||
m.setup()
|
||||
if err := m.setup(); err != nil {
|
||||
return fmt.Errorf("%s Init(): setup() call failed: %w", m.name, err)
|
||||
}
|
||||
m.parallel = true
|
||||
// Set default beegfs-ctl binary
|
||||
|
||||
@@ -75,11 +78,10 @@ func (m *BeegfsMetaCollector) Init(config json.RawMessage) error {
|
||||
}
|
||||
}
|
||||
|
||||
//create map with possible variables
|
||||
// Create map with possible variables
|
||||
m.matches = make(map[string]string)
|
||||
for _, value := range nodeMdstat_array {
|
||||
_, skip := stringArrayContains(m.config.ExcludeMetrics, value)
|
||||
if skip {
|
||||
if slices.Contains(m.config.ExcludeMetrics, value) {
|
||||
m.matches["other"] = "0"
|
||||
} else {
|
||||
m.matches["beegfs_cmeta_"+value] = "0"
|
||||
@@ -102,7 +104,7 @@ func (m *BeegfsMetaCollector) Init(config json.RawMessage) error {
|
||||
// Beegfs file system statistics can only be queried by user root
|
||||
user, err := user.Current()
|
||||
if err != nil {
|
||||
return fmt.Errorf("BeegfsMetaCollector.Init(): Failed to get current user: %v", err)
|
||||
return fmt.Errorf("BeegfsMetaCollector.Init(): Failed to get current user: %w", err)
|
||||
}
|
||||
if user.Uid != "0" {
|
||||
return fmt.Errorf("BeegfsMetaCollector.Init(): BeeGFS file system statistics can only be queried by user root")
|
||||
@@ -111,7 +113,7 @@ func (m *BeegfsMetaCollector) Init(config json.RawMessage) error {
|
||||
// Check if beegfs-ctl is in executable search path
|
||||
_, err = exec.LookPath(m.config.Beegfs)
|
||||
if err != nil {
|
||||
return fmt.Errorf("BeegfsMetaCollector.Init(): Failed to find beegfs-ctl binary '%s': %v", m.config.Beegfs, err)
|
||||
return fmt.Errorf("BeegfsMetaCollector.Init(): Failed to find beegfs-ctl binary '%s': %w", m.config.Beegfs, err)
|
||||
}
|
||||
m.init = true
|
||||
return nil
|
||||
@@ -121,7 +123,7 @@ func (m *BeegfsMetaCollector) Read(interval time.Duration, output chan lp.CCMess
|
||||
if !m.init {
|
||||
return
|
||||
}
|
||||
//get mounpoint
|
||||
// Get mounpoint
|
||||
buffer, _ := os.ReadFile(string("/proc/mounts"))
|
||||
mounts := strings.Split(string(buffer), "\n")
|
||||
var mountpoints []string
|
||||
@@ -162,12 +164,15 @@ func (m *BeegfsMetaCollector) Read(interval time.Duration, output chan lp.CCMess
|
||||
cmd.Stderr = cmdStderr
|
||||
err := cmd.Run()
|
||||
if err != nil {
|
||||
fmt.Fprintf(os.Stderr, "BeegfsMetaCollector.Read(): Failed to execute command \"%s\": %s\n", cmd.String(), err.Error())
|
||||
fmt.Fprintf(os.Stderr, "BeegfsMetaCollector.Read(): command exit code: \"%d\"\n", cmd.ProcessState.ExitCode())
|
||||
data, _ := io.ReadAll(cmdStderr)
|
||||
fmt.Fprintf(os.Stderr, "BeegfsMetaCollector.Read(): command stderr: \"%s\"\n", string(data))
|
||||
data, _ = io.ReadAll(cmdStdout)
|
||||
fmt.Fprintf(os.Stderr, "BeegfsMetaCollector.Read(): command stdout: \"%s\"\n", string(data))
|
||||
dataStdErr, _ := io.ReadAll(cmdStderr)
|
||||
dataStdOut, _ := io.ReadAll(cmdStdout)
|
||||
cclog.ComponentError(
|
||||
m.name,
|
||||
fmt.Sprintf("Read(): Failed to execute command \"%s\": %v\n", cmd.String(), err),
|
||||
fmt.Sprintf("Read(): command exit code: \"%d\"\n", cmd.ProcessState.ExitCode()),
|
||||
fmt.Sprintf("Read(): command stderr: \"%s\"\n", string(dataStdErr)),
|
||||
fmt.Sprintf("Read(): command stdout: \"%s\"\n", string(dataStdOut)),
|
||||
)
|
||||
return
|
||||
}
|
||||
// Read I/O statistics
|
||||
@@ -223,7 +228,7 @@ func (m *BeegfsMetaCollector) Read(interval time.Duration, output chan lp.CCMess
|
||||
|
||||
for key, data := range m.matches {
|
||||
value, _ := strconv.ParseFloat(data, 32)
|
||||
y, err := lp.NewMessage(key, m.tags, m.meta, map[string]interface{}{"value": value}, time.Now())
|
||||
y, err := lp.NewMessage(key, m.tags, m.meta, map[string]any{"value": value}, time.Now())
|
||||
if err == nil {
|
||||
output <- y
|
||||
}
|
||||
|
||||
@@ -17,6 +17,7 @@ import (
|
||||
"os/exec"
|
||||
"os/user"
|
||||
"regexp"
|
||||
"slices"
|
||||
"strconv"
|
||||
"strings"
|
||||
"time"
|
||||
@@ -54,7 +55,9 @@ func (m *BeegfsStorageCollector) Init(config json.RawMessage) error {
|
||||
"storInf", "unlnk"}
|
||||
|
||||
m.name = "BeegfsStorageCollector"
|
||||
m.setup()
|
||||
if err := m.setup(); err != nil {
|
||||
return fmt.Errorf("%s Init(): setup() call failed: %w", m.name, err)
|
||||
}
|
||||
m.parallel = true
|
||||
// Set default beegfs-ctl binary
|
||||
|
||||
@@ -71,8 +74,7 @@ func (m *BeegfsStorageCollector) Init(config json.RawMessage) error {
|
||||
//create map with possible variables
|
||||
m.matches = make(map[string]string)
|
||||
for _, value := range storageStat_array {
|
||||
_, skip := stringArrayContains(m.config.ExcludeMetrics, value)
|
||||
if skip {
|
||||
if slices.Contains(m.config.ExcludeMetrics, value) {
|
||||
m.matches["other"] = "0"
|
||||
} else {
|
||||
m.matches["beegfs_cstorage_"+value] = "0"
|
||||
@@ -95,7 +97,7 @@ func (m *BeegfsStorageCollector) Init(config json.RawMessage) error {
|
||||
// Beegfs file system statistics can only be queried by user root
|
||||
user, err := user.Current()
|
||||
if err != nil {
|
||||
return fmt.Errorf("BeegfsStorageCollector.Init(): Failed to get current user: %v", err)
|
||||
return fmt.Errorf("BeegfsStorageCollector.Init(): Failed to get current user: %w", err)
|
||||
}
|
||||
if user.Uid != "0" {
|
||||
return fmt.Errorf("BeegfsStorageCollector.Init(): BeeGFS file system statistics can only be queried by user root")
|
||||
@@ -104,7 +106,7 @@ func (m *BeegfsStorageCollector) Init(config json.RawMessage) error {
|
||||
// Check if beegfs-ctl is in executable search path
|
||||
_, err = exec.LookPath(m.config.Beegfs)
|
||||
if err != nil {
|
||||
return fmt.Errorf("BeegfsStorageCollector.Init(): Failed to find beegfs-ctl binary '%s': %v", m.config.Beegfs, err)
|
||||
return fmt.Errorf("BeegfsStorageCollector.Init(): Failed to find beegfs-ctl binary '%s': %w", m.config.Beegfs, err)
|
||||
}
|
||||
m.init = true
|
||||
return nil
|
||||
@@ -154,12 +156,15 @@ func (m *BeegfsStorageCollector) Read(interval time.Duration, output chan lp.CCM
|
||||
cmd.Stderr = cmdStderr
|
||||
err := cmd.Run()
|
||||
if err != nil {
|
||||
fmt.Fprintf(os.Stderr, "BeegfsStorageCollector.Read(): Failed to execute command \"%s\": %s\n", cmd.String(), err.Error())
|
||||
fmt.Fprintf(os.Stderr, "BeegfsStorageCollector.Read(): command exit code: \"%d\"\n", cmd.ProcessState.ExitCode())
|
||||
data, _ := io.ReadAll(cmdStderr)
|
||||
fmt.Fprintf(os.Stderr, "BeegfsStorageCollector.Read(): command stderr: \"%s\"\n", string(data))
|
||||
data, _ = io.ReadAll(cmdStdout)
|
||||
fmt.Fprintf(os.Stderr, "BeegfsStorageCollector.Read(): command stdout: \"%s\"\n", string(data))
|
||||
dataStdErr, _ := io.ReadAll(cmdStderr)
|
||||
dataStdOut, _ := io.ReadAll(cmdStdout)
|
||||
cclog.ComponentError(
|
||||
m.name,
|
||||
fmt.Sprintf("Read(): Failed to execute command \"%s\": %v\n", cmd.String(), err),
|
||||
fmt.Sprintf("Read(): command exit code: \"%d\"\n", cmd.ProcessState.ExitCode()),
|
||||
fmt.Sprintf("Read(): command stderr: \"%s\"\n", string(dataStdErr)),
|
||||
fmt.Sprintf("Read(): command stdout: \"%s\"\n", string(dataStdOut)),
|
||||
)
|
||||
return
|
||||
}
|
||||
// Read I/O statistics
|
||||
@@ -215,7 +220,7 @@ func (m *BeegfsStorageCollector) Read(interval time.Duration, output chan lp.CCM
|
||||
|
||||
for key, data := range m.matches {
|
||||
value, _ := strconv.ParseFloat(data, 32)
|
||||
y, err := lp.NewMessage(key, m.tags, m.meta, map[string]interface{}{"value": value}, time.Now())
|
||||
y, err := lp.NewMessage(key, m.tags, m.meta, map[string]any{"value": value}, time.Now())
|
||||
if err == nil {
|
||||
output <- y
|
||||
}
|
||||
|
||||
@@ -9,6 +9,7 @@ package collectors
|
||||
|
||||
import (
|
||||
"encoding/json"
|
||||
"fmt"
|
||||
"sync"
|
||||
"time"
|
||||
|
||||
@@ -104,7 +105,7 @@ func (cm *collectorManager) Init(ticker mct.MultiChanTicker, duration time.Durat
|
||||
|
||||
err = collector.Init(collectorCfg)
|
||||
if err != nil {
|
||||
cclog.ComponentError("CollectorManager", "Collector", collectorName, "initialization failed:", err.Error())
|
||||
cclog.ComponentError("CollectorManager", fmt.Sprintf("Collector %s initialization failed: %v", collectorName, err))
|
||||
continue
|
||||
}
|
||||
cclog.ComponentDebug("CollectorManager", "ADD COLLECTOR", collector.Name())
|
||||
|
||||
@@ -35,15 +35,16 @@ type CPUFreqCpuInfoCollector struct {
|
||||
topology []CPUFreqCpuInfoCollectorTopology
|
||||
}
|
||||
|
||||
func (m *CPUFreqCpuInfoCollector) Init(config json.RawMessage) error {
|
||||
func (m *CPUFreqCpuInfoCollector) Init(_ json.RawMessage) error {
|
||||
// Check if already initialized
|
||||
if m.init {
|
||||
return nil
|
||||
}
|
||||
|
||||
m.setup()
|
||||
|
||||
m.name = "CPUFreqCpuInfoCollector"
|
||||
if err := m.setup(); err != nil {
|
||||
return fmt.Errorf("%s Init(): setup() call failed: %w", m.name, err)
|
||||
}
|
||||
m.parallel = true
|
||||
m.meta = map[string]string{
|
||||
"source": m.name,
|
||||
@@ -54,9 +55,8 @@ func (m *CPUFreqCpuInfoCollector) Init(config json.RawMessage) error {
|
||||
const cpuInfoFile = "/proc/cpuinfo"
|
||||
file, err := os.Open(cpuInfoFile)
|
||||
if err != nil {
|
||||
return fmt.Errorf("failed to open file '%s': %v", cpuInfoFile, err)
|
||||
return fmt.Errorf("%s Init(): failed to open file '%s': %w", m.name, cpuInfoFile, err)
|
||||
}
|
||||
defer file.Close()
|
||||
|
||||
// Collect topology information from file cpuinfo
|
||||
foundFreq := false
|
||||
@@ -86,6 +86,10 @@ func (m *CPUFreqCpuInfoCollector) Init(config json.RawMessage) error {
|
||||
}
|
||||
}
|
||||
|
||||
if err := file.Close(); err != nil {
|
||||
return fmt.Errorf("%s Init(): Call to file.Close() failed: %w", m.name, err)
|
||||
}
|
||||
|
||||
// were all topology information collected?
|
||||
if foundFreq &&
|
||||
len(processor) > 0 &&
|
||||
@@ -119,7 +123,7 @@ func (m *CPUFreqCpuInfoCollector) Init(config json.RawMessage) error {
|
||||
|
||||
// Check if at least one CPU with frequency information was detected
|
||||
if len(m.topology) == 0 {
|
||||
return fmt.Errorf("no CPU frequency info found in %s", cpuInfoFile)
|
||||
return fmt.Errorf("%s Init(): no CPU frequency info found in %s", m.name, cpuInfoFile)
|
||||
}
|
||||
|
||||
m.init = true
|
||||
@@ -140,7 +144,13 @@ func (m *CPUFreqCpuInfoCollector) Read(interval time.Duration, output chan lp.CC
|
||||
fmt.Sprintf("Read(): Failed to open file '%s': %v", cpuInfoFile, err))
|
||||
return
|
||||
}
|
||||
defer file.Close()
|
||||
defer func() {
|
||||
if err := file.Close(); err != nil {
|
||||
cclog.ComponentError(
|
||||
m.name,
|
||||
fmt.Sprintf("Read(): Failed to close file '%s': %v", cpuInfoFile, err))
|
||||
}
|
||||
}()
|
||||
|
||||
processorCounter := 0
|
||||
now := time.Now()
|
||||
@@ -161,7 +171,7 @@ func (m *CPUFreqCpuInfoCollector) Read(interval time.Duration, output chan lp.CC
|
||||
fmt.Sprintf("Read(): Failed to convert cpu MHz '%s' to float64: %v", lineSplit[1], err))
|
||||
return
|
||||
}
|
||||
if y, err := lp.NewMessage("cpufreq", t.tagSet, m.meta, map[string]interface{}{"value": value}, now); err == nil {
|
||||
if y, err := lp.NewMessage("cpufreq", t.tagSet, m.meta, map[string]any{"value": value}, now); err == nil {
|
||||
output <- y
|
||||
}
|
||||
}
|
||||
|
||||
@@ -48,7 +48,9 @@ func (m *CPUFreqCollector) Init(config json.RawMessage) error {
|
||||
}
|
||||
|
||||
m.name = "CPUFreqCollector"
|
||||
m.setup()
|
||||
if err := m.setup(); err != nil {
|
||||
return fmt.Errorf("%s Init(): setup() call failed: %w", m.name, err)
|
||||
}
|
||||
m.parallel = true
|
||||
if len(config) > 0 {
|
||||
err := json.Unmarshal(config, &m.config)
|
||||
@@ -74,15 +76,15 @@ func (m *CPUFreqCollector) Init(config json.RawMessage) error {
|
||||
scalingCurFreqFile := filepath.Join("/sys/devices/system/cpu", fmt.Sprintf("cpu%d", c.CpuID), "cpufreq/scaling_cur_freq")
|
||||
err := unix.Access(scalingCurFreqFile, unix.R_OK)
|
||||
if err != nil {
|
||||
return fmt.Errorf("unable to access file '%s': %v", scalingCurFreqFile, err)
|
||||
return fmt.Errorf("unable to access file '%s': %w", scalingCurFreqFile, err)
|
||||
}
|
||||
|
||||
m.topology = append(m.topology,
|
||||
CPUFreqCollectorTopology{
|
||||
tagSet: map[string]string{
|
||||
"type": "hwthread",
|
||||
"type-id": fmt.Sprint(c.CpuID),
|
||||
"package_id": fmt.Sprint(c.Socket),
|
||||
"type-id": strconv.Itoa(c.CpuID),
|
||||
"package_id": strconv.Itoa(c.Socket),
|
||||
},
|
||||
scalingCurFreqFile: scalingCurFreqFile,
|
||||
},
|
||||
@@ -124,7 +126,7 @@ func (m *CPUFreqCollector) Read(interval time.Duration, output chan lp.CCMessage
|
||||
continue
|
||||
}
|
||||
|
||||
if y, err := lp.NewMessage("cpufreq", t.tagSet, m.meta, map[string]interface{}{"value": cpuFreq}, now); err == nil {
|
||||
if y, err := lp.NewMessage("cpufreq", t.tagSet, m.meta, map[string]any{"value": cpuFreq}, now); err == nil {
|
||||
output <- y
|
||||
}
|
||||
}
|
||||
|
||||
@@ -12,6 +12,7 @@ import (
|
||||
"encoding/json"
|
||||
"fmt"
|
||||
"os"
|
||||
"slices"
|
||||
"strconv"
|
||||
"strings"
|
||||
"time"
|
||||
@@ -39,10 +40,17 @@ type CpustatCollector struct {
|
||||
|
||||
func (m *CpustatCollector) Init(config json.RawMessage) error {
|
||||
m.name = "CpustatCollector"
|
||||
m.setup()
|
||||
if err := m.setup(); err != nil {
|
||||
return fmt.Errorf("%s Init(): setup() call failed: %w", m.name, err)
|
||||
}
|
||||
m.parallel = true
|
||||
m.meta = map[string]string{"source": m.name, "group": "CPU"}
|
||||
m.nodetags = map[string]string{"type": "node"}
|
||||
m.meta = map[string]string{
|
||||
"source": m.name,
|
||||
"group": "CPU",
|
||||
}
|
||||
m.nodetags = map[string]string{
|
||||
"type": "node",
|
||||
}
|
||||
if len(config) > 0 {
|
||||
err := json.Unmarshal(config, &m.config)
|
||||
if err != nil {
|
||||
@@ -64,14 +72,7 @@ func (m *CpustatCollector) Init(config json.RawMessage) error {
|
||||
|
||||
m.matches = make(map[string]int)
|
||||
for match, index := range matches {
|
||||
doExclude := false
|
||||
for _, exclude := range m.config.ExcludeMetrics {
|
||||
if match == exclude {
|
||||
doExclude = true
|
||||
break
|
||||
}
|
||||
}
|
||||
if !doExclude {
|
||||
if !slices.Contains(m.config.ExcludeMetrics, match) {
|
||||
m.matches[match] = index
|
||||
}
|
||||
}
|
||||
@@ -79,9 +80,17 @@ func (m *CpustatCollector) Init(config json.RawMessage) error {
|
||||
// Check input file
|
||||
file, err := os.Open(string(CPUSTATFILE))
|
||||
if err != nil {
|
||||
cclog.ComponentError(m.name, err.Error())
|
||||
cclog.ComponentError(
|
||||
m.name,
|
||||
fmt.Sprintf("Init(): Failed to open file '%s': %v", string(CPUSTATFILE), err))
|
||||
}
|
||||
defer file.Close()
|
||||
defer func() {
|
||||
if err := file.Close(); err != nil {
|
||||
cclog.ComponentError(
|
||||
m.name,
|
||||
fmt.Sprintf("Init(): Failed to close file '%s': %v", string(CPUSTATFILE), err))
|
||||
}
|
||||
}()
|
||||
|
||||
// Pre-generate tags for all CPUs
|
||||
num_cpus := 0
|
||||
@@ -99,7 +108,9 @@ func (m *CpustatCollector) Init(config json.RawMessage) error {
|
||||
} else if strings.HasPrefix(linefields[0], "cpu") && strings.Compare(linefields[0], "cpu") != 0 {
|
||||
cpustr := strings.TrimLeft(linefields[0], "cpu")
|
||||
cpu, _ := strconv.Atoi(cpustr)
|
||||
m.cputags[linefields[0]] = map[string]string{"type": "hwthread", "type-id": fmt.Sprintf("%d", cpu)}
|
||||
m.cputags[linefields[0]] = map[string]string{
|
||||
"type": "hwthread",
|
||||
"type-id": strconv.Itoa(cpu)}
|
||||
m.olddata[linefields[0]] = make(map[string]int64)
|
||||
for k, v := range m.matches {
|
||||
m.olddata[linefields[0]][k], _ = strconv.ParseInt(linefields[v], 0, 64)
|
||||
@@ -129,7 +140,7 @@ func (m *CpustatCollector) parseStatLine(linefields []string, tags map[string]st
|
||||
sum := float64(0)
|
||||
for name, value := range values {
|
||||
sum += value
|
||||
y, err := lp.NewMessage(name, tags, m.meta, map[string]interface{}{"value": value * 100}, now)
|
||||
y, err := lp.NewMessage(name, tags, m.meta, map[string]any{"value": value * 100}, now)
|
||||
if err == nil {
|
||||
y.AddTag("unit", "Percent")
|
||||
output <- y
|
||||
@@ -137,7 +148,7 @@ func (m *CpustatCollector) parseStatLine(linefields []string, tags map[string]st
|
||||
}
|
||||
if v, ok := values["cpu_idle"]; ok {
|
||||
sum -= v
|
||||
y, err := lp.NewMessage("cpu_used", tags, m.meta, map[string]interface{}{"value": sum * 100}, now)
|
||||
y, err := lp.NewMessage("cpu_used", tags, m.meta, map[string]any{"value": sum * 100}, now)
|
||||
if err == nil {
|
||||
y.AddTag("unit", "Percent")
|
||||
output <- y
|
||||
@@ -155,9 +166,17 @@ func (m *CpustatCollector) Read(interval time.Duration, output chan lp.CCMessage
|
||||
|
||||
file, err := os.Open(string(CPUSTATFILE))
|
||||
if err != nil {
|
||||
cclog.ComponentError(m.name, err.Error())
|
||||
cclog.ComponentError(
|
||||
m.name,
|
||||
fmt.Sprintf("Read(): Failed to open file '%s': %v", string(CPUSTATFILE), err))
|
||||
}
|
||||
defer file.Close()
|
||||
defer func() {
|
||||
if err := file.Close(); err != nil {
|
||||
cclog.ComponentError(
|
||||
m.name,
|
||||
fmt.Sprintf("Read(): Failed to close file '%s': %v", string(CPUSTATFILE), err))
|
||||
}
|
||||
}()
|
||||
|
||||
scanner := bufio.NewScanner(file)
|
||||
for scanner.Scan() {
|
||||
@@ -174,7 +193,7 @@ func (m *CpustatCollector) Read(interval time.Duration, output chan lp.CCMessage
|
||||
num_cpus_metric, err := lp.NewMessage("num_cpus",
|
||||
m.nodetags,
|
||||
m.meta,
|
||||
map[string]interface{}{"value": int(num_cpus)},
|
||||
map[string]any{"value": num_cpus},
|
||||
now,
|
||||
)
|
||||
if err == nil {
|
||||
|
||||
@@ -8,16 +8,22 @@
|
||||
package collectors
|
||||
|
||||
import (
|
||||
"bytes"
|
||||
"encoding/json"
|
||||
"errors"
|
||||
"log"
|
||||
"fmt"
|
||||
"io"
|
||||
"os"
|
||||
"os/exec"
|
||||
"slices"
|
||||
"strings"
|
||||
"time"
|
||||
|
||||
cclog "github.com/ClusterCockpit/cc-lib/v2/ccLogger"
|
||||
lp "github.com/ClusterCockpit/cc-lib/v2/ccMessage"
|
||||
influx "github.com/influxdata/line-protocol"
|
||||
|
||||
receivers "github.com/ClusterCockpit/cc-lib/v2/receivers"
|
||||
lp2 "github.com/influxdata/line-protocol/v2/lineprotocol"
|
||||
)
|
||||
|
||||
const CUSTOMCMDPATH = `/home/unrz139/Work/cc-metric-collector/collectors/custom`
|
||||
@@ -30,8 +36,6 @@ type CustomCmdCollectorConfig struct {
|
||||
|
||||
type CustomCmdCollector struct {
|
||||
metricCollector
|
||||
handler *influx.MetricHandler
|
||||
parser *influx.Parser
|
||||
config CustomCmdCollectorConfig
|
||||
commands []string
|
||||
files []string
|
||||
@@ -41,22 +45,31 @@ func (m *CustomCmdCollector) Init(config json.RawMessage) error {
|
||||
var err error
|
||||
m.name = "CustomCmdCollector"
|
||||
m.parallel = true
|
||||
m.meta = map[string]string{"source": m.name, "group": "Custom"}
|
||||
m.meta = map[string]string{
|
||||
"source": m.name,
|
||||
"group": "Custom",
|
||||
}
|
||||
if len(config) > 0 {
|
||||
err = json.Unmarshal(config, &m.config)
|
||||
if err != nil {
|
||||
log.Print(err.Error())
|
||||
return err
|
||||
return fmt.Errorf("%s Init(): json.Unmarshal() call failed: %w", m.name, err)
|
||||
}
|
||||
}
|
||||
m.setup()
|
||||
if err := m.setup(); err != nil {
|
||||
return fmt.Errorf("%s Init(): setup() call failed: %w", m.name, err)
|
||||
}
|
||||
for _, c := range m.config.Commands {
|
||||
cmdfields := strings.Fields(c)
|
||||
command := exec.Command(cmdfields[0], strings.Join(cmdfields[1:], " "))
|
||||
command.Wait()
|
||||
command := exec.Command(cmdfields[0], cmdfields[1:]...)
|
||||
_, err = command.Output()
|
||||
if err == nil {
|
||||
m.commands = append(m.commands, c)
|
||||
} else {
|
||||
cclog.ComponentWarn(
|
||||
m.name,
|
||||
fmt.Sprintf("%s Init(): Execution of command \"%s\" failed: %v", m.name, command.String(), err),
|
||||
)
|
||||
continue
|
||||
}
|
||||
}
|
||||
for _, f := range m.config.Files {
|
||||
@@ -64,16 +77,16 @@ func (m *CustomCmdCollector) Init(config json.RawMessage) error {
|
||||
if err == nil {
|
||||
m.files = append(m.files, f)
|
||||
} else {
|
||||
log.Print(err.Error())
|
||||
cclog.ComponentWarn(
|
||||
m.name,
|
||||
fmt.Sprintf("%s Init(): Reading of file \"%s\" failed: %v", m.name, f, err),
|
||||
)
|
||||
continue
|
||||
}
|
||||
}
|
||||
if len(m.files) == 0 && len(m.commands) == 0 {
|
||||
return errors.New("no metrics to collect")
|
||||
}
|
||||
m.handler = influx.NewMetricHandler()
|
||||
m.parser = influx.NewParser(m.handler)
|
||||
m.parser.SetTimeFunc(DefaultTime)
|
||||
m.init = true
|
||||
return nil
|
||||
}
|
||||
@@ -87,45 +100,83 @@ func (m *CustomCmdCollector) Read(interval time.Duration, output chan lp.CCMessa
|
||||
return
|
||||
}
|
||||
for _, cmd := range m.commands {
|
||||
cmdfields := strings.Fields(cmd)
|
||||
command := exec.Command(cmdfields[0], strings.Join(cmdfields[1:], " "))
|
||||
command.Wait()
|
||||
stdout, err := command.Output()
|
||||
if err != nil {
|
||||
log.Print(err)
|
||||
continue
|
||||
}
|
||||
cmdmetrics, err := m.parser.Parse(stdout)
|
||||
if err != nil {
|
||||
log.Print(err)
|
||||
continue
|
||||
}
|
||||
for _, c := range cmdmetrics {
|
||||
_, skip := stringArrayContains(m.config.ExcludeMetrics, c.Name())
|
||||
if skip {
|
||||
continue
|
||||
}
|
||||
// Execute configured commands
|
||||
cmdFields := strings.Fields(cmd)
|
||||
command := exec.Command(cmdFields[0], cmdFields[1:]...)
|
||||
stdout, _ := command.StdoutPipe()
|
||||
errBuf := new(bytes.Buffer)
|
||||
command.Stderr = errBuf
|
||||
|
||||
output <- lp.FromInfluxMetric(c)
|
||||
}
|
||||
}
|
||||
for _, file := range m.files {
|
||||
buffer, err := os.ReadFile(file)
|
||||
if err != nil {
|
||||
log.Print(err)
|
||||
// Start command
|
||||
if err := command.Start(); err != nil {
|
||||
cclog.ComponentError(
|
||||
m.name,
|
||||
fmt.Sprintf("Read(): Failed to start command \"%s\": %v", command.String(), err),
|
||||
)
|
||||
return
|
||||
}
|
||||
fmetrics, err := m.parser.Parse(buffer)
|
||||
if err != nil {
|
||||
log.Print(err)
|
||||
continue
|
||||
}
|
||||
for _, f := range fmetrics {
|
||||
_, skip := stringArrayContains(m.config.ExcludeMetrics, f.Name())
|
||||
if skip {
|
||||
|
||||
// Read and decode influxDB line-protocol from command output
|
||||
d := lp2.NewDecoder(stdout)
|
||||
for d.Next() {
|
||||
metric, err := receivers.DecodeInfluxMessage(d)
|
||||
if err != nil {
|
||||
cclog.ComponentError(
|
||||
m.name,
|
||||
fmt.Sprintf("Read(): Failed to decode influx Message: %v", err),
|
||||
)
|
||||
continue
|
||||
}
|
||||
output <- lp.FromInfluxMetric(f)
|
||||
if slices.Contains(m.config.ExcludeMetrics, metric.Name()) {
|
||||
continue
|
||||
}
|
||||
output <- metric
|
||||
}
|
||||
|
||||
// Wait for command end
|
||||
if err := command.Wait(); err != nil {
|
||||
errMsg, _ := io.ReadAll(errBuf)
|
||||
cclog.ComponentError(
|
||||
m.name,
|
||||
fmt.Sprintf("Read(): Failed to wait for the end of command \"%s\": %v\n", command.String(), err),
|
||||
)
|
||||
cclog.ComponentError(
|
||||
m.name,
|
||||
fmt.Sprintf("Read(): command stderr: \"%s\"\n", strings.TrimSpace(string(errMsg))))
|
||||
return
|
||||
}
|
||||
}
|
||||
for _, filename := range m.files {
|
||||
file, err := os.Open(filename)
|
||||
if err != nil {
|
||||
cclog.ComponentError(
|
||||
m.name,
|
||||
fmt.Sprintf("Read(): Failed to open file \"%s\": %v\n", filename, err),
|
||||
)
|
||||
}
|
||||
|
||||
// Read and decode influxDB line-protocol from file
|
||||
d := lp2.NewDecoder(file)
|
||||
for d.Next() {
|
||||
metric, err := receivers.DecodeInfluxMessage(d)
|
||||
if err != nil {
|
||||
cclog.ComponentError(
|
||||
m.name,
|
||||
fmt.Sprintf("Read(): Failed to decode influx Message: %v", err),
|
||||
)
|
||||
continue
|
||||
}
|
||||
if slices.Contains(m.config.ExcludeMetrics, metric.Name()) {
|
||||
continue
|
||||
}
|
||||
output <- metric
|
||||
}
|
||||
|
||||
if err := file.Close(); err != nil {
|
||||
cclog.ComponentError(
|
||||
m.name,
|
||||
fmt.Sprintf("Read(): Failed to close file \"%s\": %v\n", filename, err),
|
||||
)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -10,6 +10,7 @@ package collectors
|
||||
import (
|
||||
"bufio"
|
||||
"encoding/json"
|
||||
"fmt"
|
||||
"os"
|
||||
"strings"
|
||||
"syscall"
|
||||
@@ -36,7 +37,9 @@ func (m *DiskstatCollector) Init(config json.RawMessage) error {
|
||||
m.name = "DiskstatCollector"
|
||||
m.parallel = true
|
||||
m.meta = map[string]string{"source": m.name, "group": "Disk"}
|
||||
m.setup()
|
||||
if err := m.setup(); err != nil {
|
||||
return fmt.Errorf("%s Init(): setup() call failed: %w", m.name, err)
|
||||
}
|
||||
if len(config) > 0 {
|
||||
if err := json.Unmarshal(config, &m.config); err != nil {
|
||||
return err
|
||||
@@ -54,10 +57,11 @@ func (m *DiskstatCollector) Init(config json.RawMessage) error {
|
||||
}
|
||||
file, err := os.Open(MOUNTFILE)
|
||||
if err != nil {
|
||||
cclog.ComponentError(m.name, err.Error())
|
||||
return err
|
||||
return fmt.Errorf("%s Init(): file open for file \"%s\" failed: %w", m.name, MOUNTFILE, err)
|
||||
}
|
||||
if err := file.Close(); err != nil {
|
||||
return fmt.Errorf("%s Init(): file close for file \"%s\" failed: %w", m.name, MOUNTFILE, err)
|
||||
}
|
||||
defer file.Close()
|
||||
m.init = true
|
||||
return nil
|
||||
}
|
||||
@@ -69,10 +73,18 @@ func (m *DiskstatCollector) Read(interval time.Duration, output chan lp.CCMessag
|
||||
|
||||
file, err := os.Open(MOUNTFILE)
|
||||
if err != nil {
|
||||
cclog.ComponentError(m.name, err.Error())
|
||||
cclog.ComponentError(
|
||||
m.name,
|
||||
fmt.Sprintf("Read(): Failed to open file '%s': %v", MOUNTFILE, err))
|
||||
return
|
||||
}
|
||||
defer file.Close()
|
||||
defer func() {
|
||||
if err := file.Close(); err != nil {
|
||||
cclog.ComponentError(
|
||||
m.name,
|
||||
fmt.Sprintf("Read(): Failed to close file '%s': %v", MOUNTFILE, err))
|
||||
}
|
||||
}()
|
||||
|
||||
part_max_used := uint64(0)
|
||||
scanner := bufio.NewScanner(file)
|
||||
@@ -93,7 +105,7 @@ mountLoop:
|
||||
continue
|
||||
}
|
||||
|
||||
mountPath := strings.Replace(linefields[1], `\040`, " ", -1)
|
||||
mountPath := strings.ReplaceAll(linefields[1], `\040`, " ")
|
||||
|
||||
for _, excl := range m.config.ExcludeMounts {
|
||||
if strings.Contains(mountPath, excl) {
|
||||
@@ -112,7 +124,13 @@ mountLoop:
|
||||
tags := map[string]string{"type": "node", "device": linefields[0]}
|
||||
total := (stat.Blocks * uint64(stat.Bsize)) / uint64(1000000000)
|
||||
if m.allowedMetrics["disk_total"] {
|
||||
y, err := lp.NewMessage("disk_total", tags, m.meta, map[string]interface{}{"value": total}, time.Now())
|
||||
y, err := lp.NewMessage(
|
||||
"disk_total",
|
||||
tags,
|
||||
m.meta,
|
||||
map[string]any{
|
||||
"value": total},
|
||||
time.Now())
|
||||
if err == nil {
|
||||
y.AddMeta("unit", "GBytes")
|
||||
output <- y
|
||||
@@ -120,7 +138,13 @@ mountLoop:
|
||||
}
|
||||
free := (stat.Bfree * uint64(stat.Bsize)) / uint64(1000000000)
|
||||
if m.allowedMetrics["disk_free"] {
|
||||
y, err := lp.NewMessage("disk_free", tags, m.meta, map[string]interface{}{"value": free}, time.Now())
|
||||
y, err := lp.NewMessage(
|
||||
"disk_free",
|
||||
tags,
|
||||
m.meta,
|
||||
map[string]any{
|
||||
"value": free},
|
||||
time.Now())
|
||||
if err == nil {
|
||||
y.AddMeta("unit", "GBytes")
|
||||
output <- y
|
||||
@@ -134,7 +158,14 @@ mountLoop:
|
||||
}
|
||||
}
|
||||
if m.allowedMetrics["part_max_used"] {
|
||||
y, err := lp.NewMessage("part_max_used", map[string]string{"type": "node"}, m.meta, map[string]interface{}{"value": int(part_max_used)}, time.Now())
|
||||
y, err := lp.NewMessage(
|
||||
"part_max_used",
|
||||
map[string]string{
|
||||
"type": "node"},
|
||||
m.meta,
|
||||
map[string]any{
|
||||
"value": int(part_max_used)},
|
||||
time.Now())
|
||||
if err == nil {
|
||||
y.AddMeta("unit", "percent")
|
||||
output <- y
|
||||
|
||||
@@ -14,9 +14,9 @@ import (
|
||||
"errors"
|
||||
"fmt"
|
||||
"io"
|
||||
"log"
|
||||
"os/exec"
|
||||
"os/user"
|
||||
"slices"
|
||||
"strconv"
|
||||
"strings"
|
||||
"syscall"
|
||||
@@ -43,11 +43,11 @@ type GpfsCollectorConfig struct {
|
||||
}
|
||||
|
||||
type GpfsMetricDefinition struct {
|
||||
name string
|
||||
desc string
|
||||
prefix string
|
||||
unit string
|
||||
calc string
|
||||
name string
|
||||
desc string
|
||||
prefix string
|
||||
unit string
|
||||
calc string
|
||||
}
|
||||
|
||||
type GpfsCollector struct {
|
||||
@@ -56,251 +56,251 @@ type GpfsCollector struct {
|
||||
config GpfsCollectorConfig
|
||||
sudoCmd string
|
||||
skipFS map[string]struct{}
|
||||
lastTimestamp map[string]time.Time // Store timestamp of lastState per filesystem to derive bandwidths
|
||||
definitions []GpfsMetricDefinition // all metrics to report
|
||||
lastTimestamp map[string]time.Time // Store timestamp of lastState per filesystem to derive bandwidths
|
||||
definitions []GpfsMetricDefinition // all metrics to report
|
||||
lastState map[string]GpfsCollectorState // one GpfsCollectorState per filesystem
|
||||
}
|
||||
|
||||
var GpfsAbsMetrics = []GpfsMetricDefinition{
|
||||
{
|
||||
name: "gpfs_num_opens",
|
||||
desc: "number of opens",
|
||||
prefix: "_oc_",
|
||||
unit: "requests",
|
||||
calc: "none",
|
||||
name: "gpfs_num_opens",
|
||||
desc: "number of opens",
|
||||
prefix: "_oc_",
|
||||
unit: "requests",
|
||||
calc: "none",
|
||||
},
|
||||
{
|
||||
name: "gpfs_num_closes",
|
||||
desc: "number of closes",
|
||||
prefix: "_cc_",
|
||||
unit: "requests",
|
||||
calc: "none",
|
||||
name: "gpfs_num_closes",
|
||||
desc: "number of closes",
|
||||
prefix: "_cc_",
|
||||
unit: "requests",
|
||||
calc: "none",
|
||||
},
|
||||
{
|
||||
name: "gpfs_num_reads",
|
||||
desc: "number of reads",
|
||||
prefix: "_rdc_",
|
||||
unit: "requests",
|
||||
calc: "none",
|
||||
name: "gpfs_num_reads",
|
||||
desc: "number of reads",
|
||||
prefix: "_rdc_",
|
||||
unit: "requests",
|
||||
calc: "none",
|
||||
},
|
||||
{
|
||||
name: "gpfs_num_writes",
|
||||
desc: "number of writes",
|
||||
prefix: "_wc_",
|
||||
unit: "requests",
|
||||
calc: "none",
|
||||
name: "gpfs_num_writes",
|
||||
desc: "number of writes",
|
||||
prefix: "_wc_",
|
||||
unit: "requests",
|
||||
calc: "none",
|
||||
},
|
||||
{
|
||||
name: "gpfs_num_readdirs",
|
||||
desc: "number of readdirs",
|
||||
prefix: "_dir_",
|
||||
unit: "requests",
|
||||
calc: "none",
|
||||
name: "gpfs_num_readdirs",
|
||||
desc: "number of readdirs",
|
||||
prefix: "_dir_",
|
||||
unit: "requests",
|
||||
calc: "none",
|
||||
},
|
||||
{
|
||||
name: "gpfs_num_inode_updates",
|
||||
desc: "number of Inode Updates",
|
||||
prefix: "_iu_",
|
||||
unit: "requests",
|
||||
calc: "none",
|
||||
name: "gpfs_num_inode_updates",
|
||||
desc: "number of Inode Updates",
|
||||
prefix: "_iu_",
|
||||
unit: "requests",
|
||||
calc: "none",
|
||||
},
|
||||
{
|
||||
name: "gpfs_bytes_read",
|
||||
desc: "bytes read",
|
||||
prefix: "_br_",
|
||||
unit: "bytes",
|
||||
calc: "none",
|
||||
name: "gpfs_bytes_read",
|
||||
desc: "bytes read",
|
||||
prefix: "_br_",
|
||||
unit: "bytes",
|
||||
calc: "none",
|
||||
},
|
||||
{
|
||||
name: "gpfs_bytes_written",
|
||||
desc: "bytes written",
|
||||
prefix: "_bw_",
|
||||
unit: "bytes",
|
||||
calc: "none",
|
||||
name: "gpfs_bytes_written",
|
||||
desc: "bytes written",
|
||||
prefix: "_bw_",
|
||||
unit: "bytes",
|
||||
calc: "none",
|
||||
},
|
||||
}
|
||||
|
||||
var GpfsDiffMetrics = []GpfsMetricDefinition{
|
||||
{
|
||||
name: "gpfs_num_opens_diff",
|
||||
desc: "number of opens (diff)",
|
||||
prefix: "_oc_",
|
||||
unit: "requests",
|
||||
calc: "difference",
|
||||
name: "gpfs_num_opens_diff",
|
||||
desc: "number of opens (diff)",
|
||||
prefix: "_oc_",
|
||||
unit: "requests",
|
||||
calc: "difference",
|
||||
},
|
||||
{
|
||||
name: "gpfs_num_closes_diff",
|
||||
desc: "number of closes (diff)",
|
||||
prefix: "_cc_",
|
||||
unit: "requests",
|
||||
calc: "difference",
|
||||
name: "gpfs_num_closes_diff",
|
||||
desc: "number of closes (diff)",
|
||||
prefix: "_cc_",
|
||||
unit: "requests",
|
||||
calc: "difference",
|
||||
},
|
||||
{
|
||||
name: "gpfs_num_reads_diff",
|
||||
desc: "number of reads (diff)",
|
||||
prefix: "_rdc_",
|
||||
unit: "requests",
|
||||
calc: "difference",
|
||||
name: "gpfs_num_reads_diff",
|
||||
desc: "number of reads (diff)",
|
||||
prefix: "_rdc_",
|
||||
unit: "requests",
|
||||
calc: "difference",
|
||||
},
|
||||
{
|
||||
name: "gpfs_num_writes_diff",
|
||||
desc: "number of writes (diff)",
|
||||
prefix: "_wc_",
|
||||
unit: "requests",
|
||||
calc: "difference",
|
||||
name: "gpfs_num_writes_diff",
|
||||
desc: "number of writes (diff)",
|
||||
prefix: "_wc_",
|
||||
unit: "requests",
|
||||
calc: "difference",
|
||||
},
|
||||
{
|
||||
name: "gpfs_num_readdirs_diff",
|
||||
desc: "number of readdirs (diff)",
|
||||
prefix: "_dir_",
|
||||
unit: "requests",
|
||||
calc: "difference",
|
||||
name: "gpfs_num_readdirs_diff",
|
||||
desc: "number of readdirs (diff)",
|
||||
prefix: "_dir_",
|
||||
unit: "requests",
|
||||
calc: "difference",
|
||||
},
|
||||
{
|
||||
name: "gpfs_num_inode_updates_diff",
|
||||
desc: "number of Inode Updates (diff)",
|
||||
prefix: "_iu_",
|
||||
unit: "requests",
|
||||
calc: "difference",
|
||||
name: "gpfs_num_inode_updates_diff",
|
||||
desc: "number of Inode Updates (diff)",
|
||||
prefix: "_iu_",
|
||||
unit: "requests",
|
||||
calc: "difference",
|
||||
},
|
||||
{
|
||||
name: "gpfs_bytes_read_diff",
|
||||
desc: "bytes read (diff)",
|
||||
prefix: "_br_",
|
||||
unit: "bytes",
|
||||
calc: "difference",
|
||||
name: "gpfs_bytes_read_diff",
|
||||
desc: "bytes read (diff)",
|
||||
prefix: "_br_",
|
||||
unit: "bytes",
|
||||
calc: "difference",
|
||||
},
|
||||
{
|
||||
name: "gpfs_bytes_written_diff",
|
||||
desc: "bytes written (diff)",
|
||||
prefix: "_bw_",
|
||||
unit: "bytes",
|
||||
calc: "difference",
|
||||
name: "gpfs_bytes_written_diff",
|
||||
desc: "bytes written (diff)",
|
||||
prefix: "_bw_",
|
||||
unit: "bytes",
|
||||
calc: "difference",
|
||||
},
|
||||
}
|
||||
|
||||
var GpfsDeriveMetrics = []GpfsMetricDefinition{
|
||||
{
|
||||
name: "gpfs_opens_rate",
|
||||
desc: "number of opens (rate)",
|
||||
prefix: "_oc_",
|
||||
unit: "requests/sec",
|
||||
calc: "derivative",
|
||||
name: "gpfs_opens_rate",
|
||||
desc: "number of opens (rate)",
|
||||
prefix: "_oc_",
|
||||
unit: "requests/sec",
|
||||
calc: "derivative",
|
||||
},
|
||||
{
|
||||
name: "gpfs_closes_rate",
|
||||
desc: "number of closes (rate)",
|
||||
prefix: "_oc_",
|
||||
unit: "requests/sec",
|
||||
calc: "derivative",
|
||||
name: "gpfs_closes_rate",
|
||||
desc: "number of closes (rate)",
|
||||
prefix: "_oc_",
|
||||
unit: "requests/sec",
|
||||
calc: "derivative",
|
||||
},
|
||||
{
|
||||
name: "gpfs_reads_rate",
|
||||
desc: "number of reads (rate)",
|
||||
prefix: "_rdc_",
|
||||
unit: "requests/sec",
|
||||
calc: "derivative",
|
||||
name: "gpfs_reads_rate",
|
||||
desc: "number of reads (rate)",
|
||||
prefix: "_rdc_",
|
||||
unit: "requests/sec",
|
||||
calc: "derivative",
|
||||
},
|
||||
{
|
||||
name: "gpfs_writes_rate",
|
||||
desc: "number of writes (rate)",
|
||||
prefix: "_wc_",
|
||||
unit: "requests/sec",
|
||||
calc: "derivative",
|
||||
name: "gpfs_writes_rate",
|
||||
desc: "number of writes (rate)",
|
||||
prefix: "_wc_",
|
||||
unit: "requests/sec",
|
||||
calc: "derivative",
|
||||
},
|
||||
{
|
||||
name: "gpfs_readdirs_rate",
|
||||
desc: "number of readdirs (rate)",
|
||||
prefix: "_dir_",
|
||||
unit: "requests/sec",
|
||||
calc: "derivative",
|
||||
name: "gpfs_readdirs_rate",
|
||||
desc: "number of readdirs (rate)",
|
||||
prefix: "_dir_",
|
||||
unit: "requests/sec",
|
||||
calc: "derivative",
|
||||
},
|
||||
{
|
||||
name: "gpfs_inode_updates_rate",
|
||||
desc: "number of Inode Updates (rate)",
|
||||
prefix: "_iu_",
|
||||
unit: "requests/sec",
|
||||
calc: "derivative",
|
||||
name: "gpfs_inode_updates_rate",
|
||||
desc: "number of Inode Updates (rate)",
|
||||
prefix: "_iu_",
|
||||
unit: "requests/sec",
|
||||
calc: "derivative",
|
||||
},
|
||||
{
|
||||
name: "gpfs_bw_read",
|
||||
desc: "bytes read (rate)",
|
||||
prefix: "_br_",
|
||||
unit: "bytes/sec",
|
||||
calc: "derivative",
|
||||
name: "gpfs_bw_read",
|
||||
desc: "bytes read (rate)",
|
||||
prefix: "_br_",
|
||||
unit: "bytes/sec",
|
||||
calc: "derivative",
|
||||
},
|
||||
{
|
||||
name: "gpfs_bw_write",
|
||||
desc: "bytes written (rate)",
|
||||
prefix: "_bw_",
|
||||
unit: "bytes/sec",
|
||||
calc: "derivative",
|
||||
name: "gpfs_bw_write",
|
||||
desc: "bytes written (rate)",
|
||||
prefix: "_bw_",
|
||||
unit: "bytes/sec",
|
||||
calc: "derivative",
|
||||
},
|
||||
}
|
||||
|
||||
var GpfsTotalMetrics = []GpfsMetricDefinition{
|
||||
{
|
||||
name: "gpfs_bytes_total",
|
||||
desc: "bytes total",
|
||||
prefix: "bytesTotal",
|
||||
unit: "bytes",
|
||||
calc: "none",
|
||||
name: "gpfs_bytes_total",
|
||||
desc: "bytes total",
|
||||
prefix: "bytesTotal",
|
||||
unit: "bytes",
|
||||
calc: "none",
|
||||
},
|
||||
{
|
||||
name: "gpfs_bytes_total_diff",
|
||||
desc: "bytes total (diff)",
|
||||
prefix: "bytesTotal",
|
||||
unit: "bytes",
|
||||
calc: "difference",
|
||||
name: "gpfs_bytes_total_diff",
|
||||
desc: "bytes total (diff)",
|
||||
prefix: "bytesTotal",
|
||||
unit: "bytes",
|
||||
calc: "difference",
|
||||
},
|
||||
{
|
||||
name: "gpfs_bw_total",
|
||||
desc: "bytes total (rate)",
|
||||
prefix: "bytesTotal",
|
||||
unit: "bytes/sec",
|
||||
calc: "derivative",
|
||||
name: "gpfs_bw_total",
|
||||
desc: "bytes total (rate)",
|
||||
prefix: "bytesTotal",
|
||||
unit: "bytes/sec",
|
||||
calc: "derivative",
|
||||
},
|
||||
{
|
||||
name: "gpfs_iops",
|
||||
desc: "iops",
|
||||
prefix: "iops",
|
||||
unit: "requests",
|
||||
calc: "none",
|
||||
name: "gpfs_iops",
|
||||
desc: "iops",
|
||||
prefix: "iops",
|
||||
unit: "requests",
|
||||
calc: "none",
|
||||
},
|
||||
{
|
||||
name: "gpfs_iops_diff",
|
||||
desc: "iops (diff)",
|
||||
prefix: "iops",
|
||||
unit: "requests",
|
||||
calc: "difference",
|
||||
name: "gpfs_iops_diff",
|
||||
desc: "iops (diff)",
|
||||
prefix: "iops",
|
||||
unit: "requests",
|
||||
calc: "difference",
|
||||
},
|
||||
{
|
||||
name: "gpfs_iops_rate",
|
||||
desc: "iops (rate)",
|
||||
prefix: "iops",
|
||||
unit: "requests/sec",
|
||||
calc: "derivative",
|
||||
name: "gpfs_iops_rate",
|
||||
desc: "iops (rate)",
|
||||
prefix: "iops",
|
||||
unit: "requests/sec",
|
||||
calc: "derivative",
|
||||
},
|
||||
{
|
||||
name: "gpfs_metaops",
|
||||
desc: "metaops",
|
||||
prefix: "metaops",
|
||||
unit: "requests",
|
||||
calc: "none",
|
||||
name: "gpfs_metaops",
|
||||
desc: "metaops",
|
||||
prefix: "metaops",
|
||||
unit: "requests",
|
||||
calc: "none",
|
||||
},
|
||||
{
|
||||
name: "gpfs_metaops_diff",
|
||||
desc: "metaops (diff)",
|
||||
prefix: "metaops",
|
||||
unit: "requests",
|
||||
calc: "difference",
|
||||
name: "gpfs_metaops_diff",
|
||||
desc: "metaops (diff)",
|
||||
prefix: "metaops",
|
||||
unit: "requests",
|
||||
calc: "difference",
|
||||
},
|
||||
{
|
||||
name: "gpfs_metaops_rate",
|
||||
desc: "metaops (rate)",
|
||||
prefix: "metaops",
|
||||
unit: "requests/sec",
|
||||
calc: "derivative",
|
||||
name: "gpfs_metaops_rate",
|
||||
desc: "metaops (rate)",
|
||||
prefix: "metaops",
|
||||
unit: "requests/sec",
|
||||
calc: "derivative",
|
||||
},
|
||||
}
|
||||
|
||||
@@ -310,9 +310,10 @@ func (m *GpfsCollector) Init(config json.RawMessage) error {
|
||||
return nil
|
||||
}
|
||||
|
||||
var err error
|
||||
m.name = "GpfsCollector"
|
||||
m.setup()
|
||||
if err := m.setup(); err != nil {
|
||||
return fmt.Errorf("%s Init(): setup() call failed: %w", m.name, err)
|
||||
}
|
||||
m.parallel = true
|
||||
|
||||
// Set default mmpmon binary
|
||||
@@ -320,10 +321,9 @@ func (m *GpfsCollector) Init(config json.RawMessage) error {
|
||||
|
||||
// Read JSON configuration
|
||||
if len(config) > 0 {
|
||||
err = json.Unmarshal(config, &m.config)
|
||||
err := json.Unmarshal(config, &m.config)
|
||||
if err != nil {
|
||||
log.Print(err.Error())
|
||||
return err
|
||||
return fmt.Errorf("%s Init(): failed to unmarshal JSON config: %w", m.name, err)
|
||||
}
|
||||
}
|
||||
m.meta = map[string]string{
|
||||
@@ -364,9 +364,9 @@ func (m *GpfsCollector) Init(config json.RawMessage) error {
|
||||
// when using sudo, the full path of mmpmon must be specified because
|
||||
// exec.LookPath will not work as mmpmon is not executable as user
|
||||
if m.config.Sudo && !strings.HasPrefix(m.config.Mmpmon, "/") {
|
||||
return fmt.Errorf("when using sudo, mmpmon_path must be provided and an absolute path: %s", m.config.Mmpmon)
|
||||
return fmt.Errorf("%s Init(): when using sudo, mmpmon_path must be provided and an absolute path: %s", m.name, m.config.Mmpmon)
|
||||
}
|
||||
|
||||
|
||||
// Check if mmpmon is in executable search path
|
||||
p, err := exec.LookPath(m.config.Mmpmon)
|
||||
if err != nil {
|
||||
@@ -377,7 +377,7 @@ func (m *GpfsCollector) Init(config json.RawMessage) error {
|
||||
p = m.config.Mmpmon
|
||||
} else {
|
||||
cclog.ComponentError(m.name, fmt.Sprintf("failed to find mmpmon binary '%s': %v", m.config.Mmpmon, err))
|
||||
return fmt.Errorf("failed to find mmpmon binary '%s': %v", m.config.Mmpmon, err)
|
||||
return fmt.Errorf("%s Init(): failed to find mmpmon binary '%s': %w", m.name, m.config.Mmpmon, err)
|
||||
}
|
||||
}
|
||||
m.config.Mmpmon = p
|
||||
@@ -385,28 +385,28 @@ func (m *GpfsCollector) Init(config json.RawMessage) error {
|
||||
m.definitions = []GpfsMetricDefinition{}
|
||||
if m.config.SendAbsoluteValues {
|
||||
for _, def := range GpfsAbsMetrics {
|
||||
if _, skip := stringArrayContains(m.config.ExcludeMetrics, def.name); !skip {
|
||||
if !slices.Contains(m.config.ExcludeMetrics, def.name) {
|
||||
m.definitions = append(m.definitions, def)
|
||||
}
|
||||
}
|
||||
}
|
||||
if m.config.SendDiffValues {
|
||||
for _, def := range GpfsDiffMetrics {
|
||||
if _, skip := stringArrayContains(m.config.ExcludeMetrics, def.name); !skip {
|
||||
if !slices.Contains(m.config.ExcludeMetrics, def.name) {
|
||||
m.definitions = append(m.definitions, def)
|
||||
}
|
||||
}
|
||||
}
|
||||
if m.config.SendDerivedValues {
|
||||
for _, def := range GpfsDeriveMetrics {
|
||||
if _, skip := stringArrayContains(m.config.ExcludeMetrics, def.name); !skip {
|
||||
if !slices.Contains(m.config.ExcludeMetrics, def.name) {
|
||||
m.definitions = append(m.definitions, def)
|
||||
}
|
||||
}
|
||||
} else if m.config.SendBandwidths {
|
||||
for _, def := range GpfsDeriveMetrics {
|
||||
if def.unit == "bytes/sec" {
|
||||
if _, skip := stringArrayContains(m.config.ExcludeMetrics, def.name); !skip {
|
||||
if !slices.Contains(m.config.ExcludeMetrics, def.name) {
|
||||
m.definitions = append(m.definitions, def)
|
||||
}
|
||||
}
|
||||
@@ -414,19 +414,19 @@ func (m *GpfsCollector) Init(config json.RawMessage) error {
|
||||
}
|
||||
if m.config.SendTotalValues {
|
||||
for _, def := range GpfsTotalMetrics {
|
||||
if _, skip := stringArrayContains(m.config.ExcludeMetrics, def.name); !skip {
|
||||
if !slices.Contains(m.config.ExcludeMetrics, def.name) {
|
||||
// only send total metrics of the types requested
|
||||
if ( def.calc == "none" && m.config.SendAbsoluteValues ) ||
|
||||
( def.calc == "difference" && m.config.SendDiffValues ) ||
|
||||
( def.calc == "derivative" && m.config.SendDerivedValues ) {
|
||||
if (def.calc == "none" && m.config.SendAbsoluteValues) ||
|
||||
(def.calc == "difference" && m.config.SendDiffValues) ||
|
||||
(def.calc == "derivative" && m.config.SendDerivedValues) {
|
||||
m.definitions = append(m.definitions, def)
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
} else if m.config.SendBandwidths {
|
||||
for _, def := range GpfsTotalMetrics {
|
||||
if def.unit == "bytes/sec" {
|
||||
if _, skip := stringArrayContains(m.config.ExcludeMetrics, def.name); !skip {
|
||||
if !slices.Contains(m.config.ExcludeMetrics, def.name) {
|
||||
m.definitions = append(m.definitions, def)
|
||||
}
|
||||
}
|
||||
@@ -456,7 +456,7 @@ func (m *GpfsCollector) Read(interval time.Duration, output chan lp.CCMessage) {
|
||||
} else {
|
||||
cmd = exec.Command(m.config.Mmpmon, "-p", "-s")
|
||||
}
|
||||
|
||||
|
||||
cmd.Stdin = strings.NewReader("once fs_io_s\n")
|
||||
cmdStdout := new(bytes.Buffer)
|
||||
cmdStderr := new(bytes.Buffer)
|
||||
@@ -562,36 +562,36 @@ func (m *GpfsCollector) Read(interval time.Duration, output chan lp.CCMessage) {
|
||||
// compute total metrics (map[...] will return 0 if key not found)
|
||||
// bytes read and written
|
||||
if br, br_ok := newstate["_br_"]; br_ok {
|
||||
newstate["bytesTotal"] = newstate["bytesTotal"] + br
|
||||
newstate["bytesTotal"] += br
|
||||
}
|
||||
if bw, bw_ok := newstate["_bw_"]; bw_ok {
|
||||
newstate["bytesTotal"] = newstate["bytesTotal"] + bw
|
||||
newstate["bytesTotal"] += bw
|
||||
}
|
||||
// read and write count
|
||||
if rdc, rdc_ok := newstate["_rdc_"]; rdc_ok {
|
||||
newstate["iops"] = newstate["iops"] + rdc
|
||||
newstate["iops"] += rdc
|
||||
}
|
||||
if wc, wc_ok := newstate["_wc_"]; wc_ok {
|
||||
newstate["iops"] = newstate["iops"] + wc
|
||||
newstate["iops"] += wc
|
||||
}
|
||||
// meta operations
|
||||
if oc, oc_ok := newstate["_oc_"]; oc_ok {
|
||||
newstate["metaops"] = newstate["metaops"] + oc
|
||||
newstate["metaops"] += oc
|
||||
}
|
||||
if cc, cc_ok := newstate["_cc_"]; cc_ok {
|
||||
newstate["metaops"] = newstate["metaops"] + cc
|
||||
newstate["metaops"] += cc
|
||||
}
|
||||
if dir, dir_ok := newstate["_dir_"]; dir_ok {
|
||||
newstate["metaops"] = newstate["metaops"] + dir
|
||||
newstate["metaops"] += dir
|
||||
}
|
||||
if iu, iu_ok := newstate["_iu_"]; iu_ok {
|
||||
newstate["metaops"] = newstate["metaops"] + iu
|
||||
newstate["metaops"] += iu
|
||||
}
|
||||
// send desired metrics for this filesystem
|
||||
for _, metric := range m.definitions {
|
||||
vold, vold_ok := m.lastState[filesystem][metric.prefix]
|
||||
vnew, vnew_ok := newstate[metric.prefix]
|
||||
var value interface{}
|
||||
var value any
|
||||
value_ok := false
|
||||
switch metric.calc {
|
||||
case "none":
|
||||
@@ -617,14 +617,14 @@ func (m *GpfsCollector) Read(interval time.Duration, output chan lp.CCMessage) {
|
||||
}
|
||||
case "derivative":
|
||||
if vnew_ok && vold_ok && timeDiff > 0 {
|
||||
value = float64(vnew - vold) / timeDiff
|
||||
if value.(float64) < 0 {
|
||||
value = 0
|
||||
value = float64(vnew-vold) / timeDiff
|
||||
if value.(float64) < 0.0 {
|
||||
value = 0.0
|
||||
}
|
||||
value_ok = true
|
||||
} else if vold_ok {
|
||||
// if the difference is not computable, return 0
|
||||
value = 0
|
||||
value = 0.0
|
||||
value_ok = true
|
||||
}
|
||||
}
|
||||
|
||||
@@ -10,6 +10,7 @@ package collectors
|
||||
import (
|
||||
"fmt"
|
||||
"os"
|
||||
"slices"
|
||||
|
||||
cclog "github.com/ClusterCockpit/cc-lib/v2/ccLogger"
|
||||
lp "github.com/ClusterCockpit/cc-lib/v2/ccMessage"
|
||||
@@ -65,7 +66,9 @@ func (m *InfinibandCollector) Init(config json.RawMessage) error {
|
||||
|
||||
var err error
|
||||
m.name = "InfinibandCollector"
|
||||
m.setup()
|
||||
if err := m.setup(); err != nil {
|
||||
return fmt.Errorf("%s Init(): setup() call failed: %w", m.name, err)
|
||||
}
|
||||
m.parallel = true
|
||||
m.meta = map[string]string{
|
||||
"source": m.name,
|
||||
@@ -87,10 +90,10 @@ func (m *InfinibandCollector) Init(config json.RawMessage) error {
|
||||
globPattern := filepath.Join(IB_BASEPATH, "*", "ports", "*")
|
||||
ibDirs, err := filepath.Glob(globPattern)
|
||||
if err != nil {
|
||||
return fmt.Errorf("unable to glob files with pattern %s: %v", globPattern, err)
|
||||
return fmt.Errorf("%s Init(): unable to glob files with pattern %s: %w", m.name, globPattern, err)
|
||||
}
|
||||
if ibDirs == nil {
|
||||
return fmt.Errorf("unable to find any directories with pattern %s", globPattern)
|
||||
return fmt.Errorf("%s Init(): unable to find any directories with pattern %s", m.name, globPattern)
|
||||
}
|
||||
|
||||
for _, path := range ibDirs {
|
||||
@@ -111,14 +114,7 @@ func (m *InfinibandCollector) Init(config json.RawMessage) error {
|
||||
port := pathSplit[6]
|
||||
|
||||
// Skip excluded devices
|
||||
skip := false
|
||||
for _, excludedDevice := range m.config.ExcludeDevices {
|
||||
if excludedDevice == device {
|
||||
skip = true
|
||||
break
|
||||
}
|
||||
}
|
||||
if skip {
|
||||
if slices.Contains(m.config.ExcludeDevices, device) {
|
||||
continue
|
||||
}
|
||||
|
||||
@@ -161,7 +157,7 @@ func (m *InfinibandCollector) Init(config json.RawMessage) error {
|
||||
for _, counter := range portCounterFiles {
|
||||
err := unix.Access(counter.path, unix.R_OK)
|
||||
if err != nil {
|
||||
return fmt.Errorf("unable to access %s: %v", counter.path, err)
|
||||
return fmt.Errorf("%s Init(): unable to access %s: %w", m.name, counter.path, err)
|
||||
}
|
||||
}
|
||||
|
||||
@@ -181,7 +177,7 @@ func (m *InfinibandCollector) Init(config json.RawMessage) error {
|
||||
}
|
||||
|
||||
if len(m.info) == 0 {
|
||||
return fmt.Errorf("found no IB devices")
|
||||
return fmt.Errorf("%s Init(): found no IB devices", m.name)
|
||||
}
|
||||
|
||||
m.init = true
|
||||
@@ -241,7 +237,7 @@ func (m *InfinibandCollector) Read(interval time.Duration, output chan lp.CCMess
|
||||
counterDef.name,
|
||||
info.tagSet,
|
||||
m.meta,
|
||||
map[string]interface{}{
|
||||
map[string]any{
|
||||
"value": counterDef.currentState,
|
||||
},
|
||||
now); err == nil {
|
||||
@@ -259,7 +255,7 @@ func (m *InfinibandCollector) Read(interval time.Duration, output chan lp.CCMess
|
||||
counterDef.name+"_bw",
|
||||
info.tagSet,
|
||||
m.meta,
|
||||
map[string]interface{}{
|
||||
map[string]any{
|
||||
"value": rate,
|
||||
},
|
||||
now); err == nil {
|
||||
@@ -289,7 +285,7 @@ func (m *InfinibandCollector) Read(interval time.Duration, output chan lp.CCMess
|
||||
"ib_total",
|
||||
info.tagSet,
|
||||
m.meta,
|
||||
map[string]interface{}{
|
||||
map[string]any{
|
||||
"value": ib_total,
|
||||
},
|
||||
now); err == nil {
|
||||
@@ -302,7 +298,7 @@ func (m *InfinibandCollector) Read(interval time.Duration, output chan lp.CCMess
|
||||
"ib_total_pkts",
|
||||
info.tagSet,
|
||||
m.meta,
|
||||
map[string]interface{}{
|
||||
map[string]any{
|
||||
"value": ib_total_pkts,
|
||||
},
|
||||
now); err == nil {
|
||||
|
||||
@@ -11,7 +11,9 @@ import (
|
||||
"bufio"
|
||||
"encoding/json"
|
||||
"errors"
|
||||
"fmt"
|
||||
"os"
|
||||
"slices"
|
||||
"strconv"
|
||||
"strings"
|
||||
"time"
|
||||
@@ -45,7 +47,9 @@ func (m *IOstatCollector) Init(config json.RawMessage) error {
|
||||
m.name = "IOstatCollector"
|
||||
m.parallel = true
|
||||
m.meta = map[string]string{"source": m.name, "group": "Disk"}
|
||||
m.setup()
|
||||
if err := m.setup(); err != nil {
|
||||
return fmt.Errorf("%s Init(): setup() call failed: %w", m.name, err)
|
||||
}
|
||||
if len(config) > 0 {
|
||||
err = json.Unmarshal(config, &m.config)
|
||||
if err != nil {
|
||||
@@ -75,7 +79,7 @@ func (m *IOstatCollector) Init(config json.RawMessage) error {
|
||||
m.devices = make(map[string]IOstatCollectorEntry)
|
||||
m.matches = make(map[string]int)
|
||||
for k, v := range matches {
|
||||
if _, skip := stringArrayContains(m.config.ExcludeMetrics, k); !skip {
|
||||
if !slices.Contains(m.config.ExcludeMetrics, k) {
|
||||
m.matches[k] = v
|
||||
}
|
||||
}
|
||||
@@ -84,10 +88,8 @@ func (m *IOstatCollector) Init(config json.RawMessage) error {
|
||||
}
|
||||
file, err := os.Open(IOSTATFILE)
|
||||
if err != nil {
|
||||
cclog.ComponentError(m.name, err.Error())
|
||||
return err
|
||||
return fmt.Errorf("%s Init(): Failed to open file \"%s\": %w", m.name, IOSTATFILE, err)
|
||||
}
|
||||
defer file.Close()
|
||||
|
||||
scanner := bufio.NewScanner(file)
|
||||
for scanner.Scan() {
|
||||
@@ -101,7 +103,7 @@ func (m *IOstatCollector) Init(config json.RawMessage) error {
|
||||
if strings.Contains(device, "loop") {
|
||||
continue
|
||||
}
|
||||
if _, skip := stringArrayContains(m.config.ExcludeDevices, device); skip {
|
||||
if slices.Contains(m.config.ExcludeDevices, device) {
|
||||
continue
|
||||
}
|
||||
currentValues := make(map[string]int64)
|
||||
@@ -127,6 +129,10 @@ func (m *IOstatCollector) Init(config json.RawMessage) error {
|
||||
lastValues: lastValues,
|
||||
}
|
||||
}
|
||||
if err := file.Close(); err != nil {
|
||||
return fmt.Errorf("%s Init(): Failed to close file \"%s\": %w", m.name, IOSTATFILE, err)
|
||||
}
|
||||
|
||||
m.init = true
|
||||
return err
|
||||
}
|
||||
@@ -138,10 +144,18 @@ func (m *IOstatCollector) Read(interval time.Duration, output chan lp.CCMessage)
|
||||
|
||||
file, err := os.Open(IOSTATFILE)
|
||||
if err != nil {
|
||||
cclog.ComponentError(m.name, err.Error())
|
||||
cclog.ComponentError(
|
||||
m.name,
|
||||
fmt.Sprintf("Read(): Failed to open file '%s': %v", IOSTATFILE, err))
|
||||
return
|
||||
}
|
||||
defer file.Close()
|
||||
defer func() {
|
||||
if err := file.Close(); err != nil {
|
||||
cclog.ComponentError(
|
||||
m.name,
|
||||
fmt.Sprintf("Read(): Failed to close file '%s': %v", IOSTATFILE, err))
|
||||
}
|
||||
}()
|
||||
|
||||
scanner := bufio.NewScanner(file)
|
||||
for scanner.Scan() {
|
||||
@@ -157,7 +171,7 @@ func (m *IOstatCollector) Read(interval time.Duration, output chan lp.CCMessage)
|
||||
if strings.Contains(device, "loop") {
|
||||
continue
|
||||
}
|
||||
if _, skip := stringArrayContains(m.config.ExcludeDevices, device); skip {
|
||||
if slices.Contains(m.config.ExcludeDevices, device) {
|
||||
continue
|
||||
}
|
||||
if _, ok := m.devices[device]; !ok {
|
||||
|
||||
@@ -14,7 +14,6 @@ import (
|
||||
"errors"
|
||||
"fmt"
|
||||
"io"
|
||||
"log"
|
||||
"os/exec"
|
||||
"strconv"
|
||||
"strings"
|
||||
@@ -44,7 +43,9 @@ func (m *IpmiCollector) Init(config json.RawMessage) error {
|
||||
}
|
||||
|
||||
m.name = "IpmiCollector"
|
||||
m.setup()
|
||||
if err := m.setup(); err != nil {
|
||||
return fmt.Errorf("%s Init(): setup() call failed: %w", m.name, err)
|
||||
}
|
||||
m.parallel = true
|
||||
m.meta = map[string]string{
|
||||
"source": m.name,
|
||||
@@ -116,19 +117,20 @@ func (m *IpmiCollector) readIpmiTool(cmd string, output chan lp.CCMessage) {
|
||||
}
|
||||
v, err := strconv.ParseFloat(strings.TrimSpace(lv[1]), 64)
|
||||
if err == nil {
|
||||
name := strings.ToLower(strings.Replace(strings.TrimSpace(lv[0]), " ", "_", -1))
|
||||
name := strings.ToLower(strings.ReplaceAll(strings.TrimSpace(lv[0]), " ", "_"))
|
||||
unit := strings.TrimSpace(lv[2])
|
||||
if unit == "Volts" {
|
||||
switch unit {
|
||||
case "Volts":
|
||||
unit = "Volts"
|
||||
} else if unit == "degrees C" {
|
||||
case "degrees C":
|
||||
unit = "degC"
|
||||
} else if unit == "degrees F" {
|
||||
case "degrees F":
|
||||
unit = "degF"
|
||||
} else if unit == "Watts" {
|
||||
case "Watts":
|
||||
unit = "Watts"
|
||||
}
|
||||
|
||||
y, err := lp.NewMessage(name, map[string]string{"type": "node"}, m.meta, map[string]interface{}{"value": v}, time.Now())
|
||||
y, err := lp.NewMessage(name, map[string]string{"type": "node"}, m.meta, map[string]any{"value": v}, time.Now())
|
||||
if err == nil {
|
||||
y.AddMeta("unit", unit)
|
||||
output <- y
|
||||
@@ -150,23 +152,30 @@ func (m *IpmiCollector) readIpmiTool(cmd string, output chan lp.CCMessage) {
|
||||
|
||||
func (m *IpmiCollector) readIpmiSensors(cmd string, output chan lp.CCMessage) {
|
||||
|
||||
// Setup ipmisensors command
|
||||
command := exec.Command(cmd, "--comma-separated-output", "--sdr-cache-recreate")
|
||||
command.Wait()
|
||||
stdout, err := command.Output()
|
||||
if err != nil {
|
||||
log.Print(err)
|
||||
stdout, _ := command.StdoutPipe()
|
||||
errBuf := new(bytes.Buffer)
|
||||
command.Stderr = errBuf
|
||||
|
||||
// start command
|
||||
if err := command.Start(); err != nil {
|
||||
cclog.ComponentError(
|
||||
m.name,
|
||||
fmt.Sprintf("readIpmiSensors(): Failed to start command \"%s\": %v", command.String(), err),
|
||||
)
|
||||
return
|
||||
}
|
||||
|
||||
ll := strings.Split(string(stdout), "\n")
|
||||
|
||||
for _, line := range ll {
|
||||
lv := strings.Split(line, ",")
|
||||
// Read command output
|
||||
scanner := bufio.NewScanner(stdout)
|
||||
for scanner.Scan() {
|
||||
lv := strings.Split(scanner.Text(), ",")
|
||||
if len(lv) > 3 {
|
||||
v, err := strconv.ParseFloat(lv[3], 64)
|
||||
if err == nil {
|
||||
name := strings.ToLower(strings.Replace(lv[1], " ", "_", -1))
|
||||
y, err := lp.NewMessage(name, map[string]string{"type": "node"}, m.meta, map[string]interface{}{"value": v}, time.Now())
|
||||
name := strings.ToLower(strings.ReplaceAll(lv[1], " ", "_"))
|
||||
y, err := lp.NewMessage(name, map[string]string{"type": "node"}, m.meta, map[string]any{"value": v}, time.Now())
|
||||
if err == nil {
|
||||
if len(lv) > 4 {
|
||||
y.AddMeta("unit", lv[4])
|
||||
@@ -176,6 +185,18 @@ func (m *IpmiCollector) readIpmiSensors(cmd string, output chan lp.CCMessage) {
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Wait for command end
|
||||
if err := command.Wait(); err != nil {
|
||||
errMsg, _ := io.ReadAll(errBuf)
|
||||
cclog.ComponentError(
|
||||
m.name,
|
||||
fmt.Sprintf("readIpmiSensors(): Failed to wait for the end of command \"%s\": %v\n", command.String(), err),
|
||||
)
|
||||
cclog.ComponentError(m.name, fmt.Sprintf("readIpmiSensors(): command stderr: \"%s\"\n", strings.TrimSpace(string(errMsg))))
|
||||
return
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
func (m *IpmiCollector) Read(interval time.Duration, output chan lp.CCMessage) {
|
||||
|
||||
@@ -19,11 +19,12 @@ import (
|
||||
"encoding/json"
|
||||
"errors"
|
||||
"fmt"
|
||||
"maps"
|
||||
"math"
|
||||
"os"
|
||||
"os/signal"
|
||||
"os/user"
|
||||
"sort"
|
||||
"slices"
|
||||
"strconv"
|
||||
"strings"
|
||||
"sync"
|
||||
@@ -124,22 +125,14 @@ func checkMetricType(t string) bool {
|
||||
return ok
|
||||
}
|
||||
|
||||
func eventsToEventStr(events map[string]string) string {
|
||||
elist := make([]string, 0)
|
||||
for k, v := range events {
|
||||
elist = append(elist, fmt.Sprintf("%s:%s", v, k))
|
||||
}
|
||||
return strings.Join(elist, ",")
|
||||
}
|
||||
|
||||
func genLikwidEventSet(input LikwidCollectorEventsetConfig) LikwidEventsetConfig {
|
||||
tmplist := make([]string, 0)
|
||||
clist := make([]string, 0)
|
||||
clist := make([]string, 0, len(input.Events))
|
||||
for k := range input.Events {
|
||||
clist = append(clist, k)
|
||||
}
|
||||
sort.Strings(clist)
|
||||
elist := make([]*C.char, 0)
|
||||
slices.Sort(clist)
|
||||
tmplist := make([]string, 0, len(clist))
|
||||
elist := make([]*C.char, 0, len(clist))
|
||||
for _, k := range clist {
|
||||
v := input.Events[k]
|
||||
tmplist = append(tmplist, fmt.Sprintf("%s:%s", v, k))
|
||||
@@ -187,7 +180,7 @@ func getBaseFreq() float64 {
|
||||
for _, f := range files {
|
||||
buffer, err := os.ReadFile(f)
|
||||
if err == nil {
|
||||
data := strings.Replace(string(buffer), "\n", "", -1)
|
||||
data := strings.ReplaceAll(string(buffer), "\n", "")
|
||||
x, err := strconv.ParseInt(data, 0, 64)
|
||||
if err == nil {
|
||||
freq = float64(x)
|
||||
@@ -216,7 +209,7 @@ func (m *LikwidCollector) Init(config json.RawMessage) error {
|
||||
if len(config) > 0 {
|
||||
err := json.Unmarshal(config, &m.config)
|
||||
if err != nil {
|
||||
return err
|
||||
return fmt.Errorf("%s Init(): failed to unmarshal JSON config: %w", m.name, err)
|
||||
}
|
||||
}
|
||||
lib := dl.New(m.config.LibraryPath, LIKWID_LIB_DL_FLAGS)
|
||||
@@ -225,14 +218,18 @@ func (m *LikwidCollector) Init(config json.RawMessage) error {
|
||||
}
|
||||
err := lib.Open()
|
||||
if err != nil {
|
||||
return fmt.Errorf("error opening %s: %v", m.config.LibraryPath, err)
|
||||
return fmt.Errorf("error opening %s: %w", m.config.LibraryPath, err)
|
||||
}
|
||||
|
||||
if m.config.ForceOverwrite {
|
||||
cclog.ComponentDebug(m.name, "Set LIKWID_FORCE=1")
|
||||
os.Setenv("LIKWID_FORCE", "1")
|
||||
if err := os.Setenv("LIKWID_FORCE", "1"); err != nil {
|
||||
return fmt.Errorf("error setting environment variable LIKWID_FORCE=1: %w", err)
|
||||
}
|
||||
}
|
||||
if err := m.setup(); err != nil {
|
||||
return fmt.Errorf("%s Init(): setup() call failed: %w", m.name, err)
|
||||
}
|
||||
m.setup()
|
||||
|
||||
m.meta = map[string]string{"group": "PerfCounter"}
|
||||
cclog.ComponentDebug(m.name, "Get cpulist and init maps and lists")
|
||||
@@ -316,7 +313,14 @@ func (m *LikwidCollector) Init(config json.RawMessage) error {
|
||||
case "accessdaemon":
|
||||
if len(m.config.DaemonPath) > 0 {
|
||||
p := os.Getenv("PATH")
|
||||
os.Setenv("PATH", m.config.DaemonPath+":"+p)
|
||||
if len(p) > 0 {
|
||||
p = m.config.DaemonPath + ":" + p
|
||||
} else {
|
||||
p = m.config.DaemonPath
|
||||
}
|
||||
if err := os.Setenv("PATH", p); err != nil {
|
||||
return fmt.Errorf("error setting environment variable PATH=%s: %w", p, err)
|
||||
}
|
||||
}
|
||||
C.HPMmode(1)
|
||||
retCode := C.HPMinit()
|
||||
@@ -369,16 +373,23 @@ func (m *LikwidCollector) Init(config json.RawMessage) error {
|
||||
// take a measurement for 'interval' seconds of event set index 'group'
|
||||
func (m *LikwidCollector) takeMeasurement(evidx int, evset LikwidEventsetConfig, interval time.Duration) (bool, error) {
|
||||
var ret C.int
|
||||
var gid C.int = -1
|
||||
sigchan := make(chan os.Signal, 1)
|
||||
|
||||
// Watch changes for the lock file ()
|
||||
watcher, err := fsnotify.NewWatcher()
|
||||
if err != nil {
|
||||
cclog.ComponentError(m.name, err.Error())
|
||||
cclog.ComponentError(
|
||||
m.name,
|
||||
fmt.Sprintf("takeMeasurement(): Failed to create a new fsnotify.Watcher: %v", err))
|
||||
return true, err
|
||||
}
|
||||
defer watcher.Close()
|
||||
defer func() {
|
||||
if err := watcher.Close(); err != nil {
|
||||
cclog.ComponentError(
|
||||
m.name,
|
||||
fmt.Sprintf("takeMeasurement(): Failed to close fsnotify.Watcher: %v", err))
|
||||
}
|
||||
}()
|
||||
if len(m.config.LockfilePath) > 0 {
|
||||
// Check if the lock file exists
|
||||
info, err := os.Stat(m.config.LockfilePath)
|
||||
@@ -386,9 +397,11 @@ func (m *LikwidCollector) takeMeasurement(evidx int, evset LikwidEventsetConfig,
|
||||
// Create the lock file if it does not exist
|
||||
file, createErr := os.Create(m.config.LockfilePath)
|
||||
if createErr != nil {
|
||||
return true, fmt.Errorf("failed to create lock file: %v", createErr)
|
||||
return true, fmt.Errorf("failed to create lock file: %w", createErr)
|
||||
}
|
||||
if err := file.Close(); err != nil {
|
||||
return true, fmt.Errorf("failed to close lock file: %w", err)
|
||||
}
|
||||
file.Close()
|
||||
info, err = os.Stat(m.config.LockfilePath) // Recheck the file after creation
|
||||
}
|
||||
if err != nil {
|
||||
@@ -440,6 +453,7 @@ func (m *LikwidCollector) takeMeasurement(evidx int, evset LikwidEventsetConfig,
|
||||
signal.Notify(sigchan, syscall.SIGCHLD)
|
||||
|
||||
// Add an event string to LIKWID
|
||||
var gid C.int
|
||||
select {
|
||||
case <-sigchan:
|
||||
gid = -1
|
||||
@@ -595,7 +609,6 @@ func (m *LikwidCollector) calcEventsetMetrics(evset LikwidEventsetConfig, interv
|
||||
evset.metrics[tid][metric.Name] = value
|
||||
// Now we have the result, send it with the proper tags
|
||||
if !math.IsNaN(value) && metric.Publish {
|
||||
fields := map[string]interface{}{"value": value}
|
||||
y, err :=
|
||||
lp.NewMessage(
|
||||
metric.Name,
|
||||
@@ -603,12 +616,14 @@ func (m *LikwidCollector) calcEventsetMetrics(evset LikwidEventsetConfig, interv
|
||||
"type": metric.Type,
|
||||
},
|
||||
m.meta,
|
||||
fields,
|
||||
map[string]any{
|
||||
"value": value,
|
||||
},
|
||||
now,
|
||||
)
|
||||
if err == nil {
|
||||
if metric.Type != "node" {
|
||||
y.AddTag("type-id", fmt.Sprintf("%d", domain))
|
||||
y.AddTag("type-id", strconv.Itoa(domain))
|
||||
}
|
||||
if len(metric.Unit) > 0 {
|
||||
y.AddMeta("unit", metric.Unit)
|
||||
@@ -638,10 +653,10 @@ func (m *LikwidCollector) calcEventsetMetrics(evset LikwidEventsetConfig, interv
|
||||
metric.Name,
|
||||
map[string]string{
|
||||
"type": "core",
|
||||
"type-id": fmt.Sprintf("%d", coreID),
|
||||
"type-id": strconv.Itoa(coreID),
|
||||
},
|
||||
m.meta,
|
||||
map[string]interface{}{
|
||||
map[string]any{
|
||||
"value": value,
|
||||
},
|
||||
now,
|
||||
@@ -675,10 +690,10 @@ func (m *LikwidCollector) calcEventsetMetrics(evset LikwidEventsetConfig, interv
|
||||
metric.Name,
|
||||
map[string]string{
|
||||
"type": "socket",
|
||||
"type-id": fmt.Sprintf("%d", socketID),
|
||||
"type-id": strconv.Itoa(socketID),
|
||||
},
|
||||
m.meta,
|
||||
map[string]interface{}{
|
||||
map[string]any{
|
||||
"value": value,
|
||||
},
|
||||
now,
|
||||
@@ -712,7 +727,7 @@ func (m *LikwidCollector) calcEventsetMetrics(evset LikwidEventsetConfig, interv
|
||||
"type": "node",
|
||||
},
|
||||
m.meta,
|
||||
map[string]interface{}{
|
||||
map[string]any{
|
||||
"value": totalNodeValue,
|
||||
},
|
||||
now,
|
||||
@@ -748,9 +763,7 @@ func (m *LikwidCollector) calcGlobalMetrics(groups []LikwidEventsetConfig, inter
|
||||
// Here we generate parameter list
|
||||
params := make(map[string]float64)
|
||||
for _, evset := range groups {
|
||||
for mname, mres := range evset.metrics[tid] {
|
||||
params[mname] = mres
|
||||
}
|
||||
maps.Copy(params, evset.metrics[tid])
|
||||
}
|
||||
params["gotime"] = interval.Seconds()
|
||||
// Evaluate the metric
|
||||
@@ -772,14 +785,14 @@ func (m *LikwidCollector) calcGlobalMetrics(groups []LikwidEventsetConfig, inter
|
||||
"type": metric.Type,
|
||||
},
|
||||
m.meta,
|
||||
map[string]interface{}{
|
||||
map[string]any{
|
||||
"value": value,
|
||||
},
|
||||
now,
|
||||
)
|
||||
if err == nil {
|
||||
if metric.Type != "node" {
|
||||
y.AddTag("type-id", fmt.Sprintf("%d", domain))
|
||||
y.AddTag("type-id", strconv.Itoa(domain))
|
||||
}
|
||||
if len(metric.Unit) > 0 {
|
||||
y.AddMeta("unit", metric.Unit)
|
||||
@@ -795,7 +808,7 @@ func (m *LikwidCollector) calcGlobalMetrics(groups []LikwidEventsetConfig, inter
|
||||
}
|
||||
|
||||
func (m *LikwidCollector) ReadThread(interval time.Duration, output chan lp.CCMessage) {
|
||||
var err error = nil
|
||||
var err error
|
||||
groups := make([]LikwidEventsetConfig, 0)
|
||||
|
||||
for evidx, evset := range m.config.Eventsets {
|
||||
@@ -813,13 +826,21 @@ func (m *LikwidCollector) ReadThread(interval time.Duration, output chan lp.CCMe
|
||||
|
||||
if !skip {
|
||||
// read measurements and derive event set metrics
|
||||
m.calcEventsetMetrics(e, interval, output)
|
||||
err = m.calcEventsetMetrics(e, interval, output)
|
||||
if err != nil {
|
||||
cclog.ComponentError(m.name, err.Error())
|
||||
return
|
||||
}
|
||||
groups = append(groups, e)
|
||||
}
|
||||
}
|
||||
if len(groups) > 0 {
|
||||
// calculate global metrics
|
||||
m.calcGlobalMetrics(groups, interval, output)
|
||||
err = m.calcGlobalMetrics(groups, interval, output)
|
||||
if err != nil {
|
||||
cclog.ComponentError(m.name, err.Error())
|
||||
return
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -11,6 +11,7 @@ import (
|
||||
"encoding/json"
|
||||
"fmt"
|
||||
"os"
|
||||
"slices"
|
||||
"strconv"
|
||||
"strings"
|
||||
"time"
|
||||
@@ -42,7 +43,9 @@ type LoadavgCollector struct {
|
||||
func (m *LoadavgCollector) Init(config json.RawMessage) error {
|
||||
m.name = "LoadavgCollector"
|
||||
m.parallel = true
|
||||
m.setup()
|
||||
if err := m.setup(); err != nil {
|
||||
return fmt.Errorf("%s Init(): setup() call failed: %w", m.name, err)
|
||||
}
|
||||
if len(config) > 0 {
|
||||
err := json.Unmarshal(config, &m.config)
|
||||
if err != nil {
|
||||
@@ -64,10 +67,10 @@ func (m *LoadavgCollector) Init(config json.RawMessage) error {
|
||||
m.proc_skips = make([]bool, len(m.proc_matches))
|
||||
|
||||
for i, name := range m.load_matches {
|
||||
_, m.load_skips[i] = stringArrayContains(m.config.ExcludeMetrics, name)
|
||||
m.load_skips[i] = slices.Contains(m.config.ExcludeMetrics, name)
|
||||
}
|
||||
for i, name := range m.proc_matches {
|
||||
_, m.proc_skips[i] = stringArrayContains(m.config.ExcludeMetrics, name)
|
||||
m.proc_skips[i] = slices.Contains(m.config.ExcludeMetrics, name)
|
||||
}
|
||||
m.init = true
|
||||
return nil
|
||||
@@ -99,7 +102,7 @@ func (m *LoadavgCollector) Read(interval time.Duration, output chan lp.CCMessage
|
||||
if m.load_skips[i] {
|
||||
continue
|
||||
}
|
||||
y, err := lp.NewMessage(name, m.tags, m.meta, map[string]interface{}{"value": x}, now)
|
||||
y, err := lp.NewMessage(name, m.tags, m.meta, map[string]any{"value": x}, now)
|
||||
if err == nil {
|
||||
output <- y
|
||||
}
|
||||
@@ -118,7 +121,7 @@ func (m *LoadavgCollector) Read(interval time.Duration, output chan lp.CCMessage
|
||||
if m.proc_skips[i] {
|
||||
continue
|
||||
}
|
||||
y, err := lp.NewMessage(name, m.tags, m.meta, map[string]interface{}{"value": x}, now)
|
||||
y, err := lp.NewMessage(name, m.tags, m.meta, map[string]any{"value": x}, now)
|
||||
if err == nil {
|
||||
output <- y
|
||||
}
|
||||
|
||||
@@ -13,6 +13,7 @@ import (
|
||||
"fmt"
|
||||
"os/exec"
|
||||
"os/user"
|
||||
"slices"
|
||||
"strconv"
|
||||
"strings"
|
||||
"time"
|
||||
@@ -61,7 +62,6 @@ func (m *LustreCollector) getDeviceDataCommand(device string) []string {
|
||||
} else {
|
||||
command = exec.Command(m.lctl, LCTL_OPTION, statsfile)
|
||||
}
|
||||
command.Wait()
|
||||
stdout, _ := command.Output()
|
||||
return strings.Split(string(stdout), "\n")
|
||||
}
|
||||
@@ -302,7 +302,9 @@ func (m *LustreCollector) Init(config json.RawMessage) error {
|
||||
return err
|
||||
}
|
||||
}
|
||||
m.setup()
|
||||
if err := m.setup(); err != nil {
|
||||
return fmt.Errorf("%s Init(): setup() call failed: %w", m.name, err)
|
||||
}
|
||||
m.tags = map[string]string{"type": "node"}
|
||||
m.meta = map[string]string{"source": m.name, "group": "Lustre"}
|
||||
|
||||
@@ -339,21 +341,21 @@ func (m *LustreCollector) Init(config json.RawMessage) error {
|
||||
m.definitions = []LustreMetricDefinition{}
|
||||
if m.config.SendAbsoluteValues {
|
||||
for _, def := range LustreAbsMetrics {
|
||||
if _, skip := stringArrayContains(m.config.ExcludeMetrics, def.name); !skip {
|
||||
if !slices.Contains(m.config.ExcludeMetrics, def.name) {
|
||||
m.definitions = append(m.definitions, def)
|
||||
}
|
||||
}
|
||||
}
|
||||
if m.config.SendDiffValues {
|
||||
for _, def := range LustreDiffMetrics {
|
||||
if _, skip := stringArrayContains(m.config.ExcludeMetrics, def.name); !skip {
|
||||
if !slices.Contains(m.config.ExcludeMetrics, def.name) {
|
||||
m.definitions = append(m.definitions, def)
|
||||
}
|
||||
}
|
||||
}
|
||||
if m.config.SendDerivedValues {
|
||||
for _, def := range LustreDeriveMetrics {
|
||||
if _, skip := stringArrayContains(m.config.ExcludeMetrics, def.name); !skip {
|
||||
if !slices.Contains(m.config.ExcludeMetrics, def.name) {
|
||||
m.definitions = append(m.definitions, def)
|
||||
}
|
||||
}
|
||||
@@ -402,23 +404,23 @@ func (m *LustreCollector) Read(interval time.Duration, output chan lp.CCMessage)
|
||||
} else {
|
||||
use_x = devData[def.name]
|
||||
}
|
||||
var value interface{}
|
||||
var value any
|
||||
switch def.calc {
|
||||
case "none":
|
||||
value = use_x
|
||||
y, err = lp.NewMessage(def.name, m.tags, m.meta, map[string]interface{}{"value": value}, time.Now())
|
||||
y, err = lp.NewMessage(def.name, m.tags, m.meta, map[string]any{"value": value}, time.Now())
|
||||
case "difference":
|
||||
value = use_x - devData[def.name]
|
||||
if value.(int64) < 0 {
|
||||
value = 0
|
||||
}
|
||||
y, err = lp.NewMessage(def.name, m.tags, m.meta, map[string]interface{}{"value": value}, time.Now())
|
||||
y, err = lp.NewMessage(def.name, m.tags, m.meta, map[string]any{"value": value}, time.Now())
|
||||
case "derivative":
|
||||
value = float64(use_x-devData[def.name]) / tdiff.Seconds()
|
||||
if value.(float64) < 0 {
|
||||
value = 0
|
||||
}
|
||||
y, err = lp.NewMessage(def.name, m.tags, m.meta, map[string]interface{}{"value": value}, time.Now())
|
||||
y, err = lp.NewMessage(def.name, m.tags, m.meta, map[string]any{"value": value}, time.Now())
|
||||
}
|
||||
if err == nil {
|
||||
y.AddTag("device", device)
|
||||
|
||||
@@ -15,6 +15,7 @@ import (
|
||||
"os"
|
||||
"path/filepath"
|
||||
"regexp"
|
||||
"slices"
|
||||
"strconv"
|
||||
"strings"
|
||||
"time"
|
||||
@@ -58,7 +59,11 @@ func getStats(filename string) map[string]MemstatStats {
|
||||
if err != nil {
|
||||
cclog.Error(err.Error())
|
||||
}
|
||||
defer file.Close()
|
||||
defer func() {
|
||||
if err := file.Close(); err != nil {
|
||||
cclog.Error(err.Error())
|
||||
}
|
||||
}()
|
||||
|
||||
scanner := bufio.NewScanner(file)
|
||||
for scanner.Scan() {
|
||||
@@ -115,19 +120,20 @@ func (m *MemstatCollector) Init(config json.RawMessage) error {
|
||||
"MemShared": "mem_shared",
|
||||
}
|
||||
for k, v := range matches {
|
||||
_, skip := stringArrayContains(m.config.ExcludeMetrics, k)
|
||||
if !skip {
|
||||
if !slices.Contains(m.config.ExcludeMetrics, k) {
|
||||
m.matches[k] = v
|
||||
}
|
||||
}
|
||||
m.sendMemUsed = false
|
||||
if _, skip := stringArrayContains(m.config.ExcludeMetrics, "mem_used"); !skip {
|
||||
if !slices.Contains(m.config.ExcludeMetrics, "mem_used") {
|
||||
m.sendMemUsed = true
|
||||
}
|
||||
if len(m.matches) == 0 {
|
||||
return errors.New("no metrics to collect")
|
||||
}
|
||||
m.setup()
|
||||
if err := m.setup(); err != nil {
|
||||
return fmt.Errorf("%s Init(): setup() call failed: %w", m.name, err)
|
||||
}
|
||||
|
||||
if m.config.NodeStats {
|
||||
if stats := getStats(MEMSTATFILE); len(stats) == 0 {
|
||||
@@ -153,7 +159,7 @@ func (m *MemstatCollector) Init(config json.RawMessage) error {
|
||||
file: f,
|
||||
tags: map[string]string{
|
||||
"type": "memoryDomain",
|
||||
"type-id": fmt.Sprintf("%d", id),
|
||||
"type-id": strconv.Itoa(id),
|
||||
},
|
||||
}
|
||||
m.nodefiles[id] = f
|
||||
@@ -174,7 +180,7 @@ func (m *MemstatCollector) Read(interval time.Duration, output chan lp.CCMessage
|
||||
sendStats := func(stats map[string]MemstatStats, tags map[string]string) {
|
||||
for match, name := range m.matches {
|
||||
var value float64 = 0
|
||||
var unit string = ""
|
||||
unit := ""
|
||||
if v, ok := stats[match]; ok {
|
||||
value = v.value
|
||||
if len(v.unit) > 0 {
|
||||
@@ -182,7 +188,7 @@ func (m *MemstatCollector) Read(interval time.Duration, output chan lp.CCMessage
|
||||
}
|
||||
}
|
||||
|
||||
y, err := lp.NewMessage(name, tags, m.meta, map[string]interface{}{"value": value}, time.Now())
|
||||
y, err := lp.NewMessage(name, tags, m.meta, map[string]any{"value": value}, time.Now())
|
||||
if err == nil {
|
||||
if len(unit) > 0 {
|
||||
y.AddMeta("unit", unit)
|
||||
@@ -215,7 +221,7 @@ func (m *MemstatCollector) Read(interval time.Duration, output chan lp.CCMessage
|
||||
}
|
||||
}
|
||||
}
|
||||
y, err := lp.NewMessage("mem_used", tags, m.meta, map[string]interface{}{"value": memUsed}, time.Now())
|
||||
y, err := lp.NewMessage("mem_used", tags, m.meta, map[string]any{"value": memUsed}, time.Now())
|
||||
if err == nil {
|
||||
if len(unit) > 0 {
|
||||
y.AddMeta("unit", unit)
|
||||
|
||||
@@ -51,30 +51,6 @@ func (c *metricCollector) Initialized() bool {
|
||||
return c.init
|
||||
}
|
||||
|
||||
// intArrayContains scans an array of ints if the value str is present in the array
|
||||
// If the specified value is found, the corresponding array index is returned.
|
||||
// The bool value is used to signal success or failure
|
||||
func intArrayContains(array []int, str int) (int, bool) {
|
||||
for i, a := range array {
|
||||
if a == str {
|
||||
return i, true
|
||||
}
|
||||
}
|
||||
return -1, false
|
||||
}
|
||||
|
||||
// stringArrayContains scans an array of strings if the value str is present in the array
|
||||
// If the specified value is found, the corresponding array index is returned.
|
||||
// The bool value is used to signal success or failure
|
||||
func stringArrayContains(array []string, str string) (int, bool) {
|
||||
for i, a := range array {
|
||||
if a == str {
|
||||
return i, true
|
||||
}
|
||||
}
|
||||
return -1, false
|
||||
}
|
||||
|
||||
// RemoveFromStringList removes the string r from the array of strings s
|
||||
// If r is not contained in the array an error is returned
|
||||
func RemoveFromStringList(s []string, r string) ([]string, error) {
|
||||
|
||||
@@ -10,8 +10,9 @@ package collectors
|
||||
import (
|
||||
"bufio"
|
||||
"encoding/json"
|
||||
"errors"
|
||||
"fmt"
|
||||
"os"
|
||||
"slices"
|
||||
"strconv"
|
||||
"strings"
|
||||
"time"
|
||||
@@ -65,7 +66,9 @@ func getCanonicalName(raw string, aliasToCanonical map[string]string) string {
|
||||
func (m *NetstatCollector) Init(config json.RawMessage) error {
|
||||
m.name = "NetstatCollector"
|
||||
m.parallel = true
|
||||
m.setup()
|
||||
if err := m.setup(); err != nil {
|
||||
return fmt.Errorf("%s Init(): setup() call failed: %w", m.name, err)
|
||||
}
|
||||
m.lastTimestamp = time.Now()
|
||||
|
||||
const (
|
||||
@@ -107,10 +110,8 @@ func (m *NetstatCollector) Init(config json.RawMessage) error {
|
||||
// Check access to net statistic file
|
||||
file, err := os.Open(NETSTATFILE)
|
||||
if err != nil {
|
||||
cclog.ComponentError(m.name, err.Error())
|
||||
return err
|
||||
return fmt.Errorf("%s Init(): failed to open netstat file \"%s\": %w", m.name, NETSTATFILE, err)
|
||||
}
|
||||
defer file.Close()
|
||||
|
||||
scanner := bufio.NewScanner(file)
|
||||
for scanner.Scan() {
|
||||
@@ -129,7 +130,7 @@ func (m *NetstatCollector) Init(config json.RawMessage) error {
|
||||
canonical := getCanonicalName(raw, m.aliasToCanonical)
|
||||
|
||||
// Check if device is a included device
|
||||
if _, ok := stringArrayContains(m.config.IncludeDevices, canonical); ok {
|
||||
if slices.Contains(m.config.IncludeDevices, canonical) {
|
||||
// Tag will contain original device name (raw).
|
||||
tags := map[string]string{"stype": "network", "stype-id": raw, "type": "node"}
|
||||
meta_unit_byte := map[string]string{"source": m.name, "group": "Network", "unit": "bytes"}
|
||||
@@ -174,8 +175,13 @@ func (m *NetstatCollector) Init(config json.RawMessage) error {
|
||||
}
|
||||
}
|
||||
|
||||
// Close netstat file
|
||||
if err := file.Close(); err != nil {
|
||||
return fmt.Errorf("%s Init(): failed to close netstat file \"%s\": %w", m.name, NETSTATFILE, err)
|
||||
}
|
||||
|
||||
if len(m.matches) == 0 {
|
||||
return errors.New("no devices to collector metrics found")
|
||||
return fmt.Errorf("%s Init(): no devices to collect metrics found", m.name)
|
||||
}
|
||||
m.init = true
|
||||
return nil
|
||||
@@ -194,10 +200,18 @@ func (m *NetstatCollector) Read(interval time.Duration, output chan lp.CCMessage
|
||||
|
||||
file, err := os.Open(NETSTATFILE)
|
||||
if err != nil {
|
||||
cclog.ComponentError(m.name, err.Error())
|
||||
cclog.ComponentError(
|
||||
m.name,
|
||||
fmt.Sprintf("Read(): Failed to open file '%s': %v", NETSTATFILE, err))
|
||||
return
|
||||
}
|
||||
defer file.Close()
|
||||
defer func() {
|
||||
if err := file.Close(); err != nil {
|
||||
cclog.ComponentError(
|
||||
m.name,
|
||||
fmt.Sprintf("Read(): Failed to close file '%s': %v", NETSTATFILE, err))
|
||||
}
|
||||
}()
|
||||
|
||||
scanner := bufio.NewScanner(file)
|
||||
for scanner.Scan() {
|
||||
@@ -226,14 +240,14 @@ func (m *NetstatCollector) Read(interval time.Duration, output chan lp.CCMessage
|
||||
continue
|
||||
}
|
||||
if m.config.SendAbsoluteValues {
|
||||
if y, err := lp.NewMessage(metric.name, metric.tags, metric.meta, map[string]interface{}{"value": v}, now); err == nil {
|
||||
if y, err := lp.NewMessage(metric.name, metric.tags, metric.meta, map[string]any{"value": v}, now); err == nil {
|
||||
output <- y
|
||||
}
|
||||
}
|
||||
if m.config.SendDerivedValues {
|
||||
if metric.lastValue >= 0 {
|
||||
rate := float64(v-metric.lastValue) / timeDiff
|
||||
if y, err := lp.NewMessage(metric.name+"_bw", metric.tags, metric.meta_rates, map[string]interface{}{"value": rate}, now); err == nil {
|
||||
if y, err := lp.NewMessage(metric.name+"_bw", metric.tags, metric.meta_rates, map[string]any{"value": rate}, now); err == nil {
|
||||
output <- y
|
||||
}
|
||||
}
|
||||
|
||||
@@ -10,7 +10,7 @@ package collectors
|
||||
import (
|
||||
"encoding/json"
|
||||
"fmt"
|
||||
"log"
|
||||
"slices"
|
||||
|
||||
// "os"
|
||||
"os/exec"
|
||||
@@ -18,6 +18,7 @@ import (
|
||||
"strings"
|
||||
"time"
|
||||
|
||||
cclog "github.com/ClusterCockpit/cc-lib/v2/ccLogger"
|
||||
lp "github.com/ClusterCockpit/cc-lib/v2/ccMessage"
|
||||
)
|
||||
|
||||
@@ -44,10 +45,15 @@ type nfsCollector struct {
|
||||
|
||||
func (m *nfsCollector) initStats() error {
|
||||
cmd := exec.Command(m.config.Nfsstats, `-l`, `--all`)
|
||||
cmd.Wait()
|
||||
|
||||
// Wait for cmd end
|
||||
if err := cmd.Wait(); err != nil {
|
||||
return fmt.Errorf("%s initStats(): %w", m.name, err)
|
||||
}
|
||||
|
||||
buffer, err := cmd.Output()
|
||||
if err == nil {
|
||||
for _, line := range strings.Split(string(buffer), "\n") {
|
||||
for line := range strings.Lines(string(buffer)) {
|
||||
lf := strings.Fields(line)
|
||||
if len(lf) != 5 {
|
||||
continue
|
||||
@@ -71,10 +77,15 @@ func (m *nfsCollector) initStats() error {
|
||||
|
||||
func (m *nfsCollector) updateStats() error {
|
||||
cmd := exec.Command(m.config.Nfsstats, `-l`, `--all`)
|
||||
cmd.Wait()
|
||||
|
||||
// Wait for cmd end
|
||||
if err := cmd.Wait(); err != nil {
|
||||
return fmt.Errorf("%s updateStats(): %w", m.name, err)
|
||||
}
|
||||
|
||||
buffer, err := cmd.Output()
|
||||
if err == nil {
|
||||
for _, line := range strings.Split(string(buffer), "\n") {
|
||||
for line := range strings.Lines(string(buffer)) {
|
||||
lf := strings.Fields(line)
|
||||
if len(lf) != 5 {
|
||||
continue
|
||||
@@ -102,8 +113,7 @@ func (m *nfsCollector) MainInit(config json.RawMessage) error {
|
||||
if len(config) > 0 {
|
||||
err := json.Unmarshal(config, &m.config)
|
||||
if err != nil {
|
||||
log.Print(err.Error())
|
||||
return err
|
||||
return fmt.Errorf("%s Init(): failed to unmarshal JSON config: %w", m.name, err)
|
||||
}
|
||||
}
|
||||
m.meta = map[string]string{
|
||||
@@ -116,10 +126,12 @@ func (m *nfsCollector) MainInit(config json.RawMessage) error {
|
||||
// Check if nfsstat is in executable search path
|
||||
_, err := exec.LookPath(m.config.Nfsstats)
|
||||
if err != nil {
|
||||
return fmt.Errorf("NfsCollector.Init(): Failed to find nfsstat binary '%s': %v", m.config.Nfsstats, err)
|
||||
return fmt.Errorf("%s Init(): Failed to find nfsstat binary '%s': %w", m.name, m.config.Nfsstats, err)
|
||||
}
|
||||
m.data = make(map[string]NfsCollectorData)
|
||||
m.initStats()
|
||||
if err := m.initStats(); err != nil {
|
||||
return fmt.Errorf("%s Init(): %w", m.name, err)
|
||||
}
|
||||
m.init = true
|
||||
m.parallel = true
|
||||
return nil
|
||||
@@ -131,8 +143,14 @@ func (m *nfsCollector) Read(interval time.Duration, output chan lp.CCMessage) {
|
||||
}
|
||||
timestamp := time.Now()
|
||||
|
||||
m.updateStats()
|
||||
prefix := ""
|
||||
if err := m.updateStats(); err != nil {
|
||||
cclog.ComponentError(
|
||||
m.name,
|
||||
fmt.Sprintf("Read(): updateStats() failed: %v", err),
|
||||
)
|
||||
return
|
||||
}
|
||||
var prefix string
|
||||
switch m.version {
|
||||
case "v3":
|
||||
prefix = "nfs3"
|
||||
@@ -143,11 +161,11 @@ func (m *nfsCollector) Read(interval time.Duration, output chan lp.CCMessage) {
|
||||
}
|
||||
|
||||
for name, data := range m.data {
|
||||
if _, skip := stringArrayContains(m.config.ExcludeMetrics, name); skip {
|
||||
if slices.Contains(m.config.ExcludeMetrics, name) {
|
||||
continue
|
||||
}
|
||||
value := data.current - data.last
|
||||
y, err := lp.NewMessage(fmt.Sprintf("%s_%s", prefix, name), m.tags, m.meta, map[string]interface{}{"value": value}, timestamp)
|
||||
y, err := lp.NewMessage(fmt.Sprintf("%s_%s", prefix, name), m.tags, m.meta, map[string]any{"value": value}, timestamp)
|
||||
if err == nil {
|
||||
y.AddMeta("version", m.version)
|
||||
output <- y
|
||||
@@ -170,13 +188,17 @@ type Nfs4Collector struct {
|
||||
func (m *Nfs3Collector) Init(config json.RawMessage) error {
|
||||
m.name = "Nfs3Collector"
|
||||
m.version = `v3`
|
||||
m.setup()
|
||||
if err := m.setup(); err != nil {
|
||||
return fmt.Errorf("%s Init(): setup() call failed: %w", m.name, err)
|
||||
}
|
||||
return m.MainInit(config)
|
||||
}
|
||||
|
||||
func (m *Nfs4Collector) Init(config json.RawMessage) error {
|
||||
m.name = "Nfs4Collector"
|
||||
m.version = `v4`
|
||||
m.setup()
|
||||
if err := m.setup(); err != nil {
|
||||
return fmt.Errorf("%s Init(): setup() call failed: %w", m.name, err)
|
||||
}
|
||||
return m.MainInit(config)
|
||||
}
|
||||
|
||||
@@ -12,6 +12,7 @@ import (
|
||||
"fmt"
|
||||
"os"
|
||||
"regexp"
|
||||
"slices"
|
||||
"strconv"
|
||||
"strings"
|
||||
"time"
|
||||
@@ -71,7 +72,7 @@ func (m *NfsIOStatCollector) readNfsiostats() map[string]map[string]int64 {
|
||||
// Is this a device line with mount point, remote target and NFS version?
|
||||
dev := resolve_regex_fields(l, deviceRegex)
|
||||
if len(dev) > 0 {
|
||||
if _, ok := stringArrayContains(m.config.ExcludeFilesystem, dev[m.key]); !ok {
|
||||
if !slices.Contains(m.config.ExcludeFilesystem, dev[m.key]) {
|
||||
current = dev
|
||||
if len(current["version"]) == 0 {
|
||||
current["version"] = "3"
|
||||
@@ -85,7 +86,7 @@ func (m *NfsIOStatCollector) readNfsiostats() map[string]map[string]int64 {
|
||||
if len(bytes) > 0 {
|
||||
data[current[m.key]] = make(map[string]int64)
|
||||
for name, sval := range bytes {
|
||||
if _, ok := stringArrayContains(m.config.ExcludeMetrics, name); !ok {
|
||||
if !slices.Contains(m.config.ExcludeMetrics, name) {
|
||||
val, err := strconv.ParseInt(sval, 10, 64)
|
||||
if err == nil {
|
||||
data[current[m.key]][name] = val
|
||||
@@ -102,7 +103,9 @@ func (m *NfsIOStatCollector) readNfsiostats() map[string]map[string]int64 {
|
||||
func (m *NfsIOStatCollector) Init(config json.RawMessage) error {
|
||||
var err error = nil
|
||||
m.name = "NfsIOStatCollector"
|
||||
m.setup()
|
||||
if err := m.setup(); err != nil {
|
||||
return fmt.Errorf("%s Init(): setup() call failed: %w", m.name, err)
|
||||
}
|
||||
m.parallel = true
|
||||
m.meta = map[string]string{"source": m.name, "group": "NFS", "unit": "bytes"}
|
||||
m.tags = map[string]string{"type": "node"}
|
||||
@@ -140,7 +143,13 @@ func (m *NfsIOStatCollector) Read(interval time.Duration, output chan lp.CCMessa
|
||||
if old, ok := m.data[mntpoint]; ok {
|
||||
for name, newVal := range values {
|
||||
if m.config.SendAbsoluteValues {
|
||||
msg, err := lp.NewMessage(fmt.Sprintf("nfsio_%s", name), m.tags, m.meta, map[string]interface{}{"value": newVal}, now)
|
||||
msg, err := lp.NewMessage(
|
||||
"nfsio_"+name,
|
||||
m.tags,
|
||||
m.meta,
|
||||
map[string]any{
|
||||
"value": newVal},
|
||||
now)
|
||||
if err == nil {
|
||||
msg.AddTag("stype", "filesystem")
|
||||
msg.AddTag("stype-id", mntpoint)
|
||||
@@ -149,7 +158,7 @@ func (m *NfsIOStatCollector) Read(interval time.Duration, output chan lp.CCMessa
|
||||
}
|
||||
if m.config.SendDerivedValues {
|
||||
rate := float64(newVal-old[name]) / timeDiff
|
||||
msg, err := lp.NewMessage(fmt.Sprintf("nfsio_%s_bw", name), m.tags, m.meta, map[string]interface{}{"value": rate}, now)
|
||||
msg, err := lp.NewMessage(fmt.Sprintf("nfsio_%s_bw", name), m.tags, m.meta, map[string]any{"value": rate}, now)
|
||||
if err == nil {
|
||||
if strings.HasPrefix(name, "page") {
|
||||
msg.AddMeta("unit", "4K_pages/s")
|
||||
|
||||
@@ -72,7 +72,9 @@ func (m *NUMAStatsCollector) Init(config json.RawMessage) error {
|
||||
|
||||
m.name = "NUMAStatsCollector"
|
||||
m.parallel = true
|
||||
m.setup()
|
||||
if err := m.setup(); err != nil {
|
||||
return fmt.Errorf("%s Init(): setup() call failed: %w", m.name, err)
|
||||
}
|
||||
m.meta = map[string]string{
|
||||
"source": m.name,
|
||||
"group": "NUMA",
|
||||
@@ -82,7 +84,7 @@ func (m *NUMAStatsCollector) Init(config json.RawMessage) error {
|
||||
if len(config) > 0 {
|
||||
err := json.Unmarshal(config, &m.config)
|
||||
if err != nil {
|
||||
return fmt.Errorf("unable to unmarshal numastat configuration: %s", err.Error())
|
||||
return fmt.Errorf("%s Init(): unable to unmarshal numastat configuration: %w", m.name, err)
|
||||
}
|
||||
}
|
||||
|
||||
@@ -91,10 +93,10 @@ func (m *NUMAStatsCollector) Init(config json.RawMessage) error {
|
||||
globPattern := base + "[0-9]*"
|
||||
dirs, err := filepath.Glob(globPattern)
|
||||
if err != nil {
|
||||
return fmt.Errorf("unable to glob files with pattern '%s'", globPattern)
|
||||
return fmt.Errorf("%s Init(): unable to glob files with pattern '%s'", m.name, globPattern)
|
||||
}
|
||||
if dirs == nil {
|
||||
return fmt.Errorf("unable to find any files with pattern '%s'", globPattern)
|
||||
return fmt.Errorf("%s Init(): unable to find any files with pattern '%s'", m.name, globPattern)
|
||||
}
|
||||
m.topology = make([]NUMAStatsCollectorTopolgy, 0, len(dirs))
|
||||
for _, dir := range dirs {
|
||||
@@ -186,7 +188,11 @@ func (m *NUMAStatsCollector) Read(interval time.Duration, output chan lp.CCMessa
|
||||
t.previousValues[key] = value
|
||||
}
|
||||
}
|
||||
file.Close()
|
||||
if err := file.Close(); err != nil {
|
||||
cclog.ComponentError(
|
||||
m.name,
|
||||
fmt.Sprintf("Read(): Failed to close file '%s': %v", t.file, err))
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -12,6 +12,9 @@ import (
|
||||
"errors"
|
||||
"fmt"
|
||||
"log"
|
||||
"maps"
|
||||
"slices"
|
||||
"strconv"
|
||||
"strings"
|
||||
"time"
|
||||
|
||||
@@ -64,7 +67,9 @@ func (m *NvidiaCollector) Init(config json.RawMessage) error {
|
||||
m.config.ProcessMigDevices = false
|
||||
m.config.UseUuidForMigDevices = false
|
||||
m.config.UseSliceForMigDevices = false
|
||||
m.setup()
|
||||
if err := m.setup(); err != nil {
|
||||
return fmt.Errorf("%s Init(): setup() call failed: %w", m.name, err)
|
||||
}
|
||||
if len(config) > 0 {
|
||||
err = json.Unmarshal(config, &m.config)
|
||||
if err != nil {
|
||||
@@ -105,11 +110,11 @@ func (m *NvidiaCollector) Init(config json.RawMessage) error {
|
||||
// For all GPUs
|
||||
idx := 0
|
||||
m.gpus = make([]NvidiaCollectorDevice, num_gpus)
|
||||
for i := 0; i < num_gpus; i++ {
|
||||
for i := range num_gpus {
|
||||
|
||||
// Skip excluded devices by ID
|
||||
str_i := fmt.Sprintf("%d", i)
|
||||
if _, skip := stringArrayContains(m.config.ExcludeDevices, str_i); skip {
|
||||
str_i := strconv.Itoa(i)
|
||||
if slices.Contains(m.config.ExcludeDevices, str_i) {
|
||||
cclog.ComponentDebug(m.name, "Skipping excluded device", str_i)
|
||||
continue
|
||||
}
|
||||
@@ -137,7 +142,7 @@ func (m *NvidiaCollector) Init(config json.RawMessage) error {
|
||||
pciInfo.Device)
|
||||
|
||||
// Skip excluded devices specified by PCI ID
|
||||
if _, skip := stringArrayContains(m.config.ExcludeDevices, pci_id); skip {
|
||||
if slices.Contains(m.config.ExcludeDevices, pci_id) {
|
||||
cclog.ComponentDebug(m.name, "Skipping excluded device", pci_id)
|
||||
continue
|
||||
}
|
||||
@@ -222,7 +227,7 @@ func readMemoryInfo(device *NvidiaCollectorDevice, output chan lp.CCMessage) err
|
||||
var total uint64
|
||||
var used uint64
|
||||
var reserved uint64 = 0
|
||||
var v2 bool = false
|
||||
v2 := false
|
||||
meminfo, ret := nvml.DeviceGetMemoryInfo(device.device)
|
||||
if ret != nvml.SUCCESS {
|
||||
err := errors.New(nvml.ErrorString(ret))
|
||||
@@ -235,7 +240,7 @@ func readMemoryInfo(device *NvidiaCollectorDevice, output chan lp.CCMessage) err
|
||||
|
||||
if !device.excludeMetrics["nv_fb_mem_total"] {
|
||||
t := float64(total) / (1024 * 1024)
|
||||
y, err := lp.NewMessage("nv_fb_mem_total", device.tags, device.meta, map[string]interface{}{"value": t}, time.Now())
|
||||
y, err := lp.NewMetric("nv_fb_mem_total", device.tags, device.meta, t, time.Now())
|
||||
if err == nil {
|
||||
y.AddMeta("unit", "MByte")
|
||||
output <- y
|
||||
@@ -244,7 +249,7 @@ func readMemoryInfo(device *NvidiaCollectorDevice, output chan lp.CCMessage) err
|
||||
|
||||
if !device.excludeMetrics["nv_fb_mem_used"] {
|
||||
f := float64(used) / (1024 * 1024)
|
||||
y, err := lp.NewMessage("nv_fb_mem_used", device.tags, device.meta, map[string]interface{}{"value": f}, time.Now())
|
||||
y, err := lp.NewMetric("nv_fb_mem_used", device.tags, device.meta, f, time.Now())
|
||||
if err == nil {
|
||||
y.AddMeta("unit", "MByte")
|
||||
output <- y
|
||||
@@ -253,7 +258,7 @@ func readMemoryInfo(device *NvidiaCollectorDevice, output chan lp.CCMessage) err
|
||||
|
||||
if v2 && !device.excludeMetrics["nv_fb_mem_reserved"] {
|
||||
r := float64(reserved) / (1024 * 1024)
|
||||
y, err := lp.NewMessage("nv_fb_mem_reserved", device.tags, device.meta, map[string]interface{}{"value": r}, time.Now())
|
||||
y, err := lp.NewMetric("nv_fb_mem_reserved", device.tags, device.meta, r, time.Now())
|
||||
if err == nil {
|
||||
y.AddMeta("unit", "MByte")
|
||||
output <- y
|
||||
@@ -272,7 +277,7 @@ func readBarMemoryInfo(device *NvidiaCollectorDevice, output chan lp.CCMessage)
|
||||
}
|
||||
if !device.excludeMetrics["nv_bar1_mem_total"] {
|
||||
t := float64(meminfo.Bar1Total) / (1024 * 1024)
|
||||
y, err := lp.NewMessage("nv_bar1_mem_total", device.tags, device.meta, map[string]interface{}{"value": t}, time.Now())
|
||||
y, err := lp.NewMetric("nv_bar1_mem_total", device.tags, device.meta, t, time.Now())
|
||||
if err == nil {
|
||||
y.AddMeta("unit", "MByte")
|
||||
output <- y
|
||||
@@ -280,7 +285,7 @@ func readBarMemoryInfo(device *NvidiaCollectorDevice, output chan lp.CCMessage)
|
||||
}
|
||||
if !device.excludeMetrics["nv_bar1_mem_used"] {
|
||||
t := float64(meminfo.Bar1Used) / (1024 * 1024)
|
||||
y, err := lp.NewMessage("nv_bar1_mem_used", device.tags, device.meta, map[string]interface{}{"value": t}, time.Now())
|
||||
y, err := lp.NewMetric("nv_bar1_mem_used", device.tags, device.meta, t, time.Now())
|
||||
if err == nil {
|
||||
y.AddMeta("unit", "MByte")
|
||||
output <- y
|
||||
@@ -314,14 +319,14 @@ func readUtilization(device *NvidiaCollectorDevice, output chan lp.CCMessage) er
|
||||
util, ret := nvml.DeviceGetUtilizationRates(device.device)
|
||||
if ret == nvml.SUCCESS {
|
||||
if !device.excludeMetrics["nv_util"] {
|
||||
y, err := lp.NewMessage("nv_util", device.tags, device.meta, map[string]interface{}{"value": float64(util.Gpu)}, time.Now())
|
||||
y, err := lp.NewMetric("nv_util", device.tags, device.meta, float64(util.Gpu), time.Now())
|
||||
if err == nil {
|
||||
y.AddMeta("unit", "%")
|
||||
output <- y
|
||||
}
|
||||
}
|
||||
if !device.excludeMetrics["nv_mem_util"] {
|
||||
y, err := lp.NewMessage("nv_mem_util", device.tags, device.meta, map[string]interface{}{"value": float64(util.Memory)}, time.Now())
|
||||
y, err := lp.NewMetric("nv_mem_util", device.tags, device.meta, float64(util.Memory), time.Now())
|
||||
if err == nil {
|
||||
y.AddMeta("unit", "%")
|
||||
output <- y
|
||||
@@ -341,7 +346,7 @@ func readTemp(device *NvidiaCollectorDevice, output chan lp.CCMessage) error {
|
||||
// * NVML_TEMPERATURE_COUNT
|
||||
temp, ret := nvml.DeviceGetTemperature(device.device, nvml.TEMPERATURE_GPU)
|
||||
if ret == nvml.SUCCESS {
|
||||
y, err := lp.NewMessage("nv_temp", device.tags, device.meta, map[string]interface{}{"value": float64(temp)}, time.Now())
|
||||
y, err := lp.NewMetric("nv_temp", device.tags, device.meta, float64(temp), time.Now())
|
||||
if err == nil {
|
||||
y.AddMeta("unit", "degC")
|
||||
output <- y
|
||||
@@ -364,7 +369,7 @@ func readFan(device *NvidiaCollectorDevice, output chan lp.CCMessage) error {
|
||||
// This value may exceed 100% in certain cases.
|
||||
fan, ret := nvml.DeviceGetFanSpeed(device.device)
|
||||
if ret == nvml.SUCCESS {
|
||||
y, err := lp.NewMessage("nv_fan", device.tags, device.meta, map[string]interface{}{"value": float64(fan)}, time.Now())
|
||||
y, err := lp.NewMetric("nv_fan", device.tags, device.meta, float64(fan), time.Now())
|
||||
if err == nil {
|
||||
y.AddMeta("unit", "%")
|
||||
output <- y
|
||||
@@ -374,27 +379,6 @@ func readFan(device *NvidiaCollectorDevice, output chan lp.CCMessage) error {
|
||||
return nil
|
||||
}
|
||||
|
||||
// func readFans(device *NvidiaCollectorDevice, output chan lp.CCMessage) error {
|
||||
// if !device.excludeMetrics["nv_fan"] {
|
||||
// numFans, ret := nvml.DeviceGetNumFans(device.device)
|
||||
// if ret == nvml.SUCCESS {
|
||||
// for i := 0; i < numFans; i++ {
|
||||
// fan, ret := nvml.DeviceGetFanSpeed_v2(device.device, i)
|
||||
// if ret == nvml.SUCCESS {
|
||||
// y, err := lp.NewMessage("nv_fan", device.tags, device.meta, map[string]interface{}{"value": float64(fan)}, time.Now())
|
||||
// if err == nil {
|
||||
// y.AddMeta("unit", "%")
|
||||
// y.AddTag("stype", "fan")
|
||||
// y.AddTag("stype-id", fmt.Sprintf("%d", i))
|
||||
// output <- y
|
||||
// }
|
||||
// }
|
||||
// }
|
||||
// }
|
||||
// }
|
||||
// return nil
|
||||
// }
|
||||
|
||||
func readEccMode(device *NvidiaCollectorDevice, output chan lp.CCMessage) error {
|
||||
if !device.excludeMetrics["nv_ecc_mode"] {
|
||||
// Retrieves the current and pending ECC modes for the device.
|
||||
@@ -405,22 +389,23 @@ func readEccMode(device *NvidiaCollectorDevice, output chan lp.CCMessage) error
|
||||
// Changing ECC modes requires a reboot.
|
||||
// The "pending" ECC mode refers to the target mode following the next reboot.
|
||||
_, ecc_pend, ret := nvml.DeviceGetEccMode(device.device)
|
||||
if ret == nvml.SUCCESS {
|
||||
switch ret {
|
||||
case nvml.SUCCESS:
|
||||
var y lp.CCMessage
|
||||
var err error
|
||||
switch ecc_pend {
|
||||
case nvml.FEATURE_DISABLED:
|
||||
y, err = lp.NewMessage("nv_ecc_mode", device.tags, device.meta, map[string]interface{}{"value": "OFF"}, time.Now())
|
||||
y, err = lp.NewMetric("nv_ecc_mode", device.tags, device.meta, "OFF", time.Now())
|
||||
case nvml.FEATURE_ENABLED:
|
||||
y, err = lp.NewMessage("nv_ecc_mode", device.tags, device.meta, map[string]interface{}{"value": "ON"}, time.Now())
|
||||
y, err = lp.NewMetric("nv_ecc_mode", device.tags, device.meta, "ON", time.Now())
|
||||
default:
|
||||
y, err = lp.NewMessage("nv_ecc_mode", device.tags, device.meta, map[string]interface{}{"value": "UNKNOWN"}, time.Now())
|
||||
y, err = lp.NewMetric("nv_ecc_mode", device.tags, device.meta, "UNKNOWN", time.Now())
|
||||
}
|
||||
if err == nil {
|
||||
output <- y
|
||||
}
|
||||
} else if ret == nvml.ERROR_NOT_SUPPORTED {
|
||||
y, err := lp.NewMessage("nv_ecc_mode", device.tags, device.meta, map[string]interface{}{"value": "N/A"}, time.Now())
|
||||
case nvml.ERROR_NOT_SUPPORTED:
|
||||
y, err := lp.NewMetric("nv_ecc_mode", device.tags, device.meta, "N/A", time.Now())
|
||||
if err == nil {
|
||||
output <- y
|
||||
}
|
||||
@@ -440,7 +425,7 @@ func readPerfState(device *NvidiaCollectorDevice, output chan lp.CCMessage) erro
|
||||
// 32: Unknown performance state.
|
||||
pState, ret := nvml.DeviceGetPerformanceState(device.device)
|
||||
if ret == nvml.SUCCESS {
|
||||
y, err := lp.NewMessage("nv_perf_state", device.tags, device.meta, map[string]interface{}{"value": fmt.Sprintf("P%d", int(pState))}, time.Now())
|
||||
y, err := lp.NewMetric("nv_perf_state", device.tags, device.meta, fmt.Sprintf("P%d", int(pState)), time.Now())
|
||||
if err == nil {
|
||||
output <- y
|
||||
}
|
||||
@@ -466,7 +451,7 @@ func readPowerUsage(device *NvidiaCollectorDevice, output chan lp.CCMessage) err
|
||||
if mode == nvml.FEATURE_ENABLED {
|
||||
power, ret := nvml.DeviceGetPowerUsage(device.device)
|
||||
if ret == nvml.SUCCESS {
|
||||
y, err := lp.NewMessage("nv_power_usage", device.tags, device.meta, map[string]interface{}{"value": float64(power) / 1000}, time.Now())
|
||||
y, err := lp.NewMetric("nv_power_usage", device.tags, device.meta, float64(power)/1000, time.Now())
|
||||
if err == nil {
|
||||
y.AddMeta("unit", "watts")
|
||||
output <- y
|
||||
@@ -492,7 +477,12 @@ func readEnergyConsumption(device *NvidiaCollectorDevice, output chan lp.CCMessa
|
||||
if ret == nvml.SUCCESS {
|
||||
if device.lastEnergyReading != 0 {
|
||||
if !device.excludeMetrics["nv_energy"] {
|
||||
y, err := lp.NewMetric("nv_energy", device.tags, device.meta, (energy-device.lastEnergyReading)/1000, now)
|
||||
y, err := lp.NewMetric(
|
||||
"nv_energy",
|
||||
device.tags,
|
||||
device.meta,
|
||||
(energy-device.lastEnergyReading)/1000,
|
||||
now)
|
||||
if err == nil {
|
||||
y.AddMeta("unit", "Joules")
|
||||
output <- y
|
||||
@@ -534,7 +524,7 @@ func readClocks(device *NvidiaCollectorDevice, output chan lp.CCMessage) error {
|
||||
if !device.excludeMetrics["nv_graphics_clock"] {
|
||||
graphicsClock, ret := nvml.DeviceGetClockInfo(device.device, nvml.CLOCK_GRAPHICS)
|
||||
if ret == nvml.SUCCESS {
|
||||
y, err := lp.NewMessage("nv_graphics_clock", device.tags, device.meta, map[string]interface{}{"value": float64(graphicsClock)}, time.Now())
|
||||
y, err := lp.NewMetric("nv_graphics_clock", device.tags, device.meta, float64(graphicsClock), time.Now())
|
||||
if err == nil {
|
||||
y.AddMeta("unit", "MHz")
|
||||
output <- y
|
||||
@@ -545,7 +535,7 @@ func readClocks(device *NvidiaCollectorDevice, output chan lp.CCMessage) error {
|
||||
if !device.excludeMetrics["nv_sm_clock"] {
|
||||
smCock, ret := nvml.DeviceGetClockInfo(device.device, nvml.CLOCK_SM)
|
||||
if ret == nvml.SUCCESS {
|
||||
y, err := lp.NewMessage("nv_sm_clock", device.tags, device.meta, map[string]interface{}{"value": float64(smCock)}, time.Now())
|
||||
y, err := lp.NewMetric("nv_sm_clock", device.tags, device.meta, float64(smCock), time.Now())
|
||||
if err == nil {
|
||||
y.AddMeta("unit", "MHz")
|
||||
output <- y
|
||||
@@ -556,7 +546,7 @@ func readClocks(device *NvidiaCollectorDevice, output chan lp.CCMessage) error {
|
||||
if !device.excludeMetrics["nv_mem_clock"] {
|
||||
memClock, ret := nvml.DeviceGetClockInfo(device.device, nvml.CLOCK_MEM)
|
||||
if ret == nvml.SUCCESS {
|
||||
y, err := lp.NewMessage("nv_mem_clock", device.tags, device.meta, map[string]interface{}{"value": float64(memClock)}, time.Now())
|
||||
y, err := lp.NewMetric("nv_mem_clock", device.tags, device.meta, float64(memClock), time.Now())
|
||||
if err == nil {
|
||||
y.AddMeta("unit", "MHz")
|
||||
output <- y
|
||||
@@ -566,7 +556,7 @@ func readClocks(device *NvidiaCollectorDevice, output chan lp.CCMessage) error {
|
||||
if !device.excludeMetrics["nv_video_clock"] {
|
||||
memClock, ret := nvml.DeviceGetClockInfo(device.device, nvml.CLOCK_VIDEO)
|
||||
if ret == nvml.SUCCESS {
|
||||
y, err := lp.NewMessage("nv_video_clock", device.tags, device.meta, map[string]interface{}{"value": float64(memClock)}, time.Now())
|
||||
y, err := lp.NewMetric("nv_video_clock", device.tags, device.meta, float64(memClock), time.Now())
|
||||
if err == nil {
|
||||
y.AddMeta("unit", "MHz")
|
||||
output <- y
|
||||
@@ -647,7 +637,7 @@ func readEccErrors(device *NvidiaCollectorDevice, output chan lp.CCMessage) erro
|
||||
// i.e. the total set of errors across the entire device.
|
||||
ecc_db, ret := nvml.DeviceGetTotalEccErrors(device.device, nvml.MEMORY_ERROR_TYPE_UNCORRECTED, nvml.AGGREGATE_ECC)
|
||||
if ret == nvml.SUCCESS {
|
||||
y, err := lp.NewMessage("nv_ecc_uncorrected_error", device.tags, device.meta, map[string]interface{}{"value": float64(ecc_db)}, time.Now())
|
||||
y, err := lp.NewMetric("nv_ecc_uncorrected_error", device.tags, device.meta, float64(ecc_db), time.Now())
|
||||
if err == nil {
|
||||
output <- y
|
||||
}
|
||||
@@ -656,7 +646,7 @@ func readEccErrors(device *NvidiaCollectorDevice, output chan lp.CCMessage) erro
|
||||
if !device.excludeMetrics["nv_ecc_corrected_error"] {
|
||||
ecc_sb, ret := nvml.DeviceGetTotalEccErrors(device.device, nvml.MEMORY_ERROR_TYPE_CORRECTED, nvml.AGGREGATE_ECC)
|
||||
if ret == nvml.SUCCESS {
|
||||
y, err := lp.NewMessage("nv_ecc_corrected_error", device.tags, device.meta, map[string]interface{}{"value": float64(ecc_sb)}, time.Now())
|
||||
y, err := lp.NewMetric("nv_ecc_corrected_error", device.tags, device.meta, float64(ecc_sb), time.Now())
|
||||
if err == nil {
|
||||
output <- y
|
||||
}
|
||||
@@ -675,7 +665,7 @@ func readPowerLimit(device *NvidiaCollectorDevice, output chan lp.CCMessage) err
|
||||
// If the card's total power draw reaches this limit the power management algorithm kicks in.
|
||||
pwr_limit, ret := nvml.DeviceGetPowerManagementLimit(device.device)
|
||||
if ret == nvml.SUCCESS {
|
||||
y, err := lp.NewMessage("nv_power_max_limit", device.tags, device.meta, map[string]interface{}{"value": float64(pwr_limit) / 1000}, time.Now())
|
||||
y, err := lp.NewMetric("nv_power_max_limit", device.tags, device.meta, float64(pwr_limit)/1000, time.Now())
|
||||
if err == nil {
|
||||
y.AddMeta("unit", "watts")
|
||||
output <- y
|
||||
@@ -702,7 +692,7 @@ func readEncUtilization(device *NvidiaCollectorDevice, output chan lp.CCMessage)
|
||||
// Note: On MIG-enabled GPUs, querying encoder utilization is not currently supported.
|
||||
enc_util, _, ret := nvml.DeviceGetEncoderUtilization(device.device)
|
||||
if ret == nvml.SUCCESS {
|
||||
y, err := lp.NewMessage("nv_encoder_util", device.tags, device.meta, map[string]interface{}{"value": float64(enc_util)}, time.Now())
|
||||
y, err := lp.NewMetric("nv_encoder_util", device.tags, device.meta, float64(enc_util), time.Now())
|
||||
if err == nil {
|
||||
y.AddMeta("unit", "%")
|
||||
output <- y
|
||||
@@ -729,7 +719,7 @@ func readDecUtilization(device *NvidiaCollectorDevice, output chan lp.CCMessage)
|
||||
// Note: On MIG-enabled GPUs, querying encoder utilization is not currently supported.
|
||||
dec_util, _, ret := nvml.DeviceGetDecoderUtilization(device.device)
|
||||
if ret == nvml.SUCCESS {
|
||||
y, err := lp.NewMessage("nv_decoder_util", device.tags, device.meta, map[string]interface{}{"value": float64(dec_util)}, time.Now())
|
||||
y, err := lp.NewMetric("nv_decoder_util", device.tags, device.meta, float64(dec_util), time.Now())
|
||||
if err == nil {
|
||||
y.AddMeta("unit", "%")
|
||||
output <- y
|
||||
@@ -756,33 +746,33 @@ func readRemappedRows(device *NvidiaCollectorDevice, output chan lp.CCMessage) e
|
||||
corrected, uncorrected, pending, failure, ret := nvml.DeviceGetRemappedRows(device.device)
|
||||
if ret == nvml.SUCCESS {
|
||||
if !device.excludeMetrics["nv_remapped_rows_corrected"] {
|
||||
y, err := lp.NewMessage("nv_remapped_rows_corrected", device.tags, device.meta, map[string]interface{}{"value": float64(corrected)}, time.Now())
|
||||
y, err := lp.NewMetric("nv_remapped_rows_corrected", device.tags, device.meta, float64(corrected), time.Now())
|
||||
if err == nil {
|
||||
output <- y
|
||||
}
|
||||
}
|
||||
if !device.excludeMetrics["nv_remapped_rows_uncorrected"] {
|
||||
y, err := lp.NewMessage("nv_remapped_rows_corrected", device.tags, device.meta, map[string]interface{}{"value": float64(uncorrected)}, time.Now())
|
||||
y, err := lp.NewMetric("nv_remapped_rows_corrected", device.tags, device.meta, float64(uncorrected), time.Now())
|
||||
if err == nil {
|
||||
output <- y
|
||||
}
|
||||
}
|
||||
if !device.excludeMetrics["nv_remapped_rows_pending"] {
|
||||
var p int = 0
|
||||
p := 0
|
||||
if pending {
|
||||
p = 1
|
||||
}
|
||||
y, err := lp.NewMessage("nv_remapped_rows_pending", device.tags, device.meta, map[string]interface{}{"value": p}, time.Now())
|
||||
y, err := lp.NewMetric("nv_remapped_rows_pending", device.tags, device.meta, p, time.Now())
|
||||
if err == nil {
|
||||
output <- y
|
||||
}
|
||||
}
|
||||
if !device.excludeMetrics["nv_remapped_rows_failure"] {
|
||||
var f int = 0
|
||||
f := 0
|
||||
if failure {
|
||||
f = 1
|
||||
}
|
||||
y, err := lp.NewMessage("nv_remapped_rows_failure", device.tags, device.meta, map[string]interface{}{"value": f}, time.Now())
|
||||
y, err := lp.NewMetric("nv_remapped_rows_failure", device.tags, device.meta, f, time.Now())
|
||||
if err == nil {
|
||||
output <- y
|
||||
}
|
||||
@@ -816,7 +806,7 @@ func readProcessCounts(device *NvidiaCollectorDevice, output chan lp.CCMessage)
|
||||
// Querying per-instance information using MIG device handles is not supported if the device is in vGPU Host virtualization mode.
|
||||
procList, ret := nvml.DeviceGetComputeRunningProcesses(device.device)
|
||||
if ret == nvml.SUCCESS {
|
||||
y, err := lp.NewMessage("nv_compute_processes", device.tags, device.meta, map[string]interface{}{"value": len(procList)}, time.Now())
|
||||
y, err := lp.NewMetric("nv_compute_processes", device.tags, device.meta, len(procList), time.Now())
|
||||
if err == nil {
|
||||
output <- y
|
||||
}
|
||||
@@ -845,7 +835,7 @@ func readProcessCounts(device *NvidiaCollectorDevice, output chan lp.CCMessage)
|
||||
// Querying per-instance information using MIG device handles is not supported if the device is in vGPU Host virtualization mode.
|
||||
procList, ret := nvml.DeviceGetGraphicsRunningProcesses(device.device)
|
||||
if ret == nvml.SUCCESS {
|
||||
y, err := lp.NewMessage("nv_graphics_processes", device.tags, device.meta, map[string]interface{}{"value": len(procList)}, time.Now())
|
||||
y, err := lp.NewMetric("nv_graphics_processes", device.tags, device.meta, len(procList), time.Now())
|
||||
if err == nil {
|
||||
output <- y
|
||||
}
|
||||
@@ -875,7 +865,7 @@ func readProcessCounts(device *NvidiaCollectorDevice, output chan lp.CCMessage)
|
||||
// // Querying per-instance information using MIG device handles is not supported if the device is in vGPU Host virtualization mode.
|
||||
// procList, ret := nvml.DeviceGetMPSComputeRunningProcesses(device.device)
|
||||
// if ret == nvml.SUCCESS {
|
||||
// y, err := lp.NewMessage("nv_mps_compute_processes", device.tags, device.meta, map[string]interface{}{"value": len(procList)}, time.Now())
|
||||
// y, err := lp.NewMetric("nv_mps_compute_processes", device.tags, device.meta, len(procList), time.Now())
|
||||
// if err == nil {
|
||||
// output <- y
|
||||
// }
|
||||
@@ -903,7 +893,7 @@ func readViolationStats(device *NvidiaCollectorDevice, output chan lp.CCMessage)
|
||||
violTime, ret = nvml.DeviceGetViolationStatus(device.device, nvml.PERF_POLICY_POWER)
|
||||
if ret == nvml.SUCCESS {
|
||||
t := float64(violTime.ViolationTime) * 1e-9
|
||||
y, err := lp.NewMessage("nv_violation_power", device.tags, device.meta, map[string]interface{}{"value": t}, time.Now())
|
||||
y, err := lp.NewMetric("nv_violation_power", device.tags, device.meta, t, time.Now())
|
||||
if err == nil {
|
||||
y.AddMeta("unit", "sec")
|
||||
output <- y
|
||||
@@ -915,7 +905,7 @@ func readViolationStats(device *NvidiaCollectorDevice, output chan lp.CCMessage)
|
||||
violTime, ret = nvml.DeviceGetViolationStatus(device.device, nvml.PERF_POLICY_THERMAL)
|
||||
if ret == nvml.SUCCESS {
|
||||
t := float64(violTime.ViolationTime) * 1e-9
|
||||
y, err := lp.NewMessage("nv_violation_thermal", device.tags, device.meta, map[string]interface{}{"value": t}, time.Now())
|
||||
y, err := lp.NewMetric("nv_violation_thermal", device.tags, device.meta, t, time.Now())
|
||||
if err == nil {
|
||||
y.AddMeta("unit", "sec")
|
||||
output <- y
|
||||
@@ -927,7 +917,7 @@ func readViolationStats(device *NvidiaCollectorDevice, output chan lp.CCMessage)
|
||||
violTime, ret = nvml.DeviceGetViolationStatus(device.device, nvml.PERF_POLICY_SYNC_BOOST)
|
||||
if ret == nvml.SUCCESS {
|
||||
t := float64(violTime.ViolationTime) * 1e-9
|
||||
y, err := lp.NewMessage("nv_violation_sync_boost", device.tags, device.meta, map[string]interface{}{"value": t}, time.Now())
|
||||
y, err := lp.NewMetric("nv_violation_sync_boost", device.tags, device.meta, t, time.Now())
|
||||
if err == nil {
|
||||
y.AddMeta("unit", "sec")
|
||||
output <- y
|
||||
@@ -939,7 +929,7 @@ func readViolationStats(device *NvidiaCollectorDevice, output chan lp.CCMessage)
|
||||
violTime, ret = nvml.DeviceGetViolationStatus(device.device, nvml.PERF_POLICY_BOARD_LIMIT)
|
||||
if ret == nvml.SUCCESS {
|
||||
t := float64(violTime.ViolationTime) * 1e-9
|
||||
y, err := lp.NewMessage("nv_violation_board_limit", device.tags, device.meta, map[string]interface{}{"value": t}, time.Now())
|
||||
y, err := lp.NewMetric("nv_violation_board_limit", device.tags, device.meta, t, time.Now())
|
||||
if err == nil {
|
||||
y.AddMeta("unit", "sec")
|
||||
output <- y
|
||||
@@ -951,7 +941,7 @@ func readViolationStats(device *NvidiaCollectorDevice, output chan lp.CCMessage)
|
||||
violTime, ret = nvml.DeviceGetViolationStatus(device.device, nvml.PERF_POLICY_LOW_UTILIZATION)
|
||||
if ret == nvml.SUCCESS {
|
||||
t := float64(violTime.ViolationTime) * 1e-9
|
||||
y, err := lp.NewMessage("nv_violation_low_util", device.tags, device.meta, map[string]interface{}{"value": t}, time.Now())
|
||||
y, err := lp.NewMetric("nv_violation_low_util", device.tags, device.meta, t, time.Now())
|
||||
if err == nil {
|
||||
y.AddMeta("unit", "sec")
|
||||
output <- y
|
||||
@@ -963,7 +953,7 @@ func readViolationStats(device *NvidiaCollectorDevice, output chan lp.CCMessage)
|
||||
violTime, ret = nvml.DeviceGetViolationStatus(device.device, nvml.PERF_POLICY_RELIABILITY)
|
||||
if ret == nvml.SUCCESS {
|
||||
t := float64(violTime.ViolationTime) * 1e-9
|
||||
y, err := lp.NewMessage("nv_violation_reliability", device.tags, device.meta, map[string]interface{}{"value": t}, time.Now())
|
||||
y, err := lp.NewMetric("nv_violation_reliability", device.tags, device.meta, t, time.Now())
|
||||
if err == nil {
|
||||
y.AddMeta("unit", "sec")
|
||||
output <- y
|
||||
@@ -975,7 +965,7 @@ func readViolationStats(device *NvidiaCollectorDevice, output chan lp.CCMessage)
|
||||
violTime, ret = nvml.DeviceGetViolationStatus(device.device, nvml.PERF_POLICY_TOTAL_APP_CLOCKS)
|
||||
if ret == nvml.SUCCESS {
|
||||
t := float64(violTime.ViolationTime) * 1e-9
|
||||
y, err := lp.NewMessage("nv_violation_below_app_clock", device.tags, device.meta, map[string]interface{}{"value": t}, time.Now())
|
||||
y, err := lp.NewMetric("nv_violation_below_app_clock", device.tags, device.meta, t, time.Now())
|
||||
if err == nil {
|
||||
y.AddMeta("unit", "sec")
|
||||
output <- y
|
||||
@@ -987,7 +977,7 @@ func readViolationStats(device *NvidiaCollectorDevice, output chan lp.CCMessage)
|
||||
violTime, ret = nvml.DeviceGetViolationStatus(device.device, nvml.PERF_POLICY_TOTAL_BASE_CLOCKS)
|
||||
if ret == nvml.SUCCESS {
|
||||
t := float64(violTime.ViolationTime) * 1e-9
|
||||
y, err := lp.NewMessage("nv_violation_below_base_clock", device.tags, device.meta, map[string]interface{}{"value": t}, time.Now())
|
||||
y, err := lp.NewMetric("nv_violation_below_base_clock", device.tags, device.meta, t, time.Now())
|
||||
if err == nil {
|
||||
y.AddMeta("unit", "sec")
|
||||
output <- y
|
||||
@@ -1010,19 +1000,19 @@ func readNVLinkStats(device *NvidiaCollectorDevice, output chan lp.CCMessage) er
|
||||
var aggregate_recovery_errors uint64 = 0
|
||||
var aggregate_crc_flit_errors uint64 = 0
|
||||
|
||||
for i := 0; i < nvml.NVLINK_MAX_LINKS; i++ {
|
||||
for i := range nvml.NVLINK_MAX_LINKS {
|
||||
state, ret := nvml.DeviceGetNvLinkState(device.device, i)
|
||||
if ret == nvml.SUCCESS {
|
||||
if state == nvml.FEATURE_ENABLED {
|
||||
if !device.excludeMetrics["nv_nvlink_crc_errors"] {
|
||||
// Data link receive data CRC error counter
|
||||
count, ret := nvml.DeviceGetNvLinkErrorCounter(device.device, i, nvml.NVLINK_ERROR_DL_CRC_DATA)
|
||||
aggregate_crc_errors = aggregate_crc_errors + count
|
||||
aggregate_crc_errors += count
|
||||
if ret == nvml.SUCCESS {
|
||||
y, err := lp.NewMessage("nv_nvlink_crc_errors", device.tags, device.meta, map[string]interface{}{"value": count}, time.Now())
|
||||
y, err := lp.NewMetric("nv_nvlink_crc_errors", device.tags, device.meta, count, time.Now())
|
||||
if err == nil {
|
||||
y.AddTag("stype", "nvlink")
|
||||
y.AddTag("stype-id", fmt.Sprintf("%d", i))
|
||||
y.AddTag("stype-id", strconv.Itoa(i))
|
||||
output <- y
|
||||
}
|
||||
}
|
||||
@@ -1030,12 +1020,12 @@ func readNVLinkStats(device *NvidiaCollectorDevice, output chan lp.CCMessage) er
|
||||
if !device.excludeMetrics["nv_nvlink_ecc_errors"] {
|
||||
// Data link receive data ECC error counter
|
||||
count, ret := nvml.DeviceGetNvLinkErrorCounter(device.device, i, nvml.NVLINK_ERROR_DL_ECC_DATA)
|
||||
aggregate_ecc_errors = aggregate_ecc_errors + count
|
||||
aggregate_ecc_errors += count
|
||||
if ret == nvml.SUCCESS {
|
||||
y, err := lp.NewMessage("nv_nvlink_ecc_errors", device.tags, device.meta, map[string]interface{}{"value": count}, time.Now())
|
||||
y, err := lp.NewMetric("nv_nvlink_ecc_errors", device.tags, device.meta, count, time.Now())
|
||||
if err == nil {
|
||||
y.AddTag("stype", "nvlink")
|
||||
y.AddTag("stype-id", fmt.Sprintf("%d", i))
|
||||
y.AddTag("stype-id", strconv.Itoa(i))
|
||||
output <- y
|
||||
}
|
||||
}
|
||||
@@ -1043,12 +1033,12 @@ func readNVLinkStats(device *NvidiaCollectorDevice, output chan lp.CCMessage) er
|
||||
if !device.excludeMetrics["nv_nvlink_replay_errors"] {
|
||||
// Data link transmit replay error counter
|
||||
count, ret := nvml.DeviceGetNvLinkErrorCounter(device.device, i, nvml.NVLINK_ERROR_DL_REPLAY)
|
||||
aggregate_replay_errors = aggregate_replay_errors + count
|
||||
aggregate_replay_errors += count
|
||||
if ret == nvml.SUCCESS {
|
||||
y, err := lp.NewMessage("nv_nvlink_replay_errors", device.tags, device.meta, map[string]interface{}{"value": count}, time.Now())
|
||||
y, err := lp.NewMetric("nv_nvlink_replay_errors", device.tags, device.meta, count, time.Now())
|
||||
if err == nil {
|
||||
y.AddTag("stype", "nvlink")
|
||||
y.AddTag("stype-id", fmt.Sprintf("%d", i))
|
||||
y.AddTag("stype-id", strconv.Itoa(i))
|
||||
output <- y
|
||||
}
|
||||
}
|
||||
@@ -1056,12 +1046,12 @@ func readNVLinkStats(device *NvidiaCollectorDevice, output chan lp.CCMessage) er
|
||||
if !device.excludeMetrics["nv_nvlink_recovery_errors"] {
|
||||
// Data link transmit recovery error counter
|
||||
count, ret := nvml.DeviceGetNvLinkErrorCounter(device.device, i, nvml.NVLINK_ERROR_DL_RECOVERY)
|
||||
aggregate_recovery_errors = aggregate_recovery_errors + count
|
||||
aggregate_recovery_errors += count
|
||||
if ret == nvml.SUCCESS {
|
||||
y, err := lp.NewMessage("nv_nvlink_recovery_errors", device.tags, device.meta, map[string]interface{}{"value": count}, time.Now())
|
||||
y, err := lp.NewMetric("nv_nvlink_recovery_errors", device.tags, device.meta, count, time.Now())
|
||||
if err == nil {
|
||||
y.AddTag("stype", "nvlink")
|
||||
y.AddTag("stype-id", fmt.Sprintf("%d", i))
|
||||
y.AddTag("stype-id", strconv.Itoa(i))
|
||||
output <- y
|
||||
}
|
||||
}
|
||||
@@ -1069,12 +1059,12 @@ func readNVLinkStats(device *NvidiaCollectorDevice, output chan lp.CCMessage) er
|
||||
if !device.excludeMetrics["nv_nvlink_crc_flit_errors"] {
|
||||
// Data link receive flow control digit CRC error counter
|
||||
count, ret := nvml.DeviceGetNvLinkErrorCounter(device.device, i, nvml.NVLINK_ERROR_DL_CRC_FLIT)
|
||||
aggregate_crc_flit_errors = aggregate_crc_flit_errors + count
|
||||
aggregate_crc_flit_errors += count
|
||||
if ret == nvml.SUCCESS {
|
||||
y, err := lp.NewMessage("nv_nvlink_crc_flit_errors", device.tags, device.meta, map[string]interface{}{"value": count}, time.Now())
|
||||
y, err := lp.NewMetric("nv_nvlink_crc_flit_errors", device.tags, device.meta, count, time.Now())
|
||||
if err == nil {
|
||||
y.AddTag("stype", "nvlink")
|
||||
y.AddTag("stype-id", fmt.Sprintf("%d", i))
|
||||
y.AddTag("stype-id", strconv.Itoa(i))
|
||||
output <- y
|
||||
}
|
||||
}
|
||||
@@ -1086,7 +1076,7 @@ func readNVLinkStats(device *NvidiaCollectorDevice, output chan lp.CCMessage) er
|
||||
// Export aggegated values
|
||||
if !device.excludeMetrics["nv_nvlink_crc_errors"] {
|
||||
// Data link receive data CRC error counter
|
||||
y, err := lp.NewMessage("nv_nvlink_crc_errors_sum", device.tags, device.meta, map[string]interface{}{"value": aggregate_crc_errors}, time.Now())
|
||||
y, err := lp.NewMetric("nv_nvlink_crc_errors_sum", device.tags, device.meta, aggregate_crc_errors, time.Now())
|
||||
if err == nil {
|
||||
y.AddTag("stype", "nvlink")
|
||||
output <- y
|
||||
@@ -1094,7 +1084,7 @@ func readNVLinkStats(device *NvidiaCollectorDevice, output chan lp.CCMessage) er
|
||||
}
|
||||
if !device.excludeMetrics["nv_nvlink_ecc_errors"] {
|
||||
// Data link receive data ECC error counter
|
||||
y, err := lp.NewMessage("nv_nvlink_ecc_errors_sum", device.tags, device.meta, map[string]interface{}{"value": aggregate_ecc_errors}, time.Now())
|
||||
y, err := lp.NewMetric("nv_nvlink_ecc_errors_sum", device.tags, device.meta, aggregate_ecc_errors, time.Now())
|
||||
if err == nil {
|
||||
y.AddTag("stype", "nvlink")
|
||||
output <- y
|
||||
@@ -1102,7 +1092,7 @@ func readNVLinkStats(device *NvidiaCollectorDevice, output chan lp.CCMessage) er
|
||||
}
|
||||
if !device.excludeMetrics["nv_nvlink_replay_errors"] {
|
||||
// Data link transmit replay error counter
|
||||
y, err := lp.NewMessage("nv_nvlink_replay_errors_sum", device.tags, device.meta, map[string]interface{}{"value": aggregate_replay_errors}, time.Now())
|
||||
y, err := lp.NewMetric("nv_nvlink_replay_errors_sum", device.tags, device.meta, aggregate_replay_errors, time.Now())
|
||||
if err == nil {
|
||||
y.AddTag("stype", "nvlink")
|
||||
output <- y
|
||||
@@ -1110,7 +1100,7 @@ func readNVLinkStats(device *NvidiaCollectorDevice, output chan lp.CCMessage) er
|
||||
}
|
||||
if !device.excludeMetrics["nv_nvlink_recovery_errors"] {
|
||||
// Data link transmit recovery error counter
|
||||
y, err := lp.NewMessage("nv_nvlink_recovery_errors_sum", device.tags, device.meta, map[string]interface{}{"value": aggregate_recovery_errors}, time.Now())
|
||||
y, err := lp.NewMetric("nv_nvlink_recovery_errors_sum", device.tags, device.meta, aggregate_recovery_errors, time.Now())
|
||||
if err == nil {
|
||||
y.AddTag("stype", "nvlink")
|
||||
output <- y
|
||||
@@ -1118,7 +1108,7 @@ func readNVLinkStats(device *NvidiaCollectorDevice, output chan lp.CCMessage) er
|
||||
}
|
||||
if !device.excludeMetrics["nv_nvlink_crc_flit_errors"] {
|
||||
// Data link receive flow control digit CRC error counter
|
||||
y, err := lp.NewMessage("nv_nvlink_crc_flit_errors_sum", device.tags, device.meta, map[string]interface{}{"value": aggregate_crc_flit_errors}, time.Now())
|
||||
y, err := lp.NewMetric("nv_nvlink_crc_flit_errors_sum", device.tags, device.meta, aggregate_crc_flit_errors, time.Now())
|
||||
if err == nil {
|
||||
y.AddTag("stype", "nvlink")
|
||||
output <- y
|
||||
@@ -1258,7 +1248,7 @@ func (m *NvidiaCollector) Read(interval time.Duration, output chan lp.CCMessage)
|
||||
}
|
||||
cclog.ComponentDebug(m.name, "Reading MIG devices for GPU", i)
|
||||
|
||||
for j := 0; j < maxMig; j++ {
|
||||
for j := range maxMig {
|
||||
mdev, ret := nvml.DeviceGetMigDeviceHandleByIndex(m.gpus[i].device, j)
|
||||
if ret != nvml.SUCCESS {
|
||||
continue
|
||||
@@ -1275,9 +1265,7 @@ func (m *NvidiaCollector) Read(interval time.Duration, output chan lp.CCMessage)
|
||||
meta: map[string]string{},
|
||||
excludeMetrics: excludeMetrics,
|
||||
}
|
||||
for k, v := range m.gpus[i].tags {
|
||||
migDevice.tags[k] = v
|
||||
}
|
||||
maps.Copy(migDevice.tags, m.gpus[i].tags)
|
||||
migDevice.tags["stype"] = "mig"
|
||||
if m.config.UseUuidForMigDevices {
|
||||
uuid, ret := nvml.DeviceGetUUID(mdev)
|
||||
@@ -1291,19 +1279,17 @@ func (m *NvidiaCollector) Read(interval time.Duration, output chan lp.CCMessage)
|
||||
if ret == nvml.SUCCESS {
|
||||
mname, ret := nvml.DeviceGetName(mdev)
|
||||
if ret == nvml.SUCCESS {
|
||||
x := strings.Replace(mname, name, "", -1)
|
||||
x = strings.Replace(x, "MIG", "", -1)
|
||||
x := strings.ReplaceAll(mname, name, "")
|
||||
x = strings.ReplaceAll(x, "MIG", "")
|
||||
x = strings.TrimSpace(x)
|
||||
migDevice.tags["stype-id"] = x
|
||||
}
|
||||
}
|
||||
}
|
||||
if _, ok := migDevice.tags["stype-id"]; !ok {
|
||||
migDevice.tags["stype-id"] = fmt.Sprintf("%d", j)
|
||||
}
|
||||
for k, v := range m.gpus[i].meta {
|
||||
migDevice.meta[k] = v
|
||||
migDevice.tags["stype-id"] = strconv.Itoa(j)
|
||||
}
|
||||
maps.Copy(migDevice.meta, m.gpus[i].meta)
|
||||
if _, ok := migDevice.meta["uuid"]; ok && !m.config.UseUuidForMigDevices {
|
||||
uuid, ret := nvml.DeviceGetUUID(mdev)
|
||||
if ret == nvml.SUCCESS {
|
||||
@@ -1319,7 +1305,9 @@ func (m *NvidiaCollector) Read(interval time.Duration, output chan lp.CCMessage)
|
||||
|
||||
func (m *NvidiaCollector) Close() {
|
||||
if m.init {
|
||||
nvml.Shutdown()
|
||||
if ret := nvml.Shutdown(); ret != nvml.SUCCESS {
|
||||
cclog.ComponentError(m.name, "nvml.Shutdown() not successful")
|
||||
}
|
||||
m.init = false
|
||||
}
|
||||
}
|
||||
|
||||
@@ -54,9 +54,10 @@ func (m *RAPLCollector) Init(config json.RawMessage) error {
|
||||
return nil
|
||||
}
|
||||
|
||||
var err error = nil
|
||||
m.name = "RAPLCollector"
|
||||
m.setup()
|
||||
if err := m.setup(); err != nil {
|
||||
return fmt.Errorf("%s Init(): setup() call failed: %w", m.name, err)
|
||||
}
|
||||
m.parallel = true
|
||||
m.meta = map[string]string{
|
||||
"source": m.name,
|
||||
@@ -66,7 +67,7 @@ func (m *RAPLCollector) Init(config json.RawMessage) error {
|
||||
|
||||
// Read in the JSON configuration
|
||||
if len(config) > 0 {
|
||||
err = json.Unmarshal(config, &m.config)
|
||||
err := json.Unmarshal(config, &m.config)
|
||||
if err != nil {
|
||||
cclog.ComponentError(m.name, "Error reading config:", err.Error())
|
||||
return err
|
||||
@@ -248,7 +249,7 @@ func (m *RAPLCollector) Read(interval time.Duration, output chan lp.CCMessage) {
|
||||
"rapl_average_power",
|
||||
p.tags,
|
||||
m.meta,
|
||||
map[string]interface{}{"value": averagePower},
|
||||
map[string]any{"value": averagePower},
|
||||
energyTimestamp)
|
||||
if err == nil {
|
||||
output <- y
|
||||
|
||||
@@ -11,6 +11,8 @@ import (
|
||||
"encoding/json"
|
||||
"errors"
|
||||
"fmt"
|
||||
"slices"
|
||||
"strconv"
|
||||
"time"
|
||||
|
||||
cclog "github.com/ClusterCockpit/cc-lib/v2/ccLogger"
|
||||
@@ -52,7 +54,9 @@ func (m *RocmSmiCollector) Init(config json.RawMessage) error {
|
||||
// Always set the name early in Init() to use it in cclog.Component* functions
|
||||
m.name = "RocmSmiCollector"
|
||||
// This is for later use, also call it early
|
||||
m.setup()
|
||||
if err := m.setup(); err != nil {
|
||||
return fmt.Errorf("%s Init(): setup() call failed: %w", m.name, err)
|
||||
}
|
||||
// Define meta information sent with each metric
|
||||
// (Can also be dynamic or this is the basic set with extension through AddMeta())
|
||||
//m.meta = map[string]string{"source": m.name, "group": "AMD"}
|
||||
@@ -85,22 +89,11 @@ func (m *RocmSmiCollector) Init(config json.RawMessage) error {
|
||||
return err
|
||||
}
|
||||
|
||||
exclDev := func(s string) bool {
|
||||
skip_device := false
|
||||
for _, excl := range m.config.ExcludeDevices {
|
||||
if excl == s {
|
||||
skip_device = true
|
||||
break
|
||||
}
|
||||
}
|
||||
return skip_device
|
||||
}
|
||||
|
||||
m.devices = make([]RocmSmiCollectorDevice, 0)
|
||||
|
||||
for i := 0; i < numDevs; i++ {
|
||||
str_i := fmt.Sprintf("%d", i)
|
||||
if exclDev(str_i) {
|
||||
for i := range numDevs {
|
||||
str_i := strconv.Itoa(i)
|
||||
if slices.Contains(m.config.ExcludeDevices, str_i) {
|
||||
continue
|
||||
}
|
||||
device, ret := rocm_smi.DeviceGetHandleByIndex(i)
|
||||
@@ -124,7 +117,7 @@ func (m *RocmSmiCollector) Init(config json.RawMessage) error {
|
||||
pciInfo.Device,
|
||||
pciInfo.Function)
|
||||
|
||||
if exclDev(pciId) {
|
||||
if slices.Contains(m.config.ExcludeDevices, pciId) {
|
||||
continue
|
||||
}
|
||||
|
||||
@@ -182,130 +175,130 @@ func (m *RocmSmiCollector) Read(interval time.Duration, output chan lp.CCMessage
|
||||
|
||||
if !dev.excludeMetrics["rocm_gfx_util"] {
|
||||
value := metrics.Average_gfx_activity
|
||||
y, err := lp.NewMessage("rocm_gfx_util", dev.tags, dev.meta, map[string]interface{}{"value": value}, timestamp)
|
||||
y, err := lp.NewMessage("rocm_gfx_util", dev.tags, dev.meta, map[string]any{"value": value}, timestamp)
|
||||
if err == nil {
|
||||
output <- y
|
||||
}
|
||||
}
|
||||
if !dev.excludeMetrics["rocm_umc_util"] {
|
||||
value := metrics.Average_umc_activity
|
||||
y, err := lp.NewMessage("rocm_umc_util", dev.tags, dev.meta, map[string]interface{}{"value": value}, timestamp)
|
||||
y, err := lp.NewMessage("rocm_umc_util", dev.tags, dev.meta, map[string]any{"value": value}, timestamp)
|
||||
if err == nil {
|
||||
output <- y
|
||||
}
|
||||
}
|
||||
if !dev.excludeMetrics["rocm_mm_util"] {
|
||||
value := metrics.Average_mm_activity
|
||||
y, err := lp.NewMessage("rocm_mm_util", dev.tags, dev.meta, map[string]interface{}{"value": value}, timestamp)
|
||||
y, err := lp.NewMessage("rocm_mm_util", dev.tags, dev.meta, map[string]any{"value": value}, timestamp)
|
||||
if err == nil {
|
||||
output <- y
|
||||
}
|
||||
}
|
||||
if !dev.excludeMetrics["rocm_avg_power"] {
|
||||
value := metrics.Average_socket_power
|
||||
y, err := lp.NewMessage("rocm_avg_power", dev.tags, dev.meta, map[string]interface{}{"value": value}, timestamp)
|
||||
y, err := lp.NewMessage("rocm_avg_power", dev.tags, dev.meta, map[string]any{"value": value}, timestamp)
|
||||
if err == nil {
|
||||
output <- y
|
||||
}
|
||||
}
|
||||
if !dev.excludeMetrics["rocm_temp_mem"] {
|
||||
value := metrics.Temperature_mem
|
||||
y, err := lp.NewMessage("rocm_temp_mem", dev.tags, dev.meta, map[string]interface{}{"value": value}, timestamp)
|
||||
y, err := lp.NewMessage("rocm_temp_mem", dev.tags, dev.meta, map[string]any{"value": value}, timestamp)
|
||||
if err == nil {
|
||||
output <- y
|
||||
}
|
||||
}
|
||||
if !dev.excludeMetrics["rocm_temp_hotspot"] {
|
||||
value := metrics.Temperature_hotspot
|
||||
y, err := lp.NewMessage("rocm_temp_hotspot", dev.tags, dev.meta, map[string]interface{}{"value": value}, timestamp)
|
||||
y, err := lp.NewMessage("rocm_temp_hotspot", dev.tags, dev.meta, map[string]any{"value": value}, timestamp)
|
||||
if err == nil {
|
||||
output <- y
|
||||
}
|
||||
}
|
||||
if !dev.excludeMetrics["rocm_temp_edge"] {
|
||||
value := metrics.Temperature_edge
|
||||
y, err := lp.NewMessage("rocm_temp_edge", dev.tags, dev.meta, map[string]interface{}{"value": value}, timestamp)
|
||||
y, err := lp.NewMessage("rocm_temp_edge", dev.tags, dev.meta, map[string]any{"value": value}, timestamp)
|
||||
if err == nil {
|
||||
output <- y
|
||||
}
|
||||
}
|
||||
if !dev.excludeMetrics["rocm_temp_vrgfx"] {
|
||||
value := metrics.Temperature_vrgfx
|
||||
y, err := lp.NewMessage("rocm_temp_vrgfx", dev.tags, dev.meta, map[string]interface{}{"value": value}, timestamp)
|
||||
y, err := lp.NewMessage("rocm_temp_vrgfx", dev.tags, dev.meta, map[string]any{"value": value}, timestamp)
|
||||
if err == nil {
|
||||
output <- y
|
||||
}
|
||||
}
|
||||
if !dev.excludeMetrics["rocm_temp_vrsoc"] {
|
||||
value := metrics.Temperature_vrsoc
|
||||
y, err := lp.NewMessage("rocm_temp_vrsoc", dev.tags, dev.meta, map[string]interface{}{"value": value}, timestamp)
|
||||
y, err := lp.NewMessage("rocm_temp_vrsoc", dev.tags, dev.meta, map[string]any{"value": value}, timestamp)
|
||||
if err == nil {
|
||||
output <- y
|
||||
}
|
||||
}
|
||||
if !dev.excludeMetrics["rocm_temp_vrmem"] {
|
||||
value := metrics.Temperature_vrmem
|
||||
y, err := lp.NewMessage("rocm_temp_vrmem", dev.tags, dev.meta, map[string]interface{}{"value": value}, timestamp)
|
||||
y, err := lp.NewMessage("rocm_temp_vrmem", dev.tags, dev.meta, map[string]any{"value": value}, timestamp)
|
||||
if err == nil {
|
||||
output <- y
|
||||
}
|
||||
}
|
||||
if !dev.excludeMetrics["rocm_gfx_clock"] {
|
||||
value := metrics.Average_gfxclk_frequency
|
||||
y, err := lp.NewMessage("rocm_gfx_clock", dev.tags, dev.meta, map[string]interface{}{"value": value}, timestamp)
|
||||
y, err := lp.NewMessage("rocm_gfx_clock", dev.tags, dev.meta, map[string]any{"value": value}, timestamp)
|
||||
if err == nil {
|
||||
output <- y
|
||||
}
|
||||
}
|
||||
if !dev.excludeMetrics["rocm_soc_clock"] {
|
||||
value := metrics.Average_socclk_frequency
|
||||
y, err := lp.NewMessage("rocm_soc_clock", dev.tags, dev.meta, map[string]interface{}{"value": value}, timestamp)
|
||||
y, err := lp.NewMessage("rocm_soc_clock", dev.tags, dev.meta, map[string]any{"value": value}, timestamp)
|
||||
if err == nil {
|
||||
output <- y
|
||||
}
|
||||
}
|
||||
if !dev.excludeMetrics["rocm_u_clock"] {
|
||||
value := metrics.Average_uclk_frequency
|
||||
y, err := lp.NewMessage("rocm_u_clock", dev.tags, dev.meta, map[string]interface{}{"value": value}, timestamp)
|
||||
y, err := lp.NewMessage("rocm_u_clock", dev.tags, dev.meta, map[string]any{"value": value}, timestamp)
|
||||
if err == nil {
|
||||
output <- y
|
||||
}
|
||||
}
|
||||
if !dev.excludeMetrics["rocm_v0_clock"] {
|
||||
value := metrics.Average_vclk0_frequency
|
||||
y, err := lp.NewMessage("rocm_v0_clock", dev.tags, dev.meta, map[string]interface{}{"value": value}, timestamp)
|
||||
y, err := lp.NewMessage("rocm_v0_clock", dev.tags, dev.meta, map[string]any{"value": value}, timestamp)
|
||||
if err == nil {
|
||||
output <- y
|
||||
}
|
||||
}
|
||||
if !dev.excludeMetrics["rocm_v1_clock"] {
|
||||
value := metrics.Average_vclk1_frequency
|
||||
y, err := lp.NewMessage("rocm_v1_clock", dev.tags, dev.meta, map[string]interface{}{"value": value}, timestamp)
|
||||
y, err := lp.NewMessage("rocm_v1_clock", dev.tags, dev.meta, map[string]any{"value": value}, timestamp)
|
||||
if err == nil {
|
||||
output <- y
|
||||
}
|
||||
}
|
||||
if !dev.excludeMetrics["rocm_d0_clock"] {
|
||||
value := metrics.Average_dclk0_frequency
|
||||
y, err := lp.NewMessage("rocm_d0_clock", dev.tags, dev.meta, map[string]interface{}{"value": value}, timestamp)
|
||||
y, err := lp.NewMessage("rocm_d0_clock", dev.tags, dev.meta, map[string]any{"value": value}, timestamp)
|
||||
if err == nil {
|
||||
output <- y
|
||||
}
|
||||
}
|
||||
if !dev.excludeMetrics["rocm_d1_clock"] {
|
||||
value := metrics.Average_dclk1_frequency
|
||||
y, err := lp.NewMessage("rocm_d1_clock", dev.tags, dev.meta, map[string]interface{}{"value": value}, timestamp)
|
||||
y, err := lp.NewMessage("rocm_d1_clock", dev.tags, dev.meta, map[string]any{"value": value}, timestamp)
|
||||
if err == nil {
|
||||
output <- y
|
||||
}
|
||||
}
|
||||
if !dev.excludeMetrics["rocm_temp_hbm"] {
|
||||
for i := 0; i < rocm_smi.NUM_HBM_INSTANCES; i++ {
|
||||
for i := range rocm_smi.NUM_HBM_INSTANCES {
|
||||
value := metrics.Temperature_hbm[i]
|
||||
y, err := lp.NewMessage("rocm_temp_hbm", dev.tags, dev.meta, map[string]interface{}{"value": value}, timestamp)
|
||||
y, err := lp.NewMessage("rocm_temp_hbm", dev.tags, dev.meta, map[string]any{"value": value}, timestamp)
|
||||
if err == nil {
|
||||
y.AddTag("stype", "device")
|
||||
y.AddTag("stype-id", fmt.Sprintf("%d", i))
|
||||
y.AddTag("stype-id", strconv.Itoa(i))
|
||||
output <- y
|
||||
}
|
||||
}
|
||||
|
||||
@@ -15,7 +15,9 @@ hugo_path: docs/reference/cc-metric-collector/collectors/rocmsmi.md
|
||||
```json
|
||||
"rocm_smi": {
|
||||
"exclude_devices": [
|
||||
"0","1", "0000000:ff:01.0"
|
||||
"0",
|
||||
"1",
|
||||
"0000000:ff:01.0"
|
||||
],
|
||||
"exclude_metrics": [
|
||||
"rocm_mm_util",
|
||||
@@ -23,7 +25,7 @@ hugo_path: docs/reference/cc-metric-collector/collectors/rocmsmi.md
|
||||
],
|
||||
"use_pci_info_as_type_id": true,
|
||||
"add_pci_info_tag": false,
|
||||
"add_serial_meta": false,
|
||||
"add_serial_meta": false
|
||||
}
|
||||
```
|
||||
|
||||
|
||||
@@ -9,6 +9,7 @@ package collectors
|
||||
|
||||
import (
|
||||
"encoding/json"
|
||||
"fmt"
|
||||
"time"
|
||||
|
||||
cclog "github.com/ClusterCockpit/cc-lib/v2/ccLogger"
|
||||
@@ -41,7 +42,9 @@ func (m *SampleCollector) Init(config json.RawMessage) error {
|
||||
// Always set the name early in Init() to use it in cclog.Component* functions
|
||||
m.name = "SampleCollector"
|
||||
// This is for later use, also call it early
|
||||
m.setup()
|
||||
if err := m.setup(); err != nil {
|
||||
return fmt.Errorf("%s Init(): setup() call failed: %w", m.name, err)
|
||||
}
|
||||
// Tell whether the collector should be run in parallel with others (reading files, ...)
|
||||
// or it should be run serially, mostly for collectors actually doing measurements
|
||||
// because they should not measure the execution of the other collectors
|
||||
@@ -92,7 +95,7 @@ func (m *SampleCollector) Read(interval time.Duration, output chan lp.CCMessage)
|
||||
// stop := readState()
|
||||
// value = (stop - start) / interval.Seconds()
|
||||
|
||||
y, err := lp.NewMessage("sample_metric", m.tags, m.meta, map[string]interface{}{"value": value}, timestamp)
|
||||
y, err := lp.NewMessage("sample_metric", m.tags, m.meta, map[string]any{"value": value}, timestamp)
|
||||
if err == nil {
|
||||
// Send it to output channel
|
||||
output <- y
|
||||
|
||||
@@ -9,6 +9,7 @@ package collectors
|
||||
|
||||
import (
|
||||
"encoding/json"
|
||||
"fmt"
|
||||
"sync"
|
||||
"time"
|
||||
|
||||
@@ -36,11 +37,13 @@ type SampleTimerCollector struct {
|
||||
}
|
||||
|
||||
func (m *SampleTimerCollector) Init(name string, config json.RawMessage) error {
|
||||
var err error = nil
|
||||
var err error
|
||||
// Always set the name early in Init() to use it in cclog.Component* functions
|
||||
m.name = "SampleTimerCollector"
|
||||
// This is for later use, also call it early
|
||||
m.setup()
|
||||
if err = m.setup(); err != nil {
|
||||
return fmt.Errorf("%s Init(): setup() call failed: %w", m.name, err)
|
||||
}
|
||||
// Define meta information sent with each metric
|
||||
// (Can also be dynamic or this is the basic set with extension through AddMeta())
|
||||
m.meta = map[string]string{"source": m.name, "group": "SAMPLE"}
|
||||
@@ -107,7 +110,7 @@ func (m *SampleTimerCollector) ReadMetrics(timestamp time.Time) {
|
||||
// stop := readState()
|
||||
// value = (stop - start) / interval.Seconds()
|
||||
|
||||
y, err := lp.NewMessage("sample_metric", m.tags, m.meta, map[string]interface{}{"value": value}, timestamp)
|
||||
y, err := lp.NewMessage("sample_metric", m.tags, m.meta, map[string]any{"value": value}, timestamp)
|
||||
if err == nil && m.output != nil {
|
||||
// Send it to output channel if we have a valid channel
|
||||
m.output <- y
|
||||
|
||||
@@ -11,7 +11,6 @@ import (
|
||||
"bufio"
|
||||
"encoding/json"
|
||||
"fmt"
|
||||
"math"
|
||||
"os"
|
||||
"strconv"
|
||||
"strings"
|
||||
@@ -47,37 +46,37 @@ type SchedstatCollector struct {
|
||||
// Called once by the collector manager
|
||||
// All tags, meta data tags and metrics that do not change over the runtime should be set here
|
||||
func (m *SchedstatCollector) Init(config json.RawMessage) error {
|
||||
var err error = nil
|
||||
// Always set the name early in Init() to use it in cclog.Component* functions
|
||||
m.name = "SchedstatCollector"
|
||||
// This is for later use, also call it early
|
||||
m.setup()
|
||||
if err := m.setup(); err != nil {
|
||||
return fmt.Errorf("%s Init(): setup() call failed: %w", m.name, err)
|
||||
}
|
||||
// Tell whether the collector should be run in parallel with others (reading files, ...)
|
||||
// or it should be run serially, mostly for collectors acutally doing measurements
|
||||
// or it should be run serially, mostly for collectors actually doing measurements
|
||||
// because they should not measure the execution of the other collectors
|
||||
m.parallel = true
|
||||
// Define meta information sent with each metric
|
||||
// (Can also be dynamic or this is the basic set with extension through AddMeta())
|
||||
m.meta = map[string]string{"source": m.name, "group": "SCHEDSTAT"}
|
||||
m.meta = map[string]string{
|
||||
"source": m.name,
|
||||
"group": "SCHEDSTAT",
|
||||
}
|
||||
|
||||
// Read in the JSON configuration
|
||||
if len(config) > 0 {
|
||||
err = json.Unmarshal(config, &m.config)
|
||||
if err != nil {
|
||||
cclog.ComponentError(m.name, "Error reading config:", err.Error())
|
||||
return err
|
||||
if err := json.Unmarshal(config, &m.config); err != nil {
|
||||
return fmt.Errorf("%s Init(): Error reading config: %w", m.name, err)
|
||||
}
|
||||
}
|
||||
|
||||
// Check input file
|
||||
file, err := os.Open(string(SCHEDSTATFILE))
|
||||
file, err := os.Open(SCHEDSTATFILE)
|
||||
if err != nil {
|
||||
cclog.ComponentError(m.name, err.Error())
|
||||
return fmt.Errorf("%s Init(): Failed opening scheduler statistics file \"%s\": %w", m.name, SCHEDSTATFILE, err)
|
||||
}
|
||||
defer file.Close()
|
||||
|
||||
// Pre-generate tags for all CPUs
|
||||
num_cpus := 0
|
||||
m.cputags = make(map[string]map[string]string)
|
||||
m.olddata = make(map[string]map[string]int64)
|
||||
scanner := bufio.NewScanner(file)
|
||||
@@ -89,11 +88,19 @@ func (m *SchedstatCollector) Init(config json.RawMessage) error {
|
||||
cpu, _ := strconv.Atoi(cpustr)
|
||||
running, _ := strconv.ParseInt(linefields[7], 10, 64)
|
||||
waiting, _ := strconv.ParseInt(linefields[8], 10, 64)
|
||||
m.cputags[linefields[0]] = map[string]string{"type": "hwthread", "type-id": fmt.Sprintf("%d", cpu)}
|
||||
m.olddata[linefields[0]] = map[string]int64{"running": running, "waiting": waiting}
|
||||
num_cpus++
|
||||
m.cputags[linefields[0]] = map[string]string{
|
||||
"type": "hwthread",
|
||||
"type-id": strconv.Itoa(cpu),
|
||||
}
|
||||
m.olddata[linefields[0]] = map[string]int64{
|
||||
"running": running,
|
||||
"waiting": waiting,
|
||||
}
|
||||
}
|
||||
}
|
||||
if err := file.Close(); err != nil {
|
||||
return fmt.Errorf("%s Init(): Failed closing scheduler statistics file \"%s\": %w", m.name, SCHEDSTATFILE, err)
|
||||
}
|
||||
|
||||
// Save current timestamp
|
||||
m.lastTimestamp = time.Now()
|
||||
@@ -109,14 +116,14 @@ func (m *SchedstatCollector) ParseProcLine(linefields []string, tags map[string]
|
||||
diff_running := running - m.olddata[linefields[0]]["running"]
|
||||
diff_waiting := waiting - m.olddata[linefields[0]]["waiting"]
|
||||
|
||||
var l_running float64 = float64(diff_running) / tsdelta.Seconds() / (math.Pow(1000, 3))
|
||||
var l_waiting float64 = float64(diff_waiting) / tsdelta.Seconds() / (math.Pow(1000, 3))
|
||||
l_running := float64(diff_running) / tsdelta.Seconds() / 1000_000_000
|
||||
l_waiting := float64(diff_waiting) / tsdelta.Seconds() / 1000_000_000
|
||||
|
||||
m.olddata[linefields[0]]["running"] = running
|
||||
m.olddata[linefields[0]]["waiting"] = waiting
|
||||
value := l_running + l_waiting
|
||||
|
||||
y, err := lp.NewMessage("cpu_load_core", tags, m.meta, map[string]interface{}{"value": value}, now)
|
||||
y, err := lp.NewMessage("cpu_load_core", tags, m.meta, map[string]any{"value": value}, now)
|
||||
if err == nil {
|
||||
// Send it to output channel
|
||||
output <- y
|
||||
@@ -134,11 +141,19 @@ func (m *SchedstatCollector) Read(interval time.Duration, output chan lp.CCMessa
|
||||
now := time.Now()
|
||||
tsdelta := now.Sub(m.lastTimestamp)
|
||||
|
||||
file, err := os.Open(string(SCHEDSTATFILE))
|
||||
file, err := os.Open(SCHEDSTATFILE)
|
||||
if err != nil {
|
||||
cclog.ComponentError(m.name, err.Error())
|
||||
cclog.ComponentError(
|
||||
m.name,
|
||||
fmt.Sprintf("Read(): Failed to open file '%s': %v", SCHEDSTATFILE, err))
|
||||
}
|
||||
defer file.Close()
|
||||
defer func() {
|
||||
if err := file.Close(); err != nil {
|
||||
cclog.ComponentError(
|
||||
m.name,
|
||||
fmt.Sprintf("Read(): Failed to close file '%s': %v", SCHEDSTATFILE, err))
|
||||
}
|
||||
}()
|
||||
|
||||
scanner := bufio.NewScanner(file)
|
||||
for scanner.Scan() {
|
||||
|
||||
@@ -9,6 +9,7 @@ package collectors
|
||||
|
||||
import (
|
||||
"encoding/json"
|
||||
"fmt"
|
||||
"runtime"
|
||||
"syscall"
|
||||
"time"
|
||||
@@ -34,7 +35,9 @@ type SelfCollector struct {
|
||||
func (m *SelfCollector) Init(config json.RawMessage) error {
|
||||
var err error = nil
|
||||
m.name = "SelfCollector"
|
||||
m.setup()
|
||||
if err := m.setup(); err != nil {
|
||||
return fmt.Errorf("%s Init(): setup() call failed: %w", m.name, err)
|
||||
}
|
||||
m.parallel = true
|
||||
m.meta = map[string]string{"source": m.name, "group": "Self"}
|
||||
m.tags = map[string]string{"type": "node"}
|
||||
@@ -56,49 +59,49 @@ func (m *SelfCollector) Read(interval time.Duration, output chan lp.CCMessage) {
|
||||
var memstats runtime.MemStats
|
||||
runtime.ReadMemStats(&memstats)
|
||||
|
||||
y, err := lp.NewMessage("total_alloc", m.tags, m.meta, map[string]interface{}{"value": memstats.TotalAlloc}, timestamp)
|
||||
y, err := lp.NewMessage("total_alloc", m.tags, m.meta, map[string]any{"value": memstats.TotalAlloc}, timestamp)
|
||||
if err == nil {
|
||||
y.AddMeta("unit", "Bytes")
|
||||
output <- y
|
||||
}
|
||||
y, err = lp.NewMessage("heap_alloc", m.tags, m.meta, map[string]interface{}{"value": memstats.HeapAlloc}, timestamp)
|
||||
y, err = lp.NewMessage("heap_alloc", m.tags, m.meta, map[string]any{"value": memstats.HeapAlloc}, timestamp)
|
||||
if err == nil {
|
||||
y.AddMeta("unit", "Bytes")
|
||||
output <- y
|
||||
}
|
||||
y, err = lp.NewMessage("heap_sys", m.tags, m.meta, map[string]interface{}{"value": memstats.HeapSys}, timestamp)
|
||||
y, err = lp.NewMessage("heap_sys", m.tags, m.meta, map[string]any{"value": memstats.HeapSys}, timestamp)
|
||||
if err == nil {
|
||||
y.AddMeta("unit", "Bytes")
|
||||
output <- y
|
||||
}
|
||||
y, err = lp.NewMessage("heap_idle", m.tags, m.meta, map[string]interface{}{"value": memstats.HeapIdle}, timestamp)
|
||||
y, err = lp.NewMessage("heap_idle", m.tags, m.meta, map[string]any{"value": memstats.HeapIdle}, timestamp)
|
||||
if err == nil {
|
||||
y.AddMeta("unit", "Bytes")
|
||||
output <- y
|
||||
}
|
||||
y, err = lp.NewMessage("heap_inuse", m.tags, m.meta, map[string]interface{}{"value": memstats.HeapInuse}, timestamp)
|
||||
y, err = lp.NewMessage("heap_inuse", m.tags, m.meta, map[string]any{"value": memstats.HeapInuse}, timestamp)
|
||||
if err == nil {
|
||||
y.AddMeta("unit", "Bytes")
|
||||
output <- y
|
||||
}
|
||||
y, err = lp.NewMessage("heap_released", m.tags, m.meta, map[string]interface{}{"value": memstats.HeapReleased}, timestamp)
|
||||
y, err = lp.NewMessage("heap_released", m.tags, m.meta, map[string]any{"value": memstats.HeapReleased}, timestamp)
|
||||
if err == nil {
|
||||
y.AddMeta("unit", "Bytes")
|
||||
output <- y
|
||||
}
|
||||
y, err = lp.NewMessage("heap_objects", m.tags, m.meta, map[string]interface{}{"value": memstats.HeapObjects}, timestamp)
|
||||
y, err = lp.NewMessage("heap_objects", m.tags, m.meta, map[string]any{"value": memstats.HeapObjects}, timestamp)
|
||||
if err == nil {
|
||||
output <- y
|
||||
}
|
||||
}
|
||||
if m.config.GoRoutines {
|
||||
y, err := lp.NewMessage("num_goroutines", m.tags, m.meta, map[string]interface{}{"value": runtime.NumGoroutine()}, timestamp)
|
||||
y, err := lp.NewMessage("num_goroutines", m.tags, m.meta, map[string]any{"value": runtime.NumGoroutine()}, timestamp)
|
||||
if err == nil {
|
||||
output <- y
|
||||
}
|
||||
}
|
||||
if m.config.CgoCalls {
|
||||
y, err := lp.NewMessage("num_cgo_calls", m.tags, m.meta, map[string]interface{}{"value": runtime.NumCgoCall()}, timestamp)
|
||||
y, err := lp.NewMessage("num_cgo_calls", m.tags, m.meta, map[string]any{"value": runtime.NumCgoCall()}, timestamp)
|
||||
if err == nil {
|
||||
output <- y
|
||||
}
|
||||
@@ -109,35 +112,35 @@ func (m *SelfCollector) Read(interval time.Duration, output chan lp.CCMessage) {
|
||||
if err == nil {
|
||||
sec, nsec := rusage.Utime.Unix()
|
||||
t := float64(sec) + (float64(nsec) * 1e-9)
|
||||
y, err := lp.NewMessage("rusage_user_time", m.tags, m.meta, map[string]interface{}{"value": t}, timestamp)
|
||||
y, err := lp.NewMessage("rusage_user_time", m.tags, m.meta, map[string]any{"value": t}, timestamp)
|
||||
if err == nil {
|
||||
y.AddMeta("unit", "seconds")
|
||||
output <- y
|
||||
}
|
||||
sec, nsec = rusage.Stime.Unix()
|
||||
t = float64(sec) + (float64(nsec) * 1e-9)
|
||||
y, err = lp.NewMessage("rusage_system_time", m.tags, m.meta, map[string]interface{}{"value": t}, timestamp)
|
||||
y, err = lp.NewMessage("rusage_system_time", m.tags, m.meta, map[string]any{"value": t}, timestamp)
|
||||
if err == nil {
|
||||
y.AddMeta("unit", "seconds")
|
||||
output <- y
|
||||
}
|
||||
y, err = lp.NewMessage("rusage_vol_ctx_switch", m.tags, m.meta, map[string]interface{}{"value": rusage.Nvcsw}, timestamp)
|
||||
y, err = lp.NewMessage("rusage_vol_ctx_switch", m.tags, m.meta, map[string]any{"value": rusage.Nvcsw}, timestamp)
|
||||
if err == nil {
|
||||
output <- y
|
||||
}
|
||||
y, err = lp.NewMessage("rusage_invol_ctx_switch", m.tags, m.meta, map[string]interface{}{"value": rusage.Nivcsw}, timestamp)
|
||||
y, err = lp.NewMessage("rusage_invol_ctx_switch", m.tags, m.meta, map[string]any{"value": rusage.Nivcsw}, timestamp)
|
||||
if err == nil {
|
||||
output <- y
|
||||
}
|
||||
y, err = lp.NewMessage("rusage_signals", m.tags, m.meta, map[string]interface{}{"value": rusage.Nsignals}, timestamp)
|
||||
y, err = lp.NewMessage("rusage_signals", m.tags, m.meta, map[string]any{"value": rusage.Nsignals}, timestamp)
|
||||
if err == nil {
|
||||
output <- y
|
||||
}
|
||||
y, err = lp.NewMessage("rusage_major_pgfaults", m.tags, m.meta, map[string]interface{}{"value": rusage.Majflt}, timestamp)
|
||||
y, err = lp.NewMessage("rusage_major_pgfaults", m.tags, m.meta, map[string]any{"value": rusage.Majflt}, timestamp)
|
||||
if err == nil {
|
||||
output <- y
|
||||
}
|
||||
y, err = lp.NewMessage("rusage_minor_pgfaults", m.tags, m.meta, map[string]interface{}{"value": rusage.Minflt}, timestamp)
|
||||
y, err = lp.NewMessage("rusage_minor_pgfaults", m.tags, m.meta, map[string]any{"value": rusage.Minflt}, timestamp)
|
||||
if err == nil {
|
||||
output <- y
|
||||
}
|
||||
|
||||
@@ -50,8 +50,7 @@ func ParseCPUs(cpuset string) ([]int, error) {
|
||||
return result, nil
|
||||
}
|
||||
|
||||
ranges := strings.Split(cpuset, ",")
|
||||
for _, r := range ranges {
|
||||
for r := range strings.SplitSeq(cpuset, ",") {
|
||||
if strings.Contains(r, "-") {
|
||||
parts := strings.Split(r, "-")
|
||||
if len(parts) != 2 {
|
||||
@@ -80,9 +79,10 @@ func ParseCPUs(cpuset string) ([]int, error) {
|
||||
}
|
||||
|
||||
func GetAllCPUs() ([]int, error) {
|
||||
data, err := os.ReadFile("/sys/devices/system/cpu/online")
|
||||
cpuOnline := "/sys/devices/system/cpu/online"
|
||||
data, err := os.ReadFile(cpuOnline)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("failed to read /sys/devices/system/cpu/online: %v", err)
|
||||
return nil, fmt.Errorf("failed to read file \"%s\": %w", cpuOnline, err)
|
||||
}
|
||||
return ParseCPUs(strings.TrimSpace(string(data)))
|
||||
}
|
||||
@@ -103,18 +103,22 @@ func (m *SlurmCgroupCollector) readFile(path string) ([]byte, error) {
|
||||
func (m *SlurmCgroupCollector) Init(config json.RawMessage) error {
|
||||
var err error
|
||||
m.name = "SlurmCgroupCollector"
|
||||
m.setup()
|
||||
if err := m.setup(); err != nil {
|
||||
return fmt.Errorf("%s Init(): setup() call failed: %w", m.name, err)
|
||||
}
|
||||
m.parallel = true
|
||||
m.meta = map[string]string{"source": m.name, "group": "SLURM"}
|
||||
m.tags = map[string]string{"type": "hwthread"}
|
||||
m.meta = map[string]string{
|
||||
"source": m.name,
|
||||
"group": "SLURM"}
|
||||
m.tags = map[string]string{
|
||||
"type": "hwthread"}
|
||||
m.cpuUsed = make(map[int]bool)
|
||||
m.cgroupBase = defaultCgroupBase
|
||||
|
||||
if len(config) > 0 {
|
||||
err = json.Unmarshal(config, &m.config)
|
||||
if err != nil {
|
||||
cclog.ComponentError(m.name, "Error reading config:", err.Error())
|
||||
return err
|
||||
return fmt.Errorf("%s Init(): Error reading JSON config: %w", m.name, err)
|
||||
}
|
||||
m.excludeMetrics = make(map[string]struct{})
|
||||
for _, metric := range m.config.ExcludeMetrics {
|
||||
@@ -129,19 +133,16 @@ func (m *SlurmCgroupCollector) Init(config json.RawMessage) error {
|
||||
if !m.useSudo {
|
||||
user, err := user.Current()
|
||||
if err != nil {
|
||||
cclog.ComponentError(m.name, "Failed to get current user:", err.Error())
|
||||
return err
|
||||
return fmt.Errorf("%s Init(): Failed to get current user: %w", m.name, err)
|
||||
}
|
||||
if user.Uid != "0" {
|
||||
cclog.ComponentError(m.name, "Reading cgroup files requires root privileges (or enable use_sudo in config)")
|
||||
return fmt.Errorf("not root")
|
||||
return fmt.Errorf("%s Init(): Reading cgroup files requires root privileges (or enable use_sudo in config)", m.name)
|
||||
}
|
||||
}
|
||||
|
||||
m.allCPUs, err = GetAllCPUs()
|
||||
if err != nil {
|
||||
cclog.ComponentError(m.name, "Error reading online CPUs:", err.Error())
|
||||
return err
|
||||
return fmt.Errorf("%s Init(): Error reading online CPUs: %w", m.name, err)
|
||||
}
|
||||
|
||||
m.init = true
|
||||
@@ -158,7 +159,9 @@ func (m *SlurmCgroupCollector) ReadJobData(jobdir string) (SlurmJobData, error)
|
||||
CpuSet: []int{},
|
||||
}
|
||||
|
||||
cg := func(f string) string { return filepath.Join(m.cgroupBase, jobdir, f) }
|
||||
cg := func(f string) string {
|
||||
return filepath.Join(m.cgroupBase, jobdir, f)
|
||||
}
|
||||
|
||||
memUsage, err := m.readFile(cg("memory.current"))
|
||||
if err == nil {
|
||||
@@ -207,8 +210,8 @@ func (m *SlurmCgroupCollector) ReadJobData(jobdir string) (SlurmJobData, error)
|
||||
}
|
||||
}
|
||||
if usageUsec > 0 {
|
||||
jobdata.CpuUsageUser = (userUsec * 100 / usageUsec)
|
||||
jobdata.CpuUsageSys = (systemUsec * 100 / usageUsec)
|
||||
jobdata.CpuUsageUser = (userUsec * 100.0 / usageUsec)
|
||||
jobdata.CpuUsageSys = (systemUsec * 100.0 / usageUsec)
|
||||
}
|
||||
}
|
||||
|
||||
@@ -251,12 +254,18 @@ func (m *SlurmCgroupCollector) Read(interval time.Duration, output chan lp.CCMes
|
||||
for _, cpu := range jobdata.CpuSet {
|
||||
coreTags := map[string]string{
|
||||
"type": "hwthread",
|
||||
"type-id": fmt.Sprintf("%d", cpu),
|
||||
"type-id": strconv.Itoa(cpu),
|
||||
}
|
||||
|
||||
if coreCount > 0 && !m.isExcluded("job_mem_used") {
|
||||
memPerCore := jobdata.MemoryUsage / coreCount
|
||||
if y, err := lp.NewMessage("job_mem_used", coreTags, m.meta, map[string]interface{}{"value": memPerCore}, timestamp); err == nil {
|
||||
if y, err := lp.NewMessage(
|
||||
"job_mem_used",
|
||||
coreTags,
|
||||
m.meta,
|
||||
map[string]any{
|
||||
"value": memPerCore},
|
||||
timestamp); err == nil {
|
||||
y.AddMeta("unit", "Bytes")
|
||||
output <- y
|
||||
}
|
||||
@@ -264,7 +273,13 @@ func (m *SlurmCgroupCollector) Read(interval time.Duration, output chan lp.CCMes
|
||||
|
||||
if coreCount > 0 && !m.isExcluded("job_max_mem_used") {
|
||||
maxMemPerCore := jobdata.MaxMemoryUsage / coreCount
|
||||
if y, err := lp.NewMessage("job_max_mem_used", coreTags, m.meta, map[string]interface{}{"value": maxMemPerCore}, timestamp); err == nil {
|
||||
if y, err := lp.NewMessage(
|
||||
"job_max_mem_used",
|
||||
coreTags,
|
||||
m.meta,
|
||||
map[string]any{
|
||||
"value": maxMemPerCore},
|
||||
timestamp); err == nil {
|
||||
y.AddMeta("unit", "Bytes")
|
||||
output <- y
|
||||
}
|
||||
@@ -272,7 +287,13 @@ func (m *SlurmCgroupCollector) Read(interval time.Duration, output chan lp.CCMes
|
||||
|
||||
if coreCount > 0 && !m.isExcluded("job_mem_limit") {
|
||||
limitPerCore := jobdata.LimitMemoryUsage / coreCount
|
||||
if y, err := lp.NewMessage("job_mem_limit", coreTags, m.meta, map[string]interface{}{"value": limitPerCore}, timestamp); err == nil {
|
||||
if y, err := lp.NewMessage(
|
||||
"job_mem_limit",
|
||||
coreTags,
|
||||
m.meta,
|
||||
map[string]any{
|
||||
"value": limitPerCore},
|
||||
timestamp); err == nil {
|
||||
y.AddMeta("unit", "Bytes")
|
||||
output <- y
|
||||
}
|
||||
@@ -280,7 +301,13 @@ func (m *SlurmCgroupCollector) Read(interval time.Duration, output chan lp.CCMes
|
||||
|
||||
if coreCount > 0 && !m.isExcluded("job_user_cpu") {
|
||||
cpuUserPerCore := jobdata.CpuUsageUser / coreCount
|
||||
if y, err := lp.NewMessage("job_user_cpu", coreTags, m.meta, map[string]interface{}{"value": cpuUserPerCore}, timestamp); err == nil {
|
||||
if y, err := lp.NewMessage(
|
||||
"job_user_cpu",
|
||||
coreTags,
|
||||
m.meta,
|
||||
map[string]any{
|
||||
"value": cpuUserPerCore},
|
||||
timestamp); err == nil {
|
||||
y.AddMeta("unit", "%")
|
||||
output <- y
|
||||
}
|
||||
@@ -288,7 +315,13 @@ func (m *SlurmCgroupCollector) Read(interval time.Duration, output chan lp.CCMes
|
||||
|
||||
if coreCount > 0 && !m.isExcluded("job_sys_cpu") {
|
||||
cpuSysPerCore := jobdata.CpuUsageSys / coreCount
|
||||
if y, err := lp.NewMessage("job_sys_cpu", coreTags, m.meta, map[string]interface{}{"value": cpuSysPerCore}, timestamp); err == nil {
|
||||
if y, err := lp.NewMessage(
|
||||
"job_sys_cpu",
|
||||
coreTags,
|
||||
m.meta,
|
||||
map[string]any{
|
||||
"value": cpuSysPerCore},
|
||||
timestamp); err == nil {
|
||||
y.AddMeta("unit", "%")
|
||||
output <- y
|
||||
}
|
||||
@@ -303,39 +336,57 @@ func (m *SlurmCgroupCollector) Read(interval time.Duration, output chan lp.CCMes
|
||||
if !m.cpuUsed[cpu] {
|
||||
coreTags := map[string]string{
|
||||
"type": "hwthread",
|
||||
"type-id": fmt.Sprintf("%d", cpu),
|
||||
"type-id": strconv.Itoa(cpu),
|
||||
}
|
||||
|
||||
if !m.isExcluded("job_mem_used") {
|
||||
if y, err := lp.NewMessage("job_mem_used", coreTags, m.meta, map[string]interface{}{"value": 0}, timestamp); err == nil {
|
||||
if y, err := lp.NewMessage(
|
||||
"job_mem_used",
|
||||
coreTags,
|
||||
m.meta,
|
||||
map[string]any{
|
||||
"value": 0},
|
||||
timestamp); err == nil {
|
||||
y.AddMeta("unit", "Bytes")
|
||||
output <- y
|
||||
}
|
||||
}
|
||||
|
||||
if !m.isExcluded("job_max_mem_used") {
|
||||
if y, err := lp.NewMessage("job_max_mem_used", coreTags, m.meta, map[string]interface{}{"value": 0}, timestamp); err == nil {
|
||||
if y, err := lp.NewMessage(
|
||||
"job_max_mem_used",
|
||||
coreTags,
|
||||
m.meta,
|
||||
map[string]any{
|
||||
"value": 0},
|
||||
timestamp); err == nil {
|
||||
y.AddMeta("unit", "Bytes")
|
||||
output <- y
|
||||
}
|
||||
}
|
||||
|
||||
if !m.isExcluded("job_mem_limit") {
|
||||
if y, err := lp.NewMessage("job_mem_limit", coreTags, m.meta, map[string]interface{}{"value": 0}, timestamp); err == nil {
|
||||
if y, err := lp.NewMessage(
|
||||
"job_mem_limit",
|
||||
coreTags,
|
||||
m.meta,
|
||||
map[string]any{
|
||||
"value": 0},
|
||||
timestamp); err == nil {
|
||||
y.AddMeta("unit", "Bytes")
|
||||
output <- y
|
||||
}
|
||||
}
|
||||
|
||||
if !m.isExcluded("job_user_cpu") {
|
||||
if y, err := lp.NewMessage("job_user_cpu", coreTags, m.meta, map[string]interface{}{"value": 0}, timestamp); err == nil {
|
||||
if y, err := lp.NewMessage("job_user_cpu", coreTags, m.meta, map[string]any{"value": 0}, timestamp); err == nil {
|
||||
y.AddMeta("unit", "%")
|
||||
output <- y
|
||||
}
|
||||
}
|
||||
|
||||
if !m.isExcluded("job_sys_cpu") {
|
||||
if y, err := lp.NewMessage("job_sys_cpu", coreTags, m.meta, map[string]interface{}{"value": 0}, timestamp); err == nil {
|
||||
if y, err := lp.NewMessage("job_sys_cpu", coreTags, m.meta, map[string]any{"value": 0}, timestamp); err == nil {
|
||||
y.AddMeta("unit", "%")
|
||||
output <- y
|
||||
}
|
||||
|
||||
@@ -58,11 +58,13 @@ func (m *TempCollector) Init(config json.RawMessage) error {
|
||||
|
||||
m.name = "TempCollector"
|
||||
m.parallel = true
|
||||
m.setup()
|
||||
if err := m.setup(); err != nil {
|
||||
return fmt.Errorf("%s Init(): setup() call failed: %w", m.name, err)
|
||||
}
|
||||
if len(config) > 0 {
|
||||
err := json.Unmarshal(config, &m.config)
|
||||
if err != nil {
|
||||
return err
|
||||
return fmt.Errorf("%s Init(): failed to unmarshal JSON config: %w", m.name, err)
|
||||
}
|
||||
}
|
||||
|
||||
@@ -78,10 +80,10 @@ func (m *TempCollector) Init(config json.RawMessage) error {
|
||||
globPattern := filepath.Join("/sys/class/hwmon", "*", "temp*_input")
|
||||
inputFiles, err := filepath.Glob(globPattern)
|
||||
if err != nil {
|
||||
return fmt.Errorf("unable to glob files with pattern '%s': %v", globPattern, err)
|
||||
return fmt.Errorf("%s Init(): unable to glob files with pattern '%s': %w", m.name, globPattern, err)
|
||||
}
|
||||
if inputFiles == nil {
|
||||
return fmt.Errorf("unable to find any files with pattern '%s'", globPattern)
|
||||
return fmt.Errorf("%s Init(): unable to find any files with pattern '%s'", m.name, globPattern)
|
||||
}
|
||||
|
||||
// Get sensor name for each temperature sensor file
|
||||
@@ -117,7 +119,7 @@ func (m *TempCollector) Init(config json.RawMessage) error {
|
||||
sensor.metricName = sensor.label
|
||||
}
|
||||
sensor.metricName = strings.ToLower(sensor.metricName)
|
||||
sensor.metricName = strings.Replace(sensor.metricName, " ", "_", -1)
|
||||
sensor.metricName = strings.ReplaceAll(sensor.metricName, " ", "_")
|
||||
// Add temperature prefix, if required
|
||||
if !strings.Contains(sensor.metricName, "temp") {
|
||||
sensor.metricName = "temp_" + sensor.metricName
|
||||
@@ -170,7 +172,7 @@ func (m *TempCollector) Init(config json.RawMessage) error {
|
||||
|
||||
// Empty sensors map
|
||||
if len(m.sensors) == 0 {
|
||||
return fmt.Errorf("no temperature sensors found")
|
||||
return fmt.Errorf("%s Init(): no temperature sensors found", m.name)
|
||||
}
|
||||
|
||||
// Finished initialization
|
||||
@@ -201,7 +203,7 @@ func (m *TempCollector) Read(interval time.Duration, output chan lp.CCMessage) {
|
||||
sensor.metricName,
|
||||
sensor.tags,
|
||||
m.meta,
|
||||
map[string]interface{}{"value": x},
|
||||
map[string]any{"value": x},
|
||||
time.Now(),
|
||||
)
|
||||
if err == nil {
|
||||
@@ -214,7 +216,7 @@ func (m *TempCollector) Read(interval time.Duration, output chan lp.CCMessage) {
|
||||
sensor.maxTempName,
|
||||
sensor.tags,
|
||||
m.meta,
|
||||
map[string]interface{}{"value": sensor.maxTemp},
|
||||
map[string]any{"value": sensor.maxTemp},
|
||||
time.Now(),
|
||||
)
|
||||
if err == nil {
|
||||
@@ -228,7 +230,7 @@ func (m *TempCollector) Read(interval time.Duration, output chan lp.CCMessage) {
|
||||
sensor.critTempName,
|
||||
sensor.tags,
|
||||
m.meta,
|
||||
map[string]interface{}{"value": sensor.critTemp},
|
||||
map[string]any{"value": sensor.critTemp},
|
||||
time.Now(),
|
||||
)
|
||||
if err == nil {
|
||||
|
||||
@@ -9,13 +9,12 @@ package collectors
|
||||
|
||||
import (
|
||||
"encoding/json"
|
||||
"errors"
|
||||
"fmt"
|
||||
"log"
|
||||
"os/exec"
|
||||
"strings"
|
||||
"time"
|
||||
|
||||
cclog "github.com/ClusterCockpit/cc-lib/v2/ccLogger"
|
||||
lp "github.com/ClusterCockpit/cc-lib/v2/ccMessage"
|
||||
)
|
||||
|
||||
@@ -36,12 +35,17 @@ func (m *TopProcsCollector) Init(config json.RawMessage) error {
|
||||
var err error
|
||||
m.name = "TopProcsCollector"
|
||||
m.parallel = true
|
||||
m.tags = map[string]string{"type": "node"}
|
||||
m.meta = map[string]string{"source": m.name, "group": "TopProcs"}
|
||||
m.tags = map[string]string{
|
||||
"type": "node",
|
||||
}
|
||||
m.meta = map[string]string{
|
||||
"source": m.name,
|
||||
"group": "TopProcs",
|
||||
}
|
||||
if len(config) > 0 {
|
||||
err = json.Unmarshal(config, &m.config)
|
||||
if err != nil {
|
||||
return err
|
||||
return fmt.Errorf("%s Init(): json.Unmarshal() failed: %w", m.name, err)
|
||||
}
|
||||
} else {
|
||||
m.config.Num_procs = int(DEFAULT_NUM_PROCS)
|
||||
@@ -49,12 +53,13 @@ func (m *TopProcsCollector) Init(config json.RawMessage) error {
|
||||
if m.config.Num_procs <= 0 || m.config.Num_procs > MAX_NUM_PROCS {
|
||||
return fmt.Errorf("num_procs option must be set in 'topprocs' config (range: 1-%d)", MAX_NUM_PROCS)
|
||||
}
|
||||
m.setup()
|
||||
if err := m.setup(); err != nil {
|
||||
return fmt.Errorf("%s Init(): setup() call failed: %w", m.name, err)
|
||||
}
|
||||
command := exec.Command("ps", "-Ao", "comm", "--sort=-pcpu")
|
||||
command.Wait()
|
||||
_, err = command.Output()
|
||||
if err != nil {
|
||||
return errors.New("failed to execute command")
|
||||
return fmt.Errorf("%s Init(): failed to get output from command: %w", m.name, err)
|
||||
}
|
||||
m.init = true
|
||||
return nil
|
||||
@@ -65,17 +70,24 @@ func (m *TopProcsCollector) Read(interval time.Duration, output chan lp.CCMessag
|
||||
return
|
||||
}
|
||||
command := exec.Command("ps", "-Ao", "comm", "--sort=-pcpu")
|
||||
command.Wait()
|
||||
stdout, err := command.Output()
|
||||
if err != nil {
|
||||
log.Print(m.name, err)
|
||||
cclog.ComponentError(
|
||||
m.name,
|
||||
fmt.Sprintf("Read(): Failed to read output from command \"%s\": %v", command.String(), err))
|
||||
return
|
||||
}
|
||||
|
||||
lines := strings.Split(string(stdout), "\n")
|
||||
for i := 1; i < m.config.Num_procs+1; i++ {
|
||||
name := fmt.Sprintf("topproc%d", i)
|
||||
y, err := lp.NewMessage(name, m.tags, m.meta, map[string]interface{}{"value": string(lines[i])}, time.Now())
|
||||
y, err := lp.NewMessage(
|
||||
name,
|
||||
m.tags,
|
||||
m.meta,
|
||||
map[string]any{
|
||||
"value": lines[i]},
|
||||
time.Now())
|
||||
if err == nil {
|
||||
output <- y
|
||||
}
|
||||
|
||||
@@ -1,6 +1,19 @@
|
||||
{
|
||||
"cpufreq": {},
|
||||
"cpufreq_cpuinfo": {},
|
||||
"cpustat": {
|
||||
"exclude_metrics": [
|
||||
"cpu_idle"
|
||||
]
|
||||
},
|
||||
"diskstat": {
|
||||
"exclude_metrics": [
|
||||
"disk_total"
|
||||
],
|
||||
"exclude_mounts": [
|
||||
"slurm-tmpfs"
|
||||
]
|
||||
},
|
||||
"gpfs": {
|
||||
"exclude_filesystem": [
|
||||
"test_fs"
|
||||
@@ -21,6 +34,8 @@
|
||||
},
|
||||
"numastats": {},
|
||||
"nvidia": {},
|
||||
"schedstat": {
|
||||
},
|
||||
"tempstat": {
|
||||
"report_max_temperature": true,
|
||||
"report_critical_temperature": true,
|
||||
@@ -38,4 +53,4 @@
|
||||
"topprocs": {
|
||||
"num_procs": 5
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
6
go.mod
6
go.mod
@@ -3,7 +3,7 @@ module github.com/ClusterCockpit/cc-metric-collector
|
||||
go 1.24.0
|
||||
|
||||
require (
|
||||
github.com/ClusterCockpit/cc-lib/v2 v2.1.0
|
||||
github.com/ClusterCockpit/cc-lib/v2 v2.2.1
|
||||
github.com/ClusterCockpit/go-rocm-smi v0.3.0
|
||||
github.com/NVIDIA/go-nvml v0.13.0-1
|
||||
github.com/PaesslerAG/gval v1.2.4
|
||||
@@ -11,8 +11,7 @@ require (
|
||||
github.com/influxdata/line-protocol v0.0.0-20210922203350-b1ad95c89adf
|
||||
github.com/tklauser/go-sysconf v0.3.16
|
||||
golang.design/x/thread v0.0.0-20210122121316-335e9adffdf1
|
||||
golang.org/x/exp v0.0.0-20260112195511-716be5621a96
|
||||
golang.org/x/sys v0.40.0
|
||||
golang.org/x/sys v0.41.0
|
||||
)
|
||||
|
||||
require (
|
||||
@@ -40,6 +39,7 @@ require (
|
||||
github.com/tklauser/numcpus v0.11.0 // indirect
|
||||
go.yaml.in/yaml/v2 v2.4.3 // indirect
|
||||
golang.org/x/crypto v0.47.0 // indirect
|
||||
golang.org/x/exp v0.0.0-20260112195511-716be5621a96 // indirect
|
||||
golang.org/x/net v0.49.0 // indirect
|
||||
google.golang.org/protobuf v1.36.11 // indirect
|
||||
)
|
||||
|
||||
8
go.sum
8
go.sum
@@ -1,5 +1,5 @@
|
||||
github.com/ClusterCockpit/cc-lib/v2 v2.1.0 h1:B6l6h0IjfEuY9DU6aVM3fSsj24lQ1eudXK9QTKmJjqg=
|
||||
github.com/ClusterCockpit/cc-lib/v2 v2.1.0/go.mod h1:JuxMAuEOaLLNEnnL9U3ejha8kMvsSatLdKPZEgJw6iw=
|
||||
github.com/ClusterCockpit/cc-lib/v2 v2.2.1 h1:iCVas+Jc61zFH5S2VG3H1sc7tsn+U4lOJwUYjYZEims=
|
||||
github.com/ClusterCockpit/cc-lib/v2 v2.2.1/go.mod h1:JuxMAuEOaLLNEnnL9U3ejha8kMvsSatLdKPZEgJw6iw=
|
||||
github.com/ClusterCockpit/go-rocm-smi v0.3.0 h1:1qZnSpG7/NyLtc7AjqnUL9Jb8xtqG1nMVgp69rJfaR8=
|
||||
github.com/ClusterCockpit/go-rocm-smi v0.3.0/go.mod h1:+I3UMeX3OlizXDf1WpGD43W4KGZZGVSGmny6rTeOnWA=
|
||||
github.com/NVIDIA/go-nvml v0.11.6-0/go.mod h1:hy7HYeQy335x6nEss0Ne3PYqleRa6Ct+VKD9RQ4nyFs=
|
||||
@@ -122,8 +122,8 @@ golang.org/x/exp v0.0.0-20260112195511-716be5621a96/go.mod h1:nzimsREAkjBCIEFtHi
|
||||
golang.org/x/net v0.49.0 h1:eeHFmOGUTtaaPSGNmjBKpbng9MulQsJURQUAfUwY++o=
|
||||
golang.org/x/net v0.49.0/go.mod h1:/ysNB2EvaqvesRkuLAyjI1ycPZlQHM3q01F02UY/MV8=
|
||||
golang.org/x/sys v0.0.0-20210122093101-04d7465088b8/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
|
||||
golang.org/x/sys v0.40.0 h1:DBZZqJ2Rkml6QMQsZywtnjnnGvHza6BTfYFWY9kjEWQ=
|
||||
golang.org/x/sys v0.40.0/go.mod h1:OgkHotnGiDImocRcuBABYBEXf8A9a87e/uXjp9XT3ks=
|
||||
golang.org/x/sys v0.41.0 h1:Ivj+2Cp/ylzLiEU89QhWblYnOE9zerudt9Ftecq2C6k=
|
||||
golang.org/x/sys v0.41.0/go.mod h1:OgkHotnGiDImocRcuBABYBEXf8A9a87e/uXjp9XT3ks=
|
||||
golang.org/x/time v0.14.0 h1:MRx4UaLrDotUKUdCIqzPC48t1Y9hANFKIRpNx+Te8PI=
|
||||
golang.org/x/time v0.14.0/go.mod h1:eL/Oa2bBBK0TkX57Fyni+NgnyQQN4LitPmob2Hjnqw4=
|
||||
golang.org/x/xerrors v0.0.0-20191204190536-9bdfabe68543/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0=
|
||||
|
||||
@@ -10,8 +10,10 @@ package metricAggregator
|
||||
import (
|
||||
"context"
|
||||
"fmt"
|
||||
"maps"
|
||||
"math"
|
||||
"os"
|
||||
"slices"
|
||||
"strings"
|
||||
"sync"
|
||||
"time"
|
||||
@@ -36,7 +38,7 @@ type MetricAggregatorIntervalConfig struct {
|
||||
|
||||
type metricAggregator struct {
|
||||
functions []*MetricAggregatorIntervalConfig
|
||||
constants map[string]interface{}
|
||||
constants map[string]any
|
||||
language gval.Language
|
||||
output chan lp.CCMessage
|
||||
}
|
||||
@@ -84,7 +86,7 @@ var evaluables = struct {
|
||||
func (c *metricAggregator) Init(output chan lp.CCMessage) error {
|
||||
c.output = output
|
||||
c.functions = make([]*MetricAggregatorIntervalConfig, 0)
|
||||
c.constants = make(map[string]interface{})
|
||||
c.constants = make(map[string]any)
|
||||
|
||||
// add constants like hostname, numSockets, ... to constants list
|
||||
// Set hostname
|
||||
@@ -120,10 +122,8 @@ func (c *metricAggregator) Init(output chan lp.CCMessage) error {
|
||||
}
|
||||
|
||||
func (c *metricAggregator) Eval(starttime time.Time, endtime time.Time, metrics []lp.CCMessage) {
|
||||
vars := make(map[string]interface{})
|
||||
for k, v := range c.constants {
|
||||
vars[k] = v
|
||||
}
|
||||
vars := make(map[string]any)
|
||||
maps.Copy(vars, c.constants)
|
||||
vars["starttime"] = starttime
|
||||
vars["endtime"] = endtime
|
||||
for _, f := range c.functions {
|
||||
@@ -137,7 +137,6 @@ func (c *metricAggregator) Eval(starttime time.Time, endtime time.Time, metrics
|
||||
matches := make([]lp.CCMessage, 0)
|
||||
for _, m := range metrics {
|
||||
vars["metric"] = m
|
||||
//value, err := gval.Evaluate(f.Condition, vars, c.language)
|
||||
value, err := f.gvalCond.EvalBool(context.Background(), vars)
|
||||
if err != nil {
|
||||
cclog.ComponentError("MetricCache", "COLLECT", f.Name, "COND", f.Condition, ":", err.Error())
|
||||
@@ -171,22 +170,22 @@ func (c *metricAggregator) Eval(starttime time.Time, endtime time.Time, metrics
|
||||
// Check, that only values of one type were collected
|
||||
countValueTypes := 0
|
||||
if len(valuesFloat64) > 0 {
|
||||
countValueTypes += 1
|
||||
countValueTypes++
|
||||
}
|
||||
if len(valuesFloat32) > 0 {
|
||||
countValueTypes += 1
|
||||
countValueTypes++
|
||||
}
|
||||
if len(valuesInt) > 0 {
|
||||
countValueTypes += 1
|
||||
countValueTypes++
|
||||
}
|
||||
if len(valuesInt32) > 0 {
|
||||
countValueTypes += 1
|
||||
countValueTypes++
|
||||
}
|
||||
if len(valuesInt64) > 0 {
|
||||
countValueTypes += 1
|
||||
countValueTypes++
|
||||
}
|
||||
if len(valuesBool) > 0 {
|
||||
countValueTypes += 1
|
||||
countValueTypes++
|
||||
}
|
||||
if countValueTypes > 1 {
|
||||
cclog.ComponentError("MetricCache", "Collected values of different types")
|
||||
@@ -263,15 +262,15 @@ func (c *metricAggregator) Eval(starttime time.Time, endtime time.Time, metrics
|
||||
var m lp.CCMessage
|
||||
switch t := value.(type) {
|
||||
case float64:
|
||||
m, err = lp.NewMessage(f.Name, tags, meta, map[string]interface{}{"value": t}, starttime)
|
||||
m, err = lp.NewMessage(f.Name, tags, meta, map[string]any{"value": t}, starttime)
|
||||
case float32:
|
||||
m, err = lp.NewMessage(f.Name, tags, meta, map[string]interface{}{"value": t}, starttime)
|
||||
m, err = lp.NewMessage(f.Name, tags, meta, map[string]any{"value": t}, starttime)
|
||||
case int:
|
||||
m, err = lp.NewMessage(f.Name, tags, meta, map[string]interface{}{"value": t}, starttime)
|
||||
m, err = lp.NewMessage(f.Name, tags, meta, map[string]any{"value": t}, starttime)
|
||||
case int64:
|
||||
m, err = lp.NewMessage(f.Name, tags, meta, map[string]interface{}{"value": t}, starttime)
|
||||
m, err = lp.NewMessage(f.Name, tags, meta, map[string]any{"value": t}, starttime)
|
||||
case string:
|
||||
m, err = lp.NewMessage(f.Name, tags, meta, map[string]interface{}{"value": t}, starttime)
|
||||
m, err = lp.NewMessage(f.Name, tags, meta, map[string]any{"value": t}, starttime)
|
||||
default:
|
||||
cclog.ComponentError("MetricCache", "Gval returned invalid type", t, "skipping metric", f.Name)
|
||||
}
|
||||
@@ -329,18 +328,21 @@ func (c *metricAggregator) AddAggregation(name, function, condition string, tags
|
||||
}
|
||||
|
||||
func (c *metricAggregator) DeleteAggregation(name string) error {
|
||||
for i, agg := range c.functions {
|
||||
if agg.Name == name {
|
||||
copy(c.functions[i:], c.functions[i+1:])
|
||||
c.functions[len(c.functions)-1] = nil
|
||||
c.functions = c.functions[:len(c.functions)-1]
|
||||
return nil
|
||||
}
|
||||
i := slices.IndexFunc(
|
||||
c.functions,
|
||||
func(agg *MetricAggregatorIntervalConfig) bool {
|
||||
return agg.Name == name
|
||||
})
|
||||
if i == -1 {
|
||||
return fmt.Errorf("no aggregation for metric name %s", name)
|
||||
}
|
||||
return fmt.Errorf("no aggregation for metric name %s", name)
|
||||
copy(c.functions[i:], c.functions[i+1:])
|
||||
c.functions[len(c.functions)-1] = nil
|
||||
c.functions = c.functions[:len(c.functions)-1]
|
||||
return nil
|
||||
}
|
||||
|
||||
func (c *metricAggregator) AddConstant(name string, value interface{}) {
|
||||
func (c *metricAggregator) AddConstant(name string, value any) {
|
||||
c.constants[name] = value
|
||||
}
|
||||
|
||||
@@ -348,11 +350,11 @@ func (c *metricAggregator) DelConstant(name string) {
|
||||
delete(c.constants, name)
|
||||
}
|
||||
|
||||
func (c *metricAggregator) AddFunction(name string, function func(args ...interface{}) (interface{}, error)) {
|
||||
func (c *metricAggregator) AddFunction(name string, function func(args ...any) (any, error)) {
|
||||
c.language = gval.NewLanguage(c.language, gval.Function(name, function))
|
||||
}
|
||||
|
||||
func EvalBoolCondition(condition string, params map[string]interface{}) (bool, error) {
|
||||
func EvalBoolCondition(condition string, params map[string]any) (bool, error) {
|
||||
evaluables.mutex.Lock()
|
||||
evaluable, ok := evaluables.mapping[condition]
|
||||
evaluables.mutex.Unlock()
|
||||
|
||||
@@ -11,10 +11,10 @@ import (
|
||||
"errors"
|
||||
"fmt"
|
||||
"regexp"
|
||||
"slices"
|
||||
"strconv"
|
||||
"strings"
|
||||
|
||||
"golang.org/x/exp/slices"
|
||||
|
||||
topo "github.com/ClusterCockpit/cc-metric-collector/pkg/ccTopology"
|
||||
)
|
||||
|
||||
@@ -34,7 +34,7 @@ func sumAnyType[T float64 | float32 | int | int32 | int64](values []T) (T, error
|
||||
}
|
||||
|
||||
// Sum up values
|
||||
func sumfunc(args interface{}) (interface{}, error) {
|
||||
func sumfunc(args any) (any, error) {
|
||||
|
||||
var err error
|
||||
switch values := args.(type) {
|
||||
@@ -63,7 +63,7 @@ func minAnyType[T float64 | float32 | int | int32 | int64](values []T) (T, error
|
||||
}
|
||||
|
||||
// Get the minimum value
|
||||
func minfunc(args interface{}) (interface{}, error) {
|
||||
func minfunc(args any) (any, error) {
|
||||
switch values := args.(type) {
|
||||
case []float64:
|
||||
return minAnyType(values)
|
||||
@@ -84,12 +84,12 @@ func avgAnyType[T float64 | float32 | int | int32 | int64](values []T) (float64,
|
||||
if len(values) == 0 {
|
||||
return 0.0, errors.New("average function requires at least one argument")
|
||||
}
|
||||
sum, err := sumAnyType[T](values)
|
||||
sum, err := sumAnyType(values)
|
||||
return float64(sum) / float64(len(values)), err
|
||||
}
|
||||
|
||||
// Get the average or mean value
|
||||
func avgfunc(args interface{}) (interface{}, error) {
|
||||
func avgfunc(args any) (any, error) {
|
||||
switch values := args.(type) {
|
||||
case []float64:
|
||||
return avgAnyType(values)
|
||||
@@ -114,7 +114,7 @@ func maxAnyType[T float64 | float32 | int | int32 | int64](values []T) (T, error
|
||||
}
|
||||
|
||||
// Get the maximum value
|
||||
func maxfunc(args interface{}) (interface{}, error) {
|
||||
func maxfunc(args any) (any, error) {
|
||||
switch values := args.(type) {
|
||||
case []float64:
|
||||
return maxAnyType(values)
|
||||
@@ -146,7 +146,7 @@ func medianAnyType[T float64 | float32 | int | int32 | int64](values []T) (T, er
|
||||
}
|
||||
|
||||
// Get the median value
|
||||
func medianfunc(args interface{}) (interface{}, error) {
|
||||
func medianfunc(args any) (any, error) {
|
||||
switch values := args.(type) {
|
||||
case []float64:
|
||||
return medianAnyType(values)
|
||||
@@ -167,9 +167,9 @@ func medianfunc(args interface{}) (interface{}, error) {
|
||||
* Get number of values in list. Returns always an int
|
||||
*/
|
||||
|
||||
func lenfunc(args interface{}) (interface{}, error) {
|
||||
func lenfunc(args any) (any, error) {
|
||||
var err error = nil
|
||||
var length int = 0
|
||||
length := 0
|
||||
switch values := args.(type) {
|
||||
case []float64:
|
||||
length = len(values)
|
||||
@@ -181,13 +181,7 @@ func lenfunc(args interface{}) (interface{}, error) {
|
||||
length = len(values)
|
||||
case []int32:
|
||||
length = len(values)
|
||||
case float64:
|
||||
err = errors.New("function 'len' can only be applied on arrays and strings")
|
||||
case float32:
|
||||
err = errors.New("function 'len' can only be applied on arrays and strings")
|
||||
case int:
|
||||
err = errors.New("function 'len' can only be applied on arrays and strings")
|
||||
case int64:
|
||||
case float64, float32, int, int64:
|
||||
err = errors.New("function 'len' can only be applied on arrays and strings")
|
||||
case string:
|
||||
length = len(values)
|
||||
@@ -197,13 +191,13 @@ func lenfunc(args interface{}) (interface{}, error) {
|
||||
|
||||
/*
|
||||
* Check if a values is in a list
|
||||
* In constrast to most of the other functions, this one is an infix operator for
|
||||
* In contrast to most of the other functions, this one is an infix operator for
|
||||
* - substring matching: `"abc" in "abcdef"` -> true
|
||||
* - substring matching with int casting: `3 in "abd3"` -> true
|
||||
* - search for an int in an int list: `3 in getCpuList()` -> true (if you have more than 4 CPU hardware threads)
|
||||
*/
|
||||
|
||||
func infunc(a interface{}, b interface{}) (interface{}, error) {
|
||||
func infunc(a any, b any) (any, error) {
|
||||
switch match := a.(type) {
|
||||
case string:
|
||||
switch total := b.(type) {
|
||||
@@ -213,13 +207,9 @@ func infunc(a interface{}, b interface{}) (interface{}, error) {
|
||||
case int:
|
||||
switch total := b.(type) {
|
||||
case []int:
|
||||
for _, x := range total {
|
||||
if x == match {
|
||||
return true, nil
|
||||
}
|
||||
}
|
||||
return slices.Contains(total, match), nil
|
||||
case string:
|
||||
smatch := fmt.Sprintf("%d", match)
|
||||
smatch := strconv.Itoa(match)
|
||||
return strings.Contains(total, smatch), nil
|
||||
}
|
||||
|
||||
@@ -233,12 +223,12 @@ func infunc(a interface{}, b interface{}) (interface{}, error) {
|
||||
* format keys \d = %d, \w = %d, ... Not sure how to fix this
|
||||
*/
|
||||
|
||||
func matchfunc(args ...interface{}) (interface{}, error) {
|
||||
func matchfunc(args ...any) (any, error) {
|
||||
switch match := args[0].(type) {
|
||||
case string:
|
||||
switch total := args[1].(type) {
|
||||
case string:
|
||||
smatch := strings.Replace(match, "%", "\\", -1)
|
||||
smatch := strings.ReplaceAll(match, "%", "\\")
|
||||
regex, err := regexp.Compile(smatch)
|
||||
if err != nil {
|
||||
return false, err
|
||||
@@ -255,7 +245,7 @@ func matchfunc(args ...interface{}) (interface{}, error) {
|
||||
*/
|
||||
|
||||
// for a given cpuid, it returns the core id
|
||||
func getCpuCoreFunc(args interface{}) (interface{}, error) {
|
||||
func getCpuCoreFunc(args any) (any, error) {
|
||||
switch cpuid := args.(type) {
|
||||
case int:
|
||||
return topo.GetHwthreadCore(cpuid), nil
|
||||
@@ -264,7 +254,7 @@ func getCpuCoreFunc(args interface{}) (interface{}, error) {
|
||||
}
|
||||
|
||||
// for a given cpuid, it returns the socket id
|
||||
func getCpuSocketFunc(args interface{}) (interface{}, error) {
|
||||
func getCpuSocketFunc(args any) (any, error) {
|
||||
switch cpuid := args.(type) {
|
||||
case int:
|
||||
return topo.GetHwthreadSocket(cpuid), nil
|
||||
@@ -273,7 +263,7 @@ func getCpuSocketFunc(args interface{}) (interface{}, error) {
|
||||
}
|
||||
|
||||
// for a given cpuid, it returns the id of the NUMA node
|
||||
func getCpuNumaDomainFunc(args interface{}) (interface{}, error) {
|
||||
func getCpuNumaDomainFunc(args any) (any, error) {
|
||||
switch cpuid := args.(type) {
|
||||
case int:
|
||||
return topo.GetHwthreadNumaDomain(cpuid), nil
|
||||
@@ -282,7 +272,7 @@ func getCpuNumaDomainFunc(args interface{}) (interface{}, error) {
|
||||
}
|
||||
|
||||
// for a given cpuid, it returns the id of the CPU die
|
||||
func getCpuDieFunc(args interface{}) (interface{}, error) {
|
||||
func getCpuDieFunc(args any) (any, error) {
|
||||
switch cpuid := args.(type) {
|
||||
case int:
|
||||
return topo.GetHwthreadDie(cpuid), nil
|
||||
@@ -291,7 +281,7 @@ func getCpuDieFunc(args interface{}) (interface{}, error) {
|
||||
}
|
||||
|
||||
// for a given core id, it returns the list of cpuids
|
||||
func getCpuListOfCoreFunc(args interface{}) (interface{}, error) {
|
||||
func getCpuListOfCoreFunc(args any) (any, error) {
|
||||
cpulist := make([]int, 0)
|
||||
switch in := args.(type) {
|
||||
case int:
|
||||
@@ -305,7 +295,7 @@ func getCpuListOfCoreFunc(args interface{}) (interface{}, error) {
|
||||
}
|
||||
|
||||
// for a given socket id, it returns the list of cpuids
|
||||
func getCpuListOfSocketFunc(args interface{}) (interface{}, error) {
|
||||
func getCpuListOfSocketFunc(args any) (any, error) {
|
||||
cpulist := make([]int, 0)
|
||||
switch in := args.(type) {
|
||||
case int:
|
||||
@@ -319,7 +309,7 @@ func getCpuListOfSocketFunc(args interface{}) (interface{}, error) {
|
||||
}
|
||||
|
||||
// for a given id of a NUMA domain, it returns the list of cpuids
|
||||
func getCpuListOfNumaDomainFunc(args interface{}) (interface{}, error) {
|
||||
func getCpuListOfNumaDomainFunc(args any) (any, error) {
|
||||
cpulist := make([]int, 0)
|
||||
switch in := args.(type) {
|
||||
case int:
|
||||
@@ -333,7 +323,7 @@ func getCpuListOfNumaDomainFunc(args interface{}) (interface{}, error) {
|
||||
}
|
||||
|
||||
// for a given CPU die id, it returns the list of cpuids
|
||||
func getCpuListOfDieFunc(args interface{}) (interface{}, error) {
|
||||
func getCpuListOfDieFunc(args any) (any, error) {
|
||||
cpulist := make([]int, 0)
|
||||
switch in := args.(type) {
|
||||
case int:
|
||||
@@ -347,14 +337,14 @@ func getCpuListOfDieFunc(args interface{}) (interface{}, error) {
|
||||
}
|
||||
|
||||
// wrapper function to get a list of all cpuids of the node
|
||||
func getCpuListOfNode() (interface{}, error) {
|
||||
func getCpuListOfNode() (any, error) {
|
||||
return topo.HwthreadList(), nil
|
||||
}
|
||||
|
||||
// helper function to get the cpuid list for a CCMetric type tag set (type and type-id)
|
||||
// since there is no access to the metric data in the function, is should be called like
|
||||
// `getCpuListOfType()`
|
||||
func getCpuListOfType(args ...interface{}) (interface{}, error) {
|
||||
func getCpuListOfType(args ...any) (any, error) {
|
||||
cpulist := make([]int, 0)
|
||||
switch typ := args[0].(type) {
|
||||
case string:
|
||||
|
||||
@@ -51,7 +51,7 @@ type MetricCache interface {
|
||||
}
|
||||
|
||||
func (c *metricCache) Init(output chan lp.CCMessage, ticker mct.MultiChanTicker, wg *sync.WaitGroup, numPeriods int) error {
|
||||
var err error = nil
|
||||
var err error
|
||||
c.done = make(chan bool)
|
||||
c.wg = wg
|
||||
c.ticker = ticker
|
||||
@@ -137,12 +137,12 @@ func (c *metricCache) Add(metric lp.CCMessage) {
|
||||
p := c.intervals[c.curPeriod]
|
||||
if p.numMetrics < p.sizeMetrics {
|
||||
p.metrics[p.numMetrics] = metric
|
||||
p.numMetrics = p.numMetrics + 1
|
||||
p.numMetrics++
|
||||
p.stopstamp = metric.Time()
|
||||
} else {
|
||||
p.metrics = append(p.metrics, metric)
|
||||
p.numMetrics = p.numMetrics + 1
|
||||
p.sizeMetrics = p.sizeMetrics + 1
|
||||
p.numMetrics++
|
||||
p.sizeMetrics++
|
||||
p.stopstamp = metric.Time()
|
||||
}
|
||||
c.lock.Unlock()
|
||||
@@ -161,8 +161,8 @@ func (c *metricCache) DeleteAggregation(name string) error {
|
||||
// is the current one, index=1 the last interval and so on. Returns and empty array if a wrong index
|
||||
// is given (negative index, index larger than configured number of total intervals, ...)
|
||||
func (c *metricCache) GetPeriod(index int) (time.Time, time.Time, []lp.CCMessage) {
|
||||
var start time.Time = time.Now()
|
||||
var stop time.Time = time.Now()
|
||||
start := time.Now()
|
||||
stop := time.Now()
|
||||
var metrics []lp.CCMessage
|
||||
if index >= 0 && index < c.numPeriods {
|
||||
pindex := c.curPeriod - index
|
||||
|
||||
@@ -10,6 +10,7 @@ package metricRouter
|
||||
import (
|
||||
"encoding/json"
|
||||
"fmt"
|
||||
"maps"
|
||||
"os"
|
||||
"strings"
|
||||
"sync"
|
||||
@@ -107,10 +108,8 @@ func (r *metricRouter) Init(ticker mct.MultiChanTicker, wg *sync.WaitGroup, rout
|
||||
cclog.ComponentError("MetricRouter", err.Error())
|
||||
return err
|
||||
}
|
||||
r.maxForward = 1
|
||||
if r.config.MaxForward > r.maxForward {
|
||||
r.maxForward = r.config.MaxForward
|
||||
}
|
||||
r.maxForward = max(1, r.config.MaxForward)
|
||||
|
||||
if r.config.NumCacheIntervals > 0 {
|
||||
r.cache, err = NewCache(r.cache_input, r.ticker, &r.cachewg, r.config.NumCacheIntervals)
|
||||
if err != nil {
|
||||
@@ -118,60 +117,80 @@ func (r *metricRouter) Init(ticker mct.MultiChanTicker, wg *sync.WaitGroup, rout
|
||||
return err
|
||||
}
|
||||
for _, agg := range r.config.IntervalAgg {
|
||||
r.cache.AddAggregation(agg.Name, agg.Function, agg.Condition, agg.Tags, agg.Meta)
|
||||
err = r.cache.AddAggregation(agg.Name, agg.Function, agg.Condition, agg.Tags, agg.Meta)
|
||||
if err != nil {
|
||||
return fmt.Errorf("MetricCache AddAggregation() failed: %w", err)
|
||||
}
|
||||
}
|
||||
}
|
||||
p, err := mp.NewMessageProcessor()
|
||||
if err != nil {
|
||||
return fmt.Errorf("initialization of message processor failed: %v", err.Error())
|
||||
return fmt.Errorf("MessageProcessor NewMessageProcessor() failed: %w", err)
|
||||
}
|
||||
r.mp = p
|
||||
|
||||
if len(r.config.MessageProcessor) > 0 {
|
||||
err = r.mp.FromConfigJSON(r.config.MessageProcessor)
|
||||
if err != nil {
|
||||
return fmt.Errorf("failed parsing JSON for message processor: %v", err.Error())
|
||||
return fmt.Errorf("MessageProcessor FromConfigJSON() failed: %w", err)
|
||||
}
|
||||
}
|
||||
for _, mname := range r.config.DropMetrics {
|
||||
r.mp.AddDropMessagesByName(mname)
|
||||
err = r.mp.AddDropMessagesByName(mname)
|
||||
if err != nil {
|
||||
return fmt.Errorf("MessageProcessor AddDropMessagesByName() failed: %w", err)
|
||||
}
|
||||
}
|
||||
for _, cond := range r.config.DropMetricsIf {
|
||||
r.mp.AddDropMessagesByCondition(cond)
|
||||
err = r.mp.AddDropMessagesByCondition(cond)
|
||||
if err != nil {
|
||||
return fmt.Errorf("MessageProcessor AddDropMessagesByCondition() failed: %w", err)
|
||||
}
|
||||
}
|
||||
for _, data := range r.config.AddTags {
|
||||
cond := data.Condition
|
||||
if cond == "*" {
|
||||
cond = "true"
|
||||
}
|
||||
r.mp.AddAddTagsByCondition(cond, data.Key, data.Value)
|
||||
err = r.mp.AddAddTagsByCondition(cond, data.Key, data.Value)
|
||||
if err != nil {
|
||||
return fmt.Errorf("MessageProcessor AddAddTagsByCondition() failed: %w", err)
|
||||
}
|
||||
}
|
||||
for _, data := range r.config.DelTags {
|
||||
cond := data.Condition
|
||||
if cond == "*" {
|
||||
cond = "true"
|
||||
}
|
||||
r.mp.AddDeleteTagsByCondition(cond, data.Key, data.Value)
|
||||
err = r.mp.AddDeleteTagsByCondition(cond, data.Key, data.Value)
|
||||
if err != nil {
|
||||
return fmt.Errorf("MessageProcessor AddDeleteTagsByCondition() failed: %w", err)
|
||||
}
|
||||
}
|
||||
for oldname, newname := range r.config.RenameMetrics {
|
||||
r.mp.AddRenameMetricByName(oldname, newname)
|
||||
err = r.mp.AddRenameMetricByName(oldname, newname)
|
||||
if err != nil {
|
||||
return fmt.Errorf("MessageProcessor AddRenameMetricByName() failed: %w", err)
|
||||
}
|
||||
}
|
||||
for metricName, prefix := range r.config.ChangeUnitPrefix {
|
||||
r.mp.AddChangeUnitPrefix(fmt.Sprintf("name == '%s'", metricName), prefix)
|
||||
err = r.mp.AddChangeUnitPrefix(fmt.Sprintf("name == '%s'", metricName), prefix)
|
||||
if err != nil {
|
||||
return fmt.Errorf("MessageProcessor AddChangeUnitPrefix() failed: %w", err)
|
||||
}
|
||||
}
|
||||
r.mp.SetNormalizeUnits(r.config.NormalizeUnits)
|
||||
|
||||
r.mp.AddAddTagsByCondition("true", r.config.HostnameTagName, r.hostname)
|
||||
err = r.mp.AddAddTagsByCondition("true", r.config.HostnameTagName, r.hostname)
|
||||
if err != nil {
|
||||
return fmt.Errorf("MessageProcessor AddAddTagsByCondition() failed: %w", err)
|
||||
}
|
||||
|
||||
// r.config.dropMetrics = make(map[string]bool)
|
||||
// for _, mname := range r.config.DropMetrics {
|
||||
// r.config.dropMetrics[mname] = true
|
||||
// }
|
||||
return nil
|
||||
}
|
||||
|
||||
func getParamMap(point lp.CCMessage) map[string]interface{} {
|
||||
params := make(map[string]interface{})
|
||||
func getParamMap(point lp.CCMessage) map[string]any {
|
||||
params := make(map[string]any)
|
||||
params["metric"] = point
|
||||
params["name"] = point.Name()
|
||||
for key, value := range point.Tags() {
|
||||
@@ -180,14 +199,12 @@ func getParamMap(point lp.CCMessage) map[string]interface{} {
|
||||
for key, value := range point.Meta() {
|
||||
params[key] = value
|
||||
}
|
||||
for key, value := range point.Fields() {
|
||||
params[key] = value
|
||||
}
|
||||
maps.Copy(params, point.Fields())
|
||||
params["timestamp"] = point.Time()
|
||||
return params
|
||||
}
|
||||
|
||||
// DoAddTags adds a tag when condition is fullfiled
|
||||
// DoAddTags adds a tag when condition is fulfilled
|
||||
func (r *metricRouter) DoAddTags(point lp.CCMessage) {
|
||||
var conditionMatches bool
|
||||
for _, m := range r.config.AddTags {
|
||||
@@ -209,83 +226,6 @@ func (r *metricRouter) DoAddTags(point lp.CCMessage) {
|
||||
}
|
||||
}
|
||||
|
||||
// DoDelTags removes a tag when condition is fullfiled
|
||||
// func (r *metricRouter) DoDelTags(point lp.CCMessage) {
|
||||
// var conditionMatches bool
|
||||
// for _, m := range r.config.DelTags {
|
||||
// if m.Condition == "*" {
|
||||
// // Condition is always matched
|
||||
// conditionMatches = true
|
||||
// } else {
|
||||
// // Evaluate condition
|
||||
// var err error
|
||||
// conditionMatches, err = agg.EvalBoolCondition(m.Condition, getParamMap(point))
|
||||
// if err != nil {
|
||||
// cclog.ComponentError("MetricRouter", err.Error())
|
||||
// conditionMatches = false
|
||||
// }
|
||||
// }
|
||||
// if conditionMatches {
|
||||
// point.RemoveTag(m.Key)
|
||||
// }
|
||||
// }
|
||||
// }
|
||||
|
||||
// Conditional test whether a metric should be dropped
|
||||
// func (r *metricRouter) dropMetric(point lp.CCMessage) bool {
|
||||
// // Simple drop check
|
||||
// if conditionMatches, ok := r.config.dropMetrics[point.Name()]; ok {
|
||||
// return conditionMatches
|
||||
// }
|
||||
|
||||
// // Checking the dropping conditions
|
||||
// for _, m := range r.config.DropMetricsIf {
|
||||
// conditionMatches, err := agg.EvalBoolCondition(m, getParamMap(point))
|
||||
// if err != nil {
|
||||
// cclog.ComponentError("MetricRouter", err.Error())
|
||||
// conditionMatches = false
|
||||
// }
|
||||
// if conditionMatches {
|
||||
// return conditionMatches
|
||||
// }
|
||||
// }
|
||||
|
||||
// // No dropping condition met
|
||||
// return false
|
||||
// }
|
||||
|
||||
// func (r *metricRouter) prepareUnit(point lp.CCMessage) bool {
|
||||
// if r.config.NormalizeUnits {
|
||||
// if in_unit, ok := point.GetMeta("unit"); ok {
|
||||
// u := units.NewUnit(in_unit)
|
||||
// if u.Valid() {
|
||||
// point.AddMeta("unit", u.Short())
|
||||
// }
|
||||
// }
|
||||
// }
|
||||
// if newP, ok := r.config.ChangeUnitPrefix[point.Name()]; ok {
|
||||
|
||||
// newPrefix := units.NewPrefix(newP)
|
||||
|
||||
// if in_unit, ok := point.GetMeta("unit"); ok && newPrefix != units.InvalidPrefix {
|
||||
// u := units.NewUnit(in_unit)
|
||||
// if u.Valid() {
|
||||
// cclog.ComponentDebug("MetricRouter", "Change prefix to", newP, "for metric", point.Name())
|
||||
// conv, out_unit := units.GetUnitPrefixFactor(u, newPrefix)
|
||||
// if conv != nil && out_unit.Valid() {
|
||||
// if val, ok := point.GetField("value"); ok {
|
||||
// point.AddField("value", conv(val))
|
||||
// point.AddMeta("unit", out_unit.Short())
|
||||
// }
|
||||
// }
|
||||
// }
|
||||
|
||||
// }
|
||||
// }
|
||||
|
||||
// return true
|
||||
// }
|
||||
|
||||
// Start starts the metric router
|
||||
func (r *metricRouter) Start() {
|
||||
// start timer if configured
|
||||
@@ -301,28 +241,7 @@ func (r *metricRouter) Start() {
|
||||
cclog.ComponentDebug("MetricRouter", "DONE")
|
||||
}
|
||||
|
||||
// Forward takes a received metric, adds or deletes tags
|
||||
// and forwards it to the output channels
|
||||
// forward := func(point lp.CCMessage) {
|
||||
// cclog.ComponentDebug("MetricRouter", "FORWARD", point)
|
||||
// r.DoAddTags(point)
|
||||
// r.DoDelTags(point)
|
||||
// name := point.Name()
|
||||
// if new, ok := r.config.RenameMetrics[name]; ok {
|
||||
// point.SetName(new)
|
||||
// point.AddMeta("oldname", name)
|
||||
// r.DoAddTags(point)
|
||||
// r.DoDelTags(point)
|
||||
// }
|
||||
|
||||
// r.prepareUnit(point)
|
||||
|
||||
// for _, o := range r.outputs {
|
||||
// o <- point
|
||||
// }
|
||||
// }
|
||||
|
||||
// Foward message received from collector channel
|
||||
// Forward message received from collector channel
|
||||
coll_forward := func(p lp.CCMessage) {
|
||||
// receive from metric collector
|
||||
//p.AddTag(r.config.HostnameTagName, r.hostname)
|
||||
@@ -335,11 +254,6 @@ func (r *metricRouter) Start() {
|
||||
o <- m
|
||||
}
|
||||
}
|
||||
// if !r.dropMetric(p) {
|
||||
// for _, o := range r.outputs {
|
||||
// o <- point
|
||||
// }
|
||||
// }
|
||||
// even if the metric is dropped, it is stored in the cache for
|
||||
// aggregations
|
||||
if r.config.NumCacheIntervals > 0 {
|
||||
@@ -359,9 +273,6 @@ func (r *metricRouter) Start() {
|
||||
o <- m
|
||||
}
|
||||
}
|
||||
// if !r.dropMetric(p) {
|
||||
// forward(p)
|
||||
// }
|
||||
}
|
||||
|
||||
// Forward message received from cache channel
|
||||
|
||||
@@ -13,11 +13,11 @@ import (
|
||||
"os"
|
||||
"path/filepath"
|
||||
"regexp"
|
||||
"slices"
|
||||
"strconv"
|
||||
"strings"
|
||||
|
||||
cclogger "github.com/ClusterCockpit/cc-lib/v2/ccLogger"
|
||||
"golang.org/x/exp/slices"
|
||||
)
|
||||
|
||||
const SYSFS_CPUBASE = `/sys/devices/system/cpu`
|
||||
@@ -51,14 +51,13 @@ var cache struct {
|
||||
func fileToInt(path string) int {
|
||||
buffer, err := os.ReadFile(path)
|
||||
if err != nil {
|
||||
log.Print(err)
|
||||
cclogger.ComponentError("ccTopology", "fileToInt", "Reading", path, ":", err.Error())
|
||||
cclogger.ComponentError("ccTopology", fmt.Sprintf("fileToInt(): Reading \"%s\": %v", path, err))
|
||||
return -1
|
||||
}
|
||||
stringBuffer := strings.TrimSpace(string(buffer))
|
||||
id, err := strconv.Atoi(stringBuffer)
|
||||
if err != nil {
|
||||
cclogger.ComponentError("ccTopology", "fileToInt", "Parsing", path, ":", stringBuffer, err.Error())
|
||||
cclogger.ComponentError("ccTopology", fmt.Sprintf("fileToInt(): Parsing \"%s\": %v", stringBuffer, err))
|
||||
return -1
|
||||
}
|
||||
return id
|
||||
@@ -80,7 +79,7 @@ func fileToList(path string) []int {
|
||||
// Create list
|
||||
list := make([]int, 0)
|
||||
stringBuffer := strings.TrimSpace(string(buffer))
|
||||
for _, valueRangeString := range strings.Split(stringBuffer, ",") {
|
||||
for valueRangeString := range strings.SplitSeq(stringBuffer, ",") {
|
||||
valueRange := strings.Split(valueRangeString, "-")
|
||||
switch len(valueRange) {
|
||||
case 1:
|
||||
@@ -304,20 +303,19 @@ func GetTypeList(topology_type string) []int {
|
||||
}
|
||||
|
||||
func GetTypeId(hwt HwthreadEntry, topology_type string) (int, error) {
|
||||
var err error = nil
|
||||
switch topology_type {
|
||||
case "node":
|
||||
return 0, err
|
||||
return 0, nil
|
||||
case "socket":
|
||||
return hwt.Socket, err
|
||||
return hwt.Socket, nil
|
||||
case "die":
|
||||
return hwt.Die, err
|
||||
return hwt.Die, nil
|
||||
case "memoryDomain":
|
||||
return hwt.NumaDomain, err
|
||||
return hwt.NumaDomain, nil
|
||||
case "core":
|
||||
return hwt.Core, err
|
||||
return hwt.Core, nil
|
||||
case "hwthread":
|
||||
return hwt.CpuID, err
|
||||
return hwt.CpuID, nil
|
||||
}
|
||||
return -1, fmt.Errorf("unknown topology type '%s'", topology_type)
|
||||
}
|
||||
|
||||
@@ -21,7 +21,7 @@ type multiChanTicker struct {
|
||||
|
||||
type MultiChanTicker interface {
|
||||
Init(duration time.Duration)
|
||||
AddChannel(chan time.Time)
|
||||
AddChannel(channel chan time.Time)
|
||||
Close()
|
||||
}
|
||||
|
||||
|
||||
@@ -30,11 +30,11 @@ make
|
||||
|
||||
%install
|
||||
install -Dpm 0750 %{name} %{buildroot}%{_bindir}/%{name}
|
||||
install -Dpm 0600 config.json %{buildroot}%{_sysconfdir}/%{name}/%{name}.json
|
||||
install -Dpm 0600 collectors.json %{buildroot}%{_sysconfdir}/%{name}/collectors.json
|
||||
install -Dpm 0600 sinks.json %{buildroot}%{_sysconfdir}/%{name}/sinks.json
|
||||
install -Dpm 0600 receivers.json %{buildroot}%{_sysconfdir}/%{name}/receivers.json
|
||||
install -Dpm 0600 router.json %{buildroot}%{_sysconfdir}/%{name}/router.json
|
||||
install -Dpm 0600 example-configs/config.json %{buildroot}%{_sysconfdir}/%{name}/%{name}.json
|
||||
install -Dpm 0600 example-configs/collectors.json %{buildroot}%{_sysconfdir}/%{name}/collectors.json
|
||||
install -Dpm 0600 example-configs/sinks.json %{buildroot}%{_sysconfdir}/%{name}/sinks.json
|
||||
install -Dpm 0600 example-configs/receivers.json %{buildroot}%{_sysconfdir}/%{name}/receivers.json
|
||||
install -Dpm 0600 example-configs/router.json %{buildroot}%{_sysconfdir}/%{name}/router.json
|
||||
install -Dpm 0644 scripts/%{name}.service %{buildroot}%{_unitdir}/%{name}.service
|
||||
install -Dpm 0600 scripts/%{name}.config %{buildroot}%{_sysconfdir}/default/%{name}
|
||||
install -Dpm 0644 scripts/%{name}.sysusers %{buildroot}%{_sysusersdir}/%{name}.conf
|
||||
|
||||
Reference in New Issue
Block a user