Compare commits

...

11 Commits

Author SHA1 Message Date
Holger Obermaier
d715f7aa07 Preallocate slices of known length 2026-02-12 15:07:39 +01:00
Holger Obermaier
555ba9504a * Replace fmt.Sprintf("%d", i) by strconv.Itoa(i)
* Correct misspelled words
* Remove unused code
* Break up very long lines into multiple lines
* lp.NewMessage -> lp.NewMetric
2026-02-12 14:31:01 +01:00
Holger Obermaier
309bc32a24 Enable linter: errorlint 2026-02-11 15:59:01 +01:00
Holger Obermaier
0b1f88b8a6 Fixed: interface method AddChannel must have all named params (inamedparam) 2026-02-11 15:55:14 +01:00
Holger Obermaier
3181f81db1 Suggestions from the gocritic linter 2026-02-11 14:56:21 +01:00
Holger Obermaier
18e2518660 Fix derivative values should be float 2026-02-11 14:27:08 +01:00
Holger Obermaier
2cca8d6ac0 Revert wrong use of slices.Delete() 2026-02-11 14:02:47 +01:00
Holger Obermaier
6bfdd6ff17 Wrap errors so that they can be unwrapped 2026-02-11 13:51:58 +01:00
Holger Obermaier
ca95494a83 Use cclog for logging 2026-02-11 12:15:58 +01:00
Holger Obermaier
e512f3255c Use cclog for logging 2026-02-11 12:01:47 +01:00
Holger Obermaier
5b08183d54 Removed unused code 2026-02-11 11:45:08 +01:00
29 changed files with 311 additions and 395 deletions

View File

@@ -49,7 +49,7 @@ jobs:
# Running the linter requires likwid.h, which gets downloaded in the build step # Running the linter requires likwid.h, which gets downloaded in the build step
- name: Static Analysis with GolangCI-Lint and Upload Report with reviewdog - name: Static Analysis with GolangCI-Lint and Upload Report with reviewdog
run: | run: |
golangci-lint run --enable modernize,staticcheck,govet | reviewdog -f=golangci-lint -name "Check golangci-lint on build-latest" -reporter=github-check -filter-mode=nofilter -fail-level none golangci-lint run --enable errorlint,govet,misspell,modernize,prealloc,staticcheck,unconvert,wastedassign | reviewdog -f=golangci-lint -name "Check golangci-lint on build-latest" -reporter=github-check -filter-mode=nofilter -fail-level none
env: env:
REVIEWDOG_GITHUB_API_TOKEN: ${{ secrets.GITHUB_TOKEN }} REVIEWDOG_GITHUB_API_TOKEN: ${{ secrets.GITHUB_TOKEN }}

View File

@@ -50,30 +50,6 @@ type RuntimeConfig struct {
Sync sync.WaitGroup Sync sync.WaitGroup
} }
//// Structure of the configuration file
//type GlobalConfig struct {
// Sink sinks.SinkConfig `json:"sink"`
// Interval int `json:"interval"`
// Duration int `json:"duration"`
// Collectors []string `json:"collectors"`
// Receiver receivers.ReceiverConfig `json:"receiver"`
// DefTags map[string]string `json:"default_tags"`
// CollectConfigs map[string]json.RawMessage `json:"collect_config"`
//}
//// Load JSON configuration file
//func LoadConfiguration(file string, config *GlobalConfig) error {
// configFile, err := os.Open(file)
// defer configFile.Close()
// if err != nil {
// fmt.Println(err.Error())
// return err
// }
// jsonParser := json.NewDecoder(configFile)
// err = jsonParser.Decode(config)
// return err
//}
func ReadCli() map[string]string { func ReadCli() map[string]string {
var m map[string]string var m map[string]string
cfg := flag.String("config", "./config.json", "Path to configuration file") cfg := flag.String("config", "./config.json", "Path to configuration file")
@@ -93,22 +69,6 @@ func ReadCli() map[string]string {
return m return m
} }
//func SetLogging(logfile string) error {
// var file *os.File
// var err error
// if logfile != "stderr" {
// file, err = os.OpenFile(logfile, os.O_APPEND|os.O_CREATE|os.O_WRONLY, 0600)
// if err != nil {
// log.Fatal(err)
// return err
// }
// } else {
// file = os.Stderr
// }
// log.SetOutput(file)
// return nil
//}
// General shutdownHandler function that gets executed in case of interrupt or graceful shutdownHandler // General shutdownHandler function that gets executed in case of interrupt or graceful shutdownHandler
func shutdownHandler(config *RuntimeConfig, shutdownSignal chan os.Signal) { func shutdownHandler(config *RuntimeConfig, shutdownSignal chan os.Signal) {
defer config.Sync.Done() defer config.Sync.Done()
@@ -216,11 +176,6 @@ func mainFunc() int {
return 1 return 1
} }
// Set log file
// if logfile := rcfg.CliArgs["logfile"]; logfile != "stderr" {
// cclog.SetOutput(logfile)
// }
// Create new multi channel ticker // Create new multi channel ticker
rcfg.MultiChanTicker = mct.NewTicker(rcfg.Interval) rcfg.MultiChanTicker = mct.NewTicker(rcfg.Interval)

View File

@@ -78,7 +78,7 @@ func (m *BeegfsMetaCollector) Init(config json.RawMessage) error {
} }
} }
//create map with possible variables // Create map with possible variables
m.matches = make(map[string]string) m.matches = make(map[string]string)
for _, value := range nodeMdstat_array { for _, value := range nodeMdstat_array {
if slices.Contains(m.config.ExcludeMetrics, value) { if slices.Contains(m.config.ExcludeMetrics, value) {
@@ -104,7 +104,7 @@ func (m *BeegfsMetaCollector) Init(config json.RawMessage) error {
// Beegfs file system statistics can only be queried by user root // Beegfs file system statistics can only be queried by user root
user, err := user.Current() user, err := user.Current()
if err != nil { if err != nil {
return fmt.Errorf("BeegfsMetaCollector.Init(): Failed to get current user: %v", err) return fmt.Errorf("BeegfsMetaCollector.Init(): Failed to get current user: %w", err)
} }
if user.Uid != "0" { if user.Uid != "0" {
return fmt.Errorf("BeegfsMetaCollector.Init(): BeeGFS file system statistics can only be queried by user root") return fmt.Errorf("BeegfsMetaCollector.Init(): BeeGFS file system statistics can only be queried by user root")
@@ -113,7 +113,7 @@ func (m *BeegfsMetaCollector) Init(config json.RawMessage) error {
// Check if beegfs-ctl is in executable search path // Check if beegfs-ctl is in executable search path
_, err = exec.LookPath(m.config.Beegfs) _, err = exec.LookPath(m.config.Beegfs)
if err != nil { if err != nil {
return fmt.Errorf("BeegfsMetaCollector.Init(): Failed to find beegfs-ctl binary '%s': %v", m.config.Beegfs, err) return fmt.Errorf("BeegfsMetaCollector.Init(): Failed to find beegfs-ctl binary '%s': %w", m.config.Beegfs, err)
} }
m.init = true m.init = true
return nil return nil
@@ -123,7 +123,7 @@ func (m *BeegfsMetaCollector) Read(interval time.Duration, output chan lp.CCMess
if !m.init { if !m.init {
return return
} }
//get mountpoint // Get mountpoint
buffer, _ := os.ReadFile(string("/proc/mounts")) buffer, _ := os.ReadFile(string("/proc/mounts"))
mounts := strings.Split(string(buffer), "\n") mounts := strings.Split(string(buffer), "\n")
var mountpoints []string var mountpoints []string
@@ -164,12 +164,15 @@ func (m *BeegfsMetaCollector) Read(interval time.Duration, output chan lp.CCMess
cmd.Stderr = cmdStderr cmd.Stderr = cmdStderr
err := cmd.Run() err := cmd.Run()
if err != nil { if err != nil {
fmt.Fprintf(os.Stderr, "BeegfsMetaCollector.Read(): Failed to execute command \"%s\": %s\n", cmd.String(), err.Error()) dataStdErr, _ := io.ReadAll(cmdStderr)
fmt.Fprintf(os.Stderr, "BeegfsMetaCollector.Read(): command exit code: \"%d\"\n", cmd.ProcessState.ExitCode()) dataStdOut, _ := io.ReadAll(cmdStdout)
data, _ := io.ReadAll(cmdStderr) cclog.ComponentError(
fmt.Fprintf(os.Stderr, "BeegfsMetaCollector.Read(): command stderr: \"%s\"\n", string(data)) m.name,
data, _ = io.ReadAll(cmdStdout) fmt.Sprintf("Read(): Failed to execute command \"%s\": %v\n", cmd.String(), err),
fmt.Fprintf(os.Stderr, "BeegfsMetaCollector.Read(): command stdout: \"%s\"\n", string(data)) fmt.Sprintf("Read(): command exit code: \"%d\"\n", cmd.ProcessState.ExitCode()),
fmt.Sprintf("Read(): command stderr: \"%s\"\n", string(dataStdErr)),
fmt.Sprintf("Read(): command stdout: \"%s\"\n", string(dataStdOut)),
)
return return
} }
// Read I/O statistics // Read I/O statistics

View File

@@ -97,7 +97,7 @@ func (m *BeegfsStorageCollector) Init(config json.RawMessage) error {
// Beegfs file system statistics can only be queried by user root // Beegfs file system statistics can only be queried by user root
user, err := user.Current() user, err := user.Current()
if err != nil { if err != nil {
return fmt.Errorf("BeegfsStorageCollector.Init(): Failed to get current user: %v", err) return fmt.Errorf("BeegfsStorageCollector.Init(): Failed to get current user: %w", err)
} }
if user.Uid != "0" { if user.Uid != "0" {
return fmt.Errorf("BeegfsStorageCollector.Init(): BeeGFS file system statistics can only be queried by user root") return fmt.Errorf("BeegfsStorageCollector.Init(): BeeGFS file system statistics can only be queried by user root")
@@ -106,7 +106,7 @@ func (m *BeegfsStorageCollector) Init(config json.RawMessage) error {
// Check if beegfs-ctl is in executable search path // Check if beegfs-ctl is in executable search path
_, err = exec.LookPath(m.config.Beegfs) _, err = exec.LookPath(m.config.Beegfs)
if err != nil { if err != nil {
return fmt.Errorf("BeegfsStorageCollector.Init(): Failed to find beegfs-ctl binary '%s': %v", m.config.Beegfs, err) return fmt.Errorf("BeegfsStorageCollector.Init(): Failed to find beegfs-ctl binary '%s': %w", m.config.Beegfs, err)
} }
m.init = true m.init = true
return nil return nil
@@ -156,12 +156,15 @@ func (m *BeegfsStorageCollector) Read(interval time.Duration, output chan lp.CCM
cmd.Stderr = cmdStderr cmd.Stderr = cmdStderr
err := cmd.Run() err := cmd.Run()
if err != nil { if err != nil {
fmt.Fprintf(os.Stderr, "BeegfsStorageCollector.Read(): Failed to execute command \"%s\": %s\n", cmd.String(), err.Error()) dataStdErr, _ := io.ReadAll(cmdStderr)
fmt.Fprintf(os.Stderr, "BeegfsStorageCollector.Read(): command exit code: \"%d\"\n", cmd.ProcessState.ExitCode()) dataStdOut, _ := io.ReadAll(cmdStdout)
data, _ := io.ReadAll(cmdStderr) cclog.ComponentError(
fmt.Fprintf(os.Stderr, "BeegfsStorageCollector.Read(): command stderr: \"%s\"\n", string(data)) m.name,
data, _ = io.ReadAll(cmdStdout) fmt.Sprintf("Read(): Failed to execute command \"%s\": %v\n", cmd.String(), err),
fmt.Fprintf(os.Stderr, "BeegfsStorageCollector.Read(): command stdout: \"%s\"\n", string(data)) fmt.Sprintf("Read(): command exit code: \"%d\"\n", cmd.ProcessState.ExitCode()),
fmt.Sprintf("Read(): command stderr: \"%s\"\n", string(dataStdErr)),
fmt.Sprintf("Read(): command stdout: \"%s\"\n", string(dataStdOut)),
)
return return
} }
// Read I/O statistics // Read I/O statistics

View File

@@ -35,7 +35,7 @@ type CPUFreqCpuInfoCollector struct {
topology []CPUFreqCpuInfoCollectorTopology topology []CPUFreqCpuInfoCollectorTopology
} }
func (m *CPUFreqCpuInfoCollector) Init(config json.RawMessage) error { func (m *CPUFreqCpuInfoCollector) Init(_ json.RawMessage) error {
// Check if already initialized // Check if already initialized
if m.init { if m.init {
return nil return nil
@@ -55,7 +55,7 @@ func (m *CPUFreqCpuInfoCollector) Init(config json.RawMessage) error {
const cpuInfoFile = "/proc/cpuinfo" const cpuInfoFile = "/proc/cpuinfo"
file, err := os.Open(cpuInfoFile) file, err := os.Open(cpuInfoFile)
if err != nil { if err != nil {
return fmt.Errorf("failed to open file '%s': %v", cpuInfoFile, err) return fmt.Errorf("%s Init(): failed to open file '%s': %w", m.name, cpuInfoFile, err)
} }
// Collect topology information from file cpuinfo // Collect topology information from file cpuinfo
@@ -123,7 +123,7 @@ func (m *CPUFreqCpuInfoCollector) Init(config json.RawMessage) error {
// Check if at least one CPU with frequency information was detected // Check if at least one CPU with frequency information was detected
if len(m.topology) == 0 { if len(m.topology) == 0 {
return fmt.Errorf("no CPU frequency info found in %s", cpuInfoFile) return fmt.Errorf("%s Init(): no CPU frequency info found in %s", m.name, cpuInfoFile)
} }
m.init = true m.init = true

View File

@@ -76,15 +76,15 @@ func (m *CPUFreqCollector) Init(config json.RawMessage) error {
scalingCurFreqFile := filepath.Join("/sys/devices/system/cpu", fmt.Sprintf("cpu%d", c.CpuID), "cpufreq/scaling_cur_freq") scalingCurFreqFile := filepath.Join("/sys/devices/system/cpu", fmt.Sprintf("cpu%d", c.CpuID), "cpufreq/scaling_cur_freq")
err := unix.Access(scalingCurFreqFile, unix.R_OK) err := unix.Access(scalingCurFreqFile, unix.R_OK)
if err != nil { if err != nil {
return fmt.Errorf("unable to access file '%s': %v", scalingCurFreqFile, err) return fmt.Errorf("unable to access file '%s': %w", scalingCurFreqFile, err)
} }
m.topology = append(m.topology, m.topology = append(m.topology,
CPUFreqCollectorTopology{ CPUFreqCollectorTopology{
tagSet: map[string]string{ tagSet: map[string]string{
"type": "hwthread", "type": "hwthread",
"type-id": fmt.Sprint(c.CpuID), "type-id": strconv.Itoa(c.CpuID),
"package_id": fmt.Sprint(c.Socket), "package_id": strconv.Itoa(c.Socket),
}, },
scalingCurFreqFile: scalingCurFreqFile, scalingCurFreqFile: scalingCurFreqFile,
}, },

View File

@@ -108,7 +108,9 @@ func (m *CpustatCollector) Init(config json.RawMessage) error {
} else if strings.HasPrefix(linefields[0], "cpu") && strings.Compare(linefields[0], "cpu") != 0 { } else if strings.HasPrefix(linefields[0], "cpu") && strings.Compare(linefields[0], "cpu") != 0 {
cpustr := strings.TrimLeft(linefields[0], "cpu") cpustr := strings.TrimLeft(linefields[0], "cpu")
cpu, _ := strconv.Atoi(cpustr) cpu, _ := strconv.Atoi(cpustr)
m.cputags[linefields[0]] = map[string]string{"type": "hwthread", "type-id": fmt.Sprintf("%d", cpu)} m.cputags[linefields[0]] = map[string]string{
"type": "hwthread",
"type-id": strconv.Itoa(cpu)}
m.olddata[linefields[0]] = make(map[string]int64) m.olddata[linefields[0]] = make(map[string]int64)
for k, v := range m.matches { for k, v := range m.matches {
m.olddata[linefields[0]][k], _ = strconv.ParseInt(linefields[v], 0, 64) m.olddata[linefields[0]][k], _ = strconv.ParseInt(linefields[v], 0, 64)
@@ -191,7 +193,7 @@ func (m *CpustatCollector) Read(interval time.Duration, output chan lp.CCMessage
num_cpus_metric, err := lp.NewMessage("num_cpus", num_cpus_metric, err := lp.NewMessage("num_cpus",
m.nodetags, m.nodetags,
m.meta, m.meta,
map[string]any{"value": int(num_cpus)}, map[string]any{"value": num_cpus},
now, now,
) )
if err == nil { if err == nil {

View File

@@ -18,6 +18,7 @@ import (
"strings" "strings"
"time" "time"
cclog "github.com/ClusterCockpit/cc-lib/v2/ccLogger"
lp "github.com/ClusterCockpit/cc-lib/v2/ccMessage" lp "github.com/ClusterCockpit/cc-lib/v2/ccMessage"
influx "github.com/influxdata/line-protocol" influx "github.com/influxdata/line-protocol"
) )
@@ -43,12 +44,14 @@ func (m *CustomCmdCollector) Init(config json.RawMessage) error {
var err error var err error
m.name = "CustomCmdCollector" m.name = "CustomCmdCollector"
m.parallel = true m.parallel = true
m.meta = map[string]string{"source": m.name, "group": "Custom"} m.meta = map[string]string{
"source": m.name,
"group": "Custom",
}
if len(config) > 0 { if len(config) > 0 {
err = json.Unmarshal(config, &m.config) err = json.Unmarshal(config, &m.config)
if err != nil { if err != nil {
log.Print(err.Error()) return fmt.Errorf("%s Init(): json.Unmarshal() call failed: %w", m.name, err)
return err
} }
} }
if err := m.setup(); err != nil { if err := m.setup(); err != nil {
@@ -57,13 +60,15 @@ func (m *CustomCmdCollector) Init(config json.RawMessage) error {
for _, c := range m.config.Commands { for _, c := range m.config.Commands {
cmdfields := strings.Fields(c) cmdfields := strings.Fields(c)
command := exec.Command(cmdfields[0], cmdfields[1:]...) command := exec.Command(cmdfields[0], cmdfields[1:]...)
if err := command.Wait(); err != nil {
log.Print(err)
continue
}
_, err = command.Output() _, err = command.Output()
if err == nil { if err == nil {
m.commands = append(m.commands, c) m.commands = append(m.commands, c)
} else {
cclog.ComponentWarn(
m.name,
fmt.Sprintf("%s Init(): Execution of command \"%s\" failed: %v", m.name, command.String(), err),
)
continue
} }
} }
for _, f := range m.config.Files { for _, f := range m.config.Files {
@@ -71,7 +76,10 @@ func (m *CustomCmdCollector) Init(config json.RawMessage) error {
if err == nil { if err == nil {
m.files = append(m.files, f) m.files = append(m.files, f)
} else { } else {
log.Print(err.Error()) cclog.ComponentWarn(
m.name,
fmt.Sprintf("%s Init(): Reading of file \"%s\" failed: %v", m.name, f, err),
)
continue continue
} }
} }

View File

@@ -124,7 +124,13 @@ mountLoop:
tags := map[string]string{"type": "node", "device": linefields[0]} tags := map[string]string{"type": "node", "device": linefields[0]}
total := (stat.Blocks * uint64(stat.Bsize)) / uint64(1000000000) total := (stat.Blocks * uint64(stat.Bsize)) / uint64(1000000000)
if m.allowedMetrics["disk_total"] { if m.allowedMetrics["disk_total"] {
y, err := lp.NewMessage("disk_total", tags, m.meta, map[string]any{"value": total}, time.Now()) y, err := lp.NewMessage(
"disk_total",
tags,
m.meta,
map[string]any{
"value": total},
time.Now())
if err == nil { if err == nil {
y.AddMeta("unit", "GBytes") y.AddMeta("unit", "GBytes")
output <- y output <- y
@@ -132,7 +138,13 @@ mountLoop:
} }
free := (stat.Bfree * uint64(stat.Bsize)) / uint64(1000000000) free := (stat.Bfree * uint64(stat.Bsize)) / uint64(1000000000)
if m.allowedMetrics["disk_free"] { if m.allowedMetrics["disk_free"] {
y, err := lp.NewMessage("disk_free", tags, m.meta, map[string]any{"value": free}, time.Now()) y, err := lp.NewMessage(
"disk_free",
tags,
m.meta,
map[string]any{
"value": free},
time.Now())
if err == nil { if err == nil {
y.AddMeta("unit", "GBytes") y.AddMeta("unit", "GBytes")
output <- y output <- y
@@ -146,7 +158,14 @@ mountLoop:
} }
} }
if m.allowedMetrics["part_max_used"] { if m.allowedMetrics["part_max_used"] {
y, err := lp.NewMessage("part_max_used", map[string]string{"type": "node"}, m.meta, map[string]any{"value": int(part_max_used)}, time.Now()) y, err := lp.NewMessage(
"part_max_used",
map[string]string{
"type": "node"},
m.meta,
map[string]any{
"value": int(part_max_used)},
time.Now())
if err == nil { if err == nil {
y.AddMeta("unit", "percent") y.AddMeta("unit", "percent")
output <- y output <- y

View File

@@ -14,7 +14,6 @@ import (
"errors" "errors"
"fmt" "fmt"
"io" "io"
"log"
"os/exec" "os/exec"
"os/user" "os/user"
"slices" "slices"
@@ -324,8 +323,7 @@ func (m *GpfsCollector) Init(config json.RawMessage) error {
if len(config) > 0 { if len(config) > 0 {
err := json.Unmarshal(config, &m.config) err := json.Unmarshal(config, &m.config)
if err != nil { if err != nil {
log.Print(err.Error()) return fmt.Errorf("%s Init(): failed to unmarshal JSON config: %w", m.name, err)
return err
} }
} }
m.meta = map[string]string{ m.meta = map[string]string{
@@ -366,7 +364,7 @@ func (m *GpfsCollector) Init(config json.RawMessage) error {
// when using sudo, the full path of mmpmon must be specified because // when using sudo, the full path of mmpmon must be specified because
// exec.LookPath will not work as mmpmon is not executable as user // exec.LookPath will not work as mmpmon is not executable as user
if m.config.Sudo && !strings.HasPrefix(m.config.Mmpmon, "/") { if m.config.Sudo && !strings.HasPrefix(m.config.Mmpmon, "/") {
return fmt.Errorf("when using sudo, mmpmon_path must be provided and an absolute path: %s", m.config.Mmpmon) return fmt.Errorf("%s Init(): when using sudo, mmpmon_path must be provided and an absolute path: %s", m.name, m.config.Mmpmon)
} }
// Check if mmpmon is in executable search path // Check if mmpmon is in executable search path
@@ -379,7 +377,7 @@ func (m *GpfsCollector) Init(config json.RawMessage) error {
p = m.config.Mmpmon p = m.config.Mmpmon
} else { } else {
cclog.ComponentError(m.name, fmt.Sprintf("failed to find mmpmon binary '%s': %v", m.config.Mmpmon, err)) cclog.ComponentError(m.name, fmt.Sprintf("failed to find mmpmon binary '%s': %v", m.config.Mmpmon, err))
return fmt.Errorf("failed to find mmpmon binary '%s': %v", m.config.Mmpmon, err) return fmt.Errorf("%s Init(): failed to find mmpmon binary '%s': %w", m.name, m.config.Mmpmon, err)
} }
} }
m.config.Mmpmon = p m.config.Mmpmon = p
@@ -564,30 +562,30 @@ func (m *GpfsCollector) Read(interval time.Duration, output chan lp.CCMessage) {
// compute total metrics (map[...] will return 0 if key not found) // compute total metrics (map[...] will return 0 if key not found)
// bytes read and written // bytes read and written
if br, br_ok := newstate["_br_"]; br_ok { if br, br_ok := newstate["_br_"]; br_ok {
newstate["bytesTotal"] = newstate["bytesTotal"] + br newstate["bytesTotal"] += br
} }
if bw, bw_ok := newstate["_bw_"]; bw_ok { if bw, bw_ok := newstate["_bw_"]; bw_ok {
newstate["bytesTotal"] = newstate["bytesTotal"] + bw newstate["bytesTotal"] += bw
} }
// read and write count // read and write count
if rdc, rdc_ok := newstate["_rdc_"]; rdc_ok { if rdc, rdc_ok := newstate["_rdc_"]; rdc_ok {
newstate["iops"] = newstate["iops"] + rdc newstate["iops"] += rdc
} }
if wc, wc_ok := newstate["_wc_"]; wc_ok { if wc, wc_ok := newstate["_wc_"]; wc_ok {
newstate["iops"] = newstate["iops"] + wc newstate["iops"] += wc
} }
// meta operations // meta operations
if oc, oc_ok := newstate["_oc_"]; oc_ok { if oc, oc_ok := newstate["_oc_"]; oc_ok {
newstate["metaops"] = newstate["metaops"] + oc newstate["metaops"] += oc
} }
if cc, cc_ok := newstate["_cc_"]; cc_ok { if cc, cc_ok := newstate["_cc_"]; cc_ok {
newstate["metaops"] = newstate["metaops"] + cc newstate["metaops"] += cc
} }
if dir, dir_ok := newstate["_dir_"]; dir_ok { if dir, dir_ok := newstate["_dir_"]; dir_ok {
newstate["metaops"] = newstate["metaops"] + dir newstate["metaops"] += dir
} }
if iu, iu_ok := newstate["_iu_"]; iu_ok { if iu, iu_ok := newstate["_iu_"]; iu_ok {
newstate["metaops"] = newstate["metaops"] + iu newstate["metaops"] += iu
} }
// send desired metrics for this filesystem // send desired metrics for this filesystem
for _, metric := range m.definitions { for _, metric := range m.definitions {
@@ -620,13 +618,13 @@ func (m *GpfsCollector) Read(interval time.Duration, output chan lp.CCMessage) {
case "derivative": case "derivative":
if vnew_ok && vold_ok && timeDiff > 0 { if vnew_ok && vold_ok && timeDiff > 0 {
value = float64(vnew-vold) / timeDiff value = float64(vnew-vold) / timeDiff
if value.(float64) < 0 { if value.(float64) < 0.0 {
value = 0 value = 0.0
} }
value_ok = true value_ok = true
} else if vold_ok { } else if vold_ok {
// if the difference is not computable, return 0 // if the difference is not computable, return 0
value = 0 value = 0.0
value_ok = true value_ok = true
} }
} }

View File

@@ -90,10 +90,10 @@ func (m *InfinibandCollector) Init(config json.RawMessage) error {
globPattern := filepath.Join(IB_BASEPATH, "*", "ports", "*") globPattern := filepath.Join(IB_BASEPATH, "*", "ports", "*")
ibDirs, err := filepath.Glob(globPattern) ibDirs, err := filepath.Glob(globPattern)
if err != nil { if err != nil {
return fmt.Errorf("unable to glob files with pattern %s: %v", globPattern, err) return fmt.Errorf("%s Init(): unable to glob files with pattern %s: %w", m.name, globPattern, err)
} }
if ibDirs == nil { if ibDirs == nil {
return fmt.Errorf("unable to find any directories with pattern %s", globPattern) return fmt.Errorf("%s Init(): unable to find any directories with pattern %s", m.name, globPattern)
} }
for _, path := range ibDirs { for _, path := range ibDirs {
@@ -157,7 +157,7 @@ func (m *InfinibandCollector) Init(config json.RawMessage) error {
for _, counter := range portCounterFiles { for _, counter := range portCounterFiles {
err := unix.Access(counter.path, unix.R_OK) err := unix.Access(counter.path, unix.R_OK)
if err != nil { if err != nil {
return fmt.Errorf("unable to access %s: %v", counter.path, err) return fmt.Errorf("%s Init(): unable to access %s: %w", m.name, counter.path, err)
} }
} }
@@ -177,7 +177,7 @@ func (m *InfinibandCollector) Init(config json.RawMessage) error {
} }
if len(m.info) == 0 { if len(m.info) == 0 {
return fmt.Errorf("found no IB devices") return fmt.Errorf("%s Init(): found no IB devices", m.name)
} }
m.init = true m.init = true

View File

@@ -24,7 +24,7 @@ import (
"os" "os"
"os/signal" "os/signal"
"os/user" "os/user"
"sort" "slices"
"strconv" "strconv"
"strings" "strings"
"sync" "sync"
@@ -125,22 +125,14 @@ func checkMetricType(t string) bool {
return ok return ok
} }
func eventsToEventStr(events map[string]string) string {
elist := make([]string, 0)
for k, v := range events {
elist = append(elist, fmt.Sprintf("%s:%s", v, k))
}
return strings.Join(elist, ",")
}
func genLikwidEventSet(input LikwidCollectorEventsetConfig) LikwidEventsetConfig { func genLikwidEventSet(input LikwidCollectorEventsetConfig) LikwidEventsetConfig {
tmplist := make([]string, 0) clist := make([]string, 0, len(input.Events))
clist := make([]string, 0)
for k := range input.Events { for k := range input.Events {
clist = append(clist, k) clist = append(clist, k)
} }
sort.Strings(clist) slices.Sort(clist)
elist := make([]*C.char, 0) tmplist := make([]string, 0, len(clist))
elist := make([]*C.char, 0, len(clist))
for _, k := range clist { for _, k := range clist {
v := input.Events[k] v := input.Events[k]
tmplist = append(tmplist, fmt.Sprintf("%s:%s", v, k)) tmplist = append(tmplist, fmt.Sprintf("%s:%s", v, k))
@@ -217,7 +209,7 @@ func (m *LikwidCollector) Init(config json.RawMessage) error {
if len(config) > 0 { if len(config) > 0 {
err := json.Unmarshal(config, &m.config) err := json.Unmarshal(config, &m.config)
if err != nil { if err != nil {
return err return fmt.Errorf("%s Init(): failed to unmarshal JSON config: %w", m.name, err)
} }
} }
lib := dl.New(m.config.LibraryPath, LIKWID_LIB_DL_FLAGS) lib := dl.New(m.config.LibraryPath, LIKWID_LIB_DL_FLAGS)
@@ -226,13 +218,13 @@ func (m *LikwidCollector) Init(config json.RawMessage) error {
} }
err := lib.Open() err := lib.Open()
if err != nil { if err != nil {
return fmt.Errorf("error opening %s: %v", m.config.LibraryPath, err) return fmt.Errorf("error opening %s: %w", m.config.LibraryPath, err)
} }
if m.config.ForceOverwrite { if m.config.ForceOverwrite {
cclog.ComponentDebug(m.name, "Set LIKWID_FORCE=1") cclog.ComponentDebug(m.name, "Set LIKWID_FORCE=1")
if err := os.Setenv("LIKWID_FORCE", "1"); err != nil { if err := os.Setenv("LIKWID_FORCE", "1"); err != nil {
return fmt.Errorf("error setting environment variable LIKWID_FORCE=1: %v", err) return fmt.Errorf("error setting environment variable LIKWID_FORCE=1: %w", err)
} }
} }
if err := m.setup(); err != nil { if err := m.setup(); err != nil {
@@ -327,7 +319,7 @@ func (m *LikwidCollector) Init(config json.RawMessage) error {
p = m.config.DaemonPath p = m.config.DaemonPath
} }
if err := os.Setenv("PATH", p); err != nil { if err := os.Setenv("PATH", p); err != nil {
return fmt.Errorf("error setting environment variable PATH=%s: %v", p, err) return fmt.Errorf("error setting environment variable PATH=%s: %w", p, err)
} }
} }
C.HPMmode(1) C.HPMmode(1)
@@ -381,7 +373,6 @@ func (m *LikwidCollector) Init(config json.RawMessage) error {
// take a measurement for 'interval' seconds of event set index 'group' // take a measurement for 'interval' seconds of event set index 'group'
func (m *LikwidCollector) takeMeasurement(evidx int, evset LikwidEventsetConfig, interval time.Duration) (bool, error) { func (m *LikwidCollector) takeMeasurement(evidx int, evset LikwidEventsetConfig, interval time.Duration) (bool, error) {
var ret C.int var ret C.int
var gid C.int = -1
sigchan := make(chan os.Signal, 1) sigchan := make(chan os.Signal, 1)
// Watch changes for the lock file () // Watch changes for the lock file ()
@@ -406,10 +397,10 @@ func (m *LikwidCollector) takeMeasurement(evidx int, evset LikwidEventsetConfig,
// Create the lock file if it does not exist // Create the lock file if it does not exist
file, createErr := os.Create(m.config.LockfilePath) file, createErr := os.Create(m.config.LockfilePath)
if createErr != nil { if createErr != nil {
return true, fmt.Errorf("failed to create lock file: %v", createErr) return true, fmt.Errorf("failed to create lock file: %w", createErr)
} }
if err := file.Close(); err != nil { if err := file.Close(); err != nil {
return true, fmt.Errorf("failed to close lock file: %v", err) return true, fmt.Errorf("failed to close lock file: %w", err)
} }
info, err = os.Stat(m.config.LockfilePath) // Recheck the file after creation info, err = os.Stat(m.config.LockfilePath) // Recheck the file after creation
} }
@@ -462,6 +453,7 @@ func (m *LikwidCollector) takeMeasurement(evidx int, evset LikwidEventsetConfig,
signal.Notify(sigchan, syscall.SIGCHLD) signal.Notify(sigchan, syscall.SIGCHLD)
// Add an event string to LIKWID // Add an event string to LIKWID
var gid C.int
select { select {
case <-sigchan: case <-sigchan:
gid = -1 gid = -1
@@ -631,7 +623,7 @@ func (m *LikwidCollector) calcEventsetMetrics(evset LikwidEventsetConfig, interv
) )
if err == nil { if err == nil {
if metric.Type != "node" { if metric.Type != "node" {
y.AddTag("type-id", fmt.Sprintf("%d", domain)) y.AddTag("type-id", strconv.Itoa(domain))
} }
if len(metric.Unit) > 0 { if len(metric.Unit) > 0 {
y.AddMeta("unit", metric.Unit) y.AddMeta("unit", metric.Unit)
@@ -661,7 +653,7 @@ func (m *LikwidCollector) calcEventsetMetrics(evset LikwidEventsetConfig, interv
metric.Name, metric.Name,
map[string]string{ map[string]string{
"type": "core", "type": "core",
"type-id": fmt.Sprintf("%d", coreID), "type-id": strconv.Itoa(coreID),
}, },
m.meta, m.meta,
map[string]any{ map[string]any{
@@ -698,7 +690,7 @@ func (m *LikwidCollector) calcEventsetMetrics(evset LikwidEventsetConfig, interv
metric.Name, metric.Name,
map[string]string{ map[string]string{
"type": "socket", "type": "socket",
"type-id": fmt.Sprintf("%d", socketID), "type-id": strconv.Itoa(socketID),
}, },
m.meta, m.meta,
map[string]any{ map[string]any{
@@ -800,7 +792,7 @@ func (m *LikwidCollector) calcGlobalMetrics(groups []LikwidEventsetConfig, inter
) )
if err == nil { if err == nil {
if metric.Type != "node" { if metric.Type != "node" {
y.AddTag("type-id", fmt.Sprintf("%d", domain)) y.AddTag("type-id", strconv.Itoa(domain))
} }
if len(metric.Unit) > 0 { if len(metric.Unit) > 0 {
y.AddMeta("unit", metric.Unit) y.AddMeta("unit", metric.Unit)
@@ -816,7 +808,7 @@ func (m *LikwidCollector) calcGlobalMetrics(groups []LikwidEventsetConfig, inter
} }
func (m *LikwidCollector) ReadThread(interval time.Duration, output chan lp.CCMessage) { func (m *LikwidCollector) ReadThread(interval time.Duration, output chan lp.CCMessage) {
var err error = nil var err error
groups := make([]LikwidEventsetConfig, 0) groups := make([]LikwidEventsetConfig, 0)
for evidx, evset := range m.config.Eventsets { for evidx, evset := range m.config.Eventsets {

View File

@@ -159,7 +159,7 @@ func (m *MemstatCollector) Init(config json.RawMessage) error {
file: f, file: f,
tags: map[string]string{ tags: map[string]string{
"type": "memoryDomain", "type": "memoryDomain",
"type-id": fmt.Sprintf("%d", id), "type-id": strconv.Itoa(id),
}, },
} }
m.nodefiles[id] = f m.nodefiles[id] = f

View File

@@ -10,7 +10,6 @@ package collectors
import ( import (
"encoding/json" "encoding/json"
"fmt" "fmt"
"log"
"slices" "slices"
// "os" // "os"
@@ -49,7 +48,7 @@ func (m *nfsCollector) initStats() error {
// Wait for cmd end // Wait for cmd end
if err := cmd.Wait(); err != nil { if err := cmd.Wait(); err != nil {
return fmt.Errorf("initStats(): %w", err) return fmt.Errorf("%s initStats(): %w", m.name, err)
} }
buffer, err := cmd.Output() buffer, err := cmd.Output()
@@ -81,7 +80,7 @@ func (m *nfsCollector) updateStats() error {
// Wait for cmd end // Wait for cmd end
if err := cmd.Wait(); err != nil { if err := cmd.Wait(); err != nil {
return fmt.Errorf("updateStats(): %w", err) return fmt.Errorf("%s updateStats(): %w", m.name, err)
} }
buffer, err := cmd.Output() buffer, err := cmd.Output()
@@ -114,8 +113,7 @@ func (m *nfsCollector) MainInit(config json.RawMessage) error {
if len(config) > 0 { if len(config) > 0 {
err := json.Unmarshal(config, &m.config) err := json.Unmarshal(config, &m.config)
if err != nil { if err != nil {
log.Print(err.Error()) return fmt.Errorf("%s Init(): failed to unmarshal JSON config: %w", m.name, err)
return err
} }
} }
m.meta = map[string]string{ m.meta = map[string]string{
@@ -128,11 +126,11 @@ func (m *nfsCollector) MainInit(config json.RawMessage) error {
// Check if nfsstat is in executable search path // Check if nfsstat is in executable search path
_, err := exec.LookPath(m.config.Nfsstats) _, err := exec.LookPath(m.config.Nfsstats)
if err != nil { if err != nil {
return fmt.Errorf("NfsCollector.Init(): Failed to find nfsstat binary '%s': %v", m.config.Nfsstats, err) return fmt.Errorf("%s Init(): Failed to find nfsstat binary '%s': %w", m.name, m.config.Nfsstats, err)
} }
m.data = make(map[string]NfsCollectorData) m.data = make(map[string]NfsCollectorData)
if err := m.initStats(); err != nil { if err := m.initStats(); err != nil {
return fmt.Errorf("NfsCollector.Init(): %w", err) return fmt.Errorf("%s Init(): %w", m.name, err)
} }
m.init = true m.init = true
m.parallel = true m.parallel = true
@@ -152,7 +150,7 @@ func (m *nfsCollector) Read(interval time.Duration, output chan lp.CCMessage) {
) )
return return
} }
prefix := "" var prefix string
switch m.version { switch m.version {
case "v3": case "v3":
prefix = "nfs3" prefix = "nfs3"

View File

@@ -143,7 +143,13 @@ func (m *NfsIOStatCollector) Read(interval time.Duration, output chan lp.CCMessa
if old, ok := m.data[mntpoint]; ok { if old, ok := m.data[mntpoint]; ok {
for name, newVal := range values { for name, newVal := range values {
if m.config.SendAbsoluteValues { if m.config.SendAbsoluteValues {
msg, err := lp.NewMessage(fmt.Sprintf("nfsio_%s", name), m.tags, m.meta, map[string]any{"value": newVal}, now) msg, err := lp.NewMessage(
"nfsio_"+name,
m.tags,
m.meta,
map[string]any{
"value": newVal},
now)
if err == nil { if err == nil {
msg.AddTag("stype", "filesystem") msg.AddTag("stype", "filesystem")
msg.AddTag("stype-id", mntpoint) msg.AddTag("stype-id", mntpoint)

View File

@@ -84,7 +84,7 @@ func (m *NUMAStatsCollector) Init(config json.RawMessage) error {
if len(config) > 0 { if len(config) > 0 {
err := json.Unmarshal(config, &m.config) err := json.Unmarshal(config, &m.config)
if err != nil { if err != nil {
return fmt.Errorf("unable to unmarshal numastat configuration: %s", err.Error()) return fmt.Errorf("%s Init(): unable to unmarshal numastat configuration: %w", m.name, err)
} }
} }
@@ -93,10 +93,10 @@ func (m *NUMAStatsCollector) Init(config json.RawMessage) error {
globPattern := base + "[0-9]*" globPattern := base + "[0-9]*"
dirs, err := filepath.Glob(globPattern) dirs, err := filepath.Glob(globPattern)
if err != nil { if err != nil {
return fmt.Errorf("unable to glob files with pattern '%s'", globPattern) return fmt.Errorf("%s Init(): unable to glob files with pattern '%s'", m.name, globPattern)
} }
if dirs == nil { if dirs == nil {
return fmt.Errorf("unable to find any files with pattern '%s'", globPattern) return fmt.Errorf("%s Init(): unable to find any files with pattern '%s'", m.name, globPattern)
} }
m.topology = make([]NUMAStatsCollectorTopolgy, 0, len(dirs)) m.topology = make([]NUMAStatsCollectorTopolgy, 0, len(dirs))
for _, dir := range dirs { for _, dir := range dirs {

View File

@@ -14,6 +14,7 @@ import (
"log" "log"
"maps" "maps"
"slices" "slices"
"strconv"
"strings" "strings"
"time" "time"
@@ -112,7 +113,7 @@ func (m *NvidiaCollector) Init(config json.RawMessage) error {
for i := range num_gpus { for i := range num_gpus {
// Skip excluded devices by ID // Skip excluded devices by ID
str_i := fmt.Sprintf("%d", i) str_i := strconv.Itoa(i)
if slices.Contains(m.config.ExcludeDevices, str_i) { if slices.Contains(m.config.ExcludeDevices, str_i) {
cclog.ComponentDebug(m.name, "Skipping excluded device", str_i) cclog.ComponentDebug(m.name, "Skipping excluded device", str_i)
continue continue
@@ -239,7 +240,7 @@ func readMemoryInfo(device *NvidiaCollectorDevice, output chan lp.CCMessage) err
if !device.excludeMetrics["nv_fb_mem_total"] { if !device.excludeMetrics["nv_fb_mem_total"] {
t := float64(total) / (1024 * 1024) t := float64(total) / (1024 * 1024)
y, err := lp.NewMessage("nv_fb_mem_total", device.tags, device.meta, map[string]any{"value": t}, time.Now()) y, err := lp.NewMetric("nv_fb_mem_total", device.tags, device.meta, t, time.Now())
if err == nil { if err == nil {
y.AddMeta("unit", "MByte") y.AddMeta("unit", "MByte")
output <- y output <- y
@@ -248,7 +249,7 @@ func readMemoryInfo(device *NvidiaCollectorDevice, output chan lp.CCMessage) err
if !device.excludeMetrics["nv_fb_mem_used"] { if !device.excludeMetrics["nv_fb_mem_used"] {
f := float64(used) / (1024 * 1024) f := float64(used) / (1024 * 1024)
y, err := lp.NewMessage("nv_fb_mem_used", device.tags, device.meta, map[string]any{"value": f}, time.Now()) y, err := lp.NewMetric("nv_fb_mem_used", device.tags, device.meta, f, time.Now())
if err == nil { if err == nil {
y.AddMeta("unit", "MByte") y.AddMeta("unit", "MByte")
output <- y output <- y
@@ -257,7 +258,7 @@ func readMemoryInfo(device *NvidiaCollectorDevice, output chan lp.CCMessage) err
if v2 && !device.excludeMetrics["nv_fb_mem_reserved"] { if v2 && !device.excludeMetrics["nv_fb_mem_reserved"] {
r := float64(reserved) / (1024 * 1024) r := float64(reserved) / (1024 * 1024)
y, err := lp.NewMessage("nv_fb_mem_reserved", device.tags, device.meta, map[string]any{"value": r}, time.Now()) y, err := lp.NewMetric("nv_fb_mem_reserved", device.tags, device.meta, r, time.Now())
if err == nil { if err == nil {
y.AddMeta("unit", "MByte") y.AddMeta("unit", "MByte")
output <- y output <- y
@@ -276,7 +277,7 @@ func readBarMemoryInfo(device *NvidiaCollectorDevice, output chan lp.CCMessage)
} }
if !device.excludeMetrics["nv_bar1_mem_total"] { if !device.excludeMetrics["nv_bar1_mem_total"] {
t := float64(meminfo.Bar1Total) / (1024 * 1024) t := float64(meminfo.Bar1Total) / (1024 * 1024)
y, err := lp.NewMessage("nv_bar1_mem_total", device.tags, device.meta, map[string]any{"value": t}, time.Now()) y, err := lp.NewMetric("nv_bar1_mem_total", device.tags, device.meta, t, time.Now())
if err == nil { if err == nil {
y.AddMeta("unit", "MByte") y.AddMeta("unit", "MByte")
output <- y output <- y
@@ -284,7 +285,7 @@ func readBarMemoryInfo(device *NvidiaCollectorDevice, output chan lp.CCMessage)
} }
if !device.excludeMetrics["nv_bar1_mem_used"] { if !device.excludeMetrics["nv_bar1_mem_used"] {
t := float64(meminfo.Bar1Used) / (1024 * 1024) t := float64(meminfo.Bar1Used) / (1024 * 1024)
y, err := lp.NewMessage("nv_bar1_mem_used", device.tags, device.meta, map[string]any{"value": t}, time.Now()) y, err := lp.NewMetric("nv_bar1_mem_used", device.tags, device.meta, t, time.Now())
if err == nil { if err == nil {
y.AddMeta("unit", "MByte") y.AddMeta("unit", "MByte")
output <- y output <- y
@@ -318,14 +319,14 @@ func readUtilization(device *NvidiaCollectorDevice, output chan lp.CCMessage) er
util, ret := nvml.DeviceGetUtilizationRates(device.device) util, ret := nvml.DeviceGetUtilizationRates(device.device)
if ret == nvml.SUCCESS { if ret == nvml.SUCCESS {
if !device.excludeMetrics["nv_util"] { if !device.excludeMetrics["nv_util"] {
y, err := lp.NewMessage("nv_util", device.tags, device.meta, map[string]any{"value": float64(util.Gpu)}, time.Now()) y, err := lp.NewMetric("nv_util", device.tags, device.meta, float64(util.Gpu), time.Now())
if err == nil { if err == nil {
y.AddMeta("unit", "%") y.AddMeta("unit", "%")
output <- y output <- y
} }
} }
if !device.excludeMetrics["nv_mem_util"] { if !device.excludeMetrics["nv_mem_util"] {
y, err := lp.NewMessage("nv_mem_util", device.tags, device.meta, map[string]any{"value": float64(util.Memory)}, time.Now()) y, err := lp.NewMetric("nv_mem_util", device.tags, device.meta, float64(util.Memory), time.Now())
if err == nil { if err == nil {
y.AddMeta("unit", "%") y.AddMeta("unit", "%")
output <- y output <- y
@@ -345,7 +346,7 @@ func readTemp(device *NvidiaCollectorDevice, output chan lp.CCMessage) error {
// * NVML_TEMPERATURE_COUNT // * NVML_TEMPERATURE_COUNT
temp, ret := nvml.DeviceGetTemperature(device.device, nvml.TEMPERATURE_GPU) temp, ret := nvml.DeviceGetTemperature(device.device, nvml.TEMPERATURE_GPU)
if ret == nvml.SUCCESS { if ret == nvml.SUCCESS {
y, err := lp.NewMessage("nv_temp", device.tags, device.meta, map[string]any{"value": float64(temp)}, time.Now()) y, err := lp.NewMetric("nv_temp", device.tags, device.meta, float64(temp), time.Now())
if err == nil { if err == nil {
y.AddMeta("unit", "degC") y.AddMeta("unit", "degC")
output <- y output <- y
@@ -368,7 +369,7 @@ func readFan(device *NvidiaCollectorDevice, output chan lp.CCMessage) error {
// This value may exceed 100% in certain cases. // This value may exceed 100% in certain cases.
fan, ret := nvml.DeviceGetFanSpeed(device.device) fan, ret := nvml.DeviceGetFanSpeed(device.device)
if ret == nvml.SUCCESS { if ret == nvml.SUCCESS {
y, err := lp.NewMessage("nv_fan", device.tags, device.meta, map[string]any{"value": float64(fan)}, time.Now()) y, err := lp.NewMetric("nv_fan", device.tags, device.meta, float64(fan), time.Now())
if err == nil { if err == nil {
y.AddMeta("unit", "%") y.AddMeta("unit", "%")
output <- y output <- y
@@ -378,27 +379,6 @@ func readFan(device *NvidiaCollectorDevice, output chan lp.CCMessage) error {
return nil return nil
} }
// func readFans(device *NvidiaCollectorDevice, output chan lp.CCMessage) error {
// if !device.excludeMetrics["nv_fan"] {
// numFans, ret := nvml.DeviceGetNumFans(device.device)
// if ret == nvml.SUCCESS {
// for i := 0; i < numFans; i++ {
// fan, ret := nvml.DeviceGetFanSpeed_v2(device.device, i)
// if ret == nvml.SUCCESS {
// y, err := lp.NewMessage("nv_fan", device.tags, device.meta, map[string]interface{}{"value": float64(fan)}, time.Now())
// if err == nil {
// y.AddMeta("unit", "%")
// y.AddTag("stype", "fan")
// y.AddTag("stype-id", fmt.Sprintf("%d", i))
// output <- y
// }
// }
// }
// }
// }
// return nil
// }
func readEccMode(device *NvidiaCollectorDevice, output chan lp.CCMessage) error { func readEccMode(device *NvidiaCollectorDevice, output chan lp.CCMessage) error {
if !device.excludeMetrics["nv_ecc_mode"] { if !device.excludeMetrics["nv_ecc_mode"] {
// Retrieves the current and pending ECC modes for the device. // Retrieves the current and pending ECC modes for the device.
@@ -415,17 +395,17 @@ func readEccMode(device *NvidiaCollectorDevice, output chan lp.CCMessage) error
var err error var err error
switch ecc_pend { switch ecc_pend {
case nvml.FEATURE_DISABLED: case nvml.FEATURE_DISABLED:
y, err = lp.NewMessage("nv_ecc_mode", device.tags, device.meta, map[string]any{"value": "OFF"}, time.Now()) y, err = lp.NewMetric("nv_ecc_mode", device.tags, device.meta, "OFF", time.Now())
case nvml.FEATURE_ENABLED: case nvml.FEATURE_ENABLED:
y, err = lp.NewMessage("nv_ecc_mode", device.tags, device.meta, map[string]any{"value": "ON"}, time.Now()) y, err = lp.NewMetric("nv_ecc_mode", device.tags, device.meta, "ON", time.Now())
default: default:
y, err = lp.NewMessage("nv_ecc_mode", device.tags, device.meta, map[string]any{"value": "UNKNOWN"}, time.Now()) y, err = lp.NewMetric("nv_ecc_mode", device.tags, device.meta, "UNKNOWN", time.Now())
} }
if err == nil { if err == nil {
output <- y output <- y
} }
case nvml.ERROR_NOT_SUPPORTED: case nvml.ERROR_NOT_SUPPORTED:
y, err := lp.NewMessage("nv_ecc_mode", device.tags, device.meta, map[string]any{"value": "N/A"}, time.Now()) y, err := lp.NewMetric("nv_ecc_mode", device.tags, device.meta, "N/A", time.Now())
if err == nil { if err == nil {
output <- y output <- y
} }
@@ -445,7 +425,7 @@ func readPerfState(device *NvidiaCollectorDevice, output chan lp.CCMessage) erro
// 32: Unknown performance state. // 32: Unknown performance state.
pState, ret := nvml.DeviceGetPerformanceState(device.device) pState, ret := nvml.DeviceGetPerformanceState(device.device)
if ret == nvml.SUCCESS { if ret == nvml.SUCCESS {
y, err := lp.NewMessage("nv_perf_state", device.tags, device.meta, map[string]any{"value": fmt.Sprintf("P%d", int(pState))}, time.Now()) y, err := lp.NewMetric("nv_perf_state", device.tags, device.meta, fmt.Sprintf("P%d", int(pState)), time.Now())
if err == nil { if err == nil {
output <- y output <- y
} }
@@ -471,7 +451,7 @@ func readPowerUsage(device *NvidiaCollectorDevice, output chan lp.CCMessage) err
if mode == nvml.FEATURE_ENABLED { if mode == nvml.FEATURE_ENABLED {
power, ret := nvml.DeviceGetPowerUsage(device.device) power, ret := nvml.DeviceGetPowerUsage(device.device)
if ret == nvml.SUCCESS { if ret == nvml.SUCCESS {
y, err := lp.NewMessage("nv_power_usage", device.tags, device.meta, map[string]any{"value": float64(power) / 1000}, time.Now()) y, err := lp.NewMetric("nv_power_usage", device.tags, device.meta, float64(power)/1000, time.Now())
if err == nil { if err == nil {
y.AddMeta("unit", "watts") y.AddMeta("unit", "watts")
output <- y output <- y
@@ -497,7 +477,12 @@ func readEnergyConsumption(device *NvidiaCollectorDevice, output chan lp.CCMessa
if ret == nvml.SUCCESS { if ret == nvml.SUCCESS {
if device.lastEnergyReading != 0 { if device.lastEnergyReading != 0 {
if !device.excludeMetrics["nv_energy"] { if !device.excludeMetrics["nv_energy"] {
y, err := lp.NewMetric("nv_energy", device.tags, device.meta, (energy-device.lastEnergyReading)/1000, now) y, err := lp.NewMetric(
"nv_energy",
device.tags,
device.meta,
(energy-device.lastEnergyReading)/1000,
now)
if err == nil { if err == nil {
y.AddMeta("unit", "Joules") y.AddMeta("unit", "Joules")
output <- y output <- y
@@ -539,7 +524,7 @@ func readClocks(device *NvidiaCollectorDevice, output chan lp.CCMessage) error {
if !device.excludeMetrics["nv_graphics_clock"] { if !device.excludeMetrics["nv_graphics_clock"] {
graphicsClock, ret := nvml.DeviceGetClockInfo(device.device, nvml.CLOCK_GRAPHICS) graphicsClock, ret := nvml.DeviceGetClockInfo(device.device, nvml.CLOCK_GRAPHICS)
if ret == nvml.SUCCESS { if ret == nvml.SUCCESS {
y, err := lp.NewMessage("nv_graphics_clock", device.tags, device.meta, map[string]any{"value": float64(graphicsClock)}, time.Now()) y, err := lp.NewMetric("nv_graphics_clock", device.tags, device.meta, float64(graphicsClock), time.Now())
if err == nil { if err == nil {
y.AddMeta("unit", "MHz") y.AddMeta("unit", "MHz")
output <- y output <- y
@@ -550,7 +535,7 @@ func readClocks(device *NvidiaCollectorDevice, output chan lp.CCMessage) error {
if !device.excludeMetrics["nv_sm_clock"] { if !device.excludeMetrics["nv_sm_clock"] {
smCock, ret := nvml.DeviceGetClockInfo(device.device, nvml.CLOCK_SM) smCock, ret := nvml.DeviceGetClockInfo(device.device, nvml.CLOCK_SM)
if ret == nvml.SUCCESS { if ret == nvml.SUCCESS {
y, err := lp.NewMessage("nv_sm_clock", device.tags, device.meta, map[string]any{"value": float64(smCock)}, time.Now()) y, err := lp.NewMetric("nv_sm_clock", device.tags, device.meta, float64(smCock), time.Now())
if err == nil { if err == nil {
y.AddMeta("unit", "MHz") y.AddMeta("unit", "MHz")
output <- y output <- y
@@ -561,7 +546,7 @@ func readClocks(device *NvidiaCollectorDevice, output chan lp.CCMessage) error {
if !device.excludeMetrics["nv_mem_clock"] { if !device.excludeMetrics["nv_mem_clock"] {
memClock, ret := nvml.DeviceGetClockInfo(device.device, nvml.CLOCK_MEM) memClock, ret := nvml.DeviceGetClockInfo(device.device, nvml.CLOCK_MEM)
if ret == nvml.SUCCESS { if ret == nvml.SUCCESS {
y, err := lp.NewMessage("nv_mem_clock", device.tags, device.meta, map[string]any{"value": float64(memClock)}, time.Now()) y, err := lp.NewMetric("nv_mem_clock", device.tags, device.meta, float64(memClock), time.Now())
if err == nil { if err == nil {
y.AddMeta("unit", "MHz") y.AddMeta("unit", "MHz")
output <- y output <- y
@@ -571,7 +556,7 @@ func readClocks(device *NvidiaCollectorDevice, output chan lp.CCMessage) error {
if !device.excludeMetrics["nv_video_clock"] { if !device.excludeMetrics["nv_video_clock"] {
memClock, ret := nvml.DeviceGetClockInfo(device.device, nvml.CLOCK_VIDEO) memClock, ret := nvml.DeviceGetClockInfo(device.device, nvml.CLOCK_VIDEO)
if ret == nvml.SUCCESS { if ret == nvml.SUCCESS {
y, err := lp.NewMessage("nv_video_clock", device.tags, device.meta, map[string]any{"value": float64(memClock)}, time.Now()) y, err := lp.NewMetric("nv_video_clock", device.tags, device.meta, float64(memClock), time.Now())
if err == nil { if err == nil {
y.AddMeta("unit", "MHz") y.AddMeta("unit", "MHz")
output <- y output <- y
@@ -652,7 +637,7 @@ func readEccErrors(device *NvidiaCollectorDevice, output chan lp.CCMessage) erro
// i.e. the total set of errors across the entire device. // i.e. the total set of errors across the entire device.
ecc_db, ret := nvml.DeviceGetTotalEccErrors(device.device, nvml.MEMORY_ERROR_TYPE_UNCORRECTED, nvml.AGGREGATE_ECC) ecc_db, ret := nvml.DeviceGetTotalEccErrors(device.device, nvml.MEMORY_ERROR_TYPE_UNCORRECTED, nvml.AGGREGATE_ECC)
if ret == nvml.SUCCESS { if ret == nvml.SUCCESS {
y, err := lp.NewMessage("nv_ecc_uncorrected_error", device.tags, device.meta, map[string]any{"value": float64(ecc_db)}, time.Now()) y, err := lp.NewMetric("nv_ecc_uncorrected_error", device.tags, device.meta, float64(ecc_db), time.Now())
if err == nil { if err == nil {
output <- y output <- y
} }
@@ -661,7 +646,7 @@ func readEccErrors(device *NvidiaCollectorDevice, output chan lp.CCMessage) erro
if !device.excludeMetrics["nv_ecc_corrected_error"] { if !device.excludeMetrics["nv_ecc_corrected_error"] {
ecc_sb, ret := nvml.DeviceGetTotalEccErrors(device.device, nvml.MEMORY_ERROR_TYPE_CORRECTED, nvml.AGGREGATE_ECC) ecc_sb, ret := nvml.DeviceGetTotalEccErrors(device.device, nvml.MEMORY_ERROR_TYPE_CORRECTED, nvml.AGGREGATE_ECC)
if ret == nvml.SUCCESS { if ret == nvml.SUCCESS {
y, err := lp.NewMessage("nv_ecc_corrected_error", device.tags, device.meta, map[string]any{"value": float64(ecc_sb)}, time.Now()) y, err := lp.NewMetric("nv_ecc_corrected_error", device.tags, device.meta, float64(ecc_sb), time.Now())
if err == nil { if err == nil {
output <- y output <- y
} }
@@ -680,7 +665,7 @@ func readPowerLimit(device *NvidiaCollectorDevice, output chan lp.CCMessage) err
// If the card's total power draw reaches this limit the power management algorithm kicks in. // If the card's total power draw reaches this limit the power management algorithm kicks in.
pwr_limit, ret := nvml.DeviceGetPowerManagementLimit(device.device) pwr_limit, ret := nvml.DeviceGetPowerManagementLimit(device.device)
if ret == nvml.SUCCESS { if ret == nvml.SUCCESS {
y, err := lp.NewMessage("nv_power_max_limit", device.tags, device.meta, map[string]any{"value": float64(pwr_limit) / 1000}, time.Now()) y, err := lp.NewMetric("nv_power_max_limit", device.tags, device.meta, float64(pwr_limit)/1000, time.Now())
if err == nil { if err == nil {
y.AddMeta("unit", "watts") y.AddMeta("unit", "watts")
output <- y output <- y
@@ -707,7 +692,7 @@ func readEncUtilization(device *NvidiaCollectorDevice, output chan lp.CCMessage)
// Note: On MIG-enabled GPUs, querying encoder utilization is not currently supported. // Note: On MIG-enabled GPUs, querying encoder utilization is not currently supported.
enc_util, _, ret := nvml.DeviceGetEncoderUtilization(device.device) enc_util, _, ret := nvml.DeviceGetEncoderUtilization(device.device)
if ret == nvml.SUCCESS { if ret == nvml.SUCCESS {
y, err := lp.NewMessage("nv_encoder_util", device.tags, device.meta, map[string]any{"value": float64(enc_util)}, time.Now()) y, err := lp.NewMetric("nv_encoder_util", device.tags, device.meta, float64(enc_util), time.Now())
if err == nil { if err == nil {
y.AddMeta("unit", "%") y.AddMeta("unit", "%")
output <- y output <- y
@@ -734,7 +719,7 @@ func readDecUtilization(device *NvidiaCollectorDevice, output chan lp.CCMessage)
// Note: On MIG-enabled GPUs, querying decoder utilization is not currently supported. // Note: On MIG-enabled GPUs, querying decoder utilization is not currently supported.
dec_util, _, ret := nvml.DeviceGetDecoderUtilization(device.device) dec_util, _, ret := nvml.DeviceGetDecoderUtilization(device.device)
if ret == nvml.SUCCESS { if ret == nvml.SUCCESS {
y, err := lp.NewMessage("nv_decoder_util", device.tags, device.meta, map[string]any{"value": float64(dec_util)}, time.Now()) y, err := lp.NewMetric("nv_decoder_util", device.tags, device.meta, float64(dec_util), time.Now())
if err == nil { if err == nil {
y.AddMeta("unit", "%") y.AddMeta("unit", "%")
output <- y output <- y
@@ -761,13 +746,13 @@ func readRemappedRows(device *NvidiaCollectorDevice, output chan lp.CCMessage) e
corrected, uncorrected, pending, failure, ret := nvml.DeviceGetRemappedRows(device.device) corrected, uncorrected, pending, failure, ret := nvml.DeviceGetRemappedRows(device.device)
if ret == nvml.SUCCESS { if ret == nvml.SUCCESS {
if !device.excludeMetrics["nv_remapped_rows_corrected"] { if !device.excludeMetrics["nv_remapped_rows_corrected"] {
y, err := lp.NewMessage("nv_remapped_rows_corrected", device.tags, device.meta, map[string]any{"value": float64(corrected)}, time.Now()) y, err := lp.NewMetric("nv_remapped_rows_corrected", device.tags, device.meta, float64(corrected), time.Now())
if err == nil { if err == nil {
output <- y output <- y
} }
} }
if !device.excludeMetrics["nv_remapped_rows_uncorrected"] { if !device.excludeMetrics["nv_remapped_rows_uncorrected"] {
y, err := lp.NewMessage("nv_remapped_rows_corrected", device.tags, device.meta, map[string]any{"value": float64(uncorrected)}, time.Now()) y, err := lp.NewMetric("nv_remapped_rows_corrected", device.tags, device.meta, float64(uncorrected), time.Now())
if err == nil { if err == nil {
output <- y output <- y
} }
@@ -777,7 +762,7 @@ func readRemappedRows(device *NvidiaCollectorDevice, output chan lp.CCMessage) e
if pending { if pending {
p = 1 p = 1
} }
y, err := lp.NewMessage("nv_remapped_rows_pending", device.tags, device.meta, map[string]any{"value": p}, time.Now()) y, err := lp.NewMetric("nv_remapped_rows_pending", device.tags, device.meta, p, time.Now())
if err == nil { if err == nil {
output <- y output <- y
} }
@@ -787,7 +772,7 @@ func readRemappedRows(device *NvidiaCollectorDevice, output chan lp.CCMessage) e
if failure { if failure {
f = 1 f = 1
} }
y, err := lp.NewMessage("nv_remapped_rows_failure", device.tags, device.meta, map[string]any{"value": f}, time.Now()) y, err := lp.NewMetric("nv_remapped_rows_failure", device.tags, device.meta, f, time.Now())
if err == nil { if err == nil {
output <- y output <- y
} }
@@ -821,7 +806,7 @@ func readProcessCounts(device *NvidiaCollectorDevice, output chan lp.CCMessage)
// Querying per-instance information using MIG device handles is not supported if the device is in vGPU Host virtualization mode. // Querying per-instance information using MIG device handles is not supported if the device is in vGPU Host virtualization mode.
procList, ret := nvml.DeviceGetComputeRunningProcesses(device.device) procList, ret := nvml.DeviceGetComputeRunningProcesses(device.device)
if ret == nvml.SUCCESS { if ret == nvml.SUCCESS {
y, err := lp.NewMessage("nv_compute_processes", device.tags, device.meta, map[string]any{"value": len(procList)}, time.Now()) y, err := lp.NewMetric("nv_compute_processes", device.tags, device.meta, len(procList), time.Now())
if err == nil { if err == nil {
output <- y output <- y
} }
@@ -850,7 +835,7 @@ func readProcessCounts(device *NvidiaCollectorDevice, output chan lp.CCMessage)
// Querying per-instance information using MIG device handles is not supported if the device is in vGPU Host virtualization mode. // Querying per-instance information using MIG device handles is not supported if the device is in vGPU Host virtualization mode.
procList, ret := nvml.DeviceGetGraphicsRunningProcesses(device.device) procList, ret := nvml.DeviceGetGraphicsRunningProcesses(device.device)
if ret == nvml.SUCCESS { if ret == nvml.SUCCESS {
y, err := lp.NewMessage("nv_graphics_processes", device.tags, device.meta, map[string]any{"value": len(procList)}, time.Now()) y, err := lp.NewMetric("nv_graphics_processes", device.tags, device.meta, len(procList), time.Now())
if err == nil { if err == nil {
output <- y output <- y
} }
@@ -880,7 +865,7 @@ func readProcessCounts(device *NvidiaCollectorDevice, output chan lp.CCMessage)
// // Querying per-instance information using MIG device handles is not supported if the device is in vGPU Host virtualization mode. // // Querying per-instance information using MIG device handles is not supported if the device is in vGPU Host virtualization mode.
// procList, ret := nvml.DeviceGetMPSComputeRunningProcesses(device.device) // procList, ret := nvml.DeviceGetMPSComputeRunningProcesses(device.device)
// if ret == nvml.SUCCESS { // if ret == nvml.SUCCESS {
// y, err := lp.NewMessage("nv_mps_compute_processes", device.tags, device.meta, map[string]interface{}{"value": len(procList)}, time.Now()) // y, err := lp.NewMetric("nv_mps_compute_processes", device.tags, device.meta, len(procList), time.Now())
// if err == nil { // if err == nil {
// output <- y // output <- y
// } // }
@@ -908,7 +893,7 @@ func readViolationStats(device *NvidiaCollectorDevice, output chan lp.CCMessage)
violTime, ret = nvml.DeviceGetViolationStatus(device.device, nvml.PERF_POLICY_POWER) violTime, ret = nvml.DeviceGetViolationStatus(device.device, nvml.PERF_POLICY_POWER)
if ret == nvml.SUCCESS { if ret == nvml.SUCCESS {
t := float64(violTime.ViolationTime) * 1e-9 t := float64(violTime.ViolationTime) * 1e-9
y, err := lp.NewMessage("nv_violation_power", device.tags, device.meta, map[string]any{"value": t}, time.Now()) y, err := lp.NewMetric("nv_violation_power", device.tags, device.meta, t, time.Now())
if err == nil { if err == nil {
y.AddMeta("unit", "sec") y.AddMeta("unit", "sec")
output <- y output <- y
@@ -920,7 +905,7 @@ func readViolationStats(device *NvidiaCollectorDevice, output chan lp.CCMessage)
violTime, ret = nvml.DeviceGetViolationStatus(device.device, nvml.PERF_POLICY_THERMAL) violTime, ret = nvml.DeviceGetViolationStatus(device.device, nvml.PERF_POLICY_THERMAL)
if ret == nvml.SUCCESS { if ret == nvml.SUCCESS {
t := float64(violTime.ViolationTime) * 1e-9 t := float64(violTime.ViolationTime) * 1e-9
y, err := lp.NewMessage("nv_violation_thermal", device.tags, device.meta, map[string]any{"value": t}, time.Now()) y, err := lp.NewMetric("nv_violation_thermal", device.tags, device.meta, t, time.Now())
if err == nil { if err == nil {
y.AddMeta("unit", "sec") y.AddMeta("unit", "sec")
output <- y output <- y
@@ -932,7 +917,7 @@ func readViolationStats(device *NvidiaCollectorDevice, output chan lp.CCMessage)
violTime, ret = nvml.DeviceGetViolationStatus(device.device, nvml.PERF_POLICY_SYNC_BOOST) violTime, ret = nvml.DeviceGetViolationStatus(device.device, nvml.PERF_POLICY_SYNC_BOOST)
if ret == nvml.SUCCESS { if ret == nvml.SUCCESS {
t := float64(violTime.ViolationTime) * 1e-9 t := float64(violTime.ViolationTime) * 1e-9
y, err := lp.NewMessage("nv_violation_sync_boost", device.tags, device.meta, map[string]any{"value": t}, time.Now()) y, err := lp.NewMetric("nv_violation_sync_boost", device.tags, device.meta, t, time.Now())
if err == nil { if err == nil {
y.AddMeta("unit", "sec") y.AddMeta("unit", "sec")
output <- y output <- y
@@ -944,7 +929,7 @@ func readViolationStats(device *NvidiaCollectorDevice, output chan lp.CCMessage)
violTime, ret = nvml.DeviceGetViolationStatus(device.device, nvml.PERF_POLICY_BOARD_LIMIT) violTime, ret = nvml.DeviceGetViolationStatus(device.device, nvml.PERF_POLICY_BOARD_LIMIT)
if ret == nvml.SUCCESS { if ret == nvml.SUCCESS {
t := float64(violTime.ViolationTime) * 1e-9 t := float64(violTime.ViolationTime) * 1e-9
y, err := lp.NewMessage("nv_violation_board_limit", device.tags, device.meta, map[string]any{"value": t}, time.Now()) y, err := lp.NewMetric("nv_violation_board_limit", device.tags, device.meta, t, time.Now())
if err == nil { if err == nil {
y.AddMeta("unit", "sec") y.AddMeta("unit", "sec")
output <- y output <- y
@@ -956,7 +941,7 @@ func readViolationStats(device *NvidiaCollectorDevice, output chan lp.CCMessage)
violTime, ret = nvml.DeviceGetViolationStatus(device.device, nvml.PERF_POLICY_LOW_UTILIZATION) violTime, ret = nvml.DeviceGetViolationStatus(device.device, nvml.PERF_POLICY_LOW_UTILIZATION)
if ret == nvml.SUCCESS { if ret == nvml.SUCCESS {
t := float64(violTime.ViolationTime) * 1e-9 t := float64(violTime.ViolationTime) * 1e-9
y, err := lp.NewMessage("nv_violation_low_util", device.tags, device.meta, map[string]any{"value": t}, time.Now()) y, err := lp.NewMetric("nv_violation_low_util", device.tags, device.meta, t, time.Now())
if err == nil { if err == nil {
y.AddMeta("unit", "sec") y.AddMeta("unit", "sec")
output <- y output <- y
@@ -968,7 +953,7 @@ func readViolationStats(device *NvidiaCollectorDevice, output chan lp.CCMessage)
violTime, ret = nvml.DeviceGetViolationStatus(device.device, nvml.PERF_POLICY_RELIABILITY) violTime, ret = nvml.DeviceGetViolationStatus(device.device, nvml.PERF_POLICY_RELIABILITY)
if ret == nvml.SUCCESS { if ret == nvml.SUCCESS {
t := float64(violTime.ViolationTime) * 1e-9 t := float64(violTime.ViolationTime) * 1e-9
y, err := lp.NewMessage("nv_violation_reliability", device.tags, device.meta, map[string]any{"value": t}, time.Now()) y, err := lp.NewMetric("nv_violation_reliability", device.tags, device.meta, t, time.Now())
if err == nil { if err == nil {
y.AddMeta("unit", "sec") y.AddMeta("unit", "sec")
output <- y output <- y
@@ -980,7 +965,7 @@ func readViolationStats(device *NvidiaCollectorDevice, output chan lp.CCMessage)
violTime, ret = nvml.DeviceGetViolationStatus(device.device, nvml.PERF_POLICY_TOTAL_APP_CLOCKS) violTime, ret = nvml.DeviceGetViolationStatus(device.device, nvml.PERF_POLICY_TOTAL_APP_CLOCKS)
if ret == nvml.SUCCESS { if ret == nvml.SUCCESS {
t := float64(violTime.ViolationTime) * 1e-9 t := float64(violTime.ViolationTime) * 1e-9
y, err := lp.NewMessage("nv_violation_below_app_clock", device.tags, device.meta, map[string]any{"value": t}, time.Now()) y, err := lp.NewMetric("nv_violation_below_app_clock", device.tags, device.meta, t, time.Now())
if err == nil { if err == nil {
y.AddMeta("unit", "sec") y.AddMeta("unit", "sec")
output <- y output <- y
@@ -992,7 +977,7 @@ func readViolationStats(device *NvidiaCollectorDevice, output chan lp.CCMessage)
violTime, ret = nvml.DeviceGetViolationStatus(device.device, nvml.PERF_POLICY_TOTAL_BASE_CLOCKS) violTime, ret = nvml.DeviceGetViolationStatus(device.device, nvml.PERF_POLICY_TOTAL_BASE_CLOCKS)
if ret == nvml.SUCCESS { if ret == nvml.SUCCESS {
t := float64(violTime.ViolationTime) * 1e-9 t := float64(violTime.ViolationTime) * 1e-9
y, err := lp.NewMessage("nv_violation_below_base_clock", device.tags, device.meta, map[string]any{"value": t}, time.Now()) y, err := lp.NewMetric("nv_violation_below_base_clock", device.tags, device.meta, t, time.Now())
if err == nil { if err == nil {
y.AddMeta("unit", "sec") y.AddMeta("unit", "sec")
output <- y output <- y
@@ -1022,12 +1007,12 @@ func readNVLinkStats(device *NvidiaCollectorDevice, output chan lp.CCMessage) er
if !device.excludeMetrics["nv_nvlink_crc_errors"] { if !device.excludeMetrics["nv_nvlink_crc_errors"] {
// Data link receive data CRC error counter // Data link receive data CRC error counter
count, ret := nvml.DeviceGetNvLinkErrorCounter(device.device, i, nvml.NVLINK_ERROR_DL_CRC_DATA) count, ret := nvml.DeviceGetNvLinkErrorCounter(device.device, i, nvml.NVLINK_ERROR_DL_CRC_DATA)
aggregate_crc_errors = aggregate_crc_errors + count aggregate_crc_errors += count
if ret == nvml.SUCCESS { if ret == nvml.SUCCESS {
y, err := lp.NewMessage("nv_nvlink_crc_errors", device.tags, device.meta, map[string]any{"value": count}, time.Now()) y, err := lp.NewMetric("nv_nvlink_crc_errors", device.tags, device.meta, count, time.Now())
if err == nil { if err == nil {
y.AddTag("stype", "nvlink") y.AddTag("stype", "nvlink")
y.AddTag("stype-id", fmt.Sprintf("%d", i)) y.AddTag("stype-id", strconv.Itoa(i))
output <- y output <- y
} }
} }
@@ -1035,12 +1020,12 @@ func readNVLinkStats(device *NvidiaCollectorDevice, output chan lp.CCMessage) er
if !device.excludeMetrics["nv_nvlink_ecc_errors"] { if !device.excludeMetrics["nv_nvlink_ecc_errors"] {
// Data link receive data ECC error counter // Data link receive data ECC error counter
count, ret := nvml.DeviceGetNvLinkErrorCounter(device.device, i, nvml.NVLINK_ERROR_DL_ECC_DATA) count, ret := nvml.DeviceGetNvLinkErrorCounter(device.device, i, nvml.NVLINK_ERROR_DL_ECC_DATA)
aggregate_ecc_errors = aggregate_ecc_errors + count aggregate_ecc_errors += count
if ret == nvml.SUCCESS { if ret == nvml.SUCCESS {
y, err := lp.NewMessage("nv_nvlink_ecc_errors", device.tags, device.meta, map[string]any{"value": count}, time.Now()) y, err := lp.NewMetric("nv_nvlink_ecc_errors", device.tags, device.meta, count, time.Now())
if err == nil { if err == nil {
y.AddTag("stype", "nvlink") y.AddTag("stype", "nvlink")
y.AddTag("stype-id", fmt.Sprintf("%d", i)) y.AddTag("stype-id", strconv.Itoa(i))
output <- y output <- y
} }
} }
@@ -1048,12 +1033,12 @@ func readNVLinkStats(device *NvidiaCollectorDevice, output chan lp.CCMessage) er
if !device.excludeMetrics["nv_nvlink_replay_errors"] { if !device.excludeMetrics["nv_nvlink_replay_errors"] {
// Data link transmit replay error counter // Data link transmit replay error counter
count, ret := nvml.DeviceGetNvLinkErrorCounter(device.device, i, nvml.NVLINK_ERROR_DL_REPLAY) count, ret := nvml.DeviceGetNvLinkErrorCounter(device.device, i, nvml.NVLINK_ERROR_DL_REPLAY)
aggregate_replay_errors = aggregate_replay_errors + count aggregate_replay_errors += count
if ret == nvml.SUCCESS { if ret == nvml.SUCCESS {
y, err := lp.NewMessage("nv_nvlink_replay_errors", device.tags, device.meta, map[string]any{"value": count}, time.Now()) y, err := lp.NewMetric("nv_nvlink_replay_errors", device.tags, device.meta, count, time.Now())
if err == nil { if err == nil {
y.AddTag("stype", "nvlink") y.AddTag("stype", "nvlink")
y.AddTag("stype-id", fmt.Sprintf("%d", i)) y.AddTag("stype-id", strconv.Itoa(i))
output <- y output <- y
} }
} }
@@ -1061,12 +1046,12 @@ func readNVLinkStats(device *NvidiaCollectorDevice, output chan lp.CCMessage) er
if !device.excludeMetrics["nv_nvlink_recovery_errors"] { if !device.excludeMetrics["nv_nvlink_recovery_errors"] {
// Data link transmit recovery error counter // Data link transmit recovery error counter
count, ret := nvml.DeviceGetNvLinkErrorCounter(device.device, i, nvml.NVLINK_ERROR_DL_RECOVERY) count, ret := nvml.DeviceGetNvLinkErrorCounter(device.device, i, nvml.NVLINK_ERROR_DL_RECOVERY)
aggregate_recovery_errors = aggregate_recovery_errors + count aggregate_recovery_errors += count
if ret == nvml.SUCCESS { if ret == nvml.SUCCESS {
y, err := lp.NewMessage("nv_nvlink_recovery_errors", device.tags, device.meta, map[string]any{"value": count}, time.Now()) y, err := lp.NewMetric("nv_nvlink_recovery_errors", device.tags, device.meta, count, time.Now())
if err == nil { if err == nil {
y.AddTag("stype", "nvlink") y.AddTag("stype", "nvlink")
y.AddTag("stype-id", fmt.Sprintf("%d", i)) y.AddTag("stype-id", strconv.Itoa(i))
output <- y output <- y
} }
} }
@@ -1074,12 +1059,12 @@ func readNVLinkStats(device *NvidiaCollectorDevice, output chan lp.CCMessage) er
if !device.excludeMetrics["nv_nvlink_crc_flit_errors"] { if !device.excludeMetrics["nv_nvlink_crc_flit_errors"] {
// Data link receive flow control digit CRC error counter // Data link receive flow control digit CRC error counter
count, ret := nvml.DeviceGetNvLinkErrorCounter(device.device, i, nvml.NVLINK_ERROR_DL_CRC_FLIT) count, ret := nvml.DeviceGetNvLinkErrorCounter(device.device, i, nvml.NVLINK_ERROR_DL_CRC_FLIT)
aggregate_crc_flit_errors = aggregate_crc_flit_errors + count aggregate_crc_flit_errors += count
if ret == nvml.SUCCESS { if ret == nvml.SUCCESS {
y, err := lp.NewMessage("nv_nvlink_crc_flit_errors", device.tags, device.meta, map[string]any{"value": count}, time.Now()) y, err := lp.NewMetric("nv_nvlink_crc_flit_errors", device.tags, device.meta, count, time.Now())
if err == nil { if err == nil {
y.AddTag("stype", "nvlink") y.AddTag("stype", "nvlink")
y.AddTag("stype-id", fmt.Sprintf("%d", i)) y.AddTag("stype-id", strconv.Itoa(i))
output <- y output <- y
} }
} }
@@ -1091,7 +1076,7 @@ func readNVLinkStats(device *NvidiaCollectorDevice, output chan lp.CCMessage) er
// Export aggegated values // Export aggegated values
if !device.excludeMetrics["nv_nvlink_crc_errors"] { if !device.excludeMetrics["nv_nvlink_crc_errors"] {
// Data link receive data CRC error counter // Data link receive data CRC error counter
y, err := lp.NewMessage("nv_nvlink_crc_errors_sum", device.tags, device.meta, map[string]any{"value": aggregate_crc_errors}, time.Now()) y, err := lp.NewMetric("nv_nvlink_crc_errors_sum", device.tags, device.meta, aggregate_crc_errors, time.Now())
if err == nil { if err == nil {
y.AddTag("stype", "nvlink") y.AddTag("stype", "nvlink")
output <- y output <- y
@@ -1099,7 +1084,7 @@ func readNVLinkStats(device *NvidiaCollectorDevice, output chan lp.CCMessage) er
} }
if !device.excludeMetrics["nv_nvlink_ecc_errors"] { if !device.excludeMetrics["nv_nvlink_ecc_errors"] {
// Data link receive data ECC error counter // Data link receive data ECC error counter
y, err := lp.NewMessage("nv_nvlink_ecc_errors_sum", device.tags, device.meta, map[string]any{"value": aggregate_ecc_errors}, time.Now()) y, err := lp.NewMetric("nv_nvlink_ecc_errors_sum", device.tags, device.meta, aggregate_ecc_errors, time.Now())
if err == nil { if err == nil {
y.AddTag("stype", "nvlink") y.AddTag("stype", "nvlink")
output <- y output <- y
@@ -1107,7 +1092,7 @@ func readNVLinkStats(device *NvidiaCollectorDevice, output chan lp.CCMessage) er
} }
if !device.excludeMetrics["nv_nvlink_replay_errors"] { if !device.excludeMetrics["nv_nvlink_replay_errors"] {
// Data link transmit replay error counter // Data link transmit replay error counter
y, err := lp.NewMessage("nv_nvlink_replay_errors_sum", device.tags, device.meta, map[string]any{"value": aggregate_replay_errors}, time.Now()) y, err := lp.NewMetric("nv_nvlink_replay_errors_sum", device.tags, device.meta, aggregate_replay_errors, time.Now())
if err == nil { if err == nil {
y.AddTag("stype", "nvlink") y.AddTag("stype", "nvlink")
output <- y output <- y
@@ -1115,7 +1100,7 @@ func readNVLinkStats(device *NvidiaCollectorDevice, output chan lp.CCMessage) er
} }
if !device.excludeMetrics["nv_nvlink_recovery_errors"] { if !device.excludeMetrics["nv_nvlink_recovery_errors"] {
// Data link transmit recovery error counter // Data link transmit recovery error counter
y, err := lp.NewMessage("nv_nvlink_recovery_errors_sum", device.tags, device.meta, map[string]any{"value": aggregate_recovery_errors}, time.Now()) y, err := lp.NewMetric("nv_nvlink_recovery_errors_sum", device.tags, device.meta, aggregate_recovery_errors, time.Now())
if err == nil { if err == nil {
y.AddTag("stype", "nvlink") y.AddTag("stype", "nvlink")
output <- y output <- y
@@ -1123,7 +1108,7 @@ func readNVLinkStats(device *NvidiaCollectorDevice, output chan lp.CCMessage) er
} }
if !device.excludeMetrics["nv_nvlink_crc_flit_errors"] { if !device.excludeMetrics["nv_nvlink_crc_flit_errors"] {
// Data link receive flow control digit CRC error counter // Data link receive flow control digit CRC error counter
y, err := lp.NewMessage("nv_nvlink_crc_flit_errors_sum", device.tags, device.meta, map[string]any{"value": aggregate_crc_flit_errors}, time.Now()) y, err := lp.NewMetric("nv_nvlink_crc_flit_errors_sum", device.tags, device.meta, aggregate_crc_flit_errors, time.Now())
if err == nil { if err == nil {
y.AddTag("stype", "nvlink") y.AddTag("stype", "nvlink")
output <- y output <- y
@@ -1302,7 +1287,7 @@ func (m *NvidiaCollector) Read(interval time.Duration, output chan lp.CCMessage)
} }
} }
if _, ok := migDevice.tags["stype-id"]; !ok { if _, ok := migDevice.tags["stype-id"]; !ok {
migDevice.tags["stype-id"] = fmt.Sprintf("%d", j) migDevice.tags["stype-id"] = strconv.Itoa(j)
} }
maps.Copy(migDevice.meta, m.gpus[i].meta) maps.Copy(migDevice.meta, m.gpus[i].meta)
if _, ok := migDevice.meta["uuid"]; ok && !m.config.UseUuidForMigDevices { if _, ok := migDevice.meta["uuid"]; ok && !m.config.UseUuidForMigDevices {

View File

@@ -12,6 +12,7 @@ import (
"errors" "errors"
"fmt" "fmt"
"slices" "slices"
"strconv"
"time" "time"
cclog "github.com/ClusterCockpit/cc-lib/v2/ccLogger" cclog "github.com/ClusterCockpit/cc-lib/v2/ccLogger"
@@ -91,7 +92,7 @@ func (m *RocmSmiCollector) Init(config json.RawMessage) error {
m.devices = make([]RocmSmiCollectorDevice, 0) m.devices = make([]RocmSmiCollectorDevice, 0)
for i := range numDevs { for i := range numDevs {
str_i := fmt.Sprintf("%d", i) str_i := strconv.Itoa(i)
if slices.Contains(m.config.ExcludeDevices, str_i) { if slices.Contains(m.config.ExcludeDevices, str_i) {
continue continue
} }
@@ -297,7 +298,7 @@ func (m *RocmSmiCollector) Read(interval time.Duration, output chan lp.CCMessage
y, err := lp.NewMessage("rocm_temp_hbm", dev.tags, dev.meta, map[string]any{"value": value}, timestamp) y, err := lp.NewMessage("rocm_temp_hbm", dev.tags, dev.meta, map[string]any{"value": value}, timestamp)
if err == nil { if err == nil {
y.AddTag("stype", "device") y.AddTag("stype", "device")
y.AddTag("stype-id", fmt.Sprintf("%d", i)) y.AddTag("stype-id", strconv.Itoa(i))
output <- y output <- y
} }
} }

View File

@@ -37,11 +37,11 @@ type SampleTimerCollector struct {
} }
func (m *SampleTimerCollector) Init(name string, config json.RawMessage) error { func (m *SampleTimerCollector) Init(name string, config json.RawMessage) error {
var err error = nil var err error
// Always set the name early in Init() to use it in cclog.Component* functions // Always set the name early in Init() to use it in cclog.Component* functions
m.name = "SampleTimerCollector" m.name = "SampleTimerCollector"
// This is for later use, also call it early // This is for later use, also call it early
if err := m.setup(); err != nil { if err = m.setup(); err != nil {
return fmt.Errorf("%s Init(): setup() call failed: %w", m.name, err) return fmt.Errorf("%s Init(): setup() call failed: %w", m.name, err)
} }
// Define meta information sent with each metric // Define meta information sent with each metric

View File

@@ -53,7 +53,7 @@ func (m *SchedstatCollector) Init(config json.RawMessage) error {
return fmt.Errorf("%s Init(): setup() call failed: %w", m.name, err) return fmt.Errorf("%s Init(): setup() call failed: %w", m.name, err)
} }
// Tell whether the collector should be run in parallel with others (reading files, ...) // Tell whether the collector should be run in parallel with others (reading files, ...)
// or it should be run serially, mostly for collectors acutally doing measurements // or it should be run serially, mostly for collectors actually doing measurements
// because they should not measure the execution of the other collectors // because they should not measure the execution of the other collectors
m.parallel = true m.parallel = true
// Define meta information sent with each metric // Define meta information sent with each metric
@@ -90,7 +90,7 @@ func (m *SchedstatCollector) Init(config json.RawMessage) error {
waiting, _ := strconv.ParseInt(linefields[8], 10, 64) waiting, _ := strconv.ParseInt(linefields[8], 10, 64)
m.cputags[linefields[0]] = map[string]string{ m.cputags[linefields[0]] = map[string]string{
"type": "hwthread", "type": "hwthread",
"type-id": fmt.Sprintf("%d", cpu), "type-id": strconv.Itoa(cpu),
} }
m.olddata[linefields[0]] = map[string]int64{ m.olddata[linefields[0]] = map[string]int64{
"running": running, "running": running,

View File

@@ -79,9 +79,10 @@ func ParseCPUs(cpuset string) ([]int, error) {
} }
func GetAllCPUs() ([]int, error) { func GetAllCPUs() ([]int, error) {
data, err := os.ReadFile("/sys/devices/system/cpu/online") cpuOnline := "/sys/devices/system/cpu/online"
data, err := os.ReadFile(cpuOnline)
if err != nil { if err != nil {
return nil, fmt.Errorf("failed to read /sys/devices/system/cpu/online: %v", err) return nil, fmt.Errorf("failed to read file \"%s\": %w", cpuOnline, err)
} }
return ParseCPUs(strings.TrimSpace(string(data))) return ParseCPUs(strings.TrimSpace(string(data)))
} }
@@ -106,16 +107,18 @@ func (m *SlurmCgroupCollector) Init(config json.RawMessage) error {
return fmt.Errorf("%s Init(): setup() call failed: %w", m.name, err) return fmt.Errorf("%s Init(): setup() call failed: %w", m.name, err)
} }
m.parallel = true m.parallel = true
m.meta = map[string]string{"source": m.name, "group": "SLURM"} m.meta = map[string]string{
m.tags = map[string]string{"type": "hwthread"} "source": m.name,
"group": "SLURM"}
m.tags = map[string]string{
"type": "hwthread"}
m.cpuUsed = make(map[int]bool) m.cpuUsed = make(map[int]bool)
m.cgroupBase = defaultCgroupBase m.cgroupBase = defaultCgroupBase
if len(config) > 0 { if len(config) > 0 {
err = json.Unmarshal(config, &m.config) err = json.Unmarshal(config, &m.config)
if err != nil { if err != nil {
cclog.ComponentError(m.name, "Error reading config:", err.Error()) return fmt.Errorf("%s Init(): Error reading JSON config: %w", m.name, err)
return err
} }
m.excludeMetrics = make(map[string]struct{}) m.excludeMetrics = make(map[string]struct{})
for _, metric := range m.config.ExcludeMetrics { for _, metric := range m.config.ExcludeMetrics {
@@ -130,19 +133,16 @@ func (m *SlurmCgroupCollector) Init(config json.RawMessage) error {
if !m.useSudo { if !m.useSudo {
user, err := user.Current() user, err := user.Current()
if err != nil { if err != nil {
cclog.ComponentError(m.name, "Failed to get current user:", err.Error()) return fmt.Errorf("%s Init(): Failed to get current user: %w", m.name, err)
return err
} }
if user.Uid != "0" { if user.Uid != "0" {
cclog.ComponentError(m.name, "Reading cgroup files requires root privileges (or enable use_sudo in config)") return fmt.Errorf("%s Init(): Reading cgroup files requires root privileges (or enable use_sudo in config)", m.name)
return fmt.Errorf("not root")
} }
} }
m.allCPUs, err = GetAllCPUs() m.allCPUs, err = GetAllCPUs()
if err != nil { if err != nil {
cclog.ComponentError(m.name, "Error reading online CPUs:", err.Error()) return fmt.Errorf("%s Init(): Error reading online CPUs: %w", m.name, err)
return err
} }
m.init = true m.init = true
@@ -159,7 +159,9 @@ func (m *SlurmCgroupCollector) ReadJobData(jobdir string) (SlurmJobData, error)
CpuSet: []int{}, CpuSet: []int{},
} }
cg := func(f string) string { return filepath.Join(m.cgroupBase, jobdir, f) } cg := func(f string) string {
return filepath.Join(m.cgroupBase, jobdir, f)
}
memUsage, err := m.readFile(cg("memory.current")) memUsage, err := m.readFile(cg("memory.current"))
if err == nil { if err == nil {
@@ -208,8 +210,8 @@ func (m *SlurmCgroupCollector) ReadJobData(jobdir string) (SlurmJobData, error)
} }
} }
if usageUsec > 0 { if usageUsec > 0 {
jobdata.CpuUsageUser = (userUsec * 100 / usageUsec) jobdata.CpuUsageUser = (userUsec * 100.0 / usageUsec)
jobdata.CpuUsageSys = (systemUsec * 100 / usageUsec) jobdata.CpuUsageSys = (systemUsec * 100.0 / usageUsec)
} }
} }
@@ -252,12 +254,18 @@ func (m *SlurmCgroupCollector) Read(interval time.Duration, output chan lp.CCMes
for _, cpu := range jobdata.CpuSet { for _, cpu := range jobdata.CpuSet {
coreTags := map[string]string{ coreTags := map[string]string{
"type": "hwthread", "type": "hwthread",
"type-id": fmt.Sprintf("%d", cpu), "type-id": strconv.Itoa(cpu),
} }
if coreCount > 0 && !m.isExcluded("job_mem_used") { if coreCount > 0 && !m.isExcluded("job_mem_used") {
memPerCore := jobdata.MemoryUsage / coreCount memPerCore := jobdata.MemoryUsage / coreCount
if y, err := lp.NewMessage("job_mem_used", coreTags, m.meta, map[string]any{"value": memPerCore}, timestamp); err == nil { if y, err := lp.NewMessage(
"job_mem_used",
coreTags,
m.meta,
map[string]any{
"value": memPerCore},
timestamp); err == nil {
y.AddMeta("unit", "Bytes") y.AddMeta("unit", "Bytes")
output <- y output <- y
} }
@@ -265,7 +273,13 @@ func (m *SlurmCgroupCollector) Read(interval time.Duration, output chan lp.CCMes
if coreCount > 0 && !m.isExcluded("job_max_mem_used") { if coreCount > 0 && !m.isExcluded("job_max_mem_used") {
maxMemPerCore := jobdata.MaxMemoryUsage / coreCount maxMemPerCore := jobdata.MaxMemoryUsage / coreCount
if y, err := lp.NewMessage("job_max_mem_used", coreTags, m.meta, map[string]any{"value": maxMemPerCore}, timestamp); err == nil { if y, err := lp.NewMessage(
"job_max_mem_used",
coreTags,
m.meta,
map[string]any{
"value": maxMemPerCore},
timestamp); err == nil {
y.AddMeta("unit", "Bytes") y.AddMeta("unit", "Bytes")
output <- y output <- y
} }
@@ -273,7 +287,13 @@ func (m *SlurmCgroupCollector) Read(interval time.Duration, output chan lp.CCMes
if coreCount > 0 && !m.isExcluded("job_mem_limit") { if coreCount > 0 && !m.isExcluded("job_mem_limit") {
limitPerCore := jobdata.LimitMemoryUsage / coreCount limitPerCore := jobdata.LimitMemoryUsage / coreCount
if y, err := lp.NewMessage("job_mem_limit", coreTags, m.meta, map[string]any{"value": limitPerCore}, timestamp); err == nil { if y, err := lp.NewMessage(
"job_mem_limit",
coreTags,
m.meta,
map[string]any{
"value": limitPerCore},
timestamp); err == nil {
y.AddMeta("unit", "Bytes") y.AddMeta("unit", "Bytes")
output <- y output <- y
} }
@@ -281,7 +301,13 @@ func (m *SlurmCgroupCollector) Read(interval time.Duration, output chan lp.CCMes
if coreCount > 0 && !m.isExcluded("job_user_cpu") { if coreCount > 0 && !m.isExcluded("job_user_cpu") {
cpuUserPerCore := jobdata.CpuUsageUser / coreCount cpuUserPerCore := jobdata.CpuUsageUser / coreCount
if y, err := lp.NewMessage("job_user_cpu", coreTags, m.meta, map[string]any{"value": cpuUserPerCore}, timestamp); err == nil { if y, err := lp.NewMessage(
"job_user_cpu",
coreTags,
m.meta,
map[string]any{
"value": cpuUserPerCore},
timestamp); err == nil {
y.AddMeta("unit", "%") y.AddMeta("unit", "%")
output <- y output <- y
} }
@@ -289,7 +315,13 @@ func (m *SlurmCgroupCollector) Read(interval time.Duration, output chan lp.CCMes
if coreCount > 0 && !m.isExcluded("job_sys_cpu") { if coreCount > 0 && !m.isExcluded("job_sys_cpu") {
cpuSysPerCore := jobdata.CpuUsageSys / coreCount cpuSysPerCore := jobdata.CpuUsageSys / coreCount
if y, err := lp.NewMessage("job_sys_cpu", coreTags, m.meta, map[string]any{"value": cpuSysPerCore}, timestamp); err == nil { if y, err := lp.NewMessage(
"job_sys_cpu",
coreTags,
m.meta,
map[string]any{
"value": cpuSysPerCore},
timestamp); err == nil {
y.AddMeta("unit", "%") y.AddMeta("unit", "%")
output <- y output <- y
} }
@@ -304,25 +336,43 @@ func (m *SlurmCgroupCollector) Read(interval time.Duration, output chan lp.CCMes
if !m.cpuUsed[cpu] { if !m.cpuUsed[cpu] {
coreTags := map[string]string{ coreTags := map[string]string{
"type": "hwthread", "type": "hwthread",
"type-id": fmt.Sprintf("%d", cpu), "type-id": strconv.Itoa(cpu),
} }
if !m.isExcluded("job_mem_used") { if !m.isExcluded("job_mem_used") {
if y, err := lp.NewMessage("job_mem_used", coreTags, m.meta, map[string]any{"value": 0}, timestamp); err == nil { if y, err := lp.NewMessage(
"job_mem_used",
coreTags,
m.meta,
map[string]any{
"value": 0},
timestamp); err == nil {
y.AddMeta("unit", "Bytes") y.AddMeta("unit", "Bytes")
output <- y output <- y
} }
} }
if !m.isExcluded("job_max_mem_used") { if !m.isExcluded("job_max_mem_used") {
if y, err := lp.NewMessage("job_max_mem_used", coreTags, m.meta, map[string]any{"value": 0}, timestamp); err == nil { if y, err := lp.NewMessage(
"job_max_mem_used",
coreTags,
m.meta,
map[string]any{
"value": 0},
timestamp); err == nil {
y.AddMeta("unit", "Bytes") y.AddMeta("unit", "Bytes")
output <- y output <- y
} }
} }
if !m.isExcluded("job_mem_limit") { if !m.isExcluded("job_mem_limit") {
if y, err := lp.NewMessage("job_mem_limit", coreTags, m.meta, map[string]any{"value": 0}, timestamp); err == nil { if y, err := lp.NewMessage(
"job_mem_limit",
coreTags,
m.meta,
map[string]any{
"value": 0},
timestamp); err == nil {
y.AddMeta("unit", "Bytes") y.AddMeta("unit", "Bytes")
output <- y output <- y
} }

View File

@@ -64,7 +64,7 @@ func (m *TempCollector) Init(config json.RawMessage) error {
if len(config) > 0 { if len(config) > 0 {
err := json.Unmarshal(config, &m.config) err := json.Unmarshal(config, &m.config)
if err != nil { if err != nil {
return err return fmt.Errorf("%s Init(): failed to unmarshal JSON config: %w", m.name, err)
} }
} }
@@ -80,10 +80,10 @@ func (m *TempCollector) Init(config json.RawMessage) error {
globPattern := filepath.Join("/sys/class/hwmon", "*", "temp*_input") globPattern := filepath.Join("/sys/class/hwmon", "*", "temp*_input")
inputFiles, err := filepath.Glob(globPattern) inputFiles, err := filepath.Glob(globPattern)
if err != nil { if err != nil {
return fmt.Errorf("unable to glob files with pattern '%s': %v", globPattern, err) return fmt.Errorf("%s Init(): unable to glob files with pattern '%s': %w", m.name, globPattern, err)
} }
if inputFiles == nil { if inputFiles == nil {
return fmt.Errorf("unable to find any files with pattern '%s'", globPattern) return fmt.Errorf("%s Init(): unable to find any files with pattern '%s'", m.name, globPattern)
} }
// Get sensor name for each temperature sensor file // Get sensor name for each temperature sensor file
@@ -172,7 +172,7 @@ func (m *TempCollector) Init(config json.RawMessage) error {
// Empty sensors map // Empty sensors map
if len(m.sensors) == 0 { if len(m.sensors) == 0 {
return fmt.Errorf("no temperature sensors found") return fmt.Errorf("%s Init(): no temperature sensors found", m.name)
} }
// Finished initialization // Finished initialization

View File

@@ -81,7 +81,13 @@ func (m *TopProcsCollector) Read(interval time.Duration, output chan lp.CCMessag
lines := strings.Split(string(stdout), "\n") lines := strings.Split(string(stdout), "\n")
for i := 1; i < m.config.Num_procs+1; i++ { for i := 1; i < m.config.Num_procs+1; i++ {
name := fmt.Sprintf("topproc%d", i) name := fmt.Sprintf("topproc%d", i)
y, err := lp.NewMessage(name, m.tags, m.meta, map[string]any{"value": string(lines[i])}, time.Now()) y, err := lp.NewMessage(
name,
m.tags,
m.meta,
map[string]any{
"value": lines[i]},
time.Now())
if err == nil { if err == nil {
output <- y output <- y
} }

View File

@@ -137,7 +137,6 @@ func (c *metricAggregator) Eval(starttime time.Time, endtime time.Time, metrics
matches := make([]lp.CCMessage, 0) matches := make([]lp.CCMessage, 0)
for _, m := range metrics { for _, m := range metrics {
vars["metric"] = m vars["metric"] = m
//value, err := gval.Evaluate(f.Condition, vars, c.language)
value, err := f.gvalCond.EvalBool(context.Background(), vars) value, err := f.gvalCond.EvalBool(context.Background(), vars)
if err != nil { if err != nil {
cclog.ComponentError("MetricCache", "COLLECT", f.Name, "COND", f.Condition, ":", err.Error()) cclog.ComponentError("MetricCache", "COLLECT", f.Name, "COND", f.Condition, ":", err.Error())
@@ -171,22 +170,22 @@ func (c *metricAggregator) Eval(starttime time.Time, endtime time.Time, metrics
// Check, that only values of one type were collected // Check, that only values of one type were collected
countValueTypes := 0 countValueTypes := 0
if len(valuesFloat64) > 0 { if len(valuesFloat64) > 0 {
countValueTypes += 1 countValueTypes++
} }
if len(valuesFloat32) > 0 { if len(valuesFloat32) > 0 {
countValueTypes += 1 countValueTypes++
} }
if len(valuesInt) > 0 { if len(valuesInt) > 0 {
countValueTypes += 1 countValueTypes++
} }
if len(valuesInt32) > 0 { if len(valuesInt32) > 0 {
countValueTypes += 1 countValueTypes++
} }
if len(valuesInt64) > 0 { if len(valuesInt64) > 0 {
countValueTypes += 1 countValueTypes++
} }
if len(valuesBool) > 0 { if len(valuesBool) > 0 {
countValueTypes += 1 countValueTypes++
} }
if countValueTypes > 1 { if countValueTypes > 1 {
cclog.ComponentError("MetricCache", "Collected values of different types") cclog.ComponentError("MetricCache", "Collected values of different types")
@@ -337,7 +336,9 @@ func (c *metricAggregator) DeleteAggregation(name string) error {
if i == -1 { if i == -1 {
return fmt.Errorf("no aggregation for metric name %s", name) return fmt.Errorf("no aggregation for metric name %s", name)
} }
c.functions = slices.Delete(c.functions, i, i) copy(c.functions[i:], c.functions[i+1:])
c.functions[len(c.functions)-1] = nil
c.functions = c.functions[:len(c.functions)-1]
return nil return nil
} }

View File

@@ -12,6 +12,7 @@ import (
"fmt" "fmt"
"regexp" "regexp"
"slices" "slices"
"strconv"
"strings" "strings"
topo "github.com/ClusterCockpit/cc-metric-collector/pkg/ccTopology" topo "github.com/ClusterCockpit/cc-metric-collector/pkg/ccTopology"
@@ -208,7 +209,7 @@ func infunc(a any, b any) (any, error) {
case []int: case []int:
return slices.Contains(total, match), nil return slices.Contains(total, match), nil
case string: case string:
smatch := fmt.Sprintf("%d", match) smatch := strconv.Itoa(match)
return strings.Contains(total, smatch), nil return strings.Contains(total, smatch), nil
} }

View File

@@ -137,12 +137,12 @@ func (c *metricCache) Add(metric lp.CCMessage) {
p := c.intervals[c.curPeriod] p := c.intervals[c.curPeriod]
if p.numMetrics < p.sizeMetrics { if p.numMetrics < p.sizeMetrics {
p.metrics[p.numMetrics] = metric p.metrics[p.numMetrics] = metric
p.numMetrics = p.numMetrics + 1 p.numMetrics++
p.stopstamp = metric.Time() p.stopstamp = metric.Time()
} else { } else {
p.metrics = append(p.metrics, metric) p.metrics = append(p.metrics, metric)
p.numMetrics = p.numMetrics + 1 p.numMetrics++
p.sizeMetrics = p.sizeMetrics + 1 p.sizeMetrics++
p.stopstamp = metric.Time() p.stopstamp = metric.Time()
} }
c.lock.Unlock() c.lock.Unlock()

View File

@@ -186,10 +186,6 @@ func (r *metricRouter) Init(ticker mct.MultiChanTicker, wg *sync.WaitGroup, rout
return fmt.Errorf("MessageProcessor AddAddTagsByCondition() failed: %w", err) return fmt.Errorf("MessageProcessor AddAddTagsByCondition() failed: %w", err)
} }
// r.config.dropMetrics = make(map[string]bool)
// for _, mname := range r.config.DropMetrics {
// r.config.dropMetrics[mname] = true
// }
return nil return nil
} }
@@ -208,7 +204,7 @@ func getParamMap(point lp.CCMessage) map[string]any {
return params return params
} }
// DoAddTags adds a tag when condition is fullfiled // DoAddTags adds a tag when condition is fulfilled
func (r *metricRouter) DoAddTags(point lp.CCMessage) { func (r *metricRouter) DoAddTags(point lp.CCMessage) {
var conditionMatches bool var conditionMatches bool
for _, m := range r.config.AddTags { for _, m := range r.config.AddTags {
@@ -230,83 +226,6 @@ func (r *metricRouter) DoAddTags(point lp.CCMessage) {
} }
} }
// DoDelTags removes a tag when condition is fullfiled
// func (r *metricRouter) DoDelTags(point lp.CCMessage) {
// var conditionMatches bool
// for _, m := range r.config.DelTags {
// if m.Condition == "*" {
// // Condition is always matched
// conditionMatches = true
// } else {
// // Evaluate condition
// var err error
// conditionMatches, err = agg.EvalBoolCondition(m.Condition, getParamMap(point))
// if err != nil {
// cclog.ComponentError("MetricRouter", err.Error())
// conditionMatches = false
// }
// }
// if conditionMatches {
// point.RemoveTag(m.Key)
// }
// }
// }
// Conditional test whether a metric should be dropped
// func (r *metricRouter) dropMetric(point lp.CCMessage) bool {
// // Simple drop check
// if conditionMatches, ok := r.config.dropMetrics[point.Name()]; ok {
// return conditionMatches
// }
// // Checking the dropping conditions
// for _, m := range r.config.DropMetricsIf {
// conditionMatches, err := agg.EvalBoolCondition(m, getParamMap(point))
// if err != nil {
// cclog.ComponentError("MetricRouter", err.Error())
// conditionMatches = false
// }
// if conditionMatches {
// return conditionMatches
// }
// }
// // No dropping condition met
// return false
// }
// func (r *metricRouter) prepareUnit(point lp.CCMessage) bool {
// if r.config.NormalizeUnits {
// if in_unit, ok := point.GetMeta("unit"); ok {
// u := units.NewUnit(in_unit)
// if u.Valid() {
// point.AddMeta("unit", u.Short())
// }
// }
// }
// if newP, ok := r.config.ChangeUnitPrefix[point.Name()]; ok {
// newPrefix := units.NewPrefix(newP)
// if in_unit, ok := point.GetMeta("unit"); ok && newPrefix != units.InvalidPrefix {
// u := units.NewUnit(in_unit)
// if u.Valid() {
// cclog.ComponentDebug("MetricRouter", "Change prefix to", newP, "for metric", point.Name())
// conv, out_unit := units.GetUnitPrefixFactor(u, newPrefix)
// if conv != nil && out_unit.Valid() {
// if val, ok := point.GetField("value"); ok {
// point.AddField("value", conv(val))
// point.AddMeta("unit", out_unit.Short())
// }
// }
// }
// }
// }
// return true
// }
// Start starts the metric router // Start starts the metric router
func (r *metricRouter) Start() { func (r *metricRouter) Start() {
// start timer if configured // start timer if configured
@@ -322,28 +241,7 @@ func (r *metricRouter) Start() {
cclog.ComponentDebug("MetricRouter", "DONE") cclog.ComponentDebug("MetricRouter", "DONE")
} }
// Forward takes a received metric, adds or deletes tags // Forward message received from collector channel
// and forwards it to the output channels
// forward := func(point lp.CCMessage) {
// cclog.ComponentDebug("MetricRouter", "FORWARD", point)
// r.DoAddTags(point)
// r.DoDelTags(point)
// name := point.Name()
// if new, ok := r.config.RenameMetrics[name]; ok {
// point.SetName(new)
// point.AddMeta("oldname", name)
// r.DoAddTags(point)
// r.DoDelTags(point)
// }
// r.prepareUnit(point)
// for _, o := range r.outputs {
// o <- point
// }
// }
// Foward message received from collector channel
coll_forward := func(p lp.CCMessage) { coll_forward := func(p lp.CCMessage) {
// receive from metric collector // receive from metric collector
//p.AddTag(r.config.HostnameTagName, r.hostname) //p.AddTag(r.config.HostnameTagName, r.hostname)
@@ -356,11 +254,6 @@ func (r *metricRouter) Start() {
o <- m o <- m
} }
} }
// if !r.dropMetric(p) {
// for _, o := range r.outputs {
// o <- point
// }
// }
// even if the metric is dropped, it is stored in the cache for // even if the metric is dropped, it is stored in the cache for
// aggregations // aggregations
if r.config.NumCacheIntervals > 0 { if r.config.NumCacheIntervals > 0 {
@@ -380,9 +273,6 @@ func (r *metricRouter) Start() {
o <- m o <- m
} }
} }
// if !r.dropMetric(p) {
// forward(p)
// }
} }
// Forward message received from cache channel // Forward message received from cache channel

View File

@@ -51,14 +51,13 @@ var cache struct {
func fileToInt(path string) int { func fileToInt(path string) int {
buffer, err := os.ReadFile(path) buffer, err := os.ReadFile(path)
if err != nil { if err != nil {
log.Print(err) cclogger.ComponentError("ccTopology", fmt.Sprintf("fileToInt(): Reading \"%s\": %v", path, err))
cclogger.ComponentError("ccTopology", "fileToInt", "Reading", path, ":", err.Error())
return -1 return -1
} }
stringBuffer := strings.TrimSpace(string(buffer)) stringBuffer := strings.TrimSpace(string(buffer))
id, err := strconv.Atoi(stringBuffer) id, err := strconv.Atoi(stringBuffer)
if err != nil { if err != nil {
cclogger.ComponentError("ccTopology", "fileToInt", "Parsing", path, ":", stringBuffer, err.Error()) cclogger.ComponentError("ccTopology", fmt.Sprintf("fileToInt(): Parsing \"%s\": %v", stringBuffer, err))
return -1 return -1
} }
return id return id
@@ -304,20 +303,19 @@ func GetTypeList(topology_type string) []int {
} }
func GetTypeId(hwt HwthreadEntry, topology_type string) (int, error) { func GetTypeId(hwt HwthreadEntry, topology_type string) (int, error) {
var err error = nil
switch topology_type { switch topology_type {
case "node": case "node":
return 0, err return 0, nil
case "socket": case "socket":
return hwt.Socket, err return hwt.Socket, nil
case "die": case "die":
return hwt.Die, err return hwt.Die, nil
case "memoryDomain": case "memoryDomain":
return hwt.NumaDomain, err return hwt.NumaDomain, nil
case "core": case "core":
return hwt.Core, err return hwt.Core, nil
case "hwthread": case "hwthread":
return hwt.CpuID, err return hwt.CpuID, nil
} }
return -1, fmt.Errorf("unknown topology type '%s'", topology_type) return -1, fmt.Errorf("unknown topology type '%s'", topology_type)
} }

View File

@@ -21,7 +21,7 @@ type multiChanTicker struct {
type MultiChanTicker interface { type MultiChanTicker interface {
Init(duration time.Duration) Init(duration time.Duration)
AddChannel(chan time.Time) AddChannel(channel chan time.Time)
Close() Close()
} }