mirror of
https://github.com/ClusterCockpit/cc-metric-collector.git
synced 2026-02-13 06:31:46 +01:00
Compare commits
32 Commits
main
...
golangci-l
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
ec8177e73b | ||
|
|
8d75aba882 | ||
|
|
094fa52b20 | ||
|
|
80d5adfb97 | ||
|
|
4b831f09a5 | ||
|
|
f6cd593862 | ||
|
|
6000a1a45b | ||
|
|
3633db0d46 | ||
|
|
77a9b5a977 | ||
|
|
baf7a4f2c5 | ||
|
|
2b5bf4d6a5 | ||
|
|
1f7b13349c | ||
|
|
f98800e039 | ||
|
|
396a9f8ce5 | ||
|
|
de5429201b | ||
|
|
c8f4769a82 | ||
|
|
090f6c69a9 | ||
|
|
b2af81a038 | ||
|
|
8f49c7aa67 | ||
|
|
a77cc19ddb | ||
|
|
ff0cd5803d | ||
|
|
5df7b0eb48 | ||
|
|
0f7792b4cb | ||
|
|
e6dc9eba27 | ||
|
|
c9ebef3bad | ||
|
|
faf5088385 | ||
|
|
12ab80ccad | ||
|
|
c24b3a7e4b | ||
|
|
3c03f4ac96 | ||
|
|
dddae13c7a | ||
|
|
159bee1e9f | ||
|
|
86710d9b4b |
21
.github/workflows/runonce.yml
vendored
21
.github/workflows/runonce.yml
vendored
@@ -35,6 +35,11 @@ jobs:
|
||||
run: |
|
||||
go install github.com/reviewdog/reviewdog/cmd/reviewdog@latest
|
||||
|
||||
# See: https://staticcheck.io
|
||||
- name: Install staticcheck
|
||||
run: |
|
||||
go install honnef.co/go/tools/cmd/staticcheck@latest
|
||||
|
||||
# See: https://golangci-lint.run
|
||||
- name: Install GolangCI-Lint
|
||||
run: |
|
||||
@@ -49,7 +54,21 @@ jobs:
|
||||
# Running the linter requires likwid.h, which gets downloaded in the build step
|
||||
- name: Static Analysis with GolangCI-Lint and Upload Report with reviewdog
|
||||
run: |
|
||||
golangci-lint run --enable modernize,staticcheck,govet | reviewdog -f=golangci-lint -name "Check golangci-lint on build-latest" -reporter=github-check -filter-mode=nofilter -fail-level none
|
||||
golangci-lint run | reviewdog -f=golangci-lint -name "Check golangci-lint on build-latest" -reporter=github-check -filter-mode=nofilter -fail-level none
|
||||
env:
|
||||
REVIEWDOG_GITHUB_API_TOKEN: ${{ secrets.GITHUB_TOKEN }}
|
||||
|
||||
# Running the linter requires likwid.h, which gets downloaded in the build step
|
||||
- name: Run Static Analysis with go vet and Upload Report with reviewdog
|
||||
run: |
|
||||
go vet ./... | reviewdog -f=govet -name "Check govet on build-latest" -reporter=github-check -filter-mode=nofilter -fail-level none
|
||||
env:
|
||||
REVIEWDOG_GITHUB_API_TOKEN: ${{ secrets.GITHUB_TOKEN }}
|
||||
|
||||
# Running the linter requires likwid.h, which gets downloaded in the build step
|
||||
- name: Run Static Analysis with staticcheck and Upload Report with reviewdog
|
||||
run: |
|
||||
staticcheck ./... | reviewdog -f=staticcheck -name "Check staticcheck on build-latest" -reporter=github-check -filter-mode=nofilter -fail-level none
|
||||
env:
|
||||
REVIEWDOG_GITHUB_API_TOKEN: ${{ secrets.GITHUB_TOKEN }}
|
||||
|
||||
|
||||
@@ -225,7 +225,7 @@ func (m *BeegfsMetaCollector) Read(interval time.Duration, output chan lp.CCMess
|
||||
|
||||
for key, data := range m.matches {
|
||||
value, _ := strconv.ParseFloat(data, 32)
|
||||
y, err := lp.NewMessage(key, m.tags, m.meta, map[string]any{"value": value}, time.Now())
|
||||
y, err := lp.NewMessage(key, m.tags, m.meta, map[string]interface{}{"value": value}, time.Now())
|
||||
if err == nil {
|
||||
output <- y
|
||||
}
|
||||
|
||||
@@ -217,7 +217,7 @@ func (m *BeegfsStorageCollector) Read(interval time.Duration, output chan lp.CCM
|
||||
|
||||
for key, data := range m.matches {
|
||||
value, _ := strconv.ParseFloat(data, 32)
|
||||
y, err := lp.NewMessage(key, m.tags, m.meta, map[string]any{"value": value}, time.Now())
|
||||
y, err := lp.NewMessage(key, m.tags, m.meta, map[string]interface{}{"value": value}, time.Now())
|
||||
if err == nil {
|
||||
output <- y
|
||||
}
|
||||
|
||||
@@ -171,7 +171,7 @@ func (m *CPUFreqCpuInfoCollector) Read(interval time.Duration, output chan lp.CC
|
||||
fmt.Sprintf("Read(): Failed to convert cpu MHz '%s' to float64: %v", lineSplit[1], err))
|
||||
return
|
||||
}
|
||||
if y, err := lp.NewMessage("cpufreq", t.tagSet, m.meta, map[string]any{"value": value}, now); err == nil {
|
||||
if y, err := lp.NewMessage("cpufreq", t.tagSet, m.meta, map[string]interface{}{"value": value}, now); err == nil {
|
||||
output <- y
|
||||
}
|
||||
}
|
||||
|
||||
@@ -126,7 +126,7 @@ func (m *CPUFreqCollector) Read(interval time.Duration, output chan lp.CCMessage
|
||||
continue
|
||||
}
|
||||
|
||||
if y, err := lp.NewMessage("cpufreq", t.tagSet, m.meta, map[string]any{"value": cpuFreq}, now); err == nil {
|
||||
if y, err := lp.NewMessage("cpufreq", t.tagSet, m.meta, map[string]interface{}{"value": cpuFreq}, now); err == nil {
|
||||
output <- y
|
||||
}
|
||||
}
|
||||
|
||||
@@ -138,7 +138,7 @@ func (m *CpustatCollector) parseStatLine(linefields []string, tags map[string]st
|
||||
sum := float64(0)
|
||||
for name, value := range values {
|
||||
sum += value
|
||||
y, err := lp.NewMessage(name, tags, m.meta, map[string]any{"value": value * 100}, now)
|
||||
y, err := lp.NewMessage(name, tags, m.meta, map[string]interface{}{"value": value * 100}, now)
|
||||
if err == nil {
|
||||
y.AddTag("unit", "Percent")
|
||||
output <- y
|
||||
@@ -146,7 +146,7 @@ func (m *CpustatCollector) parseStatLine(linefields []string, tags map[string]st
|
||||
}
|
||||
if v, ok := values["cpu_idle"]; ok {
|
||||
sum -= v
|
||||
y, err := lp.NewMessage("cpu_used", tags, m.meta, map[string]any{"value": sum * 100}, now)
|
||||
y, err := lp.NewMessage("cpu_used", tags, m.meta, map[string]interface{}{"value": sum * 100}, now)
|
||||
if err == nil {
|
||||
y.AddTag("unit", "Percent")
|
||||
output <- y
|
||||
@@ -191,7 +191,7 @@ func (m *CpustatCollector) Read(interval time.Duration, output chan lp.CCMessage
|
||||
num_cpus_metric, err := lp.NewMessage("num_cpus",
|
||||
m.nodetags,
|
||||
m.meta,
|
||||
map[string]any{"value": int(num_cpus)},
|
||||
map[string]interface{}{"value": int(num_cpus)},
|
||||
now,
|
||||
)
|
||||
if err == nil {
|
||||
|
||||
@@ -124,7 +124,7 @@ mountLoop:
|
||||
tags := map[string]string{"type": "node", "device": linefields[0]}
|
||||
total := (stat.Blocks * uint64(stat.Bsize)) / uint64(1000000000)
|
||||
if m.allowedMetrics["disk_total"] {
|
||||
y, err := lp.NewMessage("disk_total", tags, m.meta, map[string]any{"value": total}, time.Now())
|
||||
y, err := lp.NewMessage("disk_total", tags, m.meta, map[string]interface{}{"value": total}, time.Now())
|
||||
if err == nil {
|
||||
y.AddMeta("unit", "GBytes")
|
||||
output <- y
|
||||
@@ -132,7 +132,7 @@ mountLoop:
|
||||
}
|
||||
free := (stat.Bfree * uint64(stat.Bsize)) / uint64(1000000000)
|
||||
if m.allowedMetrics["disk_free"] {
|
||||
y, err := lp.NewMessage("disk_free", tags, m.meta, map[string]any{"value": free}, time.Now())
|
||||
y, err := lp.NewMessage("disk_free", tags, m.meta, map[string]interface{}{"value": free}, time.Now())
|
||||
if err == nil {
|
||||
y.AddMeta("unit", "GBytes")
|
||||
output <- y
|
||||
@@ -146,7 +146,7 @@ mountLoop:
|
||||
}
|
||||
}
|
||||
if m.allowedMetrics["part_max_used"] {
|
||||
y, err := lp.NewMessage("part_max_used", map[string]string{"type": "node"}, m.meta, map[string]any{"value": int(part_max_used)}, time.Now())
|
||||
y, err := lp.NewMessage("part_max_used", map[string]string{"type": "node"}, m.meta, map[string]interface{}{"value": int(part_max_used)}, time.Now())
|
||||
if err == nil {
|
||||
y.AddMeta("unit", "percent")
|
||||
output <- y
|
||||
|
||||
@@ -593,7 +593,7 @@ func (m *GpfsCollector) Read(interval time.Duration, output chan lp.CCMessage) {
|
||||
for _, metric := range m.definitions {
|
||||
vold, vold_ok := m.lastState[filesystem][metric.prefix]
|
||||
vnew, vnew_ok := newstate[metric.prefix]
|
||||
var value any
|
||||
var value interface{}
|
||||
value_ok := false
|
||||
switch metric.calc {
|
||||
case "none":
|
||||
|
||||
@@ -10,7 +10,6 @@ package collectors
|
||||
import (
|
||||
"fmt"
|
||||
"os"
|
||||
"slices"
|
||||
|
||||
cclog "github.com/ClusterCockpit/cc-lib/v2/ccLogger"
|
||||
lp "github.com/ClusterCockpit/cc-lib/v2/ccMessage"
|
||||
@@ -114,7 +113,14 @@ func (m *InfinibandCollector) Init(config json.RawMessage) error {
|
||||
port := pathSplit[6]
|
||||
|
||||
// Skip excluded devices
|
||||
if slices.Contains(m.config.ExcludeDevices, device) {
|
||||
skip := false
|
||||
for _, excludedDevice := range m.config.ExcludeDevices {
|
||||
if excludedDevice == device {
|
||||
skip = true
|
||||
break
|
||||
}
|
||||
}
|
||||
if skip {
|
||||
continue
|
||||
}
|
||||
|
||||
@@ -237,7 +243,7 @@ func (m *InfinibandCollector) Read(interval time.Duration, output chan lp.CCMess
|
||||
counterDef.name,
|
||||
info.tagSet,
|
||||
m.meta,
|
||||
map[string]any{
|
||||
map[string]interface{}{
|
||||
"value": counterDef.currentState,
|
||||
},
|
||||
now); err == nil {
|
||||
@@ -255,7 +261,7 @@ func (m *InfinibandCollector) Read(interval time.Duration, output chan lp.CCMess
|
||||
counterDef.name+"_bw",
|
||||
info.tagSet,
|
||||
m.meta,
|
||||
map[string]any{
|
||||
map[string]interface{}{
|
||||
"value": rate,
|
||||
},
|
||||
now); err == nil {
|
||||
@@ -285,7 +291,7 @@ func (m *InfinibandCollector) Read(interval time.Duration, output chan lp.CCMess
|
||||
"ib_total",
|
||||
info.tagSet,
|
||||
m.meta,
|
||||
map[string]any{
|
||||
map[string]interface{}{
|
||||
"value": ib_total,
|
||||
},
|
||||
now); err == nil {
|
||||
@@ -298,7 +304,7 @@ func (m *InfinibandCollector) Read(interval time.Duration, output chan lp.CCMess
|
||||
"ib_total_pkts",
|
||||
info.tagSet,
|
||||
m.meta,
|
||||
map[string]any{
|
||||
map[string]interface{}{
|
||||
"value": ib_total_pkts,
|
||||
},
|
||||
now); err == nil {
|
||||
|
||||
@@ -130,7 +130,7 @@ func (m *IpmiCollector) readIpmiTool(cmd string, output chan lp.CCMessage) {
|
||||
unit = "Watts"
|
||||
}
|
||||
|
||||
y, err := lp.NewMessage(name, map[string]string{"type": "node"}, m.meta, map[string]any{"value": v}, time.Now())
|
||||
y, err := lp.NewMessage(name, map[string]string{"type": "node"}, m.meta, map[string]interface{}{"value": v}, time.Now())
|
||||
if err == nil {
|
||||
y.AddMeta("unit", unit)
|
||||
output <- y
|
||||
@@ -175,7 +175,7 @@ func (m *IpmiCollector) readIpmiSensors(cmd string, output chan lp.CCMessage) {
|
||||
v, err := strconv.ParseFloat(lv[3], 64)
|
||||
if err == nil {
|
||||
name := strings.ToLower(strings.ReplaceAll(lv[1], " ", "_"))
|
||||
y, err := lp.NewMessage(name, map[string]string{"type": "node"}, m.meta, map[string]any{"value": v}, time.Now())
|
||||
y, err := lp.NewMessage(name, map[string]string{"type": "node"}, m.meta, map[string]interface{}{"value": v}, time.Now())
|
||||
if err == nil {
|
||||
if len(lv) > 4 {
|
||||
y.AddMeta("unit", lv[4])
|
||||
|
||||
@@ -617,6 +617,7 @@ func (m *LikwidCollector) calcEventsetMetrics(evset LikwidEventsetConfig, interv
|
||||
evset.metrics[tid][metric.Name] = value
|
||||
// Now we have the result, send it with the proper tags
|
||||
if !math.IsNaN(value) && metric.Publish {
|
||||
fields := map[string]interface{}{"value": value}
|
||||
y, err :=
|
||||
lp.NewMessage(
|
||||
metric.Name,
|
||||
@@ -624,9 +625,7 @@ func (m *LikwidCollector) calcEventsetMetrics(evset LikwidEventsetConfig, interv
|
||||
"type": metric.Type,
|
||||
},
|
||||
m.meta,
|
||||
map[string]any{
|
||||
"value": value,
|
||||
},
|
||||
fields,
|
||||
now,
|
||||
)
|
||||
if err == nil {
|
||||
@@ -664,7 +663,7 @@ func (m *LikwidCollector) calcEventsetMetrics(evset LikwidEventsetConfig, interv
|
||||
"type-id": fmt.Sprintf("%d", coreID),
|
||||
},
|
||||
m.meta,
|
||||
map[string]any{
|
||||
map[string]interface{}{
|
||||
"value": value,
|
||||
},
|
||||
now,
|
||||
@@ -701,7 +700,7 @@ func (m *LikwidCollector) calcEventsetMetrics(evset LikwidEventsetConfig, interv
|
||||
"type-id": fmt.Sprintf("%d", socketID),
|
||||
},
|
||||
m.meta,
|
||||
map[string]any{
|
||||
map[string]interface{}{
|
||||
"value": value,
|
||||
},
|
||||
now,
|
||||
@@ -735,7 +734,7 @@ func (m *LikwidCollector) calcEventsetMetrics(evset LikwidEventsetConfig, interv
|
||||
"type": "node",
|
||||
},
|
||||
m.meta,
|
||||
map[string]any{
|
||||
map[string]interface{}{
|
||||
"value": totalNodeValue,
|
||||
},
|
||||
now,
|
||||
@@ -793,7 +792,7 @@ func (m *LikwidCollector) calcGlobalMetrics(groups []LikwidEventsetConfig, inter
|
||||
"type": metric.Type,
|
||||
},
|
||||
m.meta,
|
||||
map[string]any{
|
||||
map[string]interface{}{
|
||||
"value": value,
|
||||
},
|
||||
now,
|
||||
|
||||
@@ -102,7 +102,7 @@ func (m *LoadavgCollector) Read(interval time.Duration, output chan lp.CCMessage
|
||||
if m.load_skips[i] {
|
||||
continue
|
||||
}
|
||||
y, err := lp.NewMessage(name, m.tags, m.meta, map[string]any{"value": x}, now)
|
||||
y, err := lp.NewMessage(name, m.tags, m.meta, map[string]interface{}{"value": x}, now)
|
||||
if err == nil {
|
||||
output <- y
|
||||
}
|
||||
@@ -121,7 +121,7 @@ func (m *LoadavgCollector) Read(interval time.Duration, output chan lp.CCMessage
|
||||
if m.proc_skips[i] {
|
||||
continue
|
||||
}
|
||||
y, err := lp.NewMessage(name, m.tags, m.meta, map[string]any{"value": x}, now)
|
||||
y, err := lp.NewMessage(name, m.tags, m.meta, map[string]interface{}{"value": x}, now)
|
||||
if err == nil {
|
||||
output <- y
|
||||
}
|
||||
|
||||
@@ -404,23 +404,23 @@ func (m *LustreCollector) Read(interval time.Duration, output chan lp.CCMessage)
|
||||
} else {
|
||||
use_x = devData[def.name]
|
||||
}
|
||||
var value any
|
||||
var value interface{}
|
||||
switch def.calc {
|
||||
case "none":
|
||||
value = use_x
|
||||
y, err = lp.NewMessage(def.name, m.tags, m.meta, map[string]any{"value": value}, time.Now())
|
||||
y, err = lp.NewMessage(def.name, m.tags, m.meta, map[string]interface{}{"value": value}, time.Now())
|
||||
case "difference":
|
||||
value = use_x - devData[def.name]
|
||||
if value.(int64) < 0 {
|
||||
value = 0
|
||||
}
|
||||
y, err = lp.NewMessage(def.name, m.tags, m.meta, map[string]any{"value": value}, time.Now())
|
||||
y, err = lp.NewMessage(def.name, m.tags, m.meta, map[string]interface{}{"value": value}, time.Now())
|
||||
case "derivative":
|
||||
value = float64(use_x-devData[def.name]) / tdiff.Seconds()
|
||||
if value.(float64) < 0 {
|
||||
value = 0
|
||||
}
|
||||
y, err = lp.NewMessage(def.name, m.tags, m.meta, map[string]any{"value": value}, time.Now())
|
||||
y, err = lp.NewMessage(def.name, m.tags, m.meta, map[string]interface{}{"value": value}, time.Now())
|
||||
}
|
||||
if err == nil {
|
||||
y.AddTag("device", device)
|
||||
|
||||
@@ -188,7 +188,7 @@ func (m *MemstatCollector) Read(interval time.Duration, output chan lp.CCMessage
|
||||
}
|
||||
}
|
||||
|
||||
y, err := lp.NewMessage(name, tags, m.meta, map[string]any{"value": value}, time.Now())
|
||||
y, err := lp.NewMessage(name, tags, m.meta, map[string]interface{}{"value": value}, time.Now())
|
||||
if err == nil {
|
||||
if len(unit) > 0 {
|
||||
y.AddMeta("unit", unit)
|
||||
@@ -221,7 +221,7 @@ func (m *MemstatCollector) Read(interval time.Duration, output chan lp.CCMessage
|
||||
}
|
||||
}
|
||||
}
|
||||
y, err := lp.NewMessage("mem_used", tags, m.meta, map[string]any{"value": memUsed}, time.Now())
|
||||
y, err := lp.NewMessage("mem_used", tags, m.meta, map[string]interface{}{"value": memUsed}, time.Now())
|
||||
if err == nil {
|
||||
if len(unit) > 0 {
|
||||
y.AddMeta("unit", unit)
|
||||
|
||||
@@ -240,14 +240,14 @@ func (m *NetstatCollector) Read(interval time.Duration, output chan lp.CCMessage
|
||||
continue
|
||||
}
|
||||
if m.config.SendAbsoluteValues {
|
||||
if y, err := lp.NewMessage(metric.name, metric.tags, metric.meta, map[string]any{"value": v}, now); err == nil {
|
||||
if y, err := lp.NewMessage(metric.name, metric.tags, metric.meta, map[string]interface{}{"value": v}, now); err == nil {
|
||||
output <- y
|
||||
}
|
||||
}
|
||||
if m.config.SendDerivedValues {
|
||||
if metric.lastValue >= 0 {
|
||||
rate := float64(v-metric.lastValue) / timeDiff
|
||||
if y, err := lp.NewMessage(metric.name+"_bw", metric.tags, metric.meta_rates, map[string]any{"value": rate}, now); err == nil {
|
||||
if y, err := lp.NewMessage(metric.name+"_bw", metric.tags, metric.meta_rates, map[string]interface{}{"value": rate}, now); err == nil {
|
||||
output <- y
|
||||
}
|
||||
}
|
||||
|
||||
@@ -167,7 +167,7 @@ func (m *nfsCollector) Read(interval time.Duration, output chan lp.CCMessage) {
|
||||
continue
|
||||
}
|
||||
value := data.current - data.last
|
||||
y, err := lp.NewMessage(fmt.Sprintf("%s_%s", prefix, name), m.tags, m.meta, map[string]any{"value": value}, timestamp)
|
||||
y, err := lp.NewMessage(fmt.Sprintf("%s_%s", prefix, name), m.tags, m.meta, map[string]interface{}{"value": value}, timestamp)
|
||||
if err == nil {
|
||||
y.AddMeta("version", m.version)
|
||||
output <- y
|
||||
|
||||
@@ -143,7 +143,7 @@ func (m *NfsIOStatCollector) Read(interval time.Duration, output chan lp.CCMessa
|
||||
if old, ok := m.data[mntpoint]; ok {
|
||||
for name, newVal := range values {
|
||||
if m.config.SendAbsoluteValues {
|
||||
msg, err := lp.NewMessage(fmt.Sprintf("nfsio_%s", name), m.tags, m.meta, map[string]any{"value": newVal}, now)
|
||||
msg, err := lp.NewMessage(fmt.Sprintf("nfsio_%s", name), m.tags, m.meta, map[string]interface{}{"value": newVal}, now)
|
||||
if err == nil {
|
||||
msg.AddTag("stype", "filesystem")
|
||||
msg.AddTag("stype-id", mntpoint)
|
||||
@@ -152,7 +152,7 @@ func (m *NfsIOStatCollector) Read(interval time.Duration, output chan lp.CCMessa
|
||||
}
|
||||
if m.config.SendDerivedValues {
|
||||
rate := float64(newVal-old[name]) / timeDiff
|
||||
msg, err := lp.NewMessage(fmt.Sprintf("nfsio_%s_bw", name), m.tags, m.meta, map[string]any{"value": rate}, now)
|
||||
msg, err := lp.NewMessage(fmt.Sprintf("nfsio_%s_bw", name), m.tags, m.meta, map[string]interface{}{"value": rate}, now)
|
||||
if err == nil {
|
||||
if strings.HasPrefix(name, "page") {
|
||||
msg.AddMeta("unit", "4K_pages/s")
|
||||
|
||||
@@ -109,7 +109,7 @@ func (m *NvidiaCollector) Init(config json.RawMessage) error {
|
||||
// For all GPUs
|
||||
idx := 0
|
||||
m.gpus = make([]NvidiaCollectorDevice, num_gpus)
|
||||
for i := range num_gpus {
|
||||
for i := 0; i < num_gpus; i++ {
|
||||
|
||||
// Skip excluded devices by ID
|
||||
str_i := fmt.Sprintf("%d", i)
|
||||
@@ -239,7 +239,7 @@ func readMemoryInfo(device *NvidiaCollectorDevice, output chan lp.CCMessage) err
|
||||
|
||||
if !device.excludeMetrics["nv_fb_mem_total"] {
|
||||
t := float64(total) / (1024 * 1024)
|
||||
y, err := lp.NewMessage("nv_fb_mem_total", device.tags, device.meta, map[string]any{"value": t}, time.Now())
|
||||
y, err := lp.NewMessage("nv_fb_mem_total", device.tags, device.meta, map[string]interface{}{"value": t}, time.Now())
|
||||
if err == nil {
|
||||
y.AddMeta("unit", "MByte")
|
||||
output <- y
|
||||
@@ -248,7 +248,7 @@ func readMemoryInfo(device *NvidiaCollectorDevice, output chan lp.CCMessage) err
|
||||
|
||||
if !device.excludeMetrics["nv_fb_mem_used"] {
|
||||
f := float64(used) / (1024 * 1024)
|
||||
y, err := lp.NewMessage("nv_fb_mem_used", device.tags, device.meta, map[string]any{"value": f}, time.Now())
|
||||
y, err := lp.NewMessage("nv_fb_mem_used", device.tags, device.meta, map[string]interface{}{"value": f}, time.Now())
|
||||
if err == nil {
|
||||
y.AddMeta("unit", "MByte")
|
||||
output <- y
|
||||
@@ -257,7 +257,7 @@ func readMemoryInfo(device *NvidiaCollectorDevice, output chan lp.CCMessage) err
|
||||
|
||||
if v2 && !device.excludeMetrics["nv_fb_mem_reserved"] {
|
||||
r := float64(reserved) / (1024 * 1024)
|
||||
y, err := lp.NewMessage("nv_fb_mem_reserved", device.tags, device.meta, map[string]any{"value": r}, time.Now())
|
||||
y, err := lp.NewMessage("nv_fb_mem_reserved", device.tags, device.meta, map[string]interface{}{"value": r}, time.Now())
|
||||
if err == nil {
|
||||
y.AddMeta("unit", "MByte")
|
||||
output <- y
|
||||
@@ -276,7 +276,7 @@ func readBarMemoryInfo(device *NvidiaCollectorDevice, output chan lp.CCMessage)
|
||||
}
|
||||
if !device.excludeMetrics["nv_bar1_mem_total"] {
|
||||
t := float64(meminfo.Bar1Total) / (1024 * 1024)
|
||||
y, err := lp.NewMessage("nv_bar1_mem_total", device.tags, device.meta, map[string]any{"value": t}, time.Now())
|
||||
y, err := lp.NewMessage("nv_bar1_mem_total", device.tags, device.meta, map[string]interface{}{"value": t}, time.Now())
|
||||
if err == nil {
|
||||
y.AddMeta("unit", "MByte")
|
||||
output <- y
|
||||
@@ -284,7 +284,7 @@ func readBarMemoryInfo(device *NvidiaCollectorDevice, output chan lp.CCMessage)
|
||||
}
|
||||
if !device.excludeMetrics["nv_bar1_mem_used"] {
|
||||
t := float64(meminfo.Bar1Used) / (1024 * 1024)
|
||||
y, err := lp.NewMessage("nv_bar1_mem_used", device.tags, device.meta, map[string]any{"value": t}, time.Now())
|
||||
y, err := lp.NewMessage("nv_bar1_mem_used", device.tags, device.meta, map[string]interface{}{"value": t}, time.Now())
|
||||
if err == nil {
|
||||
y.AddMeta("unit", "MByte")
|
||||
output <- y
|
||||
@@ -318,14 +318,14 @@ func readUtilization(device *NvidiaCollectorDevice, output chan lp.CCMessage) er
|
||||
util, ret := nvml.DeviceGetUtilizationRates(device.device)
|
||||
if ret == nvml.SUCCESS {
|
||||
if !device.excludeMetrics["nv_util"] {
|
||||
y, err := lp.NewMessage("nv_util", device.tags, device.meta, map[string]any{"value": float64(util.Gpu)}, time.Now())
|
||||
y, err := lp.NewMessage("nv_util", device.tags, device.meta, map[string]interface{}{"value": float64(util.Gpu)}, time.Now())
|
||||
if err == nil {
|
||||
y.AddMeta("unit", "%")
|
||||
output <- y
|
||||
}
|
||||
}
|
||||
if !device.excludeMetrics["nv_mem_util"] {
|
||||
y, err := lp.NewMessage("nv_mem_util", device.tags, device.meta, map[string]any{"value": float64(util.Memory)}, time.Now())
|
||||
y, err := lp.NewMessage("nv_mem_util", device.tags, device.meta, map[string]interface{}{"value": float64(util.Memory)}, time.Now())
|
||||
if err == nil {
|
||||
y.AddMeta("unit", "%")
|
||||
output <- y
|
||||
@@ -345,7 +345,7 @@ func readTemp(device *NvidiaCollectorDevice, output chan lp.CCMessage) error {
|
||||
// * NVML_TEMPERATURE_COUNT
|
||||
temp, ret := nvml.DeviceGetTemperature(device.device, nvml.TEMPERATURE_GPU)
|
||||
if ret == nvml.SUCCESS {
|
||||
y, err := lp.NewMessage("nv_temp", device.tags, device.meta, map[string]any{"value": float64(temp)}, time.Now())
|
||||
y, err := lp.NewMessage("nv_temp", device.tags, device.meta, map[string]interface{}{"value": float64(temp)}, time.Now())
|
||||
if err == nil {
|
||||
y.AddMeta("unit", "degC")
|
||||
output <- y
|
||||
@@ -368,7 +368,7 @@ func readFan(device *NvidiaCollectorDevice, output chan lp.CCMessage) error {
|
||||
// This value may exceed 100% in certain cases.
|
||||
fan, ret := nvml.DeviceGetFanSpeed(device.device)
|
||||
if ret == nvml.SUCCESS {
|
||||
y, err := lp.NewMessage("nv_fan", device.tags, device.meta, map[string]any{"value": float64(fan)}, time.Now())
|
||||
y, err := lp.NewMessage("nv_fan", device.tags, device.meta, map[string]interface{}{"value": float64(fan)}, time.Now())
|
||||
if err == nil {
|
||||
y.AddMeta("unit", "%")
|
||||
output <- y
|
||||
@@ -415,17 +415,17 @@ func readEccMode(device *NvidiaCollectorDevice, output chan lp.CCMessage) error
|
||||
var err error
|
||||
switch ecc_pend {
|
||||
case nvml.FEATURE_DISABLED:
|
||||
y, err = lp.NewMessage("nv_ecc_mode", device.tags, device.meta, map[string]any{"value": "OFF"}, time.Now())
|
||||
y, err = lp.NewMessage("nv_ecc_mode", device.tags, device.meta, map[string]interface{}{"value": "OFF"}, time.Now())
|
||||
case nvml.FEATURE_ENABLED:
|
||||
y, err = lp.NewMessage("nv_ecc_mode", device.tags, device.meta, map[string]any{"value": "ON"}, time.Now())
|
||||
y, err = lp.NewMessage("nv_ecc_mode", device.tags, device.meta, map[string]interface{}{"value": "ON"}, time.Now())
|
||||
default:
|
||||
y, err = lp.NewMessage("nv_ecc_mode", device.tags, device.meta, map[string]any{"value": "UNKNOWN"}, time.Now())
|
||||
y, err = lp.NewMessage("nv_ecc_mode", device.tags, device.meta, map[string]interface{}{"value": "UNKNOWN"}, time.Now())
|
||||
}
|
||||
if err == nil {
|
||||
output <- y
|
||||
}
|
||||
case nvml.ERROR_NOT_SUPPORTED:
|
||||
y, err := lp.NewMessage("nv_ecc_mode", device.tags, device.meta, map[string]any{"value": "N/A"}, time.Now())
|
||||
y, err := lp.NewMessage("nv_ecc_mode", device.tags, device.meta, map[string]interface{}{"value": "N/A"}, time.Now())
|
||||
if err == nil {
|
||||
output <- y
|
||||
}
|
||||
@@ -445,7 +445,7 @@ func readPerfState(device *NvidiaCollectorDevice, output chan lp.CCMessage) erro
|
||||
// 32: Unknown performance state.
|
||||
pState, ret := nvml.DeviceGetPerformanceState(device.device)
|
||||
if ret == nvml.SUCCESS {
|
||||
y, err := lp.NewMessage("nv_perf_state", device.tags, device.meta, map[string]any{"value": fmt.Sprintf("P%d", int(pState))}, time.Now())
|
||||
y, err := lp.NewMessage("nv_perf_state", device.tags, device.meta, map[string]interface{}{"value": fmt.Sprintf("P%d", int(pState))}, time.Now())
|
||||
if err == nil {
|
||||
output <- y
|
||||
}
|
||||
@@ -471,7 +471,7 @@ func readPowerUsage(device *NvidiaCollectorDevice, output chan lp.CCMessage) err
|
||||
if mode == nvml.FEATURE_ENABLED {
|
||||
power, ret := nvml.DeviceGetPowerUsage(device.device)
|
||||
if ret == nvml.SUCCESS {
|
||||
y, err := lp.NewMessage("nv_power_usage", device.tags, device.meta, map[string]any{"value": float64(power) / 1000}, time.Now())
|
||||
y, err := lp.NewMessage("nv_power_usage", device.tags, device.meta, map[string]interface{}{"value": float64(power) / 1000}, time.Now())
|
||||
if err == nil {
|
||||
y.AddMeta("unit", "watts")
|
||||
output <- y
|
||||
@@ -539,7 +539,7 @@ func readClocks(device *NvidiaCollectorDevice, output chan lp.CCMessage) error {
|
||||
if !device.excludeMetrics["nv_graphics_clock"] {
|
||||
graphicsClock, ret := nvml.DeviceGetClockInfo(device.device, nvml.CLOCK_GRAPHICS)
|
||||
if ret == nvml.SUCCESS {
|
||||
y, err := lp.NewMessage("nv_graphics_clock", device.tags, device.meta, map[string]any{"value": float64(graphicsClock)}, time.Now())
|
||||
y, err := lp.NewMessage("nv_graphics_clock", device.tags, device.meta, map[string]interface{}{"value": float64(graphicsClock)}, time.Now())
|
||||
if err == nil {
|
||||
y.AddMeta("unit", "MHz")
|
||||
output <- y
|
||||
@@ -550,7 +550,7 @@ func readClocks(device *NvidiaCollectorDevice, output chan lp.CCMessage) error {
|
||||
if !device.excludeMetrics["nv_sm_clock"] {
|
||||
smCock, ret := nvml.DeviceGetClockInfo(device.device, nvml.CLOCK_SM)
|
||||
if ret == nvml.SUCCESS {
|
||||
y, err := lp.NewMessage("nv_sm_clock", device.tags, device.meta, map[string]any{"value": float64(smCock)}, time.Now())
|
||||
y, err := lp.NewMessage("nv_sm_clock", device.tags, device.meta, map[string]interface{}{"value": float64(smCock)}, time.Now())
|
||||
if err == nil {
|
||||
y.AddMeta("unit", "MHz")
|
||||
output <- y
|
||||
@@ -561,7 +561,7 @@ func readClocks(device *NvidiaCollectorDevice, output chan lp.CCMessage) error {
|
||||
if !device.excludeMetrics["nv_mem_clock"] {
|
||||
memClock, ret := nvml.DeviceGetClockInfo(device.device, nvml.CLOCK_MEM)
|
||||
if ret == nvml.SUCCESS {
|
||||
y, err := lp.NewMessage("nv_mem_clock", device.tags, device.meta, map[string]any{"value": float64(memClock)}, time.Now())
|
||||
y, err := lp.NewMessage("nv_mem_clock", device.tags, device.meta, map[string]interface{}{"value": float64(memClock)}, time.Now())
|
||||
if err == nil {
|
||||
y.AddMeta("unit", "MHz")
|
||||
output <- y
|
||||
@@ -571,7 +571,7 @@ func readClocks(device *NvidiaCollectorDevice, output chan lp.CCMessage) error {
|
||||
if !device.excludeMetrics["nv_video_clock"] {
|
||||
memClock, ret := nvml.DeviceGetClockInfo(device.device, nvml.CLOCK_VIDEO)
|
||||
if ret == nvml.SUCCESS {
|
||||
y, err := lp.NewMessage("nv_video_clock", device.tags, device.meta, map[string]any{"value": float64(memClock)}, time.Now())
|
||||
y, err := lp.NewMessage("nv_video_clock", device.tags, device.meta, map[string]interface{}{"value": float64(memClock)}, time.Now())
|
||||
if err == nil {
|
||||
y.AddMeta("unit", "MHz")
|
||||
output <- y
|
||||
@@ -652,7 +652,7 @@ func readEccErrors(device *NvidiaCollectorDevice, output chan lp.CCMessage) erro
|
||||
// i.e. the total set of errors across the entire device.
|
||||
ecc_db, ret := nvml.DeviceGetTotalEccErrors(device.device, nvml.MEMORY_ERROR_TYPE_UNCORRECTED, nvml.AGGREGATE_ECC)
|
||||
if ret == nvml.SUCCESS {
|
||||
y, err := lp.NewMessage("nv_ecc_uncorrected_error", device.tags, device.meta, map[string]any{"value": float64(ecc_db)}, time.Now())
|
||||
y, err := lp.NewMessage("nv_ecc_uncorrected_error", device.tags, device.meta, map[string]interface{}{"value": float64(ecc_db)}, time.Now())
|
||||
if err == nil {
|
||||
output <- y
|
||||
}
|
||||
@@ -661,7 +661,7 @@ func readEccErrors(device *NvidiaCollectorDevice, output chan lp.CCMessage) erro
|
||||
if !device.excludeMetrics["nv_ecc_corrected_error"] {
|
||||
ecc_sb, ret := nvml.DeviceGetTotalEccErrors(device.device, nvml.MEMORY_ERROR_TYPE_CORRECTED, nvml.AGGREGATE_ECC)
|
||||
if ret == nvml.SUCCESS {
|
||||
y, err := lp.NewMessage("nv_ecc_corrected_error", device.tags, device.meta, map[string]any{"value": float64(ecc_sb)}, time.Now())
|
||||
y, err := lp.NewMessage("nv_ecc_corrected_error", device.tags, device.meta, map[string]interface{}{"value": float64(ecc_sb)}, time.Now())
|
||||
if err == nil {
|
||||
output <- y
|
||||
}
|
||||
@@ -680,7 +680,7 @@ func readPowerLimit(device *NvidiaCollectorDevice, output chan lp.CCMessage) err
|
||||
// If the card's total power draw reaches this limit the power management algorithm kicks in.
|
||||
pwr_limit, ret := nvml.DeviceGetPowerManagementLimit(device.device)
|
||||
if ret == nvml.SUCCESS {
|
||||
y, err := lp.NewMessage("nv_power_max_limit", device.tags, device.meta, map[string]any{"value": float64(pwr_limit) / 1000}, time.Now())
|
||||
y, err := lp.NewMessage("nv_power_max_limit", device.tags, device.meta, map[string]interface{}{"value": float64(pwr_limit) / 1000}, time.Now())
|
||||
if err == nil {
|
||||
y.AddMeta("unit", "watts")
|
||||
output <- y
|
||||
@@ -707,7 +707,7 @@ func readEncUtilization(device *NvidiaCollectorDevice, output chan lp.CCMessage)
|
||||
// Note: On MIG-enabled GPUs, querying encoder utilization is not currently supported.
|
||||
enc_util, _, ret := nvml.DeviceGetEncoderUtilization(device.device)
|
||||
if ret == nvml.SUCCESS {
|
||||
y, err := lp.NewMessage("nv_encoder_util", device.tags, device.meta, map[string]any{"value": float64(enc_util)}, time.Now())
|
||||
y, err := lp.NewMessage("nv_encoder_util", device.tags, device.meta, map[string]interface{}{"value": float64(enc_util)}, time.Now())
|
||||
if err == nil {
|
||||
y.AddMeta("unit", "%")
|
||||
output <- y
|
||||
@@ -734,7 +734,7 @@ func readDecUtilization(device *NvidiaCollectorDevice, output chan lp.CCMessage)
|
||||
// Note: On MIG-enabled GPUs, querying decoder utilization is not currently supported.
|
||||
dec_util, _, ret := nvml.DeviceGetDecoderUtilization(device.device)
|
||||
if ret == nvml.SUCCESS {
|
||||
y, err := lp.NewMessage("nv_decoder_util", device.tags, device.meta, map[string]any{"value": float64(dec_util)}, time.Now())
|
||||
y, err := lp.NewMessage("nv_decoder_util", device.tags, device.meta, map[string]interface{}{"value": float64(dec_util)}, time.Now())
|
||||
if err == nil {
|
||||
y.AddMeta("unit", "%")
|
||||
output <- y
|
||||
@@ -761,13 +761,13 @@ func readRemappedRows(device *NvidiaCollectorDevice, output chan lp.CCMessage) e
|
||||
corrected, uncorrected, pending, failure, ret := nvml.DeviceGetRemappedRows(device.device)
|
||||
if ret == nvml.SUCCESS {
|
||||
if !device.excludeMetrics["nv_remapped_rows_corrected"] {
|
||||
y, err := lp.NewMessage("nv_remapped_rows_corrected", device.tags, device.meta, map[string]any{"value": float64(corrected)}, time.Now())
|
||||
y, err := lp.NewMessage("nv_remapped_rows_corrected", device.tags, device.meta, map[string]interface{}{"value": float64(corrected)}, time.Now())
|
||||
if err == nil {
|
||||
output <- y
|
||||
}
|
||||
}
|
||||
if !device.excludeMetrics["nv_remapped_rows_uncorrected"] {
|
||||
y, err := lp.NewMessage("nv_remapped_rows_corrected", device.tags, device.meta, map[string]any{"value": float64(uncorrected)}, time.Now())
|
||||
y, err := lp.NewMessage("nv_remapped_rows_corrected", device.tags, device.meta, map[string]interface{}{"value": float64(uncorrected)}, time.Now())
|
||||
if err == nil {
|
||||
output <- y
|
||||
}
|
||||
@@ -777,7 +777,7 @@ func readRemappedRows(device *NvidiaCollectorDevice, output chan lp.CCMessage) e
|
||||
if pending {
|
||||
p = 1
|
||||
}
|
||||
y, err := lp.NewMessage("nv_remapped_rows_pending", device.tags, device.meta, map[string]any{"value": p}, time.Now())
|
||||
y, err := lp.NewMessage("nv_remapped_rows_pending", device.tags, device.meta, map[string]interface{}{"value": p}, time.Now())
|
||||
if err == nil {
|
||||
output <- y
|
||||
}
|
||||
@@ -787,7 +787,7 @@ func readRemappedRows(device *NvidiaCollectorDevice, output chan lp.CCMessage) e
|
||||
if failure {
|
||||
f = 1
|
||||
}
|
||||
y, err := lp.NewMessage("nv_remapped_rows_failure", device.tags, device.meta, map[string]any{"value": f}, time.Now())
|
||||
y, err := lp.NewMessage("nv_remapped_rows_failure", device.tags, device.meta, map[string]interface{}{"value": f}, time.Now())
|
||||
if err == nil {
|
||||
output <- y
|
||||
}
|
||||
@@ -821,7 +821,7 @@ func readProcessCounts(device *NvidiaCollectorDevice, output chan lp.CCMessage)
|
||||
// Querying per-instance information using MIG device handles is not supported if the device is in vGPU Host virtualization mode.
|
||||
procList, ret := nvml.DeviceGetComputeRunningProcesses(device.device)
|
||||
if ret == nvml.SUCCESS {
|
||||
y, err := lp.NewMessage("nv_compute_processes", device.tags, device.meta, map[string]any{"value": len(procList)}, time.Now())
|
||||
y, err := lp.NewMessage("nv_compute_processes", device.tags, device.meta, map[string]interface{}{"value": len(procList)}, time.Now())
|
||||
if err == nil {
|
||||
output <- y
|
||||
}
|
||||
@@ -850,7 +850,7 @@ func readProcessCounts(device *NvidiaCollectorDevice, output chan lp.CCMessage)
|
||||
// Querying per-instance information using MIG device handles is not supported if the device is in vGPU Host virtualization mode.
|
||||
procList, ret := nvml.DeviceGetGraphicsRunningProcesses(device.device)
|
||||
if ret == nvml.SUCCESS {
|
||||
y, err := lp.NewMessage("nv_graphics_processes", device.tags, device.meta, map[string]any{"value": len(procList)}, time.Now())
|
||||
y, err := lp.NewMessage("nv_graphics_processes", device.tags, device.meta, map[string]interface{}{"value": len(procList)}, time.Now())
|
||||
if err == nil {
|
||||
output <- y
|
||||
}
|
||||
@@ -908,7 +908,7 @@ func readViolationStats(device *NvidiaCollectorDevice, output chan lp.CCMessage)
|
||||
violTime, ret = nvml.DeviceGetViolationStatus(device.device, nvml.PERF_POLICY_POWER)
|
||||
if ret == nvml.SUCCESS {
|
||||
t := float64(violTime.ViolationTime) * 1e-9
|
||||
y, err := lp.NewMessage("nv_violation_power", device.tags, device.meta, map[string]any{"value": t}, time.Now())
|
||||
y, err := lp.NewMessage("nv_violation_power", device.tags, device.meta, map[string]interface{}{"value": t}, time.Now())
|
||||
if err == nil {
|
||||
y.AddMeta("unit", "sec")
|
||||
output <- y
|
||||
@@ -920,7 +920,7 @@ func readViolationStats(device *NvidiaCollectorDevice, output chan lp.CCMessage)
|
||||
violTime, ret = nvml.DeviceGetViolationStatus(device.device, nvml.PERF_POLICY_THERMAL)
|
||||
if ret == nvml.SUCCESS {
|
||||
t := float64(violTime.ViolationTime) * 1e-9
|
||||
y, err := lp.NewMessage("nv_violation_thermal", device.tags, device.meta, map[string]any{"value": t}, time.Now())
|
||||
y, err := lp.NewMessage("nv_violation_thermal", device.tags, device.meta, map[string]interface{}{"value": t}, time.Now())
|
||||
if err == nil {
|
||||
y.AddMeta("unit", "sec")
|
||||
output <- y
|
||||
@@ -932,7 +932,7 @@ func readViolationStats(device *NvidiaCollectorDevice, output chan lp.CCMessage)
|
||||
violTime, ret = nvml.DeviceGetViolationStatus(device.device, nvml.PERF_POLICY_SYNC_BOOST)
|
||||
if ret == nvml.SUCCESS {
|
||||
t := float64(violTime.ViolationTime) * 1e-9
|
||||
y, err := lp.NewMessage("nv_violation_sync_boost", device.tags, device.meta, map[string]any{"value": t}, time.Now())
|
||||
y, err := lp.NewMessage("nv_violation_sync_boost", device.tags, device.meta, map[string]interface{}{"value": t}, time.Now())
|
||||
if err == nil {
|
||||
y.AddMeta("unit", "sec")
|
||||
output <- y
|
||||
@@ -944,7 +944,7 @@ func readViolationStats(device *NvidiaCollectorDevice, output chan lp.CCMessage)
|
||||
violTime, ret = nvml.DeviceGetViolationStatus(device.device, nvml.PERF_POLICY_BOARD_LIMIT)
|
||||
if ret == nvml.SUCCESS {
|
||||
t := float64(violTime.ViolationTime) * 1e-9
|
||||
y, err := lp.NewMessage("nv_violation_board_limit", device.tags, device.meta, map[string]any{"value": t}, time.Now())
|
||||
y, err := lp.NewMessage("nv_violation_board_limit", device.tags, device.meta, map[string]interface{}{"value": t}, time.Now())
|
||||
if err == nil {
|
||||
y.AddMeta("unit", "sec")
|
||||
output <- y
|
||||
@@ -956,7 +956,7 @@ func readViolationStats(device *NvidiaCollectorDevice, output chan lp.CCMessage)
|
||||
violTime, ret = nvml.DeviceGetViolationStatus(device.device, nvml.PERF_POLICY_LOW_UTILIZATION)
|
||||
if ret == nvml.SUCCESS {
|
||||
t := float64(violTime.ViolationTime) * 1e-9
|
||||
y, err := lp.NewMessage("nv_violation_low_util", device.tags, device.meta, map[string]any{"value": t}, time.Now())
|
||||
y, err := lp.NewMessage("nv_violation_low_util", device.tags, device.meta, map[string]interface{}{"value": t}, time.Now())
|
||||
if err == nil {
|
||||
y.AddMeta("unit", "sec")
|
||||
output <- y
|
||||
@@ -968,7 +968,7 @@ func readViolationStats(device *NvidiaCollectorDevice, output chan lp.CCMessage)
|
||||
violTime, ret = nvml.DeviceGetViolationStatus(device.device, nvml.PERF_POLICY_RELIABILITY)
|
||||
if ret == nvml.SUCCESS {
|
||||
t := float64(violTime.ViolationTime) * 1e-9
|
||||
y, err := lp.NewMessage("nv_violation_reliability", device.tags, device.meta, map[string]any{"value": t}, time.Now())
|
||||
y, err := lp.NewMessage("nv_violation_reliability", device.tags, device.meta, map[string]interface{}{"value": t}, time.Now())
|
||||
if err == nil {
|
||||
y.AddMeta("unit", "sec")
|
||||
output <- y
|
||||
@@ -980,7 +980,7 @@ func readViolationStats(device *NvidiaCollectorDevice, output chan lp.CCMessage)
|
||||
violTime, ret = nvml.DeviceGetViolationStatus(device.device, nvml.PERF_POLICY_TOTAL_APP_CLOCKS)
|
||||
if ret == nvml.SUCCESS {
|
||||
t := float64(violTime.ViolationTime) * 1e-9
|
||||
y, err := lp.NewMessage("nv_violation_below_app_clock", device.tags, device.meta, map[string]any{"value": t}, time.Now())
|
||||
y, err := lp.NewMessage("nv_violation_below_app_clock", device.tags, device.meta, map[string]interface{}{"value": t}, time.Now())
|
||||
if err == nil {
|
||||
y.AddMeta("unit", "sec")
|
||||
output <- y
|
||||
@@ -992,7 +992,7 @@ func readViolationStats(device *NvidiaCollectorDevice, output chan lp.CCMessage)
|
||||
violTime, ret = nvml.DeviceGetViolationStatus(device.device, nvml.PERF_POLICY_TOTAL_BASE_CLOCKS)
|
||||
if ret == nvml.SUCCESS {
|
||||
t := float64(violTime.ViolationTime) * 1e-9
|
||||
y, err := lp.NewMessage("nv_violation_below_base_clock", device.tags, device.meta, map[string]any{"value": t}, time.Now())
|
||||
y, err := lp.NewMessage("nv_violation_below_base_clock", device.tags, device.meta, map[string]interface{}{"value": t}, time.Now())
|
||||
if err == nil {
|
||||
y.AddMeta("unit", "sec")
|
||||
output <- y
|
||||
@@ -1015,7 +1015,7 @@ func readNVLinkStats(device *NvidiaCollectorDevice, output chan lp.CCMessage) er
|
||||
var aggregate_recovery_errors uint64 = 0
|
||||
var aggregate_crc_flit_errors uint64 = 0
|
||||
|
||||
for i := range nvml.NVLINK_MAX_LINKS {
|
||||
for i := 0; i < nvml.NVLINK_MAX_LINKS; i++ {
|
||||
state, ret := nvml.DeviceGetNvLinkState(device.device, i)
|
||||
if ret == nvml.SUCCESS {
|
||||
if state == nvml.FEATURE_ENABLED {
|
||||
@@ -1024,7 +1024,7 @@ func readNVLinkStats(device *NvidiaCollectorDevice, output chan lp.CCMessage) er
|
||||
count, ret := nvml.DeviceGetNvLinkErrorCounter(device.device, i, nvml.NVLINK_ERROR_DL_CRC_DATA)
|
||||
aggregate_crc_errors = aggregate_crc_errors + count
|
||||
if ret == nvml.SUCCESS {
|
||||
y, err := lp.NewMessage("nv_nvlink_crc_errors", device.tags, device.meta, map[string]any{"value": count}, time.Now())
|
||||
y, err := lp.NewMessage("nv_nvlink_crc_errors", device.tags, device.meta, map[string]interface{}{"value": count}, time.Now())
|
||||
if err == nil {
|
||||
y.AddTag("stype", "nvlink")
|
||||
y.AddTag("stype-id", fmt.Sprintf("%d", i))
|
||||
@@ -1037,7 +1037,7 @@ func readNVLinkStats(device *NvidiaCollectorDevice, output chan lp.CCMessage) er
|
||||
count, ret := nvml.DeviceGetNvLinkErrorCounter(device.device, i, nvml.NVLINK_ERROR_DL_ECC_DATA)
|
||||
aggregate_ecc_errors = aggregate_ecc_errors + count
|
||||
if ret == nvml.SUCCESS {
|
||||
y, err := lp.NewMessage("nv_nvlink_ecc_errors", device.tags, device.meta, map[string]any{"value": count}, time.Now())
|
||||
y, err := lp.NewMessage("nv_nvlink_ecc_errors", device.tags, device.meta, map[string]interface{}{"value": count}, time.Now())
|
||||
if err == nil {
|
||||
y.AddTag("stype", "nvlink")
|
||||
y.AddTag("stype-id", fmt.Sprintf("%d", i))
|
||||
@@ -1050,7 +1050,7 @@ func readNVLinkStats(device *NvidiaCollectorDevice, output chan lp.CCMessage) er
|
||||
count, ret := nvml.DeviceGetNvLinkErrorCounter(device.device, i, nvml.NVLINK_ERROR_DL_REPLAY)
|
||||
aggregate_replay_errors = aggregate_replay_errors + count
|
||||
if ret == nvml.SUCCESS {
|
||||
y, err := lp.NewMessage("nv_nvlink_replay_errors", device.tags, device.meta, map[string]any{"value": count}, time.Now())
|
||||
y, err := lp.NewMessage("nv_nvlink_replay_errors", device.tags, device.meta, map[string]interface{}{"value": count}, time.Now())
|
||||
if err == nil {
|
||||
y.AddTag("stype", "nvlink")
|
||||
y.AddTag("stype-id", fmt.Sprintf("%d", i))
|
||||
@@ -1063,7 +1063,7 @@ func readNVLinkStats(device *NvidiaCollectorDevice, output chan lp.CCMessage) er
|
||||
count, ret := nvml.DeviceGetNvLinkErrorCounter(device.device, i, nvml.NVLINK_ERROR_DL_RECOVERY)
|
||||
aggregate_recovery_errors = aggregate_recovery_errors + count
|
||||
if ret == nvml.SUCCESS {
|
||||
y, err := lp.NewMessage("nv_nvlink_recovery_errors", device.tags, device.meta, map[string]any{"value": count}, time.Now())
|
||||
y, err := lp.NewMessage("nv_nvlink_recovery_errors", device.tags, device.meta, map[string]interface{}{"value": count}, time.Now())
|
||||
if err == nil {
|
||||
y.AddTag("stype", "nvlink")
|
||||
y.AddTag("stype-id", fmt.Sprintf("%d", i))
|
||||
@@ -1076,7 +1076,7 @@ func readNVLinkStats(device *NvidiaCollectorDevice, output chan lp.CCMessage) er
|
||||
count, ret := nvml.DeviceGetNvLinkErrorCounter(device.device, i, nvml.NVLINK_ERROR_DL_CRC_FLIT)
|
||||
aggregate_crc_flit_errors = aggregate_crc_flit_errors + count
|
||||
if ret == nvml.SUCCESS {
|
||||
y, err := lp.NewMessage("nv_nvlink_crc_flit_errors", device.tags, device.meta, map[string]any{"value": count}, time.Now())
|
||||
y, err := lp.NewMessage("nv_nvlink_crc_flit_errors", device.tags, device.meta, map[string]interface{}{"value": count}, time.Now())
|
||||
if err == nil {
|
||||
y.AddTag("stype", "nvlink")
|
||||
y.AddTag("stype-id", fmt.Sprintf("%d", i))
|
||||
@@ -1091,7 +1091,7 @@ func readNVLinkStats(device *NvidiaCollectorDevice, output chan lp.CCMessage) er
|
||||
// Export aggregated values
|
||||
if !device.excludeMetrics["nv_nvlink_crc_errors"] {
|
||||
// Data link receive data CRC error counter
|
||||
y, err := lp.NewMessage("nv_nvlink_crc_errors_sum", device.tags, device.meta, map[string]any{"value": aggregate_crc_errors}, time.Now())
|
||||
y, err := lp.NewMessage("nv_nvlink_crc_errors_sum", device.tags, device.meta, map[string]interface{}{"value": aggregate_crc_errors}, time.Now())
|
||||
if err == nil {
|
||||
y.AddTag("stype", "nvlink")
|
||||
output <- y
|
||||
@@ -1099,7 +1099,7 @@ func readNVLinkStats(device *NvidiaCollectorDevice, output chan lp.CCMessage) er
|
||||
}
|
||||
if !device.excludeMetrics["nv_nvlink_ecc_errors"] {
|
||||
// Data link receive data ECC error counter
|
||||
y, err := lp.NewMessage("nv_nvlink_ecc_errors_sum", device.tags, device.meta, map[string]any{"value": aggregate_ecc_errors}, time.Now())
|
||||
y, err := lp.NewMessage("nv_nvlink_ecc_errors_sum", device.tags, device.meta, map[string]interface{}{"value": aggregate_ecc_errors}, time.Now())
|
||||
if err == nil {
|
||||
y.AddTag("stype", "nvlink")
|
||||
output <- y
|
||||
@@ -1107,7 +1107,7 @@ func readNVLinkStats(device *NvidiaCollectorDevice, output chan lp.CCMessage) er
|
||||
}
|
||||
if !device.excludeMetrics["nv_nvlink_replay_errors"] {
|
||||
// Data link transmit replay error counter
|
||||
y, err := lp.NewMessage("nv_nvlink_replay_errors_sum", device.tags, device.meta, map[string]any{"value": aggregate_replay_errors}, time.Now())
|
||||
y, err := lp.NewMessage("nv_nvlink_replay_errors_sum", device.tags, device.meta, map[string]interface{}{"value": aggregate_replay_errors}, time.Now())
|
||||
if err == nil {
|
||||
y.AddTag("stype", "nvlink")
|
||||
output <- y
|
||||
@@ -1115,7 +1115,7 @@ func readNVLinkStats(device *NvidiaCollectorDevice, output chan lp.CCMessage) er
|
||||
}
|
||||
if !device.excludeMetrics["nv_nvlink_recovery_errors"] {
|
||||
// Data link transmit recovery error counter
|
||||
y, err := lp.NewMessage("nv_nvlink_recovery_errors_sum", device.tags, device.meta, map[string]any{"value": aggregate_recovery_errors}, time.Now())
|
||||
y, err := lp.NewMessage("nv_nvlink_recovery_errors_sum", device.tags, device.meta, map[string]interface{}{"value": aggregate_recovery_errors}, time.Now())
|
||||
if err == nil {
|
||||
y.AddTag("stype", "nvlink")
|
||||
output <- y
|
||||
@@ -1123,7 +1123,7 @@ func readNVLinkStats(device *NvidiaCollectorDevice, output chan lp.CCMessage) er
|
||||
}
|
||||
if !device.excludeMetrics["nv_nvlink_crc_flit_errors"] {
|
||||
// Data link receive flow control digit CRC error counter
|
||||
y, err := lp.NewMessage("nv_nvlink_crc_flit_errors_sum", device.tags, device.meta, map[string]any{"value": aggregate_crc_flit_errors}, time.Now())
|
||||
y, err := lp.NewMessage("nv_nvlink_crc_flit_errors_sum", device.tags, device.meta, map[string]interface{}{"value": aggregate_crc_flit_errors}, time.Now())
|
||||
if err == nil {
|
||||
y.AddTag("stype", "nvlink")
|
||||
output <- y
|
||||
@@ -1263,7 +1263,7 @@ func (m *NvidiaCollector) Read(interval time.Duration, output chan lp.CCMessage)
|
||||
}
|
||||
cclog.ComponentDebug(m.name, "Reading MIG devices for GPU", i)
|
||||
|
||||
for j := range maxMig {
|
||||
for j := 0; j < maxMig; j++ {
|
||||
mdev, ret := nvml.DeviceGetMigDeviceHandleByIndex(m.gpus[i].device, j)
|
||||
if ret != nvml.SUCCESS {
|
||||
continue
|
||||
|
||||
@@ -249,7 +249,7 @@ func (m *RAPLCollector) Read(interval time.Duration, output chan lp.CCMessage) {
|
||||
"rapl_average_power",
|
||||
p.tags,
|
||||
m.meta,
|
||||
map[string]any{"value": averagePower},
|
||||
map[string]interface{}{"value": averagePower},
|
||||
energyTimestamp)
|
||||
if err == nil {
|
||||
output <- y
|
||||
|
||||
@@ -11,7 +11,6 @@ import (
|
||||
"encoding/json"
|
||||
"errors"
|
||||
"fmt"
|
||||
"slices"
|
||||
"time"
|
||||
|
||||
cclog "github.com/ClusterCockpit/cc-lib/v2/ccLogger"
|
||||
@@ -88,11 +87,22 @@ func (m *RocmSmiCollector) Init(config json.RawMessage) error {
|
||||
return err
|
||||
}
|
||||
|
||||
exclDev := func(s string) bool {
|
||||
skip_device := false
|
||||
for _, excl := range m.config.ExcludeDevices {
|
||||
if excl == s {
|
||||
skip_device = true
|
||||
break
|
||||
}
|
||||
}
|
||||
return skip_device
|
||||
}
|
||||
|
||||
m.devices = make([]RocmSmiCollectorDevice, 0)
|
||||
|
||||
for i := range numDevs {
|
||||
for i := 0; i < numDevs; i++ {
|
||||
str_i := fmt.Sprintf("%d", i)
|
||||
if slices.Contains(m.config.ExcludeDevices, str_i) {
|
||||
if exclDev(str_i) {
|
||||
continue
|
||||
}
|
||||
device, ret := rocm_smi.DeviceGetHandleByIndex(i)
|
||||
@@ -116,7 +126,7 @@ func (m *RocmSmiCollector) Init(config json.RawMessage) error {
|
||||
pciInfo.Device,
|
||||
pciInfo.Function)
|
||||
|
||||
if slices.Contains(m.config.ExcludeDevices, pciId) {
|
||||
if exclDev(pciId) {
|
||||
continue
|
||||
}
|
||||
|
||||
@@ -174,127 +184,127 @@ func (m *RocmSmiCollector) Read(interval time.Duration, output chan lp.CCMessage
|
||||
|
||||
if !dev.excludeMetrics["rocm_gfx_util"] {
|
||||
value := metrics.Average_gfx_activity
|
||||
y, err := lp.NewMessage("rocm_gfx_util", dev.tags, dev.meta, map[string]any{"value": value}, timestamp)
|
||||
y, err := lp.NewMessage("rocm_gfx_util", dev.tags, dev.meta, map[string]interface{}{"value": value}, timestamp)
|
||||
if err == nil {
|
||||
output <- y
|
||||
}
|
||||
}
|
||||
if !dev.excludeMetrics["rocm_umc_util"] {
|
||||
value := metrics.Average_umc_activity
|
||||
y, err := lp.NewMessage("rocm_umc_util", dev.tags, dev.meta, map[string]any{"value": value}, timestamp)
|
||||
y, err := lp.NewMessage("rocm_umc_util", dev.tags, dev.meta, map[string]interface{}{"value": value}, timestamp)
|
||||
if err == nil {
|
||||
output <- y
|
||||
}
|
||||
}
|
||||
if !dev.excludeMetrics["rocm_mm_util"] {
|
||||
value := metrics.Average_mm_activity
|
||||
y, err := lp.NewMessage("rocm_mm_util", dev.tags, dev.meta, map[string]any{"value": value}, timestamp)
|
||||
y, err := lp.NewMessage("rocm_mm_util", dev.tags, dev.meta, map[string]interface{}{"value": value}, timestamp)
|
||||
if err == nil {
|
||||
output <- y
|
||||
}
|
||||
}
|
||||
if !dev.excludeMetrics["rocm_avg_power"] {
|
||||
value := metrics.Average_socket_power
|
||||
y, err := lp.NewMessage("rocm_avg_power", dev.tags, dev.meta, map[string]any{"value": value}, timestamp)
|
||||
y, err := lp.NewMessage("rocm_avg_power", dev.tags, dev.meta, map[string]interface{}{"value": value}, timestamp)
|
||||
if err == nil {
|
||||
output <- y
|
||||
}
|
||||
}
|
||||
if !dev.excludeMetrics["rocm_temp_mem"] {
|
||||
value := metrics.Temperature_mem
|
||||
y, err := lp.NewMessage("rocm_temp_mem", dev.tags, dev.meta, map[string]any{"value": value}, timestamp)
|
||||
y, err := lp.NewMessage("rocm_temp_mem", dev.tags, dev.meta, map[string]interface{}{"value": value}, timestamp)
|
||||
if err == nil {
|
||||
output <- y
|
||||
}
|
||||
}
|
||||
if !dev.excludeMetrics["rocm_temp_hotspot"] {
|
||||
value := metrics.Temperature_hotspot
|
||||
y, err := lp.NewMessage("rocm_temp_hotspot", dev.tags, dev.meta, map[string]any{"value": value}, timestamp)
|
||||
y, err := lp.NewMessage("rocm_temp_hotspot", dev.tags, dev.meta, map[string]interface{}{"value": value}, timestamp)
|
||||
if err == nil {
|
||||
output <- y
|
||||
}
|
||||
}
|
||||
if !dev.excludeMetrics["rocm_temp_edge"] {
|
||||
value := metrics.Temperature_edge
|
||||
y, err := lp.NewMessage("rocm_temp_edge", dev.tags, dev.meta, map[string]any{"value": value}, timestamp)
|
||||
y, err := lp.NewMessage("rocm_temp_edge", dev.tags, dev.meta, map[string]interface{}{"value": value}, timestamp)
|
||||
if err == nil {
|
||||
output <- y
|
||||
}
|
||||
}
|
||||
if !dev.excludeMetrics["rocm_temp_vrgfx"] {
|
||||
value := metrics.Temperature_vrgfx
|
||||
y, err := lp.NewMessage("rocm_temp_vrgfx", dev.tags, dev.meta, map[string]any{"value": value}, timestamp)
|
||||
y, err := lp.NewMessage("rocm_temp_vrgfx", dev.tags, dev.meta, map[string]interface{}{"value": value}, timestamp)
|
||||
if err == nil {
|
||||
output <- y
|
||||
}
|
||||
}
|
||||
if !dev.excludeMetrics["rocm_temp_vrsoc"] {
|
||||
value := metrics.Temperature_vrsoc
|
||||
y, err := lp.NewMessage("rocm_temp_vrsoc", dev.tags, dev.meta, map[string]any{"value": value}, timestamp)
|
||||
y, err := lp.NewMessage("rocm_temp_vrsoc", dev.tags, dev.meta, map[string]interface{}{"value": value}, timestamp)
|
||||
if err == nil {
|
||||
output <- y
|
||||
}
|
||||
}
|
||||
if !dev.excludeMetrics["rocm_temp_vrmem"] {
|
||||
value := metrics.Temperature_vrmem
|
||||
y, err := lp.NewMessage("rocm_temp_vrmem", dev.tags, dev.meta, map[string]any{"value": value}, timestamp)
|
||||
y, err := lp.NewMessage("rocm_temp_vrmem", dev.tags, dev.meta, map[string]interface{}{"value": value}, timestamp)
|
||||
if err == nil {
|
||||
output <- y
|
||||
}
|
||||
}
|
||||
if !dev.excludeMetrics["rocm_gfx_clock"] {
|
||||
value := metrics.Average_gfxclk_frequency
|
||||
y, err := lp.NewMessage("rocm_gfx_clock", dev.tags, dev.meta, map[string]any{"value": value}, timestamp)
|
||||
y, err := lp.NewMessage("rocm_gfx_clock", dev.tags, dev.meta, map[string]interface{}{"value": value}, timestamp)
|
||||
if err == nil {
|
||||
output <- y
|
||||
}
|
||||
}
|
||||
if !dev.excludeMetrics["rocm_soc_clock"] {
|
||||
value := metrics.Average_socclk_frequency
|
||||
y, err := lp.NewMessage("rocm_soc_clock", dev.tags, dev.meta, map[string]any{"value": value}, timestamp)
|
||||
y, err := lp.NewMessage("rocm_soc_clock", dev.tags, dev.meta, map[string]interface{}{"value": value}, timestamp)
|
||||
if err == nil {
|
||||
output <- y
|
||||
}
|
||||
}
|
||||
if !dev.excludeMetrics["rocm_u_clock"] {
|
||||
value := metrics.Average_uclk_frequency
|
||||
y, err := lp.NewMessage("rocm_u_clock", dev.tags, dev.meta, map[string]any{"value": value}, timestamp)
|
||||
y, err := lp.NewMessage("rocm_u_clock", dev.tags, dev.meta, map[string]interface{}{"value": value}, timestamp)
|
||||
if err == nil {
|
||||
output <- y
|
||||
}
|
||||
}
|
||||
if !dev.excludeMetrics["rocm_v0_clock"] {
|
||||
value := metrics.Average_vclk0_frequency
|
||||
y, err := lp.NewMessage("rocm_v0_clock", dev.tags, dev.meta, map[string]any{"value": value}, timestamp)
|
||||
y, err := lp.NewMessage("rocm_v0_clock", dev.tags, dev.meta, map[string]interface{}{"value": value}, timestamp)
|
||||
if err == nil {
|
||||
output <- y
|
||||
}
|
||||
}
|
||||
if !dev.excludeMetrics["rocm_v1_clock"] {
|
||||
value := metrics.Average_vclk1_frequency
|
||||
y, err := lp.NewMessage("rocm_v1_clock", dev.tags, dev.meta, map[string]any{"value": value}, timestamp)
|
||||
y, err := lp.NewMessage("rocm_v1_clock", dev.tags, dev.meta, map[string]interface{}{"value": value}, timestamp)
|
||||
if err == nil {
|
||||
output <- y
|
||||
}
|
||||
}
|
||||
if !dev.excludeMetrics["rocm_d0_clock"] {
|
||||
value := metrics.Average_dclk0_frequency
|
||||
y, err := lp.NewMessage("rocm_d0_clock", dev.tags, dev.meta, map[string]any{"value": value}, timestamp)
|
||||
y, err := lp.NewMessage("rocm_d0_clock", dev.tags, dev.meta, map[string]interface{}{"value": value}, timestamp)
|
||||
if err == nil {
|
||||
output <- y
|
||||
}
|
||||
}
|
||||
if !dev.excludeMetrics["rocm_d1_clock"] {
|
||||
value := metrics.Average_dclk1_frequency
|
||||
y, err := lp.NewMessage("rocm_d1_clock", dev.tags, dev.meta, map[string]any{"value": value}, timestamp)
|
||||
y, err := lp.NewMessage("rocm_d1_clock", dev.tags, dev.meta, map[string]interface{}{"value": value}, timestamp)
|
||||
if err == nil {
|
||||
output <- y
|
||||
}
|
||||
}
|
||||
if !dev.excludeMetrics["rocm_temp_hbm"] {
|
||||
for i := range rocm_smi.NUM_HBM_INSTANCES {
|
||||
for i := 0; i < rocm_smi.NUM_HBM_INSTANCES; i++ {
|
||||
value := metrics.Temperature_hbm[i]
|
||||
y, err := lp.NewMessage("rocm_temp_hbm", dev.tags, dev.meta, map[string]any{"value": value}, timestamp)
|
||||
y, err := lp.NewMessage("rocm_temp_hbm", dev.tags, dev.meta, map[string]interface{}{"value": value}, timestamp)
|
||||
if err == nil {
|
||||
y.AddTag("stype", "device")
|
||||
y.AddTag("stype-id", fmt.Sprintf("%d", i))
|
||||
|
||||
@@ -15,9 +15,7 @@ hugo_path: docs/reference/cc-metric-collector/collectors/rocmsmi.md
|
||||
```json
|
||||
"rocm_smi": {
|
||||
"exclude_devices": [
|
||||
"0",
|
||||
"1",
|
||||
"0000000:ff:01.0"
|
||||
"0","1", "0000000:ff:01.0"
|
||||
],
|
||||
"exclude_metrics": [
|
||||
"rocm_mm_util",
|
||||
@@ -25,7 +23,7 @@ hugo_path: docs/reference/cc-metric-collector/collectors/rocmsmi.md
|
||||
],
|
||||
"use_pci_info_as_type_id": true,
|
||||
"add_pci_info_tag": false,
|
||||
"add_serial_meta": false
|
||||
"add_serial_meta": false,
|
||||
}
|
||||
```
|
||||
|
||||
|
||||
@@ -95,7 +95,7 @@ func (m *SampleCollector) Read(interval time.Duration, output chan lp.CCMessage)
|
||||
// stop := readState()
|
||||
// value = (stop - start) / interval.Seconds()
|
||||
|
||||
y, err := lp.NewMessage("sample_metric", m.tags, m.meta, map[string]any{"value": value}, timestamp)
|
||||
y, err := lp.NewMessage("sample_metric", m.tags, m.meta, map[string]interface{}{"value": value}, timestamp)
|
||||
if err == nil {
|
||||
// Send it to output channel
|
||||
output <- y
|
||||
|
||||
@@ -110,7 +110,7 @@ func (m *SampleTimerCollector) ReadMetrics(timestamp time.Time) {
|
||||
// stop := readState()
|
||||
// value = (stop - start) / interval.Seconds()
|
||||
|
||||
y, err := lp.NewMessage("sample_metric", m.tags, m.meta, map[string]any{"value": value}, timestamp)
|
||||
y, err := lp.NewMessage("sample_metric", m.tags, m.meta, map[string]interface{}{"value": value}, timestamp)
|
||||
if err == nil && m.output != nil {
|
||||
// Send it to output channel if we have a valid channel
|
||||
m.output <- y
|
||||
|
||||
@@ -123,7 +123,7 @@ func (m *SchedstatCollector) ParseProcLine(linefields []string, tags map[string]
|
||||
m.olddata[linefields[0]]["waiting"] = waiting
|
||||
value := l_running + l_waiting
|
||||
|
||||
y, err := lp.NewMessage("cpu_load_core", tags, m.meta, map[string]any{"value": value}, now)
|
||||
y, err := lp.NewMessage("cpu_load_core", tags, m.meta, map[string]interface{}{"value": value}, now)
|
||||
if err == nil {
|
||||
// Send it to output channel
|
||||
output <- y
|
||||
|
||||
@@ -59,49 +59,49 @@ func (m *SelfCollector) Read(interval time.Duration, output chan lp.CCMessage) {
|
||||
var memstats runtime.MemStats
|
||||
runtime.ReadMemStats(&memstats)
|
||||
|
||||
y, err := lp.NewMessage("total_alloc", m.tags, m.meta, map[string]any{"value": memstats.TotalAlloc}, timestamp)
|
||||
y, err := lp.NewMessage("total_alloc", m.tags, m.meta, map[string]interface{}{"value": memstats.TotalAlloc}, timestamp)
|
||||
if err == nil {
|
||||
y.AddMeta("unit", "Bytes")
|
||||
output <- y
|
||||
}
|
||||
y, err = lp.NewMessage("heap_alloc", m.tags, m.meta, map[string]any{"value": memstats.HeapAlloc}, timestamp)
|
||||
y, err = lp.NewMessage("heap_alloc", m.tags, m.meta, map[string]interface{}{"value": memstats.HeapAlloc}, timestamp)
|
||||
if err == nil {
|
||||
y.AddMeta("unit", "Bytes")
|
||||
output <- y
|
||||
}
|
||||
y, err = lp.NewMessage("heap_sys", m.tags, m.meta, map[string]any{"value": memstats.HeapSys}, timestamp)
|
||||
y, err = lp.NewMessage("heap_sys", m.tags, m.meta, map[string]interface{}{"value": memstats.HeapSys}, timestamp)
|
||||
if err == nil {
|
||||
y.AddMeta("unit", "Bytes")
|
||||
output <- y
|
||||
}
|
||||
y, err = lp.NewMessage("heap_idle", m.tags, m.meta, map[string]any{"value": memstats.HeapIdle}, timestamp)
|
||||
y, err = lp.NewMessage("heap_idle", m.tags, m.meta, map[string]interface{}{"value": memstats.HeapIdle}, timestamp)
|
||||
if err == nil {
|
||||
y.AddMeta("unit", "Bytes")
|
||||
output <- y
|
||||
}
|
||||
y, err = lp.NewMessage("heap_inuse", m.tags, m.meta, map[string]any{"value": memstats.HeapInuse}, timestamp)
|
||||
y, err = lp.NewMessage("heap_inuse", m.tags, m.meta, map[string]interface{}{"value": memstats.HeapInuse}, timestamp)
|
||||
if err == nil {
|
||||
y.AddMeta("unit", "Bytes")
|
||||
output <- y
|
||||
}
|
||||
y, err = lp.NewMessage("heap_released", m.tags, m.meta, map[string]any{"value": memstats.HeapReleased}, timestamp)
|
||||
y, err = lp.NewMessage("heap_released", m.tags, m.meta, map[string]interface{}{"value": memstats.HeapReleased}, timestamp)
|
||||
if err == nil {
|
||||
y.AddMeta("unit", "Bytes")
|
||||
output <- y
|
||||
}
|
||||
y, err = lp.NewMessage("heap_objects", m.tags, m.meta, map[string]any{"value": memstats.HeapObjects}, timestamp)
|
||||
y, err = lp.NewMessage("heap_objects", m.tags, m.meta, map[string]interface{}{"value": memstats.HeapObjects}, timestamp)
|
||||
if err == nil {
|
||||
output <- y
|
||||
}
|
||||
}
|
||||
if m.config.GoRoutines {
|
||||
y, err := lp.NewMessage("num_goroutines", m.tags, m.meta, map[string]any{"value": runtime.NumGoroutine()}, timestamp)
|
||||
y, err := lp.NewMessage("num_goroutines", m.tags, m.meta, map[string]interface{}{"value": runtime.NumGoroutine()}, timestamp)
|
||||
if err == nil {
|
||||
output <- y
|
||||
}
|
||||
}
|
||||
if m.config.CgoCalls {
|
||||
y, err := lp.NewMessage("num_cgo_calls", m.tags, m.meta, map[string]any{"value": runtime.NumCgoCall()}, timestamp)
|
||||
y, err := lp.NewMessage("num_cgo_calls", m.tags, m.meta, map[string]interface{}{"value": runtime.NumCgoCall()}, timestamp)
|
||||
if err == nil {
|
||||
output <- y
|
||||
}
|
||||
@@ -112,35 +112,35 @@ func (m *SelfCollector) Read(interval time.Duration, output chan lp.CCMessage) {
|
||||
if err == nil {
|
||||
sec, nsec := rusage.Utime.Unix()
|
||||
t := float64(sec) + (float64(nsec) * 1e-9)
|
||||
y, err := lp.NewMessage("rusage_user_time", m.tags, m.meta, map[string]any{"value": t}, timestamp)
|
||||
y, err := lp.NewMessage("rusage_user_time", m.tags, m.meta, map[string]interface{}{"value": t}, timestamp)
|
||||
if err == nil {
|
||||
y.AddMeta("unit", "seconds")
|
||||
output <- y
|
||||
}
|
||||
sec, nsec = rusage.Stime.Unix()
|
||||
t = float64(sec) + (float64(nsec) * 1e-9)
|
||||
y, err = lp.NewMessage("rusage_system_time", m.tags, m.meta, map[string]any{"value": t}, timestamp)
|
||||
y, err = lp.NewMessage("rusage_system_time", m.tags, m.meta, map[string]interface{}{"value": t}, timestamp)
|
||||
if err == nil {
|
||||
y.AddMeta("unit", "seconds")
|
||||
output <- y
|
||||
}
|
||||
y, err = lp.NewMessage("rusage_vol_ctx_switch", m.tags, m.meta, map[string]any{"value": rusage.Nvcsw}, timestamp)
|
||||
y, err = lp.NewMessage("rusage_vol_ctx_switch", m.tags, m.meta, map[string]interface{}{"value": rusage.Nvcsw}, timestamp)
|
||||
if err == nil {
|
||||
output <- y
|
||||
}
|
||||
y, err = lp.NewMessage("rusage_invol_ctx_switch", m.tags, m.meta, map[string]any{"value": rusage.Nivcsw}, timestamp)
|
||||
y, err = lp.NewMessage("rusage_invol_ctx_switch", m.tags, m.meta, map[string]interface{}{"value": rusage.Nivcsw}, timestamp)
|
||||
if err == nil {
|
||||
output <- y
|
||||
}
|
||||
y, err = lp.NewMessage("rusage_signals", m.tags, m.meta, map[string]any{"value": rusage.Nsignals}, timestamp)
|
||||
y, err = lp.NewMessage("rusage_signals", m.tags, m.meta, map[string]interface{}{"value": rusage.Nsignals}, timestamp)
|
||||
if err == nil {
|
||||
output <- y
|
||||
}
|
||||
y, err = lp.NewMessage("rusage_major_pgfaults", m.tags, m.meta, map[string]any{"value": rusage.Majflt}, timestamp)
|
||||
y, err = lp.NewMessage("rusage_major_pgfaults", m.tags, m.meta, map[string]interface{}{"value": rusage.Majflt}, timestamp)
|
||||
if err == nil {
|
||||
output <- y
|
||||
}
|
||||
y, err = lp.NewMessage("rusage_minor_pgfaults", m.tags, m.meta, map[string]any{"value": rusage.Minflt}, timestamp)
|
||||
y, err = lp.NewMessage("rusage_minor_pgfaults", m.tags, m.meta, map[string]interface{}{"value": rusage.Minflt}, timestamp)
|
||||
if err == nil {
|
||||
output <- y
|
||||
}
|
||||
|
||||
@@ -257,7 +257,7 @@ func (m *SlurmCgroupCollector) Read(interval time.Duration, output chan lp.CCMes
|
||||
|
||||
if coreCount > 0 && !m.isExcluded("job_mem_used") {
|
||||
memPerCore := jobdata.MemoryUsage / coreCount
|
||||
if y, err := lp.NewMessage("job_mem_used", coreTags, m.meta, map[string]any{"value": memPerCore}, timestamp); err == nil {
|
||||
if y, err := lp.NewMessage("job_mem_used", coreTags, m.meta, map[string]interface{}{"value": memPerCore}, timestamp); err == nil {
|
||||
y.AddMeta("unit", "Bytes")
|
||||
output <- y
|
||||
}
|
||||
@@ -265,7 +265,7 @@ func (m *SlurmCgroupCollector) Read(interval time.Duration, output chan lp.CCMes
|
||||
|
||||
if coreCount > 0 && !m.isExcluded("job_max_mem_used") {
|
||||
maxMemPerCore := jobdata.MaxMemoryUsage / coreCount
|
||||
if y, err := lp.NewMessage("job_max_mem_used", coreTags, m.meta, map[string]any{"value": maxMemPerCore}, timestamp); err == nil {
|
||||
if y, err := lp.NewMessage("job_max_mem_used", coreTags, m.meta, map[string]interface{}{"value": maxMemPerCore}, timestamp); err == nil {
|
||||
y.AddMeta("unit", "Bytes")
|
||||
output <- y
|
||||
}
|
||||
@@ -273,7 +273,7 @@ func (m *SlurmCgroupCollector) Read(interval time.Duration, output chan lp.CCMes
|
||||
|
||||
if coreCount > 0 && !m.isExcluded("job_mem_limit") {
|
||||
limitPerCore := jobdata.LimitMemoryUsage / coreCount
|
||||
if y, err := lp.NewMessage("job_mem_limit", coreTags, m.meta, map[string]any{"value": limitPerCore}, timestamp); err == nil {
|
||||
if y, err := lp.NewMessage("job_mem_limit", coreTags, m.meta, map[string]interface{}{"value": limitPerCore}, timestamp); err == nil {
|
||||
y.AddMeta("unit", "Bytes")
|
||||
output <- y
|
||||
}
|
||||
@@ -281,7 +281,7 @@ func (m *SlurmCgroupCollector) Read(interval time.Duration, output chan lp.CCMes
|
||||
|
||||
if coreCount > 0 && !m.isExcluded("job_user_cpu") {
|
||||
cpuUserPerCore := jobdata.CpuUsageUser / coreCount
|
||||
if y, err := lp.NewMessage("job_user_cpu", coreTags, m.meta, map[string]any{"value": cpuUserPerCore}, timestamp); err == nil {
|
||||
if y, err := lp.NewMessage("job_user_cpu", coreTags, m.meta, map[string]interface{}{"value": cpuUserPerCore}, timestamp); err == nil {
|
||||
y.AddMeta("unit", "%")
|
||||
output <- y
|
||||
}
|
||||
@@ -289,7 +289,7 @@ func (m *SlurmCgroupCollector) Read(interval time.Duration, output chan lp.CCMes
|
||||
|
||||
if coreCount > 0 && !m.isExcluded("job_sys_cpu") {
|
||||
cpuSysPerCore := jobdata.CpuUsageSys / coreCount
|
||||
if y, err := lp.NewMessage("job_sys_cpu", coreTags, m.meta, map[string]any{"value": cpuSysPerCore}, timestamp); err == nil {
|
||||
if y, err := lp.NewMessage("job_sys_cpu", coreTags, m.meta, map[string]interface{}{"value": cpuSysPerCore}, timestamp); err == nil {
|
||||
y.AddMeta("unit", "%")
|
||||
output <- y
|
||||
}
|
||||
@@ -308,35 +308,35 @@ func (m *SlurmCgroupCollector) Read(interval time.Duration, output chan lp.CCMes
|
||||
}
|
||||
|
||||
if !m.isExcluded("job_mem_used") {
|
||||
if y, err := lp.NewMessage("job_mem_used", coreTags, m.meta, map[string]any{"value": 0}, timestamp); err == nil {
|
||||
if y, err := lp.NewMessage("job_mem_used", coreTags, m.meta, map[string]interface{}{"value": 0}, timestamp); err == nil {
|
||||
y.AddMeta("unit", "Bytes")
|
||||
output <- y
|
||||
}
|
||||
}
|
||||
|
||||
if !m.isExcluded("job_max_mem_used") {
|
||||
if y, err := lp.NewMessage("job_max_mem_used", coreTags, m.meta, map[string]any{"value": 0}, timestamp); err == nil {
|
||||
if y, err := lp.NewMessage("job_max_mem_used", coreTags, m.meta, map[string]interface{}{"value": 0}, timestamp); err == nil {
|
||||
y.AddMeta("unit", "Bytes")
|
||||
output <- y
|
||||
}
|
||||
}
|
||||
|
||||
if !m.isExcluded("job_mem_limit") {
|
||||
if y, err := lp.NewMessage("job_mem_limit", coreTags, m.meta, map[string]any{"value": 0}, timestamp); err == nil {
|
||||
if y, err := lp.NewMessage("job_mem_limit", coreTags, m.meta, map[string]interface{}{"value": 0}, timestamp); err == nil {
|
||||
y.AddMeta("unit", "Bytes")
|
||||
output <- y
|
||||
}
|
||||
}
|
||||
|
||||
if !m.isExcluded("job_user_cpu") {
|
||||
if y, err := lp.NewMessage("job_user_cpu", coreTags, m.meta, map[string]any{"value": 0}, timestamp); err == nil {
|
||||
if y, err := lp.NewMessage("job_user_cpu", coreTags, m.meta, map[string]interface{}{"value": 0}, timestamp); err == nil {
|
||||
y.AddMeta("unit", "%")
|
||||
output <- y
|
||||
}
|
||||
}
|
||||
|
||||
if !m.isExcluded("job_sys_cpu") {
|
||||
if y, err := lp.NewMessage("job_sys_cpu", coreTags, m.meta, map[string]any{"value": 0}, timestamp); err == nil {
|
||||
if y, err := lp.NewMessage("job_sys_cpu", coreTags, m.meta, map[string]interface{}{"value": 0}, timestamp); err == nil {
|
||||
y.AddMeta("unit", "%")
|
||||
output <- y
|
||||
}
|
||||
|
||||
@@ -203,7 +203,7 @@ func (m *TempCollector) Read(interval time.Duration, output chan lp.CCMessage) {
|
||||
sensor.metricName,
|
||||
sensor.tags,
|
||||
m.meta,
|
||||
map[string]any{"value": x},
|
||||
map[string]interface{}{"value": x},
|
||||
time.Now(),
|
||||
)
|
||||
if err == nil {
|
||||
@@ -216,7 +216,7 @@ func (m *TempCollector) Read(interval time.Duration, output chan lp.CCMessage) {
|
||||
sensor.maxTempName,
|
||||
sensor.tags,
|
||||
m.meta,
|
||||
map[string]any{"value": sensor.maxTemp},
|
||||
map[string]interface{}{"value": sensor.maxTemp},
|
||||
time.Now(),
|
||||
)
|
||||
if err == nil {
|
||||
@@ -230,7 +230,7 @@ func (m *TempCollector) Read(interval time.Duration, output chan lp.CCMessage) {
|
||||
sensor.critTempName,
|
||||
sensor.tags,
|
||||
m.meta,
|
||||
map[string]any{"value": sensor.critTemp},
|
||||
map[string]interface{}{"value": sensor.critTemp},
|
||||
time.Now(),
|
||||
)
|
||||
if err == nil {
|
||||
|
||||
@@ -81,7 +81,7 @@ func (m *TopProcsCollector) Read(interval time.Duration, output chan lp.CCMessag
|
||||
lines := strings.Split(string(stdout), "\n")
|
||||
for i := 1; i < m.config.Num_procs+1; i++ {
|
||||
name := fmt.Sprintf("topproc%d", i)
|
||||
y, err := lp.NewMessage(name, m.tags, m.meta, map[string]any{"value": string(lines[i])}, time.Now())
|
||||
y, err := lp.NewMessage(name, m.tags, m.meta, map[string]interface{}{"value": string(lines[i])}, time.Now())
|
||||
if err == nil {
|
||||
output <- y
|
||||
}
|
||||
|
||||
4
go.mod
4
go.mod
@@ -3,7 +3,7 @@ module github.com/ClusterCockpit/cc-metric-collector
|
||||
go 1.24.0
|
||||
|
||||
require (
|
||||
github.com/ClusterCockpit/cc-lib/v2 v2.2.1
|
||||
github.com/ClusterCockpit/cc-lib/v2 v2.1.0
|
||||
github.com/ClusterCockpit/go-rocm-smi v0.3.0
|
||||
github.com/NVIDIA/go-nvml v0.13.0-1
|
||||
github.com/PaesslerAG/gval v1.2.4
|
||||
@@ -11,7 +11,7 @@ require (
|
||||
github.com/influxdata/line-protocol v0.0.0-20210922203350-b1ad95c89adf
|
||||
github.com/tklauser/go-sysconf v0.3.16
|
||||
golang.design/x/thread v0.0.0-20210122121316-335e9adffdf1
|
||||
golang.org/x/sys v0.41.0
|
||||
golang.org/x/sys v0.40.0
|
||||
)
|
||||
|
||||
require (
|
||||
|
||||
8
go.sum
8
go.sum
@@ -1,5 +1,5 @@
|
||||
github.com/ClusterCockpit/cc-lib/v2 v2.2.1 h1:iCVas+Jc61zFH5S2VG3H1sc7tsn+U4lOJwUYjYZEims=
|
||||
github.com/ClusterCockpit/cc-lib/v2 v2.2.1/go.mod h1:JuxMAuEOaLLNEnnL9U3ejha8kMvsSatLdKPZEgJw6iw=
|
||||
github.com/ClusterCockpit/cc-lib/v2 v2.1.0 h1:B6l6h0IjfEuY9DU6aVM3fSsj24lQ1eudXK9QTKmJjqg=
|
||||
github.com/ClusterCockpit/cc-lib/v2 v2.1.0/go.mod h1:JuxMAuEOaLLNEnnL9U3ejha8kMvsSatLdKPZEgJw6iw=
|
||||
github.com/ClusterCockpit/go-rocm-smi v0.3.0 h1:1qZnSpG7/NyLtc7AjqnUL9Jb8xtqG1nMVgp69rJfaR8=
|
||||
github.com/ClusterCockpit/go-rocm-smi v0.3.0/go.mod h1:+I3UMeX3OlizXDf1WpGD43W4KGZZGVSGmny6rTeOnWA=
|
||||
github.com/NVIDIA/go-nvml v0.11.6-0/go.mod h1:hy7HYeQy335x6nEss0Ne3PYqleRa6Ct+VKD9RQ4nyFs=
|
||||
@@ -122,8 +122,8 @@ golang.org/x/exp v0.0.0-20260112195511-716be5621a96/go.mod h1:nzimsREAkjBCIEFtHi
|
||||
golang.org/x/net v0.49.0 h1:eeHFmOGUTtaaPSGNmjBKpbng9MulQsJURQUAfUwY++o=
|
||||
golang.org/x/net v0.49.0/go.mod h1:/ysNB2EvaqvesRkuLAyjI1ycPZlQHM3q01F02UY/MV8=
|
||||
golang.org/x/sys v0.0.0-20210122093101-04d7465088b8/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
|
||||
golang.org/x/sys v0.41.0 h1:Ivj+2Cp/ylzLiEU89QhWblYnOE9zerudt9Ftecq2C6k=
|
||||
golang.org/x/sys v0.41.0/go.mod h1:OgkHotnGiDImocRcuBABYBEXf8A9a87e/uXjp9XT3ks=
|
||||
golang.org/x/sys v0.40.0 h1:DBZZqJ2Rkml6QMQsZywtnjnnGvHza6BTfYFWY9kjEWQ=
|
||||
golang.org/x/sys v0.40.0/go.mod h1:OgkHotnGiDImocRcuBABYBEXf8A9a87e/uXjp9XT3ks=
|
||||
golang.org/x/time v0.14.0 h1:MRx4UaLrDotUKUdCIqzPC48t1Y9hANFKIRpNx+Te8PI=
|
||||
golang.org/x/time v0.14.0/go.mod h1:eL/Oa2bBBK0TkX57Fyni+NgnyQQN4LitPmob2Hjnqw4=
|
||||
golang.org/x/xerrors v0.0.0-20191204190536-9bdfabe68543/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0=
|
||||
|
||||
@@ -13,7 +13,6 @@ import (
|
||||
"maps"
|
||||
"math"
|
||||
"os"
|
||||
"slices"
|
||||
"strings"
|
||||
"sync"
|
||||
"time"
|
||||
@@ -38,7 +37,7 @@ type MetricAggregatorIntervalConfig struct {
|
||||
|
||||
type metricAggregator struct {
|
||||
functions []*MetricAggregatorIntervalConfig
|
||||
constants map[string]any
|
||||
constants map[string]interface{}
|
||||
language gval.Language
|
||||
output chan lp.CCMessage
|
||||
}
|
||||
@@ -86,7 +85,7 @@ var evaluables = struct {
|
||||
func (c *metricAggregator) Init(output chan lp.CCMessage) error {
|
||||
c.output = output
|
||||
c.functions = make([]*MetricAggregatorIntervalConfig, 0)
|
||||
c.constants = make(map[string]any)
|
||||
c.constants = make(map[string]interface{})
|
||||
|
||||
// add constants like hostname, numSockets, ... to constants list
|
||||
// Set hostname
|
||||
@@ -122,7 +121,7 @@ func (c *metricAggregator) Init(output chan lp.CCMessage) error {
|
||||
}
|
||||
|
||||
func (c *metricAggregator) Eval(starttime time.Time, endtime time.Time, metrics []lp.CCMessage) {
|
||||
vars := make(map[string]any)
|
||||
vars := make(map[string]interface{})
|
||||
maps.Copy(vars, c.constants)
|
||||
vars["starttime"] = starttime
|
||||
vars["endtime"] = endtime
|
||||
@@ -263,15 +262,15 @@ func (c *metricAggregator) Eval(starttime time.Time, endtime time.Time, metrics
|
||||
var m lp.CCMessage
|
||||
switch t := value.(type) {
|
||||
case float64:
|
||||
m, err = lp.NewMessage(f.Name, tags, meta, map[string]any{"value": t}, starttime)
|
||||
m, err = lp.NewMessage(f.Name, tags, meta, map[string]interface{}{"value": t}, starttime)
|
||||
case float32:
|
||||
m, err = lp.NewMessage(f.Name, tags, meta, map[string]any{"value": t}, starttime)
|
||||
m, err = lp.NewMessage(f.Name, tags, meta, map[string]interface{}{"value": t}, starttime)
|
||||
case int:
|
||||
m, err = lp.NewMessage(f.Name, tags, meta, map[string]any{"value": t}, starttime)
|
||||
m, err = lp.NewMessage(f.Name, tags, meta, map[string]interface{}{"value": t}, starttime)
|
||||
case int64:
|
||||
m, err = lp.NewMessage(f.Name, tags, meta, map[string]any{"value": t}, starttime)
|
||||
m, err = lp.NewMessage(f.Name, tags, meta, map[string]interface{}{"value": t}, starttime)
|
||||
case string:
|
||||
m, err = lp.NewMessage(f.Name, tags, meta, map[string]any{"value": t}, starttime)
|
||||
m, err = lp.NewMessage(f.Name, tags, meta, map[string]interface{}{"value": t}, starttime)
|
||||
default:
|
||||
cclog.ComponentError("MetricCache", "Gval returned invalid type", t, "skipping metric", f.Name)
|
||||
}
|
||||
@@ -329,19 +328,18 @@ func (c *metricAggregator) AddAggregation(name, function, condition string, tags
|
||||
}
|
||||
|
||||
func (c *metricAggregator) DeleteAggregation(name string) error {
|
||||
i := slices.IndexFunc(
|
||||
c.functions,
|
||||
func(agg *MetricAggregatorIntervalConfig) bool {
|
||||
return agg.Name == name
|
||||
})
|
||||
if i == -1 {
|
||||
return fmt.Errorf("no aggregation for metric name %s", name)
|
||||
for i, agg := range c.functions {
|
||||
if agg.Name == name {
|
||||
copy(c.functions[i:], c.functions[i+1:])
|
||||
c.functions[len(c.functions)-1] = nil
|
||||
c.functions = c.functions[:len(c.functions)-1]
|
||||
return nil
|
||||
}
|
||||
}
|
||||
c.functions = slices.Delete(c.functions, i, i)
|
||||
return nil
|
||||
return fmt.Errorf("no aggregation for metric name %s", name)
|
||||
}
|
||||
|
||||
func (c *metricAggregator) AddConstant(name string, value any) {
|
||||
func (c *metricAggregator) AddConstant(name string, value interface{}) {
|
||||
c.constants[name] = value
|
||||
}
|
||||
|
||||
@@ -349,11 +347,11 @@ func (c *metricAggregator) DelConstant(name string) {
|
||||
delete(c.constants, name)
|
||||
}
|
||||
|
||||
func (c *metricAggregator) AddFunction(name string, function func(args ...any) (any, error)) {
|
||||
func (c *metricAggregator) AddFunction(name string, function func(args ...interface{}) (interface{}, error)) {
|
||||
c.language = gval.NewLanguage(c.language, gval.Function(name, function))
|
||||
}
|
||||
|
||||
func EvalBoolCondition(condition string, params map[string]any) (bool, error) {
|
||||
func EvalBoolCondition(condition string, params map[string]interface{}) (bool, error) {
|
||||
evaluables.mutex.Lock()
|
||||
evaluable, ok := evaluables.mapping[condition]
|
||||
evaluables.mutex.Unlock()
|
||||
|
||||
@@ -33,7 +33,7 @@ func sumAnyType[T float64 | float32 | int | int32 | int64](values []T) (T, error
|
||||
}
|
||||
|
||||
// Sum up values
|
||||
func sumfunc(args any) (any, error) {
|
||||
func sumfunc(args interface{}) (interface{}, error) {
|
||||
|
||||
var err error
|
||||
switch values := args.(type) {
|
||||
@@ -62,7 +62,7 @@ func minAnyType[T float64 | float32 | int | int32 | int64](values []T) (T, error
|
||||
}
|
||||
|
||||
// Get the minimum value
|
||||
func minfunc(args any) (any, error) {
|
||||
func minfunc(args interface{}) (interface{}, error) {
|
||||
switch values := args.(type) {
|
||||
case []float64:
|
||||
return minAnyType(values)
|
||||
@@ -83,12 +83,12 @@ func avgAnyType[T float64 | float32 | int | int32 | int64](values []T) (float64,
|
||||
if len(values) == 0 {
|
||||
return 0.0, errors.New("average function requires at least one argument")
|
||||
}
|
||||
sum, err := sumAnyType(values)
|
||||
sum, err := sumAnyType[T](values)
|
||||
return float64(sum) / float64(len(values)), err
|
||||
}
|
||||
|
||||
// Get the average or mean value
|
||||
func avgfunc(args any) (any, error) {
|
||||
func avgfunc(args interface{}) (interface{}, error) {
|
||||
switch values := args.(type) {
|
||||
case []float64:
|
||||
return avgAnyType(values)
|
||||
@@ -113,7 +113,7 @@ func maxAnyType[T float64 | float32 | int | int32 | int64](values []T) (T, error
|
||||
}
|
||||
|
||||
// Get the maximum value
|
||||
func maxfunc(args any) (any, error) {
|
||||
func maxfunc(args interface{}) (interface{}, error) {
|
||||
switch values := args.(type) {
|
||||
case []float64:
|
||||
return maxAnyType(values)
|
||||
@@ -145,7 +145,7 @@ func medianAnyType[T float64 | float32 | int | int32 | int64](values []T) (T, er
|
||||
}
|
||||
|
||||
// Get the median value
|
||||
func medianfunc(args any) (any, error) {
|
||||
func medianfunc(args interface{}) (interface{}, error) {
|
||||
switch values := args.(type) {
|
||||
case []float64:
|
||||
return medianAnyType(values)
|
||||
@@ -166,7 +166,7 @@ func medianfunc(args any) (any, error) {
|
||||
* Get number of values in list. Returns always an int
|
||||
*/
|
||||
|
||||
func lenfunc(args any) (any, error) {
|
||||
func lenfunc(args interface{}) (interface{}, error) {
|
||||
var err error = nil
|
||||
length := 0
|
||||
switch values := args.(type) {
|
||||
@@ -180,7 +180,13 @@ func lenfunc(args any) (any, error) {
|
||||
length = len(values)
|
||||
case []int32:
|
||||
length = len(values)
|
||||
case float64, float32, int, int64:
|
||||
case float64:
|
||||
err = errors.New("function 'len' can only be applied on arrays and strings")
|
||||
case float32:
|
||||
err = errors.New("function 'len' can only be applied on arrays and strings")
|
||||
case int:
|
||||
err = errors.New("function 'len' can only be applied on arrays and strings")
|
||||
case int64:
|
||||
err = errors.New("function 'len' can only be applied on arrays and strings")
|
||||
case string:
|
||||
length = len(values)
|
||||
@@ -190,13 +196,13 @@ func lenfunc(args any) (any, error) {
|
||||
|
||||
/*
|
||||
* Check if a values is in a list
|
||||
* In contrast to most of the other functions, this one is an infix operator for
|
||||
* In constrast to most of the other functions, this one is an infix operator for
|
||||
* - substring matching: `"abc" in "abcdef"` -> true
|
||||
* - substring matching with int casting: `3 in "abd3"` -> true
|
||||
* - search for an int in an int list: `3 in getCpuList()` -> true (if you have more than 4 CPU hardware threads)
|
||||
*/
|
||||
|
||||
func infunc(a any, b any) (any, error) {
|
||||
func infunc(a interface{}, b interface{}) (interface{}, error) {
|
||||
switch match := a.(type) {
|
||||
case string:
|
||||
switch total := b.(type) {
|
||||
@@ -206,7 +212,11 @@ func infunc(a any, b any) (any, error) {
|
||||
case int:
|
||||
switch total := b.(type) {
|
||||
case []int:
|
||||
return slices.Contains(total, match), nil
|
||||
for _, x := range total {
|
||||
if x == match {
|
||||
return true, nil
|
||||
}
|
||||
}
|
||||
case string:
|
||||
smatch := fmt.Sprintf("%d", match)
|
||||
return strings.Contains(total, smatch), nil
|
||||
@@ -222,7 +232,7 @@ func infunc(a any, b any) (any, error) {
|
||||
* format keys \d = %d, \w = %d, ... Not sure how to fix this
|
||||
*/
|
||||
|
||||
func matchfunc(args ...any) (any, error) {
|
||||
func matchfunc(args ...interface{}) (interface{}, error) {
|
||||
switch match := args[0].(type) {
|
||||
case string:
|
||||
switch total := args[1].(type) {
|
||||
@@ -244,7 +254,7 @@ func matchfunc(args ...any) (any, error) {
|
||||
*/
|
||||
|
||||
// for a given cpuid, it returns the core id
|
||||
func getCpuCoreFunc(args any) (any, error) {
|
||||
func getCpuCoreFunc(args interface{}) (interface{}, error) {
|
||||
switch cpuid := args.(type) {
|
||||
case int:
|
||||
return topo.GetHwthreadCore(cpuid), nil
|
||||
@@ -253,7 +263,7 @@ func getCpuCoreFunc(args any) (any, error) {
|
||||
}
|
||||
|
||||
// for a given cpuid, it returns the socket id
|
||||
func getCpuSocketFunc(args any) (any, error) {
|
||||
func getCpuSocketFunc(args interface{}) (interface{}, error) {
|
||||
switch cpuid := args.(type) {
|
||||
case int:
|
||||
return topo.GetHwthreadSocket(cpuid), nil
|
||||
@@ -262,7 +272,7 @@ func getCpuSocketFunc(args any) (any, error) {
|
||||
}
|
||||
|
||||
// for a given cpuid, it returns the id of the NUMA node
|
||||
func getCpuNumaDomainFunc(args any) (any, error) {
|
||||
func getCpuNumaDomainFunc(args interface{}) (interface{}, error) {
|
||||
switch cpuid := args.(type) {
|
||||
case int:
|
||||
return topo.GetHwthreadNumaDomain(cpuid), nil
|
||||
@@ -271,7 +281,7 @@ func getCpuNumaDomainFunc(args any) (any, error) {
|
||||
}
|
||||
|
||||
// for a given cpuid, it returns the id of the CPU die
|
||||
func getCpuDieFunc(args any) (any, error) {
|
||||
func getCpuDieFunc(args interface{}) (interface{}, error) {
|
||||
switch cpuid := args.(type) {
|
||||
case int:
|
||||
return topo.GetHwthreadDie(cpuid), nil
|
||||
@@ -280,7 +290,7 @@ func getCpuDieFunc(args any) (any, error) {
|
||||
}
|
||||
|
||||
// for a given core id, it returns the list of cpuids
|
||||
func getCpuListOfCoreFunc(args any) (any, error) {
|
||||
func getCpuListOfCoreFunc(args interface{}) (interface{}, error) {
|
||||
cpulist := make([]int, 0)
|
||||
switch in := args.(type) {
|
||||
case int:
|
||||
@@ -294,7 +304,7 @@ func getCpuListOfCoreFunc(args any) (any, error) {
|
||||
}
|
||||
|
||||
// for a given socket id, it returns the list of cpuids
|
||||
func getCpuListOfSocketFunc(args any) (any, error) {
|
||||
func getCpuListOfSocketFunc(args interface{}) (interface{}, error) {
|
||||
cpulist := make([]int, 0)
|
||||
switch in := args.(type) {
|
||||
case int:
|
||||
@@ -308,7 +318,7 @@ func getCpuListOfSocketFunc(args any) (any, error) {
|
||||
}
|
||||
|
||||
// for a given id of a NUMA domain, it returns the list of cpuids
|
||||
func getCpuListOfNumaDomainFunc(args any) (any, error) {
|
||||
func getCpuListOfNumaDomainFunc(args interface{}) (interface{}, error) {
|
||||
cpulist := make([]int, 0)
|
||||
switch in := args.(type) {
|
||||
case int:
|
||||
@@ -322,7 +332,7 @@ func getCpuListOfNumaDomainFunc(args any) (any, error) {
|
||||
}
|
||||
|
||||
// for a given CPU die id, it returns the list of cpuids
|
||||
func getCpuListOfDieFunc(args any) (any, error) {
|
||||
func getCpuListOfDieFunc(args interface{}) (interface{}, error) {
|
||||
cpulist := make([]int, 0)
|
||||
switch in := args.(type) {
|
||||
case int:
|
||||
@@ -336,14 +346,14 @@ func getCpuListOfDieFunc(args any) (any, error) {
|
||||
}
|
||||
|
||||
// wrapper function to get a list of all cpuids of the node
|
||||
func getCpuListOfNode() (any, error) {
|
||||
func getCpuListOfNode() (interface{}, error) {
|
||||
return topo.HwthreadList(), nil
|
||||
}
|
||||
|
||||
// helper function to get the cpuid list for a CCMetric type tag set (type and type-id)
|
||||
// since there is no access to the metric data in the function, is should be called like
|
||||
// `getCpuListOfType()`
|
||||
func getCpuListOfType(args ...any) (any, error) {
|
||||
func getCpuListOfType(args ...interface{}) (interface{}, error) {
|
||||
cpulist := make([]int, 0)
|
||||
switch typ := args[0].(type) {
|
||||
case string:
|
||||
|
||||
@@ -10,7 +10,6 @@ package metricRouter
|
||||
import (
|
||||
"encoding/json"
|
||||
"fmt"
|
||||
"maps"
|
||||
"os"
|
||||
"strings"
|
||||
"sync"
|
||||
@@ -193,8 +192,8 @@ func (r *metricRouter) Init(ticker mct.MultiChanTicker, wg *sync.WaitGroup, rout
|
||||
return nil
|
||||
}
|
||||
|
||||
func getParamMap(point lp.CCMessage) map[string]any {
|
||||
params := make(map[string]any)
|
||||
func getParamMap(point lp.CCMessage) map[string]interface{} {
|
||||
params := make(map[string]interface{})
|
||||
params["metric"] = point
|
||||
params["name"] = point.Name()
|
||||
for key, value := range point.Tags() {
|
||||
@@ -203,7 +202,9 @@ func getParamMap(point lp.CCMessage) map[string]any {
|
||||
for key, value := range point.Meta() {
|
||||
params[key] = value
|
||||
}
|
||||
maps.Copy(params, point.Fields())
|
||||
for key, value := range point.Fields() {
|
||||
params[key] = value
|
||||
}
|
||||
params["timestamp"] = point.Time()
|
||||
return params
|
||||
}
|
||||
|
||||
@@ -30,11 +30,11 @@ make
|
||||
|
||||
%install
|
||||
install -Dpm 0750 %{name} %{buildroot}%{_bindir}/%{name}
|
||||
install -Dpm 0600 example-configs/config.json %{buildroot}%{_sysconfdir}/%{name}/%{name}.json
|
||||
install -Dpm 0600 example-configs/collectors.json %{buildroot}%{_sysconfdir}/%{name}/collectors.json
|
||||
install -Dpm 0600 example-configs/sinks.json %{buildroot}%{_sysconfdir}/%{name}/sinks.json
|
||||
install -Dpm 0600 example-configs/receivers.json %{buildroot}%{_sysconfdir}/%{name}/receivers.json
|
||||
install -Dpm 0600 example-configs/router.json %{buildroot}%{_sysconfdir}/%{name}/router.json
|
||||
install -Dpm 0600 config.json %{buildroot}%{_sysconfdir}/%{name}/%{name}.json
|
||||
install -Dpm 0600 collectors.json %{buildroot}%{_sysconfdir}/%{name}/collectors.json
|
||||
install -Dpm 0600 sinks.json %{buildroot}%{_sysconfdir}/%{name}/sinks.json
|
||||
install -Dpm 0600 receivers.json %{buildroot}%{_sysconfdir}/%{name}/receivers.json
|
||||
install -Dpm 0600 router.json %{buildroot}%{_sysconfdir}/%{name}/router.json
|
||||
install -Dpm 0644 scripts/%{name}.service %{buildroot}%{_unitdir}/%{name}.service
|
||||
install -Dpm 0600 scripts/%{name}.config %{buildroot}%{_sysconfdir}/default/%{name}
|
||||
install -Dpm 0644 scripts/%{name}.sysusers %{buildroot}%{_sysusersdir}/%{name}.conf
|
||||
|
||||
Reference in New Issue
Block a user