From 76884c3380aaf1b1b5e83964c3d2e9224a0052e8 Mon Sep 17 00:00:00 2001 From: Thomas Roehl Date: Wed, 26 Jan 2022 18:45:23 +0100 Subject: [PATCH] Prefix Nvidia metrics with 'nv_' --- collectors/nvidiaMetric.go | 88 +++++++++++++++++++------------------- collectors/nvidiaMetric.md | 44 +++++++++---------- 2 files changed, 66 insertions(+), 66 deletions(-) diff --git a/collectors/nvidiaMetric.go b/collectors/nvidiaMetric.go index 6f5141a..1eff3be 100644 --- a/collectors/nvidiaMetric.go +++ b/collectors/nvidiaMetric.go @@ -73,13 +73,13 @@ func (m *NvidiaCollector) Read(interval time.Duration, output chan lp.CCMetric) util, ret := nvml.DeviceGetUtilizationRates(device) if ret == nvml.SUCCESS { - _, skip = stringArrayContains(m.config.ExcludeMetrics, "util") - y, err := lp.New("util", tags, m.meta, map[string]interface{}{"value": float64(util.Gpu)}, time.Now()) + _, skip = stringArrayContains(m.config.ExcludeMetrics, "nv_util") + y, err := lp.New("nv_util", tags, m.meta, map[string]interface{}{"value": float64(util.Gpu)}, time.Now()) if err == nil && !skip { output <- y } - _, skip = stringArrayContains(m.config.ExcludeMetrics, "mem_util") - y, err = lp.New("mem_util", tags, m.meta, map[string]interface{}{"value": float64(util.Memory)}, time.Now()) + _, skip = stringArrayContains(m.config.ExcludeMetrics, "nv_mem_util") + y, err = lp.New("nv_mem_util", tags, m.meta, map[string]interface{}{"value": float64(util.Memory)}, time.Now()) if err == nil && !skip { output <- y } @@ -88,15 +88,15 @@ func (m *NvidiaCollector) Read(interval time.Duration, output chan lp.CCMetric) meminfo, ret := nvml.DeviceGetMemoryInfo(device) if ret == nvml.SUCCESS { t := float64(meminfo.Total) / (1024 * 1024) - _, skip = stringArrayContains(m.config.ExcludeMetrics, "mem_total") - y, err := lp.New("mem_total", tags, m.meta, map[string]interface{}{"value": t}, time.Now()) + _, skip = stringArrayContains(m.config.ExcludeMetrics, "nv_mem_total") + y, err := lp.New("nv_mem_total", tags, m.meta, 
map[string]interface{}{"value": t}, time.Now()) if err == nil && !skip { y.AddMeta("unit", "MByte") output <- y } f := float64(meminfo.Used) / (1024 * 1024) - _, skip = stringArrayContains(m.config.ExcludeMetrics, "fb_memory") - y, err = lp.New("fb_memory", tags, m.meta, map[string]interface{}{"value": f}, time.Now()) + _, skip = stringArrayContains(m.config.ExcludeMetrics, "nv_fb_memory") + y, err = lp.New("nv_fb_memory", tags, m.meta, map[string]interface{}{"value": f}, time.Now()) if err == nil && !skip { y.AddMeta("unit", "MByte") output <- y @@ -105,8 +105,8 @@ func (m *NvidiaCollector) Read(interval time.Duration, output chan lp.CCMetric) temp, ret := nvml.DeviceGetTemperature(device, nvml.TEMPERATURE_GPU) if ret == nvml.SUCCESS { - _, skip = stringArrayContains(m.config.ExcludeMetrics, "temp") - y, err := lp.New("temp", tags, m.meta, map[string]interface{}{"value": float64(temp)}, time.Now()) + _, skip = stringArrayContains(m.config.ExcludeMetrics, "nv_temp") + y, err := lp.New("nv_temp", tags, m.meta, map[string]interface{}{"value": float64(temp)}, time.Now()) if err == nil && !skip { y.AddMeta("unit", "degC") output <- y @@ -115,8 +115,8 @@ func (m *NvidiaCollector) Read(interval time.Duration, output chan lp.CCMetric) fan, ret := nvml.DeviceGetFanSpeed(device) if ret == nvml.SUCCESS { - _, skip = stringArrayContains(m.config.ExcludeMetrics, "fan") - y, err := lp.New("fan", tags, m.meta, map[string]interface{}{"value": float64(fan)}, time.Now()) + _, skip = stringArrayContains(m.config.ExcludeMetrics, "nv_fan") + y, err := lp.New("nv_fan", tags, m.meta, map[string]interface{}{"value": float64(fan)}, time.Now()) if err == nil && !skip { output <- y } @@ -128,19 +128,19 @@ func (m *NvidiaCollector) Read(interval time.Duration, output chan lp.CCMetric) var err error switch ecc_pend { case nvml.FEATURE_DISABLED: - y, err = lp.New("ecc_mode", tags, m.meta, map[string]interface{}{"value": string("OFF")}, time.Now()) + y, err = lp.New("nv_ecc_mode", tags, m.meta, 
map[string]interface{}{"value": string("OFF")}, time.Now()) case nvml.FEATURE_ENABLED: - y, err = lp.New("ecc_mode", tags, m.meta, map[string]interface{}{"value": string("ON")}, time.Now()) + y, err = lp.New("nv_ecc_mode", tags, m.meta, map[string]interface{}{"value": string("ON")}, time.Now()) default: - y, err = lp.New("ecc_mode", tags, m.meta, map[string]interface{}{"value": string("UNKNOWN")}, time.Now()) + y, err = lp.New("nv_ecc_mode", tags, m.meta, map[string]interface{}{"value": string("UNKNOWN")}, time.Now()) } - _, skip = stringArrayContains(m.config.ExcludeMetrics, "ecc_mode") + _, skip = stringArrayContains(m.config.ExcludeMetrics, "nv_ecc_mode") if err == nil && !skip { output <- y } } else if ret == nvml.ERROR_NOT_SUPPORTED { - _, skip = stringArrayContains(m.config.ExcludeMetrics, "ecc_mode") - y, err := lp.New("ecc_mode", tags, m.meta, map[string]interface{}{"value": string("N/A")}, time.Now()) + _, skip = stringArrayContains(m.config.ExcludeMetrics, "nv_ecc_mode") + y, err := lp.New("nv_ecc_mode", tags, m.meta, map[string]interface{}{"value": string("N/A")}, time.Now()) if err == nil && !skip { output <- y } @@ -148,8 +148,8 @@ func (m *NvidiaCollector) Read(interval time.Duration, output chan lp.CCMetric) pstate, ret := nvml.DeviceGetPerformanceState(device) if ret == nvml.SUCCESS { - _, skip = stringArrayContains(m.config.ExcludeMetrics, "perf_state") - y, err := lp.New("perf_state", tags, m.meta, map[string]interface{}{"value": fmt.Sprintf("P%d", int(pstate))}, time.Now()) + _, skip = stringArrayContains(m.config.ExcludeMetrics, "nv_perf_state") + y, err := lp.New("nv_perf_state", tags, m.meta, map[string]interface{}{"value": fmt.Sprintf("P%d", int(pstate))}, time.Now()) if err == nil && !skip { output <- y } @@ -157,8 +157,8 @@ func (m *NvidiaCollector) Read(interval time.Duration, output chan lp.CCMetric) power, ret := nvml.DeviceGetPowerUsage(device) if ret == nvml.SUCCESS { - _, skip = stringArrayContains(m.config.ExcludeMetrics, 
"power_usage_report") - y, err := lp.New("power_usage_report", tags, m.meta, map[string]interface{}{"value": float64(power) / 1000}, time.Now()) + _, skip = stringArrayContains(m.config.ExcludeMetrics, "nv_power_usage_report") + y, err := lp.New("nv_power_usage_report", tags, m.meta, map[string]interface{}{"value": float64(power) / 1000}, time.Now()) if err == nil && !skip { output <- y } @@ -166,8 +166,8 @@ func (m *NvidiaCollector) Read(interval time.Duration, output chan lp.CCMetric) gclk, ret := nvml.DeviceGetClockInfo(device, nvml.CLOCK_GRAPHICS) if ret == nvml.SUCCESS { - _, skip = stringArrayContains(m.config.ExcludeMetrics, "graphics_clock_report") - y, err := lp.New("graphics_clock_report", tags, m.meta, map[string]interface{}{"value": float64(gclk)}, time.Now()) + _, skip = stringArrayContains(m.config.ExcludeMetrics, "nv_graphics_clock_report") + y, err := lp.New("nv_graphics_clock_report", tags, m.meta, map[string]interface{}{"value": float64(gclk)}, time.Now()) if err == nil && !skip { output <- y } @@ -175,8 +175,8 @@ func (m *NvidiaCollector) Read(interval time.Duration, output chan lp.CCMetric) smclk, ret := nvml.DeviceGetClockInfo(device, nvml.CLOCK_SM) if ret == nvml.SUCCESS { - _, skip = stringArrayContains(m.config.ExcludeMetrics, "sm_clock_report") - y, err := lp.New("sm_clock_report", tags, m.meta, map[string]interface{}{"value": float64(smclk)}, time.Now()) + _, skip = stringArrayContains(m.config.ExcludeMetrics, "nv_sm_clock_report") + y, err := lp.New("nv_sm_clock_report", tags, m.meta, map[string]interface{}{"value": float64(smclk)}, time.Now()) if err == nil && !skip { output <- y } @@ -184,8 +184,8 @@ func (m *NvidiaCollector) Read(interval time.Duration, output chan lp.CCMetric) memclk, ret := nvml.DeviceGetClockInfo(device, nvml.CLOCK_MEM) if ret == nvml.SUCCESS { - _, skip = stringArrayContains(m.config.ExcludeMetrics, "mem_clock_report") - y, err := lp.New("mem_clock_report", tags, m.meta, map[string]interface{}{"value": 
float64(memclk)}, time.Now()) + _, skip = stringArrayContains(m.config.ExcludeMetrics, "nv_mem_clock_report") + y, err := lp.New("nv_mem_clock_report", tags, m.meta, map[string]interface{}{"value": float64(memclk)}, time.Now()) if err == nil && !skip { output <- y } @@ -193,8 +193,8 @@ func (m *NvidiaCollector) Read(interval time.Duration, output chan lp.CCMetric) max_gclk, ret := nvml.DeviceGetMaxClockInfo(device, nvml.CLOCK_GRAPHICS) if ret == nvml.SUCCESS { - _, skip = stringArrayContains(m.config.ExcludeMetrics, "max_graphics_clock") - y, err := lp.New("max_graphics_clock", tags, m.meta, map[string]interface{}{"value": float64(max_gclk)}, time.Now()) + _, skip = stringArrayContains(m.config.ExcludeMetrics, "nv_max_graphics_clock") + y, err := lp.New("nv_max_graphics_clock", tags, m.meta, map[string]interface{}{"value": float64(max_gclk)}, time.Now()) if err == nil && !skip { output <- y } @@ -202,8 +202,8 @@ func (m *NvidiaCollector) Read(interval time.Duration, output chan lp.CCMetric) max_smclk, ret := nvml.DeviceGetClockInfo(device, nvml.CLOCK_SM) if ret == nvml.SUCCESS { - _, skip = stringArrayContains(m.config.ExcludeMetrics, "max_sm_clock") - y, err := lp.New("max_sm_clock", tags, m.meta, map[string]interface{}{"value": float64(max_smclk)}, time.Now()) + _, skip = stringArrayContains(m.config.ExcludeMetrics, "nv_max_sm_clock") + y, err := lp.New("nv_max_sm_clock", tags, m.meta, map[string]interface{}{"value": float64(max_smclk)}, time.Now()) if err == nil && !skip { output <- y } @@ -211,8 +211,8 @@ func (m *NvidiaCollector) Read(interval time.Duration, output chan lp.CCMetric) max_memclk, ret := nvml.DeviceGetClockInfo(device, nvml.CLOCK_MEM) if ret == nvml.SUCCESS { - _, skip = stringArrayContains(m.config.ExcludeMetrics, "max_mem_clock") - y, err := lp.New("max_mem_clock", tags, m.meta, map[string]interface{}{"value": float64(max_memclk)}, time.Now()) + _, skip = stringArrayContains(m.config.ExcludeMetrics, "nv_max_mem_clock") + y, err := 
lp.New("nv_max_mem_clock", tags, m.meta, map[string]interface{}{"value": float64(max_memclk)}, time.Now()) if err == nil && !skip { output <- y } @@ -220,8 +220,8 @@ func (m *NvidiaCollector) Read(interval time.Duration, output chan lp.CCMetric) ecc_db, ret := nvml.DeviceGetTotalEccErrors(device, 1, 1) if ret == nvml.SUCCESS { - _, skip = stringArrayContains(m.config.ExcludeMetrics, "ecc_db_error") - y, err := lp.New("ecc_db_error", tags, m.meta, map[string]interface{}{"value": float64(ecc_db)}, time.Now()) + _, skip = stringArrayContains(m.config.ExcludeMetrics, "nv_ecc_db_error") + y, err := lp.New("nv_ecc_db_error", tags, m.meta, map[string]interface{}{"value": float64(ecc_db)}, time.Now()) if err == nil && !skip { output <- y } @@ -229,8 +229,8 @@ func (m *NvidiaCollector) Read(interval time.Duration, output chan lp.CCMetric) ecc_sb, ret := nvml.DeviceGetTotalEccErrors(device, 0, 1) if ret == nvml.SUCCESS { - _, skip = stringArrayContains(m.config.ExcludeMetrics, "ecc_sb_error") - y, err := lp.New("ecc_sb_error", tags, m.meta, map[string]interface{}{"value": float64(ecc_sb)}, time.Now()) + _, skip = stringArrayContains(m.config.ExcludeMetrics, "nv_ecc_sb_error") + y, err := lp.New("nv_ecc_sb_error", tags, m.meta, map[string]interface{}{"value": float64(ecc_sb)}, time.Now()) if err == nil && !skip { output <- y } @@ -238,8 +238,8 @@ func (m *NvidiaCollector) Read(interval time.Duration, output chan lp.CCMetric) pwr_limit, ret := nvml.DeviceGetPowerManagementLimit(device) if ret == nvml.SUCCESS { - _, skip = stringArrayContains(m.config.ExcludeMetrics, "power_man_limit") - y, err := lp.New("power_man_limit", tags, m.meta, map[string]interface{}{"value": float64(pwr_limit)}, time.Now()) + _, skip = stringArrayContains(m.config.ExcludeMetrics, "nv_power_man_limit") + y, err := lp.New("nv_power_man_limit", tags, m.meta, map[string]interface{}{"value": float64(pwr_limit)}, time.Now()) if err == nil && !skip { output <- y } @@ -247,8 +247,8 @@ func (m 
*NvidiaCollector) Read(interval time.Duration, output chan lp.CCMetric) enc_util, _, ret := nvml.DeviceGetEncoderUtilization(device) if ret == nvml.SUCCESS { - _, skip = stringArrayContains(m.config.ExcludeMetrics, "encoder_util") - y, err := lp.New("encoder_util", tags, m.meta, map[string]interface{}{"value": float64(enc_util)}, time.Now()) + _, skip = stringArrayContains(m.config.ExcludeMetrics, "nv_encoder_util") + y, err := lp.New("nv_encoder_util", tags, m.meta, map[string]interface{}{"value": float64(enc_util)}, time.Now()) if err == nil && !skip { output <- y } @@ -256,8 +256,8 @@ func (m *NvidiaCollector) Read(interval time.Duration, output chan lp.CCMetric) dec_util, _, ret := nvml.DeviceGetDecoderUtilization(device) if ret == nvml.SUCCESS { - _, skip = stringArrayContains(m.config.ExcludeMetrics, "decoder_util") - y, err := lp.New("decoder_util", tags, m.meta, map[string]interface{}{"value": float64(dec_util)}, time.Now()) + _, skip = stringArrayContains(m.config.ExcludeMetrics, "nv_decoder_util") + y, err := lp.New("nv_decoder_util", tags, m.meta, map[string]interface{}{"value": float64(dec_util)}, time.Now()) if err == nil && !skip { output <- y } diff --git a/collectors/nvidiaMetric.md b/collectors/nvidiaMetric.md index e2e08e5..afe8b9e 100644 --- a/collectors/nvidiaMetric.md +++ b/collectors/nvidiaMetric.md @@ -7,33 +7,33 @@ "0","1" ], "exclude_metrics": [ - "fb_memory", - "fan" + "nv_fb_memory", + "nv_fan" ] } ``` Metrics: -* `util` -* `mem_util` -* `mem_total` -* `fb_memory` -* `temp` -* `fan` -* `ecc_mode` -* `perf_state` -* `power_usage_report` -* `graphics_clock_report` -* `sm_clock_report` -* `mem_clock_report` -* `max_graphics_clock` -* `max_sm_clock` -* `max_mem_clock` -* `ecc_db_error` -* `ecc_sb_error` -* `power_man_limit` -* `encoder_util` -* `decoder_util` +* `nv_util` +* `nv_mem_util` +* `nv_mem_total` +* `nv_fb_memory` +* `nv_temp` +* `nv_fan` +* `nv_ecc_mode` +* `nv_perf_state` +* `nv_power_usage_report` +* `nv_graphics_clock_report` +* 
`nv_sm_clock_report` +* `nv_mem_clock_report` +* `nv_max_graphics_clock` +* `nv_max_sm_clock` +* `nv_max_mem_clock` +* `nv_ecc_db_error` +* `nv_ecc_sb_error` +* `nv_power_man_limit` +* `nv_encoder_util` +* `nv_decoder_util` It uses a separate `type` in the metrics. The output metric looks like this: `<name>,type=accelerator,type-id=<nvidia-gpu-id> value=<metric value> <timestamp>`