Compare commits


1 Commit

| Author | SHA1 | Message | Date |
| --- | --- | --- | --- |
| Thomas Roehl | a1077b58a8 | Update Hugo integration | 2025-04-16 23:54:17 +02:00 |
3 changed files with 29 additions and 91 deletions

View File

@@ -31,8 +31,6 @@ type NvidiaCollectorDevice struct {
excludeMetrics map[string]bool
tags map[string]string
meta map[string]string
lastEnergyReading uint64
lastEnergyTimestamp time.Time
}
type NvidiaCollector struct {
@@ -151,8 +149,6 @@ func (m *NvidiaCollector) Init(config json.RawMessage) error {
// Add device handle
g.device = device
g.lastEnergyReading = 0
g.lastEnergyTimestamp = time.Now()
// Add tags
g.tags = map[string]string{
@@ -210,7 +206,7 @@ func (m *NvidiaCollector) Init(config json.RawMessage) error {
return nil
}
func readMemoryInfo(device *NvidiaCollectorDevice, output chan lp.CCMessage) error {
func readMemoryInfo(device NvidiaCollectorDevice, output chan lp.CCMessage) error {
if !device.excludeMetrics["nv_fb_mem_total"] || !device.excludeMetrics["nv_fb_mem_used"] || !device.excludeMetrics["nv_fb_mem_reserved"] {
var total uint64
var used uint64
@@ -254,7 +250,7 @@ func readMemoryInfo(device *NvidiaCollectorDevice, output chan lp.CCMessage) err
return nil
}
func readBarMemoryInfo(device *NvidiaCollectorDevice, output chan lp.CCMessage) error {
func readBarMemoryInfo(device NvidiaCollectorDevice, output chan lp.CCMessage) error {
if !device.excludeMetrics["nv_bar1_mem_total"] || !device.excludeMetrics["nv_bar1_mem_used"] {
meminfo, ret := nvml.DeviceGetBAR1MemoryInfo(device.device)
if ret != nvml.SUCCESS {
@@ -281,7 +277,7 @@ func readBarMemoryInfo(device *NvidiaCollectorDevice, output chan lp.CCMessage)
return nil
}
func readUtilization(device *NvidiaCollectorDevice, output chan lp.CCMessage) error {
func readUtilization(device NvidiaCollectorDevice, output chan lp.CCMessage) error {
isMig, ret := nvml.DeviceIsMigDeviceHandle(device.device)
if ret != nvml.SUCCESS {
err := errors.New(nvml.ErrorString(ret))
@@ -323,7 +319,7 @@ func readUtilization(device *NvidiaCollectorDevice, output chan lp.CCMessage) er
return nil
}
func readTemp(device *NvidiaCollectorDevice, output chan lp.CCMessage) error {
func readTemp(device NvidiaCollectorDevice, output chan lp.CCMessage) error {
if !device.excludeMetrics["nv_temp"] {
// Retrieves the current temperature readings for the device, in degrees C.
//
@@ -342,7 +338,7 @@ func readTemp(device *NvidiaCollectorDevice, output chan lp.CCMessage) error {
return nil
}
func readFan(device *NvidiaCollectorDevice, output chan lp.CCMessage) error {
func readFan(device NvidiaCollectorDevice, output chan lp.CCMessage) error {
if !device.excludeMetrics["nv_fan"] {
// Retrieves the intended operating speed of the device's fan.
//
@@ -365,7 +361,7 @@ func readFan(device *NvidiaCollectorDevice, output chan lp.CCMessage) error {
return nil
}
// func readFans(device *NvidiaCollectorDevice, output chan lp.CCMessage) error {
// func readFans(device NvidiaCollectorDevice, output chan lp.CCMessage) error {
// if !device.excludeMetrics["nv_fan"] {
// numFans, ret := nvml.DeviceGetNumFans(device.device)
// if ret == nvml.SUCCESS {
@@ -386,7 +382,7 @@ func readFan(device *NvidiaCollectorDevice, output chan lp.CCMessage) error {
// return nil
// }
func readEccMode(device *NvidiaCollectorDevice, output chan lp.CCMessage) error {
func readEccMode(device NvidiaCollectorDevice, output chan lp.CCMessage) error {
if !device.excludeMetrics["nv_ecc_mode"] {
// Retrieves the current and pending ECC modes for the device.
//
@@ -420,7 +416,7 @@ func readEccMode(device *NvidiaCollectorDevice, output chan lp.CCMessage) error
return nil
}
func readPerfState(device *NvidiaCollectorDevice, output chan lp.CCMessage) error {
func readPerfState(device NvidiaCollectorDevice, output chan lp.CCMessage) error {
if !device.excludeMetrics["nv_perf_state"] {
// Retrieves the current performance state for the device.
//
@@ -440,16 +436,13 @@ func readPerfState(device *NvidiaCollectorDevice, output chan lp.CCMessage) erro
return nil
}
func readPowerUsage(device *NvidiaCollectorDevice, output chan lp.CCMessage) error {
func readPowerUsage(device NvidiaCollectorDevice, output chan lp.CCMessage) error {
if !device.excludeMetrics["nv_power_usage"] {
// Retrieves power usage for this GPU in milliwatts and its associated circuitry (e.g. memory)
//
// On Fermi and Kepler GPUs the reading is accurate to within +/- 5% of current power draw.
// On Ampere (except GA100) or newer GPUs, the API returns power averaged over 1 sec interval.
// On GA100 and older architectures, instantaneous power is returned.
//
// It is only available if power management mode is supported.
// It is only available if power management mode is supported
mode, ret := nvml.DeviceGetPowerManagementMode(device.device)
if ret != nvml.SUCCESS {
return nil
@@ -468,54 +461,7 @@ func readPowerUsage(device *NvidiaCollectorDevice, output chan lp.CCMessage) err
return nil
}
func readEnergyConsumption(device *NvidiaCollectorDevice, output chan lp.CCMessage) error {
// Retrieves total energy consumption for this GPU in millijoules (mJ) since the driver was last reloaded
// For Volta or newer fully supported devices.
if (!device.excludeMetrics["nv_energy"]) && (!device.excludeMetrics["nv_energy_abs"]) && (!device.excludeMetrics["nv_average_power"]) {
now := time.Now()
mode, ret := nvml.DeviceGetPowerManagementMode(device.device)
if ret != nvml.SUCCESS {
return nil
}
if mode == nvml.FEATURE_ENABLED {
energy, ret := nvml.DeviceGetTotalEnergyConsumption(device.device)
if ret == nvml.SUCCESS {
if device.lastEnergyReading != 0 {
if !device.excludeMetrics["nv_energy"] {
y, err := lp.NewMetric("nv_energy", device.tags, device.meta, (energy-device.lastEnergyReading)/1000, now)
if err == nil {
y.AddMeta("unit", "Joules")
output <- y
}
}
if !device.excludeMetrics["nv_average_power"] {
energyDiff := (energy - device.lastEnergyReading) / 1000
timeDiff := now.Sub(device.lastEnergyTimestamp)
y, err := lp.NewMetric("nv_average_power", device.tags, device.meta, energyDiff/uint64(timeDiff.Seconds()), now)
if err == nil {
y.AddMeta("unit", "watts")
output <- y
}
}
}
if !device.excludeMetrics["nv_energy_abs"] {
y, err := lp.NewMetric("nv_energy_abs", device.tags, device.meta, energy/1000, now)
if err == nil {
y.AddMeta("unit", "Joules")
output <- y
}
}
device.lastEnergyReading = energy
device.lastEnergyTimestamp = time.Now()
}
}
}
return nil
}
func readClocks(device *NvidiaCollectorDevice, output chan lp.CCMessage) error {
func readClocks(device NvidiaCollectorDevice, output chan lp.CCMessage) error {
// Retrieves the current clock speeds for the device.
//
// Available clock information:
@@ -567,7 +513,7 @@ func readClocks(device *NvidiaCollectorDevice, output chan lp.CCMessage) error {
return nil
}
func readMaxClocks(device *NvidiaCollectorDevice, output chan lp.CCMessage) error {
func readMaxClocks(device NvidiaCollectorDevice, output chan lp.CCMessage) error {
// Retrieves the maximum clock speeds for the device.
//
// Available clock information:
@@ -625,7 +571,7 @@ func readMaxClocks(device *NvidiaCollectorDevice, output chan lp.CCMessage) erro
return nil
}
func readEccErrors(device *NvidiaCollectorDevice, output chan lp.CCMessage) error {
func readEccErrors(device NvidiaCollectorDevice, output chan lp.CCMessage) error {
if !device.excludeMetrics["nv_ecc_uncorrected_error"] {
// Retrieves the total ECC error counts for the device.
//
@@ -656,7 +602,7 @@ func readEccErrors(device *NvidiaCollectorDevice, output chan lp.CCMessage) erro
return nil
}
func readPowerLimit(device *NvidiaCollectorDevice, output chan lp.CCMessage) error {
func readPowerLimit(device NvidiaCollectorDevice, output chan lp.CCMessage) error {
if !device.excludeMetrics["nv_power_max_limit"] {
// Retrieves the power management limit associated with this device.
//
@@ -676,7 +622,7 @@ func readPowerLimit(device *NvidiaCollectorDevice, output chan lp.CCMessage) err
return nil
}
func readEncUtilization(device *NvidiaCollectorDevice, output chan lp.CCMessage) error {
func readEncUtilization(device NvidiaCollectorDevice, output chan lp.CCMessage) error {
isMig, ret := nvml.DeviceIsMigDeviceHandle(device.device)
if ret != nvml.SUCCESS {
err := errors.New(nvml.ErrorString(ret))
@@ -703,7 +649,7 @@ func readEncUtilization(device *NvidiaCollectorDevice, output chan lp.CCMessage)
return nil
}
func readDecUtilization(device *NvidiaCollectorDevice, output chan lp.CCMessage) error {
func readDecUtilization(device NvidiaCollectorDevice, output chan lp.CCMessage) error {
isMig, ret := nvml.DeviceIsMigDeviceHandle(device.device)
if ret != nvml.SUCCESS {
err := errors.New(nvml.ErrorString(ret))
@@ -730,7 +676,7 @@ func readDecUtilization(device *NvidiaCollectorDevice, output chan lp.CCMessage)
return nil
}
func readRemappedRows(device *NvidiaCollectorDevice, output chan lp.CCMessage) error {
func readRemappedRows(device NvidiaCollectorDevice, output chan lp.CCMessage) error {
if !device.excludeMetrics["nv_remapped_rows_corrected"] ||
!device.excludeMetrics["nv_remapped_rows_uncorrected"] ||
!device.excludeMetrics["nv_remapped_rows_pending"] ||
@@ -783,7 +729,7 @@ func readRemappedRows(device *NvidiaCollectorDevice, output chan lp.CCMessage) e
return nil
}
func readProcessCounts(device *NvidiaCollectorDevice, output chan lp.CCMessage) error {
func readProcessCounts(device NvidiaCollectorDevice, output chan lp.CCMessage) error {
if !device.excludeMetrics["nv_compute_processes"] {
// Get information about processes with a compute context on a device
//
@@ -875,7 +821,7 @@ func readProcessCounts(device *NvidiaCollectorDevice, output chan lp.CCMessage)
return nil
}
func readViolationStats(device *NvidiaCollectorDevice, output chan lp.CCMessage) error {
func readViolationStats(device NvidiaCollectorDevice, output chan lp.CCMessage) error {
var violTime nvml.ViolationTime
var ret nvml.Return
@@ -989,7 +935,7 @@ func readViolationStats(device *NvidiaCollectorDevice, output chan lp.CCMessage)
return nil
}
func readNVLinkStats(device *NvidiaCollectorDevice, output chan lp.CCMessage) error {
func readNVLinkStats(device NvidiaCollectorDevice, output chan lp.CCMessage) error {
// Retrieves the specified error counter value
// Please refer to \a nvmlNvLinkErrorCounter_t for error counters that are available
//
@@ -1124,7 +1070,7 @@ func (m *NvidiaCollector) Read(interval time.Duration, output chan lp.CCMessage)
return
}
readAll := func(device *NvidiaCollectorDevice, output chan lp.CCMessage) {
readAll := func(device NvidiaCollectorDevice, output chan lp.CCMessage) {
name, ret := nvml.DeviceGetName(device.device)
if ret != nvml.SUCCESS {
name = "NoName"
@@ -1164,11 +1110,6 @@ func (m *NvidiaCollector) Read(interval time.Duration, output chan lp.CCMessage)
cclog.ComponentDebug(m.name, "readPowerUsage for device", name, "failed")
}
err = readEnergyConsumption(device, output)
if err != nil {
cclog.ComponentDebug(m.name, "readEnergyConsumption for device", name, "failed")
}
err = readClocks(device, output)
if err != nil {
cclog.ComponentDebug(m.name, "readClocks for device", name, "failed")
@@ -1228,7 +1169,7 @@ func (m *NvidiaCollector) Read(interval time.Duration, output chan lp.CCMessage)
// Actual read loop over all attached Nvidia GPUs
for i := 0; i < m.num_gpus; i++ {
readAll(&m.gpus[i], output)
readAll(m.gpus[i], output)
// Iterate over all MIG devices if any
if m.config.ProcessMigDevices {
@@ -1302,7 +1243,7 @@ func (m *NvidiaCollector) Read(interval time.Duration, output chan lp.CCMessage)
}
}
readAll(&migDevice, output)
readAll(migDevice, output)
}
}
}

View File

@@ -82,8 +82,5 @@ Metrics:
* `nv_nvlink_ecc_errors`
* `nv_nvlink_replay_errors`
* `nv_nvlink_recovery_errors`
* `nv_energy`
* `nv_energy_abs`
* `nv_average_power`
Some metrics add the additional sub type tag (`stype`) like the `nv_nvlink_*` metrics set `stype=nvlink,stype-id=<link_number>`.
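For illustration only (this sketch is not part of the commit), a per-link NVLink counter could attach these sub-type tags by extending the per-device tag map before calling the `lp.NewMetric` helper already visible in the diff above; `linkIdx` and `count` are hypothetical placeholders for the link number and the counter value read via NVML:

```go
// Hedged sketch: emit one NVLink counter per link with stype/stype-id tags.
// Assumes the fmt and time imports already present in the collector file.
tags := map[string]string{}
for k, v := range device.tags {
	tags[k] = v // start from the per-device tags (hostname, type, type-id, ...)
}
tags["stype"] = "nvlink"
tags["stype-id"] = fmt.Sprintf("%d", linkIdx) // linkIdx: placeholder link number
y, err := lp.NewMetric("nv_nvlink_ecc_errors", tags, device.meta, count, time.Now())
if err == nil {
	output <- y
}
```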

View File

@@ -236,13 +236,13 @@ __deprecated__
The cc-metric-collector tries to read the data from the system as it is reported. If available, it tries to read the metric unit from the system as well (e.g. from `/proc/meminfo`). The problem is that, depending on the source, the metric units are named differently. Just think about `byte`, `Byte`, `B`, `bytes`, ...
The [cc-units](https://github.com/ClusterCockpit/cc-lib/ccUnits) package provides a normalization option to use the same metric unit name for all metrics. If this option is set to true, all `unit` meta tags are normalized.
The [cc-units](https://github.com/ClusterCockpit/cc-units) package provides a normalization option to use the same metric unit name for all metrics. If this option is set to true, all `unit` meta tags are normalized.
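As a minimal sketch, enabling the normalization in the metric router configuration could look like the snippet below; the option name `normalize_units` is an assumption here, since the section heading is not visible in this excerpt:

```json
{
  "normalize_units": true
}
```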
## The `change_unit_prefix` section
__deprecated__
It is often the case that metrics are reported by the system using a rather outdated unit prefix (for example, `/proc/meminfo` still uses kByte even though current memory sizes are in the GByte range). If you want to change the prefix of a unit, you can do that with the help of [cc-units](https://github.com/ClusterCockpit/cc-lib/ccUnits). The setting works on the metric name and requires the new prefix for the metric. The cc-units package determines the scaling factor.
It is often the case that metrics are reported by the system using a rather outdated unit prefix (for example, `/proc/meminfo` still uses kByte even though current memory sizes are in the GByte range). If you want to change the prefix of a unit, you can do that with the help of [cc-units](https://github.com/ClusterCockpit/cc-units). The setting works on the metric name and requires the new prefix for the metric. The cc-units package determines the scaling factor.
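A minimal sketch of such a prefix mapping, assuming the JSON router configuration format; the metric names and the target prefix `G` are placeholders:

```json
{
  "change_unit_prefix": {
    "mem_used": "G",
    "mem_total": "G"
  }
}
```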
# Aggregate metric values of the current interval with the `interval_aggregates` option