diff --git a/collectors/nvidiaMetric.go b/collectors/nvidiaMetric.go
index 369f12b..64356a7 100644
--- a/collectors/nvidiaMetric.go
+++ b/collectors/nvidiaMetric.go
@@ -6,6 +6,7 @@ import (
 	"fmt"
 	"log"
 	"strings"
+	"sync"
 	"time"
 
 	cclog "github.com/ClusterCockpit/cc-metric-collector/pkg/ccLogger"
@@ -24,6 +25,120 @@ type NvidiaCollectorConfig struct {
 	ProcessMigDevices     bool     `json:"process_mig_devices,omitempty"`
 	UseUuidForMigDevices  bool     `json:"use_uuid_for_mig_device,omitempty"`
 	UseSliceForMigDevices bool     `json:"use_slice_for_mig_device,omitempty"`
+	AveragePowerInterval  string   `json:"average_power_interval,omitempty"`
+}
+
+// powerAverager samples the power draw of a single device in a background
+// goroutine and accumulates the readings until they are collected with Get.
+type powerAverager struct {
+	device   nvml.Device
+	interval time.Duration
+	done     chan bool
+	wg       sync.WaitGroup
+
+	// mu guards the fields below, which are shared between the sampling
+	// goroutine and the callers of Start, IsRunning, Get and Close.
+	mu           sync.Mutex
+	powerSum     float64
+	powerSamples int
+	running      bool
+}
+
+// PowerAverager computes the average power usage of a device between
+// consecutive calls to Get.
+type PowerAverager interface {
+	Start()
+	IsRunning() bool
+	Get() float64
+	Close()
+}
+
+// IsRunning reports whether the averager has been started and not yet closed.
+func (pa *powerAverager) IsRunning() bool {
+	pa.mu.Lock()
+	defer pa.mu.Unlock()
+	return pa.running
+}
+
+// Start launches the sampling goroutine. It does nothing when the averager
+// is already running.
+func (pa *powerAverager) Start() {
+	pa.mu.Lock()
+	defer pa.mu.Unlock()
+	if pa.running {
+		return
+	}
+	// Set the state before the goroutine is scheduled so that an immediately
+	// following IsRunning or Close already observes a running averager.
+	pa.running = true
+	pa.wg.Add(1)
+
+	go func() {
+		defer pa.wg.Done()
+		ticker := time.NewTicker(pa.interval)
+		// Release the ticker when sampling ends.
+		defer ticker.Stop()
+		for {
+			select {
+			case <-pa.done:
+				return
+			case <-ticker.C:
+				power, ret := nvml.DeviceGetPowerUsage(pa.device)
+				if ret == nvml.SUCCESS {
+					// The reading is in milliwatts, the sum is kept in watts.
+					pa.mu.Lock()
+					pa.powerSum += float64(power) / 1000
+					pa.powerSamples++
+					pa.mu.Unlock()
+				}
+			}
+		}
+	}()
+}
+
+// Get returns the average power in watts of the samples taken since the
+// previous call and resets the accumulation. It returns 0 if no sample was
+// taken in the meantime.
+func (pa *powerAverager) Get() float64 {
+	pa.mu.Lock()
+	defer pa.mu.Unlock()
+	if pa.powerSamples == 0 {
+		return 0
+	}
+	avg := pa.powerSum / float64(pa.powerSamples)
+	pa.powerSum = 0
+	pa.powerSamples = 0
+	return avg
+}
+
+// Close stops the sampling goroutine and waits until it has exited. It is
+// safe to call on an averager that was never started or is already closed.
+func (pa *powerAverager) Close() {
+	pa.mu.Lock()
+	wasRunning := pa.running
+	pa.running = false
+	pa.mu.Unlock()
+	if !wasRunning {
+		// No goroutine receives on done, so sending would block forever.
+		return
+	}
+	pa.done <- true
+	pa.wg.Wait()
+}
+
+// NewPowerAverager creates a stopped averager that samples the power usage
+// of device every interval once Start is called. It returns an error if
+// interval is not positive, because a ticker cannot be created for it.
+func NewPowerAverager(device nvml.Device, interval time.Duration) (PowerAverager, error) {
+	if interval <= 0 {
+		return nil, fmt.Errorf("invalid power averaging interval %s", interval)
+	}
+	pa := &powerAverager{
+		device:   device,
+		interval: interval,
+		done:     make(chan bool),
+	}
+	return pa, nil
 }
 
 type NvidiaCollectorDevice struct {
@@ -31,6 +146,8 @@ type NvidiaCollectorDevice struct {
 	excludeMetrics map[string]bool
 	tags           map[string]string
 	meta           map[string]string
+	powerInterval  time.Duration
+	averager       PowerAverager
 }
 
 type NvidiaCollector struct {
@@ -55,6 +172,7 @@ func (m *NvidiaCollector) Init(config json.RawMessage) error {
 	m.config.ProcessMigDevices = false
 	m.config.UseUuidForMigDevices = false
 	m.config.UseSliceForMigDevices = false
+	m.config.AveragePowerInterval = ""
 	m.setup()
 	if len(config) > 0 {
 		err = json.Unmarshal(config, &m.config)
@@ -93,6 +211,16 @@ func (m *NvidiaCollector) Init(config json.RawMessage) error {
 		return err
 	}
 
+	powerDur := time.Duration(0)
+	if len(m.config.AveragePowerInterval) > 0 {
+		d, err := time.ParseDuration(m.config.AveragePowerInterval)
+		if err != nil {
+			cclog.ComponentError(m.name, "Unable to parse average_power_interval ", m.config.AveragePowerInterval, ":", err.Error())
+			return err
+		}
+		powerDur = d
+	}
+
 	// For all GPUs
 	idx := 0
 	m.gpus = make([]NvidiaCollectorDevice, num_gpus)
@@ -197,6 +325,15 @@ func (m *NvidiaCollector) Init(config json.RawMessage) error {
 			g.excludeMetrics[e] = true
 		}
 
+		if powerDur > 0 {
+			a, err := NewPowerAverager(g.device, powerDur)
+			if err != nil {
+				cclog.ComponentError(m.name, "Failed to initialize power averager for device at index", i, ":", err.Error())
+			} else {
+				g.averager = a
+			}
+		}
+
 		// Increment the index for the next device
 		idx++
 	}
@@ -436,6 +573,24 @@ func readPerfState(device NvidiaCollectorDevice, output chan lp.CCMetric) error
 	return nil
 }
 
+// readPowerUsageAverage sends the metric nv_power_usage_avg with the average
+// power usage in watts since the previous read. The first call only starts the
+// device's averager; devices without an averager are skipped.
+func readPowerUsageAverage(device NvidiaCollectorDevice, output chan lp.CCMetric) error {
+	if !device.excludeMetrics["nv_power_usage_avg"] && device.averager != nil {
+		if !device.averager.IsRunning() {
+			device.averager.Start()
+		} else {
+			y, err := lp.New("nv_power_usage_avg", device.tags, device.meta, map[string]interface{}{"value": device.averager.Get()}, time.Now())
+			if err == nil {
+				y.AddMeta("unit", "watts")
+				output <- y
+			}
+		}
+	}
+	return nil
+}
+
 func readPowerUsage(device NvidiaCollectorDevice, output chan lp.CCMetric) error {
 	if !device.excludeMetrics["nv_power_usage"] {
 		// Retrieves power usage for this GPU in milliwatts and its associated circuitry (e.g. memory)
@@ -1056,6 +1211,11 @@ func (m *NvidiaCollector) Read(interval time.Duration, output chan lp.CCMetric)
 		if err != nil {
 			cclog.ComponentDebug(m.name, "readPowerUsage for device", name, "failed")
 		}
+
+		err = readPowerUsageAverage(device, output)
+		if err != nil {
+			cclog.ComponentDebug(m.name, "readPowerUsageAverage for device", name, "failed")
+		}
 
 		err = readClocks(device, output)
 		if err != nil {
@@ -1198,6 +1358,12 @@ func (m *NvidiaCollector) Read(interval time.Duration, output chan lp.CCMetric)
 
 func (m *NvidiaCollector) Close() {
 	if m.init {
+		for i := 0; i < m.num_gpus; i++ {
+			// averager is nil for devices without power averaging
+			if m.gpus[i].averager != nil {
+				m.gpus[i].averager.Close()
+			}
+		}
 		nvml.Shutdown()
 		m.init = false
 	}