mirror of
				https://github.com/ClusterCockpit/cc-metric-collector.git
				synced 2025-10-31 00:55:06 +01:00 
			
		
		
		
	Add power averager to Nvidia GPU collector
This commit is contained in:
		| @@ -6,6 +6,7 @@ import ( | ||||
| 	"fmt" | ||||
| 	"log" | ||||
| 	"strings" | ||||
| 	"sync" | ||||
| 	"time" | ||||
|  | ||||
| 	cclog "github.com/ClusterCockpit/cc-metric-collector/pkg/ccLogger" | ||||
| @@ -24,6 +25,81 @@ type NvidiaCollectorConfig struct { | ||||
| 	ProcessMigDevices     bool     `json:"process_mig_devices,omitempty"` | ||||
| 	UseUuidForMigDevices  bool     `json:"use_uuid_for_mig_device,omitempty"` | ||||
| 	UseSliceForMigDevices bool     `json:"use_slice_for_mig_device,omitempty"` | ||||
| 	AveragePowerInterval  string   `json:"average_power_interval,omitempty"` | ||||
| } | ||||
|  | ||||
| type powerAverager struct { | ||||
| 	device       nvml.Device | ||||
| 	interval     time.Duration | ||||
| 	done         chan bool | ||||
| 	wg           sync.WaitGroup | ||||
| 	powerSum     float64 | ||||
| 	powerSamples int | ||||
| 	ticker       *time.Ticker | ||||
| 	running      bool | ||||
| } | ||||
|  | ||||
| type PowerAverager interface { | ||||
| 	Start() | ||||
| 	IsRunning() bool | ||||
| 	Get() float64 | ||||
| 	Close() | ||||
| } | ||||
|  | ||||
| func (pa *powerAverager) IsRunning() bool { | ||||
| 	return pa.running | ||||
| } | ||||
|  | ||||
| func (pa *powerAverager) Start() { | ||||
| 	pa.wg.Add(1) | ||||
|  | ||||
| 	go func(avger *powerAverager) { | ||||
| 		avger.running = true | ||||
| 		avger.ticker = time.NewTicker(avger.interval) | ||||
| 		for { | ||||
| 			select { | ||||
| 			case <-avger.done: | ||||
| 				avger.wg.Done() | ||||
| 				avger.running = false | ||||
| 				return | ||||
| 			case <-avger.ticker.C: | ||||
| 				power, ret := nvml.DeviceGetPowerUsage(avger.device) | ||||
| 				if ret == nvml.SUCCESS { | ||||
| 					avger.powerSum += float64(power) / 1000 | ||||
| 					avger.powerSamples += 1 | ||||
| 				} | ||||
| 			} | ||||
| 		} | ||||
| 	}(pa) | ||||
| } | ||||
|  | ||||
| func (pa *powerAverager) Get() float64 { | ||||
| 	avg := float64(0) | ||||
| 	if pa.powerSamples > 0 { | ||||
| 		pa.ticker.Stop() | ||||
| 		avg = pa.powerSum / float64(pa.powerSamples) | ||||
| 		pa.powerSum = 0 | ||||
| 		pa.powerSamples = 0 | ||||
| 		pa.ticker.Reset(pa.interval) | ||||
| 	} | ||||
| 	return avg | ||||
| } | ||||
|  | ||||
| func (pa *powerAverager) Close() { | ||||
| 	pa.done <- true | ||||
| 	pa.wg.Wait() | ||||
| 	pa.running = false | ||||
| } | ||||
|  | ||||
| func NewPowerAverager(device nvml.Device, interval time.Duration) (PowerAverager, error) { | ||||
| 	pa := new(powerAverager) | ||||
| 	pa.device = device | ||||
| 	pa.interval = interval | ||||
| 	pa.done = make(chan bool) | ||||
| 	pa.powerSamples = 0 | ||||
| 	pa.powerSum = 0 | ||||
| 	pa.running = false | ||||
| 	return pa, nil | ||||
| } | ||||
|  | ||||
| type NvidiaCollectorDevice struct { | ||||
| @@ -31,6 +107,8 @@ type NvidiaCollectorDevice struct { | ||||
| 	excludeMetrics map[string]bool | ||||
| 	tags           map[string]string | ||||
| 	meta           map[string]string | ||||
| 	powerInterval  time.Duration | ||||
| 	averager       PowerAverager | ||||
| } | ||||
|  | ||||
| type NvidiaCollector struct { | ||||
| @@ -55,6 +133,7 @@ func (m *NvidiaCollector) Init(config json.RawMessage) error { | ||||
| 	m.config.ProcessMigDevices = false | ||||
| 	m.config.UseUuidForMigDevices = false | ||||
| 	m.config.UseSliceForMigDevices = false | ||||
| 	m.config.AveragePowerInterval = "" | ||||
| 	m.setup() | ||||
| 	if len(config) > 0 { | ||||
| 		err = json.Unmarshal(config, &m.config) | ||||
| @@ -93,6 +172,16 @@ func (m *NvidiaCollector) Init(config json.RawMessage) error { | ||||
| 		return err | ||||
| 	} | ||||
|  | ||||
| 	powerDur := time.Duration(0) | ||||
| 	if len(m.config.AveragePowerInterval) > 0 { | ||||
| 		d, err := time.ParseDuration(m.config.AveragePowerInterval) | ||||
| 		if err != nil { | ||||
| 			cclog.ComponentError(m.name, "Unable to parse average_power_interval ", m.config.AveragePowerInterval, ":", err.Error()) | ||||
| 			return err | ||||
| 		} | ||||
| 		powerDur = d | ||||
| 	} | ||||
|  | ||||
| 	// For all GPUs | ||||
| 	idx := 0 | ||||
| 	m.gpus = make([]NvidiaCollectorDevice, num_gpus) | ||||
| @@ -197,6 +286,15 @@ func (m *NvidiaCollector) Init(config json.RawMessage) error { | ||||
| 			g.excludeMetrics[e] = true | ||||
| 		} | ||||
|  | ||||
| 		if powerDur > 0 { | ||||
| 			a, err := NewPowerAverager(g.device, powerDur) | ||||
| 			if err != nil { | ||||
| 				cclog.ComponentError(m.name, "Failed to initialize power averager for device at index", i, ":", err.Error()) | ||||
| 			} else { | ||||
| 				g.averager = a | ||||
| 			} | ||||
| 		} | ||||
|  | ||||
| 		// Increment the index for the next device | ||||
| 		idx++ | ||||
| 	} | ||||
| @@ -436,6 +534,21 @@ func readPerfState(device NvidiaCollectorDevice, output chan lp.CCMetric) error | ||||
| 	return nil | ||||
| } | ||||
|  | ||||
| func readPowerUsageAverage(device NvidiaCollectorDevice, output chan lp.CCMetric) error { | ||||
| 	if !device.excludeMetrics["nv_power_usage_avg"] && device.averager != nil { | ||||
| 		if !device.averager.IsRunning() { | ||||
| 			device.averager.Start() | ||||
| 		} else { | ||||
| 			y, err := lp.New("nv_power_usage_avg", device.tags, device.meta, map[string]interface{}{"value": device.averager.Get()}, time.Now()) | ||||
| 			if err == nil { | ||||
| 				y.AddMeta("unit", "watts") | ||||
| 				output <- y | ||||
| 			} | ||||
| 		} | ||||
| 	} | ||||
| 	return nil | ||||
| } | ||||
|  | ||||
| func readPowerUsage(device NvidiaCollectorDevice, output chan lp.CCMetric) error { | ||||
| 	if !device.excludeMetrics["nv_power_usage"] { | ||||
| 		// Retrieves power usage for this GPU in milliwatts and its associated circuitry (e.g. memory) | ||||
| @@ -1022,95 +1135,100 @@ func (m *NvidiaCollector) Read(interval time.Duration, output chan lp.CCMetric) | ||||
| 		if ret != nvml.SUCCESS { | ||||
| 			name = "NoName" | ||||
| 		} | ||||
| 		err = readMemoryInfo(device, output) | ||||
| 		if err != nil { | ||||
| 			cclog.ComponentDebug(m.name, "readMemoryInfo for device", name, "failed") | ||||
| 		} | ||||
| 		// err = readMemoryInfo(device, output) | ||||
| 		// if err != nil { | ||||
| 		// 	cclog.ComponentDebug(m.name, "readMemoryInfo for device", name, "failed") | ||||
| 		// } | ||||
|  | ||||
| 		err = readUtilization(device, output) | ||||
| 		if err != nil { | ||||
| 			cclog.ComponentDebug(m.name, "readUtilization for device", name, "failed") | ||||
| 		} | ||||
| 		// err = readUtilization(device, output) | ||||
| 		// if err != nil { | ||||
| 		// 	cclog.ComponentDebug(m.name, "readUtilization for device", name, "failed") | ||||
| 		// } | ||||
|  | ||||
| 		err = readTemp(device, output) | ||||
| 		if err != nil { | ||||
| 			cclog.ComponentDebug(m.name, "readTemp for device", name, "failed") | ||||
| 		} | ||||
| 		// err = readTemp(device, output) | ||||
| 		// if err != nil { | ||||
| 		// 	cclog.ComponentDebug(m.name, "readTemp for device", name, "failed") | ||||
| 		// } | ||||
|  | ||||
| 		err = readFan(device, output) | ||||
| 		if err != nil { | ||||
| 			cclog.ComponentDebug(m.name, "readFan for device", name, "failed") | ||||
| 		} | ||||
| 		// err = readFan(device, output) | ||||
| 		// if err != nil { | ||||
| 		// 	cclog.ComponentDebug(m.name, "readFan for device", name, "failed") | ||||
| 		// } | ||||
|  | ||||
| 		err = readEccMode(device, output) | ||||
| 		if err != nil { | ||||
| 			cclog.ComponentDebug(m.name, "readEccMode for device", name, "failed") | ||||
| 		} | ||||
| 		// err = readEccMode(device, output) | ||||
| 		// if err != nil { | ||||
| 		// 	cclog.ComponentDebug(m.name, "readEccMode for device", name, "failed") | ||||
| 		// } | ||||
|  | ||||
| 		err = readPerfState(device, output) | ||||
| 		if err != nil { | ||||
| 			cclog.ComponentDebug(m.name, "readPerfState for device", name, "failed") | ||||
| 		} | ||||
| 		// err = readPerfState(device, output) | ||||
| 		// if err != nil { | ||||
| 		// 	cclog.ComponentDebug(m.name, "readPerfState for device", name, "failed") | ||||
| 		// } | ||||
|  | ||||
| 		err = readPowerUsage(device, output) | ||||
| 		if err != nil { | ||||
| 			cclog.ComponentDebug(m.name, "readPowerUsage for device", name, "failed") | ||||
| 		} | ||||
|  | ||||
| 		err = readClocks(device, output) | ||||
| 		err = readPowerUsageAverage(device, output) | ||||
| 		if err != nil { | ||||
| 			cclog.ComponentDebug(m.name, "readClocks for device", name, "failed") | ||||
| 			cclog.ComponentDebug(m.name, "readPowerUsageAverage for device", name, "failed") | ||||
| 		} | ||||
|  | ||||
| 		err = readMaxClocks(device, output) | ||||
| 		if err != nil { | ||||
| 			cclog.ComponentDebug(m.name, "readMaxClocks for device", name, "failed") | ||||
| 		} | ||||
| 		// err = readClocks(device, output) | ||||
| 		// if err != nil { | ||||
| 		// 	cclog.ComponentDebug(m.name, "readClocks for device", name, "failed") | ||||
| 		// } | ||||
|  | ||||
| 		err = readEccErrors(device, output) | ||||
| 		if err != nil { | ||||
| 			cclog.ComponentDebug(m.name, "readEccErrors for device", name, "failed") | ||||
| 		} | ||||
| 		// err = readMaxClocks(device, output) | ||||
| 		// if err != nil { | ||||
| 		// 	cclog.ComponentDebug(m.name, "readMaxClocks for device", name, "failed") | ||||
| 		// } | ||||
|  | ||||
| 		err = readPowerLimit(device, output) | ||||
| 		if err != nil { | ||||
| 			cclog.ComponentDebug(m.name, "readPowerLimit for device", name, "failed") | ||||
| 		} | ||||
| 		// err = readEccErrors(device, output) | ||||
| 		// if err != nil { | ||||
| 		// 	cclog.ComponentDebug(m.name, "readEccErrors for device", name, "failed") | ||||
| 		// } | ||||
|  | ||||
| 		err = readEncUtilization(device, output) | ||||
| 		if err != nil { | ||||
| 			cclog.ComponentDebug(m.name, "readEncUtilization for device", name, "failed") | ||||
| 		} | ||||
| 		// err = readPowerLimit(device, output) | ||||
| 		// if err != nil { | ||||
| 		// 	cclog.ComponentDebug(m.name, "readPowerLimit for device", name, "failed") | ||||
| 		// } | ||||
|  | ||||
| 		err = readDecUtilization(device, output) | ||||
| 		if err != nil { | ||||
| 			cclog.ComponentDebug(m.name, "readDecUtilization for device", name, "failed") | ||||
| 		} | ||||
| 		// err = readEncUtilization(device, output) | ||||
| 		// if err != nil { | ||||
| 		// 	cclog.ComponentDebug(m.name, "readEncUtilization for device", name, "failed") | ||||
| 		// } | ||||
|  | ||||
| 		err = readRemappedRows(device, output) | ||||
| 		if err != nil { | ||||
| 			cclog.ComponentDebug(m.name, "readRemappedRows for device", name, "failed") | ||||
| 		} | ||||
| 		// err = readDecUtilization(device, output) | ||||
| 		// if err != nil { | ||||
| 		// 	cclog.ComponentDebug(m.name, "readDecUtilization for device", name, "failed") | ||||
| 		// } | ||||
|  | ||||
| 		err = readBarMemoryInfo(device, output) | ||||
| 		if err != nil { | ||||
| 			cclog.ComponentDebug(m.name, "readBarMemoryInfo for device", name, "failed") | ||||
| 		} | ||||
| 		// err = readRemappedRows(device, output) | ||||
| 		// if err != nil { | ||||
| 		// 	cclog.ComponentDebug(m.name, "readRemappedRows for device", name, "failed") | ||||
| 		// } | ||||
|  | ||||
| 		err = readProcessCounts(device, output) | ||||
| 		if err != nil { | ||||
| 			cclog.ComponentDebug(m.name, "readProcessCounts for device", name, "failed") | ||||
| 		} | ||||
| 		// err = readBarMemoryInfo(device, output) | ||||
| 		// if err != nil { | ||||
| 		// 	cclog.ComponentDebug(m.name, "readBarMemoryInfo for device", name, "failed") | ||||
| 		// } | ||||
|  | ||||
| 		err = readViolationStats(device, output) | ||||
| 		if err != nil { | ||||
| 			cclog.ComponentDebug(m.name, "readViolationStats for device", name, "failed") | ||||
| 		} | ||||
| 		// err = readProcessCounts(device, output) | ||||
| 		// if err != nil { | ||||
| 		// 	cclog.ComponentDebug(m.name, "readProcessCounts for device", name, "failed") | ||||
| 		// } | ||||
|  | ||||
| 		err = readNVLinkStats(device, output) | ||||
| 		if err != nil { | ||||
| 			cclog.ComponentDebug(m.name, "readNVLinkStats for device", name, "failed") | ||||
| 		} | ||||
| 		// err = readViolationStats(device, output) | ||||
| 		// if err != nil { | ||||
| 		// 	cclog.ComponentDebug(m.name, "readViolationStats for device", name, "failed") | ||||
| 		// } | ||||
|  | ||||
| 		// err = readNVLinkStats(device, output) | ||||
| 		// if err != nil { | ||||
| 		// 	cclog.ComponentDebug(m.name, "readNVLinkStats for device", name, "failed") | ||||
| 		// } | ||||
| 	} | ||||
|  | ||||
| 	// Actual read loop over all attached Nvidia GPUs | ||||
| @@ -1198,6 +1316,9 @@ func (m *NvidiaCollector) Read(interval time.Duration, output chan lp.CCMetric) | ||||
|  | ||||
| func (m *NvidiaCollector) Close() { | ||||
| 	if m.init { | ||||
| 		for i := 0; i < m.num_gpus; i++ { | ||||
| 			m.gpus[i].averager.Close() | ||||
| 		} | ||||
| 		nvml.Shutdown() | ||||
| 		m.init = false | ||||
| 	} | ||||
|   | ||||
		Reference in New Issue
	
	Block a user