""" GPU Monitoring Infrastructure for LLM Benchmarking Provides unified interface for monitoring both NVIDIA and AMD GPUs. """ import time from abc import ABC, abstractmethod from dataclasses import dataclass from typing import Optional, List import warnings @dataclass class GPUMetrics: """Container for GPU metrics.""" timestamp: float power_watts: float gpu_utilization_percent: float memory_used_gb: float memory_total_gb: float temperature_celsius: Optional[float] = None energy_joules: Optional[float] = None # Cumulative energy class GPUMonitor(ABC): """Abstract base class for GPU monitoring.""" def __init__(self, device_id: int = 0): """ Initialize GPU monitor. Args: device_id: GPU device ID to monitor """ self.device_id = device_id self.start_time = None self.start_energy = None self.last_metrics = None @abstractmethod def get_metrics(self) -> GPUMetrics: """Get current GPU metrics.""" pass @abstractmethod def get_device_name(self) -> str: """Get GPU device name.""" pass @abstractmethod def cleanup(self): """Cleanup resources.""" pass def start_monitoring(self): """Start energy monitoring session.""" self.start_time = time.time() metrics = self.get_metrics() self.start_energy = metrics.energy_joules if metrics.energy_joules is not None else 0.0 self.last_metrics = metrics def get_energy_consumed(self) -> float: """ Get energy consumed since start_monitoring() was called. Returns: Energy in Joules """ if self.start_time is None: raise RuntimeError("Must call start_monitoring() first") current_metrics = self.get_metrics() if current_metrics.energy_joules is not None: # If GPU provides cumulative energy, use it return current_metrics.energy_joules - self.start_energy else: # Otherwise, integrate power over time elapsed_time = time.time() - self.start_time # Use average of start and current power avg_power = (self.last_metrics.power_watts + current_metrics.power_watts) / 2.0 return avg_power * elapsed_time def get_average_power(self) -> float: """ Get average power consumption since start_monitoring(). Returns: Average power in Watts """ if self.start_time is None: raise RuntimeError("Must call start_monitoring() first") elapsed_time = time.time() - self.start_time if elapsed_time == 0: return 0.0 energy = self.get_energy_consumed() return energy / elapsed_time class NVIDIAMonitor(GPUMonitor): """NVIDIA GPU monitor using pynvml.""" def __init__(self, device_id: int = 0): """Initialize NVIDIA monitor.""" try: import pynvml self.pynvml = pynvml except ImportError: raise ImportError( "pynvml not found. 


class NVIDIAMonitor(GPUMonitor):
    """NVIDIA GPU monitor using pynvml."""

    def __init__(self, device_id: int = 0):
        """Initialize NVIDIA monitor."""
        try:
            import pynvml
            self.pynvml = pynvml
        except ImportError:
            raise ImportError(
                "pynvml not found. Install with: pip install pynvml"
            )

        try:
            self.pynvml.nvmlInit()
            self.handle = self.pynvml.nvmlDeviceGetHandleByIndex(device_id)
        except Exception as e:
            raise RuntimeError(f"Failed to initialize NVIDIA GPU {device_id}: {e}")

        super().__init__(device_id)

    def get_metrics(self) -> GPUMetrics:
        """Get current NVIDIA GPU metrics."""
        try:
            # Power (reported in milliwatts)
            power_mw = self.pynvml.nvmlDeviceGetPowerUsage(self.handle)
            power_watts = power_mw / 1000.0

            # Utilization
            util = self.pynvml.nvmlDeviceGetUtilizationRates(self.handle)
            gpu_util = util.gpu

            # Memory
            mem_info = self.pynvml.nvmlDeviceGetMemoryInfo(self.handle)
            memory_used_gb = mem_info.used / (1024**3)
            memory_total_gb = mem_info.total / (1024**3)

            # Temperature
            try:
                temp = self.pynvml.nvmlDeviceGetTemperature(
                    self.handle, self.pynvml.NVML_TEMPERATURE_GPU
                )
            except Exception:
                temp = None

            # Try to get cumulative energy (newer GPUs); reported in millijoules
            energy_joules = None
            try:
                energy_mj = self.pynvml.nvmlDeviceGetTotalEnergyConsumption(self.handle)
                energy_joules = energy_mj / 1000.0
            except Exception:
                # Not supported on this GPU; will fall back to power integration
                pass

            return GPUMetrics(
                timestamp=time.time(),
                power_watts=power_watts,
                gpu_utilization_percent=gpu_util,
                memory_used_gb=memory_used_gb,
                memory_total_gb=memory_total_gb,
                temperature_celsius=temp,
                energy_joules=energy_joules
            )

        except Exception as e:
            raise RuntimeError(f"Failed to get NVIDIA GPU metrics: {e}")

    def get_device_name(self) -> str:
        """Get NVIDIA GPU device name."""
        try:
            name = self.pynvml.nvmlDeviceGetName(self.handle)
            if isinstance(name, bytes):
                name = name.decode('utf-8')
            return name
        except Exception:
            return f"NVIDIA GPU {self.device_id}"

    def cleanup(self):
        """Cleanup NVIDIA resources."""
        try:
            self.pynvml.nvmlShutdown()
        except Exception:
            pass


class AMDMonitor(GPUMonitor):
    """AMD GPU monitor using the rocm-smi command line tool."""

    def __init__(self, device_id: int = 0):
        """Initialize AMD monitor."""
        import subprocess
        import shutil

        # Check if rocm-smi is available
        if shutil.which('rocm-smi') is None:
            raise RuntimeError(
                "rocm-smi command not found. Make sure ROCm is installed and in PATH."
            )

        self.device_id = device_id

        # Verify device exists
        try:
            result = subprocess.run(
                ['rocm-smi', '--showid'],
                capture_output=True, text=True, timeout=5
            )
            if result.returncode != 0:
                raise RuntimeError(f"rocm-smi failed: {result.stderr}")
        except subprocess.TimeoutExpired:
            raise RuntimeError("rocm-smi command timed out")
        except Exception as e:
            raise RuntimeError(f"Failed to initialize AMD GPU {device_id}: {e}")

        super().__init__(device_id)

    def _parse_detailed_output(self, output: str) -> dict:
        """Parse rocm-smi detailed output format."""
        lines = output.strip().split('\n')

        # Parse detailed format: GPU[X] : Metric : Value
        metrics = {
            'temperature': None,
            'power': None,
            'vram_percent': None,
            'gpu_percent': None,
        }

        device_prefix = f"GPU[{self.device_id}]"

        for line in lines:
            if not line.strip() or not line.startswith(device_prefix):
                continue

            # Split by colon
            parts = line.split(':')
            if len(parts) < 3:
                continue

            metric_name = parts[1].strip().lower()
            value_str = parts[2].strip()

            try:
                # Temperature (Sensor junction)
                if 'temperature' in metric_name and 'junction' in metric_name:
                    metrics['temperature'] = float(value_str)
                # Power consumption
                elif 'power' in metric_name and 'package' in metric_name:
                    metrics['power'] = float(value_str)
                # GPU utilization
                elif 'gpu use' in metric_name:
                    metrics['gpu_percent'] = float(value_str)
                # VRAM usage percentage
                elif 'memory allocated' in metric_name and 'vram%' in metric_name:
                    metrics['vram_percent'] = float(value_str)
            except (ValueError, IndexError):
                continue

        # Validate we got the required metrics
        if metrics['temperature'] is None:
            raise ValueError(f"Could not find temperature for GPU[{self.device_id}]")
        if metrics['power'] is None:
            raise ValueError(f"Could not find power for GPU[{self.device_id}]")
        if metrics['gpu_percent'] is None:
            metrics['gpu_percent'] = 0.0
        if metrics['vram_percent'] is None:
            metrics['vram_percent'] = 0.0

        return metrics

    def _get_memory_info(self) -> tuple:
        """Get memory usage in GB using rocm-smi --showmeminfo."""
        import subprocess

        try:
            result = subprocess.run(
                ['rocm-smi', '--showmeminfo', 'vram', '-d', str(self.device_id)],
                capture_output=True, text=True, timeout=5
            )

            if result.returncode != 0:
                return 0.0, 0.0

            # Parse output for memory info
            # Looking for lines like "GPU memory used: X MiB" and "GPU memory total: Y MiB"
            used_gb = 0.0
            total_gb = 0.0

            for line in result.stdout.split('\n'):
                if 'Used' in line or 'used' in line:
                    # Extract number
                    parts = line.split()
                    for i, part in enumerate(parts):
                        if part.replace('.', '').isdigit():
                            used_bytes = float(part)
                            # Check if next part indicates unit
                            if i + 1 < len(parts):
                                unit = parts[i + 1].lower()
                                if 'mb' in unit or 'mib' in unit:
                                    used_gb = used_bytes / 1024
                                elif 'gb' in unit or 'gib' in unit:
                                    used_gb = used_bytes
                                elif 'kb' in unit or 'kib' in unit:
                                    used_gb = used_bytes / (1024 * 1024)
                            break
                if 'Total' in line or 'total' in line:
                    parts = line.split()
                    for i, part in enumerate(parts):
                        if part.replace('.', '').isdigit():
                            total_bytes = float(part)
                            if i + 1 < len(parts):
                                unit = parts[i + 1].lower()
                                if 'mb' in unit or 'mib' in unit:
                                    total_gb = total_bytes / 1024
                                elif 'gb' in unit or 'gib' in unit:
                                    total_gb = total_bytes
                                elif 'kb' in unit or 'kib' in unit:
                                    total_gb = total_bytes / (1024 * 1024)
                            break

            return used_gb, total_gb

        except Exception:
            return 0.0, 0.0

    def get_metrics(self) -> GPUMetrics:
        """Get current AMD GPU metrics."""
        import subprocess

        try:
            # Get main metrics from concise output
            result = subprocess.run(
                ['rocm-smi', '--showid', '--showtemp', '--showpower',
                 '--showuse', '--showmemuse'],
                capture_output=True, text=True, timeout=5
            )

            if result.returncode != 0:
                raise RuntimeError(f"rocm-smi failed: {result.stderr}")

            metrics = self._parse_detailed_output(result.stdout)

            # Get detailed memory info
            memory_used_gb, memory_total_gb = self._get_memory_info()

            # If we couldn't get absolute memory, estimate from percentage
            if memory_total_gb == 0.0:
                # MI300X has ~192GB, MI250X has ~128GB - use a reasonable default
                memory_total_gb = 192.0  # Assume MI300X
                memory_used_gb = memory_total_gb * (metrics['vram_percent'] / 100.0)

            return GPUMetrics(
                timestamp=time.time(),
                power_watts=metrics['power'],
                gpu_utilization_percent=metrics['gpu_percent'],
                memory_used_gb=memory_used_gb,
                memory_total_gb=memory_total_gb,
                temperature_celsius=metrics['temperature'],
                energy_joules=None  # Will use power integration
            )

        except subprocess.TimeoutExpired:
            raise RuntimeError("rocm-smi command timed out")
        except Exception as e:
            raise RuntimeError(f"Failed to get AMD GPU metrics: {e}")

    def get_device_name(self) -> str:
        """Get AMD GPU device name."""
        import subprocess

        try:
            result = subprocess.run(
                ['rocm-smi', '--showproductname', '-d', str(self.device_id)],
                capture_output=True, text=True, timeout=5
            )
            if result.returncode == 0:
                # Parse output to find device name
                for line in result.stdout.split('\n'):
                    if 'Card series' in line or 'Card model' in line or 'name' in line.lower():
                        parts = line.split(':')
                        if len(parts) > 1:
                            return parts[1].strip()
        except Exception:
            pass

        return f"AMD GPU {self.device_id}"

    def cleanup(self):
        """Cleanup AMD resources."""
        # No cleanup needed for command-line tool
        pass


def get_gpu_monitor(device_id: int = 0) -> GPUMonitor:
    """
    Factory function to automatically detect and create the appropriate GPU monitor.

    Args:
        device_id: GPU device ID to monitor

    Returns:
        GPUMonitor instance (NVIDIAMonitor or AMDMonitor)

    Raises:
        RuntimeError: If no supported GPU is found
    """
    # Try AMD first (rocm-smi based) as it's more commonly available
    try:
        return AMDMonitor(device_id)
    except Exception:
        pass

    # Try NVIDIA if AMD fails
    try:
        return NVIDIAMonitor(device_id)
    except Exception:
        pass

    # As a last resort, try to import torch to detect the GPU type
    try:
        import torch
        if torch.cuda.is_available():
            # Check if it's NVIDIA or AMD
            device_name = torch.cuda.get_device_name(device_id).lower()
            if 'nvidia' in device_name or 'tesla' in device_name or 'geforce' in device_name:
                return NVIDIAMonitor(device_id)
            elif 'amd' in device_name or 'radeon' in device_name or 'mi300' in device_name or 'mi200' in device_name:
                return AMDMonitor(device_id)
    except Exception:
        pass

    raise RuntimeError(
        "No supported GPU found. Make sure either ROCm (rocm-smi) or NVIDIA (pynvml) drivers are installed."
    )
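

# Typical usage sketch (illustrative, mirroring the self-test at the bottom of
# this file): create a monitor via the factory, wrap the benchmark workload
# between start_monitoring() and the energy/power readings, and always release
# driver resources in a finally block.
#
#     monitor = get_gpu_monitor(device_id=0)
#     try:
#         monitor.start_monitoring()
#         ...  # run the LLM benchmark workload here
#         print(f"{monitor.get_energy_consumed():.1f} J, "
#               f"{monitor.get_average_power():.1f} W avg")
#     finally:
#         monitor.cleanup()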


def list_available_gpus() -> List[str]:
    """
    List all available GPUs.

    Returns:
        List of GPU names
    """
    gpus = []

    # Try NVIDIA
    try:
        import pynvml
        pynvml.nvmlInit()
        device_count = pynvml.nvmlDeviceGetCount()
        for i in range(device_count):
            handle = pynvml.nvmlDeviceGetHandleByIndex(i)
            name = pynvml.nvmlDeviceGetName(handle)
            if isinstance(name, bytes):
                name = name.decode('utf-8')
            gpus.append(f"GPU {i}: {name} (NVIDIA)")
        pynvml.nvmlShutdown()
    except Exception:
        pass

    # Try AMD with rocm-smi
    try:
        import subprocess
        import shutil

        if shutil.which('rocm-smi'):
            result = subprocess.run(
                ['rocm-smi', '--showid'],
                capture_output=True, text=True, timeout=5
            )
            if result.returncode == 0:
                # Parse device IDs from output
                for line in result.stdout.split('\n'):
                    if not line.strip() or line.startswith('=') or 'Device' in line or 'ROCm' in line:
                        continue
                    parts = line.split()
                    if parts and parts[0].isdigit():
                        device_id = int(parts[0])
                        # Try to get the device name
                        name_result = subprocess.run(
                            ['rocm-smi', '--showproductname', '-d', str(device_id)],
                            capture_output=True, text=True, timeout=5
                        )
                        name = "AMD GPU"
                        if name_result.returncode == 0:
                            for name_line in name_result.stdout.split('\n'):
                                if 'Card' in name_line or 'name' in name_line.lower():
                                    parts_name = name_line.split(':')
                                    if len(parts_name) > 1:
                                        name = parts_name[1].strip()
                                        break
                        gpus.append(f"GPU {device_id}: {name} (AMD)")
    except Exception:
        pass

    return gpus


if __name__ == "__main__":
    # Basic smoke test for GPU monitoring.
    print("=" * 60)
    print("GPU Monitoring Test")
    print("=" * 60)

    # List available GPUs
    print("\nAvailable GPUs:")
    gpus = list_available_gpus()
    if not gpus:
        print("  No GPUs found!")
        exit(1)
    for gpu in gpus:
        print(f"  {gpu}")

    # Test monitoring
    print("\nTesting GPU 0 monitoring...")
    try:
        monitor = get_gpu_monitor(0)
        print(f"  Device: {monitor.get_device_name()}")

        # Get metrics
        metrics = monitor.get_metrics()
        print("\nCurrent Metrics:")
        print(f"  Power: {metrics.power_watts:.2f} W")
        print(f"  GPU Utilization: {metrics.gpu_utilization_percent:.1f}%")
        print(f"  Memory: {metrics.memory_used_gb:.2f} / {metrics.memory_total_gb:.2f} GB")
        if metrics.temperature_celsius is not None:
            print(f"  Temperature: {metrics.temperature_celsius:.1f}°C")

        # Test energy monitoring
        print("\nTesting energy monitoring (5 seconds)...")
        monitor.start_monitoring()
        time.sleep(5)
        energy = monitor.get_energy_consumed()
        avg_power = monitor.get_average_power()
        print(f"  Energy consumed: {energy:.2f} J")
        print(f"  Average power: {avg_power:.2f} W")

        monitor.cleanup()
        print("\n✓ Monitoring test successful!")

    except Exception as e:
        print(f"\n✗ Error: {e}")
        exit(1)