Initial commit
utils/gpu_monitor.py (new file, 562 lines)
@@ -0,0 +1,562 @@
"""
GPU Monitoring Infrastructure for LLM Benchmarking

Provides a unified interface for monitoring both NVIDIA and AMD GPUs.
"""

import sys
import time
from abc import ABC, abstractmethod
from dataclasses import dataclass
from typing import List, Optional

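# Illustrative end-to-end usage of this module (a sketch, not part of the
# original examples; assumes a supported GPU is visible and uses the factory
# get_gpu_monitor() defined below):
#
#     monitor = get_gpu_monitor(device_id=0)
#     monitor.start_monitoring()
#     run_inference_workload()   # hypothetical benchmark function
#     print(f"{monitor.get_energy_consumed():.1f} J "
#           f"at {monitor.get_average_power():.1f} W average")
#     monitor.cleanup()
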
@dataclass
class GPUMetrics:
    """Container for GPU metrics."""
    timestamp: float
    power_watts: float
    gpu_utilization_percent: float
    memory_used_gb: float
    memory_total_gb: float
    temperature_celsius: Optional[float] = None
    energy_joules: Optional[float] = None  # Cumulative energy

class GPUMonitor(ABC):
    """Abstract base class for GPU monitoring."""

    def __init__(self, device_id: int = 0):
        """
        Initialize GPU monitor.

        Args:
            device_id: GPU device ID to monitor
        """
        self.device_id = device_id
        self.start_time = None
        self.start_energy = None
        self.last_metrics = None

    @abstractmethod
    def get_metrics(self) -> GPUMetrics:
        """Get current GPU metrics."""
        pass

    @abstractmethod
    def get_device_name(self) -> str:
        """Get GPU device name."""
        pass

    @abstractmethod
    def cleanup(self):
        """Cleanup resources."""
        pass

    def start_monitoring(self):
        """Start energy monitoring session."""
        self.start_time = time.time()
        metrics = self.get_metrics()
        self.start_energy = metrics.energy_joules if metrics.energy_joules is not None else 0.0
        self.last_metrics = metrics

    def get_energy_consumed(self) -> float:
        """
        Get energy consumed since start_monitoring() was called.

        Returns:
            Energy in Joules
        """
        if self.start_time is None:
            raise RuntimeError("Must call start_monitoring() first")

        current_metrics = self.get_metrics()

        if current_metrics.energy_joules is not None:
            # If GPU provides cumulative energy, use it
            return current_metrics.energy_joules - self.start_energy
        else:
            # Otherwise, integrate power over time
            elapsed_time = time.time() - self.start_time
            # Use average of start and current power
            avg_power = (self.last_metrics.power_watts + current_metrics.power_watts) / 2.0
            return avg_power * elapsed_time

    def get_average_power(self) -> float:
        """
        Get average power consumption since start_monitoring().

        Returns:
            Average power in Watts
        """
        if self.start_time is None:
            raise RuntimeError("Must call start_monitoring() first")

        elapsed_time = time.time() - self.start_time
        if elapsed_time == 0:
            return 0.0

        energy = self.get_energy_consumed()
        return energy / elapsed_time

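# Worked example of the fallback estimate in get_energy_consumed() above
# (illustrative numbers, not measurements): with 280 W at start_monitoring()
# and 320 W at the time of the call, after 10 s the estimate is
# ((280 + 320) / 2) * 10 = 3000 J, i.e. the average of the two power
# readings multiplied by the elapsed time.
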
class NVIDIAMonitor(GPUMonitor):
    """NVIDIA GPU monitor using pynvml."""

    def __init__(self, device_id: int = 0):
        """Initialize NVIDIA monitor."""
        try:
            import pynvml
            self.pynvml = pynvml
        except ImportError:
            raise ImportError(
                "pynvml not found. Install with: pip install pynvml"
            )

        try:
            self.pynvml.nvmlInit()
            self.handle = self.pynvml.nvmlDeviceGetHandleByIndex(device_id)
        except Exception as e:
            raise RuntimeError(f"Failed to initialize NVIDIA GPU {device_id}: {e}")

        super().__init__(device_id)

    def get_metrics(self) -> GPUMetrics:
        """Get current NVIDIA GPU metrics."""
        try:
            # Power (reported in milliwatts)
            power_mw = self.pynvml.nvmlDeviceGetPowerUsage(self.handle)
            power_watts = power_mw / 1000.0

            # Utilization
            util = self.pynvml.nvmlDeviceGetUtilizationRates(self.handle)
            gpu_util = util.gpu

            # Memory
            mem_info = self.pynvml.nvmlDeviceGetMemoryInfo(self.handle)
            memory_used_gb = mem_info.used / (1024**3)
            memory_total_gb = mem_info.total / (1024**3)

            # Temperature
            try:
                temp = self.pynvml.nvmlDeviceGetTemperature(
                    self.handle,
                    self.pynvml.NVML_TEMPERATURE_GPU
                )
            except Exception:
                temp = None

            # Try to get cumulative energy (newer GPUs)
            energy_joules = None
            try:
                energy_mj = self.pynvml.nvmlDeviceGetTotalEnergyConsumption(self.handle)
                energy_joules = energy_mj / 1000.0  # reported in millijoules
            except Exception:
                # Not supported on this GPU; fall back to power integration
                pass

            return GPUMetrics(
                timestamp=time.time(),
                power_watts=power_watts,
                gpu_utilization_percent=gpu_util,
                memory_used_gb=memory_used_gb,
                memory_total_gb=memory_total_gb,
                temperature_celsius=temp,
                energy_joules=energy_joules
            )
        except Exception as e:
            raise RuntimeError(f"Failed to get NVIDIA GPU metrics: {e}")

    def get_device_name(self) -> str:
        """Get NVIDIA GPU device name."""
        try:
            name = self.pynvml.nvmlDeviceGetName(self.handle)
            if isinstance(name, bytes):
                name = name.decode('utf-8')
            return name
        except Exception:
            return f"NVIDIA GPU {self.device_id}"

    def cleanup(self):
        """Cleanup NVIDIA resources."""
        try:
            self.pynvml.nvmlShutdown()
        except Exception:
            pass

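# Direct use of NVIDIAMonitor (illustrative sketch; assumes pynvml is installed
# and an NVIDIA GPU is visible at index 0):
#
#     nv = NVIDIAMonitor(device_id=0)
#     m = nv.get_metrics()
#     print(f"{m.power_watts:.1f} W, {m.memory_used_gb:.1f}/{m.memory_total_gb:.1f} GB")
#     nv.cleanup()
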
class AMDMonitor(GPUMonitor):
    """AMD GPU monitor using rocm-smi command line tool."""

    def __init__(self, device_id: int = 0):
        """Initialize AMD monitor."""
        import subprocess
        import shutil

        # Check if rocm-smi is available
        if shutil.which('rocm-smi') is None:
            raise RuntimeError("rocm-smi command not found. Make sure ROCm is installed and in PATH.")

        self.device_id = device_id

        # Verify device exists
        try:
            result = subprocess.run(
                ['rocm-smi', '--showid'],
                capture_output=True,
                text=True,
                timeout=5
            )
            if result.returncode != 0:
                raise RuntimeError(f"rocm-smi failed: {result.stderr}")
        except subprocess.TimeoutExpired:
            raise RuntimeError("rocm-smi command timed out")
        except Exception as e:
            raise RuntimeError(f"Failed to initialize AMD GPU {device_id}: {e}")

        super().__init__(device_id)

    def _parse_detailed_output(self, output: str) -> dict:
        """Parse rocm-smi detailed output format."""
        lines = output.strip().split('\n')

        # Parse detailed format: GPU[X] : Metric : Value
        metrics = {
            'temperature': None,
            'power': None,
            'vram_percent': None,
            'gpu_percent': None,
        }

        device_prefix = f"GPU[{self.device_id}]"

        for line in lines:
            if not line.strip() or not line.startswith(device_prefix):
                continue

            # Split by colon
            parts = line.split(':')
            if len(parts) < 3:
                continue

            metric_name = parts[1].strip().lower()
            value_str = parts[2].strip()

            try:
                # Temperature (Sensor junction)
                if 'temperature' in metric_name and 'junction' in metric_name:
                    metrics['temperature'] = float(value_str)

                # Power consumption
                elif 'power' in metric_name and 'package' in metric_name:
                    metrics['power'] = float(value_str)

                # GPU utilization
                elif 'gpu use' in metric_name:
                    metrics['gpu_percent'] = float(value_str)

                # VRAM usage percentage
                elif 'memory allocated' in metric_name and 'vram%' in metric_name:
                    metrics['vram_percent'] = float(value_str)

            except (ValueError, IndexError):
                continue

        # Validate we got the required metrics
        if metrics['temperature'] is None:
            raise ValueError(f"Could not find temperature for GPU[{self.device_id}]")
        if metrics['power'] is None:
            raise ValueError(f"Could not find power for GPU[{self.device_id}]")
        if metrics['gpu_percent'] is None:
            metrics['gpu_percent'] = 0.0
        if metrics['vram_percent'] is None:
            metrics['vram_percent'] = 0.0

        return metrics

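    # Illustrative shape of the rocm-smi lines the parser above matches (the
    # exact wording varies across ROCm versions, hence the fuzzy substring checks):
    #   GPU[0]    : Temperature (Sensor junction) (C): 47.0
    #   GPU[0]    : Average Graphics Package Power (W): 131.0
    #   GPU[0]    : GPU use (%): 12
    #   GPU[0]    : GPU Memory Allocated (VRAM%): 4
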
    def _get_memory_info(self) -> tuple:
        """Get memory usage in GB using rocm-smi --showmeminfo."""
        import subprocess

        try:
            result = subprocess.run(
                ['rocm-smi', '--showmeminfo', 'vram', '-d', str(self.device_id)],
                capture_output=True,
                text=True,
                timeout=5
            )

            if result.returncode != 0:
                return 0.0, 0.0

            # Parse output for memory info
            # Looking for lines like "GPU memory used: X MiB" and "GPU memory total: Y MiB"
            used_gb = 0.0
            total_gb = 0.0

            for line in result.stdout.split('\n'):
                if 'Used' in line or 'used' in line:
                    # Extract the numeric value and convert based on its unit
                    parts = line.split()
                    for i, part in enumerate(parts):
                        if part.replace('.', '').isdigit():
                            used_value = float(part)
                            # Check if the next token indicates the unit
                            if i + 1 < len(parts):
                                unit = parts[i + 1].lower()
                                if 'mb' in unit or 'mib' in unit:
                                    used_gb = used_value / 1024
                                elif 'gb' in unit or 'gib' in unit:
                                    used_gb = used_value
                                elif 'kb' in unit or 'kib' in unit:
                                    used_gb = used_value / (1024 * 1024)
                            break

                if 'Total' in line or 'total' in line:
                    parts = line.split()
                    for i, part in enumerate(parts):
                        if part.replace('.', '').isdigit():
                            total_value = float(part)
                            if i + 1 < len(parts):
                                unit = parts[i + 1].lower()
                                if 'mb' in unit or 'mib' in unit:
                                    total_gb = total_value / 1024
                                elif 'gb' in unit or 'gib' in unit:
                                    total_gb = total_value
                                elif 'kb' in unit or 'kib' in unit:
                                    total_gb = total_value / (1024 * 1024)
                            break

            return used_gb, total_gb

        except Exception:
            return 0.0, 0.0

    def get_metrics(self) -> GPUMetrics:
        """Get current AMD GPU metrics."""
        import subprocess

        try:
            # Get main metrics from concise output
            result = subprocess.run(
                ['rocm-smi', '--showid', '--showtemp', '--showpower', '--showuse', '--showmemuse'],
                capture_output=True,
                text=True,
                timeout=5
            )

            if result.returncode != 0:
                raise RuntimeError(f"rocm-smi failed: {result.stderr}")

            metrics = self._parse_detailed_output(result.stdout)

            # Get detailed memory info
            memory_used_gb, memory_total_gb = self._get_memory_info()

            # If we couldn't get absolute memory, estimate from percentage
            if memory_total_gb == 0.0:
                # MI300X has ~192GB, MI250X has ~128GB - use a reasonable default
                memory_total_gb = 192.0  # Assume MI300X
                memory_used_gb = memory_total_gb * (metrics['vram_percent'] / 100.0)

            return GPUMetrics(
                timestamp=time.time(),
                power_watts=metrics['power'],
                gpu_utilization_percent=metrics['gpu_percent'],
                memory_used_gb=memory_used_gb,
                memory_total_gb=memory_total_gb,
                temperature_celsius=metrics['temperature'],
                energy_joules=None  # Will use power integration
            )

        except subprocess.TimeoutExpired:
            raise RuntimeError("rocm-smi command timed out")
        except Exception as e:
            raise RuntimeError(f"Failed to get AMD GPU metrics: {e}")

    def get_device_name(self) -> str:
        """Get AMD GPU device name."""
        import subprocess

        try:
            result = subprocess.run(
                ['rocm-smi', '--showproductname', '-d', str(self.device_id)],
                capture_output=True,
                text=True,
                timeout=5
            )

            if result.returncode == 0:
                # Parse output to find device name
                for line in result.stdout.split('\n'):
                    if 'Card series' in line or 'Card model' in line or 'name' in line.lower():
                        parts = line.split(':')
                        if len(parts) > 1:
                            return parts[1].strip()
        except Exception:
            pass

        return f"AMD GPU {self.device_id}"

    def cleanup(self):
        """Cleanup AMD resources."""
        # No cleanup needed for command-line tool
        pass

def get_gpu_monitor(device_id: int = 0) -> GPUMonitor:
    """
    Factory function to automatically detect and create the appropriate GPU monitor.

    Args:
        device_id: GPU device ID to monitor

    Returns:
        GPUMonitor instance (NVIDIAMonitor or AMDMonitor)

    Raises:
        RuntimeError: If no supported GPU is found
    """
    # Try AMD first (rocm-smi based) as it's more commonly available
    try:
        return AMDMonitor(device_id)
    except Exception:
        pass

    # Try NVIDIA if AMD fails
    try:
        return NVIDIAMonitor(device_id)
    except Exception:
        pass

    # Try to import torch to detect GPU type as a last resort
    try:
        import torch
        if torch.cuda.is_available():
            # Check if it's NVIDIA or AMD
            device_name = torch.cuda.get_device_name(device_id).lower()

            if 'nvidia' in device_name or 'tesla' in device_name or 'geforce' in device_name:
                return NVIDIAMonitor(device_id)
            elif 'amd' in device_name or 'radeon' in device_name or 'mi300' in device_name or 'mi200' in device_name:
                return AMDMonitor(device_id)
    except Exception:
        pass

    raise RuntimeError(
        "No supported GPU found. Make sure either ROCm (rocm-smi) or NVIDIA (pynvml) drivers are installed."
    )

def list_available_gpus() -> List[str]:
    """
    List all available GPUs.

    Returns:
        List of GPU names
    """
    gpus = []

    # Try NVIDIA
    try:
        import pynvml
        pynvml.nvmlInit()
        device_count = pynvml.nvmlDeviceGetCount()
        for i in range(device_count):
            handle = pynvml.nvmlDeviceGetHandleByIndex(i)
            name = pynvml.nvmlDeviceGetName(handle)
            if isinstance(name, bytes):
                name = name.decode('utf-8')
            gpus.append(f"GPU {i}: {name} (NVIDIA)")
        pynvml.nvmlShutdown()
    except Exception:
        pass

    # Try AMD with rocm-smi
    try:
        import subprocess
        import shutil

        if shutil.which('rocm-smi'):
            result = subprocess.run(
                ['rocm-smi', '--showid'],
                capture_output=True,
                text=True,
                timeout=5
            )
            if result.returncode == 0:
                # Parse device IDs from output
                for line in result.stdout.split('\n'):
                    if not line.strip() or line.startswith('=') or 'Device' in line or 'ROCm' in line:
                        continue
                    parts = line.split()
                    if parts and parts[0].isdigit():
                        device_id = int(parts[0])
                        # Try to get device name
                        name_result = subprocess.run(
                            ['rocm-smi', '--showproductname', '-d', str(device_id)],
                            capture_output=True,
                            text=True,
                            timeout=5
                        )
                        name = "AMD GPU"
                        if name_result.returncode == 0:
                            for name_line in name_result.stdout.split('\n'):
                                if 'Card' in name_line or 'name' in name_line.lower():
                                    parts_name = name_line.split(':')
                                    if len(parts_name) > 1:
                                        name = parts_name[1].strip()
                                        break
                        gpus.append(f"GPU {device_id}: {name} (AMD)")
    except Exception:
        pass

    return gpus

if __name__ == "__main__":
|
||||
"""Test GPU monitoring."""
|
||||
print("=" * 60)
|
||||
print("GPU Monitoring Test")
|
||||
print("=" * 60)
|
||||
|
||||
# List available GPUs
|
||||
print("\nAvailable GPUs:")
|
||||
gpus = list_available_gpus()
|
||||
if not gpus:
|
||||
print(" No GPUs found!")
|
||||
exit(1)
|
||||
|
||||
for gpu in gpus:
|
||||
print(f" {gpu}")
|
||||
|
||||
# Test monitoring
|
||||
print("\nTesting GPU 0 monitoring...")
|
||||
try:
|
||||
monitor = get_gpu_monitor(0)
|
||||
print(f" Device: {monitor.get_device_name()}")
|
||||
|
||||
# Get metrics
|
||||
metrics = monitor.get_metrics()
|
||||
print(f"\nCurrent Metrics:")
|
||||
print(f" Power: {metrics.power_watts:.2f} W")
|
||||
print(f" GPU Utilization: {metrics.gpu_utilization_percent:.1f}%")
|
||||
print(f" Memory: {metrics.memory_used_gb:.2f} / {metrics.memory_total_gb:.2f} GB")
|
||||
if metrics.temperature_celsius:
|
||||
print(f" Temperature: {metrics.temperature_celsius:.1f}°C")
|
||||
|
||||
# Test energy monitoring
|
||||
print("\nTesting energy monitoring (5 seconds)...")
|
||||
monitor.start_monitoring()
|
||||
time.sleep(5)
|
||||
energy = monitor.get_energy_consumed()
|
||||
avg_power = monitor.get_average_power()
|
||||
print(f" Energy consumed: {energy:.2f} J")
|
||||
print(f" Average power: {avg_power:.2f} W")
|
||||
|
||||
monitor.cleanup()
|
||||
print("\n✓ Monitoring test successful!")
|
||||
|
||||
except Exception as e:
|
||||
print(f"\n✗ Error: {e}")
|
||||
exit(1)
|
||||