"""
GPU Monitoring Infrastructure for LLM Benchmarking
Provides unified interface for monitoring both NVIDIA and AMD GPUs.
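
Typical usage (mirrors the self-test at the bottom of this file):

    monitor = get_gpu_monitor(device_id=0)  # auto-detects NVIDIA vs. AMD
    monitor.start_monitoring()
    # ... run the workload to be measured ...
    energy_joules = monitor.get_energy_consumed()
    avg_watts = monitor.get_average_power()
    monitor.cleanup()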
"""
import sys
import time
from abc import ABC, abstractmethod
from dataclasses import dataclass
from typing import Optional, List
@dataclass
class GPUMetrics:
"""Container for GPU metrics."""
timestamp: float
power_watts: float
gpu_utilization_percent: float
memory_used_gb: float
memory_total_gb: float
temperature_celsius: Optional[float] = None
energy_joules: Optional[float] = None # Cumulative energy
class GPUMonitor(ABC):
"""Abstract base class for GPU monitoring."""
def __init__(self, device_id: int = 0):
"""
Initialize GPU monitor.
Args:
device_id: GPU device ID to monitor
"""
self.device_id = device_id
self.start_time = None
self.start_energy = None
self.last_metrics = None
@abstractmethod
def get_metrics(self) -> GPUMetrics:
"""Get current GPU metrics."""
pass
@abstractmethod
def get_device_name(self) -> str:
"""Get GPU device name."""
pass
@abstractmethod
def cleanup(self):
"""Cleanup resources."""
pass
def start_monitoring(self):
"""Start energy monitoring session."""
self.start_time = time.time()
metrics = self.get_metrics()
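        # Baseline for the cumulative energy counter; defaulting to 0.0 keeps the
        # power-integration fallback in get_energy_consumed() working.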
self.start_energy = metrics.energy_joules if metrics.energy_joules is not None else 0.0
self.last_metrics = metrics
def get_energy_consumed(self) -> float:
"""
Get energy consumed since start_monitoring() was called.
Returns:
Energy in Joules
"""
if self.start_time is None:
raise RuntimeError("Must call start_monitoring() first")
current_metrics = self.get_metrics()
if current_metrics.energy_joules is not None:
# If GPU provides cumulative energy, use it
return current_metrics.energy_joules - self.start_energy
else:
# Otherwise, integrate power over time
elapsed_time = time.time() - self.start_time
# Use average of start and current power
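            # (a single trapezoid between the first and latest sample; coarse if
            # power varies substantially mid-run)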
avg_power = (self.last_metrics.power_watts + current_metrics.power_watts) / 2.0
return avg_power * elapsed_time
def get_average_power(self) -> float:
"""
Get average power consumption since start_monitoring().
Returns:
Average power in Watts
"""
if self.start_time is None:
raise RuntimeError("Must call start_monitoring() first")
elapsed_time = time.time() - self.start_time
        if elapsed_time <= 0:
return 0.0
energy = self.get_energy_consumed()
return energy / elapsed_time
class NVIDIAMonitor(GPUMonitor):
"""NVIDIA GPU monitor using pynvml."""
def __init__(self, device_id: int = 0):
"""Initialize NVIDIA monitor."""
try:
import pynvml
self.pynvml = pynvml
except ImportError:
raise ImportError(
"pynvml not found. Install with: pip install pynvml"
)
try:
self.pynvml.nvmlInit()
self.handle = self.pynvml.nvmlDeviceGetHandleByIndex(device_id)
except Exception as e:
raise RuntimeError(f"Failed to initialize NVIDIA GPU {device_id}: {e}")
super().__init__(device_id)
def get_metrics(self) -> GPUMetrics:
"""Get current NVIDIA GPU metrics."""
try:
# Power (in milliwatts)
power_mw = self.pynvml.nvmlDeviceGetPowerUsage(self.handle)
power_watts = power_mw / 1000.0
# Utilization
util = self.pynvml.nvmlDeviceGetUtilizationRates(self.handle)
gpu_util = util.gpu
# Memory
mem_info = self.pynvml.nvmlDeviceGetMemoryInfo(self.handle)
memory_used_gb = mem_info.used / (1024**3)
memory_total_gb = mem_info.total / (1024**3)
# Temperature
try:
temp = self.pynvml.nvmlDeviceGetTemperature(
self.handle,
self.pynvml.NVML_TEMPERATURE_GPU
)
            except Exception:
temp = None
# Try to get cumulative energy (newer GPUs)
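            # NVML's counter reports millijoules accumulated since the driver was
            # last loaded; it is only available on newer (Volta and later) parts.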
energy_joules = None
try:
energy_mj = self.pynvml.nvmlDeviceGetTotalEnergyConsumption(self.handle)
energy_joules = energy_mj / 1000.0
            except Exception:
# Not supported on this GPU, will use power integration
pass
return GPUMetrics(
timestamp=time.time(),
power_watts=power_watts,
gpu_utilization_percent=gpu_util,
memory_used_gb=memory_used_gb,
memory_total_gb=memory_total_gb,
temperature_celsius=temp,
energy_joules=energy_joules
)
except Exception as e:
raise RuntimeError(f"Failed to get NVIDIA GPU metrics: {e}")
def get_device_name(self) -> str:
"""Get NVIDIA GPU device name."""
try:
name = self.pynvml.nvmlDeviceGetName(self.handle)
if isinstance(name, bytes):
name = name.decode('utf-8')
return name
        except Exception:
return f"NVIDIA GPU {self.device_id}"
def cleanup(self):
"""Cleanup NVIDIA resources."""
try:
self.pynvml.nvmlShutdown()
        except Exception:
pass
class AMDMonitor(GPUMonitor):
"""AMD GPU monitor using rocm-smi command line tool."""
def __init__(self, device_id: int = 0):
"""Initialize AMD monitor."""
import subprocess
import shutil
# Check if rocm-smi is available
if shutil.which('rocm-smi') is None:
raise RuntimeError("rocm-smi command not found. Make sure ROCm is installed and in PATH.")
# Verify device exists
try:
result = subprocess.run(
['rocm-smi', '--showid'],
capture_output=True,
text=True,
timeout=5
)
if result.returncode != 0:
raise RuntimeError(f"rocm-smi failed: {result.stderr}")
except subprocess.TimeoutExpired:
raise RuntimeError("rocm-smi command timed out")
except Exception as e:
raise RuntimeError(f"Failed to initialize AMD GPU {device_id}: {e}")
super().__init__(device_id)
def _parse_detailed_output(self, output: str) -> dict:
"""Parse rocm-smi detailed output format."""
lines = output.strip().split('\n')
# Parse detailed format: GPU[X] : Metric : Value
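        # Example lines (shape assumed from the keyword matching below; exact
        # labels vary across ROCm versions):
        #   GPU[0]    : Temperature (Sensor junction) (C): 41.0
        #   GPU[0]    : Average Graphics Package Power (W): 96.0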
metrics = {
'temperature': None,
'power': None,
'vram_percent': None,
'gpu_percent': None,
}
device_prefix = f"GPU[{self.device_id}]"
for line in lines:
if not line.strip() or not line.startswith(device_prefix):
continue
# Split by colon
parts = line.split(':')
if len(parts) < 3:
continue
metric_name = parts[1].strip().lower()
value_str = parts[2].strip()
try:
# Temperature (Sensor junction)
if 'temperature' in metric_name and 'junction' in metric_name:
metrics['temperature'] = float(value_str)
# Power consumption
elif 'power' in metric_name and 'package' in metric_name:
metrics['power'] = float(value_str)
# GPU utilization
elif 'gpu use' in metric_name:
metrics['gpu_percent'] = float(value_str)
# VRAM usage percentage
elif 'memory allocated' in metric_name and 'vram%' in metric_name:
metrics['vram_percent'] = float(value_str)
except (ValueError, IndexError):
continue
# Validate we got the required metrics
if metrics['temperature'] is None:
raise ValueError(f"Could not find temperature for GPU[{self.device_id}]")
if metrics['power'] is None:
raise ValueError(f"Could not find power for GPU[{self.device_id}]")
if metrics['gpu_percent'] is None:
metrics['gpu_percent'] = 0.0
if metrics['vram_percent'] is None:
metrics['vram_percent'] = 0.0
return metrics
def _get_memory_info(self) -> tuple:
"""Get memory usage in GB using rocm-smi --showmeminfo."""
import subprocess
try:
result = subprocess.run(
['rocm-smi', '--showmeminfo', 'vram', '-d', str(self.device_id)],
capture_output=True,
text=True,
timeout=5
)
if result.returncode != 0:
return 0.0, 0.0
            # Parse output for memory info. Expected lines resemble
            # "GPU memory used: X MiB" / "GPU memory total: Y MiB", though the exact
            # wording and units vary by ROCm version, so the unit token after the
            # number is sniffed below.
used_gb = 0.0
total_gb = 0.0
for line in result.stdout.split('\n'):
if 'Used' in line or 'used' in line:
# Extract number
parts = line.split()
for i, part in enumerate(parts):
if part.replace('.', '').isdigit():
                            used_value = float(part)
                            # Check if next part indicates unit
                            if i + 1 < len(parts):
                                unit = parts[i + 1].lower()
                                if 'mb' in unit or 'mib' in unit:
                                    used_gb = used_value / 1024
                                elif 'gb' in unit or 'gib' in unit:
                                    used_gb = used_value
                                elif 'kb' in unit or 'kib' in unit:
                                    used_gb = used_value / (1024 * 1024)
break
if 'Total' in line or 'total' in line:
parts = line.split()
for i, part in enumerate(parts):
if part.replace('.', '').isdigit():
                            total_value = float(part)
                            if i + 1 < len(parts):
                                unit = parts[i + 1].lower()
                                if 'mb' in unit or 'mib' in unit:
                                    total_gb = total_value / 1024
                                elif 'gb' in unit or 'gib' in unit:
                                    total_gb = total_value
                                elif 'kb' in unit or 'kib' in unit:
                                    total_gb = total_value / (1024 * 1024)
break
return used_gb, total_gb
except Exception:
return 0.0, 0.0
def get_metrics(self) -> GPUMetrics:
"""Get current AMD GPU metrics."""
import subprocess
try:
# Get main metrics from concise output
result = subprocess.run(
['rocm-smi', '--showid', '--showtemp', '--showpower', '--showuse', '--showmemuse'],
capture_output=True,
text=True,
timeout=5
)
if result.returncode != 0:
raise RuntimeError(f"rocm-smi failed: {result.stderr}")
metrics = self._parse_detailed_output(result.stdout)
# Get detailed memory info
memory_used_gb, memory_total_gb = self._get_memory_info()
# If we couldn't get absolute memory, estimate from percentage
if memory_total_gb == 0.0:
# MI300X has ~192GB, MI250X has ~128GB - use a reasonable default
memory_total_gb = 192.0 # Assume MI300X
memory_used_gb = memory_total_gb * (metrics['vram_percent'] / 100.0)
return GPUMetrics(
timestamp=time.time(),
power_watts=metrics['power'],
gpu_utilization_percent=metrics['gpu_percent'],
memory_used_gb=memory_used_gb,
memory_total_gb=memory_total_gb,
temperature_celsius=metrics['temperature'],
energy_joules=None # Will use power integration
)
except subprocess.TimeoutExpired:
raise RuntimeError("rocm-smi command timed out")
except Exception as e:
raise RuntimeError(f"Failed to get AMD GPU metrics: {e}")
def get_device_name(self) -> str:
"""Get AMD GPU device name."""
import subprocess
try:
result = subprocess.run(
['rocm-smi', '--showproductname', '-d', str(self.device_id)],
capture_output=True,
text=True,
timeout=5
)
if result.returncode == 0:
# Parse output to find device name
for line in result.stdout.split('\n'):
if 'Card series' in line or 'Card model' in line or 'name' in line.lower():
parts = line.split(':')
if len(parts) > 1:
return parts[1].strip()
except Exception:
pass
return f"AMD GPU {self.device_id}"
def cleanup(self):
"""Cleanup AMD resources."""
# No cleanup needed for command-line tool
pass
def get_gpu_monitor(device_id: int = 0) -> GPUMonitor:
"""
Factory function to automatically detect and create appropriate GPU monitor.
Args:
device_id: GPU device ID to monitor
Returns:
GPUMonitor instance (NVIDIAMonitor or AMDMonitor)
Raises:
RuntimeError: If no supported GPU is found
"""
    # Try AMD first (rocm-smi based); its shutil.which() probe fails fast when ROCm is absent
try:
return AMDMonitor(device_id)
    except Exception:
pass
# Try NVIDIA if AMD fails
try:
return NVIDIAMonitor(device_id)
    except Exception:
pass
# Try to import torch to detect GPU type as last resort
try:
import torch
if torch.cuda.is_available():
# Check if it's NVIDIA or AMD
device_name = torch.cuda.get_device_name(device_id).lower()
if 'nvidia' in device_name or 'tesla' in device_name or 'geforce' in device_name:
return NVIDIAMonitor(device_id)
elif 'amd' in device_name or 'radeon' in device_name or 'mi300' in device_name or 'mi200' in device_name:
return AMDMonitor(device_id)
    except Exception:
pass
raise RuntimeError(
"No supported GPU found. Make sure either ROCm (rocm-smi) or NVIDIA (pynvml) drivers are installed."
)
def list_available_gpus() -> List[str]:
"""
List all available GPUs.
Returns:
List of GPU names
"""
gpus = []
# Try NVIDIA
try:
import pynvml
pynvml.nvmlInit()
device_count = pynvml.nvmlDeviceGetCount()
for i in range(device_count):
handle = pynvml.nvmlDeviceGetHandleByIndex(i)
name = pynvml.nvmlDeviceGetName(handle)
if isinstance(name, bytes):
name = name.decode('utf-8')
gpus.append(f"GPU {i}: {name} (NVIDIA)")
pynvml.nvmlShutdown()
    except Exception:
pass
# Try AMD with rocm-smi
try:
import subprocess
import shutil
if shutil.which('rocm-smi'):
result = subprocess.run(
['rocm-smi', '--showid'],
capture_output=True,
text=True,
timeout=5
)
if result.returncode == 0:
# Parse device IDs from output
for line in result.stdout.split('\n'):
if not line.strip() or line.startswith('=') or 'Device' in line or 'ROCm' in line:
continue
parts = line.split()
if parts and parts[0].isdigit():
device_id = int(parts[0])
# Try to get device name
name_result = subprocess.run(
['rocm-smi', '--showproductname', '-d', str(device_id)],
capture_output=True,
text=True,
timeout=5
)
name = f"AMD GPU"
if name_result.returncode == 0:
for name_line in name_result.stdout.split('\n'):
if 'Card' in name_line or 'name' in name_line.lower():
parts_name = name_line.split(':')
if len(parts_name) > 1:
name = parts_name[1].strip()
break
gpus.append(f"GPU {device_id}: {name} (AMD)")
    except Exception:
pass
return gpus
if __name__ == "__main__":
"""Test GPU monitoring."""
print("=" * 60)
print("GPU Monitoring Test")
print("=" * 60)
# List available GPUs
print("\nAvailable GPUs:")
gpus = list_available_gpus()
if not gpus:
print(" No GPUs found!")
        sys.exit(1)
for gpu in gpus:
print(f" {gpu}")
# Test monitoring
print("\nTesting GPU 0 monitoring...")
try:
monitor = get_gpu_monitor(0)
print(f" Device: {monitor.get_device_name()}")
# Get metrics
metrics = monitor.get_metrics()
print(f"\nCurrent Metrics:")
print(f" Power: {metrics.power_watts:.2f} W")
print(f" GPU Utilization: {metrics.gpu_utilization_percent:.1f}%")
print(f" Memory: {metrics.memory_used_gb:.2f} / {metrics.memory_total_gb:.2f} GB")
        if metrics.temperature_celsius is not None:
print(f" Temperature: {metrics.temperature_celsius:.1f}°C")
# Test energy monitoring
print("\nTesting energy monitoring (5 seconds)...")
monitor.start_monitoring()
time.sleep(5)
energy = monitor.get_energy_consumed()
avg_power = monitor.get_average_power()
print(f" Energy consumed: {energy:.2f} J")
print(f" Average power: {avg_power:.2f} W")
monitor.cleanup()
print("\n✓ Monitoring test successful!")
except Exception as e:
print(f"\n✗ Error: {e}")
        sys.exit(1)