"""
GPU Monitoring Infrastructure for LLM Benchmarking
Provides unified interface for monitoring both NVIDIA and AMD GPUs.
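
Typical usage (mirrors the self-test at the bottom of this file):

    monitor = get_gpu_monitor(device_id=0)  # auto-detects NVIDIA vs. AMD
    monitor.start_monitoring()
    # ... run the workload to be measured ...
    energy_joules = monitor.get_energy_consumed()
    avg_watts = monitor.get_average_power()
    monitor.cleanup()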
"""
import sys
import time
from abc import ABC, abstractmethod
from dataclasses import dataclass
from typing import Optional, List
@dataclass
class GPUMetrics:
"""Container for GPU metrics."""
timestamp: float
power_watts: float
gpu_utilization_percent: float
memory_used_gb: float
memory_total_gb: float
temperature_celsius: Optional[float] = None
energy_joules: Optional[float] = None # Cumulative energy
class GPUMonitor(ABC):
"""Abstract base class for GPU monitoring."""
def __init__(self, device_id: int = 0):
"""
Initialize GPU monitor.
Args:
device_id: GPU device ID to monitor
"""
self.device_id = device_id
self.start_time = None
self.start_energy = None
self.last_metrics = None
@abstractmethod
def get_metrics(self) -> GPUMetrics:
"""Get current GPU metrics."""
pass
@abstractmethod
def get_device_name(self) -> str:
"""Get GPU device name."""
pass
@abstractmethod
def cleanup(self):
"""Cleanup resources."""
pass
def start_monitoring(self):
"""Start energy monitoring session."""
self.start_time = time.time()
metrics = self.get_metrics()
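        # Baseline for the cumulative energy counter; defaulting to 0.0 keeps the
        # power-integration fallback in get_energy_consumed() working.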
self.start_energy = metrics.energy_joules if metrics.energy_joules is not None else 0.0
self.last_metrics = metrics
def get_energy_consumed(self) -> float:
"""
Get energy consumed since start_monitoring() was called.
Returns:
Energy in Joules
"""
if self.start_time is None:
raise RuntimeError("Must call start_monitoring() first")
current_metrics = self.get_metrics()
if current_metrics.energy_joules is not None:
# If GPU provides cumulative energy, use it
return current_metrics.energy_joules - self.start_energy
else:
# Otherwise, integrate power over time
elapsed_time = time.time() - self.start_time
# Use average of start and current power
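            # (a single trapezoid between the first and latest sample; coarse if
            # power varies substantially mid-run)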
avg_power = (self.last_metrics.power_watts + current_metrics.power_watts) / 2.0
return avg_power * elapsed_time
def get_average_power(self) -> float:
"""
Get average power consumption since start_monitoring().
Returns:
Average power in Watts
"""
if self.start_time is None:
raise RuntimeError("Must call start_monitoring() first")
elapsed_time = time.time() - self.start_time
        if elapsed_time <= 0:
return 0.0
energy = self.get_energy_consumed()
return energy / elapsed_time
class NVIDIAMonitor(GPUMonitor):
"""NVIDIA GPU monitor using pynvml."""
def __init__(self, device_id: int = 0):
"""Initialize NVIDIA monitor."""
try:
import pynvml
self.pynvml = pynvml
except ImportError:
raise ImportError(
"pynvml not found. Install with: pip install pynvml"
)
try:
self.pynvml.nvmlInit()
self.handle = self.pynvml.nvmlDeviceGetHandleByIndex(device_id)
except Exception as e:
raise RuntimeError(f"Failed to initialize NVIDIA GPU {device_id}: {e}")
super().__init__(device_id)
def get_metrics(self) -> GPUMetrics:
"""Get current NVIDIA GPU metrics."""
try:
# Power (in milliwatts)
power_mw = self.pynvml.nvmlDeviceGetPowerUsage(self.handle)
power_watts = power_mw / 1000.0
# Utilization
util = self.pynvml.nvmlDeviceGetUtilizationRates(self.handle)
gpu_util = util.gpu
# Memory
mem_info = self.pynvml.nvmlDeviceGetMemoryInfo(self.handle)
memory_used_gb = mem_info.used / (1024**3)
memory_total_gb = mem_info.total / (1024**3)
# Temperature
try:
temp = self.pynvml.nvmlDeviceGetTemperature(
self.handle,
self.pynvml.NVML_TEMPERATURE_GPU
)
            except Exception:
temp = None
# Try to get cumulative energy (newer GPUs)
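            # NVML's counter reports millijoules accumulated since the driver was
            # last loaded; it is only available on newer (Volta and later) parts.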
energy_joules = None
try:
energy_mj = self.pynvml.nvmlDeviceGetTotalEnergyConsumption(self.handle)
energy_joules = energy_mj / 1000.0
            except Exception:
# Not supported on this GPU, will use power integration
pass
return GPUMetrics(
timestamp=time.time(),
power_watts=power_watts,
gpu_utilization_percent=gpu_util,
memory_used_gb=memory_used_gb,
memory_total_gb=memory_total_gb,
temperature_celsius=temp,
energy_joules=energy_joules
)
except Exception as e:
raise RuntimeError(f"Failed to get NVIDIA GPU metrics: {e}")
def get_device_name(self) -> str:
"""Get NVIDIA GPU device name."""
try:
name = self.pynvml.nvmlDeviceGetName(self.handle)
if isinstance(name, bytes):
name = name.decode('utf-8')
return name
        except Exception:
return f"NVIDIA GPU {self.device_id}"
def cleanup(self):
"""Cleanup NVIDIA resources."""
try:
self.pynvml.nvmlShutdown()
        except Exception:
pass
class AMDMonitor(GPUMonitor):
"""AMD GPU monitor using rocm-smi command line tool."""
def __init__(self, device_id: int = 0):
"""Initialize AMD monitor."""
import subprocess
import shutil
# Check if rocm-smi is available
if shutil.which('rocm-smi') is None:
raise RuntimeError("rocm-smi command not found. Make sure ROCm is installed and in PATH.")
# Verify device exists
try:
result = subprocess.run(
['rocm-smi', '--showid'],
capture_output=True,
text=True,
timeout=5
)
if result.returncode != 0:
raise RuntimeError(f"rocm-smi failed: {result.stderr}")
except subprocess.TimeoutExpired:
raise RuntimeError("rocm-smi command timed out")
except Exception as e:
raise RuntimeError(f"Failed to initialize AMD GPU {device_id}: {e}")
super().__init__(device_id)
def _parse_detailed_output(self, output: str) -> dict:
"""Parse rocm-smi detailed output format."""
lines = output.strip().split('\n')
# Parse detailed format: GPU[X] : Metric : Value
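        # Example lines (shape assumed from the keyword matching below; exact
        # labels vary across ROCm versions):
        #   GPU[0]    : Temperature (Sensor junction) (C): 41.0
        #   GPU[0]    : Average Graphics Package Power (W): 96.0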
metrics = {
'temperature': None,
'power': None,
'vram_percent': None,
'gpu_percent': None,
}
device_prefix = f"GPU[{self.device_id}]"
for line in lines:
if not line.strip() or not line.startswith(device_prefix):
continue
# Split by colon
parts = line.split(':')
if len(parts) < 3:
continue
metric_name = parts[1].strip().lower()
value_str = parts[2].strip()
try:
# Temperature (Sensor junction)
if 'temperature' in metric_name and 'junction' in metric_name:
metrics['temperature'] = float(value_str)
# Power consumption
elif 'power' in metric_name and 'package' in metric_name:
metrics['power'] = float(value_str)
# GPU utilization
elif 'gpu use' in metric_name:
metrics['gpu_percent'] = float(value_str)
# VRAM usage percentage
elif 'memory allocated' in metric_name and 'vram%' in metric_name:
metrics['vram_percent'] = float(value_str)
except (ValueError, IndexError):
continue
# Validate we got the required metrics
if metrics['temperature'] is None:
raise ValueError(f"Could not find temperature for GPU[{self.device_id}]")
if metrics['power'] is None:
raise ValueError(f"Could not find power for GPU[{self.device_id}]")
if metrics['gpu_percent'] is None:
metrics['gpu_percent'] = 0.0
if metrics['vram_percent'] is None:
metrics['vram_percent'] = 0.0
return metrics
def _get_memory_info(self) -> tuple:
"""Get memory usage in GB using rocm-smi --showmeminfo."""
import subprocess
try:
result = subprocess.run(
['rocm-smi', '--showmeminfo', 'vram', '-d', str(self.device_id)],
capture_output=True,
text=True,
timeout=5
)
if result.returncode != 0:
return 0.0, 0.0
            # Parse output for memory info. Expected lines resemble
            # "GPU memory used: X MiB" / "GPU memory total: Y MiB", though the exact
            # wording and units vary by ROCm version, so the unit token after the
            # number is sniffed below.
used_gb = 0.0
total_gb = 0.0
for line in result.stdout.split('\n'):
if 'Used' in line or 'used' in line:
# Extract number
parts = line.split()
for i, part in enumerate(parts):
if part.replace('.', '').isdigit():
                            used_value = float(part)
                            # Check if next part indicates unit
                            if i + 1 < len(parts):
                                unit = parts[i + 1].lower()
                                if 'mb' in unit or 'mib' in unit:
                                    used_gb = used_value / 1024
                                elif 'gb' in unit or 'gib' in unit:
                                    used_gb = used_value
                                elif 'kb' in unit or 'kib' in unit:
                                    used_gb = used_value / (1024 * 1024)
break
if 'Total' in line or 'total' in line:
parts = line.split()
for i, part in enumerate(parts):
if part.replace('.', '').isdigit():
                            total_value = float(part)
                            if i + 1 < len(parts):
                                unit = parts[i + 1].lower()
                                if 'mb' in unit or 'mib' in unit:
                                    total_gb = total_value / 1024
                                elif 'gb' in unit or 'gib' in unit:
                                    total_gb = total_value
                                elif 'kb' in unit or 'kib' in unit:
                                    total_gb = total_value / (1024 * 1024)
break
return used_gb, total_gb
except Exception:
return 0.0, 0.0
def get_metrics(self) -> GPUMetrics:
"""Get current AMD GPU metrics."""
import subprocess
try:
# Get main metrics from concise output
result = subprocess.run(
['rocm-smi', '--showid', '--showtemp', '--showpower', '--showuse', '--showmemuse'],
capture_output=True,
text=True,
timeout=5
)
if result.returncode != 0:
raise RuntimeError(f"rocm-smi failed: {result.stderr}")
metrics = self._parse_detailed_output(result.stdout)
# Get detailed memory info
memory_used_gb, memory_total_gb = self._get_memory_info()
# If we couldn't get absolute memory, estimate from percentage
if memory_total_gb == 0.0:
# MI300X has ~192GB, MI250X has ~128GB - use a reasonable default
memory_total_gb = 192.0 # Assume MI300X
memory_used_gb = memory_total_gb * (metrics['vram_percent'] / 100.0)
return GPUMetrics(
timestamp=time.time(),
power_watts=metrics['power'],
gpu_utilization_percent=metrics['gpu_percent'],
memory_used_gb=memory_used_gb,
memory_total_gb=memory_total_gb,
temperature_celsius=metrics['temperature'],
energy_joules=None # Will use power integration
)
except subprocess.TimeoutExpired:
raise RuntimeError("rocm-smi command timed out")
except Exception as e:
raise RuntimeError(f"Failed to get AMD GPU metrics: {e}")
def get_device_name(self) -> str:
"""Get AMD GPU device name."""
import subprocess
try:
result = subprocess.run(
['rocm-smi', '--showproductname', '-d', str(self.device_id)],
capture_output=True,
text=True,
timeout=5
)
if result.returncode == 0:
# Parse output to find device name
for line in result.stdout.split('\n'):
if 'Card series' in line or 'Card model' in line or 'name' in line.lower():
parts = line.split(':')
if len(parts) > 1:
return parts[1].strip()
except Exception:
pass
return f"AMD GPU {self.device_id}"
def cleanup(self):
"""Cleanup AMD resources."""
# No cleanup needed for command-line tool
pass
def get_gpu_monitor(device_id: int = 0) -> GPUMonitor:
"""
Factory function to automatically detect and create appropriate GPU monitor.
Args:
device_id: GPU device ID to monitor
Returns:
GPUMonitor instance (NVIDIAMonitor or AMDMonitor)
Raises:
RuntimeError: If no supported GPU is found
"""
    # Try AMD first (rocm-smi based); its shutil.which() probe fails fast when ROCm is absent
try:
return AMDMonitor(device_id)
    except Exception:
pass
# Try NVIDIA if AMD fails
try:
return NVIDIAMonitor(device_id)
    except Exception:
pass
# Try to import torch to detect GPU type as last resort
try:
import torch
if torch.cuda.is_available():
# Check if it's NVIDIA or AMD
device_name = torch.cuda.get_device_name(device_id).lower()
if 'nvidia' in device_name or 'tesla' in device_name or 'geforce' in device_name:
return NVIDIAMonitor(device_id)
elif 'amd' in device_name or 'radeon' in device_name or 'mi300' in device_name or 'mi200' in device_name:
return AMDMonitor(device_id)
    except Exception:
pass
raise RuntimeError(
"No supported GPU found. Make sure either ROCm (rocm-smi) or NVIDIA (pynvml) drivers are installed."
)
def list_available_gpus() -> List[str]:
"""
List all available GPUs.
Returns:
List of GPU names
"""
gpus = []
# Try NVIDIA
try:
import pynvml
pynvml.nvmlInit()
device_count = pynvml.nvmlDeviceGetCount()
for i in range(device_count):
handle = pynvml.nvmlDeviceGetHandleByIndex(i)
name = pynvml.nvmlDeviceGetName(handle)
if isinstance(name, bytes):
name = name.decode('utf-8')
gpus.append(f"GPU {i}: {name} (NVIDIA)")
pynvml.nvmlShutdown()
    except Exception:
pass
# Try AMD with rocm-smi
try:
import subprocess
import shutil
if shutil.which('rocm-smi'):
result = subprocess.run(
['rocm-smi', '--showid'],
capture_output=True,
text=True,
timeout=5
)
if result.returncode == 0:
# Parse device IDs from output
for line in result.stdout.split('\n'):
if not line.strip() or line.startswith('=') or 'Device' in line or 'ROCm' in line:
continue
parts = line.split()
if parts and parts[0].isdigit():
device_id = int(parts[0])
# Try to get device name
name_result = subprocess.run(
['rocm-smi', '--showproductname', '-d', str(device_id)],
capture_output=True,
text=True,
timeout=5
)
name = f"AMD GPU"
if name_result.returncode == 0:
for name_line in name_result.stdout.split('\n'):
if 'Card' in name_line or 'name' in name_line.lower():
parts_name = name_line.split(':')
if len(parts_name) > 1:
name = parts_name[1].strip()
break
gpus.append(f"GPU {device_id}: {name} (AMD)")
    except Exception:
pass
return gpus
if __name__ == "__main__":
"""Test GPU monitoring."""
print("=" * 60)
print("GPU Monitoring Test")
print("=" * 60)
# List available GPUs
print("\nAvailable GPUs:")
gpus = list_available_gpus()
if not gpus:
print(" No GPUs found!")
        sys.exit(1)
for gpu in gpus:
print(f" {gpu}")
# Test monitoring
print("\nTesting GPU 0 monitoring...")
try:
monitor = get_gpu_monitor(0)
print(f" Device: {monitor.get_device_name()}")
# Get metrics
metrics = monitor.get_metrics()
print(f"\nCurrent Metrics:")
print(f" Power: {metrics.power_watts:.2f} W")
print(f" GPU Utilization: {metrics.gpu_utilization_percent:.1f}%")
print(f" Memory: {metrics.memory_used_gb:.2f} / {metrics.memory_total_gb:.2f} GB")
        if metrics.temperature_celsius is not None:
print(f" Temperature: {metrics.temperature_celsius:.1f}°C")
# Test energy monitoring
print("\nTesting energy monitoring (5 seconds)...")
monitor.start_monitoring()
time.sleep(5)
energy = monitor.get_energy_consumed()
avg_power = monitor.get_average_power()
print(f" Energy consumed: {energy:.2f} J")
print(f" Average power: {avg_power:.2f} W")
monitor.cleanup()
print("\n✓ Monitoring test successful!")
except Exception as e:
print(f"\n✗ Error: {e}")
        sys.exit(1)