"""
|
|
GPU Monitoring Infrastructure for LLM Benchmarking
|
|
|
|
Provides unified interface for monitoring both NVIDIA and AMD GPUs.
|
|
"""
|
|
|
|
import time
|
|
from abc import ABC, abstractmethod
|
|
from dataclasses import dataclass
|
|
from typing import Optional, List
|
|
import warnings
|
|
|
|
|
|
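# Illustrative usage (a minimal sketch; assumes a supported GPU and its driver
# stack are available, and `run_benchmark` is a hypothetical workload):
#
#   monitor = get_gpu_monitor(device_id=0)
#   monitor.start_monitoring()
#   run_benchmark()
#   print(f"{monitor.get_energy_consumed():.1f} J @ {monitor.get_average_power():.1f} W avg")
#   monitor.cleanup()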
@dataclass
class GPUMetrics:
    """Container for GPU metrics."""
    timestamp: float
    power_watts: float
    gpu_utilization_percent: float
    memory_used_gb: float
    memory_total_gb: float
    temperature_celsius: Optional[float] = None
    energy_joules: Optional[float] = None  # Cumulative energy


class GPUMonitor(ABC):
    """Abstract base class for GPU monitoring."""

    def __init__(self, device_id: int = 0):
        """
        Initialize GPU monitor.

        Args:
            device_id: GPU device ID to monitor
        """
        self.device_id = device_id
        self.start_time = None
        self.start_energy = None
        self.last_metrics = None

    @abstractmethod
    def get_metrics(self) -> GPUMetrics:
        """Get current GPU metrics."""
        pass

    @abstractmethod
    def get_device_name(self) -> str:
        """Get GPU device name."""
        pass

    @abstractmethod
    def cleanup(self):
        """Cleanup resources."""
        pass

    def start_monitoring(self):
        """Start energy monitoring session."""
        self.start_time = time.time()
        metrics = self.get_metrics()
        self.start_energy = metrics.energy_joules if metrics.energy_joules is not None else 0.0
        self.last_metrics = metrics

    def get_energy_consumed(self) -> float:
        """
        Get energy consumed since start_monitoring() was called.

        Returns:
            Energy in Joules
        """
        if self.start_time is None:
            raise RuntimeError("Must call start_monitoring() first")

        current_metrics = self.get_metrics()

        if current_metrics.energy_joules is not None:
            # If the GPU provides a cumulative energy counter, use it directly
            return current_metrics.energy_joules - self.start_energy
        else:
            # Otherwise, integrate power over time using a two-point approximation:
            # the average of the power at start_monitoring() and the current power
            elapsed_time = time.time() - self.start_time
            avg_power = (self.last_metrics.power_watts + current_metrics.power_watts) / 2.0
            return avg_power * elapsed_time

    def get_average_power(self) -> float:
        """
        Get average power consumption since start_monitoring().

        Returns:
            Average power in Watts
        """
        if self.start_time is None:
            raise RuntimeError("Must call start_monitoring() first")

        elapsed_time = time.time() - self.start_time
        if elapsed_time == 0:
            return 0.0

        energy = self.get_energy_consumed()
        return energy / elapsed_time


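# Note: when a GPU does not expose a cumulative energy counter,
# GPUMonitor.get_energy_consumed() falls back to a two-point power average,
# which can drift for long or bursty workloads. An illustrative caller-side
# alternative (hypothetical sketch, not part of this module's API) is to sample
# power periodically and integrate:
#
#   samples = []
#   monitor.start_monitoring()
#   while workload_is_running():            # hypothetical predicate
#       samples.append(monitor.get_metrics().power_watts)
#       time.sleep(1.0)
#   energy_j = (sum(samples) / len(samples)) * (time.time() - monitor.start_time)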
class NVIDIAMonitor(GPUMonitor):
    """NVIDIA GPU monitor using pynvml."""

    def __init__(self, device_id: int = 0):
        """Initialize NVIDIA monitor."""
        try:
            import pynvml
            self.pynvml = pynvml
        except ImportError:
            raise ImportError(
                "pynvml not found. Install with: pip install pynvml"
            )

        try:
            self.pynvml.nvmlInit()
            self.handle = self.pynvml.nvmlDeviceGetHandleByIndex(device_id)
        except Exception as e:
            raise RuntimeError(f"Failed to initialize NVIDIA GPU {device_id}: {e}")

        super().__init__(device_id)

    def get_metrics(self) -> GPUMetrics:
        """Get current NVIDIA GPU metrics."""
        try:
            # Power (in milliwatts)
            power_mw = self.pynvml.nvmlDeviceGetPowerUsage(self.handle)
            power_watts = power_mw / 1000.0

            # Utilization
            util = self.pynvml.nvmlDeviceGetUtilizationRates(self.handle)
            gpu_util = util.gpu

            # Memory
            mem_info = self.pynvml.nvmlDeviceGetMemoryInfo(self.handle)
            memory_used_gb = mem_info.used / (1024**3)
            memory_total_gb = mem_info.total / (1024**3)

            # Temperature
            try:
                temp = self.pynvml.nvmlDeviceGetTemperature(
                    self.handle,
                    self.pynvml.NVML_TEMPERATURE_GPU
                )
            except Exception:
                temp = None

            # Try to get cumulative energy in millijoules (newer GPUs)
            energy_joules = None
            try:
                energy_mj = self.pynvml.nvmlDeviceGetTotalEnergyConsumption(self.handle)
                energy_joules = energy_mj / 1000.0
            except Exception:
                # Not supported on this GPU, will use power integration
                pass

            return GPUMetrics(
                timestamp=time.time(),
                power_watts=power_watts,
                gpu_utilization_percent=gpu_util,
                memory_used_gb=memory_used_gb,
                memory_total_gb=memory_total_gb,
                temperature_celsius=temp,
                energy_joules=energy_joules
            )
        except Exception as e:
            raise RuntimeError(f"Failed to get NVIDIA GPU metrics: {e}")

    def get_device_name(self) -> str:
        """Get NVIDIA GPU device name."""
        try:
            name = self.pynvml.nvmlDeviceGetName(self.handle)
            if isinstance(name, bytes):
                name = name.decode('utf-8')
            return name
        except Exception:
            return f"NVIDIA GPU {self.device_id}"

    def cleanup(self):
        """Cleanup NVIDIA resources."""
        try:
            self.pynvml.nvmlShutdown()
        except Exception:
            pass


class AMDMonitor(GPUMonitor):
    """AMD GPU monitor using the rocm-smi command line tool."""

    def __init__(self, device_id: int = 0):
        """Initialize AMD monitor."""
        import subprocess
        import shutil

        # Check if rocm-smi is available
        if shutil.which('rocm-smi') is None:
            raise RuntimeError("rocm-smi command not found. Make sure ROCm is installed and in PATH.")

        # Verify that rocm-smi responds before committing to this backend
        try:
            result = subprocess.run(
                ['rocm-smi', '--showid'],
                capture_output=True,
                text=True,
                timeout=5
            )
            if result.returncode != 0:
                raise RuntimeError(f"rocm-smi failed: {result.stderr}")
        except subprocess.TimeoutExpired:
            raise RuntimeError("rocm-smi command timed out")
        except Exception as e:
            raise RuntimeError(f"Failed to initialize AMD GPU {device_id}: {e}")

        super().__init__(device_id)

    def _parse_detailed_output(self, output: str) -> dict:
        """Parse rocm-smi detailed output format."""
        lines = output.strip().split('\n')

        # Parse detailed format: GPU[X] : Metric : Value
        metrics = {
            'temperature': None,
            'power': None,
            'vram_percent': None,
            'gpu_percent': None,
        }

        device_prefix = f"GPU[{self.device_id}]"

        for line in lines:
            if not line.strip() or not line.startswith(device_prefix):
                continue

            # Split by colon
            parts = line.split(':')
            if len(parts) < 3:
                continue

            metric_name = parts[1].strip().lower()
            value_str = parts[2].strip()

            try:
                # Temperature (Sensor junction)
                if 'temperature' in metric_name and 'junction' in metric_name:
                    metrics['temperature'] = float(value_str)

                # Power consumption
                elif 'power' in metric_name and 'package' in metric_name:
                    metrics['power'] = float(value_str)

                # GPU utilization
                elif 'gpu use' in metric_name:
                    metrics['gpu_percent'] = float(value_str)

                # VRAM usage percentage
                elif 'memory allocated' in metric_name and 'vram%' in metric_name:
                    metrics['vram_percent'] = float(value_str)

            except (ValueError, IndexError):
                continue

        # Validate that we got the required metrics
        if metrics['temperature'] is None:
            raise ValueError(f"Could not find temperature for GPU[{self.device_id}]")
        if metrics['power'] is None:
            raise ValueError(f"Could not find power for GPU[{self.device_id}]")
        if metrics['gpu_percent'] is None:
            metrics['gpu_percent'] = 0.0
        if metrics['vram_percent'] is None:
            metrics['vram_percent'] = 0.0

        return metrics

    def _get_memory_info(self) -> tuple:
        """Get memory usage in GB using rocm-smi --showmeminfo."""
        import subprocess

        try:
            result = subprocess.run(
                ['rocm-smi', '--showmeminfo', 'vram', '-d', str(self.device_id)],
                capture_output=True,
                text=True,
                timeout=5
            )

            if result.returncode != 0:
                return 0.0, 0.0

            # Parse output for lines reporting used/total VRAM, e.g.
            # "GPU memory used: X MiB". Wording and units vary across rocm-smi
            # versions, so the unit suffix is inspected when present.
            used_gb = 0.0
            total_gb = 0.0

            for line in result.stdout.split('\n'):
                if 'Used' in line or 'used' in line:
                    # Extract the first numeric token
                    parts = line.split()
                    for i, part in enumerate(parts):
                        if part.replace('.', '').isdigit():
                            used_value = float(part)
                            # Check if the next token indicates a unit
                            if i + 1 < len(parts):
                                unit = parts[i + 1].lower()
                                if 'mb' in unit or 'mib' in unit:
                                    used_gb = used_value / 1024
                                elif 'gb' in unit or 'gib' in unit:
                                    used_gb = used_value
                                elif 'kb' in unit or 'kib' in unit:
                                    used_gb = used_value / (1024 * 1024)
                            break

                if 'Total' in line or 'total' in line:
                    parts = line.split()
                    for i, part in enumerate(parts):
                        if part.replace('.', '').isdigit():
                            total_value = float(part)
                            if i + 1 < len(parts):
                                unit = parts[i + 1].lower()
                                if 'mb' in unit or 'mib' in unit:
                                    total_gb = total_value / 1024
                                elif 'gb' in unit or 'gib' in unit:
                                    total_gb = total_value
                                elif 'kb' in unit or 'kib' in unit:
                                    total_gb = total_value / (1024 * 1024)
                            break

            return used_gb, total_gb

        except Exception:
            return 0.0, 0.0

    def get_metrics(self) -> GPUMetrics:
        """Get current AMD GPU metrics."""
        import subprocess

        try:
            # Query temperature, power, utilization, and VRAM usage in one call
            result = subprocess.run(
                ['rocm-smi', '--showid', '--showtemp', '--showpower', '--showuse', '--showmemuse'],
                capture_output=True,
                text=True,
                timeout=5
            )

            if result.returncode != 0:
                raise RuntimeError(f"rocm-smi failed: {result.stderr}")

            metrics = self._parse_detailed_output(result.stdout)

            # Get detailed memory info
            memory_used_gb, memory_total_gb = self._get_memory_info()

            # If we couldn't get absolute memory, estimate from the VRAM percentage
            if memory_total_gb == 0.0:
                # MI300X has ~192GB, MI250X has ~128GB - use a reasonable default
                memory_total_gb = 192.0  # Assume MI300X
                memory_used_gb = memory_total_gb * (metrics['vram_percent'] / 100.0)

            return GPUMetrics(
                timestamp=time.time(),
                power_watts=metrics['power'],
                gpu_utilization_percent=metrics['gpu_percent'],
                memory_used_gb=memory_used_gb,
                memory_total_gb=memory_total_gb,
                temperature_celsius=metrics['temperature'],
                energy_joules=None  # Will use power integration
            )

        except subprocess.TimeoutExpired:
            raise RuntimeError("rocm-smi command timed out")
        except Exception as e:
            raise RuntimeError(f"Failed to get AMD GPU metrics: {e}")

    def get_device_name(self) -> str:
        """Get AMD GPU device name."""
        import subprocess

        try:
            result = subprocess.run(
                ['rocm-smi', '--showproductname', '-d', str(self.device_id)],
                capture_output=True,
                text=True,
                timeout=5
            )

            if result.returncode == 0:
                # Parse output to find the device name
                for line in result.stdout.split('\n'):
                    if 'Card series' in line or 'Card model' in line or 'name' in line.lower():
                        parts = line.split(':')
                        if len(parts) > 1:
                            return parts[1].strip()
        except Exception:
            pass

        return f"AMD GPU {self.device_id}"

    def cleanup(self):
        """Cleanup AMD resources."""
        # No cleanup needed for a command-line tool
        pass


def get_gpu_monitor(device_id: int = 0) -> GPUMonitor:
    """
    Factory function to automatically detect and create the appropriate GPU monitor.

    Args:
        device_id: GPU device ID to monitor

    Returns:
        GPUMonitor instance (NVIDIAMonitor or AMDMonitor)

    Raises:
        RuntimeError: If no supported GPU is found
    """
    # Try AMD first (rocm-smi based) as it's more commonly available
    try:
        return AMDMonitor(device_id)
    except Exception:
        pass

    # Try NVIDIA if AMD fails
    try:
        return NVIDIAMonitor(device_id)
    except Exception:
        pass

    # As a last resort, try torch to detect the GPU type
    try:
        import torch
        if torch.cuda.is_available():
            # Check whether it's NVIDIA or AMD
            device_name = torch.cuda.get_device_name(device_id).lower()

            if 'nvidia' in device_name or 'tesla' in device_name or 'geforce' in device_name:
                return NVIDIAMonitor(device_id)
            elif 'amd' in device_name or 'radeon' in device_name or 'mi300' in device_name or 'mi200' in device_name:
                return AMDMonitor(device_id)
    except Exception:
        pass

    raise RuntimeError(
        "No supported GPU found. Make sure either ROCm (rocm-smi) or NVIDIA (pynvml) drivers are installed."
    )


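# The factory above tries AMD (rocm-smi), then NVIDIA (pynvml), then a
# torch-based heuristic. Callers that already know their hardware can skip
# auto-detection and instantiate a backend directly (illustrative):
#
#   monitor = NVIDIAMonitor(device_id=0)   # or AMDMonitor(device_id=0)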
def list_available_gpus() -> List[str]:
    """
    List all available GPUs.

    Returns:
        List of GPU names
    """
    gpus = []

    # Try NVIDIA
    try:
        import pynvml
        pynvml.nvmlInit()
        device_count = pynvml.nvmlDeviceGetCount()
        for i in range(device_count):
            handle = pynvml.nvmlDeviceGetHandleByIndex(i)
            name = pynvml.nvmlDeviceGetName(handle)
            if isinstance(name, bytes):
                name = name.decode('utf-8')
            gpus.append(f"GPU {i}: {name} (NVIDIA)")
        pynvml.nvmlShutdown()
    except Exception:
        pass

    # Try AMD with rocm-smi
    try:
        import subprocess
        import shutil

        if shutil.which('rocm-smi'):
            result = subprocess.run(
                ['rocm-smi', '--showid'],
                capture_output=True,
                text=True,
                timeout=5
            )
            if result.returncode == 0:
                # Parse device IDs from output
                for line in result.stdout.split('\n'):
                    if not line.strip() or line.startswith('=') or 'Device' in line or 'ROCm' in line:
                        continue
                    parts = line.split()
                    if parts and parts[0].isdigit():
                        device_id = int(parts[0])
                        # Try to get the device name
                        name_result = subprocess.run(
                            ['rocm-smi', '--showproductname', '-d', str(device_id)],
                            capture_output=True,
                            text=True,
                            timeout=5
                        )
                        name = "AMD GPU"
                        if name_result.returncode == 0:
                            for name_line in name_result.stdout.split('\n'):
                                if 'Card' in name_line or 'name' in name_line.lower():
                                    parts_name = name_line.split(':')
                                    if len(parts_name) > 1:
                                        name = parts_name[1].strip()
                                        break
                        gpus.append(f"GPU {device_id}: {name} (AMD)")
    except Exception:
        pass

    return gpus


if __name__ == "__main__":
    """Test GPU monitoring."""
    print("=" * 60)
    print("GPU Monitoring Test")
    print("=" * 60)

    # List available GPUs
    print("\nAvailable GPUs:")
    gpus = list_available_gpus()
    if not gpus:
        print("  No GPUs found!")
        exit(1)

    for gpu in gpus:
        print(f"  {gpu}")

    # Test monitoring
    print("\nTesting GPU 0 monitoring...")
    try:
        monitor = get_gpu_monitor(0)
        print(f"  Device: {monitor.get_device_name()}")

        # Get metrics
        metrics = monitor.get_metrics()
        print("\nCurrent Metrics:")
        print(f"  Power: {metrics.power_watts:.2f} W")
        print(f"  GPU Utilization: {metrics.gpu_utilization_percent:.1f}%")
        print(f"  Memory: {metrics.memory_used_gb:.2f} / {metrics.memory_total_gb:.2f} GB")
        if metrics.temperature_celsius is not None:
            print(f"  Temperature: {metrics.temperature_celsius:.1f}°C")

        # Test energy monitoring
        print("\nTesting energy monitoring (5 seconds)...")
        monitor.start_monitoring()
        time.sleep(5)
        energy = monitor.get_energy_consumed()
        avg_power = monitor.get_average_power()
        print(f"  Energy consumed: {energy:.2f} J")
        print(f"  Average power: {avg_power:.2f} W")

        monitor.cleanup()
        print("\n✓ Monitoring test successful!")

    except Exception as e:
        print(f"\n✗ Error: {e}")
        exit(1)