Initial commit
utils/gpu_monitor.py (new file, 562 lines)
@@ -0,0 +1,562 @@
"""
GPU Monitoring Infrastructure for LLM Benchmarking

Provides a unified interface for monitoring both NVIDIA and AMD GPUs.
"""

import sys
import time
from abc import ABC, abstractmethod
from dataclasses import dataclass
from typing import List, Optional

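# Illustrative end-to-end usage of this module (a sketch, not part of the
# original examples; assumes a supported GPU is visible and uses the factory
# get_gpu_monitor() defined below):
#
#     monitor = get_gpu_monitor(device_id=0)
#     monitor.start_monitoring()
#     run_inference_workload()   # hypothetical benchmark function
#     print(f"{monitor.get_energy_consumed():.1f} J "
#           f"at {monitor.get_average_power():.1f} W average")
#     monitor.cleanup()
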
@dataclass
class GPUMetrics:
    """Container for GPU metrics."""
    timestamp: float
    power_watts: float
    gpu_utilization_percent: float
    memory_used_gb: float
    memory_total_gb: float
    temperature_celsius: Optional[float] = None
    energy_joules: Optional[float] = None  # Cumulative energy

class GPUMonitor(ABC):
    """Abstract base class for GPU monitoring."""

    def __init__(self, device_id: int = 0):
        """
        Initialize GPU monitor.

        Args:
            device_id: GPU device ID to monitor
        """
        self.device_id = device_id
        self.start_time = None
        self.start_energy = None
        self.last_metrics = None

    @abstractmethod
    def get_metrics(self) -> GPUMetrics:
        """Get current GPU metrics."""
        pass

    @abstractmethod
    def get_device_name(self) -> str:
        """Get GPU device name."""
        pass

    @abstractmethod
    def cleanup(self):
        """Cleanup resources."""
        pass

    def start_monitoring(self):
        """Start energy monitoring session."""
        self.start_time = time.time()
        metrics = self.get_metrics()
        self.start_energy = metrics.energy_joules if metrics.energy_joules is not None else 0.0
        self.last_metrics = metrics

    def get_energy_consumed(self) -> float:
        """
        Get energy consumed since start_monitoring() was called.

        Returns:
            Energy in Joules
        """
        if self.start_time is None:
            raise RuntimeError("Must call start_monitoring() first")

        current_metrics = self.get_metrics()

        if current_metrics.energy_joules is not None:
            # If GPU provides cumulative energy, use it
            return current_metrics.energy_joules - self.start_energy
        else:
            # Otherwise, integrate power over time
            elapsed_time = time.time() - self.start_time
            # Use average of start and current power
            avg_power = (self.last_metrics.power_watts + current_metrics.power_watts) / 2.0
            return avg_power * elapsed_time

    def get_average_power(self) -> float:
        """
        Get average power consumption since start_monitoring().

        Returns:
            Average power in Watts
        """
        if self.start_time is None:
            raise RuntimeError("Must call start_monitoring() first")

        elapsed_time = time.time() - self.start_time
        if elapsed_time == 0:
            return 0.0

        energy = self.get_energy_consumed()
        return energy / elapsed_time

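# Worked example of the fallback estimate in get_energy_consumed() above
# (illustrative numbers, not measurements): with 280 W at start_monitoring()
# and 320 W at the time of the call, after 10 s the estimate is
# ((280 + 320) / 2) * 10 = 3000 J, i.e. the average of the two power
# readings multiplied by the elapsed time.
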
class NVIDIAMonitor(GPUMonitor):
    """NVIDIA GPU monitor using pynvml."""

    def __init__(self, device_id: int = 0):
        """Initialize NVIDIA monitor."""
        try:
            import pynvml
            self.pynvml = pynvml
        except ImportError:
            raise ImportError(
                "pynvml not found. Install with: pip install pynvml"
            )

        try:
            self.pynvml.nvmlInit()
            self.handle = self.pynvml.nvmlDeviceGetHandleByIndex(device_id)
        except Exception as e:
            raise RuntimeError(f"Failed to initialize NVIDIA GPU {device_id}: {e}")

        super().__init__(device_id)

    def get_metrics(self) -> GPUMetrics:
        """Get current NVIDIA GPU metrics."""
        try:
            # Power (reported in milliwatts)
            power_mw = self.pynvml.nvmlDeviceGetPowerUsage(self.handle)
            power_watts = power_mw / 1000.0

            # Utilization
            util = self.pynvml.nvmlDeviceGetUtilizationRates(self.handle)
            gpu_util = util.gpu

            # Memory
            mem_info = self.pynvml.nvmlDeviceGetMemoryInfo(self.handle)
            memory_used_gb = mem_info.used / (1024**3)
            memory_total_gb = mem_info.total / (1024**3)

            # Temperature
            try:
                temp = self.pynvml.nvmlDeviceGetTemperature(
                    self.handle,
                    self.pynvml.NVML_TEMPERATURE_GPU
                )
            except Exception:
                temp = None

            # Try to get cumulative energy (newer GPUs)
            energy_joules = None
            try:
                energy_mj = self.pynvml.nvmlDeviceGetTotalEnergyConsumption(self.handle)
                energy_joules = energy_mj / 1000.0  # reported in millijoules
            except Exception:
                # Not supported on this GPU; fall back to power integration
                pass

            return GPUMetrics(
                timestamp=time.time(),
                power_watts=power_watts,
                gpu_utilization_percent=gpu_util,
                memory_used_gb=memory_used_gb,
                memory_total_gb=memory_total_gb,
                temperature_celsius=temp,
                energy_joules=energy_joules
            )
        except Exception as e:
            raise RuntimeError(f"Failed to get NVIDIA GPU metrics: {e}")

    def get_device_name(self) -> str:
        """Get NVIDIA GPU device name."""
        try:
            name = self.pynvml.nvmlDeviceGetName(self.handle)
            if isinstance(name, bytes):
                name = name.decode('utf-8')
            return name
        except Exception:
            return f"NVIDIA GPU {self.device_id}"

    def cleanup(self):
        """Cleanup NVIDIA resources."""
        try:
            self.pynvml.nvmlShutdown()
        except Exception:
            pass

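# Direct use of NVIDIAMonitor (illustrative sketch; assumes pynvml is installed
# and an NVIDIA GPU is visible at index 0):
#
#     nv = NVIDIAMonitor(device_id=0)
#     m = nv.get_metrics()
#     print(f"{m.power_watts:.1f} W, {m.memory_used_gb:.1f}/{m.memory_total_gb:.1f} GB")
#     nv.cleanup()
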
class AMDMonitor(GPUMonitor):
    """AMD GPU monitor using rocm-smi command line tool."""

    def __init__(self, device_id: int = 0):
        """Initialize AMD monitor."""
        import subprocess
        import shutil

        # Check if rocm-smi is available
        if shutil.which('rocm-smi') is None:
            raise RuntimeError("rocm-smi command not found. Make sure ROCm is installed and in PATH.")

        self.device_id = device_id

        # Verify device exists
        try:
            result = subprocess.run(
                ['rocm-smi', '--showid'],
                capture_output=True,
                text=True,
                timeout=5
            )
            if result.returncode != 0:
                raise RuntimeError(f"rocm-smi failed: {result.stderr}")
        except subprocess.TimeoutExpired:
            raise RuntimeError("rocm-smi command timed out")
        except Exception as e:
            raise RuntimeError(f"Failed to initialize AMD GPU {device_id}: {e}")

        super().__init__(device_id)

    def _parse_detailed_output(self, output: str) -> dict:
        """Parse rocm-smi detailed output format."""
        lines = output.strip().split('\n')

        # Parse detailed format: GPU[X] : Metric : Value
        metrics = {
            'temperature': None,
            'power': None,
            'vram_percent': None,
            'gpu_percent': None,
        }

        device_prefix = f"GPU[{self.device_id}]"

        for line in lines:
            if not line.strip() or not line.startswith(device_prefix):
                continue

            # Split by colon
            parts = line.split(':')
            if len(parts) < 3:
                continue

            metric_name = parts[1].strip().lower()
            value_str = parts[2].strip()

            try:
                # Temperature (Sensor junction)
                if 'temperature' in metric_name and 'junction' in metric_name:
                    metrics['temperature'] = float(value_str)

                # Power consumption
                elif 'power' in metric_name and 'package' in metric_name:
                    metrics['power'] = float(value_str)

                # GPU utilization
                elif 'gpu use' in metric_name:
                    metrics['gpu_percent'] = float(value_str)

                # VRAM usage percentage
                elif 'memory allocated' in metric_name and 'vram%' in metric_name:
                    metrics['vram_percent'] = float(value_str)

            except (ValueError, IndexError):
                continue

        # Validate we got the required metrics
        if metrics['temperature'] is None:
            raise ValueError(f"Could not find temperature for GPU[{self.device_id}]")
        if metrics['power'] is None:
            raise ValueError(f"Could not find power for GPU[{self.device_id}]")
        if metrics['gpu_percent'] is None:
            metrics['gpu_percent'] = 0.0
        if metrics['vram_percent'] is None:
            metrics['vram_percent'] = 0.0

        return metrics

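    # Illustrative shape of the rocm-smi lines the parser above matches (the
    # exact wording varies across ROCm versions, hence the fuzzy substring checks):
    #   GPU[0]    : Temperature (Sensor junction) (C): 47.0
    #   GPU[0]    : Average Graphics Package Power (W): 131.0
    #   GPU[0]    : GPU use (%): 12
    #   GPU[0]    : GPU Memory Allocated (VRAM%): 4
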
    def _get_memory_info(self) -> tuple:
        """Get memory usage in GB using rocm-smi --showmeminfo."""
        import subprocess

        try:
            result = subprocess.run(
                ['rocm-smi', '--showmeminfo', 'vram', '-d', str(self.device_id)],
                capture_output=True,
                text=True,
                timeout=5
            )

            if result.returncode != 0:
                return 0.0, 0.0

            # Parse output for memory info
            # Looking for lines like "GPU memory used: X MiB" and "GPU memory total: Y MiB"
            used_gb = 0.0
            total_gb = 0.0

            for line in result.stdout.split('\n'):
                if 'Used' in line or 'used' in line:
                    # Extract the numeric value and convert based on its unit
                    parts = line.split()
                    for i, part in enumerate(parts):
                        if part.replace('.', '').isdigit():
                            used_value = float(part)
                            # Check if the next token indicates the unit
                            if i + 1 < len(parts):
                                unit = parts[i + 1].lower()
                                if 'mb' in unit or 'mib' in unit:
                                    used_gb = used_value / 1024
                                elif 'gb' in unit or 'gib' in unit:
                                    used_gb = used_value
                                elif 'kb' in unit or 'kib' in unit:
                                    used_gb = used_value / (1024 * 1024)
                            break

                if 'Total' in line or 'total' in line:
                    parts = line.split()
                    for i, part in enumerate(parts):
                        if part.replace('.', '').isdigit():
                            total_value = float(part)
                            if i + 1 < len(parts):
                                unit = parts[i + 1].lower()
                                if 'mb' in unit or 'mib' in unit:
                                    total_gb = total_value / 1024
                                elif 'gb' in unit or 'gib' in unit:
                                    total_gb = total_value
                                elif 'kb' in unit or 'kib' in unit:
                                    total_gb = total_value / (1024 * 1024)
                            break

            return used_gb, total_gb

        except Exception:
            return 0.0, 0.0

    def get_metrics(self) -> GPUMetrics:
        """Get current AMD GPU metrics."""
        import subprocess

        try:
            # Get main metrics from concise output
            result = subprocess.run(
                ['rocm-smi', '--showid', '--showtemp', '--showpower', '--showuse', '--showmemuse'],
                capture_output=True,
                text=True,
                timeout=5
            )

            if result.returncode != 0:
                raise RuntimeError(f"rocm-smi failed: {result.stderr}")

            metrics = self._parse_detailed_output(result.stdout)

            # Get detailed memory info
            memory_used_gb, memory_total_gb = self._get_memory_info()

            # If we couldn't get absolute memory, estimate from percentage
            if memory_total_gb == 0.0:
                # MI300X has ~192GB, MI250X has ~128GB - use a reasonable default
                memory_total_gb = 192.0  # Assume MI300X
                memory_used_gb = memory_total_gb * (metrics['vram_percent'] / 100.0)

            return GPUMetrics(
                timestamp=time.time(),
                power_watts=metrics['power'],
                gpu_utilization_percent=metrics['gpu_percent'],
                memory_used_gb=memory_used_gb,
                memory_total_gb=memory_total_gb,
                temperature_celsius=metrics['temperature'],
                energy_joules=None  # Will use power integration
            )

        except subprocess.TimeoutExpired:
            raise RuntimeError("rocm-smi command timed out")
        except Exception as e:
            raise RuntimeError(f"Failed to get AMD GPU metrics: {e}")

    def get_device_name(self) -> str:
        """Get AMD GPU device name."""
        import subprocess

        try:
            result = subprocess.run(
                ['rocm-smi', '--showproductname', '-d', str(self.device_id)],
                capture_output=True,
                text=True,
                timeout=5
            )

            if result.returncode == 0:
                # Parse output to find device name
                for line in result.stdout.split('\n'):
                    if 'Card series' in line or 'Card model' in line or 'name' in line.lower():
                        parts = line.split(':')
                        if len(parts) > 1:
                            return parts[1].strip()
        except Exception:
            pass

        return f"AMD GPU {self.device_id}"

    def cleanup(self):
        """Cleanup AMD resources."""
        # No cleanup needed for command-line tool
        pass

def get_gpu_monitor(device_id: int = 0) -> GPUMonitor:
    """
    Factory function to automatically detect and create the appropriate GPU monitor.

    Args:
        device_id: GPU device ID to monitor

    Returns:
        GPUMonitor instance (NVIDIAMonitor or AMDMonitor)

    Raises:
        RuntimeError: If no supported GPU is found
    """
    # Try AMD first (rocm-smi based) as it's more commonly available
    try:
        return AMDMonitor(device_id)
    except Exception:
        pass

    # Try NVIDIA if AMD fails
    try:
        return NVIDIAMonitor(device_id)
    except Exception:
        pass

    # Try to import torch to detect GPU type as a last resort
    try:
        import torch
        if torch.cuda.is_available():
            # Check if it's NVIDIA or AMD
            device_name = torch.cuda.get_device_name(device_id).lower()

            if 'nvidia' in device_name or 'tesla' in device_name or 'geforce' in device_name:
                return NVIDIAMonitor(device_id)
            elif 'amd' in device_name or 'radeon' in device_name or 'mi300' in device_name or 'mi200' in device_name:
                return AMDMonitor(device_id)
    except Exception:
        pass

    raise RuntimeError(
        "No supported GPU found. Make sure either ROCm (rocm-smi) or NVIDIA (pynvml) drivers are installed."
    )

def list_available_gpus() -> List[str]:
    """
    List all available GPUs.

    Returns:
        List of GPU names
    """
    gpus = []

    # Try NVIDIA
    try:
        import pynvml
        pynvml.nvmlInit()
        device_count = pynvml.nvmlDeviceGetCount()
        for i in range(device_count):
            handle = pynvml.nvmlDeviceGetHandleByIndex(i)
            name = pynvml.nvmlDeviceGetName(handle)
            if isinstance(name, bytes):
                name = name.decode('utf-8')
            gpus.append(f"GPU {i}: {name} (NVIDIA)")
        pynvml.nvmlShutdown()
    except Exception:
        pass

    # Try AMD with rocm-smi
    try:
        import subprocess
        import shutil

        if shutil.which('rocm-smi'):
            result = subprocess.run(
                ['rocm-smi', '--showid'],
                capture_output=True,
                text=True,
                timeout=5
            )
            if result.returncode == 0:
                # Parse device IDs from output
                for line in result.stdout.split('\n'):
                    if not line.strip() or line.startswith('=') or 'Device' in line or 'ROCm' in line:
                        continue
                    parts = line.split()
                    if parts and parts[0].isdigit():
                        device_id = int(parts[0])
                        # Try to get device name
                        name_result = subprocess.run(
                            ['rocm-smi', '--showproductname', '-d', str(device_id)],
                            capture_output=True,
                            text=True,
                            timeout=5
                        )
                        name = "AMD GPU"
                        if name_result.returncode == 0:
                            for name_line in name_result.stdout.split('\n'):
                                if 'Card' in name_line or 'name' in name_line.lower():
                                    parts_name = name_line.split(':')
                                    if len(parts_name) > 1:
                                        name = parts_name[1].strip()
                                        break
                        gpus.append(f"GPU {device_id}: {name} (AMD)")
    except Exception:
        pass

    return gpus

if __name__ == "__main__":
|
||||
"""Test GPU monitoring."""
|
||||
print("=" * 60)
|
||||
print("GPU Monitoring Test")
|
||||
print("=" * 60)
|
||||
|
||||
# List available GPUs
|
||||
print("\nAvailable GPUs:")
|
||||
gpus = list_available_gpus()
|
||||
if not gpus:
|
||||
print(" No GPUs found!")
|
||||
exit(1)
|
||||
|
||||
for gpu in gpus:
|
||||
print(f" {gpu}")
|
||||
|
||||
# Test monitoring
|
||||
print("\nTesting GPU 0 monitoring...")
|
||||
try:
|
||||
monitor = get_gpu_monitor(0)
|
||||
print(f" Device: {monitor.get_device_name()}")
|
||||
|
||||
# Get metrics
|
||||
metrics = monitor.get_metrics()
|
||||
print(f"\nCurrent Metrics:")
|
||||
print(f" Power: {metrics.power_watts:.2f} W")
|
||||
print(f" GPU Utilization: {metrics.gpu_utilization_percent:.1f}%")
|
||||
print(f" Memory: {metrics.memory_used_gb:.2f} / {metrics.memory_total_gb:.2f} GB")
|
||||
if metrics.temperature_celsius:
|
||||
print(f" Temperature: {metrics.temperature_celsius:.1f}°C")
|
||||
|
||||
# Test energy monitoring
|
||||
print("\nTesting energy monitoring (5 seconds)...")
|
||||
monitor.start_monitoring()
|
||||
time.sleep(5)
|
||||
energy = monitor.get_energy_consumed()
|
||||
avg_power = monitor.get_average_power()
|
||||
print(f" Energy consumed: {energy:.2f} J")
|
||||
print(f" Average power: {avg_power:.2f} W")
|
||||
|
||||
monitor.cleanup()
|
||||
print("\n✓ Monitoring test successful!")
|
||||
|
||||
except Exception as e:
|
||||
print(f"\n✗ Error: {e}")
|
||||
exit(1)
|
||||