cocogoat/benchmark_inference.py
#!/usr/bin/env python3
"""
Inference Benchmark for LLM Performance Evaluation
Measures performance and energy metrics for inference workloads with
separate measurements for prefill and decode stages.
"""
import argparse
import os
import sys
import time
from pathlib import Path
from typing import Optional
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
from tqdm import tqdm
# Add utils to path
sys.path.insert(0, str(Path(__file__).parent))
from utils.gpu_monitor import get_gpu_monitor
from utils.metrics import StageMetrics, InferenceMetrics, MetricsReporter
from utils.attention import get_default_attention, configure_model_attention, validate_attention_for_gpu
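# gpu_monitor, metrics, and attention are project-local helpers expected under
# ./utils next to this script, which is why the script's own directory is
# added to sys.path above.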
def benchmark_inference(
    model_name_or_path: str,
    attn_implementation: str = "auto",
    num_requests: int = 10,
    prompt_length: int = 512,
    generation_length: int = 100,
    warmup_requests: int = 2,
    device: str = "cuda",
    device_id: int = 0,
    output_dir: Optional[str] = None,
    verbose: bool = True,
):
    """
    Run inference benchmark.

    Args:
        model_name_or_path: Path to model or HuggingFace identifier
        attn_implementation: Attention implementation to use
        num_requests: Number of inference requests to measure
        prompt_length: Length of input prompt
        generation_length: Number of tokens to generate
        warmup_requests: Number of warmup requests
        device: Device to use
        device_id: GPU device ID
        output_dir: Directory to save results
        verbose: Print verbose output
    """
print("=" * 80)
print("INFERENCE BENCHMARK")
print("=" * 80)
# Initialize GPU monitor
if verbose:
print("\n[1/7] Initializing GPU monitor...")
monitor = get_gpu_monitor(device_id)
gpu_name = monitor.get_device_name()
if verbose:
print(f" GPU: {gpu_name}")
# Determine attention implementation
if attn_implementation == "auto":
attn_implementation = get_default_attention(gpu_name)
if verbose:
print(f" Auto-selected attention: {attn_implementation}")
# Validate attention for GPU
valid, warning = validate_attention_for_gpu(attn_implementation, gpu_name)
if warning and verbose:
print(f"{warning}")
    # Load model
    if verbose:
        print(f"\n[2/7] Loading model: {model_name_or_path}")

    # Determine attn_implementation parameter for model loading
    load_attn = "flash_attention_2" if attn_implementation in ["flash_attention_2", "flash_attention_3_hopper"] else attn_implementation

    try:
        model = AutoModelForCausalLM.from_pretrained(
            model_name_or_path,
            torch_dtype=torch.bfloat16,
            attn_implementation=load_attn,
            trust_remote_code=True,
        )
        model = model.to(device)

        # Configure attention (patch if needed for FA3)
        model = configure_model_attention(model, attn_implementation, verbose=verbose)

        if verbose:
            total_params = sum(p.numel() for p in model.parameters())
            print(f" Total parameters: {total_params:,} ({total_params/1e9:.2f}B)")
    except Exception as e:
        print(f"✗ Error loading model: {e}")
        sys.exit(1)
    # Load tokenizer
    if verbose:
        print(f"\n[3/7] Loading tokenizer...")
    try:
        tokenizer = AutoTokenizer.from_pretrained(
            model_name_or_path,
            trust_remote_code=True
        )
    except Exception as e:
        print(f"✗ Error loading tokenizer: {e}")
        sys.exit(1)
    # Generate synthetic prompts
    if verbose:
        print(f"\n[4/7] Generating synthetic prompts...")
        print(f" Prompt length: {prompt_length}")
        print(f" Generation length: {generation_length}")

    # Create random input_ids (synthetic prompts)
    vocab_size = model.config.vocab_size
    # We'll create one prompt and reuse it
    prompt_ids = torch.randint(0, vocab_size, (1, prompt_length), device=device)
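    # Random token IDs are sufficient here: with greedy decoding of a fixed
    # length, latency and energy are driven by sequence lengths and model size
    # rather than by prompt content (the generated text itself is meaningless).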
    # Warmup
    if verbose:
        print(f"\n[5/7] Running warmup ({warmup_requests} requests)...")
    model.eval()
    with torch.no_grad():
        for _ in range(warmup_requests):
            _ = model.generate(
                prompt_ids,
                max_new_tokens=generation_length,
                do_sample=False,
                pad_token_id=tokenizer.eos_token_id
            )

    # Synchronize before benchmarking
    torch.cuda.synchronize()
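    # The warmup requests above let CUDA kernel selection, autotuning, and
    # allocator caches settle so the timed requests below reflect steady-state
    # behavior.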
    # Benchmark
    if verbose:
        print(f"\n[6/7] Running benchmark ({num_requests} requests)...")

    # Storage for per-request metrics
    prefill_times = []
    decode_times = []
    e2e_times = []
    prefill_energies = []
    decode_energies = []
    e2e_energies = []
    prefill_powers = []
    decode_powers = []
    memory_usage = []
    gpu_utils = []

    # For inference, we separate prefill (first token) from decode (remaining tokens)
    # We'll use a custom generation loop to measure them separately
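    # Terminology used below:
    #   TTFT (time to first token) = per-request prefill latency
    #   ITL  (inter-token latency) = average per-token latency during decode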
    for req_idx in tqdm(range(num_requests), desc="Benchmarking"):
        # === PREFILL PHASE (Time to First Token) ===
        # This is the forward pass with the prompt to get the first token
        monitor.start_monitoring()
        torch.cuda.synchronize()
        prefill_start = time.perf_counter()

        with torch.no_grad():
            # Forward pass with prompt
            outputs = model(input_ids=prompt_ids, use_cache=True)
            logits = outputs.logits
            past_key_values = outputs.past_key_values
            # Get first generated token
            next_token_logits = logits[:, -1, :]
            next_token = torch.argmax(next_token_logits, dim=-1, keepdim=True)

        torch.cuda.synchronize()
        prefill_time = time.perf_counter() - prefill_start
        prefill_energy = monitor.get_energy_consumed()
        prefill_power = monitor.get_average_power()

        prefill_times.append(prefill_time * 1000)  # Convert to ms
        prefill_energies.append(prefill_energy)
        prefill_powers.append(prefill_power)
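        # Assumption: get_energy_consumed()/get_average_power() report values
        # accumulated since the preceding start_monitoring() call (see
        # utils/gpu_monitor.py), which is why monitoring is restarted per stage.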
        # === DECODE PHASE (Inter-Token Latency) ===
        # Generate remaining tokens one by one
        monitor.start_monitoring()
        torch.cuda.synchronize()
        decode_start = time.perf_counter()

        generated_tokens = [next_token]
        with torch.no_grad():
            for _ in range(generation_length - 1):
                # Forward pass with single token using cached keys/values
                outputs = model(
                    input_ids=next_token,
                    past_key_values=past_key_values,
                    use_cache=True
                )
                logits = outputs.logits
                past_key_values = outputs.past_key_values
                # Get next token
                next_token_logits = logits[:, -1, :]
                next_token = torch.argmax(next_token_logits, dim=-1, keepdim=True)
                generated_tokens.append(next_token)

        torch.cuda.synchronize()
        decode_time = time.perf_counter() - decode_start
        decode_energy = monitor.get_energy_consumed()
        decode_power = monitor.get_average_power()

        decode_times.append(decode_time * 1000)  # Convert to ms
        decode_energies.append(decode_energy)
        decode_powers.append(decode_power)

        # End-to-end metrics
        e2e_time = prefill_time + decode_time
        e2e_energy = prefill_energy + decode_energy
        e2e_times.append(e2e_time * 1000)  # Convert to ms
        e2e_energies.append(e2e_energy)

        # Get memory and utilization
        metrics = monitor.get_metrics()
        memory_usage.append(metrics.memory_used_gb)
        gpu_utils.append(metrics.gpu_utilization_percent)
    # Compute aggregated metrics
    # Prefill metrics (TTFT)
    prefill_duration_ms = sum(prefill_times)
    prefill_energy_j = sum(prefill_energies)
    prefill_tokens = prompt_length * num_requests
    prefill_tps = prefill_tokens / (prefill_duration_ms / 1000)
    prefill_ept = prefill_energy_j / prefill_tokens
    avg_ttft_ms = sum(prefill_times) / len(prefill_times)

    prefill_metrics = StageMetrics(
        stage_name="prefill",
        duration_ms=prefill_duration_ms,
        tokens_processed=prefill_tokens,
        tokens_per_second=prefill_tps,
        energy_joules=prefill_energy_j,
        energy_per_token=prefill_ept,
        avg_power_watts=sum(prefill_powers) / len(prefill_powers),
        peak_memory_gb=max(memory_usage),
        avg_gpu_util_percent=sum(gpu_utils) / len(gpu_utils)
    )
    # Decode metrics (ITL)
    decode_duration_ms = sum(decode_times)
    decode_energy_j = sum(decode_energies)
    decode_tokens = generation_length * num_requests
    decode_tps = decode_tokens / (decode_duration_ms / 1000)
    decode_ept = decode_energy_j / decode_tokens
    # The decode loop runs generation_length - 1 steps (the first token comes
    # from prefill), so average each request's decode time over those steps.
    avg_itl_ms = sum(decode_times) / len(decode_times) / max(generation_length - 1, 1)

    decode_metrics = StageMetrics(
        stage_name="decode",
        duration_ms=decode_duration_ms,
        tokens_processed=decode_tokens,
        tokens_per_second=decode_tps,
        energy_joules=decode_energy_j,
        energy_per_token=decode_ept,
        avg_power_watts=sum(decode_powers) / len(decode_powers),
        peak_memory_gb=max(memory_usage),
        avg_gpu_util_percent=sum(gpu_utils) / len(gpu_utils)
    )
    # End-to-end metrics
    e2e_latency_ms = sum(e2e_times) / len(e2e_times)
    e2e_energy_j = sum(e2e_energies)
    total_tokens = (prompt_length + generation_length) * num_requests
    e2e_tps = total_tokens / (sum(e2e_times) / 1000)
    e2e_ept = e2e_energy_j / total_tokens

    # Create metrics object
    metrics = InferenceMetrics(
        model_name=model_name_or_path,
        gpu_name=gpu_name,
        attention_implementation=attn_implementation,
        num_requests=num_requests,
        prompt_length=prompt_length,
        generation_length=generation_length,
        prefill=prefill_metrics,
        decode=decode_metrics,
        e2e_latency_ms=e2e_latency_ms,
        e2e_tokens_per_second=e2e_tps,
        e2e_energy_joules=e2e_energy_j,
        e2e_energy_per_token=e2e_ept,
        ttft_ms=avg_ttft_ms,
        itl_ms=avg_itl_ms
    )
    # Print results
    if verbose:
        print()
    MetricsReporter.print_inference_metrics(metrics, verbose=verbose)

    # Save results
    if output_dir:
        if verbose:
            print(f"\n[7/7] Saving results to {output_dir}...")
        output_path = Path(output_dir)
        output_path.mkdir(parents=True, exist_ok=True)

        # Save JSON
        json_path = output_path / f"inference_{gpu_name.replace(' ', '_')}_{attn_implementation}.json"
        MetricsReporter.save_json(metrics, json_path)

    # Cleanup
    monitor.cleanup()
    del model
    torch.cuda.empty_cache()

    return metrics
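# Programmatic usage (sketch; assumes a CUDA-capable GPU and that transformers
# can resolve the model weights):
#   from benchmark_inference import benchmark_inference
#   metrics = benchmark_inference(
#       "Qwen/Qwen3-4B",
#       attn_implementation="sdpa",
#       num_requests=5,
#       output_dir="./results",
#   )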
def main():
    parser = argparse.ArgumentParser(
        description="LLM Inference Benchmark",
        formatter_class=argparse.RawDescriptionHelpFormatter
    )
    parser.add_argument(
        "--model-path",
        type=str,
        default="./model_cache",
        help="Path to cached model"
    )
    parser.add_argument(
        "--model-name",
        type=str,
        default="Qwen/Qwen3-4B",
        help="Model name (for reporting)"
    )
    parser.add_argument(
        "--attn-implementation",
        type=str,
        default="auto",
        choices=["auto", "flash_attention_2", "flash_attention_3_hopper", "sdpa", "eager"],
        help="Attention implementation to use"
    )
    parser.add_argument(
        "--num-requests",
        type=int,
        default=10,
        help="Number of inference requests"
    )
    parser.add_argument(
        "--prompt-length",
        type=int,
        default=512,
        help="Prompt length in tokens"
    )
    parser.add_argument(
        "--generation-length",
        type=int,
        default=100,
        help="Number of tokens to generate"
    )
    parser.add_argument(
        "--warmup-requests",
        type=int,
        default=2,
        help="Number of warmup requests"
    )
    parser.add_argument(
        "--device-id",
        type=int,
        default=0,
        help="GPU device ID"
    )
    parser.add_argument(
        "--output-dir",
        type=str,
        default="./results",
        help="Output directory for results"
    )
    args = parser.parse_args()
    # Set environment variables for HuggingFace cache
    if Path(args.model_path).exists():
        os.environ['HF_HOME'] = args.model_path
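    # Note: HF_HOME is typically read by huggingface_hub/transformers at import
    # time, so setting it here may not redirect the cache for this process;
    # exporting HF_HOME before launching the script is the more reliable option.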
    benchmark_inference(
        model_name_or_path=args.model_name,
        attn_implementation=args.attn_implementation,
        num_requests=args.num_requests,
        prompt_length=args.prompt_length,
        generation_length=args.generation_length,
        warmup_requests=args.warmup_requests,
        device="cuda",
        device_id=args.device_id,
        output_dir=args.output_dir,
        verbose=True
    )


if __name__ == "__main__":
    main()