#!/usr/bin/env python3
"""
Inference Benchmark for LLM Performance Evaluation

Measures performance and energy metrics for inference workloads with
separate measurements for prefill and decode stages.
"""

import argparse
import os
import sys
import time
from pathlib import Path
from typing import Optional

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
from tqdm import tqdm

# Add utils to path
sys.path.insert(0, str(Path(__file__).parent))

from utils.gpu_monitor import get_gpu_monitor
from utils.metrics import StageMetrics, InferenceMetrics, MetricsReporter
from utils.attention import (
    get_default_attention,
    configure_model_attention,
    validate_attention_for_gpu,
)


def benchmark_inference(
    model_name_or_path: str,
    attn_implementation: str = "auto",
    num_requests: int = 10,
    prompt_length: int = 512,
    generation_length: int = 100,
    warmup_requests: int = 2,
    device: str = "cuda",
    device_id: int = 0,
    output_dir: Optional[str] = None,
    verbose: bool = True,
):
    """
    Run inference benchmark.

    Args:
        model_name_or_path: Path to model or HuggingFace identifier
        attn_implementation: Attention implementation to use
        num_requests: Number of inference requests to measure
        prompt_length: Length of input prompt
        generation_length: Number of tokens to generate
        warmup_requests: Number of warmup requests
        device: Device to use
        device_id: GPU device ID
        output_dir: Directory to save results
        verbose: Print verbose output
    """
    print("=" * 80)
    print("INFERENCE BENCHMARK")
    print("=" * 80)

    # Initialize GPU monitor
    if verbose:
        print("\n[1/7] Initializing GPU monitor...")

    monitor = get_gpu_monitor(device_id)
    gpu_name = monitor.get_device_name()

    if verbose:
        print(f"  GPU: {gpu_name}")

    # Determine attention implementation
    if attn_implementation == "auto":
        attn_implementation = get_default_attention(gpu_name)
        if verbose:
            print(f"  Auto-selected attention: {attn_implementation}")

    # Validate attention for GPU
    valid, warning = validate_attention_for_gpu(attn_implementation, gpu_name)
    if warning and verbose:
        print(f"  ⚠ {warning}")

    # Load model
    if verbose:
        print(f"\n[2/7] Loading model: {model_name_or_path}")

    # Determine attn_implementation parameter for model loading
    load_attn = (
        "flash_attention_2"
        if attn_implementation in ["flash_attention_2", "flash_attention_3_hopper"]
        else attn_implementation
    )

    try:
        model = AutoModelForCausalLM.from_pretrained(
            model_name_or_path,
            torch_dtype=torch.bfloat16,
            attn_implementation=load_attn,
            trust_remote_code=True,
        )
        model = model.to(device)

        # Configure attention (patch if needed for FA3)
        model = configure_model_attention(model, attn_implementation, verbose=verbose)

        if verbose:
            total_params = sum(p.numel() for p in model.parameters())
            print(f"  Total parameters: {total_params:,} ({total_params/1e9:.2f}B)")
    except Exception as e:
        print(f"✗ Error loading model: {e}")
        sys.exit(1)

    # Load tokenizer
    if verbose:
        print("\n[3/7] Loading tokenizer...")

    try:
        tokenizer = AutoTokenizer.from_pretrained(
            model_name_or_path,
            trust_remote_code=True,
        )
    except Exception as e:
        print(f"✗ Error loading tokenizer: {e}")
        sys.exit(1)

    # Generate synthetic prompts
    if verbose:
        print("\n[4/7] Generating synthetic prompts...")
        print(f"  Prompt length: {prompt_length}")
        print(f"  Generation length: {generation_length}")

    # Create random input_ids (synthetic prompts)
    vocab_size = model.config.vocab_size

    # We'll create one prompt and reuse it
    prompt_ids = torch.randint(0, vocab_size, (1, prompt_length), device=device)

    # Warmup
    if verbose:
        print(f"\n[5/7] Running warmup ({warmup_requests} requests)...")

    model.eval()
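    # Warmup requests are not measured: they absorb one-time costs (CUDA context
    # and kernel initialization, allocator growth, any lazy module setup) so the
    # timed requests below reflect steady-state behavior.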
    with torch.no_grad():
        for _ in range(warmup_requests):
            _ = model.generate(
                prompt_ids,
                max_new_tokens=generation_length,
                do_sample=False,
                pad_token_id=tokenizer.eos_token_id,
            )

    # Synchronize before benchmarking
    torch.cuda.synchronize()

    # Benchmark
    if verbose:
        print(f"\n[6/7] Running benchmark ({num_requests} requests)...")

    # Storage for per-request metrics
    prefill_times = []
    decode_times = []
    e2e_times = []
    prefill_energies = []
    decode_energies = []
    e2e_energies = []
    prefill_powers = []
    decode_powers = []
    memory_usage = []
    gpu_utils = []

    # For inference, we separate prefill (first token) from decode (remaining tokens).
    # We'll use a custom generation loop to measure them separately.
    for req_idx in tqdm(range(num_requests), desc="Benchmarking"):
        # === PREFILL PHASE (Time to First Token) ===
        # This is the forward pass with the prompt to get the first token
        monitor.start_monitoring()
        torch.cuda.synchronize()
        prefill_start = time.perf_counter()

        with torch.no_grad():
            # Forward pass with prompt
            outputs = model(input_ids=prompt_ids, use_cache=True)
            logits = outputs.logits
            past_key_values = outputs.past_key_values

            # Get first generated token
            next_token_logits = logits[:, -1, :]
            next_token = torch.argmax(next_token_logits, dim=-1, keepdim=True)

        torch.cuda.synchronize()
        prefill_time = time.perf_counter() - prefill_start
        prefill_energy = monitor.get_energy_consumed()
        prefill_power = monitor.get_average_power()

        prefill_times.append(prefill_time * 1000)  # Convert to ms
        prefill_energies.append(prefill_energy)
        prefill_powers.append(prefill_power)

        # === DECODE PHASE (Inter-Token Latency) ===
        # Generate remaining tokens one by one
        monitor.start_monitoring()
        torch.cuda.synchronize()
        decode_start = time.perf_counter()

        generated_tokens = [next_token]

        with torch.no_grad():
            for _ in range(generation_length - 1):
                # Forward pass with single token using cached keys/values
                outputs = model(
                    input_ids=next_token,
                    past_key_values=past_key_values,
                    use_cache=True,
                )
                logits = outputs.logits
                past_key_values = outputs.past_key_values

                # Get next token
                next_token_logits = logits[:, -1, :]
                next_token = torch.argmax(next_token_logits, dim=-1, keepdim=True)
                generated_tokens.append(next_token)

        torch.cuda.synchronize()
        decode_time = time.perf_counter() - decode_start
        decode_energy = monitor.get_energy_consumed()
        decode_power = monitor.get_average_power()

        decode_times.append(decode_time * 1000)  # Convert to ms
        decode_energies.append(decode_energy)
        decode_powers.append(decode_power)

        # End-to-end metrics
        e2e_time = prefill_time + decode_time
        e2e_energy = prefill_energy + decode_energy
        e2e_times.append(e2e_time * 1000)  # Convert to ms
        e2e_energies.append(e2e_energy)

        # Get memory and utilization
        metrics = monitor.get_metrics()
        memory_usage.append(metrics.memory_used_gb)
        gpu_utils.append(metrics.gpu_utilization_percent)

    # Compute aggregated metrics

    # Prefill metrics (TTFT)
    prefill_duration_ms = sum(prefill_times)
    prefill_energy_j = sum(prefill_energies)
    prefill_tokens = prompt_length * num_requests
    prefill_tps = prefill_tokens / (prefill_duration_ms / 1000)
    prefill_ept = prefill_energy_j / prefill_tokens
    avg_ttft_ms = sum(prefill_times) / len(prefill_times)

    prefill_metrics = StageMetrics(
        stage_name="prefill",
        duration_ms=prefill_duration_ms,
        tokens_processed=prefill_tokens,
        tokens_per_second=prefill_tps,
        energy_joules=prefill_energy_j,
        energy_per_token=prefill_ept,
        avg_power_watts=sum(prefill_powers) / len(prefill_powers),
        peak_memory_gb=max(memory_usage),
        avg_gpu_util_percent=sum(gpu_utils) / len(gpu_utils),
    )

    # Decode metrics (ITL)
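    # Note: peak memory and average GPU utilization are sampled once per request
    # (after decode), so the prefill and decode StageMetrics report the same
    # memory/utilization figures rather than stage-specific ones.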
    decode_duration_ms = sum(decode_times)
    decode_energy_j = sum(decode_energies)
    decode_tokens = generation_length * num_requests
    decode_tps = decode_tokens / (decode_duration_ms / 1000)
    decode_ept = decode_energy_j / decode_tokens
    # The decode loop runs generation_length - 1 steps per request (the first
    # token is produced during prefill), so ITL averages over that step count.
    avg_itl_ms = sum(decode_times) / len(decode_times) / max(1, generation_length - 1)

    decode_metrics = StageMetrics(
        stage_name="decode",
        duration_ms=decode_duration_ms,
        tokens_processed=decode_tokens,
        tokens_per_second=decode_tps,
        energy_joules=decode_energy_j,
        energy_per_token=decode_ept,
        avg_power_watts=sum(decode_powers) / len(decode_powers),
        peak_memory_gb=max(memory_usage),
        avg_gpu_util_percent=sum(gpu_utils) / len(gpu_utils),
    )

    # End-to-end metrics
    e2e_latency_ms = sum(e2e_times) / len(e2e_times)
    e2e_energy_j = sum(e2e_energies)
    total_tokens = (prompt_length + generation_length) * num_requests
    e2e_tps = total_tokens / (sum(e2e_times) / 1000)
    e2e_ept = e2e_energy_j / total_tokens

    # Create metrics object
    metrics = InferenceMetrics(
        model_name=model_name_or_path,
        gpu_name=gpu_name,
        attention_implementation=attn_implementation,
        num_requests=num_requests,
        prompt_length=prompt_length,
        generation_length=generation_length,
        prefill=prefill_metrics,
        decode=decode_metrics,
        e2e_latency_ms=e2e_latency_ms,
        e2e_tokens_per_second=e2e_tps,
        e2e_energy_joules=e2e_energy_j,
        e2e_energy_per_token=e2e_ept,
        ttft_ms=avg_ttft_ms,
        itl_ms=avg_itl_ms,
    )

    # Print results
    if verbose:
        print()
    MetricsReporter.print_inference_metrics(metrics, verbose=verbose)

    # Save results
    if output_dir:
        output_path = Path(output_dir)
        output_path.mkdir(parents=True, exist_ok=True)

        # Save JSON
        json_path = output_path / f"inference_{gpu_name.replace(' ', '_')}_{attn_implementation}.json"
        MetricsReporter.save_json(metrics, json_path)

    # Cleanup
    monitor.cleanup()
    del model
    torch.cuda.empty_cache()

    return metrics


def main():
    parser = argparse.ArgumentParser(
        description="LLM Inference Benchmark",
        formatter_class=argparse.RawDescriptionHelpFormatter,
    )
    parser.add_argument(
        "--model-path",
        type=str,
        default="./model_cache",
        help="Path to cached model",
    )
    parser.add_argument(
        "--model-name",
        type=str,
        default="Qwen/Qwen3-4B",
        help="Model name (for reporting)",
    )
    parser.add_argument(
        "--attn-implementation",
        type=str,
        default="auto",
        choices=["auto", "flash_attention_2", "flash_attention_3_hopper", "sdpa", "eager"],
        help="Attention implementation to use",
    )
    parser.add_argument(
        "--num-requests",
        type=int,
        default=10,
        help="Number of inference requests",
    )
    parser.add_argument(
        "--prompt-length",
        type=int,
        default=512,
        help="Prompt length in tokens",
    )
    parser.add_argument(
        "--generation-length",
        type=int,
        default=100,
        help="Number of tokens to generate",
    )
    parser.add_argument(
        "--warmup-requests",
        type=int,
        default=2,
        help="Number of warmup requests",
    )
    parser.add_argument(
        "--device-id",
        type=int,
        default=0,
        help="GPU device ID",
    )
    parser.add_argument(
        "--output-dir",
        type=str,
        default="./results",
        help="Output directory for results",
    )

    args = parser.parse_args()

    # Set environment variables for HuggingFace cache
    if Path(args.model_path).exists():
        os.environ["HF_HOME"] = args.model_path

    benchmark_inference(
        model_name_or_path=args.model_name,
        attn_implementation=args.attn_implementation,
        num_requests=args.num_requests,
        prompt_length=args.prompt_length,
        generation_length=args.generation_length,
        warmup_requests=args.warmup_requests,
        device="cuda",
        device_id=args.device_id,
        output_dir=args.output_dir,
        verbose=True,
    )


if __name__ == "__main__":
    main()
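# ---------------------------------------------------------------------------
# Example usage (illustrative sketch only; the script filename
# `inference_benchmark.py` is assumed, and the flag values are arbitrary
# examples rather than recommended settings):
#
#   python inference_benchmark.py \
#       --model-name Qwen/Qwen3-4B \
#       --attn-implementation sdpa \
#       --num-requests 20 \
#       --prompt-length 1024 \
#       --generation-length 128 \
#       --output-dir ./results
#
# The benchmark can also be driven programmatically; assuming InferenceMetrics
# exposes its constructor fields as attributes (e.g. a dataclass), something
# like the following would work:
#
#   metrics = benchmark_inference(
#       "Qwen/Qwen3-4B",
#       attn_implementation="sdpa",
#       num_requests=5,
#       output_dir=None,
#   )
#   print(metrics.ttft_ms, metrics.itl_ms)
# ---------------------------------------------------------------------------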