Initial commit
417
benchmark_inference.py
Executable file
@@ -0,0 +1,417 @@
#!/usr/bin/env python3
"""
Inference Benchmark for LLM Performance Evaluation

Measures performance and energy metrics for inference workloads with
separate measurements for prefill and decode stages.
"""

import argparse
import os
import sys
import time
from pathlib import Path
from typing import Optional

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
from tqdm import tqdm

# Add utils to path
sys.path.insert(0, str(Path(__file__).parent))
from utils.gpu_monitor import get_gpu_monitor
from utils.metrics import StageMetrics, InferenceMetrics, MetricsReporter
from utils.attention import get_default_attention, configure_model_attention, validate_attention_for_gpu


def benchmark_inference(
    model_name_or_path: str,
    attn_implementation: str = "auto",
    num_requests: int = 10,
    prompt_length: int = 512,
    generation_length: int = 100,
    warmup_requests: int = 2,
    device: str = "cuda",
    device_id: int = 0,
    output_dir: Optional[str] = None,
    verbose: bool = True,
):
    """
    Run inference benchmark.

    Args:
        model_name_or_path: Path to model or HuggingFace identifier
        attn_implementation: Attention implementation to use
        num_requests: Number of inference requests to measure
        prompt_length: Length of input prompt
        generation_length: Number of tokens to generate
        warmup_requests: Number of warmup requests
        device: Device to use
        device_id: GPU device ID
        output_dir: Directory to save results
        verbose: Print verbose output
    """
    print("=" * 80)
    print("INFERENCE BENCHMARK")
    print("=" * 80)

    # Initialize GPU monitor
    if verbose:
        print("\n[1/7] Initializing GPU monitor...")
    monitor = get_gpu_monitor(device_id)
    gpu_name = monitor.get_device_name()
    if verbose:
        print(f" GPU: {gpu_name}")

    # Determine attention implementation
    if attn_implementation == "auto":
        attn_implementation = get_default_attention(gpu_name)
        if verbose:
            print(f" Auto-selected attention: {attn_implementation}")

    # Validate attention for GPU
    valid, warning = validate_attention_for_gpu(attn_implementation, gpu_name)
    if warning and verbose:
        print(f" ⚠ {warning}")

    # Load model
    if verbose:
        print(f"\n[2/7] Loading model: {model_name_or_path}")

    # Determine attn_implementation parameter for model loading
    load_attn = "flash_attention_2" if attn_implementation in ["flash_attention_2", "flash_attention_3_hopper"] else attn_implementation

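    # Note: "flash_attention_3_hopper" is not passed to from_pretrained() directly; the model
    # is loaded via the flash_attention_2 path and configure_model_attention() patches in the
    # FA3 kernels afterwards (see below).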
    try:
        model = AutoModelForCausalLM.from_pretrained(
            model_name_or_path,
            torch_dtype=torch.bfloat16,
            attn_implementation=load_attn,
            trust_remote_code=True,
        )
        model = model.to(device)

        # Configure attention (patch if needed for FA3)
        model = configure_model_attention(model, attn_implementation, verbose=verbose)

        if verbose:
            total_params = sum(p.numel() for p in model.parameters())
            print(f" Total parameters: {total_params:,} ({total_params/1e9:.2f}B)")
    except Exception as e:
        print(f"✗ Error loading model: {e}")
        sys.exit(1)

    # Load tokenizer
    if verbose:
        print(f"\n[3/7] Loading tokenizer...")
    try:
        tokenizer = AutoTokenizer.from_pretrained(
            model_name_or_path,
            trust_remote_code=True
        )
    except Exception as e:
        print(f"✗ Error loading tokenizer: {e}")
        sys.exit(1)

    # Generate synthetic prompts
    if verbose:
        print(f"\n[4/7] Generating synthetic prompts...")
        print(f" Prompt length: {prompt_length}")
        print(f" Generation length: {generation_length}")

    # Create random input_ids (synthetic prompts)
    vocab_size = model.config.vocab_size
    # We'll create one prompt and reuse it
    prompt_ids = torch.randint(0, vocab_size, (1, prompt_length), device=device)
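    # The random token IDs produce meaningless text, but they exercise the same prefill and
    # decode compute paths as a real prompt, which is what the timing and energy measurements need.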

    # Warmup
    if verbose:
        print(f"\n[5/7] Running warmup ({warmup_requests} requests)...")
    model.eval()
    with torch.no_grad():
        for _ in range(warmup_requests):
            _ = model.generate(
                prompt_ids,
                max_new_tokens=generation_length,
                do_sample=False,
                pad_token_id=tokenizer.eos_token_id
            )

    # Synchronize before benchmarking
    torch.cuda.synchronize()
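    # Warmup keeps one-time costs (CUDA context setup, kernel selection, allocator growth)
    # out of the measured requests; the synchronize above ensures all warmup work has
    # finished before timing starts.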

    # Benchmark
    if verbose:
        print(f"\n[6/7] Running benchmark ({num_requests} requests)...")

    # Storage for per-request metrics
    prefill_times = []
    decode_times = []
    e2e_times = []

    prefill_energies = []
    decode_energies = []
    e2e_energies = []

    prefill_powers = []
    decode_powers = []

    memory_usage = []
    gpu_utils = []

    # For inference, we separate prefill (first token) from decode (remaining tokens)
    # We'll use a custom generation loop to measure them separately
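    # Per request: the prefill phase times the full-prompt forward pass plus selection of the
    # first output token (i.e., TTFT), and the decode phase times the remaining
    # generation_length - 1 greedy steps that reuse the KV cache (the basis for ITL).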

    for req_idx in tqdm(range(num_requests), desc="Benchmarking"):
        # === PREFILL PHASE (Time to First Token) ===
        # This is the forward pass with the prompt to get the first token

        monitor.start_monitoring()
        torch.cuda.synchronize()
        prefill_start = time.perf_counter()

        with torch.no_grad():
            # Forward pass with prompt
            outputs = model(input_ids=prompt_ids, use_cache=True)
            logits = outputs.logits
            past_key_values = outputs.past_key_values

            # Get first generated token
            next_token_logits = logits[:, -1, :]
            next_token = torch.argmax(next_token_logits, dim=-1, keepdim=True)

        torch.cuda.synchronize()
        prefill_time = time.perf_counter() - prefill_start
        prefill_energy = monitor.get_energy_consumed()
        prefill_power = monitor.get_average_power()

        prefill_times.append(prefill_time * 1000)  # Convert to ms
        prefill_energies.append(prefill_energy)
        prefill_powers.append(prefill_power)

        # === DECODE PHASE (Inter-Token Latency) ===
        # Generate remaining tokens one by one

        monitor.start_monitoring()
        torch.cuda.synchronize()
        decode_start = time.perf_counter()

        generated_tokens = [next_token]

        with torch.no_grad():
            for _ in range(generation_length - 1):
                # Forward pass with single token using cached keys/values
                outputs = model(
                    input_ids=next_token,
                    past_key_values=past_key_values,
                    use_cache=True
                )
                logits = outputs.logits
                past_key_values = outputs.past_key_values

                # Get next token
                next_token_logits = logits[:, -1, :]
                next_token = torch.argmax(next_token_logits, dim=-1, keepdim=True)
                generated_tokens.append(next_token)

        torch.cuda.synchronize()
        decode_time = time.perf_counter() - decode_start
        decode_energy = monitor.get_energy_consumed()
        decode_power = monitor.get_average_power()

        decode_times.append(decode_time * 1000)  # Convert to ms
        decode_energies.append(decode_energy)
        decode_powers.append(decode_power)

        # End-to-end metrics
        e2e_time = prefill_time + decode_time
        e2e_energy = prefill_energy + decode_energy

        e2e_times.append(e2e_time * 1000)  # Convert to ms
        e2e_energies.append(e2e_energy)

        # Get memory and utilization
        metrics = monitor.get_metrics()
        memory_usage.append(metrics.memory_used_gb)
        gpu_utils.append(metrics.gpu_utilization_percent)

    # Compute aggregated metrics

    # Prefill metrics (TTFT)
    prefill_duration_ms = sum(prefill_times)
    prefill_energy_j = sum(prefill_energies)
    prefill_tokens = prompt_length * num_requests
    prefill_tps = prefill_tokens / (prefill_duration_ms / 1000)
    prefill_ept = prefill_energy_j / prefill_tokens
    avg_ttft_ms = sum(prefill_times) / len(prefill_times)

    prefill_metrics = StageMetrics(
        stage_name="prefill",
        duration_ms=prefill_duration_ms,
        tokens_processed=prefill_tokens,
        tokens_per_second=prefill_tps,
        energy_joules=prefill_energy_j,
        energy_per_token=prefill_ept,
        avg_power_watts=sum(prefill_powers) / len(prefill_powers),
        peak_memory_gb=max(memory_usage),
        avg_gpu_util_percent=sum(gpu_utils) / len(gpu_utils)
    )

    # Decode metrics (ITL)
    decode_duration_ms = sum(decode_times)
    decode_energy_j = sum(decode_energies)
    decode_tokens = generation_length * num_requests
    decode_tps = decode_tokens / (decode_duration_ms / 1000)
    decode_ept = decode_energy_j / decode_tokens
    avg_itl_ms = sum(decode_times) / len(decode_times) / generation_length
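    # Accounting note: the decode loop issues generation_length - 1 forward passes per request;
    # the first generated token is produced during the prefill pass, but it is counted toward
    # the decode stage's token total (and the ITL average) here by convention.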

    decode_metrics = StageMetrics(
        stage_name="decode",
        duration_ms=decode_duration_ms,
        tokens_processed=decode_tokens,
        tokens_per_second=decode_tps,
        energy_joules=decode_energy_j,
        energy_per_token=decode_ept,
        avg_power_watts=sum(decode_powers) / len(decode_powers),
        peak_memory_gb=max(memory_usage),
        avg_gpu_util_percent=sum(gpu_utils) / len(gpu_utils)
    )

    # End-to-end metrics
    e2e_latency_ms = sum(e2e_times) / len(e2e_times)
    e2e_energy_j = sum(e2e_energies)
    total_tokens = (prompt_length + generation_length) * num_requests
    e2e_tps = total_tokens / (sum(e2e_times) / 1000)
    e2e_ept = e2e_energy_j / total_tokens

    # Create metrics object
    metrics = InferenceMetrics(
        model_name=model_name_or_path,
        gpu_name=gpu_name,
        attention_implementation=attn_implementation,
        num_requests=num_requests,
        prompt_length=prompt_length,
        generation_length=generation_length,
        prefill=prefill_metrics,
        decode=decode_metrics,
        e2e_latency_ms=e2e_latency_ms,
        e2e_tokens_per_second=e2e_tps,
        e2e_energy_joules=e2e_energy_j,
        e2e_energy_per_token=e2e_ept,
        ttft_ms=avg_ttft_ms,
        itl_ms=avg_itl_ms
    )

    # Print results
    if verbose:
        print()
    MetricsReporter.print_inference_metrics(metrics, verbose=verbose)

    # Save results
    if output_dir:
        output_path = Path(output_dir)
        output_path.mkdir(parents=True, exist_ok=True)

        # Save JSON
        json_path = output_path / f"inference_{gpu_name.replace(' ', '_')}_{attn_implementation}.json"
        MetricsReporter.save_json(metrics, json_path)

    # Cleanup
    monitor.cleanup()
    del model
    torch.cuda.empty_cache()

    return metrics

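# Example invocation (using this script's own defaults; adjust model and paths to your setup):
#   python benchmark_inference.py --model-name Qwen/Qwen3-4B --attn-implementation auto \
#       --num-requests 10 --prompt-length 512 --generation-length 100 --output-dir ./results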
def main():
    parser = argparse.ArgumentParser(
        description="LLM Inference Benchmark",
        formatter_class=argparse.RawDescriptionHelpFormatter
    )

    parser.add_argument(
        "--model-path",
        type=str,
        default="./model_cache",
        help="Path to cached model"
    )

    parser.add_argument(
        "--model-name",
        type=str,
        default="Qwen/Qwen3-4B",
        help="Model name or HuggingFace identifier to load (also used for reporting)"
    )

    parser.add_argument(
        "--attn-implementation",
        type=str,
        default="auto",
        choices=["auto", "flash_attention_2", "flash_attention_3_hopper", "sdpa", "eager"],
        help="Attention implementation to use"
    )

    parser.add_argument(
        "--num-requests",
        type=int,
        default=10,
        help="Number of inference requests"
    )

    parser.add_argument(
        "--prompt-length",
        type=int,
        default=512,
        help="Prompt length in tokens"
    )

    parser.add_argument(
        "--generation-length",
        type=int,
        default=100,
        help="Number of tokens to generate"
    )

    parser.add_argument(
        "--warmup-requests",
        type=int,
        default=2,
        help="Number of warmup requests"
    )

    parser.add_argument(
        "--device-id",
        type=int,
        default=0,
        help="GPU device ID"
    )

    parser.add_argument(
        "--output-dir",
        type=str,
        default="./results",
        help="Output directory for results"
    )

    args = parser.parse_args()

    # Set environment variables for HuggingFace cache
    if Path(args.model_path).exists():
        os.environ['HF_HOME'] = args.model_path

    benchmark_inference(
        model_name_or_path=args.model_name,
        attn_implementation=args.attn_implementation,
        num_requests=args.num_requests,
        prompt_length=args.prompt_length,
        generation_length=args.generation_length,
        warmup_requests=args.warmup_requests,
        device="cuda",
        device_id=args.device_id,
        output_dir=args.output_dir,
        verbose=True
    )


if __name__ == "__main__":
    main()