cocogoat/benchmark_inference.py
#!/usr/bin/env python3
"""
Inference Benchmark for LLM Performance Evaluation
Measures performance and energy metrics for inference workloads with
separate measurements for prefill and decode stages.
"""
import argparse
import os
import sys
import time
from pathlib import Path
from typing import Optional
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
from tqdm import tqdm
# Add utils to path
sys.path.insert(0, str(Path(__file__).parent))
from utils.gpu_monitor import get_gpu_monitor
from utils.metrics import StageMetrics, InferenceMetrics, MetricsReporter
from utils.attention import get_default_attention, configure_model_attention, validate_attention_for_gpu
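# gpu_monitor, metrics, and attention are project-local helpers expected under
# ./utils next to this script, which is why the script's own directory is
# added to sys.path above.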
def benchmark_inference(
    model_name_or_path: str,
    attn_implementation: str = "auto",
    num_requests: int = 10,
    prompt_length: int = 512,
    generation_length: int = 100,
    warmup_requests: int = 2,
    device: str = "cuda",
    device_id: int = 0,
    output_dir: Optional[str] = None,
    verbose: bool = True,
):
    """
    Run inference benchmark.

    Args:
        model_name_or_path: Path to model or HuggingFace identifier
        attn_implementation: Attention implementation to use
        num_requests: Number of inference requests to measure
        prompt_length: Length of input prompt
        generation_length: Number of tokens to generate
        warmup_requests: Number of warmup requests
        device: Device to use
        device_id: GPU device ID
        output_dir: Directory to save results
        verbose: Print verbose output
    """
print("=" * 80)
print("INFERENCE BENCHMARK")
print("=" * 80)
# Initialize GPU monitor
if verbose:
print("\n[1/7] Initializing GPU monitor...")
monitor = get_gpu_monitor(device_id)
gpu_name = monitor.get_device_name()
if verbose:
print(f" GPU: {gpu_name}")
# Determine attention implementation
if attn_implementation == "auto":
attn_implementation = get_default_attention(gpu_name)
if verbose:
print(f" Auto-selected attention: {attn_implementation}")
# Validate attention for GPU
valid, warning = validate_attention_for_gpu(attn_implementation, gpu_name)
if warning and verbose:
print(f"{warning}")
    # Load model
    if verbose:
        print(f"\n[2/7] Loading model: {model_name_or_path}")

    # Determine attn_implementation parameter for model loading
    load_attn = "flash_attention_2" if attn_implementation in ["flash_attention_2", "flash_attention_3_hopper"] else attn_implementation

    try:
        model = AutoModelForCausalLM.from_pretrained(
            model_name_or_path,
            torch_dtype=torch.bfloat16,
            attn_implementation=load_attn,
            trust_remote_code=True,
        )
        model = model.to(device)

        # Configure attention (patch if needed for FA3)
        model = configure_model_attention(model, attn_implementation, verbose=verbose)

        if verbose:
            total_params = sum(p.numel() for p in model.parameters())
            print(f" Total parameters: {total_params:,} ({total_params/1e9:.2f}B)")
    except Exception as e:
        print(f"✗ Error loading model: {e}")
        sys.exit(1)
    # Load tokenizer
    if verbose:
        print(f"\n[3/7] Loading tokenizer...")
    try:
        tokenizer = AutoTokenizer.from_pretrained(
            model_name_or_path,
            trust_remote_code=True
        )
    except Exception as e:
        print(f"✗ Error loading tokenizer: {e}")
        sys.exit(1)
    # Generate synthetic prompts
    if verbose:
        print(f"\n[4/7] Generating synthetic prompts...")
        print(f" Prompt length: {prompt_length}")
        print(f" Generation length: {generation_length}")

    # Create random input_ids (synthetic prompts)
    vocab_size = model.config.vocab_size
    # We'll create one prompt and reuse it
    prompt_ids = torch.randint(0, vocab_size, (1, prompt_length), device=device)
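    # Random token IDs are sufficient here: with greedy decoding of a fixed
    # length, latency and energy are driven by sequence lengths and model size
    # rather than by prompt content (the generated text itself is meaningless).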
    # Warmup
    if verbose:
        print(f"\n[5/7] Running warmup ({warmup_requests} requests)...")
    model.eval()
    with torch.no_grad():
        for _ in range(warmup_requests):
            _ = model.generate(
                prompt_ids,
                max_new_tokens=generation_length,
                do_sample=False,
                pad_token_id=tokenizer.eos_token_id
            )

    # Synchronize before benchmarking
    torch.cuda.synchronize()
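    # The warmup requests above let CUDA kernel selection, autotuning, and
    # allocator caches settle so the timed requests below reflect steady-state
    # behavior.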
    # Benchmark
    if verbose:
        print(f"\n[6/7] Running benchmark ({num_requests} requests)...")

    # Storage for per-request metrics
    prefill_times = []
    decode_times = []
    e2e_times = []
    prefill_energies = []
    decode_energies = []
    e2e_energies = []
    prefill_powers = []
    decode_powers = []
    memory_usage = []
    gpu_utils = []

    # For inference, we separate prefill (first token) from decode (remaining tokens)
    # We'll use a custom generation loop to measure them separately
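    # Terminology used below:
    #   TTFT (time to first token) = per-request prefill latency
    #   ITL  (inter-token latency) = average per-token latency during decode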
    for req_idx in tqdm(range(num_requests), desc="Benchmarking"):
        # === PREFILL PHASE (Time to First Token) ===
        # This is the forward pass with the prompt to get the first token
        monitor.start_monitoring()
        torch.cuda.synchronize()
        prefill_start = time.perf_counter()

        with torch.no_grad():
            # Forward pass with prompt
            outputs = model(input_ids=prompt_ids, use_cache=True)
            logits = outputs.logits
            past_key_values = outputs.past_key_values
            # Get first generated token
            next_token_logits = logits[:, -1, :]
            next_token = torch.argmax(next_token_logits, dim=-1, keepdim=True)

        torch.cuda.synchronize()
        prefill_time = time.perf_counter() - prefill_start
        prefill_energy = monitor.get_energy_consumed()
        prefill_power = monitor.get_average_power()

        prefill_times.append(prefill_time * 1000)  # Convert to ms
        prefill_energies.append(prefill_energy)
        prefill_powers.append(prefill_power)
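        # Assumption: get_energy_consumed()/get_average_power() report values
        # accumulated since the preceding start_monitoring() call (see
        # utils/gpu_monitor.py), which is why monitoring is restarted per stage.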
        # === DECODE PHASE (Inter-Token Latency) ===
        # Generate remaining tokens one by one
        monitor.start_monitoring()
        torch.cuda.synchronize()
        decode_start = time.perf_counter()

        generated_tokens = [next_token]
        with torch.no_grad():
            for _ in range(generation_length - 1):
                # Forward pass with single token using cached keys/values
                outputs = model(
                    input_ids=next_token,
                    past_key_values=past_key_values,
                    use_cache=True
                )
                logits = outputs.logits
                past_key_values = outputs.past_key_values
                # Get next token
                next_token_logits = logits[:, -1, :]
                next_token = torch.argmax(next_token_logits, dim=-1, keepdim=True)
                generated_tokens.append(next_token)

        torch.cuda.synchronize()
        decode_time = time.perf_counter() - decode_start
        decode_energy = monitor.get_energy_consumed()
        decode_power = monitor.get_average_power()

        decode_times.append(decode_time * 1000)  # Convert to ms
        decode_energies.append(decode_energy)
        decode_powers.append(decode_power)

        # End-to-end metrics
        e2e_time = prefill_time + decode_time
        e2e_energy = prefill_energy + decode_energy
        e2e_times.append(e2e_time * 1000)  # Convert to ms
        e2e_energies.append(e2e_energy)

        # Get memory and utilization
        metrics = monitor.get_metrics()
        memory_usage.append(metrics.memory_used_gb)
        gpu_utils.append(metrics.gpu_utilization_percent)
    # Compute aggregated metrics
    # Prefill metrics (TTFT)
    prefill_duration_ms = sum(prefill_times)
    prefill_energy_j = sum(prefill_energies)
    prefill_tokens = prompt_length * num_requests
    prefill_tps = prefill_tokens / (prefill_duration_ms / 1000)
    prefill_ept = prefill_energy_j / prefill_tokens
    avg_ttft_ms = sum(prefill_times) / len(prefill_times)

    prefill_metrics = StageMetrics(
        stage_name="prefill",
        duration_ms=prefill_duration_ms,
        tokens_processed=prefill_tokens,
        tokens_per_second=prefill_tps,
        energy_joules=prefill_energy_j,
        energy_per_token=prefill_ept,
        avg_power_watts=sum(prefill_powers) / len(prefill_powers),
        peak_memory_gb=max(memory_usage),
        avg_gpu_util_percent=sum(gpu_utils) / len(gpu_utils)
    )
    # Decode metrics (ITL)
    decode_duration_ms = sum(decode_times)
    decode_energy_j = sum(decode_energies)
    decode_tokens = generation_length * num_requests
    decode_tps = decode_tokens / (decode_duration_ms / 1000)
    decode_ept = decode_energy_j / decode_tokens
    # The decode loop runs generation_length - 1 steps (the first token comes
    # from prefill), so average each request's decode time over those steps.
    avg_itl_ms = sum(decode_times) / len(decode_times) / max(generation_length - 1, 1)

    decode_metrics = StageMetrics(
        stage_name="decode",
        duration_ms=decode_duration_ms,
        tokens_processed=decode_tokens,
        tokens_per_second=decode_tps,
        energy_joules=decode_energy_j,
        energy_per_token=decode_ept,
        avg_power_watts=sum(decode_powers) / len(decode_powers),
        peak_memory_gb=max(memory_usage),
        avg_gpu_util_percent=sum(gpu_utils) / len(gpu_utils)
    )
    # End-to-end metrics
    e2e_latency_ms = sum(e2e_times) / len(e2e_times)
    e2e_energy_j = sum(e2e_energies)
    total_tokens = (prompt_length + generation_length) * num_requests
    e2e_tps = total_tokens / (sum(e2e_times) / 1000)
    e2e_ept = e2e_energy_j / total_tokens

    # Create metrics object
    metrics = InferenceMetrics(
        model_name=model_name_or_path,
        gpu_name=gpu_name,
        attention_implementation=attn_implementation,
        num_requests=num_requests,
        prompt_length=prompt_length,
        generation_length=generation_length,
        prefill=prefill_metrics,
        decode=decode_metrics,
        e2e_latency_ms=e2e_latency_ms,
        e2e_tokens_per_second=e2e_tps,
        e2e_energy_joules=e2e_energy_j,
        e2e_energy_per_token=e2e_ept,
        ttft_ms=avg_ttft_ms,
        itl_ms=avg_itl_ms
    )
    # Print results
    if verbose:
        print()
    MetricsReporter.print_inference_metrics(metrics, verbose=verbose)

    # Save results
    if output_dir:
        if verbose:
            print(f"\n[7/7] Saving results to {output_dir}...")
        output_path = Path(output_dir)
        output_path.mkdir(parents=True, exist_ok=True)

        # Save JSON
        json_path = output_path / f"inference_{gpu_name.replace(' ', '_')}_{attn_implementation}.json"
        MetricsReporter.save_json(metrics, json_path)

    # Cleanup
    monitor.cleanup()
    del model
    torch.cuda.empty_cache()

    return metrics
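# Programmatic usage (sketch; assumes a CUDA-capable GPU and that transformers
# can resolve the model weights):
#   from benchmark_inference import benchmark_inference
#   metrics = benchmark_inference(
#       "Qwen/Qwen3-4B",
#       attn_implementation="sdpa",
#       num_requests=5,
#       output_dir="./results",
#   )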
def main():
    parser = argparse.ArgumentParser(
        description="LLM Inference Benchmark",
        formatter_class=argparse.RawDescriptionHelpFormatter
    )
    parser.add_argument(
        "--model-path",
        type=str,
        default="./model_cache",
        help="Path to cached model"
    )
    parser.add_argument(
        "--model-name",
        type=str,
        default="Qwen/Qwen3-4B",
        help="Model name (for reporting)"
    )
    parser.add_argument(
        "--attn-implementation",
        type=str,
        default="auto",
        choices=["auto", "flash_attention_2", "flash_attention_3_hopper", "sdpa", "eager"],
        help="Attention implementation to use"
    )
    parser.add_argument(
        "--num-requests",
        type=int,
        default=10,
        help="Number of inference requests"
    )
    parser.add_argument(
        "--prompt-length",
        type=int,
        default=512,
        help="Prompt length in tokens"
    )
    parser.add_argument(
        "--generation-length",
        type=int,
        default=100,
        help="Number of tokens to generate"
    )
    parser.add_argument(
        "--warmup-requests",
        type=int,
        default=2,
        help="Number of warmup requests"
    )
    parser.add_argument(
        "--device-id",
        type=int,
        default=0,
        help="GPU device ID"
    )
    parser.add_argument(
        "--output-dir",
        type=str,
        default="./results",
        help="Output directory for results"
    )
    args = parser.parse_args()
    # Set environment variables for HuggingFace cache
    if Path(args.model_path).exists():
        os.environ['HF_HOME'] = args.model_path
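    # Note: HF_HOME is typically read by huggingface_hub/transformers at import
    # time, so setting it here may not redirect the cache for this process;
    # exporting HF_HOME before launching the script is the more reliable option.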
    benchmark_inference(
        model_name_or_path=args.model_name,
        attn_implementation=args.attn_implementation,
        num_requests=args.num_requests,
        prompt_length=args.prompt_length,
        generation_length=args.generation_length,
        warmup_requests=args.warmup_requests,
        device="cuda",
        device_id=args.device_id,
        output_dir=args.output_dir,
        verbose=True
    )


if __name__ == "__main__":
    main()