cocogoat/run_benchmark.py

#!/usr/bin/env python3
"""
Main LLM Benchmark Runner

Orchestrates pretraining and inference benchmarks with auto-detection
of GPU type and configuration.
"""

import argparse
import sys
from pathlib import Path

# Import benchmark functions
import benchmark_pretrain
import benchmark_inference

from utils.gpu_monitor import get_gpu_monitor, list_available_gpus
from utils.metrics import MetricsReporter


def main():
    parser = argparse.ArgumentParser(
        description="LLM Benchmark Suite - Compare GPU performance for pretraining and inference",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Examples:
  # Run both pretrain and inference benchmarks
  python run_benchmark.py --mode both

  # Run only pretraining benchmark
  python run_benchmark.py --mode pretrain --num-steps 20

  # Run inference with custom settings
  python run_benchmark.py --mode inference --num-requests 20 --generation-length 200

  # Use specific attention implementation
  python run_benchmark.py --attn-implementation flash_attention_3_hopper
        """
    )

    # Model configuration
    parser.add_argument(
        "--model-path",
        type=str,
        default="./model_cache",
        help="Path to cached model directory"
    )

    parser.add_argument(
        "--model-name",
        type=str,
        default="Qwen/Qwen3-4B",
        help="Model name for reporting"
    )

    # Benchmark mode
    parser.add_argument(
        "--mode",
        type=str,
        default="both",
        choices=["pretrain", "inference", "both"],
        help="Benchmark mode to run"
    )

    # Attention configuration
    parser.add_argument(
        "--attn-implementation",
        type=str,
        default="auto",
        choices=["auto", "flash_attention_2", "flash_attention_3_hopper", "sdpa", "eager"],
        help="Attention implementation (auto selects based on GPU)"
    )

    # Pretraining parameters
    pretrain_group = parser.add_argument_group("pretraining parameters")
    pretrain_group.add_argument(
        "--batch-size",
        type=int,
        default=3,
        help="Batch size for pretraining"
    )
    pretrain_group.add_argument(
        "--sequence-length",
        type=int,
        default=2048,
        help="Sequence length for pretraining"
    )
    pretrain_group.add_argument(
        "--num-steps",
        type=int,
        default=10,
        help="Number of training steps"
    )
    pretrain_group.add_argument(
        "--warmup-steps",
        type=int,
        default=3,
        help="Number of warmup steps"
    )

    # Inference parameters
    inference_group = parser.add_argument_group("inference parameters")
    inference_group.add_argument(
        "--num-requests",
        type=int,
        default=10,
        help="Number of inference requests"
    )
    inference_group.add_argument(
        "--prompt-length",
        type=int,
        default=512,
        help="Prompt length in tokens"
    )
    inference_group.add_argument(
        "--generation-length",
        type=int,
        default=100,
        help="Number of tokens to generate"
    )
    inference_group.add_argument(
        "--warmup-requests",
        type=int,
        default=2,
        help="Number of warmup requests"
    )

    # General parameters
    parser.add_argument(
        "--device-id",
        type=int,
        default=0,
        help="GPU device ID"
    )
    parser.add_argument(
        "--output-dir",
        type=str,
        default="./results",
        help="Output directory for results"
    )
    parser.add_argument(
        "--list-gpus",
        action="store_true",
        help="List available GPUs and exit"
    )

    args = parser.parse_args()

    # List GPUs if requested
    if args.list_gpus:
        print("Available GPUs:")
        gpus = list_available_gpus()
        if not gpus:
            print("  No GPUs found!")
        else:
            for gpu in gpus:
                print(f"  {gpu}")
        return

    # Print header
    print("=" * 80)
    print("LLM BENCHMARK SUITE")
    print("=" * 80)
    print(f"\nModel: {args.model_name}")
    print(f"Model Path: {args.model_path}")
    print(f"Mode: {args.mode}")
    print(f"Attention: {args.attn_implementation}")
    print(f"Output Directory: {args.output_dir}")

    # Detect GPU
    print("\nDetecting GPU...")
    try:
        monitor = get_gpu_monitor(args.device_id)
        gpu_name = monitor.get_device_name()
        print(f"  GPU {args.device_id}: {gpu_name}")
        monitor.cleanup()
    except Exception as e:
        print(f"✗ Error detecting GPU: {e}")
        sys.exit(1)

    # Create output directory
    output_path = Path(args.output_dir)
    output_path.mkdir(parents=True, exist_ok=True)

    # Run benchmarks
    pretrain_metrics = None
    inference_metrics = None

    if args.mode in ["pretrain", "both"]:
        print("\n" + "=" * 80)
        print("Running Pretraining Benchmark...")
        print("=" * 80)

        pretrain_metrics = benchmark_pretrain.benchmark_pretrain(
            model_name_or_path=args.model_name,
            attn_implementation=args.attn_implementation,
            batch_size=args.batch_size,
            sequence_length=args.sequence_length,
            num_steps=args.num_steps,
            warmup_steps=args.warmup_steps,
            device="cuda",
            device_id=args.device_id,
            output_dir=args.output_dir,
            verbose=True
        )

    if args.mode in ["inference", "both"]:
        print("\n" + "=" * 80)
        print("Running Inference Benchmark...")
        print("=" * 80)

        inference_metrics = benchmark_inference.benchmark_inference(
            model_name_or_path=args.model_name,
            attn_implementation=args.attn_implementation,
            num_requests=args.num_requests,
            prompt_length=args.prompt_length,
            generation_length=args.generation_length,
            warmup_requests=args.warmup_requests,
            device="cuda",
            device_id=args.device_id,
            output_dir=args.output_dir,
            verbose=True
        )

    # Summary
    print("\n" + "=" * 80)
    print("BENCHMARK COMPLETE")
    print("=" * 80)
    print(f"\nResults saved to: {output_path}")

    if pretrain_metrics:
        print(f"\nPretraining:")
        print(f"  Duration: {pretrain_metrics.total_duration_ms:.2f} ms")
        print(f"  Throughput: {pretrain_metrics.total_tokens_per_second:.2f} tokens/s")
        print(f"  Energy: {pretrain_metrics.total_energy_joules:.2f} J")
        print(f"  Energy/token: {pretrain_metrics.total_energy_per_token*1000:.4f} mJ/token")

    if inference_metrics:
        print(f"\nInference:")
        print(f"  TTFT: {inference_metrics.ttft_ms:.2f} ms")
        print(f"  ITL: {inference_metrics.itl_ms:.2f} ms/token")
        print(f"  Throughput: {inference_metrics.e2e_tokens_per_second:.2f} tokens/s")
        print(f"  Energy: {inference_metrics.e2e_energy_joules:.2f} J")
        print(f"  Energy/token: {inference_metrics.e2e_energy_per_token*1000:.4f} mJ/token")


if __name__ == "__main__":
    main()