#!/usr/bin/env python3
"""
Main LLM Benchmark Runner

Orchestrates pretraining and inference benchmarks with auto-detection
of GPU type and configuration.
"""

import argparse
import sys
from pathlib import Path

# Import benchmark functions
import benchmark_pretrain
import benchmark_inference
from utils.gpu_monitor import get_gpu_monitor, list_available_gpus
from utils.metrics import MetricsReporter


def main():
    parser = argparse.ArgumentParser(
        description="LLM Benchmark Suite - Compare GPU performance for pretraining and inference",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Examples:
  # Run both pretrain and inference benchmarks
  python run_benchmark.py --mode both

  # Run only pretraining benchmark
  python run_benchmark.py --mode pretrain --num-steps 20

  # Run inference with custom settings
  python run_benchmark.py --mode inference --num-requests 20 --generation-length 200

  # Use specific attention implementation
  python run_benchmark.py --attn-implementation flash_attention_3_hopper
"""
    )

    # Model configuration
    parser.add_argument(
        "--model-path",
        type=str,
        default="./model_cache",
        help="Path to cached model directory"
    )
    parser.add_argument(
        "--model-name",
        type=str,
        default="Qwen/Qwen3-4B",
        help="Model name for reporting"
    )

    # Benchmark mode
    parser.add_argument(
        "--mode",
        type=str,
        default="both",
        choices=["pretrain", "inference", "both"],
        help="Benchmark mode to run"
    )

    # Attention configuration
    parser.add_argument(
        "--attn-implementation",
        type=str,
        default="auto",
        choices=["auto", "flash_attention_2", "flash_attention_3_hopper", "sdpa", "eager"],
        help="Attention implementation (auto selects based on GPU)"
    )

    # Pretraining parameters
    pretrain_group = parser.add_argument_group("pretraining parameters")
    pretrain_group.add_argument(
        "--batch-size",
        type=int,
        default=3,
        help="Batch size for pretraining"
    )
    pretrain_group.add_argument(
        "--sequence-length",
        type=int,
        default=2048,
        help="Sequence length for pretraining"
    )
    pretrain_group.add_argument(
        "--num-steps",
        type=int,
        default=10,
        help="Number of training steps"
    )
    pretrain_group.add_argument(
        "--warmup-steps",
        type=int,
        default=3,
        help="Number of warmup steps"
    )

    # Inference parameters
    inference_group = parser.add_argument_group("inference parameters")
    inference_group.add_argument(
        "--num-requests",
        type=int,
        default=10,
        help="Number of inference requests"
    )
    inference_group.add_argument(
        "--prompt-length",
        type=int,
        default=512,
        help="Prompt length in tokens"
    )
    inference_group.add_argument(
        "--generation-length",
        type=int,
        default=100,
        help="Number of tokens to generate"
    )
    inference_group.add_argument(
        "--warmup-requests",
        type=int,
        default=2,
        help="Number of warmup requests"
    )

    # General parameters
    parser.add_argument(
        "--device-id",
        type=int,
        default=0,
        help="GPU device ID"
    )
    parser.add_argument(
        "--output-dir",
        type=str,
        default="./results",
        help="Output directory for results"
    )
    parser.add_argument(
        "--list-gpus",
        action="store_true",
        help="List available GPUs and exit"
    )

    args = parser.parse_args()

    # List GPUs if requested
    if args.list_gpus:
        print("Available GPUs:")
        gpus = list_available_gpus()
        if not gpus:
            print("  No GPUs found!")
        else:
            for gpu in gpus:
                print(f"  {gpu}")
        return

    # Print header
    print("=" * 80)
    print("LLM BENCHMARK SUITE")
    print("=" * 80)
    print(f"\nModel: {args.model_name}")
    print(f"Model Path: {args.model_path}")
    print(f"Mode: {args.mode}")
    print(f"Attention: {args.attn_implementation}")
    print(f"Output Directory: {args.output_dir}")

    # Detect GPU
    print("\nDetecting GPU...")
    try:
        monitor = get_gpu_monitor(args.device_id)
        gpu_name = monitor.get_device_name()
        print(f"  GPU {args.device_id}: {gpu_name}")
        monitor.cleanup()
    except Exception as e:
        print(f"✗ Error detecting GPU: {e}")
        sys.exit(1)

    # Create output directory
    output_path = Path(args.output_dir)
    output_path.mkdir(parents=True, exist_ok=True)

    # Run benchmarks
    pretrain_metrics = None
    inference_metrics = None

    if args.mode in ["pretrain", "both"]:
        print("\n" + "=" * 80)
        print("Running Pretraining Benchmark...")
        print("=" * 80)
        pretrain_metrics = benchmark_pretrain.benchmark_pretrain(
            model_name_or_path=args.model_name,
            attn_implementation=args.attn_implementation,
            batch_size=args.batch_size,
            sequence_length=args.sequence_length,
            num_steps=args.num_steps,
            warmup_steps=args.warmup_steps,
            device="cuda",
            device_id=args.device_id,
            output_dir=args.output_dir,
            verbose=True
        )

    if args.mode in ["inference", "both"]:
        print("\n" + "=" * 80)
        print("Running Inference Benchmark...")
        print("=" * 80)
        inference_metrics = benchmark_inference.benchmark_inference(
            model_name_or_path=args.model_name,
            attn_implementation=args.attn_implementation,
            num_requests=args.num_requests,
            prompt_length=args.prompt_length,
            generation_length=args.generation_length,
            warmup_requests=args.warmup_requests,
            device="cuda",
            device_id=args.device_id,
            output_dir=args.output_dir,
            verbose=True
        )

    # Summary
    print("\n" + "=" * 80)
    print("BENCHMARK COMPLETE")
    print("=" * 80)
    print(f"\nResults saved to: {output_path}")

    if pretrain_metrics:
        print("\nPretraining:")
        print(f"  Duration: {pretrain_metrics.total_duration_ms:.2f} ms")
        print(f"  Throughput: {pretrain_metrics.total_tokens_per_second:.2f} tokens/s")
        print(f"  Energy: {pretrain_metrics.total_energy_joules:.2f} J")
        print(f"  Energy/token: {pretrain_metrics.total_energy_per_token*1000:.4f} mJ/token")

    if inference_metrics:
        print("\nInference:")
        print(f"  TTFT: {inference_metrics.ttft_ms:.2f} ms")
        print(f"  ITL: {inference_metrics.itl_ms:.2f} ms/token")
        print(f"  Throughput: {inference_metrics.e2e_tokens_per_second:.2f} tokens/s")
        print(f"  Energy: {inference_metrics.e2e_energy_joules:.2f} J")
        print(f"  Energy/token: {inference_metrics.e2e_energy_per_token*1000:.4f} mJ/token")


if __name__ == "__main__":
    main()
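
# Note (assumption, inferred from usage in main() above): the helpers imported
# from utils.gpu_monitor are expected to expose roughly this interface; the
# authoritative signatures live in utils/gpu_monitor.py.
#
#     list_available_gpus() -> list[str]      # human-readable GPU descriptions
#     get_gpu_monitor(device_id: int) -> obj  # per-device monitor handle
#     obj.get_device_name() -> str            # e.g. the CUDA device name
#     obj.cleanup() -> None                   # release monitoring resources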