Initial commit

Author: Bole Ma
Date: 2026-02-05 23:18:26 +01:00
Commit: 747c92ac6b
31 changed files with 4220 additions and 0 deletions

run_benchmark.py (new executable file, +248 lines)

@@ -0,0 +1,248 @@
#!/usr/bin/env python3
"""
Main LLM Benchmark Runner
Orchestrates pretraining and inference benchmarks with auto-detection
of GPU type and configuration.
"""
import argparse
import sys
from pathlib import Path

# Import benchmark functions
import benchmark_pretrain
import benchmark_inference
from utils.gpu_monitor import get_gpu_monitor, list_available_gpus
from utils.metrics import MetricsReporter


def main():
    parser = argparse.ArgumentParser(
        description="LLM Benchmark Suite - Compare GPU performance for pretraining and inference",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Examples:
  # Run both pretrain and inference benchmarks
  python run_benchmark.py --mode both

  # Run only pretraining benchmark
  python run_benchmark.py --mode pretrain --num-steps 20

  # Run inference with custom settings
  python run_benchmark.py --mode inference --num-requests 20 --generation-length 200

  # Use specific attention implementation
  python run_benchmark.py --attn-implementation flash_attention_3_hopper
"""
    )
    # Model configuration
    parser.add_argument(
        "--model-path",
        type=str,
        default="./model_cache",
        help="Path to cached model directory"
    )
    parser.add_argument(
        "--model-name",
        type=str,
        default="Qwen/Qwen3-4B",
        help="Model name for reporting"
    )

    # Benchmark mode
    parser.add_argument(
        "--mode",
        type=str,
        default="both",
        choices=["pretrain", "inference", "both"],
        help="Benchmark mode to run"
    )

    # Attention configuration
    parser.add_argument(
        "--attn-implementation",
        type=str,
        default="auto",
        choices=["auto", "flash_attention_2", "flash_attention_3_hopper", "sdpa", "eager"],
        help="Attention implementation (auto selects based on GPU)"
    )

    # Pretraining parameters
    pretrain_group = parser.add_argument_group("pretraining parameters")
    pretrain_group.add_argument(
        "--batch-size",
        type=int,
        default=3,
        help="Batch size for pretraining"
    )
    pretrain_group.add_argument(
        "--sequence-length",
        type=int,
        default=2048,
        help="Sequence length for pretraining"
    )
    pretrain_group.add_argument(
        "--num-steps",
        type=int,
        default=10,
        help="Number of training steps"
    )
    pretrain_group.add_argument(
        "--warmup-steps",
        type=int,
        default=3,
        help="Number of warmup steps"
    )

    # Inference parameters
    inference_group = parser.add_argument_group("inference parameters")
    inference_group.add_argument(
        "--num-requests",
        type=int,
        default=10,
        help="Number of inference requests"
    )
    inference_group.add_argument(
        "--prompt-length",
        type=int,
        default=512,
        help="Prompt length in tokens"
    )
    inference_group.add_argument(
        "--generation-length",
        type=int,
        default=100,
        help="Number of tokens to generate"
    )
    inference_group.add_argument(
        "--warmup-requests",
        type=int,
        default=2,
        help="Number of warmup requests"
    )

    # General parameters
    parser.add_argument(
        "--device-id",
        type=int,
        default=0,
        help="GPU device ID"
    )
    parser.add_argument(
        "--output-dir",
        type=str,
        default="./results",
        help="Output directory for results"
    )
    parser.add_argument(
        "--list-gpus",
        action="store_true",
        help="List available GPUs and exit"
    )

    args = parser.parse_args()
    # List GPUs if requested
    if args.list_gpus:
        print("Available GPUs:")
        gpus = list_available_gpus()
        if not gpus:
            print(" No GPUs found!")
        else:
            for gpu in gpus:
                print(f" {gpu}")
        return

    # Print header
    print("=" * 80)
    print("LLM BENCHMARK SUITE")
    print("=" * 80)
    print(f"\nModel: {args.model_name}")
    print(f"Model Path: {args.model_path}")
    print(f"Mode: {args.mode}")
    print(f"Attention: {args.attn_implementation}")
    print(f"Output Directory: {args.output_dir}")

    # Detect GPU
    print("\nDetecting GPU...")
    try:
        monitor = get_gpu_monitor(args.device_id)
        gpu_name = monitor.get_device_name()
        print(f" GPU {args.device_id}: {gpu_name}")
        monitor.cleanup()
    except Exception as e:
        print(f"✗ Error detecting GPU: {e}")
        sys.exit(1)

    # Create output directory
    output_path = Path(args.output_dir)
    output_path.mkdir(parents=True, exist_ok=True)
    # Run benchmarks
    pretrain_metrics = None
    inference_metrics = None

    if args.mode in ["pretrain", "both"]:
        print("\n" + "=" * 80)
        print("Running Pretraining Benchmark...")
        print("=" * 80)
        pretrain_metrics = benchmark_pretrain.benchmark_pretrain(
            model_name_or_path=args.model_name,
            attn_implementation=args.attn_implementation,
            batch_size=args.batch_size,
            sequence_length=args.sequence_length,
            num_steps=args.num_steps,
            warmup_steps=args.warmup_steps,
            device="cuda",
            device_id=args.device_id,
            output_dir=args.output_dir,
            verbose=True
        )

    if args.mode in ["inference", "both"]:
        print("\n" + "=" * 80)
        print("Running Inference Benchmark...")
        print("=" * 80)
        inference_metrics = benchmark_inference.benchmark_inference(
            model_name_or_path=args.model_name,
            attn_implementation=args.attn_implementation,
            num_requests=args.num_requests,
            prompt_length=args.prompt_length,
            generation_length=args.generation_length,
            warmup_requests=args.warmup_requests,
            device="cuda",
            device_id=args.device_id,
            output_dir=args.output_dir,
            verbose=True
        )
    # Summary
    print("\n" + "=" * 80)
    print("BENCHMARK COMPLETE")
    print("=" * 80)
    print(f"\nResults saved to: {output_path}")

    if pretrain_metrics:
        print("\nPretraining:")
        print(f" Duration: {pretrain_metrics.total_duration_ms:.2f} ms")
        print(f" Throughput: {pretrain_metrics.total_tokens_per_second:.2f} tokens/s")
        print(f" Energy: {pretrain_metrics.total_energy_joules:.2f} J")
        print(f" Energy/token: {pretrain_metrics.total_energy_per_token * 1000:.4f} mJ/token")

    if inference_metrics:
        print("\nInference:")
        print(f" TTFT: {inference_metrics.ttft_ms:.2f} ms")
        print(f" ITL: {inference_metrics.itl_ms:.2f} ms/token")
        print(f" Throughput: {inference_metrics.e2e_tokens_per_second:.2f} tokens/s")
        print(f" Energy: {inference_metrics.e2e_energy_joules:.2f} J")
        print(f" Energy/token: {inference_metrics.e2e_energy_per_token * 1000:.4f} mJ/token")


if __name__ == "__main__":
    main()
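
Note: the runner calls get_gpu_monitor, monitor.get_device_name(), monitor.cleanup(), and list_available_gpus() from utils/gpu_monitor, which is not shown in this file of the diff. Below is a minimal sketch of what such a module could look like, assuming it is backed by NVML through the pynvml package; the class and helper names here are inferred from the call sites above, and the actual utils/gpu_monitor.py in this commit may be implemented differently.

# Hypothetical sketch of a pynvml-backed GPU monitor matching the calls made
# in run_benchmark.py. This is an illustration, not the module from the commit.
import pynvml


class GPUMonitor:
    def __init__(self, device_id: int = 0):
        pynvml.nvmlInit()
        self.device_id = device_id
        self.handle = pynvml.nvmlDeviceGetHandleByIndex(device_id)

    def get_device_name(self) -> str:
        name = pynvml.nvmlDeviceGetName(self.handle)
        # Older pynvml versions return bytes, newer ones return str.
        return name.decode() if isinstance(name, bytes) else name

    def cleanup(self) -> None:
        pynvml.nvmlShutdown()


def get_gpu_monitor(device_id: int = 0) -> GPUMonitor:
    return GPUMonitor(device_id)


def list_available_gpus() -> list:
    pynvml.nvmlInit()
    try:
        names = []
        for i in range(pynvml.nvmlDeviceGetCount()):
            handle = pynvml.nvmlDeviceGetHandleByIndex(i)
            name = pynvml.nvmlDeviceGetName(handle)
            names.append(f"GPU {i}: {name.decode() if isinstance(name, bytes) else name}")
        return names
    finally:
        pynvml.nvmlShutdown()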