Initial commit

Author: Bole Ma
Date: 2026-02-05 23:18:26 +01:00
Commit: 747c92ac6b
31 changed files with 4220 additions and 0 deletions

run_benchmark.py (new executable file, +248 lines)

@@ -0,0 +1,248 @@
#!/usr/bin/env python3
"""
Main LLM Benchmark Runner
Orchestrates pretraining and inference benchmarks with auto-detection
of GPU type and configuration.
"""
import argparse
import sys
from pathlib import Path

# Import benchmark functions
import benchmark_pretrain
import benchmark_inference
from utils.gpu_monitor import get_gpu_monitor, list_available_gpus
from utils.metrics import MetricsReporter


def main():
    parser = argparse.ArgumentParser(
        description="LLM Benchmark Suite - Compare GPU performance for pretraining and inference",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Examples:
  # Run both pretrain and inference benchmarks
  python run_benchmark.py --mode both

  # Run only pretraining benchmark
  python run_benchmark.py --mode pretrain --num-steps 20

  # Run inference with custom settings
  python run_benchmark.py --mode inference --num-requests 20 --generation-length 200

  # Use specific attention implementation
  python run_benchmark.py --attn-implementation flash_attention_3_hopper
"""
    )
    # Model configuration
    parser.add_argument(
        "--model-path",
        type=str,
        default="./model_cache",
        help="Path to cached model directory"
    )
    parser.add_argument(
        "--model-name",
        type=str,
        default="Qwen/Qwen3-4B",
        help="Model name for reporting"
    )

    # Benchmark mode
    parser.add_argument(
        "--mode",
        type=str,
        default="both",
        choices=["pretrain", "inference", "both"],
        help="Benchmark mode to run"
    )

    # Attention configuration
    parser.add_argument(
        "--attn-implementation",
        type=str,
        default="auto",
        choices=["auto", "flash_attention_2", "flash_attention_3_hopper", "sdpa", "eager"],
        help="Attention implementation (auto selects based on GPU)"
    )

    # Pretraining parameters
    pretrain_group = parser.add_argument_group("pretraining parameters")
    pretrain_group.add_argument(
        "--batch-size",
        type=int,
        default=3,
        help="Batch size for pretraining"
    )
    pretrain_group.add_argument(
        "--sequence-length",
        type=int,
        default=2048,
        help="Sequence length for pretraining"
    )
    pretrain_group.add_argument(
        "--num-steps",
        type=int,
        default=10,
        help="Number of training steps"
    )
    pretrain_group.add_argument(
        "--warmup-steps",
        type=int,
        default=3,
        help="Number of warmup steps"
    )

    # Inference parameters
    inference_group = parser.add_argument_group("inference parameters")
    inference_group.add_argument(
        "--num-requests",
        type=int,
        default=10,
        help="Number of inference requests"
    )
    inference_group.add_argument(
        "--prompt-length",
        type=int,
        default=512,
        help="Prompt length in tokens"
    )
    inference_group.add_argument(
        "--generation-length",
        type=int,
        default=100,
        help="Number of tokens to generate"
    )
    inference_group.add_argument(
        "--warmup-requests",
        type=int,
        default=2,
        help="Number of warmup requests"
    )

    # General parameters
    parser.add_argument(
        "--device-id",
        type=int,
        default=0,
        help="GPU device ID"
    )
    parser.add_argument(
        "--output-dir",
        type=str,
        default="./results",
        help="Output directory for results"
    )
    parser.add_argument(
        "--list-gpus",
        action="store_true",
        help="List available GPUs and exit"
    )

    args = parser.parse_args()
    # List GPUs if requested
    if args.list_gpus:
        print("Available GPUs:")
        gpus = list_available_gpus()
        if not gpus:
            print(" No GPUs found!")
        else:
            for gpu in gpus:
                print(f" {gpu}")
        return

    # Print header
    print("=" * 80)
    print("LLM BENCHMARK SUITE")
    print("=" * 80)
    print(f"\nModel: {args.model_name}")
    print(f"Model Path: {args.model_path}")
    print(f"Mode: {args.mode}")
    print(f"Attention: {args.attn_implementation}")
    print(f"Output Directory: {args.output_dir}")

    # Detect GPU
    print("\nDetecting GPU...")
    try:
        monitor = get_gpu_monitor(args.device_id)
        gpu_name = monitor.get_device_name()
        print(f" GPU {args.device_id}: {gpu_name}")
        monitor.cleanup()
    except Exception as e:
        print(f"✗ Error detecting GPU: {e}")
        sys.exit(1)

    # Create output directory
    output_path = Path(args.output_dir)
    output_path.mkdir(parents=True, exist_ok=True)
    # Run benchmarks
    pretrain_metrics = None
    inference_metrics = None

    if args.mode in ["pretrain", "both"]:
        print("\n" + "=" * 80)
        print("Running Pretraining Benchmark...")
        print("=" * 80)
        pretrain_metrics = benchmark_pretrain.benchmark_pretrain(
            model_name_or_path=args.model_name,
            attn_implementation=args.attn_implementation,
            batch_size=args.batch_size,
            sequence_length=args.sequence_length,
            num_steps=args.num_steps,
            warmup_steps=args.warmup_steps,
            device="cuda",
            device_id=args.device_id,
            output_dir=args.output_dir,
            verbose=True
        )

    if args.mode in ["inference", "both"]:
        print("\n" + "=" * 80)
        print("Running Inference Benchmark...")
        print("=" * 80)
        inference_metrics = benchmark_inference.benchmark_inference(
            model_name_or_path=args.model_name,
            attn_implementation=args.attn_implementation,
            num_requests=args.num_requests,
            prompt_length=args.prompt_length,
            generation_length=args.generation_length,
            warmup_requests=args.warmup_requests,
            device="cuda",
            device_id=args.device_id,
            output_dir=args.output_dir,
            verbose=True
        )
    # Summary
    print("\n" + "=" * 80)
    print("BENCHMARK COMPLETE")
    print("=" * 80)
    print(f"\nResults saved to: {output_path}")

    if pretrain_metrics:
        print("\nPretraining:")
        print(f" Duration: {pretrain_metrics.total_duration_ms:.2f} ms")
        print(f" Throughput: {pretrain_metrics.total_tokens_per_second:.2f} tokens/s")
        print(f" Energy: {pretrain_metrics.total_energy_joules:.2f} J")
        print(f" Energy/token: {pretrain_metrics.total_energy_per_token * 1000:.4f} mJ/token")

    if inference_metrics:
        print("\nInference:")
        print(f" TTFT: {inference_metrics.ttft_ms:.2f} ms")
        print(f" ITL: {inference_metrics.itl_ms:.2f} ms/token")
        print(f" Throughput: {inference_metrics.e2e_tokens_per_second:.2f} tokens/s")
        print(f" Energy: {inference_metrics.e2e_energy_joules:.2f} J")
        print(f" Energy/token: {inference_metrics.e2e_energy_per_token * 1000:.4f} mJ/token")


if __name__ == "__main__":
    main()
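
Note: the runner calls get_gpu_monitor, monitor.get_device_name(), monitor.cleanup(), and list_available_gpus() from utils/gpu_monitor, which is not shown in this file of the diff. Below is a minimal sketch of what such a module could look like, assuming it is backed by NVML through the pynvml package; the class and helper names here are inferred from the call sites above, and the actual utils/gpu_monitor.py in this commit may be implemented differently.

# Hypothetical sketch of a pynvml-backed GPU monitor matching the calls made
# in run_benchmark.py. This is an illustration, not the module from the commit.
import pynvml


class GPUMonitor:
    def __init__(self, device_id: int = 0):
        pynvml.nvmlInit()
        self.device_id = device_id
        self.handle = pynvml.nvmlDeviceGetHandleByIndex(device_id)

    def get_device_name(self) -> str:
        name = pynvml.nvmlDeviceGetName(self.handle)
        # Older pynvml versions return bytes, newer ones return str.
        return name.decode() if isinstance(name, bytes) else name

    def cleanup(self) -> None:
        pynvml.nvmlShutdown()


def get_gpu_monitor(device_id: int = 0) -> GPUMonitor:
    return GPUMonitor(device_id)


def list_available_gpus() -> list:
    pynvml.nvmlInit()
    try:
        names = []
        for i in range(pynvml.nvmlDeviceGetCount()):
            handle = pynvml.nvmlDeviceGetHandleByIndex(i)
            name = pynvml.nvmlDeviceGetName(handle)
            names.append(f"GPU {i}: {name.decode() if isinstance(name, bytes) else name}")
        return names
    finally:
        pynvml.nvmlShutdown()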