Initial commit
run_benchmark.py (executable file, 248 lines added)
@@ -0,0 +1,248 @@
#!/usr/bin/env python3
"""
Main LLM Benchmark Runner

Orchestrates pretraining and inference benchmarks with auto-detection
of GPU type and configuration.
"""

import argparse
import sys
from pathlib import Path

# Import benchmark functions
import benchmark_pretrain
import benchmark_inference

from utils.gpu_monitor import get_gpu_monitor, list_available_gpus
from utils.metrics import MetricsReporter


def main():
    parser = argparse.ArgumentParser(
        description="LLM Benchmark Suite - Compare GPU performance for pretraining and inference",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Examples:
  # Run both pretrain and inference benchmarks
  python run_benchmark.py --mode both

  # Run only pretraining benchmark
  python run_benchmark.py --mode pretrain --num-steps 20

  # Run inference with custom settings
  python run_benchmark.py --mode inference --num-requests 20 --generation-length 200

  # Use specific attention implementation
  python run_benchmark.py --attn-implementation flash_attention_3_hopper
        """
    )

    # Model configuration
    parser.add_argument(
        "--model-path",
        type=str,
        default="./model_cache",
        help="Path to cached model directory"
    )
    parser.add_argument(
        "--model-name",
        type=str,
        default="Qwen/Qwen3-4B",
        help="Model name for reporting"
    )

    # Benchmark mode
    parser.add_argument(
        "--mode",
        type=str,
        default="both",
        choices=["pretrain", "inference", "both"],
        help="Benchmark mode to run"
    )

    # Attention configuration
    parser.add_argument(
        "--attn-implementation",
        type=str,
        default="auto",
        choices=["auto", "flash_attention_2", "flash_attention_3_hopper", "sdpa", "eager"],
        help="Attention implementation (auto selects based on GPU)"
    )

    # Pretraining parameters
    pretrain_group = parser.add_argument_group("pretraining parameters")
    pretrain_group.add_argument(
        "--batch-size",
        type=int,
        default=3,
        help="Batch size for pretraining"
    )
    pretrain_group.add_argument(
        "--sequence-length",
        type=int,
        default=2048,
        help="Sequence length for pretraining"
    )
    pretrain_group.add_argument(
        "--num-steps",
        type=int,
        default=10,
        help="Number of training steps"
    )
    pretrain_group.add_argument(
        "--warmup-steps",
        type=int,
        default=3,
        help="Number of warmup steps"
    )

    # Inference parameters
    inference_group = parser.add_argument_group("inference parameters")
    inference_group.add_argument(
        "--num-requests",
        type=int,
        default=10,
        help="Number of inference requests"
    )
    inference_group.add_argument(
        "--prompt-length",
        type=int,
        default=512,
        help="Prompt length in tokens"
    )
    inference_group.add_argument(
        "--generation-length",
        type=int,
        default=100,
        help="Number of tokens to generate"
    )
    inference_group.add_argument(
        "--warmup-requests",
        type=int,
        default=2,
        help="Number of warmup requests"
    )

    # General parameters
    parser.add_argument(
        "--device-id",
        type=int,
        default=0,
        help="GPU device ID"
    )
    parser.add_argument(
        "--output-dir",
        type=str,
        default="./results",
        help="Output directory for results"
    )
    parser.add_argument(
        "--list-gpus",
        action="store_true",
        help="List available GPUs and exit"
    )

    args = parser.parse_args()

    # List GPUs if requested
    if args.list_gpus:
        print("Available GPUs:")
        gpus = list_available_gpus()
        if not gpus:
            print("  No GPUs found!")
        else:
            for gpu in gpus:
                print(f"  {gpu}")
        return

    # Print header
    print("=" * 80)
    print("LLM BENCHMARK SUITE")
    print("=" * 80)
    print(f"\nModel: {args.model_name}")
    print(f"Model Path: {args.model_path}")
    print(f"Mode: {args.mode}")
    print(f"Attention: {args.attn_implementation}")
    print(f"Output Directory: {args.output_dir}")

    # Detect GPU
    print("\nDetecting GPU...")
    try:
        monitor = get_gpu_monitor(args.device_id)
        gpu_name = monitor.get_device_name()
        print(f"  GPU {args.device_id}: {gpu_name}")
        monitor.cleanup()
    except Exception as e:
        print(f"✗ Error detecting GPU: {e}")
        sys.exit(1)

    # Create output directory
    output_path = Path(args.output_dir)
    output_path.mkdir(parents=True, exist_ok=True)

    # Run benchmarks
    pretrain_metrics = None
    inference_metrics = None

    if args.mode in ["pretrain", "both"]:
        print("\n" + "=" * 80)
        print("Running Pretraining Benchmark...")
        print("=" * 80)

        pretrain_metrics = benchmark_pretrain.benchmark_pretrain(
            model_name_or_path=args.model_name,
            attn_implementation=args.attn_implementation,
            batch_size=args.batch_size,
            sequence_length=args.sequence_length,
            num_steps=args.num_steps,
            warmup_steps=args.warmup_steps,
            device="cuda",
            device_id=args.device_id,
            output_dir=args.output_dir,
            verbose=True
        )

    if args.mode in ["inference", "both"]:
        print("\n" + "=" * 80)
        print("Running Inference Benchmark...")
        print("=" * 80)

        inference_metrics = benchmark_inference.benchmark_inference(
            model_name_or_path=args.model_name,
            attn_implementation=args.attn_implementation,
            num_requests=args.num_requests,
            prompt_length=args.prompt_length,
            generation_length=args.generation_length,
            warmup_requests=args.warmup_requests,
            device="cuda",
            device_id=args.device_id,
            output_dir=args.output_dir,
            verbose=True
        )

    # Summary
    print("\n" + "=" * 80)
    print("BENCHMARK COMPLETE")
    print("=" * 80)
    print(f"\nResults saved to: {output_path}")

    if pretrain_metrics:
        print("\nPretraining:")
        print(f"  Duration: {pretrain_metrics.total_duration_ms:.2f} ms")
        print(f"  Throughput: {pretrain_metrics.total_tokens_per_second:.2f} tokens/s")
        print(f"  Energy: {pretrain_metrics.total_energy_joules:.2f} J")
        print(f"  Energy/token: {pretrain_metrics.total_energy_per_token*1000:.4f} mJ/token")

    if inference_metrics:
        print("\nInference:")
        print(f"  TTFT: {inference_metrics.ttft_ms:.2f} ms")
        print(f"  ITL: {inference_metrics.itl_ms:.2f} ms/token")
        print(f"  Throughput: {inference_metrics.e2e_tokens_per_second:.2f} tokens/s")
        print(f"  Energy: {inference_metrics.e2e_energy_joules:.2f} J")
        print(f"  Energy/token: {inference_metrics.e2e_energy_per_token*1000:.4f} mJ/token")


if __name__ == "__main__":
    main()
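Note: the script depends on two helpers from utils.gpu_monitor that are not part of this file: list_available_gpus() and get_gpu_monitor(device_id), where the returned monitor exposes at least get_device_name() and cleanup(). A minimal sketch of that interface is shown below; only those call names are taken from the script, and the torch.cuda backend is an assumption for illustration (the real utility presumably also reads power/energy, e.g. via NVML, to feed the energy metrics reported above).

# Hypothetical sketch of the utils.gpu_monitor interface used by run_benchmark.py.
# The torch.cuda backend here is an assumption, not the actual implementation.
import torch


def list_available_gpus() -> list[str]:
    """Return one human-readable line per visible CUDA device."""
    if not torch.cuda.is_available():
        return []
    return [
        f"GPU {i}: {torch.cuda.get_device_name(i)}"
        for i in range(torch.cuda.device_count())
    ]


class _SimpleGPUMonitor:
    """Bare-bones monitor exposing the two methods run_benchmark.py calls."""

    def __init__(self, device_id: int):
        self.device_id = device_id

    def get_device_name(self) -> str:
        return torch.cuda.get_device_name(self.device_id)

    def cleanup(self) -> None:
        # Nothing to release for a torch-only backend; an NVML-based
        # monitor would shut down its NVML session here.
        pass


def get_gpu_monitor(device_id: int = 0) -> _SimpleGPUMonitor:
    return _SimpleGPUMonitor(device_id)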