#!/usr/bin/env python3
"""
Main LLM Benchmark Runner

Orchestrates pretraining and inference benchmarks with auto-detection
of GPU type and configuration.
"""

import argparse
import sys
from pathlib import Path

# Import benchmark functions
import benchmark_pretrain
import benchmark_inference

from utils.gpu_monitor import get_gpu_monitor, list_available_gpus
from utils.metrics import MetricsReporter

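
# A minimal sketch of how the "auto" attention choice below could be resolved,
# assuming CUDA compute capability is a reasonable proxy for GPU generation
# (FlashAttention-3 targets Hopper; FlashAttention-2 needs Ampere or newer).
# The real selection happens inside the benchmark modules; this helper is
# illustrative only and is not called in this script.
def _suggest_attn_implementation(device_id: int = 0) -> str:
    import torch  # local import; torch is only needed if this helper is called

    major, _minor = torch.cuda.get_device_capability(device_id)
    if major == 9:  # Hopper (H100/H200)
        return "flash_attention_3_hopper"
    if major >= 8:  # Ampere/Ada and newer (A100, L40S, RTX 40xx)
        return "flash_attention_2"
    return "sdpa"  # portable PyTorch scaled-dot-product fallback
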
def main():
    parser = argparse.ArgumentParser(
        description="LLM Benchmark Suite - Compare GPU performance for pretraining and inference",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Examples:
  # Run both pretrain and inference benchmarks
  python run_benchmark.py --mode both

  # Run only pretraining benchmark
  python run_benchmark.py --mode pretrain --num-steps 20

  # Run inference with custom settings
  python run_benchmark.py --mode inference --num-requests 20 --generation-length 200

  # Use specific attention implementation
  python run_benchmark.py --attn-implementation flash_attention_3_hopper
"""
    )

    # Model configuration
    parser.add_argument(
        "--model-path",
        type=str,
        default="./model_cache",
        help="Path to cached model directory"
    )

    parser.add_argument(
        "--model-name",
        type=str,
        default="Qwen/Qwen3-4B",
        help="Model name for reporting"
    )

    # Benchmark mode
    parser.add_argument(
        "--mode",
        type=str,
        default="both",
        choices=["pretrain", "inference", "both"],
        help="Benchmark mode to run"
    )

    # Attention configuration
    parser.add_argument(
        "--attn-implementation",
        type=str,
        default="auto",
        choices=["auto", "flash_attention_2", "flash_attention_3_hopper", "sdpa", "eager"],
        help="Attention implementation (auto selects based on GPU)"
    )

    # Pretraining parameters
    pretrain_group = parser.add_argument_group("pretraining parameters")
    pretrain_group.add_argument(
        "--batch-size",
        type=int,
        default=3,
        help="Batch size for pretraining"
    )
    pretrain_group.add_argument(
        "--sequence-length",
        type=int,
        default=2048,
        help="Sequence length for pretraining"
    )
    pretrain_group.add_argument(
        "--num-steps",
        type=int,
        default=10,
        help="Number of training steps"
    )
    pretrain_group.add_argument(
        "--warmup-steps",
        type=int,
        default=3,
        help="Number of warmup steps"
    )

    # Inference parameters
    inference_group = parser.add_argument_group("inference parameters")
    inference_group.add_argument(
        "--num-requests",
        type=int,
        default=10,
        help="Number of inference requests"
    )
    inference_group.add_argument(
        "--prompt-length",
        type=int,
        default=512,
        help="Prompt length in tokens"
    )
    inference_group.add_argument(
        "--generation-length",
        type=int,
        default=100,
        help="Number of tokens to generate"
    )
    inference_group.add_argument(
        "--warmup-requests",
        type=int,
        default=2,
        help="Number of warmup requests"
    )

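    # Both groups expose warmup knobs (--warmup-steps, --warmup-requests):
    # warmup iterations are presumably excluded from the timed and
    # energy-measured window in the benchmark modules, so one-off costs
    # (CUDA context creation, kernel autotuning, allocator growth) don't
    # skew the reported numbers.
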
    # General parameters
    parser.add_argument(
        "--device-id",
        type=int,
        default=0,
        help="GPU device ID"
    )
    parser.add_argument(
        "--output-dir",
        type=str,
        default="./results",
        help="Output directory for results"
    )
    parser.add_argument(
        "--list-gpus",
        action="store_true",
        help="List available GPUs and exit"
    )

    args = parser.parse_args()

    # List GPUs if requested
    if args.list_gpus:
        print("Available GPUs:")
        gpus = list_available_gpus()
        if not gpus:
            print("  No GPUs found!")
        else:
            for gpu in gpus:
                print(f"  {gpu}")
        return

    # Print header
    print("=" * 80)
    print("LLM BENCHMARK SUITE")
    print("=" * 80)
    print(f"\nModel: {args.model_name}")
    print(f"Model Path: {args.model_path}")
    print(f"Mode: {args.mode}")
    print(f"Attention: {args.attn_implementation}")
    print(f"Output Directory: {args.output_dir}")

    # Detect GPU (fail fast if no usable device is found)
    print("\nDetecting GPU...")
    try:
        monitor = get_gpu_monitor(args.device_id)
        gpu_name = monitor.get_device_name()
        print(f"✓ GPU {args.device_id}: {gpu_name}")
        monitor.cleanup()
    except Exception as e:
        print(f"✗ Error detecting GPU: {e}")
        sys.exit(1)

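    # (get_gpu_monitor presumably holds an NVML handle under the hood;
    # cleanup() releases it here so each benchmark module can open its own
    # monitor for the measured run.)
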
    # Create output directory
    output_path = Path(args.output_dir)
    output_path.mkdir(parents=True, exist_ok=True)

    # Resolve the model source once: use the local cache directory when it
    # exists so --model-path takes effect, otherwise fall back to the hub name.
    model_source = args.model_path if Path(args.model_path).is_dir() else args.model_name

    # Run benchmarks
    pretrain_metrics = None
    inference_metrics = None

if args.mode in ["pretrain", "both"]:
|
|
print("\n" + "=" * 80)
|
|
print("Running Pretraining Benchmark...")
|
|
print("=" * 80)
|
|
|
|
pretrain_metrics = benchmark_pretrain.benchmark_pretrain(
|
|
model_name_or_path=args.model_name,
|
|
attn_implementation=args.attn_implementation,
|
|
batch_size=args.batch_size,
|
|
sequence_length=args.sequence_length,
|
|
num_steps=args.num_steps,
|
|
warmup_steps=args.warmup_steps,
|
|
device="cuda",
|
|
device_id=args.device_id,
|
|
output_dir=args.output_dir,
|
|
verbose=True
|
|
)
|
|
|
|
if args.mode in ["inference", "both"]:
|
|
print("\n" + "=" * 80)
|
|
print("Running Inference Benchmark...")
|
|
print("=" * 80)
|
|
|
|
inference_metrics = benchmark_inference.benchmark_inference(
|
|
model_name_or_path=args.model_name,
|
|
attn_implementation=args.attn_implementation,
|
|
num_requests=args.num_requests,
|
|
prompt_length=args.prompt_length,
|
|
generation_length=args.generation_length,
|
|
warmup_requests=args.warmup_requests,
|
|
device="cuda",
|
|
device_id=args.device_id,
|
|
output_dir=args.output_dir,
|
|
verbose=True
|
|
)
|
|
|
|
    # Summary
    print("\n" + "=" * 80)
    print("BENCHMARK COMPLETE")
    print("=" * 80)
    print(f"\nResults saved to: {output_path}")

    if pretrain_metrics:
        print("\nPretraining:")
        print(f"  Duration: {pretrain_metrics.total_duration_ms:.2f} ms")
        print(f"  Throughput: {pretrain_metrics.total_tokens_per_second:.2f} tokens/s")
        print(f"  Energy: {pretrain_metrics.total_energy_joules:.2f} J")
        print(f"  Energy/token: {pretrain_metrics.total_energy_per_token*1000:.4f} mJ/token")

    if inference_metrics:
        print("\nInference:")
        print(f"  TTFT: {inference_metrics.ttft_ms:.2f} ms")
        print(f"  ITL: {inference_metrics.itl_ms:.2f} ms/token")
        print(f"  Throughput: {inference_metrics.e2e_tokens_per_second:.2f} tokens/s")
        print(f"  Energy: {inference_metrics.e2e_energy_joules:.2f} J")
        print(f"  Energy/token: {inference_metrics.e2e_energy_per_token*1000:.4f} mJ/token")

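    # Illustrative only: collapse energy/token into tokens-per-joule, one
    # efficiency figure that is handy for cross-GPU comparison. Guarded so a
    # zero energy reading (e.g., power telemetry unavailable) can't divide by
    # zero; the field name matches those printed above.
    if inference_metrics and inference_metrics.e2e_energy_per_token > 0:
        print(f"  Efficiency: {1.0 / inference_metrics.e2e_energy_per_token:.1f} tokens/J")
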
if __name__ == "__main__":
|
|
main()
|