Initial commit
417
benchmark_inference.py
Executable file
@@ -0,0 +1,417 @@
#!/usr/bin/env python3
"""
Inference Benchmark for LLM Performance Evaluation

Measures performance and energy metrics for inference workloads with
separate measurements for prefill and decode stages.
"""

import argparse
import os
import sys
import time
from pathlib import Path
from typing import Optional

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
from tqdm import tqdm

# Add utils to path
sys.path.insert(0, str(Path(__file__).parent))
from utils.gpu_monitor import get_gpu_monitor
from utils.metrics import StageMetrics, InferenceMetrics, MetricsReporter
from utils.attention import get_default_attention, configure_model_attention, validate_attention_for_gpu


def benchmark_inference(
    model_name_or_path: str,
    attn_implementation: str = "auto",
    num_requests: int = 10,
    prompt_length: int = 512,
    generation_length: int = 100,
    warmup_requests: int = 2,
    device: str = "cuda",
    device_id: int = 0,
    output_dir: Optional[str] = None,
    verbose: bool = True,
):
    """
    Run inference benchmark.

    Args:
        model_name_or_path: Path to model or HuggingFace identifier
        attn_implementation: Attention implementation to use
        num_requests: Number of inference requests to measure
        prompt_length: Length of input prompt
        generation_length: Number of tokens to generate
        warmup_requests: Number of warmup requests
        device: Device to use
        device_id: GPU device ID
        output_dir: Directory to save results
        verbose: Print verbose output
    """
    print("=" * 80)
    print("INFERENCE BENCHMARK")
    print("=" * 80)

    # Initialize GPU monitor
    if verbose:
        print("\n[1/7] Initializing GPU monitor...")
    monitor = get_gpu_monitor(device_id)
    gpu_name = monitor.get_device_name()
    if verbose:
        print(f" GPU: {gpu_name}")

    # Determine attention implementation
    if attn_implementation == "auto":
        attn_implementation = get_default_attention(gpu_name)
        if verbose:
            print(f" Auto-selected attention: {attn_implementation}")

    # Validate attention for GPU
    valid, warning = validate_attention_for_gpu(attn_implementation, gpu_name)
    if warning and verbose:
        print(f" ⚠ {warning}")

    # Load model
    if verbose:
        print(f"\n[2/7] Loading model: {model_name_or_path}")

    # Determine attn_implementation parameter for model loading
    load_attn = "flash_attention_2" if attn_implementation in ["flash_attention_2", "flash_attention_3_hopper"] else attn_implementation

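    # Note: "flash_attention_3_hopper" is not passed to from_pretrained() directly; the model
    # is loaded via the flash_attention_2 path and configure_model_attention() patches in the
    # FA3 kernels afterwards (see below).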
    try:
        model = AutoModelForCausalLM.from_pretrained(
            model_name_or_path,
            torch_dtype=torch.bfloat16,
            attn_implementation=load_attn,
            trust_remote_code=True,
        )
        model = model.to(device)

        # Configure attention (patch if needed for FA3)
        model = configure_model_attention(model, attn_implementation, verbose=verbose)

        if verbose:
            total_params = sum(p.numel() for p in model.parameters())
            print(f" Total parameters: {total_params:,} ({total_params/1e9:.2f}B)")
    except Exception as e:
        print(f"✗ Error loading model: {e}")
        sys.exit(1)

    # Load tokenizer
    if verbose:
        print(f"\n[3/7] Loading tokenizer...")
    try:
        tokenizer = AutoTokenizer.from_pretrained(
            model_name_or_path,
            trust_remote_code=True
        )
    except Exception as e:
        print(f"✗ Error loading tokenizer: {e}")
        sys.exit(1)

    # Generate synthetic prompts
    if verbose:
        print(f"\n[4/7] Generating synthetic prompts...")
        print(f" Prompt length: {prompt_length}")
        print(f" Generation length: {generation_length}")

    # Create random input_ids (synthetic prompts)
    vocab_size = model.config.vocab_size
    # We'll create one prompt and reuse it
    prompt_ids = torch.randint(0, vocab_size, (1, prompt_length), device=device)
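    # The random token IDs produce meaningless text, but they exercise the same prefill and
    # decode compute paths as a real prompt, which is what the timing and energy measurements need.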

    # Warmup
    if verbose:
        print(f"\n[5/7] Running warmup ({warmup_requests} requests)...")
    model.eval()
    with torch.no_grad():
        for _ in range(warmup_requests):
            _ = model.generate(
                prompt_ids,
                max_new_tokens=generation_length,
                do_sample=False,
                pad_token_id=tokenizer.eos_token_id
            )

    # Synchronize before benchmarking
    torch.cuda.synchronize()
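    # Warmup keeps one-time costs (CUDA context setup, kernel selection, allocator growth)
    # out of the measured requests; the synchronize above ensures all warmup work has
    # finished before timing starts.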

    # Benchmark
    if verbose:
        print(f"\n[6/7] Running benchmark ({num_requests} requests)...")

    # Storage for per-request metrics
    prefill_times = []
    decode_times = []
    e2e_times = []

    prefill_energies = []
    decode_energies = []
    e2e_energies = []

    prefill_powers = []
    decode_powers = []

    memory_usage = []
    gpu_utils = []

    # For inference, we separate prefill (first token) from decode (remaining tokens)
    # We'll use a custom generation loop to measure them separately
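    # Per request: the prefill phase times the full-prompt forward pass plus selection of the
    # first output token (i.e., TTFT), and the decode phase times the remaining
    # generation_length - 1 greedy steps that reuse the KV cache (the basis for ITL).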

    for req_idx in tqdm(range(num_requests), desc="Benchmarking"):
        # === PREFILL PHASE (Time to First Token) ===
        # This is the forward pass with the prompt to get the first token

        monitor.start_monitoring()
        torch.cuda.synchronize()
        prefill_start = time.perf_counter()

        with torch.no_grad():
            # Forward pass with prompt
            outputs = model(input_ids=prompt_ids, use_cache=True)
            logits = outputs.logits
            past_key_values = outputs.past_key_values

            # Get first generated token
            next_token_logits = logits[:, -1, :]
            next_token = torch.argmax(next_token_logits, dim=-1, keepdim=True)

        torch.cuda.synchronize()
        prefill_time = time.perf_counter() - prefill_start
        prefill_energy = monitor.get_energy_consumed()
        prefill_power = monitor.get_average_power()

        prefill_times.append(prefill_time * 1000)  # Convert to ms
        prefill_energies.append(prefill_energy)
        prefill_powers.append(prefill_power)

        # === DECODE PHASE (Inter-Token Latency) ===
        # Generate remaining tokens one by one

        monitor.start_monitoring()
        torch.cuda.synchronize()
        decode_start = time.perf_counter()

        generated_tokens = [next_token]

        with torch.no_grad():
            for _ in range(generation_length - 1):
                # Forward pass with single token using cached keys/values
                outputs = model(
                    input_ids=next_token,
                    past_key_values=past_key_values,
                    use_cache=True
                )
                logits = outputs.logits
                past_key_values = outputs.past_key_values

                # Get next token
                next_token_logits = logits[:, -1, :]
                next_token = torch.argmax(next_token_logits, dim=-1, keepdim=True)
                generated_tokens.append(next_token)

        torch.cuda.synchronize()
        decode_time = time.perf_counter() - decode_start
        decode_energy = monitor.get_energy_consumed()
        decode_power = monitor.get_average_power()

        decode_times.append(decode_time * 1000)  # Convert to ms
        decode_energies.append(decode_energy)
        decode_powers.append(decode_power)

        # End-to-end metrics
        e2e_time = prefill_time + decode_time
        e2e_energy = prefill_energy + decode_energy

        e2e_times.append(e2e_time * 1000)  # Convert to ms
        e2e_energies.append(e2e_energy)

        # Get memory and utilization
        metrics = monitor.get_metrics()
        memory_usage.append(metrics.memory_used_gb)
        gpu_utils.append(metrics.gpu_utilization_percent)

    # Compute aggregated metrics

    # Prefill metrics (TTFT)
    prefill_duration_ms = sum(prefill_times)
    prefill_energy_j = sum(prefill_energies)
    prefill_tokens = prompt_length * num_requests
    prefill_tps = prefill_tokens / (prefill_duration_ms / 1000)
    prefill_ept = prefill_energy_j / prefill_tokens
    avg_ttft_ms = sum(prefill_times) / len(prefill_times)

    prefill_metrics = StageMetrics(
        stage_name="prefill",
        duration_ms=prefill_duration_ms,
        tokens_processed=prefill_tokens,
        tokens_per_second=prefill_tps,
        energy_joules=prefill_energy_j,
        energy_per_token=prefill_ept,
        avg_power_watts=sum(prefill_powers) / len(prefill_powers),
        peak_memory_gb=max(memory_usage),
        avg_gpu_util_percent=sum(gpu_utils) / len(gpu_utils)
    )

    # Decode metrics (ITL)
    decode_duration_ms = sum(decode_times)
    decode_energy_j = sum(decode_energies)
    decode_tokens = generation_length * num_requests
    decode_tps = decode_tokens / (decode_duration_ms / 1000)
    decode_ept = decode_energy_j / decode_tokens
    avg_itl_ms = sum(decode_times) / len(decode_times) / generation_length
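    # Accounting note: the decode loop issues generation_length - 1 forward passes per request;
    # the first generated token is produced during the prefill pass, but it is counted toward
    # the decode stage's token total (and the ITL average) here by convention.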

    decode_metrics = StageMetrics(
        stage_name="decode",
        duration_ms=decode_duration_ms,
        tokens_processed=decode_tokens,
        tokens_per_second=decode_tps,
        energy_joules=decode_energy_j,
        energy_per_token=decode_ept,
        avg_power_watts=sum(decode_powers) / len(decode_powers),
        peak_memory_gb=max(memory_usage),
        avg_gpu_util_percent=sum(gpu_utils) / len(gpu_utils)
    )

    # End-to-end metrics
    e2e_latency_ms = sum(e2e_times) / len(e2e_times)
    e2e_energy_j = sum(e2e_energies)
    total_tokens = (prompt_length + generation_length) * num_requests
    e2e_tps = total_tokens / (sum(e2e_times) / 1000)
    e2e_ept = e2e_energy_j / total_tokens

    # Create metrics object
    metrics = InferenceMetrics(
        model_name=model_name_or_path,
        gpu_name=gpu_name,
        attention_implementation=attn_implementation,
        num_requests=num_requests,
        prompt_length=prompt_length,
        generation_length=generation_length,
        prefill=prefill_metrics,
        decode=decode_metrics,
        e2e_latency_ms=e2e_latency_ms,
        e2e_tokens_per_second=e2e_tps,
        e2e_energy_joules=e2e_energy_j,
        e2e_energy_per_token=e2e_ept,
        ttft_ms=avg_ttft_ms,
        itl_ms=avg_itl_ms
    )

    # Print results
    if verbose:
        print()
    MetricsReporter.print_inference_metrics(metrics, verbose=verbose)

    # Save results
    if output_dir:
        output_path = Path(output_dir)
        output_path.mkdir(parents=True, exist_ok=True)

        # Save JSON
        json_path = output_path / f"inference_{gpu_name.replace(' ', '_')}_{attn_implementation}.json"
        MetricsReporter.save_json(metrics, json_path)

    # Cleanup
    monitor.cleanup()
    del model
    torch.cuda.empty_cache()

    return metrics

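# Example invocation (using this script's own defaults; adjust model and paths to your setup):
#   python benchmark_inference.py --model-name Qwen/Qwen3-4B --attn-implementation auto \
#       --num-requests 10 --prompt-length 512 --generation-length 100 --output-dir ./results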
def main():
    parser = argparse.ArgumentParser(
        description="LLM Inference Benchmark",
        formatter_class=argparse.RawDescriptionHelpFormatter
    )

    parser.add_argument(
        "--model-path",
        type=str,
        default="./model_cache",
        help="Path to cached model"
    )

    parser.add_argument(
        "--model-name",
        type=str,
        default="Qwen/Qwen3-4B",
        help="Model name or HuggingFace identifier to load (also used for reporting)"
    )

    parser.add_argument(
        "--attn-implementation",
        type=str,
        default="auto",
        choices=["auto", "flash_attention_2", "flash_attention_3_hopper", "sdpa", "eager"],
        help="Attention implementation to use"
    )

    parser.add_argument(
        "--num-requests",
        type=int,
        default=10,
        help="Number of inference requests"
    )

    parser.add_argument(
        "--prompt-length",
        type=int,
        default=512,
        help="Prompt length in tokens"
    )

    parser.add_argument(
        "--generation-length",
        type=int,
        default=100,
        help="Number of tokens to generate"
    )

    parser.add_argument(
        "--warmup-requests",
        type=int,
        default=2,
        help="Number of warmup requests"
    )

    parser.add_argument(
        "--device-id",
        type=int,
        default=0,
        help="GPU device ID"
    )

    parser.add_argument(
        "--output-dir",
        type=str,
        default="./results",
        help="Output directory for results"
    )

    args = parser.parse_args()

    # Set environment variables for HuggingFace cache
    if Path(args.model_path).exists():
        os.environ['HF_HOME'] = args.model_path

    benchmark_inference(
        model_name_or_path=args.model_name,
        attn_implementation=args.attn_implementation,
        num_requests=args.num_requests,
        prompt_length=args.prompt_length,
        generation_length=args.generation_length,
        warmup_requests=args.warmup_requests,
        device="cuda",
        device_id=args.device_id,
        output_dir=args.output_dir,
        verbose=True
    )


if __name__ == "__main__":
    main()