#!/usr/bin/env python3
"""
Inference Benchmark for LLM Performance Evaluation

Measures performance and energy metrics for inference workloads with
separate measurements for prefill and decode stages.
"""
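
# Example invocation (the script filename below is an assumption; the flags mirror
# the argparse options defined in main()):
#
#   python inference_benchmark.py --model-name Qwen/Qwen3-4B \
#       --attn-implementation auto --num-requests 10 \
#       --prompt-length 512 --generation-length 100 \
#       --output-dir ./results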

import argparse
import os
import sys
import time
from pathlib import Path
from typing import Optional

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
from tqdm import tqdm

# Add utils to path
sys.path.insert(0, str(Path(__file__).parent))
from utils.gpu_monitor import get_gpu_monitor
from utils.metrics import StageMetrics, InferenceMetrics, MetricsReporter
from utils.attention import get_default_attention, configure_model_attention, validate_attention_for_gpu


def benchmark_inference(
    model_name_or_path: str,
    attn_implementation: str = "auto",
    num_requests: int = 10,
    prompt_length: int = 512,
    generation_length: int = 100,
    warmup_requests: int = 2,
    device: str = "cuda",
    device_id: int = 0,
    output_dir: Optional[str] = None,
    verbose: bool = True,
):
    """
    Run inference benchmark.

    Args:
        model_name_or_path: Path to model or HuggingFace identifier
        attn_implementation: Attention implementation to use
        num_requests: Number of inference requests to measure
        prompt_length: Length of input prompt
        generation_length: Number of tokens to generate
        warmup_requests: Number of warmup requests
        device: Device to use
        device_id: GPU device ID
        output_dir: Directory to save results
        verbose: Print verbose output
    """
    print("=" * 80)
    print("INFERENCE BENCHMARK")
    print("=" * 80)

    # Initialize GPU monitor
    if verbose:
        print("\n[1/7] Initializing GPU monitor...")
    monitor = get_gpu_monitor(device_id)
    gpu_name = monitor.get_device_name()
    if verbose:
        print(f" GPU: {gpu_name}")

    # Determine attention implementation
    if attn_implementation == "auto":
        attn_implementation = get_default_attention(gpu_name)
        if verbose:
            print(f" Auto-selected attention: {attn_implementation}")

    # Validate attention for GPU
    valid, warning = validate_attention_for_gpu(attn_implementation, gpu_name)
    if warning and verbose:
        print(f" ⚠ {warning}")

    # Load model
    if verbose:
        print(f"\n[2/7] Loading model: {model_name_or_path}")

    # Determine attn_implementation parameter for model loading
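    # Note (assumption): "flash_attention_3_hopper" is presumably not a value that
    # AutoModelForCausalLM.from_pretrained recognizes, so the model is loaded with
    # flash_attention_2 here and patched to FA3 afterwards by configure_model_attention.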
    load_attn = "flash_attention_2" if attn_implementation in ["flash_attention_2", "flash_attention_3_hopper"] else attn_implementation

    try:
        model = AutoModelForCausalLM.from_pretrained(
            model_name_or_path,
            torch_dtype=torch.bfloat16,
            attn_implementation=load_attn,
            trust_remote_code=True,
        )
        model = model.to(device)

        # Configure attention (patch if needed for FA3)
        model = configure_model_attention(model, attn_implementation, verbose=verbose)

        if verbose:
            total_params = sum(p.numel() for p in model.parameters())
            print(f" Total parameters: {total_params:,} ({total_params/1e9:.2f}B)")
    except Exception as e:
        print(f"✗ Error loading model: {e}")
        sys.exit(1)

    # Load tokenizer
    if verbose:
        print(f"\n[3/7] Loading tokenizer...")
    try:
        tokenizer = AutoTokenizer.from_pretrained(
            model_name_or_path,
            trust_remote_code=True
        )
    except Exception as e:
        print(f"✗ Error loading tokenizer: {e}")
        sys.exit(1)

    # Generate synthetic prompts
    if verbose:
        print(f"\n[4/7] Generating synthetic prompts...")
        print(f" Prompt length: {prompt_length}")
        print(f" Generation length: {generation_length}")

    # Create random input_ids (synthetic prompts)
    vocab_size = model.config.vocab_size
    # We'll create one prompt and reuse it
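    # Random token IDs are sufficient here: with greedy decoding the per-token compute
    # and memory cost does not depend on which tokens are in the prompt, so a synthetic
    # prompt of the right length measures the same performance and energy as real text
    # (batch size is fixed at 1).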
    prompt_ids = torch.randint(0, vocab_size, (1, prompt_length), device=device)

    # Warmup
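    # A few untimed requests let CUDA kernels compile and allocator caches fill,
    # so one-time startup costs do not skew the measured requests.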
    if verbose:
        print(f"\n[5/7] Running warmup ({warmup_requests} requests)...")
    model.eval()
    with torch.no_grad():
        for _ in range(warmup_requests):
            _ = model.generate(
                prompt_ids,
                max_new_tokens=generation_length,
                do_sample=False,
                pad_token_id=tokenizer.eos_token_id
            )

    # Synchronize before benchmarking
    torch.cuda.synchronize()

    # Benchmark
    if verbose:
        print(f"\n[6/7] Running benchmark ({num_requests} requests)...")

    # Storage for per-request metrics
    prefill_times = []
    decode_times = []
    e2e_times = []

    prefill_energies = []
    decode_energies = []
    e2e_energies = []

    prefill_powers = []
    decode_powers = []

    memory_usage = []
    gpu_utils = []

    # For inference, we separate prefill (first token) from decode (remaining tokens)
    # We'll use a custom generation loop to measure them separately

    for req_idx in tqdm(range(num_requests), desc="Benchmarking"):
        # === PREFILL PHASE (Time to First Token) ===
        # This is the forward pass with the prompt to get the first token
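
        # Assumption about the gpu_monitor utility: start_monitoring() resets its
        # counters, and get_energy_consumed() / get_average_power() below report
        # joules and watts accumulated since that call (see the *_energy_j sums later).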
        monitor.start_monitoring()
        torch.cuda.synchronize()
        prefill_start = time.perf_counter()

        with torch.no_grad():
            # Forward pass with prompt
            outputs = model(input_ids=prompt_ids, use_cache=True)
            logits = outputs.logits
            past_key_values = outputs.past_key_values

            # Get first generated token
            next_token_logits = logits[:, -1, :]
            next_token = torch.argmax(next_token_logits, dim=-1, keepdim=True)

        torch.cuda.synchronize()
        prefill_time = time.perf_counter() - prefill_start
        prefill_energy = monitor.get_energy_consumed()
        prefill_power = monitor.get_average_power()

        prefill_times.append(prefill_time * 1000)  # Convert to ms
        prefill_energies.append(prefill_energy)
        prefill_powers.append(prefill_power)

        # === DECODE PHASE (Inter-Token Latency) ===
        # Generate remaining tokens one by one

        monitor.start_monitoring()
        torch.cuda.synchronize()
        decode_start = time.perf_counter()

        generated_tokens = [next_token]

        with torch.no_grad():
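            # The first token was already produced during prefill, so this loop runs
            # generation_length - 1 steps; each step feeds only the newest token and
            # reuses the KV cache built up so far.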
            for _ in range(generation_length - 1):
                # Forward pass with single token using cached keys/values
                outputs = model(
                    input_ids=next_token,
                    past_key_values=past_key_values,
                    use_cache=True
                )
                logits = outputs.logits
                past_key_values = outputs.past_key_values

                # Get next token
                next_token_logits = logits[:, -1, :]
                next_token = torch.argmax(next_token_logits, dim=-1, keepdim=True)
                generated_tokens.append(next_token)

        torch.cuda.synchronize()
        decode_time = time.perf_counter() - decode_start
        decode_energy = monitor.get_energy_consumed()
        decode_power = monitor.get_average_power()

        decode_times.append(decode_time * 1000)  # Convert to ms
        decode_energies.append(decode_energy)
        decode_powers.append(decode_power)

        # End-to-end metrics
        e2e_time = prefill_time + decode_time
        e2e_energy = prefill_energy + decode_energy

        e2e_times.append(e2e_time * 1000)  # Convert to ms
        e2e_energies.append(e2e_energy)

        # Get memory and utilization
        metrics = monitor.get_metrics()
        memory_usage.append(metrics.memory_used_gb)
        gpu_utils.append(metrics.gpu_utilization_percent)

    # Compute aggregated metrics
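    # For example, with the defaults (num_requests=10, prompt_length=512,
    # generation_length=100) the run aggregates 5,120 prefill tokens,
    # 1,000 decode tokens, and 6,120 tokens end to end.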

    # Prefill metrics (TTFT)
    prefill_duration_ms = sum(prefill_times)
    prefill_energy_j = sum(prefill_energies)
    prefill_tokens = prompt_length * num_requests
    prefill_tps = prefill_tokens / (prefill_duration_ms / 1000)
    prefill_ept = prefill_energy_j / prefill_tokens
    avg_ttft_ms = sum(prefill_times) / len(prefill_times)
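
    # Note: memory and GPU utilization are sampled once per request (after decode),
    # so the same peak memory and average utilization are reported for both stages.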
    prefill_metrics = StageMetrics(
        stage_name="prefill",
        duration_ms=prefill_duration_ms,
        tokens_processed=prefill_tokens,
        tokens_per_second=prefill_tps,
        energy_joules=prefill_energy_j,
        energy_per_token=prefill_ept,
        avg_power_watts=sum(prefill_powers) / len(prefill_powers),
        peak_memory_gb=max(memory_usage),
        avg_gpu_util_percent=sum(gpu_utils) / len(gpu_utils)
    )

    # Decode metrics (ITL)
    decode_duration_ms = sum(decode_times)
    decode_energy_j = sum(decode_energies)
    decode_tokens = generation_length * num_requests
    decode_tps = decode_tokens / (decode_duration_ms / 1000)
    decode_ept = decode_energy_j / decode_tokens
    avg_itl_ms = sum(decode_times) / len(decode_times) / (generation_length - 1)  # decode loop runs generation_length - 1 steps

    decode_metrics = StageMetrics(
        stage_name="decode",
        duration_ms=decode_duration_ms,
        tokens_processed=decode_tokens,
        tokens_per_second=decode_tps,
        energy_joules=decode_energy_j,
        energy_per_token=decode_ept,
        avg_power_watts=sum(decode_powers) / len(decode_powers),
        peak_memory_gb=max(memory_usage),
        avg_gpu_util_percent=sum(gpu_utils) / len(gpu_utils)
    )

    # End-to-end metrics
    e2e_latency_ms = sum(e2e_times) / len(e2e_times)
    e2e_energy_j = sum(e2e_energies)
    total_tokens = (prompt_length + generation_length) * num_requests
    e2e_tps = total_tokens / (sum(e2e_times) / 1000)
    e2e_ept = e2e_energy_j / total_tokens

    # Create metrics object
    metrics = InferenceMetrics(
        model_name=model_name_or_path,
        gpu_name=gpu_name,
        attention_implementation=attn_implementation,
        num_requests=num_requests,
        prompt_length=prompt_length,
        generation_length=generation_length,
        prefill=prefill_metrics,
        decode=decode_metrics,
        e2e_latency_ms=e2e_latency_ms,
        e2e_tokens_per_second=e2e_tps,
        e2e_energy_joules=e2e_energy_j,
        e2e_energy_per_token=e2e_ept,
        ttft_ms=avg_ttft_ms,
        itl_ms=avg_itl_ms
    )

    # Print results
    if verbose:
        print()
        MetricsReporter.print_inference_metrics(metrics, verbose=verbose)

    # Save results
    if output_dir:
        if verbose:
            print(f"\n[7/7] Saving results to {output_dir}...")
        output_path = Path(output_dir)
        output_path.mkdir(parents=True, exist_ok=True)

        # Save JSON
        json_path = output_path / f"inference_{gpu_name.replace(' ', '_')}_{attn_implementation}.json"
        MetricsReporter.save_json(metrics, json_path)

    # Cleanup
    monitor.cleanup()
    del model
    torch.cuda.empty_cache()

    return metrics


def main():
    parser = argparse.ArgumentParser(
        description="LLM Inference Benchmark",
        formatter_class=argparse.RawDescriptionHelpFormatter
    )

    parser.add_argument(
        "--model-path",
        type=str,
        default="./model_cache",
        help="Path to the local model cache directory (used as HF_HOME if it exists)"
    )

    parser.add_argument(
        "--model-name",
        type=str,
        default="Qwen/Qwen3-4B",
        help="Model name or path to load (also used in reporting)"
    )

    parser.add_argument(
        "--attn-implementation",
        type=str,
        default="auto",
        choices=["auto", "flash_attention_2", "flash_attention_3_hopper", "sdpa", "eager"],
        help="Attention implementation to use"
    )

    parser.add_argument(
        "--num-requests",
        type=int,
        default=10,
        help="Number of inference requests"
    )

    parser.add_argument(
        "--prompt-length",
        type=int,
        default=512,
        help="Prompt length in tokens"
    )

    parser.add_argument(
        "--generation-length",
        type=int,
        default=100,
        help="Number of tokens to generate"
    )

    parser.add_argument(
        "--warmup-requests",
        type=int,
        default=2,
        help="Number of warmup requests"
    )

    parser.add_argument(
        "--device-id",
        type=int,
        default=0,
        help="GPU device ID"
    )

    parser.add_argument(
        "--output-dir",
        type=str,
        default="./results",
        help="Output directory for results"
    )

    args = parser.parse_args()

    # Set environment variables for HuggingFace cache
    if Path(args.model_path).exists():
        os.environ['HF_HOME'] = args.model_path
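        # NOTE (caveat): huggingface_hub typically resolves HF_HOME when it is first
        # imported (which already happened above via transformers), so setting it here
        # may not redirect the cache; exporting HF_HOME before launching the script is
        # the more reliable option.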

    benchmark_inference(
        model_name_or_path=args.model_name,
        attn_implementation=args.attn_implementation,
        num_requests=args.num_requests,
        prompt_length=args.prompt_length,
        generation_length=args.generation_length,
        warmup_requests=args.warmup_requests,
        device="cuda",
        device_id=args.device_id,
        output_dir=args.output_dir,
        verbose=True
    )


if __name__ == "__main__":
    main()