Initial commit

utils/metrics.py (new file, 473 lines)

@@ -0,0 +1,473 @@
"""
Metrics Collection and Reporting for LLM Benchmarking

Provides centralized metrics collection, aggregation, and reporting.
"""

import json
import csv
from dataclasses import dataclass, asdict, field
from typing import Dict, List, Optional, Any
from pathlib import Path
import time


@dataclass
class StageMetrics:
    """Metrics for a specific stage (e.g., forward pass, prefill, etc.)."""
    stage_name: str
    duration_ms: float
    tokens_processed: int
    tokens_per_second: float
    energy_joules: float
    energy_per_token: float
    avg_power_watts: float
    peak_memory_gb: float
    avg_gpu_util_percent: float

    def to_dict(self) -> Dict[str, Any]:
        """Convert to dictionary."""
        return asdict(self)
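
# The derived fields are expected to satisfy, approximately (a consistency
# sketch, not enforced by the class):
#
#     tokens_per_second ≈ tokens_processed / (duration_ms / 1000)
#     energy_per_token  ≈ energy_joules / tokens_processed      # J/token
#     avg_power_watts   ≈ energy_joules / (duration_ms / 1000)  # W = J/s
#
# e.g. 1024 tokens in 100.5 ms at 25.3 J gives ~10,189 tokens/s,
# ~0.0247 J/token, and ~251.7 W, matching the sample in __main__ below.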


@dataclass
class PretrainMetrics:
    """Metrics for pretraining benchmark."""
    model_name: str
    gpu_name: str
    attention_implementation: str
    batch_size: int
    sequence_length: int
    num_steps: int

    # Stage-specific metrics
    forward: StageMetrics
    backward: StageMetrics
    optimizer: StageMetrics

    # Overall metrics
    total_duration_ms: float
    total_tokens: int
    total_tokens_per_second: float
    total_energy_joules: float
    total_energy_per_token: float

    timestamp: float = field(default_factory=time.time)

    def to_dict(self) -> Dict[str, Any]:
        """Convert to dictionary."""
        return {
            "model_name": self.model_name,
            "gpu_name": self.gpu_name,
            "attention_implementation": self.attention_implementation,
            "batch_size": self.batch_size,
            "sequence_length": self.sequence_length,
            "num_steps": self.num_steps,
            "forward": self.forward.to_dict(),
            "backward": self.backward.to_dict(),
            "optimizer": self.optimizer.to_dict(),
            "total_duration_ms": self.total_duration_ms,
            "total_tokens": self.total_tokens,
            "total_tokens_per_second": self.total_tokens_per_second,
            "total_energy_joules": self.total_energy_joules,
            "total_energy_per_token": self.total_energy_per_token,
            "timestamp": self.timestamp,
        }
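
# Consistency sketch using the sample values in __main__ (illustrative, not
# enforced): the stage durations sum to the total,
#
#     100.5 + 205.2 + 15.3 = 321.0 ms,
#
# and total_tokens_per_second ≈ total_tokens / (total_duration_ms / 1000),
# i.e. 10240 / 0.321 ≈ 31,900 tokens/s.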


@dataclass
class InferenceMetrics:
    """Metrics for inference benchmark."""
    model_name: str
    gpu_name: str
    attention_implementation: str
    num_requests: int
    prompt_length: int
    generation_length: int

    # Stage-specific metrics
    prefill: StageMetrics  # Time to First Token
    decode: StageMetrics  # Inter-Token Latency

    # End-to-end metrics
    e2e_latency_ms: float
    e2e_tokens_per_second: float
    e2e_energy_joules: float
    e2e_energy_per_token: float

    # Additional metrics
    ttft_ms: float  # Time to First Token (same as prefill duration)
    itl_ms: float  # Inter-Token Latency (decode duration / num_tokens)

    timestamp: float = field(default_factory=time.time)

    def to_dict(self) -> Dict[str, Any]:
        """Convert to dictionary."""
        return {
            "model_name": self.model_name,
            "gpu_name": self.gpu_name,
            "attention_implementation": self.attention_implementation,
            "num_requests": self.num_requests,
            "prompt_length": self.prompt_length,
            "generation_length": self.generation_length,
            "prefill": self.prefill.to_dict(),
            "decode": self.decode.to_dict(),
            "e2e_latency_ms": self.e2e_latency_ms,
            "e2e_tokens_per_second": self.e2e_tokens_per_second,
            "e2e_energy_joules": self.e2e_energy_joules,
            "e2e_energy_per_token": self.e2e_energy_per_token,
            "ttft_ms": self.ttft_ms,
            "itl_ms": self.itl_ms,
            "timestamp": self.timestamp,
        }
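
# Consistency sketch using the sample values in __main__ (illustrative, not
# enforced by the class):
#
#     e2e_latency_ms ≈ prefill.duration_ms + decode.duration_ms
#                      (45.2 + 223.5 = 268.7 ms)
#     ttft_ms        = prefill.duration_ms            (45.2 ms)
#     itl_ms         = decode.duration_ms / decode.tokens_processed
#                      (223.5 / 100 = 2.235 ms/token)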


class MetricsCollector:
    """Collects metrics during benchmark runs."""

    def __init__(self):
        """Initialize metrics collector."""
        self.metrics_history: List[Dict[str, Any]] = []

    def add_pretrain_metrics(self, metrics: PretrainMetrics):
        """Add pretraining metrics."""
        self.metrics_history.append({
            "type": "pretrain",
            "metrics": metrics.to_dict()
        })

    def add_inference_metrics(self, metrics: InferenceMetrics):
        """Add inference metrics."""
        self.metrics_history.append({
            "type": "inference",
            "metrics": metrics.to_dict()
        })

    def get_all_metrics(self) -> List[Dict[str, Any]]:
        """Get all collected metrics."""
        return self.metrics_history

    def clear(self):
        """Clear all metrics."""
        self.metrics_history.clear()
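
# A minimal usage sketch (`pm` stands in for a populated PretrainMetrics;
# the output path is illustrative):
#
#     collector = MetricsCollector()
#     collector.add_pretrain_metrics(pm)
#     with open("runs.json", "w") as f:
#         json.dump(collector.get_all_metrics(), f, indent=2)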


class MetricsReporter:
    """Formats and outputs benchmark results."""

    @staticmethod
    def print_pretrain_metrics(metrics: PretrainMetrics, verbose: bool = True):
        """Print pretraining metrics to console."""
        print("\n" + "=" * 80)
        print("PRETRAINING BENCHMARK RESULTS")
        print("=" * 80)
        print(f"\nModel: {metrics.model_name}")
        print(f"GPU: {metrics.gpu_name}")
        print(f"Attention: {metrics.attention_implementation}")
        print(f"Batch Size: {metrics.batch_size}")
        print(f"Sequence Length: {metrics.sequence_length}")
        print(f"Training Steps: {metrics.num_steps}")

        print("\n" + "-" * 80)
        print("STAGE BREAKDOWN")
        print("-" * 80)

        # Forward pass
        print(f"\n[1] FORWARD PASS")
        MetricsReporter._print_stage_metrics(metrics.forward, verbose)

        # Backward pass
        print(f"\n[2] BACKWARD PASS")
        MetricsReporter._print_stage_metrics(metrics.backward, verbose)

        # Optimizer step
        print(f"\n[3] OPTIMIZER STEP")
        MetricsReporter._print_stage_metrics(metrics.optimizer, verbose)

        # Overall
        print("\n" + "-" * 80)
        print("OVERALL METRICS")
        print("-" * 80)
        print(f"  Total Duration: {metrics.total_duration_ms:>10.2f} ms")
        print(f"  Total Tokens: {metrics.total_tokens:>10,}")
        print(f"  Throughput: {metrics.total_tokens_per_second:>10.2f} tokens/s")
        print(f"  Total Energy: {metrics.total_energy_joules:>10.2f} J")
        print(f"  Energy per Token: {metrics.total_energy_per_token*1000:>10.4f} mJ/token")
        print("=" * 80 + "\n")

    @staticmethod
    def print_inference_metrics(metrics: InferenceMetrics, verbose: bool = True):
        """Print inference metrics to console."""
        print("\n" + "=" * 80)
        print("INFERENCE BENCHMARK RESULTS")
        print("=" * 80)
        print(f"\nModel: {metrics.model_name}")
        print(f"GPU: {metrics.gpu_name}")
        print(f"Attention: {metrics.attention_implementation}")
        print(f"Requests: {metrics.num_requests}")
        print(f"Prompt Length: {metrics.prompt_length}")
        print(f"Generation Length: {metrics.generation_length}")

        print("\n" + "-" * 80)
        print("STAGE BREAKDOWN")
        print("-" * 80)

        # Prefill
        print(f"\n[1] PREFILL (Time to First Token)")
        MetricsReporter._print_stage_metrics(metrics.prefill, verbose)
        print(f"  TTFT: {metrics.ttft_ms:>10.2f} ms")

        # Decode
        print(f"\n[2] DECODE (Inter-Token Latency)")
        MetricsReporter._print_stage_metrics(metrics.decode, verbose)
        print(f"  ITL: {metrics.itl_ms:>10.2f} ms/token")

        # End-to-end
        print("\n" + "-" * 80)
        print("END-TO-END METRICS")
        print("-" * 80)
        print(f"  Request Latency: {metrics.e2e_latency_ms:>10.2f} ms")
        print(f"  Throughput: {metrics.e2e_tokens_per_second:>10.2f} tokens/s")
        print(f"  Total Energy: {metrics.e2e_energy_joules:>10.2f} J")
        print(f"  Energy per Token: {metrics.e2e_energy_per_token*1000:>10.4f} mJ/token")
        print("=" * 80 + "\n")

    @staticmethod
    def _print_stage_metrics(stage: StageMetrics, verbose: bool = True):
        """Print metrics for a single stage."""
        print(f"  Duration: {stage.duration_ms:>10.2f} ms")
        print(f"  Tokens: {stage.tokens_processed:>10,}")
        print(f"  Throughput: {stage.tokens_per_second:>10.2f} tokens/s")
        print(f"  Energy: {stage.energy_joules:>10.2f} J")
        print(f"  Energy per Token: {stage.energy_per_token*1000:>10.4f} mJ/token")

        if verbose:
            print(f"  Avg Power: {stage.avg_power_watts:>10.2f} W")
            print(f"  Peak Memory: {stage.peak_memory_gb:>10.2f} GB")
            print(f"  Avg GPU Utilization: {stage.avg_gpu_util_percent:>10.1f} %")

    @staticmethod
    def save_json(metrics: Any, output_path: Path):
        """
        Save metrics to JSON file.

        Args:
            metrics: PretrainMetrics or InferenceMetrics object
            output_path: Path to output JSON file
        """
        output_path.parent.mkdir(parents=True, exist_ok=True)

        with open(output_path, 'w') as f:
            json.dump(metrics.to_dict(), f, indent=2)

        print(f"Metrics saved to: {output_path}")

    @staticmethod
    def save_csv(metrics_list: List[Any], output_path: Path, benchmark_type: str = "pretrain"):
        """
        Save multiple metrics to CSV file for comparison.

        Args:
            metrics_list: List of PretrainMetrics or InferenceMetrics objects
            output_path: Path to output CSV file
            benchmark_type: "pretrain" or "inference"
        """
        if not metrics_list:
            print("No metrics to save")
            return

        output_path.parent.mkdir(parents=True, exist_ok=True)

        with open(output_path, 'w', newline='') as f:
            if benchmark_type == "pretrain":
                MetricsReporter._save_pretrain_csv(metrics_list, f)
            else:
                MetricsReporter._save_inference_csv(metrics_list, f)

        print(f"CSV saved to: {output_path}")

    @staticmethod
    def _save_pretrain_csv(metrics_list: List[PretrainMetrics], file):
        """Save pretraining metrics to CSV."""
        fieldnames = [
            'gpu_name', 'attention_implementation', 'batch_size', 'sequence_length', 'num_steps',
            'forward_duration_ms', 'forward_tokens_per_sec', 'forward_energy_j', 'forward_energy_per_token_mj',
            'backward_duration_ms', 'backward_tokens_per_sec', 'backward_energy_j', 'backward_energy_per_token_mj',
            'optimizer_duration_ms', 'optimizer_tokens_per_sec', 'optimizer_energy_j', 'optimizer_energy_per_token_mj',
            'total_duration_ms', 'total_tokens_per_sec', 'total_energy_j', 'total_energy_per_token_mj',
            'timestamp'
        ]

        writer = csv.DictWriter(file, fieldnames=fieldnames)
        writer.writeheader()

        for m in metrics_list:
            writer.writerow({
                'gpu_name': m.gpu_name,
                'attention_implementation': m.attention_implementation,
                'batch_size': m.batch_size,
                'sequence_length': m.sequence_length,
                'num_steps': m.num_steps,
                'forward_duration_ms': m.forward.duration_ms,
                'forward_tokens_per_sec': m.forward.tokens_per_second,
                'forward_energy_j': m.forward.energy_joules,
                'forward_energy_per_token_mj': m.forward.energy_per_token * 1000,
                'backward_duration_ms': m.backward.duration_ms,
                'backward_tokens_per_sec': m.backward.tokens_per_second,
                'backward_energy_j': m.backward.energy_joules,
                'backward_energy_per_token_mj': m.backward.energy_per_token * 1000,
                'optimizer_duration_ms': m.optimizer.duration_ms,
                'optimizer_tokens_per_sec': m.optimizer.tokens_per_second,
                'optimizer_energy_j': m.optimizer.energy_joules,
                'optimizer_energy_per_token_mj': m.optimizer.energy_per_token * 1000,
                'total_duration_ms': m.total_duration_ms,
                'total_tokens_per_sec': m.total_tokens_per_second,
                'total_energy_j': m.total_energy_joules,
                'total_energy_per_token_mj': m.total_energy_per_token * 1000,
                'timestamp': m.timestamp,
            })

    @staticmethod
    def _save_inference_csv(metrics_list: List[InferenceMetrics], file):
        """Save inference metrics to CSV."""
        fieldnames = [
            'gpu_name', 'attention_implementation', 'num_requests', 'prompt_length', 'generation_length',
            'prefill_duration_ms', 'prefill_tokens_per_sec', 'prefill_energy_j', 'prefill_energy_per_token_mj',
            'ttft_ms',
            'decode_duration_ms', 'decode_tokens_per_sec', 'decode_energy_j', 'decode_energy_per_token_mj',
            'itl_ms',
            'e2e_latency_ms', 'e2e_tokens_per_sec', 'e2e_energy_j', 'e2e_energy_per_token_mj',
            'timestamp'
        ]

        writer = csv.DictWriter(file, fieldnames=fieldnames)
        writer.writeheader()

        for m in metrics_list:
            writer.writerow({
                'gpu_name': m.gpu_name,
                'attention_implementation': m.attention_implementation,
                'num_requests': m.num_requests,
                'prompt_length': m.prompt_length,
                'generation_length': m.generation_length,
                'prefill_duration_ms': m.prefill.duration_ms,
                'prefill_tokens_per_sec': m.prefill.tokens_per_second,
                'prefill_energy_j': m.prefill.energy_joules,
                'prefill_energy_per_token_mj': m.prefill.energy_per_token * 1000,
                'ttft_ms': m.ttft_ms,
                'decode_duration_ms': m.decode.duration_ms,
                'decode_tokens_per_sec': m.decode.tokens_per_second,
                'decode_energy_j': m.decode.energy_joules,
                'decode_energy_per_token_mj': m.decode.energy_per_token * 1000,
                'itl_ms': m.itl_ms,
                'e2e_latency_ms': m.e2e_latency_ms,
                'e2e_tokens_per_sec': m.e2e_tokens_per_second,
                'e2e_energy_j': m.e2e_energy_joules,
                'e2e_energy_per_token_mj': m.e2e_energy_per_token * 1000,
                'timestamp': m.timestamp,
            })


if __name__ == "__main__":
    """Test metrics reporting."""
    # Create sample pretraining metrics
    forward = StageMetrics(
        stage_name="forward",
        duration_ms=100.5,
        tokens_processed=1024,
        tokens_per_second=10189.3,
        energy_joules=25.3,
        energy_per_token=0.0247,
        avg_power_watts=251.7,
        peak_memory_gb=45.2,
        avg_gpu_util_percent=95.3
    )

    backward = StageMetrics(
        stage_name="backward",
        duration_ms=205.2,
        tokens_processed=1024,
        tokens_per_second=4991.2,
        energy_joules=51.6,
        energy_per_token=0.0504,
        avg_power_watts=251.5,
        peak_memory_gb=48.6,
        avg_gpu_util_percent=97.1
    )

    optimizer = StageMetrics(
        stage_name="optimizer",
        duration_ms=15.3,
        tokens_processed=1024,
        tokens_per_second=66928.1,
        energy_joules=3.8,
        energy_per_token=0.0037,
        avg_power_watts=248.4,
        peak_memory_gb=48.6,
        avg_gpu_util_percent=42.1
    )

    pretrain_metrics = PretrainMetrics(
        model_name="Qwen/Qwen2.5-3B-Instruct",
        gpu_name="NVIDIA A100 80GB",
        attention_implementation="flash_attention_2",
        batch_size=8,
        sequence_length=2048,
        num_steps=10,
        forward=forward,
        backward=backward,
        optimizer=optimizer,
        total_duration_ms=321.0,
        total_tokens=10240,
        total_tokens_per_second=31900.3,
        total_energy_joules=80.7,
        total_energy_per_token=0.00788
    )

    # Print pretrain metrics
    MetricsReporter.print_pretrain_metrics(pretrain_metrics)

    # Create sample inference metrics
    prefill = StageMetrics(
        stage_name="prefill",
        duration_ms=45.2,
        tokens_processed=512,
        tokens_per_second=11327.4,
        energy_joules=11.3,
        energy_per_token=0.0221,
        avg_power_watts=250.0,
        peak_memory_gb=42.1,
        avg_gpu_util_percent=89.2
    )

    decode = StageMetrics(
        stage_name="decode",
        duration_ms=223.5,
        tokens_processed=100,
        tokens_per_second=447.4,
        energy_joules=55.9,
        energy_per_token=0.559,
        avg_power_watts=250.1,
        peak_memory_gb=42.1,
        avg_gpu_util_percent=62.3
    )

    inference_metrics = InferenceMetrics(
        model_name="Qwen/Qwen2.5-3B-Instruct",
        gpu_name="NVIDIA A100 80GB",
        attention_implementation="flash_attention_2",
        num_requests=10,
        prompt_length=512,
        generation_length=100,
        prefill=prefill,
        decode=decode,
        e2e_latency_ms=268.7,
        e2e_tokens_per_second=2277.9,
        e2e_energy_joules=67.2,
        e2e_energy_per_token=0.110,
        ttft_ms=45.2,
        itl_ms=2.235
    )

    # Print inference metrics
    MetricsReporter.print_inference_metrics(inference_metrics)
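
    # Optional: persist the sample metrics (a sketch; paths are illustrative).
    # Uncomment to write files under ./results/:
    #
    #     MetricsReporter.save_json(pretrain_metrics, Path("results/pretrain_sample.json"))
    #     MetricsReporter.save_csv([inference_metrics], Path("results/inference_sample.csv"),
    #                              benchmark_type="inference")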