Initial commit

Bole Ma
2026-02-05 23:18:26 +01:00
commit 747c92ac6b
31 changed files with 4220 additions and 0 deletions

408
.gitignore vendored Normal file

@@ -0,0 +1,408 @@
# READ THIS BEFORE YOU REFACTOR ME
#
# setup.py uses the list of patterns in this file to decide
# what to delete, but it's not 100% sound. So, for example,
# if you delete aten/build/ because it's redundant with build/,
# aten/build/ will stop being cleaned. So be careful when
# refactoring this file!
## Model cache
.md
model_cache/
## PyTorch
.coverage
coverage.xml
.dmypy.json
.gradle
.hypothesis
.mypy_cache
.additional_ci_files
.lintrunner.private.toml
/.extracted_scripts/
**/.pytorch_specified_test_cases.csv
**/.pytorch-disabled-tests.json
*/*.pyc
*/*.so*
*/**/__pycache__
*/**/*.dylib*
*/**/*.pyc
*/**/*.pyd
*/**/*.so*
*/**/**/*.pyc
*/**/**/**/*.pyc
*/**/**/**/**/*.pyc
aten/build/
aten/src/ATen/Config.h
aten/src/ATen/cuda/CUDAConfig.h
aten/src/ATen/hip/HIPConfig.h
benchmarks/.data
caffe2/cpp_test/
dist/
docs/build/
docs/cpp/src
docs/src/**/*
docs/cpp/build
docs/cpp/source/api
docs/cpp/source/html/
docs/cpp/source/latex/
docs/source/compile/generated/
docs/source/generated/
log
usage_log.txt
usage_log*
test-reports/
test/*.bak
test/**/*.bak
test/.coverage
test/.hypothesis/
test/cpp/api/mnist
test/custom_operator/model.pt
test/debug/
test/jit_hooks/*.pt
test/data/legacy_modules.t7
test/data/*.pt
test/forward_backward_compatibility/nightly_schemas.txt
dropout_model.pt
test/generated_type_hints_smoketest.py
test/htmlcov
test/cpp_extensions/**/install
test/kernel.errors.txt
third_party/build/
third_party/nccl/
tools/coverage_plugins_package/pip-wheel-metadata/
tools/shared/_utils_internal.py
tools/fast_nvcc/wrap_nvcc.sh
tools/fast_nvcc/wrap_nvcc.bat
tools/fast_nvcc/tmp/
torch.egg-info/
torch/_C/__init__.pyi
torch/_C/_nn.pyi
torch/_C/_VariableFunctions.pyi
torch/_VF.pyi
torch/return_types.pyi
torch/nn/functional.pyi
torch/utils/data/datapipes/datapipe.pyi
torch/csrc/autograd/generated/*
torch/csrc/functionalization/generated/*
torch/csrc/lazy/generated/*.[!m]*
torch_compile_debug/
# Listed manually because some files in this directory are not generated
torch/testing/_internal/generated/annotated_fn_args.py
torch/testing/_internal/data/*.pt
torch/headeronly/version.h
torch/csrc/cudnn/cuDNN.cpp
torch/csrc/generated
torch/csrc/generic/TensorMethods.cpp
torch/csrc/inductor/aoti_torch/generated/*.cpp
torch/csrc/inductor/aoti_torch/generated/extend/*
torch/csrc/jit/generated/*
torch/csrc/jit/fuser/config.h
torch/csrc/nn/THCUNN.cpp
torch/csrc/nn/THCUNN.cwrap
torch/bin/
torch/cmake/
torch/lib/*.a*
torch/lib/*.dll*
torch/lib/*.exe*
torch/lib/*.dylib*
torch/lib/*.h
torch/lib/*.lib
torch/lib/*.pdb
torch/lib/*.so*
torch/lib/protobuf*.pc
torch/lib/build
torch/lib/caffe2/
torch/lib/cmake
torch/lib/include
torch/lib/pkgconfig
torch/lib/protoc
torch/lib/protobuf/
torch/lib/tmp_install
torch/lib/torch_shm_manager
torch/lib/site-packages/
torch/lib/python*
torch/lib64
torch/include/
torch/share/
torch/test/
torch/utils/benchmark/utils/valgrind_wrapper/callgrind.h
torch/utils/benchmark/utils/valgrind_wrapper/valgrind.h
torch/version.py
torch/_inductor/kernel/vendored_templates/*
test/inductor/test_tlx*
minifier_launcher.py
aten/src/ATen/native/transformers/hip/flash_attn/ck/fav_v3/asm_fmha_v3_bwd_configs.hpp
aten/src/ATen/native/transformers/hip/flash_attn/ck/fav_v3/mha_bwd.hip
aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_fwd_d*
aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_bwd_d*
aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_bwd_convert*
aten/src/ATen/native/transformers/hip/flash_attn/ck/fwd_blob*
aten/src/ATen/native/transformers/hip/flash_attn/ck/bwd_blob*
aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_fwd_api*
aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_bwd_api*
# Root level file used in CI to specify certain env configs.
# E.g., see .circleci/config.yaml
env
.circleci/scripts/COMMIT_MSG
scripts/release_notes/*.json
sccache-stats*.json
lint.json
merge_record.json
.github/scripts/nightly_source_matrix.json
# These files get copied over on invoking setup.py
torchgen/packaged/*
!torchgen/packaged/README.md
# This file is injected by ROCm build scripts to bootstrap in torch/__init__.py.
torch/_rocm_init.py
# IPython notebook checkpoints
.ipynb_checkpoints
# Editor temporaries
*.swa
*.swb
*.swc
*.swd
*.swe
*.swf
*.swg
*.swh
*.swi
*.swj
*.swk
*.swl
*.swm
*.swn
*.swo
*.swp
*~
.~lock.*
# macOS dir files
.DS_Store
# Ninja files
.ninja_deps
.ninja_log
compile_commands.json
*.egg-info/
docs/source/scripts/activation_images/
docs/source/scripts/quantization_backend_configs/
docs/source/scripts/lr_scheduler_images/
## General
# Compiled Object files
*.slo
*.lo
*.o
*.cuo
*.obj
# Compiled Dynamic libraries
*.so
*.dylib
*.dll
# Compiled Static libraries
*.lai
*.la
*.a
*.lib
# Compiled protocol buffers
*.pb.h
*.pb.cc
*_pb2.py
# Compiled python
*.pyc
*.pyd
# Compiled MATLAB
*.mex*
# NFS handle files
**/.nfs*
# Sublime Text settings
*.sublime-workspace
*.sublime-project
# Eclipse Project settings
*.*project
.settings
# QtCreator files
*.user
# PyCharm files
.idea
# GDB history
.gdb_history
## Caffe2
# build, distribute, and bins (+ python proto bindings)
build/
# Allow tools/build/ for build support.
!tools/build/
build_host_protoc
build_android
build_ios
.build_debug/*
.build_release/*
.build_profile/*
distribute/*
*.testbin
*.bin
cmake_build
.cmake_build
gen
.setuptools-cmake-build
.pytest_cache
aten/build/*
# Linker scripts for prioritized text optimization
cmake/linker_script.ld
# Bram
plsdontbreak
# Generated documentation
docs/_site
docs/gathered
_site
doxygen
docs/dev
# LevelDB files
*.sst
*.ldb
LOCK
CURRENT
MANIFEST-*
# generated version file
caffe2/version.py
# setup.py intermediates
.eggs
caffe2.egg-info
MANIFEST
# Atom/Watchman required file
.watchmanconfig
.watchman
# Files generated by CLion
cmake-build-debug
# BEGIN NOT-CLEAN-FILES (setup.py handles this marker. Do not change.)
#
# Below files are not deleted by "setup.py clean".
# Downloaded bazel
tools/bazel
# Visual Studio Code files
.vs
/.vscode/*
!/.vscode/extensions.json
!/.vscode/settings_recommended.json
# YouCompleteMe config file
.ycm_extra_conf.py
# Files generated when a patch is rejected
*.orig
*.rej
# Files generated by ctags
CTAGS
GTAGS
GRTAGS
GSYMS
GPATH
tags
TAGS
# ccls file
.ccls-cache/
# clang tooling storage location
.clang-format-bin
.clang-tidy-bin
.lintbin
# clangd background index
.clangd/
.cache/
# bazel symlinks
bazel-*
# xla repo
xla/
# direnv, posh-direnv
.env
.envrc
.psenvrc
# generated shellcheck directories
.shellcheck_generated*/
# zip archives
*.zip
# core dump files
**/core.[1-9]*
# Generated if you use the pre-commit script for clang-tidy
pr.diff
# coverage files
*/**/.coverage.*
# buck generated files
.buckd/
.lsp-buck-out/
.lsp.buckd/
buck-out/
# Downloaded libraries
third_party/ruy/
third_party/glog/
# Virtualenv
.venv/
venv/
# Log files
*.log
sweep/
# Android build artifacts
android/pytorch_android/.cxx
android/pytorch_android_torchvision/.cxx
# Pyre configs (for internal usage)
.pyre_configuration
.pyre_configuration.codenav
.arcconfig
.stable_pyre_client
.pyre_client
# Claude Code local configuration
CLAUDE.local.md
/test_*.py
/debug_*.py
CLAUDE_CONTEXT/
/.claude/settings.local.json

100
AMD_FIX_SUMMARY.md Normal file

@@ -0,0 +1,100 @@
# AMD GPU Monitoring Fix Summary
## Issue
The AMDMonitor class was using incorrect pyrsmi API calls. The implementation attempted to use the low-level `rocmsmi` module, which has complex initialization and function signatures.
## Solution
Updated to use the correct `rocml` high-level API from pyrsmi, based on the official example at:
`/anvme/workspace/ihpc125h-llm-profiles/pyrsmi/examples/llm_monitoring/monitor_llm_inference.py`
## Changes Made
### 1. Fixed AMDMonitor Class
**Before** (incorrect):
```python
from pyrsmi import rocmsmi
ret = self.rocmsmi.rsmi_init(0)
power_uw = self.rocmsmi.rsmi_dev_power_ave_get(self.device_id)
```
**After** (correct):
```python
from pyrsmi import rocml
self.rocml.smi_initialize()
power_watts = self.rocml.smi_get_device_average_power(self.device_id)
```
**Key API Functions**:
- `rocml.smi_initialize()` - Initialize monitoring
- `rocml.smi_get_device_average_power(device_id)` - Get power in Watts (not microwatts!)
- `rocml.smi_get_device_utilization(device_id)` - Get GPU utilization %
- `rocml.smi_get_device_memory_used(device_id)` - Get memory used in bytes
- `rocml.smi_get_device_memory_total(device_id)` - Get total memory in bytes
- `rocml.smi_get_device_temperature(device_id)` - Get temperature
- `rocml.smi_get_device_name(device_id)` - Get device name
- `rocml.smi_shutdown()` - Cleanup
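For reference, here is a minimal monitoring sketch assembled only from the functions listed above. It is an illustration under the assumption that these calls behave as described; it is not the actual `AMDMonitor` implementation in `utils/gpu_monitor.py`, and the class and method names below are made up:
```python
from pyrsmi import rocml


class MinimalAMDMonitor:
    """Illustrative wrapper around the rocml high-level API (sketch only)."""

    def __init__(self, device_id: int = 0):
        self.device_id = device_id
        rocml.smi_initialize()  # must be called before any query

    def snapshot(self) -> dict:
        used = rocml.smi_get_device_memory_used(self.device_id)    # bytes
        total = rocml.smi_get_device_memory_total(self.device_id)  # bytes
        return {
            "name": rocml.smi_get_device_name(self.device_id),
            "power_watts": rocml.smi_get_device_average_power(self.device_id),  # already in Watts
            "gpu_util_percent": rocml.smi_get_device_utilization(self.device_id),
            "memory_used_gb": used / 1e9,
            "memory_total_gb": total / 1e9,
        }

    def cleanup(self):
        rocml.smi_shutdown()


if __name__ == "__main__":
    m = MinimalAMDMonitor(0)
    print(m.snapshot())
    m.cleanup()
```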
### 2. Updated All SLURM Scripts for Apptainer
All GPU benchmark scripts now run inside the apptainer container:
**A100, H100, H200** (NVIDIA):
```bash
APPTAINER_IMAGE="/anvme/workspace/ihpc125h-llm-profiles/pytorch_25.10_tilelang.sif"
apptainer exec --nv $APPTAINER_IMAGE python run_benchmark.py ...
```
**MI300X** (AMD):
```bash
APPTAINER_IMAGE="/anvme/workspace/ihpc125h-llm-profiles/pytorch_25.10_tilelang.sif"
apptainer exec --rocm $APPTAINER_IMAGE python run_benchmark.py ...
```
Note: `--nv` for NVIDIA, `--rocm` for AMD
### 3. Updated Documentation
- README.md now mentions apptainer usage
- Updated setup instructions to use apptainer for model caching
- Added notes about container flags (--nv vs --rocm)
## Testing
To verify the AMD monitoring works:
```bash
# Inside apptainer on MI300X node
apptainer exec --rocm pytorch_25.10_tilelang.sif python -c "
from utils.gpu_monitor import AMDMonitor
m = AMDMonitor(0)
print(f'GPU: {m.get_device_name()}')
metrics = m.get_metrics()
print(f'Power: {metrics.power_watts:.2f} W')
print(f'Utilization: {metrics.gpu_utilization_percent:.1f}%')
print(f'Memory: {metrics.memory_used_gb:.2f} / {metrics.memory_total_gb:.2f} GB')
m.cleanup()
"
```
## Files Modified
1. `/anvme/workspace/ihpc125h-llm-profiles/llm-benchmark/utils/gpu_monitor.py` - Fixed AMDMonitor class
2. `/anvme/workspace/ihpc125h-llm-profiles/llm-benchmark/slurm_a100.sh` - Added apptainer
3. `/anvme/workspace/ihpc125h-llm-profiles/llm-benchmark/slurm_h100.sh` - Added apptainer
4. `/anvme/workspace/ihpc125h-llm-profiles/llm-benchmark/slurm_h200.sh` - Added apptainer
5. `/anvme/workspace/ihpc125h-llm-profiles/llm-benchmark/slurm_mi300x.sh` - Added apptainer with --rocm
6. `/anvme/workspace/ihpc125h-llm-profiles/llm-benchmark/README.md` - Updated documentation
## Key Differences: rocml vs rocmsmi
| Feature | rocml (High-level) | rocmsmi (Low-level) |
|---------|-------------------|---------------------|
| API Style | Simple functions | Complex C-style API |
| Initialization | `smi_initialize()` | `rsmi_init(0)` + error codes |
| Power | Returns Watts | Returns microwatts |
| Memory | Returns bytes | Returns bytes via enums |
| Error Handling | Returns -1 on error | Returns error codes |
| Ease of Use | Much easier | Complex |
The `rocml` module is the recommended high-level Python API for pyrsmi.

311
README.md Normal file

@@ -0,0 +1,311 @@
# LLM Benchmark Suite
A comprehensive benchmarking suite for comparing LLM performance (Qwen3-4B) across different GPU architectures: **MI300X**, **A100 80G**, **H100**, and **H200**.
## Features
- **Pretraining Benchmarks**: Separate metrics for forward, backward, and optimizer stages
- **Inference Benchmarks**: Separate metrics for prefill (TTFT) and decode (ITL) stages
- **Energy Monitoring**: GPU-specific energy and power measurement
- NVIDIA: pynvml
- AMD: pyrsmi
- **Attention Implementations**:
- FlashAttention-2 (A100, MI300X)
- FlashAttention-3 Hopper (H100, H200)
- Configurable via CLI
- **Comprehensive Metrics**:
- Tokens per second
- Energy per token
- Time to First Token (TTFT)
- Inter-Token Latency (ITL)
- End-to-End Request Latency
- GPU utilization and memory usage
## Directory Structure
```
llm-benchmark/
├── cache_model.py # Model caching script
├── benchmark_pretrain.py # Pretraining benchmark
├── benchmark_inference.py # Inference benchmark
├── run_benchmark.py # Main orchestration script
├── requirements.txt # Python dependencies
├── utils/
│ ├── gpu_monitor.py # GPU monitoring (NVIDIA & AMD)
│ ├── metrics.py # Metrics collection and reporting
│ └── attention.py # Attention implementation helpers
├── configs/
│ ├── a100.yaml
│ ├── h100.yaml
│ ├── h200.yaml
│ └── mi300x.yaml
└── results/ # Benchmark results (JSON)
```
## Setup
### 1. Container Environment
All benchmarks should be run inside the apptainer container:
```bash
# Container is located at:
/anvme/workspace/ihpc125h-llm-profiles/pytorch_25.10_tilelang.sif
```
### 2. Install Dependencies (if not using apptainer)
If you want to run directly without apptainer:
```bash
# Install Python dependencies
pip install -r requirements.txt
# For AMD GPUs, ensure ROCm and pyrsmi are installed
# For NVIDIA GPUs, ensure CUDA and pynvml are installed
```
### 3. Cache Model (Run on Head Node)
**IMPORTANT**: Run this on the head node BEFORE allocating compute nodes, as compute nodes are typically offline.
```bash
# Using apptainer (recommended)
apptainer exec --nv pytorch_25.10_tilelang.sif python cache_model.py \
--model-name Qwen/Qwen3-4B \
--cache-dir ./model_cache
# Or directly (if dependencies installed)
python cache_model.py --model-name Qwen/Qwen3-4B --cache-dir ./model_cache
```
The model will be cached to `./model_cache` in the current directory (avoiding slow NFS $HOME).
## Usage
### Quick Start
```bash
# Run both pretraining and inference benchmarks
python run_benchmark.py --mode both --model-path ./model_cache
# Run only pretraining
python run_benchmark.py --mode pretrain --num-steps 20
# Run only inference
python run_benchmark.py --mode inference --num-requests 20
```
### Detailed Usage
#### List Available GPUs
```bash
python run_benchmark.py --list-gpus
```
#### Pretraining Benchmark
```bash
python benchmark_pretrain.py \
--model-path ./model_cache \
--model-name Qwen/Qwen3-4B \
--attn-implementation auto \
--batch-size 8 \
--sequence-length 8192 \
--num-steps 10 \
--warmup-steps 3 \
--output-dir ./results
```
**Metrics Reported** (per stage: forward, backward, optimizer):
- Duration (ms)
- Tokens processed
- Throughput (tokens/s)
- Energy (J)
- Energy per token (J/token)
- Average power (W)
- Peak memory (GB)
- GPU utilization (%)
#### Inference Benchmark
```bash
python benchmark_inference.py \
--model-path ./model_cache \
--model-name Qwen/Qwen3-4B \
--attn-implementation auto \
--num-requests 10 \
--prompt-length 512 \
--generation-length 100 \
--warmup-requests 2 \
--output-dir ./results
```
**Metrics Reported**:
- **Prefill**: TTFT, throughput, energy per token
- **Decode**: ITL, throughput, energy per token
- **End-to-End**: Request latency, total throughput, total energy
### Attention Implementations
The benchmark automatically selects the optimal attention implementation based on GPU:
- **A100, MI300X**: `flash_attention_2`
- **H100, H200**: `flash_attention_3_hopper`
Override with `--attn-implementation`:
```bash
# Force FlashAttention-3 Hopper on H100
python run_benchmark.py --attn-implementation flash_attention_3_hopper
# Use SDPA instead
python run_benchmark.py --attn-implementation sdpa
```
Available options:
- `auto` - Auto-detect based on GPU
- `flash_attention_2` - FlashAttention-2 (all GPUs)
- `flash_attention_3_hopper` - FlashAttention-3 for H100/H200
- `sdpa` - PyTorch Scaled Dot Product Attention
- `eager` - Standard PyTorch attention
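For clarity, a minimal sketch of what the `auto` rule amounts to, based on the mapping above (the real selection logic is `get_default_attention` in `utils/attention.py` and is not reproduced here; the function name below is illustrative):
```python
def pick_default_attention(gpu_name: str) -> str:
    """Map a GPU product name to an attention backend (sketch of the rule above)."""
    name = gpu_name.upper()
    if "H100" in name or "H200" in name:
        return "flash_attention_3_hopper"  # Hopper GPUs
    if "A100" in name or "MI300X" in name:
        return "flash_attention_2"
    return "sdpa"  # conservative fallback for anything else
```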
## Running on SLURM
All SLURM scripts are configured to run inside the apptainer container. First cache the model on the head node:
```bash
# On head node (with internet access)
apptainer exec --nv pytorch_25.10_tilelang.sif python cache_model.py \
--model-name Qwen/Qwen3-4B \
--cache-dir ./model_cache
```
Then submit jobs:
```bash
# A100
sbatch slurm_a100.sh
# H100
sbatch slurm_h100.sh
# H200
sbatch slurm_h200.sh
# MI300X
sbatch slurm_mi300x.sh
```
**Note**:
- NVIDIA GPUs use `--nv` flag
- AMD GPUs use `--rocm` flag
## Output
Results are saved to the `--output-dir` directory (default: `./results/`):
- `pretrain_<GPU>_<ATTENTION>.json` - Pretraining metrics
- `inference_<GPU>_<ATTENTION>.json` - Inference metrics
Example output:
```
===============================================================================
PRETRAINING BENCHMARK RESULTS
===============================================================================
Model: Qwen/Qwen3-4B
GPU: NVIDIA A100 80GB
Attention: flash_attention_2
Batch Size: 8
Sequence Length: 8192
Training Steps: 10
-------------------------------------------------------------------------------
STAGE BREAKDOWN
-------------------------------------------------------------------------------
[1] FORWARD PASS
Duration: 1005.23 ms
Tokens: 163,840
Throughput: 163,012.45 tokens/s
Energy: 253.0 J
Energy per Token: 1.5443 mJ/token
[2] BACKWARD PASS
Duration: 2052.11 ms
Tokens: 163,840
Throughput: 79,857.23 tokens/s
Energy: 516.2 J
Energy per Token: 3.1513 mJ/token
[3] OPTIMIZER STEP
Duration: 153.42 ms
Tokens: 163,840
Throughput: 1,068,012.34 tokens/s
Energy: 38.4 J
Energy per Token: 0.2344 mJ/token
-------------------------------------------------------------------------------
OVERALL METRICS
-------------------------------------------------------------------------------
Total Duration: 3210.76 ms
Total Tokens: 163,840
Throughput: 51,012.45 tokens/s
Total Energy: 807.6 J
Energy per Token: 4.9300 mJ/token
===============================================================================
```
## Key Metrics Reference
### Pretraining
- **Forward**: Input processing and loss calculation
- **Backward**: Gradient computation
- **Optimizer**: Weight updates
### Inference
- **TTFT (Time to First Token)**: Prefill latency
- **ITL (Inter-Token Latency)**: Average decode time per token
- **E2E Latency**: Total request time (prefill + decode)
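Since ITL is the average decode time per generated token, the three are related (approximately) by `E2E latency ≈ TTFT + generation_length × ITL`.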
### Energy
- **Energy (J)**: Total energy consumed
- **Energy per Token (mJ/token)**: Energy efficiency metric
- **Average Power (W)**: Power consumption during stage
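As a worked example from the sample pretraining output above: 807.6 J over 163,840 tokens gives 807.6 / 163,840 ≈ 0.00493 J/token, i.e. 4.93 mJ/token, and dividing the same energy by the 3.21 s total duration gives an average power of roughly 252 W.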
## Troubleshooting
### Model Not Found
Ensure you've cached the model first:
```bash
python cache_model.py --model-name Qwen/Qwen3-4B --cache-dir ./model_cache
```
### GPU Monitoring Errors
- **NVIDIA**: Install pynvml: `pip install pynvml`
- **AMD**: Install pyrsmi: `pip install pyrsmi`
### FlashAttention-3 Not Found
For H100/H200, ensure FlashAttention-3 is installed. If not available, use:
```bash
python run_benchmark.py --attn-implementation flash_attention_2
```
### Out of Memory
Reduce batch size or sequence length:
```bash
python run_benchmark.py --batch-size 4 --sequence-length 1024
```
## Citation
If you use this benchmark suite, please cite:
- [FlashAttention-2](https://github.com/Dao-AILab/flash-attention)
- [FlashAttention-3](https://github.com/Dao-AILab/flash-attention) (for Hopper)
- [Qwen Models](https://huggingface.co/Qwen)
## License
MIT License - see LICENSE file for details

417
benchmark_inference.py Executable file

@@ -0,0 +1,417 @@
#!/usr/bin/env python3
"""
Inference Benchmark for LLM Performance Evaluation
Measures performance and energy metrics for inference workloads with
separate measurements for prefill and decode stages.
"""
import argparse
import os
import sys
import time
from pathlib import Path
from typing import Optional
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
from tqdm import tqdm
# Add utils to path
sys.path.insert(0, str(Path(__file__).parent))
from utils.gpu_monitor import get_gpu_monitor
from utils.metrics import StageMetrics, InferenceMetrics, MetricsReporter
from utils.attention import get_default_attention, configure_model_attention, validate_attention_for_gpu
def benchmark_inference(
model_name_or_path: str,
attn_implementation: str = "auto",
num_requests: int = 10,
prompt_length: int = 512,
generation_length: int = 100,
warmup_requests: int = 2,
device: str = "cuda",
device_id: int = 0,
output_dir: Optional[str] = None,
verbose: bool = True,
):
"""
Run inference benchmark.
Args:
model_name_or_path: Path to model or HuggingFace identifier
attn_implementation: Attention implementation to use
num_requests: Number of inference requests to measure
prompt_length: Length of input prompt
generation_length: Number of tokens to generate
warmup_requests: Number of warmup requests
device: Device to use
device_id: GPU device ID
output_dir: Directory to save results
verbose: Print verbose output
"""
print("=" * 80)
print("INFERENCE BENCHMARK")
print("=" * 80)
# Initialize GPU monitor
if verbose:
print("\n[1/7] Initializing GPU monitor...")
monitor = get_gpu_monitor(device_id)
gpu_name = monitor.get_device_name()
if verbose:
print(f" GPU: {gpu_name}")
# Determine attention implementation
if attn_implementation == "auto":
attn_implementation = get_default_attention(gpu_name)
if verbose:
print(f" Auto-selected attention: {attn_implementation}")
# Validate attention for GPU
valid, warning = validate_attention_for_gpu(attn_implementation, gpu_name)
if warning and verbose:
print(f"{warning}")
# Load model
if verbose:
print(f"\n[2/7] Loading model: {model_name_or_path}")
# Determine attn_implementation parameter for model loading
load_attn = "flash_attention_2" if attn_implementation in ["flash_attention_2", "flash_attention_3_hopper"] else attn_implementation
try:
model = AutoModelForCausalLM.from_pretrained(
model_name_or_path,
torch_dtype=torch.bfloat16,
attn_implementation=load_attn,
trust_remote_code=True,
)
model = model.to(device)
# Configure attention (patch if needed for FA3)
model = configure_model_attention(model, attn_implementation, verbose=verbose)
if verbose:
total_params = sum(p.numel() for p in model.parameters())
print(f" Total parameters: {total_params:,} ({total_params/1e9:.2f}B)")
except Exception as e:
print(f"✗ Error loading model: {e}")
sys.exit(1)
# Load tokenizer
if verbose:
print(f"\n[3/7] Loading tokenizer...")
try:
tokenizer = AutoTokenizer.from_pretrained(
model_name_or_path,
trust_remote_code=True
)
except Exception as e:
print(f"✗ Error loading tokenizer: {e}")
sys.exit(1)
# Generate synthetic prompts
if verbose:
print(f"\n[4/7] Generating synthetic prompts...")
print(f" Prompt length: {prompt_length}")
print(f" Generation length: {generation_length}")
# Create random input_ids (synthetic prompts)
vocab_size = model.config.vocab_size
# We'll create one prompt and reuse it
prompt_ids = torch.randint(0, vocab_size, (1, prompt_length), device=device)
# Warmup
if verbose:
print(f"\n[5/7] Running warmup ({warmup_requests} requests)...")
model.eval()
with torch.no_grad():
for _ in range(warmup_requests):
_ = model.generate(
prompt_ids,
max_new_tokens=generation_length,
do_sample=False,
pad_token_id=tokenizer.eos_token_id
)
# Synchronize before benchmarking
torch.cuda.synchronize()
# Benchmark
if verbose:
print(f"\n[6/7] Running benchmark ({num_requests} requests)...")
# Storage for per-request metrics
prefill_times = []
decode_times = []
e2e_times = []
prefill_energies = []
decode_energies = []
e2e_energies = []
prefill_powers = []
decode_powers = []
memory_usage = []
gpu_utils = []
# For inference, we separate prefill (first token) from decode (remaining tokens)
# We'll use a custom generation loop to measure them separately
for req_idx in tqdm(range(num_requests), desc="Benchmarking"):
# === PREFILL PHASE (Time to First Token) ===
# This is the forward pass with the prompt to get the first token
monitor.start_monitoring()
torch.cuda.synchronize()
prefill_start = time.perf_counter()
with torch.no_grad():
# Forward pass with prompt
outputs = model(input_ids=prompt_ids, use_cache=True)
logits = outputs.logits
past_key_values = outputs.past_key_values
# Get first generated token
next_token_logits = logits[:, -1, :]
next_token = torch.argmax(next_token_logits, dim=-1, keepdim=True)
torch.cuda.synchronize()
prefill_time = time.perf_counter() - prefill_start
prefill_energy = monitor.get_energy_consumed()
prefill_power = monitor.get_average_power()
prefill_times.append(prefill_time * 1000) # Convert to ms
prefill_energies.append(prefill_energy)
prefill_powers.append(prefill_power)
# === DECODE PHASE (Inter-Token Latency) ===
# Generate remaining tokens one by one
monitor.start_monitoring()
torch.cuda.synchronize()
decode_start = time.perf_counter()
generated_tokens = [next_token]
with torch.no_grad():
for _ in range(generation_length - 1):
# Forward pass with single token using cached keys/values
outputs = model(
input_ids=next_token,
past_key_values=past_key_values,
use_cache=True
)
logits = outputs.logits
past_key_values = outputs.past_key_values
# Get next token
next_token_logits = logits[:, -1, :]
next_token = torch.argmax(next_token_logits, dim=-1, keepdim=True)
generated_tokens.append(next_token)
torch.cuda.synchronize()
decode_time = time.perf_counter() - decode_start
decode_energy = monitor.get_energy_consumed()
decode_power = monitor.get_average_power()
decode_times.append(decode_time * 1000) # Convert to ms
decode_energies.append(decode_energy)
decode_powers.append(decode_power)
# End-to-end metrics
e2e_time = prefill_time + decode_time
e2e_energy = prefill_energy + decode_energy
e2e_times.append(e2e_time * 1000) # Convert to ms
e2e_energies.append(e2e_energy)
# Get memory and utilization
metrics = monitor.get_metrics()
memory_usage.append(metrics.memory_used_gb)
gpu_utils.append(metrics.gpu_utilization_percent)
# Compute aggregated metrics
# Prefill metrics (TTFT)
prefill_duration_ms = sum(prefill_times)
prefill_energy_j = sum(prefill_energies)
prefill_tokens = prompt_length * num_requests
prefill_tps = prefill_tokens / (prefill_duration_ms / 1000)
prefill_ept = prefill_energy_j / prefill_tokens
avg_ttft_ms = sum(prefill_times) / len(prefill_times)
prefill_metrics = StageMetrics(
stage_name="prefill",
duration_ms=prefill_duration_ms,
tokens_processed=prefill_tokens,
tokens_per_second=prefill_tps,
energy_joules=prefill_energy_j,
energy_per_token=prefill_ept,
avg_power_watts=sum(prefill_powers) / len(prefill_powers),
peak_memory_gb=max(memory_usage),
avg_gpu_util_percent=sum(gpu_utils) / len(gpu_utils)
)
# Decode metrics (ITL)
decode_duration_ms = sum(decode_times)
decode_energy_j = sum(decode_energies)
decode_tokens = generation_length * num_requests
decode_tps = decode_tokens / (decode_duration_ms / 1000)
decode_ept = decode_energy_j / decode_tokens
avg_itl_ms = sum(decode_times) / len(decode_times) / generation_length
decode_metrics = StageMetrics(
stage_name="decode",
duration_ms=decode_duration_ms,
tokens_processed=decode_tokens,
tokens_per_second=decode_tps,
energy_joules=decode_energy_j,
energy_per_token=decode_ept,
avg_power_watts=sum(decode_powers) / len(decode_powers),
peak_memory_gb=max(memory_usage),
avg_gpu_util_percent=sum(gpu_utils) / len(gpu_utils)
)
# End-to-end metrics
e2e_latency_ms = sum(e2e_times) / len(e2e_times)
e2e_energy_j = sum(e2e_energies)
total_tokens = (prompt_length + generation_length) * num_requests
e2e_tps = total_tokens / (sum(e2e_times) / 1000)
e2e_ept = e2e_energy_j / total_tokens
# Create metrics object
metrics = InferenceMetrics(
model_name=model_name_or_path,
gpu_name=gpu_name,
attention_implementation=attn_implementation,
num_requests=num_requests,
prompt_length=prompt_length,
generation_length=generation_length,
prefill=prefill_metrics,
decode=decode_metrics,
e2e_latency_ms=e2e_latency_ms,
e2e_tokens_per_second=e2e_tps,
e2e_energy_joules=e2e_energy_j,
e2e_energy_per_token=e2e_ept,
ttft_ms=avg_ttft_ms,
itl_ms=avg_itl_ms
)
# Print results
if verbose:
print()
MetricsReporter.print_inference_metrics(metrics, verbose=verbose)
# Save results
if output_dir:
output_path = Path(output_dir)
output_path.mkdir(parents=True, exist_ok=True)
# Save JSON
json_path = output_path / f"inference_{gpu_name.replace(' ', '_')}_{attn_implementation}.json"
MetricsReporter.save_json(metrics, json_path)
# Cleanup
monitor.cleanup()
del model
torch.cuda.empty_cache()
return metrics
def main():
parser = argparse.ArgumentParser(
description="LLM Inference Benchmark",
formatter_class=argparse.RawDescriptionHelpFormatter
)
parser.add_argument(
"--model-path",
type=str,
default="./model_cache",
help="Path to cached model"
)
parser.add_argument(
"--model-name",
type=str,
default="Qwen/Qwen3-4B",
help="Model name (for reporting)"
)
parser.add_argument(
"--attn-implementation",
type=str,
default="auto",
choices=["auto", "flash_attention_2", "flash_attention_3_hopper", "sdpa", "eager"],
help="Attention implementation to use"
)
parser.add_argument(
"--num-requests",
type=int,
default=10,
help="Number of inference requests"
)
parser.add_argument(
"--prompt-length",
type=int,
default=512,
help="Prompt length in tokens"
)
parser.add_argument(
"--generation-length",
type=int,
default=100,
help="Number of tokens to generate"
)
parser.add_argument(
"--warmup-requests",
type=int,
default=2,
help="Number of warmup requests"
)
parser.add_argument(
"--device-id",
type=int,
default=0,
help="GPU device ID"
)
parser.add_argument(
"--output-dir",
type=str,
default="./results",
help="Output directory for results"
)
args = parser.parse_args()
# Set environment variables for HuggingFace cache
if Path(args.model_path).exists():
os.environ['HF_HOME'] = args.model_path
benchmark_inference(
model_name_or_path=args.model_name,
attn_implementation=args.attn_implementation,
num_requests=args.num_requests,
prompt_length=args.prompt_length,
generation_length=args.generation_length,
warmup_requests=args.warmup_requests,
device="cuda",
device_id=args.device_id,
output_dir=args.output_dir,
verbose=True
)
if __name__ == "__main__":
main()

406
benchmark_pretrain.py Executable file

@@ -0,0 +1,406 @@
#!/usr/bin/env python3
"""
Pretraining Benchmark for LLM Performance Evaluation
Measures performance and energy metrics for pretraining workloads with
separate measurements for forward, backward, and optimizer stages.
"""
import argparse
import os
import sys
import time
from pathlib import Path
from typing import Optional
import torch
import torch.nn as nn
from transformers import AutoModelForCausalLM, AutoTokenizer
from tqdm import tqdm
# Add utils to path
sys.path.insert(0, str(Path(__file__).parent))
from utils.gpu_monitor import get_gpu_monitor
from utils.metrics import StageMetrics, PretrainMetrics, MetricsReporter
from utils.attention import get_default_attention, configure_model_attention, validate_attention_for_gpu
def benchmark_pretrain(
model_name_or_path: str,
attn_implementation: str = "auto",
batch_size: int = 8,
sequence_length: int = 2048,
num_steps: int = 10,
warmup_steps: int = 3,
device: str = "cuda",
device_id: int = 0,
output_dir: Optional[str] = None,
verbose: bool = True,
):
"""
Run pretraining benchmark.
Args:
model_name_or_path: Path to model or HuggingFace identifier
attn_implementation: Attention implementation to use
batch_size: Batch size for training
sequence_length: Sequence length
num_steps: Number of training steps to measure
warmup_steps: Number of warmup steps before measurement
device: Device to use
device_id: GPU device ID
output_dir: Directory to save results
verbose: Print verbose output
"""
print("=" * 80)
print("PRETRAINING BENCHMARK")
print("=" * 80)
# Initialize GPU monitor
if verbose:
print("\n[1/6] Initializing GPU monitor...")
monitor = get_gpu_monitor(device_id)
gpu_name = monitor.get_device_name()
if verbose:
print(f" GPU: {gpu_name}")
# Determine attention implementation
if attn_implementation == "auto":
attn_implementation = get_default_attention(gpu_name)
if verbose:
print(f" Auto-selected attention: {attn_implementation}")
# Validate attention for GPU
valid, warning = validate_attention_for_gpu(attn_implementation, gpu_name)
if warning and verbose:
print(f"{warning}")
# Load model
if verbose:
print(f"\n[2/6] Loading model: {model_name_or_path}")
# Determine attn_implementation parameter for model loading
load_attn = "flash_attention_2" if attn_implementation in ["flash_attention_2", "flash_attention_3_hopper"] else attn_implementation
try:
model = AutoModelForCausalLM.from_pretrained(
model_name_or_path,
torch_dtype=torch.bfloat16,
attn_implementation=load_attn,
trust_remote_code=True,
)
model = model.to(device)
# Configure attention (patch if needed for FA3)
model = configure_model_attention(model, attn_implementation, verbose=verbose)
if verbose:
total_params = sum(p.numel() for p in model.parameters())
trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
print(f" Total parameters: {total_params:,} ({total_params/1e9:.2f}B)")
print(f" Trainable parameters: {trainable_params:,}")
except Exception as e:
print(f"✗ Error loading model: {e}")
sys.exit(1)
# Setup optimizer
if verbose:
print(f"\n[3/6] Setting up optimizer...")
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-4)
# Generate synthetic training data
if verbose:
print(f"\n[4/6] Generating synthetic training data...")
print(f" Batch size: {batch_size}")
print(f" Sequence length: {sequence_length}")
# Create random input_ids (synthetic data)
vocab_size = model.config.vocab_size
input_ids = torch.randint(0, vocab_size, (batch_size, sequence_length), device=device)
labels = input_ids.clone()
# Warmup
if verbose:
print(f"\n[5/6] Running warmup ({warmup_steps} steps)...")
model.train()
for _ in range(warmup_steps):
optimizer.zero_grad()
outputs = model(input_ids=input_ids, labels=labels)
loss = outputs.loss
loss.backward()
optimizer.step()
# Synchronize before benchmarking
torch.cuda.synchronize()
# Benchmark
if verbose:
print(f"\n[6/6] Running benchmark ({num_steps} steps)...")
# Storage for per-step metrics
forward_times = []
backward_times = []
optimizer_times = []
forward_energies = []
backward_energies = []
optimizer_energies = []
forward_powers = []
backward_powers = []
optimizer_powers = []
memory_usage = []
gpu_utils = []
total_tokens = batch_size * sequence_length * num_steps
for step in tqdm(range(num_steps), desc="Benchmarking"):
# === FORWARD PASS ===
monitor.start_monitoring()
torch.cuda.synchronize()
start_time = time.perf_counter()
optimizer.zero_grad()
outputs = model(input_ids=input_ids, labels=labels)
loss = outputs.loss
torch.cuda.synchronize()
forward_time = time.perf_counter() - start_time
forward_energy = monitor.get_energy_consumed()
forward_power = monitor.get_average_power()
forward_times.append(forward_time * 1000) # Convert to ms
forward_energies.append(forward_energy)
forward_powers.append(forward_power)
# === BACKWARD PASS ===
monitor.start_monitoring()
torch.cuda.synchronize()
start_time = time.perf_counter()
loss.backward()
torch.cuda.synchronize()
backward_time = time.perf_counter() - start_time
backward_energy = monitor.get_energy_consumed()
backward_power = monitor.get_average_power()
backward_times.append(backward_time * 1000) # Convert to ms
backward_energies.append(backward_energy)
backward_powers.append(backward_power)
# === OPTIMIZER STEP ===
monitor.start_monitoring()
torch.cuda.synchronize()
start_time = time.perf_counter()
optimizer.step()
torch.cuda.synchronize()
optimizer_time = time.perf_counter() - start_time
optimizer_energy = monitor.get_energy_consumed()
optimizer_power = monitor.get_average_power()
optimizer_times.append(optimizer_time * 1000) # Convert to ms
optimizer_energies.append(optimizer_energy)
optimizer_powers.append(optimizer_power)
# Get memory and utilization
metrics = monitor.get_metrics()
memory_usage.append(metrics.memory_used_gb)
gpu_utils.append(metrics.gpu_utilization_percent)
# Compute aggregated metrics
tokens_per_step = batch_size * sequence_length
# Forward metrics
forward_duration_ms = sum(forward_times)
forward_energy_j = sum(forward_energies)
forward_tokens = tokens_per_step * num_steps
forward_tps = forward_tokens / (forward_duration_ms / 1000)
forward_ept = forward_energy_j / forward_tokens
forward_metrics = StageMetrics(
stage_name="forward",
duration_ms=forward_duration_ms,
tokens_processed=forward_tokens,
tokens_per_second=forward_tps,
energy_joules=forward_energy_j,
energy_per_token=forward_ept,
avg_power_watts=sum(forward_powers) / len(forward_powers),
peak_memory_gb=max(memory_usage),
avg_gpu_util_percent=sum(gpu_utils) / len(gpu_utils)
)
# Backward metrics
backward_duration_ms = sum(backward_times)
backward_energy_j = sum(backward_energies)
backward_tokens = tokens_per_step * num_steps
backward_tps = backward_tokens / (backward_duration_ms / 1000)
backward_ept = backward_energy_j / backward_tokens
backward_metrics = StageMetrics(
stage_name="backward",
duration_ms=backward_duration_ms,
tokens_processed=backward_tokens,
tokens_per_second=backward_tps,
energy_joules=backward_energy_j,
energy_per_token=backward_ept,
avg_power_watts=sum(backward_powers) / len(backward_powers),
peak_memory_gb=max(memory_usage),
avg_gpu_util_percent=sum(gpu_utils) / len(gpu_utils)
)
# Optimizer metrics
optimizer_duration_ms = sum(optimizer_times)
optimizer_energy_j = sum(optimizer_energies)
optimizer_tokens = tokens_per_step * num_steps
optimizer_tps = optimizer_tokens / (optimizer_duration_ms / 1000)
optimizer_ept = optimizer_energy_j / optimizer_tokens
optimizer_metrics = StageMetrics(
stage_name="optimizer",
duration_ms=optimizer_duration_ms,
tokens_processed=optimizer_tokens,
tokens_per_second=optimizer_tps,
energy_joules=optimizer_energy_j,
energy_per_token=optimizer_ept,
avg_power_watts=sum(optimizer_powers) / len(optimizer_powers),
peak_memory_gb=max(memory_usage),
avg_gpu_util_percent=sum(gpu_utils) / len(gpu_utils)
)
# Overall metrics
total_duration_ms = forward_duration_ms + backward_duration_ms + optimizer_duration_ms
total_energy_j = forward_energy_j + backward_energy_j + optimizer_energy_j
total_tps = total_tokens / (total_duration_ms / 1000)
total_ept = total_energy_j / total_tokens
# Create metrics object
metrics = PretrainMetrics(
model_name=model_name_or_path,
gpu_name=gpu_name,
attention_implementation=attn_implementation,
batch_size=batch_size,
sequence_length=sequence_length,
num_steps=num_steps,
forward=forward_metrics,
backward=backward_metrics,
optimizer=optimizer_metrics,
total_duration_ms=total_duration_ms,
total_tokens=total_tokens,
total_tokens_per_second=total_tps,
total_energy_joules=total_energy_j,
total_energy_per_token=total_ept
)
# Print results
MetricsReporter.print_pretrain_metrics(metrics, verbose=verbose)
# Save results
if output_dir:
output_path = Path(output_dir)
output_path.mkdir(parents=True, exist_ok=True)
# Save JSON
json_path = output_path / f"pretrain_{gpu_name.replace(' ', '_')}_{attn_implementation}.json"
MetricsReporter.save_json(metrics, json_path)
# Cleanup
monitor.cleanup()
del model
torch.cuda.empty_cache()
return metrics
def main():
parser = argparse.ArgumentParser(
description="LLM Pretraining Benchmark",
formatter_class=argparse.RawDescriptionHelpFormatter
)
parser.add_argument(
"--model-path",
type=str,
default="./model_cache",
help="Path to cached model"
)
parser.add_argument(
"--model-name",
type=str,
default="Qwen/Qwen3-4B",
help="Model name (for reporting)"
)
parser.add_argument(
"--attn-implementation",
type=str,
default="auto",
choices=["auto", "flash_attention_2", "flash_attention_3_hopper", "sdpa", "eager"],
help="Attention implementation to use"
)
parser.add_argument(
"--batch-size",
type=int,
default=8,
help="Batch size"
)
parser.add_argument(
"--sequence-length",
type=int,
default=8192,
help="Sequence length"
)
parser.add_argument(
"--num-steps",
type=int,
default=10,
help="Number of training steps"
)
parser.add_argument(
"--warmup-steps",
type=int,
default=3,
help="Number of warmup steps"
)
parser.add_argument(
"--device-id",
type=int,
default=0,
help="GPU device ID"
)
parser.add_argument(
"--output-dir",
type=str,
default="./results",
help="Output directory for results"
)
args = parser.parse_args()
# Set environment variables for HuggingFace cache
if Path(args.model_path).exists():
os.environ['HF_HOME'] = args.model_path
benchmark_pretrain(
model_name_or_path=args.model_name,
attn_implementation=args.attn_implementation,
batch_size=args.batch_size,
sequence_length=args.sequence_length,
num_steps=args.num_steps,
warmup_steps=args.warmup_steps,
device="cuda",
device_id=args.device_id,
output_dir=args.output_dir,
verbose=True
)
if __name__ == "__main__":
main()

151
cache_model.py Executable file

@@ -0,0 +1,151 @@
#!/usr/bin/env python3
"""
Model Caching Script for LLM Benchmarking
This script downloads and caches the Qwen3-4B model from HuggingFace
before running benchmarks on offline compute nodes.
"""
import argparse
import os
import sys
from pathlib import Path
def cache_model(model_name: str, cache_dir: str, force: bool = False):
"""
Download and cache a HuggingFace model.
Args:
model_name: HuggingFace model identifier (e.g., "Qwen/Qwen3-4B-Instruct-2507")
cache_dir: Local directory to cache the model
force: Force re-download even if model exists
"""
try:
from transformers import AutoTokenizer, AutoModelForCausalLM, AutoConfig
except ImportError:
print("Error: transformers library not found. Please install it:")
print(" pip install transformers")
sys.exit(1)
# Create cache directory
cache_path = Path(cache_dir).resolve()
cache_path.mkdir(parents=True, exist_ok=True)
print(f"Caching model: {model_name}")
print(f"Cache directory: {cache_path}")
print("-" * 60)
# Set HuggingFace cache directory
os.environ['HF_HOME'] = str(cache_path)
# Check if model already exists
model_path = cache_path / model_name.replace("/", "--")
if model_path.exists() and not force:
print(f"Model already cached at: {model_path}")
print("Use --force to re-download")
return str(cache_path)
try:
# Download config
print("\n[1/3] Downloading model config...")
config = AutoConfig.from_pretrained(
model_name,
cache_dir=cache_path,
trust_remote_code=True
)
print(f" ✓ Config downloaded")
print(f" - Model type: {config.model_type}")
print(f" - Hidden size: {config.hidden_size}")
print(f" - Num layers: {config.num_hidden_layers}")
print(f" - Num attention heads: {config.num_attention_heads}")
# Download tokenizer
print("\n[2/3] Downloading tokenizer...")
tokenizer = AutoTokenizer.from_pretrained(
model_name,
cache_dir=cache_path,
trust_remote_code=True
)
print(f" ✓ Tokenizer downloaded")
print(f" - Vocab size: {len(tokenizer)}")
print(f" - Model max length: {tokenizer.model_max_length}")
# Download model weights
print("\n[3/3] Downloading model weights...")
print(" (This may take several minutes depending on connection speed)")
model = AutoModelForCausalLM.from_pretrained(
model_name,
cache_dir=cache_path,
trust_remote_code=True,
torch_dtype="auto",
low_cpu_mem_usage=True
)
print(f" ✓ Model weights downloaded")
# Calculate total parameters
total_params = sum(p.numel() for p in model.parameters())
print(f" - Total parameters: {total_params:,} ({total_params/1e9:.2f}B)")
# Clean up model from memory
del model
print("\n" + "=" * 60)
print("✓ Model successfully cached!")
print("=" * 60)
print(f"\nCache location: {cache_path}")
print(f"\nTo use in benchmarks, set:")
print(f" --model-path {cache_path}")
print(f"\nOr set environment variable:")
print(f" export HF_HOME={cache_path}")
return str(cache_path)
except Exception as e:
print(f"\n✗ Error downloading model: {e}", file=sys.stderr)
sys.exit(1)
def main():
parser = argparse.ArgumentParser(
description="Cache HuggingFace model for offline use",
formatter_class=argparse.RawDescriptionHelpFormatter,
epilog="""
Examples:
# Cache model to default location
python cache_model.py
# Cache model to custom directory
python cache_model.py --cache-dir /path/to/cache
# Force re-download
python cache_model.py --force
"""
)
parser.add_argument(
"--model-name",
type=str,
default="Qwen/Qwen3-4B",
help="HuggingFace model identifier (default: Qwen/Qwen3-4B)"
)
parser.add_argument(
"--cache-dir",
type=str,
default="./model_cache",
help="Directory to cache model (default: ./model_cache in current directory)"
)
parser.add_argument(
"--force",
action="store_true",
help="Force re-download even if model exists"
)
args = parser.parse_args()
cache_model(args.model_name, args.cache_dir, args.force)
if __name__ == "__main__":
main()

26
configs/a100.yaml Normal file

@@ -0,0 +1,26 @@
# A100 Configuration
gpu_type: a100
gpu_model: "NVIDIA A100 80GB"
# Default attention implementation
default_attention: flash_attention_2
# Pretraining defaults
pretrain:
batch_size: 8
sequence_length: 8192
num_steps: 10
warmup_steps: 3
# Inference defaults
inference:
num_requests: 10
prompt_length: 512
generation_length: 100
warmup_requests: 2
# Hardware specs (for reference)
hardware:
memory_gb: 80
tdp_watts: 400
compute_capability: "8.0"

26
configs/h100.yaml Normal file

@@ -0,0 +1,26 @@
# H100 Configuration
gpu_type: h100
gpu_model: "NVIDIA H100 80GB"
# Default attention implementation
default_attention: flash_attention_3_hopper
# Pretraining defaults
pretrain:
batch_size: 8
sequence_length: 8192
num_steps: 10
warmup_steps: 3
# Inference defaults
inference:
num_requests: 10
prompt_length: 512
generation_length: 100
warmup_requests: 2
# Hardware specs (for reference)
hardware:
memory_gb: 80
tdp_watts: 700
compute_capability: "9.0"

26
configs/h200.yaml Normal file

@@ -0,0 +1,26 @@
# H200 Configuration
gpu_type: h200
gpu_model: "NVIDIA H200 141GB"
# Default attention implementation
default_attention: flash_attention_3_hopper
# Pretraining defaults
pretrain:
batch_size: 8
sequence_length: 8192
num_steps: 10
warmup_steps: 3
# Inference defaults
inference:
num_requests: 10
prompt_length: 512
generation_length: 100
warmup_requests: 2
# Hardware specs (for reference)
hardware:
memory_gb: 141
tdp_watts: 700
compute_capability: "9.0"

26
configs/mi300x.yaml Normal file

@@ -0,0 +1,26 @@
# MI300X Configuration
gpu_type: mi300x
gpu_model: "AMD Instinct MI300X"
# Default attention implementation
default_attention: flash_attention_2
# Pretraining defaults
pretrain:
batch_size: 8
sequence_length: 8192
num_steps: 10
warmup_steps: 3
# Inference defaults
inference:
num_requests: 10
prompt_length: 512
generation_length: 100
warmup_requests: 2
# Hardware specs (for reference)
hardware:
memory_gb: 192
tdp_watts: 750
compute_capability: "gfx940"

122
quick_start.sh Executable file

@@ -0,0 +1,122 @@
#!/bin/bash
# Quick Start Script for LLM Benchmark Suite
#
# This script helps you get started quickly with the benchmark suite.
# It will:
# 1. Check dependencies
# 2. Cache the model if needed
# 3. Run a quick test benchmark
#
# Usage: ./quick_start.sh [--skip-cache]
set -e # Exit on error
echo "========================================="
echo "LLM Benchmark Suite - Quick Start"
echo "========================================="
# Parse arguments
SKIP_CACHE=false
if [[ "$1" == "--skip-cache" ]]; then
SKIP_CACHE=true
fi
# Check Python
echo ""
echo "[1/5] Checking Python..."
if ! command -v python &> /dev/null; then
echo "✗ Python not found. Please install Python 3.8+"
exit 1
fi
PYTHON_VERSION=$(python --version 2>&1 | awk '{print $2}')
echo " ✓ Python $PYTHON_VERSION found"
# Check dependencies
echo ""
echo "[2/5] Checking dependencies..."
MISSING_DEPS=()
if ! python -c "import torch" 2>/dev/null; then
MISSING_DEPS+=("torch")
fi
if ! python -c "import transformers" 2>/dev/null; then
MISSING_DEPS+=("transformers")
fi
if [ ${#MISSING_DEPS[@]} -gt 0 ]; then
echo " ⚠ Missing dependencies: ${MISSING_DEPS[*]}"
echo " Installing dependencies..."
pip install -r requirements.txt
else
echo " ✓ All dependencies installed"
fi
# Check GPU
echo ""
echo "[3/5] Checking GPU..."
if python -c "import torch; assert torch.cuda.is_available()" 2>/dev/null; then
GPU_NAME=$(python -c "import torch; print(torch.cuda.get_device_name(0))")
echo " ✓ GPU found: $GPU_NAME"
else
echo " ✗ No GPU found or CUDA not available"
echo " This benchmark requires a GPU to run."
exit 1
fi
# Cache model
if [ "$SKIP_CACHE" = false ]; then
echo ""
echo "[4/5] Caching model..."
if [ -d "./model_cache" ] && [ "$(ls -A ./model_cache)" ]; then
echo " ✓ Model cache already exists at ./model_cache"
echo " To re-download, remove the directory and run again."
else
echo " Downloading Qwen/Qwen3-4B..."
echo " (This may take several minutes depending on your connection)"
python cache_model.py --model-name Qwen/Qwen3-4B --cache-dir ./model_cache
fi
else
echo ""
echo "[4/5] Skipping model cache (--skip-cache specified)"
fi
# Run quick test
echo ""
echo "[5/5] Running quick test benchmark..."
echo " This will run a minimal benchmark to verify everything works."
echo " Parameters: 2 steps, batch size 2, sequence length 512"
echo ""
python run_benchmark.py \
--mode both \
--model-path ./model_cache \
--model-name Qwen/Qwen3-4B \
--batch-size 2 \
--sequence-length 512 \
--num-steps 2 \
--num-requests 2 \
--prompt-length 256 \
--generation-length 20 \
--output-dir ./results/test
echo ""
echo "========================================="
echo "Quick Start Complete!"
echo "========================================="
echo ""
echo "Next steps:"
echo " 1. Run full benchmarks:"
echo " python run_benchmark.py --mode both"
echo ""
echo " 2. Run on different GPUs using SLURM:"
echo " sbatch slurm_a100.sh"
echo " sbatch slurm_h100.sh"
echo " sbatch slurm_h200.sh"
echo " sbatch slurm_mi300x.sh"
echo ""
echo " 3. View results:"
echo " ls -l results/"
echo ""
echo "For more information, see README.md"
echo ""

22
requirements.txt Normal file

@@ -0,0 +1,22 @@
# LLM Benchmark Suite - Requirements
# Core dependencies
torch>=2.0.0
transformers>=4.35.0
accelerate>=0.24.0
tokenizers>=0.14.0
# Attention implementations
flash-attn>=2.0.0
# GPU monitoring
pynvml>=11.5.0 # NVIDIA GPU monitoring
pyrsmi>=1.0.0 # AMD GPU monitoring
# Utilities
numpy>=1.24.0
pyyaml>=6.0
tqdm>=4.65.0
# Optional: for better performance
triton>=2.0.0


@@ -0,0 +1,37 @@
{
"model_name": "Qwen/Qwen3-4B",
"gpu_name": "NVIDIA A100-SXM4-80GB",
"attention_implementation": "flash_attention_2",
"num_requests": 10,
"prompt_length": 512,
"generation_length": 100,
"prefill": {
"stage_name": "prefill",
"duration_ms": 475.62581300735474,
"tokens_processed": 5120,
"tokens_per_second": 10764.76477932628,
"energy_joules": 21.409000039100647,
"energy_per_token": 0.004181445320136845,
"avg_power_watts": 68.91171083870925,
"peak_memory_gb": 45.87115478515625,
"avg_gpu_util_percent": 38.1
},
"decode": {
"stage_name": "decode",
"duration_ms": 41460.768724791706,
"tokens_processed": 1000,
"tokens_per_second": 24.119186179055195,
"energy_joules": 4684.697999954224,
"energy_per_token": 4.684697999954223,
"avg_power_watts": 112.85507087682042,
"peak_memory_gb": 45.87115478515625,
"avg_gpu_util_percent": 38.1
},
"e2e_latency_ms": 4193.639453779906,
"e2e_tokens_per_second": 145.93529242204605,
"e2e_energy_joules": 4706.106999993324,
"e2e_energy_per_token": 0.768971732025053,
"ttft_ms": 47.562581300735474,
"itl_ms": 41.460768724791706,
"timestamp": 1768519487.5402663
}


@@ -0,0 +1,47 @@
{
"model_name": "Qwen/Qwen3-4B",
"gpu_name": "NVIDIA A100-SXM4-80GB",
"attention_implementation": "flash_attention_2",
"batch_size": 3,
"sequence_length": 2048,
"num_steps": 10,
"forward": {
"stage_name": "forward",
"duration_ms": 3359.0412912890315,
"tokens_processed": 61440,
"tokens_per_second": 18290.933237210196,
"energy_joules": 1292.2280000448227,
"energy_per_token": 0.021032356771562868,
"avg_power_watts": 387.19580415542595,
"peak_memory_gb": 79.66021728515625,
"avg_gpu_util_percent": 97.8
},
"backward": {
"stage_name": "backward",
"duration_ms": 6954.944152384996,
"tokens_processed": 61440,
"tokens_per_second": 8834.003358449821,
"energy_joules": 2729.588000059128,
"energy_per_token": 0.0444268880217957,
"avg_power_watts": 394.24766095856324,
"peak_memory_gb": 79.66021728515625,
"avg_gpu_util_percent": 97.8
},
"optimizer": {
"stage_name": "optimizer",
"duration_ms": 1153.845101594925,
"tokens_processed": 61440,
"tokens_per_second": 53248.048559614595,
"energy_joules": 362.6529998779297,
"energy_per_token": 0.005902555336554845,
"avg_power_watts": 299.1223537953503,
"peak_memory_gb": 79.66021728515625,
"avg_gpu_util_percent": 97.8
},
"total_duration_ms": 11467.830545268953,
"total_tokens": 61440,
"total_tokens_per_second": 5357.595733340081,
"total_energy_joules": 4384.46899998188,
"total_energy_per_token": 0.07136180012991342,
"timestamp": 1768519431.5985208
}


@@ -0,0 +1,37 @@
{
"model_name": "Qwen/Qwen3-4B",
"gpu_name": "NVIDIA H100",
"attention_implementation": "flash_attention_3_hopper",
"num_requests": 10,
"prompt_length": 512,
"generation_length": 100,
"prefill": {
"stage_name": "prefill",
"duration_ms": 323.99015384726226,
"tokens_processed": 5120,
"tokens_per_second": 15802.949377324925,
"energy_joules": 17.092000007629395,
"energy_per_token": 0.0033382812514901163,
"avg_power_watts": 93.64442380045372,
"peak_memory_gb": 46.02825927734375,
"avg_gpu_util_percent": 40.0
},
"decode": {
"stage_name": "decode",
"duration_ms": 30513.75844143331,
"tokens_processed": 1000,
"tokens_per_second": 32.772101867403634,
"energy_joules": 4915.5139999985695,
"energy_per_token": 4.915513999998569,
"avg_power_watts": 161.199160874206,
"peak_memory_gb": 46.02825927734375,
"avg_gpu_util_percent": 40.0
},
"e2e_latency_ms": 3083.7748595280573,
"e2e_tokens_per_second": 198.4580677506596,
"e2e_energy_joules": 4932.606000006199,
"e2e_energy_per_token": 0.8059813725500325,
"ttft_ms": 32.399015384726226,
"itl_ms": 30.51375844143331,
"timestamp": 1768541839.3186588
}


@@ -0,0 +1,47 @@
{
"model_name": "Qwen/Qwen3-4B",
"gpu_name": "NVIDIA H100",
"attention_implementation": "flash_attention_3_hopper",
"batch_size": 3,
"sequence_length": 2048,
"num_steps": 10,
"forward": {
"stage_name": "forward",
"duration_ms": 1748.5067250672728,
"tokens_processed": 61440,
"tokens_per_second": 35138.55515633555,
"energy_joules": 946.9269999563694,
"energy_per_token": 0.015412223306581534,
"avg_power_watts": 501.76439870614394,
"peak_memory_gb": 76.45208740234375,
"avg_gpu_util_percent": 97.0
},
"backward": {
"stage_name": "backward",
"duration_ms": 3761.718863155693,
"tokens_processed": 61440,
"tokens_per_second": 16332.959010248362,
"energy_joules": 1904.104000031948,
"energy_per_token": 0.030991276042186655,
"avg_power_watts": 491.250130606127,
"peak_memory_gb": 76.45208740234375,
"avg_gpu_util_percent": 97.0
},
"optimizer": {
"stage_name": "optimizer",
"duration_ms": 896.0564862936735,
"tokens_processed": 61440,
"tokens_per_second": 68567.1059133025,
"energy_joules": 349.722000002861,
"energy_per_token": 0.0056920898437965665,
"avg_power_watts": 356.92130879075387,
"peak_memory_gb": 76.45208740234375,
"avg_gpu_util_percent": 97.0
},
"total_duration_ms": 6406.282074516639,
"total_tokens": 61440,
"total_tokens_per_second": 9590.586128637759,
"total_energy_joules": 3200.7529999911785,
"total_energy_per_token": 0.052095589192564754,
"timestamp": 1768541796.4011748
}


@@ -0,0 +1,37 @@
{
"model_name": "Qwen/Qwen3-4B",
"gpu_name": "NVIDIA H100",
"attention_implementation": "sdpa",
"num_requests": 10,
"prompt_length": 512,
"generation_length": 100,
"prefill": {
"stage_name": "prefill",
"duration_ms": 253.97859653458,
"tokens_processed": 5120,
"tokens_per_second": 20159.179040517676,
"energy_joules": 0.0,
"energy_per_token": 0.0,
"avg_power_watts": 0.0,
"peak_memory_gb": 46.01458740234375,
"avg_gpu_util_percent": 48.8
},
"decode": {
"stage_name": "decode",
"duration_ms": 23519.252635538578,
"tokens_processed": 1000,
"tokens_per_second": 42.51835785330007,
"energy_joules": 4544.901999980211,
"energy_per_token": 4.544901999980211,
"avg_power_watts": 192.5432634001641,
"peak_memory_gb": 46.01458740234375,
"avg_gpu_util_percent": 48.8
},
"e2e_latency_ms": 2377.323123207316,
"e2e_tokens_per_second": 257.43240118504923,
"e2e_energy_joules": 4544.901999980211,
"e2e_energy_per_token": 0.7426310457484006,
"ttft_ms": 25.397859653458,
"itl_ms": 23.519252635538578,
"timestamp": 1769149269.5228984
}


@@ -0,0 +1,47 @@
{
"model_name": "Qwen/Qwen3-4B",
"gpu_name": "NVIDIA H100",
"attention_implementation": "sdpa",
"batch_size": 3,
"sequence_length": 2048,
"num_steps": 10,
"forward": {
"stage_name": "forward",
"duration_ms": 1790.2467511594296,
"tokens_processed": 61440,
"tokens_per_second": 34319.29143857359,
"energy_joules": 981.029000043869,
"energy_per_token": 0.01596726888092235,
"avg_power_watts": 520.9058508009567,
"peak_memory_gb": 76.45208740234375,
"avg_gpu_util_percent": 100.0
},
"backward": {
"stage_name": "backward",
"duration_ms": 3854.5540031045675,
"tokens_processed": 61440,
"tokens_per_second": 15939.587290906931,
"energy_joules": 1953.71099999547,
"energy_per_token": 0.03179868164055127,
"avg_power_watts": 491.5443624439596,
"peak_memory_gb": 76.45208740234375,
"avg_gpu_util_percent": 100.0
},
"optimizer": {
"stage_name": "optimizer",
"duration_ms": 899.9840868636966,
"tokens_processed": 61440,
"tokens_per_second": 68267.87372886644,
"energy_joules": 365.9209999740124,
"energy_per_token": 0.005955745442285358,
"avg_power_watts": 377.8756124501158,
"peak_memory_gb": 76.45208740234375,
"avg_gpu_util_percent": 100.0
},
"total_duration_ms": 6544.784841127694,
"total_tokens": 61440,
"total_tokens_per_second": 9387.627170553957,
"total_energy_joules": 3300.6610000133514,
"total_energy_per_token": 0.053721695963758975,
"timestamp": 1769149234.99943
}

View File

@@ -0,0 +1,37 @@
{
"model_name": "Qwen/Qwen3-4B",
"gpu_name": "NVIDIA H200",
"attention_implementation": "flash_attention_3_hopper",
"num_requests": 10,
"prompt_length": 512,
"generation_length": 100,
"prefill": {
"stage_name": "prefill",
"duration_ms": 323.8773119999223,
"tokens_processed": 5120,
"tokens_per_second": 15808.455270868828,
"energy_joules": 98.1449999999968,
"energy_per_token": 0.019168945312499373,
"avg_power_watts": 250.96736239598317,
"peak_memory_gb": 46.1302490234375,
"avg_gpu_util_percent": 32.2
},
"decode": {
"stage_name": "decode",
"duration_ms": 30558.618001000013,
"tokens_processed": 1000,
"tokens_per_second": 32.72399294913388,
"energy_joules": 4828.459999999999,
"energy_per_token": 4.828459999999999,
"avg_power_watts": 157.61927190444868,
"peak_memory_gb": 46.1302490234375,
"avg_gpu_util_percent": 32.2
},
"e2e_latency_ms": 3088.2495312999936,
"e2e_tokens_per_second": 198.17051497855476,
"e2e_energy_joules": 4926.604999999996,
"e2e_energy_per_token": 0.8050008169934634,
"ttft_ms": 32.38773119999223,
"itl_ms": 30.558618001000013,
"timestamp": 1768541964.4743361
}

View File

@@ -0,0 +1,47 @@
{
"model_name": "Qwen/Qwen3-4B",
"gpu_name": "NVIDIA H200",
"attention_implementation": "flash_attention_3_hopper",
"batch_size": 3,
"sequence_length": 2048,
"num_steps": 10,
"forward": {
"stage_name": "forward",
"duration_ms": 1605.9521619997668,
"tokens_processed": 61440,
"tokens_per_second": 38257.67756587068,
"energy_joules": 817.7539999999863,
"energy_per_token": 0.01330979817708311,
"avg_power_watts": 476.6091506406698,
"peak_memory_gb": 76.5540771484375,
"avg_gpu_util_percent": 95.1
},
"backward": {
"stage_name": "backward",
"duration_ms": 3448.8081949999696,
"tokens_processed": 61440,
"tokens_per_second": 17814.849804948502,
"energy_joules": 1765.182000000008,
"energy_per_token": 0.02873017578125013,
"avg_power_watts": 498.84691252245983,
"peak_memory_gb": 76.5540771484375,
"avg_gpu_util_percent": 95.1
},
"optimizer": {
"stage_name": "optimizer",
"duration_ms": 545.701982000196,
"tokens_processed": 61440,
"tokens_per_second": 112588.92587268984,
"energy_joules": 332.4770000000135,
"energy_per_token": 0.005411409505208553,
"avg_power_watts": 521.4900438388863,
"peak_memory_gb": 76.5540771484375,
"avg_gpu_util_percent": 95.1
},
"total_duration_ms": 5600.462338999932,
"total_tokens": 61440,
"total_tokens_per_second": 10970.522839186035,
"total_energy_joules": 2915.4130000000077,
"total_energy_per_token": 0.047451383463541795,
"timestamp": 1768541921.6000674
}

View File

@@ -0,0 +1,37 @@
{
"model_name": "Qwen/Qwen3-4B",
"gpu_name": "NVIDIA H200",
"attention_implementation": "sdpa",
"num_requests": 10,
"prompt_length": 512,
"generation_length": 100,
"prefill": {
"stage_name": "prefill",
"duration_ms": 247.9969559935853,
"tokens_processed": 5120,
"tokens_per_second": 20645.414696672466,
"energy_joules": 73.83399999141693,
"energy_per_token": 0.014420703123323619,
"avg_power_watts": 222.33737204549297,
"peak_memory_gb": 46.1165771484375,
"avg_gpu_util_percent": 40.0
},
"decode": {
"stage_name": "decode",
"duration_ms": 23003.622506046668,
"tokens_processed": 1000,
"tokens_per_second": 43.47141411041425,
"energy_joules": 4033.3500000089407,
"energy_per_token": 4.033350000008941,
"avg_power_watts": 174.6335604209662,
"peak_memory_gb": 46.1165771484375,
"avg_gpu_util_percent": 40.0
},
"e2e_latency_ms": 2325.1619462040253,
"e2e_tokens_per_second": 263.20747292425324,
"e2e_energy_joules": 4107.184000000358,
"e2e_energy_per_token": 0.6711084967320846,
"ttft_ms": 24.79969559935853,
"itl_ms": 23.003622506046668,
"timestamp": 1769149520.7919798
}

View File

@@ -0,0 +1,47 @@
{
"model_name": "Qwen/Qwen3-4B",
"gpu_name": "NVIDIA H200",
"attention_implementation": "sdpa",
"batch_size": 3,
"sequence_length": 2048,
"num_steps": 10,
"forward": {
"stage_name": "forward",
"duration_ms": 1615.8598741167225,
"tokens_processed": 61440,
"tokens_per_second": 38023.09902248482,
"energy_joules": 873.9250000119209,
"energy_per_token": 0.014224039713735693,
"avg_power_watts": 541.9081076256928,
"peak_memory_gb": 76.5540771484375,
"avg_gpu_util_percent": 100.0
},
"backward": {
"stage_name": "backward",
"duration_ms": 3462.180594098754,
"tokens_processed": 61440,
"tokens_per_second": 17746.04135460864,
"energy_joules": 1696.024000003934,
"energy_per_token": 0.027604557291730693,
"avg_power_watts": 472.8399628680292,
"peak_memory_gb": 76.5540771484375,
"avg_gpu_util_percent": 100.0
},
"optimizer": {
"stage_name": "optimizer",
"duration_ms": 551.849422918167,
"tokens_processed": 61440,
"tokens_per_second": 111334.71821915968,
"energy_joules": 316.88299998641014,
"energy_per_token": 0.005157600911237144,
"avg_power_watts": 499.2301039455484,
"peak_memory_gb": 76.5540771484375,
"avg_gpu_util_percent": 100.0
},
"total_duration_ms": 5629.889891133644,
"total_tokens": 61440,
"total_tokens_per_second": 10913.179687005982,
"total_energy_joules": 2886.832000002265,
"total_energy_per_token": 0.04698619791670353,
"timestamp": 1769149487.0005488
}

248
run_benchmark.py Executable file
View File

@@ -0,0 +1,248 @@
#!/usr/bin/env python3
"""
Main LLM Benchmark Runner
Orchestrates pretraining and inference benchmarks with auto-detection
of GPU type and configuration.
"""
import argparse
import sys
from pathlib import Path
# Import benchmark functions
import benchmark_pretrain
import benchmark_inference
from utils.gpu_monitor import get_gpu_monitor, list_available_gpus
from utils.metrics import MetricsReporter
def main():
parser = argparse.ArgumentParser(
description="LLM Benchmark Suite - Compare GPU performance for pretraining and inference",
formatter_class=argparse.RawDescriptionHelpFormatter,
epilog="""
Examples:
# Run both pretrain and inference benchmarks
python run_benchmark.py --mode both
# Run only pretraining benchmark
python run_benchmark.py --mode pretrain --num-steps 20
# Run inference with custom settings
python run_benchmark.py --mode inference --num-requests 20 --generation-length 200
# Use specific attention implementation
python run_benchmark.py --attn-implementation flash_attention_3_hopper
"""
)
# Model configuration
parser.add_argument(
"--model-path",
type=str,
default="./model_cache",
help="Path to cached model directory"
)
parser.add_argument(
"--model-name",
type=str,
default="Qwen/Qwen3-4B",
help="Model name for reporting"
)
# Benchmark mode
parser.add_argument(
"--mode",
type=str,
default="both",
choices=["pretrain", "inference", "both"],
help="Benchmark mode to run"
)
# Attention configuration
parser.add_argument(
"--attn-implementation",
type=str,
default="auto",
choices=["auto", "flash_attention_2", "flash_attention_3_hopper", "sdpa", "eager"],
help="Attention implementation (auto selects based on GPU)"
)
# Pretraining parameters
pretrain_group = parser.add_argument_group("pretraining parameters")
pretrain_group.add_argument(
"--batch-size",
type=int,
default=3,
help="Batch size for pretraining"
)
pretrain_group.add_argument(
"--sequence-length",
type=int,
default=2048,
help="Sequence length for pretraining"
)
pretrain_group.add_argument(
"--num-steps",
type=int,
default=10,
help="Number of training steps"
)
pretrain_group.add_argument(
"--warmup-steps",
type=int,
default=3,
help="Number of warmup steps"
)
# Inference parameters
inference_group = parser.add_argument_group("inference parameters")
inference_group.add_argument(
"--num-requests",
type=int,
default=10,
help="Number of inference requests"
)
inference_group.add_argument(
"--prompt-length",
type=int,
default=512,
help="Prompt length in tokens"
)
inference_group.add_argument(
"--generation-length",
type=int,
default=100,
help="Number of tokens to generate"
)
inference_group.add_argument(
"--warmup-requests",
type=int,
default=2,
help="Number of warmup requests"
)
# General parameters
parser.add_argument(
"--device-id",
type=int,
default=0,
help="GPU device ID"
)
parser.add_argument(
"--output-dir",
type=str,
default="./results",
help="Output directory for results"
)
parser.add_argument(
"--list-gpus",
action="store_true",
help="List available GPUs and exit"
)
args = parser.parse_args()
# List GPUs if requested
if args.list_gpus:
print("Available GPUs:")
gpus = list_available_gpus()
if not gpus:
print(" No GPUs found!")
else:
for gpu in gpus:
print(f" {gpu}")
return
# Print header
print("=" * 80)
print("LLM BENCHMARK SUITE")
print("=" * 80)
print(f"\nModel: {args.model_name}")
print(f"Model Path: {args.model_path}")
print(f"Mode: {args.mode}")
print(f"Attention: {args.attn_implementation}")
print(f"Output Directory: {args.output_dir}")
# Detect GPU
print("\nDetecting GPU...")
try:
monitor = get_gpu_monitor(args.device_id)
gpu_name = monitor.get_device_name()
print(f" GPU {args.device_id}: {gpu_name}")
monitor.cleanup()
except Exception as e:
print(f"✗ Error detecting GPU: {e}")
sys.exit(1)
# Create output directory
output_path = Path(args.output_dir)
output_path.mkdir(parents=True, exist_ok=True)
# Run benchmarks
pretrain_metrics = None
inference_metrics = None
if args.mode in ["pretrain", "both"]:
print("\n" + "=" * 80)
print("Running Pretraining Benchmark...")
print("=" * 80)
pretrain_metrics = benchmark_pretrain.benchmark_pretrain(
model_name_or_path=args.model_name,
attn_implementation=args.attn_implementation,
batch_size=args.batch_size,
sequence_length=args.sequence_length,
num_steps=args.num_steps,
warmup_steps=args.warmup_steps,
device="cuda",
device_id=args.device_id,
output_dir=args.output_dir,
verbose=True
)
if args.mode in ["inference", "both"]:
print("\n" + "=" * 80)
print("Running Inference Benchmark...")
print("=" * 80)
inference_metrics = benchmark_inference.benchmark_inference(
model_name_or_path=args.model_name,
attn_implementation=args.attn_implementation,
num_requests=args.num_requests,
prompt_length=args.prompt_length,
generation_length=args.generation_length,
warmup_requests=args.warmup_requests,
device="cuda",
device_id=args.device_id,
output_dir=args.output_dir,
verbose=True
)
# Summary
print("\n" + "=" * 80)
print("BENCHMARK COMPLETE")
print("=" * 80)
print(f"\nResults saved to: {output_path}")
if pretrain_metrics:
print(f"\nPretraining:")
print(f" Duration: {pretrain_metrics.total_duration_ms:.2f} ms")
print(f" Throughput: {pretrain_metrics.total_tokens_per_second:.2f} tokens/s")
print(f" Energy: {pretrain_metrics.total_energy_joules:.2f} J")
print(f" Energy/token: {pretrain_metrics.total_energy_per_token*1000:.4f} mJ/token")
if inference_metrics:
print(f"\nInference:")
print(f" TTFT: {inference_metrics.ttft_ms:.2f} ms")
print(f" ITL: {inference_metrics.itl_ms:.2f} ms/token")
print(f" Throughput: {inference_metrics.e2e_tokens_per_second:.2f} tokens/s")
print(f" Energy: {inference_metrics.e2e_energy_joules:.2f} J")
print(f" Energy/token: {inference_metrics.e2e_energy_per_token*1000:.4f} mJ/token")
if __name__ == "__main__":
main()

45
slurm_a100.sh Executable file
View File

@@ -0,0 +1,45 @@
#!/bin/bash
#SBATCH --job-name=llm_bench_a100
#SBATCH --partition=a100 # Adjust to your A100 partition name
#SBATCH --nodes=1
#SBATCH --gres=gpu:a100:1 # Request 1 A100 GPU
#SBATCH -C a100_80
#SBATCH --time=02:00:00
#SBATCH --output=logs/benchmark_a100_sdpa_%j.out
#SBATCH --error=logs/benchmark_a100_sdpa_%j.err
# Create logs directory
mkdir -p logs
# Print job info
echo "========================================="
echo "Job ID: $SLURM_JOB_ID"
echo "Job Name: $SLURM_JOB_NAME"
echo "Node: $SLURM_NODELIST"
echo "Date: $(date)"
echo "========================================="
# Set cache paths
export TRANSFORMERS_CACHE=$(pwd)/model_cache
export HF_HOME=$(pwd)/model_cache
# Path to apptainer image
APPTAINER_IMAGE="/anvme/workspace/ihpc125h-llm-profiles/pytorch_25.10_updated_ao.sif"
# Run benchmark inside apptainer
apptainer exec --nv $APPTAINER_IMAGE python run_benchmark.py \
--mode both \
--model-path ./model_cache \
--model-name Qwen/Qwen3-4B \
--attn-implementation sdpa \
--batch-size 3 \
--sequence-length 2048 \
--num-steps 10 \
--num-requests 10 \
--prompt-length 512 \
--generation-length 100 \
--output-dir ./results/a100
echo "========================================="
echo "Benchmark Complete!"
echo "========================================="

46
slurm_h100.sh Executable file
View File

@@ -0,0 +1,46 @@
#!/bin/bash
#SBATCH --job-name=llm_bench_h100
#SBATCH --partition=h100 # Adjust to your H100 partition name
#SBATCH --nodes=1
#SBATCH --gres=gpu:h100:1 # Request 1 H100 GPU
#SBATCH --time=02:00:00
#SBATCH --output=logs/benchmark_h100_%j.out
#SBATCH --error=logs/benchmark_h100_%j.err
# Create logs directory
mkdir -p logs
# Print job info
echo "========================================="
echo "Job ID: $SLURM_JOB_ID"
echo "Job Name: $SLURM_JOB_NAME"
echo "Node: $SLURM_NODELIST"
echo "Date: $(date)"
echo "========================================="
# Set cache paths
export TRANSFORMERS_CACHE=$(pwd)/model_cache
export HF_HOME=$(pwd)/model_cache
# Path to apptainer image
APPTAINER_IMAGE="/hnvme/workspace/ihpc125h-llm-profiler/pytorch_25.10_updated_ao.sif"
# Run benchmark inside apptainer (SDPA attention; the flash_attention_3_hopper flag is kept commented out below)
apptainer exec --nv $APPTAINER_IMAGE python run_benchmark.py \
--mode both \
--model-path ./model_cache \
--model-name Qwen/Qwen3-4B \
--attn-implementation sdpa \
--batch-size 3 \
--sequence-length 2048 \
--num-steps 10 \
--num-requests 10 \
--prompt-length 512 \
--generation-length 100 \
--output-dir ./results/h100_sdpa
# --attn-implementation flash_attention_3_hopper \
echo "========================================="
echo "Benchmark Complete!"
echo "========================================="

45
slurm_h200.sh Executable file
View File

@@ -0,0 +1,45 @@
#!/bin/bash
#SBATCH --job-name=llm_bench_h200
#SBATCH --partition=h200 # Adjust to your H200 partition name
#SBATCH --nodes=1
#SBATCH --gres=gpu:h200:1 # Request 1 H200 GPU
#SBATCH --time=02:00:00
#SBATCH --output=logs/benchmark_h200_%j.out
#SBATCH --error=logs/benchmark_h200_%j.err
# Create logs directory
mkdir -p logs
# Print job info
echo "========================================="
echo "Job ID: $SLURM_JOB_ID"
echo "Job Name: $SLURM_JOB_NAME"
echo "Node: $SLURM_NODELIST"
echo "Date: $(date)"
echo "========================================="
# Set cache paths
export TRANSFORMERS_CACHE=$(pwd)/model_cache
export HF_HOME=$(pwd)/model_cache
# Path to apptainer image
APPTAINER_IMAGE="/hnvme/workspace/ihpc125h-llm-profiler/pytorch_25.10_updated_ao.sif"
# Run benchmark inside apptainer (SDPA attention; the flash_attention_3_hopper flag is kept commented out below)
apptainer exec --nv $APPTAINER_IMAGE python run_benchmark.py \
--mode both \
--model-path ./model_cache \
--model-name Qwen/Qwen3-4B \
--attn-implementation sdpa \
--batch-size 3 \
--sequence-length 2048 \
--num-steps 10 \
--num-requests 10 \
--prompt-length 512 \
--generation-length 100 \
--output-dir ./results/h200_sdpa
# --attn-implementation flash_attention_3_hopper \
echo "========================================="
echo "Benchmark Complete!"
echo "========================================="

42
slurm_mi300x.sh Executable file
View File

@@ -0,0 +1,42 @@
#!/bin/bash
#SBATCH --job-name=llm_bench_mi300x
#SBATCH --nodes=1
#SBATCH --nodelist=aquavan1          # Request the MI300X node
#SBATCH --time=02:00:00
#SBATCH --output=logs/benchmark_mi300x_%j.out
#SBATCH --error=logs/benchmark_mi300x_%j.err
# Create logs directory
mkdir -p logs
# Print job info
echo "========================================="
echo "Job ID: $SLURM_JOB_ID"
echo "Job Name: $SLURM_JOB_NAME"
echo "Node: $SLURM_NODELIST"
echo "Date: $(date)"
echo "========================================="
# Set cache paths
export TRANSFORMERS_CACHE=$(pwd)/models
export HF_HOME=$(pwd)/models
# Apptainer image path (kept for reference; a writable ROCm sandbox is used below instead)
#APPTAINER_IMAGE="/home/woody/ihpc/ihpc125h/pytorch_25.10_updated_ao.sif"
apptainer exec --writable ../rocm_sandbox/ python run_benchmark.py \
--mode both \
--model-path ./model_cache \
--model-name Qwen/Qwen3-4B \
--attn-implementation sdpa \
--batch-size 3 \
--sequence-length 2048 \
--num-steps 10 \
--num-requests 10 \
--prompt-length 512 \
--generation-length 100 \
--output-dir ./results/mi300x_sdpa
echo "========================================="
echo "Benchmark Complete!"
echo "========================================="

3
utils/__init__.py Normal file
View File

@@ -0,0 +1,3 @@
"""Utility package for LLM benchmarking."""
__version__ = "1.0.0"

295
utils/attention.py Normal file
View File

@@ -0,0 +1,295 @@
"""
Attention Implementation Helpers for LLM Benchmarking
Provides functions for configuring different attention implementations
based on GPU type.
"""
from typing import Optional
import warnings
def get_default_attention(gpu_name: str) -> str:
"""
Get default attention implementation for GPU type.
Args:
gpu_name: GPU device name (from monitoring)
Returns:
Attention implementation string
"""
gpu_lower = gpu_name.lower()
# H100/H200: FlashAttention-3 Hopper
if 'h100' in gpu_lower or 'h200' in gpu_lower:
return "flash_attention_3_hopper"
# A100, MI300X, other: FlashAttention-2
return "flash_attention_2"
def configure_model_attention(model, attn_implementation: str, verbose: bool = True):
"""
Configure model to use specified attention implementation.
This function patches the model if needed to use the specified attention.
For standard implementations like flash_attention_2, the model should already
be loaded with the correct implementation via AutoModelForCausalLM.from_pretrained().
For FlashAttention-3 Hopper, this patches the model's attention modules.
Args:
model: The loaded model
attn_implementation: Attention implementation to use
verbose: Print configuration messages
Returns:
Configured model
"""
if verbose:
print(f"Configuring attention: {attn_implementation}")
if attn_implementation == "flash_attention_3_hopper":
# Patch model to use FlashAttention-3 Hopper
try:
import flash_attn_interface
except ImportError:
raise ImportError(
"flash_attn_interface not found. This is required for FlashAttention-3.\n"
"Install with appropriate method for your system."
)
# Patch the model's attention function
_patch_fa3_hopper(model, verbose=verbose)
elif attn_implementation == "flash_attention_2":
# Model should already be loaded with FA2
if verbose:
print(" Using FlashAttention-2 (configured during model loading)")
elif attn_implementation == "sdpa":
# PyTorch Scaled Dot Product Attention
if verbose:
print(" Using PyTorch SDPA")
elif attn_implementation == "eager":
# Standard PyTorch attention
if verbose:
print(" Using eager attention")
else:
warnings.warn(f"Unknown attention implementation: {attn_implementation}")
return model
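# Usage sketch (illustrative; the model id and dtype below are assumptions,
# not necessarily what run_benchmark.py passes). Standard implementations are
# selected when the model is loaded, while FlashAttention-3 Hopper is patched
# in afterwards by this function:
#
#   import torch
#   from transformers import AutoModelForCausalLM
#   model = AutoModelForCausalLM.from_pretrained(
#       "Qwen/Qwen3-4B", attn_implementation="sdpa", torch_dtype=torch.bfloat16
#   )
#   model = configure_model_attention(model, "flash_attention_3_hopper")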
def _patch_fa3_hopper(model, verbose: bool = True):
"""
Patch model to use FlashAttention-3 Hopper.
This replaces the attention computation in the model's attention layers
with calls to flash_attn_interface.flash_attn_func().
Args:
model: The model to patch
verbose: Print patching messages
"""
import flash_attn_interface
import torch
# Counter for patched modules
num_patched = 0
# Iterate through all modules in the model
for name, module in model.named_modules():
# Look for attention modules (this will vary by model architecture)
# Common names: "self_attn", "attn", "attention"
if any(attn_name in name.lower() for attn_name in ['self_attn', 'attention']):
# Check if module has a forward method we can patch
if hasattr(module, 'forward'):
# Save original forward
original_forward = module.forward
# Create patched forward function
# Bind orig_forward and module explicitly so each patched forward closes over
# its own attention module (avoids the late-binding bug where every closure
# would otherwise see the last module visited by the loop).
def create_patched_forward(orig_forward, module):
def patched_forward(hidden_states, *args, **kwargs):
# Check if this is an attention computation
# For Qwen models, attention modules typically have q, k, v projections
if hasattr(module, 'q_proj') and hasattr(module, 'k_proj') and hasattr(module, 'v_proj'):
# Extract batch, seq_len, hidden_dim
batch_size, seq_len, hidden_dim = hidden_states.shape
# Compute Q, K, V
q = module.q_proj(hidden_states)
k = module.k_proj(hidden_states)
v = module.v_proj(hidden_states)
# Reshape for multi-head attention
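# NOTE (assumption): this reshape assumes standard multi-head attention,
# i.e. Q, K and V all use num_heads heads of size hidden_dim // num_heads.
# Models with grouped-query attention (e.g. the Qwen3 family) project K/V
# to fewer heads, so k and v would need their own head count here.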
num_heads = module.num_heads
head_dim = hidden_dim // num_heads
q = q.view(batch_size, seq_len, num_heads, head_dim)
k = k.view(batch_size, seq_len, num_heads, head_dim)
v = v.view(batch_size, seq_len, num_heads, head_dim)
# Call FlashAttention-3
# Note: flash_attn_func expects (batch, seqlen, nheads, headdim)
attn_output = flash_attn_interface.flash_attn_func(
q, k, v,
dropout_p=0.0,
softmax_scale=None, # Will use default 1/sqrt(head_dim)
causal=True, # For causal LM
)
# Reshape back
attn_output = attn_output.view(batch_size, seq_len, hidden_dim)
# Apply output projection if it exists
if hasattr(module, 'o_proj'):
attn_output = module.o_proj(attn_output)
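# Note: the original forward is called below only to learn how many
# elements its output tuple has; this re-runs the stock attention path on
# top of the FlashAttention-3 call, roughly doubling the attention work
# per layer, which will show up in timing and energy measurements.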
return (attn_output,) + (None,) * (len(orig_forward(hidden_states, *args, **kwargs)) - 1)
else:
# Not an attention module we can patch, use original
return orig_forward(hidden_states, *args, **kwargs)
return patched_forward
# Apply patch
module.forward = create_patched_forward(original_forward, module)
num_patched += 1
if verbose:
if num_patched > 0:
print(f" ✓ Patched {num_patched} attention modules to use FlashAttention-3 Hopper")
else:
warnings.warn(" ⚠ No attention modules found to patch for FlashAttention-3")
def get_attention_info(attn_implementation: str) -> dict:
"""
Get information about an attention implementation.
Args:
attn_implementation: Attention implementation string
Returns:
Dictionary with info about the implementation
"""
info = {
"flash_attention_2": {
"name": "FlashAttention-2",
"description": "Optimized attention for A100 and other GPUs",
"gpu_support": ["A100", "MI300X", "V100", "RTX"],
"memory_efficient": True,
"requires_cuda": True,
},
"flash_attention_3_hopper": {
"name": "FlashAttention-3 Hopper",
"description": "Optimized attention for H100/H200 Hopper architecture",
"gpu_support": ["H100", "H200"],
"memory_efficient": True,
"requires_cuda": True,
},
"sdpa": {
"name": "PyTorch SDPA",
"description": "PyTorch Scaled Dot Product Attention",
"gpu_support": ["All"],
"memory_efficient": True,
"requires_cuda": False,
},
"eager": {
"name": "Eager Attention",
"description": "Standard PyTorch attention implementation",
"gpu_support": ["All"],
"memory_efficient": False,
"requires_cuda": False,
},
}
return info.get(attn_implementation, {
"name": attn_implementation,
"description": "Unknown attention implementation",
"gpu_support": ["Unknown"],
"memory_efficient": False,
"requires_cuda": False,
})
def validate_attention_for_gpu(attn_implementation: str, gpu_name: str) -> tuple[bool, Optional[str]]:
"""
Validate if attention implementation is suitable for GPU.
Args:
attn_implementation: Attention implementation
gpu_name: GPU device name
Returns:
Tuple of (is_valid, warning_message)
"""
gpu_lower = gpu_name.lower()
# FlashAttention-3 Hopper validation
if attn_implementation == "flash_attention_3_hopper":
if 'h100' not in gpu_lower and 'h200' not in gpu_lower:
return False, (
f"FlashAttention-3 Hopper is optimized for H100/H200. "
f"Current GPU: {gpu_name}. Consider using flash_attention_2 instead."
)
# FlashAttention-2 on Hopper GPUs
if attn_implementation == "flash_attention_2":
if 'h100' in gpu_lower or 'h200' in gpu_lower:
return True, (
f"FlashAttention-2 will work on {gpu_name}, but FlashAttention-3 Hopper "
f"may provide better performance."
)
return True, None
if __name__ == "__main__":
"""Test attention configuration."""
print("=" * 60)
print("Attention Implementation Test")
print("=" * 60)
# Test getting default attention for different GPUs
test_gpus = [
"NVIDIA A100 80GB",
"NVIDIA H100 80GB",
"NVIDIA H200 141GB",
"AMD Instinct MI300X",
]
print("\nDefault attention implementations:")
for gpu in test_gpus:
attn = get_default_attention(gpu)
print(f" {gpu:30s}{attn}")
# Test validation
print("\nValidation tests:")
test_cases = [
("flash_attention_3_hopper", "NVIDIA H100 80GB"),
("flash_attention_3_hopper", "NVIDIA A100 80GB"),
("flash_attention_2", "NVIDIA H100 80GB"),
("flash_attention_2", "NVIDIA A100 80GB"),
]
for attn, gpu in test_cases:
valid, warning = validate_attention_for_gpu(attn, gpu)
status = "" if valid else ""
print(f" {status} {attn:30s} on {gpu:25s}")
if warning:
print(f"{warning}")
# Test getting info
print("\nAttention implementation info:")
for attn in ["flash_attention_2", "flash_attention_3_hopper", "sdpa"]:
info = get_attention_info(attn)
print(f"\n {info['name']}:")
print(f" Description: {info['description']}")
print(f" GPU Support: {', '.join(info['gpu_support'])}")
print(f" Memory Efficient: {info['memory_efficient']}")

562
utils/gpu_monitor.py Normal file
View File

@@ -0,0 +1,562 @@
"""
GPU Monitoring Infrastructure for LLM Benchmarking
Provides unified interface for monitoring both NVIDIA and AMD GPUs.
"""
import time
from abc import ABC, abstractmethod
from dataclasses import dataclass
from typing import Optional, List
import warnings
@dataclass
class GPUMetrics:
"""Container for GPU metrics."""
timestamp: float
power_watts: float
gpu_utilization_percent: float
memory_used_gb: float
memory_total_gb: float
temperature_celsius: Optional[float] = None
energy_joules: Optional[float] = None # Cumulative energy
class GPUMonitor(ABC):
"""Abstract base class for GPU monitoring."""
def __init__(self, device_id: int = 0):
"""
Initialize GPU monitor.
Args:
device_id: GPU device ID to monitor
"""
self.device_id = device_id
self.start_time = None
self.start_energy = None
self.last_metrics = None
@abstractmethod
def get_metrics(self) -> GPUMetrics:
"""Get current GPU metrics."""
pass
@abstractmethod
def get_device_name(self) -> str:
"""Get GPU device name."""
pass
@abstractmethod
def cleanup(self):
"""Cleanup resources."""
pass
def start_monitoring(self):
"""Start energy monitoring session."""
self.start_time = time.time()
metrics = self.get_metrics()
self.start_energy = metrics.energy_joules if metrics.energy_joules is not None else 0.0
self.last_metrics = metrics
def get_energy_consumed(self) -> float:
"""
Get energy consumed since start_monitoring() was called.
Returns:
Energy in Joules
"""
if self.start_time is None:
raise RuntimeError("Must call start_monitoring() first")
current_metrics = self.get_metrics()
if current_metrics.energy_joules is not None:
# If GPU provides cumulative energy, use it
return current_metrics.energy_joules - self.start_energy
else:
# Otherwise, integrate power over time
elapsed_time = time.time() - self.start_time
# Use average of start and current power
avg_power = (self.last_metrics.power_watts + current_metrics.power_watts) / 2.0
return avg_power * elapsed_time
def get_average_power(self) -> float:
"""
Get average power consumption since start_monitoring().
Returns:
Average power in Watts
"""
if self.start_time is None:
raise RuntimeError("Must call start_monitoring() first")
elapsed_time = time.time() - self.start_time
if elapsed_time == 0:
return 0.0
energy = self.get_energy_consumed()
return energy / elapsed_time
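# Typical measurement pattern (sketch; mirrors the self-test at the bottom of
# this file):
#
#   monitor = get_gpu_monitor(device_id=0)
#   monitor.start_monitoring()
#   ...                                   # run the workload being measured
#   energy_j = monitor.get_energy_consumed()
#   avg_power_w = monitor.get_average_power()
#   monitor.cleanup()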
class NVIDIAMonitor(GPUMonitor):
"""NVIDIA GPU monitor using pynvml."""
def __init__(self, device_id: int = 0):
"""Initialize NVIDIA monitor."""
try:
import pynvml
self.pynvml = pynvml
except ImportError:
raise ImportError(
"pynvml not found. Install with: pip install pynvml"
)
try:
self.pynvml.nvmlInit()
self.handle = self.pynvml.nvmlDeviceGetHandleByIndex(device_id)
except Exception as e:
raise RuntimeError(f"Failed to initialize NVIDIA GPU {device_id}: {e}")
super().__init__(device_id)
def get_metrics(self) -> GPUMetrics:
"""Get current NVIDIA GPU metrics."""
try:
# Power (in milliwatts)
power_mw = self.pynvml.nvmlDeviceGetPowerUsage(self.handle)
power_watts = power_mw / 1000.0
# Utilization
util = self.pynvml.nvmlDeviceGetUtilizationRates(self.handle)
gpu_util = util.gpu
# Memory
mem_info = self.pynvml.nvmlDeviceGetMemoryInfo(self.handle)
memory_used_gb = mem_info.used / (1024**3)
memory_total_gb = mem_info.total / (1024**3)
# Temperature
try:
temp = self.pynvml.nvmlDeviceGetTemperature(
self.handle,
self.pynvml.NVML_TEMPERATURE_GPU
)
except:
temp = None
# Try to get cumulative energy (newer GPUs)
energy_joules = None
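# nvmlDeviceGetTotalEnergyConsumption returns cumulative energy in
# millijoules since the driver was last loaded (Volta and newer
# data-center GPUs); older devices raise here and fall back to power
# integration in get_energy_consumed().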
try:
energy_mj = self.pynvml.nvmlDeviceGetTotalEnergyConsumption(self.handle)
energy_joules = energy_mj / 1000.0
except:
# Not supported on this GPU, will use power integration
pass
return GPUMetrics(
timestamp=time.time(),
power_watts=power_watts,
gpu_utilization_percent=gpu_util,
memory_used_gb=memory_used_gb,
memory_total_gb=memory_total_gb,
temperature_celsius=temp,
energy_joules=energy_joules
)
except Exception as e:
raise RuntimeError(f"Failed to get NVIDIA GPU metrics: {e}")
def get_device_name(self) -> str:
"""Get NVIDIA GPU device name."""
try:
name = self.pynvml.nvmlDeviceGetName(self.handle)
if isinstance(name, bytes):
name = name.decode('utf-8')
return name
except:
return f"NVIDIA GPU {self.device_id}"
def cleanup(self):
"""Cleanup NVIDIA resources."""
try:
self.pynvml.nvmlShutdown()
except:
pass
class AMDMonitor(GPUMonitor):
"""AMD GPU monitor using rocm-smi command line tool."""
def __init__(self, device_id: int = 0):
"""Initialize AMD monitor."""
import subprocess
import shutil
# Check if rocm-smi is available
if shutil.which('rocm-smi') is None:
raise RuntimeError("rocm-smi command not found. Make sure ROCm is installed and in PATH.")
self.device_id = device_id
# Verify device exists
try:
result = subprocess.run(
['rocm-smi', '--showid'],
capture_output=True,
text=True,
timeout=5
)
if result.returncode != 0:
raise RuntimeError(f"rocm-smi failed: {result.stderr}")
except subprocess.TimeoutExpired:
raise RuntimeError("rocm-smi command timed out")
except Exception as e:
raise RuntimeError(f"Failed to initialize AMD GPU {device_id}: {e}")
super().__init__(device_id)
def _parse_detailed_output(self, output: str) -> dict:
"""Parse rocm-smi detailed output format."""
lines = output.strip().split('\n')
# Parse detailed format: GPU[X] : Metric : Value
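# Illustrative line (exact labels vary across ROCm releases):
#   GPU[0]    : Temperature (Sensor junction) (C): 41.0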
metrics = {
'temperature': None,
'power': None,
'vram_percent': None,
'gpu_percent': None,
}
device_prefix = f"GPU[{self.device_id}]"
for line in lines:
if not line.strip() or not line.startswith(device_prefix):
continue
# Split by colon
parts = line.split(':')
if len(parts) < 3:
continue
metric_name = parts[1].strip().lower()
value_str = parts[2].strip()
try:
# Temperature (Sensor junction)
if 'temperature' in metric_name and 'junction' in metric_name:
metrics['temperature'] = float(value_str)
# Power consumption
elif 'power' in metric_name and 'package' in metric_name:
metrics['power'] = float(value_str)
# GPU utilization
elif 'gpu use' in metric_name:
metrics['gpu_percent'] = float(value_str)
# VRAM usage percentage
elif 'memory allocated' in metric_name and 'vram%' in metric_name:
metrics['vram_percent'] = float(value_str)
except (ValueError, IndexError):
continue
# Validate we got the required metrics
if metrics['temperature'] is None:
raise ValueError(f"Could not find temperature for GPU[{self.device_id}]")
if metrics['power'] is None:
raise ValueError(f"Could not find power for GPU[{self.device_id}]")
if metrics['gpu_percent'] is None:
metrics['gpu_percent'] = 0.0
if metrics['vram_percent'] is None:
metrics['vram_percent'] = 0.0
return metrics
def _get_memory_info(self) -> tuple:
"""Get memory usage in GB using rocm-smi --showmeminfo."""
import subprocess
try:
result = subprocess.run(
['rocm-smi', '--showmeminfo', 'vram', '-d', str(self.device_id)],
capture_output=True,
text=True,
timeout=5
)
if result.returncode != 0:
return 0.0, 0.0
# Parse output for memory info
# Looking for lines like "GPU memory used: X MiB" and "GPU memory total: Y MiB"
used_gb = 0.0
total_gb = 0.0
for line in result.stdout.split('\n'):
if 'Used' in line or 'used' in line:
# Extract number
parts = line.split()
for i, part in enumerate(parts):
if part.replace('.', '').isdigit():
used_bytes = float(part)
# Check if next part indicates unit
if i + 1 < len(parts):
unit = parts[i + 1].lower()
if 'mb' in unit or 'mib' in unit:
used_gb = used_bytes / 1024
elif 'gb' in unit or 'gib' in unit:
used_gb = used_bytes
elif 'kb' in unit or 'kib' in unit:
used_gb = used_bytes / (1024 * 1024)
break
if 'Total' in line or 'total' in line:
parts = line.split()
for i, part in enumerate(parts):
if part.replace('.', '').isdigit():
total_bytes = float(part)
if i + 1 < len(parts):
unit = parts[i + 1].lower()
if 'mb' in unit or 'mib' in unit:
total_gb = total_bytes / 1024
elif 'gb' in unit or 'gib' in unit:
total_gb = total_bytes
elif 'kb' in unit or 'kib' in unit:
total_gb = total_bytes / (1024 * 1024)
break
return used_gb, total_gb
except Exception:
return 0.0, 0.0
def get_metrics(self) -> GPUMetrics:
"""Get current AMD GPU metrics."""
import subprocess
try:
# Query temperature, power, utilization, and memory-use percentage via rocm-smi
result = subprocess.run(
['rocm-smi', '--showid', '--showtemp', '--showpower', '--showuse', '--showmemuse'],
capture_output=True,
text=True,
timeout=5
)
if result.returncode != 0:
raise RuntimeError(f"rocm-smi failed: {result.stderr}")
metrics = self._parse_detailed_output(result.stdout)
# Get detailed memory info
memory_used_gb, memory_total_gb = self._get_memory_info()
# If we couldn't get absolute memory, estimate from percentage
if memory_total_gb == 0.0:
# MI300X has ~192GB, MI250X has ~128GB - use a reasonable default
memory_total_gb = 192.0 # Assume MI300X
memory_used_gb = memory_total_gb * (metrics['vram_percent'] / 100.0)
return GPUMetrics(
timestamp=time.time(),
power_watts=metrics['power'],
gpu_utilization_percent=metrics['gpu_percent'],
memory_used_gb=memory_used_gb,
memory_total_gb=memory_total_gb,
temperature_celsius=metrics['temperature'],
energy_joules=None # Will use power integration
)
except subprocess.TimeoutExpired:
raise RuntimeError("rocm-smi command timed out")
except Exception as e:
raise RuntimeError(f"Failed to get AMD GPU metrics: {e}")
def get_device_name(self) -> str:
"""Get AMD GPU device name."""
import subprocess
try:
result = subprocess.run(
['rocm-smi', '--showproductname', '-d', str(self.device_id)],
capture_output=True,
text=True,
timeout=5
)
if result.returncode == 0:
# Parse output to find device name
for line in result.stdout.split('\n'):
if 'Card series' in line or 'Card model' in line or 'name' in line.lower():
parts = line.split(':')
if len(parts) > 1:
return parts[1].strip()
except Exception:
pass
return f"AMD GPU {self.device_id}"
def cleanup(self):
"""Cleanup AMD resources."""
# No cleanup needed for command-line tool
pass
def get_gpu_monitor(device_id: int = 0) -> GPUMonitor:
"""
Factory function to automatically detect and create appropriate GPU monitor.
Args:
device_id: GPU device ID to monitor
Returns:
GPUMonitor instance (NVIDIAMonitor or AMDMonitor)
Raises:
RuntimeError: If no supported GPU is found
"""
# Try AMD first (rocm-smi based) as it's more commonly available
try:
return AMDMonitor(device_id)
except:
pass
# Try NVIDIA if AMD fails
try:
return NVIDIAMonitor(device_id)
except:
pass
# Try to import torch to detect GPU type as last resort
try:
import torch
if torch.cuda.is_available():
# Check if it's NVIDIA or AMD
device_name = torch.cuda.get_device_name(device_id).lower()
if 'nvidia' in device_name or 'tesla' in device_name or 'geforce' in device_name:
return NVIDIAMonitor(device_id)
elif 'amd' in device_name or 'radeon' in device_name or 'mi300' in device_name or 'mi200' in device_name:
return AMDMonitor(device_id)
except:
pass
raise RuntimeError(
"No supported GPU found. Make sure either ROCm (rocm-smi) or NVIDIA (pynvml) drivers are installed."
)
def list_available_gpus() -> List[str]:
"""
List all available GPUs.
Returns:
List of GPU names
"""
gpus = []
# Try NVIDIA
try:
import pynvml
pynvml.nvmlInit()
device_count = pynvml.nvmlDeviceGetCount()
for i in range(device_count):
handle = pynvml.nvmlDeviceGetHandleByIndex(i)
name = pynvml.nvmlDeviceGetName(handle)
if isinstance(name, bytes):
name = name.decode('utf-8')
gpus.append(f"GPU {i}: {name} (NVIDIA)")
pynvml.nvmlShutdown()
except:
pass
# Try AMD with rocm-smi
try:
import subprocess
import shutil
if shutil.which('rocm-smi'):
result = subprocess.run(
['rocm-smi', '--showid'],
capture_output=True,
text=True,
timeout=5
)
if result.returncode == 0:
# Parse device IDs from output
for line in result.stdout.split('\n'):
if not line.strip() or line.startswith('=') or 'Device' in line or 'ROCm' in line:
continue
parts = line.split()
if parts and parts[0].isdigit():
device_id = int(parts[0])
# Try to get device name
name_result = subprocess.run(
['rocm-smi', '--showproductname', '-d', str(device_id)],
capture_output=True,
text=True,
timeout=5
)
name = f"AMD GPU"
if name_result.returncode == 0:
for name_line in name_result.stdout.split('\n'):
if 'Card' in name_line or 'name' in name_line.lower():
parts_name = name_line.split(':')
if len(parts_name) > 1:
name = parts_name[1].strip()
break
gpus.append(f"GPU {device_id}: {name} (AMD)")
except:
pass
return gpus
if __name__ == "__main__":
"""Test GPU monitoring."""
print("=" * 60)
print("GPU Monitoring Test")
print("=" * 60)
# List available GPUs
print("\nAvailable GPUs:")
gpus = list_available_gpus()
if not gpus:
print(" No GPUs found!")
exit(1)
for gpu in gpus:
print(f" {gpu}")
# Test monitoring
print("\nTesting GPU 0 monitoring...")
try:
monitor = get_gpu_monitor(0)
print(f" Device: {monitor.get_device_name()}")
# Get metrics
metrics = monitor.get_metrics()
print(f"\nCurrent Metrics:")
print(f" Power: {metrics.power_watts:.2f} W")
print(f" GPU Utilization: {metrics.gpu_utilization_percent:.1f}%")
print(f" Memory: {metrics.memory_used_gb:.2f} / {metrics.memory_total_gb:.2f} GB")
if metrics.temperature_celsius:
print(f" Temperature: {metrics.temperature_celsius:.1f}°C")
# Test energy monitoring
print("\nTesting energy monitoring (5 seconds)...")
monitor.start_monitoring()
time.sleep(5)
energy = monitor.get_energy_consumed()
avg_power = monitor.get_average_power()
print(f" Energy consumed: {energy:.2f} J")
print(f" Average power: {avg_power:.2f} W")
monitor.cleanup()
print("\n✓ Monitoring test successful!")
except Exception as e:
print(f"\n✗ Error: {e}")
exit(1)

473
utils/metrics.py Normal file
View File

@@ -0,0 +1,473 @@
"""
Metrics Collection and Reporting for LLM Benchmarking
Provides centralized metrics collection, aggregation, and reporting.
"""
import json
import csv
from dataclasses import dataclass, asdict, field
from typing import Dict, List, Optional, Any
from pathlib import Path
import time
@dataclass
class StageMetrics:
"""Metrics for a specific stage (e.g., forward pass, prefill, etc.)."""
stage_name: str
duration_ms: float
tokens_processed: int
tokens_per_second: float
energy_joules: float
energy_per_token: float
avg_power_watts: float
peak_memory_gb: float
avg_gpu_util_percent: float
def to_dict(self) -> Dict[str, Any]:
"""Convert to dictionary."""
return asdict(self)
@dataclass
class PretrainMetrics:
"""Metrics for pretraining benchmark."""
model_name: str
gpu_name: str
attention_implementation: str
batch_size: int
sequence_length: int
num_steps: int
# Stage-specific metrics
forward: StageMetrics
backward: StageMetrics
optimizer: StageMetrics
# Overall metrics
total_duration_ms: float
total_tokens: int
total_tokens_per_second: float
total_energy_joules: float
total_energy_per_token: float
timestamp: float = field(default_factory=time.time)
def to_dict(self) -> Dict[str, Any]:
"""Convert to dictionary."""
return {
"model_name": self.model_name,
"gpu_name": self.gpu_name,
"attention_implementation": self.attention_implementation,
"batch_size": self.batch_size,
"sequence_length": self.sequence_length,
"num_steps": self.num_steps,
"forward": self.forward.to_dict(),
"backward": self.backward.to_dict(),
"optimizer": self.optimizer.to_dict(),
"total_duration_ms": self.total_duration_ms,
"total_tokens": self.total_tokens,
"total_tokens_per_second": self.total_tokens_per_second,
"total_energy_joules": self.total_energy_joules,
"total_energy_per_token": self.total_energy_per_token,
"timestamp": self.timestamp,
}
@dataclass
class InferenceMetrics:
"""Metrics for inference benchmark."""
model_name: str
gpu_name: str
attention_implementation: str
num_requests: int
prompt_length: int
generation_length: int
# Stage-specific metrics
prefill: StageMetrics # Time to First Token
decode: StageMetrics # Inter-Token Latency
# End-to-end metrics
e2e_latency_ms: float
e2e_tokens_per_second: float
e2e_energy_joules: float
e2e_energy_per_token: float
# Additional metrics
ttft_ms: float # Time to First Token (prefill duration per request)
itl_ms: float # Inter-Token Latency (decode duration / num_tokens)
timestamp: float = field(default_factory=time.time)
def to_dict(self) -> Dict[str, Any]:
"""Convert to dictionary."""
return {
"model_name": self.model_name,
"gpu_name": self.gpu_name,
"attention_implementation": self.attention_implementation,
"num_requests": self.num_requests,
"prompt_length": self.prompt_length,
"generation_length": self.generation_length,
"prefill": self.prefill.to_dict(),
"decode": self.decode.to_dict(),
"e2e_latency_ms": self.e2e_latency_ms,
"e2e_tokens_per_second": self.e2e_tokens_per_second,
"e2e_energy_joules": self.e2e_energy_joules,
"e2e_energy_per_token": self.e2e_energy_per_token,
"ttft_ms": self.ttft_ms,
"itl_ms": self.itl_ms,
"timestamp": self.timestamp,
}
class MetricsCollector:
"""Collects metrics during benchmark runs."""
def __init__(self):
"""Initialize metrics collector."""
self.metrics_history: List[Dict[str, Any]] = []
def add_pretrain_metrics(self, metrics: PretrainMetrics):
"""Add pretraining metrics."""
self.metrics_history.append({
"type": "pretrain",
"metrics": metrics.to_dict()
})
def add_inference_metrics(self, metrics: InferenceMetrics):
"""Add inference metrics."""
self.metrics_history.append({
"type": "inference",
"metrics": metrics.to_dict()
})
def get_all_metrics(self) -> List[Dict[str, Any]]:
"""Get all collected metrics."""
return self.metrics_history
def clear(self):
"""Clear all metrics."""
self.metrics_history.clear()
class MetricsReporter:
"""Formats and outputs benchmark results."""
@staticmethod
def print_pretrain_metrics(metrics: PretrainMetrics, verbose: bool = True):
"""Print pretraining metrics to console."""
print("\n" + "=" * 80)
print("PRETRAINING BENCHMARK RESULTS")
print("=" * 80)
print(f"\nModel: {metrics.model_name}")
print(f"GPU: {metrics.gpu_name}")
print(f"Attention: {metrics.attention_implementation}")
print(f"Batch Size: {metrics.batch_size}")
print(f"Sequence Length: {metrics.sequence_length}")
print(f"Training Steps: {metrics.num_steps}")
print("\n" + "-" * 80)
print("STAGE BREAKDOWN")
print("-" * 80)
# Forward pass
print(f"\n[1] FORWARD PASS")
MetricsReporter._print_stage_metrics(metrics.forward, verbose)
# Backward pass
print(f"\n[2] BACKWARD PASS")
MetricsReporter._print_stage_metrics(metrics.backward, verbose)
# Optimizer step
print(f"\n[3] OPTIMIZER STEP")
MetricsReporter._print_stage_metrics(metrics.optimizer, verbose)
# Overall
print("\n" + "-" * 80)
print("OVERALL METRICS")
print("-" * 80)
print(f" Total Duration: {metrics.total_duration_ms:>10.2f} ms")
print(f" Total Tokens: {metrics.total_tokens:>10,}")
print(f" Throughput: {metrics.total_tokens_per_second:>10.2f} tokens/s")
print(f" Total Energy: {metrics.total_energy_joules:>10.2f} J")
print(f" Energy per Token: {metrics.total_energy_per_token*1000:>10.4f} mJ/token")
print("=" * 80 + "\n")
@staticmethod
def print_inference_metrics(metrics: InferenceMetrics, verbose: bool = True):
"""Print inference metrics to console."""
print("\n" + "=" * 80)
print("INFERENCE BENCHMARK RESULTS")
print("=" * 80)
print(f"\nModel: {metrics.model_name}")
print(f"GPU: {metrics.gpu_name}")
print(f"Attention: {metrics.attention_implementation}")
print(f"Requests: {metrics.num_requests}")
print(f"Prompt Length: {metrics.prompt_length}")
print(f"Generation Length: {metrics.generation_length}")
print("\n" + "-" * 80)
print("STAGE BREAKDOWN")
print("-" * 80)
# Prefill
print(f"\n[1] PREFILL (Time to First Token)")
MetricsReporter._print_stage_metrics(metrics.prefill, verbose)
print(f" TTFT: {metrics.ttft_ms:>10.2f} ms")
# Decode
print(f"\n[2] DECODE (Inter-Token Latency)")
MetricsReporter._print_stage_metrics(metrics.decode, verbose)
print(f" ITL: {metrics.itl_ms:>10.2f} ms/token")
# End-to-end
print("\n" + "-" * 80)
print("END-TO-END METRICS")
print("-" * 80)
print(f" Request Latency: {metrics.e2e_latency_ms:>10.2f} ms")
print(f" Throughput: {metrics.e2e_tokens_per_second:>10.2f} tokens/s")
print(f" Total Energy: {metrics.e2e_energy_joules:>10.2f} J")
print(f" Energy per Token: {metrics.e2e_energy_per_token*1000:>10.4f} mJ/token")
print("=" * 80 + "\n")
@staticmethod
def _print_stage_metrics(stage: StageMetrics, verbose: bool = True):
"""Print metrics for a single stage."""
print(f" Duration: {stage.duration_ms:>10.2f} ms")
print(f" Tokens: {stage.tokens_processed:>10,}")
print(f" Throughput: {stage.tokens_per_second:>10.2f} tokens/s")
print(f" Energy: {stage.energy_joules:>10.2f} J")
print(f" Energy per Token: {stage.energy_per_token*1000:>10.4f} mJ/token")
if verbose:
print(f" Avg Power: {stage.avg_power_watts:>10.2f} W")
print(f" Peak Memory: {stage.peak_memory_gb:>10.2f} GB")
print(f" Avg GPU Utilization: {stage.avg_gpu_util_percent:>10.1f} %")
@staticmethod
def save_json(metrics: Any, output_path: Path):
"""
Save metrics to JSON file.
Args:
metrics: PretrainMetrics or InferenceMetrics object
output_path: Path to output JSON file
"""
output_path.parent.mkdir(parents=True, exist_ok=True)
with open(output_path, 'w') as f:
json.dump(metrics.to_dict(), f, indent=2)
print(f"Metrics saved to: {output_path}")
@staticmethod
def save_csv(metrics_list: List[Any], output_path: Path, benchmark_type: str = "pretrain"):
"""
Save multiple metrics to CSV file for comparison.
Args:
metrics_list: List of PretrainMetrics or InferenceMetrics objects
output_path: Path to output CSV file
benchmark_type: "pretrain" or "inference"
"""
if not metrics_list:
print("No metrics to save")
return
output_path.parent.mkdir(parents=True, exist_ok=True)
with open(output_path, 'w', newline='') as f:
if benchmark_type == "pretrain":
MetricsReporter._save_pretrain_csv(metrics_list, f)
else:
MetricsReporter._save_inference_csv(metrics_list, f)
print(f"CSV saved to: {output_path}")
@staticmethod
def _save_pretrain_csv(metrics_list: List[PretrainMetrics], file):
"""Save pretraining metrics to CSV."""
fieldnames = [
'gpu_name', 'attention_implementation', 'batch_size', 'sequence_length', 'num_steps',
'forward_duration_ms', 'forward_tokens_per_sec', 'forward_energy_j', 'forward_energy_per_token_mj',
'backward_duration_ms', 'backward_tokens_per_sec', 'backward_energy_j', 'backward_energy_per_token_mj',
'optimizer_duration_ms', 'optimizer_tokens_per_sec', 'optimizer_energy_j', 'optimizer_energy_per_token_mj',
'total_duration_ms', 'total_tokens_per_sec', 'total_energy_j', 'total_energy_per_token_mj',
'timestamp'
]
writer = csv.DictWriter(file, fieldnames=fieldnames)
writer.writeheader()
for m in metrics_list:
writer.writerow({
'gpu_name': m.gpu_name,
'attention_implementation': m.attention_implementation,
'batch_size': m.batch_size,
'sequence_length': m.sequence_length,
'num_steps': m.num_steps,
'forward_duration_ms': m.forward.duration_ms,
'forward_tokens_per_sec': m.forward.tokens_per_second,
'forward_energy_j': m.forward.energy_joules,
'forward_energy_per_token_mj': m.forward.energy_per_token * 1000,
'backward_duration_ms': m.backward.duration_ms,
'backward_tokens_per_sec': m.backward.tokens_per_second,
'backward_energy_j': m.backward.energy_joules,
'backward_energy_per_token_mj': m.backward.energy_per_token * 1000,
'optimizer_duration_ms': m.optimizer.duration_ms,
'optimizer_tokens_per_sec': m.optimizer.tokens_per_second,
'optimizer_energy_j': m.optimizer.energy_joules,
'optimizer_energy_per_token_mj': m.optimizer.energy_per_token * 1000,
'total_duration_ms': m.total_duration_ms,
'total_tokens_per_sec': m.total_tokens_per_second,
'total_energy_j': m.total_energy_joules,
'total_energy_per_token_mj': m.total_energy_per_token * 1000,
'timestamp': m.timestamp,
})
@staticmethod
def _save_inference_csv(metrics_list: List[InferenceMetrics], file):
"""Save inference metrics to CSV."""
fieldnames = [
'gpu_name', 'attention_implementation', 'num_requests', 'prompt_length', 'generation_length',
'prefill_duration_ms', 'prefill_tokens_per_sec', 'prefill_energy_j', 'prefill_energy_per_token_mj',
'ttft_ms',
'decode_duration_ms', 'decode_tokens_per_sec', 'decode_energy_j', 'decode_energy_per_token_mj',
'itl_ms',
'e2e_latency_ms', 'e2e_tokens_per_sec', 'e2e_energy_j', 'e2e_energy_per_token_mj',
'timestamp'
]
writer = csv.DictWriter(file, fieldnames=fieldnames)
writer.writeheader()
for m in metrics_list:
writer.writerow({
'gpu_name': m.gpu_name,
'attention_implementation': m.attention_implementation,
'num_requests': m.num_requests,
'prompt_length': m.prompt_length,
'generation_length': m.generation_length,
'prefill_duration_ms': m.prefill.duration_ms,
'prefill_tokens_per_sec': m.prefill.tokens_per_second,
'prefill_energy_j': m.prefill.energy_joules,
'prefill_energy_per_token_mj': m.prefill.energy_per_token * 1000,
'ttft_ms': m.ttft_ms,
'decode_duration_ms': m.decode.duration_ms,
'decode_tokens_per_sec': m.decode.tokens_per_second,
'decode_energy_j': m.decode.energy_joules,
'decode_energy_per_token_mj': m.decode.energy_per_token * 1000,
'itl_ms': m.itl_ms,
'e2e_latency_ms': m.e2e_latency_ms,
'e2e_tokens_per_sec': m.e2e_tokens_per_second,
'e2e_energy_j': m.e2e_energy_joules,
'e2e_energy_per_token_mj': m.e2e_energy_per_token * 1000,
'timestamp': m.timestamp,
})
if __name__ == "__main__":
"""Test metrics reporting."""
# Create sample pretraining metrics
forward = StageMetrics(
stage_name="forward",
duration_ms=100.5,
tokens_processed=1024,
tokens_per_second=10189.3,
energy_joules=25.3,
energy_per_token=0.0247,
avg_power_watts=251.7,
peak_memory_gb=45.2,
avg_gpu_util_percent=95.3
)
backward = StageMetrics(
stage_name="backward",
duration_ms=205.2,
tokens_processed=1024,
tokens_per_second=4991.2,
energy_joules=51.6,
energy_per_token=0.0504,
avg_power_watts=251.5,
peak_memory_gb=48.6,
avg_gpu_util_percent=97.1
)
optimizer = StageMetrics(
stage_name="optimizer",
duration_ms=15.3,
tokens_processed=1024,
tokens_per_second=66928.1,
energy_joules=3.8,
energy_per_token=0.0037,
avg_power_watts=248.4,
peak_memory_gb=48.6,
avg_gpu_util_percent=42.1
)
pretrain_metrics = PretrainMetrics(
model_name="Qwen/Qwen2.5-3B-Instruct",
gpu_name="NVIDIA A100 80GB",
attention_implementation="flash_attention_2",
batch_size=8,
sequence_length=2048,
num_steps=10,
forward=forward,
backward=backward,
optimizer=optimizer,
total_duration_ms=321.0,
total_tokens=10240,
total_tokens_per_second=31900.3,
total_energy_joules=80.7,
total_energy_per_token=0.00788
)
# Print pretrain metrics
MetricsReporter.print_pretrain_metrics(pretrain_metrics)
# Create sample inference metrics
prefill = StageMetrics(
stage_name="prefill",
duration_ms=45.2,
tokens_processed=512,
tokens_per_second=11327.4,
energy_joules=11.3,
energy_per_token=0.0221,
avg_power_watts=250.0,
peak_memory_gb=42.1,
avg_gpu_util_percent=89.2
)
decode = StageMetrics(
stage_name="decode",
duration_ms=223.5,
tokens_processed=100,
tokens_per_second=447.4,
energy_joules=55.9,
energy_per_token=0.559,
avg_power_watts=250.1,
peak_memory_gb=42.1,
avg_gpu_util_percent=62.3
)
inference_metrics = InferenceMetrics(
model_name="Qwen/Qwen2.5-3B-Instruct",
gpu_name="NVIDIA A100 80GB",
attention_implementation="flash_attention_2",
num_requests=10,
prompt_length=512,
generation_length=100,
prefill=prefill,
decode=decode,
e2e_latency_ms=268.7,
e2e_tokens_per_second=2277.9,
e2e_energy_joules=67.2,
e2e_energy_per_token=0.110,
ttft_ms=45.2,
itl_ms=2.235
)
# Print inference metrics
MetricsReporter.print_inference_metrics(inference_metrics)