commit 747c92ac6b542ec9fef5896735ececd91e8b6039 Author: Bole Ma Date: Thu Feb 5 23:18:26 2026 +0100 Initial commit diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..721247b --- /dev/null +++ b/.gitignore @@ -0,0 +1,408 @@ +# READ THIS BEFORE YOU REFACTOR ME +# +# setup.py uses the list of patterns in this file to decide +# what to delete, but it's not 100% sound. So, for example, +# if you delete aten/build/ because it's redundant with build/, +# aten/build/ will stop being cleaned. So be careful when +# refactoring this file! + +## Model cache +.md +model_cache/ + +## PyTorch +.coverage +coverage.xml +.dmypy.json +.gradle +.hypothesis +.mypy_cache +.additional_ci_files +.lintrunner.private.toml +/.extracted_scripts/ +**/.pytorch_specified_test_cases.csv +**/.pytorch-disabled-tests.json +*/*.pyc +*/*.so* +*/**/__pycache__ +*/**/*.dylib* +*/**/*.pyc +*/**/*.pyd +*/**/*.so* +*/**/**/*.pyc +*/**/**/**/*.pyc +*/**/**/**/**/*.pyc +aten/build/ +aten/src/ATen/Config.h +aten/src/ATen/cuda/CUDAConfig.h +aten/src/ATen/hip/HIPConfig.h +benchmarks/.data +caffe2/cpp_test/ +dist/ +docs/build/ +docs/cpp/src +docs/src/**/* +docs/cpp/build +docs/cpp/source/api +docs/cpp/source/html/ +docs/cpp/source/latex/ +docs/source/compile/generated/ +docs/source/generated/ +docs/source/compile/generated/ +log +usage_log.txt +usage_log* +test-reports/ +test/*.bak +test/**/*.bak +test/.coverage +test/.hypothesis/ +test/cpp/api/mnist +test/custom_operator/model.pt +test/debug/ +test/jit_hooks/*.pt +test/data/legacy_modules.t7 +test/data/*.pt +test/forward_backward_compatibility/nightly_schemas.txt +dropout_model.pt +test/generated_type_hints_smoketest.py +test/htmlcov +test/cpp_extensions/**/install +test/kernel.errors.txt +third_party/build/ +third_party/nccl/ +tools/coverage_plugins_package/pip-wheel-metadata/ +tools/shared/_utils_internal.py +tools/fast_nvcc/wrap_nvcc.sh +tools/fast_nvcc/wrap_nvcc.bat +tools/fast_nvcc/tmp/ +torch.egg-info/ +torch/_C/__init__.pyi +torch/_C/_nn.pyi +torch/_C/_VariableFunctions.pyi +torch/_VF.pyi +torch/return_types.pyi +torch/nn/functional.pyi +torch/utils/data/datapipes/datapipe.pyi +torch/csrc/autograd/generated/* +torch/csrc/functionalization/generated/* +torch/csrc/lazy/generated/*.[!m]* +torch_compile_debug/ +# Listed manually because some files in this directory are not generated +torch/testing/_internal/generated/annotated_fn_args.py +torch/testing/_internal/data/*.pt +torch/headeronly/version.h +torch/csrc/cudnn/cuDNN.cpp +torch/csrc/generated +torch/csrc/generic/TensorMethods.cpp +torch/csrc/inductor/aoti_torch/generated/*.cpp +torch/csrc/inductor/aoti_torch/generated/extend/* +torch/csrc/jit/generated/* +torch/csrc/jit/fuser/config.h +torch/csrc/nn/THCUNN.cpp +torch/csrc/nn/THCUNN.cwrap +torch/bin/ +torch/cmake/ +torch/lib/*.a* +torch/lib/*.dll* +torch/lib/*.exe* +torch/lib/*.dylib* +torch/lib/*.h +torch/lib/*.lib +torch/lib/*.pdb +torch/lib/*.so* +torch/lib/protobuf*.pc +torch/lib/build +torch/lib/caffe2/ +torch/lib/cmake +torch/lib/include +torch/lib/pkgconfig +torch/lib/protoc +torch/lib/protobuf/ +torch/lib/tmp_install +torch/lib/torch_shm_manager +torch/lib/site-packages/ +torch/lib/python* +torch/lib64 +torch/include/ +torch/share/ +torch/test/ +torch/utils/benchmark/utils/valgrind_wrapper/callgrind.h +torch/utils/benchmark/utils/valgrind_wrapper/valgrind.h +torch/version.py +torch/_inductor/kernel/vendored_templates/* +test/inductor/test_tlx* +minifier_launcher.py 
+aten/src/ATen/native/transformers/hip/flash_attn/ck/fav_v3/asm_fmha_v3_bwd_configs.hpp +aten/src/ATen/native/transformers/hip/flash_attn/ck/fav_v3/mha_bwd.hip +aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_fwd_d* +aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_bwd_d* +aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_bwd_convert* +aten/src/ATen/native/transformers/hip/flash_attn/ck/fwd_blob* +aten/src/ATen/native/transformers/hip/flash_attn/ck/bwd_blob* +aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_fwd_api* +aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_bwd_api* +# Root level file used in CI to specify certain env configs. +# E.g., see .circleci/config.yaml +env +.circleci/scripts/COMMIT_MSG +scripts/release_notes/*.json +sccache-stats*.json +lint.json +merge_record.json +.github/scripts/nightly_source_matrix.json + +# These files get copied over on invoking setup.py +torchgen/packaged/* +!torchgen/packaged/README.md + +# This file is injected by ROCm build scripts to bootstrap in torch/__init__.py. +torch/_rocm_init.py + +# IPython notebook checkpoints +.ipynb_checkpoints + +# Editor temporaries +*.swa +*.swb +*.swc +*.swd +*.swe +*.swf +*.swg +*.swh +*.swi +*.swj +*.swk +*.swl +*.swm +*.swn +*.swo +*.swp +*~ +.~lock.* + +# macOS dir files +.DS_Store + +# Ninja files +.ninja_deps +.ninja_log +compile_commands.json +*.egg-info/ +docs/source/scripts/activation_images/ +docs/source/scripts/quantization_backend_configs/ +docs/source/scripts/lr_scheduler_images/ + +## General + +# Compiled Object files +*.slo +*.lo +*.o +*.cuo +*.obj + +# Compiled Dynamic libraries +*.so +*.dylib +*.dll + +# Compiled Static libraries +*.lai +*.la +*.a +*.lib + +# Compiled protocol buffers +*.pb.h +*.pb.cc +*_pb2.py + +# Compiled python +*.pyc +*.pyd + +# Compiled MATLAB +*.mex* + +# NFS handle files +**/.nfs* + +# Sublime Text settings +*.sublime-workspace +*.sublime-project + +# Eclipse Project settings +*.*project +.settings + +# QtCreator files +*.user + +# PyCharm files +.idea + +# GDB history +.gdb_history + +## Caffe2 + +# build, distribute, and bins (+ python proto bindings) +build/ +# Allow tools/build/ for build support. +!tools/build/ +build_host_protoc +build_android +build_ios +.build_debug/* +.build_release/* +.build_profile/* +distribute/* +*.testbin +*.bin +cmake_build +.cmake_build +gen +.setuptools-cmake-build +.pytest_cache +aten/build/* + +# Linker scripts for prioritized text optimization +cmake/linker_script.ld + +# Bram +plsdontbreak + +# Generated documentation +docs/_site +docs/gathered +_site +doxygen +docs/dev + +# LevelDB files +*.sst +*.ldb +LOCK +CURRENT +MANIFEST-* + +# generated version file +caffe2/version.py + +# setup.py intermediates +.eggs +caffe2.egg-info +MANIFEST + +# Atom/Watchman required file +.watchmanconfig +.watchman + +# Files generated by CLion +cmake-build-debug + +# BEGIN NOT-CLEAN-FILES (setup.py handles this marker. Do not change.) +# +# Below files are not deleted by "setup.py clean". 
+ +# Downloaded bazel +tools/bazel + +# Visual Studio Code files +.vs +/.vscode/* +!/.vscode/extensions.json +!/.vscode/settings_recommended.json + +# YouCompleteMe config file +.ycm_extra_conf.py + +# Files generated when a patch is rejected +*.orig +*.rej + +# Files generated by ctags +CTAGS +GTAGS +GRTAGS +GSYMS +GPATH +tags +TAGS + + +# ccls file +.ccls-cache/ + +# clang tooling storage location +.clang-format-bin +.clang-tidy-bin +.lintbin + +# clangd background index +.clangd/ +.cache/ + +# bazel symlinks +bazel-* + +# xla repo +xla/ + +# direnv, posh-direnv +.env +.envrc +.psenvrc + +# generated shellcheck directories +.shellcheck_generated*/ + +# zip archives +*.zip + +# core dump files +**/core.[1-9]* + +# Generated if you use the pre-commit script for clang-tidy +pr.diff + +# coverage files +*/**/.coverage.* + +# buck generated files +.buckd/ +.lsp-buck-out/ +.lsp.buckd/ +buck-out/ + +# Downloaded libraries +third_party/ruy/ +third_party/glog/ + +# Virtualenv +.venv/ +venv/ + +# Log files +*.log +sweep/ + +# Android build artifacts +android/pytorch_android/.cxx +android/pytorch_android_torchvision/.cxx + +# Pyre configs (for internal usage) +.pyre_configuration +.pyre_configuration.codenav +.arcconfig +.stable_pyre_client +.pyre_client + +# Claude Code local configuration +CLAUDE.local.md +/test_*.py +/debug_*.py +CLAUDE_CONTEXT/ +/.claude/settings.local.json \ No newline at end of file diff --git a/AMD_FIX_SUMMARY.md b/AMD_FIX_SUMMARY.md new file mode 100644 index 0000000..b5fbca1 --- /dev/null +++ b/AMD_FIX_SUMMARY.md @@ -0,0 +1,100 @@ +# AMD GPU Monitoring Fix Summary + +## Issue +The AMDMonitor class was using incorrect pyrsmi API calls. The implementation attempted to use low-level `rocmsmi` module which has complex initialization and function signatures. + +## Solution +Updated to use the correct `rocml` high-level API from pyrsmi, based on the official example at: +`/anvme/workspace/ihpc125h-llm-profiles/pyrsmi/examples/llm_monitoring/monitor_llm_inference.py` + +## Changes Made + +### 1. Fixed AMDMonitor Class + +**Before** (incorrect): +```python +from pyrsmi import rocmsmi +ret = self.rocmsmi.rsmi_init(0) +power_uw = self.rocmsmi.rsmi_dev_power_ave_get(self.device_id) +``` + +**After** (correct): +```python +from pyrsmi import rocml +self.rocml.smi_initialize() +power_watts = self.rocml.smi_get_device_average_power(self.device_id) +``` + +**Key API Functions**: +- `rocml.smi_initialize()` - Initialize monitoring +- `rocml.smi_get_device_average_power(device_id)` - Get power in Watts (not microwatts!) +- `rocml.smi_get_device_utilization(device_id)` - Get GPU utilization % +- `rocml.smi_get_device_memory_used(device_id)` - Get memory used in bytes +- `rocml.smi_get_device_memory_total(device_id)` - Get total memory in bytes +- `rocml.smi_get_device_temperature(device_id)` - Get temperature +- `rocml.smi_get_device_name(device_id)` - Get device name +- `rocml.smi_shutdown()` - Cleanup + +### 2. Updated All SLURM Scripts for Apptainer + +All GPU benchmark scripts now run inside the apptainer container: + +**A100, H100, H200** (NVIDIA): +```bash +APPTAINER_IMAGE="/anvme/workspace/ihpc125h-llm-profiles/pytorch_25.10_tilelang.sif" +apptainer exec --nv $APPTAINER_IMAGE python run_benchmark.py ... +``` + +**MI300X** (AMD): +```bash +APPTAINER_IMAGE="/anvme/workspace/ihpc125h-llm-profiles/pytorch_25.10_tilelang.sif" +apptainer exec --rocm $APPTAINER_IMAGE python run_benchmark.py ... +``` + +Note: `--nv` for NVIDIA, `--rocm` for AMD + +### 3. 
Updated Documentation + +- README.md now mentions apptainer usage +- Updated setup instructions to use apptainer for model caching +- Added notes about container flags (--nv vs --rocm) + +## Testing + +To verify the AMD monitoring works: + +```bash +# Inside apptainer on MI300X node +apptainer exec --rocm pytorch_25.10_tilelang.sif python -c " +from utils.gpu_monitor import AMDMonitor +m = AMDMonitor(0) +print(f'GPU: {m.get_device_name()}') +metrics = m.get_metrics() +print(f'Power: {metrics.power_watts:.2f} W') +print(f'Utilization: {metrics.gpu_utilization_percent:.1f}%') +print(f'Memory: {metrics.memory_used_gb:.2f} / {metrics.memory_total_gb:.2f} GB') +m.cleanup() +" +``` + +## Files Modified + +1. `/anvme/workspace/ihpc125h-llm-profiles/llm-benchmark/utils/gpu_monitor.py` - Fixed AMDMonitor class +2. `/anvme/workspace/ihpc125h-llm-profiles/llm-benchmark/slurm_a100.sh` - Added apptainer +3. `/anvme/workspace/ihpc125h-llm-profiles/llm-benchmark/slurm_h100.sh` - Added apptainer +4. `/anvme/workspace/ihpc125h-llm-profiles/llm-benchmark/slurm_h200.sh` - Added apptainer +5. `/anvme/workspace/ihpc125h-llm-profiles/llm-benchmark/slurm_mi300x.sh` - Added apptainer with --rocm +6. `/anvme/workspace/ihpc125h-llm-profiles/llm-benchmark/README.md` - Updated documentation + +## Key Differences: rocml vs rocmsmi + +| Feature | rocml (High-level) | rocmsmi (Low-level) | +|---------|-------------------|---------------------| +| API Style | Simple functions | Complex C-style API | +| Initialization | `smi_initialize()` | `rsmi_init(0)` + error codes | +| Power | Returns Watts | Returns microwatts | +| Memory | Returns bytes | Returns bytes via enums | +| Error Handling | Returns -1 on error | Returns error codes | +| Ease of Use | Much easier | Complex | + +The `rocml` module is the recommended high-level Python API for pyrsmi. diff --git a/README.md b/README.md new file mode 100644 index 0000000..a98c03c --- /dev/null +++ b/README.md @@ -0,0 +1,311 @@ +# LLM Benchmark Suite + +A comprehensive benchmarking suite for comparing LLM performance (Qwen3-4B) across different GPU architectures: **MI300X**, **A100 80G**, **H100**, and **H200**. + +## Features + +- **Pretraining Benchmarks**: Separate metrics for forward, backward, and optimizer stages +- **Inference Benchmarks**: Separate metrics for prefill (TTFT) and decode (ITL) stages +- **Energy Monitoring**: GPU-specific energy and power measurement + - NVIDIA: pynvml + - AMD: pyrsmi +- **Attention Implementations**: + - FlashAttention-2 (A100, MI300X) + - FlashAttention-3 Hopper (H100, H200) + - Configurable via CLI +- **Comprehensive Metrics**: + - Tokens per second + - Energy per token + - Time to First Token (TTFT) + - Inter-Token Latency (ITL) + - End-to-End Request Latency + - GPU utilization and memory usage + +## Directory Structure + +``` +llm-benchmark/ +├── cache_model.py # Model caching script +├── benchmark_pretrain.py # Pretraining benchmark +├── benchmark_inference.py # Inference benchmark +├── run_benchmark.py # Main orchestration script +├── requirements.txt # Python dependencies +├── utils/ +│ ├── gpu_monitor.py # GPU monitoring (NVIDIA & AMD) +│ ├── metrics.py # Metrics collection and reporting +│ └── attention.py # Attention implementation helpers +├── configs/ +│ ├── a100.yaml +│ ├── h100.yaml +│ ├── h200.yaml +│ └── mi300x.yaml +└── results/ # Benchmark results (JSON) +``` + +## Setup + +### 1. 
Container Environment + +All benchmarks should be run inside the apptainer container: + +```bash +# Container is located at: +/anvme/workspace/ihpc125h-llm-profiles/pytorch_25.10_tilelang.sif +``` + +### 2. Install Dependencies (if not using apptainer) + +If you want to run directly without apptainer: + +```bash +# Install Python dependencies +pip install -r requirements.txt + +# For AMD GPUs, ensure ROCm and pyrsmi are installed +# For NVIDIA GPUs, ensure CUDA and pynvml are installed +``` + +### 3. Cache Model (Run on Head Node) + +**IMPORTANT**: Run this on the head node BEFORE allocating compute nodes, as compute nodes are typically offline. + +```bash +# Using apptainer (recommended) +apptainer exec --nv pytorch_25.10_tilelang.sif python cache_model.py \ + --model-name Qwen/Qwen3-4B \ + --cache-dir ./model_cache + +# Or directly (if dependencies installed) +python cache_model.py --model-name Qwen/Qwen3-4B --cache-dir ./model_cache +``` + +The model will be cached to `./model_cache` in the current directory (avoiding slow NFS $HOME). + +## Usage + +### Quick Start + +```bash +# Run both pretraining and inference benchmarks +python run_benchmark.py --mode both --model-path ./model_cache + +# Run only pretraining +python run_benchmark.py --mode pretrain --num-steps 20 + +# Run only inference +python run_benchmark.py --mode inference --num-requests 20 +``` + +### Detailed Usage + +#### List Available GPUs + +```bash +python run_benchmark.py --list-gpus +``` + +#### Pretraining Benchmark + +```bash +python benchmark_pretrain.py \ + --model-path ./model_cache \ + --model-name Qwen/Qwen3-4B \ + --attn-implementation auto \ + --batch-size 8 \ + --sequence-length 8192 \ + --num-steps 10 \ + --warmup-steps 3 \ + --output-dir ./results +``` + +**Metrics Reported** (per stage: forward, backward, optimizer): +- Duration (ms) +- Tokens processed +- Throughput (tokens/s) +- Energy (J) +- Energy per token (J/token) +- Average power (W) +- Peak memory (GB) +- GPU utilization (%) + +#### Inference Benchmark + +```bash +python benchmark_inference.py \ + --model-path ./model_cache \ + --model-name Qwen/Qwen3-4B \ + --attn-implementation auto \ + --num-requests 10 \ + --prompt-length 512 \ + --generation-length 100 \ + --warmup-requests 2 \ + --output-dir ./results +``` + +**Metrics Reported**: +- **Prefill**: TTFT, throughput, energy per token +- **Decode**: ITL, throughput, energy per token +- **End-to-End**: Request latency, total throughput, total energy + +### Attention Implementations + +The benchmark automatically selects the optimal attention implementation based on GPU: +- **A100, MI300X**: `flash_attention_2` +- **H100, H200**: `flash_attention_3_hopper` + +Override with `--attn-implementation`: + +```bash +# Force FlashAttention-3 Hopper on H100 +python run_benchmark.py --attn-implementation flash_attention_3_hopper + +# Use SDPA instead +python run_benchmark.py --attn-implementation sdpa +``` + +Available options: +- `auto` - Auto-detect based on GPU +- `flash_attention_2` - FlashAttention-2 (all GPUs) +- `flash_attention_3_hopper` - FlashAttention-3 for H100/H200 +- `sdpa` - PyTorch Scaled Dot Product Attention +- `eager` - Standard PyTorch attention + +## Running on SLURM + +All SLURM scripts are configured to run inside the apptainer container. 
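+For reference, such a wrapper is roughly shaped like the sketch below. This is illustrative only: the `#SBATCH` directives (job name, GPU request, time limit) are placeholders, and the repository's actual `slurm_*.sh` scripts are the source of truth.
+
+```bash
+#!/bin/bash
+#SBATCH --job-name=llm-bench        # placeholder job name
+#SBATCH --gres=gpu:1                # placeholder: one GPU per run
+#SBATCH --time=02:00:00             # placeholder time limit
+
+APPTAINER_IMAGE="/anvme/workspace/ihpc125h-llm-profiles/pytorch_25.10_tilelang.sif"
+
+# Use --nv on NVIDIA nodes (A100/H100/H200); use --rocm instead on MI300X
+apptainer exec --nv "$APPTAINER_IMAGE" python run_benchmark.py \
+    --mode both \
+    --model-path ./model_cache \
+    --output-dir ./results
+```
+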
First cache the model on the head node: + +```bash +# On head node (with internet access) +apptainer exec --nv pytorch_25.10_tilelang.sif python cache_model.py \ + --model-name Qwen/Qwen3-4B \ + --cache-dir ./model_cache +``` + +Then submit jobs: + +```bash +# A100 +sbatch slurm_a100.sh + +# H100 +sbatch slurm_h100.sh + +# H200 +sbatch slurm_h200.sh + +# MI300X +sbatch slurm_mi300x.sh +``` + +**Note**: +- NVIDIA GPUs use `--nv` flag +- AMD GPUs use `--rocm` flag + +## Output + +Results are saved to the `--output-dir` directory (default: `./results/`): + +- `pretrain__.json` - Pretraining metrics +- `inference__.json` - Inference metrics + +Example output: + +``` +=============================================================================== +PRETRAINING BENCHMARK RESULTS +=============================================================================== + +Model: Qwen/Qwen3-4B +GPU: NVIDIA A100 80GB +Attention: flash_attention_2 +Batch Size: 8 +Sequence Length: 8192 +Training Steps: 10 + +------------------------------------------------------------------------------- +STAGE BREAKDOWN +------------------------------------------------------------------------------- + +[1] FORWARD PASS + Duration: 1005.23 ms + Tokens: 163,840 + Throughput: 163,012.45 tokens/s + Energy: 253.0 J + Energy per Token: 1.5443 mJ/token + +[2] BACKWARD PASS + Duration: 2052.11 ms + Tokens: 163,840 + Throughput: 79,857.23 tokens/s + Energy: 516.2 J + Energy per Token: 3.1513 mJ/token + +[3] OPTIMIZER STEP + Duration: 153.42 ms + Tokens: 163,840 + Throughput: 1,068,012.34 tokens/s + Energy: 38.4 J + Energy per Token: 0.2344 mJ/token + +------------------------------------------------------------------------------- +OVERALL METRICS +------------------------------------------------------------------------------- + Total Duration: 3210.76 ms + Total Tokens: 163,840 + Throughput: 51,012.45 tokens/s + Total Energy: 807.6 J + Energy per Token: 4.9300 mJ/token +=============================================================================== +``` + +## Key Metrics Reference + +### Pretraining +- **Forward**: Input processing and loss calculation +- **Backward**: Gradient computation +- **Optimizer**: Weight updates + +### Inference +- **TTFT (Time to First Token)**: Prefill latency +- **ITL (Inter-Token Latency)**: Average decode time per token +- **E2E Latency**: Total request time (prefill + decode) + +### Energy +- **Energy (J)**: Total energy consumed +- **Energy per Token (mJ/token)**: Energy efficiency metric +- **Average Power (W)**: Power consumption during stage + +## Troubleshooting + +### Model Not Found +Ensure you've cached the model first: +```bash +python cache_model.py --model-name Qwen/Qwen2.5-3B-Instruct --cache-dir ./model_cache +``` + +### GPU Monitoring Errors +- **NVIDIA**: Install pynvml: `pip install pynvml` +- **AMD**: Install pyrsmi: `pip install pyrsmi` + +### FlashAttention-3 Not Found +For H100/H200, ensure FlashAttention-3 is installed. 
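+One quick way to verify is to try importing the FA3 interface inside the container. This assumes the Hopper build exposes the `flash_attn_interface` module, as in the upstream flash-attention repository; adjust the import if your build differs:
+
+```bash
+# Prints "FA3 available" only if the Hopper interface imports cleanly
+apptainer exec --nv pytorch_25.10_tilelang.sif python -c "import flash_attn_interface; print('FA3 available')"
+```
+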
If not available, use: +```bash +python run_benchmark.py --attn-implementation flash_attention_2 +``` + +### Out of Memory +Reduce batch size or sequence length: +```bash +python run_benchmark.py --batch-size 4 --sequence-length 1024 +``` + +## Citation + +If you use this benchmark suite, please cite: +- [FlashAttention-2](https://github.com/Dao-AILab/flash-attention) +- [FlashAttention-3](https://github.com/Dao-AILab/flash-attention) (for Hopper) +- [Qwen Models](https://huggingface.co/Qwen) + +## License + +MIT License - see LICENSE file for details diff --git a/benchmark_inference.py b/benchmark_inference.py new file mode 100755 index 0000000..0895e21 --- /dev/null +++ b/benchmark_inference.py @@ -0,0 +1,417 @@ +#!/usr/bin/env python3 +""" +Inference Benchmark for LLM Performance Evaluation + +Measures performance and energy metrics for inference workloads with +separate measurements for prefill and decode stages. +""" + +import argparse +import os +import sys +import time +from pathlib import Path +from typing import Optional + +import torch +from transformers import AutoModelForCausalLM, AutoTokenizer +from tqdm import tqdm + +# Add utils to path +sys.path.insert(0, str(Path(__file__).parent)) +from utils.gpu_monitor import get_gpu_monitor +from utils.metrics import StageMetrics, InferenceMetrics, MetricsReporter +from utils.attention import get_default_attention, configure_model_attention, validate_attention_for_gpu + + +def benchmark_inference( + model_name_or_path: str, + attn_implementation: str = "auto", + num_requests: int = 10, + prompt_length: int = 512, + generation_length: int = 100, + warmup_requests: int = 2, + device: str = "cuda", + device_id: int = 0, + output_dir: Optional[str] = None, + verbose: bool = True, +): + """ + Run inference benchmark. 
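+
+    Prefill and decode are measured separately: the prompt forward pass is
+    timed and energy-metered as prefill (TTFT), and the token-by-token
+    generation loop with the KV cache as decode (ITL).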
+ + Args: + model_name_or_path: Path to model or HuggingFace identifier + attn_implementation: Attention implementation to use + num_requests: Number of inference requests to measure + prompt_length: Length of input prompt + generation_length: Number of tokens to generate + warmup_requests: Number of warmup requests + device: Device to use + device_id: GPU device ID + output_dir: Directory to save results + verbose: Print verbose output + """ + print("=" * 80) + print("INFERENCE BENCHMARK") + print("=" * 80) + + # Initialize GPU monitor + if verbose: + print("\n[1/7] Initializing GPU monitor...") + monitor = get_gpu_monitor(device_id) + gpu_name = monitor.get_device_name() + if verbose: + print(f" GPU: {gpu_name}") + + # Determine attention implementation + if attn_implementation == "auto": + attn_implementation = get_default_attention(gpu_name) + if verbose: + print(f" Auto-selected attention: {attn_implementation}") + + # Validate attention for GPU + valid, warning = validate_attention_for_gpu(attn_implementation, gpu_name) + if warning and verbose: + print(f" ⚠ {warning}") + + # Load model + if verbose: + print(f"\n[2/7] Loading model: {model_name_or_path}") + + # Determine attn_implementation parameter for model loading + load_attn = "flash_attention_2" if attn_implementation in ["flash_attention_2", "flash_attention_3_hopper"] else attn_implementation + + try: + model = AutoModelForCausalLM.from_pretrained( + model_name_or_path, + torch_dtype=torch.bfloat16, + attn_implementation=load_attn, + trust_remote_code=True, + ) + model = model.to(device) + + # Configure attention (patch if needed for FA3) + model = configure_model_attention(model, attn_implementation, verbose=verbose) + + if verbose: + total_params = sum(p.numel() for p in model.parameters()) + print(f" Total parameters: {total_params:,} ({total_params/1e9:.2f}B)") + except Exception as e: + print(f"✗ Error loading model: {e}") + sys.exit(1) + + # Load tokenizer + if verbose: + print(f"\n[3/7] Loading tokenizer...") + try: + tokenizer = AutoTokenizer.from_pretrained( + model_name_or_path, + trust_remote_code=True + ) + except Exception as e: + print(f"✗ Error loading tokenizer: {e}") + sys.exit(1) + + # Generate synthetic prompts + if verbose: + print(f"\n[4/7] Generating synthetic prompts...") + print(f" Prompt length: {prompt_length}") + print(f" Generation length: {generation_length}") + + # Create random input_ids (synthetic prompts) + vocab_size = model.config.vocab_size + # We'll create one prompt and reuse it + prompt_ids = torch.randint(0, vocab_size, (1, prompt_length), device=device) + + # Warmup + if verbose: + print(f"\n[5/7] Running warmup ({warmup_requests} requests)...") + model.eval() + with torch.no_grad(): + for _ in range(warmup_requests): + _ = model.generate( + prompt_ids, + max_new_tokens=generation_length, + do_sample=False, + pad_token_id=tokenizer.eos_token_id + ) + + # Synchronize before benchmarking + torch.cuda.synchronize() + + # Benchmark + if verbose: + print(f"\n[6/7] Running benchmark ({num_requests} requests)...") + + # Storage for per-request metrics + prefill_times = [] + decode_times = [] + e2e_times = [] + + prefill_energies = [] + decode_energies = [] + e2e_energies = [] + + prefill_powers = [] + decode_powers = [] + + memory_usage = [] + gpu_utils = [] + + # For inference, we separate prefill (first token) from decode (remaining tokens) + # We'll use a custom generation loop to measure them separately + + for req_idx in tqdm(range(num_requests), desc="Benchmarking"): + # === PREFILL 
PHASE (Time to First Token) === + # This is the forward pass with the prompt to get the first token + + monitor.start_monitoring() + torch.cuda.synchronize() + prefill_start = time.perf_counter() + + with torch.no_grad(): + # Forward pass with prompt + outputs = model(input_ids=prompt_ids, use_cache=True) + logits = outputs.logits + past_key_values = outputs.past_key_values + + # Get first generated token + next_token_logits = logits[:, -1, :] + next_token = torch.argmax(next_token_logits, dim=-1, keepdim=True) + + torch.cuda.synchronize() + prefill_time = time.perf_counter() - prefill_start + prefill_energy = monitor.get_energy_consumed() + prefill_power = monitor.get_average_power() + + prefill_times.append(prefill_time * 1000) # Convert to ms + prefill_energies.append(prefill_energy) + prefill_powers.append(prefill_power) + + # === DECODE PHASE (Inter-Token Latency) === + # Generate remaining tokens one by one + + monitor.start_monitoring() + torch.cuda.synchronize() + decode_start = time.perf_counter() + + generated_tokens = [next_token] + + with torch.no_grad(): + for _ in range(generation_length - 1): + # Forward pass with single token using cached keys/values + outputs = model( + input_ids=next_token, + past_key_values=past_key_values, + use_cache=True + ) + logits = outputs.logits + past_key_values = outputs.past_key_values + + # Get next token + next_token_logits = logits[:, -1, :] + next_token = torch.argmax(next_token_logits, dim=-1, keepdim=True) + generated_tokens.append(next_token) + + torch.cuda.synchronize() + decode_time = time.perf_counter() - decode_start + decode_energy = monitor.get_energy_consumed() + decode_power = monitor.get_average_power() + + decode_times.append(decode_time * 1000) # Convert to ms + decode_energies.append(decode_energy) + decode_powers.append(decode_power) + + # End-to-end metrics + e2e_time = prefill_time + decode_time + e2e_energy = prefill_energy + decode_energy + + e2e_times.append(e2e_time * 1000) # Convert to ms + e2e_energies.append(e2e_energy) + + # Get memory and utilization + metrics = monitor.get_metrics() + memory_usage.append(metrics.memory_used_gb) + gpu_utils.append(metrics.gpu_utilization_percent) + + # Compute aggregated metrics + + # Prefill metrics (TTFT) + prefill_duration_ms = sum(prefill_times) + prefill_energy_j = sum(prefill_energies) + prefill_tokens = prompt_length * num_requests + prefill_tps = prefill_tokens / (prefill_duration_ms / 1000) + prefill_ept = prefill_energy_j / prefill_tokens + avg_ttft_ms = sum(prefill_times) / len(prefill_times) + + prefill_metrics = StageMetrics( + stage_name="prefill", + duration_ms=prefill_duration_ms, + tokens_processed=prefill_tokens, + tokens_per_second=prefill_tps, + energy_joules=prefill_energy_j, + energy_per_token=prefill_ept, + avg_power_watts=sum(prefill_powers) / len(prefill_powers), + peak_memory_gb=max(memory_usage), + avg_gpu_util_percent=sum(gpu_utils) / len(gpu_utils) + ) + + # Decode metrics (ITL) + decode_duration_ms = sum(decode_times) + decode_energy_j = sum(decode_energies) + decode_tokens = generation_length * num_requests + decode_tps = decode_tokens / (decode_duration_ms / 1000) + decode_ept = decode_energy_j / decode_tokens + avg_itl_ms = sum(decode_times) / len(decode_times) / generation_length + + decode_metrics = StageMetrics( + stage_name="decode", + duration_ms=decode_duration_ms, + tokens_processed=decode_tokens, + tokens_per_second=decode_tps, + energy_joules=decode_energy_j, + energy_per_token=decode_ept, + avg_power_watts=sum(decode_powers) / 
len(decode_powers), + peak_memory_gb=max(memory_usage), + avg_gpu_util_percent=sum(gpu_utils) / len(gpu_utils) + ) + + # End-to-end metrics + e2e_latency_ms = sum(e2e_times) / len(e2e_times) + e2e_energy_j = sum(e2e_energies) + total_tokens = (prompt_length + generation_length) * num_requests + e2e_tps = total_tokens / (sum(e2e_times) / 1000) + e2e_ept = e2e_energy_j / total_tokens + + # Create metrics object + metrics = InferenceMetrics( + model_name=model_name_or_path, + gpu_name=gpu_name, + attention_implementation=attn_implementation, + num_requests=num_requests, + prompt_length=prompt_length, + generation_length=generation_length, + prefill=prefill_metrics, + decode=decode_metrics, + e2e_latency_ms=e2e_latency_ms, + e2e_tokens_per_second=e2e_tps, + e2e_energy_joules=e2e_energy_j, + e2e_energy_per_token=e2e_ept, + ttft_ms=avg_ttft_ms, + itl_ms=avg_itl_ms + ) + + # Print results + if verbose: + print() + MetricsReporter.print_inference_metrics(metrics, verbose=verbose) + + # Save results + if output_dir: + output_path = Path(output_dir) + output_path.mkdir(parents=True, exist_ok=True) + + # Save JSON + json_path = output_path / f"inference_{gpu_name.replace(' ', '_')}_{attn_implementation}.json" + MetricsReporter.save_json(metrics, json_path) + + # Cleanup + monitor.cleanup() + del model + torch.cuda.empty_cache() + + return metrics + + +def main(): + parser = argparse.ArgumentParser( + description="LLM Inference Benchmark", + formatter_class=argparse.RawDescriptionHelpFormatter + ) + + parser.add_argument( + "--model-path", + type=str, + default="./model_cache", + help="Path to cached model" + ) + + parser.add_argument( + "--model-name", + type=str, + default="Qwen/Qwen3-4B", + help="Model name (for reporting)" + ) + + parser.add_argument( + "--attn-implementation", + type=str, + default="auto", + choices=["auto", "flash_attention_2", "flash_attention_3_hopper", "sdpa", "eager"], + help="Attention implementation to use" + ) + + parser.add_argument( + "--num-requests", + type=int, + default=10, + help="Number of inference requests" + ) + + parser.add_argument( + "--prompt-length", + type=int, + default=512, + help="Prompt length in tokens" + ) + + parser.add_argument( + "--generation-length", + type=int, + default=100, + help="Number of tokens to generate" + ) + + parser.add_argument( + "--warmup-requests", + type=int, + default=2, + help="Number of warmup requests" + ) + + parser.add_argument( + "--device-id", + type=int, + default=0, + help="GPU device ID" + ) + + parser.add_argument( + "--output-dir", + type=str, + default="./results", + help="Output directory for results" + ) + + args = parser.parse_args() + + # Set environment variables for HuggingFace cache + if Path(args.model_path).exists(): + os.environ['HF_HOME'] = args.model_path + + benchmark_inference( + model_name_or_path=args.model_name, + attn_implementation=args.attn_implementation, + num_requests=args.num_requests, + prompt_length=args.prompt_length, + generation_length=args.generation_length, + warmup_requests=args.warmup_requests, + device="cuda", + device_id=args.device_id, + output_dir=args.output_dir, + verbose=True + ) + + +if __name__ == "__main__": + main() diff --git a/benchmark_pretrain.py b/benchmark_pretrain.py new file mode 100755 index 0000000..659ff3f --- /dev/null +++ b/benchmark_pretrain.py @@ -0,0 +1,406 @@ +#!/usr/bin/env python3 +""" +Pretraining Benchmark for LLM Performance Evaluation + +Measures performance and energy metrics for pretraining workloads with +separate measurements for forward, 
backward, and optimizer stages. +""" + +import argparse +import os +import sys +import time +from pathlib import Path +from typing import Optional + +import torch +import torch.nn as nn +from transformers import AutoModelForCausalLM, AutoTokenizer +from tqdm import tqdm + +# Add utils to path +sys.path.insert(0, str(Path(__file__).parent)) +from utils.gpu_monitor import get_gpu_monitor +from utils.metrics import StageMetrics, PretrainMetrics, MetricsReporter +from utils.attention import get_default_attention, configure_model_attention, validate_attention_for_gpu + + +def benchmark_pretrain( + model_name_or_path: str, + attn_implementation: str = "auto", + batch_size: int = 8, + sequence_length: int = 2048, + num_steps: int = 10, + warmup_steps: int = 3, + device: str = "cuda", + device_id: int = 0, + output_dir: Optional[str] = None, + verbose: bool = True, +): + """ + Run pretraining benchmark. + + Args: + model_name_or_path: Path to model or HuggingFace identifier + attn_implementation: Attention implementation to use + batch_size: Batch size for training + sequence_length: Sequence length + num_steps: Number of training steps to measure + warmup_steps: Number of warmup steps before measurement + device: Device to use + device_id: GPU device ID + output_dir: Directory to save results + verbose: Print verbose output + """ + print("=" * 80) + print("PRETRAINING BENCHMARK") + print("=" * 80) + + # Initialize GPU monitor + if verbose: + print("\n[1/6] Initializing GPU monitor...") + monitor = get_gpu_monitor(device_id) + gpu_name = monitor.get_device_name() + if verbose: + print(f" GPU: {gpu_name}") + + # Determine attention implementation + if attn_implementation == "auto": + attn_implementation = get_default_attention(gpu_name) + if verbose: + print(f" Auto-selected attention: {attn_implementation}") + + # Validate attention for GPU + valid, warning = validate_attention_for_gpu(attn_implementation, gpu_name) + if warning and verbose: + print(f" ⚠ {warning}") + + # Load model + if verbose: + print(f"\n[2/6] Loading model: {model_name_or_path}") + + # Determine attn_implementation parameter for model loading + load_attn = "flash_attention_2" if attn_implementation in ["flash_attention_2", "flash_attention_3_hopper"] else attn_implementation + + try: + model = AutoModelForCausalLM.from_pretrained( + model_name_or_path, + torch_dtype=torch.bfloat16, + attn_implementation=load_attn, + trust_remote_code=True, + ) + model = model.to(device) + + # Configure attention (patch if needed for FA3) + model = configure_model_attention(model, attn_implementation, verbose=verbose) + + if verbose: + total_params = sum(p.numel() for p in model.parameters()) + trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad) + print(f" Total parameters: {total_params:,} ({total_params/1e9:.2f}B)") + print(f" Trainable parameters: {trainable_params:,}") + except Exception as e: + print(f"✗ Error loading model: {e}") + sys.exit(1) + + # Setup optimizer + if verbose: + print(f"\n[3/6] Setting up optimizer...") + optimizer = torch.optim.AdamW(model.parameters(), lr=1e-4) + + # Generate synthetic training data + if verbose: + print(f"\n[4/6] Generating synthetic training data...") + print(f" Batch size: {batch_size}") + print(f" Sequence length: {sequence_length}") + + # Create random input_ids (synthetic data) + vocab_size = model.config.vocab_size + input_ids = torch.randint(0, vocab_size, (batch_size, sequence_length), device=device) + labels = input_ids.clone() + + # Warmup + if verbose: + 
print(f"\n[5/6] Running warmup ({warmup_steps} steps)...") + model.train() + for _ in range(warmup_steps): + optimizer.zero_grad() + outputs = model(input_ids=input_ids, labels=labels) + loss = outputs.loss + loss.backward() + optimizer.step() + + # Synchronize before benchmarking + torch.cuda.synchronize() + + # Benchmark + if verbose: + print(f"\n[6/6] Running benchmark ({num_steps} steps)...") + + # Storage for per-step metrics + forward_times = [] + backward_times = [] + optimizer_times = [] + + forward_energies = [] + backward_energies = [] + optimizer_energies = [] + + forward_powers = [] + backward_powers = [] + optimizer_powers = [] + + memory_usage = [] + gpu_utils = [] + + total_tokens = batch_size * sequence_length * num_steps + + for step in tqdm(range(num_steps), desc="Benchmarking"): + # === FORWARD PASS === + monitor.start_monitoring() + torch.cuda.synchronize() + start_time = time.perf_counter() + + optimizer.zero_grad() + outputs = model(input_ids=input_ids, labels=labels) + loss = outputs.loss + + torch.cuda.synchronize() + forward_time = time.perf_counter() - start_time + forward_energy = monitor.get_energy_consumed() + forward_power = monitor.get_average_power() + + forward_times.append(forward_time * 1000) # Convert to ms + forward_energies.append(forward_energy) + forward_powers.append(forward_power) + + # === BACKWARD PASS === + monitor.start_monitoring() + torch.cuda.synchronize() + start_time = time.perf_counter() + + loss.backward() + + torch.cuda.synchronize() + backward_time = time.perf_counter() - start_time + backward_energy = monitor.get_energy_consumed() + backward_power = monitor.get_average_power() + + backward_times.append(backward_time * 1000) # Convert to ms + backward_energies.append(backward_energy) + backward_powers.append(backward_power) + + # === OPTIMIZER STEP === + monitor.start_monitoring() + torch.cuda.synchronize() + start_time = time.perf_counter() + + optimizer.step() + + torch.cuda.synchronize() + optimizer_time = time.perf_counter() - start_time + optimizer_energy = monitor.get_energy_consumed() + optimizer_power = monitor.get_average_power() + + optimizer_times.append(optimizer_time * 1000) # Convert to ms + optimizer_energies.append(optimizer_energy) + optimizer_powers.append(optimizer_power) + + # Get memory and utilization + metrics = monitor.get_metrics() + memory_usage.append(metrics.memory_used_gb) + gpu_utils.append(metrics.gpu_utilization_percent) + + # Compute aggregated metrics + tokens_per_step = batch_size * sequence_length + + # Forward metrics + forward_duration_ms = sum(forward_times) + forward_energy_j = sum(forward_energies) + forward_tokens = tokens_per_step * num_steps + forward_tps = forward_tokens / (forward_duration_ms / 1000) + forward_ept = forward_energy_j / forward_tokens + forward_metrics = StageMetrics( + stage_name="forward", + duration_ms=forward_duration_ms, + tokens_processed=forward_tokens, + tokens_per_second=forward_tps, + energy_joules=forward_energy_j, + energy_per_token=forward_ept, + avg_power_watts=sum(forward_powers) / len(forward_powers), + peak_memory_gb=max(memory_usage), + avg_gpu_util_percent=sum(gpu_utils) / len(gpu_utils) + ) + + # Backward metrics + backward_duration_ms = sum(backward_times) + backward_energy_j = sum(backward_energies) + backward_tokens = tokens_per_step * num_steps + backward_tps = backward_tokens / (backward_duration_ms / 1000) + backward_ept = backward_energy_j / backward_tokens + backward_metrics = StageMetrics( + stage_name="backward", + 
duration_ms=backward_duration_ms, + tokens_processed=backward_tokens, + tokens_per_second=backward_tps, + energy_joules=backward_energy_j, + energy_per_token=backward_ept, + avg_power_watts=sum(backward_powers) / len(backward_powers), + peak_memory_gb=max(memory_usage), + avg_gpu_util_percent=sum(gpu_utils) / len(gpu_utils) + ) + + # Optimizer metrics + optimizer_duration_ms = sum(optimizer_times) + optimizer_energy_j = sum(optimizer_energies) + optimizer_tokens = tokens_per_step * num_steps + optimizer_tps = optimizer_tokens / (optimizer_duration_ms / 1000) + optimizer_ept = optimizer_energy_j / optimizer_tokens + optimizer_metrics = StageMetrics( + stage_name="optimizer", + duration_ms=optimizer_duration_ms, + tokens_processed=optimizer_tokens, + tokens_per_second=optimizer_tps, + energy_joules=optimizer_energy_j, + energy_per_token=optimizer_ept, + avg_power_watts=sum(optimizer_powers) / len(optimizer_powers), + peak_memory_gb=max(memory_usage), + avg_gpu_util_percent=sum(gpu_utils) / len(gpu_utils) + ) + + # Overall metrics + total_duration_ms = forward_duration_ms + backward_duration_ms + optimizer_duration_ms + total_energy_j = forward_energy_j + backward_energy_j + optimizer_energy_j + total_tps = total_tokens / (total_duration_ms / 1000) + total_ept = total_energy_j / total_tokens + + # Create metrics object + metrics = PretrainMetrics( + model_name=model_name_or_path, + gpu_name=gpu_name, + attention_implementation=attn_implementation, + batch_size=batch_size, + sequence_length=sequence_length, + num_steps=num_steps, + forward=forward_metrics, + backward=backward_metrics, + optimizer=optimizer_metrics, + total_duration_ms=total_duration_ms, + total_tokens=total_tokens, + total_tokens_per_second=total_tps, + total_energy_joules=total_energy_j, + total_energy_per_token=total_ept + ) + + # Print results + MetricsReporter.print_pretrain_metrics(metrics, verbose=verbose) + + # Save results + if output_dir: + output_path = Path(output_dir) + output_path.mkdir(parents=True, exist_ok=True) + + # Save JSON + json_path = output_path / f"pretrain_{gpu_name.replace(' ', '_')}_{attn_implementation}.json" + MetricsReporter.save_json(metrics, json_path) + + # Cleanup + monitor.cleanup() + del model + torch.cuda.empty_cache() + + return metrics + + +def main(): + parser = argparse.ArgumentParser( + description="LLM Pretraining Benchmark", + formatter_class=argparse.RawDescriptionHelpFormatter + ) + + parser.add_argument( + "--model-path", + type=str, + default="./model_cache", + help="Path to cached model" + ) + + parser.add_argument( + "--model-name", + type=str, + default="Qwen/Qwen3-4B", + help="Model name (for reporting)" + ) + + parser.add_argument( + "--attn-implementation", + type=str, + default="auto", + choices=["auto", "flash_attention_2", "flash_attention_3_hopper", "sdpa", "eager"], + help="Attention implementation to use" + ) + + parser.add_argument( + "--batch-size", + type=int, + default=8, + help="Batch size" + ) + + parser.add_argument( + "--sequence-length", + type=int, + default=8192, + help="Sequence length" + ) + + parser.add_argument( + "--num-steps", + type=int, + default=10, + help="Number of training steps" + ) + + parser.add_argument( + "--warmup-steps", + type=int, + default=3, + help="Number of warmup steps" + ) + + parser.add_argument( + "--device-id", + type=int, + default=0, + help="GPU device ID" + ) + + parser.add_argument( + "--output-dir", + type=str, + default="./results", + help="Output directory for results" + ) + + args = parser.parse_args() + + # Set 
environment variables for HuggingFace cache + if Path(args.model_path).exists(): + os.environ['HF_HOME'] = args.model_path + + benchmark_pretrain( + model_name_or_path=args.model_name, + attn_implementation=args.attn_implementation, + batch_size=args.batch_size, + sequence_length=args.sequence_length, + num_steps=args.num_steps, + warmup_steps=args.warmup_steps, + device="cuda", + device_id=args.device_id, + output_dir=args.output_dir, + verbose=True + ) + + +if __name__ == "__main__": + main() diff --git a/cache_model.py b/cache_model.py new file mode 100755 index 0000000..8414c54 --- /dev/null +++ b/cache_model.py @@ -0,0 +1,151 @@ +#!/usr/bin/env python3 +""" +Model Caching Script for LLM Benchmarking + +This script downloads and caches the Qwen3-4B model from HuggingFace +before running benchmarks on offline compute nodes. +""" + +import argparse +import os +import sys +from pathlib import Path + +def cache_model(model_name: str, cache_dir: str, force: bool = False): + """ + Download and cache a HuggingFace model. + + Args: + model_name: HuggingFace model identifier (e.g., "Qwen/Qwen3-4B-Instruct-2507") + cache_dir: Local directory to cache the model + force: Force re-download even if model exists + """ + try: + from transformers import AutoTokenizer, AutoModelForCausalLM, AutoConfig + except ImportError: + print("Error: transformers library not found. Please install it:") + print(" pip install transformers") + sys.exit(1) + + # Create cache directory + cache_path = Path(cache_dir).resolve() + cache_path.mkdir(parents=True, exist_ok=True) + + print(f"Caching model: {model_name}") + print(f"Cache directory: {cache_path}") + print("-" * 60) + + # Set HuggingFace cache directory + os.environ['HF_HOME'] = str(cache_path) + + # Check if model already exists + model_path = cache_path / model_name.replace("/", "--") + if model_path.exists() and not force: + print(f"Model already cached at: {model_path}") + print("Use --force to re-download") + return str(cache_path) + + try: + # Download config + print("\n[1/3] Downloading model config...") + config = AutoConfig.from_pretrained( + model_name, + cache_dir=cache_path, + trust_remote_code=True + ) + print(f" ✓ Config downloaded") + print(f" - Model type: {config.model_type}") + print(f" - Hidden size: {config.hidden_size}") + print(f" - Num layers: {config.num_hidden_layers}") + print(f" - Num attention heads: {config.num_attention_heads}") + + # Download tokenizer + print("\n[2/3] Downloading tokenizer...") + tokenizer = AutoTokenizer.from_pretrained( + model_name, + cache_dir=cache_path, + trust_remote_code=True + ) + print(f" ✓ Tokenizer downloaded") + print(f" - Vocab size: {len(tokenizer)}") + print(f" - Model max length: {tokenizer.model_max_length}") + + # Download model weights + print("\n[3/3] Downloading model weights...") + print(" (This may take several minutes depending on connection speed)") + model = AutoModelForCausalLM.from_pretrained( + model_name, + cache_dir=cache_path, + trust_remote_code=True, + torch_dtype="auto", + low_cpu_mem_usage=True + ) + print(f" ✓ Model weights downloaded") + + # Calculate total parameters + total_params = sum(p.numel() for p in model.parameters()) + print(f" - Total parameters: {total_params:,} ({total_params/1e9:.2f}B)") + + # Clean up model from memory + del model + + print("\n" + "=" * 60) + print("✓ Model successfully cached!") + print("=" * 60) + print(f"\nCache location: {cache_path}") + print(f"\nTo use in benchmarks, set:") + print(f" --model-path {cache_path}") + print(f"\nOr set 
environment variable:") + print(f" export HF_HOME={cache_path}") + + return str(cache_path) + + except Exception as e: + print(f"\n✗ Error downloading model: {e}", file=sys.stderr) + sys.exit(1) + + +def main(): + parser = argparse.ArgumentParser( + description="Cache HuggingFace model for offline use", + formatter_class=argparse.RawDescriptionHelpFormatter, + epilog=""" +Examples: + # Cache model to default location + python cache_model.py + + # Cache model to custom directory + python cache_model.py --cache-dir /path/to/cache + + # Force re-download + python cache_model.py --force + """ + ) + + parser.add_argument( + "--model-name", + type=str, + default="Qwen/Qwen3-4B", + help="HuggingFace model identifier (default: Qwen/Qwen3-4B)" + ) + + parser.add_argument( + "--cache-dir", + type=str, + default="./model_cache", + help="Directory to cache model (default: ./model_cache in current directory)" + ) + + parser.add_argument( + "--force", + action="store_true", + help="Force re-download even if model exists" + ) + + args = parser.parse_args() + + cache_model(args.model_name, args.cache_dir, args.force) + + +if __name__ == "__main__": + main() diff --git a/configs/a100.yaml b/configs/a100.yaml new file mode 100644 index 0000000..bc96a5a --- /dev/null +++ b/configs/a100.yaml @@ -0,0 +1,26 @@ +# A100 Configuration +gpu_type: a100 +gpu_model: "NVIDIA A100 80GB" + +# Default attention implementation +default_attention: flash_attention_2 + +# Pretraining defaults +pretrain: + batch_size: 8 + sequence_length: 8192 + num_steps: 10 + warmup_steps: 3 + +# Inference defaults +inference: + num_requests: 10 + prompt_length: 512 + generation_length: 100 + warmup_requests: 2 + +# Hardware specs (for reference) +hardware: + memory_gb: 80 + tdp_watts: 400 + compute_capability: "8.0" diff --git a/configs/h100.yaml b/configs/h100.yaml new file mode 100644 index 0000000..b3fde83 --- /dev/null +++ b/configs/h100.yaml @@ -0,0 +1,26 @@ +# H100 Configuration +gpu_type: h100 +gpu_model: "NVIDIA H100 80GB" + +# Default attention implementation +default_attention: flash_attention_3_hopper + +# Pretraining defaults +pretrain: + batch_size: 8 + sequence_length: 8192 + num_steps: 10 + warmup_steps: 3 + +# Inference defaults +inference: + num_requests: 10 + prompt_length: 512 + generation_length: 100 + warmup_requests: 2 + +# Hardware specs (for reference) +hardware: + memory_gb: 80 + tdp_watts: 700 + compute_capability: "9.0" diff --git a/configs/h200.yaml b/configs/h200.yaml new file mode 100644 index 0000000..88e00a2 --- /dev/null +++ b/configs/h200.yaml @@ -0,0 +1,26 @@ +# H200 Configuration +gpu_type: h200 +gpu_model: "NVIDIA H200 141GB" + +# Default attention implementation +default_attention: flash_attention_3_hopper + +# Pretraining defaults +pretrain: + batch_size: 8 + sequence_length: 8192 + num_steps: 10 + warmup_steps: 3 + +# Inference defaults +inference: + num_requests: 10 + prompt_length: 512 + generation_length: 100 + warmup_requests: 2 + +# Hardware specs (for reference) +hardware: + memory_gb: 141 + tdp_watts: 700 + compute_capability: "9.0" diff --git a/configs/mi300x.yaml b/configs/mi300x.yaml new file mode 100644 index 0000000..4002591 --- /dev/null +++ b/configs/mi300x.yaml @@ -0,0 +1,26 @@ +# MI300X Configuration +gpu_type: mi300x +gpu_model: "AMD Instinct MI300X" + +# Default attention implementation +default_attention: flash_attention_2 + +# Pretraining defaults +pretrain: + batch_size: 8 + sequence_length: 8192 + num_steps: 10 + warmup_steps: 3 + +# Inference defaults +inference: + num_requests: 
10 + prompt_length: 512 + generation_length: 100 + warmup_requests: 2 + +# Hardware specs (for reference) +hardware: + memory_gb: 192 + tdp_watts: 750 + compute_capability: "gfx940" diff --git a/quick_start.sh b/quick_start.sh new file mode 100755 index 0000000..e960d2a --- /dev/null +++ b/quick_start.sh @@ -0,0 +1,122 @@ +#!/bin/bash +# Quick Start Script for LLM Benchmark Suite +# +# This script helps you get started quickly with the benchmark suite. +# It will: +# 1. Check dependencies +# 2. Cache the model if needed +# 3. Run a quick test benchmark +# +# Usage: ./quick_start.sh [--skip-cache] + +set -e # Exit on error + +echo "=========================================" +echo "LLM Benchmark Suite - Quick Start" +echo "=========================================" + +# Parse arguments +SKIP_CACHE=false +if [[ "$1" == "--skip-cache" ]]; then + SKIP_CACHE=true +fi + +# Check Python +echo "" +echo "[1/5] Checking Python..." +if ! command -v python &> /dev/null; then + echo "✗ Python not found. Please install Python 3.8+" + exit 1 +fi +PYTHON_VERSION=$(python --version 2>&1 | awk '{print $2}') +echo " ✓ Python $PYTHON_VERSION found" + +# Check dependencies +echo "" +echo "[2/5] Checking dependencies..." +MISSING_DEPS=() + +if ! python -c "import torch" 2>/dev/null; then + MISSING_DEPS+=("torch") +fi + +if ! python -c "import transformers" 2>/dev/null; then + MISSING_DEPS+=("transformers") +fi + +if ${#MISSING_DEPS[@]} -gt 0; then + echo " ⚠ Missing dependencies: ${MISSING_DEPS[*]}" + echo " Installing dependencies..." + pip install -r requirements.txt +else + echo " ✓ All dependencies installed" +fi + +# Check GPU +echo "" +echo "[3/5] Checking GPU..." +if python -c "import torch; assert torch.cuda.is_available()" 2>/dev/null; then + GPU_NAME=$(python -c "import torch; print(torch.cuda.get_device_name(0))") + echo " ✓ GPU found: $GPU_NAME" +else + echo " ✗ No GPU found or CUDA not available" + echo " This benchmark requires a GPU to run." + exit 1 +fi + +# Cache model +if [ "$SKIP_CACHE" = false ]; then + echo "" + echo "[4/5] Caching model..." + if [ -d "./model_cache" ] && [ "$(ls -A ./model_cache)" ]; then + echo " ✓ Model cache already exists at ./model_cache" + echo " To re-download, remove the directory and run again." + else + echo " Downloading Qwen/Qwen3-4B..." + echo " (This may take several minutes depending on your connection)" + python cache_model.py --model-name Qwen/Qwen3-4B --cache-dir ./model_cache + fi +else + echo "" + echo "[4/5] Skipping model cache (--skip-cache specified)" +fi + +# Run quick test +echo "" +echo "[5/5] Running quick test benchmark..." +echo " This will run a minimal benchmark to verify everything works." +echo " Parameters: 2 steps, batch size 2, sequence length 512" +echo "" + +python run_benchmark.py \ + --mode both \ + --model-path ./model_cache \ + --model-name Qwen/Qwen3-4B \ + --batch-size 2 \ + --sequence-length 512 \ + --num-steps 2 \ + --num-requests 2 \ + --prompt-length 256 \ + --generation-length 20 \ + --output-dir ./results/test + +echo "" +echo "=========================================" +echo "Quick Start Complete!" +echo "=========================================" +echo "" +echo "Next steps:" +echo " 1. Run full benchmarks:" +echo " python run_benchmark.py --mode both" +echo "" +echo " 2. Run on different GPUs using SLURM:" +echo " sbatch slurm_a100.sh" +echo " sbatch slurm_h100.sh" +echo " sbatch slurm_h200.sh" +echo " sbatch slurm_mi300x.sh" +echo "" +echo " 3. 
View results:" +echo " ls -l results/" +echo "" +echo "For more information, see README.md" +echo "" diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 0000000..2c6eede --- /dev/null +++ b/requirements.txt @@ -0,0 +1,22 @@ +# LLM Benchmark Suite - Requirements + +# Core dependencies +torch>=2.0.0 +transformers>=4.35.0 +accelerate>=0.24.0 +tokenizers>=0.14.0 + +# Attention implementations +flash-attn>=2.0.0 + +# GPU monitoring +pynvml>=11.5.0 # NVIDIA GPU monitoring +pyrsmi>=1.0.0 # AMD GPU monitoring + +# Utilities +numpy>=1.24.0 +pyyaml>=6.0 +tqdm>=4.65.0 + +# Optional: for better performance +triton>=2.0.0 diff --git a/results/a100/inference_NVIDIA_A100-SXM4-80GB_flash_attention_2.json b/results/a100/inference_NVIDIA_A100-SXM4-80GB_flash_attention_2.json new file mode 100644 index 0000000..075e4a3 --- /dev/null +++ b/results/a100/inference_NVIDIA_A100-SXM4-80GB_flash_attention_2.json @@ -0,0 +1,37 @@ +{ + "model_name": "Qwen/Qwen3-4B", + "gpu_name": "NVIDIA A100-SXM4-80GB", + "attention_implementation": "flash_attention_2", + "num_requests": 10, + "prompt_length": 512, + "generation_length": 100, + "prefill": { + "stage_name": "prefill", + "duration_ms": 475.62581300735474, + "tokens_processed": 5120, + "tokens_per_second": 10764.76477932628, + "energy_joules": 21.409000039100647, + "energy_per_token": 0.004181445320136845, + "avg_power_watts": 68.91171083870925, + "peak_memory_gb": 45.87115478515625, + "avg_gpu_util_percent": 38.1 + }, + "decode": { + "stage_name": "decode", + "duration_ms": 41460.768724791706, + "tokens_processed": 1000, + "tokens_per_second": 24.119186179055195, + "energy_joules": 4684.697999954224, + "energy_per_token": 4.684697999954223, + "avg_power_watts": 112.85507087682042, + "peak_memory_gb": 45.87115478515625, + "avg_gpu_util_percent": 38.1 + }, + "e2e_latency_ms": 4193.639453779906, + "e2e_tokens_per_second": 145.93529242204605, + "e2e_energy_joules": 4706.106999993324, + "e2e_energy_per_token": 0.768971732025053, + "ttft_ms": 47.562581300735474, + "itl_ms": 41.460768724791706, + "timestamp": 1768519487.5402663 +} \ No newline at end of file diff --git a/results/a100/pretrain_NVIDIA_A100-SXM4-80GB_flash_attention_2.json b/results/a100/pretrain_NVIDIA_A100-SXM4-80GB_flash_attention_2.json new file mode 100644 index 0000000..0abc5ff --- /dev/null +++ b/results/a100/pretrain_NVIDIA_A100-SXM4-80GB_flash_attention_2.json @@ -0,0 +1,47 @@ +{ + "model_name": "Qwen/Qwen3-4B", + "gpu_name": "NVIDIA A100-SXM4-80GB", + "attention_implementation": "flash_attention_2", + "batch_size": 3, + "sequence_length": 2048, + "num_steps": 10, + "forward": { + "stage_name": "forward", + "duration_ms": 3359.0412912890315, + "tokens_processed": 61440, + "tokens_per_second": 18290.933237210196, + "energy_joules": 1292.2280000448227, + "energy_per_token": 0.021032356771562868, + "avg_power_watts": 387.19580415542595, + "peak_memory_gb": 79.66021728515625, + "avg_gpu_util_percent": 97.8 + }, + "backward": { + "stage_name": "backward", + "duration_ms": 6954.944152384996, + "tokens_processed": 61440, + "tokens_per_second": 8834.003358449821, + "energy_joules": 2729.588000059128, + "energy_per_token": 0.0444268880217957, + "avg_power_watts": 394.24766095856324, + "peak_memory_gb": 79.66021728515625, + "avg_gpu_util_percent": 97.8 + }, + "optimizer": { + "stage_name": "optimizer", + "duration_ms": 1153.845101594925, + "tokens_processed": 61440, + "tokens_per_second": 53248.048559614595, + "energy_joules": 362.6529998779297, + "energy_per_token": 0.005902555336554845, 
+ "avg_power_watts": 299.1223537953503, + "peak_memory_gb": 79.66021728515625, + "avg_gpu_util_percent": 97.8 + }, + "total_duration_ms": 11467.830545268953, + "total_tokens": 61440, + "total_tokens_per_second": 5357.595733340081, + "total_energy_joules": 4384.46899998188, + "total_energy_per_token": 0.07136180012991342, + "timestamp": 1768519431.5985208 +} \ No newline at end of file diff --git a/results/h100/inference_NVIDIA_H100_flash_attention_3_hopper.json b/results/h100/inference_NVIDIA_H100_flash_attention_3_hopper.json new file mode 100644 index 0000000..eba7208 --- /dev/null +++ b/results/h100/inference_NVIDIA_H100_flash_attention_3_hopper.json @@ -0,0 +1,37 @@ +{ + "model_name": "Qwen/Qwen3-4B", + "gpu_name": "NVIDIA H100", + "attention_implementation": "flash_attention_3_hopper", + "num_requests": 10, + "prompt_length": 512, + "generation_length": 100, + "prefill": { + "stage_name": "prefill", + "duration_ms": 323.99015384726226, + "tokens_processed": 5120, + "tokens_per_second": 15802.949377324925, + "energy_joules": 17.092000007629395, + "energy_per_token": 0.0033382812514901163, + "avg_power_watts": 93.64442380045372, + "peak_memory_gb": 46.02825927734375, + "avg_gpu_util_percent": 40.0 + }, + "decode": { + "stage_name": "decode", + "duration_ms": 30513.75844143331, + "tokens_processed": 1000, + "tokens_per_second": 32.772101867403634, + "energy_joules": 4915.5139999985695, + "energy_per_token": 4.915513999998569, + "avg_power_watts": 161.199160874206, + "peak_memory_gb": 46.02825927734375, + "avg_gpu_util_percent": 40.0 + }, + "e2e_latency_ms": 3083.7748595280573, + "e2e_tokens_per_second": 198.4580677506596, + "e2e_energy_joules": 4932.606000006199, + "e2e_energy_per_token": 0.8059813725500325, + "ttft_ms": 32.399015384726226, + "itl_ms": 30.51375844143331, + "timestamp": 1768541839.3186588 +} \ No newline at end of file diff --git a/results/h100/pretrain_NVIDIA_H100_flash_attention_3_hopper.json b/results/h100/pretrain_NVIDIA_H100_flash_attention_3_hopper.json new file mode 100644 index 0000000..426b1f1 --- /dev/null +++ b/results/h100/pretrain_NVIDIA_H100_flash_attention_3_hopper.json @@ -0,0 +1,47 @@ +{ + "model_name": "Qwen/Qwen3-4B", + "gpu_name": "NVIDIA H100", + "attention_implementation": "flash_attention_3_hopper", + "batch_size": 3, + "sequence_length": 2048, + "num_steps": 10, + "forward": { + "stage_name": "forward", + "duration_ms": 1748.5067250672728, + "tokens_processed": 61440, + "tokens_per_second": 35138.55515633555, + "energy_joules": 946.9269999563694, + "energy_per_token": 0.015412223306581534, + "avg_power_watts": 501.76439870614394, + "peak_memory_gb": 76.45208740234375, + "avg_gpu_util_percent": 97.0 + }, + "backward": { + "stage_name": "backward", + "duration_ms": 3761.718863155693, + "tokens_processed": 61440, + "tokens_per_second": 16332.959010248362, + "energy_joules": 1904.104000031948, + "energy_per_token": 0.030991276042186655, + "avg_power_watts": 491.250130606127, + "peak_memory_gb": 76.45208740234375, + "avg_gpu_util_percent": 97.0 + }, + "optimizer": { + "stage_name": "optimizer", + "duration_ms": 896.0564862936735, + "tokens_processed": 61440, + "tokens_per_second": 68567.1059133025, + "energy_joules": 349.722000002861, + "energy_per_token": 0.0056920898437965665, + "avg_power_watts": 356.92130879075387, + "peak_memory_gb": 76.45208740234375, + "avg_gpu_util_percent": 97.0 + }, + "total_duration_ms": 6406.282074516639, + "total_tokens": 61440, + "total_tokens_per_second": 9590.586128637759, + "total_energy_joules": 3200.7529999911785, + 
"total_energy_per_token": 0.052095589192564754, + "timestamp": 1768541796.4011748 +} \ No newline at end of file diff --git a/results/h100_sdpa/inference_NVIDIA_H100_sdpa.json b/results/h100_sdpa/inference_NVIDIA_H100_sdpa.json new file mode 100644 index 0000000..53d8cc2 --- /dev/null +++ b/results/h100_sdpa/inference_NVIDIA_H100_sdpa.json @@ -0,0 +1,37 @@ +{ + "model_name": "Qwen/Qwen3-4B", + "gpu_name": "NVIDIA H100", + "attention_implementation": "sdpa", + "num_requests": 10, + "prompt_length": 512, + "generation_length": 100, + "prefill": { + "stage_name": "prefill", + "duration_ms": 253.97859653458, + "tokens_processed": 5120, + "tokens_per_second": 20159.179040517676, + "energy_joules": 0.0, + "energy_per_token": 0.0, + "avg_power_watts": 0.0, + "peak_memory_gb": 46.01458740234375, + "avg_gpu_util_percent": 48.8 + }, + "decode": { + "stage_name": "decode", + "duration_ms": 23519.252635538578, + "tokens_processed": 1000, + "tokens_per_second": 42.51835785330007, + "energy_joules": 4544.901999980211, + "energy_per_token": 4.544901999980211, + "avg_power_watts": 192.5432634001641, + "peak_memory_gb": 46.01458740234375, + "avg_gpu_util_percent": 48.8 + }, + "e2e_latency_ms": 2377.323123207316, + "e2e_tokens_per_second": 257.43240118504923, + "e2e_energy_joules": 4544.901999980211, + "e2e_energy_per_token": 0.7426310457484006, + "ttft_ms": 25.397859653458, + "itl_ms": 23.519252635538578, + "timestamp": 1769149269.5228984 +} \ No newline at end of file diff --git a/results/h100_sdpa/pretrain_NVIDIA_H100_sdpa.json b/results/h100_sdpa/pretrain_NVIDIA_H100_sdpa.json new file mode 100644 index 0000000..3d6dc64 --- /dev/null +++ b/results/h100_sdpa/pretrain_NVIDIA_H100_sdpa.json @@ -0,0 +1,47 @@ +{ + "model_name": "Qwen/Qwen3-4B", + "gpu_name": "NVIDIA H100", + "attention_implementation": "sdpa", + "batch_size": 3, + "sequence_length": 2048, + "num_steps": 10, + "forward": { + "stage_name": "forward", + "duration_ms": 1790.2467511594296, + "tokens_processed": 61440, + "tokens_per_second": 34319.29143857359, + "energy_joules": 981.029000043869, + "energy_per_token": 0.01596726888092235, + "avg_power_watts": 520.9058508009567, + "peak_memory_gb": 76.45208740234375, + "avg_gpu_util_percent": 100.0 + }, + "backward": { + "stage_name": "backward", + "duration_ms": 3854.5540031045675, + "tokens_processed": 61440, + "tokens_per_second": 15939.587290906931, + "energy_joules": 1953.71099999547, + "energy_per_token": 0.03179868164055127, + "avg_power_watts": 491.5443624439596, + "peak_memory_gb": 76.45208740234375, + "avg_gpu_util_percent": 100.0 + }, + "optimizer": { + "stage_name": "optimizer", + "duration_ms": 899.9840868636966, + "tokens_processed": 61440, + "tokens_per_second": 68267.87372886644, + "energy_joules": 365.9209999740124, + "energy_per_token": 0.005955745442285358, + "avg_power_watts": 377.8756124501158, + "peak_memory_gb": 76.45208740234375, + "avg_gpu_util_percent": 100.0 + }, + "total_duration_ms": 6544.784841127694, + "total_tokens": 61440, + "total_tokens_per_second": 9387.627170553957, + "total_energy_joules": 3300.6610000133514, + "total_energy_per_token": 0.053721695963758975, + "timestamp": 1769149234.99943 +} \ No newline at end of file diff --git a/results/h200/inference_NVIDIA_H200_flash_attention_3_hopper.json b/results/h200/inference_NVIDIA_H200_flash_attention_3_hopper.json new file mode 100644 index 0000000..213313b --- /dev/null +++ b/results/h200/inference_NVIDIA_H200_flash_attention_3_hopper.json @@ -0,0 +1,37 @@ +{ + "model_name": "Qwen/Qwen3-4B", + "gpu_name": 
"NVIDIA H200", + "attention_implementation": "flash_attention_3_hopper", + "num_requests": 10, + "prompt_length": 512, + "generation_length": 100, + "prefill": { + "stage_name": "prefill", + "duration_ms": 323.8773119999223, + "tokens_processed": 5120, + "tokens_per_second": 15808.455270868828, + "energy_joules": 98.1449999999968, + "energy_per_token": 0.019168945312499373, + "avg_power_watts": 250.96736239598317, + "peak_memory_gb": 46.1302490234375, + "avg_gpu_util_percent": 32.2 + }, + "decode": { + "stage_name": "decode", + "duration_ms": 30558.618001000013, + "tokens_processed": 1000, + "tokens_per_second": 32.72399294913388, + "energy_joules": 4828.459999999999, + "energy_per_token": 4.828459999999999, + "avg_power_watts": 157.61927190444868, + "peak_memory_gb": 46.1302490234375, + "avg_gpu_util_percent": 32.2 + }, + "e2e_latency_ms": 3088.2495312999936, + "e2e_tokens_per_second": 198.17051497855476, + "e2e_energy_joules": 4926.604999999996, + "e2e_energy_per_token": 0.8050008169934634, + "ttft_ms": 32.38773119999223, + "itl_ms": 30.558618001000013, + "timestamp": 1768541964.4743361 +} \ No newline at end of file diff --git a/results/h200/pretrain_NVIDIA_H200_flash_attention_3_hopper.json b/results/h200/pretrain_NVIDIA_H200_flash_attention_3_hopper.json new file mode 100644 index 0000000..9bd001f --- /dev/null +++ b/results/h200/pretrain_NVIDIA_H200_flash_attention_3_hopper.json @@ -0,0 +1,47 @@ +{ + "model_name": "Qwen/Qwen3-4B", + "gpu_name": "NVIDIA H200", + "attention_implementation": "flash_attention_3_hopper", + "batch_size": 3, + "sequence_length": 2048, + "num_steps": 10, + "forward": { + "stage_name": "forward", + "duration_ms": 1605.9521619997668, + "tokens_processed": 61440, + "tokens_per_second": 38257.67756587068, + "energy_joules": 817.7539999999863, + "energy_per_token": 0.01330979817708311, + "avg_power_watts": 476.6091506406698, + "peak_memory_gb": 76.5540771484375, + "avg_gpu_util_percent": 95.1 + }, + "backward": { + "stage_name": "backward", + "duration_ms": 3448.8081949999696, + "tokens_processed": 61440, + "tokens_per_second": 17814.849804948502, + "energy_joules": 1765.182000000008, + "energy_per_token": 0.02873017578125013, + "avg_power_watts": 498.84691252245983, + "peak_memory_gb": 76.5540771484375, + "avg_gpu_util_percent": 95.1 + }, + "optimizer": { + "stage_name": "optimizer", + "duration_ms": 545.701982000196, + "tokens_processed": 61440, + "tokens_per_second": 112588.92587268984, + "energy_joules": 332.4770000000135, + "energy_per_token": 0.005411409505208553, + "avg_power_watts": 521.4900438388863, + "peak_memory_gb": 76.5540771484375, + "avg_gpu_util_percent": 95.1 + }, + "total_duration_ms": 5600.462338999932, + "total_tokens": 61440, + "total_tokens_per_second": 10970.522839186035, + "total_energy_joules": 2915.4130000000077, + "total_energy_per_token": 0.047451383463541795, + "timestamp": 1768541921.6000674 +} \ No newline at end of file diff --git a/results/h200_sdpa/inference_NVIDIA_H200_sdpa.json b/results/h200_sdpa/inference_NVIDIA_H200_sdpa.json new file mode 100644 index 0000000..b07b0a9 --- /dev/null +++ b/results/h200_sdpa/inference_NVIDIA_H200_sdpa.json @@ -0,0 +1,37 @@ +{ + "model_name": "Qwen/Qwen3-4B", + "gpu_name": "NVIDIA H200", + "attention_implementation": "sdpa", + "num_requests": 10, + "prompt_length": 512, + "generation_length": 100, + "prefill": { + "stage_name": "prefill", + "duration_ms": 247.9969559935853, + "tokens_processed": 5120, + "tokens_per_second": 20645.414696672466, + "energy_joules": 73.83399999141693, + 
"energy_per_token": 0.014420703123323619, + "avg_power_watts": 222.33737204549297, + "peak_memory_gb": 46.1165771484375, + "avg_gpu_util_percent": 40.0 + }, + "decode": { + "stage_name": "decode", + "duration_ms": 23003.622506046668, + "tokens_processed": 1000, + "tokens_per_second": 43.47141411041425, + "energy_joules": 4033.3500000089407, + "energy_per_token": 4.033350000008941, + "avg_power_watts": 174.6335604209662, + "peak_memory_gb": 46.1165771484375, + "avg_gpu_util_percent": 40.0 + }, + "e2e_latency_ms": 2325.1619462040253, + "e2e_tokens_per_second": 263.20747292425324, + "e2e_energy_joules": 4107.184000000358, + "e2e_energy_per_token": 0.6711084967320846, + "ttft_ms": 24.79969559935853, + "itl_ms": 23.003622506046668, + "timestamp": 1769149520.7919798 +} \ No newline at end of file diff --git a/results/h200_sdpa/pretrain_NVIDIA_H200_sdpa.json b/results/h200_sdpa/pretrain_NVIDIA_H200_sdpa.json new file mode 100644 index 0000000..87ae460 --- /dev/null +++ b/results/h200_sdpa/pretrain_NVIDIA_H200_sdpa.json @@ -0,0 +1,47 @@ +{ + "model_name": "Qwen/Qwen3-4B", + "gpu_name": "NVIDIA H200", + "attention_implementation": "sdpa", + "batch_size": 3, + "sequence_length": 2048, + "num_steps": 10, + "forward": { + "stage_name": "forward", + "duration_ms": 1615.8598741167225, + "tokens_processed": 61440, + "tokens_per_second": 38023.09902248482, + "energy_joules": 873.9250000119209, + "energy_per_token": 0.014224039713735693, + "avg_power_watts": 541.9081076256928, + "peak_memory_gb": 76.5540771484375, + "avg_gpu_util_percent": 100.0 + }, + "backward": { + "stage_name": "backward", + "duration_ms": 3462.180594098754, + "tokens_processed": 61440, + "tokens_per_second": 17746.04135460864, + "energy_joules": 1696.024000003934, + "energy_per_token": 0.027604557291730693, + "avg_power_watts": 472.8399628680292, + "peak_memory_gb": 76.5540771484375, + "avg_gpu_util_percent": 100.0 + }, + "optimizer": { + "stage_name": "optimizer", + "duration_ms": 551.849422918167, + "tokens_processed": 61440, + "tokens_per_second": 111334.71821915968, + "energy_joules": 316.88299998641014, + "energy_per_token": 0.005157600911237144, + "avg_power_watts": 499.2301039455484, + "peak_memory_gb": 76.5540771484375, + "avg_gpu_util_percent": 100.0 + }, + "total_duration_ms": 5629.889891133644, + "total_tokens": 61440, + "total_tokens_per_second": 10913.179687005982, + "total_energy_joules": 2886.832000002265, + "total_energy_per_token": 0.04698619791670353, + "timestamp": 1769149487.0005488 +} \ No newline at end of file diff --git a/run_benchmark.py b/run_benchmark.py new file mode 100755 index 0000000..9752e60 --- /dev/null +++ b/run_benchmark.py @@ -0,0 +1,248 @@ +#!/usr/bin/env python3 +""" +Main LLM Benchmark Runner + +Orchestrates pretraining and inference benchmarks with auto-detection +of GPU type and configuration. 
+""" + +import argparse +import sys +from pathlib import Path + +# Import benchmark functions +import benchmark_pretrain +import benchmark_inference + +from utils.gpu_monitor import get_gpu_monitor, list_available_gpus +from utils.metrics import MetricsReporter + + +def main(): + parser = argparse.ArgumentParser( + description="LLM Benchmark Suite - Compare GPU performance for pretraining and inference", + formatter_class=argparse.RawDescriptionHelpFormatter, + epilog=""" +Examples: + # Run both pretrain and inference benchmarks + python run_benchmark.py --mode both + + # Run only pretraining benchmark + python run_benchmark.py --mode pretrain --num-steps 20 + + # Run inference with custom settings + python run_benchmark.py --mode inference --num-requests 20 --generation-length 200 + + # Use specific attention implementation + python run_benchmark.py --attn-implementation flash_attention_3_hopper + """ + ) + + # Model configuration + parser.add_argument( + "--model-path", + type=str, + default="./model_cache", + help="Path to cached model directory" + ) + + parser.add_argument( + "--model-name", + type=str, + default="Qwen/Qwen3-4B", + help="Model name for reporting" + ) + + # Benchmark mode + parser.add_argument( + "--mode", + type=str, + default="both", + choices=["pretrain", "inference", "both"], + help="Benchmark mode to run" + ) + + # Attention configuration + parser.add_argument( + "--attn-implementation", + type=str, + default="auto", + choices=["auto", "flash_attention_2", "flash_attention_3_hopper", "sdpa", "eager"], + help="Attention implementation (auto selects based on GPU)" + ) + + # Pretraining parameters + pretrain_group = parser.add_argument_group("pretraining parameters") + pretrain_group.add_argument( + "--batch-size", + type=int, + default=3, + help="Batch size for pretraining" + ) + pretrain_group.add_argument( + "--sequence-length", + type=int, + default=2048, + help="Sequence length for pretraining" + ) + pretrain_group.add_argument( + "--num-steps", + type=int, + default=10, + help="Number of training steps" + ) + pretrain_group.add_argument( + "--warmup-steps", + type=int, + default=3, + help="Number of warmup steps" + ) + + # Inference parameters + inference_group = parser.add_argument_group("inference parameters") + inference_group.add_argument( + "--num-requests", + type=int, + default=10, + help="Number of inference requests" + ) + inference_group.add_argument( + "--prompt-length", + type=int, + default=512, + help="Prompt length in tokens" + ) + inference_group.add_argument( + "--generation-length", + type=int, + default=100, + help="Number of tokens to generate" + ) + inference_group.add_argument( + "--warmup-requests", + type=int, + default=2, + help="Number of warmup requests" + ) + + # General parameters + parser.add_argument( + "--device-id", + type=int, + default=0, + help="GPU device ID" + ) + parser.add_argument( + "--output-dir", + type=str, + default="./results", + help="Output directory for results" + ) + parser.add_argument( + "--list-gpus", + action="store_true", + help="List available GPUs and exit" + ) + + args = parser.parse_args() + + # List GPUs if requested + if args.list_gpus: + print("Available GPUs:") + gpus = list_available_gpus() + if not gpus: + print(" No GPUs found!") + else: + for gpu in gpus: + print(f" {gpu}") + return + + # Print header + print("=" * 80) + print("LLM BENCHMARK SUITE") + print("=" * 80) + print(f"\nModel: {args.model_name}") + print(f"Model Path: {args.model_path}") + print(f"Mode: {args.mode}") + 
print(f"Attention: {args.attn_implementation}") + print(f"Output Directory: {args.output_dir}") + + # Detect GPU + print("\nDetecting GPU...") + try: + monitor = get_gpu_monitor(args.device_id) + gpu_name = monitor.get_device_name() + print(f" GPU {args.device_id}: {gpu_name}") + monitor.cleanup() + except Exception as e: + print(f"✗ Error detecting GPU: {e}") + sys.exit(1) + + # Create output directory + output_path = Path(args.output_dir) + output_path.mkdir(parents=True, exist_ok=True) + + # Run benchmarks + pretrain_metrics = None + inference_metrics = None + + if args.mode in ["pretrain", "both"]: + print("\n" + "=" * 80) + print("Running Pretraining Benchmark...") + print("=" * 80) + + pretrain_metrics = benchmark_pretrain.benchmark_pretrain( + model_name_or_path=args.model_name, + attn_implementation=args.attn_implementation, + batch_size=args.batch_size, + sequence_length=args.sequence_length, + num_steps=args.num_steps, + warmup_steps=args.warmup_steps, + device="cuda", + device_id=args.device_id, + output_dir=args.output_dir, + verbose=True + ) + + if args.mode in ["inference", "both"]: + print("\n" + "=" * 80) + print("Running Inference Benchmark...") + print("=" * 80) + + inference_metrics = benchmark_inference.benchmark_inference( + model_name_or_path=args.model_name, + attn_implementation=args.attn_implementation, + num_requests=args.num_requests, + prompt_length=args.prompt_length, + generation_length=args.generation_length, + warmup_requests=args.warmup_requests, + device="cuda", + device_id=args.device_id, + output_dir=args.output_dir, + verbose=True + ) + + # Summary + print("\n" + "=" * 80) + print("BENCHMARK COMPLETE") + print("=" * 80) + print(f"\nResults saved to: {output_path}") + + if pretrain_metrics: + print(f"\nPretraining:") + print(f" Duration: {pretrain_metrics.total_duration_ms:.2f} ms") + print(f" Throughput: {pretrain_metrics.total_tokens_per_second:.2f} tokens/s") + print(f" Energy: {pretrain_metrics.total_energy_joules:.2f} J") + print(f" Energy/token: {pretrain_metrics.total_energy_per_token*1000:.4f} mJ/token") + + if inference_metrics: + print(f"\nInference:") + print(f" TTFT: {inference_metrics.ttft_ms:.2f} ms") + print(f" ITL: {inference_metrics.itl_ms:.2f} ms/token") + print(f" Throughput: {inference_metrics.e2e_tokens_per_second:.2f} tokens/s") + print(f" Energy: {inference_metrics.e2e_energy_joules:.2f} J") + print(f" Energy/token: {inference_metrics.e2e_energy_per_token*1000:.4f} mJ/token") + + +if __name__ == "__main__": + main() diff --git a/slurm_a100.sh b/slurm_a100.sh new file mode 100755 index 0000000..5f6b6df --- /dev/null +++ b/slurm_a100.sh @@ -0,0 +1,45 @@ +#!/bin/bash +#SBATCH --job-name=llm_bench_a100 +#SBATCH --partition=a100 # Adjust to your A100 partition name +#SBATCH --nodes=1 +#SBATCH --gres=gpu:a100:1 # Request 1 A100 GPU +#SBATCH -C a100_80 +#SBATCH --time=02:00:00 +#SBATCH --output=logs/benchmark_a100_sdpa_%j.out +#SBATCH --error=logs/benchmark_a100_sdpa_%j.err + +# Create logs directory +mkdir -p logs + +# Print job info +echo "=========================================" +echo "Job ID: $SLURM_JOB_ID" +echo "Job Name: $SLURM_JOB_NAME" +echo "Node: $SLURM_NODELIST" +echo "Date: $(date)" +echo "=========================================" + +# Set cache paths +export TRANSFORMERS_CACHE=$(pwd)/model_cache +export HF_HOME=$(pwd)/model_cache + +# Path to apptainer image +APPTAINER_IMAGE="/anvme/workspace/ihpc125h-llm-profiles/pytorch_25.10_updated_ao.sif" + +# Run benchmark inside apptainer +apptainer exec --nv $APPTAINER_IMAGE 
python run_benchmark.py \ + --mode both \ + --model-path ./model_cache \ + --model-name Qwen/Qwen3-4B \ + --attn-implementation sdpa \ + --batch-size 3 \ + --sequence-length 2048 \ + --num-steps 10 \ + --num-requests 10 \ + --prompt-length 512 \ + --generation-length 100 \ + --output-dir ./results/a100 + +echo "=========================================" +echo "Benchmark Complete!" +echo "=========================================" diff --git a/slurm_h100.sh b/slurm_h100.sh new file mode 100755 index 0000000..1421429 --- /dev/null +++ b/slurm_h100.sh @@ -0,0 +1,46 @@ +#!/bin/bash +#SBATCH --job-name=llm_bench_h100 +#SBATCH --partition=h100 # Adjust to your H100 partition name +#SBATCH --nodes=1 +#SBATCH --gres=gpu:h100:1 # Request 1 H100 GPU +#SBATCH --time=02:00:00 +#SBATCH --output=logs/benchmark_h100_%j.out +#SBATCH --error=logs/benchmark_h100_%j.err + +# Create logs directory +mkdir -p logs + +# Print job info +echo "=========================================" +echo "Job ID: $SLURM_JOB_ID" +echo "Job Name: $SLURM_JOB_NAME" +echo "Node: $SLURM_NODELIST" +echo "Date: $(date)" +echo "=========================================" + +# Set cache paths +export TRANSFORMERS_CACHE=$(pwd)/model_cache +export HF_HOME=$(pwd)/model_cache + +# Path to apptainer image +APPTAINER_IMAGE="/hnvme/workspace/ihpc125h-llm-profiler/pytorch_25.10_updated_ao.sif" + +# Run benchmark with FlashAttention-3 Hopper inside apptainer +apptainer exec --nv $APPTAINER_IMAGE python run_benchmark.py \ + --mode both \ + --model-path ./model_cache \ + --model-name Qwen/Qwen3-4B \ + --attn-implementation sdpa \ + --batch-size 3 \ + --sequence-length 2048 \ + --num-steps 10 \ + --num-requests 10 \ + --prompt-length 512 \ + --generation-length 100 \ + --output-dir ./results/h100_sdpa + +# --attn-implementation flash_attention_3_hopper \ + +echo "=========================================" +echo "Benchmark Complete!" +echo "=========================================" diff --git a/slurm_h200.sh b/slurm_h200.sh new file mode 100755 index 0000000..d44ea39 --- /dev/null +++ b/slurm_h200.sh @@ -0,0 +1,45 @@ +#!/bin/bash +#SBATCH --job-name=llm_bench_h200 +#SBATCH --partition=h200 # Adjust to your H200 partition name +#SBATCH --nodes=1 +#SBATCH --gres=gpu:h200:1 # Request 1 H200 GPU +#SBATCH --time=02:00:00 +#SBATCH --output=logs/benchmark_h200_%j.out +#SBATCH --error=logs/benchmark_h200_%j.err + +# Create logs directory +mkdir -p logs + +# Print job info +echo "=========================================" +echo "Job ID: $SLURM_JOB_ID" +echo "Job Name: $SLURM_JOB_NAME" +echo "Node: $SLURM_NODELIST" +echo "Date: $(date)" +echo "=========================================" + +# Set cache paths +export TRANSFORMERS_CACHE=$(pwd)/model_cache +export HF_HOME=$(pwd)/model_cache + +# Path to apptainer image +APPTAINER_IMAGE="/hnvme/workspace/ihpc125h-llm-profiler/pytorch_25.10_updated_ao.sif" + +# Run benchmark with FlashAttention-3 Hopper inside apptainer +apptainer exec --nv $APPTAINER_IMAGE python run_benchmark.py \ + --mode both \ + --model-path ./model_cache \ + --model-name Qwen/Qwen3-4B \ + --attn-implementation sdpa \ + --batch-size 3 \ + --sequence-length 2048 \ + --num-steps 10 \ + --num-requests 10 \ + --prompt-length 512 \ + --generation-length 100 \ + --output-dir ./results/h200_sdpa + # --attn-implementation flash_attention_3_hopper \ + +echo "=========================================" +echo "Benchmark Complete!" 
+echo "=========================================" diff --git a/slurm_mi300x.sh b/slurm_mi300x.sh new file mode 100755 index 0000000..7d4cd54 --- /dev/null +++ b/slurm_mi300x.sh @@ -0,0 +1,42 @@ +#!/bin/bash +#SBATCH --job-name=llm_bench_mi300x +#SBATCH --nodes=1 +#SBATCH -w=aquavan1 # Request MI300X GPUs +#SBATCH --time=02:00:00 +#SBATCH --output=logs/benchmark_mi300x_%j.out +#SBATCH --error=logs/benchmark_mi300x_%j.err + +# Create logs directory +mkdir -p logs + +# Print job info +echo "=========================================" +echo "Job ID: $SLURM_JOB_ID" +echo "Job Name: $SLURM_JOB_NAME" +echo "Node: $SLURM_NODELIST" +echo "Date: $(date)" +echo "=========================================" + +# Set cache paths +export TRANSFORMERS_CACHE=$(pwd)/models +export HF_HOME=$(pwd)/models + +# Path to apptainer image +#APPTAINER_IMAGE="/home/woody/ihpc/ihpc125h/pytorch_25.10_updated_ao.sif" + +apptainer exec --writable ../rocm_sandbox/ python run_benchmark.py \ + --mode both \ + --model-path ./model_cache \ + --model-name Qwen/Qwen3-4B \ + --attn-implementation sdpa \ + --batch-size 3 \ + --sequence-length 2048 \ + --num-steps 10 \ + --num-requests 10 \ + --prompt-length 512 \ + --generation-length 100 \ + --output-dir ./results/mi300x_sdpa + +echo "=========================================" +echo "Benchmark Complete!" +echo "=========================================" diff --git a/utils/__init__.py b/utils/__init__.py new file mode 100644 index 0000000..f6372b8 --- /dev/null +++ b/utils/__init__.py @@ -0,0 +1,3 @@ +"""Utility package for LLM benchmarking.""" + +__version__ = "1.0.0" diff --git a/utils/attention.py b/utils/attention.py new file mode 100644 index 0000000..efb7308 --- /dev/null +++ b/utils/attention.py @@ -0,0 +1,295 @@ +""" +Attention Implementation Helpers for LLM Benchmarking + +Provides functions for configuring different attention implementations +based on GPU type. +""" + +from typing import Optional +import warnings + + +def get_default_attention(gpu_name: str) -> str: + """ + Get default attention implementation for GPU type. + + Args: + gpu_name: GPU device name (from monitoring) + + Returns: + Attention implementation string + """ + gpu_lower = gpu_name.lower() + + # H100/H200: FlashAttention-3 Hopper + if 'h100' in gpu_lower or 'h200' in gpu_lower: + return "flash_attention_3_hopper" + + # A100, MI300X, other: FlashAttention-2 + return "flash_attention_2" + + +def configure_model_attention(model, attn_implementation: str, verbose: bool = True): + """ + Configure model to use specified attention implementation. + + This function patches the model if needed to use the specified attention. + For standard implementations like flash_attention_2, the model should already + be loaded with the correct implementation via AutoModelForCausalLM.from_pretrained(). + + For FlashAttention-3 Hopper, this patches the model's attention modules. + + Args: + model: The loaded model + attn_implementation: Attention implementation to use + verbose: Print configuration messages + + Returns: + Configured model + """ + if verbose: + print(f"Configuring attention: {attn_implementation}") + + if attn_implementation == "flash_attention_3_hopper": + # Patch model to use FlashAttention-3 Hopper + try: + import flash_attn_interface + except ImportError: + raise ImportError( + "flash_attn_interface not found. This is required for FlashAttention-3.\n" + "Install with appropriate method for your system." 
+ ) + + # Patch the model's attention function + _patch_fa3_hopper(model, verbose=verbose) + + elif attn_implementation == "flash_attention_2": + # Model should already be loaded with FA2 + if verbose: + print(" Using FlashAttention-2 (configured during model loading)") + + elif attn_implementation == "sdpa": + # PyTorch Scaled Dot Product Attention + if verbose: + print(" Using PyTorch SDPA") + + elif attn_implementation == "eager": + # Standard PyTorch attention + if verbose: + print(" Using eager attention") + + else: + warnings.warn(f"Unknown attention implementation: {attn_implementation}") + + return model + + +def _patch_fa3_hopper(model, verbose: bool = True): + """ + Patch model to use FlashAttention-3 Hopper. + + This replaces the attention computation in the model's attention layers + with calls to flash_attn_interface.flash_attn_func(). + + Args: + model: The model to patch + verbose: Print patching messages + """ + import flash_attn_interface + import torch + + # Counter for patched modules + num_patched = 0 + + # Iterate through all modules in the model + for name, module in model.named_modules(): + # Look for attention modules (this will vary by model architecture) + # Common names: "self_attn", "attn", "attention" + if any(attn_name in name.lower() for attn_name in ['self_attn', 'attention']): + # Check if module has a forward method we can patch + if hasattr(module, 'forward'): + # Save original forward + original_forward = module.forward + + # Create patched forward function + def create_patched_forward(orig_forward): + def patched_forward(hidden_states, *args, **kwargs): + # Check if this is an attention computation + # For Qwen models, attention modules typically have q, k, v projections + if hasattr(module, 'q_proj') and hasattr(module, 'k_proj') and hasattr(module, 'v_proj'): + # Extract batch, seq_len, hidden_dim + batch_size, seq_len, hidden_dim = hidden_states.shape + + # Compute Q, K, V + q = module.q_proj(hidden_states) + k = module.k_proj(hidden_states) + v = module.v_proj(hidden_states) + + # Reshape for multi-head attention + num_heads = module.num_heads + head_dim = hidden_dim // num_heads + + q = q.view(batch_size, seq_len, num_heads, head_dim) + k = k.view(batch_size, seq_len, num_heads, head_dim) + v = v.view(batch_size, seq_len, num_heads, head_dim) + + # Call FlashAttention-3 + # Note: flash_attn_func expects (batch, seqlen, nheads, headdim) + attn_output = flash_attn_interface.flash_attn_func( + q, k, v, + dropout_p=0.0, + softmax_scale=None, # Will use default 1/sqrt(head_dim) + causal=True, # For causal LM + ) + + # Reshape back + attn_output = attn_output.view(batch_size, seq_len, hidden_dim) + + # Apply output projection if it exists + if hasattr(module, 'o_proj'): + attn_output = module.o_proj(attn_output) + + return (attn_output,) + (None,) * (len(orig_forward(hidden_states, *args, **kwargs)) - 1) + + else: + # Not an attention module we can patch, use original + return orig_forward(hidden_states, *args, **kwargs) + + return patched_forward + + # Apply patch + module.forward = create_patched_forward(original_forward) + num_patched += 1 + + if verbose: + if num_patched > 0: + print(f" ✓ Patched {num_patched} attention modules to use FlashAttention-3 Hopper") + else: + warnings.warn(" ⚠ No attention modules found to patch for FlashAttention-3") + + +def get_attention_info(attn_implementation: str) -> dict: + """ + Get information about an attention implementation. 
+ + Args: + attn_implementation: Attention implementation string + + Returns: + Dictionary with info about the implementation + """ + info = { + "flash_attention_2": { + "name": "FlashAttention-2", + "description": "Optimized attention for A100 and other GPUs", + "gpu_support": ["A100", "MI300X", "V100", "RTX"], + "memory_efficient": True, + "requires_cuda": True, + }, + "flash_attention_3_hopper": { + "name": "FlashAttention-3 Hopper", + "description": "Optimized attention for H100/H200 Hopper architecture", + "gpu_support": ["H100", "H200"], + "memory_efficient": True, + "requires_cuda": True, + }, + "sdpa": { + "name": "PyTorch SDPA", + "description": "PyTorch Scaled Dot Product Attention", + "gpu_support": ["All"], + "memory_efficient": True, + "requires_cuda": False, + }, + "eager": { + "name": "Eager Attention", + "description": "Standard PyTorch attention implementation", + "gpu_support": ["All"], + "memory_efficient": False, + "requires_cuda": False, + }, + } + + return info.get(attn_implementation, { + "name": attn_implementation, + "description": "Unknown attention implementation", + "gpu_support": ["Unknown"], + "memory_efficient": False, + "requires_cuda": False, + }) + + +def validate_attention_for_gpu(attn_implementation: str, gpu_name: str) -> tuple[bool, Optional[str]]: + """ + Validate if attention implementation is suitable for GPU. + + Args: + attn_implementation: Attention implementation + gpu_name: GPU device name + + Returns: + Tuple of (is_valid, warning_message) + """ + gpu_lower = gpu_name.lower() + + # FlashAttention-3 Hopper validation + if attn_implementation == "flash_attention_3_hopper": + if 'h100' not in gpu_lower and 'h200' not in gpu_lower: + return False, ( + f"FlashAttention-3 Hopper is optimized for H100/H200. " + f"Current GPU: {gpu_name}. Consider using flash_attention_2 instead." + ) + + # FlashAttention-2 on Hopper GPUs + if attn_implementation == "flash_attention_2": + if 'h100' in gpu_lower or 'h200' in gpu_lower: + return True, ( + f"FlashAttention-2 will work on {gpu_name}, but FlashAttention-3 Hopper " + f"may provide better performance." 
+ ) + + return True, None + + +if __name__ == "__main__": + """Test attention configuration.""" + print("=" * 60) + print("Attention Implementation Test") + print("=" * 60) + + # Test getting default attention for different GPUs + test_gpus = [ + "NVIDIA A100 80GB", + "NVIDIA H100 80GB", + "NVIDIA H200 141GB", + "AMD Instinct MI300X", + ] + + print("\nDefault attention implementations:") + for gpu in test_gpus: + attn = get_default_attention(gpu) + print(f" {gpu:30s} → {attn}") + + # Test validation + print("\nValidation tests:") + test_cases = [ + ("flash_attention_3_hopper", "NVIDIA H100 80GB"), + ("flash_attention_3_hopper", "NVIDIA A100 80GB"), + ("flash_attention_2", "NVIDIA H100 80GB"), + ("flash_attention_2", "NVIDIA A100 80GB"), + ] + + for attn, gpu in test_cases: + valid, warning = validate_attention_for_gpu(attn, gpu) + status = "✓" if valid else "✗" + print(f" {status} {attn:30s} on {gpu:25s}") + if warning: + print(f" ⚠ {warning}") + + # Test getting info + print("\nAttention implementation info:") + for attn in ["flash_attention_2", "flash_attention_3_hopper", "sdpa"]: + info = get_attention_info(attn) + print(f"\n {info['name']}:") + print(f" Description: {info['description']}") + print(f" GPU Support: {', '.join(info['gpu_support'])}") + print(f" Memory Efficient: {info['memory_efficient']}") diff --git a/utils/gpu_monitor.py b/utils/gpu_monitor.py new file mode 100644 index 0000000..8600f98 --- /dev/null +++ b/utils/gpu_monitor.py @@ -0,0 +1,562 @@ +""" +GPU Monitoring Infrastructure for LLM Benchmarking + +Provides unified interface for monitoring both NVIDIA and AMD GPUs. +""" + +import time +from abc import ABC, abstractmethod +from dataclasses import dataclass +from typing import Optional, List +import warnings + + +@dataclass +class GPUMetrics: + """Container for GPU metrics.""" + timestamp: float + power_watts: float + gpu_utilization_percent: float + memory_used_gb: float + memory_total_gb: float + temperature_celsius: Optional[float] = None + energy_joules: Optional[float] = None # Cumulative energy + + +class GPUMonitor(ABC): + """Abstract base class for GPU monitoring.""" + + def __init__(self, device_id: int = 0): + """ + Initialize GPU monitor. + + Args: + device_id: GPU device ID to monitor + """ + self.device_id = device_id + self.start_time = None + self.start_energy = None + self.last_metrics = None + + @abstractmethod + def get_metrics(self) -> GPUMetrics: + """Get current GPU metrics.""" + pass + + @abstractmethod + def get_device_name(self) -> str: + """Get GPU device name.""" + pass + + @abstractmethod + def cleanup(self): + """Cleanup resources.""" + pass + + def start_monitoring(self): + """Start energy monitoring session.""" + self.start_time = time.time() + metrics = self.get_metrics() + self.start_energy = metrics.energy_joules if metrics.energy_joules is not None else 0.0 + self.last_metrics = metrics + + def get_energy_consumed(self) -> float: + """ + Get energy consumed since start_monitoring() was called. 
+ + Returns: + Energy in Joules + """ + if self.start_time is None: + raise RuntimeError("Must call start_monitoring() first") + + current_metrics = self.get_metrics() + + if current_metrics.energy_joules is not None: + # If GPU provides cumulative energy, use it + return current_metrics.energy_joules - self.start_energy + else: + # Otherwise, integrate power over time + elapsed_time = time.time() - self.start_time + # Use average of start and current power + avg_power = (self.last_metrics.power_watts + current_metrics.power_watts) / 2.0 + return avg_power * elapsed_time + + def get_average_power(self) -> float: + """ + Get average power consumption since start_monitoring(). + + Returns: + Average power in Watts + """ + if self.start_time is None: + raise RuntimeError("Must call start_monitoring() first") + + elapsed_time = time.time() - self.start_time + if elapsed_time == 0: + return 0.0 + + energy = self.get_energy_consumed() + return energy / elapsed_time + + +class NVIDIAMonitor(GPUMonitor): + """NVIDIA GPU monitor using pynvml.""" + + def __init__(self, device_id: int = 0): + """Initialize NVIDIA monitor.""" + try: + import pynvml + self.pynvml = pynvml + except ImportError: + raise ImportError( + "pynvml not found. Install with: pip install pynvml" + ) + + try: + self.pynvml.nvmlInit() + self.handle = self.pynvml.nvmlDeviceGetHandleByIndex(device_id) + except Exception as e: + raise RuntimeError(f"Failed to initialize NVIDIA GPU {device_id}: {e}") + + super().__init__(device_id) + + def get_metrics(self) -> GPUMetrics: + """Get current NVIDIA GPU metrics.""" + try: + # Power (in milliwatts) + power_mw = self.pynvml.nvmlDeviceGetPowerUsage(self.handle) + power_watts = power_mw / 1000.0 + + # Utilization + util = self.pynvml.nvmlDeviceGetUtilizationRates(self.handle) + gpu_util = util.gpu + + # Memory + mem_info = self.pynvml.nvmlDeviceGetMemoryInfo(self.handle) + memory_used_gb = mem_info.used / (1024**3) + memory_total_gb = mem_info.total / (1024**3) + + # Temperature + try: + temp = self.pynvml.nvmlDeviceGetTemperature( + self.handle, + self.pynvml.NVML_TEMPERATURE_GPU + ) + except: + temp = None + + # Try to get cumulative energy (newer GPUs) + energy_joules = None + try: + energy_mj = self.pynvml.nvmlDeviceGetTotalEnergyConsumption(self.handle) + energy_joules = energy_mj / 1000.0 + except: + # Not supported on this GPU, will use power integration + pass + + return GPUMetrics( + timestamp=time.time(), + power_watts=power_watts, + gpu_utilization_percent=gpu_util, + memory_used_gb=memory_used_gb, + memory_total_gb=memory_total_gb, + temperature_celsius=temp, + energy_joules=energy_joules + ) + except Exception as e: + raise RuntimeError(f"Failed to get NVIDIA GPU metrics: {e}") + + def get_device_name(self) -> str: + """Get NVIDIA GPU device name.""" + try: + name = self.pynvml.nvmlDeviceGetName(self.handle) + if isinstance(name, bytes): + name = name.decode('utf-8') + return name + except: + return f"NVIDIA GPU {self.device_id}" + + def cleanup(self): + """Cleanup NVIDIA resources.""" + try: + self.pynvml.nvmlShutdown() + except: + pass + + +class AMDMonitor(GPUMonitor): + """AMD GPU monitor using rocm-smi command line tool.""" + + def __init__(self, device_id: int = 0): + """Initialize AMD monitor.""" + import subprocess + import shutil + + # Check if rocm-smi is available + if shutil.which('rocm-smi') is None: + raise RuntimeError("rocm-smi command not found. 
Make sure ROCm is installed and in PATH.") + + self.device_id = device_id + + # Verify device exists + try: + result = subprocess.run( + ['rocm-smi', '--showid'], + capture_output=True, + text=True, + timeout=5 + ) + if result.returncode != 0: + raise RuntimeError(f"rocm-smi failed: {result.stderr}") + except subprocess.TimeoutExpired: + raise RuntimeError("rocm-smi command timed out") + except Exception as e: + raise RuntimeError(f"Failed to initialize AMD GPU {device_id}: {e}") + + super().__init__(device_id) + + def _parse_detailed_output(self, output: str) -> dict: + """Parse rocm-smi detailed output format.""" + lines = output.strip().split('\n') + + # Parse detailed format: GPU[X] : Metric : Value + metrics = { + 'temperature': None, + 'power': None, + 'vram_percent': None, + 'gpu_percent': None, + } + + device_prefix = f"GPU[{self.device_id}]" + + for line in lines: + if not line.strip() or not line.startswith(device_prefix): + continue + + # Split by colon + parts = line.split(':') + if len(parts) < 3: + continue + + metric_name = parts[1].strip().lower() + value_str = parts[2].strip() + + try: + # Temperature (Sensor junction) + if 'temperature' in metric_name and 'junction' in metric_name: + metrics['temperature'] = float(value_str) + + # Power consumption + elif 'power' in metric_name and 'package' in metric_name: + metrics['power'] = float(value_str) + + # GPU utilization + elif 'gpu use' in metric_name: + metrics['gpu_percent'] = float(value_str) + + # VRAM usage percentage + elif 'memory allocated' in metric_name and 'vram%' in metric_name: + metrics['vram_percent'] = float(value_str) + + except (ValueError, IndexError): + continue + + # Validate we got the required metrics + if metrics['temperature'] is None: + raise ValueError(f"Could not find temperature for GPU[{self.device_id}]") + if metrics['power'] is None: + raise ValueError(f"Could not find power for GPU[{self.device_id}]") + if metrics['gpu_percent'] is None: + metrics['gpu_percent'] = 0.0 + if metrics['vram_percent'] is None: + metrics['vram_percent'] = 0.0 + + return metrics + + def _get_memory_info(self) -> tuple: + """Get memory usage in GB using rocm-smi --showmeminfo.""" + import subprocess + + try: + result = subprocess.run( + ['rocm-smi', '--showmeminfo', 'vram', '-d', str(self.device_id)], + capture_output=True, + text=True, + timeout=5 + ) + + if result.returncode != 0: + return 0.0, 0.0 + + # Parse output for memory info + # Looking for lines like "GPU memory used: X MiB" and "GPU memory total: Y MiB" + used_gb = 0.0 + total_gb = 0.0 + + for line in result.stdout.split('\n'): + if 'Used' in line or 'used' in line: + # Extract number + parts = line.split() + for i, part in enumerate(parts): + if part.replace('.', '').isdigit(): + used_bytes = float(part) + # Check if next part indicates unit + if i + 1 < len(parts): + unit = parts[i + 1].lower() + if 'mb' in unit or 'mib' in unit: + used_gb = used_bytes / 1024 + elif 'gb' in unit or 'gib' in unit: + used_gb = used_bytes + elif 'kb' in unit or 'kib' in unit: + used_gb = used_bytes / (1024 * 1024) + break + + if 'Total' in line or 'total' in line: + parts = line.split() + for i, part in enumerate(parts): + if part.replace('.', '').isdigit(): + total_bytes = float(part) + if i + 1 < len(parts): + unit = parts[i + 1].lower() + if 'mb' in unit or 'mib' in unit: + total_gb = total_bytes / 1024 + elif 'gb' in unit or 'gib' in unit: + total_gb = total_bytes + elif 'kb' in unit or 'kib' in unit: + total_gb = total_bytes / (1024 * 1024) + break + + return used_gb, 
total_gb + + except Exception: + return 0.0, 0.0 + + def get_metrics(self) -> GPUMetrics: + """Get current AMD GPU metrics.""" + import subprocess + + try: + # Get main metrics from concise output + result = subprocess.run( + ['rocm-smi', '--showid', '--showtemp', '--showpower', '--showuse', '--showmemuse'], + capture_output=True, + text=True, + timeout=5 + ) + + if result.returncode != 0: + raise RuntimeError(f"rocm-smi failed: {result.stderr}") + + metrics = self._parse_detailed_output(result.stdout) + + # Get detailed memory info + memory_used_gb, memory_total_gb = self._get_memory_info() + + # If we couldn't get absolute memory, estimate from percentage + if memory_total_gb == 0.0: + # MI300X has ~192GB, MI250X has ~128GB - use a reasonable default + memory_total_gb = 192.0 # Assume MI300X + memory_used_gb = memory_total_gb * (metrics['vram_percent'] / 100.0) + + return GPUMetrics( + timestamp=time.time(), + power_watts=metrics['power'], + gpu_utilization_percent=metrics['gpu_percent'], + memory_used_gb=memory_used_gb, + memory_total_gb=memory_total_gb, + temperature_celsius=metrics['temperature'], + energy_joules=None # Will use power integration + ) + + except subprocess.TimeoutExpired: + raise RuntimeError("rocm-smi command timed out") + except Exception as e: + raise RuntimeError(f"Failed to get AMD GPU metrics: {e}") + + def get_device_name(self) -> str: + """Get AMD GPU device name.""" + import subprocess + + try: + result = subprocess.run( + ['rocm-smi', '--showproductname', '-d', str(self.device_id)], + capture_output=True, + text=True, + timeout=5 + ) + + if result.returncode == 0: + # Parse output to find device name + for line in result.stdout.split('\n'): + if 'Card series' in line or 'Card model' in line or 'name' in line.lower(): + parts = line.split(':') + if len(parts) > 1: + return parts[1].strip() + except Exception: + pass + + return f"AMD GPU {self.device_id}" + + def cleanup(self): + """Cleanup AMD resources.""" + # No cleanup needed for command-line tool + pass + + +def get_gpu_monitor(device_id: int = 0) -> GPUMonitor: + """ + Factory function to automatically detect and create appropriate GPU monitor. + + Args: + device_id: GPU device ID to monitor + + Returns: + GPUMonitor instance (NVIDIAMonitor or AMDMonitor) + + Raises: + RuntimeError: If no supported GPU is found + """ + # Try AMD first (rocm-smi based) as it's more commonly available + try: + return AMDMonitor(device_id) + except: + pass + + # Try NVIDIA if AMD fails + try: + return NVIDIAMonitor(device_id) + except: + pass + + # Try to import torch to detect GPU type as last resort + try: + import torch + if torch.cuda.is_available(): + # Check if it's NVIDIA or AMD + device_name = torch.cuda.get_device_name(device_id).lower() + + if 'nvidia' in device_name or 'tesla' in device_name or 'geforce' in device_name: + return NVIDIAMonitor(device_id) + elif 'amd' in device_name or 'radeon' in device_name or 'mi300' in device_name or 'mi200' in device_name: + return AMDMonitor(device_id) + except: + pass + + raise RuntimeError( + "No supported GPU found. Make sure either ROCm (rocm-smi) or NVIDIA (pynvml) drivers are installed." + ) + + +def list_available_gpus() -> List[str]: + """ + List all available GPUs. 
+ + Returns: + List of GPU names + """ + gpus = [] + + # Try NVIDIA + try: + import pynvml + pynvml.nvmlInit() + device_count = pynvml.nvmlDeviceGetCount() + for i in range(device_count): + handle = pynvml.nvmlDeviceGetHandleByIndex(i) + name = pynvml.nvmlDeviceGetName(handle) + if isinstance(name, bytes): + name = name.decode('utf-8') + gpus.append(f"GPU {i}: {name} (NVIDIA)") + pynvml.nvmlShutdown() + except: + pass + + # Try AMD with rocm-smi + try: + import subprocess + import shutil + + if shutil.which('rocm-smi'): + result = subprocess.run( + ['rocm-smi', '--showid'], + capture_output=True, + text=True, + timeout=5 + ) + if result.returncode == 0: + # Parse device IDs from output + for line in result.stdout.split('\n'): + if not line.strip() or line.startswith('=') or 'Device' in line or 'ROCm' in line: + continue + parts = line.split() + if parts and parts[0].isdigit(): + device_id = int(parts[0]) + # Try to get device name + name_result = subprocess.run( + ['rocm-smi', '--showproductname', '-d', str(device_id)], + capture_output=True, + text=True, + timeout=5 + ) + name = f"AMD GPU" + if name_result.returncode == 0: + for name_line in name_result.stdout.split('\n'): + if 'Card' in name_line or 'name' in name_line.lower(): + parts_name = name_line.split(':') + if len(parts_name) > 1: + name = parts_name[1].strip() + break + gpus.append(f"GPU {device_id}: {name} (AMD)") + except: + pass + + return gpus + + +if __name__ == "__main__": + """Test GPU monitoring.""" + print("=" * 60) + print("GPU Monitoring Test") + print("=" * 60) + + # List available GPUs + print("\nAvailable GPUs:") + gpus = list_available_gpus() + if not gpus: + print(" No GPUs found!") + exit(1) + + for gpu in gpus: + print(f" {gpu}") + + # Test monitoring + print("\nTesting GPU 0 monitoring...") + try: + monitor = get_gpu_monitor(0) + print(f" Device: {monitor.get_device_name()}") + + # Get metrics + metrics = monitor.get_metrics() + print(f"\nCurrent Metrics:") + print(f" Power: {metrics.power_watts:.2f} W") + print(f" GPU Utilization: {metrics.gpu_utilization_percent:.1f}%") + print(f" Memory: {metrics.memory_used_gb:.2f} / {metrics.memory_total_gb:.2f} GB") + if metrics.temperature_celsius: + print(f" Temperature: {metrics.temperature_celsius:.1f}°C") + + # Test energy monitoring + print("\nTesting energy monitoring (5 seconds)...") + monitor.start_monitoring() + time.sleep(5) + energy = monitor.get_energy_consumed() + avg_power = monitor.get_average_power() + print(f" Energy consumed: {energy:.2f} J") + print(f" Average power: {avg_power:.2f} W") + + monitor.cleanup() + print("\n✓ Monitoring test successful!") + + except Exception as e: + print(f"\n✗ Error: {e}") + exit(1) diff --git a/utils/metrics.py b/utils/metrics.py new file mode 100644 index 0000000..d1b7ca5 --- /dev/null +++ b/utils/metrics.py @@ -0,0 +1,473 @@ +""" +Metrics Collection and Reporting for LLM Benchmarking + +Provides centralized metrics collection, aggregation, and reporting. 
+""" + +import json +import csv +from dataclasses import dataclass, asdict, field +from typing import Dict, List, Optional, Any +from pathlib import Path +import time + + +@dataclass +class StageMetrics: + """Metrics for a specific stage (e.g., forward pass, prefill, etc.).""" + stage_name: str + duration_ms: float + tokens_processed: int + tokens_per_second: float + energy_joules: float + energy_per_token: float + avg_power_watts: float + peak_memory_gb: float + avg_gpu_util_percent: float + + def to_dict(self) -> Dict[str, Any]: + """Convert to dictionary.""" + return asdict(self) + + +@dataclass +class PretrainMetrics: + """Metrics for pretraining benchmark.""" + model_name: str + gpu_name: str + attention_implementation: str + batch_size: int + sequence_length: int + num_steps: int + + # Stage-specific metrics + forward: StageMetrics + backward: StageMetrics + optimizer: StageMetrics + + # Overall metrics + total_duration_ms: float + total_tokens: int + total_tokens_per_second: float + total_energy_joules: float + total_energy_per_token: float + + timestamp: float = field(default_factory=time.time) + + def to_dict(self) -> Dict[str, Any]: + """Convert to dictionary.""" + return { + "model_name": self.model_name, + "gpu_name": self.gpu_name, + "attention_implementation": self.attention_implementation, + "batch_size": self.batch_size, + "sequence_length": self.sequence_length, + "num_steps": self.num_steps, + "forward": self.forward.to_dict(), + "backward": self.backward.to_dict(), + "optimizer": self.optimizer.to_dict(), + "total_duration_ms": self.total_duration_ms, + "total_tokens": self.total_tokens, + "total_tokens_per_second": self.total_tokens_per_second, + "total_energy_joules": self.total_energy_joules, + "total_energy_per_token": self.total_energy_per_token, + "timestamp": self.timestamp, + } + + +@dataclass +class InferenceMetrics: + """Metrics for inference benchmark.""" + model_name: str + gpu_name: str + attention_implementation: str + num_requests: int + prompt_length: int + generation_length: int + + # Stage-specific metrics + prefill: StageMetrics # Time to First Token + decode: StageMetrics # Inter-Token Latency + + # End-to-end metrics + e2e_latency_ms: float + e2e_tokens_per_second: float + e2e_energy_joules: float + e2e_energy_per_token: float + + # Additional metrics + ttft_ms: float # Time to First Token (same as prefill duration) + itl_ms: float # Inter-Token Latency (decode duration / num_tokens) + + timestamp: float = field(default_factory=time.time) + + def to_dict(self) -> Dict[str, Any]: + """Convert to dictionary.""" + return { + "model_name": self.model_name, + "gpu_name": self.gpu_name, + "attention_implementation": self.attention_implementation, + "num_requests": self.num_requests, + "prompt_length": self.prompt_length, + "generation_length": self.generation_length, + "prefill": self.prefill.to_dict(), + "decode": self.decode.to_dict(), + "e2e_latency_ms": self.e2e_latency_ms, + "e2e_tokens_per_second": self.e2e_tokens_per_second, + "e2e_energy_joules": self.e2e_energy_joules, + "e2e_energy_per_token": self.e2e_energy_per_token, + "ttft_ms": self.ttft_ms, + "itl_ms": self.itl_ms, + "timestamp": self.timestamp, + } + + +class MetricsCollector: + """Collects metrics during benchmark runs.""" + + def __init__(self): + """Initialize metrics collector.""" + self.metrics_history: List[Dict[str, Any]] = [] + + def add_pretrain_metrics(self, metrics: PretrainMetrics): + """Add pretraining metrics.""" + self.metrics_history.append({ + "type": "pretrain", + 
"metrics": metrics.to_dict() + }) + + def add_inference_metrics(self, metrics: InferenceMetrics): + """Add inference metrics.""" + self.metrics_history.append({ + "type": "inference", + "metrics": metrics.to_dict() + }) + + def get_all_metrics(self) -> List[Dict[str, Any]]: + """Get all collected metrics.""" + return self.metrics_history + + def clear(self): + """Clear all metrics.""" + self.metrics_history.clear() + + +class MetricsReporter: + """Formats and outputs benchmark results.""" + + @staticmethod + def print_pretrain_metrics(metrics: PretrainMetrics, verbose: bool = True): + """Print pretraining metrics to console.""" + print("\n" + "=" * 80) + print("PRETRAINING BENCHMARK RESULTS") + print("=" * 80) + print(f"\nModel: {metrics.model_name}") + print(f"GPU: {metrics.gpu_name}") + print(f"Attention: {metrics.attention_implementation}") + print(f"Batch Size: {metrics.batch_size}") + print(f"Sequence Length: {metrics.sequence_length}") + print(f"Training Steps: {metrics.num_steps}") + + print("\n" + "-" * 80) + print("STAGE BREAKDOWN") + print("-" * 80) + + # Forward pass + print(f"\n[1] FORWARD PASS") + MetricsReporter._print_stage_metrics(metrics.forward, verbose) + + # Backward pass + print(f"\n[2] BACKWARD PASS") + MetricsReporter._print_stage_metrics(metrics.backward, verbose) + + # Optimizer step + print(f"\n[3] OPTIMIZER STEP") + MetricsReporter._print_stage_metrics(metrics.optimizer, verbose) + + # Overall + print("\n" + "-" * 80) + print("OVERALL METRICS") + print("-" * 80) + print(f" Total Duration: {metrics.total_duration_ms:>10.2f} ms") + print(f" Total Tokens: {metrics.total_tokens:>10,}") + print(f" Throughput: {metrics.total_tokens_per_second:>10.2f} tokens/s") + print(f" Total Energy: {metrics.total_energy_joules:>10.2f} J") + print(f" Energy per Token: {metrics.total_energy_per_token*1000:>10.4f} mJ/token") + print("=" * 80 + "\n") + + @staticmethod + def print_inference_metrics(metrics: InferenceMetrics, verbose: bool = True): + """Print inference metrics to console.""" + print("\n" + "=" * 80) + print("INFERENCE BENCHMARK RESULTS") + print("=" * 80) + print(f"\nModel: {metrics.model_name}") + print(f"GPU: {metrics.gpu_name}") + print(f"Attention: {metrics.attention_implementation}") + print(f"Requests: {metrics.num_requests}") + print(f"Prompt Length: {metrics.prompt_length}") + print(f"Generation Length: {metrics.generation_length}") + + print("\n" + "-" * 80) + print("STAGE BREAKDOWN") + print("-" * 80) + + # Prefill + print(f"\n[1] PREFILL (Time to First Token)") + MetricsReporter._print_stage_metrics(metrics.prefill, verbose) + print(f" TTFT: {metrics.ttft_ms:>10.2f} ms") + + # Decode + print(f"\n[2] DECODE (Inter-Token Latency)") + MetricsReporter._print_stage_metrics(metrics.decode, verbose) + print(f" ITL: {metrics.itl_ms:>10.2f} ms/token") + + # End-to-end + print("\n" + "-" * 80) + print("END-TO-END METRICS") + print("-" * 80) + print(f" Request Latency: {metrics.e2e_latency_ms:>10.2f} ms") + print(f" Throughput: {metrics.e2e_tokens_per_second:>10.2f} tokens/s") + print(f" Total Energy: {metrics.e2e_energy_joules:>10.2f} J") + print(f" Energy per Token: {metrics.e2e_energy_per_token*1000:>10.4f} mJ/token") + print("=" * 80 + "\n") + + @staticmethod + def _print_stage_metrics(stage: StageMetrics, verbose: bool = True): + """Print metrics for a single stage.""" + print(f" Duration: {stage.duration_ms:>10.2f} ms") + print(f" Tokens: {stage.tokens_processed:>10,}") + print(f" Throughput: {stage.tokens_per_second:>10.2f} tokens/s") + print(f" Energy: 
{stage.energy_joules:>10.2f} J") + print(f" Energy per Token: {stage.energy_per_token*1000:>10.4f} mJ/token") + + if verbose: + print(f" Avg Power: {stage.avg_power_watts:>10.2f} W") + print(f" Peak Memory: {stage.peak_memory_gb:>10.2f} GB") + print(f" Avg GPU Utilization: {stage.avg_gpu_util_percent:>10.1f} %") + + @staticmethod + def save_json(metrics: Any, output_path: Path): + """ + Save metrics to JSON file. + + Args: + metrics: PretrainMetrics or InferenceMetrics object + output_path: Path to output JSON file + """ + output_path.parent.mkdir(parents=True, exist_ok=True) + + with open(output_path, 'w') as f: + json.dump(metrics.to_dict(), f, indent=2) + + print(f"Metrics saved to: {output_path}") + + @staticmethod + def save_csv(metrics_list: List[Any], output_path: Path, benchmark_type: str = "pretrain"): + """ + Save multiple metrics to CSV file for comparison. + + Args: + metrics_list: List of PretrainMetrics or InferenceMetrics objects + output_path: Path to output CSV file + benchmark_type: "pretrain" or "inference" + """ + if not metrics_list: + print("No metrics to save") + return + + output_path.parent.mkdir(parents=True, exist_ok=True) + + with open(output_path, 'w', newline='') as f: + if benchmark_type == "pretrain": + MetricsReporter._save_pretrain_csv(metrics_list, f) + else: + MetricsReporter._save_inference_csv(metrics_list, f) + + print(f"CSV saved to: {output_path}") + + @staticmethod + def _save_pretrain_csv(metrics_list: List[PretrainMetrics], file): + """Save pretraining metrics to CSV.""" + fieldnames = [ + 'gpu_name', 'attention_implementation', 'batch_size', 'sequence_length', 'num_steps', + 'forward_duration_ms', 'forward_tokens_per_sec', 'forward_energy_j', 'forward_energy_per_token_mj', + 'backward_duration_ms', 'backward_tokens_per_sec', 'backward_energy_j', 'backward_energy_per_token_mj', + 'optimizer_duration_ms', 'optimizer_tokens_per_sec', 'optimizer_energy_j', 'optimizer_energy_per_token_mj', + 'total_duration_ms', 'total_tokens_per_sec', 'total_energy_j', 'total_energy_per_token_mj', + 'timestamp' + ] + + writer = csv.DictWriter(file, fieldnames=fieldnames) + writer.writeheader() + + for m in metrics_list: + writer.writerow({ + 'gpu_name': m.gpu_name, + 'attention_implementation': m.attention_implementation, + 'batch_size': m.batch_size, + 'sequence_length': m.sequence_length, + 'num_steps': m.num_steps, + 'forward_duration_ms': m.forward.duration_ms, + 'forward_tokens_per_sec': m.forward.tokens_per_second, + 'forward_energy_j': m.forward.energy_joules, + 'forward_energy_per_token_mj': m.forward.energy_per_token * 1000, + 'backward_duration_ms': m.backward.duration_ms, + 'backward_tokens_per_sec': m.backward.tokens_per_second, + 'backward_energy_j': m.backward.energy_joules, + 'backward_energy_per_token_mj': m.backward.energy_per_token * 1000, + 'optimizer_duration_ms': m.optimizer.duration_ms, + 'optimizer_tokens_per_sec': m.optimizer.tokens_per_second, + 'optimizer_energy_j': m.optimizer.energy_joules, + 'optimizer_energy_per_token_mj': m.optimizer.energy_per_token * 1000, + 'total_duration_ms': m.total_duration_ms, + 'total_tokens_per_sec': m.total_tokens_per_second, + 'total_energy_j': m.total_energy_joules, + 'total_energy_per_token_mj': m.total_energy_per_token * 1000, + 'timestamp': m.timestamp, + }) + + @staticmethod + def _save_inference_csv(metrics_list: List[InferenceMetrics], file): + """Save inference metrics to CSV.""" + fieldnames = [ + 'gpu_name', 'attention_implementation', 'num_requests', 'prompt_length', 'generation_length', + 
'prefill_duration_ms', 'prefill_tokens_per_sec', 'prefill_energy_j', 'prefill_energy_per_token_mj', + 'ttft_ms', + 'decode_duration_ms', 'decode_tokens_per_sec', 'decode_energy_j', 'decode_energy_per_token_mj', + 'itl_ms', + 'e2e_latency_ms', 'e2e_tokens_per_sec', 'e2e_energy_j', 'e2e_energy_per_token_mj', + 'timestamp' + ] + + writer = csv.DictWriter(file, fieldnames=fieldnames) + writer.writeheader() + + for m in metrics_list: + writer.writerow({ + 'gpu_name': m.gpu_name, + 'attention_implementation': m.attention_implementation, + 'num_requests': m.num_requests, + 'prompt_length': m.prompt_length, + 'generation_length': m.generation_length, + 'prefill_duration_ms': m.prefill.duration_ms, + 'prefill_tokens_per_sec': m.prefill.tokens_per_second, + 'prefill_energy_j': m.prefill.energy_joules, + 'prefill_energy_per_token_mj': m.prefill.energy_per_token * 1000, + 'ttft_ms': m.ttft_ms, + 'decode_duration_ms': m.decode.duration_ms, + 'decode_tokens_per_sec': m.decode.tokens_per_second, + 'decode_energy_j': m.decode.energy_joules, + 'decode_energy_per_token_mj': m.decode.energy_per_token * 1000, + 'itl_ms': m.itl_ms, + 'e2e_latency_ms': m.e2e_latency_ms, + 'e2e_tokens_per_sec': m.e2e_tokens_per_second, + 'e2e_energy_j': m.e2e_energy_joules, + 'e2e_energy_per_token_mj': m.e2e_energy_per_token * 1000, + 'timestamp': m.timestamp, + }) + + +if __name__ == "__main__": + """Test metrics reporting.""" + # Create sample pretraining metrics + forward = StageMetrics( + stage_name="forward", + duration_ms=100.5, + tokens_processed=1024, + tokens_per_second=10189.3, + energy_joules=25.3, + energy_per_token=0.0247, + avg_power_watts=251.7, + peak_memory_gb=45.2, + avg_gpu_util_percent=95.3 + ) + + backward = StageMetrics( + stage_name="backward", + duration_ms=205.2, + tokens_processed=1024, + tokens_per_second=4991.2, + energy_joules=51.6, + energy_per_token=0.0504, + avg_power_watts=251.5, + peak_memory_gb=48.6, + avg_gpu_util_percent=97.1 + ) + + optimizer = StageMetrics( + stage_name="optimizer", + duration_ms=15.3, + tokens_processed=1024, + tokens_per_second=66928.1, + energy_joules=3.8, + energy_per_token=0.0037, + avg_power_watts=248.4, + peak_memory_gb=48.6, + avg_gpu_util_percent=42.1 + ) + + pretrain_metrics = PretrainMetrics( + model_name="Qwen/Qwen2.5-3B-Instruct", + gpu_name="NVIDIA A100 80GB", + attention_implementation="flash_attention_2", + batch_size=8, + sequence_length=2048, + num_steps=10, + forward=forward, + backward=backward, + optimizer=optimizer, + total_duration_ms=321.0, + total_tokens=10240, + total_tokens_per_second=31900.3, + total_energy_joules=80.7, + total_energy_per_token=0.00788 + ) + + # Print pretrain metrics + MetricsReporter.print_pretrain_metrics(pretrain_metrics) + + # Create sample inference metrics + prefill = StageMetrics( + stage_name="prefill", + duration_ms=45.2, + tokens_processed=512, + tokens_per_second=11327.4, + energy_joules=11.3, + energy_per_token=0.0221, + avg_power_watts=250.0, + peak_memory_gb=42.1, + avg_gpu_util_percent=89.2 + ) + + decode = StageMetrics( + stage_name="decode", + duration_ms=223.5, + tokens_processed=100, + tokens_per_second=447.4, + energy_joules=55.9, + energy_per_token=0.559, + avg_power_watts=250.1, + peak_memory_gb=42.1, + avg_gpu_util_percent=62.3 + ) + + inference_metrics = InferenceMetrics( + model_name="Qwen/Qwen2.5-3B-Instruct", + gpu_name="NVIDIA A100 80GB", + attention_implementation="flash_attention_2", + num_requests=10, + prompt_length=512, + generation_length=100, + prefill=prefill, + decode=decode, + 
e2e_latency_ms=268.7, + e2e_tokens_per_second=2277.9, + e2e_energy_joules=67.2, + e2e_energy_per_token=0.110, + ttft_ms=45.2, + itl_ms=2.235 + ) + + # Print inference metrics + MetricsReporter.print_inference_metrics(inference_metrics)
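+
+    # --- Hedged usage sketch (demo only, not part of a benchmark run) ---
+    # The calls below exercise the persistence helpers defined above on the
+    # same sample objects. The output locations are placeholders invented for
+    # this demo; adjust them to the repository's results layout. This assumes
+    # Path is already imported at module level, as the type hints above imply.
+    demo_dir = Path("results") / "metrics_demo"
+
+    # Single-run JSON dumps (one file per benchmark type); save_json creates
+    # the parent directory if it does not exist yet.
+    MetricsReporter.save_json(pretrain_metrics, demo_dir / "pretrain_sample.json")
+    MetricsReporter.save_json(inference_metrics, demo_dir / "inference_sample.json")
+
+    # Comparison CSVs: save_csv expects a list of runs, so a single run is
+    # wrapped in a one-element list here.
+    MetricsReporter.save_csv([pretrain_metrics], demo_dir / "pretrain_sample.csv",
+                             benchmark_type="pretrain")
+    MetricsReporter.save_csv([inference_metrics], demo_dir / "inference_sample.csv",
+                             benchmark_type="inference")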