Initial commit
This commit is contained in:
408
.gitignore
vendored
Normal file
408
.gitignore
vendored
Normal file
@@ -0,0 +1,408 @@
|
||||
# READ THIS BEFORE YOU REFACTOR ME
|
||||
#
|
||||
# setup.py uses the list of patterns in this file to decide
|
||||
# what to delete, but it's not 100% sound. So, for example,
|
||||
# if you delete aten/build/ because it's redundant with build/,
|
||||
# aten/build/ will stop being cleaned. So be careful when
|
||||
# refactoring this file!
|
||||
|
||||
## Model cache
|
||||
.md
|
||||
model_cache/
|
||||
|
||||
## PyTorch
|
||||
.coverage
|
||||
coverage.xml
|
||||
.dmypy.json
|
||||
.gradle
|
||||
.hypothesis
|
||||
.mypy_cache
|
||||
.additional_ci_files
|
||||
.lintrunner.private.toml
|
||||
/.extracted_scripts/
|
||||
**/.pytorch_specified_test_cases.csv
|
||||
**/.pytorch-disabled-tests.json
|
||||
*/*.pyc
|
||||
*/*.so*
|
||||
*/**/__pycache__
|
||||
*/**/*.dylib*
|
||||
*/**/*.pyc
|
||||
*/**/*.pyd
|
||||
*/**/*.so*
|
||||
*/**/**/*.pyc
|
||||
*/**/**/**/*.pyc
|
||||
*/**/**/**/**/*.pyc
|
||||
aten/build/
|
||||
aten/src/ATen/Config.h
|
||||
aten/src/ATen/cuda/CUDAConfig.h
|
||||
aten/src/ATen/hip/HIPConfig.h
|
||||
benchmarks/.data
|
||||
caffe2/cpp_test/
|
||||
dist/
|
||||
docs/build/
|
||||
docs/cpp/src
|
||||
docs/src/**/*
|
||||
docs/cpp/build
|
||||
docs/cpp/source/api
|
||||
docs/cpp/source/html/
|
||||
docs/cpp/source/latex/
|
||||
docs/source/compile/generated/
|
||||
docs/source/generated/
|
||||
docs/source/compile/generated/
|
||||
log
|
||||
usage_log.txt
|
||||
usage_log*
|
||||
test-reports/
|
||||
test/*.bak
|
||||
test/**/*.bak
|
||||
test/.coverage
|
||||
test/.hypothesis/
|
||||
test/cpp/api/mnist
|
||||
test/custom_operator/model.pt
|
||||
test/debug/
|
||||
test/jit_hooks/*.pt
|
||||
test/data/legacy_modules.t7
|
||||
test/data/*.pt
|
||||
test/forward_backward_compatibility/nightly_schemas.txt
|
||||
dropout_model.pt
|
||||
test/generated_type_hints_smoketest.py
|
||||
test/htmlcov
|
||||
test/cpp_extensions/**/install
|
||||
test/kernel.errors.txt
|
||||
third_party/build/
|
||||
third_party/nccl/
|
||||
tools/coverage_plugins_package/pip-wheel-metadata/
|
||||
tools/shared/_utils_internal.py
|
||||
tools/fast_nvcc/wrap_nvcc.sh
|
||||
tools/fast_nvcc/wrap_nvcc.bat
|
||||
tools/fast_nvcc/tmp/
|
||||
torch.egg-info/
|
||||
torch/_C/__init__.pyi
|
||||
torch/_C/_nn.pyi
|
||||
torch/_C/_VariableFunctions.pyi
|
||||
torch/_VF.pyi
|
||||
torch/return_types.pyi
|
||||
torch/nn/functional.pyi
|
||||
torch/utils/data/datapipes/datapipe.pyi
|
||||
torch/csrc/autograd/generated/*
|
||||
torch/csrc/functionalization/generated/*
|
||||
torch/csrc/lazy/generated/*.[!m]*
|
||||
torch_compile_debug/
|
||||
# Listed manually because some files in this directory are not generated
|
||||
torch/testing/_internal/generated/annotated_fn_args.py
|
||||
torch/testing/_internal/data/*.pt
|
||||
torch/headeronly/version.h
|
||||
torch/csrc/cudnn/cuDNN.cpp
|
||||
torch/csrc/generated
|
||||
torch/csrc/generic/TensorMethods.cpp
|
||||
torch/csrc/inductor/aoti_torch/generated/*.cpp
|
||||
torch/csrc/inductor/aoti_torch/generated/extend/*
|
||||
torch/csrc/jit/generated/*
|
||||
torch/csrc/jit/fuser/config.h
|
||||
torch/csrc/nn/THCUNN.cpp
|
||||
torch/csrc/nn/THCUNN.cwrap
|
||||
torch/bin/
|
||||
torch/cmake/
|
||||
torch/lib/*.a*
|
||||
torch/lib/*.dll*
|
||||
torch/lib/*.exe*
|
||||
torch/lib/*.dylib*
|
||||
torch/lib/*.h
|
||||
torch/lib/*.lib
|
||||
torch/lib/*.pdb
|
||||
torch/lib/*.so*
|
||||
torch/lib/protobuf*.pc
|
||||
torch/lib/build
|
||||
torch/lib/caffe2/
|
||||
torch/lib/cmake
|
||||
torch/lib/include
|
||||
torch/lib/pkgconfig
|
||||
torch/lib/protoc
|
||||
torch/lib/protobuf/
|
||||
torch/lib/tmp_install
|
||||
torch/lib/torch_shm_manager
|
||||
torch/lib/site-packages/
|
||||
torch/lib/python*
|
||||
torch/lib64
|
||||
torch/include/
|
||||
torch/share/
|
||||
torch/test/
|
||||
torch/utils/benchmark/utils/valgrind_wrapper/callgrind.h
|
||||
torch/utils/benchmark/utils/valgrind_wrapper/valgrind.h
|
||||
torch/version.py
|
||||
torch/_inductor/kernel/vendored_templates/*
|
||||
test/inductor/test_tlx*
|
||||
minifier_launcher.py
|
||||
aten/src/ATen/native/transformers/hip/flash_attn/ck/fav_v3/asm_fmha_v3_bwd_configs.hpp
|
||||
aten/src/ATen/native/transformers/hip/flash_attn/ck/fav_v3/mha_bwd.hip
|
||||
aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_fwd_d*
|
||||
aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_bwd_d*
|
||||
aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_bwd_convert*
|
||||
aten/src/ATen/native/transformers/hip/flash_attn/ck/fwd_blob*
|
||||
aten/src/ATen/native/transformers/hip/flash_attn/ck/bwd_blob*
|
||||
aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_fwd_api*
|
||||
aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_bwd_api*
|
||||
# Root level file used in CI to specify certain env configs.
|
||||
# E.g., see .circleci/config.yaml
|
||||
env
|
||||
.circleci/scripts/COMMIT_MSG
|
||||
scripts/release_notes/*.json
|
||||
sccache-stats*.json
|
||||
lint.json
|
||||
merge_record.json
|
||||
.github/scripts/nightly_source_matrix.json
|
||||
|
||||
# These files get copied over on invoking setup.py
|
||||
torchgen/packaged/*
|
||||
!torchgen/packaged/README.md
|
||||
|
||||
# This file is injected by ROCm build scripts to bootstrap in torch/__init__.py.
|
||||
torch/_rocm_init.py
|
||||
|
||||
# IPython notebook checkpoints
|
||||
.ipynb_checkpoints
|
||||
|
||||
# Editor temporaries
|
||||
*.swa
|
||||
*.swb
|
||||
*.swc
|
||||
*.swd
|
||||
*.swe
|
||||
*.swf
|
||||
*.swg
|
||||
*.swh
|
||||
*.swi
|
||||
*.swj
|
||||
*.swk
|
||||
*.swl
|
||||
*.swm
|
||||
*.swn
|
||||
*.swo
|
||||
*.swp
|
||||
*~
|
||||
.~lock.*
|
||||
|
||||
# macOS dir files
|
||||
.DS_Store
|
||||
|
||||
# Ninja files
|
||||
.ninja_deps
|
||||
.ninja_log
|
||||
compile_commands.json
|
||||
*.egg-info/
|
||||
docs/source/scripts/activation_images/
|
||||
docs/source/scripts/quantization_backend_configs/
|
||||
docs/source/scripts/lr_scheduler_images/
|
||||
|
||||
## General
|
||||
|
||||
# Compiled Object files
|
||||
*.slo
|
||||
*.lo
|
||||
*.o
|
||||
*.cuo
|
||||
*.obj
|
||||
|
||||
# Compiled Dynamic libraries
|
||||
*.so
|
||||
*.dylib
|
||||
*.dll
|
||||
|
||||
# Compiled Static libraries
|
||||
*.lai
|
||||
*.la
|
||||
*.a
|
||||
*.lib
|
||||
|
||||
# Compiled protocol buffers
|
||||
*.pb.h
|
||||
*.pb.cc
|
||||
*_pb2.py
|
||||
|
||||
# Compiled python
|
||||
*.pyc
|
||||
*.pyd
|
||||
|
||||
# Compiled MATLAB
|
||||
*.mex*
|
||||
|
||||
# NFS handle files
|
||||
**/.nfs*
|
||||
|
||||
# Sublime Text settings
|
||||
*.sublime-workspace
|
||||
*.sublime-project
|
||||
|
||||
# Eclipse Project settings
|
||||
*.*project
|
||||
.settings
|
||||
|
||||
# QtCreator files
|
||||
*.user
|
||||
|
||||
# PyCharm files
|
||||
.idea
|
||||
|
||||
# GDB history
|
||||
.gdb_history
|
||||
|
||||
## Caffe2
|
||||
|
||||
# build, distribute, and bins (+ python proto bindings)
|
||||
build/
|
||||
# Allow tools/build/ for build support.
|
||||
!tools/build/
|
||||
build_host_protoc
|
||||
build_android
|
||||
build_ios
|
||||
.build_debug/*
|
||||
.build_release/*
|
||||
.build_profile/*
|
||||
distribute/*
|
||||
*.testbin
|
||||
*.bin
|
||||
cmake_build
|
||||
.cmake_build
|
||||
gen
|
||||
.setuptools-cmake-build
|
||||
.pytest_cache
|
||||
aten/build/*
|
||||
|
||||
# Linker scripts for prioritized text optimization
|
||||
cmake/linker_script.ld
|
||||
|
||||
# Bram
|
||||
plsdontbreak
|
||||
|
||||
# Generated documentation
|
||||
docs/_site
|
||||
docs/gathered
|
||||
_site
|
||||
doxygen
|
||||
docs/dev
|
||||
|
||||
# LevelDB files
|
||||
*.sst
|
||||
*.ldb
|
||||
LOCK
|
||||
CURRENT
|
||||
MANIFEST-*
|
||||
|
||||
# generated version file
|
||||
caffe2/version.py
|
||||
|
||||
# setup.py intermediates
|
||||
.eggs
|
||||
caffe2.egg-info
|
||||
MANIFEST
|
||||
|
||||
# Atom/Watchman required file
|
||||
.watchmanconfig
|
||||
.watchman
|
||||
|
||||
# Files generated by CLion
|
||||
cmake-build-debug
|
||||
|
||||
# BEGIN NOT-CLEAN-FILES (setup.py handles this marker. Do not change.)
|
||||
#
|
||||
# Below files are not deleted by "setup.py clean".
|
||||
|
||||
# Downloaded bazel
|
||||
tools/bazel
|
||||
|
||||
# Visual Studio Code files
|
||||
.vs
|
||||
/.vscode/*
|
||||
!/.vscode/extensions.json
|
||||
!/.vscode/settings_recommended.json
|
||||
|
||||
# YouCompleteMe config file
|
||||
.ycm_extra_conf.py
|
||||
|
||||
# Files generated when a patch is rejected
|
||||
*.orig
|
||||
*.rej
|
||||
|
||||
# Files generated by ctags
|
||||
CTAGS
|
||||
GTAGS
|
||||
GRTAGS
|
||||
GSYMS
|
||||
GPATH
|
||||
tags
|
||||
TAGS
|
||||
|
||||
|
||||
# ccls file
|
||||
.ccls-cache/
|
||||
|
||||
# clang tooling storage location
|
||||
.clang-format-bin
|
||||
.clang-tidy-bin
|
||||
.lintbin
|
||||
|
||||
# clangd background index
|
||||
.clangd/
|
||||
.cache/
|
||||
|
||||
# bazel symlinks
|
||||
bazel-*
|
||||
|
||||
# xla repo
|
||||
xla/
|
||||
|
||||
# direnv, posh-direnv
|
||||
.env
|
||||
.envrc
|
||||
.psenvrc
|
||||
|
||||
# generated shellcheck directories
|
||||
.shellcheck_generated*/
|
||||
|
||||
# zip archives
|
||||
*.zip
|
||||
|
||||
# core dump files
|
||||
**/core.[1-9]*
|
||||
|
||||
# Generated if you use the pre-commit script for clang-tidy
|
||||
pr.diff
|
||||
|
||||
# coverage files
|
||||
*/**/.coverage.*
|
||||
|
||||
# buck generated files
|
||||
.buckd/
|
||||
.lsp-buck-out/
|
||||
.lsp.buckd/
|
||||
buck-out/
|
||||
|
||||
# Downloaded libraries
|
||||
third_party/ruy/
|
||||
third_party/glog/
|
||||
|
||||
# Virtualenv
|
||||
.venv/
|
||||
venv/
|
||||
|
||||
# Log files
|
||||
*.log
|
||||
sweep/
|
||||
|
||||
# Android build artifacts
|
||||
android/pytorch_android/.cxx
|
||||
android/pytorch_android_torchvision/.cxx
|
||||
|
||||
# Pyre configs (for internal usage)
|
||||
.pyre_configuration
|
||||
.pyre_configuration.codenav
|
||||
.arcconfig
|
||||
.stable_pyre_client
|
||||
.pyre_client
|
||||
|
||||
# Claude Code local configuration
|
||||
CLAUDE.local.md
|
||||
/test_*.py
|
||||
/debug_*.py
|
||||
CLAUDE_CONTEXT/
|
||||
/.claude/settings.local.json
|
||||
100
AMD_FIX_SUMMARY.md
Normal file
100
AMD_FIX_SUMMARY.md
Normal file
@@ -0,0 +1,100 @@
|
||||
# AMD GPU Monitoring Fix Summary
|
||||
|
||||
## Issue
|
||||
The AMDMonitor class was using incorrect pyrsmi API calls. The implementation attempted to use low-level `rocmsmi` module which has complex initialization and function signatures.
|
||||
|
||||
## Solution
|
||||
Updated to use the correct `rocml` high-level API from pyrsmi, based on the official example at:
|
||||
`/anvme/workspace/ihpc125h-llm-profiles/pyrsmi/examples/llm_monitoring/monitor_llm_inference.py`
|
||||
|
||||
## Changes Made
|
||||
|
||||
### 1. Fixed AMDMonitor Class
|
||||
|
||||
**Before** (incorrect):
|
||||
```python
|
||||
from pyrsmi import rocmsmi
|
||||
ret = self.rocmsmi.rsmi_init(0)
|
||||
power_uw = self.rocmsmi.rsmi_dev_power_ave_get(self.device_id)
|
||||
```
|
||||
|
||||
**After** (correct):
|
||||
```python
|
||||
from pyrsmi import rocml
|
||||
self.rocml.smi_initialize()
|
||||
power_watts = self.rocml.smi_get_device_average_power(self.device_id)
|
||||
```
|
||||
|
||||
**Key API Functions**:
|
||||
- `rocml.smi_initialize()` - Initialize monitoring
|
||||
- `rocml.smi_get_device_average_power(device_id)` - Get power in Watts (not microwatts!)
|
||||
- `rocml.smi_get_device_utilization(device_id)` - Get GPU utilization %
|
||||
- `rocml.smi_get_device_memory_used(device_id)` - Get memory used in bytes
|
||||
- `rocml.smi_get_device_memory_total(device_id)` - Get total memory in bytes
|
||||
- `rocml.smi_get_device_temperature(device_id)` - Get temperature
|
||||
- `rocml.smi_get_device_name(device_id)` - Get device name
|
||||
- `rocml.smi_shutdown()` - Cleanup
|
||||
|
||||
### 2. Updated All SLURM Scripts for Apptainer
|
||||
|
||||
All GPU benchmark scripts now run inside the apptainer container:
|
||||
|
||||
**A100, H100, H200** (NVIDIA):
|
||||
```bash
|
||||
APPTAINER_IMAGE="/anvme/workspace/ihpc125h-llm-profiles/pytorch_25.10_tilelang.sif"
|
||||
apptainer exec --nv $APPTAINER_IMAGE python run_benchmark.py ...
|
||||
```
|
||||
|
||||
**MI300X** (AMD):
|
||||
```bash
|
||||
APPTAINER_IMAGE="/anvme/workspace/ihpc125h-llm-profiles/pytorch_25.10_tilelang.sif"
|
||||
apptainer exec --rocm $APPTAINER_IMAGE python run_benchmark.py ...
|
||||
```
|
||||
|
||||
Note: `--nv` for NVIDIA, `--rocm` for AMD
|
||||
|
||||
### 3. Updated Documentation
|
||||
|
||||
- README.md now mentions apptainer usage
|
||||
- Updated setup instructions to use apptainer for model caching
|
||||
- Added notes about container flags (--nv vs --rocm)
|
||||
|
||||
## Testing
|
||||
|
||||
To verify the AMD monitoring works:
|
||||
|
||||
```bash
|
||||
# Inside apptainer on MI300X node
|
||||
apptainer exec --rocm pytorch_25.10_tilelang.sif python -c "
|
||||
from utils.gpu_monitor import AMDMonitor
|
||||
m = AMDMonitor(0)
|
||||
print(f'GPU: {m.get_device_name()}')
|
||||
metrics = m.get_metrics()
|
||||
print(f'Power: {metrics.power_watts:.2f} W')
|
||||
print(f'Utilization: {metrics.gpu_utilization_percent:.1f}%')
|
||||
print(f'Memory: {metrics.memory_used_gb:.2f} / {metrics.memory_total_gb:.2f} GB')
|
||||
m.cleanup()
|
||||
"
|
||||
```
|
||||
|
||||
## Files Modified
|
||||
|
||||
1. `/anvme/workspace/ihpc125h-llm-profiles/llm-benchmark/utils/gpu_monitor.py` - Fixed AMDMonitor class
|
||||
2. `/anvme/workspace/ihpc125h-llm-profiles/llm-benchmark/slurm_a100.sh` - Added apptainer
|
||||
3. `/anvme/workspace/ihpc125h-llm-profiles/llm-benchmark/slurm_h100.sh` - Added apptainer
|
||||
4. `/anvme/workspace/ihpc125h-llm-profiles/llm-benchmark/slurm_h200.sh` - Added apptainer
|
||||
5. `/anvme/workspace/ihpc125h-llm-profiles/llm-benchmark/slurm_mi300x.sh` - Added apptainer with --rocm
|
||||
6. `/anvme/workspace/ihpc125h-llm-profiles/llm-benchmark/README.md` - Updated documentation
|
||||
|
||||
## Key Differences: rocml vs rocmsmi
|
||||
|
||||
| Feature | rocml (High-level) | rocmsmi (Low-level) |
|
||||
|---------|-------------------|---------------------|
|
||||
| API Style | Simple functions | Complex C-style API |
|
||||
| Initialization | `smi_initialize()` | `rsmi_init(0)` + error codes |
|
||||
| Power | Returns Watts | Returns microwatts |
|
||||
| Memory | Returns bytes | Returns bytes via enums |
|
||||
| Error Handling | Returns -1 on error | Returns error codes |
|
||||
| Ease of Use | Much easier | Complex |
|
||||
|
||||
The `rocml` module is the recommended high-level Python API for pyrsmi.
|
||||
311
README.md
Normal file
311
README.md
Normal file
@@ -0,0 +1,311 @@
|
||||
# LLM Benchmark Suite
|
||||
|
||||
A comprehensive benchmarking suite for comparing LLM performance (Qwen3-4B) across different GPU architectures: **MI300X**, **A100 80G**, **H100**, and **H200**.
|
||||
|
||||
## Features
|
||||
|
||||
- **Pretraining Benchmarks**: Separate metrics for forward, backward, and optimizer stages
|
||||
- **Inference Benchmarks**: Separate metrics for prefill (TTFT) and decode (ITL) stages
|
||||
- **Energy Monitoring**: GPU-specific energy and power measurement
|
||||
- NVIDIA: pynvml
|
||||
- AMD: pyrsmi
|
||||
- **Attention Implementations**:
|
||||
- FlashAttention-2 (A100, MI300X)
|
||||
- FlashAttention-3 Hopper (H100, H200)
|
||||
- Configurable via CLI
|
||||
- **Comprehensive Metrics**:
|
||||
- Tokens per second
|
||||
- Energy per token
|
||||
- Time to First Token (TTFT)
|
||||
- Inter-Token Latency (ITL)
|
||||
- End-to-End Request Latency
|
||||
- GPU utilization and memory usage
|
||||
|
||||
## Directory Structure
|
||||
|
||||
```
|
||||
llm-benchmark/
|
||||
├── cache_model.py # Model caching script
|
||||
├── benchmark_pretrain.py # Pretraining benchmark
|
||||
├── benchmark_inference.py # Inference benchmark
|
||||
├── run_benchmark.py # Main orchestration script
|
||||
├── requirements.txt # Python dependencies
|
||||
├── utils/
|
||||
│ ├── gpu_monitor.py # GPU monitoring (NVIDIA & AMD)
|
||||
│ ├── metrics.py # Metrics collection and reporting
|
||||
│ └── attention.py # Attention implementation helpers
|
||||
├── configs/
|
||||
│ ├── a100.yaml
|
||||
│ ├── h100.yaml
|
||||
│ ├── h200.yaml
|
||||
│ └── mi300x.yaml
|
||||
└── results/ # Benchmark results (JSON)
|
||||
```
|
||||
|
||||
## Setup
|
||||
|
||||
### 1. Container Environment
|
||||
|
||||
All benchmarks should be run inside the apptainer container:
|
||||
|
||||
```bash
|
||||
# Container is located at:
|
||||
/anvme/workspace/ihpc125h-llm-profiles/pytorch_25.10_tilelang.sif
|
||||
```
|
||||
|
||||
### 2. Install Dependencies (if not using apptainer)
|
||||
|
||||
If you want to run directly without apptainer:
|
||||
|
||||
```bash
|
||||
# Install Python dependencies
|
||||
pip install -r requirements.txt
|
||||
|
||||
# For AMD GPUs, ensure ROCm and pyrsmi are installed
|
||||
# For NVIDIA GPUs, ensure CUDA and pynvml are installed
|
||||
```
|
||||
|
||||
### 3. Cache Model (Run on Head Node)
|
||||
|
||||
**IMPORTANT**: Run this on the head node BEFORE allocating compute nodes, as compute nodes are typically offline.
|
||||
|
||||
```bash
|
||||
# Using apptainer (recommended)
|
||||
apptainer exec --nv pytorch_25.10_tilelang.sif python cache_model.py \
|
||||
--model-name Qwen/Qwen3-4B \
|
||||
--cache-dir ./model_cache
|
||||
|
||||
# Or directly (if dependencies installed)
|
||||
python cache_model.py --model-name Qwen/Qwen3-4B --cache-dir ./model_cache
|
||||
```
|
||||
|
||||
The model will be cached to `./model_cache` in the current directory (avoiding slow NFS $HOME).
|
||||
|
||||
## Usage
|
||||
|
||||
### Quick Start
|
||||
|
||||
```bash
|
||||
# Run both pretraining and inference benchmarks
|
||||
python run_benchmark.py --mode both --model-path ./model_cache
|
||||
|
||||
# Run only pretraining
|
||||
python run_benchmark.py --mode pretrain --num-steps 20
|
||||
|
||||
# Run only inference
|
||||
python run_benchmark.py --mode inference --num-requests 20
|
||||
```
|
||||
|
||||
### Detailed Usage
|
||||
|
||||
#### List Available GPUs
|
||||
|
||||
```bash
|
||||
python run_benchmark.py --list-gpus
|
||||
```
|
||||
|
||||
#### Pretraining Benchmark
|
||||
|
||||
```bash
|
||||
python benchmark_pretrain.py \
|
||||
--model-path ./model_cache \
|
||||
--model-name Qwen/Qwen3-4B \
|
||||
--attn-implementation auto \
|
||||
--batch-size 8 \
|
||||
--sequence-length 8192 \
|
||||
--num-steps 10 \
|
||||
--warmup-steps 3 \
|
||||
--output-dir ./results
|
||||
```
|
||||
|
||||
**Metrics Reported** (per stage: forward, backward, optimizer):
|
||||
- Duration (ms)
|
||||
- Tokens processed
|
||||
- Throughput (tokens/s)
|
||||
- Energy (J)
|
||||
- Energy per token (J/token)
|
||||
- Average power (W)
|
||||
- Peak memory (GB)
|
||||
- GPU utilization (%)
|
||||
|
||||
#### Inference Benchmark
|
||||
|
||||
```bash
|
||||
python benchmark_inference.py \
|
||||
--model-path ./model_cache \
|
||||
--model-name Qwen/Qwen3-4B \
|
||||
--attn-implementation auto \
|
||||
--num-requests 10 \
|
||||
--prompt-length 512 \
|
||||
--generation-length 100 \
|
||||
--warmup-requests 2 \
|
||||
--output-dir ./results
|
||||
```
|
||||
|
||||
**Metrics Reported**:
|
||||
- **Prefill**: TTFT, throughput, energy per token
|
||||
- **Decode**: ITL, throughput, energy per token
|
||||
- **End-to-End**: Request latency, total throughput, total energy
|
||||
|
||||
### Attention Implementations
|
||||
|
||||
The benchmark automatically selects the optimal attention implementation based on GPU:
|
||||
- **A100, MI300X**: `flash_attention_2`
|
||||
- **H100, H200**: `flash_attention_3_hopper`
|
||||
|
||||
Override with `--attn-implementation`:
|
||||
|
||||
```bash
|
||||
# Force FlashAttention-3 Hopper on H100
|
||||
python run_benchmark.py --attn-implementation flash_attention_3_hopper
|
||||
|
||||
# Use SDPA instead
|
||||
python run_benchmark.py --attn-implementation sdpa
|
||||
```
|
||||
|
||||
Available options:
|
||||
- `auto` - Auto-detect based on GPU
|
||||
- `flash_attention_2` - FlashAttention-2 (all GPUs)
|
||||
- `flash_attention_3_hopper` - FlashAttention-3 for H100/H200
|
||||
- `sdpa` - PyTorch Scaled Dot Product Attention
|
||||
- `eager` - Standard PyTorch attention
|
||||
|
||||
## Running on SLURM
|
||||
|
||||
All SLURM scripts are configured to run inside the apptainer container. First cache the model on the head node:
|
||||
|
||||
```bash
|
||||
# On head node (with internet access)
|
||||
apptainer exec --nv pytorch_25.10_tilelang.sif python cache_model.py \
|
||||
--model-name Qwen/Qwen3-4B \
|
||||
--cache-dir ./model_cache
|
||||
```
|
||||
|
||||
Then submit jobs:
|
||||
|
||||
```bash
|
||||
# A100
|
||||
sbatch slurm_a100.sh
|
||||
|
||||
# H100
|
||||
sbatch slurm_h100.sh
|
||||
|
||||
# H200
|
||||
sbatch slurm_h200.sh
|
||||
|
||||
# MI300X
|
||||
sbatch slurm_mi300x.sh
|
||||
```
|
||||
|
||||
**Note**:
|
||||
- NVIDIA GPUs use `--nv` flag
|
||||
- AMD GPUs use `--rocm` flag
|
||||
|
||||
## Output
|
||||
|
||||
Results are saved to the `--output-dir` directory (default: `./results/`):
|
||||
|
||||
- `pretrain_<GPU>_<ATTENTION>.json` - Pretraining metrics
|
||||
- `inference_<GPU>_<ATTENTION>.json` - Inference metrics
|
||||
|
||||
Example output:
|
||||
|
||||
```
|
||||
===============================================================================
|
||||
PRETRAINING BENCHMARK RESULTS
|
||||
===============================================================================
|
||||
|
||||
Model: Qwen/Qwen3-4B
|
||||
GPU: NVIDIA A100 80GB
|
||||
Attention: flash_attention_2
|
||||
Batch Size: 8
|
||||
Sequence Length: 8192
|
||||
Training Steps: 10
|
||||
|
||||
-------------------------------------------------------------------------------
|
||||
STAGE BREAKDOWN
|
||||
-------------------------------------------------------------------------------
|
||||
|
||||
[1] FORWARD PASS
|
||||
Duration: 1005.23 ms
|
||||
Tokens: 163,840
|
||||
Throughput: 163,012.45 tokens/s
|
||||
Energy: 253.0 J
|
||||
Energy per Token: 1.5443 mJ/token
|
||||
|
||||
[2] BACKWARD PASS
|
||||
Duration: 2052.11 ms
|
||||
Tokens: 163,840
|
||||
Throughput: 79,857.23 tokens/s
|
||||
Energy: 516.2 J
|
||||
Energy per Token: 3.1513 mJ/token
|
||||
|
||||
[3] OPTIMIZER STEP
|
||||
Duration: 153.42 ms
|
||||
Tokens: 163,840
|
||||
Throughput: 1,068,012.34 tokens/s
|
||||
Energy: 38.4 J
|
||||
Energy per Token: 0.2344 mJ/token
|
||||
|
||||
-------------------------------------------------------------------------------
|
||||
OVERALL METRICS
|
||||
-------------------------------------------------------------------------------
|
||||
Total Duration: 3210.76 ms
|
||||
Total Tokens: 163,840
|
||||
Throughput: 51,012.45 tokens/s
|
||||
Total Energy: 807.6 J
|
||||
Energy per Token: 4.9300 mJ/token
|
||||
===============================================================================
|
||||
```
|
||||
|
||||
## Key Metrics Reference
|
||||
|
||||
### Pretraining
|
||||
- **Forward**: Input processing and loss calculation
|
||||
- **Backward**: Gradient computation
|
||||
- **Optimizer**: Weight updates
|
||||
|
||||
### Inference
|
||||
- **TTFT (Time to First Token)**: Prefill latency
|
||||
- **ITL (Inter-Token Latency)**: Average decode time per token
|
||||
- **E2E Latency**: Total request time (prefill + decode)
|
||||
|
||||
### Energy
|
||||
- **Energy (J)**: Total energy consumed
|
||||
- **Energy per Token (mJ/token)**: Energy efficiency metric
|
||||
- **Average Power (W)**: Power consumption during stage
|
||||
|
||||
## Troubleshooting
|
||||
|
||||
### Model Not Found
|
||||
Ensure you've cached the model first:
|
||||
```bash
|
||||
python cache_model.py --model-name Qwen/Qwen2.5-3B-Instruct --cache-dir ./model_cache
|
||||
```
|
||||
|
||||
### GPU Monitoring Errors
|
||||
- **NVIDIA**: Install pynvml: `pip install pynvml`
|
||||
- **AMD**: Install pyrsmi: `pip install pyrsmi`
|
||||
|
||||
### FlashAttention-3 Not Found
|
||||
For H100/H200, ensure FlashAttention-3 is installed. If not available, use:
|
||||
```bash
|
||||
python run_benchmark.py --attn-implementation flash_attention_2
|
||||
```
|
||||
|
||||
### Out of Memory
|
||||
Reduce batch size or sequence length:
|
||||
```bash
|
||||
python run_benchmark.py --batch-size 4 --sequence-length 1024
|
||||
```
|
||||
|
||||
## Citation
|
||||
|
||||
If you use this benchmark suite, please cite:
|
||||
- [FlashAttention-2](https://github.com/Dao-AILab/flash-attention)
|
||||
- [FlashAttention-3](https://github.com/Dao-AILab/flash-attention) (for Hopper)
|
||||
- [Qwen Models](https://huggingface.co/Qwen)
|
||||
|
||||
## License
|
||||
|
||||
MIT License - see LICENSE file for details
|
||||
417
benchmark_inference.py
Executable file
417
benchmark_inference.py
Executable file
@@ -0,0 +1,417 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Inference Benchmark for LLM Performance Evaluation
|
||||
|
||||
Measures performance and energy metrics for inference workloads with
|
||||
separate measurements for prefill and decode stages.
|
||||
"""
|
||||
|
||||
import argparse
|
||||
import os
|
||||
import sys
|
||||
import time
|
||||
from pathlib import Path
|
||||
from typing import Optional
|
||||
|
||||
import torch
|
||||
from transformers import AutoModelForCausalLM, AutoTokenizer
|
||||
from tqdm import tqdm
|
||||
|
||||
# Add utils to path
|
||||
sys.path.insert(0, str(Path(__file__).parent))
|
||||
from utils.gpu_monitor import get_gpu_monitor
|
||||
from utils.metrics import StageMetrics, InferenceMetrics, MetricsReporter
|
||||
from utils.attention import get_default_attention, configure_model_attention, validate_attention_for_gpu
|
||||
|
||||
|
||||
def benchmark_inference(
|
||||
model_name_or_path: str,
|
||||
attn_implementation: str = "auto",
|
||||
num_requests: int = 10,
|
||||
prompt_length: int = 512,
|
||||
generation_length: int = 100,
|
||||
warmup_requests: int = 2,
|
||||
device: str = "cuda",
|
||||
device_id: int = 0,
|
||||
output_dir: Optional[str] = None,
|
||||
verbose: bool = True,
|
||||
):
|
||||
"""
|
||||
Run inference benchmark.
|
||||
|
||||
Args:
|
||||
model_name_or_path: Path to model or HuggingFace identifier
|
||||
attn_implementation: Attention implementation to use
|
||||
num_requests: Number of inference requests to measure
|
||||
prompt_length: Length of input prompt
|
||||
generation_length: Number of tokens to generate
|
||||
warmup_requests: Number of warmup requests
|
||||
device: Device to use
|
||||
device_id: GPU device ID
|
||||
output_dir: Directory to save results
|
||||
verbose: Print verbose output
|
||||
"""
|
||||
print("=" * 80)
|
||||
print("INFERENCE BENCHMARK")
|
||||
print("=" * 80)
|
||||
|
||||
# Initialize GPU monitor
|
||||
if verbose:
|
||||
print("\n[1/7] Initializing GPU monitor...")
|
||||
monitor = get_gpu_monitor(device_id)
|
||||
gpu_name = monitor.get_device_name()
|
||||
if verbose:
|
||||
print(f" GPU: {gpu_name}")
|
||||
|
||||
# Determine attention implementation
|
||||
if attn_implementation == "auto":
|
||||
attn_implementation = get_default_attention(gpu_name)
|
||||
if verbose:
|
||||
print(f" Auto-selected attention: {attn_implementation}")
|
||||
|
||||
# Validate attention for GPU
|
||||
valid, warning = validate_attention_for_gpu(attn_implementation, gpu_name)
|
||||
if warning and verbose:
|
||||
print(f" ⚠ {warning}")
|
||||
|
||||
# Load model
|
||||
if verbose:
|
||||
print(f"\n[2/7] Loading model: {model_name_or_path}")
|
||||
|
||||
# Determine attn_implementation parameter for model loading
|
||||
load_attn = "flash_attention_2" if attn_implementation in ["flash_attention_2", "flash_attention_3_hopper"] else attn_implementation
|
||||
|
||||
try:
|
||||
model = AutoModelForCausalLM.from_pretrained(
|
||||
model_name_or_path,
|
||||
torch_dtype=torch.bfloat16,
|
||||
attn_implementation=load_attn,
|
||||
trust_remote_code=True,
|
||||
)
|
||||
model = model.to(device)
|
||||
|
||||
# Configure attention (patch if needed for FA3)
|
||||
model = configure_model_attention(model, attn_implementation, verbose=verbose)
|
||||
|
||||
if verbose:
|
||||
total_params = sum(p.numel() for p in model.parameters())
|
||||
print(f" Total parameters: {total_params:,} ({total_params/1e9:.2f}B)")
|
||||
except Exception as e:
|
||||
print(f"✗ Error loading model: {e}")
|
||||
sys.exit(1)
|
||||
|
||||
# Load tokenizer
|
||||
if verbose:
|
||||
print(f"\n[3/7] Loading tokenizer...")
|
||||
try:
|
||||
tokenizer = AutoTokenizer.from_pretrained(
|
||||
model_name_or_path,
|
||||
trust_remote_code=True
|
||||
)
|
||||
except Exception as e:
|
||||
print(f"✗ Error loading tokenizer: {e}")
|
||||
sys.exit(1)
|
||||
|
||||
# Generate synthetic prompts
|
||||
if verbose:
|
||||
print(f"\n[4/7] Generating synthetic prompts...")
|
||||
print(f" Prompt length: {prompt_length}")
|
||||
print(f" Generation length: {generation_length}")
|
||||
|
||||
# Create random input_ids (synthetic prompts)
|
||||
vocab_size = model.config.vocab_size
|
||||
# We'll create one prompt and reuse it
|
||||
prompt_ids = torch.randint(0, vocab_size, (1, prompt_length), device=device)
|
||||
|
||||
# Warmup
|
||||
if verbose:
|
||||
print(f"\n[5/7] Running warmup ({warmup_requests} requests)...")
|
||||
model.eval()
|
||||
with torch.no_grad():
|
||||
for _ in range(warmup_requests):
|
||||
_ = model.generate(
|
||||
prompt_ids,
|
||||
max_new_tokens=generation_length,
|
||||
do_sample=False,
|
||||
pad_token_id=tokenizer.eos_token_id
|
||||
)
|
||||
|
||||
# Synchronize before benchmarking
|
||||
torch.cuda.synchronize()
|
||||
|
||||
# Benchmark
|
||||
if verbose:
|
||||
print(f"\n[6/7] Running benchmark ({num_requests} requests)...")
|
||||
|
||||
# Storage for per-request metrics
|
||||
prefill_times = []
|
||||
decode_times = []
|
||||
e2e_times = []
|
||||
|
||||
prefill_energies = []
|
||||
decode_energies = []
|
||||
e2e_energies = []
|
||||
|
||||
prefill_powers = []
|
||||
decode_powers = []
|
||||
|
||||
memory_usage = []
|
||||
gpu_utils = []
|
||||
|
||||
# For inference, we separate prefill (first token) from decode (remaining tokens)
|
||||
# We'll use a custom generation loop to measure them separately
|
||||
|
||||
for req_idx in tqdm(range(num_requests), desc="Benchmarking"):
|
||||
# === PREFILL PHASE (Time to First Token) ===
|
||||
# This is the forward pass with the prompt to get the first token
|
||||
|
||||
monitor.start_monitoring()
|
||||
torch.cuda.synchronize()
|
||||
prefill_start = time.perf_counter()
|
||||
|
||||
with torch.no_grad():
|
||||
# Forward pass with prompt
|
||||
outputs = model(input_ids=prompt_ids, use_cache=True)
|
||||
logits = outputs.logits
|
||||
past_key_values = outputs.past_key_values
|
||||
|
||||
# Get first generated token
|
||||
next_token_logits = logits[:, -1, :]
|
||||
next_token = torch.argmax(next_token_logits, dim=-1, keepdim=True)
|
||||
|
||||
torch.cuda.synchronize()
|
||||
prefill_time = time.perf_counter() - prefill_start
|
||||
prefill_energy = monitor.get_energy_consumed()
|
||||
prefill_power = monitor.get_average_power()
|
||||
|
||||
prefill_times.append(prefill_time * 1000) # Convert to ms
|
||||
prefill_energies.append(prefill_energy)
|
||||
prefill_powers.append(prefill_power)
|
||||
|
||||
# === DECODE PHASE (Inter-Token Latency) ===
|
||||
# Generate remaining tokens one by one
|
||||
|
||||
monitor.start_monitoring()
|
||||
torch.cuda.synchronize()
|
||||
decode_start = time.perf_counter()
|
||||
|
||||
generated_tokens = [next_token]
|
||||
|
||||
with torch.no_grad():
|
||||
for _ in range(generation_length - 1):
|
||||
# Forward pass with single token using cached keys/values
|
||||
outputs = model(
|
||||
input_ids=next_token,
|
||||
past_key_values=past_key_values,
|
||||
use_cache=True
|
||||
)
|
||||
logits = outputs.logits
|
||||
past_key_values = outputs.past_key_values
|
||||
|
||||
# Get next token
|
||||
next_token_logits = logits[:, -1, :]
|
||||
next_token = torch.argmax(next_token_logits, dim=-1, keepdim=True)
|
||||
generated_tokens.append(next_token)
|
||||
|
||||
torch.cuda.synchronize()
|
||||
decode_time = time.perf_counter() - decode_start
|
||||
decode_energy = monitor.get_energy_consumed()
|
||||
decode_power = monitor.get_average_power()
|
||||
|
||||
decode_times.append(decode_time * 1000) # Convert to ms
|
||||
decode_energies.append(decode_energy)
|
||||
decode_powers.append(decode_power)
|
||||
|
||||
# End-to-end metrics
|
||||
e2e_time = prefill_time + decode_time
|
||||
e2e_energy = prefill_energy + decode_energy
|
||||
|
||||
e2e_times.append(e2e_time * 1000) # Convert to ms
|
||||
e2e_energies.append(e2e_energy)
|
||||
|
||||
# Get memory and utilization
|
||||
metrics = monitor.get_metrics()
|
||||
memory_usage.append(metrics.memory_used_gb)
|
||||
gpu_utils.append(metrics.gpu_utilization_percent)
|
||||
|
||||
# Compute aggregated metrics
|
||||
|
||||
# Prefill metrics (TTFT)
|
||||
prefill_duration_ms = sum(prefill_times)
|
||||
prefill_energy_j = sum(prefill_energies)
|
||||
prefill_tokens = prompt_length * num_requests
|
||||
prefill_tps = prefill_tokens / (prefill_duration_ms / 1000)
|
||||
prefill_ept = prefill_energy_j / prefill_tokens
|
||||
avg_ttft_ms = sum(prefill_times) / len(prefill_times)
|
||||
|
||||
prefill_metrics = StageMetrics(
|
||||
stage_name="prefill",
|
||||
duration_ms=prefill_duration_ms,
|
||||
tokens_processed=prefill_tokens,
|
||||
tokens_per_second=prefill_tps,
|
||||
energy_joules=prefill_energy_j,
|
||||
energy_per_token=prefill_ept,
|
||||
avg_power_watts=sum(prefill_powers) / len(prefill_powers),
|
||||
peak_memory_gb=max(memory_usage),
|
||||
avg_gpu_util_percent=sum(gpu_utils) / len(gpu_utils)
|
||||
)
|
||||
|
||||
# Decode metrics (ITL)
|
||||
decode_duration_ms = sum(decode_times)
|
||||
decode_energy_j = sum(decode_energies)
|
||||
decode_tokens = generation_length * num_requests
|
||||
decode_tps = decode_tokens / (decode_duration_ms / 1000)
|
||||
decode_ept = decode_energy_j / decode_tokens
|
||||
avg_itl_ms = sum(decode_times) / len(decode_times) / generation_length
|
||||
|
||||
decode_metrics = StageMetrics(
|
||||
stage_name="decode",
|
||||
duration_ms=decode_duration_ms,
|
||||
tokens_processed=decode_tokens,
|
||||
tokens_per_second=decode_tps,
|
||||
energy_joules=decode_energy_j,
|
||||
energy_per_token=decode_ept,
|
||||
avg_power_watts=sum(decode_powers) / len(decode_powers),
|
||||
peak_memory_gb=max(memory_usage),
|
||||
avg_gpu_util_percent=sum(gpu_utils) / len(gpu_utils)
|
||||
)
|
||||
|
||||
# End-to-end metrics
|
||||
e2e_latency_ms = sum(e2e_times) / len(e2e_times)
|
||||
e2e_energy_j = sum(e2e_energies)
|
||||
total_tokens = (prompt_length + generation_length) * num_requests
|
||||
e2e_tps = total_tokens / (sum(e2e_times) / 1000)
|
||||
e2e_ept = e2e_energy_j / total_tokens
|
||||
|
||||
# Create metrics object
|
||||
metrics = InferenceMetrics(
|
||||
model_name=model_name_or_path,
|
||||
gpu_name=gpu_name,
|
||||
attention_implementation=attn_implementation,
|
||||
num_requests=num_requests,
|
||||
prompt_length=prompt_length,
|
||||
generation_length=generation_length,
|
||||
prefill=prefill_metrics,
|
||||
decode=decode_metrics,
|
||||
e2e_latency_ms=e2e_latency_ms,
|
||||
e2e_tokens_per_second=e2e_tps,
|
||||
e2e_energy_joules=e2e_energy_j,
|
||||
e2e_energy_per_token=e2e_ept,
|
||||
ttft_ms=avg_ttft_ms,
|
||||
itl_ms=avg_itl_ms
|
||||
)
|
||||
|
||||
# Print results
|
||||
if verbose:
|
||||
print()
|
||||
MetricsReporter.print_inference_metrics(metrics, verbose=verbose)
|
||||
|
||||
# Save results
|
||||
if output_dir:
|
||||
output_path = Path(output_dir)
|
||||
output_path.mkdir(parents=True, exist_ok=True)
|
||||
|
||||
# Save JSON
|
||||
json_path = output_path / f"inference_{gpu_name.replace(' ', '_')}_{attn_implementation}.json"
|
||||
MetricsReporter.save_json(metrics, json_path)
|
||||
|
||||
# Cleanup
|
||||
monitor.cleanup()
|
||||
del model
|
||||
torch.cuda.empty_cache()
|
||||
|
||||
return metrics
|
||||
|
||||
|
||||
def main():
|
||||
parser = argparse.ArgumentParser(
|
||||
description="LLM Inference Benchmark",
|
||||
formatter_class=argparse.RawDescriptionHelpFormatter
|
||||
)
|
||||
|
||||
parser.add_argument(
|
||||
"--model-path",
|
||||
type=str,
|
||||
default="./model_cache",
|
||||
help="Path to cached model"
|
||||
)
|
||||
|
||||
parser.add_argument(
|
||||
"--model-name",
|
||||
type=str,
|
||||
default="Qwen/Qwen3-4B",
|
||||
help="Model name (for reporting)"
|
||||
)
|
||||
|
||||
parser.add_argument(
|
||||
"--attn-implementation",
|
||||
type=str,
|
||||
default="auto",
|
||||
choices=["auto", "flash_attention_2", "flash_attention_3_hopper", "sdpa", "eager"],
|
||||
help="Attention implementation to use"
|
||||
)
|
||||
|
||||
parser.add_argument(
|
||||
"--num-requests",
|
||||
type=int,
|
||||
default=10,
|
||||
help="Number of inference requests"
|
||||
)
|
||||
|
||||
parser.add_argument(
|
||||
"--prompt-length",
|
||||
type=int,
|
||||
default=512,
|
||||
help="Prompt length in tokens"
|
||||
)
|
||||
|
||||
parser.add_argument(
|
||||
"--generation-length",
|
||||
type=int,
|
||||
default=100,
|
||||
help="Number of tokens to generate"
|
||||
)
|
||||
|
||||
parser.add_argument(
|
||||
"--warmup-requests",
|
||||
type=int,
|
||||
default=2,
|
||||
help="Number of warmup requests"
|
||||
)
|
||||
|
||||
parser.add_argument(
|
||||
"--device-id",
|
||||
type=int,
|
||||
default=0,
|
||||
help="GPU device ID"
|
||||
)
|
||||
|
||||
parser.add_argument(
|
||||
"--output-dir",
|
||||
type=str,
|
||||
default="./results",
|
||||
help="Output directory for results"
|
||||
)
|
||||
|
||||
args = parser.parse_args()
|
||||
|
||||
# Set environment variables for HuggingFace cache
|
||||
if Path(args.model_path).exists():
|
||||
os.environ['HF_HOME'] = args.model_path
|
||||
|
||||
benchmark_inference(
|
||||
model_name_or_path=args.model_name,
|
||||
attn_implementation=args.attn_implementation,
|
||||
num_requests=args.num_requests,
|
||||
prompt_length=args.prompt_length,
|
||||
generation_length=args.generation_length,
|
||||
warmup_requests=args.warmup_requests,
|
||||
device="cuda",
|
||||
device_id=args.device_id,
|
||||
output_dir=args.output_dir,
|
||||
verbose=True
|
||||
)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
406
benchmark_pretrain.py
Executable file
406
benchmark_pretrain.py
Executable file
@@ -0,0 +1,406 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Pretraining Benchmark for LLM Performance Evaluation
|
||||
|
||||
Measures performance and energy metrics for pretraining workloads with
|
||||
separate measurements for forward, backward, and optimizer stages.
|
||||
"""
|
||||
|
||||
import argparse
|
||||
import os
|
||||
import sys
|
||||
import time
|
||||
from pathlib import Path
|
||||
from typing import Optional
|
||||
|
||||
import torch
|
||||
import torch.nn as nn
|
||||
from transformers import AutoModelForCausalLM, AutoTokenizer
|
||||
from tqdm import tqdm
|
||||
|
||||
# Add utils to path
|
||||
sys.path.insert(0, str(Path(__file__).parent))
|
||||
from utils.gpu_monitor import get_gpu_monitor
|
||||
from utils.metrics import StageMetrics, PretrainMetrics, MetricsReporter
|
||||
from utils.attention import get_default_attention, configure_model_attention, validate_attention_for_gpu
|
||||
|
||||
|
||||
def benchmark_pretrain(
|
||||
model_name_or_path: str,
|
||||
attn_implementation: str = "auto",
|
||||
batch_size: int = 8,
|
||||
sequence_length: int = 2048,
|
||||
num_steps: int = 10,
|
||||
warmup_steps: int = 3,
|
||||
device: str = "cuda",
|
||||
device_id: int = 0,
|
||||
output_dir: Optional[str] = None,
|
||||
verbose: bool = True,
|
||||
):
|
||||
"""
|
||||
Run pretraining benchmark.
|
||||
|
||||
Args:
|
||||
model_name_or_path: Path to model or HuggingFace identifier
|
||||
attn_implementation: Attention implementation to use
|
||||
batch_size: Batch size for training
|
||||
sequence_length: Sequence length
|
||||
num_steps: Number of training steps to measure
|
||||
warmup_steps: Number of warmup steps before measurement
|
||||
device: Device to use
|
||||
device_id: GPU device ID
|
||||
output_dir: Directory to save results
|
||||
verbose: Print verbose output
|
||||
"""
|
||||
print("=" * 80)
|
||||
print("PRETRAINING BENCHMARK")
|
||||
print("=" * 80)
|
||||
|
||||
# Initialize GPU monitor
|
||||
if verbose:
|
||||
print("\n[1/6] Initializing GPU monitor...")
|
||||
monitor = get_gpu_monitor(device_id)
|
||||
gpu_name = monitor.get_device_name()
|
||||
if verbose:
|
||||
print(f" GPU: {gpu_name}")
|
||||
|
||||
# Determine attention implementation
|
||||
if attn_implementation == "auto":
|
||||
attn_implementation = get_default_attention(gpu_name)
|
||||
if verbose:
|
||||
print(f" Auto-selected attention: {attn_implementation}")
|
||||
|
||||
# Validate attention for GPU
|
||||
valid, warning = validate_attention_for_gpu(attn_implementation, gpu_name)
|
||||
if warning and verbose:
|
||||
print(f" ⚠ {warning}")
|
||||
|
||||
# Load model
|
||||
if verbose:
|
||||
print(f"\n[2/6] Loading model: {model_name_or_path}")
|
||||
|
||||
# Determine attn_implementation parameter for model loading
|
||||
load_attn = "flash_attention_2" if attn_implementation in ["flash_attention_2", "flash_attention_3_hopper"] else attn_implementation
|
||||
|
||||
try:
|
||||
model = AutoModelForCausalLM.from_pretrained(
|
||||
model_name_or_path,
|
||||
torch_dtype=torch.bfloat16,
|
||||
attn_implementation=load_attn,
|
||||
trust_remote_code=True,
|
||||
)
|
||||
model = model.to(device)
|
||||
|
||||
# Configure attention (patch if needed for FA3)
|
||||
model = configure_model_attention(model, attn_implementation, verbose=verbose)
|
||||
|
||||
if verbose:
|
||||
total_params = sum(p.numel() for p in model.parameters())
|
||||
trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
|
||||
print(f" Total parameters: {total_params:,} ({total_params/1e9:.2f}B)")
|
||||
print(f" Trainable parameters: {trainable_params:,}")
|
||||
except Exception as e:
|
||||
print(f"✗ Error loading model: {e}")
|
||||
sys.exit(1)
|
||||
|
||||
# Setup optimizer
|
||||
if verbose:
|
||||
print(f"\n[3/6] Setting up optimizer...")
|
||||
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-4)
|
||||
|
||||
# Generate synthetic training data
|
||||
if verbose:
|
||||
print(f"\n[4/6] Generating synthetic training data...")
|
||||
print(f" Batch size: {batch_size}")
|
||||
print(f" Sequence length: {sequence_length}")
|
||||
|
||||
# Create random input_ids (synthetic data)
|
||||
vocab_size = model.config.vocab_size
|
||||
input_ids = torch.randint(0, vocab_size, (batch_size, sequence_length), device=device)
|
||||
labels = input_ids.clone()
|
||||
|
||||
# Warmup
|
||||
if verbose:
|
||||
print(f"\n[5/6] Running warmup ({warmup_steps} steps)...")
|
||||
model.train()
|
||||
for _ in range(warmup_steps):
|
||||
optimizer.zero_grad()
|
||||
outputs = model(input_ids=input_ids, labels=labels)
|
||||
loss = outputs.loss
|
||||
loss.backward()
|
||||
optimizer.step()
|
||||
|
||||
# Synchronize before benchmarking
|
||||
torch.cuda.synchronize()
|
||||
|
||||
# Benchmark
|
||||
if verbose:
|
||||
print(f"\n[6/6] Running benchmark ({num_steps} steps)...")
|
||||
|
||||
# Storage for per-step metrics
|
||||
forward_times = []
|
||||
backward_times = []
|
||||
optimizer_times = []
|
||||
|
||||
forward_energies = []
|
||||
backward_energies = []
|
||||
optimizer_energies = []
|
||||
|
||||
forward_powers = []
|
||||
backward_powers = []
|
||||
optimizer_powers = []
|
||||
|
||||
memory_usage = []
|
||||
gpu_utils = []
|
||||
|
||||
total_tokens = batch_size * sequence_length * num_steps
|
||||
|
||||
for step in tqdm(range(num_steps), desc="Benchmarking"):
|
||||
# === FORWARD PASS ===
|
||||
monitor.start_monitoring()
|
||||
torch.cuda.synchronize()
|
||||
start_time = time.perf_counter()
|
||||
|
||||
optimizer.zero_grad()
|
||||
outputs = model(input_ids=input_ids, labels=labels)
|
||||
loss = outputs.loss
|
||||
|
||||
torch.cuda.synchronize()
|
||||
forward_time = time.perf_counter() - start_time
|
||||
forward_energy = monitor.get_energy_consumed()
|
||||
forward_power = monitor.get_average_power()
|
||||
|
||||
forward_times.append(forward_time * 1000) # Convert to ms
|
||||
forward_energies.append(forward_energy)
|
||||
forward_powers.append(forward_power)
|
||||
|
||||
# === BACKWARD PASS ===
|
||||
monitor.start_monitoring()
|
||||
torch.cuda.synchronize()
|
||||
start_time = time.perf_counter()
|
||||
|
||||
loss.backward()
|
||||
|
||||
torch.cuda.synchronize()
|
||||
backward_time = time.perf_counter() - start_time
|
||||
backward_energy = monitor.get_energy_consumed()
|
||||
backward_power = monitor.get_average_power()
|
||||
|
||||
backward_times.append(backward_time * 1000) # Convert to ms
|
||||
backward_energies.append(backward_energy)
|
||||
backward_powers.append(backward_power)
|
||||
|
||||
# === OPTIMIZER STEP ===
|
||||
monitor.start_monitoring()
|
||||
torch.cuda.synchronize()
|
||||
start_time = time.perf_counter()
|
||||
|
||||
optimizer.step()
|
||||
|
||||
torch.cuda.synchronize()
|
||||
optimizer_time = time.perf_counter() - start_time
|
||||
optimizer_energy = monitor.get_energy_consumed()
|
||||
optimizer_power = monitor.get_average_power()
|
||||
|
||||
optimizer_times.append(optimizer_time * 1000) # Convert to ms
|
||||
optimizer_energies.append(optimizer_energy)
|
||||
optimizer_powers.append(optimizer_power)
|
||||
|
||||
# Get memory and utilization
|
||||
metrics = monitor.get_metrics()
|
||||
memory_usage.append(metrics.memory_used_gb)
|
||||
gpu_utils.append(metrics.gpu_utilization_percent)
|
||||
|
||||
# Compute aggregated metrics
|
||||
tokens_per_step = batch_size * sequence_length
|
||||
|
||||
# Forward metrics
|
||||
forward_duration_ms = sum(forward_times)
|
||||
forward_energy_j = sum(forward_energies)
|
||||
forward_tokens = tokens_per_step * num_steps
|
||||
forward_tps = forward_tokens / (forward_duration_ms / 1000)
|
||||
forward_ept = forward_energy_j / forward_tokens
|
||||
forward_metrics = StageMetrics(
|
||||
stage_name="forward",
|
||||
duration_ms=forward_duration_ms,
|
||||
tokens_processed=forward_tokens,
|
||||
tokens_per_second=forward_tps,
|
||||
energy_joules=forward_energy_j,
|
||||
energy_per_token=forward_ept,
|
||||
avg_power_watts=sum(forward_powers) / len(forward_powers),
|
||||
peak_memory_gb=max(memory_usage),
|
||||
avg_gpu_util_percent=sum(gpu_utils) / len(gpu_utils)
|
||||
)
|
||||
|
||||
# Backward metrics
|
||||
backward_duration_ms = sum(backward_times)
|
||||
backward_energy_j = sum(backward_energies)
|
||||
backward_tokens = tokens_per_step * num_steps
|
||||
backward_tps = backward_tokens / (backward_duration_ms / 1000)
|
||||
backward_ept = backward_energy_j / backward_tokens
|
||||
backward_metrics = StageMetrics(
|
||||
stage_name="backward",
|
||||
duration_ms=backward_duration_ms,
|
||||
tokens_processed=backward_tokens,
|
||||
tokens_per_second=backward_tps,
|
||||
energy_joules=backward_energy_j,
|
||||
energy_per_token=backward_ept,
|
||||
avg_power_watts=sum(backward_powers) / len(backward_powers),
|
||||
peak_memory_gb=max(memory_usage),
|
||||
avg_gpu_util_percent=sum(gpu_utils) / len(gpu_utils)
|
||||
)
|
||||
|
||||
# Optimizer metrics
|
||||
optimizer_duration_ms = sum(optimizer_times)
|
||||
optimizer_energy_j = sum(optimizer_energies)
|
||||
optimizer_tokens = tokens_per_step * num_steps
|
||||
optimizer_tps = optimizer_tokens / (optimizer_duration_ms / 1000)
|
||||
optimizer_ept = optimizer_energy_j / optimizer_tokens
|
||||
optimizer_metrics = StageMetrics(
|
||||
stage_name="optimizer",
|
||||
duration_ms=optimizer_duration_ms,
|
||||
tokens_processed=optimizer_tokens,
|
||||
tokens_per_second=optimizer_tps,
|
||||
energy_joules=optimizer_energy_j,
|
||||
energy_per_token=optimizer_ept,
|
||||
avg_power_watts=sum(optimizer_powers) / len(optimizer_powers),
|
||||
peak_memory_gb=max(memory_usage),
|
||||
avg_gpu_util_percent=sum(gpu_utils) / len(gpu_utils)
|
||||
)
|
||||
|
||||
# Overall metrics
|
||||
total_duration_ms = forward_duration_ms + backward_duration_ms + optimizer_duration_ms
|
||||
total_energy_j = forward_energy_j + backward_energy_j + optimizer_energy_j
|
||||
total_tps = total_tokens / (total_duration_ms / 1000)
|
||||
total_ept = total_energy_j / total_tokens
|
||||
|
||||
# Create metrics object
|
||||
metrics = PretrainMetrics(
|
||||
model_name=model_name_or_path,
|
||||
gpu_name=gpu_name,
|
||||
attention_implementation=attn_implementation,
|
||||
batch_size=batch_size,
|
||||
sequence_length=sequence_length,
|
||||
num_steps=num_steps,
|
||||
forward=forward_metrics,
|
||||
backward=backward_metrics,
|
||||
optimizer=optimizer_metrics,
|
||||
total_duration_ms=total_duration_ms,
|
||||
total_tokens=total_tokens,
|
||||
total_tokens_per_second=total_tps,
|
||||
total_energy_joules=total_energy_j,
|
||||
total_energy_per_token=total_ept
|
||||
)
|
||||
|
||||
# Print results
|
||||
MetricsReporter.print_pretrain_metrics(metrics, verbose=verbose)
|
||||
|
||||
# Save results
|
||||
if output_dir:
|
||||
output_path = Path(output_dir)
|
||||
output_path.mkdir(parents=True, exist_ok=True)
|
||||
|
||||
# Save JSON
|
||||
json_path = output_path / f"pretrain_{gpu_name.replace(' ', '_')}_{attn_implementation}.json"
|
||||
MetricsReporter.save_json(metrics, json_path)
|
||||
|
||||
# Cleanup
|
||||
monitor.cleanup()
|
||||
del model
|
||||
torch.cuda.empty_cache()
|
||||
|
||||
return metrics
|
||||
|
||||
|
||||
def main():
|
||||
parser = argparse.ArgumentParser(
|
||||
description="LLM Pretraining Benchmark",
|
||||
formatter_class=argparse.RawDescriptionHelpFormatter
|
||||
)
|
||||
|
||||
parser.add_argument(
|
||||
"--model-path",
|
||||
type=str,
|
||||
default="./model_cache",
|
||||
help="Path to cached model"
|
||||
)
|
||||
|
||||
parser.add_argument(
|
||||
"--model-name",
|
||||
type=str,
|
||||
default="Qwen/Qwen3-4B",
|
||||
help="Model name (for reporting)"
|
||||
)
|
||||
|
||||
parser.add_argument(
|
||||
"--attn-implementation",
|
||||
type=str,
|
||||
default="auto",
|
||||
choices=["auto", "flash_attention_2", "flash_attention_3_hopper", "sdpa", "eager"],
|
||||
help="Attention implementation to use"
|
||||
)
|
||||
|
||||
parser.add_argument(
|
||||
"--batch-size",
|
||||
type=int,
|
||||
default=8,
|
||||
help="Batch size"
|
||||
)
|
||||
|
||||
parser.add_argument(
|
||||
"--sequence-length",
|
||||
type=int,
|
||||
default=8192,
|
||||
help="Sequence length"
|
||||
)
|
||||
|
||||
parser.add_argument(
|
||||
"--num-steps",
|
||||
type=int,
|
||||
default=10,
|
||||
help="Number of training steps"
|
||||
)
|
||||
|
||||
parser.add_argument(
|
||||
"--warmup-steps",
|
||||
type=int,
|
||||
default=3,
|
||||
help="Number of warmup steps"
|
||||
)
|
||||
|
||||
parser.add_argument(
|
||||
"--device-id",
|
||||
type=int,
|
||||
default=0,
|
||||
help="GPU device ID"
|
||||
)
|
||||
|
||||
parser.add_argument(
|
||||
"--output-dir",
|
||||
type=str,
|
||||
default="./results",
|
||||
help="Output directory for results"
|
||||
)
|
||||
|
||||
args = parser.parse_args()
|
||||
|
||||
# Set environment variables for HuggingFace cache
|
||||
if Path(args.model_path).exists():
|
||||
os.environ['HF_HOME'] = args.model_path
|
||||
|
||||
benchmark_pretrain(
|
||||
model_name_or_path=args.model_name,
|
||||
attn_implementation=args.attn_implementation,
|
||||
batch_size=args.batch_size,
|
||||
sequence_length=args.sequence_length,
|
||||
num_steps=args.num_steps,
|
||||
warmup_steps=args.warmup_steps,
|
||||
device="cuda",
|
||||
device_id=args.device_id,
|
||||
output_dir=args.output_dir,
|
||||
verbose=True
|
||||
)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
151
cache_model.py
Executable file
151
cache_model.py
Executable file
@@ -0,0 +1,151 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Model Caching Script for LLM Benchmarking
|
||||
|
||||
This script downloads and caches the Qwen3-4B model from HuggingFace
|
||||
before running benchmarks on offline compute nodes.
|
||||
"""
|
||||
|
||||
import argparse
|
||||
import os
|
||||
import sys
|
||||
from pathlib import Path
|
||||
|
||||
def cache_model(model_name: str, cache_dir: str, force: bool = False):
|
||||
"""
|
||||
Download and cache a HuggingFace model.
|
||||
|
||||
Args:
|
||||
model_name: HuggingFace model identifier (e.g., "Qwen/Qwen3-4B-Instruct-2507")
|
||||
cache_dir: Local directory to cache the model
|
||||
force: Force re-download even if model exists
|
||||
"""
|
||||
try:
|
||||
from transformers import AutoTokenizer, AutoModelForCausalLM, AutoConfig
|
||||
except ImportError:
|
||||
print("Error: transformers library not found. Please install it:")
|
||||
print(" pip install transformers")
|
||||
sys.exit(1)
|
||||
|
||||
# Create cache directory
|
||||
cache_path = Path(cache_dir).resolve()
|
||||
cache_path.mkdir(parents=True, exist_ok=True)
|
||||
|
||||
print(f"Caching model: {model_name}")
|
||||
print(f"Cache directory: {cache_path}")
|
||||
print("-" * 60)
|
||||
|
||||
# Set HuggingFace cache directory
|
||||
os.environ['HF_HOME'] = str(cache_path)
|
||||
|
||||
# Check if model already exists
|
||||
model_path = cache_path / model_name.replace("/", "--")
|
||||
if model_path.exists() and not force:
|
||||
print(f"Model already cached at: {model_path}")
|
||||
print("Use --force to re-download")
|
||||
return str(cache_path)
|
||||
|
||||
try:
|
||||
# Download config
|
||||
print("\n[1/3] Downloading model config...")
|
||||
config = AutoConfig.from_pretrained(
|
||||
model_name,
|
||||
cache_dir=cache_path,
|
||||
trust_remote_code=True
|
||||
)
|
||||
print(f" ✓ Config downloaded")
|
||||
print(f" - Model type: {config.model_type}")
|
||||
print(f" - Hidden size: {config.hidden_size}")
|
||||
print(f" - Num layers: {config.num_hidden_layers}")
|
||||
print(f" - Num attention heads: {config.num_attention_heads}")
|
||||
|
||||
# Download tokenizer
|
||||
print("\n[2/3] Downloading tokenizer...")
|
||||
tokenizer = AutoTokenizer.from_pretrained(
|
||||
model_name,
|
||||
cache_dir=cache_path,
|
||||
trust_remote_code=True
|
||||
)
|
||||
print(f" ✓ Tokenizer downloaded")
|
||||
print(f" - Vocab size: {len(tokenizer)}")
|
||||
print(f" - Model max length: {tokenizer.model_max_length}")
|
||||
|
||||
# Download model weights
|
||||
print("\n[3/3] Downloading model weights...")
|
||||
print(" (This may take several minutes depending on connection speed)")
|
||||
model = AutoModelForCausalLM.from_pretrained(
|
||||
model_name,
|
||||
cache_dir=cache_path,
|
||||
trust_remote_code=True,
|
||||
torch_dtype="auto",
|
||||
low_cpu_mem_usage=True
|
||||
)
|
||||
print(f" ✓ Model weights downloaded")
|
||||
|
||||
# Calculate total parameters
|
||||
total_params = sum(p.numel() for p in model.parameters())
|
||||
print(f" - Total parameters: {total_params:,} ({total_params/1e9:.2f}B)")
|
||||
|
||||
# Clean up model from memory
|
||||
del model
|
||||
|
||||
print("\n" + "=" * 60)
|
||||
print("✓ Model successfully cached!")
|
||||
print("=" * 60)
|
||||
print(f"\nCache location: {cache_path}")
|
||||
print(f"\nTo use in benchmarks, set:")
|
||||
print(f" --model-path {cache_path}")
|
||||
print(f"\nOr set environment variable:")
|
||||
print(f" export HF_HOME={cache_path}")
|
||||
|
||||
return str(cache_path)
|
||||
|
||||
except Exception as e:
|
||||
print(f"\n✗ Error downloading model: {e}", file=sys.stderr)
|
||||
sys.exit(1)
|
||||
|
||||
|
||||
def main():
|
||||
parser = argparse.ArgumentParser(
|
||||
description="Cache HuggingFace model for offline use",
|
||||
formatter_class=argparse.RawDescriptionHelpFormatter,
|
||||
epilog="""
|
||||
Examples:
|
||||
# Cache model to default location
|
||||
python cache_model.py
|
||||
|
||||
# Cache model to custom directory
|
||||
python cache_model.py --cache-dir /path/to/cache
|
||||
|
||||
# Force re-download
|
||||
python cache_model.py --force
|
||||
"""
|
||||
)
|
||||
|
||||
parser.add_argument(
|
||||
"--model-name",
|
||||
type=str,
|
||||
default="Qwen/Qwen3-4B",
|
||||
help="HuggingFace model identifier (default: Qwen/Qwen3-4B)"
|
||||
)
|
||||
|
||||
parser.add_argument(
|
||||
"--cache-dir",
|
||||
type=str,
|
||||
default="./model_cache",
|
||||
help="Directory to cache model (default: ./model_cache in current directory)"
|
||||
)
|
||||
|
||||
parser.add_argument(
|
||||
"--force",
|
||||
action="store_true",
|
||||
help="Force re-download even if model exists"
|
||||
)
|
||||
|
||||
args = parser.parse_args()
|
||||
|
||||
cache_model(args.model_name, args.cache_dir, args.force)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
26
configs/a100.yaml
Normal file
26
configs/a100.yaml
Normal file
@@ -0,0 +1,26 @@
|
||||
# A100 Configuration
|
||||
gpu_type: a100
|
||||
gpu_model: "NVIDIA A100 80GB"
|
||||
|
||||
# Default attention implementation
|
||||
default_attention: flash_attention_2
|
||||
|
||||
# Pretraining defaults
|
||||
pretrain:
|
||||
batch_size: 8
|
||||
sequence_length: 8192
|
||||
num_steps: 10
|
||||
warmup_steps: 3
|
||||
|
||||
# Inference defaults
|
||||
inference:
|
||||
num_requests: 10
|
||||
prompt_length: 512
|
||||
generation_length: 100
|
||||
warmup_requests: 2
|
||||
|
||||
# Hardware specs (for reference)
|
||||
hardware:
|
||||
memory_gb: 80
|
||||
tdp_watts: 400
|
||||
compute_capability: "8.0"
|
||||
26
configs/h100.yaml
Normal file
@@ -0,0 +1,26 @@
|
||||
# H100 Configuration
|
||||
gpu_type: h100
|
||||
gpu_model: "NVIDIA H100 80GB"
|
||||
|
||||
# Default attention implementation
|
||||
default_attention: flash_attention_3_hopper
|
||||
|
||||
# Pretraining defaults
|
||||
pretrain:
|
||||
batch_size: 8
|
||||
sequence_length: 8192
|
||||
num_steps: 10
|
||||
warmup_steps: 3
|
||||
|
||||
# Inference defaults
|
||||
inference:
|
||||
num_requests: 10
|
||||
prompt_length: 512
|
||||
generation_length: 100
|
||||
warmup_requests: 2
|
||||
|
||||
# Hardware specs (for reference)
|
||||
hardware:
|
||||
memory_gb: 80
|
||||
tdp_watts: 700
|
||||
compute_capability: "9.0"
|
||||
26
configs/h200.yaml
Normal file
@@ -0,0 +1,26 @@
|
||||
# H200 Configuration
|
||||
gpu_type: h200
|
||||
gpu_model: "NVIDIA H200 141GB"
|
||||
|
||||
# Default attention implementation
|
||||
default_attention: flash_attention_3_hopper
|
||||
|
||||
# Pretraining defaults
|
||||
pretrain:
|
||||
batch_size: 8
|
||||
sequence_length: 8192
|
||||
num_steps: 10
|
||||
warmup_steps: 3
|
||||
|
||||
# Inference defaults
|
||||
inference:
|
||||
num_requests: 10
|
||||
prompt_length: 512
|
||||
generation_length: 100
|
||||
warmup_requests: 2
|
||||
|
||||
# Hardware specs (for reference)
|
||||
hardware:
|
||||
memory_gb: 141
|
||||
tdp_watts: 700
|
||||
compute_capability: "9.0"
|
||||
26
configs/mi300x.yaml
Normal file
@@ -0,0 +1,26 @@
|
||||
# MI300X Configuration
|
||||
gpu_type: mi300x
|
||||
gpu_model: "AMD Instinct MI300X"
|
||||
|
||||
# Default attention implementation
|
||||
default_attention: flash_attention_2
|
||||
|
||||
# Pretraining defaults
|
||||
pretrain:
|
||||
batch_size: 8
|
||||
sequence_length: 8192
|
||||
num_steps: 10
|
||||
warmup_steps: 3
|
||||
|
||||
# Inference defaults
|
||||
inference:
|
||||
num_requests: 10
|
||||
prompt_length: 512
|
||||
generation_length: 100
|
||||
warmup_requests: 2
|
||||
|
||||
# Hardware specs (for reference)
|
||||
hardware:
|
||||
memory_gb: 192
|
||||
tdp_watts: 750
|
||||
compute_capability: "gfx940"
|
||||
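The four GPU configs above share one schema (default_attention, pretrain, inference, hardware). A short sketch of how such a file could be read with PyYAML, which is already listed in requirements.txt; the loader function itself is illustrative and not shown in this commit:

# Hedged sketch: read a GPU config and pull out its defaults.
import yaml

def load_gpu_config(path: str) -> dict:
    with open(path) as f:
        return yaml.safe_load(f)

cfg = load_gpu_config("configs/a100.yaml")
print(cfg["default_attention"])       # flash_attention_2
print(cfg["pretrain"]["batch_size"])  # 8
print(cfg["hardware"]["memory_gb"])   # 80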
122
quick_start.sh
Executable file
@@ -0,0 +1,122 @@
|
||||
#!/bin/bash
|
||||
# Quick Start Script for LLM Benchmark Suite
|
||||
#
|
||||
# This script helps you get started quickly with the benchmark suite.
|
||||
# It will:
|
||||
# 1. Check dependencies
|
||||
# 2. Cache the model if needed
|
||||
# 3. Run a quick test benchmark
|
||||
#
|
||||
# Usage: ./quick_start.sh [--skip-cache]
|
||||
|
||||
set -e # Exit on error
|
||||
|
||||
echo "========================================="
|
||||
echo "LLM Benchmark Suite - Quick Start"
|
||||
echo "========================================="
|
||||
|
||||
# Parse arguments
|
||||
SKIP_CACHE=false
|
||||
if [[ "$1" == "--skip-cache" ]]; then
|
||||
SKIP_CACHE=true
|
||||
fi
|
||||
|
||||
# Check Python
|
||||
echo ""
|
||||
echo "[1/5] Checking Python..."
|
||||
if ! command -v python &> /dev/null; then
|
||||
echo "✗ Python not found. Please install Python 3.8+"
|
||||
exit 1
|
||||
fi
|
||||
PYTHON_VERSION=$(python --version 2>&1 | awk '{print $2}')
|
||||
echo " ✓ Python $PYTHON_VERSION found"
|
||||
|
||||
# Check dependencies
|
||||
echo ""
|
||||
echo "[2/5] Checking dependencies..."
|
||||
MISSING_DEPS=()
|
||||
|
||||
if ! python -c "import torch" 2>/dev/null; then
|
||||
MISSING_DEPS+=("torch")
|
||||
fi
|
||||
|
||||
if ! python -c "import transformers" 2>/dev/null; then
|
||||
MISSING_DEPS+=("transformers")
|
||||
fi
|
||||
|
||||
if [ "${#MISSING_DEPS[@]}" -gt 0 ]; then
|
||||
echo " ⚠ Missing dependencies: ${MISSING_DEPS[*]}"
|
||||
echo " Installing dependencies..."
|
||||
pip install -r requirements.txt
|
||||
else
|
||||
echo " ✓ All dependencies installed"
|
||||
fi
|
||||
|
||||
# Check GPU
|
||||
echo ""
|
||||
echo "[3/5] Checking GPU..."
|
||||
if python -c "import torch; assert torch.cuda.is_available()" 2>/dev/null; then
|
||||
GPU_NAME=$(python -c "import torch; print(torch.cuda.get_device_name(0))")
|
||||
echo " ✓ GPU found: $GPU_NAME"
|
||||
else
|
||||
echo " ✗ No GPU found or CUDA not available"
|
||||
echo " This benchmark requires a GPU to run."
|
||||
exit 1
|
||||
fi
|
||||
|
||||
# Cache model
|
||||
if [ "$SKIP_CACHE" = false ]; then
|
||||
echo ""
|
||||
echo "[4/5] Caching model..."
|
||||
if [ -d "./model_cache" ] && [ "$(ls -A ./model_cache)" ]; then
|
||||
echo " ✓ Model cache already exists at ./model_cache"
|
||||
echo " To re-download, remove the directory and run again."
|
||||
else
|
||||
echo " Downloading Qwen/Qwen3-4B..."
|
||||
echo " (This may take several minutes depending on your connection)"
|
||||
python cache_model.py --model-name Qwen/Qwen3-4B --cache-dir ./model_cache
|
||||
fi
|
||||
else
|
||||
echo ""
|
||||
echo "[4/5] Skipping model cache (--skip-cache specified)"
|
||||
fi
|
||||
|
||||
# Run quick test
|
||||
echo ""
|
||||
echo "[5/5] Running quick test benchmark..."
|
||||
echo " This will run a minimal benchmark to verify everything works."
|
||||
echo " Parameters: 2 steps, batch size 2, sequence length 512"
|
||||
echo ""
|
||||
|
||||
python run_benchmark.py \
|
||||
--mode both \
|
||||
--model-path ./model_cache \
|
||||
--model-name Qwen/Qwen3-4B \
|
||||
--batch-size 2 \
|
||||
--sequence-length 512 \
|
||||
--num-steps 2 \
|
||||
--num-requests 2 \
|
||||
--prompt-length 256 \
|
||||
--generation-length 20 \
|
||||
--output-dir ./results/test
|
||||
|
||||
echo ""
|
||||
echo "========================================="
|
||||
echo "Quick Start Complete!"
|
||||
echo "========================================="
|
||||
echo ""
|
||||
echo "Next steps:"
|
||||
echo " 1. Run full benchmarks:"
|
||||
echo " python run_benchmark.py --mode both"
|
||||
echo ""
|
||||
echo " 2. Run on different GPUs using SLURM:"
|
||||
echo " sbatch slurm_a100.sh"
|
||||
echo " sbatch slurm_h100.sh"
|
||||
echo " sbatch slurm_h200.sh"
|
||||
echo " sbatch slurm_mi300x.sh"
|
||||
echo ""
|
||||
echo " 3. View results:"
|
||||
echo " ls -l results/"
|
||||
echo ""
|
||||
echo "For more information, see README.md"
|
||||
echo ""
|
||||
22
requirements.txt
Normal file
@@ -0,0 +1,22 @@
|
||||
# LLM Benchmark Suite - Requirements
|
||||
|
||||
# Core dependencies
|
||||
torch>=2.0.0
|
||||
transformers>=4.35.0
|
||||
accelerate>=0.24.0
|
||||
tokenizers>=0.14.0
|
||||
|
||||
# Attention implementations
|
||||
flash-attn>=2.0.0
|
||||
|
||||
# GPU monitoring
|
||||
pynvml>=11.5.0 # NVIDIA GPU monitoring
|
||||
pyrsmi>=1.0.0 # AMD GPU monitoring
|
||||
|
||||
# Utilities
|
||||
numpy>=1.24.0
|
||||
pyyaml>=6.0
|
||||
tqdm>=4.65.0
|
||||
|
||||
# Optional: for better performance
|
||||
triton>=2.0.0
|
||||
@@ -0,0 +1,37 @@
|
||||
{
|
||||
"model_name": "Qwen/Qwen3-4B",
|
||||
"gpu_name": "NVIDIA A100-SXM4-80GB",
|
||||
"attention_implementation": "flash_attention_2",
|
||||
"num_requests": 10,
|
||||
"prompt_length": 512,
|
||||
"generation_length": 100,
|
||||
"prefill": {
|
||||
"stage_name": "prefill",
|
||||
"duration_ms": 475.62581300735474,
|
||||
"tokens_processed": 5120,
|
||||
"tokens_per_second": 10764.76477932628,
|
||||
"energy_joules": 21.409000039100647,
|
||||
"energy_per_token": 0.004181445320136845,
|
||||
"avg_power_watts": 68.91171083870925,
|
||||
"peak_memory_gb": 45.87115478515625,
|
||||
"avg_gpu_util_percent": 38.1
|
||||
},
|
||||
"decode": {
|
||||
"stage_name": "decode",
|
||||
"duration_ms": 41460.768724791706,
|
||||
"tokens_processed": 1000,
|
||||
"tokens_per_second": 24.119186179055195,
|
||||
"energy_joules": 4684.697999954224,
|
||||
"energy_per_token": 4.684697999954223,
|
||||
"avg_power_watts": 112.85507087682042,
|
||||
"peak_memory_gb": 45.87115478515625,
|
||||
"avg_gpu_util_percent": 38.1
|
||||
},
|
||||
"e2e_latency_ms": 4193.639453779906,
|
||||
"e2e_tokens_per_second": 145.93529242204605,
|
||||
"e2e_energy_joules": 4706.106999993324,
|
||||
"e2e_energy_per_token": 0.768971732025053,
|
||||
"ttft_ms": 47.562581300735474,
|
||||
"itl_ms": 41.460768724791706,
|
||||
"timestamp": 1768519487.5402663
|
||||
}
|
||||
@@ -0,0 +1,47 @@
|
||||
{
|
||||
"model_name": "Qwen/Qwen3-4B",
|
||||
"gpu_name": "NVIDIA A100-SXM4-80GB",
|
||||
"attention_implementation": "flash_attention_2",
|
||||
"batch_size": 3,
|
||||
"sequence_length": 2048,
|
||||
"num_steps": 10,
|
||||
"forward": {
|
||||
"stage_name": "forward",
|
||||
"duration_ms": 3359.0412912890315,
|
||||
"tokens_processed": 61440,
|
||||
"tokens_per_second": 18290.933237210196,
|
||||
"energy_joules": 1292.2280000448227,
|
||||
"energy_per_token": 0.021032356771562868,
|
||||
"avg_power_watts": 387.19580415542595,
|
||||
"peak_memory_gb": 79.66021728515625,
|
||||
"avg_gpu_util_percent": 97.8
|
||||
},
|
||||
"backward": {
|
||||
"stage_name": "backward",
|
||||
"duration_ms": 6954.944152384996,
|
||||
"tokens_processed": 61440,
|
||||
"tokens_per_second": 8834.003358449821,
|
||||
"energy_joules": 2729.588000059128,
|
||||
"energy_per_token": 0.0444268880217957,
|
||||
"avg_power_watts": 394.24766095856324,
|
||||
"peak_memory_gb": 79.66021728515625,
|
||||
"avg_gpu_util_percent": 97.8
|
||||
},
|
||||
"optimizer": {
|
||||
"stage_name": "optimizer",
|
||||
"duration_ms": 1153.845101594925,
|
||||
"tokens_processed": 61440,
|
||||
"tokens_per_second": 53248.048559614595,
|
||||
"energy_joules": 362.6529998779297,
|
||||
"energy_per_token": 0.005902555336554845,
|
||||
"avg_power_watts": 299.1223537953503,
|
||||
"peak_memory_gb": 79.66021728515625,
|
||||
"avg_gpu_util_percent": 97.8
|
||||
},
|
||||
"total_duration_ms": 11467.830545268953,
|
||||
"total_tokens": 61440,
|
||||
"total_tokens_per_second": 5357.595733340081,
|
||||
"total_energy_joules": 4384.46899998188,
|
||||
"total_energy_per_token": 0.07136180012991342,
|
||||
"timestamp": 1768519431.5985208
|
||||
}
|
||||
@@ -0,0 +1,37 @@
|
||||
{
|
||||
"model_name": "Qwen/Qwen3-4B",
|
||||
"gpu_name": "NVIDIA H100",
|
||||
"attention_implementation": "flash_attention_3_hopper",
|
||||
"num_requests": 10,
|
||||
"prompt_length": 512,
|
||||
"generation_length": 100,
|
||||
"prefill": {
|
||||
"stage_name": "prefill",
|
||||
"duration_ms": 323.99015384726226,
|
||||
"tokens_processed": 5120,
|
||||
"tokens_per_second": 15802.949377324925,
|
||||
"energy_joules": 17.092000007629395,
|
||||
"energy_per_token": 0.0033382812514901163,
|
||||
"avg_power_watts": 93.64442380045372,
|
||||
"peak_memory_gb": 46.02825927734375,
|
||||
"avg_gpu_util_percent": 40.0
|
||||
},
|
||||
"decode": {
|
||||
"stage_name": "decode",
|
||||
"duration_ms": 30513.75844143331,
|
||||
"tokens_processed": 1000,
|
||||
"tokens_per_second": 32.772101867403634,
|
||||
"energy_joules": 4915.5139999985695,
|
||||
"energy_per_token": 4.915513999998569,
|
||||
"avg_power_watts": 161.199160874206,
|
||||
"peak_memory_gb": 46.02825927734375,
|
||||
"avg_gpu_util_percent": 40.0
|
||||
},
|
||||
"e2e_latency_ms": 3083.7748595280573,
|
||||
"e2e_tokens_per_second": 198.4580677506596,
|
||||
"e2e_energy_joules": 4932.606000006199,
|
||||
"e2e_energy_per_token": 0.8059813725500325,
|
||||
"ttft_ms": 32.399015384726226,
|
||||
"itl_ms": 30.51375844143331,
|
||||
"timestamp": 1768541839.3186588
|
||||
}
|
||||
@@ -0,0 +1,47 @@
|
||||
{
|
||||
"model_name": "Qwen/Qwen3-4B",
|
||||
"gpu_name": "NVIDIA H100",
|
||||
"attention_implementation": "flash_attention_3_hopper",
|
||||
"batch_size": 3,
|
||||
"sequence_length": 2048,
|
||||
"num_steps": 10,
|
||||
"forward": {
|
||||
"stage_name": "forward",
|
||||
"duration_ms": 1748.5067250672728,
|
||||
"tokens_processed": 61440,
|
||||
"tokens_per_second": 35138.55515633555,
|
||||
"energy_joules": 946.9269999563694,
|
||||
"energy_per_token": 0.015412223306581534,
|
||||
"avg_power_watts": 501.76439870614394,
|
||||
"peak_memory_gb": 76.45208740234375,
|
||||
"avg_gpu_util_percent": 97.0
|
||||
},
|
||||
"backward": {
|
||||
"stage_name": "backward",
|
||||
"duration_ms": 3761.718863155693,
|
||||
"tokens_processed": 61440,
|
||||
"tokens_per_second": 16332.959010248362,
|
||||
"energy_joules": 1904.104000031948,
|
||||
"energy_per_token": 0.030991276042186655,
|
||||
"avg_power_watts": 491.250130606127,
|
||||
"peak_memory_gb": 76.45208740234375,
|
||||
"avg_gpu_util_percent": 97.0
|
||||
},
|
||||
"optimizer": {
|
||||
"stage_name": "optimizer",
|
||||
"duration_ms": 896.0564862936735,
|
||||
"tokens_processed": 61440,
|
||||
"tokens_per_second": 68567.1059133025,
|
||||
"energy_joules": 349.722000002861,
|
||||
"energy_per_token": 0.0056920898437965665,
|
||||
"avg_power_watts": 356.92130879075387,
|
||||
"peak_memory_gb": 76.45208740234375,
|
||||
"avg_gpu_util_percent": 97.0
|
||||
},
|
||||
"total_duration_ms": 6406.282074516639,
|
||||
"total_tokens": 61440,
|
||||
"total_tokens_per_second": 9590.586128637759,
|
||||
"total_energy_joules": 3200.7529999911785,
|
||||
"total_energy_per_token": 0.052095589192564754,
|
||||
"timestamp": 1768541796.4011748
|
||||
}
|
||||
37
results/h100_sdpa/inference_NVIDIA_H100_sdpa.json
Normal file
@@ -0,0 +1,37 @@
|
||||
{
|
||||
"model_name": "Qwen/Qwen3-4B",
|
||||
"gpu_name": "NVIDIA H100",
|
||||
"attention_implementation": "sdpa",
|
||||
"num_requests": 10,
|
||||
"prompt_length": 512,
|
||||
"generation_length": 100,
|
||||
"prefill": {
|
||||
"stage_name": "prefill",
|
||||
"duration_ms": 253.97859653458,
|
||||
"tokens_processed": 5120,
|
||||
"tokens_per_second": 20159.179040517676,
|
||||
"energy_joules": 0.0,
|
||||
"energy_per_token": 0.0,
|
||||
"avg_power_watts": 0.0,
|
||||
"peak_memory_gb": 46.01458740234375,
|
||||
"avg_gpu_util_percent": 48.8
|
||||
},
|
||||
"decode": {
|
||||
"stage_name": "decode",
|
||||
"duration_ms": 23519.252635538578,
|
||||
"tokens_processed": 1000,
|
||||
"tokens_per_second": 42.51835785330007,
|
||||
"energy_joules": 4544.901999980211,
|
||||
"energy_per_token": 4.544901999980211,
|
||||
"avg_power_watts": 192.5432634001641,
|
||||
"peak_memory_gb": 46.01458740234375,
|
||||
"avg_gpu_util_percent": 48.8
|
||||
},
|
||||
"e2e_latency_ms": 2377.323123207316,
|
||||
"e2e_tokens_per_second": 257.43240118504923,
|
||||
"e2e_energy_joules": 4544.901999980211,
|
||||
"e2e_energy_per_token": 0.7426310457484006,
|
||||
"ttft_ms": 25.397859653458,
|
||||
"itl_ms": 23.519252635538578,
|
||||
"timestamp": 1769149269.5228984
|
||||
}
|
||||
47
results/h100_sdpa/pretrain_NVIDIA_H100_sdpa.json
Normal file
@@ -0,0 +1,47 @@
|
||||
{
|
||||
"model_name": "Qwen/Qwen3-4B",
|
||||
"gpu_name": "NVIDIA H100",
|
||||
"attention_implementation": "sdpa",
|
||||
"batch_size": 3,
|
||||
"sequence_length": 2048,
|
||||
"num_steps": 10,
|
||||
"forward": {
|
||||
"stage_name": "forward",
|
||||
"duration_ms": 1790.2467511594296,
|
||||
"tokens_processed": 61440,
|
||||
"tokens_per_second": 34319.29143857359,
|
||||
"energy_joules": 981.029000043869,
|
||||
"energy_per_token": 0.01596726888092235,
|
||||
"avg_power_watts": 520.9058508009567,
|
||||
"peak_memory_gb": 76.45208740234375,
|
||||
"avg_gpu_util_percent": 100.0
|
||||
},
|
||||
"backward": {
|
||||
"stage_name": "backward",
|
||||
"duration_ms": 3854.5540031045675,
|
||||
"tokens_processed": 61440,
|
||||
"tokens_per_second": 15939.587290906931,
|
||||
"energy_joules": 1953.71099999547,
|
||||
"energy_per_token": 0.03179868164055127,
|
||||
"avg_power_watts": 491.5443624439596,
|
||||
"peak_memory_gb": 76.45208740234375,
|
||||
"avg_gpu_util_percent": 100.0
|
||||
},
|
||||
"optimizer": {
|
||||
"stage_name": "optimizer",
|
||||
"duration_ms": 899.9840868636966,
|
||||
"tokens_processed": 61440,
|
||||
"tokens_per_second": 68267.87372886644,
|
||||
"energy_joules": 365.9209999740124,
|
||||
"energy_per_token": 0.005955745442285358,
|
||||
"avg_power_watts": 377.8756124501158,
|
||||
"peak_memory_gb": 76.45208740234375,
|
||||
"avg_gpu_util_percent": 100.0
|
||||
},
|
||||
"total_duration_ms": 6544.784841127694,
|
||||
"total_tokens": 61440,
|
||||
"total_tokens_per_second": 9387.627170553957,
|
||||
"total_energy_joules": 3300.6610000133514,
|
||||
"total_energy_per_token": 0.053721695963758975,
|
||||
"timestamp": 1769149234.99943
|
||||
}
|
||||
@@ -0,0 +1,37 @@
|
||||
{
|
||||
"model_name": "Qwen/Qwen3-4B",
|
||||
"gpu_name": "NVIDIA H200",
|
||||
"attention_implementation": "flash_attention_3_hopper",
|
||||
"num_requests": 10,
|
||||
"prompt_length": 512,
|
||||
"generation_length": 100,
|
||||
"prefill": {
|
||||
"stage_name": "prefill",
|
||||
"duration_ms": 323.8773119999223,
|
||||
"tokens_processed": 5120,
|
||||
"tokens_per_second": 15808.455270868828,
|
||||
"energy_joules": 98.1449999999968,
|
||||
"energy_per_token": 0.019168945312499373,
|
||||
"avg_power_watts": 250.96736239598317,
|
||||
"peak_memory_gb": 46.1302490234375,
|
||||
"avg_gpu_util_percent": 32.2
|
||||
},
|
||||
"decode": {
|
||||
"stage_name": "decode",
|
||||
"duration_ms": 30558.618001000013,
|
||||
"tokens_processed": 1000,
|
||||
"tokens_per_second": 32.72399294913388,
|
||||
"energy_joules": 4828.459999999999,
|
||||
"energy_per_token": 4.828459999999999,
|
||||
"avg_power_watts": 157.61927190444868,
|
||||
"peak_memory_gb": 46.1302490234375,
|
||||
"avg_gpu_util_percent": 32.2
|
||||
},
|
||||
"e2e_latency_ms": 3088.2495312999936,
|
||||
"e2e_tokens_per_second": 198.17051497855476,
|
||||
"e2e_energy_joules": 4926.604999999996,
|
||||
"e2e_energy_per_token": 0.8050008169934634,
|
||||
"ttft_ms": 32.38773119999223,
|
||||
"itl_ms": 30.558618001000013,
|
||||
"timestamp": 1768541964.4743361
|
||||
}
|
||||
@@ -0,0 +1,47 @@
|
||||
{
|
||||
"model_name": "Qwen/Qwen3-4B",
|
||||
"gpu_name": "NVIDIA H200",
|
||||
"attention_implementation": "flash_attention_3_hopper",
|
||||
"batch_size": 3,
|
||||
"sequence_length": 2048,
|
||||
"num_steps": 10,
|
||||
"forward": {
|
||||
"stage_name": "forward",
|
||||
"duration_ms": 1605.9521619997668,
|
||||
"tokens_processed": 61440,
|
||||
"tokens_per_second": 38257.67756587068,
|
||||
"energy_joules": 817.7539999999863,
|
||||
"energy_per_token": 0.01330979817708311,
|
||||
"avg_power_watts": 476.6091506406698,
|
||||
"peak_memory_gb": 76.5540771484375,
|
||||
"avg_gpu_util_percent": 95.1
|
||||
},
|
||||
"backward": {
|
||||
"stage_name": "backward",
|
||||
"duration_ms": 3448.8081949999696,
|
||||
"tokens_processed": 61440,
|
||||
"tokens_per_second": 17814.849804948502,
|
||||
"energy_joules": 1765.182000000008,
|
||||
"energy_per_token": 0.02873017578125013,
|
||||
"avg_power_watts": 498.84691252245983,
|
||||
"peak_memory_gb": 76.5540771484375,
|
||||
"avg_gpu_util_percent": 95.1
|
||||
},
|
||||
"optimizer": {
|
||||
"stage_name": "optimizer",
|
||||
"duration_ms": 545.701982000196,
|
||||
"tokens_processed": 61440,
|
||||
"tokens_per_second": 112588.92587268984,
|
||||
"energy_joules": 332.4770000000135,
|
||||
"energy_per_token": 0.005411409505208553,
|
||||
"avg_power_watts": 521.4900438388863,
|
||||
"peak_memory_gb": 76.5540771484375,
|
||||
"avg_gpu_util_percent": 95.1
|
||||
},
|
||||
"total_duration_ms": 5600.462338999932,
|
||||
"total_tokens": 61440,
|
||||
"total_tokens_per_second": 10970.522839186035,
|
||||
"total_energy_joules": 2915.4130000000077,
|
||||
"total_energy_per_token": 0.047451383463541795,
|
||||
"timestamp": 1768541921.6000674
|
||||
}
|
||||
37
results/h200_sdpa/inference_NVIDIA_H200_sdpa.json
Normal file
@@ -0,0 +1,37 @@
|
||||
{
|
||||
"model_name": "Qwen/Qwen3-4B",
|
||||
"gpu_name": "NVIDIA H200",
|
||||
"attention_implementation": "sdpa",
|
||||
"num_requests": 10,
|
||||
"prompt_length": 512,
|
||||
"generation_length": 100,
|
||||
"prefill": {
|
||||
"stage_name": "prefill",
|
||||
"duration_ms": 247.9969559935853,
|
||||
"tokens_processed": 5120,
|
||||
"tokens_per_second": 20645.414696672466,
|
||||
"energy_joules": 73.83399999141693,
|
||||
"energy_per_token": 0.014420703123323619,
|
||||
"avg_power_watts": 222.33737204549297,
|
||||
"peak_memory_gb": 46.1165771484375,
|
||||
"avg_gpu_util_percent": 40.0
|
||||
},
|
||||
"decode": {
|
||||
"stage_name": "decode",
|
||||
"duration_ms": 23003.622506046668,
|
||||
"tokens_processed": 1000,
|
||||
"tokens_per_second": 43.47141411041425,
|
||||
"energy_joules": 4033.3500000089407,
|
||||
"energy_per_token": 4.033350000008941,
|
||||
"avg_power_watts": 174.6335604209662,
|
||||
"peak_memory_gb": 46.1165771484375,
|
||||
"avg_gpu_util_percent": 40.0
|
||||
},
|
||||
"e2e_latency_ms": 2325.1619462040253,
|
||||
"e2e_tokens_per_second": 263.20747292425324,
|
||||
"e2e_energy_joules": 4107.184000000358,
|
||||
"e2e_energy_per_token": 0.6711084967320846,
|
||||
"ttft_ms": 24.79969559935853,
|
||||
"itl_ms": 23.003622506046668,
|
||||
"timestamp": 1769149520.7919798
|
||||
}
|
||||
47
results/h200_sdpa/pretrain_NVIDIA_H200_sdpa.json
Normal file
@@ -0,0 +1,47 @@
|
||||
{
|
||||
"model_name": "Qwen/Qwen3-4B",
|
||||
"gpu_name": "NVIDIA H200",
|
||||
"attention_implementation": "sdpa",
|
||||
"batch_size": 3,
|
||||
"sequence_length": 2048,
|
||||
"num_steps": 10,
|
||||
"forward": {
|
||||
"stage_name": "forward",
|
||||
"duration_ms": 1615.8598741167225,
|
||||
"tokens_processed": 61440,
|
||||
"tokens_per_second": 38023.09902248482,
|
||||
"energy_joules": 873.9250000119209,
|
||||
"energy_per_token": 0.014224039713735693,
|
||||
"avg_power_watts": 541.9081076256928,
|
||||
"peak_memory_gb": 76.5540771484375,
|
||||
"avg_gpu_util_percent": 100.0
|
||||
},
|
||||
"backward": {
|
||||
"stage_name": "backward",
|
||||
"duration_ms": 3462.180594098754,
|
||||
"tokens_processed": 61440,
|
||||
"tokens_per_second": 17746.04135460864,
|
||||
"energy_joules": 1696.024000003934,
|
||||
"energy_per_token": 0.027604557291730693,
|
||||
"avg_power_watts": 472.8399628680292,
|
||||
"peak_memory_gb": 76.5540771484375,
|
||||
"avg_gpu_util_percent": 100.0
|
||||
},
|
||||
"optimizer": {
|
||||
"stage_name": "optimizer",
|
||||
"duration_ms": 551.849422918167,
|
||||
"tokens_processed": 61440,
|
||||
"tokens_per_second": 111334.71821915968,
|
||||
"energy_joules": 316.88299998641014,
|
||||
"energy_per_token": 0.005157600911237144,
|
||||
"avg_power_watts": 499.2301039455484,
|
||||
"peak_memory_gb": 76.5540771484375,
|
||||
"avg_gpu_util_percent": 100.0
|
||||
},
|
||||
"total_duration_ms": 5629.889891133644,
|
||||
"total_tokens": 61440,
|
||||
"total_tokens_per_second": 10913.179687005982,
|
||||
"total_energy_joules": 2886.832000002265,
|
||||
"total_energy_per_token": 0.04698619791670353,
|
||||
"timestamp": 1769149487.0005488
|
||||
}
|
||||
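All of the result files above share the same field names, so they can be aggregated directly. A hedged sketch that walks the results/ tree (matching the --output-dir values used by the SLURM scripts) and prints throughput and energy per token for each run; the summary script itself is not part of this commit:

# Hedged sketch: summarize saved benchmark results.
import json
from pathlib import Path

for path in sorted(Path("results").rglob("*.json")):
    data = json.loads(path.read_text())
    if "e2e_tokens_per_second" in data:  # inference result
        tps = data["e2e_tokens_per_second"]
        mj_per_token = data["e2e_energy_per_token"] * 1000
    else:  # pretraining result
        tps = data["total_tokens_per_second"]
        mj_per_token = data["total_energy_per_token"] * 1000
    print(f"{path.name:50s} {data['gpu_name']:25s} "
          f"{tps:10.1f} tok/s {mj_per_token:9.2f} mJ/token")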
248
run_benchmark.py
Executable file
@@ -0,0 +1,248 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Main LLM Benchmark Runner
|
||||
|
||||
Orchestrates pretraining and inference benchmarks with auto-detection
|
||||
of GPU type and configuration.
|
||||
"""
|
||||
|
||||
import argparse
|
||||
import sys
|
||||
from pathlib import Path
|
||||
|
||||
# Import benchmark functions
|
||||
import benchmark_pretrain
|
||||
import benchmark_inference
|
||||
|
||||
from utils.gpu_monitor import get_gpu_monitor, list_available_gpus
|
||||
from utils.metrics import MetricsReporter
|
||||
|
||||
|
||||
def main():
|
||||
parser = argparse.ArgumentParser(
|
||||
description="LLM Benchmark Suite - Compare GPU performance for pretraining and inference",
|
||||
formatter_class=argparse.RawDescriptionHelpFormatter,
|
||||
epilog="""
|
||||
Examples:
|
||||
# Run both pretrain and inference benchmarks
|
||||
python run_benchmark.py --mode both
|
||||
|
||||
# Run only pretraining benchmark
|
||||
python run_benchmark.py --mode pretrain --num-steps 20
|
||||
|
||||
# Run inference with custom settings
|
||||
python run_benchmark.py --mode inference --num-requests 20 --generation-length 200
|
||||
|
||||
# Use specific attention implementation
|
||||
python run_benchmark.py --attn-implementation flash_attention_3_hopper
|
||||
"""
|
||||
)
|
||||
|
||||
# Model configuration
|
||||
parser.add_argument(
|
||||
"--model-path",
|
||||
type=str,
|
||||
default="./model_cache",
|
||||
help="Path to cached model directory"
|
||||
)
|
||||
|
||||
parser.add_argument(
|
||||
"--model-name",
|
||||
type=str,
|
||||
default="Qwen/Qwen3-4B",
|
||||
help="Model name for reporting"
|
||||
)
|
||||
|
||||
# Benchmark mode
|
||||
parser.add_argument(
|
||||
"--mode",
|
||||
type=str,
|
||||
default="both",
|
||||
choices=["pretrain", "inference", "both"],
|
||||
help="Benchmark mode to run"
|
||||
)
|
||||
|
||||
# Attention configuration
|
||||
parser.add_argument(
|
||||
"--attn-implementation",
|
||||
type=str,
|
||||
default="auto",
|
||||
choices=["auto", "flash_attention_2", "flash_attention_3_hopper", "sdpa", "eager"],
|
||||
help="Attention implementation (auto selects based on GPU)"
|
||||
)
|
||||
|
||||
# Pretraining parameters
|
||||
pretrain_group = parser.add_argument_group("pretraining parameters")
|
||||
pretrain_group.add_argument(
|
||||
"--batch-size",
|
||||
type=int,
|
||||
default=3,
|
||||
help="Batch size for pretraining"
|
||||
)
|
||||
pretrain_group.add_argument(
|
||||
"--sequence-length",
|
||||
type=int,
|
||||
default=2048,
|
||||
help="Sequence length for pretraining"
|
||||
)
|
||||
pretrain_group.add_argument(
|
||||
"--num-steps",
|
||||
type=int,
|
||||
default=10,
|
||||
help="Number of training steps"
|
||||
)
|
||||
pretrain_group.add_argument(
|
||||
"--warmup-steps",
|
||||
type=int,
|
||||
default=3,
|
||||
help="Number of warmup steps"
|
||||
)
|
||||
|
||||
# Inference parameters
|
||||
inference_group = parser.add_argument_group("inference parameters")
|
||||
inference_group.add_argument(
|
||||
"--num-requests",
|
||||
type=int,
|
||||
default=10,
|
||||
help="Number of inference requests"
|
||||
)
|
||||
inference_group.add_argument(
|
||||
"--prompt-length",
|
||||
type=int,
|
||||
default=512,
|
||||
help="Prompt length in tokens"
|
||||
)
|
||||
inference_group.add_argument(
|
||||
"--generation-length",
|
||||
type=int,
|
||||
default=100,
|
||||
help="Number of tokens to generate"
|
||||
)
|
||||
inference_group.add_argument(
|
||||
"--warmup-requests",
|
||||
type=int,
|
||||
default=2,
|
||||
help="Number of warmup requests"
|
||||
)
|
||||
|
||||
# General parameters
|
||||
parser.add_argument(
|
||||
"--device-id",
|
||||
type=int,
|
||||
default=0,
|
||||
help="GPU device ID"
|
||||
)
|
||||
parser.add_argument(
|
||||
"--output-dir",
|
||||
type=str,
|
||||
default="./results",
|
||||
help="Output directory for results"
|
||||
)
|
||||
parser.add_argument(
|
||||
"--list-gpus",
|
||||
action="store_true",
|
||||
help="List available GPUs and exit"
|
||||
)
|
||||
|
||||
args = parser.parse_args()
|
||||
|
||||
# List GPUs if requested
|
||||
if args.list_gpus:
|
||||
print("Available GPUs:")
|
||||
gpus = list_available_gpus()
|
||||
if not gpus:
|
||||
print(" No GPUs found!")
|
||||
else:
|
||||
for gpu in gpus:
|
||||
print(f" {gpu}")
|
||||
return
|
||||
|
||||
# Print header
|
||||
print("=" * 80)
|
||||
print("LLM BENCHMARK SUITE")
|
||||
print("=" * 80)
|
||||
print(f"\nModel: {args.model_name}")
|
||||
print(f"Model Path: {args.model_path}")
|
||||
print(f"Mode: {args.mode}")
|
||||
print(f"Attention: {args.attn_implementation}")
|
||||
print(f"Output Directory: {args.output_dir}")
|
||||
|
||||
# Detect GPU
|
||||
print("\nDetecting GPU...")
|
||||
try:
|
||||
monitor = get_gpu_monitor(args.device_id)
|
||||
gpu_name = monitor.get_device_name()
|
||||
print(f" GPU {args.device_id}: {gpu_name}")
|
||||
monitor.cleanup()
|
||||
except Exception as e:
|
||||
print(f"✗ Error detecting GPU: {e}")
|
||||
sys.exit(1)
|
||||
|
||||
# Create output directory
|
||||
output_path = Path(args.output_dir)
|
||||
output_path.mkdir(parents=True, exist_ok=True)
|
||||
|
||||
# Run benchmarks
|
||||
pretrain_metrics = None
|
||||
inference_metrics = None
|
||||
|
||||
if args.mode in ["pretrain", "both"]:
|
||||
print("\n" + "=" * 80)
|
||||
print("Running Pretraining Benchmark...")
|
||||
print("=" * 80)
|
||||
|
||||
pretrain_metrics = benchmark_pretrain.benchmark_pretrain(
|
||||
model_name_or_path=args.model_name,
|
||||
attn_implementation=args.attn_implementation,
|
||||
batch_size=args.batch_size,
|
||||
sequence_length=args.sequence_length,
|
||||
num_steps=args.num_steps,
|
||||
warmup_steps=args.warmup_steps,
|
||||
device="cuda",
|
||||
device_id=args.device_id,
|
||||
output_dir=args.output_dir,
|
||||
verbose=True
|
||||
)
|
||||
|
||||
if args.mode in ["inference", "both"]:
|
||||
print("\n" + "=" * 80)
|
||||
print("Running Inference Benchmark...")
|
||||
print("=" * 80)
|
||||
|
||||
inference_metrics = benchmark_inference.benchmark_inference(
|
||||
model_name_or_path=args.model_name,
|
||||
attn_implementation=args.attn_implementation,
|
||||
num_requests=args.num_requests,
|
||||
prompt_length=args.prompt_length,
|
||||
generation_length=args.generation_length,
|
||||
warmup_requests=args.warmup_requests,
|
||||
device="cuda",
|
||||
device_id=args.device_id,
|
||||
output_dir=args.output_dir,
|
||||
verbose=True
|
||||
)
|
||||
|
||||
# Summary
|
||||
print("\n" + "=" * 80)
|
||||
print("BENCHMARK COMPLETE")
|
||||
print("=" * 80)
|
||||
print(f"\nResults saved to: {output_path}")
|
||||
|
||||
if pretrain_metrics:
|
||||
print(f"\nPretraining:")
|
||||
print(f" Duration: {pretrain_metrics.total_duration_ms:.2f} ms")
|
||||
print(f" Throughput: {pretrain_metrics.total_tokens_per_second:.2f} tokens/s")
|
||||
print(f" Energy: {pretrain_metrics.total_energy_joules:.2f} J")
|
||||
print(f" Energy/token: {pretrain_metrics.total_energy_per_token*1000:.4f} mJ/token")
|
||||
|
||||
if inference_metrics:
|
||||
print(f"\nInference:")
|
||||
print(f" TTFT: {inference_metrics.ttft_ms:.2f} ms")
|
||||
print(f" ITL: {inference_metrics.itl_ms:.2f} ms/token")
|
||||
print(f" Throughput: {inference_metrics.e2e_tokens_per_second:.2f} tokens/s")
|
||||
print(f" Energy: {inference_metrics.e2e_energy_joules:.2f} J")
|
||||
print(f" Energy/token: {inference_metrics.e2e_energy_per_token*1000:.4f} mJ/token")
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
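Because run_benchmark.py only forwards its arguments, the benchmark functions can also be driven directly from Python, for example in a notebook. A hedged sketch based on the call signature shown above (the benchmark_pretrain module itself is not part of this section):

# Hedged sketch: call the pretraining benchmark directly.
import benchmark_pretrain

metrics = benchmark_pretrain.benchmark_pretrain(
    model_name_or_path="Qwen/Qwen3-4B",
    attn_implementation="sdpa",
    batch_size=3,
    sequence_length=2048,
    num_steps=10,
    warmup_steps=3,
    device="cuda",
    device_id=0,
    output_dir="./results/manual",
    verbose=True,
)
print(metrics.total_tokens_per_second, metrics.total_energy_per_token)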
45
slurm_a100.sh
Executable file
@@ -0,0 +1,45 @@
|
||||
#!/bin/bash
|
||||
#SBATCH --job-name=llm_bench_a100
|
||||
#SBATCH --partition=a100 # Adjust to your A100 partition name
|
||||
#SBATCH --nodes=1
|
||||
#SBATCH --gres=gpu:a100:1 # Request 1 A100 GPU
|
||||
#SBATCH -C a100_80
|
||||
#SBATCH --time=02:00:00
|
||||
#SBATCH --output=logs/benchmark_a100_sdpa_%j.out
|
||||
#SBATCH --error=logs/benchmark_a100_sdpa_%j.err
|
||||
|
||||
# Create logs directory
|
||||
mkdir -p logs
|
||||
|
||||
# Print job info
|
||||
echo "========================================="
|
||||
echo "Job ID: $SLURM_JOB_ID"
|
||||
echo "Job Name: $SLURM_JOB_NAME"
|
||||
echo "Node: $SLURM_NODELIST"
|
||||
echo "Date: $(date)"
|
||||
echo "========================================="
|
||||
|
||||
# Set cache paths
|
||||
export TRANSFORMERS_CACHE=$(pwd)/model_cache
|
||||
export HF_HOME=$(pwd)/model_cache
|
||||
|
||||
# Path to apptainer image
|
||||
APPTAINER_IMAGE="/anvme/workspace/ihpc125h-llm-profiles/pytorch_25.10_updated_ao.sif"
|
||||
|
||||
# Run benchmark inside apptainer
|
||||
apptainer exec --nv $APPTAINER_IMAGE python run_benchmark.py \
|
||||
--mode both \
|
||||
--model-path ./model_cache \
|
||||
--model-name Qwen/Qwen3-4B \
|
||||
--attn-implementation sdpa \
|
||||
--batch-size 3 \
|
||||
--sequence-length 2048 \
|
||||
--num-steps 10 \
|
||||
--num-requests 10 \
|
||||
--prompt-length 512 \
|
||||
--generation-length 100 \
|
||||
--output-dir ./results/a100
|
||||
|
||||
echo "========================================="
|
||||
echo "Benchmark Complete!"
|
||||
echo "========================================="
|
||||
46
slurm_h100.sh
Executable file
@@ -0,0 +1,46 @@
|
||||
#!/bin/bash
|
||||
#SBATCH --job-name=llm_bench_h100
|
||||
#SBATCH --partition=h100 # Adjust to your H100 partition name
|
||||
#SBATCH --nodes=1
|
||||
#SBATCH --gres=gpu:h100:1 # Request 1 H100 GPU
|
||||
#SBATCH --time=02:00:00
|
||||
#SBATCH --output=logs/benchmark_h100_%j.out
|
||||
#SBATCH --error=logs/benchmark_h100_%j.err
|
||||
|
||||
# Create logs directory
|
||||
mkdir -p logs
|
||||
|
||||
# Print job info
|
||||
echo "========================================="
|
||||
echo "Job ID: $SLURM_JOB_ID"
|
||||
echo "Job Name: $SLURM_JOB_NAME"
|
||||
echo "Node: $SLURM_NODELIST"
|
||||
echo "Date: $(date)"
|
||||
echo "========================================="
|
||||
|
||||
# Set cache paths
|
||||
export TRANSFORMERS_CACHE=$(pwd)/model_cache
|
||||
export HF_HOME=$(pwd)/model_cache
|
||||
|
||||
# Path to apptainer image
|
||||
APPTAINER_IMAGE="/hnvme/workspace/ihpc125h-llm-profiler/pytorch_25.10_updated_ao.sif"
|
||||
|
||||
# Run benchmark inside apptainer (SDPA attention; the FlashAttention-3 Hopper flag is kept commented out below)
|
||||
apptainer exec --nv $APPTAINER_IMAGE python run_benchmark.py \
|
||||
--mode both \
|
||||
--model-path ./model_cache \
|
||||
--model-name Qwen/Qwen3-4B \
|
||||
--attn-implementation sdpa \
|
||||
--batch-size 3 \
|
||||
--sequence-length 2048 \
|
||||
--num-steps 10 \
|
||||
--num-requests 10 \
|
||||
--prompt-length 512 \
|
||||
--generation-length 100 \
|
||||
--output-dir ./results/h100_sdpa
|
||||
|
||||
# --attn-implementation flash_attention_3_hopper \
|
||||
|
||||
echo "========================================="
|
||||
echo "Benchmark Complete!"
|
||||
echo "========================================="
|
||||
45
slurm_h200.sh
Executable file
@@ -0,0 +1,45 @@
|
||||
#!/bin/bash
|
||||
#SBATCH --job-name=llm_bench_h200
|
||||
#SBATCH --partition=h200 # Adjust to your H200 partition name
|
||||
#SBATCH --nodes=1
|
||||
#SBATCH --gres=gpu:h200:1 # Request 1 H200 GPU
|
||||
#SBATCH --time=02:00:00
|
||||
#SBATCH --output=logs/benchmark_h200_%j.out
|
||||
#SBATCH --error=logs/benchmark_h200_%j.err
|
||||
|
||||
# Create logs directory
|
||||
mkdir -p logs
|
||||
|
||||
# Print job info
|
||||
echo "========================================="
|
||||
echo "Job ID: $SLURM_JOB_ID"
|
||||
echo "Job Name: $SLURM_JOB_NAME"
|
||||
echo "Node: $SLURM_NODELIST"
|
||||
echo "Date: $(date)"
|
||||
echo "========================================="
|
||||
|
||||
# Set cache paths
|
||||
export TRANSFORMERS_CACHE=$(pwd)/model_cache
|
||||
export HF_HOME=$(pwd)/model_cache
|
||||
|
||||
# Path to apptainer image
|
||||
APPTAINER_IMAGE="/hnvme/workspace/ihpc125h-llm-profiler/pytorch_25.10_updated_ao.sif"
|
||||
|
||||
# Run benchmark inside apptainer (SDPA attention; the FlashAttention-3 Hopper flag is kept commented out below)
|
||||
apptainer exec --nv $APPTAINER_IMAGE python run_benchmark.py \
|
||||
--mode both \
|
||||
--model-path ./model_cache \
|
||||
--model-name Qwen/Qwen3-4B \
|
||||
--attn-implementation sdpa \
|
||||
--batch-size 3 \
|
||||
--sequence-length 2048 \
|
||||
--num-steps 10 \
|
||||
--num-requests 10 \
|
||||
--prompt-length 512 \
|
||||
--generation-length 100 \
|
||||
--output-dir ./results/h200_sdpa
|
||||
# --attn-implementation flash_attention_3_hopper \
|
||||
|
||||
echo "========================================="
|
||||
echo "Benchmark Complete!"
|
||||
echo "========================================="
|
||||
42
slurm_mi300x.sh
Executable file
@@ -0,0 +1,42 @@
|
||||
#!/bin/bash
|
||||
#SBATCH --job-name=llm_bench_mi300x
|
||||
#SBATCH --nodes=1
|
||||
#SBATCH --nodelist=aquavan1          # Request the MI300X node
|
||||
#SBATCH --time=02:00:00
|
||||
#SBATCH --output=logs/benchmark_mi300x_%j.out
|
||||
#SBATCH --error=logs/benchmark_mi300x_%j.err
|
||||
|
||||
# Create logs directory
|
||||
mkdir -p logs
|
||||
|
||||
# Print job info
|
||||
echo "========================================="
|
||||
echo "Job ID: $SLURM_JOB_ID"
|
||||
echo "Job Name: $SLURM_JOB_NAME"
|
||||
echo "Node: $SLURM_NODELIST"
|
||||
echo "Date: $(date)"
|
||||
echo "========================================="
|
||||
|
||||
# Set cache paths
|
||||
export TRANSFORMERS_CACHE=$(pwd)/models
|
||||
export HF_HOME=$(pwd)/models
|
||||
|
||||
# Path to apptainer image
|
||||
#APPTAINER_IMAGE="/home/woody/ihpc/ihpc125h/pytorch_25.10_updated_ao.sif"
|
||||
|
||||
apptainer exec --writable ../rocm_sandbox/ python run_benchmark.py \
|
||||
--mode both \
|
||||
--model-path ./model_cache \
|
||||
--model-name Qwen/Qwen3-4B \
|
||||
--attn-implementation sdpa \
|
||||
--batch-size 3 \
|
||||
--sequence-length 2048 \
|
||||
--num-steps 10 \
|
||||
--num-requests 10 \
|
||||
--prompt-length 512 \
|
||||
--generation-length 100 \
|
||||
--output-dir ./results/mi300x_sdpa
|
||||
|
||||
echo "========================================="
|
||||
echo "Benchmark Complete!"
|
||||
echo "========================================="
|
||||
3
utils/__init__.py
Normal file
@@ -0,0 +1,3 @@
|
||||
"""Utility package for LLM benchmarking."""
|
||||
|
||||
__version__ = "1.0.0"
|
||||
295
utils/attention.py
Normal file
@@ -0,0 +1,295 @@
|
||||
"""
|
||||
Attention Implementation Helpers for LLM Benchmarking
|
||||
|
||||
Provides functions for configuring different attention implementations
|
||||
based on GPU type.
|
||||
"""
|
||||
|
||||
from typing import Optional, Tuple
|
||||
import warnings
|
||||
|
||||
|
||||
def get_default_attention(gpu_name: str) -> str:
|
||||
"""
|
||||
Get default attention implementation for GPU type.
|
||||
|
||||
Args:
|
||||
gpu_name: GPU device name (from monitoring)
|
||||
|
||||
Returns:
|
||||
Attention implementation string
|
||||
"""
|
||||
gpu_lower = gpu_name.lower()
|
||||
|
||||
# H100/H200: FlashAttention-3 Hopper
|
||||
if 'h100' in gpu_lower or 'h200' in gpu_lower:
|
||||
return "flash_attention_3_hopper"
|
||||
|
||||
# A100, MI300X, other: FlashAttention-2
|
||||
return "flash_attention_2"
|
||||
|
||||
|
||||
def configure_model_attention(model, attn_implementation: str, verbose: bool = True):
|
||||
"""
|
||||
Configure model to use specified attention implementation.
|
||||
|
||||
This function patches the model if needed to use the specified attention.
|
||||
For standard implementations like flash_attention_2, the model should already
|
||||
be loaded with the correct implementation via AutoModelForCausalLM.from_pretrained().
|
||||
|
||||
For FlashAttention-3 Hopper, this patches the model's attention modules.
|
||||
|
||||
Args:
|
||||
model: The loaded model
|
||||
attn_implementation: Attention implementation to use
|
||||
verbose: Print configuration messages
|
||||
|
||||
Returns:
|
||||
Configured model
|
||||
"""
|
||||
if verbose:
|
||||
print(f"Configuring attention: {attn_implementation}")
|
||||
|
||||
if attn_implementation == "flash_attention_3_hopper":
|
||||
# Patch model to use FlashAttention-3 Hopper
|
||||
try:
|
||||
import flash_attn_interface
|
||||
except ImportError:
|
||||
raise ImportError(
|
||||
"flash_attn_interface not found. This is required for FlashAttention-3.\n"
|
||||
"Install with appropriate method for your system."
|
||||
)
|
||||
|
||||
# Patch the model's attention function
|
||||
_patch_fa3_hopper(model, verbose=verbose)
|
||||
|
||||
elif attn_implementation == "flash_attention_2":
|
||||
# Model should already be loaded with FA2
|
||||
if verbose:
|
||||
print(" Using FlashAttention-2 (configured during model loading)")
|
||||
|
||||
elif attn_implementation == "sdpa":
|
||||
# PyTorch Scaled Dot Product Attention
|
||||
if verbose:
|
||||
print(" Using PyTorch SDPA")
|
||||
|
||||
elif attn_implementation == "eager":
|
||||
# Standard PyTorch attention
|
||||
if verbose:
|
||||
print(" Using eager attention")
|
||||
|
||||
else:
|
||||
warnings.warn(f"Unknown attention implementation: {attn_implementation}")
|
||||
|
||||
return model
|
||||
|
||||
|
||||
def _patch_fa3_hopper(model, verbose: bool = True):
|
||||
"""
|
||||
Patch model to use FlashAttention-3 Hopper.
|
||||
|
||||
This replaces the attention computation in the model's attention layers
|
||||
with calls to flash_attn_interface.flash_attn_func().
|
||||
|
||||
Args:
|
||||
model: The model to patch
|
||||
verbose: Print patching messages
|
||||
"""
|
||||
import flash_attn_interface
|
||||
import torch
|
||||
|
||||
# Counter for patched modules
|
||||
num_patched = 0
|
||||
|
||||
# Iterate through all modules in the model
|
||||
for name, module in model.named_modules():
|
||||
# Look for attention modules (this will vary by model architecture)
|
||||
# Common names: "self_attn", "attn", "attention"
|
||||
if any(attn_name in name.lower() for attn_name in ['self_attn', 'attention']):
|
||||
# Check if module has a forward method we can patch
|
||||
if hasattr(module, 'forward'):
|
||||
# Save original forward
|
||||
original_forward = module.forward
|
||||
|
||||
                # Create patched forward function.
                # The module is passed in explicitly so each closure binds the module it
                # patches instead of the loop variable (avoids Python's late binding).
                def create_patched_forward(orig_forward, attn_module):
                    def patched_forward(hidden_states, *args, **kwargs):
                        # Check if this is an attention computation.
                        # For Qwen models, attention modules typically have q, k, v projections.
                        # Assumes q, k and v all project to hidden_dim (plain multi-head
                        # attention; grouped-query attention is not handled here).
                        if hasattr(attn_module, 'q_proj') and hasattr(attn_module, 'k_proj') and hasattr(attn_module, 'v_proj'):
                            # Extract batch, seq_len, hidden_dim
                            batch_size, seq_len, hidden_dim = hidden_states.shape

                            # Compute Q, K, V
                            q = attn_module.q_proj(hidden_states)
                            k = attn_module.k_proj(hidden_states)
                            v = attn_module.v_proj(hidden_states)

                            # Reshape for multi-head attention
                            num_heads = attn_module.num_heads
                            head_dim = hidden_dim // num_heads

                            q = q.view(batch_size, seq_len, num_heads, head_dim)
                            k = k.view(batch_size, seq_len, num_heads, head_dim)
                            v = v.view(batch_size, seq_len, num_heads, head_dim)

                            # Call FlashAttention-3
                            # Note: flash_attn_func expects (batch, seqlen, nheads, headdim)
                            attn_output = flash_attn_interface.flash_attn_func(
                                q, k, v,
                                dropout_p=0.0,
                                softmax_scale=None,  # Will use default 1/sqrt(head_dim)
                                causal=True,  # For causal LM
                            )

                            # Reshape back
                            attn_output = attn_output.view(batch_size, seq_len, hidden_dim)

                            # Apply output projection if it exists
                            if hasattr(attn_module, 'o_proj'):
                                attn_output = attn_module.o_proj(attn_output)

                            # Pad to the arity of the original forward's return value.
                            # Note: this also calls the original forward, only to match its length.
                            return (attn_output,) + (None,) * (len(orig_forward(hidden_states, *args, **kwargs)) - 1)

                        else:
                            # Not an attention module we can patch, use original
                            return orig_forward(hidden_states, *args, **kwargs)

                    return patched_forward

                # Apply patch
                module.forward = create_patched_forward(original_forward, module)
                num_patched += 1
|
||||
|
||||
if verbose:
|
||||
if num_patched > 0:
|
||||
print(f" ✓ Patched {num_patched} attention modules to use FlashAttention-3 Hopper")
|
||||
else:
|
||||
warnings.warn(" ⚠ No attention modules found to patch for FlashAttention-3")
|
||||
|
||||
|
||||
def get_attention_info(attn_implementation: str) -> dict:
|
||||
"""
|
||||
Get information about an attention implementation.
|
||||
|
||||
Args:
|
||||
attn_implementation: Attention implementation string
|
||||
|
||||
Returns:
|
||||
Dictionary with info about the implementation
|
||||
"""
|
||||
info = {
|
||||
"flash_attention_2": {
|
||||
"name": "FlashAttention-2",
|
||||
"description": "Optimized attention for A100 and other GPUs",
|
||||
"gpu_support": ["A100", "MI300X", "V100", "RTX"],
|
||||
"memory_efficient": True,
|
||||
"requires_cuda": True,
|
||||
},
|
||||
"flash_attention_3_hopper": {
|
||||
"name": "FlashAttention-3 Hopper",
|
||||
"description": "Optimized attention for H100/H200 Hopper architecture",
|
||||
"gpu_support": ["H100", "H200"],
|
||||
"memory_efficient": True,
|
||||
"requires_cuda": True,
|
||||
},
|
||||
"sdpa": {
|
||||
"name": "PyTorch SDPA",
|
||||
"description": "PyTorch Scaled Dot Product Attention",
|
||||
"gpu_support": ["All"],
|
||||
"memory_efficient": True,
|
||||
"requires_cuda": False,
|
||||
},
|
||||
"eager": {
|
||||
"name": "Eager Attention",
|
||||
"description": "Standard PyTorch attention implementation",
|
||||
"gpu_support": ["All"],
|
||||
"memory_efficient": False,
|
||||
"requires_cuda": False,
|
||||
},
|
||||
}
|
||||
|
||||
return info.get(attn_implementation, {
|
||||
"name": attn_implementation,
|
||||
"description": "Unknown attention implementation",
|
||||
"gpu_support": ["Unknown"],
|
||||
"memory_efficient": False,
|
||||
"requires_cuda": False,
|
||||
})
|
||||
|
||||
|
||||
def validate_attention_for_gpu(attn_implementation: str, gpu_name: str) -> Tuple[bool, Optional[str]]:
|
||||
"""
|
||||
Validate if attention implementation is suitable for GPU.
|
||||
|
||||
Args:
|
||||
attn_implementation: Attention implementation
|
||||
gpu_name: GPU device name
|
||||
|
||||
Returns:
|
||||
Tuple of (is_valid, warning_message)
|
||||
"""
|
||||
gpu_lower = gpu_name.lower()
|
||||
|
||||
# FlashAttention-3 Hopper validation
|
||||
if attn_implementation == "flash_attention_3_hopper":
|
||||
if 'h100' not in gpu_lower and 'h200' not in gpu_lower:
|
||||
return False, (
|
||||
f"FlashAttention-3 Hopper is optimized for H100/H200. "
|
||||
f"Current GPU: {gpu_name}. Consider using flash_attention_2 instead."
|
||||
)
|
||||
|
||||
# FlashAttention-2 on Hopper GPUs
|
||||
if attn_implementation == "flash_attention_2":
|
||||
if 'h100' in gpu_lower or 'h200' in gpu_lower:
|
||||
return True, (
|
||||
f"FlashAttention-2 will work on {gpu_name}, but FlashAttention-3 Hopper "
|
||||
f"may provide better performance."
|
||||
)
|
||||
|
||||
return True, None
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
"""Test attention configuration."""
|
||||
print("=" * 60)
|
||||
print("Attention Implementation Test")
|
||||
print("=" * 60)
|
||||
|
||||
# Test getting default attention for different GPUs
|
||||
test_gpus = [
|
||||
"NVIDIA A100 80GB",
|
||||
"NVIDIA H100 80GB",
|
||||
"NVIDIA H200 141GB",
|
||||
"AMD Instinct MI300X",
|
||||
]
|
||||
|
||||
print("\nDefault attention implementations:")
|
||||
for gpu in test_gpus:
|
||||
attn = get_default_attention(gpu)
|
||||
print(f" {gpu:30s} → {attn}")
|
||||
|
||||
# Test validation
|
||||
print("\nValidation tests:")
|
||||
test_cases = [
|
||||
("flash_attention_3_hopper", "NVIDIA H100 80GB"),
|
||||
("flash_attention_3_hopper", "NVIDIA A100 80GB"),
|
||||
("flash_attention_2", "NVIDIA H100 80GB"),
|
||||
("flash_attention_2", "NVIDIA A100 80GB"),
|
||||
]
|
||||
|
||||
for attn, gpu in test_cases:
|
||||
valid, warning = validate_attention_for_gpu(attn, gpu)
|
||||
status = "✓" if valid else "✗"
|
||||
print(f" {status} {attn:30s} on {gpu:25s}")
|
||||
if warning:
|
||||
print(f" ⚠ {warning}")
|
||||
|
||||
# Test getting info
|
||||
print("\nAttention implementation info:")
|
||||
for attn in ["flash_attention_2", "flash_attention_3_hopper", "sdpa"]:
|
||||
info = get_attention_info(attn)
|
||||
print(f"\n {info['name']}:")
|
||||
print(f" Description: {info['description']}")
|
||||
print(f" GPU Support: {', '.join(info['gpu_support'])}")
|
||||
print(f" Memory Efficient: {info['memory_efficient']}")
|
||||
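A hedged sketch of how these helpers would typically be wired together by a benchmark script. The benchmark_pretrain/benchmark_inference modules are not shown in this section, so the exact call order here is an assumption:

# Hedged sketch: resolve, validate, and apply an attention implementation.
import warnings
from transformers import AutoModelForCausalLM
from utils.attention import (
    get_default_attention, validate_attention_for_gpu, configure_model_attention,
)
from utils.gpu_monitor import get_gpu_monitor

monitor = get_gpu_monitor(0)
gpu_name = monitor.get_device_name()

attn = get_default_attention(gpu_name)  # what "--attn-implementation auto" would resolve to
ok, warning = validate_attention_for_gpu(attn, gpu_name)
if warning:
    warnings.warn(warning)

# flash_attention_3_hopper is patched in after loading; the others are passed to HF directly.
hf_attn = "flash_attention_2" if attn == "flash_attention_3_hopper" else attn
model = AutoModelForCausalLM.from_pretrained(
    "Qwen/Qwen3-4B", cache_dir="./model_cache",
    attn_implementation=hf_attn, torch_dtype="auto", trust_remote_code=True,
)
model = configure_model_attention(model, attn)
monitor.cleanup()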
562
utils/gpu_monitor.py
Normal file
@@ -0,0 +1,562 @@
|
||||
"""
|
||||
GPU Monitoring Infrastructure for LLM Benchmarking
|
||||
|
||||
Provides unified interface for monitoring both NVIDIA and AMD GPUs.
|
||||
"""
|
||||
|
||||
import time
|
||||
from abc import ABC, abstractmethod
|
||||
from dataclasses import dataclass
|
||||
from typing import Optional, List
|
||||
import warnings
|
||||
|
||||
|
||||
@dataclass
|
||||
class GPUMetrics:
|
||||
"""Container for GPU metrics."""
|
||||
timestamp: float
|
||||
power_watts: float
|
||||
gpu_utilization_percent: float
|
||||
memory_used_gb: float
|
||||
memory_total_gb: float
|
||||
temperature_celsius: Optional[float] = None
|
||||
energy_joules: Optional[float] = None # Cumulative energy
|
||||
|
||||
|
||||
class GPUMonitor(ABC):
|
||||
"""Abstract base class for GPU monitoring."""
|
||||
|
||||
def __init__(self, device_id: int = 0):
|
||||
"""
|
||||
Initialize GPU monitor.
|
||||
|
||||
Args:
|
||||
device_id: GPU device ID to monitor
|
||||
"""
|
||||
self.device_id = device_id
|
||||
self.start_time = None
|
||||
self.start_energy = None
|
||||
self.last_metrics = None
|
||||
|
||||
@abstractmethod
|
||||
def get_metrics(self) -> GPUMetrics:
|
||||
"""Get current GPU metrics."""
|
||||
pass
|
||||
|
||||
@abstractmethod
|
||||
def get_device_name(self) -> str:
|
||||
"""Get GPU device name."""
|
||||
pass
|
||||
|
||||
@abstractmethod
|
||||
def cleanup(self):
|
||||
"""Cleanup resources."""
|
||||
pass
|
||||
|
||||
def start_monitoring(self):
|
||||
"""Start energy monitoring session."""
|
||||
self.start_time = time.time()
|
||||
metrics = self.get_metrics()
|
||||
self.start_energy = metrics.energy_joules if metrics.energy_joules is not None else 0.0
|
||||
self.last_metrics = metrics
|
||||
|
||||
def get_energy_consumed(self) -> float:
|
||||
"""
|
||||
Get energy consumed since start_monitoring() was called.
|
||||
|
||||
Returns:
|
||||
Energy in Joules
|
||||
"""
|
||||
if self.start_time is None:
|
||||
raise RuntimeError("Must call start_monitoring() first")
|
||||
|
||||
current_metrics = self.get_metrics()
|
||||
|
||||
if current_metrics.energy_joules is not None:
|
||||
# If GPU provides cumulative energy, use it
|
||||
return current_metrics.energy_joules - self.start_energy
|
||||
else:
|
||||
# Otherwise, integrate power over time
|
||||
elapsed_time = time.time() - self.start_time
|
||||
# Use average of start and current power
|
||||
avg_power = (self.last_metrics.power_watts + current_metrics.power_watts) / 2.0
|
||||
return avg_power * elapsed_time
|
||||
|
||||
def get_average_power(self) -> float:
|
||||
"""
|
||||
Get average power consumption since start_monitoring().
|
||||
|
||||
Returns:
|
||||
Average power in Watts
|
||||
"""
|
||||
if self.start_time is None:
|
||||
raise RuntimeError("Must call start_monitoring() first")
|
||||
|
||||
elapsed_time = time.time() - self.start_time
|
||||
if elapsed_time == 0:
|
||||
return 0.0
|
||||
|
||||
energy = self.get_energy_consumed()
|
||||
return energy / elapsed_time
|
||||
|
||||
|
||||
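# Worked example (illustrative): when a GPU exposes no cumulative energy counter,
# get_energy_consumed() falls back to the mean of the start and current power draw
# multiplied by elapsed time. A stage averaging roughly 350 W over 12 s is therefore
# charged 350 * 12 = 4200 J; over 61,440 tokens that is about 4200 / 61440 ≈ 0.068
# J/token, i.e. 68 mJ/token.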
class NVIDIAMonitor(GPUMonitor):
|
||||
"""NVIDIA GPU monitor using pynvml."""
|
||||
|
||||
def __init__(self, device_id: int = 0):
|
||||
"""Initialize NVIDIA monitor."""
|
||||
try:
|
||||
import pynvml
|
||||
self.pynvml = pynvml
|
||||
except ImportError:
|
||||
raise ImportError(
|
||||
"pynvml not found. Install with: pip install pynvml"
|
||||
)
|
||||
|
||||
try:
|
||||
self.pynvml.nvmlInit()
|
||||
self.handle = self.pynvml.nvmlDeviceGetHandleByIndex(device_id)
|
||||
except Exception as e:
|
||||
raise RuntimeError(f"Failed to initialize NVIDIA GPU {device_id}: {e}")
|
||||
|
||||
super().__init__(device_id)
|
||||
|
||||
def get_metrics(self) -> GPUMetrics:
|
||||
"""Get current NVIDIA GPU metrics."""
|
||||
try:
|
||||
# Power (in milliwatts)
|
||||
power_mw = self.pynvml.nvmlDeviceGetPowerUsage(self.handle)
|
||||
power_watts = power_mw / 1000.0
|
||||
|
||||
# Utilization
|
||||
util = self.pynvml.nvmlDeviceGetUtilizationRates(self.handle)
|
||||
gpu_util = util.gpu
|
||||
|
||||
# Memory
|
||||
mem_info = self.pynvml.nvmlDeviceGetMemoryInfo(self.handle)
|
||||
memory_used_gb = mem_info.used / (1024**3)
|
||||
memory_total_gb = mem_info.total / (1024**3)
|
||||
|
||||
# Temperature
|
||||
try:
|
||||
temp = self.pynvml.nvmlDeviceGetTemperature(
|
||||
self.handle,
|
||||
self.pynvml.NVML_TEMPERATURE_GPU
|
||||
)
|
||||
            except Exception:
|
||||
temp = None
|
||||
|
||||
# Try to get cumulative energy (newer GPUs)
|
||||
energy_joules = None
|
||||
try:
|
||||
energy_mj = self.pynvml.nvmlDeviceGetTotalEnergyConsumption(self.handle)
|
||||
energy_joules = energy_mj / 1000.0
|
||||
            except Exception:
|
||||
# Not supported on this GPU, will use power integration
|
||||
pass
|
||||
|
||||
return GPUMetrics(
|
||||
timestamp=time.time(),
|
||||
power_watts=power_watts,
|
||||
gpu_utilization_percent=gpu_util,
|
||||
memory_used_gb=memory_used_gb,
|
||||
memory_total_gb=memory_total_gb,
|
||||
temperature_celsius=temp,
|
||||
energy_joules=energy_joules
|
||||
)
|
||||
except Exception as e:
|
||||
raise RuntimeError(f"Failed to get NVIDIA GPU metrics: {e}")
|
||||
|
||||
def get_device_name(self) -> str:
|
||||
"""Get NVIDIA GPU device name."""
|
||||
try:
|
||||
name = self.pynvml.nvmlDeviceGetName(self.handle)
|
||||
if isinstance(name, bytes):
|
||||
name = name.decode('utf-8')
|
||||
return name
|
||||
        except Exception:
|
||||
return f"NVIDIA GPU {self.device_id}"
|
||||
|
||||
def cleanup(self):
|
||||
"""Cleanup NVIDIA resources."""
|
||||
try:
|
||||
self.pynvml.nvmlShutdown()
|
||||
        except Exception:
|
||||
pass
|
||||
|
||||
|
||||
class AMDMonitor(GPUMonitor):
|
||||
"""AMD GPU monitor using rocm-smi command line tool."""
|
||||
|
||||
def __init__(self, device_id: int = 0):
|
||||
"""Initialize AMD monitor."""
|
||||
import subprocess
|
||||
import shutil
|
||||
|
||||
# Check if rocm-smi is available
|
||||
if shutil.which('rocm-smi') is None:
|
||||
raise RuntimeError("rocm-smi command not found. Make sure ROCm is installed and in PATH.")
|
||||
|
||||
self.device_id = device_id
|
||||
|
||||
# Verify device exists
|
||||
try:
|
||||
result = subprocess.run(
|
||||
['rocm-smi', '--showid'],
|
||||
capture_output=True,
|
||||
text=True,
|
||||
timeout=5
|
||||
)
|
||||
if result.returncode != 0:
|
||||
raise RuntimeError(f"rocm-smi failed: {result.stderr}")
|
||||
except subprocess.TimeoutExpired:
|
||||
raise RuntimeError("rocm-smi command timed out")
|
||||
except Exception as e:
|
||||
raise RuntimeError(f"Failed to initialize AMD GPU {device_id}: {e}")
|
||||
|
||||
super().__init__(device_id)
|
||||
|
||||
def _parse_detailed_output(self, output: str) -> dict:
|
||||
"""Parse rocm-smi detailed output format."""
|
||||
lines = output.strip().split('\n')
|
||||
|
||||
# Parse detailed format: GPU[X] : Metric : Value
|
||||
metrics = {
|
||||
'temperature': None,
|
||||
'power': None,
|
||||
'vram_percent': None,
|
||||
'gpu_percent': None,
|
||||
}
|
||||
|
||||
device_prefix = f"GPU[{self.device_id}]"
|
||||
|
||||
for line in lines:
|
||||
if not line.strip() or not line.startswith(device_prefix):
|
||||
continue
|
||||
|
||||
# Split by colon
|
||||
parts = line.split(':')
|
||||
if len(parts) < 3:
|
||||
continue
|
||||
|
||||
metric_name = parts[1].strip().lower()
|
||||
value_str = parts[2].strip()
|
||||
|
||||
try:
|
||||
# Temperature (Sensor junction)
|
||||
if 'temperature' in metric_name and 'junction' in metric_name:
|
||||
metrics['temperature'] = float(value_str)
|
||||
|
||||
# Power consumption
|
||||
elif 'power' in metric_name and 'package' in metric_name:
|
||||
metrics['power'] = float(value_str)
|
||||
|
||||
# GPU utilization
|
||||
elif 'gpu use' in metric_name:
|
||||
metrics['gpu_percent'] = float(value_str)
|
||||
|
||||
# VRAM usage percentage
|
||||
elif 'memory allocated' in metric_name and 'vram%' in metric_name:
|
||||
metrics['vram_percent'] = float(value_str)
|
||||
|
||||
except (ValueError, IndexError):
|
||||
continue
|
||||
|
||||
# Validate we got the required metrics
|
||||
if metrics['temperature'] is None:
|
||||
raise ValueError(f"Could not find temperature for GPU[{self.device_id}]")
|
||||
if metrics['power'] is None:
|
||||
raise ValueError(f"Could not find power for GPU[{self.device_id}]")
|
||||
if metrics['gpu_percent'] is None:
|
||||
metrics['gpu_percent'] = 0.0
|
||||
if metrics['vram_percent'] is None:
|
||||
metrics['vram_percent'] = 0.0
|
||||
|
||||
return metrics
|
||||
|
||||
    def _get_memory_info(self) -> tuple:
        """Get memory usage in GB using rocm-smi --showmeminfo."""
        import subprocess

        try:
            result = subprocess.run(
                ['rocm-smi', '--showmeminfo', 'vram', '-d', str(self.device_id)],
                capture_output=True,
                text=True,
                timeout=5
            )

            if result.returncode != 0:
                return 0.0, 0.0

            # Parse output for memory info
            # Looking for lines like "GPU memory used: X MiB" and "GPU memory total: Y MiB"
            used_gb = 0.0
            total_gb = 0.0

            for line in result.stdout.split('\n'):
                if 'Used' in line or 'used' in line:
                    # Extract the number; the value is in whatever unit rocm-smi reports
                    parts = line.split()
                    for i, part in enumerate(parts):
                        if part.replace('.', '').isdigit():
                            used_val = float(part)
                            # Check if the next part indicates the unit
                            if i + 1 < len(parts):
                                unit = parts[i + 1].lower()
                                if 'mb' in unit or 'mib' in unit:
                                    used_gb = used_val / 1024
                                elif 'gb' in unit or 'gib' in unit:
                                    used_gb = used_val
                                elif 'kb' in unit or 'kib' in unit:
                                    used_gb = used_val / (1024 * 1024)
                            break

                if 'Total' in line or 'total' in line:
                    parts = line.split()
                    for i, part in enumerate(parts):
                        if part.replace('.', '').isdigit():
                            total_val = float(part)
                            if i + 1 < len(parts):
                                unit = parts[i + 1].lower()
                                if 'mb' in unit or 'mib' in unit:
                                    total_gb = total_val / 1024
                                elif 'gb' in unit or 'gib' in unit:
                                    total_gb = total_val
                                elif 'kb' in unit or 'kib' in unit:
                                    total_gb = total_val / (1024 * 1024)
                            break

            return used_gb, total_gb

        except Exception:
            return 0.0, 0.0

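    # Worked example of the conversion above (values illustrative): a hypothetical line
    # reporting "196608.0 MiB" of total VRAM yields 196608.0 / 1024 = 192.0 GB, while a
    # value already reported in GiB/GB is passed through unchanged.
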
    def get_metrics(self) -> GPUMetrics:
        """Get current AMD GPU metrics."""
        import subprocess

        try:
            # Get main metrics from concise output
            result = subprocess.run(
                ['rocm-smi', '--showid', '--showtemp', '--showpower', '--showuse', '--showmemuse'],
                capture_output=True,
                text=True,
                timeout=5
            )

            if result.returncode != 0:
                raise RuntimeError(f"rocm-smi failed: {result.stderr}")

            metrics = self._parse_detailed_output(result.stdout)

            # Get detailed memory info
            memory_used_gb, memory_total_gb = self._get_memory_info()

            # If we couldn't get absolute memory, estimate from percentage
            if memory_total_gb == 0.0:
                # MI300X has ~192GB, MI250X has ~128GB - use a reasonable default
                memory_total_gb = 192.0  # Assume MI300X
                memory_used_gb = memory_total_gb * (metrics['vram_percent'] / 100.0)

            return GPUMetrics(
                timestamp=time.time(),
                power_watts=metrics['power'],
                gpu_utilization_percent=metrics['gpu_percent'],
                memory_used_gb=memory_used_gb,
                memory_total_gb=memory_total_gb,
                temperature_celsius=metrics['temperature'],
                energy_joules=None  # Will use power integration
            )

        except subprocess.TimeoutExpired:
            raise RuntimeError("rocm-smi command timed out")
        except Exception as e:
            raise RuntimeError(f"Failed to get AMD GPU metrics: {e}")

    def get_device_name(self) -> str:
        """Get AMD GPU device name."""
        import subprocess

        try:
            result = subprocess.run(
                ['rocm-smi', '--showproductname', '-d', str(self.device_id)],
                capture_output=True,
                text=True,
                timeout=5
            )

            if result.returncode == 0:
                # Parse output to find device name
                for line in result.stdout.split('\n'):
                    if 'Card series' in line or 'Card model' in line or 'name' in line.lower():
                        parts = line.split(':')
                        if len(parts) > 1:
                            return parts[1].strip()
        except Exception:
            pass

        return f"AMD GPU {self.device_id}"

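    # Assumption check (illustrative): the name lookup above expects a line shaped like
    #   Card series: AMD Instinct MI300X
    # i.e. a single colon, so parts[1] is the product name. If rocm-smi prefixes the line
    # with "GPU[<id>] :" there would be two colons and parts[-1] would be the safer pick.
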
    def cleanup(self):
        """Cleanup AMD resources."""
        # No cleanup needed for command-line tool
        pass


def get_gpu_monitor(device_id: int = 0) -> GPUMonitor:
    """
    Factory function to automatically detect and create the appropriate GPU monitor.

    Args:
        device_id: GPU device ID to monitor

    Returns:
        GPUMonitor instance (NVIDIAMonitor or AMDMonitor)

    Raises:
        RuntimeError: If no supported GPU is found
    """
    # Try AMD first (rocm-smi based), then fall back to NVIDIA
    try:
        return AMDMonitor(device_id)
    except Exception:
        pass

    # Try NVIDIA if AMD fails
    try:
        return NVIDIAMonitor(device_id)
    except Exception:
        pass

    # As a last resort, use torch to detect the GPU type
    try:
        import torch
        if torch.cuda.is_available():
            # Check if it's NVIDIA or AMD
            device_name = torch.cuda.get_device_name(device_id).lower()

            if 'nvidia' in device_name or 'tesla' in device_name or 'geforce' in device_name:
                return NVIDIAMonitor(device_id)
            elif 'amd' in device_name or 'radeon' in device_name or 'mi300' in device_name or 'mi200' in device_name:
                return AMDMonitor(device_id)
    except Exception:
        pass

    raise RuntimeError(
        "No supported GPU found. Make sure either ROCm (rocm-smi) or NVIDIA (pynvml) drivers are installed."
    )

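# Minimal usage sketch (assumes a supported GPU; mirrors the flow in __main__ below):
#
#     monitor = get_gpu_monitor(0)
#     monitor.start_monitoring()
#     ...  # run the workload to be measured
#     joules = monitor.get_energy_consumed()
#     monitor.cleanup()

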
def list_available_gpus() -> List[str]:
    """
    List all available GPUs.

    Returns:
        List of GPU names
    """
    gpus = []

    # Try NVIDIA
    try:
        import pynvml
        pynvml.nvmlInit()
        device_count = pynvml.nvmlDeviceGetCount()
        for i in range(device_count):
            handle = pynvml.nvmlDeviceGetHandleByIndex(i)
            name = pynvml.nvmlDeviceGetName(handle)
            if isinstance(name, bytes):
                name = name.decode('utf-8')
            gpus.append(f"GPU {i}: {name} (NVIDIA)")
        pynvml.nvmlShutdown()
    except Exception:
        pass

    # Try AMD with rocm-smi
    try:
        import subprocess
        import shutil

        if shutil.which('rocm-smi'):
            result = subprocess.run(
                ['rocm-smi', '--showid'],
                capture_output=True,
                text=True,
                timeout=5
            )
            if result.returncode == 0:
                # Parse device IDs from output
                for line in result.stdout.split('\n'):
                    if not line.strip() or line.startswith('=') or 'Device' in line or 'ROCm' in line:
                        continue
                    parts = line.split()
                    if parts and parts[0].isdigit():
                        device_id = int(parts[0])
                        # Try to get the device name
                        name_result = subprocess.run(
                            ['rocm-smi', '--showproductname', '-d', str(device_id)],
                            capture_output=True,
                            text=True,
                            timeout=5
                        )
                        name = "AMD GPU"
                        if name_result.returncode == 0:
                            for name_line in name_result.stdout.split('\n'):
                                if 'Card' in name_line or 'name' in name_line.lower():
                                    parts_name = name_line.split(':')
                                    if len(parts_name) > 1:
                                        name = parts_name[1].strip()
                                        break
                        gpus.append(f"GPU {device_id}: {name} (AMD)")
    except Exception:
        pass

    return gpus


if __name__ == "__main__":
    """Test GPU monitoring."""
    print("=" * 60)
    print("GPU Monitoring Test")
    print("=" * 60)

    # List available GPUs
    print("\nAvailable GPUs:")
    gpus = list_available_gpus()
    if not gpus:
        print("  No GPUs found!")
        raise SystemExit(1)

    for gpu in gpus:
        print(f"  {gpu}")

    # Test monitoring
    print("\nTesting GPU 0 monitoring...")
    try:
        monitor = get_gpu_monitor(0)
        print(f"  Device: {monitor.get_device_name()}")

        # Get metrics
        metrics = monitor.get_metrics()
        print("\nCurrent Metrics:")
        print(f"  Power: {metrics.power_watts:.2f} W")
        print(f"  GPU Utilization: {metrics.gpu_utilization_percent:.1f}%")
        print(f"  Memory: {metrics.memory_used_gb:.2f} / {metrics.memory_total_gb:.2f} GB")
        if metrics.temperature_celsius is not None:
            print(f"  Temperature: {metrics.temperature_celsius:.1f}°C")

        # Test energy monitoring
        print("\nTesting energy monitoring (5 seconds)...")
        monitor.start_monitoring()
        time.sleep(5)
        energy = monitor.get_energy_consumed()
        avg_power = monitor.get_average_power()
        print(f"  Energy consumed: {energy:.2f} J")
        print(f"  Average power: {avg_power:.2f} W")

        monitor.cleanup()
        print("\n✓ Monitoring test successful!")

    except Exception as e:
        print(f"\n✗ Error: {e}")
        raise SystemExit(1)

473
utils/metrics.py
Normal file
@@ -0,0 +1,473 @@
"""
|
||||
Metrics Collection and Reporting for LLM Benchmarking
|
||||
|
||||
Provides centralized metrics collection, aggregation, and reporting.
|
||||
"""
|
||||
|
||||
import json
|
||||
import csv
|
||||
from dataclasses import dataclass, asdict, field
|
||||
from typing import Dict, List, Optional, Any
|
||||
from pathlib import Path
|
||||
import time
|
||||
|
||||
|
||||
@dataclass
class StageMetrics:
    """Metrics for a specific stage (e.g., forward pass, prefill, etc.)."""
    stage_name: str
    duration_ms: float
    tokens_processed: int
    tokens_per_second: float
    energy_joules: float
    energy_per_token: float
    avg_power_watts: float
    peak_memory_gb: float
    avg_gpu_util_percent: float

    def to_dict(self) -> Dict[str, Any]:
        """Convert to dictionary."""
        return asdict(self)


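# Assumed relationships between the StageMetrics fields (consistent with the sample data
# in __main__, but not enforced by the dataclass itself):
#   tokens_per_second = tokens_processed / (duration_ms / 1000)
#   energy_per_token  = energy_joules / tokens_processed
# e.g. 1024 tokens in 100.5 ms gives 1024 / 0.1005 ≈ 10189 tokens/s.

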
@dataclass
class PretrainMetrics:
    """Metrics for pretraining benchmark."""
    model_name: str
    gpu_name: str
    attention_implementation: str
    batch_size: int
    sequence_length: int
    num_steps: int

    # Stage-specific metrics
    forward: StageMetrics
    backward: StageMetrics
    optimizer: StageMetrics

    # Overall metrics
    total_duration_ms: float
    total_tokens: int
    total_tokens_per_second: float
    total_energy_joules: float
    total_energy_per_token: float

    timestamp: float = field(default_factory=time.time)

    def to_dict(self) -> Dict[str, Any]:
        """Convert to dictionary."""
        return {
            "model_name": self.model_name,
            "gpu_name": self.gpu_name,
            "attention_implementation": self.attention_implementation,
            "batch_size": self.batch_size,
            "sequence_length": self.sequence_length,
            "num_steps": self.num_steps,
            "forward": self.forward.to_dict(),
            "backward": self.backward.to_dict(),
            "optimizer": self.optimizer.to_dict(),
            "total_duration_ms": self.total_duration_ms,
            "total_tokens": self.total_tokens,
            "total_tokens_per_second": self.total_tokens_per_second,
            "total_energy_joules": self.total_energy_joules,
            "total_energy_per_token": self.total_energy_per_token,
            "timestamp": self.timestamp,
        }


@dataclass
class InferenceMetrics:
    """Metrics for inference benchmark."""
    model_name: str
    gpu_name: str
    attention_implementation: str
    num_requests: int
    prompt_length: int
    generation_length: int

    # Stage-specific metrics
    prefill: StageMetrics  # Time to First Token
    decode: StageMetrics  # Inter-Token Latency

    # End-to-end metrics
    e2e_latency_ms: float
    e2e_tokens_per_second: float
    e2e_energy_joules: float
    e2e_energy_per_token: float

    # Additional metrics
    ttft_ms: float  # Time to First Token (same as prefill duration)
    itl_ms: float  # Inter-Token Latency (decode duration / num_tokens)

    timestamp: float = field(default_factory=time.time)

    def to_dict(self) -> Dict[str, Any]:
        """Convert to dictionary."""
        return {
            "model_name": self.model_name,
            "gpu_name": self.gpu_name,
            "attention_implementation": self.attention_implementation,
            "num_requests": self.num_requests,
            "prompt_length": self.prompt_length,
            "generation_length": self.generation_length,
            "prefill": self.prefill.to_dict(),
            "decode": self.decode.to_dict(),
            "e2e_latency_ms": self.e2e_latency_ms,
            "e2e_tokens_per_second": self.e2e_tokens_per_second,
            "e2e_energy_joules": self.e2e_energy_joules,
            "e2e_energy_per_token": self.e2e_energy_per_token,
            "ttft_ms": self.ttft_ms,
            "itl_ms": self.itl_ms,
            "timestamp": self.timestamp,
        }


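# Assumed relationships between the InferenceMetrics fields (consistent with the sample
# data in __main__, but not enforced by the dataclass itself):
#   ttft_ms        ≈ prefill.duration_ms
#   itl_ms         ≈ decode.duration_ms / decode.tokens_processed
#   e2e_latency_ms ≈ prefill.duration_ms + decode.duration_ms
# e.g. 223.5 ms of decode over 100 generated tokens gives itl_ms = 223.5 / 100 = 2.235 ms/token.

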
class MetricsCollector:
    """Collects metrics during benchmark runs."""

    def __init__(self):
        """Initialize metrics collector."""
        self.metrics_history: List[Dict[str, Any]] = []

    def add_pretrain_metrics(self, metrics: PretrainMetrics):
        """Add pretraining metrics."""
        self.metrics_history.append({
            "type": "pretrain",
            "metrics": metrics.to_dict()
        })

    def add_inference_metrics(self, metrics: InferenceMetrics):
        """Add inference metrics."""
        self.metrics_history.append({
            "type": "inference",
            "metrics": metrics.to_dict()
        })

    def get_all_metrics(self) -> List[Dict[str, Any]]:
        """Get all collected metrics."""
        return self.metrics_history

    def clear(self):
        """Clear all metrics."""
        self.metrics_history.clear()


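# Minimal usage sketch (the benchmark-runner names are hypothetical placeholders):
#
#     collector = MetricsCollector()
#     collector.add_pretrain_metrics(run_pretrain_benchmark(...))    # hypothetical runner
#     collector.add_inference_metrics(run_inference_benchmark(...))  # hypothetical runner
#     all_results = collector.get_all_metrics()

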
class MetricsReporter:
    """Formats and outputs benchmark results."""

    @staticmethod
    def print_pretrain_metrics(metrics: PretrainMetrics, verbose: bool = True):
        """Print pretraining metrics to console."""
        print("\n" + "=" * 80)
        print("PRETRAINING BENCHMARK RESULTS")
        print("=" * 80)
        print(f"\nModel: {metrics.model_name}")
        print(f"GPU: {metrics.gpu_name}")
        print(f"Attention: {metrics.attention_implementation}")
        print(f"Batch Size: {metrics.batch_size}")
        print(f"Sequence Length: {metrics.sequence_length}")
        print(f"Training Steps: {metrics.num_steps}")

        print("\n" + "-" * 80)
        print("STAGE BREAKDOWN")
        print("-" * 80)

        # Forward pass
        print("\n[1] FORWARD PASS")
        MetricsReporter._print_stage_metrics(metrics.forward, verbose)

        # Backward pass
        print("\n[2] BACKWARD PASS")
        MetricsReporter._print_stage_metrics(metrics.backward, verbose)

        # Optimizer step
        print("\n[3] OPTIMIZER STEP")
        MetricsReporter._print_stage_metrics(metrics.optimizer, verbose)

        # Overall
        print("\n" + "-" * 80)
        print("OVERALL METRICS")
        print("-" * 80)
        print(f"  Total Duration:       {metrics.total_duration_ms:>10.2f} ms")
        print(f"  Total Tokens:         {metrics.total_tokens:>10,}")
        print(f"  Throughput:           {metrics.total_tokens_per_second:>10.2f} tokens/s")
        print(f"  Total Energy:         {metrics.total_energy_joules:>10.2f} J")
        print(f"  Energy per Token:     {metrics.total_energy_per_token*1000:>10.4f} mJ/token")
        print("=" * 80 + "\n")

    @staticmethod
    def print_inference_metrics(metrics: InferenceMetrics, verbose: bool = True):
        """Print inference metrics to console."""
        print("\n" + "=" * 80)
        print("INFERENCE BENCHMARK RESULTS")
        print("=" * 80)
        print(f"\nModel: {metrics.model_name}")
        print(f"GPU: {metrics.gpu_name}")
        print(f"Attention: {metrics.attention_implementation}")
        print(f"Requests: {metrics.num_requests}")
        print(f"Prompt Length: {metrics.prompt_length}")
        print(f"Generation Length: {metrics.generation_length}")

        print("\n" + "-" * 80)
        print("STAGE BREAKDOWN")
        print("-" * 80)

        # Prefill
        print("\n[1] PREFILL (Time to First Token)")
        MetricsReporter._print_stage_metrics(metrics.prefill, verbose)
        print(f"  TTFT:                 {metrics.ttft_ms:>10.2f} ms")

        # Decode
        print("\n[2] DECODE (Inter-Token Latency)")
        MetricsReporter._print_stage_metrics(metrics.decode, verbose)
        print(f"  ITL:                  {metrics.itl_ms:>10.2f} ms/token")

        # End-to-end
        print("\n" + "-" * 80)
        print("END-TO-END METRICS")
        print("-" * 80)
        print(f"  Request Latency:      {metrics.e2e_latency_ms:>10.2f} ms")
        print(f"  Throughput:           {metrics.e2e_tokens_per_second:>10.2f} tokens/s")
        print(f"  Total Energy:         {metrics.e2e_energy_joules:>10.2f} J")
        print(f"  Energy per Token:     {metrics.e2e_energy_per_token*1000:>10.4f} mJ/token")
        print("=" * 80 + "\n")

    @staticmethod
    def _print_stage_metrics(stage: StageMetrics, verbose: bool = True):
        """Print metrics for a single stage."""
        print(f"  Duration:             {stage.duration_ms:>10.2f} ms")
        print(f"  Tokens:               {stage.tokens_processed:>10,}")
        print(f"  Throughput:           {stage.tokens_per_second:>10.2f} tokens/s")
        print(f"  Energy:               {stage.energy_joules:>10.2f} J")
        print(f"  Energy per Token:     {stage.energy_per_token*1000:>10.4f} mJ/token")

        if verbose:
            print(f"  Avg Power:            {stage.avg_power_watts:>10.2f} W")
            print(f"  Peak Memory:          {stage.peak_memory_gb:>10.2f} GB")
            print(f"  Avg GPU Utilization:  {stage.avg_gpu_util_percent:>10.1f} %")

    @staticmethod
    def save_json(metrics: Any, output_path: Path):
        """
        Save metrics to JSON file.

        Args:
            metrics: PretrainMetrics or InferenceMetrics object
            output_path: Path to output JSON file
        """
        output_path.parent.mkdir(parents=True, exist_ok=True)

        with open(output_path, 'w') as f:
            json.dump(metrics.to_dict(), f, indent=2)

        print(f"Metrics saved to: {output_path}")

    @staticmethod
    def save_csv(metrics_list: List[Any], output_path: Path, benchmark_type: str = "pretrain"):
        """
        Save multiple metrics to CSV file for comparison.

        Args:
            metrics_list: List of PretrainMetrics or InferenceMetrics objects
            output_path: Path to output CSV file
            benchmark_type: "pretrain" or "inference"
        """
        if not metrics_list:
            print("No metrics to save")
            return

        output_path.parent.mkdir(parents=True, exist_ok=True)

        with open(output_path, 'w', newline='') as f:
            if benchmark_type == "pretrain":
                MetricsReporter._save_pretrain_csv(metrics_list, f)
            else:
                MetricsReporter._save_inference_csv(metrics_list, f)

        print(f"CSV saved to: {output_path}")

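    # Usage sketch (output paths are illustrative, not prescribed by this module):
    #
    #     MetricsReporter.save_json(pretrain_metrics, Path("results/pretrain.json"))
    #     MetricsReporter.save_csv([pretrain_metrics], Path("results/pretrain.csv"),
    #                              benchmark_type="pretrain")
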
    @staticmethod
    def _save_pretrain_csv(metrics_list: List[PretrainMetrics], file):
        """Save pretraining metrics to CSV."""
        fieldnames = [
            'gpu_name', 'attention_implementation', 'batch_size', 'sequence_length', 'num_steps',
            'forward_duration_ms', 'forward_tokens_per_sec', 'forward_energy_j', 'forward_energy_per_token_mj',
            'backward_duration_ms', 'backward_tokens_per_sec', 'backward_energy_j', 'backward_energy_per_token_mj',
            'optimizer_duration_ms', 'optimizer_tokens_per_sec', 'optimizer_energy_j', 'optimizer_energy_per_token_mj',
            'total_duration_ms', 'total_tokens_per_sec', 'total_energy_j', 'total_energy_per_token_mj',
            'timestamp'
        ]

        writer = csv.DictWriter(file, fieldnames=fieldnames)
        writer.writeheader()

        for m in metrics_list:
            writer.writerow({
                'gpu_name': m.gpu_name,
                'attention_implementation': m.attention_implementation,
                'batch_size': m.batch_size,
                'sequence_length': m.sequence_length,
                'num_steps': m.num_steps,
                'forward_duration_ms': m.forward.duration_ms,
                'forward_tokens_per_sec': m.forward.tokens_per_second,
                'forward_energy_j': m.forward.energy_joules,
                'forward_energy_per_token_mj': m.forward.energy_per_token * 1000,
                'backward_duration_ms': m.backward.duration_ms,
                'backward_tokens_per_sec': m.backward.tokens_per_second,
                'backward_energy_j': m.backward.energy_joules,
                'backward_energy_per_token_mj': m.backward.energy_per_token * 1000,
                'optimizer_duration_ms': m.optimizer.duration_ms,
                'optimizer_tokens_per_sec': m.optimizer.tokens_per_second,
                'optimizer_energy_j': m.optimizer.energy_joules,
                'optimizer_energy_per_token_mj': m.optimizer.energy_per_token * 1000,
                'total_duration_ms': m.total_duration_ms,
                'total_tokens_per_sec': m.total_tokens_per_second,
                'total_energy_j': m.total_energy_joules,
                'total_energy_per_token_mj': m.total_energy_per_token * 1000,
                'timestamp': m.timestamp,
            })

    @staticmethod
    def _save_inference_csv(metrics_list: List[InferenceMetrics], file):
        """Save inference metrics to CSV."""
        fieldnames = [
            'gpu_name', 'attention_implementation', 'num_requests', 'prompt_length', 'generation_length',
            'prefill_duration_ms', 'prefill_tokens_per_sec', 'prefill_energy_j', 'prefill_energy_per_token_mj',
            'ttft_ms',
            'decode_duration_ms', 'decode_tokens_per_sec', 'decode_energy_j', 'decode_energy_per_token_mj',
            'itl_ms',
            'e2e_latency_ms', 'e2e_tokens_per_sec', 'e2e_energy_j', 'e2e_energy_per_token_mj',
            'timestamp'
        ]

        writer = csv.DictWriter(file, fieldnames=fieldnames)
        writer.writeheader()

        for m in metrics_list:
            writer.writerow({
                'gpu_name': m.gpu_name,
                'attention_implementation': m.attention_implementation,
                'num_requests': m.num_requests,
                'prompt_length': m.prompt_length,
                'generation_length': m.generation_length,
                'prefill_duration_ms': m.prefill.duration_ms,
                'prefill_tokens_per_sec': m.prefill.tokens_per_second,
                'prefill_energy_j': m.prefill.energy_joules,
                'prefill_energy_per_token_mj': m.prefill.energy_per_token * 1000,
                'ttft_ms': m.ttft_ms,
                'decode_duration_ms': m.decode.duration_ms,
                'decode_tokens_per_sec': m.decode.tokens_per_second,
                'decode_energy_j': m.decode.energy_joules,
                'decode_energy_per_token_mj': m.decode.energy_per_token * 1000,
                'itl_ms': m.itl_ms,
                'e2e_latency_ms': m.e2e_latency_ms,
                'e2e_tokens_per_sec': m.e2e_tokens_per_second,
                'e2e_energy_j': m.e2e_energy_joules,
                'e2e_energy_per_token_mj': m.e2e_energy_per_token * 1000,
                'timestamp': m.timestamp,
            })


if __name__ == "__main__":
    """Test metrics reporting."""
    # Create sample pretraining metrics
    forward = StageMetrics(
        stage_name="forward",
        duration_ms=100.5,
        tokens_processed=1024,
        tokens_per_second=10189.3,
        energy_joules=25.3,
        energy_per_token=0.0247,
        avg_power_watts=251.7,
        peak_memory_gb=45.2,
        avg_gpu_util_percent=95.3
    )

    backward = StageMetrics(
        stage_name="backward",
        duration_ms=205.2,
        tokens_processed=1024,
        tokens_per_second=4991.2,
        energy_joules=51.6,
        energy_per_token=0.0504,
        avg_power_watts=251.5,
        peak_memory_gb=48.6,
        avg_gpu_util_percent=97.1
    )

    optimizer = StageMetrics(
        stage_name="optimizer",
        duration_ms=15.3,
        tokens_processed=1024,
        tokens_per_second=66928.1,
        energy_joules=3.8,
        energy_per_token=0.0037,
        avg_power_watts=248.4,
        peak_memory_gb=48.6,
        avg_gpu_util_percent=42.1
    )

    pretrain_metrics = PretrainMetrics(
        model_name="Qwen/Qwen2.5-3B-Instruct",
        gpu_name="NVIDIA A100 80GB",
        attention_implementation="flash_attention_2",
        batch_size=8,
        sequence_length=2048,
        num_steps=10,
        forward=forward,
        backward=backward,
        optimizer=optimizer,
        total_duration_ms=321.0,
        total_tokens=10240,
        total_tokens_per_second=31900.3,
        total_energy_joules=80.7,
        total_energy_per_token=0.00788
    )

    # Print pretrain metrics
    MetricsReporter.print_pretrain_metrics(pretrain_metrics)

    # Create sample inference metrics
    prefill = StageMetrics(
        stage_name="prefill",
        duration_ms=45.2,
        tokens_processed=512,
        tokens_per_second=11327.4,
        energy_joules=11.3,
        energy_per_token=0.0221,
        avg_power_watts=250.0,
        peak_memory_gb=42.1,
        avg_gpu_util_percent=89.2
    )

    decode = StageMetrics(
        stage_name="decode",
        duration_ms=223.5,
        tokens_processed=100,
        tokens_per_second=447.4,
        energy_joules=55.9,
        energy_per_token=0.559,
        avg_power_watts=250.1,
        peak_memory_gb=42.1,
        avg_gpu_util_percent=62.3
    )

    inference_metrics = InferenceMetrics(
        model_name="Qwen/Qwen2.5-3B-Instruct",
        gpu_name="NVIDIA A100 80GB",
        attention_implementation="flash_attention_2",
        num_requests=10,
        prompt_length=512,
        generation_length=100,
        prefill=prefill,
        decode=decode,
        e2e_latency_ms=268.7,
        e2e_tokens_per_second=2277.9,
        e2e_energy_joules=67.2,
        e2e_energy_per_token=0.110,
        ttft_ms=45.2,
        itl_ms=2.235
    )

    # Print inference metrics
    MetricsReporter.print_inference_metrics(inference_metrics)