Initial commit
This commit is contained in:
408
.gitignore
vendored
Normal file
408
.gitignore
vendored
Normal file
@@ -0,0 +1,408 @@
|
||||
# READ THIS BEFORE YOU REFACTOR ME
|
||||
#
|
||||
# setup.py uses the list of patterns in this file to decide
|
||||
# what to delete, but it's not 100% sound. So, for example,
|
||||
# if you delete aten/build/ because it's redundant with build/,
|
||||
# aten/build/ will stop being cleaned. So be careful when
|
||||
# refactoring this file!
|
||||
|
||||
## Model cache
|
||||
.md
|
||||
model_cache/
|
||||
|
||||
## PyTorch
|
||||
.coverage
|
||||
coverage.xml
|
||||
.dmypy.json
|
||||
.gradle
|
||||
.hypothesis
|
||||
.mypy_cache
|
||||
.additional_ci_files
|
||||
.lintrunner.private.toml
|
||||
/.extracted_scripts/
|
||||
**/.pytorch_specified_test_cases.csv
|
||||
**/.pytorch-disabled-tests.json
|
||||
*/*.pyc
|
||||
*/*.so*
|
||||
*/**/__pycache__
|
||||
*/**/*.dylib*
|
||||
*/**/*.pyc
|
||||
*/**/*.pyd
|
||||
*/**/*.so*
|
||||
*/**/**/*.pyc
|
||||
*/**/**/**/*.pyc
|
||||
*/**/**/**/**/*.pyc
|
||||
aten/build/
|
||||
aten/src/ATen/Config.h
|
||||
aten/src/ATen/cuda/CUDAConfig.h
|
||||
aten/src/ATen/hip/HIPConfig.h
|
||||
benchmarks/.data
|
||||
caffe2/cpp_test/
|
||||
dist/
|
||||
docs/build/
|
||||
docs/cpp/src
|
||||
docs/src/**/*
|
||||
docs/cpp/build
|
||||
docs/cpp/source/api
|
||||
docs/cpp/source/html/
|
||||
docs/cpp/source/latex/
|
||||
docs/source/compile/generated/
|
||||
docs/source/generated/
|
||||
docs/source/compile/generated/
|
||||
log
|
||||
usage_log.txt
|
||||
usage_log*
|
||||
test-reports/
|
||||
test/*.bak
|
||||
test/**/*.bak
|
||||
test/.coverage
|
||||
test/.hypothesis/
|
||||
test/cpp/api/mnist
|
||||
test/custom_operator/model.pt
|
||||
test/debug/
|
||||
test/jit_hooks/*.pt
|
||||
test/data/legacy_modules.t7
|
||||
test/data/*.pt
|
||||
test/forward_backward_compatibility/nightly_schemas.txt
|
||||
dropout_model.pt
|
||||
test/generated_type_hints_smoketest.py
|
||||
test/htmlcov
|
||||
test/cpp_extensions/**/install
|
||||
test/kernel.errors.txt
|
||||
third_party/build/
|
||||
third_party/nccl/
|
||||
tools/coverage_plugins_package/pip-wheel-metadata/
|
||||
tools/shared/_utils_internal.py
|
||||
tools/fast_nvcc/wrap_nvcc.sh
|
||||
tools/fast_nvcc/wrap_nvcc.bat
|
||||
tools/fast_nvcc/tmp/
|
||||
torch.egg-info/
|
||||
torch/_C/__init__.pyi
|
||||
torch/_C/_nn.pyi
|
||||
torch/_C/_VariableFunctions.pyi
|
||||
torch/_VF.pyi
|
||||
torch/return_types.pyi
|
||||
torch/nn/functional.pyi
|
||||
torch/utils/data/datapipes/datapipe.pyi
|
||||
torch/csrc/autograd/generated/*
|
||||
torch/csrc/functionalization/generated/*
|
||||
torch/csrc/lazy/generated/*.[!m]*
|
||||
torch_compile_debug/
|
||||
# Listed manually because some files in this directory are not generated
|
||||
torch/testing/_internal/generated/annotated_fn_args.py
|
||||
torch/testing/_internal/data/*.pt
|
||||
torch/headeronly/version.h
|
||||
torch/csrc/cudnn/cuDNN.cpp
|
||||
torch/csrc/generated
|
||||
torch/csrc/generic/TensorMethods.cpp
|
||||
torch/csrc/inductor/aoti_torch/generated/*.cpp
|
||||
torch/csrc/inductor/aoti_torch/generated/extend/*
|
||||
torch/csrc/jit/generated/*
|
||||
torch/csrc/jit/fuser/config.h
|
||||
torch/csrc/nn/THCUNN.cpp
|
||||
torch/csrc/nn/THCUNN.cwrap
|
||||
torch/bin/
|
||||
torch/cmake/
|
||||
torch/lib/*.a*
|
||||
torch/lib/*.dll*
|
||||
torch/lib/*.exe*
|
||||
torch/lib/*.dylib*
|
||||
torch/lib/*.h
|
||||
torch/lib/*.lib
|
||||
torch/lib/*.pdb
|
||||
torch/lib/*.so*
|
||||
torch/lib/protobuf*.pc
|
||||
torch/lib/build
|
||||
torch/lib/caffe2/
|
||||
torch/lib/cmake
|
||||
torch/lib/include
|
||||
torch/lib/pkgconfig
|
||||
torch/lib/protoc
|
||||
torch/lib/protobuf/
|
||||
torch/lib/tmp_install
|
||||
torch/lib/torch_shm_manager
|
||||
torch/lib/site-packages/
|
||||
torch/lib/python*
|
||||
torch/lib64
|
||||
torch/include/
|
||||
torch/share/
|
||||
torch/test/
|
||||
torch/utils/benchmark/utils/valgrind_wrapper/callgrind.h
|
||||
torch/utils/benchmark/utils/valgrind_wrapper/valgrind.h
|
||||
torch/version.py
|
||||
torch/_inductor/kernel/vendored_templates/*
|
||||
test/inductor/test_tlx*
|
||||
minifier_launcher.py
|
||||
aten/src/ATen/native/transformers/hip/flash_attn/ck/fav_v3/asm_fmha_v3_bwd_configs.hpp
|
||||
aten/src/ATen/native/transformers/hip/flash_attn/ck/fav_v3/mha_bwd.hip
|
||||
aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_fwd_d*
|
||||
aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_bwd_d*
|
||||
aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_bwd_convert*
|
||||
aten/src/ATen/native/transformers/hip/flash_attn/ck/fwd_blob*
|
||||
aten/src/ATen/native/transformers/hip/flash_attn/ck/bwd_blob*
|
||||
aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_fwd_api*
|
||||
aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_bwd_api*
|
||||
# Root level file used in CI to specify certain env configs.
|
||||
# E.g., see .circleci/config.yaml
|
||||
env
|
||||
.circleci/scripts/COMMIT_MSG
|
||||
scripts/release_notes/*.json
|
||||
sccache-stats*.json
|
||||
lint.json
|
||||
merge_record.json
|
||||
.github/scripts/nightly_source_matrix.json
|
||||
|
||||
# These files get copied over on invoking setup.py
|
||||
torchgen/packaged/*
|
||||
!torchgen/packaged/README.md
|
||||
|
||||
# This file is injected by ROCm build scripts to bootstrap in torch/__init__.py.
|
||||
torch/_rocm_init.py
|
||||
|
||||
# IPython notebook checkpoints
|
||||
.ipynb_checkpoints
|
||||
|
||||
# Editor temporaries
|
||||
*.swa
|
||||
*.swb
|
||||
*.swc
|
||||
*.swd
|
||||
*.swe
|
||||
*.swf
|
||||
*.swg
|
||||
*.swh
|
||||
*.swi
|
||||
*.swj
|
||||
*.swk
|
||||
*.swl
|
||||
*.swm
|
||||
*.swn
|
||||
*.swo
|
||||
*.swp
|
||||
*~
|
||||
.~lock.*
|
||||
|
||||
# macOS dir files
|
||||
.DS_Store
|
||||
|
||||
# Ninja files
|
||||
.ninja_deps
|
||||
.ninja_log
|
||||
compile_commands.json
|
||||
*.egg-info/
|
||||
docs/source/scripts/activation_images/
|
||||
docs/source/scripts/quantization_backend_configs/
|
||||
docs/source/scripts/lr_scheduler_images/
|
||||
|
||||
## General
|
||||
|
||||
# Compiled Object files
|
||||
*.slo
|
||||
*.lo
|
||||
*.o
|
||||
*.cuo
|
||||
*.obj
|
||||
|
||||
# Compiled Dynamic libraries
|
||||
*.so
|
||||
*.dylib
|
||||
*.dll
|
||||
|
||||
# Compiled Static libraries
|
||||
*.lai
|
||||
*.la
|
||||
*.a
|
||||
*.lib
|
||||
|
||||
# Compiled protocol buffers
|
||||
*.pb.h
|
||||
*.pb.cc
|
||||
*_pb2.py
|
||||
|
||||
# Compiled python
|
||||
*.pyc
|
||||
*.pyd
|
||||
|
||||
# Compiled MATLAB
|
||||
*.mex*
|
||||
|
||||
# NFS handle files
|
||||
**/.nfs*
|
||||
|
||||
# Sublime Text settings
|
||||
*.sublime-workspace
|
||||
*.sublime-project
|
||||
|
||||
# Eclipse Project settings
|
||||
*.*project
|
||||
.settings
|
||||
|
||||
# QtCreator files
|
||||
*.user
|
||||
|
||||
# PyCharm files
|
||||
.idea
|
||||
|
||||
# GDB history
|
||||
.gdb_history
|
||||
|
||||
## Caffe2
|
||||
|
||||
# build, distribute, and bins (+ python proto bindings)
|
||||
build/
|
||||
# Allow tools/build/ for build support.
|
||||
!tools/build/
|
||||
build_host_protoc
|
||||
build_android
|
||||
build_ios
|
||||
.build_debug/*
|
||||
.build_release/*
|
||||
.build_profile/*
|
||||
distribute/*
|
||||
*.testbin
|
||||
*.bin
|
||||
cmake_build
|
||||
.cmake_build
|
||||
gen
|
||||
.setuptools-cmake-build
|
||||
.pytest_cache
|
||||
aten/build/*
|
||||
|
||||
# Linker scripts for prioritized text optimization
|
||||
cmake/linker_script.ld
|
||||
|
||||
# Bram
|
||||
plsdontbreak
|
||||
|
||||
# Generated documentation
|
||||
docs/_site
|
||||
docs/gathered
|
||||
_site
|
||||
doxygen
|
||||
docs/dev
|
||||
|
||||
# LevelDB files
|
||||
*.sst
|
||||
*.ldb
|
||||
LOCK
|
||||
CURRENT
|
||||
MANIFEST-*
|
||||
|
||||
# generated version file
|
||||
caffe2/version.py
|
||||
|
||||
# setup.py intermediates
|
||||
.eggs
|
||||
caffe2.egg-info
|
||||
MANIFEST
|
||||
|
||||
# Atom/Watchman required file
|
||||
.watchmanconfig
|
||||
.watchman
|
||||
|
||||
# Files generated by CLion
|
||||
cmake-build-debug
|
||||
|
||||
# BEGIN NOT-CLEAN-FILES (setup.py handles this marker. Do not change.)
|
||||
#
|
||||
# Below files are not deleted by "setup.py clean".
|
||||
|
||||
# Downloaded bazel
|
||||
tools/bazel
|
||||
|
||||
# Visual Studio Code files
|
||||
.vs
|
||||
/.vscode/*
|
||||
!/.vscode/extensions.json
|
||||
!/.vscode/settings_recommended.json
|
||||
|
||||
# YouCompleteMe config file
|
||||
.ycm_extra_conf.py
|
||||
|
||||
# Files generated when a patch is rejected
|
||||
*.orig
|
||||
*.rej
|
||||
|
||||
# Files generated by ctags
|
||||
CTAGS
|
||||
GTAGS
|
||||
GRTAGS
|
||||
GSYMS
|
||||
GPATH
|
||||
tags
|
||||
TAGS
|
||||
|
||||
|
||||
# ccls file
|
||||
.ccls-cache/
|
||||
|
||||
# clang tooling storage location
|
||||
.clang-format-bin
|
||||
.clang-tidy-bin
|
||||
.lintbin
|
||||
|
||||
# clangd background index
|
||||
.clangd/
|
||||
.cache/
|
||||
|
||||
# bazel symlinks
|
||||
bazel-*
|
||||
|
||||
# xla repo
|
||||
xla/
|
||||
|
||||
# direnv, posh-direnv
|
||||
.env
|
||||
.envrc
|
||||
.psenvrc
|
||||
|
||||
# generated shellcheck directories
|
||||
.shellcheck_generated*/
|
||||
|
||||
# zip archives
|
||||
*.zip
|
||||
|
||||
# core dump files
|
||||
**/core.[1-9]*
|
||||
|
||||
# Generated if you use the pre-commit script for clang-tidy
|
||||
pr.diff
|
||||
|
||||
# coverage files
|
||||
*/**/.coverage.*
|
||||
|
||||
# buck generated files
|
||||
.buckd/
|
||||
.lsp-buck-out/
|
||||
.lsp.buckd/
|
||||
buck-out/
|
||||
|
||||
# Downloaded libraries
|
||||
third_party/ruy/
|
||||
third_party/glog/
|
||||
|
||||
# Virtualenv
|
||||
.venv/
|
||||
venv/
|
||||
|
||||
# Log files
|
||||
*.log
|
||||
sweep/
|
||||
|
||||
# Android build artifacts
|
||||
android/pytorch_android/.cxx
|
||||
android/pytorch_android_torchvision/.cxx
|
||||
|
||||
# Pyre configs (for internal usage)
|
||||
.pyre_configuration
|
||||
.pyre_configuration.codenav
|
||||
.arcconfig
|
||||
.stable_pyre_client
|
||||
.pyre_client
|
||||
|
||||
# Claude Code local configuration
|
||||
CLAUDE.local.md
|
||||
/test_*.py
|
||||
/debug_*.py
|
||||
CLAUDE_CONTEXT/
|
||||
/.claude/settings.local.json
|
||||
100
AMD_FIX_SUMMARY.md
Normal file
100
AMD_FIX_SUMMARY.md
Normal file
@@ -0,0 +1,100 @@
|
||||
# AMD GPU Monitoring Fix Summary
|
||||
|
||||
## Issue
|
||||
The AMDMonitor class was using incorrect pyrsmi API calls. The implementation attempted to use low-level `rocmsmi` module which has complex initialization and function signatures.
|
||||
|
||||
## Solution
|
||||
Updated to use the correct `rocml` high-level API from pyrsmi, based on the official example at:
|
||||
`/anvme/workspace/ihpc125h-llm-profiles/pyrsmi/examples/llm_monitoring/monitor_llm_inference.py`
|
||||
|
||||
## Changes Made
|
||||
|
||||
### 1. Fixed AMDMonitor Class
|
||||
|
||||
**Before** (incorrect):
|
||||
```python
|
||||
from pyrsmi import rocmsmi
|
||||
ret = self.rocmsmi.rsmi_init(0)
|
||||
power_uw = self.rocmsmi.rsmi_dev_power_ave_get(self.device_id)
|
||||
```
|
||||
|
||||
**After** (correct):
|
||||
```python
|
||||
from pyrsmi import rocml
|
||||
self.rocml.smi_initialize()
|
||||
power_watts = self.rocml.smi_get_device_average_power(self.device_id)
|
||||
```
|
||||
|
||||
**Key API Functions**:
|
||||
- `rocml.smi_initialize()` - Initialize monitoring
|
||||
- `rocml.smi_get_device_average_power(device_id)` - Get power in Watts (not microwatts!)
|
||||
- `rocml.smi_get_device_utilization(device_id)` - Get GPU utilization %
|
||||
- `rocml.smi_get_device_memory_used(device_id)` - Get memory used in bytes
|
||||
- `rocml.smi_get_device_memory_total(device_id)` - Get total memory in bytes
|
||||
- `rocml.smi_get_device_temperature(device_id)` - Get temperature
|
||||
- `rocml.smi_get_device_name(device_id)` - Get device name
|
||||
- `rocml.smi_shutdown()` - Cleanup
|
||||
|
||||
### 2. Updated All SLURM Scripts for Apptainer
|
||||
|
||||
All GPU benchmark scripts now run inside the apptainer container:
|
||||
|
||||
**A100, H100, H200** (NVIDIA):
|
||||
```bash
|
||||
APPTAINER_IMAGE="/anvme/workspace/ihpc125h-llm-profiles/pytorch_25.10_tilelang.sif"
|
||||
apptainer exec --nv $APPTAINER_IMAGE python run_benchmark.py ...
|
||||
```
|
||||
|
||||
**MI300X** (AMD):
|
||||
```bash
|
||||
APPTAINER_IMAGE="/anvme/workspace/ihpc125h-llm-profiles/pytorch_25.10_tilelang.sif"
|
||||
apptainer exec --rocm $APPTAINER_IMAGE python run_benchmark.py ...
|
||||
```
|
||||
|
||||
Note: `--nv` for NVIDIA, `--rocm` for AMD
|
||||
|
||||
### 3. Updated Documentation
|
||||
|
||||
- README.md now mentions apptainer usage
|
||||
- Updated setup instructions to use apptainer for model caching
|
||||
- Added notes about container flags (--nv vs --rocm)
|
||||
|
||||
## Testing
|
||||
|
||||
To verify the AMD monitoring works:
|
||||
|
||||
```bash
|
||||
# Inside apptainer on MI300X node
|
||||
apptainer exec --rocm pytorch_25.10_tilelang.sif python -c "
|
||||
from utils.gpu_monitor import AMDMonitor
|
||||
m = AMDMonitor(0)
|
||||
print(f'GPU: {m.get_device_name()}')
|
||||
metrics = m.get_metrics()
|
||||
print(f'Power: {metrics.power_watts:.2f} W')
|
||||
print(f'Utilization: {metrics.gpu_utilization_percent:.1f}%')
|
||||
print(f'Memory: {metrics.memory_used_gb:.2f} / {metrics.memory_total_gb:.2f} GB')
|
||||
m.cleanup()
|
||||
"
|
||||
```
|
||||
|
||||
## Files Modified
|
||||
|
||||
1. `/anvme/workspace/ihpc125h-llm-profiles/llm-benchmark/utils/gpu_monitor.py` - Fixed AMDMonitor class
|
||||
2. `/anvme/workspace/ihpc125h-llm-profiles/llm-benchmark/slurm_a100.sh` - Added apptainer
|
||||
3. `/anvme/workspace/ihpc125h-llm-profiles/llm-benchmark/slurm_h100.sh` - Added apptainer
|
||||
4. `/anvme/workspace/ihpc125h-llm-profiles/llm-benchmark/slurm_h200.sh` - Added apptainer
|
||||
5. `/anvme/workspace/ihpc125h-llm-profiles/llm-benchmark/slurm_mi300x.sh` - Added apptainer with --rocm
|
||||
6. `/anvme/workspace/ihpc125h-llm-profiles/llm-benchmark/README.md` - Updated documentation
|
||||
|
||||
## Key Differences: rocml vs rocmsmi
|
||||
|
||||
| Feature | rocml (High-level) | rocmsmi (Low-level) |
|
||||
|---------|-------------------|---------------------|
|
||||
| API Style | Simple functions | Complex C-style API |
|
||||
| Initialization | `smi_initialize()` | `rsmi_init(0)` + error codes |
|
||||
| Power | Returns Watts | Returns microwatts |
|
||||
| Memory | Returns bytes | Returns bytes via enums |
|
||||
| Error Handling | Returns -1 on error | Returns error codes |
|
||||
| Ease of Use | Much easier | Complex |
|
||||
|
||||
The `rocml` module is the recommended high-level Python API for pyrsmi.
|
||||
311
README.md
Normal file
311
README.md
Normal file
@@ -0,0 +1,311 @@
|
||||
# LLM Benchmark Suite
|
||||
|
||||
A comprehensive benchmarking suite for comparing LLM performance (Qwen3-4B) across different GPU architectures: **MI300X**, **A100 80G**, **H100**, and **H200**.
|
||||
|
||||
## Features
|
||||
|
||||
- **Pretraining Benchmarks**: Separate metrics for forward, backward, and optimizer stages
|
||||
- **Inference Benchmarks**: Separate metrics for prefill (TTFT) and decode (ITL) stages
|
||||
- **Energy Monitoring**: GPU-specific energy and power measurement
|
||||
- NVIDIA: pynvml
|
||||
- AMD: pyrsmi
|
||||
- **Attention Implementations**:
|
||||
- FlashAttention-2 (A100, MI300X)
|
||||
- FlashAttention-3 Hopper (H100, H200)
|
||||
- Configurable via CLI
|
||||
- **Comprehensive Metrics**:
|
||||
- Tokens per second
|
||||
- Energy per token
|
||||
- Time to First Token (TTFT)
|
||||
- Inter-Token Latency (ITL)
|
||||
- End-to-End Request Latency
|
||||
- GPU utilization and memory usage
|
||||
|
||||
## Directory Structure
|
||||
|
||||
```
|
||||
llm-benchmark/
|
||||
├── cache_model.py # Model caching script
|
||||
├── benchmark_pretrain.py # Pretraining benchmark
|
||||
├── benchmark_inference.py # Inference benchmark
|
||||
├── run_benchmark.py # Main orchestration script
|
||||
├── requirements.txt # Python dependencies
|
||||
├── utils/
|
||||
│ ├── gpu_monitor.py # GPU monitoring (NVIDIA & AMD)
|
||||
│ ├── metrics.py # Metrics collection and reporting
|
||||
│ └── attention.py # Attention implementation helpers
|
||||
├── configs/
|
||||
│ ├── a100.yaml
|
||||
│ ├── h100.yaml
|
||||
│ ├── h200.yaml
|
||||
│ └── mi300x.yaml
|
||||
└── results/ # Benchmark results (JSON)
|
||||
```
|
||||
|
||||
## Setup
|
||||
|
||||
### 1. Container Environment
|
||||
|
||||
All benchmarks should be run inside the apptainer container:
|
||||
|
||||
```bash
|
||||
# Container is located at:
|
||||
/anvme/workspace/ihpc125h-llm-profiles/pytorch_25.10_tilelang.sif
|
||||
```
|
||||
|
||||
### 2. Install Dependencies (if not using apptainer)
|
||||
|
||||
If you want to run directly without apptainer:
|
||||
|
||||
```bash
|
||||
# Install Python dependencies
|
||||
pip install -r requirements.txt
|
||||
|
||||
# For AMD GPUs, ensure ROCm and pyrsmi are installed
|
||||
# For NVIDIA GPUs, ensure CUDA and pynvml are installed
|
||||
```
|
||||
|
||||
### 3. Cache Model (Run on Head Node)
|
||||
|
||||
**IMPORTANT**: Run this on the head node BEFORE allocating compute nodes, as compute nodes are typically offline.
|
||||
|
||||
```bash
|
||||
# Using apptainer (recommended)
|
||||
apptainer exec --nv pytorch_25.10_tilelang.sif python cache_model.py \
|
||||
--model-name Qwen/Qwen3-4B \
|
||||
--cache-dir ./model_cache
|
||||
|
||||
# Or directly (if dependencies installed)
|
||||
python cache_model.py --model-name Qwen/Qwen3-4B --cache-dir ./model_cache
|
||||
```
|
||||
|
||||
The model will be cached to `./model_cache` in the current directory (avoiding slow NFS $HOME).
|
||||
|
||||
## Usage
|
||||
|
||||
### Quick Start
|
||||
|
||||
```bash
|
||||
# Run both pretraining and inference benchmarks
|
||||
python run_benchmark.py --mode both --model-path ./model_cache
|
||||
|
||||
# Run only pretraining
|
||||
python run_benchmark.py --mode pretrain --num-steps 20
|
||||
|
||||
# Run only inference
|
||||
python run_benchmark.py --mode inference --num-requests 20
|
||||
```
|
||||
|
||||
### Detailed Usage
|
||||
|
||||
#### List Available GPUs
|
||||
|
||||
```bash
|
||||
python run_benchmark.py --list-gpus
|
||||
```
|
||||
|
||||
#### Pretraining Benchmark
|
||||
|
||||
```bash
|
||||
python benchmark_pretrain.py \
|
||||
--model-path ./model_cache \
|
||||
--model-name Qwen/Qwen3-4B \
|
||||
--attn-implementation auto \
|
||||
--batch-size 8 \
|
||||
--sequence-length 8192 \
|
||||
--num-steps 10 \
|
||||
--warmup-steps 3 \
|
||||
--output-dir ./results
|
||||
```
|
||||
|
||||
**Metrics Reported** (per stage: forward, backward, optimizer):
|
||||
- Duration (ms)
|
||||
- Tokens processed
|
||||
- Throughput (tokens/s)
|
||||
- Energy (J)
|
||||
- Energy per token (J/token)
|
||||
- Average power (W)
|
||||
- Peak memory (GB)
|
||||
- GPU utilization (%)
|
||||
|
||||
#### Inference Benchmark
|
||||
|
||||
```bash
|
||||
python benchmark_inference.py \
|
||||
--model-path ./model_cache \
|
||||
--model-name Qwen/Qwen3-4B \
|
||||
--attn-implementation auto \
|
||||
--num-requests 10 \
|
||||
--prompt-length 512 \
|
||||
--generation-length 100 \
|
||||
--warmup-requests 2 \
|
||||
--output-dir ./results
|
||||
```
|
||||
|
||||
**Metrics Reported**:
|
||||
- **Prefill**: TTFT, throughput, energy per token
|
||||
- **Decode**: ITL, throughput, energy per token
|
||||
- **End-to-End**: Request latency, total throughput, total energy
|
||||
|
||||
### Attention Implementations
|
||||
|
||||
The benchmark automatically selects the optimal attention implementation based on GPU:
|
||||
- **A100, MI300X**: `flash_attention_2`
|
||||
- **H100, H200**: `flash_attention_3_hopper`
|
||||
|
||||
Override with `--attn-implementation`:
|
||||
|
||||
```bash
|
||||
# Force FlashAttention-3 Hopper on H100
|
||||
python run_benchmark.py --attn-implementation flash_attention_3_hopper
|
||||
|
||||
# Use SDPA instead
|
||||
python run_benchmark.py --attn-implementation sdpa
|
||||
```
|
||||
|
||||
Available options:
|
||||
- `auto` - Auto-detect based on GPU
|
||||
- `flash_attention_2` - FlashAttention-2 (all GPUs)
|
||||
- `flash_attention_3_hopper` - FlashAttention-3 for H100/H200
|
||||
- `sdpa` - PyTorch Scaled Dot Product Attention
|
||||
- `eager` - Standard PyTorch attention
|
||||
|
||||
## Running on SLURM
|
||||
|
||||
All SLURM scripts are configured to run inside the apptainer container. First cache the model on the head node:
|
||||
|
||||
```bash
|
||||
# On head node (with internet access)
|
||||
apptainer exec --nv pytorch_25.10_tilelang.sif python cache_model.py \
|
||||
--model-name Qwen/Qwen3-4B \
|
||||
--cache-dir ./model_cache
|
||||
```
|
||||
|
||||
Then submit jobs:
|
||||
|
||||
```bash
|
||||
# A100
|
||||
sbatch slurm_a100.sh
|
||||
|
||||
# H100
|
||||
sbatch slurm_h100.sh
|
||||
|
||||
# H200
|
||||
sbatch slurm_h200.sh
|
||||
|
||||
# MI300X
|
||||
sbatch slurm_mi300x.sh
|
||||
```
|
||||
|
||||
**Note**:
|
||||
- NVIDIA GPUs use `--nv` flag
|
||||
- AMD GPUs use `--rocm` flag
|
||||
|
||||
## Output
|
||||
|
||||
Results are saved to the `--output-dir` directory (default: `./results/`):
|
||||
|
||||
- `pretrain_<GPU>_<ATTENTION>.json` - Pretraining metrics
|
||||
- `inference_<GPU>_<ATTENTION>.json` - Inference metrics
|
||||
|
||||
Example output:
|
||||
|
||||
```
|
||||
===============================================================================
|
||||
PRETRAINING BENCHMARK RESULTS
|
||||
===============================================================================
|
||||
|
||||
Model: Qwen/Qwen3-4B
|
||||
GPU: NVIDIA A100 80GB
|
||||
Attention: flash_attention_2
|
||||
Batch Size: 8
|
||||
Sequence Length: 8192
|
||||
Training Steps: 10
|
||||
|
||||
-------------------------------------------------------------------------------
|
||||
STAGE BREAKDOWN
|
||||
-------------------------------------------------------------------------------
|
||||
|
||||
[1] FORWARD PASS
|
||||
Duration: 1005.23 ms
|
||||
Tokens: 163,840
|
||||
Throughput: 163,012.45 tokens/s
|
||||
Energy: 253.0 J
|
||||
Energy per Token: 1.5443 mJ/token
|
||||
|
||||
[2] BACKWARD PASS
|
||||
Duration: 2052.11 ms
|
||||
Tokens: 163,840
|
||||
Throughput: 79,857.23 tokens/s
|
||||
Energy: 516.2 J
|
||||
Energy per Token: 3.1513 mJ/token
|
||||
|
||||
[3] OPTIMIZER STEP
|
||||
Duration: 153.42 ms
|
||||
Tokens: 163,840
|
||||
Throughput: 1,068,012.34 tokens/s
|
||||
Energy: 38.4 J
|
||||
Energy per Token: 0.2344 mJ/token
|
||||
|
||||
-------------------------------------------------------------------------------
|
||||
OVERALL METRICS
|
||||
-------------------------------------------------------------------------------
|
||||
Total Duration: 3210.76 ms
|
||||
Total Tokens: 163,840
|
||||
Throughput: 51,012.45 tokens/s
|
||||
Total Energy: 807.6 J
|
||||
Energy per Token: 4.9300 mJ/token
|
||||
===============================================================================
|
||||
```
|
||||
|
||||
## Key Metrics Reference
|
||||
|
||||
### Pretraining
|
||||
- **Forward**: Input processing and loss calculation
|
||||
- **Backward**: Gradient computation
|
||||
- **Optimizer**: Weight updates
|
||||
|
||||
### Inference
|
||||
- **TTFT (Time to First Token)**: Prefill latency
|
||||
- **ITL (Inter-Token Latency)**: Average decode time per token
|
||||
- **E2E Latency**: Total request time (prefill + decode)
|
||||
|
||||
### Energy
|
||||
- **Energy (J)**: Total energy consumed
|
||||
- **Energy per Token (mJ/token)**: Energy efficiency metric
|
||||
- **Average Power (W)**: Power consumption during stage
|
||||
|
||||
## Troubleshooting
|
||||
|
||||
### Model Not Found
|
||||
Ensure you've cached the model first:
|
||||
```bash
|
||||
python cache_model.py --model-name Qwen/Qwen2.5-3B-Instruct --cache-dir ./model_cache
|
||||
```
|
||||
|
||||
### GPU Monitoring Errors
|
||||
- **NVIDIA**: Install pynvml: `pip install pynvml`
|
||||
- **AMD**: Install pyrsmi: `pip install pyrsmi`
|
||||
|
||||
### FlashAttention-3 Not Found
|
||||
For H100/H200, ensure FlashAttention-3 is installed. If not available, use:
|
||||
```bash
|
||||
python run_benchmark.py --attn-implementation flash_attention_2
|
||||
```
|
||||
|
||||
### Out of Memory
|
||||
Reduce batch size or sequence length:
|
||||
```bash
|
||||
python run_benchmark.py --batch-size 4 --sequence-length 1024
|
||||
```
|
||||
|
||||
## Citation
|
||||
|
||||
If you use this benchmark suite, please cite:
|
||||
- [FlashAttention-2](https://github.com/Dao-AILab/flash-attention)
|
||||
- [FlashAttention-3](https://github.com/Dao-AILab/flash-attention) (for Hopper)
|
||||
- [Qwen Models](https://huggingface.co/Qwen)
|
||||
|
||||
## License
|
||||
|
||||
MIT License - see LICENSE file for details
|
||||
417
benchmark_inference.py
Executable file
417
benchmark_inference.py
Executable file
@@ -0,0 +1,417 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Inference Benchmark for LLM Performance Evaluation
|
||||
|
||||
Measures performance and energy metrics for inference workloads with
|
||||
separate measurements for prefill and decode stages.
|
||||
"""
|
||||
|
||||
import argparse
|
||||
import os
|
||||
import sys
|
||||
import time
|
||||
from pathlib import Path
|
||||
from typing import Optional
|
||||
|
||||
import torch
|
||||
from transformers import AutoModelForCausalLM, AutoTokenizer
|
||||
from tqdm import tqdm
|
||||
|
||||
# Add utils to path
|
||||
sys.path.insert(0, str(Path(__file__).parent))
|
||||
from utils.gpu_monitor import get_gpu_monitor
|
||||
from utils.metrics import StageMetrics, InferenceMetrics, MetricsReporter
|
||||
from utils.attention import get_default_attention, configure_model_attention, validate_attention_for_gpu
|
||||
|
||||
|
||||
def benchmark_inference(
|
||||
model_name_or_path: str,
|
||||
attn_implementation: str = "auto",
|
||||
num_requests: int = 10,
|
||||
prompt_length: int = 512,
|
||||
generation_length: int = 100,
|
||||
warmup_requests: int = 2,
|
||||
device: str = "cuda",
|
||||
device_id: int = 0,
|
||||
output_dir: Optional[str] = None,
|
||||
verbose: bool = True,
|
||||
):
|
||||
"""
|
||||
Run inference benchmark.
|
||||
|
||||
Args:
|
||||
model_name_or_path: Path to model or HuggingFace identifier
|
||||
attn_implementation: Attention implementation to use
|
||||
num_requests: Number of inference requests to measure
|
||||
prompt_length: Length of input prompt
|
||||
generation_length: Number of tokens to generate
|
||||
warmup_requests: Number of warmup requests
|
||||
device: Device to use
|
||||
device_id: GPU device ID
|
||||
output_dir: Directory to save results
|
||||
verbose: Print verbose output
|
||||
"""
|
||||
print("=" * 80)
|
||||
print("INFERENCE BENCHMARK")
|
||||
print("=" * 80)
|
||||
|
||||
# Initialize GPU monitor
|
||||
if verbose:
|
||||
print("\n[1/7] Initializing GPU monitor...")
|
||||
monitor = get_gpu_monitor(device_id)
|
||||
gpu_name = monitor.get_device_name()
|
||||
if verbose:
|
||||
print(f" GPU: {gpu_name}")
|
||||
|
||||
# Determine attention implementation
|
||||
if attn_implementation == "auto":
|
||||
attn_implementation = get_default_attention(gpu_name)
|
||||
if verbose:
|
||||
print(f" Auto-selected attention: {attn_implementation}")
|
||||
|
||||
# Validate attention for GPU
|
||||
valid, warning = validate_attention_for_gpu(attn_implementation, gpu_name)
|
||||
if warning and verbose:
|
||||
print(f" ⚠ {warning}")
|
||||
|
||||
# Load model
|
||||
if verbose:
|
||||
print(f"\n[2/7] Loading model: {model_name_or_path}")
|
||||
|
||||
# Determine attn_implementation parameter for model loading
|
||||
load_attn = "flash_attention_2" if attn_implementation in ["flash_attention_2", "flash_attention_3_hopper"] else attn_implementation
|
||||
|
||||
try:
|
||||
model = AutoModelForCausalLM.from_pretrained(
|
||||
model_name_or_path,
|
||||
torch_dtype=torch.bfloat16,
|
||||
attn_implementation=load_attn,
|
||||
trust_remote_code=True,
|
||||
)
|
||||
model = model.to(device)
|
||||
|
||||
# Configure attention (patch if needed for FA3)
|
||||
model = configure_model_attention(model, attn_implementation, verbose=verbose)
|
||||
|
||||
if verbose:
|
||||
total_params = sum(p.numel() for p in model.parameters())
|
||||
print(f" Total parameters: {total_params:,} ({total_params/1e9:.2f}B)")
|
||||
except Exception as e:
|
||||
print(f"✗ Error loading model: {e}")
|
||||
sys.exit(1)
|
||||
|
||||
# Load tokenizer
|
||||
if verbose:
|
||||
print(f"\n[3/7] Loading tokenizer...")
|
||||
try:
|
||||
tokenizer = AutoTokenizer.from_pretrained(
|
||||
model_name_or_path,
|
||||
trust_remote_code=True
|
||||
)
|
||||
except Exception as e:
|
||||
print(f"✗ Error loading tokenizer: {e}")
|
||||
sys.exit(1)
|
||||
|
||||
# Generate synthetic prompts
|
||||
if verbose:
|
||||
print(f"\n[4/7] Generating synthetic prompts...")
|
||||
print(f" Prompt length: {prompt_length}")
|
||||
print(f" Generation length: {generation_length}")
|
||||
|
||||
# Create random input_ids (synthetic prompts)
|
||||
vocab_size = model.config.vocab_size
|
||||
# We'll create one prompt and reuse it
|
||||
prompt_ids = torch.randint(0, vocab_size, (1, prompt_length), device=device)
|
||||
|
||||
# Warmup
|
||||
if verbose:
|
||||
print(f"\n[5/7] Running warmup ({warmup_requests} requests)...")
|
||||
model.eval()
|
||||
with torch.no_grad():
|
||||
for _ in range(warmup_requests):
|
||||
_ = model.generate(
|
||||
prompt_ids,
|
||||
max_new_tokens=generation_length,
|
||||
do_sample=False,
|
||||
pad_token_id=tokenizer.eos_token_id
|
||||
)
|
||||
|
||||
# Synchronize before benchmarking
|
||||
torch.cuda.synchronize()
|
||||
|
||||
# Benchmark
|
||||
if verbose:
|
||||
print(f"\n[6/7] Running benchmark ({num_requests} requests)...")
|
||||
|
||||
# Storage for per-request metrics
|
||||
prefill_times = []
|
||||
decode_times = []
|
||||
e2e_times = []
|
||||
|
||||
prefill_energies = []
|
||||
decode_energies = []
|
||||
e2e_energies = []
|
||||
|
||||
prefill_powers = []
|
||||
decode_powers = []
|
||||
|
||||
memory_usage = []
|
||||
gpu_utils = []
|
||||
|
||||
# For inference, we separate prefill (first token) from decode (remaining tokens)
|
||||
# We'll use a custom generation loop to measure them separately
|
||||
|
||||
for req_idx in tqdm(range(num_requests), desc="Benchmarking"):
|
||||
# === PREFILL PHASE (Time to First Token) ===
|
||||
# This is the forward pass with the prompt to get the first token
|
||||
|
||||
monitor.start_monitoring()
|
||||
torch.cuda.synchronize()
|
||||
prefill_start = time.perf_counter()
|
||||
|
||||
with torch.no_grad():
|
||||
# Forward pass with prompt
|
||||
outputs = model(input_ids=prompt_ids, use_cache=True)
|
||||
logits = outputs.logits
|
||||
past_key_values = outputs.past_key_values
|
||||
|
||||
# Get first generated token
|
||||
next_token_logits = logits[:, -1, :]
|
||||
next_token = torch.argmax(next_token_logits, dim=-1, keepdim=True)
|
||||
|
||||
torch.cuda.synchronize()
|
||||
prefill_time = time.perf_counter() - prefill_start
|
||||
prefill_energy = monitor.get_energy_consumed()
|
||||
prefill_power = monitor.get_average_power()
|
||||
|
||||
prefill_times.append(prefill_time * 1000) # Convert to ms
|
||||
prefill_energies.append(prefill_energy)
|
||||
prefill_powers.append(prefill_power)
|
||||
|
||||
# === DECODE PHASE (Inter-Token Latency) ===
|
||||
# Generate remaining tokens one by one
|
||||
|
||||
monitor.start_monitoring()
|
||||
torch.cuda.synchronize()
|
||||
decode_start = time.perf_counter()
|
||||
|
||||
generated_tokens = [next_token]
|
||||
|
||||
with torch.no_grad():
|
||||
for _ in range(generation_length - 1):
|
||||
# Forward pass with single token using cached keys/values
|
||||
outputs = model(
|
||||
input_ids=next_token,
|
||||
past_key_values=past_key_values,
|
||||
use_cache=True
|
||||
)
|
||||
logits = outputs.logits
|
||||
past_key_values = outputs.past_key_values
|
||||
|
||||
# Get next token
|
||||
next_token_logits = logits[:, -1, :]
|
||||
next_token = torch.argmax(next_token_logits, dim=-1, keepdim=True)
|
||||
generated_tokens.append(next_token)
|
||||
|
||||
torch.cuda.synchronize()
|
||||
decode_time = time.perf_counter() - decode_start
|
||||
decode_energy = monitor.get_energy_consumed()
|
||||
decode_power = monitor.get_average_power()
|
||||
|
||||
decode_times.append(decode_time * 1000) # Convert to ms
|
||||
decode_energies.append(decode_energy)
|
||||
decode_powers.append(decode_power)
|
||||
|
||||
# End-to-end metrics
|
||||
e2e_time = prefill_time + decode_time
|
||||
e2e_energy = prefill_energy + decode_energy
|
||||
|
||||
e2e_times.append(e2e_time * 1000) # Convert to ms
|
||||
e2e_energies.append(e2e_energy)
|
||||
|
||||
# Get memory and utilization
|
||||
metrics = monitor.get_metrics()
|
||||
memory_usage.append(metrics.memory_used_gb)
|
||||
gpu_utils.append(metrics.gpu_utilization_percent)
|
||||
|
||||
# Compute aggregated metrics
|
||||
|
||||
# Prefill metrics (TTFT)
|
||||
prefill_duration_ms = sum(prefill_times)
|
||||
prefill_energy_j = sum(prefill_energies)
|
||||
prefill_tokens = prompt_length * num_requests
|
||||
prefill_tps = prefill_tokens / (prefill_duration_ms / 1000)
|
||||
prefill_ept = prefill_energy_j / prefill_tokens
|
||||
avg_ttft_ms = sum(prefill_times) / len(prefill_times)
|
||||
|
||||
prefill_metrics = StageMetrics(
|
||||
stage_name="prefill",
|
||||
duration_ms=prefill_duration_ms,
|
||||
tokens_processed=prefill_tokens,
|
||||
tokens_per_second=prefill_tps,
|
||||
energy_joules=prefill_energy_j,
|
||||
energy_per_token=prefill_ept,
|
||||
avg_power_watts=sum(prefill_powers) / len(prefill_powers),
|
||||
peak_memory_gb=max(memory_usage),
|
||||
avg_gpu_util_percent=sum(gpu_utils) / len(gpu_utils)
|
||||
)
|
||||
|
||||
# Decode metrics (ITL)
|
||||
decode_duration_ms = sum(decode_times)
|
||||
decode_energy_j = sum(decode_energies)
|
||||
decode_tokens = generation_length * num_requests
|
||||
decode_tps = decode_tokens / (decode_duration_ms / 1000)
|
||||
decode_ept = decode_energy_j / decode_tokens
|
||||
avg_itl_ms = sum(decode_times) / len(decode_times) / generation_length
|
||||
|
||||
decode_metrics = StageMetrics(
|
||||
stage_name="decode",
|
||||
duration_ms=decode_duration_ms,
|
||||
tokens_processed=decode_tokens,
|
||||
tokens_per_second=decode_tps,
|
||||
energy_joules=decode_energy_j,
|
||||
energy_per_token=decode_ept,
|
||||
avg_power_watts=sum(decode_powers) / len(decode_powers),
|
||||
peak_memory_gb=max(memory_usage),
|
||||
avg_gpu_util_percent=sum(gpu_utils) / len(gpu_utils)
|
||||
)
|
||||
|
||||
# End-to-end metrics
|
||||
e2e_latency_ms = sum(e2e_times) / len(e2e_times)
|
||||
e2e_energy_j = sum(e2e_energies)
|
||||
total_tokens = (prompt_length + generation_length) * num_requests
|
||||
e2e_tps = total_tokens / (sum(e2e_times) / 1000)
|
||||
e2e_ept = e2e_energy_j / total_tokens
|
||||
|
||||
# Create metrics object
|
||||
metrics = InferenceMetrics(
|
||||
model_name=model_name_or_path,
|
||||
gpu_name=gpu_name,
|
||||
attention_implementation=attn_implementation,
|
||||
num_requests=num_requests,
|
||||
prompt_length=prompt_length,
|
||||
generation_length=generation_length,
|
||||
prefill=prefill_metrics,
|
||||
decode=decode_metrics,
|
||||
e2e_latency_ms=e2e_latency_ms,
|
||||
e2e_tokens_per_second=e2e_tps,
|
||||
e2e_energy_joules=e2e_energy_j,
|
||||
e2e_energy_per_token=e2e_ept,
|
||||
ttft_ms=avg_ttft_ms,
|
||||
itl_ms=avg_itl_ms
|
||||
)
|
||||
|
||||
# Print results
|
||||
if verbose:
|
||||
print()
|
||||
MetricsReporter.print_inference_metrics(metrics, verbose=verbose)
|
||||
|
||||
# Save results
|
||||
if output_dir:
|
||||
output_path = Path(output_dir)
|
||||
output_path.mkdir(parents=True, exist_ok=True)
|
||||
|
||||
# Save JSON
|
||||
json_path = output_path / f"inference_{gpu_name.replace(' ', '_')}_{attn_implementation}.json"
|
||||
MetricsReporter.save_json(metrics, json_path)
|
||||
|
||||
# Cleanup
|
||||
monitor.cleanup()
|
||||
del model
|
||||
torch.cuda.empty_cache()
|
||||
|
||||
return metrics
|
||||
|
||||
|
||||
def main():
|
||||
parser = argparse.ArgumentParser(
|
||||
description="LLM Inference Benchmark",
|
||||
formatter_class=argparse.RawDescriptionHelpFormatter
|
||||
)
|
||||
|
||||
parser.add_argument(
|
||||
"--model-path",
|
||||
type=str,
|
||||
default="./model_cache",
|
||||
help="Path to cached model"
|
||||
)
|
||||
|
||||
parser.add_argument(
|
||||
"--model-name",
|
||||
type=str,
|
||||
default="Qwen/Qwen3-4B",
|
||||
help="Model name (for reporting)"
|
||||
)
|
||||
|
||||
parser.add_argument(
|
||||
"--attn-implementation",
|
||||
type=str,
|
||||
default="auto",
|
||||
choices=["auto", "flash_attention_2", "flash_attention_3_hopper", "sdpa", "eager"],
|
||||
help="Attention implementation to use"
|
||||
)
|
||||
|
||||
parser.add_argument(
|
||||
"--num-requests",
|
||||
type=int,
|
||||
default=10,
|
||||
help="Number of inference requests"
|
||||
)
|
||||
|
||||
parser.add_argument(
|
||||
"--prompt-length",
|
||||
type=int,
|
||||
default=512,
|
||||
help="Prompt length in tokens"
|
||||
)
|
||||
|
||||
parser.add_argument(
|
||||
"--generation-length",
|
||||
type=int,
|
||||
default=100,
|
||||
help="Number of tokens to generate"
|
||||
)
|
||||
|
||||
parser.add_argument(
|
||||
"--warmup-requests",
|
||||
type=int,
|
||||
default=2,
|
||||
help="Number of warmup requests"
|
||||
)
|
||||
|
||||
parser.add_argument(
|
||||
"--device-id",
|
||||
type=int,
|
||||
default=0,
|
||||
help="GPU device ID"
|
||||
)
|
||||
|
||||
parser.add_argument(
|
||||
"--output-dir",
|
||||
type=str,
|
||||
default="./results",
|
||||
help="Output directory for results"
|
||||
)
|
||||
|
||||
args = parser.parse_args()
|
||||
|
||||
# Set environment variables for HuggingFace cache
|
||||
if Path(args.model_path).exists():
|
||||
os.environ['HF_HOME'] = args.model_path
|
||||
|
||||
benchmark_inference(
|
||||
model_name_or_path=args.model_name,
|
||||
attn_implementation=args.attn_implementation,
|
||||
num_requests=args.num_requests,
|
||||
prompt_length=args.prompt_length,
|
||||
generation_length=args.generation_length,
|
||||
warmup_requests=args.warmup_requests,
|
||||
device="cuda",
|
||||
device_id=args.device_id,
|
||||
output_dir=args.output_dir,
|
||||
verbose=True
|
||||
)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
406
benchmark_pretrain.py
Executable file
406
benchmark_pretrain.py
Executable file
@@ -0,0 +1,406 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Pretraining Benchmark for LLM Performance Evaluation
|
||||
|
||||
Measures performance and energy metrics for pretraining workloads with
|
||||
separate measurements for forward, backward, and optimizer stages.
|
||||
"""
|
||||
|
||||
import argparse
|
||||
import os
|
||||
import sys
|
||||
import time
|
||||
from pathlib import Path
|
||||
from typing import Optional
|
||||
|
||||
import torch
|
||||
import torch.nn as nn
|
||||
from transformers import AutoModelForCausalLM, AutoTokenizer
|
||||
from tqdm import tqdm
|
||||
|
||||
# Add utils to path
|
||||
sys.path.insert(0, str(Path(__file__).parent))
|
||||
from utils.gpu_monitor import get_gpu_monitor
|
||||
from utils.metrics import StageMetrics, PretrainMetrics, MetricsReporter
|
||||
from utils.attention import get_default_attention, configure_model_attention, validate_attention_for_gpu
|
||||
|
||||
|
||||
def benchmark_pretrain(
|
||||
model_name_or_path: str,
|
||||
attn_implementation: str = "auto",
|
||||
batch_size: int = 8,
|
||||
sequence_length: int = 2048,
|
||||
num_steps: int = 10,
|
||||
warmup_steps: int = 3,
|
||||
device: str = "cuda",
|
||||
device_id: int = 0,
|
||||
output_dir: Optional[str] = None,
|
||||
verbose: bool = True,
|
||||
):
|
||||
"""
|
||||
Run pretraining benchmark.
|
||||
|
||||
Args:
|
||||
model_name_or_path: Path to model or HuggingFace identifier
|
||||
attn_implementation: Attention implementation to use
|
||||
batch_size: Batch size for training
|
||||
sequence_length: Sequence length
|
||||
num_steps: Number of training steps to measure
|
||||
warmup_steps: Number of warmup steps before measurement
|
||||
device: Device to use
|
||||
device_id: GPU device ID
|
||||
output_dir: Directory to save results
|
||||
verbose: Print verbose output
|
||||
"""
|
||||
print("=" * 80)
|
||||
print("PRETRAINING BENCHMARK")
|
||||
print("=" * 80)
|
||||
|
||||
# Initialize GPU monitor
|
||||
if verbose:
|
||||
print("\n[1/6] Initializing GPU monitor...")
|
||||
monitor = get_gpu_monitor(device_id)
|
||||
gpu_name = monitor.get_device_name()
|
||||
if verbose:
|
||||
print(f" GPU: {gpu_name}")
|
||||
|
||||
# Determine attention implementation
|
||||
if attn_implementation == "auto":
|
||||
attn_implementation = get_default_attention(gpu_name)
|
||||
if verbose:
|
||||
print(f" Auto-selected attention: {attn_implementation}")
|
||||
|
||||
# Validate attention for GPU
|
||||
valid, warning = validate_attention_for_gpu(attn_implementation, gpu_name)
|
||||
if warning and verbose:
|
||||
print(f" ⚠ {warning}")
|
||||
|
||||
# Load model
|
||||
if verbose:
|
||||
print(f"\n[2/6] Loading model: {model_name_or_path}")
|
||||
|
||||
# Determine attn_implementation parameter for model loading
|
||||
load_attn = "flash_attention_2" if attn_implementation in ["flash_attention_2", "flash_attention_3_hopper"] else attn_implementation
|
||||
|
||||
try:
|
||||
model = AutoModelForCausalLM.from_pretrained(
|
||||
model_name_or_path,
|
||||
torch_dtype=torch.bfloat16,
|
||||
attn_implementation=load_attn,
|
||||
trust_remote_code=True,
|
||||
)
|
||||
model = model.to(device)
|
||||
|
||||
# Configure attention (patch if needed for FA3)
|
||||
model = configure_model_attention(model, attn_implementation, verbose=verbose)
|
||||
|
||||
if verbose:
|
||||
total_params = sum(p.numel() for p in model.parameters())
|
||||
trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
|
||||
print(f" Total parameters: {total_params:,} ({total_params/1e9:.2f}B)")
|
||||
print(f" Trainable parameters: {trainable_params:,}")
|
||||
except Exception as e:
|
||||
print(f"✗ Error loading model: {e}")
|
||||
sys.exit(1)
|
||||
|
||||
# Setup optimizer
|
||||
if verbose:
|
||||
print(f"\n[3/6] Setting up optimizer...")
|
||||
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-4)
|
||||
|
||||
# Generate synthetic training data
|
||||
if verbose:
|
||||
print(f"\n[4/6] Generating synthetic training data...")
|
||||
print(f" Batch size: {batch_size}")
|
||||
print(f" Sequence length: {sequence_length}")
|
||||
|
||||
# Create random input_ids (synthetic data)
|
||||
vocab_size = model.config.vocab_size
|
||||
input_ids = torch.randint(0, vocab_size, (batch_size, sequence_length), device=device)
|
||||
labels = input_ids.clone()
|
||||
|
||||
# Warmup
|
||||
if verbose:
|
||||
print(f"\n[5/6] Running warmup ({warmup_steps} steps)...")
|
||||
model.train()
|
||||
for _ in range(warmup_steps):
|
||||
optimizer.zero_grad()
|
||||
outputs = model(input_ids=input_ids, labels=labels)
|
||||
loss = outputs.loss
|
||||
loss.backward()
|
||||
optimizer.step()
|
||||
|
||||
# Synchronize before benchmarking
|
||||
torch.cuda.synchronize()
|
||||
|
||||
# Benchmark
|
||||
if verbose:
|
||||
print(f"\n[6/6] Running benchmark ({num_steps} steps)...")
|
||||
|
||||
# Storage for per-step metrics
|
||||
forward_times = []
|
||||
backward_times = []
|
||||
optimizer_times = []
|
||||
|
||||
forward_energies = []
|
||||
backward_energies = []
|
||||
optimizer_energies = []
|
||||
|
||||
forward_powers = []
|
||||
backward_powers = []
|
||||
optimizer_powers = []
|
||||
|
||||
memory_usage = []
|
||||
gpu_utils = []
|
||||
|
||||
total_tokens = batch_size * sequence_length * num_steps
|
||||
|
||||
for step in tqdm(range(num_steps), desc="Benchmarking"):
|
||||
# === FORWARD PASS ===
|
||||
monitor.start_monitoring()
|
||||
torch.cuda.synchronize()
|
||||
start_time = time.perf_counter()
|
||||
|
||||
optimizer.zero_grad()
|
||||
outputs = model(input_ids=input_ids, labels=labels)
|
||||
loss = outputs.loss
|
||||
|
||||
torch.cuda.synchronize()
|
||||
forward_time = time.perf_counter() - start_time
|
||||
forward_energy = monitor.get_energy_consumed()
|
||||
forward_power = monitor.get_average_power()
|
||||
|
||||
forward_times.append(forward_time * 1000) # Convert to ms
|
||||
forward_energies.append(forward_energy)
|
||||
forward_powers.append(forward_power)
|
||||
|
||||
# === BACKWARD PASS ===
|
||||
monitor.start_monitoring()
|
||||
torch.cuda.synchronize()
|
||||
start_time = time.perf_counter()
|
||||
|
||||
loss.backward()
|
||||
|
||||
torch.cuda.synchronize()
|
||||
backward_time = time.perf_counter() - start_time
|
||||
backward_energy = monitor.get_energy_consumed()
|
||||
backward_power = monitor.get_average_power()
|
||||
|
||||
backward_times.append(backward_time * 1000) # Convert to ms
|
||||
backward_energies.append(backward_energy)
|
||||
backward_powers.append(backward_power)
|
||||
|
||||
# === OPTIMIZER STEP ===
|
||||
monitor.start_monitoring()
|
||||
torch.cuda.synchronize()
|
||||
start_time = time.perf_counter()
|
||||
|
||||
optimizer.step()
|
||||
|
||||
torch.cuda.synchronize()
|
||||
optimizer_time = time.perf_counter() - start_time
|
||||
optimizer_energy = monitor.get_energy_consumed()
|
||||
optimizer_power = monitor.get_average_power()
|
||||
|
||||
optimizer_times.append(optimizer_time * 1000) # Convert to ms
|
||||
optimizer_energies.append(optimizer_energy)
|
||||
optimizer_powers.append(optimizer_power)
|
||||
|
||||
# Get memory and utilization
|
||||
metrics = monitor.get_metrics()
|
||||
memory_usage.append(metrics.memory_used_gb)
|
||||
gpu_utils.append(metrics.gpu_utilization_percent)
|
||||
|
||||
# Compute aggregated metrics
|
||||
tokens_per_step = batch_size * sequence_length
|
||||
|
||||
# Forward metrics
|
||||
forward_duration_ms = sum(forward_times)
|
||||
forward_energy_j = sum(forward_energies)
|
||||
forward_tokens = tokens_per_step * num_steps
|
||||
forward_tps = forward_tokens / (forward_duration_ms / 1000)
|
||||
forward_ept = forward_energy_j / forward_tokens
|
||||
forward_metrics = StageMetrics(
|
||||
stage_name="forward",
|
||||
duration_ms=forward_duration_ms,
|
||||
tokens_processed=forward_tokens,
|
||||
tokens_per_second=forward_tps,
|
||||
energy_joules=forward_energy_j,
|
||||
energy_per_token=forward_ept,
|
||||
avg_power_watts=sum(forward_powers) / len(forward_powers),
|
||||
peak_memory_gb=max(memory_usage),
|
||||
avg_gpu_util_percent=sum(gpu_utils) / len(gpu_utils)
|
||||
)
|
||||
|
||||
# Backward metrics
|
||||
backward_duration_ms = sum(backward_times)
|
||||
backward_energy_j = sum(backward_energies)
|
||||
backward_tokens = tokens_per_step * num_steps
|
||||
backward_tps = backward_tokens / (backward_duration_ms / 1000)
|
||||
backward_ept = backward_energy_j / backward_tokens
|
||||
backward_metrics = StageMetrics(
|
||||
stage_name="backward",
|
||||
duration_ms=backward_duration_ms,
|
||||
tokens_processed=backward_tokens,
|
||||
tokens_per_second=backward_tps,
|
||||
energy_joules=backward_energy_j,
|
||||
energy_per_token=backward_ept,
|
||||
avg_power_watts=sum(backward_powers) / len(backward_powers),
|
||||
peak_memory_gb=max(memory_usage),
|
||||
avg_gpu_util_percent=sum(gpu_utils) / len(gpu_utils)
|
||||
)
|
||||
|
||||
# Optimizer metrics
|
||||
optimizer_duration_ms = sum(optimizer_times)
|
||||
optimizer_energy_j = sum(optimizer_energies)
|
||||
optimizer_tokens = tokens_per_step * num_steps
|
||||
optimizer_tps = optimizer_tokens / (optimizer_duration_ms / 1000)
|
||||
optimizer_ept = optimizer_energy_j / optimizer_tokens
|
||||
optimizer_metrics = StageMetrics(
|
||||
stage_name="optimizer",
|
||||
duration_ms=optimizer_duration_ms,
|
||||
tokens_processed=optimizer_tokens,
|
||||
tokens_per_second=optimizer_tps,
|
||||
energy_joules=optimizer_energy_j,
|
||||
energy_per_token=optimizer_ept,
|
||||
avg_power_watts=sum(optimizer_powers) / len(optimizer_powers),
|
||||
peak_memory_gb=max(memory_usage),
|
||||
avg_gpu_util_percent=sum(gpu_utils) / len(gpu_utils)
|
||||
)
|
||||
|
||||
# Overall metrics
|
||||
total_duration_ms = forward_duration_ms + backward_duration_ms + optimizer_duration_ms
|
||||
total_energy_j = forward_energy_j + backward_energy_j + optimizer_energy_j
|
||||
total_tps = total_tokens / (total_duration_ms / 1000)
|
||||
total_ept = total_energy_j / total_tokens
|
||||
|
||||
# Create metrics object
|
||||
metrics = PretrainMetrics(
|
||||
model_name=model_name_or_path,
|
||||
gpu_name=gpu_name,
|
||||
attention_implementation=attn_implementation,
|
||||
batch_size=batch_size,
|
||||
sequence_length=sequence_length,
|
||||
num_steps=num_steps,
|
||||
forward=forward_metrics,
|
||||
backward=backward_metrics,
|
||||
optimizer=optimizer_metrics,
|
||||
total_duration_ms=total_duration_ms,
|
||||
total_tokens=total_tokens,
|
||||
total_tokens_per_second=total_tps,
|
||||
total_energy_joules=total_energy_j,
|
||||
total_energy_per_token=total_ept
|
||||
)
|
||||
|
||||
# Print results
|
||||
MetricsReporter.print_pretrain_metrics(metrics, verbose=verbose)
|
||||
|
||||
# Save results
|
||||
if output_dir:
|
||||
output_path = Path(output_dir)
|
||||
output_path.mkdir(parents=True, exist_ok=True)
|
||||
|
||||
# Save JSON
|
||||
json_path = output_path / f"pretrain_{gpu_name.replace(' ', '_')}_{attn_implementation}.json"
|
||||
MetricsReporter.save_json(metrics, json_path)
|
||||
|
||||
# Cleanup
|
||||
monitor.cleanup()
|
||||
del model
|
||||
torch.cuda.empty_cache()
|
||||
|
||||
return metrics
|
||||
|
||||
|
||||
def main():
|
||||
parser = argparse.ArgumentParser(
|
||||
description="LLM Pretraining Benchmark",
|
||||
formatter_class=argparse.RawDescriptionHelpFormatter
|
||||
)
|
||||
|
||||
parser.add_argument(
|
||||
"--model-path",
|
||||
type=str,
|
||||
default="./model_cache",
|
||||
help="Path to cached model"
|
||||
)
|
||||
|
||||
parser.add_argument(
|
||||
"--model-name",
|
||||
type=str,
|
||||
default="Qwen/Qwen3-4B",
|
||||
help="Model name (for reporting)"
|
||||
)
|
||||
|
||||
parser.add_argument(
|
||||
"--attn-implementation",
|
||||
type=str,
|
||||
default="auto",
|
||||
choices=["auto", "flash_attention_2", "flash_attention_3_hopper", "sdpa", "eager"],
|
||||
help="Attention implementation to use"
|
||||
)
|
||||
|
||||
parser.add_argument(
|
||||
"--batch-size",
|
||||
type=int,
|
||||
default=8,
|
||||
help="Batch size"
|
||||
)
|
||||
|
||||
parser.add_argument(
|
||||
"--sequence-length",
|
||||
type=int,
|
||||
default=8192,
|
||||
help="Sequence length"
|
||||
)
|
||||
|
||||
parser.add_argument(
|
||||
"--num-steps",
|
||||
type=int,
|
||||
default=10,
|
||||
help="Number of training steps"
|
||||
)
|
||||
|
||||
parser.add_argument(
|
||||
"--warmup-steps",
|
||||
type=int,
|
||||
default=3,
|
||||
help="Number of warmup steps"
|
||||
)
|
||||
|
||||
parser.add_argument(
|
||||
"--device-id",
|
||||
type=int,
|
||||
default=0,
|
||||
help="GPU device ID"
|
||||
)
|
||||
|
||||
parser.add_argument(
|
||||
"--output-dir",
|
||||
type=str,
|
||||
default="./results",
|
||||
help="Output directory for results"
|
||||
)
|
||||
|
||||
args = parser.parse_args()
|
||||
|
||||
# Set environment variables for HuggingFace cache
|
||||
if Path(args.model_path).exists():
|
||||
os.environ['HF_HOME'] = args.model_path
|
||||
|
||||
benchmark_pretrain(
|
||||
model_name_or_path=args.model_name,
|
||||
attn_implementation=args.attn_implementation,
|
||||
batch_size=args.batch_size,
|
||||
sequence_length=args.sequence_length,
|
||||
num_steps=args.num_steps,
|
||||
warmup_steps=args.warmup_steps,
|
||||
device="cuda",
|
||||
device_id=args.device_id,
|
||||
output_dir=args.output_dir,
|
||||
verbose=True
|
||||
)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
151
cache_model.py
Executable file
151
cache_model.py
Executable file
@@ -0,0 +1,151 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Model Caching Script for LLM Benchmarking
|
||||
|
||||
This script downloads and caches the Qwen3-4B model from HuggingFace
|
||||
before running benchmarks on offline compute nodes.
|
||||
"""
|
||||
|
||||
import argparse
|
||||
import os
|
||||
import sys
|
||||
from pathlib import Path
|
||||
|
||||
def cache_model(model_name: str, cache_dir: str, force: bool = False):
|
||||
"""
|
||||
Download and cache a HuggingFace model.
|
||||
|
||||
Args:
|
||||
model_name: HuggingFace model identifier (e.g., "Qwen/Qwen3-4B-Instruct-2507")
|
||||
cache_dir: Local directory to cache the model
|
||||
force: Force re-download even if model exists
|
||||
"""
|
||||
try:
|
||||
from transformers import AutoTokenizer, AutoModelForCausalLM, AutoConfig
|
||||
except ImportError:
|
||||
print("Error: transformers library not found. Please install it:")
|
||||
print(" pip install transformers")
|
||||
sys.exit(1)
|
||||
|
||||
# Create cache directory
|
||||
cache_path = Path(cache_dir).resolve()
|
||||
cache_path.mkdir(parents=True, exist_ok=True)
|
||||
|
||||
print(f"Caching model: {model_name}")
|
||||
print(f"Cache directory: {cache_path}")
|
||||
print("-" * 60)
|
||||
|
||||
# Set HuggingFace cache directory
|
||||
os.environ['HF_HOME'] = str(cache_path)
|
||||
|
||||
# Check if model already exists
|
||||
model_path = cache_path / model_name.replace("/", "--")
|
||||
if model_path.exists() and not force:
|
||||
print(f"Model already cached at: {model_path}")
|
||||
print("Use --force to re-download")
|
||||
return str(cache_path)
|
||||
|
||||
try:
|
||||
# Download config
|
||||
print("\n[1/3] Downloading model config...")
|
||||
config = AutoConfig.from_pretrained(
|
||||
model_name,
|
||||
cache_dir=cache_path,
|
||||
trust_remote_code=True
|
||||
)
|
||||
print(f" ✓ Config downloaded")
|
||||
print(f" - Model type: {config.model_type}")
|
||||
print(f" - Hidden size: {config.hidden_size}")
|
||||
print(f" - Num layers: {config.num_hidden_layers}")
|
||||
print(f" - Num attention heads: {config.num_attention_heads}")
|
||||
|
||||
# Download tokenizer
|
||||
print("\n[2/3] Downloading tokenizer...")
|
||||
tokenizer = AutoTokenizer.from_pretrained(
|
||||
model_name,
|
||||
cache_dir=cache_path,
|
||||
trust_remote_code=True
|
||||
)
|
||||
print(f" ✓ Tokenizer downloaded")
|
||||
print(f" - Vocab size: {len(tokenizer)}")
|
||||
print(f" - Model max length: {tokenizer.model_max_length}")
|
||||
|
||||
# Download model weights
|
||||
print("\n[3/3] Downloading model weights...")
|
||||
print(" (This may take several minutes depending on connection speed)")
|
||||
model = AutoModelForCausalLM.from_pretrained(
|
||||
model_name,
|
||||
cache_dir=cache_path,
|
||||
trust_remote_code=True,
|
||||
torch_dtype="auto",
|
||||
low_cpu_mem_usage=True
|
||||
)
|
||||
print(f" ✓ Model weights downloaded")
|
||||
|
||||
# Calculate total parameters
|
||||
total_params = sum(p.numel() for p in model.parameters())
|
||||
print(f" - Total parameters: {total_params:,} ({total_params/1e9:.2f}B)")
|
||||
|
||||
# Clean up model from memory
|
||||
del model
|
||||
|
||||
print("\n" + "=" * 60)
|
||||
print("✓ Model successfully cached!")
|
||||
print("=" * 60)
|
||||
print(f"\nCache location: {cache_path}")
|
||||
print(f"\nTo use in benchmarks, set:")
|
||||
print(f" --model-path {cache_path}")
|
||||
print(f"\nOr set environment variable:")
|
||||
print(f" export HF_HOME={cache_path}")
|
||||
|
||||
return str(cache_path)
|
||||
|
||||
except Exception as e:
|
||||
print(f"\n✗ Error downloading model: {e}", file=sys.stderr)
|
||||
sys.exit(1)
|
||||
|
||||
|
||||
def main():
|
||||
parser = argparse.ArgumentParser(
|
||||
description="Cache HuggingFace model for offline use",
|
||||
formatter_class=argparse.RawDescriptionHelpFormatter,
|
||||
epilog="""
|
||||
Examples:
|
||||
# Cache model to default location
|
||||
python cache_model.py
|
||||
|
||||
# Cache model to custom directory
|
||||
python cache_model.py --cache-dir /path/to/cache
|
||||
|
||||
# Force re-download
|
||||
python cache_model.py --force
|
||||
"""
|
||||
)
|
||||
|
||||
parser.add_argument(
|
||||
"--model-name",
|
||||
type=str,
|
||||
default="Qwen/Qwen3-4B",
|
||||
help="HuggingFace model identifier (default: Qwen/Qwen3-4B)"
|
||||
)
|
||||
|
||||
parser.add_argument(
|
||||
"--cache-dir",
|
||||
type=str,
|
||||
default="./model_cache",
|
||||
help="Directory to cache model (default: ./model_cache in current directory)"
|
||||
)
|
||||
|
||||
parser.add_argument(
|
||||
"--force",
|
||||
action="store_true",
|
||||
help="Force re-download even if model exists"
|
||||
)
|
||||
|
||||
args = parser.parse_args()
|
||||
|
||||
cache_model(args.model_name, args.cache_dir, args.force)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
26
configs/a100.yaml
Normal file
26
configs/a100.yaml
Normal file
@@ -0,0 +1,26 @@
|
||||
# A100 Configuration
|
||||
gpu_type: a100
|
||||
gpu_model: "NVIDIA A100 80GB"
|
||||
|
||||
# Default attention implementation
|
||||
default_attention: flash_attention_2
|
||||
|
||||
# Pretraining defaults
|
||||
pretrain:
|
||||
batch_size: 8
|
||||
sequence_length: 8192
|
||||
num_steps: 10
|
||||
warmup_steps: 3
|
||||
|
||||
# Inference defaults
|
||||
inference:
|
||||
num_requests: 10
|
||||
prompt_length: 512
|
||||
generation_length: 100
|
||||
warmup_requests: 2
|
||||
|
||||
# Hardware specs (for reference)
|
||||
hardware:
|
||||
memory_gb: 80
|
||||
tdp_watts: 400
|
||||
compute_capability: "8.0"
|
||||
26
configs/h100.yaml
Normal file
@@ -0,0 +1,26 @@
|
||||
# H100 Configuration
|
||||
gpu_type: h100
|
||||
gpu_model: "NVIDIA H100 80GB"
|
||||
|
||||
# Default attention implementation
|
||||
default_attention: flash_attention_3_hopper
|
||||
|
||||
# Pretraining defaults
|
||||
pretrain:
|
||||
batch_size: 8
|
||||
sequence_length: 8192
|
||||
num_steps: 10
|
||||
warmup_steps: 3
|
||||
|
||||
# Inference defaults
|
||||
inference:
|
||||
num_requests: 10
|
||||
prompt_length: 512
|
||||
generation_length: 100
|
||||
warmup_requests: 2
|
||||
|
||||
# Hardware specs (for reference)
|
||||
hardware:
|
||||
memory_gb: 80
|
||||
tdp_watts: 700
|
||||
compute_capability: "9.0"
|
||||
26
configs/h200.yaml
Normal file
@@ -0,0 +1,26 @@
|
||||
# H200 Configuration
|
||||
gpu_type: h200
|
||||
gpu_model: "NVIDIA H200 141GB"
|
||||
|
||||
# Default attention implementation
|
||||
default_attention: flash_attention_3_hopper
|
||||
|
||||
# Pretraining defaults
|
||||
pretrain:
|
||||
batch_size: 8
|
||||
sequence_length: 8192
|
||||
num_steps: 10
|
||||
warmup_steps: 3
|
||||
|
||||
# Inference defaults
|
||||
inference:
|
||||
num_requests: 10
|
||||
prompt_length: 512
|
||||
generation_length: 100
|
||||
warmup_requests: 2
|
||||
|
||||
# Hardware specs (for reference)
|
||||
hardware:
|
||||
memory_gb: 141
|
||||
tdp_watts: 700
|
||||
compute_capability: "9.0"
|
||||
26
configs/mi300x.yaml
Normal file
@@ -0,0 +1,26 @@
|
||||
# MI300X Configuration
|
||||
gpu_type: mi300x
|
||||
gpu_model: "AMD Instinct MI300X"
|
||||
|
||||
# Default attention implementation
|
||||
default_attention: flash_attention_2
|
||||
|
||||
# Pretraining defaults
|
||||
pretrain:
|
||||
batch_size: 8
|
||||
sequence_length: 8192
|
||||
num_steps: 10
|
||||
warmup_steps: 3
|
||||
|
||||
# Inference defaults
|
||||
inference:
|
||||
num_requests: 10
|
||||
prompt_length: 512
|
||||
generation_length: 100
|
||||
warmup_requests: 2
|
||||
|
||||
# Hardware specs (for reference)
|
||||
hardware:
|
||||
memory_gb: 192
|
||||
tdp_watts: 750
|
||||
compute_capability: "gfx940"
|
||||
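The four GPU configs above share one schema (default_attention, pretrain, inference, hardware). A short sketch of how such a file could be read with PyYAML, which is already listed in requirements.txt; the loader function itself is illustrative and not shown in this commit:

# Hedged sketch: read a GPU config and pull out its defaults.
import yaml

def load_gpu_config(path: str) -> dict:
    with open(path) as f:
        return yaml.safe_load(f)

cfg = load_gpu_config("configs/a100.yaml")
print(cfg["default_attention"])       # flash_attention_2
print(cfg["pretrain"]["batch_size"])  # 8
print(cfg["hardware"]["memory_gb"])   # 80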
122
quick_start.sh
Executable file
@@ -0,0 +1,122 @@
|
||||
#!/bin/bash
|
||||
# Quick Start Script for LLM Benchmark Suite
|
||||
#
|
||||
# This script helps you get started quickly with the benchmark suite.
|
||||
# It will:
|
||||
# 1. Check dependencies
|
||||
# 2. Cache the model if needed
|
||||
# 3. Run a quick test benchmark
|
||||
#
|
||||
# Usage: ./quick_start.sh [--skip-cache]
|
||||
|
||||
set -e # Exit on error
|
||||
|
||||
echo "========================================="
|
||||
echo "LLM Benchmark Suite - Quick Start"
|
||||
echo "========================================="
|
||||
|
||||
# Parse arguments
|
||||
SKIP_CACHE=false
|
||||
if [[ "$1" == "--skip-cache" ]]; then
|
||||
SKIP_CACHE=true
|
||||
fi
|
||||
|
||||
# Check Python
|
||||
echo ""
|
||||
echo "[1/5] Checking Python..."
|
||||
if ! command -v python &> /dev/null; then
|
||||
echo "✗ Python not found. Please install Python 3.8+"
|
||||
exit 1
|
||||
fi
|
||||
PYTHON_VERSION=$(python --version 2>&1 | awk '{print $2}')
|
||||
echo " ✓ Python $PYTHON_VERSION found"
|
||||
|
||||
# Check dependencies
|
||||
echo ""
|
||||
echo "[2/5] Checking dependencies..."
|
||||
MISSING_DEPS=()
|
||||
|
||||
if ! python -c "import torch" 2>/dev/null; then
|
||||
MISSING_DEPS+=("torch")
|
||||
fi
|
||||
|
||||
if ! python -c "import transformers" 2>/dev/null; then
|
||||
MISSING_DEPS+=("transformers")
|
||||
fi
|
||||
|
||||
if [ "${#MISSING_DEPS[@]}" -gt 0 ]; then
|
||||
echo " ⚠ Missing dependencies: ${MISSING_DEPS[*]}"
|
||||
echo " Installing dependencies..."
|
||||
pip install -r requirements.txt
|
||||
else
|
||||
echo " ✓ All dependencies installed"
|
||||
fi
|
||||
|
||||
# Check GPU
|
||||
echo ""
|
||||
echo "[3/5] Checking GPU..."
|
||||
if python -c "import torch; assert torch.cuda.is_available()" 2>/dev/null; then
|
||||
GPU_NAME=$(python -c "import torch; print(torch.cuda.get_device_name(0))")
|
||||
echo " ✓ GPU found: $GPU_NAME"
|
||||
else
|
||||
echo " ✗ No GPU found or CUDA not available"
|
||||
echo " This benchmark requires a GPU to run."
|
||||
exit 1
|
||||
fi
|
||||
|
||||
# Cache model
|
||||
if [ "$SKIP_CACHE" = false ]; then
|
||||
echo ""
|
||||
echo "[4/5] Caching model..."
|
||||
if [ -d "./model_cache" ] && [ "$(ls -A ./model_cache)" ]; then
|
||||
echo " ✓ Model cache already exists at ./model_cache"
|
||||
echo " To re-download, remove the directory and run again."
|
||||
else
|
||||
echo " Downloading Qwen/Qwen3-4B..."
|
||||
echo " (This may take several minutes depending on your connection)"
|
||||
python cache_model.py --model-name Qwen/Qwen3-4B --cache-dir ./model_cache
|
||||
fi
|
||||
else
|
||||
echo ""
|
||||
echo "[4/5] Skipping model cache (--skip-cache specified)"
|
||||
fi
|
||||
|
||||
# Run quick test
|
||||
echo ""
|
||||
echo "[5/5] Running quick test benchmark..."
|
||||
echo " This will run a minimal benchmark to verify everything works."
|
||||
echo " Parameters: 2 steps, batch size 2, sequence length 512"
|
||||
echo ""
|
||||
|
||||
python run_benchmark.py \
|
||||
--mode both \
|
||||
--model-path ./model_cache \
|
||||
--model-name Qwen/Qwen3-4B \
|
||||
--batch-size 2 \
|
||||
--sequence-length 512 \
|
||||
--num-steps 2 \
|
||||
--num-requests 2 \
|
||||
--prompt-length 256 \
|
||||
--generation-length 20 \
|
||||
--output-dir ./results/test
|
||||
|
||||
echo ""
|
||||
echo "========================================="
|
||||
echo "Quick Start Complete!"
|
||||
echo "========================================="
|
||||
echo ""
|
||||
echo "Next steps:"
|
||||
echo " 1. Run full benchmarks:"
|
||||
echo " python run_benchmark.py --mode both"
|
||||
echo ""
|
||||
echo " 2. Run on different GPUs using SLURM:"
|
||||
echo " sbatch slurm_a100.sh"
|
||||
echo " sbatch slurm_h100.sh"
|
||||
echo " sbatch slurm_h200.sh"
|
||||
echo " sbatch slurm_mi300x.sh"
|
||||
echo ""
|
||||
echo " 3. View results:"
|
||||
echo " ls -l results/"
|
||||
echo ""
|
||||
echo "For more information, see README.md"
|
||||
echo ""
|
||||
22
requirements.txt
Normal file
@@ -0,0 +1,22 @@
|
||||
# LLM Benchmark Suite - Requirements
|
||||
|
||||
# Core dependencies
|
||||
torch>=2.0.0
|
||||
transformers>=4.35.0
|
||||
accelerate>=0.24.0
|
||||
tokenizers>=0.14.0
|
||||
|
||||
# Attention implementations
|
||||
flash-attn>=2.0.0
|
||||
|
||||
# GPU monitoring
|
||||
pynvml>=11.5.0 # NVIDIA GPU monitoring
|
||||
pyrsmi>=1.0.0 # AMD GPU monitoring
|
||||
|
||||
# Utilities
|
||||
numpy>=1.24.0
|
||||
pyyaml>=6.0
|
||||
tqdm>=4.65.0
|
||||
|
||||
# Optional: for better performance
|
||||
triton>=2.0.0
|
||||
@@ -0,0 +1,37 @@
|
||||
{
|
||||
"model_name": "Qwen/Qwen3-4B",
|
||||
"gpu_name": "NVIDIA A100-SXM4-80GB",
|
||||
"attention_implementation": "flash_attention_2",
|
||||
"num_requests": 10,
|
||||
"prompt_length": 512,
|
||||
"generation_length": 100,
|
||||
"prefill": {
|
||||
"stage_name": "prefill",
|
||||
"duration_ms": 475.62581300735474,
|
||||
"tokens_processed": 5120,
|
||||
"tokens_per_second": 10764.76477932628,
|
||||
"energy_joules": 21.409000039100647,
|
||||
"energy_per_token": 0.004181445320136845,
|
||||
"avg_power_watts": 68.91171083870925,
|
||||
"peak_memory_gb": 45.87115478515625,
|
||||
"avg_gpu_util_percent": 38.1
|
||||
},
|
||||
"decode": {
|
||||
"stage_name": "decode",
|
||||
"duration_ms": 41460.768724791706,
|
||||
"tokens_processed": 1000,
|
||||
"tokens_per_second": 24.119186179055195,
|
||||
"energy_joules": 4684.697999954224,
|
||||
"energy_per_token": 4.684697999954223,
|
||||
"avg_power_watts": 112.85507087682042,
|
||||
"peak_memory_gb": 45.87115478515625,
|
||||
"avg_gpu_util_percent": 38.1
|
||||
},
|
||||
"e2e_latency_ms": 4193.639453779906,
|
||||
"e2e_tokens_per_second": 145.93529242204605,
|
||||
"e2e_energy_joules": 4706.106999993324,
|
||||
"e2e_energy_per_token": 0.768971732025053,
|
||||
"ttft_ms": 47.562581300735474,
|
||||
"itl_ms": 41.460768724791706,
|
||||
"timestamp": 1768519487.5402663
|
||||
}
|
||||
@@ -0,0 +1,47 @@
|
||||
{
|
||||
"model_name": "Qwen/Qwen3-4B",
|
||||
"gpu_name": "NVIDIA A100-SXM4-80GB",
|
||||
"attention_implementation": "flash_attention_2",
|
||||
"batch_size": 3,
|
||||
"sequence_length": 2048,
|
||||
"num_steps": 10,
|
||||
"forward": {
|
||||
"stage_name": "forward",
|
||||
"duration_ms": 3359.0412912890315,
|
||||
"tokens_processed": 61440,
|
||||
"tokens_per_second": 18290.933237210196,
|
||||
"energy_joules": 1292.2280000448227,
|
||||
"energy_per_token": 0.021032356771562868,
|
||||
"avg_power_watts": 387.19580415542595,
|
||||
"peak_memory_gb": 79.66021728515625,
|
||||
"avg_gpu_util_percent": 97.8
|
||||
},
|
||||
"backward": {
|
||||
"stage_name": "backward",
|
||||
"duration_ms": 6954.944152384996,
|
||||
"tokens_processed": 61440,
|
||||
"tokens_per_second": 8834.003358449821,
|
||||
"energy_joules": 2729.588000059128,
|
||||
"energy_per_token": 0.0444268880217957,
|
||||
"avg_power_watts": 394.24766095856324,
|
||||
"peak_memory_gb": 79.66021728515625,
|
||||
"avg_gpu_util_percent": 97.8
|
||||
},
|
||||
"optimizer": {
|
||||
"stage_name": "optimizer",
|
||||
"duration_ms": 1153.845101594925,
|
||||
"tokens_processed": 61440,
|
||||
"tokens_per_second": 53248.048559614595,
|
||||
"energy_joules": 362.6529998779297,
|
||||
"energy_per_token": 0.005902555336554845,
|
||||
"avg_power_watts": 299.1223537953503,
|
||||
"peak_memory_gb": 79.66021728515625,
|
||||
"avg_gpu_util_percent": 97.8
|
||||
},
|
||||
"total_duration_ms": 11467.830545268953,
|
||||
"total_tokens": 61440,
|
||||
"total_tokens_per_second": 5357.595733340081,
|
||||
"total_energy_joules": 4384.46899998188,
|
||||
"total_energy_per_token": 0.07136180012991342,
|
||||
"timestamp": 1768519431.5985208
|
||||
}
|
||||
@@ -0,0 +1,37 @@
|
||||
{
|
||||
"model_name": "Qwen/Qwen3-4B",
|
||||
"gpu_name": "NVIDIA H100",
|
||||
"attention_implementation": "flash_attention_3_hopper",
|
||||
"num_requests": 10,
|
||||
"prompt_length": 512,
|
||||
"generation_length": 100,
|
||||
"prefill": {
|
||||
"stage_name": "prefill",
|
||||
"duration_ms": 323.99015384726226,
|
||||
"tokens_processed": 5120,
|
||||
"tokens_per_second": 15802.949377324925,
|
||||
"energy_joules": 17.092000007629395,
|
||||
"energy_per_token": 0.0033382812514901163,
|
||||
"avg_power_watts": 93.64442380045372,
|
||||
"peak_memory_gb": 46.02825927734375,
|
||||
"avg_gpu_util_percent": 40.0
|
||||
},
|
||||
"decode": {
|
||||
"stage_name": "decode",
|
||||
"duration_ms": 30513.75844143331,
|
||||
"tokens_processed": 1000,
|
||||
"tokens_per_second": 32.772101867403634,
|
||||
"energy_joules": 4915.5139999985695,
|
||||
"energy_per_token": 4.915513999998569,
|
||||
"avg_power_watts": 161.199160874206,
|
||||
"peak_memory_gb": 46.02825927734375,
|
||||
"avg_gpu_util_percent": 40.0
|
||||
},
|
||||
"e2e_latency_ms": 3083.7748595280573,
|
||||
"e2e_tokens_per_second": 198.4580677506596,
|
||||
"e2e_energy_joules": 4932.606000006199,
|
||||
"e2e_energy_per_token": 0.8059813725500325,
|
||||
"ttft_ms": 32.399015384726226,
|
||||
"itl_ms": 30.51375844143331,
|
||||
"timestamp": 1768541839.3186588
|
||||
}
|
||||
@@ -0,0 +1,47 @@
|
||||
{
|
||||
"model_name": "Qwen/Qwen3-4B",
|
||||
"gpu_name": "NVIDIA H100",
|
||||
"attention_implementation": "flash_attention_3_hopper",
|
||||
"batch_size": 3,
|
||||
"sequence_length": 2048,
|
||||
"num_steps": 10,
|
||||
"forward": {
|
||||
"stage_name": "forward",
|
||||
"duration_ms": 1748.5067250672728,
|
||||
"tokens_processed": 61440,
|
||||
"tokens_per_second": 35138.55515633555,
|
||||
"energy_joules": 946.9269999563694,
|
||||
"energy_per_token": 0.015412223306581534,
|
||||
"avg_power_watts": 501.76439870614394,
|
||||
"peak_memory_gb": 76.45208740234375,
|
||||
"avg_gpu_util_percent": 97.0
|
||||
},
|
||||
"backward": {
|
||||
"stage_name": "backward",
|
||||
"duration_ms": 3761.718863155693,
|
||||
"tokens_processed": 61440,
|
||||
"tokens_per_second": 16332.959010248362,
|
||||
"energy_joules": 1904.104000031948,
|
||||
"energy_per_token": 0.030991276042186655,
|
||||
"avg_power_watts": 491.250130606127,
|
||||
"peak_memory_gb": 76.45208740234375,
|
||||
"avg_gpu_util_percent": 97.0
|
||||
},
|
||||
"optimizer": {
|
||||
"stage_name": "optimizer",
|
||||
"duration_ms": 896.0564862936735,
|
||||
"tokens_processed": 61440,
|
||||
"tokens_per_second": 68567.1059133025,
|
||||
"energy_joules": 349.722000002861,
|
||||
"energy_per_token": 0.0056920898437965665,
|
||||
"avg_power_watts": 356.92130879075387,
|
||||
"peak_memory_gb": 76.45208740234375,
|
||||
"avg_gpu_util_percent": 97.0
|
||||
},
|
||||
"total_duration_ms": 6406.282074516639,
|
||||
"total_tokens": 61440,
|
||||
"total_tokens_per_second": 9590.586128637759,
|
||||
"total_energy_joules": 3200.7529999911785,
|
||||
"total_energy_per_token": 0.052095589192564754,
|
||||
"timestamp": 1768541796.4011748
|
||||
}
|
||||
37
results/h100_sdpa/inference_NVIDIA_H100_sdpa.json
Normal file
@@ -0,0 +1,37 @@
|
||||
{
|
||||
"model_name": "Qwen/Qwen3-4B",
|
||||
"gpu_name": "NVIDIA H100",
|
||||
"attention_implementation": "sdpa",
|
||||
"num_requests": 10,
|
||||
"prompt_length": 512,
|
||||
"generation_length": 100,
|
||||
"prefill": {
|
||||
"stage_name": "prefill",
|
||||
"duration_ms": 253.97859653458,
|
||||
"tokens_processed": 5120,
|
||||
"tokens_per_second": 20159.179040517676,
|
||||
"energy_joules": 0.0,
|
||||
"energy_per_token": 0.0,
|
||||
"avg_power_watts": 0.0,
|
||||
"peak_memory_gb": 46.01458740234375,
|
||||
"avg_gpu_util_percent": 48.8
|
||||
},
|
||||
"decode": {
|
||||
"stage_name": "decode",
|
||||
"duration_ms": 23519.252635538578,
|
||||
"tokens_processed": 1000,
|
||||
"tokens_per_second": 42.51835785330007,
|
||||
"energy_joules": 4544.901999980211,
|
||||
"energy_per_token": 4.544901999980211,
|
||||
"avg_power_watts": 192.5432634001641,
|
||||
"peak_memory_gb": 46.01458740234375,
|
||||
"avg_gpu_util_percent": 48.8
|
||||
},
|
||||
"e2e_latency_ms": 2377.323123207316,
|
||||
"e2e_tokens_per_second": 257.43240118504923,
|
||||
"e2e_energy_joules": 4544.901999980211,
|
||||
"e2e_energy_per_token": 0.7426310457484006,
|
||||
"ttft_ms": 25.397859653458,
|
||||
"itl_ms": 23.519252635538578,
|
||||
"timestamp": 1769149269.5228984
|
||||
}
|
||||
47
results/h100_sdpa/pretrain_NVIDIA_H100_sdpa.json
Normal file
@@ -0,0 +1,47 @@
|
||||
{
|
||||
"model_name": "Qwen/Qwen3-4B",
|
||||
"gpu_name": "NVIDIA H100",
|
||||
"attention_implementation": "sdpa",
|
||||
"batch_size": 3,
|
||||
"sequence_length": 2048,
|
||||
"num_steps": 10,
|
||||
"forward": {
|
||||
"stage_name": "forward",
|
||||
"duration_ms": 1790.2467511594296,
|
||||
"tokens_processed": 61440,
|
||||
"tokens_per_second": 34319.29143857359,
|
||||
"energy_joules": 981.029000043869,
|
||||
"energy_per_token": 0.01596726888092235,
|
||||
"avg_power_watts": 520.9058508009567,
|
||||
"peak_memory_gb": 76.45208740234375,
|
||||
"avg_gpu_util_percent": 100.0
|
||||
},
|
||||
"backward": {
|
||||
"stage_name": "backward",
|
||||
"duration_ms": 3854.5540031045675,
|
||||
"tokens_processed": 61440,
|
||||
"tokens_per_second": 15939.587290906931,
|
||||
"energy_joules": 1953.71099999547,
|
||||
"energy_per_token": 0.03179868164055127,
|
||||
"avg_power_watts": 491.5443624439596,
|
||||
"peak_memory_gb": 76.45208740234375,
|
||||
"avg_gpu_util_percent": 100.0
|
||||
},
|
||||
"optimizer": {
|
||||
"stage_name": "optimizer",
|
||||
"duration_ms": 899.9840868636966,
|
||||
"tokens_processed": 61440,
|
||||
"tokens_per_second": 68267.87372886644,
|
||||
"energy_joules": 365.9209999740124,
|
||||
"energy_per_token": 0.005955745442285358,
|
||||
"avg_power_watts": 377.8756124501158,
|
||||
"peak_memory_gb": 76.45208740234375,
|
||||
"avg_gpu_util_percent": 100.0
|
||||
},
|
||||
"total_duration_ms": 6544.784841127694,
|
||||
"total_tokens": 61440,
|
||||
"total_tokens_per_second": 9387.627170553957,
|
||||
"total_energy_joules": 3300.6610000133514,
|
||||
"total_energy_per_token": 0.053721695963758975,
|
||||
"timestamp": 1769149234.99943
|
||||
}
|
||||
@@ -0,0 +1,37 @@
|
||||
{
|
||||
"model_name": "Qwen/Qwen3-4B",
|
||||
"gpu_name": "NVIDIA H200",
|
||||
"attention_implementation": "flash_attention_3_hopper",
|
||||
"num_requests": 10,
|
||||
"prompt_length": 512,
|
||||
"generation_length": 100,
|
||||
"prefill": {
|
||||
"stage_name": "prefill",
|
||||
"duration_ms": 323.8773119999223,
|
||||
"tokens_processed": 5120,
|
||||
"tokens_per_second": 15808.455270868828,
|
||||
"energy_joules": 98.1449999999968,
|
||||
"energy_per_token": 0.019168945312499373,
|
||||
"avg_power_watts": 250.96736239598317,
|
||||
"peak_memory_gb": 46.1302490234375,
|
||||
"avg_gpu_util_percent": 32.2
|
||||
},
|
||||
"decode": {
|
||||
"stage_name": "decode",
|
||||
"duration_ms": 30558.618001000013,
|
||||
"tokens_processed": 1000,
|
||||
"tokens_per_second": 32.72399294913388,
|
||||
"energy_joules": 4828.459999999999,
|
||||
"energy_per_token": 4.828459999999999,
|
||||
"avg_power_watts": 157.61927190444868,
|
||||
"peak_memory_gb": 46.1302490234375,
|
||||
"avg_gpu_util_percent": 32.2
|
||||
},
|
||||
"e2e_latency_ms": 3088.2495312999936,
|
||||
"e2e_tokens_per_second": 198.17051497855476,
|
||||
"e2e_energy_joules": 4926.604999999996,
|
||||
"e2e_energy_per_token": 0.8050008169934634,
|
||||
"ttft_ms": 32.38773119999223,
|
||||
"itl_ms": 30.558618001000013,
|
||||
"timestamp": 1768541964.4743361
|
||||
}
|
||||
@@ -0,0 +1,47 @@
|
||||
{
|
||||
"model_name": "Qwen/Qwen3-4B",
|
||||
"gpu_name": "NVIDIA H200",
|
||||
"attention_implementation": "flash_attention_3_hopper",
|
||||
"batch_size": 3,
|
||||
"sequence_length": 2048,
|
||||
"num_steps": 10,
|
||||
"forward": {
|
||||
"stage_name": "forward",
|
||||
"duration_ms": 1605.9521619997668,
|
||||
"tokens_processed": 61440,
|
||||
"tokens_per_second": 38257.67756587068,
|
||||
"energy_joules": 817.7539999999863,
|
||||
"energy_per_token": 0.01330979817708311,
|
||||
"avg_power_watts": 476.6091506406698,
|
||||
"peak_memory_gb": 76.5540771484375,
|
||||
"avg_gpu_util_percent": 95.1
|
||||
},
|
||||
"backward": {
|
||||
"stage_name": "backward",
|
||||
"duration_ms": 3448.8081949999696,
|
||||
"tokens_processed": 61440,
|
||||
"tokens_per_second": 17814.849804948502,
|
||||
"energy_joules": 1765.182000000008,
|
||||
"energy_per_token": 0.02873017578125013,
|
||||
"avg_power_watts": 498.84691252245983,
|
||||
"peak_memory_gb": 76.5540771484375,
|
||||
"avg_gpu_util_percent": 95.1
|
||||
},
|
||||
"optimizer": {
|
||||
"stage_name": "optimizer",
|
||||
"duration_ms": 545.701982000196,
|
||||
"tokens_processed": 61440,
|
||||
"tokens_per_second": 112588.92587268984,
|
||||
"energy_joules": 332.4770000000135,
|
||||
"energy_per_token": 0.005411409505208553,
|
||||
"avg_power_watts": 521.4900438388863,
|
||||
"peak_memory_gb": 76.5540771484375,
|
||||
"avg_gpu_util_percent": 95.1
|
||||
},
|
||||
"total_duration_ms": 5600.462338999932,
|
||||
"total_tokens": 61440,
|
||||
"total_tokens_per_second": 10970.522839186035,
|
||||
"total_energy_joules": 2915.4130000000077,
|
||||
"total_energy_per_token": 0.047451383463541795,
|
||||
"timestamp": 1768541921.6000674
|
||||
}
|
||||
37
results/h200_sdpa/inference_NVIDIA_H200_sdpa.json
Normal file
@@ -0,0 +1,37 @@
|
||||
{
|
||||
"model_name": "Qwen/Qwen3-4B",
|
||||
"gpu_name": "NVIDIA H200",
|
||||
"attention_implementation": "sdpa",
|
||||
"num_requests": 10,
|
||||
"prompt_length": 512,
|
||||
"generation_length": 100,
|
||||
"prefill": {
|
||||
"stage_name": "prefill",
|
||||
"duration_ms": 247.9969559935853,
|
||||
"tokens_processed": 5120,
|
||||
"tokens_per_second": 20645.414696672466,
|
||||
"energy_joules": 73.83399999141693,
|
||||
"energy_per_token": 0.014420703123323619,
|
||||
"avg_power_watts": 222.33737204549297,
|
||||
"peak_memory_gb": 46.1165771484375,
|
||||
"avg_gpu_util_percent": 40.0
|
||||
},
|
||||
"decode": {
|
||||
"stage_name": "decode",
|
||||
"duration_ms": 23003.622506046668,
|
||||
"tokens_processed": 1000,
|
||||
"tokens_per_second": 43.47141411041425,
|
||||
"energy_joules": 4033.3500000089407,
|
||||
"energy_per_token": 4.033350000008941,
|
||||
"avg_power_watts": 174.6335604209662,
|
||||
"peak_memory_gb": 46.1165771484375,
|
||||
"avg_gpu_util_percent": 40.0
|
||||
},
|
||||
"e2e_latency_ms": 2325.1619462040253,
|
||||
"e2e_tokens_per_second": 263.20747292425324,
|
||||
"e2e_energy_joules": 4107.184000000358,
|
||||
"e2e_energy_per_token": 0.6711084967320846,
|
||||
"ttft_ms": 24.79969559935853,
|
||||
"itl_ms": 23.003622506046668,
|
||||
"timestamp": 1769149520.7919798
|
||||
}
|
||||
47
results/h200_sdpa/pretrain_NVIDIA_H200_sdpa.json
Normal file
@@ -0,0 +1,47 @@
|
||||
{
|
||||
"model_name": "Qwen/Qwen3-4B",
|
||||
"gpu_name": "NVIDIA H200",
|
||||
"attention_implementation": "sdpa",
|
||||
"batch_size": 3,
|
||||
"sequence_length": 2048,
|
||||
"num_steps": 10,
|
||||
"forward": {
|
||||
"stage_name": "forward",
|
||||
"duration_ms": 1615.8598741167225,
|
||||
"tokens_processed": 61440,
|
||||
"tokens_per_second": 38023.09902248482,
|
||||
"energy_joules": 873.9250000119209,
|
||||
"energy_per_token": 0.014224039713735693,
|
||||
"avg_power_watts": 541.9081076256928,
|
||||
"peak_memory_gb": 76.5540771484375,
|
||||
"avg_gpu_util_percent": 100.0
|
||||
},
|
||||
"backward": {
|
||||
"stage_name": "backward",
|
||||
"duration_ms": 3462.180594098754,
|
||||
"tokens_processed": 61440,
|
||||
"tokens_per_second": 17746.04135460864,
|
||||
"energy_joules": 1696.024000003934,
|
||||
"energy_per_token": 0.027604557291730693,
|
||||
"avg_power_watts": 472.8399628680292,
|
||||
"peak_memory_gb": 76.5540771484375,
|
||||
"avg_gpu_util_percent": 100.0
|
||||
},
|
||||
"optimizer": {
|
||||
"stage_name": "optimizer",
|
||||
"duration_ms": 551.849422918167,
|
||||
"tokens_processed": 61440,
|
||||
"tokens_per_second": 111334.71821915968,
|
||||
"energy_joules": 316.88299998641014,
|
||||
"energy_per_token": 0.005157600911237144,
|
||||
"avg_power_watts": 499.2301039455484,
|
||||
"peak_memory_gb": 76.5540771484375,
|
||||
"avg_gpu_util_percent": 100.0
|
||||
},
|
||||
"total_duration_ms": 5629.889891133644,
|
||||
"total_tokens": 61440,
|
||||
"total_tokens_per_second": 10913.179687005982,
|
||||
"total_energy_joules": 2886.832000002265,
|
||||
"total_energy_per_token": 0.04698619791670353,
|
||||
"timestamp": 1769149487.0005488
|
||||
}
|
||||
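All of the result files above share the same field names, so they can be aggregated directly. A hedged sketch that walks the results/ tree (matching the --output-dir values used by the SLURM scripts) and prints throughput and energy per token for each run; the summary script itself is not part of this commit:

# Hedged sketch: summarize saved benchmark results.
import json
from pathlib import Path

for path in sorted(Path("results").rglob("*.json")):
    data = json.loads(path.read_text())
    if "e2e_tokens_per_second" in data:  # inference result
        tps = data["e2e_tokens_per_second"]
        mj_per_token = data["e2e_energy_per_token"] * 1000
    else:  # pretraining result
        tps = data["total_tokens_per_second"]
        mj_per_token = data["total_energy_per_token"] * 1000
    print(f"{path.name:50s} {data['gpu_name']:25s} "
          f"{tps:10.1f} tok/s {mj_per_token:9.2f} mJ/token")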
248
run_benchmark.py
Executable file
@@ -0,0 +1,248 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Main LLM Benchmark Runner
|
||||
|
||||
Orchestrates pretraining and inference benchmarks with auto-detection
|
||||
of GPU type and configuration.
|
||||
"""
|
||||
|
||||
import argparse
|
||||
import sys
|
||||
from pathlib import Path
|
||||
|
||||
# Import benchmark functions
|
||||
import benchmark_pretrain
|
||||
import benchmark_inference
|
||||
|
||||
from utils.gpu_monitor import get_gpu_monitor, list_available_gpus
|
||||
from utils.metrics import MetricsReporter
|
||||
|
||||
|
||||
def main():
|
||||
parser = argparse.ArgumentParser(
|
||||
description="LLM Benchmark Suite - Compare GPU performance for pretraining and inference",
|
||||
formatter_class=argparse.RawDescriptionHelpFormatter,
|
||||
epilog="""
|
||||
Examples:
|
||||
# Run both pretrain and inference benchmarks
|
||||
python run_benchmark.py --mode both
|
||||
|
||||
# Run only pretraining benchmark
|
||||
python run_benchmark.py --mode pretrain --num-steps 20
|
||||
|
||||
# Run inference with custom settings
|
||||
python run_benchmark.py --mode inference --num-requests 20 --generation-length 200
|
||||
|
||||
# Use specific attention implementation
|
||||
python run_benchmark.py --attn-implementation flash_attention_3_hopper
|
||||
"""
|
||||
)
|
||||
|
||||
# Model configuration
|
||||
parser.add_argument(
|
||||
"--model-path",
|
||||
type=str,
|
||||
default="./model_cache",
|
||||
help="Path to cached model directory"
|
||||
)
|
||||
|
||||
parser.add_argument(
|
||||
"--model-name",
|
||||
type=str,
|
||||
default="Qwen/Qwen3-4B",
|
||||
help="Model name for reporting"
|
||||
)
|
||||
|
||||
# Benchmark mode
|
||||
parser.add_argument(
|
||||
"--mode",
|
||||
type=str,
|
||||
default="both",
|
||||
choices=["pretrain", "inference", "both"],
|
||||
help="Benchmark mode to run"
|
||||
)
|
||||
|
||||
# Attention configuration
|
||||
parser.add_argument(
|
||||
"--attn-implementation",
|
||||
type=str,
|
||||
default="auto",
|
||||
choices=["auto", "flash_attention_2", "flash_attention_3_hopper", "sdpa", "eager"],
|
||||
help="Attention implementation (auto selects based on GPU)"
|
||||
)
|
||||
|
||||
# Pretraining parameters
|
||||
pretrain_group = parser.add_argument_group("pretraining parameters")
|
||||
pretrain_group.add_argument(
|
||||
"--batch-size",
|
||||
type=int,
|
||||
default=3,
|
||||
help="Batch size for pretraining"
|
||||
)
|
||||
pretrain_group.add_argument(
|
||||
"--sequence-length",
|
||||
type=int,
|
||||
default=2048,
|
||||
help="Sequence length for pretraining"
|
||||
)
|
||||
pretrain_group.add_argument(
|
||||
"--num-steps",
|
||||
type=int,
|
||||
default=10,
|
||||
help="Number of training steps"
|
||||
)
|
||||
pretrain_group.add_argument(
|
||||
"--warmup-steps",
|
||||
type=int,
|
||||
default=3,
|
||||
help="Number of warmup steps"
|
||||
)
|
||||
|
||||
# Inference parameters
|
||||
inference_group = parser.add_argument_group("inference parameters")
|
||||
inference_group.add_argument(
|
||||
"--num-requests",
|
||||
type=int,
|
||||
default=10,
|
||||
help="Number of inference requests"
|
||||
)
|
||||
inference_group.add_argument(
|
||||
"--prompt-length",
|
||||
type=int,
|
||||
default=512,
|
||||
help="Prompt length in tokens"
|
||||
)
|
||||
inference_group.add_argument(
|
||||
"--generation-length",
|
||||
type=int,
|
||||
default=100,
|
||||
help="Number of tokens to generate"
|
||||
)
|
||||
inference_group.add_argument(
|
||||
"--warmup-requests",
|
||||
type=int,
|
||||
default=2,
|
||||
help="Number of warmup requests"
|
||||
)
|
||||
|
||||
# General parameters
|
||||
parser.add_argument(
|
||||
"--device-id",
|
||||
type=int,
|
||||
default=0,
|
||||
help="GPU device ID"
|
||||
)
|
||||
parser.add_argument(
|
||||
"--output-dir",
|
||||
type=str,
|
||||
default="./results",
|
||||
help="Output directory for results"
|
||||
)
|
||||
parser.add_argument(
|
||||
"--list-gpus",
|
||||
action="store_true",
|
||||
help="List available GPUs and exit"
|
||||
)
|
||||
|
||||
args = parser.parse_args()
|
||||
|
||||
# List GPUs if requested
|
||||
if args.list_gpus:
|
||||
print("Available GPUs:")
|
||||
gpus = list_available_gpus()
|
||||
if not gpus:
|
||||
print(" No GPUs found!")
|
||||
else:
|
||||
for gpu in gpus:
|
||||
print(f" {gpu}")
|
||||
return
|
||||
|
||||
# Print header
|
||||
print("=" * 80)
|
||||
print("LLM BENCHMARK SUITE")
|
||||
print("=" * 80)
|
||||
print(f"\nModel: {args.model_name}")
|
||||
print(f"Model Path: {args.model_path}")
|
||||
print(f"Mode: {args.mode}")
|
||||
print(f"Attention: {args.attn_implementation}")
|
||||
print(f"Output Directory: {args.output_dir}")
|
||||
|
||||
# Detect GPU
|
||||
print("\nDetecting GPU...")
|
||||
try:
|
||||
monitor = get_gpu_monitor(args.device_id)
|
||||
gpu_name = monitor.get_device_name()
|
||||
print(f" GPU {args.device_id}: {gpu_name}")
|
||||
monitor.cleanup()
|
||||
except Exception as e:
|
||||
print(f"✗ Error detecting GPU: {e}")
|
||||
sys.exit(1)
|
||||
|
||||
# Create output directory
|
||||
output_path = Path(args.output_dir)
|
||||
output_path.mkdir(parents=True, exist_ok=True)
|
||||
|
||||
# Run benchmarks
|
||||
pretrain_metrics = None
|
||||
inference_metrics = None
|
||||
|
||||
if args.mode in ["pretrain", "both"]:
|
||||
print("\n" + "=" * 80)
|
||||
print("Running Pretraining Benchmark...")
|
||||
print("=" * 80)
|
||||
|
||||
pretrain_metrics = benchmark_pretrain.benchmark_pretrain(
|
||||
model_name_or_path=args.model_name,
|
||||
attn_implementation=args.attn_implementation,
|
||||
batch_size=args.batch_size,
|
||||
sequence_length=args.sequence_length,
|
||||
num_steps=args.num_steps,
|
||||
warmup_steps=args.warmup_steps,
|
||||
device="cuda",
|
||||
device_id=args.device_id,
|
||||
output_dir=args.output_dir,
|
||||
verbose=True
|
||||
)
|
||||
|
||||
if args.mode in ["inference", "both"]:
|
||||
print("\n" + "=" * 80)
|
||||
print("Running Inference Benchmark...")
|
||||
print("=" * 80)
|
||||
|
||||
inference_metrics = benchmark_inference.benchmark_inference(
|
||||
model_name_or_path=args.model_name,
|
||||
attn_implementation=args.attn_implementation,
|
||||
num_requests=args.num_requests,
|
||||
prompt_length=args.prompt_length,
|
||||
generation_length=args.generation_length,
|
||||
warmup_requests=args.warmup_requests,
|
||||
device="cuda",
|
||||
device_id=args.device_id,
|
||||
output_dir=args.output_dir,
|
||||
verbose=True
|
||||
)
|
||||
|
||||
# Summary
|
||||
print("\n" + "=" * 80)
|
||||
print("BENCHMARK COMPLETE")
|
||||
print("=" * 80)
|
||||
print(f"\nResults saved to: {output_path}")
|
||||
|
||||
if pretrain_metrics:
|
||||
print(f"\nPretraining:")
|
||||
print(f" Duration: {pretrain_metrics.total_duration_ms:.2f} ms")
|
||||
print(f" Throughput: {pretrain_metrics.total_tokens_per_second:.2f} tokens/s")
|
||||
print(f" Energy: {pretrain_metrics.total_energy_joules:.2f} J")
|
||||
print(f" Energy/token: {pretrain_metrics.total_energy_per_token*1000:.4f} mJ/token")
|
||||
|
||||
if inference_metrics:
|
||||
print(f"\nInference:")
|
||||
print(f" TTFT: {inference_metrics.ttft_ms:.2f} ms")
|
||||
print(f" ITL: {inference_metrics.itl_ms:.2f} ms/token")
|
||||
print(f" Throughput: {inference_metrics.e2e_tokens_per_second:.2f} tokens/s")
|
||||
print(f" Energy: {inference_metrics.e2e_energy_joules:.2f} J")
|
||||
print(f" Energy/token: {inference_metrics.e2e_energy_per_token*1000:.4f} mJ/token")
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
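Because run_benchmark.py only forwards its arguments, the benchmark functions can also be driven directly from Python, for example in a notebook. A hedged sketch based on the call signature shown above (the benchmark_pretrain module itself is not part of this section):

# Hedged sketch: call the pretraining benchmark directly.
import benchmark_pretrain

metrics = benchmark_pretrain.benchmark_pretrain(
    model_name_or_path="Qwen/Qwen3-4B",
    attn_implementation="sdpa",
    batch_size=3,
    sequence_length=2048,
    num_steps=10,
    warmup_steps=3,
    device="cuda",
    device_id=0,
    output_dir="./results/manual",
    verbose=True,
)
print(metrics.total_tokens_per_second, metrics.total_energy_per_token)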
45
slurm_a100.sh
Executable file
@@ -0,0 +1,45 @@
|
||||
#!/bin/bash
|
||||
#SBATCH --job-name=llm_bench_a100
|
||||
#SBATCH --partition=a100 # Adjust to your A100 partition name
|
||||
#SBATCH --nodes=1
|
||||
#SBATCH --gres=gpu:a100:1 # Request 1 A100 GPU
|
||||
#SBATCH -C a100_80
|
||||
#SBATCH --time=02:00:00
|
||||
#SBATCH --output=logs/benchmark_a100_sdpa_%j.out
|
||||
#SBATCH --error=logs/benchmark_a100_sdpa_%j.err
|
||||
|
||||
# Create logs directory
|
||||
mkdir -p logs
|
||||
|
||||
# Print job info
|
||||
echo "========================================="
|
||||
echo "Job ID: $SLURM_JOB_ID"
|
||||
echo "Job Name: $SLURM_JOB_NAME"
|
||||
echo "Node: $SLURM_NODELIST"
|
||||
echo "Date: $(date)"
|
||||
echo "========================================="
|
||||
|
||||
# Set cache paths
|
||||
export TRANSFORMERS_CACHE=$(pwd)/model_cache
|
||||
export HF_HOME=$(pwd)/model_cache
|
||||
|
||||
# Path to apptainer image
|
||||
APPTAINER_IMAGE="/anvme/workspace/ihpc125h-llm-profiles/pytorch_25.10_updated_ao.sif"
|
||||
|
||||
# Run benchmark inside apptainer
|
||||
apptainer exec --nv $APPTAINER_IMAGE python run_benchmark.py \
|
||||
--mode both \
|
||||
--model-path ./model_cache \
|
||||
--model-name Qwen/Qwen3-4B \
|
||||
--attn-implementation sdpa \
|
||||
--batch-size 3 \
|
||||
--sequence-length 2048 \
|
||||
--num-steps 10 \
|
||||
--num-requests 10 \
|
||||
--prompt-length 512 \
|
||||
--generation-length 100 \
|
||||
--output-dir ./results/a100
|
||||
|
||||
echo "========================================="
|
||||
echo "Benchmark Complete!"
|
||||
echo "========================================="
|
||||
46
slurm_h100.sh
Executable file
@@ -0,0 +1,46 @@
|
||||
#!/bin/bash
|
||||
#SBATCH --job-name=llm_bench_h100
|
||||
#SBATCH --partition=h100 # Adjust to your H100 partition name
|
||||
#SBATCH --nodes=1
|
||||
#SBATCH --gres=gpu:h100:1 # Request 1 H100 GPU
|
||||
#SBATCH --time=02:00:00
|
||||
#SBATCH --output=logs/benchmark_h100_%j.out
|
||||
#SBATCH --error=logs/benchmark_h100_%j.err
|
||||
|
||||
# Create logs directory
|
||||
mkdir -p logs
|
||||
|
||||
# Print job info
|
||||
echo "========================================="
|
||||
echo "Job ID: $SLURM_JOB_ID"
|
||||
echo "Job Name: $SLURM_JOB_NAME"
|
||||
echo "Node: $SLURM_NODELIST"
|
||||
echo "Date: $(date)"
|
||||
echo "========================================="
|
||||
|
||||
# Set cache paths
|
||||
export TRANSFORMERS_CACHE=$(pwd)/model_cache
|
||||
export HF_HOME=$(pwd)/model_cache
|
||||
|
||||
# Path to apptainer image
|
||||
APPTAINER_IMAGE="/hnvme/workspace/ihpc125h-llm-profiler/pytorch_25.10_updated_ao.sif"
|
||||
|
||||
# Run benchmark inside apptainer (SDPA attention; the FlashAttention-3 Hopper flag is kept commented out below)
|
||||
apptainer exec --nv $APPTAINER_IMAGE python run_benchmark.py \
|
||||
--mode both \
|
||||
--model-path ./model_cache \
|
||||
--model-name Qwen/Qwen3-4B \
|
||||
--attn-implementation sdpa \
|
||||
--batch-size 3 \
|
||||
--sequence-length 2048 \
|
||||
--num-steps 10 \
|
||||
--num-requests 10 \
|
||||
--prompt-length 512 \
|
||||
--generation-length 100 \
|
||||
--output-dir ./results/h100_sdpa
|
||||
|
||||
# --attn-implementation flash_attention_3_hopper \
|
||||
|
||||
echo "========================================="
|
||||
echo "Benchmark Complete!"
|
||||
echo "========================================="
|
||||
45
slurm_h200.sh
Executable file
@@ -0,0 +1,45 @@
|
||||
#!/bin/bash
|
||||
#SBATCH --job-name=llm_bench_h200
|
||||
#SBATCH --partition=h200 # Adjust to your H200 partition name
|
||||
#SBATCH --nodes=1
|
||||
#SBATCH --gres=gpu:h200:1 # Request 1 H200 GPU
|
||||
#SBATCH --time=02:00:00
|
||||
#SBATCH --output=logs/benchmark_h200_%j.out
|
||||
#SBATCH --error=logs/benchmark_h200_%j.err
|
||||
|
||||
# Create logs directory
|
||||
mkdir -p logs
|
||||
|
||||
# Print job info
|
||||
echo "========================================="
|
||||
echo "Job ID: $SLURM_JOB_ID"
|
||||
echo "Job Name: $SLURM_JOB_NAME"
|
||||
echo "Node: $SLURM_NODELIST"
|
||||
echo "Date: $(date)"
|
||||
echo "========================================="
|
||||
|
||||
# Set cache paths
|
||||
export TRANSFORMERS_CACHE=$(pwd)/model_cache
|
||||
export HF_HOME=$(pwd)/model_cache
|
||||
|
||||
# Path to apptainer image
|
||||
APPTAINER_IMAGE="/hnvme/workspace/ihpc125h-llm-profiler/pytorch_25.10_updated_ao.sif"
|
||||
|
||||
# Run benchmark inside apptainer (SDPA attention; the FlashAttention-3 Hopper flag is kept commented out below)
|
||||
apptainer exec --nv $APPTAINER_IMAGE python run_benchmark.py \
|
||||
--mode both \
|
||||
--model-path ./model_cache \
|
||||
--model-name Qwen/Qwen3-4B \
|
||||
--attn-implementation sdpa \
|
||||
--batch-size 3 \
|
||||
--sequence-length 2048 \
|
||||
--num-steps 10 \
|
||||
--num-requests 10 \
|
||||
--prompt-length 512 \
|
||||
--generation-length 100 \
|
||||
--output-dir ./results/h200_sdpa
|
||||
# --attn-implementation flash_attention_3_hopper \
|
||||
|
||||
echo "========================================="
|
||||
echo "Benchmark Complete!"
|
||||
echo "========================================="
|
||||
42
slurm_mi300x.sh
Executable file
@@ -0,0 +1,42 @@
|
||||
#!/bin/bash
|
||||
#SBATCH --job-name=llm_bench_mi300x
|
||||
#SBATCH --nodes=1
|
||||
#SBATCH --nodelist=aquavan1          # Request the MI300X node
|
||||
#SBATCH --time=02:00:00
|
||||
#SBATCH --output=logs/benchmark_mi300x_%j.out
|
||||
#SBATCH --error=logs/benchmark_mi300x_%j.err
|
||||
|
||||
# Create logs directory
|
||||
mkdir -p logs
|
||||
|
||||
# Print job info
|
||||
echo "========================================="
|
||||
echo "Job ID: $SLURM_JOB_ID"
|
||||
echo "Job Name: $SLURM_JOB_NAME"
|
||||
echo "Node: $SLURM_NODELIST"
|
||||
echo "Date: $(date)"
|
||||
echo "========================================="
|
||||
|
||||
# Set cache paths
|
||||
export TRANSFORMERS_CACHE=$(pwd)/models
|
||||
export HF_HOME=$(pwd)/models
|
||||
|
||||
# Path to apptainer image
|
||||
#APPTAINER_IMAGE="/home/woody/ihpc/ihpc125h/pytorch_25.10_updated_ao.sif"
|
||||
|
||||
apptainer exec --writable ../rocm_sandbox/ python run_benchmark.py \
|
||||
--mode both \
|
||||
--model-path ./model_cache \
|
||||
--model-name Qwen/Qwen3-4B \
|
||||
--attn-implementation sdpa \
|
||||
--batch-size 3 \
|
||||
--sequence-length 2048 \
|
||||
--num-steps 10 \
|
||||
--num-requests 10 \
|
||||
--prompt-length 512 \
|
||||
--generation-length 100 \
|
||||
--output-dir ./results/mi300x_sdpa
|
||||
|
||||
echo "========================================="
|
||||
echo "Benchmark Complete!"
|
||||
echo "========================================="
|
||||
3
utils/__init__.py
Normal file
@@ -0,0 +1,3 @@
|
||||
"""Utility package for LLM benchmarking."""
|
||||
|
||||
__version__ = "1.0.0"
|
||||
295
utils/attention.py
Normal file
@@ -0,0 +1,295 @@
|
||||
"""
|
||||
Attention Implementation Helpers for LLM Benchmarking
|
||||
|
||||
Provides functions for configuring different attention implementations
|
||||
based on GPU type.
|
||||
"""
|
||||
|
||||
from typing import Optional, Tuple
|
||||
import warnings
|
||||
|
||||
|
||||
def get_default_attention(gpu_name: str) -> str:
|
||||
"""
|
||||
Get default attention implementation for GPU type.
|
||||
|
||||
Args:
|
||||
gpu_name: GPU device name (from monitoring)
|
||||
|
||||
Returns:
|
||||
Attention implementation string
|
||||
"""
|
||||
gpu_lower = gpu_name.lower()
|
||||
|
||||
# H100/H200: FlashAttention-3 Hopper
|
||||
if 'h100' in gpu_lower or 'h200' in gpu_lower:
|
||||
return "flash_attention_3_hopper"
|
||||
|
||||
# A100, MI300X, other: FlashAttention-2
|
||||
return "flash_attention_2"
|
||||
|
||||
|
||||
def configure_model_attention(model, attn_implementation: str, verbose: bool = True):
|
||||
"""
|
||||
Configure model to use specified attention implementation.
|
||||
|
||||
This function patches the model if needed to use the specified attention.
|
||||
For standard implementations like flash_attention_2, the model should already
|
||||
be loaded with the correct implementation via AutoModelForCausalLM.from_pretrained().
|
||||
|
||||
For FlashAttention-3 Hopper, this patches the model's attention modules.
|
||||
|
||||
Args:
|
||||
model: The loaded model
|
||||
attn_implementation: Attention implementation to use
|
||||
verbose: Print configuration messages
|
||||
|
||||
Returns:
|
||||
Configured model
|
||||
"""
|
||||
if verbose:
|
||||
print(f"Configuring attention: {attn_implementation}")
|
||||
|
||||
if attn_implementation == "flash_attention_3_hopper":
|
||||
# Patch model to use FlashAttention-3 Hopper
|
||||
try:
|
||||
import flash_attn_interface
|
||||
except ImportError:
|
||||
raise ImportError(
|
||||
"flash_attn_interface not found. This is required for FlashAttention-3.\n"
|
||||
"Install with appropriate method for your system."
|
||||
)
|
||||
|
||||
# Patch the model's attention function
|
||||
_patch_fa3_hopper(model, verbose=verbose)
|
||||
|
||||
elif attn_implementation == "flash_attention_2":
|
||||
# Model should already be loaded with FA2
|
||||
if verbose:
|
||||
print(" Using FlashAttention-2 (configured during model loading)")
|
||||
|
||||
elif attn_implementation == "sdpa":
|
||||
# PyTorch Scaled Dot Product Attention
|
||||
if verbose:
|
||||
print(" Using PyTorch SDPA")
|
||||
|
||||
elif attn_implementation == "eager":
|
||||
# Standard PyTorch attention
|
||||
if verbose:
|
||||
print(" Using eager attention")
|
||||
|
||||
else:
|
||||
warnings.warn(f"Unknown attention implementation: {attn_implementation}")
|
||||
|
||||
return model
|
||||
|
||||
|
||||
def _patch_fa3_hopper(model, verbose: bool = True):
|
||||
"""
|
||||
Patch model to use FlashAttention-3 Hopper.
|
||||
|
||||
This replaces the attention computation in the model's attention layers
|
||||
with calls to flash_attn_interface.flash_attn_func().
|
||||
|
||||
Args:
|
||||
model: The model to patch
|
||||
verbose: Print patching messages
|
||||
"""
|
||||
import flash_attn_interface
|
||||
import torch
|
||||
|
||||
# Counter for patched modules
|
||||
num_patched = 0
|
||||
|
||||
# Iterate through all modules in the model
|
||||
for name, module in model.named_modules():
|
||||
# Look for attention modules (this will vary by model architecture)
|
||||
# Common names: "self_attn", "attn", "attention"
|
||||
if any(attn_name in name.lower() for attn_name in ['self_attn', 'attention']):
|
||||
# Check if module has a forward method we can patch
|
||||
if hasattr(module, 'forward'):
|
||||
# Save original forward
|
||||
original_forward = module.forward
|
||||
|
||||
                # Create patched forward function.
                # The module is passed in explicitly so each closure binds the module it
                # patches instead of the loop variable (avoids Python's late binding).
                def create_patched_forward(orig_forward, attn_module):
                    def patched_forward(hidden_states, *args, **kwargs):
                        # Check if this is an attention computation.
                        # For Qwen models, attention modules typically have q, k, v projections.
                        # Assumes q, k and v all project to hidden_dim (plain multi-head
                        # attention; grouped-query attention is not handled here).
                        if hasattr(attn_module, 'q_proj') and hasattr(attn_module, 'k_proj') and hasattr(attn_module, 'v_proj'):
                            # Extract batch, seq_len, hidden_dim
                            batch_size, seq_len, hidden_dim = hidden_states.shape

                            # Compute Q, K, V
                            q = attn_module.q_proj(hidden_states)
                            k = attn_module.k_proj(hidden_states)
                            v = attn_module.v_proj(hidden_states)

                            # Reshape for multi-head attention
                            num_heads = attn_module.num_heads
                            head_dim = hidden_dim // num_heads

                            q = q.view(batch_size, seq_len, num_heads, head_dim)
                            k = k.view(batch_size, seq_len, num_heads, head_dim)
                            v = v.view(batch_size, seq_len, num_heads, head_dim)

                            # Call FlashAttention-3
                            # Note: flash_attn_func expects (batch, seqlen, nheads, headdim)
                            attn_output = flash_attn_interface.flash_attn_func(
                                q, k, v,
                                dropout_p=0.0,
                                softmax_scale=None,  # Will use default 1/sqrt(head_dim)
                                causal=True,  # For causal LM
                            )

                            # Reshape back
                            attn_output = attn_output.view(batch_size, seq_len, hidden_dim)

                            # Apply output projection if it exists
                            if hasattr(attn_module, 'o_proj'):
                                attn_output = attn_module.o_proj(attn_output)

                            # Pad to the arity of the original forward's return value.
                            # Note: this also calls the original forward, only to match its length.
                            return (attn_output,) + (None,) * (len(orig_forward(hidden_states, *args, **kwargs)) - 1)

                        else:
                            # Not an attention module we can patch, use original
                            return orig_forward(hidden_states, *args, **kwargs)

                    return patched_forward

                # Apply patch
                module.forward = create_patched_forward(original_forward, module)
                num_patched += 1
|
||||
|
||||
if verbose:
|
||||
if num_patched > 0:
|
||||
print(f" ✓ Patched {num_patched} attention modules to use FlashAttention-3 Hopper")
|
||||
else:
|
||||
warnings.warn(" ⚠ No attention modules found to patch for FlashAttention-3")
|
||||
|
||||
|
||||
def get_attention_info(attn_implementation: str) -> dict:
|
||||
"""
|
||||
Get information about an attention implementation.
|
||||
|
||||
Args:
|
||||
attn_implementation: Attention implementation string
|
||||
|
||||
Returns:
|
||||
Dictionary with info about the implementation
|
||||
"""
|
||||
info = {
|
||||
"flash_attention_2": {
|
||||
"name": "FlashAttention-2",
|
||||
"description": "Optimized attention for A100 and other GPUs",
|
||||
"gpu_support": ["A100", "MI300X", "V100", "RTX"],
|
||||
"memory_efficient": True,
|
||||
"requires_cuda": True,
|
||||
},
|
||||
"flash_attention_3_hopper": {
|
||||
"name": "FlashAttention-3 Hopper",
|
||||
"description": "Optimized attention for H100/H200 Hopper architecture",
|
||||
"gpu_support": ["H100", "H200"],
|
||||
"memory_efficient": True,
|
||||
"requires_cuda": True,
|
||||
},
|
||||
"sdpa": {
|
||||
"name": "PyTorch SDPA",
|
||||
"description": "PyTorch Scaled Dot Product Attention",
|
||||
"gpu_support": ["All"],
|
||||
"memory_efficient": True,
|
||||
"requires_cuda": False,
|
||||
},
|
||||
"eager": {
|
||||
"name": "Eager Attention",
|
||||
"description": "Standard PyTorch attention implementation",
|
||||
"gpu_support": ["All"],
|
||||
"memory_efficient": False,
|
||||
"requires_cuda": False,
|
||||
},
|
||||
}
|
||||
|
||||
return info.get(attn_implementation, {
|
||||
"name": attn_implementation,
|
||||
"description": "Unknown attention implementation",
|
||||
"gpu_support": ["Unknown"],
|
||||
"memory_efficient": False,
|
||||
"requires_cuda": False,
|
||||
})
|
||||
|
||||
|
||||
def validate_attention_for_gpu(attn_implementation: str, gpu_name: str) -> Tuple[bool, Optional[str]]:
|
||||
"""
|
||||
Validate if attention implementation is suitable for GPU.
|
||||
|
||||
Args:
|
||||
attn_implementation: Attention implementation
|
||||
gpu_name: GPU device name
|
||||
|
||||
Returns:
|
||||
Tuple of (is_valid, warning_message)
|
||||
"""
|
||||
gpu_lower = gpu_name.lower()
|
||||
|
||||
# FlashAttention-3 Hopper validation
|
||||
if attn_implementation == "flash_attention_3_hopper":
|
||||
if 'h100' not in gpu_lower and 'h200' not in gpu_lower:
|
||||
return False, (
|
||||
f"FlashAttention-3 Hopper is optimized for H100/H200. "
|
||||
f"Current GPU: {gpu_name}. Consider using flash_attention_2 instead."
|
||||
)
|
||||
|
||||
# FlashAttention-2 on Hopper GPUs
|
||||
if attn_implementation == "flash_attention_2":
|
||||
if 'h100' in gpu_lower or 'h200' in gpu_lower:
|
||||
return True, (
|
||||
f"FlashAttention-2 will work on {gpu_name}, but FlashAttention-3 Hopper "
|
||||
f"may provide better performance."
|
||||
)
|
||||
|
||||
return True, None
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
"""Test attention configuration."""
|
||||
print("=" * 60)
|
||||
print("Attention Implementation Test")
|
||||
print("=" * 60)
|
||||
|
||||
# Test getting default attention for different GPUs
|
||||
test_gpus = [
|
||||
"NVIDIA A100 80GB",
|
||||
"NVIDIA H100 80GB",
|
||||
"NVIDIA H200 141GB",
|
||||
"AMD Instinct MI300X",
|
||||
]
|
||||
|
||||
print("\nDefault attention implementations:")
|
||||
for gpu in test_gpus:
|
||||
attn = get_default_attention(gpu)
|
||||
print(f" {gpu:30s} → {attn}")
|
||||
|
||||
# Test validation
|
||||
print("\nValidation tests:")
|
||||
test_cases = [
|
||||
("flash_attention_3_hopper", "NVIDIA H100 80GB"),
|
||||
("flash_attention_3_hopper", "NVIDIA A100 80GB"),
|
||||
("flash_attention_2", "NVIDIA H100 80GB"),
|
||||
("flash_attention_2", "NVIDIA A100 80GB"),
|
||||
]
|
||||
|
||||
for attn, gpu in test_cases:
|
||||
valid, warning = validate_attention_for_gpu(attn, gpu)
|
||||
status = "✓" if valid else "✗"
|
||||
print(f" {status} {attn:30s} on {gpu:25s}")
|
||||
if warning:
|
||||
print(f" ⚠ {warning}")
|
||||
|
||||
# Test getting info
|
||||
print("\nAttention implementation info:")
|
||||
for attn in ["flash_attention_2", "flash_attention_3_hopper", "sdpa"]:
|
||||
info = get_attention_info(attn)
|
||||
print(f"\n {info['name']}:")
|
||||
print(f" Description: {info['description']}")
|
||||
print(f" GPU Support: {', '.join(info['gpu_support'])}")
|
||||
print(f" Memory Efficient: {info['memory_efficient']}")
|
||||
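A hedged sketch of how these helpers would typically be wired together by a benchmark script. The benchmark_pretrain/benchmark_inference modules are not shown in this section, so the exact call order here is an assumption:

# Hedged sketch: resolve, validate, and apply an attention implementation.
import warnings
from transformers import AutoModelForCausalLM
from utils.attention import (
    get_default_attention, validate_attention_for_gpu, configure_model_attention,
)
from utils.gpu_monitor import get_gpu_monitor

monitor = get_gpu_monitor(0)
gpu_name = monitor.get_device_name()

attn = get_default_attention(gpu_name)  # what "--attn-implementation auto" would resolve to
ok, warning = validate_attention_for_gpu(attn, gpu_name)
if warning:
    warnings.warn(warning)

# flash_attention_3_hopper is patched in after loading; the others are passed to HF directly.
hf_attn = "flash_attention_2" if attn == "flash_attention_3_hopper" else attn
model = AutoModelForCausalLM.from_pretrained(
    "Qwen/Qwen3-4B", cache_dir="./model_cache",
    attn_implementation=hf_attn, torch_dtype="auto", trust_remote_code=True,
)
model = configure_model_attention(model, attn)
monitor.cleanup()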
562
utils/gpu_monitor.py
Normal file
@@ -0,0 +1,562 @@
|
||||
"""
|
||||
GPU Monitoring Infrastructure for LLM Benchmarking
|
||||
|
||||
Provides unified interface for monitoring both NVIDIA and AMD GPUs.
|
||||
"""
|
||||
|
||||
import time
|
||||
from abc import ABC, abstractmethod
|
||||
from dataclasses import dataclass
|
||||
from typing import Optional, List
|
||||
import warnings
|
||||
|
||||
|
||||
@dataclass
|
||||
class GPUMetrics:
|
||||
"""Container for GPU metrics."""
|
||||
timestamp: float
|
||||
power_watts: float
|
||||
gpu_utilization_percent: float
|
||||
memory_used_gb: float
|
||||
memory_total_gb: float
|
||||
temperature_celsius: Optional[float] = None
|
||||
energy_joules: Optional[float] = None # Cumulative energy
|
||||
|
||||
|
||||
class GPUMonitor(ABC):
|
||||
"""Abstract base class for GPU monitoring."""
|
||||
|
||||
def __init__(self, device_id: int = 0):
|
||||
"""
|
||||
Initialize GPU monitor.
|
||||
|
||||
Args:
|
||||
device_id: GPU device ID to monitor
|
||||
"""
|
||||
self.device_id = device_id
|
||||
self.start_time = None
|
||||
self.start_energy = None
|
||||
self.last_metrics = None
|
||||
|
||||
@abstractmethod
|
||||
def get_metrics(self) -> GPUMetrics:
|
||||
"""Get current GPU metrics."""
|
||||
pass
|
||||
|
||||
@abstractmethod
|
||||
def get_device_name(self) -> str:
|
||||
"""Get GPU device name."""
|
||||
pass
|
||||
|
||||
@abstractmethod
|
||||
def cleanup(self):
|
||||
"""Cleanup resources."""
|
||||
pass
|
||||
|
||||
def start_monitoring(self):
|
||||
"""Start energy monitoring session."""
|
||||
self.start_time = time.time()
|
||||
metrics = self.get_metrics()
|
||||
self.start_energy = metrics.energy_joules if metrics.energy_joules is not None else 0.0
|
||||
self.last_metrics = metrics
|
||||
|
||||
def get_energy_consumed(self) -> float:
|
||||
"""
|
||||
Get energy consumed since start_monitoring() was called.
|
||||
|
||||
Returns:
|
||||
Energy in Joules
|
||||
"""
|
||||
if self.start_time is None:
|
||||
raise RuntimeError("Must call start_monitoring() first")
|
||||
|
||||
current_metrics = self.get_metrics()
|
||||
|
||||
if current_metrics.energy_joules is not None:
|
||||
# If GPU provides cumulative energy, use it
|
||||
return current_metrics.energy_joules - self.start_energy
|
||||
else:
|
||||
# Otherwise, integrate power over time
|
||||
elapsed_time = time.time() - self.start_time
|
||||
# Use average of start and current power
|
||||
avg_power = (self.last_metrics.power_watts + current_metrics.power_watts) / 2.0
|
||||
return avg_power * elapsed_time
|
||||
|
||||
def get_average_power(self) -> float:
|
||||
"""
|
||||
Get average power consumption since start_monitoring().
|
||||
|
||||
Returns:
|
||||
Average power in Watts
|
||||
"""
|
||||
if self.start_time is None:
|
||||
raise RuntimeError("Must call start_monitoring() first")
|
||||
|
||||
elapsed_time = time.time() - self.start_time
|
||||
if elapsed_time == 0:
|
||||
return 0.0
|
||||
|
||||
energy = self.get_energy_consumed()
|
||||
return energy / elapsed_time
|
||||
|
||||
|
||||
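# Worked example (illustrative): when a GPU exposes no cumulative energy counter,
# get_energy_consumed() falls back to the mean of the start and current power draw
# multiplied by elapsed time. A stage averaging roughly 350 W over 12 s is therefore
# charged 350 * 12 = 4200 J; over 61,440 tokens that is about 4200 / 61440 ≈ 0.068
# J/token, i.e. 68 mJ/token.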
class NVIDIAMonitor(GPUMonitor):
|
||||
"""NVIDIA GPU monitor using pynvml."""
|
||||
|
||||
def __init__(self, device_id: int = 0):
|
||||
"""Initialize NVIDIA monitor."""
|
||||
try:
|
||||
import pynvml
|
||||
self.pynvml = pynvml
|
||||
except ImportError:
|
||||
raise ImportError(
|
||||
"pynvml not found. Install with: pip install pynvml"
|
||||
)
|
||||
|
||||
try:
|
||||
self.pynvml.nvmlInit()
|
||||
self.handle = self.pynvml.nvmlDeviceGetHandleByIndex(device_id)
|
||||
except Exception as e:
|
||||
raise RuntimeError(f"Failed to initialize NVIDIA GPU {device_id}: {e}")
|
||||
|
||||
super().__init__(device_id)
|
||||
|
||||
def get_metrics(self) -> GPUMetrics:
|
||||
"""Get current NVIDIA GPU metrics."""
|
||||
try:
|
||||
# Power (in milliwatts)
|
||||
power_mw = self.pynvml.nvmlDeviceGetPowerUsage(self.handle)
|
||||
power_watts = power_mw / 1000.0
|
||||
|
||||
# Utilization
|
||||
util = self.pynvml.nvmlDeviceGetUtilizationRates(self.handle)
|
||||
gpu_util = util.gpu
|
||||
|
||||
# Memory
|
||||
mem_info = self.pynvml.nvmlDeviceGetMemoryInfo(self.handle)
|
||||
memory_used_gb = mem_info.used / (1024**3)
|
||||
memory_total_gb = mem_info.total / (1024**3)
|
||||
|
||||
# Temperature
|
||||
try:
|
||||
temp = self.pynvml.nvmlDeviceGetTemperature(
|
||||
self.handle,
|
||||
self.pynvml.NVML_TEMPERATURE_GPU
|
||||
)
|
||||
            except Exception:
|
||||
temp = None
|
||||
|
||||
# Try to get cumulative energy (newer GPUs)
|
||||
energy_joules = None
|
||||
try:
|
||||
energy_mj = self.pynvml.nvmlDeviceGetTotalEnergyConsumption(self.handle)
|
||||
energy_joules = energy_mj / 1000.0
|
||||
            except Exception:
|
||||
# Not supported on this GPU, will use power integration
|
||||
pass
|
||||
|
||||
return GPUMetrics(
|
||||
timestamp=time.time(),
|
||||
power_watts=power_watts,
|
||||
gpu_utilization_percent=gpu_util,
|
||||
memory_used_gb=memory_used_gb,
|
||||
memory_total_gb=memory_total_gb,
|
||||
temperature_celsius=temp,
|
||||
energy_joules=energy_joules
|
||||
)
|
||||
except Exception as e:
|
||||
raise RuntimeError(f"Failed to get NVIDIA GPU metrics: {e}")
|
||||
|
||||
def get_device_name(self) -> str:
|
||||
"""Get NVIDIA GPU device name."""
|
||||
try:
|
||||
name = self.pynvml.nvmlDeviceGetName(self.handle)
|
||||
if isinstance(name, bytes):
|
||||
name = name.decode('utf-8')
|
||||
return name
|
||||
        except Exception:
|
||||
return f"NVIDIA GPU {self.device_id}"
|
||||
|
||||
def cleanup(self):
|
||||
"""Cleanup NVIDIA resources."""
|
||||
try:
|
||||
self.pynvml.nvmlShutdown()
|
||||
        except Exception:
|
||||
pass
|
||||
|
||||
|
||||
class AMDMonitor(GPUMonitor):
|
||||
"""AMD GPU monitor using rocm-smi command line tool."""
|
||||
|
||||
def __init__(self, device_id: int = 0):
|
||||
"""Initialize AMD monitor."""
|
||||
import subprocess
|
||||
import shutil
|
||||
|
||||
# Check if rocm-smi is available
|
||||
if shutil.which('rocm-smi') is None:
|
||||
raise RuntimeError("rocm-smi command not found. Make sure ROCm is installed and in PATH.")
|
||||
|
||||
self.device_id = device_id
|
||||
|
||||
# Verify device exists
|
||||
try:
|
||||
result = subprocess.run(
|
||||
['rocm-smi', '--showid'],
|
||||
capture_output=True,
|
||||
text=True,
|
||||
timeout=5
|
||||
)
|
||||
if result.returncode != 0:
|
||||
raise RuntimeError(f"rocm-smi failed: {result.stderr}")
|
||||
except subprocess.TimeoutExpired:
|
||||
raise RuntimeError("rocm-smi command timed out")
|
||||
except Exception as e:
|
||||
raise RuntimeError(f"Failed to initialize AMD GPU {device_id}: {e}")
|
||||
|
||||
super().__init__(device_id)
|
||||
|
||||
def _parse_detailed_output(self, output: str) -> dict:
|
||||
"""Parse rocm-smi detailed output format."""
|
||||
lines = output.strip().split('\n')
|
||||
|
||||
# Parse detailed format: GPU[X] : Metric : Value
|
||||
metrics = {
|
||||
'temperature': None,
|
||||
'power': None,
|
||||
'vram_percent': None,
|
||||
'gpu_percent': None,
|
||||
}
|
||||
|
||||
device_prefix = f"GPU[{self.device_id}]"
|
||||
|
||||
for line in lines:
|
||||
if not line.strip() or not line.startswith(device_prefix):
|
||||
continue
|
||||
|
||||
# Split by colon
|
||||
parts = line.split(':')
|
||||
if len(parts) < 3:
|
||||
continue
|
||||
|
||||
metric_name = parts[1].strip().lower()
|
||||
value_str = parts[2].strip()
|
||||
|
||||
try:
|
||||
# Temperature (Sensor junction)
|
||||
if 'temperature' in metric_name and 'junction' in metric_name:
|
||||
metrics['temperature'] = float(value_str)
|
||||
|
||||
# Power consumption
|
||||
elif 'power' in metric_name and 'package' in metric_name:
|
||||
metrics['power'] = float(value_str)
|
||||
|
||||
# GPU utilization
|
||||
elif 'gpu use' in metric_name:
|
||||
metrics['gpu_percent'] = float(value_str)
|
||||
|
||||
# VRAM usage percentage
|
||||
elif 'memory allocated' in metric_name and 'vram%' in metric_name:
|
||||
metrics['vram_percent'] = float(value_str)
|
||||
|
||||
except (ValueError, IndexError):
|
||||
continue
|
||||
|
||||
# Validate we got the required metrics
|
||||
if metrics['temperature'] is None:
|
||||
raise ValueError(f"Could not find temperature for GPU[{self.device_id}]")
|
||||
if metrics['power'] is None:
|
||||
raise ValueError(f"Could not find power for GPU[{self.device_id}]")
|
||||
if metrics['gpu_percent'] is None:
|
||||
metrics['gpu_percent'] = 0.0
|
||||
if metrics['vram_percent'] is None:
|
||||
metrics['vram_percent'] = 0.0
|
||||
|
||||
return metrics
|
||||
|
||||
    def _get_memory_info(self) -> tuple:
        """Get memory usage in GB using rocm-smi --showmeminfo."""
        import subprocess

        try:
            result = subprocess.run(
                ['rocm-smi', '--showmeminfo', 'vram', '-d', str(self.device_id)],
                capture_output=True,
                text=True,
                timeout=5
            )

            if result.returncode != 0:
                return 0.0, 0.0

            # Parse output for memory info
            # Looking for lines like "GPU memory used: X MiB" and "GPU memory total: Y MiB"
            used_gb = 0.0
            total_gb = 0.0

            for line in result.stdout.split('\n'):
                if 'Used' in line or 'used' in line:
                    # Extract the number; the value is in whatever unit rocm-smi reports
                    parts = line.split()
                    for i, part in enumerate(parts):
                        if part.replace('.', '').isdigit():
                            used_val = float(part)
                            # Check if the next part indicates the unit
                            if i + 1 < len(parts):
                                unit = parts[i + 1].lower()
                                if 'mb' in unit or 'mib' in unit:
                                    used_gb = used_val / 1024
                                elif 'gb' in unit or 'gib' in unit:
                                    used_gb = used_val
                                elif 'kb' in unit or 'kib' in unit:
                                    used_gb = used_val / (1024 * 1024)
                            break

                if 'Total' in line or 'total' in line:
                    parts = line.split()
                    for i, part in enumerate(parts):
                        if part.replace('.', '').isdigit():
                            total_val = float(part)
                            if i + 1 < len(parts):
                                unit = parts[i + 1].lower()
                                if 'mb' in unit or 'mib' in unit:
                                    total_gb = total_val / 1024
                                elif 'gb' in unit or 'gib' in unit:
                                    total_gb = total_val
                                elif 'kb' in unit or 'kib' in unit:
                                    total_gb = total_val / (1024 * 1024)
                            break

            return used_gb, total_gb

        except Exception:
            return 0.0, 0.0

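    # Worked example of the conversion above (values illustrative): a hypothetical line
    # reporting "196608.0 MiB" of total VRAM yields 196608.0 / 1024 = 192.0 GB, while a
    # value already reported in GiB/GB is passed through unchanged.
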
    def get_metrics(self) -> GPUMetrics:
        """Get current AMD GPU metrics."""
        import subprocess

        try:
            # Get main metrics from concise output
            result = subprocess.run(
                ['rocm-smi', '--showid', '--showtemp', '--showpower', '--showuse', '--showmemuse'],
                capture_output=True,
                text=True,
                timeout=5
            )

            if result.returncode != 0:
                raise RuntimeError(f"rocm-smi failed: {result.stderr}")

            metrics = self._parse_detailed_output(result.stdout)

            # Get detailed memory info
            memory_used_gb, memory_total_gb = self._get_memory_info()

            # If we couldn't get absolute memory, estimate from percentage
            if memory_total_gb == 0.0:
                # MI300X has ~192GB, MI250X has ~128GB - use a reasonable default
                memory_total_gb = 192.0  # Assume MI300X
                memory_used_gb = memory_total_gb * (metrics['vram_percent'] / 100.0)

            return GPUMetrics(
                timestamp=time.time(),
                power_watts=metrics['power'],
                gpu_utilization_percent=metrics['gpu_percent'],
                memory_used_gb=memory_used_gb,
                memory_total_gb=memory_total_gb,
                temperature_celsius=metrics['temperature'],
                energy_joules=None  # Will use power integration
            )

        except subprocess.TimeoutExpired:
            raise RuntimeError("rocm-smi command timed out")
        except Exception as e:
            raise RuntimeError(f"Failed to get AMD GPU metrics: {e}")

    def get_device_name(self) -> str:
        """Get AMD GPU device name."""
        import subprocess

        try:
            result = subprocess.run(
                ['rocm-smi', '--showproductname', '-d', str(self.device_id)],
                capture_output=True,
                text=True,
                timeout=5
            )

            if result.returncode == 0:
                # Parse output to find device name
                for line in result.stdout.split('\n'):
                    if 'Card series' in line or 'Card model' in line or 'name' in line.lower():
                        parts = line.split(':')
                        if len(parts) > 1:
                            return parts[1].strip()
        except Exception:
            pass

        return f"AMD GPU {self.device_id}"

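    # Assumption check (illustrative): the name lookup above expects a line shaped like
    #   Card series: AMD Instinct MI300X
    # i.e. a single colon, so parts[1] is the product name. If rocm-smi prefixes the line
    # with "GPU[<id>] :" there would be two colons and parts[-1] would be the safer pick.
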
    def cleanup(self):
        """Cleanup AMD resources."""
        # No cleanup needed for command-line tool
        pass


def get_gpu_monitor(device_id: int = 0) -> GPUMonitor:
    """
    Factory function to automatically detect and create the appropriate GPU monitor.

    Args:
        device_id: GPU device ID to monitor

    Returns:
        GPUMonitor instance (NVIDIAMonitor or AMDMonitor)

    Raises:
        RuntimeError: If no supported GPU is found
    """
    # Try AMD first (rocm-smi based), then fall back to NVIDIA
    try:
        return AMDMonitor(device_id)
    except Exception:
        pass

    # Try NVIDIA if AMD fails
    try:
        return NVIDIAMonitor(device_id)
    except Exception:
        pass

    # As a last resort, use torch to detect the GPU type
    try:
        import torch
        if torch.cuda.is_available():
            # Check if it's NVIDIA or AMD
            device_name = torch.cuda.get_device_name(device_id).lower()

            if 'nvidia' in device_name or 'tesla' in device_name or 'geforce' in device_name:
                return NVIDIAMonitor(device_id)
            elif 'amd' in device_name or 'radeon' in device_name or 'mi300' in device_name or 'mi200' in device_name:
                return AMDMonitor(device_id)
    except Exception:
        pass

    raise RuntimeError(
        "No supported GPU found. Make sure either ROCm (rocm-smi) or NVIDIA (pynvml) drivers are installed."
    )

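# Minimal usage sketch (assumes a supported GPU; mirrors the flow in __main__ below):
#
#     monitor = get_gpu_monitor(0)
#     monitor.start_monitoring()
#     ...  # run the workload to be measured
#     joules = monitor.get_energy_consumed()
#     monitor.cleanup()

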
def list_available_gpus() -> List[str]:
    """
    List all available GPUs.

    Returns:
        List of GPU names
    """
    gpus = []

    # Try NVIDIA
    try:
        import pynvml
        pynvml.nvmlInit()
        device_count = pynvml.nvmlDeviceGetCount()
        for i in range(device_count):
            handle = pynvml.nvmlDeviceGetHandleByIndex(i)
            name = pynvml.nvmlDeviceGetName(handle)
            if isinstance(name, bytes):
                name = name.decode('utf-8')
            gpus.append(f"GPU {i}: {name} (NVIDIA)")
        pynvml.nvmlShutdown()
    except Exception:
        pass

    # Try AMD with rocm-smi
    try:
        import subprocess
        import shutil

        if shutil.which('rocm-smi'):
            result = subprocess.run(
                ['rocm-smi', '--showid'],
                capture_output=True,
                text=True,
                timeout=5
            )
            if result.returncode == 0:
                # Parse device IDs from output
                for line in result.stdout.split('\n'):
                    if not line.strip() or line.startswith('=') or 'Device' in line or 'ROCm' in line:
                        continue
                    parts = line.split()
                    if parts and parts[0].isdigit():
                        device_id = int(parts[0])
                        # Try to get the device name
                        name_result = subprocess.run(
                            ['rocm-smi', '--showproductname', '-d', str(device_id)],
                            capture_output=True,
                            text=True,
                            timeout=5
                        )
                        name = "AMD GPU"
                        if name_result.returncode == 0:
                            for name_line in name_result.stdout.split('\n'):
                                if 'Card' in name_line or 'name' in name_line.lower():
                                    parts_name = name_line.split(':')
                                    if len(parts_name) > 1:
                                        name = parts_name[1].strip()
                                        break
                        gpus.append(f"GPU {device_id}: {name} (AMD)")
    except Exception:
        pass

    return gpus


if __name__ == "__main__":
    """Test GPU monitoring."""
    print("=" * 60)
    print("GPU Monitoring Test")
    print("=" * 60)

    # List available GPUs
    print("\nAvailable GPUs:")
    gpus = list_available_gpus()
    if not gpus:
        print("  No GPUs found!")
        raise SystemExit(1)

    for gpu in gpus:
        print(f"  {gpu}")

    # Test monitoring
    print("\nTesting GPU 0 monitoring...")
    try:
        monitor = get_gpu_monitor(0)
        print(f"  Device: {monitor.get_device_name()}")

        # Get metrics
        metrics = monitor.get_metrics()
        print("\nCurrent Metrics:")
        print(f"  Power: {metrics.power_watts:.2f} W")
        print(f"  GPU Utilization: {metrics.gpu_utilization_percent:.1f}%")
        print(f"  Memory: {metrics.memory_used_gb:.2f} / {metrics.memory_total_gb:.2f} GB")
        if metrics.temperature_celsius is not None:
            print(f"  Temperature: {metrics.temperature_celsius:.1f}°C")

        # Test energy monitoring
        print("\nTesting energy monitoring (5 seconds)...")
        monitor.start_monitoring()
        time.sleep(5)
        energy = monitor.get_energy_consumed()
        avg_power = monitor.get_average_power()
        print(f"  Energy consumed: {energy:.2f} J")
        print(f"  Average power: {avg_power:.2f} W")

        monitor.cleanup()
        print("\n✓ Monitoring test successful!")

    except Exception as e:
        print(f"\n✗ Error: {e}")
        raise SystemExit(1)

473
utils/metrics.py
Normal file
@@ -0,0 +1,473 @@
"""
|
||||
Metrics Collection and Reporting for LLM Benchmarking
|
||||
|
||||
Provides centralized metrics collection, aggregation, and reporting.
|
||||
"""
|
||||
|
||||
import json
|
||||
import csv
|
||||
from dataclasses import dataclass, asdict, field
|
||||
from typing import Dict, List, Optional, Any
|
||||
from pathlib import Path
|
||||
import time
|
||||
|
||||
|
||||
@dataclass
class StageMetrics:
    """Metrics for a specific stage (e.g., forward pass, prefill, etc.)."""
    stage_name: str
    duration_ms: float
    tokens_processed: int
    tokens_per_second: float
    energy_joules: float
    energy_per_token: float
    avg_power_watts: float
    peak_memory_gb: float
    avg_gpu_util_percent: float

    def to_dict(self) -> Dict[str, Any]:
        """Convert to dictionary."""
        return asdict(self)


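# Assumed relationships between the StageMetrics fields (consistent with the sample data
# in __main__, but not enforced by the dataclass itself):
#   tokens_per_second = tokens_processed / (duration_ms / 1000)
#   energy_per_token  = energy_joules / tokens_processed
# e.g. 1024 tokens in 100.5 ms gives 1024 / 0.1005 ≈ 10189 tokens/s.

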
@dataclass
class PretrainMetrics:
    """Metrics for pretraining benchmark."""
    model_name: str
    gpu_name: str
    attention_implementation: str
    batch_size: int
    sequence_length: int
    num_steps: int

    # Stage-specific metrics
    forward: StageMetrics
    backward: StageMetrics
    optimizer: StageMetrics

    # Overall metrics
    total_duration_ms: float
    total_tokens: int
    total_tokens_per_second: float
    total_energy_joules: float
    total_energy_per_token: float

    timestamp: float = field(default_factory=time.time)

    def to_dict(self) -> Dict[str, Any]:
        """Convert to dictionary."""
        return {
            "model_name": self.model_name,
            "gpu_name": self.gpu_name,
            "attention_implementation": self.attention_implementation,
            "batch_size": self.batch_size,
            "sequence_length": self.sequence_length,
            "num_steps": self.num_steps,
            "forward": self.forward.to_dict(),
            "backward": self.backward.to_dict(),
            "optimizer": self.optimizer.to_dict(),
            "total_duration_ms": self.total_duration_ms,
            "total_tokens": self.total_tokens,
            "total_tokens_per_second": self.total_tokens_per_second,
            "total_energy_joules": self.total_energy_joules,
            "total_energy_per_token": self.total_energy_per_token,
            "timestamp": self.timestamp,
        }


@dataclass
class InferenceMetrics:
    """Metrics for inference benchmark."""
    model_name: str
    gpu_name: str
    attention_implementation: str
    num_requests: int
    prompt_length: int
    generation_length: int

    # Stage-specific metrics
    prefill: StageMetrics  # Time to First Token
    decode: StageMetrics  # Inter-Token Latency

    # End-to-end metrics
    e2e_latency_ms: float
    e2e_tokens_per_second: float
    e2e_energy_joules: float
    e2e_energy_per_token: float

    # Additional metrics
    ttft_ms: float  # Time to First Token (same as prefill duration)
    itl_ms: float  # Inter-Token Latency (decode duration / num_tokens)

    timestamp: float = field(default_factory=time.time)

    def to_dict(self) -> Dict[str, Any]:
        """Convert to dictionary."""
        return {
            "model_name": self.model_name,
            "gpu_name": self.gpu_name,
            "attention_implementation": self.attention_implementation,
            "num_requests": self.num_requests,
            "prompt_length": self.prompt_length,
            "generation_length": self.generation_length,
            "prefill": self.prefill.to_dict(),
            "decode": self.decode.to_dict(),
            "e2e_latency_ms": self.e2e_latency_ms,
            "e2e_tokens_per_second": self.e2e_tokens_per_second,
            "e2e_energy_joules": self.e2e_energy_joules,
            "e2e_energy_per_token": self.e2e_energy_per_token,
            "ttft_ms": self.ttft_ms,
            "itl_ms": self.itl_ms,
            "timestamp": self.timestamp,
        }


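# Assumed relationships between the InferenceMetrics fields (consistent with the sample
# data in __main__, but not enforced by the dataclass itself):
#   ttft_ms        ≈ prefill.duration_ms
#   itl_ms         ≈ decode.duration_ms / decode.tokens_processed
#   e2e_latency_ms ≈ prefill.duration_ms + decode.duration_ms
# e.g. 223.5 ms of decode over 100 generated tokens gives itl_ms = 223.5 / 100 = 2.235 ms/token.

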
class MetricsCollector:
    """Collects metrics during benchmark runs."""

    def __init__(self):
        """Initialize metrics collector."""
        self.metrics_history: List[Dict[str, Any]] = []

    def add_pretrain_metrics(self, metrics: PretrainMetrics):
        """Add pretraining metrics."""
        self.metrics_history.append({
            "type": "pretrain",
            "metrics": metrics.to_dict()
        })

    def add_inference_metrics(self, metrics: InferenceMetrics):
        """Add inference metrics."""
        self.metrics_history.append({
            "type": "inference",
            "metrics": metrics.to_dict()
        })

    def get_all_metrics(self) -> List[Dict[str, Any]]:
        """Get all collected metrics."""
        return self.metrics_history

    def clear(self):
        """Clear all metrics."""
        self.metrics_history.clear()


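# Minimal usage sketch (the benchmark-runner names are hypothetical placeholders):
#
#     collector = MetricsCollector()
#     collector.add_pretrain_metrics(run_pretrain_benchmark(...))    # hypothetical runner
#     collector.add_inference_metrics(run_inference_benchmark(...))  # hypothetical runner
#     all_results = collector.get_all_metrics()

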
class MetricsReporter:
    """Formats and outputs benchmark results."""

    @staticmethod
    def print_pretrain_metrics(metrics: PretrainMetrics, verbose: bool = True):
        """Print pretraining metrics to console."""
        print("\n" + "=" * 80)
        print("PRETRAINING BENCHMARK RESULTS")
        print("=" * 80)
        print(f"\nModel: {metrics.model_name}")
        print(f"GPU: {metrics.gpu_name}")
        print(f"Attention: {metrics.attention_implementation}")
        print(f"Batch Size: {metrics.batch_size}")
        print(f"Sequence Length: {metrics.sequence_length}")
        print(f"Training Steps: {metrics.num_steps}")

        print("\n" + "-" * 80)
        print("STAGE BREAKDOWN")
        print("-" * 80)

        # Forward pass
        print("\n[1] FORWARD PASS")
        MetricsReporter._print_stage_metrics(metrics.forward, verbose)

        # Backward pass
        print("\n[2] BACKWARD PASS")
        MetricsReporter._print_stage_metrics(metrics.backward, verbose)

        # Optimizer step
        print("\n[3] OPTIMIZER STEP")
        MetricsReporter._print_stage_metrics(metrics.optimizer, verbose)

        # Overall
        print("\n" + "-" * 80)
        print("OVERALL METRICS")
        print("-" * 80)
        print(f"  Total Duration:       {metrics.total_duration_ms:>10.2f} ms")
        print(f"  Total Tokens:         {metrics.total_tokens:>10,}")
        print(f"  Throughput:           {metrics.total_tokens_per_second:>10.2f} tokens/s")
        print(f"  Total Energy:         {metrics.total_energy_joules:>10.2f} J")
        print(f"  Energy per Token:     {metrics.total_energy_per_token*1000:>10.4f} mJ/token")
        print("=" * 80 + "\n")

    @staticmethod
    def print_inference_metrics(metrics: InferenceMetrics, verbose: bool = True):
        """Print inference metrics to console."""
        print("\n" + "=" * 80)
        print("INFERENCE BENCHMARK RESULTS")
        print("=" * 80)
        print(f"\nModel: {metrics.model_name}")
        print(f"GPU: {metrics.gpu_name}")
        print(f"Attention: {metrics.attention_implementation}")
        print(f"Requests: {metrics.num_requests}")
        print(f"Prompt Length: {metrics.prompt_length}")
        print(f"Generation Length: {metrics.generation_length}")

        print("\n" + "-" * 80)
        print("STAGE BREAKDOWN")
        print("-" * 80)

        # Prefill
        print("\n[1] PREFILL (Time to First Token)")
        MetricsReporter._print_stage_metrics(metrics.prefill, verbose)
        print(f"  TTFT:                 {metrics.ttft_ms:>10.2f} ms")

        # Decode
        print("\n[2] DECODE (Inter-Token Latency)")
        MetricsReporter._print_stage_metrics(metrics.decode, verbose)
        print(f"  ITL:                  {metrics.itl_ms:>10.2f} ms/token")

        # End-to-end
        print("\n" + "-" * 80)
        print("END-TO-END METRICS")
        print("-" * 80)
        print(f"  Request Latency:      {metrics.e2e_latency_ms:>10.2f} ms")
        print(f"  Throughput:           {metrics.e2e_tokens_per_second:>10.2f} tokens/s")
        print(f"  Total Energy:         {metrics.e2e_energy_joules:>10.2f} J")
        print(f"  Energy per Token:     {metrics.e2e_energy_per_token*1000:>10.4f} mJ/token")
        print("=" * 80 + "\n")

    @staticmethod
    def _print_stage_metrics(stage: StageMetrics, verbose: bool = True):
        """Print metrics for a single stage."""
        print(f"  Duration:             {stage.duration_ms:>10.2f} ms")
        print(f"  Tokens:               {stage.tokens_processed:>10,}")
        print(f"  Throughput:           {stage.tokens_per_second:>10.2f} tokens/s")
        print(f"  Energy:               {stage.energy_joules:>10.2f} J")
        print(f"  Energy per Token:     {stage.energy_per_token*1000:>10.4f} mJ/token")

        if verbose:
            print(f"  Avg Power:            {stage.avg_power_watts:>10.2f} W")
            print(f"  Peak Memory:          {stage.peak_memory_gb:>10.2f} GB")
            print(f"  Avg GPU Utilization:  {stage.avg_gpu_util_percent:>10.1f} %")

    @staticmethod
    def save_json(metrics: Any, output_path: Path):
        """
        Save metrics to JSON file.

        Args:
            metrics: PretrainMetrics or InferenceMetrics object
            output_path: Path to output JSON file
        """
        output_path.parent.mkdir(parents=True, exist_ok=True)

        with open(output_path, 'w') as f:
            json.dump(metrics.to_dict(), f, indent=2)

        print(f"Metrics saved to: {output_path}")

    @staticmethod
    def save_csv(metrics_list: List[Any], output_path: Path, benchmark_type: str = "pretrain"):
        """
        Save multiple metrics to CSV file for comparison.

        Args:
            metrics_list: List of PretrainMetrics or InferenceMetrics objects
            output_path: Path to output CSV file
            benchmark_type: "pretrain" or "inference"
        """
        if not metrics_list:
            print("No metrics to save")
            return

        output_path.parent.mkdir(parents=True, exist_ok=True)

        with open(output_path, 'w', newline='') as f:
            if benchmark_type == "pretrain":
                MetricsReporter._save_pretrain_csv(metrics_list, f)
            else:
                MetricsReporter._save_inference_csv(metrics_list, f)

        print(f"CSV saved to: {output_path}")

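    # Usage sketch (output paths are illustrative, not prescribed by this module):
    #
    #     MetricsReporter.save_json(pretrain_metrics, Path("results/pretrain.json"))
    #     MetricsReporter.save_csv([pretrain_metrics], Path("results/pretrain.csv"),
    #                              benchmark_type="pretrain")
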
    @staticmethod
    def _save_pretrain_csv(metrics_list: List[PretrainMetrics], file):
        """Save pretraining metrics to CSV."""
        fieldnames = [
            'gpu_name', 'attention_implementation', 'batch_size', 'sequence_length', 'num_steps',
            'forward_duration_ms', 'forward_tokens_per_sec', 'forward_energy_j', 'forward_energy_per_token_mj',
            'backward_duration_ms', 'backward_tokens_per_sec', 'backward_energy_j', 'backward_energy_per_token_mj',
            'optimizer_duration_ms', 'optimizer_tokens_per_sec', 'optimizer_energy_j', 'optimizer_energy_per_token_mj',
            'total_duration_ms', 'total_tokens_per_sec', 'total_energy_j', 'total_energy_per_token_mj',
            'timestamp'
        ]

        writer = csv.DictWriter(file, fieldnames=fieldnames)
        writer.writeheader()

        for m in metrics_list:
            writer.writerow({
                'gpu_name': m.gpu_name,
                'attention_implementation': m.attention_implementation,
                'batch_size': m.batch_size,
                'sequence_length': m.sequence_length,
                'num_steps': m.num_steps,
                'forward_duration_ms': m.forward.duration_ms,
                'forward_tokens_per_sec': m.forward.tokens_per_second,
                'forward_energy_j': m.forward.energy_joules,
                'forward_energy_per_token_mj': m.forward.energy_per_token * 1000,
                'backward_duration_ms': m.backward.duration_ms,
                'backward_tokens_per_sec': m.backward.tokens_per_second,
                'backward_energy_j': m.backward.energy_joules,
                'backward_energy_per_token_mj': m.backward.energy_per_token * 1000,
                'optimizer_duration_ms': m.optimizer.duration_ms,
                'optimizer_tokens_per_sec': m.optimizer.tokens_per_second,
                'optimizer_energy_j': m.optimizer.energy_joules,
                'optimizer_energy_per_token_mj': m.optimizer.energy_per_token * 1000,
                'total_duration_ms': m.total_duration_ms,
                'total_tokens_per_sec': m.total_tokens_per_second,
                'total_energy_j': m.total_energy_joules,
                'total_energy_per_token_mj': m.total_energy_per_token * 1000,
                'timestamp': m.timestamp,
            })

    @staticmethod
    def _save_inference_csv(metrics_list: List[InferenceMetrics], file):
        """Save inference metrics to CSV."""
        fieldnames = [
            'gpu_name', 'attention_implementation', 'num_requests', 'prompt_length', 'generation_length',
            'prefill_duration_ms', 'prefill_tokens_per_sec', 'prefill_energy_j', 'prefill_energy_per_token_mj',
            'ttft_ms',
            'decode_duration_ms', 'decode_tokens_per_sec', 'decode_energy_j', 'decode_energy_per_token_mj',
            'itl_ms',
            'e2e_latency_ms', 'e2e_tokens_per_sec', 'e2e_energy_j', 'e2e_energy_per_token_mj',
            'timestamp'
        ]

        writer = csv.DictWriter(file, fieldnames=fieldnames)
        writer.writeheader()

        for m in metrics_list:
            writer.writerow({
                'gpu_name': m.gpu_name,
                'attention_implementation': m.attention_implementation,
                'num_requests': m.num_requests,
                'prompt_length': m.prompt_length,
                'generation_length': m.generation_length,
                'prefill_duration_ms': m.prefill.duration_ms,
                'prefill_tokens_per_sec': m.prefill.tokens_per_second,
                'prefill_energy_j': m.prefill.energy_joules,
                'prefill_energy_per_token_mj': m.prefill.energy_per_token * 1000,
                'ttft_ms': m.ttft_ms,
                'decode_duration_ms': m.decode.duration_ms,
                'decode_tokens_per_sec': m.decode.tokens_per_second,
                'decode_energy_j': m.decode.energy_joules,
                'decode_energy_per_token_mj': m.decode.energy_per_token * 1000,
                'itl_ms': m.itl_ms,
                'e2e_latency_ms': m.e2e_latency_ms,
                'e2e_tokens_per_sec': m.e2e_tokens_per_second,
                'e2e_energy_j': m.e2e_energy_joules,
                'e2e_energy_per_token_mj': m.e2e_energy_per_token * 1000,
                'timestamp': m.timestamp,
            })


if __name__ == "__main__":
    """Test metrics reporting."""
    # Create sample pretraining metrics
    forward = StageMetrics(
        stage_name="forward",
        duration_ms=100.5,
        tokens_processed=1024,
        tokens_per_second=10189.3,
        energy_joules=25.3,
        energy_per_token=0.0247,
        avg_power_watts=251.7,
        peak_memory_gb=45.2,
        avg_gpu_util_percent=95.3
    )

    backward = StageMetrics(
        stage_name="backward",
        duration_ms=205.2,
        tokens_processed=1024,
        tokens_per_second=4991.2,
        energy_joules=51.6,
        energy_per_token=0.0504,
        avg_power_watts=251.5,
        peak_memory_gb=48.6,
        avg_gpu_util_percent=97.1
    )

    optimizer = StageMetrics(
        stage_name="optimizer",
        duration_ms=15.3,
        tokens_processed=1024,
        tokens_per_second=66928.1,
        energy_joules=3.8,
        energy_per_token=0.0037,
        avg_power_watts=248.4,
        peak_memory_gb=48.6,
        avg_gpu_util_percent=42.1
    )

    pretrain_metrics = PretrainMetrics(
        model_name="Qwen/Qwen2.5-3B-Instruct",
        gpu_name="NVIDIA A100 80GB",
        attention_implementation="flash_attention_2",
        batch_size=8,
        sequence_length=2048,
        num_steps=10,
        forward=forward,
        backward=backward,
        optimizer=optimizer,
        total_duration_ms=321.0,
        total_tokens=10240,
        total_tokens_per_second=31900.3,
        total_energy_joules=80.7,
        total_energy_per_token=0.00788
    )

    # Print pretrain metrics
    MetricsReporter.print_pretrain_metrics(pretrain_metrics)

    # Create sample inference metrics
    prefill = StageMetrics(
        stage_name="prefill",
        duration_ms=45.2,
        tokens_processed=512,
        tokens_per_second=11327.4,
        energy_joules=11.3,
        energy_per_token=0.0221,
        avg_power_watts=250.0,
        peak_memory_gb=42.1,
        avg_gpu_util_percent=89.2
    )

    decode = StageMetrics(
        stage_name="decode",
        duration_ms=223.5,
        tokens_processed=100,
        tokens_per_second=447.4,
        energy_joules=55.9,
        energy_per_token=0.559,
        avg_power_watts=250.1,
        peak_memory_gb=42.1,
        avg_gpu_util_percent=62.3
    )

    inference_metrics = InferenceMetrics(
        model_name="Qwen/Qwen2.5-3B-Instruct",
        gpu_name="NVIDIA A100 80GB",
        attention_implementation="flash_attention_2",
        num_requests=10,
        prompt_length=512,
        generation_length=100,
        prefill=prefill,
        decode=decode,
        e2e_latency_ms=268.7,
        e2e_tokens_per_second=2277.9,
        e2e_energy_joules=67.2,
        e2e_energy_per_token=0.110,
        ttft_ms=45.2,
        itl_ms=2.235
    )

    # Print inference metrics
    MetricsReporter.print_inference_metrics(inference_metrics)